fix: 🐛 Improve the way to handle AcquisitionDateTime

Fix some typos; when AcquisitionDateTime is missing or has an unexpected format, put the affected files in group_UNK and print a warning; also accept AcquisitionDateTime values without fractional seconds.
pontikos-lab · Nov 13, 2024 · 65afd80 · 65afd80
1 parent ecc4c6e
commit 65afd80
Show file tree

Hide file tree

Showing 10 changed files with 92 additions and 40 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -8,7 +8,7 @@ repos:
       - id: check-shebang-scripts-are-executable
       - id: check-added-large-files
         args:
-          - "--maxkb=12000"
+          - "--maxkb=5000"
       # - id: debug-statements # not for process_dcm
       - id: check-yaml
         exclude: mkdocs.yml

diff --git a/.talismanrc b/.talismanrc
@@ -1,6 +1,8 @@
 fileignoreconfig:
+  - filename: tests/test_main.py
+    checksum: cc564a13bd73b557d0ef095252a2f1a9fd166a4a70d68d805ee9fd706fc28b32
+  - filename: tests/test_utils.py
+    checksum: 27c2987e02ca5255d6711a82e4b8921ea6585bc4ac06a893f250fe12b4b236a5
   - filename: poetry.lock
-    checksum: e8dbedc758eb55f4bd5e6d5e8fdac611ad139df6d4f769603fa6671649032f5c
-  - filename: README.md
-    checksum: b842faad8156171b1388e3b3083a5c2bc120c45b25fa2412ca0f12ae1b5d5899
+    checksum: 4fe8810d9b7b0faa6e9aabea51871fde73b004dc93ebdba8462ec31cac3e22a8
 version: ""
diff --git a/poetry.lock b/poetry.lock
diff --git a/process_dcm/main.py b/process_dcm/main.py
@@ -1,4 +1,4 @@
-"""app to procces DCM files."""
+"""app to process DCM files."""
 
 import csv
 import os

diff --git a/process_dcm/utils.py b/process_dcm/utils.py
@@ -47,7 +47,7 @@ def set_output_dir(ref_path: str | Path, a_path: str | Path) -> str:
 
     Args:
         ref_path: A reference path used when the given path is relative or broken.
-        a_path: The path to be analyzed and potentially resolved.
+        a_path: The path to be analysed and potentially resolved.
 
     Returns:
         The determined output directory as a POSIX string (using forward slashes).
@@ -123,7 +123,7 @@ def meta_images(dcm_obj: FileDataset) -> dict:
                         {"photo_locations": [{"start": {"x": cc[1], "y": cc[0]}, "end": {"x": cc[3], "y": cc[2]}}]}
                     )
                 else:
-                    typer.secho("WARN: empty photo_locations", fg=typer.colors.RED)
+                    typer.secho("\nWARN: empty photo_locations", fg=typer.colors.RED)
                     meta["contents"].append({"photo_locations": []})
 
     return meta
@@ -275,23 +275,44 @@ def update_modality(dcm: FileDataset) -> bool:
     return True  # Modality updated successfully
 
 
-def group_dcms_by_acquisition_time(dcms: list[FileDataset], tolerance_seconds: int = 2) -> dict[str, list[FileDataset]]:
-    """Group DICOM files by AcquisitionDateTime within the specified tolerance."""
+def group_dcms_by_acquisition_time(dcms: list[FileDataset], tol: int = 2) -> dict[str, list[FileDataset]]:
+    """Group DICOM files by AcquisitionDateTime within the specified tolerance.
+
+    Args:
+        dcms: List of DICOM FileDataset objects.
+        tol: Time tolerance for grouping in seconds.
+
+    Returns:
+        Dictionary of grouped DICOM files, keyed by acquisition datetime string.
+    """
     grouped_dcms: dict[str, list[FileDataset]] = defaultdict(list)
+
+    def parse_datetime(dt_str: str) -> datetime:
+        try:
+            return datetime.strptime(dt_str, "%Y%m%d%H%M%S.%f")
+        except ValueError:
+            return datetime.strptime(dt_str, "%Y%m%d%H%M%S")
+
     for dcm in dcms:
         acquisition_datetime_str = dcm.get("AcquisitionDateTime", "unknown")
         if acquisition_datetime_str != "unknown":
-            acquisition_datetime = datetime.strptime(acquisition_datetime_str, "%Y%m%d%H%M%S.%f")
-            # Find the closest group within the tolerance
-            for group_time_str, group in grouped_dcms.items():
-                if group_time_str != "unknown":
-                    group_time = datetime.strptime(group_time_str, "%Y%m%d%H%M%S.%f")
-                    if abs(acquisition_datetime - group_time) <= timedelta(seconds=tolerance_seconds):
-                        grouped_dcms[group_time_str].append(dcm)
-                        break
-            else:
-                # If no close group found, create a new one
-                grouped_dcms[acquisition_datetime_str].append(dcm)
+            try:
+                acquisition_datetime = parse_datetime(acquisition_datetime_str)
+                # Find the closest group within the tolerance
+                for group_time_str, group in grouped_dcms.items():
+                    if group_time_str != "unknown":
+                        group_time = parse_datetime(group_time_str)
+                        if abs(acquisition_datetime - group_time) <= timedelta(seconds=tol):
+                            grouped_dcms[group_time_str].append(dcm)
+                            break
+                else:
+                    # If no close group found, create a new one
+                    grouped_dcms[acquisition_datetime_str].append(dcm)
+            except ValueError:
+                typer.secho(
+                    f"\nWARN: Unexpected AcquisitionDateTime format: {acquisition_datetime_str}", fg=typer.colors.RED
+                )
+                grouped_dcms["unknown"].append(dcm)
         else:
             grouped_dcms["unknown"].append(dcm)
 
@@ -382,13 +403,20 @@ def process_dcm(
 
     if group:
         # Group DICOM files by AcquisitionDateTime
-        grouped_dcms = group_dcms_by_acquisition_time(dcms, tolerance_seconds=tol)
+        grouped_dcms = group_dcms_by_acquisition_time(dcms, tol=tol)
 
         # Sort groups by AcquisitionDateTime
         sorted_groups = sorted(grouped_dcms.items())
 
-        for gid, (_, group_dcms) in enumerate(sorted_groups):
+        for gid, (acquisition_time, group_dcms) in enumerate(sorted_groups):
             group_dir = os.path.join(output_dir, f"group_{gid}")
+
+            if acquisition_time == "unknown":
+                group_dir = os.path.join(output_dir, "group_UNK")
+                typer.secho(
+                    f"\nWARN: unknown AcquisitionDateTime, results in {group_dir} are not reliable", fg=typer.colors.RED
+                )
+
             os.makedirs(group_dir, exist_ok=True)
 
             for dcm in group_dcms:
@@ -400,6 +428,12 @@ def process_dcm(
 
                 for i in range(dcm.NumberOfFrames):
                     out_img = os.path.join(group_dir, f"{dcm.Modality.code}-{dcm.AccessionNumber}_{i}.{image_format}")
+                    if os.path.exists(out_img):
+                        dcm.AccessionNumber += 1  # increase group_id
+                        out_img = os.path.join(
+                            group_dir, f"{dcm.Modality.code}-{dcm.AccessionNumber}_{i}.{image_format}"
+                        )
+
                     array = cv2.normalize(arr[i], None, 0, 255, cv2.NORM_MINMAX, dtype=cv2.CV_8UC1)  # type: ignore #AWSS
                     image = Image.fromarray(array)
                     image.save
@@ -551,7 +585,7 @@ def process_and_save_csv(unique_sorted_results: list, reserved_csv: str, quiet:
         unique_sorted_results (list): The data to be written to the CSV file. Each sublist
                                       represents a row with 'study_id' and 'patient_id'.
         reserved_csv (str): The path to the reserved CSV file.
-        quiet (bool, optional): Silene verbosity. Defaults to False.
+        quiet (bool, optional): Silence verbosity. Defaults to False.
     """
     temp_filename = save_to_temp_file(unique_sorted_results)
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -30,7 +30,7 @@ python-gdcm = "^3.0.24.1"
 process-dcm = "process_dcm.main:cli"
 
 [tool.poetry.group.dev.dependencies]
-boto3-stubs = { extras = ["lambda", "s3"], version = "1.35.58" }
+boto3-stubs = { extras = ["lambda", "s3"], version = "1.35.59" }
 ipdb = "0.13.13"
 ipython = "8.29.0"
 jupyterlab = "4.3.0"

diff --git a/tests/cataract/eye.dcm b/tests/cataract/eye.dcm
diff --git a/tests/dummy_ex/wrong_acqui_time.dcm b/tests/dummy_ex/wrong_acqui_time.dcm
diff --git a/tests/test_main.py b/tests/test_main.py
@@ -97,9 +97,9 @@ def test_main_dummy(janitor, runner):
     assert result.exit_code == 0
     tof = sorted(glob("dummy_dir/**/*"))
     of = [x for x in tof if "metadata.json" not in x]
-    assert len(tof) == 2
-    assert get_md5("dummy_dir/dummy_ex/metadata.json", bottom) == "0fbf0de5556e77447925ff3bfaf8168e"
-    assert get_md5(of) in ["377b5a17c284226518ccef9bddff25af"]
+    assert len(tof) == 3
+    assert get_md5("dummy_dir/dummy_ex/metadata.json", bottom) == "0693469a3fcf388d89627eb212ace2bc"
+    assert get_md5(of) in ["30b70623445f7c12d8ad773c9738c7ce"]
 
 
 def test_main_mapping(janitor, runner):
@@ -207,6 +207,21 @@ def test_process_task_optos():
         assert result == ("0570586923", "BEH002")
 
 
+def test_process_acquisition_datetime():
+    with TemporaryDirectory() as tmpdirname:
+        output_dir = Path(tmpdirname)
+        task_data = ("tests/cataract/", str(output_dir))
+        image_format = "png"
+        overwrite = True
+        verbose = True
+        keep = ""
+        mapping = ""
+        group = True
+        tol = 2
+        result = process_task(task_data, image_format, overwrite, verbose, keep, mapping, group, tol)
+        assert result == ("0558756784", "20241113-093410")
+
+
 # def test_process_taskL():
 #     with TemporaryDirectory() as tmpdirname:
 #         output_dir = Path(tmpdirname)

diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -36,7 +36,7 @@ def test_meta_images_optopol(dicom_opotopol: FileDataset, mocker: MockerFixture)
 
     assert meta["modality"] == "OCT", "Modality extraction failed"
     assert len(meta["contents"]) == 1  # Should correspond to NumberOfFrames
-    mock_secho.assert_called_with("WARN: empty photo_locations", fg=typer.colors.RED)
+    mock_secho.assert_called_with("\nWARN: empty photo_locations", fg=typer.colors.RED)
 
 
 def test_meta_images_attribute_error(dicom_attribute_error: FileDataset) -> None:
@@ -277,7 +277,7 @@ def test_process_dcm_dummy(temp_output_dir):
         input_dir="tests/dummy_ex", output_dir=temp_output_dir, overwrite=True
     )
     assert new_patient_key, original_patient_key == ("2375458543", "123456")
-    assert get_md5(os.path.join(temp_output_dir, "metadata.json"), bottom) == "75a81f29397d3919603a8d55b305fdb1"
+    assert get_md5(os.path.join(temp_output_dir, "metadata.json"), bottom) == "b1fb22938cd95348cbcb44a63ed34fcf"
 
 
 def test_process_dcm_dummy_group(temp_output_dir):
@@ -286,7 +286,8 @@ def test_process_dcm_dummy_group(temp_output_dir):
     )
     assert new_patient_key, original_patient_key == ("2375458543", "123456")
     assert (
-        get_md5(os.path.join(temp_output_dir, "group_0", "metadata.json"), bottom) == "75a81f29397d3919603a8d55b305fdb1"
+        get_md5(os.path.join(temp_output_dir, "group_UNK", "metadata.json"), bottom)
+        == "b1fb22938cd95348cbcb44a63ed34fcf"
     )
 
 
@@ -295,7 +296,7 @@ def test_process_dcm_dummy_mapping(temp_output_dir):
         input_dir="tests/dummy_ex", output_dir=temp_output_dir, overwrite=True, mapping="tests/map.csv"
     )
     assert new_patient_key, original_patient_key == ("2375458543", "123456")
-    assert get_md5(os.path.join(temp_output_dir, "metadata.json"), bottom) == "75a81f29397d3919603a8d55b305fdb1"
+    assert get_md5(os.path.join(temp_output_dir, "metadata.json"), bottom) == "b1fb22938cd95348cbcb44a63ed34fcf"
 
 
 def test_delete_empty_folder(temp_directory):