From eea973f74b25bfb69510499c6337ee50b0be6c6e Mon Sep 17 00:00:00 2001
From: Alan Silva <3899850+alanwilter@users.noreply.github.com>
Date: Thu, 5 Sep 2024 16:17:09 +0100
Subject: [PATCH] feat: :sparkles: Added anonymiser option by default

---
 .pre-commit-config.yaml |  10 +-
 .talismanrc             |  14 +--
 process_dcm/const.py    |   2 +
 process_dcm/main.py     | 107 ++++++++++++++++--
 process_dcm/utils.py    | 239 +++++++++++++++++++++++++++++++++++++---
 tests/conftest.py       |  54 ++++++++-
 tests/map.csv           |   2 +
 tests/test_main.py      | 151 +++++++++++++++++++++++--
 tests/test_utils.py     | 114 +++++++++++++++++--
 9 files changed, 629 insertions(+), 64 deletions(-)
 create mode 100644 tests/map.csv

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 5f1105c..2af51a1 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -8,7 +8,7 @@ repos:
       - id: check-shebang-scripts-are-executable
       - id: check-added-large-files
         args:
-          - "--maxkb=10000"
+          - "--maxkb=250000"
      # - id: debug-statements # not for process_dcm
       - id: check-yaml
         exclude: mkdocs.yml
@@ -26,18 +26,18 @@ repos:
       - id: check-merge-conflict
       - id: check-docstring-first
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.5.5
+    rev: v0.6.3
     hooks:
       - id: ruff
        args: ["--fix"]
        files: ^hooks
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.5.5
+    rev: v0.6.3
     hooks:
       - id: ruff-format
        files: ^hooks
   - repo: https://github.com/commitizen-tools/commitizen
-    rev: v3.28.0
+    rev: v3.29.0
     hooks:
       - id: commitizen
        stages:
@@ -50,7 +50,7 @@ repos:
       - prettier # not for process_dcm
   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.11.1
+    rev: v1.11.2
     hooks:
       - id: mypy
        exclude: ^tests/
diff --git a/.talismanrc b/.talismanrc
index 2074e24..2e82d7d 100644
--- a/.talismanrc
+++ b/.talismanrc
@@ -1,12 +1,10 @@
 fileignoreconfig:
-  - filename: .cruft.json
-    checksum: 03b17a0041ffe209156ea08f01b12f2cbb770b4570a6f6678ce8dcae4d945535
   - filename: tests/test_main.py
-    checksum: 3e4ad81306892573518b0e85e38544a978e160e32608229231f8d9f2b2dcd83c
-  - filename: poetry.lock
-    checksum: f4cee8b1c6d6f33b5f34cbbfc093008bcfe7820a26df4225ccea2b3fa5e65e15
+    checksum: efa949f7b639498667939891d29938f01047b7b9a70ffd61a74d47cd127ddbf8
+  - filename: process_dcm/main.py
+    checksum: 3b69ef0a904c0b9e04574fc70e4a35bcf0ed23a75ef9ff99ce6bf80cbaa43551
+  - filename: tests/test_utils.py
+    checksum: 00912e74713f45921e593b5f1f98e23b324ab13bd5c9d44edb24c49a08f325b8
   - filename: process_dcm/utils.py
-    checksum: 4427feb0e00b54eeda3d878b1457be0c0b62f7c39f68e0691442e4ad3988bbf2
-  - filename: .github/workflows/release.yml
-    checksum: 86fbb1303400278bbebbe4523cd4746b4f5e8d1ef036371c8e7a9cbfc0adc254
+    checksum: fdb35056dbd109dbbac6fd5d3668fbff90ace8f46aa37b787eaa2572c02d0aaf
 version: ""
diff --git a/process_dcm/const.py b/process_dcm/const.py
index 464b66c..6776666 100644
--- a/process_dcm/const.py
+++ b/process_dcm/const.py
@@ -3,6 +3,8 @@
 from enum import Enum, Flag, auto, unique
 from typing import cast
 
+RESERVED_CSV = "patient_2_study_id.csv"
+
 
 class ModalityFlag(Flag):
     """A flag representing different modalities in DICOM files."""
diff --git a/process_dcm/main.py b/process_dcm/main.py
index 33f14ab..6de8bc1 100644
--- a/process_dcm/main.py
+++ b/process_dcm/main.py
@@ -1,5 +1,7 @@
 """app to procces DCM files."""
 
+import csv
+import os
 from functools import partial
 from multiprocessing import Pool
 
@@ -7,13 +9,11 @@
 from tqdm import tqdm
 
 from process_dcm import __version__
-from process_dcm.utils import find_dcm_subfolders, process_dcm
+from process_dcm.const import RESERVED_CSV
+from process_dcm.utils import find_dicom_folders_with_base, process_and_save_csv, process_dcm
 
 app = typer.Typer(context_settings={"help_option_names": ["-h", "--help"]})
 
-out_msg = """Output directory for extracted images and metadata. Defaults to: __input_dir__/exported_data\n
-Use absolute path if you want to save the output in a specific location."""
-
 
 def print_version(value: bool) -> None:
     """Print the version of the app."""
@@ -22,14 +22,50 @@
     raise typer.Exit()
 
 
+def process_task(
+    task: tuple[str, str], image_format: str, overwrite: bool, verbose: bool, keep: str, mapping: str
+) -> tuple[str, str]:
+    """Unpack a (subfolder, output_dir) task and run process_dcm on it."""
+    subfolder, out_dir = task
+    return process_dcm(
+        input_dir=subfolder,
+        output_dir=out_dir,
+        image_format=image_format,
+        overwrite=overwrite,
+        verbose=verbose,
+        keep=keep,
+        mapping=mapping,
+    )
+
+
 @app.command()
 def main(
     input_dir: str = typer.Argument(..., help="Input directory containing subfolders with DICOM files."),
     image_format: str = typer.Option(
         "png", "-f", "--image_format", help="Image format for extracted images (png, jpg, webp). Defaults to: png"
     ),
-    output_dir: str = typer.Option("exported_data", "-o", "--output_dir", help=out_msg),
+    output_dir: str = typer.Option(
+        "exported_data",
+        "-o",
+        "--output_dir",
+        help="Output directory for extracted images and metadata. Defaults to: exported_data",
+    ),
+    relative: bool = typer.Option(
+        False, "-r", "--relative", help="Save extracted data in folders relative to _input_dir_."
+    ),
     n_jobs: int = typer.Option(1, "-j", "--n_jobs", help="Number of parallel jobs. Defaults to: 1"),
+    mapping: str = typer.Option(
+        "",
+        "-m",
+        "--mapping",
+        help=f"Path to CSV containing patient_id to study_id mapping. If not provided and patient_id is not anonymised, a '{RESERVED_CSV}' file will be generated",  # noqa: E501
+    ),
+    keep: str = typer.Option(
+        "",
+        "-k",
+        "--keep",
+        help="Keep the specified fields (p: patient_key, n: names, d: date_of_birth, D: year-only DOB, g: gender)",
+    ),
     overwrite: bool = typer.Option(False, "-w", "--overwrite", help="Overwrite existing images if found."),
     verbose: bool = typer.Option(False, "-v", "--verbose", help="Verbose output."),
     version: bool = typer.Option(
     ),
 ) -> None:
     """Process DICOM files in subfolders, extract images and metadata using parallel processing."""
-    subfolders = find_dcm_subfolders(input_dir)
+    if mapping == RESERVED_CSV:
+        typer.secho(f"Can't use reserved CSV file name: {RESERVED_CSV}", fg="red")
+        raise typer.Abort()
+    if "p" in keep and mapping:
+        typer.secho(f"WARN: '--keep p' x '--mapping': {mapping} will overwrite patient_id anyway", fg="yellow")
 
-    # Create a partial function with fixed arguments
-    process_dcm_with_args = partial(
-        process_dcm, image_format=image_format, output_dir=output_dir, overwrite=overwrite, verbose=verbose
-    )
+    len_sf, base_dir, subfolders = find_dicom_folders_with_base(input_dir)
+    output_dirs = []
+
+    if relative and os.path.isabs(output_dir):
+        relative = False
+        typer.secho(
+            "WARN: '--relative' x 'absolute --output_dir' are incompatible, absolute 'output_dir' takes precedence",
+            fg="yellow",
+        )
+        output_dirs = [x.replace(base_dir, output_dir) for x in subfolders]
+    elif relative and not os.path.isabs(output_dir):
+        output_dirs = [os.path.join(x, output_dir) for x in subfolders]
+    else:
+        output_dir = os.path.abspath(output_dir)
+        output_dirs = [x.replace(base_dir, output_dir) for x in subfolders]
+
+    tasks = list(zip(subfolders, output_dirs))
 
     with Pool(n_jobs) as pool:
-        results = list(tqdm(pool.imap(process_dcm_with_args, subfolders), total=len(subfolders)))
+        results = list(
+            tqdm(
+                pool.imap(
+                    partial(
+                        process_task,
+                        image_format=image_format,
+                        overwrite=overwrite,
+                        verbose=verbose,
+                        keep=keep,
+                        mapping=mapping,
+                    ),
+                    tasks,
+                ),
+                total=len_sf,
+            )
+        )
+
+    unique_sorted_results = sorted(set(results))  # (study_id, patient_id)
+    dict_res = dict(unique_sorted_results)
+
+    if mapping:
+        with open(mapping) as file:
+            reader = csv.reader(file)
+            mapping_study_ids = set(row[1] for row in reader)
+
+        missing_study_ids = set(result[0] for result in unique_sorted_results) - mapping_study_ids
+
+        for study_id in missing_study_ids:
+            typer.secho(
+                f"Missing map in {mapping}: {dict_res[study_id]} -> {study_id} (<- new hash created)", fg="yellow"
+            )
+    else:
+        process_and_save_csv(unique_sorted_results, RESERVED_CSV)
 
     print(f"Processed {len(results)} DICOM folders.")
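The output-directory resolution above reduces to two behaviours: with --relative, exported data lands inside each input subfolder; otherwise the input tree is mirrored under output_dir, and an absolute --output_dir always takes precedence over --relative. A minimal standalone sketch of the same rules (the paths here are hypothetical, not part of this patch):

    # Sketch of the resolution logic in main(); the input paths are made up.
    import os

    base_dir = "/data/scans"
    subfolders = ["/data/scans/010-0001/20180724_L", "/data/scans/010-0002/20180926_R"]

    def resolve(output_dir: str, relative: bool) -> list[str]:
        if relative and os.path.isabs(output_dir):
            relative = False  # absolute output_dir takes precedence
        if relative:
            # exported data lands inside each input subfolder
            return [os.path.join(x, output_dir) for x in subfolders]
        output_dir = os.path.abspath(output_dir)
        # mirror the input tree under output_dir
        return [x.replace(base_dir, output_dir) for x in subfolders]

    print(resolve("exported_data", relative=True))
    # ['/data/scans/010-0001/20180724_L/exported_data', '/data/scans/010-0002/20180926_R/exported_data']
    print(resolve("/out", relative=False))
    # ['/out/010-0001/20180724_L', '/out/010-0002/20180926_R']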
diff --git a/process_dcm/utils.py b/process_dcm/utils.py
index d4c7d81..83559c6 100644
--- a/process_dcm/utils.py
+++ b/process_dcm/utils.py
@@ -1,9 +1,12 @@
 """utils module."""
 
+import csv
+import filecmp
 import hashlib
 import json
 import os
 import shutil
+import tempfile
 import warnings
 from collections import defaultdict
 from datetime import datetime
@@ -16,7 +19,7 @@
 from pydicom.dataset import FileDataset
 from pydicom.filereader import dcmread
 
-from process_dcm.const import ImageModality
+from process_dcm.const import RESERVED_CSV, ImageModality
 
 warnings.filterwarnings("ignore", category=UserWarning, message="A value of type *")
@@ -123,12 +126,17 @@
     return meta
 
 
-def process_dcm_meta(dcm_objs: list[FileDataset], output_dir: str) -> None:
+def process_dcm_meta(
+    dcm_objs: list[FileDataset], output_dir: str, mapping: str = "", keep: str = ""
+) -> tuple[str, str]:
     """Extract and save metadata from a list of DICOM files to a JSON file.
 
     Args:
         dcm_objs (list[FileDataset]): A list of FileDataset objects representing the DICOM files.
         output_dir (str): The directory where the metadata JSON file will be saved.
+        mapping (str): Optional path to the CSV file containing patient ID to study ID mapping.
+            If not provided and patient_id is not anonymised, a '{RESERVED_CSV}' file will be generated.
+        keep (str): String containing the letters indicating which fields to keep (p, n, d, D, g).
 
     Returns:
-        None
+        tuple[str, str]: The (new_patient_key, original_patient_key) pair for the processed files.
@@ -141,17 +152,56 @@
     metadata["images"]["images"] = []
     metadata["parser_version"] = [1, 5, 2]
     metadata["py_dcm_version"] = [0, 1, 0]
+
+    keep_gender = "g" in keep
+    keep_names = "n" in keep
+    keep_patient_key = "p" in keep
+
+    # Read the mapping file if provided
+    patient_to_study = {}
+    if mapping:
+        patient_to_study = dict(read_csv(mapping))
+
+    original_patient_key = ""
+    new_patient_key = ""
+
     for dcm_obj in dcm_objs:
-        metadata["patient"]["patient_key"] = dcm_obj.get("PatientID")
-        metadata["patient"]["first_name"] = dcm_obj.get("PatientName.name_prefix")
-        metadata["patient"]["last_name"] = dcm_obj.get("PatientName.name_suffix")
-        metadata["patient"]["date_of_birth"] = do_date(dcm_obj.get("PatientBirthDate"), "%Y%m%d", "%Y-%m-%d")
-        metadata["patient"]["gender"] = dcm_obj.get("PatientSex")
+        patient_key = dcm_obj.get("PatientID", "")
+        original_patient_key = patient_key
+        if patient_key in patient_to_study:
+            patient_key = patient_to_study[patient_key]
+        elif not keep_patient_key:
+            patient_key = get_hash(patient_key)
+
+        new_patient_key = patient_key
+
+        first_name = dcm_obj.get("PatientName.name_prefix")
+        last_name = dcm_obj.get("PatientName.name_suffix")
+        if not keep_names:
+            first_name = None if first_name else first_name
+            last_name = None if last_name else last_name
+
+        date_of_birth = do_date(dcm_obj.get("PatientBirthDate", "10010101"), "%Y%m%d", "%Y-%m-%d")
+        if "D" in keep:
+            year = date_of_birth[:4]
+            date_of_birth = f"{year}-01-01"
+        elif "d" not in keep:
+            date_of_birth = "1001-01-01"
+
+        gender = dcm_obj.get("PatientSex")
+        if not keep_gender:
+            gender = None if gender else gender
+
+        metadata["patient"]["patient_key"] = patient_key
+        metadata["patient"]["first_name"] = first_name
+        metadata["patient"]["last_name"] = last_name
+        metadata["patient"]["date_of_birth"] = date_of_birth
+        metadata["patient"]["gender"] = gender
         metadata["patient"]["source_id"] = dcm_obj.get("StudyInstanceUID")
         metadata["exam"]["manufacturer"] = dcm_obj.get("Manufacturer")
         metadata["exam"]["scan_datetime"] = do_date(
-            dcm_obj.get("AcquisitionDateTime"), "%Y%m%d%H%M%S.%f", "%Y-%m-%d %H:%M:%S"
+            dcm_obj.get("AcquisitionDateTime", "00000000"), "%Y%m%d%H%M%S.%f", "%Y-%m-%d %H:%M:%S"
         )
         metadata["exam"]["scanner_model"] = dcm_obj.get("ManufacturerModelName")
         metadata["exam"]["scanner_serial_number"] = dcm_obj.get("DeviceSerialNumber")
@@ -174,6 +224,13 @@
     with open(meta_file, "w") as f:
         json.dump(metadata, f, indent=4)
 
+    return (new_patient_key, original_patient_key)
+
+
+process_dcm_meta.__doc__ = (
+    process_dcm_meta.__doc__.format(RESERVED_CSV=RESERVED_CSV) if process_dcm_meta.__doc__ else None
+)
+
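The keep string acts as an allow-list: each present letter preserves one field, and everything else is blanked or replaced. A minimal sketch of just the date-of-birth rule, mirrored from process_dcm_meta above (the helper name and sample values are illustrative, not part of the diff):

    # Mirror of the DOB handling in process_dcm_meta; dob arrives as "YYYY-MM-DD".
    def anonymise_dob(dob: str, keep: str) -> str:
        if "D" in keep:
            return f"{dob[:4]}-01-01"  # keep the year only
        if "d" in keep:
            return dob  # keep the full date of birth
        return "1001-01-01"  # fully anonymised sentinel date

    assert anonymise_dob("1902-02-02", keep="D") == "1902-01-01"
    assert anonymise_dob("1902-02-02", keep="d") == "1902-02-02"
    assert anonymise_dob("1902-02-02", keep="") == "1001-01-01"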
 
 def update_modality(dcm: FileDataset) -> bool:
     """Updates the modality of the given DICOM object based on its Manufacturer and SeriesDescription attributes.
@@ -217,20 +274,24 @@
 def process_dcm(
     input_dir: str | Path,
     image_format: str = "png",
     output_dir: str = "exported_data",
+    mapping: str = "",
+    keep: str = "",
     overwrite: bool = False,
     verbose: bool = False,
-) -> None:
+) -> tuple[str, str]:
     """Process DICOM files from the input directory and save images in the specified format.
 
     Args:
         input_dir (str|Path): Path to the directory containing DICOM files.
         output_dir (str): Path to the directory where images will be saved. Defaults to
             "__input_dir__/exported_data". Use full path if wanting to save to a specific folder.
+        mapping (str): Optional path to the CSV file containing patient ID to study ID mapping.
+            If not provided and patient_id is not anonymised, a '{RESERVED_CSV}' file will be generated.
         image_format (str): The format in which to save the images. Defaults to "png".
         overwrite (bool): Whether to overwrite existing files in the output directory. Defaults to False.
         verbose (bool, optional): Whether to print out progress information during processing. Defaults to True.
+        keep (str): String containing the letters indicating which fields to keep (p, n, d, D, g).
     """
-    output_dir = set_output_dir(input_dir, output_dir)
     if overwrite:
         shutil.rmtree(output_dir, ignore_errors=True)
@@ -271,25 +333,47 @@
 
         dcms.append(dcm)
 
-    process_dcm_meta(dcms, output_dir)
+    return process_dcm_meta(dcm_objs=dcms, output_dir=output_dir, mapping=mapping, keep=keep)
+
+
+process_dcm.__doc__ = process_dcm.__doc__.format(RESERVED_CSV=RESERVED_CSV) if process_dcm.__doc__ else None
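Since process_dcm now reports the (new, original) patient-key pair, a caller can process a single folder and see which pseudonym was assigned. A minimal driver sketch (the output path is hypothetical; the expected pair matches what test_process_task asserts below):

    from process_dcm.utils import process_dcm

    new_key, original_key = process_dcm(
        input_dir="tests/example-dcms",
        output_dir="/tmp/exported",  # hypothetical destination
        image_format="png",
        overwrite=True,
    )
    print(f"{original_key} -> {new_key}")  # bbff7a25-... -> 0780320450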
+
+def find_dicom_folders_with_base(root_folder: str) -> tuple[int, str, list[str]]:
+    """Finds all unique subfolders within the root folder that contain at least one DICOM (.dcm) file.
 
-def find_dcm_subfolders(root_folder: str) -> list[str]:
-    """Finds all unique subfolders within the root folder that contain at least one DCM file.
+    It also returns the common base directory and the number of found subfolders.
 
     Args:
-        root_folder: The path to the root folder to search.
+        root_folder (str): The path to the root folder to search for subfolders containing DICOM files.
 
     Returns:
-        A naturally sorted list of unique full paths of subfolders containing DCM files.
+        tuple[int, str, list[str]]: A tuple containing:
+            - int: The number of subfolders containing at least one DICOM file.
+            - str: The base directory common to all found subfolders, or an empty string if none found.
+            - list[str]: A naturally sorted list of unique full paths of subfolders containing DICOM files.
+
+    Example:
+        find_dicom_folders_with_base("/data/patient")
+    """
     unique_subfolders = set()
-
     for dirpath, _, filenames in os.walk(root_folder):
         if any(filename.lower().endswith(".dcm") for filename in filenames):
-            unique_subfolders.add(dirpath)  # Store full path
+            unique_subfolders.add(dirpath)  # Add the full path of subfolders containing DCM files
+
+    folders = list(unique_subfolders)
+    len_ins = len(folders)
+
+    if len_ins == 0:
+        return 0, "", []
 
-    return natsorted(list(unique_subfolders))
+    if len_ins == 1:
+        base_dir = folders[0].rstrip("/")  # Strip trailing slash if present
+        base_dir = os.path.dirname(base_dir)
+    else:
+        base_dir = os.path.commonpath(folders)  # Get the common base directory
+
+    return len_ins, base_dir, natsorted(folders)
 
 
 def get_md5(file_path: Path | str | list[str]) -> str:
@@ -305,3 +389,122 @@
     while chunk := f.read(4096):
         md5_hash.update(chunk)
     return md5_hash.hexdigest()
+
+
+def get_hash(value: str) -> str:
+    """Get a 10 digit hash based on the input string."""
+    hex_dig = hashlib.sha256(str(value).encode()).hexdigest()
+    return f"{int(hex_dig[:8], 16):010}"
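get_hash is deterministic: it folds the first 8 hex digits of a SHA-256 digest into a zero-padded 10-digit decimal, so the same PatientID always yields the same pseudonym across runs. For example (the expected value comes from test_process_task below):

    import hashlib

    def get_hash(value: str) -> str:
        hex_dig = hashlib.sha256(str(value).encode()).hexdigest()
        return f"{int(hex_dig[:8], 16):010}"

    # Same input, same pseudonym — the pair asserted by test_process_task.
    assert get_hash("bbff7a25-d32c-4192-9330-0bb01d49f746") == "0780320450"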
+            if verbose:
+                print(f"No changes detected. {reserved_csv} remains unchanged.")
+        else:
+            version = 1
+            new_version_filename = get_versioned_filename(reserved_csv, version)
+            while os.path.exists(new_version_filename):
+                version += 1
+                new_version_filename = get_versioned_filename(reserved_csv, version)
+
+            shutil.move(reserved_csv, new_version_filename)
+            if verbose:
+                print(f"Old {reserved_csv} renamed to {new_version_filename}")
+            shutil.move(temp_filename, reserved_csv)
+            if verbose:
+                print(f"New generated mapping saved to {reserved_csv}")
+    else:
+        shutil.move(temp_filename, reserved_csv)
+        if verbose:
+            print(f"Generated mapping saved to {reserved_csv}")
+
+
+def read_csv(file_path: str) -> list[list[str]]:
+    """Reads a CSV file and returns its contents as a list of rows.
+
+    Each row is represented as a list of strings.
+
+    Args:
+        file_path (str): The path to the CSV file to be read.
+
+    Returns:
+        list[list[str]]: A list of rows, where each row is a list of strings representing the CSV data.
+    """
+    with open(file_path) as file:
+        reader = csv.reader(file)
+        return list(reader)
+
+
+def write_to_csv(file_path: str | Path, data: list[list[str]], header: list[str] = []) -> None:
+    """Writes data to a CSV file at the specified file path.
+
+    Args:
+        file_path (str|Path): The path to the CSV file.
+        data (list[list[str]]): The data to write to the CSV file. Each sublist represents a row.
+        header (list[str], optional): An optional list representing the CSV header.
+            Defaults to an empty list.
+    """
+    file_path = Path(file_path)
+    with file_path.open(mode="w", newline="") as file:
+        writer = csv.writer(file)
+        if header:
+            writer.writerow(header)
+        writer.writerows(data)
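The reserved CSV is written with a study_id,patient_id header and rotated rather than overwritten: identical content is a no-op, while changed content shelves the previous file as patient_2_study_id_1.csv, _2, and so on. A small sketch of that behaviour (run in a scratch directory; the row values are hypothetical):

    import os
    import tempfile
    from process_dcm.utils import process_and_save_csv, read_csv

    os.chdir(tempfile.mkdtemp())  # work in a throwaway directory
    rows = [["0780320450", "bbff7a25-d32c-4192-9330-0bb01d49f746"]]
    process_and_save_csv(rows, "patient_2_study_id.csv", verbose=True)  # first write
    process_and_save_csv(rows, "patient_2_study_id.csv", verbose=True)  # identical -> unchanged
    process_and_save_csv([["0012345678", "010-0001"]], "patient_2_study_id.csv", verbose=True)
    print(sorted(os.listdir(".")))
    # ['patient_2_study_id.csv', 'patient_2_study_id_1.csv']
    print(read_csv("patient_2_study_id.csv"))
    # [['study_id', 'patient_id'], ['0012345678', '010-0001']]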
diff --git a/tests/conftest.py b/tests/conftest.py
index 4a4ea7a..ab165c2 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,5 +1,7 @@
+import os
 import tempfile
 from collections import defaultdict
+from collections.abc import Generator
 from pathlib import Path
 
 import pytest
@@ -7,6 +9,7 @@
 from typer.testing import CliRunner
 
 from process_dcm import __version__ as version
+from process_dcm.const import ImageModality
 
 
 def pytest_report_header():
@@ -98,7 +101,56 @@
 def dicom_base():
     """Base fixture for creating a mocked DICOM FileDataset."""
     dataset = FileDataset("test.dcm", {}, file_meta=defaultdict(str), preamble=b"\0" * 128)
-    dataset.Modality = ""
+    dataset.AccessionNumber = 0
+    dataset.Modality = ImageModality.OCT
+    dataset.PatientBirthDate = "19020202"
     dataset.Manufacturer = ""
     dataset.SeriesDescription = ""
+    dataset.PatientID = "bbff7a25-d32c-4192-9330-0bb01d49f746"
     return dataset
+
+
+@pytest.fixture
+def csv_data():
+    return [
+        ["study_id_1", "patient_id_1"],
+        ["study_id_2", "patient_id_2"],
+    ]
+
+
+@pytest.fixture
+def unique_sorted_results():
+    return [
+        ["study_id_3", "patient_id_3"],
+        ["study_id_4", "patient_id_4"],
+    ]
+
+
+@pytest.fixture
+def janitor() -> Generator[list[str], None, None]:
+    to_delete: list[str] = []
+    yield to_delete
+    del_file_paths(to_delete)
+
+
+def del_file_paths(file_paths: list[str]) -> None:
+    """Deletes all files and folders in the list of file paths.
+
+    Args:
+        file_paths (list[str]): A list of file paths to delete.
+
+    Returns:
+        None
+    """
+    for path in file_paths:
+        if not os.path.exists(path):
+            continue
+        if os.path.isfile(path):
+            os.remove(path)
+        elif os.path.isdir(path):
+            for root, dirs, files in os.walk(path, topdown=False):
+                for name in files:
+                    os.remove(os.path.join(root, name))
+                for name in dirs:
+                    os.rmdir(os.path.join(root, name))
+            os.rmdir(path)
diff --git a/tests/map.csv b/tests/map.csv
new file mode 100644
index 0000000..db209f8
--- /dev/null
+++ b/tests/map.csv
@@ -0,0 +1,2 @@
+bbff7a25-d32c-4192-9330-0bb01d49f746,00123
+010-0001,012345
diff --git a/tests/test_main.py b/tests/test_main.py
index 02d91e2..ad2508f 100644
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -2,8 +2,12 @@
 from glob import glob
 from pathlib import Path
 
+import pytest
+import typer
+
 from process_dcm import __version__
-from process_dcm.main import app, cli, main
+from process_dcm.const import RESERVED_CSV
+from process_dcm.main import app, cli, main, process_task
 from process_dcm.utils import get_md5
 
 
@@ -52,21 +56,150 @@
     assert "Missing argument 'INPUT_DIR'" in captured.err
 
 
-def test_main():
+@pytest.mark.parametrize(
+    "md5, meta, keep",
+    [
+        (["837808d746aef8e2dd08defbdbc70818"], "0a9a930806f2784aa4e60d47b3bad6ed", "pndg"),
+        (["7a355bb7e0c95155d1541c7fe0941c5e"], "fd6c5a84aca6499b0ea8b99d4e25dc92", "pnDg"),
+        (["2319181ecfc33d35b01dcec65ab2c568"], "35fe295648681e3521da8dddaed63705", ""),
+    ],
+)
+def test_main(md5, meta, keep, janitor):
     input_dir = "tests/example-dcms"
     image_format = "png"
     n_jobs = 1
     overwrite = False
     verbose = True
-    md5 = ["837808d746aef8e2dd08defbdbc70818"]
-
+    mapping = ""
+    janitor.append("patient_2_study_id.csv")
     # Create a temporary directory using the tempfile module
     with tempfile.TemporaryDirectory() as tmpdirname:
         output_dir = Path(tmpdirname)
-
-        # Run your app's main function with the test inputs
-        main(input_dir, image_format, str(output_dir), n_jobs, overwrite, verbose)
-        of = sorted(glob(f"{output_dir}/*"))
+        main(
+            input_dir=input_dir,
+            image_format=image_format,
+            output_dir=str(output_dir),
+            n_jobs=n_jobs,
+            overwrite=overwrite,
+            verbose=verbose,
+            keep=keep,
+            mapping=mapping,
+        )
+        of = sorted(glob(f"{output_dir}/**/*"))
         assert len(of) == 51
-        assert get_md5(output_dir / "metadata.json") == "0a9a930806f2784aa4e60d47b3bad6ed"
+        assert get_md5(output_dir / "example-dcms/metadata.json") == meta
         assert get_md5(of) in md5
+
+
+def test_main_mapping(janitor):
+    input_dir = "tests/example-dcms"
+    image_format = "png"
+    n_jobs = 1
+    overwrite = False
+    verbose = True
+    janitor.append("patient_2_study_id.csv")
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        output_dir = Path(tmpdirname)
+        main(
+            input_dir=input_dir,
+            image_format=image_format,
+            output_dir=str(output_dir),
+            n_jobs=n_jobs,
+            overwrite=overwrite,
+            verbose=verbose,
+            keep="p",
+            mapping="tests/map.csv",
+        )
+        of = sorted(glob(f"{output_dir}/**/*"))
+        assert len(of) == 51
+        assert get_md5(output_dir / "example-dcms/metadata.json") == "261826ad2e067e9adb7143bb6c053dbc"
+        assert get_md5(of) == "6ff8e2fe69c5fbe86f81f44f74496cab"
+
+
+def test_main_abort():
+    input_dir = "tests/example-dcms"
+    image_format = "png"
+    n_jobs = 1
+    overwrite = False
+    verbose = True
+    # Expect the typer.Abort exception to be raised
+    with pytest.raises(typer.Abort):
+        main(
+            input_dir=input_dir,
+            image_format=image_format,
+            output_dir="/tmp",
+            n_jobs=n_jobs,
+            overwrite=overwrite,
+            verbose=verbose,
+            keep="p",
+            mapping=RESERVED_CSV,
+        )
+
+
+def test_main_mapping_example_dir(janitor):
+    input_dir = "tests/example_dir"
+    image_format = "png"
+    n_jobs = 2
+    overwrite = True
+    verbose = True
+    janitor.append("patient_2_study_id.csv")
+    janitor.append("patient_2_study_id_1.csv")
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        output_dir = Path(tmpdirname)
+        main(
+            input_dir=input_dir,
+            image_format=image_format,
+            output_dir=str(output_dir),
+            n_jobs=n_jobs,
+            overwrite=overwrite,
+            verbose=verbose,
+            keep="nDg",
+            mapping="tests/map.csv",
+        )
+        of = sorted(glob(f"{output_dir}/**/**/*"))
+        assert len(of) == 262
+        assert get_md5(output_dir / "010-0001/20180724_L/metadata.json") == "1b46961177c80daf69e7dea7379fcc31"
+        assert get_md5(output_dir / "010-0002/20180926_R/metadata.json") == "bbf5c47f9fb28f46b4cc1bf08c311593"
+
+
+def test_main_mapping_example_dir_relative(janitor):
+    input_dir = "tests/example_dir"
+    image_format = "png"
+    n_jobs = 2
+    overwrite = False
+    verbose = True
+    relative = True
+    janitor.append("patient_2_study_id.csv")
+    janitor.append("patient_2_study_id_1.csv")
+    main(
+        input_dir=input_dir,
+        image_format=image_format,
+        output_dir="dummy",
+        n_jobs=n_jobs,
+        overwrite=overwrite,
+        verbose=verbose,
+        keep="nDg",
+        mapping="tests/map.csv",
+        relative=relative,
+    )
+    of = sorted(glob(f"{input_dir}/**/**/dummy/*"))
+    path1 = Path(input_dir) / "010-0001/20180724_L/dummy"
+    path2 = Path(input_dir) / "010-0002/20180926_R/dummy"
+    janitor.append(path1)
+    janitor.append(path2)
+    assert len(of) == 262
+    assert get_md5(path1 / "metadata.json") == "1b46961177c80daf69e7dea7379fcc31"
+    assert get_md5(path2 / "metadata.json") == "bbf5c47f9fb28f46b4cc1bf08c311593"
+
+
+def test_process_task():
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        output_dir = Path(tmpdirname)
+        task_data = ("tests/example-dcms/", str(output_dir))
+        image_format = "png"
+        overwrite = True
+        verbose = False
+        keep = ""
+        mapping = ""
+        result = process_task(task_data, image_format, overwrite, verbose, keep, mapping)
+        assert result == ("0780320450", "bbff7a25-d32c-4192-9330-0bb01d49f746")
diff --git a/tests/test_utils.py b/tests/test_utils.py
index d9c5b26..b3a45a0 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -1,3 +1,4 @@
+import json
 import os
 import tempfile
 from pathlib import Path
@@ -6,7 +7,17 @@
 import pytest
 
 from process_dcm.const import ImageModality
-from process_dcm.utils import do_date, find_dcm_subfolders, meta_images, process_dcm, set_output_dir, update_modality
+from process_dcm.utils import (
+    do_date,
+    meta_images,
+    process_and_save_csv,
+    process_dcm,
+    process_dcm_meta,
+    read_csv,
+    set_output_dir,
+    update_modality,
+    write_to_csv,
+)
 
 
 def test_meta_images_optopol(dicom_opotopol, mocker):
@@ -102,17 +113,6 @@
     assert do_date("InvalidDate", "%Y%m%d", "%Y-%m-%d") == ""
 
 
-def test_find_dcm_subfolders() -> None:
-    with tempfile.TemporaryDirectory() as tmpdir:
-        dcm_folder1 = os.path.join(tmpdir, "folder1")
-        dcm_folder2 = os.path.join(tmpdir, "folder2")
-        os.makedirs(dcm_folder1)
-        os.makedirs(dcm_folder2)
-        open(os.path.join(dcm_folder1, "image.dcm"), "w").close()
-        subfolders = find_dcm_subfolders(tmpdir)
-        assert len(subfolders) == 1
-
-
 def test_update_modality_opt(dicom_base):
     """Test updating modality when the modality is OPT."""
     dicom_base.Modality = "OPT"
@@ -171,3 +171,69 @@ def test_update_modality_op_various_descriptions(dicom_base, description, expect
     dicom_base.SeriesDescription = description
     assert update_modality(dicom_base) is True
     assert dicom_base.Modality == expected_modality
+
+
+def test_process_dcm_meta_with_D_in_keep_and_mapping(dicom_base):
+    # Call the function with "D" in keep
+    with tempfile.TemporaryDirectory() as tmpdir:
+        process_dcm_meta([dicom_base], tmpdir, keep="D", mapping="tests/map.csv")
+        rjson = json.load(open(os.path.join(tmpdir, "metadata.json")))
+        assert rjson["patient"]["date_of_birth"] == "1902-01-01"
+        assert rjson["patient"]["patient_key"] == "00123"
+
+
+def test_process_and_save_csv_no_existing_file(unique_sorted_results):
+    with tempfile.TemporaryDirectory() as temp_dir:
+        reserved_csv = Path(temp_dir) / "reserved.csv"
+
+        # Process and save new CSV data with no existing reserved CSV
+        process_and_save_csv(unique_sorted_results, reserved_csv, verbose=True)
+
+        # Check if reserved CSV was created and contains the expected data
+        created_data = read_csv(reserved_csv)
+        expected_data = [["study_id", "patient_id"], *unique_sorted_results]
+        assert created_data == expected_data, f"Expected {expected_data}, but got {created_data}"
+
+
+def test_process_and_save_csv_with_existing_file(csv_data, unique_sorted_results):
+    with tempfile.TemporaryDirectory() as temp_dir:
+        reserved_csv = Path(temp_dir) / "reserved.csv"
+
+        # Create initial reserved CSV with initial csv_data
+        write_to_csv(reserved_csv, csv_data, header=["study_id", "patient_id"])
+
+        # Process and save new CSV data
+        process_and_save_csv(unique_sorted_results, reserved_csv, verbose=True)
+
+        # Check if reserved CSV was updated
+        updated_data = read_csv(reserved_csv)
+        expected_data = [["study_id", "patient_id"], *unique_sorted_results]
+        assert updated_data == expected_data, f"Expected {expected_data}, but got {updated_data}"
+
+        # Check if backup was created
+        backup_file = Path(temp_dir) / "reserved_1.csv"
+        assert backup_file.exists(), f"Expected backup file {backup_file} to exist"
+
+        backup_data = read_csv(backup_file)
+        expected_backup_data = [["study_id", "patient_id"], *csv_data]
+        assert backup_data == expected_backup_data, f"Expected {expected_backup_data}, but got {backup_data}"
+
+
+def test_process_and_save_csv_no_changes(csv_data):
+    with tempfile.TemporaryDirectory() as temp_dir:
+        reserved_csv = Path(temp_dir) / "reserved.csv"
+
+        # Create reserved CSV with initial csv_data
+        write_to_csv(reserved_csv, csv_data, header=["study_id", "patient_id"])
+
+        # Process and save the same CSV data
+        process_and_save_csv(csv_data, reserved_csv, verbose=True)
+
+        # Check if reserved CSV remains unchanged
+        unchanged_data = read_csv(reserved_csv)
+        expected_data = [["study_id", "patient_id"], *csv_data]
+        assert unchanged_data == expected_data, f"Expected {expected_data}, but got {unchanged_data}"
+
+        # Check that no backup was created
+        backup_file = Path(temp_dir) / "reserved_1.csv"
+        assert not backup_file.exists(), f"Did not expect backup file {backup_file} to exist"
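Taken together, a typical anonymising run mirrors the calls exercised in the tests above; a closing sketch (input paths as used by the test suite, output folder hypothetical):

    from process_dcm.main import main

    main(
        input_dir="tests/example_dir",
        image_format="png",
        output_dir="exported_data",  # hypothetical; resolved to an absolute path
        relative=False,
        n_jobs=2,
        mapping="tests/map.csv",  # mapped patients keep their assigned study IDs
        keep="nDg",               # keep names and gender, reduce DOB to year only
        overwrite=True,
        verbose=True,
    )
    # Patients missing from tests/map.csv are reported with their newly hashed IDs;
    # without -m/--mapping, the generated pairs are saved to patient_2_study_id.csv.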