From eea973f74b25bfb69510499c6337ee50b0be6c6e Mon Sep 17 00:00:00 2001
From: Alan Silva <3899850+alanwilter@users.noreply.github.com>
Date: Thu, 5 Sep 2024 16:17:09 +0100
Subject: [PATCH] feat: :sparkles: Added anonymiser option by default

---
 .pre-commit-config.yaml |  10 +-
 .talismanrc             |  14 +--
 process_dcm/const.py    |   2 +
 process_dcm/main.py     | 107 ++++++++++++++++--
 process_dcm/utils.py    | 239 +++++++++++++++++++++++++++++++++++++---
 tests/conftest.py       |  54 ++++++++-
 tests/map.csv           |   2 +
 tests/test_main.py      | 151 +++++++++++++++++++++++--
 tests/test_utils.py     | 114 +++++++++++++++++--
 9 files changed, 629 insertions(+), 64 deletions(-)
 create mode 100644 tests/map.csv

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 5f1105c..2af51a1 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -8,7 +8,7 @@ repos:
       - id: check-shebang-scripts-are-executable
       - id: check-added-large-files
         args:
-          - "--maxkb=10000"
+          - "--maxkb=250000"
      # - id: debug-statements # not for process_dcm
       - id: check-yaml
         exclude: mkdocs.yml
@@ -26,18 +26,18 @@ repos:
       - id: check-merge-conflict
       - id: check-docstring-first
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.5.5
+    rev: v0.6.3
     hooks:
       - id: ruff
        args: ["--fix"]
        files: ^hooks
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.5.5
+    rev: v0.6.3
     hooks:
       - id: ruff-format
        files: ^hooks
   - repo: https://github.com/commitizen-tools/commitizen
-    rev: v3.28.0
+    rev: v3.29.0
     hooks:
       - id: commitizen
        stages:
@@ -50,7 +50,7 @@ repos:
       - prettier # not for process_dcm
   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.11.1
+    rev: v1.11.2
     hooks:
       - id: mypy
        exclude: ^tests/
diff --git a/.talismanrc b/.talismanrc
index 2074e24..2e82d7d 100644
--- a/.talismanrc
+++ b/.talismanrc
@@ -1,12 +1,10 @@
 fileignoreconfig:
-  - filename: .cruft.json
-    checksum: 03b17a0041ffe209156ea08f01b12f2cbb770b4570a6f6678ce8dcae4d945535
   - filename: tests/test_main.py
-    checksum: 3e4ad81306892573518b0e85e38544a978e160e32608229231f8d9f2b2dcd83c
-  - filename: poetry.lock
-    checksum: f4cee8b1c6d6f33b5f34cbbfc093008bcfe7820a26df4225ccea2b3fa5e65e15
+    checksum: efa949f7b639498667939891d29938f01047b7b9a70ffd61a74d47cd127ddbf8
+  - filename: process_dcm/main.py
+    checksum: 3b69ef0a904c0b9e04574fc70e4a35bcf0ed23a75ef9ff99ce6bf80cbaa43551
+  - filename: tests/test_utils.py
+    checksum: 00912e74713f45921e593b5f1f98e23b324ab13bd5c9d44edb24c49a08f325b8
   - filename: process_dcm/utils.py
-    checksum: 4427feb0e00b54eeda3d878b1457be0c0b62f7c39f68e0691442e4ad3988bbf2
-  - filename: .github/workflows/release.yml
-    checksum: 86fbb1303400278bbebbe4523cd4746b4f5e8d1ef036371c8e7a9cbfc0adc254
+    checksum: fdb35056dbd109dbbac6fd5d3668fbff90ace8f46aa37b787eaa2572c02d0aaf
 version: ""
diff --git a/process_dcm/const.py b/process_dcm/const.py
index 464b66c..6776666 100644
--- a/process_dcm/const.py
+++ b/process_dcm/const.py
@@ -3,6 +3,8 @@
 from enum import Enum, Flag, auto, unique
 from typing import cast
 
+RESERVED_CSV = "patient_2_study_id.csv"
+
 
 class ModalityFlag(Flag):
     """A flag representing different modalities in DICOM files."""
diff --git a/process_dcm/main.py b/process_dcm/main.py
index 33f14ab..6de8bc1 100644
--- a/process_dcm/main.py
+++ b/process_dcm/main.py
@@ -1,5 +1,7 @@
 """app to procces DCM files."""
 
+import csv
+import os
 from functools import partial
 from multiprocessing import Pool
 
@@ -7,13 +9,11 @@
 from tqdm import tqdm
 
 from process_dcm import __version__
-from process_dcm.utils import find_dcm_subfolders, process_dcm
+from process_dcm.const import RESERVED_CSV
+from process_dcm.utils import find_dicom_folders_with_base, process_and_save_csv, process_dcm
 
 app = typer.Typer(context_settings={"help_option_names": ["-h", "--help"]})
 
-out_msg = """Output directory for extracted images and metadata. Defaults to: __input_dir__/exported_data\n
-Use absolute path if you want to save the output in a specific location."""
-
 
 def print_version(value: bool) -> None:
     """Print the version of the app."""
@@ -22,14 +22,50 @@
     raise typer.Exit()
 
 
+def process_task(
+    task: tuple[str, str], image_format: str, overwrite: bool, verbose: bool, keep: str, mapping: str
+) -> tuple[str, str]:
+    """Unpack a (subfolder, output_dir) task and run process_dcm on it."""
+    subfolder, out_dir = task
+    return process_dcm(
+        input_dir=subfolder,
+        output_dir=out_dir,
+        image_format=image_format,
+        overwrite=overwrite,
+        verbose=verbose,
+        keep=keep,
+        mapping=mapping,
+    )
+
+
 @app.command()
 def main(
     input_dir: str = typer.Argument(..., help="Input directory containing subfolders with DICOM files."),
     image_format: str = typer.Option(
         "png", "-f", "--image_format", help="Image format for extracted images (png, jpg, webp). Defaults to: png"
     ),
-    output_dir: str = typer.Option("exported_data", "-o", "--output_dir", help=out_msg),
+    output_dir: str = typer.Option(
+        "exported_data",
+        "-o",
+        "--output_dir",
+        help="Output directory for extracted images and metadata. Defaults to: exported_data",
+    ),
+    relative: bool = typer.Option(
+        False, "-r", "--relative", help="Save extracted data in folders relative to _input_dir_."
+    ),
     n_jobs: int = typer.Option(1, "-j", "--n_jobs", help="Number of parallel jobs. Defaults to: 1"),
+    mapping: str = typer.Option(
+        "",
+        "-m",
+        "--mapping",
+        help=f"Path to CSV containing patient_id to study_id mapping. If not provided and patient_id is not anonymised, a '{RESERVED_CSV}' file will be generated",  # noqa: E501
+    ),
+    keep: str = typer.Option(
+        "",
+        "-k",
+        "--keep",
+        help="Keep the specified fields (p: patient_key, n: names, d: date_of_birth, D: year-only DOB, g: gender)",
+    ),
     overwrite: bool = typer.Option(False, "-w", "--overwrite", help="Overwrite existing images if found."),
     verbose: bool = typer.Option(False, "-v", "--verbose", help="Verbose output."),
     version: bool = typer.Option(
     ),
 ) -> None:
     """Process DICOM files in subfolders, extract images and metadata using parallel processing."""
-    subfolders = find_dcm_subfolders(input_dir)
+    if mapping == RESERVED_CSV:
+        typer.secho(f"Can't use reserved CSV file name: {RESERVED_CSV}", fg="red")
+        raise typer.Abort()
+    if "p" in keep and mapping:
+        typer.secho(f"WARN: '--keep p' x '--mapping': {mapping} will overwrite patient_id anyway", fg="yellow")
 
-    # Create a partial function with fixed arguments
-    process_dcm_with_args = partial(
-        process_dcm, image_format=image_format, output_dir=output_dir, overwrite=overwrite, verbose=verbose
-    )
+    len_sf, base_dir, subfolders = find_dicom_folders_with_base(input_dir)
+    output_dirs = []
+
+    if relative and os.path.isabs(output_dir):
+        relative = False
+        typer.secho(
+            "WARN: '--relative' x 'absolute --output_dir' are incompatible, absolute 'output_dir' takes precedence",
+            fg="yellow",
+        )
+        output_dirs = [x.replace(base_dir, output_dir) for x in subfolders]
+    elif relative and not os.path.isabs(output_dir):
+        output_dirs = [os.path.join(x, output_dir) for x in subfolders]
+    else:
+        output_dir = os.path.abspath(output_dir)
+        output_dirs = [x.replace(base_dir, output_dir) for x in subfolders]
+
+    tasks = list(zip(subfolders, output_dirs))
 
     with Pool(n_jobs) as pool:
-        results = list(tqdm(pool.imap(process_dcm_with_args, subfolders), total=len(subfolders)))
+        results = list(
+            tqdm(
+                pool.imap(
+                    partial(
+                        process_task,
+                        image_format=image_format,
+                        overwrite=overwrite,
+                        verbose=verbose,
+                        keep=keep,
+                        mapping=mapping,
+                    ),
+                    tasks,
+                ),
+                total=len_sf,
+            )
+        )
+
+    unique_sorted_results = sorted(set(results))  # (study_id, patient_id)
+    dict_res = dict(unique_sorted_results)
+
+    if mapping:
+        with open(mapping) as file:
+            reader = csv.reader(file)
+            mapping_study_ids = set(row[1] for row in reader)
+
+        missing_study_ids = set(result[0] for result in unique_sorted_results) - mapping_study_ids
+
+        for study_id in missing_study_ids:
+            typer.secho(
+                f"Missing map in {mapping}: {dict_res[study_id]} -> {study_id} (<- new hash created)", fg="yellow"
+            )
+    else:
+        process_and_save_csv(unique_sorted_results, RESERVED_CSV)
 
     print(f"Processed {len(results)} DICOM folders.")
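The output-directory resolution above reduces to two behaviours: with --relative, exported data lands inside each input subfolder; otherwise the input tree is mirrored under output_dir, and an absolute --output_dir always takes precedence over --relative. A minimal standalone sketch of the same rules (the paths here are hypothetical, not part of this patch):

    # Sketch of the resolution logic in main(); the input paths are made up.
    import os

    base_dir = "/data/scans"
    subfolders = ["/data/scans/010-0001/20180724_L", "/data/scans/010-0002/20180926_R"]

    def resolve(output_dir: str, relative: bool) -> list[str]:
        if relative and os.path.isabs(output_dir):
            relative = False  # absolute output_dir takes precedence
        if relative:
            # exported data lands inside each input subfolder
            return [os.path.join(x, output_dir) for x in subfolders]
        output_dir = os.path.abspath(output_dir)
        # mirror the input tree under output_dir
        return [x.replace(base_dir, output_dir) for x in subfolders]

    print(resolve("exported_data", relative=True))
    # ['/data/scans/010-0001/20180724_L/exported_data', '/data/scans/010-0002/20180926_R/exported_data']
    print(resolve("/out", relative=False))
    # ['/out/010-0001/20180724_L', '/out/010-0002/20180926_R']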
diff --git a/process_dcm/utils.py b/process_dcm/utils.py
index d4c7d81..83559c6 100644
--- a/process_dcm/utils.py
+++ b/process_dcm/utils.py
@@ -1,9 +1,12 @@
 """utils module."""
 
+import csv
+import filecmp
 import hashlib
 import json
 import os
 import shutil
+import tempfile
 import warnings
 from collections import defaultdict
 from datetime import datetime
@@ -16,7 +19,7 @@
 from pydicom.dataset import FileDataset
 from pydicom.filereader import dcmread
 
-from process_dcm.const import ImageModality
+from process_dcm.const import RESERVED_CSV, ImageModality
 
 warnings.filterwarnings("ignore", category=UserWarning, message="A value of type *")
@@ -123,12 +126,17 @@
     return meta
 
 
-def process_dcm_meta(dcm_objs: list[FileDataset], output_dir: str) -> None:
+def process_dcm_meta(
+    dcm_objs: list[FileDataset], output_dir: str, mapping: str = "", keep: str = ""
+) -> tuple[str, str]:
     """Extract and save metadata from a list of DICOM files to a JSON file.
 
     Args:
         dcm_objs (list[FileDataset]): A list of FileDataset objects representing the DICOM files.
         output_dir (str): The directory where the metadata JSON file will be saved.
+        mapping (str): Optional path to the CSV file containing patient ID to study ID mapping.
+            If not provided and patient_id is not anonymised, a '{RESERVED_CSV}' file will be generated.
+        keep (str): String containing the letters indicating which fields to keep (p, n, d, D, g).
 
     Returns:
-        None
+        tuple[str, str]: The (new_patient_key, original_patient_key) pair for the processed files.
@@ -141,17 +152,56 @@
     metadata["images"]["images"] = []
     metadata["parser_version"] = [1, 5, 2]
     metadata["py_dcm_version"] = [0, 1, 0]
+
+    keep_gender = "g" in keep
+    keep_names = "n" in keep
+    keep_patient_key = "p" in keep
+
+    # Read the mapping file if provided
+    patient_to_study = {}
+    if mapping:
+        patient_to_study = dict(read_csv(mapping))
+
+    original_patient_key = ""
+    new_patient_key = ""
+
     for dcm_obj in dcm_objs:
-        metadata["patient"]["patient_key"] = dcm_obj.get("PatientID")
-        metadata["patient"]["first_name"] = dcm_obj.get("PatientName.name_prefix")
-        metadata["patient"]["last_name"] = dcm_obj.get("PatientName.name_suffix")
-        metadata["patient"]["date_of_birth"] = do_date(dcm_obj.get("PatientBirthDate"), "%Y%m%d", "%Y-%m-%d")
-        metadata["patient"]["gender"] = dcm_obj.get("PatientSex")
+        patient_key = dcm_obj.get("PatientID", "")
+        original_patient_key = patient_key
+        if patient_key in patient_to_study:
+            patient_key = patient_to_study[patient_key]
+        elif not keep_patient_key:
+            patient_key = get_hash(patient_key)
+
+        new_patient_key = patient_key
+
+        first_name = dcm_obj.get("PatientName.name_prefix")
+        last_name = dcm_obj.get("PatientName.name_suffix")
+        if not keep_names:
+            first_name = None if first_name else first_name
+            last_name = None if last_name else last_name
+
+        date_of_birth = do_date(dcm_obj.get("PatientBirthDate", "10010101"), "%Y%m%d", "%Y-%m-%d")
+        if "D" in keep:
+            year = date_of_birth[:4]
+            date_of_birth = f"{year}-01-01"
+        elif "d" not in keep:
+            date_of_birth = "1001-01-01"
+
+        gender = dcm_obj.get("PatientSex")
+        if not keep_gender:
+            gender = None if gender else gender
+
+        metadata["patient"]["patient_key"] = patient_key
+        metadata["patient"]["first_name"] = first_name
+        metadata["patient"]["last_name"] = last_name
+        metadata["patient"]["date_of_birth"] = date_of_birth
+        metadata["patient"]["gender"] = gender
         metadata["patient"]["source_id"] = dcm_obj.get("StudyInstanceUID")
         metadata["exam"]["manufacturer"] = dcm_obj.get("Manufacturer")
         metadata["exam"]["scan_datetime"] = do_date(
-            dcm_obj.get("AcquisitionDateTime"), "%Y%m%d%H%M%S.%f", "%Y-%m-%d %H:%M:%S"
+            dcm_obj.get("AcquisitionDateTime", "00000000"), "%Y%m%d%H%M%S.%f", "%Y-%m-%d %H:%M:%S"
         )
         metadata["exam"]["scanner_model"] = dcm_obj.get("ManufacturerModelName")
         metadata["exam"]["scanner_serial_number"] = dcm_obj.get("DeviceSerialNumber")
@@ -174,6 +224,13 @@
     with open(meta_file, "w") as f:
         json.dump(metadata, f, indent=4)
 
+    return (new_patient_key, original_patient_key)
+
+
+process_dcm_meta.__doc__ = (
+    process_dcm_meta.__doc__.format(RESERVED_CSV=RESERVED_CSV) if process_dcm_meta.__doc__ else None
+)
+
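The keep string acts as an allow-list: each present letter preserves one field, and everything else is blanked or replaced. A minimal sketch of just the date-of-birth rule, mirrored from process_dcm_meta above (the helper name and sample values are illustrative, not part of the diff):

    # Mirror of the DOB handling in process_dcm_meta; dob arrives as "YYYY-MM-DD".
    def anonymise_dob(dob: str, keep: str) -> str:
        if "D" in keep:
            return f"{dob[:4]}-01-01"  # keep the year only
        if "d" in keep:
            return dob  # keep the full date of birth
        return "1001-01-01"  # fully anonymised sentinel date

    assert anonymise_dob("1902-02-02", keep="D") == "1902-01-01"
    assert anonymise_dob("1902-02-02", keep="d") == "1902-02-02"
    assert anonymise_dob("1902-02-02", keep="") == "1001-01-01"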
 
 def update_modality(dcm: FileDataset) -> bool:
     """Updates the modality of the given DICOM object based on its Manufacturer and SeriesDescription attributes.
@@ -217,20 +274,24 @@
 def process_dcm(
     input_dir: str | Path,
     image_format: str = "png",
     output_dir: str = "exported_data",
+    mapping: str = "",
+    keep: str = "",
     overwrite: bool = False,
     verbose: bool = False,
-) -> None:
+) -> tuple[str, str]:
     """Process DICOM files from the input directory and save images in the specified format.
 
     Args:
         input_dir (str|Path): Path to the directory containing DICOM files.
         output_dir (str): Path to the directory where images will be saved. Defaults to
             "__input_dir__/exported_data". Use full path if wanting to save to a specific folder.
+        mapping (str): Optional path to the CSV file containing patient ID to study ID mapping.
+            If not provided and patient_id is not anonymised, a '{RESERVED_CSV}' file will be generated.
         image_format (str): The format in which to save the images. Defaults to "png".
         overwrite (bool): Whether to overwrite existing files in the output directory. Defaults to False.
         verbose (bool, optional): Whether to print out progress information during processing. Defaults to True.
+        keep (str): String containing the letters indicating which fields to keep (p, n, d, D, g).
     """
-    output_dir = set_output_dir(input_dir, output_dir)
     if overwrite:
         shutil.rmtree(output_dir, ignore_errors=True)
@@ -271,25 +333,47 @@
 
         dcms.append(dcm)
 
-    process_dcm_meta(dcms, output_dir)
+    return process_dcm_meta(dcm_objs=dcms, output_dir=output_dir, mapping=mapping, keep=keep)
+
+
+process_dcm.__doc__ = process_dcm.__doc__.format(RESERVED_CSV=RESERVED_CSV) if process_dcm.__doc__ else None
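Since process_dcm now reports the (new, original) patient-key pair, a caller can process a single folder and see which pseudonym was assigned. A minimal driver sketch (the output path is hypothetical; the expected pair matches what test_process_task asserts below):

    from process_dcm.utils import process_dcm

    new_key, original_key = process_dcm(
        input_dir="tests/example-dcms",
        output_dir="/tmp/exported",  # hypothetical destination
        image_format="png",
        overwrite=True,
    )
    print(f"{original_key} -> {new_key}")  # bbff7a25-... -> 0780320450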
+
+def find_dicom_folders_with_base(root_folder: str) -> tuple[int, str, list[str]]:
+    """Finds all unique subfolders within the root folder that contain at least one DICOM (.dcm) file.
 
-def find_dcm_subfolders(root_folder: str) -> list[str]:
-    """Finds all unique subfolders within the root folder that contain at least one DCM file.
+    It also returns the common base directory and the number of found subfolders.
 
     Args:
-        root_folder: The path to the root folder to search.
+        root_folder (str): The path to the root folder to search for subfolders containing DICOM files.
 
     Returns:
-        A naturally sorted list of unique full paths of subfolders containing DCM files.
+        tuple[int, str, list[str]]: A tuple containing:
+            - int: The number of subfolders containing at least one DICOM file.
+            - str: The base directory common to all found subfolders, or an empty string if none found.
+            - list[str]: A naturally sorted list of unique full paths of subfolders containing DICOM files.
+
+    Example:
+        find_dicom_folders_with_base("/data/patient")
+    """
     unique_subfolders = set()
-
     for dirpath, _, filenames in os.walk(root_folder):
         if any(filename.lower().endswith(".dcm") for filename in filenames):
-            unique_subfolders.add(dirpath)  # Store full path
+            unique_subfolders.add(dirpath)  # Add the full path of subfolders containing DCM files
+
+    folders = list(unique_subfolders)
+    len_ins = len(folders)
+
+    if len_ins == 0:
+        return 0, "", []
 
-    return natsorted(list(unique_subfolders))
+    if len_ins == 1:
+        base_dir = folders[0].rstrip("/")  # Strip trailing slash if present
+        base_dir = os.path.dirname(base_dir)
+    else:
+        base_dir = os.path.commonpath(folders)  # Get the common base directory
+
+    return len_ins, base_dir, natsorted(folders)
 
 
 def get_md5(file_path: Path | str | list[str]) -> str:
@@ -305,3 +389,122 @@
     while chunk := f.read(4096):
         md5_hash.update(chunk)
     return md5_hash.hexdigest()
+
+
+def get_hash(value: str) -> str:
+    """Get a 10 digit hash based on the input string."""
+    hex_dig = hashlib.sha256(str(value).encode()).hexdigest()
+    return f"{int(hex_dig[:8], 16):010}"
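get_hash is deterministic: it folds the first 8 hex digits of a SHA-256 digest into a zero-padded 10-digit decimal, so the same PatientID always yields the same pseudonym across runs. For example (the expected value comes from test_process_task below):

    import hashlib

    def get_hash(value: str) -> str:
        hex_dig = hashlib.sha256(str(value).encode()).hexdigest()
        return f"{int(hex_dig[:8], 16):010}"

    # Same input, same pseudonym — the pair asserted by test_process_task.
    assert get_hash("bbff7a25-d32c-4192-9330-0bb01d49f746") == "0780320450"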
+            if verbose:
+                print(f"No changes detected. {reserved_csv} remains unchanged.")
+        else:
+            version = 1
+            new_version_filename = get_versioned_filename(reserved_csv, version)
+            while os.path.exists(new_version_filename):
+                version += 1
+                new_version_filename = get_versioned_filename(reserved_csv, version)
+
+            shutil.move(reserved_csv, new_version_filename)
+            if verbose:
+                print(f"Old {reserved_csv} renamed to {new_version_filename}")
+            shutil.move(temp_filename, reserved_csv)
+            if verbose:
+                print(f"New generated mapping saved to {reserved_csv}")
+    else:
+        shutil.move(temp_filename, reserved_csv)
+        if verbose:
+            print(f"Generated mapping saved to {reserved_csv}")
+
+
+def read_csv(file_path: str) -> list[list[str]]:
+    """Reads a CSV file and returns its contents as a list of rows.
+
+    Each row is represented as a list of strings.
+
+    Args:
+        file_path (str): The path to the CSV file to be read.
+
+    Returns:
+        list[list[str]]: A list of rows, where each row is a list of strings representing the CSV data.
+    """
+    with open(file_path) as file:
+        reader = csv.reader(file)
+        return list(reader)
+
+
+def write_to_csv(file_path: str | Path, data: list[list[str]], header: list[str] = []) -> None:
+    """Writes data to a CSV file at the specified file path.
+
+    Args:
+        file_path (str|Path): The path to the CSV file.
+        data (list[list[str]]): The data to write to the CSV file. Each sublist represents a row.
+        header (list[str], optional): An optional list representing the CSV header.
+            Defaults to an empty list.
+    """
+    file_path = Path(file_path)
+    with file_path.open(mode="w", newline="") as file:
+        writer = csv.writer(file)
+        if header:
+            writer.writerow(header)
+        writer.writerows(data)
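The reserved CSV is written with a study_id,patient_id header and rotated rather than overwritten: identical content is a no-op, while changed content shelves the previous file as patient_2_study_id_1.csv, _2, and so on. A small sketch of that behaviour (run in a scratch directory; the row values are hypothetical):

    import os
    import tempfile
    from process_dcm.utils import process_and_save_csv, read_csv

    os.chdir(tempfile.mkdtemp())  # work in a throwaway directory
    rows = [["0780320450", "bbff7a25-d32c-4192-9330-0bb01d49f746"]]
    process_and_save_csv(rows, "patient_2_study_id.csv", verbose=True)  # first write
    process_and_save_csv(rows, "patient_2_study_id.csv", verbose=True)  # identical -> unchanged
    process_and_save_csv([["0012345678", "010-0001"]], "patient_2_study_id.csv", verbose=True)
    print(sorted(os.listdir(".")))
    # ['patient_2_study_id.csv', 'patient_2_study_id_1.csv']
    print(read_csv("patient_2_study_id.csv"))
    # [['study_id', 'patient_id'], ['0012345678', '010-0001']]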
diff --git a/tests/conftest.py b/tests/conftest.py
index 4a4ea7a..ab165c2 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,5 +1,7 @@
+import os
 import tempfile
 from collections import defaultdict
+from collections.abc import Generator
 from pathlib import Path
 
 import pytest
@@ -7,6 +9,7 @@
 from typer.testing import CliRunner
 
 from process_dcm import __version__ as version
+from process_dcm.const import ImageModality
 
 
 def pytest_report_header():
@@ -98,7 +101,56 @@
 def dicom_base():
     """Base fixture for creating a mocked DICOM FileDataset."""
     dataset = FileDataset("test.dcm", {}, file_meta=defaultdict(str), preamble=b"\0" * 128)
-    dataset.Modality = ""
+    dataset.AccessionNumber = 0
+    dataset.Modality = ImageModality.OCT
+    dataset.PatientBirthDate = "19020202"
     dataset.Manufacturer = ""
     dataset.SeriesDescription = ""
+    dataset.PatientID = "bbff7a25-d32c-4192-9330-0bb01d49f746"
     return dataset
+
+
+@pytest.fixture
+def csv_data():
+    return [
+        ["study_id_1", "patient_id_1"],
+        ["study_id_2", "patient_id_2"],
+    ]
+
+
+@pytest.fixture
+def unique_sorted_results():
+    return [
+        ["study_id_3", "patient_id_3"],
+        ["study_id_4", "patient_id_4"],
+    ]
+
+
+@pytest.fixture
+def janitor() -> Generator[list[str], None, None]:
+    to_delete: list[str] = []
+    yield to_delete
+    del_file_paths(to_delete)
+
+
+def del_file_paths(file_paths: list[str]) -> None:
+    """Deletes all files and folders in the list of file paths.
+
+    Args:
+        file_paths (list[str]): A list of file paths to delete.
+
+    Returns:
+        None
+    """
+    for path in file_paths:
+        if not os.path.exists(path):
+            continue
+        if os.path.isfile(path):
+            os.remove(path)
+        elif os.path.isdir(path):
+            for root, dirs, files in os.walk(path, topdown=False):
+                for name in files:
+                    os.remove(os.path.join(root, name))
+                for name in dirs:
+                    os.rmdir(os.path.join(root, name))
+            os.rmdir(path)
diff --git a/tests/map.csv b/tests/map.csv
new file mode 100644
index 0000000..db209f8
--- /dev/null
+++ b/tests/map.csv
@@ -0,0 +1,2 @@
+bbff7a25-d32c-4192-9330-0bb01d49f746,00123
+010-0001,012345
diff --git a/tests/test_main.py b/tests/test_main.py
index 02d91e2..ad2508f 100644
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -2,8 +2,12 @@
 from glob import glob
 from pathlib import Path
 
+import pytest
+import typer
+
 from process_dcm import __version__
-from process_dcm.main import app, cli, main
+from process_dcm.const import RESERVED_CSV
+from process_dcm.main import app, cli, main, process_task
 from process_dcm.utils import get_md5
 
 
@@ -52,21 +56,150 @@
     assert "Missing argument 'INPUT_DIR'" in captured.err
 
 
-def test_main():
+@pytest.mark.parametrize(
+    "md5, meta, keep",
+    [
+        (["837808d746aef8e2dd08defbdbc70818"], "0a9a930806f2784aa4e60d47b3bad6ed", "pndg"),
+        (["7a355bb7e0c95155d1541c7fe0941c5e"], "fd6c5a84aca6499b0ea8b99d4e25dc92", "pnDg"),
+        (["2319181ecfc33d35b01dcec65ab2c568"], "35fe295648681e3521da8dddaed63705", ""),
+    ],
+)
+def test_main(md5, meta, keep, janitor):
     input_dir = "tests/example-dcms"
     image_format = "png"
     n_jobs = 1
     overwrite = False
     verbose = True
-    md5 = ["837808d746aef8e2dd08defbdbc70818"]
-
+    mapping = ""
+    janitor.append("patient_2_study_id.csv")
     # Create a temporary directory using the tempfile module
     with tempfile.TemporaryDirectory() as tmpdirname:
         output_dir = Path(tmpdirname)
-
-        # Run your app's main function with the test inputs
-        main(input_dir, image_format, str(output_dir), n_jobs, overwrite, verbose)
-        of = sorted(glob(f"{output_dir}/*"))
+        main(
+            input_dir=input_dir,
+            image_format=image_format,
+            output_dir=str(output_dir),
+            n_jobs=n_jobs,
+            overwrite=overwrite,
+            verbose=verbose,
+            keep=keep,
+            mapping=mapping,
+        )
+        of = sorted(glob(f"{output_dir}/**/*"))
         assert len(of) == 51
-        assert get_md5(output_dir / "metadata.json") == "0a9a930806f2784aa4e60d47b3bad6ed"
+        assert get_md5(output_dir / "example-dcms/metadata.json") == meta
         assert get_md5(of) in md5
+
+
+def test_main_mapping(janitor):
+    input_dir = "tests/example-dcms"
+    image_format = "png"
+    n_jobs = 1
+    overwrite = False
+    verbose = True
+    janitor.append("patient_2_study_id.csv")
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        output_dir = Path(tmpdirname)
+        main(
+            input_dir=input_dir,
+            image_format=image_format,
+            output_dir=str(output_dir),
+            n_jobs=n_jobs,
+            overwrite=overwrite,
+            verbose=verbose,
+            keep="p",
+            mapping="tests/map.csv",
+        )
+        of = sorted(glob(f"{output_dir}/**/*"))
+        assert len(of) == 51
+        assert get_md5(output_dir / "example-dcms/metadata.json") == "261826ad2e067e9adb7143bb6c053dbc"
+        assert get_md5(of) == "6ff8e2fe69c5fbe86f81f44f74496cab"
+
+
+def test_main_abort():
+    input_dir = "tests/example-dcms"
+    image_format = "png"
+    n_jobs = 1
+    overwrite = False
+    verbose = True
+    # Expect the typer.Abort exception to be raised
+    with pytest.raises(typer.Abort):
+        main(
+            input_dir=input_dir,
+            image_format=image_format,
+            output_dir="/tmp",
+            n_jobs=n_jobs,
+            overwrite=overwrite,
+            verbose=verbose,
+            keep="p",
+            mapping=RESERVED_CSV,
+        )
+
+
+def test_main_mapping_example_dir(janitor):
+    input_dir = "tests/example_dir"
+    image_format = "png"
+    n_jobs = 2
+    overwrite = True
+    verbose = True
+    janitor.append("patient_2_study_id.csv")
+    janitor.append("patient_2_study_id_1.csv")
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        output_dir = Path(tmpdirname)
+        main(
+            input_dir=input_dir,
+            image_format=image_format,
+            output_dir=str(output_dir),
+            n_jobs=n_jobs,
+            overwrite=overwrite,
+            verbose=verbose,
+            keep="nDg",
+            mapping="tests/map.csv",
+        )
+        of = sorted(glob(f"{output_dir}/**/**/*"))
+        assert len(of) == 262
+        assert get_md5(output_dir / "010-0001/20180724_L/metadata.json") == "1b46961177c80daf69e7dea7379fcc31"
+        assert get_md5(output_dir / "010-0002/20180926_R/metadata.json") == "bbf5c47f9fb28f46b4cc1bf08c311593"
+
+
+def test_main_mapping_example_dir_relative(janitor):
+    input_dir = "tests/example_dir"
+    image_format = "png"
+    n_jobs = 2
+    overwrite = False
+    verbose = True
+    relative = True
+    janitor.append("patient_2_study_id.csv")
+    janitor.append("patient_2_study_id_1.csv")
+    main(
+        input_dir=input_dir,
+        image_format=image_format,
+        output_dir="dummy",
+        n_jobs=n_jobs,
+        overwrite=overwrite,
+        verbose=verbose,
+        keep="nDg",
+        mapping="tests/map.csv",
+        relative=relative,
+    )
+    of = sorted(glob(f"{input_dir}/**/**/dummy/*"))
+    path1 = Path(input_dir) / "010-0001/20180724_L/dummy"
+    path2 = Path(input_dir) / "010-0002/20180926_R/dummy"
+    janitor.append(path1)
+    janitor.append(path2)
+    assert len(of) == 262
+    assert get_md5(path1 / "metadata.json") == "1b46961177c80daf69e7dea7379fcc31"
+    assert get_md5(path2 / "metadata.json") == "bbf5c47f9fb28f46b4cc1bf08c311593"
+
+
+def test_process_task():
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        output_dir = Path(tmpdirname)
+        task_data = ("tests/example-dcms/", str(output_dir))
+        image_format = "png"
+        overwrite = True
+        verbose = False
+        keep = ""
+        mapping = ""
+        result = process_task(task_data, image_format, overwrite, verbose, keep, mapping)
+        assert result == ("0780320450", "bbff7a25-d32c-4192-9330-0bb01d49f746")
diff --git a/tests/test_utils.py b/tests/test_utils.py
index d9c5b26..b3a45a0 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -1,3 +1,4 @@
+import json
 import os
 import tempfile
 from pathlib import Path
@@ -6,7 +7,17 @@
 import pytest
 
 from process_dcm.const import ImageModality
-from process_dcm.utils import do_date, find_dcm_subfolders, meta_images, process_dcm, set_output_dir, update_modality
+from process_dcm.utils import (
+    do_date,
+    meta_images,
+    process_and_save_csv,
+    process_dcm,
+    process_dcm_meta,
+    read_csv,
+    set_output_dir,
+    update_modality,
+    write_to_csv,
+)
 
 
 def test_meta_images_optopol(dicom_opotopol, mocker):
@@ -102,17 +113,6 @@
     assert do_date("InvalidDate", "%Y%m%d", "%Y-%m-%d") == ""
 
 
-def test_find_dcm_subfolders() -> None:
-    with tempfile.TemporaryDirectory() as tmpdir:
-        dcm_folder1 = os.path.join(tmpdir, "folder1")
-        dcm_folder2 = os.path.join(tmpdir, "folder2")
-        os.makedirs(dcm_folder1)
-        os.makedirs(dcm_folder2)
-        open(os.path.join(dcm_folder1, "image.dcm"), "w").close()
-        subfolders = find_dcm_subfolders(tmpdir)
-        assert len(subfolders) == 1
-
-
 def test_update_modality_opt(dicom_base):
     """Test updating modality when the modality is OPT."""
     dicom_base.Modality = "OPT"
@@ -171,3 +171,69 @@ def test_update_modality_op_various_descriptions(dicom_base, description, expect
     dicom_base.SeriesDescription = description
     assert update_modality(dicom_base) is True
     assert dicom_base.Modality == expected_modality
+
+
+def test_process_dcm_meta_with_D_in_keep_and_mapping(dicom_base):
+    # Call the function with "D" in keep
+    with tempfile.TemporaryDirectory() as tmpdir:
+        process_dcm_meta([dicom_base], tmpdir, keep="D", mapping="tests/map.csv")
+        rjson = json.load(open(os.path.join(tmpdir, "metadata.json")))
+        assert rjson["patient"]["date_of_birth"] == "1902-01-01"
+        assert rjson["patient"]["patient_key"] == "00123"
+
+
+def test_process_and_save_csv_no_existing_file(unique_sorted_results):
+    with tempfile.TemporaryDirectory() as temp_dir:
+        reserved_csv = Path(temp_dir) / "reserved.csv"
+
+        # Process and save new CSV data with no existing reserved CSV
+        process_and_save_csv(unique_sorted_results, reserved_csv, verbose=True)
+
+        # Check if reserved CSV was created and contains the expected data
+        created_data = read_csv(reserved_csv)
+        expected_data = [["study_id", "patient_id"], *unique_sorted_results]
+        assert created_data == expected_data, f"Expected {expected_data}, but got {created_data}"
+
+
+def test_process_and_save_csv_with_existing_file(csv_data, unique_sorted_results):
+    with tempfile.TemporaryDirectory() as temp_dir:
+        reserved_csv = Path(temp_dir) / "reserved.csv"
+
+        # Create initial reserved CSV with initial csv_data
+        write_to_csv(reserved_csv, csv_data, header=["study_id", "patient_id"])
+
+        # Process and save new CSV data
+        process_and_save_csv(unique_sorted_results, reserved_csv, verbose=True)
+
+        # Check if reserved CSV was updated
+        updated_data = read_csv(reserved_csv)
+        expected_data = [["study_id", "patient_id"], *unique_sorted_results]
+        assert updated_data == expected_data, f"Expected {expected_data}, but got {updated_data}"
+
+        # Check if backup was created
+        backup_file = Path(temp_dir) / "reserved_1.csv"
+        assert backup_file.exists(), f"Expected backup file {backup_file} to exist"
+
+        backup_data = read_csv(backup_file)
+        expected_backup_data = [["study_id", "patient_id"], *csv_data]
+        assert backup_data == expected_backup_data, f"Expected {expected_backup_data}, but got {backup_data}"
+
+
+def test_process_and_save_csv_no_changes(csv_data):
+    with tempfile.TemporaryDirectory() as temp_dir:
+        reserved_csv = Path(temp_dir) / "reserved.csv"
+
+        # Create reserved CSV with initial csv_data
+        write_to_csv(reserved_csv, csv_data, header=["study_id", "patient_id"])
+
+        # Process and save the same CSV data
+        process_and_save_csv(csv_data, reserved_csv, verbose=True)
+
+        # Check if reserved CSV remains unchanged
+        unchanged_data = read_csv(reserved_csv)
+        expected_data = [["study_id", "patient_id"], *csv_data]
+        assert unchanged_data == expected_data, f"Expected {expected_data}, but got {unchanged_data}"
+
+        # Check that no backup was created
+        backup_file = Path(temp_dir) / "reserved_1.csv"
+        assert not backup_file.exists(), f"Did not expect backup file {backup_file} to exist"
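Taken together, a typical anonymising run mirrors the calls exercised in the tests above; a closing sketch (input paths as used by the test suite, output folder hypothetical):

    from process_dcm.main import main

    main(
        input_dir="tests/example_dir",
        image_format="png",
        output_dir="exported_data",  # hypothetical; resolved to an absolute path
        relative=False,
        n_jobs=2,
        mapping="tests/map.csv",  # mapped patients keep their assigned study IDs
        keep="nDg",               # keep names and gender, reduce DOB to year only
        overwrite=True,
        verbose=True,
    )
    # Patients missing from tests/map.csv are reported with their newly hashed IDs;
    # without -m/--mapping, the generated pairs are saved to patient_2_study_id.csv.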