Skip to content

Commit

Permalink
feat: ✨ Added anonymiser option by default
Browse files Browse the repository at this point in the history
  • Loading branch information
alanwilter committed Sep 5, 2024
1 parent 00f56c1 commit eea973f
Show file tree
Hide file tree
Showing 9 changed files with 629 additions and 64 deletions.
10 changes: 5 additions & 5 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ repos:
- id: check-shebang-scripts-are-executable
- id: check-added-large-files
args:
- "--maxkb=10000"
- "--maxkb=250000"
# - id: debug-statements # not for process_dcm
- id: check-yaml
exclude: mkdocs.yml
Expand All @@ -26,18 +26,18 @@ repos:
- id: check-merge-conflict
- id: check-docstring-first
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.5.5
rev: v0.6.3
hooks:
- id: ruff
args: ["--fix"]
files: ^hooks
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.5.5
rev: v0.6.3
hooks:
- id: ruff-format
files: ^hooks
- repo: https://github.com/commitizen-tools/commitizen
rev: v3.28.0
rev: v3.29.0
hooks:
- id: commitizen
stages:
Expand All @@ -50,7 +50,7 @@ repos:
- prettier
# not for process_dcm
- repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.11.1
rev: v1.11.2
hooks:
- id: mypy
exclude: ^tests/
Expand Down
14 changes: 6 additions & 8 deletions .talismanrc
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
fileignoreconfig:
- filename: .cruft.json
checksum: 03b17a0041ffe209156ea08f01b12f2cbb770b4570a6f6678ce8dcae4d945535
- filename: tests/test_main.py
checksum: 3e4ad81306892573518b0e85e38544a978e160e32608229231f8d9f2b2dcd83c
- filename: poetry.lock
checksum: f4cee8b1c6d6f33b5f34cbbfc093008bcfe7820a26df4225ccea2b3fa5e65e15
checksum: efa949f7b639498667939891d29938f01047b7b9a70ffd61a74d47cd127ddbf8
- filename: process_dcm/main.py
checksum: 3b69ef0a904c0b9e04574fc70e4a35bcf0ed23a75ef9ff99ce6bf80cbaa43551
- filename: tests/test_utils.py
checksum: 00912e74713f45921e593b5f1f98e23b324ab13bd5c9d44edb24c49a08f325b8
- filename: process_dcm/utils.py
checksum: 4427feb0e00b54eeda3d878b1457be0c0b62f7c39f68e0691442e4ad3988bbf2
- filename: .github/workflows/release.yml
checksum: 86fbb1303400278bbebbe4523cd4746b4f5e8d1ef036371c8e7a9cbfc0adc254
checksum: fdb35056dbd109dbbac6fd5d3668fbff90ace8f46aa37b787eaa2572c02d0aaf
version: ""
2 changes: 2 additions & 0 deletions process_dcm/const.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
from enum import Enum, Flag, auto, unique
from typing import cast

# File name reserved for the auto-generated patient_id -> study_id mapping CSV.
RESERVED_CSV = "patient_2_study_id.csv"


class ModalityFlag(Flag):
"""A flag representing different modalities in DICOM files."""
Expand Down
107 changes: 96 additions & 11 deletions process_dcm/main.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,19 @@
"""app to procces DCM files."""

import csv
import os
from functools import partial
from multiprocessing import Pool

import typer
from tqdm import tqdm

from process_dcm import __version__
from process_dcm.utils import find_dcm_subfolders, process_dcm
from process_dcm.const import RESERVED_CSV
from process_dcm.utils import find_dicom_folders_with_base, process_and_save_csv, process_dcm

# Typer application object; "-h" is accepted as a short alias of "--help".
app = typer.Typer(context_settings={"help_option_names": ["-h", "--help"]})

out_msg = """Output directory for extracted images and metadata. Defaults to: __input_dir__/exported_data\n
Use absolute path if you want to save the output in a specific location."""


def print_version(value: bool) -> None:
"""Print the version of the app."""
Expand All @@ -22,30 +22,115 @@ def print_version(value: bool) -> None:
raise typer.Exit()


def process_task(
    task: tuple[str, str], image_format: str, overwrite: bool, verbose: bool, keep: str, mapping: str
) -> tuple[str, str]:
    """Run ``process_dcm`` on one (input folder, output folder) pair.

    The folder pair is packed into a single ``task`` tuple so that the
    remaining options can be bound up front and the function mapped over a
    list of tasks by a worker pool.

    Args:
        task: ``(input folder, output folder)`` for this unit of work.
        image_format: Forwarded unchanged to ``process_dcm``.
        overwrite: Forwarded unchanged to ``process_dcm``.
        verbose: Forwarded unchanged to ``process_dcm``.
        keep: Forwarded unchanged to ``process_dcm``.
        mapping: Forwarded unchanged to ``process_dcm``.

    Returns:
        Whatever ``process_dcm`` returns for this folder (a pair of strings).
    """
    source_dir, target_dir = task
    shared_options = {
        "image_format": image_format,
        "overwrite": overwrite,
        "verbose": verbose,
        "keep": keep,
        "mapping": mapping,
    }
    return process_dcm(input_dir=source_dir, output_dir=target_dir, **shared_options)


@app.command()
def main(
    input_dir: str = typer.Argument(..., help="Input directory containing subfolders with DICOM files."),
    image_format: str = typer.Option(
        "png", "-f", "--image_format", help="Image format for extracted images (png, jpg, webp). Defaults to: png"
    ),
    output_dir: str = typer.Option(
        "exported_data",
        "-o",
        "--output_dir",
        help="Output directory for extracted images and metadata. Defaults to: exported_data",
    ),
    relative: bool = typer.Option(
        False, "-r", "--relative", help="Save extracted data in folders relative to _input_dir_."
    ),
    n_jobs: int = typer.Option(1, "-j", "--n_jobs", help="Number of parallel jobs. Defaults to: 1"),
    mapping: str = typer.Option(
        "",
        "-m",
        "--mapping",
        help=f"Path to CSV containing patient_id to study_id mapping. If not provided and patient_id is not anonymised, a '{RESERVED_CSV}' file will be generated",  # noqa: E501
    ),
    keep: str = typer.Option(
        "",
        "-k",
        "--keep",
        help="Keep the specified fields (p: patient_key, n: names, d: date_of_birth, D: year-only DOB, g: gender)",
    ),
    overwrite: bool = typer.Option(False, "-w", "--overwrite", help="Overwrite existing images if found."),
    verbose: bool = typer.Option(False, "-v", "--verbose", help="Verbose output."),
    version: bool = typer.Option(
        None, "-V", "--version", callback=print_version, is_eager=True, help="Prints app version"
    ),
) -> None:
    """Process DICOM files in subfolders, extract images and metadata using parallel processing."""
    # NOTE: the docstring above is kept to a single line on purpose; Typer shows
    # the command docstring as the CLI help text, so it is user-facing output.
    #
    # Flow: validate options -> discover DICOM folders -> derive one output
    # folder per input folder -> process folders in a worker pool -> either
    # report study ids absent from the user mapping or write RESERVED_CSV.
    if mapping == RESERVED_CSV:
        typer.secho(f"Can't use reserved CSV file name: {RESERVED_CSV}", fg="red")
        raise typer.Abort()
    if "p" in keep and mapping:
        typer.secho(f"WARN:'--mapping' x '--keep p': File , {mapping} it will overwrite patient_id anyway", fg="yellow")

    len_sf, base_dir, subfolders = find_dicom_folders_with_base(input_dir)

    output_is_abs = os.path.isabs(output_dir)
    if relative and output_is_abs:
        # An absolute destination cannot also be "relative to the input folder".
        relative = False
        typer.secho(
            "WARN: '--relative' x 'absolute --output_dir' are incompatible, absolute 'output_dir' takes precedence",
            fg="yellow",
        )

    if relative:
        # Each input folder gets its own output folder nested inside it.
        output_dirs = [os.path.join(x, output_dir) for x in subfolders]
    else:
        # Covers both a relative and an absolute --output_dir. The previous
        # if/elif chain had no branch for "absolute and not --relative", which
        # left output_dirs empty and silently processed nothing.
        if not output_is_abs:
            output_dir = os.path.abspath(output_dir)
        # Swap base_dir for output_dir in each folder path. Only the first
        # occurrence is replaced so that a path repeating the base text deeper
        # down is not rewritten twice.
        # NOTE(review): presumably base_dir is a leading prefix of every
        # subfolder -- confirm against find_dicom_folders_with_base.
        output_dirs = [x.replace(base_dir, output_dir, 1) for x in subfolders]

    tasks = list(zip(subfolders, output_dirs))

    # Bind the options shared by every folder once; the pool then only has to
    # ship the per-folder (input, output) tuple to process_task.
    worker = partial(
        process_task,
        image_format=image_format,
        overwrite=overwrite,
        verbose=verbose,
        keep=keep,
        mapping=mapping,
    )

    with Pool(n_jobs) as pool:
        results = list(tqdm(pool.imap(worker, tasks), total=len_sf))

    unique_sorted_results = sorted(set(results))  # (study_id, patient_id)
    dict_res = dict(unique_sorted_results)

    if mapping:
        # newline="" is what the csv module expects for files it reads.
        with open(mapping, newline="") as file:
            # Second column holds the study id; blank or short rows are skipped
            # instead of raising IndexError.
            mapping_study_ids = {row[1] for row in csv.reader(file) if len(row) > 1}

        missing_study_ids = {study_id for study_id, _ in unique_sorted_results} - mapping_study_ids

        # Sorted so the warnings come out in a stable order between runs.
        for study_id in sorted(missing_study_ids):
            typer.secho(
                f"Missing map in {mapping}: {dict_res[study_id]} -> {study_id} (<- new hash created)", fg="yellow"
            )
    else:
        process_and_save_csv(unique_sorted_results, RESERVED_CSV)

    print(f"Processed {len(results)} DICOM folders.")

Expand Down
Loading

0 comments on commit eea973f

Please sign in to comment.