Skip to content

Commit

Permalink
partners tmaps time selection and cardiac surgery tmap rework (#309)
Browse files Browse the repository at this point in the history
* dynamic time series tmaps

* time series persistence #305 and redo cardiac surgery tmaps

* voltage _exact length tmaps, population_normalize -> normalization in TMap

* validator for voltage

* remove apollo xref, get newest surgery

* fixes selection of mrn_col_name in _sample_csv_to_set

* fix validator

* warning -> debug

* dsw infection

* columns

* outcome

* prolonged vent column name

* reformat voltage tmaps

* explicit _pc tmaps

* type hint

* delete redundant length and zero tmaps

* use xref output csv to get newest surgery with preop ecg

* adds train_simple_model (#317)

* patient sex categorical tmaps

* explicit voltage tmaps

* sex tmap cats

* dsw outcomes resolved

* gender -> sex in plots

* voltage stats

* train/valid/test not useful in progress bar

* report median, generator

* consolidate simple shallow model

* revert change

* version TFA TFP #320

* fix abbreviations

Co-authored-by: Erik Reinertsen <[email protected]>
  • Loading branch information
StevenSong and Erik Reinertsen authored Jun 18, 2020
1 parent 9492172 commit 9754136
Show file tree
Hide file tree
Showing 8 changed files with 334 additions and 1,002 deletions.
4 changes: 2 additions & 2 deletions docker/vm_boot_images/config/tensorflow-requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ bokeh
Pillow==7.0.0
notebook
pytest
tensorflow-addons
tensorflow_probability
tensorflow-addons==0.9.1
tensorflow_probability==0.9.0
numcodecs
beautifulsoup4
lxml
Expand Down
8 changes: 5 additions & 3 deletions ml4cvd/arguments.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
from ml4cvd.models import parent_sort, BottleneckType, check_no_bottleneck
from ml4cvd.tensor_maps_by_hand import TMAPS
from ml4cvd.defines import IMPUTATION_RANDOM, IMPUTATION_MEAN
from ml4cvd.tensor_maps_partners_ecg import build_partners_tensor_maps, build_cardiac_surgery_tensor_maps
from ml4cvd.tensor_maps_partners_ecg import build_partners_tensor_maps, build_cardiac_surgery_tensor_maps, build_partners_time_series_tensor_maps
from ml4cvd.tensor_map_maker import generate_continuous_tensor_map_from_file


Expand Down Expand Up @@ -329,15 +329,17 @@ def _get_tmap(name: str, needed_tensor_maps: List[str]) -> TensorMap:
if name in TMAPS:
return TMAPS[name]

TMAPS.update(build_partners_time_series_tensor_maps(needed_tensor_maps))
if name in TMAPS:
return TMAPS[name]

from ml4cvd.tensor_maps_partners_ecg import TMAPS as partners_tmaps
TMAPS.update(partners_tmaps)

if name in TMAPS:
return TMAPS[name]

from ml4cvd.tensor_maps_partners_ecg_labels import TMAPS as partners_label_tmaps
TMAPS.update(partners_label_tmaps)

if name in TMAPS:
return TMAPS[name]

Expand Down
9 changes: 3 additions & 6 deletions ml4cvd/explorations.py
Original file line number Diff line number Diff line change
Expand Up @@ -674,7 +674,7 @@ def _hd5_to_disk(tmaps, path, gen_name, tot, output_folder, id):
with count.get_lock():
i = count.value
if i % 500 == 0:
logging.info(f"{gen_name} - Parsing {i}/{tot} ({i/tot*100:.1f}%) done")
logging.info(f"Parsing {i}/{tot} ({i/tot*100:.1f}%) done")
count.value += 1

# each worker should write to its own file
Expand Down Expand Up @@ -750,7 +750,7 @@ def _tensors_to_df(args):
tmaps = [tm for tm in args.tensor_maps_in]
global count # TODO figure out how to not use global
count = multiprocess.Value('l', 1)
paths = [(path, gen.name) for gen in generators for worker_paths in gen.path_iters for path in worker_paths.paths]
paths = [(path, gen.name.replace('_worker', '')) for gen in generators for worker_paths in gen.path_iters for path in worker_paths.paths]
num_hd5 = len(paths)
chunksize = num_hd5 // args.num_workers
with multiprocess.Pool(processes=args.num_workers) as pool:
Expand Down Expand Up @@ -781,9 +781,6 @@ def _tensors_to_df(args):
logging.debug(f'Appended {fpath} to overall dataframe')
temp_files.append(fpath)

# Remove "_worker" from "generator" values
df["generator"].replace("_worker", "", regex=True, inplace=True)

logging.info(f"Extracted {len(tmaps)} tmaps from {len(df)} tensors across {num_hd5} hd5 files into DataFrame")

# remove temporary files
Expand Down Expand Up @@ -958,7 +955,7 @@ def explore(args):
df_stats = df_stats.round(2)
df_stats.to_csv(fpath)
logging.info(f"Saved summary stats of {Interpretation.LANGUAGE} tmaps to {fpath}")

if args.plot_hist == "True":
for tm in args.tensor_maps_in:
if tm.interpretation == Interpretation.CONTINUOUS:
Expand Down
15 changes: 7 additions & 8 deletions ml4cvd/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,14 +75,13 @@ def make_shallow_model(
my_metrics = {}
loss_weights = []
input_tensors = [Input(shape=tm.shape, name=tm.input_name()) for tm in tensor_maps_in]
if len(input_tensors) > 1:
logging.warning('multi input tensors not fully supported')
for it in input_tensors:
for ot in tensor_maps_out:
losses.append(ot.loss)
loss_weights.append(ot.loss_weight)
my_metrics[ot.output_name()] = ot.metrics
outputs.append(Dense(units=len(ot.channel_map), activation=ot.activation, name=ot.output_name())(it))

it = concatenate(input_tensors) if len(input_tensors) > 1 else input_tensors[0]
for ot in tensor_maps_out:
losses.append(ot.loss)
loss_weights.append(ot.loss_weight)
my_metrics[ot.output_name()] = ot.metrics
outputs.append(Dense(units=len(ot.channel_map), activation=ot.activation, name=ot.output_name())(it))

opt = Adam(lr=learning_rate, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
m = Model(inputs=input_tensors, outputs=outputs)
Expand Down
6 changes: 3 additions & 3 deletions ml4cvd/plots.py
Original file line number Diff line number Diff line change
Expand Up @@ -704,8 +704,8 @@ def _partners_top_panel(data, ax0):
ax0.text(0.55, 0.9, f"{data['sitename']}", weight='bold')

ax0.text(0.0, 0.75, f"{dob} ({age} yr)", weight='bold') # TODO age units
gender = {value: key for key, value in data['gender'].items()}
ax0.text(0.0, 0.67, f"{gender[1]}".title(), weight='bold')
sex = {value: key for key, value in data['sex'].items()}
ax0.text(0.0, 0.67, f"{sex[1]}".title(), weight='bold')
ax0.text(0.0, 0.51, f"Room: ", weight='bold') # TODO room?
ax0.text(0.0, 0.43, f"Loc: {data['location']}", weight='bold')

Expand Down Expand Up @@ -988,7 +988,7 @@ def _partners_clinical(data, args):
def plot_partners_ecgs(args):
plot_tensors = [
'partners_ecg_patientid', 'partners_ecg_firstname', 'partners_ecg_lastname',
'partners_ecg_gender', 'partners_ecg_dob', 'partners_ecg_age',
'partners_ecg_sex', 'partners_ecg_dob', 'partners_ecg_age',
'partners_ecg_datetime', 'partners_ecg_sitename', 'partners_ecg_location',
'partners_ecg_read_md_raw', 'partners_ecg_taxis_md', 'partners_ecg_rate_md',
'partners_ecg_pr_md', 'partners_ecg_qrs_md', 'partners_ecg_qt_md',
Expand Down
2 changes: 0 additions & 2 deletions ml4cvd/recipes.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,8 +90,6 @@ def run(args):
train_siamese_model(args)
elif 'write_tensor_maps' == args.mode:
write_tensor_maps(args)
elif 'sort_csv' == args.mode:
sort_csv(args.tensors, args.tensor_maps_in)
elif 'append_continuous_csv' == args.mode:
append_fields_from_csv(args.tensors, args.app_csv, 'continuous', ',')
elif 'append_categorical_csv' == args.mode:
Expand Down
8 changes: 4 additions & 4 deletions ml4cvd/tensor_generators.py
Original file line number Diff line number Diff line change
Expand Up @@ -552,15 +552,15 @@ def _sample_csv_to_set(sample_csv: Optional[str] = None) -> Union[None, Set[str]
# If no matches, assume the first column is MRN
if not matches:
mrn_col_name = df.columns[0]
else:
# Get first string from set of matches to use as column name
mrn_col_name = next(iter(matches))

elif len(matches) > 1:
if len(matches) > 1:
logging.warning(
f"{sample_csv} has more than one potential column for MRNs. Inferring most likely column name, but recommend explicitly setting MRN column name.",
)

# Get one string from the set of matches; this is the column name
mrn_col_name = next(iter(matches))

# Isolate this column from the dataframe, and cast to strings
sample_ids = df[mrn_col_name].apply(str)

Expand Down
Loading

0 comments on commit 9754136

Please sign in to comment.