Skip to content

Commit

Permalink
variant lookup table has sample_type (#4289) (#4326)
Browse files Browse the repository at this point in the history
* variant lookup table has sample_type

* project_sample_types

* try hail

* switch sample_type and family_guid in project_samples dict
  • Loading branch information
jklugherz authored Aug 22, 2024
1 parent 4de5dae commit adcdf12
Show file tree
Hide file tree
Showing 56 changed files with 25 additions and 40 deletions.
Binary file modified hail_search/fixtures/GRCh37/SNV_INDEL/lookup.ht/.README.txt.crc
Binary file not shown.
Binary file not shown.
2 changes: 1 addition & 1 deletion hail_search/fixtures/GRCh37/SNV_INDEL/lookup.ht/README.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
This folder comprises a Hail (www.hail.is) native Table or MatrixTable.
Written with version 0.2.128-eead8100a1c1
Created at 2024/04/03 17:08:32
Created at 2024/08/16 15:39:04
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file modified hail_search/fixtures/GRCh37/SNV_INDEL/lookup.ht/metadata.json.gz
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file modified hail_search/fixtures/GRCh38/MITO/lookup.ht/.README.txt.crc
Binary file not shown.
Binary file modified hail_search/fixtures/GRCh38/MITO/lookup.ht/.metadata.json.gz.crc
Binary file not shown.
2 changes: 1 addition & 1 deletion hail_search/fixtures/GRCh38/MITO/lookup.ht/README.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
This folder comprises a Hail (www.hail.is) native Table or MatrixTable.
Written with version 0.2.128-eead8100a1c1
Created at 2024/04/03 15:52:09
Created at 2024/08/16 15:39:56
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file modified hail_search/fixtures/GRCh38/MITO/lookup.ht/globals/parts/part-0
Binary file not shown.
Binary file modified hail_search/fixtures/GRCh38/MITO/lookup.ht/metadata.json.gz
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file modified hail_search/fixtures/GRCh38/SNV_INDEL/lookup.ht/.README.txt.crc
Binary file not shown.
Binary file not shown.
2 changes: 1 addition & 1 deletion hail_search/fixtures/GRCh38/SNV_INDEL/lookup.ht/README.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
This folder comprises a Hail (www.hail.is) native Table or MatrixTable.
Written with version 0.2.128-eead8100a1c1
Created at 2024/04/03 17:00:55
Created at 2024/08/16 15:40:56
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file modified hail_search/fixtures/GRCh38/SNV_INDEL/lookup.ht/metadata.json.gz
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
30 changes: 10 additions & 20 deletions hail_search/queries/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -287,7 +287,7 @@ def _parse_sample_data(self, sample_data):
project_samples = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
for s in sample_data:
families.add(s['family_guid'])
project_samples[s['project_guid']][s['family_guid']][s['sample_type']].append(s)
project_samples[s['project_guid']][s['sample_type']][s['family_guid']].append(s)

num_families = len(families)
logger.info(f'Loading {self.DATA_TYPE} data for {num_families} families in {len(project_samples)} projects')
Expand All @@ -297,13 +297,12 @@ def _load_filtered_project_hts(self, project_samples, skip_all_missing=False, n_
if len(project_samples) == 1:
project_guid = list(project_samples.keys())[0]
# for variant lookup, project_samples looks like
# {<project_guid>: {<family_guid>: {<sample_type>: True}, {<family_guid>: {<sample_type_2>: True}}, <project_guid_2>: ...}
# {<project_guid>: {<sample_type>: {<family_guid>: True}, <sample_type_2>: {<family_guid_2>: True}}, <project_guid_2>: ...}
# for variant search, project_samples looks like
# {<project_guid>: {<family_guid>: {<sample_type>: [<sample_data>, <sample_data>, ...], <sample_type_2>: ...}, <family_guid_2>: ...}, <project_guid_2>: ...}
first_family_samples = list(project_samples[project_guid].values())[0]
sample_type = list(first_family_samples.keys())[0]
# {<project_guid>: {<sample_type>: {<family_guid>: [<sample_data>, <sample_data>, ...]}, <sample_type_2>: {<family_guid_2>: []} ...}, <project_guid_2>: ...}
sample_type = list(project_samples[project_guid].keys())[0]
project_ht = self._read_table(f'projects/{sample_type}/{project_guid}.ht', use_ssd_dir=True)
return self._filter_entries_table(project_ht, project_samples[project_guid], **kwargs)
return self._filter_entries_table(project_ht, project_samples[project_guid][sample_type], **kwargs)

# Need to chunk tables or else evaluating table globals throws LineTooLong exception
# However, minimizing number of chunks minimizes number of aggregations/ evals and improves performance
Expand All @@ -314,14 +313,13 @@ def _load_filtered_project_hts(self, project_samples, skip_all_missing=False, n_
project_hts = []
sample_data = {}
for project_guid, project_sample_data in project_samples.items():
first_family_samples = list(project_sample_data.values())[0]
sample_type = list(first_family_samples.keys())[0]
sample_type = list(project_sample_data.keys())[0]
project_ht = self._read_table(f'projects/{sample_type}/{project_guid}.ht', use_ssd_dir=True)

if project_ht is None:
continue
project_hts.append(project_ht.select_globals('sample_type', 'family_guids', 'family_samples'))
sample_data.update(project_sample_data)
sample_data.update(project_sample_data[sample_type])

if len(project_hts) >= chunk_size:
self._filter_merged_project_hts(
Expand All @@ -342,14 +340,14 @@ def _load_filtered_project_hts(self, project_samples, skip_all_missing=False, n_
def import_filtered_table(self, project_samples, num_families, **kwargs):
if num_families == 1:
family_sample_data = list(project_samples.values())[0]
family_guid = list(family_sample_data.keys())[0]
sample_type = list(family_sample_data[family_guid].keys())[0]
sample_type = list(family_sample_data.keys())[0]
family_guid = list(family_sample_data[sample_type].keys())[0]
family_ht = self._read_table(f'families/{sample_type}/{family_guid}.ht', use_ssd_dir=True)
family_ht = family_ht.transmute(family_entries=[family_ht.entries])
family_ht = family_ht.annotate_globals(
family_guids=[family_guid], family_samples={family_guid: family_ht.sample_ids},
)
families_ht, comp_het_families_ht = self._filter_entries_table(family_ht, family_sample_data, **kwargs)
families_ht, comp_het_families_ht = self._filter_entries_table(family_ht, family_sample_data[sample_type], **kwargs)
else:
families_ht, comp_het_families_ht = self._load_filtered_project_hts(project_samples, **kwargs)

Expand Down Expand Up @@ -399,14 +397,6 @@ def _merge_project_hts(project_hts, n_partitions, include_all_globals=False):
def _filter_entries_table(self, ht, sample_data, inheritance_filter=None, quality_filter=None, **kwargs):
ht = self._prefilter_entries_table(ht, **kwargs)

# Temporarily reset sample_data until full blended eS/GS support is added
for family_guid, samples_by_sample_type in sample_data.items():
if isinstance(list(samples_by_sample_type.values())[0], list):
samples = [s for samples in samples_by_sample_type.values() for s in samples]
sample_data[family_guid] = samples
else:
sample_data[family_guid] = True

ht, sorted_family_sample_data = self._add_entry_sample_families(ht, sample_data)

passes_quality_filter = self._get_family_passes_quality_filter(quality_filter, ht, **kwargs)
Expand Down
29 changes: 12 additions & 17 deletions hail_search/queries/mito.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import os
from collections import defaultdict

from aiohttp.web import HTTPNotFound
import hail as hl
Expand Down Expand Up @@ -310,34 +310,29 @@ def _gene_rank_sort(cls, r, gene_ranks):

def _add_project_lookup_data(self, ht, annotation_fields, *args, **kwargs):
# Get all the project-families for the looked up variant formatted as a dict of dicts:
# {<project_guid>: {<family_guid>: {<sample_type>: True}, {<family_guid_2>: {<sample_type_2>: True}}, <project_guid_2>: ...}
# {<project_guid>: {<sample_type>: {<family_guid>: True}, <sample_type_2>: {<family_guid_2>: True}}, <project_guid_2>: ...}
lookup_ht = self._read_table('lookup.ht', use_ssd_dir=True, skip_missing_field='project_stats')
if lookup_ht is None:
raise HTTPNotFound()
variant_projects = lookup_ht.aggregate(hl.agg.take(
hl.dict(hl.enumerate(lookup_ht.project_stats).starmap(lambda i, ps: (
lookup_ht.project_guids[i],
lookup_ht.project_sample_types[i],
hl.enumerate(ps).starmap(
lambda j, s: hl.or_missing(self._stat_has_non_ref(s), j)
).filter(hl.is_defined),
)).filter(
lambda x: x[1].any(hl.is_defined)
).starmap(lambda project_guid, family_indices: (
project_guid,
hl.dict(family_indices.map(lambda j: (lookup_ht.project_families[project_guid][j], True))),
))), 1),
).starmap(lambda project_key, family_indices: (
project_key,
hl.dict(family_indices.map(lambda j: (lookup_ht.project_families[project_key][j], True))),
)).group_by(
lambda x: x[0][0]
).map_values(
lambda project_data: hl.dict(project_data.starmap(
lambda project_key, families: (project_key[1], families)
)))), 1)
)[0]

for project_guid, families in variant_projects.items():
# Temporarily use try/except to determine sample_type, to be removed when lookup table contains sample_type
try:
hl.read_table(self._get_table_path(f'projects/WES/{project_guid}.ht', use_ssd_dir=True))
sample_type = 'WES'
except Exception:
sample_type = 'WGS'
for family_guid, value in families.items():
families[family_guid] = {sample_type: value}

# Variant can be present in the lookup table with only ref calls, so is still not present in any projects
if not variant_projects:
raise HTTPNotFound()
Expand Down

0 comments on commit adcdf12

Please sign in to comment.