Skip to content

Commit

Permalink
handle wes/wgs inheritance edge case (#4440)
Browse files Browse the repository at this point in the history
* almost working

* updates

* or_missing

* fix sample tracking logic

* f string

* another pass

* minor things

* less code

* reuse as much code from _annotate_families_inheritance

* rstructure passes inheritance field

* fix _family_has_valid_inheritance

* fix up test cases

* oop

* typo

* typo 2

* PR comments

* comments

* extra lines
  • Loading branch information
jklugherz authored Nov 12, 2024
1 parent 4791930 commit e2293f3
Show file tree
Hide file tree
Showing 16 changed files with 193 additions and 77 deletions.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
This folder comprises a Hail (www.hail.is) native Table or MatrixTable.
Written with version 0.2.130-bea04d9c79b5
Created at 2024/10/02 14:46:35
Created at 2024/11/04 13:45:23
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
58 changes: 35 additions & 23 deletions hail_search/queries/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -472,9 +472,7 @@ def _apply_entry_filters(ht):
def _filter_single_entries_table(self, ht, project_families, inheritance_filter=None, quality_filter=None, is_merged_ht=False, **kwargs):
ht, sorted_family_sample_data = self._add_entry_sample_families(ht, project_families, is_merged_ht)
ht = self._filter_quality(ht, quality_filter, **kwargs)
ht, ch_ht = self._filter_inheritance(
ht, None, inheritance_filter, sorted_family_sample_data,
)
ht, ch_ht = self._filter_inheritance(ht, None, inheritance_filter, sorted_family_sample_data)
ht = self._apply_entry_filters(ht)
ch_ht = self._apply_entry_filters(ch_ht)

Expand Down Expand Up @@ -574,7 +572,7 @@ def _get_sample_type(cls, family_index, ht_globals):

def _filter_inheritance(
self, ht, comp_het_ht, inheritance_filter, sorted_family_sample_data,
annotation='family_entries', entries_ht_field='family_entries'
annotation='family_entries', entries_ht_field='family_entries', **kwargs
):
any_valid_entry = lambda x: self.GENOTYPE_QUERY_MAP[HAS_ALT](x.GT)

Expand All @@ -584,14 +582,14 @@ def _filter_inheritance(
any_valid_entry = lambda x: prev_any_valid_entry(x) & (x.affected_id == AFFECTED_ID)

ht = ht.annotate(**{
annotation: ht[entries_ht_field].map(
entries_ht_field: ht[entries_ht_field].map(
lambda entries: hl.or_missing(entries.any(any_valid_entry), entries)
)})

if self._has_comp_het_search:
comp_het_ht = self._annotate_families_inheritance(
comp_het_ht if comp_het_ht is not None else ht, COMPOUND_HET, inheritance_filter,
sorted_family_sample_data, annotation, entries_ht_field
sorted_family_sample_data, annotation, entries_ht_field, **kwargs
)

if is_any_affected or not (inheritance_filter or self._inheritance_mode):
Expand All @@ -600,15 +598,43 @@ def _filter_inheritance(

ht = None if self._inheritance_mode == COMPOUND_HET else self._annotate_families_inheritance(
ht, self._inheritance_mode, inheritance_filter, sorted_family_sample_data,
annotation, entries_ht_field
annotation, entries_ht_field, **kwargs
)

return ht, comp_het_ht

def _annotate_families_inheritance(
self, ht, inheritance_mode, inheritance_filter, sorted_family_sample_data,
annotation, entries_ht_field,
annotation, entries_ht_field, family_passes_inheritance_filter = None
):
if not family_passes_inheritance_filter:
family_passes_inheritance_filter = self._get_family_passes_inheritance_filter

entry_indices_by_gt = self._get_entry_indices_by_gt_map(
inheritance_filter, inheritance_mode, sorted_family_sample_data
)

for genotype, entry_indices in entry_indices_by_gt.items():
if not entry_indices:
continue
entry_indices = hl.dict(entry_indices)
ht = ht.annotate(**{
annotation: hl.enumerate(ht[entries_ht_field]).starmap(
lambda family_idx, family_samples: family_passes_inheritance_filter(
entry_indices, family_idx, genotype, family_samples, ht, annotation
)
)
})

return ht

def _get_family_passes_inheritance_filter(self, entry_indices, family_idx, genotype, family_samples, *args):
return hl.or_missing(
~entry_indices.contains(family_idx) | entry_indices[family_idx].all(
lambda sample_i: self.GENOTYPE_QUERY_MAP[genotype](family_samples[sample_i].GT)
), family_samples)

def _get_entry_indices_by_gt_map(self, inheritance_filter, inheritance_mode, sorted_family_sample_data):
individual_genotype_filter = (inheritance_filter or {}).get('genotype')

# Create a mapping of genotypes to check against a list of samples for a family
Expand All @@ -630,21 +656,7 @@ def _annotate_families_inheritance(
]
self.max_unaffected_samples = max(family_unaffected_counts) if family_unaffected_counts else 0

for genotype, entry_indices in entry_indices_by_gt.items():
if not entry_indices:
continue
entry_indices = hl.dict(entry_indices)
ht = ht.annotate(**{
annotation: hl.enumerate(ht[entries_ht_field]).starmap(
lambda family_i, family_samples: hl.or_missing(
~entry_indices.contains(family_i) | entry_indices[family_i].all(
lambda sample_i: self.GENOTYPE_QUERY_MAP[genotype](family_samples[sample_i].GT)
), family_samples,
),
)
})

return ht
return entry_indices_by_gt

def _get_family_passes_quality_filter(self, quality_filter, ht, **kwargs):
quality_filter = quality_filter or {}
Expand Down
77 changes: 59 additions & 18 deletions hail_search/queries/mito.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,37 +205,62 @@ def _filter_entries_ht_both_sample_types(
)

ch_ht = None
family_guid_idx_map = defaultdict(dict)
family_idx_map = defaultdict(dict)
for sample_type, sorted_family_sample_data in sample_types:
ht = self._annotate_initial_passes_inheritance(ht, sample_type)
ch_ht = self._annotate_initial_passes_inheritance(ch_ht, sample_type)
ht, ch_ht = self._filter_inheritance(
ht, ch_ht, inheritance_filter, sorted_family_sample_data,
annotation=sample_type.passes_inheritance_field, entries_ht_field=sample_type.family_entries_field
annotation=sample_type.passes_inheritance_field, entries_ht_field=sample_type.family_entries_field,
family_passes_inheritance_filter=self._get_family_passes_inheritance_filter_both_sample_types
)
for family_idx, samples in enumerate(sorted_family_sample_data):
family_guid = samples[0]['familyGuid']
family_guid_idx_map[family_guid][sample_type.value] = family_idx
family_idx_map[family_guid][sample_type.value] = family_idx

family_idx_map = hl.dict(family_guid_idx_map)
family_idx_map = hl.dict(family_idx_map)
ht = self._apply_multi_sample_type_entry_filters(ht, family_idx_map)
ch_ht = self._apply_multi_sample_type_entry_filters(ch_ht, family_idx_map)
return ht, ch_ht

@staticmethod
def _annotate_initial_passes_inheritance(ht, sample_type):
if ht is None:
return ht

return ht.annotate(**{
sample_type.passes_inheritance_field: ht[sample_type.family_entries_field].map(
lambda family_entries: hl.array(
hl.range(0, hl.len(family_entries)).map(lambda _: True)
)
)})

def _get_family_passes_inheritance_filter_both_sample_types(
self, entry_indices, family_idx, genotype, family_samples, ht, annotation
):
return hl.enumerate(ht[annotation][family_idx]).starmap(
lambda sample_idx, passes: (hl.case()
.when(~entry_indices.get(family_idx).contains(sample_idx), passes)
.when(~self.GENOTYPE_QUERY_MAP[genotype](family_samples[sample_idx].GT), False)
.default(passes))
)

def _apply_multi_sample_type_entry_filters(self, ht, family_idx_map):
if ht is None:
return ht

# Keep family from both sample types if either passes quality AND inheritance
for sample_type in SampleType:
ht = ht.annotate(**{
sample_type.family_entries_field: hl.enumerate(ht[sample_type.family_entries_field]).starmap(
lambda i, family_samples: hl.or_missing(
hl.bind(
lambda other_sample_type_idx: (
self._family_has_valid_sample_type_entries(ht, sample_type, i) |
self._family_has_valid_sample_type_entries(ht, sample_type.other_sample_type, other_sample_type_idx)
),
family_idx_map.get(hl.coalesce(family_samples)[0]['familyGuid']).get(sample_type.other_sample_type.value),
), family_samples)
lambda family_idx, family_samples: hl.or_missing(
hl.bind(lambda other_sample_type_family_idx: ((
self._family_has_valid_quality(ht, sample_type, family_idx) |
self._family_has_valid_quality(ht, sample_type.other_sample_type, other_sample_type_family_idx)
) &
self._family_has_valid_inheritance(ht, sample_type, family_idx, other_sample_type_family_idx) &
self._family_has_valid_inheritance(ht, sample_type.other_sample_type, other_sample_type_family_idx, family_idx)
), family_idx_map.get(hl.coalesce(family_samples)[0]['familyGuid']).get(sample_type.other_sample_type.value),
), family_samples)
)})

# Merge family entries and filters from both sample types
Expand All @@ -253,13 +278,29 @@ def _apply_multi_sample_type_entry_filters(self, ht, family_idx_map):
return ht.filter(ht.family_entries.any(hl.is_defined))

@staticmethod
def _family_has_valid_sample_type_entries(ht, sample_type, sample_type_family_idx):
# Note: This logic does not sufficiently handle case 2 here https://docs.google.com/presentation/d/1hqDV8ulhviUcR5C4PtNUqkCLXKDsc6pccgFVlFmWUAU/edit?usp=sharing
# and will need to be changed to support it - https://github.com/broadinstitute/seqr/issues/4403
def _family_has_valid_quality(ht, sample_type, sample_type_family_idx):
return (
hl.is_defined(sample_type_family_idx) &
hl.is_defined(ht[sample_type.passes_quality_field][sample_type_family_idx]) &
hl.is_defined(ht[sample_type.passes_inheritance_field][sample_type_family_idx])
hl.is_defined(ht[sample_type.passes_quality_field][sample_type_family_idx])
)

@staticmethod
def _family_has_valid_inheritance(ht, sample_type, family_idx, other_sample_type_family_idx):
return hl.bind(
lambda sample_type_fail_samples, other_sample_type_pass_samples: (
sample_type_fail_samples.all(other_sample_type_pass_samples.contains)
), hl.enumerate(ht[sample_type.family_entries_field][family_idx]).starmap(
lambda sample_idx, sample: hl.or_missing(
~ht[sample_type.passes_inheritance_field][family_idx][sample_idx],
sample['sampleId'],
)
).filter(hl.is_defined),
hl.enumerate(ht[sample_type.other_sample_type.family_entries_field][other_sample_type_family_idx]).starmap(
lambda sample_idx, sample: hl.or_missing(
ht[sample_type.other_sample_type.passes_inheritance_field][other_sample_type_family_idx][sample_idx],
sample['sampleId'],
)
).filter(hl.is_defined),
)

def _get_sample_genotype(self, samples, r=None, include_genotype_overrides=False, select_fields=None, **kwargs):
Expand Down
40 changes: 20 additions & 20 deletions hail_search/test_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,10 @@
FAMILY_2_MITO_SAMPLE_DATA, FAMILY_2_ALL_SAMPLE_DATA, MITO_VARIANT1, MITO_VARIANT2, MITO_VARIANT3, \
EXPECTED_SAMPLE_DATA_WITH_SEX, SV_WGS_SAMPLE_DATA_WITH_SEX, VARIANT_LOOKUP_VARIANT, \
MULTI_PROJECT_SAMPLE_TYPES_SAMPLE_DATA, FAMILY_2_BOTH_SAMPLE_TYPE_SAMPLE_DATA, \
VARIANT1_BOTH_SAMPLE_TYPES, VARIANT2_BOTH_SAMPLE_TYPES, FAMILY_2_BOTH_SAMPLE_TYPE_SAMPLE_DATA_MISSING_PARENTAL_WGS
VARIANT1_BOTH_SAMPLE_TYPES, VARIANT2_BOTH_SAMPLE_TYPES, FAMILY_2_BOTH_SAMPLE_TYPE_SAMPLE_DATA_MISSING_PARENTAL_WGS, \
VARIANT3_BOTH_SAMPLE_TYPES, VARIANT4_BOTH_SAMPLE_TYPES, VARIANT2_BOTH_SAMPLE_TYPES_PROBAND_WGS_ONLY, \
VARIANT1_BOTH_SAMPLE_TYPES_PROBAND_WGS_ONLY, VARIANT3_BOTH_SAMPLE_TYPES_PROBAND_WGS_ONLY, \
VARIANT4_BOTH_SAMPLE_TYPES_PROBAND_WGS_ONLY
from hail_search.web_app import init_web_app, sync_to_async_hail_query
from hail_search.queries.base import BaseHailTableQuery

Expand Down Expand Up @@ -365,34 +368,31 @@ async def test_both_sample_types_search(self):
MULTI_PROJECT_BOTH_SAMPLE_TYPE_VARIANTS, gene_counts=GENE_COUNTS, sample_data=MULTI_PROJECT_SAMPLE_TYPES_SAMPLE_DATA,
)

# Variant1 in family_2 is de novo in exome but maternally inherited in genome.
# Genome passes quality and inheritance, show genotypes for both sample types.
variant1_interval = ['1', 10438, 10440]
# Variant 1 is de novo in exome but inherited and homozygous in genome.
# Variant 2 is inherited and homozygous in exome and de novo and homozygous in genome.
# Variant 3 is inherited in both sample types. Variant 4 is de novo in both sample types.
inheritance_mode = 'recessive'
await self._assert_expected_search(
[VARIANT1_BOTH_SAMPLE_TYPES], sample_data=FAMILY_2_BOTH_SAMPLE_TYPE_SAMPLE_DATA, inheritance_mode=inheritance_mode,
**COMP_HET_ALL_PASS_FILTERS, intervals=[variant1_interval]
[VARIANT1_BOTH_SAMPLE_TYPES, VARIANT2_BOTH_SAMPLE_TYPES, [VARIANT3_BOTH_SAMPLE_TYPES, VARIANT4_BOTH_SAMPLE_TYPES]],
sample_data=FAMILY_2_BOTH_SAMPLE_TYPE_SAMPLE_DATA, inheritance_mode=inheritance_mode,
**COMP_HET_ALL_PASS_FILTERS
)
# Exome passes quality and inheritance, show genotypes for both sample types.
inheritance_mode = 'de_novo'
await self._assert_expected_search(
[VARIANT1_BOTH_SAMPLE_TYPES], sample_data=FAMILY_2_BOTH_SAMPLE_TYPE_SAMPLE_DATA, inheritance_mode=inheritance_mode,
intervals=[variant1_interval]
[VARIANT1_BOTH_SAMPLE_TYPES_PROBAND_WGS_ONLY, VARIANT2_BOTH_SAMPLE_TYPES_PROBAND_WGS_ONLY,
[VARIANT3_BOTH_SAMPLE_TYPES_PROBAND_WGS_ONLY, VARIANT4_BOTH_SAMPLE_TYPES_PROBAND_WGS_ONLY]],
sample_data=FAMILY_2_BOTH_SAMPLE_TYPE_SAMPLE_DATA_MISSING_PARENTAL_WGS, inheritance_mode=inheritance_mode,
**COMP_HET_ALL_PASS_FILTERS
)

# Variant 2 in family_2 is inherited in exome and there is no parental data in genome.
# Genome and exome pass quality and inheritance, show genotypes for both sample types.
variant2_interval = ['1', 38724418, 38724420]
inheritance_mode = 'recessive'
inheritance_mode = 'de_novo'
await self._assert_expected_search(
[VARIANT2_BOTH_SAMPLE_TYPES], sample_data=FAMILY_2_BOTH_SAMPLE_TYPE_SAMPLE_DATA_MISSING_PARENTAL_WGS,
inheritance_mode=inheritance_mode, **COMP_HET_ALL_PASS_FILTERS, intervals=[variant2_interval]
[VARIANT1_BOTH_SAMPLE_TYPES, VARIANT2_BOTH_SAMPLE_TYPES, VARIANT4_BOTH_SAMPLE_TYPES],
sample_data=FAMILY_2_BOTH_SAMPLE_TYPE_SAMPLE_DATA, inheritance_mode=inheritance_mode,
)
# Genome passes quality and inheritance exome fails inheritance (parental data shows variant is inherited).
inheritance_mode = 'de_novo'
# Variant 2 fails inheritance when parental data is missing in genome
await self._assert_expected_search(
[VARIANT2_BOTH_SAMPLE_TYPES], sample_data=FAMILY_2_BOTH_SAMPLE_TYPE_SAMPLE_DATA_MISSING_PARENTAL_WGS,
inheritance_mode=inheritance_mode, intervals=[variant2_interval]
[VARIANT1_BOTH_SAMPLE_TYPES_PROBAND_WGS_ONLY, VARIANT4_BOTH_SAMPLE_TYPES_PROBAND_WGS_ONLY],
sample_data=FAMILY_2_BOTH_SAMPLE_TYPE_SAMPLE_DATA_MISSING_PARENTAL_WGS, inheritance_mode=inheritance_mode,
)

async def test_inheritance_filter(self):
Expand Down
Loading

0 comments on commit e2293f3

Please sign in to comment.