Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

handle wes/wgs inheritance edge case #4440

Merged
merged 18 commits into from
Nov 12, 2024
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
This folder comprises a Hail (www.hail.is) native Table or MatrixTable.
Written with version 0.2.130-bea04d9c79b5
Created at 2024/10/02 14:46:35
Created at 2024/11/04 13:45:23
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
58 changes: 35 additions & 23 deletions hail_search/queries/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -472,9 +472,7 @@ def _apply_entry_filters(ht):
def _filter_single_entries_table(self, ht, project_families, inheritance_filter=None, quality_filter=None, is_merged_ht=False, **kwargs):
ht, sorted_family_sample_data = self._add_entry_sample_families(ht, project_families, is_merged_ht)
ht = self._filter_quality(ht, quality_filter, **kwargs)
ht, ch_ht = self._filter_inheritance(
ht, None, inheritance_filter, sorted_family_sample_data,
)
ht, ch_ht = self._filter_inheritance(ht, None, inheritance_filter, sorted_family_sample_data)
ht = self._apply_entry_filters(ht)
ch_ht = self._apply_entry_filters(ch_ht)

Expand Down Expand Up @@ -574,7 +572,7 @@ def _get_sample_type(cls, family_index, ht_globals):

def _filter_inheritance(
self, ht, comp_het_ht, inheritance_filter, sorted_family_sample_data,
annotation='family_entries', entries_ht_field='family_entries'
annotation='family_entries', entries_ht_field='family_entries', **kwargs
):
any_valid_entry = lambda x: self.GENOTYPE_QUERY_MAP[HAS_ALT](x.GT)

Expand All @@ -584,14 +582,14 @@ def _filter_inheritance(
any_valid_entry = lambda x: prev_any_valid_entry(x) & (x.affected_id == AFFECTED_ID)

ht = ht.annotate(**{
annotation: ht[entries_ht_field].map(
entries_ht_field: ht[entries_ht_field].map(
lambda entries: hl.or_missing(entries.any(any_valid_entry), entries)
)})

if self._has_comp_het_search:
comp_het_ht = self._annotate_families_inheritance(
comp_het_ht if comp_het_ht is not None else ht, COMPOUND_HET, inheritance_filter,
sorted_family_sample_data, annotation, entries_ht_field
sorted_family_sample_data, annotation, entries_ht_field, **kwargs
)

if is_any_affected or not (inheritance_filter or self._inheritance_mode):
Expand All @@ -600,15 +598,43 @@ def _filter_inheritance(

ht = None if self._inheritance_mode == COMPOUND_HET else self._annotate_families_inheritance(
ht, self._inheritance_mode, inheritance_filter, sorted_family_sample_data,
annotation, entries_ht_field
annotation, entries_ht_field, **kwargs
)

return ht, comp_het_ht

def _annotate_families_inheritance(
self, ht, inheritance_mode, inheritance_filter, sorted_family_sample_data,
annotation, entries_ht_field,
annotation, entries_ht_field, family_passes_inheritance_filter = None
):
if not family_passes_inheritance_filter:
family_passes_inheritance_filter = self._get_family_passes_inheritance_filter

entry_indices_by_gt = self._get_entry_indices_by_gt_map(
inheritance_filter, inheritance_mode, sorted_family_sample_data
)

for genotype, entry_indices in entry_indices_by_gt.items():
if not entry_indices:
continue
entry_indices = hl.dict(entry_indices)
ht = ht.annotate(**{
annotation: hl.enumerate(ht[entries_ht_field]).starmap(
lambda family_idx, family_samples: family_passes_inheritance_filter(
entry_indices, family_idx, genotype, family_samples, ht, annotation
)
)
})

return ht

def _get_family_passes_inheritance_filter(self, entry_indices, family_idx, genotype, family_samples, *args):
return hl.or_missing(
~entry_indices.contains(family_idx) | entry_indices[family_idx].all(
lambda sample_i: self.GENOTYPE_QUERY_MAP[genotype](family_samples[sample_i].GT)
), family_samples)

def _get_entry_indices_by_gt_map(self, inheritance_filter, inheritance_mode, sorted_family_sample_data):
individual_genotype_filter = (inheritance_filter or {}).get('genotype')

# Create a mapping of genotypes to check against a list of samples for a family
Expand All @@ -630,21 +656,7 @@ def _annotate_families_inheritance(
]
self.max_unaffected_samples = max(family_unaffected_counts) if family_unaffected_counts else 0

for genotype, entry_indices in entry_indices_by_gt.items():
if not entry_indices:
continue
entry_indices = hl.dict(entry_indices)
ht = ht.annotate(**{
annotation: hl.enumerate(ht[entries_ht_field]).starmap(
lambda family_i, family_samples: hl.or_missing(
~entry_indices.contains(family_i) | entry_indices[family_i].all(
lambda sample_i: self.GENOTYPE_QUERY_MAP[genotype](family_samples[sample_i].GT)
), family_samples,
),
)
})

return ht
return entry_indices_by_gt

def _get_family_passes_quality_filter(self, quality_filter, ht, **kwargs):
quality_filter = quality_filter or {}
Expand Down
77 changes: 59 additions & 18 deletions hail_search/queries/mito.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,37 +205,62 @@ def _filter_entries_ht_both_sample_types(
)

ch_ht = None
family_guid_idx_map = defaultdict(dict)
family_idx_map = defaultdict(dict)
for sample_type, sorted_family_sample_data in sample_types:
ht = self._annotate_initial_passes_inheritance(ht, sample_type)
ch_ht = self._annotate_initial_passes_inheritance(ch_ht, sample_type)
ht, ch_ht = self._filter_inheritance(
ht, ch_ht, inheritance_filter, sorted_family_sample_data,
annotation=sample_type.passes_inheritance_field, entries_ht_field=sample_type.family_entries_field
annotation=sample_type.passes_inheritance_field, entries_ht_field=sample_type.family_entries_field,
family_passes_inheritance_filter=self._get_family_passes_inheritance_filter_both_sample_types
)
for family_idx, samples in enumerate(sorted_family_sample_data):
family_guid = samples[0]['familyGuid']
family_guid_idx_map[family_guid][sample_type.value] = family_idx
family_idx_map[family_guid][sample_type.value] = family_idx

family_idx_map = hl.dict(family_guid_idx_map)
family_idx_map = hl.dict(family_idx_map)
ht = self._apply_multi_sample_type_entry_filters(ht, family_idx_map)
ch_ht = self._apply_multi_sample_type_entry_filters(ch_ht, family_idx_map)
return ht, ch_ht

@staticmethod
def _annotate_initial_passes_inheritance(ht, sample_type):
if ht is None:
return ht

return ht.annotate(**{
sample_type.passes_inheritance_field: ht[sample_type.family_entries_field].map(
lambda family_entries: hl.array(
hl.range(0, hl.len(family_entries)).map(lambda _: True)
)
)})

def _get_family_passes_inheritance_filter_both_sample_types(
self, entry_indices, family_idx, genotype, family_samples, ht, annotation
):
return hl.enumerate(ht[annotation][family_idx]).starmap(
lambda sample_idx, passes: (hl.case()
.when(~entry_indices.get(family_idx).contains(sample_idx), passes)
.when(~self.GENOTYPE_QUERY_MAP[genotype](family_samples[sample_idx].GT), False)
.default(passes))
)

def _apply_multi_sample_type_entry_filters(self, ht, family_idx_map):
if ht is None:
return ht

# Keep family from both sample types if either passes quality AND inheritance
for sample_type in SampleType:
ht = ht.annotate(**{
sample_type.family_entries_field: hl.enumerate(ht[sample_type.family_entries_field]).starmap(
lambda i, family_samples: hl.or_missing(
hl.bind(
lambda other_sample_type_idx: (
self._family_has_valid_sample_type_entries(ht, sample_type, i) |
self._family_has_valid_sample_type_entries(ht, sample_type.other_sample_type, other_sample_type_idx)
),
family_idx_map.get(hl.coalesce(family_samples)[0]['familyGuid']).get(sample_type.other_sample_type.value),
), family_samples)
lambda family_idx, family_samples: hl.or_missing(
hl.bind(lambda other_sample_type_family_idx: ((
self._family_has_valid_quality(ht, sample_type, family_idx) |
self._family_has_valid_quality(ht, sample_type.other_sample_type, other_sample_type_family_idx)
) &
self._family_has_valid_inheritance(ht, sample_type, family_idx, other_sample_type_family_idx) &
self._family_has_valid_inheritance(ht, sample_type.other_sample_type, other_sample_type_family_idx, family_idx)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Commenting for posterity that I'm not 100% sure that this is the final logic we want, because it would allow us to return variants that pass quality and fail inheritance in one sample type and fail quality and pass inheritance in the other, meaning theres no sample type that clearly passes both inheritance and quality. However, I think we maybe do want to return these, the logic gets kind of confusing and I can't quite be sure these would not be helpful. I think being overly permissive here is better, if the analysts are seeing a bunch of cases where they ultimately think that the returned variants are not helpful and should be filtered out we can always go back later and make this a stricter criteria, so we should leave this as is for now

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This makes sense, we may want to consider quality and inheritance passing together and handle it differently (the & seems like it could be too simple) but it's not clear to me what the logic/change would be. I agree that trying this out and getting feedback from analysts before we do that is the way to go.

), family_idx_map.get(hl.coalesce(family_samples)[0]['familyGuid']).get(sample_type.other_sample_type.value),
), family_samples)
)})

# Merge family entries and filters from both sample types
Expand All @@ -253,13 +278,29 @@ def _apply_multi_sample_type_entry_filters(self, ht, family_idx_map):
return ht.filter(ht.family_entries.any(hl.is_defined))

@staticmethod
def _family_has_valid_sample_type_entries(ht, sample_type, sample_type_family_idx):
# Note: This logic does not sufficiently handle case 2 here https://docs.google.com/presentation/d/1hqDV8ulhviUcR5C4PtNUqkCLXKDsc6pccgFVlFmWUAU/edit?usp=sharing
# and will need to be changed to support it - https://github.com/broadinstitute/seqr/issues/4403
def _family_has_valid_quality(ht, sample_type, sample_type_family_idx):
return (
hl.is_defined(sample_type_family_idx) &
hl.is_defined(ht[sample_type.passes_quality_field][sample_type_family_idx]) &
hl.is_defined(ht[sample_type.passes_inheritance_field][sample_type_family_idx])
hl.is_defined(ht[sample_type.passes_quality_field][sample_type_family_idx])
)

@staticmethod
def _family_has_valid_inheritance(ht, sample_type, family_idx, other_sample_type_family_idx):
return hl.bind(
lambda sample_type_fail_samples, other_sample_type_pass_samples: (
sample_type_fail_samples.all(other_sample_type_pass_samples.contains)
), hl.enumerate(ht[sample_type.family_entries_field][family_idx]).starmap(
lambda sample_idx, sample: hl.or_missing(
~ht[sample_type.passes_inheritance_field][family_idx][sample_idx],
sample['sampleId'],
)
).filter(hl.is_defined),
hl.enumerate(ht[sample_type.other_sample_type.family_entries_field][other_sample_type_family_idx]).starmap(
lambda sample_idx, sample: hl.or_missing(
ht[sample_type.other_sample_type.passes_inheritance_field][other_sample_type_family_idx][sample_idx],
sample['sampleId'],
)
).filter(hl.is_defined),
)

def _get_sample_genotype(self, samples, r=None, include_genotype_overrides=False, select_fields=None, **kwargs):
Expand Down
40 changes: 20 additions & 20 deletions hail_search/test_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,10 @@
FAMILY_2_MITO_SAMPLE_DATA, FAMILY_2_ALL_SAMPLE_DATA, MITO_VARIANT1, MITO_VARIANT2, MITO_VARIANT3, \
EXPECTED_SAMPLE_DATA_WITH_SEX, SV_WGS_SAMPLE_DATA_WITH_SEX, VARIANT_LOOKUP_VARIANT, \
MULTI_PROJECT_SAMPLE_TYPES_SAMPLE_DATA, FAMILY_2_BOTH_SAMPLE_TYPE_SAMPLE_DATA, \
VARIANT1_BOTH_SAMPLE_TYPES, VARIANT2_BOTH_SAMPLE_TYPES, FAMILY_2_BOTH_SAMPLE_TYPE_SAMPLE_DATA_MISSING_PARENTAL_WGS
VARIANT1_BOTH_SAMPLE_TYPES, VARIANT2_BOTH_SAMPLE_TYPES, FAMILY_2_BOTH_SAMPLE_TYPE_SAMPLE_DATA_MISSING_PARENTAL_WGS, \
VARIANT3_BOTH_SAMPLE_TYPES, VARIANT4_BOTH_SAMPLE_TYPES, VARIANT2_BOTH_SAMPLE_TYPES_PROBAND_WGS_ONLY, \
VARIANT1_BOTH_SAMPLE_TYPES_PROBAND_WGS_ONLY, VARIANT3_BOTH_SAMPLE_TYPES_PROBAND_WGS_ONLY, \
VARIANT4_BOTH_SAMPLE_TYPES_PROBAND_WGS_ONLY
from hail_search.web_app import init_web_app, sync_to_async_hail_query
from hail_search.queries.base import BaseHailTableQuery

Expand Down Expand Up @@ -365,34 +368,31 @@ async def test_both_sample_types_search(self):
MULTI_PROJECT_BOTH_SAMPLE_TYPE_VARIANTS, gene_counts=GENE_COUNTS, sample_data=MULTI_PROJECT_SAMPLE_TYPES_SAMPLE_DATA,
)

# Variant1 in family_2 is de novo in exome but maternally inherited in genome.
# Genome passes quality and inheritance, show genotypes for both sample types.
variant1_interval = ['1', 10438, 10440]
# Variant 1 is de novo in exome but inherited and homozygous in genome.
# Variant 2 is inherited and homozygous in exome and de novo and homozygous in genome.
# Variant 3 is inherited in both sample types. Variant 4 is de novo in both sample types.
inheritance_mode = 'recessive'
await self._assert_expected_search(
[VARIANT1_BOTH_SAMPLE_TYPES], sample_data=FAMILY_2_BOTH_SAMPLE_TYPE_SAMPLE_DATA, inheritance_mode=inheritance_mode,
**COMP_HET_ALL_PASS_FILTERS, intervals=[variant1_interval]
[VARIANT1_BOTH_SAMPLE_TYPES, VARIANT2_BOTH_SAMPLE_TYPES, [VARIANT3_BOTH_SAMPLE_TYPES, VARIANT4_BOTH_SAMPLE_TYPES]],
sample_data=FAMILY_2_BOTH_SAMPLE_TYPE_SAMPLE_DATA, inheritance_mode=inheritance_mode,
**COMP_HET_ALL_PASS_FILTERS
)
# Exome passes quality and inheritance, show genotypes for both sample types.
inheritance_mode = 'de_novo'
await self._assert_expected_search(
[VARIANT1_BOTH_SAMPLE_TYPES], sample_data=FAMILY_2_BOTH_SAMPLE_TYPE_SAMPLE_DATA, inheritance_mode=inheritance_mode,
intervals=[variant1_interval]
[VARIANT1_BOTH_SAMPLE_TYPES_PROBAND_WGS_ONLY, VARIANT2_BOTH_SAMPLE_TYPES_PROBAND_WGS_ONLY,
[VARIANT3_BOTH_SAMPLE_TYPES_PROBAND_WGS_ONLY, VARIANT4_BOTH_SAMPLE_TYPES_PROBAND_WGS_ONLY]],
sample_data=FAMILY_2_BOTH_SAMPLE_TYPE_SAMPLE_DATA_MISSING_PARENTAL_WGS, inheritance_mode=inheritance_mode,
**COMP_HET_ALL_PASS_FILTERS
)

# Variant 2 in family_2 is inherited in exome and there is no parental data in genome.
# Genome and exome pass quality and inheritance, show genotypes for both sample types.
variant2_interval = ['1', 38724418, 38724420]
inheritance_mode = 'recessive'
inheritance_mode = 'de_novo'
await self._assert_expected_search(
[VARIANT2_BOTH_SAMPLE_TYPES], sample_data=FAMILY_2_BOTH_SAMPLE_TYPE_SAMPLE_DATA_MISSING_PARENTAL_WGS,
inheritance_mode=inheritance_mode, **COMP_HET_ALL_PASS_FILTERS, intervals=[variant2_interval]
[VARIANT1_BOTH_SAMPLE_TYPES, VARIANT2_BOTH_SAMPLE_TYPES, VARIANT4_BOTH_SAMPLE_TYPES],
sample_data=FAMILY_2_BOTH_SAMPLE_TYPE_SAMPLE_DATA, inheritance_mode=inheritance_mode,
)
# Genome passes quality and inheritance exome fails inheritance (parental data shows variant is inherited).
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I still think a comment here explaining whats being tested is helpful. Maybe something like "Variant 2 fails inheritance when parental data is present"

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

updated!

inheritance_mode = 'de_novo'
# Variant 2 fails inheritance when parental data is missing in genome
await self._assert_expected_search(
[VARIANT2_BOTH_SAMPLE_TYPES], sample_data=FAMILY_2_BOTH_SAMPLE_TYPE_SAMPLE_DATA_MISSING_PARENTAL_WGS,
inheritance_mode=inheritance_mode, intervals=[variant2_interval]
[VARIANT1_BOTH_SAMPLE_TYPES_PROBAND_WGS_ONLY, VARIANT4_BOTH_SAMPLE_TYPES_PROBAND_WGS_ONLY],
sample_data=FAMILY_2_BOTH_SAMPLE_TYPE_SAMPLE_DATA_MISSING_PARENTAL_WGS, inheritance_mode=inheritance_mode,
)

async def test_inheritance_filter(self):
Expand Down
Loading
Loading