From 4eef429962f7414f5187e5d55e38d91092b16f96 Mon Sep 17 00:00:00 2001 From: Sai Chen Date: Tue, 17 Sep 2019 17:25:19 -0700 Subject: [PATCH 1/2] GT-825 v2.4 release --- paper-data/download-instructions.txt | 26 +- .../genotyping_test_2/expected-genotypes.vcf | 6 +- .../insertions/insertion-test-1.json | 100 ++++---- .../insertions/insertion-test-1.noas.json | 84 +++---- .../insertions/insertion-test-1.ref.fa | 2 - .../insertions/insertion-test-1.ref.fa.fai | 1 - .../paragraph/insertions/insertion-test-1.vcf | 69 +----- .../insertions/insertion-test-2.json | 131 ---------- .../insertions/insertion-test-2.noas.json | 109 --------- .../insertions/insertion-test-2.ref.fa | 2 - .../insertions/insertion-test-2.ref.fa.fai | 1 - .../paragraph/insertions/insertion-test-2.vcf | 66 ----- .../round-trip-genotyping/sample1.sam | 6 - .../round-trip-genotyping/sample2.sam | 6 - src/c++/lib/idxdepth/IndexBinning.cpp | 2 +- src/python/bin/multigrmpy.py | 2 +- src/python/lib/grm/vcfgraph/vcfupdate.py | 7 +- src/python/test/test_VCF2Paragraph.py | 95 +------ src/python/test/test_multigrmpy.py | 231 ------------------ 19 files changed, 113 insertions(+), 833 deletions(-) delete mode 100644 share/test-data/paragraph/insertions/insertion-test-1.ref.fa delete mode 100644 share/test-data/paragraph/insertions/insertion-test-1.ref.fa.fai delete mode 100644 share/test-data/paragraph/insertions/insertion-test-2.json delete mode 100644 share/test-data/paragraph/insertions/insertion-test-2.noas.json delete mode 100644 share/test-data/paragraph/insertions/insertion-test-2.ref.fa delete mode 100644 share/test-data/paragraph/insertions/insertion-test-2.ref.fa.fai delete mode 100644 share/test-data/paragraph/insertions/insertion-test-2.vcf delete mode 100644 share/test-data/round-trip-genotyping/sample1.sam delete mode 100644 share/test-data/round-trip-genotyping/sample2.sam diff --git a/paper-data/download-instructions.txt b/paper-data/download-instructions.txt index da42ca2..d106af9 100644 --- a/paper-data/download-instructions.txt +++ b/paper-data/download-instructions.txt @@ -1,15 +1,25 @@ -Please use the following S3 link to download the output VCF from Paragraph manuscript: +Please use the following S3 link to download data. -Genotypes of HG002 Long-read ground truth (LRGT) SVs on the Illumina HiSeq X 34.5x bam (VCF format): -https://s3-us-west-1.amazonaws.com/paragraph-paper-data/hg002_sniffles_ccs.paragraph.vcf.gz +The VCF for long read ground truth (with PBSV genotypes): +https://paragraph-paper-data.s3-us-west-1.amazonaws.com/sample3-sw.sorted.pass.vcf.gz +Note that chrX and chrY were excluded in our analysis -HG002 Long-read ground truth (LRGT) SVs on 100 individuals from Polaris (JSON format): -Site only: -https://s3-us-west-1.amazonaws.com/paragraph-paper-data/sniffles_ccs_polaris.filtered.autosome.del_ins.json.gz +The VCF for all SVs (LRGT + Clustered SVs): +https://paragraph-paper-data.s3-us-west-1.amazonaws.com/sample3-sw.sorted.vcf.gz -Genotypes included: -https://s3-us-west-1.amazonaws.com/paragraph-paper-data/sniffles_ccs_polaris.json.gz +In the filter field, we have "PASS" for LRGT SVs and "NEARBY" for clustered SVs. Note that chrX and chrY were excluded in all analysis in our manuscript. + +Paragraph genotypes of LRGT and clustered SVs for: +NA24385/HG002: +https://paragraph-paper-data.s3-us-west-1.amazonaws.com/HG002.paragraph.vcf.gz +NA12878: +https://paragraph-paper-data.s3-us-west-1.amazonaws.com/NA12878.paragraph.vcf.gz +NA24361/HG005: +https://paragraph-paper-data.s3-us-west-1.amazonaws.com/HG005.paragraph.vcf.gz + +Paragraph genotyping summary of LRGT and clustered SVs in the 100 unrelated individuals in the Polaris population: +https://paragraph-paper-data.s3-us-west-1.amazonaws.com/polaris.summary.csv Sample name map (S3 ID to regular ID): https://s3-us-west-1.amazonaws.com/paragraph-paper-data/sample_map.txt diff --git a/share/test-data/genotyping_test_2/expected-genotypes.vcf b/share/test-data/genotyping_test_2/expected-genotypes.vcf index f8038e9..fef18cd 100644 --- a/share/test-data/genotyping_test_2/expected-genotypes.vcf +++ b/share/test-data/genotyping_test_2/expected-genotypes.vcf @@ -1,3 +1,3 @@ -chrA 1500 swap1 GCTGCCCCTT GCTAGTAACTT . PASS GRMPY_ID=swaps.vcf@42527ba8a8840f1c955f8e6879b567988bbf858febd25ba5b4555895dbbcfef7:1 GT:OLD_GT:DP:FT:AD:ADF:ADR:PL 0/0:0/0:1100:PASS:604,4:604,4:0,0:0,3364,32458 -chrB 1499 swap2 TAGGCCATACG TTCAGGTTGTCTTATGCTTGGCATCGTTCTT . PASS GRMPY_ID=swaps.vcf@42527ba8a8840f1c955f8e6879b567988bbf858febd25ba5b4555895dbbcfef7:2 GT:OLD_GT:DP:FT:AD:ADF:ADR:PL 1/1:1/1:952:PASS:0,596:0,596:0,0:32425,3031,0 -chrC 1500 swap3 CGCGTTGTAAGCTACCATATTCAATCTGTGCCAGGGATCGAGCCACAGGCACCGCTCAATCTCGCGGGAGATTGTGCAAAGAGTCTTACCTTTCGTCGACCTCCGCCTCGCTCGTGAATCTTGCGATCGATTGAAAGTCACGGGTAGAGTGATGTTCGGGCGAATCAGACAGGCAGATGCAATGGAGGTTCCCGGATAGT CCGGAGATACCCTCTGTCTCCGCTAACATTTCCCCGCGGACAAAATTTGTCGGCTGGGAGGAATAGGTGCAAACGCATAATATACCCCTCTTACTTTTTGTTAGGGTCTAGTCCGAATCTAAAAAATGACTAAGGACTCTCAGAGTGATGGATATATGCCTCGCGACGCCGATCTGTGCTTATGTCGCAGCTTTGGCATCAAACCAGTTTCACATACCCTGCCTAAAAGATTCCCATACTGCGAAATCGCAAGATTGTACAAGTTGTAGTCTGTGCGCCAGCGTGAGCACGGCACTCGGT . PASS GRMPY_ID=swaps.vcf@42527ba8a8840f1c955f8e6879b567988bbf858febd25ba5b4555895dbbcfef7:3 GT:OLD_GT:DP:FT:AD:ADF:ADR:PL 0/1:0/1:1008:PASS:538,538:538,538:0,0:4132,0,4132 +chrA 1500 swap1 GCTGCCCCTT GCTAGTAACTT . PASS GRMPY_ID=swaps.vcf@42527ba8a8840f1c955f8e6879b567988bbf858febd25ba5b4555895dbbcfef7:1 GT:OLD_GT:DP:FT:AD:ADF:ADR:PL 0/0:0/0:1100:PASS:2396,10:2396,10:0,0:0,3364,32458 +chrB 1499 swap2 TAGGCCATACG TTCAGGTTGTCTTATGCTTGGCATCGTTCTT . PASS GRMPY_ID=swaps.vcf@42527ba8a8840f1c955f8e6879b567988bbf858febd25ba5b4555895dbbcfef7:2 GT:OLD_GT:DP:FT:AD:ADF:ADR:PL 1/1:1/1:952:PASS:0,1788:0,1788:0,0:32425,3031,0 +chrC 1500 swap3 CGCGTTGTAAGCTACCATATTCAATCTGTGCCAGGGATCGAGCCACAGGCACCGCTCAATCTCGCGGGAGATTGTGCAAAGAGTCTTACCTTTCGTCGACCTCCGCCTCGCTCGTGAATCTTGCGATCGATTGAAAGTCACGGGTAGAGTGATGTTCGGGCGAATCAGACAGGCAGATGCAATGGAGGTTCCCGGATAGT CCGGAGATACCCTCTGTCTCCGCTAACATTTCCCCGCGGACAAAATTTGTCGGCTGGGAGGAATAGGTGCAAACGCATAATATACCCCTCTTACTTTTTGTTAGGGTCTAGTCCGAATCTAAAAAATGACTAAGGACTCTCAGAGTGATGGATATATGCCTCGCGACGCCGATCTGTGCTTATGTCGCAGCTTTGGCATCAAACCAGTTTCACATACCCTGCCTAAAAGATTCCCATACTGCGAAATCGCAAGATTGTACAAGTTGTAGTCTGTGCGCCAGCGTGAGCACGGCACTCGGT . PASS GRMPY_ID=swaps.vcf@42527ba8a8840f1c955f8e6879b567988bbf858febd25ba5b4555895dbbcfef7:3 GT:OLD_GT:DP:FT:AD:ADF:ADR:PL 0/1:0/1:1008:PASS:1762,1426:1762,1426:0,0:4132,0,4132 diff --git a/share/test-data/paragraph/insertions/insertion-test-1.json b/share/test-data/paragraph/insertions/insertion-test-1.json index 3caa944..75cc0c6 100644 --- a/share/test-data/paragraph/insertions/insertion-test-1.json +++ b/share/test-data/paragraph/insertions/insertion-test-1.json @@ -2,89 +2,75 @@ "edges": [ { "from": "source", - "name": "source_chr22:32-31:TGAGC", - "to": "chr22:32-31:TGAGC" + "name": "source_chr20:149014-149013:CTGAC", + "to": "chr20:149014-149013:CTGAC" }, { "from": "source", - "name": "source_ref-chr22:26-30", - "to": "ref-chr22:26-30" + "name": "source_ref-chr20:149008-149013", + "to": "ref-chr20:149008-149013" }, { - "from": "chr22:32-31:TGAGC", - "name": "chr22:32-31:TGAGC_ref-chr22:32-36", + "from": "chr20:149014-149013:CTGAC", + "name": "chr20:149014-149013:CTGAC_ref-chr20:149014-149018", "sequences": [ - "20482:1" + "HG002_pbsv.INS.49079:1" ], - "to": "ref-chr22:32-36" + "to": "ref-chr20:149014-149018" }, { - "from": "ref-chr22:26-30", - "name": "ref-chr22:26-30_ref-chr22:31-31", + "from": "ref-chr20:149008-149013", + "name": "ref-chr20:149008-149013_chr20:149014-149013:CCATA", "sequences": [ - "20482:0", - "REF" + "HG002_pbsv.INS.49079:1" ], - "to": "ref-chr22:31-31" + "to": "chr20:149014-149013:CCATA" }, { - "from": "ref-chr22:31-31", - "name": "ref-chr22:31-31_ref-chr22:32-36", + "from": "ref-chr20:149008-149013", + "name": "ref-chr20:149008-149013_ref-chr20:149014-149018", "sequences": [ - "20482:0", + "HG002_pbsv.INS.49079:0", "REF" ], - "to": "ref-chr22:32-36" + "to": "ref-chr20:149014-149018" }, { - "from": "ref-chr22:31-31", - "name": "ref-chr22:31-31_chr22:32-31:AGGGC", - "sequences": [ - "20482:1" - ], - "to": "chr22:32-31:AGGGC" - }, - { - "from": "ref-chr22:32-36", - "name": "ref-chr22:32-36_sink", + "from": "chr20:149014-149013:CCATA", + "name": "chr20:149014-149013:CCATA_sink", "to": "sink" }, { - "from": "chr22:32-31:AGGGC", - "name": "chr22:32-31:AGGGC_sink", + "from": "ref-chr20:149014-149018", + "name": "ref-chr20:149014-149018_sink", "to": "sink" } ], - "model_name": "Graph from insertions/insertion-test-1.vcf", + "model_name": "Graph from ../paragraph-tools/share/test-data/paragraph/insertions/insertion-test-1.vcf", "nodes": [ { "name": "source", "sequence": "NNNNNNNNNN" }, { - "name": "chr22:32-31:TGAGC", - "position": "chr22:32-31", - "sequence": "TGAGC" - }, - { - "name": "ref-chr22:26-30", - "reference": "chr22:26-30", - "reference_sequence": "AGACC" + "name": "chr20:149014-149013:CTGAC", + "position": "chr20:149014-149013", + "sequence": "CTGAC" }, { - "name": "ref-chr22:31-31", - "reference": "chr22:31-31", - "reference_sequence": "A" + "name": "ref-chr20:149008-149013", + "reference": "chr20:149008-149013", + "reference_sequence": "GAACAA" }, { - "name": "ref-chr22:32-36", - "reference": "chr22:32-36", - "reference_sequence": "G" + "name": "chr20:149014-149013:CCATA", + "position": "chr20:149014-149013", + "sequence": "CCATA" }, { - "name": "chr22:32-31:AGGGC", - "position": "chr22:32-31", - "sequence": "AGGGC" + "name": "ref-chr20:149014-149018", + "reference": "chr20:149014-149018", + "reference_sequence": "CCATA" }, { "name": "sink", @@ -94,38 +80,36 @@ "paths": [ { "nodes": [ - "ref-chr22:26-30", - "ref-chr22:31-31", - "ref-chr22:32-36" + "ref-chr20:149008-149013", + "ref-chr20:149014-149018" ], "path_id": "REF|1", "sequence": "REF" }, { "nodes": [ - "ref-chr22:26-30", - "ref-chr22:31-31", - "chr22:32-31:AGGGC" + "ref-chr20:149008-149013", + "chr20:149014-149013:CCATA" ], "path_id": "ALT|1", "sequence": "ALT" }, { "nodes": [ - "chr22:32-31:TGAGC", - "ref-chr22:32-36" + "chr20:149014-149013:CTGAC", + "ref-chr20:149014-149018" ], "path_id": "ALT|2", "sequence": "ALT" } ], "sequencenames": [ - "20482:0", - "20482:1", "ALT", + "HG002_pbsv.INS.49079:0", + "HG002_pbsv.INS.49079:1", "REF" ], "target_regions": [ - "chr22:26-36" + "chr20:149008-149018" ] } diff --git a/share/test-data/paragraph/insertions/insertion-test-1.noas.json b/share/test-data/paragraph/insertions/insertion-test-1.noas.json index 5e9124e..2106530 100644 --- a/share/test-data/paragraph/insertions/insertion-test-1.noas.json +++ b/share/test-data/paragraph/insertions/insertion-test-1.noas.json @@ -2,74 +2,60 @@ "edges": [ { "from": "source", - "name": "source_ref-chr22:26-30", - "to": "ref-chr22:26-30" + "name": "source_ref-chr20:149008-149013", + "to": "ref-chr20:149008-149013" }, { - "from": "ref-chr22:26-30", - "name": "ref-chr22:26-30_ref-chr22:31-31", + "from": "ref-chr20:149008-149013", + "name": "ref-chr20:149008-149013_chr20:149014-149013:CCATATTTGGGAGGCAATTTTACCTGTTCTCAAGGCCGCATCTCTACCCCATCTCATGCGAATCCTGAC", "sequences": [ - "20482:0", - "REF" + "HG002_pbsv.INS.49079:1" ], - "to": "ref-chr22:31-31" + "to": "chr20:149014-149013:CCATATTTGGGAGGCAATTTTACCTGTTCTCAAGGCCGCATCTCTACCCCATCTCATGCGAATCCTGAC" }, { - "from": "ref-chr22:31-31", - "name": "ref-chr22:31-31_chr22:32-31:AGGGCAAACATTCAGGACACAGCAGAGTATTGTTGTAATCCTATGTGAGC", + "from": "ref-chr20:149008-149013", + "name": "ref-chr20:149008-149013_ref-chr20:149014-149018", "sequences": [ - "20482:1" - ], - "to": "chr22:32-31:AGGGCAAACATTCAGGACACAGCAGAGTATTGTTGTAATCCTATGTGAGC" - }, - { - "from": "ref-chr22:31-31", - "name": "ref-chr22:31-31_ref-chr22:32-36", - "sequences": [ - "20482:0", + "HG002_pbsv.INS.49079:0", "REF" ], - "to": "ref-chr22:32-36" + "to": "ref-chr20:149014-149018" }, { - "from": "chr22:32-31:AGGGCAAACATTCAGGACACAGCAGAGTATTGTTGTAATCCTATGTGAGC", - "name": "chr22:32-31:AGGGCAAACATTCAGGACACAGCAGAGTATTGTTGTAATCCTATGTGAGC_ref-chr22:32-36", + "from": "chr20:149014-149013:CCATATTTGGGAGGCAATTTTACCTGTTCTCAAGGCCGCATCTCTACCCCATCTCATGCGAATCCTGAC", + "name": "chr20:149014-149013:CCATATTTGGGAGGCAATTTTACCTGTTCTCAAGGCCGCATCTCTACCCCATCTCATGCGAATCCTGAC_ref-chr20:149014-149018", "sequences": [ - "20482:1" + "HG002_pbsv.INS.49079:1" ], - "to": "ref-chr22:32-36" + "to": "ref-chr20:149014-149018" }, { - "from": "ref-chr22:32-36", - "name": "ref-chr22:32-36_sink", + "from": "ref-chr20:149014-149018", + "name": "ref-chr20:149014-149018_sink", "to": "sink" } ], - "model_name": "Graph from insertions/insertion-test-1.vcf", + "model_name": "Graph from ../paragraph-tools/share/test-data/paragraph/insertions/insertion-test-1.vcf", "nodes": [ { "name": "source", "sequence": "NNNNNNNNNN" }, { - "name": "ref-chr22:26-30", - "reference": "chr22:26-30", - "reference_sequence": "AGACC" - }, - { - "name": "ref-chr22:31-31", - "reference": "chr22:31-31", - "reference_sequence": "A" + "name": "ref-chr20:149008-149013", + "reference": "chr20:149008-149013", + "reference_sequence": "GAACAA" }, { - "name": "chr22:32-31:AGGGCAAACATTCAGGACACAGCAGAGTATTGTTGTAATCCTATGTGAGC", - "position": "chr22:32-31", - "sequence": "AGGGCAAACATTCAGGACACAGCAGAGTATTGTTGTAATCCTATGTGAGC" + "name": "chr20:149014-149013:CCATATTTGGGAGGCAATTTTACCTGTTCTCAAGGCCGCATCTCTACCCCATCTCATGCGAATCCTGAC", + "position": "chr20:149014-149013", + "sequence": "CCATATTTGGGAGGCAATTTTACCTGTTCTCAAGGCCGCATCTCTACCCCATCTCATGCGAATCCTGAC" }, { - "name": "ref-chr22:32-36", - "reference": "chr22:32-36", - "reference_sequence": "G" + "name": "ref-chr20:149014-149018", + "reference": "chr20:149014-149018", + "reference_sequence": "CCATA" }, { "name": "sink", @@ -79,31 +65,29 @@ "paths": [ { "nodes": [ - "ref-chr22:26-30", - "ref-chr22:31-31", - "ref-chr22:32-36" + "ref-chr20:149008-149013", + "ref-chr20:149014-149018" ], "path_id": "REF|1", "sequence": "REF" }, { "nodes": [ - "ref-chr22:26-30", - "ref-chr22:31-31", - "chr22:32-31:AGGGCAAACATTCAGGACACAGCAGAGTATTGTTGTAATCCTATGTGAGC", - "ref-chr22:32-36" + "ref-chr20:149008-149013", + "chr20:149014-149013:CCATATTTGGGAGGCAATTTTACCTGTTCTCAAGGCCGCATCTCTACCCCATCTCATGCGAATCCTGAC", + "ref-chr20:149014-149018" ], "path_id": "ALT|1", "sequence": "ALT" } ], "sequencenames": [ - "20482:0", - "20482:1", "ALT", + "HG002_pbsv.INS.49079:0", + "HG002_pbsv.INS.49079:1", "REF" ], "target_regions": [ - "chr22:26-36" + "chr20:149008-149018" ] } diff --git a/share/test-data/paragraph/insertions/insertion-test-1.ref.fa b/share/test-data/paragraph/insertions/insertion-test-1.ref.fa deleted file mode 100644 index 8052aec..0000000 --- a/share/test-data/paragraph/insertions/insertion-test-1.ref.fa +++ /dev/null @@ -1,2 +0,0 @@ ->chr22 chr22:11464535-11464566 -AATCCTATGTGAGGTACAAACATTCAGACCAG \ No newline at end of file diff --git a/share/test-data/paragraph/insertions/insertion-test-1.ref.fa.fai b/share/test-data/paragraph/insertions/insertion-test-1.ref.fa.fai deleted file mode 100644 index 50c4ef6..0000000 --- a/share/test-data/paragraph/insertions/insertion-test-1.ref.fa.fai +++ /dev/null @@ -1 +0,0 @@ -chr22 32 31 32 33 diff --git a/share/test-data/paragraph/insertions/insertion-test-1.vcf b/share/test-data/paragraph/insertions/insertion-test-1.vcf index 90ac2ed..ce0391e 100644 --- a/share/test-data/paragraph/insertions/insertion-test-1.vcf +++ b/share/test-data/paragraph/insertions/insertion-test-1.vcf @@ -1,66 +1,9 @@ ##fileformat=VCFv4.2 -##FILTER= -##FILTER= -##source=Sniffles +##reference=GRCh38-with-decoy ##fileDate=20180308 -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= +##FILTER= +##INFO= +##INFO= ##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##ALT= -##ALT= -##ALT= -##ALT= -##ALT= -##ALT= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##FORMAT= -##FORMAT= -##FORMAT= -##bcftools_viewVersion=1.7+htslib-1.7 -##bcftools_viewCommand=view -h /Users/pkrusche/Downloads/merged_na12878.sort.bam.sniffles1kb_auto_seq_s10.sort.vcf.gz; Date=Thu Apr 12 12:18:58 2018 -##bcftools_viewCommand=view -o sniffles-insertions.vcf.gz -O z sniffles-insertions.vcf; Date=Thu Apr 12 12:21:30 2018 -##bcftools_viewCommand=view -i SEQ!="" -o /Users/pkrusche/workspace/test_data/sniffles-insertions-with-seq.vcf.gz -O z /Users/pkrusche/workspace/test_data/sniffles-insertions.vcf.gz; Date=Thu Apr 12 15:16:12 2018 -##bcftools_viewVersion=1.2+htslib-1.2.1 -##bcftools_viewCommand=view -o sniffles-insertions-chr22.vcf.gz -O z /home/pkrusche/sniffles-insertions-with-seq.vcf.gz chr22 -##bcftools_viewCommand=view -o passing-example.vcf.gz -O z sniffles-insertions-chr22.vcf.gz chr22:11464565-11464565 -#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT /seq/schatz/fritz/cancer_ONT/pacbio_NA12878/merged_na12878.sort.bam -chr22 31 20482 N . PASS PRECISE;SVMETHOD=Snifflesv1.0.8;CHR2=22;END=31;STD_quant_start=8;STD_quant_stop=6;Kurtosis_quant_start=-1;Kurtosis_quant_stop=-1;SVTYPE=INS;SUPTYPE=AL;SVLEN=50;STRANDS=+-;SEQ=AGGGCAAACATTCAGGACACAGCAGAGTATTGTTGTAATCCTATGTGAGC;RE=34;AF=0.369565 GT:DR:DV 0/1:58:34 +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT +chr20 149013 HG002_pbsv.INS.49079 A ACCATATTTGGGAGGCAATTTTACCTGTTCTCAAGGCCGCATCTCTACCCCATCTCATGCGAATCCTGAC . PASS SVTYPE=INS;TANDEM diff --git a/share/test-data/paragraph/insertions/insertion-test-2.json b/share/test-data/paragraph/insertions/insertion-test-2.json deleted file mode 100644 index d51ee4f..0000000 --- a/share/test-data/paragraph/insertions/insertion-test-2.json +++ /dev/null @@ -1,131 +0,0 @@ -{ - "edges": [ - { - "from": "source", - "name": "source_chr22:32-31:GGATC", - "to": "chr22:32-31:GGATC" - }, - { - "from": "source", - "name": "source_ref-chr22:26-30", - "to": "ref-chr22:26-30" - }, - { - "from": "chr22:32-31:GGATC", - "name": "chr22:32-31:GGATC_ref-chr22:32-36", - "sequences": [ - "20836:1" - ], - "to": "ref-chr22:32-36" - }, - { - "from": "ref-chr22:26-30", - "name": "ref-chr22:26-30_ref-chr22:31-31", - "sequences": [ - "20836:0", - "REF" - ], - "to": "ref-chr22:31-31" - }, - { - "from": "ref-chr22:31-31", - "name": "ref-chr22:31-31_ref-chr22:32-36", - "sequences": [ - "20836:0", - "REF" - ], - "to": "ref-chr22:32-36" - }, - { - "from": "ref-chr22:31-31", - "name": "ref-chr22:31-31_chr22:32-31:TTTAT", - "sequences": [ - "20836:1" - ], - "to": "chr22:32-31:TTTAT" - }, - { - "from": "ref-chr22:32-36", - "name": "ref-chr22:32-36_sink", - "to": "sink" - }, - { - "from": "chr22:32-31:TTTAT", - "name": "chr22:32-31:TTTAT_sink", - "to": "sink" - } - ], - "model_name": "Graph from insertions/insertion-test-2.vcf", - "nodes": [ - { - "name": "source", - "sequence": "NNNNNNNNNN" - }, - { - "name": "chr22:32-31:GGATC", - "position": "chr22:32-31", - "sequence": "GGATC" - }, - { - "name": "ref-chr22:26-30", - "reference": "chr22:26-30", - "reference_sequence": "GACAT" - }, - { - "name": "ref-chr22:31-31", - "reference": "chr22:31-31", - "reference_sequence": "C" - }, - { - "name": "ref-chr22:32-36", - "reference": "chr22:32-36", - "reference_sequence": "T" - }, - { - "name": "chr22:32-31:TTTAT", - "position": "chr22:32-31", - "sequence": "TTTAT" - }, - { - "name": "sink", - "sequence": "NNNNNNNNNN" - } - ], - "paths": [ - { - "nodes": [ - "ref-chr22:26-30", - "ref-chr22:31-31", - "ref-chr22:32-36" - ], - "path_id": "REF|1", - "sequence": "REF" - }, - { - "nodes": [ - "ref-chr22:26-30", - "ref-chr22:31-31", - "chr22:32-31:TTTAT" - ], - "path_id": "ALT|1", - "sequence": "ALT" - }, - { - "nodes": [ - "chr22:32-31:GGATC", - "ref-chr22:32-36" - ], - "path_id": "ALT|2", - "sequence": "ALT" - } - ], - "sequencenames": [ - "20836:0", - "20836:1", - "ALT", - "REF" - ], - "target_regions": [ - "chr22:26-36" - ] -} diff --git a/share/test-data/paragraph/insertions/insertion-test-2.noas.json b/share/test-data/paragraph/insertions/insertion-test-2.noas.json deleted file mode 100644 index 587027a..0000000 --- a/share/test-data/paragraph/insertions/insertion-test-2.noas.json +++ /dev/null @@ -1,109 +0,0 @@ -{ - "edges": [ - { - "from": "source", - "name": "source_ref-chr22:26-30", - "to": "ref-chr22:26-30" - }, - { - "from": "ref-chr22:26-30", - "name": "ref-chr22:26-30_ref-chr22:31-31", - "sequences": [ - "20836:0", - "REF" - ], - "to": "ref-chr22:31-31" - }, - { - "from": "ref-chr22:31-31", - "name": "ref-chr22:31-31_chr22:32-31:TTTATTTTATTTGCTATTTTTTGGTGTACCAAAGCTGTACATAAATTATGTACAGACTCAATGAGTTTTGGGATC", - "sequences": [ - "20836:1" - ], - "to": "chr22:32-31:TTTATTTTATTTGCTATTTTTTGGTGTACCAAAGCTGTACATAAATTATGTACAGACTCAATGAGTTTTGGGATC" - }, - { - "from": "ref-chr22:31-31", - "name": "ref-chr22:31-31_ref-chr22:32-36", - "sequences": [ - "20836:0", - "REF" - ], - "to": "ref-chr22:32-36" - }, - { - "from": "chr22:32-31:TTTATTTTATTTGCTATTTTTTGGTGTACCAAAGCTGTACATAAATTATGTACAGACTCAATGAGTTTTGGGATC", - "name": "chr22:32-31:TTTATTTTATTTGCTATTTTTTGGTGTACCAAAGCTGTACATAAATTATGTACAGACTCAATGAGTTTTGGGATC_ref-chr22:32-36", - "sequences": [ - "20836:1" - ], - "to": "ref-chr22:32-36" - }, - { - "from": "ref-chr22:32-36", - "name": "ref-chr22:32-36_sink", - "to": "sink" - } - ], - "model_name": "Graph from insertions/insertion-test-2.vcf", - "nodes": [ - { - "name": "source", - "sequence": "NNNNNNNNNN" - }, - { - "name": "ref-chr22:26-30", - "reference": "chr22:26-30", - "reference_sequence": "GACAT" - }, - { - "name": "ref-chr22:31-31", - "reference": "chr22:31-31", - "reference_sequence": "C" - }, - { - "name": "chr22:32-31:TTTATTTTATTTGCTATTTTTTGGTGTACCAAAGCTGTACATAAATTATGTACAGACTCAATGAGTTTTGGGATC", - "position": "chr22:32-31", - "sequence": "TTTATTTTATTTGCTATTTTTTGGTGTACCAAAGCTGTACATAAATTATGTACAGACTCAATGAGTTTTGGGATC" - }, - { - "name": "ref-chr22:32-36", - "reference": "chr22:32-36", - "reference_sequence": "T" - }, - { - "name": "sink", - "sequence": "NNNNNNNNNN" - } - ], - "paths": [ - { - "nodes": [ - "ref-chr22:26-30", - "ref-chr22:31-31", - "ref-chr22:32-36" - ], - "path_id": "REF|1", - "sequence": "REF" - }, - { - "nodes": [ - "ref-chr22:26-30", - "ref-chr22:31-31", - "chr22:32-31:TTTATTTTATTTGCTATTTTTTGGTGTACCAAAGCTGTACATAAATTATGTACAGACTCAATGAGTTTTGGGATC", - "ref-chr22:32-36" - ], - "path_id": "ALT|1", - "sequence": "ALT" - } - ], - "sequencenames": [ - "20836:0", - "20836:1", - "ALT", - "REF" - ], - "target_regions": [ - "chr22:26-36" - ] -} diff --git a/share/test-data/paragraph/insertions/insertion-test-2.ref.fa b/share/test-data/paragraph/insertions/insertion-test-2.ref.fa deleted file mode 100644 index 7dfaf16..0000000 --- a/share/test-data/paragraph/insertions/insertion-test-2.ref.fa +++ /dev/null @@ -1,2 +0,0 @@ ->chr22 chr22:49226558-49226589 -TAACAAAAAAAGAGTAGATGAATTTGACATCT \ No newline at end of file diff --git a/share/test-data/paragraph/insertions/insertion-test-2.ref.fa.fai b/share/test-data/paragraph/insertions/insertion-test-2.ref.fa.fai deleted file mode 100644 index 50c4ef6..0000000 --- a/share/test-data/paragraph/insertions/insertion-test-2.ref.fa.fai +++ /dev/null @@ -1 +0,0 @@ -chr22 32 31 32 33 diff --git a/share/test-data/paragraph/insertions/insertion-test-2.vcf b/share/test-data/paragraph/insertions/insertion-test-2.vcf deleted file mode 100644 index 33b7012..0000000 --- a/share/test-data/paragraph/insertions/insertion-test-2.vcf +++ /dev/null @@ -1,66 +0,0 @@ -##fileformat=VCFv4.2 -##FILTER= -##FILTER= -##source=Sniffles -##fileDate=20180308 -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##ALT= -##ALT= -##ALT= -##ALT= -##ALT= -##ALT= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##FORMAT= -##FORMAT= -##FORMAT= -##bcftools_viewVersion=1.7+htslib-1.7 -##bcftools_viewCommand=view -h /Users/pkrusche/Downloads/merged_na12878.sort.bam.sniffles1kb_auto_seq_s10.sort.vcf.gz; Date=Thu Apr 12 12:18:58 2018 -##bcftools_viewCommand=view -o sniffles-insertions.vcf.gz -O z sniffles-insertions.vcf; Date=Thu Apr 12 12:21:30 2018 -##bcftools_viewCommand=view -i SEQ!="" -o /Users/pkrusche/workspace/test_data/sniffles-insertions-with-seq.vcf.gz -O z /Users/pkrusche/workspace/test_data/sniffles-insertions.vcf.gz; Date=Thu Apr 12 15:16:12 2018 -##bcftools_viewVersion=1.2+htslib-1.2.1 -##bcftools_viewCommand=view -o sniffles-insertions-chr22.vcf.gz -O z /home/pkrusche/sniffles-insertions-with-seq.vcf.gz chr22 -##bcftools_viewCommand=view -o passing-example-2.vcf.gz -O z sniffles-insertions-chr22.vcf.gz chr22:49226588-49226590 -#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT /seq/schatz/fritz/cancer_ONT/pacbio_NA12878/merged_na12878.sort.bam -chr22 31 20836 N . PASS PRECISE;SVMETHOD=Snifflesv1.0.8;CHR2=22;END=31;STD_quant_start=0;STD_quant_stop=2;Kurtosis_quant_start=5;Kurtosis_quant_stop=0;SVTYPE=INS;SUPTYPE=AL;SVLEN=75;STRANDS=+-;SEQ=TTTATTTTATTTGCTATTTTTTGGTGTACCAAAGCTGTACATAAATTATGTACAGACTCAATGAGTTTTGGGATC;RE=33;AF=0.464789 GT:DR:DV 0/1:38:33 diff --git a/share/test-data/round-trip-genotyping/sample1.sam b/share/test-data/round-trip-genotyping/sample1.sam deleted file mode 100644 index 05eef0a..0000000 --- a/share/test-data/round-trip-genotyping/sample1.sam +++ /dev/null @@ -1,6 +0,0 @@ -@HD VN:1.3 SO:coordinate -@SQ SN:chr1 LN:440 -fragment0 64 chr1 40 0 50M = 40 0 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATGGGGGGCAAAAAAA ?????????????????????????????????????????????????? -fragment0 64 chr1 40 0 50M = 40 0 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATGGGGGGCAAAAAA ?????????????????????????????????????????????????? -fragment1 64 chr1 40 0 50M = 40 0 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATGGGGGGCAAA ?????????????????????????????????????????????????? -fragment1 64 chr1 40 0 50M = 40 0 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATGGG ?????????????????????????????????????????????????? diff --git a/share/test-data/round-trip-genotyping/sample2.sam b/share/test-data/round-trip-genotyping/sample2.sam deleted file mode 100644 index 2ad7944..0000000 --- a/share/test-data/round-trip-genotyping/sample2.sam +++ /dev/null @@ -1,6 +0,0 @@ -@HD VN:1.3 SO:coordinate -@SQ SN:chr1 LN:440 -fragment0 64 chr1 40 0 50M = 40 0 AAAAAAAAAAAAAAAAAAAAAAAAATAAAAAAAAAAAAAAAAAAAAAAAA ?????????????????????????????????????????????????? -fragment1 64 chr1 40 0 50M = 40 0 AAAAAATAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA ?????????????????????????????????????????????????? -fragment2 64 chr1 40 0 50M = 40 0 AAAAAAAAATAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA ?????????????????????????????????????????????????? -fragment3 64 chr1 40 0 50M = 40 0 AAAAAAAAAATAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA ?????????????????????????????????????????????????? diff --git a/src/c++/lib/idxdepth/IndexBinning.cpp b/src/c++/lib/idxdepth/IndexBinning.cpp index 2fbd5b8..db0ff10 100644 --- a/src/c++/lib/idxdepth/IndexBinning.cpp +++ b/src/c++/lib/idxdepth/IndexBinning.cpp @@ -56,7 +56,7 @@ void getIndexBins(std::string const& bam_path, std::vector& output) if (hts_file_ptr->format.format == bam) { logger->debug("\t[Input {} is a BAM file]", bam_path); - error("BAM index reading not available yet."); + error("BAM index reading not available yet. binnedcov.txt won't be generated."); } else if (hts_file_ptr->format.format == cram) { diff --git a/src/python/bin/multigrmpy.py b/src/python/bin/multigrmpy.py index dd4a8df..6bb0e6c 100644 --- a/src/python/bin/multigrmpy.py +++ b/src/python/bin/multigrmpy.py @@ -237,7 +237,7 @@ def run(args): # manifest sanity check with open(args.manifest) as manifest_file: headers = {"id": False, "path": False, "idxdepth": False, "depth": False, - "read length": False, "sex": False, "depth variance": False} + "read length": False, "sex": False, "depth variance": False, "depth sd": False} for line in manifest_file: if line.startswith("#"): line = line[1:] diff --git a/src/python/lib/grm/vcfgraph/vcfupdate.py b/src/python/lib/grm/vcfgraph/vcfupdate.py index 965df61..a2be823 100644 --- a/src/python/lib/grm/vcfgraph/vcfupdate.py +++ b/src/python/lib/grm/vcfgraph/vcfupdate.py @@ -210,6 +210,7 @@ def update_vcf_from_grmpy(inVcfFilename, grmpyOutput, outVcfFilename, sample_nam for ii, aId in enumerate(alleleIds): alleleMap[aId] = ii + num_bpdepth_sample = 0 for sample in sample_names: if vcf_samples: if sample in vcf_samples: @@ -222,7 +223,7 @@ def update_vcf_from_grmpy(inVcfFilename, grmpyOutput, outVcfFilename, sample_nam record.samples[sample]["OLD_GT"] = [None] record.samples[sample]["GT"] = [None] record.samples[sample]["DP"] = None - record.samples[sample]["FT"] = "." + record.samples[sample]["FT"] = None record.samples[sample]["AD"] = [None] * (1 + len(record.alts)) record.samples[sample]["ADF"] = [None] * (1 + len(record.alts)) record.samples[sample]["ADR"] = [None] * (1 + len(record.alts)) @@ -233,6 +234,10 @@ def update_vcf_from_grmpy(inVcfFilename, grmpyOutput, outVcfFilename, sample_nam logging.warning("VCF key error for sample " + str(sample) + " at " + '_'.join(map(str, [record.chrom, record.pos, record.stop]))) continue + if "BP_DEPTH" in record.samples[sample]["FT"] or "BP_NO_GT" in record.samples[sample]["FT"]: + num_bpdepth_sample += 1 + if num_bpdepth_sample * 2 > len(grmpyRecord["samples"]): + record.filter.add("BP_DEPTH") bcf_out.write(record) diff --git a/src/python/test/test_VCF2Paragraph.py b/src/python/test/test_VCF2Paragraph.py index df6d572..f9f5830 100644 --- a/src/python/test/test_VCF2Paragraph.py +++ b/src/python/test/test_VCF2Paragraph.py @@ -30,36 +30,8 @@ class TestVCF2Paragraph(unittest.TestCase): def setUp(self): self.test_data_dir = os.path.join(GRMPY_ROOT, "share", "test-data", "paragraph") - self.test_haplo_vcfs = sorted(glob.glob(self.test_data_dir + "/simple/*.vcf")) - self.test_haplo_vcfs = sorted(glob.glob(self.test_data_dir + "/haplo-complex/*.vcf")) - self.test_haplo_vcfs += sorted(glob.glob(self.test_data_dir + - "/pg-het-ins/*.vcf")) - self.test_haplo_vcfs += sorted(glob.glob(self.test_data_dir + - "/long-del/chr4-21369091-21376907.vcf")) - - self.test_allele_vcfs = sorted(glob.glob(self.test_data_dir + "/pg-complex/*.vcf")) - self.test_allele_vcfs += sorted(glob.glob(os.path.join(GRMPY_ROOT, "share", - "test-data", "genotyping_test_2", "chr*.vcf"))) - self.test_insertion_vcfs = sorted(glob.glob(self.test_data_dir + "/insertions/*.vcf")) - hg19_locations = [ - "/Users/pkrusche/workspace/human_genome/hg19.fa", - "/illumina/sync/igenomes/Homo_sapiens/UCSC/hg19/Sequence/WholeGenomeFasta/genome.fa", - "/Users/schen6/Documents/hg19/genome.fa" - ] - if "HG19" in os.environ and os.environ["HG19"]: - hg19_locations.insert(0, os.environ["HG19"]) - self.hg19 = None - for f in hg19_locations: - if os.path.exists(f) and os.path.isfile(f): - self.hg19 = f - break - - if not self.hg19: - raise Exception( - "hg19 fasta file not found in expected locations: %s" % str(hg19_locations)) - hg38_locations = [ "/Users/pkrusche/workspace/human_genome/hg38.fa", "/illumina/sync/igenomes/Homo_sapiens/NCBI/GRCh38Decoy/Sequence/WholeGenomeFasta/genome.fa", @@ -79,68 +51,6 @@ def setUp(self): self.vcf2paragraph = os.path.join( GRMPY_ROOT, "src", "python", "bin", "vcf2paragraph.py") - def test_haplotype_vcfs(self): - for x in self.test_haplo_vcfs: - print("Testing output for %s..." % x) - tf1 = tempfile.NamedTemporaryFile(suffix=".json") - tf1.close() - try: - expected_json = x.replace(".vcf", ".json") - subprocess.check_call( - f"python3 {quote(self.vcf2paragraph)} {quote(x)} {tf1.name} -r {self.hg19}", - shell=True) - subprocess.check_call( - f"python3 -mjson.tool --sort-keys {tf1.name} > {tf1.name + '.pp.json'}", - shell=True) - subprocess.check_call( - f"diff --ignore-matching-lines='.*model_name.*' {tf1.name + '.pp.json'} {expected_json}", - shell=True) - except: - from shutil import copy - current_dir = os.path.abspath(os.path.dirname(__file__)) - basename = os.path.basename(expected_json) - fail_path = os.path.join(current_dir, "test-failed-haplotype." + basename) - copy(tf1.name + ".pp.json", fail_path) - os.chmod(fail_path, 0o777) - print("[test_haplotype_vcfs] failed output saved in %s" % fail_path) - raise - finally: - os.remove(tf1.name) - os.remove(tf1.name + ".pp.json") - - def test_allele_graph_vcfs(self): - for x in self.test_allele_vcfs: - print("Testing output for %s..." % x) - tf1 = tempfile.NamedTemporaryFile(suffix=".json") - tf1.close() - try: - if "genotyping_test_2" in x: - reference = os.path.join(GRMPY_ROOT, "share", "test-data", "genotyping_test_2", "swaps.fa") - else: - reference = self.hg38 - expected_json = x.replace(".vcf", ".json") - subprocess.check_call("python3 %s %s %s -r %s -R -g alleles" % ( - quote(self.vcf2paragraph), quote(x), tf1.name, reference - ), shell=True) - subprocess.check_call( - "python3 -mjson.tool --sort-keys %s > %s" % (tf1.name, tf1.name + ".pp.json"), - shell=True) - subprocess.check_call("diff --ignore-matching-lines='.*model_name.*' %s %s" % - (tf1.name + ".pp.json", expected_json), - shell=True) - except: # pylint: disable=bare-except - from shutil import copy - current_dir = os.path.abspath(os.path.dirname(__file__)) - basename = os.path.basename(expected_json) - failed_path = os.path.join(current_dir, "test-failed-allele." + basename) - copy(tf1.name + ".pp.json", failed_path) - os.chmod(failed_path, 0o777) - print("[test_allele_graph_vcfs] Failed output saved in %s" % failed_path) - raise - finally: - os.remove(tf1.name) - os.remove(tf1.name + ".pp.json") - def test_allele_graph_insertion_vcfs(self): assert self.test_insertion_vcfs for x in self.test_insertion_vcfs: @@ -149,12 +59,11 @@ def test_allele_graph_insertion_vcfs(self): tf1.close() try: expected_json = x.replace(".vcf", ".json") - reference = x.replace(".vcf", ".ref.fa") # check with allele splitting subprocess.check_call("python3 %s %s %s -r %s --alt-splitting " "--read-len 5 --max-ref-node-length 10 --alt-paths " "--retrieve-reference-sequence -g alleles" % ( - quote(self.vcf2paragraph), quote(x), tf1.name, reference + quote(self.vcf2paragraph), quote(x), tf1.name, self.hg38 ), shell=True) subprocess.check_call( "python3 -mjson.tool --sort-keys %s > %s" % (tf1.name, tf1.name + ".pp.json"), @@ -168,7 +77,7 @@ def test_allele_graph_insertion_vcfs(self): subprocess.check_call("python3 %s %s %s -r %s " "--read-len 5 --max-ref-node-length 10 --alt-paths " "--retrieve-reference-sequence -g alleles" % ( - quote(self.vcf2paragraph), quote(x), tf1.name, reference + quote(self.vcf2paragraph), quote(x), tf1.name, self.hg38 ), shell=True) subprocess.check_call( "python3 -mjson.tool --sort-keys %s > %s" % (tf1.name, tf1.name + ".pp.json"), diff --git a/src/python/test/test_multigrmpy.py b/src/python/test/test_multigrmpy.py index ff7f64d..00d00cd 100644 --- a/src/python/test/test_multigrmpy.py +++ b/src/python/test/test_multigrmpy.py @@ -22,7 +22,6 @@ import unittest import tempfile import gzip -import difflib GRMPY_ROOT = os.path.abspath(os.path.join( os.path.dirname(__file__), "..", "..", "..")) @@ -60,25 +59,6 @@ def setUp(self): self.reference = os.path.join(GRMPY_ROOT, "share", "test-data", "round-trip-genotyping", "dummy.fa") self.manifest = os.path.join(GRMPY_ROOT, "share", "test-data", "round-trip-genotyping", "samples.txt") - self.swaps_input_vcf = os.path.join(GRMPY_ROOT, "share", "test-data", "genotyping_test_2", "swaps.vcf") - self.swaps_expected_genotypes_json = os.path.join(GRMPY_ROOT, "share", "test-data", "genotyping_test_2", - "expected-genotypes.json") - self.swaps_expected_genotypes_vcf = os.path.join(GRMPY_ROOT, "share", "test-data", "genotyping_test_2", - "expected-genotypes.vcf") - self.swaps_expected_variants_json = os.path.join(GRMPY_ROOT, "share", "test-data", "genotyping_test_2", - "expected-variants.json") - self.swaps_reference = os.path.join(GRMPY_ROOT, "share", "test-data", "genotyping_test_2", "swaps.fa") - self.swaps_manifest = os.path.join(GRMPY_ROOT, "share", "test-data", "genotyping_test_2", "samples.txt") - - self.hg38_input_vcf = os.path.join(GRMPY_ROOT, "share", "test-data", "paragraph", "pg-het-ins", - "pg-het-ins.vcf") - self.hg38_expected_genotypes_json = os.path.join(GRMPY_ROOT, "share", "test-data", "paragraph", "pg-het-ins", - "genotypes_expected.json") - self.hg38_expected_variants_json = os.path.join(GRMPY_ROOT, "share", "test-data", "paragraph", "pg-het-ins", - "variants_expected.json") - self.hg38_reference = HG38 - self.hg38_manifest = os.path.join(GRMPY_ROOT, "share", "test-data", "paragraph", "pg-het-ins", "manifest.txt") - @unittest.skipIf(not GRMPY_INSTALL, "No compiled/install path specified, please set the GRMPY_INSTALL variable to point to an installation.") def test_multigrmpy(self): @@ -127,217 +107,6 @@ def test_multigrmpy(self): self.assertEqual(item["samples"]["sample1"]["gt"]["GT"], "./.") self.assertEqual(item["samples"]["sample2"]["gt"]["GT"], "S1/S1") - @unittest.skipIf(not GRMPY_INSTALL, - "No compiled/install path specified, please set the GRMPY_INSTALL variable to point to an installation.") - def test_multigrmpy_expected_genotypes(self): - import multigrmpy - - with tempfile.TemporaryDirectory() as output_dir: - args = ADict({ - "input": self.swaps_input_vcf, - "manifest": self.swaps_manifest, - "reference": self.swaps_reference, - "output": output_dir, - "grmpy": os.path.join(os.path.dirname(__file__), "module-wrapper.sh") + " " + os.path.join( - GRMPY_INSTALL, "bin", "grmpy"), - "verbose": False, - "quiet": True, - "logfile": None, - "infer_read_haplotypes": False, - "write_alignments": False, - "graph_sequence_matching": True, - "klib_sequence_matching": False, - "kmer_sequence_matching": False, - "bad_align_uniq_kmer_len": 0, - "threads": 1, - "sample_threads": 1, - "genotyping_parameters": None, - "max_reads_per_event": 10000, - "split_type": "lines", - "read_length": 150, - "max_ref_node_length": 1000, - "ins_info_key": "SEQ", - "graph_type": "alleles", - "alt_splitting": True, - "retrieve_reference_sequence": False, - "scratch_dir": None, - "keep_scratch": False, - }) - multigrmpy.run(args) - - output_json_path = os.path.join(output_dir, "genotypes.json.gz") - with gzip.open(output_json_path, 'rt') as result_json: - observed = json.load(result_json) - - with open(self.swaps_expected_genotypes_json, 'rt') as expected_json: - expected = json.load(expected_json) - - # event ordering is not guaranteed - observed = sorted(observed, key=lambda x: x["graphinfo"]["ID"]) - expected = sorted(expected, key=lambda x: x["graphinfo"]["ID"]) - - match = True - for i, o in enumerate(observed): - if o["samples"]["SWAPS"]["gt"]["GT"] != expected[i]["samples"]["SWAPS"]["gt"]["GT"]: - match = False - break - - if not match: - observed_lines = json.dumps(observed, sort_keys=True, indent=4, separators=(',', ': ')) - expected_lines = json.dumps(expected, sort_keys=True, indent=4, separators=(',', ': ')) - for line in difflib.context_diff(expected_lines.split("\n"), observed_lines.split("\n"), - fromfile=self.swaps_expected_genotypes_json, tofile=output_json_path): - sys.stderr.write(line + "\n") - - with open("test_swaps_genotypes.json", "wt") as out_file: - json.dump(observed, out_file, sort_keys=True, indent=4, separators=(',', ': ')) - - raise Exception("Swaps test genotyping output doesn't match! If this is expected and new behavior, " - "cp test_swaps_genotypes.json %s" % self.swaps_expected_genotypes_json) - - output_vcf_path = os.path.join(output_dir, "genotypes.vcf.gz") - - with gzip.open(output_vcf_path, 'rt') as result_vcf: - observed_lines = result_vcf.read().splitlines(keepends=False) - - observed_lines = [x for x in observed_lines if not x.startswith("#")] - - with open(self.swaps_expected_genotypes_vcf, 'rt') as expected_vcf: - expected_lines = expected_vcf.read().splitlines(keepends=False) - - if observed_lines != expected_lines: - for line in difflib.context_diff(expected_lines, observed_lines, - fromfile=self.swaps_expected_genotypes_vcf, tofile=output_vcf_path): - sys.stderr.write(line + "\n") - - with open("test_swaps_genotypes.vcf", "wt") as out_file: - for x in observed_lines: - print(x, file=out_file) - - raise Exception("Swaps VCF output doesn't match! If this is expected and new behavior, " - "cp test_swaps_genotypes.vcf %s" % self.swaps_expected_genotypes_vcf) - - output_json_path = os.path.join(output_dir, "variants.json.gz") - with gzip.open(output_json_path, 'rt') as result_json: - observed = json.load(result_json) - - with open(self.swaps_expected_variants_json, 'rt') as expected_json: - expected = json.load(expected_json) - - # event ordering is not guaranteed - observed = sorted(observed, key=lambda x: x["ID"]) - expected = sorted(expected, key=lambda x: x["ID"]) - - # remove temp file information - for o in observed: - del o["graph"]["model_name"] - for e in expected: - if "model_name" in e["graph"]: - del e["graph"]["model_name"] - - observed_lines = json.dumps(observed, sort_keys=True, indent=4, separators=(',', ': ')) - expected_lines = json.dumps(expected, sort_keys=True, indent=4, separators=(',', ': ')) - if observed_lines != expected_lines: - for line in difflib.context_diff(expected_lines.split("\n"), observed_lines.split("\n"), - fromfile=self.swaps_expected_variants_json, tofile=output_json_path): - sys.stderr.write(line + "\n") - - with open("test_swaps_variants.json", "wt") as out_file: - json.dump(observed, out_file, sort_keys=True, indent=4, separators=(',', ': ')) - - raise Exception("Swaps test converted variants don't match! If this is expected and new behavior, " - "cp test_swaps_variants.json %s" % self.swaps_expected_variants_json) - - @unittest.skipIf(not GRMPY_INSTALL, - "No compiled/install path specified, please set the GRMPY_INSTALL variable to point to an installation.") - @unittest.skipIf(not HG38, "No hg38 reference fasta file was found.") - def test_multigrmpy_pg_het_ins(self): - import multigrmpy - - with tempfile.TemporaryDirectory() as output_dir: - args = ADict({ - "input": self.hg38_input_vcf, - "manifest": self.hg38_manifest, - "reference": self.hg38_reference, - "output": output_dir, - "grmpy": os.path.join(os.path.dirname(__file__), "module-wrapper.sh") + " " + os.path.join( - GRMPY_INSTALL, "bin", "grmpy"), - "verbose": False, - "quiet": True, - "logfile": None, - "write_alignments": False, - "infer_read_haplotypes": False, - "graph_sequence_matching": True, - "klib_sequence_matching": False, - "kmer_sequence_matching": False, - "bad_align_uniq_kmer_len": 0, - "threads": 1, - "sample_threads": 1, - "genotyping_parameters": None, - "max_reads_per_event": 10000, - "split_type": "lines", - "read_length": 150, - "max_ref_node_length": 1000, - "ins_info_key": "SEQ", - "graph_type": "alleles", - "alt_splitting": True, - "retrieve_reference_sequence": False, - "scratch_dir": None, - "keep_scratch": True, - }) - output_json_path = os.path.join(output_dir, "genotypes.json.gz") - multigrmpy.run(args) - - # compare genotyping results - with gzip.open(output_json_path, 'rt') as result_json: - observed = json.load(result_json) - - with open(self.hg38_expected_genotypes_json, 'rt') as expected_json: - expected = json.load(expected_json) - - # uncomment to keep observed json - # check if genotypes are the same - observed_lines = json.dumps(observed, sort_keys=True, indent=4, separators=(',', ': ')) - expected_lines = json.dumps(expected, sort_keys=True, indent=4, separators=(',', ': ')) - if observed_lines != expected_lines: - for line in difflib.context_diff(expected_lines.split("\n"), observed_lines.split("\n"), - fromfile=self.hg38_expected_genotypes_json, tofile=output_json_path): - sys.stderr.write(line + "\n") - - with open("test_gt.json", "wt") as out_file: - json.dump(observed, out_file, sort_keys=True, indent=4, separators=(',', ': ')) - - raise Exception("Genotyping results don't match! If this is expected and new behavior, " - "cp test_gt.json %s" % self.hg38_expected_genotypes_json) - - # check if variants are the same - output_json_path = os.path.join(output_dir, "variants.json.gz") - with gzip.open(output_json_path, 'rt') as result_json: - observed = json.load(result_json) - - # uncomment to keep observed json - - with open(self.hg38_expected_variants_json, 'rt') as expected_json: - expected = json.load(expected_json) - - # contains temp file locations - del observed[0]["graph"]["model_name"] - if "model_name" in expected[0]["graph"]: - del expected[0]["graph"]["model_name"] - - observed_lines = json.dumps(observed, sort_keys=True, indent=4, separators=(',', ': ')) - expected_lines = json.dumps(expected, sort_keys=True, indent=4, separators=(',', ': ')) - if observed_lines != expected_lines: - for line in difflib.context_diff(expected_lines.split("\n"), observed_lines.split("\n"), - fromfile=self.hg38_expected_genotypes_json, tofile=output_json_path): - sys.stderr.write(line + "\n") - - with open("test_variants.json", "wt") as out_file: - json.dump(observed, out_file, sort_keys=True, indent=4, separators=(',', ': ')) - - raise Exception("Converted variants don't match! If this is expected and new behavior, " - "cp test_variants.json %s" % self.hg38_expected_variants_json) - if __name__ == "__main__": unittest.main() From 970796c2a36d8f951606fe7d8af1d637f5d03893 Mon Sep 17 00:00:00 2001 From: Sai Chen Date: Tue, 24 Sep 2019 15:45:39 -0700 Subject: [PATCH 2/2] GT-825 add v2.4a fix --- README.md | 13 ++++++------- src/python/bin/multigrmpy.py | 4 ++-- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 7c1bae0..413504f 100644 --- a/README.md +++ b/README.md @@ -22,17 +22,16 @@ ## Introduction -Accurate genotyping of known variants is a critical for analysis of whole-genome sequencing data. - -Paragraph aims to facilitate these tasks by providing: -- an accurate genotyper for Structural Variations in short-read data -- a suite of graph-based tools to align and genotype complex events. +Accurate genotyping of known variants is a critical for the analysis of whole-genome sequencing data. Paragraph aims to facilitate this by providing an accurate genotyper for Structural Variations with short-read data. Please reference Paragraph using: -- Chen, et al (2019) [Paragraph: A graph-based structural variant genotyper for short-read sequence data](https://www.biorxiv.org/content/10.1101/635011v1). *bioRxiv*. doi: https://doi.org/10.1101/635011 +- Chen, et al (2019) [Paragraph: A graph-based structural variant genotyper for short-read sequence data](https://www.biorxiv.org/content/10.1101/635011v2). *bioRxiv*. doi: https://doi.org/10.1101/635011 + +(Second version uploaded at September 24, 2019) + +Genotyping data in this paper can be found at [paper-data/download-instructions.txt](paper-data/download-instructions.txt) -Genotyping calls in this paper can be found at [paper-data/download-instructions.txt](paper-data/download-instructions.txt) ## Installation diff --git a/src/python/bin/multigrmpy.py b/src/python/bin/multigrmpy.py index 6bb0e6c..70fcf63 100644 --- a/src/python/bin/multigrmpy.py +++ b/src/python/bin/multigrmpy.py @@ -180,7 +180,7 @@ def make_argument_parser(): verbosity_options.add_argument("--debug", dest="debug", default=False, action="store_true", help="Log debug level events.") - stat_options = parser.add_mutually_exclusive_group(required=False) + stat_options = parser.add_argument_group('gt-parameter-group') stat_options.add_argument("-G", "--genotyping-parameters", dest="genotyping_parameters", default="", type=str, help="JSON string or file with genotyping model parameters.") @@ -188,7 +188,7 @@ def make_argument_parser(): stat_options.add_argument("-M", "--max-reads-per-event", dest="max_reads_per_event", default=0, type=int, help="Maximum number of reads to process for a single event.") - vcf2json_options = parser.add_mutually_exclusive_group(required=False) + vcf2json_options = parser.add_argument_group('vcf2json-option-group') vcf2json_options.add_argument("--vcf-split", default="lines", dest="split_type", choices=["lines", "full", "by_id", "superloci"], help="Mode for splitting the input VCF: lines (default) -- one graph per record ;"