From d8d01677ab4418973ca3643cf9e9561fe53e7aec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hannes=20P=C3=A9tur=20Eggertsson?= Date: Sun, 17 May 2020 12:08:30 +0000 Subject: [PATCH] Version 2.5.1 (#38) * Bugfix in some rare sv graph construction * Bugfix in somecases of VCF merging in * Made failed copy of tabix index a warning in genotype_sv instead of error --- CMakeLists.txt | 2 +- include/graphtyper/graph/graph.hpp | 5 +- include/graphtyper/typer/vcf_operations.hpp | 3 +- src/graph/graph.cpp | 96 ++++++++++++++------- src/typer/vcf.cpp | 30 +++++-- src/typer/vcf_operations.cpp | 19 ++-- src/utilities/genotype.cpp | 13 ++- src/utilities/genotype_camou.cpp | 2 +- src/utilities/genotype_sv.cpp | 17 +++- src/utilities/hts_parallel_reader.cpp | 47 ++++++++-- 10 files changed, 167 insertions(+), 67 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3ea22e3..c273872 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ include(ExternalProject) # The version number set (graphtyper_VERSION_MAJOR 2) set (graphtyper_VERSION_MINOR 5) -set (graphtyper_VERSION_PATCH 0) +set (graphtyper_VERSION_PATCH 1) set(STATIC_DIR "" CACHE STRING "If set, GraphTyper will be built as a static binary using libraries from the given STATIC_DIR.") # Get the current working branch diff --git a/include/graphtyper/graph/graph.hpp b/include/graphtyper/graph/graph.hpp index fcd0f97..54c9740 100644 --- a/include/graphtyper/graph/graph.hpp +++ b/include/graphtyper/graph/graph.hpp @@ -172,9 +172,10 @@ class Graph /********************** * GRAPH MODIFICATION * **********************/ - bool add_reference(unsigned end_pos, + void add_reference(unsigned end_pos, unsigned const & num_var, - std::vector const & reference_sequence); + std::vector const & reference_sequence + ); void add_variants(VarRecord && record); diff --git a/include/graphtyper/typer/vcf_operations.hpp b/include/graphtyper/typer/vcf_operations.hpp index 6b06e1c..1843947 100644 --- a/include/graphtyper/typer/vcf_operations.hpp +++ b/include/graphtyper/typer/vcf_operations.hpp @@ -21,7 +21,8 @@ void vcf_merge_and_break(std::vector const & vcfs, std::string const & output, std::string const & region, bool const FILTER_ZERO_QUAL, - bool const force_no_variant_overlap); + bool const force_no_variant_overlap, + bool const force_no_break_down); void vcf_update_info(std::string const & vcf, std::string const & output); diff --git a/src/graph/graph.cpp b/src/graph/graph.cpp index d29aa65..adc197d 100644 --- a/src/graph/graph.cpp +++ b/src/graph/graph.cpp @@ -87,6 +87,15 @@ Graph::add_genomic_region(std::vector && reference_sequence, var_records.end() ); + for (long v = 0; v < static_cast(var_records.size()); ++v) + { + if (var_records[v].pos >= genomic_region.end) + { + var_records.resize(v); + break; + } + } + if (Options::instance()->add_all_variants) { BOOST_LOG_TRIVIAL(debug) << "[" << __HERE__ << "] Constructing graph of " @@ -157,7 +166,7 @@ Graph::add_genomic_region(std::vector && reference_sequence, } else { - BOOST_LOG_TRIVIAL(debug) << __HERE__ << " Constructing graph of " + BOOST_LOG_TRIVIAL(debug) << "[" << __HERE__ << "] Constructing graph of " << var_records.size() << " variants."; @@ -199,6 +208,46 @@ Graph::add_genomic_region(std::vector && reference_sequence, continue; } + std::vector suffix = var_records[i].get_common_suffix(); + + //if (var_records[i + 1].pos >= var_records[i].pos + var_records[i].ref.size() - suffix.size()) + //{ + // long const suffix_to_remove = var_records[i].pos + var_records[i].ref.size() - var_records[i + 1].pos; + // + // // Check if it is possible to remove enough suffix so we can safely join the two records + // if (suffix_to_remove <= 0 || suffix_to_remove > static_cast(suffix.size())) + // { + // var_records[i + 1].merge_one_path(std::move(var_records[i])); + // } + // else + // { + // // Erase from previous record so that it ends where the next record starts + // assert(static_cast(var_records[i].ref.size()) > suffix_to_remove); + // var_records[i].ref.erase(var_records[i].ref.end() - suffix_to_remove, var_records[i].ref.end()); + // + // for (auto & alt : var_records[i].alts) + // { + // assert(static_cast(alt.size()) > suffix_to_remove); + // alt.erase(alt.end() - suffix_to_remove, alt.end()); + // } + // + // assert(var_records[i + 1].pos == var_records[i].pos + var_records[i].ref.size()); + // + // // Merge the two records + // long const suffix_to_add = suffix_to_remove - static_cast(var_records[i + 1].ref.size()); + // + // var_records[i + 1].merge(std::move(var_records[i])); + // + // if (suffix_to_add > 0) + // { + // var_records[i + 1].ref.insert(var_records[i + 1].ref.end(), suffix.end() - suffix_to_add, suffix.end()); + // + // for (auto & alt : var_records[i + 1].alts) + // alt.insert(alt.end(), suffix.end() - suffix_to_add, suffix.end()); + // } + // } + //} + //else { // In a few extreme scenarios we cannot merge only one path here. //BOOST_LOG_TRIVIAL(fatal) << "i : " << var_records[i].to_string(); @@ -217,10 +266,6 @@ Graph::add_genomic_region(std::vector && reference_sequence, ++i; } // while } - - BOOST_LOG_TRIVIAL(debug) << __HERE__ << " Done merging overlapping variant records. Now I have " - << var_records.size() - << " variants."; } // Erase alternatives which are identical to the reference sequence @@ -252,7 +297,6 @@ Graph::add_genomic_region(std::vector && reference_sequence, if (suffix.size() > 0) { - BOOST_LOG_TRIVIAL(debug) << __HERE__ << " Removing commong suffix."; assert(suffix.size() < var_record.ref.size()); var_record.ref.erase(var_record.ref.begin() + (var_record.ref.size() - suffix.size()), var_record.ref.end()); assert(var_record.ref.size() > 0); @@ -265,33 +309,22 @@ Graph::add_genomic_region(std::vector && reference_sequence, } } - BOOST_LOG_TRIVIAL(debug) << __HERE__ << " Adding reference and variants to the graph."; - bool is_continue{true}; - - for (long i = 0; i < static_cast(var_records.size()); ++i) + // Add reference and variants to the graph + for (unsigned i = 0; i < var_records.size(); ++i) { - BOOST_LOG_TRIVIAL(debug) << __HERE__ << " i = " << i << ", pos = " << var_records[i].pos; - is_continue = add_reference(var_records[i].pos, - static_cast(var_records[i].alts.size()) + 1u, - reference_sequence); // force in the first iteration + add_reference(var_records[i].pos, + static_cast(var_records[i].alts.size()) + 1u, + reference_sequence); - if (is_continue) - { - BOOST_LOG_TRIVIAL(debug) << __HERE__ << " adding variant"; - add_variants(std::move(var_records[i])); - } + add_variants(std::move(var_records[i])); } - if (is_continue) - { - BOOST_LOG_TRIVIAL(debug) << __HERE__ << " Adding final reference sequence behind the last variant."; - add_reference(static_cast(reference_sequence.size()) + genomic_region.begin, 0, reference_sequence); - } + // Add final reference sequence behind the last variant + add_reference(static_cast(reference_sequence.size()) + genomic_region.begin, 0, reference_sequence); // If we chose to use absolute positions we need to change all labels if (use_absolute_positions) { - BOOST_LOG_TRIVIAL(debug) << __HERE__ << " Updating labels."; uint32_t const offset = genomic_region.get_absolute_position(1); unsigned r = 0; assert(r < ref_nodes.size()); @@ -572,10 +605,8 @@ Graph::add_variants(VarRecord && record) } -bool -Graph::add_reference(unsigned end_pos, - unsigned const & num_var, - std::vector const & reference_sequence) +void +Graph::add_reference(unsigned end_pos, unsigned const & num_var, std::vector const & reference_sequence) { if (end_pos > reference_sequence.size() + genomic_region.begin) { @@ -591,13 +622,13 @@ Graph::add_reference(unsigned end_pos, } // Make sure end_pos is larger or equal to start_pos - bool is_continue = start_pos <= end_pos; + end_pos = std::max(start_pos, end_pos); + assert(start_pos >= genomic_region.begin); auto start_it = (start_pos - genomic_region.begin) < reference_sequence.size() ? reference_sequence.begin() + (start_pos - genomic_region.begin) : reference_sequence.end(); - auto end_it = (end_pos - genomic_region.begin) < reference_sequence.size() ? reference_sequence.begin() + (end_pos - genomic_region.begin) : reference_sequence.end(); @@ -607,11 +638,10 @@ Graph::add_reference(unsigned end_pos, // Create a vector of indexes std::vector var_indexes(num_var); - for (long i = 0; i < static_cast(num_var); ++i) + for (unsigned i = 0; i < num_var; ++i) var_indexes[i] = i + var_nodes.size(); ref_nodes.push_back(RefNode(Label(start_pos, std::move(current_dna), 0), std::move(var_indexes))); - return is_continue; } diff --git a/src/typer/vcf.cpp b/src/typer/vcf.cpp index 2fb6d7d..d6a5e5f 100644 --- a/src/typer/vcf.cpp +++ b/src/typer/vcf.cpp @@ -1666,17 +1666,32 @@ save_vcf(Vcf const & vcf, std::string const & filename) void load_vcf(Vcf & vcf, std::string const & filename, long n_batch) { - std::string batch_filename = filename + "_" + std::to_string(n_batch); - std::ifstream ifs(batch_filename.c_str(), std::ios::binary); + std::string const VCF_GZ = ".vcf.gz"; + + if (filename.size() > VCF_GZ.size() && + std::equal(filename.rbegin(), + filename.rbegin() + VCF_GZ.size(), + VCF_GZ.rbegin())) - if (!ifs.is_open()) { - BOOST_LOG_TRIVIAL(error) << "Could not open VCF file " << batch_filename; - std::exit(1); + vcf.open(READ_MODE, filename); + vcf.read(); } + else + { + std::string batch_filename = filename + "_" + std::to_string(n_batch); + BOOST_LOG_TRIVIAL(debug) << __HERE__ << " Loading variants from " << batch_filename; + std::ifstream ifs(batch_filename.c_str(), std::ios::binary); - boost::archive::binary_iarchive ia(ifs); - ia >> vcf; + if (!ifs.is_open()) + { + BOOST_LOG_TRIVIAL(error) << __HERE__ << " Could not open file " << batch_filename; + std::exit(1); + } + + boost::archive::binary_iarchive ia(ifs); + ia >> vcf; + } } @@ -1685,6 +1700,7 @@ append_vcf(Vcf & vcf, std::string const & filename, long n_batch) { Vcf new_vcf; std::string batch_filename = filename + "_" + std::to_string(n_batch); + BOOST_LOG_TRIVIAL(debug) << __HERE__ << " Appending variants from " << batch_filename; std::ifstream ifs(batch_filename.c_str(), std::ios::binary); if (!ifs.is_open()) diff --git a/src/typer/vcf_operations.cpp b/src/typer/vcf_operations.cpp index 79d326a..b101fc1 100644 --- a/src/typer/vcf_operations.cpp +++ b/src/typer/vcf_operations.cpp @@ -288,7 +288,8 @@ vcf_merge_and_break(std::vector const & vcfs, std::string const & output, std::string const & region, bool const FILTER_ZERO_QUAL, - bool const force_no_variant_overlapping) + bool const force_no_variant_overlapping, + bool const force_no_break_down) { if (vcfs.size() == 0) return; @@ -323,6 +324,7 @@ vcf_merge_and_break(std::vector const & vcfs, for (long i = 1; i < static_cast(vcfs.size()); ++i) { auto const & vcf_fn = vcfs[i]; + assert((i - 1) < static_cast(next_vcfs.size())); gyper::Vcf & next_vcf = next_vcfs[i - 1]; load_vcf(next_vcf, vcf_fn, 0); @@ -423,7 +425,7 @@ vcf_merge_and_break(std::vector const & vcfs, } // Trigger read next batch - if ((v - v_next) == static_cast(next_vcfs[0].variants.size())) + if (next_vcfs.size() > 0 && (v - v_next) == static_cast(next_vcfs[0].variants.size())) { BOOST_LOG_TRIVIAL(debug) << __HERE__ << " Updating next_vcfs"; @@ -530,10 +532,15 @@ vcf_merge_and_break(std::vector const & vcfs, force_no_variant_overlapping}; bool const is_all_biallelic{copts.is_all_biallelic}; - std::vector new_variants = break_down_variant(std::move(var), - reach, - is_no_variant_overlapping, - is_all_biallelic); + std::vector new_variants; + + if (force_no_break_down) + new_variants.push_back(std::move(var)); + else + new_variants = break_down_variant(std::move(var), + reach, + is_no_variant_overlapping, + is_all_biallelic); assert(new_variants.size() > 0); for (auto & new_var : new_variants) diff --git a/src/utilities/genotype.cpp b/src/utilities/genotype.cpp index 88b4968..fc8c0dc 100644 --- a/src/utilities/genotype.cpp +++ b/src/utilities/genotype.cpp @@ -324,8 +324,8 @@ genotype_only_with_a_vcf(std::string const & ref_path, //for (auto & path : paths) // path += "_calls.vcf.gz"; - //> FILTER_ZERO_QUAL, force_no_variant_overlapping - vcf_merge_and_break(paths, tmp + "/graphtyper.vcf.gz", region.to_string(), true, false); + //> FILTER_ZERO_QUAL, force_no_variant_overlapping, force_no_break_down + vcf_merge_and_break(paths, tmp + "/graphtyper.vcf.gz", region.to_string(), true, false, false); // free memory graph = Graph(); @@ -652,12 +652,17 @@ genotype(std::string ref_path, // path += "_calls.vcf.gz"; //> FILTER_ZERO_QUAL, force_no_variant_overlapping - vcf_merge_and_break(paths, tmp + "/graphtyper.vcf.gz", region.to_string(), true, false); + vcf_merge_and_break(paths, tmp + "/graphtyper.vcf.gz", region.to_string(), true, false, false); if (copts.normal_and_no_variant_overlapping) { //> FILTER_ZERO_QUAL, force_no_variant_overlapping - vcf_merge_and_break(paths, tmp + "/graphtyper.no_variant_overlapping.vcf.gz", region.to_string(), true, true); + vcf_merge_and_break(paths, + tmp + "/graphtyper.no_variant_overlapping.vcf.gz", + region.to_string(), + true, + true, + false); } } diff --git a/src/utilities/genotype_camou.cpp b/src/utilities/genotype_camou.cpp index d7f0a18..289eac3 100644 --- a/src/utilities/genotype_camou.cpp +++ b/src/utilities/genotype_camou.cpp @@ -256,7 +256,7 @@ genotype_camou(std::string const & interval_fn, // path += "_calls.vcf.gz"; //> FILTER_ZERO_QUAL, force_no_variant_overlapping - vcf_merge_and_break(paths, tmp + "/graphtyper.vcf.gz", genomic_region.to_string(), false, false); + vcf_merge_and_break(paths, tmp + "/graphtyper.vcf.gz", genomic_region.to_string(), false, false, false); } auto copy_camou_vcf_to_system = diff --git a/src/utilities/genotype_sv.cpp b/src/utilities/genotype_sv.cpp index 65bb013..78d8b57 100644 --- a/src/utilities/genotype_sv.cpp +++ b/src/utilities/genotype_sv.cpp @@ -151,12 +151,12 @@ genotype_sv(std::string ref_path, // VCF merge { - //// Append _calls.vcf.gz + // Append _calls.vcf.gz //for (auto & path : paths) // path += "_calls.vcf.gz"; //> FILTER_ZERO_QUAL, force_no_variant_overlapping - vcf_merge_and_break(paths, tmp + "/graphtyper.vcf.gz", genomic_region.to_string(), false, false); + vcf_merge_and_break(paths, tmp + "/graphtyper.vcf.gz", genomic_region.to_string(), false, false, true); } } @@ -176,8 +176,17 @@ genotype_sv(std::string ref_path, if (ret != 0) { - BOOST_LOG_TRIVIAL(error) << "This command failed '" << ss_cmd.str() << "'"; - std::exit(ret); + if (extension.size() == 0) + { + // Error if copying final VCF + BOOST_LOG_TRIVIAL(error) << "This command failed '" << ss_cmd.str() << "'"; + std::exit(ret); + } + else + { + // Otherwise a warning + BOOST_LOG_TRIVIAL(warning) << "This command failed '" << ss_cmd.str() << "'"; + } } }; diff --git a/src/utilities/hts_parallel_reader.cpp b/src/utilities/hts_parallel_reader.cpp index 43a3412..e45115e 100644 --- a/src/utilities/hts_parallel_reader.cpp +++ b/src/utilities/hts_parallel_reader.cpp @@ -637,7 +637,6 @@ parallel_reader_genotype_only(std::string * out_path, << output_dir << "/" << first_sample << "_*'"; - std::ostringstream vcf_calls_path; Vcf vcf; // Set sample names @@ -651,10 +650,26 @@ parallel_reader_genotype_only(std::string * out_path, } for (auto & var : vcf.variants) - var.trim_sequences(false); // Don't keep one match + var.trim_sequences(false); // Don't keep one match + + if (graph.is_sv_graph) + { + reformat_sv_vcf_records(vcf.variants, reference_depth); + + if (vcf.sample_names.size() > 0) + { + for (auto & var : vcf.variants) + var.generate_infos(); + } + + // Non-SVs are at the end from the reformat_sv_vcf_records function, so this is needed + std::sort(vcf.variants.begin(), + vcf.variants.end(), + [](Variant const & a, Variant const & b){ + return a.abs_pos < b.abs_pos || (a.abs_pos == b.abs_pos && a.seqs < b.seqs); + }); + } - // If the graph is an SV graph, then reformat the SV record - reformat_sv_vcf_records(vcf.variants, reference_depth); save_vcf(vcf, output_dir + "/" + first_sample); } @@ -809,7 +824,7 @@ parallel_reader_with_discovery(std::string * out_path, << output_dir << "/" << first_sample << "_*'"; - std::ostringstream vcf_calls_path; + // Handle SNP/indel graphs Vcf vcf; // Set sample names @@ -823,10 +838,26 @@ parallel_reader_with_discovery(std::string * out_path, } for (auto & var : vcf.variants) - var.trim_sequences(false); // Don't keep one match + var.trim_sequences(false); // Don't keep one match + + if (graph.is_sv_graph) + { + reformat_sv_vcf_records(vcf.variants, reference_depth); + + if (vcf.sample_names.size() > 0) + { + for (auto & var : vcf.variants) + var.generate_infos(); + } + + // Non-SVs are at the end from the reformat_sv_vcf_records function, so this is needed + std::sort(vcf.variants.begin(), + vcf.variants.end(), + [](Variant const & a, Variant const & b){ + return a.abs_pos < b.abs_pos || (a.abs_pos == b.abs_pos && a.seqs < b.seqs); + }); + } - // If the graph is an SV graph, then reformat the SV record - reformat_sv_vcf_records(vcf.variants, reference_depth); save_vcf(vcf, output_dir + "/" + first_sample); }