Skip to content

Commit

Permalink
Version 2.5.1 (#38)
Browse files Browse the repository at this point in the history
* Bugfix in some rare sv graph construction
* Bugfix in somecases of VCF merging in
* Made failed copy of tabix index a warning in genotype_sv instead of error
  • Loading branch information
hannespetur authored May 17, 2020
1 parent 9d9dfe6 commit d8d0167
Show file tree
Hide file tree
Showing 10 changed files with 167 additions and 67 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ include(ExternalProject)
# The version number
set (graphtyper_VERSION_MAJOR 2)
set (graphtyper_VERSION_MINOR 5)
set (graphtyper_VERSION_PATCH 0)
set (graphtyper_VERSION_PATCH 1)
set(STATIC_DIR "" CACHE STRING "If set, GraphTyper will be built as a static binary using libraries from the given STATIC_DIR.")

# Get the current working branch
Expand Down
5 changes: 3 additions & 2 deletions include/graphtyper/graph/graph.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -172,9 +172,10 @@ class Graph
/**********************
* GRAPH MODIFICATION *
**********************/
bool add_reference(unsigned end_pos,
void add_reference(unsigned end_pos,
unsigned const & num_var,
std::vector<char> const & reference_sequence);
std::vector<char> const & reference_sequence
);

void add_variants(VarRecord && record);

Expand Down
3 changes: 2 additions & 1 deletion include/graphtyper/typer/vcf_operations.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@ void vcf_merge_and_break(std::vector<std::string> const & vcfs,
std::string const & output,
std::string const & region,
bool const FILTER_ZERO_QUAL,
bool const force_no_variant_overlap);
bool const force_no_variant_overlap,
bool const force_no_break_down);

void vcf_update_info(std::string const & vcf, std::string const & output);

Expand Down
96 changes: 63 additions & 33 deletions src/graph/graph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,15 @@ Graph::add_genomic_region(std::vector<char> && reference_sequence,
var_records.end()
);

for (long v = 0; v < static_cast<long>(var_records.size()); ++v)
{
if (var_records[v].pos >= genomic_region.end)
{
var_records.resize(v);
break;
}
}

if (Options::instance()->add_all_variants)
{
BOOST_LOG_TRIVIAL(debug) << "[" << __HERE__ << "] Constructing graph of "
Expand Down Expand Up @@ -157,7 +166,7 @@ Graph::add_genomic_region(std::vector<char> && reference_sequence,
}
else
{
BOOST_LOG_TRIVIAL(debug) << __HERE__ << " Constructing graph of "
BOOST_LOG_TRIVIAL(debug) << "[" << __HERE__ << "] Constructing graph of "
<< var_records.size()
<< " variants.";

Expand Down Expand Up @@ -199,6 +208,46 @@ Graph::add_genomic_region(std::vector<char> && reference_sequence,
continue;
}

std::vector<char> suffix = var_records[i].get_common_suffix();

//if (var_records[i + 1].pos >= var_records[i].pos + var_records[i].ref.size() - suffix.size())
//{
// long const suffix_to_remove = var_records[i].pos + var_records[i].ref.size() - var_records[i + 1].pos;
//
// // Check if it is possible to remove enough suffix so we can safely join the two records
// if (suffix_to_remove <= 0 || suffix_to_remove > static_cast<long>(suffix.size()))
// {
// var_records[i + 1].merge_one_path(std::move(var_records[i]));
// }
// else
// {
// // Erase from previous record so that it ends where the next record starts
// assert(static_cast<long>(var_records[i].ref.size()) > suffix_to_remove);
// var_records[i].ref.erase(var_records[i].ref.end() - suffix_to_remove, var_records[i].ref.end());
//
// for (auto & alt : var_records[i].alts)
// {
// assert(static_cast<long>(alt.size()) > suffix_to_remove);
// alt.erase(alt.end() - suffix_to_remove, alt.end());
// }
//
// assert(var_records[i + 1].pos == var_records[i].pos + var_records[i].ref.size());
//
// // Merge the two records
// long const suffix_to_add = suffix_to_remove - static_cast<long>(var_records[i + 1].ref.size());
//
// var_records[i + 1].merge(std::move(var_records[i]));
//
// if (suffix_to_add > 0)
// {
// var_records[i + 1].ref.insert(var_records[i + 1].ref.end(), suffix.end() - suffix_to_add, suffix.end());
//
// for (auto & alt : var_records[i + 1].alts)
// alt.insert(alt.end(), suffix.end() - suffix_to_add, suffix.end());
// }
// }
//}
//else
{
// In a few extreme scenarios we cannot merge only one path here.
//BOOST_LOG_TRIVIAL(fatal) << "i : " << var_records[i].to_string();
Expand All @@ -217,10 +266,6 @@ Graph::add_genomic_region(std::vector<char> && reference_sequence,
++i;
} // while
}

BOOST_LOG_TRIVIAL(debug) << __HERE__ << " Done merging overlapping variant records. Now I have "
<< var_records.size()
<< " variants.";
}

// Erase alternatives which are identical to the reference sequence
Expand Down Expand Up @@ -252,7 +297,6 @@ Graph::add_genomic_region(std::vector<char> && reference_sequence,

if (suffix.size() > 0)
{
BOOST_LOG_TRIVIAL(debug) << __HERE__ << " Removing commong suffix.";
assert(suffix.size() < var_record.ref.size());
var_record.ref.erase(var_record.ref.begin() + (var_record.ref.size() - suffix.size()), var_record.ref.end());
assert(var_record.ref.size() > 0);
Expand All @@ -265,33 +309,22 @@ Graph::add_genomic_region(std::vector<char> && reference_sequence,
}
}

BOOST_LOG_TRIVIAL(debug) << __HERE__ << " Adding reference and variants to the graph.";
bool is_continue{true};

for (long i = 0; i < static_cast<long>(var_records.size()); ++i)
// Add reference and variants to the graph
for (unsigned i = 0; i < var_records.size(); ++i)
{
BOOST_LOG_TRIVIAL(debug) << __HERE__ << " i = " << i << ", pos = " << var_records[i].pos;
is_continue = add_reference(var_records[i].pos,
static_cast<unsigned>(var_records[i].alts.size()) + 1u,
reference_sequence); // force in the first iteration
add_reference(var_records[i].pos,
static_cast<unsigned>(var_records[i].alts.size()) + 1u,
reference_sequence);

if (is_continue)
{
BOOST_LOG_TRIVIAL(debug) << __HERE__ << " adding variant";
add_variants(std::move(var_records[i]));
}
add_variants(std::move(var_records[i]));
}

if (is_continue)
{
BOOST_LOG_TRIVIAL(debug) << __HERE__ << " Adding final reference sequence behind the last variant.";
add_reference(static_cast<uint32_t>(reference_sequence.size()) + genomic_region.begin, 0, reference_sequence);
}
// Add final reference sequence behind the last variant
add_reference(static_cast<uint32_t>(reference_sequence.size()) + genomic_region.begin, 0, reference_sequence);

// If we chose to use absolute positions we need to change all labels
if (use_absolute_positions)
{
BOOST_LOG_TRIVIAL(debug) << __HERE__ << " Updating labels.";
uint32_t const offset = genomic_region.get_absolute_position(1);
unsigned r = 0;
assert(r < ref_nodes.size());
Expand Down Expand Up @@ -572,10 +605,8 @@ Graph::add_variants(VarRecord && record)
}


bool
Graph::add_reference(unsigned end_pos,
unsigned const & num_var,
std::vector<char> const & reference_sequence)
void
Graph::add_reference(unsigned end_pos, unsigned const & num_var, std::vector<char> const & reference_sequence)
{
if (end_pos > reference_sequence.size() + genomic_region.begin)
{
Expand All @@ -591,13 +622,13 @@ Graph::add_reference(unsigned end_pos,
}

// Make sure end_pos is larger or equal to start_pos
bool is_continue = start_pos <= end_pos;
end_pos = std::max(start_pos, end_pos);

assert(start_pos >= genomic_region.begin);

auto start_it = (start_pos - genomic_region.begin) < reference_sequence.size() ?
reference_sequence.begin() + (start_pos - genomic_region.begin) :
reference_sequence.end();

auto end_it = (end_pos - genomic_region.begin) < reference_sequence.size() ?
reference_sequence.begin() + (end_pos - genomic_region.begin) :
reference_sequence.end();
Expand All @@ -607,11 +638,10 @@ Graph::add_reference(unsigned end_pos,
// Create a vector of indexes
std::vector<TNodeIndex> var_indexes(num_var);

for (long i = 0; i < static_cast<long>(num_var); ++i)
for (unsigned i = 0; i < num_var; ++i)
var_indexes[i] = i + var_nodes.size();

ref_nodes.push_back(RefNode(Label(start_pos, std::move(current_dna), 0), std::move(var_indexes)));
return is_continue;
}


Expand Down
30 changes: 23 additions & 7 deletions src/typer/vcf.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1666,17 +1666,32 @@ save_vcf(Vcf const & vcf, std::string const & filename)
void
load_vcf(Vcf & vcf, std::string const & filename, long n_batch)
{
std::string batch_filename = filename + "_" + std::to_string(n_batch);
std::ifstream ifs(batch_filename.c_str(), std::ios::binary);
std::string const VCF_GZ = ".vcf.gz";

if (filename.size() > VCF_GZ.size() &&
std::equal(filename.rbegin(),
filename.rbegin() + VCF_GZ.size(),
VCF_GZ.rbegin()))

if (!ifs.is_open())
{
BOOST_LOG_TRIVIAL(error) << "Could not open VCF file " << batch_filename;
std::exit(1);
vcf.open(READ_MODE, filename);
vcf.read();
}
else
{
std::string batch_filename = filename + "_" + std::to_string(n_batch);
BOOST_LOG_TRIVIAL(debug) << __HERE__ << " Loading variants from " << batch_filename;
std::ifstream ifs(batch_filename.c_str(), std::ios::binary);

boost::archive::binary_iarchive ia(ifs);
ia >> vcf;
if (!ifs.is_open())
{
BOOST_LOG_TRIVIAL(error) << __HERE__ << " Could not open file " << batch_filename;
std::exit(1);
}

boost::archive::binary_iarchive ia(ifs);
ia >> vcf;
}
}


Expand All @@ -1685,6 +1700,7 @@ append_vcf(Vcf & vcf, std::string const & filename, long n_batch)
{
Vcf new_vcf;
std::string batch_filename = filename + "_" + std::to_string(n_batch);
BOOST_LOG_TRIVIAL(debug) << __HERE__ << " Appending variants from " << batch_filename;
std::ifstream ifs(batch_filename.c_str(), std::ios::binary);

if (!ifs.is_open())
Expand Down
19 changes: 13 additions & 6 deletions src/typer/vcf_operations.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -288,7 +288,8 @@ vcf_merge_and_break(std::vector<std::string> const & vcfs,
std::string const & output,
std::string const & region,
bool const FILTER_ZERO_QUAL,
bool const force_no_variant_overlapping)
bool const force_no_variant_overlapping,
bool const force_no_break_down)
{
if (vcfs.size() == 0)
return;
Expand Down Expand Up @@ -323,6 +324,7 @@ vcf_merge_and_break(std::vector<std::string> const & vcfs,
for (long i = 1; i < static_cast<long>(vcfs.size()); ++i)
{
auto const & vcf_fn = vcfs[i];
assert((i - 1) < static_cast<long>(next_vcfs.size()));
gyper::Vcf & next_vcf = next_vcfs[i - 1];
load_vcf(next_vcf, vcf_fn, 0);

Expand Down Expand Up @@ -423,7 +425,7 @@ vcf_merge_and_break(std::vector<std::string> const & vcfs,
}

// Trigger read next batch
if ((v - v_next) == static_cast<long>(next_vcfs[0].variants.size()))
if (next_vcfs.size() > 0 && (v - v_next) == static_cast<long>(next_vcfs[0].variants.size()))
{
BOOST_LOG_TRIVIAL(debug) << __HERE__ << " Updating next_vcfs";

Expand Down Expand Up @@ -530,10 +532,15 @@ vcf_merge_and_break(std::vector<std::string> const & vcfs,
force_no_variant_overlapping};

bool const is_all_biallelic{copts.is_all_biallelic};
std::vector<Variant> new_variants = break_down_variant(std::move(var),
reach,
is_no_variant_overlapping,
is_all_biallelic);
std::vector<Variant> new_variants;

if (force_no_break_down)
new_variants.push_back(std::move(var));
else
new_variants = break_down_variant(std::move(var),
reach,
is_no_variant_overlapping,
is_all_biallelic);
assert(new_variants.size() > 0);

for (auto & new_var : new_variants)
Expand Down
13 changes: 9 additions & 4 deletions src/utilities/genotype.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -324,8 +324,8 @@ genotype_only_with_a_vcf(std::string const & ref_path,
//for (auto & path : paths)
// path += "_calls.vcf.gz";

//> FILTER_ZERO_QUAL, force_no_variant_overlapping
vcf_merge_and_break(paths, tmp + "/graphtyper.vcf.gz", region.to_string(), true, false);
//> FILTER_ZERO_QUAL, force_no_variant_overlapping, force_no_break_down
vcf_merge_and_break(paths, tmp + "/graphtyper.vcf.gz", region.to_string(), true, false, false);

// free memory
graph = Graph();
Expand Down Expand Up @@ -652,12 +652,17 @@ genotype(std::string ref_path,
// path += "_calls.vcf.gz";

//> FILTER_ZERO_QUAL, force_no_variant_overlapping
vcf_merge_and_break(paths, tmp + "/graphtyper.vcf.gz", region.to_string(), true, false);
vcf_merge_and_break(paths, tmp + "/graphtyper.vcf.gz", region.to_string(), true, false, false);

if (copts.normal_and_no_variant_overlapping)
{
//> FILTER_ZERO_QUAL, force_no_variant_overlapping
vcf_merge_and_break(paths, tmp + "/graphtyper.no_variant_overlapping.vcf.gz", region.to_string(), true, true);
vcf_merge_and_break(paths,
tmp + "/graphtyper.no_variant_overlapping.vcf.gz",
region.to_string(),
true,
true,
false);
}
}

Expand Down
2 changes: 1 addition & 1 deletion src/utilities/genotype_camou.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -256,7 +256,7 @@ genotype_camou(std::string const & interval_fn,
// path += "_calls.vcf.gz";

//> FILTER_ZERO_QUAL, force_no_variant_overlapping
vcf_merge_and_break(paths, tmp + "/graphtyper.vcf.gz", genomic_region.to_string(), false, false);
vcf_merge_and_break(paths, tmp + "/graphtyper.vcf.gz", genomic_region.to_string(), false, false, false);
}

auto copy_camou_vcf_to_system =
Expand Down
17 changes: 13 additions & 4 deletions src/utilities/genotype_sv.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -151,12 +151,12 @@ genotype_sv(std::string ref_path,

// VCF merge
{
//// Append _calls.vcf.gz
// Append _calls.vcf.gz
//for (auto & path : paths)
// path += "_calls.vcf.gz";

//> FILTER_ZERO_QUAL, force_no_variant_overlapping
vcf_merge_and_break(paths, tmp + "/graphtyper.vcf.gz", genomic_region.to_string(), false, false);
vcf_merge_and_break(paths, tmp + "/graphtyper.vcf.gz", genomic_region.to_string(), false, false, true);
}
}

Expand All @@ -176,8 +176,17 @@ genotype_sv(std::string ref_path,

if (ret != 0)
{
BOOST_LOG_TRIVIAL(error) << "This command failed '" << ss_cmd.str() << "'";
std::exit(ret);
if (extension.size() == 0)
{
// Error if copying final VCF
BOOST_LOG_TRIVIAL(error) << "This command failed '" << ss_cmd.str() << "'";
std::exit(ret);
}
else
{
// Otherwise a warning
BOOST_LOG_TRIVIAL(warning) << "This command failed '" << ss_cmd.str() << "'";
}
}
};

Expand Down
Loading

0 comments on commit d8d0167

Please sign in to comment.