From 07404b74d3fe42b20f1362f4d8625f284b9d536c Mon Sep 17 00:00:00 2001 From: fritzsedlazeck Date: Mon, 25 Nov 2019 20:55:46 -0600 Subject: [PATCH] enable ratio merging --- src/SURVIVOR.cpp | 13 +++-- src/Summarize_SV.cpp | 16 +++++- src/Summarize_SV.h | 2 +- src/merge_vcf/IntervallTree.cpp | 94 ++++++++++++++++++++++----------- src/merge_vcf/Paramer.h | 2 +- src/merge_vcf/TNode.h | 26 +++++++++ src/merge_vcf/combine_svs.cpp | 5 +- src/merge_vcf/combine_svs.h | 19 +++++-- 8 files changed, 133 insertions(+), 44 deletions(-) diff --git a/src/SURVIVOR.cpp b/src/SURVIVOR.cpp index 72af117..4339309 100644 --- a/src/SURVIVOR.cpp +++ b/src/SURVIVOR.cpp @@ -47,6 +47,8 @@ Parameter* Parameter::m_pInstance = NULL; +//todo: merge 1: Check all subtypes during overlap. 2: Manage split up after merge. +//todo: maybe a low mem version..?? Just storing the variant ID per input, then go back in the end.. //make file: LIBS +=-lz void official_interface(int argc, char *argv[]) { @@ -120,10 +122,10 @@ void official_interface(int argc, char *argv[]) { if (argc == 10) { //merge 3 SV calls from the same strain // combine_calls_new(std::string(argv[2]), atoi(argv[3]), atoi(argv[4]), std::string(argv[5])); - combine_calls_svs(std::string(argv[2]), atoi(argv[3]), atoi(argv[4]), atoi(argv[5]), atoi(argv[6]), atoi(argv[7]), atoi(argv[8]), std::string(argv[9])); + combine_calls_svs(std::string(argv[2]), atof(argv[3]), atoi(argv[4]), atoi(argv[5]), atoi(argv[6]), atoi(argv[7]), atoi(argv[8]), std::string(argv[9])); } else { std::cerr << "File with VCF names and paths" << std::endl; - std::cerr << "max distance between breakpoints " << std::endl; + std::cerr << "max distance between breakpoints (0-1 percent of length, 1- number of bp) " << std::endl; std::cerr << "Minimum number of supporting caller" << std::endl; std::cerr << "Take the type into account (1==yes, else no)" << std::endl; std::cerr << "Take the strands of SVs into account (1==yes, else no)" << std::endl; @@ -236,10 +238,11 @@ void official_interface(int argc, char *argv[]) { exit(0); } else if (strcmp(argv[1], "genComp") == 0) { - if (argc == 4) { - summary_venn(std::string(argv[2]), std::string(argv[3])); + if (argc == 5) { + summary_venn(std::string(argv[2]), bool(atoi(argv[3])==1), std::string(argv[4])); } else { std::cerr << "Merged Vcf file" << std::endl; + std::cerr << "Normalize output (1==yes, else no)" << std::endl; std::cerr << "Output: pariwise overlap matrix" << std::endl; } exit(0); @@ -546,7 +549,7 @@ int main(int argc, char *argv[]) { case 19: if (argc == 4) { //summarize venn - summary_venn(std::string(argv[2]), std::string(argv[3])); + summary_venn(std::string(argv[2]),false, std::string(argv[3])); } else { std::cerr << "vcf venn file" << std::endl; std::cerr << "output file" << std::endl; diff --git a/src/Summarize_SV.cpp b/src/Summarize_SV.cpp index ea5d59a..58e400e 100644 --- a/src/Summarize_SV.cpp +++ b/src/Summarize_SV.cpp @@ -530,7 +530,7 @@ std::vector parse_supp_vec(std::string buffer) { return ids; } -void summary_venn(std::string filename, std::string output) { +void summary_venn(std::string filename, bool normalize, std::string output) { std::vector > mat; // cout << "file: " << filename << endl; @@ -600,7 +600,19 @@ void summary_venn(std::string filename, std::string output) { FILE * file = fopen(output.c_str(), "w"); for (size_t i = 0; i < mat.size(); i++) { for (size_t j = 0; j < mat.size(); j++) { - fprintf(file, "%i", mat[i][j]); + if(normalize){ + //if(i>j){ + if(mat[i][i]>0){ + fprintf(file, "%e", (double)mat[i][j]/(double)mat[i][i]); + }else{ + fprintf(file, "%e", 0); + } + // }else{ + // fprintf(file, "%e", (double)mat[i][j]/(double)mat[i][i]); + //} + }else{ + fprintf(file, "%i", mat[i][j]); + } fprintf(file, "%c", '\t'); } fprintf(file, "%c", '\n'); diff --git a/src/Summarize_SV.h b/src/Summarize_SV.h index dd22da5..2545707 100644 --- a/src/Summarize_SV.h +++ b/src/Summarize_SV.h @@ -20,7 +20,7 @@ using namespace std; void summary_SV(std::string filename, int min_size, int max_size, int min_reads,std::string output); -void summary_venn(std::string filename, std::string output); +void summary_venn(std::string filename, bool normalize, std::string output); void summary_SV_stream(int min_size, int max_size, std::string output); #endif /* SUMMARIZE_SV_H_ */ diff --git a/src/merge_vcf/IntervallTree.cpp b/src/merge_vcf/IntervallTree.cpp index a40ffe1..aa9c0e2 100644 --- a/src/merge_vcf/IntervallTree.cpp +++ b/src/merge_vcf/IntervallTree.cpp @@ -12,18 +12,43 @@ bool IntervallTree::same_breakpoint(breakpoint_str first, breakpoint_str second, return (strcmp(first.chr.c_str(), second.chr.c_str()) == 0 && (abs(first.position - second.position) < max_dist)); } -bool is_same_strand(std::pair first, std::pair second) { - return (first.first == second.first && first.second == second.second); +bool is_same_strand(std::pair first, bool strands[]){//std::pair second) { + + short count=0; + if(strands[0] && first.first){ + count++; + } + if(strands[1] && !first.first){ + count++; + } + if(strands[2] && first.second){ + count++; + } + if(strands[3] && !first.second){ + count++; + } + + return(bool)(count==2); + + + //return (first.first == second.first && first.second == second.second); } -bool same_type(short first, short second) { - if (first == second) { +bool same_type(short first, bool types[]) { + + if(types[first]==1){ return true; - } else if (((first == 3 || first == 2) && second == 5) || ((second == 3 || second == 2) && first == 5)) { // compare BND to inv or tra -> same type! + }else if(((first==3 || first ==2 ) && types[5]==1) || ((types[3]==1 || types[2]==1) && first==5)){ return true; } + + //if (first == second) { + // return true; + //} else if (((first == 3 || first == 2) && second == 5) || ((second == 3 || second == 2) && first == 5)) { // compare BND to inv or tra -> same type! + // return true; + //} return false; } -int get_dist(long dist, short type) { +/*int get_dist(long dist, short type) { if (type == 3) { //TRA return Parameter::Instance()->max_dist; //TODO: change! @@ -32,7 +57,7 @@ int get_dist(long dist, short type) { dist = std::max((long) Parameter::Instance()->min_length * 2, dist); return std::min((int) (dist * 4), Parameter::Instance()->max_dist); -} +}*/ long IntervallTree::overlap_SNP(breakpoint_str start, SVS_Node * curr_svs) { @@ -74,39 +99,46 @@ long IntervallTree::overlap(breakpoint_str start, breakpoint_str stop, short typ }*/ /* if (start.position == 112179238 || start.position == 112179329) { - std::cout << "Comp: " << start.chr << " " << start.position << " " << curr_svs->first.chr << " " << curr_svs->first.position << " " << type << " " << curr_svs->type << std::endl; - }*/ + std::cout << "Comp: " << start.chr << " " << start.position << " " << curr_svs->first.chr << " " << curr_svs->first.position << " " << type << " " << curr_svs->type << std::endl; + }*/ - int max_dist = Parameter::Instance()->max_dist; + + double max_dist = Parameter::Instance()->max_dist; + if(max_dist<1 && strcmp(start.chr.c_str(),stop.chr.c_str())!=0){ + max_dist=1000; + }else if(max_dist<1){ + max_dist=std::floor((double)(stop.position - start.position) * max_dist); + //std::cout<dynamic_size) { max_dist = std::min(get_dist(stop.position - start.position, type), get_dist(curr_svs->first.position - curr_svs->second.position, curr_svs->type)); // } */ - if (((!Parameter::Instance()->use_strand || is_same_strand(strands, curr_svs->strand)) && (!Parameter::Instance()->use_type || same_type(type, curr_svs->type))) && (same_breakpoint(start, curr_svs->first, max_dist) && same_breakpoint(stop, curr_svs->second, max_dist))) { - /* if (start.position == 112179238 || start.position == 112179329) { - std::cout << "MERGE" << std::endl; - std::cout << std::endl; - }*/ + if (((!Parameter::Instance()->use_strand || is_same_strand(strands, curr_svs->strands)) && (!Parameter::Instance()->use_type || same_type(type, curr_svs->types))) && (same_breakpoint(start, curr_svs->first, max_dist) && same_breakpoint(stop, curr_svs->second, max_dist))) { + /* if (start.position == 112179238 || start.position == 112179329) { + std::cout << "MERGE" << std::endl; + std::cout << std::endl; + }*/ return 0; //to be merged } /*if (start.position == 112179238 || start.position == 112179329) { - std::cout << "no MERGE: "<second.chr<<" "<second.position << std::endl; - if (is_same_strand(strands, curr_svs->strand)) { - std::cout << "\tsame strand" << std::endl; - } - if (same_type(type, curr_svs->type)) { - std::cout << "\tsame type" << std::endl; - } - std::cout << "dist: " << max_dist << std::endl; - if (same_breakpoint(start, curr_svs->first, max_dist)) { - std::cout << "\tsame breakpoint start" << std::endl; - } - if (same_breakpoint(stop, curr_svs->second, max_dist)) { - std::cout << "\tsame breakpoint stop" << std::endl; - } - std::cout << std::endl; - }*/ + std::cout << "no MERGE: "<second.chr<<" "<second.position << std::endl; + if (is_same_strand(strands, curr_svs->strand)) { + std::cout << "\tsame strand" << std::endl; + } + if (same_type(type, curr_svs->type)) { + std::cout << "\tsame type" << std::endl; + } + std::cout << "dist: " << max_dist << std::endl; + if (same_breakpoint(start, curr_svs->first, max_dist)) { + std::cout << "\tsame breakpoint start" << std::endl; + } + if (same_breakpoint(stop, curr_svs->second, max_dist)) { + std::cout << "\tsame breakpoint stop" << std::endl; + } + std::cout << std::endl; + }*/ /* std::cout<<"no MERGE: "<first.position <<" vs "<< stop.position << " " << curr_svs->second.position<strand)){ std::cout<<"\tsame strand"<data->second = stop; this->data->type = type; this->data->strand = strands; + if (strands.first) { + this->data->strands[0] = true; + } else { + this->data->strands[1] = true; + } + + if (strands.second) { + this->data->strands[2] = true; + } else { + this->data->strands[3] = true; + } + this->data->genotype = meta_info.genotype; //do I need this? + this->data->types[type] = true; init(); Support_Node * tmp = new Support_Node(); @@ -103,6 +116,19 @@ class TNode { this->data->caller_info.push_back(tmp); } + this->data->types[type] = true; //extend if there is an in sample merge! + if (strands.first) { + this->data->strands[0] = true; + } else { + this->data->strands[1] = true; + } + + if (strands.second) { + this->data->strands[2] = true; + } else { + this->data->strands[3] = true; + } + this->data->caller_info[index]->starts.push_back(start.position); this->data->caller_info[index]->stops.push_back(stop.position); this->data->caller_info[index]->types.push_back(type); diff --git a/src/merge_vcf/combine_svs.cpp b/src/merge_vcf/combine_svs.cpp index 1e6dcb3..31b1509 100644 --- a/src/merge_vcf/combine_svs.cpp +++ b/src/merge_vcf/combine_svs.cpp @@ -620,7 +620,7 @@ void parse_vcf_header(std::map &chrs, std::string filename) { } } -void combine_calls_svs(std::string files, int max_dist, int min_support, int type_save, int strand_save, int dynamic_size, int min_svs, std::string output) { +void combine_calls_svs(std::string files, double max_dist, int min_support, int type_save, int strand_save, int dynamic_size, int min_svs, std::string output) { std::vector names = parse_filename(files); Parameter::Instance()->max_caller = names.size(); @@ -691,6 +691,9 @@ void combine_calls_svs(std::string files, int max_dist, int min_support, int typ } if (support >= min_support && len > min_svs) { + if(Parameter::Instance()->use_strand || Parameter::Instance()->use_type){ + + } print_entry_overlap(file, (*i), id); } diff --git a/src/merge_vcf/combine_svs.h b/src/merge_vcf/combine_svs.h index c28871c..9b3da01 100644 --- a/src/merge_vcf/combine_svs.h +++ b/src/merge_vcf/combine_svs.h @@ -74,6 +74,7 @@ class SVS_Node { //just for testing! SVS_Node() { + type=-1; num_support.first=-1; num_support.second=-1; @@ -81,6 +82,18 @@ class SVS_Node { strand.second=false; caller_info.clear(); genotype="./."; + + types[0]=false; //DEL + types[1]=false; //DUP + types[2]=false; //INV + types[3]=false; //TRA + types[4]=false; //UNK + + strands[0]=false; //+ + strands[1]=false; //- + strands[2]=false; //+ + strands[3]=false; //- + } ~SVS_Node() { caller_info.clear(); @@ -94,15 +107,15 @@ class SVS_Node { std::pair num_support; std::pair strand; std::string genotype; - - + bool types[5]; + bool strands[4]; }; #include "../structs.h" #include "IntervallTree.h" #include "../vcfs/Merge_VCF.h" -void combine_calls_svs(std::string file, int max_dist, int min_support, int type_save, int strand_save,int dynamic_size,int min_svs, std::string output); +void combine_calls_svs(std::string file, double max_dist, int min_support, int type_save, int strand_save,int dynamic_size,int min_svs, std::string output); breakpoint_str convert_position(strcoordinate pos); void summarize_VCF_files(std::string filename, int min_size, std::string output); void print_entry_overlap(FILE *& file, SVS_Node * entry, int id);