From b57adcf1bcd47e5cb59aa3be8f43f1fd2547f4fb Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Wed, 11 Sep 2024 08:53:25 -0400 Subject: [PATCH 1/5] add normalizer to vg paths --- src/subcommand/paths_main.cpp | 34 ++++++--- src/traversal_clusters.cpp | 132 +++++++++++++++++++++++++++++++++- src/traversal_clusters.hpp | 11 +++ 3 files changed, 168 insertions(+), 9 deletions(-) diff --git a/src/subcommand/paths_main.cpp b/src/subcommand/paths_main.cpp index d6dd1f4adf8..1e4906bbcfd 100644 --- a/src/subcommand/paths_main.cpp +++ b/src/subcommand/paths_main.cpp @@ -16,6 +16,7 @@ #include "../vg.hpp" #include "../xg.hpp" #include "../gbwt_helper.hpp" +#include "../traversal_clusters.hpp" #include #include #include @@ -37,6 +38,7 @@ void help_paths(char** argv) { << " -V, --extract-vg output a path-only graph covering the selected paths" << endl << " -d, --drop-paths output a graph with the selected paths removed" << endl << " -r, --retain-paths output a graph with only the selected paths retained" << endl + << " -n, --normalize-paths output a graph where all equivalent paths in a site a merged (using selected paths to snap to if possible)" << endl << " output path data:" << endl << " -X, --extract-gam print (as GAM alignments) the stored paths in the graph" << endl << " -A, --extract-gaf print (as GAF alignments) the stored paths in the graph" << endl @@ -117,6 +119,7 @@ int main_paths(int argc, char** argv) { size_t input_formats = 0; bool coverage = false; const size_t coverage_bins = 10; + bool normalize_paths = false; int c; optind = 2; // force optind past command positional argument @@ -130,6 +133,7 @@ int main_paths(int argc, char** argv) { {"extract-vg", no_argument, 0, 'V'}, {"drop-paths", no_argument, 0, 'd'}, {"retain-paths", no_argument, 0, 'r'}, + {"normalize-paths", no_argument, 0, 'n'}, {"extract-gam", no_argument, 0, 'X'}, {"extract-gaf", no_argument, 0, 'A'}, {"list", no_argument, 0, 'L'}, @@ -154,7 +158,7 @@ int main_paths(int argc, char** argv) { }; int option_index = 0; - c = getopt_long (argc, argv, "hLXv:x:g:Q:VEMCFAS:Tq:draGRHp:c", + c = getopt_long (argc, argv, "hLXv:x:g:Q:VEMCFAS:Tq:drnaGRHp:c", long_options, &option_index); // Detect the end of the options. @@ -189,6 +193,11 @@ int main_paths(int argc, char** argv) { retain_paths = true; output_formats++; break; + + case 'n': + normalize_paths = true; + output_formats++; + break; case 'X': extract_as_gam = true; @@ -309,7 +318,7 @@ int main_paths(int argc, char** argv) { if (!gbwt_file.empty()) { bool need_graph = (extract_as_gam || extract_as_gaf || extract_as_vg || drop_paths || retain_paths || extract_as_fasta || list_lengths); if (need_graph && graph_file.empty()) { - std::cerr << "error: [vg paths] a graph is needed for extracting threads in -X, -A, -V, -d, -r, -E or -F format" << std::endl; + std::cerr << "error: [vg paths] a graph is needed for extracting threads in -X, -A, -V, -d, -r, -n, -E or -F format" << std::endl; std::exit(EXIT_FAILURE); } if (!need_graph && !graph_file.empty()) { @@ -320,7 +329,7 @@ int main_paths(int argc, char** argv) { } } if (output_formats != 1) { - std::cerr << "error: [vg paths] one output format (-X, -A, -V, -d, -r, -L, -F, -E, -C or -c) must be specified" << std::endl; + std::cerr << "error: [vg paths] one output format (-X, -A, -V, -d, -r, -n, -L, -F, -E, -C or -c) must be specified" << std::endl; std::exit(EXIT_FAILURE); } if (selection_criteria > 1) { @@ -335,8 +344,8 @@ int main_paths(int argc, char** argv) { std::cerr << "error: [vg paths] listing path metadata is not compatible with a GBWT index" << std::endl; std::exit(EXIT_FAILURE); } - if ((drop_paths || retain_paths) && !gbwt_file.empty()) { - std::cerr << "error: [vg paths] dropping or retaining paths only works on embedded graph paths, not GBWT threads" << std::endl; + if ((drop_paths || retain_paths || normalize_paths) && !gbwt_file.empty()) { + std::cerr << "error: [vg paths] dropping, retaining or normalizing paths only works on embedded graph paths, not GBWT threads" << std::endl; std::exit(EXIT_FAILURE); } if (coverage && !gbwt_file.empty()) { @@ -566,7 +575,7 @@ int main_paths(int argc, char** argv) { }; - if (drop_paths || retain_paths) { + if (drop_paths || retain_paths || normalize_paths) { MutablePathMutableHandleGraph* mutable_graph = dynamic_cast(graph.get()); if (!mutable_graph) { std::cerr << "error[vg paths]: graph cannot be modified" << std::endl; @@ -583,12 +592,21 @@ int main_paths(int argc, char** argv) { for_each_selected_path([&](const path_handle_t& path_handle) { to_destroy.push_back(path_handle); }); - } else { + } else if (retain_paths) { for_each_unselected_path([&](const path_handle_t& path_handle) { to_destroy.push_back(path_handle); }); + } else { + assert(normalize_paths); + unordered_set selected_paths; + for_each_selected_path([&](const path_handle_t& path_handle) { + selected_paths.insert(path_handle); + }); + merge_equivalent_traversals_in_graph(mutable_graph, selected_paths); + } + if (!to_destroy.empty()) { + mutable_graph->destroy_paths(to_destroy); } - mutable_graph->destroy_paths(to_destroy); // output the graph serializable_graph->serialize(cout); diff --git a/src/traversal_clusters.cpp b/src/traversal_clusters.cpp index b18348e16c0..4b2e0621155 100644 --- a/src/traversal_clusters.cpp +++ b/src/traversal_clusters.cpp @@ -1,9 +1,13 @@ #include "traversal_clusters.hpp" +#include "traversal_finder.hpp" +#include "integrated_snarl_finder.hpp" +#include "snarl_distance_index.hpp" //#define debug namespace vg { +using namespace bdsg; // specialized version of jaccard_coefficient() that weights jaccard by node lenths. double weighted_jaccard_coefficient(const PathHandleGraph* graph, @@ -279,9 +283,135 @@ vector> assign_child_snarls_to_traversals(const PathHandleGraph* gra return trav_to_child_snarls; } +static void merge_equivalent_traversals_in_snarl(MutablePathHandleGraph* graph, const unordered_set& selected_paths, + PathTraversalFinder& path_trav_finder, + const handle_t& start_handle, const handle_t& end_handle ) { + // find the path traversals through the snarl + vector path_travs; + vector path_intervals; + std::tie(path_travs, path_intervals) = path_trav_finder.find_path_traversals(start_handle, end_handle); + vector trav_names(path_travs.size()); + vector trav_paths(path_travs.size()); + vector trav_reversed(path_travs.size()); + // organize paths by their alleles strings + // todo: we could use a fancier index to save memory here (prob not necessary tho) + unordered_map> string_to_travs; + for (int64_t i = 0; i < path_travs.size(); ++i) { + const Traversal& trav = path_travs[i]; + string allele; + for (int64_t j = 1; j < trav.size() - 1; ++j) { + allele += toUppercase(graph->get_sequence(trav[j])); + } + string_to_travs[allele].push_back(i); + trav_paths[i] = graph->get_path_handle_of_step(path_intervals[i].first); + trav_names[i] = graph->get_path_name(trav_paths[i]); + trav_reversed[i] = graph->get_is_reverse(graph->get_handle_of_step(path_intervals[i].first)) != + graph->get_is_reverse(start_handle); +#ifdef debug + cerr << "trav " << i << ": "; + cerr << "n=" << trav_names[i] << " " + << "i=" << (graph->get_is_reverse(graph->get_handle_of_step(path_intervals[i].first)) ? "<" : ">") + << graph->get_id(graph->get_handle_of_step(path_intervals[i].first)) + << (graph->get_is_reverse(graph->get_handle_of_step(path_intervals[i].second)) ? "<" : ">") + << graph->get_id(graph->get_handle_of_step(path_intervals[i].second)) << " t="; + for (auto xx : trav) { + cerr << (graph->get_is_reverse(xx) ? "<" : ">") << graph->get_id(xx); + } + cerr << " a=" << allele << " r=" << trav_reversed[i] << endl; +#endif + } + + // rank the traversals, putting selected ones first otherwise alphabetical + function trav_idx_less = [&](int64_t i, int64_t j) { + bool iref = selected_paths.count(trav_paths[i]); + bool jref = selected_paths.count(trav_paths[j]); + if (iref != jref) { + return iref; + } + return trav_names[i] < trav_names[j]; + }; + + // merge up the paths + for (const auto& allele_travs : string_to_travs) { + if (allele_travs.first.length() > 0 && allele_travs.second.size() > 1) { + const vector& eq_travs = allele_travs.second; + auto canonical_it = std::min_element(eq_travs.begin(), eq_travs.end(), trav_idx_less); + assert(canonical_it != eq_travs.end()); + int64_t canonical_idx = *canonical_it; + Traversal canonical_trav = path_travs[canonical_idx]; + if (trav_reversed[canonical_idx]) { + Traversal flip_trav; + for (auto i = canonical_trav.rbegin(); i != canonical_trav.rend(); ++i) { + flip_trav.push_back(graph->flip(*i)); + } + std::swap(canonical_trav, flip_trav); + } + + // edit it into every other path + // + // WARNING: in the case of loops, we could be editing the same path more than once + // so making the big undocumented assumption that step handles outside edit + // are unaffected!! + for (int64_t i = 0; i < eq_travs.size(); ++i) { + int64_t replace_idx = eq_travs[i]; + if (replace_idx != canonical_idx && path_travs[replace_idx] != path_travs[canonical_idx]) { + PathInterval interval_to_replace = path_intervals[replace_idx]; + if (trav_reversed[replace_idx]) { + std::swap(interval_to_replace.first, interval_to_replace.second); + } +#ifdef debug + cerr << "editing interval of size " << path_travs[replace_idx].size() + << " from path " << trav_names[replace_idx] << " with canonical interval of size " + << path_travs[canonical_idx].size() << " from path " + << trav_names[canonical_idx] << endl; - + cerr << "interval to replace first " << graph->get_id(graph->get_handle_of_step(interval_to_replace.first)) + << " second " << graph->get_id(graph->get_handle_of_step(interval_to_replace.second)) << endl; +#endif + graph->rewrite_segment(interval_to_replace.first, graph->get_next_step(interval_to_replace.second), + canonical_trav); + } + } + } + } +} + +void merge_equivalent_traversals_in_graph(MutablePathHandleGraph* graph, const unordered_set& selected_paths) { + // compute the snarls + SnarlDistanceIndex distance_index; + { + IntegratedSnarlFinder snarl_finder(*graph); + // todo: why can't I pass in 0 below -- I don't want any dinstances! + fill_in_distance_index(&distance_index, graph, &snarl_finder, 1); + } + + // only consider embedded paths that span snarl + PathTraversalFinder path_trav_finder(*graph); + + // do every snarl top-down. this is because it's possible (tho probably rare) for a child snarl to + // be redundant after normalizing its parent. don't think the opposite (normalizing child) + // causes redundant parent.. todo: can we guarantee?! + net_handle_t root = distance_index.get_root(); + deque queue = {root}; + + while (!queue.empty()) { + net_handle_t net_handle = queue.front(); + queue.pop_front(); + if (distance_index.is_snarl(net_handle)) { + net_handle_t start_bound = distance_index.get_bound(net_handle, false, true); + net_handle_t end_bound = distance_index.get_bound(net_handle, true, false); + handle_t start_handle = distance_index.get_handle(start_bound, graph); + handle_t end_handle = distance_index.get_handle(end_bound, graph); + merge_equivalent_traversals_in_snarl(graph, selected_paths, path_trav_finder, start_handle, end_handle); + } + if (net_handle == root || distance_index.is_snarl(net_handle) || distance_index.is_chain(net_handle)) { + distance_index.for_each_child(net_handle, [&](net_handle_t child_handle) { + queue.push_back(child_handle); + }); + } + } +} } diff --git a/src/traversal_clusters.hpp b/src/traversal_clusters.hpp index a4e7710d4b5..646cbf46409 100644 --- a/src/traversal_clusters.hpp +++ b/src/traversal_clusters.hpp @@ -90,4 +90,15 @@ vector> assign_child_snarls_to_traversals(const PathHandleGraph* gra const vector& traversals, const vector>& child_snarls); +/// For every top-level snarl in the graph, compute the traversal strings of every embedded path that spans it +/// If two or more traversals share an allele string, then a "canoncial" path is chosen and all remaining paths +/// are edited so that they share the exact same interval through the snarl as the canonical path's traversal. +/// A path is considered "canoncial" if it's in the "selected_paths" and the other paths are not +/// (otherwise the lowest name is used as a fallback) +/// +/// Note: this doesn't modify the graph toplogy, so uncovered nodes and edges as a result of path editing +/// would usually need removale with vg clip afterwards +/// +void merge_equivalent_traversals_in_graph(MutablePathHandleGraph* graph, const unordered_set& selected_paths); + } From 62ac6a290c94957ea5156d4f3d569a328e4a0e1c Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Wed, 11 Sep 2024 10:42:20 -0400 Subject: [PATCH 2/5] fix strand bug --- src/traversal_clusters.cpp | 46 +++++++++++++++++++------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/src/traversal_clusters.cpp b/src/traversal_clusters.cpp index 4b2e0621155..81d8e8fe46e 100644 --- a/src/traversal_clusters.cpp +++ b/src/traversal_clusters.cpp @@ -294,6 +294,7 @@ static void merge_equivalent_traversals_in_snarl(MutablePathHandleGraph* graph, vector trav_paths(path_travs.size()); vector trav_reversed(path_travs.size()); + // organize paths by their alleles strings // todo: we could use a fancier index to save memory here (prob not necessary tho) unordered_map> string_to_travs; @@ -309,18 +310,15 @@ static void merge_equivalent_traversals_in_snarl(MutablePathHandleGraph* graph, trav_reversed[i] = graph->get_is_reverse(graph->get_handle_of_step(path_intervals[i].first)) != graph->get_is_reverse(start_handle); #ifdef debug - cerr << "trav " << i << ": "; - cerr << "n=" << trav_names[i] << " " - << "i=" << (graph->get_is_reverse(graph->get_handle_of_step(path_intervals[i].first)) ? "<" : ">") - << graph->get_id(graph->get_handle_of_step(path_intervals[i].first)) - << (graph->get_is_reverse(graph->get_handle_of_step(path_intervals[i].second)) ? "<" : ">") - << graph->get_id(graph->get_handle_of_step(path_intervals[i].second)) << " t="; - for (auto xx : trav) { - cerr << (graph->get_is_reverse(xx) ? "<" : ">") << graph->get_id(xx); - } - cerr << " a=" << allele << " r=" << trav_reversed[i] << endl; + cerr << "snarl " << graph_interval_to_string(graph, start_handle, end_handle) << endl + << "trav " << i << ": " + << "n=" << trav_names[i] << " " + << "i=" << graph_interval_to_string(graph, graph->get_handle_of_step(path_intervals[i].first), + graph->get_handle_of_step(path_intervals[i].second)) + << " t=" << traversal_to_string(graph, trav) + << " a=" << allele << " r=" << trav_reversed[i] << endl; #endif - } + } // rank the traversals, putting selected ones first otherwise alphabetical function trav_idx_less = [&](int64_t i, int64_t j) { @@ -339,13 +337,10 @@ static void merge_equivalent_traversals_in_snarl(MutablePathHandleGraph* graph, auto canonical_it = std::min_element(eq_travs.begin(), eq_travs.end(), trav_idx_less); assert(canonical_it != eq_travs.end()); int64_t canonical_idx = *canonical_it; - Traversal canonical_trav = path_travs[canonical_idx]; - if (trav_reversed[canonical_idx]) { - Traversal flip_trav; - for (auto i = canonical_trav.rbegin(); i != canonical_trav.rend(); ++i) { - flip_trav.push_back(graph->flip(*i)); - } - std::swap(canonical_trav, flip_trav); + const Traversal& canonical_trav = path_travs[canonical_idx]; + Traversal canonical_trav_flip; + for (auto i = canonical_trav.rbegin(); i != canonical_trav.rend(); ++i) { + canonical_trav_flip.push_back(graph->flip(*i)); } // edit it into every other path @@ -360,18 +355,23 @@ static void merge_equivalent_traversals_in_snarl(MutablePathHandleGraph* graph, if (trav_reversed[replace_idx]) { std::swap(interval_to_replace.first, interval_to_replace.second); } + const Traversal& replacement_trav = trav_reversed[replace_idx] ? canonical_trav_flip : canonical_trav; #ifdef debug cerr << "editing interval of size " << path_travs[replace_idx].size() << " from path " << trav_names[replace_idx] << " with canonical interval of size " - << path_travs[canonical_idx].size() << " from path " + << replacement_trav.size() << " from path " << trav_names[canonical_idx] << endl; - cerr << "interval to replace first " << graph->get_id(graph->get_handle_of_step(interval_to_replace.first)) - << " second " << graph->get_id(graph->get_handle_of_step(interval_to_replace.second)) << endl; + cerr << "--interval to replace: " + << graph_interval_to_string(graph, graph->get_handle_of_step(interval_to_replace.first), + graph->get_handle_of_step(interval_to_replace.second)) << endl + << "--interval coming in: " << traversal_to_string(graph, replacement_trav) << endl; -#endif +#endif + assert(graph->get_handle_of_step(interval_to_replace.first) == replacement_trav.front()); + assert(graph->get_handle_of_step(interval_to_replace.second) == replacement_trav.back()); graph->rewrite_segment(interval_to_replace.first, graph->get_next_step(interval_to_replace.second), - canonical_trav); + replacement_trav); } } } From c0be4ef5b485d2ece7de25bdd9ee76c19b6108c0 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Wed, 11 Sep 2024 12:29:13 -0400 Subject: [PATCH 3/5] threads option --- src/subcommand/paths_main.cpp | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/src/subcommand/paths_main.cpp b/src/subcommand/paths_main.cpp index 1e4906bbcfd..b2a70fd6efc 100644 --- a/src/subcommand/paths_main.cpp +++ b/src/subcommand/paths_main.cpp @@ -55,7 +55,8 @@ void help_paths(char** argv) { << " -a, --variant-paths select the variant paths added by 'vg construct -a'" << endl << " -G, --generic-paths select the generic, non-reference, non-haplotype paths" << endl << " -R, --reference-paths select the reference paths" << endl - << " -H, --haplotype-paths select the haplotype paths paths" << endl; + << " -H, --haplotype-paths select the haplotype paths paths" << endl + << " -t, --threads N number of threads to use [all available]. applies only to snarl finding within -n" << endl; } /// Chunk a path and emit it in Graph messages. @@ -158,7 +159,7 @@ int main_paths(int argc, char** argv) { }; int option_index = 0; - c = getopt_long (argc, argv, "hLXv:x:g:Q:VEMCFAS:Tq:drnaGRHp:c", + c = getopt_long (argc, argv, "hLXv:x:g:Q:VEMCFAS:Tq:drnaGRHp:ct:", long_options, &option_index); // Detect the end of the options. @@ -284,6 +285,17 @@ int main_paths(int argc, char** argv) { selection_criteria++; break; + case 't': + { + int num_threads = parse(optarg); + if (num_threads <= 0) { + cerr << "error:[vg paths] Thread count (-t) set to " << num_threads << ", must set to a positive integer." << endl; + exit(1); + } + omp_set_num_threads(num_threads); + break; + } + case 'h': case '?': help_paths(argv); From a1bf04d8201cc8b501a476d49599839a0f03fafa Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Wed, 11 Sep 2024 14:54:37 -0400 Subject: [PATCH 4/5] add tests for paths normalizer --- src/subcommand/paths_main.cpp | 8 ++----- test/t/11_vg_paths.t | 45 ++++++++++++++++++++++++++++++++++- 2 files changed, 46 insertions(+), 7 deletions(-) diff --git a/src/subcommand/paths_main.cpp b/src/subcommand/paths_main.cpp index b2a70fd6efc..fb668d7edd4 100644 --- a/src/subcommand/paths_main.cpp +++ b/src/subcommand/paths_main.cpp @@ -17,6 +17,7 @@ #include "../xg.hpp" #include "../gbwt_helper.hpp" #include "../traversal_clusters.hpp" +#include "../io/save_handle_graph.hpp" #include #include #include @@ -593,11 +594,6 @@ int main_paths(int argc, char** argv) { std::cerr << "error[vg paths]: graph cannot be modified" << std::endl; exit(1); } - SerializableHandleGraph* serializable_graph = dynamic_cast(graph.get()); - if (!serializable_graph) { - std::cerr << "error[vg paths]: graph cannot be saved after modification" << std::endl; - exit(1); - } vector to_destroy; if (drop_paths) { @@ -621,7 +617,7 @@ int main_paths(int argc, char** argv) { } // output the graph - serializable_graph->serialize(cout); + vg::io::save_handle_graph(mutable_graph, cout); } else if (coverage) { // for every node, count the number of unique paths. then add the coverage count to each one diff --git a/test/t/11_vg_paths.t b/test/t/11_vg_paths.t index 3bba6231d56..36729a7a0ba 100644 --- a/test/t/11_vg_paths.t +++ b/test/t/11_vg_paths.t @@ -7,7 +7,7 @@ PATH=../bin:$PATH # for vg export LC_ALL="C" # force a consistent sort order -plan tests 20 +plan tests 26 vg construct -r small/x.fa -v small/x.vcf.gz -a > x.vg vg construct -r small/x.fa -v small/x.vcf.gz > x2.vg @@ -61,4 +61,47 @@ is $? 0 "vg path coverage reports correct lengths in first column" rm -f q.vg q.cov.len q.len +# redundant paths are x2,x3,x4,x5 but not x1 +vg paths -x graphs/path_norm_test.gfa -n -Q x2 > norm_x2.gfa +vg validate norm_x2.gfa +is $? 0 "path normalizer produces valid graph" + +vg paths -x graphs/path_norm_test.gfa -F | sort > original.fa +vg paths -x norm_x2.gfa -F | sort > norm_x2.fa +diff original.fa norm_x2.fa +is $? 0 "path normalizer doesnt alter path sequences" + +grep x2 norm_x2.gfa | awk '{print $3}' > x2.path +grep x2 norm_x2.gfa | awk '{print $3}' >> x2.path +grep x3 norm_x2.gfa | awk '{print $3}'> x2.norm.path +grep x5 norm_x2.gfa | awk '{print $3}' >> x2.norm.path +diff x2.path x2.norm.path +is $? 0 "path normalizere correctly snapped all equivalent paths to x2" + +rm -f norm_x2.gfa original.fa norm_x2.fa x2.path x2.norm.path + +vg paths -x graphs/path_norm_test.gfa -n -Q x4 > norm_x4.gfa +vg validate norm_x4.gfa +is $? 0 "path normalizer produces valid graph" + +vg paths -x graphs/path_norm_test.gfa -F | sort > original.fa +vg paths -x norm_x4.gfa -F | sort > norm_x4.fa +diff original.fa norm_x4.fa +is $? 0 "path normalizer doesnt alter path sequences" + +# note: x3 is x4 in reverse, so we key on that +grep x3 norm_x2.gfa | awk '{print $3}' > x4.path +grep x3 norm_x2.gfa | awk '{print $3}' >> x4.path +grep x3 norm_x2.gfa | awk '{print $3}'> x4.norm.path +grep x5 norm_x2.gfa | awk '{print $3}' >> x4.norm.path +diff x4.path x4.norm.path +is $? 0 "path normalizere correctly snapped all equivalent paths to x4" + +rm -f norm_x4.gfa original.fa norm_x4.fa x4.path x4.norm.path + + + + + + From d966fa746f5f4c58a2232134894d22bf8570f32c Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Wed, 11 Sep 2024 16:35:29 -0400 Subject: [PATCH 5/5] Forgotten test file --- src/traversal_clusters.cpp | 13 +++++----- test/graphs/path_norm_test.gfa | 47 ++++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+), 6 deletions(-) create mode 100644 test/graphs/path_norm_test.gfa diff --git a/src/traversal_clusters.cpp b/src/traversal_clusters.cpp index 81d8e8fe46e..7e361c84f6a 100644 --- a/src/traversal_clusters.cpp +++ b/src/traversal_clusters.cpp @@ -294,7 +294,10 @@ static void merge_equivalent_traversals_in_snarl(MutablePathHandleGraph* graph, vector trav_paths(path_travs.size()); vector trav_reversed(path_travs.size()); - +#ifdef debug + cerr << "snarl " << graph_interval_to_string(graph, start_handle, end_handle) << endl; +#endif + // organize paths by their alleles strings // todo: we could use a fancier index to save memory here (prob not necessary tho) unordered_map> string_to_travs; @@ -310,8 +313,7 @@ static void merge_equivalent_traversals_in_snarl(MutablePathHandleGraph* graph, trav_reversed[i] = graph->get_is_reverse(graph->get_handle_of_step(path_intervals[i].first)) != graph->get_is_reverse(start_handle); #ifdef debug - cerr << "snarl " << graph_interval_to_string(graph, start_handle, end_handle) << endl - << "trav " << i << ": " + cerr << "trav " << i << ": " << "n=" << trav_names[i] << " " << "i=" << graph_interval_to_string(graph, graph->get_handle_of_step(path_intervals[i].first), graph->get_handle_of_step(path_intervals[i].second)) @@ -360,9 +362,8 @@ static void merge_equivalent_traversals_in_snarl(MutablePathHandleGraph* graph, cerr << "editing interval of size " << path_travs[replace_idx].size() << " from path " << trav_names[replace_idx] << " with canonical interval of size " << replacement_trav.size() << " from path " - << trav_names[canonical_idx] << endl; - - cerr << "--interval to replace: " + << trav_names[canonical_idx] << endl + << "--interval to replace: " << graph_interval_to_string(graph, graph->get_handle_of_step(interval_to_replace.first), graph->get_handle_of_step(interval_to_replace.second)) << endl << "--interval coming in: " << traversal_to_string(graph, replacement_trav) << endl; diff --git a/test/graphs/path_norm_test.gfa b/test/graphs/path_norm_test.gfa new file mode 100644 index 00000000000..84d905e0e51 --- /dev/null +++ b/test/graphs/path_norm_test.gfa @@ -0,0 +1,47 @@ +H VN:Z:1.1 +S 5 C +S 7 A +S 12 ATAT +S 8 G +S 1 CAAATAAG +S 4 T +S 6 TTG +S 15 CCAACTCTCTG +S 2 A +S 35 GT +S 53 AC +S 10 A +S 9 AAATTTTCTGGAGTTCTAT +S 11 T +S 13 A +S 14 T +S 3 G +P x1 1+,3+,5+,6+,8+,9+,11+,12+,14+,15+ * +P x2 1+,3+,4+,6+,8+,9+,11+,12+,14+,15+ * +P x3 1+,35+,6+,8+,9+,11+,12+,14+,15+ * +P x4 15-,14-,12-,11-,9-,8-,6-,35-,1- * +P x5 1+,53-,6+,8+,9+,11+,12+,14+,15+ * +L 5 + 6 + 0M +L 7 + 9 + 0M +L 12 + 13 + 0M +L 12 + 14 + 0M +L 8 + 9 + 0M +L 1 + 2 + 0M +L 1 + 3 + 0M +L 1 + 35 + 0M +L 1 + 53 - 0M +L 4 + 6 + 0M +L 35 + 6 + 0M +L 53 - 6 + 0M +L 6 + 7 + 0M +L 6 + 8 + 0M +L 2 + 4 + 0M +L 2 + 5 + 0M +L 10 + 12 + 0M +L 9 + 10 + 0M +L 9 + 11 + 0M +L 11 + 12 + 0M +L 13 + 15 + 0M +L 14 + 15 + 0M +L 3 + 4 + 0M +L 3 + 5 + 0M