diff --git a/src/algorithms/coverage_depth.cpp b/src/algorithms/coverage_depth.cpp index c92471348da..d49e1880bf5 100644 --- a/src/algorithms/coverage_depth.cpp +++ b/src/algorithms/coverage_depth.cpp @@ -295,7 +295,8 @@ pair sample_gam_depth(const HandleGraph& graph, const vector& ignore_paths, ostream& out_stream) { assert(graph.has_path(path_name)); path_handle_t path_handle = graph.get_path_handle(path_name); @@ -311,10 +312,14 @@ void path_depths(const PathHandleGraph& graph, const string& path_name, size_t m size_t step_count = 0; handle_t handle = graph.get_handle_of_step(step_handle); graph.for_each_step_on_handle(handle, [&](step_handle_t step_handle_2) { + path_handle_t step_path_handle = graph.get_path_handle_of_step(step_handle_2); + if (ignore_paths.count(step_path_handle)) { + return true; + } if (count_cycles) { ++step_count; } else { - path_handle_t step_path_handle = graph.get_path_handle_of_step(step_handle_2); + auto it = path_to_name.find(step_path_handle); if (it == path_to_name.end()) { string step_path_name = graph.get_path_name(step_path_handle); @@ -323,6 +328,7 @@ void path_depths(const PathHandleGraph& graph, const string& path_name, size_t m } path_set.insert(it->second); } + return true; }); size_t coverage = (count_cycles ? step_count : path_set.size()) - 1; size_t node_len = graph.get_length(handle); @@ -339,7 +345,7 @@ void path_depths(const PathHandleGraph& graph, const string& path_name, size_t m pair path_depth_of_bin(const PathHandleGraph& graph, step_handle_t start_step, step_handle_t end_plus_one_step, - size_t min_coverage, bool count_cycles) { + size_t min_coverage, bool count_cycles, const unordered_set& ignore_paths) { // compute the mean and variance of our base coverage across the bin size_t bin_length = 0; @@ -357,10 +363,13 @@ pair path_depth_of_bin(const PathHandleGraph& graph, unordered_set path_set; size_t step_count = 0; graph.for_each_step_on_handle(cur_handle, [&](step_handle_t step_handle) { + path_handle_t step_path_handle = graph.get_path_handle_of_step(step_handle); + if (ignore_paths.count(step_path_handle)) { + return true; + } if (count_cycles) { ++step_count; } else { - path_handle_t step_path_handle = graph.get_path_handle_of_step(step_handle); auto it = path_to_name.find(step_path_handle); if (it == path_to_name.end()) { string step_path_name = graph.get_path_name(step_path_handle); @@ -368,7 +377,8 @@ pair path_depth_of_bin(const PathHandleGraph& graph, it = path_to_name.insert(make_pair(step_path_handle, Paths::strip_subrange(step_path_name))).first; } path_set.insert(it->second); - } + } + return true; }); size_t coverage = (count_cycles ? step_count : path_set.size()) - 1; @@ -386,7 +396,8 @@ vector> binned_path_depth(const PathHandle const string& path_name, size_t bin_size, size_t min_coverage, - bool count_cycles) { + bool count_cycles, + const unordered_set& ignore_paths) { path_handle_t path_handle = graph.get_path_handle(path_name); @@ -414,7 +425,8 @@ vector> binned_path_depth(const PathHandle step_handle_t bin_end_step = i < bins.size() - 1 ? bins[i+1].second : end_step; size_t bin_start = bins[i].first; size_t bin_end = i < bins.size() - 1 ? bins[i+1].first : offset; - pair coverage = path_depth_of_bin(graph, bin_start_step, bin_end_step, min_coverage, count_cycles); + pair coverage = path_depth_of_bin(graph, bin_start_step, bin_end_step, min_coverage, count_cycles, + ignore_paths); binned_depths[i] = make_tuple(bin_start, bin_end, coverage.first, coverage.second); } diff --git a/src/algorithms/coverage_depth.hpp b/src/algorithms/coverage_depth.hpp index 021cc57fa52..9cb7af0a1a6 100644 --- a/src/algorithms/coverage_depth.hpp +++ b/src/algorithms/coverage_depth.hpp @@ -67,14 +67,16 @@ pair sample_mapping_depth(const HandleGraph& graph, const vector /// print path-name offset base-coverage for every base on a path (just like samtools depth) /// ignoring things below min_coverage. offsets are 1-based in output stream /// coverage here is the number of steps from (unique) other paths -void path_depths(const PathHandleGraph& graph, const string& path_name, size_t min_coverage, bool count_cycles, ostream& out_stream); +void path_depths(const PathHandleGraph& graph, const string& path_name, size_t min_coverage, bool count_cycles, + const unordered_set& ignore_paths, ostream& out_stream); /// like packed_depth_of_bin (above), but use paths (as in path_depths) for measuring coverage pair path_depth_of_bin(const PathHandleGraph& graph, step_handle_t start_step, step_handle_t end_plus_one_step, - size_t min_coverage, bool count_cycles); + size_t min_coverage, bool count_cycles, const unordered_set& ignore_paths); -vector> binned_path_depth(const PathHandleGraph& graph, const string& path_name, size_t bin_size, - size_t min_coverage, bool count_cycles); +vector> binned_path_depth(const PathHandleGraph& graph, const string& path_name, + size_t bin_size, size_t min_coverage, bool count_cycles, + const unordered_set& ignore_paths); } } diff --git a/src/algorithms/gfa_to_handle.cpp b/src/algorithms/gfa_to_handle.cpp index 65f19320676..d58479cdc72 100644 --- a/src/algorithms/gfa_to_handle.cpp +++ b/src/algorithms/gfa_to_handle.cpp @@ -1,6 +1,6 @@ #include "gfa_to_handle.hpp" #include "../path.hpp" - +#include "../rgfa.hpp" #include namespace vg { @@ -291,7 +291,6 @@ static void add_path_listeners(GFAParser& parser, MutablePathMutableHandleGraph* } if (found == rgfa_cache->end()) { // Need to make a new path, possibly with subrange start info. - std::pair subrange; if (offset == 0) { // Don't send a subrange @@ -300,14 +299,22 @@ static void add_path_listeners(GFAParser& parser, MutablePathMutableHandleGraph* // Start later than 0 subrange = std::pair(offset, PathMetadata::NO_END_POSITION); } + + string rgfa_path_name; + if (path_rank > 0) { + // Special logic for off-reference paths, which get loaded into special rGFA cover paths + rgfa_path_name = RGFACover::make_rgfa_path_name(path_name, offset, length, false); + } else { + // TODO: See if we can split up the path name into a sample/haplotype/etc. to give it a ref sense. + rgfa_path_name = PathMetadata::create_path_name(PathSense::GENERIC, + PathMetadata::NO_SAMPLE_NAME, + path_name, + PathMetadata::NO_HAPLOTYPE, + PathMetadata::NO_PHASE_BLOCK, + subrange); + } + path_handle_t path = graph->create_path_handle(rgfa_path_name); - // TODO: See if we can split up the path name into a sample/haplotype/etc. to give it a ref sense. - path_handle_t path = graph->create_path(PathSense::GENERIC, - PathMetadata::NO_SAMPLE_NAME, - path_name, - PathMetadata::NO_HAPLOTYPE, - PathMetadata::NO_PHASE_BLOCK, - subrange); // Then cache it found = rgfa_cache->emplace_hint(found, path_name, std::make_pair(path, offset)); } @@ -667,10 +674,6 @@ tuple, GFAParser::chars_t, GFAPar // Grab the haplotype number int64_t haplotype_number = take_number(cursor, end, -1, "parsing haplotype number"); - if (haplotype_number == -1) { - // This field is required - throw GFAFormatError("Missing haplotype number in W line", cursor); - } take_tab(cursor, end, "parsing end of haplotype number"); // Grab the sequence/contig/locus name @@ -1041,7 +1044,13 @@ void GFAParser::parse(istream& in) { } else { // This path existed already. Make sure we aren't showing a conflicting rank if (rgfa_path_rank != get<0>(found->second)) { - throw GFAFormatError("rGFA path " + rgfa_path_name + " has conflicting ranks " + std::to_string(rgfa_path_rank) + " and " + std::to_string(get<0>(found->second))); + // vg paths can now write different fragments of the same sample/hap with different ranks + // (unlike minigraph where all fragments of teh same assembly share a rank) + // by consequence, this check needs to be disabled. + // TODO: we still want to check that each individual fragment has consisten ranks + // so the check should be moved downstream + // + // throw GFAFormatError("rGFA path " + rgfa_path_name + " has conflicting ranks " + std::to_string(rgfa_path_rank) + " and " + std::to_string(get<0>(found->second))); } } auto& visit_queue = get<2>(found->second); diff --git a/src/clip.cpp b/src/clip.cpp index 0d2c5efd78d..45e0e5367fc 100644 --- a/src/clip.cpp +++ b/src/clip.cpp @@ -481,7 +481,8 @@ void clip_low_depth_nodes_and_edges_generic(MutablePathMutableHandleGraph* graph function)> iterate_handles, function)> iterate_edges, int64_t min_depth, const vector& ref_prefixes, - int64_t min_fragment_len, bool verbose) { + int64_t min_fragment_len, const unordered_set& ignore_paths, + bool verbose) { // find all nodes in the snarl that are not on the reference interval (reference path name from containing interval) unordered_set to_delete; @@ -502,6 +503,10 @@ void clip_low_depth_nodes_and_edges_generic(MutablePathMutableHandleGraph* graph bool on_ref = false; size_t depth = 0; graph->for_each_step_on_handle(handle, [&](step_handle_t step_handle) { + path_handle_t path_handle = graph->get_path_handle_of_step(step_handle); + if (ignore_paths.count(path_handle)) { + return true; + } ++depth; if (depth > min_depth || on_ref) { return false; @@ -509,7 +514,7 @@ void clip_low_depth_nodes_and_edges_generic(MutablePathMutableHandleGraph* graph if (!ref_prefixes.empty() || region) { // if we have a region, do exact comparison to it. // otherwise, do a prefix check against ref_prefix - string path_name = graph->get_path_name(graph->get_path_handle_of_step(step_handle)); + string path_name = graph->get_path_name(path_handle); if ((region && region->seq == path_name) || (!region && check_prefixes(path_name))) { on_ref = true; return false; @@ -540,6 +545,7 @@ void clip_low_depth_nodes_and_edges_generic(MutablePathMutableHandleGraph* graph edge_depths.resize(edge_count + 1); graph->for_each_path_handle([&](path_handle_t path_handle) { + if (!ignore_paths.count(path_handle)) { bool is_ref_path = check_prefixes(graph->get_path_name(path_handle)); handle_t prev_handle; bool first = true; @@ -563,6 +569,7 @@ void clip_low_depth_nodes_and_edges_generic(MutablePathMutableHandleGraph* graph } prev_handle = handle; }); + } }); unordered_set edges_to_delete; @@ -622,7 +629,7 @@ void clip_low_depth_nodes_and_edges_generic(MutablePathMutableHandleGraph* graph } void clip_low_depth_nodes_and_edges(MutablePathMutableHandleGraph* graph, int64_t min_depth, const vector& ref_prefixes, - int64_t min_fragment_len, bool verbose) { + int64_t min_fragment_len, const unordered_set& ignore_paths, bool verbose) { function)> iterate_handles = [&] (function visit_handle) { graph->for_each_handle([&](handle_t handle) { @@ -636,11 +643,13 @@ void clip_low_depth_nodes_and_edges(MutablePathMutableHandleGraph* graph, int64_ }); }; - clip_low_depth_nodes_and_edges_generic(graph, iterate_handles, iterate_edges, min_depth, ref_prefixes, min_fragment_len, verbose); + clip_low_depth_nodes_and_edges_generic(graph, iterate_handles, iterate_edges, min_depth, ref_prefixes, min_fragment_len, + ignore_paths, verbose); } void clip_contained_low_depth_nodes_and_edges(MutablePathMutableHandleGraph* graph, PathPositionHandleGraph* pp_graph, const vector& regions, - SnarlManager& snarl_manager, bool include_endpoints, int64_t min_depth, int64_t min_fragment_len, bool verbose) { + SnarlManager& snarl_manager, bool include_endpoints, int64_t min_depth, int64_t min_fragment_len, + const unordered_set& ignore_paths, bool verbose) { function)> iterate_handles = [&] (function visit_handle) { @@ -678,7 +687,7 @@ void clip_contained_low_depth_nodes_and_edges(MutablePathMutableHandleGraph* gra } vector ref_paths_from_regions(ref_path_set.begin(), ref_path_set.end()); - clip_low_depth_nodes_and_edges_generic(graph, iterate_handles, iterate_edges, min_depth, ref_paths_from_regions, min_fragment_len, verbose); + clip_low_depth_nodes_and_edges_generic(graph, iterate_handles, iterate_edges, min_depth, ref_paths_from_regions, min_fragment_len, ignore_paths, verbose); } diff --git a/src/clip.hpp b/src/clip.hpp index e57a260b98c..4c1294aad71 100644 --- a/src/clip.hpp +++ b/src/clip.hpp @@ -64,19 +64,22 @@ void clip_low_depth_nodes_and_edges_generic(MutablePathMutableHandleGraph* graph function)> iterate_handles, function)> iterate_edges, int64_t min_depth, const vector& ref_prefixes, - int64_t min_fragment_len, bool verbose); + int64_t min_fragment_len, const unordered_set& ignore_paths, + bool verbose); /** * Run above function on graph */ void clip_low_depth_nodes_and_edges(MutablePathMutableHandleGraph* graph, int64_t min_depth, const vector& ref_prefixes, - int64_t min_fragment_len, bool verbose); + int64_t min_fragment_len, const unordered_set& ignore_paths, bool verbose); /** * Or on contained snarls */ -void clip_contained_low_depth_nodes_and_edges(MutablePathMutableHandleGraph* graph, PathPositionHandleGraph* pp_graph, const vector& regions, - SnarlManager& snarl_manager, bool include_endpoints, int64_t min_depth, int64_t min_fragment_len, bool verbose); +void clip_contained_low_depth_nodes_and_edges(MutablePathMutableHandleGraph* graph, PathPositionHandleGraph* pp_graph, + const vector& regions, SnarlManager& snarl_manager, bool include_endpoints, + int64_t min_depth, int64_t min_fragment_len, + const unordered_set& ignore_paths, bool verbose); /** * clip out deletion edges diff --git a/src/deconstructor.cpp b/src/deconstructor.cpp index 118d9df45a5..cd61c721c7c 100644 --- a/src/deconstructor.cpp +++ b/src/deconstructor.cpp @@ -1,6 +1,7 @@ #include "deconstructor.hpp" #include "traversal_finder.hpp" #include +#include "rgfa.hpp" //#define debug @@ -611,7 +612,8 @@ bool Deconstructor::deconstruct_site(const Snarl* snarl) const { } #endif if (ref_paths.count(path_trav_name) && - (ref_trav_name.empty() || path_trav_name < ref_trav_name)) { + (ref_trav_name.empty() || path_trav_name < ref_trav_name || + (RGFACover::is_rgfa_path_name(ref_trav_name) && !RGFACover::is_rgfa_path_name(path_trav_name)))) { ref_trav_name = path_trav_name; #ifdef debug #pragma omp critical (cerr) @@ -792,22 +794,24 @@ bool Deconstructor::deconstruct_site(const Snarl* snarl) const { const SnarlTraversal& ref_trav = path_travs.first[ref_trav_idx]; + string fixed_trav_name = RGFACover::revert_rgfa_path_name(ref_trav_name, true); + vcflib::Variant v; v.quality = 60; // in VCF we usually just want the contig - string contig_name = PathMetadata::parse_locus_name(ref_trav_name); + string contig_name = PathMetadata::parse_locus_name(fixed_trav_name); if (contig_name == PathMetadata::NO_LOCUS_NAME) { - contig_name = ref_trav_name; + contig_name = fixed_trav_name; } else if (long_ref_contig) { // the sample name isn't unique enough, so put a full ugly name in the vcf - if (PathMetadata::parse_sense(ref_trav_name) == PathSense::GENERIC) { - contig_name = ref_trav_name; + if (PathMetadata::parse_sense(fixed_trav_name) == PathSense::GENERIC) { + contig_name = fixed_trav_name; } else { contig_name = PathMetadata::create_path_name(PathSense::REFERENCE, - PathMetadata::parse_sample_name(ref_trav_name), + PathMetadata::parse_sample_name(fixed_trav_name), contig_name, - PathMetadata::parse_haplotype(ref_trav_name), + PathMetadata::parse_haplotype(fixed_trav_name), PathMetadata::NO_PHASE_BLOCK, PathMetadata::NO_SUBRANGE); } @@ -939,13 +943,14 @@ void Deconstructor::deconstruct(vector ref_paths, const PathPositionHand graph->for_each_path_handle([&](const path_handle_t& path_handle) { string path_name = graph->get_path_name(path_handle); if (!this->ref_paths.count(path_name)) { - string sample_name = graph->get_sample_name(path_handle); + path_name = RGFACover::revert_rgfa_path_name(path_name, true); + string sample_name = PathMetadata::parse_sample_name(path_name); // for backward compatibility if (sample_name == PathMetadata::NO_SAMPLE_NAME) { sample_name = path_name; } if (!ref_samples.count(sample_name)) { - size_t haplotype = graph->get_haplotype(path_handle); + size_t haplotype = PathMetadata::parse_haplotype(path_name); if (haplotype == PathMetadata::NO_HAPLOTYPE) { haplotype = 0; } @@ -1032,18 +1037,19 @@ void Deconstructor::deconstruct(vector ref_paths, const PathPositionHand for (handle_t handle : graph->scan_path(path_handle)) { path_len += graph->get_length(handle); } - string locus_name = graph->get_locus_name(path_handle); + string fixed_refpath = RGFACover::revert_rgfa_path_name(refpath, true); + string locus_name = PathMetadata::parse_locus_name(fixed_refpath); if (locus_name == PathMetadata::NO_LOCUS_NAME) { - locus_name = refpath; + locus_name = fixed_refpath; } else if (long_ref_contig) { // the sample name isn't unique enough, so put a full ugly name in the vcf - if (graph->get_sense(path_handle) == PathSense::GENERIC) { - locus_name = graph->get_path_name(path_handle); + if (PathMetadata::parse_sense(fixed_refpath) == PathSense::GENERIC) { + locus_name = fixed_refpath; } else { locus_name = PathMetadata::create_path_name(PathSense::REFERENCE, - graph->get_sample_name(path_handle), + PathMetadata::parse_sample_name(fixed_refpath), locus_name, - graph->get_haplotype(path_handle), + PathMetadata::parse_haplotype(fixed_refpath), PathMetadata::NO_PHASE_BLOCK, PathMetadata::NO_SUBRANGE); } diff --git a/src/gfa.cpp b/src/gfa.cpp index 762ddd1bc6a..76dafde5a8f 100644 --- a/src/gfa.cpp +++ b/src/gfa.cpp @@ -1,6 +1,7 @@ #include "gfa.hpp" #include "utility.hpp" #include "path.hpp" +#include "rgfa.hpp" #include #include @@ -18,7 +19,16 @@ static void write_w_line(const PathHandleGraph* graph, ostream& out, path_handle void graph_to_gfa(const PathHandleGraph* graph, ostream& out, const set& rgfa_paths, bool rgfa_pline, bool use_w_lines) { - + + RGFACover rgfa_cover; + if (!rgfa_paths.empty()) { + // index the rgfa cover in the graph (combination of rank-0 rgfa_paths and rank-1 paths with rgfa sample name) + unordered_set rgfa_path_handles; + for (const string& rgfa_path_name : rgfa_paths) { + rgfa_path_handles.insert(graph->get_path_handle(rgfa_path_name)); + } + rgfa_cover.load(graph, rgfa_path_handles); + } // TODO: Support sorting nodes, paths, and/or edges for canonical output // TODO: Use a NamedNodeBackTranslation (or forward translation?) to properly round-trip GFA that has had to be chopped. @@ -40,12 +50,18 @@ void graph_to_gfa(const PathHandleGraph* graph, ostream& out, const set& } out << "\n"; - //Compute the rGFA tags of given paths (todo: support non-zero ranks) + //Compute the rGFA tags of given paths. These paths are the rank-0 reference paths (passed in rgfa_paths set) + //along with paths that have the special rgfa sample name (rank>0) paths. unordered_map> node_offsets; - for (const string& path_name : rgfa_paths) { - path_handle_t path_handle = graph->get_path_handle(path_name); - size_t offset = 0; - graph->for_each_step_in_path(path_handle, [&](step_handle_t step_handle) { + graph->for_each_path_handle([&](path_handle_t path_handle) { + string path_name = graph->get_path_name(path_handle); + if (rgfa_paths.count(path_name) || (!rgfa_paths.empty() && RGFACover::is_rgfa_path_name(path_name))) { + size_t offset = 0; + subrange_t path_subrange = graph->get_subrange(path_handle); + if (path_subrange != PathMetadata::NO_SUBRANGE) { + offset = path_subrange.first; + } + graph->for_each_step_in_path(path_handle, [&](step_handle_t step_handle) { handle_t handle = graph->get_handle_of_step(step_handle); nid_t node_id = graph->get_id(handle); if (graph->get_is_reverse(handle)) { @@ -62,7 +78,8 @@ void graph_to_gfa(const PathHandleGraph* graph, ostream& out, const set& } offset += graph->get_length(handle); }); - } + } + }); //Go through each node in the graph graph->for_each_handle([&](const handle_t& h) { @@ -73,9 +90,20 @@ void graph_to_gfa(const PathHandleGraph* graph, ostream& out, const set& auto it = node_offsets.find(node_id); if (it != node_offsets.end()) { // add rGFA tags - out << "\t" << "SN:Z:" << graph->get_path_name(it->second.first) + string sample_name = graph->get_sample_name(it->second.first); + string locus_name = graph->get_locus_name(it->second.first); + int64_t haplotype = graph->get_haplotype(it->second.first); + if (!rgfa_paths.empty() && RGFACover::is_rgfa_path_name(graph->get_path_name(it->second.first))) { + std::tie(sample_name, locus_name) = RGFACover::parse_rgfa_locus_name(locus_name); + } + string rgfa_sn = PathMetadata::create_path_name(sample_name.empty() ? PathSense::GENERIC : PathSense::REFERENCE, + sample_name, locus_name, + sample_name.empty() ? PathMetadata::NO_HAPLOTYPE : haplotype, + PathMetadata::NO_PHASE_BLOCK, + PathMetadata::NO_SUBRANGE); + out << "\t" << "SN:Z:" << rgfa_sn << "\t" << "SO:i:" << it->second.second - << "\t" << "SR:i:0"; // todo: support non-zero ranks? + << "\t" << "SR:i:" << rgfa_cover.get_rank(node_id); } out << "\n"; // Writing `std::endl` would flush the buffer. return true; @@ -106,7 +134,8 @@ void graph_to_gfa(const PathHandleGraph* graph, ostream& out, const set& // Paths as P-lines for (const path_handle_t& h : path_handles) { auto path_name = graph->get_path_name(h); - if (rgfa_pline || !rgfa_paths.count(path_name)) { + if (rgfa_pline || rgfa_paths.empty() || + (!rgfa_paths.count(path_name) && !RGFACover::is_rgfa_path_name(path_name))) { if (graph->get_sense(h) != PathSense::REFERENCE && reference_samples.count(graph->get_sample_name(h))) { // We have a mix of reference and non-reference paths on the same sample which GFA can't handle. cerr << "warning [gfa]: path " << path_name << " will be interpreted as reference sense " diff --git a/src/graph_caller.cpp b/src/graph_caller.cpp index 5dd250282c5..fd09a799224 100644 --- a/src/graph_caller.cpp +++ b/src/graph_caller.cpp @@ -1,6 +1,7 @@ #include "graph_caller.hpp" #include "algorithms/expand_context.hpp" #include "annotation.hpp" +#include "rgfa.hpp" //#define debug @@ -22,26 +23,28 @@ void GraphCaller::call_top_level_snarls(const HandleGraph& graph, RecurseType re // Used to recurse on children of parents that can't be called size_t thread_count = get_thread_count(); - vector> snarl_queue(thread_count); + vector>> snarl_queue(thread_count); std::atomic top_snarl_count(0); std::atomic nested_snarl_count(0); bool top_level = true; // Run the snarl caller on a snarl, and queue up the children if it fails - auto process_snarl = [&](const Snarl* snarl) { + auto process_snarl = [&](const Snarl* snarl, int ploidy_override) { if (!snarl_manager.is_trivial(snarl, graph)) { #ifdef debug cerr << "GraphCaller running call_snarl on " << pb2json(*snarl) << endl; #endif - - bool was_called = call_snarl(*snarl); - if (recurse_type == RecurseAlways || (!was_called && recurse_type == RecurseOnFail)) { - const vector& children = snarl_manager.children_of(snarl); - vector& thread_queue = snarl_queue[omp_get_thread_num()]; - thread_queue.insert(thread_queue.end(), children.begin(), children.end()); + const vector& children = snarl_manager.children_of(snarl); + vector child_ploidies(children.size(), -1); + bool was_called = call_snarl(*snarl, ploidy_override, &child_ploidies); + if (recurse_type == RecurseAlways || (!was_called && recurse_type == RecurseOnFail)) { + vector>& thread_queue = snarl_queue[omp_get_thread_num()]; + for (int64_t i = 0; i < children.size(); ++i) { + thread_queue.push_back(make_pair(children[i], child_ploidies[i])); + } } if (show_progress) { @@ -63,16 +66,24 @@ void GraphCaller::call_top_level_snarls(const HandleGraph& graph, RecurseType re }; // Start with the top level snarls - snarl_manager.for_each_top_level_snarl_parallel(process_snarl); + // Queue them up since process_snarl is no longer a valid callback for the iterator snarl_manager.for_each_top_level_snarl() + vector top_level_snarls; + snarl_manager.for_each_top_level_snarl([&](const Snarl* snarl) { + top_level_snarls.push_back(snarl); + }); +#pragma omp parallel for schedule(dynamic, 1) + for (int64_t i = 0; i < top_level_snarls.size(); ++i) { + process_snarl(top_level_snarls[i], -1); + } if (show_progress) cerr << "[vg call]: Finished processing " << top_snarl_count << " top-level snarls" << endl; top_level = false; // Then recurse on any children the snarl caller failed to handle while (!std::all_of(snarl_queue.begin(), snarl_queue.end(), - [](const vector& snarl_vec) {return snarl_vec.empty();})) { - vector cur_queue; - for (vector& thread_queue : snarl_queue) { + [](const vector>& snarl_vec) {return snarl_vec.empty();})) { + vector> cur_queue; + for (vector>& thread_queue : snarl_queue) { cur_queue.reserve(cur_queue.size() + thread_queue.size()); std::move(thread_queue.begin(), thread_queue.end(), std::back_inserter(cur_queue)); thread_queue.clear(); @@ -80,7 +91,7 @@ void GraphCaller::call_top_level_snarls(const HandleGraph& graph, RecurseType re #pragma omp parallel for schedule(dynamic, 1) for (int i = 0; i < cur_queue.size(); ++i) { - process_snarl(cur_queue[i]); + process_snarl(cur_queue[i].first, cur_queue[i].second); } } if (show_progress && nested_snarl_count > 0) cerr << "[vg call]: Finished processing " << nested_snarl_count << " nested snarls" << endl; @@ -203,8 +214,48 @@ vector GraphCaller::break_chain(const HandleGraph& graph, const Chain& ch return chain_frags; } + + +void GraphCaller::resolve_child_ploidies(const HandleGraph& graph, const Snarl* snarl, const vector& travs, + const vector& genotype, vector& child_ploidies) { + const vector& children = snarl_manager.children_of(snarl); + + child_ploidies.resize(children.size()); -VCFOutputCaller::VCFOutputCaller(const string& sample_name) : sample_name(sample_name), translation(nullptr), include_nested(false) + if (!children.empty()) { + // index the nodes in the traversals + vector> trav_indexes(genotype.size()); + for (int64_t i = 0; i < genotype.size(); ++i) { + const SnarlTraversal& trav = travs[genotype[i]]; + unordered_set& trav_idx = trav_indexes[i]; + if (i > 0 && genotype[i] == genotype[i-1]) { + trav_idx = trav_indexes[i-1]; + } else { + for (int j = 0; j < trav.visit_size(); ++j) { + trav_idx.insert(graph.get_handle(trav.visit(j).node_id(), trav.visit(j).backward())); + } + } + } + + // for every child snarl, count the number of traversals that spans it -- this will be its ploidy + for (int64_t i = 0; i < children.size(); ++i) { + const Snarl* child_snarl = children[i]; + child_ploidies[i] = 0; + handle_t child_start = graph.get_handle(child_snarl->start().node_id(), child_snarl->start().backward()); + handle_t child_end = graph.get_handle(child_snarl->end().node_id(), child_snarl->end().backward()); + for (int64_t j = 0; j < trav_indexes.size(); ++j) { + unordered_set& trav_idx = trav_indexes[j]; + if ((trav_idx.count(child_start) && trav_idx.count(child_end)) || + (trav_idx.count(graph.flip(child_start)) && trav_idx.count(graph.flip(child_end)))) { + ++child_ploidies[i]; + } + } + } + } +} + + +VCFOutputCaller::VCFOutputCaller(const string& sample_name) : sample_name(sample_name), translation(nullptr), include_nested(false), write_full_names(false) { output_variants.resize(get_thread_count()); } @@ -519,11 +570,11 @@ void VCFOutputCaller::emit_variant(const PathPositionHandleGraph& graph, SnarlCa // resolve subpath naming subrange_t subrange; - string basepath_name = Paths::strip_subrange(ref_path_name, &subrange); + string basepath_name = Paths::strip_subrange(RGFACover::revert_rgfa_path_name(ref_path_name), &subrange); size_t basepath_offset = subrange == PathMetadata::NO_SUBRANGE ? 0 : subrange.first; // in VCF we usually just want a contig string contig_name = PathMetadata::parse_locus_name(basepath_name); - if (contig_name != PathMetadata::NO_LOCUS_NAME) { + if (contig_name != PathMetadata::NO_LOCUS_NAME && !this->write_full_names) { basepath_name = contig_name; } // fill out the rest of the variant @@ -794,6 +845,20 @@ void VCFOutputCaller::scan_snarl(const string& allele_string, function& ref_paths) { + set samples; + set haplotypes; + for (const string& path_name : ref_paths) { + samples.insert(PathMetadata::parse_sample_name(path_name)); + haplotypes.insert(PathMetadata::parse_haplotype(path_name)); + if (samples.size() > 1 || haplotypes.size() > 1) { + this->write_full_names = true; + return; + } + } + this->write_full_names = false; +} + GAFOutputCaller::GAFOutputCaller(AlignmentEmitter* emitter, const string& sample_name, const vector& ref_paths, size_t trav_padding) : emitter(emitter), @@ -1082,7 +1147,7 @@ VCFGenotyper::~VCFGenotyper() { } -bool VCFGenotyper::call_snarl(const Snarl& snarl) { +bool VCFGenotyper::call_snarl(const Snarl& snarl, int ploidy_override, vector* out_child_ploidies) { // could be that our graph is a subgraph of the graph the snarls were computed from // so bypass snarls we can't process @@ -1324,6 +1389,8 @@ LegacyCaller::LegacyCaller(const PathPositionHandleGraph& graph, // our graph is not in vg format. we will make graphs for each site as needed and work with those traversal_finder = nullptr; } + + this->toggle_full_names_from_paths(ref_paths); } LegacyCaller::~LegacyCaller() { @@ -1333,7 +1400,7 @@ LegacyCaller::~LegacyCaller() { } } -bool LegacyCaller::call_snarl(const Snarl& snarl) { +bool LegacyCaller::call_snarl(const Snarl& snarl, int ploidy_override, vector* out_child_ploidies) { // if we can't handle the snarl, then the GraphCaller framework will recurse on its children if (!is_traversable(snarl)) { @@ -1658,13 +1725,14 @@ FlowCaller::FlowCaller(const PathPositionHandleGraph& graph, ref_ploidies[ref_paths[i]] = i < ref_path_ploidies.size() ? ref_path_ploidies[i] : 2; } + this->toggle_full_names_from_paths(ref_paths); } FlowCaller::~FlowCaller() { } -bool FlowCaller::call_snarl(const Snarl& managed_snarl) { +bool FlowCaller::call_snarl(const Snarl& managed_snarl, int ploidy_override, vector* out_child_ploidies) { // todo: In order to experiment with merging consecutive snarls to make longer traversals, // I am experimenting with sending "fake" snarls through this code. So make a local @@ -1672,6 +1740,11 @@ bool FlowCaller::call_snarl(const Snarl& managed_snarl) { // wants a pointer will crash. Snarl snarl = managed_snarl; + if (ploidy_override == 0) { + // returning true is a bit ironic, but we do *not* want to recurse so it's important we do + return true; + } + if (snarl.start().node_id() == snarl.end().node_id() || !graph.has_node(snarl.start().node_id()) || !graph.has_node(snarl.end().node_id())) { // can't call one-node or out-of graph snarls. @@ -1838,10 +1911,14 @@ bool FlowCaller::call_snarl(const Snarl& managed_snarl) { // use our support caller to choose our genotype vector trav_genotype; unique_ptr trav_call_info; - int ploidy = ref_ploidies[ref_path_name]; + int ploidy = ploidy_override == -1 ? ref_ploidies[ref_path_name] : ploidy_override; std::tie(trav_genotype, trav_call_info) = snarl_caller.genotype(snarl, travs, ref_trav_idx, ploidy, ref_path_name, make_pair(get<0>(ref_interval), get<1>(ref_interval))); - + if (out_child_ploidies != nullptr) { + // derive ploidies for child snarls from the genotype traversals and save them + resolve_child_ploidies(graph, &managed_snarl, travs, trav_genotype, *out_child_ploidies); + } + assert(trav_genotype.empty() || trav_genotype.size() == ploidy); if (!gaf_output) { @@ -1901,14 +1978,15 @@ NestedFlowCaller::NestedFlowCaller(const PathPositionHandleGraph& graph, ref_path_set.insert(ref_paths[i]); ref_ploidies[ref_paths[i]] = i < ref_path_ploidies.size() ? ref_path_ploidies[i] : 2; } - + + this->toggle_full_names_from_paths(ref_paths); } NestedFlowCaller::~NestedFlowCaller() { } -bool NestedFlowCaller::call_snarl(const Snarl& managed_snarl) { +bool NestedFlowCaller::call_snarl(const Snarl& managed_snarl, int ploidy_override, vector* out_child_ploidies) { // remember the calls for each child snarl in this table CallTable call_table; diff --git a/src/graph_caller.hpp b/src/graph_caller.hpp index 7bdcfe9d880..5babeaaec8f 100644 --- a/src/graph_caller.hpp +++ b/src/graph_caller.hpp @@ -49,7 +49,7 @@ class GraphCaller { RecurseType recurise_type = RecurseOnFail); /// Call a given snarl, and print the output to out_stream - virtual bool call_snarl(const Snarl& snarl) = 0; + virtual bool call_snarl(const Snarl& snarl, int ploidy_override = -1, vector* out_child_ploidies = nullptr) = 0; /// toggle progress messages void set_show_progress(bool show_progress); @@ -58,6 +58,10 @@ class GraphCaller { /// Break up a chain into bits that we want to call using size heuristics vector break_chain(const HandleGraph& graph, const Chain& chain, size_t max_edges, size_t max_trivial); + + /// Compute the child ploidies baseds on the called genotype + void resolve_child_ploidies(const HandleGraph& graph, const Snarl* snarl, const vector& travs, + const vector& genotype, vector& child_ploidies); protected: @@ -141,6 +145,11 @@ class VCFOutputCaller { // update the PS and LV tags in the output buffer (called in write_variants if include_nested is true) void update_nesting_info_tags(const SnarlManager* snarl_manager); + + // toggle write_full_names automatically depending on ref_paths + // (keeps backwards compatibility when it was always false whenever possible, + // but flips to full names when necessary) + void toggle_full_names_from_paths(const vector& ref_paths); /// output vcf mutable vcflib::VariantCallFile output_vcf; @@ -160,6 +169,9 @@ class VCFOutputCaller { // need to write LV/PS info tags bool include_nested; + + // toggle writing full name or just contig name + bool write_full_names; }; /** @@ -227,7 +239,7 @@ class VCFGenotyper : public GraphCaller, public VCFOutputCaller, public GAFOutpu virtual ~VCFGenotyper(); - virtual bool call_snarl(const Snarl& snarl); + virtual bool call_snarl(const Snarl& snarl, int ploidy_override = -1, vector* out_child_ploidies = nullptr); virtual string vcf_header(const PathHandleGraph& graph, const vector& contigs, const vector& contig_length_overrides = {}) const; @@ -280,7 +292,7 @@ class LegacyCaller : public GraphCaller, public VCFOutputCaller { virtual ~LegacyCaller(); - virtual bool call_snarl(const Snarl& snarl); + virtual bool call_snarl(const Snarl& snarl, int ploidy_override = -1, vector* out_child_ploidies = nullptr); virtual string vcf_header(const PathHandleGraph& graph, const vector& contigs, const vector& contig_length_overrides = {}) const; @@ -381,7 +393,7 @@ class FlowCaller : public GraphCaller, public VCFOutputCaller, public GAFOutputC virtual ~FlowCaller(); - virtual bool call_snarl(const Snarl& snarl); + virtual bool call_snarl(const Snarl& snarl, int ploidy_override = -1, vector* out_child_ploidies = nullptr); virtual string vcf_header(const PathHandleGraph& graph, const vector& contigs, const vector& contig_length_overrides = {}) const; @@ -458,7 +470,7 @@ class NestedFlowCaller : public GraphCaller, public VCFOutputCaller, public GAFO virtual ~NestedFlowCaller(); - virtual bool call_snarl(const Snarl& snarl); + virtual bool call_snarl(const Snarl& snarl, int ploidy_override = -1, vector* out_child_ploidies = nullptr); virtual string vcf_header(const PathHandleGraph& graph, const vector& contigs, const vector& contig_length_overrides = {}) const; diff --git a/src/hts_alignment_emitter.cpp b/src/hts_alignment_emitter.cpp index 2df5a1fac51..c8ae527a13f 100644 --- a/src/hts_alignment_emitter.cpp +++ b/src/hts_alignment_emitter.cpp @@ -12,6 +12,7 @@ #include "algorithms/find_translation.hpp" #include #include +#include "rgfa.hpp" #include @@ -150,8 +151,9 @@ static bool for_each_subpath_of(const PathPositionHandleGraph& graph, const stri /// Returns the base path name for this path (i.e. the path's name without any subrange). static string get_path_base_name(const PathPositionHandleGraph& graph, const path_handle_t& path) { - if (graph.get_subrange(path) == PathMetadata::NO_SUBRANGE) { - // This is a full path + string path_name = graph.get_path_name(path); + if (graph.get_subrange(path) == PathMetadata::NO_SUBRANGE || RGFACover::is_rgfa_path_name(path_name)) { + // This is a full path or an rGFA fragment that we don't want to touch return graph.get_path_name(path); } else { // This is a subpath, so remember what it's a subpath of, and use that. @@ -179,6 +181,7 @@ pair>, unordered_map> extract_path auto& own_length = get<1>(path_info); auto& base_length = get<2>(path_info); string base_path_name = subpath_support ? get_path_base_name(graph, path) : graph.get_path_name(path); + base_path_name = RGFACover::revert_rgfa_path_name(base_path_name); if (!base_path_set.count(base_path_name)) { path_names_and_lengths.push_back(make_pair(base_path_name, base_length)); base_path_set.insert(base_path_name); @@ -229,7 +232,14 @@ vector> get_sequence_dictionary(const strin if (print_subrange_warnings) { subrange_t subrange; - std::string base_path_name = Paths::strip_subrange(sequence_name, &subrange); + std::string base_path_name; + if (RGFACover::is_rgfa_path_name(sequence_name)) { + base_path_name = RGFACover::revert_rgfa_path_name(sequence_name, false); + // a white lie, but we want our subranges in rgfa intervals, at least for now + subrange = PathMetadata::NO_SUBRANGE; + } else { + base_path_name = Paths::strip_subrange(sequence_name, &subrange); + } if (subrange != PathMetadata::NO_SUBRANGE) { // The user is asking explicitly to surject to a path that is a // subrange of some other logical path, like @@ -693,10 +703,17 @@ void HTSAlignmentEmitter::convert_alignment(const Alignment& aln, vector& dest) { diff --git a/src/io/save_handle_graph.hpp b/src/io/save_handle_graph.hpp index bee5b1297c5..58946fd5490 100644 --- a/src/io/save_handle_graph.hpp +++ b/src/io/save_handle_graph.hpp @@ -42,11 +42,14 @@ using namespace std; * Save a handle graph. * Todo: should this be somewhere else (ie in vgio with new types registered?) */ -inline void save_handle_graph(HandleGraph* graph, ostream& os) { +inline void save_handle_graph(HandleGraph* graph, ostream& os, + const set& rgfa_paths = {}, + bool rgfa_pline = false, + bool use_w_lines = true) { if (dynamic_cast(graph) != nullptr) { // We loaded a GFA into a handle graph, want to write back to GFA - graph_to_gfa(dynamic_cast(graph), os); + graph_to_gfa(dynamic_cast(graph), os, rgfa_paths, rgfa_pline, use_w_lines); } else if (dynamic_cast(graph) != nullptr) { // SerializableHandleGraphs are all serialized bare, without VPKG framing, for libbdsg compatibility. dynamic_cast(graph)->serialize(os); @@ -58,14 +61,17 @@ inline void save_handle_graph(HandleGraph* graph, ostream& os) { } } -inline void save_handle_graph(HandleGraph* graph, const string& dest_path) { +inline void save_handle_graph(HandleGraph* graph, const string& dest_path, + const set& rgfa_paths = {}, + bool rgfa_pline = false, + bool use_w_lines = true) { if (dynamic_cast(graph) != nullptr) { // We loaded a GFA into a handle graph, want to write back to GFA ofstream os(dest_path); if (!os) { throw runtime_error("error[save_handle_graph]: Unable to write to: " + dest_path); } - graph_to_gfa(dynamic_cast(graph), os); + graph_to_gfa(dynamic_cast(graph), os, rgfa_paths, rgfa_pline, use_w_lines); } else if (dynamic_cast(graph) != nullptr) { // SerializableHandleGraphs are all serialized bare, without VPKG framing, for libbdsg compatibility. dynamic_cast(graph)->serialize(dest_path); diff --git a/src/rgfa.cpp b/src/rgfa.cpp new file mode 100644 index 00000000000..4af858a9d02 --- /dev/null +++ b/src/rgfa.cpp @@ -0,0 +1,1158 @@ +#include "rgfa.hpp" +#include +#include +#include +#include +#include + +//#define debug + +namespace vg { + +using namespace std; + +const string RGFACover::rgfa_sample_name = "_rGFA_"; + +string RGFACover::make_rgfa_path_name(const string& path_name, int64_t start, int64_t length, + bool specify_subrange_end) { + + PathSense path_sense; + string path_sample; + string path_locus; + size_t path_haplotype; + size_t path_phase_block; + subrange_t path_subrange; + PathMetadata::parse_path_name(path_name, path_sense, path_sample, path_locus, + path_haplotype, path_phase_block, path_subrange); + if (path_subrange == PathMetadata::NO_SUBRANGE) { + if (path_phase_block != PathMetadata::NO_PHASE_BLOCK && path_phase_block > 0) { + // gbz puts subranges into phase blocks + // so we interpret them as subranges here, which allows + // things like vg convert graph.gbz | vg paths -x - -f ... to make cover + path_subrange.first = path_phase_block; + path_subrange.second = PathMetadata::NO_END_POSITION; + } else { + path_subrange = make_pair(0, 0); + } + } + + // we move the sample into the locus + // todo: is there something nicer? + string rgfa_locus; + assert(path_locus != PathMetadata::NO_LOCUS_NAME); + if (path_sample != PathMetadata::NO_SAMPLE_NAME) { + rgfa_locus = path_sample + "::"; + } + // the contig name will be behind SC... + rgfa_locus += path_locus; + + // we apply the subrange offset + path_subrange.first += start; + path_subrange.second = specify_subrange_end ? path_subrange.first + length : PathMetadata::NO_END_POSITION; + + + // and return the final path, with sample/locus/rgfa-rank embedded in locus + // (as it's a reference path, we alsos strip out the phase block) + return PathMetadata::create_path_name(PathSense::REFERENCE, RGFACover::rgfa_sample_name, + rgfa_locus, path_haplotype, + PathMetadata::NO_PHASE_BLOCK, path_subrange); + +} + +bool RGFACover::is_rgfa_path_name(const string& path_name) { + return PathMetadata::parse_sample_name(path_name) == RGFACover::rgfa_sample_name; +} + +pair RGFACover::parse_rgfa_locus_name(const string& locus_name) { + pair sample_locus = make_pair(PathMetadata::NO_SAMPLE_NAME, PathMetadata::NO_LOCUS_NAME); + + auto pos = locus_name.find("::"); + if (pos != string::npos) { + sample_locus.first = locus_name.substr(0, pos); + sample_locus.second = locus_name.substr(pos + 2); + } else { + sample_locus.second = locus_name; + } + + return sample_locus; +} + +string RGFACover::revert_rgfa_path_name(const string& rgfa_path_name, bool strip_subrange) { + if (!is_rgfa_path_name(rgfa_path_name)) { + return rgfa_path_name; + } + PathSense path_sense; + string path_sample; + string path_locus; + size_t path_haplotype; + size_t path_phase_block; + subrange_t path_subrange; + PathMetadata::parse_path_name(rgfa_path_name, path_sense, path_sample, path_locus, + path_haplotype, path_phase_block, path_subrange); + + std::tie(path_sample, path_locus) = parse_rgfa_locus_name(path_locus); + + if (path_sense == PathSense::REFERENCE && path_sample == PathMetadata::NO_SAMPLE_NAME) { + path_sense = PathSense::GENERIC; + } + return PathMetadata::create_path_name(path_sense, path_sample, + path_locus, path_haplotype, + path_phase_block, strip_subrange ? PathMetadata::NO_SUBRANGE : path_subrange); +} + +void RGFACover::clear(MutablePathMutableHandleGraph* graph) { + vector rgfa_paths; + graph->for_each_path_of_sample(RGFACover::rgfa_sample_name, [&](path_handle_t path_handle) { + rgfa_paths.push_back(path_handle); + }); + for (path_handle_t path_handle : rgfa_paths) { + graph->destroy_path(path_handle); + } +} + +void RGFACover::compute(const PathHandleGraph* graph, + SnarlManager* snarl_manager, + const unordered_set& reference_paths, + int64_t minimum_length) { + + // start from scratch + this->rgfa_intervals.clear(); + this->node_to_interval.clear(); + this->graph = graph; + + // start with the reference paths + for (const path_handle_t& ref_path_handle : reference_paths) { + this->rgfa_intervals.push_back(make_pair(graph->path_begin(ref_path_handle), + graph->path_end(ref_path_handle))); + graph->for_each_step_in_path(ref_path_handle, [&](step_handle_t step_handle) { + nid_t node_id = graph->get_id(graph->get_handle_of_step(step_handle)); + if (node_to_interval.count(node_id)) { + cerr << "[rgfa error]: node " << node_id << " covered by two reference paths," + << " including " << graph->get_path_name(ref_path_handle) << " and " + << graph->get_path_name(graph->get_path_handle_of_step(rgfa_intervals.at(node_to_interval.at(node_id)).first)) + << ". rGFA support current requires disjoint acyclic reference paths" << endl; + exit(1); + } + node_to_interval[node_id] = rgfa_intervals.size() - 1; + }); + } + this->num_ref_intervals = this->rgfa_intervals.size(); + +#ifdef debug +#pragma omp critical(cerr) + cerr << "[rgfa] Selected " << rgfa_intervals.size() << " rank=0 reference paths" << endl; +#endif + + // we use the path traversal finder for everything + // (even gbwt haplotypes, as we're using the path handle interface) + PathTraversalFinder path_trav_finder(*graph, *snarl_manager); + + // we collect the rgfa cover in parallel as a list of path fragments + size_t thread_count = get_thread_count(); + vector>> rgfa_intervals_vector(thread_count); + vector> node_to_interval_vector(thread_count); + + // we process top-level snarls in parallel + snarl_manager->for_each_top_level_snarl_parallel([&](const Snarl* snarl) { + // per-thread output + vector>& thread_rgfa_intervals = rgfa_intervals_vector[omp_get_thread_num()]; + unordered_map& thread_node_to_interval = node_to_interval_vector[omp_get_thread_num()]; + + vector queue = {snarl}; + + while(!queue.empty()) { + const Snarl* cur_snarl = queue.back(); + queue.pop_back(); + + // get the snarl cover + compute_snarl(*cur_snarl, path_trav_finder, minimum_length, + thread_rgfa_intervals, + thread_node_to_interval); + + + // recurse on the children + const vector& children = snarl_manager->children_of(cur_snarl); + for (const Snarl* child_snarl : children) { + queue.push_back(child_snarl); + } + } + }); + + // now we need to fold up the thread covers + for (int64_t t = 0; t < thread_count; ++t) { +#ifdef debug +#pragma omp critical(cerr) + cerr << "Adding " << rgfa_intervals_vector[t].size() << " rgfa intervals from thread " << t << endl; +#endif + // important to go through function rather than do a raw copy since + // inter-top-level snarl merging may need to happen + for (int64_t j = 0; j < rgfa_intervals_vector[t].size(); ++j) { + // the true flag at the end disables the overlap check. since they were computed + // in separate threads, snarls can overlap by a single node + const pair& interval = rgfa_intervals_vector[t][j]; + if (interval.first != graph->path_end(graph->get_path_handle_of_step(interval.first))) { + add_interval(this->rgfa_intervals, this->node_to_interval, rgfa_intervals_vector[t][j], true); + } + } + rgfa_intervals_vector[t].clear(); + node_to_interval_vector[t].clear(); + } + + // remove any intervals that were made redundant by add_interval + defragment_intervals(); + + // todo: we could optionally go through uncovered nodes and try to greedily search + // for rgfa intervals at this point, since it seems for some graphs there are + // regions that don't get found via the traversal finder + +} + +void RGFACover::load(const PathHandleGraph* graph, + const unordered_set& reference_paths, + bool use_original_paths) { + // start from scratch + this->rgfa_intervals.clear(); + this->node_to_interval.clear(); + this->graph = graph; + + // start with the reference paths + for (const path_handle_t& ref_path_handle : reference_paths) { + graph->for_each_step_in_path(ref_path_handle, [&](step_handle_t step_handle) { + nid_t node_id = graph->get_id(graph->get_handle_of_step(step_handle)); + if (graph->get_is_reverse(graph->get_handle_of_step(step_handle))) { + cerr << "[rgfa] error: Reversed step " << node_id << " found in rank-0 reference " + << graph->get_path_name(ref_path_handle) << ". All rGFA path fragments must be forward-only." << endl; + exit(1); + } + if (node_to_interval.count(node_id)) { + cerr << "[rgfa] error: Cycle found on node " << node_id << "in rank-0 reference " + << graph->get_path_name(ref_path_handle) << ". All rGFA path fragments must be acyclic." << endl; + exit(1); + + } + node_to_interval[node_id] = rgfa_intervals.size(); + }); + this->rgfa_intervals.push_back(make_pair(graph->path_begin(ref_path_handle), + graph->path_end(ref_path_handle))); + } + this->num_ref_intervals = this->rgfa_intervals.size(); + + if (!use_original_paths) { + // just suck up the rgfa fragments right into the data structure + graph->for_each_path_of_sample(RGFACover::rgfa_sample_name, [&](path_handle_t path_handle) { + graph->for_each_step_in_path(path_handle, [&](step_handle_t step_handle) { + node_to_interval[graph->get_id(graph->get_handle_of_step(step_handle))] = rgfa_intervals.size(); + }); + this->rgfa_intervals.push_back(make_pair(graph->path_begin(path_handle), + graph->path_end(path_handle))); + }); + } else { + // then the rgfa cover paths + // since we want to keep our structures in terms of original paths, we have to map back + // to them here (if we don't have original paths, then we can't find the overlaps and + // therefore nesting relationships between them. + + // we start by making a little index in two scans, as I'm worried about quadratic path scan below otherwise + // (this does not make guarantees about degenerate fragmentation related cases tho) + unordered_map, vector> sample_locus_to_paths; + graph->for_each_path_of_sample(RGFACover::rgfa_sample_name, [&](path_handle_t path_handle) { + string locus_name = graph->get_locus_name(path_handle); + sample_locus_to_paths[parse_rgfa_locus_name(locus_name)] = {}; + }); + graph->for_each_path_handle([&](path_handle_t path_handle) { + pair sample_locus = make_pair(graph->get_sample_name(path_handle), graph->get_locus_name(path_handle)); + if (sample_locus_to_paths.count(sample_locus)) { + sample_locus_to_paths[sample_locus].push_back(path_handle); + } + }); + + // next, we scan each rgfa path fragment, and use the index to semi-quickly find its source path interval + // todo: An inconsistency between cover paths and source paths is a possibility if someone messed up their graph + // so should probably have a better error message than the asserts below (ie if exact interval match not found) + graph->for_each_path_of_sample(RGFACover::rgfa_sample_name, [&](path_handle_t path_handle) { + // pase the rgfa locus to get the original sample and locus + pair source_sample_locus = parse_rgfa_locus_name(graph->get_locus_name(path_handle)); + // find the sample in our index + const vector& source_paths = sample_locus_to_paths.at(source_sample_locus); + // find the containing path + subrange_t rgfa_subrange = graph->get_subrange(path_handle); + assert(rgfa_subrange != PathMetadata::NO_SUBRANGE); + int64_t rgfa_haplotype = graph->get_haplotype(path_handle); + // allow match between 0 and NO_HAPLOTYPE + if (rgfa_haplotype == PathMetadata::NO_HAPLOTYPE) { + rgfa_haplotype = 0; + } + const path_handle_t* source_path = nullptr; + subrange_t source_subrange; + for (const path_handle_t& source_path_candidate : source_paths) { + int64_t source_haplotype = graph->get_haplotype(source_path_candidate); + if (source_haplotype == PathMetadata::NO_HAPLOTYPE) { + source_haplotype = 0; + } + if (source_haplotype == rgfa_haplotype) { + source_subrange = graph->get_subrange(source_path_candidate); + if (source_subrange == PathMetadata::NO_SUBRANGE) { + source_subrange.first = 0; + } + if (source_subrange == PathMetadata::NO_SUBRANGE || source_subrange.second == PathMetadata::NO_END_POSITION) { + source_subrange.second = 0; + graph->for_each_step_in_path(source_path_candidate, [&](step_handle_t step) { + source_subrange.second += graph->get_length(graph->get_handle_of_step(step)); + }); + } + if (rgfa_subrange.first >= source_subrange.first && rgfa_subrange.second <= source_subrange.second) { + source_path = &source_path_candidate; + break; + } + } + } + assert(source_path != nullptr); + // now find the exact interval in the containing path and update our data structure + bool found_start = false; + step_handle_t source_start; + int64_t cur_offset = 0; + graph->for_each_step_in_path(*source_path, [&](step_handle_t cur_step) { + if (cur_offset + source_subrange.first == rgfa_subrange.first) { + source_start = cur_step; + found_start = true; + } else { + cur_offset += graph->get_length(graph->get_handle_of_step(cur_step)); + } + return !found_start; + }); + assert(found_start); + assert(graph->get_id(graph->get_handle_of_step(source_start)) == + graph->get_id(graph->get_handle_of_step(graph->path_begin(path_handle)))); + assert(graph->get_is_reverse(graph->get_handle_of_step(source_start)) == + graph->get_is_reverse(graph->get_handle_of_step(graph->path_begin(path_handle)))); + + bool found_end = false; + step_handle_t source_end; + int64_t num_steps = 0; + for (step_handle_t cur_step = source_start;; cur_step = graph->get_next_step(cur_step)) { + ++num_steps; + cur_offset += graph->get_length(graph->get_handle_of_step(cur_step)); + + if (cur_offset + source_subrange.first == rgfa_subrange.second) { + found_end = true; + source_end = cur_step; + break; + } + if (!graph->has_next_step(cur_step)) { + break; + } + } + assert(found_end); + assert(num_steps == graph->get_step_count(path_handle)); + + // we can finally add our interval + source_end = graph->get_next_step(source_end); + this->rgfa_intervals.push_back(make_pair(source_start, source_end)); + for (step_handle_t cur_step = source_start; cur_step != source_end; cur_step = graph->get_next_step(cur_step)) { + int64_t cur_id = graph->get_id(graph->get_handle_of_step(cur_step)); + assert(!node_to_interval.count(cur_id)); + node_to_interval[cur_id] = rgfa_intervals.size() - 1; + } + }); + } +} + +void RGFACover::apply(MutablePathMutableHandleGraph* mutable_graph) { + assert(this->graph == static_cast(mutable_graph)); +#ifdef debug + cerr << "applying rGFA cover with " << this->num_ref_intervals << " ref intervals " + << " and " << this->rgfa_intervals.size() << " total intervals" << endl; +#endif + // index paths isued in rgfa cover + unordered_map step_to_offset; + unordered_set source_path_set; + for (int64_t i = this->num_ref_intervals; i < this->rgfa_intervals.size(); ++i) { + path_handle_t source_path_handle = mutable_graph->get_path_handle_of_step(rgfa_intervals[i].first); + step_to_offset[rgfa_intervals[i].first] = -1; + source_path_set.insert(source_path_handle); + } + vector source_paths(source_path_set.begin(), source_path_set.end()); // for old-style omp interface +#pragma omp parallel for + for (int64_t i = 0; i < source_paths.size(); ++i) { + int64_t offset = 0; + graph->for_each_step_in_path(source_paths[i], [&](step_handle_t step_handle) { + if (step_to_offset.count(step_handle)) { + step_to_offset[step_handle] = offset; + } + offset += graph->get_length(graph->get_handle_of_step(step_handle)); + }); + } + + // compute the offsets in parallel + vector rgfa_offsets(this->rgfa_intervals.size()); + vector rgfa_lengths(this->rgfa_intervals.size()); + +#pragma omp parallel for + for (int64_t i = this->num_ref_intervals; i < this->rgfa_intervals.size(); ++i) { + rgfa_offsets[i] = step_to_offset.at(rgfa_intervals[i].first); + rgfa_lengths[i] = 0; + for (step_handle_t step_handle = rgfa_intervals[i].first; step_handle != rgfa_intervals[i].second; + step_handle = mutable_graph->get_next_step(step_handle)) { + rgfa_lengths[i] += graph->get_length(graph->get_handle_of_step(step_handle)); + } + } + step_to_offset.clear(); + + // write the rgfa paths + for (int64_t i = this->num_ref_intervals; i < this->rgfa_intervals.size(); ++i) { + path_handle_t source_path_handle = mutable_graph->get_path_handle_of_step(rgfa_intervals[i].first); + string source_path_name = graph->get_path_name(source_path_handle); + string rgfa_path_name = make_rgfa_path_name(source_path_name, rgfa_offsets[i], rgfa_lengths[i]); + path_handle_t rgfa_path_handle = mutable_graph->create_path_handle(rgfa_path_name); + for (step_handle_t step_handle = rgfa_intervals[i].first; step_handle != rgfa_intervals[i].second; + step_handle = mutable_graph->get_next_step(step_handle)) { + mutable_graph->append_step(rgfa_path_handle, mutable_graph->get_handle_of_step(step_handle)); + } + } + + this->forwardize_rgfa_paths(mutable_graph); +} + +int64_t RGFACover::get_rank(nid_t node_id) const { + + // search back to reference in order to find the rank. + vector> ref_steps = this->get_reference_nodes(node_id, true); + return ref_steps.at(0).first; +} + +pair*, + const pair*> +RGFACover::get_parent_intervals(const pair& interval) const { + + pair*, + const pair*> parents = make_pair(nullptr, nullptr); + + // since our decomposition is baseds on snarl tranversals, we know that fragments must + // overlap their parents on snarl end points (at the very least) + // therefore we can find parents by scanning along the rgfa paths. + step_handle_t left_parent = graph->get_previous_step(interval.first); + if (left_parent != graph->path_front_end(graph->get_path_handle_of_step(interval.first))) { + int64_t interval_idx = this->node_to_interval.at(graph->get_id(graph->get_handle_of_step(left_parent))); + parents.first = &this->rgfa_intervals.at(interval_idx); + } + + step_handle_t right_parent = graph->get_next_step(interval.second); + if (right_parent != graph->path_end(graph->get_path_handle_of_step(interval.second))) { + int64_t interval_idx = node_to_interval.at(graph->get_id(graph->get_handle_of_step(right_parent))); + parents.second = &this->rgfa_intervals.at(interval_idx); + } + return parents; +} + +const vector>& RGFACover::get_intervals() const { + return this->rgfa_intervals; +} + +const pair* RGFACover::get_interval(nid_t node_id) const { + if (this->node_to_interval.count(node_id)) { + return &this->rgfa_intervals.at(node_to_interval.at(node_id)); + } + return nullptr; +} + + +void RGFACover::compute_snarl(const Snarl& snarl, PathTraversalFinder& path_trav_finder, int64_t minimum_length, + vector>& thread_rgfa_intervals, + unordered_map& thread_node_to_interval) { + + // start by finding the path traversals through the snarl + vector> travs; + vector trav_names; + { + pair, vector > > path_travs = path_trav_finder.find_path_traversals(snarl); + travs.reserve(path_travs.first.size()); + + // reduce protobuf usage by going back to vector of steps instead of keeping SnarlTraversals around + for (int64_t i = 0; i < path_travs.first.size(); ++i) { + string trav_path_name = graph->get_path_name(graph->get_path_handle_of_step(path_travs.second[i].first)); + if (is_rgfa_path_name(trav_path_name)) { + // we ignore existing (off-reference) rGFA paths + // todo: shoulgd there be better error handling? + cerr << "Warning : skipping existing rgfa traversal " << trav_path_name << endl; + continue; + } + bool reversed = false; + if (graph->get_is_reverse(graph->get_handle_of_step(path_travs.second[i].first)) != snarl.start().backward()) { + reversed = true; + } + assert((graph->get_is_reverse(graph->get_handle_of_step(path_travs.second[i].second)) != snarl.end().backward()) == reversed); + vector trav; + trav.reserve(path_travs.first[i].visit_size()); + bool done = false; + function visit_next_step = [&](step_handle_t step_handle) { + return reversed ? graph->get_previous_step(step_handle) : graph->get_next_step(step_handle); + }; + for (step_handle_t step_handle = path_travs.second[i].first; !done; step_handle = visit_next_step(step_handle)) { + trav.push_back(step_handle); + if (step_handle == path_travs.second[i].second) { + done = true; + } + } + if (reversed) { + std::reverse(trav.begin(), trav.end()); + } + travs.push_back(trav); + trav_names.push_back(trav_path_name); + } + } +#ifdef debug +#pragma omp critical(cerr) + cerr << "doing snarl " << pb2json(snarl.start()) << "-" << pb2json(snarl.end()) << " with " << travs.size() << " travs" << endl; +#endif + + + // build an initial ranked list of candidate traversal fragments + vector ranked_trav_fragments; + for (int64_t trav_idx = 0; trav_idx < travs.size(); ++trav_idx) { + // only a reference traversal (or deletion that we don't need to consider) + // will have its first two nodes covered + if (this->node_to_interval.count(graph->get_id(graph->get_handle_of_step(travs[trav_idx][0]))) && + this->node_to_interval.count(graph->get_id(graph->get_handle_of_step(travs[trav_idx][1])))) { + continue; + } + + const vector& trav = travs.at(trav_idx); + vector> uncovered_intervals = get_uncovered_intervals(trav, thread_node_to_interval); + + for (const auto& uncovered_interval : uncovered_intervals) { + unordered_set cycle_check; + bool cyclic = false; + int64_t interval_length = 0; + for (int64_t i = uncovered_interval.first; i < uncovered_interval.second && !cyclic; ++i) { + handle_t handle = graph->get_handle_of_step(trav[i]); + interval_length += graph->get_length(handle); + nid_t node_id = graph->get_id(handle); + if (cycle_check.count(node_id)) { + cyclic = true; + } else { + cycle_check.insert(node_id); + } + } + if (!cyclic && interval_length >= minimum_length) { + int64_t trav_coverage = get_coverage(trav, uncovered_interval); + ranked_trav_fragments.push_back({trav_coverage, &trav_names[trav_idx], trav_idx, uncovered_interval}); + } + } + } + + // put the fragments into a max heap + std::make_heap(ranked_trav_fragments.begin(), ranked_trav_fragments.end()); + + // now greedily pull out traversal intervals from the ranked list until none are left + while (!ranked_trav_fragments.empty()) { + + // get the best scoring (max) fragment from heap + auto best_stats_fragment = ranked_trav_fragments.front(); + std::pop_heap(ranked_trav_fragments.begin(), ranked_trav_fragments.end()); + ranked_trav_fragments.pop_back(); + + const vector& trav = travs.at(best_stats_fragment.trav_idx); + const pair& uncovered_interval = best_stats_fragment.fragment; + +#ifdef debug +#pragma omp critical(cerr) + { + cerr << "best trav: "; + for (auto xx : trav) cerr << " " << graph->get_id(graph->get_handle_of_step(xx)); + cerr << endl << "uncovered interval [" << uncovered_interval.first << "," << uncovered_interval.second << "]" <> chopped_intervals; + int64_t cur_start = -1; + bool chopped = false; + for (int64_t i = uncovered_interval.first; i < uncovered_interval.second; ++i) { + nid_t node_id = graph->get_id(graph->get_handle_of_step(trav[i])); + bool covered = this->node_to_interval.count(node_id) || thread_node_to_interval.count(node_id); + if (!covered && cur_start == -1) { + cur_start = i; + } else if (covered) { + chopped = true; + if (cur_start != -1) { + chopped_intervals.push_back(make_pair(cur_start, i)); + cur_start = -1; + } + } + } + if (cur_start != -1) { + chopped_intervals.push_back(make_pair(cur_start, uncovered_interval.second)); + } + if (chopped) { + for (const pair& chopped_interval : chopped_intervals) { + int64_t chopped_trav_length = 0; + for (int64_t i = chopped_interval.first; i < chopped_interval.second; ++i) { + chopped_trav_length += graph->get_length(graph->get_handle_of_step(trav[i])); + } + if (chopped_trav_length >= minimum_length) { + int64_t trav_coverage = get_coverage(trav, chopped_interval); + ranked_trav_fragments.push_back({trav_coverage, best_stats_fragment.name, best_stats_fragment.trav_idx, chopped_interval}); + std::push_heap(ranked_trav_fragments.begin(), ranked_trav_fragments.end()); + } + } + continue; + } + pair new_interval = make_pair(trav.at(uncovered_interval.first), + graph->get_next_step(trav.at(uncovered_interval.second - 1))); +#ifdef debug + int64_t interval_length = uncovered_interval.second - uncovered_interval.first; +#pragma omp critical(cerr) + cerr << "adding interval with length " << interval_length << endl; +#endif + add_interval(thread_rgfa_intervals, thread_node_to_interval, new_interval); + } +} + +vector> RGFACover::get_uncovered_intervals(const vector& trav, + const unordered_map& thread_node_to_interval) { + + vector> intervals; + int64_t start = -1; + unordered_set dupe_check; + for (size_t i = 0; i < trav.size(); ++i) { + nid_t node_id = graph->get_id(graph->get_handle_of_step(trav[i])); + bool covered = this->node_to_interval.count(node_id) || thread_node_to_interval.count(node_id); + // we break at dupes even if uncovered -- never want same id twice in an interval + bool dupe = !covered && dupe_check.count(node_id); + dupe_check.insert(node_id); + if (covered || dupe) { + if (start != -1) { + intervals.push_back(make_pair(start, i)); + } + start = dupe ? i : -1; + } else { + if (start == -1) { + start = i; + } + } + } + if (start != -1) { + intervals.push_back(make_pair(start, trav.size())); + } + return intervals; +} + +bool RGFACover::add_interval(vector>& thread_rgfa_intervals, + unordered_map& thread_node_to_interval, + const pair& new_interval, + bool global) { + +#ifdef debug +#pragma omp critical(cerr) + cerr << "adding interval " << graph->get_path_name(graph->get_path_handle_of_step(new_interval.first)) + << (graph->get_is_reverse(graph->get_handle_of_step(new_interval.first)) ? "<" : ">") + << ":" << graph->get_id(graph->get_handle_of_step(new_interval.first)); + if (new_interval.second == graph->path_end(graph->get_path_handle_of_step(new_interval.first))) { + cerr << "PATH_END" << endl; + } else { + cerr << "-" << (graph->get_is_reverse(graph->get_handle_of_step(new_interval.second)) ? "<" : ">") + << graph->get_id(graph->get_handle_of_step(new_interval.second)) << endl; + } +#endif + bool merged = false; + int64_t merged_interval_idx = -1; + path_handle_t path_handle = graph->get_path_handle_of_step(new_interval.first); + + // check the before-first step. if it's in an interval then it must be immediately + // preceeding so we merge the new interval to the end of the found interval + step_handle_t before_first_step = graph->get_previous_step(new_interval.first); + if (before_first_step != graph->path_front_end(graph->get_path_handle_of_step(before_first_step))) { + nid_t prev_node_id = graph->get_id(graph->get_handle_of_step(before_first_step)); + if (thread_node_to_interval.count(prev_node_id)) { + int64_t prev_idx = thread_node_to_interval[prev_node_id]; + pair& prev_interval = thread_rgfa_intervals[prev_idx]; + if (graph->get_path_handle_of_step(prev_interval.first) == path_handle) { + if (prev_interval.second == new_interval.first || + (global && graph->get_previous_step(prev_interval.second) == new_interval.first)) { +#ifdef debug +#pragma omp critical(cerr) + cerr << "prev interval found" << graph->get_path_name(graph->get_path_handle_of_step(prev_interval.first)) + << ":" << (graph->get_is_reverse(graph->get_handle_of_step(prev_interval.first)) ? "<" : ">") + << graph->get_id(graph->get_handle_of_step(prev_interval.first)); + if (prev_interval.second == graph->path_end(graph->get_path_handle_of_step(prev_interval.first))) { + cerr << "PATH_END" << endl; + } else { + cerr << "-" << (graph->get_is_reverse(graph->get_handle_of_step(prev_interval.second)) ? "<" : ">") + << graph->get_id(graph->get_handle_of_step(prev_interval.second)) << endl; + } +#endif + prev_interval.second = new_interval.second; + merged = true; + merged_interval_idx = prev_idx; + } + } + } + } + + // check the end step. if it's in an interval then it must be immediately + // following we merge the new interval to the front of the found interval + int64_t deleted_idx = -1; + if (new_interval.second != graph->path_end(graph->get_path_handle_of_step(new_interval.second))) { + nid_t next_node_id = graph->get_id(graph->get_handle_of_step(new_interval.second)); + if (thread_node_to_interval.count(next_node_id)) { + int64_t next_idx = thread_node_to_interval[next_node_id]; + pair& next_interval = thread_rgfa_intervals[next_idx]; + path_handle_t next_path = graph->get_path_handle_of_step(next_interval.first); + if (graph->get_path_handle_of_step(next_interval.first) == path_handle) { + if (next_interval.first == new_interval.second || + (global && next_interval.first == graph->get_previous_step(new_interval.second))) { +#ifdef debug +#pragma omp critical(cerr) + cerr << "next interval found" << graph->get_path_name(graph->get_path_handle_of_step(next_interval.first)) + << ":" << graph->get_id(graph->get_handle_of_step(next_interval.first)); + if (next_interval.second == graph->path_end(graph->get_path_handle_of_step(next_interval.second))) { + cerr << "PATH_END" << endl; + } else { + cerr << "-" << graph->get_id(graph->get_handle_of_step(next_interval.second)) << endl; + } +#endif + if (merged == true) { + // decomission next_interval + next_interval.first = graph->path_end(next_path); + next_interval.second = graph->path_front_end(next_path); + deleted_idx = next_idx; + // extend the previous interval right + thread_rgfa_intervals[merged_interval_idx].second = new_interval.second; + } else { + // extend next_interval left + next_interval.first = new_interval.first; + merged = true; + merged_interval_idx = next_idx; + } + } + } + } + } + + // add the interval to the local (thread safe) structures + if (!merged) { + merged_interval_idx = thread_rgfa_intervals.size(); + thread_rgfa_intervals.push_back(new_interval); + } + for (step_handle_t step = new_interval.first; step != new_interval.second; step = graph->get_next_step(step)) { + thread_node_to_interval[graph->get_id(graph->get_handle_of_step(step))] = merged_interval_idx; + } + if (deleted_idx >= 0) { + // move the links to the deleted interval to the new interval + const pair& deleted_interval = thread_rgfa_intervals[deleted_idx]; + for (step_handle_t step = deleted_interval.first; step != deleted_interval.second; step = graph->get_next_step(step)) { + thread_node_to_interval[graph->get_id(graph->get_handle_of_step(step))] = merged_interval_idx; + } + } + return !merged; +} + +void RGFACover::defragment_intervals() { + vector> new_intervals; + this->node_to_interval.clear(); + for (int64_t i = 0; i < this->rgfa_intervals.size(); ++i) { + const pair& interval = this->rgfa_intervals[i]; + path_handle_t path_handle = graph->get_path_handle_of_step(interval.first); + if (interval.first != graph->path_end(path_handle)) { + new_intervals.push_back(interval); + } + } + this->rgfa_intervals = std::move(new_intervals); + for (int64_t i = 0; i < this->rgfa_intervals.size(); ++i) { + const pair& interval = this->rgfa_intervals[i]; + for (step_handle_t step = interval.first; step != interval.second; step = graph->get_next_step(step)) { + this->node_to_interval[graph->get_id(graph->get_handle_of_step(step))] = i; + } + } +} + +int64_t RGFACover::get_coverage(const vector& trav, const pair& uncovered_interval) { + path_handle_t path_handle = graph->get_path_handle_of_step(trav.front()); + int64_t coverage = 0; + + for (int64_t i = uncovered_interval.first; i < uncovered_interval.second; ++i) { + const step_handle_t& step = trav[i]; + handle_t handle = graph->get_handle_of_step(step); + vector all_steps = graph->steps_of_handle(handle); + int64_t length = graph->get_length(handle); + coverage += length * all_steps.size(); + } + + return coverage; +} + + + +// copied pretty much verbatem from +// https://github.com/ComparativeGenomicsToolkit/hal2vg/blob/v1.1.2/clip-vg.cpp#L809-L880 +void RGFACover::forwardize_rgfa_paths(MutablePathMutableHandleGraph* mutable_graph) { + assert(this->graph == static_cast(mutable_graph)); + + unordered_map id_map; + mutable_graph->for_each_path_handle([&](path_handle_t path_handle) { + string path_name = mutable_graph->get_path_name(path_handle); + if (is_rgfa_path_name(path_name)) { + size_t fw_count = 0; + size_t total_steps = 0; + mutable_graph->for_each_step_in_path(path_handle, [&](step_handle_t step_handle) { + handle_t handle = mutable_graph->get_handle_of_step(step_handle); + if (mutable_graph->get_is_reverse(handle)) { + handle_t flipped_handle = mutable_graph->create_handle(mutable_graph->get_sequence(handle)); + id_map[mutable_graph->get_id(flipped_handle)] = mutable_graph->get_id(handle); + mutable_graph->follow_edges(handle, true, [&](handle_t prev_handle) { + if (mutable_graph->get_id(prev_handle) != mutable_graph->get_id(handle)) { + mutable_graph->create_edge(prev_handle, flipped_handle); + } + }); + mutable_graph->follow_edges(handle, false, [&](handle_t next_handle) { + if (mutable_graph->get_id(handle) != mutable_graph->get_id(next_handle)) { + mutable_graph->create_edge(flipped_handle, next_handle); + } + }); + // self-loop cases we punted on above: + if (mutable_graph->has_edge(handle, handle)) { + mutable_graph->create_edge(flipped_handle, flipped_handle); + } + if (mutable_graph->has_edge(handle, mutable_graph->flip(handle))) { + mutable_graph->create_edge(flipped_handle, mutable_graph->flip(flipped_handle)); + } + if (mutable_graph->has_edge(mutable_graph->flip(handle), handle)) { + mutable_graph->create_edge(mutable_graph->flip(flipped_handle), flipped_handle); + } + vector steps = mutable_graph->steps_of_handle(handle); + size_t ref_count = 0; + for (step_handle_t step : steps) { + if (mutable_graph->get_path_handle_of_step(step) == path_handle) { + ++ref_count; + } + step_handle_t next_step = mutable_graph->get_next_step(step); + handle_t new_handle = mutable_graph->get_is_reverse(mutable_graph->get_handle_of_step(step)) ? flipped_handle : + mutable_graph->flip(flipped_handle); + mutable_graph->rewrite_segment(step, next_step, {new_handle}); + } + if (ref_count > 1) { + cerr << "[rGFA] error: Cycle detected in rGFA path " << path_name << " at node " << mutable_graph->get_id(handle) << endl; + exit(1); + } + ++fw_count; + assert(mutable_graph->steps_of_handle(handle).empty()); + dynamic_cast(mutable_graph)->destroy_handle(handle); + } + ++total_steps; + }); + } + }); + + // rename all the ids back to what they were (so nodes keep their ids, just get flipped around) + mutable_graph->reassign_node_ids([&id_map](nid_t new_id) { + return id_map.count(new_id) ? id_map[new_id] : new_id; + }); + + // do a check just to be sure + mutable_graph->for_each_path_handle([&](path_handle_t path_handle) { + string path_name = mutable_graph->get_path_name(path_handle); + if (is_rgfa_path_name(path_name)) { + mutable_graph->for_each_step_in_path(path_handle, [&](step_handle_t step_handle) { + handle_t handle = mutable_graph->get_handle_of_step(step_handle); + if (mutable_graph->get_is_reverse(handle)) { + cerr << "[rGFA] error: Failed to fowardize node " << mutable_graph->get_id(handle) << " in path " << path_name << endl; + exit(1); + } + }); + } + }); +} + +vector> RGFACover::get_reference_nodes(nid_t node_id, bool first) const { + + // search back to reference in order to find the rank. + unordered_set visited; + priority_queue> queue; + queue.push(make_pair(0, node_id)); + + nid_t current_id; + int64_t distance = 0; + + // output reference intervals + vector> output_reference_nodes; + + while (!queue.empty()) { + std::tie(distance, current_id) = queue.top(); + queue.pop(); + + if (!visited.count(current_id)) { + + visited.insert(current_id); + + if (this->node_to_interval.count(current_id)) { + int64_t interval_idx = this->node_to_interval.at(current_id); + + const pair& rgfa_interval = this->rgfa_intervals.at(interval_idx); + + // we've hit the reference, fish out its step and stop searching. + if (interval_idx < this->num_ref_intervals) { + output_reference_nodes.push_back(make_pair(distance, current_id)); + if (first) { + break; + } + continue; + } + + // search out of the snarl -- any parent traversals will overlap here + graph->follow_edges(graph->get_handle_of_step(rgfa_interval.first), true, [&](handle_t prev) { + queue.push(make_pair(distance + 1, graph->get_id(prev))); + }); + // hack around gbwtgraph bug (feature?) that does not let you decrement path_end + path_handle_t path_handle = graph->get_path_handle_of_step(rgfa_interval.first); + step_handle_t last_step; + if (rgfa_interval.second == graph->path_end(path_handle)) { + last_step = graph->path_back(path_handle); + } else { + last_step = graph->get_previous_step(rgfa_interval.second); + } + graph->follow_edges(graph->get_handle_of_step(last_step), false, [&](handle_t next) { + queue.push(make_pair(distance + 1, graph->get_id(next))); + }); + + } else { + // revert to graph search if node not in interval (distance doesn't increase -- we only count intervals) + graph->follow_edges(graph->get_handle(current_id), false, [&](handle_t next) { + queue.push(make_pair(distance, graph->get_id(next))); + }); + graph->follow_edges(graph->get_handle(current_id), true, [&](handle_t next) { + queue.push(make_pair(distance, graph->get_id(next))); + }); + } + + } + } + + assert(output_reference_nodes.size() > 0 && (!first || (output_reference_nodes.size() == 1))); + return output_reference_nodes; +} + +pair RGFACover::parse_variant_id(const string& variant_id) const { + + if (variant_id[0] == '>' || variant_id[0] == '<') { + size_t p = variant_id.find_first_of("<>", 2); + if (p != string::npos) { + int64_t id1 = parse(variant_id.substr(1, p-1)); + int64_t id2 = parse(variant_id.substr(p+1)); + if (!graph->has_node(id1) || !graph->has_node(id2)) { + throw runtime_error("Unable to parse " + variant_id + ": node(s) not found in graph"); + } + return make_pair(graph->get_handle(id1, variant_id[0] == '<'), + graph->get_handle(id2, variant_id[p] == '<')); + } + } + throw runtime_error("Failed to parse " + variant_id); + return make_pair(handle_t(), handle_t()); +} + +void RGFACover::annotate_vcf(vcflib::VariantCallFile& vcf, ostream& os) { + vcflib::Variant var(vcf); + + // want a reference position lookup + unordered_map node_to_ref_pos; + for (int64_t i = 0; i < this->num_ref_intervals; ++i) { + const pair& ref_interval = this->rgfa_intervals.at(i); + // assumption: ref intervals span entire path + assert(graph->path_begin(graph->get_path_handle_of_step(ref_interval.first)) == ref_interval.first); + int64_t pos = 0; + for (step_handle_t step_handle = ref_interval.first; step_handle != ref_interval.second; + step_handle = graph->get_next_step(step_handle)) { + handle_t handle = graph->get_handle_of_step(step_handle); + nid_t node_id = graph->get_id(handle); + assert(!graph->get_is_reverse(handle)); + assert(!node_to_ref_pos.count(node_id)); + node_to_ref_pos[node_id] = pos; + pos += graph->get_length(handle); + } + } + + // remove header lines we're going to add + vcf.removeInfoHeaderLine("R_CHROM"); + vcf.removeInfoHeaderLine("R_START"); + vcf.removeInfoHeaderLine("R_END"); + vcf.removeInfoHeaderLine("RANK"); + + // and add them back, so as not to duplicate them if they are already there + vcf.addHeaderLine("##INFO="); + vcf.addHeaderLine("##INFO="); + vcf.addHeaderLine("##INFO="); + vcf.addHeaderLine("##INFO="); + + os << vcf.header << endl; + + while (vcf.getNextVariant(var)) { + pair id_handles = parse_variant_id(var.id); + string r_chrom; + string r_start; + string r_end; + string rank; + + // compute from the cover + nid_t first_node = graph->get_id(id_handles.first); + vector> ref_nodes = this->get_reference_nodes(first_node, false); + + int64_t min_ref_pos = numeric_limits::max(); + int64_t max_ref_pos = -1; + int64_t min_rank = numeric_limits::max(); + for (const pair& rank_node : ref_nodes) { + step_handle_t ref_step = this->rgfa_intervals.at(node_to_interval.at(rank_node.second)).first; + path_handle_t ref_path = graph->get_path_handle_of_step(ref_step); + string name = graph->get_path_name(ref_path); + // we assume one reference contig (which is built into the whole structure) + assert(r_chrom.empty() || r_chrom == name); + r_chrom = name; + int64_t ref_pos = node_to_ref_pos.at(rank_node.second); + // assume snarl is forward on both reference nodes + // todo: this won't be exact for some inversion cases, I don't think -- + // need to test these and either add check / or move to oriented search + min_ref_pos = min(min_ref_pos, ref_pos + (int64_t)graph->get_length(graph->get_handle(rank_node.second))); + max_ref_pos = max(max_ref_pos, ref_pos); + min_rank = min(min_rank, rank_node.first); + } + + r_start = std::to_string(min_ref_pos); + r_end = std::to_string(max_ref_pos); + rank = std::to_string(min_rank); + + var.info["R_CHROM"] = {r_chrom}; + var.info["R_START"] = {r_start}; + var.info["R_END"] = {r_end}; + var.info["RANK"] = {rank}; + + os << var << endl; + } +} + +void RGFACover::print_stats(ostream& os) { + + // todo: mostly copied from annotate function above. should probably refactor common + // logic into its own, shared thing.y + unordered_map node_to_ref_pos; + for (int64_t i = 0; i < this->num_ref_intervals; ++i) { + const pair& ref_interval = this->rgfa_intervals.at(i); + // assumption: ref intervals span entire path + assert(graph->path_begin(graph->get_path_handle_of_step(ref_interval.first)) == ref_interval.first); + int64_t pos = 0; + for (step_handle_t step_handle = ref_interval.first; step_handle != ref_interval.second; + step_handle = graph->get_next_step(step_handle)) { + handle_t handle = graph->get_handle_of_step(step_handle); + nid_t node_id = graph->get_id(handle); + assert(!graph->get_is_reverse(handle)); + assert(!node_to_ref_pos.count(node_id)); + node_to_ref_pos[node_id] = pos; + pos += graph->get_length(handle); + } + } + + // the header + os << "#Sample" << "\t" + << "Haplotype" << "\t" + << "Locus" << "\t" + << "Start" << "\t" + << "End" << "\t" + << "NodeStart" << "\t" + << "NodeEnd" << "\t" + << "Rank" << "\t" + << "AvgDepth" << "\t" + << "AvgSampleDepth" << "\t" + << "RefSequence" << "\t" + << "RefStart" << "\t" + << "RefEnd" << endl; + + for (int64_t i = this->num_ref_intervals; i < this->rgfa_intervals.size(); ++i) { + const pair& interval = this->rgfa_intervals[i]; + path_handle_t path_handle = graph->get_path_handle_of_step(interval.first); + subrange_t rgfa_subrange = graph->get_subrange(path_handle); + assert(rgfa_subrange != PathMetadata::NO_SUBRANGE); + + int64_t path_length = 0; + int64_t tot_depth = 0; + int64_t tot_sample_depth = 0; + int64_t tot_steps = 0; + for (step_handle_t step = interval.first; step != interval.second; step = graph->get_next_step(step)) { + path_length += graph->get_length(graph->get_handle_of_step(step)); + set sample_set; + vector steps = graph->steps_of_handle(graph->get_handle_of_step(step)); + for (step_handle_t& other_step : steps) { + path_handle_t other_path = graph->get_path_handle_of_step(other_step); + sample_set.insert(graph->get_sample_name(other_path)); + } + tot_depth += steps.size(); + tot_sample_depth += sample_set.size(); + // we don't want to count the rgfa cover + // todo (can remove this when move away from path-based scheme) + assert(sample_set.count(RGFACover::rgfa_sample_name)); + --tot_depth; + --tot_sample_depth; + ++tot_steps; + } + pair sample_locus = RGFACover::parse_rgfa_locus_name(graph->get_locus_name(path_handle)); + size_t haplotype = graph->get_haplotype(path_handle); + + nid_t first_node = graph->get_id(graph->get_handle_of_step(interval.first)); + vector> ref_nodes = this->get_reference_nodes(first_node, false); + // interval is open ended, so we go back to last node + step_handle_t last_step; + if (interval.second == graph->path_end(graph->get_path_handle_of_step(interval.first))) { + // can't get previous step of end in gbz, so we treat as special case here + last_step = graph->path_back(graph->get_path_handle_of_step(interval.first)); + } else { + last_step = graph->get_previous_step(interval.second); + } + handle_t last_handle = graph->get_handle_of_step(last_step); + + int64_t min_ref_pos = numeric_limits::max(); + int64_t max_ref_pos = -1; + int64_t min_rank = numeric_limits::max(); + string r_chrom; + map> chrom_pos; + for (const pair& rank_node : ref_nodes) { + step_handle_t ref_step = this->rgfa_intervals.at(node_to_interval.at(rank_node.second)).first; + path_handle_t ref_path = graph->get_path_handle_of_step(ref_step); + string name = graph->get_path_name(ref_path); + // we assume one reference contig (which is built into the whole structure) + int64_t ref_pos = node_to_ref_pos.at(rank_node.second); + int64_t last_len = (int64_t)graph->get_length(graph->get_handle(rank_node.second)); + if (chrom_pos.count(name)) { + tuple& pos = chrom_pos[name]; + // assume snarl is forward on both reference nodes + // todo: this won't be exact for some inversion cases, I don't think -- + // need to test these and either add check / or move to oriented search + pos = make_tuple(min(get<0>(pos), ref_pos + last_len), max(get<1>(pos), ref_pos), min(get<2>(pos), rank_node.first)); + } else { + chrom_pos[name] = make_tuple(ref_pos + last_len, ref_pos, rank_node.first); + } + if (rank_node.first < min_rank) { + // todo: is there a better heuristic to use when choosing a reference? + // also: need to fix vcf annotate maybe to just list them all... + min_rank = rank_node.first; + r_chrom = name; + } + } + + os << sample_locus.first << "\t" + << sample_locus.second << "\t" + << (haplotype != PathMetadata::NO_HAPLOTYPE ? (int64_t)haplotype : (int64_t)-1) << "\t" + << rgfa_subrange.first << "\t" + << (rgfa_subrange.first + path_length) << "\t" + << (graph->get_is_reverse(graph->get_handle_of_step(interval.first)) ? "<" : ">") + << graph->get_id(graph->get_handle_of_step(interval.first)) << "\t" + << (graph->get_is_reverse(last_handle) ? "<" : ">") + << graph->get_id(last_handle) << "\t" + << min_rank << "\t" + << std::fixed << std::setprecision(2) << (tot_depth / tot_steps) << "\t" + << std::fixed << std::setprecision(2) << (tot_sample_depth / tot_steps) << "\t" + << Paths::strip_subrange(r_chrom) << "\t" + << get<0>(chrom_pos[r_chrom]) << "\t" + << get<1>(chrom_pos[r_chrom]) << "\n"; + } +} + +} + diff --git a/src/rgfa.hpp b/src/rgfa.hpp new file mode 100644 index 00000000000..db5abdfe850 --- /dev/null +++ b/src/rgfa.hpp @@ -0,0 +1,166 @@ +#ifndef VG_RGFA_HPP_INCLUDED +#define VG_RGFA_HPP_INCLUDED + +/** + * \file rgfa.hpp + * + * Interface for computing and querying rGFA path covers. + * + * An rGFA cover is a set of path fragments (stored as separate paths) in the graph. + * They are always relative to an existing reference sample (ie GRCh38). + * rGFA path fragments have a special name format, where sample=rGFA + * + * The data structures used in this class are always relative to the original paths + * in the graph. The REfERENCE-SENSE fragments that are used to serialize the + * cover (and for surjection) can be created and loaded, but they are not used + * beyond that. + */ + +#include "handle.hpp" +#include "snarls.hpp" +#include "traversal_finder.hpp" + +namespace vg { + +using namespace std; + +class RGFACover { +public: + // I'm not sure how much we want to allow this to change. For now just define it here + static const string rgfa_sample_name; + + // make an rGFA path name from a subrange of a normal path + static string make_rgfa_path_name(const string& path_name, int64_t start, int64_t length, + bool specify_subrange_end = true); + + // test if path name is rGFA + static bool is_rgfa_path_name(const string& path_name); + + // break up the rGFA locus name back into original sample and locus + static pair parse_rgfa_locus_name(const string& locus_name); + + // undo make_rgfa_path_name, getting back an original non-rgfa name + // if the input's not a rgfa path name, just return it + static string revert_rgfa_path_name(const string& path_name, + bool strip_subrange = false); + +public: + // clear out the existing rGFA cover from the graph. recommended to run this + // before compute() + void clear(MutablePathMutableHandleGraph* graph); + + // compute the rgfa cover from the graph, starting with a given set of reference paths + void compute(const PathHandleGraph* graph, + SnarlManager* snarl_manager, + const unordered_set& reference_paths, + int64_t minimum_length); + + // load the rgfa cover from the graph, assuming it's been computed already and + // saved in special rgfa paths + // the use_original_paths flag specifies whether the cover indexes the original + // paths or the rgfa paths fragments (if the former, then they must exist + // and be 100% consistent with the paths used to construct the fragments) + void load(const PathHandleGraph* graph, + const unordered_set& reference_paths, + bool use_original_paths = false); + + // apply the rgfa cover to a graph (must have been computed first), adding it + // as a bunch of (reference sense) paths with a special sample name + void apply(MutablePathMutableHandleGraph* mutable_graph); + + // get the rgfa rank (level) of a given node (0 if on a reference path) + int64_t get_rank(nid_t node_id) const; + + // get the rgfa step of a given node + step_handle_t get_step(nid_t node_id) const; + + // get the parent intervals + pair*, + const pair*> get_parent_intervals(const pair& interval) const; + + // get the intervals + const vector>& get_intervals() const; + + // get an interval from a node + // return nullptr if node not in an interval + const pair* get_interval(nid_t node_id) const; + + // parse an id of the form >244>2334 and return the pair of handles + pair parse_variant_id(const string& variant_id) const; + + // add R_CHROM, R_START, R_END, F_LEN tags to a VCF using the cover + void annotate_vcf(vcflib::VariantCallFile& vcf, ostream& os); + + // print out a table of statistics + void print_stats(ostream& os); + +protected: + + // compute the cover for the given snarl, by greedily finding the convered rgfa paths through it + // the cover is added to the two "thread_" structures. + void compute_snarl(const Snarl& snarl, PathTraversalFinder& path_trav_finder, int64_t minimum_length, + vector>& thread_rgfa_intervals, + unordered_map& thread_node_to_interval); + + // get intervals in travs that are not covered according to this->node_to_interval or + // the thread_node_to_interval parameter + vector> get_uncovered_intervals(const vector& trav, + const unordered_map& thread_node_to_interval); + + // add a new interval into the rgfa_intervals veector and update the node_to_interval map + // if the interval can be merged into an existing, contiguous interval, do that instead + // returns true if a new interval was added, false if an existing interval was updated + bool add_interval(vector>& thread_rgfa_intervals, + unordered_map& thread_node_to_interval, + const pair& new_interval, + bool global = false); + + // add_interval() can delete an existing interval. ex if ABC are contiguous intervals and B is added to A_C, + // then A gets extended out to the whole range and C is left dangling. Since we're using a vector, + // this requires a full update of everything at the end. + void defragment_intervals(); + + // get the total coverage of a traversal (sum of step lengths) + int64_t get_coverage(const vector& trav, const pair& uncovered_interval); + + // make sure all nodes in all rGFA paths are in forward orientation + // this is always possible because they are, by definition, disjoint + // this should only be run from inside apply() + void forwardize_rgfa_paths(MutablePathMutableHandleGraph* mutable_graph); + + // search back to the reference and return when its found + // (here distance is the number of intervals crossed, aka rank) + // "first" toggles returning the first interfval found vs all of them + vector> get_reference_nodes(nid_t node_id, bool first) const; + +protected: + + const PathHandleGraph* graph; + + // intervals are end-exclusive (like bed) + vector> rgfa_intervals; + + // so rgfa_intervals[0,num_ref_intervals-1] will be all rank-0 reference intervals + int64_t num_ref_intervals; + + unordered_map node_to_interval; + + // used when selecting traversals to make the greedy cover + struct RankedFragment { + int64_t coverage; + const string* name; + int64_t trav_idx; + pair fragment; + bool operator<(const RankedFragment& f2) const { + // note: name comparison is flipped because we want to select high coverage / low name + return this->coverage < f2.coverage || (this->coverage == f2.coverage && *this->name > *f2.name); + } + }; +}; + + + +/// Export the given VG graph to the given GFA file. +} + +#endif diff --git a/src/subcommand/call_main.cpp b/src/subcommand/call_main.cpp index 4e035454d76..cb1b2706a79 100644 --- a/src/subcommand/call_main.cpp +++ b/src/subcommand/call_main.cpp @@ -58,7 +58,7 @@ void help_call(char** argv) { << " -N, --translation FILE Node ID translation (as created by vg gbwt --translation) to apply to snarl names in output" << endl << " -O, --gbz-translation Use the ID translation from the input gbz to apply snarl names to snarl names and AT fields in output" << endl << " -p, --ref-path NAME Reference path to call on (multipile allowed. defaults to all paths)" << endl - << " -S, --ref-sample NAME Call on all paths with given sample name (cannot be used with -p)" << endl + << " -S, --ref-sample NAME Call on all paths with given sample name (cannot be used with -p; multiple allowed)" << endl << " -o, --ref-offset N Offset in reference path (multiple allowed, 1 per path)" << endl << " -l, --ref-length N Override length of reference in the contig field of output VCF" << endl << " -d, --ploidy N Ploidy of sample. Only 1 and 2 supported. (default: 2)" << endl @@ -84,7 +84,7 @@ int main_call(int argc, char** argv) { string ref_fasta_filename; string ins_fasta_filename; vector ref_paths; - string ref_sample; + set ref_sample_set; vector ref_path_offsets; vector ref_path_lengths; string min_support_string; @@ -232,7 +232,7 @@ int main_call(int argc, char** argv) { ref_paths.push_back(optarg); break; case 'S': - ref_sample = optarg; + ref_sample_set.insert(optarg); break; case 'o': ref_path_offsets.push_back(parse(optarg)); @@ -369,7 +369,7 @@ int main_call(int argc, char** argv) { cerr << "error [vg call]: -c/-C no supported with -v, -l or -n" << endl; return 1; } - if (!ref_paths.empty() && !ref_sample.empty()) { + if (!ref_paths.empty() && !ref_sample_set.empty()) { cerr << "error [vg call]: -S cannot be used with -p" << endl; return 1; } @@ -521,7 +521,7 @@ int main_call(int argc, char** argv) { PathSense path_sense = PathMetadata::parse_sense(name); if (!Paths::is_alt(name) && path_sense != PathSense::HAPLOTYPE) { string sample_name = PathMetadata::parse_sample_name(name); - if (ref_sample.empty() || sample_name == ref_sample) { + if (ref_sample_set.empty() || ref_sample_set.count(sample_name)) { ref_paths.push_back(name); // keep track of length best we can using maximum coordinate in event of subpaths subrange_t subrange; @@ -535,7 +535,7 @@ int main_call(int argc, char** argv) { } } }); - if (ref_sample_names.size() > 1 && ref_sample.empty()) { + if (ref_sample_names.size() > 1 && ref_sample_set.empty()) { cerr << "error [vg call]: Multiple reference samples detected: ["; size_t count = 0; for (const string& n : ref_sample_names) { @@ -598,8 +598,18 @@ int main_call(int argc, char** argv) { // make sure we have some ref paths if (ref_paths.empty()) { - if (!ref_sample.empty()) { - cerr << "error [vg call]: No paths with selected reference sample \"" << ref_sample << "\" found. " + if (!ref_sample_set.empty()) { + string sample_string; + if (ref_sample_set.size() == 1) { + sample_string = "sample \"" + *ref_sample_set.begin() + "\""; + } else { + sample_string = "samples \""; + for (const string& rsn : ref_sample_set) { + sample_string += (sample_string.length() > 9 ? " " : "") + rsn; + } + sample_string += "\""; + } + cerr << "error [vg call]: No paths with selected reference " << sample_string << " found. " << "Try using vg paths -M to see which samples are in your graph" << endl; return 1; } diff --git a/src/subcommand/clip_main.cpp b/src/subcommand/clip_main.cpp index ddbd5956862..d219b6dd35b 100644 --- a/src/subcommand/clip_main.cpp +++ b/src/subcommand/clip_main.cpp @@ -8,6 +8,7 @@ #include #include "../clip.hpp" #include +#include "../rgfa.hpp" #include #include @@ -44,6 +45,7 @@ void help_clip(char** argv) { << " -m, --min-fragment-len N Don't write novel path fragment if it is less than N bp long" << endl << " -B, --output-bed Write BED-style file of affected intervals instead of clipped graph. " << endl << " Columns 4-9 are: snarl node-count edge-count shallow-node-count shallow-edge-count avg-degree" << endl + << " -i, --ignore-path-prefix Ignore all paths beginning with given prefix, ex when computing depth [_rGFA_]." << endl << " -t, --threads N number of threads to use [default: all available]" << endl << " -v, --verbose Print some logging messages" << endl << endl; @@ -54,6 +56,7 @@ int main_clip(int argc, char** argv) { string bed_path; string snarls_path; vector ref_prefixes; + vector ignore_prefixes = {RGFACover::rgfa_sample_name}; int64_t min_depth = -1; int64_t min_fragment_len = 0; bool verbose = false; @@ -102,13 +105,14 @@ int main_clip(int argc, char** argv) { {"snarls", required_argument, 0, 'r'}, {"min-fragment-len", required_argument, 0, 'm'}, {"output-bed", no_argument, 0, 'B'}, + {"ignore-path-prefixes", required_argument, 0, 'i'}, {"threads", required_argument, 0, 't'}, {"verbose", required_argument, 0, 'v'}, {0, 0, 0, 0} }; int option_index = 0; - c = getopt_long (argc, argv, "hb:d:sSn:e:N:E:a:l:L:D:c:P:r:m:Bt:v", + c = getopt_long (argc, argv, "hb:d:sSn:e:N:E:a:l:L:D:c:P:r:m:Bi:t:v", long_options, &option_index); // Detect the end of the options. @@ -180,6 +184,9 @@ int main_clip(int argc, char** argv) { case 'B': out_bed = true; break; + case 'i': + ignore_prefixes.push_back(optarg); + break; case 'v': verbose = true; break; @@ -336,13 +343,24 @@ int main_clip(int argc, char** argv) { } if (min_depth >= 0) { + // compute paths to ignore + unordered_set ignore_set; + graph->for_each_path_handle([&](path_handle_t path_handle) { + string path_name = graph->get_path_name(path_handle); + for (const string& ip : ignore_prefixes) { + if (path_name.compare(0, ip.length(), ip) == 0) { + ignore_set.insert(path_handle); + break; + } + } + }); // run the depth clipping if (bed_path.empty()) { // do the whole graph - clip_low_depth_nodes_and_edges(graph.get(), min_depth, ref_prefixes, min_fragment_len, verbose); + clip_low_depth_nodes_and_edges(graph.get(), min_depth, ref_prefixes, min_fragment_len, ignore_set, verbose); } else { // do the contained snarls - clip_contained_low_depth_nodes_and_edges(graph.get(), pp_graph, bed_regions, *snarl_manager, false, min_depth, min_fragment_len, verbose); + clip_contained_low_depth_nodes_and_edges(graph.get(), pp_graph, bed_regions, *snarl_manager, false, min_depth, min_fragment_len, ignore_set, verbose); } } else if (max_deletion >= 0) { diff --git a/src/subcommand/depth_main.cpp b/src/subcommand/depth_main.cpp index 54284f67fd5..2fd42cd6ae6 100644 --- a/src/subcommand/depth_main.cpp +++ b/src/subcommand/depth_main.cpp @@ -45,6 +45,7 @@ void help_depth(char** argv) { << " -P, --paths-by STR select the paths with the given name prefix" << endl << " -b, --bin-size N bin size (in bases) [1] (2 extra columns printed when N>1: bin-end-pos and stddev)" << endl << " -m, --min-coverage N ignore nodes with less than N coverage depth [1]" << endl + << " -i, --ignore-prefix P ignore paths with given prefix when computing path coverage [_rGFA_]" << endl << " -t, --threads N number of threads to use [all available]" << endl; } @@ -69,6 +70,7 @@ int main_depth(int argc, char** argv) { bool count_cycles = false; size_t min_coverage = 1; + vector ignore_path_prefixes; int c; optind = 2; // force optind past command positional argument @@ -87,13 +89,14 @@ int main_depth(int argc, char** argv) { {"min-mapq", required_argument, 0, 'Q'}, {"min-coverage", required_argument, 0, 'm'}, {"count-cycles", no_argument, 0, 'c'}, + {"ignore-prefix", required_argument, 0, 'i'}, {"threads", required_argument, 0, 't'}, {"help", no_argument, 0, 'h'}, {0, 0, 0, 0} }; int option_index = 0; - c = getopt_long (argc, argv, "hk:p:P:b:dg:a:n:s:m:ct:", + c = getopt_long (argc, argv, "hk:p:P:b:dg:a:n:s:m:ci:t:", long_options, &option_index); // Detect the end of the options. @@ -138,6 +141,9 @@ int main_depth(int argc, char** argv) { case 'c': count_cycles = true; break; + case 'i': + ignore_path_prefixes.push_back(optarg); + break; case 't': { int num_threads = parse(optarg); @@ -197,6 +203,7 @@ int main_depth(int argc, char** argv) { // we want our paths sorted by the subpath parse so the output is sorted map, string> ref_paths; unordered_set base_path_set; + unordered_set ignore_paths; graph->for_each_path_handle([&](path_handle_t path_handle) { string path_name = graph->get_path_name(path_handle); @@ -222,6 +229,14 @@ int main_depth(int argc, char** argv) { assert(!ref_paths.count(coord)); ref_paths[coord] = path_name; } + if (pack_filename.empty()) { + for (const string& ignore_prefix : ignore_path_prefixes) { + if (path_name.compare(0, ignore_prefix.length(), ignore_prefix) == 0) { + ignore_paths.insert(path_handle); + break; + } + } + } }); for (const auto& ref_name : ref_paths_input_set) { @@ -240,7 +255,8 @@ int main_depth(int argc, char** argv) { if (!pack_filename.empty()) { binned_depth = algorithms::binned_packed_depth(*packer, ref_path, bin_size, min_coverage, count_dels); } else { - binned_depth = algorithms::binned_path_depth(*graph, ref_path, bin_size, min_coverage, count_cycles); + binned_depth = algorithms::binned_path_depth(*graph, ref_path, bin_size, min_coverage, count_cycles, + ignore_paths); } for (auto& bin_cov : binned_depth) { // bins can ben nan if min_coverage filters everything out. just skip @@ -253,7 +269,7 @@ int main_depth(int argc, char** argv) { if (!pack_filename.empty()) { algorithms::packed_depths(*packer, ref_path, min_coverage, cout); } else { - algorithms::path_depths(*graph, ref_path, min_coverage, count_cycles, cout); + algorithms::path_depths(*graph, ref_path, min_coverage, count_cycles, ignore_paths, cout); } } } diff --git a/src/subcommand/paths_main.cpp b/src/subcommand/paths_main.cpp index 1fc293df752..e5621f0c287 100644 --- a/src/subcommand/paths_main.cpp +++ b/src/subcommand/paths_main.cpp @@ -16,6 +16,9 @@ #include "../vg.hpp" #include "../xg.hpp" #include "../gbwt_helper.hpp" +#include "../integrated_snarl_finder.hpp" +#include "../rgfa.hpp" +#include "../io/save_handle_graph.hpp" #include #include #include @@ -37,6 +40,12 @@ void help_paths(char** argv) { << " -V, --extract-vg output a path-only graph covering the selected paths" << endl << " -d, --drop-paths output a graph with the selected paths removed" << endl << " -r, --retain-paths output a graph with only the selected paths retained" << endl + << " rGFA cover" << endl + << " -f, --rgfa-min-length N add rGFA cover to graph, using seleciton from -Q/-S as rank-0 backbone, only adding fragments >= Nbp (default:-1=disabled)" << endl + << " -s, --snarls FILE snarls (from vg snarls) to avoid recomputing. snarls only used for rgfa cover." << endl + << " -t, --threads N use up to N threads when computing rGFA cover (default: all available)" << endl + << " --rgfa-vcf FILE add rGFA cover tags to VCF (which must be called on rGFA cover in graph from -f)" << endl + << " --rgfa-intervals print out table of rgfa intervals and their ranks and positions" << endl << " output path data:" << endl << " -X, --extract-gam print (as GAM alignments) the stored paths in the graph" << endl << " -A, --extract-gaf print (as GAF alignments) the stored paths in the graph" << endl @@ -50,6 +59,7 @@ void help_paths(char** argv) { << " -p, --paths-file FILE select the paths named in a file (one per line)" << endl << " -Q, --paths-by STR select the paths with the given name prefix" << endl << " -S, --sample STR select the haplotypes or reference paths for this sample" << endl + << " --rgfa-paths select all rGFA fragments (must have been added with -R previously)" << endl << " -a, --variant-paths select the variant paths added by 'vg construct -a'" << endl << " -G, --generic-paths select the generic, non-reference, non-haplotype paths" << endl << " -R, --reference-paths select the reference paths" << endl @@ -87,6 +97,11 @@ unordered_map SENSE_TO_STRING { {PathSense::HAPLOTYPE, "HAPLOTYPE"} }; +// Long options with no corresponding short options. +const int OPT_SELECT_RGFA_FRAGMENTS = 1000; +const int OPT_ANNOTATE_RGFA_VCF = 1001; +const int OPT_RGFA_INTERVALS = 1002; + int main_paths(int argc, char** argv) { if (argc == 2) { @@ -101,6 +116,10 @@ int main_paths(int argc, char** argv) { bool extract_as_fasta = false; bool drop_paths = false; bool retain_paths = false; + int64_t rgfa_min_len = -1; + string rgfa_vcf_filename; + bool rgfa_print_intervals = false; + string snarl_filename; string graph_file; string gbwt_file; string path_prefix; @@ -130,6 +149,8 @@ int main_paths(int argc, char** argv) { {"extract-vg", no_argument, 0, 'V'}, {"drop-paths", no_argument, 0, 'd'}, {"retain-paths", no_argument, 0, 'r'}, + {"rgfa-min-length", required_argument, 0, 'f'}, + {"snarls", required_argument, 0, 's'}, {"extract-gam", no_argument, 0, 'X'}, {"extract-gaf", no_argument, 0, 'A'}, {"list", no_argument, 0, 'L'}, @@ -144,17 +165,19 @@ int main_paths(int argc, char** argv) { {"generic-paths", no_argument, 0, 'G'}, {"reference-paths", no_argument, 0, 'R'}, {"haplotype-paths", no_argument, 0, 'H'}, - {"coverage", no_argument, 0, 'c'}, - + {"coverage", no_argument, 0, 'c'}, + {"rgfa-paths", no_argument, 0, OPT_SELECT_RGFA_FRAGMENTS}, + {"rgfa-vcf", required_argument, 0, OPT_ANNOTATE_RGFA_VCF}, + {"rgfa-intervals", no_argument, 0, OPT_RGFA_INTERVALS}, + {"threads", required_argument, 0, 't'}, // Hidden options for backward compatibility. - {"threads", no_argument, 0, 'T'}, {"threads-by", required_argument, 0, 'q'}, {0, 0, 0, 0} }; int option_index = 0; - c = getopt_long (argc, argv, "hLXv:x:g:Q:VEMCFAS:Tq:draGRHp:c", + c = getopt_long (argc, argv, "hLXv:x:g:Q:VEMCFAS:Tq:drf:R:s:aGHp:ct:", long_options, &option_index); // Detect the end of the options. @@ -183,13 +206,22 @@ int main_paths(int argc, char** argv) { case 'd': drop_paths = true; output_formats++; - break; + break; case 'r': retain_paths = true; output_formats++; break; - + + case 'f': + rgfa_min_len = parse(optarg); + output_formats++; + break; + + case 's': + snarl_filename = optarg; + break; + case 'X': extract_as_gam = true; output_formats++; @@ -265,9 +297,31 @@ int main_paths(int argc, char** argv) { output_formats++; break; - case 'T': - std::cerr << "warning: [vg paths] option --threads is obsolete and unnecessary" << std::endl; + case OPT_SELECT_RGFA_FRAGMENTS: + sample_name = RGFACover::rgfa_sample_name; + selection_criteria++; + break; + + case OPT_ANNOTATE_RGFA_VCF: + rgfa_vcf_filename = optarg; + output_formats++; + break; + + case OPT_RGFA_INTERVALS: + rgfa_print_intervals = true; + output_formats++; + break; + + case 't': + { + int num_threads = parse(optarg); + if (num_threads <= 0) { + cerr << "error:[vg call] Thread count (-t) set to " << num_threads << ", must set to a positive integer." << endl; + exit(1); + } + omp_set_num_threads(num_threads); break; + } case 'q': std::cerr << "warning: [vg paths] option --threads-by is deprecated; please use --paths-by" << std::endl; @@ -320,11 +374,11 @@ int main_paths(int argc, char** argv) { } } if (output_formats != 1) { - std::cerr << "error: [vg paths] one output format (-X, -A, -V, -d, -r, -L, -F, -E, -C or -c) must be specified" << std::endl; + std::cerr << "error: [vg paths] one output format (-X, -A, -V, -d, -r, -R, -L, -F, -E, -C, -c, --rgfa-vcf, --rgfa-intervals) must be specified" << std::endl; std::exit(EXIT_FAILURE); } if (selection_criteria > 1) { - std::cerr << "error: [vg paths] multiple selection criteria (-Q, -S, -a, -G/-R/-H, -p) cannot be used" << std::endl; + std::cerr << "error: [vg paths] multiple selection criteria (-Q, -S, -a, -G/-R/-H, -p, --rgfa-paths) cannot be used" << std::endl; std::exit(EXIT_FAILURE); } if (select_alt_paths && !gbwt_file.empty()) { @@ -371,9 +425,23 @@ int main_paths(int argc, char** argv) { exit(1); } } - - - + + // Load or compute the snarls + unique_ptr snarl_manager; + if (rgfa_min_len >= 0) { + if (!snarl_filename.empty()) { + ifstream snarl_file(snarl_filename.c_str()); + if (!snarl_file) { + cerr << "Error [vg paths]: Unable to load snarls file: " << snarl_filename << endl; + return 1; + } + snarl_manager = vg::io::VPKG::load_one(snarl_file); + } else { + IntegratedSnarlFinder finder(*graph); + snarl_manager = unique_ptr(new SnarlManager(std::move(finder.find_snarls_parallel()))); + } + } + set path_names; if (!path_file.empty()) { ifstream path_stream(path_file); @@ -469,7 +537,7 @@ int main_paths(int argc, char** argv) { // Write as a Path in a VG chunk_to_emitter(path, *graph_emitter); } else if (extract_as_fasta) { - write_fasta_sequence(name, path_sequence(*graph, path), cout); + write_fasta_sequence(RGFACover::revert_rgfa_path_name(name, false), path_sequence(*graph, path), cout); } if (list_lengths) { cout << path.name() << "\t" << path_to_length(path) << endl; @@ -566,7 +634,7 @@ int main_paths(int argc, char** argv) { }; - if (drop_paths || retain_paths) { + if (drop_paths || retain_paths || rgfa_min_len >= 0) { MutablePathMutableHandleGraph* mutable_graph = dynamic_cast(graph.get()); if (!mutable_graph) { std::cerr << "error[vg paths]: graph cannot be modified" << std::endl; @@ -578,26 +646,67 @@ int main_paths(int argc, char** argv) { exit(1); } - vector to_destroy; - if (drop_paths) { + set reference_path_names; + if (drop_paths || retain_paths) { + vector to_destroy; + if (drop_paths) { + for_each_selected_path([&](const path_handle_t& path_handle) { + string name = graph->get_path_name(path_handle); + to_destroy.push_back(name); + }); + } else { + for_each_unselected_path([&](const path_handle_t& path_handle) { + string name = graph->get_path_name(path_handle); + to_destroy.push_back(name); + }); + } + for (string& path_name : to_destroy) { + mutable_graph->destroy_path(graph->get_path_handle(path_name)); + } + } else { + assert(rgfa_min_len >= 0); + RGFACover rgfa_cover; + // clean out existing cover + rgfa_cover.clear(mutable_graph); + // load up the rank-0 reference path selection + unordered_set reference_paths; for_each_selected_path([&](const path_handle_t& path_handle) { - string name = graph->get_path_name(path_handle); - to_destroy.push_back(name); + reference_paths.insert(path_handle); + reference_path_names.insert(graph->get_path_name(path_handle)); }); - } else { - for_each_unselected_path([&](const path_handle_t& path_handle) { - string name = graph->get_path_name(path_handle); - to_destroy.push_back(name); - }); - } - for (string& path_name : to_destroy) { - mutable_graph->destroy_path(graph->get_path_handle(path_name)); + // compute the new cover + rgfa_cover.compute(mutable_graph, snarl_manager.get(), reference_paths, rgfa_min_len); + // and write it out to the graph + rgfa_cover.apply(mutable_graph); } // output the graph - serializable_graph->serialize(cout); - } - else if (coverage) { + vg::io::save_handle_graph(graph.get(), std::cout, reference_path_names); + } else if (!rgfa_vcf_filename.empty() || rgfa_print_intervals) { + RGFACover rgfa_cover; + // load up the rank-0 reference path selection + unordered_set reference_paths; + for_each_selected_path([&](const path_handle_t& path_handle) { + reference_paths.insert(path_handle); + }); + // load up the cover (which must have been previously computed and serialized with -f) + rgfa_cover.load(graph.get(), reference_paths); + if (rgfa_cover.get_intervals().size() == 0) { + cerr << "error[vg paths]: no rGFA cover found in graph. Please compute one with -f before annotating a VCF" << endl; + exit(1); + } + if (!rgfa_vcf_filename.empty()) { + vcflib::VariantCallFile variant_file; + variant_file.open(rgfa_vcf_filename); + if (!variant_file.is_open()) { + cerr << "error[vg paths]: unable to open VCF file: " << rgfa_vcf_filename << endl; + exit(1); + } + rgfa_cover.annotate_vcf(variant_file, cout); + } else if (rgfa_print_intervals) { + rgfa_cover.print_stats(cout); + } + } else if (coverage) { // for every node, count the number of unique paths. then add the coverage count to each one // (we're doing the whole graph here, which could be inefficient in the case the user is selecting // a small path) @@ -763,7 +872,8 @@ int main_paths(int argc, char** argv) { } else if (extract_as_vg) { chunk_to_emitter(path, *graph_emitter); } else if (extract_as_fasta) { - write_fasta_sequence(graph->get_path_name(path_handle), path_sequence(*graph, path), cout); + write_fasta_sequence(RGFACover::revert_rgfa_path_name(graph->get_path_name(path_handle), false), + path_sequence(*graph, path), cout); } } }); diff --git a/src/subcommand/surject_main.cpp b/src/subcommand/surject_main.cpp index 8121ababf47..6d4d21e1dfa 100644 --- a/src/subcommand/surject_main.cpp +++ b/src/subcommand/surject_main.cpp @@ -39,6 +39,7 @@ void help_surject(char** argv) { << " -t, --threads N number of threads to use" << endl << " -p, --into-path NAME surject into this path or its subpaths (many allowed, default: reference, then non-alt generic)" << endl << " -F, --into-paths FILE surject into path names listed in HTSlib sequence dictionary or path list FILE" << endl + << " -n, --into-sample NAME surject into pahts from this sample (multiple allowed)" << endl << " -i, --interleaved GAM is interleaved paired-ended, so when outputting HTS formats, pair reads" << endl << " -M, --multimap include secondary alignments to all overlapping paths instead of just primary" << endl << " -G, --gaf-input input file is GAF instead of GAM" << endl @@ -84,6 +85,7 @@ int main_surject(int argc, char** argv) { string xg_name; vector path_names; + vector path_sample_names; string path_file; string output_format = "GAM"; string input_format = "GAM"; @@ -113,6 +115,7 @@ int main_surject(int argc, char** argv) { {"threads", required_argument, 0, 't'}, {"into-path", required_argument, 0, 'p'}, {"into-paths", required_argument, 0, 'F'}, + {"into-sample", required_argument, 0, 'n'}, {"ref-paths", required_argument, 0, 'F'}, // Now an alias for --into-paths {"subpath-local", required_argument, 0, 'l'}, {"interleaved", no_argument, 0, 'i'}, @@ -137,7 +140,7 @@ int main_surject(int argc, char** argv) { }; int option_index = 0; - c = getopt_long (argc, argv, "hx:p:F:liGmcbsN:R:f:C:t:SPa:ALMVw:", + c = getopt_long (argc, argv, "hx:p:F:n:liGmcbsN:R:f:C:t:SPa:ALMVw:", long_options, &option_index); // Detect the end of the options. @@ -158,6 +161,10 @@ int main_surject(int argc, char** argv) { case 'F': path_file = optarg; break; + + case 'n': + path_sample_names.push_back(optarg); + break; case 'l': subpath_global = false; @@ -279,6 +286,11 @@ int main_surject(int argc, char** argv) { // Get the paths to surject into and their length information, either from // the given file, or from the provided list, or from sniffing the graph. + for (const string& sample_name : path_sample_names) { + path_handle_graph->for_each_path_of_sample(sample_name, [&](path_handle_t path_handle) { + path_names.push_back(path_handle_graph->get_path_name(path_handle)); + }); + } vector> sequence_dictionary = get_sequence_dictionary(path_file, path_names, *xgidx); // Clear out path_names so we don't accidentally use it path_names.clear(); diff --git a/test/rgfa/rgfa_ins.gfa b/test/rgfa/rgfa_ins.gfa new file mode 100644 index 00000000000..9751b52ba47 --- /dev/null +++ b/test/rgfa/rgfa_ins.gfa @@ -0,0 +1,15 @@ +H VN:Z:1.0 +S 1 CAAATAAG +S 2 TTT +S 3 TTT +S 4 TTT +S 5 TTT +P x 1+,5+ * +P y 1+,2+,4+,5+ * +P z 1+,2+,3+,4+,5+ * +L 1 + 2 + 0M +L 2 + 3 + 0M +L 2 + 4 + 0M +L 3 + 4 + 0M +L 4 + 5 + 0M +L 1 + 5 + 0M diff --git a/test/rgfa/rgfa_ins2.gfa b/test/rgfa/rgfa_ins2.gfa new file mode 100644 index 00000000000..ef6f1f3055c --- /dev/null +++ b/test/rgfa/rgfa_ins2.gfa @@ -0,0 +1,18 @@ +H VN:Z:1.0 +S 1 CAAATAAG +S 2 TTT +S 3 TTT +S 4 TTT +S 5 TTT +S 6 TTTTTTTTTT +P x 1+,5+ * +P y 1+,2+,6+,4+,5+ * +P z 1+,2+,3+,4+,5+ * +L 1 + 2 + 0M +L 2 + 3 + 0M +L 2 + 4 + 0M +L 3 + 4 + 0M +L 4 + 5 + 0M +L 1 + 5 + 0M +L 2 + 6 + 0M +L 6 + 4 + 0M diff --git a/test/rgfa/rgfa_ins3.gfa b/test/rgfa/rgfa_ins3.gfa new file mode 100644 index 00000000000..861ac58d3b5 --- /dev/null +++ b/test/rgfa/rgfa_ins3.gfa @@ -0,0 +1,18 @@ +H VN:Z:1.0 +S 1 CAAATAAG +S 2 TTT +S 3 TTT +S 4 TTT +S 5 TTT +S 6 TTTTTTTTTT +P x 1+,5+ * +P y 5-,4-,6-,2-,1- * +P z 1+,2+,3+,4+,5+ * +L 1 + 2 + 0M +L 2 + 3 + 0M +L 2 + 4 + 0M +L 3 + 4 + 0M +L 4 + 5 + 0M +L 1 + 5 + 0M +L 2 + 6 + 0M +L 6 + 4 + 0M diff --git a/test/rgfa/rgfa_tiny.gfa b/test/rgfa/rgfa_tiny.gfa new file mode 100644 index 00000000000..498d2e0a6dc --- /dev/null +++ b/test/rgfa/rgfa_tiny.gfa @@ -0,0 +1,38 @@ +H VN:Z:1.0 +S 5 C +S 7 A +S 12 ATAT +S 8 G +S 1 CAAATAAG +S 4 T +S 6 TTG +S 15 CCAACTCTCTG +S 2 A +S 10 A +S 9 AAATTTTCTGGAGTTCTAT +S 11 T +S 13 A +S 14 T +S 3 G +P x 1+,3+,5+,6+,8+,9+,11+,12+,14+,15+ * +P y 1+,2+,4+,6+,8+,9+,10+,12+,13+,15+ * +L 5 + 6 + 0M +L 7 + 9 + 0M +L 12 + 13 + 0M +L 12 + 14 + 0M +L 8 + 9 + 0M +L 1 + 2 + 0M +L 1 + 3 + 0M +L 4 + 6 + 0M +L 6 + 7 + 0M +L 6 + 8 + 0M +L 2 + 4 + 0M +L 2 + 5 + 0M +L 10 + 12 + 0M +L 9 + 10 + 0M +L 9 + 11 + 0M +L 11 + 12 + 0M +L 13 + 15 + 0M +L 14 + 15 + 0M +L 3 + 4 + 0M +L 3 + 5 + 0M diff --git a/test/t/11_vg_paths.t b/test/t/11_vg_paths.t index 3bba6231d56..382e6f740a0 100644 --- a/test/t/11_vg_paths.t +++ b/test/t/11_vg_paths.t @@ -7,7 +7,7 @@ PATH=../bin:$PATH # for vg export LC_ALL="C" # force a consistent sort order -plan tests 20 +plan tests 26 vg construct -r small/x.fa -v small/x.vcf.gz -a > x.vg vg construct -r small/x.fa -v small/x.vcf.gz > x2.vg @@ -61,4 +61,62 @@ is $? 0 "vg path coverage reports correct lengths in first column" rm -f q.vg q.cov.len q.len +vg paths -v rgfa/rgfa_tiny.gfa --rgfa-min-length 1 -Q x | vg convert - -fW > rgfa_tiny.rgfa +printf "P _rGFA_#y[33-34] 10+ * +P _rGFA_#y[38-39] 13+ * +P _rGFA_#y[8-9] 2+,4+ *\n" > rgfa_tiny_expected_fragments.rgfa +grep ^P rgfa_tiny.rgfa | grep rGFA | sort > rgfa_tiny_fragments.rgfa +diff rgfa_tiny_fragments.rgfa rgfa_tiny_expected_fragments.rgfa +is $? 0 "Found the expected rgfa SNP cover of tiny graph" + +rm -f rgfa_tiny.rgfa rgfa_tiny_expected_fragments.rgfa rgfa_tiny_fragments.rgfa + +vg paths -v rgfa/rgfa_ins.gfa --rgfa-min-length 5 -Q x | vg convert - -fW > rgfa_ins.rgfa +printf "P _rGFA_#z[8-11] 2+,3+,4+ *\n" > rgfa_ins_expected_fragments.rgfa +grep ^P rgfa_ins.rgfa | grep _rGFA_ | sort > rgfa_ins_fragments.rgfa +diff rgfa_ins_fragments.rgfa rgfa_ins_expected_fragments.rgfa +is $? 0 "Found the expected rgfa cover for simple nested insertion" + +rm -f rgfa_ins.rgfa rgfa_ins_expected_fragments.rgfa rgfa_ins_fragments.rgfa + +vg paths -v rgfa/rgfa_ins2.gfa --rgfa-min-length 3 -Q x | vg convert - -fW > rgfa_ins2.rgfa +printf "P _rGFA_#y[8-11] 2+,6+,4+ * +P _rGFA_#z[11-14] 3+ *\n" > rgfa_ins2_expected_fragments.rgfa +grep ^P rgfa_ins2.rgfa | grep _rGFA_ | sort > rgfa_ins2_fragments.rgfa +diff rgfa_ins2_fragments.rgfa rgfa_ins2_expected_fragments.rgfa +is $? 0 "Found the expected rgfa cover for simple nested insertion that requires two fragments" + +rm -f rgfa_ins2.rgfa rgfa_ins2_expected_fragments.rgfa rgfa_ins2_fragments.rgfa + +vg paths -v rgfa/rgfa_ins2.gfa --rgfa-min-length 3 -Q x | grep ^S | sort -nk2 > rgfa_ins2.rgfa +printf "S 1 CAAATAAG SN:Z:x SO:i:0 SR:i:0 +S 2 TTT SN:Z:y SO:i:8 SR:i:1 +S 3 TTT SN:Z:z SO:i:11 SR:i:2 +S 4 TTT SN:Z:y SO:i:21 SR:i:1 +S 5 TTT SN:Z:x SO:i:8 SR:i:0 +S 6 TTTTTTTTTT SN:Z:y SO:i:11 SR:i:1\n" > rgfa_ins2_expected_segs.rgfa +diff rgfa_ins2_expected_segs.rgfa rgfa_ins2.rgfa +is $? 0 "Found the expected rgfa tags for simple nested insertion that requires two fragments" + +rm -f rgfa_ins2_expected_segs.rgfa rgfa_ins2.rgfa + +vg paths -v rgfa/rgfa_ins2.gfa --rgfa-min-length 5 -Q x | vg convert - -fW > rgfa_ins2R5.rgfa +printf "P _rGFA_#y[8-11] 2+,6+,4+ *\n" > rgfa_ins2R5_expected_fragments.rgfa +grep ^P rgfa_ins2R5.rgfa | grep _rGFA_ | sort > rgfa_ins2R5_fragments.rgfa +diff rgfa_ins2R5_fragments.rgfa rgfa_ins2R5_expected_fragments.rgfa +is $? 0 "rgfa Minimum fragment length filters out small fragment" + +rm -f rgfa_ins2R5.rgfa rgfa_ins2R5_expected_fragments.rgfa rgfa_ins2R5_fragments.rgfa + +vg paths -v rgfa/rgfa_ins3.gfa --rgfa-min-length 3 -Q x | vg convert - -fW > rgfa_ins3.rgfa +printf "P x 1+,5+ * +P _rGFA_#y[3-6] 4+,6+,2+ * +P y 5-,4+,6+,2+,1- * +P _rGFA_#z[11-14] 3+ * +P z 1+,2-,3+,4-,5+ *\n" | sort > rgfa_ins3_expected_fragments.rgfa +grep ^P rgfa_ins3.rgfa | sort > rgfa_ins3_fragments.rgfa +diff rgfa_ins3_fragments.rgfa rgfa_ins3_expected_fragments.rgfa +is $? 0 "Found the expected rgfa cover for simple nested insertion that has a reversed path that needs forwardization" + +rm -f rgfa_ins3.rgfa rgfa_ins3_expected_fragments.rgfa rgfa_ins3_fragments.rgfa diff --git a/test/t/18_vg_call.t b/test/t/18_vg_call.t index 6a457f4eb9f..64668b15a5c 100644 --- a/test/t/18_vg_call.t +++ b/test/t/18_vg_call.t @@ -6,7 +6,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 19 +plan tests 20 # Toy example of hand-made pileup (and hand inspected truth) to make sure some # obvious (and only obvious) SNPs are detected by vg call @@ -191,6 +191,18 @@ is $? 0 "overriding contig length does not change calls" rm -f x_sub1.fa x_sub1.fa.fai x_sub2.fa x_sub2.fa.fai x_sub1.vcf.gz x_sub1.vcf.gz.tbi x_sub2.vcf.gz x_sub2.vcf.gz.tbi sim.gam x_subs.vcf x_subs_override.vcf x_subs_nocontig.vcf x_subs_override_nocontig.vcf +vg paths -x rgfa/rgfa_ins2.gfa -Q x -f 1 | grep -v ^P | grep -v ^W > rgfa_ins2.rgfa +vg sim -x rgfa_ins2.rgfa -n 30 -s 99 -l 20 -a > rgfa_ins2.gam +vg pack -x rgfa_ins2.rgfa -g rgfa_ins2.gam -o rgfa_ins2.pack +vg call rgfa_ins2.rgfa -k rgfa_ins2.pack -Aa | grep -v "^#" | awk '{print $1 "\t" $2 "\t" $3 "\t" $4 "\t" $5}' > rgfa_ins2.vcf + +printf "x 8 >1>5 G GTTTTTTTTTTTTTTTT +y 11 >2>4 TTTTTTTTTTT T,TTTT\n" > rgfa_ins2.truth +diff rgfa_ins2.vcf rgfa_ins2.truth +is $? 0 "vg call makes expected calls on rgfa references" + +rm -f rgfa_ins2.rgfa rgfa_ins2.gam rgfa_ins2.pack rgfa_ins2.vcf rgfa_ins2.truth + diff --git a/test/t/26_deconstruct.t b/test/t/26_deconstruct.t index aa11d17d092..2f955daedd4 100644 --- a/test/t/26_deconstruct.t +++ b/test/t/26_deconstruct.t @@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 24 +plan tests 26 vg construct -r tiny/tiny.fa -v tiny/tiny.vcf.gz > tiny.vg vg index tiny.vg -x tiny.xg @@ -157,3 +157,10 @@ is "$?" 0 "gbz deconstruction gives same output as gbwt deconstruction" rm -f x.vg x.xg x.gbwt x.decon.vcf.gz x.decon.vcf.gz.tbi x.decon.vcf x.gbz.decon.vcf x.giraffe.gbz x.min x.dist small.s1.h1.fa small.s1.h2.fa decon.s1.h1.fa decon.s1.h2.fa +vg deconstruct rgfa/rgfa_ins2.gfa -P y -ea | grep "^y 11" | awk '{print $1 "\t" $2 "\t" $3 "\t" $4 "\t" $5 "\t" $6}' > rgfa_ins2_y.vcf +vg paths -x rgfa/rgfa_ins2.gfa -f 1 -Q x | vg deconstruct - -P _rGFA_ -ea | grep "^y 11" | awk '{print $1 "\t" $2 "\t" $3 "\t" $4 "\t" $5 "\t" $6}' > rgfa_ins2_rgfa.vcf +diff rgfa_ins2_y.vcf rgfa_ins2_rgfa.vcf +is "$?" 0 "deconstruct properly handles rGFA reference" +is $(cat rgfa_ins2_rgfa.vcf | wc -l) 1 "deconstruct properly handles rGFA reference (sanity check)" + +rm -f rgfa_ins2_y.vcf rgfa_ins2_rgfa.vcf diff --git a/test/t/48_vg_convert.t b/test/t/48_vg_convert.t index bed23a12c41..7eb46260af5 100644 --- a/test/t/48_vg_convert.t +++ b/test/t/48_vg_convert.t @@ -189,7 +189,7 @@ is "$?" 0 "converting gfa and equivalent rgfa produces same output" rm -f tiny.gfa.gfa tiny.rgfa.gfa -is "$(vg convert -g tiny/tiny.rgfa -r 1 | vg paths -M --paths-by y -x - | grep -v "^#" | cut -f1)" "y[35]" "rank-1 rgfa subpath found" +is "$(vg convert -g tiny/tiny.rgfa -r 1 | vg paths -M --paths-by _rGFA_ -x - | grep -v "^#" | cut -f1)" "_rGFA_#y[35-36]" "rank-1 rgfa subpath found" vg convert -g tiny/tiny.rgfa -T gfa-id-mapping.tsv > /dev/null grep ^S tiny/tiny.rgfa | awk '{print $2}' | sort > rgfa_nodes @@ -404,12 +404,13 @@ is "${?}" "0" "GFA -> HashGraph -> GFA conversion preserves path metadata" # We can't do rGFA to GBZ directly until the GBWTGraph GFA parser learns to read tags # rGFA to HashGraph to GBZ to GFA -vg paths -M -x graphs/rgfa_with_reference.rgfa | sort >paths.truth.txt +vg paths -M -x graphs/rgfa_with_reference.rgfa | sed -e 's/-2//g' -e 's/-8//g' | sort >paths.truth.txt vg convert -a graphs/rgfa_with_reference.rgfa > rgfa_with_reference.hg vg gbwt -x rgfa_with_reference.hg -E --gbz-format -g rgfa_with_reference.gbz vg paths -M -x rgfa_with_reference.gbz | sort >paths.gbz.txt cmp paths.gbz.txt paths.truth.txt is "${?}" "0" "rGFA -> HashGraph -> GBZ conversion preserves path metadata" +vg paths -M -x graphs/rgfa_with_reference.rgfa | sort >paths.truth.txt vg convert -f rgfa_with_reference.gbz >extracted.gfa vg paths -M -x extracted.gfa | sort >paths.gfa.txt cmp paths.gfa.txt paths.truth.txt