Skip to content

Commit

Permalink
Merge pull request #4396 from vgteam/path-normalize
Browse files Browse the repository at this point in the history
Add normalizer that snaps together redundant path traversals through sites
  • Loading branch information
glennhickey authored Sep 17, 2024
2 parents 4db75c2 + d966fa7 commit 818ee2c
Show file tree
Hide file tree
Showing 5 changed files with 275 additions and 17 deletions.
56 changes: 41 additions & 15 deletions src/subcommand/paths_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
#include "../vg.hpp"
#include "../xg.hpp"
#include "../gbwt_helper.hpp"
#include "../traversal_clusters.hpp"
#include "../io/save_handle_graph.hpp"
#include <vg/io/vpkg.hpp>
#include <vg/io/stream.hpp>
#include <vg/io/alignment_emitter.hpp>
Expand All @@ -37,6 +39,7 @@ void help_paths(char** argv) {
<< " -V, --extract-vg output a path-only graph covering the selected paths" << endl
<< " -d, --drop-paths output a graph with the selected paths removed" << endl
<< " -r, --retain-paths output a graph with only the selected paths retained" << endl
<< " -n, --normalize-paths output a graph where all equivalent paths in a site a merged (using selected paths to snap to if possible)" << endl
<< " output path data:" << endl
<< " -X, --extract-gam print (as GAM alignments) the stored paths in the graph" << endl
<< " -A, --extract-gaf print (as GAF alignments) the stored paths in the graph" << endl
Expand All @@ -53,7 +56,8 @@ void help_paths(char** argv) {
<< " -a, --variant-paths select the variant paths added by 'vg construct -a'" << endl
<< " -G, --generic-paths select the generic, non-reference, non-haplotype paths" << endl
<< " -R, --reference-paths select the reference paths" << endl
<< " -H, --haplotype-paths select the haplotype paths paths" << endl;
<< " -H, --haplotype-paths select the haplotype paths paths" << endl
<< " -t, --threads N number of threads to use [all available]. applies only to snarl finding within -n" << endl;
}

/// Chunk a path and emit it in Graph messages.
Expand Down Expand Up @@ -117,6 +121,7 @@ int main_paths(int argc, char** argv) {
size_t input_formats = 0;
bool coverage = false;
const size_t coverage_bins = 10;
bool normalize_paths = false;

int c;
optind = 2; // force optind past command positional argument
Expand All @@ -130,6 +135,7 @@ int main_paths(int argc, char** argv) {
{"extract-vg", no_argument, 0, 'V'},
{"drop-paths", no_argument, 0, 'd'},
{"retain-paths", no_argument, 0, 'r'},
{"normalize-paths", no_argument, 0, 'n'},
{"extract-gam", no_argument, 0, 'X'},
{"extract-gaf", no_argument, 0, 'A'},
{"list", no_argument, 0, 'L'},
Expand All @@ -154,7 +160,7 @@ int main_paths(int argc, char** argv) {
};

int option_index = 0;
c = getopt_long (argc, argv, "hLXv:x:g:Q:VEMCFAS:Tq:draGRHp:c",
c = getopt_long (argc, argv, "hLXv:x:g:Q:VEMCFAS:Tq:drnaGRHp:ct:",
long_options, &option_index);

// Detect the end of the options.
Expand Down Expand Up @@ -189,6 +195,11 @@ int main_paths(int argc, char** argv) {
retain_paths = true;
output_formats++;
break;

case 'n':
normalize_paths = true;
output_formats++;
break;

case 'X':
extract_as_gam = true;
Expand Down Expand Up @@ -275,6 +286,17 @@ int main_paths(int argc, char** argv) {
selection_criteria++;
break;

case 't':
{
int num_threads = parse<int>(optarg);
if (num_threads <= 0) {
cerr << "error:[vg paths] Thread count (-t) set to " << num_threads << ", must set to a positive integer." << endl;
exit(1);
}
omp_set_num_threads(num_threads);
break;
}

case 'h':
case '?':
help_paths(argv);
Expand Down Expand Up @@ -309,7 +331,7 @@ int main_paths(int argc, char** argv) {
if (!gbwt_file.empty()) {
bool need_graph = (extract_as_gam || extract_as_gaf || extract_as_vg || drop_paths || retain_paths || extract_as_fasta || list_lengths);
if (need_graph && graph_file.empty()) {
std::cerr << "error: [vg paths] a graph is needed for extracting threads in -X, -A, -V, -d, -r, -E or -F format" << std::endl;
std::cerr << "error: [vg paths] a graph is needed for extracting threads in -X, -A, -V, -d, -r, -n, -E or -F format" << std::endl;
std::exit(EXIT_FAILURE);
}
if (!need_graph && !graph_file.empty()) {
Expand All @@ -320,7 +342,7 @@ int main_paths(int argc, char** argv) {
}
}
if (output_formats != 1) {
std::cerr << "error: [vg paths] one output format (-X, -A, -V, -d, -r, -L, -F, -E, -C or -c) must be specified" << std::endl;
std::cerr << "error: [vg paths] one output format (-X, -A, -V, -d, -r, -n, -L, -F, -E, -C or -c) must be specified" << std::endl;
std::exit(EXIT_FAILURE);
}
if (selection_criteria > 1) {
Expand All @@ -335,8 +357,8 @@ int main_paths(int argc, char** argv) {
std::cerr << "error: [vg paths] listing path metadata is not compatible with a GBWT index" << std::endl;
std::exit(EXIT_FAILURE);
}
if ((drop_paths || retain_paths) && !gbwt_file.empty()) {
std::cerr << "error: [vg paths] dropping or retaining paths only works on embedded graph paths, not GBWT threads" << std::endl;
if ((drop_paths || retain_paths || normalize_paths) && !gbwt_file.empty()) {
std::cerr << "error: [vg paths] dropping, retaining or normalizing paths only works on embedded graph paths, not GBWT threads" << std::endl;
std::exit(EXIT_FAILURE);
}
if (coverage && !gbwt_file.empty()) {
Expand Down Expand Up @@ -566,32 +588,36 @@ int main_paths(int argc, char** argv) {

};

if (drop_paths || retain_paths) {
if (drop_paths || retain_paths || normalize_paths) {
MutablePathMutableHandleGraph* mutable_graph = dynamic_cast<MutablePathMutableHandleGraph*>(graph.get());
if (!mutable_graph) {
std::cerr << "error[vg paths]: graph cannot be modified" << std::endl;
exit(1);
}
SerializableHandleGraph* serializable_graph = dynamic_cast<SerializableHandleGraph*>(graph.get());
if (!serializable_graph) {
std::cerr << "error[vg paths]: graph cannot be saved after modification" << std::endl;
exit(1);
}

vector<path_handle_t> to_destroy;
if (drop_paths) {
for_each_selected_path([&](const path_handle_t& path_handle) {
to_destroy.push_back(path_handle);
});
} else {
} else if (retain_paths) {
for_each_unselected_path([&](const path_handle_t& path_handle) {
to_destroy.push_back(path_handle);
});
} else {
assert(normalize_paths);
unordered_set<path_handle_t> selected_paths;
for_each_selected_path([&](const path_handle_t& path_handle) {
selected_paths.insert(path_handle);
});
merge_equivalent_traversals_in_graph(mutable_graph, selected_paths);
}
if (!to_destroy.empty()) {
mutable_graph->destroy_paths(to_destroy);
}
mutable_graph->destroy_paths(to_destroy);

// output the graph
serializable_graph->serialize(cout);
vg::io::save_handle_graph(mutable_graph, cout);
}
else if (coverage) {
// for every node, count the number of unique paths. then add the coverage count to each one
Expand Down
133 changes: 132 additions & 1 deletion src/traversal_clusters.cpp
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
#include "traversal_clusters.hpp"
#include "traversal_finder.hpp"
#include "integrated_snarl_finder.hpp"
#include "snarl_distance_index.hpp"

//#define debug

namespace vg {

using namespace bdsg;

// specialized version of jaccard_coefficient() that weights jaccard by node lenths.
double weighted_jaccard_coefficient(const PathHandleGraph* graph,
Expand Down Expand Up @@ -279,9 +283,136 @@ vector<vector<int>> assign_child_snarls_to_traversals(const PathHandleGraph* gra
return trav_to_child_snarls;
}

static void merge_equivalent_traversals_in_snarl(MutablePathHandleGraph* graph, const unordered_set<path_handle_t>& selected_paths,
PathTraversalFinder& path_trav_finder,
const handle_t& start_handle, const handle_t& end_handle ) {
// find the path traversals through the snarl
vector<Traversal> path_travs;
vector<PathInterval> path_intervals;
std::tie(path_travs, path_intervals) = path_trav_finder.find_path_traversals(start_handle, end_handle);
vector<string> trav_names(path_travs.size());
vector<path_handle_t> trav_paths(path_travs.size());
vector<bool> trav_reversed(path_travs.size());

#ifdef debug
cerr << "snarl " << graph_interval_to_string(graph, start_handle, end_handle) << endl;
#endif

// organize paths by their alleles strings
// todo: we could use a fancier index to save memory here (prob not necessary tho)
unordered_map<string, vector<int64_t>> string_to_travs;
for (int64_t i = 0; i < path_travs.size(); ++i) {
const Traversal& trav = path_travs[i];
string allele;
for (int64_t j = 1; j < trav.size() - 1; ++j) {
allele += toUppercase(graph->get_sequence(trav[j]));
}
string_to_travs[allele].push_back(i);
trav_paths[i] = graph->get_path_handle_of_step(path_intervals[i].first);
trav_names[i] = graph->get_path_name(trav_paths[i]);
trav_reversed[i] = graph->get_is_reverse(graph->get_handle_of_step(path_intervals[i].first)) !=
graph->get_is_reverse(start_handle);
#ifdef debug
cerr << "trav " << i << ": "
<< "n=" << trav_names[i] << " "
<< "i=" << graph_interval_to_string(graph, graph->get_handle_of_step(path_intervals[i].first),
graph->get_handle_of_step(path_intervals[i].second))
<< " t=" << traversal_to_string(graph, trav)
<< " a=" << allele << " r=" << trav_reversed[i] << endl;
#endif
}

// rank the traversals, putting selected ones first otherwise alphabetical
function<bool(int64_t, int64_t)> trav_idx_less = [&](int64_t i, int64_t j) {
bool iref = selected_paths.count(trav_paths[i]);
bool jref = selected_paths.count(trav_paths[j]);
if (iref != jref) {
return iref;
}
return trav_names[i] < trav_names[j];
};


// merge up the paths
for (const auto& allele_travs : string_to_travs) {
if (allele_travs.first.length() > 0 && allele_travs.second.size() > 1) {
const vector<int64_t>& eq_travs = allele_travs.second;
auto canonical_it = std::min_element(eq_travs.begin(), eq_travs.end(), trav_idx_less);
assert(canonical_it != eq_travs.end());
int64_t canonical_idx = *canonical_it;
const Traversal& canonical_trav = path_travs[canonical_idx];
Traversal canonical_trav_flip;
for (auto i = canonical_trav.rbegin(); i != canonical_trav.rend(); ++i) {
canonical_trav_flip.push_back(graph->flip(*i));
}

// edit it into every other path
//
// WARNING: in the case of loops, we could be editing the same path more than once
// so making the big undocumented assumption that step handles outside edit
// are unaffected!!
for (int64_t i = 0; i < eq_travs.size(); ++i) {
int64_t replace_idx = eq_travs[i];
if (replace_idx != canonical_idx && path_travs[replace_idx] != path_travs[canonical_idx]) {
PathInterval interval_to_replace = path_intervals[replace_idx];
if (trav_reversed[replace_idx]) {
std::swap(interval_to_replace.first, interval_to_replace.second);
}
const Traversal& replacement_trav = trav_reversed[replace_idx] ? canonical_trav_flip : canonical_trav;
#ifdef debug
cerr << "editing interval of size " << path_travs[replace_idx].size()
<< " from path " << trav_names[replace_idx] << " with canonical interval of size "
<< replacement_trav.size() << " from path "
<< trav_names[canonical_idx] << endl
<< "--interval to replace: "
<< graph_interval_to_string(graph, graph->get_handle_of_step(interval_to_replace.first),
graph->get_handle_of_step(interval_to_replace.second)) << endl
<< "--interval coming in: " << traversal_to_string(graph, replacement_trav) << endl;

#endif
assert(graph->get_handle_of_step(interval_to_replace.first) == replacement_trav.front());
assert(graph->get_handle_of_step(interval_to_replace.second) == replacement_trav.back());
graph->rewrite_segment(interval_to_replace.first, graph->get_next_step(interval_to_replace.second),
replacement_trav);
}
}
}
}
}

void merge_equivalent_traversals_in_graph(MutablePathHandleGraph* graph, const unordered_set<path_handle_t>& selected_paths) {
// compute the snarls
SnarlDistanceIndex distance_index;
{
IntegratedSnarlFinder snarl_finder(*graph);
// todo: why can't I pass in 0 below -- I don't want any dinstances!
fill_in_distance_index(&distance_index, graph, &snarl_finder, 1);
}

// only consider embedded paths that span snarl
PathTraversalFinder path_trav_finder(*graph);

// do every snarl top-down. this is because it's possible (tho probably rare) for a child snarl to
// be redundant after normalizing its parent. don't think the opposite (normalizing child)
// causes redundant parent.. todo: can we guarantee?!
net_handle_t root = distance_index.get_root();
deque<net_handle_t> queue = {root};

while (!queue.empty()) {
net_handle_t net_handle = queue.front();
queue.pop_front();
if (distance_index.is_snarl(net_handle)) {
net_handle_t start_bound = distance_index.get_bound(net_handle, false, true);
net_handle_t end_bound = distance_index.get_bound(net_handle, true, false);
handle_t start_handle = distance_index.get_handle(start_bound, graph);
handle_t end_handle = distance_index.get_handle(end_bound, graph);
merge_equivalent_traversals_in_snarl(graph, selected_paths, path_trav_finder, start_handle, end_handle);
}
if (net_handle == root || distance_index.is_snarl(net_handle) || distance_index.is_chain(net_handle)) {
distance_index.for_each_child(net_handle, [&](net_handle_t child_handle) {
queue.push_back(child_handle);
});
}
}
}

}
11 changes: 11 additions & 0 deletions src/traversal_clusters.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -90,4 +90,15 @@ vector<vector<int>> assign_child_snarls_to_traversals(const PathHandleGraph* gra
const vector<Traversal>& traversals,
const vector<pair<handle_t, handle_t>>& child_snarls);

/// For every top-level snarl in the graph, compute the traversal strings of every embedded path that spans it
/// If two or more traversals share an allele string, then a "canoncial" path is chosen and all remaining paths
/// are edited so that they share the exact same interval through the snarl as the canonical path's traversal.
/// A path is considered "canoncial" if it's in the "selected_paths" and the other paths are not
/// (otherwise the lowest name is used as a fallback)
///
/// Note: this doesn't modify the graph toplogy, so uncovered nodes and edges as a result of path editing
/// would usually need removale with vg clip afterwards
///
void merge_equivalent_traversals_in_graph(MutablePathHandleGraph* graph, const unordered_set<path_handle_t>& selected_paths);

}
47 changes: 47 additions & 0 deletions test/graphs/path_norm_test.gfa
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
H VN:Z:1.1
S 5 C
S 7 A
S 12 ATAT
S 8 G
S 1 CAAATAAG
S 4 T
S 6 TTG
S 15 CCAACTCTCTG
S 2 A
S 35 GT
S 53 AC
S 10 A
S 9 AAATTTTCTGGAGTTCTAT
S 11 T
S 13 A
S 14 T
S 3 G
P x1 1+,3+,5+,6+,8+,9+,11+,12+,14+,15+ *
P x2 1+,3+,4+,6+,8+,9+,11+,12+,14+,15+ *
P x3 1+,35+,6+,8+,9+,11+,12+,14+,15+ *
P x4 15-,14-,12-,11-,9-,8-,6-,35-,1- *
P x5 1+,53-,6+,8+,9+,11+,12+,14+,15+ *
L 5 + 6 + 0M
L 7 + 9 + 0M
L 12 + 13 + 0M
L 12 + 14 + 0M
L 8 + 9 + 0M
L 1 + 2 + 0M
L 1 + 3 + 0M
L 1 + 35 + 0M
L 1 + 53 - 0M
L 4 + 6 + 0M
L 35 + 6 + 0M
L 53 - 6 + 0M
L 6 + 7 + 0M
L 6 + 8 + 0M
L 2 + 4 + 0M
L 2 + 5 + 0M
L 10 + 12 + 0M
L 9 + 10 + 0M
L 9 + 11 + 0M
L 11 + 12 + 0M
L 13 + 15 + 0M
L 14 + 15 + 0M
L 3 + 4 + 0M
L 3 + 5 + 0M
Loading

1 comment on commit 818ee2c

@adamnovak
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

vg CI tests complete for merge to master. View the full report here.

16 tests passed, 0 tests failed and 0 tests skipped in 17335 seconds

Please sign in to comment.