Skip to content

Commit

Permalink
Merge pull request #205 from vgteam/hint-ref-paths
Browse files Browse the repository at this point in the history
Allow hinting extra paths to index as reference paths
  • Loading branch information
adamnovak authored Oct 29, 2024
2 parents 21dd9b2 + e98cda2 commit 14f9d7b
Show file tree
Hide file tree
Showing 5 changed files with 74 additions and 13 deletions.
11 changes: 7 additions & 4 deletions bdsg/include/bdsg/overlays/overlay_helper.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,10 +66,11 @@ template<typename T, typename U, typename V>
class OverlayHelper {
public:
// Handle non-const base graph
T* apply(V* input_graph) {
template <typename ...Params>
T* apply(V* input_graph, Params&&... params) {
auto mutable_overlaid = dynamic_cast<T*>(input_graph);
if (mutable_overlaid == nullptr) {
overlay = make_unique<U>(input_graph);
overlay = make_unique<U>(input_graph, std::forward<Params>(params)...);
mutable_overlaid = dynamic_cast<T*>(overlay.get());
assert(mutable_overlaid != nullptr);
}
Expand All @@ -78,10 +79,11 @@ class OverlayHelper {
}

// Handle const base graph
const T* apply(const V* input_graph) {
template <typename ...Params>
const T* apply(const V* input_graph, Params&&... params) {
overlaid = dynamic_cast<const T*>(input_graph);
if (overlaid == nullptr) {
overlay = make_unique<U>(input_graph);
overlay = make_unique<U>(input_graph, std::forward<Params>(params)...);
overlaid = dynamic_cast<T*>(overlay.get());
assert(overlaid != nullptr);
}
Expand All @@ -100,6 +102,7 @@ class OverlayHelper {
/// Implementation of overlay helper functionality for when multiple overlays need to be stacked.
// There must be a way to generalize with variadic templates
// (I had trouble chaining the output of the nested overlays together and getting the types right when trying)
// TODO: Add support for passing overlay constructor arguments through.
template<typename T1, typename U1, typename V1, typename T2, typename U2, typename V2>
class PairOverlayHelper {
public:
Expand Down
5 changes: 4 additions & 1 deletion bdsg/include/bdsg/overlays/packed_path_position_overlay.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,10 @@ using namespace handlegraph;

/*
* An overlay that adds the PathPositionHandleGraph interface to a static PathHandleGraph
* by augmenting it with compressed index data structures
* by augmenting it with compressed index data structures.
*
* TODO: Make the overlay transparent so that paths hidden in the base graph
* remain accessible through the path metadata queries.
*/
class PackedPositionOverlay : public PathPositionHandleGraph, public ExpandingOverlayGraph {

Expand Down
8 changes: 8 additions & 0 deletions bdsg/include/bdsg/overlays/packed_reference_path_overlay.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,14 @@ using namespace handlegraph;
/*
* An overlay that adds fast access to paths in addition to allowing path
* position queries on them.
*
* TODO: Won't work properly with paths hidden from for_each_path_handle on the
* backing graph, since they won't be indexed but we also won't pass any kind
* of queries through to the backing graph for queries we expect to be able to
* fulfil from the index. Unlike in PackedPositionOverlay, we now expect the
* index to have some path data in it, not just offset tables that we wouldn't
* expect to use for hidden (i.e., haplotype) paths. We should make the overlay
* transparent so hidden paths work properly, or remove hidden paths.
*/
class PackedReferencePathOverlay : public PackedPositionOverlay {

Expand Down
18 changes: 17 additions & 1 deletion bdsg/include/bdsg/overlays/reference_path_overlay.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#define BDSG_REFERENCE_PATH_OVERLAY_HPP_INCLUDED

#include <unordered_map>
#include <unordered_set>

#include <handlegraph/path_position_handle_graph.hpp>
#include <sdsl/bit_vectors.hpp>
Expand All @@ -25,12 +26,27 @@ using namespace handlegraph;
* An overlay that adds fast access to paths in addition to allowing path
* position queries on them. The original graph's handle_t's and path_handle_t's
* remain valid for the overlay, but not the step_t's.
*
* Note that paths that are not indexed as reference paths (i.e. those that are
* hidden from for_each_path_handle by the backing graph and not listed in
* extra_path_names on construction) *will not be accessible through the
* overlay*! You can ask if they exist, but trying to get handles to steps on
* them will not work. To actually look at them you will need to go back to the
* base graph.
*
* TODO: Make the overlay transparent so that paths that don't get indexed in
* the overlay remain accessible, but without (the fast versions of?) the
* position queries.
*/
class ReferencePathOverlay : public PathPositionHandleGraph {

public:

ReferencePathOverlay(const PathHandleGraph* graph);
/// Create a ReferencePathOverlay indexing all non-hidden paths in the
/// backing graph (which show up in for_each_path_handle()). For path names
/// in extra_path_names, look them up and index them too, even if they are
/// hidden.
ReferencePathOverlay(const PathHandleGraph* graph, const std::unordered_set<std::string>& extra_path_names = {});
ReferencePathOverlay() = default;
~ReferencePathOverlay() = default;

Expand Down
45 changes: 38 additions & 7 deletions bdsg/src/reference_path_overlay.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,25 +6,56 @@
#include <omp.h>

#include <handlegraph/util.hpp>
#include <handlegraph/types.hpp>

namespace bdsg {

using namespace std;
using namespace handlegraph;

ReferencePathOverlay::ReferencePathOverlay(const PathHandleGraph* graph) : graph(graph) {
ReferencePathOverlay::ReferencePathOverlay(const PathHandleGraph* graph, const std::unordered_set<std::string>& extra_path_names) : graph(graph) {

// init the base hash table and gather path handles
uint64_t max_path_handle = 0;
std::vector<path_handle_t> path_handles;
// Get step counts for all paths we want to process, once.
std::unordered_map<path_handle_t, size_t> cached_step_counts;
graph->for_each_path_handle([&](const path_handle_t& path) {
// Find and measure all the non-hidden paths.
// TODO: If we made the overlay transparent so we could access paths
// that didn't get indexed, we wouldn't be weirdly indexing haplotype
// paths from backends that don't hide them in the "reference" path
// overlay.
cached_step_counts[path] = graph->get_step_count(path);
});
for (auto& path_name : extra_path_names) {
// Also index hidden paths that the user is asking for by name.
if (graph->has_path(path_name)) {
// The graph actually has this path.
path_handle_t path = graph->get_path_handle(path_name);
auto found = cached_step_counts.find(path);
if (found == cached_step_counts.end()) {
// And it wasn't already found by for_each_path_handle (i.e. it's not already reference sense).
// Count steps and remember it
cached_step_counts.emplace_hint(found, path, graph->get_step_count(path));
}
}
}

// Now use the cache as a source of truth and make a vector of the paths.
std::vector<path_handle_t> path_handles;
// We also track the numerically max path handle
uint64_t max_path_handle = 0;
for (auto& handle_and_length : cached_step_counts) {
const path_handle_t& path = handle_and_length.first;
path_handles.push_back(path);

// Each of the paths needs a PathRecord
reference_paths.insert(pair<path_handle_t, PathRecord>(path, PathRecord()));
// And needs to be maxed into the max handles.
max_path_handle = std::max<uint64_t>(max_path_handle, handlegraph::as_integer(path));
});
}

// sort in descending order by length to limit parallel scheduling makespan
std::sort(path_handles.begin(), path_handles.end(), [&](path_handle_t a, path_handle_t b) {
return graph->get_step_count(a) > graph->get_step_count(b);
return cached_step_counts.at(a) > cached_step_counts.at(b);
});

std::vector<std::atomic<size_t>> num_steps(graph->max_node_id() + 1);
Expand All @@ -35,7 +66,7 @@ ReferencePathOverlay::ReferencePathOverlay(const PathHandleGraph* graph) : graph
auto& path_record = reference_paths.at(path);

// init the step vectors
size_t path_size = graph->get_step_count(path);
size_t path_size = cached_step_counts.at(path);
path_record.steps.resize(path_size);

// record the steps and the path length
Expand Down

0 comments on commit 14f9d7b

Please sign in to comment.