Skip to content

Commit

Permalink
Merge pull request #199 from vgteam/unpacked-ref-overlay
Browse files Browse the repository at this point in the history
A simpler overlay to speed up reference path queries
  • Loading branch information
adamnovak authored Jun 18, 2024
2 parents d2a7428 + aab874e commit c8f6a12
Show file tree
Hide file tree
Showing 4 changed files with 953 additions and 1 deletion.
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,7 @@ add_library(bdsg_objs OBJECT
${bdsg_DIR}/src/packed_graph.cpp
${bdsg_DIR}/src/packed_path_position_overlay.cpp
${bdsg_DIR}/src/packed_reference_path_overlay.cpp
${bdsg_DIR}/src/reference_path_overlay.cpp
${bdsg_DIR}/src/packed_subgraph_overlay.cpp
${bdsg_DIR}/src/path_position_overlays.cpp
${bdsg_DIR}/src/path_subgraph_overlay.cpp
Expand Down
258 changes: 258 additions & 0 deletions bdsg/include/bdsg/overlays/reference_path_overlay.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,258 @@
//
// reference_path_overlay.hpp
//
// Contains an overlay that adds fast access to paths so they can be used as
// references.
//

#ifndef BDSG_REFERENCE_PATH_OVERLAY_HPP_INCLUDED
#define BDSG_REFERENCE_PATH_OVERLAY_HPP_INCLUDED

#include <unordered_map>

#include <handlegraph/path_position_handle_graph.hpp>
#include <sdsl/bit_vectors.hpp>

#include "bdsg/internal/hash_map.hpp"
#include "bdsg/internal/packed_structs.hpp"

namespace bdsg {

using namespace std;
using namespace handlegraph;

/*
* An overlay that adds fast access to paths in addition to allowing path
* position queries on them. The original graph's handle_t's and path_handle_t's
* remain valid for the overlay, but not the step_handle_t's.
*
* The overlay refers to the backing graph through a raw const pointer (see the
* `graph` member) which the defaulted destructor does not free.
* NOTE(review): presumably the backing graph must therefore outlive the
* overlay -- confirm against the constructor implementation.
*/
class ReferencePathOverlay : public PathPositionHandleGraph {

public:

/// Make an overlay on top of the given graph.
/// NOTE(review): the implementation is not in this header; presumably the
/// pointer is kept in the `graph` member and the per-path indexes declared
/// below are built here -- confirm in the .cpp.
ReferencePathOverlay(const PathHandleGraph* graph);
/// Make an overlay with no backing graph: `graph` is left at its nullptr
/// default and the index members are default-constructed.
ReferencePathOverlay() = default;
/// Defaulted destructor; the backing graph pointer is not deleted.
~ReferencePathOverlay() = default;

////////////////////////////////////////////////////////////////////////////
// HandleGraph interface implementations
////////////////////////////////////////////////////////////////////////////

/// Method to check if a node exists by ID
bool has_node(nid_t node_id) const;

/// Look up the handle for the node with the given ID in the given orientation
handle_t get_handle(const nid_t& node_id, bool is_reverse = false) const;

/// Get the ID from a handle
nid_t get_id(const handle_t& handle) const;

/// Get the orientation of a handle
bool get_is_reverse(const handle_t& handle) const;

/// Invert the orientation of a handle (potentially without getting its ID)
handle_t flip(const handle_t& handle) const;

/// Get the length of a node
size_t get_length(const handle_t& handle) const;

/// Get the sequence of a node, presented in the handle's local forward
/// orientation.
std::string get_sequence(const handle_t& handle) const;

/// Return the number of nodes in the graph
size_t get_node_count() const;

/// Return the smallest ID in the graph, or some smaller number if the
/// smallest ID is unavailable. Return value is unspecified if the graph is empty.
nid_t min_node_id() const;

/// Return the largest ID in the graph, or some larger number if the
/// largest ID is unavailable. Return value is unspecified if the graph is empty.
nid_t max_node_id() const;

////////////////////////////////////////////////////////////////////////////
// Additional HandleGraph interface
////////////////////////////////////////////////////////////////////////////

/// Get the number of edges on the right (go_left = false) or left (go_left
/// = true) side of the given handle. The default implementation is O(n) in
/// the number of edges returned, but graph implementations that track this
/// information more efficiently can override this method.
size_t get_degree(const handle_t& handle, bool go_left) const;

/// Returns true if there is an edge that allows traversal from the left
/// handle to the right handle. By default O(n) in the number of edges
/// on left, but can be overridden with more efficient implementations.
bool has_edge(const handle_t& left, const handle_t& right) const;

/// Return the total number of edges in the graph. If not overridden,
/// counts them all in linear time.
size_t get_edge_count() const;

/// Return the total length of all nodes in the graph, in bp. If not
/// overridden, loops over all nodes in linear time.
size_t get_total_length() const;

/// Returns one base of a handle's sequence, in the orientation of the
/// handle.
char get_base(const handle_t& handle, size_t index) const;

/// Returns a substring of a handle's sequence, in the orientation of the
/// handle. If the indicated substring would extend beyond the end of the
/// handle's sequence, the return value is truncated to the sequence's end.
/// By default O(n) in the size of the handle's sequence, but can be overridden.
std::string get_subsequence(const handle_t& handle, size_t index, size_t size) const;

////////////////////////////////////////////////////////////////////////////
// Path handle interface implementations
////////////////////////////////////////////////////////////////////////////

/// Returns the number of paths stored in the graph
size_t get_path_count() const;

/// Determine if a path name exists and is legal to get a path handle for.
bool has_path(const std::string& path_name) const;

/// Look up the path handle for the given path name.
/// The path with that name must exist.
path_handle_t get_path_handle(const std::string& path_name) const;

/// Look up the name of a path from a handle to it
std::string get_path_name(const path_handle_t& path_handle) const;

/// Look up whether a path is circular
bool get_is_circular(const path_handle_t& path_handle) const;

/// Returns the number of node steps in the path
size_t get_step_count(const path_handle_t& path_handle) const;

/// Get a node handle (node ID and orientation) from a handle to a step on a path
handle_t get_handle_of_step(const step_handle_t& step_handle) const;

/// Returns a handle to the path that a step is on
path_handle_t get_path_handle_of_step(const step_handle_t& step_handle) const;

/// Get a handle to the first step, which will be an arbitrary step in a circular path
/// that we consider "first" based on our construction of the path. If the path is empty,
/// then the implementation must return the same value as path_end().
step_handle_t path_begin(const path_handle_t& path_handle) const;

/// Get a handle to a fictitious position past the end of a path. This position is
/// returned by get_next_step for the final step in a path in a non-circular path.
/// Note: get_next_step will *NEVER* return this value for a circular path.
step_handle_t path_end(const path_handle_t& path_handle) const;

/// Get a handle to the last step, which will be an arbitrary step in a circular path that
/// we consider "last" based on our construction of the path. If the path is empty
/// then the implementation must return the same value as path_front_end().
step_handle_t path_back(const path_handle_t& path_handle) const;

/// Get a handle to a fictitious position before the beginning of a path. This position is
/// returned by get_previous_step for the first step in a path in a non-circular path.
/// Note: get_previous_step will *NEVER* return this value for a circular path.
step_handle_t path_front_end(const path_handle_t& path_handle) const;

/// Returns true if the step is not the last step in a non-circular path.
bool has_next_step(const step_handle_t& step_handle) const;

/// Returns true if the step is not the first step in a non-circular path.
bool has_previous_step(const step_handle_t& step_handle) const;

/// Returns a handle to the next step on the path. If the given step is the final step
/// of a non-circular path, this method has undefined behavior. In a circular path,
/// the "last" step will loop around to the "first" step.
step_handle_t get_next_step(const step_handle_t& step_handle) const;

/// Returns a handle to the previous step on the path. If the given step is the first
/// step of a non-circular path, this method has undefined behavior. In a circular path,
/// it will loop around from the "first" step (i.e. the one returned by path_begin) to
/// the "last" step.
step_handle_t get_previous_step(const step_handle_t& step_handle) const;

////////////////////////////////////////////////////////////////////////////
// PathPositionHandleGraph interface
////////////////////////////////////////////////////////////////////////////

/// Returns the length of a path measured in bases of sequence.
size_t get_path_length(const path_handle_t& path_handle) const;

/// Returns the position along the path of the beginning of this step measured in
/// bases of sequence. In a circular path, positions start at the step returned by
/// path_begin().
size_t get_position_of_step(const step_handle_t& step) const;

/// Returns the step at this position, measured in bases of sequence starting at
/// the step returned by path_begin(). If the position is past the end of the
/// path, returns path_end().
step_handle_t get_step_at_position(const path_handle_t& path,
const size_t& position) const;

protected:

////////////////////////////////////////////////////////////////////////////
// HandleGraph backing implementations
////////////////////////////////////////////////////////////////////////////

/// Loop over all the handles to next/previous (right/left) nodes. Passes
/// them to a callback which returns false to stop iterating and true to
/// continue. Returns true if we finished and false if we stopped early.
bool follow_edges_impl(const handle_t& handle, bool go_left, const std::function<bool(const handle_t&)>& iteratee) const;

/// Loop over all the nodes in the graph in their local forward
/// orientations, in their internal stored order. Stop if the iteratee
/// returns false. Can be told to run in parallel, in which case stopping
/// after a false return value is on a best-effort basis and iteration
/// order is not defined. Returns true if we finished and false if we
/// stopped early.
bool for_each_handle_impl(const std::function<bool(const handle_t&)>& iteratee, bool parallel = false) const;


////////////////////////////////////////////////////////////////////////////
// PathHandleGraph backing implementations
////////////////////////////////////////////////////////////////////////////

/// Execute a function on each path in the graph. If it returns false, stop
/// iteration. Returns true if we finished and false if we stopped early.
///
/// If the graph contains compressed haplotype paths and properly
/// implements for_each_path_of_sense to retrieve them, they should not be
/// visible here. Only reference or generic named paths should be visible.
bool for_each_path_handle_impl(const std::function<bool(const path_handle_t&)>& iteratee) const;

/// Execute a function on each step of a handle in any path. If it
/// returns false, stop iteration. Returns true if we finished and false if
/// we stopped early.
///
/// If the graph contains compressed haplotype paths and properly
/// implements for_each_step_of_sense to find them, they should not be
/// visible here. Only reference or generic named paths should be visible.
bool for_each_step_on_handle_impl(const handle_t& handle,
const std::function<bool(const step_handle_t&)>& iteratee) const;


/// The backing graph this overlay sits on top of; nullptr when the overlay
/// was default-constructed. Raw pointer that the defaulted destructor does
/// not delete.
const PathHandleGraph* graph = nullptr;

/// Index data kept for one path (the mapped type of reference_paths).
/// NOTE(review): the roles described for the members below are inferred from
/// their names and types only -- confirm against the implementation.
struct PathRecord {
public:
PathRecord() = default;
~PathRecord() = default;

/// Packed integer vector; presumably one encoded handle per step, in path
/// order -- TODO confirm.
PackedVector<> steps;
/// Bit vector; presumably spans the path's base offsets with a set bit
/// where each step starts -- TODO confirm.
sdsl::bit_vector offsets;
/// Rank-1 support structure; presumably built over `offsets` to map a base
/// position to a step index -- TODO confirm.
sdsl::bit_vector::rank_1_type offsets_rank;
/// Select-1 support structure; presumably built over `offsets` to map a
/// step index to its starting base position -- TODO confirm.
sdsl::bit_vector::select_1_type offsets_select;
};

/// Per-path index records, keyed by path handle.
/// NOTE(review): presumably populated by the graph-taking constructor --
/// confirm in the implementation.
std::unordered_map<path_handle_t, PathRecord> reference_paths;

// indexed by node ID, the index that the node's steps begin in the step vectors
sdsl::int_vector<> steps_begin;
// The step vectors referred to above.
// NOTE(review): presumably step_1 and step_2 hold the two integer components
// of each stored step handle, at matching indexes -- confirm against the
// implementation.
sdsl::int_vector<> step_1;
sdsl::int_vector<> step_2;
};

}


#endif
Loading

0 comments on commit c8f6a12

Please sign in to comment.