Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement batch delete functions for paths #201

Merged
merged 1 commit into from
Jul 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions bdsg/include/bdsg/hash_graph.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -269,6 +269,11 @@ class HashGraph : public MutablePathDeletableHandleGraph, public SerializableHan
*/
void destroy_path(const path_handle_t& path);

/**
* Destroy the given set of paths. Invalidates handles to all the paths and their steps.
*/
void destroy_paths(const std::vector<path_handle_t>& paths);

/**
* Create a path with the given name. The caller must ensure that no path
* with the given name exists already, or the behavior is undefined.
Expand Down
103 changes: 66 additions & 37 deletions bdsg/include/bdsg/internal/base_packed_graph.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -333,6 +333,11 @@ class BasePackedGraph {
* Destroy the given path. Invalidates handles to the path and its node steps.
*/
void destroy_path(const path_handle_t& path);

/**
* Destroy the given set of paths. Invalidates handles to all the paths and their steps.
*/
void destroy_paths(const std::vector<path_handle_t>& paths);

/**
* Create a path with the given name. The caller must ensure that no path
Expand Down Expand Up @@ -2607,52 +2612,76 @@ string BasePackedGraph<Backend>::decode_path_name(const int64_t& path_idx) const

template<typename Backend>
void BasePackedGraph<Backend>::destroy_path(const path_handle_t& path) {
destroy_paths({path});
}

template<typename Backend>
void BasePackedGraph<Backend>::destroy_paths(const std::vector<path_handle_t>& paths) {

PackedPath& packed_path = paths.at(as_integer(path));
std::unordered_set<path_handle_t> paths_set(paths.begin(), paths.end());

// remove node membership records corresponding to this path
bool first_iter = true;
for (uint64_t step_offset = path_head_iv.get(as_integer(path));
step_offset != 0 && (step_offset != path_head_iv.get(as_integer(path)) || first_iter);
step_offset = get_step_next(packed_path, step_offset)) {

uint64_t trav = get_step_trav(packed_path, step_offset);
size_t node_member_idx = graph_index_to_node_member_index(graph_iv_index(decode_traversal(trav)));
PackedSet<Backend> nodes_visited;

for (const auto& path : paths) {

// find a membership record for this path
size_t prev = 0;
size_t here = path_membership_node_iv.get(node_member_idx);
while (as_path_handle(get_membership_path(here)) != path) {
prev = here;
here = get_next_membership(here);
// note: we don't need to be careful about getting the exact corresponding step since this node
// should try to delete a membership record exactly as many times as it occurs on this path -- all of
// the records will get deleted
}
PackedPath& packed_path = this->paths.at(as_integer(path));

if (prev == 0) {
// this was the first record, set following one to be the head
path_membership_node_iv.set(node_member_idx, get_next_membership(here));
}
else {
// make the link from the previous record skip over the current one
set_next_membership(prev, get_next_membership(here));
// remove node membership records corresponding to this path
bool first_iter = true;
for (uint64_t step_offset = path_head_iv.get(as_integer(path));
step_offset != 0 && (step_offset != path_head_iv.get(as_integer(path)) || first_iter);
step_offset = get_step_next(packed_path, step_offset)) {

uint64_t trav = get_step_trav(packed_path, step_offset);
// if there are multiple paths, we check for whether we've gone over the same
// node multiple times (which would be wasteful)
if (paths.size() > 1) {
nid_t node_id = get_id(decode_traversal(trav));
if (nodes_visited.find(node_id)) {
continue;
}
nodes_visited.insert(node_id);
}

size_t node_member_idx = graph_index_to_node_member_index(graph_iv_index(decode_traversal(trav)));

// find a membership record for this path
size_t prev = 0;
size_t here = path_membership_node_iv.get(node_member_idx);
while (here) {
if (paths_set.count(as_path_handle(get_membership_path(here)))) {
// this is a membership record for a path that we're deleting
if (prev == 0) {
// this was the first record, set following one to be the head
path_membership_node_iv.set(node_member_idx, get_next_membership(here));
}
else {
// make the link from the previous record skip over the current one
set_next_membership(prev, get_next_membership(here));
}

++deleted_membership_records;
}
else {
prev = here;
}

here = get_next_membership(here);
}

first_iter = false;
}

++deleted_membership_records;
path_id.erase(extract_encoded_path_name(as_integer(path)));

first_iter = false;
path_is_deleted_iv.set(as_integer(path), true);
packed_path.steps_iv.clear();
packed_path.links_iv.clear();
path_head_iv.set(as_integer(path), 0);
path_tail_iv.set(as_integer(path), 0);
path_deleted_steps_iv.set(as_integer(path), 0);
}

path_id.erase(extract_encoded_path_name(as_integer(path)));

path_is_deleted_iv.set(as_integer(path), true);
packed_path.steps_iv.clear();
packed_path.links_iv.clear();
path_head_iv.set(as_integer(path), 0);
path_tail_iv.set(as_integer(path), 0);
path_deleted_steps_iv.set(as_integer(path), 0);

defragment();
}

Expand Down
10 changes: 4 additions & 6 deletions bdsg/include/bdsg/internal/packed_structs.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1370,9 +1370,6 @@ inline uint64_t PackedSet<Backend>::optimal_anchor() const {

template<typename Backend>
inline void PackedSet<Backend>::rehash(bool shrink) {
// Keeping the RNG state in the class isn't bvery portable, so we make one
// per thread in thread storage.
static thread_local default_random_engine gen(random_device{}());

// move to the next size in the schedule
if (shrink) {
Expand All @@ -1390,6 +1387,10 @@ inline void PackedSet<Backend>::rehash(bool shrink) {
PackedVec new_table;
new_table.resize(bdsg_packed_set_size_schedule[schedule_val]);

// we use random-at-coding-time linear congruential generator to get a pseudo-random seed
// that makes the coefficients in a deterministic, system-independent way
uint64_t seed = 17340519333326003581ull * (schedule_val + 14057138822558802453ull) + 6918838906415272680ull;
std::mt19937_64 gen(seed);
std::uniform_int_distribution<uint64_t> distr(0, new_table.size() - 1);
for (size_t i = 0; i < 5; ++i) {
coefs[i] = distr(gen);
Expand Down Expand Up @@ -1511,9 +1512,6 @@ inline void PackedSet<Backend>::set_load_factors(double min_load_factor, double
assert(max_load_factor < 1.0);
min_load = min_load_factor;
max_load = max_load_factor;
assert(max_load > min_load);
assert(min_load >= 0.0);
assert(max_load < 1.0);
if (table.size() > 0) {
while (num_items <= min_load * table.size()) {
rehash(true);
Expand Down
53 changes: 37 additions & 16 deletions bdsg/src/hash_graph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -592,25 +592,46 @@ namespace bdsg {
}
return true;
}

void HashGraph::destroy_path(const path_handle_t& path) {
destroy_paths({path});
}

void HashGraph::destroy_paths(const vector<path_handle_t>& paths) {

// remove the records of nodes occurring on this path
for_each_step_in_path(path, [&](const step_handle_t& step) {
path_mapping_t* mapping = (path_mapping_t*) intptr_t(as_integers(step)[1]);
vector<path_mapping_t*>& node_occs = graph[get_id(mapping->handle)].occurrences;
for (size_t i = 0; i < node_occs.size(); i++) {
if (node_occs[i] == mapping) {
node_occs[i] = node_occs.back();
node_occs.pop_back();
break;
}
}
});
unordered_set<int64_t> path_ids;
for (auto path : paths) {
path_ids.emplace(as_integer(path));
}
unordered_set<nid_t> nodes_visited;

// erase the path itself
path_id.erase(paths[as_integer(path)].name);
paths.erase(as_integer(path));
for (auto path : paths) {
// remove the records of nodes occurring on this path
for_each_step_in_path(path, [&](const step_handle_t& step) {
path_mapping_t* mapping = (path_mapping_t*) intptr_t(as_integers(step)[1]);
if (paths.size() > 1) {
bool did_insert = nodes_visited.insert(get_id(mapping->handle)).second;
if (!did_insert) {
// we've already deleted on this node
return;
}
}
vector<path_mapping_t*>& node_occs = graph[get_id(mapping->handle)].occurrences;
for (size_t i = 0; i < node_occs.size(); ) {
if (path_ids.count(node_occs[i]->path_id)) {
node_occs[i] = node_occs.back();
node_occs.pop_back();
}
else {
++i;
}
}
});

// erase the path itself
path_id.erase(this->paths[as_integer(path)].name);
this->paths.erase(as_integer(path));
}
}

path_handle_t HashGraph::create_path_handle(const string& name, bool is_circular) {
Expand Down
67 changes: 67 additions & 0 deletions bdsg/src/test_libbdsg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2285,6 +2285,73 @@ void test_deletable_handle_graphs() {
}
}

// batch deletion of paths works as expected
{
vector<MutablePathDeletableHandleGraph*> implementations;

// Add implementations

PackedGraph pg;
implementations.push_back(&pg);

HashGraph hg;
implementations.push_back(&hg);

MappedPackedGraph mpg;
implementations.push_back(&mpg);

for (int imp = 0; imp < implementations.size(); ++imp) {

MutablePathDeletableHandleGraph& graph = *implementations[imp];

auto h1 = graph.create_handle("A");
auto h2 = graph.create_handle("A");
auto h3 = graph.create_handle("A");

graph.create_edge(h1, h2);
graph.create_edge(h2, h3);

auto p1 = graph.create_path_handle("1");
auto p2 = graph.create_path_handle("2");
auto p3 = graph.create_path_handle("3");
auto p4 = graph.create_path_handle("4");
auto p5 = graph.create_path_handle("5");

for (const auto& p : {p1, p2, p3, p4, p5}) {
for (auto h : {h1, h2, h3}) {
graph.append_step(p, h);
}
}

graph.destroy_paths({p1, p3, p4});

set<path_handle_t> paths_seen;
set<path_handle_t> paths_expected{p2, p5};
graph.for_each_path_handle([&](const path_handle_t& path) {
assert(!paths_seen.count(path));
paths_seen.insert(path);
std::vector<handle_t> handles;
std::vector<handle_t> handles_expected{h1, h2, h3};
for (auto h : graph.scan_path(path)) {
handles.push_back(h);
}
assert(handles == handles_expected);
});

assert(paths_seen == paths_expected);

graph.for_each_handle([&](const handle_t& h) {
set<path_handle_t> paths;
graph.for_each_step_on_handle(h, [&](const step_handle_t& step) {
auto p = graph.get_path_handle_of_step(step);
assert(!paths.count(p));
paths.insert(p);
});
assert(paths_seen == paths_expected);
});
}
}

cerr << "DeletableHandleGraph tests successful!" << endl;
}

Expand Down
Loading