From 39cb4fd944cd6131a03589a6eef5af9b05b5d749 Mon Sep 17 00:00:00 2001 From: Jordan Eizenga Date: Fri, 19 Jul 2024 15:27:42 -0700 Subject: [PATCH] implement batch delete for paths --- bdsg/deps/libhandlegraph | 2 +- bdsg/include/bdsg/hash_graph.hpp | 5 + .../bdsg/internal/base_packed_graph.hpp | 103 +++++++++++------- bdsg/include/bdsg/internal/packed_structs.hpp | 10 +- bdsg/src/hash_graph.cpp | 53 ++++++--- bdsg/src/test_libbdsg.cpp | 67 ++++++++++++ 6 files changed, 180 insertions(+), 60 deletions(-) diff --git a/bdsg/deps/libhandlegraph b/bdsg/deps/libhandlegraph index a9e33664..ccc11459 160000 --- a/bdsg/deps/libhandlegraph +++ b/bdsg/deps/libhandlegraph @@ -1 +1 @@ -Subproject commit a9e33664ab73a3751f139bc4a7d07098de9e8515 +Subproject commit ccc11459019680ae53b65bcc19077d41ecff3075 diff --git a/bdsg/include/bdsg/hash_graph.hpp b/bdsg/include/bdsg/hash_graph.hpp index abd39a8d..2c463307 100644 --- a/bdsg/include/bdsg/hash_graph.hpp +++ b/bdsg/include/bdsg/hash_graph.hpp @@ -269,6 +269,11 @@ class HashGraph : public MutablePathDeletableHandleGraph, public SerializableHan */ void destroy_path(const path_handle_t& path); + /** + * Destroy the given set of paths. Invalidates handles to all the paths and their steps. + */ + void destroy_paths(const std::vector& paths); + /** * Create a path with the given name. The caller must ensure that no path * with the given name exists already, or the behavior is undefined. diff --git a/bdsg/include/bdsg/internal/base_packed_graph.hpp b/bdsg/include/bdsg/internal/base_packed_graph.hpp index 81a3f94e..39cc7020 100644 --- a/bdsg/include/bdsg/internal/base_packed_graph.hpp +++ b/bdsg/include/bdsg/internal/base_packed_graph.hpp @@ -333,6 +333,11 @@ class BasePackedGraph { * Destroy the given path. Invalidates handles to the path and its node steps. */ void destroy_path(const path_handle_t& path); + + /** + * Destroy the given set of paths. Invalidates handles to all the paths and their steps. + */ + void destroy_paths(const std::vector& paths); /** * Create a path with the given name. The caller must ensure that no path @@ -2607,52 +2612,76 @@ string BasePackedGraph::decode_path_name(const int64_t& path_idx) const template void BasePackedGraph::destroy_path(const path_handle_t& path) { + destroy_paths({path}); +} + +template +void BasePackedGraph::destroy_paths(const std::vector& paths) { - PackedPath& packed_path = paths.at(as_integer(path)); + std::unordered_set paths_set(paths.begin(), paths.end()); - // remove node membership records corresponding to this path - bool first_iter = true; - for (uint64_t step_offset = path_head_iv.get(as_integer(path)); - step_offset != 0 && (step_offset != path_head_iv.get(as_integer(path)) || first_iter); - step_offset = get_step_next(packed_path, step_offset)) { - - uint64_t trav = get_step_trav(packed_path, step_offset); - size_t node_member_idx = graph_index_to_node_member_index(graph_iv_index(decode_traversal(trav))); + PackedSet nodes_visited; + + for (const auto& path : paths) { - // find a membership record for this path - size_t prev = 0; - size_t here = path_membership_node_iv.get(node_member_idx); - while (as_path_handle(get_membership_path(here)) != path) { - prev = here; - here = get_next_membership(here); - // note: we don't need to be careful about getting the exact corresponding step since this node - // should try to delete a membership record exactly as many times as it occurs on this path -- all of - // the records will get deleted - } + PackedPath& packed_path = this->paths.at(as_integer(path)); - if (prev == 0) { - // this was the first record, set following one to be the head - path_membership_node_iv.set(node_member_idx, get_next_membership(here)); - } - else { - // make the link from the previous record skip over the current one - set_next_membership(prev, get_next_membership(here)); + // remove node membership records corresponding to this path + bool first_iter = true; + for (uint64_t step_offset = path_head_iv.get(as_integer(path)); + step_offset != 0 && (step_offset != path_head_iv.get(as_integer(path)) || first_iter); + step_offset = get_step_next(packed_path, step_offset)) { + + uint64_t trav = get_step_trav(packed_path, step_offset); + // if there are multiple paths, we check for whether we've gone over the same + // node multiple times (which would be wasteful) + if (paths.size() > 1) { + nid_t node_id = get_id(decode_traversal(trav)); + if (nodes_visited.find(node_id)) { + continue; + } + nodes_visited.insert(node_id); + } + + size_t node_member_idx = graph_index_to_node_member_index(graph_iv_index(decode_traversal(trav))); + + // find a membership record for this path + size_t prev = 0; + size_t here = path_membership_node_iv.get(node_member_idx); + while (here) { + if (paths_set.count(as_path_handle(get_membership_path(here)))) { + // this is a membership record for a path that we're deleting + if (prev == 0) { + // this was the first record, set following one to be the head + path_membership_node_iv.set(node_member_idx, get_next_membership(here)); + } + else { + // make the link from the previous record skip over the current one + set_next_membership(prev, get_next_membership(here)); + } + + ++deleted_membership_records; + } + else { + prev = here; + } + + here = get_next_membership(here); + } + + first_iter = false; } - ++deleted_membership_records; + path_id.erase(extract_encoded_path_name(as_integer(path))); - first_iter = false; + path_is_deleted_iv.set(as_integer(path), true); + packed_path.steps_iv.clear(); + packed_path.links_iv.clear(); + path_head_iv.set(as_integer(path), 0); + path_tail_iv.set(as_integer(path), 0); + path_deleted_steps_iv.set(as_integer(path), 0); } - path_id.erase(extract_encoded_path_name(as_integer(path))); - - path_is_deleted_iv.set(as_integer(path), true); - packed_path.steps_iv.clear(); - packed_path.links_iv.clear(); - path_head_iv.set(as_integer(path), 0); - path_tail_iv.set(as_integer(path), 0); - path_deleted_steps_iv.set(as_integer(path), 0); - defragment(); } diff --git a/bdsg/include/bdsg/internal/packed_structs.hpp b/bdsg/include/bdsg/internal/packed_structs.hpp index 24a40306..c17cbe6e 100644 --- a/bdsg/include/bdsg/internal/packed_structs.hpp +++ b/bdsg/include/bdsg/internal/packed_structs.hpp @@ -1370,9 +1370,6 @@ inline uint64_t PackedSet::optimal_anchor() const { template inline void PackedSet::rehash(bool shrink) { - // Keeping the RNG state in the class isn't bvery portable, so we make one - // per thread in thread storage. - static thread_local default_random_engine gen(random_device{}()); // move to the next size in the schedule if (shrink) { @@ -1390,6 +1387,10 @@ inline void PackedSet::rehash(bool shrink) { PackedVec new_table; new_table.resize(bdsg_packed_set_size_schedule[schedule_val]); + // we use random-at-coding-time linear congruential generator to get a pseudo-random seed + // that makes the coefficients in a deterministic, system-independent way + uint64_t seed = 17340519333326003581ull * (schedule_val + 14057138822558802453ull) + 6918838906415272680ull; + std::mt19937_64 gen(seed); std::uniform_int_distribution distr(0, new_table.size() - 1); for (size_t i = 0; i < 5; ++i) { coefs[i] = distr(gen); @@ -1511,9 +1512,6 @@ inline void PackedSet::set_load_factors(double min_load_factor, double assert(max_load_factor < 1.0); min_load = min_load_factor; max_load = max_load_factor; - assert(max_load > min_load); - assert(min_load >= 0.0); - assert(max_load < 1.0); if (table.size() > 0) { while (num_items <= min_load * table.size()) { rehash(true); diff --git a/bdsg/src/hash_graph.cpp b/bdsg/src/hash_graph.cpp index 1e757442..b9b830a9 100644 --- a/bdsg/src/hash_graph.cpp +++ b/bdsg/src/hash_graph.cpp @@ -592,25 +592,46 @@ namespace bdsg { } return true; } - + void HashGraph::destroy_path(const path_handle_t& path) { + destroy_paths({path}); + } + + void HashGraph::destroy_paths(const vector& paths) { - // remove the records of nodes occurring on this path - for_each_step_in_path(path, [&](const step_handle_t& step) { - path_mapping_t* mapping = (path_mapping_t*) intptr_t(as_integers(step)[1]); - vector& node_occs = graph[get_id(mapping->handle)].occurrences; - for (size_t i = 0; i < node_occs.size(); i++) { - if (node_occs[i] == mapping) { - node_occs[i] = node_occs.back(); - node_occs.pop_back(); - break; - } - } - }); + unordered_set path_ids; + for (auto path : paths) { + path_ids.emplace(as_integer(path)); + } + unordered_set nodes_visited; - // erase the path itself - path_id.erase(paths[as_integer(path)].name); - paths.erase(as_integer(path)); + for (auto path : paths) { + // remove the records of nodes occurring on this path + for_each_step_in_path(path, [&](const step_handle_t& step) { + path_mapping_t* mapping = (path_mapping_t*) intptr_t(as_integers(step)[1]); + if (paths.size() > 1) { + bool did_insert = nodes_visited.insert(get_id(mapping->handle)).second; + if (!did_insert) { + // we've already deleted on this node + return; + } + } + vector& node_occs = graph[get_id(mapping->handle)].occurrences; + for (size_t i = 0; i < node_occs.size(); ) { + if (path_ids.count(node_occs[i]->path_id)) { + node_occs[i] = node_occs.back(); + node_occs.pop_back(); + } + else { + ++i; + } + } + }); + + // erase the path itself + path_id.erase(this->paths[as_integer(path)].name); + this->paths.erase(as_integer(path)); + } } path_handle_t HashGraph::create_path_handle(const string& name, bool is_circular) { diff --git a/bdsg/src/test_libbdsg.cpp b/bdsg/src/test_libbdsg.cpp index f803184e..13195118 100644 --- a/bdsg/src/test_libbdsg.cpp +++ b/bdsg/src/test_libbdsg.cpp @@ -2285,6 +2285,73 @@ void test_deletable_handle_graphs() { } } + // batch deletion of paths works as expected + { + vector implementations; + + // Add implementations + + PackedGraph pg; + implementations.push_back(&pg); + + HashGraph hg; + implementations.push_back(&hg); + + MappedPackedGraph mpg; + implementations.push_back(&mpg); + + for (int imp = 0; imp < implementations.size(); ++imp) { + + MutablePathDeletableHandleGraph& graph = *implementations[imp]; + + auto h1 = graph.create_handle("A"); + auto h2 = graph.create_handle("A"); + auto h3 = graph.create_handle("A"); + + graph.create_edge(h1, h2); + graph.create_edge(h2, h3); + + auto p1 = graph.create_path_handle("1"); + auto p2 = graph.create_path_handle("2"); + auto p3 = graph.create_path_handle("3"); + auto p4 = graph.create_path_handle("4"); + auto p5 = graph.create_path_handle("5"); + + for (const auto& p : {p1, p2, p3, p4, p5}) { + for (auto h : {h1, h2, h3}) { + graph.append_step(p, h); + } + } + + graph.destroy_paths({p1, p3, p4}); + + set paths_seen; + set paths_expected{p2, p5}; + graph.for_each_path_handle([&](const path_handle_t& path) { + assert(!paths_seen.count(path)); + paths_seen.insert(path); + std::vector handles; + std::vector handles_expected{h1, h2, h3}; + for (auto h : graph.scan_path(path)) { + handles.push_back(h); + } + assert(handles == handles_expected); + }); + + assert(paths_seen == paths_expected); + + graph.for_each_handle([&](const handle_t& h) { + set paths; + graph.for_each_step_on_handle(h, [&](const step_handle_t& step) { + auto p = graph.get_path_handle_of_step(step); + assert(!paths.count(p)); + paths.insert(p); + }); + assert(paths_seen == paths_expected); + }); + } + } + cerr << "DeletableHandleGraph tests successful!" << endl; }