|
| 1 | +// A column-based tuple database used to power the datalog queries. |
| 2 | +// First, there's a string hashmap of Term variables to an integer ID, and a variable to store the max ID. |
| 3 | +// Each Term also stores its 'index' within each table in sorted order. |
| 4 | +// There's a list of Tables - one per "relation". |
| 5 | +// Each table is composed of columns. |
| 6 | +// The column is composed of two representations: |
| 7 | +// An array of each distinct term element, which point to a list of indexes where that term appears in this column. |
| 8 | +// - The indexes are stored with a byte offset. |
| 9 | +// - A 0 offset indicates the value is beyond the 0-255 byte range, and appears in an auxiliary array as the exact index. |
| 10 | +// The column additionally has a raw list of byte values, indicating which term appears in that position (based on the term index) |
| 11 | + |
| 12 | +const std = @import("std"); |
| 13 | +const Allocator = std.mem.Allocator; |
| 14 | +const ArrayList = std.ArrayList; |
| 15 | +const StringHashMap = std.StringHashMap; |
| 16 | +const assert = std.debug.assert; |
| 17 | + |
/// Root container of the tuple database: the term dictionary, the list of
/// tables, and the counters used to hand out column / term / table IDs.
const DB = struct {
    /// Allocator used for the containers owned directly by the database.
    allocator: Allocator,
    /// Term name -> per-term bookkeeping. Keys are stored as given (StringHashMap
    /// does not copy them), so the caller must keep the key memory alive.
    terms: std.StringHashMap(Term),
    /// One entry per relation.
    tables: std.ArrayList(Table),
    /// Next column ID to hand out (see `Table.addColumn`).
    max_column_id: u32 = 0,
    /// Next term ID to hand out.
    max_term_id: u32 = 0,
    /// Next table ID to hand out.
    max_table_id: u32 = 0,

    /// Creates an empty database. No allocation happens until something is inserted.
    pub fn init(allocator: Allocator) DB {
        return DB{
            .allocator = allocator,
            .terms = std.StringHashMap(Term).init(allocator),
            .tables = std.ArrayList(Table).init(allocator),
        };
    }

    /// Releases everything the database owns: each term's column-ref list,
    /// each table (and through it, its columns), then the containers themselves.
    /// Term name keys are not freed here.
    pub fn deinit(self: *DB) void {
        // Free the per-term ref lists before dropping the map that holds them;
        // deinit-ing only the map would leak every list.
        var term_it = self.terms.valueIterator();
        while (term_it.next()) |term| term.refs.deinit();
        self.terms.deinit();

        // Same for tables: each one owns a hash map of columns.
        for (self.tables.items) |*table| table.deinit();
        self.tables.deinit();
    }
};
| 39 | + |
/// A single relation: a named, ordered set of columns.
const Table = struct {
    /// Owning database; used to draw globally unique IDs.
    db: *DB,
    /// ID of this table, taken from `db.max_table_id` at construction time.
    table_id: u32,
    /// Table name. Stored as given; not copied.
    name: []const u8,
    /// Allocator handed to the column map and to each new column.
    allocator: Allocator,
    /// Column name -> column, in insertion order. Keys are not copied.
    columns: std.StringArrayHashMap(Column),

    /// Creates an empty table and claims the next table ID from `db`.
    pub fn init(db: *DB, allocator: Allocator, name: []const u8) Table {
        // `table_id` has no default, so it must be assigned here; draw it from
        // the database counter the same way `addColumn` draws column IDs.
        const table_id = db.max_table_id;
        db.max_table_id += 1;
        return Table{
            .db = db,
            .table_id = table_id,
            .allocator = allocator,
            .name = name,
            .columns = std.StringArrayHashMap(Column).init(allocator),
        };
    }

    /// Releases every column and then the column map itself.
    pub fn deinit(self: *Table) void {
        // The map's own deinit does not touch the values, so free them first.
        for (self.columns.values()) |*column| column.deinit();
        self.columns.deinit();
    }

    /// Adds a column called `name` and returns its global column ID.
    /// If a column with that name already exists its ID is returned and nothing
    /// is changed (previously the old column was overwritten and leaked).
    /// Fails only if growing the column map fails.
    ///
    /// NOTE(review): each column keeps a `*Table` pointing at `self`; that
    /// pointer is only valid while this Table does not move in memory - confirm
    /// against how tables are stored by the caller.
    pub fn addColumn(self: *Table, name: []const u8) !u32 {
        const entry = try self.columns.getOrPut(name);
        if (entry.found_existing) return entry.value_ptr.id;

        // Only consume an ID once the slot has actually been reserved.
        const column_id = self.db.max_column_id;
        self.db.max_column_id += 1;
        entry.value_ptr.* = Column.init(self, self.allocator, column_id);
        return column_id;
    }
};
| 67 | + |
/// One column of a table. Stores, per row, which term appears there (`order` /
/// `order16`) and, per term, the rows in which it appears (`refs`).
const Column = struct {
    /// Global column ID.
    id: u32,
    /// Table this column belongs to.
    table: *Table,
    /// Allocator used for every list owned by this column.
    allocator: Allocator,
    /// Distinct terms that appear in this column; the position in this list is
    /// the "local term index" used everywhere else in the column.
    /// This list shouldn't be used much - prefer the term -> column ID lookup.
    /// It could be replaced by a max term index if needed.
    /// Entries are by-value copies of the `Term` passed to `addTerm`; the
    /// column does not free their contents.
    terms: std.ArrayList(Term),
    /// Local term index of each row, one byte per row. Used while the column
    /// has fewer than 255 distinct terms.
    order: std.ArrayList(u8),
    /// Continuation of `order` once the column holds 255 or more distinct
    /// terms. Since columns only grow and term indexes only increase, think of
    /// this list as continuing where `order` left off. Starts empty.
    order16: std.ArrayList(u16),
    /// Local term index -> rows in which that term appears.
    refs: std.ArrayList(TermRefs),
    /// Total row count. Equals order.items.len + order16.items.len.
    length: u32 = 0,

    /// Creates an empty column. Nothing is allocated until the first insert.
    pub fn init(table: *Table, allocator: Allocator, column_id: u32) Column {
        return Column{
            .allocator = allocator,
            .table = table,
            .id = column_id,
            .terms = std.ArrayList(Term).init(allocator),
            .order = std.ArrayList(u8).init(allocator),
            // `initCapacity` returns an error union and cannot initialise a
            // plain field; `init` gives the same empty list without failing.
            .order16 = std.ArrayList(u16).init(allocator),
            .refs = std.ArrayList(TermRefs).init(allocator),
        };
    }

    /// Releases all lists owned by the column, including every per-term
    /// reference list.
    pub fn deinit(self: *Column) void {
        for (self.refs.items) |*term_refs| term_refs.deinit();
        self.refs.deinit();
        self.terms.deinit();
        self.order.deinit();
        self.order16.deinit();
    }

    /// Appends one row holding local term index `termIdx` and returns the
    /// zero-based position of that row within the column.
    fn pushOrder(self: *Column, termIdx: u16) !u32 {
        if (self.terms.items.len < 255) {
            assert(termIdx < 255);
            try self.order.append(@intCast(termIdx));
        } else {
            try self.order16.append(termIdx);
        }
        // Report where the row landed (old length), not the new length, so the
        // value can be used directly as an index.
        const position = self.length;
        self.length += 1;
        return position;
    }

    /// Records that the term with local index `termIdx` appears at row
    /// `orderIndex`. `termIdx` must already exist in `refs`.
    fn pushRef(self: *Column, termIdx: u16, orderIndex: u32) !void {
        // Operate on the list element in place; copying it into a local would
        // update the copy and lose the change.
        try self.refs.items[termIdx].pushRef(orderIndex);
    }

    /// Appends a row containing an already-known term (one previously
    /// returned by `addTerm`).
    pub fn push(self: *Column, termIdx: u16) !void {
        const row = try self.pushOrder(termIdx);
        try self.pushRef(termIdx, row);
    }

    /// Registers a term that is new to this column, appends a row containing
    /// it, and returns its local term index.
    ///
    /// The caller is responsible for making sure the term is net-new;
    /// checking here would dominate insertion cost.
    /// Returns `error.TooManyTerms` if the column already holds the maximum
    /// number of terms addressable by a u16, or an allocation error.
    /// On an allocation error part of the insert may already have been applied.
    pub fn addTerm(self: *Column, term: Term) !u16 {
        const term_index = std.math.cast(u16, self.terms.items.len) orelse
            return error.TooManyTerms;
        try self.terms.append(term);

        // Assumes no concurrent access to self.length.
        const row = try self.pushOrder(term_index);

        // Build the reference list with the first occurrence already recorded
        // (a 0 marker in `offsets` plus the absolute row in `overflow`), the
        // same shape `TermRefs.init` produces.
        var term_refs = TermRefs{
            .allocator = self.allocator,
            .offsets = std.ArrayList(u8).init(self.allocator),
            .overflow = std.ArrayList(u32).init(self.allocator),
            .lastIndex = row,
        };
        errdefer term_refs.deinit();
        try term_refs.offsets.append(0);
        try term_refs.overflow.append(row);
        try self.refs.append(term_refs);

        return term_index;
    }
};
| 140 | + |
/// Inverted index for one term within one column: the list of row indexes at
/// which the term appears, delta-encoded.
///
/// Encoding: `offsets` has one byte per occurrence. A non-zero byte is the
/// distance from the previous occurrence. A zero byte means "the absolute row
/// index is the next unread entry of `overflow`".
const TermRefs = struct {
    allocator: Allocator,
    /// Offset from the previous appearance; 0 marks an entry stored in `overflow`.
    offsets: std.ArrayList(u8),
    /// Absolute row indexes for every occurrence whose `offsets` byte is 0.
    overflow: std.ArrayList(u32),
    /// Last absolute index where this term appeared.
    lastIndex: u32,

    /// Creates the index with a single occurrence at `initial_index`, stored
    /// as an absolute entry. Fails only on allocation failure, in which case
    /// nothing is leaked.
    pub fn init(allocator: Allocator, initial_index: u32) !TermRefs {
        var offsets = std.ArrayList(u8).init(allocator);
        errdefer offsets.deinit();
        var overflow = std.ArrayList(u32).init(allocator);
        errdefer overflow.deinit();

        // Store the initial index in the arrays as well.
        try offsets.append(0);
        try overflow.append(initial_index);

        return TermRefs{
            .allocator = allocator,
            .offsets = offsets,
            .overflow = overflow,
            .lastIndex = initial_index,
        };
    }

    /// Frees both backing lists.
    pub fn deinit(self: *TermRefs) void {
        self.offsets.deinit();
        self.overflow.deinit();
    }

    /// Records another occurrence at absolute row `index`.
    /// Small forward gaps (1..254) are stored as a one-byte delta; anything
    /// else - a large gap, a repeat of `lastIndex`, or an index that goes
    /// backwards - is stored as an absolute index so that a 0 byte in
    /// `offsets` always has a matching `overflow` entry.
    fn pushRef(self: *TermRefs, index: u32) !void {
        // `and` short-circuits, so the subtraction never underflows.
        const fits_in_byte = index > self.lastIndex and index - self.lastIndex < 255;
        if (fits_in_byte) {
            try self.offsets.append(@intCast(index - self.lastIndex));
        } else {
            // Reserve first so the two lists cannot get out of step on OOM.
            try self.overflow.ensureUnusedCapacity(1);
            try self.offsets.append(0);
            self.overflow.appendAssumeCapacity(index);
        }
        self.lastIndex = index;
    }
};
| 180 | + |
/// Pairs a global column ID with a term's local index inside that column.
/// Packed into exactly 64 bits; `column_id` occupies the low 32 bits.
const ColumnRef = packed struct(u64) {
    // TODO: Order here likely matters as a micro-optimization.
    /// Global ID of the column.
    column_id: u32,
    /// Index of the term within that column's term list.
    term_index: u32,
};
| 186 | + |
/// Comparison callback for the `std.sort` search helpers: orders `context`
/// relative to `item` by `column_id` alone. `term_index` is ignored.
fn orderColumnRef(context: ColumnRef, item: ColumnRef) std.math.Order {
    const wanted = context.column_id;
    const seen = item.column_id;
    if (wanted < seen) return .lt;
    if (wanted > seen) return .gt;
    return .eq;
}
| 190 | + |
/// Per-term bookkeeping: which columns the term appears in, and under which
/// local index in each.
const Term = struct {
    /// List of (column ID, index of this term within that column's terms list),
    /// kept sorted by ascending column ID.
    /// A column may contain many terms, but a term is expected to appear in
    /// only a small number of columns.
    refs: std.ArrayList(ColumnRef),

    /// Below this many refs a linear scan is used instead of binary search.
    /// Arbitrary boundary; the intuition is that binary search is slower than
    /// a linear scan on small arrays.
    const linear_search_limit = 128;

    /// Creates a term with no column references. Does not allocate.
    pub fn init(allocator: Allocator) Term {
        // The list keeps the allocator itself; Term has no allocator field.
        return Term{
            .refs = std.ArrayList(ColumnRef).init(allocator),
        };
    }

    /// Frees the column-reference list.
    pub fn deinit(self: *Term) void {
        self.refs.deinit();
    }

    /// Returns this term's local index within column `column_id`, or null if
    /// the term has no entry for that column.
    pub fn getColumnRef(self: *Term, column_id: u32) ?u32 {
        const items = self.refs.items;

        if (items.len < linear_search_limit) {
            // Linear scan; the list is sorted, so stop once past the target.
            for (items) |ref| {
                if (ref.column_id == column_id) return ref.term_index;
                if (ref.column_id > column_id) break;
            }
            return null;
        }

        // Binary search for large lists - expected to be fairly rare.
        // The comparator only looks at `column_id`, so `term_index` in the
        // probe value is a don't-care.
        const probe = ColumnRef{ .column_id = column_id, .term_index = 0 };
        const found = std.sort.binarySearch(ColumnRef, items, probe, orderColumnRef) orelse
            return null;
        return items[found].term_index;
    }

    /// Inserts a (column_id, term_index) entry at its sorted position.
    /// Does not check whether `column_id` is already present; callers are
    /// expected to add each column at most once.
    pub fn addColumnRef(self: *Term, column_id: u32, term_index: u32) !void {
        const column_ref = ColumnRef{ .column_id = column_id, .term_index = term_index };
        const items = self.refs.items;

        const insert_at: usize = if (items.len < linear_search_limit) scan: {
            var i: usize = 0;
            while (i < items.len and items[i].column_id < column_id) : (i += 1) {}
            break :scan i;
        } else std.sort.upperBound(ColumnRef, items, column_ref, orderColumnRef);

        // The entry is copied into the list by value; no separate heap
        // allocation is needed for `column_ref`.
        try self.refs.insert(insert_at, column_ref);
    }
};
0 commit comments