Skip to content

Commit d51ed10

Browse files
committed
Implement some core datastructures as foundation for a datalog rule engine. Implemented a simple column-store DB. Started on a sparse bitset.
1 parent 896f01b commit d51ed10

File tree

5 files changed

+316
-6
lines changed

5 files changed

+316
-6
lines changed

Code/Compiler/src/db.zig

+239
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,239 @@
1+
// A column-based tuple database used to power the datalog queries.
2+
// First, there's a string hashmap of Term variables to an integer ID, and a variable to store the max ID.
3+
// Each Term also stores its 'index' within each table in sorted order.
4+
// There's a list of Tables - one per "relation".
5+
// Each table is composed of columns.
6+
// The column is composed of two representations:
7+
// An array of each distinct term element, which point to a list of indexes where that term appears in this column.
8+
// - The indexes are stored with a byte offset.
9+
// - A 0 offset indicates the value is beyond the 0-255 byte range, and appears in an auxiliary array as the exact index.
10+
// The column additionally has a raw list of byte values, indicating which term appears in that position (based on the term index)
11+
12+
const std = @import("std");
13+
const Allocator = std.mem.Allocator;
14+
const ArrayList = std.ArrayList;
15+
const StringHashMap = std.StringHashMap;
16+
const assert = std.debug.assert;
17+
18+
/// Root of the column store: owns the term dictionary and the list of relation tables,
/// and holds the counters that tables and columns draw their IDs from.
const DB = struct {
    /// Allocator backing the term map and the table list.
    allocator: Allocator,
    /// Term text -> per-term bookkeeping. The map does not copy its keys, so the
    /// caller must keep the key bytes alive for as long as the entry exists.
    terms: std.StringHashMap(Term),
    /// One entry per relation.
    tables: std.ArrayList(Table),
    /// Next column ID to hand out; read and advanced by `Table.addColumn`.
    max_column_id: u32 = 0,
    /// NOTE(review): not read or written by any code in this file yet; presumably
    /// the next term ID to hand out.
    max_term_id: u32 = 0,
    /// NOTE(review): presumably the next table ID to hand out; confirm once table
    /// registration is implemented.
    max_table_id: u32 = 0,

    /// Creates an empty database. Nothing is allocated until the first insert.
    pub fn init(allocator: Allocator) DB {
        return DB{
            .allocator = allocator,
            .terms = std.StringHashMap(Term).init(allocator),
            .tables = std.ArrayList(Table).init(allocator),
        };
    }

    /// Releases everything owned by the database, including the storage nested
    /// inside each table and each term. Previously only the two outer containers
    /// were freed, which leaked every table's column map and every term's `refs`.
    pub fn deinit(self: *DB) void {
        // Tables are stored by value, so the DB is their sole owner.
        for (self.tables.items) |*table| {
            table.deinit();
        }
        self.tables.deinit();

        // Free each term's column-reference list before the map's own storage.
        var term_iter = self.terms.valueIterator();
        while (term_iter.next()) |term| {
            term.refs.deinit();
        }
        self.terms.deinit();
    }
};
39+
40+
/// A single relation: a named, insertion-ordered collection of columns.
const Table = struct {
    /// Owning database; supplies the table and column ID counters.
    db: *DB,
    /// ID taken from `db.max_table_id` when the table was created.
    table_id: u32,
    /// Relation name. Borrowed, not copied.
    name: []const u8,
    /// Allocator used for the column map and passed on to every column.
    allocator: Allocator,
    /// Column name -> column. Keys are borrowed, so the caller must keep the
    /// name bytes alive while the column exists.
    columns: std.StringArrayHashMap(Column),

    /// Creates an empty table and reserves the next table ID from `db`.
    /// The original initializer left the required `table_id` field unset.
    pub fn init(db: *DB, allocator: Allocator, name: []const u8) Table {
        const table_id = db.max_table_id;
        db.max_table_id += 1;
        return Table{
            .db = db,
            .table_id = table_id,
            .allocator = allocator,
            .name = name,
            .columns = std.StringArrayHashMap(Column).init(allocator),
        };
    }

    /// Frees every column and then the column map itself.
    pub fn deinit(self: *Table) void {
        // Columns own several lists each; dropping only the map would leak them.
        for (self.columns.values()) |*column| {
            column.deinit();
        }
        self.columns.deinit();
    }

    /// Adds an empty column called `name` and returns its global column ID.
    /// Fails only if the column map cannot grow.
    ///
    /// NOTE(review): the new column stores `self` as a raw `*Table`. Tables are
    /// held by value in `DB.tables`, so that pointer goes stale if the table is
    /// moved (e.g. when the list grows) — confirm the intended ownership.
    /// NOTE(review): re-using an existing `name` replaces the old column without
    /// freeing it.
    pub fn addColumn(self: *Table, name: []const u8) !u32 {
        const column_id = self.db.max_column_id;
        try self.columns.put(name, Column.init(self, self.allocator, column_id));
        // Only consume the ID once the column is actually stored.
        self.db.max_column_id += 1;
        return column_id;
    }
};
67+
68+
/// One column of a table. Stores, per row, the local index of the term that
/// appears there, plus an inverted index from each local term to its rows.
const Column = struct {
    id: u32, // Global Column ID
    table: *Table,
    allocator: Allocator,
    // List of distinct terms which appear in this column. The index in this list is what's used everywhere else.
    // This list shouldn't be used much. Instead, prefer the term->column ID for lookups.
    // We could remove this and replace it with a max term ID if needed.
    terms: std.ArrayList(Term),
    // Per-row local term index, one byte each. Used while the column has fewer than 255 distinct terms.
    order: std.ArrayList(u8),
    // Continuation of `order` once the column holds 255 or more distinct terms. Starts empty.
    // Rows are only ever appended, so conceptually this array continues where `order` left off.
    order16: std.ArrayList(u16),
    // Local term index -> rows in which that term appears.
    refs: std.ArrayList(TermRefs),
    length: u32 = 0, // Total row count. Equals order.items.len + order16.items.len

    /// Creates an empty column. Nothing is allocated until the first insert.
    pub fn init(table: *Table, allocator: Allocator, column_id: u32) Column {
        return Column{
            .allocator = allocator,
            .table = table,
            .id = column_id,
            .terms = std.ArrayList(Term).init(allocator),
            .order = std.ArrayList(u8).init(allocator),
            // `initCapacity` returns an error union and cannot be stored here;
            // a plain `init` is already an empty, zero-capacity list.
            .order16 = std.ArrayList(u16).init(allocator),
            .refs = std.ArrayList(TermRefs).init(allocator),
        };
    }

    /// Frees all storage owned by the column, including each term's row list.
    /// The `Term` values in `terms` are not torn down here.
    pub fn deinit(self: *Column) void {
        for (self.refs.items) |*term_refs| {
            term_refs.deinit();
        }
        self.refs.deinit();
        self.terms.deinit();
        self.order.deinit();
        self.order16.deinit();
    }

    /// Appends one row holding local term `termIdx` and returns that row's
    /// zero-based index. (The original returned the new length, i.e. index + 1.)
    fn pushOrder(self: *Column, termIdx: u16) !u32 {
        const row = self.length;
        if (self.terms.items.len < 255) {
            assert(termIdx < 255);
            try self.order.append(@intCast(termIdx));
        } else {
            try self.order16.append(termIdx);
        }
        self.length = row + 1;
        return row;
    }

    /// Records in the inverted index that `termIdx` appears at row `orderIndex`.
    fn pushRef(self: *Column, termIdx: u16, orderIndex: u32) !void {
        // Operate on the stored element in place; copying it into a const local
        // (as before) cannot be mutated and would not update the column anyway.
        try self.refs.items[termIdx].pushRef(orderIndex);
    }

    /// Appends a row for a term that is already known to this column.
    /// `termIdx` must be a value previously returned by `addTerm`.
    pub fn push(self: *Column, termIdx: u16) !void {
        const orderIndex = try self.pushOrder(termIdx);
        try self.pushRef(termIdx, orderIndex);
    }

    /// Registers a term that is new to this column, appends its first row, and
    /// returns the term's local index.
    ///
    /// The caller is responsible for making sure the term is net-new; checking
    /// here would make that lookup dominate insertion cost.
    /// Returns an allocation error if any of the backing lists cannot grow; the
    /// column is left unchanged in that case.
    pub fn addTerm(self: *Column, term: Term) !u16 {
        const term_index = self.terms.items.len;
        assert(term_index <= std.math.maxInt(u16));
        const local_index: u16 = @intCast(term_index);

        // Row index the new term will occupy (pushOrder returns the pre-push length).
        const first_row = self.length;

        // Build the inverted-index entry first, recording the first appearance the
        // same way TermRefs.init does: a 0 offset plus the absolute row in `overflow`.
        var offsets = std.ArrayList(u8).init(self.allocator);
        errdefer offsets.deinit();
        try offsets.append(0);
        var overflow = std.ArrayList(u32).init(self.allocator);
        errdefer overflow.deinit();
        try overflow.append(first_row);

        // Reserve space up front so the only fallible step after mutating
        // `terms` is pushOrder, which is rolled back by the errdefer below.
        try self.terms.ensureUnusedCapacity(1);
        try self.refs.ensureUnusedCapacity(1);

        self.terms.appendAssumeCapacity(term);
        errdefer _ = self.terms.pop();

        const pushed_row = try self.pushOrder(local_index);
        assert(pushed_row == first_row);

        self.refs.appendAssumeCapacity(TermRefs{
            .allocator = self.allocator,
            .offsets = offsets,
            .overflow = overflow,
            .lastIndex = first_row,
        });
        return local_index;
    }
};
140+
141+
/// Inverted-index entry for one term in one column: the rows where it appears.
///
/// Encoding: `offsets` has one byte per appearance. A non-zero byte is the
/// distance from the previous appearance. A zero byte means "look in
/// `overflow`", which holds the absolute row index for those appearances, in order.
const TermRefs = struct {
    allocator: Allocator,
    offsets: std.ArrayList(u8), // Offset from previous appearance; 0 = see `overflow`.
    overflow: std.ArrayList(u32), // Absolute index for every appearance whose offset byte is 0.
    lastIndex: u32, // Last absolute index where this term appeared.

    /// Creates the entry and records `initial_index` as the first appearance.
    /// Returns an allocation error if either list cannot grow; nothing is leaked.
    pub fn init(allocator: Allocator, initial_index: u32) !TermRefs {
        var offsets = std.ArrayList(u8).init(allocator);
        errdefer offsets.deinit();
        var overflow = std.ArrayList(u32).init(allocator);
        errdefer overflow.deinit();

        // The first appearance has no predecessor, so it is stored as an absolute index.
        try offsets.append(0);
        try overflow.append(initial_index);

        return TermRefs{
            .allocator = allocator,
            .offsets = offsets,
            .overflow = overflow,
            .lastIndex = initial_index,
        };
    }

    /// Frees both backing lists.
    pub fn deinit(self: *TermRefs) void {
        self.offsets.deinit();
        self.overflow.deinit();
    }

    /// Records another appearance at absolute row `index`.
    /// `index` must not be smaller than the previously recorded index.
    fn pushRef(self: *TermRefs, index: u32) !void {
        assert(index >= self.lastIndex);
        const offset = index - self.lastIndex;
        // 0 is reserved as the overflow marker, so a zero distance must not be
        // stored inline or it would be misread as an overflow entry.
        if (offset > 0 and offset < 255) {
            try self.offsets.append(@intCast(offset));
        } else {
            // Reserve first so the two lists cannot get out of step on OOM.
            try self.overflow.ensureUnusedCapacity(1);
            try self.offsets.append(0);
            // Store the absolute index, matching what `init` stores; the
            // original pushed the relative offset here.
            self.overflow.appendAssumeCapacity(index);
        }
        self.lastIndex = index;
    }
};
180+
181+
// One (column, local index) pair, packed into a single u64: records where a term
// lives inside one column. Term keeps a list of these sorted by column_id.
const ColumnRef = packed struct(u64) {
182+
// TODO: Order here likely matters as a micro-optimization.
183+
column_id: u32, // Global column ID. This is the only field orderColumnRef compares.
184+
term_index: u32, // Index of the term within that column's terms list.
185+
};
186+
187+
/// Three-way comparison of two column references by `column_id` alone;
/// `term_index` is ignored. Returns how `context` orders relative to `item`.
fn orderColumnRef(context: ColumnRef, item: ColumnRef) std.math.Order {
    const wanted = context.column_id;
    const seen = item.column_id;
    if (wanted < seen) return .lt;
    if (wanted > seen) return .gt;
    return .eq;
}
190+
191+
/// Per-term bookkeeping: for every column the term appears in, the term's local
/// index inside that column.
const Term = struct {
    /// Below this many entries a linear scan is used instead of binary search.
    /// Arbitrary boundary; the intuition is that binary search is slower on small arrays.
    const linear_search_limit = 128;

    // Sorted by column_id: Column ID -> index of this term within that column's terms list.
    // A column may contain many terms, but a term is expected to appear in only a small number of columns.
    refs: std.ArrayList(ColumnRef),

    /// Creates a term with no column references. The list keeps `allocator` for
    /// later growth; the struct has no allocator field of its own, so the
    /// original `.allocator = ...` initializer was removed.
    pub fn init(allocator: Allocator) Term {
        return Term{
            .refs = std.ArrayList(ColumnRef).init(allocator),
        };
    }

    /// Frees the column-reference list.
    pub fn deinit(self: *Term) void {
        self.refs.deinit();
    }

    /// Returns this term's local index within column `column_id`, or null if the
    /// term has no reference to that column.
    pub fn getColumnRef(self: *Term, column_id: u32) ?u32 {
        const items = self.refs.items;
        if (items.len < linear_search_limit) {
            // Linear scan over the sorted list; stop as soon as we pass column_id.
            for (items) |ref| {
                if (ref.column_id == column_id) return ref.term_index;
                if (ref.column_id > column_id) break;
            }
            return null;
        }

        // Binary search if the list is large - which we expect to be fairly rare.
        // The elements are ColumnRef, so search with a ColumnRef probe; only its
        // column_id is inspected by orderColumnRef.
        const probe = ColumnRef{ .column_id = column_id, .term_index = 0 };
        const found = std.sort.binarySearch(ColumnRef, items, probe, orderColumnRef) orelse return null;
        return items[found].term_index;
    }

    /// Inserts a (column_id, term_index) pair, keeping `refs` sorted by column_id.
    /// Returns an allocation error if the list cannot grow.
    ///
    /// NOTE(review): an existing entry for the same column is not detected. The
    /// scan inserts before it and the binary path after it, so callers should
    /// add each column at most once.
    pub fn addColumnRef(self: *Term, column_id: u32, term_index: u32) !void {
        const column_ref = ColumnRef{ .column_id = column_id, .term_index = term_index };
        var index: usize = 0;
        if (self.refs.items.len < linear_search_limit) {
            while (index < self.refs.items.len and self.refs.items[index].column_id < column_id) {
                index += 1;
            }
        } else {
            index = std.sort.upperBound(ColumnRef, self.refs.items, column_ref, orderColumnRef);
        }
        // ColumnRef is a plain 64-bit value and is stored by copy; no heap allocation is needed.
        try self.refs.insert(index, column_ref);
    }
};

Code/Compiler/src/engraph.zig

+53
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
// A specialized graph data structure used for storing relational connection information
2+
// Efficient lookups for connectedness, path existence, cycle checking.
3+
// This can power a lot of datalog style queries.
4+
5+
const std = @import("std");
6+
const stdbits = std.bit_set;
7+
8+
pub const BitSet64 = stdbits.IntegerBitSet(64);
9+
10+
// A 64-bit packed word: 7-bit header, 1-bit flag and 56 data bits (7 + 1 + 56 = 64).
// Not referenced by any other code visible in this file.
const BitsetLevel = packed struct(u64) {
11+
header: u7, // Indicates which segments — NOTE(review): the original comment was cut off here; confirm what the header bits select.
12+
isPtr: bool, // Order matters here. Presumably marks `data` as a pointer rather than inline bits — TODO confirm.
13+
data: u56,
14+
};
15+
16+
/// A sparse representation of a bitset.
/// The expectation is that set bits will mostly be clustered together.
/// Backed by an array, so merging may require a lot of shifting, but that
/// should be fine as long as the sparseness assumption holds.
const SparseLevelBitset = struct {
    const Self = @This();

    allocator: std.mem.Allocator,

    // An array of 64-bit bitsets.
    // NOTE(review): the intended layout (a single top-level word when all set bits
    // fall in 1-63, otherwise a top layer selecting which 64-bit segments are
    // present) is not implemented by any code in this file yet.
    data: std.ArrayList(BitSet64),

    /// Creates an empty bitset. Nothing is allocated until words are appended.
    pub fn init(allocator: std.mem.Allocator) Self {
        return Self{
            .allocator = allocator,
            // The element type must match the `data` field; the original built an
            // ArrayList(u64), which does not coerce to ArrayList(BitSet64).
            .data = std.ArrayList(BitSet64).init(allocator),
        };
    }

    /// Frees the backing array.
    pub fn deinit(self: *Self) void {
        self.data.deinit();
    }
};
41+
42+
/// Graph skeleton. The design intent is that each node stores a list of its
/// incoming edges; for now the struct only carries its allocator.
const NodeEngraph = struct {
    const Self = @This();

    allocator: std.mem.Allocator,

    /// Wraps `allocator` in a new, empty graph. Allocates nothing.
    pub fn init(allocator: std.mem.Allocator) Self {
        return .{ .allocator = allocator };
    }
};

Code/Compiler/src/lexer.zig

+16
Original file line numberDiff line numberDiff line change
@@ -415,6 +415,9 @@ pub const Lexer = struct {
415415
var containsLowercase = false;
416416
while (self.index < self.buffer.len) {
417417
const ch = self.buffer[self.index];
418+
if (IDENTIFIER_DELIIMITERS.isSet(ch)) {
419+
break;
420+
}
418421
switch (ch) {
419422
' ' => {
420423
break;
@@ -733,6 +736,19 @@ pub const Lexer = struct {
733736
_ = try self.token_indentation();
734737
try self.emitAux(tok.AUX_STREAM_END);
735738
try self.flushPrev(false);
739+
740+
if (DEBUG) {
741+
print("\n------------- Lexer End --------------- \n", .{});
742+
// Print the full interned symbol table
743+
print("\nInterned Symbol Table:\n", .{});
744+
var symbolIter = self.symbolTable.iterator();
745+
while (symbolIter.next()) |entry| {
746+
print("{d: <3} {s}\n", .{
747+
entry.value_ptr.*,
748+
entry.key_ptr.*,
749+
});
750+
}
751+
}
736752
}
737753
};
738754

Tests/FileTests/cond_print.ifi

+6
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
print("Before hello\n")
2+
if 1 > 2:
3+
print("1 is greater than 2\n")
4+
else:
5+
print("1 is not greater than 2\n")
6+
print("Afterwards\n")

Tests/FileTests/run.ifi

+2-6
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,2 @@
1-
print("Before hello\n")
2-
if 1 > 2:
3-
print("1 is greater than 2\n")
4-
else:
5-
print("1 is not greater than 2\n")
6-
print("Afterwards\n")
1+
ZeroAdd(X): X + 0
2+
ZeroAdd(X): X

0 commit comments

Comments
 (0)