1,025 changes: 1,014 additions & 11 deletions src/base/Ident.zig

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/base/Scratch.zig
@@ -2,7 +2,7 @@
//! when working with recursive operations

const std = @import("std");
const base = @import("../base.zig");
const base = @import("mod.zig");

/// A stack for easily adding and removing index types when doing recursive operations
pub fn Scratch(comptime T: type) type {
71 changes: 41 additions & 30 deletions src/base/test/base_test.zig
@@ -233,12 +233,12 @@ test "Ident.Store basic CompactWriter roundtrip" {
const idx3 = try original.insert(gpa, ident3);

// Verify the attributes in the indices
try std.testing.expect(!idx1.attributes.effectful);
try std.testing.expect(!idx1.attributes.ignored);
try std.testing.expect(idx2.attributes.effectful);
try std.testing.expect(!idx2.attributes.ignored);
try std.testing.expect(!idx3.attributes.effectful);
try std.testing.expect(idx3.attributes.ignored);
try std.testing.expect(!idx1.attributes().effectful);
try std.testing.expect(!idx1.attributes().ignored);
try std.testing.expect(idx2.attributes().effectful);
try std.testing.expect(!idx2.attributes().ignored);
try std.testing.expect(!idx3.attributes().effectful);
try std.testing.expect(idx3.attributes().ignored);

// Create a temp file
var tmp_dir = std.testing.tmpDir(.{});
@@ -284,11 +284,15 @@ test "Ident.Store basic CompactWriter roundtrip" {

// Check the bytes length for validation
const bytes_len = deserialized.interner.bytes.len();
const idx1_value = @intFromEnum(@as(SmallStringInterner.Idx, @enumFromInt(@as(u32, idx1.idx))));

// Verify the index is valid
if (bytes_len <= idx1_value) {
return error.InvalidIndex;
// Get the interner index if it exists (skip validation for small indices)
if (idx1.getInternerIdx()) |interner_idx| {
const idx1_value = @intFromEnum(@as(SmallStringInterner.Idx, @enumFromInt(interner_idx)));

// Verify the index is valid
if (bytes_len <= idx1_value) {
return error.InvalidIndex;
}
}

// Verify the identifiers are accessible
@@ -454,30 +458,32 @@ test "Ident.Store comprehensive CompactWriter roundtrip" {
defer original.deinit(gpa);

// Test various identifier types and edge cases
// Note: SmallStringInterner starts with a 0 byte at index 0, so strings start at index 1
const test_idents = [_]struct { text: []const u8, expected_idx: u32 }{
.{ .text = "hello", .expected_idx = 1 },
.{ .text = "world!", .expected_idx = 7 },
.{ .text = "_ignored", .expected_idx = 14 },
.{ .text = "a", .expected_idx = 23 }, // single character
.{ .text = "very_long_identifier_name_that_might_cause_issues", .expected_idx = 25 },
.{ .text = "effectful!", .expected_idx = 75 },
.{ .text = "_", .expected_idx = 86 }, // Just underscore
.{ .text = "CamelCase", .expected_idx = 88 },
.{ .text = "snake_case", .expected_idx = 98 },
.{ .text = "SCREAMING_CASE", .expected_idx = 109 },
.{ .text = "hello", .expected_idx = 1 }, // duplicate, should reuse
// Don't check specific indices as they depend on implementation details
// Just verify that identifiers round-trip correctly
const test_idents = [_][]const u8{
"hello", // 5 chars, goes to interner
"world!", // has !, goes to interner
"_ignored", // starts with _, goes to interner
"a", // single character - may be inlined
"very_long_identifier_name_that_might_cause_issues",
"effectful!",
"_", // Just underscore
"CamelCase",
"snake_case",
"SCREAMING_CASE",
"hello", // duplicate, should reuse
};

var indices = std.ArrayList(Ident.Idx).init(gpa);
defer indices.deinit();

for (test_idents) |test_ident| {
const ident = Ident.for_text(test_ident.text);
for (test_idents) |text| {
const ident = Ident.for_text(text);
const idx = try original.insert(gpa, ident);
try indices.append(idx);
// Verify the index matches expectation
try std.testing.expectEqual(test_ident.expected_idx, idx.idx);
// Just verify we can retrieve the text correctly
const retrieved = original.getText(idx);
try std.testing.expectEqualStrings(text, retrieved);
}

// Add some unique names
@@ -530,10 +536,10 @@ test "Ident.Store comprehensive CompactWriter roundtrip" {
deserialized.relocate(@as(isize, @intCast(@intFromPtr(buffer.ptr))));

// Verify all identifiers (skip duplicate at end)
for (test_idents[0..10], 0..) |test_ident, i| {
for (test_idents[0..10], 0..) |expected_text, i| {
const idx = indices.items[i];
const text = deserialized.getText(idx);
try std.testing.expectEqualStrings(test_ident.text, text);
try std.testing.expectEqualStrings(expected_text, text);
}

// Verify unique names
@@ -542,7 +548,8 @@

// Verify the interner's entry count is preserved
// We inserted 10 unique strings + 2 generated unique names = 12 total
try std.testing.expectEqual(@as(u32, 12), deserialized.interner.entry_count);
// With SmallIdx, "a" is inlined so we have one less entry in the interner
try std.testing.expectEqual(@as(u32, 11), deserialized.interner.entry_count);

// Verify next_unique_name
try std.testing.expectEqual(@as(u32, 2), deserialized.next_unique_name);
@@ -1695,3 +1702,7 @@ test "TargetUsize conversion to usize" {
try std.testing.expectEqual(TargetUsize.u32.size(), 4);
try std.testing.expectEqual(TargetUsize.u64.size(), 8);
}

test {
_ = @import("ident_corruption_test.zig");
}
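
Taken together, these test updates suggest that Ident.Idx now stores short identifier text inline in the index itself, exposes its attributes through an attributes() method rather than a field, and reports no interner index for inlined names. A minimal sketch under those assumptions (the exact inlining cutoff is not visible in this excerpt, since the Ident.zig diff above is not rendered):

const std = @import("std");
const base = @import("base");
const Ident = base.Ident;

test "inlined identifiers skip the interner (sketch)" {
    const gpa = std.testing.allocator;

    var store = try Ident.Store.initCapacity(gpa, 8);
    defer store.deinit(gpa);

    // "a" is assumed to be inlined here, per the entry-count comment in the
    // comprehensive roundtrip test above.
    const idx = try store.insert(gpa, Ident.for_text("a"));

    // Attributes are read through a method now, not a field.
    try std.testing.expect(!idx.attributes().ignored);
    try std.testing.expect(!idx.attributes().effectful);

    // Inlined identifiers have no backing interner entry, but getText still
    // returns the original text.
    if (idx.getInternerIdx() == null) {
        try std.testing.expectEqualStrings("a", store.getText(idx));
    }
}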
83 changes: 83 additions & 0 deletions src/base/test/ident_corruption_test.zig
@@ -0,0 +1,83 @@
const std = @import("std");
const base = @import("base");
const Ident = base.Ident;

// Test 1: nominal_import_wildcard.md regression
// The diff shows "red" corrupted to "aed"
test "red identifier should not be corrupted" {
const testing = std.testing;

const idx_opt = Ident.Idx.try_inline("red");
try testing.expect(idx_opt != null);

const idx = idx_opt.?;
const inner = idx.toInner();
try testing.expect(inner.is_small);

// Create a copy to avoid alignment issues with packed struct
const small_copy = inner.data.small;
var buffer: [7]u8 = undefined;
const result = small_copy.writeTextToBuffer(&buffer);

// This should be "red", not "aed"
try testing.expectEqualStrings("red", result);
}

// Test 2: underscore_in_regular_annotations.md regression
// The diff shows "_field2" corrupted to "_field"
test "_field2 identifier should not be corrupted" {
const testing = std.testing;
const gpa = testing.allocator;

var store = try Ident.Store.initCapacity(gpa, 10);
defer store.deinit(gpa);

const ident = Ident.for_text("_field2");
const idx = try store.insert(gpa, ident);
const result = store.getText(idx);

// This should be "_field2", not "_field"
try testing.expectEqualStrings("_field2", result);
}

// Test 3: plume_package/Color.md regression
// The diff shows "U8" corrupted to "a8"
test "U8 identifier should not be corrupted" {
const testing = std.testing;

const idx_opt = Ident.Idx.try_inline("U8");
try testing.expect(idx_opt != null);

const idx = idx_opt.?;
const inner = idx.toInner();
try testing.expect(inner.is_small);

// Create a copy to avoid alignment issues with packed struct
const small_copy = inner.data.small;
var buffer: [7]u8 = undefined;
const result = small_copy.writeTextToBuffer(&buffer);

// This should be "U8", not "a8"
try testing.expectEqualStrings("U8", result);
}

// Test 4: test_exact_pattern_crash.md regression
// The diff shows "_e" corrupted to "_a"
test "_e identifier should not be corrupted" {
const testing = std.testing;

const idx_opt = Ident.Idx.try_inline("_e");
try testing.expect(idx_opt != null);

const idx = idx_opt.?;
const inner = idx.toInner();
try testing.expect(inner.is_small);

// Create a copy to avoid alignment issues with packed struct
const small_copy = inner.data.small;
var buffer: [7]u8 = undefined;
const result = small_copy.writeTextToBuffer(&buffer);

// This should be "_e", not "_a"
try testing.expectEqualStrings("_e", result);
}
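
For quick reference, a usage sketch of the inline path these regression tests exercise. The fallback behavior is an assumption: since the comprehensive test above notes that "hello" (5 chars) still goes to the interner, anything that long or longer is presumed to make try_inline return null.

const std = @import("std");
const base = @import("base");
const Ident = base.Ident;

test "try_inline round-trip and fallback (sketch)" {
    // Assumed: a name this long is never inlined.
    try std.testing.expect(Ident.Idx.try_inline("definitely_not_short") == null);

    // Short names like the ones above ("red", "U8", "_e") are inlined.
    const idx = Ident.Idx.try_inline("ok") orelse return error.SkipZigTest;
    const inner = idx.toInner();
    try std.testing.expect(inner.is_small);

    // Copy out of the packed struct before taking a pointer, as in the tests above.
    const small_copy = inner.data.small;
    var buffer: [7]u8 = undefined;
    try std.testing.expectEqualStrings("ok", small_copy.writeTextToBuffer(&buffer));
}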
11 changes: 3 additions & 8 deletions src/cache/CacheManager.zig
@@ -9,7 +9,6 @@ const Cache = cache_mod.CacheModule;
const CacheConfig = cache_mod.CacheConfig;
const CacheStats = cache_mod.CacheStats;
const CacheReporting = @import("CacheReporting.zig");
const SERIALIZATION_ALIGNMENT = 16;
const coordinate_simple = @import("../coordinate_simple.zig");

const Allocator = std.mem.Allocator;
@@ -115,7 +114,7 @@ pub const CacheManager = struct {
///
/// Serializes the ProcessResult and stores it in the cache using BLAKE3-based
/// filenames with subdirectory splitting.
pub fn store(self: *Self, cache_key: [32]u8, process_result: *const coordinate_simple.ProcessResult) !void {
pub fn store(self: *Self, cache_key: [32]u8, process_result: *const coordinate_simple.ProcessResult, arena_allocator: Allocator) !void {
if (!self.config.enabled) {
return;
}
@@ -129,11 +128,7 @@
return;
};

// Create arena for serialization
var arena = std.heap.ArenaAllocator.init(self.allocator);
defer arena.deinit();

const cache_data = Cache.create(self.allocator, arena.allocator(), process_result.cir, process_result.cir, process_result.error_count, process_result.warning_count) catch |err| {
const cache_data = Cache.create(self.allocator, arena_allocator, process_result.cir, process_result.cir, process_result.error_count, process_result.warning_count) catch |err| {
if (self.config.verbose) {
std.log.debug("Failed to serialize cache data: {}", .{err});
}
@@ -249,7 +244,7 @@ pub const CacheManager = struct {
/// The caller must not free it after calling this function.
fn restoreFromCache(
self: *Self,
cache_data: []align(SERIALIZATION_ALIGNMENT) const u8,
cache_data: []align(@alignOf(ModuleEnv)) const u8,
source: []const u8,
) !struct {
result: coordinate_simple.ProcessResult,
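
With the arena removed from store() itself, callers now own the serialization arena and pass its allocator in. A sketch of the assumed call pattern (the surrounding function and variable names are illustrative; the real call site is updated elsewhere in this PR):

const std = @import("std");

fn storeWithCallerArena(
    gpa: std.mem.Allocator,
    cache_manager: anytype,
    cache_key: [32]u8,
    process_result: anytype,
) !void {
    // The caller creates the arena that CacheManager.store used to create internally...
    var arena = std.heap.ArenaAllocator.init(gpa);
    defer arena.deinit();

    // ...and all serialization scratch memory is freed in one shot on deinit.
    try cache_manager.store(cache_key, process_result, arena.allocator());
}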
39 changes: 14 additions & 25 deletions src/cache/CacheModule.zig
@@ -14,8 +14,6 @@ const serialization = @import("serialization");
const SExprTree = base.SExprTree;
const Filesystem = @import("../fs/Filesystem.zig");

const SERIALIZATION_ALIGNMENT = 16;

const Allocator = std.mem.Allocator;
const TypeStore = types.Store;
const ModuleEnv = compile.ModuleEnv;
@@ -43,9 +41,6 @@ pub const Header = struct {
error_count: u32,
warning_count: u32,

/// Padding to ensure alignment
_padding: [12]u8 = [_]u8{0} ** 12,

/// Error specific to initializing a Header from bytes
pub const InitError = error{
PartialRead,
@@ -79,7 +74,7 @@
/// Memory-mapped cache that can be read directly from disk
pub const CacheModule = struct {
header: *const Header,
data: []align(SERIALIZATION_ALIGNMENT) const u8,
data: []align(@alignOf(ModuleEnv)) const u8,

/// Create a cache by serializing ModuleEnv and CIR data.
/// The provided allocator is used for the returned cache data, while
@@ -91,7 +86,7 @@
_: *const ModuleEnv, // ModuleEnv contains the canonical IR
error_count: u32,
warning_count: u32,
) ![]align(SERIALIZATION_ALIGNMENT) u8 {
) ![]align(@alignOf(ModuleEnv)) u8 {
const CompactWriter = serialization.CompactWriter;

// Create CompactWriter
@@ -111,9 +106,9 @@
const total_data_size = writer.total_bytes;

// Allocate cache_data for header + data
const header_size = std.mem.alignForward(usize, @sizeOf(Header), SERIALIZATION_ALIGNMENT);
const header_size = std.mem.alignForward(usize, @sizeOf(Header), @alignOf(ModuleEnv));
const total_size = header_size + total_data_size;
const cache_data = try allocator.alignedAlloc(u8, SERIALIZATION_ALIGNMENT, total_size);
const cache_data = try allocator.alignedAlloc(u8, @alignOf(ModuleEnv), total_size);
errdefer allocator.free(cache_data);

// Initialize header
@@ -124,7 +119,6 @@
.data_size = @intCast(total_data_size),
.error_count = error_count,
.warning_count = warning_count,
._padding = [_]u8{0} ** 12,
};

// Consolidate the scattered iovecs into the cache data buffer
@@ -140,7 +134,7 @@
}

/// Load a cache from memory-mapped data
pub fn fromMappedMemory(mapped_data: []align(SERIALIZATION_ALIGNMENT) const u8) !CacheModule {
pub fn fromMappedMemory(mapped_data: []align(@alignOf(ModuleEnv)) const u8) !CacheModule {
if (mapped_data.len < @sizeOf(Header)) {
return error.BufferTooSmall;
}
@@ -156,12 +150,12 @@
if (mapped_data.len < expected_total_size) return error.BufferTooSmall;

// Get data section (must be aligned)
const header_size = std.mem.alignForward(usize, @sizeOf(Header), SERIALIZATION_ALIGNMENT);
const header_size = std.mem.alignForward(usize, @sizeOf(Header), @alignOf(ModuleEnv));
const data = mapped_data[header_size .. header_size + header.data_size];

return CacheModule{
.header = header,
.data = @as([]align(SERIALIZATION_ALIGNMENT) const u8, @alignCast(data)),
.data = @as([]align(@alignOf(ModuleEnv)) const u8, @alignCast(data)),
};
}

@@ -222,11 +216,11 @@
allocator: Allocator,
file_path: []const u8,
filesystem: anytype,
) ![]align(SERIALIZATION_ALIGNMENT) u8 {
) ![]align(@alignOf(ModuleEnv)) u8 {
const file_data = try filesystem.readFile(file_path, allocator);
defer allocator.free(file_data);

const buffer = try allocator.alignedAlloc(u8, SERIALIZATION_ALIGNMENT, file_data.len);
const buffer = try allocator.alignedAlloc(u8, @alignOf(ModuleEnv), file_data.len);
@memcpy(buffer, file_data);

return buffer;
@@ -235,14 +229,14 @@
/// Tagged union to represent cache data that can be either memory-mapped or heap-allocated
pub const CacheData = union(enum) {
mapped: struct {
ptr: [*]align(SERIALIZATION_ALIGNMENT) const u8,
ptr: [*]align(@alignOf(ModuleEnv)) const u8,
len: usize,
unaligned_ptr: [*]const u8,
unaligned_len: usize,
},
allocated: []align(SERIALIZATION_ALIGNMENT) const u8,
allocated: []align(@alignOf(ModuleEnv)) const u8,

pub fn data(self: CacheData) []align(SERIALIZATION_ALIGNMENT) const u8 {
pub fn data(self: CacheData) []align(@alignOf(ModuleEnv)) const u8 {
return switch (self) {
.mapped => |m| m.ptr[0..m.len],
.allocated => |a| a,
@@ -324,7 +318,7 @@
// Find the aligned portion within the mapped memory
const unaligned_ptr = @as([*]const u8, @ptrCast(result.ptr));
const addr = @intFromPtr(unaligned_ptr);
const aligned_addr = std.mem.alignForward(usize, addr, SERIALIZATION_ALIGNMENT);
const aligned_addr = std.mem.alignForward(usize, addr, @alignOf(ModuleEnv));
const offset = aligned_addr - addr;

if (offset >= file_size_usize) {
@@ -336,7 +330,7 @@
return CacheData{ .allocated = data };
}

const aligned_ptr = @as([*]align(SERIALIZATION_ALIGNMENT) const u8, @ptrFromInt(aligned_addr));
const aligned_ptr = @as([*]align(@alignOf(ModuleEnv)) const u8, @ptrFromInt(aligned_addr));
const aligned_len = file_size_usize - offset;

return CacheData{
@@ -362,11 +356,6 @@
data_size: u32,
};

test "Header alignment" {
// Verify the header is properly aligned
try std.testing.expect(@sizeOf(Header) % SERIALIZATION_ALIGNMENT == 0);
}

test "create and restore cache" {
// This test is skipped for now, but will be replaced by the iovecs/relocation approach anyway.
const skip_test = true;
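
The recurring change in this file is that the serialization alignment is now derived from @alignOf(ModuleEnv) instead of a hard-coded SERIALIZATION_ALIGNMENT of 16, which is also why the Header no longer needs a manual _padding field or the "Header alignment" test. A small sketch of that size calculation, using stand-in types rather than the real Header and ModuleEnv:

const std = @import("std");

// Total cache buffer size: header rounded up to the data section's alignment,
// then the data itself.
fn cacheBufferSize(comptime Header: type, comptime Env: type, data_size: usize) usize {
    const header_size = std.mem.alignForward(usize, @sizeOf(Header), @alignOf(Env));
    return header_size + data_size;
}

test "header is rounded up to the target alignment (sketch)" {
    const FakeHeader = extern struct { a: u32, b: u32 }; // 8 bytes
    const FakeEnv = struct { x: u64 }; // 8-byte alignment
    try std.testing.expectEqual(@as(usize, 8 + 32), cacheBufferSize(FakeHeader, FakeEnv, 32));
}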