diff --git a/src/interchange/yaml.zig b/src/interchange/yaml.zig index 9f38ec1050a8cf..e9d6b37ef7c26a 100644 --- a/src/interchange/yaml.zig +++ b/src/interchange/yaml.zig @@ -726,7 +726,7 @@ pub fn Parser(comptime enc: Encoding) type { try self.scan(.{}); } - if (self.token.line == document_end_line) { + if (self.token.data != .eof and self.token.line == document_end_line) { return unexpectedToken(); } }, @@ -2282,16 +2282,25 @@ pub fn Parser(comptime enc: Encoding) type { .line_indent = self.line_indent, }; + // Track whether we're at the start of a new line. + // Document markers (--- and ...) are only valid at line start. + var nl = self.pos == .zero; + if (!nl) { + const prev = self.input[self.pos.sub(1).cast()]; + nl = prev == '\n' or prev == '\r'; + } + next: switch (self.next()) { 0 => { return ctx.done(); }, '-' => { - if (self.line_indent == .none and self.remainStartsWith("---") and self.isAnyOrEofAt(" \t\n\r", 3)) { + if (nl and self.line_indent == .none and self.remainStartsWith("---") and self.isAnyOrEofAt(" \t\n\r", 3)) { return ctx.done(); } + nl = false; if (!ctx.resolved and ctx.str_builder.len() == 0) { try ctx.appendSource('-', self.pos); self.inc(1); @@ -2305,10 +2314,11 @@ pub fn Parser(comptime enc: Encoding) type { }, '.' => { - if (self.line_indent == .none and self.remainStartsWith("...") and self.isAnyOrEofAt(" \t\n\r", 3)) { + if (nl and self.line_indent == .none and self.remainStartsWith("...") and self.isAnyOrEofAt(" \t\n\r", 3)) { return ctx.done(); } + nl = false; if (!ctx.resolved and ctx.str_builder.len() == 0) { switch (self.peek(1)) { 'n', @@ -2335,6 +2345,7 @@ pub fn Parser(comptime enc: Encoding) type { }, ':' => { + nl = false; if (self.isSWhiteOrBCharOrEofAt(1)) { return ctx.done(); } @@ -2365,6 +2376,7 @@ pub fn Parser(comptime enc: Encoding) type { }, '#' => { + nl = false; const prev = self.input[self.pos.sub(1).cast()]; if (self.pos == .zero or switch (prev) { ' ', @@ -2388,6 +2400,7 @@ pub fn Parser(comptime enc: Encoding) type { '{', '}', => |c| { + nl = false; switch (self.context.get()) { .block_in, .block_out, @@ -2408,6 +2421,7 @@ pub fn Parser(comptime enc: Encoding) type { ' ', '\t', => |c| { + nl = false; try ctx.appendSourceWhitespace(c, self.pos); self.inc(1); continue :next self.next(); @@ -2448,10 +2462,12 @@ pub fn Parser(comptime enc: Encoding) type { try ctx.appendWhitespaceNTimes('\n', lines); + nl = true; continue :next self.next(); }, else => |c| { + nl = false; if (ctx.resolved or ctx.str_builder.len() != 0) { const start = self.pos; self.inc(1); @@ -2921,7 +2937,11 @@ pub fn Parser(comptime enc: Encoding) type { }, '-' => { - if (self.line_indent == .none and self.remainStartsWith("---") and self.isAnyOrEofAt(" \t\n\r", 3)) { + const line_start = self.pos == .zero or switch (self.input[self.pos.sub(1).cast()]) { + '\n', '\r' => true, + else => false, + }; + if (line_start and self.line_indent == .none and self.remainStartsWith("---") and self.isAnyOrEofAt(" \t\n\r", 3)) { return ctx.done(false); } @@ -2940,7 +2960,11 @@ pub fn Parser(comptime enc: Encoding) type { }, '.' => { - if (self.line_indent == .none and self.remainStartsWith("...") and self.isAnyOrEofAt(" \t\n\r", 3)) { + const line_start = self.pos == .zero or switch (self.input[self.pos.sub(1).cast()]) { + '\n', '\r' => true, + else => false, + }; + if (line_start and self.line_indent == .none and self.remainStartsWith("...") and self.isAnyOrEofAt(" \t\n\r", 3)) { return ctx.done(false); } @@ -3644,7 +3668,11 @@ pub fn Parser(comptime enc: Encoding) type { '-' => { const start = self.pos; - if (self.line_indent == .none and self.remainStartsWith(enc.literal("---")) and self.isSWhiteOrBCharOrEofAt(3)) { + const line_start = self.pos == .zero or switch (self.input[self.pos.sub(1).cast()]) { + '\n', '\r' => true, + else => false, + }; + if (line_start and self.line_indent == .none and self.remainStartsWith(enc.literal("---")) and self.isSWhiteOrBCharOrEofAt(3)) { self.inc(3); break :next .documentStart(.{ .start = start, @@ -3724,7 +3752,11 @@ pub fn Parser(comptime enc: Encoding) type { '.' => { const start = self.pos; - if (self.line_indent == .none and self.remainStartsWith(enc.literal("...")) and self.isSWhiteOrBCharOrEofAt(3)) { + const line_start = self.pos == .zero or switch (self.input[self.pos.sub(1).cast()]) { + '\n', '\r' => true, + else => false, + }; + if (line_start and self.line_indent == .none and self.remainStartsWith(enc.literal("...")) and self.isSWhiteOrBCharOrEofAt(3)) { self.inc(3); break :next .documentEnd(.{ .start = start, @@ -4276,7 +4308,7 @@ pub fn Parser(comptime enc: Encoding) type { if (pos.isLessThan(self.input.len)) { return std.mem.indexOfScalar(enc.unit(), values, self.input[pos.cast()]) != null; } - return false; + return true; } fn isEof(self: *const @This()) bool { diff --git a/test/regression/issue/25660.test.ts b/test/regression/issue/25660.test.ts new file mode 100644 index 00000000000000..2bad6014200a24 --- /dev/null +++ b/test/regression/issue/25660.test.ts @@ -0,0 +1,106 @@ +/** + * Regression test for issue #25660 + * YAML.parse() incorrectly splits on `---` inside values + * + * @see https://github.com/oven-sh/bun/issues/25660 + */ +import { describe, expect, test } from "bun:test"; + +describe("YAML.parse document separator handling", () => { + test("should not split on --- inside scalar values", () => { + const text = ` +name: some-text--- +description: Lorem ipsum dolor sit amet, consectetur adipiscing elit. +`; + const parsed = Bun.YAML.parse(text); + expect(parsed).toEqual({ + name: "some-text---", + description: "Lorem ipsum dolor sit amet, consectetur adipiscing elit.", + }); + }); + + test("should not split on --- in middle of value", () => { + const text = `key: hello---world`; + const parsed = Bun.YAML.parse(text); + expect(parsed).toEqual({ key: "hello---world" }); + }); + + test("should not split on ... inside scalar values", () => { + const text = `key: hello...world`; + const parsed = Bun.YAML.parse(text); + expect(parsed).toEqual({ key: "hello...world" }); + }); + + test("should not treat document markers as values after ':'", () => { + const text = ` +key1: --- +key2: ... +`; + const parsed = Bun.YAML.parse(text); + expect(parsed).toEqual({ key1: "---", key2: "..." }); + }); + + test("should not treat --- at line start in multiline plain scalar as a document separator", () => { + const text = ` +message: first line + --- still part of the value + last line +`; + const parsed = Bun.YAML.parse(text); + expect(parsed).toEqual({ + message: "first line --- still part of the value last line", + }); + }); + + test("should correctly handle actual document separator at line start", () => { + const text = ` +doc1: value1 +--- +doc2: value2 +`; + const parsed = Bun.YAML.parse(text); + // When there's an actual document separator, it returns an array + expect(Array.isArray(parsed)).toBe(true); + expect(parsed).toHaveLength(2); + expect(parsed[0]).toEqual({ doc1: "value1" }); + expect(parsed[1]).toEqual({ doc2: "value2" }); + }); + + test("should handle value ending with multiple dashes", () => { + const text = ` +title: My-Title--- +subtitle: Another---Value--- +`; + const parsed = Bun.YAML.parse(text); + expect(parsed).toEqual({ + title: "My-Title---", + subtitle: "Another---Value---", + }); + }); + + test("should handle value ending with dots", () => { + const text = `message: Hello...`; + const parsed = Bun.YAML.parse(text); + expect(parsed).toEqual({ message: "Hello..." }); + }); + + test("should recognize ... as document end marker for top-level plain scalar", () => { + // This is the case Dylan pointed out: a top-level plain scalar followed by + // document end marker should correctly recognize ... as document end + const text = `hello +...`; + const parsed = Bun.YAML.parse(text); + expect(parsed).toBe("hello"); + }); + + test("should recognize --- as document separator for top-level plain scalar", () => { + const text = `first +--- +second`; + const parsed = Bun.YAML.parse(text); + expect(Array.isArray(parsed)).toBe(true); + expect(parsed).toHaveLength(2); + expect(parsed[0]).toBe("first"); + expect(parsed[1]).toBe("second"); + }); +});