Skip to content

Commit d692006

Browse files
committed
feat(chunking): list-aware break point scanner
Replaces the two naive list patterns in BREAK_PATTERNS with a stack-based scanner that tracks nested list frames and emits depth-weighted break points plus a list-end transition break point.

Old behavior:
  [/\n[-*]\s/g, 5, 'list']
  [/\n\d+\.\s/g, 5, 'numlist']
Both scored every list-item start at 5, so the break point almost always lost to nearby heading/blank/codeblock scores and chunks landed mid-item on long lists. Nested sublists and the ordered `1)` form were not detected at all.

New scanner (findListBreakPoints):
- depth 0 item (top-level): score 70
- depth 1 item (first sublist): score 45
- depth 2+ item (deeper): score 25
- list-end (list -> non-list transition): score 75

Scope:
- Unordered markers: `-`, `*` (matches previous behavior; `+` not supported — agents and modern docs don't use it)
- Ordered markers: `1.` and `1)` (new: `1)` was never detected)
- Mixed marker characters at the same indent are treated as one list (simpler than CommonMark's split rule, better for chunking)
- Nested sublists with proper depth tracking (new)
- Blank lines inside items don't terminate the list
- Column-0 non-list lines terminate the list and emit list-end

Deliberately deferred:
- Loose vs tight list distinction (rendering concern, no chunking impact)
- Lazy continuation (column-0 line that CommonMark folds back into the preceding item)
- 4-space indented code blocks inside items (ambiguous with continuation; defer)
- Tab-as-marker-separator (`-\t`); not a regression since neither old nor new matches tab indentation

Integration: chunkDocument and chunkDocumentAsync now merge findListBreakPoints output with scanBreakPoints before passing to chunkDocumentWithBreakPoints. mergeBreakPoints already handles "higher score wins at same position." AST points continue to layer on top in the async path.
16 new tests in test/store.test.ts covering empty input, prose, unordered/ordered/mixed lists, three-deep nesting, mixed marker nesting, list-end at prose and EOF, blank-line continuation, `+` rejection, position convention, and an end-to-end integration test through chunkDocument confirming long lists split at item boundaries.
1 parent 2458400 commit d692006

3 files changed

Lines changed: 294 additions & 14 deletions

File tree

CHANGELOG.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,17 @@
22

33
## [Unreleased]
44

5+
### Changes
6+
7+
- List-aware chunking. A new scanner tracks nested list items with a
8+
stack-based state machine and emits weighted break points: top-level
9+
items score 70, second-level 45, third-level and deeper 25, and the
10+
transition from a list back to prose scores 75. Previously the naive
11+
`list: 5` and `numlist: 5` patterns produced break points too weak to
12+
influence chunking. Long lists now split cleanly at item boundaries
13+
instead of mid-item. Ordered `1)` form is newly supported, as is
14+
proper detection of nested sublists (which the old regex missed).
15+
516
### Fixes
617

718
- Code fence detection now follows CommonMark pairing rules. Fences

src/store.ts

Lines changed: 145 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -106,8 +106,6 @@ export const BREAK_PATTERNS: [RegExp, number, string][] = [
106106
[/\n(?:`{3,}|~{3,})/g, 80, 'codeblock'], // code block boundary (same as h3)
107107
[/\n(?:---|\*\*\*|___)\s*\n/g, 60, 'hr'], // horizontal rule
108108
[/\n\n+/g, 20, 'blank'], // paragraph boundary
109-
[/\n[-*]\s/g, 5, 'list'], // unordered list item
110-
[/\n\d+\.\s/g, 5, 'numlist'], // ordered list item
111109
[/\n/g, 1, 'newline'], // minimal break
112110
];
113111

@@ -189,6 +187,145 @@ export function isInsideProtectedRegion(pos: number, regions: ProtectedRegion[])
189187
return regions.some(r => pos > r.start && pos < r.end);
190188
}
191189

190+
/** One open list level on the scanner's stack. */
interface ListFrame {
  /** Column of the marker character of the item that opened this level. */
  indent: number;
  /**
   * Column at which nested items and continuation lines of this level begin.
   * Starts as the body column (after marker + spaces) of the opening item and
   * is lowered whenever a later sibling has a smaller body column, so the
   * threshold never goes stale when marker widths differ between siblings.
   */
  contentCol: number;
}

/**
 * List-aware break point scanner. Walks the document line by line, tracking
 * nested list frames on a stack. Emits break points at list item boundaries
 * and when a list transitions back to non-list content.
 *
 * Scoring:
 *   - list-end:      75
 *   - depth 0 item:  70
 *   - depth 1 item:  45
 *   - depth 2+ item: 25 (reported with type `list-item-2`)
 *
 * Marker-type transitions at the same indent are ignored (not CommonMark).
 * Only `-`, `*`, and `\d+[.)]` markers are recognized (no `+`).
 * Positions match scanBreakPoints convention: the `\n` before the line.
 *
 * Known limitations (deliberate, to keep the scanner simple):
 *   - Space-separated markers only; `-\t` is not recognized.
 *   - Tab-indented lines are not recognized as list items.
 *   - Loose/tight lists, lazy continuation, and 4-space indented code blocks
 *     inside items are not tracked.
 *
 * @param text Document text to scan.
 * @returns Break points in ascending `pos` order. A list item on the very
 *   first line yields no item point (there is no preceding `\n`). A list still
 *   open at end of input yields a final `list-end` at `text.length`.
 */
export function findListBreakPoints(text: string): BreakPoint[] {
  const points: BreakPoint[] = [];
  const n = text.length;
  if (n === 0) return points;

  const scoreFor = (depth: number): number => (depth === 0 ? 70 : depth === 1 ? 45 : 25);
  const typeFor = (depth: number): string => `list-item-${Math.min(depth, 2)}`;

  // Optional leading spaces, then `-` / `*` / digits + `.` or `)`, then at
  // least one space. The whole match is the item's prefix, so its length is
  // the column where the item body starts.
  const itemRegex = /^( *)(?:[-*]|\d+[.)]) +/;

  const stack: ListFrame[] = [];

  let lineStart = 0;
  while (lineStart <= n) {
    const newlineAt = text.indexOf('\n', lineStart);
    const lineEnd = newlineAt === -1 ? n : newlineAt;
    const line = text.slice(lineStart, lineEnd);
    // Position of the `\n` preceding this line (0 for the first line).
    const bpPos = lineStart === 0 ? 0 : lineStart - 1;

    // Blank lines never change list state; the next non-blank line decides.
    if (line.trim().length > 0) {
      const match = itemRegex.exec(line);
      if (match) {
        const [prefix = '', leading = ''] = match;
        const indent = leading.length;
        const contentCol = prefix.length;

        // Dedent: close every level whose marker sits to the right of this one.
        let top = stack[stack.length - 1];
        while (top !== undefined && indent < top.indent) {
          stack.pop();
          top = stack[stack.length - 1];
        }

        if (top === undefined || indent >= top.contentCol) {
          // First item of a new list, or an item nested under the current one.
          stack.push({ indent, contentCol });
        } else {
          // Sibling at the current level (including markers that sit between
          // top.indent and top.contentCol). Keep the most lenient body column
          // seen at this level so that children and continuation lines of a
          // narrower sibling (`-   a` followed by `- b`) are still recognized
          // instead of being scored as top-level items or ending the list.
          top.contentCol = Math.min(top.contentCol, contentCol);
        }
        const depth = stack.length - 1;

        // Skip the first line of the document: position 0 can never be a
        // chunk break and there is no preceding newline to point to.
        if (lineStart > 0) {
          points.push({ pos: bpPos, score: scoreFor(depth), type: typeFor(depth) });
        }
      } else {
        // Non-blank, non-list line: either a continuation of the outermost
        // item (indented at least to its body column) or the end of the list.
        const outer = stack[0];
        if (outer !== undefined) {
          const indent = line.length - line.trimStart().length;
          if (indent < outer.contentCol) {
            stack.length = 0;
            points.push({ pos: bpPos, score: 75, type: 'list-end' });
          }
        }
      }
    }

    if (lineEnd === n) break;
    lineStart = lineEnd + 1;
  }

  // End of document: if still in a list, emit a list-end at text.length.
  if (stack.length > 0) {
    points.push({ pos: n, score: 75, type: 'list-end' });
  }

  // Each line contributes at most one point and lines are visited in order,
  // so `points` is already sorted by ascending position.
  return points;
}
328+
192329
/**
193330
* Find the best cut position using scored break points with distance decay.
194331
*
@@ -2179,7 +2316,9 @@ export function chunkDocument(
21792316
overlapChars: number = CHUNK_OVERLAP_CHARS,
21802317
windowChars: number = CHUNK_WINDOW_CHARS
21812318
): { text: string; pos: number }[] {
2182-
const breakPoints = scanBreakPoints(content);
2319+
const regexPoints = scanBreakPoints(content);
2320+
const listPoints = findListBreakPoints(content);
2321+
const breakPoints = mergeBreakPoints(regexPoints, listPoints);
21832322
const protectedRegions = findCodeFences(content);
21842323
return chunkDocumentWithBreakPoints(content, breakPoints, protectedRegions, maxChars, overlapChars, windowChars);
21852324
}
@@ -2201,14 +2340,15 @@ export async function chunkDocumentAsync(
22012340
chunkStrategy: ChunkStrategy = "regex",
22022341
): Promise<{ text: string; pos: number }[]> {
22032342
const regexPoints = scanBreakPoints(content);
2343+
const listPoints = findListBreakPoints(content);
22042344
const protectedRegions = findCodeFences(content);
22052345

2206-
let breakPoints = regexPoints;
2346+
let breakPoints = mergeBreakPoints(regexPoints, listPoints);
22072347
if (chunkStrategy === "auto" && filepath) {
22082348
const { getASTBreakPoints } = await import("./ast.js");
22092349
const astPoints = await getASTBreakPoints(content, filepath);
22102350
if (astPoints.length > 0) {
2211-
breakPoints = mergeBreakPoints(regexPoints, astPoints);
2351+
breakPoints = mergeBreakPoints(breakPoints, astPoints);
22122352
}
22132353
}
22142354

test/store.test.ts

Lines changed: 138 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ import {
3333
mergeBreakPoints,
3434
scanBreakPoints,
3535
findCodeFences,
36+
findListBreakPoints,
3637
isInsideProtectedRegion,
3738
findBestCutoff,
3839
type BreakPoint,
@@ -609,17 +610,11 @@ describe("scanBreakPoints", () => {
609610
expect(blank!.score).toBe(20);
610611
});
611612

612-
test("detects list items", () => {
613+
test("does not detect list items (handled by findListBreakPoints)", () => {
613614
const text = "Intro\n- Item 1\n- Item 2\n1. Numbered";
614615
const breaks = scanBreakPoints(text);
615-
616-
const lists = breaks.filter(b => b.type === 'list');
617-
const numLists = breaks.filter(b => b.type === 'numlist');
618-
619-
expect(lists.length).toBe(2);
620-
expect(numLists.length).toBe(1);
621-
expect(lists[0]!.score).toBe(5);
622-
expect(numLists[0]!.score).toBe(5);
616+
expect(breaks.filter(b => b.type === 'list').length).toBe(0);
617+
expect(breaks.filter(b => b.type === 'numlist').length).toBe(0);
623618
});
624619

625620
test("detects newlines as fallback", () => {
@@ -796,6 +791,140 @@ describe("findCodeFences", () => {
796791
});
797792
});
798793

794+
describe("findListBreakPoints", () => {
  // Small selectors shared by the cases below.
  const ofType = (bps: BreakPoint[], type: string): BreakPoint[] =>
    bps.filter(bp => bp.type === type);
  const countOf = (bps: BreakPoint[], type: string): number => ofType(bps, type).length;
  const firstOf = (bps: BreakPoint[], type: string): BreakPoint =>
    bps.find(bp => bp.type === type)!;

  test("empty input produces no break points", () => {
    const result = findListBreakPoints("");
    expect(result).toEqual([]);
  });

  test("pure prose produces no break points", () => {
    const prose = "Just a paragraph.\nAnother line of prose.\nAnd more.";
    const result = findListBreakPoints(prose);
    expect(result).toEqual([]);
  });

  test("single unordered list: item + list-end break points", () => {
    const bps = findListBreakPoints("Intro\n- one\n- two\n- three\n\nAfter");
    // Expect three depth-0 item breaks (score 70) and one list-end (score 75).
    const itemBreaks = ofType(bps, 'list-item-0');
    const endBreaks = ofType(bps, 'list-end');
    expect(itemBreaks.length).toBe(3);
    expect(endBreaks.length).toBe(1);
    expect(itemBreaks.every(bp => bp.score === 70)).toBe(true);
    expect(endBreaks[0]!.score).toBe(75);
  });

  test("ordered list with 1.", () => {
    const bps = findListBreakPoints("Intro\n1. one\n2. two\n3. three\n\nAfter");
    expect(countOf(bps, 'list-item-0')).toBe(3);
    expect(countOf(bps, 'list-end')).toBe(1);
  });

  test("ordered list with 1)", () => {
    const bps = findListBreakPoints("Intro\n1) one\n2) two\n3) three\n\nAfter");
    expect(countOf(bps, 'list-item-0')).toBe(3);
    expect(countOf(bps, 'list-end')).toBe(1);
  });

  test("mixed marker characters at same indent are one list", () => {
    const bps = findListBreakPoints("Intro\n- one\n* two\n- three\n\nAfter");
    expect(countOf(bps, 'list-item-0')).toBe(3);
    expect(countOf(bps, 'list-end')).toBe(1);
  });

  test("nested unordered list uses depth-based scores", () => {
    const bps = findListBreakPoints("Intro\n- one\n  - sub1\n  - sub2\n- two\n\nAfter");
    const outer = ofType(bps, 'list-item-0');
    const inner = ofType(bps, 'list-item-1');
    expect(outer.length).toBe(2);
    expect(inner.length).toBe(2);
    expect(outer.every(bp => bp.score === 70)).toBe(true);
    expect(inner.every(bp => bp.score === 45)).toBe(true);
  });

  test("three-deep nesting produces depth 0/1/2 scores", () => {
    const bps = findListBreakPoints("Intro\n- one\n  - two\n    - three\n\nAfter");
    expect(firstOf(bps, 'list-item-0').score).toBe(70);
    expect(firstOf(bps, 'list-item-1').score).toBe(45);
    expect(firstOf(bps, 'list-item-2').score).toBe(25);
  });

  test("mixed nesting: unordered top with ordered sublist", () => {
    const bps = findListBreakPoints("Intro\n- one\n  1. sub\n  2. sub\n- two\n\nAfter");
    expect(countOf(bps, 'list-item-0')).toBe(2);
    expect(countOf(bps, 'list-item-1')).toBe(2);
  });

  test("list followed by prose emits list-end at position of prose line", () => {
    const doc = "- a\n- b\nprose";
    const listEnd = firstOf(findListBreakPoints(doc), 'list-end');
    // The list-end sits on the newline that precedes "prose".
    expect(listEnd.pos).toBe(doc.indexOf("\nprose"));
  });

  test("list at end of document emits list-end at text.length", () => {
    const doc = "- a\n- b\n- c";
    const listEnd = firstOf(findListBreakPoints(doc), 'list-end');
    expect(listEnd.pos).toBe(doc.length);
  });

  test("single blank line between items does not terminate list", () => {
    const bps = findListBreakPoints("Intro\n- a\n\n- b\n\nAfter");
    expect(countOf(bps, 'list-item-0')).toBe(2);
    expect(countOf(bps, 'list-end')).toBe(1);
  });

  test("blank then non-list prose terminates list", () => {
    const bps = findListBreakPoints("- a\n- b\n\nSome prose here");
    expect(countOf(bps, 'list-end')).toBe(1);
  });

  test("+ markers are not detected as list items", () => {
    const bps = findListBreakPoints("Intro\n+ foo\n+ bar\n+ baz\n\nAfter");
    expect(bps.length).toBe(0);
  });

  test("position convention: pos is the \\n before the line", () => {
    const doc = "Intro\n- one\n- two\n\nAfter";
    const itemBreaks = ofType(findListBreakPoints(doc), 'list-item-0');
    // Each item break points at the newline in front of its own line.
    expect(itemBreaks[0]!.pos).toBe(doc.indexOf("\n- one"));
    expect(itemBreaks[1]!.pos).toBe(doc.indexOf("\n- two"));
  });

  test("integration: chunkDocument splits a long list at item boundaries", () => {
    // 200 items is comfortably more than fits in one 1000-character chunk.
    const listLines = Array.from(
      { length: 200 },
      (_, i) => `- list item number ${i} with some descriptive text here to consume characters`,
    );
    const doc = "# Header\n\n" + listLines.join("\n") + "\n";
    const chunks = chunkDocument(doc, 1000, 100, 300);
    expect(chunks.length).toBeGreaterThan(1);

    // Every chunk but the final one must finish on a whole list-item line,
    // i.e. the cut was made at the newline in front of the following item.
    chunks.slice(0, -1).forEach(chunk => {
      const lines = chunk.text.split("\n");
      const tail = lines[lines.length - 1];
      // A trailing "\n" leaves an empty final element; step back one line.
      const last = tail === "" ? lines[lines.length - 2]! : tail!;
      expect(last.startsWith("- list item")).toBe(true);
    });
  });
});
927+
799928
describe("isInsideProtectedRegion", () => {
800929
test("returns true for position inside fence", () => {
801930
const fences: ProtectedRegion[] = [{ start: 10, end: 30 }];

0 commit comments

Comments
 (0)