feat(chunking): list-aware break point scanner

galligan · galligan · commit d69200683676 · 2026-04-08T09:56:44.000-04:00
Replaces the two naive list patterns in BREAK_PATTERNS with a
stack-based scanner that tracks nested list frames and emits
depth-weighted break points plus a list-end transition break point.

Old behavior:
  [/\n[-*]\s/g, 5, 'list']
  [/\n\d+\.\s/g, 5, 'numlist']

Both scored every list-item start at 5, so the break point almost
always lost to nearby heading/blank/codeblock scores and chunks
landed mid-item on long lists. Nested sublists and the ordered `1)`
form were not detected at all.

New scanner (findListBreakPoints):
  - depth 0 item (top-level): score 70
  - depth 1 item (first sublist): score 45
  - depth 2+ item (deeper): score 25
  - list-end (list -> non-list transition): score 75

Scope:
  - Unordered markers: `-`, `*` (matches previous behavior; `+` not
    supported — agents and modern docs don't use it)
  - Ordered markers: `1.` and `1)` (new: `1)` was never detected)
  - Mixed marker characters at the same indent are treated as one
    list (simpler than CommonMark's split rule, better for chunking)
  - Nested sublists with proper depth tracking (new)
  - Blank lines inside items don't terminate the list
  - Non-list lines indented less than the outermost item's content
    column (column-0 lines included) terminate the list and emit
    list-end

Deliberately deferred:
  - Loose vs tight list distinction (rendering concern, no chunking
    impact)
  - Lazy continuation (column-0 line that CommonMark folds back into
    the preceding item)
  - 4-space indented code blocks inside items (ambiguous with
    continuation; defer)
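  - Tab-as-marker-separator (`-\t`): the old `\s` patterns accepted a
    tab after the marker (at score 5); the new scanner requires a
    space. Tab-indented items were matched by neither old nor new, so
    that part is not a regression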

Integration: chunkDocument and chunkDocumentAsync now merge
findListBreakPoints output with scanBreakPoints before passing to
chunkDocumentWithBreakPoints. mergeBreakPoints already handles
"higher score wins at same position." AST points continue to layer
on top in the async path.

16 new tests in test/store.test.ts covering empty input, prose,
unordered/ordered/mixed lists, three-deep nesting, mixed marker
nesting, list-end at prose and EOF, blank-line continuation, `+`
rejection, position convention, and an end-to-end integration test
through chunkDocument confirming long lists split at item boundaries.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,17 @@
 
 ## [Unreleased]
 
+### Changes
+
+- List-aware chunking. A new scanner tracks nested list items with a
+  stack-based state machine and emits weighted break points: top-level
+  items score 70, second-level 45, third-level and deeper 25, and the
+  transition from a list back to prose scores 75. Previously the naive
+  `list: 5` and `numlist: 5` patterns produced break points too weak to
+  influence chunking. Long lists now split cleanly at item boundaries
+  instead of mid-item. Ordered `1)` form is newly supported, as is
+  proper detection of nested sublists (which the old regex missed).
+
 ### Fixes
 
 - Code fence detection now follows CommonMark pairing rules. Fences
diff --git a/src/store.ts b/src/store.ts
@@ -106,8 +106,6 @@ export const BREAK_PATTERNS: [RegExp, number, string][] = [
   [/\n(?:`{3,}|~{3,})/g, 80, 'codeblock'],  // code block boundary (same as h3)
   [/\n(?:---|\*\*\*|___)\s*\n/g, 60, 'hr'],  // horizontal rule
   [/\n\n+/g, 20, 'blank'],         // paragraph boundary
-  [/\n[-*]\s/g, 5, 'list'],        // unordered list item
-  [/\n\d+\.\s/g, 5, 'numlist'],    // ordered list item
   [/\n/g, 1, 'newline'],           // minimal break
 ];
 
@@ -189,6 +187,145 @@ export function isInsideProtectedRegion(pos: number, regions: ProtectedRegion[])
   return regions.some(r => pos > r.start && pos < r.end);
 }
 
+interface ListFrame {
+  indent: number;       // column of marker character
+  contentCol: number;   // column where item body begins (after marker + space)
+}
+
+/**
+ * List-aware break point scanner. Walks the document line by line, tracking
+ * nested list frames on a stack. Emits break points at list item boundaries
+ * and when a list transitions back to non-list content.
+ *
+ * Scoring:
+ *   - list-end: 75
+ *   - depth 0 item: 70
+ *   - depth 1 item: 45
+ *   - depth 2+ item: 25
+ *
+ * Marker-type transitions at the same indent are ignored (not CommonMark).
+ * Only `-`, `*`, and `\d+[.)]` markers are recognized (no `+`).
+ * Positions match scanBreakPoints convention: the `\n` before the line.
+ */
+export function findListBreakPoints(text: string): BreakPoint[] {
+  const points: BreakPoint[] = [];
+  if (text.length === 0) return points;
+
+  const itemScores = [70, 45, 25];
+  const itemTypes = ['list-item-0', 'list-item-1', 'list-item-2'];
+  const scoreFor = (depth: number): number =>
+    depth < itemScores.length ? itemScores[depth]! : itemScores[itemScores.length - 1]!;
+  const typeFor = (depth: number): string =>
+    depth < itemTypes.length ? itemTypes[depth]! : itemTypes[itemTypes.length - 1]!;
+
+  const stack: ListFrame[] = [];
+
+  // Iterate lines. lineStart is the index of the first character of the line.
+  // The break-point position we emit is lineStart - 1 (the preceding `\n`),
+  // except for the first line where there's no preceding newline.
+  let lineStart = 0;
+  const n = text.length;
+
+  // Match list item: optional leading spaces, then marker + at least one space.
+  // Unordered: - or *  (NOT +)
+  // Ordered: digits followed by . or )
+  //
+  // Known limitations (deliberate, to keep the scanner simple):
+  //   - Space-separated markers only. `-\t` (dash followed by a literal tab)
+  //     is not recognized. This pattern does not occur in practice.
+  //   - Tab-indented lines are not recognized as list items. Modern markdown
+  //     uses spaces for indentation; tab indentation was never supported by
+  //     the previous regex either, so this is not a regression.
+  //   - Loose/tight list distinction, lazy continuation, and 4-space indented
+  //     code blocks inside items are not tracked — those are rendering-level
+  //     concerns that don't change where chunks should split.
+  const itemRegex = /^( *)(?:([-*])|(\d+)([.)]))( +)/;
+
+  while (lineStart <= n) {
+    // Find end of line
+    let lineEnd = text.indexOf('\n', lineStart);
+    if (lineEnd === -1) lineEnd = n;
+    const line = text.slice(lineStart, lineEnd);
+    const bpPos = lineStart === 0 ? 0 : lineStart - 1;
+
+    const isBlank = line.trim().length === 0;
+
+    if (isBlank) {
+      // Don't change state; next non-blank decides.
+      if (lineEnd === n) break;
+      lineStart = lineEnd + 1;
+      continue;
+    }
+
+    const match = itemRegex.exec(line);
+    if (match) {
+      const leading = match[1]!;
+      const indent = leading.length;
+      const bullet = match[2];
+      const digits = match[3];
+      const ordPunct = match[4];
+      const spaces = match[5]!;
+      const markerLen = bullet ? 1 : (digits!.length + ordPunct!.length);
+      const contentCol = indent + markerLen + spaces.length;
+
+      // Pop frames whose indent exceeds this line's indent (dedent).
+      while (stack.length > 0 && indent < stack[stack.length - 1]!.indent) {
+        stack.pop();
+      }
+
+      let depth: number;
+      if (stack.length === 0) {
+        stack.push({ indent, contentCol });
+        depth = 0;
+      } else {
+        const top = stack[stack.length - 1]!;
+        if (indent >= top.contentCol) {
+          // Deeper nesting
+          stack.push({ indent, contentCol });
+          depth = stack.length - 1;
+        } else if (indent === top.indent) {
+          // Sibling
+          depth = stack.length - 1;
+        } else {
+          // Indent strictly between top.indent and top.contentCol (the pop loop
+          // above guarantees indent >= top.indent). Treat as sibling at current level.
+          depth = stack.length - 1;
+        }
+      }
+
+      // Skip the first line of the document: position 0 can never be a
+      // chunk break (chunks always start at 0) and there's no preceding
+      // newline to point to anyway.
+      if (lineStart > 0) {
+        points.push({ pos: bpPos, score: scoreFor(depth), type: typeFor(depth) });
+      }
+    } else {
+      // Non-blank, non-list line.
+      if (stack.length > 0) {
+        const indent = line.length - line.trimStart().length;
+        const bottom = stack[0]!;
+        if (indent >= bottom.contentCol) {
+          // Continuation of outermost item; keep state.
+        } else {
+          // List ends.
+          stack.length = 0;
+          points.push({ pos: bpPos, score: 75, type: 'list-end' });
+        }
+      }
+    }
+
+    if (lineEnd === n) break;
+    lineStart = lineEnd + 1;
+  }
+
+  // End of document: if still in a list, emit a list-end at text.length.
+  if (stack.length > 0) {
+    points.push({ pos: n, score: 75, type: 'list-end' });
+  }
+
+  return points.sort((a, b) => a.pos - b.pos);
+}
+
 /**
  * Find the best cut position using scored break points with distance decay.
  *
@@ -2179,7 +2316,9 @@ export function chunkDocument(
   overlapChars: number = CHUNK_OVERLAP_CHARS,
   windowChars: number = CHUNK_WINDOW_CHARS
 ): { text: string; pos: number }[] {
-  const breakPoints = scanBreakPoints(content);
+  const regexPoints = scanBreakPoints(content);
+  const listPoints = findListBreakPoints(content);
+  const breakPoints = mergeBreakPoints(regexPoints, listPoints);
   const protectedRegions = findCodeFences(content);
   return chunkDocumentWithBreakPoints(content, breakPoints, protectedRegions, maxChars, overlapChars, windowChars);
 }
@@ -2201,14 +2340,15 @@ export async function chunkDocumentAsync(
   chunkStrategy: ChunkStrategy = "regex",
 ): Promise<{ text: string; pos: number }[]> {
   const regexPoints = scanBreakPoints(content);
+  const listPoints = findListBreakPoints(content);
   const protectedRegions = findCodeFences(content);
 
-  let breakPoints = regexPoints;
+  let breakPoints = mergeBreakPoints(regexPoints, listPoints);
   if (chunkStrategy === "auto" && filepath) {
     const { getASTBreakPoints } = await import("./ast.js");
     const astPoints = await getASTBreakPoints(content, filepath);
     if (astPoints.length > 0) {
-      breakPoints = mergeBreakPoints(regexPoints, astPoints);
+      breakPoints = mergeBreakPoints(breakPoints, astPoints);
     }
   }
 
diff --git a/test/store.test.ts b/test/store.test.ts
@@ -33,6 +33,7 @@ import {
   mergeBreakPoints,
   scanBreakPoints,
   findCodeFences,
+  findListBreakPoints,
   isInsideProtectedRegion,
   findBestCutoff,
   type BreakPoint,
@@ -609,17 +610,11 @@ describe("scanBreakPoints", () => {
     expect(blank!.score).toBe(20);
   });
 
-  test("detects list items", () => {
+  test("does not detect list items (handled by findListBreakPoints)", () => {
     const text = "Intro\n- Item 1\n- Item 2\n1. Numbered";
     const breaks = scanBreakPoints(text);
-
-    const lists = breaks.filter(b => b.type === 'list');
-    const numLists = breaks.filter(b => b.type === 'numlist');
-
-    expect(lists.length).toBe(2);
-    expect(numLists.length).toBe(1);
-    expect(lists[0]!.score).toBe(5);
-    expect(numLists[0]!.score).toBe(5);
+    expect(breaks.filter(b => b.type === 'list').length).toBe(0);
+    expect(breaks.filter(b => b.type === 'numlist').length).toBe(0);
   });
 
   test("detects newlines as fallback", () => {
@@ -796,6 +791,140 @@ describe("findCodeFences", () => {
   });
 });
 
+describe("findListBreakPoints", () => {
+  test("empty input produces no break points", () => {
+    expect(findListBreakPoints("")).toEqual([]);
+  });
+
+  test("pure prose produces no break points", () => {
+    const text = "Just a paragraph.\nAnother line of prose.\nAnd more.";
+    expect(findListBreakPoints(text)).toEqual([]);
+  });
+
+  test("single unordered list: item + list-end break points", () => {
+    const text = "Intro\n- one\n- two\n- three\n\nAfter";
+    const bps = findListBreakPoints(text);
+    // 3 item breaks (all depth 0, score 70) + 1 list-end (score 75)
+    const items = bps.filter(b => b.type === 'list-item-0');
+    const ends = bps.filter(b => b.type === 'list-end');
+    expect(items.length).toBe(3);
+    expect(ends.length).toBe(1);
+    expect(items.every(b => b.score === 70)).toBe(true);
+    expect(ends[0]!.score).toBe(75);
+  });
+
+  test("ordered list with 1.", () => {
+    const text = "Intro\n1. one\n2. two\n3. three\n\nAfter";
+    const bps = findListBreakPoints(text);
+    expect(bps.filter(b => b.type === 'list-item-0').length).toBe(3);
+    expect(bps.filter(b => b.type === 'list-end').length).toBe(1);
+  });
+
+  test("ordered list with 1)", () => {
+    const text = "Intro\n1) one\n2) two\n3) three\n\nAfter";
+    const bps = findListBreakPoints(text);
+    expect(bps.filter(b => b.type === 'list-item-0').length).toBe(3);
+    expect(bps.filter(b => b.type === 'list-end').length).toBe(1);
+  });
+
+  test("mixed marker characters at same indent are one list", () => {
+    const text = "Intro\n- one\n* two\n- three\n\nAfter";
+    const bps = findListBreakPoints(text);
+    expect(bps.filter(b => b.type === 'list-item-0').length).toBe(3);
+    expect(bps.filter(b => b.type === 'list-end').length).toBe(1);
+  });
+
+  test("nested unordered list uses depth-based scores", () => {
+    const text = "Intro\n- one\n  - sub1\n  - sub2\n- two\n\nAfter";
+    const bps = findListBreakPoints(text);
+    const top = bps.filter(b => b.type === 'list-item-0');
+    const sub = bps.filter(b => b.type === 'list-item-1');
+    expect(top.length).toBe(2);
+    expect(sub.length).toBe(2);
+    expect(top.every(b => b.score === 70)).toBe(true);
+    expect(sub.every(b => b.score === 45)).toBe(true);
+  });
+
+  test("three-deep nesting produces depth 0/1/2 scores", () => {
+    const text = "Intro\n- one\n  - two\n    - three\n\nAfter";
+    const bps = findListBreakPoints(text);
+    expect(bps.find(b => b.type === 'list-item-0')!.score).toBe(70);
+    expect(bps.find(b => b.type === 'list-item-1')!.score).toBe(45);
+    expect(bps.find(b => b.type === 'list-item-2')!.score).toBe(25);
+  });
+
+  test("mixed nesting: unordered top with ordered sublist", () => {
+    const text = "Intro\n- one\n  1. sub\n  2. sub\n- two\n\nAfter";
+    const bps = findListBreakPoints(text);
+    expect(bps.filter(b => b.type === 'list-item-0').length).toBe(2);
+    expect(bps.filter(b => b.type === 'list-item-1').length).toBe(2);
+  });
+
+  test("list followed by prose emits list-end at position of prose line", () => {
+    const text = "- a\n- b\nprose";
+    const bps = findListBreakPoints(text);
+    const end = bps.find(b => b.type === 'list-end')!;
+    // list-end at the \n before "prose"
+    expect(end.pos).toBe(text.indexOf("\nprose"));
+  });
+
+  test("list at end of document emits list-end at text.length", () => {
+    const text = "- a\n- b\n- c";
+    const bps = findListBreakPoints(text);
+    const end = bps.find(b => b.type === 'list-end')!;
+    expect(end.pos).toBe(text.length);
+  });
+
+  test("single blank line between items does not terminate list", () => {
+    const text = "Intro\n- a\n\n- b\n\nAfter";
+    const bps = findListBreakPoints(text);
+    expect(bps.filter(b => b.type === 'list-item-0').length).toBe(2);
+    expect(bps.filter(b => b.type === 'list-end').length).toBe(1);
+  });
+
+  test("blank then non-list prose terminates list", () => {
+    const text = "- a\n- b\n\nSome prose here";
+    const bps = findListBreakPoints(text);
+    expect(bps.filter(b => b.type === 'list-end').length).toBe(1);
+  });
+
+  test("+ markers are not detected as list items", () => {
+    const text = "Intro\n+ foo\n+ bar\n+ baz\n\nAfter";
+    const bps = findListBreakPoints(text);
+    expect(bps.length).toBe(0);
+  });
+
+  test("position convention: pos is the \\n before the line", () => {
+    const text = "Intro\n- one\n- two\n\nAfter";
+    const bps = findListBreakPoints(text);
+    const items = bps.filter(b => b.type === 'list-item-0');
+    // First item: \n before "- one" at index 5
+    expect(items[0]!.pos).toBe(text.indexOf("\n- one"));
+    expect(items[1]!.pos).toBe(text.indexOf("\n- two"));
+  });
+
+  test("integration: chunkDocument splits a long list at item boundaries", () => {
+    // Build a list long enough to force splitting
+    const items: string[] = [];
+    for (let i = 0; i < 200; i++) {
+      items.push(`- list item number ${i} with some descriptive text here to consume characters`);
+    }
+    const text = "# Header\n\n" + items.join("\n") + "\n";
+    const chunks = chunkDocument(text, 1000, 100, 300);
+    expect(chunks.length).toBeGreaterThan(1);
+    // Each chunk except the last should end on a complete list item line,
+    // meaning the split landed at a list-item break point (the \n before
+    // the next item).
+    for (let i = 0; i < chunks.length - 1; i++) {
+      const chunkText = chunks[i]!.text;
+      const lines = chunkText.split("\n");
+      // Drop trailing empty from a terminal \n
+      const last = lines[lines.length - 1] === "" ? lines[lines.length - 2]! : lines[lines.length - 1]!;
+      expect(last.startsWith("- list item")).toBe(true);
+    }
+  });
+});
+
 describe("isInsideProtectedRegion", () => {
   test("returns true for position inside fence", () => {
     const fences: ProtectedRegion[] = [{ start: 10, end: 30 }];