Skip to content

Commit c775f7f

Browse files
committed
fix: list numbering
1 parent 30a2eb6 commit c775f7f

2 files changed

Lines changed: 555 additions & 71 deletions

File tree

src/splitter.test.ts

Lines changed: 302 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
import { toMarkdown } from 'mdast-util-to-markdown';
2-
import { toString } from 'mdast-util-to-string';
32
import { describe, expect, it } from 'vitest';
43
import { chunkdown, getContentSize } from './splitter';
54

@@ -458,10 +457,57 @@ End of instructions.`;
458457

459458
expect(listChunks.length).toBe(6);
460459
for (let i = 1; i < listChunks.length; i++) {
461-
expect(listChunks[i]).toMatch(new RegExp(`^[${i + 1}]\.`));
460+
expect(listChunks[i]).toMatch(new RegExp(`^${i + 1}\\.`)); // `\\.` yields a literal-dot regex; a bare `.` (or `\.` inside a template literal) matches any character
462461
}
463462
});
464463

464+
it('should preserve ordered list numbering with long items that get split', () => {
465+
const splitter = chunkdown({
466+
chunkSize: 200,
467+
maxOverflowRatio: 1.5,
468+
});
469+
const text = `1. **First item with very long content.** This item contains substantial text that will exceed the chunk size limit and force the splitter to break it into multiple chunks, which can cause numbering issues if not handled correctly.
470+
471+
2. **Second item with moderate content.** This item has enough content to potentially cause issues but should fit in a single chunk.
472+
473+
3. **Third item with short content.**
474+
475+
4. **Fourth item with extremely long content that will definitely be split.** This is a very detailed item that contains multiple sentences with comprehensive explanations and examples. It includes technical details, step-by-step instructions, and various formatting elements that make it substantially longer than the configured chunk size, ensuring it will be split across multiple chunks during processing.
476+
477+
5. **Fifth item with another very long section.** Similar to item 4, this contains extensive content that will cause the text splitter to break it into multiple chunks, testing whether the ordered list numbering is preserved correctly across these splits.
478+
479+
6. **Sixth item with normal content.**
480+
481+
7. **Seventh item with more long content.** This item also has substantial text that will likely exceed the chunk size and test the numbering preservation functionality in various scenarios.
482+
483+
8. **Eighth item is short.**
484+
485+
9. **Ninth and final item.**`;
486+
487+
const chunks = splitter.splitText(text);
488+
489+
// Extract all list item numbers from all chunks (not just those that start chunks)
490+
const allListNumbers: number[] = [];
491+
chunks.forEach((chunk) => {
492+
const matches = chunk.matchAll(/^(\d+)\./gm);
493+
for (const match of matches) {
494+
allListNumbers.push(Number.parseInt(match[1], 10));
495+
}
496+
});
497+
498+
// Should preserve sequential numbering: 1, 2, 3, 4, 5, 6, 7, 8, 9
499+
const expectedNumbers = [1, 2, 3, 4, 5, 6, 7, 8, 9];
500+
501+
expect(allListNumbers).toEqual(expectedNumbers);
502+
503+
// Verify that we have exactly 9 list items
504+
expect(allListNumbers.length).toBe(9);
505+
506+
// Additional verification: ensure no numbering resets to 1 after the first item
507+
const numbersAfterFirst = allListNumbers.slice(1);
508+
expect(numbersAfterFirst).not.toContain(1);
509+
});
510+
465511
it('should keep tables together if possible', () => {
466512
const splitter = chunkdown({
467513
chunkSize: 50,
@@ -557,6 +603,237 @@ End of blockquote.`;
557603
});
558604
});
559605

606+
describe('Section Merging', () => {
607+
describe('Parent-Descendant Merging', () => {
608+
it('should merge parent section with child sections when they fit together', () => {
609+
const splitter = chunkdown({
610+
chunkSize: 1000,
611+
maxOverflowRatio: 1.5,
612+
});
613+
614+
const text = `## Main Section
615+
616+
This is the main section with some introductory content that explains what this section is about.
617+
618+
### Child Section 1
619+
620+
This is the first child section with moderate content that should fit with the parent.
621+
622+
### Child Section 2
623+
624+
This is the second child section with some additional content.
625+
626+
### Child Section 3
627+
628+
This is the third child section with final content for this group.
629+
630+
## Another Main Section
631+
632+
This section should be separate since it's a sibling of the first main section.`;
633+
634+
const chunks = splitter.splitText(text);
635+
636+
// Main Section + children, Another Main Section
637+
expect(chunks.length).toBe(2);
638+
639+
// First chunk should contain parent and multiple children
640+
expect(chunks[0]).toContain('## Main Section');
641+
expect(chunks[0]).toContain('### Child Section 1');
642+
expect(chunks[0]).toContain('### Child Section 2');
643+
expect(chunks[0]).toContain('### Child Section 3');
644+
645+
// Second chunk should contain the other main section
646+
expect(chunks[1]).toContain('## Another Main Section');
647+
648+
// Should stay within allowed size
649+
chunks.forEach((chunk) => {
650+
expect(getContentSize(chunk)).toBeLessThanOrEqual(1500); // 1000 * 1.5
651+
});
652+
});
653+
654+
it('should not merge if combined size exceeds maxAllowedSize', () => {
655+
const splitter = chunkdown({
656+
chunkSize: 200,
657+
maxOverflowRatio: 1.2, // Only 240 chars allowed
658+
});
659+
660+
const text = `## Main Section
661+
662+
This is a longer main section with substantial introductory content that explains what this section is about in great detail with many words and explanations.
663+
664+
### Child Section 1
665+
666+
This child section also has substantial content that would make the combined size exceed the maximum allowed size when merged with the parent section.
667+
668+
### Child Section 2
669+
670+
Another child section with content.`;
671+
672+
const chunks = splitter.splitText(text);
673+
674+
// Should create 2 chunks due to size constraints
675+
expect(chunks.length).toBe(2);
676+
expect(chunks[0]).toContain('## Main Section');
677+
expect(chunks[1]).toContain('### Child Section 1');
678+
679+
// No chunk should exceed the allowed size
680+
chunks.forEach((chunk) => {
681+
expect(getContentSize(chunk)).toBeLessThanOrEqual(240); // 200 * 1.2
682+
});
683+
});
684+
});
685+
686+
describe('Sibling Section Merging', () => {
687+
it('should merge sibling sections when parent is too large to merge', () => {
688+
const splitter = chunkdown({
689+
chunkSize: 300,
690+
maxOverflowRatio: 1.5, // 450 chars allowed
691+
});
692+
693+
const text = `# Large Parent Section
694+
695+
This is a large parent section with substantial content that takes up significant space. It contains multiple sentences with detailed explanations and examples. This content is designed to be large enough that it cannot merge with its child sections due to size constraints. The parent section alone should be close to or exceed the base chunk size to prevent parent-child merging but allow sibling merging of the children.
696+
697+
## First Child Section
698+
699+
Short content for first child.
700+
701+
## Second Child Section
702+
703+
Short content for second child.
704+
705+
## Third Child Section
706+
707+
Short content for third child.`;
708+
709+
const chunks = splitter.splitText(text);
710+
711+
// Should create 2 chunks: large parent separate, siblings merged
712+
expect(chunks.length).toBe(2);
713+
714+
// First chunk should be the large parent alone
715+
expect(chunks[0]).toContain('# Large Parent Section');
716+
expect(chunks[0]).not.toContain('## First Child Section');
717+
718+
// Second chunk should contain merged siblings
719+
expect(chunks[1]).toContain('## First Child Section');
720+
expect(chunks[1]).toContain('## Second Child Section');
721+
expect(chunks[1]).toContain('## Third Child Section');
722+
723+
// All chunks should stay within allowed size
724+
chunks.forEach((chunk) => {
725+
expect(getContentSize(chunk)).toBeLessThanOrEqual(450); // 300 * 1.5
726+
});
727+
});
728+
729+
it('should merge some siblings but not others based on size constraints', () => {
730+
const splitter = chunkdown({
731+
chunkSize: 150,
732+
maxOverflowRatio: 1.3, // 195 chars allowed
733+
});
734+
735+
// Create scenario where parent can't merge with children,
736+
// and siblings have mixed sizes preventing complete merging
737+
const text = `# Parent Section
738+
739+
This is a parent section with substantial content that is designed to be large enough to prevent merging with any child sections. The parent section contains multiple detailed sentences with comprehensive explanations and examples that ensure its size exceeds the merge threshold when combined with any child section.
740+
741+
## Small Sibling A
742+
743+
Short content A.
744+
745+
## Small Sibling B
746+
747+
Short content B.
748+
749+
## Large Sibling Section
750+
751+
This is a much larger sibling section with substantial content that contains multiple sentences and detailed explanations that make it too large to merge with the small siblings.
752+
753+
## Small Sibling C
754+
755+
Short content C.`;
756+
757+
const chunks = splitter.splitText(text);
758+
759+
// Parent gets split due to size, siblings show selective merging behavior
760+
// Small siblings A+B merge together, large sibling separate, small sibling C separate
761+
expect(chunks.length).toBe(7);
762+
763+
// Key behavior to test: Small siblings A+B merged, but sibling C separate
764+
// Find the chunk containing small siblings A and B (merged together)
765+
const siblingABChunk = chunks.find(
766+
(chunk) =>
767+
chunk.includes('## Small Sibling A') &&
768+
chunk.includes('## Small Sibling B'),
769+
);
770+
expect(siblingABChunk).toBeDefined();
771+
expect(siblingABChunk).not.toContain('## Large Sibling Section');
772+
expect(siblingABChunk).not.toContain('## Small Sibling C');
773+
774+
// Large sibling should be in separate chunk(s)
775+
const largeSiblingChunks = chunks.filter((chunk) =>
776+
chunk.includes('## Large Sibling Section'),
777+
);
778+
expect(largeSiblingChunks.length).toBeGreaterThan(0); // filter() always returns an array, so toBeDefined() on its length could never fail
779+
expect(largeSiblingChunks.some((chunk) => chunk.includes('## Small Sibling A'))).toBe(false); // array toContain compares whole elements, so check substrings per chunk
780+
expect(largeSiblingChunks.some((chunk) => chunk.includes('## Small Sibling B'))).toBe(false); // array toContain compares whole elements, so check substrings per chunk
781+
expect(largeSiblingChunks.some((chunk) => chunk.includes('## Small Sibling C'))).toBe(false); // array toContain compares whole elements, so check substrings per chunk
782+
783+
// Small sibling C should be alone
784+
const siblingCChunk = chunks.find((chunk) =>
785+
chunk.includes('## Small Sibling C'),
786+
);
787+
expect(siblingCChunk).toBeDefined();
788+
expect(siblingCChunk).not.toContain('## Small Sibling A');
789+
expect(siblingCChunk).not.toContain('## Small Sibling B');
790+
expect(siblingCChunk).not.toContain('## Large Sibling Section');
791+
792+
// Verify size constraints
793+
chunks.forEach((chunk) => {
794+
expect(getContentSize(chunk)).toBeLessThanOrEqual(195);
795+
});
796+
});
797+
798+
it('should handle orphaned sections (limitation: currently processed individually)', () => {
799+
const splitter = chunkdown({
800+
chunkSize: 100,
801+
maxOverflowRatio: 1.5, // 150 chars allowed
802+
});
803+
804+
// Test pure sibling sections without a hierarchical parent
805+
// Note: Current implementation treats these as individual sections
806+
// This could be improved in future versions to merge orphaned siblings
807+
const text = `## Section Alpha
808+
809+
Short content A.
810+
811+
## Section Beta
812+
813+
Short content B.
814+
815+
## Section Gamma
816+
817+
Short content C.`;
818+
819+
const chunks = splitter.splitText(text);
820+
821+
// Currently creates 3 separate chunks (limitation of current implementation)
822+
expect(chunks.length).toBe(3);
823+
824+
// Each chunk should contain one section
825+
expect(chunks[0]).toContain('## Section Alpha');
826+
expect(chunks[1]).toContain('## Section Beta');
827+
expect(chunks[2]).toContain('## Section Gamma');
828+
829+
// Verify size constraints
830+
chunks.forEach((chunk) => {
831+
expect(getContentSize(chunk)).toBeLessThanOrEqual(150);
832+
});
833+
});
834+
});
835+
});
836+
560837
describe('Examples', () => {
561838
describe('AI SDK Core Documentation', () => {
562839
const text = `# AI SDK Core
@@ -943,8 +1220,9 @@ Here's a sentence with a footnote[^1].
9431220
##### H5 Heading
9441221
9451222
###### H6 Heading",
946-
"# Alternative H1 (Setext)",
947-
"## Alternative H2 (Setext)",
1223+
"# Alternative H1 (Setext)
1224+
1225+
## Alternative H2 (Setext)",
9481226
"## Text Formatting
9491227
9501228
**Bold text with asterisks** and **bold text with underscores**
@@ -977,8 +1255,9 @@ Here's a sentence with a footnote[^1].
9771255
2. Second item
9781256
1. Nested ordered item
9791257
2. Another nested item
980-
3. Third item",
981-
"### Task Lists (GFM)
1258+
3. Third item
1259+
1260+
### Task Lists (GFM)
9821261
9831262
* [x] Completed task
9841263
* [ ] Incomplete task
@@ -1000,8 +1279,9 @@ Here's a sentence with a footnote[^1].
10001279
[1]: https://example.com
10011280
10021281
[reference]: https://example.com "Reference with title"",
1003-
"## Code Blocks",
1004-
"### Fenced Code Blocks
1282+
"## Code Blocks
1283+
1284+
### Fenced Code Blocks
10051285
10061286
\`\`\`javascript
10071287
function hello() {
@@ -1173,9 +1453,11 @@ Here's a sentence with a footnote[^1].
11731453
##### H5 Heading
11741454
11751455
###### H6 Heading",
1176-
"# Alternative H1 (Setext)",
1177-
"## Alternative H2 (Setext)",
1178-
"## Text Formatting
1456+
"# Alternative H1 (Setext)
1457+
1458+
## Alternative H2 (Setext)
1459+
1460+
## Text Formatting
11791461
11801462
**Bold text with asterisks** and **bold text with underscores**
11811463
@@ -1186,8 +1468,9 @@ Here's a sentence with a footnote[^1].
11861468
~~Strikethrough text~~
11871469
11881470
\`Inline code\` with backticks",
1189-
"## Lists",
1190-
"### Unordered Lists (3 variants)
1471+
"## Lists
1472+
1473+
### Unordered Lists (3 variants)
11911474
11921475
* Item 1 with dash
11931476
* Item 2 with dash
@@ -1209,8 +1492,9 @@ Here's a sentence with a footnote[^1].
12091492
2. Second item
12101493
1. Nested ordered item
12111494
2. Another nested item
1212-
3. Third item",
1213-
"### Task Lists (GFM)
1495+
3. Third item
1496+
1497+
### Task Lists (GFM)
12141498
12151499
* [x] Completed task
12161500
* [ ] Incomplete task
@@ -1288,8 +1572,9 @@ Here's a sentence with a footnote[^1].
12881572
>
12891573
> > This is nested
12901574
> >
1291-
> > > And this is deeply nested",
1292-
"## Horizontal Rules (3 variants)",
1575+
> > > And this is deeply nested
1576+
1577+
## Horizontal Rules (3 variants)",
12931578
"***
12941579
12951580
***

0 commit comments

Comments
 (0)