|
1 | 1 | <div align='center'> |
2 | 2 |
|
3 | | -# chunkdown 🧩 |
| 3 | +# Chunkdown 🧩 |
4 | 4 |
|
5 | 5 | > Create chunks worth embedding |
6 | 6 |
|
@@ -677,34 +677,3 @@ const chunks = splitter.splitText(text, { |
677 | 677 | // - **This is a very long bold text that** |
678 | 678 | // - **might be split into two chunks** |
679 | 679 | ``` |
680 | | - |
681 | | -### Return Chunk Start and End Positions |
682 | | - |
683 | | -Currently, the splitter returns the chunks as an array of strings. That means the original position of each chunk in the source text is lost. |
684 | | -In a typical RAG setup, the source document and each chunk, together with its embedding, are stored in a database. This duplicates a lot of text, since each chunk contains parts of the original document. |
685 | | - |
686 | | -Chunkdown could return the start and end positions of each chunk in the original text, making it possible to store only the original document and reference the chunk positions when needed. |
687 | | - |
688 | | -```ts |
689 | | -const document = '...'; // original markdown document |
690 | | -const chunks = splitter.splitDocument(document); |
691 | | -// Result: |
692 | | -// [ |
693 | | -// { text: 'First chunk text...', start: 0, end: 256 }, |
694 | | -// { text: 'Second chunk text...', start: 257, end: 512 }, |
695 | | -// ... |
696 | | -// ] |
697 | | - |
698 | | -await db.insert(documentTable).values({ |
699 | | - text: document, |
700 | | -}); |
701 | | - |
702 | | -await db.insert(chunkTable).values( |
703 | | - chunks.map((chunk) => ({ |
704 | | - start: chunk.start, // start position in original document |
705 | | - end: chunk.end, // end position in original document |
706 | | - text: null, // chunk text not stored separately |
707 | | - embedding: await embed(chunk.text), |
708 | | - })), |
709 | | -); |
710 | | -``` |
0 commit comments