-
-
Notifications
You must be signed in to change notification settings - Fork 467
Feat/document utils #893
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Feat/document utils #893
Changes from all commits
f5a111d
dcb68f4
9fc5c34
7eb6e66
af1b163
52142d4
b0ed29e
5fa09b9
8fd206e
1b58ba3
112194c
64e2cff
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,6 @@ | ||
| --- | ||
| "@voltagent/documents": minor | ||
| "@voltagent/core": minor | ||
| --- | ||
|
|
||
| feat: implement document chunking and embedding utilities and integrate into core |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,73 @@ | ||
| import { DocumentProcessor, type ProcessedDocument } from "@voltagent/documents"; | ||
| import type { BaseMessage } from "../agent/providers"; | ||
| import { BaseRetriever } from "./retriever"; | ||
| import type { RetrieveOptions, RetrieverOptions } from "./types"; | ||
|
|
||
| export interface DocumentRetrieverOptions extends RetrieverOptions { | ||
| /** | ||
| * Optional custom document processor. | ||
| * If not provided, a default one will be created. | ||
| */ | ||
| processor?: DocumentProcessor; | ||
| } | ||
|
|
||
| /** | ||
| * Abstract base class for retrievers that handle document ingestion and vector search. | ||
| */ | ||
| export abstract class DocumentRetriever extends BaseRetriever { | ||
| protected processor: DocumentProcessor; | ||
|
|
||
| constructor(options: DocumentRetrieverOptions = {}) { | ||
| super(options); | ||
| this.processor = options.processor || new DocumentProcessor(); | ||
| } | ||
|
|
||
| /** | ||
| * Ingests text, chunks it, embeds it, and stores it using upsertDocuments. | ||
| * @param text The raw text to ingest | ||
| * @param metadata Optional metadata to attach to all chunks | ||
| */ | ||
| async ingest(text: string, metadata?: Record<string, any>): Promise<void> { | ||
| this.logger.debug("Ingesting document text", { length: text.length }); | ||
| const documents = await this.processor.process(text, metadata); | ||
| await this.upsertDocuments(documents); | ||
| this.logger.debug("Document ingestion complete", { chunks: documents.length }); | ||
| } | ||
|
|
||
| /** | ||
| * Abstract method to store processed documents in the underlying storage (e.g., Vector DB). | ||
| * @param documents The processed documents containing embeddings and metadata | ||
| */ | ||
| abstract upsertDocuments(documents: ProcessedDocument[]): Promise<void>; | ||
|
|
||
| /** | ||
| * Abstract method to retrieve documents based on a query vector. | ||
| * This is a helper for the main retrieve method. | ||
| * @param vector The query vector | ||
| * @param k Number of results to return | ||
| */ | ||
| abstract queryVectors(vector: number[], k: number): Promise<ProcessedDocument[]>; | ||
|
|
||
| /** | ||
| * Default implementation of retrieve that embeds the query and searches vectors. | ||
| * Can be overridden if needed. | ||
| */ | ||
| async retrieve(input: string | BaseMessage[], options: RetrieveOptions = {}): Promise<string> { | ||
| if (Array.isArray(input) && input.length === 0) { | ||
| return ""; | ||
| } | ||
| const textQuery = typeof input === "string" ? input : input[input.length - 1].content; | ||
|
|
||
| // We assume the processor's embedder has an embedQuery method. | ||
| // Since DocumentProcessor exposes 'embedder', we can use it. | ||
| const queryVector = await this.processor.embedder.embedQuery(textQuery as string); | ||
|
|
||
| // Default top-k to 4, can be customizable via options | ||
| const k = (options as any).k ?? 4; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Avoid Using Proposed fixDefine proper types for the k parameter: +export interface DocumentRetrieveOptions extends RetrieveOptions {
+ /**
+ * Number of top results to return from vector search.
+ * @default 4
+ */
+ k?: number;
+}
+
export abstract class DocumentRetriever extends BaseRetriever {
// ...
- async retrieve(input: string | BaseMessage[], options: RetrieveOptions = {}): Promise<string> {
+ async retrieve(input: string | BaseMessage[], options: DocumentRetrieveOptions = {}): Promise<string> {
// ...
- const k = (options as any).k ?? 4;
+ const k = options.k ?? 4;🤖 Prompt for AI Agents |
||
|
|
||
| const results = await this.queryVectors(queryVector, k); | ||
|
|
||
| // Join the text of the results | ||
| return results.map((doc) => doc.text).join("\n\n"); | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,71 @@ | ||
| # @voltagent/documents | ||
|
|
||
| Utilities for document processing, chunking, and embedding generation. | ||
|
|
||
| ## Installation | ||
|
|
||
| ```bash | ||
| pnpm add @voltagent/documents | ||
| ``` | ||
|
|
||
| ## Usage | ||
|
|
||
| ### Text Splitting | ||
|
|
||
| Use `RecursiveCharacterTextSplitter` to split text into chunks while preserving context. | ||
|
|
||
| ```typescript | ||
| import { RecursiveCharacterTextSplitter } from "@voltagent/documents"; | ||
|
|
||
| const splitter = new RecursiveCharacterTextSplitter({ | ||
| chunkSize: 1000, | ||
| chunkOverlap: 200, | ||
| }); | ||
|
|
||
| const text = "Your long text here..."; | ||
| const chunks = await splitter.splitText(text); | ||
| ``` | ||
|
|
||
| ### Embeddings | ||
|
|
||
| Use `OpenAIEmbeddingModel` to generate embeddings for your text. | ||
|
|
||
| ```typescript | ||
| import { OpenAIEmbeddingModel } from "@voltagent/documents"; | ||
|
|
||
| const embedder = new OpenAIEmbeddingModel({ | ||
| apiKey: process.env.OPENAI_API_KEY, // Optional if set in env | ||
| model: "text-embedding-ada-002", // Default | ||
| }); | ||
|
|
||
| const embedding = await embedder.embedQuery("Hello world"); | ||
| ``` | ||
|
|
||
| ### Document Processor | ||
|
|
||
| The `DocumentProcessor` combines splitting and embedding. | ||
|
|
||
| ```typescript | ||
| import { DocumentProcessor } from "@voltagent/documents"; | ||
|
|
||
| const processor = new DocumentProcessor(); | ||
| // Or with custom splitter/embedder: | ||
| // const processor = new DocumentProcessor(customSplitter, customEmbedder); | ||
|
|
||
| const documents = await processor.process("Long text content...", { | ||
| source: "example.txt", | ||
| author: "Me", | ||
| }); | ||
|
|
||
| /* | ||
| Returns: | ||
| [ | ||
| { | ||
| text: "chunk 1...", | ||
| embedding: [0.123, ...], | ||
| metadata: { source: "example.txt", author: "Me", chunkIndex: 0, ... } | ||
| }, | ||
| ... | ||
| ] | ||
| */ | ||
| ``` |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,38 @@ | ||
| { | ||
| "name": "@voltagent/documents", | ||
| "version": "0.0.1", | ||
| "description": "Document processing and embedding utilities for VoltAgent", | ||
| "main": "dist/index.js", | ||
| "module": "dist/index.mjs", | ||
| "types": "dist/index.d.ts", | ||
| "exports": { | ||
| ".": { | ||
| "import": { | ||
| "types": "./dist/index.d.mts", | ||
| "default": "./dist/index.mjs" | ||
| }, | ||
| "require": { | ||
| "types": "./dist/index.d.ts", | ||
| "default": "./dist/index.js" | ||
| } | ||
| } | ||
| }, | ||
| "files": [ | ||
| "dist" | ||
| ], | ||
| "scripts": { | ||
| "build": "tsup", | ||
| "dev": "tsup --watch", | ||
| "typecheck": "tsc --noEmit", | ||
| "test": "vitest run" | ||
| }, | ||
| "dependencies": { | ||
| "openai": "^6.0.0" | ||
| }, | ||
|
Comment on lines
+29
to
+31
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🧩 Analysis chain🏁 Script executed: # Check latest openai version and release info
curl -s https://registry.npmjs.org/openai | jq -r '.["dist-tags"] | to_entries[] | "\(.key): \(.value)"'Repository: VoltAgent/voltagent Length of output: 137 🌐 Web query:
💡 Result: Summary (as of Jan 13, 2026):
Recommended actions:
Sources: Update OpenAI SDK to a supported version. The package depends on 🤖 Prompt for AI Agents |
||
| "devDependencies": { | ||
| "tsup": "^8.5.0", | ||
| "typescript": "^5.8.2", | ||
| "vitest": "^3.2.4", | ||
| "@types/node": "^24.2.1" | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,44 @@ | ||
| import { describe, expect, it } from "vitest"; | ||
| import { DocumentProcessor } from "./DocumentProcessor"; | ||
| import type { EmbeddingModel } from "./embeddings/EmbeddingModel"; | ||
| import { TextSplitter } from "./text-splitters/TextSplitter"; | ||
|
|
||
| class MockSplitter extends TextSplitter { | ||
| async splitText(text: string): Promise<string[]> { | ||
| return text.split("|"); | ||
| } | ||
| } | ||
|
|
||
| class MockEmbedder implements EmbeddingModel { | ||
| async embedQuery(_text: string): Promise<number[]> { | ||
| return [0.1, 0.2]; | ||
| } | ||
| async embedDocuments(documents: string[]): Promise<number[][]> { | ||
| return documents.map(() => [0.1, 0.2]); | ||
| } | ||
| } | ||
|
|
||
| describe("DocumentProcessor", () => { | ||
| it("processes text into documents with embeddings", async () => { | ||
| const processor = new DocumentProcessor(new MockSplitter(), new MockEmbedder()); | ||
| const result = await processor.process("part1|part2", { file: "test.txt" }); | ||
|
|
||
| expect(result).toHaveLength(2); | ||
|
|
||
| expect(result[0].text).toBe("part1"); | ||
| expect(result[0].embedding).toEqual([0.1, 0.2]); | ||
| expect(result[0].metadata).toEqual({ | ||
| file: "test.txt", | ||
| chunkIndex: 0, | ||
| chunkCount: 2, | ||
| }); | ||
|
|
||
| expect(result[1].text).toBe("part2"); | ||
| expect(result[1].embedding).toEqual([0.1, 0.2]); | ||
| expect(result[1].metadata).toEqual({ | ||
| file: "test.txt", | ||
| chunkIndex: 1, | ||
| chunkCount: 2, | ||
| }); | ||
| }); | ||
| }); |
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| @@ -0,0 +1,35 @@ | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| import type { EmbeddingModel } from "./embeddings/EmbeddingModel"; | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| import { OpenAIEmbeddingModel } from "./embeddings/OpenAIEmbeddingModel"; | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| import { RecursiveCharacterTextSplitter } from "./text-splitters/RecursiveCharacterTextSplitter"; | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| import type { TextSplitter } from "./text-splitters/TextSplitter"; | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| export interface ProcessedDocument { | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| text: string; | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| embedding: number[]; | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| metadata?: Record<string, any>; | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| export class DocumentProcessor { | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| splitter: TextSplitter; | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| embedder: EmbeddingModel; | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| constructor(splitter?: TextSplitter, embedder?: EmbeddingModel) { | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| this.splitter = splitter ?? new RecursiveCharacterTextSplitter(); | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| this.embedder = embedder ?? new OpenAIEmbeddingModel(); | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| async process(text: string, metadata?: Record<string, any>): Promise<ProcessedDocument[]> { | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| const chunks = await this.splitter.splitText(text); | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| const embeddings = await this.embedder.embedDocuments(chunks); | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| return chunks.map((chunk, index) => ({ | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| text: chunk, | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| embedding: embeddings[index], | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| metadata: { | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| ...metadata, | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| chunkIndex: index, | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| chunkCount: chunks.length, | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| }, | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| })); | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|
Comment on lines
+21
to
+34
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Potential undefined embedding if array lengths mismatch. If 🛡️ Proposed defensive check async process(text: string, metadata?: Record<string, any>): Promise<ProcessedDocument[]> {
const chunks = await this.splitter.splitText(text);
const embeddings = await this.embedder.embedDocuments(chunks);
+ if (embeddings.length !== chunks.length) {
+ throw new Error(
+ `Embedding count mismatch: expected ${chunks.length}, got ${embeddings.length}`
+ );
+ }
+
return chunks.map((chunk, index) => ({
text: chunk,
embedding: embeddings[index],📝 Committable suggestion
Suggested change
🤖 Prompt for AI Agents |
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,76 @@ | ||
| import { describe, expect, it } from "vitest"; | ||
| import { RecursiveCharacterTextSplitter } from "./text-splitters/RecursiveCharacterTextSplitter"; | ||
|
|
||
| describe("RecursiveCharacterTextSplitter", () => { | ||
| it("splits text based on characters", async () => { | ||
| const splitter = new RecursiveCharacterTextSplitter({ | ||
| chunkSize: 10, | ||
| chunkOverlap: 0, | ||
| }); | ||
| const text = "abcdefghijklmnopqrstuvwxyz"; | ||
| const chunks = await splitter.splitText(text); | ||
| // Expect chunks to be size 10 -> "abcdefghij", "klmnopqrst", "uvwxyz" | ||
| expect(chunks).toEqual(["abcdefghij", "klmnopqrst", "uvwxyz"]); | ||
| }); | ||
|
|
||
| it("splits text with simple separator", async () => { | ||
| const splitter = new RecursiveCharacterTextSplitter({ | ||
| chunkSize: 10, | ||
| chunkOverlap: 0, | ||
| separators: [" "], | ||
| }); | ||
| const text = "hello world how are you"; | ||
| // "hello world" is 11 chars > 10. | ||
| // "hello" (5) | ||
| // "world" (5) | ||
| // "how" (3) | ||
| // "are" (3) | ||
| // "you" (3) | ||
| // "how are you" -> 3+1+3+1+3 = 11 > 10. | ||
| // So "how are" (7) | ||
| // "you" (3) | ||
| const chunks = await splitter.splitText(text); | ||
| // My implementation logic: | ||
| // split by " ". -> ["hello", "world", "how", "are", "you"] | ||
| // "hello" -> current. | ||
| // "world" -> len 5. "hello" + 1 + "world" = 11 > 10. Flush "hello". current="world". | ||
| // "how" -> len 3. "world" + 1 + "how" = 9 <= 10. current="world how". | ||
| // "are" -> len 3. "world how" + 1 + "are" = 9+1+3=13 > 10. Flush "world how". current="are". | ||
| // "you" -> len 3. "are" + 1 + "you" = 7 <= 10. current="are you". | ||
| // Flush "are you". | ||
|
|
||
| expect(chunks).toEqual(["hello", "world how", "are you"]); | ||
| }); | ||
|
|
||
| it("handles recursion with multiple separators", async () => { | ||
| // This tests the recursion logic | ||
| const splitter = new RecursiveCharacterTextSplitter({ | ||
| chunkSize: 20, | ||
| chunkOverlap: 0, | ||
| separators: ["\n", " "], | ||
| }); | ||
| // "Para1 word word word" -> 20 chars | ||
| const text = "Para1 is longer than 20 chars\nPara2 is short"; | ||
| const chunks = await splitter.splitText(text); | ||
|
|
||
| // Split by \n: | ||
| // "Para1 is longer than 20 chars" (29 chars) -> Too big -> Recurse with [" "] | ||
| // "Para2 is short" (14 chars) -> Fits. | ||
|
|
||
| // Recursion on "Para1...": | ||
| // Split by " ": "Para1", "is", "longer", "than", "20", "chars" | ||
| // Accumulate: | ||
| // "Para1 is" (8) | ||
| // + "longer" (6) -> "Para1 is longer" (15) | ||
| // + "than" (4) -> "Para1 is longer than" (20) -> Perfect fit? (15+1+4=20). Yes. | ||
| // + "20" (2) -> "Para1 is longer than 20" (23) -> Flush "Para1 is longer than". Current="20". | ||
| // "chars" -> "20 chars" (8). | ||
|
|
||
| // So result should be: | ||
| // "Para1 is longer than" | ||
| // "20 chars" | ||
| // "Para2 is short" | ||
|
|
||
| expect(chunks).toEqual(["Para1 is longer than", "20 chars", "Para2 is short"]); | ||
| }); | ||
| }); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Validate the `content` type before embedding.
The `content` property of `BaseMessage` may not always be a string (e.g., structured content in multi-modal messages). The cast on line 63 could mask runtime issues if `content` is an object or undefined.
Suggested defensive handling
📝 Committable suggestion
🤖 Prompt for AI Agents