Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .changeset/document-utils-integration.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
"@voltagent/documents": minor
"@voltagent/core": minor
---

feat: implement document chunking and embedding utilities and integrate into core
4 changes: 3 additions & 1 deletion biome.json
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,9 @@
".wrangler",
"packages/server-hono/src/vendor",
"examples/with-nuxt",
"examples/with-assistant-ui"
"examples/with-assistant-ui",
"archive",
"website"
]
},
"overrides": [
Expand Down
3 changes: 2 additions & 1 deletion packages/core/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
"@opentelemetry/sdk-trace-base": "^2.0.0",
"@opentelemetry/sdk-trace-node": "^2.0.0",
"@opentelemetry/semantic-conventions": "^1.28.0",
"@voltagent/documents": "workspace:*",
"@voltagent/internal": "^1.0.2",
"fast-glob": "^3.3.3",
"gray-matter": "^4.0.3",
Expand Down Expand Up @@ -82,4 +83,4 @@
"typecheck": "tsc --noEmit"
},
"types": "dist/index.d.ts"
}
}
1 change: 1 addition & 0 deletions packages/core/src/agent/subagent/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -316,6 +316,7 @@ ${guidelinesText}
/**
* Hand off a task to another agent using AgentV2
*/
// biome-ignore lint/complexity/noExcessiveCognitiveComplexity: legacy complexity allowed
public async handoffTask(options: {
task: string;
targetAgent: SubAgentConfig;
Expand Down
73 changes: 73 additions & 0 deletions packages/core/src/retriever/document-retriever.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
import { DocumentProcessor, type ProcessedDocument } from "@voltagent/documents";
import type { BaseMessage } from "../agent/providers";
import { BaseRetriever } from "./retriever";
import type { RetrieveOptions, RetrieverOptions } from "./types";

/**
 * Configuration options for {@link DocumentRetriever}.
 * Extends the base retriever options with document-processing settings.
 */
export interface DocumentRetrieverOptions extends RetrieverOptions {
  /**
   * Optional custom document processor (splitter + embedder pair).
   * If not provided, a default {@link DocumentProcessor} will be created.
   */
  processor?: DocumentProcessor;
}

/**
* Abstract base class for retrievers that handle document ingestion and vector search.
*/
export abstract class DocumentRetriever extends BaseRetriever {
protected processor: DocumentProcessor;

/**
 * Creates the retriever, using the caller-supplied document processor when
 * one is given and otherwise constructing a default {@link DocumentProcessor}.
 */
constructor(options: DocumentRetrieverOptions = {}) {
  super(options);
  const { processor } = options;
  this.processor = processor ?? new DocumentProcessor();
}

/**
 * Ingests raw text: splits it into chunks, embeds every chunk, and persists
 * the result through {@link upsertDocuments}.
 *
 * @param text - The raw text to ingest
 * @param metadata - Optional metadata attached to all produced chunks
 */
async ingest(text: string, metadata?: Record<string, any>): Promise<void> {
  this.logger.debug("Ingesting document text", { length: text.length });
  const processed = await this.processor.process(text, metadata);
  await this.upsertDocuments(processed);
  this.logger.debug("Document ingestion complete", { chunks: processed.length });
}

/**
 * Stores processed documents in the underlying storage (e.g., a vector DB).
 * Implementations define the keying/dedup semantics of the upsert.
 * @param documents The processed documents containing embeddings and metadata
 */
abstract upsertDocuments(documents: ProcessedDocument[]): Promise<void>;

/**
 * Retrieves the k nearest stored documents for a query vector.
 * Serves as the search primitive used by the default retrieve() implementation.
 * @param vector The query vector
 * @param k Number of results to return
 * @returns Matching documents — ordering is implementation-defined (presumably by similarity; confirm per backend)
 */
abstract queryVectors(vector: number[], k: number): Promise<ProcessedDocument[]>;

/**
* Default implementation of retrieve that embeds the query and searches vectors.
* Can be overridden if needed.
*/
async retrieve(input: string | BaseMessage[], options: RetrieveOptions = {}): Promise<string> {
if (Array.isArray(input) && input.length === 0) {
return "";
}
const textQuery = typeof input === "string" ? input : input[input.length - 1].content;

// We assume the processor's embedder has an embedQuery method.
// Since DocumentProcessor exposes 'embedder', we can use it.
const queryVector = await this.processor.embedder.embedQuery(textQuery as string);
Comment on lines +59 to +63
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Validate content type before embedding.

The content property of BaseMessage may not always be a string (e.g., structured content in multi-modal messages). The cast on line 63 could mask runtime issues if content is an object or undefined.

Suggested defensive handling
-    const textQuery = typeof input === "string" ? input : input[input.length - 1].content;
-
-    // We assume the processor's embedder has an embedQuery method.
-    // Since DocumentProcessor exposes 'embedder', we can use it.
-    const queryVector = await this.processor.embedder.embedQuery(textQuery as string);
+    let textQuery: string;
+    if (typeof input === "string") {
+      textQuery = input;
+    } else {
+      const lastContent = input[input.length - 1].content;
+      textQuery = typeof lastContent === "string" ? lastContent : String(lastContent);
+    }
+
+    const queryVector = await this.processor.embedder.embedQuery(textQuery);
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
const textQuery = typeof input === "string" ? input : input[input.length - 1].content;
// We assume the processor's embedder has an embedQuery method.
// Since DocumentProcessor exposes 'embedder', we can use it.
const queryVector = await this.processor.embedder.embedQuery(textQuery as string);
let textQuery: string;
if (typeof input === "string") {
textQuery = input;
} else {
const lastContent = input[input.length - 1].content;
textQuery = typeof lastContent === "string" ? lastContent : String(lastContent);
}
const queryVector = await this.processor.embedder.embedQuery(textQuery);
🤖 Prompt for AI Agents
In @packages/core/src/retriever/document-retriever.ts around lines 59 - 63, The
code assumes the selected BaseMessage content is a string and casts it before
calling processor.embedder.embedQuery; instead validate the content type from
input (and the last BaseMessage) before embedding: ensure textQuery is a string,
handle undefined or non-string content by extracting a text field if present
(e.g., content.text), falling back to a safe serialization like
JSON.stringify(content) or returning/logging an error and not calling
embedQuery; replace the direct cast on textQuery and guard the call to
this.processor.embedder.embedQuery accordingly to avoid runtime crashes.


// Default top-k to 4, can be customizable via options
const k = (options as any).k ?? 4;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Avoid any cast; extend RetrieveOptions with proper typing.

Using (options as any).k bypasses type safety. Consider extending RetrieveOptions or creating a dedicated options type for vector retrieval.

Proposed fix

Define proper types for the k parameter:

+export interface DocumentRetrieveOptions extends RetrieveOptions {
+  /**
+   * Number of top results to return from vector search.
+   * @default 4
+   */
+  k?: number;
+}
+
 export abstract class DocumentRetriever extends BaseRetriever {
   // ...
-  async retrieve(input: string | BaseMessage[], options: RetrieveOptions = {}): Promise<string> {
+  async retrieve(input: string | BaseMessage[], options: DocumentRetrieveOptions = {}): Promise<string> {
     // ...
-    const k = (options as any).k ?? 4;
+    const k = options.k ?? 4;
🤖 Prompt for AI Agents
In @packages/core/src/retriever/document-retriever.ts at line 66, The code uses
a cast to any to read options.k; instead extend the RetrieveOptions type (or
create a new VectorRetrieveOptions that extends RetrieveOptions) to include an
optional k?: number, update the method signature to accept that typed options,
and replace the cast line with a typed access (e.g., destructure or read
options.k with a default of 4) so type safety is preserved for the k parameter
in DocumentRetriever/document-retrieval logic.


const results = await this.queryVectors(queryVector, k);

// Join the text of the results
return results.map((doc) => doc.text).join("\n\n");
}
}
3 changes: 3 additions & 0 deletions packages/core/src/retriever/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,10 @@
* @module retriever
*/

export type { ProcessedDocument } from "@voltagent/documents";

export { BaseRetriever } from "./retriever";
export { DocumentRetriever, type DocumentRetrieverOptions } from "./document-retriever";
export type { Retriever, RetrieverOptions, RetrieveOptions } from "./types";
export {
VoltAgentRagRetriever,
Expand Down
71 changes: 71 additions & 0 deletions packages/documents/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
# @voltagent/documents

Utilities for document processing, chunking, and embedding generation.

## Installation

```bash
pnpm add @voltagent/documents
```

## Usage

### Text Splitting

Use `RecursiveCharacterTextSplitter` to split text into chunks while preserving context.

```typescript
import { RecursiveCharacterTextSplitter } from "@voltagent/documents";

const splitter = new RecursiveCharacterTextSplitter({
chunkSize: 1000,
chunkOverlap: 200,
});

const text = "Your long text here...";
const chunks = await splitter.splitText(text);
```

### Embeddings

Use `OpenAIEmbeddingModel` to generate embeddings for your text.

```typescript
import { OpenAIEmbeddingModel } from "@voltagent/documents";

const embedder = new OpenAIEmbeddingModel({
apiKey: process.env.OPENAI_API_KEY, // Optional if set in env
model: "text-embedding-ada-002", // Default
});

const embedding = await embedder.embedQuery("Hello world");
```

### Document Processor

The `DocumentProcessor` combines splitting and embedding.

```typescript
import { DocumentProcessor } from "@voltagent/documents";

const processor = new DocumentProcessor();
// Or with custom splitter/embedder:
// const processor = new DocumentProcessor(customSplitter, customEmbedder);

const documents = await processor.process("Long text content...", {
source: "example.txt",
author: "Me",
});

/*
Returns:
[
{
text: "chunk 1...",
embedding: [0.123, ...],
metadata: { source: "example.txt", author: "Me", chunkIndex: 0, ... }
},
...
]
*/
```
38 changes: 38 additions & 0 deletions packages/documents/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
{
"name": "@voltagent/documents",
"version": "0.0.1",
"description": "Document processing and embedding utilities for VoltAgent",
"main": "dist/index.js",
"module": "dist/index.mjs",
"types": "dist/index.d.ts",
"exports": {
".": {
"import": {
"types": "./dist/index.d.mts",
"default": "./dist/index.mjs"
},
"require": {
"types": "./dist/index.d.ts",
"default": "./dist/index.js"
}
}
},
"files": [
"dist"
],
"scripts": {
"build": "tsup",
"dev": "tsup --watch",
"typecheck": "tsc --noEmit",
"test": "vitest run"
},
"dependencies": {
"openai": "^4.20.0"
},
Comment on lines +29 to +31
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

# Check latest openai version and release info
curl -s https://registry.npmjs.org/openai | jq -r '.["dist-tags"] | to_entries[] | "\(.key): \(.value)"'

Repository: VoltAgent/voltagent

Length of output: 137


🌐 Web query:

openai npm package latest version 2025 security vulnerabilities

💡 Result:

Summary (as of Jan 13, 2026):

  • Latest official "openai" npm package (registry) — see package page on npm for the current published version. No direct vulnerabilities are listed for the package in Snyk's database. [1][2]

  • Snyk security scan: "No known security issues" for the latest openai release (Snyk package page). [1]

  • Official npm registry: package/version history and tags are on npmjs (check npm package page to confirm the exact version number you have/plan to install). [2]

Recommended actions:

  • Run npm audit and/or Snyk/Dependabot against your project before installing/upgrading.
  • Inspect your dependency tree for transitive vulnerabilities (audit covers those).
  • If you need, I can check the exact latest version number right now and scan advisories (CVE/GitHub/GitLab) for any security reports — shall I do that?

Sources:
[1] Snyk — openai package security page.
[2] npmjs — openai package page and version listings.


Update OpenAI SDK to a supported version.

The package depends on openai@^4.20.0, which is 2 major versions behind the latest release (6.16.0). While the current version has no known security vulnerabilities, the caret constraint prevents adoption of newer versions. Consider upgrading to a recent version within the 6.x line to benefit from API improvements and continued maintenance.

🤖 Prompt for AI Agents
In @packages/documents/package.json around lines 29 - 31, The package.json
currently pins the "openai" dependency to "^4.20.0"; update that dependency to a
supported 6.x release (e.g., "^6.16.0") in the "dependencies" entry for
"openai", then run your package manager (npm/yarn/pnpm) to install and update
lockfiles; after upgrading, run tests and fix any breaking API changes in code
that uses the OpenAI SDK (search for imports/usages of "openai" and update
client construction and method names per the 6.x migration guide).

"devDependencies": {
"tsup": "^8.5.0",
"typescript": "^5.8.2",
"vitest": "^3.2.4",
"@types/node": "^24.2.1"
}
}
44 changes: 44 additions & 0 deletions packages/documents/src/DocumentProcessor.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import { describe, expect, it } from "vitest";
import { DocumentProcessor } from "./DocumentProcessor";
import type { EmbeddingModel } from "./embeddings/EmbeddingModel";
import { TextSplitter } from "./text-splitters/TextSplitter";

// Splitter stub for tests: splits on "|" so chunk boundaries are fully deterministic.
class MockSplitter extends TextSplitter {
  async splitText(text: string): Promise<string[]> {
    return text.split("|");
  }
}

// Embedder stub: returns a fixed two-dimensional vector for every input,
// one vector per document for batch calls.
class MockEmbedder implements EmbeddingModel {
  async embedQuery(_text: string): Promise<number[]> {
    return [0.1, 0.2];
  }
  async embedDocuments(documents: string[]): Promise<number[][]> {
    return documents.map(() => [0.1, 0.2]);
  }
}

describe("DocumentProcessor", () => {
  it("processes text into documents with embeddings", async () => {
    const processor = new DocumentProcessor(new MockSplitter(), new MockEmbedder());
    const docs = await processor.process("part1|part2", { file: "test.txt" });

    expect(docs).toHaveLength(2);

    // Each chunk should carry the stub embedding plus merged metadata
    // with per-chunk bookkeeping (chunkIndex / chunkCount).
    const expectedTexts = ["part1", "part2"];
    expectedTexts.forEach((text, index) => {
      expect(docs[index].text).toBe(text);
      expect(docs[index].embedding).toEqual([0.1, 0.2]);
      expect(docs[index].metadata).toEqual({
        file: "test.txt",
        chunkIndex: index,
        chunkCount: 2,
      });
    });
  });
});
35 changes: 35 additions & 0 deletions packages/documents/src/DocumentProcessor.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import type { EmbeddingModel } from "./embeddings/EmbeddingModel";
import { OpenAIEmbeddingModel } from "./embeddings/OpenAIEmbeddingModel";
import { RecursiveCharacterTextSplitter } from "./text-splitters/RecursiveCharacterTextSplitter";
import type { TextSplitter } from "./text-splitters/TextSplitter";

/**
 * A single text chunk paired with its embedding vector and chunk metadata.
 */
export interface ProcessedDocument {
  /** The chunk's raw text. */
  text: string;
  /** Embedding vector computed for {@link text}. */
  embedding: number[];
  /** Caller-supplied metadata, plus chunkIndex/chunkCount added by DocumentProcessor.process. */
  metadata?: Record<string, any>;
}

export class DocumentProcessor {
splitter: TextSplitter;
embedder: EmbeddingModel;

/**
 * Wires up the splitter/embedder pair; falls back to the recursive character
 * splitter and the OpenAI embedding model when either is omitted.
 */
constructor(splitter?: TextSplitter, embedder?: EmbeddingModel) {
  this.splitter = splitter != null ? splitter : new RecursiveCharacterTextSplitter();
  this.embedder = embedder != null ? embedder : new OpenAIEmbeddingModel();
}

async process(text: string, metadata?: Record<string, any>): Promise<ProcessedDocument[]> {
const chunks = await this.splitter.splitText(text);
const embeddings = await this.embedder.embedDocuments(chunks);

return chunks.map((chunk, index) => ({
text: chunk,
embedding: embeddings[index],
metadata: {
...metadata,
chunkIndex: index,
chunkCount: chunks.length,
},
}));
}
Comment on lines +21 to +34
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Potential undefined embedding if array lengths mismatch.

If embedder.embedDocuments(chunks) returns fewer embeddings than chunks (due to an API error or implementation bug), embeddings[index] will be undefined, leading to corrupted ProcessedDocument objects.

🛡️ Proposed defensive check
 async process(text: string, metadata?: Record<string, any>): Promise<ProcessedDocument[]> {
   const chunks = await this.splitter.splitText(text);
   const embeddings = await this.embedder.embedDocuments(chunks);

+  if (embeddings.length !== chunks.length) {
+    throw new Error(
+      `Embedding count mismatch: expected ${chunks.length}, got ${embeddings.length}`
+    );
+  }
+
   return chunks.map((chunk, index) => ({
     text: chunk,
     embedding: embeddings[index],
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
async process(text: string, metadata?: Record<string, any>): Promise<ProcessedDocument[]> {
const chunks = await this.splitter.splitText(text);
const embeddings = await this.embedder.embedDocuments(chunks);
return chunks.map((chunk, index) => ({
text: chunk,
embedding: embeddings[index],
metadata: {
...metadata,
chunkIndex: index,
chunkCount: chunks.length,
},
}));
}
async process(text: string, metadata?: Record<string, any>): Promise<ProcessedDocument[]> {
const chunks = await this.splitter.splitText(text);
const embeddings = await this.embedder.embedDocuments(chunks);
if (embeddings.length !== chunks.length) {
throw new Error(
`Embedding count mismatch: expected ${chunks.length}, got ${embeddings.length}`
);
}
return chunks.map((chunk, index) => ({
text: chunk,
embedding: embeddings[index],
metadata: {
...metadata,
chunkIndex: index,
chunkCount: chunks.length,
},
}));
}
🤖 Prompt for AI Agents
In @packages/documents/src/DocumentProcessor.ts around lines 21 - 34, In
process, guard against embedder.embedDocuments returning fewer items than
chunks: after const embeddings = await this.embedder.embedDocuments(chunks);
check that embeddings is an array and embeddings.length === chunks.length (or at
least >= chunks.length); if not, either throw a clear error or fill missing
entries with a safe fallback (e.g., null vector or empty embedding) and log the
mismatch via the class logger; ensure the returned ProcessedDocument objects use
validated/fallback embeddings so embeddings[index] cannot be undefined.

}
76 changes: 76 additions & 0 deletions packages/documents/src/RecursiveCharacterTextSplitter.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
import { describe, expect, it } from "vitest";
import { RecursiveCharacterTextSplitter } from "./text-splitters/RecursiveCharacterTextSplitter";

describe("RecursiveCharacterTextSplitter", () => {
  it("splits text based on characters", async () => {
    const splitter = new RecursiveCharacterTextSplitter({
      chunkSize: 10,
      chunkOverlap: 0,
    });
    const text = "abcdefghijklmnopqrstuvwxyz";
    const chunks = await splitter.splitText(text);
    // With no separator applicable, the text is cut into fixed windows of
    // chunkSize (10) characters; the final chunk holds the remainder.
    expect(chunks).toEqual(["abcdefghij", "klmnopqrst", "uvwxyz"]);
  });

  it("splits text with simple separator", async () => {
    const splitter = new RecursiveCharacterTextSplitter({
      chunkSize: 10,
      chunkOverlap: 0,
      separators: [" "],
    });
    const text = "hello world how are you";
    // Expected greedy accumulation of " "-separated words under a 10-char limit:
    //   "hello" + " world" = 11 > 10  -> flush "hello", start "world"
    //   "world" + " how"   = 9  <= 10 -> keep "world how"
    //   "world how" + " are" = 13 > 10 -> flush "world how", start "are"
    //   "are" + " you"     = 7  <= 10 -> flush "are you" at the end
    const chunks = await splitter.splitText(text);
    expect(chunks).toEqual(["hello", "world how", "are you"]);
  });

  it("handles recursion with multiple separators", async () => {
    // Exercises the recursion: a "\n"-delimited segment that exceeds
    // chunkSize must be re-split using the next separator (" ").
    const splitter = new RecursiveCharacterTextSplitter({
      chunkSize: 20,
      chunkOverlap: 0,
      separators: ["\n", " "],
    });
    const text = "Para1 is longer than 20 chars\nPara2 is short";
    const chunks = await splitter.splitText(text);

    // Split on "\n":
    //   "Para1 is longer than 20 chars" (29 chars) exceeds 20 -> recurse with [" "]
    //   "Para2 is short" (14 chars) fits as-is.
    // Recursing on the first segment accumulates words up to the 20-char limit:
    //   "Para1 is longer than" is exactly 20 chars -> flushed,
    //   then "20 chars" (8 chars) forms the next chunk.
    expect(chunks).toEqual(["Para1 is longer than", "20 chars", "Para2 is short"]);
  });
});
Loading
Loading