Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions api/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@
"klona": "^2.0.6",
"librechat-data-provider": "*",
"lodash": "^4.17.23",
"mammoth": "^1.11.0",
"mathjs": "^15.1.0",
"meilisearch": "^0.38.0",
"memorystore": "^1.6.7",
Expand All @@ -103,6 +104,7 @@
"passport-jwt": "^4.0.1",
"passport-ldapauth": "^3.0.1",
"passport-local": "^1.0.0",
"pdfjs-dist": "^5.4.530",
"rate-limit-redis": "^4.2.0",
"sharp": "^0.33.5",
"tiktoken": "^1.0.15",
Expand All @@ -111,6 +113,7 @@
"undici": "^7.18.2",
"winston": "^3.11.0",
"winston-daily-rotate-file": "^5.0.0",
"xlsx": "https://cdn.sheetjs.com/xlsx-0.20.3/xlsx-0.20.3.tgz",
"zod": "^3.22.4"
},
"devDependencies": {
Expand Down
62 changes: 62 additions & 0 deletions api/server/services/Files/Documents/__tests__/documents.spec.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
const path = require('path');
const { parseDocument } = require('~/server/services/Files/Documents/crud');

describe('Document Parser', () => {
  const DOCX_MIME = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document';
  const XLSX_MIME = 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet';

  /**
   * Describes a file that sits next to this spec, using the fields the tests
   * hand to parseDocument(): `filename`, `path` and `mimetype`.
   */
  const fixture = (filename, mimetype) => ({
    filename,
    path: path.join(__dirname, filename),
    mimetype,
  });

  test('parseDocument() parses text from docx', async () => {
    const parsed = await parseDocument({ file: fixture('sample.docx', DOCX_MIME) });

    expect(parsed).toEqual({
      bytes: 116,
      filename: 'sample.docx',
      filepath: 'document_parser',
      images: [],
      text: 'This is a sample DOCX file.\n\n',
    });
  });

  test('parseDocument() parses text from xlsx', async () => {
    const parsed = await parseDocument({ file: fixture('sample.xlsx', XLSX_MIME) });

    expect(parsed).toEqual({
      bytes: 264,
      filename: 'sample.xlsx',
      filepath: 'document_parser',
      images: [],
      text: 'Sheet One:\nData,on,first,sheet\nSecond Sheet:\nData,On\nSecond,Sheet\n',
    });
  });

  test('parseDocument() throws error for unhandled document type', async () => {
    // The MIME type is rejected before the (nonexistent) file would be opened.
    const unsupported = fixture('nonexistent.file', 'application/invalid');

    await expect(parseDocument({ file: unsupported })).rejects.toThrow(
      'Unsupported file type in document parser: application/invalid',
    );
  });

  test('parseDocument() throws error for empty document', async () => {
    const blank = fixture('empty.docx', DOCX_MIME);

    await expect(parseDocument({ file: blank })).rejects.toThrow('No text found in document');
  });
});
Binary file not shown.
Binary file not shown.
Binary file not shown.
99 changes: 99 additions & 0 deletions api/server/services/Files/Documents/crud.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
const fs = require('fs');
const { FileSources } = require('librechat-data-provider');
const mammoth = require('mammoth');
const XLSX = require('xlsx');

/**
 * Extracts the plain text of an uploaded document, picking the extractor from
 * the file's MIME type (PDF, DOCX, XLS or XLSX).
 *
 * @param {Object} params
 * @param {Express.Multer.File} params.file - The uploaded file; `mimetype` selects
 *   the extractor and `filename` is echoed back in the result.
 * @returns {Promise<MistralOCRUploadResult>} An object with `filename`, `bytes`,
 *   `filepath` (always `FileSources.document_parser`), the extracted `text`, and an
 *   empty `images` array.
 * @throws {Error} When the MIME type is not supported, or when the extracted text
 *   is missing or whitespace-only.
 */
async function parseDocument({ file }) {
  const extractors = new Map([
    ['application/pdf', pdfToText],
    ['application/vnd.openxmlformats-officedocument.wordprocessingml.document', wordDocToText],
    ['application/vnd.ms-excel', excelSheetToText],
    ['application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', excelSheetToText],
  ]);

  const extract = extractors.get(file.mimetype);
  if (extract === undefined) {
    throw new Error(`Unsupported file type in document parser: ${file.mimetype}`);
  }

  // The Excel extractor is synchronous; awaiting its plain string is harmless.
  const text = await extract(file);

  const hasContent = Boolean(text?.trim());
  if (!hasContent) {
    throw new Error('No text found in document');
  }

  // `bytes` is derived from the extracted text (4 per UTF-16 code unit), not from
  // the size of the file on disk.
  const bytes = text.length * 4;

  return {
    filename: file.filename,
    bytes,
    filepath: FileSources.document_parser,
    text,
    images: [],
  };
}

/**
* Parses PDF, returns text inside.
*
* @param {Express.Multer.File} file - The file.
* @returns {Promise<string>} the text contents of the PDF.
*/
async function pdfToText(file) {
// Imported inline so that Jest can test other routes without failing due to loading ESM
const { getDocument } = require('pdfjs-dist/legacy/build/pdf.mjs');

const data = new Uint8Array(fs.readFileSync(file.path));
const pdf = await getDocument({ data }).promise;

// Extract text from all pages
let fullText = '';
for (let i = 1; i <= pdf.numPages; i++) {
const page = await pdf.getPage(i);
const textContent = await page.getTextContent();
const pageText = textContent.items.map((item) => item.str).join(' ');
fullText += pageText + '\n';
}

return fullText;
}

/**
 * Extracts the raw text of a Word (DOCX) document via mammoth.
 *
 * @param {Express.Multer.File} file - The file; only `file.path` is read.
 * @returns {Promise<string>} the `value` mammoth reports for the document's raw text.
 */
async function wordDocToText(file) {
  const { path: docxPath } = file;
  const { value } = await mammoth.extractRawText({ path: docxPath });
  return value;
}

/**
 * Converts every sheet of an Excel workbook to CSV text.
 *
 * Each sheet is emitted, in workbook order, as `<sheet name>:` on its own line,
 * followed by the sheet's CSV rendering and a trailing newline.
 *
 * @param {Express.Multer.File} file - The file; only `file.path` is read.
 * @returns {string} the text contents of the XLS/XLSX.
 */
function excelSheetToText(file) {
  const { SheetNames: sheetNames, Sheets: sheets } = XLSX.readFile(file.path);

  return sheetNames
    .map((name) => `${name}:\n${XLSX.utils.sheet_to_csv(sheets[name])}\n`)
    .join('');
}

// Only parseDocument is exported; the per-format extractors above stay private to this module.
module.exports = { parseDocument };
23 changes: 23 additions & 0 deletions api/server/services/Files/strategies.js
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ const {
const { uploadOpenAIFile, deleteOpenAIFile, getOpenAIFileStream } = require('./OpenAI');
const { getCodeOutputDownloadStream, uploadCodeEnvFile } = require('./Code');
const { uploadVectors, deleteVectors } = require('./VectorDB');
const { parseDocument } = require('~/server/services/Files/Documents/crud');

/**
* Firebase Storage Strategy Functions
Expand Down Expand Up @@ -246,6 +247,26 @@ const vertexMistralOCRStrategy = () => ({
handleFileUpload: uploadGoogleVertexMistralOCR,
});

/**
 * Document Parser Strategy Functions
 *
 * Only `handleFileUpload` is implemented (by `parseDocument`); every other
 * strategy slot is explicitly `null`.
 */
const documentParserStrategy = () => {
  const strategy = {
    /** @type {typeof saveFileFromURL | null} */
    saveURL: null,
    /** @type {typeof getLocalFileURL | null} */
    getFileURL: null,
    /** @type {typeof saveLocalBuffer | null} */
    saveBuffer: null,
    /** @type {typeof processLocalAvatar | null} */
    processAvatar: null,
    /** @type {typeof uploadLocalImage | null} */
    handleImageUpload: null,
    /** @type {typeof prepareImagesLocal | null} */
    prepareImagePayload: null,
    /** @type {typeof deleteLocalFile | null} */
    deleteFile: null,
    /** @type {typeof getLocalFileStream | null} */
    getDownloadStream: null,
    handleFileUpload: parseDocument,
  };

  return strategy;
};

// Strategy Selector
const getStrategyFunctions = (fileSource) => {
if (fileSource === FileSources.firebase) {
Expand All @@ -270,6 +291,8 @@ const getStrategyFunctions = (fileSource) => {
return azureMistralOCRStrategy();
} else if (fileSource === FileSources.vertexai_mistral_ocr) {
return vertexMistralOCRStrategy();
} else if (fileSource === FileSources.document_parser) {
return documentParserStrategy();
} else if (fileSource === FileSources.text) {
return localStrategy(); // Text files use local strategy
} else {
Expand Down
Loading