feat: Added "document parser" OCR strategy #11519

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open

dlew wants to merge 1 commit into danny-avila:dev from newjersey:dlew/document-parser-ocr

+606 −6

api/package.json

-Original file line number
+Diff line change
@@ Expand Up / @@ -81,6 +81,7 @@ @@
         "klona": "^2.0.6",
         "librechat-data-provider": "*",
         "lodash": "^4.17.23",
+        "mammoth": "^1.11.0",
         "mathjs": "^15.1.0",
         "meilisearch": "^0.38.0",
         "memorystore": "^1.6.7",
@@ Expand All / @@ -103,6 +104,7 @@ @@
         "passport-jwt": "^4.0.1",
         "passport-ldapauth": "^3.0.1",
         "passport-local": "^1.0.0",
+        "pdfjs-dist": "^5.4.530",
         "rate-limit-redis": "^4.2.0",
         "sharp": "^0.33.5",
         "tiktoken": "^1.0.15",
@@ Expand All / @@ -111,6 +113,7 @@ @@
         "undici": "^7.18.2",
         "winston": "^3.11.0",
         "winston-daily-rotate-file": "^5.0.0",
+        "xlsx": "https://cdn.sheetjs.com/xlsx-0.20.3/xlsx-0.20.3.tgz",
         "zod": "^3.22.4"
       },
       "devDependencies": {
@@ Expand Down @@

api/server/services/Files/Documents/__tests__/documents.spec.js

-Original file line number
+Diff line change
@@ -0,0 +1,62 @@
+    const path = require('path');
+    const { parseDocument } = require('~/server/services/Files/Documents/crud');
+    describe('Document Parser', () => {
+      test('parseDocument() parses text from docx', async () => {
+        const file = {
+          filename: 'sample.docx',
+          path: path.join(__dirname, 'sample.docx'),
+          mimetype: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
+        };
+        const document = await parseDocument({ file });
+        expect(document).toEqual({
+          bytes: 116,
+          filename: 'sample.docx',
+          filepath: 'document_parser',
+          images: [],
+          text: 'This is a sample DOCX file.\n\n',
+        });
+      });
+      test('parseDocument() parses text from xlsx', async () => {
+        const file = {
+          filename: 'sample.xlsx',
+          path: path.join(__dirname, 'sample.xlsx'),
+          mimetype: 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
+        };
+        const document = await parseDocument({ file });
+        expect(document).toEqual({
+          bytes: 264,
+          filename: 'sample.xlsx',
+          filepath: 'document_parser',
+          images: [],
+          text: 'Sheet One:\nData,on,first,sheet\nSecond Sheet:\nData,On\nSecond,Sheet\n',
+        });
+      });
+      test('parseDocument() throws error for unhandled document type', async () => {
+        const file = {
+          filename: 'nonexistent.file',
+          path: path.join(__dirname, 'nonexistent.file'),
+          mimetype: 'application/invalid',
+        };
+        await expect(parseDocument({ file })).rejects.toThrow(
+          'Unsupported file type in document parser: application/invalid',
+        );
+      });
+      test('parseDocument() throws error for empty document', async () => {
+        const file = {
+          filename: 'empty.docx',
+          path: path.join(__dirname, 'empty.docx'),
+          mimetype: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
+        };
+        await expect(parseDocument({ file })).rejects.toThrow('No text found in document');
+      });
+    });

api/server/services/Files/Documents/__tests__/empty.docx

Binary file not shown.

api/server/services/Files/Documents/__tests__/sample.docx

Binary file not shown.

api/server/services/Files/Documents/__tests__/sample.xlsx

Binary file not shown.

api/server/services/Files/Documents/crud.js

-Original file line number
+Diff line change
@@ -0,0 +1,99 @@
+    const fs = require('fs');
+    const { FileSources } = require('librechat-data-provider');
+    const mammoth = require('mammoth');
+    const XLSX = require('xlsx');
+    /**
+     * Retrieves a readable stream for a file from local storage.
+     *
+     * Throws an Error if it fails to parse.
+     *
+     * @param {Express.Multer.File} file - The file.
+     * @returns {MistralOCRUploadResult} A readable stream of the file.
+     */
+    async function parseDocument({ file }) {
+      let text;
+      switch (file.mimetype) {
+        case 'application/pdf':
+          text = await pdfToText(file);
+          break;
+        case 'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
+          text = await wordDocToText(file);
+          break;
+        case 'application/vnd.ms-excel':
+        case 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet':
+          text = excelSheetToText(file);
+          break;
+        default:
+          throw new Error(`Unsupported file type in document parser: ${file.mimetype}`);
+      }
+      if (!text?.trim()) {
+        throw Error('No text found in document');
+      }
+      return {
+        filename: file.filename,
+        bytes: text.length * 4,
+        filepath: FileSources.document_parser,
+        text,
+        images: [],
+      };
+    }
+    /**
+     * Parses PDF, returns text inside.
+     *
+     * @param {Express.Multer.File} file - The file.
+     * @returns {Promise<string>} the text contents of the PDF.
+     */
+    async function pdfToText(file) {
+      // Imported inline so that Jest can test other routes without failing due to loading ESM
+      const { getDocument } = require('pdfjs-dist/legacy/build/pdf.mjs');
+      const data = new Uint8Array(fs.readFileSync(file.path));
+      const pdf = await getDocument({ data }).promise;
+      // Extract text from all pages
+      let fullText = '';
+      for (let i = 1; i <= pdf.numPages; i++) {
+        const page = await pdf.getPage(i);
+        const textContent = await page.getTextContent();
+        const pageText = textContent.items.map((item) => item.str).join(' ');
+        fullText += pageText + '\n';
+      }
+      return fullText;
+    }
+    /**
+     * Parses Word document, returns text inside.
+     *
+     * @param {Express.Multer.File} file - The file.
+     * @returns {Promise<string>} the text contents of the Word document.
+     */
+    async function wordDocToText(file) {
+      const rawText = await mammoth.extractRawText({ path: file.path });
+      return rawText.value;
+    }
+    /**
+     * Parses Excel sheet, returns text inside.
+     *
+     * @param {Express.Multer.File} file - The file.
+     * @returns {string} the text contents of the XLS/XLSX.
+     */
+    function excelSheetToText(file) {
+      const workbook = XLSX.readFile(file.path);
+      let text = '';
+      workbook.SheetNames.forEach((sheetName) => {
+        const worksheet = workbook.Sheets[sheetName];
+        const worksheetAsCsvString = XLSX.utils.sheet_to_csv(worksheet);
+        text += `${sheetName}:\n${worksheetAsCsvString}\n`;
+      });
+      return text;
+    }
+    module.exports = { parseDocument };

api/server/services/Files/strategies.js

-Original file line number
+Diff line change
@@ Expand Up / @@ -51,6 +51,7 @@ const { @@
     const { uploadOpenAIFile, deleteOpenAIFile, getOpenAIFileStream } = require('./OpenAI');
     const { getCodeOutputDownloadStream, uploadCodeEnvFile } = require('./Code');
     const { uploadVectors, deleteVectors } = require('./VectorDB');
+    const { parseDocument } = require('~/server/services/Files/Documents/crud');
     /**
      * Firebase Storage Strategy Functions
@@ Expand Down Expand Up / @@ -246,6 +247,26 @@ const vertexMistralOCRStrategy = () => ({ @@
       handleFileUpload: uploadGoogleVertexMistralOCR,
     });
+    const documentParserStrategy = () => ({
+      /** @type {typeof saveFileFromURL | null} */
+      saveURL: null,
+      /** @type {typeof getLocalFileURL | null} */
+      getFileURL: null,
+      /** @type {typeof saveLocalBuffer | null} */
+      saveBuffer: null,
+      /** @type {typeof processLocalAvatar | null} */
+      processAvatar: null,
+      /** @type {typeof uploadLocalImage | null} */
+      handleImageUpload: null,
+      /** @type {typeof prepareImagesLocal | null} */
+      prepareImagePayload: null,
+      /** @type {typeof deleteLocalFile | null} */
+      deleteFile: null,
+      /** @type {typeof getLocalFileStream | null} */
+      getDownloadStream: null,
+      handleFileUpload: parseDocument,
+    });
     // Strategy Selector
     const getStrategyFunctions = (fileSource) => {
       if (fileSource === FileSources.firebase) {
@@ Expand All / @@ -270,6 +291,8 @@ const getStrategyFunctions = (fileSource) => { @@
         return azureMistralOCRStrategy();
       } else if (fileSource === FileSources.vertexai_mistral_ocr) {
         return vertexMistralOCRStrategy();
+      } else if (fileSource === FileSources.document_parser) {
+        return documentParserStrategy();
       } else if (fileSource === FileSources.text) {
         return localStrategy(); // Text files use local strategy
       } else {
@@ Expand Down @@

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

feat: Added "document parser" OCR strategy #11519

Diff view

Diff view

There are no files selected for viewing

Uh oh!

Uh oh!

feat: Added "document parser" OCR strategy #11519

Are you sure you want to change the base?

feat: Added "document parser" OCR strategy #11519

Uh oh!

Uh oh!

Diff view

Diff view

There are no files selected for viewing

Uh oh!