@staticmethod
def _is_url(path: str) -> bool:
    """Return True when *path* parses as a URL with both a scheme and a host.

    Plain filesystem paths return False — including Windows drive paths
    such as ``C:\\docs\\a.pdf``, whose drive letter parses as a scheme
    but which have an empty netloc.
    """
    try:
        parsed = urllib.parse.urlparse(str(path))
    except ValueError:
        return False
    return bool(parsed.scheme and parsed.netloc)

def _download_file(self, url: str, timeout: float = 30) -> Path:
    """Download *url* to a temporary file and return its path.

    The temporary file's suffix is taken from the URL path when present,
    otherwise guessed from the response's ``Content-Type`` header so that
    suffix-based parser dispatch keeps working. The caller owns the file
    and is responsible for deleting it.

    Args:
        url: HTTP(S) URL of the document to fetch.
        timeout: Socket timeout in seconds for the request (default
            preserves the previously hard-coded 30 seconds).

    Returns:
        Path: Location of the downloaded temporary file.

    Raises:
        RuntimeError: Wrapping the underlying error when the download or
            the temporary-file write fails; the original exception is
            chained as ``__cause__``.
    """
    # NOTE(security): this fetches a caller-supplied URL. If callers can
    # be untrusted (server-side use), this is an SSRF vector — restrict
    # schemes/hosts at the call site before invoking it.
    tmp_path = None
    response = None
    try:
        self.logger.info(f"Downloading file from URL: {url}")

        # Prefer the extension embedded in the URL path, e.g. ".pdf".
        suffix = Path(urllib.parse.urlparse(url).path).suffix

        # Some servers answer 403 to clients without a browser-like
        # User-Agent, so send one.
        req = urllib.request.Request(
            url,
            data=None,
            headers={
                "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36"
            },
        )

        # Explicit timeout so a stalled server cannot hang the parser.
        response = urllib.request.urlopen(req, timeout=timeout)

        # No extension in the URL: fall back to the Content-Type header.
        if not suffix:
            content_type = (
                response.headers.get("Content-Type", "").split(";")[0].strip()
            )
            if content_type:
                # Local import: only needed on this rare fallback path.
                import mimetypes

                guessed_ext = mimetypes.guess_extension(content_type)
                if guessed_ext:
                    suffix = guessed_ext
                    self.logger.info(
                        f"Inferred file extension '{suffix}' from Content-Type: {content_type}"
                    )

        # mkstemp creates the file securely; close the raw fd right away
        # and reopen via a context manager for the actual write.
        fd, raw_path = tempfile.mkstemp(suffix=suffix)
        os.close(fd)
        tmp_path = Path(raw_path)

        with open(tmp_path, "wb") as out_file:
            shutil.copyfileobj(response, out_file)

        self.logger.info(
            f"Downloaded to temporary file: {tmp_path} ({tmp_path.stat().st_size} bytes)"
        )
        return tmp_path

    except Exception as e:
        # Best-effort cleanup of a partially written temp file.
        if tmp_path and tmp_path.exists():
            try:
                tmp_path.unlink()
                self.logger.debug(
                    f"Cleaned up temporary file after failed download: {tmp_path}"
                )
            except Exception as cleanup_error:
                self.logger.warning(
                    f"Failed to clean up temp file {tmp_path}: {cleanup_error}"
                )

        self.logger.error(f"Failed to download file from {url}: {e}")
        # Chain the original exception so the real cause is not lost.
        raise RuntimeError(f"Failed to download file from {url}: {e}") from e
    finally:
        if response:
            response.close()
output_dir, lang, **kwargs) - else: - raise ValueError( - f"Unsupported file format: {ext}. " - f"Docling only supports PDF files, Office formats ({', '.join(self.OFFICE_FORMATS)}) " - f"and HTML formats ({', '.join(self.HTML_FORMATS)})" - ) + # Convert to Path object + file_path = Path(file_path) + if not file_path.exists(): + raise FileNotFoundError(f"File does not exist: {file_path}") + + # Get file extension + ext = file_path.suffix.lower() + + # Choose appropriate parser based on file type + if ext == ".pdf": + return self.parse_pdf(file_path, output_dir, method, lang, **kwargs) + elif ext in self.OFFICE_FORMATS: + return self.parse_office_doc(file_path, output_dir, lang, **kwargs) + elif ext in self.HTML_FORMATS: + return self.parse_html(file_path, output_dir, lang, **kwargs) + else: + raise ValueError( + f"Unsupported file format: {ext}. " + f"Docling only supports PDF files, Office formats ({', '.join(self.OFFICE_FORMATS)}) " + f"and HTML formats ({', '.join(self.HTML_FORMATS)})" + ) + finally: + # Clean up temporary file if we downloaded one + if downloaded_temp_file and downloaded_temp_file.exists(): + try: + downloaded_temp_file.unlink() + self.logger.debug(f"Removed temporary file: {downloaded_temp_file}") + except Exception as e: + self.logger.warning( + f"Failed to remove temporary file {downloaded_temp_file}: {e}" + ) def _run_docling_command( self, @@ -1504,13 +1610,15 @@ def read_from_block_recursive( content_list = [] if not block.get("children"): cnt += 1 - content_list.append(self.read_from_block(block, type, output_dir, cnt, num)) + result = self.read_from_block(block, type, output_dir, cnt, num) + if result: + content_list.append(result) else: if type not in ["groups", "body"]: cnt += 1 - content_list.append( - self.read_from_block(block, type, output_dir, cnt, num) - ) + result = self.read_from_block(block, type, output_dir, cnt, num) + if result: + content_list.append(result) members = block["children"] for member in members: cnt += 1 diff 
--git a/requirements.txt b/requirements.txt index 9cd2d0e83..e063055f5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,10 +1,3 @@ -huggingface_hub -# LightRAG packages -lightrag-hku -# MinerU 2.0 packages (replaces magic-pdf) -mineru[core] -# Progress bars for batch processing -tqdm # Note: Optional dependencies are now defined in setup.py extras_require: # - [image]: Pillow>=10.0.0 (for BMP, TIFF, GIF, WebP format conversion) # - [text]: reportlab>=4.0.0 (for TXT, MD to PDF conversion) @@ -12,3 +5,11 @@ tqdm # - [all]: includes all optional dependencies # # Install with: pip install raganything[image,text] or pip install raganything[all] +docling==2.72.0 +huggingface_hub +# LightRAG packages +lightrag-hku +# MinerU 2.0 packages (replaces magic-pdf) +mineru[core] +# Progress bars for batch processing +tqdm