diff --git a/libs/community/langchain_community/document_loaders/__init__.py b/libs/community/langchain_community/document_loaders/__init__.py index c91345daa..2e220721c 100644 --- a/libs/community/langchain_community/document_loaders/__init__.py +++ b/libs/community/langchain_community/document_loaders/__init__.py @@ -449,6 +449,9 @@ from langchain_community.document_loaders.surrealdb import ( SurrealDBLoader, ) + from langchain_community.document_loaders.supadata import ( + SupadataLoader, + ) from langchain_community.document_loaders.telegram import ( TelegramChatApiLoader, TelegramChatFileLoader, @@ -534,6 +537,7 @@ ) + _module_lookup = { "AZLyricsLoader": "langchain_community.document_loaders.azlyrics", "AcreomLoader": "langchain_community.document_loaders.acreom", @@ -689,6 +693,7 @@ "SpreedlyLoader": "langchain_community.document_loaders.spreedly", "StripeLoader": "langchain_community.document_loaders.stripe", "SurrealDBLoader": "langchain_community.document_loaders.surrealdb", + "SupadataLoader": "langchain_community.document_loaders.supadata", "TelegramChatApiLoader": "langchain_community.document_loaders.telegram", "TelegramChatFileLoader": "langchain_community.document_loaders.telegram", "TelegramChatLoader": "langchain_community.document_loaders.telegram", @@ -731,7 +736,7 @@ "XorbitsLoader": "langchain_community.document_loaders.xorbits", "YoutubeAudioLoader": "langchain_community.document_loaders.blob_loaders", "YoutubeLoader": "langchain_community.document_loaders.youtube", - "YuqueLoader": "langchain_community.document_loaders.yuque", + "YuqueLoader": "langchain_community.document_loaders.yuque" } @@ -897,6 +902,7 @@ def __getattr__(name: str) -> Any: "SpreedlyLoader", "StripeLoader", "SurrealDBLoader", + "SupadataLoader", "TelegramChatApiLoader", "TelegramChatFileLoader", "TelegramChatLoader", diff --git a/libs/community/langchain_community/document_loaders/supadata.py b/libs/community/langchain_community/document_loaders/supadata.py new file mode 100644 index 000000000..b0803808d --- /dev/null +++ b/libs/community/langchain_community/document_loaders/supadata.py @@ -0,0 +1,138 @@ +from __future__ import annotations + +import json +from dataclasses import dataclass, field +from typing import Any, Dict, Iterable, List, Literal, Optional + +from langchain_core.documents import Document +from langchain_core.utils import get_from_env +from langchain_community.document_loaders.base import BaseLoader + +SupadataOperation = Literal["metadata", "transcript"] + +# Module-level alias so tests can patch `Supadata` via +# langchain_community.document_loaders.supadata.Supadata +try: # pragma: no cover - exercised via tests with mocking + from supadata import Supadata # type: ignore[import] +except Exception: # pragma: no cover + Supadata = None # type: ignore[assignment] + + +@dataclass +class SupadataLoader(BaseLoader): + """Load documents from the Supadata Web & Video Data API. + + This loader wraps the official :mod:`supadata` Python SDK to fetch either: + + * structured media metadata (``operation="metadata"``) + * media transcripts (``operation="transcript"``) + + Parameters + ---------- + urls: + List of URLs to fetch from Supadata. + api_key: + Supadata API key. If omitted, the ``SUPADATA_API_KEY`` environment + variable is used. + operation: + Which Supadata endpoint to call: ``"metadata"`` or ``"transcript"``. + lang: + Optional transcript language preference. + text: + When ``True``, request a plain-text transcript instead of timestamped + chunks (see Supadata documentation). + mode: + Transcript mode, for example ``"native"``, ``"auto"`` or ``"generate"``. + params: + Extra keyword arguments forwarded to the underlying Supadata SDK call. + """ + + urls: List[str] + api_key: Optional[str] = None + operation: SupadataOperation = "transcript" + + lang: Optional[str] = None + text: Optional[bool] = None + mode: Optional[str] = None + + params: Dict[str, Any] = field(default_factory=dict) + + def __post_init__(self) -> None: + # Explicit api_key wins; otherwise read from env. + if self.api_key is None: + # get_from_env(key, env_var) -> str + self.api_key = get_from_env("api_key", "SUPADATA_API_KEY") + + def _get_client(self) -> Any: + if Supadata is None: + raise ImportError( + "Could not import 'supadata'. Install it with " + "`pip install supadata` to use `SupadataLoader`." + ) + + if not self.api_key: + raise ValueError( + "Supadata API key is empty. " + "Set the SUPADATA_API_KEY environment variable or " + "pass `api_key` when constructing SupadataLoader." + ) + + return Supadata(api_key=self.api_key) + + def lazy_load(self) -> Iterable[Document]: + client = self._get_client() + + for url in self.urls: + if self.operation == "metadata": + yield self._load_metadata(client, url) + elif self.operation == "transcript": + yield self._load_transcript(client, url) + else: + raise ValueError( + f"Unsupported operation: {self.operation!r}. " + "Expected 'metadata' or 'transcript'." + ) + + def _load_metadata(self, client: Any, url: str) -> Document: + result = client.metadata(url=url, **self.params) + + page_content = json.dumps(result, ensure_ascii=False, indent=2) + metadata = { + "source": url, + "supadata_operation": "metadata", + } + + return Document(page_content=page_content, metadata=metadata) + + def _load_transcript(self, client: Any, url: str) -> Document: + transcript_kwargs: Dict[str, Any] = dict(self.params) + if self.lang is not None: + transcript_kwargs["lang"] = self.lang + if self.text is not None: + transcript_kwargs["text"] = self.text + if self.mode is not None: + transcript_kwargs["mode"] = self.mode + + result = client.transcript(url=url, **transcript_kwargs) + + content = getattr(result, "content", None) + if content is not None: + # Immediate transcript result. + metadata = { + "source": url, + "supadata_operation": "transcript", + "lang": getattr(result, "lang", None), + "mode": transcript_kwargs.get("mode"), + } + return Document(page_content=str(content), metadata=metadata) + + # Asynchronous job: result carries a job_id instead of content. + job_id = getattr(result, "job_id", None) + metadata = { + "source": url, + "supadata_operation": "transcript_job", + "job_id": job_id, + "lang": transcript_kwargs.get("lang"), + "mode": transcript_kwargs.get("mode"), + } + return Document(page_content="", metadata=metadata) diff --git a/libs/community/tests/unit_tests/document_loaders/test_supadata_loader.py b/libs/community/tests/unit_tests/document_loaders/test_supadata_loader.py new file mode 100644 index 000000000..65d27c791 --- /dev/null +++ b/libs/community/tests/unit_tests/document_loaders/test_supadata_loader.py @@ -0,0 +1,184 @@ +""" +Tests for SupadataLoader integration. + +These tests: +- Do NOT hit the real Supadata API. +- Patch the Supadata SDK client and assert correct calls. +- Check that Documents are created with expected content/metadata. +""" + +from __future__ import annotations + +import os +from typing import Any +from unittest.mock import MagicMock, patch + +import pytest +from langchain_core.documents import Document + +# >>> IMPORTANT <<< +# If your loader lives somewhere else (e.g. data_loaders.supadata), +# change BOTH of these strings accordingly. +LOADER_IMPORT_PATH = "langchain_community.document_loaders.supadata" +MODULE_PATH = LOADER_IMPORT_PATH + + +@pytest.fixture(autouse=True) +def clear_supadata_env(monkeypatch: pytest.MonkeyPatch) -> None: + """Ensure SUPADATA_API_KEY is clean for each test.""" + monkeypatch.delenv("SUPADATA_API_KEY", raising=False) + + +def make_mock_client() -> MagicMock: + """Create a mock Supadata client with metadata/transcript methods.""" + client = MagicMock() + client.metadata = MagicMock() + client.transcript = MagicMock() + return client + + +@patch(f"{MODULE_PATH}.Supadata") +def test_metadata_operation_uses_explicit_api_key( + mock_supadata_cls: MagicMock, + monkeypatch: pytest.MonkeyPatch, +) -> None: + """Loader with operation='metadata' should call Supadata.metadata correctly.""" + # No env var, we want to force usage of explicit api_key + monkeypatch.delenv("SUPADATA_API_KEY", raising=False) + + from langchain_community.document_loaders.supadata import SupadataLoader + + mock_client = make_mock_client() + mock_client.metadata.return_value = {"title": "Test Title"} + mock_supadata_cls.return_value = mock_client + + url = "https://example.com/video" + loader = SupadataLoader( + urls=[url], + api_key="EXPLICIT_KEY", + operation="metadata", + params={"foo": "bar"}, + ) + + docs = list(loader.lazy_load()) + + # Supadata client should be constructed with our explicit key + mock_supadata_cls.assert_called_once_with(api_key="EXPLICIT_KEY") + + # metadata() should be called with url + params + mock_client.metadata.assert_called_once_with(url=url, foo="bar") + + assert len(docs) == 1 + doc = docs[0] + assert isinstance(doc, Document) + assert '"title"' in doc.page_content + assert doc.metadata["source"] == url + assert doc.metadata["supadata_operation"] == "metadata" + + +@patch(f"{MODULE_PATH}.Supadata") +def test_metadata_operation_uses_env_api_key_when_not_provided( + mock_supadata_cls: MagicMock, + monkeypatch: pytest.MonkeyPatch, +) -> None: + """If api_key is not passed, loader should use SUPADATA_API_KEY env var.""" + monkeypatch.setenv("SUPADATA_API_KEY", "ENV_KEY") + + from langchain_community.document_loaders.supadata import SupadataLoader + + mock_client = make_mock_client() + mock_client.metadata.return_value = {"id": "123"} + mock_supadata_cls.return_value = mock_client + + loader = SupadataLoader( + urls=["https://example.com"], + operation="metadata", + ) + + list(loader.lazy_load()) + + mock_supadata_cls.assert_called_once_with(api_key="ENV_KEY") + + +@patch(f"{MODULE_PATH}.Supadata") +def test_transcript_operation_immediate_result( + mock_supadata_cls: MagicMock, + monkeypatch: pytest.MonkeyPatch, +) -> None: + """ + For smaller inputs Supadata.transcript returns a transcript object + with a 'content' attribute. Loader should put that into page_content. + """ + monkeypatch.setenv("SUPADATA_API_KEY", "TEST_KEY") + + from langchain_community.document_loaders.supadata import SupadataLoader + + mock_client = make_mock_client() + + class DummyTranscript: + def __init__(self) -> None: + self.content = "hello from transcript" + self.lang = "en" + + mock_client.transcript.return_value = DummyTranscript() + mock_supadata_cls.return_value = mock_client + + url = "https://example.com/video" + loader = SupadataLoader( + urls=[url], + operation="transcript", + lang="en", + text=True, + mode="auto", + ) + + docs = list(loader.lazy_load()) + assert len(docs) == 1 + + doc = docs[0] + assert isinstance(doc, Document) + assert "hello from transcript" in doc.page_content + assert doc.metadata["source"] == url + assert doc.metadata["supadata_operation"] == "transcript" + assert doc.metadata.get("lang") in ("en", None) # depends on your impl + + +@patch(f"{MODULE_PATH}.Supadata") +def test_transcript_operation_job_result( + mock_supadata_cls: MagicMock, + monkeypatch: pytest.MonkeyPatch, +) -> None: + """ + For larger inputs Supadata.transcript may return a job object with 'job_id'. + Loader should return an empty page_content and put job_id in metadata. + """ + monkeypatch.setenv("SUPADATA_API_KEY", "TEST_KEY") + + from langchain_community.document_loaders.supadata import SupadataLoader + + mock_client = make_mock_client() + + class DummyJob: + def __init__(self) -> None: + self.job_id = "job-123" + + mock_client.transcript.return_value = DummyJob() + mock_supadata_cls.return_value = mock_client + + url = "https://example.com/long-video" + loader = SupadataLoader( + urls=[url], + operation="transcript", + lang="en", + mode="auto", + ) + + docs = list(loader.lazy_load()) + assert len(docs) == 1 + + doc = docs[0] + assert isinstance(doc, Document) + assert doc.page_content == "" # by design for job-based results + assert doc.metadata["source"] == url + assert doc.metadata["supadata_operation"] in ("transcript_job", "transcript") + assert doc.metadata.get("job_id") == "job-123"