Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -449,6 +449,9 @@
from langchain_community.document_loaders.surrealdb import (
SurrealDBLoader,
)
from langchain_community.document_loaders.supadata import (
SupadataLoader,
)
from langchain_community.document_loaders.telegram import (
TelegramChatApiLoader,
TelegramChatFileLoader,
Expand Down Expand Up @@ -534,6 +537,7 @@
)



_module_lookup = {
"AZLyricsLoader": "langchain_community.document_loaders.azlyrics",
"AcreomLoader": "langchain_community.document_loaders.acreom",
Expand Down Expand Up @@ -689,6 +693,7 @@
"SpreedlyLoader": "langchain_community.document_loaders.spreedly",
"StripeLoader": "langchain_community.document_loaders.stripe",
"SurrealDBLoader": "langchain_community.document_loaders.surrealdb",
"SupadataLoader": "langchain_community.document_loaders.supadata",
"TelegramChatApiLoader": "langchain_community.document_loaders.telegram",
"TelegramChatFileLoader": "langchain_community.document_loaders.telegram",
"TelegramChatLoader": "langchain_community.document_loaders.telegram",
Expand Down Expand Up @@ -731,7 +736,7 @@
"XorbitsLoader": "langchain_community.document_loaders.xorbits",
"YoutubeAudioLoader": "langchain_community.document_loaders.blob_loaders",
"YoutubeLoader": "langchain_community.document_loaders.youtube",
"YuqueLoader": "langchain_community.document_loaders.yuque",
"YuqueLoader": "langchain_community.document_loaders.yuque"
}


Expand Down Expand Up @@ -897,6 +902,7 @@ def __getattr__(name: str) -> Any:
"SpreedlyLoader",
"StripeLoader",
"SurrealDBLoader",
"SupadataLoader",
"TelegramChatApiLoader",
"TelegramChatFileLoader",
"TelegramChatLoader",
Expand Down
138 changes: 138 additions & 0 deletions libs/community/langchain_community/document_loaders/supadata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
from __future__ import annotations

import json
from dataclasses import asdict, dataclass, field, is_dataclass
from typing import Any, Dict, Iterable, Iterator, List, Literal, Optional

from langchain_core.documents import Document
from langchain_core.utils import get_from_env

from langchain_community.document_loaders.base import BaseLoader

# The two Supadata endpoints this loader knows how to call.
SupadataOperation = Literal["metadata", "transcript"]

# The SDK is an optional dependency, so its absence must not break importing
# this module. The name is kept as a module-level alias so tests can patch
# `Supadata` via langchain_community.document_loaders.supadata.Supadata.
# Only ImportError is swallowed: any other failure raised while importing the
# SDK is a real problem and should not be reported later as "not installed".
try:  # pragma: no cover - exercised via tests with mocking
    from supadata import Supadata  # type: ignore[import]
except ImportError:  # pragma: no cover
    Supadata = None  # type: ignore[assignment]


@dataclass
class SupadataLoader(BaseLoader):
    """Load documents from the Supadata Web & Video Data API.

    This loader wraps the official :mod:`supadata` Python SDK to fetch either:

    * structured media metadata (``operation="metadata"``)
    * media transcripts (``operation="transcript"``)

    Exactly one :class:`Document` is produced per URL, in the order given.

    Parameters
    ----------
    urls:
        List of URLs to fetch from Supadata. A single URL passed as a bare
        string is accepted and treated as a one-element list.
    api_key:
        Supadata API key. If omitted, the ``SUPADATA_API_KEY`` environment
        variable is used. The key is excluded from ``repr()`` output.
    operation:
        Which Supadata endpoint to call: ``"metadata"`` or ``"transcript"``.
    lang:
        Optional transcript language preference.
    text:
        When ``True``, request a plain-text transcript instead of timestamped
        chunks (see Supadata documentation).
    mode:
        Transcript mode, for example ``"native"``, ``"auto"`` or ``"generate"``.
    params:
        Extra keyword arguments forwarded to the underlying Supadata SDK call.
        For transcripts, ``lang``/``text``/``mode`` override same-named keys
        in ``params`` when they are not ``None``.
    """

    urls: List[str]
    # repr=False keeps the secret out of the dataclass-generated __repr__,
    # which otherwise ends up in logs and tracebacks.
    api_key: Optional[str] = field(default=None, repr=False)
    operation: SupadataOperation = "transcript"

    lang: Optional[str] = None
    text: Optional[bool] = None
    mode: Optional[str] = None

    params: Dict[str, Any] = field(default_factory=dict)

    def __post_init__(self) -> None:
        """Normalise ``urls`` and resolve the API key from the environment."""
        # A bare string would otherwise be iterated character by character.
        if isinstance(self.urls, str):
            self.urls = [self.urls]

        # Explicit api_key wins; otherwise read from env.
        if self.api_key is None:
            # get_from_env(key, env_var) -> str
            self.api_key = get_from_env("api_key", "SUPADATA_API_KEY")

    def _get_client(self) -> Any:
        """Build a Supadata SDK client for the configured API key.

        Raises
        ------
        ImportError
            If the ``supadata`` package could not be imported.
        ValueError
            If the API key is empty.
        """
        if Supadata is None:
            raise ImportError(
                "Could not import 'supadata'. Install it with "
                "`pip install supadata` to use `SupadataLoader`."
            )

        if not self.api_key:
            raise ValueError(
                "Supadata API key is empty. "
                "Set the SUPADATA_API_KEY environment variable or "
                "pass `api_key` when constructing SupadataLoader."
            )

        return Supadata(api_key=self.api_key)

    def lazy_load(self) -> Iterator[Document]:
        """Yield one document per URL using the configured operation.

        Raises
        ------
        ValueError
            If ``operation`` is neither ``"metadata"`` nor ``"transcript"``.
        """
        client = self._get_client()

        # The operation does not vary per URL, so resolve the handler once.
        if self.operation == "metadata":
            load_one = self._load_metadata
        elif self.operation == "transcript":
            load_one = self._load_transcript
        else:
            raise ValueError(
                f"Unsupported operation: {self.operation!r}. "
                "Expected 'metadata' or 'transcript'."
            )

        for url in self.urls:
            yield load_one(client, url)

    @staticmethod
    def _to_jsonable(result: Any) -> Any:
        """Convert an SDK result into something ``json.dumps`` can encode.

        Dataclass instances and objects exposing a callable ``model_dump``
        are converted to dictionaries; anything else is returned unchanged.
        """
        if is_dataclass(result) and not isinstance(result, type):
            return asdict(result)
        model_dump = getattr(result, "model_dump", None)
        if callable(model_dump):
            return model_dump()
        return result

    @staticmethod
    def _transcript_text(content: Any) -> str:
        """Flatten transcript content into plain text.

        A string is returned as-is. A list/tuple of chunks is joined with
        single spaces using each chunk's ``text`` attribute or key; chunks
        without one fall back to ``str(chunk)``. Any other value is passed
        through ``str``.
        """
        if isinstance(content, str):
            return content
        if isinstance(content, (list, tuple)):
            pieces = []
            for chunk in content:
                if isinstance(chunk, dict):
                    piece = chunk.get("text")
                else:
                    piece = getattr(chunk, "text", None)
                pieces.append(str(chunk) if piece is None else str(piece))
            return " ".join(pieces)
        return str(content)

    def _load_metadata(self, client: Any, url: str) -> Document:
        """Fetch metadata for ``url`` and return it as a JSON document."""
        result = client.metadata(url=url, **self.params)

        # default=str is a deliberate last resort: a value JSON cannot encode
        # natively degrades to its string form instead of aborting the load.
        page_content = json.dumps(
            self._to_jsonable(result),
            ensure_ascii=False,
            indent=2,
            default=str,
        )
        metadata = {
            "source": url,
            "supadata_operation": "metadata",
        }

        return Document(page_content=page_content, metadata=metadata)

    def _load_transcript(self, client: Any, url: str) -> Document:
        """Fetch the transcript for ``url``.

        Returns a document holding the transcript text when the SDK result has
        a ``content`` attribute. Otherwise the result is treated as a pending
        job: the document has empty ``page_content`` and carries the result's
        ``job_id`` in its metadata.
        """
        # Copy so the user's params dict is never mutated.
        transcript_kwargs: Dict[str, Any] = dict(self.params)
        if self.lang is not None:
            transcript_kwargs["lang"] = self.lang
        if self.text is not None:
            transcript_kwargs["text"] = self.text
        if self.mode is not None:
            transcript_kwargs["mode"] = self.mode

        result = client.transcript(url=url, **transcript_kwargs)

        content = getattr(result, "content", None)
        if content is not None:
            # Immediate transcript result.
            metadata = {
                "source": url,
                "supadata_operation": "transcript",
                "lang": getattr(result, "lang", None),
                "mode": transcript_kwargs.get("mode"),
            }
            return Document(
                page_content=self._transcript_text(content),
                metadata=metadata,
            )

        # Asynchronous job: result carries a job_id instead of content.
        job_id = getattr(result, "job_id", None)
        metadata = {
            "source": url,
            "supadata_operation": "transcript_job",
            "job_id": job_id,
            "lang": transcript_kwargs.get("lang"),
            "mode": transcript_kwargs.get("mode"),
        }
        return Document(page_content="", metadata=metadata)
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
"""
Tests for SupadataLoader integration.

These tests:
- Do NOT hit the real Supadata API.
- Patch the Supadata SDK client and assert correct calls.
- Check that Documents are created with expected content/metadata.
"""

from __future__ import annotations

import os
from typing import Any
from unittest.mock import MagicMock, patch

import pytest
from langchain_core.documents import Document

# Dotted path of the module under test. MODULE_PATH is the prefix used by the
# ``@patch(f"{MODULE_PATH}.Supadata")`` decorators below, so the SDK class is
# replaced on the loader module itself.
# NOTE: the tests also import SupadataLoader from this path as a literal; if
# the loader module moves, update this constant and those import statements.
LOADER_IMPORT_PATH = "langchain_community.document_loaders.supadata"
MODULE_PATH = LOADER_IMPORT_PATH


@pytest.fixture(autouse=True)
def clear_supadata_env(monkeypatch: pytest.MonkeyPatch) -> None:
    """Remove SUPADATA_API_KEY before every test so none inherits a real key."""
    # raising=False: the variable being absent already is the desired state.
    monkeypatch.delenv(name="SUPADATA_API_KEY", raising=False)


def make_mock_client() -> MagicMock:
    """Return a stand-in Supadata client exposing the two SDK entry points.

    Both ``metadata`` and ``transcript`` are dedicated ``MagicMock`` objects so
    each test can set a return value and assert on the recorded calls.
    """
    stub = MagicMock()
    for method_name in ("metadata", "transcript"):
        setattr(stub, method_name, MagicMock())
    return stub


@patch(f"{MODULE_PATH}.Supadata")
def test_metadata_operation_uses_explicit_api_key(
    mock_supadata_cls: MagicMock,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """An explicit ``api_key`` must win over SUPADATA_API_KEY.

    Also checks that ``operation="metadata"`` forwards the URL and extra
    ``params`` to ``Supadata.metadata`` and serialises the result as JSON.
    """
    import json

    # Put a *different* key in the environment: the test then fails if the
    # loader ever prefers the env var over the explicit argument.
    monkeypatch.setenv("SUPADATA_API_KEY", "ENV_KEY")

    from langchain_community.document_loaders.supadata import SupadataLoader

    mock_client = make_mock_client()
    mock_client.metadata.return_value = {"title": "Test Title"}
    mock_supadata_cls.return_value = mock_client

    url = "https://example.com/video"
    loader = SupadataLoader(
        urls=[url],
        api_key="EXPLICIT_KEY",
        operation="metadata",
        params={"foo": "bar"},
    )

    docs = list(loader.lazy_load())

    # Supadata client should be constructed with our explicit key
    mock_supadata_cls.assert_called_once_with(api_key="EXPLICIT_KEY")

    # metadata() should be called with url + params
    mock_client.metadata.assert_called_once_with(url=url, foo="bar")

    assert len(docs) == 1
    doc = docs[0]
    assert isinstance(doc, Document)
    # Parse instead of substring-matching so the whole payload is verified.
    assert json.loads(doc.page_content) == {"title": "Test Title"}
    assert doc.metadata["source"] == url
    assert doc.metadata["supadata_operation"] == "metadata"


@patch(f"{MODULE_PATH}.Supadata")
def test_metadata_operation_uses_env_api_key_when_not_provided(
    mock_supadata_cls: MagicMock,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Without an explicit ``api_key`` the loader falls back to the env var."""
    env_key = "ENV_KEY"
    monkeypatch.setenv("SUPADATA_API_KEY", env_key)

    from langchain_community.document_loaders.supadata import SupadataLoader

    stub_client = make_mock_client()
    stub_client.metadata.return_value = {"id": "123"}
    mock_supadata_cls.return_value = stub_client

    documents = SupadataLoader(
        urls=["https://example.com"],
        operation="metadata",
    ).lazy_load()

    # The generator must be drained, otherwise the client is never built.
    for _ in documents:
        pass

    mock_supadata_cls.assert_called_once_with(api_key=env_key)


@patch(f"{MODULE_PATH}.Supadata")
def test_transcript_operation_immediate_result(
    mock_supadata_cls: MagicMock,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """
    When Supadata.transcript returns an object with a 'content' attribute,
    the loader should put that content into page_content, forward the
    lang/text/mode options to the SDK, and record lang/mode in the metadata.
    """
    monkeypatch.setenv("SUPADATA_API_KEY", "TEST_KEY")

    from langchain_community.document_loaders.supadata import SupadataLoader

    mock_client = make_mock_client()

    class DummyTranscript:
        def __init__(self) -> None:
            self.content = "hello from transcript"
            self.lang = "en"

    mock_client.transcript.return_value = DummyTranscript()
    mock_supadata_cls.return_value = mock_client

    url = "https://example.com/video"
    loader = SupadataLoader(
        urls=[url],
        operation="transcript",
        lang="en",
        text=True,
        mode="auto",
    )

    docs = list(loader.lazy_load())

    mock_supadata_cls.assert_called_once_with(api_key="TEST_KEY")
    # Every transcript option set on the loader must reach the SDK call.
    mock_client.transcript.assert_called_once_with(
        url=url, lang="en", text=True, mode="auto"
    )

    assert len(docs) == 1

    doc = docs[0]
    assert isinstance(doc, Document)
    assert doc.page_content == "hello from transcript"
    assert doc.metadata["source"] == url
    assert doc.metadata["supadata_operation"] == "transcript"
    # lang comes from the SDK result, mode from the request options.
    assert doc.metadata["lang"] == "en"
    assert doc.metadata["mode"] == "auto"


@patch(f"{MODULE_PATH}.Supadata")
def test_transcript_operation_job_result(
    mock_supadata_cls: MagicMock,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """
    When Supadata.transcript returns an object with 'job_id' and no 'content',
    the loader should return an empty page_content, mark the document as a
    transcript job, and put job_id plus the requested lang/mode in metadata.
    """
    monkeypatch.setenv("SUPADATA_API_KEY", "TEST_KEY")

    from langchain_community.document_loaders.supadata import SupadataLoader

    mock_client = make_mock_client()

    class DummyJob:
        def __init__(self) -> None:
            self.job_id = "job-123"

    mock_client.transcript.return_value = DummyJob()
    mock_supadata_cls.return_value = mock_client

    url = "https://example.com/long-video"
    loader = SupadataLoader(
        urls=[url],
        operation="transcript",
        lang="en",
        mode="auto",
    )

    docs = list(loader.lazy_load())

    # `text` was left unset, so it must not be forwarded to the SDK.
    mock_client.transcript.assert_called_once_with(
        url=url, lang="en", mode="auto"
    )

    assert len(docs) == 1

    doc = docs[0]
    assert isinstance(doc, Document)
    assert doc.page_content == ""  # by design for job-based results
    assert doc.metadata["source"] == url
    assert doc.metadata["supadata_operation"] == "transcript_job"
    assert doc.metadata["job_id"] == "job-123"
    # No transcript came back, so lang/mode echo what was requested.
    assert doc.metadata["lang"] == "en"
    assert doc.metadata["mode"] == "auto"