Skip to content

Commit 1bf9a6e

Browse files
committed
Add SupadataLoader document loader
1 parent a94b1a6 commit 1bf9a6e

File tree

3 files changed

+328
-0
lines changed

3 files changed

+328
-0
lines changed

libs/community/langchain_community/document_loaders/__init__.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -532,6 +532,9 @@
532532
from langchain_community.document_loaders.yuque import (
533533
YuqueLoader,
534534
)
535+
from langchain_community.document_loaders.supadata import (
536+
SupadataLoader,
537+
)
535538

536539

537540
_module_lookup = {
@@ -732,6 +735,8 @@
732735
"YoutubeAudioLoader": "langchain_community.document_loaders.blob_loaders",
733736
"YoutubeLoader": "langchain_community.document_loaders.youtube",
734737
"YuqueLoader": "langchain_community.document_loaders.yuque",
738+
"SupadataLoader": "langchain_community.document_loaders.supadata",
739+
735740
}
736741

737742

@@ -940,4 +945,5 @@ def __getattr__(name: str) -> Any:
940945
"YoutubeAudioLoader",
941946
"YoutubeLoader",
942947
"YuqueLoader",
948+
"SupadataLoader"
943949
]
Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
from __future__ import annotations

import json
from dataclasses import asdict, dataclass, field, is_dataclass
from typing import Any, Dict, Iterable, Iterator, List, Literal, Optional

from langchain_core.documents import Document
from langchain_core.utils import get_from_env

from langchain_community.document_loaders.base import BaseLoader
10+
11+
# Names of the Supadata endpoints this loader knows how to call.
SupadataOperation = Literal["metadata", "transcript"]

# Module-level alias so tests can patch `Supadata` via
# langchain_community.document_loaders.supadata.Supadata
try:  # pragma: no cover - exercised via tests with mocking
    from supadata import Supadata  # type: ignore[import]
except ImportError:  # pragma: no cover
    # Only a missing SDK is tolerated here. Any other failure raised while
    # importing it is a real problem and must not be reported later as
    # "package not installed".
    Supadata = None  # type: ignore[assignment]
19+
20+
21+
@dataclass
22+
class SupadataLoader(BaseLoader):
23+
"""Load documents from the Supadata Web & Video Data API.
24+
25+
This loader wraps the official :mod:`supadata` Python SDK to fetch either:
26+
27+
* structured media metadata (``operation="metadata"``)
28+
* media transcripts (``operation="transcript"``)
29+
30+
Parameters
31+
----------
32+
urls:
33+
List of URLs to fetch from Supadata.
34+
api_key:
35+
Supadata API key. If omitted, the ``SUPADATA_API_KEY`` environment
36+
variable is used.
37+
operation:
38+
Which Supadata endpoint to call: ``"metadata"`` or ``"transcript"``.
39+
lang:
40+
Optional transcript language preference.
41+
text:
42+
When ``True``, request a plain-text transcript instead of timestamped
43+
chunks (see Supadata documentation).
44+
mode:
45+
Transcript mode, for example ``"native"``, ``"auto"`` or ``"generate"``.
46+
params:
47+
Extra keyword arguments forwarded to the underlying Supadata SDK call.
48+
"""
49+
50+
urls: List[str]
51+
api_key: Optional[str] = None
52+
operation: SupadataOperation = "transcript"
53+
54+
lang: Optional[str] = None
55+
text: Optional[bool] = None
56+
mode: Optional[str] = None
57+
58+
params: Dict[str, Any] = field(default_factory=dict)
59+
60+
def __post_init__(self) -> None:
61+
# Explicit api_key wins; otherwise read from env.
62+
if self.api_key is None:
63+
# get_from_env(key, env_var) -> str
64+
self.api_key = get_from_env("api_key", "SUPADATA_API_KEY")
65+
66+
def _get_client(self) -> Any:
67+
if Supadata is None:
68+
raise ImportError(
69+
"Could not import 'supadata'. Install it with "
70+
"`pip install supadata` to use `SupadataLoader`."
71+
)
72+
73+
if not self.api_key:
74+
raise ValueError(
75+
"Supadata API key is empty. "
76+
"Set the SUPADATA_API_KEY environment variable or "
77+
"pass `api_key` when constructing SupadataLoader."
78+
)
79+
80+
return Supadata(api_key=self.api_key)
81+
82+
def lazy_load(self) -> Iterable[Document]:
83+
client = self._get_client()
84+
85+
for url in self.urls:
86+
if self.operation == "metadata":
87+
yield self._load_metadata(client, url)
88+
elif self.operation == "transcript":
89+
yield self._load_transcript(client, url)
90+
else:
91+
raise ValueError(
92+
f"Unsupported operation: {self.operation!r}. "
93+
"Expected 'metadata' or 'transcript'."
94+
)
95+
96+
def _load_metadata(self, client: Any, url: str) -> Document:
97+
result = client.metadata(url=url, **self.params)
98+
99+
page_content = json.dumps(result, ensure_ascii=False, indent=2)
100+
metadata = {
101+
"source": url,
102+
"supadata_operation": "metadata",
103+
}
104+
105+
return Document(page_content=page_content, metadata=metadata)
106+
107+
def _load_transcript(self, client: Any, url: str) -> Document:
108+
transcript_kwargs: Dict[str, Any] = dict(self.params)
109+
if self.lang is not None:
110+
transcript_kwargs["lang"] = self.lang
111+
if self.text is not None:
112+
transcript_kwargs["text"] = self.text
113+
if self.mode is not None:
114+
transcript_kwargs["mode"] = self.mode
115+
116+
result = client.transcript(url=url, **transcript_kwargs)
117+
118+
content = getattr(result, "content", None)
119+
if content is not None:
120+
# Immediate transcript result.
121+
metadata = {
122+
"source": url,
123+
"supadata_operation": "transcript",
124+
"lang": getattr(result, "lang", None),
125+
"mode": transcript_kwargs.get("mode"),
126+
}
127+
return Document(page_content=str(content), metadata=metadata)
128+
129+
# Asynchronous job: result carries a job_id instead of content.
130+
job_id = getattr(result, "job_id", None)
131+
metadata = {
132+
"source": url,
133+
"supadata_operation": "transcript_job",
134+
"job_id": job_id,
135+
"lang": transcript_kwargs.get("lang"),
136+
"mode": transcript_kwargs.get("mode"),
137+
}
138+
return Document(page_content="", metadata=metadata)
Lines changed: 184 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,184 @@
1+
"""
2+
Tests for SupadataLoader integration.
3+
4+
These tests:
5+
- Do NOT hit the real Supadata API.
6+
- Patch the Supadata SDK client and assert correct calls.
7+
- Check that Documents are created with expected content/metadata.
8+
"""
9+
10+
from __future__ import annotations
11+
12+
import os
13+
from typing import Any
14+
from unittest.mock import MagicMock, patch
15+
16+
import pytest
17+
from langchain_core.documents import Document
18+
19+
# Dotted path of the module under test. `MODULE_PATH` is the prefix used to
# build `patch(...)` targets and is the same string as `LOADER_IMPORT_PATH`,
# so moving the loader only requires changing this one literal.
MODULE_PATH = LOADER_IMPORT_PATH = "langchain_community.document_loaders.supadata"
24+
25+
26+
@pytest.fixture(autouse=True)
def clear_supadata_env(monkeypatch: pytest.MonkeyPatch) -> None:
    """Remove ``SUPADATA_API_KEY`` from the environment before every test.

    Autouse, so each test starts without a key unless it sets one itself.
    """
    env_var = "SUPADATA_API_KEY"
    monkeypatch.delenv(env_var, raising=False)
30+
31+
32+
def make_mock_client() -> MagicMock:
    """Return a stand-in Supadata client.

    The mock carries explicit ``metadata`` and ``transcript`` child mocks so
    tests can set return values and assert on the calls made to each endpoint.
    """
    return MagicMock(metadata=MagicMock(), transcript=MagicMock())
38+
39+
40+
@patch(f"{MODULE_PATH}.Supadata")
def test_metadata_operation_uses_explicit_api_key(
    mock_supadata_cls: MagicMock,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """An explicit ``api_key`` reaches the SDK and ``metadata`` gets url + params."""
    from langchain_community.document_loaders.supadata import SupadataLoader

    # With no env var present, the key can only come from the constructor.
    monkeypatch.delenv("SUPADATA_API_KEY", raising=False)

    video_url = "https://example.com/video"
    sdk_client = make_mock_client()
    sdk_client.metadata.return_value = {"title": "Test Title"}
    mock_supadata_cls.return_value = sdk_client

    loader = SupadataLoader(
        urls=[video_url],
        api_key="EXPLICIT_KEY",
        operation="metadata",
        params={"foo": "bar"},
    )
    documents = list(loader.lazy_load())

    # The SDK client is built once, with the key we passed in.
    mock_supadata_cls.assert_called_once_with(api_key="EXPLICIT_KEY")

    # The URL and the extra params are forwarded to metadata().
    sdk_client.metadata.assert_called_once_with(url=video_url, foo="bar")

    assert len(documents) == 1
    document = documents[0]
    assert isinstance(document, Document)
    assert '"title"' in document.page_content
    assert document.metadata["source"] == video_url
    assert document.metadata["supadata_operation"] == "metadata"
77+
78+
79+
@patch(f"{MODULE_PATH}.Supadata")
def test_metadata_operation_uses_env_api_key_when_not_provided(
    mock_supadata_cls: MagicMock,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Without an explicit ``api_key`` the loader reads ``SUPADATA_API_KEY``."""
    from langchain_community.document_loaders.supadata import SupadataLoader

    monkeypatch.setenv("SUPADATA_API_KEY", "ENV_KEY")

    sdk_client = make_mock_client()
    sdk_client.metadata.return_value = {"id": "123"}
    mock_supadata_cls.return_value = sdk_client

    loader = SupadataLoader(urls=["https://example.com"], operation="metadata")

    # Drain the generator so the client is actually constructed.
    for _ in loader.lazy_load():
        pass

    mock_supadata_cls.assert_called_once_with(api_key="ENV_KEY")
101+
102+
103+
@patch(f"{MODULE_PATH}.Supadata")
def test_transcript_operation_immediate_result(
    mock_supadata_cls: MagicMock,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """
    When Supadata.transcript returns an object with a 'content' attribute,
    the loader puts that text into page_content, forwards lang/text/mode to
    the SDK call and records the result's language and the requested mode.
    """
    monkeypatch.setenv("SUPADATA_API_KEY", "TEST_KEY")

    from langchain_community.document_loaders.supadata import SupadataLoader

    mock_client = make_mock_client()

    class DummyTranscript:
        def __init__(self) -> None:
            self.content = "hello from transcript"
            self.lang = "en"

    mock_client.transcript.return_value = DummyTranscript()
    mock_supadata_cls.return_value = mock_client

    url = "https://example.com/video"
    loader = SupadataLoader(
        urls=[url],
        operation="transcript",
        lang="en",
        text=True,
        mode="auto",
    )

    docs = list(loader.lazy_load())

    # The client is built from the env key, and every transcript option set on
    # the loader must reach the SDK call.
    mock_supadata_cls.assert_called_once_with(api_key="TEST_KEY")
    mock_client.transcript.assert_called_once_with(
        url=url, lang="en", text=True, mode="auto"
    )

    assert len(docs) == 1

    doc = docs[0]
    assert isinstance(doc, Document)
    assert doc.page_content == "hello from transcript"
    assert doc.metadata["source"] == url
    assert doc.metadata["supadata_operation"] == "transcript"
    # "lang" comes from the SDK result, "mode" from the request.
    assert doc.metadata["lang"] == "en"
    assert doc.metadata["mode"] == "auto"
144+
145+
146+
@patch(f"{MODULE_PATH}.Supadata")
def test_transcript_operation_job_result(
    mock_supadata_cls: MagicMock,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """
    When Supadata.transcript returns an object with 'job_id' and no 'content',
    the loader returns an empty page_content, marks the document as a
    transcript job and stores job_id plus the requested lang/mode in metadata.
    """
    monkeypatch.setenv("SUPADATA_API_KEY", "TEST_KEY")

    from langchain_community.document_loaders.supadata import SupadataLoader

    mock_client = make_mock_client()

    class DummyJob:
        def __init__(self) -> None:
            self.job_id = "job-123"

    mock_client.transcript.return_value = DummyJob()
    mock_supadata_cls.return_value = mock_client

    url = "https://example.com/long-video"
    loader = SupadataLoader(
        urls=[url],
        operation="transcript",
        lang="en",
        mode="auto",
    )

    docs = list(loader.lazy_load())

    # `text` was left unset, so it must not be forwarded to the SDK.
    mock_client.transcript.assert_called_once_with(url=url, lang="en", mode="auto")

    assert len(docs) == 1

    doc = docs[0]
    assert isinstance(doc, Document)
    assert doc.page_content == ""  # by design for job-based results
    assert doc.metadata["source"] == url
    assert doc.metadata["supadata_operation"] == "transcript_job"
    assert doc.metadata["job_id"] == "job-123"
    # With no transcript available yet, lang/mode echo what was requested.
    assert doc.metadata["lang"] == "en"
    assert doc.metadata["mode"] == "auto"

0 commit comments

Comments
 (0)