|
| 1 | +""" |
| 2 | +Tests for SupadataLoader integration. |
| 3 | +
|
| 4 | +These tests: |
| 5 | +- Do NOT hit the real Supadata API. |
| 6 | +- Patch the Supadata SDK client and assert correct calls. |
| 7 | +- Check that Documents are created with expected content/metadata. |
| 8 | +""" |
| 9 | + |
| 10 | +from __future__ import annotations |
| 11 | + |
| 12 | +import os |
| 13 | +from typing import Any |
| 14 | +from unittest.mock import MagicMock, patch |
| 15 | + |
| 16 | +import pytest |
| 17 | +from langchain_core.documents import Document |
| 18 | + |
# >>> IMPORTANT <<<
# If your loader lives somewhere else (e.g. data_loaders.supadata), update
# LOADER_IMPORT_PATH below (MODULE_PATH is derived from it) AND the
# `from ... import SupadataLoader` statement inside each test function.
| 22 | +LOADER_IMPORT_PATH = "langchain_community.document_loaders.supadata" |
| 23 | +MODULE_PATH = LOADER_IMPORT_PATH |
| 24 | + |
| 25 | + |
@pytest.fixture(autouse=True)
def clear_supadata_env(monkeypatch: pytest.MonkeyPatch) -> None:
    """Remove SUPADATA_API_KEY from the environment before every test.

    Declared ``autouse`` so it applies to each test in this module without
    being requested; tests that need a key must set it themselves.
    """
    env_var = "SUPADATA_API_KEY"
    # raising=False: do not fail when the variable is already absent.
    monkeypatch.delenv(env_var, raising=False)
| 30 | + |
| 31 | + |
def make_mock_client() -> MagicMock:
    """Build a stand-in for the Supadata SDK client.

    Returns:
        A fresh ``MagicMock`` whose ``metadata`` and ``transcript`` attributes
        are each explicitly assigned their own ``MagicMock``, so tests can set
        return values on them and assert on how they were called.
    """
    fake_client = MagicMock()
    for method_name in ("metadata", "transcript"):
        setattr(fake_client, method_name, MagicMock())
    return fake_client
| 38 | + |
| 39 | + |
@patch(f"{MODULE_PATH}.Supadata")
def test_metadata_operation_uses_explicit_api_key(
    mock_supadata_cls: MagicMock,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """An explicit ``api_key`` reaches the Supadata client on metadata loads.

    Verifies three things with the SDK class patched out:
    the client is built with the explicit key, ``metadata`` receives the URL
    plus the extra ``params`` as keyword arguments, and the single resulting
    ``Document`` carries the expected content and metadata.
    """
    # Guarantee there is no ambient key, so only the explicit one can be used.
    monkeypatch.delenv("SUPADATA_API_KEY", raising=False)

    from langchain_community.document_loaders.supadata import SupadataLoader

    target_url = "https://example.com/video"

    fake_client = make_mock_client()
    fake_client.metadata.return_value = {"title": "Test Title"}
    mock_supadata_cls.return_value = fake_client

    loader = SupadataLoader(
        urls=[target_url],
        api_key="EXPLICIT_KEY",
        operation="metadata",
        params={"foo": "bar"},
    )
    documents = list(loader.lazy_load())

    # The SDK client must have been constructed with the explicit key ...
    mock_supadata_cls.assert_called_once_with(api_key="EXPLICIT_KEY")
    # ... and metadata() invoked with the URL and the forwarded params.
    fake_client.metadata.assert_called_once_with(url=target_url, foo="bar")

    assert len(documents) == 1
    result = documents[0]
    assert isinstance(result, Document)
    assert '"title"' in result.page_content
    assert result.metadata["source"] == target_url
    assert result.metadata["supadata_operation"] == "metadata"
| 77 | + |
| 78 | + |
@patch(f"{MODULE_PATH}.Supadata")
def test_metadata_operation_uses_env_api_key_when_not_provided(
    mock_supadata_cls: MagicMock,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Without an ``api_key`` argument, SUPADATA_API_KEY supplies the key."""
    monkeypatch.setenv("SUPADATA_API_KEY", "ENV_KEY")

    from langchain_community.document_loaders.supadata import SupadataLoader

    fake_client = make_mock_client()
    fake_client.metadata.return_value = {"id": "123"}
    mock_supadata_cls.return_value = fake_client

    # No api_key passed: the loader has only the environment to draw from.
    loader = SupadataLoader(urls=["https://example.com"], operation="metadata")

    # Exhaust the generator so the client is actually created and used.
    for _ in loader.lazy_load():
        pass

    mock_supadata_cls.assert_called_once_with(api_key="ENV_KEY")
| 101 | + |
| 102 | + |
@patch(f"{MODULE_PATH}.Supadata")
def test_transcript_operation_immediate_result(
    mock_supadata_cls: MagicMock,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Transcript results with ``content`` end up in ``page_content``.

    For smaller inputs Supadata.transcript returns a transcript object
    with a 'content' attribute. Loader should put that into page_content.

    Besides checking the produced ``Document``, this test also asserts that
    the mocked ``transcript`` method was called exactly once and was given
    the requested URL, so a loader that never reaches the SDK cannot pass.
    """
    monkeypatch.setenv("SUPADATA_API_KEY", "TEST_KEY")

    from langchain_community.document_loaders.supadata import SupadataLoader

    mock_client = make_mock_client()

    # Plain class instead of a MagicMock: a MagicMock reports every attribute
    # as present, whereas this object exposes only `content` and `lang`.
    class DummyTranscript:
        def __init__(self) -> None:
            self.content = "hello from transcript"
            self.lang = "en"

    mock_client.transcript.return_value = DummyTranscript()
    mock_supadata_cls.return_value = mock_client

    url = "https://example.com/video"
    loader = SupadataLoader(
        urls=[url],
        operation="transcript",
        lang="en",
        text=True,
        mode="auto",
    )

    docs = list(loader.lazy_load())

    # The SDK must have been hit once, and with our URL. The check accepts the
    # URL either positionally or as a keyword so it does not over-constrain
    # the loader's calling convention.
    mock_client.transcript.assert_called_once()
    transcript_call = mock_client.transcript.call_args
    assert url in (*transcript_call.args, *transcript_call.kwargs.values())

    assert len(docs) == 1

    doc = docs[0]
    assert isinstance(doc, Document)
    assert "hello from transcript" in doc.page_content
    assert doc.metadata["source"] == url
    assert doc.metadata["supadata_operation"] == "transcript"
    # The loader may or may not copy the language into metadata; if it does,
    # it must be the one reported by the transcript.
    assert doc.metadata.get("lang") in ("en", None)
| 144 | + |
| 145 | + |
@patch(f"{MODULE_PATH}.Supadata")
def test_transcript_operation_job_result(
    mock_supadata_cls: MagicMock,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Job-style transcript results yield an empty Document with a job id.

    For larger inputs Supadata.transcript may return a job object with 'job_id'.
    Loader should return an empty page_content and put job_id in metadata.

    Besides checking the produced ``Document``, this test also asserts that
    the mocked ``transcript`` method was called exactly once and was given
    the requested URL, so a loader that never reaches the SDK cannot pass.
    """
    monkeypatch.setenv("SUPADATA_API_KEY", "TEST_KEY")

    from langchain_community.document_loaders.supadata import SupadataLoader

    mock_client = make_mock_client()

    # Plain class instead of a MagicMock: a MagicMock would auto-create a
    # `content` attribute, whereas this object exposes only `job_id`.
    class DummyJob:
        def __init__(self) -> None:
            self.job_id = "job-123"

    mock_client.transcript.return_value = DummyJob()
    mock_supadata_cls.return_value = mock_client

    url = "https://example.com/long-video"
    loader = SupadataLoader(
        urls=[url],
        operation="transcript",
        lang="en",
        mode="auto",
    )

    docs = list(loader.lazy_load())

    # The SDK must have been hit once, and with our URL. The check accepts the
    # URL either positionally or as a keyword so it does not over-constrain
    # the loader's calling convention.
    mock_client.transcript.assert_called_once()
    transcript_call = mock_client.transcript.call_args
    assert url in (*transcript_call.args, *transcript_call.kwargs.values())

    assert len(docs) == 1

    doc = docs[0]
    assert isinstance(doc, Document)
    assert doc.page_content == ""  # by design for job-based results
    assert doc.metadata["source"] == url
    # Either operation label is accepted for a job-based result.
    assert doc.metadata["supadata_operation"] in ("transcript_job", "transcript")
    assert doc.metadata.get("job_id") == "job-123"
0 commit comments