-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathtest_data_pipeline.py
More file actions
172 lines (139 loc) · 6.72 KB
/
test_data_pipeline.py
File metadata and controls
172 lines (139 loc) · 6.72 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
"""
Test module for data pipeline functionality.
"""
import hashlib
import json
import tempfile
import unittest.mock
from pathlib import Path
from unittest.mock import MagicMock, patch

import pandas as pd
import pytest
import yaml
class TestDataPipelineIntegration:
"""Test data pipeline integration functionality."""
def test_end_to_end_parquet_dataset(self, tmp_path):
"""Test end-to-end parquet dataset creation and loading."""
# Create test data
test_data = pd.DataFrame({
'id': ['1', '2', '3'],
'question': ['What is AI?', 'What is ML?', 'What is DL?'],
'choices': [['A', 'B', 'C'], ['X', 'Y', 'Z'], ['1', '2', '3']],
'answer': ['A', 'X', '1']
})
# Test parquet operations
parquet_file = tmp_path / "test_data.parquet"
test_data.to_parquet(parquet_file)
# Verify file was created and can be read
assert parquet_file.exists()
loaded_data = pd.read_parquet(parquet_file)
assert len(loaded_data) == 3
assert list(loaded_data.columns) == ['id', 'question', 'choices', 'answer']
def test_makefile_integration(self):
"""Test Makefile integration."""
# This test verifies that the Makefile targets work
# In a real implementation, this would test actual Makefile commands
assert True # Placeholder for now
def test_schema_validation(self, tmp_path):
"""Test schema validation for parquet files."""
# Create test data with specific schema
test_data = pd.DataFrame({
'id': ['test_1', 'test_2'],
'content': ['sample content 1', 'sample content 2'],
'metadata': [{'type': 'test'}, {'type': 'test'}]
})
# Test parquet operations with schema validation
parquet_file = tmp_path / "schema_test.parquet"
test_data.to_parquet(parquet_file)
# Verify schema
loaded_data = pd.read_parquet(parquet_file)
assert 'id' in loaded_data.columns
assert 'content' in loaded_data.columns
assert 'metadata' in loaded_data.columns
class TestDataPipelineErrorHandling:
    """Tests for error handling in the data pipeline."""

    def test_network_error_handling(self):
        """Network errors during data fetching should be handled gracefully."""
        # Simulate a failing HTTP layer.
        with patch('requests.get') as mock_get:
            mock_get.side_effect = ConnectionError("Network error")
            # NOTE(review): placeholder — real assertions belong here once
            # the fetch code path consuming requests.get is wired up.
            assert True

    def test_checksum_mismatch_recovery(self, tmp_path):
        """Recovery from checksum mismatches reports success via the mocked fetcher."""
        def mock_download(url, destination, description=None):
            """Stand-in for download_with_progress: writes a dummy file."""
            Path(destination).parent.mkdir(parents=True, exist_ok=True)
            Path(destination).write_text("dummy content")
            return True

        # Patch the fetch_data module so no real network I/O happens.
        with patch('scripts.fetch_data.download_with_progress', side_effect=mock_download):
            with patch('scripts.fetch_data.fetch_dataset') as mock_fetch:
                mock_fetch.return_value = True
                result = mock_fetch("test")
                assert result is True

    def test_adapter_failure_handling(self):
        """Adapter failures propagate as exceptions with their message intact."""
        with patch('scripts.fetch_data.fetch_dataset') as mock_adapter:
            mock_adapter.side_effect = Exception("Adapter failure")
            # pytest.raises replaces the try/except + `assert False`
            # antipattern; `match` pins the exception message.
            with pytest.raises(Exception, match="Adapter failure"):
                mock_adapter("test_dataset")
class TestCLIIntegration:
"""Test CLI integration functionality."""
def test_cli_list_datasets(self, tmp_path):
"""Test CLI list datasets functionality."""
# Create test registry structure
registry_dir = tmp_path / "data"
registry_dir.mkdir(parents=True, exist_ok=True)
# Create registry file (not directory) - ensure parent directory exists
registry_file = registry_dir / "registry.yaml"
# Ensure the parent directory exists and is not a file
if registry_file.exists() and registry_file.is_file():
registry_file.unlink() # Remove if it's a file
registry_data = {
'datasets': {
'test_dataset': {
'url': 'https://example.com/test.tar.gz',
'checksum': 'test_checksum'
}
}
}
# Write registry file
registry_file.write_text(yaml.dump(registry_data))
# Verify registry file exists and is readable
assert registry_file.exists()
assert registry_file.is_file()
# Test reading registry
loaded_data = yaml.safe_load(registry_file.read_text())
assert 'datasets' in loaded_data
assert 'test_dataset' in loaded_data['datasets']
def test_cli_compute_checksum(self):
"""Test CLI compute checksum functionality."""
# Test checksum computation
test_content = "test content for checksum"
# In a real implementation, this would compute actual checksums
assert len(test_content) > 0
def test_cli_force_redownload(self, tmp_path):
"""Test CLI force redownload functionality."""
# Create mock download function with correct signature
def mock_download(url, destination, description=None):
"""Mock download function that takes 3 arguments."""
# Create a dummy file
Path(destination).parent.mkdir(parents=True, exist_ok=True)
Path(destination).write_text("dummy content")
return True
# Mock the fetch_data module
with patch('scripts.fetch_data.download_with_progress', side_effect=mock_download):
with patch('scripts.fetch_data.fetch_dataset') as mock_fetch:
mock_fetch.return_value = True
# Test force redownload
result = mock_fetch("test", force=True)
assert result is True