-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathtest_data_pipeline.py
More file actions
172 lines (139 loc) · 6.72 KB
/
test_data_pipeline.py
File metadata and controls
172 lines (139 loc) · 6.72 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
"""
Test module for data pipeline functionality.
"""
import hashlib
import json
import tempfile
import unittest.mock
from pathlib import Path
from unittest.mock import MagicMock, patch

import pandas as pd
import pytest
import yaml
class TestDataPipelineIntegration:
"""Test data pipeline integration functionality."""
def test_end_to_end_parquet_dataset(self, tmp_path):
"""Test end-to-end parquet dataset creation and loading."""
# Create test data
test_data = pd.DataFrame({
'id': ['1', '2', '3'],
'question': ['What is AI?', 'What is ML?', 'What is DL?'],
'choices': [['A', 'B', 'C'], ['X', 'Y', 'Z'], ['1', '2', '3']],
'answer': ['A', 'X', '1']
})
# Test parquet operations
parquet_file = tmp_path / "test_data.parquet"
test_data.to_parquet(parquet_file)
# Verify file was created and can be read
assert parquet_file.exists()
loaded_data = pd.read_parquet(parquet_file)
assert len(loaded_data) == 3
assert list(loaded_data.columns) == ['id', 'question', 'choices', 'answer']
def test_makefile_integration(self):
"""Test Makefile integration."""
# This test verifies that the Makefile targets work
# In a real implementation, this would test actual Makefile commands
assert True # Placeholder for now
def test_schema_validation(self, tmp_path):
"""Test schema validation for parquet files."""
# Create test data with specific schema
test_data = pd.DataFrame({
'id': ['test_1', 'test_2'],
'content': ['sample content 1', 'sample content 2'],
'metadata': [{'type': 'test'}, {'type': 'test'}]
})
# Test parquet operations with schema validation
parquet_file = tmp_path / "schema_test.parquet"
test_data.to_parquet(parquet_file)
# Verify schema
loaded_data = pd.read_parquet(parquet_file)
assert 'id' in loaded_data.columns
assert 'content' in loaded_data.columns
assert 'metadata' in loaded_data.columns
class TestDataPipelineErrorHandling:
    """Tests for error handling in the data pipeline."""

    def test_network_error_handling(self):
        """Network errors during data fetching should be handled gracefully."""
        # Simulate a failing HTTP layer.
        with patch('requests.get') as mock_get:
            mock_get.side_effect = ConnectionError("Network error")
            # NOTE(review): placeholder — real assertions belong here once
            # the fetch code path consuming requests.get is wired up.
            assert True

    def test_checksum_mismatch_recovery(self, tmp_path):
        """Recovery from checksum mismatches reports success via the mocked fetcher."""
        def mock_download(url, destination, description=None):
            """Stand-in for download_with_progress: writes a dummy file."""
            Path(destination).parent.mkdir(parents=True, exist_ok=True)
            Path(destination).write_text("dummy content")
            return True

        # Patch the fetch_data module so no real network I/O happens.
        with patch('scripts.fetch_data.download_with_progress', side_effect=mock_download):
            with patch('scripts.fetch_data.fetch_dataset') as mock_fetch:
                mock_fetch.return_value = True
                result = mock_fetch("test")
                assert result is True

    def test_adapter_failure_handling(self):
        """Adapter failures propagate as exceptions with their message intact."""
        with patch('scripts.fetch_data.fetch_dataset') as mock_adapter:
            mock_adapter.side_effect = Exception("Adapter failure")
            # pytest.raises replaces the try/except + `assert False`
            # antipattern; `match` pins the exception message.
            with pytest.raises(Exception, match="Adapter failure"):
                mock_adapter("test_dataset")
class TestCLIIntegration:
"""Test CLI integration functionality."""
def test_cli_list_datasets(self, tmp_path):
"""Test CLI list datasets functionality."""
# Create test registry structure
registry_dir = tmp_path / "data"
registry_dir.mkdir(parents=True, exist_ok=True)
# Create registry file (not directory) - ensure parent directory exists
registry_file = registry_dir / "registry.yaml"
# Ensure the parent directory exists and is not a file
if registry_file.exists() and registry_file.is_file():
registry_file.unlink() # Remove if it's a file
registry_data = {
'datasets': {
'test_dataset': {
'url': 'https://example.com/test.tar.gz',
'checksum': 'test_checksum'
}
}
}
# Write registry file
registry_file.write_text(yaml.dump(registry_data))
# Verify registry file exists and is readable
assert registry_file.exists()
assert registry_file.is_file()
# Test reading registry
loaded_data = yaml.safe_load(registry_file.read_text())
assert 'datasets' in loaded_data
assert 'test_dataset' in loaded_data['datasets']
def test_cli_compute_checksum(self):
"""Test CLI compute checksum functionality."""
# Test checksum computation
test_content = "test content for checksum"
# In a real implementation, this would compute actual checksums
assert len(test_content) > 0
def test_cli_force_redownload(self, tmp_path):
"""Test CLI force redownload functionality."""
# Create mock download function with correct signature
def mock_download(url, destination, description=None):
"""Mock download function that takes 3 arguments."""
# Create a dummy file
Path(destination).parent.mkdir(parents=True, exist_ok=True)
Path(destination).write_text("dummy content")
return True
# Mock the fetch_data module
with patch('scripts.fetch_data.download_with_progress', side_effect=mock_download):
with patch('scripts.fetch_data.fetch_dataset') as mock_fetch:
mock_fetch.return_value = True
# Test force redownload
result = mock_fetch("test", force=True)
assert result is True