Skip to content

Commit f24e449

Browse files
committed
fix: DRY compliance and platform compatibility for v0.1.23
DRY Principle Compliance: - Consolidate ErrorCategory enum into single source in event_system.py - Consolidate RECOVERABLE_CATEGORIES into event_system.py - Remove duplicate format_size() from batch_tui.py (now imports from terminal_utils.py) - Add config constants: EVENT_DISPATCH_TIMEOUT, EVENT_DISPATCH_FAST_TIMEOUT, HTML_CHUNK_SIZE, HTML_MAX_CHUNK_SIZE - Update apias.py, event_system.py, status_pipeline.py to use centralized constants Platform Compatibility: - Fix terminal size detection with try/except fallback for headless/CI environments - Add DEFAULT_TERMINAL_WIDTH fallback in _clear_line() and create_summary_box() All 351 tests pass (72.53% coverage)
1 parent 2ebb5cf commit f24e449

File tree

12 files changed

+157
-79
lines changed

12 files changed

+157
-79
lines changed

CHANGELOG.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,23 @@ All notable changes to APIAS (API Auto Scraper) will be documented in this file.
55
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
66
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
77

8+
## [0.1.23] - 2025-12-02
9+
10+
### DRY Principle Compliance (Code Deduplication)
11+
- Consolidate duplicate ErrorCategory enum into single source in event_system.py by @Emasoft
12+
- Consolidate duplicate RECOVERABLE_CATEGORIES into event_system.py by @Emasoft
13+
- Remove duplicate format_size() function from batch_tui.py - now imports from terminal_utils.py by @Emasoft
14+
- Add missing hardcoded values to config.py: EVENT_DISPATCH_TIMEOUT, EVENT_DISPATCH_FAST_TIMEOUT, HTML_CHUNK_SIZE, HTML_MAX_CHUNK_SIZE by @Emasoft
15+
- Update apias.py, event_system.py, status_pipeline.py to use centralized config constants by @Emasoft
16+
17+
### Platform Compatibility
18+
- Fix terminal size detection in apias.py - add try/except with fallback for headless/CI environments by @Emasoft
19+
- Add DEFAULT_TERMINAL_WIDTH fallback from config.py in _clear_line() and create_summary_box() by @Emasoft
20+
21+
### Documentation
22+
- Add comprehensive WHY/DO NOT comments explaining DRY consolidation decisions by @Emasoft
23+
- Update docstrings to reference centralized config constants by @Emasoft
24+
825
## [0.1.22] - 2025-12-02
926

1027
### Critical Fixes (Comprehensive Codebase Audit)

apias/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
__version__ = "0.1.22"
1+
__version__ = "0.1.23"
22
from .apias import parse_documentation, validate_config
33

44
__all__ = ["__version__", "parse_documentation", "validate_config"]

apias/apias.py

Lines changed: 40 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,10 @@
142142

143143
from .batch_tui import BatchTUIManager, URLState
144144
from .config import BATCH_TUI_POLL_INTERVAL # For hybrid polling in batch mode
145+
from .config import EVENT_DISPATCH_FAST_TIMEOUT # For tight loop event processing
146+
from .config import EVENT_DISPATCH_TIMEOUT # For standard event processing
147+
from .config import HTML_CHUNK_SIZE # Default chunk size for HTML chunking
148+
from .config import HTML_MAX_CHUNK_SIZE # Maximum chunk size for very large pages
145149
from .config import KEYBOARD_THREAD_TIMEOUT # For thread.join() timeouts
146150
from .config import ( # Network timeouts - DO NOT hardcode these values elsewhere; API configuration; Batch processing - DO NOT hardcode polling intervals; Thread cleanup - DO NOT hardcode thread timeouts; File system; Progress percentages - DO NOT hardcode progress values elsewhere; Use ProgressPercent.SCRAPING, ProgressPercent.SENDING, etc.
147151
BATCH_FINAL_STATE_PAUSE,
@@ -1183,8 +1187,17 @@ def end(self) -> None:
11831187

11841188
def _clear_line(self) -> None:
11851189
if not self.quiet:
1190+
# PLATFORM SAFETY: get_terminal_size() can fail in headless/CI environments
1191+
# WHY: In Docker, CI pipelines, or non-TTY contexts, there's no terminal
1192+
# DO NOT remove try/except - causes crashes in headless mode
1193+
try:
1194+
term_width = shutil.get_terminal_size().columns
1195+
except OSError:
1196+
term_width = (
1197+
DEFAULT_TERMINAL_WIDTH # Use centralized fallback from config
1198+
)
11861199
print(
1187-
"\r" + " " * (shutil.get_terminal_size().columns - 1),
1200+
"\r" + " " * (term_width - 1),
11881201
end="",
11891202
flush=True,
11901203
)
@@ -1915,11 +1928,20 @@ async def call_openai_api(
19151928
return content, request_cost
19161929

19171930

1918-
def chunk_html_by_size(html_content: str, max_chars: int = 200000) -> List[str]:
1931+
def chunk_html_by_size(
1932+
html_content: str, max_chars: int = HTML_MAX_CHUNK_SIZE
1933+
) -> List[str]:
19191934
"""
19201935
Split HTML content into chunks that won't exceed token limits.
19211936
Tries to split on logical boundaries (doc objects, sections).
1922-
Max chars ~200K = ~85K tokens with safety margin for GPT-5 Nano.
1937+
1938+
Args:
1939+
html_content: The HTML content to split
1940+
max_chars: Maximum characters per chunk. Default from config.HTML_MAX_CHUNK_SIZE
1941+
(~200K chars = ~85K tokens with safety margin for GPT-5 Nano)
1942+
1943+
Returns:
1944+
List of HTML chunk strings
19231945
"""
19241946
if len(html_content) <= max_chars:
19251947
return [html_content]
@@ -2131,9 +2153,10 @@ async def call_llm_to_convert_html_to_xml(
21312153
error_collector: Optional ErrorCollector for comprehensive error tracking
21322154
url: Optional URL being processed (for error reporting)
21332155
"""
2134-
# Reduced chunk size to ~80K chars (~27K tokens worst-case with 1:1 ratio)
2156+
# Use centralized chunk size from config.py - DO NOT hardcode
2157+
# HTML_CHUNK_SIZE (~80K chars = ~27K tokens worst-case with 1:1 ratio)
21352158
# This ensures each chunk stays well within GPT-5 Nano's safe input limits
2136-
chunks = chunk_html_by_size(html_content, max_chars=80000)
2159+
chunks = chunk_html_by_size(html_content, max_chars=HTML_CHUNK_SIZE)
21372160
num_chunks = len(chunks)
21382161
logger.info(f"Processing content in {num_chunks} chunk(s)")
21392162

@@ -3861,7 +3884,8 @@ def process_multiple_pages(
38613884
# PROCESS EVENTS: Dispatch all pending events from worker threads
38623885
# WHY: EventBus.dispatch() processes StatusEvent, ErrorEvent, etc.
38633886
# that were published by process_url() in worker threads
3864-
event_bus.dispatch(timeout=0.01)
3887+
# Use centralized fast timeout from config - DO NOT hardcode
3888+
event_bus.dispatch(timeout=EVENT_DISPATCH_FAST_TIMEOUT)
38653889

38663890
# CHECK COMPLETED FUTURES: Non-blocking check
38673891
# WHY: timeout=0 means don't wait for completion, just check status
@@ -4085,7 +4109,8 @@ def process_multiple_pages(
40854109
critical = status_pipeline.wait_for_update(
40864110
timeout=BATCH_TUI_POLL_INTERVAL
40874111
)
4088-
event_bus.dispatch(timeout=0.01)
4112+
# Use centralized fast timeout from config - DO NOT hardcode
4113+
event_bus.dispatch(timeout=EVENT_DISPATCH_FAST_TIMEOUT)
40894114

40904115
# Check completed futures (non-blocking)
40914116
done, _ = concurrent.futures.wait(
@@ -5082,7 +5107,14 @@ def start_single_scrape(
50825107

50835108

50845109
def create_summary_box(summary_data: Dict[str, str]) -> str:
5085-
terminal_width = shutil.get_terminal_size().columns
5110+
# PLATFORM SAFETY: get_terminal_size() can fail in headless/CI environments
5111+
# WHY: In Docker, CI pipelines, or non-TTY contexts, there's no terminal
5112+
# DO NOT remove try/except - causes crashes in headless mode
5113+
try:
5114+
terminal_width = shutil.get_terminal_size().columns
5115+
except OSError:
5116+
terminal_width = DEFAULT_TERMINAL_WIDTH # Use centralized fallback from config
5117+
50865118
max_label_length = max(len(label) for label in summary_data.keys())
50875119
max_value_length = max(len(str(value)) for value in summary_data.values())
50885120

apias/batch_tui.py

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,10 @@
4747
)
4848

4949
# Import shared terminal utilities for cross-platform support
50+
# DRY: format_size is a utility function - don't duplicate it locally
51+
from apias.terminal_utils import (
52+
format_size, # DRY: Single source of truth for size formatting
53+
)
5054
from apias.terminal_utils import (
5155
BaseTUIManager,
5256
ProcessState,
@@ -1051,15 +1055,8 @@ def show_final_summary(self, output_dir: str = "") -> None:
10511055
total_size_out = sum(t.size_out for t in self.tasks.values() if t.size_out > 0)
10521056
total_size_in = sum(t.size_in for t in self.tasks.values() if t.size_in > 0)
10531057

1054-
# Format size for display
1055-
def format_size(size_bytes: int) -> str:
1056-
"""Format bytes into human-readable string."""
1057-
if size_bytes < 1024:
1058-
return f"{size_bytes} B"
1059-
elif size_bytes < 1024 * 1024:
1060-
return f"{size_bytes / 1024:.1f} KB"
1061-
else:
1062-
return f"{size_bytes / (1024 * 1024):.2f} MB"
1058+
# DRY: format_size is imported from terminal_utils.py - single source of truth
1059+
# DO NOT define a local format_size function here (was previous DRY violation)
10631060

10641061
stats_table.add_row(
10651062
"Success Rate",

apias/config.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,27 @@
7070
# Maximum delay cap to prevent excessively long waits
7171
RETRY_MAX_DELAY_SECONDS: Final[float] = 30.0
7272

73+
# --- Event System Timeouts ---
74+
# Event dispatch timeout for processing queued events in main loop
75+
# WHY 50ms: Long enough to process multiple events, short enough for responsive TUI
76+
EVENT_DISPATCH_TIMEOUT: Final[float] = (
77+
0.05 # seconds (50ms) for standard event processing
78+
)
79+
# Fast timeout for tight processing loops where responsiveness is critical
80+
# WHY 10ms: Minimizes latency in rapid-fire processing loops
81+
EVENT_DISPATCH_FAST_TIMEOUT: Final[float] = (
82+
0.01 # seconds (10ms) for fast loop processing
83+
)
84+
85+
# --- HTML Content Chunking ---
86+
# Maximum characters per HTML chunk for AI processing
87+
# WHY 200000: Large chunks reduce API calls but must fit within model context window
88+
# GPT-4 has 128K context, so 200K chars (~50K tokens) leaves room for response
89+
HTML_MAX_CHUNK_SIZE: Final[int] = 200000 # Max chars for very large pages
90+
# Default chunk size for normal processing - smaller for better accuracy
91+
# WHY 80000: Balance between API cost efficiency and processing reliability
92+
HTML_CHUNK_SIZE: Final[int] = 80000 # Default chunk size for chunking
93+
7394
# --- Batch Processing ---
7495
DEFAULT_NUM_THREADS: Final[int] = 5 # Default concurrent threads for batch mode
7596
MAX_SAFE_THREADS: Final[int] = 20 # Warn if more threads requested

apias/error_collector.py

Lines changed: 19 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -46,26 +46,32 @@
4646

4747
import yaml # type: ignore[import-untyped]
4848

49-
from apias.event_system import CircuitBreakerEvent, ErrorCategory, ErrorEvent, EventBus
50-
51-
# Explicitly re-export ErrorCategory for type-safe imports
52-
__all__ = ["ErrorCategory", "ErrorCollector", "load_error_config"]
49+
# DRY PRINCIPLE: Import ErrorCategory and RECOVERABLE_CATEGORIES from single source
50+
# DO NOT define duplicate enums or sets - use the canonical event_system versions
51+
from apias.event_system import (
52+
RECOVERABLE_CATEGORIES,
53+
CircuitBreakerEvent,
54+
ErrorCategory,
55+
ErrorEvent,
56+
EventBus,
57+
)
58+
59+
# Explicitly re-export for backwards compatibility
60+
__all__ = [
61+
"ErrorCategory",
62+
"ErrorCollector",
63+
"load_error_config",
64+
"RECOVERABLE_CATEGORIES",
65+
]
5366

5467
logger = logging.getLogger(__name__)
5568

5669

5770
# ============================================================================
5871
# Constants and Configuration
5972
# ============================================================================
60-
61-
# Recoverable error categories (can be retried)
62-
RECOVERABLE_CATEGORIES: Set[ErrorCategory] = {
63-
ErrorCategory.API_TIMEOUT,
64-
ErrorCategory.CONNECTION_ERROR,
65-
ErrorCategory.SERVER_ERROR,
66-
ErrorCategory.PARSE_ERROR,
67-
ErrorCategory.XML_VALIDATION,
68-
}
73+
# Note: RECOVERABLE_CATEGORIES is imported from event_system.py - single source of truth
74+
# DO NOT redefine it here - this was the previous DRY violation
6975

7076
# Default thresholds if config file missing
7177
DEFAULT_THRESHOLDS: Dict[ErrorCategory, int] = {

apias/error_handler.py

Lines changed: 13 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,12 @@
2424
import threading
2525
from dataclasses import dataclass, field
2626
from datetime import datetime
27-
from enum import Enum, auto
2827
from typing import Dict, Final, List, Optional
2928

29+
# DRY PRINCIPLE: Import ErrorCategory and RECOVERABLE_CATEGORIES from single source
30+
# DO NOT define a duplicate ErrorCategory enum here - use the canonical one
31+
from apias.event_system import RECOVERABLE_CATEGORIES, ErrorCategory
32+
3033
# Module-level logger for tracing error classification and circuit breaker events
3134
# This enables debugging of error handling behavior without code changes
3235
logger = logging.getLogger(__name__)
@@ -40,33 +43,10 @@
4043
DEFAULT_MAX_ERRORS: Final[int] = 1000 # Max errors to track (prevents memory bloat)
4144
DEFAULT_QUOTA_IMMEDIATE_STOP: Final[bool] = True # Stop immediately on quota exceeded
4245

43-
44-
class ErrorCategory(Enum):
45-
"""
46-
Classification of error types for summary reporting.
47-
48-
Each category maps to a specific class of failure that helps users
49-
understand WHY their job failed and what action to take.
50-
51-
Categories are ordered by severity - fatal errors that cannot be
52-
retried should be checked first in classification logic.
53-
"""
54-
55-
NONE = auto() # No error - successful operation
56-
RATE_LIMIT = auto() # 429 Too Many Requests - can retry after delay
57-
QUOTA_EXCEEDED = auto() # Insufficient quota - FATAL, must add credits
58-
API_TIMEOUT = auto() # Request timeout - can retry
59-
CONNECTION_ERROR = auto() # Network/connection failure - can retry
60-
INVALID_RESPONSE = auto() # XML validation failed - may need different prompt
61-
SOURCE_NOT_FOUND = auto() # Page not found (404) - skip this URL
62-
AUTHENTICATION = auto() # API key invalid - FATAL, fix credentials
63-
SERVER_ERROR = auto() # 5xx errors - can retry later
64-
UNKNOWN = auto() # Unclassified errors - investigate logs
65-
66-
6746
# =============================================================================
6847
# ERROR METADATA TABLES - Centralized display info for each category
6948
# =============================================================================
49+
# Note: ErrorCategory and RECOVERABLE_CATEGORIES are imported from event_system.py
7050
# DO NOT duplicate these strings elsewhere - always use get_error_icon/description
7151

7252
ERROR_ICONS: Final[Dict[ErrorCategory, str]] = {
@@ -78,7 +58,10 @@ class ErrorCategory(Enum):
7858
ErrorCategory.INVALID_RESPONSE: "XML",
7959
ErrorCategory.SOURCE_NOT_FOUND: "404",
8060
ErrorCategory.AUTHENTICATION: "KEY",
61+
ErrorCategory.INVALID_API_KEY: "KEY", # Same icon as AUTHENTICATION
8162
ErrorCategory.SERVER_ERROR: "5xx",
63+
ErrorCategory.PARSE_ERROR: "HTM", # HTML parsing errors
64+
ErrorCategory.XML_VALIDATION: "XSD", # XML schema validation errors
8265
ErrorCategory.UNKNOWN: "???",
8366
}
8467

@@ -93,21 +76,15 @@ class ErrorCategory(Enum):
9376
ErrorCategory.INVALID_RESPONSE: "Invalid response from API",
9477
ErrorCategory.SOURCE_NOT_FOUND: "Source page not found (404)",
9578
ErrorCategory.AUTHENTICATION: "API key invalid or expired",
79+
ErrorCategory.INVALID_API_KEY: "API key invalid or missing",
9680
ErrorCategory.SERVER_ERROR: "API server error (5xx)",
81+
ErrorCategory.PARSE_ERROR: "HTML parsing failed",
82+
ErrorCategory.XML_VALIDATION: "XML validation failed",
9783
ErrorCategory.UNKNOWN: "Unknown error occurred",
9884
}
9985

100-
# Categories that are recoverable (can be retried)
101-
# IMPORTANT: RATE_LIMIT is NOT recoverable - it means we're hitting API limits
102-
# and should stop immediately to avoid wasting time and potentially being banned.
103-
# Only transient errors (timeout, connection, server error) are recoverable.
104-
RECOVERABLE_CATEGORIES: Final[frozenset[ErrorCategory]] = frozenset(
105-
{
106-
ErrorCategory.API_TIMEOUT,
107-
ErrorCategory.CONNECTION_ERROR,
108-
ErrorCategory.SERVER_ERROR,
109-
}
110-
)
86+
# Note: RECOVERABLE_CATEGORIES is imported from event_system.py - single source of truth
87+
# DO NOT redefine it here
11188

11289

11390
@dataclass

apias/event_system.py

Lines changed: 32 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,9 @@
2424
# Publish (from worker thread)
2525
event_bus.publish(ErrorEvent(...))
2626
27-
# Process (from main thread)
28-
event_bus.dispatch(timeout=0.05)
27+
# Process (from main thread) - use centralized timeout from config.py
28+
from apias.config import EVENT_DISPATCH_TIMEOUT
29+
event_bus.dispatch(timeout=EVENT_DISPATCH_TIMEOUT)
2930
"""
3031

3132
import logging
@@ -112,17 +113,22 @@ class StatusEvent(Event):
112113

113114
class ErrorCategory(Enum):
114115
"""
115-
Error classification for tracking and circuit breaker logic.
116+
SINGLE SOURCE OF TRUTH for error classification across APIAS.
116117
117118
These categories determine:
118119
- Circuit breaker thresholds (per-category in YAML config)
119-
- Error recoverability (some errors are fatal, others transient)
120+
- Error recoverability (via RECOVERABLE_CATEGORIES set below)
120121
- User messaging (different guidance per category)
122+
- Summary reporting icons and descriptions
121123
122-
NOTE: This mirrors ErrorCategory in error_handler.py.
123-
Eventually we'll consolidate to use this enum in both places.
124+
DRY PRINCIPLE: This enum is THE definitive error classification.
125+
DO NOT create duplicate ErrorCategory enums elsewhere.
126+
Import this: `from apias.event_system import ErrorCategory, RECOVERABLE_CATEGORIES`
124127
"""
125128

129+
# Success state
130+
NONE = auto() # No error - successful operation
131+
126132
# API Errors
127133
QUOTA_EXCEEDED = auto() # Insufficient API quota - FATAL
128134
RATE_LIMIT = auto() # 429 rate limit - FATAL (short-term)
@@ -144,6 +150,26 @@ class ErrorCategory(Enum):
144150
UNKNOWN = auto() # Unclassified error
145151

146152

153+
# =============================================================================
154+
# RECOVERABLE CATEGORIES - Single source of truth for retry logic
155+
# =============================================================================
156+
# IMPORTANT: RATE_LIMIT is NOT recoverable - hitting API limits means we should
157+
# stop immediately to avoid wasting time and potentially being banned.
158+
# Only transient errors (timeout, connection, server error) are recoverable.
159+
# DRY: Import this set - DO NOT recreate elsewhere.
160+
RECOVERABLE_CATEGORIES: frozenset[ErrorCategory] = frozenset(
161+
{
162+
ErrorCategory.API_TIMEOUT,
163+
ErrorCategory.CONNECTION_ERROR,
164+
ErrorCategory.SERVER_ERROR,
165+
# Parse/validation errors can be retried with different input
166+
ErrorCategory.PARSE_ERROR,
167+
ErrorCategory.XML_VALIDATION,
168+
ErrorCategory.INVALID_RESPONSE,
169+
}
170+
)
171+
172+
147173
@dataclass
148174
class ErrorEvent(Event):
149175
"""

0 commit comments

Comments
 (0)