
 from .batch_tui import BatchTUIManager, URLState
 from .config import BATCH_TUI_POLL_INTERVAL  # For hybrid polling in batch mode
+from .config import EVENT_DISPATCH_FAST_TIMEOUT  # For tight loop event processing
+from .config import EVENT_DISPATCH_TIMEOUT  # For standard event processing
+from .config import HTML_CHUNK_SIZE  # Default chunk size for HTML chunking
+from .config import HTML_MAX_CHUNK_SIZE  # Maximum chunk size for very large pages
 from .config import KEYBOARD_THREAD_TIMEOUT  # For thread.join() timeouts
 from .config import (  # Network timeouts - DO NOT hardcode these values elsewhere; API configuration; Batch processing - DO NOT hardcode polling intervals; Thread cleanup - DO NOT hardcode thread timeouts; File system; Progress percentages - DO NOT hardcode progress values elsewhere; Use ProgressPercent.SCRAPING, ProgressPercent.SENDING, etc.
     BATCH_FINAL_STATE_PAUSE,
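For context, the new imports assume a set of constants in config.py. Below is a minimal sketch of what that module might define, with values inferred from the comments in this commit and the literals being replaced; the exact values are an assumption, not confirmed by the diff.

# config.py - illustrative sketch only; the real module defines the authoritative values.

# Event dispatch timeouts (seconds)
EVENT_DISPATCH_FAST_TIMEOUT = 0.01  # replaces the hardcoded 0.01 in tight polling loops
EVENT_DISPATCH_TIMEOUT = 0.1        # assumed value for standard event processing

# HTML chunking limits (characters)
HTML_CHUNK_SIZE = 80_000       # ~80K chars per chunk, per the in-code comment
HTML_MAX_CHUNK_SIZE = 200_000  # ~200K chars, safety margin for GPT-5 Nano

# Fallback terminal width for headless/CI environments
DEFAULT_TERMINAL_WIDTH = 80  # assumed; 80 columns is a common non-TTY default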
@@ -1183,8 +1187,17 @@ def end(self) -> None: |

     def _clear_line(self) -> None:
         if not self.quiet:
+            # PLATFORM SAFETY: get_terminal_size() can fail in headless/CI environments
+            # WHY: In Docker, CI pipelines, or non-TTY contexts, there's no terminal
+            # DO NOT remove try/except - causes crashes in headless mode
+            try:
+                term_width = shutil.get_terminal_size().columns
+            except OSError:
+                term_width = (
+                    DEFAULT_TERMINAL_WIDTH  # Use centralized fallback from config
+                )
             print(
-                "\r" + " " * (shutil.get_terminal_size().columns - 1),
+                "\r" + " " * (term_width - 1),
                 end="",
                 flush=True,
             )
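The same try/except fallback is added again further down in create_summary_box. A small sketch of how the pattern could be factored into a shared helper; the helper name is hypothetical and not part of this commit.

import shutil

def _safe_terminal_width(default: int = 80) -> int:
    """Return the terminal width, or a fallback (e.g. DEFAULT_TERMINAL_WIDTH) when headless."""
    try:
        return shutil.get_terminal_size().columns
    except OSError:
        # No controlling terminal (Docker, CI, piped output): fall back to the default.
        return default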
@@ -1915,11 +1928,20 @@ async def call_openai_api( |
     return content, request_cost


-def chunk_html_by_size(html_content: str, max_chars: int = 200000) -> List[str]:
+def chunk_html_by_size(
+    html_content: str, max_chars: int = HTML_MAX_CHUNK_SIZE
+) -> List[str]:
     """
     Split HTML content into chunks that won't exceed token limits.
     Tries to split on logical boundaries (doc objects, sections).
-    Max chars ~200K = ~85K tokens with safety margin for GPT-5 Nano.
+
+    Args:
+        html_content: The HTML content to split
+        max_chars: Maximum characters per chunk. Default from config.HTML_MAX_CHUNK_SIZE
+            (~200K chars = ~85K tokens with safety margin for GPT-5 Nano)
+
+    Returns:
+        List of HTML chunk strings
     """
     if len(html_content) <= max_chars:
         return [html_content]
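A short usage sketch of the new signature, assuming the constants sketched above; fetch_page_html is a hypothetical stand-in for however the HTML is obtained.

html = fetch_page_html(url)  # hypothetical fetch step

# Default budget (HTML_MAX_CHUNK_SIZE, ~200K chars per chunk):
chunks = chunk_html_by_size(html)

# Tighter budget (HTML_CHUNK_SIZE, ~80K chars), as call_llm_to_convert_html_to_xml now uses:
small_chunks = chunk_html_by_size(html, max_chars=HTML_CHUNK_SIZE)
print(f"{len(chunks)} large chunk(s), {len(small_chunks)} small chunk(s)")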
@@ -2131,9 +2153,10 @@ async def call_llm_to_convert_html_to_xml( |
         error_collector: Optional ErrorCollector for comprehensive error tracking
         url: Optional URL being processed (for error reporting)
     """
-    # Reduced chunk size to ~80K chars (~27K tokens worst-case with 1:1 ratio)
+    # Use centralized chunk size from config.py - DO NOT hardcode
+    # HTML_CHUNK_SIZE (~80K chars = ~27K tokens worst-case with 1:1 ratio)
     # This ensures each chunk stays well within GPT-5 Nano's safe input limits
-    chunks = chunk_html_by_size(html_content, max_chars=80000)
+    chunks = chunk_html_by_size(html_content, max_chars=HTML_CHUNK_SIZE)
     num_chunks = len(chunks)
     logger.info(f"Processing content in {num_chunks} chunk(s)")

@@ -3861,7 +3884,8 @@ def process_multiple_pages( |
             # PROCESS EVENTS: Dispatch all pending events from worker threads
             # WHY: EventBus.dispatch() processes StatusEvent, ErrorEvent, etc.
             # that were published by process_url() in worker threads
-            event_bus.dispatch(timeout=0.01)
+            # Use centralized fast timeout from config - DO NOT hardcode
+            event_bus.dispatch(timeout=EVENT_DISPATCH_FAST_TIMEOUT)

             # CHECK COMPLETED FUTURES: Non-blocking check
             # WHY: timeout=0 means don't wait for completion, just check status
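The surrounding loop interleaves event dispatch with a non-blocking check of worker futures. Below is a simplified sketch of that hybrid polling pattern, assuming the EventBus API and constants shown in this commit; the real loop carries more state (TUI updates, error handling).

import concurrent.futures

while pending_futures:
    # Drain status/error events published by worker threads.
    event_bus.dispatch(timeout=EVENT_DISPATCH_FAST_TIMEOUT)

    # timeout=0: inspect completion state without blocking the loop.
    done, pending_futures = concurrent.futures.wait(pending_futures, timeout=0)
    for future in done:
        handle_result(future)  # hypothetical per-URL result handling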
@@ -4085,7 +4109,8 @@ def process_multiple_pages( |
             critical = status_pipeline.wait_for_update(
                 timeout=BATCH_TUI_POLL_INTERVAL
             )
-            event_bus.dispatch(timeout=0.01)
+            # Use centralized fast timeout from config - DO NOT hardcode
+            event_bus.dispatch(timeout=EVENT_DISPATCH_FAST_TIMEOUT)

             # Check completed futures (non-blocking)
             done, _ = concurrent.futures.wait(
@@ -5082,7 +5107,14 @@ def start_single_scrape( |


 def create_summary_box(summary_data: Dict[str, str]) -> str:
-    terminal_width = shutil.get_terminal_size().columns
+    # PLATFORM SAFETY: get_terminal_size() can fail in headless/CI environments
+    # WHY: In Docker, CI pipelines, or non-TTY contexts, there's no terminal
+    # DO NOT remove try/except - causes crashes in headless mode
+    try:
+        terminal_width = shutil.get_terminal_size().columns
+    except OSError:
+        terminal_width = DEFAULT_TERMINAL_WIDTH  # Use centralized fallback from config
+
     max_label_length = max(len(label) for label in summary_data.keys())
     max_value_length = max(len(str(value)) for value in summary_data.values())

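Only the head of create_summary_box appears in this hunk; it derives the box dimensions from the widest label and value and the (now safely obtained) terminal width. A hedged usage sketch with illustrative keys and values:

summary = {
    "Pages processed": "12",
    "Total cost": "$0.0342",
    "Output file": "docs.xml",
}
print(create_summary_box(summary))  # renders a width-aware summary box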