
 from .batch_tui import BatchTUIManager, URLState
 from .config import BATCH_TUI_POLL_INTERVAL  # For hybrid polling in batch mode
+from .config import EVENT_DISPATCH_FAST_TIMEOUT  # For tight loop event processing
+from .config import EVENT_DISPATCH_TIMEOUT  # For standard event processing
+from .config import HTML_CHUNK_SIZE  # Default chunk size for HTML chunking
+from .config import HTML_MAX_CHUNK_SIZE  # Maximum chunk size for very large pages
 from .config import KEYBOARD_THREAD_TIMEOUT  # For thread.join() timeouts
 from .config import (  # Network timeouts - DO NOT hardcode these values elsewhere; API configuration; Batch processing - DO NOT hardcode polling intervals; Thread cleanup - DO NOT hardcode thread timeouts; File system; Progress percentages - DO NOT hardcode progress values elsewhere; Use ProgressPercent.SCRAPING, ProgressPercent.SENDING, etc.
     BATCH_FINAL_STATE_PAUSE,
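For context, the new imports assume a set of constants in config.py. Below is a minimal sketch of what that module might define, with values inferred from the comments in this commit and the literals being replaced; the exact values are an assumption, not confirmed by the diff.

# config.py - illustrative sketch only; the real module defines the authoritative values.

# Event dispatch timeouts (seconds)
EVENT_DISPATCH_FAST_TIMEOUT = 0.01  # replaces the hardcoded 0.01 in tight polling loops
EVENT_DISPATCH_TIMEOUT = 0.1        # assumed value for standard event processing

# HTML chunking limits (characters)
HTML_CHUNK_SIZE = 80_000       # ~80K chars per chunk, per the in-code comment
HTML_MAX_CHUNK_SIZE = 200_000  # ~200K chars, safety margin for GPT-5 Nano

# Fallback terminal width for headless/CI environments
DEFAULT_TERMINAL_WIDTH = 80  # assumed; 80 columns is a common non-TTY default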
@@ -1183,8 +1187,17 @@ def end(self) -> None: |

     def _clear_line(self) -> None:
         if not self.quiet:
+            # PLATFORM SAFETY: get_terminal_size() can fail in headless/CI environments
+            # WHY: In Docker, CI pipelines, or non-TTY contexts, there's no terminal
+            # DO NOT remove try/except - causes crashes in headless mode
+            try:
+                term_width = shutil.get_terminal_size().columns
+            except OSError:
+                term_width = (
+                    DEFAULT_TERMINAL_WIDTH  # Use centralized fallback from config
+                )
             print(
-                "\r" + " " * (shutil.get_terminal_size().columns - 1),
+                "\r" + " " * (term_width - 1),
                 end="",
                 flush=True,
             )
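The same try/except fallback is added again further down in create_summary_box. A small sketch of how the pattern could be factored into a shared helper; the helper name is hypothetical and not part of this commit.

import shutil

def _safe_terminal_width(default: int = 80) -> int:
    """Return the terminal width, or a fallback (e.g. DEFAULT_TERMINAL_WIDTH) when headless."""
    try:
        return shutil.get_terminal_size().columns
    except OSError:
        # No controlling terminal (Docker, CI, piped output): fall back to the default.
        return default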
@@ -1915,11 +1928,20 @@ async def call_openai_api( |
     return content, request_cost


-def chunk_html_by_size(html_content: str, max_chars: int = 200000) -> List[str]:
+def chunk_html_by_size(
+    html_content: str, max_chars: int = HTML_MAX_CHUNK_SIZE
+) -> List[str]:
     """
     Split HTML content into chunks that won't exceed token limits.
     Tries to split on logical boundaries (doc objects, sections).
-    Max chars ~200K = ~85K tokens with safety margin for GPT-5 Nano.
+
+    Args:
+        html_content: The HTML content to split
+        max_chars: Maximum characters per chunk. Default from config.HTML_MAX_CHUNK_SIZE
+            (~200K chars = ~85K tokens with safety margin for GPT-5 Nano)
+
+    Returns:
+        List of HTML chunk strings
     """
     if len(html_content) <= max_chars:
         return [html_content]
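A short usage sketch of the new signature, assuming the constants sketched above; fetch_page_html is a hypothetical stand-in for however the HTML is obtained.

html = fetch_page_html(url)  # hypothetical fetch step

# Default budget (HTML_MAX_CHUNK_SIZE, ~200K chars per chunk):
chunks = chunk_html_by_size(html)

# Tighter budget (HTML_CHUNK_SIZE, ~80K chars), as call_llm_to_convert_html_to_xml now uses:
small_chunks = chunk_html_by_size(html, max_chars=HTML_CHUNK_SIZE)
print(f"{len(chunks)} large chunk(s), {len(small_chunks)} small chunk(s)")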
@@ -2131,9 +2153,10 @@ async def call_llm_to_convert_html_to_xml( |
         error_collector: Optional ErrorCollector for comprehensive error tracking
         url: Optional URL being processed (for error reporting)
     """
-    # Reduced chunk size to ~80K chars (~27K tokens worst-case with 1:1 ratio)
+    # Use centralized chunk size from config.py - DO NOT hardcode
+    # HTML_CHUNK_SIZE (~80K chars = ~27K tokens worst-case with 1:1 ratio)
     # This ensures each chunk stays well within GPT-5 Nano's safe input limits
-    chunks = chunk_html_by_size(html_content, max_chars=80000)
+    chunks = chunk_html_by_size(html_content, max_chars=HTML_CHUNK_SIZE)
     num_chunks = len(chunks)
     logger.info(f"Processing content in {num_chunks} chunk(s)")

@@ -3861,7 +3884,8 @@ def process_multiple_pages( |
             # PROCESS EVENTS: Dispatch all pending events from worker threads
             # WHY: EventBus.dispatch() processes StatusEvent, ErrorEvent, etc.
             # that were published by process_url() in worker threads
-            event_bus.dispatch(timeout=0.01)
+            # Use centralized fast timeout from config - DO NOT hardcode
+            event_bus.dispatch(timeout=EVENT_DISPATCH_FAST_TIMEOUT)

             # CHECK COMPLETED FUTURES: Non-blocking check
             # WHY: timeout=0 means don't wait for completion, just check status
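The surrounding loop interleaves event dispatch with a non-blocking check of worker futures. Below is a simplified sketch of that hybrid polling pattern, assuming the EventBus API and constants shown in this commit; the real loop carries more state (TUI updates, error handling).

import concurrent.futures

while pending_futures:
    # Drain status/error events published by worker threads.
    event_bus.dispatch(timeout=EVENT_DISPATCH_FAST_TIMEOUT)

    # timeout=0: inspect completion state without blocking the loop.
    done, pending_futures = concurrent.futures.wait(pending_futures, timeout=0)
    for future in done:
        handle_result(future)  # hypothetical per-URL result handling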
@@ -4085,7 +4109,8 @@ def process_multiple_pages( |
             critical = status_pipeline.wait_for_update(
                 timeout=BATCH_TUI_POLL_INTERVAL
             )
-            event_bus.dispatch(timeout=0.01)
+            # Use centralized fast timeout from config - DO NOT hardcode
+            event_bus.dispatch(timeout=EVENT_DISPATCH_FAST_TIMEOUT)

             # Check completed futures (non-blocking)
             done, _ = concurrent.futures.wait(
@@ -5082,7 +5107,14 @@ def start_single_scrape( |


 def create_summary_box(summary_data: Dict[str, str]) -> str:
-    terminal_width = shutil.get_terminal_size().columns
+    # PLATFORM SAFETY: get_terminal_size() can fail in headless/CI environments
+    # WHY: In Docker, CI pipelines, or non-TTY contexts, there's no terminal
+    # DO NOT remove try/except - causes crashes in headless mode
+    try:
+        terminal_width = shutil.get_terminal_size().columns
+    except OSError:
+        terminal_width = DEFAULT_TERMINAL_WIDTH  # Use centralized fallback from config
+
     max_label_length = max(len(label) for label in summary_data.keys())
     max_value_length = max(len(str(value)) for value in summary_data.values())

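Only the head of create_summary_box appears in this hunk; it derives the box dimensions from the widest label and value and the (now safely obtained) terminal width. A hedged usage sketch with illustrative keys and values:

summary = {
    "Pages processed": "12",
    "Total cost": "$0.0342",
    "Output file": "docs.xml",
}
print(create_summary_box(summary))  # renders a width-aware summary box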