-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathseleniumfetch.py
More file actions
332 lines (272 loc) · 12.6 KB
/
seleniumfetch.py
File metadata and controls
332 lines (272 loc) · 12.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
"""
seleniumfetch.py
Selenium-based web scraping system for JavaScript-rendered content and dynamic sites.
Provides functions to fetch and parse posts from sites requiring JavaScript rendering
or special handling, using Selenium WebDriver and BeautifulSoup. Includes site-specific
configurations, shared driver management, and thread-safe operations.
"""
# =============================================================================
# STANDARD LIBRARY IMPORTS
# =============================================================================
import time
import threading
import logging
# =============================================================================
# THIRD-PARTY IMPORTS
# =============================================================================
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import WebDriverException
from webdriver_manager.chrome import ChromeDriverManager
from webdriver_manager.core.os_manager import ChromeType
# =============================================================================
# LOCAL IMPORTS
# =============================================================================
from shared import g_logger
from browser_fetch import (
BROWSER_TIMEOUT,
SharedBrowserManager, get_common_chrome_options,
SeleniumElementExtractor
)
# =============================================================================
# SELENIUM LOGGING CONFIGURATION
# =============================================================================
# Raise the threshold of the chatty Selenium/urllib3 loggers to WARNING so
# their per-request HTTP chatter does not drown out application logs.
for _noisy_logger_name in (
    'selenium',
    'urllib3',
    'selenium.webdriver.remote.remote_connection',
):
    logging.getLogger(_noisy_logger_name).setLevel(logging.WARNING)
# Note: All timeout constants are imported from browser_fetch
# =============================================================================
# WEBDRIVER CONFIGURATION AND CREATION
# =============================================================================
def create_driver(use_tor, user_agent):
    """
    Create and configure a Chrome WebDriver instance using shared browser creation logic.

    The Chrome command-line arguments come from ``get_common_chrome_options``;
    this function adds the Selenium-specific experimental option, installs a
    Chromium-flavoured ChromeDriver through ``ChromeDriverManager`` and applies
    ``BROWSER_TIMEOUT`` as both page-load and script timeout.

    Args:
        use_tor (bool): Whether to use Tor proxy for connections
        user_agent (str): User agent string to use for requests

    Returns:
        webdriver.Chrome: Configured Chrome WebDriver instance

    Raises:
        WebDriverException, OSError, ValueError: Logged (with traceback) and
            re-raised. Any other exception type propagates without that logging.
    """
    try:
        # Slice defensively: a None user agent must not crash the log line
        # before the driver is even attempted.
        user_agent_preview = (user_agent or "")[:50]
        g_logger.info(f"Creating Chrome driver with Tor: {use_tor}, User-Agent: {user_agent_preview}...")

        # Use shared browser creation logic
        chrome_args = get_common_chrome_options(use_tor, user_agent)
        options = Options()
        for arg in chrome_args:
            options.add_argument(arg)

        # Add Selenium-specific options
        options.add_experimental_option("excludeSwitches", ["enable-automation"])
        # NOTE: We DON'T disable useAutomationExtension as that would break Selenium!

        g_logger.debug("Installing ChromeDriver...")
        service = Service(ChromeDriverManager(
            chrome_type=ChromeType.CHROMIUM).install())
        g_logger.debug("ChromeDriver installed successfully")

        g_logger.debug("Creating Chrome WebDriver instance...")
        driver = webdriver.Chrome(service=service, options=options)
        g_logger.debug("Chrome WebDriver created successfully")

        # Set timeouts to prevent hanging
        try:
            driver.set_page_load_timeout(BROWSER_TIMEOUT)
            driver.set_script_timeout(BROWSER_TIMEOUT)
        except Exception:
            # The browser process is already running at this point; quit it so
            # a failed configuration step does not leave it behind.
            try:
                driver.quit()
            except Exception as quit_error:
                g_logger.error(f"Error quitting partially configured driver: {quit_error}")
            raise

        g_logger.info("Chrome driver setup completed successfully")
        return driver
    except (WebDriverException, OSError, ValueError) as e:
        g_logger.error(f"Error creating Chrome driver: {e}")
        g_logger.error(f"Error type: {type(e).__name__}")
        import traceback
        g_logger.error(f"Full traceback: {traceback.format_exc()}")
        raise
# =============================================================================
# SHARED SELENIUM DRIVER MANAGEMENT
# =============================================================================
class SharedSeleniumDriver(SharedBrowserManager):
    """
    Holder for a single, class-wide Selenium WebDriver.

    One wrapper object is kept in ``_instance``. ``get_driver`` and
    ``force_cleanup_after_request`` serialise access to it through ``_lock``.
    ``_is_instance_valid`` is not defined here; it is presumably inherited
    from SharedBrowserManager.
    """

    # The one shared wrapper (None when no driver is alive).
    _instance = None
    # Guards creation, replacement and teardown of ``_instance``.
    _lock = threading.Lock()

    def __init__(self, use_tor, user_agent):
        """
        Build the wrapper and start its Chrome driver.

        Args:
            use_tor (bool): Whether to use Tor proxy
            user_agent (str): User agent string for the driver

        Raises:
            Exception: Whatever ``create_driver`` raises, after logging it.
        """
        super().__init__(use_tor, user_agent)
        g_logger.info(f"Initializing SharedSeleniumDriver with Tor: {use_tor}")
        try:
            self.driver = create_driver(use_tor, user_agent)
            g_logger.debug("SharedSeleniumDriver initialized successfully")
        except Exception as init_error:
            g_logger.error(f"Error in SharedSeleniumDriver.__init__: {init_error}")
            raise

    @classmethod
    def get_driver(cls, use_tor, user_agent):
        """
        Return the shared WebDriver, building a fresh one when required.

        A new wrapper is created when none exists or when
        ``_is_instance_valid`` rejects the current one; a rejected wrapper is
        torn down first. Teardown after a fetch is the caller's job (see
        ``force_cleanup_after_request``), not this method's.

        Args:
            use_tor (bool): Whether to use Tor proxy
            user_agent (str): User agent string for the driver

        Returns:
            webdriver.Chrome | None: The live driver, or None if creation failed.
        """
        with cls._lock:
            g_logger.debug(f"Driver request - Tor: {use_tor}")

            reusable = cls._instance is not None and cls._is_instance_valid(
                cls._instance, use_tor, user_agent
            )
            if not reusable:
                if cls._instance:
                    g_logger.debug("Cleaning up invalid driver instance")
                    cls._cleanup_instance()
                try:
                    cls._instance = SharedSeleniumDriver(use_tor, user_agent)
                    g_logger.info(f"Created new driver instance with Tor: {use_tor}")
                except Exception as create_error:
                    g_logger.error(f"Error creating driver: {create_error}")
                    cls._instance = None
                    return None

            holder = cls._instance
            holder.last_used = time.time()
            g_logger.debug(f"Returning driver instance, last used: {holder.last_used}")
            return holder.driver

    def is_valid(self):
        """
        Probe the driver with a trivial script to see whether it still answers.

        Returns:
            bool: True if the probe ran, False if there is no driver or the
            probe raised.
        """
        try:
            if not (hasattr(self, 'driver') and self.driver):
                g_logger.debug("Driver health check failed: no driver instance")
                return False
            # Use a simple command to test if driver is alive
            self.driver.execute_script("return document.readyState;")
            g_logger.debug("Driver health check passed")
            return True
        except Exception as probe_error:
            g_logger.debug(f"Driver health check failed: {probe_error}")
            return False

    @classmethod
    def _cleanup_instance(cls):
        """
        Shut down the shared driver (if any) and clear ``_instance``.

        Tries ``quit()`` first and falls back to ``close()`` if that raises.
        ``_instance`` is reset to None whatever happens. This method does not
        take ``_lock`` itself.
        """
        if not cls._instance:
            g_logger.debug("No driver instance to cleanup")
            return

        g_logger.info("Starting cleanup of driver instance")
        try:
            # Preferred path: terminate the whole browser session
            cls._instance.driver.quit()
            g_logger.debug("WebDriver quit successfully")
        except Exception as quit_error:
            g_logger.error(f"Error quitting WebDriver: {quit_error}")
            try:
                # Fallback: at least close the current window
                cls._instance.driver.close()
                g_logger.debug("WebDriver closed successfully")
            except Exception as close_error:
                g_logger.error(f"Error closing WebDriver: {close_error}")
        finally:
            cls._instance = None
            g_logger.debug("Driver instance cleanup completed")

    @classmethod
    def force_cleanup_after_request(cls):
        """
        Tear down the shared driver under ``_lock``.

        Intended to be invoked once a fetch operation has finished.
        """
        g_logger.debug("Forcing driver cleanup after request")
        with cls._lock:
            cls._cleanup_instance()
# =============================================================================
# GLOBAL CLEANUP FUNCTION
# =============================================================================
def cleanup_selenium_drivers():
    """
    Global cleanup function to ensure the shared Selenium driver is shut down.

    This function can be called from other modules or during application
    shutdown to ensure no WebDriver instance is left running.

    The teardown runs under ``SharedSeleniumDriver._lock``, the same lock
    ``get_driver`` and ``force_cleanup_after_request`` use, so it cannot clear
    the shared instance while another thread is in the middle of obtaining
    it. The lock is not re-entrant: do not call this while already holding it.
    """
    g_logger.debug("Cleaning up Selenium driver...")
    with SharedSeleniumDriver._lock:
        SharedSeleniumDriver._cleanup_instance()
# =============================================================================
# SELENIUM-SPECIFIC UTILITY FUNCTIONS
# =============================================================================
def extract_post_data_selenium(post, config, url, use_selenium):
    """
    Run the shared post extractor with Selenium element accessors plugged in.

    All four arguments are forwarded unchanged to
    ``browser_fetch.extract_post_data``; the text/find/attribute callbacks are
    bound methods of a fresh ``SeleniumElementExtractor``.
    """
    # Imported at call time to avoid circular imports
    from browser_fetch import extract_post_data as shared_extract_post_data

    selenium_extractor = SeleniumElementExtractor()
    accessors = {
        'get_text_func': selenium_extractor.get_text,
        'find_func': selenium_extractor.find_element,
        'get_attr_func': selenium_extractor.get_attribute,
    }
    return shared_extract_post_data(post, config, url, use_selenium, **accessors)
def extract_post_data(post, config, url, use_selenium):
    """
    Pull post data out of an element according to ``config``.

    With ``use_selenium`` truthy the work is handed to
    ``extract_post_data_selenium``; otherwise the shared extractor from
    ``browser_fetch`` is called directly with all three accessor callbacks
    set to None.
    """
    if use_selenium:
        return extract_post_data_selenium(post, config, url, use_selenium)

    # BeautifulSoup fallback - use shared function directly
    from browser_fetch import extract_post_data as shared_extract_post_data

    # fromkeys() maps every accessor name to None
    no_accessors = dict.fromkeys(('get_text_func', 'find_func', 'get_attr_func'))
    return shared_extract_post_data(post, config, url, use_selenium, **no_accessors)
# =============================================================================
# MAIN SITE FETCHING FUNCTION
# =============================================================================
def fetch_site_posts(url, user_agent):
    """
    Fetch posts from a website using Selenium WebDriver.

    Delegates the actual fetch to ``browser_fetch.fetch_site_posts`` and then
    forces a teardown of the shared driver so no browser process outlives the
    request. The fetch and the cleanup are guarded separately: a failure while
    cleaning up is logged but never discards a result that was fetched
    successfully.

    Args:
        url (str): URL of the site to fetch posts from
        user_agent (str): User agent string for HTTP requests

    Returns:
        dict: Feed-like structure with entries, metadata, and status
        information. If the fetch itself raises, an empty feed with
        ``'status': 500`` is returned instead.
    """
    g_logger.info(f"Starting Selenium fetch for URL: {url}")

    try:
        # Import here to avoid circular imports
        from browser_fetch import fetch_site_posts as unified_fetch_site_posts
        g_logger.debug(f"Calling unified fetch_site_posts for URL: {url}")
        result = unified_fetch_site_posts(url, user_agent)
        g_logger.debug(f"Unified fetch_site_posts completed for URL: {url}")
    except Exception as e:
        g_logger.error(f"Error during Selenium fetch for {url}: {e}")
        # Ensure cleanup even on error
        try:
            g_logger.debug(f"Attempting cleanup after error for URL: {url}")
            SharedSeleniumDriver.force_cleanup_after_request()
            g_logger.debug(f"Cleanup after error completed for URL: {url}")
        except Exception as cleanup_error:
            g_logger.error(f"Error during cleanup after fetch error: {cleanup_error}")
        # Return empty result on error
        return {
            'entries': [],
            'etag': '',
            'modified': None,
            'feed': {'title': url, 'link': url, 'description': ''},
            'href': url,
            'status': 500
        }

    # Force cleanup after each request to prevent zombie processes. This is
    # kept outside the fetch's try block so that a cleanup failure is not
    # mistaken for a fetch failure and the fetched result is still returned.
    g_logger.debug(f"Selenium fetch completed for URL: {url}, forcing cleanup")
    try:
        SharedSeleniumDriver.force_cleanup_after_request()
        g_logger.debug(f"Cleanup completed for URL: {url}")
    except Exception as cleanup_error:
        g_logger.error(f"Error during cleanup after successful fetch for {url}: {cleanup_error}")
    return result