-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathplaywrightfetch.py
More file actions
329 lines (272 loc) · 12.9 KB
/
playwrightfetch.py
File metadata and controls
329 lines (272 loc) · 12.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
"""
playwrightfetch.py
Playwright-based web scraping system for JavaScript-rendered content and dynamic sites.
Provides functions to fetch and parse posts from sites requiring JavaScript rendering
or special handling, using Playwright and BeautifulSoup. Includes site-specific
configurations and thread-safe operations.
"""
# =============================================================================
# STANDARD LIBRARY IMPORTS
# =============================================================================
import time
import threading
# =============================================================================
# THIRD-PARTY IMPORTS
# =============================================================================
from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError
try:
import psutil
except ImportError:
psutil = None
# =============================================================================
# LOCAL IMPORTS
# =============================================================================
from shared import g_logger
from browser_fetch import (
BROWSER_TIMEOUT, BROWSER_WAIT_TIMEOUT,
SharedBrowserManager, get_common_context_options, BrowserErrorHandler,
PlaywrightElementExtractor, BrowserUtils
)
# =============================================================================
# TIMEOUT CONSTANTS
# =============================================================================
# Note: All timeout constants are now imported from browser_fetch
# to maintain consistency across all browser modules
# =============================================================================
# PLAYWRIGHT BROWSER CONFIGURATION AND CREATION
# =============================================================================
def _safe_playwright_start():
    """
    Start Playwright while temporarily muting its internal logger.

    The "playwright" logger is raised to CRITICAL for the duration of
    startup so its output cannot collide with Apache/mod_wsgi logging,
    then restored to its previous level.

    Returns:
        The started Playwright instance, or None if startup fails.
    """
    import logging

    pw_logger = logging.getLogger("playwright")
    saved_level = pw_logger.level
    pw_logger.setLevel(logging.CRITICAL)
    try:
        return sync_playwright().start()
    except Exception as e:
        g_logger.error(f"Error starting Playwright safely: {e}")
        return None
    finally:
        # Always restore the caller-visible logging level.
        pw_logger.setLevel(saved_level)
def create_browser_context(playwright, use_tor, user_agent):
    """
    Build a Chromium browser and context using the shared browser helpers.

    Args:
        playwright: Playwright instance
        use_tor (bool): Whether to use Tor proxy for connections
        user_agent (str): User agent string to use for requests

    Returns:
        tuple: (browser, context) - Configured Playwright browser and
        context instances

    Raises:
        Exception: Re-raised after logging full details of any failure.
    """
    try:
        g_logger.info(f"Creating Playwright browser with Tor: {use_tor}, User-Agent: {user_agent[:50]}...")
        # Deferred import avoids a circular dependency with browser_fetch.
        from browser_fetch import get_common_browser_args
        shared_args = get_common_browser_args(use_tor, user_agent)
        launch_args = shared_args['anti_detection'] + shared_args['performance']
        browser = playwright.chromium.launch(headless=True, args=launch_args)
        context = browser.new_context(**get_common_context_options(use_tor, user_agent))
        g_logger.debug("Playwright browser and context created successfully")
        return browser, context
    except Exception as e:
        g_logger.error(f"Error creating Playwright browser: {e}")
        g_logger.error(f"Error type: {type(e).__name__}")
        import traceback
        g_logger.error(f"Full traceback: {traceback.format_exc()}")
        raise
# =============================================================================
# SHARED PLAYWRIGHT BROWSER MANAGEMENT
# =============================================================================
class SharedPlaywrightBrowser(SharedBrowserManager):
    """
    Thread-safe manager for Playwright browser contexts.

    Playwright sync-API objects are bound to the thread that created them,
    so this manager keeps one browser instance per thread (keyed by thread
    id) to prevent "Cannot switch to a different thread" errors.

    Inherits shared lock management and cleanup operations from
    SharedBrowserManager.
    """
    # Maps thread id -> SharedPlaywrightBrowser for that thread. Entries are
    # removed (not set to None) on cleanup so dead threads do not accumulate
    # stale keys and no poisoned None value reaches _is_instance_valid.
    _instances = {}
    _lock = threading.Lock()

    def __init__(self, use_tor, user_agent):
        """
        Initialize a new SharedPlaywrightBrowser instance.

        Args:
            use_tor (bool): Whether to use Tor proxy
            user_agent (str): User agent string for the browser context

        Raises:
            RuntimeError: If Playwright cannot be started safely.
            Exception: Re-raised from browser/context creation failures.
        """
        super().__init__(use_tor, user_agent)
        g_logger.debug(f"Initializing SharedPlaywrightBrowser with Tor: {use_tor}")
        try:
            # Use safe Playwright initialization to avoid Apache/mod_wsgi conflicts
            self.playwright = _safe_playwright_start()
            if not self.playwright:
                raise RuntimeError("Failed to start Playwright safely")
            self.browser, self.context = create_browser_context(self.playwright, use_tor, user_agent)
            g_logger.debug("SharedPlaywrightBrowser initialized successfully")
        except Exception as e:
            g_logger.error(f"Error in SharedPlaywrightBrowser.__init__: {e}")
            raise

    @classmethod
    def get_browser_context(cls, use_tor, user_agent):
        """
        Get or create a thread-local browser context.

        Each thread gets its own browser instance to avoid Playwright's
        threading limitations; an existing instance is reused only if it
        passes the inherited _is_instance_valid check for the requested
        configuration.

        Args:
            use_tor (bool): Whether to use Tor proxy
            user_agent (str): User agent string for the context

        Returns:
            tuple: (browser, context) Playwright instances, or (None, None)
            if creation failed.
        """
        thread_id = threading.get_ident()
        with cls._lock:
            instance = cls._instances.get(thread_id)
            # Guard against a missing entry before consulting the parent's
            # validity check (its tolerance for None is not guaranteed).
            if instance is None or not cls._is_instance_valid(instance, use_tor, user_agent):
                if thread_id in cls._instances:
                    cls._cleanup_thread_instance(thread_id)
                try:
                    cls._instances[thread_id] = SharedPlaywrightBrowser(use_tor, user_agent)
                    g_logger.info(f"Created new thread-local browser context instance with Tor: {use_tor} for thread {thread_id}")
                except Exception as e:
                    g_logger.error(f"Error creating browser context for thread {thread_id}: {e}")
                    # Leave no poisoned entry behind; the next call retries.
                    cls._instances.pop(thread_id, None)
                    return None, None
            current = cls._instances[thread_id]
            current.last_used = time.time()
            return current.browser, current.context

    def is_valid(self):
        """
        Check if the Playwright context is still valid and responsive.

        Returns:
            bool: True if context is valid, False otherwise
        """
        try:
            if hasattr(self, 'context') and self.context:
                # Touching .pages raises if the context has been closed.
                pages = self.context.pages
                return True
            return False
        except Exception as e:
            g_logger.debug(f"Context health check failed: {e}")
            return False

    @classmethod
    def _cleanup_thread_instance(cls, thread_id):
        """
        Close and discard the browser instance owned by a specific thread.

        Closes context, browser, and Playwright engine in dependency order;
        errors are logged but never propagated. The thread's entry is
        removed from _instances entirely so the registry cannot grow with
        stale keys from finished threads.
        """
        instance = cls._instances.pop(thread_id, None)
        if not instance:
            return
        try:
            # Close context first, then browser, then stop the engine.
            if hasattr(instance, 'context') and instance.context:
                instance.context.close()
                g_logger.debug(f"Browser context closed successfully for thread {thread_id}")
            if hasattr(instance, 'browser') and instance.browser:
                instance.browser.close()
                g_logger.debug(f"Browser closed successfully for thread {thread_id}")
            if hasattr(instance, 'playwright') and instance.playwright:
                instance.playwright.stop()
                g_logger.debug(f"Playwright stopped successfully for thread {thread_id}")
        except Exception as e:
            g_logger.error(f"Error during browser cleanup for thread {thread_id}: {e}")
        finally:
            g_logger.debug(f"Browser instance removed for thread {thread_id}")

    @classmethod
    def _cleanup_all_instances(cls):
        """
        Clean up all thread-local browser instances safely.
        """
        # Snapshot the keys; _cleanup_thread_instance mutates the dict.
        for thread_id in list(cls._instances.keys()):
            cls._cleanup_thread_instance(thread_id)

    @classmethod
    def reset_for_testing(cls):
        """
        Reset all thread-local instances for testing purposes.

        This method is only used in test environments.
        """
        with cls._lock:
            cls._cleanup_all_instances()
            g_logger.debug("Reset for testing completed")
# =============================================================================
# GLOBAL CLEANUP FUNCTIONS
# =============================================================================
def cleanup_playwright_browsers():
    """
    Shut down every Playwright browser managed by SharedPlaywrightBrowser.

    Callable from other modules or during application shutdown to ensure
    no browser instances are left running.
    """
    g_logger.info("Cleaning up all Playwright browsers...")
    SharedPlaywrightBrowser.force_cleanup()
# =============================================================================
# PLAYWRIGHT-SPECIFIC UTILITY FUNCTIONS
# =============================================================================
def extract_post_data_playwright(post, config, url, use_playwright):
    """
    Run the shared extract_post_data wired to Playwright element accessors.
    """
    # Deferred import avoids a circular dependency with browser_fetch.
    from browser_fetch import extract_post_data as shared_extract_post_data

    accessor = PlaywrightElementExtractor()
    return shared_extract_post_data(
        post, config, url, use_playwright,
        get_text_func=accessor.get_text,
        find_func=accessor.find_element,
        get_attr_func=accessor.get_attribute,
    )
def extract_post_data(post, config, url, use_playwright):
    """
    Extract post data from a web element using the provided configuration.

    Parses title, link, and other metadata from a post element using either
    a Playwright Locator or a BeautifulSoup object depending on the
    extraction method.

    Args:
        post: Playwright Locator or BeautifulSoup Tag containing post data
        config: Configuration object with selectors and settings
        url (str): Base URL for resolving relative links
        use_playwright (bool): Whether using Playwright or BeautifulSoup
            for extraction

    Returns:
        dict or list: Extracted post data with title, link, id, summary,
        and timestamps, or None if extraction fails. For RSS feeds,
        returns a list of entries.
    """
    if not use_playwright:
        # BeautifulSoup path: the shared routine uses its own defaults.
        from browser_fetch import extract_post_data as shared_extract_post_data
        return shared_extract_post_data(
            post, config, url, use_playwright,
            get_text_func=None,
            find_func=None,
            get_attr_func=None,
        )
    return extract_post_data_playwright(post, config, url, use_playwright)
# =============================================================================
# MAIN SITE FETCHING FUNCTION
# =============================================================================
def fetch_site_posts(url, user_agent):
    """
    Fetch posts from a website using Playwright browser automation.

    Delegates to the unified browser_fetch implementation to eliminate
    code duplication while keeping Playwright-specific functionality.

    Args:
        url (str): URL of the site to fetch posts from
        user_agent (str): User agent string for HTTP requests

    Returns:
        dict: Feed-like structure with entries, metadata, and status
        information
    """
    # Deferred import avoids a circular dependency with browser_fetch.
    from browser_fetch import fetch_site_posts as unified_fetch_site_posts
    return unified_fetch_site_posts(url, user_agent)