-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathReddit.py
More file actions
723 lines (598 loc) · 29.4 KB
/
Reddit.py
File metadata and controls
723 lines (598 loc) · 29.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
"""
Reddit.py
Handles PRAW (Python Reddit API Wrapper) authentication and fetching Reddit feeds in a feedparser-compatible format for the LinuxReport project.
USAGE & CONFIG SUMMARY (READ THIS FIRST):
- Do NOT put Reddit client_id, client_secret, username, or password into config.yaml or any tracked file.
- Do NOT commit reddit_token.json to git.
Bootstrap (one-time, interactive, on the server):
1. On the deployment server, from the LinuxReport project directory, run this module directly:
python Reddit.py
2. When prompted, enter:
- Reddit Client ID
- Reddit Client Secret
- Reddit Username
- Reddit Password (stored securely for PRAW authentication)
3. This script will:
- Store the credentials in "reddit_token.json" for PRAW to use.
- Set permissions on reddit_token.json to 0600 (owner read/write only).
- PRAW handles token refresh automatically.
Runtime behavior:
- At runtime, get_valid_reddit_client() creates a PRAW Reddit instance from reddit_token.json.
- workers.py uses fetch_reddit_feed_as_feedparser() (when ENABLE_REDDIT_API_FETCH is True)
to fetch Reddit data via PRAW using the stored credentials.
- No secrets are loaded from config.yaml by default.
- PRAW automatically handles authentication and token refresh.
"""
# NOTE: No caching or rate limiting is implemented here because it's handled at higher levels
# (workers.py, FeedHistory.py). This module makes requests very infrequently -
# at most one per Reddit feed per hour initially, but FeedHistory.py's intelligent scheduling
# (based on historical fetch success rates and time patterns) typically stretches this to
# much longer intervals (up to 12 hours), making requests respectful of Reddit's servers.
import getpass
import itertools
import json
# Standard library imports
import os
import re
import time
from urllib.parse import urlparse
# Third-party imports
import praw
# Local imports
from shared import g_logger, USER_AGENT, g_c, EXPIRE_WEEK
from app_config import get_reddit_username
# --- Configuration ---
# Credentials are handled via the token file for PRAW authentication
CREDENTIALS_FILE = 'reddit_token.json'  # Simple file storage for PRAW credentials
DEFAULT_FEED_LIMIT = 10  # Max submissions fetched per feed request
FEED_TYPES = ['hot', 'new', 'rising', 'controversial', 'top']  # Valid subreddit listing types
REQUIRED_CREDENTIAL_KEYS = ['client_id', 'client_secret', 'username', 'password']  # Keys a usable token file must contain
# Constants for magic numbers and strings
CREDENTIALS_FILE_MODE = 0o600  # Owner read/write only (credentials file)
CONTENT_TYPE_HTML = 'text/html'
CONTENT_TYPE_PLAIN = 'text/plain'
SUBREDDIT_NAME_MAX_LENGTH = 21 # Reddit's subreddit name limit
REDDIT_BASE_URL = 'https://www.reddit.com'
# Reddit-specific user agent combining official app user agent with Reddit username
# from config (note: get_reddit_username() runs at import time).
REDDIT_USER_AGENT = f"{USER_AGENT} (by /u/{get_reddit_username()})"
def _create_error_response(status, exception, feed_url, title_suffix=''):
"""
Creates a standardized error response structure compatible with feedparser format.
Args:
status: HTTP status code
exception: Exception object or error message
feed_url: Original feed URL
title_suffix: Optional suffix for feed title
Returns:
dict: Standardized error response structure
"""
return {
'bozo': 1,
'bozo_exception': exception,
'feed': {'title': f"Reddit Feed{title_suffix}", 'link': feed_url} if title_suffix else {},
'entries': [],
'status': status,
'href': feed_url
}
def _parse_timestamp(created_utc):
"""
Parses Reddit UTC timestamp into feedparser-compatible format.
Args:
created_utc: Unix timestamp (float or int)
Returns:
tuple: (published_parsed, published_str) or (None, None) on error
"""
if not created_utc:
return None, None
try:
published_parsed = time.gmtime(float(created_utc))
published_str = time.strftime('%a, %d %b %Y %H:%M:%S GMT', published_parsed)
return published_parsed, published_str
except (ValueError, TypeError):
return None, None
def _get_submissions(subreddit_obj, feed_type, limit=DEFAULT_FEED_LIMIT):
    """
    Fetches submissions from a subreddit for the requested listing type.

    Args:
        subreddit_obj: PRAW Subreddit object.
        feed_type: One of 'hot', 'new', 'rising', 'controversial', 'top';
            any other value falls back to 'hot'.
        limit: Maximum number of submissions to request.

    Returns:
        Generator: PRAW submission generator.
    """
    # The listing methods share the Subreddit attribute names, so dispatch
    # via getattr; unknown feed types degrade gracefully to 'hot'.
    if feed_type in ('hot', 'new', 'rising', 'controversial', 'top'):
        fetch = getattr(subreddit_obj, feed_type)
    else:
        fetch = subreddit_obj.hot
    return fetch(limit=limit)
# --- Token Handling ---
def save_token(credentials):
    """
    Saves PRAW credentials to a JSON file with secure permissions.

    Args:
        credentials: Dictionary containing Reddit API credentials
            (client_id, client_secret, username, password, user_agent).

    The file is created with mode 0600 from the start, so the secrets are
    never readable by other users, even briefly. (The previous
    create-then-chmod sequence left a short window with default permissions.)
    """
    try:
        # Ensure sensitive data like client_secret isn't inadvertently logged
        # In a real app, consider encrypting CREDENTIALS_FILE or using secure storage
        g_logger.info(f"Saving credentials for user '{credentials.get('username', 'N/A')}' to {CREDENTIALS_FILE}")
        # Open with the secure mode atomically; indent for readability.
        fd = os.open(CREDENTIALS_FILE, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, CREDENTIALS_FILE_MODE)
        with os.fdopen(fd, 'w') as f:
            json.dump(credentials, f, indent=4)
        # The mode passed to os.open is masked by the process umask and is
        # ignored when the file already exists, so enforce it explicitly.
        os.chmod(CREDENTIALS_FILE, CREDENTIALS_FILE_MODE)  # Read/write only for owner
        # Verify file permissions are correct
        actual_mode = os.stat(CREDENTIALS_FILE).st_mode & 0o777
        if actual_mode != CREDENTIALS_FILE_MODE:
            g_logger.warning(f"Credentials file permissions may not be secure. Expected {oct(CREDENTIALS_FILE_MODE)}, got {oct(actual_mode)}")
    except (IOError, OSError, TypeError) as e:
        g_logger.error(f"Error saving credentials file '{CREDENTIALS_FILE}': {e}")
def load_token():
    """
    Loads PRAW credentials from the JSON credentials file.

    Returns:
        dict: Credentials if the file exists, parses as JSON, and contains
        every key in REQUIRED_CREDENTIAL_KEYS; None otherwise.
    """
    if not os.path.exists(CREDENTIALS_FILE):
        return None
    try:
        with open(CREDENTIALS_FILE, 'r') as f:
            credentials = json.load(f)
        # Reject files that parse but lack any essential PRAW key. A non-dict
        # payload raises TypeError here and is handled below.
        if any(key not in credentials for key in REQUIRED_CREDENTIAL_KEYS):
            g_logger.debug(f"Credentials file '{CREDENTIALS_FILE}' exists but is missing required PRAW credentials. Will prompt for new credentials.")
            return None
        return credentials
    except json.JSONDecodeError as e:
        g_logger.error(f"Error parsing JSON from credentials file '{CREDENTIALS_FILE}': {e}. File may be corrupted or invalid.")
        return None
    except (IOError, OSError) as e:
        g_logger.error(f"Error reading credentials file '{CREDENTIALS_FILE}': {e}")
        return None
    except TypeError as e:
        g_logger.error(f"Unexpected data type in credentials file '{CREDENTIALS_FILE}': {e}")
        return None
def _prompt_for_credentials():
    """
    Interactively collects Reddit API credentials and persists them to file.

    Secrets (client secret, password) are read via getpass so they are not
    echoed to the terminal.

    Returns:
        dict: The saved credentials, or None when input is unavailable,
        any field is blank, or saving fails.
    """
    try:
        # Dict literals evaluate left-to-right, so prompts appear in order.
        responses = {
            'client_id': input("Enter your Reddit Client ID: ").strip(),
            'client_secret': getpass.getpass("Enter your Reddit Client Secret: ").strip(),
            'username': input("Enter your Reddit Username: ").strip(),
            'password': getpass.getpass("Enter your Reddit Password (stored securely for PRAW): ").strip(),
        }
        if not all(responses.values()):
            g_logger.error("Error: All credentials must be provided for initial setup and cannot be empty or whitespace-only.")
            return None
        # user_agent is stored for reference only; runtime uses REDDIT_USER_AGENT.
        responses['user_agent'] = REDDIT_USER_AGENT
        save_token(responses)
        g_logger.info(f"Credentials saved successfully to '{CREDENTIALS_FILE}' for user '{responses['username']}'.")
        return responses
    except EOFError:
        g_logger.error("Error: Cannot prompt for credentials in non-interactive mode. Please create credentials file manually or run in interactive mode.")
        return None
    except (IOError, OSError) as e:
        g_logger.error(f"Error reading input or saving credentials: {e}")
        return None
    except ValueError as e:
        g_logger.error(f"Invalid input value: {e}")
        return None
def _create_reddit_instance(credentials, user_agent):
    """
    Creates and tests a PRAW Reddit client instance.

    Args:
        credentials: Dictionary with Reddit API credentials
            (must contain client_id, client_secret, username, password).
        user_agent: User agent string to use for API requests (required).

    Returns:
        praw.Reddit instance or None on failure.

    Raises:
        ValueError: If user_agent is None, not a string, or blank.
    """
    if not user_agent or not isinstance(user_agent, str) or not user_agent.strip():
        raise ValueError("user_agent must be a non-empty string")
    try:
        g_logger.debug(f"Creating PRAW Reddit client for user '{credentials['username']}'")
        # Username/password ("script" app) authentication; per the module
        # docstring, PRAW handles token refresh for this flow automatically.
        reddit = praw.Reddit(
            client_id=credentials['client_id'],
            client_secret=credentials['client_secret'],
            username=credentials['username'],
            password=credentials['password'],
            user_agent=user_agent
        )
        # Test the connection by trying to access user info; instantiation
        # alone does not validate the credentials.
        try:
            user = reddit.user.me()
            g_logger.info(f"Successfully authenticated as user: {user.name}")
        # NOTE(review): Forbidden/Unauthorized may not be defined on
        # praw.exceptions (in current PRAW they live in prawcore.exceptions).
        # If so, evaluating these clauses raises AttributeError, which ends
        # up in the outer generic handler below - confirm against the
        # installed PRAW version.
        except praw.exceptions.Forbidden as e:
            g_logger.error(f"Authentication failed: Reddit API returned Forbidden (403). User '{credentials['username']}' may be suspended or credentials may be invalid: {e}")
            return None
        except praw.exceptions.Unauthorized as e:
            g_logger.error(f"Authentication failed: Reddit API returned Unauthorized (401). Invalid credentials for user '{credentials['username']}': {e}")
            return None
        except Exception as e:
            g_logger.error(f"Authentication test failed for user '{credentials['username']}': {e}")
            return None
        return reddit
    except KeyError as e:
        # Raised by the credentials[...] lookups above when a field is absent.
        g_logger.error(f"Missing required credential field: {e}. Credentials file may be corrupted.")
        return None
    except Exception as e:
        g_logger.error(f"Failed to create PRAW Reddit client: {e}")
        return None
def get_valid_reddit_client():
    """
    Returns an authenticated PRAW Reddit client, prompting interactively for
    credentials on first run. PRAW refreshes tokens automatically, and the
    official Reddit-formatted user agent is always used.

    Returns:
        praw.Reddit instance or None on failure.
    """
    creds = load_token()
    if not creds:
        g_logger.info(f"No valid credentials file found ('{CREDENTIALS_FILE}'). Prompting for Reddit API credentials.")
        creds = _prompt_for_credentials()
        if not creds:
            return None
    # Build and connection-test the client with the compliant user agent.
    return _create_reddit_instance(creds, REDDIT_USER_AGENT)
def parse_reddit_url(url):
    """
    Extracts (subreddit, feed_type) from a Reddit URL.

    Accepts the common Reddit URL shapes, with or without a trailing
    .rss/.json suffix on the feed-type segment. The feed type defaults to
    'hot' when the URL does not name one.

    Args:
        url: Reddit URL string to parse.

    Returns:
        tuple: (subreddit, feed_type), or (None, None) for any URL that is
        malformed, uses a non-http(s) scheme, names an invalid or over-long
        subreddit, or requests an unknown feed type.

    Examples:
        >>> parse_reddit_url("https://www.reddit.com/r/linux/hot")
        ('linux', 'hot')
        >>> parse_reddit_url("https://reddit.com/r/python/new/.rss")
        ('python', 'new')
        >>> parse_reddit_url("https://www.reddit.com/r/AskElectronics/")
        ('AskElectronics', 'hot')
    """
    try:
        parts = urlparse(url)
        # Only web URLs are meaningful here.
        if parts.scheme.lower() not in ('http', 'https'):
            g_logger.warning(f"Invalid URL scheme in Reddit URL: {parts.scheme}. Must be http or https.")
            return None, None
        # Drop empty segments produced by leading/trailing slashes.
        segments = [segment for segment in parts.path.split('/') if segment]
        # A usable path looks like /r/<subreddit>[/<feed_type>].
        if len(segments) < 2 or segments[0].lower() != 'r':
            return None, None
        name = segments[1]
        if not name or not name.strip():
            g_logger.warning("Empty subreddit name found in URL")
            return None, None
        # Subreddit names are alphanumerics and underscores only.
        if re.match(r'^[a-zA-Z0-9_]+$', name) is None:
            g_logger.warning(f"Invalid subreddit format found: {name}")
            return None, None
        if len(name) > SUBREDDIT_NAME_MAX_LENGTH:
            g_logger.warning(f"Subreddit name too long: {name} ({len(name)} chars, max {SUBREDDIT_NAME_MAX_LENGTH})")
            return None, None
        feed = 'hot'  # Default when no feed-type segment is present
        if len(segments) > 2:
            # Strip a trailing .rss/.json style suffix before validating.
            candidate = segments[2].lower().rsplit('.', 1)[0]
            if candidate and candidate.strip() and candidate in FEED_TYPES:
                feed = candidate
            else:
                g_logger.warning(f"Invalid feed type '{candidate}' for subreddit {name}. Valid types: {FEED_TYPES}")
                return None, None
        return name, feed
    except (AttributeError, TypeError, ValueError) as e:
        g_logger.error(f"Error parsing URL '{url}': {e}")
        return None, None
def _build_entry_basic_fields(submission, entry):
"""Builds basic entry fields: title, author, link, id."""
entry['title'] = submission.title or ''
entry['author'] = submission.author.name if submission.author and hasattr(submission.author, 'name') else '[deleted]'
# Reddit 'permalink' is relative, needs domain prepended
entry['link'] = f"{REDDIT_BASE_URL}{submission.permalink}" if submission.permalink else ''
# Use Reddit's 'name' (e.g., t3_xxxxx) as the stable ID
entry['id'] = submission.name or entry['link'] # Fallback to link if name missing
def _build_entry_content(submission, entry):
"""Builds entry content: summary and content array."""
# Summary & Content (Mimic feedparser structure)
summary = submission.selftext or '' # Markdown version
content_html = submission.selftext_html # HTML version
# If not a self-post and summary is empty, use the external URL as summary
if not submission.is_self and not summary:
summary = submission.url or '' # Link posts point to external URL here
entry['summary'] = summary
# feedparser 'content' is often a list of dicts
entry['content'] = []
if content_html:
# Note: Reddit's selftext_html from PRAW may already be unescaped
entry['content'].append({
'type': CONTENT_TYPE_HTML,
'language': None,
'base': entry['link'],
'value': content_html
})
elif summary and submission.is_self: # Only add plain text content for self posts
entry['content'].append({
'type': CONTENT_TYPE_PLAIN,
'language': None,
'base': entry['link'],
'value': summary
})
def _build_entry_timestamps(submission, entry):
    """Fills in published/updated timestamp fields on the entry dict."""
    parsed, rendered = _parse_timestamp(submission.created_utc)
    if not parsed:
        # Missing/unparseable timestamp: keep the keys but null them out.
        for key in ('published_parsed', 'published', 'updated_parsed', 'updated'):
            entry[key] = None
        return
    entry['published_parsed'] = parsed
    entry['published'] = rendered
    # Listings don't expose edit times, so reuse the published time.
    entry['updated_parsed'] = parsed
    entry['updated'] = rendered
def _build_entry_reddit_fields(submission, entry):
"""Builds Reddit-specific entry fields."""
# Add custom Reddit-specific fields, prefixed for clarity
entry['reddit_score'] = submission.score
entry['reddit_num_comments'] = submission.num_comments
# Provide thumbnail URL only if it's a valid HTTP/HTTPS URL
thumb = submission.thumbnail
entry['reddit_thumbnail'] = thumb if thumb and isinstance(thumb, str) and (thumb.startswith('http://') or thumb.startswith('https://')) else None
entry['reddit_url'] = submission.url # The external link for link posts
entry['reddit_domain'] = submission.domain
entry['reddit_is_self'] = submission.is_self
entry['reddit_subreddit'] = submission.subreddit.display_name if submission.subreddit and hasattr(submission.subreddit, 'display_name') else ''
def format_reddit_entry(submission):
    """
    Converts a single PRAW Submission into a feedparser-like entry dict.

    Args:
        submission: PRAW Submission object to format.

    Returns:
        dict: feedparser-compatible entry, or None when the submission is
        missing expected attributes (e.g. deleted/malformed posts) or any
        other formatting error occurs.
    """
    entry = {}
    try:
        # Each helper fills in one slice of the feedparser entry shape.
        for builder in (_build_entry_basic_fields, _build_entry_content,
                        _build_entry_timestamps, _build_entry_reddit_fields):
            builder(submission, entry)
    except AttributeError as e:
        g_logger.error(f"Error formatting PRAW submission: Missing expected attribute. Submission may be deleted or malformed: {e}")
        return None
    except Exception as e:
        g_logger.error(f"Error formatting PRAW submission: Unexpected error: {e}")
        return None
    return entry
def _create_success_response(praw_url):
"""Creates base structure for successful feed response."""
return {
'bozo': 0,
'bozo_exception': None,
'feed': {},
'entries': [],
'headers': {},
'href': praw_url,
'status': 200,
'encoding': 'utf-8',
'version': 'praw_api_v1'
}
def _populate_feed_metadata(output, subreddit, feed_type, praw_url, submissions_list):
"""Populates feed metadata in the output structure."""
output['feed']['title'] = f"r/{subreddit} - {feed_type}"
output['feed']['link'] = praw_url
output['feed']['links'] = [{'rel': 'alternate', 'type': 'text/html', 'href': praw_url}]
output['feed']['subtitle'] = f"Posts from r/{subreddit} sorted by {feed_type} via PRAW"
output['feed']['language'] = 'en'
# Use the timestamp of the newest post as the feed's updated time
if submissions_list:
updated_parsed, updated_str = _parse_timestamp(submissions_list[0].created_utc)
if updated_parsed:
output['feed']['updated_parsed'] = updated_parsed
output['feed']['updated'] = updated_str
def _handle_praw_exception(e, subreddit, feed_type, praw_url, output):
    """
    Handles PRAW exceptions and updates output accordingly.

    Maps known PRAW exception types onto the feedparser-style error fields
    of ``output`` (bozo, bozo_exception, feed title/link, status).

    Args:
        e: The caught exception instance.
        subreddit: Subreddit name used in error titles and log messages.
        feed_type: Feed type used in error titles and log messages.
        praw_url: Canonical feed URL recorded on the output.
        output: Response dict, mutated in place on a match.

    Returns:
        bool: True if exception was handled, False otherwise
    """
    # NOTE(review): NotFound and Forbidden may not be defined on
    # praw.exceptions (they typically live in prawcore.exceptions); building
    # this table would then raise AttributeError at call time - confirm
    # against the installed PRAW version.
    exception_handlers = {
        praw.exceptions.NotFound: (404, f"r/{subreddit} - {feed_type} (Not Found)",
                                   f"Subreddit 'r/{subreddit}' not found or does not exist"),
        praw.exceptions.Forbidden: (403, f"r/{subreddit} - {feed_type} (Forbidden)",
                                    f"Access forbidden to subreddit 'r/{subreddit}'. Subreddit may be private or banned."),
        praw.exceptions.RedditAPIException: (500, f"r/{subreddit} - {feed_type} (API Error)",
                                             f"Reddit API error fetching 'r/{subreddit}/{feed_type}'")
    }
    # First matching type wins (isinstance also matches subclasses).
    for exc_type, (status, title, log_msg) in exception_handlers.items():
        if isinstance(e, exc_type):
            g_logger.error(f"{log_msg}: {e}")
            output['bozo'] = 1
            output['bozo_exception'] = e
            output['feed']['title'] = title
            output['feed']['link'] = praw_url
            output['status'] = status
            return True
    return False
def _get_seen_reddit_ids(feed_url):
    """
    Loads the cached set of already-seen Reddit submission IDs for a feed.

    Args:
        feed_url: The Reddit feed URL whose seen IDs should be loaded.

    Returns:
        set: Submission IDs (e.g. {"t3_xxxxx", "t3_yyyyy"}); empty on a
        cache miss, a cache error, or an unexpected cached value type.
    """
    key = f"reddit_seen_ids:{feed_url}"
    try:
        cached = g_c.get(key)
    except Exception as e:
        g_logger.warning(f"Error retrieving seen Reddit IDs from cache for {feed_url}: {e}")
        return set()
    # Normalize whatever iterable the cache hands back into a set.
    if isinstance(cached, (set, list, tuple)):
        return set(cached)
    return set()
def _mark_reddit_ids_seen(feed_url, submission_ids):
    """
    Merges new submission IDs into the cached seen-set for a feed URL.

    Args:
        feed_url: The Reddit feed URL these IDs belong to.
        submission_ids: Set or list of submission IDs to mark as seen.
    """
    if not submission_ids:
        return
    key = f"reddit_seen_ids:{feed_url}"
    try:
        # Union the new IDs with whatever is already cached, then refresh
        # the cache entry with a week-long expiration.
        combined = _get_seen_reddit_ids(feed_url) | set(submission_ids)
        g_c.put(key, combined, timeout=EXPIRE_WEEK)
        g_logger.debug(f"Marked {len(submission_ids)} Reddit IDs as seen for {feed_url} (total: {len(combined)})")
    except Exception as e:
        g_logger.warning(f"Error storing seen Reddit IDs in cache for {feed_url}: {e}")
def fetch_reddit_feed_as_feedparser(feed_url):
    """
    Fetches data from Reddit API based on an RSS-like URL and returns it in
    a structure similar to feedparser output. Uses PRAW for authentication
    and API calls with the official Reddit-formatted user agent.

    Args:
        feed_url: Reddit URL to fetch (e.g., "https://www.reddit.com/r/linux/hot")

    Returns:
        dict: Feedparser-compatible dictionary with the following structure:
            - 'bozo': 0 for success, 1 for errors
            - 'bozo_exception': Exception object if bozo=1, None otherwise
            - 'feed': Dictionary with feed metadata (title, link, subtitle, language, etc.)
            - 'entries': List of entry dictionaries, each containing:
                - Standard feedparser fields: title, author, link, id, summary,
                  content, published, updated
                - Reddit-specific fields: reddit_score, reddit_num_comments,
                  reddit_thumbnail, reddit_url, reddit_domain, reddit_is_self,
                  reddit_subreddit
            - 'status': HTTP status code (200 for success, 4xx/5xx for errors)
            - 'href': The canonical Reddit URL used for fetching
            - 'encoding': 'utf-8'
            - 'version': 'praw_api_v1'
        On error, returns a structure with bozo=1, appropriate status code, and
        error details. Structure matches _create_success_response() for success
        and _create_error_response() for errors.
    """
    subreddit, feed_type = parse_reddit_url(feed_url)
    if not subreddit or not feed_type:
        g_logger.error(f"Could not parse subreddit/feed type from URL: {feed_url}. Parsed subreddit: '{subreddit}', feed_type: '{feed_type}'")
        return _create_error_response(400, ValueError(f"Invalid Reddit URL format: {feed_url}"), feed_url)
    # Get PRAW Reddit client (may prompt interactively on first run)
    reddit = get_valid_reddit_client()
    if not reddit:
        g_logger.error(f"Could not obtain valid PRAW Reddit client for URL: {feed_url}")
        return _create_error_response(401, ConnectionError("Failed to get PRAW Reddit client"), feed_url)
    praw_url = f"{REDDIT_BASE_URL}/r/{subreddit}/{feed_type}"
    g_logger.info(f"Fetching {feed_type} feed for r/{subreddit} using PRAW")
    g_logger.debug(f"Reddit API URL: {praw_url}")
    output = _create_success_response(praw_url)
    # Get seen submission IDs before fetching to enable early-stop deduplication
    seen_ids = _get_seen_reddit_ids(feed_url)
    g_logger.debug(f"Found {len(seen_ids)} previously seen Reddit IDs for {feed_url}")
    try:
        subreddit_obj = reddit.subreddit(subreddit)
        submissions = _get_submissions(subreddit_obj, feed_type)
        # Process submissions one at a time with early-stop on seen submissions.
        # islice caps iteration at DEFAULT_FEED_LIMIT without materializing
        # the lazy PRAW listing.
        submissions_list = []
        new_submission_ids = []
        stopped_early = False
        for submission in itertools.islice(submissions, DEFAULT_FEED_LIMIT):
            # Check if we've already seen this submission (by fullname)
            submission_id = submission.name if hasattr(submission, 'name') and submission.name else None
            if not submission_id:
                g_logger.debug(f"Skipping submission without ID in r/{subreddit}/{feed_type}")
                continue
            if submission_id in seen_ids:
                # Stop iterating immediately - all subsequent submissions are older/seen
                # (assumes the listing is ordered newest-first - TODO confirm
                # this holds for all feed types, e.g. 'controversial'/'top')
                stopped_early = True
                g_logger.debug(f"Stopped early at seen submission {submission_id} in r/{subreddit}/{feed_type}")
                break
            # This is a new submission - format and collect it
            submissions_list.append(submission)
            new_submission_ids.append(submission_id)
            entry = format_reddit_entry(submission)
            if entry:
                output['entries'].append(entry)
            else:
                g_logger.debug(f"Failed to format submission {submission_id} in r/{subreddit}/{feed_type}")
    # NOTE(review): NotFound/Forbidden may not exist on praw.exceptions
    # (usually prawcore.exceptions); evaluating this tuple could then raise
    # AttributeError instead of matching - confirm against installed PRAW.
    except (praw.exceptions.NotFound, praw.exceptions.Forbidden, praw.exceptions.RedditAPIException) as e:
        if _handle_praw_exception(e, subreddit, feed_type, praw_url, output):
            return output
        # Fall through to generic handler if not matched
        # NOTE(review): on this fall-through path (handler returned False),
        # execution continues below the try block; if the exception fired
        # before the loop locals were assigned, 'new_submission_ids' /
        # 'submissions_list' would be unbound. Unreachable today because the
        # handler covers all three caught types, but fragile.
    except Exception as e:
        # Unexpected error type - log with full context for debugging
        g_logger.error(f"Unexpected error type '{type(e).__name__}' fetching Reddit data with PRAW for 'r/{subreddit}/{feed_type}' from URL {feed_url}: {e}", exc_info=True)
        output['bozo'] = 1
        output['bozo_exception'] = e
        output['feed']['title'] = f"r/{subreddit} - {feed_type} (Error)"
        output['feed']['link'] = praw_url
        output['status'] = 500
        return output
    # Update cache with newly seen submission IDs
    if new_submission_ids:
        _mark_reddit_ids_seen(feed_url, new_submission_ids)
    # Populate feed metadata using the submissions we processed
    _populate_feed_metadata(output, subreddit, feed_type, praw_url, submissions_list)
    if not submissions_list:
        g_logger.warning(f"No new posts found for r/{subreddit}/{feed_type} (Subreddit might be empty, filtered, or all posts already seen).")
    else:
        log_msg = f"Successfully fetched {len(output['entries'])} new entries for r/{subreddit}/{feed_type}"
        if stopped_early:
            log_msg += " (stopped early at already-seen submission)"
        g_logger.debug(log_msg)
    return output
# --- Main Execution Logic ---
if __name__ == '__main__':
    g_logger.info("Attempting to ensure a valid PRAW Reddit client can be created...")
    # Loads stored credentials (prompting interactively on first run) and
    # builds a client with the official Reddit-formatted user agent.
    client = get_valid_reddit_client()
    if not client:
        g_logger.error("\nFailed to create a valid PRAW Reddit client. Cannot proceed with API calls.")
        g_logger.error(f"Check for errors above. If it's the first run, ensure you provide correct credentials when prompted.")
        g_logger.error(f"Make sure the token file '{CREDENTIALS_FILE}' is writable if it needs to be created/updated.")
    else:
        g_logger.info(f"\nSuccessfully created PRAW Reddit client for user: {client.user.me().name}.")
        g_logger.info("You can now use functions like fetch_reddit_feed_as_feedparser.")