Commit ba8782b
Committed Apr 16, 2023

Updated Session and Cache Control

1 parent d1a9d5b

File tree

2 files changed: +23 -7 lines changed

requirements.txt (1.24 KB)
Binary file not shown.

src/tasks/news_scraper.py (+23 -7)
@@ -1,10 +1,28 @@
 import pandas as pd
 import yfinance as yf
 from bs4 import BeautifulSoup
-
+from requests_cache import CachedSession
 from models import RssArticle
 from rss_feeds import parse_feeds
 from src.tasks import download_article
+from datetime import timedelta
+
+request_session = CachedSession('finance_news.cache', use_cache_dir=True,
+                                cache_control=False,
+                                # Don't use Cache-Control response headers for expiration
+                                expire_after=timedelta(hours=3),
+                                # Expire cached responses after three hours
+                                allowable_codes=[200, 400],
+                                # Cache 400 responses as a solemn reminder of your failures
+                                allowable_methods=['GET', 'POST'],
+                                # Cache whatever HTTP methods you want
+                                ignored_parameters=['api_key'],
+                                # Don't match this request param, and redact it from the cache
+                                match_headers=['Accept-Language'],
+                                # Cache a different response per language
+                                stale_if_error=True,
+                                # In case of request errors, use stale cache data if possible
+                                )
 
 
 async def scrape_news_yahoo(tickers: list[str]) -> list[dict[str, list[dict[str, str]]]]:
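
Note: with cache_control=False, requests-cache ignores any Cache-Control
response headers and relies on expire_after alone, so everything this session
caches goes stale after three hours. A minimal sketch of the resulting
behavior, assuming any cacheable URL (example.com here is a placeholder, not
part of this repo):

    from datetime import timedelta
    from requests_cache import CachedSession

    session = CachedSession('demo.cache', expire_after=timedelta(hours=3))
    first = session.get('https://example.com')    # network hit, response stored
    second = session.get('https://example.com')   # served from the cache
    print(first.from_cache, second.from_cache)    # False True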
@@ -16,9 +34,9 @@ async def scrape_news_yahoo(tickers: list[str]) -> list[dict[str, list[dict[str,
     :return: A list of dictionaries containing ticker symbols as keys and a list of articles as values.
     """
     news = []
-
+    request_session.headers.update({'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'})
     for ticker in tickers:
-        ticker = yf.Ticker(ticker)
+        ticker = yf.Ticker(ticker=ticker.upper(), session=request_session)
         news_df = pd.DataFrame(ticker.news)
         articles = []
         for i in range(len(news_df)):
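
Note: passing the shared request_session into yf.Ticker routes yfinance's
HTTP requests through the cache, so repeated lookups of the same ticker inside
the three-hour window are answered locally; since the session is module-level,
the User-Agent update above also persists across calls. A minimal sketch of
the pattern, with MSFT as an arbitrary example symbol:

    import yfinance as yf
    from requests_cache import CachedSession

    session = CachedSession('demo.cache', expire_after=3600)
    ticker = yf.Ticker(ticker='MSFT', session=session)
    print(len(ticker.news))   # list of article dicts; a repeat call within
                              # the expiry window can be served from the cache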
@@ -64,9 +82,7 @@ async def alternate_news_sources() -> list[dict[str, list[dict[str, str]]]]:
 
 
 async def parse_article(article: RssArticle) -> tuple[str, str, list[dict[str, str | int]]]:
-    headers = {
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
-    }
+    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
 
     html = await download_article(link=article.link, headers=headers, timeout=60)
     if html is None:
@@ -81,4 +97,4 @@ async def parse_article(article: RssArticle) -> tuple[str, str, list[dict[str, s
             body += elem.get_text()
         elif elem.name == 'img':
             images.append(dict(src=elem['src'], alt=elem['alt'], width=elem['width'], height=elem['height']))
-    return summary, body, images
+    return summary, body, images
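
Note: the img branch above indexes elem['src'] and friends directly, which
raises KeyError for any tag missing one of those attributes; a defensive
variant using Tag.get() with defaults (a sketch of an alternative, not what
this commit does):

    from bs4 import BeautifulSoup

    html = '<p>hello</p><img src="/a.png" alt="a">'
    soup = BeautifulSoup(html, 'html.parser')
    images = [dict(src=img.get('src', ''), alt=img.get('alt', ''),
                   width=img.get('width', 0), height=img.get('height', 0))
              for img in soup.find_all('img')]
    print(images)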
