@@ -1,10 +1,28 @@
 import pandas as pd
 import yfinance as yf
 from bs4 import BeautifulSoup
-
+from requests_cache import CachedSession
 from models import RssArticle
 from rss_feeds import parse_feeds
 from src.tasks import download_article
+from datetime import timedelta
+
+request_session = CachedSession('finance_news.cache', use_cache_dir=True,
+                                cache_control=False,
+                                # Don't use Cache-Control response headers for expiration
+                                expire_after=timedelta(hours=3),
+                                # Expire responses after three hours instead
+                                allowable_codes=[200, 400],
+                                # Cache 400 responses as a solemn reminder of your failures
+                                allowable_methods=['GET', 'POST'],
+                                # Cache whatever HTTP methods you want
+                                ignored_parameters=['api_key'],
+                                # Don't match this request param, and redact it from the cache
+                                match_headers=['Accept-Language'],
+                                # Cache a different response per language
+                                stale_if_error=True,
+                                # In case of request errors, use stale cache data if possible
+                                )
 
 
 async def scrape_news_yahoo(tickers: list[str]) -> list[dict[str, list[dict[str, str]]]]:
@@ -16,9 +34,9 @@ async def scrape_news_yahoo(tickers: list[str]) -> list[dict[str, list[dict[str,
     :return: A list of dictionaries containing ticker symbols as keys and a list of articles as values.
     """
     news = []
-
+    request_session.headers.update({'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'})
     for ticker in tickers:
-        ticker = yf.Ticker(ticker)
+        ticker = yf.Ticker(ticker=ticker.upper(), session=request_session)
         news_df = pd.DataFrame(ticker.news)
         articles = []
         for i in range(len(news_df)):
@@ -64,9 +82,7 @@ async def alternate_news_sources() -> list[dict[str, list[dict[str, str]]]]:
 
 
 async def parse_article(article: RssArticle) -> tuple[str, str, list[dict[str, str | int]]]:
-    headers = {
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
-    }
+    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
 
     html = await download_article(link=article.link, headers=headers, timeout=60)
     if html is None:
@@ -81,4 +97,4 @@ async def parse_article(article: RssArticle) -> tuple[str, str, list[dict[str, s
             body += elem.get_text()
         elif elem.name == 'img':
             images.append(dict(src=elem['src'], alt=elem['alt'], width=elem['width'], height=elem['height']))
-    return summary, body, images
+    return summary, body, images
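
A rough usage sketch, not part of the commit: with the CachedSession configured above, a repeat call inside the three-hour expiry window should be answered from finance_news.cache instead of re-downloading from Yahoo. The module name in the import is an assumption.

import asyncio

from scraper import scrape_news_yahoo  # hypothetical module name for the file in this diff

async def main():
    # First run downloads the news and stores the HTTP responses in the on-disk cache.
    first = await scrape_news_yahoo(['aapl', 'msft'])
    # A second run shortly afterwards reuses the cached responses.
    second = await scrape_news_yahoo(['aapl', 'msft'])
    print(len(first), len(second))

if __name__ == '__main__':
    asyncio.run(main())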