27
27
28
28
# Directory, next to this file, that holds the scraped output.
DATA_DIR = pathlib.Path(__file__).parent / "data"

# Highest listing page number the main loop will request before stopping.
MAX_PAGES = 169

# https://guides.loc.gov/digital-scholarship/faq
# Stay within 20 requests per minute rate limit.
SLEEP_SECONDS = 60.0 / 20.0

# Listing URL with a "{}" placeholder for the page number (the "sp" query
# parameter). Earlier / alternative forms are kept below for reference:
#   "https://www.loc.gov/collections/national-jukebox/?sb=date_desc&c=100"
#   "https://www.loc.gov/collections/national-jukebox/?c=100&sb=date_desc&sp={}"
#   "https://www.loc.gov/collections/national-jukebox/?c=100&fa=original-format:sound+recording&sb=date&sp={}"
target_url_template = "https://www.loc.gov/collections/national-jukebox/?c=100&sb=date&sp={}"
33
40
34
41
35
42
def download_and_extract_item (base_url ):
36
43
print (f"Fetching content from: { base_url } " )
37
- # https://guides.loc.gov/digital-scholarship/faq
38
- # Stay within 20 requests per minute rate limit.
39
- time .sleep (3 )
44
+ time .sleep (SLEEP_SECONDS )
40
45
41
46
try :
42
47
response = requests .get (base_url , timeout = 10 )
@@ -55,7 +60,7 @@ def download_and_extract_item(base_url):
55
60
56
61
def download_page (page_number ):
57
62
target_url = target_url_template .format (page_number )
58
- item_urls = list_urls .get_national_jukebox_song_detail_urls (target_url )
63
+ item_urls = list_urls .get_national_jukebox_song_detail_urls (target_url , sleep_seconds = SLEEP_SECONDS )
59
64
60
65
visited_urls = set ()
61
66
jukebox_path = DATA_DIR / "jukebox.jsonl"
@@ -81,15 +86,19 @@ def download_page(page_number):
81
86
82
87
83
88
if __name__ == "__main__":
    # Walk the listing pages in order, from page 1 up to and including
    # MAX_PAGES, stopping early if the server answers 404 for a page.
    for page_number in range(1, MAX_PAGES + 1):
        print(f"Page {page_number} ")
        try:
            download_page(page_number)
            # Server is currently down for audio.
            # download_mp3s.download_all(sleep_seconds=SLEEP_SECONDS)
        except requests.exceptions.HTTPError as exc:
            # exc.response can be None when the error carries no response,
            # so read the status code defensively.
            status_code = getattr(exc.response, "status_code", None)
            if status_code == 404:
                print("Reached last page?")
                break
            # Best effort: any other HTTP error skips this page and moves on,
            # but say so instead of dropping the failure silently.
            print(f"HTTP error on page {page_number} (status {status_code}): {exc}")
104
+
0 commit comments