Skip to content

Commit 2575a4b

Browse files
committed
tweak downloads a bit to add rate limit when retrying
1 parent cee0e18 commit 2575a4b

File tree

3 files changed

+24
-12
lines changed

3 files changed

+24
-12
lines changed

2025/national-jukebox/download_all.py

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -27,16 +27,21 @@
2727

2828
DATA_DIR = pathlib.Path(__file__).parent / "data"
2929

30+
MAX_PAGES = 169
31+
32+
# https://guides.loc.gov/digital-scholarship/faq
33+
# Stay within 20 requests per minute rate limit.
34+
SLEEP_SECONDS = 60.0 / 20.0
3035

3136
# target_url = "https://www.loc.gov/collections/national-jukebox/?sb=date_desc&c=100"
32-
target_url_template = "https://www.loc.gov/collections/national-jukebox/?c=100&sb=date_desc&sp={}"
37+
# target_url_template = "https://www.loc.gov/collections/national-jukebox/?c=100&sb=date_desc&sp={}"
38+
target_url_template = "https://www.loc.gov/collections/national-jukebox/?c=100&sb=date&sp={}"
39+
# target_url_template = "https://www.loc.gov/collections/national-jukebox/?c=100&fa=original-format:sound+recording&sb=date&sp={}"
3340

3441

3542
def download_and_extract_item(base_url):
3643
print(f"Fetching content from: {base_url}")
37-
# https://guides.loc.gov/digital-scholarship/faq
38-
# Stay within 20 requests per minute rate limit.
39-
time.sleep(3)
44+
time.sleep(SLEEP_SECONDS)
4045

4146
try:
4247
response = requests.get(base_url, timeout=10)
@@ -55,7 +60,7 @@ def download_and_extract_item(base_url):
5560

5661
def download_page(page_number):
5762
target_url = target_url_template.format(page_number)
58-
item_urls = list_urls.get_national_jukebox_song_detail_urls(target_url)
63+
item_urls = list_urls.get_national_jukebox_song_detail_urls(target_url, sleep_seconds=SLEEP_SECONDS)
5964

6065
visited_urls = set()
6166
jukebox_path = DATA_DIR / "jukebox.jsonl"
@@ -81,15 +86,19 @@ def download_page(page_number):
8186

8287

8388
if __name__ == "__main__":
84-
page_number = 30 # 4
89+
page_number = 1
8590
while True:
8691
print(f"Page {page_number}")
8792
try:
8893
download_page(page_number)
89-
download_mp3s.download_all()
94+
# Server is currently down for audio.
95+
# download_mp3s.download_all(sleep_seconds=SLEEP_SECONDS)
9096
except requests.exceptions.HTTPError as exc:
9197
if exc.response.status_code == 404:
9298
print("Reached last page?")
9399
break
94100
page_number += 1
95101

102+
if page_number > MAX_PAGES:
103+
break
104+

2025/national-jukebox/download_mp3s.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,11 +23,11 @@
2323

2424

2525

26-
def download_mp3(base_url):
26+
def download_mp3(base_url, sleep_seconds: float = 3.0):
2727
print(f"Fetching content from: {base_url}")
2828
# https://guides.loc.gov/digital-scholarship/faq
2929
# Stay within 20 requests per minute rate limit.
30-
time.sleep(3)
30+
time.sleep(sleep_seconds)
3131

3232
try:
3333
response = requests.get(base_url, timeout=60)
@@ -39,7 +39,7 @@ def download_mp3(base_url):
3939
return response.content
4040

4141

42-
def download_all():
42+
def download_all(sleep_seconds: float = 3.0):
4343
jukebox_path = DATA_DIR / "jukebox.jsonl"
4444
jukebox = pandas.read_json(jukebox_path, lines=True, orient="records")
4545

@@ -50,7 +50,7 @@ def download_all():
5050
if mp3_path.exists():
5151
continue
5252

53-
mp3_bytes = download_mp3(row["MP3 URL"])
53+
mp3_bytes = download_mp3(row["MP3 URL"], sleep_seconds=sleep_seconds)
5454
if mp3_bytes is None:
5555
continue
5656

2025/national-jukebox/list_urls.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,11 +12,13 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15+
import time
16+
1517
import requests
1618
from bs4 import BeautifulSoup
1719
from urllib.parse import urljoin
1820

19-
def get_national_jukebox_song_detail_urls(base_url: str) -> list[str]:
21+
def get_national_jukebox_song_detail_urls(base_url: str, sleep_seconds: float = 3.0) -> list[str]:
2022
"""
2123
Scrapes the National Jukebox collection page to extract URLs for individual song detail pages.
2224
@@ -26,6 +28,7 @@ def get_national_jukebox_song_detail_urls(base_url: str) -> list[str]:
2628
Returns:
2729
A list of URLs for the song detail pages.
2830
"""
31+
time.sleep(sleep_seconds)
2932
print(f"Fetching content from: {base_url}")
3033
try:
3134
response = requests.get(base_url)

0 commit comments

Comments
 (0)