Skip to content

Commit 2575a4b

Browse files
committed
tweak downloads a bit to add rate limit when retrying
1 parent cee0e18 commit 2575a4b

File tree

3 files changed

+24
-12
lines changed

3 files changed

+24
-12
lines changed

2025/national-jukebox/download_all.py

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -27,16 +27,21 @@
2727

2828
DATA_DIR = pathlib.Path(__file__).parent / "data"
2929

30+
MAX_PAGES = 169
31+
32+
# https://guides.loc.gov/digital-scholarship/faq
33+
# Stay within 20 requests per minute rate limit.
34+
SLEEP_SECONDS = 60.0 / 20.0
3035

3136
# target_url = "https://www.loc.gov/collections/national-jukebox/?sb=date_desc&c=100"
32-
target_url_template = "https://www.loc.gov/collections/national-jukebox/?c=100&sb=date_desc&sp={}"
37+
# target_url_template = "https://www.loc.gov/collections/national-jukebox/?c=100&sb=date_desc&sp={}"
38+
target_url_template = "https://www.loc.gov/collections/national-jukebox/?c=100&sb=date&sp={}"
39+
# target_url_template = "https://www.loc.gov/collections/national-jukebox/?c=100&fa=original-format:sound+recording&sb=date&sp={}"
3340

3441

3542
def download_and_extract_item(base_url):
3643
print(f"Fetching content from: {base_url}")
37-
# https://guides.loc.gov/digital-scholarship/faq
38-
# Stay within 20 requests per minute rate limit.
39-
time.sleep(3)
44+
time.sleep(SLEEP_SECONDS)
4045

4146
try:
4247
response = requests.get(base_url, timeout=10)
@@ -55,7 +60,7 @@ def download_and_extract_item(base_url):
5560

5661
def download_page(page_number):
5762
target_url = target_url_template.format(page_number)
58-
item_urls = list_urls.get_national_jukebox_song_detail_urls(target_url)
63+
item_urls = list_urls.get_national_jukebox_song_detail_urls(target_url, sleep_seconds=SLEEP_SECONDS)
5964

6065
visited_urls = set()
6166
jukebox_path = DATA_DIR / "jukebox.jsonl"
@@ -81,15 +86,19 @@ def download_page(page_number):
8186

8287

8388
if __name__ == "__main__":
84-
page_number = 30 # 4
89+
page_number = 1
8590
while True:
8691
print(f"Page {page_number}")
8792
try:
8893
download_page(page_number)
89-
download_mp3s.download_all()
94+
# Server is currently down for audio.
95+
# download_mp3s.download_all(sleep_seconds=SLEEP_SECONDS)
9096
except requests.exceptions.HTTPError as exc:
9197
if exc.response.status_code == 404:
9298
print("Reached last page?")
9399
break
94100
page_number += 1
95101

102+
if page_number > MAX_PAGES:
103+
break
104+

2025/national-jukebox/download_mp3s.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,11 +23,11 @@
2323

2424

2525

26-
def download_mp3(base_url):
26+
def download_mp3(base_url, sleep_seconds: float = 3.0):
2727
print(f"Fetching content from: {base_url}")
2828
# https://guides.loc.gov/digital-scholarship/faq
2929
# Stay within 20 requests per minute rate limit.
30-
time.sleep(3)
30+
time.sleep(sleep_seconds)
3131

3232
try:
3333
response = requests.get(base_url, timeout=60)
@@ -39,7 +39,7 @@ def download_mp3(base_url):
3939
return response.content
4040

4141

42-
def download_all():
42+
def download_all(sleep_seconds: float = 3.0):
4343
jukebox_path = DATA_DIR / "jukebox.jsonl"
4444
jukebox = pandas.read_json(jukebox_path, lines=True, orient="records")
4545

@@ -50,7 +50,7 @@ def download_all():
5050
if mp3_path.exists():
5151
continue
5252

53-
mp3_bytes = download_mp3(row["MP3 URL"])
53+
mp3_bytes = download_mp3(row["MP3 URL"], sleep_seconds=sleep_seconds)
5454
if mp3_bytes is None:
5555
continue
5656

2025/national-jukebox/list_urls.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,11 +12,13 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15+
import time
16+
1517
import requests
1618
from bs4 import BeautifulSoup
1719
from urllib.parse import urljoin
1820

19-
def get_national_jukebox_song_detail_urls(base_url: str) -> list[str]:
21+
def get_national_jukebox_song_detail_urls(base_url: str, sleep_seconds: float = 3.0) -> list[str]:
2022
"""
2123
Scrapes the National Jukebox collection page to extract URLs for individual song detail pages.
2224
@@ -26,6 +28,7 @@ def get_national_jukebox_song_detail_urls(base_url: str) -> list[str]:
2628
Returns:
2729
A list of URLs for the song detail pages.
2830
"""
31+
time.sleep(sleep_seconds)
2932
print(f"Fetching content from: {base_url}")
3033
try:
3134
response = requests.get(base_url)

0 commit comments

Comments
 (0)