Skip to content

Commit 8a64542

Browse files
committed
Fix formatting and linting issues in europeana_fetch.py to comply with pre-commit hooks
1 parent 45acb65 commit 8a64542

File tree

1 file changed

+77
-42
lines changed

1 file changed

+77
-42
lines changed

scripts/1-fetch/europeana_fetch.py

Lines changed: 77 additions & 42 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
#!/usr/bin/env python
22
"""
33
Fetch high-level Europeana statistics for Quantifying the Commons.
4-
Aggregates data by DATA_PROVIDER, LEGAL_TOOL, and COUNT.
4+
Aggregates data by DATA_PROVIDER, LEGAL_TOOL, THEME, and COUNT.
55
"""
66

77
# Standard library
@@ -20,6 +20,7 @@
2020
from pygments import highlight
2121
from pygments.formatters import TerminalFormatter
2222
from pygments.lexers import PythonTracebackLexer
23+
from requests.adapters import HTTPAdapter, Retry
2324

2425
# Add parent directory so shared can be imported
2526
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
@@ -37,17 +38,15 @@
3738
EUROPEANA_API_KEY = os.getenv("EUROPEANA_API_KEY")
3839
BASE_URL = "https://api.europeana.eu/record/v2/search.json"
3940
FILE_STATS = shared.path_join(PATHS["data_phase"], "europeana_1_count.csv")
40-
HEADER_STATS = ["DATA_PROVIDER", "LEGAL_TOOL", "COUNT"]
41+
HEADER_STATS = ["DATA_PROVIDER", "LEGAL_TOOL", "THEME", "COUNT"]
4142
QUARTER = os.path.basename(PATHS["data_quarter"])
4243

4344
# Log the start of script execution
4445
LOGGER.info("Europeana high-level stats script execution started.")
4546

4647

4748
def parse_arguments():
48-
"""
49-
Parse command-line options, returns parsed argument namespace.
50-
"""
49+
"""Parse command-line options."""
5150
LOGGER.info("Parsing command-line options.")
5251
parser = argparse.ArgumentParser(description=__doc__)
5352
parser.add_argument(
@@ -90,10 +89,28 @@ def initialize_all_data_files(args):
9089
initialize_data_file(FILE_STATS, HEADER_STATS)
9190

9291

92+
def get_requests_session():
    """
    Build and return a configured ``requests.Session``.

    The session retries HTTPS requests (up to 5 total retries, backoff
    factor 5) for the status codes in ``shared.RETRY_STATUS_FORCELIST``
    and sends JSON ``accept`` and project ``User-Agent`` headers by default.
    """
    retry_policy = Retry(
        total=5,
        backoff_factor=5,
        status_forcelist=shared.RETRY_STATUS_FORCELIST,
    )
    https_adapter = HTTPAdapter(max_retries=retry_policy)

    default_headers = {
        "accept": "application/json",
        "User-Agent": shared.USER_AGENT,
    }

    http = requests.Session()
    # Only the https:// prefix gets the retrying adapter; other schemes
    # keep the session's default adapter.
    http.mount("https://", https_adapter)
    http.headers.update(default_headers)
    return http
108+
109+
93110
def fetch_europeana_data(args):
94111
"""
95112
Fetch and aggregate data from the Europeana Search API
96-
by DATA_PROVIDER and LEGAL_TOOL.
113+
by DATA_PROVIDER, LEGAL_TOOL, and THEME.
97114
"""
98115
LOGGER.info("Fetching aggregated Europeana data.")
99116

@@ -102,32 +119,50 @@ def fetch_europeana_data(args):
102119
"EUROPEANA_API_KEY not found in environment variables", 1
103120
)
104121

105-
# Try different queries to get diverse content
106-
queries = ["art", "history", "science", "music", "photography"]
107-
items_per_query = max(20, args.limit // len(queries))
122+
# Define Europeana themes to query
123+
# (theme names as listed on Europeana's website)
124+
themes = [
125+
"art",
126+
"fashion",
127+
"music",
128+
"industrial",
129+
"sport",
130+
"photography",
131+
"archaeology",
132+
]
133+
134+
items_per_query = max(20, args.limit // len(themes))
108135
all_items = []
136+
# Initialize a session for efficient and reliable requests
137+
session = get_requests_session()
109138

110-
for query in queries:
139+
for theme in themes:
111140
params = {
112141
"wskey": EUROPEANA_API_KEY,
113142
"rows": min(items_per_query, 20),
114143
"profile": "rich",
115-
"query": query,
144+
"query": "*",
145+
"theme": theme,
116146
}
117147

118148
try:
119149
LOGGER.info(
120-
f"Fetching {params['rows']} records for query: '{query}'"
150+
f"Fetching {params['rows']} records for theme: '{theme}'"
121151
)
122-
response = requests.get(BASE_URL, params=params, timeout=30)
123-
response.raise_for_status()
124-
results = response.json()
125-
items = results.get("items", [])
152+
with session.get(BASE_URL, params=params, timeout=30) as response:
153+
response.raise_for_status()
154+
results = response.json()
155+
items = results.get("items", [])
156+
157+
# Tag each item with the theme used for easy tracking
158+
for item in items:
159+
item["theme_used"] = theme
160+
126161
all_items.extend(items)
127-
LOGGER.info(f"Retrieved {len(items)} items for '{query}'")
128-
time.sleep(1) # Be nice to the API
162+
LOGGER.info(f"Retrieved {len(items)} items for theme '{theme}'")
163+
time.sleep(1)
129164
except requests.RequestException as e:
130-
LOGGER.warning(f"Failed to fetch data for query '{query}': {e}")
165+
LOGGER.warning(f"Failed to fetch data for theme '{theme}': {e}")
131166
continue
132167

133168
if not all_items:
@@ -136,39 +171,33 @@ def fetch_europeana_data(args):
136171

137172
LOGGER.info(f"Total items retrieved: {len(all_items)}")
138173

139-
# Aggregate by data provider and legal tool
140-
aggregation = defaultdict(lambda: defaultdict(int))
174+
# Aggregate by data provider, legal tool, and theme
175+
aggregation = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
141176

142177
for item in all_items:
143178
# Handle dataProvider (can be array or string)
144179
data_providers = item.get("dataProvider", [])
145180
if isinstance(data_providers, str):
146181
data_provider = data_providers
147182
elif data_providers and isinstance(data_providers, list):
148-
data_provider = data_providers[0] if data_providers else "Unknown"
183+
data_provider = data_providers[0]
149184
else:
150185
data_provider = "Unknown"
151186

152-
# Handle rights/license information - extract only the license code
187+
# Handle rights/license information
153188
rights = item.get("rights", [])
154189
if isinstance(rights, str):
155190
legal_tool = rights
156191
elif rights and isinstance(rights, list):
157-
legal_tool = rights[0] if rights else "Unknown"
192+
legal_tool = rights[0]
158193
else:
159194
legal_tool = "Unknown"
160195

161196
# Simplify legal tool (e.g., extract 'by/4.0/' → 'CC BY 4.0')
162-
if (
163-
legal_tool
164-
and legal_tool != "Unknown"
165-
and legal_tool.startswith("http")
166-
):
197+
if legal_tool and legal_tool.startswith("http"):
167198
parts = legal_tool.strip("/").split("/")
168-
last_parts = parts[-2:] # e.g., ['by', '4.0'] or ['InC', '1.0']
199+
last_parts = parts[-2:]
169200
if last_parts:
170-
# Join neatly with spaces and add CC if
171-
# it’s a Creative Commons license
172201
joined = " ".join(part.upper() for part in last_parts if part)
173202
if "creativecommons.org" in legal_tool:
174203
legal_tool = f"CC {joined}"
@@ -177,22 +206,28 @@ def fetch_europeana_data(args):
177206
else:
178207
legal_tool = "Unknown"
179208

180-
aggregation[data_provider][legal_tool] += 1
209+
# Use the theme from the query loop
210+
theme = item.get("theme_used", "Unknown")
211+
212+
aggregation[data_provider][legal_tool][theme] += 1
181213

182214
# Convert to flat list
183215
output = []
184216
for provider, licenses in aggregation.items():
185-
for legal_tool, count in licenses.items():
186-
output.append(
187-
{
188-
"DATA_PROVIDER": provider,
189-
"LEGAL_TOOL": legal_tool,
190-
"COUNT": count,
191-
}
192-
)
217+
for legal_tool, themes_dict in licenses.items():
218+
for theme, count in themes_dict.items():
219+
output.append(
220+
{
221+
"DATA_PROVIDER": provider,
222+
"LEGAL_TOOL": legal_tool,
223+
"THEME": theme,
224+
"COUNT": count,
225+
}
226+
)
193227

194228
LOGGER.info(
195-
f"Aggregated data into {len(output)} provider-license combinations"
229+
f"Aggregated data into {len(output)} "
230+
f"provider-license-theme combinations"
196231
)
197232
return output
198233

0 commit comments

Comments
 (0)