Skip to content

Commit 45acb65

Browse files
committed
Done the necessary changes
1 parent ba2467c commit 45acb65

File tree

2 files changed

+266
-0
lines changed

2 files changed

+266
-0
lines changed

env.example

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,5 +34,6 @@
3434
## Flickr
3535

3636
# "The flickr developer guide: https://www.flickr.com/services/developer/"
37+
3738
# FLICKR_API_KEY =
3839
# FLICKR_API_SECRET =

scripts/1-fetch/europeana_fetch.py

Lines changed: 265 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,265 @@
1+
#!/usr/bin/env python
2+
"""
3+
Fetch high-level Europeana statistics for Quantifying the Commons.
4+
Aggregates data by DATA_PROVIDER, LEGAL_TOOL, and COUNT.
5+
"""
6+
7+
# Standard library
8+
import argparse
9+
import csv
10+
import os
11+
import sys
12+
import textwrap
13+
import time
14+
import traceback
15+
from collections import defaultdict
16+
17+
# Third-party
18+
import requests
19+
from dotenv import load_dotenv
20+
from pygments import highlight
21+
from pygments.formatters import TerminalFormatter
22+
from pygments.lexers import PythonTracebackLexer
23+
24+
# Add parent directory so shared can be imported
25+
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
26+
27+
# First-party/Local
28+
import shared # noqa: E402
29+
30+
# Setup
31+
LOGGER, PATHS = shared.setup(__file__)
32+
33+
# Load environment variables
34+
load_dotenv(PATHS["dotenv"])
35+
36+
# Constants
37+
EUROPEANA_API_KEY = os.getenv("EUROPEANA_API_KEY")
38+
BASE_URL = "https://api.europeana.eu/record/v2/search.json"
39+
FILE_STATS = shared.path_join(PATHS["data_phase"], "europeana_1_count.csv")
40+
HEADER_STATS = ["DATA_PROVIDER", "LEGAL_TOOL", "COUNT"]
41+
QUARTER = os.path.basename(PATHS["data_quarter"])
42+
43+
# Log the start of script execution
44+
LOGGER.info("Europeana high-level stats script execution started.")
45+
46+
47+
def parse_arguments():
48+
"""
49+
Parse command-line options, returns parsed argument namespace.
50+
"""
51+
LOGGER.info("Parsing command-line options.")
52+
parser = argparse.ArgumentParser(description=__doc__)
53+
parser.add_argument(
54+
"--limit",
55+
type=int,
56+
default=100,
57+
help="Limit number of results to fetch (default: 100).",
58+
)
59+
parser.add_argument(
60+
"--enable-save",
61+
action="store_true",
62+
help="Enable saving aggregated results to CSV.",
63+
)
64+
parser.add_argument(
65+
"--enable-git",
66+
action="store_true",
67+
help="Enable git actions (fetch, merge, add, commit, push).",
68+
)
69+
args = parser.parse_args()
70+
if not args.enable_save and args.enable_git:
71+
parser.error("--enable-git requires --enable-save")
72+
return args
73+
74+
75+
def initialize_data_file(file_path, header):
76+
"""Initialize the data file with a header if it doesn't exist."""
77+
if not os.path.isfile(file_path):
78+
with open(file_path, "w", newline="") as file_obj:
79+
writer = csv.DictWriter(
80+
file_obj, fieldnames=header, dialect="unix"
81+
)
82+
writer.writeheader()
83+
84+
85+
def initialize_all_data_files(args):
86+
"""Ensure data directories and files exist."""
87+
if not args.enable_save:
88+
return
89+
os.makedirs(PATHS["data_phase"], exist_ok=True)
90+
initialize_data_file(FILE_STATS, HEADER_STATS)
91+
92+
93+
def fetch_europeana_data(args):
94+
"""
95+
Fetch and aggregate data from the Europeana Search API
96+
by DATA_PROVIDER and LEGAL_TOOL.
97+
"""
98+
LOGGER.info("Fetching aggregated Europeana data.")
99+
100+
if not EUROPEANA_API_KEY:
101+
raise shared.QuantifyingException(
102+
"EUROPEANA_API_KEY not found in environment variables", 1
103+
)
104+
105+
# Try different queries to get diverse content
106+
queries = ["art", "history", "science", "music", "photography"]
107+
items_per_query = max(20, args.limit // len(queries))
108+
all_items = []
109+
110+
for query in queries:
111+
params = {
112+
"wskey": EUROPEANA_API_KEY,
113+
"rows": min(items_per_query, 20),
114+
"profile": "rich",
115+
"query": query,
116+
}
117+
118+
try:
119+
LOGGER.info(
120+
f"Fetching {params['rows']} records for query: '{query}'"
121+
)
122+
response = requests.get(BASE_URL, params=params, timeout=30)
123+
response.raise_for_status()
124+
results = response.json()
125+
items = results.get("items", [])
126+
all_items.extend(items)
127+
LOGGER.info(f"Retrieved {len(items)} items for '{query}'")
128+
time.sleep(1) # Be nice to the API
129+
except requests.RequestException as e:
130+
LOGGER.warning(f"Failed to fetch data for query '{query}': {e}")
131+
continue
132+
133+
if not all_items:
134+
LOGGER.error("No items retrieved from any query")
135+
return []
136+
137+
LOGGER.info(f"Total items retrieved: {len(all_items)}")
138+
139+
# Aggregate by data provider and legal tool
140+
aggregation = defaultdict(lambda: defaultdict(int))
141+
142+
for item in all_items:
143+
# Handle dataProvider (can be array or string)
144+
data_providers = item.get("dataProvider", [])
145+
if isinstance(data_providers, str):
146+
data_provider = data_providers
147+
elif data_providers and isinstance(data_providers, list):
148+
data_provider = data_providers[0] if data_providers else "Unknown"
149+
else:
150+
data_provider = "Unknown"
151+
152+
# Handle rights/license information - extract only the license code
153+
rights = item.get("rights", [])
154+
if isinstance(rights, str):
155+
legal_tool = rights
156+
elif rights and isinstance(rights, list):
157+
legal_tool = rights[0] if rights else "Unknown"
158+
else:
159+
legal_tool = "Unknown"
160+
161+
# Simplify legal tool (e.g., extract 'by/4.0/' → 'CC BY 4.0')
162+
if (
163+
legal_tool
164+
and legal_tool != "Unknown"
165+
and legal_tool.startswith("http")
166+
):
167+
parts = legal_tool.strip("/").split("/")
168+
last_parts = parts[-2:] # e.g., ['by', '4.0'] or ['InC', '1.0']
169+
if last_parts:
170+
# Join neatly with spaces and add CC if
171+
# it’s a Creative Commons license
172+
joined = " ".join(part.upper() for part in last_parts if part)
173+
if "creativecommons.org" in legal_tool:
174+
legal_tool = f"CC {joined}"
175+
else:
176+
legal_tool = joined
177+
else:
178+
legal_tool = "Unknown"
179+
180+
aggregation[data_provider][legal_tool] += 1
181+
182+
# Convert to flat list
183+
output = []
184+
for provider, licenses in aggregation.items():
185+
for legal_tool, count in licenses.items():
186+
output.append(
187+
{
188+
"DATA_PROVIDER": provider,
189+
"LEGAL_TOOL": legal_tool,
190+
"COUNT": count,
191+
}
192+
)
193+
194+
LOGGER.info(
195+
f"Aggregated data into {len(output)} provider-license combinations"
196+
)
197+
return output
198+
199+
200+
def save_to_csv(args, data):
201+
"""Save aggregated data to CSV."""
202+
if not args.enable_save:
203+
LOGGER.info("Save disabled - skipping file write")
204+
return
205+
if not data:
206+
LOGGER.warning("No data to save")
207+
return
208+
209+
with open(FILE_STATS, "w", newline="") as file_obj:
210+
writer = csv.DictWriter(
211+
file_obj, fieldnames=HEADER_STATS, dialect="unix"
212+
)
213+
writer.writeheader()
214+
for row in data:
215+
writer.writerow(row)
216+
LOGGER.info(f"Saved {len(data)} aggregated rows to {FILE_STATS}.")
217+
218+
219+
def main():
220+
args = parse_arguments()
221+
shared.paths_log(LOGGER, PATHS)
222+
shared.git_fetch_and_merge(args, PATHS["repo"])
223+
initialize_all_data_files(args)
224+
225+
data = fetch_europeana_data(args)
226+
save_to_csv(args, data)
227+
228+
args = shared.git_add_and_commit(
229+
args,
230+
PATHS["repo"],
231+
PATHS["data_quarter"],
232+
f"Add and commit Europeana high-level statistics for {QUARTER}",
233+
)
234+
shared.git_push_changes(args, PATHS["repo"])
235+
236+
LOGGER.info("Europeana high-level stats script completed successfully.")
237+
238+
239+
if __name__ == "__main__":
240+
try:
241+
main()
242+
except shared.QuantifyingException as e:
243+
if e.exit_code == 0:
244+
LOGGER.info(e.message)
245+
else:
246+
LOGGER.error(e.message)
247+
sys.exit(e.exit_code)
248+
except SystemExit as e:
249+
if e.code != 0:
250+
LOGGER.error(f"System exit with code: {e.code}")
251+
sys.exit(e.code)
252+
except KeyboardInterrupt:
253+
LOGGER.info("(130) Halted via KeyboardInterrupt.")
254+
sys.exit(130)
255+
except Exception:
256+
traceback_formatted = textwrap.indent(
257+
highlight(
258+
traceback.format_exc(),
259+
PythonTracebackLexer(),
260+
TerminalFormatter(),
261+
),
262+
" ",
263+
)
264+
LOGGER.critical(f"(1) Unhandled exception:\n{traceback_formatted}")
265+
sys.exit(1)

0 commit comments

Comments
 (0)