11#!/usr/bin/env python
22"""
33Fetch high-level Europeana statistics for Quantifying the Commons.
4- Aggregates data by DATA_PROVIDER, LEGAL_TOOL, and COUNT.
4+ Aggregates data by DATA_PROVIDER, LEGAL_TOOL, THEME, and COUNT.
55"""
66
77# Standard library
2020from pygments import highlight
2121from pygments .formatters import TerminalFormatter
2222from pygments .lexers import PythonTracebackLexer
23+ from requests .adapters import HTTPAdapter , Retry
2324
2425# Add parent directory so shared can be imported
2526sys .path .append (os .path .join (os .path .dirname (__file__ ), ".." ))
# API key is read from the environment; it is None when the variable is unset
EUROPEANA_API_KEY = os.getenv("EUROPEANA_API_KEY")
# Europeana Search API endpoint used for every request in this script
BASE_URL = "https://api.europeana.eu/record/v2/search.json"
# Output CSV for the aggregated counts, located in the data phase directory
FILE_STATS = shared.path_join(PATHS["data_phase"], "europeana_1_count.csv")
# Column names of FILE_STATS. They must match the keys of the aggregated row
# dicts exactly ("DATA_PROVIDER", "LEGAL_TOOL", "THEME", "COUNT"), so no
# padding whitespace is allowed inside the names.
HEADER_STATS = ["DATA_PROVIDER", "LEGAL_TOOL", "THEME", "COUNT"]
# Name of the current quarter, taken from the last path component
QUARTER = os.path.basename(PATHS["data_quarter"])

# Log the start of script execution
LOGGER.info("Europeana high-level stats script execution started.")
4546
4647
4748def parse_arguments ():
48- """
49- Parse command-line options, returns parsed argument namespace.
50- """
49+ """Parse command-line options."""
5150 LOGGER .info ("Parsing command-line options." )
5251 parser = argparse .ArgumentParser (description = __doc__ )
5352 parser .add_argument (
@@ -90,10 +89,28 @@ def initialize_all_data_files(args):
9089 initialize_data_file (FILE_STATS , HEADER_STATS )
9190
9291
def get_requests_session():
    """
    Build the HTTP session used for Europeana API requests.

    The returned session retries failed HTTPS requests (up to 5 retries,
    backoff factor 5) for the status codes in shared.RETRY_STATUS_FORCELIST
    and sends JSON accept and project User-Agent headers by default.
    """
    retry_policy = Retry(
        total=5,
        backoff_factor=5,
        status_forcelist=shared.RETRY_STATUS_FORCELIST,
    )
    https_adapter = HTTPAdapter(max_retries=retry_policy)
    default_headers = {
        "accept": "application/json",
        "User-Agent": shared.USER_AGENT,
    }

    http = requests.Session()
    http.headers.update(default_headers)
    # Only HTTPS URLs get the retrying adapter
    http.mount("https://", https_adapter)
    return http
108+
109+
93110def fetch_europeana_data (args ):
94111 """
95112 Fetch and aggregate data from the Europeana Search API
96- by DATA_PROVIDER and LEGAL_TOOL .
113+ by DATA_PROVIDER, LEGAL_TOOL, and THEME .
97114 """
98115 LOGGER .info ("Fetching aggregated Europeana data." )
99116
@@ -102,32 +119,50 @@ def fetch_europeana_data(args):
102119 "EUROPEANA_API_KEY not found in environment variables" , 1
103120 )
104121
105- # Try different queries to get diverse content
106- queries = ["art" , "history" , "science" , "music" , "photography" ]
107- items_per_query = max (20 , args .limit // len (queries ))
122+ # Europeana themes to query (one request is made per theme),
123+ # as listed on Europeana's site
124+ themes = [
125+ "art" ,
126+ "fashion" ,
127+ "music" ,
128+ "industrial" ,
129+ "sport" ,
130+ "photography" ,
131+ "archaeology" ,
132+ ]
133+
134+ items_per_query = max (20 , args .limit // len (themes ))
108135 all_items = []
136+ # Initialize a session for efficient and reliable requests
137+ session = get_requests_session ()
109138
110- for query in queries :
139+ for theme in themes :
111140 params = {
112141 "wskey" : EUROPEANA_API_KEY ,
113142 "rows" : min (items_per_query , 20 ),
114143 "profile" : "rich" ,
115- "query" : query ,
144+ "query" : "*" ,
145+ "theme" : theme ,
116146 }
117147
118148 try :
119149 LOGGER .info (
120- f"Fetching { params ['rows' ]} records for query : '{ query } '"
150+ f"Fetching { params ['rows' ]} records for theme : '{ theme } '"
121151 )
122- response = requests .get (BASE_URL , params = params , timeout = 30 )
123- response .raise_for_status ()
124- results = response .json ()
125- items = results .get ("items" , [])
152+ with session .get (BASE_URL , params = params , timeout = 30 ) as response :
153+ response .raise_for_status ()
154+ results = response .json ()
155+ items = results .get ("items" , [])
156+
157+ # Tag each item with the theme used for easy tracking
158+ for item in items :
159+ item ["theme_used" ] = theme
160+
126161 all_items .extend (items )
127- LOGGER .info (f"Retrieved { len (items )} items for ' { query } '" )
128- time .sleep (1 ) # Be nice to the API
162+ LOGGER .info (f"Retrieved { len (items )} items for theme ' { theme } '" )
163+ time .sleep (1 )
129164 except requests .RequestException as e :
130- LOGGER .warning (f"Failed to fetch data for query '{ query } ': { e } " )
165+ LOGGER .warning (f"Failed to fetch data for theme '{ theme } ': { e } " )
131166 continue
132167
133168 if not all_items :
@@ -136,39 +171,33 @@ def fetch_europeana_data(args):
136171
137172 LOGGER .info (f"Total items retrieved: { len (all_items )} " )
138173
139- # Aggregate by data provider and legal tool
140- aggregation = defaultdict (lambda : defaultdict (int ))
174+ # Aggregate by data provider, legal tool, and theme
175+ aggregation = defaultdict (lambda : defaultdict (lambda : defaultdict ( int ) ))
141176
142177 for item in all_items :
143178 # Handle dataProvider (can be array or string)
144179 data_providers = item .get ("dataProvider" , [])
145180 if isinstance (data_providers , str ):
146181 data_provider = data_providers
147182 elif data_providers and isinstance (data_providers , list ):
148- data_provider = data_providers [0 ] if data_providers else "Unknown"
183+ data_provider = data_providers [0 ]
149184 else :
150185 data_provider = "Unknown"
151186
152- # Handle rights/license information - extract only the license code
187+ # Handle rights/license information
153188 rights = item .get ("rights" , [])
154189 if isinstance (rights , str ):
155190 legal_tool = rights
156191 elif rights and isinstance (rights , list ):
157- legal_tool = rights [0 ] if rights else "Unknown"
192+ legal_tool = rights [0 ]
158193 else :
159194 legal_tool = "Unknown"
160195
161196 # Simplify legal tool (e.g., extract 'by/4.0/' → 'CC BY 4.0')
162- if (
163- legal_tool
164- and legal_tool != "Unknown"
165- and legal_tool .startswith ("http" )
166- ):
197+ if legal_tool and legal_tool .startswith ("http" ):
167198 parts = legal_tool .strip ("/" ).split ("/" )
168- last_parts = parts [- 2 :] # e.g., ['by', '4.0'] or ['InC', '1.0']
199+ last_parts = parts [- 2 :]
169200 if last_parts :
170- # Join neatly with spaces and add CC if
171- # it’s a Creative Commons license
172201 joined = " " .join (part .upper () for part in last_parts if part )
173202 if "creativecommons.org" in legal_tool :
174203 legal_tool = f"CC { joined } "
@@ -177,22 +206,28 @@ def fetch_europeana_data(args):
177206 else :
178207 legal_tool = "Unknown"
179208
180- aggregation [data_provider ][legal_tool ] += 1
209+ # Use the theme from the query loop
210+ theme = item .get ("theme_used" , "Unknown" )
211+
212+ aggregation [data_provider ][legal_tool ][theme ] += 1
181213
182214 # Convert to flat list
183215 output = []
184216 for provider , licenses in aggregation .items ():
185- for legal_tool , count in licenses .items ():
186- output .append (
187- {
188- "DATA_PROVIDER" : provider ,
189- "LEGAL_TOOL" : legal_tool ,
190- "COUNT" : count ,
191- }
192- )
217+ for legal_tool , themes_dict in licenses .items ():
218+ for theme , count in themes_dict .items ():
219+ output .append (
220+ {
221+ "DATA_PROVIDER" : provider ,
222+ "LEGAL_TOOL" : legal_tool ,
223+ "THEME" : theme ,
224+ "COUNT" : count ,
225+ }
226+ )
193227
194228 LOGGER .info (
195- f"Aggregated data into { len (output )} provider-license combinations"
229+ f"Aggregated data into { len (output )} "
230+ f"provider-license-theme combinations"
196231 )
197232 return output
198233
0 commit comments