88import sys
99import configparser
1010import os
11+ import argparse
12+ from typing import Optional , TextIO
1113
1214def load_config ():
1315 """Load configuration from config.conf file"""
@@ -42,10 +44,11 @@ def load_config():
4244 sys .exit (1 )
4345
4446class FinalBulkImporter :
45- def __init__ (self , tandoor_url , api_token , delay ):
47+ def __init__ (self , tandoor_url : str , api_token : str , delay : int , output_file : Optional [ TextIO ] = None ):
4648 self .tandoor_url = tandoor_url
4749 self .api_token = api_token
4850 self .delay = delay
51+ self .output_file = output_file
4952
5053 self .session = requests .Session ()
5154 self .session .headers .update ({
@@ -135,19 +138,19 @@ def get_existing_source_urls(self):
135138 existing_urls = set ()
136139 page = 1
137140
138- print ("🔍 Fetching existing recipes for duplicate detection..." )
141+ self . log_output ("🔍 Fetching existing recipes for duplicate detection..." )
139142
140143 while True :
141144 try :
142145 response = self .session .get (f"{ self .tandoor_url } /api/recipe/?page={ page } &page_size=100" , timeout = 15 )
143146
144147 if response .status_code == 429 :
145- print ("⏳ Rate limited while fetching existing recipes, waiting..." )
148+ self . log_output ("⏳ Rate limited while fetching existing recipes, waiting..." )
146149 time .sleep (60 )
147150 continue
148151
149152 if response .status_code != 200 :
150- print (f"❌ Error fetching existing recipes: { response .status_code } " )
153+ self . log_output (f"❌ Error fetching existing recipes: { response .status_code } " )
151154 break
152155
153156 data = response .json ()
@@ -169,10 +172,10 @@ def get_existing_source_urls(self):
169172 time .sleep (1 ) # Small delay between pagination requests
170173
171174 except Exception as e :
172- print (f"❌ Error getting existing recipes: { e } " )
175+ self . log_output (f"❌ Error getting existing recipes: { e } " )
173176 break
174177
175- print (f"📊 Found { len (existing_urls )} existing recipes with source URLs" )
178+ self . log_output (f"📊 Found { len (existing_urls )} existing recipes with source URLs" )
176179 return existing_urls
177180
178181 def scrape_recipe (self , url ):
@@ -246,33 +249,33 @@ def create_recipe(self, recipe_data, images=None):
246249
247250 def import_single_recipe (self , url , index , total ):
248251 """Complete import process for a single recipe"""
249- print (f"\n 📝 [{ index } /{ total } ] Importing: { url } " )
252+ self . log_output (f"\n 📝 [{ index } /{ total } ] Importing: { url } " )
250253
251254 # Step 1: Scrape
252255 scrape_success , scrape_result , images , _ = self .scrape_recipe (url )
253256 if not scrape_success :
254257 if "rate_limited" in scrape_result :
255258 self .stats ['rate_limited' ] += 1
256- print ("⏳ Rate limited during scrape" )
259+ self . log_output ("⏳ Rate limited during scrape" )
257260 return "rate_limited"
258261 elif "duplicate" in scrape_result :
259262 self .stats ['duplicates' ] += 1
260- print (f"⚠️ Duplicate: { scrape_result } " )
263+ self . log_output (f"⚠️ Duplicate: { scrape_result } " )
261264 return "duplicate"
262265 elif "non_recipe:" in scrape_result :
263266 self .stats ['non_recipe_urls' ] += 1
264267 self .failed_urls ['non_recipe_urls' ].append ((url , scrape_result ))
265- print (f"🚫 Non-recipe URL: { scrape_result } " )
268+ self . log_output (f"🚫 Non-recipe URL: { scrape_result } " )
266269 return "non_recipe"
267270 elif "connection:" in scrape_result :
268271 self .stats ['connection_errors' ] += 1
269272 self .failed_urls ['connection_errors' ].append ((url , scrape_result ))
270- print (f"🌐 Connection error: { scrape_result } " )
273+ self . log_output (f"🌐 Connection error: { scrape_result } " )
271274 return "connection_error"
272275 else :
273276 self .stats ['failed_scrape' ] += 1
274277 self .failed_urls ['failed_scrape' ].append ((url , scrape_result ))
275- print (f"❌ Scrape failed: { scrape_result } " )
278+ self . log_output (f"❌ Scrape failed: { scrape_result } " )
276279 return "failed_scrape"
277280
278281 recipe_data = scrape_result
@@ -283,50 +286,57 @@ def import_single_recipe(self, url, index, total):
283286 if not create_success :
284287 if "rate_limited" in create_result :
285288 self .stats ['rate_limited' ] += 1
286- print ("⏳ Rate limited during creation" )
289+ self . log_output ("⏳ Rate limited during creation" )
287290 return "rate_limited"
288291 else :
289292 self .stats ['failed_create' ] += 1
290293 self .failed_urls ['failed_create' ].append ((url , create_result ))
291- print (f"❌ Create failed: { create_result } " )
294+ self . log_output (f"❌ Create failed: { create_result } " )
292295 return "failed_create"
293296
294297 self .stats ['successful' ] += 1
295- print (f"✅ SUCCESS: '{ recipe_name } ' (ID: { recipe_id } )" )
298+ self . log_output (f"✅ SUCCESS: '{ recipe_name } ' (ID: { recipe_id } )" )
296299 return "success"
297300
298301 def wait_for_rate_limit_reset (self ):
299302 """Wait for rate limit to reset"""
300- print ("⏳ Waiting for rate limit to reset..." )
303+ self . log_output ("⏳ Waiting for rate limit to reset..." )
301304
302305 # Try a simple GET request to check rate limit status
303306 for attempt in range (12 ): # Try for up to 10 minutes
304307 try :
305308 response = self .session .get (f"{ self .tandoor_url } /api/recipe/?page_size=1" , timeout = 10 )
306309
307310 if response .status_code != 429 :
308- print ("✅ Rate limit appears to be reset!" )
311+ self . log_output ("✅ Rate limit appears to be reset!" )
309312 return True
310313
311- print (f"⏳ Still rate limited... waiting 30s (attempt { attempt + 1 } /12)" )
314+ self . log_output (f"⏳ Still rate limited... waiting 30s (attempt { attempt + 1 } /12)" )
312315 time .sleep (30 )
313316
314317 except Exception as e :
315- print (f"⚠️ Error checking rate limit: { e } " )
318+ self . log_output (f"⚠️ Error checking rate limit: { e } " )
316319 time .sleep (30 )
317320
318- print ("❌ Rate limit did not reset after 10 minutes" )
321+ self . log_output ("❌ Rate limit did not reset after 10 minutes" )
319322 return False
320323
321- def import_from_file (self , filename , start_from = 0 , max_imports = None ):
324+ def log_output (self , message : str ) -> None :
325+ """Output message to both console and file if specified."""
326+ print (message )
327+ if self .output_file :
328+ self .output_file .write (f"{ message } \n " )
329+ self .output_file .flush ()
330+
331+ def import_from_file (self , filename : str , start_from : int = 0 , max_imports : Optional [int ] = None ) -> None :
322332 """Import recipes from URL list file"""
323- print (f"📂 Loading URLs from { filename } " )
333+ self . log_output (f"📂 Loading URLs from { filename } " )
324334
325335 try :
326336 with open (filename , 'r' ) as f :
327337 urls = [line .strip () for line in f if line .strip ()]
328338 except Exception as e :
329- print (f"❌ Error reading file: { e } " )
339+ self . log_output (f"❌ Error reading file: { e } " )
330340 return
331341
332342 # Filter and validate URLs
@@ -337,23 +347,23 @@ def import_from_file(self, filename, start_from=0, max_imports=None):
337347 else :
338348 self .stats ['invalid_urls' ] += 1
339349 self .failed_urls ['invalid_urls' ].append (url )
340- print (f"🚫 Skipping invalid/non-recipe URL: { url [:60 ]} { '...' if len (url ) > 60 else '' } " )
350+ self . log_output (f"🚫 Skipping invalid/non-recipe URL: { url [:60 ]} { '...' if len (url ) > 60 else '' } " )
341351
342- print (f"📊 Found { len (valid_urls )} valid URLs ({ self .stats ['invalid_urls' ]} invalid)" )
352+ self . log_output (f"📊 Found { len (valid_urls )} valid URLs ({ self .stats ['invalid_urls' ]} invalid)" )
343353
344354 # Apply start/limit filters
345355 if start_from > 0 :
346356 valid_urls = valid_urls [start_from :]
347- print (f"📊 Starting from index { start_from } , { len (valid_urls )} URLs remaining" )
357+ self . log_output (f"📊 Starting from index { start_from } , { len (valid_urls )} URLs remaining" )
348358
349359 if max_imports :
350360 valid_urls = valid_urls [:max_imports ]
351- print (f"📊 Limited to { max_imports } imports" )
361+ self . log_output (f"📊 Limited to { max_imports } imports" )
352362
353363 self .stats ['total' ] = len (valid_urls )
354364
355365 if not valid_urls :
356- print ("❌ No valid URLs to import!" )
366+ self . log_output ("❌ No valid URLs to import!" )
357367 return
358368
359369 # Get existing recipes to skip duplicates
@@ -362,115 +372,141 @@ def import_from_file(self, filename, start_from=0, max_imports=None):
362372 pre_existing_count = len (valid_urls ) - len (new_urls )
363373
364374 if pre_existing_count > 0 :
365- print (f"⚠️ Skipping { pre_existing_count } URLs that already exist in database" )
375+ self . log_output (f"⚠️ Skipping { pre_existing_count } URLs that already exist in database" )
366376 self .stats ['duplicates' ] += pre_existing_count
367377
368378 if not new_urls :
369- print ("✅ All URLs already imported!" )
379+ self . log_output ("✅ All URLs already imported!" )
370380 return
371381
372- print (f"🚀 Starting import of { len (new_urls )} new recipes..." )
382+ self . log_output (f"🚀 Starting import of { len (new_urls )} new recipes..." )
373383 estimated_minutes = (len (new_urls ) * self .delay ) / 60
374- print (f"⏱️ Estimated time: { estimated_minutes :.1f} minutes" )
384+ self . log_output (f"⏱️ Estimated time: { estimated_minutes :.1f} minutes" )
375385
376386 # Import each URL
377387 for i , url in enumerate (new_urls , 1 ):
378388 result = self .import_single_recipe (url , i , len (new_urls ))
379389
380390 # Handle rate limiting
381391 if result == "rate_limited" :
382- print ("⏳ Hit rate limit, waiting for reset..." )
392+ self . log_output ("⏳ Hit rate limit, waiting for reset..." )
383393 if self .wait_for_rate_limit_reset ():
384- print ("🔄 Retrying current recipe..." )
394+ self . log_output ("🔄 Retrying current recipe..." )
385395 result = self .import_single_recipe (url , i , len (new_urls ))
386396 else :
387- print ("❌ Could not recover from rate limit, stopping import" )
397+ self . log_output ("❌ Could not recover from rate limit, stopping import" )
388398 break
389399
390400 # Print progress
391401 success_rate = (self .stats ['successful' ] / i ) * 100 if i > 0 else 0
392- print (f"📊 Progress: { i } /{ len (new_urls )} ({ i / len (new_urls )* 100 :.1f} %) | Success rate: { success_rate :.1f} %" )
393- print (f"📈 Stats: ✅{ self .stats ['successful' ]} ⚠️{ self .stats ['duplicates' ]} "
402+ self . log_output (f"📊 Progress: { i } /{ len (new_urls )} ({ i / len (new_urls )* 100 :.1f} %) | Success rate: { success_rate :.1f} %" )
403+ self . log_output (f"📈 Stats: ✅{ self .stats ['successful' ]} ⚠️{ self .stats ['duplicates' ]} "
394404 f"🚫{ self .stats ['non_recipe_urls' ]} 🌐{ self .stats ['connection_errors' ]} "
395405 f"❌{ self .stats ['failed_scrape' ]+ self .stats ['failed_create' ]} ⏳{ self .stats ['rate_limited' ]} " )
396406
397407 # Wait between requests (except on last one)
398408 if i < len (new_urls ):
399- print (f"⏱️ Waiting { self .delay } s before next import..." )
409+ self . log_output (f"⏱️ Waiting { self .delay } s before next import..." )
400410 time .sleep (self .delay )
401411
402412 # Final report
403- print ("\n 🎉 BULK IMPORT COMPLETE!" )
404- print ("📊 Final Stats:" )
405- print (f" Total processed: { self .stats ['total' ]} " )
406- print (f" ✅ Successful imports: { self .stats ['successful' ]} " )
407- print (f" ⚠️ Duplicates skipped: { self .stats ['duplicates' ]} " )
408- print (f" ❌ Failed scraping: { self .stats ['failed_scrape' ]} " )
409- print (f" ❌ Failed creation: { self .stats ['failed_create' ]} " )
410- print (f" 🚫 Non-recipe URLs: { self .stats ['non_recipe_urls' ]} " )
411- print (f" 🌐 Connection errors: { self .stats ['connection_errors' ]} " )
412- print (f" ⏳ Rate limited: { self .stats ['rate_limited' ]} " )
413- print (f" 🚫 Invalid URLs: { self .stats ['invalid_urls' ]} " )
413+ self . log_output ("\n 🎉 BULK IMPORT COMPLETE!" )
414+ self . log_output ("📊 Final Stats:" )
415+ self . log_output (f" Total processed: { self .stats ['total' ]} " )
416+ self . log_output (f" ✅ Successful imports: { self .stats ['successful' ]} " )
417+ self . log_output (f" ⚠️ Duplicates skipped: { self .stats ['duplicates' ]} " )
418+ self . log_output (f" ❌ Failed scraping: { self .stats ['failed_scrape' ]} " )
419+ self . log_output (f" ❌ Failed creation: { self .stats ['failed_create' ]} " )
420+ self . log_output (f" 🚫 Non-recipe URLs: { self .stats ['non_recipe_urls' ]} " )
421+ self . log_output (f" 🌐 Connection errors: { self .stats ['connection_errors' ]} " )
422+ self . log_output (f" ⏳ Rate limited: { self .stats ['rate_limited' ]} " )
423+ self . log_output (f" 🚫 Invalid URLs: { self .stats ['invalid_urls' ]} " )
414424
415425 success_rate = (self .stats ['successful' ] / max (1 , len (new_urls ))) * 100
416- print (f" 📈 Success rate: { success_rate :.1f} %" )
426+ self . log_output (f" 📈 Success rate: { success_rate :.1f} %" )
417427
418428 # Display failed URLs if any
419429 total_failures = (self .stats ['failed_scrape' ] + self .stats ['failed_create' ] +
420430 self .stats ['non_recipe_urls' ] + self .stats ['connection_errors' ] +
421431 self .stats ['invalid_urls' ])
422432
423433 if total_failures > 0 :
424- print (f"\n ❌ FAILED URLS ({ total_failures } total):" )
434+ self . log_output (f"\n ❌ FAILED URLS ({ total_failures } total):" )
425435
426436 if self .failed_urls ['invalid_urls' ]:
427- print (f"\n 🚫 Invalid URLs ({ len (self .failed_urls ['invalid_urls' ])} ):" )
437+ self . log_output (f"\n 🚫 Invalid URLs ({ len (self .failed_urls ['invalid_urls' ])} ):" )
428438 for url in self .failed_urls ['invalid_urls' ]:
429- print (f" { url } " )
439+ self . log_output (f" { url } " )
430440
431441 if self .failed_urls ['non_recipe_urls' ]:
432- print (f"\n 🚫 Non-recipe URLs ({ len (self .failed_urls ['non_recipe_urls' ])} ):" )
442+ self . log_output (f"\n 🚫 Non-recipe URLs ({ len (self .failed_urls ['non_recipe_urls' ])} ):" )
433443 for url , reason in self .failed_urls ['non_recipe_urls' ]:
434- print (f" { url } - { reason } " )
444+ self . log_output (f" { url } - { reason } " )
435445
436446 if self .failed_urls ['connection_errors' ]:
437- print (f"\n 🌐 Connection errors ({ len (self .failed_urls ['connection_errors' ])} ):" )
447+ self . log_output (f"\n 🌐 Connection errors ({ len (self .failed_urls ['connection_errors' ])} ):" )
438448 for url , reason in self .failed_urls ['connection_errors' ]:
439- print (f" { url } - { reason } " )
449+ self . log_output (f" { url } - { reason } " )
440450
441451 if self .failed_urls ['failed_scrape' ]:
442- print (f"\n ❌ Failed scraping ({ len (self .failed_urls ['failed_scrape' ])} ):" )
452+ self . log_output (f"\n ❌ Failed scraping ({ len (self .failed_urls ['failed_scrape' ])} ):" )
443453 for url , reason in self .failed_urls ['failed_scrape' ]:
444- print (f" { url } - { reason } " )
454+ self . log_output (f" { url } - { reason } " )
445455
446456 if self .failed_urls ['failed_create' ]:
447- print (f"\n ❌ Failed creation ({ len (self .failed_urls ['failed_create' ])} ):" )
457+ self . log_output (f"\n ❌ Failed creation ({ len (self .failed_urls ['failed_create' ])} ):" )
448458 for url , reason in self .failed_urls ['failed_create' ]:
449- print (f" { url } - { reason } " )
459+ self . log_output (f" { url } - { reason } " )
450460 else :
451- print ("\n ✅ No failed URLs!" )
452-
453-
def main() -> None:
    """CLI entry point: parse arguments, wire up the importer, run it."""
    arg_parser = argparse.ArgumentParser(
        description="Bulk import recipes from URLs into Tandoor Recipes",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""Examples:
  %(prog)s url-list.txt
  %(prog)s url-list.txt --start-from 100
  %(prog)s url-list.txt --max-imports 50 --output results.log
  %(prog)s url-list.txt --start-from 100 --max-imports 25 -o import.log""",
    )
    arg_parser.add_argument("url_file", help="Path to text file containing recipe URLs")
    arg_parser.add_argument("--start-from", type=int, default=0,
                            help="Line number to start from (default: 0)")
    arg_parser.add_argument("--max-imports", type=int,
                            help="Maximum number of recipes to import")
    arg_parser.add_argument("-o", "--output", type=str,
                            help="Output results to file")
    args = arg_parser.parse_args()

    # Pull connection settings from config.conf (exits on failure).
    tandoor_url, api_token, delay = load_config()

    # Open the optional log file up front so a bad path fails fast,
    # before any import work starts.
    log_fh = None
    if args.output:
        try:
            log_fh = open(args.output, 'w', encoding='utf-8')
        except IOError as e:
            print(f"❌ Error opening output file {args.output}: {e}")
            sys.exit(1)

    try:
        importer = FinalBulkImporter(tandoor_url, api_token, delay, log_fh)

        for banner_line in ("🔧 TANDOOR BULK RECIPE IMPORTER",
                            "Using corrected two-step import process",
                            "=" * 60):
            importer.log_output(banner_line)

        importer.import_from_file(args.url_file, args.start_from, args.max_imports)
    finally:
        # Always release the log file, even if the import raises.
        if log_fh:
            log_fh.close()
474510
475511
476512if __name__ == "__main__" :
0 commit comments