Skip to content

Commit 1bee388

Browse files
committed
Enhanced logging operations.
1 parent a9dad31 commit 1bee388

File tree

1 file changed

+118
-82
lines changed

1 file changed

+118
-82
lines changed

tandoor-importer.py

Lines changed: 118 additions & 82 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88
import sys
99
import configparser
1010
import os
11+
import argparse
12+
from typing import Optional, TextIO
1113

1214
def load_config():
1315
"""Load configuration from config.conf file"""
@@ -42,10 +44,11 @@ def load_config():
4244
sys.exit(1)
4345

4446
class FinalBulkImporter:
45-
def __init__(self, tandoor_url, api_token, delay):
47+
def __init__(self, tandoor_url: str, api_token: str, delay: int, output_file: Optional[TextIO] = None):
4648
self.tandoor_url = tandoor_url
4749
self.api_token = api_token
4850
self.delay = delay
51+
self.output_file = output_file
4952

5053
self.session = requests.Session()
5154
self.session.headers.update({
@@ -135,19 +138,19 @@ def get_existing_source_urls(self):
135138
existing_urls = set()
136139
page = 1
137140

138-
print("🔍 Fetching existing recipes for duplicate detection...")
141+
self.log_output("🔍 Fetching existing recipes for duplicate detection...")
139142

140143
while True:
141144
try:
142145
response = self.session.get(f"{self.tandoor_url}/api/recipe/?page={page}&page_size=100", timeout=15)
143146

144147
if response.status_code == 429:
145-
print("⏳ Rate limited while fetching existing recipes, waiting...")
148+
self.log_output("⏳ Rate limited while fetching existing recipes, waiting...")
146149
time.sleep(60)
147150
continue
148151

149152
if response.status_code != 200:
150-
print(f"❌ Error fetching existing recipes: {response.status_code}")
153+
self.log_output(f"❌ Error fetching existing recipes: {response.status_code}")
151154
break
152155

153156
data = response.json()
@@ -169,10 +172,10 @@ def get_existing_source_urls(self):
169172
time.sleep(1) # Small delay between pagination requests
170173

171174
except Exception as e:
172-
print(f"❌ Error getting existing recipes: {e}")
175+
self.log_output(f"❌ Error getting existing recipes: {e}")
173176
break
174177

175-
print(f"📊 Found {len(existing_urls)} existing recipes with source URLs")
178+
self.log_output(f"📊 Found {len(existing_urls)} existing recipes with source URLs")
176179
return existing_urls
177180

178181
def scrape_recipe(self, url):
@@ -246,33 +249,33 @@ def create_recipe(self, recipe_data, images=None):
246249

247250
def import_single_recipe(self, url, index, total):
248251
"""Complete import process for a single recipe"""
249-
print(f"\n📝 [{index}/{total}] Importing: {url}")
252+
self.log_output(f"\n📝 [{index}/{total}] Importing: {url}")
250253

251254
# Step 1: Scrape
252255
scrape_success, scrape_result, images, _ = self.scrape_recipe(url)
253256
if not scrape_success:
254257
if "rate_limited" in scrape_result:
255258
self.stats['rate_limited'] += 1
256-
print("⏳ Rate limited during scrape")
259+
self.log_output("⏳ Rate limited during scrape")
257260
return "rate_limited"
258261
elif "duplicate" in scrape_result:
259262
self.stats['duplicates'] += 1
260-
print(f"⚠️ Duplicate: {scrape_result}")
263+
self.log_output(f"⚠️ Duplicate: {scrape_result}")
261264
return "duplicate"
262265
elif "non_recipe:" in scrape_result:
263266
self.stats['non_recipe_urls'] += 1
264267
self.failed_urls['non_recipe_urls'].append((url, scrape_result))
265-
print(f"🚫 Non-recipe URL: {scrape_result}")
268+
self.log_output(f"🚫 Non-recipe URL: {scrape_result}")
266269
return "non_recipe"
267270
elif "connection:" in scrape_result:
268271
self.stats['connection_errors'] += 1
269272
self.failed_urls['connection_errors'].append((url, scrape_result))
270-
print(f"🌐 Connection error: {scrape_result}")
273+
self.log_output(f"🌐 Connection error: {scrape_result}")
271274
return "connection_error"
272275
else:
273276
self.stats['failed_scrape'] += 1
274277
self.failed_urls['failed_scrape'].append((url, scrape_result))
275-
print(f"❌ Scrape failed: {scrape_result}")
278+
self.log_output(f"❌ Scrape failed: {scrape_result}")
276279
return "failed_scrape"
277280

278281
recipe_data = scrape_result
@@ -283,50 +286,57 @@ def import_single_recipe(self, url, index, total):
283286
if not create_success:
284287
if "rate_limited" in create_result:
285288
self.stats['rate_limited'] += 1
286-
print("⏳ Rate limited during creation")
289+
self.log_output("⏳ Rate limited during creation")
287290
return "rate_limited"
288291
else:
289292
self.stats['failed_create'] += 1
290293
self.failed_urls['failed_create'].append((url, create_result))
291-
print(f"❌ Create failed: {create_result}")
294+
self.log_output(f"❌ Create failed: {create_result}")
292295
return "failed_create"
293296

294297
self.stats['successful'] += 1
295-
print(f"✅ SUCCESS: '{recipe_name}' (ID: {recipe_id})")
298+
self.log_output(f"✅ SUCCESS: '{recipe_name}' (ID: {recipe_id})")
296299
return "success"
297300

298301
def wait_for_rate_limit_reset(self):
    """Poll the Tandoor API until it stops answering with HTTP 429.

    Sends a minimal recipe-list request up to ``max_attempts`` times and
    sleeps ``wait_seconds`` after every attempt that is still rate limited
    or that raises.

    Returns:
        True as soon as a response with a status other than 429 arrives,
        False when every attempt was rate limited or failed.
    """
    max_attempts = 12
    wait_seconds = 30

    self.log_output("⏳ Waiting for rate limit to reset...")

    for attempt in range(1, max_attempts + 1):
        try:
            # Cheapest available probe: a one-item page of the recipe list.
            response = self.session.get(f"{self.tandoor_url}/api/recipe/?page_size=1", timeout=10)

            if response.status_code != 429:
                self.log_output("✅ Rate limit appears to be reset!")
                return True

            self.log_output(
                f"⏳ Still rate limited... waiting {wait_seconds}s (attempt {attempt}/{max_attempts})"
            )
        except Exception as e:
            # Deliberately broad: a failed probe is treated like "still
            # limited" so a transient network error does not abort the wait.
            self.log_output(f"⚠️ Error checking rate limit: {e}")

        time.sleep(wait_seconds)

    # Report the time actually waited (attempts x interval) instead of a
    # hard-coded figure that does not match the loop.
    waited_minutes = max_attempts * wait_seconds / 60
    self.log_output(f"❌ Rate limit did not reset after {waited_minutes:g} minutes")
    return False
320323

321-
def import_from_file(self, filename, start_from=0, max_imports=None):
324+
def log_output(self, message: str) -> None:
    """Write ``message`` to the console and, if configured, to the log file.

    Args:
        message: Text to emit; a trailing newline is added for the file.

    The file is flushed after every message so the log on disk is current
    even if the run is interrupted.
    """
    try:
        print(message)
    except UnicodeEncodeError:
        # Messages are emoji-heavy; a console that is not UTF-8 would
        # otherwise abort the whole import. Degrade to replacement
        # characters on the console only.
        encoding = getattr(sys.stdout, "encoding", None) or "ascii"
        print(message.encode(encoding, errors="replace").decode(encoding))

    if self.output_file is not None:
        self.output_file.write(f"{message}\n")
        self.output_file.flush()
330+
331+
def import_from_file(self, filename: str, start_from: int = 0, max_imports: Optional[int] = None) -> None:
322332
"""Import recipes from URL list file"""
323-
print(f"📂 Loading URLs from {filename}")
333+
self.log_output(f"📂 Loading URLs from {filename}")
324334

325335
try:
326336
with open(filename, 'r') as f:
327337
urls = [line.strip() for line in f if line.strip()]
328338
except Exception as e:
329-
print(f"❌ Error reading file: {e}")
339+
self.log_output(f"❌ Error reading file: {e}")
330340
return
331341

332342
# Filter and validate URLs
@@ -337,23 +347,23 @@ def import_from_file(self, filename, start_from=0, max_imports=None):
337347
else:
338348
self.stats['invalid_urls'] += 1
339349
self.failed_urls['invalid_urls'].append(url)
340-
print(f"🚫 Skipping invalid/non-recipe URL: {url[:60]}{'...' if len(url) > 60 else ''}")
350+
self.log_output(f"🚫 Skipping invalid/non-recipe URL: {url[:60]}{'...' if len(url) > 60 else ''}")
341351

342-
print(f"📊 Found {len(valid_urls)} valid URLs ({self.stats['invalid_urls']} invalid)")
352+
self.log_output(f"📊 Found {len(valid_urls)} valid URLs ({self.stats['invalid_urls']} invalid)")
343353

344354
# Apply start/limit filters
345355
if start_from > 0:
346356
valid_urls = valid_urls[start_from:]
347-
print(f"📊 Starting from index {start_from}, {len(valid_urls)} URLs remaining")
357+
self.log_output(f"📊 Starting from index {start_from}, {len(valid_urls)} URLs remaining")
348358

349359
if max_imports:
350360
valid_urls = valid_urls[:max_imports]
351-
print(f"📊 Limited to {max_imports} imports")
361+
self.log_output(f"📊 Limited to {max_imports} imports")
352362

353363
self.stats['total'] = len(valid_urls)
354364

355365
if not valid_urls:
356-
print("❌ No valid URLs to import!")
366+
self.log_output("❌ No valid URLs to import!")
357367
return
358368

359369
# Get existing recipes to skip duplicates
@@ -362,115 +372,141 @@ def import_from_file(self, filename, start_from=0, max_imports=None):
362372
pre_existing_count = len(valid_urls) - len(new_urls)
363373

364374
if pre_existing_count > 0:
365-
print(f"⚠️ Skipping {pre_existing_count} URLs that already exist in database")
375+
self.log_output(f"⚠️ Skipping {pre_existing_count} URLs that already exist in database")
366376
self.stats['duplicates'] += pre_existing_count
367377

368378
if not new_urls:
369-
print("✅ All URLs already imported!")
379+
self.log_output("✅ All URLs already imported!")
370380
return
371381

372-
print(f"🚀 Starting import of {len(new_urls)} new recipes...")
382+
self.log_output(f"🚀 Starting import of {len(new_urls)} new recipes...")
373383
estimated_minutes = (len(new_urls) * self.delay) / 60
374-
print(f"⏱️ Estimated time: {estimated_minutes:.1f} minutes")
384+
self.log_output(f"⏱️ Estimated time: {estimated_minutes:.1f} minutes")
375385

376386
# Import each URL
377387
for i, url in enumerate(new_urls, 1):
378388
result = self.import_single_recipe(url, i, len(new_urls))
379389

380390
# Handle rate limiting
381391
if result == "rate_limited":
382-
print("⏳ Hit rate limit, waiting for reset...")
392+
self.log_output("⏳ Hit rate limit, waiting for reset...")
383393
if self.wait_for_rate_limit_reset():
384-
print("🔄 Retrying current recipe...")
394+
self.log_output("🔄 Retrying current recipe...")
385395
result = self.import_single_recipe(url, i, len(new_urls))
386396
else:
387-
print("❌ Could not recover from rate limit, stopping import")
397+
self.log_output("❌ Could not recover from rate limit, stopping import")
388398
break
389399

390400
# Print progress
391401
success_rate = (self.stats['successful'] / i) * 100 if i > 0 else 0
392-
print(f"📊 Progress: {i}/{len(new_urls)} ({i/len(new_urls)*100:.1f}%) | Success rate: {success_rate:.1f}%")
393-
print(f"📈 Stats: ✅{self.stats['successful']} ⚠️{self.stats['duplicates']} "
402+
self.log_output(f"📊 Progress: {i}/{len(new_urls)} ({i/len(new_urls)*100:.1f}%) | Success rate: {success_rate:.1f}%")
403+
self.log_output(f"📈 Stats: ✅{self.stats['successful']} ⚠️{self.stats['duplicates']} "
394404
f"🚫{self.stats['non_recipe_urls']} 🌐{self.stats['connection_errors']} "
395405
f"❌{self.stats['failed_scrape']+self.stats['failed_create']}{self.stats['rate_limited']}")
396406

397407
# Wait between requests (except on last one)
398408
if i < len(new_urls):
399-
print(f"⏱️ Waiting {self.delay}s before next import...")
409+
self.log_output(f"⏱️ Waiting {self.delay}s before next import...")
400410
time.sleep(self.delay)
401411

402412
# Final report
403-
print("\n🎉 BULK IMPORT COMPLETE!")
404-
print("📊 Final Stats:")
405-
print(f" Total processed: {self.stats['total']}")
406-
print(f" ✅ Successful imports: {self.stats['successful']}")
407-
print(f" ⚠️ Duplicates skipped: {self.stats['duplicates']}")
408-
print(f" ❌ Failed scraping: {self.stats['failed_scrape']}")
409-
print(f" ❌ Failed creation: {self.stats['failed_create']}")
410-
print(f" 🚫 Non-recipe URLs: {self.stats['non_recipe_urls']}")
411-
print(f" 🌐 Connection errors: {self.stats['connection_errors']}")
412-
print(f" ⏳ Rate limited: {self.stats['rate_limited']}")
413-
print(f" 🚫 Invalid URLs: {self.stats['invalid_urls']}")
413+
self.log_output("\n🎉 BULK IMPORT COMPLETE!")
414+
self.log_output("📊 Final Stats:")
415+
self.log_output(f" Total processed: {self.stats['total']}")
416+
self.log_output(f" ✅ Successful imports: {self.stats['successful']}")
417+
self.log_output(f" ⚠️ Duplicates skipped: {self.stats['duplicates']}")
418+
self.log_output(f" ❌ Failed scraping: {self.stats['failed_scrape']}")
419+
self.log_output(f" ❌ Failed creation: {self.stats['failed_create']}")
420+
self.log_output(f" 🚫 Non-recipe URLs: {self.stats['non_recipe_urls']}")
421+
self.log_output(f" 🌐 Connection errors: {self.stats['connection_errors']}")
422+
self.log_output(f" ⏳ Rate limited: {self.stats['rate_limited']}")
423+
self.log_output(f" 🚫 Invalid URLs: {self.stats['invalid_urls']}")
414424

415425
success_rate = (self.stats['successful'] / max(1, len(new_urls))) * 100
416-
print(f" 📈 Success rate: {success_rate:.1f}%")
426+
self.log_output(f" 📈 Success rate: {success_rate:.1f}%")
417427

418428
# Display failed URLs if any
419429
total_failures = (self.stats['failed_scrape'] + self.stats['failed_create'] +
420430
self.stats['non_recipe_urls'] + self.stats['connection_errors'] +
421431
self.stats['invalid_urls'])
422432

423433
if total_failures > 0:
424-
print(f"\n❌ FAILED URLS ({total_failures} total):")
434+
self.log_output(f"\n❌ FAILED URLS ({total_failures} total):")
425435

426436
if self.failed_urls['invalid_urls']:
427-
print(f"\n🚫 Invalid URLs ({len(self.failed_urls['invalid_urls'])}):")
437+
self.log_output(f"\n🚫 Invalid URLs ({len(self.failed_urls['invalid_urls'])}):")
428438
for url in self.failed_urls['invalid_urls']:
429-
print(f" {url}")
439+
self.log_output(f" {url}")
430440

431441
if self.failed_urls['non_recipe_urls']:
432-
print(f"\n🚫 Non-recipe URLs ({len(self.failed_urls['non_recipe_urls'])}):")
442+
self.log_output(f"\n🚫 Non-recipe URLs ({len(self.failed_urls['non_recipe_urls'])}):")
433443
for url, reason in self.failed_urls['non_recipe_urls']:
434-
print(f" {url} - {reason}")
444+
self.log_output(f" {url} - {reason}")
435445

436446
if self.failed_urls['connection_errors']:
437-
print(f"\n🌐 Connection errors ({len(self.failed_urls['connection_errors'])}):")
447+
self.log_output(f"\n🌐 Connection errors ({len(self.failed_urls['connection_errors'])}):")
438448
for url, reason in self.failed_urls['connection_errors']:
439-
print(f" {url} - {reason}")
449+
self.log_output(f" {url} - {reason}")
440450

441451
if self.failed_urls['failed_scrape']:
442-
print(f"\n❌ Failed scraping ({len(self.failed_urls['failed_scrape'])}):")
452+
self.log_output(f"\n❌ Failed scraping ({len(self.failed_urls['failed_scrape'])}):")
443453
for url, reason in self.failed_urls['failed_scrape']:
444-
print(f" {url} - {reason}")
454+
self.log_output(f" {url} - {reason}")
445455

446456
if self.failed_urls['failed_create']:
447-
print(f"\n❌ Failed creation ({len(self.failed_urls['failed_create'])}):")
457+
self.log_output(f"\n❌ Failed creation ({len(self.failed_urls['failed_create'])}):")
448458
for url, reason in self.failed_urls['failed_create']:
449-
print(f" {url} - {reason}")
459+
self.log_output(f" {url} - {reason}")
450460
else:
451-
print("\n✅ No failed URLs!")
452-
453-
454-
def main():
455-
if len(sys.argv) < 2:
456-
print("Usage: python3 tandoor-importer.py <url_file> [start_index] [max_imports]")
457-
print("Example: python3 tandoor-importer.py url-list.txt 0 10")
458-
sys.exit(1)
459-
460-
filename = sys.argv[1]
461-
start_from = int(sys.argv[2]) if len(sys.argv) > 2 else 0
462-
max_imports = int(sys.argv[3]) if len(sys.argv) > 3 else None
463-
461+
self.log_output("\n✅ No failed URLs!")
462+
463+
464+
def main(argv: Optional[list] = None) -> None:
    """Parse command-line arguments and run the bulk import.

    Args:
        argv: Argument list to parse instead of ``sys.argv[1:]``. ``None``
            (the default) keeps the normal command-line behaviour.

    Exits with status 2 (via ``parser.error``) when an option value is out
    of range, and with status 1 when the output file cannot be opened.
    """
    parser = argparse.ArgumentParser(
        description="Bulk import recipes from URLs into Tandoor Recipes",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""Examples:
  %(prog)s url-list.txt
  %(prog)s url-list.txt --start-from 100
  %(prog)s url-list.txt --max-imports 50 --output results.log
  %(prog)s url-list.txt --start-from 100 --max-imports 25 -o import.log""",
    )

    parser.add_argument("url_file", help="Path to text file containing recipe URLs")
    parser.add_argument("--start-from", type=int, default=0,
                        help="Index into the list of valid URLs to start from (default: 0)")
    parser.add_argument("--max-imports", type=int,
                        help="Maximum number of recipes to import")
    parser.add_argument("-o", "--output", type=str,
                        help="Output results to file")

    args = parser.parse_args(argv)

    # Reject values the importer would silently misinterpret: a negative
    # start is ignored, 0 imports means "no limit", and a negative limit
    # slices URLs off the end of the list.
    if args.start_from < 0:
        parser.error("--start-from must be 0 or greater")
    if args.max_imports is not None and args.max_imports < 1:
        parser.error("--max-imports must be 1 or greater")

    # Load configuration
    tandoor_url, api_token, delay = load_config()

    # Setup output file if specified
    output_file: Optional[TextIO] = None
    if args.output:
        try:
            output_file = open(args.output, 'w', encoding='utf-8')
        except OSError as e:
            print(f"❌ Error opening output file {args.output}: {e}")
            sys.exit(1)

    try:
        importer = FinalBulkImporter(tandoor_url, api_token, delay, output_file)

        importer.log_output("🔧 TANDOOR BULK RECIPE IMPORTER")
        importer.log_output("Using corrected two-step import process")
        importer.log_output("=" * 60)

        importer.import_from_file(args.url_file, args.start_from, args.max_imports)
    finally:
        # Close the log even if the import raises or is interrupted.
        if output_file is not None:
            output_file.close()
474510

475511

476512
if __name__ == "__main__":

0 commit comments

Comments
 (0)