from collections import defaultdict

from django.db import models

from sde_collections.models.candidate_url import CandidateURL
from sde_collections.models.collection import Collection

# Compare on every concrete field except 'id' and 'collection' (we're already looping
# by collection). Restricting to concrete_fields keeps reverse relations and
# many-to-many accessors out of the values() grouping below, where they would break it.
duplicate_fields = [
    field.name for field in CandidateURL._meta.concrete_fields if field.name not in ["id", "collection"]
]
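# For illustration only -- with a hypothetical CandidateURL model whose stored
# columns are url, scraped_title, and document_type, this would come out as
# duplicate_fields == ["url", "scraped_title", "document_type"], i.e. every
# stored column except the primary key and the collection foreign key.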


def analyze_duplicates():
    """Analyze duplicates and print how many would be deleted in each collection."""
    deletion_stats = defaultdict(lambda: {"total": 0, "to_delete": 0})

    # Loop through each collection
    for collection in Collection.objects.all():
        # Count total URLs for the collection
        total_urls = CandidateURL.objects.filter(collection=collection).count()
        deletion_stats[collection.config_folder]["total"] = total_urls

        # Group CandidateURL instances by all compared fields; rows that share
        # every value are duplicates of one another
        duplicates_in_collection = (
            CandidateURL.objects.filter(collection=collection)
            .values(*duplicate_fields)
            .annotate(count=models.Count("id"))
            .filter(count__gt=1)
        )
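        # Roughly the query this builds (a sketch; the table and column names
        # here are illustrative, not taken from the real schema):
        #   SELECT <compared columns>, COUNT(id) AS count
        #   FROM candidate_url
        #   WHERE collection_id = <collection.pk>
        #   GROUP BY <compared columns>
        #   HAVING COUNT(id) > 1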

        # Count potential deletions without deleting: the annotated count is the
        # size of each duplicate group, and each group keeps exactly one row,
        # so no extra query per group is needed
        for entry in duplicates_in_collection:
            deletion_stats[collection.config_folder]["to_delete"] += entry["count"] - 1

    # Print analysis results
    print("Duplicate analysis completed.")
    for config_folder, stats in deletion_stats.items():
        print(f"Collection '{config_folder}' has {stats['total']} total URL(s), with {stats['to_delete']} duplicate(s).")


def delete_duplicates():
    """Delete duplicate CandidateURLs, keeping one row from each duplicate group."""
    deletion_stats = defaultdict(int)

    # Loop through each collection
    for collection in Collection.objects.all():
        # Group CandidateURL instances by all compared fields, as in the analysis
        duplicates_in_collection = (
            CandidateURL.objects.filter(collection=collection)
            .values(*duplicate_fields)
            .annotate(count=models.Count("id"))
            .filter(count__gt=1)
        )

        # Delete duplicates and track deletions
        for entry in duplicates_in_collection:
            duplicates = CandidateURL.objects.filter(
                collection=collection, **{field: entry[field] for field in duplicate_fields}
            ).order_by("pk")  # fix the ordering so the kept row is deterministic

            # Keep the first instance and delete the rest
            for candidate in duplicates[1:]:  # skip the first to retain it
                candidate.delete()
                deletion_stats[collection.config_folder] += 1
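            # Note: a single bulk delete per group would avoid one query per
            # deleted row -- a sketch, equivalent under the same pk ordering:
            #   keeper = duplicates.first()
            #   deleted, _ = duplicates.exclude(pk=keeper.pk).delete()
            #   deletion_stats[collection.config_folder] += deleted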

    # Print deletion results
    print("Duplicate URL cleanup completed.")
    for config_folder, deleted_count in deletion_stats.items():
        print(f"Collection '{config_folder}' had {deleted_count} duplicate URL(s) deleted.")


# Usage
analyze_duplicates()  # First analyze duplicates
delete_duplicates()  # Then delete duplicates based on analysis
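
# Note: this script touches the ORM at import time, so it must run inside a
# configured Django context -- e.g. pasted into `python manage.py shell`, or
# after calling django.setup() (a sketch; the settings path is hypothetical):
#   import os, django
#   os.environ.setdefault("DJANGO_SETTINGS_MODULE", "config.settings")
#   django.setup()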