Commit 9153d33

add script to analyze and remove url duplicates
1 parent 8f48e3d commit 9153d33

File tree

1 file changed: +76 -0 lines changed

@@ -0,0 +1,76 @@
from collections import defaultdict

from django.db import models

from sde_collections.models.candidate_url import CandidateURL
from sde_collections.models.collection import Collection

# Get all field names except 'id' and 'collection' (since we're already looping by collection)
duplicate_fields = [field.name for field in CandidateURL._meta.get_fields() if field.name not in ["id", "collection"]]


def analyze_duplicates():
    """Analyze duplicates and print how many would be deleted in each collection."""
    deletion_stats = defaultdict(lambda: {"total": 0, "to_delete": 0})

    # Loop through each collection
    for collection in Collection.objects.all():
        # Count total URLs for the collection
        total_urls = CandidateURL.objects.filter(collection=collection).count()
        deletion_stats[collection.config_folder]["total"] = total_urls

        # Group CandidateURL instances by all fields dynamically
        duplicates_in_collection = (
            CandidateURL.objects.filter(collection=collection)
            .values(*duplicate_fields)
            .annotate(count=models.Count("id"))
            .filter(count__gt=1)
        )

        # Count potential deletions without deleting
        for entry in duplicates_in_collection:
            duplicates_count = CandidateURL.objects.filter(
                collection=collection, **{field: entry[field] for field in duplicate_fields}
            ).count()
            deletion_stats[collection.config_folder]["to_delete"] += duplicates_count - 1

    # Print analysis results
    print("Duplicate analysis completed.")
    for config_folder, stats in deletion_stats.items():
        print(f"Collection '{config_folder}' has {stats['total']} total URL(s), with {stats['to_delete']} duplicate(s) to delete.")


def delete_duplicates():
    """Delete duplicates based on previously analyzed duplicates."""
    deletion_stats = defaultdict(int)

    # Loop through each collection
    for collection in Collection.objects.all():
        # Group CandidateURL instances by all fields dynamically
        duplicates_in_collection = (
            CandidateURL.objects.filter(collection=collection)
            .values(*duplicate_fields)
            .annotate(count=models.Count("id"))
            .filter(count__gt=1)
        )

        # Delete duplicates and track deletions
        for entry in duplicates_in_collection:
            duplicates = CandidateURL.objects.filter(
                collection=collection, **{field: entry[field] for field in duplicate_fields}
            ).order_by("id")  # Explicit ordering so "the first" kept instance is deterministic

            # Keep the first instance and delete the rest
            for candidate in duplicates[1:]:  # Skip the first to retain it
                candidate.delete()
                deletion_stats[collection.config_folder] += 1

    # Print deletion results
    print("Duplicate URL cleanup completed.")
    for config_folder, deleted_count in deletion_stats.items():
        print(f"Collection '{config_folder}' had {deleted_count} duplicate URL(s) deleted.")


# Usage
analyze_duplicates()  # First analyze duplicates
delete_duplicates()  # Then delete duplicates based on analysis
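
Before running the destructive delete_duplicates(), the same grouping query can be sanity-checked interactively. The following is a minimal sketch, assuming it is pasted into the project's Django shell (python manage.py shell); only the model import and the field-selection idiom are taken from the script above, everything else is illustrative.

# Run inside `python manage.py shell` so Django settings and models are loaded.
from django.db import models

from sde_collections.models.candidate_url import CandidateURL

# Same idiom as the script: group rows by every field except the primary key
# and the collection foreign key, then keep only groups that occur more than once.
fields = [f.name for f in CandidateURL._meta.get_fields() if f.name not in ["id", "collection"]]

duplicate_groups = (
    CandidateURL.objects.values(*fields)
    .annotate(count=models.Count("id"))
    .filter(count__gt=1)
)

# Each entry is a dict of shared field values; `count` is how many rows share them.
print(f"{duplicate_groups.count()} duplicate group(s) across all collections")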
