Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
f491e45
Add remove-from-solr script to arclight
barbarahui Aug 11, 2025
77ceade
Add cincoctrl django management command remove_finding_aid
barbarahui Aug 11, 2025
147b201
Add bulk-remove-from-solr command to arclight
barbarahui Aug 12, 2025
e1bf858
Make linter happy
barbarahui Aug 12, 2025
eaa26d1
Update arclight remove from solr scripts
barbarahui Aug 13, 2025
781152b
Add a script to run a job on an arclight ecs container [infra]
amywieliczka Jul 24, 2025
167dee1
Add arclight_job docs [infra]
amywieliczka Jul 25, 2025
a5e8d2f
Assume SOLR_URL = solr leader url for arclight_job [infra]
amywieliczka Jul 25, 2025
c665846
Add --memory and --cpu to arclight_job [infra, arclight]
amywieliczka Jul 25, 2025
d6edc0f
Removal argument is always an ark [arclight, cincoctrl]
amywieliczka Sep 2, 2025
1f33b2c
Add Delete Finding Aid dag [dags]
amywieliczka Sep 2, 2025
c0c6f10
Add Unpublish Finding Aid dag [dags]
amywieliczka Sep 2, 2025
89c9858
Delete files from s3 when they are also deleted from cincoctrl [cinco…
amywieliczka Sep 2, 2025
b0159ac
Add cloudfront cache invalidation to remove-from-solr script [arclight]
amywieliczka Sep 2, 2025
cad82ea
Remove static finding aid in delete finding aid dag [dags]
amywieliczka Sep 3, 2025
2712221
Update bulk-remove-from-solr with cache invalidation [arclight]
amywieliczka Sep 3, 2025
f0bda7e
Bulk remove finding aids from cincoctrl [cincoctrl]
amywieliczka Sep 3, 2025
d52b7ac
Add bulk delete finding aid dag [dags]
amywieliczka Sep 3, 2025
77f0479
There are 7 dags now
amywieliczka Sep 3, 2025
ccef754
Remove ecs_arclight_command in favor of arclight_job
amywieliczka Sep 3, 2025
f856da2
s3 key input should be relative removals/ path [cincoctrl]
amywieliczka Sep 3, 2025
2016aea
Bump amazon/aws-cli in /cincoctrl/compose/production/aws
dependabot[bot] Sep 4, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 60 additions & 0 deletions arclight/bin/bulk-remove-from-solr
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
#!/bin/bash -e

# This script takes as an argument the name of a key
# in s3://$S3_BUCKET/removals/ containing a csv list
# of finding aid ID, repository code pairs to delete,
# and deletes the finding aids from solr

# check that we have an s3 key
if [ -z "$1" ]; then
    echo "Usage: $0 <s3_key>"
    exit 1
fi

# check that the S3_BUCKET environment variable is set
if [ -z "$S3_BUCKET" ]; then
    echo "The S3_BUCKET environment variable must be set"
    exit 1
fi

# check that the SOLR_WRITER environment variable is set
# (same requirement as bin/remove-from-solr)
if [ -z "$SOLR_WRITER" ]; then
    echo "The SOLR_WRITER environment variable must be set"
    exit 1
fi

# Trim leading/trailing whitespace. The previous ${var##*( )} /
# ${var%%*( )} patterns are extglob patterns and are no-ops unless
# `shopt -s extglob` is enabled, so they never actually trimmed.
trim() {
    local s="$1"
    s="${s#"${s%%[![:space:]]*}"}"
    s="${s%"${s##*[![:space:]]}"}"
    printf '%s' "$s"
}

echo "Downloading S3 file $S3_BUCKET/removals/$1 to /tmp/$1"
aws s3 cp s3://"$S3_BUCKET"/removals/"$1" /tmp/"$1"

queries=""
while IFS=, read -r findaid_id repo_code || [ -n "$findaid_id" ]
do
    findaid_id=$(trim "$findaid_id")
    # escape solr query special characters in the ark
    escaped_findaidid="${findaid_id//\:/\\\:}"
    escaped_findaidid="${escaped_findaidid//\//\\\/}"
    queries+="<query>_root_:$escaped_findaidid</query>"
done < "/tmp/$1"

curl -X POST "$SOLR_WRITER/update?commit=true" \
    -H "Content-Type: text/xml" \
    --data-binary "<delete>$queries</delete>"

echo "Finished deleting finding aids from arclight"
echo "Now creating a cache invalidation for each page and their repositories"

# Build the invalidation paths as a bash array. The previous string
# concatenation embedded literal '"' characters into the arguments and
# relied on word-splitting, so aws received paths like '"/findaid/x"'
# (with quotes) and any path containing a space was split apart.
paths=()
while IFS=, read -r findaid_id repo_code || [ -n "$findaid_id" ]
do
    findaid_id=$(trim "$findaid_id")
    repo_code=$(trim "$repo_code")
    repo_name=$(bundle exec rake "repo:urlized_name[$repo_code]")
    paths+=("/findaid/$findaid_id" "/search?f%5Blevel%5D%5B%5D=Collection&f%5Brepository%5D%5B%5D=$repo_name&sort=title_sort+asc")
done < "/tmp/$1"

echo "Running cache invalidation for ${paths[*]}"
if [ -z "$CLOUDFRONT_DISTRIBUTION_ID" ]; then
    echo "CLOUDFRONT_DISTRIBUTION_ID not set, skipping cache invalidation."
else
    echo "Invalidating urls: ${paths[*]}"
    cf=$(aws cloudfront create-invalidation --distribution-id "$CLOUDFRONT_DISTRIBUTION_ID" --paths "${paths[@]}")
    echo "Invalidation submitted: $cf"
fi
42 changes: 42 additions & 0 deletions arclight/bin/remove-from-solr
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#!/bin/bash -e

# This script takes as arguments the ID of a finding aid (typically an
# ark) and its repository code, deletes the finding aid from Solr, and
# invalidates the CloudFront cache for the finding aid page and its
# repository landing page.

# check that we have a finding aid ID and repository code
if [ -z "$2" ]; then
    echo "Usage: $0 <finding_aid_id - typically an ark> <repository_code>"
    exit 1
fi

# check that the SOLR_WRITER environment variable is set
if [ -z "$SOLR_WRITER" ]; then
    echo "The SOLR_WRITER environment variable must be set"
    exit 1
fi

echo "Deleting Finding Aid: $1"
findaid_id=$1
# escape solr query special characters in the ark
escaped_findaidid="${findaid_id//\:/\\\:}"
escaped_findaidid="${escaped_findaidid//\//\\\/}"

# -f (--fail) makes curl exit non-zero on an HTTP error from Solr, so
# the script (running under -e) stops instead of silently reporting
# success after a failed delete.
curl -f -X POST "$SOLR_WRITER/update?commit=true" \
    -H "Content-Type: text/xml" \
    --data-binary "<delete><query>id:$escaped_findaidid*</query></delete>"

REPO_NAME=$(bundle exec rake "repo:urlized_name[$2]")
echo "Running cache invalidation for $1 and $2 landing page"
if [ -z "$CLOUDFRONT_DISTRIBUTION_ID" ]; then
    echo "CLOUDFRONT_DISTRIBUTION_ID not set, skipping cache invalidation."
else
    echo "Invalidating urls: /findaid/$1 and /search?f%5Blevel%5D%5B%5D=Collection&f%5Brepository%5D%5B%5D=$REPO_NAME&sort=title_sort+asc"
    cf=$(aws cloudfront create-invalidation --distribution-id "$CLOUDFRONT_DISTRIBUTION_ID" --paths "/findaid/$1" "/search?f%5Blevel%5D%5B%5D=Collection&f%5Brepository%5D%5B%5D=$REPO_NAME&sort=title_sort+asc")
    echo "Invalidation submitted: $cf"
fi
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import boto3
from django.conf import settings
from django.core.management.base import BaseCommand

from cincoctrl.findingaids.models import FindingAid


class Command(BaseCommand):
    """Bulk remove finding aids listed in a CSV stored on S3.

    The CSV (one ``ark, repository_code`` pair per line) is read from
    ``removals/<s3_key>`` in ``settings.AWS_STORAGE_BUCKET_NAME``; each
    matching FindingAid row is deleted.
    """

    help = "Bulk remove finding aids"

    def add_arguments(self, parser):
        # Accept both spellings: the airflow CincoCtrlOperator invokes
        # this command with "--s3-key", but the original flag was
        # declared as "--s3_key". Both map to options["s3_key"].
        parser.add_argument(
            "--s3-key",
            "--s3_key",
            dest="s3_key",
            help="""The s3_key where a csv of 'finding aid ark, repository code' pairs
            to be deleted are stored (relative to removals/)""",
            type=str,
        )

    def handle(self, *args, **options):
        s3_key = options.get("s3_key")
        s3_client = boto3.client("s3")
        obj = s3_client.get_object(
            Bucket=settings.AWS_STORAGE_BUCKET_NAME,
            Key=f"removals/{s3_key}",
        )
        body = obj["Body"].read().decode("utf-8")
        for line in body.splitlines():
            # Tolerate blank lines (e.g. a trailing newline in the CSV).
            if not line.strip():
                continue
            # maxsplit=1: only the first comma separates ark from the
            # repository code; extra commas no longer raise ValueError.
            ark, _repo_code = line.split(",", 1)
            ark = ark.strip()
            try:
                f = FindingAid.objects.get(ark=ark)
            except FindingAid.DoesNotExist:
                # A missing row should not abort the rest of the bulk run.
                self.stderr.write(f"No finding aid found for ark {ark}")
                continue
            f.delete()
            self.stdout.write(f"Deleted finding aid {ark}")
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
from django.core.management.base import BaseCommand

from cincoctrl.findingaids.models import FindingAid


class Command(BaseCommand):
    """Delete a single finding aid, identified by its ark."""

    help = "Remove a finding aid"

    def add_arguments(self, parser):
        parser.add_argument(
            "--ark",
            type=str,
            help="the ark of the finding aid to delete",
        )

    def handle(self, *args, **options):
        ark = options.get("ark")
        # Look up the row by ark and delete it; pre_delete signals take
        # care of removing any associated S3 objects.
        finding_aid = FindingAid.objects.get(ark=ark)
        finding_aid.delete()
        self.stdout.write(f"Deleted finding aid {ark}")
23 changes: 23 additions & 0 deletions cincoctrl/cincoctrl/findingaids/signals.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import boto3
from django.db.models import Q
from django.db.models.signals import post_save
from django.db.models.signals import pre_delete
from django.db.models.signals import pre_save
from django.dispatch import receiver

Expand Down Expand Up @@ -31,6 +33,27 @@ def update_ead_warnings(sender, instance, created, **kwargs):
instance.validationwarning_set.exclude(pk__in=warn_ids).delete()


@receiver(pre_delete, sender=FindingAid)
def delete_s3_ead_file_on_model_delete(sender, instance, **kwargs):
    """Delete the EAD file's S3 object when its FindingAid is deleted."""
    if not instance.ead_file:
        return
    # save=False tells django-storages to delete the associated S3
    # object without re-saving the model instance, which would be
    # pointless mid-deletion.
    instance.ead_file.delete(save=False)


@receiver(pre_delete, sender=SupplementaryFile)
def delete_s3_pdf_file_on_model_delete(sender, instance, **kwargs):
    """Delete a SupplementaryFile's S3 objects when the row is deleted."""
    # instance.pdf_file.delete(save=False) tells django-storages to delete
    # the associated S3 object without attempting to save the model instance
    # again - unnecessary during deletion.
    if instance.pdf_file:
        instance.pdf_file.delete(save=False)
    if instance.textract_output:
        # Bug fix: the bucket was hard-coded to "cinco-stage", which
        # would target the wrong bucket in production. Use the configured
        # storage bucket instead (imported locally to keep this fix
        # self-contained). Assumes textract_output is an S3 key string -
        # TODO confirm against the model definition.
        from django.conf import settings

        s3_client = boto3.client("s3")
        s3_client.delete_object(
            Bucket=settings.AWS_STORAGE_BUCKET_NAME,
            Key=instance.textract_output,
        )


@receiver(pre_save, sender=SupplementaryFile)
def pre_save(sender, instance, **kwargs):
if instance.pk:
Expand Down
2 changes: 1 addition & 1 deletion cincoctrl/compose/production/aws/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM docker.io/amazon/aws-cli:2.28.12
FROM docker.io/amazon/aws-cli:2.28.23

# Clear entrypoint from the base image, otherwise it's always calling the aws CLI
ENTRYPOINT []
Expand Down
8 changes: 8 additions & 0 deletions dags/arclight_operator.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,10 @@ def __init__(
f"bin/{arclight_command}",
s3_key,
]
elif arclight_command == "remove-from-solr":
command = [f"bin/{arclight_command}", finding_aid_ark, repository_code]
elif arclight_command == "bulk-remove-from-solr":
command = [f"bin/{arclight_command}", s3_key]

args = {
"launch_type": "FARGATE",
Expand Down Expand Up @@ -177,6 +181,10 @@ def __init__(
f"bin/{arclight_command}",
s3_key,
]
elif arclight_command == "remove-from-solr":
command = [f"bin/{arclight_command}", finding_aid_ark, repository_code]
elif arclight_command == "bulk-remove-from-solr":
command = [f"bin/{arclight_command}", s3_key]

args = {
"image": f"{container_image}:{container_version}",
Expand Down
77 changes: 77 additions & 0 deletions dags/bulk_delete_finding_aids.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
import boto3
from datetime import datetime
from airflow.decorators import dag, task
from airflow.models.param import Param
from airflow.models import Variable


from cinco.cincoctrl_operator import CincoCtrlOperator
from cinco.arclight_operator import ArcLightOperator


@dag(
    dag_id="bulk_delete_finding_aids",
    schedule=None,
    start_date=datetime(2025, 1, 1),
    catchup=False,
    params={
        "s3_key": Param(
            "",
            type="string",
            description="The s3_key where a csv of 'finding aid ark, repository code' pairs to be deleted are stored",
        ),
        "cinco_environment": Param(
            "stage",
            enum=["stage", "prd"],
            description="The CincoCtrl and ArcLight environment to run",
        ),
    },
    tags=["cinco"],
    # on_failure_callback=notify_dag_failure,
    # on_success_callback=notify_dag_success,
)
def bulk_delete_finding_aid():
    """Bulk delete finding aids: remove from Solr, then from the
    CincoCtrl database, then delete the static finding aid files."""
    bulk_remove_from_index = ArcLightOperator(
        task_id="bulk_remove_from_index",
        arclight_command="bulk-remove-from-solr",
        s3_key="{{ params.s3_key }}",
        cinco_environment="{{ params.cinco_environment }}",
    )

    bulk_remove_from_database = CincoCtrlOperator(
        task_id="bulk_remove_from_database",
        manage_cmd="bulk_remove_finding_aids",
        s3_key="{{ params.s3_key }}",
        cinco_environment="{{ params.cinco_environment }}",
    )

    @task()
    def bulk_remove_static_finding_aids(s3_key, cinco_environment="stage"):
        s3 = boto3.resource("s3")
        if cinco_environment == "prd":
            bucket_name = Variable.get("CINCO_S3_BUCKET_PRD")
        else:
            bucket_name = Variable.get("CINCO_S3_BUCKET_STAGE")

        # Bug fix: boto3 *resources* have no get_object method (that is
        # the client API) - use the resource's Object abstraction. Also
        # read from the removals/ prefix, matching bin/bulk-remove-from-solr
        # and the bulk_remove_finding_aids management command, which both
        # treat s3_key as relative to removals/.
        obj = s3.Object(bucket_name, f"removals/{s3_key}").get()
        removals = obj["Body"].read().decode("utf-8").splitlines()

        bucket = s3.Bucket(bucket_name)
        for line in removals:
            if not line.strip():
                continue  # tolerate blank lines / trailing newline
            ark, _ = line.split(",", 1)
            ark = ark.strip()
            prefix = f"static_findaids/static_findaids/{ark}"
            print(f"Deleting objects in {bucket_name} at {prefix}")
            delete_results = bucket.objects.filter(Prefix=prefix).delete()
            print(delete_results)

    (
        bulk_remove_from_index
        >> bulk_remove_from_database
        >> bulk_remove_static_finding_aids(
            "{{ params.s3_key }}",
            cinco_environment="{{ params.cinco_environment }}",
        )
    )


bulk_delete_finding_aid = bulk_delete_finding_aid()
22 changes: 22 additions & 0 deletions dags/cincoctrl_operator.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ def __init__(
finding_aid_id=None,
s3_key=None,
repository_id=None,
finding_aid_ark=None,
**kwargs,
):
manage_args = []
Expand All @@ -57,6 +58,16 @@ def __init__(
"--s3-key",
s3_key,
]
elif manage_cmd == "remove_finding_aid":
manage_args = [
"--ark",
finding_aid_ark,
]
elif manage_cmd == "bulk_remove_finding_aids":
manage_args = [
"--s3-key",
s3_key,
]

container_name = f"cinco-ctrl-{cinco_environment}-container"
# TODO: specify task definition revision? how?
Expand Down Expand Up @@ -127,6 +138,7 @@ def __init__(
finding_aid_id=None,
s3_key=None,
repository_id=None,
finding_aid_ark=None,
cinco_environment="dev",
**kwargs,
):
Expand All @@ -145,6 +157,16 @@ def __init__(
"--s3-key",
s3_key,
]
elif manage_cmd == "remove_finding_aid":
manage_args = [
"--ark",
finding_aid_ark,
]
elif manage_cmd == "bulk_remove_finding_aids":
manage_args = [
"--s3-key",
s3_key,
]

# set in startup.sh, path to cinco/cincoctrl on local
if os.environ.get("CINCO_MOUNT_CINCOCTRL"):
Expand Down
2 changes: 1 addition & 1 deletion dags/dags_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,5 @@ def dag_bag(self):
return DagBag(dag_folder=DAGS_FOLDER, include_examples=False)

def test_no_import_errors(self):
    # All DAGs in the dags folder must import cleanly; the expected
    # count is 7 after the delete/unpublish/bulk-delete finding aid
    # DAGs were added. Bump this number whenever a DAG is added or
    # removed.
    assert len(self.dag_bag().dags) == 7
    assert not self.dag_bag().import_errors
assert not self.dag_bag().import_errors
Loading
Loading