Skip to content

Commit

Permalink
Change readable_id values for podcasts and episodes (#1232)
Browse files Browse the repository at this point in the history
  • Loading branch information
mbertrand authored Jul 11, 2024
1 parent ed7dd68 commit d9cbccc
Show file tree
Hide file tree
Showing 9 changed files with 261 additions and 54 deletions.
23 changes: 18 additions & 5 deletions learning_resources/etl/loaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -682,6 +682,10 @@ def load_podcast(podcast_data: dict) -> LearningResource:
LearningResource.objects.filter(id__in=unpublished_episode_ids).update(
published=False
)
bulk_resources_unpublished_actions(
unpublished_episode_ids,
LearningResourceType.podcast_episode.name,
)
episode_ids.extend(unpublished_episode_ids)
learning_resource.resources.set(
episode_ids,
Expand Down Expand Up @@ -719,13 +723,22 @@ def load_podcasts(podcasts_data: list[dict]) -> list[LearningResource]:

# unpublish the podcasts and episodes we're no longer tracking
ids = [podcast.id for podcast in podcast_resources]
LearningResource.objects.filter(
unpublished_podcasts = LearningResource.objects.filter(
resource_type=LearningResourceType.podcast.name
).exclude(id__in=ids).update(published=False)
LearningResource.objects.filter(
).exclude(id__in=ids)
unpublished_podcasts.update(published=False)
bulk_resources_unpublished_actions(
unpublished_podcasts.values_list("id", flat=True),
LearningResourceType.podcast.name,
)
unpublished_episodes = LearningResource.objects.filter(
resource_type=LearningResourceType.podcast_episode.name
).exclude(parents__parent__in=ids).update(published=False)

).exclude(parents__parent__in=ids)
unpublished_episodes.update(published=False)
bulk_resources_unpublished_actions(
unpublished_episodes.values_list("id", flat=True),
LearningResourceType.podcast_episode.name,
)
return podcast_resources


Expand Down
45 changes: 27 additions & 18 deletions learning_resources/etl/podcast.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

from learning_resources.constants import LearningResourceType
from learning_resources.etl.constants import ETLSource
from learning_resources.etl.utils import clean_data, generate_readable_id
from learning_resources.etl.utils import clean_data
from learning_resources.models import PodcastEpisode
from main.utils import frontend_absolute_url, now_in_utc

Expand Down Expand Up @@ -100,6 +100,19 @@ def get_podcast_configs():
return podcast_configs


def parse_readable_id_from_url(url):
    """
    Derive a readable id from a podcast/episode url by stripping everything
    up to and including the last "//" (typically the scheme separator).

    Args:
        url (str): the podcast/episode url

    Returns:
        str: the readable id
    """
    # str.rpartition yields ("", "", url) when "//" is absent, so a url
    # without a scheme passes through unchanged — same as split("//")[-1].
    return url.rpartition("//")[2]


def extract():
"""
Function for extracting podcast data
Expand All @@ -125,7 +138,7 @@ def extract():
log.exception("Invalid rss url %s", rss_url)


def transform_episode(rss_data, offered_by, topics, parent_image, podcast_id):
def transform_episode(rss_data, offered_by, topics, parent_image):
"""
Transform a podcast episode into our normalized data
Expand All @@ -140,23 +153,22 @@ def transform_episode(rss_data, offered_by, topics, parent_image, podcast_id):
normalized podcast episode data
"""

rss_data.guid.string = f"{podcast_id}: {rss_data.guid.text}"

return {
"readable_id": generate_readable_id(rss_data.title.text[:95]),
"readable_id": rss_data.guid.text
or parse_readable_id_from_url(
rss_data.link.text if rss_data.link else rss_data.enclosure["url"]
),
"etl_source": ETLSource.podcast.name,
"resource_type": LearningResourceType.podcast_episode.name,
"title": rss_data.title.text,
"offered_by": offered_by,
"description": clean_data(rss_data.description.text),
"url": rss_data.enclosure["url"],
"image": {
"url": (
rss_data.find("image")["href"]
if rss_data.find("image")
else parent_image
),
},
"url": (rss_data.find("image")["href"]),
}
if rss_data.find("image")
else parent_image,
"last_modified": parse(rss_data.pubDate.text),
"published": True,
"topics": topics,
Expand Down Expand Up @@ -186,7 +198,7 @@ def transform(extracted_podcasts):
for rss_data, config_data in extracted_podcasts:
try:
image = (
rss_data.channel.find("itunes:image")["href"]
{"url": rss_data.channel.find("itunes:image")["href"]}
if rss_data.channel.find("itunes:image")
else None
)
Expand All @@ -203,23 +215,20 @@ def transform(extracted_podcasts):
apple_podcasts_url = config_data.get("apple_podcasts_url")
google_podcasts_url = config_data.get("google_podcasts_url")
title = config_data.get("podcast_title", rss_data.channel.title.text)
podcast_id = generate_readable_id(title[:95])

yield {
"readable_id": podcast_id,
"readable_id": parse_readable_id_from_url(config_data["rss_url"]),
"title": title,
"etl_source": ETLSource.podcast.name,
"resource_type": LearningResourceType.podcast.name,
"offered_by": offered_by,
"description": clean_data(rss_data.channel.description.text),
"image": {"url": rss_data.channel.find("itunes:image")["href"]},
"image": image,
"published": True,
"url": config_data["website"],
"topics": topics,
"episodes": (
transform_episode(
episode_rss, offered_by, topics, image, podcast_id
)
transform_episode(episode_rss, offered_by, topics, image)
for episode_rss in rss_data.find_all("item")
),
"podcast": {
Expand Down
27 changes: 9 additions & 18 deletions learning_resources/etl/podcast_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,23 +43,22 @@ def rss_content():
def mock_podcast_file( # pylint: disable=too-many-arguments # noqa: PLR0913
podcast_title=None,
topics=None,
website_url="website_url",
website_url="http://website.url/podcast",
offered_by=None,
google_podcasts_url="google_podcasts_url",
apple_podcasts_url="apple_podcasts_url",
rss_url="rss_url",
rss_url="http://website.url/podcast/rss.xml",
):
"""Mock podcast github file"""

content = f"""---
rss_url: rss_url
rss_url: {rss_url}
{ "podcast_title: " + podcast_title if podcast_title else "" }
{ "topics: " + topics if topics else "" }
{ "offered_by: " + offered_by if offered_by else "" }
website: {website_url}
google_podcasts_url: {google_podcasts_url}
apple_podcasts_url: {apple_podcasts_url}
rss_url: {rss_url}
"""
return Mock(decoded_content=content)

Expand Down Expand Up @@ -123,22 +122,14 @@ def test_transform(mock_github_client, title, topics, offered_by):
)

expected_title = title if title else "A Podcast"
expected_readable_id = (
"custom-titleb04b26d38dd63a2c829393e9e075927d"
if title
else "a-podcast7e3a1ebb0c4d3196ba4c7f8254af4d2d"
)

expected_offered_by = {"name": offered_by} if offered_by else None

episodes_rss = list(bs(rss_content(), "xml").find_all("item"))

for episode in episodes_rss:
episode.guid.string = f"{expected_readable_id}: {episode.guid.text}"

expected_results = [
{
"readable_id": expected_readable_id,
"readable_id": "website.url/podcast/rss.xml",
"etl_source": ETLSource.podcast.name,
"title": expected_title,
"offered_by": expected_offered_by,
Expand All @@ -149,13 +140,13 @@ def test_transform(mock_github_client, title, topics, offered_by):
"podcast": {
"google_podcasts_url": "google_podcasts_url",
"apple_podcasts_url": "apple_podcasts_url",
"rss_url": "rss_url",
"rss_url": "http://website.url/podcast/rss.xml",
},
"resource_type": LearningResourceType.podcast.name,
"topics": expected_topics,
"episodes": [
{
"readable_id": "episode15ede89915db9342fb76bc91918d22016",
"readable_id": "tag:soundcloud,2010:tracks/numbers1",
"etl_source": ETLSource.podcast.name,
"title": "Episode1",
"offered_by": expected_offered_by,
Expand All @@ -178,7 +169,7 @@ def test_transform(mock_github_client, title, topics, offered_by):
"topics": expected_topics,
},
{
"readable_id": "episode205c066df9ed531e48c6414f6e72d3b96",
"readable_id": "tag:soundcloud,2010:tracks/numbers2",
"etl_source": ETLSource.podcast.name,
"title": "Episode2",
"offered_by": expected_offered_by,
Expand Down Expand Up @@ -229,11 +220,11 @@ def test_transform_with_error(mocker, mock_github_client):
results = list(transform(extract_results))

mock_exception_log.assert_called_once_with(
"Error parsing podcast data from %s", "rss_url"
"Error parsing podcast data from %s", "http://website.url/podcast/rss.xml"
)

assert len(results) == 1
assert results[0]["url"] == "website_url"
assert results[0]["url"] == "http://website.url/podcast"


@pytest.mark.django_db()
Expand Down
63 changes: 63 additions & 0 deletions learning_resources/management/commands/transfer_list_resources.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
"""Management command for migrating unpublished resource path/list relationships"""

from django.core.management import BaseCommand

from learning_resources.utils import transfer_list_resources
from main.utils import now_in_utc


class Command(BaseCommand):
    """
    Management command that transfers learningpath/userlist relationships
    from unpublished resources to their published replacement resources.
    """

    help = "Migrate relationships from unpublished resources to published resources."

    def add_arguments(self, parser):
        """Register the positional source/type arguments and the --delete flag."""
        positional = (
            ("resource_type", "Resource type to migrate"),
            ("match_field", "Resource field to match resources by"),
            ("from_source", "ETL Source for unpublished resources"),
            ("to_source", "ETL Source for published resources"),
        )
        for arg_name, arg_help in positional:
            parser.add_argument(arg_name, type=str, help=arg_help)
        parser.add_argument(
            "--delete",
            dest="delete",
            action="store_true",
            help="Delete unpublished resources after migrating relationships",
        )
        super().add_arguments(parser)

    def handle(self, *args, **options):  # noqa: ARG002
        """Run the transfer and report resource counts plus elapsed time."""
        resource_type = options["resource_type"]
        match_field = options["match_field"]
        from_source = options["from_source"]
        to_source = options["to_source"]

        self.stdout.write(
            f"Migrate {resource_type} relationships from "
            f"{from_source} to {to_source}, matching on {match_field}"
        )
        started_at = now_in_utc()
        unpublished, matching = transfer_list_resources(
            resource_type,
            match_field,
            from_source,
            to_source,
            delete_unpublished=options["delete"],
        )
        total_seconds = (now_in_utc() - started_at).total_seconds()
        self.stdout.write(
            f"Processed {unpublished} resources and found {matching} "
            f"published matches, took {total_seconds} seconds"
        )
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Generated by Django 4.2.13 on 2024-07-08 16:30

from django.db import migrations, models


class Migration(migrations.Migration):
    """Widen LearningResource.readable_id from 128 to 512 characters."""

    # Must follow the latest applied migration in this app.
    dependencies = [
        ("learning_resources", "0055_alter_relationship_options_ordering"),
    ]

    operations = [
        # Longer limit accommodates readable_ids derived from full
        # podcast/episode urls and guids rather than truncated titles.
        migrations.AlterField(
            model_name="learningresource",
            name="readable_id",
            field=models.CharField(max_length=512),
        ),
    ]
2 changes: 1 addition & 1 deletion learning_resources/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ class LearningResource(TimestampedModel):
*([item.name for item in LearningResourceType]),
]

readable_id = models.CharField(max_length=128, null=False, blank=False)
readable_id = models.CharField(max_length=512, null=False, blank=False)
title = models.CharField(max_length=256)
description = models.TextField(null=True, blank=True) # noqa: DJ001
full_description = models.TextField(null=True, blank=True) # noqa: DJ001
Expand Down
53 changes: 53 additions & 0 deletions learning_resources/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

from learning_resources.constants import (
GROUP_STAFF_LISTS_EDITORS,
LearningResourceRelationTypes,
semester_mapping,
)
from learning_resources.hooks import get_plugin_manager
Expand All @@ -24,9 +25,11 @@
LearningResourceDepartment,
LearningResourceOfferor,
LearningResourcePlatform,
LearningResourceRelationship,
LearningResourceRun,
LearningResourceSchool,
LearningResourceTopic,
UserListRelationship,
)
from main.utils import generate_filepath

Expand Down Expand Up @@ -559,3 +562,53 @@ def add_parent_topics_to_learning_resource(resource):
for topic in resource.topics.all():
if topic.parent:
_walk_lr_topic_parents(resource, topic.parent)


def transfer_list_resources(
    resource_type: str,
    matching_field: str,
    from_source: str,
    to_source: str,
    *,
    delete_unpublished: bool = False,
) -> tuple[int, int]:
    """
    Migrate unpublished learningpath/userlist resources that have
    been replaced with new resource objects.

    Args:
        resource_type (str): the resource type
        matching_field (str): the unique field to match identical resources
        from_source (str): the ETL source of unpublished resources
        to_source (str): the ETL source of published resources
        delete_unpublished (bool): whether to delete the unpublished resources

    Returns:
        tuple[int, int]: the number of unpublished and matching published resources
    """
    stale_resources = LearningResource.objects.filter(
        resource_type=resource_type, published=False, etl_source=from_source
    )
    processed = 0
    matched = 0
    for stale in stale_resources:
        processed += 1
        # Look for a published resource from the target ETL source whose
        # matching_field value equals that of the unpublished resource.
        replacement = LearningResource.objects.filter(
            resource_type=resource_type,
            published=True,
            etl_source=to_source,
            **{matching_field: getattr(stale, matching_field)},
        ).first()
        if replacement is None:
            continue
        matched += 1
        # Repoint learning-path and user-list memberships at the replacement.
        LearningResourceRelationship.objects.filter(
            relation_type=LearningResourceRelationTypes.LEARNING_PATH_ITEMS.value,
            child=stale,
        ).update(child=replacement)
        UserListRelationship.objects.filter(child=stale).update(
            child=replacement
        )
    if delete_unpublished:
        # Removes every unpublished resource in the set, matched or not.
        stale_resources.delete()
    return processed, matched
Loading

0 comments on commit d9cbccc

Please sign in to comment.