Skip to content

Commit

Permalink
Change readable_id values for podcasts and episodes (#1232)
Browse files Browse the repository at this point in the history
  • Loading branch information
mbertrand authored Jul 11, 2024
1 parent ed7dd68 commit d9cbccc
Show file tree
Hide file tree
Showing 9 changed files with 261 additions and 54 deletions.
23 changes: 18 additions & 5 deletions learning_resources/etl/loaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -682,6 +682,10 @@ def load_podcast(podcast_data: dict) -> LearningResource:
LearningResource.objects.filter(id__in=unpublished_episode_ids).update(
published=False
)
bulk_resources_unpublished_actions(
unpublished_episode_ids,
LearningResourceType.podcast_episode.name,
)
episode_ids.extend(unpublished_episode_ids)
learning_resource.resources.set(
episode_ids,
Expand Down Expand Up @@ -719,13 +723,22 @@ def load_podcasts(podcasts_data: list[dict]) -> list[LearningResource]:

# unpublish the podcasts and episodes we're no longer tracking
ids = [podcast.id for podcast in podcast_resources]
LearningResource.objects.filter(
unpublished_podcasts = LearningResource.objects.filter(
resource_type=LearningResourceType.podcast.name
).exclude(id__in=ids).update(published=False)
LearningResource.objects.filter(
).exclude(id__in=ids)
unpublished_podcasts.update(published=False)
bulk_resources_unpublished_actions(
unpublished_podcasts.values_list("id", flat=True),
LearningResourceType.podcast.name,
)
unpublished_episodes = LearningResource.objects.filter(
resource_type=LearningResourceType.podcast_episode.name
).exclude(parents__parent__in=ids).update(published=False)

).exclude(parents__parent__in=ids)
unpublished_episodes.update(published=False)
bulk_resources_unpublished_actions(
unpublished_episodes.values_list("id", flat=True),
LearningResourceType.podcast_episode.name,
)
return podcast_resources


Expand Down
45 changes: 27 additions & 18 deletions learning_resources/etl/podcast.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

from learning_resources.constants import LearningResourceType
from learning_resources.etl.constants import ETLSource
from learning_resources.etl.utils import clean_data, generate_readable_id
from learning_resources.etl.utils import clean_data
from learning_resources.models import PodcastEpisode
from main.utils import frontend_absolute_url, now_in_utc

Expand Down Expand Up @@ -100,6 +100,19 @@ def get_podcast_configs():
return podcast_configs


def parse_readable_id_from_url(url):
    """
    Derive a readable id from a podcast/episode url by stripping everything
    up to and including the last "//" (typically the scheme separator).

    Args:
        url (str): the podcast/episode url

    Returns:
        str: the readable id
    """
    # str.rpartition yields ("", "", url) when "//" is absent, so a url
    # without a scheme passes through unchanged — same as split("//")[-1].
    return url.rpartition("//")[2]


def extract():
"""
Function for extracting podcast data
Expand All @@ -125,7 +138,7 @@ def extract():
log.exception("Invalid rss url %s", rss_url)


def transform_episode(rss_data, offered_by, topics, parent_image, podcast_id):
def transform_episode(rss_data, offered_by, topics, parent_image):
"""
Transform a podcast episode into our normalized data
Expand All @@ -140,23 +153,22 @@ def transform_episode(rss_data, offered_by, topics, parent_image, podcast_id):
normalized podcast episode data
"""

rss_data.guid.string = f"{podcast_id}: {rss_data.guid.text}"

return {
"readable_id": generate_readable_id(rss_data.title.text[:95]),
"readable_id": rss_data.guid.text
or parse_readable_id_from_url(
rss_data.link.text if rss_data.link else rss_data.enclosure["url"]
),
"etl_source": ETLSource.podcast.name,
"resource_type": LearningResourceType.podcast_episode.name,
"title": rss_data.title.text,
"offered_by": offered_by,
"description": clean_data(rss_data.description.text),
"url": rss_data.enclosure["url"],
"image": {
"url": (
rss_data.find("image")["href"]
if rss_data.find("image")
else parent_image
),
},
"url": (rss_data.find("image")["href"]),
}
if rss_data.find("image")
else parent_image,
"last_modified": parse(rss_data.pubDate.text),
"published": True,
"topics": topics,
Expand Down Expand Up @@ -186,7 +198,7 @@ def transform(extracted_podcasts):
for rss_data, config_data in extracted_podcasts:
try:
image = (
rss_data.channel.find("itunes:image")["href"]
{"url": rss_data.channel.find("itunes:image")["href"]}
if rss_data.channel.find("itunes:image")
else None
)
Expand All @@ -203,23 +215,20 @@ def transform(extracted_podcasts):
apple_podcasts_url = config_data.get("apple_podcasts_url")
google_podcasts_url = config_data.get("google_podcasts_url")
title = config_data.get("podcast_title", rss_data.channel.title.text)
podcast_id = generate_readable_id(title[:95])

yield {
"readable_id": podcast_id,
"readable_id": parse_readable_id_from_url(config_data["rss_url"]),
"title": title,
"etl_source": ETLSource.podcast.name,
"resource_type": LearningResourceType.podcast.name,
"offered_by": offered_by,
"description": clean_data(rss_data.channel.description.text),
"image": {"url": rss_data.channel.find("itunes:image")["href"]},
"image": image,
"published": True,
"url": config_data["website"],
"topics": topics,
"episodes": (
transform_episode(
episode_rss, offered_by, topics, image, podcast_id
)
transform_episode(episode_rss, offered_by, topics, image)
for episode_rss in rss_data.find_all("item")
),
"podcast": {
Expand Down
27 changes: 9 additions & 18 deletions learning_resources/etl/podcast_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,23 +43,22 @@ def rss_content():
def mock_podcast_file( # pylint: disable=too-many-arguments # noqa: PLR0913
podcast_title=None,
topics=None,
website_url="website_url",
website_url="http://website.url/podcast",
offered_by=None,
google_podcasts_url="google_podcasts_url",
apple_podcasts_url="apple_podcasts_url",
rss_url="rss_url",
rss_url="http://website.url/podcast/rss.xml",
):
"""Mock podcast github file"""

content = f"""---
rss_url: rss_url
rss_url: {rss_url}
{ "podcast_title: " + podcast_title if podcast_title else "" }
{ "topics: " + topics if topics else "" }
{ "offered_by: " + offered_by if offered_by else "" }
website: {website_url}
google_podcasts_url: {google_podcasts_url}
apple_podcasts_url: {apple_podcasts_url}
rss_url: {rss_url}
"""
return Mock(decoded_content=content)

Expand Down Expand Up @@ -123,22 +122,14 @@ def test_transform(mock_github_client, title, topics, offered_by):
)

expected_title = title if title else "A Podcast"
expected_readable_id = (
"custom-titleb04b26d38dd63a2c829393e9e075927d"
if title
else "a-podcast7e3a1ebb0c4d3196ba4c7f8254af4d2d"
)

expected_offered_by = {"name": offered_by} if offered_by else None

episodes_rss = list(bs(rss_content(), "xml").find_all("item"))

for episode in episodes_rss:
episode.guid.string = f"{expected_readable_id}: {episode.guid.text}"

expected_results = [
{
"readable_id": expected_readable_id,
"readable_id": "website.url/podcast/rss.xml",
"etl_source": ETLSource.podcast.name,
"title": expected_title,
"offered_by": expected_offered_by,
Expand All @@ -149,13 +140,13 @@ def test_transform(mock_github_client, title, topics, offered_by):
"podcast": {
"google_podcasts_url": "google_podcasts_url",
"apple_podcasts_url": "apple_podcasts_url",
"rss_url": "rss_url",
"rss_url": "http://website.url/podcast/rss.xml",
},
"resource_type": LearningResourceType.podcast.name,
"topics": expected_topics,
"episodes": [
{
"readable_id": "episode15ede89915db9342fb76bc91918d22016",
"readable_id": "tag:soundcloud,2010:tracks/numbers1",
"etl_source": ETLSource.podcast.name,
"title": "Episode1",
"offered_by": expected_offered_by,
Expand All @@ -178,7 +169,7 @@ def test_transform(mock_github_client, title, topics, offered_by):
"topics": expected_topics,
},
{
"readable_id": "episode205c066df9ed531e48c6414f6e72d3b96",
"readable_id": "tag:soundcloud,2010:tracks/numbers2",
"etl_source": ETLSource.podcast.name,
"title": "Episode2",
"offered_by": expected_offered_by,
Expand Down Expand Up @@ -229,11 +220,11 @@ def test_transform_with_error(mocker, mock_github_client):
results = list(transform(extract_results))

mock_exception_log.assert_called_once_with(
"Error parsing podcast data from %s", "rss_url"
"Error parsing podcast data from %s", "http://website.url/podcast/rss.xml"
)

assert len(results) == 1
assert results[0]["url"] == "website_url"
assert results[0]["url"] == "http://website.url/podcast"


@pytest.mark.django_db()
Expand Down
63 changes: 63 additions & 0 deletions learning_resources/management/commands/transfer_list_resources.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
"""Management command for migrating unpublished resource path/list relationships"""

from django.core.management import BaseCommand

from learning_resources.utils import transfer_list_resources
from main.utils import now_in_utc


class Command(BaseCommand):
    """
    Management command that transfers learningpath/userlist relationships
    from unpublished resources to their published replacement resources.
    """

    help = "Migrate relationships from unpublished resources to published resources."

    def add_arguments(self, parser):
        """Register the positional source/type arguments and the --delete flag."""
        positional = (
            ("resource_type", "Resource type to migrate"),
            ("match_field", "Resource field to match resources by"),
            ("from_source", "ETL Source for unpublished resources"),
            ("to_source", "ETL Source for published resources"),
        )
        for arg_name, arg_help in positional:
            parser.add_argument(arg_name, type=str, help=arg_help)
        parser.add_argument(
            "--delete",
            dest="delete",
            action="store_true",
            help="Delete unpublished resources after migrating relationships",
        )
        super().add_arguments(parser)

    def handle(self, *args, **options):  # noqa: ARG002
        """Run the transfer and report resource counts plus elapsed time."""
        resource_type = options["resource_type"]
        match_field = options["match_field"]
        from_source = options["from_source"]
        to_source = options["to_source"]

        self.stdout.write(
            f"Migrate {resource_type} relationships from "
            f"{from_source} to {to_source}, matching on {match_field}"
        )
        started_at = now_in_utc()
        unpublished, matching = transfer_list_resources(
            resource_type,
            match_field,
            from_source,
            to_source,
            delete_unpublished=options["delete"],
        )
        total_seconds = (now_in_utc() - started_at).total_seconds()
        self.stdout.write(
            f"Processed {unpublished} resources and found {matching} "
            f"published matches, took {total_seconds} seconds"
        )
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Generated by Django 4.2.13 on 2024-07-08 16:30

from django.db import migrations, models


class Migration(migrations.Migration):
    """Widen LearningResource.readable_id from 128 to 512 characters."""

    # Must follow the latest applied migration in this app.
    dependencies = [
        ("learning_resources", "0055_alter_relationship_options_ordering"),
    ]

    operations = [
        # Longer limit accommodates readable_ids derived from full
        # podcast/episode urls and guids rather than truncated titles.
        migrations.AlterField(
            model_name="learningresource",
            name="readable_id",
            field=models.CharField(max_length=512),
        ),
    ]
2 changes: 1 addition & 1 deletion learning_resources/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ class LearningResource(TimestampedModel):
*([item.name for item in LearningResourceType]),
]

readable_id = models.CharField(max_length=128, null=False, blank=False)
readable_id = models.CharField(max_length=512, null=False, blank=False)
title = models.CharField(max_length=256)
description = models.TextField(null=True, blank=True) # noqa: DJ001
full_description = models.TextField(null=True, blank=True) # noqa: DJ001
Expand Down
53 changes: 53 additions & 0 deletions learning_resources/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

from learning_resources.constants import (
GROUP_STAFF_LISTS_EDITORS,
LearningResourceRelationTypes,
semester_mapping,
)
from learning_resources.hooks import get_plugin_manager
Expand All @@ -24,9 +25,11 @@
LearningResourceDepartment,
LearningResourceOfferor,
LearningResourcePlatform,
LearningResourceRelationship,
LearningResourceRun,
LearningResourceSchool,
LearningResourceTopic,
UserListRelationship,
)
from main.utils import generate_filepath

Expand Down Expand Up @@ -559,3 +562,53 @@ def add_parent_topics_to_learning_resource(resource):
for topic in resource.topics.all():
if topic.parent:
_walk_lr_topic_parents(resource, topic.parent)


def transfer_list_resources(
    resource_type: str,
    matching_field: str,
    from_source: str,
    to_source: str,
    *,
    delete_unpublished: bool = False,
) -> tuple[int, int]:
    """
    Migrate unpublished learningpath/userlist resources that have
    been replaced with new resource objects.

    Args:
        resource_type (str): the resource type
        matching_field (str): the unique field to match identical resources
        from_source (str): the ETL source of unpublished resources
        to_source (str): the ETL source of published resources
        delete_unpublished (bool): whether to delete the unpublished resources

    Returns:
        tuple[int, int]: the number of unpublished and matching published resources
    """
    stale_resources = LearningResource.objects.filter(
        resource_type=resource_type, published=False, etl_source=from_source
    )
    processed = 0
    matched = 0
    for stale in stale_resources:
        processed += 1
        # Look for a published resource from the target ETL source whose
        # matching_field value equals that of the unpublished resource.
        replacement = LearningResource.objects.filter(
            resource_type=resource_type,
            published=True,
            etl_source=to_source,
            **{matching_field: getattr(stale, matching_field)},
        ).first()
        if replacement is None:
            continue
        matched += 1
        # Repoint learning-path and user-list memberships at the replacement.
        LearningResourceRelationship.objects.filter(
            relation_type=LearningResourceRelationTypes.LEARNING_PATH_ITEMS.value,
            child=stale,
        ).update(child=replacement)
        UserListRelationship.objects.filter(child=stale).update(
            child=replacement
        )
    if delete_unpublished:
        # Removes every unpublished resource in the set, matched or not.
        stale_resources.delete()
    return processed, matched
Loading

0 comments on commit d9cbccc

Please sign in to comment.