-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathbackpopulate_youtube_data.py
120 lines (105 loc) · 4.42 KB
/
backpopulate_youtube_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
"""Management command for populating youtube course data"""
from datetime import UTC, datetime

from django.core.management import BaseCommand, CommandError

from learning_resources.etl.constants import ETLSource
from learning_resources.models import LearningResource, VideoChannel
from learning_resources.tasks import get_youtube_data, get_youtube_transcripts
from learning_resources.utils import resource_delete_actions
from main.constants import ISOFORMAT
from main.utils import now_in_utc
class Command(BaseCommand):
    """Populate youtube videos

    Exactly one subcommand must be given:

    * ``delete`` - run ``resource_delete_actions`` on every
      ``LearningResource`` whose ``etl_source`` is ``ETLSource.youtube.name``
      and then delete all ``VideoChannel`` rows.
    * ``fetch`` - dispatch ``get_youtube_data`` (optionally limited to the
      channel ids given with ``-c/--channel-id``) and wait for its result.
    * ``transcripts`` - dispatch ``get_youtube_transcripts`` with the
      ``--created-after`` / ``--created-minutes`` / ``--overwrite`` options
      and wait for it to finish.
    """

    help = """Populates youtube videos"""

    def add_arguments(self, parser):
        """Configure arguments for this command"""
        # The subcommand is mandatory: without ``required=True`` argparse
        # accepts an empty command line, ``options["command"]`` is None and
        # ``handle`` would exit successfully without doing anything.
        subparsers = parser.add_subparsers(dest="command", required=True)

        # delete subcommand
        subparsers.add_parser("delete", help="Delete all existing records first")

        # fetch subcommand
        fetch_parser = subparsers.add_parser(
            "fetch", help="Fetches video data, defaulting to recently published ones"
        )
        fetch_parser.add_argument(
            "-c",
            "--channel-id",
            dest="channel_ids",
            action="append",
            default=None,
            help="Only fetch channels specified by channel id",
        )

        # transcripts subcommand
        transcripts_parser = subparsers.add_parser(
            "transcripts", help="Fetches video transcript data"
        )
        transcripts_parser.add_argument(
            "--created-after",
            dest="created_after",
            default=None,
            help="Only fetch transcripts for videos indexed after timestamp (yyyy-mm-ddThh:mm:ssZ)",  # noqa: E501
        )
        transcripts_parser.add_argument(
            "--created-minutes",
            dest="created_minutes",
            default=None,
            help="Only fetch transcripts for videos indexed this number of minutes ago and later",  # noqa: E501
        )
        transcripts_parser.add_argument(
            "--overwrite",
            dest="overwrite",
            action="store_true",
            help="Overwrite any existing transcript records",
        )
        super().add_arguments(parser)

    def handle(self, *args, **options):  # noqa: ARG002
        """Run Populate youtube videos

        Dispatches to the helper matching ``options["command"]``.

        Raises:
            CommandError: if the subcommand is missing/unknown, or if a
                ``transcripts`` filter value cannot be parsed.
        """
        command = options["command"]
        if command == "delete":
            self._delete_resources()
        elif command == "fetch":
            self._fetch_videos(options["channel_ids"])
        elif command == "transcripts":
            self._fetch_transcripts(
                options["created_after"],
                options["created_minutes"],
                options["overwrite"],
            )
        else:
            # Only reachable when handle() is invoked without going through
            # the argument parser; fail loudly instead of silently succeeding.
            msg = f"Unknown or missing subcommand: {command!r}"
            raise CommandError(msg)

    def _delete_resources(self):
        """Delete all YouTube learning resources and every video channel."""
        videos_playlists = LearningResource.objects.filter(
            etl_source=ETLSource.youtube.name
        )
        self.stdout.write(
            f"Deleting {videos_playlists.count()} existing YouTube resources"
        )
        # Resources are removed one at a time through resource_delete_actions
        # rather than with a bulk queryset delete.
        for resource in videos_playlists:
            resource_delete_actions(resource)
        VideoChannel.objects.all().delete()
        self.stdout.write("Complete")

    def _fetch_videos(self, channel_ids):
        """Dispatch the YouTube data task and block until it returns.

        Args:
            channel_ids: list of channel ids collected from ``-c`` flags, or
                None when the flag was not given.
        """
        task = get_youtube_data.delay(channel_ids=channel_ids)
        self.stdout.write(f"Started task {task} to get YouTube video data")
        self.stdout.write("Waiting on task...")
        start = now_in_utc()
        result = task.get()
        total_seconds = (now_in_utc() - start).total_seconds()
        self.stdout.write(
            f"Fetched {result} YouTube channel in {total_seconds} seconds"
        )

    def _fetch_transcripts(self, created_after, created_minutes, overwrite):
        """Validate the filters, dispatch the transcripts task and wait for it.

        Args:
            created_after: timestamp string in ``ISOFORMAT``, or a falsy value
                to skip this filter.
            created_minutes: value convertible with ``int()``, or a falsy
                value to skip this filter.
            overwrite: passed through unchanged to the task.

        Raises:
            CommandError: if ``created_after`` does not match ``ISOFORMAT`` or
                ``created_minutes`` is not an integer. Raising (rather than
                printing and returning) makes the command exit non-zero so
                callers can detect the bad invocation.
        """
        if created_after:
            try:
                # The parsed value is naive, so attach UTC explicitly.
                created_after = datetime.strptime(created_after, ISOFORMAT).replace(
                    tzinfo=UTC
                )
            except ValueError as err:
                msg = "Invalid date format"
                raise CommandError(msg) from err

        if created_minutes:
            try:
                created_minutes = int(created_minutes)
            except ValueError as err:
                msg = "created_minutes must be an integer"
                raise CommandError(msg) from err

        task = get_youtube_transcripts.delay(
            created_after=created_after,
            created_minutes=created_minutes,
            overwrite=overwrite,
        )
        self.stdout.write("Waiting on task...")
        start = now_in_utc()
        task.get()
        total_seconds = (now_in_utc() - start).total_seconds()
        self.stdout.write(f"Completed in {total_seconds} seconds")