Skip to content

Commit 46ef314

Browse files
authored
Celery task to embed new contentfiles (#2044)
* adding initial task to embed new contentfiles * adding test * adding celery task to periodically pick up contentfiles
1 parent 8b57ae8 commit 46ef314

File tree

3 files changed

+72
-7
lines changed

3 files changed

+72
-7
lines changed

main/settings_celery.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -131,12 +131,6 @@
131131
"schedule": crontab(minute=30, hour=18), # 2:30pm EST
132132
"kwargs": {"period": "daily", "subscription_type": "channel_subscription_type"},
133133
},
134-
"daily_embed_new_learning_resources": {
135-
"task": "vector_search.tasks.embed_new_learning_resources",
136-
"schedule": get_int(
137-
"EMBED_NEW_RESOURCES_SCHEDULE_SECONDS", 60 * 30
138-
), # default is every 30 minutes
139-
},
140134
"send-search-subscription-emails-every-1-days": {
141135
"task": "learning_resources_search.tasks.send_subscription_emails",
142136
"schedule": crontab(minute=0, hour=19), # 3:00pm EST
@@ -156,6 +150,13 @@
156150
"EMBED_NEW_RESOURCES_SCHEDULE_SECONDS", 60 * 30
157151
), # default is every 30 minutes
158152
}
153+
# Beat entry that periodically runs the content file embedding task.
CELERY_BEAT_SCHEDULE["daily_embed_new_content_files"] = {
    "task": "vector_search.tasks.embed_new_content_files",
    # interval read from the environment; default is every 30 minutes
    "schedule": get_int("EMBED_NEW_CONTENT_FILES_SCHEDULE_SECONDS", 60 * 30),
}
159+
159160

160161
CELERY_TASK_SERIALIZER = "json"
161162
CELERY_RESULT_SERIALIZER = "json"

vector_search/tasks.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -254,3 +254,30 @@ def embed_new_learning_resources(self):
254254
embed_tasks = celery.group(tasks)
255255

256256
return self.replace(embed_tasks)
257+
258+
259+
@app.task(bind=True)
260+
def embed_new_content_files(self):
261+
"""
262+
Embed new content files from the last day
263+
"""
264+
log.info("Running content file embedding task")
265+
delta = datetime.timedelta(days=1)
266+
since = now_in_utc() - delta
267+
new_content_files = ContentFile.objects.filter(
268+
published=True,
269+
created_on__gt=since,
270+
run__published=True,
271+
)
272+
273+
tasks = [
274+
generate_embeddings.si(ids, CONTENT_FILE_TYPE, overwrite=False)
275+
for ids in chunks(
276+
new_content_files.values_list("id", flat=True),
277+
chunk_size=settings.QDRANT_CHUNK_SIZE,
278+
)
279+
]
280+
281+
embed_tasks = celery.group(tasks)
282+
283+
return self.replace(embed_tasks)

vector_search/tasks_test.py

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,13 +11,14 @@
1111
LearningResourceRunFactory,
1212
ProgramFactory,
1313
)
14-
from learning_resources.models import LearningResource
14+
from learning_resources.models import ContentFile, LearningResource
1515
from learning_resources_search.constants import (
1616
COURSE_TYPE,
1717
)
1818
from main.utils import now_in_utc
1919
from vector_search.tasks import (
2020
embed_learning_resources_by_id,
21+
embed_new_content_files,
2122
embed_new_learning_resources,
2223
start_embed_resources,
2324
)
@@ -149,6 +150,42 @@ def test_embed_new_learning_resources(mocker, mocked_celery):
149150
assert sorted(daily_resource_ids) == sorted(embedded_ids)
150151

151152

153+
def test_embed_new_content_files(mocker, mocked_celery):
    """
    embed_new_content_files should only queue embeddings for content files
    that were created within the last day
    """
    mocker.patch("vector_search.tasks.load_course_blocklist", return_value=[])

    recent_created_on = now_in_utc() - datetime.timedelta(hours=5)
    ContentFileFactory.create_batch(
        4, created_on=recent_created_on, published=True
    )

    # content files older than a day
    stale_created_on = now_in_utc() - datetime.timedelta(days=5)
    ContentFileFactory.create_batch(
        4, created_on=stale_created_on, published=True
    )

    one_day_ago = now_in_utc() - datetime.timedelta(days=1)
    expected_ids = list(
        ContentFile.objects.filter(created_on__gt=one_day_ago).values_list(
            "id", flat=True
        )
    )

    generate_embeddings_mock = mocker.patch(
        "vector_search.tasks.generate_embeddings", autospec=True
    )

    # the task replaces itself, which the mocked celery signals by raising
    with pytest.raises(mocked_celery.replace_exception_class):
        embed_new_content_files.delay()
    list(mocked_celery.group.call_args[0][0])

    # first positional argument of the first .si() call is the chunk of ids
    first_si_call = generate_embeddings_mock.si.mock_calls[0]
    assert sorted(expected_ids) == sorted(first_si_call.args[0])
187+
188+
152189
def test_embed_learning_resources_by_id(mocker, mocked_celery):
153190
"""
154191
embed_learning_resources_by_id should generate embeddings for resources

0 commit comments

Comments
 (0)