Skip to content

Commit

Permalink
Duration and/or commitment for Sloan and Professional Education (#1906)
Browse files Browse the repository at this point in the history
  • Loading branch information
mbertrand authored Jan 2, 2025
1 parent f832053 commit 36ebf4f
Show file tree
Hide file tree
Showing 8 changed files with 229 additions and 2 deletions.
30 changes: 30 additions & 0 deletions learning_resources/etl/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,36 @@

MIT_OWNER_KEYS = ["MITx", "MITx_PRO"]

# Canonical English time units (keys) mapped to alternate spellings that should
# be treated as the same unit (values). Each plural key is listed before its
# singular so that a regex alternation built from the keys, in order, tries the
# longer spelling first.
# NOTE(review): the "half-days"/"half-day" entries list plain "days"/"day" as
# alternates - confirm this is intentional.
TIME_INTERVAL_MAPPING = {
    "half-days": ["days"],
    "half-day": ["day"],
    "hours": ["horas"],
    "hour": ["hora"],
    "days": ["días", "jours"],
    "day": ["día", "jour"],
    "weeks": ["semanas", "semaines", "settimanes"],
    "week": ["semana", "semaine", "settimane"],
    "months": ["meses", "mois", "mesi"],
    "month": ["mes", "mois"],
}


OfferedByLoaderConfig = namedtuple( # noqa: PYI024
"OfferedByLoaderConfig", ["additive"], defaults=[False]
Expand Down
6 changes: 5 additions & 1 deletion learning_resources/etl/mitpe.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
PlatformType,
)
from learning_resources.etl.constants import ETLSource
from learning_resources.etl.utils import transform_delivery
from learning_resources.etl.utils import parse_resource_duration, transform_delivery
from main.utils import clean_data, now_in_utc

log = logging.getLogger(__name__)
Expand Down Expand Up @@ -231,6 +231,7 @@ def _transform_runs(resource_data: dict) -> list[dict]:
resource_data["end_date"].split("|"),
resource_data["enrollment_end_date"].split("|"),
)
duration = parse_resource_duration(resource_data.get("duration"))
published_runs = []
for run_data in runs_data:
start = parse_date(run_data[1])
Expand Down Expand Up @@ -264,6 +265,9 @@ def _transform_runs(resource_data: dict) -> list[dict]:
"availability": Availability.dated.name,
"delivery": transform_delivery(resource_data["learning_format"]),
"location": parse_location(resource_data),
"duration": duration.duration,
"min_weeks": duration.min_weeks,
"max_weeks": duration.max_weeks,
}
)
return published_runs
Expand Down
9 changes: 9 additions & 0 deletions learning_resources/etl/mitpe_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,9 @@
"availability": Availability.dated.name,
"delivery": [LearningResourceDelivery.online.name],
"location": "",
"duration": "9 semanas",
"min_weeks": 9,
"max_weeks": 9,
}
],
"format": [Format.asynchronous.name],
Expand Down Expand Up @@ -111,6 +114,9 @@
"availability": Availability.dated.name,
"delivery": [LearningResourceDelivery.in_person.name],
"location": "On Campus",
"duration": "3 Days",
"min_weeks": 1,
"max_weeks": 1,
}
],
"format": [Format.asynchronous.name],
Expand Down Expand Up @@ -156,6 +162,9 @@
"availability": Availability.dated.name,
"location": "",
"delivery": [LearningResourceDelivery.online.name],
"duration": "6 to 10 Weeks",
"min_weeks": 6,
"max_weeks": 10,
}
],
"courses": [EXPECTED_COURSES[0], EXPECTED_COURSES[1]],
Expand Down
10 changes: 10 additions & 0 deletions learning_resources/etl/sloan.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
)
from learning_resources.etl.constants import ETLSource
from learning_resources.etl.utils import (
parse_resource_commitment,
parse_resource_duration,
transform_delivery,
transform_price,
transform_topics,
Expand Down Expand Up @@ -235,6 +237,8 @@ def transform_run(run_data, course_data):
faculty_names = (
run_data["Faculty_Name"].split(",") if run_data["Faculty_Name"] else []
)
duration = parse_resource_duration(run_data["Duration"])
commitment = parse_resource_commitment(run_data["Time_Commitment"])
return {
"run_id": run_data["CO_Title"],
"start_date": parse_datetime(run_data["Start_Date"]),
Expand All @@ -254,6 +258,12 @@ def transform_run(run_data, course_data):
"pace": [parse_pace(run_data)],
"format": parse_format(run_data),
"location": parse_location(run_data),
"duration": duration.duration,
"min_weeks": duration.min_weeks,
"max_weeks": duration.max_weeks,
"time_commitment": commitment.commitment,
"min_weekly_hours": commitment.min_weekly_hours,
"max_weekly_hours": commitment.max_weekly_hours,
}


Expand Down
12 changes: 12 additions & 0 deletions learning_resources/etl/sloan_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@
transform_delivery,
transform_run,
)
from learning_resources.etl.utils import (
parse_resource_commitment,
parse_resource_duration,
)
from learning_resources.factories import (
LearningResourceOfferorFactory,
LearningResourceTopicMappingFactory,
Expand Down Expand Up @@ -117,6 +121,8 @@ def test_transform_run(
faculty_names = (
run_data["Faculty_Name"].split(",") if run_data["Faculty_Name"] else []
)
duration = parse_resource_duration(run_data["Duration"])
commitment = parse_resource_commitment(run_data["Time_Commitment"])
assert transform_run(run_data, course_data) == {
"run_id": run_data["CO_Title"],
"start_date": parse_datetime(run_data["Start_Date"]),
Expand All @@ -132,6 +138,12 @@ def test_transform_run(
"pace": [Pace.instructor_paced.name],
"format": [Format.synchronous.name],
"location": run_data["Location"],
"duration": duration.duration,
"min_weeks": duration.min_weeks,
"max_weeks": duration.max_weeks,
"time_commitment": commitment.commitment,
"min_weekly_hours": commitment.min_weekly_hours,
"max_weekly_hours": commitment.max_weekly_hours,
}


Expand Down
112 changes: 112 additions & 0 deletions learning_resources/etl/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import glob
import json
import logging
import math
import mimetypes
import os
import re
Expand Down Expand Up @@ -40,7 +41,10 @@
)
from learning_resources.etl.constants import (
RESOURCE_DELIVERY_MAPPING,
TIME_INTERVAL_MAPPING,
CommitmentConfig,
CourseNumberType,
DurationConfig,
ETLSource,
)
from learning_resources.models import (
Expand Down Expand Up @@ -758,3 +762,111 @@ def parse_string_to_int(int_str: str) -> int | None:
return int(int_str)
except (TypeError, ValueError):
return None


def calculate_weeks(num: int, from_unit: str) -> int:
    """
    Convert a count of days, weeks or months into a count of weeks

    Args:
        num (int): the numerical value
        from_unit (str): the time unit; any unit containing "day" is treated
            as days, any unit containing "month" as months, and everything
            else is assumed to already be weeks
    Returns:
        int: the number of weeks
    """
    if "day" in from_unit:
        # Five working days per week (weekends excluded); any number of days
        # counts as at least one week.
        working_weeks = math.ceil(num / 5)
        return max(working_weeks, 1)
    if "month" in from_unit:
        # A month is approximated as four weeks.
        return 4 * num
    return num


def transform_interval(interval_txt: str) -> str | None:
    """
    Find the time unit mentioned in a text and return it as a standard
    English unit

    The text is first searched for one of the English units that are the keys
    of TIME_INTERVAL_MAPPING. If none is present, it is searched for one of the
    alternate spellings listed as values in that mapping, and the matching
    English key is returned. Matching is case-insensitive and only accepts
    whole words, so "weekends" is not read as "week" and "Monday" is not read
    as "day".

    Args:
        interval_txt (str): the interval text
    Returns:
        str | None: the lower-case English unit found in the text
            (e.g. "weeks"), or None if the text mentions no known unit
    """

    def _find_unit(units):
        """Return the first of `units` found in the text, lower-cased"""
        # The unit is captured in its own group so the returned value never
        # carries surrounding characters. The lookbehind rejects a preceding
        # letter but allows a digit ("3days"); the lookahead rejects any
        # trailing word character.
        pattern = rf"(?<![^\W\d_])({'|'.join(map(re.escape, units))})(?!\w)"
        found = re.search(pattern, interval_txt, re.IGNORECASE)
        return found.group(1).lower() if found else None

    english_unit = _find_unit(TIME_INTERVAL_MAPPING)
    if english_unit:
        return english_unit
    # Map every alternate spelling back to its English key. A spelling listed
    # under several keys resolves to the last key that lists it.
    reverse_map = {
        interval: k for k, v in TIME_INTERVAL_MAPPING.items() for interval in v
    }
    other_unit = _find_unit(reverse_map)
    if other_unit:
        return reverse_map[other_unit]
    return None


def parse_resource_duration(duration_str: str) -> DurationConfig:
    """
    Parse a free-text duration into a DurationConfig

    The text must start with a number, optionally followed by "to" or "-" and
    a second number, and must mention a time unit that transform_interval
    recognizes. The numbers are converted to weeks with calculate_weeks.

    Args:
        duration_str (str): the raw duration text, e.g. "6 to 10 Weeks"
    Returns:
        DurationConfig: the original text plus min/max weeks when the text
            could be parsed; otherwise only the text ("" when the input is
            empty), with a warning logged for non-empty unparseable input
    """
    if not duration_str:
        return DurationConfig(duration=duration_str or "")

    unit = transform_interval(duration_str)
    numbers = re.match(
        r"(\d+)\s*(to|-)*\s*(\d+)?\s*(\w+)?",
        duration_str.lower().strip(),
        re.IGNORECASE,
    )
    if not (numbers and unit):
        log.warning("Invalid duration: %s", duration_str)
        return DurationConfig(duration=duration_str)

    unit = unit.lower()
    lowest = numbers.group(1)
    # A single number means the minimum and maximum are the same.
    highest = numbers.group(3) or lowest
    return DurationConfig(
        duration=duration_str,
        min_weeks=calculate_weeks(int(lowest), unit),
        max_weeks=calculate_weeks(int(highest), unit),
    )


def parse_resource_commitment(commitment_str: str) -> CommitmentConfig:
    """
    Parse a free-text time commitment into a CommitmentConfig

    The text must start with a number. If a second number follows after any
    non-digit separator (e.g. "3-4 Hours per Week", "5 to 10"), the two are
    used as the minimum and maximum; otherwise the single number is used for
    both. A bare number such as "5" is accepted.

    Args:
        commitment_str (str): the raw time commitment text
    Returns:
        CommitmentConfig: the original text plus min/max weekly hours when the
            text starts with a number; otherwise only the text ("" when the
            input is empty), with a warning logged for non-empty unparseable
            input
    """
    if commitment_str:
        # The separator and second number are optional as a unit, so a value
        # with nothing after the first number still parses.
        # NOTE(review): decimals such as "2.5 hours" are read as the range
        # 2-5 because "." counts as a separator - confirm whether fractional
        # hours can occur in the source data.
        match = re.match(r"(\d+)(?:\D+(\d+))?", commitment_str.strip())
        if match:
            cmin, cmax = match.groups()
            return CommitmentConfig(
                commitment=commitment_str,
                min_weekly_hours=int(cmin),
                max_weekly_hours=int(cmax or cmin),
            )
        log.warning("Invalid commitment: %s", commitment_str)
    return CommitmentConfig(commitment=commitment_str or "")
50 changes: 50 additions & 0 deletions learning_resources/etl/utils_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
RunStatus,
)
from learning_resources.etl import utils
from learning_resources.etl.constants import CommitmentConfig, DurationConfig
from learning_resources.etl.utils import parse_certification, parse_string_to_int
from learning_resources.factories import (
ContentFileFactory,
Expand Down Expand Up @@ -499,3 +500,52 @@ def test_text_from_sjson_content():
def test_parse_string_to_int(hour, expected):
"""Test that the weekly hours are correctly parsed"""
assert parse_string_to_int(hour) == expected


@pytest.mark.parametrize(
    ("raw_value", "min_weeks", "max_weeks"),
    [
        ("3 Days", 1, 1),  # up to 5 days == 1 week
        ("7 Days", 2, 2),  # 6 to 10 days == 2 weeks
        ("4 to 8 Days", 1, 2),
        ("3-4 Weeks, no weekends", 3, 4),
        ("5 - 6 MoNths", 20, 24),
        ("1 WEEK", 1, 1),
        ("1 month more or less", 4, 4),
        ("2-3 meses", 8, 12),  # 2-3 months in Spanish
        ("1 mes", 4, 4),  # 1 month in Spanish
        ("1 semana", 1, 1),  # 1 week in Spanish
        ("2 - 3 semanas", 2, 3),
        ("Unparseable duration", None, None),
        ("", None, None),
        ("2 days in person+3 live webinars", 1, 1),
        ("2 weeks in person+3 live webinars", 2, 2),
    ],
)
def test_parse_resource_duration(raw_value, min_weeks, max_weeks):
    """parse_resource_duration should keep the raw text and derive min/max weeks"""
    expected = DurationConfig(
        duration=raw_value, min_weeks=min_weeks, max_weeks=max_weeks
    )
    parsed = utils.parse_resource_duration(raw_value)
    assert parsed == expected


@pytest.mark.parametrize(
    ("raw_value", "min_hours", "max_hours"),
    [
        ("5 Hours", 5, 5),
        ("3-4 Hours per Week", 3, 4),
        ("15 - 16 Hours per Week", 15, 16),
        ("5-8 hrs per week", 5, 8),
        ("5 - 10", 5, 10),
        ("5 to 10", 5, 10),
        ("6 horas", 6, 6),
        ("1 hour", 1, 1),
        ("1 hora", 1, 1),
        ("", None, None),
    ],
)
def test_parse_resource_commitment(raw_value, min_hours, max_hours):
    """parse_resource_commitment should keep the raw text and derive min/max hours"""
    expected = CommitmentConfig(
        commitment=raw_value,
        min_weekly_hours=min_hours,
        max_weekly_hours=max_hours,
    )
    parsed = utils.parse_resource_commitment(raw_value)
    assert parsed == expected
2 changes: 1 addition & 1 deletion test_json/mitpe/professional_ed_resources_0.json
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
"language": "",
"node_id": "719",
"location": "Online",
"duration": "10 semanas",
"duration": "6 to 10 Weeks",
"continuing_ed_credits": "8",
"run__readable_id": "7192023070620230914"
},
Expand Down

0 comments on commit 36ebf4f

Please sign in to comment.