diff --git a/src/parse/parse.ts b/src/parse/parse.ts index 3841286..3c354d6 100644 --- a/src/parse/parse.ts +++ b/src/parse/parse.ts @@ -1,11 +1,11 @@ import nearly from "nearley"; import type { ParsedCatalogEntry } from "./types"; -import { HRowType, HSectionType } from "@/tokenize"; -import type { - HRow, - HSection, - TextRow, - TokenizedCatalogEntry, +import { HRowType, HSectionType, ConcentrationLeadingHeaderExceptionValue, ConcentrationTrailingHeaderExceptionValue } from "@/tokenize"; +import { + type HRow, + type HSection, + type TextRow, + type TokenizedCatalogEntry, } from "@/tokenize"; import { writeFile } from "fs/promises"; import { FileName } from "@/classify"; @@ -125,10 +125,57 @@ export const parseTokens = (sections: HSection[]) => { .filter(metaSection => metaSection.type === HSectionType.CONCENTRATION) .map(metaSection => { metaSection.entries = metaSection.entries.filter( - row => - row.type !== HRowType.COMMENT && row.type !== HRowType.SUBSUBHEADER, + row => row.type !== HRowType.SUBSUBHEADER ); + metaSection.entries = metaSection.entries.flatMap((row, index) => { + console.log("NEW KOBE READING ROW") + console.log(row) + // if this row is a comment and the previous row is an exception elective header, + // then this row is probably a comment that is meant to be a X_OF_MANY row + if ((row.type == HRowType.COMMENT || row.type == HRowType.SECTION_INFO) && index > 0) { + if (row.description.startsWith("If")) { + // special case introduced by "Concentration in Campaigns and Elections" in the following major + // https://catalog.northeastern.edu/archive/2021-2022/undergraduate/arts-media-design/journalism/journalism-political-science-ba/#programrequirementstext + return []; + } + const prevRow = metaSection.entries[index - 1]!; + console.log("KOBE CHECKING PREVIOUS ROW") + console.log(prevRow) + if (prevRow.type == HRowType.HEADER && isConcentrationExceptionValue(prevRow.description)) { + console.log("CONVERTING TO X_OF_MANY") + return [ + { + type: 
HRowType.X_OF_MANY, + description: row.description, + hour: row.hour, + }, + ]; + } else { + return []; + } + } + + // if this row is a header and the 'Required Courses' exception type, + // then the description of the section should be used to identify the concentration section + // otherwise, remove the 'Electives' exception type + if (row.type == HRowType.HEADER && isConcentrationExceptionValue(row.description)) { + if (isConcentrationLeadingHeaderExceptionValue(row.description) && index == 0) { + console.log("KOBE CONVERTING TO CONCENTRATION " + metaSection.description) + return [ + { + ...row, + description: metaSection.description, + } + ]; + } else { + console.log("KOBE REMOVING EXCEPTION") + return [] + } + } + return row; + }); + if ( metaSection.entries.length >= 1 && metaSection.entries[0]?.type != HRowType.HEADER @@ -150,3 +197,33 @@ export const parseTokens = (sections: HSection[]) => { concentrations, }; }; + +/** + * Checks if the text is a concentration exception type. + * https://www.geeksforgeeks.org/what-is-type-predicates-in-typescript/ + */ +/* +function isConcentrationExceptionValue(value: string): value is ConcentrationValueExceptionType { + return Object.values(ConcentrationValueExceptionType).includes(value as ConcentrationValueExceptionType); +} +*/ +function isConcentrationExceptionValue( + value: string +): value is (ConcentrationLeadingHeaderExceptionValue | ConcentrationTrailingHeaderExceptionValue) { + return ( + Object.values(ConcentrationLeadingHeaderExceptionValue).includes( + value as ConcentrationLeadingHeaderExceptionValue + ) || + Object.values(ConcentrationTrailingHeaderExceptionValue).includes( + value as ConcentrationTrailingHeaderExceptionValue + ) + ); +} + +function isConcentrationLeadingHeaderExceptionValue(value: string): value is ConcentrationLeadingHeaderExceptionValue { + return Object.values(ConcentrationLeadingHeaderExceptionValue).includes(value as ConcentrationLeadingHeaderExceptionValue); +} + +function 
isConcentrationTrailingHeaderExceptionValue(value: string): value is ConcentrationTrailingHeaderExceptionValue { + return Object.values(ConcentrationTrailingHeaderExceptionValue).includes(value as ConcentrationTrailingHeaderExceptionValue); +} \ No newline at end of file diff --git a/src/tokenize/tokenize.ts b/src/tokenize/tokenize.ts index fc52261..3cec228 100644 --- a/src/tokenize/tokenize.ts +++ b/src/tokenize/tokenize.ts @@ -375,6 +375,8 @@ const getRowType = ( throw Error(`td class was not "codecol": "${tdClasses}"`); } + const tdText = parseText(td); + if (trClasses.has("subheader")) { const isSubSubHeader = $(tr).find("span").hasClass("commentindent"); if (isSubSubHeader) { @@ -385,7 +387,6 @@ const getRowType = ( return HRowType.HEADER; } - const tdText = parseText(td); // Different range types if ( RANGE_LOWER_BOUNDED_MAYBE_EXCEPTIONS_1.test(tdText) || diff --git a/src/tokenize/types.ts b/src/tokenize/types.ts index dbb5cdf..c1fa48c 100644 --- a/src/tokenize/types.ts +++ b/src/tokenize/types.ts @@ -152,3 +152,44 @@ export type TokenizedCatalogEntry = TypedCatalogEntry & { programRequiredHours: number; sections: HSection[]; }; + +/** + * Enumerations specifying the headers that lead to a common concentration name issue. + * Typically, the headers within the requirement tables specify the requirement section name. + * However, in some cases, the headers of concentration sections specify the concentration requirements + * instead of the concentration name. This causes issues with concentration names becoming the requirement section name. + * This type is used to identify concentration header issues, and provide separate tokenization and parsing logic. 
+ * + * Read more here: + * https://www.notion.so/sandboxnu/Concentration-Issue-1a118273b1f4806da9e9fa99c9ca9a27?pvs=4 + */ +export enum ConcentrationExceptionValue { + ELECTIVES = "Electives", + REQUIRED_COURSES = "Required Courses", +} + +/** + * Leading headers are identified to be headers that may need to be replaced by the concentration name. + */ +export enum ConcentrationLeadingHeaderExceptionValue { + REQUIRED_COURSES = "Required Courses", + // caused by: https://catalog.northeastern.edu/archive/2021-2022/undergraduate/arts-media-design/journalism/journalism-political-science-ba/#programrequirementstext + THEORETICAL_REQUIREMENTS = "Theoretical Requirement", + CORE_COURSE = "Core Course", + EXPERIENTIAL_REQUIREMENT = "Experiential/Practicum Requirement", + CORE_REQUIREMENT = "Core Requirement", + +} + +/** + * Trailing headers are identified to be headers that are removed from the concentration section instead of being replaced by the concentration name. + */ +export enum ConcentrationTrailingHeaderExceptionValue { + ELECTIVES = "Electives", + // caused by: https://catalog.northeastern.edu/archive/2021-2022/undergraduate/arts-media-design/journalism/journalism-political-science-ba/#programrequirementstext + CAMPAIGNS_AND_ELECTIONS_ELECTIVES = "Campaigns and Elections Electives", + REGIONAL_REQUIREMENTS = "Regional Requirements", + EXPERIENTIAL_REQUIREMENT = "Experiential/Practicum Requirement", + CORE_COURSE = "Core Courses", +} +