diff --git a/src/extractors/ExtractorBase.py b/src/extractors/ExtractorBase.py index db294bb..9880210 100644 --- a/src/extractors/ExtractorBase.py +++ b/src/extractors/ExtractorBase.py @@ -1,7 +1,6 @@ import json import random from abc import abstractmethod -from collections import Counter from os import makedirs from os.path import exists from pathlib import Path @@ -43,18 +42,18 @@ def is_multilingual(multi_option_data: ExtractionData) -> bool: return False @staticmethod - def get_train_test_sets( - extraction_data: ExtractionData, seed: int = 22, limit_samples: bool = True - ) -> (ExtractionData, ExtractionData): - if len(extraction_data.samples) < 15: + def get_train_test_sets(extraction_data: ExtractionData, limit_samples: bool = True) -> (ExtractionData, ExtractionData): + if len(extraction_data.samples) < 8: return extraction_data, extraction_data train_size = int(len(extraction_data.samples) * 0.8) - random.seed(seed) - random.shuffle(extraction_data.samples) train_set: list[TrainingSample] = extraction_data.samples[:train_size] - test_set: list[TrainingSample] = extraction_data.samples[train_size:] + + if len(extraction_data.samples) < 15: + test_set: list[TrainingSample] = extraction_data.samples[-10:] + else: + test_set = extraction_data.samples[train_size:] if limit_samples: train_set = train_set[:80] diff --git a/src/extractors/pdf_to_multi_option_extractor/PdfMultiOptionMethod.py b/src/extractors/pdf_to_multi_option_extractor/PdfMultiOptionMethod.py index d20a6af..36dfd98 100644 --- a/src/extractors/pdf_to_multi_option_extractor/PdfMultiOptionMethod.py +++ b/src/extractors/pdf_to_multi_option_extractor/PdfMultiOptionMethod.py @@ -5,6 +5,7 @@ from data.ExtractionIdentifier import ExtractionIdentifier from data.Option import Option from data.ExtractionData import ExtractionData +from data.TrainingSample import TrainingSample from extractors.ExtractorBase import ExtractorBase from extractors.pdf_to_multi_option_extractor.MultiLabelMethod import 
MultiLabelMethod from extractors.pdf_to_multi_option_extractor.FilterSegmentsMethod import FilterSegmentsMethod @@ -19,15 +20,17 @@ def __init__( self.multi_label_method = multi_label_method self.filter_segments_method = filter_segments_method self.extraction_identifier = ExtractionIdentifier(run_name="not set", extraction_name="not set") - self.options: list[Option] = [] + self.options: list[Option] = list() self.multi_value = False self.base_path = "" + self.extraction_data = None def set_parameters(self, multi_option_data: ExtractionData): self.extraction_identifier = multi_option_data.extraction_identifier self.options = multi_option_data.options self.multi_value = multi_option_data.multi_value self.base_path = multi_option_data.extraction_identifier.get_path() + self.extraction_data = multi_option_data def get_name(self): if self.filter_segments_method and self.multi_label_method: @@ -44,7 +47,7 @@ def get_performance(self, multi_option_data: ExtractionData, repetitions: int = scores = list() seeds = [22, 23, 24, 25] for i in range(repetitions): - train_set, test_set = ExtractorBase.get_train_test_sets(multi_option_data, seeds[i]) + train_set, test_set = ExtractorBase.get_train_test_sets(multi_option_data) truth_one_hot = self.one_hot_to_options_list([x.labeled_data.values for x in test_set.samples], self.options) self.train(train_set) @@ -97,6 +100,12 @@ def predict(self, multi_option_data: ExtractionData) -> list[list[Option]]: return predictions + def get_samples_for_context(self, extraction_data: ExtractionData) -> list[TrainingSample]: + if self.extraction_data: + return self.extraction_data.samples + + return extraction_data.samples + def can_be_used(self, multi_option_data: ExtractionData) -> bool: if self.multi_label_method: multi_label = self.multi_label_method(self.extraction_identifier, self.options, self.multi_value) diff --git a/src/extractors/pdf_to_multi_option_extractor/PdfToMultiOptionExtractor.py 
b/src/extractors/pdf_to_multi_option_extractor/PdfToMultiOptionExtractor.py index aa9de96..2d36896 100644 --- a/src/extractors/pdf_to_multi_option_extractor/PdfToMultiOptionExtractor.py +++ b/src/extractors/pdf_to_multi_option_extractor/PdfToMultiOptionExtractor.py @@ -41,6 +41,19 @@ from extractors.pdf_to_multi_option_extractor.multi_option_extraction_methods.FuzzySegmentSelector import ( FuzzySegmentSelector, ) +from extractors.pdf_to_multi_option_extractor.multi_option_extraction_methods.NextWordsTokenSelectorFuzzy75 import ( + NextWordsTokenSelectorFuzzy75, +) + +from extractors.pdf_to_multi_option_extractor.multi_option_extraction_methods.PreviousWordsSentenceSelectorFuzzyCommas import ( + PreviousWordsSentenceSelectorFuzzyCommas, +) +from extractors.pdf_to_multi_option_extractor.multi_option_extraction_methods.PreviousWordsTokenSelectorFuzzy75 import ( + PreviousWordsTokenSelectorFuzzy75, +) +from extractors.pdf_to_multi_option_extractor.multi_option_extraction_methods.SentenceSelectorFuzzyCommas import ( + SentenceSelectorFuzzyCommas, +) from send_logs import send_logs @@ -53,6 +66,10 @@ class PdfToMultiOptionExtractor(ExtractorBase): FuzzyAll75(), FuzzyAll88(), FuzzyAll100(), + PreviousWordsTokenSelectorFuzzy75(), + NextWordsTokenSelectorFuzzy75(), + PreviousWordsSentenceSelectorFuzzyCommas(), + SentenceSelectorFuzzyCommas(), FastSegmentSelectorFuzzy95(), FastSegmentSelectorFuzzyCommas(), FuzzySegmentSelector(), @@ -123,7 +140,7 @@ def get_predictions(self, predictions_samples: list[PredictionSample]) -> (list[ if not self.multi_value: prediction = [x[:1] for x in prediction] - return training_samples, prediction + return method.get_samples_for_context(extraction_data), prediction def load_options(self): if not exists(self.options_path) or not exists(self.multi_value_path): @@ -139,7 +156,7 @@ def get_best_method(self, multi_option_data: ExtractionData) -> PdfMultiOptionMe best_method_instance = self.METHODS[0] best_performance = 0 for method in 
self.METHODS: - performance = self.get_performance(method, multi_option_data) + performance = self.get_method_performance(method, multi_option_data) if performance == 100: send_logs(self.extraction_identifier, f"Best method {method.get_name()} with {performance}%") @@ -152,10 +169,10 @@ def get_best_method(self, multi_option_data: ExtractionData) -> PdfMultiOptionMe send_logs(self.extraction_identifier, f"Best method {best_method_instance.get_name()}") return best_method_instance - def get_performance(self, method, multi_option_data): + def get_method_performance(self, method: PdfMultiOptionMethod, multi_option_data: ExtractionData): method.set_parameters(multi_option_data) - if len(self.METHODS) == 1 or not method.can_be_used(multi_option_data): + if not method.can_be_used(multi_option_data): return 0 send_logs(self.extraction_identifier, f"Checking {method.get_name()}") @@ -164,9 +181,9 @@ def get_performance(self, method, multi_option_data): performance = method.get_performance(multi_option_data) except Exception as e: send_logs(self.extraction_identifier, f"Error checking {method.get_name()}: {e}", Severity.error) - performance = 0 + self.reset_extraction_data(multi_option_data) send_logs(self.extraction_identifier, f"Performance {method.get_name()}: {performance}%") return performance @@ -187,3 +204,9 @@ def can_be_used(self, extraction_data: ExtractionData) -> bool: return True return False + + @staticmethod + def reset_extraction_data(multi_option_data: ExtractionData): + for sample in multi_option_data.samples: + for segment in sample.pdf_data.pdf_data_segments: + segment.ml_label = 0 diff --git a/src/extractors/pdf_to_multi_option_extractor/multi_option_extraction_methods/FastSegmentSelector.py b/src/extractors/pdf_to_multi_option_extractor/multi_option_extraction_methods/FastSegmentSelector.py index 2882ade..4926e95 100644 --- a/src/extractors/pdf_to_multi_option_extractor/multi_option_extraction_methods/FastSegmentSelector.py +++ 
b/src/extractors/pdf_to_multi_option_extractor/multi_option_extraction_methods/FastSegmentSelector.py @@ -1,7 +1,7 @@ import json import os from collections import Counter -from os.path import join +from os.path import join, exists from pathlib import Path import numpy as np @@ -26,19 +26,26 @@ def __init__(self, extraction_identifier: ExtractionIdentifier): self.next_words_path = join(self.fast_segment_selector_path, "next_words.txt") self.model_path = join(self.fast_segment_selector_path, "lightgbm_model.txt") - def get_features(self, segment: PdfDataSegment): + def get_features(self, segment: PdfDataSegment, segments: list[PdfDataSegment]): features = list() text = segment.text_content - index = self.text_segments.index(segment) - previous_segment_text = self.text_segments[index - 1].text_content if index > 0 else "" - next_segment_text = self.text_segments[index + 1].text_content if index + 1 < len(self.text_segments) else "" + if segment in self.text_segments: + index = self.text_segments.index(segment) + previous_segment_texts = self.clean_texts(self.text_segments[index - 1]) if index > 0 else [] + next_segment_texts = ( + self.clean_texts(self.text_segments[index + 1]) if index + 1 < len(self.text_segments) else [] + ) + else: + index = segments.index(segment) + previous_segment_texts = self.clean_texts(segments[index - 1]) if index > 0 else "" + next_segment_texts = self.clean_texts(segments[index + 1]) if index + 1 < len(segments) else "" for word in self.previous_words: - features.append(1 if word in previous_segment_text.lower() else 0) + features.append(1 if word in previous_segment_texts else 0) for word in self.next_words: - features.append(1 if word in next_segment_text.lower() else 0) + features.append(1 if word in next_segment_texts else 0) features.append(len([x for x in text if x == ","]) / len(text) if text else 0) @@ -52,30 +59,33 @@ def get_most_common_words(train_segments): return [x[0] for x in counter.most_common(30)] @staticmethod - def 
get_predictive_common_words(segments): + def clean_texts(pdf_segment: PdfDataSegment) -> list[str]: + clean_letters = [letter for letter in pdf_segment.text_content.lower() if letter.isalnum() or letter == " "] + return "".join(clean_letters).split() + + def save_predictive_common_words(self, segments): most_common_words = FastSegmentSelector.get_most_common_words(segments) counter_previous_segment = Counter() counter_next_segment = Counter() for previous_segment, segment, next_segment in zip(segments, segments[1:], segments[2:]): - if segment.ml_label: - counter_previous_segment.update( - [x for x in previous_segment.text_content.strip().lower().split() if x not in most_common_words] - ) - counter_next_segment.update( - [x for x in next_segment.text_content.strip().lower().split() if x not in most_common_words] - ) - break + if not segment.ml_label: + continue - return ([x[0] for x in counter_previous_segment.most_common(3)], [x[0] for x in counter_next_segment.most_common(3)]) + counter_previous_segment.update([x for x in self.clean_texts(previous_segment) if x not in most_common_words]) + counter_next_segment.update([x for x in self.clean_texts(next_segment) if x not in most_common_words]) + break - def create_model(self, segments: list[PdfDataSegment]): - self.text_segments = [x for x in segments if x.segment_type in self.text_types] - self.previous_words, self.next_words = self.get_predictive_common_words(self.text_segments) + self.previous_words = [x[0] for x in counter_previous_segment.most_common(2)] + self.next_words = [x[0] for x in counter_next_segment.most_common(2)] Path(self.previous_words_path).write_text(json.dumps(self.previous_words)) Path(self.next_words_path).write_text(json.dumps(self.next_words)) + def create_model(self, segments: list[PdfDataSegment]): + self.text_segments = [x for x in segments if x.segment_type in self.text_types] + self.save_predictive_common_words(self.text_segments) + x, y = self.get_x_y(segments) train_data = 
lgb.Dataset(x, y) @@ -88,7 +98,7 @@ def get_x_y(self, segments): y = [] for segment in segments: - x_rows.append(self.get_features(segment)) + x_rows.append(self.get_features(segment, segments)) y.append(segment.ml_label) x_train = np.zeros((len(x_rows), len(x_rows[0]) if x_rows else 0)) @@ -109,9 +119,11 @@ def predict(self, segments): return [segment for i, segment in enumerate(segments) if predictions[i] > 0.5] def load_repeated_words(self): - try: + self.previous_words = [] + self.next_words = [] + + if exists(self.previous_words_path): self.previous_words = json.loads(Path(self.previous_words_path).read_text()) + + if exists(self.next_words_path): self.next_words = json.loads(Path(self.next_words_path).read_text()) - except: - self.previous_words = [] - self.next_words = [] diff --git a/src/extractors/pdf_to_multi_option_extractor/multi_option_extraction_methods/FastSegmentSelectorFuzzy95.py b/src/extractors/pdf_to_multi_option_extractor/multi_option_extraction_methods/FastSegmentSelectorFuzzy95.py index 276424e..5ff9755 100644 --- a/src/extractors/pdf_to_multi_option_extractor/multi_option_extraction_methods/FastSegmentSelectorFuzzy95.py +++ b/src/extractors/pdf_to_multi_option_extractor/multi_option_extraction_methods/FastSegmentSelectorFuzzy95.py @@ -31,7 +31,7 @@ def get_appearances(self, pdf_segment: PdfDataSegment, options: list[str]) -> li if fuzz.partial_ratio(option, pdf_segment.text_content.lower()) >= self.threshold: appearances.append(option) - return list(set(appearances)) + return list(dict.fromkeys(appearances)) def train(self, multi_option_data: ExtractionData): marked_segments = list() @@ -41,18 +41,19 @@ def train(self, multi_option_data: ExtractionData): FastSegmentSelector(self.extraction_identifier).create_model(marked_segments) def predict(self, multi_option_data: ExtractionData) -> list[list[Option]]: - predict_data = self.get_prediction_data(multi_option_data) - return FuzzyAll95().predict(predict_data) + 
self.set_parameters(multi_option_data) + self.extraction_data = self.get_prediction_data(multi_option_data) + return FuzzyAll95().predict(self.extraction_data) - def get_prediction_data(self, multi_option_data): + def get_prediction_data(self, extraction_data: ExtractionData) -> ExtractionData: fast_segment_selector = FastSegmentSelector(self.extraction_identifier) predict_samples = list() - for sample in multi_option_data.samples: + for sample in extraction_data.samples: selected_segments = fast_segment_selector.predict(self.fix_two_pages_segments(sample)) - self.mark_segments_for_context(sample.pdf_data.pdf_data_segments, selected_segments) + self.mark_segments_for_context(selected_segments) - pdf_data = PdfData(None) + pdf_data = PdfData(None, file_name=sample.pdf_data.file_name) pdf_data.pdf_data_segments = selected_segments training_sample = TrainingSample(pdf_data=pdf_data, labeled_data=sample.labeled_data) @@ -60,8 +61,8 @@ def get_prediction_data(self, multi_option_data): return ExtractionData( samples=predict_samples, - options=multi_option_data.options, - multi_value=multi_option_data.multi_value, + options=self.extraction_data.options, + multi_value=self.extraction_data.multi_value, extraction_identifier=self.extraction_identifier, ) @@ -94,7 +95,7 @@ def get_cleaned_options(self, options: list[Option]) -> list[str]: def get_marked_segments(self, training_sample: TrainingSample) -> list[PdfDataSegment]: cleaned_values = self.get_cleaned_options(training_sample.labeled_data.values) - appearances_threshold = math.ceil(len(cleaned_values) * self.threshold / 100) + appearances_threshold = math.ceil(len(cleaned_values) * 0.68) if not appearances_threshold: return training_sample.pdf_data.pdf_data_segments @@ -106,7 +107,6 @@ def get_marked_segments(self, training_sample: TrainingSample) -> list[PdfDataSe if appearances_threshold <= appearances: segment.ml_label = 1 - break return fixed_segments @@ -117,7 +117,6 @@ def fix_two_pages_segments(self, 
training_sample: TrainingSample) -> list[PdfDat merged_segment = None for segment in training_sample.pdf_data.pdf_data_segments: if segment == merged_segment: - fixed_segments.append(segment) merged_segment = None continue @@ -138,19 +137,11 @@ def fix_segment(segment: PdfDataSegment, text_type_segments: list[PdfDataSegment return segment, None segment = deepcopy(segment) - text_type_segments[index + 1] = deepcopy(text_type_segments[index + 1]) - segment.text_content += " " + text_type_segments[index + 1].text_content - text_type_segments[index + 1].text_content = segment.text_content + return segment, text_type_segments[index + 1] @staticmethod - def mark_segments_for_context(all_segments: list[PdfDataSegment], selected_segments: list[PdfDataSegment]): - for segment in all_segments: - for selected_segment in selected_segments: - if segment.page_number != selected_segment.page_number: - continue - - if segment.bounding_box.get_intersection_percentage(selected_segment.bounding_box) > 0.1: - segment.ml_label = 1 - break + def mark_segments_for_context(segments: list[PdfDataSegment]): + for segment in segments: + segment.ml_label = 1 diff --git a/src/extractors/pdf_to_multi_option_extractor/multi_option_extraction_methods/FastSegmentSelectorFuzzyCommas.py b/src/extractors/pdf_to_multi_option_extractor/multi_option_extraction_methods/FastSegmentSelectorFuzzyCommas.py index 9a2d2b6..e611752 100644 --- a/src/extractors/pdf_to_multi_option_extractor/multi_option_extraction_methods/FastSegmentSelectorFuzzyCommas.py +++ b/src/extractors/pdf_to_multi_option_extractor/multi_option_extraction_methods/FastSegmentSelectorFuzzyCommas.py @@ -9,5 +9,10 @@ class FastSegmentSelectorFuzzyCommas(FastSegmentSelectorFuzzy95): def predict(self, multi_option_data: ExtractionData) -> list[list[Option]]: - predict_data = self.get_prediction_data(multi_option_data) - return FuzzyCommas().predict(predict_data) + self.set_parameters(multi_option_data) + self.extraction_data = 
self.get_prediction_data(multi_option_data) + return FuzzyCommas().predict(self.extraction_data) + + def train(self, multi_option_data: ExtractionData): + super().train(multi_option_data) + FuzzyCommas().train(multi_option_data) diff --git a/src/extractors/pdf_to_multi_option_extractor/multi_option_extraction_methods/FuzzyAll100.py b/src/extractors/pdf_to_multi_option_extractor/multi_option_extraction_methods/FuzzyAll100.py index 8bcfdb2..461b213 100644 --- a/src/extractors/pdf_to_multi_option_extractor/multi_option_extraction_methods/FuzzyAll100.py +++ b/src/extractors/pdf_to_multi_option_extractor/multi_option_extraction_methods/FuzzyAll100.py @@ -16,7 +16,7 @@ class FuzzyAll100(PdfMultiOptionMethod): def get_appearances(self, pdf_segments: list[PdfDataSegment], options: list[str]) -> list[str]: appearances = [] for pdf_segment in pdf_segments: - text = pdf_segment.text_content.lower() + text = " ".join(pdf_segment.text_content.lower().split()) for option in options: if len(text) < math.ceil(len(option) * self.threshold / 100): continue @@ -28,7 +28,7 @@ def get_appearances(self, pdf_segments: list[PdfDataSegment], options: list[str] if option in text: text = text.replace(option, "") - return list(set(appearances)) + return list(dict.fromkeys(appearances)) def predict(self, multi_option_data: ExtractionData) -> list[list[Option]]: predictions = list() diff --git a/src/extractors/pdf_to_multi_option_extractor/multi_option_extraction_methods/FuzzyCommas.py b/src/extractors/pdf_to_multi_option_extractor/multi_option_extraction_methods/FuzzyCommas.py index 4e62ac7..759e0bc 100644 --- a/src/extractors/pdf_to_multi_option_extractor/multi_option_extraction_methods/FuzzyCommas.py +++ b/src/extractors/pdf_to_multi_option_extractor/multi_option_extraction_methods/FuzzyCommas.py @@ -1,56 +1,144 @@ +import json +import os import re +from os.path import join +from pathlib import Path +import rapidfuzz from rapidfuzz import fuzz from data.Option import Option from 
data.PdfDataSegment import PdfDataSegment +from data.TrainingSample import TrainingSample from extractors.pdf_to_multi_option_extractor.PdfMultiOptionMethod import PdfMultiOptionMethod from data.ExtractionData import ExtractionData class FuzzyCommas(PdfMultiOptionMethod): - threshold = 92 - def get_appearances(self, pdf_segments: list[PdfDataSegment], options: list[str]) -> list[str]: + def __init__(self): + super().__init__() + self.options_cleaned: list[str] = list() + self.options_cleaned_by_length: list[str] = list() + self.options_cleaned_words_sorted: list[str] = list() + self.options_cleaned_words_sorted_by_length: list[str] = list() + + def get_appearances_for_segments( + self, pdf_segments: list[PdfDataSegment], aliases: dict[str, list[str]] + ) -> tuple[list[str], list[str]]: appearances = [] + not_found_texts = list() for pdf_segment in pdf_segments: - text = pdf_segment.text_content.lower() - texts_separated_by_comma = self.clean_options(re.split(",|:| and ", text)) + text = pdf_segment.text_content + texts_separated_by_comma = self.clean_texts(re.split(",|:| and ", text), False) for one_piece_text in texts_separated_by_comma: - for option in options: - if len(one_piece_text) < len(option) * 0.92 or len(one_piece_text) > len(option) * 1.2: - continue - if fuzz.partial_ratio(option, one_piece_text) >= self.threshold: - pdf_segment.ml_label = 1 - appearances.append(option) - break + appearance = self.get_appearances_one_segment(one_piece_text, aliases) + + if appearance: + pdf_segment.ml_label = 1 + appearances.append(appearance) + else: + not_found_texts.append(one_piece_text) + + return appearances, not_found_texts + + def get_appearances_one_segment(self, text: str, aliases: dict[str, list[str]]) -> str: + for option in self.options_cleaned_words_sorted_by_length: + if len(text) < len(option) * 0.92 or len(text) > len(option) * 1.2: + continue + + if fuzz.partial_ratio(option, self.clean_text(text, True)) >= self.threshold: + return 
self.options_cleaned[self.options_cleaned_words_sorted.index(option)] - return list(set(appearances)) + for option in self.options_cleaned_by_length: + if not aliases or option not in aliases: + continue + + for alias in aliases[option]: + if rapidfuzz.fuzz.ratio(alias, text) > self.threshold: + return option + + return "" @staticmethod - def clean_option(option: str) -> str: - option = option.lower() - option = "".join([letter for letter in option if letter.isalnum() or letter == " "]) - option = " ".join(sorted(option.split())) - return option + def clean_text(text: str, sort: bool) -> str: + text = text.lower() + text = "".join([letter for letter in text if letter.isalnum() or letter == " "]) + + if sort: + text = " ".join(sorted(text.split())) + else: + text = " ".join(text.split()) - def clean_options(self, options: list[str]) -> list[str]: - return list([self.clean_option(option) for option in options]) + return text + + def clean_texts(self, texts: list[str], sort: bool) -> list[str]: + return list([self.clean_text(option, sort) for option in texts]) def predict(self, multi_option_data: ExtractionData) -> list[list[Option]]: + self.set_parameters(multi_option_data) + self.set_options_variants() + + try: + aliases = json.loads(self.get_aliases_path().read_text()) + except FileNotFoundError: + aliases = dict() + predictions = list() - options_labels = self.clean_options([x.label for x in multi_option_data.options]) - clean_options_sorted = list(sorted(options_labels, key=lambda x: len(x), reverse=True)) for multi_option_sample in multi_option_data.samples: pdf_segments: list[PdfDataSegment] = [x for x in multi_option_sample.pdf_data.pdf_data_segments] - predictions_sample = self.get_appearances(pdf_segments, clean_options_sorted) - predictions.append([multi_option_data.options[options_labels.index(x)] for x in predictions_sample]) + predictions_sample, _ = self.get_appearances_for_segments(pdf_segments, aliases) + prediction_options = 
[self.options[self.options_cleaned.index(x)] for x in predictions_sample] + predictions.append(prediction_options) return predictions def train(self, multi_option_data: ExtractionData): - pass + self.set_parameters(multi_option_data) + self.set_options_variants() + + aliases: dict[str, list[str]] = {option: list() for option in self.options_cleaned} + for sample in multi_option_data.samples: + sample_aliases = self.get_aliases(sample) + for option, sample_alias in sample_aliases.items(): + aliases[option] = list(dict.fromkeys(aliases[option] + [sample_alias])) + + self.get_aliases_path().write_text(json.dumps(aliases)) + + def get_aliases_path(self) -> Path: + path = Path(join(self.extraction_identifier.get_path(), "fuzzy_commas")) + + if not path.exists(): + os.makedirs(path, exist_ok=True) + + return Path(join(path, "aliases.json")) + + def get_aliases(self, sample: TrainingSample) -> dict[str, str]: + segments = [segment for segment in sample.pdf_data.pdf_data_segments if segment.ml_label] + appearances, not_found_texts = self.get_appearances_for_segments(segments, dict()) + truth_options = self.clean_texts([option.label for option in sample.labeled_data.values], False) + + not_found_options = [option for option in truth_options if option not in appearances] + return self.find_aliases(not_found_options, not_found_texts) + + @staticmethod + def find_aliases(not_found_options: list[str], not_found_texts: list[str]) -> dict[str, str]: + aliases = dict() + cleaned_texts = [" ".join(text.lower().strip().split()) for text in not_found_texts] + + for option in not_found_options: + for text in cleaned_texts: + if rapidfuzz.fuzz.partial_ratio(option, text) > 80: + aliases[option] = text + + return aliases + + def set_options_variants(self): + self.options_cleaned = self.clean_texts(texts=[x.label for x in self.options], sort=False) + self.options_cleaned_by_length = sorted(self.options_cleaned, key=lambda x: -len(x)) + self.options_cleaned_words_sorted = 
self.clean_texts(texts=[x.label for x in self.options], sort=True) + self.options_cleaned_words_sorted_by_length = sorted(self.options_cleaned_words_sorted, key=lambda x: -len(x)) diff --git a/src/extractors/pdf_to_multi_option_extractor/multi_option_extraction_methods/FuzzyLast.py b/src/extractors/pdf_to_multi_option_extractor/multi_option_extraction_methods/FuzzyLast.py index 9d0f99d..2e97d41 100644 --- a/src/extractors/pdf_to_multi_option_extractor/multi_option_extraction_methods/FuzzyLast.py +++ b/src/extractors/pdf_to_multi_option_extractor/multi_option_extraction_methods/FuzzyLast.py @@ -10,7 +10,7 @@ class FuzzyLast(PdfMultiOptionMethod): @staticmethod - def get_first_appearance(pdf_segments: list[PdfDataSegment], options: list[str]) -> list[str]: + def get_last_appearance(pdf_segments: list[PdfDataSegment], options: list[str]) -> list[str]: for pdf_segment in reversed(pdf_segments): for ratio_threshold in range(100, 69, -10): for option in options: @@ -28,7 +28,7 @@ def predict(self, multi_option_data: ExtractionData) -> list[list[Option]]: options_sorted = list(sorted(options_labels, key=lambda x: len(x), reverse=True)) for multi_option_sample in multi_option_data.samples: pdf_segments: list[PdfDataSegment] = [x for x in multi_option_sample.pdf_data.pdf_data_segments] - prediction = self.get_first_appearance(pdf_segments, options_sorted) + prediction = self.get_last_appearance(pdf_segments, options_sorted) if prediction: predictions.append([multi_option_data.options[options_labels.index(prediction[0])]]) else: diff --git a/src/extractors/pdf_to_multi_option_extractor/multi_option_extraction_methods/FuzzySegmentSelector.py b/src/extractors/pdf_to_multi_option_extractor/multi_option_extraction_methods/FuzzySegmentSelector.py index f93a3ed..3d7eeac 100644 --- a/src/extractors/pdf_to_multi_option_extractor/multi_option_extraction_methods/FuzzySegmentSelector.py +++ 
b/src/extractors/pdf_to_multi_option_extractor/multi_option_extraction_methods/FuzzySegmentSelector.py @@ -26,7 +26,7 @@ def get_appearances(pdf_segment: PdfDataSegment, options: list[str]) -> list[str if fuzz.partial_ratio(option, pdf_segment.text_content.lower()) >= threshold: appearances.append(option) - return list(set(appearances)) + return list(dict.fromkeys(appearances)) def predict(self, multi_option_data: ExtractionData) -> list[list[Option]]: segment_selector = SegmentSelector(self.extraction_identifier) diff --git a/src/extractors/pdf_to_multi_option_extractor/multi_option_extraction_methods/NextWordsSegmentSelector.py b/src/extractors/pdf_to_multi_option_extractor/multi_option_extraction_methods/NextWordsSegmentSelector.py new file mode 100644 index 0000000..c503938 --- /dev/null +++ b/src/extractors/pdf_to_multi_option_extractor/multi_option_extraction_methods/NextWordsSegmentSelector.py @@ -0,0 +1,28 @@ +from data.PdfDataSegment import PdfDataSegment +from extractors.pdf_to_multi_option_extractor.multi_option_extraction_methods.PreviousWordsSegmentSelector import ( + PreviousWordsSegmentSelector, +) + +from rapidfuzz import fuzz + + +class NextWordsSegmentSelector(PreviousWordsSegmentSelector): + def predict(self, segments: list[PdfDataSegment]) -> list[PdfDataSegment]: + self.text_segments = [x for x in segments if x.segment_type in self.text_types] + self.load_repeated_words() + + predicted_segments = [] + for segment in self.text_segments: + + index = self.text_segments.index(segment) + + next_segment_texts = [] + if index < len(self.text_segments) - 1: + next_segment_texts = self.clean_texts(self.text_segments[index + 1]) + + for word in self.next_words: + if fuzz.partial_ratio(word, " ".join(next_segment_texts)) >= 90: + predicted_segments.append(segment) + break + + return predicted_segments diff --git a/src/extractors/pdf_to_multi_option_extractor/multi_option_extraction_methods/NextWordsTokenSelectorFuzzy75.py 
b/src/extractors/pdf_to_multi_option_extractor/multi_option_extraction_methods/NextWordsTokenSelectorFuzzy75.py new file mode 100644 index 0000000..159ec35 --- /dev/null +++ b/src/extractors/pdf_to_multi_option_extractor/multi_option_extraction_methods/NextWordsTokenSelectorFuzzy75.py @@ -0,0 +1,59 @@ +import math + +from data.Option import Option +from data.ExtractionData import ExtractionData +from data.PdfData import PdfData +from data.PdfDataSegment import PdfDataSegment +from data.TrainingSample import TrainingSample +from extractors.pdf_to_multi_option_extractor.multi_option_extraction_methods.FastSegmentSelectorFuzzy95 import ( + FastSegmentSelectorFuzzy95, +) +from extractors.pdf_to_multi_option_extractor.multi_option_extraction_methods.FuzzyAll75 import FuzzyAll75 +from extractors.pdf_to_multi_option_extractor.multi_option_extraction_methods.NextWordsSegmentSelector import ( + NextWordsSegmentSelector, +) + + +class NextWordsTokenSelectorFuzzy75(FastSegmentSelectorFuzzy95): + threshold = 75 + + def predict(self, multi_option_data: ExtractionData) -> list[list[Option]]: + self.set_parameters(multi_option_data) + self.get_token_extraction_data(multi_option_data) + segment_selector = NextWordsSegmentSelector(self.extraction_identifier) + + for sample in self.extraction_data.samples: + sample.pdf_data.pdf_data_segments = segment_selector.predict(sample.pdf_data.pdf_data_segments) + self.mark_segments_for_context(sample.pdf_data.pdf_data_segments) + + return FuzzyAll75().predict(self.extraction_data) + + def train(self, multi_option_data: ExtractionData): + self.set_parameters(multi_option_data) + self.get_token_extraction_data(multi_option_data) + marked_segments = list() + for sample in self.extraction_data.samples: + marked_segments.extend(self.get_marked_segments(sample)) + + NextWordsSegmentSelector(self.extraction_identifier).create_model(marked_segments) + + def get_token_extraction_data(self, extraction_data: ExtractionData): + samples = list() + for 
def predict(self, segments):
    """Return the text segments whose preceding text segment matches a stored word.

    Only segments whose ``segment_type`` is in ``self.text_types`` are
    considered; that filtered list is also stored in ``self.text_segments``.
    A segment is selected when any entry of ``self.previous_words`` reaches a
    ``fuzz.partial_ratio`` of at least 90 against the cleaned text of the text
    segment right before it.  The first text segment has no predecessor and is
    compared against an empty string.

    :param segments: segments exposing ``segment_type``.
    :return: the selected segments, in their original order.
    """
    self.text_segments = [x for x in segments if x.segment_type in self.text_types]
    # NOTE(review): presumably this populates self.previous_words - confirm in FastSegmentSelector.
    self.load_repeated_words()

    predicted_segments = []
    for index, segment in enumerate(self.text_segments):
        # Use the position from enumerate: list.index() returns the first *equal*
        # element, which is O(n) per segment and selects the wrong neighbour
        # whenever a segment is repeated.
        previous_segment_texts = self.clean_texts(self.text_segments[index - 1]) if index > 0 else []
        # Invariant for the word scan below, so it is built once per segment.
        previous_text = " ".join(previous_segment_texts)

        if any(fuzz.partial_ratio(word, previous_text) >= 90 for word in self.previous_words):
            predicted_segments.append(segment)

    return predicted_segments
def get_prediction_data(self, extraction_data: ExtractionData) -> ExtractionData:
    """Build extraction data that keeps only the segments picked by the selector.

    For every sample, the output of ``fix_two_pages_segments`` is filtered with
    ``PreviousWordsSegmentSelector.predict``, the kept segments are passed to
    ``mark_segments_for_context`` and wrapped in a fresh ``PdfData`` that has
    the same file name.  Labeled data is carried over unchanged; options and
    ``multi_value`` are read from ``self.extraction_data``.
    """
    selector = PreviousWordsSegmentSelector(self.extraction_identifier)

    def to_prediction_sample(sample):
        kept_segments = selector.predict(self.fix_two_pages_segments(sample))
        self.mark_segments_for_context(kept_segments)

        reduced_pdf_data = PdfData(None, file_name=sample.pdf_data.file_name)
        reduced_pdf_data.pdf_data_segments = kept_segments
        return TrainingSample(pdf_data=reduced_pdf_data, labeled_data=sample.labeled_data)

    return ExtractionData(
        samples=[to_prediction_sample(sample) for sample in extraction_data.samples],
        options=self.extraction_data.options,
        multi_value=self.extraction_data.multi_value,
        extraction_identifier=self.extraction_identifier,
    )
def train(self, multi_option_data: ExtractionData):
    """Fit the previous-words segment selector on token-level segments.

    ``get_token_extraction_data`` rebuilds the samples into
    ``self.extraction_data``; the segments that ``get_marked_segments`` yields
    for each of those samples are flattened in sample order and passed to
    ``PreviousWordsSegmentSelector.create_model``.
    """
    self.set_parameters(multi_option_data)
    self.get_token_extraction_data(multi_option_data)

    segments_per_sample = (self.get_marked_segments(sample) for sample in self.extraction_data.samples)
    marked_segments = [segment for sample_segments in segments_per_sample for segment in sample_segments]

    PreviousWordsSegmentSelector(self.extraction_identifier).create_model(marked_segments)
def get_sentence_segment_list(self, pdf_data_segments) -> list[tuple[str, "PdfDataSegment"]]:
    """Split the text segments into sentences, each paired with its source segment.

    Segments whose ``segment_type`` is in ``self.text_types`` are merged with
    ``get_segments_merged``; the whitespace-normalised text of every merged
    segment is split on "." and ":" and empty pieces are dropped.  A piece that
    ends with "," is joined (with a space) to the piece that follows it and
    keeps the segment of the entry it was merged into.

    :param pdf_data_segments: segments exposing ``segment_type`` and ``text_content``.
    :return: ``(sentence_text, segment)`` tuples in reading order; an empty
        list when there is no text to split (previously an IndexError).
    """
    text_segments = [segment for segment in pdf_data_segments if segment.segment_type in self.text_types]

    # get_segments_merged indexes the first non-blank segment, so stop here when there is none.
    if not any(segment.text_content.strip() for segment in text_segments):
        return []

    merged_segments = self.get_segments_merged(text_segments)

    sentence_segment_list = []
    for segment in merged_segments:
        segment_text = " ".join(segment.text_content.split())
        sentence_segment_list.extend((text, segment) for text in re.split(r"\.|:", segment_text) if text)

    # Text made only of "." / ":" produces no sentence at all.
    if not sentence_segment_list:
        return []

    sentences_across_pages = [sentence_segment_list[0]]
    for sentence, next_sentence in zip(sentence_segment_list, sentence_segment_list[1:]):
        if sentence[0][-1] == ",":
            # A trailing comma means the sentence continues in the next piece.
            merged_text = " ".join([sentences_across_pages[-1][0], next_sentence[0]])
            sentences_across_pages[-1] = (merged_text, sentences_across_pages[-1][1])
            continue

        sentences_across_pages.append(next_sentence)

    return sentences_across_pages
@staticmethod
def get_sample(sample: TrainingSample, sentence_segment_list: list[(str, PdfDataSegment)]) -> TrainingSample:
    """Create a training sample whose segments are individual sentences.

    Every ``(sentence, segment)`` pair becomes a deep copy of the segment with
    ``text_content`` replaced by the sentence.  The copies are attached to a new
    ``PdfData`` (no pdf features, same file name as ``sample``) and returned in
    a ``TrainingSample`` that reuses the labeled data of ``sample``.
    """

    def clone_with_text(segment, sentence):
        clone = deepcopy(segment)
        clone.text_content = sentence
        return clone

    sentence_segments = [clone_with_text(segment, sentence) for sentence, segment in sentence_segment_list]

    pdf_data_by_sentence = PdfData(pdf_features=None, file_name=sample.pdf_data.file_name)
    pdf_data_by_sentence.pdf_data_segments = sentence_segments
    return TrainingSample(pdf_data=pdf_data_by_sentence, labeled_data=sample.labeled_data)
Kingdom ")] + + pdf_data_1 = PdfData.from_texts( + ["blah, United Kingdom of Great Britain and Northern Ireland , 2 item, item 3, blah"] + ) + + pdf_data_1.pdf_data_segments[0].ml_label = 1 + + samples = [ + TrainingSample(pdf_data_1, LabeledData(values=[options[0]])), + ] + + multi_option_data = ExtractionData( + multi_value=True, options=options, samples=samples, extraction_identifier=extraction_identifier + ) + + FuzzyCommas().train(multi_option_data) + predictions = FuzzyCommas().predict(multi_option_data) + + self.assertEqual(1, len(predictions)) + + self.assertTrue(Option(id="1", label=" United Kingdom ") in predictions[0]) + def test_fast_segment_selector_fuzzy_95(self): extraction_identifier = ExtractionIdentifier(run_name=self.TENANT, extraction_name=self.extraction_id) options = [Option(id="1", label="item 1"), Option(id="2", label="item 2"), Option(id="10", label="item 10")] diff --git a/src/extractors/research_multi_option_extraction/fuzzy_selector_benchmark.py b/src/extractors/research_multi_option_extraction/fuzzy_selector_benchmark.py index 4a7e251..9fd50d4 100644 --- a/src/extractors/research_multi_option_extraction/fuzzy_selector_benchmark.py +++ b/src/extractors/research_multi_option_extraction/fuzzy_selector_benchmark.py @@ -1,4 +1,3 @@ -from pdf_features.Rectangle import Rectangle from pdf_token_type_labels.TaskMistakes import TaskMistakes from sklearn.metrics import f1_score @@ -7,7 +6,7 @@ from extractors.pdf_to_multi_option_extractor.PdfToMultiOptionExtractor import PdfToMultiOptionExtractor from extractors.segment_selector.evaluate_config import PDF_LABELED_DATA_PATH -from pdf_multi_option_classification_benchmark import get_multi_option_benchmark_data +from scripts.pdf_multi_option_classification_benchmark import get_multi_option_benchmark_data def show_mistakes(prediction_samples): diff --git a/src/performance_report.py b/src/performance_report.py new file mode 100644 index 0000000..ba8c07e --- /dev/null +++ b/src/performance_report.py @@ 
# Reference results that performance_report() compares each run against.
# Keys are benchmark task names (the extraction name of each dataset); every
# value is a tuple of (F1 score as a percentage, name of the method recorded
# with that score).
BASE_LINE = {
    "cejil_president": (100.0, "NextWordsTokenSelectorFuzzy75"),
    "cyrilla_keywords": (53.49, "FuzzyFirstCleanLabel"),
    "cejil_date": (20.83, "FuzzyAll88"),
    "cejil_countries": (69.05, "FuzzyFirstCleanLabel"),
    "d4la_document_type": (44.07, "CleanBeginningDotDigits500_SingleLabelSetFit"),
    "cejil_secretary": (80.0, "FuzzyAll75"),
    "countries_in_favor": (99.75, "PreviousWordsSentenceSelectorFuzzyCommas"),
    "cejil_judge": (92.86, "FuzzyLast"),
}
def performance_report():
    """Print, for every benchmark task, how its F1 score compares with ``BASE_LINE``.

    The scores come from ``get_f1_scores_method_names``.  A task whose score is
    below its recorded baseline is flagged as a performance decrease.  A task
    that has no entry in ``BASE_LINE`` is reported without a comparison instead
    of aborting the whole report with a ``KeyError``.
    """
    f1s_method_name = get_f1_scores_method_names()
    # NOTE(review): presumably lets pending log output settle before the report is printed - confirm.
    sleep(1)
    print()
    print("REPORT:")
    print("-------")
    for key, (value, method_name) in f1s_method_name.items():
        base_line = BASE_LINE.get(key)
        if base_line is None:
            # New task folders may exist before a baseline has been recorded for them.
            print(f"{key}: No base performance recorded")
        else:
            base_value, base_method_name = base_line
            if value < base_value:
                print(f"{key}: PERFORMANCE DECREASED!!!!!")
            else:
                print(f"{key}: Good performance")

            print(f"Base performance: {base_value}% with method {base_method_name}")

        print(f"Performance: {value}% with method {method_name}")
        print()
def get_predictions(dataset: ExtractionData) -> (list[list[int]], list[list[int]], str):
    """Train an extractor on the first part of ``dataset`` and predict on the rest.

    The split is positional (this function does not shuffle):

    - training samples: the first half when there are more than 10 samples,
      otherwise the first 10 (that is, all of them);
    - test samples: the samples after the training ones when there are more
      than 20 samples, otherwise every sample, so the training samples are
      then part of the test set as well.

    Returns four values: the ``one_hot_to_options_list`` result for the labeled
    values of the test samples, the same for the predictions, the name of the
    method returned by ``extractor.get_best_method`` and the context samples
    returned by ``extractor.get_predictions``.

    NOTE(review): the return annotation lists three values although four are
    returned - confirm the type of the context samples and update it.
    """
    training_samples_number = int(len(dataset.samples) * 0.5) if len(dataset.samples) > 10 else 10
    training_samples = dataset.samples[:training_samples_number]
    # With 20 samples or fewer the whole dataset is evaluated, training samples included.
    test_samples = dataset.samples[training_samples_number:] if len(dataset.samples) > 20 else dataset.samples

    training_dataset = ExtractionData(
        samples=training_samples,
        options=dataset.options,
        multi_value=dataset.multi_value,
        extraction_identifier=dataset.extraction_identifier,
    )
    extractor = PdfToMultiOptionExtractor(dataset.extraction_identifier)
    extractor.create_model(training_dataset)
    # Labels are dropped here: only the pdf data of each test sample is handed to the extractor.
    prediction_samples = [PredictionSample(pdf_data=sample.pdf_data) for sample in test_samples]
    context_samples, predictions = extractor.get_predictions(prediction_samples)
    values_list = [x.labeled_data.values for x in test_samples]
    truth_one_hot = PdfMultiOptionMethod.one_hot_to_options_list(values_list, dataset.options)
    prediction_one_hot = PdfMultiOptionMethod.one_hot_to_options_list(predictions, dataset.options)
    return truth_one_hot, prediction_one_hot, extractor.get_best_method(training_dataset).get_name(), context_samples
wrong: + print() + print(f"PDF: {sample.pdf_data.file_name}") + print(f"Text: {text}") + print(f"Missing: {missing}") + print(f"Wrong: {wrong}") + mistakes += 1 + else: + correct += 1 + + print(f"\n\nCorrect predictions for: {correct} PDFs") + print(f"Incorrect predictions for {mistakes} PDFs") + + return f1s_method_name + + +if __name__ == "__main__": + start = time() + print("start") + performance_report() + # get_mistakes() + print("time", round(time() - start, 2), "s") diff --git a/src/cache_pdf_data.py b/src/scripts/cache_pdf_data.py similarity index 95% rename from src/cache_pdf_data.py rename to src/scripts/cache_pdf_data.py index 94c1301..8b6486a 100644 --- a/src/cache_pdf_data.py +++ b/src/scripts/cache_pdf_data.py @@ -14,7 +14,7 @@ from config import ROOT_PATH from data.PdfData import PdfData -from pdf_multi_option_classification_benchmark import get_task_pdf_names, PDF_DATA_FOLDER_PATH +from performance_report import get_task_pdf_names, PDF_DATA_FOLDER_PATH LABELED_DATA_PDFS_PATH = join(ROOT_PATH.parent, "pdf-labeled-data", "pdfs") diff --git a/src/check_performance.py b/src/scripts/check_performance.py similarity index 100% rename from src/check_performance.py rename to src/scripts/check_performance.py diff --git a/src/paragraph_selector_benchmark.py b/src/scripts/paragraph_selector_benchmark.py similarity index 100% rename from src/paragraph_selector_benchmark.py rename to src/scripts/paragraph_selector_benchmark.py diff --git a/src/pdf_multi_option_classification_benchmark.py b/src/scripts/pdf_multi_option_classification_benchmark.py similarity index 99% rename from src/pdf_multi_option_classification_benchmark.py rename to src/scripts/pdf_multi_option_classification_benchmark.py index 6776295..4e34bf8 100644 --- a/src/pdf_multi_option_classification_benchmark.py +++ b/src/scripts/pdf_multi_option_classification_benchmark.py @@ -162,7 +162,7 @@ def get_multi_option_extractor_benchmark(): for extraction_data in extractions_data: start = time() 
extractor = PdfToMultiOptionExtractor(extraction_identifier=extraction_data.extraction_identifier) - train_set, test_set = ExtractorBase.get_train_test_sets(extraction_data, 23) + train_set, test_set = ExtractorBase.get_train_test_sets(extraction_data) values_list = [x.labeled_data.values for x in test_set.samples] truth_one_hot = PdfMultiOptionMethod.one_hot_to_options_list(values_list, extraction_data.options) extractor.create_model(train_set) @@ -190,7 +190,7 @@ def check_results(): extractor = PdfToMultiOptionExtractor(extraction_identifier=extraction_data.extraction_identifier) print(f"Calculating {extractor.extraction_identifier} {extractor.get_name()}") - train_set, test_set = ExtractorBase.get_train_test_sets(extraction_data, 23) + train_set, test_set = ExtractorBase.get_train_test_sets(extraction_data) labels = [x.labeled_data.values for x in test_set.samples] test_data = [PredictionSample(pdf_data=x.pdf_data) for x in test_set.samples] suggestions = extractor.get_suggestions(test_data) diff --git a/src/text_to_multi_option_benchmark.py b/src/text_to_multi_option_benchmark.py index 6729f25..8c2fc21 100644 --- a/src/text_to_multi_option_benchmark.py +++ b/src/text_to_multi_option_benchmark.py @@ -70,7 +70,7 @@ def get_benchmark(): for extraction_data in extractions_data: start = time() extractor = TextToMultiOptionExtractor(extraction_identifier=extraction_data.extraction_identifier) - train_set, test_set = ExtractorBase.get_train_test_sets(extraction_data, 22, limit_samples=False) + train_set, test_set = ExtractorBase.get_train_test_sets(extraction_data, limit_samples=False) values_list = [x.labeled_data.values for x in test_set.samples] truth_one_hot = PdfMultiOptionMethod.one_hot_to_options_list(values_list, extraction_data.options) extractor.create_model(train_set) @@ -105,7 +105,7 @@ def check_results(): extractions_data: list[ExtractionData] = get_extraction_data(filter_by=[""]) for extraction_data in extractions_data: extractor = 
TextToMultiOptionExtractor(extraction_identifier=extraction_data.extraction_identifier) - train_set, test_set = ExtractorBase.get_train_test_sets(extraction_data, 22, limit_samples=False) + train_set, test_set = ExtractorBase.get_train_test_sets(extraction_data, limit_samples=False) test_data = [ PredictionSample(tags_texts=x.tags_texts, entity_name=x.labeled_data.entity_name) for x in test_set.samples ]