diff --git a/Makefile b/Makefile
index 27dcbba..43322fc 100644
--- a/Makefile
+++ b/Makefile
@@ -28,6 +28,10 @@ start:
 	docker compose -f local-docker-compose.yml up --attach pdf_metadata_extraction_worker --attach pdf_metadata_extraction_api --build
 
 start_gpu:
+	docker compose -f gpu-docker-compose.yml up --attach pdf_metadata_extraction_worker --attach pdf_metadata_extraction_api --build
+
+
+start_local_gpu:
 	docker compose -f local-gpu-docker-compose.yml up --attach pdf_metadata_extraction_worker --attach pdf_metadata_extraction_api --build
 
 start_detached:
diff --git a/docker-compose.yml b/docker-compose.yml
index 58a38b3..e82a7fe 100755
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -2,7 +2,7 @@ version: "3.8"
 services:
   pdf_metadata_extraction_api:
     container_name: pdf_metadata_extraction_api
-    entrypoint: [ "gunicorn", "-k", "uvicorn.workers.UvicornWorker", "--chdir", "./src", "app:app", "--bind", "0.0.0.0:5056" ]
+    entrypoint: [ "gunicorn", "-k", "uvicorn.workers.UvicornWorker", "--chdir", "./src", "app:app", "--bind", "0.0.0.0:5056", "--timeout", "300" ]
     init: true
     restart: unless-stopped
     build:
diff --git a/gpu-docker-compose.yml b/gpu-docker-compose.yml
index 8c5096e..cb7a01a 100755
--- a/gpu-docker-compose.yml
+++ b/gpu-docker-compose.yml
@@ -2,7 +2,7 @@ version: "3.8"
 services:
   pdf_metadata_extraction_api:
     container_name: pdf_metadata_extraction_api
-    entrypoint: [ "gunicorn", "-k", "uvicorn.workers.UvicornWorker", "--chdir", "./src", "app:app", "--bind", "0.0.0.0:5056" ]
+    entrypoint: [ "gunicorn", "-k", "uvicorn.workers.UvicornWorker", "--chdir", "./src", "app:app", "--bind", "0.0.0.0:5056", "--timeout", "300" ]
     init: true
     restart: unless-stopped
     build:
diff --git a/local-docker-compose.yml b/local-docker-compose.yml
index 0ccf512..0f1d3bb 100755
--- a/local-docker-compose.yml
+++ b/local-docker-compose.yml
@@ -2,7 +2,7 @@ version: "3.8"
 services:
   pdf_metadata_extraction_api:
     container_name: pdf_metadata_extraction_api
-    entrypoint: [ "gunicorn", "-k", "uvicorn.workers.UvicornWorker", "--chdir", "./src", "app:app", "--bind", "0.0.0.0:5056" ]
+    entrypoint: [ "gunicorn", "-k", "uvicorn.workers.UvicornWorker", "--chdir", "./src", "app:app", "--bind", "0.0.0.0:5056", "--timeout", "300" ]
     init: true
     restart: unless-stopped
     build:
diff --git a/local-gpu-docker-compose.yml b/local-gpu-docker-compose.yml
index 1acef4a..14a1353 100755
--- a/local-gpu-docker-compose.yml
+++ b/local-gpu-docker-compose.yml
@@ -2,7 +2,7 @@ version: "3.8"
 services:
   pdf_metadata_extraction_api:
     container_name: pdf_metadata_extraction_api
-    entrypoint: [ "gunicorn", "-k", "uvicorn.workers.UvicornWorker", "--chdir", "./src", "app:app", "--bind", "0.0.0.0:5056" ]
+    entrypoint: [ "gunicorn", "-k", "uvicorn.workers.UvicornWorker", "--chdir", "./src", "app:app", "--bind", "0.0.0.0:5056", "--timeout", "300" ]
     init: true
     restart: unless-stopped
     build:
diff --git a/performance_results/paragraph_selector_2023_10_13_16_09.md b/performance_results/paragraph_selector_2023_10_13_16_09.md
deleted file mode 100644
index faa780b..0000000
--- a/performance_results/paragraph_selector_2023_10_13_16_09.md
+++ /dev/null
@@ -1,18 +0,0 @@
-                             ╷                ╷        ╷       ╷      ╷
-                             │                │  Train │  Test │      │
- Dataset                     │ Method         │   size │  size │ Time │ Acc.
- ════════════════════════════╪════════════════╪════════╪═══════╪══════╪══════
- title                       │ avoiding_words │     15 │    11 │   2s │  83%
- decides                     │ avoiding_words │     82 │    55 │ 115s │  86%
- signatories                 │ avoiding_words │     82 │    55 │   9s │  96%
- first paragraph having seen │ avoiding_words │     82 │    55 │  21s │  96%
- secretary                   │ avoiding_words │     82 │    55 │   5s │  96%
- president                   │ avoiding_words │     82 │    55 │   5s │  97%
- date                        │ avoiding_words │     82 │    55 │   8s │  97%
- plan many date              │ avoiding_words │     95 │    64 │  10s │  98%
- plan many title             │ avoiding_words │     95 │    64 │  12s │  99%
- semantic president          │ avoiding_words │    150 │   100 │ 574s │  77%
- Average                     │                │      0 │     0 │   0s │  93%
-                             │                │        │       │      │
- average                     │                │        │       │      │  93%
-                             ╵                ╵        ╵       ╵      ╵
diff --git a/requirements.txt b/requirements.txt
index 5fe0175..a279be9 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -20,7 +20,7 @@ pandas==2.1.1
 dateparser==1.1.8
 langcodes==3.3.0
 nltk==3.8.1
-transformers==4.34.0
+transformers==4.34.1
 httpx==0.25.0
 sentencepiece==0.1.99
 accelerate==0.23.0
diff --git a/src/evaluate_paragraph_selector_methods.py b/src/evaluate_paragraph_selector_methods.py
index 1d78e02..589e71c 100644
--- a/src/evaluate_paragraph_selector_methods.py
+++ b/src/evaluate_paragraph_selector_methods.py
@@ -20,7 +20,7 @@ from metadata_extraction.PdfSegments import PdfSegments
 from performance.Results import Results
 from segment_selector.Paragraphs import Paragraphs
 
-from segment_selector.evaluate_config import SIZES, SEED, LABELED_DATA_TO_USE, METHODS_TO_EXECUTE
+from segment_selector.evaluate_config import SIZES, SEED, LABELED_DATA_TO_USE, METHODS_TO_EXECUTE, PDF_LABELED_DATA_PATH
 
 
 RANDOM_SEED = 42
@@ -68,6 +68,7 @@ def load_pdf_segments(task: str, pdf_name: str) -> PdfSegments:
     pdfs_path = join(labeled_data_root_path, "pdfs")
 
     pdf_features = PdfFeatures.from_poppler_etree(join(pdfs_path, pdf_name, "etree.xml"))
+    pdf_features.file_name = pdf_name
 
     pdf_path = join(pdfs_path, pdf_name, "document.pdf")
     segmentation_data: SegmentationData = get_segmentation_data(pdf_path, pdf_name)
@@ -115,7 +116,27 @@ def snake_case_to_pascal_case(name: str):
     return "".join(word.title() for word in name.split("_"))
 
 
-def run_one_method(method_name, task, training_pdfs_segments, testing_pdfs_segments, results):
+def save_mistakes(method_name: str, task: str, testing_pdfs_segments: list[PdfSegments], predictions_binary: list[int]):
+    prediction_index = 0
+    for pdf_segments in testing_pdfs_segments:
+        y_true = [segment.ml_label for segment in pdf_segments.pdf_segments]
+        pdf_segments_predictions = predictions_binary[prediction_index : prediction_index + len(y_true)]
+        prediction_index += len(y_true)
+
+        task_mistakes = TaskMistakes(PDF_LABELED_DATA_PATH, task + "_" + method_name, pdf_segments.pdf_features.file_name)
+        for segment, truth, prediction in zip(pdf_segments.pdf_segments, y_true, pdf_segments_predictions):
+            task_mistakes.add_label(segment.bounding_box, truth, prediction)
+
+        task_mistakes.save()
+
+
+def run_one_method(
+    method_name: str,
+    task: str,
+    training_pdfs_segments: list[PdfSegments],
+    testing_pdfs_segments: list[PdfSegments],
+    results: Results,
+):
     results.set_start_time()
     method_class_name = snake_case_to_pascal_case(method_name)
     import_from = f"segment_selector.methods.{method_name}.{method_class_name}"
@@ -133,6 +154,9 @@ def run_one_method(method_name, task, training_pdfs_segme
 
     y_true = [x.ml_label for test in testing_pdfs_segments for x in test.pdf_segments]
     prediction_binary = [1 if prediction > 0.5 else 0 for prediction in predictions]
+
+    save_mistakes(method_name, task, testing_pdfs_segments, prediction_binary)
+
     f1 = round(100 * f1_score(y_true, prediction_binary), 2)
 
     results.save_result(
@@ -154,24 +178,25 @@ def evaluate_methods():
     f1s = list()
     for size, seed, task in get_loop_values():
         training_pdfs_segments, testing_pdfs_segments = load_training_testing_data(task, seed)
-        training_pdfs_segments = training_pdfs_segments[:size]
-
-        print(
-            f"\n\nevaluating time:{datetime.now():%Y/%m/%d %H:%M} size:{size} seed:{seed} task:{task} method:{method_name}"
-        )
-        f1 = run_one_method(method_name, task, training_pdfs_segments, testing_pdfs_segments, results)
-        f1s.append(f1)
-
-    results.set_start_time()
-    results.save_result(
-        dataset="Average",
-        method="",
-        accuracy=round(sum(f1s) / len(f1s), 2),
-        train_length=0,
-        test_length=0,
-    )
-
-    results.write_results()
+        save_mistakes(task, testing_pdfs_segments, list())
+        # training_pdfs_segments = training_pdfs_segments[:size]
+        #
+        # print(
+        #     f"\n\nevaluating time:{datetime.now():%Y/%m/%d %H:%M} size:{size} seed:{seed} task:{task} method:{method_name}"
+        # )
+        # f1 = run_one_method(method_name, task, training_pdfs_segments, testing_pdfs_segments, results)
+        # f1s.append(f1)
+        #
+        # results.set_start_time()
+        # results.save_result(
+        #     dataset="Average",
+        #     method="",
+        #     accuracy=round(sum(f1s) / len(f1s), 2),
+        #     train_length=0,
+        #     test_length=0,
+        # )
+        #
+        # results.write_results()
 
 
 if __name__ == "__main__":
diff --git a/src/segment_selector/evaluate_config.py b/src/segment_selector/evaluate_config.py
index 8ee4282..9c73d20 100644
--- a/src/segment_selector/evaluate_config.py
+++ b/src/segment_selector/evaluate_config.py
@@ -1,3 +1,7 @@
+from os.path import join
+
+from config import ROOT_PATH
+
 METHODS_TO_EXECUTE = ["avoiding_words"]
 
 LABELED_DATA_TO_USE = [
@@ -15,3 +19,5 @@ SIZES = [3000]
 
 SEED = 42
+
+PDF_LABELED_DATA_PATH = join(ROOT_PATH.parent, "pdf-labeled-data")
diff --git a/src/semantic_metadata_extraction/methods/MT5TrueCaseEnglishSpanishMethod.py b/src/semantic_metadata_extraction/methods/MT5TrueCaseEnglishSpanishMethod.py
index eb5186a..07da3eb 100644
--- a/src/semantic_metadata_extraction/methods/MT5TrueCaseEnglishSpanishMethod.py
+++ b/src/semantic_metadata_extraction/methods/MT5TrueCaseEnglishSpanishMethod.py
@@ -11,7 +11,7 @@ import pandas as pd
 import csv
 
 from transformers.utils import logging as logging_hf
-from transformers import AutoTokenizer, MT5Tokenizer, MT5ForConditionalGeneration
+from transformers import AutoTokenizer, MT5ForConditionalGeneration
 
 from config import DATA_PATH, config_logger
 from data.PdfTagData import PdfTagData
@@ -161,7 +161,7 @@ def predict(self, semantic_predictions_data: list[SemanticPredictionData]) -> li
             return texts
 
         predictions = list()
-        tokenizer = MT5Tokenizer.from_pretrained("HURIDOCS/mt5-small-spanish-es")
+        tokenizer = AutoTokenizer.from_pretrained("HURIDOCS/mt5-small-spanish-es")
         model = MT5ForConditionalGeneration.from_pretrained(self.get_model_path())
         device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
         model.to(device)
diff --git a/src/semantic_metadata_extraction/methods/run_seq_2_seq.py b/src/semantic_metadata_extraction/methods/run_seq_2_seq.py
index ca77c07..fa76847 100644
--- a/src/semantic_metadata_extraction/methods/run_seq_2_seq.py
+++ b/src/semantic_metadata_extraction/methods/run_seq_2_seq.py
@@ -589,6 +589,8 @@ def post_processing_function(
         preds = outputs if isinstance(outputs, (np.ndarray, np.generic)) else outputs.predictions
         if isinstance(preds, tuple):
             preds = preds[0]
+
+        preds[preds == -100] = 0
         decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
 
         # Build a map example to its corresponding features.
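
Note on the final run_seq_2_seq.py hunk: -100 is the ignore index used to mask padded positions in label tensors, so prediction arrays produced during evaluation can still contain it, and tokenizer.batch_decode cannot look up a negative token id. Replacing -100 before decoding is the usual fix; writing 0, as the hunk does, is equivalent to writing the pad token id for T5/MT5 tokenizers, whose pad id is 0. The snippet below is a minimal illustrative sketch of that behaviour, not part of the change; the checkpoint name "google/mt5-small" is only an example stand-in for the project's own model.

    # Illustrative sketch: why -100 must be replaced before batch_decode.
    import numpy as np
    from transformers import AutoTokenizer

    # Example checkpoint, not the project's fine-tuned model.
    tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")

    # A fake prediction row padded with -100, as happens when predictions share
    # the label tensor's ignore index.
    preds = np.array([[259, 336, 1, -100, -100]])

    # Decoding the raw array would fail or misbehave on the negative ids, so map
    # them to the pad token first; pad_token_id is 0 for T5/MT5, which is why
    # "preds[preds == -100] = 0" in the hunk has the same effect.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    print(tokenizer.batch_decode(preds, skip_special_tokens=True))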