Merge pull request #73 from huridocs/fix-tokenizer
Fix tokenizer
gabriel-piles authored Oct 19, 2023
2 parents bae1ff6 + c246191 commit def2a74
Showing 11 changed files with 64 additions and 45 deletions.
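At a glance: the tokenizer fix swaps the hard-coded MT5Tokenizer for AutoTokenizer and bumps transformers from 4.34.0 to 4.34.1; seq2seq predictions are sanitized (-100 values replaced) before decoding; the Gunicorn entrypoint in all four compose files gains --timeout 300; the Makefile gets a start_local_gpu target; the paragraph-selector evaluation script gains a save_mistakes helper backed by a new PDF_LABELED_DATA_PATH config value; and a stale performance-results file is deleted.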
4 changes: 4 additions & 0 deletions Makefile
@@ -28,6 +28,10 @@ start:
 	docker compose -f local-docker-compose.yml up --attach pdf_metadata_extraction_worker --attach pdf_metadata_extraction_api --build
 
 start_gpu:
 	docker compose -f gpu-docker-compose.yml up --attach pdf_metadata_extraction_worker --attach pdf_metadata_extraction_api --build
 
+
+start_local_gpu:
+	docker compose -f local-gpu-docker-compose.yml up --attach pdf_metadata_extraction_worker --attach pdf_metadata_extraction_api --build
+
 start_detached:
2 changes: 1 addition & 1 deletion docker-compose.yml
@@ -2,7 +2,7 @@ version: "3.8"
 services:
   pdf_metadata_extraction_api:
     container_name: pdf_metadata_extraction_api
-    entrypoint: [ "gunicorn", "-k", "uvicorn.workers.UvicornWorker", "--chdir", "./src", "app:app", "--bind", "0.0.0.0:5056" ]
+    entrypoint: [ "gunicorn", "-k", "uvicorn.workers.UvicornWorker", "--chdir", "./src", "app:app", "--bind", "0.0.0.0:5056", "--timeout", "300" ]
     init: true
     restart: unless-stopped
     build:
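Gunicorn restarts any worker that stays silent for longer than --timeout seconds, and the default is 30. Model loading and long extraction requests can easily exceed that, so the API entrypoint now allows 300 seconds. The same flag is added to the gpu, local, and local-gpu compose files below.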
2 changes: 1 addition & 1 deletion gpu-docker-compose.yml
@@ -2,7 +2,7 @@ version: "3.8"
 services:
   pdf_metadata_extraction_api:
     container_name: pdf_metadata_extraction_api
-    entrypoint: [ "gunicorn", "-k", "uvicorn.workers.UvicornWorker", "--chdir", "./src", "app:app", "--bind", "0.0.0.0:5056" ]
+    entrypoint: [ "gunicorn", "-k", "uvicorn.workers.UvicornWorker", "--chdir", "./src", "app:app", "--bind", "0.0.0.0:5056", "--timeout", "300" ]
     init: true
     restart: unless-stopped
     build:
2 changes: 1 addition & 1 deletion local-docker-compose.yml
@@ -2,7 +2,7 @@ version: "3.8"
 services:
   pdf_metadata_extraction_api:
     container_name: pdf_metadata_extraction_api
-    entrypoint: [ "gunicorn", "-k", "uvicorn.workers.UvicornWorker", "--chdir", "./src", "app:app", "--bind", "0.0.0.0:5056" ]
+    entrypoint: [ "gunicorn", "-k", "uvicorn.workers.UvicornWorker", "--chdir", "./src", "app:app", "--bind", "0.0.0.0:5056", "--timeout", "300" ]
     init: true
     restart: unless-stopped
     build:
2 changes: 1 addition & 1 deletion local-gpu-docker-compose.yml
@@ -2,7 +2,7 @@ version: "3.8"
 services:
   pdf_metadata_extraction_api:
     container_name: pdf_metadata_extraction_api
-    entrypoint: [ "gunicorn", "-k", "uvicorn.workers.UvicornWorker", "--chdir", "./src", "app:app", "--bind", "0.0.0.0:5056" ]
+    entrypoint: [ "gunicorn", "-k", "uvicorn.workers.UvicornWorker", "--chdir", "./src", "app:app", "--bind", "0.0.0.0:5056", "--timeout", "300" ]
     init: true
     restart: unless-stopped
     build:
18 changes: 0 additions & 18 deletions performance_results/paragraph_selector_2023_10_13_16_09.md

This file was deleted.

2 changes: 1 addition & 1 deletion requirements.txt
@@ -20,7 +20,7 @@ pandas==2.1.1
 dateparser==1.1.8
 langcodes==3.3.0
 nltk==3.8.1
-transformers==4.34.0
+transformers==4.34.1
 httpx==0.25.0
 sentencepiece==0.1.99
 accelerate==0.23.0
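The transformers pin moves up one patch release. 4.34.1 is a small bug-fix release; the bump accompanies the tokenizer changes below and presumably picks up fixes that the new AutoTokenizer loading path relies on.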
65 changes: 45 additions & 20 deletions src/evaluate_paragraph_selector_methods.py
@@ -20,7 +20,7 @@
 from metadata_extraction.PdfSegments import PdfSegments
 from performance.Results import Results
 from segment_selector.Paragraphs import Paragraphs
-from segment_selector.evaluate_config import SIZES, SEED, LABELED_DATA_TO_USE, METHODS_TO_EXECUTE
+from segment_selector.evaluate_config import SIZES, SEED, LABELED_DATA_TO_USE, METHODS_TO_EXECUTE, PDF_LABELED_DATA_PATH
 
 RANDOM_SEED = 42
 
@@ -68,6 +68,7 @@ def load_pdf_segments(task: str, pdf_name: str) -> PdfSegments:
 
     pdfs_path = join(labeled_data_root_path, "pdfs")
     pdf_features = PdfFeatures.from_poppler_etree(join(pdfs_path, pdf_name, "etree.xml"))
+    pdf_features.file_name = pdf_name
 
     pdf_path = join(pdfs_path, pdf_name, "document.pdf")
     segmentation_data: SegmentationData = get_segmentation_data(pdf_path, pdf_name)
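The explicit pdf_features.file_name assignment matters downstream: the new save_mistakes helper (next hunk) uses pdf_features.file_name to name each PDF's mistakes report.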
Expand Down Expand Up @@ -115,7 +116,27 @@ def snake_case_to_pascal_case(name: str):
return "".join(word.title() for word in name.split("_"))


def run_one_method(method_name, task, training_pdfs_segments, testing_pdfs_segments, results):
def save_mistakes(method_name: str, task: str, testing_pdfs_segments: list[PdfSegments], predictions_binary: list[int]):
prediction_index = 0
for pdf_segments in testing_pdfs_segments:
y_true = [segment.ml_label for segment in pdf_segments.pdf_segments]
pdf_segments_predictions = predictions_binary[prediction_index : prediction_index + len(y_true)]
prediction_index += len(y_true)

task_mistakes = TaskMistakes(PDF_LABELED_DATA_PATH, task + "_" + method_name, pdf_segments.pdf_features.file_name)
for segment, truth, prediction in zip(pdf_segments.pdf_segments, y_true, pdf_segments_predictions):
task_mistakes.add_label(segment.bounding_box, truth, prediction)

task_mistakes.save()


def run_one_method(
method_name: str,
task: str,
training_pdfs_segments: list[PdfSegments],
testing_pdfs_segments: list[PdfSegments],
results: Results,
):
results.set_start_time()
method_class_name = snake_case_to_pascal_case(method_name)
import_from = f"segment_selector.methods.{method_name}.{method_class_name}"
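The predictions passed to save_mistakes arrive as one flat list covering every segment of every test PDF, so the helper walks it with a running index. A minimal sketch of that realignment, with made-up data (the segment counts and predictions below are hypothetical, for illustration only):

    # Realigning a flat prediction list with per-PDF segment counts,
    # as save_mistakes does above. All values here are hypothetical.
    predictions_binary = [1, 0, 1, 1, 0]   # flat, across all test PDFs
    segments_per_pdf = [2, 3]              # PDF A has 2 segments, PDF B has 3

    prediction_index = 0
    for count in segments_per_pdf:
        per_pdf = predictions_binary[prediction_index : prediction_index + count]
        prediction_index += count
        print(per_pdf)                     # [1, 0] then [1, 1, 0]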
@@ -133,6 +154,9 @@
 
     y_true = [x.ml_label for test in testing_pdfs_segments for x in test.pdf_segments]
     prediction_binary = [1 if prediction > 0.5 else 0 for prediction in predictions]
+
+    save_mistakes(method_name, task, testing_pdfs_segments, prediction_binary)
+
     f1 = round(100 * f1_score(y_true, prediction_binary), 2)
 
     results.save_result(
@@ -154,24 +178,25 @@ def evaluate_methods():
         f1s = list()
         for size, seed, task in get_loop_values():
             training_pdfs_segments, testing_pdfs_segments = load_training_testing_data(task, seed)
-            training_pdfs_segments = training_pdfs_segments[:size]
-
-            print(
-                f"\n\nevaluating time:{datetime.now():%Y/%m/%d %H:%M} size:{size} seed:{seed} task:{task} method:{method_name}"
-            )
-            f1 = run_one_method(method_name, task, training_pdfs_segments, testing_pdfs_segments, results)
-            f1s.append(f1)
-
-            results.set_start_time()
-            results.save_result(
-                dataset="Average",
-                method="",
-                accuracy=round(sum(f1s) / len(f1s), 2),
-                train_length=0,
-                test_length=0,
-            )
-
-            results.write_results()
+            save_mistakes(task, testing_pdfs_segments, list())
+            # training_pdfs_segments = training_pdfs_segments[:size]
+            #
+            # print(
+            #     f"\n\nevaluating time:{datetime.now():%Y/%m/%d %H:%M} size:{size} seed:{seed} task:{task} method:{method_name}"
+            # )
+            # f1 = run_one_method(method_name, task, training_pdfs_segments, testing_pdfs_segments, results)
+            # f1s.append(f1)
+            #
+            # results.set_start_time()
+            # results.save_result(
+            #     dataset="Average",
+            #     method="",
+            #     accuracy=round(sum(f1s) / len(f1s), 2),
+            #     train_length=0,
+            #     test_length=0,
+            # )
+            #
+            # results.write_results()
 
 
 if __name__ == "__main__":
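Note the state this hunk leaves evaluate_methods in: the entire training/evaluation loop is committed commented out, and the remaining save_mistakes(task, testing_pdfs_segments, list()) call passes three arguments to a four-parameter function (method_name is missing) with an empty prediction list, so running the script as committed would raise a TypeError. The run_one_method path calls save_mistakes correctly; this direct call looks like a leftover debugging state rather than the intended final behaviour.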
6 changes: 6 additions & 0 deletions src/segment_selector/evaluate_config.py
@@ -1,3 +1,7 @@
+from os.path import join
+
+from config import ROOT_PATH
+
 METHODS_TO_EXECUTE = ["avoiding_words"]
 
 LABELED_DATA_TO_USE = [
@@ -15,3 +19,5 @@
 
 SIZES = [3000]
 SEED = 42
+
+PDF_LABELED_DATA_PATH = join(ROOT_PATH.parent, "pdf-labeled-data")
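PDF_LABELED_DATA_PATH resolves to a pdf-labeled-data directory one level above the project root — presumably a sibling checkout of the pdf-labeled-data repository, which is where TaskMistakes writes its per-PDF reports.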
4 changes: 2 additions & 2 deletions (changed file name not captured in this view)
@@ -11,7 +11,7 @@
 import pandas as pd
 import csv
 from transformers.utils import logging as logging_hf
-from transformers import AutoTokenizer, MT5Tokenizer, MT5ForConditionalGeneration
+from transformers import AutoTokenizer, MT5ForConditionalGeneration
 
 from config import DATA_PATH, config_logger
 from data.PdfTagData import PdfTagData
@@ -161,7 +161,7 @@ def predict(self, semantic_predictions_data: list[SemanticPredictionData]) -> list[str]:
             return texts
 
         predictions = list()
-        tokenizer = MT5Tokenizer.from_pretrained("HURIDOCS/mt5-small-spanish-es")
+        tokenizer = AutoTokenizer.from_pretrained("HURIDOCS/mt5-small-spanish-es")
         model = MT5ForConditionalGeneration.from_pretrained(self.get_model_path())
         device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
         model.to(device)
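This is the core of the fix: instead of hard-coding MT5Tokenizer (a sentencepiece-backed "slow" class), AutoTokenizer reads the tokenizer files published with the checkpoint and instantiates whichever class they declare. A minimal sketch of the difference (the printed class name is indicative, not guaranteed):

    from transformers import AutoTokenizer

    # AutoTokenizer resolves the tokenizer class from the files shipped with
    # the checkpoint, so loading keeps working if the preferred implementation
    # (e.g. a fast tokenizer) changes across transformers releases.
    tokenizer = AutoTokenizer.from_pretrained("HURIDOCS/mt5-small-spanish-es")
    print(type(tokenizer).__name__)  # e.g. T5TokenizerFast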
2 changes: 2 additions & 0 deletions src/semantic_metadata_extraction/methods/run_seq_2_seq.py
@@ -589,6 +589,8 @@ def post_processing_function(
     preds = outputs if isinstance(outputs, (np.ndarray, np.generic)) else outputs.predictions
     if isinstance(preds, tuple):
         preds = preds[0]
+
+    preds[preds == -100] = 0
     decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
 
     # Build a map example to its corresponding features.
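In Hugging Face seq2seq pipelines, -100 is the conventional ignore index that masks padded positions in label (and sometimes prediction) arrays. It is not a valid vocabulary id, so tokenizer.batch_decode would fail or misbehave on it; the added line overwrites it with 0 before decoding (replacing it with tokenizer.pad_token_id is the other common choice). A standalone sketch of the masking step, with hypothetical token ids:

    import numpy as np

    # -100 marks ignored/padded positions; map it to a real token id (0 here)
    # before decoding. The ids below are hypothetical.
    preds = np.array([[259, 1468, 1, -100, -100]])
    preds[preds == -100] = 0
    print(preds)  # [[ 259 1468    1    0    0]]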
