Merge pull request #73 from huridocs/fix-tokenizer
Fix tokenizer
gabriel-piles authored Oct 19, 2023
2 parents bae1ff6 + c246191 commit def2a74
Showing 11 changed files with 64 additions and 45 deletions.
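At a glance: the tokenizer fix swaps the hard-coded MT5Tokenizer for AutoTokenizer and bumps transformers from 4.34.0 to 4.34.1; seq2seq predictions are sanitized (-100 values replaced) before decoding; the Gunicorn entrypoint in all four compose files gains --timeout 300; the Makefile gets a start_local_gpu target; the paragraph-selector evaluation script gains a save_mistakes helper backed by a new PDF_LABELED_DATA_PATH config value; and a stale performance-results file is deleted.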
4 changes: 4 additions & 0 deletions Makefile
@@ -28,6 +28,10 @@ start:
 	docker compose -f local-docker-compose.yml up --attach pdf_metadata_extraction_worker --attach pdf_metadata_extraction_api --build
 
 start_gpu:
 	docker compose -f gpu-docker-compose.yml up --attach pdf_metadata_extraction_worker --attach pdf_metadata_extraction_api --build
 
+
+start_local_gpu:
+	docker compose -f local-gpu-docker-compose.yml up --attach pdf_metadata_extraction_worker --attach pdf_metadata_extraction_api --build
+
 start_detached:
2 changes: 1 addition & 1 deletion docker-compose.yml
@@ -2,7 +2,7 @@ version: "3.8"
 services:
   pdf_metadata_extraction_api:
     container_name: pdf_metadata_extraction_api
-    entrypoint: [ "gunicorn", "-k", "uvicorn.workers.UvicornWorker", "--chdir", "./src", "app:app", "--bind", "0.0.0.0:5056" ]
+    entrypoint: [ "gunicorn", "-k", "uvicorn.workers.UvicornWorker", "--chdir", "./src", "app:app", "--bind", "0.0.0.0:5056", "--timeout", "300" ]
     init: true
     restart: unless-stopped
     build:
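Gunicorn restarts any worker that stays silent for longer than --timeout seconds, and the default is 30. Model loading and long extraction requests can easily exceed that, so the API entrypoint now allows 300 seconds. The same flag is added to the gpu, local, and local-gpu compose files below.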
2 changes: 1 addition & 1 deletion gpu-docker-compose.yml
@@ -2,7 +2,7 @@ version: "3.8"
 services:
   pdf_metadata_extraction_api:
     container_name: pdf_metadata_extraction_api
-    entrypoint: [ "gunicorn", "-k", "uvicorn.workers.UvicornWorker", "--chdir", "./src", "app:app", "--bind", "0.0.0.0:5056" ]
+    entrypoint: [ "gunicorn", "-k", "uvicorn.workers.UvicornWorker", "--chdir", "./src", "app:app", "--bind", "0.0.0.0:5056", "--timeout", "300" ]
     init: true
     restart: unless-stopped
     build:
2 changes: 1 addition & 1 deletion local-docker-compose.yml
@@ -2,7 +2,7 @@ version: "3.8"
 services:
   pdf_metadata_extraction_api:
     container_name: pdf_metadata_extraction_api
-    entrypoint: [ "gunicorn", "-k", "uvicorn.workers.UvicornWorker", "--chdir", "./src", "app:app", "--bind", "0.0.0.0:5056" ]
+    entrypoint: [ "gunicorn", "-k", "uvicorn.workers.UvicornWorker", "--chdir", "./src", "app:app", "--bind", "0.0.0.0:5056", "--timeout", "300" ]
     init: true
     restart: unless-stopped
     build:
2 changes: 1 addition & 1 deletion local-gpu-docker-compose.yml
@@ -2,7 +2,7 @@ version: "3.8"
 services:
   pdf_metadata_extraction_api:
     container_name: pdf_metadata_extraction_api
-    entrypoint: [ "gunicorn", "-k", "uvicorn.workers.UvicornWorker", "--chdir", "./src", "app:app", "--bind", "0.0.0.0:5056" ]
+    entrypoint: [ "gunicorn", "-k", "uvicorn.workers.UvicornWorker", "--chdir", "./src", "app:app", "--bind", "0.0.0.0:5056", "--timeout", "300" ]
     init: true
     restart: unless-stopped
     build:
18 changes: 0 additions & 18 deletions performance_results/paragraph_selector_2023_10_13_16_09.md

This file was deleted.

2 changes: 1 addition & 1 deletion requirements.txt
@@ -20,7 +20,7 @@ pandas==2.1.1
 dateparser==1.1.8
 langcodes==3.3.0
 nltk==3.8.1
-transformers==4.34.0
+transformers==4.34.1
 httpx==0.25.0
 sentencepiece==0.1.99
 accelerate==0.23.0
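The transformers pin moves up one patch release. 4.34.1 is a small bug-fix release; the bump accompanies the tokenizer changes below and presumably picks up fixes that the new AutoTokenizer loading path relies on.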
65 changes: 45 additions & 20 deletions src/evaluate_paragraph_selector_methods.py
@@ -20,7 +20,7 @@
 from metadata_extraction.PdfSegments import PdfSegments
 from performance.Results import Results
 from segment_selector.Paragraphs import Paragraphs
-from segment_selector.evaluate_config import SIZES, SEED, LABELED_DATA_TO_USE, METHODS_TO_EXECUTE
+from segment_selector.evaluate_config import SIZES, SEED, LABELED_DATA_TO_USE, METHODS_TO_EXECUTE, PDF_LABELED_DATA_PATH
 
 RANDOM_SEED = 42
 
@@ -68,6 +68,7 @@ def load_pdf_segments(task: str, pdf_name: str) -> PdfSegments:
 
     pdfs_path = join(labeled_data_root_path, "pdfs")
     pdf_features = PdfFeatures.from_poppler_etree(join(pdfs_path, pdf_name, "etree.xml"))
+    pdf_features.file_name = pdf_name
 
     pdf_path = join(pdfs_path, pdf_name, "document.pdf")
     segmentation_data: SegmentationData = get_segmentation_data(pdf_path, pdf_name)
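The explicit pdf_features.file_name assignment matters downstream: the new save_mistakes helper (next hunk) uses pdf_features.file_name to name each PDF's mistakes report.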
Expand Down Expand Up @@ -115,7 +116,27 @@ def snake_case_to_pascal_case(name: str):
return "".join(word.title() for word in name.split("_"))


def run_one_method(method_name, task, training_pdfs_segments, testing_pdfs_segments, results):
def save_mistakes(method_name: str, task: str, testing_pdfs_segments: list[PdfSegments], predictions_binary: list[int]):
prediction_index = 0
for pdf_segments in testing_pdfs_segments:
y_true = [segment.ml_label for segment in pdf_segments.pdf_segments]
pdf_segments_predictions = predictions_binary[prediction_index : prediction_index + len(y_true)]
prediction_index += len(y_true)

task_mistakes = TaskMistakes(PDF_LABELED_DATA_PATH, task + "_" + method_name, pdf_segments.pdf_features.file_name)
for segment, truth, prediction in zip(pdf_segments.pdf_segments, y_true, pdf_segments_predictions):
task_mistakes.add_label(segment.bounding_box, truth, prediction)

task_mistakes.save()


def run_one_method(
method_name: str,
task: str,
training_pdfs_segments: list[PdfSegments],
testing_pdfs_segments: list[PdfSegments],
results: Results,
):
results.set_start_time()
method_class_name = snake_case_to_pascal_case(method_name)
import_from = f"segment_selector.methods.{method_name}.{method_class_name}"
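The predictions passed to save_mistakes arrive as one flat list covering every segment of every test PDF, so the helper walks it with a running index. A minimal sketch of that realignment, with made-up data (the segment counts and predictions below are hypothetical, for illustration only):

    # Realigning a flat prediction list with per-PDF segment counts,
    # as save_mistakes does above. All values here are hypothetical.
    predictions_binary = [1, 0, 1, 1, 0]   # flat, across all test PDFs
    segments_per_pdf = [2, 3]              # PDF A has 2 segments, PDF B has 3

    prediction_index = 0
    for count in segments_per_pdf:
        per_pdf = predictions_binary[prediction_index : prediction_index + count]
        prediction_index += count
        print(per_pdf)                     # [1, 0] then [1, 1, 0]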
@@ -133,6 +154,9 @@
 
     y_true = [x.ml_label for test in testing_pdfs_segments for x in test.pdf_segments]
     prediction_binary = [1 if prediction > 0.5 else 0 for prediction in predictions]
+
+    save_mistakes(method_name, task, testing_pdfs_segments, prediction_binary)
+
     f1 = round(100 * f1_score(y_true, prediction_binary), 2)
 
     results.save_result(
@@ -154,24 +178,25 @@ def evaluate_methods():
         f1s = list()
         for size, seed, task in get_loop_values():
             training_pdfs_segments, testing_pdfs_segments = load_training_testing_data(task, seed)
-            training_pdfs_segments = training_pdfs_segments[:size]
-
-            print(
-                f"\n\nevaluating time:{datetime.now():%Y/%m/%d %H:%M} size:{size} seed:{seed} task:{task} method:{method_name}"
-            )
-            f1 = run_one_method(method_name, task, training_pdfs_segments, testing_pdfs_segments, results)
-            f1s.append(f1)
-
-            results.set_start_time()
-            results.save_result(
-                dataset="Average",
-                method="",
-                accuracy=round(sum(f1s) / len(f1s), 2),
-                train_length=0,
-                test_length=0,
-            )
-
-            results.write_results()
+            save_mistakes(task, testing_pdfs_segments, list())
+            # training_pdfs_segments = training_pdfs_segments[:size]
+            #
+            # print(
+            #     f"\n\nevaluating time:{datetime.now():%Y/%m/%d %H:%M} size:{size} seed:{seed} task:{task} method:{method_name}"
+            # )
+            # f1 = run_one_method(method_name, task, training_pdfs_segments, testing_pdfs_segments, results)
+            # f1s.append(f1)
+            #
+            # results.set_start_time()
+            # results.save_result(
+            #     dataset="Average",
+            #     method="",
+            #     accuracy=round(sum(f1s) / len(f1s), 2),
+            #     train_length=0,
+            #     test_length=0,
+            # )
+            #
+            # results.write_results()
 
 
 if __name__ == "__main__":
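Note the state this hunk leaves evaluate_methods in: the entire training/evaluation loop is committed commented out, and the remaining save_mistakes(task, testing_pdfs_segments, list()) call passes three arguments to a four-parameter function (method_name is missing) with an empty prediction list, so running the script as committed would raise a TypeError. The run_one_method path calls save_mistakes correctly; this direct call looks like a leftover debugging state rather than the intended final behaviour.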
6 changes: 6 additions & 0 deletions src/segment_selector/evaluate_config.py
@@ -1,3 +1,7 @@
+from os.path import join
+
+from config import ROOT_PATH
+
 METHODS_TO_EXECUTE = ["avoiding_words"]
 
 LABELED_DATA_TO_USE = [
@@ -15,3 +19,5 @@
 
 SIZES = [3000]
 SEED = 42
+
+PDF_LABELED_DATA_PATH = join(ROOT_PATH.parent, "pdf-labeled-data")
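PDF_LABELED_DATA_PATH resolves to a pdf-labeled-data directory one level above the project root — presumably a sibling checkout of the pdf-labeled-data repository, which is where TaskMistakes writes its per-PDF reports.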
4 changes: 2 additions & 2 deletions (changed file name not captured in this view)
@@ -11,7 +11,7 @@
 import pandas as pd
 import csv
 from transformers.utils import logging as logging_hf
-from transformers import AutoTokenizer, MT5Tokenizer, MT5ForConditionalGeneration
+from transformers import AutoTokenizer, MT5ForConditionalGeneration
 
 from config import DATA_PATH, config_logger
 from data.PdfTagData import PdfTagData
@@ -161,7 +161,7 @@ def predict(self, semantic_predictions_data: list[SemanticPredictionData]) -> list[str]:
             return texts
 
         predictions = list()
-        tokenizer = MT5Tokenizer.from_pretrained("HURIDOCS/mt5-small-spanish-es")
+        tokenizer = AutoTokenizer.from_pretrained("HURIDOCS/mt5-small-spanish-es")
         model = MT5ForConditionalGeneration.from_pretrained(self.get_model_path())
         device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
         model.to(device)
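This is the core of the fix: instead of hard-coding MT5Tokenizer (a sentencepiece-backed "slow" class), AutoTokenizer reads the tokenizer files published with the checkpoint and instantiates whichever class they declare. A minimal sketch of the difference (the printed class name is indicative, not guaranteed):

    from transformers import AutoTokenizer

    # AutoTokenizer resolves the tokenizer class from the files shipped with
    # the checkpoint, so loading keeps working if the preferred implementation
    # (e.g. a fast tokenizer) changes across transformers releases.
    tokenizer = AutoTokenizer.from_pretrained("HURIDOCS/mt5-small-spanish-es")
    print(type(tokenizer).__name__)  # e.g. T5TokenizerFast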
2 changes: 2 additions & 0 deletions src/semantic_metadata_extraction/methods/run_seq_2_seq.py
@@ -589,6 +589,8 @@ def post_processing_function(
     preds = outputs if isinstance(outputs, (np.ndarray, np.generic)) else outputs.predictions
     if isinstance(preds, tuple):
         preds = preds[0]
+
+    preds[preds == -100] = 0
     decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
 
     # Build a map example to its corresponding features.
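In Hugging Face seq2seq pipelines, -100 is the conventional ignore index that masks padded positions in label (and sometimes prediction) arrays. It is not a valid vocabulary id, so tokenizer.batch_decode would fail or misbehave on it; the added line overwrites it with 0 before decoding (replacing it with tokenizer.pad_token_id is the other common choice). A standalone sketch of the masking step, with hypothetical token ids:

    import numpy as np

    # -100 marks ignored/padded positions; map it to a real token id (0 here)
    # before decoding. The ids below are hypothetical.
    preds = np.array([[259, 1468, 1, -100, -100]])
    preds[preds == -100] = 0
    print(preds)  # [[ 259 1468    1    0    0]]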
