Skip to content

Commit

Permalink
Merge pull request #76 from huridocs/extractors-base
Browse files Browse the repository at this point in the history
Text to multi-option extractor
  • Loading branch information
gabriel-piles authored Apr 29, 2024
2 parents a662718 + b729059 commit 6419d98
Show file tree
Hide file tree
Showing 167 changed files with 817,398 additions and 1,286 deletions.
4 changes: 2 additions & 2 deletions dev-requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
-r requirements.txt
mongomock==4.1.2
pytest==8.1.1
black==24.3.0
pytest==8.2.0
black==24.4.2
27 changes: 18 additions & 9 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,23 +3,32 @@ git+https://github.com/huridocs/pdf_paragraphs_extraction@d581e146883fce5b6ba60f
slugify==0.0.1
PyRSMQ==0.5.0
redis==5.0.3
python-Levenshtein==0.25.0
python-Levenshtein==0.25.1
tdda==2.0.9
datasets==2.18.0
torch==2.2.2
datasets==2.19.0
torch==2.3.0
evaluate==0.4.1
pandas==2.2.1
pandas==2.2.2
dateparser==1.2.0
langcodes==3.3.0
langcodes==3.4.0
nltk==3.8.1
sentencepiece==0.2.0
accelerate==0.28.0
accelerate==0.29.3
mongomock==4.1.2
fasttext-wheel==0.9.2
rich==13.7.1
joblib==1.3.2
joblib==1.4.0
tqdm==4.66.2
spacy==3.7.4
rapidfuzz==3.7.0
transformers==4.39.3
rapidfuzz==3.8.1
transformers==4.40.1
sentry_sdk==1.44.0
pydantic==2.6.4
pymongo==4.6.3
fastapi==0.110.1
graypy==2.1.0
numpy==1.26.4
lightgbm==4.3.0
setfit==1.0.3
fuzzywuzzy==0.18.0
requests==2.31.0
174 changes: 96 additions & 78 deletions src/Extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,168 +6,186 @@

import pymongo

from config import config_logger, MONGO_PORT, MONGO_HOST, DATA_PATH
from config import MONGO_PORT, MONGO_HOST, DATA_PATH, config_logger
from data.ExtractionIdentifier import ExtractionIdentifier
from data.LabeledData import LabeledData
from data.LogsMessage import Severity
from data.Option import Option

from data.PredictionData import PredictionData
from data.PredictionSample import PredictionSample
from data.SegmentationData import SegmentationData
from data.Suggestion import Suggestion
from data.ExtractionTask import ExtractionTask
from FilterValidSegmentsPages import FilterValidSegmentsPages
from extractors.ExtractorBase import ExtractorBase
from extractors.NaiveExtractor import NaiveExtractor
from extractors.pdf_to_text_extractor.PdfToTextExtractor import PdfToTextExtractor
from data.PdfData import PdfData

from XmlFile import XmlFile
from data.ExtractionData import ExtractionData
from data.ExtractionSample import ExtractionSample
from data.TrainingSample import TrainingSample
from extractors.pdf_to_multi_option_extractor.PdfToMultiOptionExtractor import PdfToMultiOptionExtractor
from extractors.text_to_multi_option_extractor.TextToMultiOptionExtractor import TextToMultiOptionExtractor
from extractors.text_to_text_extractor.TextToTextExtractor import TextToTextExtractor
from send_logs import send_logs


class Extractor:
EXTRACTORS: list[type[ExtractorBase]] = [
TextToMultiOptionExtractor,
PdfToMultiOptionExtractor,
PdfToTextExtractor,
TextToTextExtractor,
NaiveExtractor,
]

CREATE_MODEL_TASK_NAME = "create_model"
SUGGESTIONS_TASK_NAME = "suggestions"

def __init__(self, extraction_identifier: ExtractionIdentifier, options: list[Option] = None, multi_value: bool = False):
    """Bind this extractor to one extraction and open its MongoDB database.

    :param extraction_identifier: identifies the extraction; its ``run_name``
        and ``extraction_name`` are used to build the Mongo filter.
    :param options: stored as given on ``self.options`` (``None`` by default).
    :param multi_value: stored as given on ``self.multi_value``.
    """
    self.extraction_identifier = extraction_identifier
    self.multi_value = multi_value
    self.options = options

    # A new client is created for every Extractor instance.
    mongo_client = pymongo.MongoClient(f"{MONGO_HOST}:{MONGO_PORT}")
    self.pdf_metadata_extraction_db = mongo_client["pdf_metadata_extraction"]

    # Every query issued by this instance is scoped to this tenant/id pair.
    self.mongo_filter = {
        "tenant": self.extraction_identifier.run_name,
        "id": self.extraction_identifier.extraction_name,
    }

    self.pdfs_data: list[PdfData] = []
    self.labeled_data_list: list[LabeledData] = []
    self.predictions_data_list: list[PredictionData] = []

def get_labeled_data(self):
    """Return every labeled-data document matching ``self.mongo_filter``.

    Each document found in the ``labeled_data`` collection is unpacked as
    keyword arguments into a ``LabeledData`` instance; the result is a list
    (empty when nothing matches).
    """
    matching_documents = self.pdf_metadata_extraction_db.labeled_data.find(self.mongo_filter)
    return [LabeledData(**stored_document) for stored_document in matching_documents]

def set_pdf_data_for_training(self):
start = time()
config_logger.info(f"Loading data to create model for {str(self.extraction_identifier)}")
labeled_data_list = self.get_labeled_data()
def get_extraction_data_for_training(self, labeled_data_list: list[LabeledData]) -> ExtractionData:
multi_option_samples: list[TrainingSample] = list()
page_numbers_list = FilterValidSegmentsPages(self.extraction_identifier).for_training(labeled_data_list)
for labeled_data, page_numbers_to_keep in zip(labeled_data_list, page_numbers_list):
segmentation_data = SegmentationData.from_labeled_data(labeled_data)
extraction_identifier = ExtractionIdentifier(run_name=labeled_data.tenant, extraction_name=labeled_data.id)
xml_file = XmlFile(
extraction_identifier=extraction_identifier,
extraction_identifier=self.extraction_identifier,
to_train=True,
xml_file_name=labeled_data.xml_file_name,
)

pdf_data = PdfData.from_xml_file(xml_file, segmentation_data, page_numbers_to_keep)
if exists(xml_file.xml_file_path) and not os.path.isdir(xml_file.xml_file_path):
pdf_data = PdfData.from_xml_file(xml_file, segmentation_data, page_numbers_to_keep)
else:
pdf_data = PdfData.from_texts([""])
sample = TrainingSample(pdf_data=pdf_data, labeled_data=labeled_data, tags_texts=[labeled_data.source_text])
multi_option_samples.append(sample)

return ExtractionData(
samples=multi_option_samples,
options=self.options,
multi_value=self.multi_value,
extraction_identifier=self.extraction_identifier,
)

def create_models(self) -> (bool, str):
start = time()
send_logs(self.extraction_identifier, "Loading data to create model")
extraction_data: ExtractionData = self.get_extraction_data_for_training(self.get_labeled_data())
send_logs(self.extraction_identifier, f"Set data in {round(time() - start, 2)} seconds")

if not extraction_data or not extraction_data.samples:
self.delete_training_data()
return False, "No data to create model"

if not pdf_data:
for extractor in self.EXTRACTORS:
extractor_instance = extractor(self.extraction_identifier)

if not extractor_instance.can_be_used(extraction_data):
continue

self.labeled_data_list.append(labeled_data)
self.pdfs_data.append(pdf_data)
send_logs(self.extraction_identifier, f"Using extractor {extractor_instance.get_name()}")
send_logs(self.extraction_identifier, f"Creating models")
self.extraction_identifier.get_extractor_used_path().write_text(extractor_instance.get_name())
self.delete_training_data()
return extractor_instance.create_model(extraction_data)

config_logger.info(f"Set pdf data {round(time() - start, 2)} seconds")
self.delete_training_data()
send_logs(self.extraction_identifier, "Error creating extractor", Severity.error)

def set_pdf_data_for_predictions(self):
config_logger.info(f"Loading data to calculate suggestions for {self.extraction_identifier}")
prediction_data_list = []
for document in self.pdf_metadata_extraction_db.prediction_data.find(self.mongo_filter):
prediction_data_list.append(PredictionData(**document))
return False, "Error creating extractor"

def get_prediction_samples(self, prediction_data_list: list[PredictionData] = None) -> list[PredictionSample]:
filter_valid_pages = FilterValidSegmentsPages(self.extraction_identifier)
page_numbers_list = filter_valid_pages.for_prediction(prediction_data_list)
config_logger.info(f"Filter pages for prediction: total {len(page_numbers_list)} documents.")
config_logger.info(f"Filter: {page_numbers_list}")

prediction_samples: list[PredictionSample] = []
for prediction_data, page_numbers in zip(prediction_data_list, page_numbers_list):
self.predictions_data_list.append(prediction_data)
segmentation_data = SegmentationData.from_prediction_data(prediction_data)
entity_name = prediction_data.entity_name if prediction_data.entity_name else prediction_data.xml_file_name

xml_file = XmlFile(
extraction_identifier=self.extraction_identifier,
to_train=False,
xml_file_name=prediction_data.xml_file_name,
)
self.pdfs_data.append(PdfData.from_xml_file(xml_file, segmentation_data, page_numbers))

def create_models(self):
self.set_pdf_data_for_training()
is_multi_option = len(self.options) > 1
if is_multi_option:
multi_option_extractor = PdfToMultiOptionExtractor(self.extraction_identifier)
model_created = multi_option_extractor.create_model(self.get_multi_option_data())
else:
pdf_metadata_extractor = PdfToTextExtractor(
extraction_identifier=self.extraction_identifier, pdfs_data=self.pdfs_data
)
model_created = pdf_metadata_extractor.create_model(self.labeled_data_list)

self.delete_training_data()
return model_created
if exists(xml_file.xml_file_path) and not os.path.isdir(xml_file.xml_file_path):
pdf_data = PdfData.from_xml_file(xml_file, segmentation_data, page_numbers)
else:
pdf_data = PdfData.from_texts([""])

sample = PredictionSample(pdf_data=pdf_data, entity_name=entity_name, tags_texts=[prediction_data.source_text])
prediction_samples.append(sample)

return prediction_samples

def get_prediction_data_from_db(self):
    """Return every prediction-data document matching ``self.mongo_filter``.

    Each document found in the ``prediction_data`` collection is unpacked as
    keyword arguments into a ``PredictionData`` instance; the result is a
    list (empty when nothing matches).
    """
    matching_documents = self.pdf_metadata_extraction_db.prediction_data.find(self.mongo_filter)
    return [PredictionData(**stored_document) for stored_document in matching_documents]

def delete_training_data(self):
    """Remove this extraction's training XML folder and labeled-data records.

    Logs the folder about to be removed, deletes it from disk (errors from
    the removal are ignored), then deletes every ``labeled_data`` document
    matching ``self.mongo_filter``.
    """
    identifier = self.extraction_identifier
    training_folder = XmlFile.get_xml_folder_path(extraction_identifier=identifier, to_train=True)
    send_logs(identifier, f"Deleting training data in {training_folder}")

    # Files first, database records second -- same order as before.
    shutil.rmtree(training_folder, ignore_errors=True)
    labeled_data_collection = self.pdf_metadata_extraction_db.labeled_data
    labeled_data_collection.delete_many(self.mongo_filter)

def insert_suggestions_in_db(self, suggestions: list[Suggestion]) -> (bool, str):
if not suggestions:
return False, "No data to calculate suggestions"

config_logger.info(f"Calculated and inserting {len(suggestions)} suggestions")
send_logs(self.extraction_identifier, f"Calculated and inserting {len(suggestions)} suggestions")

self.pdf_metadata_extraction_db.suggestions.insert_many([x.to_dict() for x in suggestions])
xml_folder_path = XmlFile.get_xml_folder_path(extraction_identifier=self.extraction_identifier, to_train=False)
for suggestion in suggestions:
xml_name = {"xml_file_name": suggestion.xml_file_name}
self.pdf_metadata_extraction_db.prediction_data.delete_many({**self.mongo_filter, **xml_name})
Path(join(xml_folder_path, suggestion.xml_file_name)).unlink(missing_ok=True)
entity_name = {"entity_name": suggestion.entity_name, "xml_file_name": ""}
xml_file_name = {"xml_file_name": suggestion.xml_file_name, "entity_name": ""}
self.pdf_metadata_extraction_db.prediction_data.delete_many({**self.mongo_filter, **entity_name})
self.pdf_metadata_extraction_db.prediction_data.delete_many({**self.mongo_filter, **xml_file_name})
path = Path(join(xml_folder_path, suggestion.xml_file_name))
if not path.is_dir():
path.unlink(missing_ok=True)

return True, ""

def get_suggestions(self) -> list[Suggestion]:
self.set_pdf_data_for_predictions()

if PdfToMultiOptionExtractor.is_multi_option_extraction(self.extraction_identifier):
multi_option_extractor = PdfToMultiOptionExtractor(self.extraction_identifier)
return multi_option_extractor.get_suggestions(self.pdfs_data)
send_logs(self.extraction_identifier, f"Gathering data to calculate suggestions")

pdf_metadata_extractor = PdfToTextExtractor(self.extraction_identifier, self.pdfs_data)
semantic_predictions_texts = pdf_metadata_extractor.get_metadata_predictions()
prediction_samples = self.get_prediction_samples(self.get_prediction_data_from_db())

if not semantic_predictions_texts:
if not self.extraction_identifier.get_extractor_used_path().exists():
send_logs(self.extraction_identifier, f"No extractor available", Severity.error)
return []

suggestions = self.get_empty_suggestions()

for suggestion, semantic_prediction, pdf_data in zip(suggestions, semantic_predictions_texts, self.pdfs_data):
suggestion.add_prediction(semantic_prediction, pdf_data)

return suggestions

def get_empty_suggestions(self):
suggestions = []
for prediction_data in self.predictions_data_list:
suggestions.append(Suggestion.get_empty(self.extraction_identifier, prediction_data.xml_file_name))
return suggestions
extractor_name = self.extraction_identifier.get_extractor_used_path().read_text()
for extractor in self.EXTRACTORS:
extractor_instance = extractor(self.extraction_identifier)
if extractor_instance.get_name() != extractor_name:
continue

def get_multi_option_data(self):
multi_option_samples: list[ExtractionSample] = list()
for pdf_data, labeled_data in zip(self.pdfs_data, self.labeled_data_list):
multi_option_sample = ExtractionSample(pdf_data=pdf_data, labeled_data=labeled_data)
multi_option_samples.append(multi_option_sample)
send_logs(self.extraction_identifier, f"Calculating suggestions with {extractor_instance.get_name()}")
return extractor_instance.get_suggestions(prediction_samples)

return ExtractionData(
samples=multi_option_samples,
options=self.options,
multi_value=self.multi_value,
extraction_identifier=self.extraction_identifier,
)
send_logs(self.extraction_identifier, f"No extractor available", Severity.error)
return []

@staticmethod
def remove_old_models(extractor_identifier: ExtractionIdentifier):
Expand All @@ -181,6 +199,7 @@ def remove_old_models(extractor_identifier: ExtractionIdentifier):
for extraction_name in os.listdir(join(DATA_PATH, run_name)):
extractor_identifier_to_check = ExtractionIdentifier(run_name=run_name, extraction_name=extraction_name)
if extractor_identifier_to_check.is_old():
config_logger.info(f"Removing old model folder {extractor_identifier_to_check.get_path()}")
shutil.rmtree(extractor_identifier_to_check.get_path(), ignore_errors=True)

@staticmethod
Expand All @@ -196,7 +215,6 @@ def calculate_task(extraction_task: ExtractionTask) -> (bool, str):
return extractor.create_models()

if extraction_task.task == Extractor.SUGGESTIONS_TASK_NAME:
config_logger.info("Calculating suggestions")
extractor = Extractor(extractor_identifier)
suggestions = extractor.get_suggestions()
return extractor.insert_suggestions_in_db(suggestions)
Expand Down
13 changes: 12 additions & 1 deletion src/QueueProcessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,16 @@
from sentry_sdk.integrations.redis import RedisIntegration
import sentry_sdk

from config import config_logger, SERVICE_HOST, SERVICE_PORT, REDIS_HOST, REDIS_PORT, TASK_QUEUE_NAME, RESULTS_QUEUE_NAME
from config import (
config_logger,
SERVICE_HOST,
SERVICE_PORT,
REDIS_HOST,
REDIS_PORT,
TASK_QUEUE_NAME,
RESULTS_QUEUE_NAME,
logs_queue,
)
from data.ExtractionTask import ExtractionTask
from data.ResultsMessage import ResultsMessage
from Extractor import Extractor
Expand Down Expand Up @@ -86,6 +95,7 @@ def subscribe_to_tasks_queue(self):
try:
self.task_queue.getQueueAttributes().exec_command()
self.results_queue.getQueueAttributes().exec_command()
logs_queue.getQueueAttributes().exec_command()

redis_smq_consumer = RedisSMQConsumer(
qname=TASK_QUEUE_NAME,
Expand All @@ -101,6 +111,7 @@ def subscribe_to_tasks_queue(self):
config_logger.info("Creating queues")
self.task_queue.createQueue().exceptions(False).execute()
self.results_queue.createQueue().exceptions(False).execute()
logs_queue.createQueue().exceptions(False).execute()
config_logger.info("Queues have been created")


Expand Down
26 changes: 0 additions & 26 deletions src/check_mistakes.py

This file was deleted.

1 change: 0 additions & 1 deletion src/check_performance.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import pandas as pd
from datetime import datetime

from data.SemanticExtractionData import SemanticExtractionData
from performance.Results import Results
from extractors.text_to_text_extractor.TextToTextMethod import TextToTextMethod
from extractors.text_to_text_extractor.methods.MT5TrueCaseEnglishSpanishMethod import MT5TrueCaseEnglishSpanishMethod
Expand Down
Loading

0 comments on commit 6419d98

Please sign in to comment.