Skip to content

Commit

Permalink
Merge pull request #79 from huridocs/extract-countries-in-favor
Browse files Browse the repository at this point in the history
Add fast fuzzy segment selector
  • Loading branch information
gabriel-piles authored Jun 10, 2024
2 parents a7a8222 + 1910fc8 commit 499906b
Show file tree
Hide file tree
Showing 26 changed files with 3,085 additions and 96 deletions.
5 changes: 4 additions & 1 deletion src/cache_pdf_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,17 @@

from config import ROOT_PATH
from data.PdfData import PdfData
from multi_option_classification_benchmark import get_task_pdf_names, PDF_DATA_FOLDER_PATH
from pdf_multi_option_classification_benchmark import get_task_pdf_names, PDF_DATA_FOLDER_PATH

LABELED_DATA_PDFS_PATH = join(ROOT_PATH.parent, "pdf-labeled-data", "pdfs")


def cache_pdf_data():
task_pdf_names = get_task_pdf_names()
for task, pdf_names in task_pdf_names.items():
if task != "countries_in_favor":
continue

for pdf_name in pdf_names:
pdf_data_pickle_path = Path(str(join(PDF_DATA_FOLDER_PATH, pdf_name + ".pickle")))
os.makedirs(pdf_data_pickle_path.parent, exist_ok=True)
Expand Down
6 changes: 5 additions & 1 deletion src/data/PdfData.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,10 @@ def set_ml_label_from_segmentation_data(self, segmentation_data: SegmentationDat
if segment.is_selected(label_segment_box.get_bounding_box()):
segment.ml_label = 1

def clean_text(self):
    """Normalize whitespace in the text of every segment, in place.

    For each item in ``self.pdf_data_segments`` the ``text_content`` string
    is split on runs of whitespace and re-joined with single spaces. This
    collapses newlines, tabs and repeated spaces, and strips leading and
    trailing whitespace. Nothing is returned; the segments are mutated.
    """
    for segment in self.pdf_data_segments:
        # str.split() with no argument splits on any whitespace run and
        # drops empty pieces, so the join yields single-space separation.
        segment.text_content = " ".join(segment.text_content.split())

@staticmethod
def get_blank():
    """Return a new ``PdfData`` constructed with ``None`` as its only argument.

    NOTE(review): presumably this serves as an empty placeholder instance
    with no underlying PDF features; confirm against ``PdfData.__init__``.
    """
    return PdfData(None)
Expand All @@ -87,7 +91,7 @@ def from_xml_file(xml_file: XmlFile, segmentation_data: SegmentationData, pages_
pdf_data = PdfData(pdf_features)
pdf_data.set_segments_from_segmentation_data(segmentation_data)
pdf_data.set_ml_label_from_segmentation_data(segmentation_data)

pdf_data.clean_text()
return pdf_data

@staticmethod
Expand Down
2 changes: 1 addition & 1 deletion src/data/PdfDataSegment.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,4 +60,4 @@ def from_list_to_merge(pdf_segments_to_merge: list["PdfDataSegment"]):

@staticmethod
def from_texts(texts: list[str]):
    """Build one ``PdfDataSegment`` per input text, preserving input order.

    Each segment is created with an all-zero ``Rectangle(0, 0, 0, 0)`` and a
    first constructor argument equal to the text's 1-based position in
    ``texts`` (1 for the first text, 2 for the second, and so on).

    Args:
        texts: The strings to wrap; an empty list yields an empty list.

    Returns:
        A list of ``PdfDataSegment`` objects, one per entry of ``texts``.
    """
    # Only the enumerating form is kept: the older variant passed a constant
    # 1 for every segment and, being listed first, made this return
    # unreachable.
    return [
        PdfDataSegment(position, Rectangle(0, 0, 0, 0), text)
        for position, text in enumerate(texts, start=1)
    ]
Original file line number Diff line number Diff line change
@@ -1,14 +1,10 @@
import random
import shutil
from abc import abstractmethod
from typing import Type

from sklearn.metrics import f1_score

from data.ExtractionIdentifier import ExtractionIdentifier
from data.Option import Option
from data.ExtractionData import ExtractionData
from data.TrainingSample import TrainingSample
from extractors.ExtractorBase import ExtractorBase
from extractors.pdf_to_multi_option_extractor.MultiLabelMethod import MultiLabelMethod
from extractors.pdf_to_multi_option_extractor.FilterSegmentsMethod import FilterSegmentsMethod
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,12 @@
from extractors.pdf_to_multi_option_extractor.multi_labels_methods.SetFitMethod import SetFitMethod
from extractors.pdf_to_multi_option_extractor.multi_labels_methods.SingleLabelSetFitMethod import SingleLabelSetFitMethod
from extractors.pdf_to_multi_option_extractor.multi_labels_methods.TfIdfMethod import TfIdfMethod
from extractors.pdf_to_multi_option_extractor.multi_option_extraction_methods.FastSegmentSelectorFuzzy95 import (
FastSegmentSelectorFuzzy95,
)
from extractors.pdf_to_multi_option_extractor.multi_option_extraction_methods.FastSegmentSelectorFuzzyCommas import (
FastSegmentSelectorFuzzyCommas,
)
from extractors.pdf_to_multi_option_extractor.multi_option_extraction_methods.FuzzyAll100 import FuzzyAll100
from extractors.pdf_to_multi_option_extractor.multi_option_extraction_methods.FuzzyAll75 import FuzzyAll75
from extractors.pdf_to_multi_option_extractor.multi_option_extraction_methods.FuzzyAll88 import FuzzyAll88
Expand All @@ -32,6 +38,9 @@
)
from extractors.pdf_to_multi_option_extractor.multi_option_extraction_methods.FuzzyLast import FuzzyLast
from extractors.pdf_to_multi_option_extractor.multi_option_extraction_methods.FuzzyLastCleanLabel import FuzzyLastCleanLabel
from extractors.pdf_to_multi_option_extractor.multi_option_extraction_methods.FuzzySegmentSelector import (
FuzzySegmentSelector,
)
from send_logs import send_logs


Expand All @@ -44,6 +53,9 @@ class PdfToMultiOptionExtractor(ExtractorBase):
FuzzyAll75(),
FuzzyAll88(),
FuzzyAll100(),
FastSegmentSelectorFuzzy95(),
FastSegmentSelectorFuzzyCommas(),
FuzzySegmentSelector(),
PdfMultiOptionMethod(CleanBeginningDigits3000, TfIdfMethod),
PdfMultiOptionMethod(CleanEndDotDigits1000, TfIdfMethod),
PdfMultiOptionMethod(CleanBeginningDotDigits500, FastTextMethod),
Expand Down
Loading

0 comments on commit 499906b

Please sign in to comment.