Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[DEV-11721]: create dataset with email options #333

Merged
merged 4 commits into from
Sep 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions indico/queries/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
from pathlib import Path
from typing import Dict, List, Optional, Union

import deprecation
import jsons
import pandas as pd

Expand All @@ -22,6 +21,7 @@
from indico.queries.storage import UploadBatched, UploadImages
from indico.types.dataset import (
Dataset,
EmailOptions,
OcrEngine,
OcrInputLanguage,
OmnipageOcrOptionsInput,
Expand Down Expand Up @@ -228,6 +228,7 @@ def __init__(
omnipage_ocr_options: OmnipageOcrOptionsInput = None,
read_api_ocr_options: ReadApiOcrOptionsInput = None,
request_interval: Union[int, float] = 5,
email_options: EmailOptions = None,
):
self.files = files
self.name = name
Expand All @@ -240,6 +241,7 @@ def __init__(
self.omnipage_ocr_options = omnipage_ocr_options
self.read_api_ocr_options = read_api_ocr_options
self.request_interval = request_interval
self.email_options = email_options
if omnipage_ocr_options is not None and read_api_ocr_options is not None:
raise IndicoInputError(
"Must supply either omnipage or readapi options but not both."
Expand Down Expand Up @@ -279,6 +281,7 @@ def requests(self):
readapi_ocr_options=self.read_api_ocr_options,
omnipage_ocr_options=self.omnipage_ocr_options,
ocr_engine=self.ocr_engine,
email_options=self.email_options,
)
yield _AddFiles(
dataset_id=self.previous.id, metadata=file_metadata, autoprocess=True
Expand Down Expand Up @@ -376,6 +379,7 @@ def __init__(
ocr_engine: OcrEngine = None,
omnipage_ocr_options: OmnipageOcrOptionsInput = None,
readapi_ocr_options: ReadApiOcrOptionsInput = None,
email_options: EmailOptions = None,
):
if not dataset_type:
dataset_type = "TEXT"
Expand All @@ -386,7 +390,8 @@ def __init__(
"ocrEngine": ocr_engine.name,
"omnipageOptions": omnipage_ocr_options,
"readapiOptions": readapi_ocr_options,
}
},
"emailOptions": email_options,
}
super().__init__(
self.query,
Expand Down
45 changes: 39 additions & 6 deletions indico/types/dataset.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
from enum import Enum
from typing import List
from typing import List, Optional

from indico.errors import IndicoInputError
from indico.types.base import BaseType
from indico.types.datafile import Datafile
from indico.errors import IndicoInputError


class DataColumn(BaseType):
Expand Down Expand Up @@ -57,28 +57,35 @@ class Dataset(BaseType):

def labelset_by_name(self, name: str) -> LabelSet:
    """
    Return the labelset of this dataset whose name equals ``name``.

    Args:
        name (str): Name of the labelset to look up.

    Returns:
        LabelSet: The first entry of ``self.labelsets`` with a matching name.

    Raises:
        IndicoInputError: If no labelset has that name; the message lists
            the labelset names that do exist.
    """
    # Single pass: return on the first match instead of scanning once for
    # membership and a second time to fetch the element.
    for labelset in self.labelsets:
        if labelset.name == name:
            return labelset
    raise IndicoInputError(
        f"No labelset found for {name}. Current labelset names include {[lab.name for lab in self.labelsets]}."
    )

def datacolumn_by_name(self, name: str) -> DataColumn:
    """
    Return the datacolumn of this dataset whose name equals ``name``.

    Args:
        name (str): Name of the datacolumn to look up.

    Returns:
        DataColumn: The first entry of ``self.datacolumns`` with a matching name.

    Raises:
        IndicoInputError: If no datacolumn has that name; the message lists
            the datacolumn names that do exist.
    """
    # Single pass: return on the first match instead of scanning once for
    # membership and a second time to fetch the element.
    for datacolumn in self.datacolumns:
        if datacolumn.name == name:
            return datacolumn
    raise IndicoInputError(
        f"No datacolumn found for {name}. Current datacolumn names include {[datacol.name for datacol in self.datacolumns]}."
    )


class TableReadOrder(Enum):
    """
    Enum selecting whether a table is read by row or by column.
    """

    ROW = 0
    COLUMN = 1


class OcrEngine(Enum):
    """
    Enum representing available OCR engines.

    The dataset creation request in indico/queries/datasets.py sends the
    member's ``name`` (e.g. ``"READAPI"``) as ``ocrEngine``, not its integer
    value.
    """

    OMNIPAGE = 0
    READAPI = 1
    READAPI_V2 = 2
    READAPI_TABLES_V1 = 3


class OmnipageOcrOptionsInput(BaseType):
"""
Omnipage specific OCR options for dataset creation.
Expand All @@ -95,6 +102,7 @@ class OmnipageOcrOptionsInput(BaseType):
table_read_order(TableReadOrder): Read table by row or column.

"""

auto_rotate: bool
single_column: bool
upscale_images: bool
Expand All @@ -105,6 +113,7 @@ class OmnipageOcrOptionsInput(BaseType):
native_pdf: bool
table_read_order: TableReadOrder


class ReadApiOcrOptionsInput(BaseType):
"""
Read API OCR options.
Expand All @@ -115,20 +124,44 @@ class ReadApiOcrOptionsInput(BaseType):
upscale_images(bool): Scale up low resolution images.
languages(List[str]): List of languages to use.
"""

auto_rotate: bool
single_column: bool
upscale_images: bool
languages: List[str]


class OcrInputLanguage(BaseType):
    """
    An OCR input language, described by a name and a code.

    Args:
        name(str): Name of the language.
        code(str): Code identifying the language.
            NOTE(review): the code format is not visible here - confirm against the API.
    """

    name: str
    code: str

class OcrOptionsInput():

class IncludeSections(BaseType):
    """
    Per-section flags for the parts of an email to include after parsing.

    Every field is optional.

    Args:
        header(bool): Flag for the email header.
        body(bool): Flag for the email body.
        attachments(bool): Flag for the email attachments.
    """

    header: Optional[bool]
    body: Optional[bool]
    attachments: Optional[bool]


class EmailOptions(BaseType):
    """
    Email handling options that can be supplied when creating a dataset.

    Every field is optional.

    Args:
        include_sections(IncludeSections): Sections of the email to include after parsing (header, body, attachments).
        unpack(bool): Unpack an email and treat it as a multi-file Submission.
        preserve_body_whitespace(bool): Preserve whitespace in the body of the email.
    """

    include_sections: Optional[IncludeSections]
    unpack: Optional[bool]
    preserve_body_whitespace: Optional[bool]


class OcrOptionsInput:
    """
    Input options for OCR engine.

    Args:
        ocr_engine(OcrEngine): OCR engine to use.
        omnipage_options(OmnipageOcrOptionsInput): Omnipage specific OCR options.
        readapi_options(ReadApiOcrOptionsInput): Read API OCR options.
    """

    ocr_engine: OcrEngine
    omnipage_options: OmnipageOcrOptionsInput
    readapi_options: ReadApiOcrOptionsInput

23 changes: 23 additions & 0 deletions tests/integration/queries/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -425,3 +425,26 @@ def test_bad_csv_create_dataset(indico):
assert dataset.status == "CREATING"
dataset = client.call(GetDatasetFileStatus(id=dataset.id))
assert all([f.status == "FAILED" for f in dataset.files])


@pytest.mark.ocr("readapi")
def test_create_with_email_options_readapi(indico):
    """
    Create an empty READAPI dataset with email options and check the result.

    Passes Read API OCR options together with email options (all sections
    included, unpacking enabled) to ``CreateEmptyDataset`` and verifies that
    a dataset with an id comes back.
    """
    client = IndicoClient()
    readapi_config: ReadApiOcrOptionsInput = {
        "auto_rotate": True,
        "single_column": False,
        "upscale_images": True,
        "languages": ["AUTO"],
    }
    email_config = {
        "include_sections": {"header": True, "body": True, "attachments": True},
        "unpack": True,
    }
    dataset = client.call(
        CreateEmptyDataset(
            name=f"dataset-{int(time.time())}",
            ocr_engine=OcrEngine.READAPI,
            readapi_ocr_options=readapi_config,
            email_options=email_config,
        )
    )

    # Without an assertion the test only proves the call did not raise.
    # NOTE(review): assumes the call returns the created dataset exposing
    # ``id``, as the dataset creation flow relies on - confirm.
    assert dataset is not None
    assert dataset.id is not None