Skip to content

Commit

Permalink
wip: header parsing
Browse files Browse the repository at this point in the history
  • Loading branch information
msto committed Mar 23, 2024
1 parent 96e21fa commit c06bd0b
Show file tree
Hide file tree
Showing 2 changed files with 94 additions and 6 deletions.
96 changes: 92 additions & 4 deletions dataclass_io/reader.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,24 @@
from csv import DictReader
from dataclasses import dataclass
from dataclasses import fields
from dataclasses import is_dataclass
from io import TextIOWrapper
from pathlib import Path
from types import TracebackType
from typing import IO
from typing import Any
from typing import ClassVar
from typing import Optional
from typing import Protocol
from typing import TextIO
from typing import Type
from typing import TypeAlias

from dataclass_io.lib import assert_readable_dataclass
from dataclass_io.lib import assert_readable_file

ReadableFileHandle: TypeAlias = TextIOWrapper | IO | TextIO


class DataclassInstance(Protocol):
"""
Expand All @@ -22,14 +30,34 @@ class DataclassInstance(Protocol):
https://stackoverflow.com/a/55240861
"""

__dataclass_fields__: ClassVar[dict[str, Any]]


@dataclass(frozen=True, kw_only=True)
class FileHeader:
"""
Header of a file.
A file's header contains an optional preface, consisting of lines prefixed by a comment
character and/or empty lines, and a required row of fieldnames before the data rows begin.
Attributes:
preface: A list of any lines preceding the fieldnames.
fieldnames: The field names specified in the final line of the header.
"""

preface: list[str]
fieldnames: list[str]


class DataclassReader:
def __init__(
self,
path: Path,
dataclass_type: type,
delimiter: str = "\t",
header_comment_char: str = "#",
**kwds: Any,
) -> None:
"""
Expand All @@ -56,10 +84,30 @@ def __init__(
if not is_dataclass(dataclass_type):
raise TypeError(f"The provided type must be a dataclass: {dataclass_type.__name__}")

self._dataclass_type = dataclass_type
self.dataclass_type = dataclass_type
self.delimiter = delimiter
self.header_comment_char = header_comment_char

self._fin = path.open("r")
self._reader = DictReader(self._fin, **kwds)

self._header = self._get_header(self._fin)
if self._header is None:
raise ValueError(f"Could not find a header in the provided file: {path}")

if self._header.fieldnames != [f.name for f in fields(dataclass_type)]:
raise ValueError(
"The provided file does not have the same field names as the provided dataclass:\n"
f"\tDataclass: {dataclass_type.__name__}\n"
f"\tFile: {path}\n"
f"\tDataclass fields: {dataclass_type.__name__}\n"
f"\tFile: {path}\n"
)

self._reader = DictReader(
self._fin,
fieldnames=self._header.fieldnames,
delimiter=self.delimiter,
)

def __enter__(self) -> "DataclassReader":
return self
Expand Down Expand Up @@ -88,8 +136,48 @@ def _row_to_dataclass(self, row: dict[str, str]) -> DataclassInstance:
coerced_values: dict[str, Any] = {}

# Coerce each value in the row to the type of the corresponding field
for field in fields(self._dataclass_type):
for field in fields(self.dataclass_type):
value = row[field.name]
coerced_values[field.name] = field.type(value)

return self._dataclass_type(**coerced_values)
return self.dataclass_type(**coerced_values)

def _get_header(
    self,
    reader: ReadableFileHandle,
) -> Optional[FileHeader]:
    """
    Read the header from an open file.

    The first row after any commented or empty lines will be used as the fieldnames.
    Lines preceding the fieldnames will be returned in the `preface`.

    Comment lines are recognized by `self.header_comment_char`, and the fieldnames row is split
    on `self.delimiter`. The handle is consumed up to and including the fieldnames row.

    NB: This function returns `Optional` instead of raising an error because the name of the
    source file is not in scope, making it difficult to provide a helpful error message. It is
    the responsibility of the caller to raise an error if the file is empty.

    See original proof-of-concept here: https://github.com/fulcrumgenomics/fgpyo/pull/103

    Args:
        reader: An open, readable file handle.

    Returns:
        A `FileHeader` containing the field names and any preceding lines.
        None if the file was empty or contained only comments or empty lines.
    """

    preface: list[str] = []

    for line in reader:
        if line.startswith(self.header_comment_char) or line.strip() == "":
            preface.append(line.strip())
            continue

        # Remove only the line terminator. A bare `strip()` would also eat leading/trailing
        # delimiter characters (e.g. tabs), silently dropping empty first/last field names.
        fieldnames = line.rstrip("\r\n").split(self.delimiter)
        return FileHeader(preface=preface, fieldnames=fieldnames)

    # The handle was exhausted without encountering a fieldnames row.
    return None
4 changes: 2 additions & 2 deletions tests/test_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,10 @@ def test_reader(tmp_path: Path) -> None:
fpath = tmp_path / "test.txt"

with open(fpath, "w") as f:
f.write("foo\tbar\n")
f.write("abc\t1\n")

dictreader_kwds = {"fieldnames": ["foo", "bar"], "delimiter": "\t"}
with DataclassReader(path=fpath, dataclass_type=FakeDataclass, **dictreader_kwds) as reader:
with DataclassReader(path=fpath, dataclass_type=FakeDataclass) as reader:
rows = [row for row in reader]

assert rows[0] == FakeDataclass(foo="abc", bar=1)

0 comments on commit c06bd0b

Please sign in to comment.