wip: header parsing

msto · Mar 23, 2024 · c06bd0b · c06bd0b
1 parent 96e21fa
commit c06bd0b
Show file tree

Hide file tree

Showing 2 changed files with 94 additions and 6 deletions.
diff --git a/dataclass_io/reader.py b/dataclass_io/reader.py
@@ -1,16 +1,24 @@
 from csv import DictReader
+from dataclasses import dataclass
 from dataclasses import fields
 from dataclasses import is_dataclass
+from io import TextIOWrapper
 from pathlib import Path
 from types import TracebackType
+from typing import IO
 from typing import Any
 from typing import ClassVar
+from typing import Optional
 from typing import Protocol
+from typing import TextIO
 from typing import Type
+from typing import TypeAlias
 
 from dataclass_io.lib import assert_readable_dataclass
 from dataclass_io.lib import assert_readable_file
 
+ReadableFileHandle: TypeAlias = TextIOWrapper | IO | TextIO
+
 
 class DataclassInstance(Protocol):
     """
@@ -22,14 +30,34 @@ class DataclassInstance(Protocol):
 
     https://stackoverflow.com/a/55240861
     """
+
     __dataclass_fields__: ClassVar[dict[str, Any]]
 
 
+@dataclass(frozen=True, kw_only=True)
+class FileHeader:
+    """
+    Header of a file.
+
+    A file's header contains an optional preface, consisting of lines prefixed by a comment
+    character and/or empty lines, and a required row of fieldnames before the data rows begin.
+
+    Attributes:
+        preface: A list of any lines preceding the fieldnames.
+        fieldnames: The field names specified in the final line of the header.
+    """
+
+    preface: list[str]
+    fieldnames: list[str]
+
+
 class DataclassReader:
     def __init__(
         self,
         path: Path,
         dataclass_type: type,
+        delimiter: str = "\t",
+        header_comment_char: str = "#",
         **kwds: Any,
     ) -> None:
         """
@@ -56,10 +84,30 @@ def __init__(
         if not is_dataclass(dataclass_type):
             raise TypeError(f"The provided type must be a dataclass: {dataclass_type.__name__}")
 
-        self._dataclass_type = dataclass_type
+        self.dataclass_type = dataclass_type
+        self.delimiter = delimiter
+        self.header_comment_char = header_comment_char
 
         self._fin = path.open("r")
-        self._reader = DictReader(self._fin, **kwds)
+
+        self._header = self._get_header(self._fin)
+        if self._header is None:
+            raise ValueError(f"Could not find a header in the provided file: {path}")
+
+        if self._header.fieldnames != [f.name for f in fields(dataclass_type)]:
+            raise ValueError(
+                "The provided file does not have the same field names as the provided dataclass:\n"
+                f"\tDataclass: {dataclass_type.__name__}\n"
+                f"\tFile: {path}\n"
+                f"\tDataclass fields: {dataclass_type.__name__}\n"
+                f"\tFile: {path}\n"
+            )
+
+        self._reader = DictReader(
+            self._fin,
+            fieldnames=self._header.fieldnames,
+            delimiter=self.delimiter,
+        )
 
     def __enter__(self) -> "DataclassReader":
         return self
@@ -88,8 +136,48 @@ def _row_to_dataclass(self, row: dict[str, str]) -> DataclassInstance:
         coerced_values: dict[str, Any] = {}
 
         # Coerce each value in the row to the type of the corresponding field
-        for field in fields(self._dataclass_type):
+        for field in fields(self.dataclass_type):
             value = row[field.name]
             coerced_values[field.name] = field.type(value)
 
-        return self._dataclass_type(**coerced_values)
+        return self.dataclass_type(**coerced_values)
+
+    def _get_header(
+        self,
+        reader: ReadableFileHandle,
+    ) -> Optional[FileHeader]:
+        """
+        Read the header from an open file.
+
+        The first row after any commented or empty lines will be used as the fieldnames.
+
+        Lines preceding the fieldnames will be returned in the `preface`.
+
+        NB: This function returns `Optional` instead of raising an error because the name of the
+        source file is not in scope, making it difficult to provide a helpful error message. It is
+        the responsibility of the caller to raise an error if no header is found, i.e. if the file
+        is empty or contains only comments or empty lines.
+
+        See original proof-of-concept here: https://github.com/fulcrumgenomics/fgpyo/pull/103
+
+        The character which indicates the start of a comment line is taken from
+        `self.header_comment_char`, and the fieldnames row is split on `self.delimiter`.
+
+        Args:
+            reader: An open, readable file handle.
+
+        Returns:
+            A `FileHeader` containing the field names and any preceding lines.
+            None if the file was empty or contained only comments or empty lines.
+        """
+
+        preface: list[str] = []
+
+        for line in reader:
+            if line.startswith(self.header_comment_char) or line.strip() == "":
+                preface.append(line.strip())
+            else:
+                break
+        else:
+            return None
+
+        fieldnames = line.strip().split(self.delimiter)
+
+        return FileHeader(preface=preface, fieldnames=fieldnames)
diff --git a/tests/test_reader.py b/tests/test_reader.py
@@ -14,10 +14,10 @@ def test_reader(tmp_path: Path) -> None:
     fpath = tmp_path / "test.txt"
 
     with open(fpath, "w") as f:
+        f.write("foo\tbar\n")
         f.write("abc\t1\n")
 
-    dictreader_kwds = {"fieldnames": ["foo", "bar"], "delimiter": "\t"}
-    with DataclassReader(path=fpath, dataclass_type=FakeDataclass, **dictreader_kwds) as reader:
+    with DataclassReader(path=fpath, dataclass_type=FakeDataclass) as reader:
         rows = [row for row in reader]
 
     assert rows[0] == FakeDataclass(foo="abc", bar=1)