1- from collections import defaultdict
2- from collections .abc import Iterator
1+ import logging
2+ from collections .abc import Iterable
33from enum import StrEnum
44from functools import cache
55from pathlib import Path
66
7- import pandas as pd
8- from lir .data .data_strategies import DataStrategy
9- from lir .data .models import FeatureData
7+ from lir .data .io import search_path
8+ from lir .data .models import DataStrategy , FeatureData
9+ from lir .datasets .feature_data_csv import ExtraField , FeatureDataCsvParser
10+
11+ LOG = logging .getLogger (__name__ )
1012
1113
1214class TestTrainSplit (StrEnum ):
@@ -15,16 +17,29 @@ class TestTrainSplit(StrEnum):
1517 TEST = "v"
1618
1719
18- class ScratchData (DataStrategy ):
19- def __init__ (self , input_file_path : Path ):
# Names of the CSV columns that mark each row's fold membership, one column per
# cross-validation fold; the cell values are TestTrainSplit markers.
SPLIT_COLUMNS = ["split1", "split2", "split3"]
21+
22+
class ScratchCsvReader(FeatureDataCsvParser):
    def __init__(self, input_file_path: Path | str):
        """Read and represent Scratch specific input data, as corresponding instances.

        The data might include n-fold cross validation splits, where each fold has a train/test split.
        This class provides access to iterate over the available folds and the corresponding train and test splits.
        """
        super().__init__(
            source_id_column=["weapon1", "weapon2"],
            label_column="hypothesis",
            extra_fields=[
                ExtraField("split", SPLIT_COLUMNS, str),
            ],
            message_prefix=f"{input_file_path}: ",
        )

        self.file_path = Path(input_file_path)
        # Per-instance memo for `get_instances`. NOTE: `functools.cache` on an
        # instance method would key the cache on `self` and keep every reader
        # (and its parsed data) alive for the process lifetime (ruff B019).
        self._instances: FeatureData | None = None

    def get_instances(self) -> FeatureData:
        """Read K-fold cross validation CSV input data to a list of K corresponding subsets of test/train folds.

        In the CSV file, subsets of the data are indicated by the "split<N>" column. For example, 3-fold cross
        validation data has the columns "split1", "split2" and "split3" (see SPLIT_COLUMNS), each marking the
        train/test membership of every row for that fold.

        The remaining columns are treated as features. This means that the pipeline in which this data is used
        should filter out any non-relevant feature columns before training or evaluating a model.

        The file is read and parsed only once; subsequent calls return the memoized result.
        """
        if self._instances is None:
            path = search_path(self.file_path)
            # Lazy %-style args: the message is only formatted when DEBUG is enabled.
            LOG.debug("parsing CSV file: %s as %s", self.file_path, path)
            with open(path) as f:
                self._instances = self._parse_file(f)
        return self._instances
5058
51- # Feature columns are all columns that are not the expected columns
52- feature_columns = [c for c in df .columns if c not in expected_columns and c not in fold_column_names ]
5359
54- label_column = ["hypothesis" ]
55-
56- # Group the folds by the column name, i.e. 'split1', 'split2', etc.
57- df_with_subsets = df .melt (
58- id_vars = label_column + feature_columns ,
59- value_vars = fold_column_names ,
60- var_name = "subset" ,
61- value_name = "test_train_split" ,
62- )
63-
64- subsets = []
65-
66- # Loop over each subset
67- for _ , folds in df_with_subsets .groupby ("subset" ):
68- # Filter out the data marked as "not used"
69- test_train_folds = folds [folds .test_train_split != TestTrainSplit .NOT_USED ]
70-
71- # Loop over 'train' / 'test' folds for the current subset
72- subset_folds = defaultdict ()
73-
74- for test_or_train_indicator , raw_data in test_train_folds .groupby ("test_train_split" ):
75- # The `test_or_train_indicator` refers to the role of this data
76- # in the current fold; belonging to either the 'test' or 'train' split.
77- features = raw_data [feature_columns ].to_numpy (dtype = float ).reshape (- 1 , len (feature_columns ))
78- labels = raw_data [label_column ].to_numpy (dtype = int ).flatten ()
79-
80- subset_folds [test_or_train_indicator ] = FeatureData (features = features , labels = labels )
81-
82- subsets .append ((subset_folds [TestTrainSplit .TRAIN ], subset_folds [TestTrainSplit .TEST ]))
83-
84- return subsets
85-
86- @cache
87- def _get_instances (self ):
88- """Read instances from file only once."""
89- return self ._read_instances_from_file ()
class PredefinedCrossValidation(DataStrategy):
    """Return a series of train/test sets for a predefined cross-validation setup."""

    def apply(self, instances: FeatureData) -> Iterable[tuple[FeatureData, FeatureData]]:
        """Return a series of train/test sets for a predefined cross-validation setup."""
        for fold_index, _column in enumerate(SPLIT_COLUMNS):
            # Markers for this fold: one TestTrainSplit value per instance.
            fold_markers = instances.split[:, fold_index]  # type: ignore
            yield (
                instances[fold_markers == TestTrainSplit.TRAIN.value],  # type: ignore
                instances[fold_markers == TestTrainSplit.TEST.value],  # type: ignore
            )
0 commit comments