neuroinformatics-unit · adamltyson · Mar 10, 2026 · Mar 6, 2026 · Mar 6, 2026 · Mar 6, 2026
diff --git a/oscar/breeding_scheme.py b/oscar/breeding_scheme.py
@@ -1,10 +1,11 @@
 import itertools
-from enum import Enum
+from enum import IntEnum
+from typing import Self
 
 import numpy as np
 
 
-class Genotype(Enum):
+class Genotype(IntEnum):
     """Genotype status: homozygous (HOM), heterozygous (HET) or WT (wildtype).
 
     Each animal will have two copies (alleles) of a particular gene - each
@@ -16,6 +17,30 @@ class Genotype(Enum):
     HET = 1
     HOM = 2
 
+    @classmethod
+    def from_string(cls, genotype_str: str) -> tuple[Self, ...]:
+        """Create a tuple of Genotype from a string representation.
+
+        E.g. wt_het_hom -> (Genotype.WT, Genotype.HET, Genotype.HOM)
+
+        Parameters
+        ----------
+        genotype_str : str
+            String representing 1 or multiple genotypes. Each should be
+            wt, het or hom separated by an underscore.
+
+        Returns
+        -------
+        tuple[Self, ...]
+            Converted tuple of genotypes
+        """
+        genotype_strings = genotype_str.split("_")
+        genotypes = [
+            cls[genotype_string.upper()]
+            for genotype_string in genotype_strings
+        ]
+        return tuple(genotypes)
+
 
 class BreedingScheme:
     """
@@ -25,9 +50,32 @@ class BreedingScheme:
 
     def __init__(
         self,
-        parent_1_genotype: tuple[Genotype, ...],
-        parent_2_genotype: tuple[Genotype, ...],
+        parent_1_genotype: tuple[Genotype, ...] | str,
+        parent_2_genotype: tuple[Genotype, ...] | str,
     ):
+        """Create a breeding scheme with two parent genotypes.
+
+        Parameters
+        ----------
+        parent_1_genotype : tuple[Genotype, ...] | str
+            Genotype of parent 1 either as a tuple of Genotypes or as a
+            string representation like het_hom_het
+        parent_2_genotype : tuple[Genotype, ...] | str
+            Genotype of parent 2 either as a tuple of Genotypes or as a
+            string representation like het_hom_het
+
+        Raises
+        ------
+        ValueError
+            If the parent genotypes don't have the same length
+        """
+
+        if isinstance(parent_1_genotype, str):
+            parent_1_genotype = Genotype.from_string(parent_1_genotype)
+
+        if isinstance(parent_2_genotype, str):
+            parent_2_genotype = Genotype.from_string(parent_2_genotype)
+
         if len(parent_1_genotype) != len(parent_2_genotype):
             raise ValueError(
                 "Both parents must have a genotype of the same length"
@@ -45,6 +93,22 @@ def __eq__(self, other):
             [other.parent_1_genotype, other.parent_2_genotype]
         )
 
+    def __hash__(self):
+        # Hash should be equal if the breeding scheme combines the same
+        # two genotypes in any order.
+        genotypes = sorted([self.parent_1_genotype, self.parent_2_genotype])
+        return hash(tuple(genotypes))
+
+    def __repr__(self):
+        parent_1_str = "_".join(
+            [genotype.name.lower() for genotype in self.parent_1_genotype]
+        )
+        parent_2_str = "_".join(
+            [genotype.name.lower() for genotype in self.parent_2_genotype]
+        )
+
+        return f"BreedingScheme({parent_1_str}x{parent_2_str})"
+
     def mendelian_ratio(self) -> dict[tuple[Genotype, ...], float]:
         """Calculate the theoretical mendelian ratio for this breeding scheme.
 

diff --git a/oscar/historical_stats.py b/oscar/historical_stats.py
@@ -0,0 +1,151 @@
+from dataclasses import dataclass, field
+
+import pandas as pd
+
+from oscar.breeding_scheme import (
+    BreedingScheme,
+    Genotype,
+)
+
+
+@dataclass
+class BreedingSchemeStatistics:
+    n_breeding_pairs: int = 0
+    n_successful_matings: int = 0
+    average_litter_size: float = 0
+    average_n_litters_per_pair: float = 0
+    total_n_offspring: int = 0
+    n_offspring_per_genotype: dict[tuple[Genotype, ...], int] = field(
+        default_factory=dict
+    )
+    proportion_offspring_per_genotype: dict[tuple[Genotype, ...], float] = (
+        field(default_factory=dict)
+    )
+
+
+@dataclass
+class LineStatistics:
+    total_n_offspring: int = 0
+    total_n_offspring_per_genotype: dict[tuple[Genotype, ...], int] = field(
+        default_factory=dict
+    )
+
+    stats_per_breeding_scheme: dict[
+        BreedingScheme, BreedingSchemeStatistics
+    ] = field(default_factory=dict)
+
+
+def calculate_historical_stats_for_line(
+    standardised_data: pd.DataFrame, line_name: str
+) -> LineStatistics:
+    """Calculate summary statistics for a specific line from standardised
+    historical data.
+
+    Parameters
+    ----------
+    standardised_data : pd.DataFrame
+        Standardised historical data e.g. from standardise_pyrat_csv
+    line_name : str
+        Name of line
+
+    Returns
+    -------
+    LineStatistics
+        Summary statistics for the given line
+    """
+
+    line_data = standardised_data.loc[
+        standardised_data.line_name == line_name, :
+    ]
+    if len(line_data) == 0:
+        raise ValueError(f"No data for {line_name} found")
+
+    breeding_schemes = line_data.apply(_create_breeding_scheme, axis=1)
+    data_with_schemes = line_data.copy()
+    data_with_schemes["breeding_scheme"] = breeding_schemes
+
+    line_stats = LineStatistics(total_n_offspring=len(line_data))
+
+    for breeding_scheme in data_with_schemes["breeding_scheme"].unique():
+        breeding_scheme_data = data_with_schemes.loc[
+            data_with_schemes.breeding_scheme == breeding_scheme, :
+        ]
+        scheme_stats = _historical_stats_for_breeding_scheme(
+            breeding_scheme_data
+        )
+        line_stats.stats_per_breeding_scheme[breeding_scheme] = scheme_stats
+
+        # Update summary of number of offspring per genotype across entire line
+        for (
+            genotype,
+            n_offspring,
+        ) in scheme_stats.n_offspring_per_genotype.items():
+            if genotype in line_stats.total_n_offspring_per_genotype:
+                line_stats.total_n_offspring_per_genotype[genotype] += (
+                    n_offspring
+                )
+            else:
+                line_stats.total_n_offspring_per_genotype[genotype] = (
+                    n_offspring
+                )
+
+    return line_stats
+
+
+def _create_breeding_scheme(row: pd.Series) -> BreedingScheme:
+    return BreedingScheme(row.genotype_father, row.genotype_mother)
+
+
+def _historical_stats_for_breeding_scheme(
+    scheme_data: pd.DataFrame,
+) -> BreedingSchemeStatistics:
+    """Calculate summary statistics for an individual breeding scheme
+    (within a specific line).
+
+    Parameters
+    ----------
+    scheme_data : pd.DataFrame
+        Dataframe of data for a single breeding scheme and line
+
+    Returns
+    -------
+    BreedingSchemeStatistics
+        Summary statistics for the breeding scheme
+    """
+    stats = BreedingSchemeStatistics()
+
+    # breeding pairs is unique combos of father ID x mother ID
+    stats.n_breeding_pairs = scheme_data.groupby(
+        ["ID_father", "ID_mother"]
+    ).ngroups
+
+    # Successful matings is unique combos of father ID x mother ID x date
+    # (assuming only one per day)
+    stats.n_successful_matings = scheme_data.groupby(
+        ["ID_father", "ID_mother", "date_of_birth"]
+    ).ngroups
+
+    stats.total_n_offspring = len(scheme_data)
+    stats.average_litter_size = (
+        stats.total_n_offspring / stats.n_successful_matings
+    )
+    stats.average_n_litters_per_pair = (
+        stats.n_successful_matings / stats.n_breeding_pairs
+    )
+
+    # convert string representation e.g. wt_hom_het to tuple representation
+    # of genotype: (Genotype.WT, Genotype.HOM, Genotype.HET)
+    scheme_data["genotype_offspring"] = scheme_data[
+        "genotype_offspring"
+    ].apply(Genotype.from_string)
+
+    # Number and proportion of offspring per genotype
+    stats.n_offspring_per_genotype = (
+        scheme_data.groupby("genotype_offspring").size().to_dict()
+    )
+
+    for genotype, n_offspring in stats.n_offspring_per_genotype.items():
+        proportion = n_offspring / stats.total_n_offspring
+        stats.proportion_offspring_per_genotype[genotype] = proportion
+
+    return stats
diff --git a/tests/pooch_registry.txt b/tests/pooch_registry.txt
@@ -1,8 +1,8 @@
-pyrat-data-single-mutation.csv a953acea3569208145689e9344136b0887b7de4b6f7ee7ac84e07d16b758f97f
-standardised-data-single-mutation.csv 58e491465593cdacccc3b0de56c872cc77a92969c87e919b2c25e745ebd99143
-pyrat-data-2-mutations.csv 7fb1e2c037358aca4b3d3c37d65481b899b5cb18635ad840293e6545c5952e04
-standardised-data-2-mutations.csv 90f89288a83f9195f9ac87d65ed6b3b3dd9f0bfca803180a9bfd2ba9595681c1
-pyrat-data-3-mutations.csv 6c762f5bbf07c7a2688a90158dffafc79b9e573bd6fd5af21941aa836f98bd5f
-standardised-data-3-mutations.csv 7442121aa5e75c1b2f151d4bf1f87e378a6f34226b0e03b474c4db76628823cc
+pyrat-data-single-mutation.csv 9fb87b7b865eac7932ec2554156cba59f0eb76cfa8d7442aafe91899a2f3c1dc
+standardised-data-single-mutation.csv e6444ea52bf0679747be1a4952319c93bd5965702d1ad7cc71786b259d95fe9c
+pyrat-data-2-mutations.csv 8a9a51b905d37039de931cf26269e5d7370fad5114649cb974f388d509d61332
+standardised-data-2-mutations.csv 3b2e41fbdd145bf63dae0330059b2fffbc5432e8eeb4f05b2bcf05078b892f44
+pyrat-data-3-mutations.csv 311369de2fb39e897ee63e17ba5de88beb81d19e4315339427f5e1d5ba454d06
+standardised-data-3-mutations.csv e7b0cba4e901bc0a069db27609e3d231b5c75090281b9616b3c92597beb9f086
 pyrat-data-forbidden-genotypes.csv c59c27fedf4332813b312dfb8e242d3e544806b6757f07efd041084b31e5df98
 standardised-data-forbidden-genotypes.csv b64f71adc0adae6b6b336184a9f5bf491fb1c1c0900b8458c0c6ac72736a3d6b