diff --git a/oscar/breeding_scheme.py b/oscar/breeding_scheme.py index 8b107ec..060266e 100644 --- a/oscar/breeding_scheme.py +++ b/oscar/breeding_scheme.py @@ -1,10 +1,11 @@ import itertools -from enum import Enum +from enum import IntEnum +from typing import Self import numpy as np -class Genotype(Enum): +class Genotype(IntEnum): """Genotype status: homozygous (HOM), heterozygous (HET) or WT (wildtype). Each animal will have two copies (alleles) of a particular gene - each @@ -16,6 +17,30 @@ class Genotype(Enum): HET = 1 HOM = 2 + @classmethod + def from_string(cls, genotype_str: str) -> tuple[Self, ...]: + """Create a tuple of Genotype from a string representation. + + E.g. wt_het_hom -> (Genotype.WT, Genotype.HET, Genotype.HOM) + + Parameters + ---------- + genotype_str : str + String representing 1 or multiple genotypes. Each should be + wt, het or hom separated by an underscore. + + Returns + ------- + tuple[Self, ...] + Converted tuple of genotypes + """ + genotype_strings = genotype_str.split("_") + genotypes = [ + cls[genotype_string.upper()] + for genotype_string in genotype_strings + ] + return tuple(genotypes) + class BreedingScheme: """ @@ -25,9 +50,32 @@ class BreedingScheme: def __init__( self, - parent_1_genotype: tuple[Genotype, ...], - parent_2_genotype: tuple[Genotype, ...], + parent_1_genotype: tuple[Genotype, ...] | str, + parent_2_genotype: tuple[Genotype, ...] | str, ): + """Create a breeding scheme with two parent genotypes. + + Parameters + ---------- + parent_1_genotype : tuple[Genotype, ...] | str + Genotype of parent 1 either as a tuple of Genotypes or as a + string representation like het_hom_het + parent_2_genotype : tuple[Genotype, ...] | str + Genotype of parent 2 either as a tuple of Genotypes or as a + string representation like het_hom_het + + Raises + ------ + ValueError + If the parent genotypes don't have the same length + """ + + if isinstance(parent_1_genotype, str): + parent_1_genotype = Genotype.from_string(parent_1_genotype) + + if isinstance(parent_2_genotype, str): + parent_2_genotype = Genotype.from_string(parent_2_genotype) + if len(parent_1_genotype) != len(parent_2_genotype): raise ValueError( "Both parents must have a genotype of the same length" @@ -45,6 +93,22 @@ def __eq__(self, other): [other.parent_1_genotype, other.parent_2_genotype] ) + def __hash__(self): + # Hash should be equal if the breeding scheme combines the same + # two genotypes in any order. + genotypes = sorted([self.parent_1_genotype, self.parent_2_genotype]) + return hash(tuple(genotypes)) + + def __repr__(self): + parent_1_str = "_".join( + [genotype.name.lower() for genotype in self.parent_1_genotype] + ) + parent_2_str = "_".join( + [genotype.name.lower() for genotype in self.parent_2_genotype] + ) + + return f"BreedingScheme({parent_1_str}x{parent_2_str})" + def mendelian_ratio(self) -> dict[tuple[Genotype, ...], float]: """Calculate the theoretical mendelian ratio for this breeding scheme. diff --git a/oscar/historical_stats.py b/oscar/historical_stats.py new file mode 100644 index 0000000..4c5ca33 --- /dev/null +++ b/oscar/historical_stats.py @@ -0,0 +1,151 @@ +from dataclasses import dataclass, field + +import pandas as pd + +from oscar.breeding_scheme import ( + BreedingScheme, + Genotype, +) + + +@dataclass +class BreedingSchemeStatistics: + n_breeding_pairs: int = 0 + n_successful_matings: int = 0 + average_litter_size: float = 0 + average_n_litters_per_pair: float = 0 + total_n_offspring: int = 0 + n_offspring_per_genotype: dict[tuple[Genotype, ...], int] = field( + default_factory=dict + ) + proportion_offspring_per_genotype: dict[tuple[Genotype, ...], float] = ( + field(default_factory=dict) + ) + + +@dataclass +class LineStatistics: + total_n_offspring: int = 0 + total_n_offspring_per_genotype: dict[tuple[Genotype, ...], int] = field( + default_factory=dict + ) + + stats_per_breeding_scheme: dict[ + BreedingScheme, BreedingSchemeStatistics + ] = field(default_factory=dict) + + +def calculate_historical_stats_for_line( + standardised_data: pd.DataFrame, line_name: str +) -> LineStatistics: + """Calculate summary statistics for a specific line from standardised + historical data. + + Parameters + ---------- + standardised_data : pd.DataFrame + Standardised historical data e.g. from standardise_pyrat_csv + line_name : str + Name of line + + Returns + ------- + LineStatistics + Summary statistics for the given line + """ + + line_data = standardised_data.loc[ + standardised_data.line_name == line_name, : + ] + if len(line_data) == 0: + raise ValueError(f"No data for {line_name} found") + + breeding_schemes = line_data.apply(_create_breeding_scheme, axis=1) + data_with_schemes = line_data.copy() + data_with_schemes["breeding_scheme"] = breeding_schemes + + line_stats = LineStatistics(total_n_offspring=len(line_data)) + + for breeding_scheme in data_with_schemes["breeding_scheme"].unique(): + breeding_scheme_data = data_with_schemes.loc[ + data_with_schemes.breeding_scheme == breeding_scheme, : + ] + scheme_stats = _historical_stats_for_breeding_scheme( + breeding_scheme_data + ) + line_stats.stats_per_breeding_scheme[breeding_scheme] = scheme_stats + + # Update summary of number of offspring per genotype across entire line + for ( + genotype, + n_offspring, + ) in scheme_stats.n_offspring_per_genotype.items(): + if genotype in line_stats.total_n_offspring_per_genotype: + line_stats.total_n_offspring_per_genotype[genotype] += ( + n_offspring + ) + else: + line_stats.total_n_offspring_per_genotype[genotype] = ( + n_offspring + ) + + return line_stats + + +def _create_breeding_scheme(row: pd.Series) -> BreedingScheme: + return BreedingScheme(row.genotype_father, row.genotype_mother) + + +def _historical_stats_for_breeding_scheme( + scheme_data: pd.DataFrame, +) -> BreedingSchemeStatistics: + """Calculate summary statistics for an individual breeding scheme + (within a specific line). + + Parameters + ---------- + scheme_data : pd.DataFrame + Dataframe of data for a single breeding scheme and line + + Returns + ------- + BreedingSchemeStatistics + Summary statistics for the breeding scheme + """ + stats = BreedingSchemeStatistics() + + # breeding pairs is unique combos of father ID x mother ID + stats.n_breeding_pairs = scheme_data.groupby( + ["ID_father", "ID_mother"] + ).ngroups + + # Successful matings is unique combos of father ID x mother ID x date + # (assuming only one per day) + stats.n_successful_matings = scheme_data.groupby( + ["ID_father", "ID_mother", "date_of_birth"] + ).ngroups + + stats.total_n_offspring = len(scheme_data) + stats.average_litter_size = ( + stats.total_n_offspring / stats.n_successful_matings + ) + stats.average_n_litters_per_pair = ( + stats.n_successful_matings / stats.n_breeding_pairs + ) + + # convert string representation e.g. wt_hom_het to tuple representation + # of genotype: (Genotype.WT, Genotype.HOM, Genotype.HET) + scheme_data["genotype_offspring"] = scheme_data[ + "genotype_offspring" + ].apply(Genotype.from_string) + + # Number and proportion of offspring per genotype + stats.n_offspring_per_genotype = ( + scheme_data.groupby("genotype_offspring").size().to_dict() + ) + + for genotype, n_offspring in stats.n_offspring_per_genotype.items(): + proportion = n_offspring / stats.total_n_offspring + stats.proportion_offspring_per_genotype[genotype] = proportion + + return stats diff --git a/tests/pooch_registry.txt b/tests/pooch_registry.txt index a4860a4..8f0ac66 100644 --- a/tests/pooch_registry.txt +++ b/tests/pooch_registry.txt @@ -1,8 +1,8 @@ -pyrat-data-single-mutation.csv a953acea3569208145689e9344136b0887b7de4b6f7ee7ac84e07d16b758f97f -standardised-data-single-mutation.csv 58e491465593cdacccc3b0de56c872cc77a92969c87e919b2c25e745ebd99143 -pyrat-data-2-mutations.csv 7fb1e2c037358aca4b3d3c37d65481b899b5cb18635ad840293e6545c5952e04 -standardised-data-2-mutations.csv 90f89288a83f9195f9ac87d65ed6b3b3dd9f0bfca803180a9bfd2ba9595681c1 -pyrat-data-3-mutations.csv 6c762f5bbf07c7a2688a90158dffafc79b9e573bd6fd5af21941aa836f98bd5f -standardised-data-3-mutations.csv 7442121aa5e75c1b2f151d4bf1f87e378a6f34226b0e03b474c4db76628823cc +pyrat-data-single-mutation.csv 9fb87b7b865eac7932ec2554156cba59f0eb76cfa8d7442aafe91899a2f3c1dc +standardised-data-single-mutation.csv e6444ea52bf0679747be1a4952319c93bd5965702d1ad7cc71786b259d95fe9c +pyrat-data-2-mutations.csv 8a9a51b905d37039de931cf26269e5d7370fad5114649cb974f388d509d61332 +standardised-data-2-mutations.csv 3b2e41fbdd145bf63dae0330059b2fffbc5432e8eeb4f05b2bcf05078b892f44 +pyrat-data-3-mutations.csv 311369de2fb39e897ee63e17ba5de88beb81d19e4315339427f5e1d5ba454d06 +standardised-data-3-mutations.csv e7b0cba4e901bc0a069db27609e3d231b5c75090281b9616b3c92597beb9f086 pyrat-data-forbidden-genotypes.csv c59c27fedf4332813b312dfb8e242d3e544806b6757f07efd041084b31e5df98 standardised-data-forbidden-genotypes.csv b64f71adc0adae6b6b336184a9f5bf491fb1c1c0900b8458c0c6ac72736a3d6b diff --git a/tests/test_unit/test_historical_stats.py b/tests/test_unit/test_historical_stats.py new file mode 100644 index 0000000..39d301f --- /dev/null +++ b/tests/test_unit/test_historical_stats.py @@ -0,0 +1,359 @@ +import pandas as pd +import pytest + +from oscar.breeding_scheme import BreedingScheme, Genotype +from oscar.historical_stats import ( + BreedingSchemeStatistics, + LineStatistics, + calculate_historical_stats_for_line, +) + + +@pytest.fixture +def expected_stats_single_mutation(): + return LineStatistics( + total_n_offspring=18, + total_n_offspring_per_genotype={ + (Genotype.WT,): 6, + (Genotype.HET,): 10, + (Genotype.HOM,): 2, + }, + stats_per_breeding_scheme={ + BreedingScheme("wt", "het"): BreedingSchemeStatistics( + n_breeding_pairs=2, + n_successful_matings=3, + average_litter_size=pytest.approx(2.666, abs=1e-3), + average_n_litters_per_pair=pytest.approx(1.5, abs=1e-3), + total_n_offspring=8, + n_offspring_per_genotype={ + (Genotype.WT,): 4, + (Genotype.HET,): 4, + }, + proportion_offspring_per_genotype={ + (Genotype.WT,): pytest.approx(0.5, abs=1e-3), + (Genotype.HET,): pytest.approx(0.5, abs=1e-3), + }, + ), + BreedingScheme("wt", "hom"): BreedingSchemeStatistics( + n_breeding_pairs=2, + n_successful_matings=2, + average_litter_size=pytest.approx(2, abs=1e-3), + average_n_litters_per_pair=pytest.approx(1.0, abs=1e-3), + total_n_offspring=4, + n_offspring_per_genotype={(Genotype.HET,): 4}, + proportion_offspring_per_genotype={ + (Genotype.HET,): pytest.approx(1.0, abs=1e-3) + }, + ), + BreedingScheme("het", "het"): BreedingSchemeStatistics( + n_breeding_pairs=2, + n_successful_matings=2, + average_litter_size=pytest.approx(2, abs=1e-3), + average_n_litters_per_pair=pytest.approx(1.0, abs=1e-3), + total_n_offspring=4, + n_offspring_per_genotype={ + (Genotype.WT,): 2, + (Genotype.HET,): 1, + (Genotype.HOM,): 1, + }, + proportion_offspring_per_genotype={ + (Genotype.WT,): pytest.approx(0.5, abs=1e-3), + (Genotype.HET,): pytest.approx(0.25, abs=1e-3), + (Genotype.HOM,): pytest.approx(0.25, abs=1e-3), + }, + ), + BreedingScheme("het", "hom"): BreedingSchemeStatistics( + n_breeding_pairs=2, + n_successful_matings=2, + average_litter_size=pytest.approx(1.0, abs=1e-3), + average_n_litters_per_pair=pytest.approx(1.0, abs=1e-3), + total_n_offspring=2, + n_offspring_per_genotype={ + (Genotype.HET,): 1, + (Genotype.HOM,): 1, + }, + proportion_offspring_per_genotype={ + (Genotype.HET,): pytest.approx(0.5, abs=1e-3), + (Genotype.HOM,): pytest.approx(0.5, abs=1e-3), + }, + ), + }, + ) + + +@pytest.fixture +def expected_stats_2_mutations(): + return LineStatistics( + total_n_offspring=20, + total_n_offspring_per_genotype={ + (Genotype.HOM, Genotype.HOM): 2, + (Genotype.HET, Genotype.HOM): 6, + (Genotype.HOM, Genotype.HET): 4, + (Genotype.HET, Genotype.WT): 2, + (Genotype.WT, Genotype.HET): 4, + (Genotype.HET, Genotype.HET): 2, + }, + stats_per_breeding_scheme={ + BreedingScheme("het_hom", "hom_het"): BreedingSchemeStatistics( + n_breeding_pairs=2, + n_successful_matings=3, + average_litter_size=pytest.approx(2.666, abs=1e-3), + average_n_litters_per_pair=pytest.approx(1.5, abs=1e-3), + total_n_offspring=8, + n_offspring_per_genotype={ + (Genotype.HOM, Genotype.HOM): 2, + (Genotype.HET, Genotype.HOM): 6, + }, + proportion_offspring_per_genotype={ + (Genotype.HOM, Genotype.HOM): pytest.approx( + 0.25, abs=1e-3 + ), + (Genotype.HET, Genotype.HOM): pytest.approx( + 0.75, abs=1e-3 + ), + }, + ), + BreedingScheme("hom_wt", "het_het"): BreedingSchemeStatistics( + n_breeding_pairs=2, + n_successful_matings=2, + average_litter_size=pytest.approx(2.0, abs=1e-3), + average_n_litters_per_pair=pytest.approx(1.0, abs=1e-3), + total_n_offspring=4, + n_offspring_per_genotype={ + ( + Genotype.HOM, + Genotype.HET, + ): 4 + }, + proportion_offspring_per_genotype={ + ( + Genotype.HOM, + Genotype.HET, + ): pytest.approx(1.0, abs=1e-3) + }, + ), + BreedingScheme("het_wt", "wt_het"): BreedingSchemeStatistics( + n_breeding_pairs=2, + n_successful_matings=2, + average_litter_size=pytest.approx(1.5, abs=1e-3), + average_n_litters_per_pair=pytest.approx(1.0, abs=1e-3), + total_n_offspring=3, + n_offspring_per_genotype={ + (Genotype.HET, Genotype.WT): 2, + (Genotype.WT, Genotype.HET): 1, + }, + proportion_offspring_per_genotype={ + (Genotype.HET, Genotype.WT): pytest.approx( + 0.666, abs=1e-3 + ), + (Genotype.WT, Genotype.HET): pytest.approx( + 0.333, abs=1e-3 + ), + }, + ), + BreedingScheme("het_wt", "wt_hom"): BreedingSchemeStatistics( + n_breeding_pairs=2, + n_successful_matings=2, + average_litter_size=pytest.approx(1.0, abs=1e-3), + average_n_litters_per_pair=pytest.approx(1.0, abs=1e-3), + total_n_offspring=2, + n_offspring_per_genotype={ + ( + Genotype.WT, + Genotype.HET, + ): 2, + }, + proportion_offspring_per_genotype={ + (Genotype.WT, Genotype.HET): pytest.approx(1.0, abs=1e-3), + }, + ), + BreedingScheme("hom_wt", "wt_hom"): BreedingSchemeStatistics( + n_breeding_pairs=1, + n_successful_matings=1, + average_litter_size=pytest.approx(3.0, abs=1e-3), + average_n_litters_per_pair=pytest.approx(1.0, abs=1e-3), + total_n_offspring=3, + n_offspring_per_genotype={ + ( + Genotype.HET, + Genotype.HET, + ): 2, + (Genotype.WT, Genotype.HET): 1, + }, + proportion_offspring_per_genotype={ + (Genotype.HET, Genotype.HET): pytest.approx( + 0.666, abs=1e-3 + ), + (Genotype.WT, Genotype.HET): pytest.approx( + 0.333, abs=1e-3 + ), + }, + ), + }, + ) + + +@pytest.fixture +def expected_stats_3_mutations(): + return LineStatistics( + total_n_offspring=20, + total_n_offspring_per_genotype={ + (Genotype.HET, Genotype.WT, Genotype.HOM): 8, + (Genotype.WT, Genotype.HET, Genotype.HOM): 2, + (Genotype.HOM, Genotype.WT, Genotype.HET): 2, + (Genotype.WT, Genotype.WT, Genotype.HET): 1, + (Genotype.HET, Genotype.WT, Genotype.WT): 1, + (Genotype.WT, Genotype.HET, Genotype.WT): 3, + (Genotype.HET, Genotype.HET, Genotype.WT): 2, + (Genotype.WT, Genotype.HET, Genotype.HET): 1, + }, + stats_per_breeding_scheme={ + BreedingScheme( + "wt_wt_het", "het_het_het" + ): BreedingSchemeStatistics( + n_breeding_pairs=2, + n_successful_matings=3, + average_litter_size=pytest.approx(2.666, abs=1e-3), + average_n_litters_per_pair=pytest.approx(1.5, abs=1e-3), + total_n_offspring=8, + n_offspring_per_genotype={ + (Genotype.HET, Genotype.WT, Genotype.HOM): 4, + (Genotype.WT, Genotype.HET, Genotype.HOM): 2, + (Genotype.HOM, Genotype.WT, Genotype.HET): 2, + }, + proportion_offspring_per_genotype={ + (Genotype.HET, Genotype.WT, Genotype.HOM): pytest.approx( + 0.5, abs=1e-3 + ), + (Genotype.WT, Genotype.HET, Genotype.HOM): pytest.approx( + 0.25, abs=1e-3 + ), + (Genotype.HOM, Genotype.WT, Genotype.HET): pytest.approx( + 0.25, abs=1e-3 + ), + }, + ), + BreedingScheme( + "wt_het_het", "het_het_hom" + ): BreedingSchemeStatistics( + n_breeding_pairs=2, + n_successful_matings=2, + average_litter_size=pytest.approx(2.0, abs=1e-3), + average_n_litters_per_pair=pytest.approx(1.0, abs=1e-3), + total_n_offspring=4, + n_offspring_per_genotype={ + (Genotype.HET, Genotype.WT, Genotype.HOM): 4 + }, + proportion_offspring_per_genotype={ + (Genotype.HET, Genotype.WT, Genotype.HOM): pytest.approx( + 1.0, abs=1e-3 + ) + }, + ), + BreedingScheme( + "het_wt_wt", "wt_het_hom" + ): BreedingSchemeStatistics( + n_breeding_pairs=2, + n_successful_matings=2, + average_litter_size=pytest.approx(1.5, abs=1e-3), + average_n_litters_per_pair=pytest.approx(1.0, abs=1e-3), + total_n_offspring=3, + n_offspring_per_genotype={ + (Genotype.WT, Genotype.WT, Genotype.HET): 1, + (Genotype.HET, Genotype.WT, Genotype.WT): 1, + (Genotype.WT, Genotype.HET, Genotype.WT): 1, + }, + proportion_offspring_per_genotype={ + (Genotype.WT, Genotype.WT, Genotype.HET): pytest.approx( + 0.333, abs=1e-3 + ), + (Genotype.HET, Genotype.WT, Genotype.WT): pytest.approx( + 0.333, abs=1e-3 + ), + (Genotype.WT, Genotype.HET, Genotype.WT): pytest.approx( + 0.333, abs=1e-3 + ), + }, + ), + BreedingScheme( + "het_wt_wt", "wt_hom_het" + ): BreedingSchemeStatistics( + n_breeding_pairs=2, + n_successful_matings=2, + average_litter_size=pytest.approx(1.0, abs=1e-3), + average_n_litters_per_pair=pytest.approx(1.0, abs=1e-3), + total_n_offspring=2, + n_offspring_per_genotype={ + (Genotype.WT, Genotype.HET, Genotype.WT): 2, + }, + proportion_offspring_per_genotype={ + (Genotype.WT, Genotype.HET, Genotype.WT): pytest.approx( + 1.0, abs=1e-3 + ), + }, + ), + BreedingScheme( + "wt_hom_het", "hom_wt_wt" + ): BreedingSchemeStatistics( + n_breeding_pairs=1, + n_successful_matings=1, + average_litter_size=pytest.approx(3.0, abs=1e-3), + average_n_litters_per_pair=pytest.approx(1.0, abs=1e-3), + total_n_offspring=3, + n_offspring_per_genotype={ + (Genotype.HET, Genotype.HET, Genotype.WT): 2, + (Genotype.WT, Genotype.HET, Genotype.HET): 1, + }, + proportion_offspring_per_genotype={ + (Genotype.HET, Genotype.HET, Genotype.WT): pytest.approx( + 0.666, abs=1e-3 + ), + (Genotype.WT, Genotype.HET, Genotype.HET): pytest.approx( + 0.333, abs=1e-3 + ), + }, + ), + }, + ) + + +@pytest.mark.parametrize( + "standardised_csv_path, line_name, expected_stats", + [ + pytest.param( + "standardised_single_mutation_csv_path", + "Line-A", + "expected_stats_single_mutation", + id="1 mutation", + ), + pytest.param( + "standardised_2_mutations_csv_path", + "Line-AB", + "expected_stats_2_mutations", + id="2 mutations", + ), + pytest.param( + "standardised_3_mutations_csv_path", + "Line-ABC", + "expected_stats_3_mutations", + id="3 mutations", + ), + ], +) +def test_calculate_historical_stats_for_line( + standardised_csv_path, line_name, expected_stats, request +): + """ + Test calculation of summary historical statistics for lines with 1, 2 or + 3 mutations. + """ + + standardised_csv = pd.read_csv( + request.getfixturevalue(standardised_csv_path) + ) + expected_stats = request.getfixturevalue(expected_stats) + + line_stats = calculate_historical_stats_for_line( + standardised_csv, line_name + ) + assert line_stats == expected_stats