Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 68 additions & 4 deletions oscar/breeding_scheme.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
import itertools
from enum import Enum
from enum import IntEnum
from typing import Self

import numpy as np


class Genotype(Enum):
class Genotype(IntEnum):
"""Genotype status: homozygous (HOM), heterozygous (HET) or WT (wildtype).

Each animal will have two copies (alleles) of a particular gene - each
Expand All @@ -16,6 +17,30 @@ class Genotype(Enum):
HET = 1
HOM = 2

@classmethod
def from_string(cls, genotype_str: str) -> tuple[Self, ...]:
"""Create a tuple of Genotype from a string representation.

E.g. wt_het_hom -> (Genotype.WT, Genotype.HET, Genotype.HOM)

Parameters
----------
genotype_str : str
String representing 1 or multiple genotypes. Each should be
wt, het or hom separated by an underscore.

Returns
-------
tuple[Self, ...]
Converted tuple of genotypes
"""
genotype_strings = genotype_str.split("_")
genotypes = [
cls[genotype_string.upper()]
for genotype_string in genotype_strings
]
return tuple(genotypes)


class BreedingScheme:
"""
Expand All @@ -25,9 +50,32 @@ class BreedingScheme:

def __init__(
self,
parent_1_genotype: tuple[Genotype, ...],
parent_2_genotype: tuple[Genotype, ...],
parent_1_genotype: tuple[Genotype, ...] | str,
parent_2_genotype: tuple[Genotype, ...] | str,
):
"""Create a breeding scheme with two parent genotypes.

Parameters
----------
parent_1_genotype : tuple[Genotype, ...] | str
Genotype of parent 1 either as a tuple of Genotypes or as a
string representation like het_hom_het
parent_2_genotype : tuple[Genotype, ...] | str
Genotype of parent 2 either as a tuple of Genotypes or as a
string representation like het_hom_het

Raises
------
ValueError
If the parent genotypes don't have the same length
"""

if isinstance(parent_1_genotype, str):
parent_1_genotype = Genotype.from_string(parent_1_genotype)

if isinstance(parent_2_genotype, str):
parent_2_genotype = Genotype.from_string(parent_2_genotype)

if len(parent_1_genotype) != len(parent_2_genotype):
raise ValueError(
"Both parents must have a genotype of the same length"
Expand All @@ -45,6 +93,22 @@ def __eq__(self, other):
[other.parent_1_genotype, other.parent_2_genotype]
)

def __hash__(self):
    """Return a hash that does not depend on the order of the parents.

    The two parent genotypes are sorted before hashing, so a scheme
    combining the same two genotypes hashes equally whichever parent
    is listed first.
    """
    first, second = sorted((self.parent_1_genotype, self.parent_2_genotype))
    return hash((first, second))

def __repr__(self):
    """Return a compact text form such as BreedingScheme(wt_hetxhom_het).

    Each parent is shown as its genotype names in lower case joined by
    underscores; the two parents are separated by an "x".
    """

    def as_text(genotypes):
        # Lower-case member names joined with underscores.
        return "_".join(member.name.lower() for member in genotypes)

    parent_1_str = as_text(self.parent_1_genotype)
    parent_2_str = as_text(self.parent_2_genotype)
    return f"BreedingScheme({parent_1_str}x{parent_2_str})"

def mendelian_ratio(self) -> dict[tuple[Genotype, ...], float]:
"""Calculate the theoretical mendelian ratio for this breeding scheme.

Expand Down
151 changes: 151 additions & 0 deletions oscar/historical_stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
from dataclasses import dataclass, field

import pandas as pd

from oscar.breeding_scheme import (
BreedingScheme,
Genotype,
)


@dataclass
class BreedingSchemeStatistics:
    """Summary statistics for a single breeding scheme.

    Every field has a default (zero or an empty dict), so an instance can
    be created empty and filled in afterwards.
    """

    # Number of distinct father ID x mother ID combinations
    n_breeding_pairs: int = 0
    # Number of distinct father ID x mother ID x date of birth combinations
    n_successful_matings: int = 0
    # total_n_offspring divided by n_successful_matings
    average_litter_size: float = 0
    # n_successful_matings divided by n_breeding_pairs
    average_n_litters_per_pair: float = 0
    # Total number of offspring (one row of data per offspring)
    total_n_offspring: int = 0
    # Offspring genotype -> number of offspring with that genotype
    n_offspring_per_genotype: dict[tuple[Genotype, ...], int] = field(
        default_factory=dict
    )
    # Offspring genotype -> that genotype's count divided by
    # total_n_offspring
    proportion_offspring_per_genotype: dict[tuple[Genotype, ...], float] = (
        field(default_factory=dict)
    )


@dataclass
class LineStatistics:
    """Summary statistics for one line, across all its breeding schemes.

    Every field has a default (zero or an empty dict).
    """

    # Total number of offspring for the line (one row of data per offspring)
    total_n_offspring: int = 0
    # Offspring genotype -> number of offspring, summed over all breeding
    # schemes of the line
    total_n_offspring_per_genotype: dict[tuple[Genotype, ...], int] = field(
        default_factory=dict
    )

    # Breeding scheme -> statistics for that scheme within this line
    stats_per_breeding_scheme: dict[
        BreedingScheme, BreedingSchemeStatistics
    ] = field(default_factory=dict)


def calculate_historical_stats_for_line(
    standardised_data: pd.DataFrame, line_name: str
) -> LineStatistics:
    """Summarise the standardised historical data of a single line.

    The rows of the line are grouped by breeding scheme (the combination
    of father and mother genotype); statistics are calculated per scheme
    and the offspring counts per genotype are also summed over the line.

    Parameters
    ----------
    standardised_data : pd.DataFrame
        Standardised historical data e.g. from standardise_pyrat_csv
    line_name : str
        Name of the line to summarise

    Returns
    -------
    LineStatistics
        Summary statistics for the given line

    Raises
    ------
    ValueError
        If standardised_data contains no rows for line_name
    """
    rows_for_line = standardised_data.loc[
        standardised_data.line_name == line_name, :
    ]
    if len(rows_for_line) == 0:
        raise ValueError(f"No data for {line_name} found")

    # Work on a copy so the extra column never touches the caller's data
    schemes_per_row = rows_for_line.apply(_create_breeding_scheme, axis=1)
    annotated = rows_for_line.copy()
    annotated["breeding_scheme"] = schemes_per_row

    result = LineStatistics(total_n_offspring=len(rows_for_line))
    genotype_totals = result.total_n_offspring_per_genotype

    for scheme in annotated["breeding_scheme"].unique():
        rows_for_scheme = annotated.loc[
            annotated.breeding_scheme == scheme, :
        ]
        per_scheme = _historical_stats_for_breeding_scheme(rows_for_scheme)
        result.stats_per_breeding_scheme[scheme] = per_scheme

        # Accumulate offspring counts per genotype across the entire line
        for genotype, count in per_scheme.n_offspring_per_genotype.items():
            genotype_totals[genotype] = (
                genotype_totals.get(genotype, 0) + count
            )

    return result


def _create_breeding_scheme(row: pd.Series) -> BreedingScheme:
    """Build the breeding scheme for one row of data.

    The father's genotype is passed as parent 1 and the mother's genotype
    as parent 2.
    """
    father_genotype = row.genotype_father
    mother_genotype = row.genotype_mother
    return BreedingScheme(father_genotype, mother_genotype)


def _historical_stats_for_breeding_scheme(
    scheme_data: pd.DataFrame,
) -> BreedingSchemeStatistics:
    """Calculate summary statistics for an individual breeding scheme
    (within a specific line).

    The input dataframe is not modified: the genotype strings are
    converted on a copy, so the function can safely be called on a slice
    of a larger dataframe and called repeatedly on the same data.

    Parameters
    ----------
    scheme_data : pd.DataFrame
        Dataframe of data for a single breeding scheme and line, with one
        row per offspring. The columns ID_father, ID_mother, date_of_birth
        and genotype_offspring (string representation, e.g. wt_hom_het)
        are read.

    Returns
    -------
    BreedingSchemeStatistics
        Summary statistics for the breeding scheme. If scheme_data has no
        rows, all statistics keep their default (zero / empty) values.
    """
    stats = BreedingSchemeStatistics()

    # Nothing to summarise; returning early also avoids dividing by zero
    # in the averages below.
    if len(scheme_data) == 0:
        return stats

    # breeding pairs is unique combos of father ID x mother ID
    stats.n_breeding_pairs = scheme_data.groupby(
        ["ID_father", "ID_mother"]
    ).ngroups

    # Successful matings is unique combos of father ID x mother ID x date
    # (assuming only one per day)
    stats.n_successful_matings = scheme_data.groupby(
        ["ID_father", "ID_mother", "date_of_birth"]
    ).ngroups

    stats.total_n_offspring = len(scheme_data)
    stats.average_litter_size = (
        stats.total_n_offspring / stats.n_successful_matings
    )
    stats.average_n_litters_per_pair = (
        stats.n_successful_matings / stats.n_breeding_pairs
    )

    # convert string representation e.g. wt_hom_het to tuple representation
    # of genotype: (Genotype.WT, Genotype.HOM, Genotype.HET).
    # assign() returns a new dataframe, so the caller's data (usually a
    # slice of a larger dataframe) is left untouched.
    converted_data = scheme_data.assign(
        genotype_offspring=scheme_data["genotype_offspring"].apply(
            Genotype.from_string
        )
    )

    # Number and proportion of offspring per genotype
    stats.n_offspring_per_genotype = (
        converted_data.groupby("genotype_offspring").size().to_dict()
    )
    stats.proportion_offspring_per_genotype = {
        genotype: n_offspring / stats.total_n_offspring
        for genotype, n_offspring in stats.n_offspring_per_genotype.items()
    }

    return stats
12 changes: 6 additions & 6 deletions tests/pooch_registry.txt
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
pyrat-data-single-mutation.csv a953acea3569208145689e9344136b0887b7de4b6f7ee7ac84e07d16b758f97f
standardised-data-single-mutation.csv 58e491465593cdacccc3b0de56c872cc77a92969c87e919b2c25e745ebd99143
pyrat-data-2-mutations.csv 7fb1e2c037358aca4b3d3c37d65481b899b5cb18635ad840293e6545c5952e04
standardised-data-2-mutations.csv 90f89288a83f9195f9ac87d65ed6b3b3dd9f0bfca803180a9bfd2ba9595681c1
pyrat-data-3-mutations.csv 6c762f5bbf07c7a2688a90158dffafc79b9e573bd6fd5af21941aa836f98bd5f
standardised-data-3-mutations.csv 7442121aa5e75c1b2f151d4bf1f87e378a6f34226b0e03b474c4db76628823cc
pyrat-data-single-mutation.csv 9fb87b7b865eac7932ec2554156cba59f0eb76cfa8d7442aafe91899a2f3c1dc
standardised-data-single-mutation.csv e6444ea52bf0679747be1a4952319c93bd5965702d1ad7cc71786b259d95fe9c
pyrat-data-2-mutations.csv 8a9a51b905d37039de931cf26269e5d7370fad5114649cb974f388d509d61332
standardised-data-2-mutations.csv 3b2e41fbdd145bf63dae0330059b2fffbc5432e8eeb4f05b2bcf05078b892f44
pyrat-data-3-mutations.csv 311369de2fb39e897ee63e17ba5de88beb81d19e4315339427f5e1d5ba454d06
standardised-data-3-mutations.csv e7b0cba4e901bc0a069db27609e3d231b5c75090281b9616b3c92597beb9f086
pyrat-data-forbidden-genotypes.csv c59c27fedf4332813b312dfb8e242d3e544806b6757f07efd041084b31e5df98
standardised-data-forbidden-genotypes.csv b64f71adc0adae6b6b336184a9f5bf491fb1c1c0900b8458c0c6ac72736a3d6b
Loading
Loading