emprob_mat3 #24

Draft: wants to merge 32 commits into base: main
2 changes: 2 additions & 0 deletions .gitignore
@@ -127,3 +127,5 @@ dmypy.json

# Pyre type checker
.pyre/

*.npy
4 changes: 2 additions & 2 deletions src/calicost/calicost_main.py
@@ -25,7 +25,7 @@
from calicost.parse_input import *
from calicost.utils_plotting import *


@profile
def main(configuration_file):
try:
config = read_configuration_file(configuration_file)
@@ -441,4 +441,4 @@ def main(configuration_file):
parser.add_argument("-c", "--configfile", help="configuration file of CalicoST", required=True, type=str)
args = parser.parse_args()

main(args.configfile)
main(args.configfile)
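Note on the @profile decorators this PR adds here and in the HMM/HMRF modules below: the bare @profile name is normally injected into builtins by an external profiler (line_profiler's kernprof, or memory_profiler), so importing the decorated modules without a profiler attached raises NameError. A minimal no-op fallback, sketched on the assumption that the codebase does not already define such a guard:

import builtins

# If no profiler has injected `profile` into builtins, install an
# identity decorator so @profile-decorated functions still import and run.
if not hasattr(builtins, "profile"):
    def profile(func):
        return func
    builtins.profile = profile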
430 changes: 430 additions & 0 deletions src/calicost/compute_emission.py

Large diffs are not rendered by default.
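From the import and call sites in hmm_NB_BB_nophasing_v2.py below, the new module exports at least two module-level functions with the same signatures as the static methods they replace. A stub sketch of the assumed interface (signatures only, not the actual 430-line implementation):

def compute_emission_probability_nb_betabinom(X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus):
    # Returns (log_emission_rdr, log_emission_baf), each of shape
    # (n_states, n_obs, n_spots): Negative Binomial log-pmf of read
    # counts and Beta-Binomial log-pmf of B-allele counts per state.
    ...

def compute_emission_probability_nb_betabinom_mix(X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus, tumor_prop, **kwargs):
    # Tumor/normal mixture variant: emission parameters are blended
    # with diploid baselines according to spot-level tumor proportions.
    ...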

59 changes: 24 additions & 35 deletions src/calicost/hmm_NB_BB_nophasing_v2.py
@@ -1,18 +1,20 @@
+import copy
 import logging
+
+import networkx as nx
 import numpy as np
-from numba import njit
-from scipy.stats import norm, multivariate_normal, poisson
 import scipy.special
-from scipy.optimize import minimize
-from scipy.optimize import Bounds
-from sklearn.mixture import GaussianMixture
-from tqdm import trange
 import statsmodels.api as sm
+from numba import njit
+from scipy.optimize import Bounds, minimize
+from scipy.stats import multivariate_normal, norm, poisson
+from sklearn.mixture import GaussianMixture
 from statsmodels.base.model import GenericLikelihoodModel
-import copy
+from tqdm import trange
 
 from calicost.utils_distribution_fitting import *
 from calicost.utils_hmm import *
-import networkx as nx
+from calicost.compute_emission import compute_emission_probability_nb_betabinom_mix, compute_emission_probability_nb_betabinom

"""
Joint NB-BB HMM that accounts for tumor/normal genome proportions. The tumor genome proportion is weighted by mu in the BB distribution.
@@ -35,7 +37,7 @@ def __init__(self, params="stmp", t=1-1e-4):
"""
self.params = params
self.t = t
#

@staticmethod
def compute_emission_probability_nb_betabinom(X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus):
"""
@@ -67,28 +69,9 @@ def compute_emission_probability_nb_betabinom(X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus):
log_emission : array, shape (n_states, n_obs, n_spots)
Log emission probability for each gene and each spot (or sample) under each state. There is a common bag of states shared across all spots.
"""
n_obs = X.shape[0]
n_comp = X.shape[1]
n_spots = X.shape[2]
n_states = log_mu.shape[0]
# initialize log_emission
log_emission_rdr = np.zeros((n_states, n_obs, n_spots))
log_emission_baf = np.zeros((n_states, n_obs, n_spots))
for i in np.arange(n_states):
for s in np.arange(n_spots):
# expression from NB distribution
idx_nonzero_rdr = np.where(base_nb_mean[:,s] > 0)[0]
if len(idx_nonzero_rdr) > 0:
nb_mean = base_nb_mean[idx_nonzero_rdr,s] * np.exp(log_mu[i, s])
nb_std = np.sqrt(nb_mean + alphas[i, s] * nb_mean**2)
n, p = convert_params(nb_mean, nb_std)
log_emission_rdr[i, idx_nonzero_rdr, s] = scipy.stats.nbinom.logpmf(X[idx_nonzero_rdr, 0, s], n, p)
# AF from BetaBinom distribution
idx_nonzero_baf = np.where(total_bb_RD[:,s] > 0)[0]
if len(idx_nonzero_baf) > 0:
log_emission_baf[i, idx_nonzero_baf, s] = scipy.stats.betabinom.logpmf(X[idx_nonzero_baf,1,s], total_bb_RD[idx_nonzero_baf,s], p_binom[i, s] * taus[i, s], (1-p_binom[i, s]) * taus[i, s])
return log_emission_rdr, log_emission_baf
#

return compute_emission_probability_nb_betabinom(X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus)
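Note: the deleted body above relied on convert_params (star-imported from the calicost utilities) to translate the model's mean/dispersion parameterization into scipy's (n, p). A sketch of that standard conversion, assuming this is what the helper implements:

def convert_params_sketch(mean, std):
    # scipy.stats.nbinom has mean = n*(1-p)/p and var = n*(1-p)/p**2,
    # so with var = mean + alpha*mean**2: p = mean/var and
    # n = mean**2 / (var - mean).
    var = std ** 2
    p = mean / var
    n = mean ** 2 / (var - mean)
    return n, p

# e.g. scipy.stats.nbinom.logpmf(x, *convert_params_sketch(nb_mean, nb_std))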

@staticmethod
def compute_emission_probability_nb_betabinom_mix(X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus, tumor_prop, **kwargs):
"""
@@ -120,6 +103,11 @@ def compute_emission_probability_nb_betabinom_mix(X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus, tumor_prop, **kwargs):
log_emission : array, shape (n_states, n_obs, n_spots)
Log emission probability for each gene and each spot (or sample) under each state. There is a common bag of states shared across all spots.
"""

return compute_emission_probability_nb_betabinom_mix(X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus, tumor_prop, **kwargs)

# TODO DEPRECATE
"""
n_obs = X.shape[0]
n_comp = X.shape[1]
n_spots = X.shape[2]
@@ -153,7 +141,8 @@ def compute_emission_probability_nb_betabinom_mix(X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus, tumor_prop, **kwargs):
mix_p_B = (1 - p_binom[i, s]) * this_weighted_tp[idx_nonzero_baf] + 0.5 * (1 - this_weighted_tp[idx_nonzero_baf])
log_emission_baf[i, idx_nonzero_baf, s] += scipy.stats.betabinom.logpmf(X[idx_nonzero_baf,1,s], total_bb_RD[idx_nonzero_baf,s], mix_p_A * taus[i, s], mix_p_B * taus[i, s])
return log_emission_rdr, log_emission_baf
#
"""

@staticmethod
@njit
def forward_lattice(lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat):
@@ -223,7 +212,7 @@ def backward_lattice(lengths, log_transmat, log_startprob, log_emission, log_sitewise_transmat):
cumlen += le
return log_beta
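For orientation, forward_lattice and backward_lattice implement the standard log-space HMM recursions over each length-delimited segment, plus a sitewise phase-switch term not shown here. A simplified single-segment sketch of the forward pass, under those stated simplifications:

import numpy as np
from scipy.special import logsumexp

def forward_sketch(log_startprob, log_transmat, log_emission):
    # log_emission: (n_states, n_obs) for one segment and spot.
    n_states, n_obs = log_emission.shape
    log_alpha = np.zeros((n_states, n_obs))
    log_alpha[:, 0] = log_startprob + log_emission[:, 0]
    for t in range(1, n_obs):
        log_alpha[:, t] = logsumexp(log_alpha[:, t - 1][:, None] + log_transmat, axis=0) + log_emission[:, t]
    return log_alpha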

#
@profile
def run_baum_welch_nb_bb(self, X, lengths, n_states, base_nb_mean, total_bb_RD, log_sitewise_transmat=None, tumor_prop=None, \
fix_NB_dispersion=False, shared_NB_dispersion=False, fix_BB_dispersion=False, shared_BB_dispersion=False, \
is_diag=False, init_log_mu=None, init_p_binom=None, init_alphas=None, init_taus=None, max_iter=100, tol=1e-4, **kwargs):
@@ -260,8 +249,8 @@ def run_baum_welch_nb_bb(self, X, lengths, n_states, base_nb_mean, total_bb_RD,
# a trick to speed up BetaBinom optimization: taking only unique values of (B allele count, total SNP covering read count)
unique_values_nb, mapping_matrices_nb = construct_unique_matrix(X[:,0,:], base_nb_mean)
unique_values_bb, mapping_matrices_bb = construct_unique_matrix(X[:,1,:], total_bb_RD)
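A hedged sketch of the speed-up described in the comment above, assuming construct_unique_matrix (star-imported from the calicost utilities) behaves roughly like this: per spot, evaluate each distinct (count, total) pair once and keep a sparse matrix that scatters per-unique-pair likelihoods back to observations.

import numpy as np
import scipy.sparse

def construct_unique_matrix_sketch(obs, total):
    unique_values, mapping_matrices = [], []
    for s in range(obs.shape[1]):
        pairs = np.vstack([obs[:, s], total[:, s]]).T
        uniq, inverse = np.unique(pairs, axis=0, return_inverse=True)
        # mapping[i, k] = 1 iff observation i has unique pair k.
        mapping = scipy.sparse.csr_matrix((np.ones(len(pairs)), (np.arange(len(pairs)), np.ravel(inverse))), shape=(len(pairs), len(uniq)))
        unique_values.append(uniq)
        mapping_matrices.append(mapping)
    return unique_values, mapping_matrices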
# EM algorithm
for r in trange(max_iter):

for r in trange(max_iter, desc="EM algorithm"):
# E step
if tumor_prop is None:
log_emission_rdr, log_emission_baf = hmm_nophasing_v2.compute_emission_probability_nb_betabinom(X, base_nb_mean, log_mu, alphas, total_bb_RD, p_binom, taus)
2 changes: 1 addition & 1 deletion src/calicost/hmm_NB_BB_phaseswitch.py
@@ -480,7 +480,7 @@ def viterbi_nb_bb_sitewise(X, lengths, base_nb_mean, log_mu, alphas, total_bb_RD
cumlen += le
return labels, merged_labels


@profile
def pipeline_baum_welch(output_prefix, X, lengths, n_states, base_nb_mean, total_bb_RD, log_sitewise_transmat, tumor_prop=None, \
hmmclass=hmm_sitewise, params="smp", t=1-1e-6, random_state=0, \
in_log_space=True, only_minor=False, fix_NB_dispersion=False, shared_NB_dispersion=True, fix_BB_dispersion=False, shared_BB_dispersion=True, \
84 changes: 69 additions & 15 deletions src/calicost/hmrf.py
@@ -763,56 +763,110 @@ def hmrfmix_pipeline(outdir, prefix, single_X, lengths, single_base_nb_mean, sin


def hmrfmix_reassignment_posterior_concatenate(single_X, single_base_nb_mean, single_total_bb_RD, single_tumor_prop, res, smooth_mat, adjacency_mat, prev_assignment, sample_ids, log_persample_weights, spatial_weight, hmmclass=hmm_sitewise, return_posterior=False):
"""
TODO
"""
# TODO define N
N = single_X.shape[2]
n_obs = single_X.shape[0]
n_clones = np.max(prev_assignment) + 1
n_states = res["new_p_binom"].shape[0]

single_llf = np.zeros((N, n_clones))

new_assignment = copy.copy(prev_assignment)
#
lambd = np.sum(single_base_nb_mean, axis=1) / np.sum(single_base_nb_mean)
if np.sum(single_base_nb_mean) > 0:

single_base_nb_mean_sum = np.sum(single_base_nb_mean)

lambd = np.sum(single_base_nb_mean, axis=1) / single_base_nb_mean_sum
log_lambd = np.log(lambd).reshape(-1,1)

if single_base_nb_mean_sum > 0:
logmu_shift = []

for c in range(n_clones):
this_pred_cnv = np.argmax(res["log_gamma"][:, (c*n_obs):(c*n_obs+n_obs)], axis=0)%n_states
logmu_shift.append( scipy.special.logsumexp(res["new_log_mu"][this_pred_cnv,:] + np.log(lambd).reshape(-1,1), axis=0) )
logmu_shift = np.vstack(logmu_shift)
kwargs = {"logmu_shift":logmu_shift, "sample_length":np.ones(n_clones,dtype=int) * n_obs}
this_pred_cnv = np.argmax(res["log_gamma"][:, (c * n_obs): (c * n_obs + n_obs)], axis=0) % n_states

to_append = scipy.special.logsumexp(res["new_log_mu"][this_pred_cnv,:] + log_lambd, axis=0)
logmu_shift.append( to_append )

kwargs = {"logmu_shift": np.vstack(logmu_shift), "sample_length": n_obs * np.ones(n_clones, dtype=int)}
else:
kwargs = {}
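A hedged reading of the logmu_shift block above: for each clone c, with z the decoded per-gene states and lambd each gene's share of total baseline expression, it computes logmu_shift[c, s] = log(sum_g lambd[g] * exp(new_log_mu[z[g], s])), the log of the clone's expected overall read-depth ratio, passed through kwargs so the emission computation can renormalize log_mu. Compact equivalent:

import numpy as np
import scipy.special

def clone_logmu_shift(new_log_mu, pred_cnv, lambd):
    # new_log_mu: (n_states, n_spots); pred_cnv: (n_obs,); lambd: (n_obs,)
    return scipy.special.logsumexp(new_log_mu[pred_cnv, :] + np.log(lambd)[:, None], axis=0)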
#

posterior = np.zeros((N, n_clones))

for i in trange(N):
idx = smooth_mat[i,:].nonzero()[1]
idx = idx[~np.isnan(single_tumor_prop[idx])]
for c in range(n_clones):
tmp_log_emission_rdr, tmp_log_emission_baf = hmmclass.compute_emission_probability_nb_betabinom_mix( np.sum(single_X[:,:,idx], axis=2, keepdims=True), \
np.sum(single_base_nb_mean[:,idx], axis=1, keepdims=True), res["new_log_mu"], res["new_alphas"], \
np.sum(single_total_bb_RD[:,idx], axis=1, keepdims=True), res["new_p_binom"], res["new_taus"], np.ones((n_obs,1)) * np.mean(single_tumor_prop[idx]), **kwargs )

if np.sum(single_base_nb_mean[:,i:(i+1)] > 0) > 0 and np.sum(single_total_bb_RD[:,i:(i+1)] > 0) > 0:
ratio_nonzeros = 1.0 * np.sum(single_total_bb_RD[:,i:(i+1)] > 0) / np.sum(single_base_nb_mean[:,i:(i+1)] > 0)
tmp_log_emission_rdr, tmp_log_emission_baf = hmmclass.compute_emission_probability_nb_betabinom_mix(
np.sum(single_X[:,:,idx], axis=2, keepdims=True),
np.sum(single_base_nb_mean[:,idx], axis=1, keepdims=True),
res["new_log_mu"],
res["new_alphas"],
np.sum(single_total_bb_RD[:,idx], axis=1, keepdims=True),
res["new_p_binom"],
res["new_taus"],
np.ones((n_obs,1)) * np.mean(single_tumor_prop[idx]),
**kwargs
)

# TODO single element?
single_base_nb_mean_sumi = np.sum(single_base_nb_mean[:,i:(i+1)] > 0)
single_total_bb_RD_sumi = np.sum(single_total_bb_RD[:,i:(i+1)] > 0)

for c in range(n_clones):
# DEPRECATE
# tmp_log_emission_rdr, tmp_log_emission_baf = hmmclass.compute_emission_probability_nb_betabinom_mix(
# np.sum(single_X[:,:,idx], axis=2, keepdims=True),
# np.sum(single_base_nb_mean[:,idx], axis=1, keepdims=True),
# res["new_log_mu"],
# res["new_alphas"],
# np.sum(single_total_bb_RD[:,idx], axis=1, keepdims=True),
# res["new_p_binom"],
# res["new_taus"],
# np.ones((n_obs,1)) * np.mean(single_tumor_prop[idx]),
# **kwargs
# )

if single_base_nb_mean_sumi > 0 and single_total_bb_RD_sumi > 0:
ratio_nonzeros = 1.0 * single_total_bb_RD_sumi / single_base_nb_mean_sumi
# ratio_nonzeros = 1.0 * np.sum(np.sum(single_total_bb_RD[:,idx], axis=1) > 0) / np.sum(np.sum(single_base_nb_mean[:,idx], axis=1) > 0)

# this_log_gamma = res["log_gamma"][:, (c*n_obs):(c*n_obs+n_obs)]

# single_llf[i,c] = ratio_nonzeros * np.sum( scipy.special.logsumexp(tmp_log_emission_rdr[:, :, 0] + this_log_gamma, axis=0 )
# single_llf[i,c] += np.sum( scipy.special.logsumexp(tmp_log_emission_baf[:, :, 0] + this_log_gamma, axis=0) )

single_llf[i,c] = ratio_nonzeros * np.sum( scipy.special.logsumexp(tmp_log_emission_rdr[:, :, 0] + res["log_gamma"][:, (c*n_obs):(c*n_obs+n_obs)], axis=0) ) + \
np.sum( scipy.special.logsumexp(tmp_log_emission_baf[:, :, 0] + res["log_gamma"][:, (c*n_obs):(c*n_obs+n_obs)], axis=0) )

else:
# single_llf[i,c] = np.sum( scipy.special.logsumexp(tmp_log_emission_rdr[:, :, 0] + this_log_gamma, axis=0) )
# single_llf[i,c] += np.sum( scipy.special.logsumexp(tmp_log_emission_baf[:, :, 0] + this_log_gamma, axis=0) )

single_llf[i,c] = np.sum( scipy.special.logsumexp(tmp_log_emission_rdr[:, :, 0] + res["log_gamma"][:, (c*n_obs):(c*n_obs+n_obs)], axis=0) ) + \
np.sum( scipy.special.logsumexp(tmp_log_emission_baf[:, :, 0] + res["log_gamma"][:, (c*n_obs):(c*n_obs+n_obs)], axis=0) )

w_node = single_llf[i,:].copy()  # copy: += on the bare view would mutate single_llf
w_node += log_persample_weights[:,sample_ids[i]]
w_edge = np.zeros(n_clones)

for j in adjacency_mat[i,:].nonzero()[1]:
# w_edge[new_assignment[j]] += 1
w_edge[new_assignment[j]] += adjacency_mat[i,j]

new_assignment[i] = np.argmax( w_node + spatial_weight * w_edge )
#

posterior[i,:] = np.exp( w_node + spatial_weight * w_edge - scipy.special.logsumexp(w_node + spatial_weight * w_edge) )
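The loop above is an ICM-style HMRF update: each spot's per-clone score is its emission log-likelihood plus the per-sample clone log-weight plus a weighted count of neighbors currently assigned to that clone; the new assignment is the argmax, and the posterior row is the softmax of the same scores. Condensed into a helper (illustrative, not part of the PR):

import numpy as np
import scipy.special

def hmrf_update_sketch(w_node, w_edge, spatial_weight):
    # w_node: log P(reads | clone) + log clone weight, shape (n_clones,)
    # w_edge: weighted same-clone neighbor counts, shape (n_clones,)
    score = w_node + spatial_weight * w_edge
    posterior = np.exp(score - scipy.special.logsumexp(score))
    return np.argmax(score), posterior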

# compute total log likelihood log P(X | Z) + log P(Z)
total_llf = np.sum(single_llf[np.arange(N), new_assignment])

for i in range(N):
total_llf += np.sum( spatial_weight * np.sum(new_assignment[adjacency_mat[i,:].nonzero()[1]] == new_assignment[i]) )

if return_posterior:
return new_assignment, single_llf, total_llf, posterior
else:
Expand Down
27 changes: 27 additions & 0 deletions src/calicost/sandbox/profile_context.py
@@ -0,0 +1,27 @@
import io
import cProfile
import pstats


class ProfileContext:
    def __init__(self, line_threshold=10):
        self.threshold = line_threshold

    def __enter__(self):
        self.profiler = cProfile.Profile()
        self.profiler.enable()
        return self

    def __exit__(self, *args):
        self.profiler.disable()

        ss = io.StringIO()

        ps = pstats.Stats(self.profiler, stream=ss).sort_stats("cumulative")
        ps.print_stats(self.threshold)

        profile = ss.getvalue()

        # Only print if there's something to print
        if profile.strip():
            print(profile)
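Example usage of the new context manager (hypothetical workload; on exit it prints the top line_threshold entries sorted by cumulative time, assuming the sandbox module is importable as shown):

from calicost.sandbox.profile_context import ProfileContext

with ProfileContext(line_threshold=10):
    total = sum(i * i for i in range(1_000_000))  # stand-in workload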