Powell #25

Draft
wants to merge 13 commits into main_logged_verbose_abc_tweaks
112 changes: 112 additions & 0 deletions examples/prostate_example/degenerate_control/fixed_dispersion.py
@@ -0,0 +1,112 @@
import sys
import dill
import logging
import numpy as np

from scipy.stats import betabinom
from scipy.optimize import minimize_scalar
from calicost.utils_distribution_fitting import (
    Weighted_BetaBinom,
    Weighted_BetaBinom_fixdispersion,
)

logger = logging.getLogger("calicost")
logger.setLevel(logging.INFO)

handler = logging.StreamHandler(sys.stdout)
formatter = logging.Formatter(
    "%(asctime)s - %(process)d - %(levelname)s - %(name)s:%(lineno)d - %(message)s"
)

handler.setFormatter(formatter)
logger.addHandler(handler)
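With this formatter, records land on stdout as, for example (timestamp, pid, and line number illustrative):

2025-01-01 12:00:00,000 - 12345 - INFO - calicost:64 - Solving for sample state means...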

def solve(endog, exog, exposure, weights):
    """Weighted closed-form estimate of the per-state means, then a 1D
    golden-section search for the shared beta-binomial dispersion tau."""
    logger.info("Solving for sample state means...")

    means = []

    for selection in exog.T:
        selection = selection.astype(bool)

        # NB weighted ratio estimator, p = sum(w * k) / sum(w * n).
        mean = np.sum(endog[selection] * weights[selection]) / np.sum(
            exposure[selection] * weights[selection]
        )

        means.append(mean)

        # NB per-observation success fractions for this state (diagnostic, unused).
        ps = endog[selection] / exposure[selection]

    means = np.array(means)

    logger.info(f"Found sample state means={means}.")

    def nloglikeobs(tau):
        # NB beta-binomial parameterization: a = p * tau, b = (1 - p) * tau.
        a = (exog @ means) * tau
        b = (1.0 - exog @ means) * tau

        return -betabinom.logpmf(endog, exposure, a, b).dot(weights)

    result = minimize_scalar(
        nloglikeobs, bracket=[1.0, exposure.max()], method="Golden", tol=1.0e-2
    )

    print(result)

    return np.concatenate([means, np.array([result.x])])
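A quick check of `solve`, not part of the diff: draw synthetic counts under the same parameterization, a = p * tau and b = (1 - p) * tau, and confirm the ratio estimator recovers p and the golden-section search recovers tau. A minimal sketch assuming a single state and unit weights:

import numpy as np
from scipy.stats import betabinom

rng = np.random.default_rng(42)

p, tau = 0.3, 50.0
exposure = rng.integers(50, 200, size=2_000)
endog = betabinom.rvs(exposure, p * tau, (1.0 - p) * tau, random_state=rng)

exog = np.ones((len(endog), 1))  # design with a single state
weights = np.ones(len(endog))

est = solve(endog, exog, exposure, weights)  # approx. [0.3, 50.0]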


if __name__ == "__main__":
    fpath = "snapshots/weighted_betabinom/weighted_betabinom_snapshot_1.dill"

    with open(fpath, "rb") as f:
        ds = dill.load(f)

    logger.info(f"Loaded {ds.get_ninstance()}th instance of {ds.__class__.__name__}")

    # NB box bounds on [state means, tau], honored by bounded methods, e.g. {"Nelder-Mead"}.
    ds.bounds = ds.exog.shape[1] * [(0.0, 1.0)] + [(1.0e-2, ds.exposure.max())]

    ds.method = "Powell"
    best_fit = ds.fit()

    ds.method = "Nelder-Mead"
    best_fit = ds.fit()

    # NB warm start: default means, but seed tau with the best-fit value.
    start_params = ds.get_default_start_params()
    start_params[-1] = best_fit[-1]

    new_fit = ds.fit(start_params=start_params)

    ds.method = "Powell"
    new_fit = ds.fit(start_params=start_params)

    # NB solve for sample means and tau.
    est_params = solve(ds.endog, ds.exog, ds.exposure, ds.weights)

    replica = Weighted_BetaBinom(
        ds.endog,
        ds.exog,
        ds.weights,
        ds.exposure,
        snapshot=False,
    )
    replica.method = "Powell"
    conditioned_fit = replica.fit(start_params=est_params, xtol=1.0e-2)

"""
best_fit_tau = best_fit.params[-1]

fixed_model = Weighted_BetaBinom_fixdispersion(
ds.endog, ds.exog, best_fit_tau, ds.weights, ds.exposure
)
fixed_model.method = "nm"

logger.info(
f"Loaded {fixed_model.get_ninstance()}th instance of {fixed_model.__class__.__name__}"
)

start_params = np.array([0.5, 0.5, 0.5, 0.5, 0.5])

fixed_best_fit = fixed_model.fit(start_params=start_params)
"""
106 changes: 105 additions & 1 deletion src/calicost/calicost_main.py
@@ -1,3 +1,11 @@
import warnings

# NB `message` is a regex matched against the warning text itself, so it
# must not include the category prefix (e.g. "FutureWarning: ").
warnings.filterwarnings("ignore", message="Variable names are not unique. To make them unique, call `.var_names_make_unique`.")
warnings.filterwarnings("ignore", message="Keyword arguments have been passed to the optimizer that have no effect.")
warnings.filterwarnings("ignore", message="df_model + k_constant + k_extra differs from k_params")
warnings.filterwarnings("ignore", message="df_resid differs from nobs - k_params")
warnings.filterwarnings("ignore", message="more exog_names than parameters")

import sys
import numpy as np
import scipy
@@ -40,7 +48,6 @@

logger.addHandler(handler)


def main(configuration_file):
    start = datetime.datetime.now()

@@ -83,7 +90,104 @@ def main(configuration_file):
        smooth_mat,
        exp_counts,
    ) = run_parse_n_load(config)
"""
if single_tumor_prop is not None:
normal_frac = np.mean(single_tumor_prop < 0.2)

logger.info(f"**** ESTIMATING NB EMISSION DISPERSION FOR NORMAL SPOTS ({normal_frac:.3f} fraction) ****")

for prop_threshold in np.arange(0.05, 0.6, 0.05):
normal_candidate = (single_tumor_prop < prop_threshold)

if np.sum(copy_single_X_rdr[:, (normal_candidate==True)]) > single_X.shape[0] * 200:
break

index_normal = np.where(normal_candidate)[0]

lengths, single_X, single_base_nb_mean, single_total_bb_RD, log_sitewise_transmat, df_gene_snp = bin_selection_basedon_normal(
df_gene_snp,
single_X,
single_base_nb_mean,
single_total_bb_RD,
config["nu"],
config["logphase_shift"],
index_normal,
config["geneticmap_file"]
)

assert df_bininfo.shape[0] == copy_single_X_rdr.shape[0]

df_bininfo = genesnp_to_bininfo(df_gene_snp)
copy_single_X_rdr = copy.copy(single_X[:,0,:])

# filter out high-UMI DE genes, which may bias RDR estimates
copy_single_X_rdr, _ = filter_de_genes_tri(
exp_counts,
df_bininfo,
normal_candidate,
sample_list=sample_list,
sample_ids=sample_ids
)

MIN_NORMAL_COUNT_PERBIN = 20

bidx_inconfident = np.where(
np.sum(copy_single_X_rdr[:, (normal_candidate==True)], axis=1) < MIN_NORMAL_COUNT_PERBIN
)[0]

rdr_normal = np.sum(copy_single_X_rdr[:, (normal_candidate==True)], axis=1)
rdr_normal[bidx_inconfident] = 0
rdr_normal = rdr_normal / np.sum(rdr_normal)

# NB avoid ill-defined distributions if normal has 0 count in that bin.
copy_single_X_rdr[bidx_inconfident, :] = 0
copy_single_base_nb_mean = rdr_normal.reshape(-1,1) @ np.sum(copy_single_X_rdr, axis=0).reshape(1,-1)

single_X[:,0,:] = copy_single_X_rdr
single_base_nb_mean = copy_single_base_nb_mean

X, base_nb_mean, total_bb_RD = merge_pseudobulk_by_index(
single_X,
single_base_nb_mean,
single_total_bb_RD,
[ np.where(single_tumor_prop < 0.2)[0] ],
)

n_obs, _, n_spots = X.shape

unique_values_nb, mapping_matrices_nb = construct_unique_matrix(
X[:, 0, :], base_nb_mean
)

# NB gamma[i,t] = P(q_t = i | Observed, hypers),
#
# each normal spot assumed to exist in a single state,
# for which we determine a dispersion.
log_gamma = np.zeros(1 * n_obs).reshape(1, n_obs)

# TODO why states x spots?
start_log_mu = np.vstack([np.zeros(1) for r in range(n_spots)]).T

# NB replicate previous default
alphas = 0.1 * np.ones((1, n_spots))

_, new_alphas = (
update_emission_params_nb_nophasing_uniqvalues(
unique_values_nb,
mapping_matrices_nb,
log_gamma,
alphas,
start_log_mu=start_log_mu,
fix_NB_dispersion=False,
shared_NB_dispersion=True,
)
)

print(alphas)
print(new_alphas)

breakpoint()
"""
logger.info(f"**** ESTIMATING INITIAL CLONES USING BAF ONLY ****")

# NB setting transcript & baseline count to 0 so the emission probability will be ignored.