Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Test nb how many genes it fits #13

Merged
merged 30 commits into from
Mar 11, 2019
Merged
Changes from 1 commit
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
b2a3d0f
setup code and NB model
stemangiola Feb 14, 2019
a8789e5
update analysis for T cells
stemangiola Feb 18, 2019
80486bf
update with TCGA leukimia
stemangiola Feb 18, 2019
15c6ce0
update sigma prior for stability
stemangiola Feb 20, 2019
e3b960d
added test on 0s
stemangiola Feb 20, 2019
37ed120
added skew normal for lambda
stemangiola Feb 22, 2019
dd087cf
applied exp linear with success!
stemangiola Feb 23, 2019
aaae4ed
Merge pull request #11 from stemangiola/exp-linear-on-sigma
stemangiola Feb 23, 2019
724e74f
created test to see if it works
stemangiola Feb 25, 2019
1c4d574
setup universal MPI - does not parallelise!
stemangiola Feb 26, 2019
62359bc
Fixed MPI now works!
stemangiola Feb 26, 2019
7bba631
added non MPI
stemangiola Feb 26, 2019
4a6d29b
update can't see
stemangiola Feb 26, 2019
c5bd59a
make sure MPI effective on TCGA
stemangiola Feb 27, 2019
678ebae
solved some index inconsistencies
stemangiola Mar 1, 2019
09a8c30
added exposure rate inference for MPI
stemangiola Mar 1, 2019
aa38d41
corrected plot
stemangiola Mar 1, 2019
85b3be0
commented NON MPI model
stemangiola Mar 2, 2019
64e0026
Merge pull request #12 from stemangiola/develop-MPI
stemangiola Mar 2, 2019
488105d
added plots and more generated quantities
stemangiola Mar 4, 2019
e7e8285
fixed bugs for plotting
stemangiola Mar 4, 2019
a83b322
updated data set of errors for Martin
stemangiola Mar 4, 2019
ea3095f
update for handling big data + correct log bug for plotting
stemangiola Mar 6, 2019
6d77ea5
code adjustement for big data
stemangiola Mar 6, 2019
9f6b7fa
prepare for inclusion of all cell types
stemangiola Mar 8, 2019
52eefae
remake index array for less error prone
stemangiola Mar 10, 2019
c6ddf8a
update cell type signature production
stemangiola Mar 10, 2019
87189e0
update cell type signature cell names
stemangiola Mar 10, 2019
d1915ef
updated preprocess cellTypes
stemangiola Mar 11, 2019
c960ac9
Merge branch 'mixed-poissons' into test-NB-how-many-genes-it-fits
stemangiola Mar 11, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
setup universal MPI - does not parallelise!
stemangiola committed Feb 26, 2019
commit 1c4d57458920551a3b11292437f4cc9663bb8a34
120 changes: 90 additions & 30 deletions docs/NB_filtering_cell_types.Rmd
Original file line number Diff line number Diff line change
@@ -7,6 +7,7 @@ output: html_document

```{r setup, include=FALSE}
# Import libraries
setwd("~/PostDoc/RNAseq-noise-model/docs")

#setwd("/wehisan/bioinf/bioinf-data/Papenfuss_lab/projects/mangiola.s/PostDoc/RNAseq-noise-model")
set.seed(13254)
@@ -67,6 +68,8 @@ counts =
read_csv(f) %>% mutate(sample = sample %>% as.character, isoform = isoform %>% as.character)
} %>%

filter(symbol %>% is.na %>% `!`) %>%

# Median redundant
do_parallel_start(40, "symbol") %>%
do({
@@ -116,7 +119,7 @@ counts =
`anova p-value` =
ifelse(
(.) %>% distinct(`Data base`) %>% nrow > 1,
(.) %>% aov(`read count normalised log` ~ `Cell type formatted` + `Data base`, .) %>% anova %$% `Pr(>F)` %>% `[` (1),
(.) %>% aov(`read count normalised log` ~ `Cell type formatted` + `Data base`, .) %>% anova %$% `Pr(>F)` %>% `[` (1),
(.) %>% aov(`read count normalised log` ~ `Cell type formatted`, .) %>% anova %$% `Pr(>F)` %>% `[` (1)
)
)
@@ -133,10 +136,12 @@ counts =
`hard bimodality` =
(`bimodal p-value` < 0.05) +
(`bimodal coefficient` > 0.6666667) +
(`anova p-value` < 0.05 ) >=
2
(`anova p-value` < 0.05 ) >= 2
) %>%
mutate(`soft bimodality` = `anova p-value` < 0.0001)
mutate(`soft bimodality` = `anova p-value` < 0.0001) %>%

mutate(symbol = symbol %>% as.factor) %>%
mutate(sample = sample %>% as.factor)
```

Inference
@@ -150,44 +155,99 @@ counts_stan =
counts %>%
filter(!filt_for_calc) %>%
filter(!`hard bimodality`) %>%
#inner_join( (.) %>% distinct(symbol) %>% sample_n(500) ) %>%
inner_join( (.) %>% distinct(symbol) %>% sample_n(66) ) %>%

left_join(
(.) %>%
distinct(sample, symbol, `read count normalised`) %>%
arrange(symbol) %>%
count(symbol) %>%
mutate(end = cumsum(n)) %>%
mutate(start = c(1, .$end %>% rev() %>% `[` (-1) %>% rev %>% `+` (1)))
) %>%
arrange(symbol)
arrange(symbol) %>%

# Add sample idx
mutate(sample_idx = as.integer(sample))

data_for_stan = list(
N = counts_stan %>% nrow,
G = counts_stan %>% distinct(symbol) %>% nrow,
S = counts_stan %>% distinct(sample) %>% nrow,
symbol_start_end = counts_stan %>% distinct(start, end) %>% select(start, end),
counts = counts_stan %>% select(`read count normalised`) %>% pull(1),
sample_idx =
counts_stan %>%
mutate(sample = factor(sample)) %>%
mutate(sample_idx = as.integer(sample)) %>%
select(sample_idx) %>%
pull(1),
shards = 5
counts_stan_MPI =
counts_stan %>%

# Add index for MPI
left_join(
(.) %>%
distinct(symbol) %>%
mutate( idx_MPI = head( rep(1:shards, (.) %>% nrow %>% `/` (shards) %>% ceiling ) %>% sort, n=(.) %>% nrow) )
)

data_for_stan_MPI = list(

N = counts_stan_MPI %>% distinct(sample, symbol, idx_MPI) %>% count(idx_MPI) %>% pull(n) %>% max,
M = counts_stan_MPI %>% distinct(start, idx_MPI) %>% count(idx_MPI) %>% pull(n) %>% max,
G = counts_stan_MPI %>% distinct(symbol) %>% nrow(),
S = counts_stan_MPI %>% distinct(sample) %>% nrow(),
n_shards = shards,
G_per_shard = counts_stan_MPI %>% distinct(symbol, idx_MPI) %>% count(idx_MPI) %>% pull(n) ,
G_per_shard_idx = c(0, counts_stan_MPI %>% distinct(symbol, idx_MPI) %>% count(idx_MPI) %>% pull(n) %>% cumsum),

counts = foreach(idx = counts_stan_MPI %>% distinct(idx_MPI) %>% pull(idx_MPI), .combine = full_join) %do% {
counts_stan_MPI %>%
filter(idx_MPI == idx) %>%
select(`read count`) %>%
setNames(idx) %>%
mutate(i = 1:n())
} %>%
select(-i) %>%
replace(is.na(.), 0) %>%
mutate_if(is.numeric, as.integer) %>%
as_matrix() %>% t,

symbol_start = foreach(idx = counts_stan_MPI %>% distinct(idx_MPI) %>% pull(idx_MPI), .combine = full_join) %do% {
counts_stan_MPI %>%
filter(idx_MPI == idx) %>%
distinct(start) %>%
rbind( counts_stan_MPI %>% filter(idx_MPI == idx) %>% pull(end) %>% max) %>%
mutate(start = start - ( (.) %>% head(n=1) %>% pull(start)) + 1) %>%
setNames(idx) %>%
mutate(i = 1:n())
} %>%
select(-i) %>%
replace(is.na(.), 0) %>%
mutate_if(is.numeric, as.integer) %>%
as_matrix() %>% t,

sample_idx = foreach(idx = counts_stan_MPI %>% distinct(idx_MPI) %>% pull(idx_MPI), .combine = full_join) %do% {
counts_stan_MPI %>%
filter(idx_MPI == idx) %>%
select(sample_idx) %>%
setNames(idx) %>%
mutate(i = 1:n())
} %>%
select(-i) %>%
replace(is.na(.), 0) %>%
mutate_if(is.numeric, as.integer) %>%
as_matrix() %>% t

# Info on data set
omit_data = 0,
generate_quantities = 1,
is_prior_asymetric = 1
)

# fit =
# sampling(
# nb_model,
# data = data_for_stan,
# chains=4, iter=500
# )
# fit_parsed = fit %>% spread_draws(lambda[G], sigma[G], sigma_raw[G])
#fit %>% summary() %$% summary %>% as_tibble(rownames="par")
file("~/.R/Makevars") %>% {
writeLines(c("CXX14FLAGS += -O3", "CXXFLAGS += -DSTAN_THREADS", "CXXFLAGS += -pthread"), (.));
(.)
} %>%
close(.)
nb_model = stan_model(here("stan",sprintf("%s.stan", "negBinomial_tidy_MPI")))
Sys.setenv("STAN_NUM_THREADS" = 8)


fit =
sampling(
nb_model,
data = data_for_stan_MPI,
chains=1, iter=1
)
fit_parsed = fit %>% spread_draws(lambda[G], sigma[G], sigma_raw[G])
fit %>% summary() %$% summary %>% as_tibble(rownames="par")


load("fit_parsed_all_genes_t_cell_18022019.RData")
166 changes: 73 additions & 93 deletions stan/negBinomial_tidy_MPI.stan
Original file line number Diff line number Diff line change
@@ -1,136 +1,116 @@
functions{
real gamma_log_lpdf(vector x_log, real a, real b){

// This function is the probability of the log gamma funnction
// in case you have data that is aleady in log form
vector lp_reduce( vector global_parameters , vector local_parameters , real[] xr , int[] xi ) {
int M = xi[1];
int N = xi[2];
int G_per_shard = xi[3];
int symbol_start[M+1] = xi[(3+1):(3+1+M)];
int sample_idx[N] = xi[(3+1+M+1):(3+1+M+1+N-1)];
int counts[N] = xi[(3+1+M+1+N):size(xi)];

vector[rows(x_log)] jacob = x_log; //jacobian
real norm_constant = a * log(b) -lgamma(a);
real a_minus_1 = a-1;
return sum( jacob ) + norm_constant * rows(x_log) + sum( x_log * a_minus_1 - exp(x_log) * b ) ;

}


vector gen_inv_logit(vector b1s, vector X_no_intercept, real inflection, real y_cross) {

real b0 = -inflection * b1s[1];
vector[num_elements(b1s) + 1] b = append_row(b0, b1s);
matrix[rows(X_no_intercept), num_elements(b1s) + 1] X = append_col(rep_vector(1, rows(X_no_intercept)), X_no_intercept);

return y_cross * (1 + exp(-b0)) ./ to_vector(1 + exp(- ( X * b ))) ;
}
vector[G_per_shard] lambda_MPI = local_parameters[1:G_per_shard];
vector[G_per_shard] sigma_MPI = local_parameters[(G_per_shard+1):(G_per_shard*2)];
vector[G_per_shard] lp;

vector lp_reduce( vector beta , vector theta , real[] xr , int[] xi ) {
int M = rows(theta)/2;
int S = rows(to_vector(xi))/M;
vector[M] lambda_MPI = theta[1:M];
vector[M] sigma_MPI = theta[(M+1):(M*2)];
vector[M] lp;
int counts[M, S];

for(m in 1:M){
int j = 1 + (m-1)*S;
int k = m*S;
counts[m] = xi[j:k];
}

for(m in 1:M)
lp[m] = neg_binomial_2_log_lpmf(counts[m] | lambda_MPI[m], sigma_MPI[m]);
for(g in 1:G_per_shard){
lp[g] = neg_binomial_2_log_lpmf(counts[symbol_start[g]:symbol_start[g+1]] | lambda_MPI[g], sigma_MPI[g]);
}


return [sum(lp)]';
}


}
data {
int<lower=0> N;
int<lower=0> M;
int<lower=0> G;
int<lower=0> S;
int<lower=0> counts[G, S];

int n_shards;
int<lower=0> counts[n_shards, N];
int<lower=0> symbol_start[n_shards, M+1];
int<lower=0> sample_idx[n_shards, N];
int<lower=0> G_per_shard[n_shards];
int<lower=0> G_per_shard_idx[n_shards + 1];

}
transformed data {
// 7 shards
// M = N/7 = 124621/7 = 17803
int n_shards = 10;
int M = G/n_shards;
int counts_MPI[n_shards, M*S]; // 2M because two variables, and they get stacked in array
real xr[n_shards, 0];
vector[0] global_parameters;
// an empty set of per-shard parameters
//vector[M] theta[n_shards];
// split into shards
real xr[n_shards, 0];

int<lower=0> int_MPI[n_shards, 3+(M+1)+N+N];

// Shards - MPI
for ( i in 1:n_shards ) {
int j = 1 + (i-1)*M;
int k = i*M;
counts_MPI[i,] = to_array_1d(counts[ j:k ]);
int M_N_Gps[3];
M_N_Gps[1] = M;
M_N_Gps[2] = N;
M_N_Gps[3] = G_per_shard[i];

int_MPI[i,] = append_array(append_array(append_array(M_N_Gps, symbol_start[i]), sample_idx[i]), counts[i]);
}
}
parameters {
// Overall properties of the data
real lambda_mu; // So is compatible with logGamma prior
real<lower=0> lambda_sigma;
real lambda_skew;
vector[S] exposure_rate;

// Overall properties of the data
real<lower=0> lambda_mu; // So is compatible with logGamma prior
real<lower=0> lambda_sigma_raw;
vector[S] exposure_rate;
// Gene-wise properties of the data
vector[G] lambda;
vector<lower=0>[G] sigma_raw;

// Gene-wise properties of the data
vector[G] lambda;
vector<lower=0>[G] sigma_raw;

// Signa linear model
// Signa linear model

real sigma_inflection;
real<lower=0> sigma_y_cross;
vector<upper=0>[1] sigma_slope;
real<lower=0>sigma_sigma;
real<upper=0> sigma_slope;
real sigma_intercept;
real<lower=0>sigma_sigma;

}
transformed parameters {
real<lower=0> lambda_sigma = lambda_sigma_raw / 1000;
vector[G] sigma = 1.0 ./ sigma_raw;

// Sigma linear model
//vector[G] sigma_mu = exp(lambda * sigma_slope + sigma_0); //the expected values (linear predictor)
vector[G] sigma_mu = gen_inv_logit(sigma_slope, lambda, sigma_inflection, sigma_y_cross) ;
vector[G] sigma_alpha = sigma_mu .* sigma_mu / sigma_sigma; //shape parameter for the gamma distribution
vector[G] sigma_beta = sigma_mu / sigma_sigma; //rate parameter for the gamma distribution
// Sigma
vector[G] sigma = 1.0 ./ sigma_raw;

// Shards - MPI
vector[2*M] lambda_sigma_MPI[n_shards];
for( i in 1:(n_shards) ) {

vector[ (M*2) - (G_per_shard[i]*2) ] buffer = rep_vector(0.0,(M*2) - (G_per_shard[i]*2));

lambda_sigma_MPI[i] = append_row(append_row(
lambda[(G_per_shard_idx[i]+1):(G_per_shard_idx[i+1])],
sigma[(G_per_shard_idx[i]+1):(G_per_shard_idx[i+1])]
), buffer);

for( i in 1:n_shards ) {
int j = 1 + (i-1)*M;
int k = i*M;
lambda_sigma_MPI[i] = append_row(lambda[j:k], sigma[j:k]);
}

}
model {

// Overall properties of the data
lambda_mu ~ gamma(3,2);
lambda_sigma_raw ~ normal(0,1);
//sigma_raw ~ normal(0,1);
exposure_rate ~ normal(0,1);
sum(exposure_rate) ~ normal(0, 0.001 * S);
// Overall properties of the data
int i = 1;
lambda_mu ~ normal(0,2);
lambda_sigma ~ normal(0,2);
lambda_skew ~ normal(0,2);

sigma_inflection ~ normal(0,2);
sigma_y_cross ~ cauchy(0,2);
sigma_slope ~ normal(0,1);
sigma_sigma ~ cauchy(0,2.5);
//sigma_raw ~ normal(0,1);
exposure_rate ~ normal(0,1);
sum(exposure_rate) ~ normal(0, 0.001 * S);

// Gene-wise properties of the data
target += sum( map_rect( lp_reduce , global_parameters , lambda_sigma_MPI , xr , counts_MPI ) );
sigma_intercept ~ normal(0,2);
sigma_slope ~ normal(0,2);
sigma_sigma ~ normal(0,2);

lambda ~ gamma_log_lpdf(lambda_mu, lambda_sigma);
sigma_raw ~ gamma(sigma_alpha,sigma_beta);
// Gene-wise properties of the data
// lambda ~ normal_or_gammaLog(lambda_mu, lambda_sigma, is_prior_asymetric);
lambda ~ skew_normal(lambda_mu,lambda_sigma, lambda_skew);
sigma_raw ~ lognormal(sigma_slope * lambda + sigma_intercept,sigma_sigma);

// Sample from data
// if(omit_data==0) for(g in 1:G)
// counts[symbol_start_end[g,1]:symbol_start_end[g,2]] ~ neg_binomial_2_log(
// exposure_rate[sample_idx[symbol_start_end[g,1]:symbol_start_end[g,2]]] + lambda[g],
// sigma[g]
// );
// Gene-wise properties of the data
target += sum( map_rect( lp_reduce , global_parameters , lambda_sigma_MPI , xr , int_MPI ) );

}