
Model for abundances of pilot syncom #1

Open · wants to merge 17 commits into base: main
5 changes: 5 additions & 0 deletions .gitignore
@@ -0,0 +1,5 @@
.Rproj.user
.Rhistory
.RData
.Ruserdata
*.Rproj
132 changes: 132 additions & 0 deletions lme_pilot/analyze_models.r
@@ -0,0 +1,132 @@
setwd("~/micropopgen/exp/2025/today3/")
library(tidyverse)
library(lme4)
library(brms)


load("2025-03-11.data_for_aplicaciones_lcg/models.rdat")
Dat <- read_tsv("2025-03-11.data_for_aplicaciones_lcg/pilot_dat_perfect_design.tsv") %>%
mutate(community = factor(community, levels = paste0("R", 1:12)))
dat <- Dat %>%
filter(hrs == 24)

summary(m5.t1)

#' Get the effect of community
b_com <- ranef(m5.t1, condVar = TRUE, whichel = "b_com")
res <- tibble(effect = row.names(b_com$b_com),
est = b_com$b_com[,1],
postVar = attr(b_com$b_com, "postVar")[,,]) %>%
mutate(lower = qnorm(p = 0.025, mean = est, sd = sqrt(postVar)),
upper = qnorm(p = 0.975, mean = est, sd = sqrt(postVar)),
pval = 2 * pnorm(q = -abs(est), mean = 0, sd = sqrt(postVar))) %>%
mutate(qval = p.adjust(pval)) %>%
arrange(est) %>%
mutate(effect = factor(effect, levels = effect)) %>%
print(n = 100)
res

res %>%
ggplot(aes(x = est, y = effect)) +
geom_errorbarh(aes(xmin = lower, xmax = upper)) +
geom_point() +
geom_vline(xintercept = 0) +
theme_classic()
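
#' The same extract-and-summarize pattern is repeated below for `b_temp` and
#' `b_temp_com`. A small helper (hypothetical, not part of the original script)
#' could wrap it; it assumes intercept-only random effects, as above.
summarize_ranef <- function(model, term) {
  re <- ranef(model, condVar = TRUE, whichel = term)[[term]]
  tibble(effect = rownames(re),
         est = re[, 1],
         postVar = attr(re, "postVar")[, , ]) %>%
    mutate(lower = qnorm(p = 0.025, mean = est, sd = sqrt(postVar)),
           upper = qnorm(p = 0.975, mean = est, sd = sqrt(postVar)),
           pval = 2 * pnorm(q = -abs(est), mean = 0, sd = sqrt(postVar)),
           qval = p.adjust(pval)) %>%
    arrange(est) %>%
    mutate(effect = factor(effect, levels = effect))
}
# e.g. summarize_ranef(m5.t1, "b_com")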


dat %>%
filter(strain == "ST00046") %>%
filter(added == 1) %>%
# filter(community %in% c("R1", "R2", "R3", "R4", "R5", "R12")) %>%
mutate(effect = as.character(community)) %>%
mutate(effect = replace(effect, effect %in% c("R1", "R2", "R3", "R4"), "pos")) %>%
mutate(effect = replace(effect, effect %in% c("R5", "R12"), "neg")) %>%
ggplot(aes(col = factor(temp))) +
facet_wrap(~effect + temp) +
geom_segment(aes(x = 0, y = i_freq, xend = hrs, yend = count / depth, linetype = exp )) +
ggtitle(label = "ST00046") +
theme_classic()




dat %>%
filter(strain == "ST00046") %>%
filter(added == 1) %>%
filter(community == "R5") %>%
# filter(community %in% c("R1", "R2", "R3", "R4", "R5", "R12")) %>%
mutate(effect = as.character(community)) %>%
mutate(effect = replace(effect, effect %in% c("R1", "R2", "R3", "R4"), "pos")) %>%
mutate(effect = replace(effect, effect %in% c("R5", "R12"), "neg")) %>%
ggplot(aes(col = factor(temp))) +
facet_wrap(~temp) +
geom_segment(aes(x = 0, y = i_freq, xend = hrs, yend = count / depth, linetype = exp )) +
ggtitle(label = "ST00046") +
theme_classic()

#' Get the effect of temp
b_temp <- ranef(m5.t1, condVar = TRUE, whichel = "b_temp")
res <- tibble(effect = row.names(b_temp$b_temp),
est = b_temp$b_temp[,1],
postVar = attr(b_temp$b_temp, "postVar")[,,]) %>%
mutate(lower = qnorm(p = 0.025, mean = est, sd = sqrt(postVar)),
upper = qnorm(p = 0.975, mean = est, sd = sqrt(postVar)),
pval = 2 * pnorm(q = -abs(est), mean = 0, sd = sqrt(postVar))) %>%
mutate(qval = p.adjust(pval)) %>%
arrange(est) %>%
mutate(effect = factor(effect, levels = effect))
res

res %>%
ggplot(aes(x = est, y = effect)) +
geom_errorbarh(aes(xmin = lower, xmax = upper)) +
geom_point() +
geom_vline(xintercept = 0) +
theme_classic()


#' Example of ST00110 showing that it decreases more at 28 than 32 (not significant)
dat %>%
filter(strain == "ST00110") %>%
filter(added == 1) %>%
filter(hrs %in% c(24)) %>%
ggplot(aes(col = factor(temp))) +
facet_wrap(~temp) +
geom_segment(aes(x = 0, y = i_freq, xend = hrs, yend = count / depth, linetype = exp )) +
ggtitle(label = "ST00110") +
theme_classic()



b_temp_com <- ranef(m5.t1, condVar = TRUE, whichel = "b_temp_com")
res <- tibble(effect = row.names(b_temp_com$b_temp_com),
est = b_temp_com$b_temp_com[,1],
postVar = attr(b_temp_com$b_temp_com, "postVar")[,,]) %>%
mutate(lower = qnorm(p = 0.025, mean = est, sd = sqrt(postVar)),
upper = qnorm(p = 0.975, mean = est, sd = sqrt(postVar)),
pval = 2 * pnorm(q = -abs(est), mean = 0, sd = sqrt(postVar))) %>%
mutate(qval = p.adjust(pval)) %>%
arrange(est) %>%
mutate(effect = factor(effect, levels = effect))
res %>%
print(n = 1000)


res %>%
ggplot(aes(x = est, y = effect)) +
geom_errorbarh(aes(xmin = lower, xmax = upper)) +
geom_point() +
geom_vline(xintercept = 0) +
theme_classic()




dat %>%
filter(strain == "ST00143") %>%
filter(added == 1) %>%
filter(community == "R10") %>%
ggplot(aes(col = factor(temp))) +
geom_segment(aes(x = 0, y = i_freq, xend = hrs, yend = count / depth, linetype = exp )) +
ggtitle(label = "ST00143") +
theme_classic()
225 changes: 225 additions & 0 deletions lme_pilot/process_data.r
@@ -0,0 +1,225 @@
#' Script to prepare the pilot data for modelling. We need to put every
#' count observation on its own line in a master data table. We also
#' calculate the frequency of each strain in the inoculum. *Should we also
#' calculate the frequency at time t-1?*

#' The working directory may need to be set differently depending on where the
#' relevant files are. I'm using NSM's ancom_data files downloaded on 2025-03-06.

knitr::opts_knit$set(root.dir = "/Users/sur/lab/exp/2025/today")
library(tidyverse)

#' # Read data

#' We add column names to the sample ids (`id`) and strain ids (`strain`)
Tab <- read_csv("../2025-03-06.pilot_lme4/2025-03-06.ancom_data/ancom_data/freq.csv") %>%
rename(strain = 1)
Tab

Meta <- read_csv("../2025-03-06.pilot_lme4/2025-03-06.ancom_data/ancom_data/meta.csv") %>%
rename(id = 1)
Meta

#' **NOTE**: The following are locations and files on Sur's lab desktop. The
#' layout might be different on someone else's machine.
syncoms <- read_tsv("../../../data/2024_rhizo_pilot_syncom_NS/NS1/syncoms.tsv") %>%
full_join(read_tsv("../../../data/2024_rhizo_pilot_syncom_NS/NS2/syncoms.tsv"),
by = "strain")
syncoms

#' # Format data
#' We add one column per strain to the metadata table to indicate which strains
#' were added to each community: 1 means added, NA means not added (though the
#' strain might still be detected). We create a new object.
meta <- Meta %>%
# filter(community %in% c("R1", "R2")) %>%
left_join(syncoms %>%
# select(strain, R1,R2) %>%
pivot_longer(-strain, names_to = "community", values_to = "presence") %>%
pivot_wider(names_from = "strain", values_from = "presence"),
by = "community")
meta

#' Then we pivot the table so there is one row per *strain x sample*
#' combination. The new column `added` indicates whether the relevant strain
#' was added (1) or not (0)
meta <- meta %>%
pivot_longer(-c("id", "community", "hrs", "techrep", "exp", "temp", "color_comsint", "community_temp"),
names_to = "strain", values_to = "added") %>%
mutate(added = replace_na(added, 0))
meta

#' Before adding count information we need to homogenize the strain names
strains <- Tab$strain
strains <- strains %>%
str_replace("NS_042g_27F", "ST00042") %>%
str_replace("NS_164C_27F", "ST00164") %>%
str_replace("NS_110C_1_27F", "ST00110")
strains
Tab$strain <- strains

#' Now we pivot the count table to have 1 row per *sample x strain* combination
#' and we join it with the metadata table. We also calculate the sequencing
#' depth per sample and add that information to the metadata. We create a new
#' object
tab <- Tab %>%
pivot_longer(-strain, names_to = "id", values_to = "count")
Dat <- meta %>%
left_join(tab %>%
group_by(id) %>%
summarize(depth = sum(count)),
by = "id") %>%
left_join(tab, by = c("id", "strain"))
Dat

#' Calculate relative abundances of each strain in the inoculum
#' of each experiment, and add that information to the data
Dat <- Dat %>%
left_join(Dat %>%
filter(hrs == 0) %>%
group_by(strain, community, exp) %>%
summarise(i_freq = sum(count) / sum(depth),
.groups = "drop"),
by = c("strain", "community", "exp"))
Dat
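
#' The header asks whether we should also calculate each strain's frequency at
#' time *t-1*. A minimal sketch of how that could look (the column `prev_freq`
#' and the object `Dat_lagged` are hypothetical names; it assumes one observation
#' per strain x community x exp x temp x techrep x hrs combination):
Dat_lagged <- Dat %>%
  group_by(strain, community, exp, temp, techrep) %>%
  arrange(hrs, .by_group = TRUE) %>%
  mutate(prev_freq = lag(count / depth)) %>%
  ungroup()
Dat_lagged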

#' # Add modelling variables
#'
#' We add variables for the things we want to model. We are **ignoring time** for
#' the time being ;)
#'
#' ## Assuming the design is perfect
#'
#' We set each effect to NA when the strain wasn't added, and at hrs == 0 (the inoculum)
dat <- Dat %>%
# The effect of community on strain abundance
mutate(b_com = paste0(community, "_", strain)) %>%
mutate(b_com = replace(b_com, added == 0, NA)) %>%
mutate(b_com = replace(b_com, hrs == 0, NA)) %>%

# The effect of temperature on strain abundance
mutate(b_temp = paste0(temp, "_", strain)) %>%
mutate(b_temp = replace(b_temp, added == 0, NA)) %>%
mutate(b_temp = replace(b_temp, hrs == 0, NA)) %>%

# The effect of community *color* on strain abundance
mutate(b_col = paste0(color_comsint, "_", strain)) %>%
mutate(b_col = replace(b_col, added == 0, NA)) %>%
mutate(b_col = replace(b_col, hrs == 0, NA)) %>%

# The effect of biological replicate on strain abundance
mutate(b_rep = paste0(exp, "_", strain)) %>%
mutate(b_rep = replace(b_rep, added == 0, NA)) %>%
mutate(b_rep = replace(b_rep, hrs == 0, NA)) %>%

# The effect of *temperature x community* on strain abundance
mutate(b_temp_com = paste0(temp, "_", community, "_", strain)) %>%
mutate(b_temp_com = replace(b_temp_com, added == 0, NA)) %>%
mutate(b_temp_com = replace(b_temp_com, hrs == 0, NA)) %>%

# Individual observation-level effect (for overdispersion)
mutate(b_obs = as.character(1:n()))
dat

# Dat %>%
# filter(hrs == 0) %>%
# print(n = 500)
#
#
# Dat %>%
# filter(hrs != 0) %>%
# print(n = 1500)

write_tsv(dat, "pilot_dat_perfect_design.tsv")
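
#' For reference, a minimal sketch of the kind of binomial mixed model these
#' grouping columns are meant to feed. The actual model (`m5.t1`, loaded from
#' `models.rdat` in `analyze_models.r`) is not included in this PR, so the
#' formula below is an illustration only; it is left commented out so this
#' preparation script stays lightweight.
# m_sketch <- lme4::glmer(
#   cbind(count, depth - count) ~ 1 +
#     (1 | b_com) + (1 | b_temp) + (1 | b_temp_com) + (1 | b_rep) + (1 | b_obs),
#   data = dat %>% filter(hrs == 24, added == 1),
#   family = binomial(link = "logit"))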


#' ## Based on empirical results


#' ### Error in design construction (procrastination)
#'
#' In general there is a very large difference between the expected (added == 1)
#' and unexpected (added == 0) strain counts
Dat %>%
# filter(community %in% c("R3", "R4")) %>%
ggplot(aes(x = added == 1, y = count)) +
facet_wrap(~ strain, scales = "free_y") +
geom_point(position = position_jitter(width = 0.1, height = 0)) +
theme_classic()

#' We model `added` as a function of log counts and the other covariates, and
#' do model selection to find the best predictors of `added`

m1 <- glm(added ~ lcount + strain + lcount * strain + community + lcount * community + exp + shrs + shrs2,
data = Dat %>%
mutate(shrs = scale(hrs)) %>%
mutate(shrs2 = shrs * shrs) %>%
mutate(lcount = log(count + 1)),
family = binomial(link = "logit"))
m1
summary(m1)
drop(anova(m1, test = "LRT"))


m2 <- glm(added ~ lcount + strain + lcount * strain + community + lcount * community + shrs + shrs2,
data = Dat %>%
mutate(shrs = scale(hrs)) %>%
mutate(shrs2 = shrs * shrs) %>%
mutate(lcount = log(count + 1)),
family = binomial(link = "logit"))
m2
summary(m2)
drop(anova(m2, test = "LRT"))


m3 <- glm(added ~ lcount + strain + lcount * strain + community + lcount * community + shrs,
data = Dat %>%
mutate(shrs = scale(hrs)) %>%
mutate(shrs2 = shrs * shrs) %>%
mutate(lcount = log(count + 1)),
family = binomial(link = "logit"))
m3
summary(m3)
drop(anova(m3, test = "LRT"))
AIC(m1,m2, m3)

#' I'm not convinced by this method; we would need some gold standard and
#' training data, and it seems to miss many cases where a strain is present
#' despite not being added
hist(predict(m3, type = "response"))
table(predict(m3, type = "response") >= .8, Dat$added)

Dat %>%
filter(predict(m3, type = "response") >= .8 & added == 0)

Dat %>%
filter(added == 0) %>%
arrange(desc(count)) %>%
filter(count > 0) %>%
print(., n = nrow(.))

#' Instead, let's set up a 200-read threshold for calling a strain present
#' despite not being added
#'
Dat
Dat %>%
# filter(added == 0) %>%
arrange(desc(count)) %>%
mutate(obs = 1:length(count)) %>%
ggplot(aes(x = obs, y = count)) +
geom_point(aes(col = factor(added == 1))) +
geom_hline(yintercept = 200) +
scale_x_log10() +
# scale_y_log10() +
theme_classic()

Dat %>%
filter(added == 0) %>%
arrange(desc(count)) %>%
mutate(obs = 1:length(count)) %>%
ggplot(aes(x = obs, y = count)) +
geom_point(aes(col = factor(added == 1))) +
geom_hline(yintercept = 200) +
scale_x_log10() +
# scale_y_log10() +
theme_classic()
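
#' The threshold above is only visualized; a minimal sketch of how it could be
#' applied to define an empirical presence indicator (the column `present_emp`
#' and the object `Dat_emp` are hypothetical names):
Dat_emp <- Dat %>%
  mutate(present_emp = as.integer(added == 1 | count >= 200))
Dat_emp %>%
  count(added, present_emp)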