
Model for abundances of pilot syncom #1

Open · wants to merge 17 commits into base: main
5 changes: 5 additions & 0 deletions .gitignore
@@ -0,0 +1,5 @@
.Rproj.user
.Rhistory
.RData
.Ruserdata
*.Rproj
132 changes: 132 additions & 0 deletions lme_pilot/analyze_models.r
@@ -0,0 +1,132 @@
setwd("~/micropopgen/exp/2025/today3/")
library(tidyverse)
library(lme4)
library(brms)


load("2025-03-11.data_for_aplicaciones_lcg/models.rdat")
Dat <- read_tsv("2025-03-11.data_for_aplicaciones_lcg/pilot_dat_perfect_design.tsv") %>%
mutate(community = factor(community, levels = paste0("R", 1:12)))
dat <- Dat %>%
filter(hrs == 24)

summary(m5.t1)

#' Get the effect of community
b_com <- ranef(m5.t1, condVar = TRUE, whichel = "b_com")
res <- tibble(effect = row.names(b_com$b_com),
est = b_com$b_com[,1],
postVar = attr(b_com$b_com, "postVar")[,,]) %>%
mutate(lower = qnorm(p = 0.025, mean = est, sd = sqrt(postVar)),
upper = qnorm(p = 0.975, mean = est, sd = sqrt(postVar)),
pval = 2 * pnorm(q = -abs(est), mean = 0, sd = sqrt(postVar))) %>%
mutate(qval = p.adjust(pval)) %>%
arrange(est) %>%
mutate(effect = factor(effect, levels = effect)) %>%
print(n = 100)
res

res %>%
ggplot(aes(x = est, y = effect)) +
geom_errorbarh(aes(xmin = lower, xmax = upper)) +
geom_point() +
geom_vline(xintercept = 0) +
theme_classic()
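
#' The same extract-and-summarize pattern is repeated below for `b_temp` and
#' `b_temp_com`. A small helper (hypothetical, not part of the original script)
#' could wrap it; it assumes intercept-only random effects, as above.
summarize_ranef <- function(model, term) {
  re <- ranef(model, condVar = TRUE, whichel = term)[[term]]
  tibble(effect = rownames(re),
         est = re[, 1],
         postVar = attr(re, "postVar")[, , ]) %>%
    mutate(lower = qnorm(p = 0.025, mean = est, sd = sqrt(postVar)),
           upper = qnorm(p = 0.975, mean = est, sd = sqrt(postVar)),
           pval = 2 * pnorm(q = -abs(est), mean = 0, sd = sqrt(postVar)),
           qval = p.adjust(pval)) %>%
    arrange(est) %>%
    mutate(effect = factor(effect, levels = effect))
}
# e.g. summarize_ranef(m5.t1, "b_com")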


dat %>%
filter(strain == "ST00046") %>%
filter(added == 1) %>%
# filter(community %in% c("R1", "R2", "R3", "R4", "R5", "R12")) %>%
mutate(effect = as.character(community)) %>%
mutate(effect = replace(effect, effect %in% c("R1", "R2", "R3", "R4"), "pos")) %>%
mutate(effect = replace(effect, effect %in% c("R5", "R12"), "neg")) %>%
ggplot(aes(col = factor(temp))) +
facet_wrap(~effect + temp) +
geom_segment(aes(x = 0, y = i_freq, xend = hrs, yend = count / depth, linetype = exp )) +
ggtitle(label = "ST00046") +
theme_classic()




dat %>%
filter(strain == "ST00046") %>%
filter(added == 1) %>%
filter(community == "R5") %>%
# filter(community %in% c("R1", "R2", "R3", "R4", "R5", "R12")) %>%
mutate(effect = as.character(community)) %>%
mutate(effect = replace(effect, effect %in% c("R1", "R2", "R3", "R4"), "pos")) %>%
mutate(effect = replace(effect, effect %in% c("R5", "R12"), "neg")) %>%
ggplot(aes(col = factor(temp))) +
facet_wrap(~temp) +
geom_segment(aes(x = 0, y = i_freq, xend = hrs, yend = count / depth, linetype = exp )) +
ggtitle(label = "ST00046") +
theme_classic()

#' Get the effect of temp
b_temp <- ranef(m5.t1, condVar = TRUE, whichel = "b_temp")
res <- tibble(effect = row.names(b_temp$b_temp),
est = b_temp$b_temp[,1],
postVar = attr(b_temp$b_temp, "postVar")[,,]) %>%
mutate(lower = qnorm(p = 0.025, mean = est, sd = sqrt(postVar)),
upper = qnorm(p = 0.975, mean = est, sd = sqrt(postVar)),
pval = 2 * pnorm(q = -abs(est), mean = 0, sd = sqrt(postVar))) %>%
mutate(qval = p.adjust(pval)) %>%
arrange(est) %>%
mutate(effect = factor(effect, levels = effect))
res

res %>%
ggplot(aes(x = est, y = effect)) +
geom_errorbarh(aes(xmin = lower, xmax = upper)) +
geom_point() +
geom_vline(xintercept = 0) +
theme_classic()


#' Example of ST00110 showing that it decreases more at 28 than 32 (not significant)
dat %>%
filter(strain == "ST00110") %>%
filter(added == 1) %>%
filter(hrs %in% c(24)) %>%
ggplot(aes(col = factor(temp))) +
facet_wrap(~temp) +
geom_segment(aes(x = 0, y = i_freq, xend = hrs, yend = count / depth, linetype = exp )) +
ggtitle(label = "ST00110") +
theme_classic()



b_temp_com <- ranef(m5.t1, condVar = TRUE, whichel = "b_temp_com")
res <- tibble(effect = row.names(b_temp_com$b_temp_com),
est = b_temp_com$b_temp_com[,1],
postVar = attr(b_temp_com$b_temp_com, "postVar")[,,]) %>%
mutate(lower = qnorm(p = 0.025, mean = est, sd = sqrt(postVar)),
upper = qnorm(p = 0.975, mean = est, sd = sqrt(postVar)),
pval = 2 * pnorm(q = -abs(est), mean = 0, sd = sqrt(postVar))) %>%
mutate(qval = p.adjust(pval)) %>%
arrange(est) %>%
mutate(effect = factor(effect, levels = effect))
res %>%
print(n = 1000)


res %>%
ggplot(aes(x = est, y = effect)) +
geom_errorbarh(aes(xmin = lower, xmax = upper)) +
geom_point() +
geom_vline(xintercept = 0) +
theme_classic()




dat %>%
filter(strain == "ST00143") %>%
filter(added == 1) %>%
filter(community == "R10") %>%
ggplot(aes(col = factor(temp))) +
geom_segment(aes(x = 0, y = i_freq, xend = hrs, yend = count / depth, linetype = exp )) +
ggtitle(label = "ST00143") +
theme_classic()
225 changes: 225 additions & 0 deletions lme_pilot/process_data.r
@@ -0,0 +1,225 @@
#' Script to prepare the pilot data for modelling. We need to put every
#' count observation on its own line in a master data table. We also
#' calculate the frequency of each strain in the inoculum. *Should we also
#' calculate the frequency at time t-1?*

#' The working directory may need to be set differently depending on where the
#' relevant files are. I'm using NSM's ancom_data files downloaded on 2025-03-06.

knitr::opts_knit$set(root.dir = "/Users/sur/lab/exp/2025/today")
library(tidyverse)

#' # Read data

#' We add column names to the sample ids (`id`) and strain ids (`strain`)
Tab <- read_csv("../2025-03-06.pilot_lme4/2025-03-06.ancom_data/ancom_data/freq.csv") %>%
rename(strain = 1)
Tab

Meta <- read_csv("../2025-03-06.pilot_lme4/2025-03-06.ancom_data/ancom_data/meta.csv") %>%
rename(id = 1)
Meta

#' **NOTE**: The following are locations and files on Sur's lab desktop. The
#' layout might be different on someone else's machine.
syncoms <- read_tsv("../../../data/2024_rhizo_pilot_syncom_NS/NS1/syncoms.tsv") %>%
full_join(read_tsv("../../../data/2024_rhizo_pilot_syncom_NS/NS2/syncoms.tsv"),
by = "strain")
syncoms

#' # Format data
#' We add one column per strain to the metadata table to indicate which strains
#' were added to each community: 1 means added, NA means not added (though the
#' strain might still be detected). We create a new object.
meta <- Meta %>%
# filter(community %in% c("R1", "R2")) %>%
left_join(syncoms %>%
# select(strain, R1,R2) %>%
pivot_longer(-strain, names_to = "community", values_to = "presence") %>%
pivot_wider(names_from = "strain", values_from = "presence"),
by = "community")
meta

#' Then we pivot the table so there is one row per *strain x sample*
#' combination. The new column `added` indicates whether the relevant strain
#' was added (1) or not (0)
meta <- meta %>%
pivot_longer(-c("id", "community", "hrs", "techrep", "exp", "temp", "color_comsint", "community_temp"),
names_to = "strain", values_to = "added") %>%
mutate(added = replace_na(added, 0))
meta

#' Before adding count information we need to homogenize the strain names
strains <- Tab$strain
strains <- strains %>%
str_replace("NS_042g_27F", "ST00042") %>%
str_replace("NS_164C_27F", "ST00164") %>%
str_replace("NS_110C_1_27F", "ST00110")
strains
Tab$strain <- strains

#' Now we pivot the count table to have 1 row per *sample x strain* combination
#' and we join it with the metadata table. We also calculate the sequencing
#' depth per sample and add that information to the metadata. We create a new
#' object
tab <- Tab %>%
pivot_longer(-strain, names_to = "id", values_to = "count")
Dat <- meta %>%
left_join(tab %>%
group_by(id) %>%
summarize(depth = sum(count)),
by = "id") %>%
left_join(tab, by = c("id", "strain"))
Dat

#' Calculate relative abundances of each strain in the inoculum
#' of each experiment, and add that information to the data
Dat <- Dat %>%
left_join(Dat %>%
filter(hrs == 0) %>%
group_by(strain, community, exp) %>%
summarise(i_freq = sum(count) / sum(depth),
.groups = "drop"),
by = c("strain", "community", "exp"))
Dat
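
#' The header asks whether we should also calculate each strain's frequency at
#' time *t-1*. A minimal sketch of how that could look (the column `prev_freq`
#' and the object `Dat_lagged` are hypothetical names; it assumes one observation
#' per strain x community x exp x temp x techrep x hrs combination):
Dat_lagged <- Dat %>%
  group_by(strain, community, exp, temp, techrep) %>%
  arrange(hrs, .by_group = TRUE) %>%
  mutate(prev_freq = lag(count / depth)) %>%
  ungroup()
Dat_lagged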

#' # Add modelling variables
#'
#' We add variables for the things we want to model. We are **ignoring time** for
#' the time being ;)
#'
#' ## Assuming the design is perfect
#'
#' We set each effect to NA when the strain wasn't added, and at hrs == 0 (the inoculum)
dat <- Dat %>%
# The effect of community on strain abundance
mutate(b_com = paste0(community, "_", strain)) %>%
mutate(b_com = replace(b_com, added == 0, NA)) %>%
mutate(b_com = replace(b_com, hrs == 0, NA)) %>%

# The effect of temperature on strain abundance
mutate(b_temp = paste0(temp, "_", strain)) %>%
mutate(b_temp = replace(b_temp, added == 0, NA)) %>%
mutate(b_temp = replace(b_temp, hrs == 0, NA)) %>%

# The effect of community *color* on strain abundance
mutate(b_col = paste0(color_comsint, "_", strain)) %>%
mutate(b_col = replace(b_col, added == 0, NA)) %>%
mutate(b_col = replace(b_col, hrs == 0, NA)) %>%

# The effect of biological replicate on strain abundance
mutate(b_rep = paste0(exp, "_", strain)) %>%
mutate(b_rep = replace(b_rep, added == 0, NA)) %>%
mutate(b_rep = replace(b_rep, hrs == 0, NA)) %>%

# The effect of *temperature x community* on strain abundance
mutate(b_temp_com = paste0(temp, "_", community, "_", strain)) %>%
mutate(b_temp_com = replace(b_temp_com, added == 0, NA)) %>%
mutate(b_temp_com = replace(b_temp_com, hrs == 0, NA)) %>%

# Individual observation-level effect (for overdispersion)
mutate(b_obs = as.character(1:n()))
dat

# Dat %>%
# filter(hrs == 0) %>%
# print(n = 500)
#
#
# Dat %>%
# filter(hrs != 0) %>%
# print(n = 1500)

write_tsv(dat, "pilot_dat_perfect_design.tsv")
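
#' For reference, a minimal sketch of the kind of binomial mixed model these
#' grouping columns are meant to feed. The actual model (`m5.t1`, loaded from
#' `models.rdat` in `analyze_models.r`) is not included in this PR, so the
#' formula below is an illustration only; it is left commented out so this
#' preparation script stays lightweight.
# m_sketch <- lme4::glmer(
#   cbind(count, depth - count) ~ 1 +
#     (1 | b_com) + (1 | b_temp) + (1 | b_temp_com) + (1 | b_rep) + (1 | b_obs),
#   data = dat %>% filter(hrs == 24, added == 1),
#   family = binomial(link = "logit"))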


#' ## Based on empirical results


#' ### Error in design construction (procrastination)
#'
#' In general there is a very large difference between the expected (added == 1)
#' and unexpected (added == 0) strain counts
Dat %>%
# filter(community %in% c("R3", "R4")) %>%
ggplot(aes(x = added == 1, y = count)) +
facet_wrap(~ strain, scales = "free_y") +
geom_point(position = position_jitter(width = 0.1, height = 0)) +
theme_classic()

#' We model `added` as a function of log counts and the other covariates, and
#' do model selection to find the best predictors of `added`

m1 <- glm(added ~ lcount + strain + lcount * strain + community + lcount * community + exp + shrs + shrs2,
data = Dat %>%
mutate(shrs = scale(hrs)) %>%
mutate(shrs2 = shrs * shrs) %>%
mutate(lcount = log(count + 1)),
family = binomial(link = "logit"))
m1
summary(m1)
drop(anova(m1, test = "LRT"))


m2 <- glm(added ~ lcount + strain + lcount * strain + community + lcount * community + shrs + shrs2,
data = Dat %>%
mutate(shrs = scale(hrs)) %>%
mutate(shrs2 = shrs * shrs) %>%
mutate(lcount = log(count + 1)),
family = binomial(link = "logit"))
m2
summary(m2)
drop(anova(m2, test = "LRT"))


m3 <- glm(added ~ lcount + strain + lcount * strain + community + lcount * community + shrs,
data = Dat %>%
mutate(shrs = scale(hrs)) %>%
mutate(shrs2 = shrs * shrs) %>%
mutate(lcount = log(count + 1)),
family = binomial(link = "logit"))
m3
summary(m3)
drop(anova(m3, test = "LRT"))
AIC(m1,m2, m3)

#' I'm not convinced by this method; we would need some gold standard and
#' training data, and it seems to miss many cases where a strain is present
#' despite not being added
hist(predict(m3, type = "response"))
table(predict(m3, type = "response") >= .8, Dat$added)

Dat %>%
filter(predict(m3, type = "response") >= .8 & added == 0)

Dat %>%
filter(added == 0) %>%
arrange(desc(count)) %>%
filter(count > 0) %>%
print(., n = nrow(.))

#' Instead, let's set up a 200-read threshold for calling a strain present
#' despite not being added
#'
Dat
Dat %>%
# filter(added == 0) %>%
arrange(desc(count)) %>%
mutate(obs = 1:length(count)) %>%
ggplot(aes(x = obs, y = count)) +
geom_point(aes(col = factor(added == 1))) +
geom_hline(yintercept = 200) +
scale_x_log10() +
# scale_y_log10() +
theme_classic()

Dat %>%
filter(added == 0) %>%
arrange(desc(count)) %>%
mutate(obs = 1:length(count)) %>%
ggplot(aes(x = obs, y = count)) +
geom_point(aes(col = factor(added == 1))) +
geom_hline(yintercept = 200) +
scale_x_log10() +
# scale_y_log10() +
theme_classic()
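
#' The threshold above is only visualized; a minimal sketch of how it could be
#' applied to define an empirical presence indicator (the column `present_emp`
#' and the object `Dat_emp` are hypothetical names):
Dat_emp <- Dat %>%
  mutate(present_emp = as.integer(added == 1 | count >= 200))
Dat_emp %>%
  count(added, present_emp)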