From 7b74497905a099f11f86785a61f2b28f7b83c9a8 Mon Sep 17 00:00:00 2001 From: sur Date: Thu, 6 Mar 2025 16:22:52 -0600 Subject: [PATCH 01/17] Model pilot strain abundances --- .gitignore | 4 + NSM_comsints.Rproj | 13 ++++ lme_pilot/process_data.r | 159 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 176 insertions(+) create mode 100644 .gitignore create mode 100644 NSM_comsints.Rproj create mode 100644 lme_pilot/process_data.r diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..5b6a065 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +.Rproj.user +.Rhistory +.RData +.Ruserdata diff --git a/NSM_comsints.Rproj b/NSM_comsints.Rproj new file mode 100644 index 0000000..8e3c2eb --- /dev/null +++ b/NSM_comsints.Rproj @@ -0,0 +1,13 @@ +Version: 1.0 + +RestoreWorkspace: Default +SaveWorkspace: Default +AlwaysSaveHistory: Default + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 2 +Encoding: UTF-8 + +RnwWeave: Sweave +LaTeX: pdfLaTeX diff --git a/lme_pilot/process_data.r b/lme_pilot/process_data.r new file mode 100644 index 0000000..ee32d90 --- /dev/null +++ b/lme_pilot/process_data.r @@ -0,0 +1,159 @@ +setwd("/Users/sur/lab/exp/2025/today") + +library(tidyverse) + +Tab <- read_csv("2025-03-06.ancom_data/ancom_data/freq.csv") %>% + rename(strain = 1) +Tab + +Meta <- read_csv("2025-03-06.ancom_data/ancom_data/meta.csv") %>% + rename(id = 1) +Meta + + + +syncoms <- read_tsv("../../../data/2024_rhizo_pilot_syncom_NS/NS1/syncoms.tsv") + +#' Select two communities to test + +# %>% select(community, ST00046, ST00154, ST00101, ST00109, ST00042, ST00060 + +meta <- Meta %>% + filter(community %in% c("R1", "R2")) %>% + left_join(syncoms %>% + select(strain, R1,R2) %>% + pivot_longer(-strain, names_to = "community", values_to = "presence") %>% + pivot_wider(names_from = "strain", values_from = "presence"), + by = "community") + +meta %>% + print(n = 100) + +meta <- meta %>% + pivot_longer(-c("id", "community", "hrs", "techrep", "exp", "temp", "color_comsint", "community_temp"), + names_to = "strain", values_to = "added") %>% + mutate(added = replace_na(added, 0)) + +meta %>% + print(n = 100) + + +strains <- Tab$strain +strains <- strains %>% + str_replace("NS_042g_27F", "ST00042") %>% + str_replace("NS_164C_27F", "ST00164") %>% + str_replace("NS_110C_1_27F", "ST00110") +strains +Tab$strain <- strains + + +tab <- Tab %>% + pivot_longer(-strain, names_to = "id", values_to = "count") +tab + + + +Dat <- meta %>% + left_join(tab %>% + group_by(id) %>% + summarize(depth = sum(count)), + by = "id") %>% + left_join(tab, by = c("id", "strain")) +Dat + +#' Calculate relative abundances of each strain on the inoculum +#' of each experiment + +Dat <- Dat %>% + left_join(Dat %>% + filter(hrs == 0) %>% + group_by(strain, community, exp) %>% + summarise(i_freq = sum(count) / sum(depth), + .groups = "drop"), + by = c("strain", "community", "exp")) +Dat + + +#' Confirm that count match expected species +#' In general it does. ST00060 seems to be the only semi problematic +Dat %>% + ggplot(aes(x = added == 1, y = count)) + + facet_wrap(~ strain, scales = "free_y") + + geom_point(position = position_jitter(width = 0.1, height = 0)) + + theme_classic() + + + +#' # Model with lme4 +library(lme4) +Dat + + +dat <- Dat %>% + mutate(b_com = paste0(community, "_", strain)) %>% + mutate(b_com = replace(b_com, added == 0, NA)) %>% + + mutate(b_rep = paste0(exp, "_", strain)) %>% + mutate(b_rep = replace(b_rep, added == 0, NA)) %>% + + mutate(b_temp = paste0(temp, "_", strain)) %>% + mutate(b_temp = replace(b_temp, added == 0, NA)) %>% + + mutate(b_obs = as.character(1:n())) +dat + +m1 <- glmer(count ~ log(depth) + i_freq + (1|b_com) + + (1|b_rep) + (1|b_temp) + (1|b_obs), + data = dat %>% + filter(hrs == 24), + family = poisson(link = log) ) +summary(m1) +AIC(m1) + +#' Get the effect of community +b_com <- ranef(m1, condVar = TRUE, whichel = "b_com", postVar = TRUE) +res <- tibble(effect = row.names(b_com$b_com), + est = b_com$b_com[,1], + postVar = attr(b_com$b_com, "postVar")[,,]) %>% + mutate(lower = qnorm(p = 0.025, mean = est, sd = sqrt(postVar)), + upper = qnorm(p = 0.975, mean = est, sd = sqrt(postVar)), + pval = 2 * pnorm(q = -abs(est), mean = 0, sd = sqrt(postVar))) %>% + mutate(qval = p.adjust(pval)) %>% + arrange(est) %>% + mutate(effect = factor(effect, levels = effect)) +res + +res %>% + ggplot(aes(x = est, y = effect)) + + geom_errorbarh(aes(xmin = lower, xmax = upper)) + + geom_point() + + geom_vline(xintercept = 0) + + theme_classic() + + +#' Get the effect of temp +b_temp <- ranef(m1, condVar = TRUE, whichel = "b_temp", postVar = TRUE) +res <- tibble(effect = row.names(b_temp$b_temp), + est = b_temp$b_temp[,1], + postVar = attr(b_temp$b_temp, "postVar")[,,]) %>% + mutate(lower = qnorm(p = 0.025, mean = est, sd = sqrt(postVar)), + upper = qnorm(p = 0.975, mean = est, sd = sqrt(postVar)), + pval = 2 * pnorm(q = -abs(est), mean = 0, sd = sqrt(postVar))) %>% + mutate(qval = p.adjust(pval)) %>% + arrange(est) %>% + mutate(effect = factor(effect, levels = effect)) +res + +res %>% + ggplot(aes(x = est, y = effect)) + + geom_errorbarh(aes(xmin = lower, xmax = upper)) + + geom_point() + + geom_vline(xintercept = 0) + + theme_classic() + + + +dat %>% + filter(strain == "ST00046") + + From fcec68109c9acdfbae651c3d4b78cd82b2fcaab8 Mon Sep 17 00:00:00 2001 From: sur Date: Fri, 7 Mar 2025 11:20:27 -0600 Subject: [PATCH 02/17] Script to generate report --- lme_pilot/process_data.r | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/lme_pilot/process_data.r b/lme_pilot/process_data.r index ee32d90..06c4564 100644 --- a/lme_pilot/process_data.r +++ b/lme_pilot/process_data.r @@ -109,6 +109,7 @@ m1 <- glmer(count ~ log(depth) + i_freq + (1|b_com) + family = poisson(link = log) ) summary(m1) AIC(m1) +BIC(m1) #' Get the effect of community b_com <- ranef(m1, condVar = TRUE, whichel = "b_com", postVar = TRUE) @@ -152,8 +153,13 @@ res %>% theme_classic() - +#' Example of ST00046 showing that it decreases more at 28 than 32 (not significant) dat %>% - filter(strain == "ST00046") + filter(strain == "ST00046") %>% + filter(hrs %in% c(24)) %>% + ggplot(aes(col = factor(temp))) + + facet_wrap(~community) + + geom_segment(aes(x = 0, y = i_freq, xend = temp, yend = count / depth, linetype = exp )) + + theme_classic() From 767759f1f855c9b1110c11b5ecf15e107863834a Mon Sep 17 00:00:00 2001 From: sur Date: Fri, 7 Mar 2025 11:46:17 -0600 Subject: [PATCH 03/17] Comenting --- lme_pilot/process_data.r | 63 ++++++++------ lme_pilot/run_lme4_model.r | 165 +++++++++++++++++++++++++++++++++++++ 2 files changed, 203 insertions(+), 25 deletions(-) create mode 100644 lme_pilot/run_lme4_model.r diff --git a/lme_pilot/process_data.r b/lme_pilot/process_data.r index 06c4564..7d3dbd8 100644 --- a/lme_pilot/process_data.r +++ b/lme_pilot/process_data.r @@ -1,7 +1,17 @@ -setwd("/Users/sur/lab/exp/2025/today") +#' Script to prepare data from pilot for modelling. We require to put every +#' count observation in a single line in a master data table. We are also +#' Calculating the frequency of inoculum. *Should we also calculate frequency +#' at time t-1* + +#' The working directory may be set different depending on where are the +#' relevant files. I'm using NSM's ancom_data files downloaded on 2025-03-06. +setwd("/Users/sur/lab/exp/2025/today") library(tidyverse) +#' # Read data + +#' We add column names to the sample ids (`id`) and strain ids (`strain`) Tab <- read_csv("2025-03-06.ancom_data/ancom_data/freq.csv") %>% rename(strain = 1) Tab @@ -10,34 +20,36 @@ Meta <- read_csv("2025-03-06.ancom_data/ancom_data/meta.csv") %>% rename(id = 1) Meta - - -syncoms <- read_tsv("../../../data/2024_rhizo_pilot_syncom_NS/NS1/syncoms.tsv") - -#' Select two communities to test - -# %>% select(community, ST00046, ST00154, ST00101, ST00109, ST00042, ST00060 - +#' **NOTE**: The following are locations and files in Sur's lab desktop. Format +#' might be different in someone else +syncoms <- read_tsv("../../../data/2024_rhizo_pilot_syncom_NS/NS1/syncoms.tsv") %>% + full_join(read_tsv("../../../data/2024_rhizo_pilot_syncom_NS/NS2/syncoms.tsv"), + by = "strain") +syncoms + +#' # Format data +#' We add a column per strain to the metadata table to indicate which species +#' were added to that community, 1 means added, NA means not added (though +#' it might be detected still). We create a new object. meta <- Meta %>% - filter(community %in% c("R1", "R2")) %>% + # filter(community %in% c("R1", "R2")) %>% left_join(syncoms %>% select(strain, R1,R2) %>% pivot_longer(-strain, names_to = "community", values_to = "presence") %>% pivot_wider(names_from = "strain", values_from = "presence"), by = "community") +meta -meta %>% - print(n = 100) - +#' Then we pivot the table so there is one row per *strain x sample* +#' combination. The new column added indicates if the relevant strain +#' was added (1) or not (0) meta <- meta %>% pivot_longer(-c("id", "community", "hrs", "techrep", "exp", "temp", "color_comsint", "community_temp"), names_to = "strain", values_to = "added") %>% mutate(added = replace_na(added, 0)) +meta -meta %>% - print(n = 100) - - +#' Before adding count information we need to homogenize the strain names strains <- Tab$strain strains <- strains %>% str_replace("NS_042g_27F", "ST00042") %>% @@ -46,13 +58,12 @@ strains <- strains %>% strains Tab$strain <- strains - +#' Now we pivot the count table to have 1 row per *sample x strain* combination +#' and we join it with the metadata table. We also calculate the sequencing +#' depth per sample and add that information to the metadata. We create a new +#' object tab <- Tab %>% pivot_longer(-strain, names_to = "id", values_to = "count") -tab - - - Dat <- meta %>% left_join(tab %>% group_by(id) %>% @@ -61,9 +72,8 @@ Dat <- meta %>% left_join(tab, by = c("id", "strain")) Dat -#' Calculate relative abundances of each strain on the inoculum -#' of each experiment - +#' Calculate relative abundances of each strain in the inoculum +#' of each experiment, and add that information to the data Dat <- Dat %>% left_join(Dat %>% filter(hrs == 0) %>% @@ -74,6 +84,9 @@ Dat <- Dat %>% Dat + + + #' Confirm that count match expected species #' In general it does. ST00060 seems to be the only semi problematic Dat %>% diff --git a/lme_pilot/run_lme4_model.r b/lme_pilot/run_lme4_model.r new file mode 100644 index 0000000..3cc8055 --- /dev/null +++ b/lme_pilot/run_lme4_model.r @@ -0,0 +1,165 @@ +setwd("/Users/sur/lab/exp/2025/today") + +library(tidyverse) + +Tab <- read_csv("2025-03-06.ancom_data/ancom_data/freq.csv") %>% + rename(strain = 1) +Tab + +Meta <- read_csv("2025-03-06.ancom_data/ancom_data/meta.csv") %>% + rename(id = 1) +Meta + + + +syncoms <- read_tsv("../../../data/2024_rhizo_pilot_syncom_NS/NS1/syncoms.tsv") + +#' Select two communities to test + +# %>% select(community, ST00046, ST00154, ST00101, ST00109, ST00042, ST00060 + +meta <- Meta %>% + filter(community %in% c("R1", "R2")) %>% + left_join(syncoms %>% + select(strain, R1,R2) %>% + pivot_longer(-strain, names_to = "community", values_to = "presence") %>% + pivot_wider(names_from = "strain", values_from = "presence"), + by = "community") + +meta %>% + print(n = 100) + +meta <- meta %>% + pivot_longer(-c("id", "community", "hrs", "techrep", "exp", "temp", "color_comsint", "community_temp"), + names_to = "strain", values_to = "added") %>% + mutate(added = replace_na(added, 0)) + +meta %>% + print(n = 100) + + +strains <- Tab$strain +strains <- strains %>% + str_replace("NS_042g_27F", "ST00042") %>% + str_replace("NS_164C_27F", "ST00164") %>% + str_replace("NS_110C_1_27F", "ST00110") +strains +Tab$strain <- strains + + +tab <- Tab %>% + pivot_longer(-strain, names_to = "id", values_to = "count") +tab + + + +Dat <- meta %>% + left_join(tab %>% + group_by(id) %>% + summarize(depth = sum(count)), + by = "id") %>% + left_join(tab, by = c("id", "strain")) +Dat + +#' Calculate relative abundances of each strain on the inoculum +#' of each experiment + +Dat <- Dat %>% + left_join(Dat %>% + filter(hrs == 0) %>% + group_by(strain, community, exp) %>% + summarise(i_freq = sum(count) / sum(depth), + .groups = "drop"), + by = c("strain", "community", "exp")) +Dat + + +#' Confirm that count match expected species +#' In general it does. ST00060 seems to be the only semi problematic +Dat %>% + ggplot(aes(x = added == 1, y = count)) + + facet_wrap(~ strain, scales = "free_y") + + geom_point(position = position_jitter(width = 0.1, height = 0)) + + theme_classic() + + + +#' # Model with lme4 +library(lme4) +Dat + + +dat <- Dat %>% + mutate(b_com = paste0(community, "_", strain)) %>% + mutate(b_com = replace(b_com, added == 0, NA)) %>% + + mutate(b_rep = paste0(exp, "_", strain)) %>% + mutate(b_rep = replace(b_rep, added == 0, NA)) %>% + + mutate(b_temp = paste0(temp, "_", strain)) %>% + mutate(b_temp = replace(b_temp, added == 0, NA)) %>% + + mutate(b_obs = as.character(1:n())) +dat + +m1 <- glmer(count ~ log(depth) + i_freq + (1|b_com) + + (1|b_rep) + (1|b_temp) + (1|b_obs), + data = dat %>% + filter(hrs == 24), + family = poisson(link = log) ) +summary(m1) +AIC(m1) +BIC(m1) + +#' Get the effect of community +b_com <- ranef(m1, condVar = TRUE, whichel = "b_com", postVar = TRUE) +res <- tibble(effect = row.names(b_com$b_com), + est = b_com$b_com[,1], + postVar = attr(b_com$b_com, "postVar")[,,]) %>% + mutate(lower = qnorm(p = 0.025, mean = est, sd = sqrt(postVar)), + upper = qnorm(p = 0.975, mean = est, sd = sqrt(postVar)), + pval = 2 * pnorm(q = -abs(est), mean = 0, sd = sqrt(postVar))) %>% + mutate(qval = p.adjust(pval)) %>% + arrange(est) %>% + mutate(effect = factor(effect, levels = effect)) +res + +res %>% + ggplot(aes(x = est, y = effect)) + + geom_errorbarh(aes(xmin = lower, xmax = upper)) + + geom_point() + + geom_vline(xintercept = 0) + + theme_classic() + + +#' Get the effect of temp +b_temp <- ranef(m1, condVar = TRUE, whichel = "b_temp", postVar = TRUE) +res <- tibble(effect = row.names(b_temp$b_temp), + est = b_temp$b_temp[,1], + postVar = attr(b_temp$b_temp, "postVar")[,,]) %>% + mutate(lower = qnorm(p = 0.025, mean = est, sd = sqrt(postVar)), + upper = qnorm(p = 0.975, mean = est, sd = sqrt(postVar)), + pval = 2 * pnorm(q = -abs(est), mean = 0, sd = sqrt(postVar))) %>% + mutate(qval = p.adjust(pval)) %>% + arrange(est) %>% + mutate(effect = factor(effect, levels = effect)) +res + +res %>% + ggplot(aes(x = est, y = effect)) + + geom_errorbarh(aes(xmin = lower, xmax = upper)) + + geom_point() + + geom_vline(xintercept = 0) + + theme_classic() + + +#' Example of ST00046 showing that it decreases more at 28 than 32 (not significant) +dat %>% + filter(strain == "ST00046") %>% + filter(hrs %in% c(24)) %>% + ggplot(aes(col = factor(temp))) + + facet_wrap(~community) + + geom_segment(aes(x = 0, y = i_freq, xend = temp, yend = count / depth, linetype = exp )) + + theme_classic() + + From 24bce90e57f6de064f2e49e8822c4363af882e17 Mon Sep 17 00:00:00 2001 From: sur Date: Fri, 7 Mar 2025 11:57:03 -0600 Subject: [PATCH 04/17] Found error in added --- lme_pilot/process_data.r | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/lme_pilot/process_data.r b/lme_pilot/process_data.r index 7d3dbd8..14a84f4 100644 --- a/lme_pilot/process_data.r +++ b/lme_pilot/process_data.r @@ -83,6 +83,8 @@ Dat <- Dat %>% by = c("strain", "community", "exp")) Dat +#' # Add modelling variables + @@ -90,6 +92,7 @@ Dat #' Confirm that count match expected species #' In general it does. ST00060 seems to be the only semi problematic Dat %>% + filter(community %in% c("R3", "R4")) %>% ggplot(aes(x = added == 1, y = count)) + facet_wrap(~ strain, scales = "free_y") + geom_point(position = position_jitter(width = 0.1, height = 0)) + @@ -97,6 +100,11 @@ Dat %>% +Dat %>% filter(strain == "ST00046") %>% filter(community %in% c("R3","R4")) %>% arrange(added) %>% print(n = 200) + +Dat %>% filter(community %in% c("R3","R4")) %>% + filter(added == 1) + #' # Model with lme4 library(lme4) Dat From 0571b3d3a655d181ec332c137866aba4f554fc5d Mon Sep 17 00:00:00 2001 From: sur Date: Fri, 7 Mar 2025 16:24:20 -0600 Subject: [PATCH 05/17] Fixed error in added line, cleaning code --- lme_pilot/process_data.r | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/lme_pilot/process_data.r b/lme_pilot/process_data.r index 14a84f4..7824302 100644 --- a/lme_pilot/process_data.r +++ b/lme_pilot/process_data.r @@ -34,7 +34,7 @@ syncoms meta <- Meta %>% # filter(community %in% c("R1", "R2")) %>% left_join(syncoms %>% - select(strain, R1,R2) %>% + # select(strain, R1,R2) %>% pivot_longer(-strain, names_to = "community", values_to = "presence") %>% pivot_wider(names_from = "strain", values_from = "presence"), by = "community") @@ -85,14 +85,36 @@ Dat #' # Add modelling variables +#' We add variables for things we want to model. We are **ignoring time** for +#' the time (;)) being +Dat <- Dat %>% + # The effect of community on strain abundance + mutate(b_com = paste0(community, "_", strain)) %>% + mutate(b_com = replace(b_com, added == 0, NA)) %>% + + # The effect of biological replicat on strain abundance + mutate(b_rep = paste0(exp, "_", strain)) %>% + mutate(b_rep = replace(b_rep, added == 0, NA)) %>% + + # The effect of temperature on strain abundance + mutate(b_temp = paste0(temp, "_", strain)) %>% + mutate(b_temp = replace(b_temp, added == 0, NA)) %>% + + # The effect of *temperature x community* on strain abundance + + # The effect of community *color* on strain abundance + + # Individual observation-level effect (for overdispersion) + mutate(b_obs = as.character(1:n())) +Dat #' Confirm that count match expected species #' In general it does. ST00060 seems to be the only semi problematic Dat %>% - filter(community %in% c("R3", "R4")) %>% + # filter(community %in% c("R3", "R4")) %>% ggplot(aes(x = added == 1, y = count)) + facet_wrap(~ strain, scales = "free_y") + geom_point(position = position_jitter(width = 0.1, height = 0)) + @@ -100,7 +122,7 @@ Dat %>% -Dat %>% filter(strain == "ST00046") %>% filter(community %in% c("R3","R4")) %>% arrange(added) %>% print(n = 200) +Dat %>% filter(strain == "ST00046") %>% arrange(added) %>% print(n = 200) Dat %>% filter(community %in% c("R3","R4")) %>% filter(added == 1) From a1397407d55e9b6c89f4e8e556bc3764263dce61 Mon Sep 17 00:00:00 2001 From: sur Date: Fri, 7 Mar 2025 18:26:42 -0600 Subject: [PATCH 06/17] Trying to get a definition of preence when not added --- lme_pilot/process_data.r | 107 ++++++++++++++++++++++++++++++++------- 1 file changed, 90 insertions(+), 17 deletions(-) diff --git a/lme_pilot/process_data.r b/lme_pilot/process_data.r index 7824302..77c45b4 100644 --- a/lme_pilot/process_data.r +++ b/lme_pilot/process_data.r @@ -84,35 +84,60 @@ Dat <- Dat %>% Dat #' # Add modelling variables - +#' #' We add variables for things we want to model. We are **ignoring time** for #' the time (;)) being - -Dat <- Dat %>% +#' +#' ## Assuming design is perfect +#' +#' We discard effects when a strain wasn't added +dat <- Dat %>% # The effect of community on strain abundance mutate(b_com = paste0(community, "_", strain)) %>% mutate(b_com = replace(b_com, added == 0, NA)) %>% - - # The effect of biological replicat on strain abundance - mutate(b_rep = paste0(exp, "_", strain)) %>% - mutate(b_rep = replace(b_rep, added == 0, NA)) %>% + mutate(b_com = replace(b_com, hrs == 0, NA)) %>% # The effect of temperature on strain abundance mutate(b_temp = paste0(temp, "_", strain)) %>% mutate(b_temp = replace(b_temp, added == 0, NA)) %>% - - # The effect of *temperature x community* on strain abundance + mutate(b_temp = replace(b_temp, hrs == 0, NA)) %>% # The effect of community *color* on strain abundance + mutate(b_col = paste0(color_comsint, "_", strain)) %>% + mutate(b_col = replace(b_col, added == 0, NA)) %>% + mutate(b_col = replace(b_col, hrs == 0, NA)) %>% + + # The effect of biological replicate on strain abundance + mutate(b_rep = paste0(exp, "_", strain)) %>% + mutate(b_rep = replace(b_rep, added == 0, NA)) %>% + mutate(b_rep = replace(b_rep, hrs == 0, NA)) %>% + # The effect of *temperature x community* on strain abundance + mutate(b_temp_com = paste0(temp, "_", community, "_", strain)) %>% + mutate(b_temp_com = replace(b_temp_com, added == 0, NA)) %>% + mutate(b_temp_com = replace(b_temp_com, hrs == 0, NA)) %>% + # Individual observation-level effect (for overdispersion) mutate(b_obs = as.character(1:n())) -Dat +dat + +# Dat %>% +# filter(hrs == 0) %>% +# print(n = 500) +# +# +# Dat %>% +# filter(hrs != 0) %>% +# print(n = 1500) +write_tsv(dat, "pilot_dat_perfect_design.tsv") -#' Confirm that count match expected species -#' In general it does. ST00060 seems to be the only semi problematic +#' ## Based on empirical results +#' ### Error in design construction +#' +#' In general there is a very large difference between the expected (added == 1) +#' and unexpected (added != 0) strain counts Dat %>% # filter(community %in% c("R3", "R4")) %>% ggplot(aes(x = added == 1, y = count)) + @@ -120,13 +145,61 @@ Dat %>% geom_point(position = position_jitter(width = 0.1, height = 0)) + theme_classic() +#' We model added as a function of counts and depth and do model selection +#' To find the best predictors of added + +m1 <- glm(added ~ lcount + strain + lcount * strain + community + lcount * community + exp + shrs + shrs2, + data = Dat %>% + mutate(shrs = scale(hrs)) %>% + mutate(shrs2 = shrs * shrs) %>% + mutate(lcount = log(count + 1)), + family = binomial(link = "logit")) +m1 +summary(m1) +drop(anova(m1, test = "LRT")) + + +m2 <- glm(added ~ lcount + strain + lcount * strain + community + lcount * community + shrs + shrs2, + data = Dat %>% + mutate(shrs = scale(hrs)) %>% + mutate(shrs2 = shrs * shrs) %>% + mutate(lcount = log(count + 1)), + family = binomial(link = "logit")) +m2 +summary(m2) +drop(anova(m2, test = "LRT")) + + +m3 <- glm(added ~ lcount + strain + lcount * strain + community + lcount * community + shrs, + data = Dat %>% + mutate(shrs = scale(hrs)) %>% + mutate(shrs2 = shrs * shrs) %>% + mutate(lcount = log(count + 1)), + family = binomial(link = "logit")) +m3 +summary(m3) +drop(anova(m3, test = "LRT")) +AIC(m1,m2, m3) + +#' I'm not conviced of using this method, we would need some gold standard +#' and training, it seems to miss manu cases where strain is there despite not +#' added +hist(predict(m3, type = "response")) +table(predict(m3, type = "response") >= .8, Dat$added) + +Dat %>% + filter(predict(m3, type = "response") >= .8 & added == 0) + +Dat %>% + filter(added == 0) %>% + arrange(desc(count)) %>% + filter(count > 0) %>% + print(., n = nrow(.)) + +#' Instead lets setup a 200 read threshold for calling a strain present despite +#' not being added -Dat %>% filter(strain == "ST00046") %>% arrange(added) %>% print(n = 200) - -Dat %>% filter(community %in% c("R3","R4")) %>% - filter(added == 1) - #' # Model with lme4 library(lme4) Dat From 796e5438b41b692516717a9849d1120769fb0b3b Mon Sep 17 00:00:00 2001 From: sur Date: Mon, 10 Mar 2025 12:29:41 -0600 Subject: [PATCH 07/17] First data processing Need to check if it works when a strain is not added (currently NA) Need to modify to include effect when strain present even if not added --- lme_pilot/process_data.r | 114 ++++++++++----------------------------- 1 file changed, 29 insertions(+), 85 deletions(-) diff --git a/lme_pilot/process_data.r b/lme_pilot/process_data.r index 77c45b4..e441301 100644 --- a/lme_pilot/process_data.r +++ b/lme_pilot/process_data.r @@ -6,17 +6,17 @@ #' The working directory may be set different depending on where are the #' relevant files. I'm using NSM's ancom_data files downloaded on 2025-03-06. -setwd("/Users/sur/lab/exp/2025/today") +knitr::opts_knit$set(root.dir = "/Users/sur/lab/exp/2025/today") library(tidyverse) #' # Read data #' We add column names to the sample ids (`id`) and strain ids (`strain`) -Tab <- read_csv("2025-03-06.ancom_data/ancom_data/freq.csv") %>% +Tab <- read_csv("../2025-03-06.pilot_lme4/2025-03-06.ancom_data/ancom_data/freq.csv") %>% rename(strain = 1) Tab -Meta <- read_csv("2025-03-06.ancom_data/ancom_data/meta.csv") %>% +Meta <- read_csv("../2025-03-06.pilot_lme4/2025-03-06.ancom_data/ancom_data/meta.csv") %>% rename(id = 1) Meta @@ -134,17 +134,19 @@ write_tsv(dat, "pilot_dat_perfect_design.tsv") #' ## Based on empirical results -#' ### Error in design construction -#' -#' In general there is a very large difference between the expected (added == 1) + + +#' ### Error in design construction (procrastination) +#' +#' In general there is a very large difference between the expected (added == 1) #' and unexpected (added != 0) strain counts -Dat %>% +Dat %>% # filter(community %in% c("R3", "R4")) %>% ggplot(aes(x = added == 1, y = count)) + facet_wrap(~ strain, scales = "free_y") + geom_point(position = position_jitter(width = 0.1, height = 0)) + theme_classic() - + #' We model added as a function of counts and depth and do model selection #' To find the best predictors of added @@ -198,84 +200,26 @@ Dat %>% #' Instead lets setup a 200 read threshold for calling a strain present despite #' not being added - - -#' # Model with lme4 -library(lme4) +#' Dat - - -dat <- Dat %>% - mutate(b_com = paste0(community, "_", strain)) %>% - mutate(b_com = replace(b_com, added == 0, NA)) %>% - - mutate(b_rep = paste0(exp, "_", strain)) %>% - mutate(b_rep = replace(b_rep, added == 0, NA)) %>% - - mutate(b_temp = paste0(temp, "_", strain)) %>% - mutate(b_temp = replace(b_temp, added == 0, NA)) %>% - - mutate(b_obs = as.character(1:n())) -dat - -m1 <- glmer(count ~ log(depth) + i_freq + (1|b_com) + - (1|b_rep) + (1|b_temp) + (1|b_obs), - data = dat %>% - filter(hrs == 24), - family = poisson(link = log) ) -summary(m1) -AIC(m1) -BIC(m1) - -#' Get the effect of community -b_com <- ranef(m1, condVar = TRUE, whichel = "b_com", postVar = TRUE) -res <- tibble(effect = row.names(b_com$b_com), - est = b_com$b_com[,1], - postVar = attr(b_com$b_com, "postVar")[,,]) %>% - mutate(lower = qnorm(p = 0.025, mean = est, sd = sqrt(postVar)), - upper = qnorm(p = 0.975, mean = est, sd = sqrt(postVar)), - pval = 2 * pnorm(q = -abs(est), mean = 0, sd = sqrt(postVar))) %>% - mutate(qval = p.adjust(pval)) %>% - arrange(est) %>% - mutate(effect = factor(effect, levels = effect)) -res - -res %>% - ggplot(aes(x = est, y = effect)) + - geom_errorbarh(aes(xmin = lower, xmax = upper)) + - geom_point() + - geom_vline(xintercept = 0) + +Dat %>% + # filter(added == 0) %>% + arrange(desc(count)) %>% + mutate(obs = 1:length(count)) %>% + ggplot(aes(x = obs, y = count)) + + geom_point(aes(col = factor(added == 1))) + + geom_hline(yintercept = 200) + + scale_x_log10() + + # scale_y_log10() + theme_classic() - -#' Get the effect of temp -b_temp <- ranef(m1, condVar = TRUE, whichel = "b_temp", postVar = TRUE) -res <- tibble(effect = row.names(b_temp$b_temp), - est = b_temp$b_temp[,1], - postVar = attr(b_temp$b_temp, "postVar")[,,]) %>% - mutate(lower = qnorm(p = 0.025, mean = est, sd = sqrt(postVar)), - upper = qnorm(p = 0.975, mean = est, sd = sqrt(postVar)), - pval = 2 * pnorm(q = -abs(est), mean = 0, sd = sqrt(postVar))) %>% - mutate(qval = p.adjust(pval)) %>% - arrange(est) %>% - mutate(effect = factor(effect, levels = effect)) -res - -res %>% - ggplot(aes(x = est, y = effect)) + - geom_errorbarh(aes(xmin = lower, xmax = upper)) + - geom_point() + - geom_vline(xintercept = 0) + +Dat %>% + filter(added == 0) %>% + arrange(desc(count)) %>% + mutate(obs = 1:length(count)) %>% + ggplot(aes(x = obs, y = count)) + + geom_point(aes(col = factor(added == 1))) + + geom_hline(yintercept = 200) + + scale_x_log10() + + # scale_y_log10() + theme_classic() - - -#' Example of ST00046 showing that it decreases more at 28 than 32 (not significant) -dat %>% - filter(strain == "ST00046") %>% - filter(hrs %in% c(24)) %>% - ggplot(aes(col = factor(temp))) + - facet_wrap(~community) + - geom_segment(aes(x = 0, y = i_freq, xend = temp, yend = count / depth, linetype = exp )) + - theme_classic() - - From 63f1e98fbdb808576597aa7742403d9bf712b965 Mon Sep 17 00:00:00 2001 From: sur Date: Mon, 10 Mar 2025 13:02:23 -0600 Subject: [PATCH 08/17] Testing 4 basic models --- lme_pilot/run_lme4_model.r | 115 ++++++++++--------------------------- 1 file changed, 30 insertions(+), 85 deletions(-) diff --git a/lme_pilot/run_lme4_model.r b/lme_pilot/run_lme4_model.r index 3cc8055..94af99f 100644 --- a/lme_pilot/run_lme4_model.r +++ b/lme_pilot/run_lme4_model.r @@ -1,106 +1,51 @@ setwd("/Users/sur/lab/exp/2025/today") - library(tidyverse) +library(lme4) -Tab <- read_csv("2025-03-06.ancom_data/ancom_data/freq.csv") %>% - rename(strain = 1) -Tab - -Meta <- read_csv("2025-03-06.ancom_data/ancom_data/meta.csv") %>% - rename(id = 1) -Meta - - - -syncoms <- read_tsv("../../../data/2024_rhizo_pilot_syncom_NS/NS1/syncoms.tsv") - -#' Select two communities to test - -# %>% select(community, ST00046, ST00154, ST00101, ST00109, ST00042, ST00060 - -meta <- Meta %>% - filter(community %in% c("R1", "R2")) %>% - left_join(syncoms %>% - select(strain, R1,R2) %>% - pivot_longer(-strain, names_to = "community", values_to = "presence") %>% - pivot_wider(names_from = "strain", values_from = "presence"), - by = "community") - -meta %>% - print(n = 100) - -meta <- meta %>% - pivot_longer(-c("id", "community", "hrs", "techrep", "exp", "temp", "color_comsint", "community_temp"), - names_to = "strain", values_to = "added") %>% - mutate(added = replace_na(added, 0)) - -meta %>% - print(n = 100) - +Dat <- read_tsv("pilot_dat_perfect_design.tsv") +Dat -strains <- Tab$strain -strains <- strains %>% - str_replace("NS_042g_27F", "ST00042") %>% - str_replace("NS_164C_27F", "ST00164") %>% - str_replace("NS_110C_1_27F", "ST00110") -strains -Tab$strain <- strains +#' Try simple model -tab <- Tab %>% - pivot_longer(-strain, names_to = "id", values_to = "count") -tab +#' 24 hours only +dat <- Dat %>% + filter(hrs == 24) +#' Four basic poisson models to check the effect of the overdispersion +#' variable (b_obs) and the interaction (temp * community) effects +m1.t1 <- glmer(count ~ log(depth) + i_freq + + (1|b_com) + (1|b_temp) + (b_temp_com) + (1|b_rep) + (1|b_obs), + data = dat, + family = poisson(link = "log") ) -Dat <- meta %>% - left_join(tab %>% - group_by(id) %>% - summarize(depth = sum(count)), - by = "id") %>% - left_join(tab, by = c("id", "strain")) -Dat +m2.t1 <- glmer(count ~ log(depth) + i_freq + + (1|b_com) + (1|b_temp) + (1|b_rep) + (1|b_obs), + data = dat, + family = poisson(link = "log") ) -#' Calculate relative abundances of each strain on the inoculum -#' of each experiment +m3.t1 <- glmer(count ~ log(depth) + i_freq + + (1|b_com) + (1|b_temp) + (b_temp_com) + (1|b_rep), + data = dat, + family = poisson(link = "log")) -Dat <- Dat %>% - left_join(Dat %>% - filter(hrs == 0) %>% - group_by(strain, community, exp) %>% - summarise(i_freq = sum(count) / sum(depth), - .groups = "drop"), - by = c("strain", "community", "exp")) -Dat +m4.t1 <- glmer(count ~ log(depth) + i_freq + + (1|b_com) + (1|b_temp) + (1|b_rep), + data = dat, + family = poisson(link = "log") ) + +AIC(m1.t1, m2.t1, m3.t1, m4.t1) +BIC(m1.t1, m2.t1, m3.t1, m4.t1) -#' Confirm that count match expected species -#' In general it does. ST00060 seems to be the only semi problematic -Dat %>% - ggplot(aes(x = added == 1, y = count)) + - facet_wrap(~ strain, scales = "free_y") + - geom_point(position = position_jitter(width = 0.1, height = 0)) + - theme_classic() +summary(m1.t1) -#' # Model with lme4 -library(lme4) -Dat +lattice::qqmath(ranef(m1.t1)) -dat <- Dat %>% - mutate(b_com = paste0(community, "_", strain)) %>% - mutate(b_com = replace(b_com, added == 0, NA)) %>% - - mutate(b_rep = paste0(exp, "_", strain)) %>% - mutate(b_rep = replace(b_rep, added == 0, NA)) %>% - - mutate(b_temp = paste0(temp, "_", strain)) %>% - mutate(b_temp = replace(b_temp, added == 0, NA)) %>% - - mutate(b_obs = as.character(1:n())) -dat m1 <- glmer(count ~ log(depth) + i_freq + (1|b_com) + (1|b_rep) + (1|b_temp) + (1|b_obs), From 9a9cae40aadbfdc0cf6133ffe2c6beb626add7e8 Mon Sep 17 00:00:00 2001 From: sur Date: Mon, 10 Mar 2025 15:54:06 -0600 Subject: [PATCH 09/17] Trying models --- lme_pilot/run_lme4_model.r | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/lme_pilot/run_lme4_model.r b/lme_pilot/run_lme4_model.r index 94af99f..5f6ab94 100644 --- a/lme_pilot/run_lme4_model.r +++ b/lme_pilot/run_lme4_model.r @@ -18,28 +18,46 @@ dat <- Dat %>% m1.t1 <- glmer(count ~ log(depth) + i_freq + (1|b_com) + (1|b_temp) + (b_temp_com) + (1|b_rep) + (1|b_obs), data = dat, - family = poisson(link = "log") ) + family = poisson(link = "log"), + control=glmerControl(optCtrl=list(maxfun=4*1e4))) m2.t1 <- glmer(count ~ log(depth) + i_freq + (1|b_com) + (1|b_temp) + (1|b_rep) + (1|b_obs), data = dat, - family = poisson(link = "log") ) + family = poisson(link = "log"), + control=glmerControl(optCtrl=list(maxfun=4*1e4))) m3.t1 <- glmer(count ~ log(depth) + i_freq + (1|b_com) + (1|b_temp) + (b_temp_com) + (1|b_rep), data = dat, - family = poisson(link = "log")) + family = poisson(link = "log"), + control=glmerControl(optCtrl=list(maxfun=4*1e4))) m4.t1 <- glmer(count ~ log(depth) + i_freq + (1|b_com) + (1|b_temp) + (1|b_rep), data = dat, - family = poisson(link = "log") ) + family = poisson(link = "log"), + control=glmerControl(optCtrl=list(maxfun=4*1e4))) AIC(m1.t1, m2.t1, m3.t1, m4.t1) BIC(m1.t1, m2.t1, m3.t1, m4.t1) +#' Now fit the negative binomial +m5.t1 <- glmer.nb(count ~ log(depth) + i_freq + + (1|b_com) + (1|b_temp) + (b_temp_com) + (1|b_rep), + data = dat) +m6.t1 <- glmer.nb(count ~ log(depth) + i_freq + + (1|b_com) + (1|b_temp) + (1|b_rep), + data = dat) + + + +AIC(m1.t1, m2.t1, m3.t1, m4.t1, m5.t1, m6.t1) +BIC(m1.t1, m2.t1, m3.t1, m4.t1, m5.t1, m6.t1) + +save() summary(m1.t1) From aae6666b91c64079582e3bbb05dc1d9c24734ae8 Mon Sep 17 00:00:00 2001 From: sur Date: Mon, 10 Mar 2025 18:44:01 -0600 Subject: [PATCH 10/17] Adding brms of negbinomial model --- lme_pilot/run_lme4_model.r | 82 +++++++++++++++++++++++++++++++++++++- 1 file changed, 81 insertions(+), 1 deletion(-) diff --git a/lme_pilot/run_lme4_model.r b/lme_pilot/run_lme4_model.r index 5f6ab94..0be2c6b 100644 --- a/lme_pilot/run_lme4_model.r +++ b/lme_pilot/run_lme4_model.r @@ -42,6 +42,8 @@ m4.t1 <- glmer(count ~ log(depth) + i_freq + AIC(m1.t1, m2.t1, m3.t1, m4.t1) BIC(m1.t1, m2.t1, m3.t1, m4.t1) +#' Basically we need to include the overdispersion parameter. +#' The model without interactions provides a better fit. #' Now fit the negative binomial m5.t1 <- glmer.nb(count ~ log(depth) + i_freq + @@ -55,9 +57,87 @@ m6.t1 <- glmer.nb(count ~ log(depth) + i_freq + AIC(m1.t1, m2.t1, m3.t1, m4.t1, m5.t1, m6.t1) +# df AIC +# m1.t1 126 4288.263 +# m2.t1 7 4570.509 +# m3.t1 125 91657.828 +# m4.t1 6 240681.080 +# m5.t1 126 4277.931 +# m6.t1 7 4563.837 BIC(m1.t1, m2.t1, m3.t1, m4.t1, m5.t1, m6.t1) +# df BIC +# m1.t1 126 4724.171 +# m2.t1 7 4594.726 +# m3.t1 125 92090.277 +# m4.t1 6 240701.837 +# m5.t1 126 4713.839 +# m6.t1 7 4588.054 + +save(m1.t1, m2.t1, m3.t1, m4.t1, m5.t1, m6.t1, file = "models.rdat") + +#' There were convergence issues but negative binomial performs well +#' There is disagreement between AIC and BIC. BIC gives the models without +#' temp by community interactions better scores while AIC gives the models with +#' temperature x community interaction as better +#' +#' From the winbugs manual: +#' +#' +#' In hierarchical models, these three techniques are +#' essentially answering different prediction problems. +#' Suppose the three levels of our model concerned +#' classes within schools within a country. Then +#' 1. if we were interested in predicting results of +#' future classes in those actual schools, then +#' DIC is appropriate (ie the random effects +#' themselves are of interest); +#' 2. if we were interested in predicting results of +#' future schools in that country, then marginal- +#' likelihood methods such as AIC are +#' appropriate (ie the population parameters are +#' of interest); +#' 3. if we were interested in predicting results for +#' a new country, then BIC/ Bayes factors are +#' appropriate (ie the 'true' underlying model is +#' of interest). + +#' We are interested in future run of these communities, thus +#' DIC may be more relevant? + + +#' Will run brms with the negbinomial model +library(brms) + + +m5.br.t1 <- brm(count ~ log(depth) + i_freq + + (1|b_com) + (1|b_temp) + (b_temp_com) + (1|b_rep), + data = dat, + chains = 4, + iter = 4000, + warmup = 1000, + cores = 4, + thin = 2, + family = "negbinomial") + + +m6.br.t1 <- brm(count ~ log(depth) + i_freq + + (1|b_com) + (1|b_temp) + (1|b_rep), + data = dat, + chains = 4, + iter = 4000, + warmup = 1000, + cores = 4, + thin = 2, + family = "negbinomial") + + +summary(m6.br.t1) +summary(m6.t1) + + +brms::waic(m5.br.t1, m6.br.t1) + -save() summary(m1.t1) From 19fb6ca24b4b6448fb85ff2920054ba19b24195d Mon Sep 17 00:00:00 2001 From: sur Date: Mon, 10 Mar 2025 19:01:22 -0600 Subject: [PATCH 11/17] Fixing temp by env variable --- lme_pilot/run_lme4_model.r | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/lme_pilot/run_lme4_model.r b/lme_pilot/run_lme4_model.r index 0be2c6b..565a9c0 100644 --- a/lme_pilot/run_lme4_model.r +++ b/lme_pilot/run_lme4_model.r @@ -16,7 +16,7 @@ dat <- Dat %>% #' Four basic poisson models to check the effect of the overdispersion #' variable (b_obs) and the interaction (temp * community) effects m1.t1 <- glmer(count ~ log(depth) + i_freq + - (1|b_com) + (1|b_temp) + (b_temp_com) + (1|b_rep) + (1|b_obs), + (1|b_com) + (1|b_temp) + (1|b_temp_com) + (1|b_rep) + (1|b_obs), data = dat, family = poisson(link = "log"), control=glmerControl(optCtrl=list(maxfun=4*1e4))) @@ -28,7 +28,7 @@ m2.t1 <- glmer(count ~ log(depth) + i_freq + control=glmerControl(optCtrl=list(maxfun=4*1e4))) m3.t1 <- glmer(count ~ log(depth) + i_freq + - (1|b_com) + (1|b_temp) + (b_temp_com) + (1|b_rep), + (1|b_com) + (1|b_temp) + (1|b_temp_com) + (1|b_rep), data = dat, family = poisson(link = "log"), control=glmerControl(optCtrl=list(maxfun=4*1e4))) @@ -47,7 +47,7 @@ BIC(m1.t1, m2.t1, m3.t1, m4.t1) #' Now fit the negative binomial m5.t1 <- glmer.nb(count ~ log(depth) + i_freq + - (1|b_com) + (1|b_temp) + (b_temp_com) + (1|b_rep), + (1|b_com) + (1|b_temp) + (1|b_temp_com) + (1|b_rep), data = dat) m6.t1 <- glmer.nb(count ~ log(depth) + i_freq + @@ -107,10 +107,11 @@ save(m1.t1, m2.t1, m3.t1, m4.t1, m5.t1, m6.t1, file = "models.rdat") #' Will run brms with the negbinomial model library(brms) +load("brms_models.rdat") - +# Takes about 6 minutes to run, significant numeric issues. m5.br.t1 <- brm(count ~ log(depth) + i_freq + - (1|b_com) + (1|b_temp) + (b_temp_com) + (1|b_rep), + (1|b_com) + (1|b_temp) + (1|b_temp_com) + (1|b_rep), data = dat, chains = 4, iter = 4000, @@ -119,7 +120,11 @@ m5.br.t1 <- brm(count ~ log(depth) + i_freq + thin = 2, family = "negbinomial") +summary(m5.br.t1) +summary(m5.t1) +pairs(m5.br.t1) +# Takes about 2 minutes to run with compilation m6.br.t1 <- brm(count ~ log(depth) + i_freq + (1|b_com) + (1|b_temp) + (1|b_rep), data = dat, @@ -134,9 +139,15 @@ m6.br.t1 <- brm(count ~ log(depth) + i_freq + summary(m6.br.t1) summary(m6.t1) +save(m5.br.t1, m6.br.t1, file = "brms_models.rdat") +#' We compare with waic and loo brms::waic(m5.br.t1, m6.br.t1) +m5.br.t1.loo <- brms::loo(m5.br.t1) +m6.br.t1.loo <- brms::loo(m6.br.t1) +m5.br.t1.loo +m6.br.t1.loo summary(m1.t1) From 478b30e904cc73cb5af74c0ef4b0220d0f493ec9 Mon Sep 17 00:00:00 2001 From: sur Date: Mon, 10 Mar 2025 19:12:04 -0600 Subject: [PATCH 12/17] Rerunning everything --- lme_pilot/analyze_models.r | 68 +++++++++++++++++++++++++++++++++ lme_pilot/run_lme4_model.r | 77 ++++---------------------------------- 2 files changed, 75 insertions(+), 70 deletions(-) create mode 100644 lme_pilot/analyze_models.r diff --git a/lme_pilot/analyze_models.r b/lme_pilot/analyze_models.r new file mode 100644 index 0000000..c78da53 --- /dev/null +++ b/lme_pilot/analyze_models.r @@ -0,0 +1,68 @@ + +summary(m1.t1) + + +lattice::qqmath(ranef(m1.t1)) + + +m1 <- glmer(count ~ log(depth) + i_freq + (1|b_com) + + (1|b_rep) + (1|b_temp) + (1|b_obs), + data = dat %>% + filter(hrs == 24), + family = poisson(link = log) ) +summary(m1) +AIC(m1) +BIC(m1) + +#' Get the effect of community +b_com <- ranef(m1, condVar = TRUE, whichel = "b_com", postVar = TRUE) +res <- tibble(effect = row.names(b_com$b_com), + est = b_com$b_com[,1], + postVar = attr(b_com$b_com, "postVar")[,,]) %>% + mutate(lower = qnorm(p = 0.025, mean = est, sd = sqrt(postVar)), + upper = qnorm(p = 0.975, mean = est, sd = sqrt(postVar)), + pval = 2 * pnorm(q = -abs(est), mean = 0, sd = sqrt(postVar))) %>% + mutate(qval = p.adjust(pval)) %>% + arrange(est) %>% + mutate(effect = factor(effect, levels = effect)) +res + +res %>% + ggplot(aes(x = est, y = effect)) + + geom_errorbarh(aes(xmin = lower, xmax = upper)) + + geom_point() + + geom_vline(xintercept = 0) + + theme_classic() + + +#' Get the effect of temp +b_temp <- ranef(m1, condVar = TRUE, whichel = "b_temp", postVar = TRUE) +res <- tibble(effect = row.names(b_temp$b_temp), + est = b_temp$b_temp[,1], + postVar = attr(b_temp$b_temp, "postVar")[,,]) %>% + mutate(lower = qnorm(p = 0.025, mean = est, sd = sqrt(postVar)), + upper = qnorm(p = 0.975, mean = est, sd = sqrt(postVar)), + pval = 2 * pnorm(q = -abs(est), mean = 0, sd = sqrt(postVar))) %>% + mutate(qval = p.adjust(pval)) %>% + arrange(est) %>% + mutate(effect = factor(effect, levels = effect)) +res + +res %>% + ggplot(aes(x = est, y = effect)) + + geom_errorbarh(aes(xmin = lower, xmax = upper)) + + geom_point() + + geom_vline(xintercept = 0) + + theme_classic() + + +#' Example of ST00046 showing that it decreases more at 28 than 32 (not significant) +dat %>% + filter(strain == "ST00046") %>% + filter(hrs %in% c(24)) %>% + ggplot(aes(col = factor(temp))) + + facet_wrap(~community) + + geom_segment(aes(x = 0, y = i_freq, xend = temp, yend = count / depth, linetype = exp )) + + theme_classic() + + diff --git a/lme_pilot/run_lme4_model.r b/lme_pilot/run_lme4_model.r index 565a9c0..e40d754 100644 --- a/lme_pilot/run_lme4_model.r +++ b/lme_pilot/run_lme4_model.r @@ -1,4 +1,7 @@ -setwd("/Users/sur/lab/exp/2025/today") +# rmarkdown::render("/Users/sur/lab/src/NSM_comsints/lme_pilot/run_lme4_model.r", output_format = "html_document", output_dir = "/Users/sur/lab/exp/2025/today") +# setwd("/Users/sur/lab/exp/2025/today") +knitr::opts_knit$set(root.dir = "/Users/sur/lab/exp/2025/today") + library(tidyverse) library(lme4) @@ -109,7 +112,7 @@ save(m1.t1, m2.t1, m3.t1, m4.t1, m5.t1, m6.t1, file = "models.rdat") library(brms) load("brms_models.rdat") -# Takes about 6 minutes to run, significant numeric issues. +# No convergence issues, takes 2 min with compile m5.br.t1 <- brm(count ~ log(depth) + i_freq + (1|b_com) + (1|b_temp) + (1|b_temp_com) + (1|b_rep), data = dat, @@ -142,78 +145,12 @@ summary(m6.t1) save(m5.br.t1, m6.br.t1, file = "brms_models.rdat") #' We compare with waic and loo -brms::waic(m5.br.t1, m6.br.t1) +compare_ic(brms::waic(m5.br.t1), brms::waic(m6.br.t1)) m5.br.t1.loo <- brms::loo(m5.br.t1) m6.br.t1.loo <- brms::loo(m6.br.t1) m5.br.t1.loo m6.br.t1.loo - -summary(m1.t1) - - -lattice::qqmath(ranef(m1.t1)) - - -m1 <- glmer(count ~ log(depth) + i_freq + (1|b_com) + - (1|b_rep) + (1|b_temp) + (1|b_obs), - data = dat %>% - filter(hrs == 24), - family = poisson(link = log) ) -summary(m1) -AIC(m1) -BIC(m1) - -#' Get the effect of community -b_com <- ranef(m1, condVar = TRUE, whichel = "b_com", postVar = TRUE) -res <- tibble(effect = row.names(b_com$b_com), - est = b_com$b_com[,1], - postVar = attr(b_com$b_com, "postVar")[,,]) %>% - mutate(lower = qnorm(p = 0.025, mean = est, sd = sqrt(postVar)), - upper = qnorm(p = 0.975, mean = est, sd = sqrt(postVar)), - pval = 2 * pnorm(q = -abs(est), mean = 0, sd = sqrt(postVar))) %>% - mutate(qval = p.adjust(pval)) %>% - arrange(est) %>% - mutate(effect = factor(effect, levels = effect)) -res - -res %>% - ggplot(aes(x = est, y = effect)) + - geom_errorbarh(aes(xmin = lower, xmax = upper)) + - geom_point() + - geom_vline(xintercept = 0) + - theme_classic() - - -#' Get the effect of temp -b_temp <- ranef(m1, condVar = TRUE, whichel = "b_temp", postVar = TRUE) -res <- tibble(effect = row.names(b_temp$b_temp), - est = b_temp$b_temp[,1], - postVar = attr(b_temp$b_temp, "postVar")[,,]) %>% - mutate(lower = qnorm(p = 0.025, mean = est, sd = sqrt(postVar)), - upper = qnorm(p = 0.975, mean = est, sd = sqrt(postVar)), - pval = 2 * pnorm(q = -abs(est), mean = 0, sd = sqrt(postVar))) %>% - mutate(qval = p.adjust(pval)) %>% - arrange(est) %>% - mutate(effect = factor(effect, levels = effect)) -res - -res %>% - ggplot(aes(x = est, y = effect)) + - geom_errorbarh(aes(xmin = lower, xmax = upper)) + - geom_point() + - geom_vline(xintercept = 0) + - theme_classic() - - -#' Example of ST00046 showing that it decreases more at 28 than 32 (not significant) -dat %>% - filter(strain == "ST00046") %>% - filter(hrs %in% c(24)) %>% - ggplot(aes(col = factor(temp))) + - facet_wrap(~community) + - geom_segment(aes(x = 0, y = i_freq, xend = temp, yend = count / depth, linetype = exp )) + - theme_classic() - +# loo_compare(brms::loo(m5.br.t1, moment_match = TRUE), brms::loo(m6.br.t1, moment_match = TRUE)) From 5ebd306e5a2befd0bbb1b704aa5f8c282df7736d Mon Sep 17 00:00:00 2001 From: sur Date: Mon, 10 Mar 2025 19:14:38 -0600 Subject: [PATCH 13/17] re-starting --- lme_pilot/run_lme4_model.r | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lme_pilot/run_lme4_model.r b/lme_pilot/run_lme4_model.r index e40d754..20cb652 100644 --- a/lme_pilot/run_lme4_model.r +++ b/lme_pilot/run_lme4_model.r @@ -1,5 +1,5 @@ # rmarkdown::render("/Users/sur/lab/src/NSM_comsints/lme_pilot/run_lme4_model.r", output_format = "html_document", output_dir = "/Users/sur/lab/exp/2025/today") -# setwd("/Users/sur/lab/exp/2025/today") +setwd("/Users/sur/lab/exp/2025/today") knitr::opts_knit$set(root.dir = "/Users/sur/lab/exp/2025/today") library(tidyverse) From 0c077e4bfd1860e00cb4dca15e4da75085ba520e Mon Sep 17 00:00:00 2001 From: sur Date: Tue, 11 Mar 2025 09:05:52 -0600 Subject: [PATCH 14/17] Run final version --- lme_pilot/run_lme4_model.r | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lme_pilot/run_lme4_model.r b/lme_pilot/run_lme4_model.r index 20cb652..e40d754 100644 --- a/lme_pilot/run_lme4_model.r +++ b/lme_pilot/run_lme4_model.r @@ -1,5 +1,5 @@ # rmarkdown::render("/Users/sur/lab/src/NSM_comsints/lme_pilot/run_lme4_model.r", output_format = "html_document", output_dir = "/Users/sur/lab/exp/2025/today") -setwd("/Users/sur/lab/exp/2025/today") +# setwd("/Users/sur/lab/exp/2025/today") knitr::opts_knit$set(root.dir = "/Users/sur/lab/exp/2025/today") library(tidyverse) From 8ab4984044c0200a6369fb25e8c59256cc193b86 Mon Sep 17 00:00:00 2001 From: surh Date: Wed, 12 Mar 2025 02:40:12 -0600 Subject: [PATCH 15/17] Ignoring rproj --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 5b6a065..f4f606b 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ .Rhistory .RData .Ruserdata +*.Rproj From f952b32cd8b4c0f0db261fb57f1c443e69f0f40e Mon Sep 17 00:00:00 2001 From: surh Date: Wed, 12 Mar 2025 02:40:55 -0600 Subject: [PATCH 16/17] cleaning --- NSM_comsints.Rproj | 13 ------------- 1 file changed, 13 deletions(-) delete mode 100644 NSM_comsints.Rproj diff --git a/NSM_comsints.Rproj b/NSM_comsints.Rproj deleted file mode 100644 index 8e3c2eb..0000000 --- a/NSM_comsints.Rproj +++ /dev/null @@ -1,13 +0,0 @@ -Version: 1.0 - -RestoreWorkspace: Default -SaveWorkspace: Default -AlwaysSaveHistory: Default - -EnableCodeIndexing: Yes -UseSpacesForTab: Yes -NumSpacesForTab: 2 -Encoding: UTF-8 - -RnwWeave: Sweave -LaTeX: pdfLaTeX From 743c986c688fb712ac419b67ae183dbcaff2243a Mon Sep 17 00:00:00 2001 From: surh Date: Tue, 18 Mar 2025 15:38:32 -0600 Subject: [PATCH 17/17] Version of analyze lme model --- lme_pilot/analyze_models.r | 100 ++++++++++++++++++++++++++++++------- 1 file changed, 82 insertions(+), 18 deletions(-) diff --git a/lme_pilot/analyze_models.r b/lme_pilot/analyze_models.r index c78da53..5b68e07 100644 --- a/lme_pilot/analyze_models.r +++ b/lme_pilot/analyze_models.r @@ -1,21 +1,19 @@ +setwd("~/micropopgen/exp/2025/today3/") +library(tidyverse) +library(lme4) +library(brms) -summary(m1.t1) +load("2025-03-11.data_for_aplicaciones_lcg/models.rdat") +Dat <- read_tsv("2025-03-11.data_for_aplicaciones_lcg/pilot_dat_perfect_design.tsv") %>% + mutate(community = factor(community, levels = paste0("R", 1:12))) +dat <- Dat %>% + filter(hrs == 24) -lattice::qqmath(ranef(m1.t1)) - - -m1 <- glmer(count ~ log(depth) + i_freq + (1|b_com) + - (1|b_rep) + (1|b_temp) + (1|b_obs), - data = dat %>% - filter(hrs == 24), - family = poisson(link = log) ) -summary(m1) -AIC(m1) -BIC(m1) +summary(m5.t1) #' Get the effect of community -b_com <- ranef(m1, condVar = TRUE, whichel = "b_com", postVar = TRUE) +b_com <- ranef(m5.t1, condVar = TRUE, whichel = "b_com", postVar = TRUE) res <- tibble(effect = row.names(b_com$b_com), est = b_com$b_com[,1], postVar = attr(b_com$b_com, "postVar")[,,]) %>% @@ -24,7 +22,8 @@ res <- tibble(effect = row.names(b_com$b_com), pval = 2 * pnorm(q = -abs(est), mean = 0, sd = sqrt(postVar))) %>% mutate(qval = p.adjust(pval)) %>% arrange(est) %>% - mutate(effect = factor(effect, levels = effect)) + mutate(effect = factor(effect, levels = effect)) %>% + print(n = 100) res res %>% @@ -35,8 +34,38 @@ res %>% theme_classic() +dat %>% + filter(strain == "ST00046") %>% + filter(added == 1) %>% + # filter(community %in% c("R1", "R2", "R3", "R4", "R5", "R12")) %>% + mutate(effect = as.character(community)) %>% + mutate(effect = replace(effect, effect %in% c("R1", "R2", "R3", "R4"), "pos")) %>% + mutate(effect = replace(effect, effect %in% c("R5", "R12"), "neg")) %>% + ggplot(aes(col = factor(temp))) + + facet_wrap(~effect + temp) + + geom_segment(aes(x = 0, y = i_freq, xend = hrs, yend = count / depth, linetype = exp )) + + ggtitle(label = "ST00046") + + theme_classic() + + + + +dat %>% + filter(strain == "ST00046") %>% + filter(added == 1) %>% + filter(community == "R5") %>% + # filter(community %in% c("R1", "R2", "R3", "R4", "R5", "R12")) %>% + mutate(effect = as.character(community)) %>% + mutate(effect = replace(effect, effect %in% c("R1", "R2", "R3", "R4"), "pos")) %>% + mutate(effect = replace(effect, effect %in% c("R5", "R12"), "neg")) %>% + ggplot(aes(col = factor(temp))) + + facet_wrap(~temp) + + geom_segment(aes(x = 0, y = i_freq, xend = hrs, yend = count / depth, linetype = exp )) + + ggtitle(label = "ST00046") + + theme_classic() + #' Get the effect of temp -b_temp <- ranef(m1, condVar = TRUE, whichel = "b_temp", postVar = TRUE) +b_temp <- ranef(m5.t1, condVar = TRUE, whichel = "b_temp", postVar = TRUE) res <- tibble(effect = row.names(b_temp$b_temp), est = b_temp$b_temp[,1], postVar = attr(b_temp$b_temp, "postVar")[,,]) %>% @@ -58,11 +87,46 @@ res %>% #' Example of ST00046 showing that it decreases more at 28 than 32 (not significant) dat %>% - filter(strain == "ST00046") %>% + filter(strain == "ST00110") %>% + filter(added == 1) %>% filter(hrs %in% c(24)) %>% ggplot(aes(col = factor(temp))) + - facet_wrap(~community) + - geom_segment(aes(x = 0, y = i_freq, xend = temp, yend = count / depth, linetype = exp )) + + facet_wrap(~temp) + + geom_segment(aes(x = 0, y = i_freq, xend = hrs, yend = count / depth, linetype = exp )) + + ggtitle(label = "ST00110") + theme_classic() + +b_temp_com <- ranef(m5.t1, condVar = TRUE, whichel = "b_temp_com", postVar = TRUE) +res <- tibble(effect = row.names(b_temp_com$b_temp_com), + est = b_temp_com$b_temp_com[,1], + postVar = attr(b_temp_com$b_temp_com, "postVar")[,,]) %>% + mutate(lower = qnorm(p = 0.025, mean = est, sd = sqrt(postVar)), + upper = qnorm(p = 0.975, mean = est, sd = sqrt(postVar)), + pval = 2 * pnorm(q = -abs(est), mean = 0, sd = sqrt(postVar))) %>% + mutate(qval = p.adjust(pval)) %>% + arrange(est) %>% + mutate(effect = factor(effect, levels = effect)) +res %>% + print(n = 1000) + + +res %>% + ggplot(aes(x = est, y = effect)) + + geom_errorbarh(aes(xmin = lower, xmax = upper)) + + geom_point() + + geom_vline(xintercept = 0) + + theme_classic() + + + + +dat %>% + filter(strain == "ST00143") %>% + filter(added == 1) %>% + filter(community == "R10") %>% + ggplot(aes(col = factor(temp))) + + geom_segment(aes(x = 0, y = i_freq, xend = hrs, yend = count / depth, linetype = exp )) + + ggtitle(label = "ST00143") + + theme_classic()