From 7b74497905a099f11f86785a61f2b28f7b83c9a8 Mon Sep 17 00:00:00 2001
From: sur <surh@users.noreply.github.com>
Date: Thu, 6 Mar 2025 16:22:52 -0600
Subject: [PATCH 01/17] Model pilot strain abundances

---
 .gitignore               |   4 +
 NSM_comsints.Rproj       |  13 ++++
 lme_pilot/process_data.r | 159 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 176 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 NSM_comsints.Rproj
 create mode 100644 lme_pilot/process_data.r

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..5b6a065
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+.Rproj.user
+.Rhistory
+.RData
+.Ruserdata
diff --git a/NSM_comsints.Rproj b/NSM_comsints.Rproj
new file mode 100644
index 0000000..8e3c2eb
--- /dev/null
+++ b/NSM_comsints.Rproj
@@ -0,0 +1,13 @@
+Version: 1.0
+
+RestoreWorkspace: Default
+SaveWorkspace: Default
+AlwaysSaveHistory: Default
+
+EnableCodeIndexing: Yes
+UseSpacesForTab: Yes
+NumSpacesForTab: 2
+Encoding: UTF-8
+
+RnwWeave: Sweave
+LaTeX: pdfLaTeX
diff --git a/lme_pilot/process_data.r b/lme_pilot/process_data.r
new file mode 100644
index 0000000..ee32d90
--- /dev/null
+++ b/lme_pilot/process_data.r
@@ -0,0 +1,159 @@
+setwd("/Users/sur/lab/exp/2025/today")
+
+library(tidyverse)
+
+Tab <- read_csv("2025-03-06.ancom_data/ancom_data/freq.csv") %>%
+  rename(strain = 1)
+Tab
+
+Meta <- read_csv("2025-03-06.ancom_data/ancom_data/meta.csv") %>%
+  rename(id = 1)
+Meta
+
+
+
+syncoms <- read_tsv("../../../data/2024_rhizo_pilot_syncom_NS/NS1/syncoms.tsv")
+
+#' Select two communities to test
+
+# %>% select(community, ST00046, ST00154, ST00101, ST00109, ST00042, ST00060
+
+meta <- Meta %>%
+  filter(community %in% c("R1", "R2")) %>%
+  left_join(syncoms %>%
+              select(strain, R1,R2) %>%
+              pivot_longer(-strain, names_to = "community", values_to = "presence") %>%
+              pivot_wider(names_from = "strain", values_from = "presence"),
+            by = "community")
+
+meta %>% 
+  print(n = 100)
+
+meta <- meta %>%
+  pivot_longer(-c("id", "community", "hrs", "techrep", "exp", "temp", "color_comsint", "community_temp"),
+               names_to = "strain", values_to = "added") %>%
+  mutate(added = replace_na(added, 0))
+
+meta %>% 
+  print(n = 100)
+
+
+strains <- Tab$strain
+strains <- strains %>%
+  str_replace("NS_042g_27F", "ST00042") %>%
+  str_replace("NS_164C_27F", "ST00164") %>%
+  str_replace("NS_110C_1_27F", "ST00110") 
+strains
+Tab$strain <- strains
+
+
+tab <- Tab %>%
+  pivot_longer(-strain, names_to = "id", values_to = "count") 
+tab
+
+
+
+Dat <- meta %>%
+  left_join(tab %>%
+              group_by(id) %>%
+              summarize(depth = sum(count)),
+            by = "id") %>%
+  left_join(tab, by = c("id", "strain"))
+Dat
+
+#' Calculate relative abundances of each strain on the inoculum
+#' of each experiment
+
+Dat <- Dat %>%
+  left_join(Dat %>%
+              filter(hrs == 0) %>%
+              group_by(strain, community, exp) %>%
+              summarise(i_freq = sum(count) / sum(depth),
+                        .groups = "drop"),
+            by = c("strain", "community", "exp"))
+Dat
+
+
+#' Confirm that count match expected species
+#' In general it does. ST00060 seems to be the only semi problematic
+Dat %>% 
+  ggplot(aes(x = added == 1, y = count)) +
+  facet_wrap(~ strain, scales = "free_y") +
+  geom_point(position = position_jitter(width = 0.1, height = 0)) +
+  theme_classic()
+  
+
+
+#' # Model with lme4
+library(lme4)
+Dat
+
+
+dat <- Dat %>%
+  mutate(b_com = paste0(community, "_", strain)) %>%
+  mutate(b_com = replace(b_com, added == 0, NA)) %>%
+  
+  mutate(b_rep = paste0(exp, "_", strain)) %>%
+  mutate(b_rep = replace(b_rep, added == 0, NA)) %>%
+  
+  mutate(b_temp = paste0(temp, "_", strain)) %>%
+  mutate(b_temp = replace(b_temp, added == 0, NA)) %>%
+  
+  mutate(b_obs = as.character(1:n()))
+dat
+
+m1 <- glmer(count ~ log(depth) + i_freq + (1|b_com) + 
+              (1|b_rep) + (1|b_temp) + (1|b_obs), 
+            data = dat %>%
+              filter(hrs == 24), 
+            family = poisson(link = log) )
+summary(m1)
+AIC(m1)
+
+#' Get the effect of community
+b_com <- ranef(m1, condVar = TRUE, whichel = "b_com", postVar = TRUE)
+res <- tibble(effect = row.names(b_com$b_com),
+              est = b_com$b_com[,1],
+              postVar = attr(b_com$b_com, "postVar")[,,]) %>%
+  mutate(lower = qnorm(p = 0.025, mean = est, sd = sqrt(postVar)),
+         upper = qnorm(p = 0.975, mean = est, sd = sqrt(postVar)),
+         pval = 2 * pnorm(q = -abs(est), mean = 0, sd = sqrt(postVar))) %>%
+  mutate(qval = p.adjust(pval)) %>%
+  arrange(est) %>%
+  mutate(effect = factor(effect, levels = effect))
+res
+  
+res %>%
+  ggplot(aes(x = est, y = effect)) +
+  geom_errorbarh(aes(xmin = lower, xmax = upper)) +
+  geom_point() +
+  geom_vline(xintercept = 0) +
+  theme_classic()
+
+
+#' Get the effect of temp
+b_temp <- ranef(m1, condVar = TRUE, whichel = "b_temp", postVar = TRUE)
+res <- tibble(effect = row.names(b_temp$b_temp),
+              est = b_temp$b_temp[,1],
+              postVar = attr(b_temp$b_temp, "postVar")[,,]) %>%
+  mutate(lower = qnorm(p = 0.025, mean = est, sd = sqrt(postVar)),
+         upper = qnorm(p = 0.975, mean = est, sd = sqrt(postVar)),
+         pval = 2 * pnorm(q = -abs(est), mean = 0, sd = sqrt(postVar))) %>%
+  mutate(qval = p.adjust(pval)) %>%
+  arrange(est) %>%
+  mutate(effect = factor(effect, levels = effect))
+res
+
+res %>%
+  ggplot(aes(x = est, y = effect)) +
+  geom_errorbarh(aes(xmin = lower, xmax = upper)) +
+  geom_point() +
+  geom_vline(xintercept = 0) +
+  theme_classic()
+
+
+
+dat %>%
+  filter(strain == "ST00046") 
+
+

From fcec68109c9acdfbae651c3d4b78cd82b2fcaab8 Mon Sep 17 00:00:00 2001
From: sur <surh@users.noreply.github.com>
Date: Fri, 7 Mar 2025 11:20:27 -0600
Subject: [PATCH 02/17] Script to generate report

---
 lme_pilot/process_data.r | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/lme_pilot/process_data.r b/lme_pilot/process_data.r
index ee32d90..06c4564 100644
--- a/lme_pilot/process_data.r
+++ b/lme_pilot/process_data.r
@@ -109,6 +109,7 @@ m1 <- glmer(count ~ log(depth) + i_freq + (1|b_com) +
             family = poisson(link = log) )
 summary(m1)
 AIC(m1)
+BIC(m1)
 
 #' Get the effect of community
 b_com <- ranef(m1, condVar = TRUE, whichel = "b_com", postVar = TRUE)
@@ -152,8 +153,13 @@ res %>%
   theme_classic()
 
 
-
+#' Example of ST00046 showing that it decreases more at 28 than 32 (not significant)
 dat %>%
-  filter(strain == "ST00046") 
+  filter(strain == "ST00046") %>%
+  filter(hrs %in% c(24)) %>%
+  ggplot(aes(col = factor(temp))) +
+  facet_wrap(~community) +
+  geom_segment(aes(x = 0, y = i_freq, xend = temp, yend = count / depth, linetype = exp )) +
+  theme_classic() 
 
 

From 767759f1f855c9b1110c11b5ecf15e107863834a Mon Sep 17 00:00:00 2001
From: sur <surh@users.noreply.github.com>
Date: Fri, 7 Mar 2025 11:46:17 -0600
Subject: [PATCH 03/17] Comenting

---
 lme_pilot/process_data.r   |  63 ++++++++------
 lme_pilot/run_lme4_model.r | 165 +++++++++++++++++++++++++++++++++++++
 2 files changed, 203 insertions(+), 25 deletions(-)
 create mode 100644 lme_pilot/run_lme4_model.r

diff --git a/lme_pilot/process_data.r b/lme_pilot/process_data.r
index 06c4564..7d3dbd8 100644
--- a/lme_pilot/process_data.r
+++ b/lme_pilot/process_data.r
@@ -1,7 +1,17 @@
-setwd("/Users/sur/lab/exp/2025/today")
+#' Script to prepare data from pilot for modelling. We require to put every
+#' count observation in a single line in a master data table. We are also
+#' Calculating the frequency of inoculum. *Should we also calculate frequency
+#' at time t-1*
+
+#' The working directory may be set different depending on where are the
+#' relevant files. I'm using NSM's ancom_data files downloaded on 2025-03-06.
 
+setwd("/Users/sur/lab/exp/2025/today")
 library(tidyverse)
 
+#' # Read data
+
+#' We add column names to the sample ids (`id`) and strain ids (`strain`)
 Tab <- read_csv("2025-03-06.ancom_data/ancom_data/freq.csv") %>%
   rename(strain = 1)
 Tab
@@ -10,34 +20,36 @@ Meta <- read_csv("2025-03-06.ancom_data/ancom_data/meta.csv") %>%
   rename(id = 1)
 Meta
 
-
-
-syncoms <- read_tsv("../../../data/2024_rhizo_pilot_syncom_NS/NS1/syncoms.tsv")
-
-#' Select two communities to test
-
-# %>% select(community, ST00046, ST00154, ST00101, ST00109, ST00042, ST00060
-
+#' **NOTE**: The following are locations and files in Sur's lab desktop. Format
+#' might be different in someone else
+syncoms <- read_tsv("../../../data/2024_rhizo_pilot_syncom_NS/NS1/syncoms.tsv") %>%
+  full_join(read_tsv("../../../data/2024_rhizo_pilot_syncom_NS/NS2/syncoms.tsv"), 
+            by = "strain")
+syncoms
+
+#' # Format data
+#' We add a column per strain to the metadata table to indicate which species
+#' were added to that community, 1 means added, NA means not added (though
+#' it might be detected still). We create a new object.
 meta <- Meta %>%
-  filter(community %in% c("R1", "R2")) %>%
+  # filter(community %in% c("R1", "R2")) %>%
   left_join(syncoms %>%
               select(strain, R1,R2) %>%
               pivot_longer(-strain, names_to = "community", values_to = "presence") %>%
               pivot_wider(names_from = "strain", values_from = "presence"),
             by = "community")
+meta 
 
-meta %>% 
-  print(n = 100)
-
+#' Then we pivot the table so there is one row per *strain x sample* 
+#' combination. The new column added indicates if the relevant strain
+#' was added (1) or not (0)
 meta <- meta %>%
   pivot_longer(-c("id", "community", "hrs", "techrep", "exp", "temp", "color_comsint", "community_temp"),
                names_to = "strain", values_to = "added") %>%
   mutate(added = replace_na(added, 0))
+meta
 
-meta %>% 
-  print(n = 100)
-
-
+#' Before adding count information we need to homogenize the strain names
 strains <- Tab$strain
 strains <- strains %>%
   str_replace("NS_042g_27F", "ST00042") %>%
@@ -46,13 +58,12 @@ strains <- strains %>%
 strains
 Tab$strain <- strains
 
-
+#' Now we pivot the count table to have 1 row per *sample x strain* combination
+#' and we join it with the metadata table. We also calculate the sequencing
+#' depth per sample and add that information to the metadata. We create a new
+#' object
 tab <- Tab %>%
   pivot_longer(-strain, names_to = "id", values_to = "count") 
-tab
-
-
-
 Dat <- meta %>%
   left_join(tab %>%
               group_by(id) %>%
@@ -61,9 +72,8 @@ Dat <- meta %>%
   left_join(tab, by = c("id", "strain"))
 Dat
 
-#' Calculate relative abundances of each strain on the inoculum
-#' of each experiment
-
+#' Calculate relative abundances of each strain in the inoculum
+#' of each experiment, and add that information to the data
 Dat <- Dat %>%
   left_join(Dat %>%
               filter(hrs == 0) %>%
@@ -74,6 +84,9 @@ Dat <- Dat %>%
 Dat
 
 
+
+
+
 #' Confirm that count match expected species
 #' In general it does. ST00060 seems to be the only semi problematic
 Dat %>% 
diff --git a/lme_pilot/run_lme4_model.r b/lme_pilot/run_lme4_model.r
new file mode 100644
index 0000000..3cc8055
--- /dev/null
+++ b/lme_pilot/run_lme4_model.r
@@ -0,0 +1,165 @@
+setwd("/Users/sur/lab/exp/2025/today")
+
+library(tidyverse)
+
+Tab <- read_csv("2025-03-06.ancom_data/ancom_data/freq.csv") %>%
+  rename(strain = 1)
+Tab
+
+Meta <- read_csv("2025-03-06.ancom_data/ancom_data/meta.csv") %>%
+  rename(id = 1)
+Meta
+
+
+
+syncoms <- read_tsv("../../../data/2024_rhizo_pilot_syncom_NS/NS1/syncoms.tsv")
+
+#' Select two communities to test
+
+# %>% select(community, ST00046, ST00154, ST00101, ST00109, ST00042, ST00060
+
+meta <- Meta %>%
+  filter(community %in% c("R1", "R2")) %>%
+  left_join(syncoms %>%
+              select(strain, R1,R2) %>%
+              pivot_longer(-strain, names_to = "community", values_to = "presence") %>%
+              pivot_wider(names_from = "strain", values_from = "presence"),
+            by = "community")
+
+meta %>% 
+  print(n = 100)
+
+meta <- meta %>%
+  pivot_longer(-c("id", "community", "hrs", "techrep", "exp", "temp", "color_comsint", "community_temp"),
+               names_to = "strain", values_to = "added") %>%
+  mutate(added = replace_na(added, 0))
+
+meta %>% 
+  print(n = 100)
+
+
+strains <- Tab$strain
+strains <- strains %>%
+  str_replace("NS_042g_27F", "ST00042") %>%
+  str_replace("NS_164C_27F", "ST00164") %>%
+  str_replace("NS_110C_1_27F", "ST00110") 
+strains
+Tab$strain <- strains
+
+
+tab <- Tab %>%
+  pivot_longer(-strain, names_to = "id", values_to = "count") 
+tab
+
+
+
+Dat <- meta %>%
+  left_join(tab %>%
+              group_by(id) %>%
+              summarize(depth = sum(count)),
+            by = "id") %>%
+  left_join(tab, by = c("id", "strain"))
+Dat
+
+#' Calculate relative abundances of each strain on the inoculum
+#' of each experiment
+
+Dat <- Dat %>%
+  left_join(Dat %>%
+              filter(hrs == 0) %>%
+              group_by(strain, community, exp) %>%
+              summarise(i_freq = sum(count) / sum(depth),
+                        .groups = "drop"),
+            by = c("strain", "community", "exp"))
+Dat
+
+
+#' Confirm that count match expected species
+#' In general it does. ST00060 seems to be the only semi problematic
+Dat %>% 
+  ggplot(aes(x = added == 1, y = count)) +
+  facet_wrap(~ strain, scales = "free_y") +
+  geom_point(position = position_jitter(width = 0.1, height = 0)) +
+  theme_classic()
+
+
+
+#' # Model with lme4
+library(lme4)
+Dat
+
+
+dat <- Dat %>%
+  mutate(b_com = paste0(community, "_", strain)) %>%
+  mutate(b_com = replace(b_com, added == 0, NA)) %>%
+  
+  mutate(b_rep = paste0(exp, "_", strain)) %>%
+  mutate(b_rep = replace(b_rep, added == 0, NA)) %>%
+  
+  mutate(b_temp = paste0(temp, "_", strain)) %>%
+  mutate(b_temp = replace(b_temp, added == 0, NA)) %>%
+  
+  mutate(b_obs = as.character(1:n()))
+dat
+
+m1 <- glmer(count ~ log(depth) + i_freq + (1|b_com) + 
+              (1|b_rep) + (1|b_temp) + (1|b_obs), 
+            data = dat %>%
+              filter(hrs == 24), 
+            family = poisson(link = log) )
+summary(m1)
+AIC(m1)
+BIC(m1)
+
+#' Get the effect of community
+b_com <- ranef(m1, condVar = TRUE, whichel = "b_com", postVar = TRUE)
+res <- tibble(effect = row.names(b_com$b_com),
+              est = b_com$b_com[,1],
+              postVar = attr(b_com$b_com, "postVar")[,,]) %>%
+  mutate(lower = qnorm(p = 0.025, mean = est, sd = sqrt(postVar)),
+         upper = qnorm(p = 0.975, mean = est, sd = sqrt(postVar)),
+         pval = 2 * pnorm(q = -abs(est), mean = 0, sd = sqrt(postVar))) %>%
+  mutate(qval = p.adjust(pval)) %>%
+  arrange(est) %>%
+  mutate(effect = factor(effect, levels = effect))
+res
+
+res %>%
+  ggplot(aes(x = est, y = effect)) +
+  geom_errorbarh(aes(xmin = lower, xmax = upper)) +
+  geom_point() +
+  geom_vline(xintercept = 0) +
+  theme_classic()
+
+
+#' Get the effect of temp
+b_temp <- ranef(m1, condVar = TRUE, whichel = "b_temp", postVar = TRUE)
+res <- tibble(effect = row.names(b_temp$b_temp),
+              est = b_temp$b_temp[,1],
+              postVar = attr(b_temp$b_temp, "postVar")[,,]) %>%
+  mutate(lower = qnorm(p = 0.025, mean = est, sd = sqrt(postVar)),
+         upper = qnorm(p = 0.975, mean = est, sd = sqrt(postVar)),
+         pval = 2 * pnorm(q = -abs(est), mean = 0, sd = sqrt(postVar))) %>%
+  mutate(qval = p.adjust(pval)) %>%
+  arrange(est) %>%
+  mutate(effect = factor(effect, levels = effect))
+res
+
+res %>%
+  ggplot(aes(x = est, y = effect)) +
+  geom_errorbarh(aes(xmin = lower, xmax = upper)) +
+  geom_point() +
+  geom_vline(xintercept = 0) +
+  theme_classic()
+
+
+#' Example of ST00046 showing that it decreases more at 28 than 32 (not significant)
+dat %>%
+  filter(strain == "ST00046") %>%
+  filter(hrs %in% c(24)) %>%
+  ggplot(aes(col = factor(temp))) +
+  facet_wrap(~community) +
+  geom_segment(aes(x = 0, y = i_freq, xend = temp, yend = count / depth, linetype = exp )) +
+  theme_classic() 
+
+

From 24bce90e57f6de064f2e49e8822c4363af882e17 Mon Sep 17 00:00:00 2001
From: sur <surh@users.noreply.github.com>
Date: Fri, 7 Mar 2025 11:57:03 -0600
Subject: [PATCH 04/17] Found error in added

---
 lme_pilot/process_data.r | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/lme_pilot/process_data.r b/lme_pilot/process_data.r
index 7d3dbd8..14a84f4 100644
--- a/lme_pilot/process_data.r
+++ b/lme_pilot/process_data.r
@@ -83,6 +83,8 @@ Dat <- Dat %>%
             by = c("strain", "community", "exp"))
 Dat
 
+#' # Add modelling variables
+
 
 
 
@@ -90,6 +92,7 @@ Dat
 #' Confirm that count match expected species
 #' In general it does. ST00060 seems to be the only semi problematic
 Dat %>% 
+  filter(community %in% c("R3", "R4")) %>%
   ggplot(aes(x = added == 1, y = count)) +
   facet_wrap(~ strain, scales = "free_y") +
   geom_point(position = position_jitter(width = 0.1, height = 0)) +
@@ -97,6 +100,11 @@ Dat %>%
   
 
 
+Dat %>% filter(strain == "ST00046") %>% filter(community %in% c("R3","R4")) %>% arrange(added) %>% print(n = 200)
+
+Dat %>% filter(community %in% c("R3","R4")) %>%
+  filter(added == 1)
+
 #' # Model with lme4
 library(lme4)
 Dat

From 0571b3d3a655d181ec332c137866aba4f554fc5d Mon Sep 17 00:00:00 2001
From: sur <surh@users.noreply.github.com>
Date: Fri, 7 Mar 2025 16:24:20 -0600
Subject: [PATCH 05/17] Fixed error in added line, cleaning code

---
 lme_pilot/process_data.r | 28 +++++++++++++++++++++++++---
 1 file changed, 25 insertions(+), 3 deletions(-)

diff --git a/lme_pilot/process_data.r b/lme_pilot/process_data.r
index 14a84f4..7824302 100644
--- a/lme_pilot/process_data.r
+++ b/lme_pilot/process_data.r
@@ -34,7 +34,7 @@ syncoms
 meta <- Meta %>%
   # filter(community %in% c("R1", "R2")) %>%
   left_join(syncoms %>%
-              select(strain, R1,R2) %>%
+              # select(strain, R1,R2) %>%
               pivot_longer(-strain, names_to = "community", values_to = "presence") %>%
               pivot_wider(names_from = "strain", values_from = "presence"),
             by = "community")
@@ -85,14 +85,36 @@ Dat
 
 #' # Add modelling variables
 
+#' We add variables for things we want to model. We are **ignoring time** for 
+#' the time (;)) being
 
+Dat <- Dat %>%
+  # The effect of community on strain abundance
+  mutate(b_com = paste0(community, "_", strain)) %>%
+  mutate(b_com = replace(b_com, added == 0, NA)) %>%
+  
+  # The effect of biological replicat on strain abundance
+  mutate(b_rep = paste0(exp, "_", strain)) %>%
+  mutate(b_rep = replace(b_rep, added == 0, NA)) %>%
+  
+  # The effect of temperature on strain abundance
+  mutate(b_temp = paste0(temp, "_", strain)) %>%
+  mutate(b_temp = replace(b_temp, added == 0, NA)) %>%
+  
+  # The effect of *temperature x community* on strain abundance
+  
+  # The effect of community *color* on strain abundance
+  
+  # Individual observation-level effect (for overdispersion)
+  mutate(b_obs = as.character(1:n()))
+Dat
 
 
 
 #' Confirm that count match expected species
 #' In general it does. ST00060 seems to be the only semi problematic
 Dat %>% 
-  filter(community %in% c("R3", "R4")) %>%
+  # filter(community %in% c("R3", "R4")) %>%
   ggplot(aes(x = added == 1, y = count)) +
   facet_wrap(~ strain, scales = "free_y") +
   geom_point(position = position_jitter(width = 0.1, height = 0)) +
@@ -100,7 +122,7 @@ Dat %>%
   
 
 
-Dat %>% filter(strain == "ST00046") %>% filter(community %in% c("R3","R4")) %>% arrange(added) %>% print(n = 200)
+Dat %>% filter(strain == "ST00046")  %>% arrange(added) %>% print(n = 200)
 
 Dat %>% filter(community %in% c("R3","R4")) %>%
   filter(added == 1)

From a1397407d55e9b6c89f4e8e556bc3764263dce61 Mon Sep 17 00:00:00 2001
From: sur <surh@users.noreply.github.com>
Date: Fri, 7 Mar 2025 18:26:42 -0600
Subject: [PATCH 06/17] Trying to get a definition of preence when not added

---
 lme_pilot/process_data.r | 107 ++++++++++++++++++++++++++++++++-------
 1 file changed, 90 insertions(+), 17 deletions(-)

diff --git a/lme_pilot/process_data.r b/lme_pilot/process_data.r
index 7824302..77c45b4 100644
--- a/lme_pilot/process_data.r
+++ b/lme_pilot/process_data.r
@@ -84,35 +84,60 @@ Dat <- Dat %>%
 Dat
 
 #' # Add modelling variables
-
+#' 
 #' We add variables for things we want to model. We are **ignoring time** for 
 #' the time (;)) being
-
-Dat <- Dat %>%
+#' 
+#' ## Assuming design is perfect
+#' 
+#' We discard effects when a strain wasn't added 
+dat <- Dat %>%
   # The effect of community on strain abundance
   mutate(b_com = paste0(community, "_", strain)) %>%
   mutate(b_com = replace(b_com, added == 0, NA)) %>%
-  
-  # The effect of biological replicat on strain abundance
-  mutate(b_rep = paste0(exp, "_", strain)) %>%
-  mutate(b_rep = replace(b_rep, added == 0, NA)) %>%
+  mutate(b_com = replace(b_com, hrs == 0, NA)) %>%
   
   # The effect of temperature on strain abundance
   mutate(b_temp = paste0(temp, "_", strain)) %>%
   mutate(b_temp = replace(b_temp, added == 0, NA)) %>%
-  
-  # The effect of *temperature x community* on strain abundance
+  mutate(b_temp = replace(b_temp, hrs == 0, NA)) %>%
   
   # The effect of community *color* on strain abundance
+  mutate(b_col = paste0(color_comsint, "_", strain)) %>%
+  mutate(b_col = replace(b_col, added == 0, NA)) %>%
+  mutate(b_col = replace(b_col, hrs == 0, NA)) %>%
+  
+  # The effect of biological replicate on strain abundance
+  mutate(b_rep = paste0(exp, "_", strain)) %>%
+  mutate(b_rep = replace(b_rep, added == 0, NA)) %>%
+  mutate(b_rep = replace(b_rep, hrs == 0, NA)) %>%
   
+  # The effect of *temperature x community* on strain abundance
+  mutate(b_temp_com = paste0(temp, "_", community, "_", strain)) %>%
+  mutate(b_temp_com = replace(b_temp_com, added == 0, NA)) %>%
+  mutate(b_temp_com = replace(b_temp_com, hrs == 0, NA)) %>%
+
   # Individual observation-level effect (for overdispersion)
   mutate(b_obs = as.character(1:n()))
-Dat
+dat
+
+# Dat %>%
+#   filter(hrs == 0) %>%
+#   print(n = 500)
+# 
+# 
+# Dat %>%
+#   filter(hrs != 0) %>%
+#   print(n = 1500)
 
+write_tsv(dat, "pilot_dat_perfect_design.tsv")
 
 
-#' Confirm that count match expected species
-#' In general it does. ST00060 seems to be the only semi problematic
+#' ## Based on empirical results
+#' ### Error in design construction
+#' 
+#' In general there is a very large difference between the expected (added == 1) 
+#' and unexpected (added != 0) strain counts
 Dat %>% 
   # filter(community %in% c("R3", "R4")) %>%
   ggplot(aes(x = added == 1, y = count)) +
@@ -120,13 +145,61 @@ Dat %>%
   geom_point(position = position_jitter(width = 0.1, height = 0)) +
   theme_classic()
   
+#' We model added as a function of counts and depth and do model selection
+#' To find the best predictors of added
+
+m1 <- glm(added ~ lcount + strain + lcount * strain + community +  lcount * community + exp + shrs + shrs2,
+          data = Dat %>%
+            mutate(shrs = scale(hrs)) %>%
+            mutate(shrs2 = shrs * shrs) %>%
+            mutate(lcount = log(count + 1)),
+          family = binomial(link = "logit"))
+m1
+summary(m1)
+drop(anova(m1, test = "LRT"))
+
+
+m2 <- glm(added ~ lcount + strain + lcount * strain + community +  lcount * community + shrs + shrs2,
+          data = Dat %>%
+            mutate(shrs = scale(hrs)) %>%
+            mutate(shrs2 = shrs * shrs) %>%
+            mutate(lcount = log(count + 1)),
+          family = binomial(link = "logit"))
+m2
+summary(m2)
+drop(anova(m2, test = "LRT"))
+
+
+m3 <- glm(added ~ lcount + strain + lcount * strain + community +  lcount * community + shrs,
+          data = Dat %>%
+            mutate(shrs = scale(hrs)) %>%
+            mutate(shrs2 = shrs * shrs) %>%
+            mutate(lcount = log(count + 1)),
+          family = binomial(link = "logit"))
+m3
+summary(m3)
+drop(anova(m3, test = "LRT"))
+AIC(m1,m2, m3)
+
+#' I'm not conviced of using this method, we would need some gold standard
+#' and training, it seems to miss manu cases where strain is there despite not
+#' added
+hist(predict(m3, type = "response"))
+table(predict(m3, type = "response") >= .8, Dat$added)
+
+Dat %>%
+  filter(predict(m3, type = "response") >= .8 & added == 0)
+
+Dat %>%
+  filter(added == 0) %>%
+  arrange(desc(count)) %>%
+  filter(count > 0) %>%
+  print(., n = nrow(.))
+
+#' Instead lets setup a 200 read threshold for calling a strain present despite
+#' not being added
 
 
-Dat %>% filter(strain == "ST00046")  %>% arrange(added) %>% print(n = 200)
-
-Dat %>% filter(community %in% c("R3","R4")) %>%
-  filter(added == 1)
-
 #' # Model with lme4
 library(lme4)
 Dat

From 796e5438b41b692516717a9849d1120769fb0b3b Mon Sep 17 00:00:00 2001
From: sur <surh@users.noreply.github.com>
Date: Mon, 10 Mar 2025 12:29:41 -0600
Subject: [PATCH 07/17] First data processing

Need to check if it works when a strain is not added (currently NA)
Need to modify to include effect when strain present even if not added
---
 lme_pilot/process_data.r | 114 ++++++++++-----------------------------
 1 file changed, 29 insertions(+), 85 deletions(-)

diff --git a/lme_pilot/process_data.r b/lme_pilot/process_data.r
index 77c45b4..e441301 100644
--- a/lme_pilot/process_data.r
+++ b/lme_pilot/process_data.r
@@ -6,17 +6,17 @@
 #' The working directory may be set different depending on where are the
 #' relevant files. I'm using NSM's ancom_data files downloaded on 2025-03-06.
 
-setwd("/Users/sur/lab/exp/2025/today")
+knitr::opts_knit$set(root.dir = "/Users/sur/lab/exp/2025/today")
 library(tidyverse)
 
 #' # Read data
 
 #' We add column names to the sample ids (`id`) and strain ids (`strain`)
-Tab <- read_csv("2025-03-06.ancom_data/ancom_data/freq.csv") %>%
+Tab <- read_csv("../2025-03-06.pilot_lme4/2025-03-06.ancom_data/ancom_data/freq.csv") %>%
   rename(strain = 1)
 Tab
 
-Meta <- read_csv("2025-03-06.ancom_data/ancom_data/meta.csv") %>%
+Meta <- read_csv("../2025-03-06.pilot_lme4/2025-03-06.ancom_data/ancom_data/meta.csv") %>%
   rename(id = 1)
 Meta
 
@@ -134,17 +134,19 @@ write_tsv(dat, "pilot_dat_perfect_design.tsv")
 
 
 #' ## Based on empirical results
-#' ### Error in design construction
-#' 
-#' In general there is a very large difference between the expected (added == 1) 
+ 
+
+#' ### Error in design construction (procrastination)
+#'
+#' In general there is a very large difference between the expected (added == 1)
 #' and unexpected (added != 0) strain counts
-Dat %>% 
+Dat %>%
   # filter(community %in% c("R3", "R4")) %>%
   ggplot(aes(x = added == 1, y = count)) +
   facet_wrap(~ strain, scales = "free_y") +
   geom_point(position = position_jitter(width = 0.1, height = 0)) +
   theme_classic()
-  
+
 #' We model added as a function of counts and depth and do model selection
 #' To find the best predictors of added
 
@@ -198,84 +200,26 @@ Dat %>%
 
 #' Instead lets setup a 200 read threshold for calling a strain present despite
 #' not being added
-
-
-#' # Model with lme4
-library(lme4)
+#' 
 Dat
-
-
-dat <- Dat %>%
-  mutate(b_com = paste0(community, "_", strain)) %>%
-  mutate(b_com = replace(b_com, added == 0, NA)) %>%
-  
-  mutate(b_rep = paste0(exp, "_", strain)) %>%
-  mutate(b_rep = replace(b_rep, added == 0, NA)) %>%
-  
-  mutate(b_temp = paste0(temp, "_", strain)) %>%
-  mutate(b_temp = replace(b_temp, added == 0, NA)) %>%
-  
-  mutate(b_obs = as.character(1:n()))
-dat
-
-m1 <- glmer(count ~ log(depth) + i_freq + (1|b_com) + 
-              (1|b_rep) + (1|b_temp) + (1|b_obs), 
-            data = dat %>%
-              filter(hrs == 24), 
-            family = poisson(link = log) )
-summary(m1)
-AIC(m1)
-BIC(m1)
-
-#' Get the effect of community
-b_com <- ranef(m1, condVar = TRUE, whichel = "b_com", postVar = TRUE)
-res <- tibble(effect = row.names(b_com$b_com),
-              est = b_com$b_com[,1],
-              postVar = attr(b_com$b_com, "postVar")[,,]) %>%
-  mutate(lower = qnorm(p = 0.025, mean = est, sd = sqrt(postVar)),
-         upper = qnorm(p = 0.975, mean = est, sd = sqrt(postVar)),
-         pval = 2 * pnorm(q = -abs(est), mean = 0, sd = sqrt(postVar))) %>%
-  mutate(qval = p.adjust(pval)) %>%
-  arrange(est) %>%
-  mutate(effect = factor(effect, levels = effect))
-res
-  
-res %>%
-  ggplot(aes(x = est, y = effect)) +
-  geom_errorbarh(aes(xmin = lower, xmax = upper)) +
-  geom_point() +
-  geom_vline(xintercept = 0) +
+Dat %>%
+  # filter(added == 0) %>%
+  arrange(desc(count)) %>%
+  mutate(obs = 1:length(count)) %>%
+  ggplot(aes(x = obs, y = count)) +
+  geom_point(aes(col = factor(added == 1))) +
+  geom_hline(yintercept = 200) +
+  scale_x_log10() +
+  # scale_y_log10() +
   theme_classic()
 
-
-#' Get the effect of temp
-b_temp <- ranef(m1, condVar = TRUE, whichel = "b_temp", postVar = TRUE)
-res <- tibble(effect = row.names(b_temp$b_temp),
-              est = b_temp$b_temp[,1],
-              postVar = attr(b_temp$b_temp, "postVar")[,,]) %>%
-  mutate(lower = qnorm(p = 0.025, mean = est, sd = sqrt(postVar)),
-         upper = qnorm(p = 0.975, mean = est, sd = sqrt(postVar)),
-         pval = 2 * pnorm(q = -abs(est), mean = 0, sd = sqrt(postVar))) %>%
-  mutate(qval = p.adjust(pval)) %>%
-  arrange(est) %>%
-  mutate(effect = factor(effect, levels = effect))
-res
-
-res %>%
-  ggplot(aes(x = est, y = effect)) +
-  geom_errorbarh(aes(xmin = lower, xmax = upper)) +
-  geom_point() +
-  geom_vline(xintercept = 0) +
+Dat %>%
+  filter(added == 0) %>%
+  arrange(desc(count)) %>%
+  mutate(obs = 1:length(count)) %>%
+  ggplot(aes(x = obs, y = count)) +
+  geom_point(aes(col = factor(added == 1))) +
+  geom_hline(yintercept = 200) +
+  scale_x_log10() +
+  # scale_y_log10() +
   theme_classic()
-
-
-#' Example of ST00046 showing that it decreases more at 28 than 32 (not significant)
-dat %>%
-  filter(strain == "ST00046") %>%
-  filter(hrs %in% c(24)) %>%
-  ggplot(aes(col = factor(temp))) +
-  facet_wrap(~community) +
-  geom_segment(aes(x = 0, y = i_freq, xend = temp, yend = count / depth, linetype = exp )) +
-  theme_classic() 
-
-

From 63f1e98fbdb808576597aa7742403d9bf712b965 Mon Sep 17 00:00:00 2001
From: sur <surh@users.noreply.github.com>
Date: Mon, 10 Mar 2025 13:02:23 -0600
Subject: [PATCH 08/17] Testing 4 basic models

---
 lme_pilot/run_lme4_model.r | 115 ++++++++++---------------------------
 1 file changed, 30 insertions(+), 85 deletions(-)

diff --git a/lme_pilot/run_lme4_model.r b/lme_pilot/run_lme4_model.r
index 3cc8055..94af99f 100644
--- a/lme_pilot/run_lme4_model.r
+++ b/lme_pilot/run_lme4_model.r
@@ -1,106 +1,51 @@
 setwd("/Users/sur/lab/exp/2025/today")
-
 library(tidyverse)
+library(lme4)
 
-Tab <- read_csv("2025-03-06.ancom_data/ancom_data/freq.csv") %>%
-  rename(strain = 1)
-Tab
-
-Meta <- read_csv("2025-03-06.ancom_data/ancom_data/meta.csv") %>%
-  rename(id = 1)
-Meta
-
-
-
-syncoms <- read_tsv("../../../data/2024_rhizo_pilot_syncom_NS/NS1/syncoms.tsv")
-
-#' Select two communities to test
-
-# %>% select(community, ST00046, ST00154, ST00101, ST00109, ST00042, ST00060
-
-meta <- Meta %>%
-  filter(community %in% c("R1", "R2")) %>%
-  left_join(syncoms %>%
-              select(strain, R1,R2) %>%
-              pivot_longer(-strain, names_to = "community", values_to = "presence") %>%
-              pivot_wider(names_from = "strain", values_from = "presence"),
-            by = "community")
-
-meta %>% 
-  print(n = 100)
-
-meta <- meta %>%
-  pivot_longer(-c("id", "community", "hrs", "techrep", "exp", "temp", "color_comsint", "community_temp"),
-               names_to = "strain", values_to = "added") %>%
-  mutate(added = replace_na(added, 0))
-
-meta %>% 
-  print(n = 100)
-
+Dat <- read_tsv("pilot_dat_perfect_design.tsv")
+Dat
 
-strains <- Tab$strain
-strains <- strains %>%
-  str_replace("NS_042g_27F", "ST00042") %>%
-  str_replace("NS_164C_27F", "ST00164") %>%
-  str_replace("NS_110C_1_27F", "ST00110") 
-strains
-Tab$strain <- strains
 
+#' Try simple model
 
-tab <- Tab %>%
-  pivot_longer(-strain, names_to = "id", values_to = "count") 
-tab
 
+#' 24 hours only
+dat <- Dat %>%
+  filter(hrs == 24)
 
+#' Four basic poisson models to check the effect of the overdispersion
+#' variable (b_obs) and the interaction (temp * community) effects
+m1.t1 <-  glmer(count ~ log(depth) + i_freq + 
+                  (1|b_com) + (1|b_temp) + (b_temp_com) + (1|b_rep) + (1|b_obs),
+                data = dat, 
+                family = poisson(link = "log") )
 
-Dat <- meta %>%
-  left_join(tab %>%
-              group_by(id) %>%
-              summarize(depth = sum(count)),
-            by = "id") %>%
-  left_join(tab, by = c("id", "strain"))
-Dat
+m2.t1 <-  glmer(count ~ log(depth) + i_freq + 
+                  (1|b_com) + (1|b_temp) + (1|b_rep) + (1|b_obs),
+                data = dat, 
+                family = poisson(link = "log") )
 
-#' Calculate relative abundances of each strain on the inoculum
-#' of each experiment
+m3.t1 <-  glmer(count ~ log(depth) + i_freq + 
+                  (1|b_com) + (1|b_temp) + (b_temp_com) + (1|b_rep),
+                data = dat, 
+                family = poisson(link = "log"))
 
-Dat <- Dat %>%
-  left_join(Dat %>%
-              filter(hrs == 0) %>%
-              group_by(strain, community, exp) %>%
-              summarise(i_freq = sum(count) / sum(depth),
-                        .groups = "drop"),
-            by = c("strain", "community", "exp"))
-Dat
+m4.t1 <-  glmer(count ~ log(depth) + i_freq + 
+                  (1|b_com) + (1|b_temp) + (1|b_rep),
+                data = dat, 
+                family = poisson(link = "log") )
+                
+AIC(m1.t1, m2.t1, m3.t1, m4.t1)
+BIC(m1.t1, m2.t1, m3.t1, m4.t1)
 
 
-#' Confirm that count match expected species
-#' In general it does. ST00060 seems to be the only semi problematic
-Dat %>% 
-  ggplot(aes(x = added == 1, y = count)) +
-  facet_wrap(~ strain, scales = "free_y") +
-  geom_point(position = position_jitter(width = 0.1, height = 0)) +
-  theme_classic()
 
 
+summary(m1.t1)
 
-#' # Model with lme4
-library(lme4)
-Dat
 
+lattice::qqmath(ranef(m1.t1))
 
-dat <- Dat %>%
-  mutate(b_com = paste0(community, "_", strain)) %>%
-  mutate(b_com = replace(b_com, added == 0, NA)) %>%
-  
-  mutate(b_rep = paste0(exp, "_", strain)) %>%
-  mutate(b_rep = replace(b_rep, added == 0, NA)) %>%
-  
-  mutate(b_temp = paste0(temp, "_", strain)) %>%
-  mutate(b_temp = replace(b_temp, added == 0, NA)) %>%
-  
-  mutate(b_obs = as.character(1:n()))
-dat
 
 m1 <- glmer(count ~ log(depth) + i_freq + (1|b_com) + 
               (1|b_rep) + (1|b_temp) + (1|b_obs), 

From 9a9cae40aadbfdc0cf6133ffe2c6beb626add7e8 Mon Sep 17 00:00:00 2001
From: sur <surh@users.noreply.github.com>
Date: Mon, 10 Mar 2025 15:54:06 -0600
Subject: [PATCH 09/17] Trying models

---
 lme_pilot/run_lme4_model.r | 26 ++++++++++++++++++++++----
 1 file changed, 22 insertions(+), 4 deletions(-)

diff --git a/lme_pilot/run_lme4_model.r b/lme_pilot/run_lme4_model.r
index 94af99f..5f6ab94 100644
--- a/lme_pilot/run_lme4_model.r
+++ b/lme_pilot/run_lme4_model.r
@@ -18,28 +18,46 @@ dat <- Dat %>%
 m1.t1 <-  glmer(count ~ log(depth) + i_freq + 
                   (1|b_com) + (1|b_temp) + (b_temp_com) + (1|b_rep) + (1|b_obs),
                 data = dat, 
-                family = poisson(link = "log") )
+                family = poisson(link = "log"),
+                control=glmerControl(optCtrl=list(maxfun=4*1e4)))
 
 m2.t1 <-  glmer(count ~ log(depth) + i_freq + 
                   (1|b_com) + (1|b_temp) + (1|b_rep) + (1|b_obs),
                 data = dat, 
-                family = poisson(link = "log") )
+                family = poisson(link = "log"),
+                control=glmerControl(optCtrl=list(maxfun=4*1e4)))
 
 m3.t1 <-  glmer(count ~ log(depth) + i_freq + 
                   (1|b_com) + (1|b_temp) + (b_temp_com) + (1|b_rep),
                 data = dat, 
-                family = poisson(link = "log"))
+                family = poisson(link = "log"),
+                control=glmerControl(optCtrl=list(maxfun=4*1e4)))
 
 m4.t1 <-  glmer(count ~ log(depth) + i_freq + 
                   (1|b_com) + (1|b_temp) + (1|b_rep),
                 data = dat, 
-                family = poisson(link = "log") )
+                family = poisson(link = "log"),
+                control=glmerControl(optCtrl=list(maxfun=4*1e4)))
                 
 AIC(m1.t1, m2.t1, m3.t1, m4.t1)
 BIC(m1.t1, m2.t1, m3.t1, m4.t1)
 
+#' Now fit the negative binomial
 
+m5.t1 <- glmer.nb(count ~ log(depth) + i_freq + 
+                    (1|b_com) + (1|b_temp) + (b_temp_com) + (1|b_rep),
+                  data = dat)
 
+m6.t1 <- glmer.nb(count ~ log(depth) + i_freq + 
+                    (1|b_com) + (1|b_temp) + (1|b_rep),
+                  data = dat)
+
+
+
+AIC(m1.t1, m2.t1, m3.t1, m4.t1, m5.t1, m6.t1)
+BIC(m1.t1, m2.t1, m3.t1, m4.t1, m5.t1, m6.t1)
+
+save()
 
 summary(m1.t1)
 

From aae6666b91c64079582e3bbb05dc1d9c24734ae8 Mon Sep 17 00:00:00 2001
From: sur <surh@users.noreply.github.com>
Date: Mon, 10 Mar 2025 18:44:01 -0600
Subject: [PATCH 10/17] Adding brms of negbinomial model

---
 lme_pilot/run_lme4_model.r | 82 +++++++++++++++++++++++++++++++++++++-
 1 file changed, 81 insertions(+), 1 deletion(-)

diff --git a/lme_pilot/run_lme4_model.r b/lme_pilot/run_lme4_model.r
index 5f6ab94..0be2c6b 100644
--- a/lme_pilot/run_lme4_model.r
+++ b/lme_pilot/run_lme4_model.r
@@ -42,6 +42,8 @@ m4.t1 <-  glmer(count ~ log(depth) + i_freq +
 AIC(m1.t1, m2.t1, m3.t1, m4.t1)
 BIC(m1.t1, m2.t1, m3.t1, m4.t1)
 
+#' Basically we need to include the overdispersion parameter.
+#' The model without interactions provides a better fit.
 #' Now fit the negative binomial
 
 m5.t1 <- glmer.nb(count ~ log(depth) + i_freq + 
@@ -55,9 +57,87 @@ m6.t1 <- glmer.nb(count ~ log(depth) + i_freq +
 
 
 AIC(m1.t1, m2.t1, m3.t1, m4.t1, m5.t1, m6.t1)
+# df        AIC
+# m1.t1 126   4288.263
+# m2.t1   7   4570.509
+# m3.t1 125  91657.828
+# m4.t1   6 240681.080
+# m5.t1 126   4277.931
+# m6.t1   7   4563.837
 BIC(m1.t1, m2.t1, m3.t1, m4.t1, m5.t1, m6.t1)
+# df        BIC
+# m1.t1 126   4724.171
+# m2.t1   7   4594.726
+# m3.t1 125  92090.277
+# m4.t1   6 240701.837
+# m5.t1 126   4713.839
+# m6.t1   7   4588.054
+
+save(m1.t1, m2.t1, m3.t1, m4.t1, m5.t1, m6.t1, file = "models.rdat")
+
+#' There were convergence issues but negative binomial performs well
+#' There is disagreement between AIC and BIC. BIC gives the models without
+#' temp by community interactions better scores while AIC gives the models with
+#' temperature x community interaction as better
+#' 
+#' From the winbugs manual:
+#' 
+#' 
+#' In hierarchical models, these three techniques are
+#' essentially answering different prediction problems.
+#' Suppose the three levels of our model concerned
+#' classes within schools within a country. Then
+#' 1. if we were interested in predicting results of
+#' future classes in those actual schools, then
+#' DIC is appropriate (ie the random effects
+#'                     themselves are of interest);
+#' 2. if we were interested in predicting results of
+#' future schools in that country, then marginal-
+#'   likelihood methods such as AIC are
+#' appropriate (ie the population parameters are
+#'              of interest);
+#' 3. if we were interested in predicting results for
+#' a new country, then BIC/ Bayes factors are
+#' appropriate (ie the 'true' underlying model is
+#'              of interest).
+
+#' We are interested in future run of these communities, thus
+#' DIC may be more relevant?
+
+
+#' Will run brms with the negbinomial model
+library(brms)
+
+
+m5.br.t1 <- brm(count ~ log(depth) + i_freq + 
+                  (1|b_com) + (1|b_temp) + (b_temp_com) + (1|b_rep),
+                data = dat,
+                chains = 4,
+                iter = 4000,
+                warmup = 1000,
+                cores = 4,
+                thin = 2,
+                family = "negbinomial")
+
+
+m6.br.t1 <- brm(count ~ log(depth) + i_freq + 
+                  (1|b_com) + (1|b_temp) + (1|b_rep),
+                data = dat,
+                chains = 4,
+                iter = 4000,
+                warmup = 1000,
+                cores = 4,
+                thin = 2,
+                family = "negbinomial")
+
+
+summary(m6.br.t1)
+summary(m6.t1)
+
+
+brms::waic(m5.br.t1, m6.br.t1)
+
 
-save()
 
 summary(m1.t1)
 

From 19fb6ca24b4b6448fb85ff2920054ba19b24195d Mon Sep 17 00:00:00 2001
From: sur <surh@users.noreply.github.com>
Date: Mon, 10 Mar 2025 19:01:22 -0600
Subject: [PATCH 11/17] Fixing temp by env variable

---
 lme_pilot/run_lme4_model.r | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/lme_pilot/run_lme4_model.r b/lme_pilot/run_lme4_model.r
index 0be2c6b..565a9c0 100644
--- a/lme_pilot/run_lme4_model.r
+++ b/lme_pilot/run_lme4_model.r
@@ -16,7 +16,7 @@ dat <- Dat %>%
 #' Four basic poisson models to check the effect of the overdispersion
 #' variable (b_obs) and the interaction (temp * community) effects
 m1.t1 <-  glmer(count ~ log(depth) + i_freq + 
-                  (1|b_com) + (1|b_temp) + (b_temp_com) + (1|b_rep) + (1|b_obs),
+                  (1|b_com) + (1|b_temp) + (1|b_temp_com) + (1|b_rep) + (1|b_obs),
                 data = dat, 
                 family = poisson(link = "log"),
                 control=glmerControl(optCtrl=list(maxfun=4*1e4)))
@@ -28,7 +28,7 @@ m2.t1 <-  glmer(count ~ log(depth) + i_freq +
                 control=glmerControl(optCtrl=list(maxfun=4*1e4)))
 
 m3.t1 <-  glmer(count ~ log(depth) + i_freq + 
-                  (1|b_com) + (1|b_temp) + (b_temp_com) + (1|b_rep),
+                  (1|b_com) + (1|b_temp) + (1|b_temp_com) + (1|b_rep),
                 data = dat, 
                 family = poisson(link = "log"),
                 control=glmerControl(optCtrl=list(maxfun=4*1e4)))
@@ -47,7 +47,7 @@ BIC(m1.t1, m2.t1, m3.t1, m4.t1)
 #' Now fit the negative binomial
 
 m5.t1 <- glmer.nb(count ~ log(depth) + i_freq + 
-                    (1|b_com) + (1|b_temp) + (b_temp_com) + (1|b_rep),
+                    (1|b_com) + (1|b_temp) + (1|b_temp_com) + (1|b_rep),
                   data = dat)
 
 m6.t1 <- glmer.nb(count ~ log(depth) + i_freq + 
@@ -107,10 +107,11 @@ save(m1.t1, m2.t1, m3.t1, m4.t1, m5.t1, m6.t1, file = "models.rdat")
 
 #' Will run brms with the negbinomial model
 library(brms)
+load("brms_models.rdat")
 
-
+# Takes about 6 minutes to run, significant numeric issues.
 m5.br.t1 <- brm(count ~ log(depth) + i_freq + 
-                  (1|b_com) + (1|b_temp) + (b_temp_com) + (1|b_rep),
+                  (1|b_com) + (1|b_temp) + (1|b_temp_com) + (1|b_rep),
                 data = dat,
                 chains = 4,
                 iter = 4000,
@@ -119,7 +120,11 @@ m5.br.t1 <- brm(count ~ log(depth) + i_freq +
                 thin = 2,
                 family = "negbinomial")
 
+summary(m5.br.t1)
+summary(m5.t1)
+pairs(m5.br.t1)
 
+# Takes about 2 minutes to run with compilation
 m6.br.t1 <- brm(count ~ log(depth) + i_freq + 
                   (1|b_com) + (1|b_temp) + (1|b_rep),
                 data = dat,
@@ -134,9 +139,15 @@ m6.br.t1 <- brm(count ~ log(depth) + i_freq +
 summary(m6.br.t1)
 summary(m6.t1)
 
+save(m5.br.t1, m6.br.t1, file = "brms_models.rdat")
 
+#' We compare with waic and loo
 brms::waic(m5.br.t1, m6.br.t1)
 
+m5.br.t1.loo <- brms::loo(m5.br.t1)
+m6.br.t1.loo <- brms::loo(m6.br.t1)
+m5.br.t1.loo
+m6.br.t1.loo
 
 
 summary(m1.t1)

From 478b30e904cc73cb5af74c0ef4b0220d0f493ec9 Mon Sep 17 00:00:00 2001
From: sur <surh@users.noreply.github.com>
Date: Mon, 10 Mar 2025 19:12:04 -0600
Subject: [PATCH 12/17] Rerunning everything

---
 lme_pilot/analyze_models.r | 68 +++++++++++++++++++++++++++++++++
 lme_pilot/run_lme4_model.r | 77 ++++----------------------------------
 2 files changed, 75 insertions(+), 70 deletions(-)
 create mode 100644 lme_pilot/analyze_models.r

diff --git a/lme_pilot/analyze_models.r b/lme_pilot/analyze_models.r
new file mode 100644
index 0000000..c78da53
--- /dev/null
+++ b/lme_pilot/analyze_models.r
@@ -0,0 +1,68 @@
+
+summary(m1.t1)
+
+
+lattice::qqmath(ranef(m1.t1))
+
+
+m1 <- glmer(count ~ log(depth) + i_freq + (1|b_com) + 
+              (1|b_rep) + (1|b_temp) + (1|b_obs), 
+            data = dat %>%
+              filter(hrs == 24), 
+            family = poisson(link = log) )
+summary(m1)
+AIC(m1)
+BIC(m1)
+
+#' Get the effect of community
+b_com <- ranef(m1, condVar = TRUE, whichel = "b_com", postVar = TRUE)
+res <- tibble(effect = row.names(b_com$b_com),
+              est = b_com$b_com[,1],
+              postVar = attr(b_com$b_com, "postVar")[,,]) %>%
+  mutate(lower = qnorm(p = 0.025, mean = est, sd = sqrt(postVar)),
+         upper = qnorm(p = 0.975, mean = est, sd = sqrt(postVar)),
+         pval = 2 * pnorm(q = -abs(est), mean = 0, sd = sqrt(postVar))) %>%
+  mutate(qval = p.adjust(pval)) %>%
+  arrange(est) %>%
+  mutate(effect = factor(effect, levels = effect))
+res
+
+res %>%
+  ggplot(aes(x = est, y = effect)) +
+  geom_errorbarh(aes(xmin = lower, xmax = upper)) +
+  geom_point() +
+  geom_vline(xintercept = 0) +
+  theme_classic()
+
+
+#' Get the effect of temp
+b_temp <- ranef(m1, condVar = TRUE, whichel = "b_temp", postVar = TRUE)
+res <- tibble(effect = row.names(b_temp$b_temp),
+              est = b_temp$b_temp[,1],
+              postVar = attr(b_temp$b_temp, "postVar")[,,]) %>%
+  mutate(lower = qnorm(p = 0.025, mean = est, sd = sqrt(postVar)),
+         upper = qnorm(p = 0.975, mean = est, sd = sqrt(postVar)),
+         pval = 2 * pnorm(q = -abs(est), mean = 0, sd = sqrt(postVar))) %>%
+  mutate(qval = p.adjust(pval)) %>%
+  arrange(est) %>%
+  mutate(effect = factor(effect, levels = effect))
+res
+
+res %>%
+  ggplot(aes(x = est, y = effect)) +
+  geom_errorbarh(aes(xmin = lower, xmax = upper)) +
+  geom_point() +
+  geom_vline(xintercept = 0) +
+  theme_classic()
+
+
+#' Example of ST00046 showing that it decreases more at 28 than 32 (not significant)
+dat %>%
+  filter(strain == "ST00046") %>%
+  filter(hrs %in% c(24)) %>%
+  ggplot(aes(col = factor(temp))) +
+  facet_wrap(~community) +
+  geom_segment(aes(x = 0, y = i_freq, xend = temp, yend = count / depth, linetype = exp )) +
+  theme_classic() 
+
+
diff --git a/lme_pilot/run_lme4_model.r b/lme_pilot/run_lme4_model.r
index 565a9c0..e40d754 100644
--- a/lme_pilot/run_lme4_model.r
+++ b/lme_pilot/run_lme4_model.r
@@ -1,4 +1,7 @@
-setwd("/Users/sur/lab/exp/2025/today")
+# rmarkdown::render("/Users/sur/lab/src/NSM_comsints/lme_pilot/run_lme4_model.r", output_format = "html_document", output_dir = "/Users/sur/lab/exp/2025/today")
+# setwd("/Users/sur/lab/exp/2025/today")
+knitr::opts_knit$set(root.dir = "/Users/sur/lab/exp/2025/today")
+
 library(tidyverse)
 library(lme4)
 
@@ -109,7 +112,7 @@ save(m1.t1, m2.t1, m3.t1, m4.t1, m5.t1, m6.t1, file = "models.rdat")
 library(brms)
 load("brms_models.rdat")
 
-# Takes about 6 minutes to run, significant numeric issues.
+# No convergence issues, takes 2 min with compile
 m5.br.t1 <- brm(count ~ log(depth) + i_freq + 
                   (1|b_com) + (1|b_temp) + (1|b_temp_com) + (1|b_rep),
                 data = dat,
@@ -142,78 +145,12 @@ summary(m6.t1)
 save(m5.br.t1, m6.br.t1, file = "brms_models.rdat")
 
 #' We compare with waic and loo
-brms::waic(m5.br.t1, m6.br.t1)
+compare_ic(brms::waic(m5.br.t1), brms::waic(m6.br.t1))
 
 m5.br.t1.loo <- brms::loo(m5.br.t1)
 m6.br.t1.loo <- brms::loo(m6.br.t1)
 m5.br.t1.loo
 m6.br.t1.loo
 
-
-summary(m1.t1)
-
-
-lattice::qqmath(ranef(m1.t1))
-
-
-m1 <- glmer(count ~ log(depth) + i_freq + (1|b_com) + 
-              (1|b_rep) + (1|b_temp) + (1|b_obs), 
-            data = dat %>%
-              filter(hrs == 24), 
-            family = poisson(link = log) )
-summary(m1)
-AIC(m1)
-BIC(m1)
-
-#' Get the effect of community
-b_com <- ranef(m1, condVar = TRUE, whichel = "b_com", postVar = TRUE)
-res <- tibble(effect = row.names(b_com$b_com),
-              est = b_com$b_com[,1],
-              postVar = attr(b_com$b_com, "postVar")[,,]) %>%
-  mutate(lower = qnorm(p = 0.025, mean = est, sd = sqrt(postVar)),
-         upper = qnorm(p = 0.975, mean = est, sd = sqrt(postVar)),
-         pval = 2 * pnorm(q = -abs(est), mean = 0, sd = sqrt(postVar))) %>%
-  mutate(qval = p.adjust(pval)) %>%
-  arrange(est) %>%
-  mutate(effect = factor(effect, levels = effect))
-res
-
-res %>%
-  ggplot(aes(x = est, y = effect)) +
-  geom_errorbarh(aes(xmin = lower, xmax = upper)) +
-  geom_point() +
-  geom_vline(xintercept = 0) +
-  theme_classic()
-
-
-#' Get the effect of temp
-b_temp <- ranef(m1, condVar = TRUE, whichel = "b_temp", postVar = TRUE)
-res <- tibble(effect = row.names(b_temp$b_temp),
-              est = b_temp$b_temp[,1],
-              postVar = attr(b_temp$b_temp, "postVar")[,,]) %>%
-  mutate(lower = qnorm(p = 0.025, mean = est, sd = sqrt(postVar)),
-         upper = qnorm(p = 0.975, mean = est, sd = sqrt(postVar)),
-         pval = 2 * pnorm(q = -abs(est), mean = 0, sd = sqrt(postVar))) %>%
-  mutate(qval = p.adjust(pval)) %>%
-  arrange(est) %>%
-  mutate(effect = factor(effect, levels = effect))
-res
-
-res %>%
-  ggplot(aes(x = est, y = effect)) +
-  geom_errorbarh(aes(xmin = lower, xmax = upper)) +
-  geom_point() +
-  geom_vline(xintercept = 0) +
-  theme_classic()
-
-
-#' Example of ST00046 showing that it decreases more at 28 than 32 (not significant)
-dat %>%
-  filter(strain == "ST00046") %>%
-  filter(hrs %in% c(24)) %>%
-  ggplot(aes(col = factor(temp))) +
-  facet_wrap(~community) +
-  geom_segment(aes(x = 0, y = i_freq, xend = temp, yend = count / depth, linetype = exp )) +
-  theme_classic() 
-
+# loo_compare(brms::loo(m5.br.t1, moment_match = TRUE), brms::loo(m6.br.t1, moment_match = TRUE))
 

From 5ebd306e5a2befd0bbb1b704aa5f8c282df7736d Mon Sep 17 00:00:00 2001
From: sur <surh@users.noreply.github.com>
Date: Mon, 10 Mar 2025 19:14:38 -0600
Subject: [PATCH 13/17] re-starting

---
 lme_pilot/run_lme4_model.r | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lme_pilot/run_lme4_model.r b/lme_pilot/run_lme4_model.r
index e40d754..20cb652 100644
--- a/lme_pilot/run_lme4_model.r
+++ b/lme_pilot/run_lme4_model.r
@@ -1,5 +1,5 @@
 # rmarkdown::render("/Users/sur/lab/src/NSM_comsints/lme_pilot/run_lme4_model.r", output_format = "html_document", output_dir = "/Users/sur/lab/exp/2025/today")
-# setwd("/Users/sur/lab/exp/2025/today")
+setwd("/Users/sur/lab/exp/2025/today")
 knitr::opts_knit$set(root.dir = "/Users/sur/lab/exp/2025/today")
 
 library(tidyverse)

From 0c077e4bfd1860e00cb4dca15e4da75085ba520e Mon Sep 17 00:00:00 2001
From: sur <surh@users.noreply.github.com>
Date: Tue, 11 Mar 2025 09:05:52 -0600
Subject: [PATCH 14/17] Run final version

---
 lme_pilot/run_lme4_model.r | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lme_pilot/run_lme4_model.r b/lme_pilot/run_lme4_model.r
index 20cb652..e40d754 100644
--- a/lme_pilot/run_lme4_model.r
+++ b/lme_pilot/run_lme4_model.r
@@ -1,5 +1,5 @@
 # rmarkdown::render("/Users/sur/lab/src/NSM_comsints/lme_pilot/run_lme4_model.r", output_format = "html_document", output_dir = "/Users/sur/lab/exp/2025/today")
-setwd("/Users/sur/lab/exp/2025/today")
+# setwd("/Users/sur/lab/exp/2025/today")
 knitr::opts_knit$set(root.dir = "/Users/sur/lab/exp/2025/today")
 
 library(tidyverse)

From 8ab4984044c0200a6369fb25e8c59256cc193b86 Mon Sep 17 00:00:00 2001
From: surh <sur00mx@gmail.com>
Date: Wed, 12 Mar 2025 02:40:12 -0600
Subject: [PATCH 15/17] Ignoring rproj

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index 5b6a065..f4f606b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,4 @@
 .Rhistory
 .RData
 .Ruserdata
+*.Rproj

From f952b32cd8b4c0f0db261fb57f1c443e69f0f40e Mon Sep 17 00:00:00 2001
From: surh <sur00mx@gmail.com>
Date: Wed, 12 Mar 2025 02:40:55 -0600
Subject: [PATCH 16/17] cleaning

---
 NSM_comsints.Rproj | 13 -------------
 1 file changed, 13 deletions(-)
 delete mode 100644 NSM_comsints.Rproj

diff --git a/NSM_comsints.Rproj b/NSM_comsints.Rproj
deleted file mode 100644
index 8e3c2eb..0000000
--- a/NSM_comsints.Rproj
+++ /dev/null
@@ -1,13 +0,0 @@
-Version: 1.0
-
-RestoreWorkspace: Default
-SaveWorkspace: Default
-AlwaysSaveHistory: Default
-
-EnableCodeIndexing: Yes
-UseSpacesForTab: Yes
-NumSpacesForTab: 2
-Encoding: UTF-8
-
-RnwWeave: Sweave
-LaTeX: pdfLaTeX

From 743c986c688fb712ac419b67ae183dbcaff2243a Mon Sep 17 00:00:00 2001
From: surh <sur00mx@gmail.com>
Date: Tue, 18 Mar 2025 15:38:32 -0600
Subject: [PATCH 17/17] Version of analyze lme model

---
 lme_pilot/analyze_models.r | 100 ++++++++++++++++++++++++++++++-------
 1 file changed, 82 insertions(+), 18 deletions(-)

diff --git a/lme_pilot/analyze_models.r b/lme_pilot/analyze_models.r
index c78da53..5b68e07 100644
--- a/lme_pilot/analyze_models.r
+++ b/lme_pilot/analyze_models.r
@@ -1,21 +1,19 @@
+setwd("~/micropopgen/exp/2025/today3/")
+library(tidyverse)
+library(lme4)
+library(brms)
 
-summary(m1.t1)
 
+load("2025-03-11.data_for_aplicaciones_lcg/models.rdat")
+Dat <- read_tsv("2025-03-11.data_for_aplicaciones_lcg/pilot_dat_perfect_design.tsv") %>%
+  mutate(community = factor(community, levels = paste0("R", 1:12)))
+dat <- Dat %>%
+  filter(hrs == 24) 
 
-lattice::qqmath(ranef(m1.t1))
-
-
-m1 <- glmer(count ~ log(depth) + i_freq + (1|b_com) + 
-              (1|b_rep) + (1|b_temp) + (1|b_obs), 
-            data = dat %>%
-              filter(hrs == 24), 
-            family = poisson(link = log) )
-summary(m1)
-AIC(m1)
-BIC(m1)
+summary(m5.t1)
 
 #' Get the effect of community
-b_com <- ranef(m1, condVar = TRUE, whichel = "b_com", postVar = TRUE)
+b_com <- ranef(m5.t1, condVar = TRUE, whichel = "b_com", postVar = TRUE)
 res <- tibble(effect = row.names(b_com$b_com),
               est = b_com$b_com[,1],
               postVar = attr(b_com$b_com, "postVar")[,,]) %>%
@@ -24,7 +22,8 @@ res <- tibble(effect = row.names(b_com$b_com),
          pval = 2 * pnorm(q = -abs(est), mean = 0, sd = sqrt(postVar))) %>%
   mutate(qval = p.adjust(pval)) %>%
   arrange(est) %>%
-  mutate(effect = factor(effect, levels = effect))
+  mutate(effect = factor(effect, levels = effect)) %>%
+  print(n = 100)
 res
 
 res %>%
@@ -35,8 +34,38 @@ res %>%
   theme_classic()
 
 
+dat %>%
+  filter(strain == "ST00046") %>%
+  filter(added == 1) %>%
+  # filter(community %in% c("R1", "R2", "R3", "R4", "R5", "R12")) %>%
+  mutate(effect = as.character(community)) %>%
+  mutate(effect = replace(effect, effect %in% c("R1", "R2", "R3", "R4"), "pos")) %>%
+  mutate(effect = replace(effect, effect %in% c("R5", "R12"), "neg")) %>%
+  ggplot(aes(col = factor(temp))) +
+  facet_wrap(~effect + temp) +
+  geom_segment(aes(x = 0, y = i_freq, xend = hrs, yend = count / depth, linetype = exp )) +
+  ggtitle(label = "ST00046") +
+  theme_classic() 
+
+
+
+
+dat %>%
+  filter(strain == "ST00046") %>%
+  filter(added == 1) %>%
+  filter(community == "R5") %>%
+  # filter(community %in% c("R1", "R2", "R3", "R4", "R5", "R12")) %>%
+  mutate(effect = as.character(community)) %>%
+  mutate(effect = replace(effect, effect %in% c("R1", "R2", "R3", "R4"), "pos")) %>%
+  mutate(effect = replace(effect, effect %in% c("R5", "R12"), "neg")) %>%
+  ggplot(aes(col = factor(temp))) +
+  facet_wrap(~temp) +
+  geom_segment(aes(x = 0, y = i_freq, xend = hrs, yend = count / depth, linetype = exp )) +
+  ggtitle(label = "ST00046") +
+  theme_classic() 
+
 #' Get the effect of temp
-b_temp <- ranef(m1, condVar = TRUE, whichel = "b_temp", postVar = TRUE)
+b_temp <- ranef(m5.t1, condVar = TRUE, whichel = "b_temp", postVar = TRUE)
 res <- tibble(effect = row.names(b_temp$b_temp),
               est = b_temp$b_temp[,1],
               postVar = attr(b_temp$b_temp, "postVar")[,,]) %>%
@@ -58,11 +87,46 @@ res %>%
 
 #' Example of ST00046 showing that it decreases more at 28 than 32 (not significant)
 dat %>%
-  filter(strain == "ST00046") %>%
+  filter(strain == "ST00110") %>%
+  filter(added == 1) %>%
   filter(hrs %in% c(24)) %>%
   ggplot(aes(col = factor(temp))) +
-  facet_wrap(~community) +
-  geom_segment(aes(x = 0, y = i_freq, xend = temp, yend = count / depth, linetype = exp )) +
+  facet_wrap(~temp) +
+  geom_segment(aes(x = 0, y = i_freq, xend = hrs, yend = count / depth, linetype = exp )) +
+  ggtitle(label = "ST00110") +
   theme_classic() 
 
 
+
+b_temp_com <- ranef(m5.t1, condVar = TRUE, whichel = "b_temp_com", postVar = TRUE)
+res <- tibble(effect = row.names(b_temp_com$b_temp_com),
+              est = b_temp_com$b_temp_com[,1],
+              postVar = attr(b_temp_com$b_temp_com, "postVar")[,,]) %>%
+  mutate(lower = qnorm(p = 0.025, mean = est, sd = sqrt(postVar)),
+         upper = qnorm(p = 0.975, mean = est, sd = sqrt(postVar)),
+         pval = 2 * pnorm(q = -abs(est), mean = 0, sd = sqrt(postVar))) %>%
+  mutate(qval = p.adjust(pval)) %>%
+  arrange(est) %>%
+  mutate(effect = factor(effect, levels = effect))
+res %>%
+  print(n = 1000)
+
+
+res %>%
+  ggplot(aes(x = est, y = effect)) +
+  geom_errorbarh(aes(xmin = lower, xmax = upper)) +
+  geom_point() +
+  geom_vline(xintercept = 0) +
+  theme_classic()
+
+
+
+
+dat %>%
+  filter(strain == "ST00143") %>%
+  filter(added == 1) %>%
+  filter(community == "R10") %>%
+  ggplot(aes(col = factor(temp))) +
+  geom_segment(aes(x = 0, y = i_freq, xend = hrs, yend = count / depth, linetype = exp )) +
+  ggtitle(label = "ST00143") +
+  theme_classic()