release 1.0.7

ftwkoopmans · Apr 28, 2024 · ec32942 · ec32942
1 parent 166239e
commit ec32942
Show file tree

Hide file tree

Showing 46 changed files with 400 additions and 193 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -10,3 +10,4 @@
 ^README.Rmd$
 ^README.md$
 ^docker$
+^manuscript$
diff --git a/.gitignore b/.gitignore
@@ -16,4 +16,5 @@ Thumbs.db
 /docker/docker_compose/**
 /docker/README
 /docker/Dockerfile.dev
+/docker/_test_versions/**
 /manuscript/**
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -3,7 +3,7 @@ Package: msdap
 Title: Mass Spectrometry Downstream Analysis Pipeline
 Description: Analyze label-free proteomics datasets from various sources (MaxQuant, Spectronaut, etc) using a pipeline that facilitates peptide filtering and many algorithms for normalization and statistical analysis. A comprehensive PDF report can be generated that includes many data visualizations and documentation thereof.
 URL: https://github.com/ftwkoopmans/msdap
-Version: 1.0.6
+Version: 1.0.7
 Authors@R: 
     person(given = "Frank",
            family = "Koopmans",
@@ -67,8 +67,9 @@ Imports:
     missForest (>= 1.4),
     BiocParallel (>= 1.20),
     variancePartition (>= 1.16),
-    testthat (>= 2.1),
     archive(>= 1.0.1)
+Suggests:
+    testthat (>= 2.1)
 Remotes: 
     bioc::release/Biobase,
     bioc::release/MSnbase,
@@ -84,5 +85,5 @@ BiocViews:
 Encoding: UTF-8
 LazyData: true
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.2.0
+RoxygenNote: 7.3.1
 VignetteBuilder: knitr
diff --git a/R/dea.R b/R/dea.R
@@ -79,6 +79,19 @@ get_column_intensity = function(peptides, contr_lbl = NA) {
 
 
 
+#' return sample groups for a given contrast (column name in dataset$samples)
+#'
+#' @param dataset your dataset
+#' @param contr_lbl a string describing a contrast = column in dataset$samples, e.g. "contrast: WT vs KO"
+#' @returns list of unique sample groups, first element is left-hand side of the contrast and second element the right-hand side
+contrast_to_samplegroups = function(dataset, contr_lbl) {
+  stopifnot(contr_lbl %in% colnames(dataset$samples))
+  x = dataset$samples %>% select(group, contrast = !!contr_lbl)
+  return(list(unique(x$group[x$contrast == 1]), unique(x$group[x$contrast == 2])))
+}
+
+
+
 #' Differential expression analysis
 #'
 #' @param dataset your dataset

diff --git a/R/export_data_tables.R b/R/export_data_tables.R
@@ -107,7 +107,7 @@ export_protein_abundance_matrix = function(dataset, rollup_algorithm, output_dir
     # check if output file is writable
     remove_file_if_exists(fname)
     # write to TSV file. note that we enforce a dot as decimal character (so ignore user's locale)
-    write.table(tib, file = fname, quote = F, sep="\t", na="", row.names = F, col.names = T, dec = ".")
+    utils::write.table(tib, file = fname, quote = F, sep="\t", na="", row.names = F, col.names = T, dec = ".")
   }
 }
 
@@ -150,7 +150,7 @@ export_peptide_abundance_matrix = function(dataset, output_dir) {
     # check if output file is writable
     remove_file_if_exists(fname)
     # write to TSV file. note that we enforce a dot as decimal character (so ignore user's locale)
-    write.table(tibw, file = fname, quote = F, sep="\t", na="", row.names = F, col.names = T, dec = ".")
+    utils::write.table(tibw, file = fname, quote = F, sep="\t", na="", row.names = F, col.names = T, dec = ".")
   }
 
 }
diff --git a/R/filter_peptides.R b/R/filter_peptides.R
@@ -222,6 +222,7 @@ filter_dataset = function(dataset,
                           rollup_algorithm = "maxlfq",
                           # which filters to apply
                           by_group = T, all_group = T, by_contrast = F) {
+
   start_time = Sys.time()
 
   ##### input validation
@@ -278,8 +279,8 @@ filter_dataset = function(dataset,
   data.table::setkey(dataset$dt_pep_group, key_group) # set key for data.table::merge()
   dt_criteria = data.table::data.table(dataset$groups %>% select(key_group, ndetect_min, nquant_min, ndetect_noexclude_min, nquant_noexclude_min) %>% mutate_all(as.integer), key="key_group")
   dt_filter_group = data.table::merge.data.table(dataset$dt_pep_group, dt_criteria, all.x = T, sort = FALSE)
-  dt_filter_group[ , `:=`(pass = ndetect>=ndetect_min & nquant>=nquant_min,
-                          pass_noexclude = ndetect_noexclude>=ndetect_noexclude_min & nquant_noexclude>=nquant_noexclude_min)]
+  dt_filter_group[ , `:=`(pass = ndetect >= ndetect_min & nquant >= nquant_min,
+                          pass_noexclude = ndetect_noexclude >= ndetect_noexclude_min & nquant_noexclude >= nquant_noexclude_min)]
 
   ## now we know for each peptide whether it passes the filter criteria in each group (with and without taking 'exclude' samples into account)
 

diff --git a/R/gene_idmapping.R b/R/gene_idmapping.R
@@ -2,6 +2,11 @@
 #' Parse HGNC gene identifier lookup table that was downloaded from genenames.org
 #'
 #' download link: https://www.genenames.org/download/statistics-and-files/
+#' table: "Complete dataset download links" -->> "Complete HGNC approved dataset text json" -->> download the "TXT" table
+#' filename is typically something like hgnc_complete_set.txt
+#' URL as of September 2023; https://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/tsv/hgnc_complete_set.txt
+#'
+#' # alternatively;
 #' table: "Total Approved Symbols" -->> "TXT" / "text file in TSV format"
 #' filename is typically something like non_alt_loci_set.txt
 #'
@@ -88,7 +93,7 @@ hgnc_lookuptable = function(f) {
   }
   if("entrez_id" %in% colnames(hgnc)) {
     hgnc$entrez_id = gsub("\\D.*", "", as.character(hgnc$entrez_id)) # remove multiple mappings
-    hgnc$entrez_id[is.na(hgnc$ensembl_id) | !grepl("^\\d+$", hgnc$entrez_id)] = NA # invalid IDs to NA
+    hgnc$entrez_id[is.na(hgnc$entrez_id) | !grepl("^\\d+$", hgnc$entrez_id)] = NA # invalid IDs to NA
     result = bind_rows(
       result,
       hgnc %>% filter(!is.na(entrez_id)) %>% mutate(type = "entrez_id") %>% select(hgnc_id, type, value = entrez_id)
@@ -131,7 +136,19 @@ hgnc_lookuptable = function(f) {
     return(hgnc %>% select(hgnc_id, hgnc_symbol))
   }
 
+  # add hgnc_id that do not have any synonyms or mappings to mgi/rgd/entrez/ensembl
+  id_missing = setdiff(hgnc$hgnc_id, result$hgnc_id)
+  if(length(id_missing) > 0) {
+    id_missing__symbol = hgnc$hgnc_symbol[match(id_missing, hgnc$hgnc_id)]
+    result = bind_rows(
+      result,
+      tibble::tibble(hgnc_id = id_missing, type = "onlysymbol", value = id_missing__symbol)
+    )
+  }
+
+  # add HGNC symbols
   result$hgnc_symbol = hgnc$hgnc_symbol[match(result$hgnc_id, hgnc$hgnc_id)]
+  # return results with re-ordered column names
   result %>% select(hgnc_id, hgnc_symbol, type, value)
 }
 

diff --git a/R/msdap-package.R b/R/msdap-package.R
@@ -0,0 +1,16 @@
+#' MS-DAP: Mass Spectrometry Downstream Analysis Pipeline for label-free proteomics data.
+#'
+#' http://github.com/ftwkoopmans/msdap
+#'
+#' @keywords internal
+#' @import ggplot2
+#' @importFrom grDevices colorRampPalette dev.off graphics.off pdf rainbow
+#' @importFrom graphics abline boxplot legend lines mtext par plot points text
+#' @importFrom stats .lm.fit contrasts cov density ecdf mad median na.exclude na.omit p.adjust p.adjust.methods pt quantile resid residuals rt sd sigma weights loess optim predict
+#' @importFrom utils capture.output combn data head relist tail help packageVersion
+#' @importFrom rlang .data :=
+"_PACKAGE"
+
+## usethis namespace: start
+## usethis namespace: end
+NULL
diff --git a/R/normalize_vwmb.R b/R/normalize_vwmb.R
@@ -19,6 +19,22 @@
 #'
 #' note; if you want to treat replicate samples that are flagged as 'exclude' upstream differently while still including them in the data matrix, you could set parameter `groups=paste(samples$group, samples$exclude)` to put them in separate groups.
 #'
+#' @examples \dontrun{
+#' # Define a custom normalization function that we'll use in the MS-DAP pipeline later.
+#' # All it does is apply the first part of the VWMB algorithm to reduce varation between replicates.
+#' norm_within_only = function(x_as_log2, group_by_cols, ...) {
+#'   normalize_vwmb(x_as_log2, groups = group_by_cols,
+#'                  metric_within="var", metric_between = "")
+#' }
+#'
+#' # now we use the main MS-DAP function as typical, but note that we here set the
+#' # normalization algorithm to the function we defined in the lines above.
+#' dataset = analysis_quickstart(
+#'   dataset,
+#'   norm_algorithm = "norm_within_only",
+#'   # <other parameters here>
+#' )
+#' }
 #' @param x numerical data matrix to normalize, should be log transformed
 #' @param groups array describing the grouping of the columns in x (sample groups). Or alternatively set to NA to indicate there are no groups
 #' @param metric_within how should replicate samples within a group be normalized? valid arguments: "var" reduce overall variation (default). "mode" reduce overall foldchange mode. pass empty string to disable
@@ -33,7 +49,7 @@ normalize_vwmb = function(x, groups=NA, metric_within="var", metric_between="mod
   stopifnot(is.matrix(x) && typeof(x) == "double" && ncol(x) > 1 && nrow(x) > 1) # valid numeric matrix
   stopifnot((length(groups)==1 && is.na(groups)) || (length(groups) == ncol(x) && all(!is.na(groups)))) # groups should be NA or an array describing all columns of x
   stopifnot(length(metric_within)==1 && metric_within %in% c("mode","var", "")) # how to normalize within groups, with "" to disable
-  stopifnot(length(metric_between)==1 && metric_between %in% c("mode","var")) # how to normalize between groups; cannot disable between-group normalization. if you don't want to treat groups differently, disable groups by setting groups=NA
+  stopifnot(length(metric_between)==1 && metric_between %in% c("mode","var", "")) # how to normalize between groups
 
   ## restrict group names: assume there are no groups if groups=NA  +  enforces character type for groups
   if(length(groups) == 1) {
@@ -47,7 +63,7 @@ normalize_vwmb = function(x, groups=NA, metric_within="var", metric_between="mod
   ## restrict input matrix
   # remove infinite values (eg; some upstream log(0)), then compute summary stats for input data
   x[!is.finite(x)] = NA
-  x_median = median(x, na.rm = T)
+  x_median = stats::median(x, na.rm = T)
   if(include_attributes) {
     x_colmedians = matrixStats::colMedians(x, na.rm = T)
   }
@@ -103,7 +119,7 @@ normalize_vwmb = function(x, groups=NA, metric_within="var", metric_between="mod
 
 
   #### normalize between groups
-  if(group_count > 1) {
+  if(group_count > 1 && metric_between != "") {
     # analogous to above
     # metric_between is either "mode" or "var"
     if(metric_between == "var") {
@@ -119,7 +135,7 @@ normalize_vwmb = function(x, groups=NA, metric_within="var", metric_between="mod
 
 
   #### finally, scale entire result matrix such that the median intensity is the same as input (deals with the rare case where abundances in one group are much higher, and this normalization consequentially shifts overall abundance levels @ between-group scaling)
-  x = x + (x_median - median(x, na.rm = T))
+  x = x + (x_median - stats::median(x, na.rm = T))
   # reset colnames
   colnames(x) = x_colnames
   # threshold very low values
@@ -215,14 +231,15 @@ pairwise_modes = function(x, min_overlap = 10) {
 
 
 
-#' find the mode in a numeric array using the default density function from base R
+#' find the mode in a numeric array using the density function from base R
 #' @param x numeric array
 #' @return mode of x
 get_mode = function(x) {
   if(length(x) == 0) return(NA)
   if(length(x) == 1) return(x)
 
-  density_estimate = density(x, na.rm=T)
+  density_estimate = density(x, na.rm = T) # default density kernel (left as-is for consistency with prior MS-DAP versions)
+  # density_estimate = density(x, bw = "sj", adjust = 0.9, na.rm = T) # alternative bandwidth; we need to benchmark to validate this is an improvement for the vast majority of datasets (prior to updating previous default)
   # which.max() returns first 'max' value (thus always 1 return value). eg; which.max(c(1,1,2,2,1))
   return(density_estimate$x[which.max(density_estimate$y)])
 }
@@ -280,7 +297,7 @@ norm_scales_var = function(x) {
   mle_norm = function(...) {
     s = unlist(...)
     # use the median instead of the mean, as the latter is too sensitive to outliers (eg; upstream data has some errors so few features have an extreme value -->> dominates the scaling of all features)
-    return( median(matrixStats::rowSds(x_scaled + rep(s, rep.int(nrow(x_scaled), ncol(x_scaled))), na.rm=T), na.rm=T) )
+    return( stats::median(matrixStats::rowSds(x_scaled + rep(s, rep.int(nrow(x_scaled), ncol(x_scaled))), na.rm=T), na.rm=T) )
     # optimizations; column-wise matrix operations; https://stackoverflow.com/a/32364355   https://stats.stackexchange.com/a/51750
     # optimizations; median @ https://stackoverflow.com/questions/34771088/why-is-standard-r-median-function-so-much-slower-than-a-simple-c-alternative
   }

diff --git a/R/parse_fragpipe_ionquant.R b/R/parse_fragpipe_ionquant.R
@@ -151,6 +151,7 @@ import_dataset_fragpipe_ionquant = function(path, acquisition_mode, confidence_t
 #' @param file_ion full path to a ion.tsv file
 #' @param file_psm full path to a psm.tsv file, this must match the `file_ion` argument (e.g. files in the same dir)
 parse_fragpipe_psm_ion_pair = function(file_ion, file_psm) {
+  spectrum = NULL
   # basically this reads the CSV/TSV table from file and maps column names to expected names.
   # (complicated) downstream code handles compressed files, efficient parsing of only the requested columns, etc.
   append_log(paste("reading FragPipe file;", file_ion), type = "info")

diff --git a/R/plot_differential_detect.R b/R/plot_differential_detect.R
@@ -2,9 +2,10 @@
 #' Plot differential detection results as a histogram
 #'
 #' @param dataset dataset object
+#' @param zscore_threshold cutoff used in the plot (for absolute values)
 #' @returns list of ggplot objects with 1 plot per contrast. If no differential detection data is available, returns an empty list
 #' @export
-plot_differential_detect = function(dataset) {
+plot_differential_detect = function(dataset, zscore_threshold = 4) {
   result = list()
   if(!"dd_proteins" %in% names(dataset) || nrow(dataset$dd_proteins) == 0) {
     return(result)
@@ -19,8 +20,8 @@ plot_differential_detect = function(dataset) {
     tib_contr = tib_contr %>% filter(type %in% c("detect", "quant"))
     tib_contr_detect = tib_contr %>% filter(type == "detect")
     tib_contr_quant = tib_contr %>% filter(type == "quant")
-    lbl_detect = sprintf("only detected peptides; #proteins tested: %d  #abs(zscore) >= 4: %d", nrow(tib_contr_detect), sum(abs(tib_contr_detect$zscore) >= 4))
-    lbl_quant = sprintf("all quantified peptides; #proteins tested: %d  #abs(zscore) >= 4: %d", nrow(tib_contr_quant), sum(abs(tib_contr_quant$zscore) >= 4))
+    lbl_detect = sprintf("only detected peptides; #proteins tested: %d  #abs(zscore) >= %s: %d", nrow(tib_contr_detect), as.character(zscore_threshold), sum(abs(tib_contr_detect$zscore) >= zscore_threshold))
+    lbl_quant = sprintf("all quantified peptides; #proteins tested: %d  #abs(zscore) >= %s: %d", nrow(tib_contr_quant), as.character(zscore_threshold), sum(abs(tib_contr_quant$zscore) >= zscore_threshold))
 
     tib_contr = tib_contr %>%
       arrange(type) %>%
@@ -29,8 +30,8 @@ plot_differential_detect = function(dataset) {
 
     p_hist = ggplot(tib_contr, aes(zscore)) +
       geom_histogram(bins=25, boundary = 0, colour = "black", fill="lightgrey", na.rm=T) +
-      geom_vline(xintercept = c(-4, 4), colour = "red") +
-      facet_wrap(.~type_label, ncol = 1) +
+      geom_vline(xintercept = c(-zscore_threshold, zscore_threshold), colour = "red") +
+      facet_wrap(.~type_label, ncol = 1, scales = "free") +
       labs(x="Differential z-score for observed peptides", y="Protein count", colour = "", title = contr) +
       theme_bw() +
       theme(plot.title = element_text(hjust = 0.5, size=9),