release 1.1.3

ftwkoopmans · Aug 26, 2024 · de80abb · de80abb
1 parent 27aad7d
commit de80abb
Show file tree

Hide file tree

Showing 17 changed files with 171 additions and 54 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -3,7 +3,7 @@ Package: msdap
 Title: Mass Spectrometry Downstream Analysis Pipeline
 Description: Analyze label-free proteomics datasets from various sources (MaxQuant, Spectronaut, etc) using a pipeline that facilitates peptide filtering and many algorithms for normalization and statistical analysis. A comprehensive PDF report can be generated that includes many data visualizations and documentation thereof.
 URL: https://github.com/ftwkoopmans/msdap
-Version: 1.1.2
+Version: 1.1.3
 Authors@R: 
     person(given = "Frank",
            family = "Koopmans",
@@ -67,7 +67,8 @@ Imports:
     missForest (>= 1.4),
     BiocParallel (>= 1.20),
     variancePartition (>= 1.16),
-    archive(>= 1.0.1)
+    archive(>= 1.0.1),
+    arrow(>= 13.0.0)
 Suggests:
     testthat (>= 2.1)
 Remotes: 

diff --git a/NAMESPACE b/NAMESPACE
@@ -3,6 +3,7 @@
 export(analysis_quickstart)
 export(append_log)
 export(as_matrix_except_first_column)
+export(assert_lme4_functional)
 export(cache_filtering_data)
 export(check_dataset_hascache)
 export(check_dataset_integrity)

diff --git a/R/gene_idmapping.R b/R/gene_idmapping.R
@@ -11,11 +11,13 @@
 #' filename is typically something like non_alt_loci_set.txt
 #'
 #' @param f full path to the downloaded table (expected to be tsv format)
+#' @param uppercase_symbols convert all gene symbols to upper case? default: TRUE
 #' @returns a long-format table with columns; hgnc_id, hgnc_symbol, type, value
 #' @export
-hgnc_lookuptable = function(f) {
+hgnc_lookuptable = function(f, uppercase_symbols = TRUE) {
   result = NULL
   check_parameter_is_string(f)
+  check_parameter_is_boolean(uppercase_symbols)
 
   # parse HGNC table from disk
   f = path_exists(f, NULL, try_compressed = TRUE)
@@ -39,7 +41,9 @@ hgnc_lookuptable = function(f) {
   )
   hgnc = robust_header_matching(hgnc, column_spec = cols, columns_required = c("hgnc_id", "hgnc_symbol"))
 
-  hgnc$hgnc_symbol = toupper(hgnc$hgnc_symbol)
+  if(uppercase_symbols) {
+    hgnc$hgnc_symbol = toupper(hgnc$hgnc_symbol)
+  }
 
   # remove invalid rows (this shouldn't occur but check anyway)
   hgnc = hgnc %>% filter(!is.na(hgnc_id) & hgnc_id != "" & !is.na(hgnc_symbol) & nchar(hgnc_symbol) >= 2)

diff --git a/R/msqrobsum__bugfix.R b/R/msqrobsum__bugfix.R
@@ -85,6 +85,13 @@ msqrobsum <- function(
   # df_failed <- filter(df, is.na(df))### FRANK
   # df <- filter(df, !is.na(df))### FRANK
   if(!nrow(df)) {warning("No models could be fitted"); return(df_prot_failed)}
+
+  ### FRANK: identify issues with lme4::lmer() when all proteins/models failed to help users diagnose R installation issues. Related to MS-DAP GitHub issue #36
+  if(!"sigma" %in% colnames(df)) { # if all fit_fun() calls throw errors, the 'mm' column is NULL/empty and unnest() does not yield a sigma column
+    assert_lme4_functional() # util function, new in MS-DAP 1.1.3
+    stop("msqrob model estimation failed for all proteins") # halt even if we didn't find lme4::lmer() issues because below code will fail when 'sigma' column is missing
+  }
+
   ## Squeeze variance
   df <- mutate(df, sigma_post = sigma, df_prior = 0, sigma_prior = 0)
   if(squeeze_variance) {
@@ -160,7 +167,7 @@ do_mm <- function(d, formulas, contrasts, mode ='msqrobsum', robust_lmer_iter =
     out$data_summarized <- list(d)
   }
   if (mode == 'sum') return(new_tibble(out ,nrow = 1L))
-  for (form in c(formulas)){
+  for (form in c(formulas)) {
     ## TODO do lm fit if there are no random effects (all output inside loop, do while loop)
     model <- try(do_lmerfit(d, form,  robust_lmer_iter,lmer_args), silent = TRUE)
     if (is(model,"lmerMod")) break #else message(paste0('Cannot fit ', format(form)))

diff --git a/R/parse_longformat_generic.R b/R/parse_longformat_generic.R
@@ -78,7 +78,7 @@ import_dataset_skyline = function(filename, acquisition_mode, confidence_thresho
 
 #' Import a label-free proteomics dataset from DIA-NN
 #'
-#' @param filename the full file path of the input file
+#' @param filename the full file path of the DIA-NN report; this can be either the report.tsv file or the report.parquet file (new since DIA-NN 1.9.1)
 #' @param confidence_threshold confidence score threshold at which a peptide is considered 'identified', default: 0.01 (target value must be lesser than or equals)
 #' @param do_plot logical indicating whether to create QC plots that are shown in the downstream PDF report (if enabled)
 #' @param use_irt logical indicating whether to use standardized (IRT) or the empirical (RT) retention times
@@ -88,22 +88,37 @@ import_dataset_diann = function(filename, confidence_threshold = 0.01, use_norma
   reset_log()
   append_log("reading DIA-NN report...", type = "info")
 
-  rt_col = "iRT"
-  if(!use_irt) rt_col = "RT"
+  rt_col = ifelse(use_irt, "iRT", "RT")
 
-  ds = import_dataset_in_long_format(filename,
-                                       attributes_required = list(sample_id = "File.Name",
-                                                                  protein_id = "Protein.Group",
-                                                                  sequence_modified = "Modified.Sequence",
-                                                                  sequence_plain = "Stripped.Sequence",
-                                                                  charge = "Precursor.Charge",
-                                                                  rt = rt_col,
-                                                                  # isdecoy = "",
-                                                                  intensity = ifelse(use_normalized_intensities, "Precursor.Normalised", "Precursor.Quantity"),
-                                                                  confidence = "Q.Value",
-                                                                  confidence_score = "Evidence"),
-                                       # attributes_optional = list(confidence_score = "Evidence"),
-                                       confidence_threshold = confidence_threshold,
+  # if the filename ends with ".parquet", use the Apache arrow library to read the DIA-NN report in parquet format
+  f = df = NULL
+  if(grepl("\\.parquet$", filename)) {
+    append_log(paste("input file:", filename), type = "info")
+    df = arrow::read_parquet(
+      filename,
+      col_select = NULL,
+      as_data_frame = TRUE,
+      props = arrow::ParquetArrowReaderProperties$create(),
+      mmap = TRUE
+    )
+  } else { # not a parquet file, assume input file is a .tsv file
+    f = filename
+  }
+
+  ds = import_dataset_in_long_format(filename = f,
+                                     x = df,
+                                     attributes_required = list(sample_id = c("File.Name", "Run"),
+                                                                protein_id = "Protein.Group",
+                                                                sequence_modified = "Modified.Sequence",
+                                                                sequence_plain = "Stripped.Sequence",
+                                                                charge = "Precursor.Charge",
+                                                                rt = rt_col,
+                                                                # isdecoy = "",
+                                                                intensity = ifelse(use_normalized_intensities, "Precursor.Normalised", "Precursor.Quantity"),
+                                                                confidence = "Q.Value",
+                                                                confidence_score = "Evidence"),
+                                     # attributes_optional = list(confidence_score = "Evidence"),
+                                     confidence_threshold = confidence_threshold,
                                      return_decoys = F,
                                      do_plot = do_plot)
 
@@ -325,6 +340,10 @@ import_dataset_in_long_format = function(filename = NULL, x = NULL, attributes_r
   # this code is also checking for required data, so run even if data table is already in memory
   map_required = map_headers(headers, attributes_required, error_on_missing = T, allow_multiple = T)
   map_optional = map_headers(headers, attributes_optional, error_on_missing = F, allow_multiple = T)
+  # collect all column indices and their desired names
+  col_indices = c(map_required, map_optional)
+  col_indices_dupe = duplicated(as.integer(col_indices))
+  col_indices_unique = col_indices[!col_indices_dupe] # don't use unique(), it'll drop the names
 
   if (is_file) {
     ### try to detect data tables with a comma as decimal separation character
@@ -342,36 +361,36 @@ import_dataset_in_long_format = function(filename = NULL, x = NULL, attributes_r
 
 
     ### read table
-    # collect all column indices and their desired names
-    col_indices = c(map_required, map_optional)
-    col_indices_dupe = duplicated(as.integer(col_indices))
-    col_indices_unique = col_indices[!col_indices_dupe] # don't use unique(), it'll drop the names
-
     # only read columns of interest to speed up file parsing
     # don't parse first row as header and then skip it to avoid issues with tables that have mismatched headers (e.g. MetaMorpheus files that have extra trailing tab on header row)
     DT = read_textfile_compressed(filename, skip_empty_rows = F, as_table = T, select = as.integer(col_indices_unique), header = F, skip = 1, stringsAsFactors = F, dec = dec_char, data.table = T)
     if(is.null(DT)) {
       append_log("failed to read data table from file", type = "error")
     }
     colnames(DT) = names(col_indices_unique) # overwrite column names from file with the desired names from column specification
+  } else {
+    x = x[,as.integer(col_indices_unique)]
+    colnames(x) = names(col_indices_unique) # overwrite column names from file with the desired names from column specification
 
-    # suppose the same column from input table is assigned to multiple attributes (eg; Spectronaut FG.Id as input for both charge and modseq)
-    # j = index in 'col_indices' where we should borrow content from other column
-    for(j in which(col_indices_dupe)) {
-      j_source_column = names(col_indices_unique)[as.integer(col_indices_unique) == as.integer(col_indices[j])]
-      DT[,names(col_indices)[j]] = DT[[j_source_column]] # use column names !
-    }
+    DT = data.table::as.data.table(x) # cast input table to data.table type
+  }
 
 
-    ### inject function for custom manipulation of DT. example; for some input format, repair a column. for FragPipe, the modified peptide column is empty for peptides without a modification, so there is no column with proper modseq available. have to 'manually repair', but other than that we want to maintain this generic function
-    if(length(custom_func_manipulate_DT_onload) == 1 && is.function(custom_func_manipulate_DT_onload)) {
-      DT = custom_func_manipulate_DT_onload(DT)
-    }
-  } else {
-    DT = data.table::as.data.table(x) # cast input table to data.table type
+  ### suppose the same column from input table is assigned to multiple attributes (eg; Spectronaut FG.Id as input for both charge and modseq)
+  # j = index in 'col_indices' where we should borrow content from other column
+  for(j in which(col_indices_dupe)) {
+    j_source_column = names(col_indices_unique)[as.integer(col_indices_unique) == as.integer(col_indices[j])]
+    DT[,names(col_indices)[j]] = DT[[j_source_column]] # use column names !
   }
 
 
+  ### inject function for custom manipulation of DT. example; for some input format, repair a column. for FragPipe, the modified peptide column is empty for peptides without a modification, so there is no column with proper modseq available. have to 'manually repair', but other than that we want to maintain this generic function
+  if(length(custom_func_manipulate_DT_onload) == 1 && is.function(custom_func_manipulate_DT_onload)) {
+    DT = custom_func_manipulate_DT_onload(DT)
+  }
+
+
+
   ### decoys
   if("isdecoy" %in% colnames(DT)) {
     DT[ , isdecoy := isdecoy %in% c(TRUE, 1, "decoy", "true", "True", "TRUE"), by=isdecoy]

diff --git a/R/process_peptide_data.R b/R/process_peptide_data.R
@@ -92,6 +92,7 @@ remove_proteins_by_name = function(dataset, remove_irt_peptides = FALSE, regular
 #' @param files an array of filenames, these should be the full path
 #' @param fasta_id_type what type of fasta files are these? options: "uniprot" (highly recommended) or otherwise any other character string (as we have no special rules for generic fasta files)
 #' @param protein_separation_character the separation character for protein identifiers in your dataset. Most commonly this is a semicolon (eg; in maxquant/metamorpheus/skyline/etc.)
+#' @param uppercase_symbols convert all gene symbols to upper case? default: TRUE
 #' @return table where
 #' protein_id = provided proteingroup identifier,
 #' accessions = result from fasta_id_short applied to each semicolon-delimited element in protein_id (result is a semicolon-collapsed string),
@@ -100,7 +101,7 @@ remove_proteins_by_name = function(dataset, remove_irt_peptides = FALSE, regular
 #' gene_symbols_or_id = unique set of valid 'gene_symbol', or the FASTA full/long ID when there is no gene information
 #' gene_symbol_ucount = number of unique gene_symbols for this proteingroup (i.e. unique valid elements in 'gene_symbols')
 #' @export
-import_fasta = function(dataset, files = NULL, fasta_id_type = "uniprot", protein_separation_character = ";") {
+import_fasta = function(dataset, files = NULL, fasta_id_type = "uniprot", protein_separation_character = ";", uppercase_symbols = TRUE) {
   if(length(files) == 0) {
     append_log("no fasta files were provided, downstream analysis/code will use protein IDs as surrogate for fasta headers", type = "info")
     dataset$proteins = empty_protein_tibble(peptides)
@@ -109,7 +110,8 @@ import_fasta = function(dataset, files = NULL, fasta_id_type = "uniprot", protei
       protein_id = dataset$peptides %>% filter({if("isdecoy" %in% names(.)) isdecoy else F} != TRUE) %>% distinct(protein_id) %>% pull(),
       fasta_files = files,
       fasta_id_type = fasta_id_type,
-      protein_separation_character = protein_separation_character
+      protein_separation_character = protein_separation_character,
+      uppercase_symbols = uppercase_symbols
     )
   }
 
@@ -124,18 +126,37 @@ import_fasta = function(dataset, files = NULL, fasta_id_type = "uniprot", protei
 #' @param fasta_files an array of filenames, these should be the full path
 #' @param fasta_id_type what type of fasta files are these? options: "uniprot" (highly recommended) or otherwise any other character string (as we have no special rules for generic fasta files)
 #' @param protein_separation_character the separation character for protein identifiers in your dataset. Most commonly this is a semicolon (eg; in maxquant/metamorpheus/skyline/etc.)
+#' @param uppercase_symbols convert all gene symbols to upper case? default: TRUE
 #' @return table where
 #' protein_id = provided proteingroup identifier,
 #' accessions = result from fasta_id_short applied to each semicolon-delimited element in protein_id (result is a semicolon-collapsed string),
 #' fasta_headers = analogous to accessions, but matching the full FASTA header to each 'accessions' element,
 #' gene_symbols = full set of gene symbols, '-' where missing in FASTA, matching each element in 'accessions',
 #' gene_symbols_or_id = unique set of valid 'gene_symbol', or the FASTA full/long ID when there is no gene information
 #' gene_symbol_ucount = number of unique gene_symbols for this proteingroup (i.e. unique valid elements in 'gene_symbols')
-import_protein_metadata_from_fasta = function(protein_id, fasta_files, fasta_id_type = "uniprot", protein_separation_character = ";") {
+import_protein_metadata_from_fasta = function(protein_id, fasta_files, fasta_id_type = "uniprot", protein_separation_character = ";", uppercase_symbols = TRUE) {
+  if(length(protein_id) == 0 || !all(!is.na(protein_id) & is.character(protein_id) & nchar(protein_id) > 0)) {
+    append_log('protein_id parameter must be an array of non-empty strings (no NA values, no "")', type = "error")
+  }
+  # valid paths for `fasta_files` is checked downstream and includes guessing compressed files / filename variations
+  if(length(fasta_files) == 0 || !all(!is.na(fasta_files) & is.character(fasta_files) & nchar(fasta_files) > 0)) {
+    append_log('fasta_files parameter must be an array of non-empty strings (no NA values, no "")', type = "error")
+  }
+  if(length(fasta_id_type) != 1 || is.na(fasta_id_type) || !is.character(fasta_id_type) || nchar(fasta_id_type) == 0) {
+    append_log('fasta_id_type parameter must be a non-empty string (no NA values, no "")', type = "error")
+  }
+  if(length(protein_separation_character) != 1 || is.na(protein_separation_character) || !is.character(protein_separation_character) || nchar(protein_separation_character) == 0) {
+    append_log('protein_separation_character parameter must be a non-empty string (no NA values, no "")', type = "error")
+  }
+  if(!uppercase_symbols %in% c(TRUE, FALSE)) {
+    append_log('uppercase_symbols parameter must be a boolean value (TRUE or FALSE)', type = "error")
+  }
+
+
   protein_id = unique(protein_id)
 
   #### read data from fasta files into tibble
-  fasta = fasta_parse_file(fasta_files, fasta_id_type) %>%
+  fasta = fasta_parse_file(fasta_files, fasta_id_type, uppercase_symbols) %>%
     # remove decoys from fasta file then drop column
     filter(isdecoy == FALSE) %>%
     select(-isdecoy)

diff --git a/R/util_fasta.R b/R/util_fasta.R
@@ -50,7 +50,7 @@ fasta_header_to_gene = function(x, fasta_id_type = "uniprot") {
   # if multiple GN=XXX tags are present (should never occur for uniprot), the last element is taken
   res = sub(".* GN=([^ ]{2,}).*", "\\1", x) # require at least 2 characters
   res[res == x] = ""
-  toupper(res)
+  return(res)
 }
 
 
@@ -59,7 +59,8 @@ fasta_header_to_gene = function(x, fasta_id_type = "uniprot") {
 #
 #' @param fasta_files array of fill paths to fasta files
 #' @param fasta_id_type fasta type, typically this is 'uniprot'
-fasta_parse_file = function(fasta_files, fasta_id_type = "uniprot") {
+#' @param uppercase_symbols convert all gene symbols to upper case? default: TRUE
+fasta_parse_file = function(fasta_files, fasta_id_type = "uniprot", uppercase_symbols = TRUE) {
   # first, validate input files
   ff = NULL
   for (f in fasta_files) {
@@ -75,7 +76,7 @@ fasta_parse_file = function(fasta_files, fasta_id_type = "uniprot") {
     fasta = c(fasta, l[char1 == ">"]) # only fasta header lines
   }
 
-  tibble(
+  result = tibble(
     idlong = fasta_header_to_id(fasta),
     gene = fasta_header_to_gene(fasta, fasta_id_type),
     header = fasta
@@ -85,6 +86,11 @@ fasta_parse_file = function(fasta_files, fasta_id_type = "uniprot") {
     # use the idlong because reverse/decoy tags might be prepended. e.g. "rev_tr|U3KQF4|U3KQF4_HUMAN"
     distinct(idlong, .keep_all = T) %>%
     mutate(isdecoy = fasta_identifier_isdecoy(idlong) | fasta_identifier_isdecoy(idshort))
+
+  if(uppercase_symbols) {
+    result$gene = toupper(result$gene)
+  }
+  return(result)
 }
 
 

diff --git a/R/util_generic.R b/R/util_generic.R
@@ -603,3 +603,34 @@ fit_t_dist_fixed_mu = function(x) {
   )
   if(!is.null(fit)) return(c(sigma=fit$par[1], df=fit$par[2]))
 }
+
+
+
+#' Throw an error if lme4::lmer() cannot be executed
+#'
+#' @description
+#' `stop()` is called when lme4 or Matrix is not installed, or when `lme4::lmer()`
+#' cannot be executed (with default settings and mock data).
+#' If this function does not throw an error, we may assume `lme4::lmer()` is operational.
+#' @export
+assert_lme4_functional = function() {
+  if(!requireNamespace("lme4", quietly = TRUE)) {
+    stop("required package lme4 is not installed", call. = FALSE)
+  }
+  if(!requireNamespace("Matrix", quietly = TRUE)) {
+    stop("required package Matrix is not installed", call. = FALSE)
+  }
+
+  result = tryCatch({
+    suppressMessages(lme4::lmer(formula = expression ~ (1 | condition), data = data.frame(
+      expression = c(14.19, 16.09, 14.94, 15.51, 14.82, 14.87),
+      condition = c(1, 1, 1, 2, 2, 2)
+    )))
+  }, error = function(e) {
+    return(e)
+  })
+
+  if("error" %in% class(result)) {
+    stop(paste0("error while executing function lme4::lmer(), the installed version of R packages lme4 and Matrix are possibly incompatible -> ", result$message))
+  }
+}