release 1.1.2

ftwkoopmans · Aug 12, 2024 · 27aad7d · 27aad7d
1 parent 24e069c
commit 27aad7d
Show file tree

Hide file tree

Showing 11 changed files with 94 additions and 56 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -3,7 +3,7 @@ Package: msdap
 Title: Mass Spectrometry Downstream Analysis Pipeline
 Description: Analyze label-free proteomics datasets from various sources (MaxQuant, Spectronaut, etc) using a pipeline that facilitates peptide filtering and many algorithms for normalization and statistical analysis. A comprehensive PDF report can be generated that includes many data visualizations and documentation thereof.
 URL: https://github.com/ftwkoopmans/msdap
-Version: 1.1.1
+Version: 1.1.2
 Authors@R: 
     person(given = "Frank",
            family = "Koopmans",

diff --git a/R/dataset.R b/R/dataset.R
@@ -275,7 +275,7 @@ tibble_peptides_reorder = function(tib) {
 #' @param peptides peptide tibble in long format
 empty_protein_tibble = function(peptides) {
   uprot = unique(peptides$protein_id)
-  return(tibble(protein_id = uprot, fasta_headers = uprot, gene_symbols = uprot, gene_symbols_ucount = 0L, gene_symbols_or_id = uprot))
+  return(tibble(protein_id = uprot, fasta_headers = uprot, gene_symbols = uprot, gene_symbol_ucount = 0L, gene_symbols_or_id = uprot))
 }
 
 

diff --git a/R/export_data_tables.R b/R/export_data_tables.R
@@ -94,8 +94,13 @@ export_protein_abundance_matrix = function(dataset, rollup_algorithm, output_dir
 
     # add protein metadata
     tib = dataset$proteins %>%
-      select(protein_id, fasta_headers, gene_symbols_or_id, gene_symbol_ucount) %>%
-      inner_join(as_tibble(m) %>% add_column(protein_id = rownames(m)), by="protein_id") %>%
+      select(protein_id, fasta_headers, gene_symbols_or_id, tidyselect::any_of("gene_symbol_ucount")) %>%
+      inner_join(as_tibble(m) %>% add_column(protein_id = rownames(m)), by="protein_id")
+    # if gene symbol count is lacking (i.e. dataset from old MS-DAP version), sort such that proteingroups without a symbol go on top (by guessing if gene_symbols_or_id is a long-format accession)
+    if( ! "gene_symbol_ucount" %in% colnames(tib)) {
+      tib$gene_symbol_ucount = as.integer(!grepl("|", tib$gene_symbols_or_id, fixed = TRUE) & tib$protein_id != tib$gene_symbols_or_id) # count = 1 when not a protein identifier, 0 otherwise
+    }
+    tib = tib %>%
       arrange(gene_symbol_ucount > 0, gene_symbols_or_id) %>% # proteins without gene symbol first, then sort by symbol
       select(-gene_symbol_ucount)
 

diff --git a/R/export_stats_genesummary.R b/R/export_stats_genesummary.R
@@ -140,7 +140,7 @@ export_stats_genesummary = function(dataset, gene_ambiguity = "prio_specific", d
   if(!(is.list(dataset) && "de_proteins" %in% names(dataset))) {
     append_log("dataset parameter should be a dataset that contains DEA results (did you forget to setup contrasts and run the analysis_quickstart function, or dea() ?)", type = "error")
   }
-  if(!(is.list(dataset) && "proteins" %in% names(dataset) && all(c("protein_id", "gene_symbols") %in% colnames(dataset$proteins)) && !all(dataset$proteins$protein_id == dataset$proteins$gene_symbols))) {
+  if(!(is.list(dataset) && "proteins" %in% names(dataset) && all(c("protein_id", "gene_symbols") %in% colnames(dataset$proteins)))) {
     append_log("dataset is missing essential information in the 'gene_symbols' column of the protein table. No FASTA has been imported for this dataset or the dataset was analyzed with an outdated MS-DAP version.\nTo use this function you won't have to re-run the entire analysis pipeline for this dataset, just run the import_fasta() function prior to applying export_stats_genesummary() to update the current dataset object (assuming this is a dataset searched against a uniprot FASTA)", type = "error")
   }
   if(!(length(gene_ambiguity) == 1 && all(gene_ambiguity %in% c("leading_gene", "prio_specific", "only_specific")))) {

diff --git a/R/process_peptide_data.R b/R/process_peptide_data.R
@@ -9,33 +9,6 @@ peptides_collapse_by_sequence = function(peptides, prop_peptide = "sequence_plai
 
 
 
-#' Import fasta file(s) that match your dataset.
-#'
-#' For fasta files from uniprot, we extract the gene symbols for each protein-group (not available for non-uniprot fasta files).
-#'
-#' @param dataset your dataset
-#' @param files an array of filenames, these should be the full path
-#' @param fasta_id_type what type of fasta files are these? options: "uniprot" (highly recommended) or otherwise any other character string (as we have no special rules for generic fasta files)
-#' @param protein_separation_character the separation character for protein identifiers in your dataset. Most commonly this is a semicolon (eg; in maxquant/metamorpheus/skyline/etc.)
-#' @export
-import_fasta = function(dataset, files = NULL, fasta_id_type = "uniprot", protein_separation_character = ";") {
-  if(length(files) == 0) {
-    append_log("no fasta files were provided, downstream analysis/code will use protein IDs as surrogate for fasta headers", type = "info")
-    dataset$proteins = empty_protein_tibble(peptides)
-  } else {
-    dataset$proteins = import_protein_metadata_from_fasta(
-      protein_id = dataset$peptides %>% filter({if("isdecoy" %in% names(.)) isdecoy else F} != TRUE) %>% distinct(protein_id) %>% pull(),
-      fasta_files = files,
-      fasta_id_type = fasta_id_type,
-      protein_separation_character = protein_separation_character
-    )
-  }
-
-  return(dataset)
-}
-
-
-
 #' Remove all peptides that match some protein-level filters
 #'
 #' @examples
@@ -111,11 +84,46 @@ remove_proteins_by_name = function(dataset, remove_irt_peptides = FALSE, regular
 
 
 
-#' placeholder title
-#' @param protein_id todo
-#' @param fasta_files todo
-#' @param fasta_id_type todo
-#' @param protein_separation_character todo
+#' Import fasta file(s) that match your dataset.
+#'
+#' For fasta files from uniprot, we extract the gene symbols for each protein-group (not available for non-uniprot fasta files).
+#'
+#' @param dataset your dataset
+#' @param files an array of filenames, these should be the full path
+#' @param fasta_id_type what type of fasta files are these? options: "uniprot" (highly recommended) or otherwise any other character string (as we have no special rules for generic fasta files)
+#' @param protein_separation_character the separation character for protein identifiers in your dataset. Most commonly this is a semicolon (eg; in maxquant/metamorpheus/skyline/etc.)
+#' @return table where
+#' protein_id = provided proteingroup identifier,
+#' accessions = result from fasta_id_short applied to each semicolon-delimited element in protein_id (result is a semicolon-collapsed string),
+#' fasta_headers = analogous to accessions, but matching the full FASTA header to each 'accessions' element,
+#' gene_symbols = full set of gene symbols, '-' where missing in FASTA, matching each element in 'accessions',
+#' gene_symbols_or_id = unique set of valid 'gene_symbol', or the FASTA full/long ID when there is no gene information
+#' gene_symbol_ucount = number of unique gene_symbols for this proteingroup (i.e. unique valid elements in 'gene_symbols')
+#' @export
+import_fasta = function(dataset, files = NULL, fasta_id_type = "uniprot", protein_separation_character = ";") {
+  if(length(files) == 0) {
+    append_log("no fasta files were provided, downstream analysis/code will use protein IDs as surrogate for fasta headers", type = "info")
+    dataset$proteins = empty_protein_tibble(peptides)
+  } else {
+    dataset$proteins = import_protein_metadata_from_fasta(
+      protein_id = dataset$peptides %>% filter({if("isdecoy" %in% names(.)) isdecoy else F} != TRUE) %>% distinct(protein_id) %>% pull(),
+      fasta_files = files,
+      fasta_id_type = fasta_id_type,
+      protein_separation_character = protein_separation_character
+    )
+  }
+
+  return(dataset)
+}
+
+
+
+#' Parse fasta files and construct a protein metadata table for provided protein identifiers
+#'
+#' @param protein_id an array of protein identifiers (that should be available in provided `fasta_files`)
+#' @param fasta_files an array of filenames, these should be the full path
+#' @param fasta_id_type what type of fasta files are these? options: "uniprot" (highly recommended) or otherwise any other character string (as we have no special rules for generic fasta files)
+#' @param protein_separation_character the separation character for protein identifiers in your dataset. Most commonly this is a semicolon (eg; in maxquant/metamorpheus/skyline/etc.)
 #' @return table where
 #' protein_id = provided proteingroup identifier,
 #' accessions = result from fasta_id_short applied to each semicolon-delimited element in protein_id (result is a semicolon-collapsed string),
@@ -128,9 +136,9 @@ import_protein_metadata_from_fasta = function(protein_id, fasta_files, fasta_id_
 
   #### read data from fasta files into tibble
   fasta = fasta_parse_file(fasta_files, fasta_id_type) %>%
-    filter(isdecoy == FALSE) %>% # remove decoys from fasta file
+    # remove decoys from fasta file then drop column
+    filter(isdecoy == FALSE) %>%
     select(-isdecoy)
-  fasta$gene_symbols_or_id = ifelse(nchar(fasta$gene) < 2, fasta$idlong, fasta$gene)
 
   #### protein-to-accession mapping table
   # convert protein_id to a list of respective accessions  -->>  long-format tibble
@@ -152,9 +160,14 @@ import_protein_metadata_from_fasta = function(protein_id, fasta_files, fasta_id_
   # finally, remove all invalid mappings and select unique elements (eg; protein-group with same protein accession listed twice is de-duped)
   prot2acc = prot2acc %>%
     filter(!is.na(acc) & isdecoy == FALSE) %>%
+    # merge in the fasta metadata
+    left_join(fasta, by = c(acc = "idshort")) %>%
+    # take unique set, using input with `fasta` sorting as-is = for each idshort, use data from first fasta file (where idshort was found)
     distinct(protein_id, acc_input, .keep_all = T)
-  # merge in the fasta metadata
-  prot2acc = prot2acc %>% left_join(fasta, by = c(acc = "idshort"))
+
+  # replace invalid/missing gene symbols with a '-'
+  # also removed uniprot-provided gene symbols like MGI:1111. example; ">sp|Q8BR90|CE051_MOUSE UPF0600 protein C5orf51 homolog OS=Mus musculus (Mouse) OX=10090 GN=MGI:2146232 PE=1 SV=1"
+  prot2acc$gene[is.na(prot2acc$gene) | nchar(prot2acc$gene) < 2 | grepl("^(MGI|RGD|HGNC):", prot2acc$gene)] = "-"
 
   # report mapping success rate at accession and protein-group level
   n_acc = nrow(prot2acc)
@@ -172,17 +185,22 @@ import_protein_metadata_from_fasta = function(protein_id, fasta_files, fasta_id_
     print(prot2acc %>% filter(is.na(header)), n=25)
   }
 
+
   #### collapse at protein-group level
   tib_result = prot2acc %>%
     group_by(protein_id) %>%
     summarise(
       accessions = paste(acc, collapse = ";"),
       fasta_headers = paste(header, collapse = ";"),
       # full list of gene symbols, '-' where missing, matching number-of-elements in accessions/fasta_headers
-      gene_symbols = paste(ifelse(nchar(gene) < 2, "-", gene), collapse = ";"),
-      gene_symbol_ucount = n_distinct(gene[nchar(gene) >= 2]),
-      # gene_symbols_or_id; take unique values, remove NA, remove strings of less than 2 characters
-      gene_symbols_or_id = paste(remove_by_charlength(unique(gene_symbols_or_id), minchar = 2), collapse = ";")
+      gene_symbols = paste(gene, collapse = ";"),
+      gene_symbol_ucount = n_distinct(gene[gene != "-"]),
+      # gene_symbols_or_id; take unique genes (remove missing/empty). If there are none, take (all) long-format accessions
+      gene_symbols_or_id = ifelse(
+        gene_symbol_ucount > 0,
+        paste(unique(gene[gene != "-"]), collapse = ";"), # protein_id without gene symbol are ignored / assumed same as gene symbols that we did find
+        paste(idlong, collapse = ";")
+      )
     )
 
   rows_fail = is.na(tib_result$gene_symbols_or_id) | tib_result$gene_symbols_or_id == ""
@@ -198,8 +216,11 @@ import_protein_metadata_from_fasta = function(protein_id, fasta_files, fasta_id_
   }
 
   rows_miss_fasta = is.na(tib_result$fasta_headers)
-  tib_result$fasta_headers[rows_miss_fasta] = tib_result$gene_symbols_or_id[rows_miss_fasta] = tib_result$protein_id[rows_miss_fasta]
+  if(any(rows_miss_fasta)) {
+    tib_result$fasta_headers[rows_miss_fasta] = tib_result$gene_symbols_or_id[rows_miss_fasta] = tib_result$protein_id[rows_miss_fasta]
+  }
 
+  # debug: tib_result %>% filter(gene_symbol_ucount ==0) %>% select(fasta_headers) %>% print(n=100)
   return(tib_result)
 }
 

diff --git a/R/stats_summary.R b/R/stats_summary.R
@@ -92,6 +92,9 @@ summarise_stats__for_contrast = function(dataset, return_dea, return_diffdetect,
   if(!all(c("protein_id", "gene_symbols_or_id") %in% colnames(dataset$proteins)) || !is.character(dataset$proteins$gene_symbols_or_id) || any(dataset$proteins$gene_symbols_or_id == "") ) {
     append_log("requires MS-DAP dataset where the proteins table contains gene symbols (did you forget to import_fasta() ?)", type = "error")
   }
+  if(!(is.list(dataset) && "proteins" %in% names(dataset) && all(c("protein_id", "gene_symbols") %in% colnames(dataset$proteins)))) {
+    append_log("dataset is missing essential information in the 'gene_symbols' column of the protein table. No FASTA has been imported for this dataset or the dataset was analyzed with an outdated MS-DAP version.\nTo use this function you won't have to re-run the entire analysis pipeline for this dataset, just run the import_fasta() function prior to applying export_stats_genesummary() to update the current dataset object (assuming this is a dataset searched against a uniprot FASTA)", type = "error")
+  }
   if(length(return_dea) != 1 || ! return_dea %in% c(TRUE, FALSE)) {
     append_log("return_dea must be single boolean", type = "error")
   }

diff --git a/docker/Dockerfile b/docker/Dockerfile
@@ -33,8 +33,8 @@ RUN R -e "devtools::install_version('remaCor', '0.0.18', upgrade = 'never', repo
 ### MS-DAP
 # from local devtools::build() file
 RUN mkdir /msdap
-COPY temp/msdap_1.1.1.tar.gz /msdap/
-RUN R -e "devtools::install_local('/msdap/msdap_1.1.1.tar.gz', upgrade = 'never')"
+COPY temp/msdap_1.1.2.tar.gz /msdap/
+RUN R -e "devtools::install_local('/msdap/msdap_1.1.2.tar.gz', upgrade = 'never')"
 # alternatively, install from github; RUN R -e "devtools::install_github('ftwkoopmans/[email protected]', upgrade = 'never')"
 
 

diff --git a/docker/msdap_launcher_unix.sh b/docker/msdap_launcher_unix.sh
@@ -3,7 +3,7 @@
 # MS-DAP launch script
 # https://github.com/ftwkoopmans/msdap
 
-VERSION="1.1.1"
+VERSION="1.1.2"
 
 
 ### OS

diff --git a/docker/msdap_launcher_windows.ps1 b/docker/msdap_launcher_windows.ps1
@@ -1,7 +1,7 @@
 # MS-DAP launch script
 # https://github.com/ftwkoopmans/msdap
 
-$VERSION = "1.1.1"
+$VERSION = "1.1.2"
 
 
 Write-Host "$((Get-Date).ToString("HH:mm:ss")) - Starting MS-DAP launcher script for version: $VERSION"

diff --git a/man/import_fasta.Rd b/man/import_fasta.Rd
diff --git a/man/import_protein_metadata_from_fasta.Rd b/man/import_protein_metadata_from_fasta.Rd