release 1.1.1

ftwkoopmans · Aug 9, 2024 · 24e069c · 24e069c
1 parent 9239384
commit 24e069c
Show file tree

Hide file tree

Showing 28 changed files with 1,092 additions and 662 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -3,7 +3,7 @@ Package: msdap
 Title: Mass Spectrometry Downstream Analysis Pipeline
 Description: Analyze label-free proteomics datasets from various sources (MaxQuant, Spectronaut, etc) using a pipeline that facilitates peptide filtering and many algorithms for normalization and statistical analysis. A comprehensive PDF report can be generated that includes many data visualizations and documentation thereof.
 URL: https://github.com/ftwkoopmans/msdap
-Version: 1.1
+Version: 1.1.1
 Authors@R: 
     person(given = "Frank",
            family = "Koopmans",

diff --git a/R/dataset.R b/R/dataset.R
@@ -252,8 +252,8 @@ check_valid_tibble_samples = function(tib) {
 #'
 #' @param peptides peptide tibble in long format
 log_peptide_tibble_pep_prot_counts = function(peptides) {
-  report_unique = peptides %>% group_by(isdecoy) %>% summarise(n_precursor=n_distinct(peptide_id), n_seq=n_distinct(sequence_plain), n_prot=n_distinct(protein_id)) %>% arrange(isdecoy)
-  append_log(sprintf("%d target precursors, %d (plain)sequences, %d proteins", report_unique$n_precursor[1], report_unique$n_seq[1], report_unique$n_prot[1]), type = "info")
+  report_unique = peptides %>% group_by(isdecoy) %>% summarise(n_precursor=n_distinct(peptide_id), n_seq=n_distinct(sequence_plain), n_prot=n_distinct(protein_id), n_sample=n_distinct(sample_id)) %>% arrange(isdecoy)
+  append_log(sprintf("%d target precursors, %d (plain)sequences, %d proteins, %d samples", report_unique$n_precursor[1], report_unique$n_seq[1], report_unique$n_prot[1], report_unique$n_sample[1]), type = "info")
   if(nrow(report_unique) > 1) { # if there are decoys
     append_log(sprintf("%d decoy precursors, %d (plain)sequences, %d proteins", report_unique$n_precursor[2], report_unique$n_seq[2], report_unique$n_prot[2]), type = "info")
   }
@@ -275,7 +275,7 @@ tibble_peptides_reorder = function(tib) {
 #' @param peptides peptide tibble in long format
 empty_protein_tibble = function(peptides) {
   uprot = unique(peptides$protein_id)
-  return(tibble(protein_id = uprot, fasta_headers = uprot, gene_symbols = uprot, gene_symbols_or_id = uprot))
+  return(tibble(protein_id = uprot, fasta_headers = uprot, gene_symbols = uprot, gene_symbols_ucount = 0L, gene_symbols_or_id = uprot))
 }
 
 

diff --git a/R/export_data_tables.R b/R/export_data_tables.R
@@ -94,9 +94,10 @@ export_protein_abundance_matrix = function(dataset, rollup_algorithm, output_dir
 
     # add protein metadata
     tib = dataset$proteins %>%
-      select(protein_id, fasta_headers, gene_symbols_or_id) %>%
+      select(protein_id, fasta_headers, gene_symbols_or_id, gene_symbol_ucount) %>%
       inner_join(as_tibble(m) %>% add_column(protein_id = rownames(m)), by="protein_id") %>%
-      arrange(gene_symbols_or_id!=protein_id, gene_symbols_or_id) # proteins without gene symbol first, then sort by symbol
+      arrange(gene_symbol_ucount > 0, gene_symbols_or_id) %>% # proteins without gene symbol first, then sort by symbol
+      select(-gene_symbol_ucount)
 
     ## write to file
     # generate filename. if very long (eg; huge contrast name + long path in output_dir), try to shorting with md5 hash
@@ -135,11 +136,12 @@ export_peptide_abundance_matrix = function(dataset, output_dir) {
     tibw = tib %>%
       pivot_wider(id_cols = c(protein_id, peptide_id), names_from = "sample_id", values_from = "intensity") %>%
       # add protein metadata
-      left_join(dataset$proteins %>% select(protein_id, gene_symbols_or_id), by = "protein_id") %>%
+      left_join(dataset$proteins %>% select(protein_id, gene_symbols_or_id, gene_symbol_ucount), by = "protein_id") %>%
       # sort columns by order in sample metadata table
-      select(gene_symbols_or_id, protein_id, peptide_id, !!sid) %>%
+      select(gene_symbols_or_id, gene_symbol_ucount, protein_id, peptide_id, !!sid) %>%
       # sort output table
-      arrange(gene_symbols_or_id!=protein_id, gene_symbols_or_id, protein_id, peptide_id) # proteins without gene symbol first, then sort by symbol
+      arrange(gene_symbol_ucount > 0, gene_symbols_or_id, protein_id, peptide_id) %>% # proteins without gene symbol first, then sort by symbol
+      select(-gene_symbol_ucount)
 
     ## write to file
     # generate filename. if very long (eg; huge contrast name + long path in output_dir), try to shorting with md5 hash

diff --git a/R/parse_fragpipe.R b/R/parse_fragpipe.R
diff --git a/R/parse_fragpipe_ionquant.R b/R/parse_fragpipe_ionquant.R
diff --git a/R/parse_fragpipe_psm.R b/R/parse_fragpipe_psm.R
diff --git a/R/parse_longformat_generic.R b/R/parse_longformat_generic.R
@@ -274,7 +274,7 @@ update_protein_mapping = function(tib, pep2prot) {
 #'
 #' @importFrom data.table as.data.table chmatch setkey setorder
 #' @export
-import_dataset_in_long_format = function(filename=NULL, x = NULL, attributes_required=list(), attributes_optional = list(), confidence_threshold = 0.01,
+import_dataset_in_long_format = function(filename = NULL, x = NULL, attributes_required=list(), attributes_optional = list(), confidence_threshold = 0.01,
                                          remove_peptides_never_above_confidence_threshold = TRUE, select_unique_precursor_per_modseq = TRUE,
                                          custom_func_manipulate_DT_onload = NULL,
                                          return_decoys = TRUE, do_plot = TRUE) {

diff --git a/R/process_peptide_data.R b/R/process_peptide_data.R
@@ -116,37 +116,43 @@ remove_proteins_by_name = function(dataset, remove_irt_peptides = FALSE, regular
 #' @param fasta_files todo
 #' @param fasta_id_type todo
 #' @param protein_separation_character todo
+#' @return table where
+#' protein_id = provided proteingroup identifier,
+#' accessions = result from fasta_id_short applied to each semicolon-delimited element in protein_id (result is a semicolon-collapsed string),
+#' fasta_headers = analogous to accessions, but matching the full FASTA header to each 'accessions' element,
+#' gene_symbols = full set of gene symbols, '-' where missing in FASTA, matching each element in 'accessions',
+#' gene_symbols_or_id = unique set of valid 'gene_symbol', or the FASTA full/long ID when there is no gene information
+#' gene_symbol_ucount = number of unique gene_symbols for this proteingroup (i.e. unique valid elements in 'gene_symbols')
 import_protein_metadata_from_fasta = function(protein_id, fasta_files, fasta_id_type = "uniprot", protein_separation_character = ";") {
   protein_id = unique(protein_id)
 
   #### read data from fasta files into tibble
-  fasta = fasta_parse_file(fasta_files, fasta_id_type)
-  fasta$gene_symbols_or_id = ifelse(fasta$gene == "", fasta$idlong, fasta$gene)
+  fasta = fasta_parse_file(fasta_files, fasta_id_type) %>%
+    filter(isdecoy == FALSE) %>% # remove decoys from fasta file
+    select(-isdecoy)
+  fasta$gene_symbols_or_id = ifelse(nchar(fasta$gene) < 2, fasta$idlong, fasta$gene)
 
   #### protein-to-accession mapping table
   # convert protein_id to a list of respective accessions  -->>  long-format tibble
   l = strsplit(protein_id, protein_separation_character, fixed = T)
   prot2acc = tibble(
     protein_id = rep(protein_id, lengths(l)),
-    acc_input = unlist(l, use.names = F)
+    acc_input = unlist(l, use.names = F),
+    isdecoy = FALSE
   )
 
   # extract the accession
   prot2acc$acc = fasta_id_short(prot2acc$acc_input, fasta_id_type = fasta_id_type)
   # accession cannot be less than 3 characters
   prot2acc$acc[nchar(prot2acc$acc) < 3] = NA
-  # Filter valid accessions by regex. contaminant or decoy entries are not allowed. eg; CON_ REV_ DECOY_
-  # Check for ids that start or end with CON, REV or DECOY. Do this for both the full protein id and idshort.
-  # eg; suppose that input software prefixes the full ID with reverse: REV_sp|abc123|HUMAN_EXAMPLE -->> the reverse tag is lost in fasta_id_short(...)
-  # note that we cannot blindly match "_CON" to protein identifiers as this could fail for uniprot long IDs, eg; sp|abc123|HUMAN_CONTACTIN or whatever
-  prot2acc$acc[!is.na(prot2acc$acc) & (grepl("^CON_|_CON$|^REV_|_REV$|^DECOY_|_DECOY$", prot2acc$acc, ignore.case = T) |
-                                       grepl("^CON_|_CON$|^REV_|_REV$|^DECOY_|_DECOY$", prot2acc$acc_input, ignore.case = T))] = NA
-  # prot2acc$acc[!grepl("^[[:alnum:]-]+$", prot2acc$acc)] = NA # previous implementation only supports uniprot/ensembl, but not refseq where fasta headers are like; >NP_000006.2 arylamine N-acetyltransferase 2 [Homo sapiens]
+  # Filter valid accessions by regex, decoy entries are not allowed. eg; REV_ DECOY_
+  # Check both full and "short" IDs. eg; suppose that input software prefixes the full ID with reverse: REV_sp|abc123|HUMAN_EXAMPLE -->> the reverse tag is lost in fasta_id_short(...)
+  prot2acc$isdecoy = !is.na(prot2acc$acc) & (fasta_identifier_isdecoy(prot2acc$acc_input) | fasta_identifier_isdecoy(prot2acc$acc))
 
   # finally, remove all invalid mappings and select unique elements (eg; protein-group with same protein accession listed twice is de-duped)
   prot2acc = prot2acc %>%
-    drop_na() %>%
-    distinct(.keep_all = T)
+    filter(!is.na(acc) & isdecoy == FALSE) %>%
+    distinct(protein_id, acc_input, .keep_all = T)
   # merge in the fasta metadata
   prot2acc = prot2acc %>% left_join(fasta, by = c(acc = "idshort"))
 
@@ -172,9 +178,9 @@ import_protein_metadata_from_fasta = function(protein_id, fasta_files, fasta_id_
     summarise(
       accessions = paste(acc, collapse = ";"),
       fasta_headers = paste(header, collapse = ";"),
-      # full list of gene symbols, NA where missing
-      gene_symbols = paste(ifelse(is.na(gene) | nchar(gene) < 2, "-", gene), collapse = ";"), # for consistency, collapse
-      # gene_symbols = list(ifelse(is.na(gene) | nchar(gene) < 2, NA_character_, gene)), # alternatively, store as list
+      # full list of gene symbols, '-' where missing, matching number-of-elements in accessions/fasta_headers
+      gene_symbols = paste(ifelse(nchar(gene) < 2, "-", gene), collapse = ";"),
+      gene_symbol_ucount = n_distinct(gene[nchar(gene) >= 2]),
       # gene_symbols_or_id; take unique values, remove NA, remove strings of less than 2 characters
       gene_symbols_or_id = paste(remove_by_charlength(unique(gene_symbols_or_id), minchar = 2), collapse = ";")
     )