release 1.1

ftwkoopmans · Jul 21, 2024 · 9239384 · 9239384
1 parent 0ae8038
commit 9239384
Show file tree

Hide file tree

Showing 32 changed files with 928 additions and 775 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -3,7 +3,7 @@ Package: msdap
 Title: Mass Spectrometry Downstream Analysis Pipeline
 Description: Analyze label-free proteomics datasets from various sources (MaxQuant, Spectronaut, etc) using a pipeline that facilitates peptide filtering and many algorithms for normalization and statistical analysis. A comprehensive PDF report can be generated that includes many data visualizations and documentation thereof.
 URL: https://github.com/ftwkoopmans/msdap
-Version: 1.0.9
+Version: 1.1
 Authors@R: 
     person(given = "Frank",
            family = "Koopmans",

diff --git a/NAMESPACE b/NAMESPACE
@@ -22,6 +22,7 @@ export(differential_detect)
 export(export_peptide_abundance_matrix)
 export(export_protein_abundance_matrix)
 export(export_statistical_results)
+export(export_stats_genesummary)
 export(file_check)
 export(filename_strip_illegal_characters)
 export(filter_dataset)
@@ -69,8 +70,6 @@ export(plot_sample_pca__sample_in_contrast)
 export(plot_variance_explained)
 export(plot_volcano)
 export(print_dataset_summary)
-export(protein2gene_by_symbol)
-export(protein2gene_orthologs)
 export(protein_eset_from_data)
 export(read_textfile_compressed)
 export(regex_classification)
@@ -84,7 +83,6 @@ export(rollup_pep2prot_tmp)
 export(sample_color_coding__long_format)
 export(sample_metadata_custom)
 export(setup_contrasts)
-export(summarise_stats)
 export(tibble_as_eset)
 export(update_protein_mapping)
 export(update_protein_mapping_from_maxquant)

diff --git a/R/dataset.R b/R/dataset.R
@@ -275,7 +275,7 @@ tibble_peptides_reorder = function(tib) {
 #' @param peptides peptide tibble in long format
 empty_protein_tibble = function(peptides) {
   uprot = unique(peptides$protein_id)
-  return(tibble(protein_id = uprot, fasta_headers = uprot, gene_symbols_or_id = uprot))
+  return(tibble(protein_id = uprot, fasta_headers = uprot, gene_symbols = uprot, gene_symbols_or_id = uprot))
 }
 
 
@@ -314,7 +314,7 @@ diffdetect_summary_prettyprint = function(dataset, use_quant = FALSE, trim_contr
     # summary stats per contrast
     group_by(contrast) %>%
     summarise(`#proteins` = n(),
-              `#abs(zscore) >= 4` = sum(abs(zscore) >= 4),
+              `#abs(zscore) >= 6` = sum(abs(zscore) >= 6),
               `top10` = tolower(paste(stringr::str_trunc(head(gene_symbols_or_id, 10), width = 10, side = "right"), collapse=", ") )) %>%
     ungroup() %>%
     # sort contrasts in same order as defined by user

diff --git a/R/dea.R b/R/dea.R
@@ -494,15 +494,21 @@ dea_results_to_wide = function(dataset) {
   }
 
   # first, get the number of peptides used in each contrast. next, add the results from each dea algorithm in each contrast
+  tmp = dataset$de_proteins %>% select(protein_id, dea_algorithm, contrast, foldchange.log2, tidyselect::any_of("effectsize"), pvalue, qvalue, signif)
+  if("effectsize" %in% colnames(tmp)) {
+    tmp = tmp %>% pivot_wider(names_from = c(dea_algorithm, contrast), values_from = c(foldchange.log2, effectsize, pvalue, qvalue, signif))
+  } else {
+    tmp = tmp %>% pivot_wider(names_from = c(dea_algorithm, contrast), values_from = c(foldchange.log2, pvalue, qvalue, signif))
+  }
+
+
   tib = left_join(dataset$de_proteins %>%
                     select(protein_id, contrast, peptides_used_for_dea) %>%
                     distinct(protein_id, contrast, .keep_all = T) %>%
                     pivot_wider(names_from = contrast, values_from = peptides_used_for_dea, names_prefix = "peptides_used_for_dea_") %>%
                     replace(is.na(.), 0),
                   #
-                  dataset$de_proteins %>%
-                    select(protein_id, dea_algorithm, contrast, foldchange.log2, pvalue, qvalue, signif) %>%
-                    pivot_wider(names_from = c(dea_algorithm, contrast), values_from = c(foldchange.log2, pvalue, qvalue, signif)),
+                  tmp,
                   by="protein_id")
 
   # if there are multiple DEA algorithms in the results, add a column that combines their results such that all proteins significant in 2 or more tests/algorithms are flagged

diff --git a/R/export_data_tables.R b/R/export_data_tables.R
@@ -93,13 +93,10 @@ export_protein_abundance_matrix = function(dataset, rollup_algorithm, output_dir
     m = m[ , order(match(colnames(m), dataset$samples$sample_id)), drop=F]
 
     # add protein metadata
-    tib = dataset$proteins %>% inner_join(as_tibble(m) %>% add_column(protein_id = rownames(m)), by="protein_id") %>% arrange(protein_id)
-    if("accessions" %in% colnames(tib)) {
-      tib = tib %>% select(-accessions) # not useful for user, redundant with protein_id column in virtually all datasets
-    }
-    if("gene_symbols_or_id" %in% colnames(tib)) {
-      tib = tib %>% arrange(gene_symbols_or_id!=protein_id, gene_symbols_or_id) # proteins without gene symbol first, then sort by symbol
-    }
+    tib = dataset$proteins %>%
+      select(protein_id, fasta_headers, gene_symbols_or_id) %>%
+      inner_join(as_tibble(m) %>% add_column(protein_id = rownames(m)), by="protein_id") %>%
+      arrange(gene_symbols_or_id!=protein_id, gene_symbols_or_id) # proteins without gene symbol first, then sort by symbol
 
     ## write to file
     # generate filename. if very long (e.g. huge contrast name + long path in output_dir), try to shorten it with an md5 hash