diff --git a/DESCRIPTION b/DESCRIPTION index 5d964fc..96c24e6 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -3,7 +3,7 @@ Package: msdap Title: Mass Spectrometry Downstream Analysis Pipeline Description: Analyze label-free proteomics datasets from various sources (MaxQuant, Spectronaut, etc) using a pipeline that facilitates peptide filtering and many algorithms for normalization and statistical analysis. A comprehensive PDF report can be generated that includes many data visualizations and documentation thereof. URL: https://github.com/ftwkoopmans/msdap -Version: 1.0.9 +Version: 1.1 Authors@R: person(given = "Frank", family = "Koopmans", diff --git a/NAMESPACE b/NAMESPACE index fd43944..8564914 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -22,6 +22,7 @@ export(differential_detect) export(export_peptide_abundance_matrix) export(export_protein_abundance_matrix) export(export_statistical_results) +export(export_stats_genesummary) export(file_check) export(filename_strip_illegal_characters) export(filter_dataset) @@ -69,8 +70,6 @@ export(plot_sample_pca__sample_in_contrast) export(plot_variance_explained) export(plot_volcano) export(print_dataset_summary) -export(protein2gene_by_symbol) -export(protein2gene_orthologs) export(protein_eset_from_data) export(read_textfile_compressed) export(regex_classification) @@ -84,7 +83,6 @@ export(rollup_pep2prot_tmp) export(sample_color_coding__long_format) export(sample_metadata_custom) export(setup_contrasts) -export(summarise_stats) export(tibble_as_eset) export(update_protein_mapping) export(update_protein_mapping_from_maxquant) diff --git a/R/dataset.R b/R/dataset.R index 603fc4e..29ce70c 100644 --- a/R/dataset.R +++ b/R/dataset.R @@ -275,7 +275,7 @@ tibble_peptides_reorder = function(tib) { #' @param peptides peptide tibble in long format empty_protein_tibble = function(peptides) { uprot = unique(peptides$protein_id) - return(tibble(protein_id = uprot, fasta_headers = uprot, gene_symbols_or_id = uprot)) + return(tibble(protein_id = uprot, fasta_headers = uprot, gene_symbols = uprot, gene_symbols_or_id = uprot)) } @@ -314,7 +314,7 @@ diffdetect_summary_prettyprint = function(dataset, use_quant = FALSE, trim_contr # summary stats per contrast group_by(contrast) %>% summarise(`#proteins` = n(), - `#abs(zscore) >= 4` = sum(abs(zscore) >= 4), + `#abs(zscore) >= 6` = sum(abs(zscore) >= 6), `top10` = tolower(paste(stringr::str_trunc(head(gene_symbols_or_id, 10), width = 10, side = "right"), collapse=", ") )) %>% ungroup() %>% # sort contrasts in same order as defined by user diff --git a/R/dea.R b/R/dea.R index 4542760..020f049 100644 --- a/R/dea.R +++ b/R/dea.R @@ -494,15 +494,21 @@ dea_results_to_wide = function(dataset) { } # first, get the number of peptides used in each contrast. next, add the results from each dea algorithm in each contrast + tmp = dataset$de_proteins %>% select(protein_id, dea_algorithm, contrast, foldchange.log2, tidyselect::any_of("effectsize"), pvalue, qvalue, signif) + if("effectsize" %in% colnames(tmp)) { + tmp = tmp %>% pivot_wider(names_from = c(dea_algorithm, contrast), values_from = c(foldchange.log2, effectsize, pvalue, qvalue, signif)) + } else { + tmp = tmp %>% pivot_wider(names_from = c(dea_algorithm, contrast), values_from = c(foldchange.log2, pvalue, qvalue, signif)) + } + + tib = left_join(dataset$de_proteins %>% select(protein_id, contrast, peptides_used_for_dea) %>% distinct(protein_id, contrast, .keep_all = T) %>% pivot_wider(names_from = contrast, values_from = peptides_used_for_dea, names_prefix = "peptides_used_for_dea_") %>% replace(is.na(.), 0), # - dataset$de_proteins %>% - select(protein_id, dea_algorithm, contrast, foldchange.log2, pvalue, qvalue, signif) %>% - pivot_wider(names_from = c(dea_algorithm, contrast), values_from = c(foldchange.log2, pvalue, qvalue, signif)), + tmp, by="protein_id") # if there are multiple DEA algorithms in the results, add a column that combines their results such that all proteins significant in 2 or more tests/algorithms are flagged diff --git a/R/export_data_tables.R b/R/export_data_tables.R index 7ce947e..ea0239b 100644 --- a/R/export_data_tables.R +++ b/R/export_data_tables.R @@ -93,13 +93,10 @@ export_protein_abundance_matrix = function(dataset, rollup_algorithm, output_dir m = m[ , order(match(colnames(m), dataset$samples$sample_id)), drop=F] # add protein metadata - tib = dataset$proteins %>% inner_join(as_tibble(m) %>% add_column(protein_id = rownames(m)), by="protein_id") %>% arrange(protein_id) - if("accessions" %in% colnames(tib)) { - tib = tib %>% select(-accessions) # not useful for user, redundant with protein_id column in virtually all datasets - } - if("gene_symbols_or_id" %in% colnames(tib)) { - tib = tib %>% arrange(gene_symbols_or_id!=protein_id, gene_symbols_or_id) # proteins without gene symbol first, then sort by symbol - } + tib = dataset$proteins %>% + select(protein_id, fasta_headers, gene_symbols_or_id) %>% + inner_join(as_tibble(m) %>% add_column(protein_id = rownames(m)), by="protein_id") %>% + arrange(gene_symbols_or_id!=protein_id, gene_symbols_or_id) # proteins without gene symbol first, then sort by symbol ## write to file # generate filename. if very long (eg; huge contrast name + long path in output_dir), try to shorting with md5 hash diff --git a/R/export_stats_genesummary.R b/R/export_stats_genesummary.R new file mode 100644 index 0000000..7118bad --- /dev/null +++ b/R/export_stats_genesummary.R @@ -0,0 +1,351 @@ + +#' Create gene-level summary tables for your DEA and differential detection results +#' +#' @description +#' +#' To expedite downstream analysis, this function maps the proteingroup results from your +#' differential expression analysis to Human gene identifiers and exports these as Excel tables that are ready for +#' use with GO tools. Optionally, one can also include results from differential detection analysis. +#' Multiple proteingroups that map to the same unique gene are merged and there are various options +#' for dealing with ambiguous proteingroups. +#' +#' The resulting output tables can be used as input with the GOAT online geneset analysis tool @ \url{https://ftwkoopmans.github.io/goat/} +#' +#' *combining differential expression and differential detection stats* +#' +#' If differential detection (DD) was performed on this dataset and the `diffdetect_zscore_threshold` parameter is not NA, +#' the DD z-scores (per contrast) are converted to p-values by `pnorm(abs(zscore), lower.tail = F)`, +#' assuming these are approximately normally distributed (!), and then multiple testing correction by FDR is applied. +#' However, this is just an approximation; the differential detection scores are a very simplistic approach and should +#' be considered inferior to DEA analyses. Double-check the zscore distributions and treat these results with care ! +#' Plot your DD results with `plot_differential_detect(dataset, zscore_threshold = 6)`. +#' Stringent cutoffs (absolute diffdetect_zscore_threshold of at least 5~6, or higher) are encouraged when using differential detection results. +#' +#' If DD is enabled (`diffdetect_zscore_threshold` parameter is not NA), the effectsizes from DEA and DD are both standardized to make +#' their distributions somewhat comparable. Note that the downside is that the effectsizes returned by this function +#' no longer will be the exact effectsizes returned by the DEA model (those can be then found @ output column `effectsize_dea`). +#' +#' *dealing with overlapping gene symbols between proteingroups* +#' +#' Importantly, there may be multiple proteingroups (isoforms) in any proteomics dataset that match the same gene symbol. +#' This function will return the proteingroup/row with the lowest p-value for each unique gene (per contrast, per DEA algorithm). +#' Your selected option for `gene_ambiguity` will affect this; +#' if for example "leading_gene" is selected as an option then a "GRIA1;GRIA2" proteingroup with pvalue=0.01 will be +#' chosen as representative value for "GRIA1" over a "GRIA1" proteingroup with pvalue=0.5 (see further below parameter documentation) +#' +#' +#' @examples \dontrun{ +#' +#' ## note that in this example, we have renamed the downloaded files +#' ## to include a timestamp; useful for tracking versions +#' # load the HGNC data table you previously downloaded +#' hgnc_table = hgnc_lookuptable("C:/DATA/hgnc_complete_set__2024-01-01.txt") +#' +#' # MGI Mouse database, only needed for Mouse datasets +#' #mgi_table = mgi_lookuptable("C:/DATA/MRK_SwissProt_TrEMBL__2024-01-01.rpt") +#' # RGD Rat database, only needed for Rat datasets +#' #rgd_table = rgd_lookuptable("C:/DATA/GENES_RAT__2024-01-01.rpt") +#' +#' export_stats_genesummary( +#' dataset, +#' # set NA to ignore differential detection (default), or define an absolute +#' # zscore threshold (e.g. 6). ! when using diffdetect, first review the +#' # zscore distributions using MS-DAP function: plot_differential_detect() +#' diffdetect_zscore_threshold = NA, +#' # options for dealing with proteingroups that map to multiple genes: +#' # leading_gene, prio_specific, only_specific +#' gene_ambiguity = "prio_specific", # prio_specific and only_specific are recommended +#' # always need the HGNC data table +#' hgnc = hgnc_table, +#' # example: for Mouse datasets, provide the MGI database for more accurate ID mapping +#' # xref = mgi_table, +#' # example: for Rat datasets, provide the RGD database for more accurate ID mapping +#' # xref = rgd_table, +#' # write the files to the same directory as main MS-DAP results +#' output_dir = dataset$output_dir +#' ) +#' } +#' @param dataset to use this function, your dataset should meet these requirements: +#' \itemize{ +#' \item dataset was searched against a uniprot FASTA in MaxQuant/Spectronaut/DIA-NN/etc. (i.e. MS-DAP can only extract gene symbol info from from uniprot.org FASTA databases) +#' \item `import_fasta()` has been applied to this dataset +#' \item differential expression analysis has been applied (e.g. typical `analysis_quickstart()` workflow, or custom code that applied `filter_dataset()` and `dea()`) +#' } +#' @param gene_ambiguity options: leading_gene, prio_specific, only_specific (default: prio_specific). +#' ## example table from GOAT online website +#' this list describes various scenarios; gene symbol(s) that represent some proteingroup together with a description of presented information +#' \itemize{ +#' \item GRIA1 = protein group maps to exactly 1 gene +#' \item GRIA2 = protein group maps to exactly 1 gene +#' \item GRIA1;GRIA2 = ambiguous, one might want to use only the first entry ('leading' gene) +#' \item GRIA3;GRIA4 = ambiguous, but this row contributes a new gene (GRIA3) +#' \item tr|A8K0K0|A8K0K0_HUMAN;GRIA2;GRIA3 = ambiguous, but the first entry has no gene symbol only an accession +#' } +#' +#' ## options and their result when applied to above examples +#' \itemize{ +#' \item leading_gene = use only the leading/first gene per proteingroup. e.g. "GRIA1;GRIA2" is mapped to "GRIA1" and will overwrite a specific "GRIA1" proteingroup if (and only if) the former has a lower p-value. +#' all rows in this example will be mapped to a gene ID (respectively, GRIA1, GRIA2, GRIA1, GRIA3, GRIA2). +#' \item prio_specific = discard ambiguous proteingroups if their leading/first gene overlaps with a specific proteingroup. e.g. "GRIA1;GRIA2" is mapped to "GRIA1" only if there is no specific/unambiguous "GRIA1" proteingroup. +#' rows 3 and 5 in this example are discarded because there exist specific entries for GRIA1 and GRIA2 (respectively, GRIA1, GRIA2, -, GRIA3, -). This approach favors rows that are specific and supplements the results only with ambiguous rows that contribute new information (leading gene is not in the results yet). +#' \item only_specific = all ambiguous proteingroups are discarded (included in output table, but human gene ID is set to NA). +#' only the first 2 rows in this example are mapped (respectively, GRIA1, GRIA2, -, -, -). +#' } +#' @param diffdetect_zscore_threshold Optionally, a differential detection absolute z-score cutoff. Set to NA to disable diffdetect (default) to return only DEA results. When using this option, we strongly recommend to first review the zscore distributions using MS-DAP function: `plot_differential_detect()` -> double-check that your selected absolute zscore threshold cuts at intended positions in the distribution ! +#' @param diffdetect_type type of differential detect scores. options: +#' 'auto' = set to 'detect' if this score is available, 'quant' otherwise +#' 'detect' = differential detection z-scores computed from only "detected" peptides (no MBR) +#' 'quant' = differential detection z-scores computed from all quantified peptides (uses MBR) +#' @param dea_logfc_instead_of_effectsize boolean parameter: should log2fc values be returned in the effectsize column? In some edge cases this might be helpful for downstream analysis but should generally be left at default value (FALSE) +#' @param output_dir where to write the Excel output tables (e.g. "C:/temp"). Set to NA to disable writing tables +#' @param hgnc HGNC lookup table, the output from function `hgnc_lookuptable()` +#' @param xref For Mouse and Rat datasets we recommend to provide MGI/RGD database files to increase ID mapping accuracy. For Mouse species, this should be the MGI lookup table from function `mgi_lookuptable()`. For Rat, the RGD lookup table from function `rgd_lookuptable()` +#' @param remove_nohgnc set to TRUE to remove all rows that could not be matched to a HGNC ID. default; FALSE +#' @returns the table(s) produced by this function are gene-level summaries of your statistical analyses. +#' The R data.table returned by this function is in long-format (i.e. results from multiple DEA algorithms and contrast are appended). +#' The Excel output files are split by contrast and DEA algorithm for convenience: these can be directly used as input for gene set analysis tools. +#' Proteingroups that failed to map to human genes are included in the output table but will have NA values in the hgnc_id column. +#' +#' A description of columns: +#' \itemize{ +#' \item 'protein_id' contains the proteingroup identifier (concatenation of protein identifiers) +#' \item 'contrast' describes the statistical contrast (e.g. WT vs KO) +#' \item 'dea_algorithm' describes the DEA algorithm that configured. Note that if you select multiple algorithms while running `analysis_quickstart()`, there will be multiple rows in this table for each gene*contrast ! +#' \item 'score_type' is either 'dea', 'dd' or 'dea,dd' indicating whether the effectsize/pvalue/pvalue_adjust shown for this row originates from differential expression analysis (DEA) or differential detection (DD), or both (i.e. statistics were available for both, in which case the metric with lowest pvalue was used to populate the effectsize/pvalue/pvalue_adjust columns) +#' \item 'peptide_count' describes the number of peptides that was used for DEA. So this is NOT the total number of detected peptides, rather this is the subset of peptides that passed your specified filtering rules (and were thus used for DEA) +#' \item 'log2fc' log2 foldchanges; for DEA results this value is taken straight from the DEA result table. For DD results this is the 'diffdetect zscore' +#' \item 'pvalue' analogous to `log2fc` +#' \item 'pvalue_adjust' analogous to `log2fc` +#' \item 'signif' boolean flag indicating significance. For DEA, the criteria for significance that were previously configured when performing DEA using `dea()` or +#' `analysis_quickstart()` functions are reused here (i.e. cutoffs for adjusted p-value and log2 foldchange). For DD results, any included protein (that passes the `diffdetect_zscore_threshold` cutoff) is set to TRUE. +#' \item The 'effectsize' column data depends on the user parameters/configuration; +#' When only differential detect is used, this contains the z-scores. When only DEA is used, this contains effectsizes from DEA as-is. +#' However, when DEA and differential detect are combined, the effectsizes from DEA are standardized (divided by std) such that effectsizes from +#' both statistics can be integrated into 1 distribution (and thus these can be compared/ranked in downstream analyses). +#' For rows that have both DD and DEA results, see item `score_type` +#' Not a perfect solution, but in combination with stringent `diffdetect_zscore_threshold` parameter this seems to work ok in our hands. +#' \item 'effectsize_dea' is only included when DD is enabled; it shows the DEA effectsizes as-is without any rescaling (and rescaled values merged with rescaled DD values are shown in 'effectsize') +#' \item 'gene_symbols_or_id' contains the uniprot gene symbols for respective accessions in the proteingroup (protein_id column) as per usual in MS-DAP. +#' \item 'hgnc_id' the mapped Human gene ID (can be NA in case of failed mappings) +#' \item 'symbol' the gene symbol of the respective 'hgnc_id' (can be NA in case of failed mappings) +#' \item 'ensembl_id' the Ensembl gene ID of the respective 'hgnc_id' (can be NA in case of failed mappings) +#' \item 'entrez_id' the NCBI Entrez gene ID of the respective 'hgnc_id' (can be NA in case of failed mappings) +#' \item 'gene' same as the 'entrez_id' column. Provided for drag-and-drop compatibility with the GOAT online tool +#' } +#' @export +export_stats_genesummary = function(dataset, gene_ambiguity = "prio_specific", diffdetect_zscore_threshold = NA, diffdetect_type = "auto", dea_logfc_instead_of_effectsize = FALSE, output_dir = NA, hgnc, xref = NULL, remove_nohgnc = FALSE) { + result = NULL + + ### input validation + if(!(is.list(dataset) && "de_proteins" %in% names(dataset))) { + append_log("dataset parameter should be a dataset that contains DEA results (did you forget to setup contrasts and run the analysis_quickstart function, or dea() ?)", type = "error") + } + if(!(is.list(dataset) && "proteins" %in% names(dataset) && all(c("protein_id", "gene_symbols") %in% colnames(dataset$proteins)) && !all(dataset$proteins$protein_id == dataset$proteins$gene_symbols))) { + append_log("dataset is missing essential information in the 'gene_symbols' column of the protein table. No FASTA has been imported for this dataset or the dataset was analyzed with an outdated MS-DAP version.\nTo use this function you won't have to re-run the entire analysis pipeline for this dataset, just run the import_fasta() function prior to applying export_stats_genesummary() to update the current dataset object (assuming this is a dataset searched against a uniprot FASTA)", type = "error") + } + if(!(length(gene_ambiguity) == 1 && all(gene_ambiguity %in% c("leading_gene", "prio_specific", "only_specific")))) { + append_log("gene_ambiguity parameter should be 1 of the following options: leading_gene, prio_specific, only_specific", type = "error") + } + if(!(length(diffdetect_zscore_threshold) == 1 && (is.na(diffdetect_zscore_threshold) || (is.numeric(diffdetect_zscore_threshold) && is.finite(diffdetect_zscore_threshold) && diffdetect_zscore_threshold > 0)))) { + append_log("diffdetect_zscore_threshold parameter should be either NA (to disable) or a positive number", type = "error") + } + if(!(length(dea_logfc_instead_of_effectsize) == 1 && dea_logfc_instead_of_effectsize %in% c(TRUE, FALSE))) { + append_log("dea_logfc_instead_of_effectsize parameter should be TRUE or FALSE", type = "error") + } + if(!(length(output_dir) == 1 && (is.na(output_dir) || is.character(output_dir) && output_dir != ""))) { + append_log("output_dir parameter should be either NA (to disable) or a (non-empty) character/string", type = "error") + } + if(!(is.data.frame(hgnc) && all(c("hgnc_id", "hgnc_symbol", "type", "value") %in% colnames(hgnc)))) { + append_log("hgnc parameter should be a data.frame with columns hgnc_id, hgnc_symbol, type, value (output from hgnc_lookuptable() function)", type = "error") + } + if(!is.null(xref) && !(is.data.frame(xref) && nrow(xref) > 0 && ncol(xref) >= 3 && all(c("symbol", "uniprot_id") %in% colnames(xref)))) { + append_log("xref parameter should be a data.frame with columns mgi_id/rgd_id, symbol, uniprot_id (output from mgi_lookuptable() or rgd_lookuptable() function)", type = "error") + } + if(!is.null(xref) && !colnames(xref)[1] %in% c("mgi_id", "rgd_id")) { + append_log("xref parameter only works with MGI and RGD atm (output from mgi_lookuptable() or rgd_lookuptable() function): first column must be mgi_id or rgd_id", type = "error") + } + if(!(length(remove_nohgnc) == 1 && remove_nohgnc %in% c(TRUE, FALSE))) { + append_log("remove_nohgnc parameter should be TRUE or FALSE", type = "error") + } + + ### auto-detect species + issue user warning + if("fasta_headers" %in% colnames(dataset$proteins)) { + # example fasta header; ">tr|A0A024R4E5|A0A024R4E5_HUMAN Isoform of Q00341, High density lipoprotein binding protein (Vigilin)…" + tmp = dataset$proteins %>% filter(protein_id %in% unique(dataset$de_proteins$protein_id)) %>% pull(fasta_headers) + tmp = gsub(" .*", "", tmp) # remove description, full ID remains + tmp = gsub(".*_", "", tmp) # remove everything except the species + tmp = sort(table(tolower(tmp)), decreasing = TRUE) # counts for the unique species set + tmp_subset = tmp[tmp > sum(tmp) * 0.6] # remove minor species + + # xref is provided: double-check that it matches species in the FASTA + if(!is.null(xref)) { + xref_is_mouse = colnames(xref)[1] == "mgi_id" + xref_is_rat = colnames(xref)[1] == "rgd_id" + + if(xref_is_mouse && ! "mouse" %in% names(tmp_subset)) { + append_log(paste0("export_stats_genesummary: an MGI idmapping database is provided in parameter 'xref' while the vast majority of proteins in the dataset (auto-detected from uniprot headers in dataset$proteins) are not Mouse species.\nDetected majority species: ", + names(tmp_subset)[1], "\nOnly provide a crossreference database (parameter 'xref') when your dataset contains mostly proteins from that specific species!"), type = "error") + } + + if(xref_is_rat && ! "rat" %in% names(tmp_subset)) { + append_log(paste0("export_stats_genesummary: an RGD idmapping database is provided in parameter 'xref' while the vast majority of proteins in the dataset (auto-detected from uniprot headers in dataset$proteins) are not Rat species.\nDetected majority species: ", + names(tmp_subset)[1], "\nOnly provide a crossreference database (parameter 'xref') when your dataset contains mostly proteins from that specific species!"), type = "error") + } + } else { + # no xref is provided, but maybe the user should ! + if(any(c("mouse", "rat") %in% names(tmp_subset))) { + append_log(paste0("export_stats_genesummary: majority of protein species that was auto-detected from uniprot headers: ", + paste(unique(names(tmp_subset)), collapse = ","), "\nFor Mouse and Rat datasets we recommend to provide MGI/RGD database files to increase ID mapping accuracy (see further export_stats_genesummary() function documentation)"), type = "warning") + } + } + } + + + + ### combine DEA and DD + # Summarise the statistical data in your dataset, assuming DEA was applied. + # note that if you analyzed multiple contrasts, optionally with multiple DEA algorithms, all results are appended in this table. + allstats = summarise_stats( + dataset, + return_dea = TRUE, + return_diffdetect = is.finite(diffdetect_zscore_threshold), # set to TRUE to integrate respective 'strong z-scores' + dea_logfc_as_effectsize = dea_logfc_instead_of_effectsize, + diffdetect_zscore_threshold = ifelse(is.finite(diffdetect_zscore_threshold), diffdetect_zscore_threshold, 6), + diffdetect_type = diffdetect_type + ) + + + + ### create long-format table of uniprot_id*symbol for each unique proteingroup + proteins = allstats %>% + select(protein_id, gene_symbols) %>% + distinct(protein_id, .keep_all = TRUE) %>% + mutate(uniprot_id = strsplit(protein_id, "[ ,;]+"), + symbol = strsplit(gene_symbols, "[ ,;]+")) + + if(!all(lengths(proteins$uniprot_id) == lengths(proteins$symbol))) { + append_log("technical error: protein_id and gene_symbols columns contain different element counts. Rerun this dataset with the latest MS-DAP R function to amend, or create a GitHub ticket if this issue persists", type = "error") + } + + proteins_long = proteins %>% select(-gene_symbols) %>% tidyr::unchop(cols = c(uniprot_id, symbol)) + + + + ### map unique proteingroups to Human gene identifiers + # first try to use the provided cross-reference database + # (from uniprot to MGI, fallback from symbol to MGI, then MGI to HGNC) + if(!is.null(xref)) { + proteins_long = idmap_uniprotid(proteins_long, hgnc = hgnc, idmap = xref, use_symbols = TRUE) + } + # always map by gene symbol; will skip already-mapped (e.g. if mouse species & mapped via MGI) + proteins_long = idmap_symbols(proteins_long, hgnc = hgnc, use_synonyms = TRUE) + + + + ### now join the unique proteingroup-to-HGNC mappings with table `allstats` + # hgnc_list = a list column where each protein_id has an array of HGNC identifiers (length > 0), NA where missing/failed + allstats = allstats %>% + left_join(proteins_long %>% select(protein_id, hgnc_list = hgnc_id) %>% tidyr::chop(cols = "hgnc_list"), by = "protein_id") + + + + ### apply mapping rules + ### importantly, sorting of `allstats` table prior to running below code MUST be ascending by pvalue + + allstats$hgnc_list_unique_notna = lapply(allstats$hgnc_list, function(g) { unique(na.omit(g)) }) # naive/slow, but prefer simple code + # dev note: don't use `allstats %>% filter(lengths(hgnc_list) == 1)` because a proteingroup can contain multiple uniprot identifiers that all map to the same gene --> hgnc_list will not be length 1 + + allstats$isambiguous = grepl(";", allstats$gene_symbols_or_id, fixed = TRUE) # infer ambiguous directly from uniprot gene symbols (i.e. failed mapping to HGNC have no effect) + # allstats$isambiguous = lengths(allstats$hgnc_list_unique_notna) > 1 # alternatively, determine "ambiguous" from successful HGNC mappings @ hgnc_list_unique_notna + + # take the first non-NA HGNC identifier (NA if none) + allstats$hgnc_id = unlist(lapply(allstats$hgnc_list_unique_notna, function(g) { + # always return exactly 1 value, thus downstream unlist() will yield an array that exactly matches table length + if(length(g) == 0) return(NA_character_) + return(g[1]) + }), recursive = FALSE, use.names = FALSE) + + if(gene_ambiguity == "only_specific") { + # invalidate all entries that are ambiguous + # importantly, do this before computing `hgnc_id_FORUNIQUE` + allstats$hgnc_id[allstats$isambiguous] = NA_character_ + } + + if(gene_ambiguity == "prio_specific") { + # sort table such that ambiguous hits are at the bottom; importantly, ties-of-ambiguity are still sorted as-is + # taking first match per gene downstream = prio specific/unambiguous over ambiguous proteingroup that has a "better pvalue" + allstats = allstats %>% arrange(isambiguous) + } + + # ! importantly, add a unique ID to proteingroups not mapped to HGNC so we can use distinct() downstream without grouping NA values (failed mappings) + allstats$hgnc_id_FORUNIQUE = ifelse(is.na(allstats$hgnc_id), allstats$gene_symbols_or_id, allstats$hgnc_id) + # result table: retain unique rows per gene*contrast*dea_algorithm (assuming table has been sorted upstream) + result = allstats %>% distinct(hgnc_id_FORUNIQUE, contrast, dea_algorithm, .keep_all = TRUE) + + + + ### finally, stable sort result table + add gene database crossreferences + # sort table again ("prio_specific" option affects sorting, so here enforce stable ordering) + result = result %>% + arrange(desc(signif), pvalue) %>% + select(-tidyselect::any_of(c("gene_symbols", "symbol", "hgnc_list", "hgnc_list_unique_notna", "hgnc_id_FORUNIQUE", "isambiguous"))) %>% + hgnc_add_xrefs(hgnc = hgnc) + # add named column specifically for GOAT + result$gene = result$entrez_id + + + + tmp = allstats %>% distinct(protein_id, .keep_all = TRUE) + log_pg_ambiguous = sum(tmp$isambiguous) + + if(gene_ambiguity == "only_specific") { + # for "only_specific" option, don't count ambiguous genes @ success rate + tmp = tmp %>% filter(isambiguous == FALSE) + log_count_pg = nrow(tmp) + log_count_pg_success = tmp %>% filter(!is.na(hgnc_id)) %>% nrow() + + log_string = sprintf( + "%d / %d (%.1f%%) proteingroups were successfully mapped to %d unique human genes. %d additional ambiguous proteingroups were ignored (due to gene_ambiguity='only_specific')", + log_count_pg_success, log_count_pg, log_count_pg_success / log_count_pg * 100, n_distinct(na.omit(tmp$hgnc_id)), log_pg_ambiguous + ) + } else { # not 'only_specific' + log_count_pg = nrow(tmp) + log_count_pg_success = tmp %>% filter(!is.na(hgnc_id)) %>% nrow() + log_string = sprintf( + "%d / %d (%.1f%%) proteingroups were successfully mapped to %d unique human genes. ", + log_count_pg_success, log_count_pg, log_count_pg_success / log_count_pg * 100, n_distinct(na.omit(tmp$hgnc_id)) + ) + + tmp_g = tmp %>% filter(isambiguous == TRUE) %>% pull(gene_symbols_or_id) + log_pg_ambiguous_overlap_unambiguous = sum(gsub(";.*", "", tmp_g) %in% tmp$gene_symbols_or_id) + log_string = sprintf( + "%s%d / %d ambiguous proteingroups had a leading gene symbol that overlapped with an unambiguous proteingroup", + log_string, log_pg_ambiguous_overlap_unambiguous, log_pg_ambiguous + ) + } + append_log(log_string, type = "info") + + + + ### optionally, remove failed mappings + if(remove_nohgnc) { + result = result %>% filter(!is.na(hgnc_id)) + } + + + + ### print output tables + if(is.na(output_dir)) { + append_log("output_dir parameter set to NA, skip writing Excel output tables", type = "info") + } else { + for(contr in unique(result$contrast)) { + for(algo_dea in unique(result$dea_algorithm)) { + tmp = result %>% filter(contrast == contr & dea_algorithm == algo_dea) + f = sprintf("%s/%s__%s.xlsx", output_dir, filename_strip_illegal_characters(sub("contrast: *", "", contr), strict = TRUE), algo_dea) + openxlsx::write.xlsx(tmp, f) + } + } + } + + return(invisible(result)) +} diff --git a/R/gene_idmapping.R b/R/gene_idmapping.R index 8ec22d9..6751d40 100644 --- a/R/gene_idmapping.R +++ b/R/gene_idmapping.R @@ -14,15 +14,17 @@ #' @returns a long-format table with columns; hgnc_id, hgnc_symbol, type, value #' @export hgnc_lookuptable = function(f) { - check_parameter_is_string(f) - if(!file.exists(f)) { - append_log(paste("file does not exist:", f), type = "error") - } - result = NULL + check_parameter_is_string(f) # parse HGNC table from disk - hgnc = data.table::fread(f, data.table = F, stringsAsFactors = F) + f = path_exists(f, NULL, try_compressed = TRUE) + hgnc = as_tibble(read_textfile_compressed(f, as_table = TRUE)) + # if(!file.exists(f)) { + # append_log(paste("file does not exist:", f), type = "error") + # } + # hgnc = data.table::fread(f, data.table = F, stringsAsFactors = F) + # check that all required columns are present + rename columns cols = list( @@ -164,12 +166,16 @@ hgnc_lookuptable = function(f) { #' @export mgi_lookuptable = function(f) { check_parameter_is_string(f) - if(!file.exists(f)) { - append_log(paste("file does not exist:", f), type = "error") - } + + # parse HGNC table from disk + f = path_exists(f, NULL, try_compressed = TRUE) + x = as_tibble(read_textfile_compressed(f, as_table = TRUE, header = FALSE)) + # if(!file.exists(f)) { + # append_log(paste("file does not exist:", f), type = "error") + # } + # x = data.table::fread(f, data.table = FALSE, stringsAsFactors = FALSE, header = FALSE) # MGI Marker Accession ID | Marker Symbol | Status | Marker Name | cM Position | Chromosome | SWISS-PROT/TrEMBL Protein Accession IDs (space-delimited) - x = data.table::fread(f, data.table = FALSE, stringsAsFactors = FALSE) if(ncol(x) != 7) { append_log(paste0("invalid MGI table; we expected 7 columns but found ", ncol(x), ". Is this MRK_SwissProt_TrEMBL.rpt @ MGI Marker associations to SWISS-PROT and TrEMBL protein IDs (tab-delimited) ?"), type = "error") } @@ -199,17 +205,19 @@ mgi_lookuptable = function(f) { #' @export rgd_lookuptable = function(f) { check_parameter_is_string(f) - if(!file.exists(f)) { - append_log(paste("file does not exist:", f), type = "error") - } - # load table from disk - x = data.table::fread(f, data.table = FALSE, stringsAsFactors = FALSE) + # parse table from disk + f = path_exists(f, NULL, try_compressed = TRUE) + x = as_tibble(read_textfile_compressed(f, as_table = TRUE, datatable_skip = "GENE_RGD_ID")) + # if(!file.exists(f)) { + # append_log(paste("file does not exist:", f), type = "error") + # } + # x = data.table::fread(f, data.table = FALSE, stringsAsFactors = FALSE, skip = "GENE_RGD_ID") # check that all required columns are present + rename columns cols = list( rgd_id = c("GENE_RGD_ID"), - symbol = c("symbol", "SYMBOL"), + symbol = c("SYMBOL"), uniprot_id = c("UNIPROT_ID") ) x = robust_header_matching(x, column_spec = cols, columns_required = names(cols)) @@ -227,183 +235,108 @@ rgd_lookuptable = function(f) { -#' Map the protein_id column, which are assumed to contain uniprot mouse/rat identifiers, in a table to HGNC human gene IDs -#' -#' This function is currently limited such that the first matching human gene per proteingroup is returned, -#' so ambiguous proteingroups with multiple gene symbols (e.g. "GRIA1;GRIA2") might not always yield the 'leading' -#' gene and additional/ambiguous genes are disregarded). Columns describing the HGNC (genenames.org) ID, -#' NCBI Entez ID and Ensembl gene ID are appended to the provided table. +#' helper function to match a protein table to HGNC via gene symbols (and synonyms) #' -#' Workflow for ID mapping, where subsequent steps are fallbacks; -#' 1) uniprot id -> mgi/rgd id -> hgnc id -#' 2) uniprot symbol -> mgi/rgd id -> hgnc id -#' 3) uniprot symbol -> hgnc official symbol -> hgnc id -#' 4) uniprot symbol -> hgnc non-ambiguous synonyms -> hgnc id -#' -#' @examples \dontrun{ -#' # prior to this, import a dataset/samples/fasta & setup contrasts & apply filter_datasets() -#' # alternatively, use analysis_quickstart() to do everything -#' dataset = dea(dataset, dea_algorithm = "deqms") -#' dataset = differential_detect(dataset, min_peptides_observed = 2, min_samples_observed = 3, -#' min_fraction_observed = 0.5, return_wide_format = FALSE) -#' print_dataset_summary(dataset) -#' -#' x = summarise_stats(dataset, return_dea = TRUE, return_diffdetect = TRUE, -#' diffdetect_zscore_threshold = 5, remove_ambiguous_proteingroups = TRUE) -#' hgnc = hgnc_lookuptable(f = "C:/DATA/hgnc/hgnc__non_alt_loci_set___2023-03-21.tsv") -#' mgi = mgi_lookuptable(f = "C:/DATA/mgi/2023-07-15/MRK_SwissProt_TrEMBL.rpt") -#' y = protein2gene_orthologs(x, hgnc, mgi) -#' } -#' @param x a data.table with columns protein_id and symbol (e.g. the "leading symbol" extracted from the uniprot gene symbols @ dataset$proteins$gene_symbols_or_id) +#' @description typically called from `export_stats_genesummary` +#' @param proteins_long long-format data.frame with columns protein_id and symbol pairs (i.e. no semicolon-delimted values) #' @param hgnc HGNC lookup table from `hgnc_lookuptable()` -#' @param idmap MGI lookup table from `mgi_lookuptable()` or a RGD lookup table from `rgd_lookuptable()` -#' @param remove_nohgnc set to TRUE to remove all rows that could not be matched to a HGNC ID. default; FALSE -#' @export -protein2gene_orthologs = function(x, hgnc, idmap, remove_nohgnc = FALSE) { - if(!is.data.frame(x) || nrow(x) == 0 || !all(c("protein_id", "symbol") %in% colnames(x)) ) { - append_log("x must be a data.frame with columns 'protein_id' and 'symbol'", type = "error") - } - if(anyNA(x$protein_id) || !is.character(x$protein_id) || any(x$protein_id == "")) { - append_log("column 'protein_id' in x must be of character type and not contain any NA values or empty strings", type = "error") +#' @param use_synonyms boolean; use synonyms? if FALSE, only uses hgnc_symbol +idmap_symbols = function(proteins_long, hgnc, use_synonyms = TRUE) { + if(!is.data.frame(proteins_long) || nrow(proteins_long) == 0 || !"symbol" %in% colnames(proteins_long) ) { + append_log("proteins_long parameter must be a data.frame with column 'symbol'", type = "error") } if(!is.data.frame(hgnc) || nrow(hgnc) == 0 || !all(c("hgnc_id", "hgnc_symbol", "type", "value") %in% colnames(hgnc)) ) { - append_log("hgnc must be a data.frame with columns 'hgnc_id', 'hgnc_symbol', 'type', 'value' as typically prepared using the hgnc_lookuptable() function", type = "error") + append_log("hgnc parameter must be a data.frame with columns 'hgnc_id', 'hgnc_symbol', 'type', 'value' as typically prepared using the hgnc_lookuptable() function", type = "error") } - if(!is.data.frame(idmap) || nrow(idmap) == 0 || !all(c("symbol", "uniprot_id") %in% colnames(idmap)) || !colnames(idmap)[1] %in% hgnc$type) { - append_log("idmap must be a data.frame with columns 'symbol', 'uniprot_id' and the first column should contain identifiers that are also in the hgnc table (e.g. first column name should be 'mgi_id' or 'rgd_id'). e.g. as obtained from the mgi_lookuptable() function", type = "error") + if(length(use_synonyms) != 1 || !all(use_synonyms %in% c(TRUE, FALSE))) { + append_log("use_synonyms parameter must be TRUE or FALSE", type = "error") } - # remove preexisting columns, if any. e.g. suppose that `x` contains entrez_id column but here we don't have entrez_id - # in the hgnc table; this table never gets overwritten - x$hgnc_id = x$hgnc_symbol = x$ensembl_id = x$entrez_id = NULL - - # replace empty symbols, if any - x$symbol_input = x$symbol - rows_symbol_fail = is.na(x$symbol) | nchar(x$symbol) < 2 - x$symbol[rows_symbol_fail] = x$protein_id[rows_symbol_fail] - x$symbol = toupper(x$symbol) # enforce uppercase - - proteins_long = x %>% - select(protein_id, symbol) %>% - distinct(protein_id, .keep_all = TRUE) %>% - mutate(uniprot_id = strsplit(protein_id, split = " *; *")) %>% - tidyr::unchop(uniprot_id, keep_empty = TRUE) %>% - filter(nchar(uniprot_id) >= 3) %>% # shouldn't be needed but enforce check for empty IDs anyway - mutate( - uniprot_id = gsub("-.*", "", uniprot_id) #, - # by_symbol = TRUE # default = leftover matching by symbol - ) + # initialize result column, if missing + if(!"hgnc_id" %in% colnames(proteins_long)) { + proteins_long$hgnc_id = NA_character_ + } - ## map from uniprot ID to MGI/RGD - i = match(proteins_long$uniprot_id, idmap$uniprot_id) # match by uniprot id - i[is.na(i)] = match(proteins_long$symbol[is.na(i)], toupper(idmap$symbol)) # fallback matching by symbol (should align well between MGI and uniprot mouse proteome) - proteins_long$xref_id = unlist(idmap[i,1], recursive = F, use.names = F) # first column in the data.frame / tibble - - ## map to HGNC - # hgnc subset of relevant crossreference identifiers - hgnc_xref = hgnc %>% filter(type == colnames(idmap)[1]) - proteins_long$hgnc_id = hgnc_xref$hgnc_id[match(proteins_long$xref_id, hgnc_xref$value)] - # flag all protein_id that we directly matched as 'not matched by symbols but rather through identifiers' - # proteins_long$by_symbol[!is.na(proteins_long$hgnc_id)] = FALSE - - # fallback matching by official gene symbol - # note that we shouldn't use subsets of the hgnc table, e.g. for crossreference or synonyms, as not all HGNC entries might have crossreferences/synonyms - proteins_long$hgnc_id[is.na(proteins_long$hgnc_id)] = hgnc$hgnc_id[match(proteins_long$symbol[is.na(proteins_long$hgnc_id)], hgnc$hgnc_symbol)] - - # fallback matching by synonym - rows = is.na(proteins_long$hgnc_id) - if(any(rows)) { - hgnc_synonym = hgnc %>% filter(type == "synonym") - if(nrow(hgnc_synonym) > 0) { - proteins_long$hgnc_id[rows] = hgnc_synonym$hgnc_id[match(proteins_long$symbol[rows], hgnc_synonym$value)] + # the hgnc mapping table contains official gene symbols in a separate column (i.e. not available as a "type") + hgnc_subset_symbol = hgnc %>% distinct(hgnc_symbol, .keep_all = TRUE) # subset for efficiency + + # rows that we need to match by official gene symbol + rows = is.na(proteins_long$hgnc_id) & + # double-check that we don't even attempt to match missing gene symbols + !is.na(proteins_long$symbol) & nchar(proteins_long$symbol) >= 2 + proteins_long$hgnc_id[rows] = hgnc_subset_symbol$hgnc_id[match(toupper(proteins_long$symbol[rows]), toupper(hgnc_subset_symbol$hgnc_symbol))] + + # fallback matching by synonym, using data in the "type" column + if(use_synonyms) { + rows = is.na(proteins_long$hgnc_id) & # analogous to above, but here update the selection of rows + !is.na(proteins_long$symbol) & nchar(proteins_long$symbol) >= 2 + if(any(rows)) { + hgnc_subset_synonym = hgnc %>% filter(type == "synonym") # subset only synonyms + if(nrow(hgnc_subset_synonym) > 0) { + proteins_long$hgnc_id[rows] = hgnc_subset_synonym$hgnc_id[match(toupper(proteins_long$symbol[rows]), toupper(hgnc_subset_synonym$value))] + } } } - - ### mapping table from proteingroups (protein_id) to HGNC - # importantly, we don't deal with ambiguous mappings atm - # e.g. if a proteingroup contains multiple gene symbols, we just return the first human gene ID we found - protein_to_gene = proteins_long %>% - filter(!is.na(hgnc_id)) %>% - # retain just 1 entry per proteingroup -->> any 1:n mapping is lost - distinct(protein_id, .keep_all = TRUE) - # direct 1:1 matching - i = match(x$protein_id, protein_to_gene$protein_id) - x$hgnc_id = protein_to_gene$hgnc_id[i] - - ### add hgnc_symbol and entrez/ensembl IDs if present in hgnc table - x = hgnc_add_xrefs(x, hgnc) - - ### restore input symbols - x$symbol = x$symbol_input - x$symbol_input = NULL - - # log status to console. use distinct(protein_id) because some input tables might contain multiple entries for - # some protein_id, e.g. dataset$de_proteins is provided as-is when it contains 2 contrasts - tmp = x %>% distinct(protein_id, .keep_all = TRUE) - n_input = nrow(tmp) - n_fail = sum(is.na(tmp$hgnc_id)) - cat(sprintf("%d / %d (%.1f%%) unique protein_id could not be mapped to a HGNC human gene ID\n", - n_fail, n_input, n_fail / n_input * 100 )) - - if(remove_nohgnc) { - x = x %>% filter(!is.na(hgnc_id)) - } - return(x) + return(proteins_long) } -#' Map the the symbol column in a table to HGNC human gene IDs by matching official gene symbols and synonyms +#' helper function to match a protein table to HGNC via an intermediate mapping table (e.g. MGI/RGD) #' -#' @param x a data.table with a column symbol +#' @description typically called from `export_stats_genesummary` +#' @param proteins_long long-format data.frame with columns protein_id and symbol pairs (i.e. no semicolon-delimted values) #' @param hgnc HGNC lookup table from `hgnc_lookuptable()` -#' @export -protein2gene_by_symbol = function(x, hgnc) { - if(!is.data.frame(x) || nrow(x) == 0 || !all("symbol" %in% colnames(x) || !is.character(x$symbol)) ) { - append_log("x must be a data.frame with column 'symbol' (character type)", type = "error") +#' @param idmap MGI or RGD lookup table from `mgi_lookuptable()` or `rgd_lookuptable()` +#' @param use_symbols boolean; use symbols to map from proteins_long to the idmap table? if FALSE, only matches by uniprot identifier +idmap_uniprotid = function(proteins_long, hgnc, idmap, use_symbols = TRUE) { + if(!is.data.frame(proteins_long) || nrow(proteins_long) == 0 || !all(c("uniprot_id", "symbol") %in% colnames(proteins_long)) ) { + append_log("proteins_long parameter must be a data.frame with columns 'protein_id' and 'symbol'", type = "error") } if(!is.data.frame(hgnc) || nrow(hgnc) == 0 || !all(c("hgnc_id", "hgnc_symbol", "type", "value") %in% colnames(hgnc)) ) { - append_log("hgnc must be a data.frame with columns 'hgnc_id', 'hgnc_symbol', 'type', 'value' as typically prepared using the hgnc_lookuptable() function", type = "error") + append_log("hgnc parameter must be a data.frame with columns 'hgnc_id', 'hgnc_symbol', 'type', 'value' as typically prepared using the hgnc_lookuptable() function", type = "error") + } + if(!is.data.frame(idmap) || nrow(idmap) == 0 || ncol(idmap) < 3 || !all(c("symbol", "uniprot_id") %in% colnames(idmap)[-1]) || !colnames(idmap)[1] %in% hgnc$type) { + append_log("idmap parameter must be a data.frame with columns 'symbol', 'uniprot_id' and the first column should contain identifiers that are also in the hgnc table (e.g. first column name should be 'mgi_id' or 'rgd_id'). e.g. as obtained from the mgi_lookuptable() function", type = "error") + } + if(length(use_symbols) != 1 || !all(use_symbols %in% c(TRUE, FALSE))) { + append_log("use_symbols parameter must be TRUE or FALSE", type = "error") } - x$symbol_input = x$symbol - rows_symbol_fail = is.na(x$symbol) | nchar(x$symbol) < 2 - x$symbol = toupper(x$symbol) - - # match directly by official gene symbol - i = match(x$symbol, hgnc$hgnc_symbol) - x$hgnc_id = hgnc$hgnc_id[i] - - # for rows that failed to match, try matching against the HGNC synonym table - rows = is.na(x$hgnc_id) - if(any(rows)) { - hgnc_synonym = hgnc %>% filter(type == "synonym") - if(nrow(hgnc_synonym) > 0) { - i = match(x$symbol[rows], hgnc_synonym$value) - x$hgnc_id[rows] = hgnc_synonym$hgnc_id[i] - } + # initialize result column, if missing + if(!"hgnc_id" %in% colnames(proteins_long)) { + proteins_long$hgnc_id = NA_character_ + } + # overwrite xref column if exists + proteins_long$xref_id = NA_character_ + + # XREF ID @ first column of idmap + XREF_ID_COLNAME = colnames(idmap)[1] + idmap$XREF_ID_VALUES__TEMPCOL = unlist(idmap[,1], recursive = F, use.names = F) # ensure we're a vector and not a tibble of 1 column + hgnc_subset_xref = hgnc %>% filter(type == XREF_ID_COLNAME) + if(nrow(hgnc_subset_xref) == 0) { + append_log(sprintf("assumed database type (from first idmap column): '%s'. However, values that match this crossreference database are not found as a 'type' in the provided hgnc table", colnames(idmap)[1]), type = "error") } - # erase whatever happened to invalid input - x$hgnc_id[rows_symbol_fail] = NA - # add hgnc_symbol and entrez/ensembl IDs if present in hgnc table - x = hgnc_add_xrefs(x, hgnc) + ### first map from input table to idmap; prio matching by uniprot identifier + rows = is.na(proteins_long$hgnc_id) & !is.na(proteins_long$uniprot_id) + proteins_long$xref_id[rows] = idmap$XREF_ID_VALUES__TEMPCOL[match(proteins_long$uniprot_id[rows], idmap$uniprot_id)] - # restore input symbols - x$symbol = x$symbol_input - x$symbol_input = NULL + # fallback to symbol matching (no `xref_id` yet) + rows = is.na(proteins_long$hgnc_id) & is.na(proteins_long$xref_id) & # + # double-check that we don't even attempt to match missing gene symbols + !is.na(proteins_long$symbol) & nchar(proteins_long$symbol) >= 2 + proteins_long$xref_id[rows] = idmap$XREF_ID_VALUES__TEMPCOL[match(toupper(proteins_long$symbol[rows]), toupper(idmap$symbol))] - # log status to console. use distinct(protein_id) because some input tables might contain multiple entries for - # some protein_id, e.g. dataset$de_proteins is provided as-is when it contains 2 contrasts - tmp = x %>% filter(rows_symbol_fail == FALSE) %>% distinct(symbol, .keep_all = TRUE) - n_input = nrow(tmp) - n_fail = sum(is.na(tmp$hgnc_id)) - cat(sprintf("%d / %d (%.1f%%) unique symbols could not be mapped to a HGNC human gene ID\n", - n_fail, n_input, n_fail / n_input * 100 )) - return(x) + ### map to HGNC using crossreference identifiers + rows = is.na(proteins_long$hgnc_id) & !is.na(proteins_long$xref_id) # can be all FALSE, no problem + proteins_long$hgnc_id[rows] = hgnc_subset_xref$hgnc_id[match(proteins_long$xref_id[rows], hgnc_subset_xref$value)] + + idmap$XREF_ID_VALUES__TEMPCOL = NULL + return(proteins_long) } @@ -415,24 +348,24 @@ protein2gene_by_symbol = function(x, hgnc) { hgnc_add_xrefs = function(x, hgnc) { # symbol i = match(x$hgnc_id, hgnc$hgnc_id) - x$hgnc_symbol = hgnc$hgnc_symbol[i] + x$symbol = hgnc$hgnc_symbol[i] # remove optional return values if present - x$entrez_id = NULL x$ensembl_id = NULL + x$entrez_id = NULL # subset HGNC lookup table by respective crossref IDs hgnc_entrez = hgnc %>% filter(type == "entrez_id") hgnc_ensembl = hgnc %>% filter(type == "ensembl_id") - if(nrow(hgnc_entrez) > 0) { - i = match(x$hgnc_id, hgnc_entrez$hgnc_id) - x$entrez_id = hgnc_entrez$value[i] - } if(nrow(hgnc_ensembl) > 0) { i = match(x$hgnc_id, hgnc_ensembl$hgnc_id) x$ensembl_id = hgnc_ensembl$value[i] } + if(nrow(hgnc_entrez) > 0) { + i = match(x$hgnc_id, hgnc_entrez$hgnc_id) + x$entrez_id = hgnc_entrez$value[i] + } return(x) } diff --git a/R/plot_differential_detect.R b/R/plot_differential_detect.R index 3338ec7..b276558 100644 --- a/R/plot_differential_detect.R +++ b/R/plot_differential_detect.R @@ -5,7 +5,7 @@ #' @param zscore_threshold cutoff used in the plot (for absolute values) #' @returns list of ggplot objects with 1 plot per contrast. If no differential detection data is available, returns an empty list #' @export -plot_differential_detect = function(dataset, zscore_threshold = 4) { +plot_differential_detect = function(dataset, zscore_threshold = 6) { result = list() if(!"dd_proteins" %in% names(dataset) || nrow(dataset$dd_proteins) == 0) { return(result) diff --git a/R/plot_peptide_data.R b/R/plot_peptide_data.R index ce6e633..f6e7408 100644 --- a/R/plot_peptide_data.R +++ b/R/plot_peptide_data.R @@ -32,7 +32,7 @@ #' @param dataset your dataset #' @param select_all_proteins plot all proteins. This will take a long time. Default: FALSE #' @param select_diffdetect_candidates if "differential detection" was enabled (it is by default in `analysis_quickstart`), plot candidate proteins based on their zscores. -#' Valid parameters are positive numeric values that are to be used as absolute z-score cutoffs (4 or 5 are typically reasonable; use `plot_differential_detect` to visualize the score distributions). +#' Valid parameters are positive numeric values that are to be used as absolute z-score cutoffs (5 or higher is typically reasonable; use `plot_differential_detect` to visualize the score distributions). #' This selects all proteins that meet this criterium in any contrast, and plots these proteins in every contrast (include those where it is has a z-score below given threshold). #' Alternative, set to TRUE to use an absolute z-score cutoff value of 4. Default: NA (disable) #' @param select_dea_signif plot all significant proteins from DEA? diff --git a/R/process_peptide_data.R b/R/process_peptide_data.R index 4e6c716..c13c2eb 100644 --- a/R/process_peptide_data.R +++ b/R/process_peptide_data.R @@ -172,7 +172,10 @@ import_protein_metadata_from_fasta = function(protein_id, fasta_files, fasta_id_ summarise( accessions = paste(acc, collapse = ";"), fasta_headers = paste(header, collapse = ";"), - # for genes; take unique values, remove NA, remove strings of less than 2 characters + # full list of gene symbols, NA where missing + gene_symbols = paste(ifelse(is.na(gene) | nchar(gene) < 2, "-", gene), collapse = ";"), # for consistency, collapse + # gene_symbols = list(ifelse(is.na(gene) | nchar(gene) < 2, NA_character_, gene)), # alternatively, store as list + # gene_symbols_or_id; take unique values, remove NA, remove strings of less than 2 characters gene_symbols_or_id = paste(remove_by_charlength(unique(gene_symbols_or_id), minchar = 2), collapse = ";") ) diff --git a/R/report_as_rmarkdown.R b/R/report_as_rmarkdown.R index faff627..dcd4506 100644 --- a/R/report_as_rmarkdown.R +++ b/R/report_as_rmarkdown.R @@ -120,7 +120,7 @@ generate_pdf_report = function(dataset, output_dir, norm_algorithm = "vwmb", rol ### differential detect if("dd_proteins" %in% names(dataset) && is.data.frame(dataset$dd_proteins) && nrow(dataset$dd_proteins) > 0) { - dd_plots = plot_differential_detect(dataset) + dd_plots = plot_differential_detect(dataset, zscore_threshold = 6) } diff --git a/R/stats_summary.R b/R/stats_summary.R index 2dd3143..ac91d76 100644 --- a/R/stats_summary.R +++ b/R/stats_summary.R @@ -2,52 +2,24 @@ #' Summarize DEA and/or differential detection results in a dataset into a table with a single statistic per gene #' #' @description -#' For differential detection, the z-scores are converted to p-values by `pnorm(abs(zscore), lower.tail = F)` , -#' assuming these are normally distributed, and multiple testing correction by FDR is applied. -#' However, this is just an approximation; the differential detection scores are inferior to DEA analyses and should be treated with care. -#' Stringent cutoffs (diffdetect_zscore_threshold of at least 4 ~ 6) are encouraged when using differential detection results. #' -#' For DEA, the criteria for significance that were previously configured when performing DEA using `dea()` or -#' `analysis_quickstart()` functions are reused here (i.e. cutoffs for adjusted p-value and log2 foldchange). +#' In most cases, you probably want to use the `export_stats_genesummary()` function instead. +#' That is a wrapper function that uses this function but also adds additional functionality. +#' For documentation on the output table, also refer to that function. #' -#' If `return_dea` and `return_diffdetect` are both set to TRUE, the effectsizes from each statistic/result are standardized. -#' This makes both statistics more comparable but the downside is that the effectsizes are no longer the exact effectsizes returned by the DEA model. -#' -#' Note that if your dataset contains results for multiple contrasts and multiple DEA algorithms, -#' these will all be appended into the result table so make sure to filter the results by 'contrast' and -#' 'dea_algorithm' columns where appropriate. -#' -#' @examples \dontrun{ -#' # summarise the DEA results (and ignore the more speculative differential detect data) -#' x = summarise_stats(dataset, return_dea = TRUE, return_diffdetect = FALSE, -#' remove_ambiguous_proteingroups = FALSE) -#' -#' # first plot all differential detect results as histograms to verify the zscore cutoff -#' tmp = lapply(plot_differential_detect(dataset), print) -#' # merge both DEA and differential-detect 'extremes' into a gene-level stats summary -#' x = summarise_stats(dataset, return_dea = TRUE, return_diffdetect = TRUE, -#' diffdetect_zscore_threshold = 5, remove_ambiguous_proteingroups = FALSE) -#' } #' @param dataset dataset where dea() and/or differential_detect() has been applied #' @param return_dea boolean, set to TRUE to include DEA results in the stats table that returns 1 value per gene (setting TRUE for both DEA and DD will merge results) #' @param return_diffdetect analogous to `return_dea`, but setting this to TRUE includes differential detection results #' @param dea_logfc_as_effectsize optionally, the resulting effectsize column can be populated with standardized foldchange values (effectsize = log2fc / sd(log2fc)). #' When including differential detection results this'll be a convenient approach to getting 1 standardized distribution of DEA+DD effectsizes that can be used in e.g. GO analyses. #' While this is unusual, one could e.g. use this for DEA algorithms that apply shrinkage to estimated foldchanges such as MSqRob -#' @param diffdetect_zscore_threshold differential detect z-score cutoff. A typical value would be `diffdetect_zscore_threshold=4` , or 5~6 for stringent filtering. +#' @param diffdetect_zscore_threshold differential detect z-score cutoff. A typical value would be 5 or 6 (default) #' To plot histograms of the respective z-score distributions and inspect potential cutoff values for this relatively arbitrary metric, see below example code -#' @param remove_ambiguous_proteingroups boolean parameter indicating whether proteingroups that contain shared gene symbols (e.g. GRIA1;GRIA2) should be removed -#' @returns \itemize{ -#' \item The effectsize column data depends on the user parameters/configuration; -#' When only differential detect is used, this contains the z-scores. When only DEA is used, this contains effectsizes from DEA as-is. -#' However, when DEA and differential detect are combined, the effectsizes from DEA are standardized (divided by std) such that effectsizes from -#' both statistics are integrated into 1 distribution (and thus these can be compared/ranked in downstream analyses). -#' \item 'gene_symbols_or_id' contains the uniprot gene symbols for respective accessions in the proteingroup (protein_id column) as per usual in MS-DAP. -#' \item The 'symbol' column contains the first gene symbol thereof (e.g. in ambiguous groups where gene_symbols_or_id='GRIA1;GRIA2', this'll yield 'GRIA1'). -#' \item the 'dea_algorithm' column shows the DEA algorithm that was used. Importantly, even differential detect results that were merged into results for this DEA algorithm have the same dea_algorithm value so you can group/filter by the entire analysis downstream. -#' } -#' @export -summarise_stats = function(dataset, return_dea = TRUE, return_diffdetect = FALSE, dea_logfc_as_effectsize = FALSE, diffdetect_zscore_threshold = 4, remove_ambiguous_proteingroups = FALSE) { +#' @param diffdetect_type type of differential detect scores. options: +#' 'auto' = set to 'detect' if this score is available, 'quant' otherwise +#' 'detect' = differential detection z-scores computed from only "detected" peptides (no MBR) +#' 'quant' = differential detection z-scores computed from all quantified peptides (uses MBR) +summarise_stats = function(dataset, return_dea = TRUE, return_diffdetect = FALSE, dea_logfc_as_effectsize = FALSE, diffdetect_zscore_threshold = 6, diffdetect_type = "auto") { if(length(return_dea) != 1 || ! return_dea %in% c(TRUE, FALSE)) { append_log("return_dea must be single boolean", type = "error") } @@ -88,8 +60,9 @@ summarise_stats = function(dataset, return_dea = TRUE, return_diffdetect = FALSE dea_algorithm = iter_algo_de, dea_logfc_as_effectsize = dea_logfc_as_effectsize, diffdetect_zscore_threshold = diffdetect_zscore_threshold, - remove_ambiguous_proteingroups = remove_ambiguous_proteingroups + diffdetect_type = diffdetect_type ) + if(is.null(tmp)) next if(!is.na(iter_algo_de)) { tmp = tmp %>% add_column(dea_algorithm = iter_algo_de, .after = 2) } @@ -111,8 +84,8 @@ summarise_stats = function(dataset, return_dea = TRUE, return_diffdetect = FALSE #' @param dea_algorithm DEA algorithm as used in `dea()` upstream #' @param dea_logfc_as_effectsize see `summarise_stats()` #' @param diffdetect_zscore_threshold see `summarise_stats()` -#' @param remove_ambiguous_proteingroups see `summarise_stats()` -summarise_stats__for_contrast = function(dataset, return_dea, return_diffdetect, contr, dea_algorithm, dea_logfc_as_effectsize, diffdetect_zscore_threshold, remove_ambiguous_proteingroups) { +#' @param diffdetect_type see `summarise_stats()` +summarise_stats__for_contrast = function(dataset, return_dea, return_diffdetect, contr, dea_algorithm, dea_logfc_as_effectsize, diffdetect_zscore_threshold, diffdetect_type) { if(!is.list(dataset)) { append_log("invalid dataset", type = "error") } @@ -140,10 +113,11 @@ summarise_stats__for_contrast = function(dataset, return_dea, return_diffdetect, if(length(diffdetect_zscore_threshold) != 1 || !is.numeric(diffdetect_zscore_threshold) || !is.finite(diffdetect_zscore_threshold) || diffdetect_zscore_threshold < 0) { append_log("diffdetect_zscore_threshold must be a single positive numeric value", type = "error") } - if(length(remove_ambiguous_proteingroups) != 1 || ! remove_ambiguous_proteingroups %in% c(TRUE, FALSE)) { - append_log("remove_ambiguous_proteingroups must be single boolean", type = "error") + if(length(diffdetect_type) != 1 || ! diffdetect_type %in% c("auto", "detect", "quant")) { + append_log("diffdetect_type must be aany of; auto, detect, quant", type = "error") } + param_dea_algorithm = dea_algorithm log = dd_type = "" @@ -168,14 +142,29 @@ summarise_stats__for_contrast = function(dataset, return_dea, return_diffdetect, dd = dataset$dd_proteins %>% filter(contrast == contr & is.finite(zscore)) if(nrow(dd) > 0) { - # results might contain both detect and quant, or only one of the other - dd_type = unique(dd$type) - if("detect" %in% dd_type) { - dd_type = "detect" + # results might contain both detect and quant, or only one of these + dd_type_available = unique(dd$type) + dd_type = '' + # auto = prefer detect + if(diffdetect_type == "auto") { + if("detect" %in% dd_type_available) { + dd_type = "detect" + } else { + dd_type = "quant" + } } else { - dd_type = dd_type[1] + # catch mismatch between parameter and available data + if(diffdetect_type == "detect" && ! "detect" %in% dd_type_available) { + append_log("diffdetect_type parameter is set to 'detect', but there are no differential detection results of this type (consider setting this parameter to 'auto' instead)", type = "error") + } + if(diffdetect_type == "quant" && ! "quant" %in% dd_type_available) { + append_log("diffdetect_type parameter is set to 'quant', but there are no differential detection results of this type (consider setting this parameter to 'auto' instead)", type = "error") + } + # finally, set dd_type to user parameter (which is not 'auto' here) + dd_type = diffdetect_type } + # subset the DD results for the selected contrast @ type and z-score cutoff dd = dd %>% filter(type == dd_type) %>% @@ -201,7 +190,8 @@ summarise_stats__for_contrast = function(dataset, return_dea, return_diffdetect, append_log("DEA results are requested but the provided MS-DAP dataset has none. Did you forget to run the analysis_quickstart() or dea() function ?", type = "error") } if( ! contr %in% dataset$de_proteins$contrast) { - append_log(sprintf("selected contrast '%s' is not available in the DEA results; please re-run dea() to ensure data is available for all contrasts specified for this dataset", contr), type = "error") + append_log(sprintf("'%s' is not available in the DEA results; either dea/quickstart_analysis was not applied, or DEA failed for this contrast (please refer to the earlier MS-DAP log)", contr), type = "warning") + return(NULL) } dea = dataset$de_proteins %>% @@ -254,6 +244,9 @@ summarise_stats__for_contrast = function(dataset, return_dea, return_diffdetect, mutate(peptide_count = peptides_used_for_dea, log2fc = log2fc_dea, effectsize = effectsize_dea_std, pvalue = pvalue_dea, pvalue_adjust = pvalue_adjust_dea, score_type = ifelse(is.na(pvalue_dea), NA, "dea")) + rows_low_dea_notsignif = is.finite(result$pvalue_dea) & is.finite(result$pvalue_dd) & !result$signif_dea %in% TRUE + rows_low_dea_signif = is.finite(result$pvalue_dea) & is.finite(result$pvalue_dd) & result$signif_dea %in% TRUE + # overwrite DEA statistics where Differential Detect (DD) has stronger result # (but note that we only consider the set of 'strong' DD scores, e.g. we don't overwrite poor DEA p-values with slightly better DD results) rows_na = !is.finite(result$pvalue_dea) & is.finite(result$pvalue_dd) @@ -272,8 +265,8 @@ summarise_stats__for_contrast = function(dataset, return_dea, return_diffdetect, filter(is.finite(pvalue)) %>% select(protein_id, score_type, peptide_count, log2fc, effectsize_dea, effectsize, pvalue, pvalue_adjust, signif) - log = sprintf("%smerging DEA results from %s and differential detect (score type '%s'), the latter contributed to %d / %d proteingroups (%d without DEA, %d stronger than DEA).", - log, param_dea_algorithm, dd_type, sum(rows_na | rows_stronger), length(rows_na), sum(rows_na), sum(rows_stronger)) + log = sprintf("%smerging DD (type '%s', threshold %s) with %s (DEA) results contributed to %d / %d proteingroups (%d not tested in DEA, %d not DEA 'signif', %d DEA 'signif').", + log, dd_type, diffdetect_zscore_threshold, param_dea_algorithm, sum(rows_na | rows_stronger), length(rows_na), sum(rows_na), sum(rows_low_dea_notsignif), sum(rows_low_dea_signif)) } @@ -284,28 +277,13 @@ summarise_stats__for_contrast = function(dataset, return_dea, return_diffdetect, result = result %>% # add (leading) gene symbols - left_join(dataset$proteins %>% select(protein_id, gene_symbols_or_id), by = "protein_id") %>% + left_join(dataset$proteins %>% select(protein_id, gene_symbols, gene_symbols_or_id), by = "protein_id") %>% mutate(symbol = gsub(";.*", "", gene_symbols_or_id)) %>% # add contrast add_column(contrast = contr, .after = 1) %>% arrange(desc(signif), pvalue) - # finally, retain 1 row per gene - # this crucially depends on upstream sorting of the table - if(remove_ambiguous_proteingroups) { - # remove all proteingroups with multiple genes, then retain unique genes - result = result %>% filter(!grepl(";", gene_symbols_or_id, fixed = TRUE)) %>% distinct(gene_symbols_or_id, .keep_all = T) - } else { - # retain all unique shared gene symbols - result = result %>% distinct(gene_symbols_or_id, .keep_all = T) - } - - append_log(sprintf( - "%s; %s %d/%d(%.1f%%) gene symbols in the final result table are foreground.", - contr, log, - sum(result$signif), nrow(result), sum(result$signif)/nrow(result) * 100 - ), type = "info") - + append_log(sprintf("%s; %s", contr, log), type = "info") return(result) } diff --git a/R/util_files.R b/R/util_files.R index 1344d39..53f82d7 100644 --- a/R/util_files.R +++ b/R/util_files.R @@ -234,13 +234,14 @@ filename_strip_illegal_characters = function(f, strict = FALSE, replacement = "_ #' @param as_table boolean, TRUE = results from data.table::fread(), FALSE = results from readr::read_lines() (default) #' @param skip_empty_rows ignore empty rows (default: FALSE) #' @param nrow optionally, integer specifying to only read first N rows +#' @param datatable_skip hardcoded parameter for datatable_skip (set NULL to ignore; default) #' @param ... sent to data.table::fread() #' @return NULL if path doesn't exist or file could not be read / decompressed (warnings/errors are silent) #' @importFrom archive file_read archive_read #' @importFrom readr read_lines #' @importFrom data.table fread #' @export -read_textfile_compressed = function(file, as_table = FALSE, skip_empty_rows = FALSE, nrow = -1, ...) { +read_textfile_compressed = function(file, as_table = FALSE, skip_empty_rows = FALSE, nrow = -1, datatable_skip = NULL, ...) { # TODO: generate warning when as_table && skip_empty_rows -->> disabled if(length(file) != 1 || !is.character(file) || !file.exists(file)) { return(NULL) @@ -282,7 +283,15 @@ read_textfile_compressed = function(file, as_table = FALSE, skip_empty_rows = FA ### user requested a table if(as_table) { if(!ext_assume_compressed) { - x = data.table::fread(file = file, nrows = nrow, ...) #, blank.lines.skip = FALSE + if(nrow > 0) { + x = data.table::fread(file = file, nrows = nrow, ...) #, blank.lines.skip = FALSE + } else { + if(!is.null(datatable_skip)) { + x = data.table::fread(file = file, skip = datatable_skip, ...) #, blank.lines.skip = FALSE + } else { + x = data.table::fread(file = file, ...) #, blank.lines.skip = FALSE + } + } } if(ext_assume_compressed && length(x) > 0) { # bugfix; data.table::fread() doesn't accept input that is a single line without end-of-line character @@ -290,7 +299,11 @@ read_textfile_compressed = function(file, as_table = FALSE, skip_empty_rows = FA x = paste0(x, "\n") } # input file is compressed, assume we already uncompressed and read all lines - x = data.table::fread(text = x, ...) # don't need blank skip nor nrow argument, already done @ read_lines() + if(!is.null(datatable_skip)) { + x = data.table::fread(text = x, skip = datatable_skip, ...) # don't need blank skip nor nrow argument, already done @ read_lines() + } else { + x = data.table::fread(text = x, ...) # don't need blank skip nor nrow argument, already done @ read_lines() + } } } @@ -452,7 +465,7 @@ robust_header_matching = function(x, column_spec, columns_required) { # halt if required columns are missing if(length(col) == 0 && names(column_spec)[i] %in% columns_required) { append_log(sprintf("invalid data table; cannot find column '%s' while testing for column names %s", - names(column_spec)[1], paste(cols[[i]], collapse = ", ")), type = "error") + names(column_spec)[1], paste(column_spec[[i]], collapse = ", ")), type = "error") } if(length(col) > 0) { col = col[1] # in case of multiple matches, take the first diff --git a/README.Rmd b/README.Rmd index d8f155b..4b9b217 100644 --- a/README.Rmd +++ b/README.Rmd @@ -103,7 +103,7 @@ dataset = analysis_quickstart( filter_fraction_quant = 0.75, # analogous for quantitative values filter_min_peptide_per_prot = 1, # minimum number of peptides per protein (after applying above filters) required for DEA. Set this to 2 to increase robustness, but note that'll discard approximately 25% of proteins in typical datasets (i.e. that many proteins are only quantified by 1 peptide) filter_by_contrast = TRUE, # only relevant if dataset has 3+ groups. For DEA at each contrast, filters and normalization are applied on the subset of relevant samples within the contrast for efficiency, see further MS-DAP manuscript. Set to FALSE to disable and use traditional "global filtering" (filters are applied to all sample groups, same data table used in all statistics) - norm_algorithm = c("vsn", "modebetween_protein"), # normalization; first vsn, then modebetween on protein-level (applied sequentially so the MS-DAP modebetween algorithm corrects scaling/balance between-sample-groups) + norm_algorithm = c("vsn", "modebetween_protein"), # normalization; first vsn, then modebetween on protein-level (applied sequentially so the MS-DAP modebetween algorithm corrects scaling/balance between-sample-groups). "mwmb" is a good alternative for "vsn" dea_algorithm = c("deqms", "msempire", "msqrob"), # statistics; apply multiple methods in parallel/independently dea_qvalue_threshold = 0.01, # threshold for significance of adjusted p-values in figures and output tables dea_log2foldchange_threshold = NA, # threshold for significance of log2 foldchanges. 0 = disable, NA = automatically infer through bootstrapping @@ -123,6 +123,73 @@ print_dataset_summary(dataset) ``` +### Creating gene-level summaries of statistical analysis results + +_this requires MS-DAP version 1.10 or later_ + +To expedite downstream analysis, MS-DAP includes a function that maps the proteingroup results from your +differential expression analysis to Human gene identifiers and exports these as Excel tables that are ready for +use with GO tools. Optionally, one can also include results from differential detection analysis. +Multiple proteingroups that map to the same unique gene are merged and there are various options +for dealing with ambiguous proteingroups. + +The resulting output tables can be used as input with the [GOAT online geneset analysis tool](https://ftwkoopmans.github.io/goat/) + +Further details on available parameters and a description of all columns in the output tables are available in this function's documentation; issue the R command `?export_stats_genesummary` to open the respective help page in RStudio. + +**First, obtain the required database files** + +- HGNC human database + - direct download link, right-click and select "save as": + - alternatively, go to the HGNC website -> search for the text "Complete HGNC approved dataset" -> download the "TXT" table + - _the filename you need is "hgnc_complete_set.txt"_ + +- MGI mouse database; only needed for Mouse datasets + - direct download link, right-click and select "save as": + - _the filename you need is "MRK_SwissProt_TrEMBL.rpt"_ + +- RGD rat database; only needed for Rat datasets + - direct download link, right-click and select "save as": + - _the filename you need is "GENES_RAT.txt"_ + +pro tip: rename the files you downloaded to include the current date (e.g. "hgnc_complete_set__2024-01-01.txt"). This makes it easy to track versions of files/databases you use across analyses and reproduce your results later. + +**example code snippet** + +``` + +hgnc_table = hgnc_lookuptable("C:/DATA/hgnc_complete_set__2024-01-01.txt") # <> + +# MGI Mouse database, only needed for Mouse datasets +#mgi_table = mgi_lookuptable("C:/DATA/MRK_SwissProt_TrEMBL__2024-01-01.rpt") # <> +# RGD Rat database, only needed for Rat datasets +#rgd_table = rgd_lookuptable("C:/DATA/GENES_RAT__2024-01-01.rpt") # <> + + +# refer to this function's R documentation for additional details +export_stats_genesummary( + dataset, + # set NA to ignore differential detection (default), or define an absolute zscore threshold (e.g. 6) + # ! when using diffdetect, first review the zscore distributions using MS-DAP function: plot_differential_detect() + diffdetect_zscore_threshold = NA, + # options for dealing with proteingroups that map to multiple genes: + # "leading_gene" = use only the leading/first gene per proteingroup. e.g. "GRIA1;GRIA2" is mapped to "GRIA1" and will overwrite a specific "GRIA1" proteingroup if (and only if) the former has a lower p-value + # "prio_specific" = discard ambiguous proteingroups if their leading/first gene overlaps with a specific proteingroup. e.g. "GRIA1;GRIA2" is mapped to "GRIA1" only if there is no specific/unambiguous "GRIA1" proteingroup + # "only_specific" = all ambiguous proteingroups are discarded + gene_ambiguity = "prio_specific", # prio_specific and only_specific are recommended + # HGNC data table is always required + hgnc = hgnc_table, + # example: for Mouse datasets, provide the MGI database for more accurate ID mapping + # xref = mgi_table, + # example: for Rat datasets, provide the RGD database for more accurate ID mapping + # xref = rgd_table, + # write output files to the same directory as main MS-DAP results by using the `dataset$output_dir` variable + # alternatively, provide a valid path. e.g. output_dir="C:/temp" # use forward slashes + output_dir = dataset$output_dir +) + +``` + ## Docker @@ -148,7 +215,6 @@ Bioinformatic analyses beyond the typical MS-DAP workflow are described in the f - [introduction to MS-DAP](doc/intro.md) (start here) - [user guide: details the main R functions](doc/userguide.md) -- [summarizing results from statistics + gene ID mapping](doc/stats_summary.md) - [bioinformatics: differential expression analysis (DEA)](doc/differential_expression_analysis.md) - [bioinformatics: differential detection](doc/differential_detection.md) - [bioinformatics: plugin custom normalization or DEA](doc/custom_norm_dea.md) @@ -158,12 +224,7 @@ Bioinformatic analyses beyond the typical MS-DAP workflow are described in the f ## Roadmap -- currently working on; - - additional data visualization for the PDF report - - advanced experimental designs for eBayes/DEqMS (i.e. more flexibility in creating design matrices for limma) - - implement various imputation approaches, and apply extensive benchmarking evaluation -- planned - - plugins for more statistical methods & extensive benchmarking evaluation - - e.g. existing methods / new tools currently in development in other labs (write plugin to help beta-test and benchmark) / port methods currently not available in R - - approaches for combining test statistics when applying multiple independent DEA methods - +- advanced experimental designs for eBayes/DEqMS (i.e. more flexibility in creating design matrices for limma) +- implement various imputation approaches +- plugins for more statistical methods + - e.g. existing methods / new tools currently in development in other labs (write plugin to help beta-test and benchmark) diff --git a/README.md b/README.md index d366922..a9bd656 100644 --- a/README.md +++ b/README.md @@ -120,7 +120,7 @@ dataset = analysis_quickstart( filter_fraction_quant = 0.75, # analogous for quantitative values filter_min_peptide_per_prot = 1, # minimum number of peptides per protein (after applying above filters) required for DEA. Set this to 2 to increase robustness, but note that'll discard approximately 25% of proteins in typical datasets (i.e. that many proteins are only quantified by 1 peptide) filter_by_contrast = TRUE, # only relevant if dataset has 3+ groups. For DEA at each contrast, filters and normalization are applied on the subset of relevant samples within the contrast for efficiency, see further MS-DAP manuscript. Set to FALSE to disable and use traditional "global filtering" (filters are applied to all sample groups, same data table used in all statistics) - norm_algorithm = c("vsn", "modebetween_protein"), # normalization; first vsn, then modebetween on protein-level (applied sequentially so the MS-DAP modebetween algorithm corrects scaling/balance between-sample-groups) + norm_algorithm = c("vsn", "modebetween_protein"), # normalization; first vsn, then modebetween on protein-level (applied sequentially so the MS-DAP modebetween algorithm corrects scaling/balance between-sample-groups). "mwmb" is a good alternative for "vsn" dea_algorithm = c("deqms", "msempire", "msqrob"), # statistics; apply multiple methods in parallel/independently dea_qvalue_threshold = 0.01, # threshold for significance of adjusted p-values in figures and output tables dea_log2foldchange_threshold = NA, # threshold for significance of log2 foldchanges. 0 = disable, NA = automatically infer through bootstrapping @@ -139,6 +139,83 @@ print_dataset_summary(dataset) # 7) All done! Check out the generated files in the output directory, starting with report.pdf ``` +### Creating gene-level summaries of statistical analysis results + +*this requires MS-DAP version 1.10 or later* + +To expedite downstream analysis, MS-DAP includes a function that maps +the proteingroup results from your differential expression analysis to +Human gene identifiers and exports these as Excel tables that are ready +for use with GO tools. Optionally, one can also include results from +differential detection analysis. Multiple proteingroups that map to the +same unique gene are merged and there are various options for dealing +with ambiguous proteingroups. + +The resulting output tables can be used as input with the [GOAT online +geneset analysis tool](https://ftwkoopmans.github.io/goat/) + +Further details on available parameters and a description of all columns +in the output tables are available in this function’s documentation; +issue the R command `?export_stats_genesummary` to open the respective +help page in RStudio. + +**First, obtain the required database files** + +- HGNC human database + - direct download link, right-click and select “save as”: + + - alternatively, go to the HGNC website + -\> + search for the text “Complete HGNC approved dataset” -\> download + the “TXT” table + - *the filename you need is “hgnc_complete_set.txt”* +- MGI mouse database; only needed for Mouse datasets + - direct download link, right-click and select “save as”: + + - *the filename you need is “MRK_SwissProt_TrEMBL.rpt”* +- RGD rat database; only needed for Rat datasets + - direct download link, right-click and select “save as”: + + - *the filename you need is “GENES_RAT.txt”* + +pro tip: rename the files you downloaded to include the current date +(e.g. “hgnc_complete_set\_\_2024-01-01.txt”). This makes it easy to +track versions of files/databases you use across analyses and reproduce +your results later. + +**example code snippet** + + + hgnc_table = hgnc_lookuptable("C:/DATA/hgnc_complete_set__2024-01-01.txt") # <> + + # MGI Mouse database, only needed for Mouse datasets + #mgi_table = mgi_lookuptable("C:/DATA/MRK_SwissProt_TrEMBL__2024-01-01.rpt") # <> + # RGD Rat database, only needed for Rat datasets + #rgd_table = rgd_lookuptable("C:/DATA/GENES_RAT__2024-01-01.rpt") # <> + + + # refer to this function's R documentation for additional details + export_stats_genesummary( + dataset, + # set NA to ignore differential detection (default), or define an absolute zscore threshold (e.g. 6) + # ! when using diffdetect, first review the zscore distributions using MS-DAP function: plot_differential_detect() + diffdetect_zscore_threshold = NA, + # options for dealing with proteingroups that map to multiple genes: + # "leading_gene" = use only the leading/first gene per proteingroup. e.g. "GRIA1;GRIA2" is mapped to "GRIA1" and will overwrite a specific "GRIA1" proteingroup if (and only if) the former has a lower p-value + # "prio_specific" = discard ambiguous proteingroups if their leading/first gene overlaps with a specific proteingroup. e.g. "GRIA1;GRIA2" is mapped to "GRIA1" only if there is no specific/unambiguous "GRIA1" proteingroup + # "only_specific" = all ambiguous proteingroups are discarded + gene_ambiguity = "prio_specific", # prio_specific and only_specific are recommended + # HGNC data table is always required + hgnc = hgnc_table, + # example: for Mouse datasets, provide the MGI database for more accurate ID mapping + # xref = mgi_table, + # example: for Rat datasets, provide the RGD database for more accurate ID mapping + # xref = rgd_table, + # write output files to the same directory as main MS-DAP results by using the `dataset$output_dir` variable + # alternatively, provide a valid path. e.g. output_dir="C:/temp" # use forward slashes + output_dir = dataset$output_dir + ) + ## Docker MS-DAP is available as a [Docker container](doc/docker.md) that includes @@ -182,8 +259,6 @@ Differential Expression Analysis (DEA). - [introduction to MS-DAP](doc/intro.md) (start here) - [user guide: details the main R functions](doc/userguide.md) -- [summarizing results from statistics + gene ID - mapping](doc/stats_summary.md) - [bioinformatics: differential expression analysis (DEA)](doc/differential_expression_analysis.md) - [bioinformatics: differential @@ -193,17 +268,9 @@ Differential Expression Analysis (DEA). ## Roadmap -- currently working on; - - additional data visualization for the PDF report - - advanced experimental designs for eBayes/DEqMS (i.e. more - flexibility in creating design matrices for limma) - - implement various imputation approaches, and apply extensive - benchmarking evaluation -- planned - - plugins for more statistical methods & extensive benchmarking - evaluation - - e.g. existing methods / new tools currently in development in - other labs (write plugin to help beta-test and benchmark) / port - methods currently not available in R - - approaches for combining test statistics when applying multiple - independent DEA methods +- advanced experimental designs for eBayes/DEqMS (i.e. more flexibility + in creating design matrices for limma) +- implement various imputation approaches +- plugins for more statistical methods + - e.g. existing methods / new tools currently in development in other + labs (write plugin to help beta-test and benchmark) diff --git a/doc/stats_summary.Rmd b/doc/stats_summary.Rmd deleted file mode 100644 index d782ad3..0000000 --- a/doc/stats_summary.Rmd +++ /dev/null @@ -1,171 +0,0 @@ ---- -output: - rmarkdown::github_document: - html_preview: true - toc: false ---- - -This vignette shows how to summarize MS-DAP DEA and/or differential detection results in a dataset -into a table with a single statistic per gene and optionally map these to HGNC human gene identifiers. - - -## Summarizing statistical results from MS-DAP at gene level - -After performing DEA, the result table from statistical analysis might contain multiple proteingroups for a -gene (i.e. isoforms). In downstream analysis we often want to only condier only the most significant proteingroup -in such cases when performing GO analyses. Further, there might be proteingroups that are ambiguous, i.e. multiple -genes match the uniprot accessions in the proteingroup (e.g. the gene_symbols_or_id column shows "GRIA1;GRIA2"). - -The `summarise_stats()` function helps filter/subset the DEA results table to deal with this. Further, in case -[differential testing](differential_detection.md) has been applied to the dataset you can also use this function -to merge the respective top-hits into the result table such that the output table contains 1 pvalue and effectsize -for each unique gene in your statistical results. - -The `remove_ambiguous_proteingroups` parameter is critical, controlling whether proteingroups that map to multiple genes, e.g. "GRIA1;GRIA2", are removed from the result table. -This is an important decision; set to `TRUE` to remove ambiguous proteingroups and return only proteins groups with gene-unique evidence. - -pro: this is the conservative option with lower odds of false positives in downstream analyses, probably the better choice when using these results to inform on followup experiments. - -con: if there is no proteingroup with unique evidence for e.g. "GRIA1" nor for "GRIA2", removing the ambiguous "GRIA1;GRIA2" preludes downstream GO analyses from finding enrichment for this gene (even though for GO it wouldn't have mattered whether we searched for GRIA1 or GRIA2). - -Note that in any case one shouldn't search GO for _all_ ambiguous genes in a proteingroup (e.g. GRIA1 and GRIA2 in case of "GRIA1;GRIA2") without some form of correction because _either_ protein was found, not both. -The 'symbol' column returned by this function will contain only the 'leading' symbol; typically this column is used as input for GO (and not the 'gene_symbols_or_id' which contains all/ambiguous symbols in case `remove_ambiguous_proteingroups = FALSE`) -So for GO analyses one could retain ambiguous proteingroups and then follow the example below to map these to unique human gene identifiers and retain 1 value per unique gene - - -```{r eval=FALSE} -library(msdap) -# load your previously generated dataset from disk -load("C://dataset.RData") - -# Summarise the statistical data in your dataset, assuming DEA was applied. -# note that if you analyzed multiple contrasts, optionally with multiple DEA algorithms, all results are appended in this table. -x = summarise_stats( - dataset, - # include DEA result in the result table - return_dea = TRUE, - # disabled in this example, but if differential detection was performed set this to TRUE to integrate respective 'strong z-scores' - return_diffdetect = FALSE, - # replace effectsizes with log2fc in result table. Niche usecase, e.g. optional for DEA models that apply shrinkage to foldchanges - dea_logfc_as_effectsize = FALSE, - # differential detection absolute z-score cutoff (only relevant when return_diffdetect=TRUE) - diffdetect_zscore_threshold = 4, - # should proteingroups that map to multiple genes, e.g. "GRIA1;GRIA2", be removed from the result table? - remove_ambiguous_proteingroups = FALSE -) - - -# alternatively, when you've performed differential detection as well and want to merge the DEA results with the top-hits from differential detect -# first plot the differential detect histograms to double-check your desired z-score threshold -plot_differential_detect(dataset) -# compared to above example, we here set `return_diffdetect = TRUE` -x = summarise_stats( - dataset, - return_dea = TRUE, - return_diffdetect = TRUE, - dea_logfc_as_effectsize = FALSE, - diffdetect_zscore_threshold = 5, # critical parameter - remove_ambiguous_proteingroups = FALSE # critical parameter -) -``` - - -## Phylogenetic mapping from mouse proteins to human genes - -The MS-DAP function `protein2gene_orthologs()` adds human orthologs for mouse/rat proteins in a table that contains -uniprot accessions and their respective gene symbols. - -This function is currently limited such that the first matching human gene per proteingroup is returned, -so ambiguous proteingroups with multiple gene symbols (e.g. "GRIA1;GRIA2") might not always yield the 'leading' -gene and additional/ambiguous genes are disregarded). Columns describing the HGNC (genenames.org) ID, -NCBI Entez ID and Ensembl gene ID are appended to the provided table. - -Workflow for ID mapping, where subsequent steps are fallbacks; - -1) uniprot id -> mgi/rgd id -> hgnc id -2) uniprot symbol -> mgi/rgd id -> hgnc id -3) uniprot symbol -> hgnc official symbol -> hgnc id -4) uniprot symbol -> hgnc non-ambiguous synonyms -> hgnc id - -Before running below example code for phylogenetic mappings, download required lookup tables here; - -**HGNC** - -- download link: https://www.genenames.org/download/statistics-and-files/ -- table: "Complete dataset download links" -->> "Complete HGNC approved dataset text json" -->> download the "TXT" table -- filename is typically something like hgnc_complete_set.txt - -**MGI** - -- download link: https://www.informatics.jax.org/downloads/reports/index.html -- table: "MGI Marker associations to SWISS-PROT and TrEMBL protein IDs (tab-delimited)" -- filename is typically something like MRK_SwissProt_TrEMBL.rpt - -for rat proteins one can use information from RGD as a drop-in replacement for MGI in this example, see further `rgd_lookuptable()`. - -```{r eval=FALSE} -# load HGNC and MGI lookup tables from disk, assuming you previously downloaded these -hgnc = hgnc_lookuptable(f = "C://hgnc_complete_set.txt") -mgi = mgi_lookuptable(f = "C://MRK_SwissProt_TrEMBL.rpt") - -# map the mouse proteins to human gene identifiers -# note that the ID mapping success rate is printed to the console -x = protein2gene_orthologs(x, hgnc, mgi) - -# print first few rows to console to inspect results -print(x, n=5) -# write full results to disk as Excel table -openxlsx::write.xlsx(x, "C://stats_summary.xlsx") - -# If you retain ambiguous proteingroups in `summarise_stats()` and perform phylogenetic mapping, -# some genes might have duplicate entries in your result table (e.g. "GRIA1" and "GRIA1;GRIA2" proteingroups will yield the same human gene ID). -# Here we'll remove all proteingroups that failed to map to human genes AND those that yield the same human ortholog gene. -# (this works because sorting of table x is already by 'best on top' after summarise_stats()) -y = x %>% filter(!is.na(hgnc_id)) %>% distinct(hgnc_id, contrast, dea_algorithm, .keep_all = TRUE) -openxlsx::write.xlsx(y, "C://stats_summary_deduped.xlsx") -``` - - - -## Mapping from gene symbols directly to HGNC gene identifiers - -For human proteome datasets we can directly match from uniprot gene symbols to HGNC (genenames.org) identifiers -by directly cross-matching official symbols and synonyms. The function `protein2gene_by_symbol()` can directly work -with the 'symbol' column from the `summarise_stats()` output. Do note that if you did not remove ambiguous -proteingroups in the `summarise_stats()` step, only the first gene symbol is used for ID mapping here -(for the subset of ambiguous proteingroups). - - -Before running below example code for phylogenetic mappings, download the required lookup table here; - -**HGNC** - -- download link: https://www.genenames.org/download/statistics-and-files/ -- table: "Complete dataset download links" -->> "Complete HGNC approved dataset text json" -->> download the "TXT" table -- filename is typically something like hgnc_complete_set.txt - - -```{r eval=FALSE} -# load HGNC lookup tables from disk, assuming you previously downloaded these -hgnc = hgnc_lookuptable(f = "C://hgnc_complete_set.txt") - -# assuming x is the results you obtained from summarise_stats() -# this will add columns hgnc_id, hgnc_symbol, entrez_id and ensembl_id -x = protein2gene_by_symbol(x, hgnc) - -# print first few rows to console to inspect results -print(x, n=5) - -### below code is analogous to the mouse example above - -# write full results to disk as Excel table -openxlsx::write.xlsx(x, "C://stats_summary.xlsx") - -# If you retain ambiguous proteingroups in `summarise_stats()` and perform phylogenetic mapping, -# some genes might have duplicate entries in your result table (e.g. "GRIA1" and "GRIA1;GRIA2" proteingroups will yield the same human gene ID). -# Here we'll remove all proteingroups that failed to map to human genes AND those that yield the same human ortholog gene. -# (this works because sorting of table x is already by 'best on top' after summarise_stats()) -y = x %>% filter(!is.na(hgnc_id)) %>% distinct(hgnc_id, contrast, dea_algorithm, .keep_all = TRUE) -openxlsx::write.xlsx(y, "C://stats_summary_deduped.xlsx") -``` - diff --git a/doc/stats_summary.md b/doc/stats_summary.md deleted file mode 100644 index 27eecfe..0000000 --- a/doc/stats_summary.md +++ /dev/null @@ -1,193 +0,0 @@ - -This vignette shows how to summarize MS-DAP DEA and/or differential -detection results in a dataset into a table with a single statistic per -gene and optionally map these to HGNC human gene identifiers. - -## Summarizing statistical results from MS-DAP at gene level - -After performing DEA, the result table from statistical analysis might -contain multiple proteingroups for a gene (i.e. isoforms). In downstream -analysis we often want to only condier only the most significant -proteingroup in such cases when performing GO analyses. Further, there -might be proteingroups that are ambiguous, i.e. multiple genes match the -uniprot accessions in the proteingroup (e.g. the gene_symbols_or_id -column shows “GRIA1;GRIA2”). - -The `summarise_stats()` function helps filter/subset the DEA results -table to deal with this. Further, in case [differential -testing](differential_detection.md) has been applied to the dataset you -can also use this function to merge the respective top-hits into the -result table such that the output table contains 1 pvalue and effectsize -for each unique gene in your statistical results. - -The `remove_ambiguous_proteingroups` parameter is critical, controlling -whether proteingroups that map to multiple genes, e.g. “GRIA1;GRIA2”, -are removed from the result table. This is an important decision; set to -`TRUE` to remove ambiguous proteingroups and return only proteins groups -with gene-unique evidence. - -pro: this is the conservative option with lower odds of false positives -in downstream analyses, probably the better choice when using these -results to inform on followup experiments. - -con: if there is no proteingroup with unique evidence for e.g. “GRIA1” -nor for “GRIA2”, removing the ambiguous “GRIA1;GRIA2” preludes -downstream GO analyses from finding enrichment for this gene (even -though for GO it wouldn’t have mattered whether we searched for GRIA1 or -GRIA2). - -Note that in any case one shouldn’t search GO for *all* ambiguous genes -in a proteingroup (e.g. GRIA1 and GRIA2 in case of “GRIA1;GRIA2”) -without some form of correction because *either* protein was found, not -both. The ‘symbol’ column returned by this function will contain only -the ‘leading’ symbol; typically this column is used as input for GO (and -not the ‘gene_symbols_or_id’ which contains all/ambiguous symbols in -case `remove_ambiguous_proteingroups = FALSE`) So for GO analyses one -could retain ambiguous proteingroups and then follow the example below -to map these to unique human gene identifiers and retain 1 value per -unique gene - -``` r -library(msdap) -# load your previously generated dataset from disk -load("C://dataset.RData") - -# Summarise the statistical data in your dataset, assuming DEA was applied. -# note that if you analyzed multiple contrasts, optionally with multiple DEA algorithms, all results are appended in this table. -x = summarise_stats( - dataset, - # include DEA result in the result table - return_dea = TRUE, - # disabled in this example, but if differential detection was performed set this to TRUE to integrate respective 'strong z-scores' - return_diffdetect = FALSE, - # replace effectsizes with log2fc in result table. Niche usecase, e.g. optional for DEA models that apply shrinkage to foldchanges - dea_logfc_as_effectsize = FALSE, - # differential detection absolute z-score cutoff (only relevant when return_diffdetect=TRUE) - diffdetect_zscore_threshold = 4, - # should proteingroups that map to multiple genes, e.g. "GRIA1;GRIA2", be removed from the result table? - remove_ambiguous_proteingroups = FALSE -) - - -# alternatively, when you've performed differential detection as well and want to merge the DEA results with the top-hits from differential detect -# first plot the differential detect histograms to double-check your desired z-score threshold -plot_differential_detect(dataset) -# compared to above example, we here set `return_diffdetect = TRUE` -x = summarise_stats( - dataset, - return_dea = TRUE, - return_diffdetect = TRUE, - dea_logfc_as_effectsize = FALSE, - diffdetect_zscore_threshold = 5, # critical parameter - remove_ambiguous_proteingroups = FALSE # critical parameter -) -``` - -## Phylogenetic mapping from mouse proteins to human genes - -The MS-DAP function `protein2gene_orthologs()` adds human orthologs for -mouse/rat proteins in a table that contains uniprot accessions and their -respective gene symbols. - -This function is currently limited such that the first matching human -gene per proteingroup is returned, so ambiguous proteingroups with -multiple gene symbols (e.g. “GRIA1;GRIA2”) might not always yield the -‘leading’ gene and additional/ambiguous genes are disregarded). Columns -describing the HGNC (genenames.org) ID, NCBI Entez ID and Ensembl gene -ID are appended to the provided table. - -Workflow for ID mapping, where subsequent steps are fallbacks; - -1) uniprot id -\> mgi/rgd id -\> hgnc id -2) uniprot symbol -\> mgi/rgd id -\> hgnc id -3) uniprot symbol -\> hgnc official symbol -\> hgnc id -4) uniprot symbol -\> hgnc non-ambiguous synonyms -\> hgnc id - -Before running below example code for phylogenetic mappings, download -required lookup tables here; - -**HGNC** - -- download link: - -- table: “Complete dataset download links” –\>\> “Complete HGNC approved - dataset text json” –\>\> download the “TXT” table -- filename is typically something like hgnc_complete_set.txt - -**MGI** - -- download link: - -- table: “MGI Marker associations to SWISS-PROT and TrEMBL protein IDs - (tab-delimited)” -- filename is typically something like MRK_SwissProt_TrEMBL.rpt - -for rat proteins one can use information from RGD as a drop-in -replacement for MGI in this example, see further `rgd_lookuptable()`. - -``` r -# load HGNC and MGI lookup tables from disk, assuming you previously downloaded these -hgnc = hgnc_lookuptable(f = "C://hgnc_complete_set.txt") -mgi = mgi_lookuptable(f = "C://MRK_SwissProt_TrEMBL.rpt") - -# map the mouse proteins to human gene identifiers -# note that the ID mapping success rate is printed to the console -x = protein2gene_orthologs(x, hgnc, mgi) - -# print first few rows to console to inspect results -print(x, n=5) -# write full results to disk as Excel table -openxlsx::write.xlsx(x, "C://stats_summary.xlsx") - -# If you retain ambiguous proteingroups in `summarise_stats()` and perform phylogenetic mapping, -# some genes might have duplicate entries in your result table (e.g. "GRIA1" and "GRIA1;GRIA2" proteingroups will yield the same human gene ID). -# Here we'll remove all proteingroups that failed to map to human genes AND those that yield the same human ortholog gene. -# (this works because sorting of table x is already by 'best on top' after summarise_stats()) -y = x %>% filter(!is.na(hgnc_id)) %>% distinct(hgnc_id, contrast, dea_algorithm, .keep_all = TRUE) -openxlsx::write.xlsx(y, "C://stats_summary_deduped.xlsx") -``` - -## Mapping from gene symbols directly to HGNC gene identifiers - -For human proteome datasets we can directly match from uniprot gene -symbols to HGNC (genenames.org) identifiers by directly cross-matching -official symbols and synonyms. The function `protein2gene_by_symbol()` -can directly work with the ‘symbol’ column from the `summarise_stats()` -output. Do note that if you did not remove ambiguous proteingroups in -the `summarise_stats()` step, only the first gene symbol is used for ID -mapping here (for the subset of ambiguous proteingroups). - -Before running below example code for phylogenetic mappings, download -the required lookup table here; - -**HGNC** - -- download link: - -- table: “Complete dataset download links” –\>\> “Complete HGNC approved - dataset text json” –\>\> download the “TXT” table -- filename is typically something like hgnc_complete_set.txt - -``` r -# load HGNC lookup tables from disk, assuming you previously downloaded these -hgnc = hgnc_lookuptable(f = "C://hgnc_complete_set.txt") - -# assuming x is the results you obtained from summarise_stats() -# this will add columns hgnc_id, hgnc_symbol, entrez_id and ensembl_id -x = protein2gene_by_symbol(x, hgnc) - -# print first few rows to console to inspect results -print(x, n=5) - -### below code is analogous to the mouse example above - -# write full results to disk as Excel table -openxlsx::write.xlsx(x, "C://stats_summary.xlsx") - -# If you retain ambiguous proteingroups in `summarise_stats()` and perform phylogenetic mapping, -# some genes might have duplicate entries in your result table (e.g. "GRIA1" and "GRIA1;GRIA2" proteingroups will yield the same human gene ID). -# Here we'll remove all proteingroups that failed to map to human genes AND those that yield the same human ortholog gene. -# (this works because sorting of table x is already by 'best on top' after summarise_stats()) -y = x %>% filter(!is.na(hgnc_id)) %>% distinct(hgnc_id, contrast, dea_algorithm, .keep_all = TRUE) -openxlsx::write.xlsx(y, "C://stats_summary_deduped.xlsx") -``` diff --git a/docker/Dockerfile b/docker/Dockerfile index 0ac2c32..ec6beec 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -33,9 +33,9 @@ RUN R -e "devtools::install_version('remaCor', '0.0.18', upgrade = 'never', repo ### MS-DAP # from local devtools::build() file RUN mkdir /msdap -COPY temp/msdap_1.0.9.tar.gz /msdap/ -RUN R -e "devtools::install_local('/msdap/msdap_1.0.9.tar.gz', upgrade = 'never')" -# alternatively, install from github; RUN R -e "devtools::install_github('ftwkoopmans/msdap@1.0.9', upgrade = 'never')" +COPY temp/msdap_1.1.tar.gz /msdap/ +RUN R -e "devtools::install_local('/msdap/msdap_1.1.tar.gz', upgrade = 'never')" +# alternatively, install from github; RUN R -e "devtools::install_github('ftwkoopmans/msdap@1.1', upgrade = 'never')" ### finish setting up diff --git a/docker/msdap_launcher_unix.sh b/docker/msdap_launcher_unix.sh index 56e049b..532b007 100644 --- a/docker/msdap_launcher_unix.sh +++ b/docker/msdap_launcher_unix.sh @@ -3,7 +3,7 @@ # MS-DAP launch script # https://github.com/ftwkoopmans/msdap -VERSION="1.0.9" +VERSION="1.1" ### OS diff --git a/docker/msdap_launcher_windows.ps1 b/docker/msdap_launcher_windows.ps1 index 3cb6dac..1d13d62 100644 --- a/docker/msdap_launcher_windows.ps1 +++ b/docker/msdap_launcher_windows.ps1 @@ -1,7 +1,7 @@ # MS-DAP launch script # https://github.com/ftwkoopmans/msdap -$VERSION = "1.0.9" +$VERSION = "1.1" Write-Host "$((Get-Date).ToString("HH:mm:ss")) - Starting MS-DAP launcher script for version: $VERSION" diff --git a/examples/example_Klaassen2018_pmid26931375.R b/examples/example_Klaassen2018_pmid26931375.R index cfc01e0..88ae755 100644 --- a/examples/example_Klaassen2018_pmid26931375.R +++ b/examples/example_Klaassen2018_pmid26931375.R @@ -7,8 +7,8 @@ dataset = import_dataset_metamorpheus(path = "C:/VU/code/R/msdap/docker/temp/exa dataset = import_fasta( dataset, - files = c("C:/VU/fasta/UniProt_2018-05/UP000000589_10090.fasta", - "C:/VU/fasta/UniProt_2018-05/UP000000589_10090_additional.fasta") + files = c("C:/VU/fasta/uniprot/2018_05/UP000000589_10090.fasta", + "C:/VU/fasta/uniprot/2018_05/UP000000589_10090_additional.fasta") ) dataset = remove_proteins_by_name(dataset, regular_expression = "ig \\S+ chain|keratin|GN=(krt|try|igk|igg|igkv|ighv|ighg)") # optional diff --git a/inst/rmd/report.Rmd b/inst/rmd/report.Rmd index cb259f0..6aee804 100644 --- a/inst/rmd/report.Rmd +++ b/inst/rmd/report.Rmd @@ -550,10 +550,10 @@ As general guidelines for differential detection, the recommended default settin were observed with at least 2 peptides in at least 3 replicates (or 50% of replicates, whichever number is greater). Use the plots of differential detection z-score histograms to observe the overall distribution and start your data exploration at proteins with the strongest z-scores to find desired z-score cutoffs (typically an absolute z-score -of 4~5, but this is not set in stone for all datasets). This works best for DDA experiments, for DIA only the most +of 5 or higher, but this is not set in stone for all datasets). This works best for DDA experiments, for DIA only the most extreme values are informative in many cases (e.g. proteins exclusively identified in condition A & found in nearly all replicates of condition A). \n\n') - cat('Below figure shows the distribution of these scores with thresholds at 4 std. Both the z-scores and the counts these are based upon are available in the statistical result Excel table. \n\n') + cat('Below figure shows the distribution of these scores with thresholds at 6 std. Both the z-scores and the counts these are based upon are available in the statistical result Excel table. \n\n') # !! index by name. Order may be different from l_contrasts !! subchunkify(suppressWarnings(print(dd_plots[[contr]])), unique_chunk_id = paste0("dd_hist_", contr_index), fig_height = ifelse(isdia, 3, 5.5), fig_width = 5) @@ -583,7 +583,7 @@ if(exists("tib_report_stats_summary") && is.data.frame(tib_report_stats_summary) ## differential detect cat('\\bigskip \n\n') cat('\\bigskip \n\n') - cat("Differential detection: summary of proteins with extreme differences in observed peptides. A simple metric to complement results from DEA, which is the main result, for peptides that lack data to perform DEA. \n\nDifferential detection scores based only on 'detected' peptides; \n\n") + cat("Differential detection: summary of proteins with extreme differences in observed peptides. A simple metric to complement results from DEA, which is the main result, especially for proteins that lack data to perform DEA. \n\nDifferential detection scores based only on 'detected' peptides; \n\n") if(exists("tib_report_diffdetects_summary") && is.data.frame(tib_report_diffdetects_summary) && nrow(tib_report_diffdetects_summary) > 0) { rmarkdown_xtable_custom(tib_report_diffdetects_summary, caption = "Differential Detection @ only 'detected' peptides", align="lp{2in}llp{3.5in}", scalebox = 0.9) # print(kableExtra::column_spec(knitr::kable(tib_report_diffdetects_summary, format="latex", row.names = FALSE), 4, width = "4in")) diff --git a/man/export_stats_genesummary.Rd b/man/export_stats_genesummary.Rd new file mode 100644 index 0000000..7e4e5c9 --- /dev/null +++ b/man/export_stats_genesummary.Rd @@ -0,0 +1,165 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/export_stats_genesummary.R +\name{export_stats_genesummary} +\alias{export_stats_genesummary} +\title{Create gene-level summary tables for your DEA and differential detection results} +\usage{ +export_stats_genesummary( + dataset, + gene_ambiguity = "prio_specific", + diffdetect_zscore_threshold = NA, + diffdetect_type = "auto", + dea_logfc_instead_of_effectsize = FALSE, + output_dir = NA, + hgnc, + xref = NULL, + remove_nohgnc = FALSE +) +} +\arguments{ +\item{dataset}{to use this function, your dataset should meet these requirements: +\itemize{ +\item dataset was searched against a uniprot FASTA in MaxQuant/Spectronaut/DIA-NN/etc. (i.e. MS-DAP can only extract gene symbol info from from uniprot.org FASTA databases) +\item \code{import_fasta()} has been applied to this dataset +\item differential expression analysis has been applied (e.g. typical \code{analysis_quickstart()} workflow, or custom code that applied \code{filter_dataset()} and \code{dea()}) +}} + +\item{gene_ambiguity}{options: leading_gene, prio_specific, only_specific (default: prio_specific). +\subsection{example table from GOAT online website}{ + +this list describes various scenarios; gene symbol(s) that represent some proteingroup together with a description of presented information +\itemize{ +\item GRIA1 = protein group maps to exactly 1 gene +\item GRIA2 = protein group maps to exactly 1 gene +\item GRIA1;GRIA2 = ambiguous, one might want to use only the first entry ('leading' gene) +\item GRIA3;GRIA4 = ambiguous, but this row contributes a new gene (GRIA3) +\item tr|A8K0K0|A8K0K0_HUMAN;GRIA2;GRIA3 = ambiguous, but the first entry has no gene symbol only an accession +} +} + +\subsection{options and their result when applied to above examples}{ + +\itemize{ +\item leading_gene = use only the leading/first gene per proteingroup. e.g. "GRIA1;GRIA2" is mapped to "GRIA1" and will overwrite a specific "GRIA1" proteingroup if (and only if) the former has a lower p-value. +all rows in this example will be mapped to a gene ID (respectively, GRIA1, GRIA2, GRIA1, GRIA3, GRIA2). +\item prio_specific = discard ambiguous proteingroups if their leading/first gene overlaps with a specific proteingroup. e.g. "GRIA1;GRIA2" is mapped to "GRIA1" only if there is no specific/unambiguous "GRIA1" proteingroup. +rows 3 and 5 in this example are discarded because there exist specific entries for GRIA1 and GRIA2 (respectively, GRIA1, GRIA2, -, GRIA3, -). This approach favors rows that are specific and supplements the results only with ambiguous rows that contribute new information (leading gene is not in the results yet). +\item only_specific = all ambiguous proteingroups are discarded (included in output table, but human gene ID is set to NA). +only the first 2 rows in this example are mapped (respectively, GRIA1, GRIA2, -, -, -). +} +}} + +\item{diffdetect_zscore_threshold}{Optionally, a differential detection absolute z-score cutoff. Set to NA to disable diffdetect (default) to return only DEA results. When using this option, we strongly recommend to first review the zscore distributions using MS-DAP function: \code{plot_differential_detect()} -> double-check that your selected absolute zscore threshold cuts at intended positions in the distribution !} + +\item{diffdetect_type}{type of differential detect scores. options: +'auto' = set to 'detect' if this score is available, 'quant' otherwise +'detect' = differential detection z-scores computed from only "detected" peptides (no MBR) +'quant' = differential detection z-scores computed from all quantified peptides (uses MBR)} + +\item{dea_logfc_instead_of_effectsize}{boolean parameter: should log2fc values be returned in the effectsize column? In some edge cases this might be helpful for downstream analysis but should generally be left at default value (FALSE)} + +\item{output_dir}{where to write the Excel output tables (e.g. "C:/temp"). Set to NA to disable writing tables} + +\item{hgnc}{HGNC lookup table, the output from function \code{hgnc_lookuptable()}} + +\item{xref}{For Mouse and Rat datasets we recommend to provide MGI/RGD database files to increase ID mapping accuracy. For Mouse species, this should be the MGI lookup table from function \code{mgi_lookuptable()}. For Rat, the RGD lookup table from function \code{rgd_lookuptable()}} + +\item{remove_nohgnc}{set to TRUE to remove all rows that could not be matched to a HGNC ID. default; FALSE} +} +\value{ +the table(s) produced by this function are gene-level summaries of your statistical analyses. +The R data.table returned by this function is in long-format (i.e. results from multiple DEA algorithms and contrast are appended). +The Excel output files are split by contrast and DEA algorithm for convenience: these can be directly used as input for gene set analysis tools. +Proteingroups that failed to map to human genes are included in the output table but will have NA values in the hgnc_id column. + +A description of columns: +\itemize{ +\item 'protein_id' contains the proteingroup identifier (concatenation of protein identifiers) +\item 'contrast' describes the statistical contrast (e.g. WT vs KO) +\item 'dea_algorithm' describes the DEA algorithm that configured. Note that if you select multiple algorithms while running \code{analysis_quickstart()}, there will be multiple rows in this table for each gene*contrast ! +\item 'score_type' is either 'dea', 'dd' or 'dea,dd' indicating whether the effectsize/pvalue/pvalue_adjust shown for this row originates from differential expression analysis (DEA) or differential detection (DD), or both (i.e. statistics were available for both, in which case the metric with lowest pvalue was used to populate the effectsize/pvalue/pvalue_adjust columns) +\item 'peptide_count' describes the number of peptides that was used for DEA. So this is NOT the total number of detected peptides, rather this is the subset of peptides that passed your specified filtering rules (and were thus used for DEA) +\item 'log2fc' log2 foldchanges; for DEA results this value is taken straight from the DEA result table. For DD results this is the 'diffdetect zscore' +\item 'pvalue' analogous to \code{log2fc} +\item 'pvalue_adjust' analogous to \code{log2fc} +\item 'signif' boolean flag indicating significance. For DEA, the criteria for significance that were previously configured when performing DEA using \code{dea()} or +\code{analysis_quickstart()} functions are reused here (i.e. cutoffs for adjusted p-value and log2 foldchange). For DD results, any included protein (that passes the \code{diffdetect_zscore_threshold} cutoff) is set to TRUE. +\item The 'effectsize' column data depends on the user parameters/configuration; +When only differential detect is used, this contains the z-scores. When only DEA is used, this contains effectsizes from DEA as-is. +However, when DEA and differential detect are combined, the effectsizes from DEA are standardized (divided by std) such that effectsizes from +both statistics can be integrated into 1 distribution (and thus these can be compared/ranked in downstream analyses). +For rows that have both DD and DEA results, see item \code{score_type} +Not a perfect solution, but in combination with stringent \code{diffdetect_zscore_threshold} parameter this seems to work ok in our hands. +\item 'effectsize_dea' is only included when DD is enabled; it shows the DEA effectsizes as-is without any rescaling (and rescaled values merged with rescaled DD values are shown in 'effectsize') +\item 'gene_symbols_or_id' contains the uniprot gene symbols for respective accessions in the proteingroup (protein_id column) as per usual in MS-DAP. +\item 'hgnc_id' the mapped Human gene ID (can be NA in case of failed mappings) +\item 'symbol' the gene symbol of the respective 'hgnc_id' (can be NA in case of failed mappings) +\item 'ensembl_id' the Ensembl gene ID of the respective 'hgnc_id' (can be NA in case of failed mappings) +\item 'entrez_id' the NCBI Entrez gene ID of the respective 'hgnc_id' (can be NA in case of failed mappings) +\item 'gene' same as the 'entrez_id' column. Provided for drag-and-drop compatibility with the GOAT online tool +} +} +\description{ +To expedite downstream analysis, this function maps the proteingroup results from your +differential expression analysis to Human gene identifiers and exports these as Excel tables that are ready for +use with GO tools. Optionally, one can also include results from differential detection analysis. +Multiple proteingroups that map to the same unique gene are merged and there are various options +for dealing with ambiguous proteingroups. + +The resulting output tables can be used as input with the GOAT online geneset analysis tool @ \url{https://ftwkoopmans.github.io/goat/} + +\emph{combining differential expression and differential detection stats} + +If differential detection (DD) was performed on this dataset and the \code{diffdetect_zscore_threshold} parameter is not NA, +the DD z-scores (per contrast) are converted to p-values by \code{pnorm(abs(zscore), lower.tail = F)}, +assuming these are approximately normally distributed (!), and then multiple testing correction by FDR is applied. +However, this is just an approximation; the differential detection scores are a very simplistic approach and should +be considered inferior to DEA analyses. Double-check the zscore distributions and treat these results with care ! +Plot your DD results with \code{plot_differential_detect(dataset, zscore_threshold = 6)}. +Stringent cutoffs (absolute diffdetect_zscore_threshold of at least 5~6, or higher) are encouraged when using differential detection results. + +If DD is enabled (\code{diffdetect_zscore_threshold} parameter is not NA), the effectsizes from DEA and DD are both standardized to make +their distributions somewhat comparable. Note that the downside is that the effectsizes returned by this function +no longer will be the exact effectsizes returned by the DEA model (those can be then found @ output column \code{effectsize_dea}). + +\emph{dealing with overlapping gene symbols between proteingroups} + +Importantly, there may be multiple proteingroups (isoforms) in any proteomics dataset that match the same gene symbol. +This function will return the proteingroup/row with the lowest p-value for each unique gene (per contrast, per DEA algorithm). +Your selected option for \code{gene_ambiguity} will affect this; +if for example "leading_gene" is selected as an option then a "GRIA1;GRIA2" proteingroup with pvalue=0.01 will be +chosen as representative value for "GRIA1" over a "GRIA1" proteingroup with pvalue=0.5 (see further below parameter documentation) +} +\examples{ +\dontrun{ + +## note that in this example, we have renamed the downloaded files +## to include a timestamp; useful for tracking versions +# load the HGNC data table you previously downloaded +hgnc_table = hgnc_lookuptable("C:/DATA/hgnc_complete_set__2024-01-01.txt") + +# MGI Mouse database, only needed for Mouse datasets +#mgi_table = mgi_lookuptable("C:/DATA/MRK_SwissProt_TrEMBL__2024-01-01.rpt") +# RGD Rat database, only needed for Rat datasets +#rgd_table = rgd_lookuptable("C:/DATA/GENES_RAT__2024-01-01.rpt") + +export_stats_genesummary( + dataset, + # set NA to ignore differential detection (default), or define an absolute + # zscore threshold (e.g. 6). ! when using diffdetect, first review the + # zscore distributions using MS-DAP function: plot_differential_detect() + diffdetect_zscore_threshold = NA, + # options for dealing with proteingroups that map to multiple genes: + # leading_gene, prio_specific, only_specific + gene_ambiguity = "prio_specific", # prio_specific and only_specific are recommended + # always need the HGNC data table + hgnc = hgnc_table, + # example: for Mouse datasets, provide the MGI database for more accurate ID mapping + # xref = mgi_table, + # example: for Rat datasets, provide the RGD database for more accurate ID mapping + # xref = rgd_table, + # write the files to the same directory as main MS-DAP results + output_dir = dataset$output_dir +) +} +} diff --git a/man/idmap_symbols.Rd b/man/idmap_symbols.Rd new file mode 100644 index 0000000..bd3bc81 --- /dev/null +++ b/man/idmap_symbols.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/gene_idmapping.R +\name{idmap_symbols} +\alias{idmap_symbols} +\title{helper function to match a protein table to HGNC via gene symbols (and synonyms)} +\usage{ +idmap_symbols(proteins_long, hgnc, use_synonyms = TRUE) +} +\arguments{ +\item{proteins_long}{long-format data.frame with columns protein_id and symbol pairs (i.e. no semicolon-delimted values)} + +\item{hgnc}{HGNC lookup table from \code{hgnc_lookuptable()}} + +\item{use_synonyms}{boolean; use synonyms? if FALSE, only uses hgnc_symbol} +} +\description{ +typically called from \code{export_stats_genesummary} +} diff --git a/man/idmap_uniprotid.Rd b/man/idmap_uniprotid.Rd new file mode 100644 index 0000000..bdebbcc --- /dev/null +++ b/man/idmap_uniprotid.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/gene_idmapping.R +\name{idmap_uniprotid} +\alias{idmap_uniprotid} +\title{helper function to match a protein table to HGNC via an intermediate mapping table (e.g. MGI/RGD)} +\usage{ +idmap_uniprotid(proteins_long, hgnc, idmap, use_symbols = TRUE) +} +\arguments{ +\item{proteins_long}{long-format data.frame with columns protein_id and symbol pairs (i.e. no semicolon-delimted values)} + +\item{hgnc}{HGNC lookup table from \code{hgnc_lookuptable()}} + +\item{idmap}{MGI or RGD lookup table from \code{mgi_lookuptable()} or \code{rgd_lookuptable()}} + +\item{use_symbols}{boolean; use symbols to map from proteins_long to the idmap table? if FALSE, only matches by uniprot identifier} +} +\description{ +typically called from \code{export_stats_genesummary} +} diff --git a/man/plot_differential_detect.Rd b/man/plot_differential_detect.Rd index 51fe6df..87ba1ef 100644 --- a/man/plot_differential_detect.Rd +++ b/man/plot_differential_detect.Rd @@ -4,7 +4,7 @@ \alias{plot_differential_detect} \title{Plot differential detection results as a histogram} \usage{ -plot_differential_detect(dataset, zscore_threshold = 4) +plot_differential_detect(dataset, zscore_threshold = 6) } \arguments{ \item{dataset}{dataset object} diff --git a/man/plot_peptide_data.Rd b/man/plot_peptide_data.Rd index 021954a..2aa7685 100644 --- a/man/plot_peptide_data.Rd +++ b/man/plot_peptide_data.Rd @@ -22,7 +22,7 @@ plot_peptide_data( \item{select_all_proteins}{plot all proteins. This will take a long time. Default: FALSE} \item{select_diffdetect_candidates}{if "differential detection" was enabled (it is by default in \code{analysis_quickstart}), plot candidate proteins based on their zscores. -Valid parameters are positive numeric values that are to be used as absolute z-score cutoffs (4 or 5 are typically reasonable; use \code{plot_differential_detect} to visualize the score distributions). +Valid parameters are positive numeric values that are to be used as absolute z-score cutoffs (5 or higher is typically reasonable; use \code{plot_differential_detect} to visualize the score distributions). This selects all proteins that meet this criterium in any contrast, and plots these proteins in every contrast (include those where it is has a z-score below given threshold). Alternative, set to TRUE to use an absolute z-score cutoff value of 4. Default: NA (disable)} diff --git a/man/protein2gene_by_symbol.Rd b/man/protein2gene_by_symbol.Rd deleted file mode 100644 index e470725..0000000 --- a/man/protein2gene_by_symbol.Rd +++ /dev/null @@ -1,16 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/gene_idmapping.R -\name{protein2gene_by_symbol} -\alias{protein2gene_by_symbol} -\title{Map the the symbol column in a table to HGNC human gene IDs by matching official gene symbols and synonyms} -\usage{ -protein2gene_by_symbol(x, hgnc) -} -\arguments{ -\item{x}{a data.table with a column symbol} - -\item{hgnc}{HGNC lookup table from \code{hgnc_lookuptable()}} -} -\description{ -Map the the symbol column in a table to HGNC human gene IDs by matching official gene symbols and synonyms -} diff --git a/man/protein2gene_orthologs.Rd b/man/protein2gene_orthologs.Rd deleted file mode 100644 index fb8f7a2..0000000 --- a/man/protein2gene_orthologs.Rd +++ /dev/null @@ -1,48 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/gene_idmapping.R -\name{protein2gene_orthologs} -\alias{protein2gene_orthologs} -\title{Map the protein_id column, which are assumed to contain uniprot mouse/rat identifiers, in a table to HGNC human gene IDs} -\usage{ -protein2gene_orthologs(x, hgnc, idmap, remove_nohgnc = FALSE) -} -\arguments{ -\item{x}{a data.table with columns protein_id and symbol (e.g. the "leading symbol" extracted from the uniprot gene symbols @ dataset$proteins$gene_symbols_or_id)} - -\item{hgnc}{HGNC lookup table from \code{hgnc_lookuptable()}} - -\item{idmap}{MGI lookup table from \code{mgi_lookuptable()} or a RGD lookup table from \code{rgd_lookuptable()}} - -\item{remove_nohgnc}{set to TRUE to remove all rows that could not be matched to a HGNC ID. default; FALSE} -} -\description{ -This function is currently limited such that the first matching human gene per proteingroup is returned, -so ambiguous proteingroups with multiple gene symbols (e.g. "GRIA1;GRIA2") might not always yield the 'leading' -gene and additional/ambiguous genes are disregarded). Columns describing the HGNC (genenames.org) ID, -NCBI Entez ID and Ensembl gene ID are appended to the provided table. -} -\details{ -Workflow for ID mapping, where subsequent steps are fallbacks; -\enumerate{ -\item uniprot id -> mgi/rgd id -> hgnc id -\item uniprot symbol -> mgi/rgd id -> hgnc id -\item uniprot symbol -> hgnc official symbol -> hgnc id -\item uniprot symbol -> hgnc non-ambiguous synonyms -> hgnc id -} -} -\examples{ -\dontrun{ - # prior to this, import a dataset/samples/fasta & setup contrasts & apply filter_datasets() - # alternatively, use analysis_quickstart() to do everything - dataset = dea(dataset, dea_algorithm = "deqms") - dataset = differential_detect(dataset, min_peptides_observed = 2, min_samples_observed = 3, - min_fraction_observed = 0.5, return_wide_format = FALSE) - print_dataset_summary(dataset) - - x = summarise_stats(dataset, return_dea = TRUE, return_diffdetect = TRUE, - diffdetect_zscore_threshold = 5, remove_ambiguous_proteingroups = TRUE) - hgnc = hgnc_lookuptable(f = "C:/DATA/hgnc/hgnc__non_alt_loci_set___2023-03-21.tsv") - mgi = mgi_lookuptable(f = "C:/DATA/mgi/2023-07-15/MRK_SwissProt_TrEMBL.rpt") - y = protein2gene_orthologs(x, hgnc, mgi) -} -} diff --git a/man/read_textfile_compressed.Rd b/man/read_textfile_compressed.Rd index a582928..97810d9 100644 --- a/man/read_textfile_compressed.Rd +++ b/man/read_textfile_compressed.Rd @@ -9,6 +9,7 @@ read_textfile_compressed( as_table = FALSE, skip_empty_rows = FALSE, nrow = -1, + datatable_skip = NULL, ... ) } @@ -21,6 +22,8 @@ read_textfile_compressed( \item{nrow}{optionally, integer specifying to only read first N rows} +\item{datatable_skip}{hardcoded parameter for datatable_skip (set NULL to ignore; default)} + \item{...}{sent to data.table::fread()} } \value{ diff --git a/man/summarise_stats.Rd b/man/summarise_stats.Rd index 0dcd3c2..b75d308 100644 --- a/man/summarise_stats.Rd +++ b/man/summarise_stats.Rd @@ -9,8 +9,8 @@ summarise_stats( return_dea = TRUE, return_diffdetect = FALSE, dea_logfc_as_effectsize = FALSE, - diffdetect_zscore_threshold = 4, - remove_ambiguous_proteingroups = FALSE + diffdetect_zscore_threshold = 6, + diffdetect_type = "auto" ) } \arguments{ @@ -24,48 +24,16 @@ summarise_stats( When including differential detection results this'll be a convenient approach to getting 1 standardized distribution of DEA+DD effectsizes that can be used in e.g. GO analyses. While this is unusual, one could e.g. use this for DEA algorithms that apply shrinkage to estimated foldchanges such as MSqRob} -\item{diffdetect_zscore_threshold}{differential detect z-score cutoff. A typical value would be \code{diffdetect_zscore_threshold=4} , or 5~6 for stringent filtering. +\item{diffdetect_zscore_threshold}{differential detect z-score cutoff. A typical value would be 5 or 6 (default) To plot histograms of the respective z-score distributions and inspect potential cutoff values for this relatively arbitrary metric, see below example code} -\item{remove_ambiguous_proteingroups}{boolean parameter indicating whether proteingroups that contain shared gene symbols (e.g. GRIA1;GRIA2) should be removed} -} -\value{ -\itemize{ -\item The effectsize column data depends on the user parameters/configuration; -When only differential detect is used, this contains the z-scores. When only DEA is used, this contains effectsizes from DEA as-is. -However, when DEA and differential detect are combined, the effectsizes from DEA are standardized (divided by std) such that effectsizes from -both statistics are integrated into 1 distribution (and thus these can be compared/ranked in downstream analyses). -\item 'gene_symbols_or_id' contains the uniprot gene symbols for respective accessions in the proteingroup (protein_id column) as per usual in MS-DAP. -\item The 'symbol' column contains the first gene symbol thereof (e.g. in ambiguous groups where gene_symbols_or_id='GRIA1;GRIA2', this'll yield 'GRIA1'). -\item the 'dea_algorithm' column shows the DEA algorithm that was used. Importantly, even differential detect results that were merged into results for this DEA algorithm have the same dea_algorithm value so you can group/filter by the entire analysis downstream. -} +\item{diffdetect_type}{type of differential detect scores. options: +'auto' = set to 'detect' if this score is available, 'quant' otherwise +'detect' = differential detection z-scores computed from only "detected" peptides (no MBR) +'quant' = differential detection z-scores computed from all quantified peptides (uses MBR)} } \description{ -For differential detection, the z-scores are converted to p-values by \code{pnorm(abs(zscore), lower.tail = F)} , -assuming these are normally distributed, and multiple testing correction by FDR is applied. -However, this is just an approximation; the differential detection scores are inferior to DEA analyses and should be treated with care. -Stringent cutoffs (diffdetect_zscore_threshold of at least 4 ~ 6) are encouraged when using differential detection results. - -For DEA, the criteria for significance that were previously configured when performing DEA using \code{dea()} or -\code{analysis_quickstart()} functions are reused here (i.e. cutoffs for adjusted p-value and log2 foldchange). - -If \code{return_dea} and \code{return_diffdetect} are both set to TRUE, the effectsizes from each statistic/result are standardized. -This makes both statistics more comparable but the downside is that the effectsizes are no longer the exact effectsizes returned by the DEA model. - -Note that if your dataset contains results for multiple contrasts and multiple DEA algorithms, -these will all be appended into the result table so make sure to filter the results by 'contrast' and -'dea_algorithm' columns where appropriate. -} -\examples{ -\dontrun{ - # summarise the DEA results (and ignore the more speculative differential detect data) - x = summarise_stats(dataset, return_dea = TRUE, return_diffdetect = FALSE, - remove_ambiguous_proteingroups = FALSE) - - # first plot all differential detect results as histograms to verify the zscore cutoff - tmp = lapply(plot_differential_detect(dataset), print) - # merge both DEA and differential-detect 'extremes' into a gene-level stats summary - x = summarise_stats(dataset, return_dea = TRUE, return_diffdetect = TRUE, - diffdetect_zscore_threshold = 5, remove_ambiguous_proteingroups = FALSE) -} +In most cases, you probably want to use the \code{export_stats_genesummary()} function instead. +That is a wrapper function that uses this function but also adds additional functionality. +For documentation on the output table, also refer to that function. } diff --git a/man/summarise_stats__for_contrast.Rd b/man/summarise_stats__for_contrast.Rd index 38d681a..7276bf9 100644 --- a/man/summarise_stats__for_contrast.Rd +++ b/man/summarise_stats__for_contrast.Rd @@ -12,7 +12,7 @@ summarise_stats__for_contrast( dea_algorithm, dea_logfc_as_effectsize, diffdetect_zscore_threshold, - remove_ambiguous_proteingroups + diffdetect_type ) } \arguments{ @@ -30,7 +30,7 @@ summarise_stats__for_contrast( \item{diffdetect_zscore_threshold}{see \code{summarise_stats()}} -\item{remove_ambiguous_proteingroups}{see \code{summarise_stats()}} +\item{diffdetect_type}{see \code{summarise_stats()}} } \description{ worker function for actual stats summary