diff --git a/DESCRIPTION b/DESCRIPTION index e79e5d8..2ffb5ac 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -3,7 +3,7 @@ Package: msdap Title: Mass Spectrometry Downstream Analysis Pipeline Description: Analyze label-free proteomics datasets from various sources (MaxQuant, Spectronaut, etc) using a pipeline that facilitates peptide filtering and many algorithms for normalization and statistical analysis. A comprehensive PDF report can be generated that includes many data visualizations and documentation thereof. URL: https://github.com/ftwkoopmans/msdap -Version: 1.2 +Version: 1.2.1 Authors@R: person(given = "Frank", family = "Koopmans", diff --git a/NAMESPACE b/NAMESPACE index ebacf7a..e6d553e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -59,6 +59,7 @@ export(limma_wrapper) export(logger.default) export(matrix_to_long) export(merge_fractionated_samples) +export(merge_proteingroups_by_gene) export(merge_replicate_samples) export(mgi_lookuptable) export(minlog10) diff --git a/R/dea.R b/R/dea.R index 2bc2565..2edec4f 100644 --- a/R/dea.R +++ b/R/dea.R @@ -210,6 +210,10 @@ dea = function(dataset, qval_signif = 0.01, fc_signif = 0, dea_algorithm = "deqm peptides_for_contrast = dataset$peptides %>% select(sample_id, protein_id, peptide_id, sequence_plain, sequence_modified, detect, intensity=!!as.character(col_contr_intensity), any_of(c("confidence","charge"))) %>% filter(sample_id %in% samples_for_contrast$sample_id & is.finite(intensity)) + # if(!all(samples_for_contrast$sample_id %in% unique(peptides_for_contrast$sample_id)) ) { + # append_log("some samples that are defined in the current contrast do not have any datapoints in the dataset$peptides table (at currently selected 'peptide filter') column", type = "error") + # } + # peptide ExpressionSet eset_peptides = tibble_as_eset(peptides_for_contrast, dataset$proteins, samples_for_contrast) # rollup peptide abundance matrix to protein-level diff --git a/R/export_stats_genesummary.R b/R/export_stats_genesummary.R index 526f958..472dfc1 100644 --- a/R/export_stats_genesummary.R +++ b/R/export_stats_genesummary.R @@ -215,9 +215,70 @@ export_stats_genesummary = function(dataset, gene_ambiguity = "prio_specific", d ) + # the allstats table is already sorted by "best hits on top" (last few lines @ summarise_stats() function; `arrange(desc(signif), pvalue)` ) + result = disambiguate_protein_table_by_gene(protein_data = allstats, hgnc = hgnc, gene_ambiguity = gene_ambiguity, xref = xref, distinct_factors = c("contrast", "dea_algorithm"), remove_nohgnc = remove_nohgnc) %>% + arrange(desc(signif), pvalue, gene_symbols_or_id) + + + ### print output tables + if(is.na(output_dir)) { + append_log("output_dir parameter set to NA, skip writing Excel output tables", type = "info") + } else { + for(contr in unique(result$contrast)) { + for(algo_dea in unique(result$dea_algorithm)) { + tmp = result %>% filter(contrast == contr & dea_algorithm == algo_dea) + f = sprintf("%s/%s__%s.xlsx", output_dir, filename_strip_illegal_characters(sub("contrast: *", "", contr), strict = TRUE), algo_dea) + openxlsx::write.xlsx(tmp, f) + } + } + } + + return(invisible(result)) +} + + + +#' given some table with protein-level data, map to HGNC genes and deal with redundant or ambiguous gene-level results +#' +#' @description +#' The order of the data table is pivotal: the first row for each unique gene (criteria in parameter `gene_ambiguity`)) is selected ! +#' +#' @param protein_data a data.frame with columns (from dataset$proteins) protein_id, gene_symbols, gene_symbols_or_id. +#' Importantly, if you opt to return only unique entries per gene then the row of the FIRST matching protein_id is +#' returned, so sort your data table by pvalue upstream ! +#' @param hgnc see `export_stats_genesummary()` +#' @param gene_ambiguity see `export_stats_genesummary()` +#' @param xref see `export_stats_genesummary()` +#' @param remove_nohgnc see `export_stats_genesummary()` +#' @param distinct_factors a set of columns in protein_data that should be considered (together with protein_id) factors that describe subsets of data (to return unique gene-level data within) +disambiguate_protein_table_by_gene = function(protein_data, hgnc, gene_ambiguity, xref = NULL, remove_nohgnc = FALSE, distinct_factors = NULL) { + + ### input validation + if(!(is.data.frame(hgnc) && all(c("hgnc_id", "hgnc_symbol", "type", "value") %in% colnames(hgnc)))) { + append_log("hgnc parameter should be a data.frame with columns hgnc_id, hgnc_symbol, type, value (output from hgnc_lookuptable() function)", type = "error") + } + if(!(length(gene_ambiguity) == 1 && all(gene_ambiguity %in% c("leading_gene", "prio_specific", "only_specific")))) { + append_log("gene_ambiguity parameter should be 1 of the following options: leading_gene, prio_specific, only_specific", type = "error") + } + if(!is.null(xref) && !(is.data.frame(xref) && nrow(xref) > 0 && ncol(xref) >= 3 && all(c("symbol", "uniprot_id") %in% colnames(xref)))) { + append_log("xref parameter should be a data.frame with columns mgi_id/rgd_id, symbol, uniprot_id (output from mgi_lookuptable() or rgd_lookuptable() function)", type = "error") + } + if(!is.null(xref) && !colnames(xref)[1] %in% c("mgi_id", "rgd_id")) { + append_log("xref parameter only works with MGI and RGD atm (output from mgi_lookuptable() or rgd_lookuptable() function): first column must be mgi_id or rgd_id", type = "error") + } + if(!(length(remove_nohgnc) == 1 && remove_nohgnc %in% c(TRUE, FALSE))) { + append_log("remove_nohgnc parameter should be TRUE or FALSE", type = "error") + } + if(!(is.data.frame(protein_data) && all(c("protein_id", "gene_symbols", "gene_symbols_or_id") %in% colnames(protein_data)))) { + append_log("protein_data parameter should be a data.frame with columns protein_id, gene_symbols and gene_symbols_or_id", type = "error") + } + if(!(is.null(distinct_factors) || (is.character(distinct_factors) & all(distinct_factors %in% colnames(protein_data))))) { + append_log("distinct_factors parameter should be NULL or a set of columns in protein_data that should be considered (together with protein_id) factors that describe subsets of data (to return unique gene-level data within)", type = "error") + } + ### create long-format table of uniprot_id*symbol for each unique proteingroup - proteins = allstats %>% + proteins = protein_data %>% select(protein_id, gene_symbols) %>% distinct(protein_id, .keep_all = TRUE) %>% mutate(uniprot_id = strsplit(protein_id, "[ ,;]+"), @@ -242,24 +303,24 @@ export_stats_genesummary = function(dataset, gene_ambiguity = "prio_specific", d - ### now join the unique proteingroup-to-HGNC mappings with table `allstats` + ### now join the unique proteingroup-to-HGNC mappings with table `protein_data` # hgnc_list = a list column where each protein_id has an array of HGNC identifiers (length > 0), NA where missing/failed - allstats = allstats %>% + protein_data = protein_data %>% left_join(proteins_long %>% select(protein_id, hgnc_list = hgnc_id) %>% tidyr::chop(cols = "hgnc_list"), by = "protein_id") ### apply mapping rules - ### importantly, sorting of `allstats` table prior to running below code MUST be ascending by pvalue + ### importantly, sorting of `protein_data` table prior to running below code MUST be ascending by pvalue - allstats$hgnc_list_unique_notna = lapply(allstats$hgnc_list, function(g) { unique(na.omit(g)) }) # naive/slow, but prefer simple code - # dev note: don't use `allstats %>% filter(lengths(hgnc_list) == 1)` because a proteingroup can contain multiple uniprot identifiers that all map to the same gene --> hgnc_list will not be length 1 + protein_data$hgnc_list_unique_notna = lapply(protein_data$hgnc_list, function(g) { unique(na.omit(g)) }) # naive/slow, but prefer simple code + # dev note: don't use `protein_data %>% filter(lengths(hgnc_list) == 1)` because a proteingroup can contain multiple uniprot identifiers that all map to the same gene --> hgnc_list will not be length 1 - allstats$isambiguous = grepl(";", allstats$gene_symbols_or_id, fixed = TRUE) # infer ambiguous directly from uniprot gene symbols (i.e. failed mapping to HGNC have no effect) - # allstats$isambiguous = lengths(allstats$hgnc_list_unique_notna) > 1 # alternatively, determine "ambiguous" from successful HGNC mappings @ hgnc_list_unique_notna + protein_data$isambiguous = grepl(";", protein_data$gene_symbols_or_id, fixed = TRUE) # infer ambiguous directly from uniprot gene symbols (i.e. failed mapping to HGNC have no effect) + # protein_data$isambiguous = lengths(protein_data$hgnc_list_unique_notna) > 1 # alternatively, determine "ambiguous" from successful HGNC mappings @ hgnc_list_unique_notna # take the first non-NA HGNC identifier (NA if none) - allstats$hgnc_id = unlist(lapply(allstats$hgnc_list_unique_notna, function(g) { + protein_data$hgnc_id = unlist(lapply(protein_data$hgnc_list_unique_notna, function(g) { # always return exactly 1 value, thus downstream unlist() will yield an array that exactly matches table length if(length(g) == 0) return(NA_character_) return(g[1]) @@ -268,26 +329,25 @@ export_stats_genesummary = function(dataset, gene_ambiguity = "prio_specific", d if(gene_ambiguity == "only_specific") { # invalidate all entries that are ambiguous # importantly, do this before computing `hgnc_id_FORUNIQUE` - allstats$hgnc_id[allstats$isambiguous] = NA_character_ + protein_data$hgnc_id[protein_data$isambiguous] = NA_character_ } if(gene_ambiguity == "prio_specific") { # sort table such that ambiguous hits are at the bottom; importantly, ties-of-ambiguity are still sorted as-is # taking first match per gene downstream = prio specific/unambiguous over ambiguous proteingroup that has a "better pvalue" - allstats = allstats %>% arrange(isambiguous) + protein_data = protein_data %>% arrange(isambiguous) } # ! importantly, add a unique ID to proteingroups not mapped to HGNC so we can use distinct() downstream without grouping NA values (failed mappings) - allstats$hgnc_id_FORUNIQUE = ifelse(is.na(allstats$hgnc_id), allstats$gene_symbols_or_id, allstats$hgnc_id) + protein_data$hgnc_id_FORUNIQUE = ifelse(is.na(protein_data$hgnc_id), protein_data$gene_symbols_or_id, protein_data$hgnc_id) # result table: retain unique rows per gene*contrast*dea_algorithm (assuming table has been sorted upstream) - result = allstats %>% distinct(hgnc_id_FORUNIQUE, contrast, dea_algorithm, .keep_all = TRUE) + distinct_factors = unique(c("hgnc_id_FORUNIQUE", distinct_factors)) + result = protein_data %>% distinct(dplyr::across(tidyselect::all_of(distinct_factors)), .keep_all = TRUE) - ### finally, stable sort result table + add gene database crossreferences - # sort table again ("prio_specific" option affects sorting, so here enforce stable ordering) + ### add gene database crossreferences result = result %>% - arrange(desc(signif), pvalue) %>% select(-tidyselect::any_of(c("gene_symbols", "symbol", "hgnc_list", "hgnc_list_unique_notna", "hgnc_id_FORUNIQUE", "isambiguous"))) %>% hgnc_add_xrefs(hgnc = hgnc) # add named column specifically for GOAT @@ -295,7 +355,7 @@ export_stats_genesummary = function(dataset, gene_ambiguity = "prio_specific", d - tmp = allstats %>% distinct(protein_id, .keep_all = TRUE) + tmp = protein_data %>% distinct(protein_id, .keep_all = TRUE) log_pg_ambiguous = sum(tmp$isambiguous) if(gene_ambiguity == "only_specific") { @@ -332,20 +392,5 @@ export_stats_genesummary = function(dataset, gene_ambiguity = "prio_specific", d result = result %>% filter(!is.na(hgnc_id)) } - - - ### print output tables - if(is.na(output_dir)) { - append_log("output_dir parameter set to NA, skip writing Excel output tables", type = "info") - } else { - for(contr in unique(result$contrast)) { - for(algo_dea in unique(result$dea_algorithm)) { - tmp = result %>% filter(contrast == contr & dea_algorithm == algo_dea) - f = sprintf("%s/%s__%s.xlsx", output_dir, filename_strip_illegal_characters(sub("contrast: *", "", contr), strict = TRUE), algo_dea) - openxlsx::write.xlsx(tmp, f) - } - } - } - - return(invisible(result)) + return(result) } diff --git a/R/gene_idmapping.R b/R/gene_idmapping.R index d9d6400..99ccfd3 100644 --- a/R/gene_idmapping.R +++ b/R/gene_idmapping.R @@ -242,7 +242,7 @@ rgd_lookuptable = function(f) { #' helper function to match a protein table to HGNC via gene symbols (and synonyms) #' #' @description typically called from `export_stats_genesummary` -#' @param proteins_long long-format data.frame with columns protein_id and symbol pairs (i.e. no semicolon-delimted values) +#' @param proteins_long long-format data.frame with columns protein_id and symbol pairs (i.e. no semicolon-delimited values) #' @param hgnc HGNC lookup table from `hgnc_lookuptable()` #' @param use_synonyms boolean; use synonyms? if FALSE, only uses hgnc_symbol idmap_symbols = function(proteins_long, hgnc, use_synonyms = TRUE) { @@ -290,13 +290,13 @@ idmap_symbols = function(proteins_long, hgnc, use_synonyms = TRUE) { #' helper function to match a protein table to HGNC via an intermediate mapping table (e.g. MGI/RGD) #' #' @description typically called from `export_stats_genesummary` -#' @param proteins_long long-format data.frame with columns protein_id and symbol pairs (i.e. no semicolon-delimted values) +#' @param proteins_long long-format data.frame with columns uniprot_id and symbol pairs (i.e. no semicolon-delimited values) #' @param hgnc HGNC lookup table from `hgnc_lookuptable()` #' @param idmap MGI or RGD lookup table from `mgi_lookuptable()` or `rgd_lookuptable()` #' @param use_symbols boolean; use symbols to map from proteins_long to the idmap table? if FALSE, only matches by uniprot identifier idmap_uniprotid = function(proteins_long, hgnc, idmap, use_symbols = TRUE) { if(!is.data.frame(proteins_long) || nrow(proteins_long) == 0 || !all(c("uniprot_id", "symbol") %in% colnames(proteins_long)) ) { - append_log("proteins_long parameter must be a data.frame with columns 'protein_id' and 'symbol'", type = "error") + append_log("proteins_long parameter must be a data.frame with columns 'uniprot_id' and 'symbol'", type = "error") } if(!is.data.frame(hgnc) || nrow(hgnc) == 0 || !all(c("hgnc_id", "hgnc_symbol", "type", "value") %in% colnames(hgnc)) ) { append_log("hgnc parameter must be a data.frame with columns 'hgnc_id', 'hgnc_symbol', 'type', 'value' as typically prepared using the hgnc_lookuptable() function", type = "error") @@ -329,10 +329,12 @@ idmap_uniprotid = function(proteins_long, hgnc, idmap, use_symbols = TRUE) { proteins_long$xref_id[rows] = idmap$XREF_ID_VALUES__TEMPCOL[match(proteins_long$uniprot_id[rows], idmap$uniprot_id)] # fallback to symbol matching (no `xref_id` yet) - rows = is.na(proteins_long$hgnc_id) & is.na(proteins_long$xref_id) & # - # double-check that we don't even attempt to match missing gene symbols - !is.na(proteins_long$symbol) & nchar(proteins_long$symbol) >= 2 - proteins_long$xref_id[rows] = idmap$XREF_ID_VALUES__TEMPCOL[match(toupper(proteins_long$symbol[rows]), toupper(idmap$symbol))] + if(use_symbols) { + rows = is.na(proteins_long$hgnc_id) & is.na(proteins_long$xref_id) & # + # double-check that we don't even attempt to match missing gene symbols + !is.na(proteins_long$symbol) & nchar(proteins_long$symbol) >= 2 + proteins_long$xref_id[rows] = idmap$XREF_ID_VALUES__TEMPCOL[match(toupper(proteins_long$symbol[rows]), toupper(idmap$symbol))] + } ### map to HGNC using crossreference identifiers diff --git a/R/plot_sample_pca.R b/R/plot_sample_pca.R index 9f9343b..7730501 100644 --- a/R/plot_sample_pca.R +++ b/R/plot_sample_pca.R @@ -7,20 +7,25 @@ #' @param sample_label_property labels used to represent the samples. Set to "auto" to automatically select based on number of samples (default). possible values; auto, shortname, index, index_asis #' @param pch_as_exclude should exclude samples be represented with a distinct symbol shape? possible values; FALSE, TRUE (default) #' @param infer_continuous_scale should the legends infer continuous scale from the data? FALSE = all categorical, TRUE = automatic (default) +#' @param pca_dims a list of integer pairs indicating PCA plots to make #' @importFrom pcaMethods pca #' @importFrom ggrepel geom_text_repel #' @importFrom ggpubr ggarrange annotate_figure text_grob #' @importFrom gtools mixedsort #' @importFrom scales rescale -plot_sample_pca = function(matrix_sample_intensities, samples, samples_colors, sample_label_property = "auto", pch_as_exclude = TRUE, infer_continuous_scale = TRUE) { +plot_sample_pca = function(matrix_sample_intensities, samples, samples_colors, sample_label_property = "auto", pch_as_exclude = TRUE, infer_continuous_scale = TRUE, pca_dims = list(1:2, c(1, 3), 2:3)) { if(!sample_label_property %in% c("auto", "shortname", "index", "index_asis")) { append_log(paste("invalid value for parameter 'sample_label_property':", sample_label_property), type = "error") } if(!all(c("sample_index", "sample_id", "shortname") %in% colnames(samples))) { append_log(paste(paste(c("sample_index", "sample_id", "shortname"), collapse = ", "), "are required column in the samples table"), type = "error") } + if(!(is.list(pca_dims) && all(unlist(lapply(pca_dims, function(x) length(x) == 2 && is.numeric(x) && all.equal(x, round(x)) == TRUE))) && max(unlist(pca_dims)) <= 15) ) { + append_log("pca_dims parameter must be a list that contains integer value pairs (max value: 15)", type = "error") + } + - PPCA = pcaMethods::pca(base::scale(t(matrix_sample_intensities), center=TRUE, scale=FALSE), method="ppca", nPcs = 3, seed = 123, maxIterations = 2000) + PPCA = pcaMethods::pca(base::scale(t(matrix_sample_intensities), center=TRUE, scale=FALSE), method="ppca", nPcs = as.integer(max(unlist(pca_dims))), seed = 123, maxIterations = 2000) mat_pca = PPCA@scores # rownames = sample_id pca_var = array(PPCA@R2, dimnames = list(colnames(PPCA@scores))) @@ -50,7 +55,7 @@ plot_sample_pca = function(matrix_sample_intensities, samples, samples_colors, s } plotlist = list() - for (dims in list(1:2, c(1, 3), 2:3)) { # dims=1:2 + for (dims in pca_dims) { # dims=1:2 # extract data from PCA object & join with sample metadata to get color-coding and label tib = tibble(x = mat_pca[, dims[1]], y = mat_pca[, dims[2]], sample_id = rownames(mat_pca)) %>% left_join(samples %>% select(sample_id, label=!!prop_sample_labels, prop=!!prop), by="sample_id") %>% @@ -230,112 +235,3 @@ plot_sample_pca__sample_in_contrast = function(dataset, contr, sample_label_prop # return the results of the respective plot function return( suppressWarnings(plot_sample_pca(as_matrix_except_first_column(tibw), samples = dataset$samples, samples_colors = samples_colors, sample_label_property = sample_label_property)) ) } - - - -# # for reference, standard pca (does not cope with missing values); summary(stats::prcomp(t(matrix_sample_intensities), center = T, scale. = F)) -# plot_sample_pca = function(matrix_sample_intensities, samples, samples_colors, label_samples_by_shortname = TRUE) { -# PPCA = pcaMethods::pca(base::scale(t(matrix_sample_intensities), center=TRUE, scale=FALSE), method="ppca", nPcs = 3, seed = 123, maxIterations = 2000) -# mat_pca = PPCA@scores # rownames = sample_id -# pca_var = array(PPCA@R2, dimnames = list(colnames(PPCA@scores))) -# -# props = grep("^(sample_id$|sample_index$|shortname$|contrast:)", colnames(samples_colors), ignore.case = T, value = T, invert = T) -# -# # how should we label the samples? if not by shortname, use the index (and fallback to sample_id if index not available) -# if(length(label_samples_by_shortname) != 1 || !is.logical(label_samples_by_shortname) || !is.finite(label_samples_by_shortname)) { -# label_samples_by_shortname = nrow(samples) < 20 -# } -# prop_sample_labels = "shortname" -# if(!label_samples_by_shortname) { -# prop_sample_labels = intersect(c("sample_index", "sample_id"), colnames(samples))[1] -# } -# -# pcaplots = list() -# for (prop in props) { #prop=props[4] -# # don't plot 'exclude' property if there are no excluded samples -# if(prop == "exclude" && !any(samples$exclude)) { -# next -# } -# -# plotlist = list() -# for (dims in list(1:2, c(1, 3), 2:3)) { # dims=1:2 -# # extract data from PCA object & join with sample metadata to get color-coding and label -# tib = tibble(x = mat_pca[, dims[1]], y = mat_pca[, dims[2]], sample_id = rownames(mat_pca)) %>% -# left_join(samples %>% select(sample_id, label=!!prop_sample_labels, prop=!!prop), by="sample_id") %>% -# left_join(samples_colors %>% select(sample_id, clr=!!prop), by="sample_id") -# -# # check if current property (column in sample metadata) only contains numbers. If so, color as numeric -# prop_is_numeric = !prop %in% c("group", "exclude") && suppressWarnings(all(is.finite(as.numeric(tib$prop)))) -# if(prop_is_numeric) { -# tib$prop = as.numeric(tib$prop) -# } -# -# uprop = unique(tib$prop) -# clr_map = array(paste0(tib$clr[match(uprop, tib$prop)], "CC"), dimnames=list(uprop)) -# -# p = ggplot(tib, aes(x = x, y = y, label = label, colour = prop)) + -# geom_point() + -# labs( -# title = "", -# x = sprintf("dimension %d (%.1f%%)", dims[1], pca_var[dims[1]] * 100), -# y = sprintf("dimension %d (%.1f%%)", dims[2], pca_var[dims[2]] * 100) -# ) + -# theme_bw() #+ theme(plot.title = element_text(hjust = 0.5)) -# -# # recycle colors from upstream calculation -# if(prop_is_numeric) { -# # for gradient colors, sorting by respective value is important (already converted the values to numeric type a few lines above) -# p = p + -# scale_color_gradientn(name = gsub("_", " ", prop), colours = unique(tib$clr[order(tib$prop)])) + -# theme(legend.text = element_text(angle=90, hjust=0.5, vjust=0.5), -# legend.title = element_text(size=10, face = "bold")) -# } else { -# p = p + scale_colour_manual( -# name = gsub("_", " ", prop), -# values = clr_map, # named array, value=color, name=property -# breaks = names(clr_map), # sort the legend -# aesthetics = "colour") + -# guides(colour = guide_legend(title.position = "top")) + -# theme(legend.text = element_text(size = ifelse(length(clr_map) < 6, -# 10, -# ifelse(length(clr_map) < 10, 8, 6)) ), -# legend.title = element_text(size=10, face = "bold")) -# } -# -# p_labeled = p + ggrepel::geom_text_repel(vjust = -.5, show.legend = FALSE, size = 2) -# -# plotlist[[length(plotlist) + 1]] = p -# plotlist[[length(plotlist) + 1]] = p_labeled -# } -# -# pcaplots[[length(pcaplots) + 1]] = ggarrange(plotlist = plotlist, ncol = 2, nrow = length(plotlist) / 2, common.legend = T, legend = "bottom") -# } -# -# -# ### code snippet for adding PCA on any custom continuous scale. eg, some aspect that reflects sample quality such as; #detect, outliers deviation in retention time, impact on CoV -# # counts = peptides %>% group_by(sample_id) %>% summarise(detect = sum(detect), quant=n()) %>% left_join(samples %>% select(sample_id, shortname), by="sample_id") -# # plotlist = list() -# # for (dims in list(1:2, c(1, 3), 2:3)) { # dims=1:2 -# # tib = tibble(x = mat_pca[, dims[1]], y = mat_pca[, dims[2]], sample_id = rownames(mat_pca)) %>% left_join(counts, by="sample_id") -# # -# # p = ggplot(tib, aes(x = x, y = y, label = shortname, colour = detect)) + -# # geom_point() + -# # scale_color_gradient(name = "detected peptides") + -# # labs( -# # title = "", -# # x = sprintf("dimension %d (%.1f%%)", dims[1], pca_var[dims[1]] * 100), -# # y = sprintf("dimension %d (%.1f%%)", dims[2], pca_var[dims[2]] * 100) -# # ) + -# # theme_bw() + -# # theme(plot.title = element_text(hjust = 0.5), -# # legend.text = element_text(angle=90, hjust=0.5, vjust=0.5)) -# # -# # p_labeled = p + ggrepel::geom_text_repel(show.legend = FALSE, size = 2) -# # -# # plotlist[[length(plotlist) + 1]] = p -# # plotlist[[length(plotlist) + 1]] = p_labeled -# # } -# # pcaplots[[length(pcaplots) + 1]] = ggarrange(plotlist = plotlist, ncol = 2, nrow = length(plotlist) / 2, common.legend = T, legend = "bottom") -# -# return(pcaplots) -# } diff --git a/R/process_peptide_data.R b/R/process_peptide_data.R index 54a1eee..deab7c5 100644 --- a/R/process_peptide_data.R +++ b/R/process_peptide_data.R @@ -22,8 +22,9 @@ peptides_collapse_by_sequence = function(peptides, prop_peptide = "sequence_plai #' @param remove_irt_peptides try to find the irt spike-in peptides in the fasta file header. looking for; |IRT| and IRT_KIT and "Biognosys iRT" (case insensitive). default:FALSE #' @param regular_expression careful here, regular expressions are powerful but complex matching patterns. A 'regex' that is matched against the fasta header(s) of a protein(group). case insensitive #' @param gene_symbols an array of gene symbols that are to be matched against the fasta header(s) of a protein(group). case insensitive +#' @param print_nchar_limit max number of characters for the fasta headers (of removed proteins) that are shown in the log #' @export -remove_proteins_by_name = function(dataset, remove_irt_peptides = FALSE, regular_expression = "", gene_symbols = c()) { +remove_proteins_by_name = function(dataset, remove_irt_peptides = FALSE, regular_expression = "", gene_symbols = c(), print_nchar_limit = 150) { # TODO: input validation # if(remove_not_present_in_fasta) { @@ -41,6 +42,11 @@ remove_proteins_by_name = function(dataset, remove_irt_peptides = FALSE, regular regex_filter = c(regex_filter, "\\|IRT\\||IRT_KIT|Biognosys iRT") } + if(length(print_nchar_limit) != 1 || !is.numeric(print_nchar_limit) || !is.finite(print_nchar_limit) || print_nchar_limit < 25) { + append_log("print_nchar_limit parameter must be a single number (larger than 25)", type = "error") + } + print_nchar_limit = as.integer(ceiling(print_nchar_limit)) + if(!is.na(regular_expression) && regular_expression != "") { regex_filter = c(regex_filter, regular_expression) } @@ -67,7 +73,7 @@ remove_proteins_by_name = function(dataset, remove_irt_peptides = FALSE, regular dataset$proteins = dataset$proteins[!rows_remove, ] dataset$peptides = dataset$peptides %>% filter(isdecoy | protein_id %in% dataset$proteins$protein_id) # optionally, print the fasta headers for proteins that were removed - charlim = 150 + charlim = print_nchar_limit h_too_long = nchar(h) > charlim h[h_too_long] = paste(substr(h[h_too_long], 1, charlim-4), "...") # status report & return results @@ -247,31 +253,139 @@ import_protein_metadata_from_fasta = function(protein_id, fasta_files, fasta_id_ -# TODO -# #' placeholder title -# #' @param peptides todo -# #' @param proteins todo -# rollup_proteins_by_gene = function(peptides, proteins) { -# # convert the 'peptide to protein_id' mapping -->> 'peptide to gene' -# i = match(peptides$protein_id, proteins$protein_id) -# if(any(is.na(i))) { -# append_log("all protein_id from peptide tibble must be in protein tibble", type = "error") -# } -# -# peptides$protein_id = proteins$gene_symbols_or_id[i] -# -# # collapse metadata by unique gene. note the hashtag separation character, so we can still distinguish proteinGroups downstream -# proteins = proteins %>% -# group_by(gene_symbols_or_id) %>% -# summarise( -# accessions = paste(accessions, collapse = "#"), -# fasta_headers = paste(fasta_headers, collapse = "#") -# ) -# # copy gene symbols to protein_id column -# proteins$protein_id = proteins$gene_symbols_or_id -# -# return(list(peptides = peptides, proteins = proteins)) -# } +#' Merge protein identifiers that map to the same unique set of gene symbols +#' +#' @description +#' Suppose there are 3 proteingroups that have the respective unique genes: +#' +#' 1) GRIA2 +#' 2) GRIA2 +#' 3) GRIA2;GRIA1 +#' +#' Then the latter proteingroup is an ambiguous proteingroup where respective peptides were matched to 2 genes +#' (by the upstream software that generated the dataset that you imported with e.g. `import_dataset_diann()`). +#' This function will merge all peptides that belong to proteingroups 1 and 2 into +#' one new proteingroup (that uniquely maps to GRIA2). +#' Proteingroup 3 will remain a distinct proteingroup; only proteingroups with the exact same +#' gene symbol set are matched (technical detail: this uses the definitions in `dataset$proteins$gene_symbols_or_id`). +#' +#' This function will fully update the dataset's protein table and the protein_id column in the peptide table. +#' +#' In most use-cases, this function should be used immediately after `import_fasta()` as shown in the example. +#' +#' @examples \dontrun{ +#' # example for using this function. You'll need to update the file paths accordingly +#' library(msdap) +#' dataset = import_dataset_diann("C:/data/diann_report.tsv") +#' dataset = import_fasta(dataset, files = "C:/data/uniprot_human_2020-01.fasta") +#' dataset = merge_proteingroups_by_gene(dataset) +#' } +#' @param dataset a valid dataset. Prior to calling this function, you must import protein metadata from FASTA using the `import_fasta()` function +#' @export +merge_proteingroups_by_gene = function(dataset) { + + # double-check that a fasta has been loaded, and that the dataset object is compatible with msdap 1.2 or later (i.e. has protein data in current format) + if(!is.data.frame(dataset$proteins) || + !all(c("protein_id", "fasta_headers", "gene_symbols", "gene_symbol_ucount", "gene_symbols_or_id") %in% colnames(dataset$proteins)) || + !any(is.numeric(dataset$proteins$gene_symbol_ucount) & is.finite(dataset$proteins$gene_symbol_ucount) & dataset$proteins$gene_symbol_ucount > 0) + ) { + append_log("dataset is missing essential information in the protein table. No FASTA has been imported for this dataset or the dataset was analyzed with an outdated MS-DAP version.\nTo use this function you won't have to re-run the entire analysis pipeline for this dataset, just run the import_fasta() function to update the current dataset object (assuming this is a dataset searched against a uniprot FASTA)", type = "error") + } + + if(anyNA(dataset$proteins)) { + append_log("NA values are not allowed in dataset$proteins", type = "error") + } + + + ### 1) create mapping table from old proteingroup identifiers to new IDs for the updated groups + + protein_to_gene = dataset$peptides %>% + # count unique peptides per protein_id in dataset$peptides, this dictates the prio/order of uniprot symbols + distinct(protein_id, peptide_id) %>% + count(protein_id) %>% + # after merging proteingroups, the uniprot IDs from the proteingroup with most evidence are in front + arrange(desc(n)) %>% + # add gene symbols (using uniprot ID where gene symbols are absent) + left_join(dataset$proteins %>% select(protein_id, gene_symbols_or_id), by = "protein_id") + + gene_to_new_proteinid__long = protein_to_gene %>% + select(-n) %>% + # unlist the uniprot accessions and find set of unique IDs per gene symbol + # (there may be duplicates across proteingroups, can't trust input data) + mutate(protein_id_new = strsplit(protein_id, " *; *")) %>% + tidyr::unchop(protein_id_new) %>% + distinct(gene_symbols_or_id, protein_id_new) + + gene_to_new_proteinid = gene_to_new_proteinid__long %>% + # collapse into a string: these are the new protein_id for respective gene_symbols_or_id + group_by(gene_symbols_or_id) %>% + summarise(protein_id_new = paste(protein_id_new, collapse = ";"), .groups = "drop") + + # finally, merge both tables and print stats to console + protein_to_new_proteinid = left_join(protein_to_gene, gene_to_new_proteinid, by = "gene_symbols_or_id") + + tmp = sum(protein_to_new_proteinid$protein_id != protein_to_new_proteinid$protein_id_new) + append_log(sprintf("%d / %d (%.1f%%) proteingroups that matched to the same gene symbol(s) were merged", + tmp, nrow(protein_to_new_proteinid), tmp / nrow(protein_to_new_proteinid) * 100), type = "info") + + + ### 2) update dataset$peptides : replace protein_id + + i = data.table::chmatch(dataset$peptides$protein_id, protein_to_new_proteinid$protein_id) + stopifnot(!is.na(i)) + dataset$peptides$protein_id = protein_to_new_proteinid$protein_id_new[i] + + + ### 3) update dataset$proteins : deconstruct and merge protein data + + ## example data for a proteingroup with multiple accessions, and some of which have no gene symbol + # $ protein_id : chr "A0A087WUQ1;B3KPU0;E5RGS7;H7C1D1" + # $ accessions : chr "A0A087WUQ1;B3KPU0;E5RGS7;H7C1D1" + # $ fasta_headers : chr ">tr|A0A087WUQ1|A0A087WUQ1_HUMAN Isoform of Q96JY6, PDZ and LIM domain 2 OS=Homo sapiens OX=9606 GN=PDLIM2 PE=1 "| __truncated__ + # $ gene_symbols : chr "PDLIM2;PDLIM2;-;-" + # $ gene_symbol_ucount: int 1 + # $ gene_symbols_or_id: chr "PDLIM2" + + # first, unpack the current protein data. These columns are semicolon-delimited and assumed to have same length + p = dataset$proteins %>% + select(protein_id, fasta_headers, gene_symbols) %>% + mutate( + protein_id = strsplit(protein_id, ";", fixed = TRUE), + fasta_headers = strsplit(fasta_headers, ";", fixed = TRUE), + gene_symbols = strsplit(gene_symbols, ";", fixed = TRUE) + ) + stopifnot(lengths(p$protein_id) == lengths(p$fasta_headers)) + stopifnot(lengths(p$protein_id) == lengths(p$gene_symbols)) + + p = p %>% + tidyr::unchop(cols = c(protein_id, fasta_headers, gene_symbols)) %>% + # importantly, take unique: the same uniprot ID may be part of multiple proteingroups + distinct(protein_id, .keep_all = TRUE) + + # join the fasta/symbol metadata to the new proteingroup identifiers + newproteins = gene_to_new_proteinid__long %>% + rename(protein_id = protein_id_new) %>% + left_join(p, by = "protein_id") %>% + group_by(gene_symbols_or_id) %>% + summarise( + fasta_headers = paste(fasta_headers, collapse = ";"), + gene_symbols = paste(gene_symbols, collapse = ";"), + ## definition of ucount analogous to import_protein_metadata_from_fasta() + gene_symbol_ucount = n_distinct(gene_symbols[gene_symbols != "-"]) + ) + + dataset$proteins = gene_to_new_proteinid %>% + rename(protein_id = protein_id_new) %>% + mutate(accessions = protein_id) %>% + left_join(newproteins, by = "gene_symbols_or_id") %>% + select(protein_id, accessions, fasta_headers, gene_symbols, gene_symbol_ucount, gene_symbols_or_id) + + # QC: no NA's (i.e. no issues while joining tables) + all updated protein_id in peptide table are present in protein table + stopifnot(!anyNA(dataset$proteins)) + stopifnot(unique(dataset$peptides$protein_id) %in% dataset$proteins$protein_id) + + return(dataset) +} diff --git a/R/sample_metadata.R b/R/sample_metadata.R index 13a4ea8..d0519ff 100644 --- a/R/sample_metadata.R +++ b/R/sample_metadata.R @@ -13,6 +13,7 @@ import_sample_metadata = function(dataset, filename) { dataset = invalidate_cache(dataset) dataset$samples = sample_metadata_from_file(sample_id = unique(dataset$peptides$sample_id), filename = filename) + dataset$samples = tibble::as_tibble(dataset$samples) # add peptide and protein counts to samples tibble, so downstream code includes this in output tables and report figures relating to sample metadata dataset$samples = peptide_and_protein_counts_per_sample(dataset$peptides, dataset$samples, is_dia_dataset(dataset)) return(dataset) @@ -666,6 +667,10 @@ add_contrast = function(dataset, colname_condition_variable, values_condition1, dataset = remove_contrasts(dataset) # inits a new list } + if(!tibble::is_tibble(dataset$samples)) { + dataset$samples = as_tibble(dataset$samples) + } + # validate condition colname valid_sample_metadata_columns = user_provided_metadata(dataset$samples) if(length(colname_condition_variable) != 1 || is.na(colname_condition_variable) || colname_condition_variable == "" || !colname_condition_variable %in% valid_sample_metadata_columns) { diff --git a/R/util_fasta.R b/R/util_fasta.R index 081dfd7..51adf80 100644 --- a/R/util_fasta.R +++ b/R/util_fasta.R @@ -42,6 +42,7 @@ fasta_id_short = function(id, fasta_id_type = "uniprot") { #' #' uniprot headers may contain "GN=-" or no GN tag at all ! #' example; >sp|Q6ZSR9|YJ005_HUMAN Uncharacterized protein FLJ45252 OS=Homo sapiens (Human) OX=9606 GN=- PE=2 SV=2 +#' example; >tr|A2ALT2|A2ALT2_MOUSE Isoform of Q03288, Nonagouti (Fragment) OS=Mus musculus OX=10090 GN=a PE=4 SV=1 #' example; >sp|P15252|REF_HEVBR Rubber elongation factor protein OS=Hevea brasiliensis PE=1 SV=2 #' #' @param x array of fasta headers diff --git a/docker/Dockerfile b/docker/Dockerfile index 7ae73a2..2d9fad7 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -33,8 +33,8 @@ RUN R -e "devtools::install_version('remaCor', '0.0.18', upgrade = 'never', repo ### MS-DAP # from local devtools::build() file RUN mkdir /msdap -COPY temp/msdap_1.2.tar.gz /msdap/ -RUN R -e "devtools::install_local('/msdap/msdap_1.2.tar.gz', upgrade = 'never')" +COPY temp/msdap_1.2.1.tar.gz /msdap/ +RUN R -e "devtools::install_local('/msdap/msdap_1.2.1.tar.gz', upgrade = 'never')" # alternatively, install from github; RUN R -e "devtools::install_github('ftwkoopmans/msdap@1.1', upgrade = 'never')" diff --git a/docker/msdap_launcher_unix.sh b/docker/msdap_launcher_unix.sh index e242917..c136d43 100644 --- a/docker/msdap_launcher_unix.sh +++ b/docker/msdap_launcher_unix.sh @@ -3,7 +3,7 @@ # MS-DAP launch script # https://github.com/ftwkoopmans/msdap -VERSION="1.2" +VERSION="1.2.1" ### OS diff --git a/docker/msdap_launcher_windows.ps1 b/docker/msdap_launcher_windows.ps1 index 0d2eb12..48eee77 100644 --- a/docker/msdap_launcher_windows.ps1 +++ b/docker/msdap_launcher_windows.ps1 @@ -1,7 +1,7 @@ # MS-DAP launch script # https://github.com/ftwkoopmans/msdap -$VERSION = "1.2" +$VERSION = "1.2.1" Write-Host "$((Get-Date).ToString("HH:mm:ss")) - Starting MS-DAP launcher script for version: $VERSION" diff --git a/man/disambiguate_protein_table_by_gene.Rd b/man/disambiguate_protein_table_by_gene.Rd new file mode 100644 index 0000000..ba3dd81 --- /dev/null +++ b/man/disambiguate_protein_table_by_gene.Rd @@ -0,0 +1,33 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/export_stats_genesummary.R +\name{disambiguate_protein_table_by_gene} +\alias{disambiguate_protein_table_by_gene} +\title{given some table with protein-level data, map to HGNC genes and deal with redundant or ambiguous gene-level results} +\usage{ +disambiguate_protein_table_by_gene( + protein_data, + hgnc, + gene_ambiguity, + xref = NULL, + remove_nohgnc = FALSE, + distinct_factors = NULL +) +} +\arguments{ +\item{protein_data}{a data.frame with columns (from dataset$proteins) protein_id, gene_symbols, gene_symbols_or_id. +Importantly, if you opt to return only unique entries per gene then the row of the FIRST matching protein_id is +returned, so sort your data table by pvalue upstream !} + +\item{hgnc}{see \code{export_stats_genesummary()}} + +\item{gene_ambiguity}{see \code{export_stats_genesummary()}} + +\item{xref}{see \code{export_stats_genesummary()}} + +\item{remove_nohgnc}{see \code{export_stats_genesummary()}} + +\item{distinct_factors}{a set of columns in protein_data that should be considered (together with protein_id) factors that describe subsets of data (to return unique gene-level data within)} +} +\description{ +The order of the data table is pivotal: the first row for each unique gene (criteria in parameter \code{gene_ambiguity})) is selected ! +} diff --git a/man/fasta_header_to_gene.Rd b/man/fasta_header_to_gene.Rd index c8d554d..f737efe 100644 --- a/man/fasta_header_to_gene.Rd +++ b/man/fasta_header_to_gene.Rd @@ -18,5 +18,6 @@ example; >sp|Q9Y2S6|TMA7_HUMAN Translation machinery-associated protein 7 OS=Hom \details{ uniprot headers may contain "GN=-" or no GN tag at all ! example; >sp|Q6ZSR9|YJ005_HUMAN Uncharacterized protein FLJ45252 OS=Homo sapiens (Human) OX=9606 GN=- PE=2 SV=2 +example; >tr|A2ALT2|A2ALT2_MOUSE Isoform of Q03288, Nonagouti (Fragment) OS=Mus musculus OX=10090 GN=a PE=4 SV=1 example; >sp|P15252|REF_HEVBR Rubber elongation factor protein OS=Hevea brasiliensis PE=1 SV=2 } diff --git a/man/idmap_symbols.Rd b/man/idmap_symbols.Rd index bd3bc81..a2dcfc4 100644 --- a/man/idmap_symbols.Rd +++ b/man/idmap_symbols.Rd @@ -7,7 +7,7 @@ idmap_symbols(proteins_long, hgnc, use_synonyms = TRUE) } \arguments{ -\item{proteins_long}{long-format data.frame with columns protein_id and symbol pairs (i.e. no semicolon-delimted values)} +\item{proteins_long}{long-format data.frame with columns protein_id and symbol pairs (i.e. no semicolon-delimited values)} \item{hgnc}{HGNC lookup table from \code{hgnc_lookuptable()}} diff --git a/man/idmap_uniprotid.Rd b/man/idmap_uniprotid.Rd index bdebbcc..4b1e171 100644 --- a/man/idmap_uniprotid.Rd +++ b/man/idmap_uniprotid.Rd @@ -7,7 +7,7 @@ idmap_uniprotid(proteins_long, hgnc, idmap, use_symbols = TRUE) } \arguments{ -\item{proteins_long}{long-format data.frame with columns protein_id and symbol pairs (i.e. no semicolon-delimted values)} +\item{proteins_long}{long-format data.frame with columns uniprot_id and symbol pairs (i.e. no semicolon-delimited values)} \item{hgnc}{HGNC lookup table from \code{hgnc_lookuptable()}} diff --git a/man/merge_proteingroups_by_gene.Rd b/man/merge_proteingroups_by_gene.Rd new file mode 100644 index 0000000..85c2b85 --- /dev/null +++ b/man/merge_proteingroups_by_gene.Rd @@ -0,0 +1,39 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/process_peptide_data.R +\name{merge_proteingroups_by_gene} +\alias{merge_proteingroups_by_gene} +\title{Merge protein identifiers that map to the same unique set of gene symbols} +\usage{ +merge_proteingroups_by_gene(dataset) +} +\arguments{ +\item{dataset}{a valid dataset. Prior to calling this function, you must import protein metadata from FASTA using the \code{import_fasta()} function} +} +\description{ +Suppose there are 3 proteingroups that have the respective unique genes: +\enumerate{ +\item GRIA2 +\item GRIA2 +\item GRIA2;GRIA1 +} + +Then the latter proteingroup is an ambiguous proteingroup where respective peptides were matched to 2 genes +(by the upstream software that generated the dataset that you imported with e.g. \code{import_dataset_diann()}). +This function will merge all peptides that belong to proteingroups 1 and 2 into +one new proteingroup (that uniquely maps to GRIA2). +Proteingroup 3 will remain a distinct proteingroup; only proteingroups with the exact same +gene symbol set are matched (technical detail: this uses the definitions in \code{dataset$proteins$gene_symbols_or_id}). + +This function will fully update the dataset's protein table and the protein_id column in the peptide table. + +In most use-cases, this function should be used immediately after \code{import_fasta()} as shown in the example. +} +\examples{ +\dontrun{ +# example for using this function. You'll need to update the file paths accordingly +library(msdap) +dataset = import_dataset_diann("C:/data/diann_report.tsv") +dataset = import_fasta(dataset, files = "C:/data/uniprot_human_2020-01.fasta") +dataset = merge_proteingroups_by_gene(dataset) +} +} diff --git a/man/plot_sample_pca.Rd b/man/plot_sample_pca.Rd index e50f78c..9d0534e 100644 --- a/man/plot_sample_pca.Rd +++ b/man/plot_sample_pca.Rd @@ -11,7 +11,8 @@ plot_sample_pca( samples_colors, sample_label_property = "auto", pch_as_exclude = TRUE, - infer_continuous_scale = TRUE + infer_continuous_scale = TRUE, + pca_dims = list(1:2, c(1, 3), 2:3) ) } \arguments{ @@ -26,6 +27,8 @@ plot_sample_pca( \item{pch_as_exclude}{should exclude samples be represented with a distinct symbol shape? possible values; FALSE, TRUE (default)} \item{infer_continuous_scale}{should the legends infer continuous scale from the data? FALSE = all categorical, TRUE = automatic (default)} + +\item{pca_dims}{a list of integer pairs indicating PCA plots to make} } \description{ placeholder title diff --git a/man/remove_proteins_by_name.Rd b/man/remove_proteins_by_name.Rd index 6fbbaa9..ed5c804 100644 --- a/man/remove_proteins_by_name.Rd +++ b/man/remove_proteins_by_name.Rd @@ -8,7 +8,8 @@ remove_proteins_by_name( dataset, remove_irt_peptides = FALSE, regular_expression = "", - gene_symbols = c() + gene_symbols = c(), + print_nchar_limit = 150 ) } \arguments{ @@ -19,6 +20,8 @@ remove_proteins_by_name( \item{regular_expression}{careful here, regular expressions are powerful but complex matching patterns. A 'regex' that is matched against the fasta header(s) of a protein(group). case insensitive} \item{gene_symbols}{an array of gene symbols that are to be matched against the fasta header(s) of a protein(group). case insensitive} + +\item{print_nchar_limit}{max number of characters for the fasta headers (of removed proteins) that are shown in the log} } \description{ Remove all peptides that match some protein-level filters