From d1018ce7cce3c4cab8896838895fb7c192048632 Mon Sep 17 00:00:00 2001 From: Frank Koopmans Date: Fri, 24 May 2024 21:35:38 +0200 Subject: [PATCH] release 1.0.8 --- DESCRIPTION | 2 +- R/dea.R | 10 ++- R/filter_peptides.R | 33 ++++++--- R/plot_peptide_data.R | 97 +++++++++++++++++++-------- R/plot_retention_time.R | 59 ++++++++++------ R/plot_variance_explained.R | 2 +- R/quickstart.R | 2 +- R/stats_differential_abundance.R | 61 +++++++++++++++-- docker/Dockerfile | 6 +- docker/msdap_launcher_unix.sh | 2 +- docker/msdap_launcher_windows.ps1 | 2 +- man/de_msqrobsum_msqrob.Rd | 8 ++- man/dea_algorithms.Rd | 2 + man/plot_peptide_data.Rd | 58 +++++++++++++--- man/protein_foldchange_from_ebayes.Rd | 25 +++++++ 15 files changed, 285 insertions(+), 84 deletions(-) create mode 100644 man/protein_foldchange_from_ebayes.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 9e27292..3074066 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -3,7 +3,7 @@ Package: msdap Title: Mass Spectrometry Downstream Analysis Pipeline Description: Analyze label-free proteomics datasets from various sources (MaxQuant, Spectronaut, etc) using a pipeline that facilitates peptide filtering and many algorithms for normalization and statistical analysis. A comprehensive PDF report can be generated that includes many data visualizations and documentation thereof. URL: https://github.com/ftwkoopmans/msdap -Version: 1.0.7 +Version: 1.0.8 Authors@R: person(given = "Frank", family = "Koopmans", diff --git a/R/dea.R b/R/dea.R index de02818..93b5155 100644 --- a/R/dea.R +++ b/R/dea.R @@ -17,11 +17,13 @@ #' #' **msqrob**: implementation of the MSqRob package, with minor tweak for (situationally) faster computation (PMID:26566788) . This is a peptide-level DEA algorithm. This method will take provided covariates into account (if any). Implemented in function; \code{de_msqrobsum_msqrob} #' +#' **msqrob_fc**: msqrob estimated protein pvalues, but with log2fc values computed by eBayes. This avoids the excessive shrinkage of protein log2fc values by MSqRob, but does use the MSqRob robust peptide regression model for protein p-value estimates (which outperforms eBayes/DEqMS in ROC analyses on most benchmark datasets, so this method is intended as a "best of both worlds") +#' #' **msqrobsum**: implementation of the MSqRob package (which also features MSqRobSum), with minor tweak for (situationally) faster computation (PMID:32321741) . This is a hybrid peptide&protein-level DEA algorithm that takes peptide-level data as input; it first performs peptide-to-protein rollup, then applies statistics to this protein-level data matrix. This method will take provided covariates into account (if any). Implemented in function; \code{de_msqrobsum_msqrob} #' #' @export dea_algorithms = function() { - return(c("ebayes", "deqms", "msempire", "msqrobsum", "msqrob")) + return(c("ebayes", "deqms", "msempire", "msqrobsum", "msqrob")) # , "msqrob_fc" } @@ -261,9 +263,11 @@ dea = function(dataset, qval_signif = 0.01, fc_signif = 0, dea_algorithm = "deqm # ! 
compared to the other dea algorithms, MS-EmpiRe is not a regression model so we cannot add random variables alg_result = de_msempire(eset_peptides, input_intensities_are_log2 = T) } else if(alg == "msqrob") { - alg_result = de_msqrobsum_msqrob(eset_peptides, use_peptide_model = T, input_intensities_are_log2 = T, random_variables = ranvars) + alg_result = de_msqrobsum_msqrob(eset_peptides, eset_proteins = NULL, use_peptide_model = T, input_intensities_are_log2 = T, random_variables = ranvars, log2fc_without_shrinkage = FALSE) + # } else if(alg == "msqrob_fc") { + # alg_result = de_msqrobsum_msqrob(eset_peptides, eset_proteins = eset_proteins, use_peptide_model = T, input_intensities_are_log2 = T, random_variables = ranvars, log2fc_without_shrinkage = TRUE) } else if(alg == "msqrobsum") { - alg_result = de_msqrobsum_msqrob(eset_peptides, use_peptide_model = F, input_intensities_are_log2 = T, random_variables = ranvars) + alg_result = de_msqrobsum_msqrob(eset_peptides, eset_proteins = NULL, use_peptide_model = F, input_intensities_are_log2 = T, random_variables = ranvars, log2fc_without_shrinkage = FALSE) } else { # for non-hardcoded functions, we call the function requested as a user parameter and pass all available data alg_fun = match.fun(alg) # throws error if not found diff --git a/R/filter_peptides.R b/R/filter_peptides.R index 5a9566e..9ecb393 100644 --- a/R/filter_peptides.R +++ b/R/filter_peptides.R @@ -58,9 +58,13 @@ cache_filtering_data = function(dataset) { p[, key_peptide_group_noexclude := kpg] # detect not as a boolean, but scaled by number of detect per sample # ` * !is.na(key_peptide_group_noexclude)` is simply to remove all data points for samples that are excluded, which allows us to downstream group by key_group and not worry about excluded samples - p[, `:=` (detect_scaled_noexclude = (detect/sum(detect)) * !is.na(key_peptide_group_noexclude), - quant_scaled_noexclude = 1/.N * !is.na(key_peptide_group_noexclude)), + p[, c("detect_scaled_noexclude", "quant_scaled_noexclude") := list( + (detect/sum(detect)) * !is.na(key_peptide_group_noexclude), + 1/.N * !is.na(key_peptide_group_noexclude) ), by = key_sample] + # p[, `:=` (detect_scaled_noexclude = (detect/sum(detect)) * !is.na(key_peptide_group_noexclude), + # quant_scaled_noexclude = 1/.N * !is.na(key_peptide_group_noexclude)), + # by = key_sample] ## efficiently collapse filtering data for each peptide * 'sample group' @@ -78,7 +82,10 @@ cache_filtering_data = function(dataset) { data.table::setkey(dt_pep_group, key_group) # set key for data.table::merge() dt_criteria = data.table::data.table(grps %>% select(key_group, size, size_noexclude) %>% mutate_all(as.integer), key="key_group") dt_pep_group = data.table::merge.data.table(dt_pep_group, dt_criteria, all.x = T, sort = FALSE) - dt_pep_group[ , `:=`(ndetect_fraction = ndetect / size, ndetect_noexclude_fraction = ndetect_noexclude / size_noexclude)] + dt_pep_group[ , c("ndetect_fraction", "ndetect_noexclude_fraction") := list( + ndetect / size, + ndetect_noexclude / size_noexclude)] + # dt_pep_group[ , `:=`(ndetect_fraction = ndetect / size, ndetect_noexclude_fraction = ndetect_noexclude / size_noexclude)] ################# pre-cache CoV @@ -279,19 +286,25 @@ filter_dataset = function(dataset, data.table::setkey(dataset$dt_pep_group, key_group) # set key for data.table::merge() dt_criteria = data.table::data.table(dataset$groups %>% select(key_group, ndetect_min, nquant_min, ndetect_noexclude_min, nquant_noexclude_min) %>% mutate_all(as.integer), key="key_group") 
dt_filter_group = data.table::merge.data.table(dataset$dt_pep_group, dt_criteria, all.x = T, sort = FALSE) - dt_filter_group[ , `:=`(pass = ndetect >= ndetect_min & nquant >= nquant_min, - pass_noexclude = ndetect_noexclude >= ndetect_noexclude_min & nquant_noexclude >= nquant_noexclude_min)] + dt_filter_group[ , c("pass", "pass_noexclude") := list( + ndetect >= ndetect_min & nquant >= nquant_min, + ndetect_noexclude >= ndetect_noexclude_min & nquant_noexclude >= nquant_noexclude_min + )] + # dt_filter_group[ , `:=`(pass = ndetect >= ndetect_min & nquant >= nquant_min, + # pass_noexclude = ndetect_noexclude >= ndetect_noexclude_min & nquant_noexclude >= nquant_noexclude_min)] ## now we know for each peptide whether it passes the filter criteria in each group (with and without taking 'exclude' samples into account) ## wide format for re-use (eg; by-group or all-group) data.table::setkey(dt_filter_group, pass_noexclude) # set key for filter step below; remove all peptide*group that don't pass anywhere to minimize wide-table size dt_filter_group_wide_noexclude = data.table::dcast(dt_filter_group[pass_noexclude>0], key_peptide ~ key_group, value.var = "pass_noexclude", fill = FALSE) - dt_filter_group_wide_noexclude[ , `:=`(ngroup = matrixStats::rowSums2(as.matrix(.SD))), .SDcols=eval(quote(setdiff(colnames(dt_filter_group_wide_noexclude), "key_peptide")))] + dt_filter_group_wide_noexclude[ , ngroup := matrixStats::rowSums2(as.matrix(.SD)), .SDcols=eval(quote(setdiff(colnames(dt_filter_group_wide_noexclude), "key_peptide")))] + # dt_filter_group_wide_noexclude[ , `:=`(ngroup = matrixStats::rowSums2(as.matrix(.SD))), .SDcols=eval(quote(setdiff(colnames(dt_filter_group_wide_noexclude), "key_peptide")))] # analogous data.table::setkey(dt_filter_group, pass) dt_filter_group_wide = data.table::dcast(dt_filter_group[pass>0], key_peptide ~ key_group, value.var = "pass", fill = FALSE) - dt_filter_group_wide[ , `:=`(ngroup = matrixStats::rowSums2(as.matrix(.SD))), .SDcols=eval(quote(setdiff(colnames(dt_filter_group_wide), "key_peptide")))] + dt_filter_group_wide[ , ngroup := matrixStats::rowSums2(as.matrix(.SD)), .SDcols=eval(quote(setdiff(colnames(dt_filter_group_wide), "key_peptide")))] + # dt_filter_group_wide[ , `:=`(ngroup = matrixStats::rowSums2(as.matrix(.SD))), .SDcols=eval(quote(setdiff(colnames(dt_filter_group_wide), "key_peptide")))] if(by_group) { @@ -351,7 +364,8 @@ filter_dataset = function(dataset, grp1_mask = dataset$peptides$key_peptide %in% dt_filter_group_wide_noexclude$key_peptide[ dt_filter_group_wide_noexclude[[as.character(grp1_key)]] ] } else { # if multi-group, select all columns for this group and test that all are true - dt_filter_group_wide_noexclude[, `:=`(temp = matrixStats::rowSums2(as.matrix(.SD))), .SDcols=eval(quote(as.character(grp1_key)))] + dt_filter_group_wide_noexclude[, temp := matrixStats::rowSums2(as.matrix(.SD)), .SDcols=eval(quote(as.character(grp1_key)))] + # dt_filter_group_wide_noexclude[, `:=`(temp = matrixStats::rowSums2(as.matrix(.SD))), .SDcols=eval(quote(as.character(grp1_key)))] grp1_mask = dataset$peptides$key_peptide %in% dt_filter_group_wide_noexclude$key_peptide[ dt_filter_group_wide_noexclude$temp == length(grp1_key) ] } @@ -359,7 +373,8 @@ filter_dataset = function(dataset, grp2_mask = dataset$peptides$key_peptide %in% dt_filter_group_wide_noexclude$key_peptide[ dt_filter_group_wide_noexclude[[as.character(grp2_key)]] ] } else { # if multi-group, select all columns for this group and test that all are true - dt_filter_group_wide_noexclude[, 
`:=`(temp = matrixStats::rowSums2(as.matrix(.SD))), .SDcols=eval(quote(as.character(grp2_key)))] + dt_filter_group_wide_noexclude[, temp := matrixStats::rowSums2(as.matrix(.SD)), .SDcols=eval(quote(as.character(grp2_key)))] + # dt_filter_group_wide_noexclude[, `:=`(temp = matrixStats::rowSums2(as.matrix(.SD))), .SDcols=eval(quote(as.character(grp2_key)))] grp2_mask = dataset$peptides$key_peptide %in% dt_filter_group_wide_noexclude$key_peptide[ dt_filter_group_wide_noexclude$temp == length(grp2_key) ] } diff --git a/R/plot_peptide_data.R b/R/plot_peptide_data.R index 5a75c33..ce6e633 100644 --- a/R/plot_peptide_data.R +++ b/R/plot_peptide_data.R @@ -1,17 +1,57 @@ #' plot peptide-level data #' -#' @param dataset todo -#' @param select_all_proteins todo -#' @param select_diffdetect_candidates todo -#' @param select_dea_signif todo -#' @param filter_protein_ids todo -#' @param filter_genes todo -#' @param output_dir todo -#' @param show_unused_datapoints todo -#' @param norm_algorithm todo +#' @examples \dontrun{ +#' # example 1: +#' # +#' # plot all significant proteins found by DEA +#' plot_peptide_data( dataset, select_dea_signif = TRUE, +#' # should match analysis_quickstart() parameters ! +#' norm_algorithm = c("vwmb", "modebetween_protein"), +#' show_unused_datapoints = TRUE, output_dir = "C:/temp") +#' +#' # example 2: +#' # filter the DEA results for some specific proteins, then plot these +#' pid = dataset$de_proteins %>% +#' filter(dea_algorithm == "deqms" & qvalue < 10^-4) %>% +#' pull(protein_id) +#' plot_peptide_data( +#' dataset, filter_protein_ids = pid, +#' # should match analysis_quickstart() parameters ! +#' norm_algorithm = c("vwmb", "modebetween_protein"), +#' show_unused_datapoints = TRUE, output_dir = "C:/temp") +#' +#' # example 3: +#' # plot specific proteins by providing gene symbols (not case sensitive) +#' plot_peptide_data( +#' dataset, filter_genes = c("shisa6", "gria2"), +#' # should match analysis_quickstart() parameters ! +#' norm_algorithm = c("vwmb", "modebetween_protein"), +#' show_unused_datapoints = TRUE, output_dir = "C:/temp") +#' } +#' @param dataset your dataset +#' @param select_all_proteins plot all proteins. This will take a long time. Default: FALSE +#' @param select_diffdetect_candidates if "differential detection" was enabled (it is by default in `analysis_quickstart`), plot candidate proteins based on their z-scores. +#' Valid values are positive numeric values that are used as absolute z-score cutoffs (4 or 5 are typically reasonable; use `plot_differential_detect` to visualize the score distributions). +#' This selects all proteins that meet this criterion in any contrast, and plots these proteins in every contrast (including those where the z-score is below the given threshold). +#' Alternatively, set to TRUE to use an absolute z-score cutoff value of 4. Default: NA (disabled) +#' @param select_dea_signif plot all significant proteins from DEA? +#' This effectively checks the "signif" column in the `dataset$de_proteins` data table (DEA results generated by MS-DAP), finds all proteins that are signif in any contrast, and plots these proteins in every contrast (including those where it is not signif). +#' Boolean parameter, default: FALSE +#' @param filter_protein_ids a vector of protein_id values that should be plotted. 
For example, you can find specific protein_id values in the differential expression result table and provide only these as a parameter +#' (in this case, set `select_dea_signif=FALSE` to ensure only your specific protein identifiers are plotted) +#' @param filter_genes a vector of gene symbols to plot (alternative for filter_protein_ids) +#' @param output_dir full path to the output directory +#' @param show_unused_datapoints should data points be shown for peptides that were not used in the differential expression analyses? i.e. peptides that did not pass the filtering prior to DEA (found in N samples in both experimental conditions). Boolean value, default: FALSE +#' @param norm_algorithm normalization algorithms. This should be exactly the same as the parameter you provided to `analysis_quickstart` #' @export -plot_peptide_data = function(dataset, select_all_proteins = FALSE, select_diffdetect_candidates = TRUE, select_dea_signif = TRUE, filter_protein_ids = NA, filter_genes = NA, output_dir, show_unused_datapoints = FALSE, norm_algorithm = NA) { +plot_peptide_data = function(dataset, select_all_proteins = FALSE, select_diffdetect_candidates = NA, select_dea_signif = FALSE, filter_protein_ids = NA, filter_genes = NA, output_dir, show_unused_datapoints = FALSE, norm_algorithm = NA) { + if(length(select_diffdetect_candidates) == 1 && select_diffdetect_candidates %in% TRUE) { + select_diffdetect_candidates = 4 # default z-score + } + if(length(select_diffdetect_candidates) != 1 || !is.numeric(select_diffdetect_candidates) || select_diffdetect_candidates <= 0) { + select_diffdetect_candidates = NA + } pid = NULL ## if requested to plot all proteins, just take the unique set from peptide tibble and don't bother with other selection parameters @@ -40,11 +80,11 @@ plot_peptide_data = function(dataset, select_all_proteins = FALSE, select_diffde pid = c(pid, dataset$de_proteins %>% filter(signif) %>% distinct(protein_id) %>% pull()) } - if(select_diffdetect_candidates) { + if(!is.na(select_diffdetect_candidates)) { if(!"dd_proteins" %in% names(dataset)) { append_log("you requested to plot proteins tested by differential detection counting, but differential detection has not been applied to this dataset yet (did you run the analysis_quickstart() function yet? 
Alternatively you can simply run differential_detect() and review its outcome with print_dataset_summary() to quickly prepare this selection)", type = "error") } - pid = c(pid, dataset$dd_proteins %>% filter(type == "detect" & is.finite(zscore) & abs(zscore) >= 4) %>% distinct(protein_id) %>% pull()) + pid = c(pid, dataset$dd_proteins %>% filter(type == "detect" & is.finite(zscore) & abs(zscore) >= select_diffdetect_candidates) %>% distinct(protein_id) %>% pull()) } } @@ -71,7 +111,7 @@ plot_peptide_data = function(dataset, select_all_proteins = FALSE, select_diffde #' @importFrom stringr str_trunc #' @importFrom foreach foreach #' @importFrom parallel stopCluster clusterExport -plot_peptide_data_by_contrast_to_pdf = function(dataset, filter_protein_id, output_dir, show_unused_datapoints = FALSE, norm_algorithm=NA, rollup_algorithm = "maxlfq") { +plot_peptide_data_by_contrast_to_pdf = function(dataset, filter_protein_id, output_dir, show_unused_datapoints = FALSE, norm_algorithm = NA, rollup_algorithm = "maxlfq") { filter_protein_id = unique(na.omit(filter_protein_id)) if(length(filter_protein_id) == 0) { append_log("no valid protein identifiers were provided", type = "error") @@ -203,22 +243,23 @@ plot_peptide_data_by_contrast_to_pdf = function(dataset, filter_protein_id, outp if("dd_proteins" %in% names(dataset)) { tib_dd = dataset$dd_proteins %>% filter(protein_id %in% proteins_contr$protein_id & contrast == contr & is.finite(zscore)) + if(nrow(tib_dd) > 0) { + # infer order in which to plot proteins + contr_protein_id__ordered = c(contr_protein_id__ordered, tib_dd %>% arrange(desc(abs(zscore))) %>% pull(protein_id)) - # infer order in which to plot proteins - contr_protein_id__ordered = c(contr_protein_id__ordered, tib_dd %>% arrange(desc(abs(zscore))) %>% pull(protein_id)) - - # gather differential detect scores in expected format - tib_dd_long = tib_dd %>% select(protein_id, foldchange = zscore) %>% mutate(algorithm = paste0("zscore", type, "counts")) + # gather differential detect scores in expected format + tib_dd_long = tib_dd %>% select(protein_id, type, foldchange = zscore) %>% mutate(algorithm = paste0("zscore-", type, "-counts")) - # pretty-print format for downstream visualization - tib_dd_long = tib_dd_long %>% - mutate(foldchange = sprintf("%.2f", foldchange), - pvalue="", - qvalue="", - signif="") %>% - select(protein_id, algorithm, foldchange, pvalue, qvalue, signif) - - de_proteins_contr = bind_rows(de_proteins_contr, tib_dd_long) + # pretty-print format for downstream visualization + tib_dd_long = tib_dd_long %>% + mutate(foldchange = sprintf("%.2f", foldchange), + pvalue="", + qvalue="", + signif="") %>% + select(protein_id, algorithm, foldchange, pvalue, qvalue, signif) + + de_proteins_contr = bind_rows(de_proteins_contr, tib_dd_long) + } } @@ -393,7 +434,7 @@ reintroduce_filtered_intensities = function(dataset, norm_algorithm, rollup_algo # x_ref[rows_sid] = intensity values in sample s for only the subset of peptides that pass user's filtering criteria, that were subsequently normalized # if we want to reintroduce intensity values from the unfiltered dataset, where normalization was applied to the total set of peptides, there has to be some variation between the filtered and unfiltered data ! 
# so we here scale the abundances of the 'unfiltered data' to the 'filtered data' - x[rows_sid] = x[rows_sid] + get_mode(x[rows_sid] - x_ref[rows_sid]) + x[rows_sid] = x[rows_sid] + suppressWarnings(get_mode(x[rows_sid] - x_ref[rows_sid])) } } diff --git a/R/plot_retention_time.R b/R/plot_retention_time.R index dc94fb5..1d729cf 100644 --- a/R/plot_retention_time.R +++ b/R/plot_retention_time.R @@ -30,8 +30,12 @@ plot_retention_time_v2 = function(peptides, samples, isdia) { peptides$temp_int = peptides$intensity_qc_basic # use data.tables to summarize the data (reasonably fast, don't need to convert to wide) - DT = data.table::setDT(peptides)[ , `:=` (temp_rt_diff = rt - stats::median(temp_rt_nodetect, na.rm=T), - temp_int_diff_overall = temp_int - stats::median(temp_int, na.rm=T)), by=key_peptide][ , temp_int_diff := (temp_int - mean(temp_int, na.rm=T)), by=key_peptide_group] + DT = data.table::setDT(peptides)[ , c("temp_rt_diff", "temp_int_diff_overall") := list( + rt - stats::median(temp_rt_nodetect, na.rm=T), + temp_int - stats::median(temp_int, na.rm=T)), + by = key_peptide][ , temp_int_diff := (temp_int - mean(temp_int, na.rm=T)), by=key_peptide_group] + # DT = data.table::setDT(peptides)[ , `:=` (temp_rt_diff = rt - stats::median(temp_rt_nodetect, na.rm=T), + # temp_int_diff_overall = temp_int - stats::median(temp_int, na.rm=T)), by=key_peptide][ , temp_int_diff := (temp_int - mean(temp_int, na.rm=T)), by=key_peptide_group] ## DEBUG: remove data in a few RT bins to simulate spray issues -->> how does the plot hold up? # DT$rt[DT$key_sample == 2 & DT$rt > 50 & DT$rt <60 ] = NA; print("********* manually discarding data for code debugging") @@ -79,7 +83,8 @@ plot_retention_time_v2 = function(peptides, samples, isdia) { # we created some temp columns by reference, no longer needed as we are working with binned data from here on - DT[ ,`:=`(temp_rt_nodetect = NULL, temp_int = NULL, temp_rt_diff = NULL, temp_int_diff_overall = NULL, temp_int_diff = NULL, temp_rt_bin = NULL, temp_rt_bin_mean = NULL)] + DT[ , c("temp_rt_nodetect", "temp_int", "temp_rt_diff", "temp_int_diff_overall", "temp_int_diff", "temp_rt_bin", "temp_rt_bin_mean") := list(NULL, NULL, NULL, NULL, NULL, NULL, NULL)] + # DT[ ,`:=`(temp_rt_nodetect = NULL, temp_int = NULL, temp_rt_diff = NULL, temp_int_diff_overall = NULL, temp_int_diff = NULL, temp_rt_bin = NULL, temp_rt_bin_mean = NULL)] # force the creation of empty bins for 'missing bins'. 
eg; if in some sample at some RT interval no peptides were observed (that pass qc peptide filter), these bins are empty @@ -108,22 +113,38 @@ plot_retention_time_v2 = function(peptides, samples, isdia) { DT_binned[ , size_overall_median := stats::median(size, na.rm=T), by=rt_bin] # smoothed data - DT_binned[ ,`:=`(rt_quantup_smooth = smooth_loess_custom(rt_bin, rt_quantup, span=.1), - rt_quantlow_smooth = smooth_loess_custom(rt_bin, rt_quantlow, span=.1), - rt_median_smooth = smooth_loess_custom(rt_bin, rt_median, span=.1), - - int_quantup_smooth = smooth_loess_custom(rt_bin, int_quantup, span=.1), - int_quantlow_smooth = smooth_loess_custom(rt_bin, int_quantlow, span=.1), - int_median_smooth = smooth_loess_custom(rt_bin, int_median, span=.1), - - int_quantup_smooth_overall = smooth_loess_custom(rt_bin, int_quantup_overall, span=.1), - int_quantlow_smooth_overall = smooth_loess_custom(rt_bin, int_quantlow_overall, span=.1), - int_median_smooth_overall = smooth_loess_custom(rt_bin, int_median_overall, span=.1), - - size_smooth = smooth_loess_custom(rt_bin, size, span=.1), - size_overall_median_smooth = smooth_loess_custom(rt_bin, size_overall_median, span=.1) - ), - by = key_sample] + DT_binned[ , c("rt_quantup_smooth", "rt_quantlow_smooth", "rt_median_smooth", "int_quantup_smooth", "int_quantlow_smooth", "int_median_smooth", "int_quantup_smooth_overall", "int_quantlow_smooth_overall", "int_median_smooth_overall", "size_smooth", "size_overall_median_smooth") := list( + smooth_loess_custom(rt_bin, rt_quantup, span=.1), + smooth_loess_custom(rt_bin, rt_quantlow, span=.1), + smooth_loess_custom(rt_bin, rt_median, span=.1), + + smooth_loess_custom(rt_bin, int_quantup, span=.1), + smooth_loess_custom(rt_bin, int_quantlow, span=.1), + smooth_loess_custom(rt_bin, int_median, span=.1), + + smooth_loess_custom(rt_bin, int_quantup_overall, span=.1), + smooth_loess_custom(rt_bin, int_quantlow_overall, span=.1), + smooth_loess_custom(rt_bin, int_median_overall, span=.1), + + smooth_loess_custom(rt_bin, size, span=.1), + smooth_loess_custom(rt_bin, size_overall_median, span=.1) + ), by = key_sample] + # DT_binned[ ,`:=`(rt_quantup_smooth = smooth_loess_custom(rt_bin, rt_quantup, span=.1), + # rt_quantlow_smooth = smooth_loess_custom(rt_bin, rt_quantlow, span=.1), + # rt_median_smooth = smooth_loess_custom(rt_bin, rt_median, span=.1), + # + # int_quantup_smooth = smooth_loess_custom(rt_bin, int_quantup, span=.1), + # int_quantlow_smooth = smooth_loess_custom(rt_bin, int_quantlow, span=.1), + # int_median_smooth = smooth_loess_custom(rt_bin, int_median, span=.1), + # + # int_quantup_smooth_overall = smooth_loess_custom(rt_bin, int_quantup_overall, span=.1), + # int_quantlow_smooth_overall = smooth_loess_custom(rt_bin, int_quantlow_overall, span=.1), + # int_median_smooth_overall = smooth_loess_custom(rt_bin, int_median_overall, span=.1), + # + # size_smooth = smooth_loess_custom(rt_bin, size, span=.1), + # size_overall_median_smooth = smooth_loess_custom(rt_bin, size_overall_median, span=.1) + # ), + # by = key_sample] # catch smoothing problems for bin size (eg; empty bins) DT_binned$size_smooth[!is.finite(DT_binned$size) | DT_binned$size <= 0] = 0 diff --git a/R/plot_variance_explained.R b/R/plot_variance_explained.R index 5bca804..05705f7 100644 --- a/R/plot_variance_explained.R +++ b/R/plot_variance_explained.R @@ -107,7 +107,7 @@ plot_variance_explained = function(dataset, cols_metadata = NULL, rollup_algorit } if(length(cols_na_repaired) > 0) { - append_log(paste0("replaced NA values in these sample 
metadata columns; ", paste(cols_na_repaired, collapse = ","), "\nfor numeric columns the mean value was imputed, for other column types the missing values were replaced by the string 'NA' (e.g. a boolean sample metadata column with missing values would become a factor with levels 'TRUE'/'FALSE'/'NA')"), type = "warning") + append_log(paste0("plot_variance_explained: replaced NA values in these sample metadata columns; ", paste(cols_na_repaired, collapse = ","), "\nfor numeric columns the mean value was imputed, for other column types the missing values were replaced by the string 'NA' (e.g. a boolean sample metadata column with missing values would become a factor with levels 'TRUE'/'FALSE'/'NA')"), type = "warning") } diff --git a/R/quickstart.R b/R/quickstart.R index 6489fdd..8596c0b 100644 --- a/R/quickstart.R +++ b/R/quickstart.R @@ -229,7 +229,7 @@ analysis_quickstart = function( # facilitate multiprocessing, this is only used for our custom implementation of msqrob at the moment - if(any(c("msqrob", "msqrobsum") %in% dea_algorithm)) { + if(any(grepl("msqrob", dea_algorithm, ignore.case = T))) { # speed up your analysis by using multiple processor cores (default; all cores but one) cl <<- initialize_multiprocessing(multiprocessing_maxcores) # finally, shut down multithreading clusters. use on.exit to execute regardless of downstream errors diff --git a/R/stats_differential_abundance.R b/R/stats_differential_abundance.R index 04cfc1a..27a6641 100644 --- a/R/stats_differential_abundance.R +++ b/R/stats_differential_abundance.R @@ -49,8 +49,8 @@ de_ebayes = function(eset_proteins, input_intensities_are_log2 = TRUE, random_va # convert from data.frame to a tibble that follows the column names/format we expect downstream result = as_tibble(result) %>% - mutate(protein_id = rownames(x)) %>% - select(protein_id, pvalue = P.Value, qvalue = adj.P.Val, foldchange.log2 = logFC, effectsize, tstatistic = t, standarddeviation, standarderror) %>% + mutate(protein_id = rownames(x)) %>% + select(protein_id, pvalue = P.Value, qvalue = adj.P.Val, foldchange.log2 = logFC, effectsize, tstatistic = t, standarddeviation, standarderror) %>% add_column(dea_algorithm = "ebayes") # note; we added condition to random variables above, strip it from result table for consistency within this R package ('condition' is always assumed to be a regression variable) @@ -153,7 +153,7 @@ de_deqms = function(eset_proteins, input_intensities_are_log2 = TRUE, random_var suppressWarnings(DEqMS::VarianceBoxplot(fit, n=20, main = "DEqMS QC plot", xlab="#unique peptides per protein")) } - ## analogous to our do_ebayes() implementation, extract results from the fit object but grab the DEqMS specific output columns where available (reference; DEqMS::outputResult() ) + ## analogous to our de_ebayes() implementation, extract results from the fit object but grab the DEqMS specific output columns where available (reference; DEqMS::outputResult() ) coef_col = 2 # !! 
sort.by="none" keeps the output table aligned with input matrix result = suppressMessages(limma::topTable(fit, number = length(eset_proteins__protein_id), coef = coef_col, adjust.method = "fdr", sort.by = "none", confint = TRUE)) @@ -217,7 +217,7 @@ de_ebayes_fit = function(x, random_variables) { all( rownames(random_variables) == as.character(1:nrow(random_variables)) ) || all( rownames(random_variables) == colnames(x) ) )) { - append_log("input matrix x and random_variables seem misaligned; the column names of x versus rownames of variables do not match", type = "error") + append_log("input matrix x and random_variables seem misaligned; the column names of x versus rownames of variables do not match", type = "error") } if(is.vector(random_variables) || is.array(random_variables)) { @@ -349,16 +349,19 @@ de_msempire = function(eset, input_intensities_are_log2 = F) { #' ref; https://github.com/statOmics/msqrob #' #' @param eset a Biobase ExpressionSet that contains the peptide intensities. required attributes; fData(): protein_id and pData(): condition +#' @param eset_proteins only required if `log2fc_without_shrinkage == TRUE` #' @param use_peptide_model if true, apply msqrob. if false, apply msqrobsum #' @param input_intensities_are_log2 whether the provided intensities are already log2 transformed #' @param protein_rollup_robust for msqrobsum analyses, whether to use 'robust' protein rollup (TRUE) or traditional peptide intensity summation (FALSE) #' @param random_variables a vector of column names in your sample metadata table that are added as additional(!) regression terms +#' @param log2fc_without_shrinkage boolean parameter, if FALSE (default) returns msqrob estimated log2 foldchanges. if TRUE, uses `protein_foldchange_from_ebayes()` instead (affecting log2fc and effectsize results) #' #' @importFrom Biobase exprs pData fData #' @importFrom MSnbase as.MSnSet.ExpressionSet combineFeatures #' @export -de_msqrobsum_msqrob = function(eset, use_peptide_model = T, input_intensities_are_log2 = T, protein_rollup_robust = T, random_variables = NULL) { +de_msqrobsum_msqrob = function(eset, eset_proteins = NULL, use_peptide_model = T, input_intensities_are_log2 = T, protein_rollup_robust = T, random_variables = NULL, log2fc_without_shrinkage = FALSE) { start_time = Sys.time() + if(log2fc_without_shrinkage && is.null(eset_proteins)) stop("if log2fc_without_shrinkage is set, provide protein eset") # strip forbidden random variables and compose updated formula random_variables = setdiff(random_variables, c("peptide_id", "protein_id", "sample_id", "condition")) @@ -435,6 +438,15 @@ de_msqrobsum_msqrob = function(eset, use_peptide_model = T, input_intensities_ar dplyr::select(protein_id, contrasts) %>% tidyr::unnest(cols = contrasts) + # test: optional log2fc estimates without shrinkage + if(log2fc_without_shrinkage) { + df_fc = protein_foldchange_from_ebayes(eset_proteins, input_intensities_are_log2 = TRUE, random_variables = random_variables) + i = match(as.character(result_unpacked$protein_id), df_fc$protein_id) + stopifnot(!is.na(i)) + result_unpacked$logFC = df_fc$foldchange.log2[i] + algorithm_name = paste0(algorithm_name, "_fc") + } + result_unpacked$sigma_post = result_unpacked$se / result_unpacked$sigma_contrast result_unpacked$effectsize = result_unpacked$logFC / result_unpacked$sigma_post @@ -450,3 +462,42 @@ de_msqrobsum_msqrob = function(eset, use_peptide_model = T, input_intensities_ar append_log_timestamp(algorithm_name, start_time) return(result) } + + + +#' Compute protein log2 
foldchanges from protein-level eBayes (i.e. simple, no shrinkage) +#' +#' implementation analogous to `de_ebayes()` +#' +#' @param eset a Biobase ExpressionSet that contains the protein intensities. required attributes; fData(): protein_id and pData(): condition +#' @param input_intensities_are_log2 whether the provided intensities are already log2 transformed +#' @param random_variables a vector of column names in your sample metadata table that are added as additional(!) regression terms +#' @return data.frame with columns protein_id and foldchange.log2 +protein_foldchange_from_ebayes = function(eset, input_intensities_are_log2 = TRUE, random_variables = NULL) { + if(length(random_variables) > 0 && (!is.vector(random_variables) || + any(is.na(random_variables) | !is.character(random_variables)))) { + append_log(paste("random_variables must either be NULL or character vector", + paste(random_variables, collapse = ", ")), type = "error") + } + if(!input_intensities_are_log2) { + x = log2(Biobase::exprs(eset)) + x[!is.finite(x)] = NA + Biobase::exprs(eset) = x + } + random_variables = unique(c("condition", random_variables)) + df_metadata = Biobase::pData(eset) + if(!all(random_variables %in% colnames(df_metadata))) { + append_log(paste("eset ExpressionSet pData() must contain columns 'condition' and all additional random variables requested in `random_variables` parameter. missing:", + paste(setdiff(random_variables, colnames(df_metadata)), + collapse = ", ")), type = "error") + } + x = Biobase::exprs(eset) + df_metadata = df_metadata[, random_variables, drop = F] + + # re-use our limma wrapper to apply eBayes() + fit = de_ebayes_fit(x, random_variables = df_metadata) + + ## analogous to our de_ebayes() implementation, extract results from the fit object + coef_col = 2 + data.frame(protein_id = rownames(x), foldchange.log2 = fit$coefficients[,coef_col]) +} diff --git a/docker/Dockerfile b/docker/Dockerfile index d6aba45..2e4a6ca 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -33,9 +33,9 @@ RUN R -e "devtools::install_version('remaCor', '0.0.18', upgrade = 'never', repo ### MS-DAP # from local devtools::build() file RUN mkdir /msdap -COPY temp/msdap_1.0.7.tar.gz /msdap/ -RUN R -e "devtools::install_local('/msdap/msdap_1.0.7.tar.gz', upgrade = 'never')" -# alternatively, install from github; RUN R -e "devtools::install_github('ftwkoopmans/msdap@1.0.7', upgrade = 'never')" +COPY temp/msdap_1.0.8.tar.gz /msdap/ +RUN R -e "devtools::install_local('/msdap/msdap_1.0.8.tar.gz', upgrade = 'never')" +# alternatively, install from github; RUN R -e "devtools::install_github('ftwkoopmans/msdap@1.0.8', upgrade = 'never')" ### finish setting up diff --git a/docker/msdap_launcher_unix.sh b/docker/msdap_launcher_unix.sh index 4b69e3f..44c1c54 100644 --- a/docker/msdap_launcher_unix.sh +++ b/docker/msdap_launcher_unix.sh @@ -3,7 +3,7 @@ # MS-DAP launch script # https://github.com/ftwkoopmans/msdap -VERSION="1.0.7" +VERSION="1.0.8" ### OS diff --git a/docker/msdap_launcher_windows.ps1 b/docker/msdap_launcher_windows.ps1 index 5562e3b..a5133b0 100644 --- a/docker/msdap_launcher_windows.ps1 +++ b/docker/msdap_launcher_windows.ps1 @@ -1,7 +1,7 @@ # MS-DAP launch script # https://github.com/ftwkoopmans/msdap -$VERSION = "1.0.7" +$VERSION = "1.0.8" Write-Host "$((Get-Date).ToString("HH:mm:ss")) - Starting MS-DAP launcher script for version: $VERSION" diff --git a/man/de_msqrobsum_msqrob.Rd b/man/de_msqrobsum_msqrob.Rd index f97d568..ecf5896 100644 --- a/man/de_msqrobsum_msqrob.Rd +++ 
b/man/de_msqrobsum_msqrob.Rd @@ -6,15 +6,19 @@ \usage{ de_msqrobsum_msqrob( eset, + eset_proteins = NULL, use_peptide_model = T, input_intensities_are_log2 = T, protein_rollup_robust = T, - random_variables = NULL + random_variables = NULL, + log2fc_without_shrinkage = FALSE ) } \arguments{ \item{eset}{a Biobase ExpressionSet that contains the peptide intensities. required attributes; fData(): protein_id and pData(): condition} +\item{eset_proteins}{only required if \code{log2fc_without_shrinkage == TRUE}} + \item{use_peptide_model}{if true, apply msqrob. if false, apply msqrobsum} \item{input_intensities_are_log2}{whether the provided intensities are already log2 transformed} @@ -22,6 +26,8 @@ de_msqrobsum_msqrob( \item{protein_rollup_robust}{for msqrobsum analyses, whether to use 'robust' protein rollup (TRUE) or traditional peptide intensity summation (FALSE)} \item{random_variables}{a vector of column names in your sample metadata table that are added as additional(!) regression terms} + +\item{log2fc_without_shrinkage}{boolean parameter, if FALSE (default) returns msqrob estimated log2 foldchanges. if TRUE, uses \code{protein_foldchange_from_ebayes()} instead (affecting log2fc and effectsize results)} } \description{ note that this is the final version of this algorithm provided by its authors, and is no longer maintained; https://github.com/statOmics/msqrob diff --git a/man/dea_algorithms.Rd b/man/dea_algorithms.Rd index c238134..40190b1 100644 --- a/man/dea_algorithms.Rd +++ b/man/dea_algorithms.Rd @@ -22,6 +22,8 @@ This is in line with typical usage of moderated t-statistics in proteomics (e.g. \strong{msqrob}: implementation of the MSqRob package, with minor tweak for (situationally) faster computation (PMID:26566788) \url{https://github.com/statOmics/msqrob}. This is a peptide-level DEA algorithm. This method will take provided covariates into account (if any). Implemented in function; \code{de_msqrobsum_msqrob} +\strong{msqrob_fc}: msqrob estimated protein pvalues, but with log2fc values computed by eBayes. This avoids the excessive shrinkage of protein log2fc values by MSqRob, but does use the MSqRob robust peptide regression model for protein p-value estimates (which outperforms eBayes/DEqMS in ROC analyses on most benchmark datasets, so this method is intended as a "best of both worlds") + \strong{msqrobsum}: implementation of the MSqRob package (which also features MSqRobSum), with minor tweak for (situationally) faster computation (PMID:32321741) \url{https://github.com/statOmics/msqrob}. This is a hybrid peptide&protein-level DEA algorithm that takes peptide-level data as input; it first performs peptide-to-protein rollup, then applies statistics to this protein-level data matrix. This method will take provided covariates into account (if any). Implemented in function; \code{de_msqrobsum_msqrob} } } diff --git a/man/plot_peptide_data.Rd b/man/plot_peptide_data.Rd index c3d922e..021954a 100644 --- a/man/plot_peptide_data.Rd +++ b/man/plot_peptide_data.Rd @@ -7,8 +7,8 @@ plot_peptide_data( dataset, select_all_proteins = FALSE, - select_diffdetect_candidates = TRUE, - select_dea_signif = TRUE, + select_diffdetect_candidates = NA, + select_dea_signif = FALSE, filter_protein_ids = NA, filter_genes = NA, output_dir, @@ -17,24 +17,60 @@ plot_peptide_data( ) } \arguments{ -\item{dataset}{todo} +\item{dataset}{your dataset} -\item{select_all_proteins}{todo} +\item{select_all_proteins}{plot all proteins. This will take a long time. 
Default: FALSE} -\item{select_diffdetect_candidates}{todo} +\item{select_diffdetect_candidates}{if "differential detection" was enabled (it is by default in \code{analysis_quickstart}), plot candidate proteins based on their z-scores. +Valid values are positive numeric values that are used as absolute z-score cutoffs (4 or 5 are typically reasonable; use \code{plot_differential_detect} to visualize the score distributions). +This selects all proteins that meet this criterion in any contrast, and plots these proteins in every contrast (including those where the z-score is below the given threshold). +Alternatively, set to TRUE to use an absolute z-score cutoff value of 4. Default: NA (disabled)} -\item{select_dea_signif}{todo} +\item{select_dea_signif}{plot all significant proteins from DEA? +This effectively checks the "signif" column in the \code{dataset$de_proteins} data table (DEA results generated by MS-DAP), finds all proteins that are signif in any contrast, and plots these proteins in every contrast (including those where it is not signif). +Boolean parameter, default: FALSE} -\item{filter_protein_ids}{todo} +\item{filter_protein_ids}{a vector of protein_id values that should be plotted. For example, you can find specific protein_id values in the differential expression result table and provide only these as a parameter +(in this case, set \code{select_dea_signif=FALSE} to ensure only your specific protein identifiers are plotted)} -\item{filter_genes}{todo} +\item{filter_genes}{a vector of gene symbols to plot (alternative for filter_protein_ids)} -\item{output_dir}{todo} +\item{output_dir}{full path to the output directory} -\item{show_unused_datapoints}{todo} +\item{show_unused_datapoints}{should data points be shown for peptides that were not used in the differential expression analyses? i.e. peptides that did not pass the filtering prior to DEA (found in N samples in both experimental conditions). Boolean value, default: FALSE} -\item{norm_algorithm}{todo} +\item{norm_algorithm}{normalization algorithms. This should be exactly the same as the parameter you provided to \code{analysis_quickstart}} } \description{ plot peptide-level data } +\examples{ +\dontrun{ +# example 1: +# +# plot all significant proteins found by DEA +plot_peptide_data( dataset, select_dea_signif = TRUE, + # should match analysis_quickstart() parameters ! + norm_algorithm = c("vwmb", "modebetween_protein"), + show_unused_datapoints = TRUE, output_dir = "C:/temp") + +# example 2: +# filter the DEA results for some specific proteins, then plot these +pid = dataset$de_proteins \%>\% + filter(dea_algorithm == "deqms" & qvalue < 10^-4) \%>\% + pull(protein_id) +plot_peptide_data( + dataset, filter_protein_ids = pid, + # should match analysis_quickstart() parameters ! + norm_algorithm = c("vwmb", "modebetween_protein"), + show_unused_datapoints = TRUE, output_dir = "C:/temp") + +# example 3: +# plot specific proteins by providing gene symbols (not case sensitive) +plot_peptide_data( + dataset, filter_genes = c("shisa6", "gria2"), + # should match analysis_quickstart() parameters !
+ norm_algorithm = c("vwmb", "modebetween_protein"), + show_unused_datapoints = TRUE, output_dir = "C:/temp") +} +} diff --git a/man/protein_foldchange_from_ebayes.Rd b/man/protein_foldchange_from_ebayes.Rd new file mode 100644 index 0000000..7923c97 --- /dev/null +++ b/man/protein_foldchange_from_ebayes.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/stats_differential_abundance.R +\name{protein_foldchange_from_ebayes} +\alias{protein_foldchange_from_ebayes} +\title{Compute protein log2 foldchanges from protein-level eBayes (i.e. simple, no shrinkage)} +\usage{ +protein_foldchange_from_ebayes( + eset, + input_intensities_are_log2 = TRUE, + random_variables = NULL +) +} +\arguments{ +\item{eset}{a Biobase ExpressionSet that contains the protein intensities. required attributes; fData(): protein_id and pData(): condition} + +\item{input_intensities_are_log2}{whether the provided intensities are already log2 transformed} + +\item{random_variables}{a vector of column names in your sample metadata table that are added as additional(!) regression terms} +} +\value{ +data.frame with columns protein_id and foldchange.log2 +} +\description{ +implementation analogous to \code{de_ebayes()} +}
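Note on the new fold-change handling introduced in this release: `protein_foldchange_from_ebayes()` is an internal helper, and the idea behind it (and behind the `log2fc_without_shrinkage` option of `de_msqrobsum_msqrob()`) is to take the condition coefficient of a protein-level limma fit as the unshrunken log2 fold change while keeping the MSqRob-estimated p-values. A minimal sketch of that fold-change extraction using plain limma; the synthetic matrix, sample grouping and coefficient index below are illustrative assumptions for this note, not MS-DAP output:

library(limma)

# synthetic protein-level log2 intensity matrix: 50 proteins x 6 samples (3 WT, 3 KO)
set.seed(1)
x = matrix(rnorm(50 * 6, mean = 20, sd = 1), nrow = 50,
           dimnames = list(paste0("protein_", 1:50), paste0("sample_", 1:6)))
condition = factor(rep(c("wt", "ko"), each = 3), levels = c("wt", "ko"))

# regress protein intensities on condition; additional covariates could be appended to
# the design, analogous to the random_variables handled by de_ebayes_fit()
design = model.matrix(~ condition)
fit = limma::eBayes(limma::lmFit(x, design))

# column 2 of the coefficient matrix holds the condition effect per protein, i.e. the
# log2 fold change (this mirrors coef_col = 2 in protein_foldchange_from_ebayes)
df_fc = data.frame(protein_id = rownames(x), foldchange.log2 = fit$coefficients[, 2])
head(df_fc)

In the patched `de_msqrobsum_msqrob()`, these eBayes-derived fold changes replace `result_unpacked$logFC` (and thereby the reported effect sizes) when `log2fc_without_shrinkage = TRUE`, while the p-values still come from the MSqRob peptide-level regression model.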