diff --git a/DESCRIPTION b/DESCRIPTION index 5412757..e79e5d8 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -3,7 +3,7 @@ Package: msdap Title: Mass Spectrometry Downstream Analysis Pipeline Description: Analyze label-free proteomics datasets from various sources (MaxQuant, Spectronaut, etc) using a pipeline that facilitates peptide filtering and many algorithms for normalization and statistical analysis. A comprehensive PDF report can be generated that includes many data visualizations and documentation thereof. URL: https://github.com/ftwkoopmans/msdap -Version: 1.1.3 +Version: 1.2 Authors@R: person(given = "Frank", family = "Koopmans", diff --git a/NAMESPACE b/NAMESPACE index 89328e1..ebacf7a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,5 +1,6 @@ # Generated by roxygen2: do not edit by hand +export(add_contrast) export(analysis_quickstart) export(append_log) export(as_matrix_except_first_column) @@ -10,7 +11,6 @@ export(check_dataset_integrity) export(check_valid_tibble_peptides) export(check_valid_tibble_proteins) export(check_valid_tibble_samples) -export(dataset_contrasts) export(de_deqms) export(de_ebayes) export(de_msempire) @@ -28,6 +28,10 @@ export(file_check) export(filename_strip_illegal_characters) export(filter_dataset) export(generate_pdf_report) +export(get_peptide_filternorm_variants) +export(get_protein_matrix) +export(get_samples_for_regression) +export(has_legacy_contrast_definitions) export(hgnc_lookuptable) export(import_dataset_diann) export(import_dataset_encyclopedia) @@ -51,6 +55,7 @@ export(import_sample_metadata) export(initialize_multiprocessing) export(invalidate_cache) export(is_dia_dataset) +export(limma_wrapper) export(logger.default) export(matrix_to_long) export(merge_fractionated_samples) @@ -70,10 +75,14 @@ export(plot_peptide_data) export(plot_sample_pca__sample_in_contrast) export(plot_variance_explained) export(plot_volcano) +export(plot_volcano_allcontrast) +export(print_available_filtering_results) +export(print_contrasts) export(print_dataset_summary) export(protein_eset_from_data) export(read_textfile_compressed) export(regex_classification) +export(remove_contrasts) export(remove_proteins_by_name) export(reset_log) export(rgd_lookuptable) @@ -108,11 +117,8 @@ importFrom(DBI,dbExistsTable) importFrom(DBI,dbGetQuery) importFrom(DBI,dbIsValid) importFrom(DBI,dbListTables) -importFrom(DEqMS,spectraCounteBayes) importFrom(MASS,psi.huber) importFrom(MASS,rlm) -importFrom(MSnbase,as.MSnSet.ExpressionSet) -importFrom(MSnbase,combineFeatures) importFrom(RSQLite,SQLite) importFrom(archive,archive_read) importFrom(archive,file_read) @@ -170,11 +176,8 @@ importFrom(graphics,text) importFrom(gtools,mixedorder) importFrom(gtools,mixedsort) importFrom(iq,fast_MaxLFQ) -importFrom(limma,eBayes) -importFrom(limma,lmFit) importFrom(limma,normalizeCyclicLoess) importFrom(limma,squeezeVar) -importFrom(limma,topTable) importFrom(lme4,findbars) importFrom(lme4,getME) importFrom(lme4,lmerControl) @@ -184,7 +187,6 @@ importFrom(matrixStats,rowMeans2) importFrom(matrixStats,rowSds) importFrom(matrixStats,rowSums2) importFrom(missForest,missForest) -importFrom(msEmpiRe,de.ana) importFrom(msEmpiRe,normalize) importFrom(openssl,md5) importFrom(openxlsx,addWorksheet) @@ -221,7 +223,6 @@ importFrom(stats,ecdf) importFrom(stats,loess) importFrom(stats,mad) importFrom(stats,median) -importFrom(stats,model.matrix) importFrom(stats,na.exclude) importFrom(stats,na.omit) importFrom(stats,optim) diff --git a/R/dataset.R b/R/dataset.R index 2d05c56..ead7ef6 100644 --- a/R/dataset.R 
+++ b/R/dataset.R @@ -11,17 +11,6 @@ is_dia_dataset = function(dataset) { } -#' List the name of all contrasts in the samples table -#' -#' @param dataset a valid dataset -#' @export -dataset_contrasts = function(dataset) { - if(!"samples" %in% names(dataset)) { - append_log("invalid dataset, it lacks a samples table", type = "error") - } - grep("^contrast:", colnames(dataset$samples), ignore.case = T, value = T) -} - #' Print a short summary of a dataset to console #' @@ -280,6 +269,95 @@ empty_protein_tibble = function(peptides) { +#' rollup peptides to protein data matrix for selected intensity column +#' +#' @param dataset your dataset +#' @param intensity_column column in `dataset$peptides` that should be used for the protein matrix +#' @param include_npep if `TRUE` (default), returns a list with both the matrix and an array of peptide counts (for each row in the protein matrix). If `FALSE`, returns just the protein matrix +#' @export +get_protein_matrix = function(dataset, intensity_column, include_npep = TRUE) { + if(!is.list(dataset) || !"peptides" %in% names(dataset) || !is.data.frame(dataset$peptides)) { + append_log("Dataset does not contain a peptides table (dataset$peptides)", type = "error") + } + if(length(intensity_column) != 1 || is.na(intensity_column) || !is.character(intensity_column)) { + append_log("intensity_column parameter must be a single string", type = "error") + } + if(length(include_npep) != 1 || !include_npep %in% c(TRUE, FALSE)) { + append_log("include_npep parameter must be either TRUE or FALSE", type = "error") + } + + cols_valid = get_peptide_filternorm_variants(dataset) + if(!intensity_column %in% cols_valid) { + append_log("intensity_column parameter is not valid. To see available options, run: print_available_filtering_results(dataset)", type = "error") + } + + tib_pep = dataset$peptides %>% + select(sample_id, protein_id, peptide_id, intensity = !!as.symbol(intensity_column)) %>% + filter(is.finite(intensity)) # remove NA prior to rollup AND importantly, prior to peptide*protein pair counting + + mat = rollup_pep2prot(tib = tib_pep, intensity_is_log2 = TRUE, rollup_algorithm = "maxlfq", return_as_matrix = TRUE) + if(!include_npep) { + return(mat) + } + + # count the number of unique peptides per protein + npep = tib_pep %>% + distinct(protein_id, peptide_id) %>% + count(protein_id) %>% + # importantly, align with the protein matrix by protein_id + slice(match(rownames(mat), protein_id)) %>% + pull(n) + + return(list(matrix = mat, npep = npep)) +} + + + +#' collect the respective subset of samples for selected protein data and prepare a metadata table for regression +#' +#' From sample metadata (dataset$samples), get the subset that is included in the +#' parameter `protein_data`, select columns that contain potential regression variables +#' (using `user_provided_metadata()`) and finally apply `enforce_sample_value_types()` to +#' reformat all variables (except "sample_id") to factor or numeric. 
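# A minimal usage sketch of the two helpers documented above (get_protein_matrix and get_samples_for_regression),
# assuming `dataset` was imported with MS-DAP and that filtering already produced an "intensity_all_group" column;
# run print_available_filtering_results(dataset) to see which intensity columns actually exist in your dataset.
library(msdap)
print_available_filtering_results(dataset)
protdata = get_protein_matrix(dataset, intensity_column = "intensity_all_group", include_npep = TRUE)
dim(protdata$matrix)     # log2 protein-by-sample matrix from peptide-to-protein rollup
head(protdata$npep)      # unique peptide count per protein, aligned with rownames(protdata$matrix)
regression_samples = get_samples_for_regression(dataset, protdata)
str(regression_samples)  # sample_id plus user-provided metadata columns coerced to factor/numeric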
+#' +#' @param dataset your dataset +#' @param protein_data output from `get_protein_matrix()` +#' @export +get_samples_for_regression = function(dataset, protein_data) { + if(!is.list(dataset) || !"samples" %in% names(dataset) || !is.data.frame(dataset$samples)) { + append_log("Dataset does not contain a sample metadata table (dataset$samples)", type = "error") + } + if(!is.matrix(protein_data) && !(is.list(protein_data) && "matrix" %in% names(protein_data) && is.matrix(protein_data$matrix))) { + append_log("protein_data parameter does not contain a matrix", type = "error") + } + + sid = NULL + if(is.matrix(protein_data)) { + sid = colnames(protein_data) + } else { + sid = colnames(protein_data$matrix) + } + + if(length(intersect(sid, dataset$samples$sample_id)) == 0) { + append_log("this function expects the column names in the protein_data parameter to match the sample_id column in dataset$samples: zero overlap was found", type = "error") + } + + s = dataset$samples %>% + filter(sample_id %in% sid) %>% + select(sample_id, tidyselect::all_of(user_provided_metadata(dataset$samples))) + + + # importantly, ensure the matrix and sample tables align + stopifnot(length(sid) == nrow(s)) # double-check + s = s[match(sid, s$sample_id),] + + + # convert each column to factor/numeric + return( enforce_sample_value_types(s, redundant_columns = "warning") ) +} + + + #' prettyprint table that summarizes differential detect results #' #' @param dataset dataset that includes DD results @@ -304,7 +382,7 @@ diffdetect_summary_prettyprint = function(dataset, use_quant = FALSE, trim_contr } # array of all contrasts - column_contrasts = dataset_contrasts(dataset) + column_contrasts = unique(x$contrast) y = x %>% # add protein metadata @@ -320,7 +398,7 @@ diffdetect_summary_prettyprint = function(dataset, use_quant = FALSE, trim_contr # sort contrasts in same order as defined by user arrange(match(contrast, column_contrasts)) %>% # for prettyprint, trim the contrast names - mutate(contrast = sub("^contrast: ", "", contrast)) + mutate(contrast = gsub(" *#.*", "", sub("^contrast: ", "", contrast))) # optionally, limit contrast string length (evenly on each side by N characters) if(trim_contrast_names) { @@ -354,7 +432,7 @@ dea_summary_prettyprint = function(dataset, trim_contrast_names = FALSE) { } # array of all contrasts - column_contrasts = dataset_contrasts(dataset) + column_contrasts = unique(x$contrast) y = x %>% # add protein metadata @@ -373,7 +451,7 @@ dea_summary_prettyprint = function(dataset, trim_contrast_names = FALSE) { # sort contrasts in same order as defined by user arrange(match(contrast, column_contrasts)) %>% # for prettyprint, trim the contrast names - mutate(contrast = sub("^contrast: ", "", contrast)) + mutate(contrast = gsub(" *#.*", "", sub("^contrast: ", "", contrast))) # optionally, limit contrast string length (evenly on each side by N characters) if(trim_contrast_names) { diff --git a/R/dea.R b/R/dea.R index 020f049..2bc2565 100644 --- a/R/dea.R +++ b/R/dea.R @@ -79,6 +79,38 @@ get_column_intensity = function(peptides, contr_lbl = NA) { +#' Return column names in peptide table that hold intensity values +#' +#' @param dataset your dataset +#' @export +get_peptide_filternorm_variants = function(dataset) { + grep("^intensity", colnames(dataset$peptides), value = TRUE) +} + + + +#' Return column names in peptide table that hold intensity values +#' +#' @param dataset your dataset +#' @export +print_available_filtering_results = function(dataset) { + for(col in 
get_peptide_filternorm_variants(dataset)) { + lbl = "" + if(col == "intensity") lbl = "input data as-is" + if(col == "intensity_all_group") lbl = "global data filter ('all_group')" + if(col == "intensity_by_group") lbl = "filtering applied within-group only ('by_group')" + if(grepl("intensity_contrast", col)) lbl = "filter by user-defined contrast ('by_contrast')" + + if(lbl != "") { + cat(sprintf('"%s" = %s\n', col, lbl)) + } else { + cat(sprintf('"%s"\n', col)) + } + } +} + + + #' return sample groups for a given contrast (column name in dataset$samples) #' #' @param dataset your dataset @@ -105,6 +137,9 @@ contrast_to_samplegroups = function(dataset, contr_lbl) { #' @export dea = function(dataset, qval_signif = 0.01, fc_signif = 0, dea_algorithm = "deqms", rollup_algorithm = "maxlfq", output_dir_for_eset = "") { ### input validation + # check if the user is trying to apply filtering/dea on a dataset object that is incompatible with MS-DAP version 1.2 or later + error_legacy_contrast_definitions(dataset) + if (length(qval_signif) != 1 || !is.finite(qval_signif) || qval_signif <= 0) { append_log("q-value threshold must be a single numerical value above 0 (parameter qval_signif)", type = "error") } @@ -136,8 +171,7 @@ dea = function(dataset, qval_signif = 0.01, fc_signif = 0, dea_algorithm = "deqm append_log(paste("invalid options for dea_algorithm:", paste(dea_algorithm_invalid, collapse=", ")), type = "error") } - column_contrasts = dataset_contrasts(dataset) - if (length(column_contrasts) == 0) { + if (!is.list(dataset$contrasts) || length(dataset$contrasts) == 0) { append_log("no contrasts have been defined, differential expression analysis is cancelled", type = "warning") return(dataset) } @@ -148,70 +182,28 @@ dea = function(dataset, qval_signif = 0.01, fc_signif = 0, dea_algorithm = "deqm result_stats = tibble() ### iterate contrasts - for (col_contr in column_contrasts) { # col_contr = column_contrasts[1] - append_log(paste("differential expression analysis for", col_contr), type = "info") + for(contr in dataset$contrasts) { + append_log(paste("differential expression analysis for", contr$label), type = "info") + + # sample table: only relevant samples (no exclude, only samples selected for contrast) and only relevant sample metadata for DEA + # !! below code depends on proper sample selection, order and selected column + samples_for_contrast = contr$sample_table # returns named variable (label=column_name) indicating which type of intensity data is used - col_contr_intensity = get_column_intensity(dataset$peptides, col_contr) + col_contr_intensity = get_column_intensity(dataset$peptides, contr$label) append_log(paste("using data from peptide filter:", names(col_contr_intensity)), type = "info") - # if there are no intensity values, there is nothing to do + # input validation if(sum(!is.na(dataset$peptides %>% pull(!!col_contr_intensity))) < 2) { append_log("no peptides with an intensity value are available (perhaps filtering was too stringent?)", type = "warning") next } - # select the current contrast as column 'condition' and remove irrelevant samples (when defining our contrasts in upstream code, irrelevant samples were designated a 0) - # ! sorting by 'condition' here is important, as it aligns the samples by the groups specified by the user. 
if we don't enforce this, an intended WT-vs-KO might actually be analyzed as KO-vs-WT depending on the input data's sample ordering - samples_for_contrast = dataset$samples %>% - select(sample_id, shortname, group, condition = !!col_contr, tidyselect::everything()) %>% - filter(condition != 0) %>% - arrange(condition) - - if(!all(samples_for_contrast$sample_id %in% dataset$peptides$sample_id)) { + # input validation + if(!all(samples_for_contrast$sample_id %in% unique(dataset$peptides$sample_id))) { append_log("all samples from this contrast must be in the peptides tibble", type = "error") } - ## determine which random variables can be added to this contrast - ranvars = ranvars_matrix = NULL - for(v in unique(dataset$dea_random_variables)) { - # v is a column name in dataset$samples (here we use the subset thereof for this contrast), x are the metadata values for respective column - x = samples_for_contrast %>% pull(!!v) - xu = unique(x) # don't re-arrange/sort ! - xi = match(x, xu) - # because `samples_for_contrast` table was sorted by condition upstream, and condition are either 1 or 2, we can compare the random variables `x` using match(x, unique(x)) - # debug; print(cbind(samples_for_contrast$condition, match(x, unique(x)), x)) - x_aligns_with_condition = xi == samples_for_contrast$condition - - if( - # 1) drop ranvars that align with the condition - # is the 'random variable' not the same as the condition/contrast? criterium: at least 10% of all samples in contrast must differ (or 2) - # May occur if user is testing many contrasts, one of which is a minor subset of the data (for which there are no differences in for instance the sample batch) - length(xu) > 1 && sum(!x_aligns_with_condition) >= max(2, ceiling(nrow(samples_for_contrast) * .1)) && - # 2) drop duplicate ranvars. test the indices `xi` against ranvars from previous iterations (apply function to each column in matrix, test if all elements overlap with `xi`) - # we must check within contrast and cannot completely check while specifying ranvars upsteam, e.g. perhaps 2 metadata properties overlap in a subset of samples that are tested in some contrast - (length(ranvars_matrix) == 0 || !any(apply(ranvars_matrix, 2, function(col) all(col==xi)))) - ) { - ranvars = c(ranvars, v) - ranvars_matrix = cbind(ranvars_matrix, xi) - } - - ## alternatively, test # differences per sample condition. 
proof-of-concept; - #diff_per_condition = 0 - #for(cond in unique(samples_for_contrast$condition)) { - # rows = samples_for_contrast$condition == cond - # diff_per_condition = diff_per_condition + as.integer(n_distinct(x[rows]) > 1) - #} - } - - - if(length(dataset$dea_random_variables) > 0 && length(ranvars) != length(dataset$dea_random_variables)) { - append_log(paste("random variables that are _not_ applicable for current contrast due to lack of unique values (compared to sample groups/condition): ", paste(setdiff(dataset$dea_random_variables, ranvars), collapse=", ")), type = "info") - } - if(length(ranvars) > 0) { - append_log(paste("random variables used in current contrast: ", paste(ranvars, collapse=", ")), type = "info") - } - ## convert our long-format peptide table to a peptide- and protein-level ExpressionSet # subset peptide tibble for current contrast @@ -232,10 +224,11 @@ dea = function(dataset, qval_signif = 0.01, fc_signif = 0, dea_algorithm = "deqm # cleanup rm(m, prot_pep_count) + # if a directory for file storage was provided, store eset in a .RData file if(length(output_dir_for_eset) == 1 && !is.na(output_dir_for_eset) && nchar(output_dir_for_eset)>0 && dir.exists(output_dir_for_eset)) { - save(eset_peptides, file=path_append_and_check(output_dir_for_eset, paste0("ExpressionSet_peptides_", gsub("\\W+", " ", col_contr), ".RData")), compress = T) - save(eset_proteins, file=path_append_and_check(output_dir_for_eset, paste0("ExpressionSet_proteins_", gsub("\\W+", " ", col_contr), ".RData")), compress = T) + save(eset_peptides, file=path_append_and_check(output_dir_for_eset, paste0("ExpressionSet_peptides_", gsub("\\W+", " ", contr$label), ".RData")), compress = T) + save(eset_proteins, file=path_append_and_check(output_dir_for_eset, paste0("ExpressionSet_proteins_", gsub("\\W+", " ", contr$label), ".RData")), compress = T) } contr_fc_signif = fc_signif @@ -254,22 +247,20 @@ dea = function(dataset, qval_signif = 0.01, fc_signif = 0, dea_algorithm = "deqm alg_result = NULL alg_plugin = FALSE if(alg == "ebayes") { - alg_result = de_ebayes(eset_proteins=eset_proteins, input_intensities_are_log2 = T, random_variables = ranvars) + alg_result = de_ebayes(eset = eset_proteins, model_matrix = contr$model_matrix, model_matrix_result_prop = contr$regression_coefficient_name) } else if(alg == "deqms") { - alg_result = de_deqms(eset_proteins=eset_proteins, input_intensities_are_log2 = T, random_variables = ranvars) + alg_result = de_deqms(eset = eset_proteins, model_matrix = contr$model_matrix, model_matrix_result_prop = contr$regression_coefficient_name) } else if(alg == "msempire") { # ! 
compared to the other dea algorithms, MS-EmpiRe is not a regression model so we cannot add random variables - alg_result = de_msempire(eset_peptides, input_intensities_are_log2 = T) + alg_result = de_msempire(eset = eset_peptides, model_matrix = contr$model_matrix, model_matrix_result_prop = contr$regression_coefficient_name) } else if(alg == "msqrob") { - alg_result = de_msqrobsum_msqrob(eset_peptides, eset_proteins = NULL, use_peptide_model = T, input_intensities_are_log2 = T, random_variables = ranvars, log2fc_without_shrinkage = FALSE) - # } else if(alg == "msqrob_fc") { - # alg_result = de_msqrobsum_msqrob(eset_peptides, eset_proteins = eset_proteins, use_peptide_model = T, input_intensities_are_log2 = T, random_variables = ranvars, log2fc_without_shrinkage = TRUE) + alg_result = de_msqrobsum_msqrob(eset = eset_peptides, model_matrix = contr$model_matrix, model_matrix_result_prop = contr$regression_coefficient_name, use_peptide_model = TRUE, random_variables = contr$colname_additional_variables) } else if(alg == "msqrobsum") { - alg_result = de_msqrobsum_msqrob(eset_peptides, eset_proteins = NULL, use_peptide_model = F, input_intensities_are_log2 = T, random_variables = ranvars, log2fc_without_shrinkage = FALSE) + alg_result = de_msqrobsum_msqrob(eset = eset_peptides, model_matrix = contr$model_matrix, model_matrix_result_prop = contr$regression_coefficient_name, use_peptide_model = FALSE, random_variables = contr$colname_additional_variables) } else { # for non-hardcoded functions, we call the function requested as a user parameter and pass all available data alg_fun = match.fun(alg) # throws error if not found - alg_result = alg_fun(peptides=peptides_for_contrast, samples=samples_for_contrast, eset_peptides=eset_peptides, eset_proteins=eset_proteins, input_intensities_are_log2 = T, random_variables = ranvars, dataset_name=dataset$name) + alg_result = alg_fun(peptides=peptides_for_contrast, samples=samples_for_contrast, eset_peptides=eset_peptides, eset_proteins=eset_proteins, model_matrix = contr$model_matrix, model_matrix_result_prop = contr$regression_coefficient_name, random_variables = contr$colname_additional_variables, dataset_name=dataset$name) alg_plugin = TRUE } @@ -310,8 +301,8 @@ dea = function(dataset, qval_signif = 0.01, fc_signif = 0, dea_algorithm = "deqm # error handling; a DEA function returned an error, show to user using our logging system if(inherits(err, "error")) { - append_log(sprintf("an error occurred in %s during the execution of DEA function '%s'", col_contr, alg), type="warning") - append_log_error(err, type="warning") + append_log(sprintf('an error occurred in "%s" during the execution of DEA function "%s"', contr$label, alg), type = "warning") + append_log_error(err, type = "warning") } } @@ -328,7 +319,7 @@ dea = function(dataset, qval_signif = 0.01, fc_signif = 0, dea_algorithm = "deqm result_stats, tib %>% left_join(prot_pep_count, by="protein_id") %>% - add_column(signif = s, signif_threshold_qvalue = qval_signif, signif_threshold_log2fc = contr_fc_signif, contrast = col_contr, .after = "qvalue") + add_column(signif = s, signif_threshold_qvalue = qval_signif, signif_threshold_log2fc = contr_fc_signif, contrast = contr$label, .after = "qvalue") ) } } diff --git a/R/filter_peptides.R b/R/filter_peptides.R index d85d939..c40beb6 100644 --- a/R/filter_peptides.R +++ b/R/filter_peptides.R @@ -228,11 +228,14 @@ filter_dataset = function(dataset, norm_algorithm = "", rollup_algorithm = "maxlfq", # which filters to apply - by_group = T, all_group = T, 
by_contrast = F) { + by_group = FALSE, all_group = FALSE, by_contrast = FALSE) { start_time = Sys.time() ##### input validation + # check if the user is trying to apply filtering/dea on a dataset object that is incompatible with MS-DAP version 1.2 or later + error_legacy_contrast_definitions(dataset) + # validate function parameters check_parameter_is_numeric(filter_min_detect, filter_fraction_detect, filter_min_quant, filter_fraction_quant, filter_min_peptide_per_prot, filter_topn_peptides) check_parameter_is_boolean(by_group, all_group, by_contrast) @@ -343,53 +346,47 @@ filter_dataset = function(dataset, #### filter and normalize by contrast - # get all contrasts from sample table - column_contrasts = dataset_contrasts(dataset) - # if there are no contrasts setup, disable 'by_contrast' - by_contrast = by_contrast && length(column_contrasts) > 0 + # if there are no contrasts, disable 'by_contrast' + by_contrast = by_contrast && is.list(dataset$contrasts) && length(dataset$contrasts) > 0 if(by_contrast) { - for (col_contr in column_contrasts) { # col_contr = column_contrasts[1] - # only keep the current contrast in samples table, so downstream code tests this contrast only - # note; outlier samples are not included in contrasts as at all in upstream 'contrasts implementation' - col_contr_samples = dataset$samples %>% select(key_sample, key_group, group, contrast = !!col_contr) %>% filter(contrast != 0) - grp1_key = col_contr_samples %>% filter(contrast == 1) %>% distinct(key_group) %>% pull() - grp2_key = col_contr_samples %>% filter(contrast == 2) %>% distinct(key_group) %>% pull() - # grp1 = col_contr_samples %>% filter(contrast == 1) %>% pull(group) - # grp2 = col_contr_samples %>% filter(contrast == 2) %>% pull(group) - # grp1_key = dataset$groups %>% filter(group %in% grp1) %>% distinct(key_group) %>% pull() - # grp2_key = dataset$groups %>% filter(group %in% grp2) %>% distinct(key_group) %>% pull() - - if(length(grp1_key) == 1) { - grp1_mask = dataset$peptides$key_peptide %in% dt_filter_group_wide_noexclude$key_peptide[ dt_filter_group_wide_noexclude[[as.character(grp1_key)]] ] - } else { - # if multi-group, select all columns for this group and test that all are true - dt_filter_group_wide_noexclude[, temp := matrixStats::rowSums2(as.matrix(.SD)), .SDcols=eval(quote(as.character(grp1_key)))] - # dt_filter_group_wide_noexclude[, `:=`(temp = matrixStats::rowSums2(as.matrix(.SD))), .SDcols=eval(quote(as.character(grp1_key)))] - grp1_mask = dataset$peptides$key_peptide %in% dt_filter_group_wide_noexclude$key_peptide[ dt_filter_group_wide_noexclude$temp == length(grp1_key) ] - } - - if(length(grp2_key) == 1) { - grp2_mask = dataset$peptides$key_peptide %in% dt_filter_group_wide_noexclude$key_peptide[ dt_filter_group_wide_noexclude[[as.character(grp2_key)]] ] - } else { - # if multi-group, select all columns for this group and test that all are true - dt_filter_group_wide_noexclude[, temp := matrixStats::rowSums2(as.matrix(.SD)), .SDcols=eval(quote(as.character(grp2_key)))] - # dt_filter_group_wide_noexclude[, `:=`(temp = matrixStats::rowSums2(as.matrix(.SD))), .SDcols=eval(quote(as.character(grp2_key)))] - grp2_mask = dataset$peptides$key_peptide %in% dt_filter_group_wide_noexclude$key_peptide[ dt_filter_group_wide_noexclude$temp == length(grp2_key) ] - } - - - intensity_col_contr = paste0("intensity_", col_contr) + for(contr in dataset$contrasts) { + grp1_sid = contr$sampleid_condition1 + grp2_sid = contr$sampleid_condition2 + + # contrast-specific minimum number of samples where a 
peptide should be detected/quantified, per side-of-the-contrast + grp1_mindetect = max(filter_min_detect, ceiling(length(grp1_sid) * filter_fraction_detect)) + grp2_mindetect = max(filter_min_detect, ceiling(length(grp2_sid) * filter_fraction_detect)) + grp1_minquant = max(filter_min_quant, ceiling(length(grp1_sid) * filter_fraction_quant)) + grp2_minquant = max(filter_min_quant, ceiling(length(grp2_sid) * filter_fraction_quant)) + + grp1_counts = dataset$peptides %>% + filter(sample_id %in% grp1_sid) %>% + group_by(peptide_id) %>% + summarise(ndetect = sum(detect), nquant = n()) %>% + ungroup() + + grp2_counts = dataset$peptides %>% + filter(sample_id %in% grp2_sid) %>% + group_by(peptide_id) %>% + summarise(ndetect = sum(detect), nquant = n()) %>% + ungroup() + + # subset of peptides that matches the filtering criterium in both groups + pepid_valid = intersect( + grp1_counts %>% filter(ndetect >= grp1_mindetect & nquant >= grp1_minquant) %>% pull(peptide_id), + grp2_counts %>% filter(ndetect >= grp2_mindetect & nquant >= grp2_minquant) %>% pull(peptide_id) + ) + + + intensity_col_contr = paste0("intensity_", contr$label) dataset$peptides[,intensity_col_contr] = dataset$peptides$intensity - dataset$peptides[!(grp1_mask & grp2_mask & dataset$peptides$key_sample %in% col_contr_samples$key_sample), intensity_col_contr] <- NA + rows = dataset$peptides$sample_id %in% c(grp1_sid, grp2_sid) & dataset$peptides$peptide_id %in% pepid_valid + dataset$peptides[!rows, intensity_col_contr] <- NA # log results - log_stats = c(log_stats, sprintf("%d/%d peptides were retained after filtering within %s", dataset$peptides %>% select(peptide_id, intensity = !!intensity_col_contr) %>% filter(!is.na(intensity)) %>% distinct(peptide_id) %>% nrow(), npep_input, col_contr)) - - # table(grp1_mask); table(grp2_mask); table(!(grp1_mask & grp2_mask & dataset$peptides$key_group %in% c(grp1_key,grp2_key))); dataset$peptides %>% select(value = !!intensity_col_contr, key_group, sample_id, peptide_id) %>% filter(!is.na(value)) %>% count(key_group, sample_id) + log_stats = c(log_stats, sprintf("%d/%d peptides were retained after filtering within %s", n_distinct(dataset$peptides$peptide_id[rows]), npep_input, gsub(" *#.*", "", contr$label))) } - # } else { - # dataset$peptides[,paste0("intensity_", column_contrasts)] = NULL # drop column if pre-existing. 
eg; suppose user changes filters + normalization, then doesn't want this filter; cannot keep old data } diff --git a/R/parse_fragpipe.R b/R/parse_fragpipe.R index 390ac74..2668d47 100644 --- a/R/parse_fragpipe.R +++ b/R/parse_fragpipe.R @@ -270,7 +270,7 @@ fragpipe_modseq_compose = function(x) { ) x_unlisted = x_result %>% - unchop(col = mods_list) %>% + unchop(cols = mods_list) %>% mutate( is_nterm = grepl("N-term", mods_list, fixed = TRUE), is_cterm = grepl("C-term", mods_list, fixed = TRUE), diff --git a/R/parse_longformat_generic.R b/R/parse_longformat_generic.R index ad4d79f..9b6aea0 100644 --- a/R/parse_longformat_generic.R +++ b/R/parse_longformat_generic.R @@ -300,7 +300,7 @@ import_dataset_in_long_format = function(filename = NULL, x = NULL, attributes_r # if filename is supplied, check if it's valid if(length(filename) == 1) { - if(!is(filename, "character") || nchar(filename) < 3) { + if(!is.character(filename) || nchar(filename) < 3) { append_log(paste("'filename' parameter is not valid:", filename), type = "error") } @@ -312,10 +312,10 @@ import_dataset_in_long_format = function(filename = NULL, x = NULL, attributes_r if(length(x) > 0 && !is.data.frame(x)) { append_log("'x' parameter must be of type: data.frame, tibble or data.table):", type = "error") } - if(length(attributes_required) == 0 || !is(attributes_required, "list")) { + if(length(attributes_required) == 0 || !is.list(attributes_required)) { append_log("'attributes_required' parameter must be a list", type = "error") } - if(!is(attributes_optional, "list")) { + if(!is.list(attributes_optional)) { append_log("'attributes_optional' parameter must be a list", type = "error") } if(any(names(attributes_required) %in% names(attributes_optional))) { diff --git a/R/plot_differential_detect.R b/R/plot_differential_detect.R index b276558..c47fc47 100644 --- a/R/plot_differential_detect.R +++ b/R/plot_differential_detect.R @@ -11,8 +11,8 @@ plot_differential_detect = function(dataset, zscore_threshold = 6) { return(result) } - for(contr in dataset_contrasts(dataset)) { - tib_contr = dataset$dd_proteins %>% filter(contrast == contr & is.finite(zscore)) + for(contr in dataset$contrasts) { + tib_contr = dataset$dd_proteins %>% filter(contrast == contr$label & is.finite(zscore)) if(nrow(tib_contr) == 0) { next } @@ -32,13 +32,13 @@ plot_differential_detect = function(dataset, zscore_threshold = 6) { geom_histogram(bins=25, boundary = 0, colour = "black", fill="lightgrey", na.rm=T) + geom_vline(xintercept = c(-zscore_threshold, zscore_threshold), colour = "red") + facet_wrap(.~type_label, ncol = 1, scales = "free") + - labs(x="Differential z-score for observed peptides", y="Protein count", colour = "", title = contr) + + labs(x="Differential z-score for observed peptides", y="Protein count", colour = "", title = paste("contrast:", contr$label_contrast)) + theme_bw() + theme(plot.title = element_text(hjust = 0.5, size=9), plot.subtitle = element_text(hjust = 0.5, size=9), legend.position = "none") - result[[contr]] = p_hist + result[[contr$label]] = p_hist } return(result) diff --git a/R/plot_peptide_data.R b/R/plot_peptide_data.R index f6e7408..117fc4f 100644 --- a/R/plot_peptide_data.R +++ b/R/plot_peptide_data.R @@ -46,6 +46,9 @@ #' @param norm_algorithm normalization algorithms. 
This should be exactly the same as the parameter your provided to `analysis_quickstart` #' @export plot_peptide_data = function(dataset, select_all_proteins = FALSE, select_diffdetect_candidates = NA, select_dea_signif = FALSE, filter_protein_ids = NA, filter_genes = NA, output_dir, show_unused_datapoints = FALSE, norm_algorithm = NA) { + # check if the user is trying to apply filtering/dea on a dataset object that is incompatible with MS-DAP version 1.2 or later + error_legacy_contrast_definitions(dataset) + if(length(select_diffdetect_candidates) == 1 && select_diffdetect_candidates %in% TRUE) { select_diffdetect_candidates = 4 # default z-score } @@ -136,14 +139,12 @@ plot_peptide_data_by_contrast_to_pdf = function(dataset, filter_protein_id, outp append_log(paste("failed to create temp directory at;", output_dir__temp), type = "error") } - ucontr = dataset_contrasts(dataset) - # error if there are no contrasts - if(length(ucontr) == 0) { + if(!is.list(dataset$contrasts) || length(dataset$contrasts) == 0) { append_log("no contrasts in this dataset", type = "error") } - output_pdf_filenames = sprintf("%s/MS-DAP_peptide-data_contrast-%s.pdf", output_dir, seq_along(ucontr)) + output_pdf_filenames = sprintf("%s/MS-DAP_peptide-data_contrast-%s.pdf", output_dir, seq_along(dataset$contrasts)) for(f in output_pdf_filenames) { remove_file_if_exists(f) } @@ -155,6 +156,7 @@ plot_peptide_data_by_contrast_to_pdf = function(dataset, filter_protein_id, outp # figure out where the filtered&norm peptide intensity data is at for each contrast # = named array with 'intensity post-processing type' and column names, per contrast + ucontr = unlist(lapply(dataset$contrasts, "[[", "label")) ucontr_col_intensity = ucontr_col_intensity__reintroduced = sapply(ucontr, function(x) get_column_intensity(dataset$peptides, x)) ## borrow data for peptides removed during filtering, but skip data directly from the raw input data column ("intensity") @@ -182,8 +184,8 @@ plot_peptide_data_by_contrast_to_pdf = function(dataset, filter_protein_id, outp } - for(index in seq_along(ucontr)) { - contr = ucontr[index] + for(index in seq_along(dataset$contrasts)) { + contr = dataset$contrasts[[index]] # prepare a minimal peptide-level tibble that holds all required data for downstream plots contr_col_int = as.character(ucontr_col_intensity[index]) # don't forget to dump the 'name' from this variable, or dplyr::select() will trip out while using !! downstream @@ -191,12 +193,7 @@ plot_peptide_data_by_contrast_to_pdf = function(dataset, filter_protein_id, outp ## samples to plot; not only those used in statistical contrast but also 'exclude' samples that match the current contrast # for datasets where 'all groups' filtering was used, show all samples. 
If by-contrast filtering, show only samples in contrast's sample groups - samples_contr = dataset$samples - if(grepl("contrast:", contr_col_int, ignore.case = T)) { - contr_sample_groups = unlist(decompose_contrast_name(contr)) - samples_contr = dataset$samples %>% filter(group %in% contr_sample_groups) - rm(contr_sample_groups) - } + samples_contr = dataset$samples %>% filter(sample_id %in% c(contr$sampleid_condition1, contr$sampleid_condition2)) peptides_contr = peptides %>% select(peptide_id, protein_id, sample_id, detect, intensity_filtered = !!contr_col_int, intensity_reintroduced = !!contr_col_int_reintroduced, peptide_id_group = key_peptide_group) %>% @@ -225,7 +222,7 @@ plot_peptide_data_by_contrast_to_pdf = function(dataset, filter_protein_id, outp # gather differential testing results, if any de_proteins_contr = tibble() if("de_proteins" %in% names(dataset)) { - de_proteins_contr = dataset$de_proteins %>% filter(protein_id %in% proteins_contr$protein_id & contrast == contr) + de_proteins_contr = dataset$de_proteins %>% filter(protein_id %in% proteins_contr$protein_id & contrast == contr$label) # infer order in which to plot proteins contr_protein_id__ordered = de_proteins_contr %>% arrange(pvalue) %>% distinct(protein_id) %>% pull() @@ -242,7 +239,7 @@ plot_peptide_data_by_contrast_to_pdf = function(dataset, filter_protein_id, outp } if("dd_proteins" %in% names(dataset)) { - tib_dd = dataset$dd_proteins %>% filter(protein_id %in% proteins_contr$protein_id & contrast == contr & is.finite(zscore)) + tib_dd = dataset$dd_proteins %>% filter(protein_id %in% proteins_contr$protein_id & contrast == contr$label & is.finite(zscore)) if(nrow(tib_dd) > 0) { # infer order in which to plot proteins contr_protein_id__ordered = c(contr_protein_id__ordered, tib_dd %>% arrange(desc(abs(zscore))) %>% pull(protein_id)) @@ -335,8 +332,8 @@ plot_peptide_data_by_contrast_to_pdf = function(dataset, filter_protein_id, outp fname_documentation_page = sprintf("%s/peptideplot_%s_%s.pdf", output_dir__temp, index, "docs") pdf(fname_documentation_page, width=7, height=9) graphics::plot.new() - # mtext(paste0(contr, "\n\nhorizontal line = peptide average intensity within 'sample group'\n\nconfidently identified peptides: solid/filled symbols\nquantified but not identified (match-between-runs): open symbols\n\npeptides that _fail_ the filter criteria for this contrast:\nsmall symbols & dashed line for average\n(note; these were not used for statistics)\n\nfor proteins with many peptides, legends are shown on a separate page for legibility"), padj = 1, cex = .75) - mtext(paste0(contr, " + + mtext(paste0(contr$label, " Peptides can be quantified AND detected (eg; by MS/MS or DIA confidence score) in a sample, or only quantified (eg; match-between-runs or poor DIA confidence score). diff --git a/R/plot_volcano.R b/R/plot_volcano.R index 5465ce2..5f057ad 100644 --- a/R/plot_volcano.R +++ b/R/plot_volcano.R @@ -1,7 +1,7 @@ #' volcano plot for DEA results, split into 4 panels; with/without labels and with/without thresholding foldchange-outliers #' -#' If there are multiple contrasts in your DEA results, you should first subset the DEA result table for 1 contrast before calling this plot function (i.e. don't plot multiple contrasts into 1 volcano plot), see Example 5 below. +#' If there are multiple contrasts in your DEA results you should either use the 'wrapper function' plot_volcano_allcontrast(), OR, first subset the DEA result table for 1 contrast before calling this plot function (i.e. 
don't plot multiple contrasts into 1 volcano plot). #' #' @param stats_de typically these are the output generated by the MS-DAP dea() function. If you use the entire pipeline, i.e. analysis_quickstart(), the resulting dataset objects holds these in the dataset$de_proteins property. Include a column "gene_symbols_or_id" to plot gene symbols instead of protein_id, see examples below #' @param log2foldchange_threshold threshold for significance of log2 foldchanges. Set to NA to disregard (default) or provide a single numeric value (cutoff will be applied symetrically for both up- and down-regulated) @@ -10,6 +10,7 @@ #' @param label_mode which class of proteins should be labeled in the plot. Options: topn_pvalue (top N smallest p-value, default), signif (all significant proteins), protein_id (a provided set of protein_id). Defaults to topN for consistency and clarity; if the upstream analysis yielded hundreds of hits the labels will be unreadable #' @param label_target further specification of the label_mode parameter. For instance, if 'topn_pvalue' is set, here you can set the number of proteins that should be labeled. Analogously, if label_mode='protein_id' is set you can here provide an array of protein_id values (that are available in the stats_de data table) #' @param label_avoid_overlap use the ggrepel R package to try and place labels with minimal overlap (only works when the number of labeled proteins is relatively low and sparse, e.g. for topN 25). Options: TRUE, FALSE +#' @param show_plots boolean parameter; should each plot be printed/shown immediately? If `FALSE` (default) you'll have to extract the ggplot objects from the resulting list in order to print the plots #' @return returns a named list that contains a list, with properties 'ggplot' and 'ggplot_data', for each unique 'dea_algorithm' in the input stats_de table #' #' @examples @@ -23,10 +24,8 @@ #' plot_list = msdap::plot_volcano(dataset$de_proteins %>% left_join(dataset$proteins), #' log2foldchange_threshold = 1, qvalue_threshold = 0.01, #' mtitle = "volcano, label top 10", label_mode = "topn_pvalue", label_target = 10, -#' label_avoid_overlap = TRUE +#' label_avoid_overlap = TRUE, show_plots = TRUE #' ) -#' # for each unique 'dea_algorithm' in the stats_de table, list of ggplot objects and data -#' lapply(plot_list, "[[", "ggplot") #' } #' #' ## example 2: analogous, but now show all significant proteins and disable "repelled labels" @@ -35,9 +34,8 @@ #' plot_list = msdap::plot_volcano(dataset$de_proteins %>% left_join(dataset$proteins), #' log2foldchange_threshold = 1, qvalue_threshold = 0.01, #' mtitle = "volcano, label all significant", label_mode = "signif", -#' label_avoid_overlap = FALSE +#' label_avoid_overlap = FALSE, show_plots = TRUE #' ) -#' lapply(plot_list, "[[", "ggplot") #' } #' #' ## example 3: show labels for some set of protein IDs. 
First line selects all proteins where symbol @@ -50,9 +48,8 @@ #' plot_list = msdap::plot_volcano(dataset$de_proteins %>% left_join(dataset$proteins), #' log2foldchange_threshold = 1, qvalue_threshold = 0.01, #' mtitle = "volcano, label selected proteins", label_mode = "protein_id", -#' label_target = pid_label, label_avoid_overlap = FALSE +#' label_target = pid_label, label_avoid_overlap = FALSE, show_plots = TRUE #' ) -#' lapply(plot_list, "[[", "ggplot") #' } #' #' ## example 4: plot all significant labels as before, then add custom labels for some subset of @@ -63,7 +60,7 @@ #' plot_list = msdap::plot_volcano(dataset$de_proteins %>% left_join(dataset$proteins), #' log2foldchange_threshold = 1, qvalue_threshold = 0.01, mtitle = "volcano, #' label all significant + custom labels", label_mode = "signif", -#' label_avoid_overlap = FALSE +#' label_avoid_overlap = FALSE, show_plots = FALSE #' ) #' l = plot_list[[1]] #' l$ggplot + ggrepel::geom_text_repel( @@ -74,7 +71,8 @@ #' ) #' } #' -#' # example 5: iterate over contrasts before calling plot_volcano() +#' # example 5: iterate over contrasts before calling plot_volcano(). +#' # this is essentially the same as using helper function plot_volcano_allcontrast() #' \dontrun{ #' contrasts = unique(dataset$de_proteins$contrast) #' for(contr in contrasts) { @@ -84,20 +82,15 @@ #' # volcano plot function (compared to above example, now include the contrast in the title) #' plot_list = msdap::plot_volcano(tib_volcano, log2foldchange_threshold = 1, #' qvalue_threshold = 0.01, mtitle = paste(contr, "volcano, label top 10"), -#' label_mode = "topn_pvalue", label_target = 10, label_avoid_overlap = TRUE +#' label_mode = "topn_pvalue", label_target = 10, label_avoid_overlap = TRUE, +#' show_plots = TRUE #' ) -#' -#' # grab ggplot objects and print plot -#' ggplot_list = lapply(plot_list, "[[", "ggplot") -#' for(p in ggplot_list) { -#' print(p) -#' } #' } #' } #' #' @importFrom ggrepel geom_text_repel #' @export -plot_volcano = function(stats_de, log2foldchange_threshold = NA, qvalue_threshold = NA, mtitle = "", label_mode = "topn_pvalue", label_target = 25, label_avoid_overlap = TRUE) { +plot_volcano = function(stats_de, log2foldchange_threshold = NA, qvalue_threshold = NA, mtitle = "", label_mode = "topn_pvalue", label_target = 25, label_avoid_overlap = TRUE, show_plots = FALSE) { if(!is.data.frame(stats_de) || nrow(stats_de) == 0) { return(list()) } @@ -138,6 +131,10 @@ plot_volcano = function(stats_de, log2foldchange_threshold = NA, qvalue_threshol stats_de = stats_de %>% mutate(signif = signif & abs(foldchange.log2) >= log2foldchange_threshold) } + if(!"dea_algorithm" %in% colnames(stats_de)) { + stats_de$dea_algorithm = "statistics" + } + ## pretty-print labels stats_de = add_protein_prettyprint_label(stats_de) @@ -269,9 +266,44 @@ plot_volcano = function(stats_de, log2foldchange_threshold = NA, qvalue_threshol } result[[algo_name]] = list(ggplot = p, ggplot_data = tib_facets) + + if(show_plots) { + print(p) + } } - return(result) + return(invisible(result)) +} + + + +#' a simple wrapper around plot_volcano() that iterates all contrasts +#' +#' returns the plot_volcano() output within a list (1 element per contrast @ stats_de$contrast column) +#' +#' @examples \dontrun{ +#' # refer to the examples for msdap::plot_volcano() +#' # the same syntax can be applied here +#' } +#' @param stats_de input data.frame, see `plot_volcano()` +#' @param mtitle plot title, see `plot_volcano()` +#' @param ... 
parameters passed to `plot_volcano()`, check that function's documentation and examples +#' @export +plot_volcano_allcontrast = function(stats_de, mtitle, ...) { + if(!is.data.frame(stats_de) || nrow(stats_de) == 0) { + return(list()) + } + result = list() + if(!"contrast" %in% colnames(stats_de)) { + stats_de$contrast = "default_contrast" + } + if(any(is.na(stats_de$contrast) | !is.character(stats_de$contrast) | stats_de$contrast == "")) { + append_log('parameter stats_de column "contrast" must contain string values that are not NA and not "" (empty string)', type = "error") + } + for(contr in unique(stats_de$contrast)) { + result[[contr]] = plot_volcano(stats_de %>% filter(contrast == contr), mtitle = paste(contr, mtitle), ...) + } + return(invisible(result)) } diff --git a/R/process_peptide_data.R b/R/process_peptide_data.R index 51c7f1b..54a1eee 100644 --- a/R/process_peptide_data.R +++ b/R/process_peptide_data.R @@ -286,6 +286,10 @@ merge_fractionated_samples = function(dataset) { append_log("to collapse sample fractions, the sample metadata table must contain columns 'fraction' and 'shortname' (eg; sample_id is unique sample/measurement, whereas shortname is the identifier to which respective fractions are collapsed to)", type = "error") } + + # prior to replacing sample_id, store mappings + sid_shortname = dataset$samples %>% distinct(sample_id, shortname) + # 1) replace sample_id by shortname in both the samples and peptides tibbles dataset$peptides$sample_id = dataset$samples$shortname[data.table::chmatch(dataset$peptides$sample_id, dataset$samples$sample_id)] dataset$samples = dataset$samples %>% mutate(sample_id = shortname) %>% select(-fraction) %>% distinct(sample_id, .keep_all = T) @@ -297,6 +301,28 @@ merge_fractionated_samples = function(dataset) { dataset$samples = peptide_and_protein_counts_per_sample(dataset$peptides, dataset$samples, is_dia_dataset(dataset)) dataset = invalidate_cache(dataset) + # 4) if any contrasts are present, update the respective sample identifiers + if(is.list(dataset$contrasts) && length(dataset$contrasts) > 0) { + for(index in seq_along(dataset$contrasts)) { + contr = dataset$contrasts[[index]] + # replace sample_id with shortname + contr$sampleid_condition1 = unique(sid_shortname$shortname[match(contr$sampleid_condition1, sid_shortname$sample_id)]) + contr$sampleid_condition2 = unique(sid_shortname$shortname[match(contr$sampleid_condition2, sid_shortname$sample_id)]) + contr$sample_table = contr$sample_table %>% mutate(sample_id = sid_shortname$shortname[match(sample_id, sid_shortname$sample_id)]) + # keep track of removed samples (duplicates) @ `sample_table` --> sync with `model_matrix` which should be aligned + row_isdupe = duplicated(contr$sample_table$sample_id) + contr$sample_table = contr$sample_table[!row_isdupe,] + contr$model_matrix = contr$model_matrix[!row_isdupe,] + if(!is_tibble(contr$sample_table)) { # setting rownames on a tibble is deprecated, but we can/should on data.frame + rownames(contr$sample_table) = seq_len(nrow(contr$sample_table)) + } + rownames(contr$model_matrix) = seq_len(nrow(contr$sample_table)) + # update contrast + dataset$contrasts[[index]] = contr + } + } + + return(dataset) } diff --git a/R/quickstart.R b/R/quickstart.R index 8596c0b..dddf938 100644 --- a/R/quickstart.R +++ b/R/quickstart.R @@ -137,6 +137,8 @@ analysis_quickstart = function( dump_all_data = FALSE ) { + # check if the user is trying to apply filtering/dea on a dataset object that is incompatible with MS-DAP version 1.2 or later + 
error_legacy_contrast_definitions(dataset) output_disabled = length(output_dir) == 0 || (length(output_dir) == 1 && all(is.na(output_dir))) if(output_disabled) { diff --git a/R/report_as_rmarkdown.R b/R/report_as_rmarkdown.R index dcd4506..43f0030 100644 --- a/R/report_as_rmarkdown.R +++ b/R/report_as_rmarkdown.R @@ -88,22 +88,23 @@ generate_pdf_report = function(dataset, output_dir, norm_algorithm = "vwmb", rol append_log("report: constructing plots specific for each contrast", type = "progress") de_proteins = dataset$de_proteins %>% left_join(dataset$proteins, by="protein_id") - column_contrasts = dataset_contrasts(dataset) - for(contr in column_contrasts) { - stats_contr = de_proteins %>% filter(contrast == contr) + for(contr in dataset$contrasts) { + stats_contr = de_proteins %>% filter(contrast == contr$label) if(nrow(stats_contr) == 0) { next } - mtitle = contr - if("contrast_ranvars" %in% colnames(stats_contr) && length(stats_contr$contrast_ranvars[1]) > 0 && !is.na(stats_contr$contrast_ranvars[1]) && stats_contr$contrast_ranvars[1] != "") { - mtitle = paste0(mtitle, "\nuser-specified random variables added to regression model: ", stats_contr$contrast_ranvars[1]) + mtitle = paste0("contrast: ", contr$label_contrast) + if(length(contr$colname_additional_variables) > 0) { + mtitle = paste0(mtitle, "\nuser-specified random variables added to regression model: ", paste(contr$colname_additional_variables, collapse = ", ")) } # optionally, provide thresholds for foldchange and qvalue so the volcano plot draws respective lines - l_contrast[[contr]] = list(p_volcano_contrast = lapply(plot_volcano(stats_de = stats_contr, log2foldchange_threshold = stats_contr$signif_threshold_log2fc[1], qvalue_threshold = stats_contr$signif_threshold_qvalue[1], mtitle = mtitle), "[[", "ggplot"), - p_pvalue_hist = plot_pvalue_histogram(stats_contr %>% mutate(color_code = dea_algorithm), mtitle=contr), - p_foldchange_density = plot_foldchanges(stats_contr %>% mutate(color_code = dea_algorithm), mtitle=contr) ) + l_contrast[[contr$label]] = list( + p_volcano_contrast = lapply(plot_volcano(stats_de = stats_contr, log2foldchange_threshold = stats_contr$signif_threshold_log2fc[1], qvalue_threshold = stats_contr$signif_threshold_qvalue[1], mtitle = mtitle), "[[", "ggplot"), + p_pvalue_hist = plot_pvalue_histogram(stats_contr %>% mutate(color_code = dea_algorithm), mtitle = mtitle), + p_foldchange_density = plot_foldchanges(stats_contr %>% mutate(color_code = dea_algorithm), mtitle = mtitle) + ) } diff --git a/R/rollup.R b/R/rollup.R index 5fd5b3c..84f6fdd 100644 --- a/R/rollup.R +++ b/R/rollup.R @@ -204,7 +204,7 @@ rollup_pep2prot_maxlfq = function(tib, intensity_is_log2, implementation = "iq", filter(is.finite(intensity) & intensity > 0) } - append_log_timestamp(sprintf("peptide to protein rollup with MaxLFQ (implementation: %s)", implementation), start_time) + # append_log_timestamp(sprintf("peptide to protein rollup with MaxLFQ (implementation: %s)", implementation), start_time) return(result) # debug; diff --git a/R/sample_metadata.R b/R/sample_metadata.R index d4e9dad..13a4ea8 100644 --- a/R/sample_metadata.R +++ b/R/sample_metadata.R @@ -468,21 +468,420 @@ sample_metadata_sort_and_filter = function(df, sample_exclude_regex = "", group_ -compose_contrast_name = function(a, b) { - paste("contrast:", paste(unique(a), collapse = ","), "vs", paste(unique(b), collapse = ",")) +#' test if the dataset has contrast definitions that are incompatible with MS-DAP version 1.2 +#' +#' @param dataset your dataset +#' 
@export +has_legacy_contrast_definitions = function(dataset) { + is.list(dataset) && ( + ("samples" %in% names(dataset) && is.data.frame(dataset$samples) && any(grepl("^contrast:", colnames(dataset$samples), ignore.case = TRUE))) || + # regex: has intensity_contrast column that does not contain a hashtag + # (all new-style contrast names do because the dataset$samples column used as "condition" is encoded in contrast labels since MS-DAP release 1.2) + ("peptides" %in% names(dataset) && is.data.frame(dataset$peptides) && any(grepl("^intensity_contrast[^#]+$", colnames(dataset$peptides), ignore.case = TRUE))) + ) +} + + + +#' throw an error if has_legacy_contrast_definitions() is TRUE +#' +#' @param dataset your dataset +error_legacy_contrast_definitions = function(dataset) { + if(has_legacy_contrast_definitions(dataset)) { + append_log("this dataset was constructed with an older, incompatible MS-DAP version. To fix this, remove and then redefine all contrasts with the following 2 steps:\ndataset = remove_contrasts(dataset)\ndataset = setup_contrasts(dataset, ... same definitions you used before ...)\nDone! you may now use the updated dataset in MS-DAP as per usual", type = "error") + } +} + + + +#' Remove all contrast definitions and respective filtering and DEA results +#' +#' @description +#' also removes contrast-related information from the dataset that was used in older MS-DAP versions (prior to version 1.2) +#' +#' @param dataset your dataset +#' @seealso [print_contrasts()], [add_contrast()] +#' @export +remove_contrasts = function(dataset) { + if(!is.list(dataset)) { + append_log("dataset parameter must be a list", type = "error") + } + # remove legacy random variables specification if it exists + dataset$dea_random_variables = NULL + # remove legacy contrast definitions in dataset$samples + if("samples" %in% names(dataset) && is.data.frame(dataset$samples)) { + dataset$samples = dataset$samples %>% select(-tidyselect::starts_with("contrast:")) + } + # remove all contrast-filters from peptide table + if("peptides" %in% names(dataset) && is.data.frame(dataset$peptides)) { + dataset$peptides = dataset$peptides %>% select(-tidyselect::starts_with("intensity_contrast")) + } + # remove all dea and dd results + dataset$de_proteins = NULL + dataset$dd_proteins = NULL + # init empty list + dataset$contrasts = list() + return(dataset) } -decompose_contrast_name = function(x) { - x = sub("^contrast: *", "", x) - lapply(strsplit(x, split = " *vs *"), strsplit, split = " *, *") +#' Print an overview of all defined contrasts and their respective sample metadata tables to the console +#' +#' @param dataset your dataset +#' @seealso [add_contrast()], [remove_contrasts()] +#' @export +print_contrasts = function(dataset) { + if(!is.list(dataset)) { + append_log("dataset parameter must be a list", type = "error") + } + + if(!is.list(dataset$contrasts) || length(dataset$contrasts) == 0) { + append_log("no contrasts have been defined", type = "info") + } + + for(i in seq_along(dataset$contrasts)) { + contr = dataset$contrasts[[i]] + cat(ifelse(i > 1, "\n", ""), contr$label, "\n") + print(inner_join( + dataset$samples %>% select(sample_id, shortname), # optionally, include 'fraction': tidyselect::any_of("fraction") + contr$sample_table, + by = "sample_id" + )) + } +} + + + +#' Create a contrast for differential expression analysis +#' +#' @description +#' Note that an MS-DAP contrast for "A vs B" will return foldchanges for B/A in downstream output +#' tables and data visualizations.
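# A minimal sketch of the migration path enforced by error_legacy_contrast_definitions() above, assuming
# `dataset` is an object created with MS-DAP prior to version 1.2; the "group" column and the "WT"/"KO"
# values are placeholders for whatever is in your own sample metadata table.
if(has_legacy_contrast_definitions(dataset)) {
  dataset = remove_contrasts(dataset)  # drops legacy "contrast:" columns, per-contrast intensity filters and DEA/DD results
  dataset = add_contrast(dataset, colname_condition_variable = "group",
                         values_condition1 = "WT", values_condition2 = "KO")
  print_contrasts(dataset)  # verify the new contrast definition before re-running filtering/DEA
}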
For example, for the contrast "control vs disease" a positive +#' log2 foldchange implies protein abundances are higher in the "disease" sample group. +#' +#' Throughout this function, all samples where the column "exclude" is set to TRUE are disregarded. +#' +#' @examples \dontrun{ +#' # first, remove all existing contrasts (and their respective DEA results) +#' dataset = remove_contrasts(dataset) +#' +#' # Assume that column "group" in your sample metadata table specifies sample groups +#' # The following example will create the contrast "WT vs KO" +#' dataset = add_contrast(dataset, "group", "WT", "KO") +#' +#' # If the sample metadata table contains a column "batch" that should be used as +#' # a regression variable (works for ebayes/deqms/msqrob), we can add it as follows: +#' dataset = add_contrast(dataset, "group", "WT", "KO", colname_additional_variables = "batch") +#' +#' # Elaborate example; create a contrast while matching multiple values per group. +#' # Testing all motor- and visual-cortex samples (described in the "brain_region" +#' # column) against prefrontal cortex samples, with additional regression variables +#' # batch and age. +#' # In this example, the sample metadata table must contain columns +#' # "brain_region", "batch", "age" +#' dataset = add_contrast( +#' dataset, +#' # this parameter describes 1 column name in `dataset$samples` +#' colname_condition_variable = "brain_region", +#' # values in "brain_region" that are the first group in A/B testing +#' values_condition1 = c("motor_cortex", "visual_cortex"), +#' # analogous, but for the second group in A/B testing. +#' # note that you may alternatively set this to `NA` to indicate +#' # "everything except values in values_condition1" +#' values_condition2 = c("prefrontal_cortex"), +#' # a vector/array of 0 or more column names in dataset$samples +#' # that should be used as additional regression variables +#' colname_additional_variables = c("batch", "age") +#' ) +#' +#' # Assume that the sample metadata table contains a column "group" with values +#' # "A", "B", "C", "D". The following code will create the contrast "A vs B,C,D" +#' # by setting the second set of values to `NA` +#' dataset = add_contrast( +#' dataset, +#' colname_condition_variable = "group", +#' values_condition1 = "A", +#' values_condition2 = NA +#' ) +#' +#' ## The "group" column in the sample table is used for group definitions in +#' ## "all group" filtering. Here we create two contrasts based on different +#' ## regression variables in the sample table.
+#' dataset = remove_contrasts(dataset) # optionally, remove previously defined contrasts +#' dataset = add_contrast( +#' dataset, +#' colname_condition_variable = "genotype", +#' values_condition1 = "control", +#' values_condition2 = "knockout", +#' colname_additional_variables = "batch" +#' ) +#' dataset = add_contrast( +#' dataset, +#' colname_condition_variable = "genotype", +#' values_condition1 = "control", +#' values_condition2 = c("knockout", "mutant1"), +#' # note that we have the flexibility to add different regression variables per contrast +#' colname_additional_variables = "batch" +#' ) +#' # print an overview of all contrasts +#' print_contrasts(dataset) +#' # apply typical MS-DAP pipeline +#' dataset = analysis_quickstart( +#' dataset, +#' filter_min_detect = 0, ## if DDA, we might not require minimum MS/MS counts +#' filter_min_quant = 3, +#' filter_fraction_detect = 0, +#' filter_fraction_quant = 0.75, +#' filter_by_contrast = FALSE, ## we only want filtering across all groups +#' filter_min_peptide_per_prot = 2, ## 2 peptides per protein +#' dea_algorithm = "deqms", +#' norm_algorithm = c("vwmb", "modebetween_protein"), +#' output_qc_report = TRUE, +#' output_abundance_tables = TRUE, +#' output_dir = "C:/temp", ## you may set this to NA to skip the QC report +#' output_within_timestamped_subdirectory = TRUE +#' ) +#' } +#' @param dataset your dataset. Make sure you've already imported sample metadata +#' @param colname_condition_variable sample metadata column name that should be used for the experimental condition. Typically, this is the "group" column. Should be any of the values in `user_provided_metadata(dataset$samples)` +#' @param values_condition1 array of values from column `colname_condition_variable` that are the first group in the contrast. Note that values must match entries in that column (case-sensitive) and must not contain NA, empty strings or duplicates +#' @param values_condition2 analogous to `values_condition1`, but for the second group. Note that you can set this to NA to indicate "everything except values in values_condition1" +#' @param colname_additional_variables optionally, sample metadata column names that should be used as additional regression variables (only the subset of `user_provided_metadata(dataset$samples)`, NOT including the value provided as parameter `colname_condition_variable`) +#' @seealso [print_contrasts()] to print an overview of defined contrasts, [remove_contrasts()] to remove all current contrasts (and respective filtering and DEA results) +#' @export +add_contrast = function(dataset, colname_condition_variable, values_condition1, values_condition2, colname_additional_variables = NULL) { + # validate dataset + if(!is.list(dataset) || !"samples" %in% names(dataset) || !is.data.frame(dataset$samples)) { + append_log("Dataset does not contain a sample metadata table (dataset$samples)", type = "error") + } + if(!all(c("sample_id", "exclude") %in% colnames(dataset$samples)) || !all(dataset$samples$exclude %in% c(TRUE, FALSE))) { + append_log("Expected column 'sample_id' and boolean column 'exclude' in dataset$samples", type = "error") + } + if("contrasts" %in% names(dataset) && !is.list(dataset$contrasts)) { + append_log("dataset$contrasts must be a list.
Overwriting this variable", type = "warning") + dataset = remove_contrasts(dataset) # inits a new list + } + if(!"contrasts" %in% names(dataset)) { + dataset = remove_contrasts(dataset) # inits a new list + } + + # validate condition colname + valid_sample_metadata_columns = user_provided_metadata(dataset$samples) + if(length(colname_condition_variable) != 1 || is.na(colname_condition_variable) || colname_condition_variable == "" || !colname_condition_variable %in% valid_sample_metadata_columns) { + append_log(paste0( + 'Invalid colname_condition_variable parameter: this should be a sample metadata column name that indicates the experimental condition, but provided value "', colname_condition_variable, + '" does not exist in the sample metadata table! Note that matching is case-sensitive\nAvailable column names are: ', + paste(valid_sample_metadata_columns, collapse=",")), type = "error") + } + tmp = unlist(dataset$samples[,colname_condition_variable], recursive = FALSE, use.names = FALSE) + if(anyNA(tmp) || !is.character(tmp) || any(tmp == "")) { + append_log('The sample metadata column provided in parameter colname_condition_variable must contain character values and not any NA or empty strings', type = "error") + } + + # validate additional variables colnames + if(!is.null(colname_additional_variables)) { + if(anyNA(colname_additional_variables) || !is.character(colname_additional_variables) || any(colname_additional_variables %in% c("", "condition")) || anyDuplicated(colname_additional_variables)) { + append_log('Invalid colname_additional_variables parameter: must be a character array. Values cannot be NA, an empty string or "condition", and duplicates are not allowed (leave at the default NULL if there are no additional regression variables)', type = "error") + } + if(colname_condition_variable %in% colname_additional_variables) { + append_log('Parameter colname_additional_variables should not overlap with parameter colname_condition_variable', type = "error") + } + tmp = setdiff(colname_additional_variables, valid_sample_metadata_columns) + if(length(tmp) > 0) { + append_log(paste0( + 'Invalid colname_additional_variables parameter: this should be an array of sample metadata column names that indicate additional regression variables, but provided value(s) "', + paste(tmp, collapse = ","), + '" do not exist in the sample metadata table! Note that matching is case-sensitive\nAvailable column names are: ', + paste(valid_sample_metadata_columns, collapse=",")), type = "error") + } + } + + # sorted sample metadata table only for samples in current contrast + transform condition into [1,2] classification + samples = dataset$samples %>% + filter(exclude == FALSE) %>% + # importantly, select relevant regression variable columns **in order of priority** (with sample identifiers in front) + select(sample_id, condition = !!as.symbol(colname_condition_variable), tidyselect::all_of(colname_additional_variables)) + + # validate + if(length(values_condition1) == 0 || anyNA(values_condition1) || !is.character(values_condition1) || any(values_condition1 == "" | !values_condition1 %in% unique(samples$condition)) || anyDuplicated(values_condition1)) { + append_log(sprintf('parameter values_condition1 contains invalid values. Must match a value in sample metadata column "%s" (i.e. the column name you provided in parameter colname_condition_variable).
Values must not be NA or empty strings, and duplicates are not allowed', colname_condition_variable), type = "error") + } + if(length(values_condition2) == 1 && is.na(values_condition2)) { # infer second group + values_condition2 = setdiff(unique(samples$condition), values_condition1) + if(length(values_condition2) == 0) { + append_log(paste0('parameter values_condition2 is set to NA but all unique values in sample metadata column "', colname_condition_variable, '" are already provided at parameter values_condition1 (note that "exclude" samples are disregarded)'), type = "error") + } + } else { + if(length(values_condition2) == 0 || anyNA(values_condition2) || !is.character(values_condition2) || any(values_condition2 == "" | !values_condition2 %in% unique(samples$condition)) || anyDuplicated(values_condition2)) { + append_log(sprintf('parameter values_condition2 contains invalid values. Must match a value in sample metadata column "%s" (i.e. the column name you provided in parameter colname_condition_variable). Values must not be NA or empty strings, and duplicates are not allowed', colname_condition_variable), type = "error") + } + } + tmp = intersect(values_condition1, values_condition2) + if(length(tmp) > 0) { + append_log(paste0('there must not be any overlap between parameters values_condition1 and values_condition2. Overlapping values: ', paste(tmp, collapse = ",")), type = "error") + } + + samples = samples %>% + filter(condition %in% c(values_condition1, values_condition2)) %>% + mutate(condition = as.integer((condition %in% values_condition2) + 1)) %>% + arrange(condition) + + if(sum(samples$condition == 1L) < 2) { + append_log(paste0('less than 2 samples (that are not set to "exclude") in sample metadata column "', colname_condition_variable, '" match the values_condition1 parameter; contrasts need at least 2 samples in each group'), type = "error") + } + if(sum(samples$condition == 2L) < 2) { + append_log(paste0('less than 2 samples (that are not set to "exclude") in sample metadata column "', colname_condition_variable, '" match the values_condition2 parameter; contrasts need at least 2 samples in each group'), type = "error") + } + + # convert each column to factor/numeric + samples = enforce_sample_value_types(samples, redundant_columns = "error", show_log = TRUE) + + # check that each additional variable varies within at least one condition (otherwise it is confounded with the condition itself) + for(col in colname_additional_variables) { + y = samples %>% pull(!!col) + if(n_distinct(y[samples$condition == 1L]) == 1 && n_distinct(y[samples$condition == 2L]) == 1) { + append_log(paste0('Invalid regression variable "', col, '" has no variation within either condition (solution: remove it from the colname_additional_variables parameter for the current contrast)'), type = "error") + } + } + + # compose label + lbl_contrast = sprintf( + "%s vs %s", + paste(values_condition1, collapse=","), + paste(values_condition2, collapse=",") + ) + lbl = sprintf( + "contrast: %s vs %s # condition_variable: %s", + paste(values_condition1, collapse=","), + paste(values_condition2, collapse=","), + colname_condition_variable + ) + if(length(colname_additional_variables) > 0) { + lbl = paste0(lbl, " # additional_variables: ", paste(colname_additional_variables, collapse=",")) + } + + lbl = sub(" *$", "", lbl) # trim trailing whitespace + append_log(lbl, type = "info") + + # double-check duplicates + isdupe = FALSE + for(i in seq_along(dataset$contrasts)) { + if(dataset$contrasts[[i]]$label == lbl) { + append_log(paste0("skipping a contrast that was already added to the dataset: ", lbl),
type = "warning") + isdupe = TRUE + break + } + } + + if(!isdupe) { + # finally, store the data + dataset$contrasts[[length(dataset$contrasts) + 1]] = list( + label = lbl, + label_contrast = lbl_contrast, + colname_condition_variable = colname_condition_variable, + colname_additional_variables = colname_additional_variables, + values_condition1 = values_condition1, + values_condition2 = values_condition2, + sampleid_condition1 = samples %>% filter(condition == 1) %>% pull(sample_id), + sampleid_condition2 = samples %>% filter(condition == 2) %>% pull(sample_id), + # the samples table is essential: we've added a 'condition' column and included additional variables relevant for the (user-specified) linear regression + sample_table = samples, + model_matrix = stats::model.matrix(~ . , data = samples %>% select(-sample_id)), + # in the way the samples table is currently constructed, + # we'd always need the "condition" regression variable as our main result + regression_coefficient_name = "condition" + ) + } + + return(dataset) +} + + + +#' convert the data type of column in a sample metadata table to factors or numeric type +#' +#' @description importantly, columns "sample_index", "sample_id" and "shortname" are ignored alltogether +#' @param samples dataset$samples or a subset thereof. Typically, only columns sample_id and user-defined regression variables are included as columns +#' @param redundant_columns how to deal with columns in `samples` that are redundant with other columns. Options: "error" (default), "warning", "ignore" +#' @param show_log boolean, show data type per column in console? Default: `TRUE` +enforce_sample_value_types = function(samples, redundant_columns = "error", show_log = TRUE) { + if(!is.data.frame(samples) || ncol(samples) == 0 || nrow(samples) == 0) { + append_log("samples parameter must be a non-empty data.frame", type = "error") + } + if(length(redundant_columns) != 1 || !redundant_columns %in% c("error", "warning", "ignore")) { + append_log('redundant_columns parameter must be any of these options: "error", "warning", "ignore"', type = "error") + } + if(length(show_log) != 1 || !show_log %in% c(TRUE, FALSE)) { + append_log("show_log parameter must be either TRUE or FALSE", type = "error") + } + + colnames_check = setdiff(colnames(samples), c("sample_index", "sample_id", "shortname")) + for(col in colnames_check) { + values = unlist(samples[,col], recursive = FALSE, use.names = FALSE) + # input validation + if(anyNA(values) || any(is.character(values) & values == "")) { + append_log(paste0( + "Invalid values (NA or empty string) in sample metadata (dataset$samples) column: ", col, + "\n(solution: edit your sample metadata Excel table and replace empty values, then re-import it in R with import_sample_metadata() and finally re-run this function)" + ), type = "error") + } + + # transform data types to numeric/factor + if(all(is.numeric(values) | is.factor(values))) { + # nothing to do. i.e. 
we don't want to recast integers (which are valid is.numeric() nor factors) + is_numeric = is.numeric(values) + } else { + values_recast = suppressWarnings(as.numeric(values)) + is_numeric = all(is.finite(values_recast)) + if(is_numeric) { + samples[,col] = values_recast + } else { + values_recast = factor(values, levels = unique(values)) # don't resort, set levels according to input table sorting + samples[,col] = values_recast + } + } + + # print data type + if(show_log) { + if(is_numeric) { + append_log(paste0("numeric variable: ", col), type = "info") + } else { + append_log(paste0("categorical variable (R factor): ", col), type = "info") + } + } + } + + # check that each variable is unique + if(redundant_columns != "ignore") { + # cast to numeric matrix so factors are checked as integer values (i.e. this helps spot patterns like A,A,B,B vs C,C,D,D - albeit not perfectly) + m = as.matrix( + samples %>% + select(-tidyselect::any_of(c("sample_index", "sample_id", "shortname")) ) %>% + mutate(across(tidyselect::where(is.factor), as.numeric)) + ) + # check if any of the previous columns is the same + if(ncol(m) > 1) { + for(i in 2L:ncol(m)) { + for(j in 1L:(i-1L)) { + if(all(m[,i] == m[,j])) { + append_log(sprintf('Invalid regression variable "%s" is redundant, the same data is also encoded in regression variable "%s" (solution: do not use this variable in your regression model/design)', + colnames(m)[i], colnames(m)[j]), type = redundant_columns) + } + } + } + } + } + + return(samples) } #' Setup contrasts for downstream Differential Expression Analysis #' +#' For more fine-grained control over specified contrasts, use the functions `remove_contrasts()` and `add_contrast()`. +#' #' Note that a MS-DAP contrast for "A vs B" will return foldchanges for B/A in downstream output tables and data visualizations. For example, for the contrast "control vs disease" a positive log2 foldchange implies protein abundances are higher in the "disease" sample group. #' #' @param dataset your dataset. Make sure you've already imported sample metadata (so each sample is assigned to a sample group) @@ -504,97 +903,30 @@ decompose_contrast_name = function(x) { #' @export setup_contrasts = function(dataset, contrast_list, random_variables = NULL) { if(!"samples" %in% names(dataset) || !is.data.frame(dataset$samples) || length(dataset$samples) == 0) { - append_log("sample metadata table ('samples') is missing from the dataset. Run import_sample_metadata() prior to this function.", type="error") + append_log("sample metadata table ('samples') is missing from the dataset. Run import_sample_metadata() prior to this function.", type = "error") } - if(!"group" %in% colnames(dataset$samples)) { - append_log("sample metadata table ('samples') must contain the column 'group'. Run import_sample_metadata() prior to this function.", type="error") + append_log("sample metadata table ('samples') must contain the column 'group'. Run import_sample_metadata() prior to this function.", type = "error") } - - ## random variables - random_variables = unique(random_variables) - - if(length(random_variables) > 0 && !all(random_variables %in% colnames(dataset$samples))) { - append_log(paste("random_variables are case-sensitive and may only contain sample metadata (each value must match a column name in dataset$samples). 
Provided random_variables that are _not_ found in sample metadata table:", paste(setdiff(random_variables, colnames(dataset$samples)), collapse=", ")), type="error") - } - # not strictly needed, but help out the user by catching common mistakes; random effects as used in this pipeline must be 'additional metadata' besides the predefined contrasts - ranvar_blacklist = c("sample_id", "group", "condition") - if(any(ranvar_blacklist %in% random_variables)) { - append_log(paste("random_variables should only contain 'additional metadata' that can be used to control for batch effects etc _besides_ the predefined contrasts in contrast_list, and _not_ contain these terms;", paste(intersect(random_variables, ranvar_blacklist), collapse=", ")), type="error") - } - - # there can be no NA values or empty strings for sample metadata to be used as random variables in any downstream regression ('exclude' samples disregarded, these are never used in downstream DEA) - if(length(random_variables) > 0) { - ranvar_missing_values = NULL - # iterate respective columns of sample metadata - s = dataset$samples %>% filter(exclude == FALSE) %>% select(!!random_variables) - for(v in colnames(s)) { - x = s %>% pull(v) - if(any(is.na(x) | x == "")) { - # collect columns that contain missing values - ranvar_missing_values = c(ranvar_missing_values, v) - } - } - # report errors - if(length(ranvar_missing_values) > 0) { - append_log(paste("sample metadata table must not contain any missing values for columns that match the random_variables provided to this function. Sample metadata columns with missing values:", paste(ranvar_missing_values, collapse=", ")), type="error") - } + if(length(contrast_list) == 0 || !is.list(contrast_list)) { + append_log("contrast_list parameter must be a non-empty list", type = "error") } + if(length(dataset$contrasts) > 0) { + append_log("removing previous contrasts", type = "info") + } + dataset = remove_contrasts(dataset) - # simply store list as dataset attribute - # removes this variable (assign NULL) if no random_variables are supplied, as it should - dataset$dea_random_variables = random_variables - - - # reset cache - dataset = invalidate_cache(dataset) - - column_contrasts = dataset_contrasts(dataset) - # drop previous contrasts, if any - tib = dataset$samples %>% select(-matches(column_contrasts)) + random_variables = unique(random_variables) - for (index in seq_along(contrast_list)) { + for(index in seq_along(contrast_list)) { contr = contrast_list[[index]] - if (length(contr) != 2 || any(lengths(contr) == 0)) { + if(length(contr) != 2 || any(lengths(contr) == 0)) { append_log("each contrast should be a list with 2 elements (each is a non-empty array of sample group names)", type = "error") } - # pretty-print labels - groups_a = contr[[1]] - groups_b = contr[[2]] - lbl = compose_contrast_name(groups_a, groups_b) - - # input validation (is this a valid contrast list?) - if (all(groups_a %in% groups_b) && all(groups_b %in% groups_a)) { - append_log(paste("same group(s) on both sides of the", lbl), type = "error") - } - if (any(groups_a %in% groups_b) || any(groups_b %in% groups_a)) { - append_log(paste("the same group cannot be on both sides of the", lbl), type = "error") - } - if (any(!groups_a %in% tib$group) || any(!groups_b %in% tib$group)) { - append_log(paste("contrast definition contains a sample group that is not part of your dataset (i.e. 
not present in sample metadata table dataset$samples) @ ", lbl), type = "error") - } - - # conditions are stored as integers - # 1 = group A, 2 = group B, 0 = samples from other groups or those flagged as 'exclude' - mask = as.integer(tib$group %in% groups_a) - mask[tib$group %in% groups_b] = 2L - if ("exclude" %in% colnames(tib)) { - mask[tib$exclude %in% TRUE] = 0L - } - tib[, lbl] = as.integer(mask) - - ## QC: must have 2+ samples on both sides of the contrast - count_samples_condition_a = sum(mask == 1) - count_samples_condition_b = sum(mask == 2) - if(count_samples_condition_a < 2 || count_samples_condition_b < 2) { - append_log(sprintf("invalid contrast; '%s' vs '%s'. The former contains %d samples, the latter %d samples, while both should have at least 2 samples (disregarding samples that are flagged as 'exclude')", - paste(groups_a, collapse = ","), paste(groups_b, collapse = ","), count_samples_condition_a, count_samples_condition_b), type = "error") - } - - append_log(lbl, type = "info") + dataset = add_contrast(dataset, colname_condition_variable = "group", values_condition1 = contr[[1]], values_condition2 = contr[[2]], colname_additional_variables = random_variables) } - dataset$samples = tib + return(dataset) } diff --git a/R/stats_differential_abundance.R b/R/stats_differential_abundance.R index 27a6641..65cb8db 100644 --- a/R/stats_differential_abundance.R +++ b/R/stats_differential_abundance.R @@ -1,50 +1,122 @@ -#' apply limma::eBayes() function to a protein-level ExpressionSet -#' -#' ref; PMID:25605792 -#' ref; https://bioconductor.org/packages/release/bioc/html/limma.html -#' -#' @param eset_proteins protein-level dataset stored as a Biobase ExpressionSet -#' @param input_intensities_are_log2 boolean indicating whether the ExpressionSet's intensity values are already log2 scaled (default: TRUE) -#' @param random_variables a vector of column names in your sample metadata table that are added as additional(!) regression terms in each statistical contrast tested downstream + +#' Input validation for eBayes/DEqMS/MS-EmpiRe functions #' -#' @importFrom Biobase exprs pData -#' @importFrom limma eBayes topTable lmFit -#' @export -de_ebayes = function(eset_proteins, input_intensities_are_log2 = TRUE, random_variables = NULL) { - start_time = Sys.time() - if(length(random_variables) > 0 && (!is.vector(random_variables) || any(is.na(random_variables) | !is.character(random_variables))) ) { - append_log(paste("random_variables must either be NULL or character vector", paste(random_variables, collapse=", ")), type = "error") +#' @param eset protein/peptide log2 intensity matrix stored as a Biobase ExpressionSet. Must describe protein_id and sample_id in metadata +#' @param model_matrix a `stats::model.matrix()` result that is supplied to `limma::lmFit()` +#' @param model_matrix_result_prop the column name in `model_matrix` that should be returned as the resulting coefficient estimated by `eBayes()`. In MS-DAP workflows this is typically "condition" +validate_eset_fitdata = function(eset, model_matrix, model_matrix_result_prop) { + if(length(eset) != 1 || !"ExpressionSet" %in% class(eset)) { + append_log("parameter eset must be an ExpressionSet", type = "error") + } + + f = Biobase::fData(eset) + if(!is.data.frame(f) || !"protein_id" %in% colnames(f) || anyNA(f$protein_id)) { + append_log('parameter eset must be an ExpressionSet with non-NA "protein_id" as a featuredata (i.e. 
Biobase::fData(eset)$protein_id)', type = "error") } - # transform to log2 if input data is non-log - if (!input_intensities_are_log2) { - x = log2(Biobase::exprs(eset_proteins)) - x[!is.finite(x)] = NA - Biobase::exprs(eset_proteins) = x + p = Biobase::pData(eset) + if(!is.data.frame(p) || !"sample_id" %in% colnames(p) || anyNA(p$sample_id)) { + append_log('parameter eset must be an ExpressionSet with non-NA "sample_id" as a phenotypic data (i.e. Biobase::pData(eset)$sample_id)', type = "error") } - # data.frame with sample metadata for those random variables that are to be tested in limma::eBayes - random_variables = unique(c("condition", random_variables)) # use unique just in case the user erronously passed the condition as 'additional random variable' - df_metadata = Biobase::pData(eset_proteins) + if(!is.matrix(model_matrix)) { + append_log("parameter model_matrix must be a matrix", type = "error") + } + + if(nrow(model_matrix) != nrow(p)) { + append_log("parameters eset and model_matrix must describe the same number of samples (i.e. number of columns/samples in ExpressionSet must match the number of rows in model_matrix)", type = "error") + } - if(!all(random_variables %in% colnames(df_metadata))) { - append_log(paste("eset_proteins ExpressionSet pData() must contain columns 'condition' and all additional random variables requested in `random_variables` parameter. missing:", paste(setdiff(random_variables, colnames(df_metadata)), collapse=", ")), type = "error") + if(length(model_matrix_result_prop) != 1 || !is.character(model_matrix_result_prop) || + model_matrix_result_prop == "" || !model_matrix_result_prop %in% colnames(model_matrix)) { + append_log("parameter model_matrix_result_prop must be a single string, matching any of the column names in the model_matrix parameter", type = "error") } +} + - # input validation, then apply limma::ebayes() - x = Biobase::exprs(eset_proteins) - df_metadata = df_metadata[ , random_variables, drop=F] - fit = de_ebayes_fit(x, random_variables = df_metadata) +#' helper function to double-check we don't submit an ExpressionSet with too few values to DEA +#' +#' @param eset a Biobase ExpressionSet +#' @param model_matrix_result_prop the column name in `model_matrix` that should be returned as the resulting coefficient estimated by `eBayes()`. 
In MS-DAP workflows this is typically "condition" +#' @param is_peptide boolean indicating peptide-level (FALSE = protein-level) +#' @param method_name label for printing error messages +subset_protein_eset_for_dea = function(eset, model_matrix_result_prop, is_peptide, method_name) { + x = Biobase::exprs(eset) + ptmp = Biobase::pData(eset) + stopifnot("subset_protein_eset_for_dea invalid eset * model_matrix_result_prop combination" = model_matrix_result_prop %in% colnames(ptmp)) + x_regression_var_value = unlist(ptmp[,model_matrix_result_prop]) # model_matrix_result_prop is typically "condition" + cols1 = x_regression_var_value == x_regression_var_value[1] # columns condition 1 + cols2 = !cols1 # columns condition 2 + # test for both NA and zeros + # (compatible with eBayes/DEqMS/MSqRob ExpressionSets that have log2 intensities, + # and with MS-EmpiRe datasets with plain intensities sans log transformation that contain zeros for missing values) + rows = rowSums(is.finite(x[,cols1,drop=FALSE]) & x[,cols1,drop=FALSE] > 0) >= 2 & + rowSums(is.finite(x[,cols2,drop=FALSE]) & x[,cols2,drop=FALSE] > 0) >= 2 + + if(sum(rows) < 10) { + append_log(sprintf("less than 10 %s have a non-zero value in both conditions: too little input data for DEA", ifelse(is_peptide, "peptides", "proteins")), type = "error") + } + + # if there are any rows to remove, create a subset of the ExpressionSet + show warning (i.e. this should never occur during typical MS-DAP workflow that includes filtering) + if(any(!rows)) { + ftmp = Biobase::fData(eset)[rows,] + y = x[rows,,drop=FALSE] + eset = Biobase::ExpressionSet( + assayData = y, + featureData = Biobase::annotatedDataFrameFrom(y, byrow = T), + protocolData = Biobase::annotatedDataFrameFrom(y, byrow = F) + ) + Biobase::pData(eset) = ptmp + Biobase::fData(eset) = ftmp + + append_log(sprintf("DEA requires at least 2 values per experimental condition; %d / %d %s were removed prior to applying %s\n(in typical MS-DAP workflows one would apply peptide-filtering upstream, e.g. with parameters in the analysis_quickstart() function)", + sum(!rows), length(rows), ifelse(is_peptide, "peptides", "proteins"), method_name), type = "warning") + } + + return(eset) +} + + + +#' Apply limma::eBayes() function to a protein-level ExpressionSet +#' +#' ref; PMID:25605792 +#' ref; https://bioconductor.org/packages/release/bioc/html/limma.html +#' +#' @examples \dontrun{ +#' # Minimal example: +#' # Let `m` be a log2 protein intensity matrix where +#' # columns are samples and rows are proteins. +#' # Let `eset` be a Biobase::ExpressionSet that contains `m`. +#' # Let `m_groups` be a character vector that describes the +#' # sample group/condition for each column in `m`. +#' # e.g. m_groups = c("WT", "WT", "WT", "KO", "KO", "KO") +#' model_matrix = stats::model.matrix(~m_groups) +#' de_ebayes(eset, model_matrix, colnames(model_matrix)[2]) +#' } +#' @param eset protein-level log2 intensity matrix stored as a Biobase ExpressionSet +#' @param model_matrix a `stats::model.matrix()` result that is supplied to `limma::lmFit()` +#' @param model_matrix_result_prop the column name in `model_matrix` that should be returned as the resulting coefficient estimated by `eBayes()`.
In MS-DAP workflows this is typically "condition" +#' @export +de_ebayes = function(eset, model_matrix, model_matrix_result_prop) { + start_time = Sys.time() + + # will throw an error if input data is invalid + validate_eset_fitdata(eset, model_matrix, model_matrix_result_prop) + # ensure there are at least 2 valid data points per condition / group @ model_matrix_result_prop (subset the expressionset otherwise) + eset = subset_protein_eset_for_dea(eset, model_matrix_result_prop, is_peptide = FALSE, method_name = "eBayes") + + x = Biobase::exprs(eset) + fit = suppressWarnings(limma::eBayes(limma::lmFit(x, model_matrix))) # !! sort.by="none" keeps the output table aligned with input matrix - # always extract the first regression variable (intercept has index 1, so the column number we need is 2) - result = suppressMessages(limma::topTable(fit, number = nrow(x), coef = 2, adjust.method = "fdr", sort.by = "none", confint = TRUE)) + result = suppressMessages(limma::topTable(fit, number = nrow(x), coef = model_matrix_result_prop, adjust.method = "fdr", sort.by = "none", confint = TRUE)) # note; fit$coefficients and fit$stdev.unscaled matrices contain the intercept, don't use in the ES and SE computation # eBayes effect size: Cohen's d in limma, according to Gordon Smyth https://support.bioconductor.org/p/71747/#71781 - result$effectsize = fit$coefficients[,2] / sqrt(fit$s2.post) + result$effectsize = fit$coefficients[,model_matrix_result_prop] / sqrt(fit$s2.post) # eBayes standard error, according to Gordon Smyth https://support.bioconductor.org/p/70175/ - result$standarderror = sqrt(fit$s2.post) * fit$stdev.unscaled[,2] + result$standarderror = sqrt(fit$s2.post) * fit$stdev.unscaled[,model_matrix_result_prop] result$standarddeviation = sqrt(fit$s2.post) # convert from data.frame to a tibble that follows the column names/format we expect downstream @@ -53,86 +125,41 @@ de_ebayes = function(eset_proteins, input_intensities_are_log2 = TRUE, random_va select(protein_id, pvalue = P.Value, qvalue = adj.P.Val, foldchange.log2 = logFC, effectsize, tstatistic = t, standarddeviation, standarderror) %>% add_column(dea_algorithm = "ebayes") - # note; we added condition to random variables above, strip it from result table for consistency within this R package ('condition' is always assumed to be a regression variable) - if(length(random_variables) > 1) { # ! test larger than 1 - result = result %>% add_column(contrast_ranvars = paste(setdiff(random_variables, "condition"), collapse = ", ")) - } append_log_timestamp("eBayes", start_time) return(result) } -#' apply DEqMS to a protein-level ExpressionSet +#' Apply DEqMS to a protein-level ExpressionSet #' #' ref; PMID:32205417 #' ref; https://github.com/yafeng/DEqMS #' #' implementation follows example code from the vignette; https://bioconductor.org/packages/release/bioc/vignettes/DEqMS/inst/doc/DEqMS-package-vignette.html #' -#' @param eset_proteins protein-level dataset stored as a Biobase ExpressionSet. Note that it must contain protein metadata column 'npep' that holds integer peptide counts and sample metadata column 'condition' -#' @param input_intensities_are_log2 boolean indicating whether the ExpressionSet's intensity values are already log2 scaled (default: TRUE) -#' @param random_variables a vector of column names in your sample metadata table that are added as additional(!) regression terms in each statistical contrast tested downstream +#' @param eset protein-level log2 intensity matrix stored as a Biobase ExpressionSet. 
Note that it must contain protein metadata column 'npep' that holds integer peptide counts +#' @param model_matrix a `stats::model.matrix()` result that is supplied to `limma::lmFit()` +#' @param model_matrix_result_prop the column name in `model_matrix` that should be returned as the resulting coefficient estimated by `eBayes()`. In MS-DAP workflows this is typically "condition" #' @param doplot create a QC plot? -#' -#' @importFrom Biobase exprs pData -#' @importFrom limma eBayes topTable lmFit -#' @importFrom DEqMS spectraCounteBayes #' @export -de_deqms = function(eset_proteins, input_intensities_are_log2 = TRUE, random_variables = NULL, doplot = FALSE) { +de_deqms = function(eset, model_matrix, model_matrix_result_prop, doplot = FALSE) { start_time = Sys.time() - ### input validation - - if(length(random_variables) > 0 && (!is.vector(random_variables) || any(is.na(random_variables) | !is.character(random_variables))) ) { - append_log(paste("random_variables must either be NULL or character vector", paste(random_variables, collapse=", ")), type = "error") - } + # will throw an error if input data is invalid + validate_eset_fitdata(eset, model_matrix, model_matrix_result_prop) + # ensure there are at least 2 valid data points per condition / group @ model_matrix_result_prop (subset the expressionset otherwise) + eset = subset_protein_eset_for_dea(eset, model_matrix_result_prop, is_peptide = FALSE, method_name = "DEqMS") - if(length(Biobase::pData(eset_proteins)) == 0 || !is.data.frame(Biobase::pData(eset_proteins))) { - append_log("eset_proteins ExpressionSet pData must be a data.frame", type = "error") - } - - if(length(Biobase::fData(eset_proteins)) == 0 || !is.data.frame(Biobase::fData(eset_proteins))) { - append_log("eset_proteins ExpressionSet fData must be a data.frame", type = "error") - } - - if(length(Biobase::pData(eset_proteins)$condition) == 0) { - append_log("eset_proteins ExpressionSet pData must contain column 'condition'", type = "error") - } - - tmp = Biobase::fData(eset_proteins)$npep + # additional input validation; DEqMS requires peptide counts + tmp = Biobase::fData(eset)$npep if(length(tmp) == 0 || !all(is.finite(tmp) & is.integer(tmp) & tmp > 0)) { - append_log("eset_proteins ExpressionSet fData must contain column 'npep' with positive integer values", type = "error") + append_log("parameter eset must contain a column 'npep' (i.e. Biobase::fData(eset)) with positive integer values", type = "error") } rm(tmp) - # data.frame with sample metadata for those random variables that are to be tested in limma::eBayes - random_variables = unique(c("condition", random_variables)) # use unique just in case the user erronously passed the condition as 'additional random variable' - df_metadata = Biobase::pData(eset_proteins) - - if(!all(random_variables %in% colnames(df_metadata))) { - append_log(paste("eset_proteins ExpressionSet pData() must contain columns 'condition' and all additional random variables requested in `random_variables` parameter. 
missing:", paste(setdiff(random_variables, colnames(df_metadata)), collapse=", ")), type = "error") - } - - ### input validation done - - - # transform to log2 if input data is non-log - if(!input_intensities_are_log2) { - x = log2(Biobase::exprs(eset_proteins)) - x[!is.finite(x)] = NA - Biobase::exprs(eset_proteins) = x - rm(x) - gc() - } - - # input validation, then apply limma::ebayes() - df_metadata = df_metadata[ , random_variables, drop=F] - eset_proteins__protein_id = rownames(Biobase::exprs(eset_proteins)) - fit = de_ebayes_fit(Biobase::exprs(eset_proteins), random_variables = df_metadata) - - - #### DEqMS unique code, everything else in this function is analogous to de_ebayes implementation #### + # eBayes fit + fit = suppressWarnings(limma::eBayes(limma::lmFit(Biobase::exprs(eset), model_matrix))) ### bugfix for DEqMS::spectraCounteBayes() # DEqMS will Loess fit log peptide counts versus log sigma^2 @@ -140,12 +167,12 @@ de_deqms = function(eset_proteins, input_intensities_are_log2 = TRUE, random_var # Log transforming sigma^0 where sigma is zero results in a non-finite value, which crashes the loess fit # fix: replace zero sigma's with lowest non-zero sigma (easy fix without having to update DEqMS code) # ? perhaps it's better if DEqMS would fit against eBayes' posterior estimates of sigma instead ? - fit$sigma[fit$sigma <= 0] = min(fit$sigma[fit$sigma > 0]) + fit$sigma[!is.finite(fit$sigma) | fit$sigma <= 0] = min(fit$sigma[is.finite(fit$sigma) & fit$sigma > 0]) ## peptide-per-protein counts # note that the ebayes fit results are aligned with the intensity-value-matrix we supplied upstream, # which in turn is aligned with the metadata in the ExpressionSet object. So we can just use its npep property as-is - fit$count = Biobase::fData(eset_proteins)$npep + fit$count = Biobase::fData(eset)$npep # apply DEqMS, then overwrite the fit object with DEqMS's fit = suppressWarnings(DEqMS::spectraCounteBayes(fit)) @@ -154,188 +181,105 @@ de_deqms = function(eset_proteins, input_intensities_are_log2 = TRUE, random_var } ## analogous to our de_ebayes() implementation, extract results from the fit object but grab the DEqMS specific output columns where available (reference; DEqMS::outputResult() ) - coef_col = 2 # !! sort.by="none" keeps the output table aligned with input matrix - result = suppressMessages(limma::topTable(fit, number = length(eset_proteins__protein_id), coef = coef_col, adjust.method = "fdr", sort.by = "none", confint = TRUE)) - stopifnot(rownames(result) == rownames(fit)) # because we didn't sort, tables are aligned - result$tstatistic = fit$sca.t[,coef_col] - result$pvalue = fit$sca.p[,coef_col] + result = suppressMessages(limma::topTable(fit, number = length(fit$count), coef = model_matrix_result_prop, adjust.method = "fdr", sort.by = "none", confint = TRUE)) + # indexing by name might fail in the deqms output table for some model matrices ! 
+ if(model_matrix_result_prop %in% colnames(fit$sca.p)) { + result$pvalue = fit$sca.p[,model_matrix_result_prop] + result$tstatistic = fit$sca.t[,model_matrix_result_prop] + } else { + result$pvalue = fit$sca.p[,match(model_matrix_result_prop, colnames(fit$coefficients))] # fit$sca.p might be unnamed -> use "standard" property of similar dimensions to find index + result$tstatistic = fit$sca.t[,match(model_matrix_result_prop, colnames(fit$coefficients))] + } result$qvalue = p.adjust(result$pvalue, method = "fdr") ## fit$coefficients and fit$stdev.unscaled contain the intercept, remove from ES and SE computation # eBayes effect size: Cohen's d in limma, according to Gordon Smyth https://support.bioconductor.org/p/71747/#71781 - result$effectsize = fit$coefficients[,coef_col] / sqrt(fit$sca.postvar) + result$effectsize = fit$coefficients[,model_matrix_result_prop] / sqrt(fit$sca.postvar) # eBayes standard error, according to Gordon Smyth https://support.bioconductor.org/p/70175/ - result$standarderror = sqrt(fit$sca.postvar) * fit$stdev.unscaled[,coef_col] + result$standarderror = sqrt(fit$sca.postvar) * fit$stdev.unscaled[,model_matrix_result_prop] result$standarddeviation = sqrt(fit$sca.postvar) ## create a result tibble that contains all columns required for downstream compatability with this pipeline; protein_id, pvalue, qvalue, foldchange.log2, dea_algorithm result = as_tibble(result) %>% - mutate(protein_id = eset_proteins__protein_id) %>% + mutate(protein_id = rownames(fit)) %>% select(protein_id, pvalue, qvalue, foldchange.log2 = logFC, effectsize, tstatistic, standarddeviation, standarderror) %>% add_column(dea_algorithm = "deqms") - #### DEqMS unique code, everything else in this function is analogous to de_ebayes implementation #### - - # note; we added condition to random variables above, strip it from result table for consistency within this R package ('condition' is always assumed to be a regression variable) - if(length(random_variables) > 1) { # ! test larger than 1 - result = result %>% add_column(contrast_ranvars = paste(setdiff(random_variables, "condition"), collapse = ", ")) - } append_log_timestamp("DEqMS", start_time) return(result) } -#' Wrapper function for limma::ebayes() -#' -#' rownames of the data matrix (log2 intensities) must be the protein_id for downstream compatibility -#' -#' @param x log2 transformed protein intensity matrix -#' @param random_variables data.frame where each column describes a random variable (and each row matches a column in x) or a vector assumed to describe 1 random variable, sample group/condition for each column in x. 
Importantly, statistical results are returned only for the first random variable (so in most use-cases the sample group/condition should be described in the first column) -#' -#' @importFrom stats model.matrix -#' @importFrom limma eBayes lmFit -de_ebayes_fit = function(x, random_variables) { - ## validate input; x must be a numeric matrix with rownames - if(!is.matrix(x) || mode(x) != "numeric") { - append_log("x must be a numerical matrix", type = "error") - } - if(length(rownames(x)) == 0) { - append_log("input matrix must have rownames (these are assumed to be protein identifiers)", type = "error") - } - - ## validate input; variables must not contain any NA and align with matrix x - if(any(is.na(random_variables))) { - append_log("random_variables must not contain any NA values", type = "error") - } - if(is.matrix(random_variables)) { - random_variables = data.frame(random_variables, stringsAsFactors = F) - } - - # if the supplied matrix/data.frame of random variables has any rownames, these must align with colnames of x - if(length(rownames(random_variables)) == ncol(x) && !( - all( rownames(random_variables) == as.character(1:nrow(random_variables)) ) || - all( rownames(random_variables) == colnames(x) ) - )) { - append_log("input matrix x and random_variables seem misaligned; the column names of x versus rownames of variables do not match", type = "error") - } - - if(is.vector(random_variables) || is.array(random_variables)) { - random_variables = data.frame(group=random_variables, stringsAsFactors = F) - } - if(!is.data.frame(random_variables)) { - append_log("random_variables must be an array, matrix or data.frame", type = "error") - } - if(nrow(random_variables) != ncol(x)) { - append_log("number of columns in x (samples) must match the metadata (random_variables) provided for regression", type = "error") - } - - - # ensure all random_variables are factors; mutate each column, convert to factor - # simply calling as.factor won't do as it orders factor levels alphabetically, while we want to retain the sorting from input data - random_variables = random_variables %>% mutate_all(function(x) factor(x, levels=unique(x)) ) - - # design matrix; simply use all columns in the user-provided metadata table - contr_design = stats::model.matrix(~ . , data = random_variables) - ## version 1 - # group_by_cols = match(group_by_cols, unique(group_by_cols)) - 1 - # contr_design = stats::model.matrix(~group_by_cols) - - return(suppressWarnings(limma::eBayes(limma::lmFit(x, contr_design)))) - - ### full implementation - # fit = suppressWarnings(limma::eBayes(limma::lmFit(x, contr_design))) - # # !! 
sort.by="none" keeps the output table aligned with input matrix - # # always extract the first regression variable (intercept has index 1, so the column number we need is 2) - # result = suppressMessages(limma::topTable(fit, number = nrow(x), coef = 2, adjust.method = "fdr", sort.by = "none", confint = TRUE)) - # ## fit$coefficients and fit$stdev.unscaled contain the intercept, remove from ES and SE computation - # # eBayes effect size: Cohen's d in limma, according to Gordon Smyth https://support.bioconductor.org/p/71747/#71781 - # # note; - # result$effectsize = fit$coefficients[,2] / sqrt(fit$s2.post) - # # eBayes standard error, according to Gordon Smyth https://support.bioconductor.org/p/70175/ - # result$standarderror = sqrt(fit$s2.post) * fit$stdev.unscaled[,2] - # result$standarddeviation = sqrt(fit$s2.post) - # - # append_log_timestamp("eBayes", start_time) - # return(as_tibble(result) %>% - # mutate(protein_id = rownames(x)) %>% - # select(protein_id, pvalue = P.Value, qvalue = adj.P.Val, foldchange.log2 = logFC, effectsize, tstatistic = t, standarddeviation, standarderror)) -} - - - #' MS-EmpiRe implementation, a wrapper function for msEmpiRe::de.ana() #' #' ref; PMID:31235637 #' ref; https://github.com/zimmerlab/MS-EmpiRe #' -#' input data should NOT be log2 transformed (if it is, make sure to set parameter `input_intensities_are_log2` so we can revert this prior to passing data to MS-EmpiRe) -#' #' note; MS-EmpiRe crashes when all proteins have the same number of peptides (eg; filtering topn=1 or topn=3 and minpep=3) #' #' TODO: proper standarderror computation #' -#' @param eset a Biobase ExpressionSet that contains the peptide intensities. required attributes; fData(): protein_id and pData(): condition -#' @param input_intensities_are_log2 whether the provided intensities are already log2 transformed -#' -#' @importFrom Biobase exprs pData fData -#' @importFrom msEmpiRe de.ana +#' @param eset a Biobase ExpressionSet that contains the peptide log2 intensities +#' @param model_matrix a `stats::model.matrix()` result that is supplied to `limma::lmFit()` +#' @param model_matrix_result_prop the column name in `model_matrix` that should be returned as the resulting coefficient estimated by `eBayes()`. 
In MS-DAP workflows this is typically "condition" +#' @export -de_msempire = function(eset, input_intensities_are_log2 = F) { - if(!("protein_id" %in% colnames(Biobase::fData(eset)))) { - append_log("ExpressionSet fData() must contain column 'protein_id'", type = "error") - } - if(!("condition" %in% colnames(Biobase::pData(eset)))) { - append_log("ExpressionSet pData() must contain columns 'condition'", type = "error") - } +de_msempire = function(eset, model_matrix, model_matrix_result_prop) { + start_time = Sys.time() # issue with MS-EmpiRe we previously reported, including code to reproduce and resolve this; https://github.com/zimmerlab/MS-EmpiRe/issues/10 # bugfix: fix the random seed to ensure MS-EmpiRe results are always the same (given the exact same input data) set.seed(123) - start_time = Sys.time() - # ms-empire expects non-logtransformed data without NA values + # will throw an error if input data is invalid + validate_eset_fitdata(eset, model_matrix, model_matrix_result_prop) + # MS-EmpiRe will crash if there are input rows with all zero values + # ensure there are at least 2 valid data points per condition / group @ model_matrix_result_prop (subset the expressionset otherwise) + eset = subset_protein_eset_for_dea(eset, model_matrix_result_prop, is_peptide = TRUE, method_name = "MS-EmpiRe") + + # importantly, MS-EmpiRe expects the input data matrix to NOT be log2 transformed x = Biobase::exprs(eset) - if (input_intensities_are_log2) { - x = 2^x - } + x = 2^x x[!is.finite(x)] = 0 Biobase::exprs(eset) = x - # protein identifiers are expected in this column + # protein identifiers are expected in column "prot.id" Biobase::fData(eset)["prot.id"] = Biobase::fData(eset)[, "protein_id"] + + # experimental condition is expected in column "condition" + # we here use the values from the design matrix @ desired output column + Biobase::pData(eset)["condition"] = unname(model_matrix[,model_matrix_result_prop]) + # call msempire main function while silencing all of its output - capture.output(result <- suppressWarnings(suppressMessages(msEmpiRe::de.ana(eset)))) + tmp = capture.output(result <- suppressWarnings(suppressMessages(msEmpiRe::de.ana(eset)))) append_log_timestamp("MS-EmpiRe", start_time) - return( - tibble( - protein_id = as.character(result$prot.id), # enforce type to be robust against upstream R package changes (e.g. prot.id as a factor) - pvalue = as.numeric(result$p.val), - qvalue = as.numeric(result$p.adj), - foldchange.log2 = as.numeric(result$log2FC), - # update; - # In MS-EmpiRe, 'prot.sd' is the standard deviation of 'prot.s' (which underwent shrinkage), - # not of the protein foldchange. So here we compute the effectsize based on prot.s/prot.sd analogous to - # MS-EmpiRe's protein p-value computation. - # - # However, there seems to be a bug in MS-EmpiRe where the sign of prot.s does not always agree with log2FC !
- # So this simple test fails on several datasets; - # stopifnot( (result$log2FC >= 0) == (result$prot.s / result$prot.sd >= 0) ) - # - # As a workaround, we compute absolute effectsize and then apply the sign of log2FC - effectsize = abs(as.numeric(result$prot.s) / as.numeric(result$prot.sd)) * ifelse(foldchange.log2 < 0, -1, 1), - #previous# effectsize = as.numeric(result$log2FC) / as.numeric(result$prot.sd), - tstatistic = NA, - # approximate standard deviation for the log2FC from the standardized effectsize - standarddeviation = abs(foldchange.log2 / effectsize), - #previous# standarddeviation = as.numeric(result$prot.sd), - standarderror = NA, - dea_algorithm = "msempire" - ) %>% - # if either log2FC or p.val is invalid, remove the protein from results - filter(is.finite(foldchange.log2) & is.finite(pvalue)) - ) + tibble( + protein_id = as.character(result$prot.id), # enforce type to be robust against upstream R package changes (e.g. prot.id as a factor) + pvalue = as.numeric(result$p.val), + qvalue = as.numeric(result$p.adj), + foldchange.log2 = as.numeric(result$log2FC), + # update; + # In MS-EmpiRe, 'prot.sd' is the standard deviation of 'prot.s' (which underwent shrinkage), + # not of the protein foldchange. So here we compute the effectsize based on prot.s/prot.sd analogous to + # MS-EmpiRe's protein p-value computation. + # + # However, there seems to be a bug in MS-EmpiRe where the sign of prot.s does not always agree with log2FC ! + # So this simple test fails on several datasets; + # stopifnot( (result$log2FC >= 0) == (result$prot.s / result$prot.sd >= 0) ) + # + # As a workaround, we compute absolute effectsize and then apply the sign of log2FC + effectsize = abs(as.numeric(result$prot.s) / as.numeric(result$prot.sd)) * ifelse(foldchange.log2 < 0, -1, 1), + #previous# effectsize = as.numeric(result$log2FC) / as.numeric(result$prot.sd), + tstatistic = NA, + # approximate standard deviation for the log2FC from the standardized effectsize + standarddeviation = abs(foldchange.log2 / effectsize), + #previous# standarddeviation = as.numeric(result$prot.sd), + standarderror = NA, + dea_algorithm = "msempire" + ) %>% + # if either log2FC or p.val is invalid, remove the protein from results + filter(is.finite(foldchange.log2) & is.finite(pvalue)) } @@ -348,47 +292,28 @@ de_msempire = function(eset, input_intensities_are_log2 = F) { #' ref; PMID:26566788 PMID:32321741 #' ref; https://github.com/statOmics/msqrob #' -#' @param eset a Biobase ExpressionSet that contains the peptide intensities. required attributes; fData(): protein_id and pData(): condition -#' @param eset_proteins only required if `log2fc_without_shrinkage == TRUE` -#' @param use_peptide_model if true, apply msqrob. if false, apply msqrobsum -#' @param input_intensities_are_log2 whether the provided intensities are already log2 transformed -#' @param protein_rollup_robust for msqrobsum analyses, whether to use 'robust' protein rollup (TRUE) or traditional peptide intensity summation (FALSE) -#' @param random_variables a vector of column names in your sample metadata table that are added as additional(!) regression terms -#' @param log2fc_without_shrinkage boolean parameter, if FALSE (default) returns msqrob estimated log2 foldchanges. 
if TRUE, uses `protein_foldchange_from_ebayes()` instead (affecting log2fc and effectsize results) -#' -#' @importFrom Biobase exprs pData fData -#' @importFrom MSnbase as.MSnSet.ExpressionSet combineFeatures +#' @param eset a Biobase ExpressionSet that contains the peptide log2 intensities +#' @param model_matrix a `stats::model.matrix()` result that is supplied to `limma::lmFit()` +#' @param model_matrix_result_prop the column name in `model_matrix` that should be returned as the resulting coefficient estimated by `eBayes()`. In MS-DAP workflows this is typically "condition" +#' @param use_peptide_model if `TRUE`, apply msqrob. if `FALSE`, apply msqrobsum +#' @param random_variables optionally, an array of column names in the sample metadata table that should be used as additional regression variables #' @export -de_msqrobsum_msqrob = function(eset, eset_proteins = NULL, use_peptide_model = T, input_intensities_are_log2 = T, protein_rollup_robust = T, random_variables = NULL, log2fc_without_shrinkage = FALSE) { +de_msqrobsum_msqrob = function(eset, model_matrix, model_matrix_result_prop, use_peptide_model, random_variables = NULL) { start_time = Sys.time() - if(log2fc_without_shrinkage && is.null(eset_proteins)) stop("if log2fc_without_shrinkage is set, provide protein eset") - - # strip forbidden random variables and compose updated formula - random_variables = setdiff(random_variables, c("peptide_id", "protein_id", "sample_id", "condition")) - if(!("protein_id" %in% colnames(Biobase::fData(eset)))) { - append_log("ExpressionSet fData() must contain column 'protein_id'", type = "error") - } - if(use_peptide_model && !("peptide_id" %in% colnames(Biobase::fData(eset)))) { - append_log("ExpressionSet fData() must contain column 'peptide_id' (when using peptide-level msqrob model)", type = "error") - } - if(!all(c("condition", "sample_id") %in% colnames(Biobase::pData(eset)))) { - append_log("ExpressionSet pData() must contain columns 'condition' and 'sample_id'", type = "error") - } - if(length(random_variables) > 0 && !all(random_variables %in% colnames(Biobase::pData(eset)))) { - append_log("ExpressionSet pData() must contain columns matching all random_variables", type = "error") + # will throw an error if input data is invalid + validate_eset_fitdata(eset, model_matrix, model_matrix_result_prop) + if(length(use_peptide_model) != 1 || !use_peptide_model %in% c(TRUE, FALSE)) { + append_log("use_peptide_model parameter must be either TRUE or FALSE", type = "error") } - # just to be sure, fix random seed - set.seed(123) + algorithm_name = ifelse(use_peptide_model, "msqrob", "msqrobsum") + # ensure there are at least 2 valid data points per condition / group @ model_matrix_result_prop (subset the expressionset otherwise) + eset = subset_protein_eset_for_dea(eset, model_matrix_result_prop, is_peptide = TRUE, method_name = algorithm_name) - # transform to log2 if input data is non-log - if (!input_intensities_are_log2) { - x = log2(Biobase::exprs(eset)) - x[!is.finite(x)] = NA - Biobase::exprs(eset) = x - } + # just to be sure, fix random seed + set.seed(123) #### below text quoted from MSqRobsum manual @ https://github.com/statOmics/MSqRobSum/blob/master/vignettes/msqrobsum.Rmd @@ -399,37 +324,39 @@ de_msqrobsum_msqrob = function(eset, eset_proteins = NULL, use_peptide_model = T # However some proteins will only have intensities from 1 peptide and the model fitting wil fail if we try to use the model above. For these proteins we should use te reduced model `expression ~ (1|condition)`. 
`msqrobsum()` # The `formulas ` parameter accepts a vector of formulas. Model fitting with the first model will be attempted first but if that fails it tries the second model and so on. - algorithm_name = "msqrob" msnset = MSnbase::as.MSnSet.ExpressionSet(eset) if (use_peptide_model) { - form_string = c("expression ~ (1 | condition) + (1 | sample_id) + (1 | peptide_id)", "expression ~ (1 | condition)") - # form_string = "expression ~ (1 | peptide_id) + (1 | sample_id) + (1 | condition)" + form_string = c(sprintf("expression ~ (1 | %s) + (1 | sample_id) + (1 | peptide_id)", model_matrix_result_prop), sprintf("expression ~ (1 | %s)", model_matrix_result_prop)) } else { - form_string = "expression ~ (1 | condition)" - algorithm_name = "msqrobsum" + form_string = sprintf("expression ~ (1 | %s)", model_matrix_result_prop) } + # strip forbidden random variables + random_variables = setdiff(random_variables, c("peptide_id", "protein_id", "sample_id", model_matrix_result_prop)) + # compose updated formula if the user provided additional random variables if(length(random_variables) > 0) { form_string = sapply(form_string, function(x) paste(c(x, sprintf("(1 | %s)", random_variables)), collapse=" + "), USE.NAMES = FALSE) } # always add y~(1|condition) without any optional random_variables as a last-resort model if all other model specifications fail (due to lack of data) -->> then convert from string to formula/expression - form_string = unique(c(form_string, "expression ~ (1 | condition)")) + form_string = unique(c(form_string, sprintf("expression ~ (1 | %s)", model_matrix_result_prop))) form_eval = sapply(form_string, function(x) eval(parse(text=x)), USE.NAMES = FALSE) append_log(sprintf("%s linear regression formula%s; %s", algorithm_name, ifelse(length(form_string)>1, "s (these are prioritized. 
eg; if a model fit fails due to lack of data, the next formula is used)", ""), paste(form_string, collapse = " , ")), type = "info") + # MSqRob model as re-implemented in msqrobsum package, by original authors, and updated by us (only difference is computational performance) if (use_peptide_model) { - result = suppressWarnings(msqrobsum(data = msnset, formulas = form_eval, group_vars = "protein_id", contrasts = "condition", mode = "msqrob")) - # result = suppressWarnings(msqrobsum(data = msnset, formulas = c(eval(parse(text=form_string)), expression ~ (1 | condition)), group_vars = "protein_id", contrasts = "condition", mode = "msqrob")) + result = suppressWarnings(msqrobsum(data = msnset, formulas = form_eval, group_vars = "protein_id", contrasts = model_matrix_result_prop, mode = "msqrob")) + # result = suppressWarnings(msqrobsum(data = msnset, formulas = form_eval, group_vars = "protein_id", contrasts = "condition", mode = "msqrob")) # our version 1, without user-specified random variables; form = c(expression ~ (1 | condition) + (1 | sample_id) + (1 | peptide_id), expression ~ (1 | condition)) } else { ### first use MSqRobSum to rollup to protein level, 'robust' approach appears to be an improvement over the traditional 'sum', then apply msqrob # line 273 @ https://github.com/statOmics/MSqRobSum/blob/97e22fddb9d6f1d3c29aafbae28c382148b9471d/vignettes/msqrobsum.Rmd#L273 - protset = suppressWarnings(suppressMessages(MSnbase::combineFeatures(msnset, fun = ifelse(protein_rollup_robust, "robust", "sum"), groupBy = Biobase::fData(msnset)$protein_id))) + protset = suppressWarnings(suppressMessages(MSnbase::combineFeatures(msnset, fun = "robust", groupBy = Biobase::fData(msnset)$protein_id))) # line 313 @ https://github.com/statOmics/MSqRobSum/blob/97e22fddb9d6f1d3c29aafbae28c382148b9471d/vignettes/msqrobsum.Rmd#L313 - result = suppressWarnings(msqrobsum(data = protset, formulas = form_eval, group_vars = "protein_id", contrasts = "condition", mode = "msqrobsum")) + result = suppressWarnings(msqrobsum(data = protset, formulas = form_eval, group_vars = "protein_id", contrasts = model_matrix_result_prop, mode = "msqrobsum")) + # result = suppressWarnings(msqrobsum(data = protset, formulas = form_eval, group_vars = "protein_id", contrasts = "condition", mode = "msqrobsum")) ## our version 1, without user-specified random variables; #result = suppressWarnings(msqrobsum(data = protset, formulas = expression ~ (1 | condition), group_vars = "protein_id", contrasts = "condition", mode = "msqrobsum")) } @@ -438,15 +365,6 @@ de_msqrobsum_msqrob = function(eset, eset_proteins = NULL, use_peptide_model = T dplyr::select(protein_id, contrasts) %>% tidyr::unnest(cols = contrasts) - # test: optional log2fc estimates without shrinkage - if(log2fc_without_shrinkage) { - df_fc = protein_foldchange_from_ebayes(eset_proteins, input_intensities_are_log2 = TRUE, random_variables = random_variables) - i = match(as.character(result_unpacked$protein_id), df_fc$protein_id) - stopifnot(!is.na(i)) - result_unpacked$logFC = df_fc$foldchange.log2[i] - algorithm_name = paste0(algorithm_name, "_fc") - } - result_unpacked$sigma_post = result_unpacked$se / result_unpacked$sigma_contrast result_unpacked$effectsize = result_unpacked$logFC / result_unpacked$sigma_post @@ -456,48 +374,196 @@ de_msqrobsum_msqrob = function(eset, eset_proteins = NULL, use_peptide_model = T dea_algorithm = algorithm_name) %>% select(protein_id, pvalue, qvalue, foldchange.log2 = logFC, effectsize, tstatistic = t, standarddeviation = sigma_post, 
standarderror = se, dea_algorithm) - if(length(random_variables) > 0) { - result = result %>% add_column(contrast_ranvars = paste(random_variables, collapse = ", ")) - } append_log_timestamp(algorithm_name, start_time) return(result) } -#' Compute protein log2 foldchanges from protein-level eBayes (i.e. simple, no shrinkage) +#' Perform linear regression with limma #' -#' implementation analogous to `de_ebayes()` +#' @description +#' Please refer to the vignette on custom limma models at the MS-DAP GitHub repository for a walk-through with examples. #' -#' @param eset a Biobase ExpressionSet that contains the protein intensities. required attributes; fData(): protein_id and pData(): condition -#' @param input_intensities_are_log2 whether the provided intensities are already log2 transformed -#' @param random_variables a vector of column names in your sample metadata table that are added as additional(!) regression terms -#' @return data.frame with columns protein_id and foldchange.log2 -protein_foldchange_from_ebayes = function(eset, input_intensities_are_log2 = TRUE, random_variables = NULL) { - if(length(random_variables) > 0 && (!is.vector(random_variables) || - any(is.na(random_variables) | !is.character(random_variables)))) { - append_log(paste("random_variables must either be NULL or character vector", - paste(random_variables, collapse = ", ")), type = "error") +#' @param x result from `get_protein_matrix(... , include_npep = TRUE)`. Alternatively, a protein log2 intensity matrix (in this case, also provide the `npep` parameter if you want to apply DEqMS) +#' @param npep optionally, the number of peptides-per-protein for each row in input protein matrix. Only needed when providing a matrix for parameter `x` (i.e. you can leave this NULL when `x` is output from `get_protein_matrix(... , include_npep = TRUE)`) +#' @param model_matrix result from `stats::model.matrix()` +#' @param contrasts contrasts defined with `limma::makeContrasts()` that should be extracted from the linear regression model +#' @param limma_block_variable option passed to `limma::lmFit()` to fit block design. To skip (default), set NULL. Otherwise, this should be a character vector of the same length as nrow(model_matrix) so it describes a block/group per respective sample in the model/design matrix +#' @param ebayes_trend option passed to `limma::eBayes()`. Defaults to `FALSE` (same as limma default) +#' @param ebayes_robust option passed to `limma::eBayes()`. Defaults to `FALSE` (same as limma default) +#' @param deqms apply DEqMS? Defaults to `TRUE`. If `FALSE`, returns `limma::eBayes()` result as-is +#' @param return_table if `TRUE`, the default, returns a table with protein_id, log2fc, pvalue, adjusted pvalue, contrast (i.e. this is the output from MS-DAP function `limma_fit_extract_stats()`). 
If `FALSE`, returns the limma fit object as-is +#' @export +limma_wrapper = function( + x = protein_data, + npep = NULL, + model_matrix, + contrasts, + limma_block_variable = NULL, + ebayes_trend = FALSE, + ebayes_robust = FALSE, + deqms = TRUE, + return_table = TRUE +) { + + mat = NULL + + ### input validation + # test that we got the proper data formats (matrix and vector) + if(!is.matrix(x)) { + if(!(is.list(x) && all(c("matrix", "npep") %in% names(x)) && is.matrix(x$matrix) && + (is.vector(x$npep) || is.array(x$npep)) && is.numeric(x$npep) && length(x$npep) == nrow(x$matrix))) { + append_log("x parameter should either be a matrix, or a list that contains a 'matrix' and 'npep' parameter (where npep is an integer vector of same length as nrow(x$matrix)). Typically this is the output from MS-DAP function: get_protein_matrix(... , include_npep = TRUE)", type = "error") + } + if(!is.null(npep)) { + append_log("when parameter x is a list (i.e. input from function get_protein_matrix(... , include_npep = TRUE)), leave the npep parameter empty", type = "error") + } + + mat = x$matrix + npep = as.vector(x$npep) + } else { + if(!((is.vector(npep) || is.array(npep)) && is.numeric(npep) && length(npep) == nrow(x))) { + append_log("when parameter x is a matrix, the npep parameter should be an integer vector of same length as nrow(x)", type = "error") + } + mat = x + npep = as.vector(npep) } - if(!input_intensities_are_log2) { - x = log2(Biobase::exprs(eset)) - x[!is.finite(x)] = NA - Biobase::exprs(eset) = x + + # test that the matrix is numeric and that npep only contains finite values + if(mode(mat) != "numeric") { + append_log("matrix must be numeric (you can check with the mode() function)", type = "error") } - random_variables = unique(c("condition", random_variables)) - df_metadata = Biobase::pData(eset) - if(!all(random_variables %in% colnames(df_metadata))) { - append_log(paste("eset ExpressionSet pData() must contain columns 'condition' and all additional random variables requested in `random_variables` parameter. missing:", - paste(setdiff(random_variables, colnames(df_metadata)), - collapse = ", ")), type = "error") + if(!all(is.finite(npep) & npep > 0)) { + append_log("npep must only contain finite integers, larger than zero", type = "error") } - x = Biobase::exprs(eset) - df_metadata = df_metadata[, random_variables, drop = F] - # re-use our limma wrapper to apply eBayes() - fit = de_ebayes_fit(x, random_variables = df_metadata) + # finally, check simple parameters + if(!is.matrix(model_matrix)) { + append_log("parameter model_matrix must be a matrix, i.e. output from stats::model.matrix()", type = "error") + } + if(!is.matrix(contrasts)) { + append_log("parameter contrasts must be a matrix, i.e. output from limma::makeContrasts()", type = "error") + } + if(!is.null(limma_block_variable) && ( + anyNA(limma_block_variable) || !is.character(limma_block_variable) || + any(limma_block_variable == "") || length(limma_block_variable) != nrow(model_matrix)) + ) { + append_log("parameter limma_block_variable must be a character vector of the same length as nrow(model_matrix) so it describes a block/group per respective sample in the model/design matrix. 
Empty strings and NA not allowed", type = "error") + } + if(length(ebayes_trend) != 1 || !ebayes_trend %in% c(TRUE, FALSE)) { + append_log("parameter ebayes_trend must be either TRUE or FALSE", type = "error") + } + if(length(ebayes_robust) != 1 || !ebayes_robust %in% c(TRUE, FALSE)) { + append_log("parameter ebayes_robust must be either TRUE or FALSE", type = "error") + } + if(length(deqms) != 1 || !deqms %in% c(TRUE, FALSE)) { + append_log("parameter deqms must be either TRUE or FALSE", type = "error") + } + if(length(return_table) != 1 || !return_table %in% c(TRUE, FALSE)) { + append_log("parameter return_table must be either TRUE or FALSE", type = "error") + } + ### input validation + + + ### prepare data + # enforce integer type. use ceiling so 0.1 becomes 1 (i.e. we never have zeros) + npep = as.integer(ceiling(npep)) + + + ### apply limma + # adapted from limma documentation; https://bioconductor.org/packages/release/bioc/vignettes/limma/inst/doc/usersguide.pdf , section 9.7 "Multi-level Experiments" + fit = NULL + if(is.null(limma_block_variable)) { + fit = limma::lmFit(mat, model_matrix) + } else { + corfit = limma::duplicateCorrelation(mat, model_matrix, block = limma_block_variable) + append_log(sprintf('applying a limma block design: %.3f correlation between samples within the same "block"\n', corfit$consensus), type = "info") + fit = limma::lmFit(mat, model_matrix, block = limma_block_variable, correlation = corfit$consensus) + } + + fit2 = limma::contrasts.fit(fit, contrasts) + fit2 = limma::eBayes(fit2, trend = ebayes_trend, robust = ebayes_robust) + - ## analogous to our de_ebayes() implementation, extract results from the fit object - coef_col = 2 - data.frame(protein_id = rownames(x), foldchange.log2 = fit$coefficients[,coef_col]) + ### apply DEqMS, analogous to msdap::de_deqms() + if(deqms) { + # bugfix for DEqMS::spectraCounteBayes(), see + fit2$sigma[!is.finite(fit2$sigma) | fit2$sigma <= 0] = min(fit2$sigma[is.finite(fit2$sigma) & fit2$sigma > 0]) + # add peptide-per-protein counts to the limma fit object, as required for DEqMS + fit2$count = npep + fit2 = suppressWarnings(DEqMS::spectraCounteBayes(fit2)) + append_log("applying limma eBayes(), followed by DEqMS (to adjust protein confidences according to their respective number of peptides)", type = "info") + } else { + append_log("returning limma eBayes() results as-is (i.e. no DEqMS post-hoc correction)", type = "info") + } + + + if(return_table) { + return(limma_fit_extract_stats(fit2)) + } else { + return(fit2) + } +} + + + +#' extract limma fit results +#' +#' @param fit result from `limma::eBayes()` +#' @param contrast_name name of a specific contrast(s) to extract. Default value, `NULL`, will return results from all contrasts +limma_fit_extract_stats = function(fit, contrast_name = NULL) { + all_contrasts = setdiff(colnames(fit$coefficients), c("intercept", "Intercept", "(intercept)", "(Intercept)")) + + # naive check for limma result + if(!is.list(fit) || !all(c("coefficients", "sigma", "s2.post", "stdev.unscaled") %in% names(fit)) ) { + append_log("fit parameter must be the result from limma::eBayes() (e.g. obtained from MS-DAP function limma_wrapper() )", type = "error") + } + if(is.null(contrast_name)) { + contrast_name = all_contrasts + } else { + if(anyNA(all_contrasts) || !is.character(contrast_name) || any(!contrast_name %in% all_contrasts)) { + append_log(paste("some values in contrast_name are not found in the provided fit parameter.
Available contrasts in fit:", paste(all_contrasts, collapse = ",")), type = "error") + } + } + + result = NULL + for(contr in contrast_name) { + x = tibble::as_tibble(limma::topTable(fit, number = Inf, coef = contr, adjust.method = "fdr", sort.by = "none", confint = TRUE)) + x$protein_id = rownames(fit) + x$contrast = contr + x$foldchange.log2 = x$logFC + + # test if the deqms R package was applied to this limma::eBayes() fit/result + is_deqms = "sca.p" %in% names(fit) + + if(is_deqms) { + ## DEqMS code analogous to msdap:::de_deqms() + # indexing by name might fail in the deqms output table for some model matrices ! + if(contr %in% colnames(fit$sca.p)) { + x$pvalue = fit$sca.p[,contr] + } else { + x$pvalue = fit$sca.p[,match(contr, colnames(fit$coefficients))] # fit$sca.p might be unnamed -> use "standard" property of similar dimensions to find index + } + x$qvalue = p.adjust(x$pvalue, method = "fdr") # in this loop, we subsetted a specific contrast so can safely call p.adjust() + # analogous to standard ebayes fit @ below else block, but note that we use "sca" parameters added by DEqMS ! + x$effectsize = fit$coefficients[,contr] / sqrt(fit$sca.postvar) + x$standarderror = sqrt(fit$sca.postvar) * fit$stdev.unscaled[,contr] + x$standarddeviation = sqrt(fit$sca.postvar) + } else { + ## eBayes code analogous to msdap:::de_ebayes() + x$pvalue = x$P.Value + x$qvalue = x$adj.P.Val + x$effectsize = fit$coefficients[,contr] / sqrt(fit$s2.post) + x$standarderror = sqrt(fit$s2.post) * fit$stdev.unscaled[,contr] + x$standarddeviation = sqrt(fit$s2.post) + } + + result = bind_rows( + result, + x %>% select(protein_id, contrast, foldchange.log2, effectsize, standarddeviation, standarderror, pvalue, qvalue) %>% mutate_all(unname) + ) + } + + return(result) } diff --git a/R/stats_differential_detect.R b/R/stats_differential_detect.R index 9a437e6..e5a0398 100644 --- a/R/stats_differential_detect.R +++ b/R/stats_differential_detect.R @@ -130,8 +130,7 @@ differential_detect = function(dataset, min_peptides_observed = 1L, min_samples_ append_log("you have to merge fractionated samples prior to differential detection. Typically, you want to simply use the analysis_quickstart() to process the dataset first. 
Advanced users may call merge_fractionated_samples(dataset) manually.", type = "error") } - column_contrasts = dataset_contrasts(dataset) - if (length(column_contrasts) == 0) { + if(!is.list(dataset$contrasts) || length(dataset$contrasts) == 0) { append_log("no contrasts have been defined, differential detection analysis is cancelled", type = "warning") return(dataset) } @@ -205,25 +204,20 @@ differential_detect = function(dataset, min_peptides_observed = 1L, min_samples_ ### compare counts within each contrast tib_results = NULL - for(col_contr in column_contrasts) { # col_contr = column_contrasts[1] - ### sample_id per condition ('side of each contrast') - contr_samples = dataset$samples %>% - select(sample_id, exclude, condition=!!col_contr) %>% - filter(exclude == FALSE & condition != 0) %>% - arrange(condition) + for(contr in dataset$contrasts) { # set of sample_id for condition 1 and 2 - sid1 = contr_samples %>% filter(condition == 1) %>% pull(sample_id) - sid2 = contr_samples %>% filter(condition == 2) %>% pull(sample_id) + sid1 = contr$sampleid_condition1 + sid2 = contr$sampleid_condition2 # compute summary stats and z-score for each protein_id z_detect = summarize_proteins(count_detect, count_detect__scaled, sid1, sid2, min_peptides_observed, min_samples_observed, min_fraction_observed, rescale_counts_per_sample) # append to results - tib_results = bind_rows(tib_results, z_detect %>% mutate(contrast = col_contr, type = "detect")) + tib_results = bind_rows(tib_results, z_detect %>% mutate(contrast = contr$label, type = "detect")) # analogous if(compute_quant_counts) { z_quant = summarize_proteins(count_quant, count_quant__scaled, sid1, sid2, min_peptides_observed, min_samples_observed, min_fraction_observed, rescale_counts_per_sample) - tib_results = bind_rows(tib_results, z_quant %>% mutate(contrast = col_contr, type = "quant")) + tib_results = bind_rows(tib_results, z_quant %>% mutate(contrast = contr$label, type = "quant")) } } diff --git a/R/stats_summary.R b/R/stats_summary.R index d09d95a..a4cb21c 100644 --- a/R/stats_summary.R +++ b/R/stats_summary.R @@ -30,8 +30,7 @@ summarise_stats = function(dataset, return_dea = TRUE, return_diffdetect = FALSE append_log("have to set at least one of 'return_dea' and 'return_diffdetect' parameters to TRUE", type = "error") } - all_contrasts = dataset_contrasts(dataset) - if(length(all_contrasts) == 0) { + if(!is.list(dataset$contrasts) || length(dataset$contrasts) == 0) { append_log("summarise_stats(); no contrasts in the dataset, returning empty result", type = "warning") return(NULL) } @@ -51,12 +50,12 @@ summarise_stats = function(dataset, return_dea = TRUE, return_diffdetect = FALSE result = list() for(iter_algo_de in all_algo_dea) { - for(iter_contr in all_contrasts) { + for(contr in dataset$contrasts) { tmp = summarise_stats__for_contrast( dataset, return_dea = return_dea, return_diffdetect = return_diffdetect, - contr = iter_contr, + contr = contr$label, dea_algorithm = iter_algo_de, dea_logfc_as_effectsize = dea_logfc_as_effectsize, diffdetect_zscore_threshold = diffdetect_zscore_threshold, diff --git a/README.Rmd b/README.Rmd index 4b9b217..5606e15 100644 --- a/README.Rmd +++ b/README.Rmd @@ -85,10 +85,11 @@ write_template_for_sample_metadata(dataset, "sample_metadata.xlsx") dataset = import_sample_metadata(dataset, filename = "sample_metadata.xlsx") # 5) Optionally, describe a statistical contrast; in this example we compare sample groups "WT" and "KO". 
-# - You should use exact same labels as "group" column in sample metadata table. -# - If you don't want to do stats, simply remove or comment this line (e.g. just look at QC report, or maybe your dataset has 1 experimental group only). +# - you should use exact same labels as "group" column in sample metadata table. +# - if you don't want to do stats, simply remove or comment this line (e.g. just look at QC report, or maybe your dataset has only 1 experimental group). # - example for multiple contrasts; dataset = setup_contrasts(dataset, contrast_list = list( c("control", "condition_a"), c("control", "condition_b") ) ) -# - example for adding random variables to eBayes/DEqMS/MSqRob regressions to i.e. counter batch effects (note; these variables must be column names present in sample metadata table. double-check with; print(dataset$samples,n=Inf)): dataset = setup_contrasts(dataset, contrast_list = list( c("WT","KO") ), random_variables = c("induction", "batch") ) +# - alternatively, use the more flexible function add_contrast() (requires MS-DAP version 1.2 or newer) +# - check the extensive set of examples with R command: ?add_contrast dataset = setup_contrasts(dataset, contrast_list = list( c("WT","KO") ) ) # 6) Main function that runs the entire pipeline @@ -218,13 +219,13 @@ Bioinformatic analyses beyond the typical MS-DAP workflow are described in the f - [bioinformatics: differential expression analysis (DEA)](doc/differential_expression_analysis.md) - [bioinformatics: differential detection](doc/differential_detection.md) - [bioinformatics: plugin custom normalization or DEA](doc/custom_norm_dea.md) +- [bioinformatics: advanced statistical models with limma](doc/custom_limma.md) ## Roadmap -- advanced experimental designs for eBayes/DEqMS (i.e. more flexibility in creating design matrices for limma) - implement various imputation approaches - plugins for more statistical methods - e.g. existing methods / new tools currently in development in other labs (write plugin to help beta-test and benchmark) diff --git a/README.md b/README.md index a9bd656..c053ed6 100644 --- a/README.md +++ b/README.md @@ -102,10 +102,11 @@ write_template_for_sample_metadata(dataset, "sample_metadata.xlsx") dataset = import_sample_metadata(dataset, filename = "sample_metadata.xlsx") # 5) Optionally, describe a statistical contrast; in this example we compare sample groups "WT" and "KO". -# - You should use exact same labels as "group" column in sample metadata table. -# - If you don't want to do stats, simply remove or comment this line (e.g. just look at QC report, or maybe your dataset has 1 experimental group only). +# - you should use exact same labels as "group" column in sample metadata table. +# - if you don't want to do stats, simply remove or comment this line (e.g. just look at QC report, or maybe your dataset has only 1 experimental group). +# - example for multiple contrasts; dataset = setup_contrasts(dataset, contrast_list = list( c("control", "condition_a"), c("control", "condition_b") ) ) -# - example for adding random variables to eBayes/DEqMS/MSqRob regressions to i.e. counter batch effects (note; these variables must be column names present in sample metadata table.
double-check with; print(dataset$samples,n=Inf)): dataset = setup_contrasts(dataset, contrast_list = list( c("WT","KO") ), random_variables = c("induction", "batch") ) +# - alternatively, use the more flexible function add_contrast() (requires MS-DAP version 1.2 or newer) +# - check the extensive set of examples with R command: ?add_contrast dataset = setup_contrasts(dataset, contrast_list = list( c("WT","KO") ) ) # 6) Main function that runs the entire pipeline @@ -265,11 +266,11 @@ Differential Expression Analysis (DEA). detection](doc/differential_detection.md) - [bioinformatics: plugin custom normalization or DEA](doc/custom_norm_dea.md) +- [bioinformatics: advanced statistical models with + limma](doc/custom_limma.md) ## Roadmap -- advanced experimental designs for eBayes/DEqMS (i.e. more flexibility - in creating design matrices for limma) - implement various imputation approaches - plugins for more statistical methods - e.g. existing methods / new tools currently in development in other diff --git a/doc/custom_limma.Rmd b/doc/custom_limma.Rmd new file mode 100644 index 0000000..180ea8f --- /dev/null +++ b/doc/custom_limma.Rmd @@ -0,0 +1,273 @@ +--- +output: + rmarkdown::github_document: + html_preview: true + toc: true +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + fig.path = "images/customlimma-", + comment = "#>" +) +``` + +```{r setup, include = FALSE} +devtools::load_all() + +# ! change to some temporary working directory on your computer ! +setwd("C:/temp/") +``` + + + +This vignette demonstrates how the MS-DAP R package can help you perform differential expression analysis (DEA) with custom linear regression models. _Note that this requires MS-DAP version 1.2 or later._ + +For all typical case _vs_ control analyses (A/B testing), you can use the default MS-DAP workflow as described on the main page. +However, for some studies we might have to deal with repeated measurement models (by creating a "block design") or create linear regression models that feature interactions. +We here show how to apply custom regression models to your dataset with MS-DAP. + +Additional documentation and examples for creating experimental designs can be found in the limma user guide, section 9; https://bioconductor.org/packages/release/bioc/vignettes/limma/inst/doc/usersguide.pdf + + +note: in the code blocks shown below (grey areas), the lines that start with `#> ` are the respective output that would be printed to the console if you run these code snippets on your computer + + +## load dataset + +In this demo we'll use the Skyline output of the LFQbench study (PMID:27701404) that is already bundled with the MS-DAP package. Not a dataset that requires anything other than A/B testing, but it's a convenient example because the data is ready to go. + +Note that we here parse the sample-to-group assignments straight from the sample filenames. But in typical workflows, this script would start with; + +- dataset = import_dataset_diann(...) +- dataset = import_fasta(...) +- dataset = import_sample_metadata(...) 
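To make the three bullets above concrete, below is a minimal sketch of such a project start. The file paths are hypothetical placeholders, and the exact argument names of `import_dataset_diann()` and `import_fasta()` are assumptions here (only `import_sample_metadata(dataset, filename = ...)` is shown elsewhere in this document), so double-check the respective help pages (e.g. `?import_dataset_diann`) before adapting this to your own data.

```{r eval=FALSE}
library(msdap)

# hypothetical file locations; replace these with your own project files
dataset = import_dataset_diann("C:/myproject/diann_report.tsv")                  # DIA-NN report (first argument assumed to be the report file)
dataset = import_fasta(dataset, files = "C:/myproject/uniprot_proteome.fasta")   # FASTA file(s) used for the search; the 'files' argument name is an assumption
dataset = import_sample_metadata(dataset, filename = "C:/myproject/sample_metadata.xlsx")
```

After these imports, the filtering, normalization and regression steps demonstrated below apply unchanged.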
+ +```{r} +library(msdap) + +f <- system.file("extdata", "Skyline_HYE124_TTOF5600_64var_it2.tsv.gz", package = "msdap") +dataset = import_dataset_skyline(f, confidence_threshold = 0.01, return_decoys = F, acquisition_mode = "dia") +dataset = sample_metadata_custom(dataset, group_regex_array = c(A = "007|009|011", B = "008|010|012") ) +print(dataset$samples %>% select(sample_id, group)) +``` + + +## filtering and normalization + +Assuming that your sample metadata table has a column named "group" that describes your main experimental conditions, +we'll first apply filtering and normalization such that only peptides that were detected in 75% of all samples _per group_ are retained. +Note that if your dataset is DDA, as opposed to DIA, you'd probably want to filter for "quantified in 75% of samples per group" instead (because "detect" for DDA implies MS/MS detection / PSM, whereas for DIA "detect" implies upstream software assigned a strong confidence - the latter is much more common than consistent MS/MS detection in DDA). + +For normalization, we'll first apply the "vsn" algorithm at peptide-level and then perform "modebetween_protein" to ensure the protein-level mode log2fc between "groups" is zero. + +```{r} +dataset = filter_dataset(dataset, filter_min_detect = 3, filter_fraction_detect = 0.75, all_group = TRUE, norm_algorithm = c("vsn", "modebetween_protein")) +``` + + +## applying linear regression with limma + +In this section we will use MS-DAP helper functions (new since release 1.2) to easily perform linear regression with any user-specified model, including block design and multi-level experiments. Below code shows how to implement a simple comparison between two independent groups, i.e. basic A/B testing. The following two examples show more advanced experimental designs. + +```{r} +### step 1: prepare data for regression +# First, print all peptide-level data that are available. This helper function prints all column names in the peptide table that can be used (i.e. these are the outcome of the filter_dataset() function) +print_available_filtering_results(dataset) + +# Create a protein*sample log2 intensity matrix based on the "all_group" filtering we applied previously +protein_data = get_protein_matrix(dataset, intensity_column = "intensity_all_group") + +# Create a sample metadata table that matches the protein data matrix +# Importantly, this sample table is aligned with the protein matrix and downstream code critically assumes that you do not manually reorder the protein matrix nor reorder rows of the samples table +samples = get_samples_for_regression(dataset, protein_data) +print(samples) + +### step 2: define your regression model +# We here have full control over the linear regression model; create any model matrix and contrasts you want following limma syntax (c.f. limma user guide) +# In this example, we are making typical two-group comparisons between samples that are labeled as 'A' and 'B' in the group column of the samples table. +# Note that we here set "0+" to remove the intercept so we can create contrasts between values in the 'group' column in the next step (limma::makeContrasts()) +design = stats::model.matrix( ~ 0 + group, data = samples) +# Print design matrix: we can use respective columns to create contrasts on the next line +print(design) + +# Define 1 or more contrasts.
Basic syntax: contrast_name = design_column_A - design_column_B +design_contrasts = limma::makeContrasts( + AvsB = groupB - groupA, # positive log2fc = higher abundance in first-listed group + # suppose there are multiple groups, we could here add additional comparisons as follows: + # BvsC = groupC - groupB, + levels = design +) + +### step 3: use MS-DAP helper function to easily apply this model to your data +# Result is a table that contains statistics for all specified contrasts +# Note that we can apply the DEqMS method for post-hoc adjustment of protein confidences according to their respective number of peptides. +result = limma_wrapper(protein_data, model_matrix = design, contrasts = design_contrasts, deqms = TRUE) + +# Optionally, add protein metadata (fasta header and gene symbols) +# In the current test dataset we didn't use import_fasta(), but on real datasets this'll be useful +result = inner_join( + dataset$proteins %>% select(protein_id, fasta_headers, gene_symbols_or_id), + result, + by = "protein_id" +) %>% arrange(pvalue) +print(head(result)) +``` + + +### advanced experimental design: paired samples and blocking + +Following the example for a paired test from section 9.4 of the limma user guide: + +_Suppose an experiment is conducted to compare a new treatment (T) with a control (C). Six dogs are used from three sib-ships. For each sib-pair, one dog is given the treatment while the other dog is a control. This produces the targets frame:_ + +| sample_id | group | block | +| --------- | ----- | ----- | +| File1 | control | 1 | +| File2 | treatment | 1 | +| File3 | control | 2 | +| File4 | treatment | 2 | +| File5 | control | 3 | +| File6 | treatment | 3 | + +Above sample table shows the experimental design from the limma example, with minor adaptions to fit conventions in MS-DAP (also, the siblings / "sib-ships" from the limma example are here encoded in the "block" column). If your experimental design features larger "blocks" than 2, as opposed to the paired design shown here, you can use the exact same setup for encoding sample metadata and performing the regression analyses (you may also use names/letters to indicate blocks, these are automatically converted to factors by the MS-DAP functions that prepare the samples table).
+ +We could implement this experimental design in MS-DAP with a minor adaption to the first example in this section: + +```{r eval=FALSE} +# Same data prep as main example above +print_available_filtering_results(dataset) +protein_data = get_protein_matrix(dataset, intensity_column = "intensity_all_group") +samples = get_samples_for_regression(dataset, protein_data) +print(samples) + +# regression formula to test the difference between control and treatment, within each block +design = stats::model.matrix( ~ 0 + block + group, data = samples) +print(design) # print design matrix: we can use respective columns to create contrasts on the next line +design_contrasts = limma::makeContrasts( + # in this contrast specification, positive log2fc = higher abundance in treatment group + control_vs_treatment = grouptreatment - groupcontrol, + levels = design +) + +# Apply linear regression model to your data and let MS-DAP take care of details + apply DEqMS +result = limma_wrapper( + protein_data, + model_matrix = design, + contrasts = design_contrasts, + deqms = TRUE +) + +# add protein metadata and print results +result = inner_join(dataset$proteins %>% select(protein_id, fasta_headers, gene_symbols_or_id), result, by = "protein_id") %>% arrange(pvalue) +print(head(result)) +``` + + +### advanced experimental design: multi-level experiments + +Following the example for a multi-level design from section 9.7 of the limma user guide: + +_This experiment involves 6 subjects, including 3 patients who have the disease and 3 normal subjects. From each subject, we have expression profiles of two tissue types, A and B. In analysing this experiment, we want to compare the two tissue types. This comparison can be made within subjects, because each subject yields a value for both tissues. We also want to compare diseased subjects to normal subjects, but this comparison is between subjects._ + +| sample_id | group | subject | tissue | +| --------- | ----- | ------- | ------ | +| File1 | Diseased | 1 | A | +| File2 | Diseased | 1 | B | +| File3 | Diseased | 2 | A | +| File4 | Diseased | 2 | B | +| File5 | Diseased | 3 | A | +| File6 | Diseased | 3 | B | +| File7 | Normal | 4 | A | +| File8 | Normal | 4 | B | +| File9 | Normal | 5 | A | +| File10 | Normal | 5 | B | +| File11 | Normal | 6 | A | +| File12 | Normal | 6 | B | + +Above sample table shows the experimental design from the limma example, with minor adaptions to fit conventions in MS-DAP. + +We could implement this experimental design in MS-DAP with a minor adaption to the first example in this section. Note that in this example, we have to 1) specify a new variable that combines group and tissue information and 2) specify the `limma_block_variable` parameter in the `limma_wrapper()` function. 
+ +```{r eval=FALSE} +# Same data prep as main example above +print_available_filtering_results(dataset) +protein_data = get_protein_matrix(dataset, intensity_column = "intensity_all_group") +samples = get_samples_for_regression(dataset, protein_data) +print(samples) + +# Update the samples table to encode combinations of the subject's condition and the tissue +samples$treatment = factor(paste(samples$group, samples$tissue, sep = ".")) + +# Create the model matrix using this combination of group*tissue +design = stats::model.matrix( ~ 0 + treatment, data = samples) +print(design) # print design matrix: we can use respective columns to create contrasts on the next line + +# Optionally, we can rename the columns of the design matrix (yields somewhat simpler names) +colnames(design) = levels(samples$treatment) +print(colnames(design)) # view updated column names + +# create contrasts, analogous to the limma user guide: +design_contrasts = limma::makeContrasts( + DiseasedvsNormalForTissueA = Diseased.A-Normal.A, + DiseasedvsNormalForTissueB = Diseased.B-Normal.B, + TissueAvsTissueBForNormal = Normal.B-Normal.A, + TissueAvsTissueBForDiseased = Diseased.B-Diseased.A, + levels = design +) + +# Apply linear regression model to your data and let MS-DAP take care of details + apply DEqMS. +# Importantly, we here specify that limma has to model the inter-subject correlation +# by providing the subject identifiers as a blocking variable. +result = limma_wrapper( + protein_data, + model_matrix = design, + contrasts = design_contrasts, + limma_block_variable = samples$subject, + deqms = TRUE +) + +# add protein metadata and print results +result = inner_join(dataset$proteins %>% select(protein_id, fasta_headers, gene_symbols_or_id), result, by = "protein_id") %>% arrange(pvalue) +print(head(result)) +``` + + +## plotting results + +When following the above example code that performs a custom linear regression analysis with limma, you can use the following code snippet to create typical MS-DAP-styled volcano plots: + +```{r eval=FALSE} + +# here assuming that `result` is the output from `limma_wrapper()` joined with the `dataset$proteins` table, as per above example. + +plot_volcano_allcontrast( + result, log2foldchange_threshold = 0, qvalue_threshold = 0.01, + label_mode = "topn_pvalue", label_target = 10, label_avoid_overlap = TRUE, + mtitle = "volcano, label top 10", show_plots = TRUE +) +``` + + +## validation + +Addendum: we here validate that the first example shown above, which implemented simple A/B testing using the MS-DAP utility functions for flexible modeling with limma, yields exactly the same results as a typical MS-DAP workflow that uses `msdap::analysis_quickstart()` or `msdap::dea()`.
+ +```{r validation} +# regular MS-DAP workflow: setup a contrast and apply the msdap::dea() function (which is also used when you use the typical msdap::analysis_quickstart() function) +dataset = setup_contrasts(dataset, contrast_list = list(c("A", "B"))) +dataset = dea(dataset, dea_algorithm = "deqms") + +stopifnot(nrow(dataset$de_proteins) == nrow(result)) # same number of proteins +stopifnot(dataset$de_proteins$protein_id %in% result$protein_id) # same protein identifiers +# we here have only 1 contrast and 1 dea_algorithm, so we can simply align both tables using match() +index = match(dataset$de_proteins$protein_id, result$protein_id) +print(all.equal(dataset$de_proteins$pvalue, result$pvalue[index])) # same pvalues +print(all.equal(dataset$de_proteins$foldchange.log2, result$foldchange.log2[index])) # same log2fc + +# bonus: validate LFQbench expected outcome by volcano plot; color-coded as yeast=red, ecoli=blue, hela=black +plotdata = result %>% mutate(is_yeast = grepl("_YEAS", protein_id), is_ecoli = grepl("_ECOL", protein_id)) %>% arrange(is_yeast, is_ecoli) +plot(plotdata$foldchange.log2, -log10(plotdata$qvalue), col = ifelse(plotdata$is_yeast, "red", ifelse(plotdata$is_ecoli, "blue", "black")) ) +``` diff --git a/doc/custom_limma.md b/doc/custom_limma.md new file mode 100644 index 0000000..b728115 --- /dev/null +++ b/doc/custom_limma.md @@ -0,0 +1,373 @@ + +- [load dataset](#load-dataset) +- [filtering and normalization](#filtering-and-normalization) +- [applying linear regression with + limma](#applying-linear-regression-with-limma) + - [advanced experimental design: paired samples and + blocking](#advanced-experimental-design-paired-samples-and-blocking) + - [advanced experimental design: multi-level + experiments](#advanced-experimental-design-multi-level-experiments) +- [plotting results](#plotting-results) +- [validation](#validation) + +This vignette demonstrates how the MS-DAP R package can help you perform +differential expression analysis (DEA) with custom linear regression +models. *Note that this requires MS-DAP version 1.2 or later.* + +For all typical case *vs* control analyses (A/B testing), you can use +the default MS-DAP workflow as described on the main page. However, for +some studies we might have to deal with repeated measurement models (by +creating a “block design”) or create linear regression models that +feature interactions. We here show how to apply custom regression models +to your dataset with MS-DAP. + +Additional documentation and examples for creating experimental designs +can be found in the limma user guide, section 9; + + +note: in the code blocks shown below (grey areas), the lines that start +with `#>` are the respective output that would be printed to the console +if you run these code snippets on your computer + +## load dataset + +In this demo we’ll use the Skyline output of the LFQbench study +() that is already bundled with the MS-DAP package. Not a +dataset that requires anything other than A/B testing, but it’s a +convenient example because the data is ready to go. + +Note that we here parse the sample-to-group assignments straight from +the sample filenames. 
But in typical workflows, this script would start +with; + +- dataset = import_dataset_diann(…) +- dataset = import_fasta(…) +- dataset = import_sample_metadata(…) + +``` r +library(msdap) + +f <- system.file("extdata", "Skyline_HYE124_TTOF5600_64var_it2.tsv.gz", package = "msdap") +dataset = import_dataset_skyline(f, confidence_threshold = 0.01, return_decoys = F, acquisition_mode = "dia") +#> info: reading Skyline report... +#> info: input file: C:/VU/code/R/msdap/inst/extdata/Skyline_HYE124_TTOF5600_64var_it2.tsv.gz +#> info: 4 unique target (plain)sequences ambiguously mapped to multiple proteins and thus removed. Examples; TTDVTGTIELPEGVEMVMPGDNIK, LNIISNLDCVNEVIGIR, LMDLSINK, EVDEQMLNVQNK +#> info: 34263/35943 precursors remain after selecting the 'best' precursor for each modified sequence +dataset = sample_metadata_custom(dataset, group_regex_array = c(A = "007|009|011", B = "008|010|012") ) +print(dataset$samples %>% select(sample_id, group)) +#> # A tibble: 6 × 2 +#> sample_id group +#> +#> 1 lgillet_L150206_007 A +#> 2 lgillet_L150206_009 A +#> 3 lgillet_L150206_011 A +#> 4 lgillet_L150206_008 B +#> 5 lgillet_L150206_010 B +#> 6 lgillet_L150206_012 B +``` + +## filtering and normalization + +Assuming that your sample metadata table has a column named “group” that +describes your main experimental conditions, we’ll first apply filtering +and normalization such that only peptides that were detected in 75% of +all samples *per group* are retained. Note that if your dataset is DDA, +as opposed to DIA, you’d probably want to filter for “quantified in 75% +of samples per group” instead (because “detect” for DDA implies MS/MS +detection / PSM, whereas for DIA “detect” implies upstream software +assigned a strong confidence - the latter is much more common than +consistent MS/MS detection in DDA). + +For normalization, we’ll first apply the “vsn” algorithm at +peptide-level and then perform “modebetween_protein” to ensure the +protein-level mode log2fc between “groups” is zero. + +``` r +dataset = filter_dataset(dataset, filter_min_detect = 3, filter_fraction_detect = 0.75, all_group = TRUE, norm_algorithm = c("vsn", "modebetween_protein")) +#> progress: caching filter data took 2 seconds +#> info: filter dataset with settings: min_detect = 3; fraction_detect = 0.75; norm_algorithm = 'vsn&modebetween_protein'; rollup_algorithm = 'maxlfq' +#> 12756/34263 peptides were retained after filtering over all groups +#> progress: peptide filtering and normalization took 3 seconds +``` + +## applying linear regression with limma + +In this section we will use MS-DAP helper functions (new since release +1.2) to easily perform linear regression with any user-specified model, +including block design and multi-level experiments. Below code shows how +implement a simple comparison between two independent groups, i.e. basic +A/B testing. The following two examples show more advanced experimental +designs. + +``` r +### step 1: prepare data for regression +# First, print all peptide-level data that are available. This helper function prints all column names in the peptide table that can be used (i.e. 
these are the outcome of the filter_dataset() function) +print_available_filtering_results(dataset) +#> "intensity" = input data as-is +#> "intensity_all_group" = global data filter ('all_group') + +# Create a protein*sample log2 intensity matrix based on the "all_group" filtering we applied previously +protein_data = get_protein_matrix(dataset, intensity_column = "intensity_all_group") + +# Create a sample metadata table that matches the protein data matrix +# Importantly, this sample table is aligned with the protein matrix and downstream code critically assumes that you do not manually reorder the protein matrix nor reorder rows of the samples table +samples = get_samples_for_regression(dataset, protein_data) +#> info: categorical variable (R factor): group +print(samples) +#> # A tibble: 6 × 2 +#> sample_id group +#> +#> 1 lgillet_L150206_007 A +#> 2 lgillet_L150206_008 B +#> 3 lgillet_L150206_009 A +#> 4 lgillet_L150206_010 B +#> 5 lgillet_L150206_011 A +#> 6 lgillet_L150206_012 B + +### step 2: define your regression model +# We here have full control over the linear regression model; create any model matrix and contrasts you want following limma syntax (c.f. limma user guide) +# In this example, we are making typical two-group comparisons between samples that are labeled as 'A' and 'B' in the group column of the samples table. +# Note that we here set "0+" to remove the intercept so we can create contrasts between values in the 'group' column in the next step (limma::makeContrasts()) +design = stats::model.matrix( ~ 0 + group, data = samples) +# Print design matrix: we can use respective columns to create contrasts on the next line +print(design) +#> groupA groupB +#> 1 1 0 +#> 2 0 1 +#> 3 1 0 +#> 4 0 1 +#> 5 1 0 +#> 6 0 1 +#> attr(,"assign") +#> [1] 1 1 +#> attr(,"contrasts") +#> attr(,"contrasts")$group +#> [1] "contr.treatment" + +# Define 1 or more contrasts. Basic syntax: = - +design_contrasts = limma::makeContrasts( + AvsB = groupB - groupA, # positive log2fc = higher abundance in first-listed group + # suppose there there are multiple groups, we could here add additional comparisons as follows: + # BvsC = groupC - groupB, + levels = design +) + +### step 3: use MS-DAP helper function to easily apply this model to your data +# Result is a table that contains statistics for all specified contrasts +# Note that we can apply the DEqMS method for post-hoc adjustment of protein confidences according to their respective number of peptides. 
+result = limma_wrapper(protein_data, model_matrix = design, contrasts = design_contrasts, deqms = TRUE) +#> info: applying limma eBayes(), followed by DEqMS (to adjust protein confidences according to their respective number of peptides) + +# Optionally, add protein metadata (fasta header and gene symbols) +# In the current test dataset we didn't use import_fasta(), but on real datasets this'll be useful +result = inner_join( + dataset$proteins %>% select(protein_id, fasta_headers, gene_symbols_or_id), + result, + by = "protein_id" +) %>% arrange(pvalue) +print(head(result)) +#> # A tibble: 6 × 10 +#> protein_id fasta_headers gene_symbols_or_id contrast foldchange.log2 +#> +#> 1 1/sp|P0A6F5|CH60_EC… 1/sp|P0A6F5|… 1/sp|P0A6F5|CH60_… AvsB 1.72 +#> 2 1/sp|P0A6M8|EFG_ECO… 1/sp|P0A6M8|… 1/sp|P0A6M8|EFG_E… AvsB 1.79 +#> 3 1/sp|P09373|PFLB_EC… 1/sp|P09373|… 1/sp|P09373|PFLB_… AvsB 1.79 +#> 4 1/sp|P0A6Y8|DNAK_EC… 1/sp|P0A6Y8|… 1/sp|P0A6Y8|DNAK_… AvsB 1.73 +#> 5 1/sp|P0A9Q7|ADHE_EC… 1/sp|P0A9Q7|… 1/sp|P0A9Q7|ADHE_… AvsB 1.83 +#> 6 2/sp|P0CE48|EFTU2_E… 2/sp|P0CE48|… 2/sp|P0CE48|EFTU2… AvsB 1.73 +#> # ℹ 5 more variables: effectsize , standarddeviation , +#> # standarderror , pvalue , qvalue +``` + +### advanced experimental design: paired samples and blocking + +Following the example for a paired test from section 9.4 of the limma +user guide: + +*Suppose an experiment is conducted to compare a new treatment (T) with +a control (C). Six dogs are used from three sib-ships. For each +sib-pair, one dog is given the treatment while the other dog is a +control. This produces the targets frame:* + +| sample_id | group | block | +|-----------|-----------|-------| +| File1 | control | 1 | +| File2 | treatment | 1 | +| File3 | control | 2 | +| File4 | treatment | 2 | +| File5 | control | 3 | +| File6 | treatment | 3 | + +Above sample table shows the experimental design from the limma example, +with minor adaptions to fit conventions in MS-DAP (also, the siblings / +“sib-ships” from the limma example are here encoded in the “block” +column). If your experimental design features larger “blocks” than 2, as +opposed to the paired design shown here, you can use the exact same +setup for encoding sample metadata and performing the regression +analyses (you may also use names/letters to indicate blocks, these are +automatically converted to factors by the MS-DAP functions that prepare +the samples table). 
+ +We could implement this experimental design in MS-DAP with a minor +adaption to the first example in this section: + +``` r +# Same data prep as main example above +print_available_filtering_results(dataset) +protein_data = get_protein_matrix(dataset, intensity_column = "intensity_all_group") +samples = get_samples_for_regression(dataset, protein_data) +print(samples) + +# regression formula to test the difference between control and treatment, within each block +design = stats::model.matrix( ~ 0 + block + group, data = samples) +print(design) # print design matrix: we can use respective columns to create contrasts on the next line +design_contrasts = limma::makeContrasts( + # in this contrast specification, positive log2fc = higher abundance in treatment group + control_vs_treatment = grouptreatment - groupcontrol, + levels = design +) + +# Apply linear regression model to your data and let MS-DAP take care of details + apply DEqMS +result = limma_wrapper( + protein_data, + model_matrix = design, + contrasts = design_contrasts, + deqms = TRUE +) + +# add protein metadata and print results +result = inner_join(dataset$proteins %>% select(protein_id, fasta_headers, gene_symbols_or_id), result, by = "protein_id") %>% arrange(pvalue) +print(head(result)) +``` + +### advanced experimental design: multi-level experiments + +Following the example for a multi-level design from section 9.7 of the +limma user guide: + +*This experiment involves 6 subjects, including 3 patients who have the +disease and 3 normal subjects. From each subject, we have expression +profiles of two tissue types, A and B. In analysing this experiment, we +want to compare the two tissue types. This comparison can be made within +subjects, because each subject yields a value for both tissues. We also +want to compare diseased subjects to normal subjects, but this +comparison is between subjects.* + +| sample_id | group | subject | tissue | +|-----------|----------|---------|--------| +| File1 | Diseased | 1 | A | +| File2 | Diseased | 1 | B | +| File3 | Diseased | 2 | A | +| File4 | Diseased | 2 | B | +| File5 | Diseased | 3 | A | +| File6 | Diseased | 3 | B | +| File7 | Normal | 4 | A | +| File8 | Normal | 4 | B | +| File9 | Normal | 5 | A | +| File10 | Normal | 5 | B | +| File11 | Normal | 6 | A | +| File12 | Normal | 6 | B | + +Above sample table shows the experimental design from the limma example, +with minor adaptions to fit conventions in MS-DAP. + +We could implement this experimental design in MS-DAP with a minor +adaption to the first example in this section. Note that in this +example, we have to 1) specify a new variable that combines group and +tissue information and 2) specify the `limma_block_variable` parameter +in the `limma_wrapper()` function. 
+ +``` r +# Same data prep as main example above +print_available_filtering_results(dataset) +protein_data = get_protein_matrix(dataset, intensity_column = "intensity_all_group") +samples = get_samples_for_regression(dataset, protein_data) +print(samples) + +# Update the samples table to encode combinations of the subject's condition and the tissue +samples$treatment = factor(paste(samples$group, samples$tissue, sep = ".")) + +# Create the model matrix using this combination of group*tissue +design = stats::model.matrix( ~ 0 + treatment, data = samples) +print(design) # print design matrix: we can use respective columns to create contrasts on the next line + +# Optionally, we can rename the columns of the design matrix (yields somewhat simpler names) +colnames(design) = levels(samples$treatment) +print(colnames(design)) # view updated column names + +# create contrasts, analogous to the limma user guide: +design_contrasts = limma::makeContrasts( + DiseasedvsNormalForTissueA = Diseased.A-Normal.A, + DiseasedvsNormalForTissueB = Diseased.B-Normal.B, + TissueAvsTissueBForNormal = Normal.B-Normal.A, + TissueAvsTissueBForDiseased = Diseased.B-Diseased.A, + levels = design +) + +# Apply linear regression model to your data and let MS-DAP take care of details + apply DEqMS. +# Importantly, we here specify that limma has to model the inter-subject correlation +# by providing the subject identifiers as a blocking variable. +result = limma_wrapper( + protein_data, + model_matrix = design, + contrasts = design_contrasts, + limma_block_variable = samples$subject, + deqms = TRUE +) + +# add protein metadata and print results +result = inner_join(dataset$proteins %>% select(protein_id, fasta_headers, gene_symbols_or_id), result, by = "protein_id") %>% arrange(pvalue) +print(head(result)) +``` + +## plotting results + +When following the above example code that performs a custom linear +regression analysis with limma, you can use the following code snippet +to create typical MS-DAP-styled volcano plots: + +``` r + +# here assuming that `result` is the output from `limma_wrapper()` joined with the `dataset$proteins` table, as per above example. + +plot_volcano_allcontrast( + result, log2foldchange_threshold = 0, qvalue_threshold = 0.01, + label_mode = "topn_pvalue", label_target = 10, label_avoid_overlap = TRUE, + mtitle = "volcano, label top 10", show_plots = TRUE +) +``` + +## validation + +Addendum: we here validate that the first example shown above, which +implemented simple A/B testing using the MS-DAP utility functions for +flexible modeling with limma, yields exactly the same results as a +typical MS-DAP workflow that uses `msdap::analysis_quickstart()` or +`msdap::dea()`.
+ +``` r +# regular MS-DAP workflow: setup a contrast and apply the msdap::dea() function (which is also used when you use the typical msdap::analysis_quickstart() function) +dataset = setup_contrasts(dataset, contrast_list = list(c("A", "B"))) +#> info: numeric variable: condition +#> info: contrast: A vs B # condition_variable: group +dataset = dea(dataset, dea_algorithm = "deqms") +#> info: differential expression analysis for contrast: A vs B # condition_variable: group +#> info: using data from peptide filter: global data filter +#> progress: DEqMS took 1 seconds + +stopifnot(nrow(dataset$de_proteins) == nrow(result)) # same number of proteins +stopifnot(dataset$de_proteins$protein_id %in% result$protein_id) # same protein identifiers +# we here have only 1 contrast and 1 dea_algorithm, so we can simply align both tables using match() +index = match(dataset$de_proteins$protein_id, result$protein_id) +print(all.equal(dataset$de_proteins$pvalue, result$pvalue[index])) # same pvalues +#> [1] TRUE +print(all.equal(dataset$de_proteins$foldchange.log2, result$foldchange.log2[index])) # same log2fc +#> [1] TRUE + +# bonus: validate LFQbench expected outcome by volcano plot; color-coded as yeast=red, ecoli=blue, hela=black +plotdata = result %>% mutate(is_yeast = grepl("_YEAS", protein_id), is_ecoli = grepl("_ECOL", protein_id)) %>% arrange(is_yeast, is_ecoli) +plot(plotdata$foldchange.log2, -log10(plotdata$qvalue), col = ifelse(plotdata$is_yeast, "red", ifelse(plotdata$is_ecoli, "blue", "black")) ) +``` + +![](images/customlimma-validation-1.png) diff --git a/doc/custom_norm_dea.Rmd b/doc/custom_norm_dea.Rmd index fc7cd2a..02b09a0 100644 --- a/doc/custom_norm_dea.Rmd +++ b/doc/custom_norm_dea.Rmd @@ -8,7 +8,7 @@ output: ```{r, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, - fig.path = "images/dea-", + fig.path = "images/custnorm-", comment = "#>" ) ``` @@ -203,7 +203,7 @@ print(dataset$de_proteins %>% A simple Venn diagram of proteins at 1% FDR for each method shows our custom function identifies a subset of the results from limma's ebayes function, so we verified that the example implementation doesn't yield random stuff. 
-```{r} +```{r venn} gplots::venn(list(ebayes=dataset$de_proteins %>% filter(dea_algorithm=="ebayes" & qvalue <= 0.01) %>% pull(protein_id), my_dea_stats=dataset$de_proteins %>% filter(dea_algorithm=="my_dea_stats" & qvalue <= 0.01) %>% pull(protein_id))) ``` diff --git a/doc/custom_norm_dea.md b/doc/custom_norm_dea.md index 7912f09..70cc6b1 100644 --- a/doc/custom_norm_dea.md +++ b/doc/custom_norm_dea.md @@ -45,7 +45,8 @@ dataset = import_dataset_skyline(f, confidence_threshold = 0.01, return_decoys = dataset = sample_metadata_custom(dataset, group_regex_array = c(A = "007|009|011", B = "008|010|012") ) dataset = setup_contrasts(dataset, contrast_list = list(c("A", "B"))) -#> info: contrast: A vs B +#> info: numeric variable: condition +#> info: contrast: A vs B # condition_variable: group print(dataset$samples %>% select(sample_id, group)) #> # A tibble: 6 × 2 @@ -265,11 +266,12 @@ dataset = analysis_quickstart( #> 12756/34263 peptides were retained after filtering over all groups #> 21591/34263 peptides were retained after filtering within each group independently ("by group") #> progress: peptide filtering and normalization took 3 seconds -#> info: differential expression analysis for contrast: A vs B +#> info: differential expression analysis for contrast: A vs B # condition_variable: group #> info: using data from peptide filter: global data filter -#> progress: peptide to protein rollup with MaxLFQ (implementation: iq) took 1 seconds #> progress: eBayes took 1 seconds -#> Example custom DEA implementation, my_dea_stats(), yielded 720 hits at qvalue<=0.01 +#> warning: an error occurred in "contrast: A vs B # condition_variable: group" during the execution of DEA function "my_dea_stats" +#> warning: argument "input_intensities_are_log2" is missing, with no default +#> language alg_fun(peptides = peptides_for_contrast, samples = samples_for_contrast, eset_peptides = eset_peptides, eset_proteins = eset_proteins, model_matrix = contr$model_matrix, ... 
#> warning: differential detection parameters set to NA, this analysis is cancelled ``` @@ -280,11 +282,10 @@ print(dataset$de_proteins %>% group_by(dea_algorithm) %>% summarise(`1% FDR` = sum(qvalue <= 0.01), `1% FDR AND foldchange threshold` = sum(qvalue <= 0.01 & signif))) -#> # A tibble: 2 × 3 +#> # A tibble: 1 × 3 #> dea_algorithm `1% FDR` `1% FDR AND foldchange threshold` #> #> 1 ebayes 1045 1045 -#> 2 my_dea_stats 720 720 ``` A simple Venn diagram of proteins at 1% FDR for each method shows our @@ -297,4 +298,4 @@ gplots::venn(list(ebayes=dataset$de_proteins %>% filter(dea_algorithm=="ebayes" my_dea_stats=dataset$de_proteins %>% filter(dea_algorithm=="my_dea_stats" & qvalue <= 0.01) %>% pull(protein_id))) ``` -![](images/dea-unnamed-chunk-8-1.png) +![](images/custnorm-venn-1.png) diff --git a/doc/differential_detection.md b/doc/differential_detection.md index c0a6166..7236a11 100644 --- a/doc/differential_detection.md +++ b/doc/differential_detection.md @@ -92,7 +92,8 @@ dataset = import_dataset_skyline(f, confidence_threshold = 0.01, return_decoys = dataset = sample_metadata_custom(dataset, group_regex_array = c(A = "007|009|011", B = "008|010|012") ) dataset = setup_contrasts(dataset, contrast_list = list(c("A", "B"))) -#> info: contrast: A vs B +#> info: numeric variable: condition +#> info: contrast: A vs B # condition_variable: group print(dataset$samples %>% select(sample_id, group)) #> # A tibble: 6 × 2 diff --git a/doc/differential_expression_analysis.md b/doc/differential_expression_analysis.md index 739e955..a47cfc3 100644 --- a/doc/differential_expression_analysis.md +++ b/doc/differential_expression_analysis.md @@ -45,7 +45,8 @@ dataset = import_dataset_skyline(f, confidence_threshold = 0.01, return_decoys = dataset = sample_metadata_custom(dataset, group_regex_array = c(A = "007|009|011", B = "008|010|012") ) dataset = setup_contrasts(dataset, contrast_list = list(c("A", "B"))) -#> info: contrast: A vs B +#> info: numeric variable: condition +#> info: contrast: A vs B # condition_variable: group print(dataset$samples %>% select(sample_id, group)) #> # A tibble: 6 × 2 @@ -95,7 +96,6 @@ dataset = filter_dataset(dataset, norm_algorithm = c("vwmb", "modebetween_protein"), by_group = F, all_group = T, by_contrast = F) #> progress: caching filter data took 2 seconds -#> progress: peptide to protein rollup with MaxLFQ (implementation: iq) took 1 seconds #> info: filter dataset with settings: min_detect = 3; norm_algorithm = 'vwmb&modebetween_protein'; rollup_algorithm = 'maxlfq' #> 12756/34263 peptides were retained after filtering over all groups #> progress: peptide filtering and normalization took 3 seconds @@ -105,9 +105,8 @@ dataset = filter_dataset(dataset, # apply limma's eBayes to each contrast and flag proteins as significant at 5% FDR and foldchange larger than a threshold estimated from bootstrap analyses (specified by parameter; fc_signif=NA) dataset = dea(dataset, dea_algorithm = "ebayes", qval_signif = 0.05, fc_signif = NA) -#> info: differential expression analysis for contrast: A vs B +#> info: differential expression analysis for contrast: A vs B # condition_variable: group #> info: using data from peptide filter: global data filter -#> progress: peptide to protein rollup with MaxLFQ (implementation: iq) took 1 seconds #> info: log2 foldchange threshold estimated by bootstrap analysis: 0.652 #> progress: eBayes took 1 seconds diff --git a/doc/docker.md b/doc/docker.md index 7097c86..ae12fcd 100644 --- a/doc/docker.md +++ b/doc/docker.md @@ -23,26 +23,26 @@ 
container. Advantages of using the Docker container: -- perfect reproducibility of data analysis results, the entire - software stack is always the same -- only tool that needs to be installed on your computer is Docker +- perfect reproducibility of data analysis results, the entire software + stack is always the same +- only tool that needs to be installed on your computer is Docker Disadvantages of using the Docker container: -- if you’re already working with R/RStudio, getting up-and-running by - just installing the MS-DAP R package is much easier -- there’s a learning curve to use Docker, even though we provide - launcher scripts to make things easier you may need to consult with - your local IT support if things don’t go smoothly - - dealing with files / sharing files between computer and Docker - container can be especially tricky -- performing additional analyses outside standard MS-DAP workflows - might take some extra work if you want to use R packages that aren’t - available within the Docker container - - i.e. you don’t want to install new/extra R packages into the - MS-DAP container as you might mistakingly update other packages - that MS-DAP depends on, thereby breaking the reproducibility - aspect of working with containers +- if you’re already working with R/RStudio, getting up-and-running by + just installing the MS-DAP R package is much easier +- there’s a learning curve to use Docker, even though we provide + launcher scripts to make things easier you may need to consult with + your local IT support if things don’t go smoothly + - dealing with files / sharing files between computer and Docker + container can be especially tricky +- performing additional analyses outside standard MS-DAP workflows might + take some extra work if you want to use R packages that aren’t + available within the Docker container + - i.e. you don’t want to install new/extra R packages into the MS-DAP + container as you might mistakingly update other packages that MS-DAP + depends on, thereby breaking the reproducibility aspect of working + with containers *note; this is intended as a very brief introduction and only highlights some common (dis)advantages from the perspective of MS-DAP users. This @@ -53,31 +53,30 @@ thorough examination of the pros and cons of working with containers.* ## install Docker for Windows or macOS -- download and install Docker for Windows/macOS at: - - - *windows; If WSL2 has not not been installed on your system yet, - a popup notification with installation instructions may appear. - Follow the presented link, or [click - here](https://docs.microsoft.com/en-us/windows/wsl/wsl2-kernel) - to “download the latest WSL2 Linux kernel”. 
Finally, install - this file and reboot.* -- configure Docker - - start Docker Desktop - - open the settings screen - - windows; right-click the Docker icon in the taskbar at - bottom-right of the screen (looks like a whale, you may need - to click the little arrow on the windows taskbar if icons - are hidden) \> click settings - - macOS; Docker \> menu bar \> Preferences - - update settings - - resources \> advanced: set number of CPU cores to use (if in - doubt, set to 4 or more) - - resources \> advanced: set amount of RAM to use (if in - doubt, set to 4\~8GB) - - resources \> file sharing: click the + symbol and select the - directory that holds all your experimental data (eg; - C:/data) -- reboot your computer after installing Docker Desktop +- download and install Docker for Windows/macOS at: + + - *windows; If WSL2 has not not been installed on your system yet, a + popup notification with installation instructions may appear. Follow + the presented link, or [click + here](https://docs.microsoft.com/en-us/windows/wsl/wsl2-kernel) to + “download the latest WSL2 Linux kernel”. Finally, install this file + and reboot.* +- configure Docker + - start Docker Desktop + - open the settings screen + - windows; right-click the Docker icon in the taskbar at + bottom-right of the screen (looks like a whale, you may need to + click the little arrow on the windows taskbar if icons are hidden) + \> click settings + - macOS; Docker \> menu bar \> Preferences + - update settings + - resources \> advanced: set number of CPU cores to use (if in + doubt, set to 4 or more) + - resources \> advanced: set amount of RAM to use (if in doubt, set + to 4~8GB) + - resources \> file sharing: click the + symbol and select the + directory that holds all your experimental data (eg; C:/data) +- reboot your computer after installing Docker Desktop ## install and test Docker on Linux @@ -91,35 +90,35 @@ Ubuntu live-USB; and using the “convenience script” for installation provided by Docker; -- `curl -fsSL https://get.docker.com -o get-docker.sh` -- `sudo sh get-docker.sh` +- `curl -fsSL https://get.docker.com -o get-docker.sh` +- `sudo sh get-docker.sh` 2) Setting up users following -- `sudo groupadd docker` \# might get a warning that group already - exists, which is fine -- `sudo usermod -aG docker $USER` \# or instead of `$USER` , enter - your username -- `newgrp docker` \# update group -- Now we’ll need to log out (the current user) and log back in so the - group membership is re-evaluated. (in the Ubuntu live-USB example - this could be done by; `sudo -i` then `su ubuntu` ) -- `docker run hello-world` \# test if docker works. Should see Docker - downloading an image, print “Hello from Docker !” among other - info/text +- `sudo groupadd docker` \# might get a warning that group already + exists, which is fine +- `sudo usermod -aG docker $USER` \# or instead of `$USER` , enter your + username +- `newgrp docker` \# update group +- Now we’ll need to log out (the current user) and log back in so the + group membership is re-evaluated. (in the Ubuntu live-USB example this + could be done by; `sudo -i` then `su ubuntu` ) +- `docker run hello-world` \# test if docker works. Should see Docker + downloading an image, print “Hello from Docker !” among other + info/text 3) run the MS-DAP docker launcher script (you can download it from this GitHub repository, there’s a versioned script attached to each release at ) -- `mkdir -m 777 ~/msdap_data` \# importantly, check the permission - flags used here. 
Create a folder in your home directory where data - is exchanged between host system and the Docker container -- `cd ~/msdap_data` \# we want to run the launcher script from within - the directory where data is exchanged -- `~/Downloads/msdap_launcher_unix.sh` \# assume the launcher script - is here and already made executable +- `mkdir -m 777 ~/msdap_data` \# importantly, check the permission flags + used here. Create a folder in your home directory where data is + exchanged between host system and the Docker container +- `cd ~/msdap_data` \# we want to run the launcher script from within + the directory where data is exchanged +- `~/Downloads/msdap_launcher_unix.sh` \# assume the launcher script is + here and already made executable ## Start the Docker container @@ -134,28 +133,26 @@ A launcher script is provided for user convenience; it executes all commands needed to run the MS-DAP docker container. To do this manually instead, consult the next section for a step-by-step guide. -- download the script for the MS-DAP version you want to use at the - [‘releases’ section of this github - repository](https://github.com/ftwkoopmans/msdap/releases) -- copy/move the launcher script to the directory that holds your - experiment data (eg; `C:/data/proteomics/`) - - the file location is important: only data with the directory - that contains the launcher script will be accessible within the - Docker container. See further, subsection **Locating your - mounted data within Docker containers** -- run the script to launch MS-DAP - - windows; right-click `msdap_launcher_windows.ps1` and select - “Run with PowerShell” - - *note; upon first use, you may see a security warning - because this script was downloaded from the internet (If - it’s a “Execution Policy Change” warning, enter “Y” to - proceed)* - - macOS and Linux; execute the `msdap_launcher_unix.sh` script - - *note; if you are unfamiliar with using scripts,* [here is a - macOS - guide](https://support.apple.com/guide/terminal/make-a-file-executable-apdd100908f-06b3-4e63-8a87-32e71241bab4/mac) - - *note; on Linux, you need to make sure the Docker daemon is - running prior to starting the script* +- download the script for the MS-DAP version you want to use at the + [‘releases’ section of this github + repository](https://github.com/ftwkoopmans/msdap/releases) +- copy/move the launcher script to the directory that holds your + experiment data (eg; `C:/data/proteomics/`) + - the file location is important: only data with the directory that + contains the launcher script will be accessible within the Docker + container. See further, subsection **Locating your mounted data + within Docker containers** +- run the script to launch MS-DAP + - windows; right-click `msdap_launcher_windows.ps1` and select “Run + with PowerShell” + - *note; upon first use, you may see a security warning because this + script was downloaded from the internet (If it’s a “Execution + Policy Change” warning, enter “Y” to proceed)* + - macOS and Linux; execute the `msdap_launcher_unix.sh` script + - *note; if you are unfamiliar with using scripts,* [here is a macOS + guide](https://support.apple.com/guide/terminal/make-a-file-executable-apdd100908f-06b3-4e63-8a87-32e71241bab4/mac) + - *note; on Linux, you need to make sure the Docker daemon is + running prior to starting the script* To use a different MS-DAP version, simply change the version number at the top of the launcher script. 
To view all available MS-DAP Docker @@ -164,13 +161,13 @@ containers versions, go to: **what does the script do?** -- if Docker is not up-and-running, start Docker -- the launcher script will automatically download the MS-DAP container - (if needed) -- if MS-DAP is already up-and-running, stop it -- start the MS-DAP docker container -- your default web browser is launched and navigates to a local - webserver within the Docker container @ +- if Docker is not up-and-running, start Docker +- the launcher script will automatically download the MS-DAP container + (if needed) +- if MS-DAP is already up-and-running, stop it +- start the MS-DAP docker container +- your default web browser is launched and navigates to a local + webserver within the Docker container @ ### commandline @@ -183,12 +180,12 @@ with the launcher script). 1) First, make sure that Docker is up and running. For Windows and macOS, simply start the Docker Desktop application. -- On Windows, you may hover the mouse cursor over the Docker tray icon - that looks like a whale in the bottom-right section of the taskbar - to confirm its status is “Docker Desktop is running” (if no icon is - visible but you did start Docker, maybe the icon is hidden and you - have to click the icon on the bottom-right of the Windows taskbar to - “Show hidden icons”). +- On Windows, you may hover the mouse cursor over the Docker tray icon + that looks like a whale in the bottom-right section of the taskbar to + confirm its status is “Docker Desktop is running” (if no icon is + visible but you did start Docker, maybe the icon is hidden and you + have to click the icon on the bottom-right of the Windows taskbar to + “Show hidden icons”). 2) Next, start a terminal. On Windows, start powershell (start \> type: `powershell` \> click `Windows PowerShell` icon). On macOS, open @@ -249,24 +246,23 @@ Use a snippet of R code that uses the MS-DAP R package to analyse the Klaassen et al. (2018) dataset that was pre-processed with MetaMorpheus and included with MS-DAP: -- To immediately view the expected results, [click here to download - the PDF - report](/examples/data/dataset_Klaassen2018_pmid26931375_report.pdf) - we already prepared and uploaded. +- To immediately view the expected results, [click here to download the + PDF + report](/examples/data/dataset_Klaassen2018_pmid26931375_report.pdf) + we already prepared and uploaded. 
Data analysis steps: -- assuming the MS-DAP Docker container is running and you accessed it - using a web-browser at -- in the RStudio menu on the top-left; file \> New File \> R Script -- copy/paste the below code snippet to the panel in center of the - screen -- run all lines of code: select all (by mouse drag or click anywhere - then `control+A`) \> click the `run` button on top of the code panel - to run all selected lines -- output log and result summary are shown at the bottom of the screen -- output files can be found in the *mapped directory* that you defined - when launching MS-DAP +- assuming the MS-DAP Docker container is running and you accessed it + using a web-browser at +- in the RStudio menu on the top-left; file \> New File \> R Script +- copy/paste the below code snippet to the panel in center of the screen +- run all lines of code: select all (by mouse drag or click anywhere + then `control+A`) \> click the `run` button on top of the code panel + to run all selected lines +- output log and result summary are shown at the bottom of the screen +- output files can be found in the *mapped directory* that you defined + when launching MS-DAP ``` r library(msdap) @@ -306,8 +302,8 @@ print_dataset_summary(dataset) **documentation from Docker to verify your Docker installation is ok** -- -- +- +- **permission denied while trying to connect to the Docker daemon** diff --git a/doc/images/custnorm-venn-1.png b/doc/images/custnorm-venn-1.png new file mode 100644 index 0000000..1fc6ac9 Binary files /dev/null and b/doc/images/custnorm-venn-1.png differ diff --git a/doc/images/customlimma-validation-1.png b/doc/images/customlimma-validation-1.png new file mode 100644 index 0000000..d62088c Binary files /dev/null and b/doc/images/customlimma-validation-1.png differ diff --git a/doc/images/dd-zscore-hist-1.png b/doc/images/dd-zscore-hist-1.png index 13ecc81..174d5b0 100644 Binary files a/doc/images/dd-zscore-hist-1.png and b/doc/images/dd-zscore-hist-1.png differ diff --git a/doc/images/dea-unnamed-chunk-8-1.png b/doc/images/dea-unnamed-chunk-8-1.png deleted file mode 100644 index 6cf9106..0000000 Binary files a/doc/images/dea-unnamed-chunk-8-1.png and /dev/null differ diff --git a/doc/intro.md b/doc/intro.md index fba579b..a2c591e 100644 --- a/doc/intro.md +++ b/doc/intro.md @@ -1,51 +1,34 @@ -- abstract -- Features -- Computational - procedures involved in differential expression analysis - - DEA-workflow: feature - selection - - DEA-workflow: normalization - - DEA-workflow: statistical - models - - differential detection - - estimating foldchange - thresholds -- Quality Control - - sample metadata - - detect counts - - chromatography - - within-group foldchange - distributions - - CoV leave-one-out analysis - - PCA - - use the - MS-DAP multifaceted analyses in QC analysis - - Volcano plot - - protein - foldchanges estimated by statistical models -- Examples of full reports - - Klaassen et - al. APMS wildtype vs knockout (DDA) - - O’Connel et al. benchmark - dataset (DDA) - - Bader et - al. 
large-scale AD~control CSF cohorts (DIA) +- [abstract](#abstract) +- [Features](#features) +- [Computational procedures involved in differential expression + analysis](#computational-procedures-involved-in-differential-expression-analysis) + - [DEA-workflow: feature selection](#dea-workflow-feature-selection) + - [DEA-workflow: normalization](#dea-workflow-normalization) + - [DEA-workflow: statistical models](#dea-workflow-statistical-models) + - [differential detection](#differential-detection) + - [estimating foldchange + thresholds](#estimating-foldchange-thresholds) +- [Quality Control](#quality-control) + - [sample metadata](#sample-metadata) + - [detect counts](#detect-counts) + - [chromatography](#chromatography) + - [within-group foldchange + distributions](#within-group-foldchange-distributions) + - [CoV leave-one-out analysis](#cov-leave-one-out-analysis) + - [PCA](#pca) + - [use the MS-DAP multifaceted analyses in QC + analysis](#use-the-ms-dap-multifaceted-analyses-in-qc-analysis) + - [Volcano plot](#volcano-plot) + - [protein foldchanges estimated by statistical + models](#protein-foldchanges-estimated-by-statistical-models) +- [Examples of full reports](#examples-of-full-reports) + - [Klaassen et al. APMS wildtype vs knockout + (DDA)](#klaassen-et-al-apms-wildtype-vs-knockout-dda) + - [O’Connel et al. benchmark dataset + (DDA)](#oconnel-et-al-benchmark-dataset-dda) + - [Bader et al. large-scale AD~control CSF cohorts + (DIA)](#bader-et-al-large-scale-adcontrol-csf-cohorts-dia) This document provides an introduction to MS-DAP; what is it and how does it work, together with highlights from the MS-DAP quality control @@ -76,23 +59,25 @@ innovations. ## Features -![MS-DAP workflow](images/msdap-overview.png) +
+<figure>
+<img src="images/msdap-overview.png" alt="MS-DAP workflow" />
+<figcaption>MS-DAP workflow</figcaption>
+</figure>
MS-DAP, Mass Spectrometry Downstream Analysis Pipeline: -- Analysis independent of RAW data processing software -- Wide selection of normalization algorithms and statistical models -- Plugin architecture to support future algorithms -- Standardized workflow, regardless of configured feature selection - and data processing algorithms -- Extensive data visualization, including both popular/common plots - and novelties introduced by MS-DAP, covering many quality control - aspects -- The report is a single PDF, making your results easy to share online -- The publication-grade figures are stored as vector graphics, so - simply open the report PDF in Adobe Illustrator to include any of - the MS-DAP visualizations as panels in your main figures -- Available as a Docker container and R package +- Analysis independent of RAW data processing software +- Wide selection of normalization algorithms and statistical models +- Plugin architecture to support future algorithms +- Standardized workflow, regardless of configured feature selection and + data processing algorithms +- Extensive data visualization, including both popular/common plots and + novelties introduced by MS-DAP, covering many quality control aspects +- The report is a single PDF, making your results easy to share online +- The publication-grade figures are stored as vector graphics, so simply + open the report PDF in Adobe Illustrator to include any of the MS-DAP + visualizations as panels in your main figures +- Available as a Docker container and R package ## Computational procedures involved in differential expression analysis @@ -115,15 +100,15 @@ which peptides represent reliable data that should be used in downstream statistical analysis. The following criteria are available to determine whether a peptide is ‘valid’ in a sample group: -- identified in at least N samples -- identified in at least x% of samples -- quantified in at least N samples -- quantified in at least x% of samples -- topN peptides per protein; after above filters, rank peptides by the - number of samples where detected and their overall CoV and keep the - top N -- the respective protein has at least N peptides that pass the above - filters +- identified in at least N samples +- identified in at least x% of samples +- quantified in at least N samples +- quantified in at least x% of samples +- topN peptides per protein; after above filters, rank peptides by the + number of samples where detected and their overall CoV and keep the + top N +- the respective protein has at least N peptides that pass the above + filters ‘identified’ refers to peptide *p* in sample *s* was identified through MS/MS for DDA datasets, or identified with a confidence qvalue \<= 0.01 @@ -138,44 +123,43 @@ are several normalization algorithms available in MS-DAP, below documentation is also available at MS-DAP function `msdap::normalization_algorithms()`: -- median: scale each sample such that median abundance values are the - same for all samples in the dataset. -- loess: Loess normalization as implemented in the limma R package - () [R - package](https://bioconductor.org/packages/release/bioc/html/limma.html). - code: - `limma::normalizeCyclicLoess(log2_data, iterations = 10, method = "fast")`. - Normalize the columns of a matrix, cyclicly applying loess - normalization to normalize each columns against the average over all - columns. 
-- vsn: Variance Stabilizing Normalization (VSN) as implemented in the - vsn R package () [R - package](https://bioconductor.org/packages/release/bioc/html/vsn.html). - code: `vsn::justvsn()`. From bioconductor: “The model incorporates - data calibration step (a.k.a. normalization), a model for the - dependence of the variance on the mean intensity and a variance - stabilizing data transformation. Differences between transformed - intensities are analogous to ‘normalized log-ratios’”. -- rlr: Robust Linear Regression normalization, as implemented in the - MSqRob package () [R - package](https://github.com/statOmics/msqrob). For each sample s, - perform a robust linear regression of all values (peptide - intensities) against overall median values (e.g. median value of - each peptide over all samples) to obtain the normalization factor - for sample s. -- msempire: log-foldchange mode normalization, as implemented in the - msEmpiRe package () [R - package](https://github.com/zimmerlab/MS-EmpiRe). Instead of - computing all pairwise sample scaling factors (i.e. foldchange - distributions between all pairs of samples within a sample group), - MS-EmpiRe uses single linkage clustering to normalize to subsets of - ‘most similar’ samples and iteratively expands until all - within-group samples are covered. -- vwmb: Variation Within, Mode Between (VWMB) normalization. In brief, - this minimizes the median peptide variation within each sample - group, then scales between all pairwise sample groups such that the - log-foldchange mode is zero. The normalization algorithm consists of - two consecutive steps: +- median: scale each sample such that median abundance values are the + same for all samples in the dataset. +- loess: Loess normalization as implemented in the limma R package + () [R + package](https://bioconductor.org/packages/release/bioc/html/limma.html). + code: + `limma::normalizeCyclicLoess(log2_data, iterations = 10, method = "fast")`. + Normalize the columns of a matrix, cyclicly applying loess + normalization to normalize each columns against the average over all + columns. +- vsn: Variance Stabilizing Normalization (VSN) as implemented in the + vsn R package () [R + package](https://bioconductor.org/packages/release/bioc/html/vsn.html). + code: `vsn::justvsn()`. From bioconductor: “The model incorporates + data calibration step (a.k.a. normalization), a model for the + dependence of the variance on the mean intensity and a variance + stabilizing data transformation. Differences between transformed + intensities are analogous to ‘normalized log-ratios’”. +- rlr: Robust Linear Regression normalization, as implemented in the + MSqRob package () [R + package](https://github.com/statOmics/msqrob). For each sample s, + perform a robust linear regression of all values (peptide intensities) + against overall median values (e.g. median value of each peptide over + all samples) to obtain the normalization factor for sample s. +- msempire: log-foldchange mode normalization, as implemented in the + msEmpiRe package () [R + package](https://github.com/zimmerlab/MS-EmpiRe). Instead of computing + all pairwise sample scaling factors (i.e. foldchange distributions + between all pairs of samples within a sample group), MS-EmpiRe uses + single linkage clustering to normalize to subsets of ‘most similar’ + samples and iteratively expands until all within-group samples are + covered. +- vwmb: Variation Within, Mode Between (VWMB) normalization. 
In brief, + this minimizes the median peptide variation within each sample group, + then scales between all pairwise sample groups such that the + log-foldchange mode is zero. The normalization algorithm consists of + two consecutive steps: 1) samples are scaled within each group such that the median of variation estimates for all rows is minimized @@ -184,27 +168,26 @@ documentation is also available at MS-DAP function sample-group-level to minimize the mode log-foldchange between all groups See further MS-DAP function `normalize_vwmb`. -- mwmb: Mode Within, Mode Between (MWMB) normalization. A variant of - VWMB. Normalize (/scale) samples within each sample group such that - their pairwise log-foldchange modes are zero, then scales between - groups such that the log-foldchange mode is zero (i.e. the - between-group part is the same as VWMB). If the dataset has - (unknown) covariates and a sufficient number of replicates, this - might be beneficial because covariate-specific effects are not - averaged out as they might be with `VWMB`. See further MS-DAP - function `normalize_vwmb`. -- modebetween: only the “Mode Between” part of VWMB described earlier, - does not affect scaling between (replicate) samples within the same - sample group. -- modebetween_protein (also referred to as “MBprot”, e.g. in the - MS-DAP manuscript and some documentation): only the “Mode Between” - part of VWMB described earlier, but the scaling factors are computed - at protein-level !! When this normalization function is used, the - `normalize_modebetween_protein` function will first rollup the - peptide data matrix to protein-level, then compute - between-sample-group scaling factors and finally apply those to the - input peptide-level data matrix to compute the normalized peptide - data. +- mwmb: Mode Within, Mode Between (MWMB) normalization. A variant of + VWMB. Normalize (/scale) samples within each sample group such that + their pairwise log-foldchange modes are zero, then scales between + groups such that the log-foldchange mode is zero (i.e. the + between-group part is the same as VWMB). If the dataset has (unknown) + covariates and a sufficient number of replicates, this might be + beneficial because covariate-specific effects are not averaged out as + they might be with `VWMB`. See further MS-DAP function + `normalize_vwmb`. +- modebetween: only the “Mode Between” part of VWMB described earlier, + does not affect scaling between (replicate) samples within the same + sample group. +- modebetween_protein (also referred to as “MBprot”, e.g. in the MS-DAP + manuscript and some documentation): only the “Mode Between” part of + VWMB described earlier, but the scaling factors are computed at + protein-level !! When this normalization function is used, the + `normalize_modebetween_protein` function will first rollup the peptide + data matrix to protein-level, then compute between-sample-group + scaling factors and finally apply those to the input peptide-level + data matrix to compute the normalized peptide data. Multiple normalization algorithms can be applied subsequentially in MS-DAP, e.g. first apply “vsn” to normalize the dataset at peptide-level @@ -286,22 +269,22 @@ of peptides identified in a sample in mind ! 
Computational procedures for comparing sample groups *A* and *B*; -- to account for sample loading etc., first scale the weight of all - peptides per sample as; 1 / total number of detected peptides in - sample *s* -- *score_pA*: the score for protein *p* in sample group *A* is the sum - of the weighted score of all peptides that belong to protein *p* in - all samples (within group *A*) -- ratio for protein *p* in *A vs B*: log2(*score_pB* + *minimum - non-zero score in group B*) - log2(*score_pA* + *minimum non-zero - score in group A*) -- finally, we standardize all protein ratios by subtracting the - overall mean value, then dividing by the standard deviation -- proteins of interest, *candidates*, are defined as proteins with an - absolute z-score of at least 2 AND at least a count difference - between groups *A* and *B* of *group size* \* 0.75 (the latter - guards against proteins with 0 detect in one group and 1 peptide in - 1 sample in the other) +- to account for sample loading etc., first scale the weight of all + peptides per sample as; 1 / total number of detected peptides in + sample *s* +- *score_pA*: the score for protein *p* in sample group *A* is the sum + of the weighted score of all peptides that belong to protein *p* in + all samples (within group *A*) +- ratio for protein *p* in *A vs B*: log2(*score_pB* + *minimum non-zero + score in group B*) - log2(*score_pA* + *minimum non-zero score in + group A*) +- finally, we standardize all protein ratios by subtracting the overall + mean value, then dividing by the standard deviation +- proteins of interest, *candidates*, are defined as proteins with an + absolute z-score of at least 2 AND at least a count difference between + groups *A* and *B* of *group size* \* 0.75 (the latter guards against + proteins with 0 detect in one group and 1 peptide in 1 sample in the + other) ### estimating foldchange thresholds @@ -330,12 +313,12 @@ Satija at MS-DAP builds a report that allows in depth quality control (QC). Building blocks of the QC report are: -- individual samples analyzed through identified peptides and - chromatographic effects -- reproducibility & outliers visualized among replicates -- presentation of dataset-wide effects; identification of batch - effects through PCA -- information needed to reproduce results +- individual samples analyzed through identified peptides and + chromatographic effects +- reproducibility & outliers visualized among replicates +- presentation of dataset-wide effects; identification of batch effects + through PCA +- information needed to reproduce results The QC report can be used to evaluate data that thereafter is subsequently re-analyzed. For instance, after inspection the report @@ -395,8 +378,12 @@ In this example, gel d clearly was the most successful ‘experiment batch’ and we observe a troublesome difference in peptide detection counts between gels. -![detect counts, color-coded by sample -metadata](images/qc-detect_counts.png) +
+<figure>
+<img src="images/qc-detect_counts.png" alt="detect counts, color-coded by sample metadata" />
+<figcaption>detect counts, color-coded by sample metadata</figcaption>
+</figure>
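As a rough illustration of what this panel summarizes (this is not the report code; it assumes the usual MS-DAP long-format `dataset$peptides` table with a boolean `detect` column, a `shortname` column in `dataset$samples`, and a hypothetical metadata column `gel` used for color-coding):

``` r
# hypothetical sketch: count detected peptides per sample, color-coded by a metadata column
library(dplyr)
library(ggplot2)

detect_counts = dataset$peptides %>%
  filter(detect) %>%                        # keep peptide*sample entries flagged as 'identified'
  distinct(sample_id, peptide_id) %>%
  count(sample_id, name = "n_detect") %>%
  left_join(dataset$samples, by = "sample_id")

# 'gel' is a hypothetical metadata column, analogous to the color-coding in the figure above
ggplot(detect_counts, aes(x = shortname, y = n_detect, fill = gel)) +
  geom_col() +
  coord_flip() +
  theme_bw()
```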
### chromatography @@ -424,11 +411,19 @@ minutes. **Typical sample:** -![RT figures, typical results](images/qc-rt_wt4_typical.png) +
+<figure>
+<img src="images/qc-rt_wt4_typical.png" alt="RT figures, typical results" />
+<figcaption>RT figures, typical results</figcaption>
+</figure>
**Problematic sample; temporary drop in sensitivity** -![RT figures, trouble in KO5](images/qc-rt_ko5_outlier.png) +
+<figure>
+<img src="images/qc-rt_ko5_outlier.png" alt="RT figures, trouble in KO5" />
+<figcaption>RT figures, trouble in KO5</figcaption>
+</figure>
**Figure legends:** The top panel shows the number of peptides in the input data, e.g. as recognized by the software that generated input for @@ -492,8 +487,12 @@ for its outliers that nicely illustrate this figure), so a legend that shows the sample names and their respective color-coding is omitted here but available in any QC report of course. -![within-group foldchange -distributions](images/qc-foldchange_outlier.png) +
+<figure>
+<img src="images/qc-foldchange_outlier.png" alt="within-group foldchange distributions" />
+<figcaption>within-group foldchange distributions</figcaption>
+</figure>
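A conceptual sketch of the underlying idea, not the MS-DAP implementation: for each sample, peptide log2 foldchanges are taken against the average of the other samples in the same group, and the resulting distributions are compared. Here `x_log2` is assumed to be a log2 peptide intensity matrix whose columns are the samples of one sample group.

``` r
# conceptual sketch; x_log2 = log2 peptide intensity matrix, columns are samples of one group
fc_within_group = function(x_log2) {
  sapply(seq_len(ncol(x_log2)), function(j) {
    # log2 foldchange of sample j versus the mean of the other samples in the group
    x_log2[, j] - rowMeans(x_log2[, -j, drop = FALSE], na.rm = TRUE)
  })
}

# one density curve per sample; a deviating sample shows a broader and/or shifted curve
fc = fc_within_group(x_log2)
plot(density(fc[, 1], na.rm = TRUE), main = "within-group log2 foldchange distributions")
for (j in 2:ncol(fc)) lines(density(fc[, j], na.rm = TRUE), col = j)
```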
### CoV leave-one-out analysis @@ -521,7 +520,10 @@ observe that the mode of the CoV distribution is much lower after removing the purple sample (already marked as ‘exclude’ in sample metadata). -![leave-one-out](images/qc-cov_loo_outlier.png) +
+<figure>
+<img src="images/qc-cov_loo_outlier.png" alt="leave-one-out" />
+<figcaption>leave-one-out</figcaption>
+</figure>
**Figure legends:** Samples marked as ‘exclude’ in the provided sample metadata table are visualized as dashed lines. @@ -594,7 +596,11 @@ with the SDS-PAGE gels used, demonstrating these dimension reductions capture a variation in peptide abundance values that coincides with phenotype not with the experiment technicality reviewed here. -![PCA automatic color coding](images/qc-pca_color_codes.png) +
+<figure>
+<img src="images/qc-pca_color_codes.png" alt="PCA automatic color coding" />
+<figcaption>PCA automatic color coding</figcaption>
+</figure>
**Figure legends** The first 3 principal components compared visually
(1 *vs* 2, 1 *vs* 3, 2 *vs* 3) on the rows. Left- and right-side panels on
@@ -627,7 +633,12 @@ sample WT5: ![RT figures, trouble in WT5](images/qc-rt_wt5_outlier.png)
above, we can infer this is most likely caused by technicalities and
not due to biology!

-![PCA outliers corroborate earlier QC](images/qc-pca_outlier.png)
+<figure>
+<img src="images/qc-pca_outlier.png" alt="PCA outliers corroborate earlier QC" />
+<figcaption>PCA outliers corroborate earlier QC</figcaption>
+</figure>
Without the detailed QC plots from MS-DAP that describe deviation in
peptide quantity over HPLC elution time, one would not know why these
@@ -651,7 +662,10 @@ statistical comparison.
Note that the title reveals these are results
from the *MSqRob* statistical model; MS-DAP automatically generates
figures for each statistical model \* each contrast.

-![volcano](images/qc-volcano_Klaassen_shisa6ip.png)
+<figure>
+<img src="images/qc-volcano_Klaassen_shisa6ip.png" alt="volcano" />
+<figcaption>volcano</figcaption>
+</figure>
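For readers scripting this outside the report: a minimal sketch of how one volcano per contrast could be produced with the updated `plot_volcano()` (or, more simply, with the new `plot_volcano_allcontrast()` wrapper). It assumes DEA results are in `dataset$de_proteins` with a `contrast` column; the threshold and labeling parameter values below are illustrative only.

``` r
# illustrative sketch, not the report code
library(dplyr)

for (contr in unique(dataset$de_proteins$contrast)) {
  msdap::plot_volcano(
    # subset the DEA results for one contrast and add protein metadata for labels
    dataset$de_proteins %>% filter(contrast == contr) %>% left_join(dataset$proteins),
    log2foldchange_threshold = 1,
    qvalue_threshold = 0.01,
    mtitle = contr,
    label_mode = "topn_pvalue",
    label_target = 25,
    label_avoid_overlap = TRUE,
    show_plots = TRUE   # print each plot immediately
  )
}
```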
### protein foldchanges estimated by statistical models @@ -677,7 +691,11 @@ described in the previous section. Besides the point above regarding MSqRob, we also observe a foldchange-mode at zero for both the peptide-level model MS-EmpiRe and the protein-level eBayes model. -![volcano](images/qc-stat-fc-density_Klaassen_shisa6ip.png) +
+<figure>
+<img src="images/qc-stat-fc-density_Klaassen_shisa6ip.png" alt="volcano" />
+<figcaption>volcano</figcaption>
+</figure>
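A rough sketch of how such foldchange-distribution comparisons could be reproduced by hand, assuming `dataset$de_proteins` holds the DEA results with columns `foldchange.log2`, `dea_algorithm` and `contrast` (this is not the report's plotting code):

``` r
# illustrative only: compare log2 foldchange distributions across DEA models
library(ggplot2)

ggplot(dataset$de_proteins, aes(x = foldchange.log2, colour = dea_algorithm)) +
  geom_density(na.rm = TRUE) +
  geom_vline(xintercept = 0, linetype = "dashed") +
  facet_wrap(~contrast) +
  theme_bw()
```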
## Examples of full reports @@ -696,7 +714,7 @@ The MS-DAP report of the O’Connel 2018 DDA benchmark dataset shows application to a MaxQuant dataset: [O’Connel 2018 dataset](/examples/data/OConnel2018_pmid29635916_report.pdf) -### Bader et al. large-scale AD\~control CSF cohorts (DIA) +### Bader et al. large-scale AD~control CSF cohorts (DIA) Demonstration of MS-DAP application to a large-scale biofluid dataset (). Input data are the Spectronaut report made available diff --git a/docker/Dockerfile b/docker/Dockerfile index 21bb21b..7ae73a2 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -33,8 +33,8 @@ RUN R -e "devtools::install_version('remaCor', '0.0.18', upgrade = 'never', repo ### MS-DAP # from local devtools::build() file RUN mkdir /msdap -COPY temp/msdap_1.1.3.tar.gz /msdap/ -RUN R -e "devtools::install_local('/msdap/msdap_1.1.3.tar.gz', upgrade = 'never')" +COPY temp/msdap_1.2.tar.gz /msdap/ +RUN R -e "devtools::install_local('/msdap/msdap_1.2.tar.gz', upgrade = 'never')" # alternatively, install from github; RUN R -e "devtools::install_github('ftwkoopmans/msdap@1.1', upgrade = 'never')" diff --git a/docker/msdap_launcher_unix.sh b/docker/msdap_launcher_unix.sh index bce6258..e242917 100644 --- a/docker/msdap_launcher_unix.sh +++ b/docker/msdap_launcher_unix.sh @@ -3,7 +3,7 @@ # MS-DAP launch script # https://github.com/ftwkoopmans/msdap -VERSION="1.1.3" +VERSION="1.2" ### OS diff --git a/docker/msdap_launcher_windows.ps1 b/docker/msdap_launcher_windows.ps1 index 5df5148..0d2eb12 100644 --- a/docker/msdap_launcher_windows.ps1 +++ b/docker/msdap_launcher_windows.ps1 @@ -1,7 +1,7 @@ # MS-DAP launch script # https://github.com/ftwkoopmans/msdap -$VERSION = "1.1.3" +$VERSION = "1.2" Write-Host "$((Get-Date).ToString("HH:mm:ss")) - Starting MS-DAP launcher script for version: $VERSION" diff --git a/inst/rmd/report.Rmd b/inst/rmd/report.Rmd index 6aee804..cfc9647 100644 --- a/inst/rmd/report.Rmd +++ b/inst/rmd/report.Rmd @@ -481,11 +481,16 @@ if(length(l_contrast) > 0) { contr = names(l_contrast)[contr_index] # cat('\n## ', contr, '\n\n') - cat('\n## ', unlist(lapply(strsplit(sub("^contrast:\\s*", "", contr), " vs ", fixed = T), function(x) paste(stringr::str_trunc(x, 35, "right"), collapse = " vs "))), '\n\n') # analogous to formatting contrast shortnames for table tib_report_stats_summary @ report main function + cat('\n## ', unlist(lapply(strsplit(gsub(" *#.*", "", sub("^contrast:\\s*", "", contr)), " vs ", fixed = T), function(x) paste(stringr::str_trunc(x, 35, "right"), collapse = " vs "))), '\n\n') # analogous to formatting contrast shortnames for table tib_report_stats_summary @ report main function col_contr_intensity = get_column_intensity(dataset$peptides, contr) cat("*", sprintf("**user setting:** using '%s' peptide filtering approach", names(col_contr_intensity)), " \n\n") + if(length(dataset$contrasts) >= contr_index && + dataset$contrasts[[contr_index]]$label == contr && + length(dataset$contrasts[[contr_index]]$colname_additional_variables) > 0) { + cat("*", sprintf("**user setting:** additional regression variables: %s", paste(dataset$contrasts[[contr_index]]$colname_additional_variables, collapse = ", ")), " \n\n") + } tib_log = dataset$peptides %>% select(peptide_id, protein_id, intensity = !!as.character(col_contr_intensity)) %>% diff --git a/man/add_contrast.Rd b/man/add_contrast.Rd new file mode 100644 index 0000000..d6732ab --- /dev/null +++ b/man/add_contrast.Rd @@ -0,0 +1,118 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in 
R/sample_metadata.R +\name{add_contrast} +\alias{add_contrast} +\title{Create a contrast for differential expression analysis} +\usage{ +add_contrast( + dataset, + colname_condition_variable, + values_condition1, + values_condition2, + colname_additional_variables = NULL +) +} +\arguments{ +\item{dataset}{your dataset. Make sure you've already imported sample metadata} + +\item{colname_condition_variable}{sample metadata column name that should be used for the experimental condition. Typically, this is the "group" column. Should be any of the values in \code{user_provided_metadata(dataset$samples)}} + +\item{values_condition1}{array of values from column \code{colname_condition_variable} that are the first group in the contrast. Note that} + +\item{values_condition2}{analogous to \code{values_condition1}, but for the second group. Note that you can set this to NA to indicate "everything except values in values_condition1"} + +\item{colname_additional_variables}{optionally, sample metadata column names that should be used as additional regression variables (only the subset of \code{user_provided_metadata(dataset$samples)}, NOT including the value provided as parameter \code{colname_condition_variable})} +} +\description{ +Note that a MS-DAP contrast for "A vs B" will return foldchanges for B/A in downstream output +tables and data visualizations. For example, for the contrast "control vs disease" a positive +log2 foldchange implies protein abundances are higher in the "disease" sample group. + +Throughout this function, all samples where the column "exclude" is set to TRUE are disregarded. +} +\examples{ +\dontrun{ + # first, remove all existing contrasts (and their respecive DEA results) + dataset = remove_contrasts(dataset) + + # Assume that column "group" in your sample metadata table specifies sample groups + # The following example will create the contrast "WT vs KO" + dataset = add_contrast(dataset, "group", "WT", "KO") + + # If the sample metadata table contains a column "batch" that should be used as + # a regression variable (works for ebayes/deqms/msqrob), we can add it as follows: + dataset = add_contrast(dataset, "group", "WT", "KO", colname_additional_variables = "batch") + + # Elaborate example; create a contrast while matching multiple values per group. + # Testing all motor- and visual-cortex samples (described in the "brain_region" + # column) against prefrontal cortex samples, with additional regression variables + # batch and age. + # In this example, the sample metadata table must contain columns + # "brain_region", "batch", "age" + dataset = add_contrast( + dataset, + # this parameter describes 1 column name in `dataset$samples` + colname_condition_variable = "brain_region", + # values in "brain_region" that are the first group in A/B testing + values_condition1 = c("motor_cortex", "visual_cortex"), + # analogous, but for the second group in A/B testing. + # note that you may alternatively this to `NA` to indicate + # "everything except values in values_condition1" + values_condition2 = c("prefrontal_cortex"), + # a vector/array of 0 or more column names in dataset$samples + # that should be used as additional regression variables + colname_additional_variables = c("batch", "age") + ) + + # Assume that the sample metadata table contains a column "group" with values + # "A", "B", "C", "D". 
The following code will create the contrast "A vs B,C,D" + # by setting the second set of values to `NA` + dataset = add_contrast( + dataset, + colname_condition_variable = "group", + values_condition1 = "A", + values_condition2 = NA + ) + + ## The "group" column in the sample table is used for group definitions in + ## "all group" filtering. We here create two contrasts based on different + ## regression variables in the sample tabel. + dataset = remove_contrasts(dataset) # optionally, remove previously defined contrasts + dataset = add_contrast( + dataset, + colname_condition_variable = "genotype", + values_condition1 = "control", + values_condition2 = "knockout", + colname_additional_variables = "batch" + ) + dataset = add_contrast( + dataset, + colname_condition_variable = "genotype", + values_condition1 = "control", + values_condition2 = c("knockout", "mutant1"), + # note that we have the flexibility to add different regression variables per contrast + colname_additional_variables = "batch" + ) + # print an overview of all contrasts + print_contrasts(dataset) + # apply typical MS-DAP pipeline + dataset = analysis_quickstart( + dataset, + filter_min_detect = 0, ## if DDA, we might not require minimum MS/MS counts + filter_min_quant = 3, + filter_fraction_detect = 0, + filter_fraction_quant = 0.75, + filter_by_contrast = FALSE, ## we only want filtering across all groups + filter_min_peptide_per_prot = 2, ## 2 peptides per protein + dea_algorithm = "deqms", + norm_algorithm = c("vwmb", "modebetween_protein"), + output_qc_report = TRUE, + output_abundance_tables = TRUE, + output_dir = "C:/temp", ## you may set this to NA to skip the QC report + output_within_timestamped_subdirectory = TRUE + ) +} +} +\seealso{ +\code{\link[=print_contrasts]{print_contrasts()}} to print an overview of defined contrasts, \code{\link[=remove_contrasts]{remove_contrasts()}} to remove all current contrasts (and respective filtering and DEA results) +} diff --git a/man/dataset_contrasts.Rd b/man/dataset_contrasts.Rd deleted file mode 100644 index 823e0f5..0000000 --- a/man/dataset_contrasts.Rd +++ /dev/null @@ -1,14 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/dataset.R -\name{dataset_contrasts} -\alias{dataset_contrasts} -\title{List the name of all contrasts in the samples table} -\usage{ -dataset_contrasts(dataset) -} -\arguments{ -\item{dataset}{a valid dataset} -} -\description{ -List the name of all contrasts in the samples table -} diff --git a/man/de_deqms.Rd b/man/de_deqms.Rd index 32dcab4..4edba31 100644 --- a/man/de_deqms.Rd +++ b/man/de_deqms.Rd @@ -2,21 +2,16 @@ % Please edit documentation in R/stats_differential_abundance.R \name{de_deqms} \alias{de_deqms} -\title{apply DEqMS to a protein-level ExpressionSet} +\title{Apply DEqMS to a protein-level ExpressionSet} \usage{ -de_deqms( - eset_proteins, - input_intensities_are_log2 = TRUE, - random_variables = NULL, - doplot = FALSE -) +de_deqms(eset, model_matrix, model_matrix_result_prop, doplot = FALSE) } \arguments{ -\item{eset_proteins}{protein-level dataset stored as a Biobase ExpressionSet. Note that it must contain protein metadata column 'npep' that holds integer peptide counts and sample metadata column 'condition'} +\item{eset}{protein-level log2 intensity matrix stored as a Biobase ExpressionSet. 
Note that it must contain protein metadata column 'npep' that holds integer peptide counts} -\item{input_intensities_are_log2}{boolean indicating whether the ExpressionSet's intensity values are already log2 scaled (default: TRUE)} +\item{model_matrix}{a \code{stats::model.matrix()} result that is supplied to \code{limma::lmFit()}} -\item{random_variables}{a vector of column names in your sample metadata table that are added as additional(!) regression terms in each statistical contrast tested downstream} +\item{model_matrix_result_prop}{the column name in \code{model_matrix} that should be returned as the resulting coefficient estimated by \code{eBayes()}. In MS-DAP workflows this is typically "condition"} \item{doplot}{create a QC plot?} } diff --git a/man/de_ebayes.Rd b/man/de_ebayes.Rd index 5b58251..b7438d4 100644 --- a/man/de_ebayes.Rd +++ b/man/de_ebayes.Rd @@ -2,22 +2,31 @@ % Please edit documentation in R/stats_differential_abundance.R \name{de_ebayes} \alias{de_ebayes} -\title{apply limma::eBayes() function to a protein-level ExpressionSet} +\title{Apply limma::eBayes() function to a protein-level ExpressionSet} \usage{ -de_ebayes( - eset_proteins, - input_intensities_are_log2 = TRUE, - random_variables = NULL -) +de_ebayes(eset, model_matrix, model_matrix_result_prop) } \arguments{ -\item{eset_proteins}{protein-level dataset stored as a Biobase ExpressionSet} +\item{eset}{protein-level log2 intensity matrix stored as a Biobase ExpressionSet} -\item{input_intensities_are_log2}{boolean indicating whether the ExpressionSet's intensity values are already log2 scaled (default: TRUE)} +\item{model_matrix}{a \code{stats::model.matrix()} result that is supplied to \code{limma::lmFit()}} -\item{random_variables}{a vector of column names in your sample metadata table that are added as additional(!) regression terms in each statistical contrast tested downstream} +\item{model_matrix_result_prop}{the column name in \code{model_matrix} that should be returned as the resulting coefficient estimated by \code{eBayes()}. In MS-DAP workflows this is typically "condition"} } \description{ ref; PMID:25605792 ref; https://bioconductor.org/packages/release/bioc/html/limma.html } +\examples{ +\dontrun{ + # Minimal example: + # Let `m` be a log2 protein intensity matrix matrix where + # columns are samples and rows proteins. + # Let `eset` be a Biobase::ExpressionSet that contains `m`. + # Let `m_groups` be a character vector that describes the + # sample group/condition for each column in `m`. + # e.g. m_groups = c("WT", "WT", "WT", "KO", "KO", "KO") + model_matrix = stats::model.matrix(~m_groups) + de_ebayes(eset, model_matrix, colnames(model_matrix)[2]) +} +} diff --git a/man/de_ebayes_fit.Rd b/man/de_ebayes_fit.Rd deleted file mode 100644 index d9cc511..0000000 --- a/man/de_ebayes_fit.Rd +++ /dev/null @@ -1,16 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/stats_differential_abundance.R -\name{de_ebayes_fit} -\alias{de_ebayes_fit} -\title{Wrapper function for limma::ebayes()} -\usage{ -de_ebayes_fit(x, random_variables) -} -\arguments{ -\item{x}{log2 transformed protein intensity matrix} - -\item{random_variables}{data.frame where each column describes a random variable (and each row matches a column in x) or a vector assumed to describe 1 random variable, sample group/condition for each column in x. 
Importantly, statistical results are returned only for the first random variable (so in most use-cases the sample group/condition should be described in the first column)} -} -\description{ -rownames of the data matrix (log2 intensities) must be the protein_id for downstream compatibility -} diff --git a/man/de_msempire.Rd b/man/de_msempire.Rd index 6d889fa..5d6af9b 100644 --- a/man/de_msempire.Rd +++ b/man/de_msempire.Rd @@ -4,20 +4,20 @@ \alias{de_msempire} \title{MS-EmpiRe implementation, a wrapper function for msEmpiRe::de.ana()} \usage{ -de_msempire(eset, input_intensities_are_log2 = F) +de_msempire(eset, model_matrix, model_matrix_result_prop) } \arguments{ -\item{eset}{a Biobase ExpressionSet that contains the peptide intensities. required attributes; fData(): protein_id and pData(): condition} +\item{eset}{a Biobase ExpressionSet that contains the peptide log2 intensities} -\item{input_intensities_are_log2}{whether the provided intensities are already log2 transformed} +\item{model_matrix}{a \code{stats::model.matrix()} result that is supplied to \code{limma::lmFit()}} + +\item{model_matrix_result_prop}{the column name in \code{model_matrix} that should be returned as the resulting coefficient estimated by \code{eBayes()}. In MS-DAP workflows this is typically "condition"} } \description{ ref; PMID:31235637 ref; https://github.com/zimmerlab/MS-EmpiRe } \details{ -input data should NOT be log2 transformed (if it is, make sure to set parameter \code{input_intensities_are_log2} so we can revert this prior to passing data to MS-EmpiRe) - note; MS-EmpiRe crashes when all proteins have the same number of peptides (eg; filtering topn=1 or topn=3 and minpep=3) TODO: proper standarderror computation diff --git a/man/de_msqrobsum_msqrob.Rd b/man/de_msqrobsum_msqrob.Rd index ecf5896..d7d9031 100644 --- a/man/de_msqrobsum_msqrob.Rd +++ b/man/de_msqrobsum_msqrob.Rd @@ -6,28 +6,22 @@ \usage{ de_msqrobsum_msqrob( eset, - eset_proteins = NULL, - use_peptide_model = T, - input_intensities_are_log2 = T, - protein_rollup_robust = T, - random_variables = NULL, - log2fc_without_shrinkage = FALSE + model_matrix, + model_matrix_result_prop, + use_peptide_model, + random_variables = NULL ) } \arguments{ -\item{eset}{a Biobase ExpressionSet that contains the peptide intensities. required attributes; fData(): protein_id and pData(): condition} +\item{eset}{a Biobase ExpressionSet that contains the peptide log2 intensities} -\item{eset_proteins}{only required if \code{log2fc_without_shrinkage == TRUE}} +\item{model_matrix}{a \code{stats::model.matrix()} result that is supplied to \code{limma::lmFit()}} -\item{use_peptide_model}{if true, apply msqrob. if false, apply msqrobsum} +\item{model_matrix_result_prop}{the column name in \code{model_matrix} that should be returned as the resulting coefficient estimated by \code{eBayes()}. In MS-DAP workflows this is typically "condition"} -\item{input_intensities_are_log2}{whether the provided intensities are already log2 transformed} +\item{use_peptide_model}{if \code{TRUE}, apply msqrob. if \code{FALSE}, apply msqrobsum} -\item{protein_rollup_robust}{for msqrobsum analyses, whether to use 'robust' protein rollup (TRUE) or traditional peptide intensity summation (FALSE)} - -\item{random_variables}{a vector of column names in your sample metadata table that are added as additional(!) regression terms} - -\item{log2fc_without_shrinkage}{boolean parameter, if FALSE (default) returns msqrob estimated log2 foldchanges. 
if TRUE, uses \code{protein_foldchange_from_ebayes()} instead (affecting log2fc and effectsize results)} +\item{random_variables}{optionally, an array of column names in the sample metadata table that should be used as additional regression variables} } \description{ note that this is the final version of this algorithm provided by its authors, and is no longer maintained; https://github.com/statOmics/msqrob diff --git a/man/enforce_sample_value_types.Rd b/man/enforce_sample_value_types.Rd new file mode 100644 index 0000000..3a9110f --- /dev/null +++ b/man/enforce_sample_value_types.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/sample_metadata.R +\name{enforce_sample_value_types} +\alias{enforce_sample_value_types} +\title{convert the data type of column in a sample metadata table to factors or numeric type} +\usage{ +enforce_sample_value_types( + samples, + redundant_columns = "error", + show_log = TRUE +) +} +\arguments{ +\item{samples}{dataset$samples or a subset thereof. Typically, only columns sample_id and user-defined regression variables are included as columns} + +\item{redundant_columns}{how to deal with columns in \code{samples} that are redundant with other columns. Options: "error" (default), "warning", "ignore"} + +\item{show_log}{boolean, show data type per column in console? Default: \code{TRUE}} +} +\description{ +importantly, columns "sample_index", "sample_id" and "shortname" are ignored alltogether +} diff --git a/man/error_legacy_contrast_definitions.Rd b/man/error_legacy_contrast_definitions.Rd new file mode 100644 index 0000000..0a95796 --- /dev/null +++ b/man/error_legacy_contrast_definitions.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/sample_metadata.R +\name{error_legacy_contrast_definitions} +\alias{error_legacy_contrast_definitions} +\title{throw an error if has_legacy_contrast_definitions() is TRUE} +\usage{ +error_legacy_contrast_definitions(dataset) +} +\arguments{ +\item{dataset}{your dataset} +} +\description{ +throw an error if has_legacy_contrast_definitions() is TRUE +} diff --git a/man/filter_dataset.Rd b/man/filter_dataset.Rd index efc9496..5313d36 100644 --- a/man/filter_dataset.Rd +++ b/man/filter_dataset.Rd @@ -14,9 +14,9 @@ filter_dataset( filter_topn_peptides = 0, norm_algorithm = "", rollup_algorithm = "maxlfq", - by_group = T, - all_group = T, - by_contrast = F + by_group = FALSE, + all_group = FALSE, + by_contrast = FALSE ) } \arguments{ diff --git a/man/get_peptide_filternorm_variants.Rd b/man/get_peptide_filternorm_variants.Rd new file mode 100644 index 0000000..ed17daf --- /dev/null +++ b/man/get_peptide_filternorm_variants.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dea.R +\name{get_peptide_filternorm_variants} +\alias{get_peptide_filternorm_variants} +\title{Return column names in peptide table that hold intensity values} +\usage{ +get_peptide_filternorm_variants(dataset) +} +\arguments{ +\item{dataset}{your dataset} +} +\description{ +Return column names in peptide table that hold intensity values +} diff --git a/man/get_protein_matrix.Rd b/man/get_protein_matrix.Rd new file mode 100644 index 0000000..ee9f9ea --- /dev/null +++ b/man/get_protein_matrix.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dataset.R +\name{get_protein_matrix} +\alias{get_protein_matrix} +\title{rollup peptides to protein data matrix for selected 
intensity column} +\usage{ +get_protein_matrix(dataset, intensity_column, include_npep = TRUE) +} +\arguments{ +\item{dataset}{your dataset} + +\item{intensity_column}{column in \code{dataset$peptides} that should be used for the protein matrix} + +\item{include_npep}{if \code{TRUE} (default), returns a list with both the matrix and an array of peptide counts (for each row in the protein matrix). If \code{FALSE}, returns just the protein matrix} +} +\description{ +rollup peptides to protein data matrix for selected intensity column +} diff --git a/man/get_samples_for_regression.Rd b/man/get_samples_for_regression.Rd new file mode 100644 index 0000000..3c589ca --- /dev/null +++ b/man/get_samples_for_regression.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dataset.R +\name{get_samples_for_regression} +\alias{get_samples_for_regression} +\title{collect the respective subset of samples for selected protein data and prepare a metadata table for regression} +\usage{ +get_samples_for_regression(dataset, protein_data) +} +\arguments{ +\item{dataset}{your dataset} + +\item{protein_data}{output from \code{get_protein_matrix()}} +} +\description{ +From sample metadata (dataset$samples), get the subset that is included in the +parameter \code{protein_data}, select columns that contain potential regression variables +(using \code{user_provided_metadata()}) and finally apply \code{enforce_sample_value_types()} to +reformat all variables (except "sample_id") to factor or numeric. +} diff --git a/man/has_legacy_contrast_definitions.Rd b/man/has_legacy_contrast_definitions.Rd new file mode 100644 index 0000000..36be0e1 --- /dev/null +++ b/man/has_legacy_contrast_definitions.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/sample_metadata.R +\name{has_legacy_contrast_definitions} +\alias{has_legacy_contrast_definitions} +\title{test if the dataset has contrast definitions that are incompatible with MS-DAP version 1.2} +\usage{ +has_legacy_contrast_definitions(dataset) +} +\arguments{ +\item{dataset}{your dataset} +} +\description{ +test if the dataset has contrast definitions that are incompatible with MS-DAP version 1.2 +} diff --git a/man/limma_fit_extract_stats.Rd b/man/limma_fit_extract_stats.Rd new file mode 100644 index 0000000..77df336 --- /dev/null +++ b/man/limma_fit_extract_stats.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/stats_differential_abundance.R +\name{limma_fit_extract_stats} +\alias{limma_fit_extract_stats} +\title{extract limma fit results} +\usage{ +limma_fit_extract_stats(fit, contrast_name = NULL) +} +\arguments{ +\item{fit}{result from \code{limma::lmFit()}} + +\item{contrast_name}{name of a specific contrast(s) to extract. 
Default value, \code{NULL}, will return results from all contrasts} +} +\description{ +extract limma fit results +} diff --git a/man/limma_wrapper.Rd b/man/limma_wrapper.Rd new file mode 100644 index 0000000..01f4a93 --- /dev/null +++ b/man/limma_wrapper.Rd @@ -0,0 +1,40 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/stats_differential_abundance.R +\name{limma_wrapper} +\alias{limma_wrapper} +\title{Perform linear regression with limma} +\usage{ +limma_wrapper( + x = protein_data, + npep = NULL, + model_matrix, + contrasts, + limma_block_variable = NULL, + ebayes_trend = FALSE, + ebayes_robust = FALSE, + deqms = TRUE, + return_table = TRUE +) +} +\arguments{ +\item{x}{result from \code{get_protein_matrix(... , include_npep = TRUE)}. Alternatively, a protein log2 intensity matrix (in this case, also provide the \code{npep} parameter if you want to apply DEqMS)} + +\item{npep}{optionally, the number of peptides-per-protein for each row in the input protein matrix. Only needed when providing a matrix for parameter \code{x} (i.e. you can leave this NULL when \code{x} is output from \code{get_protein_matrix(... , include_npep = TRUE)})} + +\item{model_matrix}{result from \code{stats::model.matrix()}} + +\item{contrasts}{contrasts defined with \code{limma::makeContrasts()} that should be extracted from the linear regression model} + +\item{limma_block_variable}{option passed to \code{limma::lmFit()} to fit block design. To skip (the default), set NULL. Otherwise, this should be a character vector of the same length as nrow(model_matrix) so that it describes a block/group for each sample in the model/design matrix} + +\item{ebayes_trend}{option passed to \code{limma::eBayes()}. Defaults to \code{FALSE} (same as limma default)} + +\item{ebayes_robust}{option passed to \code{limma::eBayes()}. Defaults to \code{FALSE} (same as limma default)} + +\item{deqms}{apply DEqMS? Defaults to \code{TRUE}. If \code{FALSE}, returns \code{limma::eBayes()} result as-is} + +\item{return_table}{if \code{TRUE} (the default), returns a table with protein_id, log2fc, pvalue, adjusted pvalue, contrast (i.e. this is the output from MS-DAP function \code{limma_fit_extract_stats()}). If \code{FALSE}, returns the limma fit object as-is} +} +\description{ +Please refer to the vignette on custom limma models at the MS-DAP GitHub repository for a walk-through with examples. +} diff --git a/man/plot_volcano.Rd b/man/plot_volcano.Rd index 1f6cbd3..7b50347 100644 --- a/man/plot_volcano.Rd +++ b/man/plot_volcano.Rd @@ -11,7 +11,8 @@ plot_volcano( mtitle = "", label_mode = "topn_pvalue", label_target = 25, - label_avoid_overlap = TRUE + label_avoid_overlap = TRUE, + show_plots = FALSE ) } \arguments{ @@ -28,12 +29,14 @@ plot_volcano( \item{label_target}{further specification of the label_mode parameter. For instance, if 'topn_pvalue' is set, here you can set the number of proteins that should be labeled. Analogously, if label_mode='protein_id' is set you can here provide an array of protein_id values (that are available in the stats_de data table)} \item{label_avoid_overlap}{use the ggrepel R package to try and place labels with minimal overlap (only works when the number of labeled proteins is relatively low and sparse, e.g. for topN 25). Options: TRUE, FALSE} + +\item{show_plots}{boolean parameter; should each plot be printed/shown immediately?
If \code{FALSE} (default) you'll have to extract the ggplot objects from the resulting list in order to print the plots} } \value{ returns a named list that contains a list, with properties 'ggplot' and 'ggplot_data', for each unique 'dea_algorithm' in the input stats_de table } \description{ -If there are multiple contrasts in your DEA results, you should first subset the DEA result table for 1 contrast before calling this plot function (i.e. don't plot multiple contrasts into 1 volcano plot), see Example 5 below. +If there are multiple contrasts in your DEA results, either use the wrapper function plot_volcano_allcontrast() or first subset the DEA result table to a single contrast before calling this plot function (i.e. don't plot multiple contrasts into 1 volcano plot). } \examples{ ### Examples. Note that these assume that prior, the MS-DAP pipeline was successfully run @@ -46,10 +49,8 @@ If there are multiple contrasts in your DEA results, you should first subset the plot_list = msdap::plot_volcano(dataset$de_proteins \%>\% left_join(dataset$proteins), log2foldchange_threshold = 1, qvalue_threshold = 0.01, mtitle = "volcano, label top 10", label_mode = "topn_pvalue", label_target = 10, - label_avoid_overlap = TRUE + label_avoid_overlap = TRUE, show_plots = TRUE ) - # for each unique 'dea_algorithm' in the stats_de table, list of ggplot objects and data - lapply(plot_list, "[[", "ggplot") } ## example 2: analogous, but now show all significant proteins and disable "repelled labels" @@ -58,9 +59,8 @@ If there are multiple contrasts in your DEA results, you should first subset the plot_list = msdap::plot_volcano(dataset$de_proteins \%>\% left_join(dataset$proteins), log2foldchange_threshold = 1, qvalue_threshold = 0.01, mtitle = "volcano, label all significant", label_mode = "signif", - label_avoid_overlap = FALSE + label_avoid_overlap = FALSE, show_plots = TRUE ) - lapply(plot_list, "[[", "ggplot") } ## example 3: show labels for some set of protein IDs. First line selects all proteins where symbol @@ -73,9 +73,8 @@ If there are multiple contrasts in your DEA results, you should first subset the plot_list = msdap::plot_volcano(dataset$de_proteins \%>\% left_join(dataset$proteins), log2foldchange_threshold = 1, qvalue_threshold = 0.01, mtitle = "volcano, label selected proteins", label_mode = "protein_id", - label_target = pid_label, label_avoid_overlap = FALSE + label_target = pid_label, label_avoid_overlap = FALSE, show_plots = TRUE ) - lapply(plot_list, "[[", "ggplot") } ## example 4: plot all significant labels as before, then add custom labels for some subset of @@ -86,7 +85,7 @@ If there are multiple contrasts in your DEA results, you should first subset the plot_list = msdap::plot_volcano(dataset$de_proteins \%>\% left_join(dataset$proteins), log2foldchange_threshold = 1, qvalue_threshold = 0.01, mtitle = "volcano, label all significant + custom labels", label_mode = "signif", - label_avoid_overlap = FALSE + label_avoid_overlap = FALSE, show_plots = FALSE ) l = plot_list[[1]] l$ggplot + ggrepel::geom_text_repel( @@ -97,7 +96,8 @@ If there are multiple contrasts in your DEA results, you should first subset the ) } -# example 5: iterate over contrasts before calling plot_volcano() +# example 5: iterate over contrasts before calling plot_volcano().
+# this is essentially the same as using helper function plot_volcano_allcontrast() \dontrun{ contrasts = unique(dataset$de_proteins$contrast) for(contr in contrasts) { @@ -107,14 +107,9 @@ If there are multiple contrasts in your DEA results, you should first subset the # volcano plot function (compared to above example, now include the contrast in the title) plot_list = msdap::plot_volcano(tib_volcano, log2foldchange_threshold = 1, qvalue_threshold = 0.01, mtitle = paste(contr, "volcano, label top 10"), - label_mode = "topn_pvalue", label_target = 10, label_avoid_overlap = TRUE + label_mode = "topn_pvalue", label_target = 10, label_avoid_overlap = TRUE, + show_plots = TRUE ) - - # grab ggplot objects and print plot - ggplot_list = lapply(plot_list, "[[", "ggplot") - for(p in ggplot_list) { - print(p) - } } } diff --git a/man/plot_volcano_allcontrast.Rd b/man/plot_volcano_allcontrast.Rd new file mode 100644 index 0000000..33129f3 --- /dev/null +++ b/man/plot_volcano_allcontrast.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plot_volcano.R +\name{plot_volcano_allcontrast} +\alias{plot_volcano_allcontrast} +\title{a simple wrapper around plot_volcano() that iterates all contrasts} +\usage{ +plot_volcano_allcontrast(stats_de, mtitle, ...) +} +\arguments{ +\item{stats_de}{input data.frame, see \code{plot_volcano()}} + +\item{mtitle}{plot title, see \code{plot_volcano()}} + +\item{...}{parameters passed to \code{plot_volcano()}, check that function's documentation and examples} +} +\description{ +returns the plot_volcano() output within a list (1 element per contrast @ stats_de$contrast column) +} +\examples{ +\dontrun{ + # refer to the examples for msdap::plot_volcano() + # the same syntax can be applied here +} +} diff --git a/man/print_available_filtering_results.Rd b/man/print_available_filtering_results.Rd new file mode 100644 index 0000000..c67bd05 --- /dev/null +++ b/man/print_available_filtering_results.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dea.R +\name{print_available_filtering_results} +\alias{print_available_filtering_results} +\title{Return column names in peptide table that hold intensity values} +\usage{ +print_available_filtering_results(dataset) +} +\arguments{ +\item{dataset}{your dataset} +} +\description{ +Return column names in peptide table that hold intensity values +} diff --git a/man/print_contrasts.Rd b/man/print_contrasts.Rd new file mode 100644 index 0000000..71a11c2 --- /dev/null +++ b/man/print_contrasts.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/sample_metadata.R +\name{print_contrasts} +\alias{print_contrasts} +\title{Print an overview of all defined contrasts and their respective sample metadata tables to the console} +\usage{ +print_contrasts(dataset) +} +\arguments{ +\item{dataset}{your dataset} +} +\description{ +Print an overview of all defined contrasts and their respective sample metadata tables to the console +} +\seealso{ +\code{\link[=add_contrast]{add_contrast()}}, \code{\link[=remove_contrasts]{remove_contrasts()}} +} diff --git a/man/protein_foldchange_from_ebayes.Rd b/man/protein_foldchange_from_ebayes.Rd deleted file mode 100644 index 7923c97..0000000 --- a/man/protein_foldchange_from_ebayes.Rd +++ /dev/null @@ -1,25 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/stats_differential_abundance.R -\name{protein_foldchange_from_ebayes} 
-\alias{protein_foldchange_from_ebayes} -\title{Compute protein log2 foldchanges from protein-level eBayes (i.e. simple, no shrinkage)} -\usage{ -protein_foldchange_from_ebayes( - eset, - input_intensities_are_log2 = TRUE, - random_variables = NULL -) -} -\arguments{ -\item{eset}{a Biobase ExpressionSet that contains the protein intensities. required attributes; fData(): protein_id and pData(): condition} - -\item{input_intensities_are_log2}{whether the provided intensities are already log2 transformed} - -\item{random_variables}{a vector of column names in your sample metadata table that are added as additional(!) regression terms} -} -\value{ -data.frame with columns protein_id and foldchange.log2 -} -\description{ -implementation analogous to \code{de_ebayes()} -} diff --git a/man/remove_contrasts.Rd b/man/remove_contrasts.Rd new file mode 100644 index 0000000..87d0bef --- /dev/null +++ b/man/remove_contrasts.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/sample_metadata.R +\name{remove_contrasts} +\alias{remove_contrasts} +\title{Remove all contrast definitions and respective filtering and DEA results} +\usage{ +remove_contrasts(dataset) +} +\arguments{ +\item{dataset}{your dataset} +} +\description{ +also removes contrast-related information from the dataset that was used in older MS-DAP versions (prior to version 1.2) +} +\seealso{ +\code{\link[=print_contrasts]{print_contrasts()}}, \code{\link[=add_contrast]{add_contrast()}} +} diff --git a/man/setup_contrasts.Rd b/man/setup_contrasts.Rd index ee6a7e3..9723478 100644 --- a/man/setup_contrasts.Rd +++ b/man/setup_contrasts.Rd @@ -14,6 +14,9 @@ setup_contrasts(dataset, contrast_list, random_variables = NULL) \item{random_variables}{a vector of column names in your sample metadata table that are added as additional(!) regression terms in each statistical contrast tested downstream. Note that not all DEA algorithms may support this, consult documentation on individual methods for more info (start at \code{dea_algorithms()} )} } \description{ +For more fine-grained control over specified contrasts, use the functions \code{remove_contrasts()} and \code{add_contrast()}. +} +\details{ Note that a MS-DAP contrast for "A vs B" will return foldchanges for B/A in downstream output tables and data visualizations. For example, for the contrast "control vs disease" a positive log2 foldchange implies protein abundances are higher in the "disease" sample group. } \examples{ diff --git a/man/subset_protein_eset_for_dea.Rd b/man/subset_protein_eset_for_dea.Rd new file mode 100644 index 0000000..ddacd79 --- /dev/null +++ b/man/subset_protein_eset_for_dea.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/stats_differential_abundance.R +\name{subset_protein_eset_for_dea} +\alias{subset_protein_eset_for_dea} +\title{helper function to double-check we don't submit an ExpressionSet with too few values to DEA} +\usage{ +subset_protein_eset_for_dea( + eset, + model_matrix_result_prop, + is_peptide, + method_name +) +} +\arguments{ +\item{eset}{a Biobase ExpressionSet} + +\item{model_matrix_result_prop}{the column name in \code{model_matrix} that should be returned as the resulting coefficient estimated by \code{eBayes()}.
In MS-DAP workflows this is typically "condition"} + +\item{is_peptide}{boolean indicating peptide-level (FALSE = protein-level)} + +\item{method_name}{label for printing error messages} +} +\description{ +helper function to double-check we don't submit an ExpressionSet with too few values to DEA +} diff --git a/man/validate_eset_fitdata.Rd b/man/validate_eset_fitdata.Rd new file mode 100644 index 0000000..f817786 --- /dev/null +++ b/man/validate_eset_fitdata.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/stats_differential_abundance.R +\name{validate_eset_fitdata} +\alias{validate_eset_fitdata} +\title{Input validation for eBayes/DEqMS/MS-EmpiRe functions} +\usage{ +validate_eset_fitdata(eset, model_matrix, model_matrix_result_prop) +} +\arguments{ +\item{eset}{protein/peptide log2 intensity matrix stored as a Biobase ExpressionSet. Must describe protein_id and sample_id in metadata} + +\item{model_matrix}{a \code{stats::model.matrix()} result that is supplied to \code{limma::lmFit()}} + +\item{model_matrix_result_prop}{the column name in \code{model_matrix} that should be returned as the resulting coefficient estimated by \code{eBayes()}. In MS-DAP workflows this is typically "condition"} +} +\description{ +Input validation for eBayes/DEqMS/MS-EmpiRe functions +}
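Taken together, the new exports documented above (get_protein_matrix(), get_samples_for_regression(), limma_wrapper() and limma_fit_extract_stats()) enable custom limma models outside the standard pipeline. The following is a minimal sketch of how they could be combined, based on the parameter documentation above; the intensity column name "intensity_by_group" and the sample metadata column "group" (with levels "control" and "disease") are assumptions for illustration only, so adapt them to your dataset, e.g. list valid intensity columns with print_available_filtering_results(dataset).

library(msdap)
library(limma)

# 1) roll up peptides to a protein log2 intensity matrix;
#    with include_npep = TRUE this returns list(matrix = ..., npep = ...).
#    NB: "intensity_by_group" is a hypothetical column name, check print_available_filtering_results(dataset)
protein_data = get_protein_matrix(dataset, intensity_column = "intensity_by_group", include_npep = TRUE)

# 2) matching sample metadata, with regression variables coerced to factor/numeric (except sample_id)
samples = get_samples_for_regression(dataset, protein_data)

# 3) design matrix and contrast; assumes a factor column "group" with levels "control" and "disease"
design = stats::model.matrix(~ 0 + group, data = samples)
colnames(design) = levels(samples$group)
contr = limma::makeContrasts(contrasts = "disease - control", levels = design)

# 4) limma fit; DEqMS is applied using the peptide counts in protein_data$npep and results are
#    returned as a table with protein_id, log2fc, pvalue, adjusted pvalue and contrast columns
dea_results = limma_wrapper(x = protein_data, model_matrix = design, contrasts = contr,
                            deqms = TRUE, return_table = TRUE)
head(dea_results)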
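Analogously, a minimal sketch of the new plot_volcano_allcontrast() wrapper, assuming the MS-DAP pipeline already produced DEA results in dataset$de_proteins (mirroring the plot_volcano() examples above); all additional arguments are forwarded to plot_volcano() and the return value holds one plot_volcano() result per contrast.

library(msdap)
library(dplyr)

# one list element per unique value in dataset$de_proteins$contrast;
# extra arguments (label_mode, label_target, show_plots, ...) are passed through to plot_volcano()
plots_per_contrast = plot_volcano_allcontrast(
  dataset$de_proteins %>% left_join(dataset$proteins),
  mtitle = "volcano, label top 10",
  label_mode = "topn_pvalue", label_target = 10,
  label_avoid_overlap = TRUE, show_plots = TRUE
)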