Skip to content

Commit

Permalink
release 1.0.8
Browse files Browse the repository at this point in the history
  • Loading branch information
ftwkoopmans committed May 24, 2024
1 parent f1b12fa commit d1018ce
Show file tree
Hide file tree
Showing 15 changed files with 285 additions and 84 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ Package: msdap
Title: Mass Spectrometry Downstream Analysis Pipeline
Description: Analyze label-free proteomics datasets from various sources (MaxQuant, Spectronaut, etc) using a pipeline that facilitates peptide filtering and many algorithms for normalization and statistical analysis. A comprehensive PDF report can be generated that includes many data visualizations and documentation thereof.
URL: https://github.com/ftwkoopmans/msdap
Version: 1.0.7
Version: 1.0.8
Authors@R:
person(given = "Frank",
family = "Koopmans",
Expand Down
10 changes: 7 additions & 3 deletions R/dea.R
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,13 @@
#'
#' **msqrob**: implementation of the MSqRob package, with minor tweak for (situationally) faster computation (PMID:26566788) <https://github.com/statOmics/msqrob>. This is a peptide-level DEA algorithm. This method will take provided covariates into account (if any). Implemented in function; \code{de_msqrobsum_msqrob}
#'
#' **msqrob_fc**: msqrob estimated protein pvalues, but with log2fc values computed by eBayes. This avoids the excessive shrinkage of protein log2fc values by MSqRob, but does use the MSqRob robust peptide regression model for protein p-value estimates (which outperforms eBayes/DEqMS in ROC analyses on most benchmark datasets, so this method is intended as a "best of both worlds")
#'
#' **msqrobsum**: implementation of the MSqRob package (which also features MSqRobSum), with minor tweak for (situationally) faster computation (PMID:32321741) <https://github.com/statOmics/msqrob>. This is a hybrid peptide&protein-level DEA algorithm that takes peptide-level data as input; it first performs peptide-to-protein rollup, then applies statistics to this protein-level data matrix. This method will take provided covariates into account (if any). Implemented in function; \code{de_msqrobsum_msqrob}
#'
#' @export
dea_algorithms = function() {
return(c("ebayes", "deqms", "msempire", "msqrobsum", "msqrob"))
return(c("ebayes", "deqms", "msempire", "msqrobsum", "msqrob")) # , "msqrob_fc"
}


Expand Down Expand Up @@ -261,9 +263,11 @@ dea = function(dataset, qval_signif = 0.01, fc_signif = 0, dea_algorithm = "deqm
# ! compared to the other dea algorithms, MS-EmpiRe is not a regression model so we cannot add random variables
alg_result = de_msempire(eset_peptides, input_intensities_are_log2 = T)
} else if(alg == "msqrob") {
alg_result = de_msqrobsum_msqrob(eset_peptides, use_peptide_model = T, input_intensities_are_log2 = T, random_variables = ranvars)
alg_result = de_msqrobsum_msqrob(eset_peptides, eset_proteins = NULL, use_peptide_model = T, input_intensities_are_log2 = T, random_variables = ranvars, log2fc_without_shrinkage = FALSE)
# } else if(alg == "msqrob_fc") {
# alg_result = de_msqrobsum_msqrob(eset_peptides, eset_proteins = eset_proteins, use_peptide_model = T, input_intensities_are_log2 = T, random_variables = ranvars, log2fc_without_shrinkage = TRUE)
} else if(alg == "msqrobsum") {
alg_result = de_msqrobsum_msqrob(eset_peptides, use_peptide_model = F, input_intensities_are_log2 = T, random_variables = ranvars)
alg_result = de_msqrobsum_msqrob(eset_peptides, eset_proteins = NULL, use_peptide_model = F, input_intensities_are_log2 = T, random_variables = ranvars, log2fc_without_shrinkage = FALSE)
} else {
# for non-hardcoded functions, we call the function requested as a user parameter and pass all available data
alg_fun = match.fun(alg) # throws error if not found
Expand Down
33 changes: 24 additions & 9 deletions R/filter_peptides.R
Original file line number Diff line number Diff line change
Expand Up @@ -58,9 +58,13 @@ cache_filtering_data = function(dataset) {
p[, key_peptide_group_noexclude := kpg]
# detect not as a boolean, but scaled by number of detect per sample
# ` * !is.na(key_peptide_group_noexclude)` is simply to remove all data points for samples that are excluded, which allows us to downstream group by key_group and not worry about excluded samples
p[, `:=` (detect_scaled_noexclude = (detect/sum(detect)) * !is.na(key_peptide_group_noexclude),
quant_scaled_noexclude = 1/.N * !is.na(key_peptide_group_noexclude)),
p[, c("detect_scaled_noexclude", "quant_scaled_noexclude") := list(
(detect/sum(detect)) * !is.na(key_peptide_group_noexclude),
1/.N * !is.na(key_peptide_group_noexclude) ),
by = key_sample]
# p[, `:=` (detect_scaled_noexclude = (detect/sum(detect)) * !is.na(key_peptide_group_noexclude),
# quant_scaled_noexclude = 1/.N * !is.na(key_peptide_group_noexclude)),
# by = key_sample]


## efficiently collapse filtering data for each peptide * 'sample group'
Expand All @@ -78,7 +82,10 @@ cache_filtering_data = function(dataset) {
data.table::setkey(dt_pep_group, key_group) # set key for data.table::merge()
dt_criteria = data.table::data.table(grps %>% select(key_group, size, size_noexclude) %>% mutate_all(as.integer), key="key_group")
dt_pep_group = data.table::merge.data.table(dt_pep_group, dt_criteria, all.x = T, sort = FALSE)
dt_pep_group[ , `:=`(ndetect_fraction = ndetect / size, ndetect_noexclude_fraction = ndetect_noexclude / size_noexclude)]
dt_pep_group[ , c("ndetect_fraction", "ndetect_noexclude_fraction") := list(
ndetect / size,
ndetect_noexclude / size_noexclude)]
# dt_pep_group[ , `:=`(ndetect_fraction = ndetect / size, ndetect_noexclude_fraction = ndetect_noexclude / size_noexclude)]


################# pre-cache CoV
Expand Down Expand Up @@ -279,19 +286,25 @@ filter_dataset = function(dataset,
data.table::setkey(dataset$dt_pep_group, key_group) # set key for data.table::merge()
dt_criteria = data.table::data.table(dataset$groups %>% select(key_group, ndetect_min, nquant_min, ndetect_noexclude_min, nquant_noexclude_min) %>% mutate_all(as.integer), key="key_group")
dt_filter_group = data.table::merge.data.table(dataset$dt_pep_group, dt_criteria, all.x = T, sort = FALSE)
dt_filter_group[ , `:=`(pass = ndetect >= ndetect_min & nquant >= nquant_min,
pass_noexclude = ndetect_noexclude >= ndetect_noexclude_min & nquant_noexclude >= nquant_noexclude_min)]
dt_filter_group[ , c("pass", "pass_noexclude") := list(
ndetect >= ndetect_min & nquant >= nquant_min,
ndetect_noexclude >= ndetect_noexclude_min & nquant_noexclude >= nquant_noexclude_min
)]
# dt_filter_group[ , `:=`(pass = ndetect >= ndetect_min & nquant >= nquant_min,
# pass_noexclude = ndetect_noexclude >= ndetect_noexclude_min & nquant_noexclude >= nquant_noexclude_min)]

## now we know for each peptide whether it passes the filter criteria in each group (with and without taking 'exclude' samples into account)

## wide format for re-use (eg; by-group or all-group)
data.table::setkey(dt_filter_group, pass_noexclude) # set key for filter step below; remove all peptide*group that don't pass anywhere to minimize wide-table size
dt_filter_group_wide_noexclude = data.table::dcast(dt_filter_group[pass_noexclude>0], key_peptide ~ key_group, value.var = "pass_noexclude", fill = FALSE)
dt_filter_group_wide_noexclude[ , `:=`(ngroup = matrixStats::rowSums2(as.matrix(.SD))), .SDcols=eval(quote(setdiff(colnames(dt_filter_group_wide_noexclude), "key_peptide")))]
dt_filter_group_wide_noexclude[ , ngroup := matrixStats::rowSums2(as.matrix(.SD)), .SDcols=eval(quote(setdiff(colnames(dt_filter_group_wide_noexclude), "key_peptide")))]
# dt_filter_group_wide_noexclude[ , `:=`(ngroup = matrixStats::rowSums2(as.matrix(.SD))), .SDcols=eval(quote(setdiff(colnames(dt_filter_group_wide_noexclude), "key_peptide")))]
# analogous
data.table::setkey(dt_filter_group, pass)
dt_filter_group_wide = data.table::dcast(dt_filter_group[pass>0], key_peptide ~ key_group, value.var = "pass", fill = FALSE)
dt_filter_group_wide[ , `:=`(ngroup = matrixStats::rowSums2(as.matrix(.SD))), .SDcols=eval(quote(setdiff(colnames(dt_filter_group_wide), "key_peptide")))]
dt_filter_group_wide[ , ngroup := matrixStats::rowSums2(as.matrix(.SD)), .SDcols=eval(quote(setdiff(colnames(dt_filter_group_wide), "key_peptide")))]
# dt_filter_group_wide[ , `:=`(ngroup = matrixStats::rowSums2(as.matrix(.SD))), .SDcols=eval(quote(setdiff(colnames(dt_filter_group_wide), "key_peptide")))]


if(by_group) {
Expand Down Expand Up @@ -351,15 +364,17 @@ filter_dataset = function(dataset,
grp1_mask = dataset$peptides$key_peptide %in% dt_filter_group_wide_noexclude$key_peptide[ dt_filter_group_wide_noexclude[[as.character(grp1_key)]] ]
} else {
# if multi-group, select all columns for this group and test that all are true
dt_filter_group_wide_noexclude[, `:=`(temp = matrixStats::rowSums2(as.matrix(.SD))), .SDcols=eval(quote(as.character(grp1_key)))]
dt_filter_group_wide_noexclude[, temp := matrixStats::rowSums2(as.matrix(.SD)), .SDcols=eval(quote(as.character(grp1_key)))]
# dt_filter_group_wide_noexclude[, `:=`(temp = matrixStats::rowSums2(as.matrix(.SD))), .SDcols=eval(quote(as.character(grp1_key)))]
grp1_mask = dataset$peptides$key_peptide %in% dt_filter_group_wide_noexclude$key_peptide[ dt_filter_group_wide_noexclude$temp == length(grp1_key) ]
}

if(length(grp2_key) == 1) {
grp2_mask = dataset$peptides$key_peptide %in% dt_filter_group_wide_noexclude$key_peptide[ dt_filter_group_wide_noexclude[[as.character(grp2_key)]] ]
} else {
# if multi-group, select all columns for this group and test that all are true
dt_filter_group_wide_noexclude[, `:=`(temp = matrixStats::rowSums2(as.matrix(.SD))), .SDcols=eval(quote(as.character(grp2_key)))]
dt_filter_group_wide_noexclude[, temp := matrixStats::rowSums2(as.matrix(.SD)), .SDcols=eval(quote(as.character(grp2_key)))]
# dt_filter_group_wide_noexclude[, `:=`(temp = matrixStats::rowSums2(as.matrix(.SD))), .SDcols=eval(quote(as.character(grp2_key)))]
grp2_mask = dataset$peptides$key_peptide %in% dt_filter_group_wide_noexclude$key_peptide[ dt_filter_group_wide_noexclude$temp == length(grp2_key) ]
}

Expand Down
97 changes: 69 additions & 28 deletions R/plot_peptide_data.R
Original file line number Diff line number Diff line change
@@ -1,17 +1,57 @@

#' plot peptide-level data
#'
#' @param dataset todo
#' @param select_all_proteins todo
#' @param select_diffdetect_candidates todo
#' @param select_dea_signif todo
#' @param filter_protein_ids todo
#' @param filter_genes todo
#' @param output_dir todo
#' @param show_unused_datapoints todo
#' @param norm_algorithm todo
#' @examples \dontrun{
#' # example 1:
#' # <assuming you imported a dataset and applied analysis_quickstart()>
#' # plot all significant proteins found by DEA
#' plot_peptide_data( dataset, select_dea_signif = TRUE,
#' # should match analysis_quickstart() parameters !
#' norm_algorithm = c("vwmb", "modebetween_protein"),
#' show_unused_datapoints = TRUE, output_dir = "C:/temp")
#'
#' # example 2:
#' # filter the DEA results for some specific proteins, then plot these
#' pid = dataset$de_proteins %>%
#' filter(dea_algorithm == "deqms" & qvalue < 10^-4) %>%
#' pull(protein_id)
#' plot_peptide_data(
#' dataset, filter_protein_ids = pid,
#' # should match analysis_quickstart() parameters !
#' norm_algorithm = c("vwmb", "modebetween_protein"),
#' show_unused_datapoints = TRUE, output_dir = "C:/temp")
#'
#' # example 3:
#' # plot specific proteins by providing gene symbols (not case sensitive)
#' plot_peptide_data(
#' dataset, filter_genes = c("shisa6", "gria2"),
#' # should match analysis_quickstart() parameters !
#' norm_algorithm = c("vwmb", "modebetween_protein"),
#' show_unused_datapoints = TRUE, output_dir = "C:/temp")
#' }
#' @param dataset your dataset
#' @param select_all_proteins plot all proteins. This will take a long time. Default: FALSE
#' @param select_diffdetect_candidates if "differential detection" was enabled (it is by default in `analysis_quickstart`), plot candidate proteins based on their zscores.
#' Valid parameters are positive numeric values that are to be used as absolute z-score cutoffs (4 or 5 are typically reasonable; use `plot_differential_detect` to visualize the score distributions).
#' This selects all proteins that meet this criterium in any contrast, and plots these proteins in every contrast (include those where it is has a z-score below given threshold).
#' Alternative, set to TRUE to use an absolute z-score cutoff value of 4. Default: NA (disable)
#' @param select_dea_signif plot all significant proteins from DEA?
#' This effectively checks the "signif" column in the `dataset$de_proteins` data table (DEA results generated by MS-DAP), finds all proteins that are signif in any contrast, and plots these proteins in every contrast (include those where it is not signif).
#' Boolean parameter, default: FALSE
#' @param filter_protein_ids a vector of protein_id values that should be plotted. For example, you can find specific protein_id values in the differential expression result table and provide only these as a parameter
#' (in this case, set `select_dea_signif=FALSE` to ensure only your specific protein identifiers are plotted)
#' @param filter_genes a vector of gene symbols to plot (alternative for filter_protein_ids)
#' @param output_dir full path to the output directory
#' @param show_unused_datapoints should data points be shown for peptides that were not used in the differential expression analyses? i.e. peptides that did not pass the filtering prior to DEA (found in N samples in both experimental conditions). Boolean value, default: FALSE
#' @param norm_algorithm normalization algorithms. This should be exactly the same as the parameter your provided to `analysis_quickstart`
#' @export
plot_peptide_data = function(dataset, select_all_proteins = FALSE, select_diffdetect_candidates = TRUE, select_dea_signif = TRUE, filter_protein_ids = NA, filter_genes = NA, output_dir, show_unused_datapoints = FALSE, norm_algorithm = NA) {
plot_peptide_data = function(dataset, select_all_proteins = FALSE, select_diffdetect_candidates = NA, select_dea_signif = FALSE, filter_protein_ids = NA, filter_genes = NA, output_dir, show_unused_datapoints = FALSE, norm_algorithm = NA) {
if(length(select_diffdetect_candidates) == 1 && select_diffdetect_candidates %in% TRUE) {
select_diffdetect_candidates = 4 # default z-score
}
if(length(select_diffdetect_candidates) != 1 || !is.numeric(select_diffdetect_candidates) || select_diffdetect_candidates <= 0) {
select_diffdetect_candidates = NA
}
pid = NULL

## if requested to plot all proteins, just take the unique set from peptide tibble and don't bother with other selection parameters
Expand Down Expand Up @@ -40,11 +80,11 @@ plot_peptide_data = function(dataset, select_all_proteins = FALSE, select_diffde
pid = c(pid, dataset$de_proteins %>% filter(signif) %>% distinct(protein_id) %>% pull())
}

if(select_diffdetect_candidates) {
if(!is.na(select_diffdetect_candidates)) {
if(!"dd_proteins" %in% names(dataset)) {
append_log("you requested to plot proteins tested by differential detection counting, but no differential detection has not been applied to this dataset yet (did you run the analysis_quickstart() function yet? Alternatively you can simply run differential_detect() and review its outcome with print_dataset_summary() to quickly prepare this selection)", type = "error")
}
pid = c(pid, dataset$dd_proteins %>% filter(type == "detect" & is.finite(zscore) & abs(zscore) >= 4) %>% distinct(protein_id) %>% pull())
pid = c(pid, dataset$dd_proteins %>% filter(type == "detect" & is.finite(zscore) & abs(zscore) >= select_diffdetect_candidates) %>% distinct(protein_id) %>% pull())
}
}

Expand All @@ -71,7 +111,7 @@ plot_peptide_data = function(dataset, select_all_proteins = FALSE, select_diffde
#' @importFrom stringr str_trunc
#' @importFrom foreach foreach
#' @importFrom parallel stopCluster clusterExport
plot_peptide_data_by_contrast_to_pdf = function(dataset, filter_protein_id, output_dir, show_unused_datapoints = FALSE, norm_algorithm=NA, rollup_algorithm = "maxlfq") {
plot_peptide_data_by_contrast_to_pdf = function(dataset, filter_protein_id, output_dir, show_unused_datapoints = FALSE, norm_algorithm = NA, rollup_algorithm = "maxlfq") {
filter_protein_id = unique(na.omit(filter_protein_id))
if(length(filter_protein_id) == 0) {
append_log("no valid protein identifiers were provided", type = "error")
Expand Down Expand Up @@ -203,22 +243,23 @@ plot_peptide_data_by_contrast_to_pdf = function(dataset, filter_protein_id, outp

if("dd_proteins" %in% names(dataset)) {
tib_dd = dataset$dd_proteins %>% filter(protein_id %in% proteins_contr$protein_id & contrast == contr & is.finite(zscore))
if(nrow(tib_dd) > 0) {
# infer order in which to plot proteins
contr_protein_id__ordered = c(contr_protein_id__ordered, tib_dd %>% arrange(desc(abs(zscore))) %>% pull(protein_id))

# infer order in which to plot proteins
contr_protein_id__ordered = c(contr_protein_id__ordered, tib_dd %>% arrange(desc(abs(zscore))) %>% pull(protein_id))

# gather differential detect scores in expected format
tib_dd_long = tib_dd %>% select(protein_id, foldchange = zscore) %>% mutate(algorithm = paste0("zscore", type, "counts"))
# gather differential detect scores in expected format
tib_dd_long = tib_dd %>% select(protein_id, type, foldchange = zscore) %>% mutate(algorithm = paste0("zscore-", type, "-counts"))

# pretty-print format for downstream visualization
tib_dd_long = tib_dd_long %>%
mutate(foldchange = sprintf("%.2f", foldchange),
pvalue="",
qvalue="",
signif="") %>%
select(protein_id, algorithm, foldchange, pvalue, qvalue, signif)

de_proteins_contr = bind_rows(de_proteins_contr, tib_dd_long)
# pretty-print format for downstream visualization
tib_dd_long = tib_dd_long %>%
mutate(foldchange = sprintf("%.2f", foldchange),
pvalue="",
qvalue="",
signif="") %>%
select(protein_id, algorithm, foldchange, pvalue, qvalue, signif)

de_proteins_contr = bind_rows(de_proteins_contr, tib_dd_long)
}
}


Expand Down Expand Up @@ -393,7 +434,7 @@ reintroduce_filtered_intensities = function(dataset, norm_algorithm, rollup_algo
# x_ref[rows_sid] = intensity values in sample s for only the subset of peptides that pass user's filtering criteria, that were subsequently normalized
# if we want to reintroduce intensity values from the unfiltered dataset, where normalization was applied to the total set of peptides, there has to be some variation between the filtered and unfiltered data !
# so we here scale the abundances of the 'unfiltered data' to the 'filtered data'
x[rows_sid] = x[rows_sid] + get_mode(x[rows_sid] - x_ref[rows_sid])
x[rows_sid] = x[rows_sid] + suppressWarnings(get_mode(x[rows_sid] - x_ref[rows_sid]))
}
}

Expand Down
Loading

0 comments on commit d1018ce

Please sign in to comment.