Skip to content

Commit

Permalink
release 1.0.7
Browse files Browse the repository at this point in the history
  • Loading branch information
ftwkoopmans committed Apr 28, 2024
1 parent 166239e commit ec32942
Show file tree
Hide file tree
Showing 46 changed files with 400 additions and 193 deletions.
1 change: 1 addition & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,4 @@
^README.Rmd$
^README.md$
^docker$
^manuscript$
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,5 @@ Thumbs.db
/docker/docker_compose/**
/docker/README
/docker/Dockerfile.dev
/docker/_test_versions/**
/manuscript/**
7 changes: 4 additions & 3 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ Package: msdap
Title: Mass Spectrometry Downstream Analysis Pipeline
Description: Analyze label-free proteomics datasets from various sources (MaxQuant, Spectronaut, etc) using a pipeline that facilitates peptide filtering and many algorithms for normalization and statistical analysis. A comprehensive PDF report can be generated that includes many data visualizations and documentation thereof.
URL: https://github.com/ftwkoopmans/msdap
Version: 1.0.6
Version: 1.0.7
Authors@R:
person(given = "Frank",
family = "Koopmans",
Expand Down Expand Up @@ -67,8 +67,9 @@ Imports:
missForest (>= 1.4),
BiocParallel (>= 1.20),
variancePartition (>= 1.16),
testthat (>= 2.1),
archive(>= 1.0.1)
Suggests:
testthat (>= 2.1)
Remotes:
bioc::release/Biobase,
bioc::release/MSnbase,
Expand All @@ -84,5 +85,5 @@ BiocViews:
Encoding: UTF-8
LazyData: true
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.2.0
RoxygenNote: 7.3.1
VignetteBuilder: knitr
13 changes: 13 additions & 0 deletions R/dea.R
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,19 @@ get_column_intensity = function(peptides, contr_lbl = NA) {



#' return sample groups for a given contrast (column name in dataset$samples)
#'
#' @param dataset your dataset
#' @param contr_lbl a string describing a contrast = column in dataset$samples, e.g. "contrast: WT vs KO"
#' @returns list of unique sample groups, first element is left-hand side of the contrast and second element the right-hand side
contrast_to_samplegroups = function(dataset, contr_lbl) {
stopifnot(contr_lbl %in% colnames(dataset$samples))
x = dataset$samples %>% select(group, contrast = !!contr_lbl)
return(list(unique(x$group[x$contrast == 1]), unique(x$group[x$contrast == 2])))
}



#' Differential expression analysis
#'
#' @param dataset your dataset
Expand Down
4 changes: 2 additions & 2 deletions R/export_data_tables.R
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ export_protein_abundance_matrix = function(dataset, rollup_algorithm, output_dir
# check if output file is writable
remove_file_if_exists(fname)
# write to TSV file. note that we enforce a dot as decimal character (so ignore user's locale)
write.table(tib, file = fname, quote = F, sep="\t", na="", row.names = F, col.names = T, dec = ".")
utils::write.table(tib, file = fname, quote = F, sep="\t", na="", row.names = F, col.names = T, dec = ".")
}
}

Expand Down Expand Up @@ -150,7 +150,7 @@ export_peptide_abundance_matrix = function(dataset, output_dir) {
# check if output file is writable
remove_file_if_exists(fname)
# write to TSV file. note that we enforce a dot as decimal character (so ignore user's locale)
write.table(tibw, file = fname, quote = F, sep="\t", na="", row.names = F, col.names = T, dec = ".")
utils::write.table(tibw, file = fname, quote = F, sep="\t", na="", row.names = F, col.names = T, dec = ".")
}

}
5 changes: 3 additions & 2 deletions R/filter_peptides.R
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,7 @@ filter_dataset = function(dataset,
rollup_algorithm = "maxlfq",
# which filters to apply
by_group = T, all_group = T, by_contrast = F) {

start_time = Sys.time()

##### input validation
Expand Down Expand Up @@ -278,8 +279,8 @@ filter_dataset = function(dataset,
data.table::setkey(dataset$dt_pep_group, key_group) # set key for data.table::merge()
dt_criteria = data.table::data.table(dataset$groups %>% select(key_group, ndetect_min, nquant_min, ndetect_noexclude_min, nquant_noexclude_min) %>% mutate_all(as.integer), key="key_group")
dt_filter_group = data.table::merge.data.table(dataset$dt_pep_group, dt_criteria, all.x = T, sort = FALSE)
dt_filter_group[ , `:=`(pass = ndetect>=ndetect_min & nquant>=nquant_min,
pass_noexclude = ndetect_noexclude>=ndetect_noexclude_min & nquant_noexclude>=nquant_noexclude_min)]
dt_filter_group[ , `:=`(pass = ndetect >= ndetect_min & nquant >= nquant_min,
pass_noexclude = ndetect_noexclude >= ndetect_noexclude_min & nquant_noexclude >= nquant_noexclude_min)]

## now we know for each peptide whether it passes the filter criteria in each group (with and without taking 'exclude' samples into account)

Expand Down
19 changes: 18 additions & 1 deletion R/gene_idmapping.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,11 @@
#' Parse HGNC gene identifier lookup table that was downloaded from genenames.org
#'
#' download link: https://www.genenames.org/download/statistics-and-files/
#' table: "Complete dataset download links" -->> "Complete HGNC approved dataset text json" -->> download the "TXT" table
#' filename is typically something like hgnc_complete_set.txt
#' URL as of September 2023; https://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/tsv/hgnc_complete_set.txt
#'
#' # alternatively;
#' table: "Total Approved Symbols" -->> "TXT" / "text file in TSV format"
#' filename is typically something like non_alt_loci_set.txt
#'
Expand Down Expand Up @@ -88,7 +93,7 @@ hgnc_lookuptable = function(f) {
}
if("entrez_id" %in% colnames(hgnc)) {
hgnc$entrez_id = gsub("\\D.*", "", as.character(hgnc$entrez_id)) # remove multiple mappings
hgnc$entrez_id[is.na(hgnc$ensembl_id) | !grepl("^\\d+$", hgnc$entrez_id)] = NA # invalid IDs to NA
hgnc$entrez_id[is.na(hgnc$entrez_id) | !grepl("^\\d+$", hgnc$entrez_id)] = NA # invalid IDs to NA
result = bind_rows(
result,
hgnc %>% filter(!is.na(entrez_id)) %>% mutate(type = "entrez_id") %>% select(hgnc_id, type, value = entrez_id)
Expand Down Expand Up @@ -131,7 +136,19 @@ hgnc_lookuptable = function(f) {
return(hgnc %>% select(hgnc_id, hgnc_symbol))
}

# add hgnc_id that do not have any synonyms or mappings to mgi/rgd/entrez/ensembl
id_missing = setdiff(hgnc$hgnc_id, result$hgnc_id)
if(length(id_missing) > 0) {
id_missing__symbol = hgnc$hgnc_symbol[match(id_missing, hgnc$hgnc_id)]
result = bind_rows(
result,
tibble::tibble(hgnc_id = id_missing, type = "onlysymbol", value = id_missing__symbol)
)
}

# add HGNC symbols
result$hgnc_symbol = hgnc$hgnc_symbol[match(result$hgnc_id, hgnc$hgnc_id)]
# return results with re-ordered column names
result %>% select(hgnc_id, hgnc_symbol, type, value)
}

Expand Down
16 changes: 16 additions & 0 deletions R/msdap-package.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#' MS-DAP: Mass Spectrometry Downstream Analysis Pipeline for label-free proteomics data.
#'
#' http://github.com/ftwkoopmans/msdap
#'
#' @keywords internal
#' @import ggplot2
#' @importFrom grDevices colorRampPalette dev.off graphics.off pdf rainbow
#' @importFrom graphics abline boxplot legend lines mtext par plot points text
#' @importFrom stats .lm.fit contrasts cov density ecdf mad median na.exclude na.omit p.adjust p.adjust.methods pt quantile resid residuals rt sd sigma weights loess optim predict
#' @importFrom utils capture.output combn data head relist tail help packageVersion
#' @importFrom rlang .data :=
"_PACKAGE"

## usethis namespace: start
## usethis namespace: end
NULL
31 changes: 24 additions & 7 deletions R/normalize_vwmb.R
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,22 @@
#'
#' note; if you want to treat replicate samples that are flagged as 'exclude' upstream differently while still including them in the data matrix, you could set parameter `groups=paste(samples$group, samples$exclude)` to put them in separate groups.
#'
#' @examples \dontrun{
#' # Define a custom normalization function that we'll use in the MS-DAP pipeline later.
#' # All it does is apply the first part of the VWMB algorithm to reduce varation between replicates.
#' norm_within_only = function(x_as_log2, group_by_cols, ...) {
#' normalize_vwmb(x_as_log2, groups = group_by_cols,
#' metric_within="var", metric_between = "")
#' }
#'
#' # now we use the main MS-DAP function as typical, but note that we here set the
#' # normalization algorithm to the function we defined in the lines above.
#' dataset = analysis_quickstart(
#' dataset,
#' norm_algorithm = "norm_within_only",
#' # <other parameters here>
#' )
#' }
#' @param x numerical data matrix to normalize, should be log transformed
#' @param groups array describing the grouping of the columns in x (sample groups). Or alternatively set to NA to indicate there are no groups
#' @param metric_within how should replicate samples within a group be normalized? valid arguments: "var" reduce overall variation (default). "mode" reduce overall foldchange mode. pass empty string to disable
Expand All @@ -33,7 +49,7 @@ normalize_vwmb = function(x, groups=NA, metric_within="var", metric_between="mod
stopifnot(is.matrix(x) && typeof(x) == "double" && ncol(x) > 1 && nrow(x) > 1) # valid numeric matrix
stopifnot((length(groups)==1 && is.na(groups)) || (length(groups) == ncol(x) && all(!is.na(groups)))) # groups should be NA or an array describing all columns of x
stopifnot(length(metric_within)==1 && metric_within %in% c("mode","var", "")) # how to normalize within groups, with "" to disable
stopifnot(length(metric_between)==1 && metric_between %in% c("mode","var")) # how to normalize between groups; cannot disable between-group normalization. if you don't want to treat groups differently, disable groups by setting groups=NA
stopifnot(length(metric_between)==1 && metric_between %in% c("mode","var", "")) # how to normalize between groups

## restrict group names: assume there are no groups if groups=NA + enforces character type for groups
if(length(groups) == 1) {
Expand All @@ -47,7 +63,7 @@ normalize_vwmb = function(x, groups=NA, metric_within="var", metric_between="mod
## restrict input matrix
# remove infinite values (eg; some upstream log(0)), then compute summary stats for input data
x[!is.finite(x)] = NA
x_median = median(x, na.rm = T)
x_median = stats::median(x, na.rm = T)
if(include_attributes) {
x_colmedians = matrixStats::colMedians(x, na.rm = T)
}
Expand Down Expand Up @@ -103,7 +119,7 @@ normalize_vwmb = function(x, groups=NA, metric_within="var", metric_between="mod


#### normalize between groups
if(group_count > 1) {
if(group_count > 1 && metric_between != "") {
# analogous to above
# metric_between is either "mode" or "var"
if(metric_between == "var") {
Expand All @@ -119,7 +135,7 @@ normalize_vwmb = function(x, groups=NA, metric_within="var", metric_between="mod


#### finally, scale entire result matrix such that the median intensity is the same as input (deals with the rare case where abundances in one group are much higher, and this normalization consequentially shifts overall abundance levels @ between-group scaling)
x = x + (x_median - median(x, na.rm = T))
x = x + (x_median - stats::median(x, na.rm = T))
# reset colnames
colnames(x) = x_colnames
# threshold very low values
Expand Down Expand Up @@ -215,14 +231,15 @@ pairwise_modes = function(x, min_overlap = 10) {



#' find the mode in a numeric array using the default density function from base R
#' find the mode in a numeric array using the density function from base R
#' @param x numeric array
#' @return mode of x
get_mode = function(x) {
if(length(x) == 0) return(NA)
if(length(x) == 1) return(x)

density_estimate = density(x, na.rm=T)
density_estimate = density(x, na.rm = T) # default density kernel (left as-is for consistency with prior MS-DAP versions)
# density_estimate = density(x, bw = "sj", adjust = 0.9, na.rm = T) # alternative bandwidth; we need to benchmark to validate this is an improvement for the vast majority of datasets (prior to updating previous default)
# which.max() returns first 'max' value (thus always 1 return value). eg; which.max(c(1,1,2,2,1))
return(density_estimate$x[which.max(density_estimate$y)])
}
Expand Down Expand Up @@ -280,7 +297,7 @@ norm_scales_var = function(x) {
mle_norm = function(...) {
s = unlist(...)
# use the median instead of the mean, as the latter is too sensitive to outliers (eg; upstream data has some errors so few features have an extreme value -->> dominates the scaling of all features)
return( median(matrixStats::rowSds(x_scaled + rep(s, rep.int(nrow(x_scaled), ncol(x_scaled))), na.rm=T), na.rm=T) )
return( stats::median(matrixStats::rowSds(x_scaled + rep(s, rep.int(nrow(x_scaled), ncol(x_scaled))), na.rm=T), na.rm=T) )
# optimizations; column-wise matrix operations; https://stackoverflow.com/a/32364355 https://stats.stackexchange.com/a/51750
# optimizations; median @ https://stackoverflow.com/questions/34771088/why-is-standard-r-median-function-so-much-slower-than-a-simple-c-alternative
}
Expand Down
1 change: 1 addition & 0 deletions R/parse_fragpipe_ionquant.R
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,7 @@ import_dataset_fragpipe_ionquant = function(path, acquisition_mode, confidence_t
#' @param file_ion full path to a ion.tsv file
#' @param file_psm full path to a psm.tsv file, this must match the `file_ion` argument (e.g. files in the same dir)
parse_fragpipe_psm_ion_pair = function(file_ion, file_psm) {
spectrum = NULL
# basically this reads the CSV/TSV table from file and maps column names to expected names.
# (complicated) downstream code handles compressed files, efficient parsing of only the requested columns, etc.
append_log(paste("reading FragPipe file;", file_ion), type = "info")
Expand Down
11 changes: 6 additions & 5 deletions R/plot_differential_detect.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,10 @@
#' Plot differential detection results as a histogram
#'
#' @param dataset dataset object
#' @param zscore_threshold cutoff used in the plot (for absolute values)
#' @returns list of ggplot objects with 1 plot per contrast. If no differential detection data is available, returns an empty list
#' @export
plot_differential_detect = function(dataset) {
plot_differential_detect = function(dataset, zscore_threshold = 4) {
result = list()
if(!"dd_proteins" %in% names(dataset) || nrow(dataset$dd_proteins) == 0) {
return(result)
Expand All @@ -19,8 +20,8 @@ plot_differential_detect = function(dataset) {
tib_contr = tib_contr %>% filter(type %in% c("detect", "quant"))
tib_contr_detect = tib_contr %>% filter(type == "detect")
tib_contr_quant = tib_contr %>% filter(type == "quant")
lbl_detect = sprintf("only detected peptides; #proteins tested: %d #abs(zscore) >= 4: %d", nrow(tib_contr_detect), sum(abs(tib_contr_detect$zscore) >= 4))
lbl_quant = sprintf("all quantified peptides; #proteins tested: %d #abs(zscore) >= 4: %d", nrow(tib_contr_quant), sum(abs(tib_contr_quant$zscore) >= 4))
lbl_detect = sprintf("only detected peptides; #proteins tested: %d #abs(zscore) >= %s: %d", nrow(tib_contr_detect), as.character(zscore_threshold), sum(abs(tib_contr_detect$zscore) >= zscore_threshold))
lbl_quant = sprintf("all quantified peptides; #proteins tested: %d #abs(zscore) >= %s: %d", nrow(tib_contr_quant), as.character(zscore_threshold), sum(abs(tib_contr_quant$zscore) >= zscore_threshold))

tib_contr = tib_contr %>%
arrange(type) %>%
Expand All @@ -29,8 +30,8 @@ plot_differential_detect = function(dataset) {

p_hist = ggplot(tib_contr, aes(zscore)) +
geom_histogram(bins=25, boundary = 0, colour = "black", fill="lightgrey", na.rm=T) +
geom_vline(xintercept = c(-4, 4), colour = "red") +
facet_wrap(.~type_label, ncol = 1) +
geom_vline(xintercept = c(-zscore_threshold, zscore_threshold), colour = "red") +
facet_wrap(.~type_label, ncol = 1, scales = "free") +
labs(x="Differential z-score for observed peptides", y="Protein count", colour = "", title = contr) +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5, size=9),
Expand Down
Loading

0 comments on commit ec32942

Please sign in to comment.