Skip to content

Commit

Permalink
release 1.2
Browse files Browse the repository at this point in the history
  • Loading branch information
ftwkoopmans committed Aug 31, 2024
1 parent de80abb commit 42b6448
Show file tree
Hide file tree
Showing 61 changed files with 2,542 additions and 1,027 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ Package: msdap
Title: Mass Spectrometry Downstream Analysis Pipeline
Description: Analyze label-free proteomics datasets from various sources (MaxQuant, Spectronaut, etc) using a pipeline that facilitates peptide filtering and many algorithms for normalization and statistical analysis. A comprehensive PDF report can be generated that includes many data visualizations and documentation thereof.
URL: https://github.com/ftwkoopmans/msdap
Version: 1.1.3
Version: 1.2
Authors@R:
person(given = "Frank",
family = "Koopmans",
Expand Down
19 changes: 10 additions & 9 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Generated by roxygen2: do not edit by hand

export(add_contrast)
export(analysis_quickstart)
export(append_log)
export(as_matrix_except_first_column)
Expand All @@ -10,7 +11,6 @@ export(check_dataset_integrity)
export(check_valid_tibble_peptides)
export(check_valid_tibble_proteins)
export(check_valid_tibble_samples)
export(dataset_contrasts)
export(de_deqms)
export(de_ebayes)
export(de_msempire)
Expand All @@ -28,6 +28,10 @@ export(file_check)
export(filename_strip_illegal_characters)
export(filter_dataset)
export(generate_pdf_report)
export(get_peptide_filternorm_variants)
export(get_protein_matrix)
export(get_samples_for_regression)
export(has_legacy_contrast_definitions)
export(hgnc_lookuptable)
export(import_dataset_diann)
export(import_dataset_encyclopedia)
Expand All @@ -51,6 +55,7 @@ export(import_sample_metadata)
export(initialize_multiprocessing)
export(invalidate_cache)
export(is_dia_dataset)
export(limma_wrapper)
export(logger.default)
export(matrix_to_long)
export(merge_fractionated_samples)
Expand All @@ -70,10 +75,14 @@ export(plot_peptide_data)
export(plot_sample_pca__sample_in_contrast)
export(plot_variance_explained)
export(plot_volcano)
export(plot_volcano_allcontrast)
export(print_available_filtering_results)
export(print_contrasts)
export(print_dataset_summary)
export(protein_eset_from_data)
export(read_textfile_compressed)
export(regex_classification)
export(remove_contrasts)
export(remove_proteins_by_name)
export(reset_log)
export(rgd_lookuptable)
Expand Down Expand Up @@ -108,11 +117,8 @@ importFrom(DBI,dbExistsTable)
importFrom(DBI,dbGetQuery)
importFrom(DBI,dbIsValid)
importFrom(DBI,dbListTables)
importFrom(DEqMS,spectraCounteBayes)
importFrom(MASS,psi.huber)
importFrom(MASS,rlm)
importFrom(MSnbase,as.MSnSet.ExpressionSet)
importFrom(MSnbase,combineFeatures)
importFrom(RSQLite,SQLite)
importFrom(archive,archive_read)
importFrom(archive,file_read)
Expand Down Expand Up @@ -170,11 +176,8 @@ importFrom(graphics,text)
importFrom(gtools,mixedorder)
importFrom(gtools,mixedsort)
importFrom(iq,fast_MaxLFQ)
importFrom(limma,eBayes)
importFrom(limma,lmFit)
importFrom(limma,normalizeCyclicLoess)
importFrom(limma,squeezeVar)
importFrom(limma,topTable)
importFrom(lme4,findbars)
importFrom(lme4,getME)
importFrom(lme4,lmerControl)
Expand All @@ -184,7 +187,6 @@ importFrom(matrixStats,rowMeans2)
importFrom(matrixStats,rowSds)
importFrom(matrixStats,rowSums2)
importFrom(missForest,missForest)
importFrom(msEmpiRe,de.ana)
importFrom(msEmpiRe,normalize)
importFrom(openssl,md5)
importFrom(openxlsx,addWorksheet)
Expand Down Expand Up @@ -221,7 +223,6 @@ importFrom(stats,ecdf)
importFrom(stats,loess)
importFrom(stats,mad)
importFrom(stats,median)
importFrom(stats,model.matrix)
importFrom(stats,na.exclude)
importFrom(stats,na.omit)
importFrom(stats,optim)
Expand Down
108 changes: 93 additions & 15 deletions R/dataset.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,17 +11,6 @@ is_dia_dataset = function(dataset) {
}


#' List the name of all contrasts in the samples table
#'
#' @param dataset a valid dataset
#' @export
dataset_contrasts = function(dataset) {
if(!"samples" %in% names(dataset)) {
append_log("invalid dataset, it lacks a samples table", type = "error")
}
grep("^contrast:", colnames(dataset$samples), ignore.case = T, value = T)
}


#' Print a short summary of a dataset to console
#'
Expand Down Expand Up @@ -280,6 +269,95 @@ empty_protein_tibble = function(peptides) {



#' rollup peptides to protein data matrix for selected intensity column
#'
#' @param dataset your dataset
#' @param intensity_column column in `dataset$peptides` that should be used for the protein matrix
#' @param include_npep if `TRUE` (default), returns a list with both the matrix and an array of peptide counts (for each row in the protein matrix). If `FALSE`, returns just the protein matrix
#' @export
get_protein_matrix = function(dataset, intensity_column, include_npep = TRUE) {
if(!is.list(dataset) || !"peptides" %in% names(dataset) || !is.data.frame(dataset$peptides)) {
append_log("Dataset does not contain a peptides table (dataset$peptides)", type = "error")
}
if(length(intensity_column) != 1 || is.na(intensity_column) || !is.character(intensity_column)) {
append_log("intensity_column parameter must be a single string", type = "error")
}
if(length(include_npep) != 1 || !include_npep %in% c(TRUE, FALSE)) {
append_log("include_npep parameter must be either TRUE or FALSE", type = "error")
}

cols_valid = get_peptide_filternorm_variants(dataset)
if(!intensity_column %in% cols_valid) {
append_log("intensity_column parameter is not valid. To see available options, run: print_available_filtering_results(dataset)", type = "error")
}

tib_pep = dataset$peptides %>%
select(sample_id, protein_id, peptide_id, intensity = !!as.symbol(intensity_column)) %>%
filter(is.finite(intensity)) # remove NA prior to rollup AND importantly, prior to peptide*protein pair counting

mat = rollup_pep2prot(tib = tib_pep, intensity_is_log2 = TRUE, rollup_algorithm = "maxlfq", return_as_matrix = TRUE)
if(!include_npep) {
return(mat)
}

# count the number of unique peptides per protein
npep = tib_pep %>%
distinct(protein_id, peptide_id) %>%
count(protein_id) %>%
# importantly, align with the protein matrix by protein_id
slice(match(rownames(mat), protein_id)) %>%
pull(n)

return(list(matrix = mat, npep = npep))
}



#' collect the respective subset of samples for selected protein data and prepare a metadata table for regression
#'
#' From sample metadata (dataset$samples), get the subset that is included in the
#' parameter `protein_data`, select columns that contain potential regression variables
#' (using `user_provided_metadata()`) and finally apply `enforce_sample_value_types()` to
#' reformat all variables (except "sample_id") to factor or numeric.
#'
#' @param dataset your dataset
#' @param protein_data output from `get_protein_matrix()`
#' @export
get_samples_for_regression = function(dataset, protein_data) {
if(!is.list(dataset) || !"samples" %in% names(dataset) || !is.data.frame(dataset$samples)) {
append_log("Dataset does not contain a sample metadata table (dataset$samples)", type = "error")
}
if(!is.matrix(protein_data) && !(is.list(protein_data) && "matrix" %in% names(protein_data) && is.matrix(protein_data$matrix))) {
append_log("protein_data parameter does not contain a matrix", type = "error")
}

sid = NULL
if(is.matrix(protein_data)) {
sid = colnames(protein_data)
} else {
sid = colnames(protein_data$matrix)
}

if(length(intersect(sid, dataset$samples$sample_id)) == 0) {
append_log("this function expects the column names in the protein_data parameter to match the sample_id column in dataset$samples: zero overlap was found", type = "error")
}

s = dataset$samples %>%
filter(sample_id %in% sid) %>%
select(sample_id, tidyselect::all_of(user_provided_metadata(dataset$samples)))


# importantly, ensure the matrix and sample tables align
stopifnot(length(sid) == nrow(s)) # double-check
s = s[match(sid, s$sample_id),]


# convert each column to factor/numeric
return( enforce_sample_value_types(s, redundant_columns = "warning") )
}



#' prettyprint table that summarizes differential detect results
#'
#' @param dataset dataset that includes DD results
Expand All @@ -304,7 +382,7 @@ diffdetect_summary_prettyprint = function(dataset, use_quant = FALSE, trim_contr
}

# array of all contrasts
column_contrasts = dataset_contrasts(dataset)
column_contrasts = unique(x$contrast)

y = x %>%
# add protein metadata
Expand All @@ -320,7 +398,7 @@ diffdetect_summary_prettyprint = function(dataset, use_quant = FALSE, trim_contr
# sort contrasts in same order as defined by user
arrange(match(contrast, column_contrasts)) %>%
# for prettyprint, trim the contrast names
mutate(contrast = sub("^contrast: ", "", contrast))
mutate(contrast = gsub(" *#.*", "", sub("^contrast: ", "", contrast)))

# optionally, limit contrast string length (evenly on each side by N characters)
if(trim_contrast_names) {
Expand Down Expand Up @@ -354,7 +432,7 @@ dea_summary_prettyprint = function(dataset, trim_contrast_names = FALSE) {
}

# array of all contrasts
column_contrasts = dataset_contrasts(dataset)
column_contrasts = unique(x$contrast)

y = x %>%
# add protein metadata
Expand All @@ -373,7 +451,7 @@ dea_summary_prettyprint = function(dataset, trim_contrast_names = FALSE) {
# sort contrasts in same order as defined by user
arrange(match(contrast, column_contrasts)) %>%
# for prettyprint, trim the contrast names
mutate(contrast = sub("^contrast: ", "", contrast))
mutate(contrast = gsub(" *#.*", "", sub("^contrast: ", "", contrast)))

# optionally, limit contrast string length (evenly on each side by N characters)
if(trim_contrast_names) {
Expand Down
Loading

0 comments on commit 42b6448

Please sign in to comment.