Skip to content

Commit

Permalink
release 1.1.3
Browse files Browse the repository at this point in the history
  • Loading branch information
ftwkoopmans committed Aug 26, 2024
1 parent 27aad7d commit de80abb
Show file tree
Hide file tree
Showing 17 changed files with 171 additions and 54 deletions.
5 changes: 3 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ Package: msdap
Title: Mass Spectrometry Downstream Analysis Pipeline
Description: Analyze label-free proteomics datasets from various sources (MaxQuant, Spectronaut, etc) using a pipeline that facilitates peptide filtering and many algorithms for normalization and statistical analysis. A comprehensive PDF report can be generated that includes many data visualizations and documentation thereof.
URL: https://github.com/ftwkoopmans/msdap
Version: 1.1.2
Version: 1.1.3
Authors@R:
person(given = "Frank",
family = "Koopmans",
Expand Down Expand Up @@ -67,7 +67,8 @@ Imports:
missForest (>= 1.4),
BiocParallel (>= 1.20),
variancePartition (>= 1.16),
archive(>= 1.0.1)
archive(>= 1.0.1),
arrow(>= 13.0.0)
Suggests:
testthat (>= 2.1)
Remotes:
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
export(analysis_quickstart)
export(append_log)
export(as_matrix_except_first_column)
export(assert_lme4_functional)
export(cache_filtering_data)
export(check_dataset_hascache)
export(check_dataset_integrity)
Expand Down
8 changes: 6 additions & 2 deletions R/gene_idmapping.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,13 @@
#' filename is typically something like non_alt_loci_set.txt
#'
#' @param f full path to the downloaded table (expected to be tsv format)
#' @param uppercase_symbols convert all gene symbols to upper case? default: TRUE
#' @returns a long-format table with columns; hgnc_id, hgnc_symbol, type, value
#' @export
hgnc_lookuptable = function(f) {
hgnc_lookuptable = function(f, uppercase_symbols = TRUE) {
result = NULL
check_parameter_is_string(f)
check_parameter_is_boolean(uppercase_symbols)

# parse HGNC table from disk
f = path_exists(f, NULL, try_compressed = TRUE)
Expand All @@ -39,7 +41,9 @@ hgnc_lookuptable = function(f) {
)
hgnc = robust_header_matching(hgnc, column_spec = cols, columns_required = c("hgnc_id", "hgnc_symbol"))

hgnc$hgnc_symbol = toupper(hgnc$hgnc_symbol)
if(uppercase_symbols) {
hgnc$hgnc_symbol = toupper(hgnc$hgnc_symbol)
}

# remove invalid rows (this shouldn't occur but check anyway)
hgnc = hgnc %>% filter(!is.na(hgnc_id) & hgnc_id != "" & !is.na(hgnc_symbol) & nchar(hgnc_symbol) >= 2)
Expand Down
9 changes: 8 additions & 1 deletion R/msqrobsum__bugfix.R
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,13 @@ msqrobsum <- function(
# df_failed <- filter(df, is.na(df))### FRANK
# df <- filter(df, !is.na(df))### FRANK
if(!nrow(df)) {warning("No models could be fitted"); return(df_prot_failed)}

### FRANK: identify issues with lme4::lmer() when all proteins/models failed to help users diagnose R installation issues. Related to MS-DAP GitHub issue #36
if(!"sigma" %in% colnames(df)) { # if all fit_fun() calls throw errors, the 'mm' column is NULL/empty and unnest() does not yield a sigma column
assert_lme4_functional() # util function, new in MS-DAP 1.1.3
stop("msqrob model estimation failed for all proteins") # halt even if we didn't find lme4::lmer() issues because below code will fail when 'sigma' column is missing
}

## Squeeze variance
df <- mutate(df, sigma_post = sigma, df_prior = 0, sigma_prior = 0)
if(squeeze_variance) {
Expand Down Expand Up @@ -160,7 +167,7 @@ do_mm <- function(d, formulas, contrasts, mode ='msqrobsum', robust_lmer_iter =
out$data_summarized <- list(d)
}
if (mode == 'sum') return(new_tibble(out ,nrow = 1L))
for (form in c(formulas)){
for (form in c(formulas)) {
## TODO do lm fit if there are no random effects (all output inside loop, do while loop)
model <- try(do_lmerfit(d, form, robust_lmer_iter,lmer_args), silent = TRUE)
if (is(model,"lmerMod")) break #else message(paste0('Cannot fit ', format(form)))
Expand Down
85 changes: 52 additions & 33 deletions R/parse_longformat_generic.R
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ import_dataset_skyline = function(filename, acquisition_mode, confidence_thresho

#' Import a label-free proteomics dataset from DIA-NN
#'
#' @param filename the full file path of the input file
#' @param filename the full file path of the DIA-NN report; this can be either the report.tsv file or the report.parquet file (new since DIA-NN 1.9.1)
#' @param confidence_threshold confidence score threshold at which a peptide is considered 'identified', default: 0.01 (target value must be lesser than or equals)
#' @param do_plot logical indicating whether to create QC plots that are shown in the downstream PDF report (if enabled)
#' @param use_irt logical indicating whether to use standardized (IRT) or the empirical (RT) retention times
Expand All @@ -88,22 +88,37 @@ import_dataset_diann = function(filename, confidence_threshold = 0.01, use_norma
reset_log()
append_log("reading DIA-NN report...", type = "info")

rt_col = "iRT"
if(!use_irt) rt_col = "RT"
rt_col = ifelse(use_irt, "iRT", "RT")

ds = import_dataset_in_long_format(filename,
attributes_required = list(sample_id = "File.Name",
protein_id = "Protein.Group",
sequence_modified = "Modified.Sequence",
sequence_plain = "Stripped.Sequence",
charge = "Precursor.Charge",
rt = rt_col,
# isdecoy = "",
intensity = ifelse(use_normalized_intensities, "Precursor.Normalised", "Precursor.Quantity"),
confidence = "Q.Value",
confidence_score = "Evidence"),
# attributes_optional = list(confidence_score = "Evidence"),
confidence_threshold = confidence_threshold,
# if the filename ends with ".parquet", use the Apache arrow library to read the DIA-NN report in parquet format
f = df = NULL
if(grepl("\\.parquet$", filename)) {
append_log(paste("input file:", filename), type = "info")
df = arrow::read_parquet(
filename,
col_select = NULL,
as_data_frame = TRUE,
props = arrow::ParquetArrowReaderProperties$create(),
mmap = TRUE
)
} else { # not a parquet file, assume input file is a .tsv file
f = filename
}

ds = import_dataset_in_long_format(filename = f,
x = df,
attributes_required = list(sample_id = c("File.Name", "Run"),
protein_id = "Protein.Group",
sequence_modified = "Modified.Sequence",
sequence_plain = "Stripped.Sequence",
charge = "Precursor.Charge",
rt = rt_col,
# isdecoy = "",
intensity = ifelse(use_normalized_intensities, "Precursor.Normalised", "Precursor.Quantity"),
confidence = "Q.Value",
confidence_score = "Evidence"),
# attributes_optional = list(confidence_score = "Evidence"),
confidence_threshold = confidence_threshold,
return_decoys = F,
do_plot = do_plot)

Expand Down Expand Up @@ -325,6 +340,10 @@ import_dataset_in_long_format = function(filename = NULL, x = NULL, attributes_r
# this code is also checking for required data, so run even if data table is already in memory
map_required = map_headers(headers, attributes_required, error_on_missing = T, allow_multiple = T)
map_optional = map_headers(headers, attributes_optional, error_on_missing = F, allow_multiple = T)
# collect all column indices and their desired names
col_indices = c(map_required, map_optional)
col_indices_dupe = duplicated(as.integer(col_indices))
col_indices_unique = col_indices[!col_indices_dupe] # don't use unique(), it'll drop the names

if (is_file) {
### try to detect data tables with a comma as decimal separation character
Expand All @@ -342,36 +361,36 @@ import_dataset_in_long_format = function(filename = NULL, x = NULL, attributes_r


### read table
# collect all column indices and their desired names
col_indices = c(map_required, map_optional)
col_indices_dupe = duplicated(as.integer(col_indices))
col_indices_unique = col_indices[!col_indices_dupe] # don't use unique(), it'll drop the names

# only read columns of interest to speed up file parsing
# don't parse first row as header and then skip it to avoid issues with tables that have mismatched headers (e.g. MetaMorpheus files that have extra trailing tab on header row)
DT = read_textfile_compressed(filename, skip_empty_rows = F, as_table = T, select = as.integer(col_indices_unique), header = F, skip = 1, stringsAsFactors = F, dec = dec_char, data.table = T)
if(is.null(DT)) {
append_log("failed to read data table from file", type = "error")
}
colnames(DT) = names(col_indices_unique) # overwrite column names from file with the desired names from column specification
} else {
x = x[,as.integer(col_indices_unique)]
colnames(x) = names(col_indices_unique) # overwrite column names from file with the desired names from column specification

# suppose the same column from input table is assigned to multiple attributes (eg; Spectronaut FG.Id as input for both charge and modseq)
# j = index in 'col_indices' where we should borrow content from other column
for(j in which(col_indices_dupe)) {
j_source_column = names(col_indices_unique)[as.integer(col_indices_unique) == as.integer(col_indices[j])]
DT[,names(col_indices)[j]] = DT[[j_source_column]] # use column names !
}
DT = data.table::as.data.table(x) # cast input table to data.table type
}


### inject function for custom manipulation of DT. example; for some input format, repair a column. for FragPipe, the modified peptide column is empty for peptides without a modification, so there is no column with proper modseq available. have to 'manually repair', but other than that we want to maintain this generic function
if(length(custom_func_manipulate_DT_onload) == 1 && is.function(custom_func_manipulate_DT_onload)) {
DT = custom_func_manipulate_DT_onload(DT)
}
} else {
DT = data.table::as.data.table(x) # cast input table to data.table type
### suppose the same column from input table is assigned to multiple attributes (eg; Spectronaut FG.Id as input for both charge and modseq)
# j = index in 'col_indices' where we should borrow content from other column
for(j in which(col_indices_dupe)) {
j_source_column = names(col_indices_unique)[as.integer(col_indices_unique) == as.integer(col_indices[j])]
DT[,names(col_indices)[j]] = DT[[j_source_column]] # use column names !
}


### inject function for custom manipulation of DT. example; for some input format, repair a column. for FragPipe, the modified peptide column is empty for peptides without a modification, so there is no column with proper modseq available. have to 'manually repair', but other than that we want to maintain this generic function
if(length(custom_func_manipulate_DT_onload) == 1 && is.function(custom_func_manipulate_DT_onload)) {
DT = custom_func_manipulate_DT_onload(DT)
}



### decoys
if("isdecoy" %in% colnames(DT)) {
DT[ , isdecoy := isdecoy %in% c(TRUE, 1, "decoy", "true", "True", "TRUE"), by=isdecoy]
Expand Down
29 changes: 25 additions & 4 deletions R/process_peptide_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ remove_proteins_by_name = function(dataset, remove_irt_peptides = FALSE, regular
#' @param files an array of filenames, these should be the full path
#' @param fasta_id_type what type of fasta files are these? options: "uniprot" (highly recommended) or otherwise any other character string (as we have no special rules for generic fasta files)
#' @param protein_separation_character the separation character for protein identifiers in your dataset. Most commonly this is a semicolon (eg; in maxquant/metamorpheus/skyline/etc.)
#' @param uppercase_symbols convert all gene symbols to upper case? default: TRUE
#' @return table where
#' protein_id = provided proteingroup identifier,
#' accessions = result from fasta_id_short applied to each semicolon-delimited element in protein_id (result is a semicolon-collapsed string),
Expand All @@ -100,7 +101,7 @@ remove_proteins_by_name = function(dataset, remove_irt_peptides = FALSE, regular
#' gene_symbols_or_id = unique set of valid 'gene_symbol', or the FASTA full/long ID when there is no gene information
#' gene_symbol_ucount = number of unique gene_symbols for this proteingroup (i.e. unique valid elements in 'gene_symbols')
#' @export
import_fasta = function(dataset, files = NULL, fasta_id_type = "uniprot", protein_separation_character = ";") {
import_fasta = function(dataset, files = NULL, fasta_id_type = "uniprot", protein_separation_character = ";", uppercase_symbols = TRUE) {
if(length(files) == 0) {
append_log("no fasta files were provided, downstream analysis/code will use protein IDs as surrogate for fasta headers", type = "info")
dataset$proteins = empty_protein_tibble(peptides)
Expand All @@ -109,7 +110,8 @@ import_fasta = function(dataset, files = NULL, fasta_id_type = "uniprot", protei
protein_id = dataset$peptides %>% filter({if("isdecoy" %in% names(.)) isdecoy else F} != TRUE) %>% distinct(protein_id) %>% pull(),
fasta_files = files,
fasta_id_type = fasta_id_type,
protein_separation_character = protein_separation_character
protein_separation_character = protein_separation_character,
uppercase_symbols = uppercase_symbols
)
}

Expand All @@ -124,18 +126,37 @@ import_fasta = function(dataset, files = NULL, fasta_id_type = "uniprot", protei
#' @param fasta_files an array of filenames, these should be the full path
#' @param fasta_id_type what type of fasta files are these? options: "uniprot" (highly recommended) or otherwise any other character string (as we have no special rules for generic fasta files)
#' @param protein_separation_character the separation character for protein identifiers in your dataset. Most commonly this is a semicolon (eg; in maxquant/metamorpheus/skyline/etc.)
#' @param uppercase_symbols convert all gene symbols to upper case? default: TRUE
#' @return table where
#' protein_id = provided proteingroup identifier,
#' accessions = result from fasta_id_short applied to each semicolon-delimited element in protein_id (result is a semicolon-collapsed string),
#' fasta_headers = analogous to accessions, but matching the full FASTA header to each 'accessions' element,
#' gene_symbols = full set of gene symbols, '-' where missing in FASTA, matching each element in 'accessions',
#' gene_symbols_or_id = unique set of valid 'gene_symbol', or the FASTA full/long ID when there is no gene information
#' gene_symbol_ucount = number of unique gene_symbols for this proteingroup (i.e. unique valid elements in 'gene_symbols')
import_protein_metadata_from_fasta = function(protein_id, fasta_files, fasta_id_type = "uniprot", protein_separation_character = ";") {
import_protein_metadata_from_fasta = function(protein_id, fasta_files, fasta_id_type = "uniprot", protein_separation_character = ";", uppercase_symbols = TRUE) {
if(length(protein_id) == 0 || !all(!is.na(protein_id) & is.character(protein_id) & nchar(protein_id) > 0)) {
append_log('protein_id parameter must be an array of non-empty strings (no NA values, no "")', type = "error")
}
# valid paths for `fasta_files` is checked downstream and includes guessing compressed files / filename variations
if(length(fasta_files) == 0 || !all(!is.na(fasta_files) & is.character(fasta_files) & nchar(fasta_files) > 0)) {
append_log('fasta_files parameter must be an array of non-empty strings (no NA values, no "")', type = "error")
}
if(length(fasta_id_type) != 1 || is.na(fasta_id_type) || !is.character(fasta_id_type) || nchar(fasta_id_type) == 0) {
append_log('fasta_id_type parameter must be a non-empty string (no NA values, no "")', type = "error")
}
if(length(protein_separation_character) != 1 || is.na(protein_separation_character) || !is.character(protein_separation_character) || nchar(protein_separation_character) == 0) {
append_log('protein_separation_character parameter must be a non-empty string (no NA values, no "")', type = "error")
}
if(!uppercase_symbols %in% c(TRUE, FALSE)) {
append_log('uppercase_symbols parameter must be a boolean value (TRUE or FALSE)', type = "error")
}


protein_id = unique(protein_id)

#### read data from fasta files into tibble
fasta = fasta_parse_file(fasta_files, fasta_id_type) %>%
fasta = fasta_parse_file(fasta_files, fasta_id_type, uppercase_symbols) %>%
# remove decoys from fasta file then drop column
filter(isdecoy == FALSE) %>%
select(-isdecoy)
Expand Down
12 changes: 9 additions & 3 deletions R/util_fasta.R
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ fasta_header_to_gene = function(x, fasta_id_type = "uniprot") {
# if multiple GN=XXX tags are present (should never occur for uniprot), the last element is taken
res = sub(".* GN=([^ ]{2,}).*", "\\1", x) # require at least 2 characters
res[res == x] = ""
toupper(res)
return(res)
}


Expand All @@ -59,7 +59,8 @@ fasta_header_to_gene = function(x, fasta_id_type = "uniprot") {
#
#' @param fasta_files array of fill paths to fasta files
#' @param fasta_id_type fasta type, typically this is 'uniprot'
fasta_parse_file = function(fasta_files, fasta_id_type = "uniprot") {
#' @param uppercase_symbols convert all gene symbols to upper case? default: TRUE
fasta_parse_file = function(fasta_files, fasta_id_type = "uniprot", uppercase_symbols = TRUE) {
# first, validate input files
ff = NULL
for (f in fasta_files) {
Expand All @@ -75,7 +76,7 @@ fasta_parse_file = function(fasta_files, fasta_id_type = "uniprot") {
fasta = c(fasta, l[char1 == ">"]) # only fasta header lines
}

tibble(
result = tibble(
idlong = fasta_header_to_id(fasta),
gene = fasta_header_to_gene(fasta, fasta_id_type),
header = fasta
Expand All @@ -85,6 +86,11 @@ fasta_parse_file = function(fasta_files, fasta_id_type = "uniprot") {
# use the idlong because reverse/decoy tags might be prepended. e.g. "rev_tr|U3KQF4|U3KQF4_HUMAN"
distinct(idlong, .keep_all = T) %>%
mutate(isdecoy = fasta_identifier_isdecoy(idlong) | fasta_identifier_isdecoy(idshort))

if(uppercase_symbols) {
result$gene = toupper(result$gene)
}
return(result)
}


Expand Down
31 changes: 31 additions & 0 deletions R/util_generic.R
Original file line number Diff line number Diff line change
Expand Up @@ -603,3 +603,34 @@ fit_t_dist_fixed_mu = function(x) {
)
if(!is.null(fit)) return(c(sigma=fit$par[1], df=fit$par[2]))
}



#' Throw an error if lme4::lmer() cannot be executed
#'
#' @description
#' `stop()` is called when lme4 or Matrix is not installed, or when `lme4::lmer()`
#' cannot be executed (with default settings and mock data).
#' If this function does not throw an error, we may assume `lme4::lmer()` is operational.
#' @export
assert_lme4_functional = function() {
if(!requireNamespace("lme4", quietly = TRUE)) {
stop("required package lme4 is not installed", call. = FALSE)
}
if(!requireNamespace("Matrix", quietly = TRUE)) {
stop("required package Matrix is not installed", call. = FALSE)
}

result = tryCatch({
suppressMessages(lme4::lmer(formula = expression ~ (1 | condition), data = data.frame(
expression = c(14.19, 16.09, 14.94, 15.51, 14.82, 14.87),
condition = c(1, 1, 1, 2, 2, 2)
)))
}, error = function(e) {
return(e)
})

if("error" %in% class(result)) {
stop(paste0("error while executing function lme4::lmer(), the installed version of R packages lme4 and Matrix are possibly incompatible -> ", result$message))
}
}
Loading

0 comments on commit de80abb

Please sign in to comment.