Skip to content

Commit

Permalink
release 1.1.2
Browse files Browse the repository at this point in the history
  • Loading branch information
ftwkoopmans committed Aug 12, 2024
1 parent 24e069c commit 27aad7d
Show file tree
Hide file tree
Showing 11 changed files with 94 additions and 56 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ Package: msdap
Title: Mass Spectrometry Downstream Analysis Pipeline
Description: Analyze label-free proteomics datasets from various sources (MaxQuant, Spectronaut, etc) using a pipeline that facilitates peptide filtering and many algorithms for normalization and statistical analysis. A comprehensive PDF report can be generated that includes many data visualizations and documentation thereof.
URL: https://github.com/ftwkoopmans/msdap
Version: 1.1.1
Version: 1.1.2
Authors@R:
person(given = "Frank",
family = "Koopmans",
Expand Down
2 changes: 1 addition & 1 deletion R/dataset.R
Original file line number Diff line number Diff line change
Expand Up @@ -275,7 +275,7 @@ tibble_peptides_reorder = function(tib) {
#' @param peptides peptide tibble in long format
empty_protein_tibble = function(peptides) {
uprot = unique(peptides$protein_id)
return(tibble(protein_id = uprot, fasta_headers = uprot, gene_symbols = uprot, gene_symbols_ucount = 0L, gene_symbols_or_id = uprot))
return(tibble(protein_id = uprot, fasta_headers = uprot, gene_symbols = uprot, gene_symbol_ucount = 0L, gene_symbols_or_id = uprot))
}


Expand Down
9 changes: 7 additions & 2 deletions R/export_data_tables.R
Original file line number Diff line number Diff line change
Expand Up @@ -94,8 +94,13 @@ export_protein_abundance_matrix = function(dataset, rollup_algorithm, output_dir

# add protein metadata
tib = dataset$proteins %>%
select(protein_id, fasta_headers, gene_symbols_or_id, gene_symbol_ucount) %>%
inner_join(as_tibble(m) %>% add_column(protein_id = rownames(m)), by="protein_id") %>%
select(protein_id, fasta_headers, gene_symbols_or_id, tidyselect::any_of("gene_symbol_ucount")) %>%
inner_join(as_tibble(m) %>% add_column(protein_id = rownames(m)), by="protein_id")
# if gene symbol count is lacking (i.e. dataset from old MS-DAP version), sort such that proteingroups without a symbol go on top (by guessing if gene_symbols_or_id is a long-format accession)
if( ! "gene_symbol_ucount" %in% colnames(tib)) {
tib$gene_symbol_ucount = as.integer(!grepl("|", tib$gene_symbols_or_id, fixed = TRUE) & tib$protein_id != tib$gene_symbols_or_id) # count = 1 when not a protein identifier, 0 otherwise
}
tib = tib %>%
arrange(gene_symbol_ucount > 0, gene_symbols_or_id) %>% # proteins without gene symbol first, then sort by symbol
select(-gene_symbol_ucount)

Expand Down
2 changes: 1 addition & 1 deletion R/export_stats_genesummary.R
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ export_stats_genesummary = function(dataset, gene_ambiguity = "prio_specific", d
if(!(is.list(dataset) && "de_proteins" %in% names(dataset))) {
append_log("dataset parameter should be a dataset that contains DEA results (did you forget to setup contrasts and run the analysis_quickstart function, or dea() ?)", type = "error")
}
if(!(is.list(dataset) && "proteins" %in% names(dataset) && all(c("protein_id", "gene_symbols") %in% colnames(dataset$proteins)) && !all(dataset$proteins$protein_id == dataset$proteins$gene_symbols))) {
if(!(is.list(dataset) && "proteins" %in% names(dataset) && all(c("protein_id", "gene_symbols") %in% colnames(dataset$proteins)))) {
append_log("dataset is missing essential information in the 'gene_symbols' column of the protein table. No FASTA has been imported for this dataset or the dataset was analyzed with an outdated MS-DAP version.\nTo use this function you won't have to re-run the entire analysis pipeline for this dataset, just run the import_fasta() function prior to applying export_stats_genesummary() to update the current dataset object (assuming this is a dataset searched against a uniprot FASTA)", type = "error")
}
if(!(length(gene_ambiguity) == 1 && all(gene_ambiguity %in% c("leading_gene", "prio_specific", "only_specific")))) {
Expand Down
103 changes: 62 additions & 41 deletions R/process_peptide_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,33 +9,6 @@ peptides_collapse_by_sequence = function(peptides, prop_peptide = "sequence_plai



#' Import fasta file(s) that match your dataset.
#'
#' For fasta files from uniprot, we extract the gene symbols for each protein-group (not available for non-uniprot fasta files).
#'
#' @param dataset your dataset
#' @param files an array of filenames, these should be the full path
#' @param fasta_id_type what type of fasta files are these? options: "uniprot" (highly recommended) or otherwise any other character string (as we have no special rules for generic fasta files)
#' @param protein_separation_character the separation character for protein identifiers in your dataset. Most commonly this is a semicolon (eg; in maxquant/metamorpheus/skyline/etc.)
#' @export
import_fasta = function(dataset, files = NULL, fasta_id_type = "uniprot", protein_separation_character = ";") {
if(length(files) == 0) {
append_log("no fasta files were provided, downstream analysis/code will use protein IDs as surrogate for fasta headers", type = "info")
dataset$proteins = empty_protein_tibble(peptides)
} else {
dataset$proteins = import_protein_metadata_from_fasta(
protein_id = dataset$peptides %>% filter({if("isdecoy" %in% names(.)) isdecoy else F} != TRUE) %>% distinct(protein_id) %>% pull(),
fasta_files = files,
fasta_id_type = fasta_id_type,
protein_separation_character = protein_separation_character
)
}

return(dataset)
}



#' Remove all peptides that match some protein-level filters
#'
#' @examples
Expand Down Expand Up @@ -111,11 +84,46 @@ remove_proteins_by_name = function(dataset, remove_irt_peptides = FALSE, regular



#' placeholder title
#' @param protein_id todo
#' @param fasta_files todo
#' @param fasta_id_type todo
#' @param protein_separation_character todo
#' Import fasta file(s) that match your dataset.
#'
#' For fasta files from uniprot, we extract the gene symbols for each protein-group (not available for non-uniprot fasta files).
#'
#' @param dataset your dataset
#' @param files an array of filenames, these should be the full path
#' @param fasta_id_type what type of fasta files are these? options: "uniprot" (highly recommended) or otherwise any other character string (as we have no special rules for generic fasta files)
#' @param protein_separation_character the separation character for protein identifiers in your dataset. Most commonly this is a semicolon (eg; in maxquant/metamorpheus/skyline/etc.)
#' @return table where
#' protein_id = provided proteingroup identifier,
#' accessions = result from fasta_id_short applied to each semicolon-delimited element in protein_id (result is a semicolon-collapsed string),
#' fasta_headers = analogous to accessions, but matching the full FASTA header to each 'accessions' element,
#' gene_symbols = full set of gene symbols, '-' where missing in FASTA, matching each element in 'accessions',
#' gene_symbols_or_id = unique set of valid 'gene_symbol', or the FASTA full/long ID when there is no gene information
#' gene_symbol_ucount = number of unique gene_symbols for this proteingroup (i.e. unique valid elements in 'gene_symbols')
#' @export
import_fasta = function(dataset, files = NULL, fasta_id_type = "uniprot", protein_separation_character = ";") {
if(length(files) == 0) {
append_log("no fasta files were provided, downstream analysis/code will use protein IDs as surrogate for fasta headers", type = "info")
dataset$proteins = empty_protein_tibble(peptides)
} else {
dataset$proteins = import_protein_metadata_from_fasta(
protein_id = dataset$peptides %>% filter({if("isdecoy" %in% names(.)) isdecoy else F} != TRUE) %>% distinct(protein_id) %>% pull(),
fasta_files = files,
fasta_id_type = fasta_id_type,
protein_separation_character = protein_separation_character
)
}

return(dataset)
}



#' Parse fasta files and construct a protein metadata table for provided protein identifiers
#'
#' @param protein_id an array of protein identifiers (that should be available in provided `fasta_files`)
#' @param fasta_files an array of filenames, these should be the full path
#' @param fasta_id_type what type of fasta files are these? options: "uniprot" (highly recommended) or otherwise any other character string (as we have no special rules for generic fasta files)
#' @param protein_separation_character the separation character for protein identifiers in your dataset. Most commonly this is a semicolon (eg; in maxquant/metamorpheus/skyline/etc.)
#' @return table where
#' protein_id = provided proteingroup identifier,
#' accessions = result from fasta_id_short applied to each semicolon-delimited element in protein_id (result is a semicolon-collapsed string),
Expand All @@ -128,9 +136,9 @@ import_protein_metadata_from_fasta = function(protein_id, fasta_files, fasta_id_

#### read data from fasta files into tibble
fasta = fasta_parse_file(fasta_files, fasta_id_type) %>%
filter(isdecoy == FALSE) %>% # remove decoys from fasta file
# remove decoys from fasta file then drop column
filter(isdecoy == FALSE) %>%
select(-isdecoy)
fasta$gene_symbols_or_id = ifelse(nchar(fasta$gene) < 2, fasta$idlong, fasta$gene)

#### protein-to-accession mapping table
# convert protein_id to a list of respective accessions -->> long-format tibble
Expand All @@ -152,9 +160,14 @@ import_protein_metadata_from_fasta = function(protein_id, fasta_files, fasta_id_
# finally, remove all invalid mappings and select unique elements (eg; protein-group with same protein accession listed twice is de-duped)
prot2acc = prot2acc %>%
filter(!is.na(acc) & isdecoy == FALSE) %>%
# merge in the fasta metadata
left_join(fasta, by = c(acc = "idshort")) %>%
# take unique set, using input with `fasta` sorting as-is = for each idshort, use data from first fasta file (where idshort was found)
distinct(protein_id, acc_input, .keep_all = T)
# merge in the fasta metadata
prot2acc = prot2acc %>% left_join(fasta, by = c(acc = "idshort"))

# replace invalid/missing gene symbols with a '-'
# also removed uniprot-provided gene symbols like MGI:1111. example; ">sp|Q8BR90|CE051_MOUSE UPF0600 protein C5orf51 homolog OS=Mus musculus (Mouse) OX=10090 GN=MGI:2146232 PE=1 SV=1"
prot2acc$gene[is.na(prot2acc$gene) | nchar(prot2acc$gene) < 2 | grepl("^(MGI|RGD|HGNC):", prot2acc$gene)] = "-"

# report mapping success rate at accession and protein-group level
n_acc = nrow(prot2acc)
Expand All @@ -172,17 +185,22 @@ import_protein_metadata_from_fasta = function(protein_id, fasta_files, fasta_id_
print(prot2acc %>% filter(is.na(header)), n=25)
}


#### collapse at protein-group level
tib_result = prot2acc %>%
group_by(protein_id) %>%
summarise(
accessions = paste(acc, collapse = ";"),
fasta_headers = paste(header, collapse = ";"),
# full list of gene symbols, '-' where missing, matching number-of-elements in accessions/fasta_headers
gene_symbols = paste(ifelse(nchar(gene) < 2, "-", gene), collapse = ";"),
gene_symbol_ucount = n_distinct(gene[nchar(gene) >= 2]),
# gene_symbols_or_id; take unique values, remove NA, remove strings of less than 2 characters
gene_symbols_or_id = paste(remove_by_charlength(unique(gene_symbols_or_id), minchar = 2), collapse = ";")
gene_symbols = paste(gene, collapse = ";"),
gene_symbol_ucount = n_distinct(gene[gene != "-"]),
# gene_symbols_or_id; take unique genes (remove missing/empty). If there are none, take (all) long-format accessions
gene_symbols_or_id = ifelse(
gene_symbol_ucount > 0,
paste(unique(gene[gene != "-"]), collapse = ";"), # protein_id without gene symbol are ignored / assumed same as gene symbols that we did find
paste(idlong, collapse = ";")
)
)

rows_fail = is.na(tib_result$gene_symbols_or_id) | tib_result$gene_symbols_or_id == ""
Expand All @@ -198,8 +216,11 @@ import_protein_metadata_from_fasta = function(protein_id, fasta_files, fasta_id_
}

rows_miss_fasta = is.na(tib_result$fasta_headers)
tib_result$fasta_headers[rows_miss_fasta] = tib_result$gene_symbols_or_id[rows_miss_fasta] = tib_result$protein_id[rows_miss_fasta]
if(any(rows_miss_fasta)) {
tib_result$fasta_headers[rows_miss_fasta] = tib_result$gene_symbols_or_id[rows_miss_fasta] = tib_result$protein_id[rows_miss_fasta]
}

# debug: tib_result %>% filter(gene_symbol_ucount ==0) %>% select(fasta_headers) %>% print(n=100)
return(tib_result)
}

Expand Down
3 changes: 3 additions & 0 deletions R/stats_summary.R
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,9 @@ summarise_stats__for_contrast = function(dataset, return_dea, return_diffdetect,
if(!all(c("protein_id", "gene_symbols_or_id") %in% colnames(dataset$proteins)) || !is.character(dataset$proteins$gene_symbols_or_id) || any(dataset$proteins$gene_symbols_or_id == "") ) {
append_log("requires MS-DAP dataset where the proteins table contains gene symbols (did you forget to import_fasta() ?)", type = "error")
}
if(!(is.list(dataset) && "proteins" %in% names(dataset) && all(c("protein_id", "gene_symbols") %in% colnames(dataset$proteins)))) {
append_log("dataset is missing essential information in the 'gene_symbols' column of the protein table. No FASTA has been imported for this dataset or the dataset was analyzed with an outdated MS-DAP version.\nTo use this function you won't have to re-run the entire analysis pipeline for this dataset, just run the import_fasta() function prior to applying export_stats_genesummary() to update the current dataset object (assuming this is a dataset searched against a uniprot FASTA)", type = "error")
}
if(length(return_dea) != 1 || ! return_dea %in% c(TRUE, FALSE)) {
append_log("return_dea must be single boolean", type = "error")
}
Expand Down
4 changes: 2 additions & 2 deletions docker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,8 @@ RUN R -e "devtools::install_version('remaCor', '0.0.18', upgrade = 'never', repo
### MS-DAP
# from local devtools::build() file
RUN mkdir /msdap
COPY temp/msdap_1.1.1.tar.gz /msdap/
RUN R -e "devtools::install_local('/msdap/msdap_1.1.1.tar.gz', upgrade = 'never')"
COPY temp/msdap_1.1.2.tar.gz /msdap/
RUN R -e "devtools::install_local('/msdap/msdap_1.1.2.tar.gz', upgrade = 'never')"
# alternatively, install from github; RUN R -e "devtools::install_github('ftwkoopmans/[email protected]', upgrade = 'never')"


Expand Down
2 changes: 1 addition & 1 deletion docker/msdap_launcher_unix.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# MS-DAP launch script
# https://github.com/ftwkoopmans/msdap

VERSION="1.1.1"
VERSION="1.1.2"


### OS
Expand Down
2 changes: 1 addition & 1 deletion docker/msdap_launcher_windows.ps1
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# MS-DAP launch script
# https://github.com/ftwkoopmans/msdap

$VERSION = "1.1.1"
$VERSION = "1.1.2"


Write-Host "$((Get-Date).ToString("HH:mm:ss")) - Starting MS-DAP launcher script for version: $VERSION"
Expand Down
9 changes: 9 additions & 0 deletions man/import_fasta.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 6 additions & 6 deletions man/import_protein_metadata_from_fasta.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 27aad7d

Please sign in to comment.