Skip to content

Commit

Permalink
release 1.1.1
Browse files Browse the repository at this point in the history
  • Loading branch information
ftwkoopmans committed Aug 9, 2024
1 parent 9239384 commit 24e069c
Show file tree
Hide file tree
Showing 28 changed files with 1,092 additions and 662 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ Package: msdap
Title: Mass Spectrometry Downstream Analysis Pipeline
Description: Analyze label-free proteomics datasets from various sources (MaxQuant, Spectronaut, etc) using a pipeline that facilitates peptide filtering and many algorithms for normalization and statistical analysis. A comprehensive PDF report can be generated that includes many data visualizations and documentation thereof.
URL: https://github.com/ftwkoopmans/msdap
Version: 1.1
Version: 1.1.1
Authors@R:
person(given = "Frank",
family = "Koopmans",
Expand Down
6 changes: 3 additions & 3 deletions R/dataset.R
Original file line number Diff line number Diff line change
Expand Up @@ -252,8 +252,8 @@ check_valid_tibble_samples = function(tib) {
#'
#' @param peptides peptide tibble in long format
log_peptide_tibble_pep_prot_counts = function(peptides) {
report_unique = peptides %>% group_by(isdecoy) %>% summarise(n_precursor=n_distinct(peptide_id), n_seq=n_distinct(sequence_plain), n_prot=n_distinct(protein_id)) %>% arrange(isdecoy)
append_log(sprintf("%d target precursors, %d (plain)sequences, %d proteins", report_unique$n_precursor[1], report_unique$n_seq[1], report_unique$n_prot[1]), type = "info")
report_unique = peptides %>% group_by(isdecoy) %>% summarise(n_precursor=n_distinct(peptide_id), n_seq=n_distinct(sequence_plain), n_prot=n_distinct(protein_id), n_sample=n_distinct(sample_id)) %>% arrange(isdecoy)
append_log(sprintf("%d target precursors, %d (plain)sequences, %d proteins, %d samples", report_unique$n_precursor[1], report_unique$n_seq[1], report_unique$n_prot[1], report_unique$n_sample[1]), type = "info")
if(nrow(report_unique) > 1) { # if there are decoys
append_log(sprintf("%d decoy precursors, %d (plain)sequences, %d proteins", report_unique$n_precursor[2], report_unique$n_seq[2], report_unique$n_prot[2]), type = "info")
}
Expand All @@ -275,7 +275,7 @@ tibble_peptides_reorder = function(tib) {
#' @param peptides peptide tibble in long format
empty_protein_tibble = function(peptides) {
uprot = unique(peptides$protein_id)
return(tibble(protein_id = uprot, fasta_headers = uprot, gene_symbols = uprot, gene_symbols_or_id = uprot))
return(tibble(protein_id = uprot, fasta_headers = uprot, gene_symbols = uprot, gene_symbols_ucount = 0L, gene_symbols_or_id = uprot))
}


Expand Down
12 changes: 7 additions & 5 deletions R/export_data_tables.R
Original file line number Diff line number Diff line change
Expand Up @@ -94,9 +94,10 @@ export_protein_abundance_matrix = function(dataset, rollup_algorithm, output_dir

# add protein metadata
tib = dataset$proteins %>%
select(protein_id, fasta_headers, gene_symbols_or_id) %>%
select(protein_id, fasta_headers, gene_symbols_or_id, gene_symbol_ucount) %>%
inner_join(as_tibble(m) %>% add_column(protein_id = rownames(m)), by="protein_id") %>%
arrange(gene_symbols_or_id!=protein_id, gene_symbols_or_id) # proteins without gene symbol first, then sort by symbol
arrange(gene_symbol_ucount > 0, gene_symbols_or_id) %>% # proteins without gene symbol first, then sort by symbol
select(-gene_symbol_ucount)

## write to file
# generate filename. if very long (eg; huge contrast name + long path in output_dir), try to shorting with md5 hash
Expand Down Expand Up @@ -135,11 +136,12 @@ export_peptide_abundance_matrix = function(dataset, output_dir) {
tibw = tib %>%
pivot_wider(id_cols = c(protein_id, peptide_id), names_from = "sample_id", values_from = "intensity") %>%
# add protein metadata
left_join(dataset$proteins %>% select(protein_id, gene_symbols_or_id), by = "protein_id") %>%
left_join(dataset$proteins %>% select(protein_id, gene_symbols_or_id, gene_symbol_ucount), by = "protein_id") %>%
# sort columns by order in sample metadata table
select(gene_symbols_or_id, protein_id, peptide_id, !!sid) %>%
select(gene_symbols_or_id, gene_symbol_ucount, protein_id, peptide_id, !!sid) %>%
# sort output table
arrange(gene_symbols_or_id!=protein_id, gene_symbols_or_id, protein_id, peptide_id) # proteins without gene symbol first, then sort by symbol
arrange(gene_symbol_ucount > 0, gene_symbols_or_id, protein_id, peptide_id) %>% # proteins without gene symbol first, then sort by symbol
select(-gene_symbol_ucount)

## write to file
# generate filename. if very long (eg; huge contrast name + long path in output_dir), try to shorting with md5 hash
Expand Down
590 changes: 590 additions & 0 deletions R/parse_fragpipe.R

Large diffs are not rendered by default.

369 changes: 0 additions & 369 deletions R/parse_fragpipe_ionquant.R

This file was deleted.

143 changes: 0 additions & 143 deletions R/parse_fragpipe_psm.R

This file was deleted.

2 changes: 1 addition & 1 deletion R/parse_longformat_generic.R
Original file line number Diff line number Diff line change
Expand Up @@ -274,7 +274,7 @@ update_protein_mapping = function(tib, pep2prot) {
#'
#' @importFrom data.table as.data.table chmatch setkey setorder
#' @export
import_dataset_in_long_format = function(filename=NULL, x = NULL, attributes_required=list(), attributes_optional = list(), confidence_threshold = 0.01,
import_dataset_in_long_format = function(filename = NULL, x = NULL, attributes_required=list(), attributes_optional = list(), confidence_threshold = 0.01,
remove_peptides_never_above_confidence_threshold = TRUE, select_unique_precursor_per_modseq = TRUE,
custom_func_manipulate_DT_onload = NULL,
return_decoys = TRUE, do_plot = TRUE) {
Expand Down
36 changes: 21 additions & 15 deletions R/process_peptide_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -116,37 +116,43 @@ remove_proteins_by_name = function(dataset, remove_irt_peptides = FALSE, regular
#' @param fasta_files todo
#' @param fasta_id_type todo
#' @param protein_separation_character todo
#' @return table where
#' protein_id = provided proteingroup identifier,
#' accessions = result from fasta_id_short applied to each semicolon-delimited element in protein_id (result is a semicolon-collapsed string),
#' fasta_headers = analogous to accessions, but matching the full FASTA header to each 'accessions' element,
#' gene_symbols = full set of gene symbols, '-' where missing in FASTA, matching each element in 'accessions',
#' gene_symbols_or_id = unique set of valid 'gene_symbol', or the FASTA full/long ID when there is no gene information
#' gene_symbol_ucount = number of unique gene_symbols for this proteingroup (i.e. unique valid elements in 'gene_symbols')
import_protein_metadata_from_fasta = function(protein_id, fasta_files, fasta_id_type = "uniprot", protein_separation_character = ";") {
protein_id = unique(protein_id)

#### read data from fasta files into tibble
fasta = fasta_parse_file(fasta_files, fasta_id_type)
fasta$gene_symbols_or_id = ifelse(fasta$gene == "", fasta$idlong, fasta$gene)
fasta = fasta_parse_file(fasta_files, fasta_id_type) %>%
filter(isdecoy == FALSE) %>% # remove decoys from fasta file
select(-isdecoy)
fasta$gene_symbols_or_id = ifelse(nchar(fasta$gene) < 2, fasta$idlong, fasta$gene)

#### protein-to-accession mapping table
# convert protein_id to a list of respective accessions -->> long-format tibble
l = strsplit(protein_id, protein_separation_character, fixed = T)
prot2acc = tibble(
protein_id = rep(protein_id, lengths(l)),
acc_input = unlist(l, use.names = F)
acc_input = unlist(l, use.names = F),
isdecoy = FALSE
)

# extract the accession
prot2acc$acc = fasta_id_short(prot2acc$acc_input, fasta_id_type = fasta_id_type)
# accession cannot be less than 3 characters
prot2acc$acc[nchar(prot2acc$acc) < 3] = NA
# Filter valid accessions by regex. contaminant or decoy entries are not allowed. eg; CON_ REV_ DECOY_
# Check for ids that start or end with CON, REV or DECOY. Do this for both the full protein id and idshort.
# eg; suppose that input software prefixes the full ID with reverse: REV_sp|abc123|HUMAN_EXAMPLE -->> the reverse tag is lost in fasta_id_short(...)
# note that we cannot blindly match "_CON" to protein identifiers as this could fail for uniprot long IDs, eg; sp|abc123|HUMAN_CONTACTIN or whatever
prot2acc$acc[!is.na(prot2acc$acc) & (grepl("^CON_|_CON$|^REV_|_REV$|^DECOY_|_DECOY$", prot2acc$acc, ignore.case = T) |
grepl("^CON_|_CON$|^REV_|_REV$|^DECOY_|_DECOY$", prot2acc$acc_input, ignore.case = T))] = NA
# prot2acc$acc[!grepl("^[[:alnum:]-]+$", prot2acc$acc)] = NA # previous implementation only supports uniprot/ensembl, but not refseq where fasta headers are like; >NP_000006.2 arylamine N-acetyltransferase 2 [Homo sapiens]
# Filter valid accessions by regex, decoy entries are not allowed. eg; REV_ DECOY_
# Check both full and "short" IDs. eg; suppose that input software prefixes the full ID with reverse: REV_sp|abc123|HUMAN_EXAMPLE -->> the reverse tag is lost in fasta_id_short(...)
prot2acc$isdecoy = !is.na(prot2acc$acc) & (fasta_identifier_isdecoy(prot2acc$acc_input) | fasta_identifier_isdecoy(prot2acc$acc))

# finally, remove all invalid mappings and select unique elements (eg; protein-group with same protein accession listed twice is de-duped)
prot2acc = prot2acc %>%
drop_na() %>%
distinct(.keep_all = T)
filter(!is.na(acc) & isdecoy == FALSE) %>%
distinct(protein_id, acc_input, .keep_all = T)
# merge in the fasta metadata
prot2acc = prot2acc %>% left_join(fasta, by = c(acc = "idshort"))

Expand All @@ -172,9 +178,9 @@ import_protein_metadata_from_fasta = function(protein_id, fasta_files, fasta_id_
summarise(
accessions = paste(acc, collapse = ";"),
fasta_headers = paste(header, collapse = ";"),
# full list of gene symbols, NA where missing
gene_symbols = paste(ifelse(is.na(gene) | nchar(gene) < 2, "-", gene), collapse = ";"), # for consistency, collapse
# gene_symbols = list(ifelse(is.na(gene) | nchar(gene) < 2, NA_character_, gene)), # alternatively, store as list
# full list of gene symbols, '-' where missing, matching number-of-elements in accessions/fasta_headers
gene_symbols = paste(ifelse(nchar(gene) < 2, "-", gene), collapse = ";"),
gene_symbol_ucount = n_distinct(gene[nchar(gene) >= 2]),
# gene_symbols_or_id; take unique values, remove NA, remove strings of less than 2 characters
gene_symbols_or_id = paste(remove_by_charlength(unique(gene_symbols_or_id), minchar = 2), collapse = ";")
)
Expand Down
Loading

0 comments on commit 24e069c

Please sign in to comment.