diff --git a/DESCRIPTION b/DESCRIPTION
index e79e5d8..2ffb5ac 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -3,7 +3,7 @@ Package: msdap
 Title: Mass Spectrometry Downstream Analysis Pipeline
 Description: Analyze label-free proteomics datasets from various sources (MaxQuant, Spectronaut, etc) using a pipeline that facilitates peptide filtering and many algorithms for normalization and statistical analysis. A comprehensive PDF report can be generated that includes many data visualizations and documentation thereof.
 URL: https://github.com/ftwkoopmans/msdap
-Version: 1.2
+Version: 1.2.1
 Authors@R: 
     person(given = "Frank",
            family = "Koopmans",
diff --git a/NAMESPACE b/NAMESPACE
index ebacf7a..e6d553e 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -59,6 +59,7 @@ export(limma_wrapper)
 export(logger.default)
 export(matrix_to_long)
 export(merge_fractionated_samples)
+export(merge_proteingroups_by_gene)
 export(merge_replicate_samples)
 export(mgi_lookuptable)
 export(minlog10)
diff --git a/R/dea.R b/R/dea.R
index 2bc2565..2edec4f 100644
--- a/R/dea.R
+++ b/R/dea.R
@@ -210,6 +210,10 @@ dea = function(dataset, qval_signif = 0.01, fc_signif = 0, dea_algorithm = "deqm
     peptides_for_contrast = dataset$peptides %>%
       select(sample_id, protein_id, peptide_id, sequence_plain, sequence_modified, detect, intensity=!!as.character(col_contr_intensity), any_of(c("confidence","charge"))) %>%
       filter(sample_id %in% samples_for_contrast$sample_id & is.finite(intensity))
+    # if(!all(samples_for_contrast$sample_id %in% unique(peptides_for_contrast$sample_id)) ) {
+    #   append_log("some samples that are defined in the current contrast do not have any datapoints in the dataset$peptides table (at currently selected 'peptide filter') column", type = "error")
+    # }
+
     # peptide ExpressionSet
     eset_peptides = tibble_as_eset(peptides_for_contrast, dataset$proteins, samples_for_contrast)
     # rollup peptide abundance matrix to protein-level
diff --git a/R/export_stats_genesummary.R b/R/export_stats_genesummary.R
index 526f958..472dfc1 100644
--- a/R/export_stats_genesummary.R
+++ b/R/export_stats_genesummary.R
@@ -215,9 +215,70 @@ export_stats_genesummary = function(dataset, gene_ambiguity = "prio_specific", d
   )
 
 
+  # the allstats table is already sorted by "best hits on top" (last few lines @ summarise_stats() function; `arrange(desc(signif), pvalue)` )
+  result = disambiguate_protein_table_by_gene(protein_data = allstats, hgnc = hgnc, gene_ambiguity = gene_ambiguity, xref = xref, distinct_factors = c("contrast", "dea_algorithm"), remove_nohgnc = remove_nohgnc) %>%
+    arrange(desc(signif), pvalue, gene_symbols_or_id)
+
+
+  ### print output tables
+  if(is.na(output_dir)) {
+    append_log("output_dir parameter set to NA, skip writing Excel output tables", type = "info")
+  } else {
+    for(contr in unique(result$contrast)) {
+      for(algo_dea in unique(result$dea_algorithm)) {
+        tmp = result %>% filter(contrast == contr & dea_algorithm == algo_dea)
+        f = sprintf("%s/%s__%s.xlsx", output_dir, filename_strip_illegal_characters(sub("contrast: *", "", contr), strict = TRUE), algo_dea)
+        openxlsx::write.xlsx(tmp, f)
+      }
+    }
+  }
+
+  return(invisible(result))
+}
+
+
+
+#' given some table with protein-level data, map to HGNC genes and deal with redundant or ambiguous gene-level results
+#'
+#' @description
+#' The order of the data table is pivotal: the first row for each unique gene (criteria in parameter `gene_ambiguity`)) is selected !
+#'
+#' @param protein_data a data.frame with columns (from dataset$proteins) protein_id, gene_symbols, gene_symbols_or_id.
+#' Importantly, if you opt to return only unique entries per gene then the row of the FIRST matching protein_id is
+#' returned, so sort your data table by pvalue upstream !
+#' @param hgnc see `export_stats_genesummary()`
+#' @param gene_ambiguity see `export_stats_genesummary()`
+#' @param xref see `export_stats_genesummary()`
+#' @param remove_nohgnc see `export_stats_genesummary()`
+#' @param distinct_factors a set of columns in protein_data that should be considered (together with protein_id) factors that describe subsets of data (to return unique gene-level data within)
+disambiguate_protein_table_by_gene = function(protein_data, hgnc, gene_ambiguity, xref = NULL, remove_nohgnc = FALSE, distinct_factors = NULL) {
+
+  ### input validation
+  if(!(is.data.frame(hgnc) && all(c("hgnc_id", "hgnc_symbol", "type", "value") %in% colnames(hgnc)))) {
+    append_log("hgnc parameter should be a data.frame with columns hgnc_id, hgnc_symbol, type, value (output from hgnc_lookuptable() function)", type = "error")
+  }
+  if(!(length(gene_ambiguity) == 1 && all(gene_ambiguity %in% c("leading_gene", "prio_specific", "only_specific")))) {
+    append_log("gene_ambiguity parameter should be 1 of the following options: leading_gene, prio_specific, only_specific", type = "error")
+  }
+  if(!is.null(xref) && !(is.data.frame(xref) && nrow(xref) > 0 && ncol(xref) >= 3 && all(c("symbol", "uniprot_id") %in% colnames(xref)))) {
+    append_log("xref parameter should be a data.frame with columns mgi_id/rgd_id, symbol, uniprot_id (output from mgi_lookuptable() or rgd_lookuptable() function)", type = "error")
+  }
+  if(!is.null(xref) && !colnames(xref)[1] %in% c("mgi_id", "rgd_id")) {
+    append_log("xref parameter only works with MGI and RGD atm (output from mgi_lookuptable() or rgd_lookuptable() function): first column must be mgi_id or rgd_id", type = "error")
+  }
+  if(!(length(remove_nohgnc) == 1 && remove_nohgnc %in% c(TRUE, FALSE))) {
+    append_log("remove_nohgnc parameter should be TRUE or FALSE", type = "error")
+  }
+  if(!(is.data.frame(protein_data) && all(c("protein_id", "gene_symbols", "gene_symbols_or_id") %in% colnames(protein_data)))) {
+    append_log("protein_data parameter should be a data.frame with columns protein_id, gene_symbols and gene_symbols_or_id", type = "error")
+  }
+  if(!(is.null(distinct_factors) || (is.character(distinct_factors) & all(distinct_factors %in% colnames(protein_data))))) {
+    append_log("distinct_factors parameter should be NULL or a set of columns in protein_data that should be considered (together with protein_id) factors that describe subsets of data (to return unique gene-level data within)", type = "error")
+  }
+
 
   ### create long-format table of uniprot_id*symbol for each unique proteingroup
-  proteins = allstats %>%
+  proteins = protein_data %>%
     select(protein_id, gene_symbols) %>%
     distinct(protein_id, .keep_all = TRUE) %>%
     mutate(uniprot_id = strsplit(protein_id, "[ ,;]+"),
@@ -242,24 +303,24 @@ export_stats_genesummary = function(dataset, gene_ambiguity = "prio_specific", d
 
 
 
-  ### now join the unique proteingroup-to-HGNC mappings with table `allstats`
+  ### now join the unique proteingroup-to-HGNC mappings with table `protein_data`
   # hgnc_list = a list column where each protein_id has an array of HGNC identifiers (length > 0), NA where missing/failed
-  allstats = allstats %>%
+  protein_data = protein_data %>%
     left_join(proteins_long %>% select(protein_id, hgnc_list = hgnc_id) %>% tidyr::chop(cols = "hgnc_list"), by = "protein_id")
 
 
 
   ### apply mapping rules
-  ### importantly, sorting of `allstats` table prior to running below code MUST be ascending by pvalue
+  ### importantly, sorting of `protein_data` table prior to running below code MUST be ascending by pvalue
 
-  allstats$hgnc_list_unique_notna = lapply(allstats$hgnc_list, function(g) { unique(na.omit(g)) }) # naive/slow, but prefer simple code
-  # dev note: don't use `allstats %>% filter(lengths(hgnc_list) == 1)` because a proteingroup can contain multiple uniprot identifiers that all map to the same gene --> hgnc_list will not be length 1
+  protein_data$hgnc_list_unique_notna = lapply(protein_data$hgnc_list, function(g) { unique(na.omit(g)) }) # naive/slow, but prefer simple code
+  # dev note: don't use `protein_data %>% filter(lengths(hgnc_list) == 1)` because a proteingroup can contain multiple uniprot identifiers that all map to the same gene --> hgnc_list will not be length 1
 
-  allstats$isambiguous = grepl(";", allstats$gene_symbols_or_id, fixed = TRUE) # infer ambiguous directly from uniprot gene symbols (i.e. failed mapping to HGNC have no effect)
-  #  allstats$isambiguous = lengths(allstats$hgnc_list_unique_notna) > 1  # alternatively, determine "ambiguous" from successful HGNC mappings @ hgnc_list_unique_notna
+  protein_data$isambiguous = grepl(";", protein_data$gene_symbols_or_id, fixed = TRUE) # infer ambiguous directly from uniprot gene symbols (i.e. failed mapping to HGNC have no effect)
+  #  protein_data$isambiguous = lengths(protein_data$hgnc_list_unique_notna) > 1  # alternatively, determine "ambiguous" from successful HGNC mappings @ hgnc_list_unique_notna
 
   # take the first non-NA HGNC identifier (NA if none)
-  allstats$hgnc_id = unlist(lapply(allstats$hgnc_list_unique_notna, function(g) {
+  protein_data$hgnc_id = unlist(lapply(protein_data$hgnc_list_unique_notna, function(g) {
     # always return exactly 1 value, thus downstream unlist() will yield an array that exactly matches table length
     if(length(g) == 0) return(NA_character_)
     return(g[1])
@@ -268,26 +329,25 @@ export_stats_genesummary = function(dataset, gene_ambiguity = "prio_specific", d
   if(gene_ambiguity == "only_specific") {
     # invalidate all entries that are ambiguous
     # importantly, do this before computing `hgnc_id_FORUNIQUE`
-    allstats$hgnc_id[allstats$isambiguous] = NA_character_
+    protein_data$hgnc_id[protein_data$isambiguous] = NA_character_
   }
 
   if(gene_ambiguity == "prio_specific") {
     # sort table such that ambiguous hits are at the bottom; importantly, ties-of-ambiguity are still sorted as-is
     # taking first match per gene downstream = prio specific/unambiguous over ambiguous proteingroup that has a "better pvalue"
-    allstats = allstats %>% arrange(isambiguous)
+    protein_data = protein_data %>% arrange(isambiguous)
   }
 
   # ! importantly, add a unique ID to proteingroups not mapped to HGNC so we can use distinct() downstream without grouping NA values (failed mappings)
-  allstats$hgnc_id_FORUNIQUE = ifelse(is.na(allstats$hgnc_id), allstats$gene_symbols_or_id, allstats$hgnc_id)
+  protein_data$hgnc_id_FORUNIQUE = ifelse(is.na(protein_data$hgnc_id), protein_data$gene_symbols_or_id, protein_data$hgnc_id)
   # result table: retain unique rows per gene*contrast*dea_algorithm (assuming table has been sorted upstream)
-  result = allstats %>% distinct(hgnc_id_FORUNIQUE, contrast, dea_algorithm, .keep_all = TRUE)
+  distinct_factors = unique(c("hgnc_id_FORUNIQUE", distinct_factors))
+  result = protein_data %>% distinct(dplyr::across(tidyselect::all_of(distinct_factors)), .keep_all = TRUE)
 
 
 
-  ### finally, stable sort result table + add gene database crossreferences
-  # sort table again ("prio_specific" option affects sorting, so here enforce stable ordering)
+  ### add gene database crossreferences
   result = result %>%
-    arrange(desc(signif), pvalue) %>%
     select(-tidyselect::any_of(c("gene_symbols", "symbol", "hgnc_list", "hgnc_list_unique_notna", "hgnc_id_FORUNIQUE", "isambiguous"))) %>%
     hgnc_add_xrefs(hgnc = hgnc)
   # add named column specifically for GOAT
@@ -295,7 +355,7 @@ export_stats_genesummary = function(dataset, gene_ambiguity = "prio_specific", d
 
 
 
-  tmp = allstats %>% distinct(protein_id, .keep_all = TRUE)
+  tmp = protein_data %>% distinct(protein_id, .keep_all = TRUE)
   log_pg_ambiguous = sum(tmp$isambiguous)
 
   if(gene_ambiguity == "only_specific") {
@@ -332,20 +392,5 @@ export_stats_genesummary = function(dataset, gene_ambiguity = "prio_specific", d
     result = result %>% filter(!is.na(hgnc_id))
   }
 
-
-
-  ### print output tables
-  if(is.na(output_dir)) {
-    append_log("output_dir parameter set to NA, skip writing Excel output tables", type = "info")
-  } else {
-    for(contr in unique(result$contrast)) {
-      for(algo_dea in unique(result$dea_algorithm)) {
-        tmp = result %>% filter(contrast == contr & dea_algorithm == algo_dea)
-        f = sprintf("%s/%s__%s.xlsx", output_dir, filename_strip_illegal_characters(sub("contrast: *", "", contr), strict = TRUE), algo_dea)
-        openxlsx::write.xlsx(tmp, f)
-      }
-    }
-  }
-
-  return(invisible(result))
+  return(result)
 }
diff --git a/R/gene_idmapping.R b/R/gene_idmapping.R
index d9d6400..99ccfd3 100644
--- a/R/gene_idmapping.R
+++ b/R/gene_idmapping.R
@@ -242,7 +242,7 @@ rgd_lookuptable = function(f) {
 #' helper function to match a protein table to HGNC via gene symbols (and synonyms)
 #'
 #' @description typically called from `export_stats_genesummary`
-#' @param proteins_long long-format data.frame with columns protein_id and symbol pairs (i.e. no semicolon-delimted values)
+#' @param proteins_long long-format data.frame with columns protein_id and symbol pairs (i.e. no semicolon-delimited values)
 #' @param hgnc HGNC lookup table from `hgnc_lookuptable()`
 #' @param use_synonyms boolean; use synonyms? if FALSE, only uses hgnc_symbol
 idmap_symbols = function(proteins_long, hgnc, use_synonyms = TRUE) {
@@ -290,13 +290,13 @@ idmap_symbols = function(proteins_long, hgnc, use_synonyms = TRUE) {
 #' helper function to match a protein table to HGNC via an intermediate mapping table (e.g. MGI/RGD)
 #'
 #' @description typically called from `export_stats_genesummary`
-#' @param proteins_long long-format data.frame with columns protein_id and symbol pairs (i.e. no semicolon-delimted values)
+#' @param proteins_long long-format data.frame with columns uniprot_id and symbol pairs (i.e. no semicolon-delimited values)
 #' @param hgnc HGNC lookup table from `hgnc_lookuptable()`
 #' @param idmap MGI or RGD lookup table from `mgi_lookuptable()` or `rgd_lookuptable()`
 #' @param use_symbols boolean; use symbols to map from proteins_long to the idmap table? if FALSE, only matches by uniprot identifier
 idmap_uniprotid = function(proteins_long, hgnc, idmap, use_symbols = TRUE) {
   if(!is.data.frame(proteins_long) || nrow(proteins_long) == 0 || !all(c("uniprot_id", "symbol") %in% colnames(proteins_long)) ) {
-    append_log("proteins_long parameter must be a data.frame with columns 'protein_id' and 'symbol'", type = "error")
+    append_log("proteins_long parameter must be a data.frame with columns 'uniprot_id' and 'symbol'", type = "error")
   }
   if(!is.data.frame(hgnc) || nrow(hgnc) == 0 || !all(c("hgnc_id", "hgnc_symbol", "type", "value") %in% colnames(hgnc)) ) {
     append_log("hgnc parameter must be a data.frame with columns 'hgnc_id', 'hgnc_symbol', 'type', 'value' as typically prepared using the hgnc_lookuptable() function", type = "error")
@@ -329,10 +329,12 @@ idmap_uniprotid = function(proteins_long, hgnc, idmap, use_symbols = TRUE) {
   proteins_long$xref_id[rows] = idmap$XREF_ID_VALUES__TEMPCOL[match(proteins_long$uniprot_id[rows], idmap$uniprot_id)]
 
   # fallback to symbol matching (no `xref_id` yet)
-  rows = is.na(proteins_long$hgnc_id) & is.na(proteins_long$xref_id) & #
-    # double-check that we don't even attempt to match missing gene symbols
-    !is.na(proteins_long$symbol) & nchar(proteins_long$symbol) >= 2
-  proteins_long$xref_id[rows] = idmap$XREF_ID_VALUES__TEMPCOL[match(toupper(proteins_long$symbol[rows]), toupper(idmap$symbol))]
+  if(use_symbols) {
+    rows = is.na(proteins_long$hgnc_id) & is.na(proteins_long$xref_id) & #
+      # double-check that we don't even attempt to match missing gene symbols
+      !is.na(proteins_long$symbol) & nchar(proteins_long$symbol) >= 2
+    proteins_long$xref_id[rows] = idmap$XREF_ID_VALUES__TEMPCOL[match(toupper(proteins_long$symbol[rows]), toupper(idmap$symbol))]
+  }
 
 
   ### map to HGNC using crossreference identifiers
diff --git a/R/plot_sample_pca.R b/R/plot_sample_pca.R
index 9f9343b..7730501 100644
--- a/R/plot_sample_pca.R
+++ b/R/plot_sample_pca.R
@@ -7,20 +7,25 @@
 #' @param sample_label_property labels used to represent the samples. Set to "auto" to automatically select based on number of samples (default). possible values; auto, shortname, index, index_asis
 #' @param pch_as_exclude should exclude samples be represented with a distinct symbol shape? possible values; FALSE, TRUE (default)
 #' @param infer_continuous_scale should the legends infer continuous scale from the data? FALSE = all categorical, TRUE = automatic (default)
+#' @param pca_dims a list of integer pairs indicating PCA plots to make
 #' @importFrom pcaMethods pca
 #' @importFrom ggrepel geom_text_repel
 #' @importFrom ggpubr ggarrange annotate_figure text_grob
 #' @importFrom gtools mixedsort
 #' @importFrom scales rescale
-plot_sample_pca = function(matrix_sample_intensities, samples, samples_colors, sample_label_property = "auto", pch_as_exclude = TRUE, infer_continuous_scale = TRUE) {
+plot_sample_pca = function(matrix_sample_intensities, samples, samples_colors, sample_label_property = "auto", pch_as_exclude = TRUE, infer_continuous_scale = TRUE, pca_dims = list(1:2, c(1, 3), 2:3)) {
   if(!sample_label_property %in% c("auto", "shortname", "index", "index_asis")) {
     append_log(paste("invalid value for parameter 'sample_label_property':", sample_label_property), type = "error")
   }
   if(!all(c("sample_index", "sample_id", "shortname") %in% colnames(samples))) {
     append_log(paste(paste(c("sample_index", "sample_id", "shortname"), collapse = ", "), "are required column in the samples table"), type = "error")
   }
+  if(!(is.list(pca_dims) && all(unlist(lapply(pca_dims, function(x) length(x) == 2 && is.numeric(x) && all.equal(x, round(x)) == TRUE))) && max(unlist(pca_dims)) <= 15) ) {
+    append_log("pca_dims parameter must be a list that contains integer value pairs (max value: 15)", type = "error")
+  }
+
 
-  PPCA = pcaMethods::pca(base::scale(t(matrix_sample_intensities), center=TRUE, scale=FALSE), method="ppca", nPcs = 3, seed = 123, maxIterations = 2000)
+  PPCA = pcaMethods::pca(base::scale(t(matrix_sample_intensities), center=TRUE, scale=FALSE), method="ppca", nPcs = as.integer(max(unlist(pca_dims))), seed = 123, maxIterations = 2000)
   mat_pca = PPCA@scores # rownames = sample_id
   pca_var = array(PPCA@R2, dimnames = list(colnames(PPCA@scores)))
 
@@ -50,7 +55,7 @@ plot_sample_pca = function(matrix_sample_intensities, samples, samples_colors, s
     }
 
     plotlist = list()
-    for (dims in list(1:2, c(1, 3), 2:3)) { # dims=1:2
+    for (dims in pca_dims) { # dims=1:2
       # extract data from PCA object  &  join with sample metadata to get color-coding and label
       tib = tibble(x = mat_pca[, dims[1]], y = mat_pca[, dims[2]], sample_id = rownames(mat_pca)) %>%
         left_join(samples %>% select(sample_id, label=!!prop_sample_labels, prop=!!prop), by="sample_id") %>%
@@ -230,112 +235,3 @@ plot_sample_pca__sample_in_contrast = function(dataset, contr, sample_label_prop
   # return the results of the respective plot function
   return( suppressWarnings(plot_sample_pca(as_matrix_except_first_column(tibw), samples = dataset$samples, samples_colors = samples_colors, sample_label_property = sample_label_property)) )
 }
-
-
-
-# # for reference, standard pca (does not cope with missing values); summary(stats::prcomp(t(matrix_sample_intensities), center = T, scale. = F))
-# plot_sample_pca = function(matrix_sample_intensities, samples, samples_colors, label_samples_by_shortname = TRUE) {
-#   PPCA = pcaMethods::pca(base::scale(t(matrix_sample_intensities), center=TRUE, scale=FALSE), method="ppca", nPcs = 3, seed = 123, maxIterations = 2000)
-#   mat_pca = PPCA@scores # rownames = sample_id
-#   pca_var = array(PPCA@R2, dimnames = list(colnames(PPCA@scores)))
-#
-#   props = grep("^(sample_id$|sample_index$|shortname$|contrast:)", colnames(samples_colors), ignore.case = T, value = T, invert = T)
-#
-#   # how should we label the samples? if not by shortname, use the index (and fallback to sample_id if index not available)
-#   if(length(label_samples_by_shortname) != 1 || !is.logical(label_samples_by_shortname) || !is.finite(label_samples_by_shortname)) {
-#     label_samples_by_shortname = nrow(samples) < 20
-#   }
-#   prop_sample_labels = "shortname"
-#   if(!label_samples_by_shortname) {
-#     prop_sample_labels = intersect(c("sample_index", "sample_id"), colnames(samples))[1]
-#   }
-#
-#   pcaplots = list()
-#   for (prop in props) { #prop=props[4]
-#     # don't plot 'exclude' property if there are no excluded samples
-#     if(prop == "exclude" && !any(samples$exclude)) {
-#       next
-#     }
-#
-#     plotlist = list()
-#     for (dims in list(1:2, c(1, 3), 2:3)) { # dims=1:2
-#       # extract data from PCA object  &  join with sample metadata to get color-coding and label
-#       tib = tibble(x = mat_pca[, dims[1]], y = mat_pca[, dims[2]], sample_id = rownames(mat_pca)) %>%
-#         left_join(samples %>% select(sample_id, label=!!prop_sample_labels, prop=!!prop), by="sample_id") %>%
-#         left_join(samples_colors %>% select(sample_id, clr=!!prop), by="sample_id")
-#
-#       # check if current property (column in sample metadata) only contains numbers. If so, color as numeric
-#       prop_is_numeric = !prop %in% c("group", "exclude") && suppressWarnings(all(is.finite(as.numeric(tib$prop))))
-#       if(prop_is_numeric) {
-#         tib$prop = as.numeric(tib$prop)
-#       }
-#
-#       uprop = unique(tib$prop)
-#       clr_map = array(paste0(tib$clr[match(uprop, tib$prop)], "CC"), dimnames=list(uprop))
-#
-#       p = ggplot(tib, aes(x = x, y = y, label = label, colour = prop)) +
-#         geom_point() +
-#         labs(
-#           title = "",
-#           x = sprintf("dimension %d (%.1f%%)", dims[1], pca_var[dims[1]] * 100),
-#           y = sprintf("dimension %d (%.1f%%)", dims[2], pca_var[dims[2]] * 100)
-#         ) +
-#         theme_bw() #+ theme(plot.title = element_text(hjust = 0.5))
-#
-#       # recycle colors from upstream calculation
-#       if(prop_is_numeric) {
-#         # for gradient colors, sorting by respective value is important (already converted the values to numeric type a few lines above)
-#         p = p +
-#           scale_color_gradientn(name = gsub("_", " ", prop), colours = unique(tib$clr[order(tib$prop)])) +
-#           theme(legend.text = element_text(angle=90, hjust=0.5, vjust=0.5),
-#                 legend.title = element_text(size=10, face = "bold"))
-#       } else {
-#         p = p + scale_colour_manual(
-#           name = gsub("_", " ", prop),
-#           values = clr_map, # named array, value=color, name=property
-#           breaks = names(clr_map), # sort the legend
-#           aesthetics = "colour") +
-#           guides(colour = guide_legend(title.position = "top")) +
-#           theme(legend.text = element_text(size = ifelse(length(clr_map) < 6,
-#                                                          10,
-#                                                          ifelse(length(clr_map) < 10, 8, 6)) ),
-#                 legend.title = element_text(size=10, face = "bold"))
-#       }
-#
-#       p_labeled = p + ggrepel::geom_text_repel(vjust = -.5, show.legend = FALSE, size = 2)
-#
-#       plotlist[[length(plotlist) + 1]] = p
-#       plotlist[[length(plotlist) + 1]] = p_labeled
-#     }
-#
-#     pcaplots[[length(pcaplots) + 1]] = ggarrange(plotlist = plotlist, ncol = 2, nrow = length(plotlist) / 2, common.legend = T, legend = "bottom")
-#   }
-#
-#
-#   ### code snippet for adding PCA on any custom continuous scale. eg, some aspect that reflects sample quality such as; #detect, outliers deviation in retention time, impact on CoV
-#   # counts = peptides %>% group_by(sample_id) %>% summarise(detect = sum(detect), quant=n()) %>% left_join(samples %>% select(sample_id, shortname), by="sample_id")
-#   # plotlist = list()
-#   # for (dims in list(1:2, c(1, 3), 2:3)) { # dims=1:2
-#   #   tib = tibble(x = mat_pca[, dims[1]], y = mat_pca[, dims[2]], sample_id = rownames(mat_pca)) %>% left_join(counts, by="sample_id")
-#   #
-#   #   p = ggplot(tib, aes(x = x, y = y, label = shortname, colour = detect)) +
-#   #     geom_point() +
-#   #     scale_color_gradient(name = "detected peptides") +
-#   #     labs(
-#   #       title = "",
-#   #       x = sprintf("dimension %d (%.1f%%)", dims[1], pca_var[dims[1]] * 100),
-#   #       y = sprintf("dimension %d (%.1f%%)", dims[2], pca_var[dims[2]] * 100)
-#   #     ) +
-#   #     theme_bw() +
-#   #     theme(plot.title = element_text(hjust = 0.5),
-#   #           legend.text = element_text(angle=90, hjust=0.5, vjust=0.5))
-#   #
-#   #   p_labeled = p + ggrepel::geom_text_repel(show.legend = FALSE, size = 2)
-#   #
-#   #   plotlist[[length(plotlist) + 1]] = p
-#   #   plotlist[[length(plotlist) + 1]] = p_labeled
-#   # }
-#   # pcaplots[[length(pcaplots) + 1]] = ggarrange(plotlist = plotlist, ncol = 2, nrow = length(plotlist) / 2, common.legend = T, legend = "bottom")
-#
-#   return(pcaplots)
-# }
diff --git a/R/process_peptide_data.R b/R/process_peptide_data.R
index 54a1eee..deab7c5 100644
--- a/R/process_peptide_data.R
+++ b/R/process_peptide_data.R
@@ -22,8 +22,9 @@ peptides_collapse_by_sequence = function(peptides, prop_peptide = "sequence_plai
 #' @param remove_irt_peptides try to find the irt spike-in peptides in the fasta file header. looking for; |IRT| and IRT_KIT and "Biognosys iRT" (case insensitive). default:FALSE
 #' @param regular_expression careful here, regular expressions are powerful but complex matching patterns. A 'regex' that is matched against the fasta header(s) of a protein(group). case insensitive
 #' @param gene_symbols an array of gene symbols that are to be matched against the fasta header(s) of a protein(group). case insensitive
+#' @param print_nchar_limit max number of characters for the fasta headers (of removed proteins) that are shown in the log
 #' @export
-remove_proteins_by_name = function(dataset, remove_irt_peptides = FALSE, regular_expression = "", gene_symbols = c()) {
+remove_proteins_by_name = function(dataset, remove_irt_peptides = FALSE, regular_expression = "", gene_symbols = c(), print_nchar_limit = 150) {
   # TODO: input validation
 
   # if(remove_not_present_in_fasta) {
@@ -41,6 +42,11 @@ remove_proteins_by_name = function(dataset, remove_irt_peptides = FALSE, regular
     regex_filter = c(regex_filter, "\\|IRT\\||IRT_KIT|Biognosys iRT")
   }
 
+  if(length(print_nchar_limit) != 1 || !is.numeric(print_nchar_limit) || !is.finite(print_nchar_limit) || print_nchar_limit < 25) {
+    append_log("print_nchar_limit parameter must be a single number (larger than 25)", type = "error")
+  }
+  print_nchar_limit = as.integer(ceiling(print_nchar_limit))
+
   if(!is.na(regular_expression) && regular_expression != "") {
     regex_filter = c(regex_filter, regular_expression)
   }
@@ -67,7 +73,7 @@ remove_proteins_by_name = function(dataset, remove_irt_peptides = FALSE, regular
       dataset$proteins = dataset$proteins[!rows_remove, ]
       dataset$peptides = dataset$peptides %>% filter(isdecoy | protein_id %in% dataset$proteins$protein_id)
       # optionally, print the fasta headers for proteins that were removed
-      charlim = 150
+      charlim = print_nchar_limit
       h_too_long = nchar(h) > charlim
       h[h_too_long] = paste(substr(h[h_too_long], 1, charlim-4), "...")
       # status report & return results
@@ -247,31 +253,139 @@ import_protein_metadata_from_fasta = function(protein_id, fasta_files, fasta_id_
 
 
 
-# TODO
-# #' placeholder title
-# #' @param peptides todo
-# #' @param proteins todo
-# rollup_proteins_by_gene = function(peptides, proteins) {
-#   # convert the 'peptide to protein_id' mapping -->> 'peptide to gene'
-#   i = match(peptides$protein_id, proteins$protein_id)
-#   if(any(is.na(i))) {
-#     append_log("all protein_id from peptide tibble must be in protein tibble", type = "error")
-#   }
-#
-#   peptides$protein_id = proteins$gene_symbols_or_id[i]
-#
-#   # collapse metadata by unique gene. note the hashtag separation character, so we can still distinguish proteinGroups downstream
-#   proteins = proteins %>%
-#     group_by(gene_symbols_or_id) %>%
-#     summarise(
-#       accessions = paste(accessions, collapse = "#"),
-#       fasta_headers = paste(fasta_headers, collapse = "#")
-#     )
-#   # copy gene symbols to protein_id column
-#   proteins$protein_id = proteins$gene_symbols_or_id
-#
-#   return(list(peptides = peptides, proteins = proteins))
-# }
+#' Merge protein identifiers that map to the same unique set of gene symbols
+#'
+#' @description
+#' Suppose there are 3 proteingroups that have the respective unique genes:
+#'
+#' 1) GRIA2
+#' 2) GRIA2
+#' 3) GRIA2;GRIA1
+#'
+#' Then the latter proteingroup is an ambiguous proteingroup where respective peptides were matched to 2 genes
+#' (by the upstream software that generated the dataset that you imported with e.g. `import_dataset_diann()`).
+#' This function will merge all peptides that belong to proteingroups 1 and 2 into
+#' one new proteingroup (that uniquely maps to GRIA2).
+#' Proteingroup 3 will remain a distinct proteingroup; only proteingroups with the exact same
+#' gene symbol set are matched (technical detail: this uses the definitions in `dataset$proteins$gene_symbols_or_id`).
+#'
+#' This function will fully update the dataset's protein table and the protein_id column in the peptide table.
+#'
+#' In most use-cases, this function should be used immediately after `import_fasta()` as shown in the example.
+#'
+#' @examples \dontrun{
+#' # example for using this function. You'll need to update the file paths accordingly
+#' library(msdap)
+#' dataset = import_dataset_diann("C:/data/diann_report.tsv")
+#' dataset = import_fasta(dataset, files = "C:/data/uniprot_human_2020-01.fasta")
+#' dataset = merge_proteingroups_by_gene(dataset)
+#' }
+#' @param dataset a valid dataset. Prior to calling this function, you must import protein metadata from FASTA using the `import_fasta()` function
+#' @export
+merge_proteingroups_by_gene = function(dataset) {
+
+  # double-check that a fasta has been loaded, and that the dataset object is compatible with msdap 1.2 or later (i.e. has protein data in current format)
+  if(!is.data.frame(dataset$proteins) ||
+     !all(c("protein_id", "fasta_headers", "gene_symbols", "gene_symbol_ucount", "gene_symbols_or_id") %in% colnames(dataset$proteins)) ||
+     !any(is.numeric(dataset$proteins$gene_symbol_ucount) & is.finite(dataset$proteins$gene_symbol_ucount) & dataset$proteins$gene_symbol_ucount > 0)
+  ) {
+    append_log("dataset is missing essential information in the protein table. No FASTA has been imported for this dataset or the dataset was analyzed with an outdated MS-DAP version.\nTo use this function you won't have to re-run the entire analysis pipeline for this dataset, just run the import_fasta() function to update the current dataset object (assuming this is a dataset searched against a uniprot FASTA)", type = "error")
+  }
+
+  if(anyNA(dataset$proteins)) {
+    append_log("NA values are not allowed in dataset$proteins", type = "error")
+  }
+
+
+  ### 1) create mapping table from old proteingroup identifiers to new IDs for the updated groups
+
+  protein_to_gene = dataset$peptides %>%
+    # count unique peptides per protein_id in dataset$peptides, this dictates the prio/order of uniprot symbols
+    distinct(protein_id, peptide_id) %>%
+    count(protein_id) %>%
+    # after merging proteingroups, the uniprot IDs from the proteingroup with most evidence are in front
+    arrange(desc(n)) %>%
+    # add gene symbols (using uniprot ID where gene symbols are absent)
+    left_join(dataset$proteins %>% select(protein_id, gene_symbols_or_id), by = "protein_id")
+
+  gene_to_new_proteinid__long = protein_to_gene %>%
+    select(-n) %>%
+    # unlist the uniprot accessions and find set of unique IDs per gene symbol
+    # (there may be duplicates across proteingroups, can't trust input data)
+    mutate(protein_id_new = strsplit(protein_id, " *; *")) %>%
+    tidyr::unchop(protein_id_new) %>%
+    distinct(gene_symbols_or_id, protein_id_new)
+
+  gene_to_new_proteinid = gene_to_new_proteinid__long %>%
+    # collapse into a string: these are the new protein_id for respective gene_symbols_or_id
+    group_by(gene_symbols_or_id) %>%
+    summarise(protein_id_new = paste(protein_id_new, collapse = ";"), .groups = "drop")
+
+  # finally, merge both tables and print stats to console
+  protein_to_new_proteinid = left_join(protein_to_gene, gene_to_new_proteinid, by = "gene_symbols_or_id")
+
+  tmp = sum(protein_to_new_proteinid$protein_id != protein_to_new_proteinid$protein_id_new)
+  append_log(sprintf("%d / %d (%.1f%%) proteingroups that matched to the same gene symbol(s) were merged",
+                     tmp, nrow(protein_to_new_proteinid), tmp / nrow(protein_to_new_proteinid) * 100), type = "info")
+
+
+  ### 2) update dataset$peptides : replace protein_id
+
+  i = data.table::chmatch(dataset$peptides$protein_id, protein_to_new_proteinid$protein_id)
+  stopifnot(!is.na(i))
+  dataset$peptides$protein_id = protein_to_new_proteinid$protein_id_new[i]
+
+
+  ### 3) update dataset$proteins : deconstruct and merge protein data
+
+  ## example data for a proteingroup with multiple accessions, and some of which have no gene symbol
+  # $ protein_id        : chr "A0A087WUQ1;B3KPU0;E5RGS7;H7C1D1"
+  # $ accessions        : chr "A0A087WUQ1;B3KPU0;E5RGS7;H7C1D1"
+  # $ fasta_headers     : chr ">tr|A0A087WUQ1|A0A087WUQ1_HUMAN Isoform of Q96JY6, PDZ and LIM domain 2 OS=Homo sapiens OX=9606 GN=PDLIM2 PE=1 "| __truncated__
+  # $ gene_symbols      : chr "PDLIM2;PDLIM2;-;-"
+  # $ gene_symbol_ucount: int 1
+  # $ gene_symbols_or_id: chr "PDLIM2"
+
+  # first, unpack the current protein data. These columns are semicolon-delimited and assumed to have same length
+  p = dataset$proteins %>%
+    select(protein_id, fasta_headers, gene_symbols) %>%
+    mutate(
+      protein_id = strsplit(protein_id, ";", fixed = TRUE),
+      fasta_headers = strsplit(fasta_headers, ";", fixed = TRUE),
+      gene_symbols = strsplit(gene_symbols, ";", fixed = TRUE)
+    )
+  stopifnot(lengths(p$protein_id) == lengths(p$fasta_headers))
+  stopifnot(lengths(p$protein_id) == lengths(p$gene_symbols))
+
+  p = p %>%
+    tidyr::unchop(cols = c(protein_id, fasta_headers, gene_symbols)) %>%
+    # importantly, take unique: the same uniprot ID may be part of multiple proteingroups
+    distinct(protein_id, .keep_all = TRUE)
+
+  # join the fasta/symbol metadata to the new proteingroup identifiers
+  newproteins = gene_to_new_proteinid__long %>%
+    rename(protein_id = protein_id_new) %>%
+    left_join(p, by = "protein_id") %>%
+    group_by(gene_symbols_or_id) %>%
+    summarise(
+      fasta_headers = paste(fasta_headers, collapse = ";"),
+      gene_symbols = paste(gene_symbols, collapse = ";"),
+      ## definition of ucount analogous to import_protein_metadata_from_fasta()
+      gene_symbol_ucount = n_distinct(gene_symbols[gene_symbols != "-"])
+    )
+
+  dataset$proteins = gene_to_new_proteinid %>%
+    rename(protein_id = protein_id_new) %>%
+    mutate(accessions = protein_id) %>%
+    left_join(newproteins, by = "gene_symbols_or_id") %>%
+    select(protein_id, accessions, fasta_headers, gene_symbols, gene_symbol_ucount, gene_symbols_or_id)
+
+  # QC: no NA's (i.e. no issues while joining tables)  +  all updated protein_id in peptide table are present in protein table
+  stopifnot(!anyNA(dataset$proteins))
+  stopifnot(unique(dataset$peptides$protein_id) %in% dataset$proteins$protein_id)
+
+  return(dataset)
+}
 
 
 
diff --git a/R/sample_metadata.R b/R/sample_metadata.R
index 13a4ea8..d0519ff 100644
--- a/R/sample_metadata.R
+++ b/R/sample_metadata.R
@@ -13,6 +13,7 @@ import_sample_metadata = function(dataset, filename) {
   dataset = invalidate_cache(dataset)
 
   dataset$samples = sample_metadata_from_file(sample_id = unique(dataset$peptides$sample_id), filename = filename)
+  dataset$samples = tibble::as_tibble(dataset$samples)
   # add peptide and protein counts to samples tibble, so downstream code includes this in output tables and report figures relating to sample metadata
   dataset$samples = peptide_and_protein_counts_per_sample(dataset$peptides, dataset$samples, is_dia_dataset(dataset))
   return(dataset)
@@ -666,6 +667,10 @@ add_contrast = function(dataset, colname_condition_variable, values_condition1,
     dataset = remove_contrasts(dataset) # inits a new list
   }
 
+  if(!tibble::is_tibble(dataset$samples)) {
+    dataset$samples = as_tibble(dataset$samples)
+  }
+
   # validate condition colname
   valid_sample_metadata_columns = user_provided_metadata(dataset$samples)
   if(length(colname_condition_variable) != 1 || is.na(colname_condition_variable) || colname_condition_variable == "" || !colname_condition_variable %in% valid_sample_metadata_columns) {
diff --git a/R/util_fasta.R b/R/util_fasta.R
index 081dfd7..51adf80 100644
--- a/R/util_fasta.R
+++ b/R/util_fasta.R
@@ -42,6 +42,7 @@ fasta_id_short = function(id, fasta_id_type = "uniprot") {
 #'
 #' uniprot headers may contain "GN=-" or no GN tag at all !
 #' example; >sp|Q6ZSR9|YJ005_HUMAN Uncharacterized protein FLJ45252 OS=Homo sapiens (Human) OX=9606 GN=- PE=2 SV=2
+#' example; >tr|A2ALT2|A2ALT2_MOUSE Isoform of Q03288, Nonagouti (Fragment) OS=Mus musculus OX=10090 GN=a PE=4 SV=1
 #' example; >sp|P15252|REF_HEVBR Rubber elongation factor protein OS=Hevea brasiliensis PE=1 SV=2
 #'
 #' @param x array of fasta headers
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 7ae73a2..2d9fad7 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -33,8 +33,8 @@ RUN R -e "devtools::install_version('remaCor', '0.0.18', upgrade = 'never', repo
 ### MS-DAP
 # from local devtools::build() file
 RUN mkdir /msdap
-COPY temp/msdap_1.2.tar.gz /msdap/
-RUN R -e "devtools::install_local('/msdap/msdap_1.2.tar.gz', upgrade = 'never')"
+COPY temp/msdap_1.2.1.tar.gz /msdap/
+RUN R -e "devtools::install_local('/msdap/msdap_1.2.1.tar.gz', upgrade = 'never')"
 # alternatively, install from github; RUN R -e "devtools::install_github('ftwkoopmans/msdap@1.1', upgrade = 'never')"
 
 
diff --git a/docker/msdap_launcher_unix.sh b/docker/msdap_launcher_unix.sh
index e242917..c136d43 100644
--- a/docker/msdap_launcher_unix.sh
+++ b/docker/msdap_launcher_unix.sh
@@ -3,7 +3,7 @@
 # MS-DAP launch script
 # https://github.com/ftwkoopmans/msdap
 
-VERSION="1.2"
+VERSION="1.2.1"
 
 
 ### OS
diff --git a/docker/msdap_launcher_windows.ps1 b/docker/msdap_launcher_windows.ps1
index 0d2eb12..48eee77 100644
--- a/docker/msdap_launcher_windows.ps1
+++ b/docker/msdap_launcher_windows.ps1
@@ -1,7 +1,7 @@
 # MS-DAP launch script
 # https://github.com/ftwkoopmans/msdap
 
-$VERSION = "1.2"
+$VERSION = "1.2.1"
 
 
 Write-Host "$((Get-Date).ToString("HH:mm:ss")) - Starting MS-DAP launcher script for version: $VERSION"
diff --git a/man/disambiguate_protein_table_by_gene.Rd b/man/disambiguate_protein_table_by_gene.Rd
new file mode 100644
index 0000000..ba3dd81
--- /dev/null
+++ b/man/disambiguate_protein_table_by_gene.Rd
@@ -0,0 +1,33 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/export_stats_genesummary.R
+\name{disambiguate_protein_table_by_gene}
+\alias{disambiguate_protein_table_by_gene}
+\title{given some table with protein-level data, map to HGNC genes and deal with redundant or ambiguous gene-level results}
+\usage{
+disambiguate_protein_table_by_gene(
+  protein_data,
+  hgnc,
+  gene_ambiguity,
+  xref = NULL,
+  remove_nohgnc = FALSE,
+  distinct_factors = NULL
+)
+}
+\arguments{
+\item{protein_data}{a data.frame with columns (from dataset$proteins) protein_id, gene_symbols, gene_symbols_or_id.
+Importantly, if you opt to return only unique entries per gene then the row of the FIRST matching protein_id is
+returned, so sort your data table by pvalue upstream !}
+
+\item{hgnc}{see \code{export_stats_genesummary()}}
+
+\item{gene_ambiguity}{see \code{export_stats_genesummary()}}
+
+\item{xref}{see \code{export_stats_genesummary()}}
+
+\item{remove_nohgnc}{see \code{export_stats_genesummary()}}
+
+\item{distinct_factors}{a set of columns in protein_data that should be considered (together with protein_id) factors that describe subsets of data (to return unique gene-level data within)}
+}
+\description{
+The order of the data table is pivotal: the first row for each unique gene (criteria in parameter \code{gene_ambiguity})) is selected !
+}
diff --git a/man/fasta_header_to_gene.Rd b/man/fasta_header_to_gene.Rd
index c8d554d..f737efe 100644
--- a/man/fasta_header_to_gene.Rd
+++ b/man/fasta_header_to_gene.Rd
@@ -18,5 +18,6 @@ example; >sp|Q9Y2S6|TMA7_HUMAN Translation machinery-associated protein 7 OS=Hom
 \details{
 uniprot headers may contain "GN=-" or no GN tag at all !
 example; >sp|Q6ZSR9|YJ005_HUMAN Uncharacterized protein FLJ45252 OS=Homo sapiens (Human) OX=9606 GN=- PE=2 SV=2
+example; >tr|A2ALT2|A2ALT2_MOUSE Isoform of Q03288, Nonagouti (Fragment) OS=Mus musculus OX=10090 GN=a PE=4 SV=1
 example; >sp|P15252|REF_HEVBR Rubber elongation factor protein OS=Hevea brasiliensis PE=1 SV=2
 }
diff --git a/man/idmap_symbols.Rd b/man/idmap_symbols.Rd
index bd3bc81..a2dcfc4 100644
--- a/man/idmap_symbols.Rd
+++ b/man/idmap_symbols.Rd
@@ -7,7 +7,7 @@
 idmap_symbols(proteins_long, hgnc, use_synonyms = TRUE)
 }
 \arguments{
-\item{proteins_long}{long-format data.frame with columns protein_id and symbol pairs (i.e. no semicolon-delimted values)}
+\item{proteins_long}{long-format data.frame with columns protein_id and symbol pairs (i.e. no semicolon-delimited values)}
 
 \item{hgnc}{HGNC lookup table from \code{hgnc_lookuptable()}}
 
diff --git a/man/idmap_uniprotid.Rd b/man/idmap_uniprotid.Rd
index bdebbcc..4b1e171 100644
--- a/man/idmap_uniprotid.Rd
+++ b/man/idmap_uniprotid.Rd
@@ -7,7 +7,7 @@
 idmap_uniprotid(proteins_long, hgnc, idmap, use_symbols = TRUE)
 }
 \arguments{
-\item{proteins_long}{long-format data.frame with columns protein_id and symbol pairs (i.e. no semicolon-delimted values)}
+\item{proteins_long}{long-format data.frame with columns uniprot_id and symbol pairs (i.e. no semicolon-delimited values)}
 
 \item{hgnc}{HGNC lookup table from \code{hgnc_lookuptable()}}
 
diff --git a/man/merge_proteingroups_by_gene.Rd b/man/merge_proteingroups_by_gene.Rd
new file mode 100644
index 0000000..85c2b85
--- /dev/null
+++ b/man/merge_proteingroups_by_gene.Rd
@@ -0,0 +1,39 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/process_peptide_data.R
+\name{merge_proteingroups_by_gene}
+\alias{merge_proteingroups_by_gene}
+\title{Merge protein identifiers that map to the same unique set of gene symbols}
+\usage{
+merge_proteingroups_by_gene(dataset)
+}
+\arguments{
+\item{dataset}{a valid dataset. Prior to calling this function, you must import protein metadata from FASTA using the \code{import_fasta()} function}
+}
+\description{
+Suppose there are 3 proteingroups that have the respective unique genes:
+\enumerate{
+\item GRIA2
+\item GRIA2
+\item GRIA2;GRIA1
+}
+
+Then the latter proteingroup is an ambiguous proteingroup where respective peptides were matched to 2 genes
+(by the upstream software that generated the dataset that you imported with e.g. \code{import_dataset_diann()}).
+This function will merge all peptides that belong to proteingroups 1 and 2 into
+one new proteingroup (that uniquely maps to GRIA2).
+Proteingroup 3 will remain a distinct proteingroup; only proteingroups with the exact same
+gene symbol set are matched (technical detail: this uses the definitions in \code{dataset$proteins$gene_symbols_or_id}).
+
+This function will fully update the dataset's protein table and the protein_id column in the peptide table.
+
+In most use-cases, this function should be used immediately after \code{import_fasta()} as shown in the example.
+}
+\examples{
+\dontrun{
+# example for using this function. You'll need to update the file paths accordingly
+library(msdap)
+dataset = import_dataset_diann("C:/data/diann_report.tsv")
+dataset = import_fasta(dataset, files = "C:/data/uniprot_human_2020-01.fasta")
+dataset = merge_proteingroups_by_gene(dataset)
+}
+}
diff --git a/man/plot_sample_pca.Rd b/man/plot_sample_pca.Rd
index e50f78c..9d0534e 100644
--- a/man/plot_sample_pca.Rd
+++ b/man/plot_sample_pca.Rd
@@ -11,7 +11,8 @@ plot_sample_pca(
   samples_colors,
   sample_label_property = "auto",
   pch_as_exclude = TRUE,
-  infer_continuous_scale = TRUE
+  infer_continuous_scale = TRUE,
+  pca_dims = list(1:2, c(1, 3), 2:3)
 )
 }
 \arguments{
@@ -26,6 +27,8 @@ plot_sample_pca(
 \item{pch_as_exclude}{should exclude samples be represented with a distinct symbol shape? possible values; FALSE, TRUE (default)}
 
 \item{infer_continuous_scale}{should the legends infer continuous scale from the data? FALSE = all categorical, TRUE = automatic (default)}
+
+\item{pca_dims}{a list of integer pairs indicating PCA plots to make}
 }
 \description{
 placeholder title
diff --git a/man/remove_proteins_by_name.Rd b/man/remove_proteins_by_name.Rd
index 6fbbaa9..ed5c804 100644
--- a/man/remove_proteins_by_name.Rd
+++ b/man/remove_proteins_by_name.Rd
@@ -8,7 +8,8 @@ remove_proteins_by_name(
   dataset,
   remove_irt_peptides = FALSE,
   regular_expression = "",
-  gene_symbols = c()
+  gene_symbols = c(),
+  print_nchar_limit = 150
 )
 }
 \arguments{
@@ -19,6 +20,8 @@ remove_proteins_by_name(
 \item{regular_expression}{careful here, regular expressions are powerful but complex matching patterns. A 'regex' that is matched against the fasta header(s) of a protein(group). case insensitive}
 
 \item{gene_symbols}{an array of gene symbols that are to be matched against the fasta header(s) of a protein(group). case insensitive}
+
+\item{print_nchar_limit}{max number of characters for the fasta headers (of removed proteins) that are shown in the log}
 }
 \description{
 Remove all peptides that match some protein-level filters