Skip to content

Commit c78eabb

Browse files
authored
Merge pull request #340 from tidyomics/force_abundance_column_to_reduce_dimensions
Force abundance column to reduce dimensions
2 parents 264714c + 55cbd99 commit c78eabb

16 files changed

Lines changed: 84 additions & 108 deletions

DESCRIPTION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
Type: Package
22
Package: tidybulk
33
Title: Brings transcriptomics to the tidyverse
4-
Version: 2.1.2
4+
Version: 2.1.3
55
Authors@R: c(person("Stefano", "Mangiola", email = "mangiolastefano@gmail.com",
66
role = c("aut", "cre")),
77
person("Maria", "Doyle", email = "Maria.Doyle@petermac.org",

R/reduce_dimensions.R

Lines changed: 25 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -2,23 +2,21 @@
22
#'
33
#' `r lifecycle::badge("maturing")`
44
#'
5-
#' @description reduce_dimensions() takes as input A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) and calculates the reduced dimensional space of the transcript abundance.
5+
#' @description reduce_dimensions() takes as input a `SummarizedExperiment` and calculates the reduced dimensional space of the transcript abundance.
6+
#'
67
#'
7-
#' @importFrom rlang enquo quo_name
88
#' @importFrom magrittr not
99
#' @importFrom dplyr filter distinct select mutate rename
1010
#' @importFrom tidyr pivot_wider
1111
#' @importFrom tibble enframe
12-
#' @importFrom SummarizedExperiment colData rowData assays
12+
#' @importFrom SummarizedExperiment colData rowData assays assayNames
1313
#' @importFrom stats prcomp
1414
#'
1515
#'
1616
#' @name reduce_dimensions
1717
#'
18-
#' @param .data A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment))
19-
#' @param .element The name of the element column (normally samples).
20-
#' @param .feature The name of the feature column (normally transcripts/genes)
21-
#' @param .abundance The name of the column including the numerical value the clustering is based on (normally transcript abundance)
18+
#' @param .data A `SummarizedExperiment`
19+
#' @param assay Character string: the name of the assay to use for dimension reduction (must match `assayNames(.data)`). This argument must be explicitly specified so that the choice of abundance matrix is deliberate.
2220
#'
2321
#' @param method A character string. The dimension reduction algorithm to use (PCA, MDS, tSNE).
2422
#' @param top An integer. How many top genes to select for dimensionality reduction
@@ -43,22 +41,7 @@
4341
#' Underlying method for tSNE:
4442
#' Rtsne::Rtsne(data, ...)
4543
#'
46-
#' Underlying method for UMAP:
47-
#'
48-
#' df_source =
49-
#' .data |>
50-
#'
51-
#' # Filter NA symbol
52-
#' filter(!!.feature |> is.na() |> not()) |>
53-
#'
54-
#' # Prepare data frame
55-
#' distinct(!!.feature,!!.element,!!.abundance) |>
56-
#'
57-
#' # Filter most variable genes
58-
#' keep_variable_transcripts(top) |>
59-
#' reduce_dimensions(method="PCA", .dims = calculate_for_pca_dimensions ) |>
60-
#' as_matrix(rownames = quo_name(.element)) |>
61-
#' uwot::tumap(...)
44+
#' Underlying method for UMAP: variable features from the chosen \code{assay} are optionally PCA-reduced, then \code{uwot::tumap()} is applied to the sample coordinates matrix.
6245
#'
6346
#'
6447
#' @return A tbl object with additional columns for the reduced dimensions
@@ -79,13 +62,13 @@
7962
#' counts.MDS =
8063
#' airway |>
8164
#' identify_abundant() |>
82-
#' reduce_dimensions( method="MDS", .dims = 3)
65+
#' reduce_dimensions(assay = "counts", method="MDS", .dims = 3)
8366
#'
8467
#'
8568
#' counts.PCA =
8669
#' airway |>
8770
#' identify_abundant() |>
88-
#' reduce_dimensions(method="PCA", .dims = 3)
71+
#' reduce_dimensions(assay = "counts", method="PCA", .dims = 3)
8972
#'
9073
#' @references
9174
#' Mangiola, S., Molania, R., Dong, R., Doyle, M. A., & Papenfuss, A. T. (2021). tidybulk: an R tidy framework for modular transcriptomic data analysis. Genome Biology, 22(1), 42. doi:10.1186/s13059-020-02233-7
@@ -102,9 +85,7 @@
10285
#'
10386
#'
10487
setGeneric("reduce_dimensions", function(.data,
105-
.element = NULL,
106-
.feature = NULL,
107-
.abundance = NULL,
88+
assay,
10889
method,
10990
.dims = 2,
11091

@@ -122,7 +103,7 @@ standardGeneric("reduce_dimensions"))
122103

123104

124105
.reduce_dimensions_se = function(.data,
125-
.abundance = NULL,
106+
assay,
126107

127108
method,
128109
.dims = 2,
@@ -135,10 +116,19 @@ standardGeneric("reduce_dimensions"))
135116
# Fix NOTEs
136117
. = NULL
137118

138-
.abundance = enquo(.abundance)
119+
if (missing(assay))
120+
stop("tidybulk says: please specify `assay` explicitly as a character string (e.g. assay = \"counts_scaled\"). If needed, create an assay scaled proportionally to library size with `scale_abundance()` and pass that assay name via `assay`.", call. = FALSE)
121+
122+
if (!is.character(assay) || length(assay) != 1L || !nzchar(assay))
123+
stop("tidybulk says: `assay` must be a single non-empty character string naming an assay in assayNames(.data).", call. = FALSE)
139124

140-
if(.abundance |> quo_is_symbolic()) my_assay = quo_name(.abundance)
141-
else my_assay = get_assay_scaled_if_exists_SE(.data)
125+
my_assay <- assay[[1L]]
126+
if (!my_assay %in% assayNames(.data))
127+
stop(sprintf(
128+
"tidybulk says: assay \"%s\" is not in assayNames(.data) (%s).",
129+
my_assay,
130+
paste(assayNames(.data), collapse = ", ")
131+
), call. = FALSE)
142132

143133
# adjust top for the max number of features I have
144134
if(top > nrow(.data)){
@@ -292,11 +282,8 @@ setMethod("reduce_dimensions",
292282
#' @importFrom rlang :=
293283
#' @importFrom stats setNames
294284
#'
295-
#' @param .data A tibble
296-
#' @param .abundance A column symbol with the value the clustering is based on (e.g., `count`)
285+
#' @param .data Feature-by-sample matrix (from an assay)
297286
#' @param .dims A integer vector corresponding to principal components of interest (e.g., 1:6)
298-
#' @param .feature A column symbol. The column that is represents entities to cluster (i.e., normally genes)
299-
#' @param .element A column symbol. The column that is used to calculate distance (i.e., normally samples)
300287
#' @param top An integer. How many top genes to select
301288
#' @param of_samples A boolean
302289
#' @param transform A function that will tranform the counts, by default it is log1p for RNA sequencing data, but for avoinding tranformation you can use identity
@@ -439,11 +426,8 @@ get_reduced_dimensions_MDS_bulk_SE <-
439426
#' @importFrom magrittr divide_by
440427
#' @importFrom Matrix t
441428
#'
442-
#' @param .data A tibble
443-
#' @param .abundance A column symbol with the value the clustering is based on (e.g., `count`)
429+
#' @param .data Feature-by-sample matrix (from an assay)
444430
#' @param .dims A integer vector corresponding to principal components of interest (e.g., 1:6)
445-
#' @param .feature A column symbol. The column that is represents entities to cluster (i.e., normally genes)
446-
#' @param .element A column symbol. The column that is used to calculate distance (i.e., normally samples)
447431
#' @param top An integer. How many top genes to select
448432
#' @param of_samples A boolean
449433
#' @param transform A function that will tranform the counts, by default it is log1p for RNA sequencing data, but for avoinding tranformation you can use identity
@@ -533,11 +517,8 @@ we suggest to partition the dataset for sample clusters.
533517
#' @importFrom stats setNames
534518
#' @importFrom Matrix t
535519
#'
536-
#' @param .data A tibble
537-
#' @param .abundance A column symbol with the value the clustering is based on (e.g., `count`)
520+
#' @param .data Feature-by-sample matrix (from an assay)
538521
#' @param .dims A integer vector corresponding to principal components of interest (e.g., 1:6)
539-
#' @param .feature A column symbol. The column that is represents entities to cluster (i.e., normally genes)
540-
#' @param .element A column symbol. The column that is used to calculate distance (i.e., normally samples)
541522
#' @param top An integer. How many top genes to select
542523
#' @param of_samples A boolean
543524
#' @param transform A function that will tranform the counts, by default it is log1p for RNA sequencing data, but for avoinding tranformation you can use identity

R/rotate_dimensions.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@
4949
#' counts.MDS =
5050
#' airway |>
5151
#' identify_abundant() |>
52-
#' reduce_dimensions( method="MDS", .dims = 3)
52+
#' reduce_dimensions(assay = "counts", method="MDS", .dims = 3)
5353
#'
5454
#' counts.MDS.rotated = rotate_dimensions(counts.MDS, `Dim1`, `Dim2`, rotation_degrees = 45, .element = sample)
5555
#'

dev/N52_workflow.R

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,15 +44,15 @@ tt_scaled %>%
4444

4545
# Visualise MDS for cell types
4646
tt_scaled %>%
47-
reduce_dimensions(method="MDS", .dims = 3) %>%
47+
reduce_dimensions(assay = "counts_scaled", method="MDS", .dims = 3) %>%
4848
select(contains("Dim"), sample, CAPRA_TOTAL, cell_type_formatted) %>%
4949
distinct() %>%
5050
GGally::ggpairs(columns = 1:3, ggplot2::aes(colour=cell_type_formatted))
5151

5252
# Vidualise MDS CAPRA
5353
tt_scaled %>%
5454
group_by(cell_type_formatted) %>%
55-
do(reduce_dimensions((.), method="MDS")) %>%
55+
do(reduce_dimensions((.), assay = "counts_scaled", method="MDS")) %>%
5656
select(contains("Dim"), sample, CAPRA_TOTAL, cell_type_formatted) %>%
5757
distinct() %>%
5858
ggplot(aes(x=Dim1, y = Dim2, color=CAPRA_TOTAL)) +

dev/benchmark/TCGA_workflow_ttBulk_for_comparison.R

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ tt_scaled = tt %>% identify_abundant(factor_of_interest = PFI.2 ) %>% scale_abun
2525
tt_scaled
2626
}
2727
plot_MDS = function(){
28-
tt_mds = tt_scaled %>% reduce_dimensions(method = "MDS", .dims = 3)
28+
tt_mds = tt_scaled %>% reduce_dimensions(assay = "counts_scaled", method = "MDS", .dims = 3)
2929
# Visualise MDS for cell types
3030
p = tt_mds %>%
3131
select(contains("Dim"), patient, PFI.2) %>%
@@ -45,7 +45,7 @@ plot_adjusted_MDS = function(){
4545

4646
adjust_abundance( ~ PFI.2 + batch) %>%
4747
filter(`count_scaled_adjusted` %>% is.na %>% `!`) %>%
48-
reduce_dimensions(.abundance = `count_scaled_adjusted`, method = "MDS", .dims = 3) %>%
48+
reduce_dimensions(assay = "counts_scaled_adjusted", method = "MDS", .dims = 3) %>%
4949

5050
# Plot
5151
select(contains("Dim"), patient, PFI.2, batch) %>%

dev/benchmark/pasilla_workflow_ttBulk_for_comparison.R

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ tt_scaled = tt %>% identify_abundant(factor_of_interest = condition) %>% scale_a
5555
tt_scaled
5656
}
5757
plot_MDS = function(){
58-
tt_mds = tt_scaled %>% reduce_dimensions(method = "MDS", .dims = 3)
58+
tt_mds = tt_scaled %>% reduce_dimensions(assay = "counts_scaled", method = "MDS", .dims = 3)
5959
# Visualise MDS for cell types
6060
p = tt_mds %>%
6161
select(contains("Dim"), sample, type, condition) %>%
@@ -69,7 +69,7 @@ plot_adjusted_MDS = function(){
6969
p = tt_mds %>%
7070
adjust_abundance( ~ condition + type) %>%
7171
filter(`count_scaled_adjusted` %>% is.na %>% `!`) %>%
72-
reduce_dimensions(.abundance = `count_scaled_adjusted`, method = "MDS", .dims = 3) %>%
72+
reduce_dimensions(assay = "counts_scaled_adjusted", method = "MDS", .dims = 3) %>%
7373

7474
# Plot
7575
select(contains("Dim"), sample, type, condition) %>%

dev/comparison_with_base_R.Rmd

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -187,7 +187,7 @@ Tidy transcriptomics
187187
```{r mds, cache=TRUE}
188188
tt.norm.MDS =
189189
tt.norm %>%
190-
reduce_dimensions(method="MDS", .dims = 2)
190+
reduce_dimensions(assay = "counts_scaled", method="MDS", .dims = 2)
191191
192192
```
193193
</div>
@@ -218,7 +218,7 @@ Tidy transcriptomics
218218
```{r pca, cache=TRUE, message=FALSE, warning=FALSE, results='hide'}
219219
tt.norm.PCA =
220220
tt.norm %>%
221-
reduce_dimensions(method="PCA", .dims = 2)
221+
reduce_dimensions(assay = "counts_scaled", method="PCA", .dims = 2)
222222
```
223223
</div>
224224
<div class="column-right">
@@ -245,6 +245,7 @@ tt.norm.tSNE =
245245
tidybulk( sample, ens, count_scaled) %>%
246246
identify_abundant() %>%
247247
reduce_dimensions(
248+
assay = "count_scaled",
248249
method = "tSNE",
249250
perplexity=10,
250251
pca_scale =TRUE

dev/manuscript_differential_transcript_abundance.Rmd

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ tt_scaled %>%
107107

108108
```{r}
109109
# Reduce data dimensionality with arbitrary number of dimensions
110-
tt_mds <- tt_scaled %>% reduce_dimensions(method="MDS", .dims = 3)
110+
tt_mds <- tt_scaled %>% reduce_dimensions(assay = "counts_scaled", method="MDS", .dims = 3)
111111
112112
# Plot all-vs-all MDS dimensions
113113
tt_mds %>%
@@ -127,8 +127,8 @@ tt_mds_adj_mds <-
127127
128128
# Calculate reduced dimensions on the adjusted counts as well
129129
reduce_dimensions(
130-
.abundance = count_scaled_adjusted,
131-
method="MDS", .dim = 3
130+
assay = "counts_scaled_adjusted",
131+
method="MDS", .dims = 3
132132
)
133133
134134

dev/pasilla_workflow.R

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ p1 =
6666
ylab("Density") +
6767
my_theme
6868

69-
tt_mds = tt_scaled %>% reduce_dimensions(method="MDS", .dims = 3)
69+
tt_mds = tt_scaled %>% reduce_dimensions(assay = "counts_scaled", method="MDS", .dims = 3)
7070

7171
# Visualise MDS for cell types
7272
p2 =
@@ -111,7 +111,7 @@ tt_adj = tt_mds %>% adjust_abundance(~ condition + type)
111111
p3 =
112112
tt_adj %>%
113113
filter( `count_scaled_adjusted` %>% is.na %>% `!`) %>%
114-
reduce_dimensions(.abundance = `count_scaled_adjusted`, method="MDS", .dims = 3) %>%
114+
reduce_dimensions(assay = "counts_scaled_adjusted", method="MDS", .dims = 3) %>%
115115

116116
# Plot
117117
select(contains("Dim"), sample, type, condition ) %>%

inst/NEWS.rd

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,11 @@
11
\name{NEWS}
22
\title{News for Package \pkg{tidybulk}}
33

4+
\section{Changes in development version}{
5+
\itemize{
6+
\item \strong{BREAKING CHANGE:} \code{reduce_dimensions()} no longer accepts \code{.abundance} (a tidy-eval assay symbol). The assay must be given explicitly as a character string via \code{assay = "..."}, matching \code{assayNames(object)}. Omitting \code{assay} or passing an unknown assay name raises an error, so users must deliberately choose which abundance matrix to use (commonly a library-size scaled assay from \code{scale_abundance()}). Update all calls, tests, and vignettes that used \code{.abundance = counts} to \code{assay = "counts"} (or the appropriate assay name).
7+
}}
8+
49
\section{Changes in version 1.2.0, Bioconductor 3.12 Release}{
510
\itemize{
611
\item Make gene filtering functionality `identify_abundance` explicit, a warning will be given if this has not been performed before the majority of workflow steps (e.g. `test_differential_abundance`).

0 commit comments

Comments
 (0)