library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(Seurat)
#> Registered S3 method overwritten by 'spatstat.geom':
#> method from
#> print.boxx cli
#> Attaching SeuratObject
library(DElegate)
library(ggplot2)
library(ggrepel)
DElegate
is an R package that allows bulk RNA-seq differential expression methods to be used with single-cell data. It is a light wrapper around DESeq2
, edgeR
, and limma
, similar to the Libra
package. In contrast to Libra
, DElegate
focuses on a few DE methods and will assign cells to pseudo-replicates if no true replicates are available.
All DElegate
functionality is contained in one function - findDE()
. It has one mandatory input argument: object
, which may be of class
Seurat
- the count matrix will be extracted from the 'RNA'
assaySingleCellExperiment
- the count matrix will be extracted via counts()
dgCMatrix
- sparse matrix of the Matrix
packagematrix
To indicate the cell group memberships, you have several options, depending on input type:
Seurat
- in the object via Idents(object)
, or use the group_column
argumentSingleCellExperiment
- in the object via colLables(object)
, or use the group_column
argumentdgCMatrix
, or matrix
- use the meta_data
and group_column
argumentsDElegate
uses bulk RNA-seq DE methods and relies on replicates. If no true replicates are available, it assigns cells to pseudo-replicates. However, if replicates are available in the input, the replicate_column
argument can be used to indicate where to find the labels.
To tell findDE()
which cell groups to compare, use the compare
argument. We provide several ways to set up the comparisons that will be tested:
'each_vs_rest'
, the default, does multiple comparisons, one per group vs all remaining cells'all_vs_all'
, also does multiple comparisons, covering all group pairs'MONOCYTES'
, does one comparison between that group and the remaining cellsc('T CELLS', 'B CELLS')
, does one comparison between those two groupslist(c('T CELLS', 'B CELLS'), c('MONOCYTES'))
, does one comparison after combining groupsFinally, there are currently three DE methods supported
'edger'
uses edgeR::glmQLFit
'deseq'
uses DESeq2::DESeq(test = 'Wald')
'limma'
uses limma::eBayes(trend = TRUE, robust = TRUE)
For complete details, consult the package documentation: ?DElegate::findDE
In the examples below, we use a Seurat object s
with cell type labels in the identity slot (Idents(s)
).
show(s)
#> An object of class Seurat
#> 73202 features across 9974 samples within 2 assays
#> Active assay: SCT (36601 features, 3000 variable features)
#> 1 other assay present: RNA
#> 2 dimensional reductions calculated: pca, umap
table(Idents(s))
#>
#> MONOCYTES T CELLS B CELLS
#> 4453 4493 1028
DimPlot(s)
<- findDE(object = s)
de_results #> group_column argument is not set - will use SeuratObject::Idents()
#> No replicates specified - will assign cells randomly to pseudoreplicates
#> Doing 3 comparisons
The output is one large data frame with the results for all three comparisons for all genes. The order of the output is by comparison, then by p-value, then by test statistic. Ordering can be turned off with order_results = FALSE
head(de_results) %>%
::datatable() %>%
DT::formatSignif(columns = 2:8) DT
There are as many rows per comparison, as there were genes in the input
table(paste(de_results$group1, de_results$group2, sep = ' vs '))
#>
#> B CELLS vs not B CELLS MONOCYTES vs not MONOCYTES
#> 36601 36601
#> T CELLS vs not T CELLS
#> 36601
The output contains the following columns
The log fold-change values are estimates of the form log2(group1/group2) - details will depend on the method chosen.
We can plot the log fold-change vs FDR for each comparison.
$comp <- paste(de_results$group1, de_results$group2, sep = ' vs ')
de_results$comp <- factor(de_results$comp, levels = unique(de_results$comp))
de_resultsggplot(de_results, aes(log_fc, -log10(padj))) +
geom_point(shape = 16, alpha = 0.2) +
facet_wrap(~ comp, scales = 'free')
For a given comparison, the genes higher in the first group will be assigned positive fold-changes. Below we mark the top 5 per comparison, ranked by p-value (the default output of findDE()
).
<- filter(de_results, log_fc > 0) %>%
top_DE group_by(group1) %>%
slice_head(n = 5)
ggplot(de_results, aes(log_fc, -log10(padj))) +
geom_point(shape = 16, alpha = 0.2) +
geom_point(data = top_DE, shape = 16, size = 2, color = 'deeppink') +
geom_text_repel(data = top_DE, aes(label = feature)) +
facet_wrap(~ comp, scales = 'free')
To compare two specific groups, use a character vector of length two as the 'compare'
argument.
<- findDE(object = s, compare = c('T CELLS', 'B CELLS'))
de_results #> group_column argument is not set - will use SeuratObject::Idents()
#> No replicates specified - will assign cells randomly to pseudoreplicates
#> Doing 1 comparisons
<- filter(de_results, log_fc != 0) %>%
top_DE group_by(sign(log_fc)) %>%
slice_head(n = 5)
ggplot(de_results, aes(log_fc, -log10(padj))) +
geom_point(shape = 16, alpha = 0.2) +
geom_point(data = top_DE, shape = 16, size = 2, color = 'deeppink') +
geom_text_repel(data = top_DE, aes(label = feature)) +
ggtitle('T CELLS vs B CELLS')
An example of a count matrix and meta data as input.
# extract the counts and meta data from our example Seurat object
<- s$RNA@counts
counts <- data.frame(celltype = Idents(s))
meta_data # use matrix and data frame as input to findDE
<- findDE(object = counts, meta_data = meta_data,
de_results group_column = 'celltype', compare = c('T CELLS', 'B CELLS'))
#> No replicates specified - will assign cells randomly to pseudoreplicates
#> Doing 1 comparisons
head(de_results) %>%
::datatable() %>%
DT::formatSignif(columns = 2:8) DT
An example using a SingleCellExperiment as input.
<- SingleCellExperiment::SingleCellExperiment(list(counts=counts))
sce ::colLabels(sce) <- meta_data$celltype
SingleCellExperiment<- findDE(object = sce, compare = c('T CELLS', 'B CELLS'))
de_results #> group_column argument is not set - will use SingleCellExperiment::colLabels()
#> No replicates specified - will assign cells randomly to pseudoreplicates
#> Doing 1 comparisons
head(de_results) %>%
::datatable() %>%
DT::formatSignif(columns = 2:8) DT
To quickly find all marker features for a given object, DElegate
provides the convenience function FindAllMarkers2()
- a light wrapper around findDE(compare = 'each_vs_rest')
that filters for positive fold-change and orders the results by p-value and test statistic. Note that by default this function applies a post-test filter that removes genes with low detection rate (min_rate = 0.05
) and low log fold-change (min_fc = 1
).
<- FindAllMarkers2(object = s)
marker_results #> group_column argument is not set - will use SeuratObject::Idents()
#> No replicates specified - will assign cells randomly to pseudoreplicates
#> Doing 3 comparisons
filter(marker_results, feature_rank < 6) %>%
::datatable() %>%
DT::formatSignif(columns = 2:8) DT
DElegate
supports parallelization via the future
package. For example, to use the multicore strategy with 12 workers you may call future::plan(strategy = 'future::multicore', workers = 12)
before DE testing. See more details at the future
website
Note that every comparison is run single-threaded, but multiple comparisons will be done in parallel.
Trouble shooting: You may get an error regarding future.globals.maxSize
, the maximum allowed total size of global variables. The default value is 500 MiB and may be too small. You may increase it, for example to 8GB, using options(future.globals.maxSize = 8 * 10^9)
.
For reporting progress updates, DElegate
relies on the progressr
package. By default no progress updates are rendered, but may be turned on in an R session: progressr::handlers(global = TRUE)
and its default presentation modified (e.g. progressr::handlers(progressr::handler_progress)
). See more details at the progressr
website
Here we perform the same DE tasks as in the Seurat DE vignette.
library(Seurat)
library(SeuratData)
#> ── Installed datasets ───────────────────────────────────── SeuratData v0.2.2 ──
#> ✓ pbmc3k 3.1.4
#> ────────────────────────────────────── Key ─────────────────────────────────────
#> ✓ Dataset loaded successfully
#> > Dataset built with a newer version of Seurat than installed
#> ❓ Unknown version of Seurat installed
<- LoadData("pbmc3k", type = "pbmc3k.final") pbmc
Find differentially expressed features between CD14+ and FCGR3A+ Monocytes (this is the alternative to Seurat::FindMarkers(pbmc, ident.1 = "CD14+ Mono", ident.2 = "FCGR3A+ Mono")
)
<- findDE(object = pbmc, compare = c("CD14+ Mono", "FCGR3A+ Mono"))
de_res #> group_column argument is not set - will use SeuratObject::Idents()
#> No replicates specified - will assign cells randomly to pseudoreplicates
#> Doing 1 comparisons
head(de_res, 10) %>%
::datatable() %>%
DT::formatSignif(columns = 2:8) DT
We can omit the second part of the comparison argument to find CD14+ Mono markers relative to all other cells (alternative to Seurat::FindMarkers(pbmc, ident.1 = "CD14+ Mono", ident.2 = NULL, only.pos = TRUE)
).
<- findDE(object = pbmc, compare = "CD14+ Mono")
de_res #> group_column argument is not set - will use SeuratObject::Idents()
#> No replicates specified - will assign cells randomly to pseudoreplicates
#> Doing 1 comparisons
# exclude negative log fold-change
filter(de_res, log_fc > 0) %>% head(10) %>%
::datatable() %>%
DT::formatSignif(columns = 2:8) DT
Note that we do not suggest prefiltering. A postfilter using log fold-change and/or detection rate may be appropriate instead.
Just as in the examples above, we can find markers for all groups at once with FindAllMarkers2
. By default this wrapper around findDE()
removes genes with detection rate < 0.05 (maximum across groups compared) and low log fold-change (< 1). This is an alternative to Seurat::FindAllMarkers(pbmc)
.
<- FindAllMarkers2(object = pbmc)
marker_results #> group_column argument is not set - will use SeuratObject::Idents()
#> No replicates specified - will assign cells randomly to pseudoreplicates
#> Doing 9 comparisons
filter(marker_results, feature_rank < 6) %>%
::datatable() %>%
DT::formatSignif(columns = 2:8) DT
NOTE: This document was generated with
DElegate
version 1.0.0
Session info
sessionInfo()
#> R version 4.1.2 (2021-11-01)
#> Platform: x86_64-pc-linux-gnu (64-bit)
#> Running under: Ubuntu 20.04.3 LTS
#>
#> Matrix products: default
#> BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.8.so
#>
#> locale:
#> [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
#> [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
#> [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
#> [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
#> [9] LC_ADDRESS=C LC_TELEPHONE=C
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] pbmc3k.SeuratData_3.1.4 SeuratData_0.2.2 ggrepel_0.9.1
#> [4] ggplot2_3.3.5 DElegate_1.0.0 SeuratObject_4.0.4
#> [7] Seurat_4.1.0 dplyr_1.0.7
#>
#> loaded via a namespace (and not attached):
#> [1] plyr_1.8.6 igraph_1.2.11
#> [3] lazyeval_0.2.2 splines_4.1.2
#> [5] crosstalk_1.2.0 listenv_0.8.0
#> [7] scattermore_0.7 GenomeInfoDb_1.30.1
#> [9] digest_0.6.29 htmltools_0.5.2
#> [11] fansi_1.0.2 magrittr_2.0.2
#> [13] tensor_1.5 cluster_2.1.2
#> [15] ROCR_1.0-11 limma_3.50.3
#> [17] globals_0.14.0 matrixStats_0.61.0
#> [19] spatstat.sparse_2.1-0 colorspace_2.0-2
#> [21] rappdirs_0.3.3 xfun_0.29
#> [23] crayon_1.4.2 RCurl_1.98-1.5
#> [25] jsonlite_1.7.3 progressr_0.10.0
#> [27] spatstat.data_2.1-2 survival_3.2-13
#> [29] zoo_1.8-9 glue_1.6.1
#> [31] polyclip_1.10-0 gtable_0.3.0
#> [33] zlibbioc_1.40.0 XVector_0.34.0
#> [35] leiden_0.3.9 DelayedArray_0.20.0
#> [37] future.apply_1.8.1 SingleCellExperiment_1.16.0
#> [39] BiocGenerics_0.40.0 abind_1.4-5
#> [41] scales_1.1.1 DBI_1.1.2
#> [43] edgeR_3.36.0 miniUI_0.1.1.1
#> [45] Rcpp_1.0.8 viridisLite_0.4.0
#> [47] xtable_1.8-4 reticulate_1.24
#> [49] spatstat.core_2.3-2 stats4_4.1.2
#> [51] DT_0.20 htmlwidgets_1.5.4
#> [53] httr_1.4.2 RColorBrewer_1.1-2
#> [55] ellipsis_0.3.2 ica_1.0-2
#> [57] pkgconfig_2.0.3 farver_2.1.0
#> [59] sass_0.4.0 uwot_0.1.11
#> [61] deldir_1.0-6 locfit_1.5-9.4
#> [63] utf8_1.2.2 tidyselect_1.1.1
#> [65] labeling_0.4.2 rlang_1.0.0
#> [67] reshape2_1.4.4 later_1.3.0
#> [69] munsell_0.5.0 tools_4.1.2
#> [71] cli_3.1.1 generics_0.1.2
#> [73] ggridges_0.5.3 evaluate_0.14
#> [75] stringr_1.4.0 fastmap_1.1.0
#> [77] yaml_2.2.2 goftest_1.2-3
#> [79] knitr_1.37 fitdistrplus_1.1-6
#> [81] purrr_0.3.4 RANN_2.6.1
#> [83] pbapply_1.5-0 future_1.23.0
#> [85] nlme_3.1-155 sparseMatrixStats_1.6.0
#> [87] mime_0.12 compiler_4.1.2
#> [89] rstudioapi_0.13 plotly_4.10.0
#> [91] png_0.1-7 spatstat.utils_2.3-0
#> [93] tibble_3.1.6 bslib_0.3.1
#> [95] stringi_1.7.6 highr_0.9
#> [97] lattice_0.20-45 Matrix_1.4-0
#> [99] vctrs_0.3.8 pillar_1.6.5
#> [101] lifecycle_1.0.1 spatstat.geom_2.3-1
#> [103] lmtest_0.9-39 jquerylib_0.1.4
#> [105] RcppAnnoy_0.0.19 data.table_1.14.2
#> [107] cowplot_1.1.1 bitops_1.0-7
#> [109] irlba_2.3.5 GenomicRanges_1.46.1
#> [111] httpuv_1.6.5 patchwork_1.1.1
#> [113] R6_2.5.1 promises_1.2.0.1
#> [115] KernSmooth_2.23-20 gridExtra_2.3
#> [117] IRanges_2.28.0 parallelly_1.30.0
#> [119] codetools_0.2-18 MASS_7.3-55
#> [121] assertthat_0.2.1 SummarizedExperiment_1.24.0
#> [123] withr_2.4.3 sctransform_0.3.3
#> [125] GenomeInfoDbData_1.2.7 S4Vectors_0.32.3
#> [127] mgcv_1.8-38 parallel_4.1.2
#> [129] grid_4.1.2 rpart_4.1.16
#> [131] tidyr_1.1.4 rmarkdown_2.11
#> [133] MatrixGenerics_1.6.0 Rtsne_0.15
#> [135] Biobase_2.54.0 shiny_1.7.1