Chapter 5 Grun human pancreas (CEL-seq2)
5.1 Introduction
This workflow performs an analysis of the Grun et al. (2016) CEL-seq2 dataset consisting of human pancreas cells from various donors.
5.2 Data loading
We convert to Ensembl identifiers, and we remove duplicated genes or genes without Ensembl IDs.
5.3 Quality control
This dataset lacks mitochondrial genes so we will do without them for quality control.
We compute the median and MAD while blocking on the donor;
for donors where the assumption of a majority of high-quality cells seems to be violated (Figure 5.1),
we compute an appropriate threshold using the other donors as specified in the subset=
argument.
library(scater)
stats <- perCellQCMetrics(sce.grun)
qc <- quickPerCellQC(stats, percent_subsets="altexps_ERCC_percent",
batch=sce.grun$donor,
subset=sce.grun$donor %in% c("D17", "D7", "D2"))
sce.grun <- sce.grun[,!qc$discard]
colData(unfiltered) <- cbind(colData(unfiltered), stats)
unfiltered$discard <- qc$discard
gridExtra::grid.arrange(
plotColData(unfiltered, x="donor", y="sum", colour_by="discard") +
scale_y_log10() + ggtitle("Total count"),
plotColData(unfiltered, x="donor", y="detected", colour_by="discard") +
scale_y_log10() + ggtitle("Detected features"),
plotColData(unfiltered, x="donor", y="altexps_ERCC_percent",
colour_by="discard") + ggtitle("ERCC percent"),
ncol=2
)
## low_lib_size low_n_features high_altexps_ERCC_percent
## 451 511 605
## discard
## 664
5.4 Normalization
library(scran)
set.seed(1000) # for irlba.
clusters <- quickCluster(sce.grun)
sce.grun <- computeSumFactors(sce.grun, clusters=clusters)
sce.grun <- logNormCounts(sce.grun)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.102 0.505 0.794 1.000 1.231 11.600
plot(librarySizeFactors(sce.grun), sizeFactors(sce.grun), pch=16,
xlab="Library size factors", ylab="Deconvolution factors", log="xy")
5.5 Variance modelling
We block on a combined plate and donor factor.
block <- paste0(sce.grun$sample, "_", sce.grun$donor)
dec.grun <- modelGeneVarWithSpikes(sce.grun, spikes="ERCC", block=block)
top.grun <- getTopHVGs(dec.grun, prop=0.1)
We examine the number of cells in each level of the blocking factor.
## block
## CD13+ sorted cells_D17 CD24+ CD44+ live sorted cells_D17
## 86 87
## CD63+ sorted cells_D10 TGFBR3+ sorted cells_D17
## 40 90
## exocrine fraction, live sorted cells_D2 exocrine fraction, live sorted cells_D3
## 82 7
## live sorted cells, library 1_D10 live sorted cells, library 1_D17
## 33 88
## live sorted cells, library 1_D3 live sorted cells, library 1_D7
## 25 85
## live sorted cells, library 2_D10 live sorted cells, library 2_D17
## 35 83
## live sorted cells, library 2_D3 live sorted cells, library 2_D7
## 27 84
## live sorted cells, library 3_D3 live sorted cells, library 3_D7
## 17 83
## live sorted cells, library 4_D3 live sorted cells, library 4_D7
## 29 83
par(mfrow=c(6,3))
blocked.stats <- dec.grun$per.block
for (i in colnames(blocked.stats)) {
current <- blocked.stats[[i]]
plot(current$mean, current$total, main=i, pch=16, cex=0.5,
xlab="Mean of log-expression", ylab="Variance of log-expression")
curfit <- metadata(current)
points(curfit$mean, curfit$var, col="red", pch=16)
curve(curfit$trend(x), col='dodgerblue', add=TRUE, lwd=2)
}
5.6 Data integration
library(batchelor)
set.seed(1001010)
merged.grun <- fastMNN(sce.grun, subset.row=top.grun, batch=sce.grun$donor)
## D10 D17 D2 D3 D7
## [1,] 0.030283 0.030482 0.000000 0.00000 0.00000
## [2,] 0.007548 0.012081 0.038570 0.00000 0.00000
## [3,] 0.004077 0.005298 0.008043 0.05240 0.00000
## [4,] 0.014128 0.016551 0.016705 0.01539 0.05473
5.8 Clustering
snn.gr <- buildSNNGraph(merged.grun, use.dimred="corrected")
colLabels(merged.grun) <- factor(igraph::cluster_walktrap(snn.gr)$membership)
## Donor
## Cluster D10 D17 D2 D3 D7
## 1 32 70 31 81 28
## 2 3 10 3 3 6
## 3 14 35 3 2 69
## 4 11 119 0 0 55
## 5 11 69 31 2 70
## 6 3 39 0 0 8
## 7 16 38 12 11 46
## 8 1 9 0 0 7
## 9 5 13 0 0 10
## 10 3 2 2 4 2
## 11 4 13 0 0 1
## 12 5 17 0 2 33
gridExtra::grid.arrange(
plotTSNE(merged.grun, colour_by="label"),
plotTSNE(merged.grun, colour_by="batch"),
ncol=2
)
Session Info
R version 4.4.1 (2024-06-14)
Platform: x86_64-pc-linux-gnu
Running under: Ubuntu 24.04.1 LTS
Matrix products: default
BLAS: /home/biocbuild/bbs-3.20-bioc/R/lib/libRblas.so
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.12.0
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_GB LC_COLLATE=C
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
time zone: America/New_York
tzcode source: system (glibc)
attached base packages:
[1] stats4 stats graphics grDevices utils datasets methods
[8] base
other attached packages:
[1] batchelor_1.22.0 scran_1.34.0
[3] scater_1.34.0 ggplot2_3.5.1
[5] scuttle_1.16.0 org.Hs.eg.db_3.20.0
[7] AnnotationDbi_1.68.0 scRNAseq_2.19.1
[9] SingleCellExperiment_1.28.0 SummarizedExperiment_1.36.0
[11] Biobase_2.66.0 GenomicRanges_1.58.0
[13] GenomeInfoDb_1.42.0 IRanges_2.40.0
[15] S4Vectors_0.44.0 BiocGenerics_0.52.0
[17] MatrixGenerics_1.18.0 matrixStats_1.4.1
[19] BiocStyle_2.34.0 rebook_1.16.0
loaded via a namespace (and not attached):
[1] jsonlite_1.8.9 CodeDepends_0.6.6
[3] magrittr_2.0.3 ggbeeswarm_0.7.2
[5] GenomicFeatures_1.58.0 gypsum_1.2.0
[7] farver_2.1.2 rmarkdown_2.28
[9] BiocIO_1.16.0 zlibbioc_1.52.0
[11] vctrs_0.6.5 DelayedMatrixStats_1.28.0
[13] memoise_2.0.1 Rsamtools_2.22.0
[15] RCurl_1.98-1.16 htmltools_0.5.8.1
[17] S4Arrays_1.6.0 AnnotationHub_3.14.0
[19] curl_5.2.3 BiocNeighbors_2.0.0
[21] Rhdf5lib_1.28.0 SparseArray_1.6.0
[23] rhdf5_2.50.0 sass_0.4.9
[25] alabaster.base_1.6.0 bslib_0.8.0
[27] alabaster.sce_1.6.0 httr2_1.0.5
[29] cachem_1.1.0 ResidualMatrix_1.16.0
[31] GenomicAlignments_1.42.0 igraph_2.1.1
[33] lifecycle_1.0.4 pkgconfig_2.0.3
[35] rsvd_1.0.5 Matrix_1.7-1
[37] R6_2.5.1 fastmap_1.2.0
[39] GenomeInfoDbData_1.2.13 digest_0.6.37
[41] colorspace_2.1-1 dqrng_0.4.1
[43] irlba_2.3.5.1 ExperimentHub_2.14.0
[45] RSQLite_2.3.7 beachmat_2.22.0
[47] labeling_0.4.3 filelock_1.0.3
[49] fansi_1.0.6 httr_1.4.7
[51] abind_1.4-8 compiler_4.4.1
[53] bit64_4.5.2 withr_3.0.2
[55] BiocParallel_1.40.0 viridis_0.6.5
[57] DBI_1.2.3 highr_0.11
[59] HDF5Array_1.34.0 alabaster.ranges_1.6.0
[61] alabaster.schemas_1.6.0 rappdirs_0.3.3
[63] DelayedArray_0.32.0 bluster_1.16.0
[65] rjson_0.2.23 tools_4.4.1
[67] vipor_0.4.7 beeswarm_0.4.0
[69] glue_1.8.0 restfulr_0.0.15
[71] rhdf5filters_1.18.0 grid_4.4.1
[73] Rtsne_0.17 cluster_2.1.6
[75] generics_0.1.3 gtable_0.3.6
[77] ensembldb_2.30.0 metapod_1.14.0
[79] BiocSingular_1.22.0 ScaledMatrix_1.14.0
[81] utf8_1.2.4 XVector_0.46.0
[83] ggrepel_0.9.6 BiocVersion_3.20.0
[85] pillar_1.9.0 limma_3.62.0
[87] dplyr_1.1.4 BiocFileCache_2.14.0
[89] lattice_0.22-6 rtracklayer_1.66.0
[91] bit_4.5.0 tidyselect_1.2.1
[93] locfit_1.5-9.10 Biostrings_2.74.0
[95] knitr_1.48 gridExtra_2.3
[97] bookdown_0.41 ProtGenerics_1.38.0
[99] edgeR_4.4.0 xfun_0.48
[101] statmod_1.5.0 UCSC.utils_1.2.0
[103] lazyeval_0.2.2 yaml_2.3.10
[105] evaluate_1.0.1 codetools_0.2-20
[107] tibble_3.2.1 alabaster.matrix_1.6.0
[109] BiocManager_1.30.25 graph_1.84.0
[111] cli_3.6.3 munsell_0.5.1
[113] jquerylib_0.1.4 Rcpp_1.0.13
[115] dir.expiry_1.14.0 dbplyr_2.5.0
[117] png_0.1-8 XML_3.99-0.17
[119] parallel_4.4.1 blob_1.2.4
[121] AnnotationFilter_1.30.0 sparseMatrixStats_1.18.0
[123] bitops_1.0-9 viridisLite_0.4.2
[125] alabaster.se_1.6.0 scales_1.3.0
[127] crayon_1.5.3 rlang_1.1.4
[129] cowplot_1.1.3 KEGGREST_1.46.0
References
Grun, D., M. J. Muraro, J. C. Boisset, K. Wiebrands, A. Lyubimova, G. Dharmadhikari, M. van den Born, et al. 2016. βDe Novo Prediction of Stem Cell Identity using Single-Cell Transcriptome Data.β Cell Stem Cell 19 (2): 266β77.