Chapter 7 Human PBMCs (10X Genomics)
7.1 Introduction
This performs an analysis of the public PBMC ID dataset generated by 10X Genomics (Zheng et al. 2017), starting from the filtered count matrix.
7.3 Quality control
Cell calling implicitly serves as a QC step to remove libraries with low total counts and number of detected genes. Thus, we will only filter on the mitochondrial proportion.
library(scater)
stats <- high.mito <- list()
for (n in names(all.sce)) {
current <- all.sce[[n]]
is.mito <- grep("MT", rowData(current)$Symbol_TENx)
stats[[n]] <- perCellQCMetrics(current, subsets=list(Mito=is.mito))
high.mito[[n]] <- isOutlier(stats[[n]]$subsets_Mito_percent, type="higher")
all.sce[[n]] <- current[,!high.mito[[n]]]
}
qcplots <- list()
for (n in names(all.sce)) {
current <- unfiltered[[n]]
colData(current) <- cbind(colData(current), stats[[n]])
current$discard <- high.mito[[n]]
qcplots[[n]] <- plotColData(current, x="sum", y="subsets_Mito_percent",
colour_by="discard") + scale_x_log10()
}
do.call(gridExtra::grid.arrange, c(qcplots, ncol=3))
## $pbmc3k
## Mode FALSE TRUE
## logical 2609 91
##
## $pbmc4k
## Mode FALSE TRUE
## logical 4182 158
##
## $pbmc8k
## Mode FALSE TRUE
## logical 8157 224
7.4 Normalization
We perform library size normalization, simply for convenience when dealing with file-backed matrices.
## $pbmc3k
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.234 0.748 0.926 1.000 1.157 6.604
##
## $pbmc4k
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.315 0.711 0.890 1.000 1.127 11.027
##
## $pbmc8k
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.296 0.704 0.877 1.000 1.118 6.794
7.5 Variance modelling
library(scran)
all.dec <- lapply(all.sce, modelGeneVar)
all.hvgs <- lapply(all.dec, getTopHVGs, prop=0.1)
par(mfrow=c(1,3))
for (n in names(all.dec)) {
curdec <- all.dec[[n]]
plot(curdec$mean, curdec$total, pch=16, cex=0.5, main=n,
xlab="Mean of log-expression", ylab="Variance of log-expression")
curfit <- metadata(curdec)
curve(curfit$trend(x), col='dodgerblue', add=TRUE, lwd=2)
}
7.6 Dimensionality reduction
For various reasons, we will first analyze each PBMC dataset separately rather than merging them together. We use randomized SVD, which is more efficient for file-backed matrices.
library(BiocSingular)
set.seed(10000)
all.sce <- mapply(FUN=runPCA, x=all.sce, subset_row=all.hvgs,
MoreArgs=list(ncomponents=25, BSPARAM=RandomParam()),
SIMPLIFY=FALSE)
set.seed(100000)
all.sce <- lapply(all.sce, runTSNE, dimred="PCA")
set.seed(1000000)
all.sce <- lapply(all.sce, runUMAP, dimred="PCA")
7.7 Clustering
for (n in names(all.sce)) {
g <- buildSNNGraph(all.sce[[n]], k=10, use.dimred='PCA')
clust <- igraph::cluster_walktrap(g)$membership
colLabels(all.sce[[n]]) <- factor(clust)
}
## $pbmc3k
##
## 1 2 3 4 5 6 7 8 9 10
## 487 154 603 514 31 150 179 333 147 11
##
## $pbmc4k
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13
## 497 185 569 786 373 232 44 1023 77 218 88 54 36
##
## $pbmc8k
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
## 1004 759 1073 1543 367 150 201 2067 59 154 244 67 76 285 20 15
## 17 18
## 64 9
all.tsne <- list()
for (n in names(all.sce)) {
all.tsne[[n]] <- plotTSNE(all.sce[[n]], colour_by="label") + ggtitle(n)
}
do.call(gridExtra::grid.arrange, c(all.tsne, list(ncol=2)))
7.8 Data integration
With the per-dataset analyses out of the way, we will now repeat the analysis after merging together the three batches.
# Intersecting the common genes.
universe <- Reduce(intersect, lapply(all.sce, rownames))
all.sce2 <- lapply(all.sce, "[", i=universe,)
all.dec2 <- lapply(all.dec, "[", i=universe,)
# Renormalizing to adjust for differences in depth.
library(batchelor)
normed.sce <- do.call(multiBatchNorm, all.sce2)
# Identifying a set of HVGs using stats from all batches.
combined.dec <- do.call(combineVar, all.dec2)
combined.hvg <- getTopHVGs(combined.dec, n=5000)
set.seed(1000101)
merged.pbmc <- do.call(fastMNN, c(normed.sce,
list(subset.row=combined.hvg, BSPARAM=RandomParam())))
We use the percentage of lost variance as a diagnostic measure.
## pbmc3k pbmc4k pbmc8k
## [1,] 7.003e-03 3.126e-03 0.000000
## [2,] 7.137e-05 5.125e-05 0.003003
We proceed to clustering:
g <- buildSNNGraph(merged.pbmc, use.dimred="corrected")
colLabels(merged.pbmc) <- factor(igraph::cluster_louvain(g)$membership)
table(colLabels(merged.pbmc), merged.pbmc$batch)
##
## pbmc3k pbmc4k pbmc8k
## 1 523 405 826
## 2 327 588 1125
## 3 185 122 218
## 4 150 180 293
## 5 172 340 577
## 6 283 533 1002
## 7 346 638 1203
## 8 433 749 1533
## 9 17 27 111
## 10 112 387 832
## 11 34 120 204
## 12 11 54 160
## 13 12 3 9
## 14 4 36 64
And visualization:
set.seed(10101010)
merged.pbmc <- runTSNE(merged.pbmc, dimred="corrected")
gridExtra::grid.arrange(
plotTSNE(merged.pbmc, colour_by="label", text_by="label", text_colour="red"),
plotTSNE(merged.pbmc, colour_by="batch")
)
Session Info
R version 4.2.1 (2022-06-23)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 20.04.5 LTS
Matrix products: default
BLAS: /home/biocbuild/bbs-3.16-bioc/R/lib/libRblas.so
LAPACK: /home/biocbuild/bbs-3.16-bioc/R/lib/libRlapack.so
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_GB LC_COLLATE=C
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats4 stats graphics grDevices utils datasets methods
[8] base
other attached packages:
[1] batchelor_1.14.0 BiocSingular_1.14.0
[3] scran_1.26.0 scater_1.26.0
[5] ggplot2_3.3.6 scuttle_1.8.0
[7] TENxPBMCData_1.15.0 HDF5Array_1.26.0
[9] rhdf5_2.42.0 DelayedArray_0.24.0
[11] Matrix_1.5-1 SingleCellExperiment_1.20.0
[13] SummarizedExperiment_1.28.0 Biobase_2.58.0
[15] GenomicRanges_1.50.0 GenomeInfoDb_1.34.0
[17] IRanges_2.32.0 S4Vectors_0.36.0
[19] BiocGenerics_0.44.0 MatrixGenerics_1.10.0
[21] matrixStats_0.62.0 BiocStyle_2.26.0
[23] rebook_1.8.0
loaded via a namespace (and not attached):
[1] AnnotationHub_3.6.0 BiocFileCache_2.6.0
[3] igraph_1.3.5 BiocParallel_1.32.0
[5] digest_0.6.30 htmltools_0.5.3
[7] viridis_0.6.2 fansi_1.0.3
[9] magrittr_2.0.3 memoise_2.0.1
[11] ScaledMatrix_1.6.0 cluster_2.1.4
[13] limma_3.54.0 Biostrings_2.66.0
[15] colorspace_2.0-3 blob_1.2.3
[17] rappdirs_0.3.3 ggrepel_0.9.1
[19] xfun_0.34 dplyr_1.0.10
[21] crayon_1.5.2 RCurl_1.98-1.9
[23] jsonlite_1.8.3 graph_1.76.0
[25] glue_1.6.2 gtable_0.3.1
[27] zlibbioc_1.44.0 XVector_0.38.0
[29] Rhdf5lib_1.20.0 scales_1.2.1
[31] DBI_1.1.3 edgeR_3.40.0
[33] Rcpp_1.0.9 viridisLite_0.4.1
[35] xtable_1.8-4 dqrng_0.3.0
[37] bit_4.0.4 rsvd_1.0.5
[39] ResidualMatrix_1.8.0 metapod_1.6.0
[41] httr_1.4.4 FNN_1.1.3.1
[43] dir.expiry_1.6.0 ellipsis_0.3.2
[45] pkgconfig_2.0.3 XML_3.99-0.12
[47] farver_2.1.1 uwot_0.1.14
[49] CodeDepends_0.6.5 sass_0.4.2
[51] dbplyr_2.2.1 locfit_1.5-9.6
[53] utf8_1.2.2 tidyselect_1.2.0
[55] labeling_0.4.2 rlang_1.0.6
[57] later_1.3.0 AnnotationDbi_1.60.0
[59] munsell_0.5.0 BiocVersion_3.16.0
[61] tools_4.2.1 cachem_1.0.6
[63] cli_3.4.1 generics_0.1.3
[65] RSQLite_2.2.18 ExperimentHub_2.6.0
[67] evaluate_0.17 stringr_1.4.1
[69] fastmap_1.1.0 yaml_2.3.6
[71] knitr_1.40 bit64_4.0.5
[73] purrr_0.3.5 KEGGREST_1.38.0
[75] sparseMatrixStats_1.10.0 mime_0.12
[77] compiler_4.2.1 beeswarm_0.4.0
[79] filelock_1.0.2 curl_4.3.3
[81] png_0.1-7 interactiveDisplayBase_1.36.0
[83] tibble_3.1.8 statmod_1.4.37
[85] bslib_0.4.0 stringi_1.7.8
[87] highr_0.9 lattice_0.20-45
[89] bluster_1.8.0 vctrs_0.5.0
[91] pillar_1.8.1 lifecycle_1.0.3
[93] rhdf5filters_1.10.0 BiocManager_1.30.19
[95] jquerylib_0.1.4 RcppAnnoy_0.0.20
[97] BiocNeighbors_1.16.0 cowplot_1.1.1
[99] bitops_1.0-7 irlba_2.3.5.1
[101] httpuv_1.6.6 R6_2.5.1
[103] bookdown_0.29 promises_1.2.0.1
[105] gridExtra_2.3 vipor_0.4.5
[107] codetools_0.2-18 assertthat_0.2.1
[109] withr_2.5.0 GenomeInfoDbData_1.2.9
[111] parallel_4.2.1 grid_4.2.1
[113] beachmat_2.14.0 rmarkdown_2.17
[115] DelayedMatrixStats_1.20.0 Rtsne_0.16
[117] shiny_1.7.3 ggbeeswarm_0.6.0
References
Zheng, G. X., J. M. Terry, P. Belgrader, P. Ryvkin, Z. W. Bent, R. Wilson, S. B. Ziraldo, et al. 2017. “Massively parallel digital transcriptional profiling of single cells.” Nat Commun 8 (January): 14049.