Chapter 8 Segerstolpe human pancreas (Smart-seq2)
8.1 Introduction
This performs an analysis of the Segerstolpe et al. (2016) dataset, consisting of human pancreas cells from various donors.
8.2 Data loading
library(AnnotationHub)
edb <- AnnotationHub()[["AH73881"]]
symbols <- rowData(sce.seger)$symbol
ens.id <- mapIds(edb, keys=symbols, keytype="SYMBOL", column="GENEID")
ens.id <- ifelse(is.na(ens.id), symbols, ens.id)
# Removing duplicated rows.
keep <- !duplicated(ens.id)
sce.seger <- sce.seger[keep,]
rownames(sce.seger) <- ens.id[keep]
We simplify the names of some of the relevant column metadata fields for ease of access. Some editing of the cell type labels is necessary for consistency with other data sets.
emtab.meta <- colData(sce.seger)[,c("cell type", "disease",
"individual", "single cell well quality")]
colnames(emtab.meta) <- c("CellType", "Disease", "Donor", "Quality")
colData(sce.seger) <- emtab.meta
sce.seger$CellType <- gsub(" cell", "", sce.seger$CellType)
sce.seger$CellType <- paste0(
toupper(substr(sce.seger$CellType, 1, 1)),
substring(sce.seger$CellType, 2))
8.3 Quality control
We remove low quality cells that were marked by the authors.
We then perform additional quality control as some of the remaining cells still have very low counts and numbers of detected features.
For some batches that seem to have a majority of low-quality cells (Figure 8.1), we use the other batches to define an appropriate threshold via subset=
.
low.qual <- sce.seger$Quality == "OK, filtered"
library(scater)
stats <- perCellQCMetrics(sce.seger)
qc <- quickPerCellQC(stats, percent_subsets="altexps_ERCC_percent",
batch=sce.seger$Donor,
subset=!sce.seger$Donor %in% c("H6", "H5"))
sce.seger <- sce.seger[,!(qc$discard | low.qual)]
colData(unfiltered) <- cbind(colData(unfiltered), stats)
unfiltered$discard <- qc$discard
gridExtra::grid.arrange(
plotColData(unfiltered, x="Donor", y="sum", colour_by="discard") +
scale_y_log10() + ggtitle("Total count") +
theme(axis.text.x = element_text(angle = 90)),
plotColData(unfiltered, x="Donor", y="detected", colour_by="discard") +
scale_y_log10() + ggtitle("Detected features") +
theme(axis.text.x = element_text(angle = 90)),
plotColData(unfiltered, x="Donor", y="altexps_ERCC_percent",
colour_by="discard") + ggtitle("ERCC percent") +
theme(axis.text.x = element_text(angle = 90)),
ncol=2
)
## low_lib_size low_n_features high_altexps_ERCC_percent
## 788 1056 1031
## discard
## 1246
8.4 Normalization
We don’t normalize the spike-ins at this point as there are some cells with no spike-in counts.
library(scran)
clusters <- quickCluster(sce.seger)
sce.seger <- computeSumFactors(sce.seger, clusters=clusters)
sce.seger <- logNormCounts(sce.seger)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.014 0.390 0.708 1.000 1.332 11.182
plot(librarySizeFactors(sce.seger), sizeFactors(sce.seger), pch=16,
xlab="Library size factors", ylab="Deconvolution factors", log="xy")
8.5 Variance modelling
We do not use cells with no spike-ins for variance modelling. Donor H1 also has very low spike-in counts and is subsequently ignored.
for.hvg <- sce.seger[,librarySizeFactors(altExp(sce.seger)) > 0 & sce.seger$Donor!="H1"]
dec.seger <- modelGeneVarWithSpikes(for.hvg, "ERCC", block=for.hvg$Donor)
chosen.hvgs <- getTopHVGs(dec.seger, n=2000)
par(mfrow=c(3,3))
blocked.stats <- dec.seger$per.block
for (i in colnames(blocked.stats)) {
current <- blocked.stats[[i]]
plot(current$mean, current$total, main=i, pch=16, cex=0.5,
xlab="Mean of log-expression", ylab="Variance of log-expression")
curfit <- metadata(current)
points(curfit$mean, curfit$var, col="red", pch=16)
curve(curfit$trend(x), col='dodgerblue', add=TRUE, lwd=2)
}
8.6 Dimensionality reduction
We pick the first 25 PCs for downstream analyses, as it’s a nice square number.
8.7 Clustering
library(bluster)
clust.out <- clusterRows(reducedDim(sce.seger, "PCA"), NNGraphParam(), full=TRUE)
snn.gr <- clust.out$objects$graph
colLabels(sce.seger) <- clust.out$clusters
We see a strong donor effect in Figures 8.4 and 5.3.
This might be due to differences in cell type composition between donors,
but the more likely explanation is that of a technical difference in plate processing or uninteresting genotypic differences.
The implication is that we should have called fastMNN()
at some point.
tab <- table(Cluster=colLabels(sce.seger), Donor=sce.seger$Donor)
library(pheatmap)
pheatmap(log10(tab+10), color=viridis::viridis(100))
gridExtra::grid.arrange(
plotTSNE(sce.seger, colour_by="label"),
plotTSNE(sce.seger, colour_by="Donor"),
ncol=2
)
8.8 Data integration
We repeat the clustering after running fastMNN()
on the donors.
This yields a more coherent set of clusters in Figure 8.6 where each cluster contains contributions from all donors.
library(batchelor)
set.seed(10001010)
corrected <- fastMNN(sce.seger, batch=sce.seger$Donor, subset.row=chosen.hvgs)
set.seed(10000001)
corrected <- runTSNE(corrected, dimred="corrected")
colLabels(corrected) <- clusterRows(reducedDim(corrected, "corrected"), NNGraphParam())
tab <- table(Cluster=colLabels(corrected), Donor=corrected$batch)
tab
## Donor
## Cluster H1 H2 H3 H4 H5 H6 T2D1 T2D2 T2D3 T2D4
## 1 4 20 80 3 2 4 8 29 24 13
## 2 14 53 37 41 14 19 13 20 11 70
## 3 3 19 67 8 27 11 3 78 124 46
## 4 8 21 2 6 11 6 9 6 5 34
## 5 2 1 0 1 2 9 1 2 2 1
## 6 29 114 26 136 49 72 140 121 85 96
## 7 1 1 2 6 3 10 3 12 13 4
## 8 4 20 16 2 1 8 70 8 10 34
gridExtra::grid.arrange(
plotTSNE(corrected, colour_by="label"),
plotTSNE(corrected, colour_by="batch"),
ncol=2
)
8.9 Multi-sample comparisons
This particular dataset contains both healthy donors and those with type II diabetes. It is thus of some interest to identify genes that are differentially expressed upon disease in each cell type. To keep things simple, we use the author-provided annotation rather than determining the cell type for each of our clusters.
## class: SingleCellExperiment
## dim: 25454 105
## metadata(0):
## assays(1): counts
## rownames(25454): ENSG00000118473 ENSG00000142920 ... ENSG00000278306
## eGFP
## rowData names(2): refseq symbol
## colnames: NULL
## colData names(9): CellType Disease ... CellType ncells
## reducedDimNames(2): PCA TSNE
## mainExpName: endogenous
## altExpNames(0):
Here, we will use the voom
pipeline from the limma package instead of the QL approach with edgeR.
This allows us to use sample weights to better account for the variation in the precision of each pseudo-bulk profile.
We see that insulin is downregulated in beta cells in the disease state, which is sensible enough.
summed.beta <- summed[,summed$CellType=="Beta"]
library(edgeR)
y.beta <- DGEList(counts(summed.beta), samples=colData(summed.beta),
genes=rowData(summed.beta)[,"symbol",drop=FALSE])
y.beta <- y.beta[filterByExpr(y.beta, group=y.beta$samples$Disease),]
y.beta <- calcNormFactors(y.beta)
design <- model.matrix(~Disease, y.beta$samples)
v.beta <- voomWithQualityWeights(y.beta, design)
fit.beta <- lmFit(v.beta)
fit.beta <- eBayes(fit.beta, robust=TRUE)
res.beta <- topTable(fit.beta, sort.by="p", n=Inf,
coef="Diseasetype II diabetes mellitus")
head(res.beta)
## symbol logFC AveExpr t P.Value adj.P.Val B
## ENSG00000254647 INS -2.728 16.680 -7.671 3.191e-06 0.03902 4.842
## ENSG00000137731 FXYD2 -2.595 7.265 -6.705 1.344e-05 0.08219 3.353
## ENSG00000169297 NR0B1 -2.092 6.790 -5.789 5.810e-05 0.09916 1.984
## ENSG00000181029 TRAPPC5 -2.127 7.046 -5.678 7.007e-05 0.09916 1.877
## ENSG00000105707 HPN -1.803 6.118 -5.654 7.298e-05 0.09916 1.740
## LOC284889 LOC284889 -2.113 6.652 -5.515 9.259e-05 0.09916 1.571
We also create some diagnostic plots to check for potential problems in the analysis.
The MA plots exhibit the expected shape (Figure 8.7)
while the differences in the sample weights in Figure 8.8 justify the use of voom()
in this context.
# Easier to just re-run it with plot=TRUE than
# to try to make the plot from 'v.beta'.
voomWithQualityWeights(y.beta, design, plot=TRUE)
Session Info
R Under development (unstable) (2024-10-21 r87258)
Platform: x86_64-pc-linux-gnu
Running under: Ubuntu 24.04.1 LTS
Matrix products: default
BLAS: /home/biocbuild/bbs-3.21-bioc/R/lib/libRblas.so
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.12.0
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_GB LC_COLLATE=C
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
time zone: America/New_York
tzcode source: system (glibc)
attached base packages:
[1] stats4 stats graphics grDevices utils datasets methods
[8] base
other attached packages:
[1] edgeR_4.5.1 limma_3.63.3
[3] batchelor_1.23.0 pheatmap_1.0.12
[5] bluster_1.17.0 BiocSingular_1.23.0
[7] scran_1.35.0 scater_1.35.0
[9] ggplot2_3.5.1 scuttle_1.17.0
[11] ensembldb_2.31.0 AnnotationFilter_1.31.0
[13] GenomicFeatures_1.59.1 AnnotationDbi_1.69.0
[15] AnnotationHub_3.15.0 BiocFileCache_2.15.0
[17] dbplyr_2.5.0 scRNAseq_2.21.0
[19] SingleCellExperiment_1.29.1 SummarizedExperiment_1.37.0
[21] Biobase_2.67.0 GenomicRanges_1.59.1
[23] GenomeInfoDb_1.43.2 IRanges_2.41.2
[25] S4Vectors_0.45.2 BiocGenerics_0.53.3
[27] generics_0.1.3 MatrixGenerics_1.19.1
[29] matrixStats_1.5.0 BiocStyle_2.35.0
[31] rebook_1.17.0
loaded via a namespace (and not attached):
[1] RColorBrewer_1.1-3 jsonlite_1.8.9
[3] CodeDepends_0.6.6 magrittr_2.0.3
[5] ggbeeswarm_0.7.2 gypsum_1.3.0
[7] farver_2.1.2 rmarkdown_2.29
[9] BiocIO_1.17.1 vctrs_0.6.5
[11] DelayedMatrixStats_1.29.1 memoise_2.0.1
[13] Rsamtools_2.23.1 RCurl_1.98-1.16
[15] htmltools_0.5.8.1 S4Arrays_1.7.1
[17] curl_6.1.0 BiocNeighbors_2.1.2
[19] Rhdf5lib_1.29.0 SparseArray_1.7.2
[21] rhdf5_2.51.2 sass_0.4.9
[23] alabaster.base_1.7.2 bslib_0.8.0
[25] alabaster.sce_1.7.0 httr2_1.0.7
[27] cachem_1.1.0 ResidualMatrix_1.17.0
[29] GenomicAlignments_1.43.0 igraph_2.1.3
[31] mime_0.12 lifecycle_1.0.4
[33] pkgconfig_2.0.3 rsvd_1.0.5
[35] Matrix_1.7-1 R6_2.5.1
[37] fastmap_1.2.0 GenomeInfoDbData_1.2.13
[39] digest_0.6.37 colorspace_2.1-1
[41] dqrng_0.4.1 irlba_2.3.5.1
[43] ExperimentHub_2.15.0 RSQLite_2.3.9
[45] beachmat_2.23.6 labeling_0.4.3
[47] filelock_1.0.3 httr_1.4.7
[49] abind_1.4-8 compiler_4.5.0
[51] bit64_4.5.2 withr_3.0.2
[53] BiocParallel_1.41.0 viridis_0.6.5
[55] DBI_1.2.3 HDF5Array_1.35.3
[57] alabaster.ranges_1.7.0 alabaster.schemas_1.7.0
[59] rappdirs_0.3.3 DelayedArray_0.33.3
[61] rjson_0.2.23 tools_4.5.0
[63] vipor_0.4.7 beeswarm_0.4.0
[65] glue_1.8.0 restfulr_0.0.15
[67] rhdf5filters_1.19.0 grid_4.5.0
[69] Rtsne_0.17 cluster_2.1.8
[71] gtable_0.3.6 metapod_1.15.0
[73] ScaledMatrix_1.15.0 XVector_0.47.2
[75] ggrepel_0.9.6 BiocVersion_3.21.1
[77] pillar_1.10.1 dplyr_1.1.4
[79] lattice_0.22-6 rtracklayer_1.67.0
[81] bit_4.5.0.1 tidyselect_1.2.1
[83] locfit_1.5-9.10 Biostrings_2.75.3
[85] knitr_1.49 gridExtra_2.3
[87] bookdown_0.42 ProtGenerics_1.39.1
[89] xfun_0.50 statmod_1.5.0
[91] UCSC.utils_1.3.0 lazyeval_0.2.2
[93] yaml_2.3.10 evaluate_1.0.3
[95] codetools_0.2-20 tibble_3.2.1
[97] alabaster.matrix_1.7.4 BiocManager_1.30.25
[99] graph_1.85.1 cli_3.6.3
[101] munsell_0.5.1 jquerylib_0.1.4
[103] Rcpp_1.0.14 dir.expiry_1.15.0
[105] png_0.1-8 XML_3.99-0.18
[107] parallel_4.5.0 blob_1.2.4
[109] sparseMatrixStats_1.19.0 bitops_1.0-9
[111] viridisLite_0.4.2 alabaster.se_1.7.0
[113] scales_1.3.0 purrr_1.0.2
[115] crayon_1.5.3 rlang_1.1.4
[117] cowplot_1.1.3 KEGGREST_1.47.0