## ----style, echo=FALSE, results='hide', message=FALSE, cache=FALSE---------
library(BiocStyle)
library(knitr)
opts_chunk$set(error=FALSE, message=FALSE, warning=FALSE, cache=TRUE)
opts_chunk$set(fig.asp=1)

## --------------------------------------------------------------------------
library(BiocFileCache)
bfc <- BiocFileCache("raw_data", ask = FALSE)
grun.fname <- bfcrpath(bfc, file.path("ftp://ftp.ncbi.nlm.nih.gov/geo/series",
    "GSE81nnn/GSE81076/suppl/GSE81076%5FD2%5F3%5F7%5F10%5F17%2Etxt%2Egz"))

## --------------------------------------------------------------------------
gse81076.df <- read.table(grun.fname, sep='\t', 
    header=TRUE, stringsAsFactors=FALSE, row.names=1)
dim(gse81076.df)
head(rownames(gse81076.df))
head(colnames(gse81076.df))

## --------------------------------------------------------------------------
donor.names <- sub("^(D[0-9]+).*", "\\1", colnames(gse81076.df))
table(donor.names)
plate.id <- sub("^D[0-9]+(.*)_.*", "\\1", colnames(gse81076.df))
table(plate.id)

## --------------------------------------------------------------------------
gene.symb <- gsub("__chr.*$", "", rownames(gse81076.df))
is.spike <- grepl("^ERCC-", gene.symb)
table(is.spike)

library(org.Hs.eg.db)
gene.ids <- mapIds(org.Hs.eg.db, keys=gene.symb, keytype="SYMBOL", column="ENSEMBL")
gene.ids[is.spike] <- gene.symb[is.spike]

keep <- !is.na(gene.ids) & !duplicated(gene.ids)
gse81076.df <- gse81076.df[keep,]
rownames(gse81076.df) <- gene.ids[keep]
summary(keep)

## --------------------------------------------------------------------------
library(SingleCellExperiment)
sce.gse81076 <- SingleCellExperiment(list(counts=as.matrix(gse81076.df)),
	colData=DataFrame(Donor=donor.names, Plate=plate.id),
	rowData=DataFrame(Symbol=gene.symb[keep]))
isSpike(sce.gse81076, "ERCC") <- grepl("^ERCC-", rownames(gse81076.df)) 
sce.gse81076  

## --------------------------------------------------------------------------
library(scater)
sce.gse81076 <- calculateQCMetrics(sce.gse81076, compact=TRUE)
QC <- sce.gse81076$scater_qc
low.lib <- isOutlier(QC$all$log10_total_counts, type="lower", nmad=3)
low.genes <- isOutlier(QC$all$log10_total_features_by_counts, type="lower", nmad=3)
high.spike <- isOutlier(QC$feature_control_ERCC$pct_counts, type="higher", nmad=3)
data.frame(LowLib=sum(low.lib), LowNgenes=sum(low.genes), 
	HighSpike=sum(high.spike, na.rm=TRUE))

## --------------------------------------------------------------------------
discard <- low.lib | low.genes | high.spike
sce.gse81076 <- sce.gse81076[,!discard]
summary(discard)

## --------------------------------------------------------------------------
library(scran)
library(BiocSingular)
set.seed(1000) # for irlba. 
clusters <- quickCluster(sce.gse81076, use.ranks=FALSE, BSPARAM=IrlbaParam())
table(clusters)
sce.gse81076 <- computeSumFactors(sce.gse81076, min.mean=0.1, clusters=clusters)
summary(sizeFactors(sce.gse81076))

## --------------------------------------------------------------------------
sce.gse81076 <- computeSpikeFactors(sce.gse81076, general.use=FALSE)
summary(sizeFactors(sce.gse81076, "ERCC"))

## --------------------------------------------------------------------------
sce.gse81076 <- normalize(sce.gse81076)

## ----var-gse81076, fig.cap="Variance of normalized log-expression values for each gene in the GSE81076 dataset, plotted against the mean log-expression. The blue line represents the mean-dependent trend fitted to the variances of the spike-in transcripts (red)."----
block <- paste0(sce.gse81076$Plate, "_", sce.gse81076$Donor)
fit <- trendVar(sce.gse81076, block=block, parametric=TRUE) 
dec <- decomposeVar(sce.gse81076, fit)

plot(dec$mean, dec$total, xlab="Mean log-expression", 
	ylab="Variance of log-expression", pch=16)
is.spike <- isSpike(sce.gse81076)
points(dec$mean[is.spike], dec$total[is.spike], col="red", pch=16)
curve(fit$trend(x), col="dodgerblue", add=TRUE)

## --------------------------------------------------------------------------
dec.gse81076 <- dec
dec.gse81076$Symbol <- rowData(sce.gse81076)$Symbol
dec.gse81076 <- dec.gse81076[order(dec.gse81076$bio, decreasing=TRUE),]
head(dec.gse81076)

## ---- echo=FALSE, results="hide"-------------------------------------------
rm(gse81076.df)
gc()

## --------------------------------------------------------------------------
muraro.fname <- bfcrpath(bfc, file.path("ftp://ftp.ncbi.nlm.nih.gov/geo/series",
    "GSE85nnn/GSE85241/suppl",
    "GSE85241%5Fcellsystems%5Fdataset%5F4donors%5Fupdated%2Ecsv%2Egz"))

## --------------------------------------------------------------------------
gse85241.df <- read.table(muraro.fname, sep='\t', 
    header=TRUE, row.names=1, stringsAsFactors=FALSE)
dim(gse85241.df)
head(rownames(gse85241.df))
head(colnames(gse85241.df))

## --------------------------------------------------------------------------
donor.names <- sub("^(D[0-9]+).*", "\\1", colnames(gse85241.df))
table(donor.names)
plate.id <- sub("^D[0-9]+\\.([0-9]+)_.*", "\\1", colnames(gse85241.df))
table(plate.id)

## --------------------------------------------------------------------------
gene.symb <- gsub("__chr.*$", "", rownames(gse85241.df))
is.spike <- grepl("^ERCC-", gene.symb)
table(is.spike)

library(org.Hs.eg.db)
gene.ids <- mapIds(org.Hs.eg.db, keys=gene.symb, keytype="SYMBOL", column="ENSEMBL")
gene.ids[is.spike] <- gene.symb[is.spike]

keep <- !is.na(gene.ids) & !duplicated(gene.ids)
gse85241.df <- gse85241.df[keep,]
rownames(gse85241.df) <- gene.ids[keep]
summary(keep)

## --------------------------------------------------------------------------
sce.gse85241 <- SingleCellExperiment(list(counts=as.matrix(gse85241.df)),
	colData=DataFrame(Donor=donor.names, Plate=plate.id),
	rowData=DataFrame(Symbol=gene.symb[keep]))
isSpike(sce.gse85241, "ERCC") <- grepl("^ERCC-", rownames(gse85241.df)) 
sce.gse85241  

## --------------------------------------------------------------------------
sce.gse85241 <- calculateQCMetrics(sce.gse85241, compact=TRUE)
QC <- sce.gse85241$scater_qc
low.lib <- isOutlier(QC$all$log10_total_counts, type="lower", nmad=3)
low.genes <- isOutlier(QC$all$log10_total_features_by_counts, type="lower", nmad=3)
high.spike <- isOutlier(QC$feature_control_ERCC$pct_counts, type="higher", nmad=3)
data.frame(LowLib=sum(low.lib), LowNgenes=sum(low.genes), 
	HighSpike=sum(high.spike, na.rm=TRUE))

## --------------------------------------------------------------------------
discard <- low.lib | low.genes | high.spike
sce.gse85241 <- sce.gse85241[,!discard]
summary(discard)

## --------------------------------------------------------------------------
set.seed(1000)
clusters <- quickCluster(sce.gse85241, use.ranks=FALSE, BSPARAM=IrlbaParam())
table(clusters)
sce.gse85241 <- computeSumFactors(sce.gse85241, min.mean=0.1, clusters=clusters)
summary(sizeFactors(sce.gse85241))
sce.gse85241 <- computeSpikeFactors(sce.gse85241, general.use=FALSE)
summary(sizeFactors(sce.gse85241, "ERCC"))
sce.gse85241 <- normalize(sce.gse85241)

## ----var-gse85241, fig.cap="Variance of normalized log-expression values for each gene in the GSE85241 dataset, plotted against the mean log-expression. The blue line represents the mean-dependent trend fitted to the variances of the spike-in transcripts (red)."----
block <- paste0(sce.gse85241$Plate, "_", sce.gse85241$Donor)
fit <- trendVar(sce.gse85241, block=block, parametric=TRUE) 
dec <- decomposeVar(sce.gse85241, fit)
plot(dec$mean, dec$total, xlab="Mean log-expression", 
	ylab="Variance of log-expression", pch=16)
is.spike <- isSpike(sce.gse85241)
points(dec$mean[is.spike], dec$total[is.spike], col="red", pch=16)
curve(fit$trend(x), col="dodgerblue", add=TRUE)

## --------------------------------------------------------------------------
dec.gse85241 <- dec
dec.gse85241$Symbol <- rowData(sce.gse85241)$Symbol
dec.gse85241 <- dec.gse85241[order(dec.gse85241$bio, decreasing=TRUE),]
head(dec.gse85241)

## ---- echo=FALSE, results="hide"-------------------------------------------
rm(gse85241.df)
gc()

## --------------------------------------------------------------------------
universe <- intersect(rownames(dec.gse85241), rownames(dec.gse81076))
mean.bio <- (dec.gse85241[universe,"bio"] + dec.gse81076[universe,"bio"])/2
chosen <- universe[mean.bio > 0]
length(chosen)

## --------------------------------------------------------------------------
rescaled <- batchelor::multiBatchNorm(
    sce.gse85241[universe,], 
    sce.gse81076[universe,]
)
rescaled.gse85241 <- rescaled[[1]]
rescaled.gse81076 <- rescaled[[2]]

## --------------------------------------------------------------------------
set.seed(100) 
unc.gse81076 <- logcounts(rescaled.gse81076)[chosen,]
unc.gse85241 <- logcounts(rescaled.gse85241)[chosen,]

mnn.out <- batchelor::fastMNN(
    GSE81076=unc.gse81076, GSE85241=unc.gse85241,
    k=20, d=50, BSPARAM=IrlbaParam(deferred=TRUE)
)
mnn.out

## --------------------------------------------------------------------------
dim(reducedDim(mnn.out, "corrected"))

## --------------------------------------------------------------------------
# Using an Rle for pretty-printing of batch IDs
# (as all cells from the same batch are consecutive).
Rle(mnn.out$batch) 

## --------------------------------------------------------------------------
metadata(mnn.out)$merge.info$pairs[[1]]

## ---- echo=FALSE, results="hide"-------------------------------------------
gc()

## ----tsne-batch, fig.width=10, fig.asp=0.6, fig.cap="t-SNE plots of the pancreas datasets, before and after MNN correction. Each point represents a cell and is coloured by the batch of origin."----
# Adding uncorrected values.
sce <- mnn.out
assay(sce, "original") <- cbind(unc.gse81076, unc.gse85241)

# Using irlba to set up the t-SNE, for speed.
set.seed(100)
osce <- runPCA(sce, exprs_values="original", ntop=Inf, BSPARAM=IrlbaParam())
osce <- runTSNE(osce, use_dimred="PCA")
ot <- plotTSNE(osce, colour_by="batch") + ggtitle("Original")

# Corrected.
set.seed(100)
sce <- runTSNE(sce, use_dimred="corrected")
ct <- plotTSNE(sce, colour_by="batch") + ggtitle("Corrected")

multiplot(ot, ct, cols=2)

## ----tsne-markers, fig.width=10, fig.height=10, fig.cap="t-SNE plots after MNN correction, where each point represents a cell and is coloured by its corrected expression of key marker genes for known cell types in the pancreas."----
# Replacing the row names for easier reference.
rowData(sce)$ENSEMBL <- rownames(sce)    
rowData(sce)$SYMBOL <- mapIds(org.Hs.eg.db, keytype="ENSEMBL", 
    keys=rownames(sce), column="SYMBOL")
rownames(sce) <- uniquifyFeatureNames(rownames(sce), rowData(sce)$SYMBOL)

ct.gcg <- plotTSNE(sce, by_exprs_values="reconstructed", colour_by="GCG") 
ct.ins <- plotTSNE(sce, by_exprs_values="reconstructed", colour_by="INS") 
ct.sst <- plotTSNE(sce, by_exprs_values="reconstructed", colour_by="SST") 
ct.ppy <- plotTSNE(sce, by_exprs_values="reconstructed", colour_by="PPY") 

multiplot(ct.gcg + ggtitle("Alpha cells"),
    ct.ins + ggtitle("Beta cells"),
    ct.sst + ggtitle("Delta cells"),
    ct.ppy + ggtitle("PP cells"),
    cols=2)

## --------------------------------------------------------------------------
metadata(mnn.out)$merge.info$lost.var

## ---- echo=FALSE, results="hide"-------------------------------------------
gc()

## --------------------------------------------------------------------------
mnn.out2 <- batchelor::fastMNN(
    GSE81076=unc.gse81076, GSE85241=unc.gse85241,
    k=20, d=50, auto.order=c(2,1), BSPARAM=IrlbaParam(deferred=TRUE)
)
metadata(mnn.out2)$merge.order # batch 2 (GSE85241) is first in the order.
metadata(mnn.out2)$merge.info$pairs[[1]] # 'first' now refers to GSE85241.

## --------------------------------------------------------------------------
Rle(mnn.out2$batch) # same as mnn.out$batch

## ---- echo=FALSE, results="hide"-------------------------------------------
gc()

## --------------------------------------------------------------------------
all.donors <- unique(rescaled.gse85241$Donor)
table(rescaled.gse85241$Donor)
by.donor.85241 <- vector("list", length(all.donors))
names(by.donor.85241) <- sort(all.donors)
for (x in all.donors) {
    by.donor.85241[[x]] <- rescaled.gse85241[,rescaled.gse85241$Donor==x]
}

## --------------------------------------------------------------------------
adj.donors <- c(D101="A", D102="A", D10631="A",
    D17="B", D1713="B", D172444="B",
    D2="C", D3="C",
    D71="D", D72="D", D73="D", D74="D")[rescaled.gse81076$Donor]
table(adj.donors)

all.donors <- unique(adj.donors)
by.donor.81076 <- vector("list", length(all.donors))
names(by.donor.81076) <- sort(all.donors)
for (x in all.donors) {
    by.donor.81076[[x]] <- rescaled.gse81076[,adj.donors==x]
}

## --------------------------------------------------------------------------
all.batches <- c(by.donor.85241, by.donor.81076)

# Cosine normalizing for consistency with fastMNN() defaults.
all.logcounts <- lapply(all.batches, logcounts)
scaled <- lapply(all.logcounts, batchelor::cosineNorm) 

set.seed(1000) # for irlba.
pc.all <- do.call(batchelor::multiBatchPCA, c(scaled, 
    list(d=50, BSPARAM=IrlbaParam(deferred=TRUE))
))
names(pc.all)

## --------------------------------------------------------------------------
pcs.85241 <- pc.all[seq_along(by.donor.85241)]
mnn.out.85241 <- do.call(batchelor::fastMNN, c(pcs.85241, list(pc.input=TRUE)))
Rle(mnn.out.85241$batch)

## --------------------------------------------------------------------------
pcs.81076 <- tail(pc.all, length(by.donor.81076))
mnn.out.81076 <- do.call(batchelor::fastMNN, c(pcs.81076, list(pc.input=TRUE)))
Rle(mnn.out.81076$batch)

## --------------------------------------------------------------------------
mnn.out3 <- batchelor::fastMNN(
    GSE81076=mnn.out.81076,
    GSE85241=mnn.out.85241, 
    pc.input=TRUE, k=100 # see comments below.
) 

Rle(mnn.out3$batch) # by dataset
Rle(c(mnn.out.81076$batch, mnn.out.85241$batch)) # by donor

## ----tsne-hmerge, fig.width=10, fig.asp=0.5, fig.cap="t-SNE plots after correcting for donor effects within each data set, and after correcting for batch effects between data sets (final). Each point represents a cell that is coloured according to its donor of origin (left, middle) or the data set of origin (right)."----
set.seed(1000)
par(mfrow=c(1,3))

library(Rtsne)
tout.85241 <- Rtsne(mnn.out.85241$corrected, pca=FALSE)
plot(tout.85241$Y[,1], tout.85241$Y[,2], main="GSE85241 donors",
    col=as.factor(mnn.out.85241$batch), xlab="tSNE1", ylab="tSNE2")

tout.81076 <- Rtsne(mnn.out.81076$corrected, pca=FALSE)
plot(tout.81076$Y[,1], tout.81076$Y[,2], main="GSE81076 donors",
    col=as.factor(mnn.out.81076$batch), xlab="tSNE1", ylab="tSNE2")

tout.all <- Rtsne(mnn.out3$corrected, pca=FALSE)
plot(tout.all$Y[,1], tout.all$Y[,2], main="Final",
    col=as.factor(mnn.out3$batch), xlab="tSNE1", ylab="tSNE2")

## --------------------------------------------------------------------------
original.plate <- unlist(lapply(rescaled, "[[", i="Plate"))
original.names <- unlist(lapply(rescaled, colnames))

# Needs unique names: trigger error otherwise.
stopifnot(anyDuplicated(original.names)==0L)

m <- match(rownames(mnn.out3$corrected), original.names)
new.plate <- original.plate[m]

## ---- echo=FALSE, results="hide"-------------------------------------------
gc()

## --------------------------------------------------------------------------
mnn.out.auto <- do.call(batchelor::fastMNN, c(pcs.81076, 
    list(pc.input=TRUE, auto.order=TRUE)))
names(by.donor.81076) # supplied order 
metadata(mnn.out.auto)$merge.order # automatically defined order

## --------------------------------------------------------------------------
Rle(mnn.out.auto$batch) 

## ---- echo=FALSE, results="hide"-------------------------------------------
gc()

## --------------------------------------------------------------------------
snn.gr <- buildSNNGraph(sce, use.dimred="corrected")
clusters <- igraph::cluster_walktrap(snn.gr)
table(clusters$membership, sce$batch)

## ----tsne-cluster, fig.cap="t-SNE plot after MMN correction, where each point represents a cell and is coloured by its cluster identity."----
sce$Cluster <- factor(clusters$membership)
plotTSNE(sce, colour_by="Cluster")

## --------------------------------------------------------------------------
m.out <- findMarkers(sce, clusters$membership, block=sce$batch,
    direction="up", assay.type="original")
demo <- m.out[["10"]] # probably alpha cells.
demo <- demo[demo$Top <= 5,]
as.data.frame(demo[,1:3]) # only first three columns for brevity.

## ---- echo=FALSE, results="hide"-------------------------------------------
# Checking that cluster 10 is what we think it is.
stopifnot(all(sapply(demo["GCG",-(1:3)], sign)==1))

## --------------------------------------------------------------------------
assay(sce, "reconstructed")
summary(assay(sce)["INS",])

## --------------------------------------------------------------------------
rotations <- metadata(pc.all)$rotation
cor.exp <- tcrossprod(mnn.out3$corrected,
    rotations["ENSG00000115263",,drop=FALSE])
summary(cor.exp)

## --------------------------------------------------------------------------
saveRDS(file="pancreas_data.rds", sce)

## --------------------------------------------------------------------------
sessionInfo()