library(CSAMA2014ScalableComputingLab)
fastq.sampler <- FastqSampler(NA12878.20.fastq, n = 1e6,
                              readerBlockSize = 1e6)
set.seed(1975)
fastq <- yield(fastq.sampler)
length(fastq)

qa.result <- qa(fastq, "NA12878")
qa.result[["baseCalls"]]

bam.file <- ScalableGenomics::NA12878.20.bam
param <- ScanBamParam(what=c("pos", "qwidth"),
                      flag=scanBamFlag(isUnmappedQuery=FALSE))
bam <- scanBam(bam.file, param=param)[[1]]
aln.ranges <- IRanges(bam$pos, width=bam$qwidth)
cov <- coverage(aln.ranges)

cov <- coverage(bam.file, param=param)

cov[[20]]

compression <- length(cov) / (length(runValue(cov))*2)
compression

cov.cutoff <- 150L
high.cov <- cov > cov.cutoff
length(high.cov) / (length(runValue(high.cov))*2)

high.cov.vector <- as.vector(high.cov)
microbenchmark(sum(high.cov),  sum(high.cov.vector))
summary(bm)[,c("expr", "median")]

high.cov.views <- slice(cov[[20]], cov.cutoff)
head(high.cov.views)

mean.high.cov <- viewMeans(high.cov.views)
head(mean.high.cov)

genome <- BSgenome.Hsapiens.UCSC.hg19::BSgenome.Hsapiens.UCSC.hg19
object.size(genome)

chr20.views <- Views(genome$chr20, ranges(high.cov.views))

unmasked(genome$chr20)@shared
subject(chr20.views)@shared

high.cov.freq <- alphabetFrequency(chr20.views, as.prob=TRUE, collapse=TRUE)
chr20.freq <- alphabetFrequency(genome$chr20, as.prob=TRUE)
rbind(high.cov.freq, chr20.freq)[,DNA_BASES]

cov20 <- cov[[20]]

session <- browserSession()
chr20.range <- GRangesForUCSCGenome("hg19", "chr20")
repeats.table <- getTable(session, "rmsk", chr20.range)
simple.classes <- c("Simple_repeat", "Low_complexity")
repeats.simple <- subset(repeats.table, repClass %in% simple.classes)
repeats <- with(repeats.simple, IRanges(genoStart, genoEnd))

hits <- findOverlaps(repeats, high.cov.views)
f <- factor(subjectHits(hits), seq_len(subjectLength(hits)))
repeats.olap <- pintersect(repeats[queryHits(hits)],
                           ranges(high.cov.views)[subjectHits(hits)])
repeats.split <- split(repeats.olap, f)
class(repeats.split)

repeat.cov <- sum(width(repeats.split))
length(repeat.cov)

sum(repeat.cov) / sum(width(high.cov.views))

sum(width(reduce(repeats))) / width(chr20.range)

chr20.set <- as(chr20.views, "DNAStringSet")
first.seq <- chr20.set[[1]]
first.seq@shared
unmasked(genome$chr20)@shared

repeats.tree <- GIntervalTree(GRanges("20", repeats))

bam.stream <- BamFile(path(bam.file), yieldSize = 1e6)

maskRule <- function(reads) {
  strand <- Rle(strand("*"), nrow(reads))
  ga <- with(reads, GAlignments(rname, pos, cigar, strand))
  ga %outside% repeats.tree
}
rules <- FilterRules(list(mask = maskRule))

dest <- paste0("masked-", basename(path(bam.file)))
param <- ScanBamParam(what=c("rname", "pos", "cigar"),
                      flag=scanBamFlag(isUnmappedQuery=FALSE))
filtered.bam <- filterBam(bam.file, dest, param=param, filter=rules)

bam.stream <- BamFile(path(filtered.bam), yieldSize = 1e6)
cov2 <- RleList(lapply(seqlengths(bam.stream), Rle, values=0L),
                compress=FALSE)
open(bam.stream)
while(isIncomplete(bam.stream)) {
  reads <- readGAlignments(bam.stream)
  cov2 <- cov2 + coverage(reads)
}
close(bam.stream)

sum(gc(reset=TRUE)[,6])

bam.stream <- Rsamtools::BamFile(path(filtered.bam), yieldSize = 1e6)
coverage(GenomicAlignments::readGAlignments(bam.stream))
sum(gc()[,6])

tiles <- tileGenome(seqinfo(filtered.bam)["20"], tilewidth=1e6)
cov3 <- Reduce(`+`, lapply(tiles, function(tile) {
  param <- ScanBamParam(which=tile)
  reads <- readGAlignments(filtered.bam, param=param)
  intersection <- restrict(unlist(grglist(reads)), start(tile), end(tile))
  coverage(intersection)
}))

identical(cov2, cov3)

export(cov3, "NA12878.20.coverage.bw")

register(SerialParam())

pileup.bams <- BamFileList(NA12878 = filtered.bam)
bpparam <- MulticoreParam(workers=2)
library(VariantTools) # will not work on Windows or older Macs
pileup <- do.call(c, bplapply(tiles, function(tile) {
  pileupVariants(pileup.bams, genome, PileupParam(which=tile))
}, BPPARAM=bpparam))

cov800 <- summary(BigWigFile("NA12878.20.coverage.bw"), size = 3846)
bigwig.track <- ggplot() + geom_bar(cov800[[20]]) + ylab("coverage")
filtered.bam <- BamFile("masked-NA12878.HiSeq.WGS.bwa.cleaned.recal.b37.20.bam")
cov.estimate <- biovizBase::estimateCoverage(filtered.bam)
cov.estimate <- keepSeqlevels(cov.estimate, "20")
estimate.track <- ggplot() + geom_bar(cov.estimate) + ylab("coverage")
p <- tracks(bigwig = bigwig.track, estimate = estimate.track)

cov800 <- summary(BigWigFile("NA12878.20.coverage.bw"), size = 800)

p

high.cov.gr <- GRanges("20", ranges(high.cov.views))
genome(high.cov.gr) <- "hg19"
high.cov.gr <- high.cov.gr[width(high.cov.gr) > 50L]
roi <- high.cov.gr[200] * 2L

reference.track <- autoplot(genome, which = roi)
tally.track <- autoplot(filtered.bam, stat = "mismatch",
                        bsgenome = genome, which = roi, geom = "bar")
vcf <- ScalableGenomics::NA12878.20.vcf
variant.track <- autoplot(vcf, type = "fixed", ref.show = FALSE,
                          which = roi)
p <- tracks(tally = tally.track,
            variant = variant.track, reference = reference.track,
            heights = c(4, 1, 1)) + theme_bw()

pdf(file="../pkg/vignettes/fig/tracks.pdf",height=5,width=9)
p
dev.off()

png(filename="fig/tracks-zoom.png",height=400,width=800)
p + xlim(roi + 10)
dev.off()