###################################################
### chunk number 1: setup
###################################################
options(width=72)
.oldlocale <- Sys.setlocale(locale="C")


###################################################
### chunk number 2: PackageLoads
###################################################
library("BSgenome.Mmusculus.UCSC.mm9")


###################################################
### chunk number 3: LoadTopReads
###################################################
load(file.path("..", "data", "topReads.rda"))
ls()


###################################################
### chunk number 4: check
###################################################
  ## checking
  stopifnot(all(sapply(topReads, length)==length(topReads[[1L]])))
  dd = dim(topReads[[1L]][[1L]])
  for(expts in topReads) for(lanes in expts) stopifnot(identical(dim(lanes), dd))
  rm(list=c("dd", "expts", "lanes"))


###################################################
### chunk number 5: show
###################################################
topReads[[1]][[1]]
colnames(topReads[[1]][[1]])
topReads[[1]][[1]][1,"read"]
topReads[[1]][[1]][1,"count"]


###################################################
### chunk number 6: DNAString1
###################################################
r1 <- topReads[["experiment2"]][["lane1"]][,"read"][[1]]
nchar(r1)
alphabetFrequency(r1)


###################################################
### chunk number 7: DNAString2
###################################################
reverseComplement(r1)
subseq(r1, start=5, end=15)
subseq(r1, end=15)
subseq(r1, start=-5)


###################################################
### chunk number 8: DNAStringSet
###################################################
dss <- topReads[["experiment2"]][["lane1"]][,"read"]

length(dss)
table(width(dss))

dss[-2]

rev(dss)

dss[[1]]

DNAStringSet(dss, end=-3)
DNAStringSet(dss, start=-10)

head(alphabetFrequency(dss))
reverseComplement(dss)

# Use 'collapse=TRUE' to collapse all the rows
alphabetFrequency(dss, collapse=TRUE)
alphabetFrequency(reverseComplement(dss), collapse=TRUE)

# Use [ , ] to subset the matrix returned by alphabetFrequency()
dss <- dss[alphabetFrequency(dss)[ ,"N"] == 0]


###################################################
### chunk number 9: DNAStringViews
###################################################
v3 <- Views(dss[[1]], start=c(2, 12, 20), end=c(5, 26, 27))

subject(v3)
start(v3)
end(v3)
gaps(v3)

alphabetFrequency(v3)

DNAStringSet(v3)


###################################################
### chunk number 10: BSgenome1
###################################################
Mmusculus
seqlengths(Mmusculus)


###################################################
### chunk number 11: BSgenome2
###################################################
Mmusculus$chrM


###################################################
### chunk number 12: BSgenome3
###################################################
unmasked(Mmusculus$chrM)


###################################################
### chunk number 13: BSgenome4
###################################################
af <- sapply(seqnames(Mmusculus),
        function(name)
        alphabetFrequency(unmasked(Mmusculus[[name]]), baseOnly=TRUE))


###################################################
### chunk number 14: figaf
###################################################
barplot(t(af), beside=TRUE, col=rainbow(ncol(af)))


###################################################
### chunk number 15: BSgenome5
###################################################
reg = 10000000+(1:5000)
plus_strand <- chartr("C", "T", unmasked(Mmusculus$chr1)[reg])
alphabetFrequency(plus_strand, baseOnly=TRUE)


###################################################
### chunk number 16: BSgenome6
###################################################
minus_strand <- chartr("G", "A", unmasked(Mmusculus$chr1)[reg])
alphabetFrequency(minus_strand, baseOnly=TRUE)


###################################################
### chunk number 17: matchPattern1
###################################################
pattern <- DNAString("ACCGGTTATC")
matchPattern(pattern, Mmusculus$chr1)


###################################################
### chunk number 18: matchPattern2
###################################################
matchPattern(reverseComplement(pattern), Mmusculus$chr1)


###################################################
### chunk number 19: matchPattern2
###################################################
matchPattern(pattern, Mmusculus$chr1, max.mismatch=2, with.indels=TRUE)


###################################################
### chunk number 20: vmatchPattern1
###################################################
Mmusculus$upstream5000
m <- vmatchPattern(pattern, Mmusculus$upstream5000)


###################################################
### chunk number 21: vmatchPattern2
###################################################
which(countIndex(m) != 0)


###################################################
### chunk number 22: vmatchPattern3
###################################################
m[[2956]]


###################################################
### chunk number 23: Ambiguities
###################################################
matchPattern("GAACTTTGCCACTC", Mmusculus$chr1)


###################################################
### chunk number 24: Ambiguities2
###################################################
matchPattern("GAACTNTGCCACTC", Mmusculus$chr1)
matchPattern("GAACTNTGCCACTC", Mmusculus$chr1, fixed=FALSE)


###################################################
### chunk number 25: mask1
###################################################
chr1 <- Mmusculus$chr1
active(masks(chr1))["TRF"] <- TRUE 


###################################################
### chunk number 26: mask2
###################################################
active(masks(chr1)) <- TRUE 


###################################################
### chunk number 27: WorkingWithMasks1
###################################################
maskedratio(masks(Mmusculus$chr1)["AGAPS"])


###################################################
### chunk number 28: WorkingWithMasks2
###################################################
chr1 <- Mmusculus$chr1
active(masks(chr1)) <- FALSE
active(masks(chr1))["AGAPS"] <- TRUE
chr1
alphabetFrequency(chr1, baseOnly=TRUE)
active(masks(chr1))["AMB"] <- TRUE
alphabetFrequency(chr1, baseOnly=TRUE)
alphabetFrequency(unmasked(chr1), baseOnly=TRUE)


###################################################
### chunk number 29: WorkingWithMasks3
###################################################
chr1 <- Mmusculus$chr1
active(masks(chr1)) <- FALSE
active(masks(chr1))["AGAPS"] <- TRUE
as(chr1 , "XStringViews")


###################################################
### chunk number 30: WorkingWithMasks4
###################################################
active(masks(chr1)) <- TRUE
chr1
matchPattern("ACACACACACACACACACAC", chr1)
matchPattern("ACACACACACACACACACAC", unmasked(chr1))


###################################################
### chunk number 31: FullAnalysis
###################################################
pdss <- PDict(dss)

m <- matchPDict(pdss, Mmusculus$chr1)

Rle(countIndex(m))
which(countIndex(m) == max(countIndex(m)))
pdss[[46]]

Views(unmasked(Mmusculus$chr1), start=start(m[[46]]), end=end(m[[46]]))

matchPattern(pdss[[46]], Mmusculus$chr1)

### Hits in the minus strand:
pdict1 <- PDict(reverseComplement(dss))
m1 <- matchPDict(pdict1, Mmusculus$chr1)
Rle(countIndex(m1))
which(countIndex(m1) == max(countIndex(m1)))
reverseComplement(pdict1[[433]])

# The previous analysis was for exact hits. To find inexact hits
# with at most 2 mismatches per read in the last 20 nucleotides, we
# need to specify a Trusted Band during preprocessing:
pdict2 <- PDict(dss, tb.end=16)

# and to call matchPDict() with 'max.mismatch=2':
m2 <- matchPDict(pdict2, Mmusculus$chr1, max.mismatch=2)

# Of course we find the same hits or more for each read:
all(countIndex(m2) >= countIndex(m))

which(countIndex(m2) == max(countIndex(m2)))

pdss[[90]]


###################################################
### chunk number 32: FindTopReads eval=FALSE
###################################################
## sp <-
##   list("experiment1" = SolexaPath(file.path("path", "to", "experiment1")),
##        "experiment2" = SolexaPath(file.path("path", "to", "experiment2")))
## patSeq <- paste("s_", 1:8, "_.*_seq.txt", sep = "")
## names(patSeq) <- paste("lane", 1:8, sep = "")
## topReads <-
##   lapply(seq_len(length(sp)),
##          function(i) {
##            print(experimentPath(sp[[i]]))
##            lapply(seq_len(length(patSeq)),
##                   function(j, n = 1000) {
##                       cat("Reading", patSeq[[j]], "...")
##                       x <-
##                         tables(readXStringColumns(baseCallPath(sp[[i]]),
##                                                   pattern = patSeq[[j]],
##                                                   colClasses =
##                                                   c(rep(list(NULL), 4),
##                                                   list("DNAString")))[[1]],
##                                n = n)[["top"]]
##                       names(x) <- chartr("-", "N", names(x))
##                       cat("done.\n")
##                       XDataFrame(read = DNAStringSet(names(x)),
##                                  count = unname(x))
##                   })
##          })
## names(topReads) <- names(sp)
## for (i in seq_len(length(sp))) {
##     names(topReads[[i]]) <- names(patSeq)
## }


###################################################
### chunk number 33: sessionInfo
###################################################
toLatex(sessionInfo())