###################################################
### chunk number 1: setup
###################################################
options(width=72)
olocale <- Sys.setlocale(locale="C")


###################################################
### chunk number 2: PackageLoads
###################################################
library("BSgenome.Mmusculus.UCSC.mm9")


###################################################
### chunk number 3: setwd eval=FALSE
###################################################
## setwd(file.path("path", "to", "data"))


###################################################
### chunk number 4: setwd
###################################################
# Change this path to the appropriate location
setwd("~/Documents/workspace/bioC/Courses/Seattle-Jan-2009/labs")


###################################################
### chunk number 5: LoadTopReads
###################################################
load(file.path("data", "topReads.rda"))


###################################################
### chunk number 6: DNAString
###################################################
r1 <- topReads[["experiment2"]][["lane1"]][,"read"][[1]]

nchar(r1)
alphabetFrequency(r1)

reverseComplement(r1)

subseq(r1, start=5, end=15)
subseq(r1, end=15)
subseq(r1, start=-5)


###################################################
### chunk number 7: DNAStringSet
###################################################
dict0 <- topReads[["experiment2"]][["lane1"]][,"read"]

length(dict0)
table(width(dict0))

dict0[-2]

rev(dict0)

dict0[[1]]

DNAStringSet(dict0, end=-3)
DNAStringSet(dict0, start=-10)

head(alphabetFrequency(dict0))
reverseComplement(dict0)

# Use 'collapse=TRUE' to collapse all the rows
alphabetFrequency(dict0, collapse=TRUE)
alphabetFrequency(reverseComplement(dict0), collapse=TRUE)

# Use [ , ] to subset the matrix returned by alphabetFrequency()
dict0 <- dict0[alphabetFrequency(dict0)[ ,"N"] == 0]


###################################################
### chunk number 8: DNAStringViews
###################################################
v3 <- Views(dict0[[1]], start=c(2, 12, 20), end=c(5, 26, 27))

subject(v3)
start(v3)
end(v3)
gaps(v3)

alphabetFrequency(v3)

DNAStringSet(v3)


###################################################
### chunk number 9: BSgenome
###################################################
library("BSgenome.Mmusculus.UCSC.mm9")
Mmusculus

seqlengths(Mmusculus)

Mmusculus$chrM
unmasked(Mmusculus$chrM)

# To see whether the chromosomes contain IUPAC extended letters, we apply
# alphabetFrequency() to each unmasked chromosome:
af <- sapply(seqnames(Mmusculus),
        function(name)
        alphabetFrequency(unmasked(Mmusculus[[name]]), baseOnly=TRUE))
af

# Bisulfite transformation of the plus strand:
plus_strand <- chartr("C", "T", unmasked(Mmusculus$chr1))
alphabetFrequency(plus_strand)

# Bisulfite transformation of the minus strand:
minus_strand <- chartr("G", "A", unmasked(Mmusculus$chr1))
alphabetFrequency(minus_strand)


###################################################
### chunk number 10: matchPattern
###################################################
pattern <- DNAString("ACCGGTTATC")
matchPattern(pattern, Mmusculus$chr1)

# Reverse complement 'pattern' instead of 'Mmusculus$chr1': it's more
# memory efficient and it keeps coordinates relative to the plus strand
# which is what everybody seems to do (NCBI, UCSC, etc...)
matchPattern(reverseComplement(pattern), Mmusculus$chr1)

matchPattern(pattern, Mmusculus$chr1, max.mismatch=2, with.indels=TRUE)


###################################################
### chunk number 11: vmatchPattern
###################################################
Mmusculus$upstream5000
m <- vmatchPattern(pattern, Mmusculus$upstream5000)

### To get the indices of the references sequences with hits:
which(countIndex(m) != 0)

### To get the hits in reference sequence 2956:
m[[2956]]


###################################################
### chunk number 12: Ambiguities
###################################################
matchPattern("GAACTTTGCCACTC", Mmusculus$chr1)

# By default, 'fixed' is TRUE so the N in the pattern can only match an N
# in the subject:
matchPattern("GAACTNTGCCACTC", Mmusculus$chr1)

matchPattern("GAACTNTGCCACTC", Mmusculus$chr1, fixed=FALSE)


###################################################
### chunk number 13: 
###################################################
library(BSgenome.Mmusculus.UCSC.mm9)


###################################################
### chunk number 14: 
###################################################
chr1 <- Mmusculus$chr1
active(masks(chr1))["TRF"] <- TRUE # activate Tandem Repeats Finder mask


###################################################
### chunk number 15: 
###################################################
active(masks(chr1)) <- TRUE # activate all the masks


###################################################
### chunk number 16: WorkingWithMasks
###################################################
maskedratio(masks(Mmusculus$chr1)["AGAPS"])

# 'Mmusculus$chr1' is an immutable object so before we can turn its masks
# on or off, we need to copy it to another variable (the chromosome sequence
# is not copied during this operation):
chr1 <- Mmusculus$chr1
active(masks(chr1)) <- FALSE
active(masks(chr1))["AGAPS"] <- TRUE

chr1

alphabetFrequency(chr1)
active(masks(chr1))["AMB"] <- TRUE

alphabetFrequency(chr1)

alphabetFrequency(unmasked(chr1))

# To display the contig as views:
chr1 <- Mmusculus$chr1
active(masks(chr1)) <- FALSE
active(masks(chr1))["AGAPS"] <- TRUE
as(chr1 , "XStringViews")

# Activate all masks
active(masks(chr1)) <- TRUE
chr1

matchPattern("ACACACACACACACACACAC", chr1)
matchPattern("ACACACACACACACACACAC", unmasked(chr1))


###################################################
### chunk number 17: FullAnalysis
###################################################
pdict0 <- PDict(dict0)

m <- matchPDict(pdict0, Mmusculus$chr1)

Rle(countIndex(m))
which(countIndex(m) == max(countIndex(m)))
pdict0[[46]]

Views(unmasked(Mmusculus$chr1), start=start(m[[46]]), end=end(m[[46]]))

matchPattern(pdict0[[46]], Mmusculus$chr1)

### Hits in the minus strand:
pdict1 <- PDict(reverseComplement(dict0))
m1 <- matchPDict(pdict1, Mmusculus$chr1)
Rle(countIndex(m1))
which(countIndex(m1) == max(countIndex(m1)))
reverseComplement(pdict1[[433]])

# The previous analysis was for exact hits. To find inexact hits
# with at most 2 mismatches per read in the last 20 nucleotides, we
# need to specify a Trusted Band during preprocessing:
pdict2 <- PDict(dict0, tb.end=16)

# and to call matchPDict() with 'max.mismatch=2':
m2 <- matchPDict(pdict2, Mmusculus$chr1, max.mismatch=2)

# Of course we find the same hits or more for each read:
all(countIndex(m2) >= countIndex(m))

which(countIndex(m2) == max(countIndex(m2)))

pdict0[[90]]


###################################################
### chunk number 18: sessionInfo
###################################################
toLatex(sessionInfo())