## ----echo=FALSE, results='hide', warning=FALSE, message=FALSE-----------------
suppressPackageStartupMessages({
    library(MSA2dist)
    library(Biostrings)
    library(ape)
    library(dplyr)
    library(tidyr)
    library(ggplot2)
    })

## -----------------------------------------------------------------------------
# load MSA2dist
library(MSA2dist)
# load example data
data(hiv, package="MSA2dist")
data(AAMatrix, package="MSA2dist")
data(woodmouse, package="ape")

## -----------------------------------------------------------------------------
## define two cds sequences
cds1 <- Biostrings::DNAString("ATGCAACATTGC")
cds2 <- Biostrings::DNAString("ATG---CATTGC")
cds1.cds2.aln <- c(Biostrings::DNAStringSet(cds1),
    Biostrings::DNAStringSet(cds2))
## define names
names(cds1.cds2.aln) <- c("seq1", "seq2")
## convert into alignment
cds1.cds2.aln |> dnastring2aln()

## -----------------------------------------------------------------------------
## convert back into DNAStringSet
cds1.cds2.aln |> dnastring2aln() |> aln2dnastring()

## -----------------------------------------------------------------------------
## convert into alignment
cds1.cds2.aln |> dnastring2dnabin()

## -----------------------------------------------------------------------------
## convert back into DNAStringSet
cds1.cds2.aln |> dnastring2dnabin() |> dnabin2dnastring()
## use woodmouse data
woodmouse |> dnabin2dnastring()

## -----------------------------------------------------------------------------
## translate cds into aa
aa1.aa2.aln <- cds1.cds2.aln |> cds2aa()
## convert into alignment
aa1.aa2.aln |> aastring2aln()

## -----------------------------------------------------------------------------
## convert back into AAStringSet
aa1.aa2.aln |> aastring2aln() |> aln2aastring()

## -----------------------------------------------------------------------------
## convert into AAbin
aa1.aa2.aln |> aastring2aabin()

## -----------------------------------------------------------------------------
## convert back into AAStringSet
aa1.aa2.aln |> aastring2aabin() |> aabin2aastring()

## -----------------------------------------------------------------------------
## define two cds sequences
cds1 <- Biostrings::DNAString("ATGCAACATTGC")
cds2 <- Biostrings::DNAString("ATG---CATTGC")
cds1.cds2.aln <- c(Biostrings::DNAStringSet(cds1),
    Biostrings::DNAStringSet(cds2))
## define names
names(cds1.cds2.aln) <- c("seq1", "seq2")
## translate cds into aa
cds1.cds2.aln |> cds2aa()
aa1.aa2.aln <- cds1.cds2.aln |> cds2aa()

## -----------------------------------------------------------------------------
## translate cds into aa using frame = 2
## result is empty due to not multiple of three
cds1.cds2.aln |> cds2aa(frame=2)
## translate cds into aa using frame = 2 and shorten = TRUE
cds1.cds2.aln |> cds2aa(frame=2, shorten=TRUE)
## translate cds into aa using frame = 3 and shorten = TRUE
cds1.cds2.aln |> cds2aa(frame=3, shorten=TRUE)
## use woodmouse data
woodmouse |> dnabin2dnastring() |> cds2aa(shorten=TRUE)

## -----------------------------------------------------------------------------
## alternative genetic code
## use woodmouse data
woodmouse |> dnabin2dnastring() |> cds2aa(shorten=TRUE,
    genetic.code=Biostrings::getGeneticCode("2"))

## -----------------------------------------------------------------------------
## calculate pairwise AA distances based on Grantham's distance
aa.dist <- hiv |> cds2aa() |> aastring2dist(score=granthamMatrix())
## obtain distances
head(aa.dist$distSTRING)
## obtain pairwise sites used
head(aa.dist$sitesUsed)

## -----------------------------------------------------------------------------
## create and plot bionj tree
aa.dist.bionj <- ape::bionj(as.dist(aa.dist$distSTRING))
plot(aa.dist.bionj)

## -----------------------------------------------------------------------------
## use AAMatrix data
head(AAMatrix)
aa.dist.AAMatrix <- hiv |> cds2aa() |> aastring2dist(score=AAMatrix)
head(aa.dist.AAMatrix$distSTRING)

## -----------------------------------------------------------------------------
## use hiv data
dna.dist <- hiv |> dnastring2dist(model="K80")
## obtain distances
head(dna.dist$distSTRING)
## obtain pairwise sites used
head(dna.dist$sitesUsed)

## -----------------------------------------------------------------------------
## create and plot bionj tree
dna.dist.bionj <- ape::bionj(as.dist(dna.dist$distSTRING))

## -----------------------------------------------------------------------------
## creation of the association matrix:
association <- cbind(aa.dist.bionj$tip.label, aa.dist.bionj$tip.label)
## cophyloplot
ape::cophyloplot(aa.dist.bionj,
    dna.dist.bionj,
    assoc=association,
    length.line=4,
    space=28,
    gap=3,
    rotate=FALSE)

## -----------------------------------------------------------------------------
## use hiv data
hiv.dist.iupac <- head(hiv |> dnastring2dist(model="IUPAC"))
head(hiv.dist.iupac$distSTRING)
## run multi-threaded
system.time(hiv |> dnastring2dist(model="IUPAC", threads=1))
system.time(hiv |> dnastring2dist(model="IUPAC", threads=2))

## -----------------------------------------------------------------------------
## use woodmouse data
woodmouse.dist <- woodmouse |> dnabin2dnastring() |> dnastring2dist()
head(woodmouse.dist$distSTRING)

## -----------------------------------------------------------------------------
## use hiv data
## model Li
head(hiv |> dnastring2kaks(model="Li"))
## model NG86
head(hiv |> dnastring2kaks(model="NG86", threads=1))

## -----------------------------------------------------------------------------
## define two cds sequences
cds1 <- Biostrings::DNAString("ATGCAACATTGC")
cds2 <- Biostrings::DNAString("ATG---CATTGC")
cds1.cds2.aln <- c(Biostrings::DNAStringSet(cds1),
    Biostrings::DNAStringSet(cds2))
## convert into alignment
cds1.cds2.aln |> dnastring2codonmat()

## -----------------------------------------------------------------------------
## use frame 2 and shorten to circumvent multiple of three error
cds1 <- Biostrings::DNAString("-ATGCAACATTGC-")
cds2 <- Biostrings::DNAString("-ATG---CATTGC-")
cds1.cds2.aln <- c(Biostrings::DNAStringSet(cds1),
    Biostrings::DNAStringSet(cds2))
cds1.cds2.aln |> dnastring2codonmat(frame=2, shorten=TRUE)

## -----------------------------------------------------------------------------
## use hiv data
## calculate average behavior
hiv.xy <- hiv |> dnastring2codonmat() |> codonmat2xy()

## -----------------------------------------------------------------------------
print(hiv.xy |> dplyr::select(Codon,SynMean,NonSynMean,IndelMean) |>
    tidyr::gather(variable, values, -Codon) |>
    ggplot2::ggplot(aes(x=Codon, y=values)) + 
    ggplot2::geom_line(aes(colour=factor(variable))) + 
    ggplot2::geom_point(aes(colour=factor(variable))) + 
    ggplot2::ggtitle("HIV-1 sample 136 patient 1 from
        Sweden envelope glycoprotein (env) gene"))

## ----sessionInfo, echo=TRUE---------------------------------------------------
sessionInfo()