## ----style, echo = FALSE, results = 'asis'---------------------------------
BiocStyle::markdown()

## ----global_options, include = FALSE---------------------------------------
knitr::opts_chunk$set(prompt = TRUE, collapse = TRUE, fig.align = "center")

## ----eval = FALSE----------------------------------------------------------
#  if (!requireNamespace("BiocManager", quietly=TRUE))
#      install.packages("BiocManager")
#  BiocManager::install("transcriptR")

## ----eval = TRUE, echo = FALSE, message = FALSE----------------------------
library(transcriptR)

## ----cache = FALSE, echo = TRUE, results = 'hide', message = FALSE---------
library(GenomicFeatures)
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
# Use UCSC genes as the reference annotations
knownGene <- TxDb.Hsapiens.UCSC.hg19.knownGene
# Extract genes information
knownGene.genes <- genes(knownGene)

## ---- cache = FALSE--------------------------------------------------------
# load TranscriptionDataSet object
data(tds)
# view TranscriptionDataSet object
tds

## ---- eval = FALSE---------------------------------------------------------
#  # or initialize a new one (do not run the following code)
#  # specify a region of interest (optional)
#  region <- GRanges(seqnames = "chr15", ranges = IRanges(start = 63260000, end = 63300000))
#  # object construction
#  tds <- constructTDS(file = path.to.Bam,
#                      region = region,
#                      fragment.size = 250,
#                      unique = FALSE,
#                      paired.end = FALSE,
#                      swap.strand = TRUE)

## --------------------------------------------------------------------------
estimateBackground(tds, fdr.cutoff = 0.01)
# view estimated cutoff value
tds

## ----eval = FALSE----------------------------------------------------------
#  # look at the coverage profile of the regions expressed above the background level
#  exportCoverage(object = tds, file = "plus.bw", type = "bigWig", strand = "+",
#                 filter.by.coverage.cutoff = TRUE, rpm = FALSE)
#  
#  # or check the raw coverage (all expressed regions)
#  exportCoverage(object = tds, file = "plus_raw.bw", type = "bigWig", strand = "+",
#                 filter.by.coverage.cutoff = FALSE, rpm = FALSE)

## --------------------------------------------------------------------------
# create a range of gap distances to test
# from 0 bp to 10000 bp with the step of 100 bp
gd <- seq(from = 0, to = 10000, by = 100)
estimateGapDistance(object = tds, annot = knownGene.genes, filter.annot = TRUE, 
                    fpkm.quantile = 0.25, gap.dist.range = gd)
# view the optimal gap distance minimazing the sum of two errors
tds

## ----fig.cap = "Gap distance error rate.", fig.height=5, fig.width=5-------
# get intermediate calculation 
gdTest <- getTestedGapDistances(tds)
head(gdTest)
# plot error rates
plotErrorRate(tds, lwd = 2)

## --------------------------------------------------------------------------
detectTranscripts(tds, estimate.params = TRUE)

## --------------------------------------------------------------------------
annotateTranscripts(object = tds, annot = knownGene.genes, min.overlap = 0.5)

## --------------------------------------------------------------------------
trx <- getTranscripts(tds, min.length = 250, min.fpkm = 0.5) 
head(trx, 5)

## ----eval = FALSE----------------------------------------------------------
#  transcriptsToBed(object = trx, file = "transcripts.bed", strand.color = c("blue", "red"))

## ---- cache = FALSE--------------------------------------------------------
# load ChipDataSet object
data(cds)
cds

## ---- eval = FALSE---------------------------------------------------------
#  # or initialize a new one (do not run the following code)
#  # specify the region of interest (optional)
#  region <- GRanges(seqnames = "chr15", ranges = IRanges(start = 63260000, end = 63300000))
#  # object construction
#  cds <- constructCDS(peaks = path.to.peaks,       # path to a peak file
#                      reads = path.to.reads,       # path to a Bam file with reads
#                      region = region,
#                      TxDb = knownGene,            # annotation database to evaluate
#                                                   # genomic distribution of the peaks
#                      tssOf = "transcript",        # genomic feature to extract TSS region
#                      tss.region = c(-2000, 2000), # size of the TSS region,
#                                                   # from -2kb to +2 kb
#                      reduce.peaks = TRUE,         # merge neighboring peaks
#                      gapwidth = 500,              # min. gap distance between peaks
#                      unique = TRUE,
#                      swap.strand = FALSE)

## ----eval = TRUE-----------------------------------------------------------
peaks <- getPeaks(cds)
head(peaks, 3)

## ----cache = FALSE---------------------------------------------------------
getGenomicAnnot(cds)

## ---- fig.cap = "Genomic distribution of the peaks", fig.height=5, fig.width=5----
plotGenomicAnnot(object = cds, plot.type = "distr")

## ---- fig.cap = "Enrichment of peaks at distinct genomic features.", fig.height=5, fig.width=5----
plotGenomicAnnot(object = cds, plot.type = "enrich")

## ----eval = TRUE, fig.cap = "Estimated features", fig.height = 4, fig.width = 8----
plotFeatures(cds, plot.type = "box")

## ----eval = TRUE-----------------------------------------------------------
predictTssOverlap(object = cds, feature = "pileup", p = 0.75)
cds

## --------------------------------------------------------------------------
peaks <- getPeaks(cds)
head(peaks, 3)

## ----eval = TRUE-----------------------------------------------------------
getConfusionMatrix(cds)
getPredictorSignificance(cds)

## ---- fig.cap="Performance of the classification model (gene associated peaks prediction).", eval = TRUE, fig.width = 4.5, fig.height = 4.5----
plotROC(object = cds, col = "red3", grid = TRUE, auc.polygon = TRUE)

## ----eval = TRUE-----------------------------------------------------------
predictStrand(cdsObj = cds, tdsObj = tds, quant.cutoff = 0.1, win.size = 2000)
cds
peaks <- getPeaks(cds)
head(peaks[ peaks$predicted.tssOverlap == "yes" ], 3)

## ----eval = TRUE-----------------------------------------------------------
df <- getQuadProb(cds, strand = "+")
head(df, 3)

## ----eval = TRUE-----------------------------------------------------------
 getProbTreshold(cds)

## ----eval = FALSE----------------------------------------------------------
#  peaksToBed(object = cds, file = "peaks.bed", gene.associated.peaks = TRUE)

## --------------------------------------------------------------------------
# set `estimate.params = TRUE` to re-calculate FPKM and coverage density
breakTranscriptsByPeaks(tdsObj = tds, cdsObj = cds, estimate.params = TRUE)
# re-annotate identified transcripts
annotateTranscripts(object = tds, annot = knownGene.genes, min.overlap = 0.5)
# retrieve the final set of transcripts
trx.final <- getTranscripts(tds)

## ----eval = FALSE----------------------------------------------------------
#  # visualize the final set of transcripts in a UCSC genome browser
#  transcriptsToBed(object = trx.final, file = "transcripts_final.bed")

## --------------------------------------------------------------------------
hits <- findOverlaps(query = trx, subject = trx.final)
trx.broken <- trx[unique(queryHits(hits)[duplicated(queryHits(hits))])]
head(trx.broken, 5)

## ----echo=FALSE------------------------------------------------------------
sessionInfo()