## ----echo=FALSE---------------------------------------------------------------
# These settings make the vignette prettier
knitr::opts_chunk$set(results="hold", collapse=FALSE, message=FALSE, 
                      warning = FALSE)
#refreshPackage("GenomicDistributions")
#devtools::build_vignettes("code/GenomicDistributions")
#devtools::test("code/GenomicDistributions")

## ----eval=FALSE---------------------------------------------------------------
#  if (!requireNamespace("BiocManager", quietly = TRUE))
#  install.packages("BiocManager")
#  BiocManager::install("GenomicDistributions")

## ----install, eval=FALSE------------------------------------------------------
#  devtools::install_github("databio/GenomicDistributions")

## ----echo=TRUE, results="hide", message=FALSE, warning=FALSE------------------
library("GenomicDistributions")
queryFile = system.file("extdata", "vistaEnhancers.bed.gz", package="GenomicDistributions")
query = rtracklayer::import(queryFile)

## ----chrom-plots-single-------------------------------------------------------
# First, calculate the distribution:
x = calcChromBinsRef(query, "hg19")

# Then, plot the result:
plotChromBins(x)

## ----chrom-plots-multi--------------------------------------------------------
# Let's fudge a second region set by shifting the first one over 
query2 = GenomicRanges::shift(query, 1e6)
queryList = GRangesList(vistaEnhancers=query, shifted=query2)
x2 = calcChromBinsRef(queryList, "hg19")
plotChromBins(x2)

## ----tss-distribution, fig.cap="TSS plot. Distribution of query regions relative to TSSs", fig.small=TRUE----
# Calculate the distances:
TSSdist = calcFeatureDistRefTSS(query, "hg19")

# Then plot the result:
plotFeatureDist(TSSdist, featureName="TSS")

## ----tss-distribution-multi, fig.cap="TSS plots with multiple region sets"----

TSSdist2 = calcFeatureDistRefTSS(queryList, "hg19")
plotFeatureDist(TSSdist2, featureName="TSS")


## ----tiled-distance-plot, fig.cap="Tiled feature distance plot. Distribution of query regions relative to arbitrary features", fig.small=TRUE----
plotFeatureDist(TSSdist2, featureName="TSS", tile=TRUE, labelOrder = "center")

## ----Build features-----------------------------------------------------------
featureExample = GenomicRanges::shift(query, round(rnorm(length(query), 0,1000)))

## ----distance-to-features-plot, fig.cap="Feature distance plot. Distribution of query regions relative to arbitrary features", fig.small=TRUE----
fdd = calcFeatureDist(query, featureExample)
plotFeatureDist(fdd)

## ----percentage-partition-plot, fig.cap="Partition distribution plot. Percentage distribution of query regions across genomic features", fig.small=TRUE----
gp = calcPartitionsRef(query, "hg19")
plotPartitions(gp)

## ----multiple-percentage-partition-plot, fig.cap="Partition distribution plot for multiple query region sets.", fig.small=TRUE----
gp2 = calcPartitionsRef(queryList, "hg19")
plotPartitions(gp2)

## ----multiple-raw-partition-plot, fig.cap="Raw partition distribution plot for multiple regionsets", fig.small=TRUE----
# Plot the results:
plotPartitions(gp2, numbers=TRUE)

## ----multiple-proportional-partition-plot, fig.cap="Proportional partition distribution plot for multiple query region sets.", fig.small=TRUE----
gp3 = calcPartitionsRef(queryList, "hg19", bpProportion=TRUE)
plotPartitions(gp3)

## ----corrected-partition-plot, fig.cap="Expected partition distribution plot. Distribution of query regions across genomic features relative to the expected distribution of those features.", fig.small=TRUE----
ep = calcExpectedPartitionsRef(query, "hg19")
plotExpectedPartitions(ep)

## ----multiple-corrected-partition-plot, fig.cap="Expected partition distribution plots for multiple query region sets.", fig.small=TRUE----
ep2 = calcExpectedPartitionsRef(queryList, "hg19")
plotExpectedPartitions(ep2)

## ----multiple-corrected-partition-plot-pvals, fig.cap="Expected partition distribution plots for multiple query region sets with p-values displayed.", fig.small=TRUE----
plotExpectedPartitions(ep2, pval=TRUE)

## ----multiple-corrected-proportional-partition-plot, fig.cap="Expected proportional partition distribution plots for multiple query region sets.", fig.small=TRUE----
ep3 = calcExpectedPartitionsRef(queryList, "hg19", bpProportion=TRUE)
plotExpectedPartitions(ep3)

## ----cumulative-partition-plot, fig.cap="Cumulative partition distribution plot. Cumulative distribution of query regions across genomic features.", fig.small=TRUE----
cp = calcCumulativePartitionsRef(query, "hg19")
plotCumulativePartitions(cp)

## ----multiple-cumulative-partition-plot, fig.cap="Cumulative partition distribution plots for multiple query region sets.", fig.small=TRUE----
cp2 = calcCumulativePartitionsRef(queryList, "hg19")
plotCumulativePartitions(cp2)

## ----specificity-plot-single, fig.height = 6, fig.cap="Specificity of chromatin accessibility across cell types."----
exampleCellMatrixFile = system.file("extdata", "example_cell_matrix.txt", package="GenomicDistributions")
cellMatrix = data.table::fread(exampleCellMatrixFile)
op = calcSummarySignal(query, cellMatrix)
plotSummarySignal(op)

## ----specificity-plot-multi, fig.height = 7, fig.cap="Specificity of chromatin accessibility across cell types."----
op2 = calcSummarySignal(queryList, cellMatrix)
plotSummarySignal(op2)

## ----specificity-plot-multi-color, fig.height = 7, fig.cap="Specificity of chromatin accessibility across cell types."----
op2 = calcSummarySignal(queryList, cellMatrix)
# get cell type metadata from GenomicDistributions
cellTypeMetadata = cellTypeMetadata
plotSummarySignal(op2, metadata = cellTypeMetadata, colorColumn = "tissueType")

## ----specificity-plot-multi-violin, fig.height = 7, fig.cap="Violin plot: chromatin accessibility across cell types."----
plotSummarySignal(signalSummaryList = op2, plotType = "violinPlot", metadata = cellTypeMetadata, colorColumn = "tissueType")

## ----neighbor-distance-plots, fig.cap="Distances between neighboring intervals of a regionset", fig.small=TRUE----

# Calculate the distances 
nd = calcNeighborDist(query)

# Plot the distribution
plotNeighborDist(nd)

## ----multiple-neighbor-distance-plots, fig.cap="Neighboring regions distance for multiple regionsets", fig.small=TRUE----

# Create a list of GRanges objects
s = system.file("extdata", "setB_100.bed.gz", package="GenomicDistributions")
setB = rtracklayer::import(s)
queryList2 = GRangesList(vistaEnhancers=query, setB=setB)

# Calculate the distances
dist2 = calcNeighborDist(queryList2)

# Plot the distribution
plotNeighborDist(dist2)

## ----gc-content-plots, fig.cap="GC content plot. Probability density function of GC percentage", eval=FALSE----
#  # Calculate the GC content
#  gc1 = calcGCContentRef(query, "hg19")
#  
#  # Plot the GC distribution
#  plotGCContent(gc1)

## ----gc-content-plots-multi, fig.cap="Multiple GC content plots.", eval=FALSE----
#  gc2 = calcGCContentRef(queryList, "hg19")
#  plotGCContent(gc2)

## ----dinic-content-plots, fig.cap="Dinucleotide frequency plots.", eval=FALSE----
#  df1 = calcDinuclFreqRef(query, "hg19")
#  plotDinuclFreq(df1)

## ----dinic-content-plots-multi, fig.cap="Multiple dinucleotide frequency plots.", eval=FALSE----
#  df2=calcDinuclFreqRef(queryList, "hg19")
#  plotDinuclFreq(df2)

## ----dinic-content-plots-multi-raw-counts, fig.cap="Multiple dinucleotide frequency plots.", eval=FALSE----
#  df3=calcDinuclFreqRef(queryList, "hg19", rawCounts=TRUE)
#  plotDinuclFreq(df3)

## ----width-distribution-single, fig.cap="Width distribution plot. Frequency of widths in this query"----
# Calculate the widths
qt1 = calcWidth(query)

# Plot the width distribution
plotQTHist(qt1)

## ----width-distribution-multi, fig.cap="Multiple width distribution plots.", fig.small=TRUE----
qt2 = calcWidth(queryList)
plotQTHist(qt2)

## ----width-distribution-colors, fig.cap="Width distribution plot with color options.", fig.small=TRUE----
plotQTHist(qt1, bins=7, EndBarColor = 'black', MiddleBarColor='darkblue')

## -----------------------------------------------------------------------------
fastaSource = "http://ftp.ensembl.org/pub/release-103/fasta/caenorhabditis_elegans/dna/Caenorhabditis_elegans.WBcel235.dna.toplevel.fa.gz"
gtfSource = "http://ftp.ensembl.org/pub/release-103/gtf/caenorhabditis_elegans/Caenorhabditis_elegans.WBcel235.103.gtf.gz"

## ----message=TRUE, eval=FALSE-------------------------------------------------
#  CEelegansChromSizes = getChromSizesFromFasta(source=fastaSource)

## ----message=TRUE, eval=FALSE-------------------------------------------------
#  bins  = getGenomeBins(CEelegansChromSizes)

## ----message=TRUE, eval=FALSE-------------------------------------------------
#  CEelegansTSSs = getTssFromGTF(source=gtfSource, convertEnsemblUCSC=TRUE)

## ----message=TRUE, eval=FALSE-------------------------------------------------
#  features = c("gene", "exon", "three_prime_utr", "five_prime_utr")
#  CEelegansGeneModels = getGeneModelsFromGTF(source=gtfSource, features=features, convertEnsemblUCSC=TRUE)

## ----message=TRUE, eval=FALSE-------------------------------------------------
#  partitionList = genomePartitionList(CEelegansGeneModels$gene,
#                                      CEelegansGeneModels$exon,
#                                      CEelegansGeneModels$three_prime_utr,
#                                      CEelegansGeneModels$five_prime_utr)

## ----eval=FALSE---------------------------------------------------------------
#  funcgen = biomaRt::useMart("ENSEMBL_MART_FUNCGEN", host="grch37.ensembl.org")
#  funcgen = biomaRt::useDataset("hsapiens_regulatory_feature", mart=funcgen)
#  
#  enhancers = biomaRt::getBM(attributes=c('chromosome_name',
#                                          'chromosome_start',
#                                          'chromosome_end',
#                                          'feature_type_name'),
#                    filters='regulatory_feature_type_name',
#                    values='Enhancer',
#                    mart=funcgen)

## ----eval=FALSE---------------------------------------------------------------
#  enhancers$chromosome_name = paste("chr", enhancers$chromosome_name, sep="")
#  
#  gr1 = GenomicRanges::sort(GenomeInfoDb::sortSeqlevels(
#      GenomicDistributions::dtToGr(enhancers,
#                                   chr="chromosome_name",
#                                   start="chromosome_start",
#                                   end="chromosome_end")))

## ----eval=FALSE---------------------------------------------------------------
#  geneModels = GenomicDistributions::getGeneModels("hg19")
#  partitionList = GenomicDistributions::genomePartitionList(geneModels$genesGR,
#                                                            geneModels$exonsGR,
#                                                            geneModels$threeUTRGR,
#                                                            geneModels$fiveUTRGR)
#  partitionList[["enhancer"]] = gr1