## ----load package--------------------------------------------------------
library(cTRAP)

## ----load ENCODE samples, eval=FALSE-------------------------------------
#  gene <- "EIF4G1"
#  cellLine <- "HepG2"
#  
#  ENCODEmetadata <- downloadENCODEknockdownMetadata(cellLine, gene)
#  table(ENCODEmetadata$`Experiment target`)
#  length(unique(ENCODEmetadata$`Experiment target`))
#  
#  ENCODEsamples <- downloadENCODEsamples(ENCODEmetadata)
#  counts <- prepareENCODEgeneExpression(ENCODEsamples)

## ----load data, include=FALSE--------------------------------------------
gene <- "EIF4G1"
cellLine <- "HepG2"

data("ENCODEmetadata")
data("counts")

## ----differential gene expression----------------------------------------
# Remove low coverage (at least 10 counts shared across two samples)
minReads   <- 10
minSamples <- 2
filter <- rowSums(counts[ , -c(1, 2)] >= minReads) >= minSamples
counts <- counts[filter, ]

# Convert ENSEMBL identifier to gene symbol
library(biomaRt)
mart <- useDataset("hsapiens_gene_ensembl", useMart("ensembl"))
genes <- sapply(strsplit(counts$gene_id, "\\."), `[`, 1)
geneConversion <- getBM(filters="ensembl_gene_id", values=genes, mart=mart,
                        attributes=c("ensembl_gene_id", "hgnc_symbol"))
counts$gene_id <- geneConversion$hgnc_symbol[
    match(genes, geneConversion$ensembl_gene_id)]

# Perform differential gene expression analysis
diffExpr <- performDifferentialExpression(counts)

## ----t-statistics--------------------------------------------------------
# Get t-statistics of differential expression with respective gene names 
# (expected input for downstream analyses)
diffExprStat <- diffExpr$t
names(diffExprStat) <- diffExpr$Gene_symbol

## ----L1000 metadata conditions-------------------------------------------
# Check available conditions for L1000 perturbations
l1000metadata <- downloadL1000data("l1000metadata.txt", "metadata")
getL1000conditions(l1000metadata)

## ----L1000 knockdown perturbations, eval=FALSE---------------------------
#  # Code for loading CMap gene KD HepG2 data
#  l1000metadataKnockdown <- filterL1000metadata(
#      l1000metadata, cellLine="HepG2",
#      perturbationType="Consensus signature from shRNAs targeting the same gene")
#  l1000zscores  <- downloadL1000data("l1000zscores.gctx", "zscores",
#                                     l1000metadataKnockdown$sig_id)
#  l1000geneInfo <- downloadL1000data("l1000geneInfo.txt", "geneInfo")
#  
#  l1000perturbationsKnockdown <- loadL1000perturbations(
#      l1000metadataKnockdown, l1000zscores, l1000geneInfo)

## ----L1000 small molecule perturbations, eval=FALSE----------------------
#  l1000metadataSmallMolecules <- filterL1000metadata(
#      l1000metadata, cellLine="HepG2", timepoint="24 h",
#      dosage="5 \U00B5M", # \U00B5 is the unicode code for the micro symbol
#      perturbationType="Compound")
#  l1000zscores  <- downloadL1000data("l1000zscores.gctx", "zscores",
#                                     l1000metadataSmallMolecules$sig_id)
#  l1000geneInfo <- downloadL1000data("l1000geneInfo.txt")
#  
#  l1000perturbationsSmallMolecules <- loadL1000perturbations(
#      l1000metadataSmallMolecules, l1000zscores, l1000geneInfo,
#      sanitizeCompoundNames=TRUE)

## ----load L1000 perturbations, include=FALSE-----------------------------
data("l1000perturbationsKnockdown")
data("l1000perturbationsSmallMolecules")

## ----compare knockdown, include=FALSE------------------------------------
compareKnockdown <- list()

# Compare against L1000 using Spearman correlation
compareKnockdown$spearman <- compareAgainstL1000(
    diffExprStat, l1000perturbationsKnockdown, cellLine, method="spearman")

# Compare against L1000 using Pearson correlation
compareKnockdown$pearson <- compareAgainstL1000(
    diffExprStat, l1000perturbationsKnockdown, cellLine, method="pearson")

# Compare against L1000 using gene set enrichment analysis (GSEA) with the top
# and bottom 150 genes
compareKnockdown$gsea <- compareAgainstL1000(
    diffExprStat, l1000perturbationsKnockdown, cellLine, method="gsea",
    geneSize=150)

## ----compare small molecules---------------------------------------------
compareSmallMolecule <- list()
# Compare against L1000 using Spearman correlation
compareSmallMolecule$spearman <- compareAgainstL1000(
    diffExprStat, l1000perturbationsSmallMolecules, cellLine, method="spearman")

# Compare against L1000 using Pearson correlation
compareSmallMolecule$pearson <- compareAgainstL1000(
    diffExprStat, l1000perturbationsSmallMolecules, cellLine, method="pearson")

# Compare against L1000 using gene set enrichment analysis (GSEA) with the top
# and bottom 150 genes
compareSmallMolecule$gsea <- compareAgainstL1000(
    diffExprStat, l1000perturbationsSmallMolecules, cellLine, method="gsea",
    geneSize=150)

## ----order knockdown perturbation comparison-----------------------------
# Order knockdown perturbations according to similarity
compareKnockdown$spearman_ordered <- compareKnockdown$spearman[
    order(compareKnockdown$spearman$HepG2_spearman_coef, decreasing=TRUE)]
compareKnockdown$pearson_ordered <- compareKnockdown$pearson[
    order(compareKnockdown$pearson$HepG2_pearson_coef, decreasing=TRUE)]
compareKnockdown$gsea_ordered <- compareKnockdown$gsea[
    order(compareKnockdown$gsea$HepG2_WTCS, decreasing=TRUE)]

# Most positively associated perturbations (note that EIF4G1 knockdown is the
# 6th, 1st and 2nd most positively associated perturbation based on Spearman
# correlation, Pearson correlation and GSEA, respectively)
head(compareKnockdown$spearman_ordered)
head(compareKnockdown$pearson_ordered)
head(compareKnockdown$gsea_ordered)

# Most negatively associated perturbations
tail(compareKnockdown$spearman_ordered)
tail(compareKnockdown$pearson_ordered)
tail(compareKnockdown$gsea_ordered)

## ----order small molecule perturbation comparison------------------------
# Order small molecule perturbations according to similarity
compareSmallMolecule$spearman_ordered <- compareSmallMolecule$spearman[
    order(compareSmallMolecule$spearman$HepG2_spearman_coef, decreasing=TRUE)]
compareSmallMolecule$pearson_ordered <- compareSmallMolecule$pearson[
    order(compareSmallMolecule$pearson$HepG2_pearson_coef, decreasing=TRUE)]
compareSmallMolecule$gsea_ordered <- compareSmallMolecule$gsea[
    order(compareSmallMolecule$gsea$HepG2_WTCS, decreasing=TRUE)]

# Most positively associated perturbations
head(compareSmallMolecule$spearman_ordered)
head(compareSmallMolecule$pearson_ordered)
head(compareSmallMolecule$gsea_ordered)

# Most negatively associated perturbations
tail(compareSmallMolecule$spearman_ordered)
tail(compareSmallMolecule$pearson_ordered)
tail(compareSmallMolecule$gsea_ordered)

## ----plot knockdown perturbation comparison------------------------------
# Plot relationship with EIF4G1 knockdown
EIF4G1knockdown <- grep("EIF4G1", compareKnockdown$gsea_ordered$genes,
                        value=TRUE)
plotL1000comparison(compareKnockdown$spearman, EIF4G1knockdown)
plotL1000comparison(compareKnockdown$pearson, EIF4G1knockdown)
plotL1000comparison(compareKnockdown$gsea, EIF4G1knockdown, topGenes=TRUE)
plotL1000comparison(compareKnockdown$gsea, EIF4G1knockdown, topGenes=FALSE)

# Plot relationship with most negatively associated perturbation
plotL1000comparison(compareKnockdown$spearman,
                    tail(compareKnockdown$spearman_ordered, 1)$genes)
plotL1000comparison(compareKnockdown$pearson,
                    tail(compareKnockdown$pearson_ordered, 1)$genes)
plotL1000comparison(compareKnockdown$gsea,
                    tail(compareKnockdown$gsea_ordered, 1)$genes, 
                    topGenes=TRUE)
plotL1000comparison(compareKnockdown$gsea, 
                    tail(compareKnockdown$gsea_ordered, 1)$genes, 
                    topGenes=FALSE)

## ----plot small molecule perturbation comparison-------------------------
# Plot relationship with most positively associated perturbation
plotL1000comparison(compareSmallMolecule$spearman,
                    head(compareSmallMolecule$spearman_ordered, 1)$genes)
plotL1000comparison(compareSmallMolecule$pearson,
                    head(compareSmallMolecule$pearson_ordered, 1)$genes)
plotL1000comparison(compareSmallMolecule$gsea,
                    head(compareSmallMolecule$gsea_ordered, 1)$genes)

# Plot relationship with most negatively associated perturbation
plotL1000comparison(compareSmallMolecule$spearman,
                    tail(compareSmallMolecule$spearman_ordered, 1)$genes)
plotL1000comparison(compareSmallMolecule$pearson,
                    tail(compareSmallMolecule$pearson_ordered, 1)$genes)
plotL1000comparison(compareSmallMolecule$gsea,
                    tail(compareSmallMolecule$gsea_ordered, 1)$genes)