## ---- include = FALSE---------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## ----extdata1-----------------------------------------------------------------
sample_id_file <- system.file("extdata/tcga_sample_ids.tsv",
                              package = "SplicingFactory")

sample_ids <- read.table(sample_id_file)

## ----extdata2-----------------------------------------------------------------
gene_id_file <- system.file("extdata/tcga_gene_ids.tsv",
                            package = "SplicingFactory")

gene_ids <- read.table(sample_id_file)

## ----setup--------------------------------------------------------------------
library("SplicingFactory")
library("SummarizedExperiment")

# Load dataset
data(tcga_brca_luma_dataset)

# Extract gene names
genes <- tcga_brca_luma_dataset[, 1]

# Extract read count data without gene names
readcounts <- tcga_brca_luma_dataset[, -1]

# Check read count dataset
dim(readcounts)

head(readcounts[, 1:5])

## ----readfilter---------------------------------------------------------------
tokeep <- rowSums(readcounts > 5) > 5

readcounts <- readcounts[tokeep, ]
genes      <- genes[tokeep]

## ----laplace------------------------------------------------------------------
laplace_entropy <- calculate_diversity(readcounts, genes,  method = "laplace",
                                       norm = TRUE, verbose = TRUE)

head(assay(laplace_entropy)[, 1:5])

## ----gini---------------------------------------------------------------------
gini_index <- calculate_diversity(readcounts, genes, method = "gini",
                                  verbose = TRUE)

head(assay(gini_index)[, 1:5])

## ----divplots-----------------------------------------------------------------
library("tidyr")
library("ggplot2")

# Construct data.frame from SummarizedExperiment result
laplace_data <- cbind(assay(laplace_entropy),
                      Gene = rowData(laplace_entropy)$genes)

# Reshape data.frame
laplace_data <- pivot_longer(laplace_data, -Gene, names_to = "sample",
                             values_to = "entropy")

# Add sample type information
laplace_data$sample_type <- apply(laplace_data[, 2], 1,
                                  function(x) ifelse(grepl("_N", x),
                                                     "Normal", "Tumor"))

# Filter genes with NA entropy values
laplace_data <- drop_na(laplace_data)

# Update gene names and add diversity type column
laplace_data$Gene <- paste0(laplace_data$Gene, "_", laplace_data$sample_type)
laplace_data$diversity <-  "Normalized Laplace entropy"

# Construct data.frame from SummarizedExperiment result
gini_data <- cbind(assay(gini_index), Gene = rowData(gini_index)$genes)

# Reshape data.frame
gini_data <- pivot_longer(gini_data, -Gene, names_to = "sample",
                          values_to = "gini")

# Add sample type information
gini_data$sample_type <- apply(gini_data[, 2], 1,
                               function(x) ifelse(grepl("_N", x),
                                                  "Normal", "Tumor"))

# Filter genes with NA gini values
gini_data <- drop_na(gini_data)

# Update gene names and add diversity type column
gini_data$Gene <- paste0(gini_data$Gene, "_", gini_data$sample_type)
gini_data$diversity <-  "Gini index"

# Plot diversity data
ggplot() +
  geom_density(data = laplace_data, alpha = 0.3,
               aes(x = entropy, group = sample, color = diversity)) +
  geom_density(data = gini_data, alpha = 0.3,
               aes(x = gini, group = sample, color = diversity)) +
  facet_grid(. ~ sample_type) +
  scale_color_manual(values = c("black", "darkorchid4")) +
  guides(color = FALSE) +
  theme_minimal() +
  labs(x = "Diversity values",
       y = "Density")

# Mean entropy calculation across samples for each gene/sample type combination
laplace_entropy_mean <- aggregate(laplace_data$entropy,
                                  by = list(laplace_data$Gene), mean)
colnames(laplace_entropy_mean)[2] <- "mean_entropy"
laplace_entropy_mean <- as_tibble(laplace_entropy_mean)

# Add sample type information
laplace_entropy_mean$sample_type <- apply(laplace_entropy_mean[, 1], 1,
                                          function(x) ifelse(grepl("_Normal", x),
                                                             "Normal", "Tumor"))

# Add diversity type column
laplace_entropy_mean$diversity <-  "Normalized Laplace entropy"

# Mean gini calculation across samples for each gene/sample type combination
gini_mean <- aggregate(gini_data$gini, by = list(gini_data$Gene), mean)
colnames(gini_mean)[2] <- "mean_gini"
gini_mean <- as_tibble(gini_mean)

# Add sample type information
gini_mean$sample_type <- apply(gini_mean[, 1], 1,
                               function(x) ifelse(grepl("_Normal", x),
                                                  "Normal", "Tumor"))

# Add diversity type column
gini_mean$diversity <-  "Gini index"

ggplot() +
  geom_violin(data = laplace_entropy_mean, aes(x = sample_type, y = mean_entropy,
                                               fill = diversity),
              alpha = 0.6) +
  geom_violin(data = gini_mean, aes(x = sample_type, y = mean_gini,
                                    fill = diversity),
              alpha = 0.6) +
  scale_fill_viridis_d(name = "Diversity") +
  coord_flip() +
  theme_minimal() +
  labs(x = "Samples",
       y = "Diversity")

## -----------------------------------------------------------------------------
# Update the SummarizedExperiment object with a new sample metadata column for
# sample types, as the the object returned by calculate_diversity does not
# contain this information.
colData(laplace_entropy) <- cbind(colData(laplace_entropy),
                                      sample_type = ifelse(grepl("_N", laplace_entropy$samples),
                                                           "Normal", "Tumor"))

# Calculate significant entropy changes
entropy_significance <- calculate_difference(x = laplace_entropy, samples = "sample_type",
                                             control = "Normal",
                                             method = "mean", test = "wilcoxon",
                                             verbose = TRUE)

head(entropy_significance)

## -----------------------------------------------------------------------------
entropy_significance$label <- apply(entropy_significance[, c(4, 7)], 1,
                                    function(x) ifelse(abs(x[1]) >= 0.1 & x[2] < 0.05,
                                                       "significant", "non-significant"))

entropy_significance$mean <- apply(entropy_significance[, c(2, 3)], 1,
                                   function(x) (x[1] + x[2]) / 2)

ggplot(entropy_significance, aes(x = mean, y = mean_difference)) +
  geom_point(color = "lightgrey", size = 1) +
  geom_point(data = entropy_significance[entropy_significance$label == "significant", ],
             color = "red", size = 1) +
  theme_minimal() +
  labs(title = "Normalized Laplace entropy",
       subtitle = "Wilcoxon signed rank test",
       x = "Mean entropy",
       y = "Mean difference")

## -----------------------------------------------------------------------------
ggplot(entropy_significance, aes(x = mean_difference,
                                 y = -log10(adjusted_p_values),
                                 color = label)) +
  geom_point() +
  scale_color_manual(values = c("grey", "red"), guide = "none") +
  geom_hline(yintercept = -log10(0.05), color = "red") +
  geom_vline(xintercept = c(0.1, -0.1)) +
  theme_minimal() +
  labs(title = "Normalized Laplace entropy",
       subtitle = "Wilcoxon signed rank test",
       x = "Mean difference of entropy values",
       y = "-Log10(adjusted p-value")

## -----------------------------------------------------------------------------

sessionInfo()