Simple example: classic vs. tidy
aggregate(hp ~ cyl, mtcars, mean) # classic
## cyl hp
## 1 4 82.63636
## 2 6 122.28571
## 3 8 209.21429
library(tidyverse)
mtcars %>% group_by(cyl) %>% summarize(hp = mean(hp)) # tidy
## # A tibble: 3 x 2
## cyl hp
## <dbl> <dbl>
## 1 4 82.63636
## 2 6 122.28571
## 3 8 209.21429
Biological example
library(airway)
data(airway)
airway # rich
## class: RangedSummarizedExperiment
## dim: 64102 8
## metadata(1): ''
## assays(1): counts
## rownames(64102): ENSG00000000003 ENSG00000000005 ... LRG_98 LRG_99
## rowData names(0):
## colnames(8): SRR1039508 SRR1039509 ... SRR1039520 SRR1039521
## colData names(9): SampleName cell ... Sample BioSample
hist(log10(rowMeans(assay(airway))))
library(reshape2)
library(tibble)
tbl <- as_tibble(melt(
assay(airway), c("Gene", "Sample"), value.name="Count"
))
tbl # tidy (incomplete -- no metadata)
## # A tibble: 512,816 x 3
## Gene Sample Count
## <fctr> <fctr> <int>
## 1 ENSG00000000003 SRR1039508 679
## 2 ENSG00000000005 SRR1039508 0
## 3 ENSG00000000419 SRR1039508 467
## 4 ENSG00000000457 SRR1039508 260
## 5 ENSG00000000460 SRR1039508 60
## 6 ENSG00000000938 SRR1039508 0
## 7 ENSG00000000971 SRR1039508 3251
## 8 ENSG00000001036 SRR1039508 1433
## 9 ENSG00000001084 SRR1039508 519
## 10 ENSG00000001167 SRR1039508 394
## # ... with 512,806 more rows
tbl %>% group_by(Gene) %>% summarize(mean(Count))
## # A tibble: 64,102 x 2
## Gene `mean(Count)`
## <fctr> <dbl>
## 1 ENSG00000000003 741.875
## 2 ENSG00000000005 0.000
## 3 ENSG00000000419 534.875
## 4 ENSG00000000457 242.000
## 5 ENSG00000000460 58.375
## 6 ENSG00000000938 0.375
## 7 ENSG00000000971 6034.750
## 8 ENSG00000001036 1305.000
## 9 ENSG00000001084 615.125
## 10 ENSG00000001167 392.500
## # ... with 64,092 more rows
Context
data.frame()
.data.frame()
; see the tidyverseVocabulary
Constraints (e.g., probes & samples)
Flexibility
Programming contract
Lessons learned / best practices
library(BiocFileCache)
library(TENxGenomics) # https://github.com/mtmorgan/TENxGenomics
fl <- bfcrpath(BiocFileCache(), "1M_neurons_filtered_gene_bc_matrices_h5.h5")
## Warning: call dbDisconnect() when finished working with a connection
TENxGenomics(fl) # big!
## class: TENxGenomics
## h5path: /home/mtmorgan/.cache/BiocFileCache/38e45fadc64
## dim(): 27998 x 1306127
m = as.matrix(TENxMatrix(fl)[, 1:1000]) # first 1000 cells
sum(m == 0) / length(m) # % zeros
## [1] 0.9276541
sum(rowSums(m == 0) == ncol(m)) / nrow(m) # % genes with no expression
## [1] 0.4264947
hist(log10(1 + colSums(m))) # reads per cell
library(ggplot2)
library(plotly)
static <- mtcars %>% mutate(cyl = factor(cyl)) %>%
ggplot(aes(x=qsec, y=mpg)) +
geom_point(aes(color=cyl))
ggplotly(static)
::ggplotly()
makes interactive visualization more-or-less a no-brainer.devtools create()
, load_all()
, check()
roxygen2 (e.g., devtools::document()
)
Rd
-style.testthat (e.g., devtools::use_testthat()
, devtools::test()
)
Pipes and lazy evaluation (e.g., magrittr, %>%
)
S3 classes or S4 ‘reference’ classes
Other class systems
Continuous integration
R CMD build
/ R CMD check
before pushing commits.(R / Bioconductor) Docker and AMI containers
Research reported in this course was supported by the National Human Genome Research Institute and the National Cancer Institute of the National Institutes of Health under award numbers U41HG004059 and U24CA180996.
This project has received funding from the European Research Council (ERC) under the European Union’s Horizon 2020 research and innovation programme (grant agreement number 633974)