See [slides][]
A recent tweet provides a nice summary of efforts to benchmark gene set enrichement analysis methods using the GSEABenchmarkR package.
library(EnrichmentBrowser)
Data input and massage
library(airway)
data(airway)
airway$dex <- relevel(airway$dex, "untrt")
Differential expression analysis
library(DESeq2)
des <- DESeqDataSet(airway, design = ~ cell + dex)
des <- DESeq(des)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
res <- results(des)
Transition to tidy data
library(dplyr)
library(tibble)
tbl <- res %>%
as.data.frame() %>%
rownames_to_column("ENSEMBL") %>%
as_tibble()
tbl
## # A tibble: 64,102 x 7
## ENSEMBL baseMean log2FoldChange lfcSE stat pvalue padj
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ENSG0000000… 709. -0.381 0.101 -3.79 1.52e-4 1.28e-3
## 2 ENSG0000000… 0 NA NA NA NA NA
## 3 ENSG0000000… 520. 0.207 0.112 1.84 6.53e-2 1.97e-1
## 4 ENSG0000000… 237. 0.0379 0.143 0.264 7.92e-1 9.11e-1
## 5 ENSG0000000… 57.9 -0.0882 0.287 -0.307 7.59e-1 8.95e-1
## 6 ENSG0000000… 0.318 -1.38 3.50 -0.394 6.94e-1 NA
## 7 ENSG0000000… 5817. 0.426 0.0883 4.83 1.38e-6 1.82e-5
## 8 ENSG0000000… 1282. -0.241 0.0887 -2.72 6.58e-3 3.28e-2
## 9 ENSG0000000… 610. -0.0476 0.167 -0.286 7.75e-1 9.03e-1
## 10 ENSG0000000… 369. -0.500 0.121 -4.14 3.48e-5 3.42e-4
## # … with 64,092 more rows
::goana()
Requires ENTREZ identifiers
library(org.Hs.eg.db)
tbl <- tbl %>%
mutate(
ENTREZID = mapIds(
org.Hs.eg.db, ENSEMBL, "ENTREZID", "ENSEMBL"
) %>% unname()
)
## 'select()' returned 1:many mapping between keys and columns
tbl
## # A tibble: 64,102 x 8
## ENSEMBL baseMean log2FoldChange lfcSE stat pvalue padj
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ENSG00… 709. -0.381 0.101 -3.79 1.52e-4 1.28e-3
## 2 ENSG00… 0 NA NA NA NA NA
## 3 ENSG00… 520. 0.207 0.112 1.84 6.53e-2 1.97e-1
## 4 ENSG00… 237. 0.0379 0.143 0.264 7.92e-1 9.11e-1
## 5 ENSG00… 57.9 -0.0882 0.287 -0.307 7.59e-1 8.95e-1
## 6 ENSG00… 0.318 -1.38 3.50 -0.394 6.94e-1 NA
## 7 ENSG00… 5817. 0.426 0.0883 4.83 1.38e-6 1.82e-5
## 8 ENSG00… 1282. -0.241 0.0887 -2.72 6.58e-3 3.28e-2
## 9 ENSG00… 610. -0.0476 0.167 -0.286 7.75e-1 9.03e-1
## 10 ENSG00… 369. -0.500 0.121 -4.14 3.48e-5 3.42e-4
## # … with 64,092 more rows, and 1 more variable: ENTREZID <chr>
Universe – must be testable for DE
tbl <- tbl %>%
filter(!is.na(padj), !is.na(ENTREZID))
tbl
## # A tibble: 14,550 x 8
## ENSEMBL baseMean log2FoldChange lfcSE stat pvalue padj ENTREZID
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 ENSG0000… 709. -0.381 0.101 -3.79 1.52e-4 1.28e-3 7105
## 2 ENSG0000… 520. 0.207 0.112 1.84 6.53e-2 1.97e-1 8813
## 3 ENSG0000… 237. 0.0379 0.143 0.264 7.92e-1 9.11e-1 57147
## 4 ENSG0000… 57.9 -0.0882 0.287 -0.307 7.59e-1 8.95e-1 55732
## 5 ENSG0000… 5817. 0.426 0.0883 4.83 1.38e-6 1.82e-5 3075
## 6 ENSG0000… 1282. -0.241 0.0887 -2.72 6.58e-3 3.28e-2 2519
## 7 ENSG0000… 610. -0.0476 0.167 -0.286 7.75e-1 9.03e-1 2729
## 8 ENSG0000… 369. -0.500 0.121 -4.14 3.48e-5 3.42e-4 4800
## 9 ENSG0000… 183. -0.124 0.180 -0.689 4.91e-1 7.24e-1 90529
## 10 ENSG0000… 2814. -0.0411 0.103 -0.400 6.89e-1 8.57e-1 57185
## # … with 14,540 more rows
limma::goana()
– Hypergeometric
library(limma)
go <-
goana(tbl$ENTREZID[tbl$padj < .05], tbl$ENTREZID, "Hs") %>%
as_tibble()
Hmm, goana()
returns GO terms, but we also need GO identifiers
library(GO.db)
go <-
go %>%
mutate(
GOID = mapIds(
GO.db, .$Term, "GOID", "TERM"
) %>% unname()
) %>%
dplyr::select(GOID, everything()) %>%
arrange(P.DE)
## 'select()' returned 1:1 mapping between keys and columns
Sanity check
go %>% filter(grepl("glucocorticoid", Term))
## # A tibble: 22 x 6
## GOID Term Ont N DE P.DE
## <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 GO:0051… response to glucocorticoid BP 92 43 1.11e-5
## 2 GO:0071… cellular response to glucocorticoid… BP 42 21 6.42e-4
## 3 GO:2000… positive regulation of glucocortico… BP 2 2 6.64e-2
## 4 GO:2000… regulation of glucocorticoid recept… BP 8 4 1.25e-1
## 5 GO:0006… glucocorticoid biosynthetic process BP 8 4 1.25e-1
## 6 GO:0043… glucocorticoid mediated signaling p… BP 3 2 1.65e-1
## 7 GO:0008… glucocorticoid metabolic process BP 13 5 2.26e-1
## 8 GO:0004… glucocorticoid receptor activity MF 1 1 2.58e-1
## 9 GO:0031… negative regulation of glucocortico… BP 4 2 2.75e-1
## 10 GO:0031… negative regulation of glucocortico… BP 4 2 2.75e-1
## # … with 12 more rows
What genes in set?
genesets <-
AnnotationDbi::select(org.Hs.eg.db, tbl$ENTREZID, "GO", "ENTREZID") %>%
as_tibble() %>%
dplyr::select(ENTREZID, GO, ONTOLOGY) %>%
distinct()
## 'select()' returned many:many mapping between keys and columns
genesets
## # A tibble: 191,100 x 3
## ENTREZID GO ONTOLOGY
## <chr> <chr> <chr>
## 1 7105 GO:0005515 MF
## 2 7105 GO:0039532 BP
## 3 7105 GO:0043123 BP
## 4 7105 GO:0070062 CC
## 5 7105 GO:1901223 BP
## 6 8813 GO:0004169 MF
## 7 8813 GO:0004582 MF
## 8 8813 GO:0005515 MF
## 9 8813 GO:0005634 CC
## 10 8813 GO:0005783 CC
## # … with 191,090 more rows
sessionInfo()
## R version 3.6.0 Patched (2019-04-26 r76431)
## Platform: x86_64-apple-darwin17.7.0 (64-bit)
## Running under: macOS High Sierra 10.13.6
##
## Matrix products: default
## BLAS: /Users/ma38727/bin/R-3-6-branch/lib/libRblas.dylib
## LAPACK: /Users/ma38727/bin/R-3-6-branch/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats4 parallel stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] GO.db_3.8.2 tibble_2.1.3
## [3] dplyr_0.8.2 limma_3.41.6
## [5] org.Hs.eg.db_3.8.2 AnnotationDbi_1.47.0
## [7] airway_1.5.0 DESeq2_1.25.4
## [9] EnrichmentBrowser_2.15.4 graph_1.63.0
## [11] SummarizedExperiment_1.15.5 DelayedArray_0.11.2
## [13] BiocParallel_1.19.0 matrixStats_0.54.0
## [15] Biobase_2.45.0 GenomicRanges_1.37.14
## [17] GenomeInfoDb_1.21.1 IRanges_2.19.10
## [19] S4Vectors_0.23.17 BiocGenerics_0.31.4
## [21] BiocStyle_2.13.2
##
## loaded via a namespace (and not attached):
## [1] bitops_1.0-6 bit64_0.9-7 RColorBrewer_1.1-2
## [4] tools_3.6.0 backports_1.1.4 utf8_1.1.4
## [7] R6_2.4.0 rpart_4.1-15 Hmisc_4.2-0
## [10] DBI_1.0.0 lazyeval_0.2.2 colorspace_1.4-1
## [13] nnet_7.3-12 tidyselect_0.2.5 gridExtra_2.3
## [16] bit_1.1-14 compiler_3.6.0 cli_1.1.0
## [19] htmlTable_1.13.1 bookdown_0.11 KEGGgraph_1.45.0
## [22] scales_1.0.0 checkmate_1.9.3 genefilter_1.67.1
## [25] rappdirs_0.3.1 stringr_1.4.0 digest_0.6.19
## [28] foreign_0.8-71 rmarkdown_1.13 XVector_0.25.0
## [31] base64enc_0.1-3 pkgconfig_2.0.2 htmltools_0.3.6
## [34] htmlwidgets_1.3 rlang_0.4.0 rstudioapi_0.10
## [37] RSQLite_2.1.1 acepack_1.4.1 RCurl_1.95-4.12
## [40] magrittr_1.5 GenomeInfoDbData_1.2.1 Formula_1.2-3
## [43] Matrix_1.2-17 fansi_0.4.0 Rcpp_1.0.1
## [46] munsell_0.5.0 stringi_1.4.3 yaml_2.2.0
## [49] zlibbioc_1.31.0 grid_3.6.0 blob_1.1.1
## [52] crayon_1.3.4 lattice_0.20-38 splines_3.6.0
## [55] annotate_1.63.0 locfit_1.5-9.1 zeallot_0.1.0
## [58] knitr_1.23 pillar_1.4.2 codetools_0.2-16
## [61] geneplotter_1.63.0 XML_3.98-1.20 glue_1.3.1
## [64] evaluate_0.14 latticeExtra_0.6-28 data.table_1.12.2
## [67] BiocManager_1.30.5.1 vctrs_0.1.0 gtable_0.3.0
## [70] purrr_0.3.2 assertthat_0.2.1 ggplot2_3.2.0
## [73] xfun_0.8 xtable_1.8-4 survival_2.44-1.1
## [76] memoise_1.1.0 cluster_2.1.0 GSEABase_1.47.0