Gene Set Enrichment Analysis (GSEA) with clusterProfiler
#
require(tidyverse)
Let’s say we have found a set of interesting genes.
geneset_file = file.path(here::here(), "data", "raw", "geneset.txt")
genes_oi = readLines(geneset_file)
genes_oi %>% length() %>% print()
## [1] 166
genes_oi %>% head()
## [1] "ADAM10" "ADAM17" "APAF1" "APBB1" "APH1A" "APOE"
Is is enriched in some biological processes?
require(clusterProfiler)
require(org.Hs.eg.db)
result = enrichGO(genes_oi, ont="BP", keyType="SYMBOL", OrgDb=org.Hs.eg.db)
result %>% as.data.frame() %>% head()
## ID Description
## GO:0006119 GO:0006119 oxidative phosphorylation
## GO:0042775 GO:0042775 mitochondrial ATP synthesis coupled electron transport
## GO:0042773 GO:0042773 ATP synthesis coupled electron transport
## GO:0046034 GO:0046034 ATP metabolic process
## GO:0022904 GO:0022904 respiratory electron transport chain
## GO:0022900 GO:0022900 electron transport chain
## GeneRatio BgRatio pvalue p.adjust qvalue
## GO:0006119 79/157 148/18862 2.595331e-132 7.342192e-129 4.909274e-129
## GO:0042775 62/157 99/18862 1.842964e-108 2.606873e-105 1.743056e-105
## GO:0042773 62/157 100/18862 4.825741e-108 3.501476e-105 2.341222e-105
## GO:0046034 83/157 313/18862 4.950831e-108 3.501476e-105 2.341222e-105
## GO:0022904 63/157 117/18862 3.453413e-104 1.953941e-101 1.306481e-101
## GO:0022900 68/157 178/18862 1.378807e-99 6.501077e-97 4.346872e-97
## geneID
## GO:0006119 ATP5F1A/ATP5F1B/ATP5F1C/ATP5F1D/ATP5F1E/ATP5MC1/ATP5MC2/ATP5MC3/ATP5PB/ATP5PD/ATP5PF/ATP5PO/BID/COX4I1/COX4I2/COX5A/COX5B/COX6A1/COX6A2/COX6B1/COX6B2/COX6C/COX7A1/COX7A2/COX7A2L/COX7B/COX7B2/COX7C/COX8A/COX8C/CYC1/CYCS/NDUFA1/NDUFA10/NDUFA2/NDUFA3/NDUFA4/NDUFA5/NDUFA6/NDUFA7/NDUFA8/NDUFA9/NDUFAB1/NDUFB1/NDUFB10/NDUFB2/NDUFB3/NDUFB4/NDUFB5/NDUFB6/NDUFB7/NDUFB8/NDUFB9/NDUFC1/NDUFC2/NDUFS1/NDUFS2/NDUFS3/NDUFS4/NDUFS5/NDUFS6/NDUFS7/NDUFS8/NDUFV1/NDUFV2/NDUFV3/SDHA/SDHC/SDHD/SNCA/UQCR10/UQCR11/UQCRB/UQCRC1/UQCRC2/UQCRFS1/UQCRH/UQCRHL/UQCRQ
## GO:0042775 BID/COX4I1/COX4I2/COX5A/COX5B/COX6A1/COX6A2/COX6B1/COX6C/COX7A2L/COX7B/COX7C/COX8A/CYC1/CYCS/NDUFA1/NDUFA10/NDUFA2/NDUFA3/NDUFA4/NDUFA5/NDUFA6/NDUFA7/NDUFA8/NDUFA9/NDUFAB1/NDUFB1/NDUFB10/NDUFB2/NDUFB3/NDUFB4/NDUFB5/NDUFB6/NDUFB7/NDUFB8/NDUFB9/NDUFC1/NDUFC2/NDUFS1/NDUFS2/NDUFS3/NDUFS4/NDUFS5/NDUFS6/NDUFS7/NDUFS8/NDUFV1/NDUFV2/NDUFV3/SDHA/SDHC/SDHD/SNCA/UQCR10/UQCR11/UQCRB/UQCRC1/UQCRC2/UQCRFS1/UQCRH/UQCRHL/UQCRQ
## GO:0042773 BID/COX4I1/COX4I2/COX5A/COX5B/COX6A1/COX6A2/COX6B1/COX6C/COX7A2L/COX7B/COX7C/COX8A/CYC1/CYCS/NDUFA1/NDUFA10/NDUFA2/NDUFA3/NDUFA4/NDUFA5/NDUFA6/NDUFA7/NDUFA8/NDUFA9/NDUFAB1/NDUFB1/NDUFB10/NDUFB2/NDUFB3/NDUFB4/NDUFB5/NDUFB6/NDUFB7/NDUFB8/NDUFB9/NDUFC1/NDUFC2/NDUFS1/NDUFS2/NDUFS3/NDUFS4/NDUFS5/NDUFS6/NDUFS7/NDUFS8/NDUFV1/NDUFV2/NDUFV3/SDHA/SDHC/SDHD/SNCA/UQCR10/UQCR11/UQCRB/UQCRC1/UQCRC2/UQCRFS1/UQCRH/UQCRHL/UQCRQ
## GO:0046034 APP/ATP5F1A/ATP5F1B/ATP5F1C/ATP5F1D/ATP5F1E/ATP5MC1/ATP5MC2/ATP5MC3/ATP5PB/ATP5PD/ATP5PF/ATP5PO/BAD/BID/COX4I1/COX4I2/COX5A/COX5B/COX6A1/COX6A2/COX6B1/COX6B2/COX6C/COX7A1/COX7A2/COX7A2L/COX7B/COX7B2/COX7C/COX8A/COX8C/CYC1/CYCS/GAPDH/NDUFA1/NDUFA10/NDUFA2/NDUFA3/NDUFA4/NDUFA5/NDUFA6/NDUFA7/NDUFA8/NDUFA9/NDUFAB1/NDUFB1/NDUFB10/NDUFB2/NDUFB3/NDUFB4/NDUFB5/NDUFB6/NDUFB7/NDUFB8/NDUFB9/NDUFC1/NDUFC2/NDUFS1/NDUFS2/NDUFS3/NDUFS4/NDUFS5/NDUFS6/NDUFS7/NDUFS8/NDUFV1/NDUFV2/NDUFV3/PSEN1/SDHA/SDHC/SDHD/SNCA/UQCR10/UQCR11/UQCRB/UQCRC1/UQCRC2/UQCRFS1/UQCRH/UQCRHL/UQCRQ
## GO:0022904 BID/COX4I1/COX4I2/COX5A/COX5B/COX6A1/COX6A2/COX6B1/COX6C/COX7A2L/COX7B/COX7C/COX8A/CYC1/CYCS/NDUFA1/NDUFA10/NDUFA2/NDUFA3/NDUFA4/NDUFA5/NDUFA6/NDUFA7/NDUFA8/NDUFA9/NDUFAB1/NDUFB1/NDUFB10/NDUFB2/NDUFB3/NDUFB4/NDUFB5/NDUFB6/NDUFB7/NDUFB8/NDUFB9/NDUFC1/NDUFC2/NDUFS1/NDUFS2/NDUFS3/NDUFS4/NDUFS5/NDUFS6/NDUFS7/NDUFS8/NDUFV1/NDUFV2/NDUFV3/SDHA/SDHB/SDHC/SDHD/SNCA/UQCR10/UQCR11/UQCRB/UQCRC1/UQCRC2/UQCRFS1/UQCRH/UQCRHL/UQCRQ
## GO:0022900 BID/COX4I1/COX4I2/COX5A/COX5B/COX6A1/COX6A2/COX6B1/COX6C/COX7A1/COX7A2/COX7A2L/COX7B/COX7B2/COX7C/COX8A/COX8C/CYC1/CYCS/NDUFA1/NDUFA10/NDUFA2/NDUFA3/NDUFA4/NDUFA4L2/NDUFA5/NDUFA6/NDUFA7/NDUFA8/NDUFA9/NDUFAB1/NDUFB1/NDUFB10/NDUFB2/NDUFB3/NDUFB4/NDUFB5/NDUFB6/NDUFB7/NDUFB8/NDUFB9/NDUFC1/NDUFC2/NDUFS1/NDUFS2/NDUFS3/NDUFS4/NDUFS5/NDUFS6/NDUFS7/NDUFS8/NDUFV1/NDUFV2/NDUFV3/SDHA/SDHB/SDHC/SDHD/SNCA/UQCR10/UQCR11/UQCRB/UQCRC1/UQCRC2/UQCRFS1/UQCRH/UQCRHL/UQCRQ
## Count
## GO:0006119 79
## GO:0042775 62
## GO:0042773 62
## GO:0046034 83
## GO:0022904 63
## GO:0022900 68
Then, we can quickly get a glimpse of the main results.
result %>% dotplot()
result %>% cnetplot()
Now, we would like to see the overlap between the genes in the top 5 enriched categories.
require(ComplexHeatmap)
# extract lists of genes in top enriched terms
terms_oi = result %>% as.data.frame() %>% filter(p.adjust<0.05) %>% slice_max(Count, n=10)
terms_oi =
terms_oi %>%
group_by(Description) %>%
summarize(genes=strsplit(geneID, split="/")) %>%
ungroup() %>%
deframe()
# prepare inputs
terms_oi = terms_oi %>%
list_to_matrix() %>%
make_comb_mat()
# plot
plt = terms_oi %>% UpSet()
plt
Phylogenetic trees with ggtree
#
Trees are another very common plot to visualize hierarchical patterns.
Here, we will use
ggtree
to visualize the phylogenetic relationships extracted from a multiple
sequence alignment of TP53 across mammals.
Basic tree#
require(ggtree)
set.seed(100)
tree <- rtree(50)
ggtree(tree, layout="circular") + geom_tiplab()
Tree and multiple sequence alignment of TP53 amino acids#
We will follow Russell J. Gray’s approach.
require(seqinr)
require(ape)
# create tree from alignment
fasta_file = file.path(here::here(), "data", "raw", "TP53-mammals-alignment-aa.fa")
aln = read.alignment(fasta_file, format = "fasta", whole.header=TRUE)
D = dist.alignment(aln)
tree = njs(D)
# plot tree with MSA
tree_plot = ggtree(tree)
msaplot(tree_plot, fasta = fasta_file) + ggtitle("tree with MSA")
Session Info#
sessionInfo()
## R version 4.3.1 (2023-06-16)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.6 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/openblas/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.2.20.so; LAPACK version 3.7.1
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=es_ES.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=es_ES.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=es_ES.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=es_ES.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Europe/Madrid
## tzcode source: system (glibc)
##
## attached base packages:
## [1] grid stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] ape_5.7-1 seqinr_4.2-36 ggtree_3.0.4
## [4] ComplexHeatmap_2.9.3 org.Hs.eg.db_3.13.0 AnnotationDbi_1.54.1
## [7] IRanges_2.30.1 S4Vectors_0.40.1 Biobase_2.52.0
## [10] BiocGenerics_0.48.1 clusterProfiler_4.0.5 lubridate_1.9.2
## [13] forcats_1.0.0 stringr_1.5.1 dplyr_1.1.1
## [16] purrr_0.3.4 readr_2.1.5 tidyr_1.2.1
## [19] tibble_3.2.1 ggplot2_3.5.0 tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] RColorBrewer_1.1-3 shape_1.4.6.1 jsonlite_1.8.0
## [4] magrittr_2.0.3 magick_2.8.3 farver_2.1.1
## [7] rmarkdown_2.26 GlobalOptions_0.1.2 fs_1.6.2
## [10] zlibbioc_1.38.0 vctrs_0.6.1 memoise_2.0.1
## [13] Cairo_1.6-1 RCurl_1.98-1.14 htmltools_0.5.7
## [16] gridGraphics_0.5-1 plyr_1.8.9 cachem_1.0.8
## [19] igraph_1.3.1 lifecycle_1.0.4 iterators_1.0.14
## [22] pkgconfig_2.0.3 Matrix_1.6-5 R6_2.5.1
## [25] fastmap_1.1.1 clue_0.3-65 GenomeInfoDbData_1.2.6
## [28] digest_0.6.29 aplot_0.2.2 enrichplot_1.12.2
## [31] colorspace_2.0-3 patchwork_1.2.0 rprojroot_2.0.4
## [34] RSQLite_2.3.5 labeling_0.4.3 fansi_1.0.3
## [37] timechange_0.3.0 httr_1.4.7 polyclip_1.10-6
## [40] compiler_4.3.1 here_1.0.1 bit64_4.0.5
## [43] withr_3.0.0 doParallel_1.0.17 downloader_0.4
## [46] BiocParallel_1.26.2 viridis_0.6.5 DBI_1.2.2
## [49] highr_0.10 ggforce_0.4.2 MASS_7.3-60
## [52] rjson_0.2.21 tools_4.3.1 DO.db_2.9
## [55] scatterpie_0.2.1 glue_1.6.2 nlme_3.1-164
## [58] GOSemSim_2.18.1 shadowtext_0.1.3 ade4_1.7-22
## [61] cluster_2.1.4 reshape2_1.4.4 fgsea_1.18.0
## [64] generics_0.1.3 gtable_0.3.4 tzdb_0.3.0
## [67] data.table_1.15.2 hms_1.1.3 tidygraph_1.2.0
## [70] utf8_1.2.4 XVector_0.32.0 ggrepel_0.9.5
## [73] foreach_1.5.2 pillar_1.9.0 yulab.utils_0.1.4
## [76] circlize_0.4.16 splines_4.3.1 tweenr_2.0.3
## [79] treeio_1.16.2 lattice_0.21-8 bit_4.0.5
## [82] tidyselect_1.2.1 GO.db_3.13.0 Biostrings_2.64.1
## [85] knitr_1.45 gridExtra_2.3 xfun_0.42
## [88] graphlayouts_1.1.1 matrixStats_1.0.0 stringi_1.7.8
## [91] lazyeval_0.2.2 ggfun_0.1.4 yaml_2.3.8
## [94] evaluate_0.23 codetools_0.2-19 ggraph_2.2.1
## [97] qvalue_2.24.0 ggplotify_0.1.2 cli_3.4.0
## [100] munsell_0.5.0 Rcpp_1.0.8.3 GenomeInfoDb_1.28.4
## [103] png_0.1-8 parallel_4.3.1 blob_1.2.4
## [106] DOSE_3.18.2 bitops_1.0-7 viridisLite_0.4.2
## [109] tidytree_0.4.6 scales_1.3.0 crayon_1.5.2
## [112] GetoptLong_1.0.5 rlang_1.1.0 cowplot_1.1.3
## [115] fastmatch_1.1-4 KEGGREST_1.32.0