Here we will run test enrichment analysis approaches for undderstanding the effect of elevated homocysteine on blood methylomes. Homocysteine is thought to be an inhibitor of methyltransferases and hyperhomocysteine is associated with overall reduced genomic methylation. Previously we analysed the B-PROOF study data which enabled us to identify probes whose methylation correlatesor anti-correlated with homocysteine level.
The two enrichment analysis approaches we will use are:
gsameth
: an ORA approach that addressed the biases inherent with infinium array analysis.
gmea
: an experimental application of functional class scoring for infinium analysis.
suppressPackageStartupMessages({
library("missMethyl")
library("kableExtra")
library("mitch")
library("eulerr")
library("IlluminaHumanMethylation450kmanifest")
library("IlluminaHumanMethylation450kanno.ilmn12.hg19")
library("tictoc")
})
Load the data.
dm <- read.table("dma3a.tsv")
rownames(dm) <- dm$Row.names
dm$Row.names=NULL
head(dm, 10) %>% kbl() %>% kable_paper("hover", full_width = F)
UCSC_RefGene_Name | Regulatory_Feature_Group | logFC | AveExpr | t | P.Value | adj.P.Val | B | |
---|---|---|---|---|---|---|---|---|
cg04905210 | POLR3D | -0.2893816 | -2.8148883 | -5.377810 | 2.0e-07 | 0.1017096 | 5.889905 | |
cg09338148 | Unclassified | -0.2919282 | -1.1720330 | -5.116352 | 8.0e-07 | 0.1729449 | 4.884437 | |
cg04247967 | -0.2057500 | 3.1797718 | -4.953286 | 1.7e-06 | 0.2374161 | 4.275061 | ||
cg06458106 | -0.1900758 | 1.6656963 | -4.763516 | 4.0e-06 | 0.2374161 | 3.583754 | ||
cg26425904 | OCA2 | Unclassified | -0.2527032 | -4.0235228 | -4.731955 | 4.6e-06 | 0.2374161 | 3.470695 |
cg10746622 | PRSSL1 | -0.1999261 | 1.5539691 | -4.707850 | 5.1e-06 | 0.2374161 | 3.384715 | |
cg19590707 | SLC16A3;SLC16A3;SLC16A3 | Promoter_Associated | -0.2479821 | -1.2545892 | -4.704517 | 5.2e-06 | 0.2374161 | 3.372853 |
cg09762242 | SIPA1;SIPA1 | Promoter_Associated | -0.2102410 | -2.5125329 | -4.704466 | 5.2e-06 | 0.2374161 | 3.372673 |
cg03622371 | P2RY6;P2RY6;P2RY6;P2RY6 | Unclassified | -0.2403843 | -1.6741058 | -4.679125 | 5.8e-06 | 0.2374161 | 3.282683 |
cg11257888 | RCAN3 | 0.2790028 | 0.4787858 | 4.674646 | 5.9e-06 | 0.2374161 | 3.266814 |
Reactome pathways were downloaded on the 14th Sept 2023 from MsigDB.
gs_entrez <- gmt_import("c2.cp.reactome.v2023.1.Hs.entrez.gmt")
gs_symbols <- gmt_import("c2.cp.reactome.v2023.1.Hs.symbols.gmt")
Now perform ORA analysis using gsameth with the top 500 probes.
sigup <- rownames(head(subset(dm,logFC>0),1000))
gsaup <- gsameth(sig.cpg=head(sigup,500), all.cpg=rownames(dm), collection=gs_entrez)
## All input CpGs are used for testing.
gsaup <- gsaup[order(gsaup$P.DE),]
head(gsaup) %>% kbl(caption="Top hypermethylated pathways") %>% kable_paper("hover", full_width = F)
N | DE | P.DE | FDR | |
---|---|---|---|---|
REACTOME_INTERLEUKIN_21_SIGNALING | 9 | 4 | 0.0000418 | 0.0445810 |
REACTOME_INTERLEUKIN_20_FAMILY_SIGNALING | 26 | 5 | 0.0000730 | 0.0445810 |
REACTOME_INTERLEUKIN_6_SIGNALING | 11 | 4 | 0.0000809 | 0.0445810 |
REACTOME_INTERLEUKIN_2_FAMILY_SIGNALING | 39 | 6 | 0.0001994 | 0.0824657 |
REACTOME_TNF_RECEPTOR_SUPERFAMILY_TNFSF_MEMBERS_MEDIATING_NON_CANONICAL_NF_KB_PATHWAY | 16 | 4 | 0.0002769 | 0.0916139 |
REACTOME_INTERFERON_ALPHA_BETA_SIGNALING | 64 | 7 | 0.0003500 | 0.0964951 |
sigdn <- rownames(head(subset(dm,logFC<0),1000))
gsadn <- gsameth(sig.cpg=head(sigdn,500), all.cpg=rownames(dm), collection=gs_entrez)
## All input CpGs are used for testing.
gsadn <- gsadn[order(gsadn$P.DE),]
head(gsadn) %>% kbl(caption="Top hypomethylated pathways") %>% kable_paper("hover", full_width = F)
N | DE | P.DE | FDR | |
---|---|---|---|---|
REACTOME_SYNTHESIS_OF_BILE_ACIDS_AND_BILE_SALTS_VIA_27_HYDROXYCHOLESTEROL | 15 | 3 | 0.0016497 | 1 |
REACTOME_THROMBIN_SIGNALLING_THROUGH_PROTEINASE_ACTIVATED_RECEPTORS_PARS | 32 | 4 | 0.0057066 | 1 |
REACTOME_SYNTHESIS_OF_BILE_ACIDS_AND_BILE_SALTS_VIA_7ALPHA_HYDROXYCHOLESTEROL | 24 | 3 | 0.0058080 | 1 |
REACTOME_BLOOD_GROUP_SYSTEMS_BIOSYNTHESIS | 21 | 3 | 0.0058718 | 1 |
REACTOME_TRANSPORT_OF_NUCLEOTIDE_SUGARS | 8 | 2 | 0.0081909 | 1 |
REACTOME_PASSIVE_TRANSPORT_BY_AQUAPORINS | 13 | 2 | 0.0103872 | 1 |
Now with GMEA.
Start with gathering the annotation set.
ann450k <- getAnnotation(IlluminaHumanMethylation450kanno.ilmn12.hg19)
myann <- data.frame(ann450k[,c("UCSC_RefGene_Name","Regulatory_Feature_Group")])
promoters <- grep("Prom",myann$Regulatory_Feature_Group)
Histogram of limma t-statistics.
dm <- dm[,c("UCSC_RefGene_Name","t")]
hist(dm$t,breaks=seq(from=-6,to=6,by=1))
Aggregate probe t-scores to gene level scores.
The object generated has two parts:
df
: a simple table of the gene name and the median tvalue
gme_res_df
: results of 1-sample Wilcox test.
CORES= detectCores()
agg <- function(dm) {
gn <- unique(unlist(strsplit( dm$UCSC_RefGene_Name ,";")))
gnl <- strsplit( dm$UCSC_RefGene_Name ,";")
gnl <- mclapply(gnl,unique,mc.cores=CORES)
dm$UCSC_RefGene_Name <- gnl
l <- mclapply(1:nrow(dm), function(i) {
a <- dm[i,]
len <- length(a[[1]][[1]])
tvals <- as.numeric(rep(a[2],len))
genes <- a[[1]][[1]]
data.frame(genes,tvals)
},mc.cores=CORES)
df <- do.call(rbind,l)
gme_res <- mclapply( 1:length(gn), function(i) {
g <- gn[i]
tstats <- df[which(df$genes==g),"tvals"]
myn <- length(tstats)
mymean <- mean(tstats)
mymedian <- median(tstats)
wtselfcont <- wilcox.test(tstats)
res <- c("gene"=g,"nprobes"=myn,"mean"=mymean,"median"=mymedian,
"p-value(sc)"=wtselfcont$p.value)
} , mc.cores=CORES )
gme_res_df <- do.call(rbind, gme_res)
rownames(gme_res_df) <- gme_res_df[,1]
gme_res_df <- gme_res_df[,-1]
tmp <- apply(gme_res_df,2,as.numeric)
rownames(tmp) <- rownames(gme_res_df)
gme_res_df <- as.data.frame(tmp)
gme_res_df$sig <- -log10(gme_res_df[,4])
gme_res_df <- gme_res_df[order(-gme_res_df$sig),]
gme_res_df$`fdr(sc)` <- p.adjust(gme_res_df$`p-value(sc)`)
out <- list("df"=df,"gme_res_df"=gme_res_df)
return(out)
}
tic()
dmagg <- agg(dm)
toc()
## 65.778 sec elapsed
head(dmagg$gme_res_df)
## nprobes mean median p-value(sc) sig fdr(sc)
## TNXB 531 0.3476999 0.4126704 4.781602e-15 14.32043 9.608152e-11
## PCDHA1 162 -0.7059103 -0.6207302 3.718622e-14 13.42962 7.471827e-10
## NNAT 49 -0.9377326 -0.9433135 6.750156e-14 13.17069 1.356241e-09
## PCDHA2 149 -0.7171256 -0.6259052 3.605827e-13 12.44300 7.244468e-09
## PCDHA3 141 -0.7168381 -0.6259052 2.805782e-12 11.55195 5.636815e-08
## KCNQ1DN 39 -1.6152306 -1.7110279 3.637979e-12 11.43914 7.308336e-08
Does 1-sample t-test work better?
agg2 <- function(dm) {
gn <- unique(unlist(strsplit( dm$UCSC_RefGene_Name ,";")))
gnl <- strsplit( dm$UCSC_RefGene_Name ,";")
gnl <- mclapply(gnl,unique,mc.cores=CORES)
dm$UCSC_RefGene_Name <- gnl
l <- mclapply(1:nrow(dm), function(i) {
a <- dm[i,]
len <- length(a[[1]][[1]])
tvals <- as.numeric(rep(a["t"],len))
genes <- a[[1]][[1]]
data.frame(genes,tvals)
},mc.cores=CORES)
df <- do.call(rbind,l)
keep <- names(which(table(df$genes)>1))
df <- df[df$genes %in% keep,]
gn <- unique(df$genes)
gme_res <- lapply( 1:length(gn), function(i) {
g <- gn[i]
tstats <- df[which(df$genes==g),"tvals"]
myn <- length(tstats)
mymean <- mean(tstats)
mymedian <- median(tstats)
tselfcont <- t.test(tstats)
res <- c("gene"=g,"nprobes"=myn,"mean"=mymean,"median"=mymedian,
"p-value(sc)"=tselfcont$p.value)
} )
gme_res_df <- do.call(rbind, gme_res)
rownames(gme_res_df) <- gme_res_df[,1]
gme_res_df <- gme_res_df[,-1]
tmp <- apply(gme_res_df,2,as.numeric)
rownames(tmp) <- rownames(gme_res_df)
gme_res_df <- as.data.frame(tmp)
gme_res_df$sig <- -log10(gme_res_df[,4])
gme_res_df <- gme_res_df[order(-gme_res_df$sig),]
gme_res_df$`fdr(sc)` <- p.adjust(gme_res_df$`p-value(sc)`)
out <- list("df"=df,"gme_res_df"=gme_res_df)
return(out)
}
tic()
dmagg2 <- agg2(dm)
toc()
## 105.684 sec elapsed
head(dmagg2$gme_res_df)
## nprobes mean median p-value(sc) sig fdr(sc)
## KCNQ1DN 39 -1.6152306 -1.7110279 2.811916e-17 16.55100 5.508262e-13
## NNAT 49 -0.9377326 -0.9433135 8.119861e-17 16.09045 1.590518e-12
## PCDHA1 162 -0.7059103 -0.6207302 2.891024e-16 15.53895 5.662649e-12
## TNXB 531 0.3476999 0.4126704 1.989790e-15 14.70119 3.897202e-11
## MESTIT1 59 -0.7540952 -0.8024776 2.553284e-15 14.59290 5.000607e-11
## PCDHA2 149 -0.7171256 -0.6259052 5.353803e-15 14.27134 1.048489e-10
For reproducibility.
sessionInfo()
## R version 4.3.1 (2023-06-16)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.3 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Australia/Melbourne
## tzcode source: system (glibc)
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] mitch_1.12.0
## [2] tictoc_1.2
## [3] IlluminaHumanMethylation450kmanifest_0.4.0
## [4] eulerr_7.0.0
## [5] kableExtra_1.3.4
## [6] missMethyl_1.34.0
## [7] IlluminaHumanMethylationEPICanno.ilm10b4.hg19_0.6.0
## [8] IlluminaHumanMethylation450kanno.ilmn12.hg19_0.6.1
## [9] minfi_1.46.0
## [10] bumphunter_1.42.0
## [11] locfit_1.5-9.8
## [12] iterators_1.0.14
## [13] foreach_1.5.2
## [14] Biostrings_2.68.1
## [15] XVector_0.40.0
## [16] SummarizedExperiment_1.30.2
## [17] Biobase_2.60.0
## [18] MatrixGenerics_1.12.3
## [19] matrixStats_1.0.0
## [20] GenomicRanges_1.52.0
## [21] GenomeInfoDb_1.36.3
## [22] IRanges_2.34.1
## [23] S4Vectors_0.38.1
## [24] BiocGenerics_0.46.0
##
## loaded via a namespace (and not attached):
## [1] splines_4.3.1 later_1.3.1
## [3] BiocIO_1.10.0 bitops_1.0-7
## [5] filelock_1.0.2 BiasedUrn_2.0.11
## [7] tibble_3.2.1 preprocessCore_1.62.1
## [9] XML_3.99-0.14 lifecycle_1.0.3
## [11] lattice_0.21-8 MASS_7.3-60
## [13] base64_2.0.1 scrime_1.3.5
## [15] magrittr_2.0.3 sass_0.4.7
## [17] limma_3.56.2 rmarkdown_2.24
## [19] jquerylib_0.1.4 yaml_2.3.7
## [21] httpuv_1.6.11 doRNG_1.8.6
## [23] askpass_1.2.0 DBI_1.1.3
## [25] RColorBrewer_1.1-3 abind_1.4-5
## [27] zlibbioc_1.46.0 rvest_1.0.3
## [29] quadprog_1.5-8 purrr_1.0.2
## [31] RCurl_1.98-1.12 rappdirs_0.3.3
## [33] GenomeInfoDbData_1.2.10 genefilter_1.82.1
## [35] annotate_1.78.0 svglite_2.1.1
## [37] DelayedMatrixStats_1.22.6 codetools_0.2-19
## [39] DelayedArray_0.26.7 xml2_1.3.5
## [41] tidyselect_1.2.0 beanplot_1.3.1
## [43] BiocFileCache_2.8.0 illuminaio_0.42.0
## [45] webshot_0.5.5 jsonlite_1.8.7
## [47] GenomicAlignments_1.36.0 multtest_2.56.0
## [49] ellipsis_0.3.2 survival_3.5-7
## [51] systemfonts_1.0.4 tools_4.3.1
## [53] progress_1.2.2 Rcpp_1.0.11
## [55] glue_1.6.2 gridExtra_2.3
## [57] xfun_0.40 dplyr_1.1.3
## [59] HDF5Array_1.28.1 fastmap_1.1.1
## [61] GGally_2.1.2 rhdf5filters_1.12.1
## [63] fansi_1.0.4 openssl_2.1.0
## [65] caTools_1.18.2 digest_0.6.33
## [67] R6_2.5.1 mime_0.12
## [69] colorspace_2.1-0 gtools_3.9.4
## [71] biomaRt_2.56.1 RSQLite_2.3.1
## [73] utf8_1.2.3 tidyr_1.3.0
## [75] generics_0.1.3 data.table_1.14.8
## [77] rtracklayer_1.60.1 prettyunits_1.1.1
## [79] httr_1.4.7 htmlwidgets_1.6.2
## [81] S4Arrays_1.0.6 pkgconfig_2.0.3
## [83] gtable_0.3.4 blob_1.2.4
## [85] siggenes_1.74.0 htmltools_0.5.6
## [87] echarts4r_0.4.5 scales_1.2.1
## [89] png_0.1-8 knitr_1.44
## [91] rstudioapi_0.15.0 reshape2_1.4.4
## [93] tzdb_0.4.0 rjson_0.2.21
## [95] nlme_3.1-163 curl_5.0.2
## [97] org.Hs.eg.db_3.17.0 cachem_1.0.8
## [99] rhdf5_2.44.0 stringr_1.5.0
## [101] KernSmooth_2.23-22 AnnotationDbi_1.62.2
## [103] restfulr_0.0.15 GEOquery_2.68.0
## [105] pillar_1.9.0 grid_4.3.1
## [107] reshape_0.8.9 vctrs_0.6.3
## [109] gplots_3.1.3 promises_1.2.1
## [111] dbplyr_2.3.3 xtable_1.8-4
## [113] beeswarm_0.4.0 evaluate_0.21
## [115] readr_2.1.4 GenomicFeatures_1.52.2
## [117] cli_3.6.1 compiler_4.3.1
## [119] Rsamtools_2.16.0 rlang_1.1.1
## [121] crayon_1.5.2 rngtools_1.5.2
## [123] nor1mix_1.3-0 mclust_6.0.0
## [125] plyr_1.8.8 stringi_1.7.12
## [127] viridisLite_0.4.2 BiocParallel_1.34.2
## [129] munsell_0.5.0 Matrix_1.6-1
## [131] hms_1.1.3 sparseMatrixStats_1.12.2
## [133] bit64_4.0.5 ggplot2_3.4.3
## [135] Rhdf5lib_1.22.1 KEGGREST_1.40.0
## [137] statmod_1.5.0 shiny_1.7.5
## [139] highr_0.10 memoise_2.0.1
## [141] bslib_0.5.1 bit_4.0.5