Previously we demonstrated the ability of GMEA to detect differentially methylated pathways. There remains a question about how reliable these observations are. To resolve it, we will use the Hcy dataset which consists of 172 samples. We will split the data into two groups and analyse these separately. If the pathway results are similar, then we can be confident about the accuracy.
suppressPackageStartupMessages({
library("mitch")
library("limma")
library("kableExtra")
library("eulerr")
library("IlluminaHumanMethylation450kmanifest")
library("IlluminaHumanMethylation450kanno.ilmn12.hg19")
library("tictoc")
})
Reactome pathways were downloaded on the 14th Sept 2023 from MsigDB.
gs_entrez <- gmt_import("c2.cp.reactome.v2023.1.Hs.entrez.gmt")
gs_symbols <- gmt_import("c2.cp.reactome.v2023.1.Hs.symbols.gmt")
ann450k <- getAnnotation(IlluminaHumanMethylation450kanno.ilmn12.hg19)
myann <- data.frame(ann450k[,c("UCSC_RefGene_Name","Regulatory_Feature_Group")])
promoters <- grep("Prom",myann$Regulatory_Feature_Group)
agg <- function(dm,cores=1) {
gn <- unique(unlist(strsplit( dm$UCSC_RefGene_Name ,";")))
gnl <- strsplit( dm$UCSC_RefGene_Name ,";")
gnl <- mclapply(gnl,unique,mc.cores=cores)
dm$UCSC_RefGene_Name <- gnl
l <- mclapply(1:nrow(dm), function(i) {
a <- dm[i,]
len <- length(a[[1]][[1]])
tvals <- as.numeric(rep(a["t"],len))
genes <- a[[1]][[1]]
data.frame(genes,tvals)
},mc.cores=cores)
df <- do.call(rbind,l)
keep <- names(which(table(df$genes)>1))
df <- df[df$genes %in% keep,]
gn <- unique(df$genes)
gme_res <- lapply( 1:length(gn), function(i) {
g <- gn[i]
tstats <- df[which(df$genes==g),"tvals"]
myn <- length(tstats)
mymean <- mean(tstats)
mymedian <- median(tstats)
tselfcont <- t.test(tstats)
res <- c("gene"=g,"nprobes"=myn,"mean"=mymean,"median"=mymedian,
"p-value(sc)"=tselfcont$p.value)
} )
gme_res_df <- do.call(rbind, gme_res)
rownames(gme_res_df) <- gme_res_df[,1]
gme_res_df <- gme_res_df[,-1]
tmp <- apply(gme_res_df,2,as.numeric)
rownames(tmp) <- rownames(gme_res_df)
gme_res_df <- as.data.frame(tmp)
gme_res_df$sig <- -log10(gme_res_df[,4])
gme_res_df <- gme_res_df[order(-gme_res_df$sig),]
gme_res_df$`fdr(sc)` <- p.adjust(gme_res_df$`p-value(sc)`)
out <- list("df"=df,"gme_res_df"=gme_res_df)
return(out)
}
ttenrich <- function(m,genesets,cores=1) {
res <- mclapply( 1:length(genesets), function(i) {
gs <- genesets[i]
name <- names(gs)
n_members <- length(which(rownames(m) %in% gs[[1]]))
if ( n_members > 4 ) {
tstats <- m[which(rownames(m) %in% gs[[1]]),]
myn <- length(tstats)
mymean <- mean(tstats)
mymedian <- median(tstats)
wt <- wilcox.test(tstats)
res <- c(name,myn,mymean,mymedian,wt$p.value)
}
} , mc.cores = cores)
res_df <- do.call(rbind, res)
rownames(res_df) <- res_df[,1]
res_df <- res_df[,-1]
colnames(res_df) <- c("n_genes","t_mean","t_median","p.value")
tmp <- apply(res_df,2,as.numeric)
rownames(tmp) <- rownames(res_df)
res_df <- tmp
res_df <- as.data.frame(res_df)
res_df <- res_df[order(res_df$p.value),]
res_df$logp <- -log10(res_df$p.value )
res_df$fdr <- p.adjust(res_df$p.value,method="fdr")
res_df[order(abs(res_df$p.value)),]
return(res_df)
}
Load the data.
design <- as.data.frame(readRDS("dm3_design.Rds"))
dim(design)
## [1] 172 4
mx <- readRDS("dm3_mx.Rds")
dim(mx)
## [1] 422374 172
Split the data into two smaller groups (s and b).
g1 <- which(as.data.frame(design)$groups==1)
g2 <- which(as.data.frame(design)$groups==2)
g3 <- which(as.data.frame(design)$groups==3)
seed=200
set.seed(seed); g1s <- sample(g1,floor(length(g1)/2))
g1b <- setdiff(g1,g1s)
set.seed(seed); g2s <- sample(g2,floor(length(g2)/2))
g2b <- setdiff(g2,g2s)
set.seed(seed); g3s <- sample(g3,floor(length(g3)/2))
g3b <- setdiff(g3,g3s)
gxs <- c(g1s,g2s,g3s)
gxs <- gxs[order(gxs)]
gxb <- c(g1b,g2b,g3b)
gxb <- gxb[order(gxb)]
dess <- design[gxs,]
mxs <- mx[,gxs]
desb <- design[gxb,]
mxb <- mx[,gxb]
Using the S dataset, apply limma to conduct differential methylation analysis, then GMEA. GMEA has two steps:
Aggregate probe t-statistics to genes.
Perform enrichment test using 1 sample t-test.
d <- model.matrix(~dess$age+ dess$sex + dess$groups)
fit.reduced <- lmFit(mxs,d)
fit.reduced <- eBayes(fit.reduced)
dm <- topTable(fit.reduced,coef=4, number = Inf)
dma <- merge(myann,dm,by=0)
dma <- dma[order(dma$P.Value),]
dma <- dma[,c("UCSC_RefGene_Name","t")]
dmagg <- agg(dma,cores=8)
m <- dmagg$gme_res_df
m <- m[,"median",drop=FALSE]
tres <- ttenrich(m=m,genesets=gs_symbols,cores=8)
sig <- subset(tres,fdr<0.05)
ssig <- sig
Now repeat with the b set.
d <- model.matrix(~desb$age+ desb$sex + desb$groups)
fit.reduced <- lmFit(mxb,d)
fit.reduced <- eBayes(fit.reduced)
dm <- topTable(fit.reduced,coef=4, number = Inf)
dma <- merge(myann,dm,by=0)
dma <- dma[order(dma$P.Value),]
dma <- dma[,c("UCSC_RefGene_Name","t")]
dmagg <- agg(dma,cores=8)
m <- dmagg$gme_res_df
m <- m[,"median",drop=FALSE]
tres <- ttenrich(m=m,genesets=gs_symbols,cores=8)
sig <- subset(tres,fdr<0.05)
bsig <- sig
Use Euler diagrams to find the similarity. As medians are normally distributed about the zero, one would expect equal numbers of up and down- methylated gene sets. Therefore the t-test based approach could be considered better.
ssig_up <- rownames(subset(ssig,t_median>0))
ssig_dn <- rownames(subset(ssig,t_median<0))
bsig_up <- rownames(subset(bsig,t_median>0))
bsig_dn <- rownames(subset(bsig,t_median<0))
v1 <- list("s up"=ssig_up, "s dn"=ssig_dn,
"b up"=bsig_up,"b dn"=bsig_dn)
plot(euler(v1),quantities = TRUE)
Make sure it is all working properly.
Repeat for other seed values and aggregate the results.
For reproducibility.
sessionInfo()
## R version 4.3.1 (2023-06-16)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.3 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Australia/Melbourne
## tzcode source: system (glibc)
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] tictoc_1.2
## [2] IlluminaHumanMethylation450kanno.ilmn12.hg19_0.6.1
## [3] IlluminaHumanMethylation450kmanifest_0.4.0
## [4] minfi_1.46.0
## [5] bumphunter_1.42.0
## [6] locfit_1.5-9.8
## [7] iterators_1.0.14
## [8] foreach_1.5.2
## [9] Biostrings_2.68.1
## [10] XVector_0.40.0
## [11] SummarizedExperiment_1.30.2
## [12] Biobase_2.60.0
## [13] MatrixGenerics_1.12.3
## [14] matrixStats_1.0.0
## [15] GenomicRanges_1.52.0
## [16] GenomeInfoDb_1.36.3
## [17] IRanges_2.34.1
## [18] S4Vectors_0.38.1
## [19] BiocGenerics_0.46.0
## [20] eulerr_7.0.0
## [21] kableExtra_1.3.4
## [22] limma_3.56.2
## [23] mitch_1.12.0
##
## loaded via a namespace (and not attached):
## [1] splines_4.3.1 later_1.3.1
## [3] BiocIO_1.10.0 bitops_1.0-7
## [5] filelock_1.0.2 tibble_3.2.1
## [7] polyclip_1.10-4 preprocessCore_1.62.1
## [9] XML_3.99-0.14 lifecycle_1.0.3
## [11] lattice_0.21-8 MASS_7.3-60
## [13] base64_2.0.1 scrime_1.3.5
## [15] magrittr_2.0.3 sass_0.4.7
## [17] rmarkdown_2.24 jquerylib_0.1.4
## [19] yaml_2.3.7 httpuv_1.6.11
## [21] doRNG_1.8.6 askpass_1.2.0
## [23] DBI_1.1.3 RColorBrewer_1.1-3
## [25] abind_1.4-5 zlibbioc_1.46.0
## [27] rvest_1.0.3 quadprog_1.5-8
## [29] purrr_1.0.2 RCurl_1.98-1.12
## [31] rappdirs_0.3.3 GenomeInfoDbData_1.2.10
## [33] genefilter_1.82.1 annotate_1.78.0
## [35] svglite_2.1.1 DelayedMatrixStats_1.22.6
## [37] codetools_0.2-19 DelayedArray_0.26.7
## [39] xml2_1.3.5 tidyselect_1.2.0
## [41] beanplot_1.3.1 BiocFileCache_2.8.0
## [43] illuminaio_0.42.0 webshot_0.5.5
## [45] GenomicAlignments_1.36.0 jsonlite_1.8.7
## [47] multtest_2.56.0 ellipsis_0.3.2
## [49] survival_3.5-7 systemfonts_1.0.4
## [51] polylabelr_0.2.0 tools_4.3.1
## [53] progress_1.2.2 Rcpp_1.0.11
## [55] glue_1.6.2 gridExtra_2.3
## [57] xfun_0.40 dplyr_1.1.3
## [59] HDF5Array_1.28.1 fastmap_1.1.1
## [61] GGally_2.1.2 rhdf5filters_1.12.1
## [63] fansi_1.0.4 openssl_2.1.0
## [65] caTools_1.18.2 digest_0.6.33
## [67] R6_2.5.1 mime_0.12
## [69] colorspace_2.1-0 gtools_3.9.4
## [71] biomaRt_2.56.1 RSQLite_2.3.1
## [73] tidyr_1.3.0 utf8_1.2.3
## [75] generics_0.1.3 data.table_1.14.8
## [77] rtracklayer_1.60.1 prettyunits_1.1.1
## [79] httr_1.4.7 htmlwidgets_1.6.2
## [81] S4Arrays_1.0.6 pkgconfig_2.0.3
## [83] gtable_0.3.4 blob_1.2.4
## [85] siggenes_1.74.0 htmltools_0.5.6
## [87] echarts4r_0.4.5 scales_1.2.1
## [89] png_0.1-8 knitr_1.44
## [91] rstudioapi_0.15.0 tzdb_0.4.0
## [93] reshape2_1.4.4 rjson_0.2.21
## [95] nlme_3.1-163 curl_5.0.2
## [97] cachem_1.0.8 rhdf5_2.44.0
## [99] stringr_1.5.0 KernSmooth_2.23-22
## [101] AnnotationDbi_1.62.2 restfulr_0.0.15
## [103] GEOquery_2.68.0 pillar_1.9.0
## [105] grid_4.3.1 reshape_0.8.9
## [107] vctrs_0.6.3 gplots_3.1.3
## [109] promises_1.2.1 dbplyr_2.3.3
## [111] xtable_1.8-4 beeswarm_0.4.0
## [113] evaluate_0.21 readr_2.1.4
## [115] GenomicFeatures_1.52.2 cli_3.6.1
## [117] compiler_4.3.1 Rsamtools_2.16.0
## [119] rlang_1.1.1 crayon_1.5.2
## [121] rngtools_1.5.2 nor1mix_1.3-0
## [123] mclust_6.0.0 plyr_1.8.8
## [125] stringi_1.7.12 viridisLite_0.4.2
## [127] BiocParallel_1.34.2 munsell_0.5.0
## [129] Matrix_1.6-1 hms_1.1.3
## [131] sparseMatrixStats_1.12.2 bit64_4.0.5
## [133] ggplot2_3.4.3 Rhdf5lib_1.22.1
## [135] KEGGREST_1.40.0 shiny_1.7.5
## [137] memoise_2.0.1 bslib_0.5.1
## [139] bit_4.0.5