Validation of GMEA

Introduction

Previously we demonstrated the ability of GMEA to detect differentially methylated pathways. There remains a question about how reliable these observations are. To resolve it, we will use the Hcy dataset which consists of 172 samples. We will split the data into two groups and analyse these separately. If the pathway results are similar, then we can be confident about the accuracy.

suppressPackageStartupMessages({
  library("mitch")
  library("limma")
  library("kableExtra")
  library("eulerr")
  library("IlluminaHumanMethylation450kmanifest")
  library("IlluminaHumanMethylation450kanno.ilmn12.hg19")
  library("tictoc")
})

Load gene sets

Reactome pathways were downloaded on the 14th Sept 2023 from MsigDB.

gs_entrez <- gmt_import("c2.cp.reactome.v2023.1.Hs.entrez.gmt")

gs_symbols <- gmt_import("c2.cp.reactome.v2023.1.Hs.symbols.gmt")

Load annotation data

ann450k <- getAnnotation(IlluminaHumanMethylation450kanno.ilmn12.hg19)
myann <- data.frame(ann450k[,c("UCSC_RefGene_Name","Regulatory_Feature_Group")])
promoters <- grep("Prom",myann$Regulatory_Feature_Group)

Load functions for GMEA

agg <- function(dm,cores=1) {
  gn <- unique(unlist(strsplit( dm$UCSC_RefGene_Name ,";")))
  gnl <- strsplit( dm$UCSC_RefGene_Name ,";")
  gnl <- mclapply(gnl,unique,mc.cores=cores)
  dm$UCSC_RefGene_Name <- gnl
  l <- mclapply(1:nrow(dm), function(i) {
    a <- dm[i,]
    len <- length(a[[1]][[1]])
    tvals <- as.numeric(rep(a["t"],len))
    genes <- a[[1]][[1]]
    data.frame(genes,tvals)
  },mc.cores=cores)
  df <- do.call(rbind,l)
  keep <- names(which(table(df$genes)>1))
  df <- df[df$genes %in% keep,]
  gn <- unique(df$genes)
  gme_res <- lapply( 1:length(gn), function(i) {
    g <- gn[i]
    tstats <- df[which(df$genes==g),"tvals"]
    myn <- length(tstats)
    mymean <- mean(tstats)
    mymedian <- median(tstats)
    tselfcont <- t.test(tstats)
    res <- c("gene"=g,"nprobes"=myn,"mean"=mymean,"median"=mymedian,
      "p-value(sc)"=tselfcont$p.value)
  } )
  gme_res_df <- do.call(rbind, gme_res)
  rownames(gme_res_df) <- gme_res_df[,1]
  gme_res_df <- gme_res_df[,-1]
  tmp <- apply(gme_res_df,2,as.numeric)
  rownames(tmp) <- rownames(gme_res_df)
  gme_res_df <- as.data.frame(tmp)
  gme_res_df$sig <- -log10(gme_res_df[,4])
  gme_res_df <- gme_res_df[order(-gme_res_df$sig),]
  gme_res_df$`fdr(sc)` <- p.adjust(gme_res_df$`p-value(sc)`)
  out <- list("df"=df,"gme_res_df"=gme_res_df)
  return(out)
}

ttenrich <- function(m,genesets,cores=1) {
  res <- mclapply( 1:length(genesets), function(i) {
    gs <- genesets[i]
    name <- names(gs)
    n_members <- length(which(rownames(m) %in% gs[[1]]))
    if ( n_members > 4 ) {
      tstats <- m[which(rownames(m) %in% gs[[1]]),]
      myn <- length(tstats)
      mymean <- mean(tstats)
      mymedian <- median(tstats)
      wt <- wilcox.test(tstats)
      res <- c(name,myn,mymean,mymedian,wt$p.value)
    }
  } , mc.cores = cores)
  res_df <- do.call(rbind, res)
  rownames(res_df) <- res_df[,1]
  res_df <- res_df[,-1]
  colnames(res_df) <- c("n_genes","t_mean","t_median","p.value")
  tmp <- apply(res_df,2,as.numeric)
  rownames(tmp) <- rownames(res_df)
  res_df <- tmp
  res_df <- as.data.frame(res_df)
  res_df <- res_df[order(res_df$p.value),]
  res_df$logp <- -log10(res_df$p.value )
  res_df$fdr <- p.adjust(res_df$p.value,method="fdr")
  res_df[order(abs(res_df$p.value)),]
  return(res_df)
}

Load methylation data

Load the data.

design <- as.data.frame(readRDS("dm3_design.Rds"))
dim(design)

## [1] 172   4

mx <- readRDS("dm3_mx.Rds")
dim(mx)

## [1] 422374    172

Split data

Split the data into two smaller groups (s and b).

g1 <- which(as.data.frame(design)$groups==1)
g2 <- which(as.data.frame(design)$groups==2)
g3 <- which(as.data.frame(design)$groups==3)

seed=200

set.seed(seed); g1s <- sample(g1,floor(length(g1)/2))
g1b <- setdiff(g1,g1s)

set.seed(seed); g2s <- sample(g2,floor(length(g2)/2))
g2b <- setdiff(g2,g2s)

set.seed(seed); g3s <- sample(g3,floor(length(g3)/2))
g3b <- setdiff(g3,g3s)

gxs <- c(g1s,g2s,g3s)
gxs <- gxs[order(gxs)]

gxb <- c(g1b,g2b,g3b)
gxb <- gxb[order(gxb)]

dess <- design[gxs,]
mxs <- mx[,gxs]

desb <- design[gxb,]
mxb <- mx[,gxb]

Limma + GMEA

Using the S dataset, apply limma to conduct differential methylation analysis, then GMEA. GMEA has two steps:

Aggregate probe t-statistics to genes.
Perform enrichment test using 1 sample t-test.

d <- model.matrix(~dess$age+ dess$sex + dess$groups)

fit.reduced <- lmFit(mxs,d)
fit.reduced <- eBayes(fit.reduced)
dm <- topTable(fit.reduced,coef=4, number = Inf)
dma <- merge(myann,dm,by=0)
dma <- dma[order(dma$P.Value),]
dma <- dma[,c("UCSC_RefGene_Name","t")]

dmagg <- agg(dma,cores=8)
m <- dmagg$gme_res_df
m <- m[,"median",drop=FALSE]
tres <- ttenrich(m=m,genesets=gs_symbols,cores=8)
sig <- subset(tres,fdr<0.05)
ssig <- sig

Now repeat with the b set.

d <- model.matrix(~desb$age+ desb$sex + desb$groups)

fit.reduced <- lmFit(mxb,d)
fit.reduced <- eBayes(fit.reduced)
dm <- topTable(fit.reduced,coef=4, number = Inf)
dma <- merge(myann,dm,by=0)
dma <- dma[order(dma$P.Value),]
dma <- dma[,c("UCSC_RefGene_Name","t")]

dmagg <- agg(dma,cores=8)
m <- dmagg$gme_res_df
m <- m[,"median",drop=FALSE]
tres <- ttenrich(m=m,genesets=gs_symbols,cores=8)
sig <- subset(tres,fdr<0.05)
bsig <- sig

Overlap

Use Euler diagrams to find the similarity. As medians are normally distributed about the zero, one would expect equal numbers of up and down- methylated gene sets. Therefore the t-test based approach could be considered better.

ssig_up <- rownames(subset(ssig,t_median>0))
ssig_dn <- rownames(subset(ssig,t_median<0))

bsig_up <- rownames(subset(bsig,t_median>0))
bsig_dn <- rownames(subset(bsig,t_median<0))

v1 <- list("s up"=ssig_up, "s dn"=ssig_dn,
  "b up"=bsig_up,"b dn"=bsig_dn)

plot(euler(v1),quantities = TRUE)

TODO

Make sure it is all working properly.
Repeat for other seed values and aggregate the results.

Session information

For reproducibility.

sessionInfo()

## R version 4.3.1 (2023-06-16)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.3 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0 
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Australia/Melbourne
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] tictoc_1.2                                        
##  [2] IlluminaHumanMethylation450kanno.ilmn12.hg19_0.6.1
##  [3] IlluminaHumanMethylation450kmanifest_0.4.0        
##  [4] minfi_1.46.0                                      
##  [5] bumphunter_1.42.0                                 
##  [6] locfit_1.5-9.8                                    
##  [7] iterators_1.0.14                                  
##  [8] foreach_1.5.2                                     
##  [9] Biostrings_2.68.1                                 
## [10] XVector_0.40.0                                    
## [11] SummarizedExperiment_1.30.2                       
## [12] Biobase_2.60.0                                    
## [13] MatrixGenerics_1.12.3                             
## [14] matrixStats_1.0.0                                 
## [15] GenomicRanges_1.52.0                              
## [16] GenomeInfoDb_1.36.3                               
## [17] IRanges_2.34.1                                    
## [18] S4Vectors_0.38.1                                  
## [19] BiocGenerics_0.46.0                               
## [20] eulerr_7.0.0                                      
## [21] kableExtra_1.3.4                                  
## [22] limma_3.56.2                                      
## [23] mitch_1.12.0                                      
## 
## loaded via a namespace (and not attached):
##   [1] splines_4.3.1             later_1.3.1              
##   [3] BiocIO_1.10.0             bitops_1.0-7             
##   [5] filelock_1.0.2            tibble_3.2.1             
##   [7] polyclip_1.10-4           preprocessCore_1.62.1    
##   [9] XML_3.99-0.14             lifecycle_1.0.3          
##  [11] lattice_0.21-8            MASS_7.3-60              
##  [13] base64_2.0.1              scrime_1.3.5             
##  [15] magrittr_2.0.3            sass_0.4.7               
##  [17] rmarkdown_2.24            jquerylib_0.1.4          
##  [19] yaml_2.3.7                httpuv_1.6.11            
##  [21] doRNG_1.8.6               askpass_1.2.0            
##  [23] DBI_1.1.3                 RColorBrewer_1.1-3       
##  [25] abind_1.4-5               zlibbioc_1.46.0          
##  [27] rvest_1.0.3               quadprog_1.5-8           
##  [29] purrr_1.0.2               RCurl_1.98-1.12          
##  [31] rappdirs_0.3.3            GenomeInfoDbData_1.2.10  
##  [33] genefilter_1.82.1         annotate_1.78.0          
##  [35] svglite_2.1.1             DelayedMatrixStats_1.22.6
##  [37] codetools_0.2-19          DelayedArray_0.26.7      
##  [39] xml2_1.3.5                tidyselect_1.2.0         
##  [41] beanplot_1.3.1            BiocFileCache_2.8.0      
##  [43] illuminaio_0.42.0         webshot_0.5.5            
##  [45] GenomicAlignments_1.36.0  jsonlite_1.8.7           
##  [47] multtest_2.56.0           ellipsis_0.3.2           
##  [49] survival_3.5-7            systemfonts_1.0.4        
##  [51] polylabelr_0.2.0          tools_4.3.1              
##  [53] progress_1.2.2            Rcpp_1.0.11              
##  [55] glue_1.6.2                gridExtra_2.3            
##  [57] xfun_0.40                 dplyr_1.1.3              
##  [59] HDF5Array_1.28.1          fastmap_1.1.1            
##  [61] GGally_2.1.2              rhdf5filters_1.12.1      
##  [63] fansi_1.0.4               openssl_2.1.0            
##  [65] caTools_1.18.2            digest_0.6.33            
##  [67] R6_2.5.1                  mime_0.12                
##  [69] colorspace_2.1-0          gtools_3.9.4             
##  [71] biomaRt_2.56.1            RSQLite_2.3.1            
##  [73] tidyr_1.3.0               utf8_1.2.3               
##  [75] generics_0.1.3            data.table_1.14.8        
##  [77] rtracklayer_1.60.1        prettyunits_1.1.1        
##  [79] httr_1.4.7                htmlwidgets_1.6.2        
##  [81] S4Arrays_1.0.6            pkgconfig_2.0.3          
##  [83] gtable_0.3.4              blob_1.2.4               
##  [85] siggenes_1.74.0           htmltools_0.5.6          
##  [87] echarts4r_0.4.5           scales_1.2.1             
##  [89] png_0.1-8                 knitr_1.44               
##  [91] rstudioapi_0.15.0         tzdb_0.4.0               
##  [93] reshape2_1.4.4            rjson_0.2.21             
##  [95] nlme_3.1-163              curl_5.0.2               
##  [97] cachem_1.0.8              rhdf5_2.44.0             
##  [99] stringr_1.5.0             KernSmooth_2.23-22       
## [101] AnnotationDbi_1.62.2      restfulr_0.0.15          
## [103] GEOquery_2.68.0           pillar_1.9.0             
## [105] grid_4.3.1                reshape_0.8.9            
## [107] vctrs_0.6.3               gplots_3.1.3             
## [109] promises_1.2.1            dbplyr_2.3.3             
## [111] xtable_1.8-4              beeswarm_0.4.0           
## [113] evaluate_0.21             readr_2.1.4              
## [115] GenomicFeatures_1.52.2    cli_3.6.1                
## [117] compiler_4.3.1            Rsamtools_2.16.0         
## [119] rlang_1.1.1               crayon_1.5.2             
## [121] rngtools_1.5.2            nor1mix_1.3-0            
## [123] mclust_6.0.0              plyr_1.8.8               
## [125] stringi_1.7.12            viridisLite_0.4.2        
## [127] BiocParallel_1.34.2       munsell_0.5.0            
## [129] Matrix_1.6-1              hms_1.1.3                
## [131] sparseMatrixStats_1.12.2  bit64_4.0.5              
## [133] ggplot2_3.4.3             Rhdf5lib_1.22.1          
## [135] KEGGREST_1.40.0           shiny_1.7.5              
## [137] memoise_2.0.1             bslib_0.5.1              
## [139] bit_4.0.5