Introduction

Here we will run test enrichment analysis approaches for undderstanding the effect of elevated homocysteine on blood methylomes. Homocysteine is thought to be an inhibitor of methyltransferases and hyperhomocysteine is associated with overall reduced genomic methylation. Previously we analysed the B-PROOF study data which enabled us to identify probes whose methylation correlatesor anti-correlated with homocysteine level.

The two enrichment analysis approaches we will use are:

  • gsameth: an ORA approach that addressed the biases inherent with infinium array analysis.

  • gmea: an experimental application of functional class scoring for infinium analysis.

suppressPackageStartupMessages({
  library("missMethyl")
  library("kableExtra")
  library("mitch")
  library("eulerr")
  library("IlluminaHumanMethylation450kmanifest")
  library("IlluminaHumanMethylation450kanno.ilmn12.hg19")
  library("tictoc")
})

Load the data.

dm <- read.table("dma3a.tsv")
rownames(dm) <- dm$Row.names
dm$Row.names=NULL

head(dm, 10) %>% kbl() %>% kable_paper("hover", full_width = F)
UCSC_RefGene_Name Regulatory_Feature_Group logFC AveExpr t P.Value adj.P.Val B
cg04905210 POLR3D -0.2893816 -2.8148883 -5.377810 2.0e-07 0.1017096 5.889905
cg09338148 Unclassified -0.2919282 -1.1720330 -5.116352 8.0e-07 0.1729449 4.884437
cg04247967 -0.2057500 3.1797718 -4.953286 1.7e-06 0.2374161 4.275061
cg06458106 -0.1900758 1.6656963 -4.763516 4.0e-06 0.2374161 3.583754
cg26425904 OCA2 Unclassified -0.2527032 -4.0235228 -4.731955 4.6e-06 0.2374161 3.470695
cg10746622 PRSSL1 -0.1999261 1.5539691 -4.707850 5.1e-06 0.2374161 3.384715
cg19590707 SLC16A3;SLC16A3;SLC16A3 Promoter_Associated -0.2479821 -1.2545892 -4.704517 5.2e-06 0.2374161 3.372853
cg09762242 SIPA1;SIPA1 Promoter_Associated -0.2102410 -2.5125329 -4.704466 5.2e-06 0.2374161 3.372673
cg03622371 P2RY6;P2RY6;P2RY6;P2RY6 Unclassified -0.2403843 -1.6741058 -4.679125 5.8e-06 0.2374161 3.282683
cg11257888 RCAN3 0.2790028 0.4787858 4.674646 5.9e-06 0.2374161 3.266814

Reactome pathways were downloaded on the 14th Sept 2023 from MsigDB.

gs_entrez <- gmt_import("c2.cp.reactome.v2023.1.Hs.entrez.gmt")

gs_symbols <- gmt_import("c2.cp.reactome.v2023.1.Hs.symbols.gmt")

ORA with GSAMETH - parth of the MissMethyl package

Now perform ORA analysis using gsameth with the top 500 probes.

sigup <- rownames(head(subset(dm,logFC>0),1000))
gsaup <- gsameth(sig.cpg=head(sigup,500), all.cpg=rownames(dm), collection=gs_entrez)
## All input CpGs are used for testing.
gsaup <- gsaup[order(gsaup$P.DE),]
head(gsaup) %>% kbl(caption="Top hypermethylated pathways") %>% kable_paper("hover", full_width = F)
Top hypermethylated pathways
N DE P.DE FDR
REACTOME_INTERLEUKIN_21_SIGNALING 9 4 0.0000418 0.0445810
REACTOME_INTERLEUKIN_20_FAMILY_SIGNALING 26 5 0.0000730 0.0445810
REACTOME_INTERLEUKIN_6_SIGNALING 11 4 0.0000809 0.0445810
REACTOME_INTERLEUKIN_2_FAMILY_SIGNALING 39 6 0.0001994 0.0824657
REACTOME_TNF_RECEPTOR_SUPERFAMILY_TNFSF_MEMBERS_MEDIATING_NON_CANONICAL_NF_KB_PATHWAY 16 4 0.0002769 0.0916139
REACTOME_INTERFERON_ALPHA_BETA_SIGNALING 64 7 0.0003500 0.0964951
sigdn <- rownames(head(subset(dm,logFC<0),1000))
gsadn <- gsameth(sig.cpg=head(sigdn,500), all.cpg=rownames(dm), collection=gs_entrez)
## All input CpGs are used for testing.
gsadn <- gsadn[order(gsadn$P.DE),]
head(gsadn) %>% kbl(caption="Top hypomethylated pathways") %>% kable_paper("hover", full_width = F)
Top hypomethylated pathways
N DE P.DE FDR
REACTOME_SYNTHESIS_OF_BILE_ACIDS_AND_BILE_SALTS_VIA_27_HYDROXYCHOLESTEROL 15 3 0.0016497 1
REACTOME_THROMBIN_SIGNALLING_THROUGH_PROTEINASE_ACTIVATED_RECEPTORS_PARS 32 4 0.0057066 1
REACTOME_SYNTHESIS_OF_BILE_ACIDS_AND_BILE_SALTS_VIA_7ALPHA_HYDROXYCHOLESTEROL 24 3 0.0058080 1
REACTOME_BLOOD_GROUP_SYSTEMS_BIOSYNTHESIS 21 3 0.0058718 1
REACTOME_TRANSPORT_OF_NUCLEOTIDE_SUGARS 8 2 0.0081909 1
REACTOME_PASSIVE_TRANSPORT_BY_AQUAPORINS 13 2 0.0103872 1

GMEA

Now with GMEA.

Start with gathering the annotation set.

ann450k <- getAnnotation(IlluminaHumanMethylation450kanno.ilmn12.hg19)
myann <- data.frame(ann450k[,c("UCSC_RefGene_Name","Regulatory_Feature_Group")])
promoters <- grep("Prom",myann$Regulatory_Feature_Group)

Histogram of limma t-statistics.

dm <- dm[,c("UCSC_RefGene_Name","t")]
hist(dm$t,breaks=seq(from=-6,to=6,by=1))

Aggregate probe t-scores to gene level scores.

The object generated has two parts:

  1. df: a simple table of the gene name and the median tvalue

  2. gme_res_df: results of 1-sample Wilcox test.

CORES= detectCores()

agg <- function(dm) {
  gn <- unique(unlist(strsplit( dm$UCSC_RefGene_Name ,";")))
  gnl <- strsplit( dm$UCSC_RefGene_Name ,";")
  gnl <- mclapply(gnl,unique,mc.cores=CORES)
  dm$UCSC_RefGene_Name <- gnl
  l <- mclapply(1:nrow(dm), function(i) {
    a <- dm[i,]
    len <- length(a[[1]][[1]])
    tvals <- as.numeric(rep(a[2],len))
    genes <- a[[1]][[1]]
    data.frame(genes,tvals)
  },mc.cores=CORES)
  df <- do.call(rbind,l)
  gme_res <- mclapply( 1:length(gn), function(i) {
    g <- gn[i]
    tstats <- df[which(df$genes==g),"tvals"]
    myn <- length(tstats)
    mymean <- mean(tstats)
    mymedian <- median(tstats)
    wtselfcont <- wilcox.test(tstats)
    res <- c("gene"=g,"nprobes"=myn,"mean"=mymean,"median"=mymedian,
      "p-value(sc)"=wtselfcont$p.value)
  } , mc.cores=CORES )
  gme_res_df <- do.call(rbind, gme_res)
  rownames(gme_res_df) <- gme_res_df[,1]
  gme_res_df <- gme_res_df[,-1]
  tmp <- apply(gme_res_df,2,as.numeric)
  rownames(tmp) <- rownames(gme_res_df)
  gme_res_df <- as.data.frame(tmp)
  gme_res_df$sig <- -log10(gme_res_df[,4])
  gme_res_df <- gme_res_df[order(-gme_res_df$sig),]
  gme_res_df$`fdr(sc)` <- p.adjust(gme_res_df$`p-value(sc)`)
  out <- list("df"=df,"gme_res_df"=gme_res_df)
  return(out)
}

tic()
dmagg <- agg(dm)
toc()
## 65.778 sec elapsed
head(dmagg$gme_res_df)
##         nprobes       mean     median  p-value(sc)      sig      fdr(sc)
## TNXB        531  0.3476999  0.4126704 4.781602e-15 14.32043 9.608152e-11
## PCDHA1      162 -0.7059103 -0.6207302 3.718622e-14 13.42962 7.471827e-10
## NNAT         49 -0.9377326 -0.9433135 6.750156e-14 13.17069 1.356241e-09
## PCDHA2      149 -0.7171256 -0.6259052 3.605827e-13 12.44300 7.244468e-09
## PCDHA3      141 -0.7168381 -0.6259052 2.805782e-12 11.55195 5.636815e-08
## KCNQ1DN      39 -1.6152306 -1.7110279 3.637979e-12 11.43914 7.308336e-08

Does 1-sample t-test work better?

agg2 <- function(dm) {
  gn <- unique(unlist(strsplit( dm$UCSC_RefGene_Name ,";")))
  gnl <- strsplit( dm$UCSC_RefGene_Name ,";")
  gnl <- mclapply(gnl,unique,mc.cores=CORES)
  dm$UCSC_RefGene_Name <- gnl
  l <- mclapply(1:nrow(dm), function(i) {
    a <- dm[i,]
    len <- length(a[[1]][[1]])
    tvals <- as.numeric(rep(a["t"],len))
    genes <- a[[1]][[1]]
    data.frame(genes,tvals)
  },mc.cores=CORES)
  df <- do.call(rbind,l)
  keep <- names(which(table(df$genes)>1))
  df <- df[df$genes %in% keep,]
  gn <- unique(df$genes)
  gme_res <- lapply( 1:length(gn), function(i) {
    g <- gn[i]
    tstats <- df[which(df$genes==g),"tvals"]
    myn <- length(tstats)
    mymean <- mean(tstats)
    mymedian <- median(tstats)
    tselfcont <- t.test(tstats)
    res <- c("gene"=g,"nprobes"=myn,"mean"=mymean,"median"=mymedian,
      "p-value(sc)"=tselfcont$p.value)
  } )
  gme_res_df <- do.call(rbind, gme_res)
  rownames(gme_res_df) <- gme_res_df[,1]
  gme_res_df <- gme_res_df[,-1]
  tmp <- apply(gme_res_df,2,as.numeric)
  rownames(tmp) <- rownames(gme_res_df)
  gme_res_df <- as.data.frame(tmp)
  gme_res_df$sig <- -log10(gme_res_df[,4])
  gme_res_df <- gme_res_df[order(-gme_res_df$sig),]
  gme_res_df$`fdr(sc)` <- p.adjust(gme_res_df$`p-value(sc)`)
  out <- list("df"=df,"gme_res_df"=gme_res_df)
  return(out)
}

tic()
dmagg2 <- agg2(dm)
toc()
## 105.684 sec elapsed
head(dmagg2$gme_res_df)
##         nprobes       mean     median  p-value(sc)      sig      fdr(sc)
## KCNQ1DN      39 -1.6152306 -1.7110279 2.811916e-17 16.55100 5.508262e-13
## NNAT         49 -0.9377326 -0.9433135 8.119861e-17 16.09045 1.590518e-12
## PCDHA1      162 -0.7059103 -0.6207302 2.891024e-16 15.53895 5.662649e-12
## TNXB        531  0.3476999  0.4126704 1.989790e-15 14.70119 3.897202e-11
## MESTIT1      59 -0.7540952 -0.8024776 2.553284e-15 14.59290 5.000607e-11
## PCDHA2      149 -0.7171256 -0.6259052 5.353803e-15 14.27134 1.048489e-10

Session information

For reproducibility.

sessionInfo()
## R version 4.3.1 (2023-06-16)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.3 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0 
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Australia/Melbourne
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] mitch_1.12.0                                       
##  [2] tictoc_1.2                                         
##  [3] IlluminaHumanMethylation450kmanifest_0.4.0         
##  [4] eulerr_7.0.0                                       
##  [5] kableExtra_1.3.4                                   
##  [6] missMethyl_1.34.0                                  
##  [7] IlluminaHumanMethylationEPICanno.ilm10b4.hg19_0.6.0
##  [8] IlluminaHumanMethylation450kanno.ilmn12.hg19_0.6.1 
##  [9] minfi_1.46.0                                       
## [10] bumphunter_1.42.0                                  
## [11] locfit_1.5-9.8                                     
## [12] iterators_1.0.14                                   
## [13] foreach_1.5.2                                      
## [14] Biostrings_2.68.1                                  
## [15] XVector_0.40.0                                     
## [16] SummarizedExperiment_1.30.2                        
## [17] Biobase_2.60.0                                     
## [18] MatrixGenerics_1.12.3                              
## [19] matrixStats_1.0.0                                  
## [20] GenomicRanges_1.52.0                               
## [21] GenomeInfoDb_1.36.3                                
## [22] IRanges_2.34.1                                     
## [23] S4Vectors_0.38.1                                   
## [24] BiocGenerics_0.46.0                                
## 
## loaded via a namespace (and not attached):
##   [1] splines_4.3.1             later_1.3.1              
##   [3] BiocIO_1.10.0             bitops_1.0-7             
##   [5] filelock_1.0.2            BiasedUrn_2.0.11         
##   [7] tibble_3.2.1              preprocessCore_1.62.1    
##   [9] XML_3.99-0.14             lifecycle_1.0.3          
##  [11] lattice_0.21-8            MASS_7.3-60              
##  [13] base64_2.0.1              scrime_1.3.5             
##  [15] magrittr_2.0.3            sass_0.4.7               
##  [17] limma_3.56.2              rmarkdown_2.24           
##  [19] jquerylib_0.1.4           yaml_2.3.7               
##  [21] httpuv_1.6.11             doRNG_1.8.6              
##  [23] askpass_1.2.0             DBI_1.1.3                
##  [25] RColorBrewer_1.1-3        abind_1.4-5              
##  [27] zlibbioc_1.46.0           rvest_1.0.3              
##  [29] quadprog_1.5-8            purrr_1.0.2              
##  [31] RCurl_1.98-1.12           rappdirs_0.3.3           
##  [33] GenomeInfoDbData_1.2.10   genefilter_1.82.1        
##  [35] annotate_1.78.0           svglite_2.1.1            
##  [37] DelayedMatrixStats_1.22.6 codetools_0.2-19         
##  [39] DelayedArray_0.26.7       xml2_1.3.5               
##  [41] tidyselect_1.2.0          beanplot_1.3.1           
##  [43] BiocFileCache_2.8.0       illuminaio_0.42.0        
##  [45] webshot_0.5.5             jsonlite_1.8.7           
##  [47] GenomicAlignments_1.36.0  multtest_2.56.0          
##  [49] ellipsis_0.3.2            survival_3.5-7           
##  [51] systemfonts_1.0.4         tools_4.3.1              
##  [53] progress_1.2.2            Rcpp_1.0.11              
##  [55] glue_1.6.2                gridExtra_2.3            
##  [57] xfun_0.40                 dplyr_1.1.3              
##  [59] HDF5Array_1.28.1          fastmap_1.1.1            
##  [61] GGally_2.1.2              rhdf5filters_1.12.1      
##  [63] fansi_1.0.4               openssl_2.1.0            
##  [65] caTools_1.18.2            digest_0.6.33            
##  [67] R6_2.5.1                  mime_0.12                
##  [69] colorspace_2.1-0          gtools_3.9.4             
##  [71] biomaRt_2.56.1            RSQLite_2.3.1            
##  [73] utf8_1.2.3                tidyr_1.3.0              
##  [75] generics_0.1.3            data.table_1.14.8        
##  [77] rtracklayer_1.60.1        prettyunits_1.1.1        
##  [79] httr_1.4.7                htmlwidgets_1.6.2        
##  [81] S4Arrays_1.0.6            pkgconfig_2.0.3          
##  [83] gtable_0.3.4              blob_1.2.4               
##  [85] siggenes_1.74.0           htmltools_0.5.6          
##  [87] echarts4r_0.4.5           scales_1.2.1             
##  [89] png_0.1-8                 knitr_1.44               
##  [91] rstudioapi_0.15.0         reshape2_1.4.4           
##  [93] tzdb_0.4.0                rjson_0.2.21             
##  [95] nlme_3.1-163              curl_5.0.2               
##  [97] org.Hs.eg.db_3.17.0       cachem_1.0.8             
##  [99] rhdf5_2.44.0              stringr_1.5.0            
## [101] KernSmooth_2.23-22        AnnotationDbi_1.62.2     
## [103] restfulr_0.0.15           GEOquery_2.68.0          
## [105] pillar_1.9.0              grid_4.3.1               
## [107] reshape_0.8.9             vctrs_0.6.3              
## [109] gplots_3.1.3              promises_1.2.1           
## [111] dbplyr_2.3.3              xtable_1.8-4             
## [113] beeswarm_0.4.0            evaluate_0.21            
## [115] readr_2.1.4               GenomicFeatures_1.52.2   
## [117] cli_3.6.1                 compiler_4.3.1           
## [119] Rsamtools_2.16.0          rlang_1.1.1              
## [121] crayon_1.5.2              rngtools_1.5.2           
## [123] nor1mix_1.3-0             mclust_6.0.0             
## [125] plyr_1.8.8                stringi_1.7.12           
## [127] viridisLite_0.4.2         BiocParallel_1.34.2      
## [129] munsell_0.5.0             Matrix_1.6-1             
## [131] hms_1.1.3                 sparseMatrixStats_1.12.2 
## [133] bit64_4.0.5               ggplot2_3.4.3            
## [135] Rhdf5lib_1.22.1           KEGGREST_1.40.0          
## [137] statmod_1.5.0             shiny_1.7.5              
## [139] highr_0.10                memoise_2.0.1            
## [141] bslib_0.5.1               bit_4.0.5