GOmeth test

Introduction

Here we will run test enrichment analysis approaches for undderstanding the effect of elevated homocysteine on blood methylomes. Homocysteine is thought to be an inhibitor of methyltransferases and hyperhomocysteine is associated with overall reduced genomic methylation. Previously we analysed the B-PROOF study data which enabled us to identify probes whose methylation correlatesor anti-correlated with homocysteine level.

The two enrichment analysis approaches we will use are:

gsameth: an ORA approach that addressed the biases inherent with infinium array analysis.
gmea: an experimental application of functional class scoring for infinium analysis.

suppressPackageStartupMessages({
  library("missMethyl")
  library("kableExtra")
  library("mitch")
  library("eulerr")
  library("IlluminaHumanMethylation450kmanifest")
  library("IlluminaHumanMethylation450kanno.ilmn12.hg19")
  library("tictoc")
})

Load the data.

dm <- read.table("dma3a.tsv")
rownames(dm) <- dm$Row.names
dm$Row.names=NULL

head(dm, 10) %>% kbl() %>% kable_paper("hover", full_width = F)

	UCSC_RefGene_Name	Regulatory_Feature_Group	logFC	AveExpr	t	P.Value	adj.P.Val	B
cg04905210	POLR3D		-0.2893816	-2.8148883	-5.377810	2.0e-07	0.1017096	5.889905
cg09338148		Unclassified	-0.2919282	-1.1720330	-5.116352	8.0e-07	0.1729449	4.884437
cg04247967			-0.2057500	3.1797718	-4.953286	1.7e-06	0.2374161	4.275061
cg06458106			-0.1900758	1.6656963	-4.763516	4.0e-06	0.2374161	3.583754
cg26425904	OCA2	Unclassified	-0.2527032	-4.0235228	-4.731955	4.6e-06	0.2374161	3.470695
cg10746622	PRSSL1		-0.1999261	1.5539691	-4.707850	5.1e-06	0.2374161	3.384715
cg19590707	SLC16A3;SLC16A3;SLC16A3	Promoter_Associated	-0.2479821	-1.2545892	-4.704517	5.2e-06	0.2374161	3.372853
cg09762242	SIPA1;SIPA1	Promoter_Associated	-0.2102410	-2.5125329	-4.704466	5.2e-06	0.2374161	3.372673
cg03622371	P2RY6;P2RY6;P2RY6;P2RY6	Unclassified	-0.2403843	-1.6741058	-4.679125	5.8e-06	0.2374161	3.282683
cg11257888	RCAN3		0.2790028	0.4787858	4.674646	5.9e-06	0.2374161	3.266814

Reactome pathways were downloaded on the 14th Sept 2023 from MsigDB.

gs_entrez <- gmt_import("c2.cp.reactome.v2023.1.Hs.entrez.gmt")

gs_symbols <- gmt_import("c2.cp.reactome.v2023.1.Hs.symbols.gmt")

ORA with GSAMETH - parth of the MissMethyl package

Now perform ORA analysis using gsameth with the top 500 probes.

sigup <- rownames(head(subset(dm,logFC>0),1000))
gsaup <- gsameth(sig.cpg=head(sigup,500), all.cpg=rownames(dm), collection=gs_entrez)

## All input CpGs are used for testing.

gsaup <- gsaup[order(gsaup$P.DE),]
head(gsaup) %>% kbl(caption="Top hypermethylated pathways") %>% kable_paper("hover", full_width = F)

Top hypermethylated pathways
	N	DE	P.DE	FDR
REACTOME_INTERLEUKIN_21_SIGNALING	9	4	0.0000418	0.0445810
REACTOME_INTERLEUKIN_20_FAMILY_SIGNALING	26	5	0.0000730	0.0445810
REACTOME_INTERLEUKIN_6_SIGNALING	11	4	0.0000809	0.0445810
REACTOME_INTERLEUKIN_2_FAMILY_SIGNALING	39	6	0.0001994	0.0824657
REACTOME_TNF_RECEPTOR_SUPERFAMILY_TNFSF_MEMBERS_MEDIATING_NON_CANONICAL_NF_KB_PATHWAY	16	4	0.0002769	0.0916139
REACTOME_INTERFERON_ALPHA_BETA_SIGNALING	64	7	0.0003500	0.0964951

sigdn <- rownames(head(subset(dm,logFC<0),1000))
gsadn <- gsameth(sig.cpg=head(sigdn,500), all.cpg=rownames(dm), collection=gs_entrez)

## All input CpGs are used for testing.

gsadn <- gsadn[order(gsadn$P.DE),]
head(gsadn) %>% kbl(caption="Top hypomethylated pathways") %>% kable_paper("hover", full_width = F)

Top hypomethylated pathways
	N	DE	P.DE	FDR
REACTOME_SYNTHESIS_OF_BILE_ACIDS_AND_BILE_SALTS_VIA_27_HYDROXYCHOLESTEROL	15	3	0.0016497	1
REACTOME_THROMBIN_SIGNALLING_THROUGH_PROTEINASE_ACTIVATED_RECEPTORS_PARS	32	4	0.0057066	1
REACTOME_SYNTHESIS_OF_BILE_ACIDS_AND_BILE_SALTS_VIA_7ALPHA_HYDROXYCHOLESTEROL	24	3	0.0058080	1
REACTOME_BLOOD_GROUP_SYSTEMS_BIOSYNTHESIS	21	3	0.0058718	1
REACTOME_TRANSPORT_OF_NUCLEOTIDE_SUGARS	8	2	0.0081909	1
REACTOME_PASSIVE_TRANSPORT_BY_AQUAPORINS	13	2	0.0103872	1

GMEA

Now with GMEA.

Start with gathering the annotation set.

ann450k <- getAnnotation(IlluminaHumanMethylation450kanno.ilmn12.hg19)
myann <- data.frame(ann450k[,c("UCSC_RefGene_Name","Regulatory_Feature_Group")])
promoters <- grep("Prom",myann$Regulatory_Feature_Group)

Histogram of limma t-statistics.

dm <- dm[,c("UCSC_RefGene_Name","t")]
hist(dm$t,breaks=seq(from=-6,to=6,by=1))

Aggregate probe t-scores to gene level scores.

The object generated has two parts:

df: a simple table of the gene name and the median tvalue
gme_res_df: results of 1-sample Wilcox test.

CORES= detectCores()

agg <- function(dm) {
  gn <- unique(unlist(strsplit( dm$UCSC_RefGene_Name ,";")))
  gnl <- strsplit( dm$UCSC_RefGene_Name ,";")
  gnl <- mclapply(gnl,unique,mc.cores=CORES)
  dm$UCSC_RefGene_Name <- gnl
  l <- mclapply(1:nrow(dm), function(i) {
    a <- dm[i,]
    len <- length(a[[1]][[1]])
    tvals <- as.numeric(rep(a[2],len))
    genes <- a[[1]][[1]]
    data.frame(genes,tvals)
  },mc.cores=CORES)
  df <- do.call(rbind,l)
  gme_res <- mclapply( 1:length(gn), function(i) {
    g <- gn[i]
    tstats <- df[which(df$genes==g),"tvals"]
    myn <- length(tstats)
    mymean <- mean(tstats)
    mymedian <- median(tstats)
    wtselfcont <- wilcox.test(tstats)
    res <- c("gene"=g,"nprobes"=myn,"mean"=mymean,"median"=mymedian,
      "p-value(sc)"=wtselfcont$p.value)
  } , mc.cores=CORES )
  gme_res_df <- do.call(rbind, gme_res)
  rownames(gme_res_df) <- gme_res_df[,1]
  gme_res_df <- gme_res_df[,-1]
  tmp <- apply(gme_res_df,2,as.numeric)
  rownames(tmp) <- rownames(gme_res_df)
  gme_res_df <- as.data.frame(tmp)
  gme_res_df$sig <- -log10(gme_res_df[,4])
  gme_res_df <- gme_res_df[order(-gme_res_df$sig),]
  gme_res_df$`fdr(sc)` <- p.adjust(gme_res_df$`p-value(sc)`)
  out <- list("df"=df,"gme_res_df"=gme_res_df)
  return(out)
}

tic()
dmagg <- agg(dm)
toc()

## 65.778 sec elapsed

head(dmagg$gme_res_df)

##         nprobes       mean     median  p-value(sc)      sig      fdr(sc)
## TNXB        531  0.3476999  0.4126704 4.781602e-15 14.32043 9.608152e-11
## PCDHA1      162 -0.7059103 -0.6207302 3.718622e-14 13.42962 7.471827e-10
## NNAT         49 -0.9377326 -0.9433135 6.750156e-14 13.17069 1.356241e-09
## PCDHA2      149 -0.7171256 -0.6259052 3.605827e-13 12.44300 7.244468e-09
## PCDHA3      141 -0.7168381 -0.6259052 2.805782e-12 11.55195 5.636815e-08
## KCNQ1DN      39 -1.6152306 -1.7110279 3.637979e-12 11.43914 7.308336e-08

Does 1-sample t-test work better?

agg2 <- function(dm) {
  gn <- unique(unlist(strsplit( dm$UCSC_RefGene_Name ,";")))
  gnl <- strsplit( dm$UCSC_RefGene_Name ,";")
  gnl <- mclapply(gnl,unique,mc.cores=CORES)
  dm$UCSC_RefGene_Name <- gnl
  l <- mclapply(1:nrow(dm), function(i) {
    a <- dm[i,]
    len <- length(a[[1]][[1]])
    tvals <- as.numeric(rep(a["t"],len))
    genes <- a[[1]][[1]]
    data.frame(genes,tvals)
  },mc.cores=CORES)
  df <- do.call(rbind,l)
  keep <- names(which(table(df$genes)>1))
  df <- df[df$genes %in% keep,]
  gn <- unique(df$genes)
  gme_res <- lapply( 1:length(gn), function(i) {
    g <- gn[i]
    tstats <- df[which(df$genes==g),"tvals"]
    myn <- length(tstats)
    mymean <- mean(tstats)
    mymedian <- median(tstats)
    tselfcont <- t.test(tstats)
    res <- c("gene"=g,"nprobes"=myn,"mean"=mymean,"median"=mymedian,
      "p-value(sc)"=tselfcont$p.value)
  } )
  gme_res_df <- do.call(rbind, gme_res)
  rownames(gme_res_df) <- gme_res_df[,1]
  gme_res_df <- gme_res_df[,-1]
  tmp <- apply(gme_res_df,2,as.numeric)
  rownames(tmp) <- rownames(gme_res_df)
  gme_res_df <- as.data.frame(tmp)
  gme_res_df$sig <- -log10(gme_res_df[,4])
  gme_res_df <- gme_res_df[order(-gme_res_df$sig),]
  gme_res_df$`fdr(sc)` <- p.adjust(gme_res_df$`p-value(sc)`)
  out <- list("df"=df,"gme_res_df"=gme_res_df)
  return(out)
}

tic()
dmagg2 <- agg2(dm)
toc()

## 105.684 sec elapsed

head(dmagg2$gme_res_df)

##         nprobes       mean     median  p-value(sc)      sig      fdr(sc)
## KCNQ1DN      39 -1.6152306 -1.7110279 2.811916e-17 16.55100 5.508262e-13
## NNAT         49 -0.9377326 -0.9433135 8.119861e-17 16.09045 1.590518e-12
## PCDHA1      162 -0.7059103 -0.6207302 2.891024e-16 15.53895 5.662649e-12
## TNXB        531  0.3476999  0.4126704 1.989790e-15 14.70119 3.897202e-11
## MESTIT1      59 -0.7540952 -0.8024776 2.553284e-15 14.59290 5.000607e-11
## PCDHA2      149 -0.7171256 -0.6259052 5.353803e-15 14.27134 1.048489e-10

Session information

For reproducibility.

sessionInfo()

## R version 4.3.1 (2023-06-16)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.3 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0 
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Australia/Melbourne
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] mitch_1.12.0                                       
##  [2] tictoc_1.2                                         
##  [3] IlluminaHumanMethylation450kmanifest_0.4.0         
##  [4] eulerr_7.0.0                                       
##  [5] kableExtra_1.3.4                                   
##  [6] missMethyl_1.34.0                                  
##  [7] IlluminaHumanMethylationEPICanno.ilm10b4.hg19_0.6.0
##  [8] IlluminaHumanMethylation450kanno.ilmn12.hg19_0.6.1 
##  [9] minfi_1.46.0                                       
## [10] bumphunter_1.42.0                                  
## [11] locfit_1.5-9.8                                     
## [12] iterators_1.0.14                                   
## [13] foreach_1.5.2                                      
## [14] Biostrings_2.68.1                                  
## [15] XVector_0.40.0                                     
## [16] SummarizedExperiment_1.30.2                        
## [17] Biobase_2.60.0                                     
## [18] MatrixGenerics_1.12.3                              
## [19] matrixStats_1.0.0                                  
## [20] GenomicRanges_1.52.0                               
## [21] GenomeInfoDb_1.36.3                                
## [22] IRanges_2.34.1                                     
## [23] S4Vectors_0.38.1                                   
## [24] BiocGenerics_0.46.0                                
## 
## loaded via a namespace (and not attached):
##   [1] splines_4.3.1             later_1.3.1              
##   [3] BiocIO_1.10.0             bitops_1.0-7             
##   [5] filelock_1.0.2            BiasedUrn_2.0.11         
##   [7] tibble_3.2.1              preprocessCore_1.62.1    
##   [9] XML_3.99-0.14             lifecycle_1.0.3          
##  [11] lattice_0.21-8            MASS_7.3-60              
##  [13] base64_2.0.1              scrime_1.3.5             
##  [15] magrittr_2.0.3            sass_0.4.7               
##  [17] limma_3.56.2              rmarkdown_2.24           
##  [19] jquerylib_0.1.4           yaml_2.3.7               
##  [21] httpuv_1.6.11             doRNG_1.8.6              
##  [23] askpass_1.2.0             DBI_1.1.3                
##  [25] RColorBrewer_1.1-3        abind_1.4-5              
##  [27] zlibbioc_1.46.0           rvest_1.0.3              
##  [29] quadprog_1.5-8            purrr_1.0.2              
##  [31] RCurl_1.98-1.12           rappdirs_0.3.3           
##  [33] GenomeInfoDbData_1.2.10   genefilter_1.82.1        
##  [35] annotate_1.78.0           svglite_2.1.1            
##  [37] DelayedMatrixStats_1.22.6 codetools_0.2-19         
##  [39] DelayedArray_0.26.7       xml2_1.3.5               
##  [41] tidyselect_1.2.0          beanplot_1.3.1           
##  [43] BiocFileCache_2.8.0       illuminaio_0.42.0        
##  [45] webshot_0.5.5             jsonlite_1.8.7           
##  [47] GenomicAlignments_1.36.0  multtest_2.56.0          
##  [49] ellipsis_0.3.2            survival_3.5-7           
##  [51] systemfonts_1.0.4         tools_4.3.1              
##  [53] progress_1.2.2            Rcpp_1.0.11              
##  [55] glue_1.6.2                gridExtra_2.3            
##  [57] xfun_0.40                 dplyr_1.1.3              
##  [59] HDF5Array_1.28.1          fastmap_1.1.1            
##  [61] GGally_2.1.2              rhdf5filters_1.12.1      
##  [63] fansi_1.0.4               openssl_2.1.0            
##  [65] caTools_1.18.2            digest_0.6.33            
##  [67] R6_2.5.1                  mime_0.12                
##  [69] colorspace_2.1-0          gtools_3.9.4             
##  [71] biomaRt_2.56.1            RSQLite_2.3.1            
##  [73] utf8_1.2.3                tidyr_1.3.0              
##  [75] generics_0.1.3            data.table_1.14.8        
##  [77] rtracklayer_1.60.1        prettyunits_1.1.1        
##  [79] httr_1.4.7                htmlwidgets_1.6.2        
##  [81] S4Arrays_1.0.6            pkgconfig_2.0.3          
##  [83] gtable_0.3.4              blob_1.2.4               
##  [85] siggenes_1.74.0           htmltools_0.5.6          
##  [87] echarts4r_0.4.5           scales_1.2.1             
##  [89] png_0.1-8                 knitr_1.44               
##  [91] rstudioapi_0.15.0         reshape2_1.4.4           
##  [93] tzdb_0.4.0                rjson_0.2.21             
##  [95] nlme_3.1-163              curl_5.0.2               
##  [97] org.Hs.eg.db_3.17.0       cachem_1.0.8             
##  [99] rhdf5_2.44.0              stringr_1.5.0            
## [101] KernSmooth_2.23-22        AnnotationDbi_1.62.2     
## [103] restfulr_0.0.15           GEOquery_2.68.0          
## [105] pillar_1.9.0              grid_4.3.1               
## [107] reshape_0.8.9             vctrs_0.6.3              
## [109] gplots_3.1.3              promises_1.2.1           
## [111] dbplyr_2.3.3              xtable_1.8-4             
## [113] beeswarm_0.4.0            evaluate_0.21            
## [115] readr_2.1.4               GenomicFeatures_1.52.2   
## [117] cli_3.6.1                 compiler_4.3.1           
## [119] Rsamtools_2.16.0          rlang_1.1.1              
## [121] crayon_1.5.2              rngtools_1.5.2           
## [123] nor1mix_1.3-0             mclust_6.0.0             
## [125] plyr_1.8.8                stringi_1.7.12           
## [127] viridisLite_0.4.2         BiocParallel_1.34.2      
## [129] munsell_0.5.0             Matrix_1.6-1             
## [131] hms_1.1.3                 sparseMatrixStats_1.12.2 
## [133] bit64_4.0.5               ggplot2_3.4.3            
## [135] Rhdf5lib_1.22.1           KEGGREST_1.40.0          
## [137] statmod_1.5.0             shiny_1.7.5              
## [139] highr_0.10                memoise_2.0.1            
## [141] bslib_0.5.1               bit_4.0.5

GOmeth test

hcy_meth meta-analysis group

2023-09-14

Introduction

ORA with GSAMETH - parth of the MissMethyl package

GMEA

Session information