Functional class sorting is widely used for pathway analysis using tools like GSEA, yet there is no consensus in how this can be conducted for Illumina Infinium methylation array data.
Here we propose a simple approach which involves the following:
Limma test on probes.
For each gene, calculate the median t-statistic from step 1.
Use this median t-stat in Camera pre-ranked test for gene sets.
In this example, I’m using matched infinium EPIC 850k data from (n=37) normal and lung cancer samples (GSE158422). The data was previously preprocessed and normalised using the minfi package (see the folder called “misc”).
Here the gene sets are obtained from Reactome.
This analysis was run on a 8C/16T computer with 64GB RAM running at 3.8 GHz. This workflow used 34 GB RAM and took RAM usage can be moderated by reducing the parallel core count.
suppressPackageStartupMessages({
library("limma")
library("IlluminaHumanMethylation450kmanifest")
library("IlluminaHumanMethylationEPICanno.ilm10b4.hg19")
library("tictoc")
library("kableExtra")
library("beeswarm")
})
annotations
probe sets
gene sets
design matrix
mval matrix
anno <- getAnnotation(IlluminaHumanMethylationEPICanno.ilm10b4.hg19)
myann <- data.frame(anno[,c("UCSC_RefGene_Name","Regulatory_Feature_Group","Islands_Name","Relation_to_Island")])
gp <- myann[,"UCSC_RefGene_Name",drop=FALSE]
gp2 <- strsplit(gp$UCSC_RefGene_Name,";")
names(gp2) <- rownames(gp)
sets <- split(rep(names(gp2), lengths(gp2)), unlist(gp2))
summary(unlist(lapply(sets,length)))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 9.00 24.00 49.68 55.00 6778.00
gmt_import <- function(gmtfile) {
genesetLines <- strsplit(readLines(gmtfile), "\t")
genesets <- lapply(genesetLines, utils::tail, -2)
names(genesets) <- unlist(lapply(genesetLines, head, 1))
attributes(genesets)$originfile <- gmtfile
if( any(duplicated(names(genesets))) ) {
warning("Duplicated gene sets names detected")
}
genesets
}
genesets <- gmt_import("https://ziemann-lab.net/public/gmea_prototype/ReactomePathways.gmt")
if (!file.exists("GSE158422_design.rds")) {
download.file("https://ziemann-lab.net/public/gmea_prototype/GSE158422_design.rds", "GSE158422_design.rds")
}
design <- readRDS("GSE158422_design.rds")
if (!file.exists("GSE158422_design.rds")) {
download.file("https://ziemann-lab.net/public/gmea_prototype/GSE158422_mx.rds","GSE158422_mx.rds")
}
mval <- readRDS("GSE158422_mx.rds")
Performs limma on probes and then reports the median limma t-statistic for each gene.
It also applies a one-sample t-test for probes belonging to each gene, but that isn’t used in downstream gene set analysis.
pmea <- function(mval,design,sets,cores=2) {
fit <- lmFit(mval, design)
fit <- eBayes(fit)
top <- topTable(fit,coef=ncol(design),num=Inf, sort.by = "P")
l <- mclapply(seq(1,length(sets)), function(i) {
g <- names(sets[i])
tstats <- top[rownames(top) %in% sets[[i]],"t"]
myn <- length(tstats)
mymean <- mean(tstats)
mymedian <- median(tstats)
if ( length(tstats) < 2 ) {
pval=1
} else {
wtselfcont <- t.test(tstats)
pval=wtselfcont$p.value
}
c("gene"=g,"nprobes"=myn,"mean"=mymean,"median"=mymedian,
"P.Value"=pval)
} , mc.cores=cores)
df <- do.call(rbind, l)
rownames(df) <- df[,1]
df <- df[,-1]
tmp <- apply(df,2,as.numeric)
rownames(tmp) <- rownames(df)
df <- as.data.frame(tmp)
df$sig <- -log10(df[,4])
df <- df[order(-df$sig),]
df$FDR <- p.adjust(df$P.Value)
out <- list("df"=df,"toptable"=top)
return(out)
}
tic()
res <- pmea(mval,design,sets,cores=8)
## Coefficients not estimable: patientpat32
## Warning: Partial NA coefficients for 839473 probe(s)
## Warning in .ebayes(fit = fit, proportion = proportion, stdev.coef.lim =
## stdev.coef.lim, : Estimation of var.prior failed - set to default value
resdf <- res$df
toc()
## 193.76 sec elapsed
head(resdf,20) %>%
kbl(caption="GMEA gene level analysis top results (p-value)") %>%
kable_styling(full_width=FALSE)
nprobes | mean | median | P.Value | sig | FDR | |
---|---|---|---|---|---|---|
PTPRN2 | 1474 | -3.980916 | -4.635077 | 0 | Inf | 0 |
TNXB | 529 | -3.386961 | -3.828508 | 0 | 114.81914 | 0 |
CDH4 | 382 | -4.086578 | -4.667115 | 0 | 100.29268 | 0 |
DIP2C | 607 | -2.607435 | -3.212233 | 0 | 86.80420 | 0 |
MAD1L1 | 817 | -2.139007 | -2.422410 | 0 | 81.71068 | 0 |
PCDHGA1 | 439 | 3.383704 | 4.175911 | 0 | 78.77093 | 0 |
PCDHGA2 | 422 | 3.415474 | 4.194368 | 0 | 76.76443 | 0 |
SHANK2 | 491 | -3.047306 | -3.885318 | 0 | 75.80757 | 0 |
SNORD115-15 | 81 | -6.324991 | -6.331847 | 0 | 72.10248 | 0 |
SNORD115-21 | 81 | -6.324991 | -6.331847 | 0 | 72.10248 | 0 |
PCDHGA3 | 399 | 3.416378 | 4.238660 | 0 | 71.40641 | 0 |
LMF1 | 262 | -3.470435 | -3.920412 | 0 | 69.95681 | 0 |
PCDHGB1 | 380 | 3.420280 | 4.223157 | 0 | 68.20318 | 0 |
PCDHGA4 | 362 | 3.483006 | 4.294889 | 0 | 66.60470 | 0 |
MYT1L | 245 | -4.072473 | -4.610405 | 0 | 63.51578 | 0 |
PCDHGB2 | 343 | 3.480576 | 4.321741 | 0 | 61.73758 | 0 |
RASA3 | 361 | -2.733695 | -2.943332 | 0 | 60.29463 | 0 |
ADARB2 | 475 | -2.609726 | -3.286796 | 0 | 60.02378 | 0 |
PCDHGA5 | 326 | 3.478061 | 4.377746 | 0 | 57.86618 | 0 |
CACNA1C | 290 | -3.738489 | -4.761867 | 0 | 56.51797 | 0 |
Now perform gene set analysis using CameraPR with reactome gene sets.
tic()
stat <- resdf$median
names(stat) <- rownames(resdf)
stat[is.na(stat)] <- 0
cres <- cameraPR(statistic=stat, index=genesets, use.ranks = FALSE, inter.gene.cor=0.01, sort = FALSE)
cres$ES <- unlist(lapply(genesets, function(gset) {median(stat[which(names(stat) %in% gset )]) }))
cres <- cres[order(cres$PValue),]
toc()
## 3.901 sec elapsed
top <- head(cres,10)
top %>%
kbl(caption="CameraPR gene set results") %>%
kable_styling(full_width=FALSE)
NGenes | Direction | PValue | FDR | ES | |
---|---|---|---|---|---|
Expression and translocation of olfactory receptors | 373 | Down | 0 | 0.0e+00 | -5.3112149 |
Olfactory Signaling Pathway | 381 | Down | 0 | 0.0e+00 | -5.2834835 |
Sensory Perception | 597 | Down | 0 | 0.0e+00 | -4.2112205 |
Antimicrobial peptides | 85 | Down | 0 | 0.0e+00 | -3.8047439 |
Defensins | 41 | Down | 0 | 0.0e+00 | -4.4543846 |
Beta defensins | 33 | Down | 0 | 0.0e+00 | -4.6816928 |
Regulation of gene expression in early pancreatic precursor cells | 8 | Up | 0 | 2.0e-07 | 4.1641140 |
Activation of HOX genes during differentiation | 63 | Up | 0 | 1.2e-06 | -0.1105348 |
Activation of anterior HOX genes in hindbrain development during early embryogenesis | 63 | Up | 0 | 1.2e-06 | -0.1105348 |
Keratinization | 216 | Down | 0 | 3.7e-06 | -2.9221755 |
top <- top[order(top$ES),]
topsets <- genesets[names(genesets) %in% rownames(top)]
scores <- lapply(topsets, function(gset) { stat[which(names(stat) %in% gset )] })
scores <- scores[order(unlist(lapply(scores,median)))]
par(mar=c(5,25,3,1))
boxplot(scores,horizontal=TRUE,las=1,cex=0,cex.axis=0.9,xlab="Enrichment score")
mtext("Top pathways in lung cancer")
beeswarm(scores,add=TRUE,horiz=TRUE,pch=19,cex=0.3)
grid()
sessionInfo()
## R version 4.2.1 (2022-06-23)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.1 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] kableExtra_1.3.4
## [2] tictoc_1.1
## [3] IlluminaHumanMethylationEPICanno.ilm10b4.hg19_0.6.0
## [4] IlluminaHumanMethylation450kmanifest_0.4.0
## [5] minfi_1.42.0
## [6] bumphunter_1.38.0
## [7] locfit_1.5-9.6
## [8] iterators_1.0.14
## [9] foreach_1.5.2
## [10] Biostrings_2.64.1
## [11] XVector_0.36.0
## [12] SummarizedExperiment_1.26.1
## [13] Biobase_2.56.0
## [14] MatrixGenerics_1.8.1
## [15] matrixStats_0.62.0
## [16] GenomicRanges_1.48.0
## [17] GenomeInfoDb_1.32.4
## [18] IRanges_2.30.1
## [19] S4Vectors_0.34.0
## [20] BiocGenerics_0.42.0
## [21] limma_3.52.4
## [22] beeswarm_0.4.0
##
## loaded via a namespace (and not attached):
## [1] systemfonts_1.0.4 BiocFileCache_2.4.0
## [3] plyr_1.8.7 splines_4.2.1
## [5] BiocParallel_1.30.4 digest_0.6.30
## [7] htmltools_0.5.3 fansi_1.0.3
## [9] magrittr_2.0.3 memoise_2.0.1
## [11] tzdb_0.3.0 readr_2.1.3
## [13] annotate_1.74.0 svglite_2.1.0
## [15] askpass_1.1 siggenes_1.70.0
## [17] prettyunits_1.1.1 colorspace_2.0-3
## [19] blob_1.2.3 rvest_1.0.3
## [21] rappdirs_0.3.3 xfun_0.34
## [23] dplyr_1.0.10 crayon_1.5.2
## [25] RCurl_1.98-1.9 jsonlite_1.8.3
## [27] genefilter_1.78.0 GEOquery_2.64.2
## [29] survival_3.4-0 glue_1.6.2
## [31] zlibbioc_1.42.0 webshot_0.5.4
## [33] DelayedArray_0.22.0 Rhdf5lib_1.18.2
## [35] HDF5Array_1.24.2 scales_1.2.1
## [37] DBI_1.1.3 rngtools_1.5.2
## [39] Rcpp_1.0.9 viridisLite_0.4.1
## [41] xtable_1.8-4 progress_1.2.2
## [43] bit_4.0.4 mclust_5.4.10
## [45] preprocessCore_1.58.0 httr_1.4.4
## [47] RColorBrewer_1.1-3 ellipsis_0.3.2
## [49] pkgconfig_2.0.3 reshape_0.8.9
## [51] XML_3.99-0.11 sass_0.4.2
## [53] dbplyr_2.2.1 utf8_1.2.2
## [55] tidyselect_1.2.0 rlang_1.0.6
## [57] AnnotationDbi_1.58.0 munsell_0.5.0
## [59] tools_4.2.1 cachem_1.0.6
## [61] cli_3.4.1 generics_0.1.3
## [63] RSQLite_2.2.18 evaluate_0.17
## [65] stringr_1.4.1 fastmap_1.1.0
## [67] yaml_2.3.6 knitr_1.40
## [69] bit64_4.0.5 beanplot_1.3.1
## [71] scrime_1.3.5 purrr_0.3.5
## [73] KEGGREST_1.36.3 nlme_3.1-160
## [75] doRNG_1.8.2 sparseMatrixStats_1.8.0
## [77] nor1mix_1.3-0 xml2_1.3.3
## [79] biomaRt_2.52.0 compiler_4.2.1
## [81] rstudioapi_0.14 filelock_1.0.2
## [83] curl_4.3.3 png_0.1-7
## [85] tibble_3.1.8 bslib_0.4.0
## [87] stringi_1.7.8 highr_0.9
## [89] GenomicFeatures_1.48.4 lattice_0.20-45
## [91] Matrix_1.5-1 multtest_2.52.0
## [93] vctrs_0.5.0 pillar_1.8.1
## [95] lifecycle_1.0.3 rhdf5filters_1.8.0
## [97] jquerylib_0.1.4 data.table_1.14.4
## [99] bitops_1.0-7 rtracklayer_1.56.1
## [101] R6_2.5.1 BiocIO_1.6.0
## [103] codetools_0.2-18 MASS_7.3-58.1
## [105] assertthat_0.2.1 rhdf5_2.40.0
## [107] openssl_2.0.4 rjson_0.2.21
## [109] GenomicAlignments_1.32.1 Rsamtools_2.12.0
## [111] GenomeInfoDbData_1.2.8 hms_1.1.2
## [113] quadprog_1.5-8 grid_4.2.1
## [115] tidyr_1.2.1 base64_2.0.1
## [117] rmarkdown_2.17 DelayedMatrixStats_1.18.2
## [119] illuminaio_0.38.0 restfulr_0.0.15