GSE158422: DNA methylation in Lung squamous cell carcinoma - gene methylation enrichment analysis

Introduction

Functional class sorting is widely used for pathway analysis using tools like GSEA, yet there is no consensus in how this can be conducted for Illumina Infinium methylation array data.

Here we propose a simple approach which involves the following:

Limma test on probes.
For each gene, calculate the median t-statistic from step 1.
Use this median t-stat in Camera pre-ranked test for gene sets.

In this example, I’m using matched infinium EPIC 850k data from (n=37) normal and lung cancer samples (GSE158422). The data was previously preprocessed and normalised using the minfi package (see the folder called “misc”).

Here the gene sets are obtained from Reactome.

Requirements

This analysis was run on a 8C/16T computer with 64GB RAM running at 3.8 GHz. This workflow used 34 GB RAM and took RAM usage can be moderated by reducing the parallel core count.

suppressPackageStartupMessages({
  library("limma")
  library("IlluminaHumanMethylation450kmanifest")
  library("IlluminaHumanMethylationEPICanno.ilm10b4.hg19")
  library("tictoc")
  library("kableExtra")
  library("beeswarm")
})

Load data

annotations
probe sets
gene sets
design matrix
mval matrix

anno <- getAnnotation(IlluminaHumanMethylationEPICanno.ilm10b4.hg19)
myann <- data.frame(anno[,c("UCSC_RefGene_Name","Regulatory_Feature_Group","Islands_Name","Relation_to_Island")])

gp <- myann[,"UCSC_RefGene_Name",drop=FALSE]
gp2 <- strsplit(gp$UCSC_RefGene_Name,";")
names(gp2) <- rownames(gp)
sets <- split(rep(names(gp2), lengths(gp2)), unlist(gp2))

summary(unlist(lapply(sets,length)))

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    9.00   24.00   49.68   55.00 6778.00

gmt_import <- function(gmtfile) {
    genesetLines <- strsplit(readLines(gmtfile), "\t")
    genesets <- lapply(genesetLines, utils::tail, -2)
    names(genesets) <- unlist(lapply(genesetLines, head, 1))
    attributes(genesets)$originfile <- gmtfile
    if( any(duplicated(names(genesets))) ) {
        warning("Duplicated gene sets names detected")
    }
    genesets
}

genesets <- gmt_import("https://ziemann-lab.net/public/gmea_prototype/ReactomePathways.gmt")

if (!file.exists("GSE158422_design.rds")) {
  download.file("https://ziemann-lab.net/public/gmea_prototype/GSE158422_design.rds", "GSE158422_design.rds")
}
design <- readRDS("GSE158422_design.rds")

if (!file.exists("GSE158422_design.rds")) {
 download.file("https://ziemann-lab.net/public/gmea_prototype/GSE158422_mx.rds","GSE158422_mx.rds")
}
mval <- readRDS("GSE158422_mx.rds")

Gene level analysis

Performs limma on probes and then reports the median limma t-statistic for each gene.

It also applies a one-sample t-test for probes belonging to each gene, but that isn’t used in downstream gene set analysis.

pmea <- function(mval,design,sets,cores=2) {
  fit <- lmFit(mval, design)
  fit <- eBayes(fit)
  top <- topTable(fit,coef=ncol(design),num=Inf, sort.by = "P")
  l <- mclapply(seq(1,length(sets)), function(i) {
    g <- names(sets[i])
    tstats <- top[rownames(top) %in% sets[[i]],"t"]
    myn <- length(tstats)
    mymean <- mean(tstats)
    mymedian <- median(tstats)
    if ( length(tstats) < 2 ) {
      pval=1
    } else {
      wtselfcont <- t.test(tstats)
      pval=wtselfcont$p.value
    }
    c("gene"=g,"nprobes"=myn,"mean"=mymean,"median"=mymedian,
      "P.Value"=pval)
  } , mc.cores=cores)
  df <- do.call(rbind, l)
  rownames(df) <- df[,1]
  df <- df[,-1]
  tmp <- apply(df,2,as.numeric)
  rownames(tmp) <- rownames(df)
  df <- as.data.frame(tmp)
  df$sig <- -log10(df[,4])
  df <- df[order(-df$sig),]
  df$FDR <- p.adjust(df$P.Value)
  out <- list("df"=df,"toptable"=top)
  return(out)
}

tic()
res <- pmea(mval,design,sets,cores=8)

## Coefficients not estimable: patientpat32

## Warning: Partial NA coefficients for 839473 probe(s)

## Warning in .ebayes(fit = fit, proportion = proportion, stdev.coef.lim =
## stdev.coef.lim, : Estimation of var.prior failed - set to default value

resdf <- res$df
toc()

## 193.76 sec elapsed

head(resdf,20) %>%
  kbl(caption="GMEA gene level analysis top results (p-value)") %>%
  kable_styling(full_width=FALSE)

GMEA gene level analysis top results (p-value)
	nprobes	mean	median	sig
PTPRN2	1474	-3.980916	-4.635077	Inf
TNXB	529	-3.386961	-3.828508	114.81914
CDH4	382	-4.086578	-4.667115	100.29268
DIP2C	607	-2.607435	-3.212233	86.80420
MAD1L1	817	-2.139007	-2.422410	81.71068
PCDHGA1	439	3.383704	4.175911	78.77093
PCDHGA2	422	3.415474	4.194368	76.76443
SHANK2	491	-3.047306	-3.885318	75.80757
SNORD115-15	81	-6.324991	-6.331847	72.10248
SNORD115-21	81	-6.324991	-6.331847	72.10248
PCDHGA3	399	3.416378	4.238660	71.40641
LMF1	262	-3.470435	-3.920412	69.95681
PCDHGB1	380	3.420280	4.223157	68.20318
PCDHGA4	362	3.483006	4.294889	66.60470
MYT1L	245	-4.072473	-4.610405	63.51578
PCDHGB2	343	3.480576	4.321741	61.73758
RASA3	361	-2.733695	-2.943332	60.29463
ADARB2	475	-2.609726	-3.286796	60.02378
PCDHGA5	326	3.478061	4.377746	57.86618
CACNA1C	290	-3.738489	-4.761867	56.51797

Gene set analysis

Now perform gene set analysis using CameraPR with reactome gene sets.

tic()
stat <- resdf$median
names(stat) <- rownames(resdf)
stat[is.na(stat)] <- 0
cres <- cameraPR(statistic=stat, index=genesets, use.ranks = FALSE, inter.gene.cor=0.01, sort = FALSE)
cres$ES <- unlist(lapply(genesets, function(gset) {median(stat[which(names(stat) %in% gset )]) }))
cres <- cres[order(cres$PValue),]
toc()

## 3.901 sec elapsed

top <- head(cres,10)

top %>%
  kbl(caption="CameraPR gene set results") %>%
  kable_styling(full_width=FALSE)

CameraPR gene set results
	NGenes	Direction	FDR	ES
Expression and translocation of olfactory receptors	373	Down	0.0e+00	-5.3112149
Olfactory Signaling Pathway	381	Down	0.0e+00	-5.2834835
Sensory Perception	597	Down	0.0e+00	-4.2112205
Antimicrobial peptides	85	Down	0.0e+00	-3.8047439
Defensins	41	Down	0.0e+00	-4.4543846
Beta defensins	33	Down	0.0e+00	-4.6816928
Regulation of gene expression in early pancreatic precursor cells	8	Up	2.0e-07	4.1641140
Activation of HOX genes during differentiation	63	Up	1.2e-06	-0.1105348
Activation of anterior HOX genes in hindbrain development during early embryogenesis	63	Up	1.2e-06	-0.1105348
Keratinization	216	Down	3.7e-06	-2.9221755

top <- top[order(top$ES),]
topsets <- genesets[names(genesets) %in% rownames(top)]

scores <- lapply(topsets, function(gset) { stat[which(names(stat) %in% gset )] })
scores <- scores[order(unlist(lapply(scores,median)))]

par(mar=c(5,25,3,1))
boxplot(scores,horizontal=TRUE,las=1,cex=0,cex.axis=0.9,xlab="Enrichment score")
mtext("Top pathways in lung cancer")
beeswarm(scores,add=TRUE,horiz=TRUE,pch=19,cex=0.3)
grid()

Session information

sessionInfo()

## R version 4.2.1 (2022-06-23)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.1 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] kableExtra_1.3.4                                   
##  [2] tictoc_1.1                                         
##  [3] IlluminaHumanMethylationEPICanno.ilm10b4.hg19_0.6.0
##  [4] IlluminaHumanMethylation450kmanifest_0.4.0         
##  [5] minfi_1.42.0                                       
##  [6] bumphunter_1.38.0                                  
##  [7] locfit_1.5-9.6                                     
##  [8] iterators_1.0.14                                   
##  [9] foreach_1.5.2                                      
## [10] Biostrings_2.64.1                                  
## [11] XVector_0.36.0                                     
## [12] SummarizedExperiment_1.26.1                        
## [13] Biobase_2.56.0                                     
## [14] MatrixGenerics_1.8.1                               
## [15] matrixStats_0.62.0                                 
## [16] GenomicRanges_1.48.0                               
## [17] GenomeInfoDb_1.32.4                                
## [18] IRanges_2.30.1                                     
## [19] S4Vectors_0.34.0                                   
## [20] BiocGenerics_0.42.0                                
## [21] limma_3.52.4                                       
## [22] beeswarm_0.4.0                                     
## 
## loaded via a namespace (and not attached):
##   [1] systemfonts_1.0.4         BiocFileCache_2.4.0      
##   [3] plyr_1.8.7                splines_4.2.1            
##   [5] BiocParallel_1.30.4       digest_0.6.30            
##   [7] htmltools_0.5.3           fansi_1.0.3              
##   [9] magrittr_2.0.3            memoise_2.0.1            
##  [11] tzdb_0.3.0                readr_2.1.3              
##  [13] annotate_1.74.0           svglite_2.1.0            
##  [15] askpass_1.1               siggenes_1.70.0          
##  [17] prettyunits_1.1.1         colorspace_2.0-3         
##  [19] blob_1.2.3                rvest_1.0.3              
##  [21] rappdirs_0.3.3            xfun_0.34                
##  [23] dplyr_1.0.10              crayon_1.5.2             
##  [25] RCurl_1.98-1.9            jsonlite_1.8.3           
##  [27] genefilter_1.78.0         GEOquery_2.64.2          
##  [29] survival_3.4-0            glue_1.6.2               
##  [31] zlibbioc_1.42.0           webshot_0.5.4            
##  [33] DelayedArray_0.22.0       Rhdf5lib_1.18.2          
##  [35] HDF5Array_1.24.2          scales_1.2.1             
##  [37] DBI_1.1.3                 rngtools_1.5.2           
##  [39] Rcpp_1.0.9                viridisLite_0.4.1        
##  [41] xtable_1.8-4              progress_1.2.2           
##  [43] bit_4.0.4                 mclust_5.4.10            
##  [45] preprocessCore_1.58.0     httr_1.4.4               
##  [47] RColorBrewer_1.1-3        ellipsis_0.3.2           
##  [49] pkgconfig_2.0.3           reshape_0.8.9            
##  [51] XML_3.99-0.11             sass_0.4.2               
##  [53] dbplyr_2.2.1              utf8_1.2.2               
##  [55] tidyselect_1.2.0          rlang_1.0.6              
##  [57] AnnotationDbi_1.58.0      munsell_0.5.0            
##  [59] tools_4.2.1               cachem_1.0.6             
##  [61] cli_3.4.1                 generics_0.1.3           
##  [63] RSQLite_2.2.18            evaluate_0.17            
##  [65] stringr_1.4.1             fastmap_1.1.0            
##  [67] yaml_2.3.6                knitr_1.40               
##  [69] bit64_4.0.5               beanplot_1.3.1           
##  [71] scrime_1.3.5              purrr_0.3.5              
##  [73] KEGGREST_1.36.3           nlme_3.1-160             
##  [75] doRNG_1.8.2               sparseMatrixStats_1.8.0  
##  [77] nor1mix_1.3-0             xml2_1.3.3               
##  [79] biomaRt_2.52.0            compiler_4.2.1           
##  [81] rstudioapi_0.14           filelock_1.0.2           
##  [83] curl_4.3.3                png_0.1-7                
##  [85] tibble_3.1.8              bslib_0.4.0              
##  [87] stringi_1.7.8             highr_0.9                
##  [89] GenomicFeatures_1.48.4    lattice_0.20-45          
##  [91] Matrix_1.5-1              multtest_2.52.0          
##  [93] vctrs_0.5.0               pillar_1.8.1             
##  [95] lifecycle_1.0.3           rhdf5filters_1.8.0       
##  [97] jquerylib_0.1.4           data.table_1.14.4        
##  [99] bitops_1.0-7              rtracklayer_1.56.1       
## [101] R6_2.5.1                  BiocIO_1.6.0             
## [103] codetools_0.2-18          MASS_7.3-58.1            
## [105] assertthat_0.2.1          rhdf5_2.40.0             
## [107] openssl_2.0.4             rjson_0.2.21             
## [109] GenomicAlignments_1.32.1  Rsamtools_2.12.0         
## [111] GenomeInfoDbData_1.2.8    hms_1.1.2                
## [113] quadprog_1.5-8            grid_4.2.1               
## [115] tidyr_1.2.1               base64_2.0.1             
## [117] rmarkdown_2.17            DelayedMatrixStats_1.18.2
## [119] illuminaio_0.38.0         restfulr_0.0.15