Example2 gene set analysis: The case of Azacytidine

Source: https://github.com/markziemann/SurveyEnrichmentMethods

Intro

Here we are performing an analysis of some gene expression data to demonstrate the difference between ORA and FCS methods and to highlight the differences caused by improper background gene set use.

The dataset being used is SRP038101, and we are comparing cells grown under normal conditions (control) to those grown with the addition of Azacytidine (case).

Data are obtained from http://dee2.io/

suppressPackageStartupMessages({
library("getDEE2") 
library("DESeq2")
library("clusterProfiler")
library("mitch")
library("kableExtra")
library("eulerr")
})

Get expression data

I’m using some RNA-seq data looking at the effect of Azacytidine on AML3 cells.

# Accession of the study analysed here; it is also reused as the plot title
# in the maplot()/make_volcano() calls further down.
name <- "SRP038101"

# Fetch the DEE2 metadata table for human and keep the runs of this study,
# ordered by run accession so that the positional treatment labels line up.
mdat <- getDEE2Metadata("hsapiens")
samplesheet <- mdat[grep(name, mdat$SRP_accession, fixed = TRUE), ]
samplesheet <- samplesheet[order(samplesheet$SRR_accession), ]

# Treatment labels are assigned purely by position (first three runs = 1,
# last three = 0). Assigning a length-6 vector to a data frame column is
# silently recycled when the row count is a multiple of 6, so fail loudly if
# the metadata does not contain exactly the expected number of runs.
trt_labels <- c(1, 1, 1, 0, 0, 0)
stopifnot(nrow(samplesheet) == length(trt_labels))
samplesheet$trt <- as.factor(trt_labels)
s1 <- samplesheet

s1 %>%
  kbl(caption = "sample sheet") %>%
  kable_paper("hover", full_width = FALSE)

sample sheet
	SRR_accession	QC_summary	SRX_accession	SRS_accession	SRP_accession	Sample_name	GEO_series	trt
239157	SRR1171523	PASS	SRX472607	SRS559064	SRP038101	GSM1329859	GSE55123	1
239158	SRR1171524	WARN(3,4)	SRX472608	SRS559066	SRP038101	GSM1329860	GSE55123	1
239159	SRR1171525	WARN(3,4)	SRX472609	SRS559065	SRP038101	GSM1329861	GSE55123	1
239160	SRR1171526	WARN(3,4)	SRX472610	SRS559068	SRP038101	GSM1329862	GSE55123	0
239161	SRR1171527	WARN(3,4)	SRX472611	SRS559067	SRP038101	GSM1329863	GSE55123	0
239162	SRR1171528	WARN(3,4)	SRX472612	SRS559069	SRP038101	GSM1329864	GSE55123	0

# Fetch the DEE2 data for every run listed in the sample sheet, passing in the
# metadata table (mdat) that was already retrieved.
w<-getDEE2("hsapiens",samplesheet$SRR_accession,metadata=mdat,legacy = TRUE)

## For more information about DEE2 QC metrics, visit
##     https://github.com/markziemann/dee2/blob/master/qc/qc_metrics.md

# Run Tx2Gene() on the DEE2 object and keep its Tx2Gene element as the
# count table.
x <- Tx2Gene(w)$Tx2Gene

# Gene table kept for later: the first GeneInfo column plus the accession,
# which is taken from the row names.
gt <- w$GeneInfo[, 1, drop = FALSE]
gt$accession <- rownames(gt)

# Count columns that belong to the runs in the sample sheet.
is_study_run <- colnames(x) %in% samplesheet$SRR_accession
x1 <- x[, is_study_run]

Here we show the number of genes in the annotation set, and the number detected above the detection threshold.

# Keep only genes whose mean count across the samples is at least 10,
# then report the number of genes in the unfiltered table.
mean_count <- rowSums(x1) / ncol(x1)
x1 <- x1[which(mean_count >= 10), ]
nrow(x)

## [1] 39297

# Number of genes remaining after the low-expression filter.
nrow(x1)

## [1] 13926

Now we draw a multidimensional scaling (MDS) plot to show how similar the datasets are to each other. If the control and case datasets cluster separately, then it is likely that there will be many differentially expressed genes with FDR<0.05.

# Classical MDS of the between-sample distances (samples are the columns of
# x1), with points coloured by the treatment factor in the sample sheet.
sample_dist <- dist(t(x1))
mds_xy <- cmdscale(sample_dist)
plot(mds_xy, xlab = "Coordinate 1", ylab = "Coordinate 2",
     pch = 19, col = s1$trt, main = "MDS")

Differential expression

Now run DESeq2 for control vs case.

# Build the DESeq2 dataset from the filtered counts (rounded to whole numbers),
# using the sample sheet as column data and treatment as the only design term.
y <- DESeqDataSetFromMatrix(countData = round(x1), colData = s1, design = ~ trt)

## converting counts to integer mode

# Run the DESeq2 pipeline; the individual steps are reported in the messages
# it prints.
y <- DESeq(y)

## estimating size factors

## estimating dispersions

## gene-wise dispersion estimates

## mean-dispersion relationship

## final dispersion estimates

## fitting model and testing

# Pull out the DESeq2 results as a plain data frame, smallest p-values first.
de <- results(y)
de <- as.data.frame(de[order(de$pvalue), ])

# Keep only the part of each gene ID before the first "." (for example, to
# drop a version suffix).
rownames(de) <- sub("\\..*$", "", rownames(de))

head(de) %>%
  kbl() %>%
  kable_paper("hover", full_width = F)

	baseMean	log2FoldChange	lfcSE	stat
ENSG00000090382	14491.5013	1.686516	0.0460876	36.59368
ENSG00000165949	1288.2858	3.326521	0.1053480	31.57651
ENSG00000275214	911.4085	3.432709	0.1151500	29.81075
ENSG00000115461	615.6738	5.004631	0.1746428	28.65638
ENSG00000111331	2366.8131	2.649803	0.0945138	28.03615
ENSG00000157601	1153.6028	2.820804	0.1016830	27.74116

Now let’s have a look at some of the charts showing differential expression. In particular, an MA plot and volcano plot.

# Draw an MA plot (log2 mean expression against log2 fold change) for one
# contrast.
#
# Arguments:
#   de             data frame of differential expression results, one row per
#                  gene, with columns baseMean, log2FoldChange and padj.
#   contrast_name  title drawn above the plot.
#
# Genes with padj < 0.05 are overplotted in red; rows whose padj is NA are
# never counted as significant. The subtitle reports the number of significant
# up- and down-regulated genes and the total number of rows in `de`.
# Called for its side effect (drawing on the current graphics device).
maplot <- function(de, contrast_name) {
  sig <- subset(de, padj < 0.05)
  n_up <- sum(sig$log2FoldChange > 0, na.rm = TRUE)
  n_dn <- sum(sig$log2FoldChange < 0, na.rm = TRUE)
  n_detected <- nrow(de)
  # paste0 with explicit separators avoids the doubled space that
  # paste(..., "up, ", ...) produced and separates the detected count.
  subheader <- paste0(n_up, " up, ", n_dn, " down, ", n_detected, " detected")
  plot(log2(de$baseMean), de$log2FoldChange,
       xlab = "log2 basemean", ylab = "log2 foldchange",
       pch = 19, cex = 0.5, col = "dark gray",
       main = contrast_name, cex.main = 0.7)
  points(log2(sig$baseMean), sig$log2FoldChange,
         pch = 19, cex = 0.5, col = "red")
  mtext(subheader, cex = 0.7)
}

# Draw a volcano plot (log2 fold change against -log10 adjusted p-value).
#
# Arguments:
#   de    data frame of differential expression results with columns
#         log2FoldChange and padj, one row per gene.
#   name  title drawn above the plot.
#   xlim  x-axis limits; defaults to the previously hard-coded c(-6, 6).
#
# Genes with padj < 0.05 are overplotted in red. The header line reports the
# number of significant genes, how many are up/down, and the total number of
# rows in `de`. Called for its side effect (drawing on the current device).
make_volcano <- function(de, name, xlim = c(-6, 6)) {
  sig <- subset(de, padj < 0.05)
  n_sig <- nrow(sig)
  n_up <- nrow(subset(sig, log2FoldChange > 0))
  n_dn <- nrow(subset(sig, log2FoldChange < 0))
  n_detected <- nrow(de)
  header <- paste(n_sig, "@5%FDR,", n_up, "up", n_dn, "dn", n_detected, "detected")
  # The y axis shows padj, so it is labelled as the adjusted p-value.
  plot(de$log2FoldChange, -log10(de$padj), cex = 0.5, pch = 19, col = "darkgray",
       main = name, xlab = "log2 FC", ylab = "-log10 adjusted p-value",
       xlim = xlim)
  mtext(header)
  grid()
  points(sig$log2FoldChange, -log10(sig$padj), cex = 0.5, pch = 19, col = "red")
}

# MA plot of the DESeq2 results, titled with the study accession held in name.
maplot(de,name)

# Volcano plot of the same results.
make_volcano(de,name)

Gene sets from Reactome

In order to perform gene set analysis, we need some gene sets.

# Download and unpack the Reactome gene sets only if the GMT file is not
# already in the working directory, then load them with gmt_import().
gmt_file <- "ReactomePathways.gmt"
if (!file.exists(gmt_file)) {
  zip_file <- "ReactomePathways.gmt.zip"
  download.file(
    "https://reactome.org/download/current/ReactomePathways.gmt.zip",
    destfile = zip_file
  )
  unzip(zip_file)
}
genesets <- gmt_import(gmt_file)

FCS with Mitch

Mitch uses rank-ANOVA statistics for enrichment detection.

# Import the DESeq2 table into mitch. gt (first GeneInfo column plus accession)
# is supplied as the gene table -- presumably so accessions are translated to
# the identifiers used by the gene sets; confirm against the mitch docs.
m <- mitch_import(de,DEtype = "DEseq2", geneTable = gt)

## The input is a single dataframe; one contrast only. Converting
##         it to a list for you.

## Note: Mean no. genes in input = 13926

## Note: no. genes in output = 12980

## Note: estimated proportion of input genes in output = 0.932

# Run the mitch enrichment calculation against the Reactome gene sets.
mres <- mitch_calc(m,genesets = genesets)

## Note: When prioritising by significance (ie: small
##             p-values), large effect sizes might be missed.

# Gene sets with an adjusted ANOVA p-value below 0.05, split by the sign of
# s.dist; the first column of the enrichment table holds the set name.
enrich_tbl <- mres$enrichment_result
is_sig <- enrich_tbl$p.adjustANOVA < 0.05
m_up <- enrich_tbl[which(is_sig & enrich_tbl$s.dist > 0), 1]
m_dn <- enrich_tbl[which(is_sig & enrich_tbl$s.dist < 0), 1]
message("Number of up-regulated pathways: ", length(m_up))

## Number of up-regulated pathways: 192

# Report how many gene sets were called down-regulated by mitch.
message("Number of down-regulated pathways: ", length(m_dn))

## Number of down-regulated pathways: 317

# Show the first ten rows of the mitch enrichment table.
head(mres$enrichment_result,10)  %>% kbl() %>% kable_paper("hover", full_width = F)

	set	setSize	s.dist
143	Cell Cycle	597	-0.3528903
601	Metabolism of RNA	655	-0.3178554
145	Cell Cycle, Mitotic	481	-0.3573054
144	Cell Cycle Checkpoints	251	-0.4210985
576	M Phase	341	-0.3544053
511	Innate Immune System	768	0.2276855
494	Immune System	1419	0.1688051
641	Mitotic Prometaphase	185	-0.4326001
640	Mitotic Metaphase and Anaphase	222	-0.3950159
637	Mitotic Anaphase	221	-0.3923608

ORA with clusterProfiler

clusterProfiler uses a hypergeometric test. First, I will conduct the analysis separately for up- and down-regulated genes, using the correct background (as intended by the developers).

# Reactome gene sets loaded with read.gmt(); passed to enricher() as TERM2GENE.
genesets2 <- read.gmt("ReactomePathways.gmt")

# Significant (padj < 0.05) up-regulated genes. The two conditions must be
# combined with `&`: the third positional argument of subset() is `select`
# (columns), not a second row filter, so passing `padj < 0.05` there kept
# every gene with a positive fold change regardless of significance.
de_up <- rownames(subset(de, log2FoldChange > 0 & padj < 0.05))
# Translate accessions to the identifiers in the first column of gt.
de_up <- unique(gt[which(rownames(gt) %in% de_up), 1])

# Significant (padj < 0.05) down-regulated genes, same treatment.
de_dn <- rownames(subset(de, log2FoldChange < 0 & padj < 0.05))
de_dn <- unique(gt[which(rownames(gt) %in% de_dn), 1])

# Background: every gene present in the differential expression table.
de_bg <- rownames(de)
de_bg <- unique(gt[which(rownames(gt) %in% de_bg), 1])

# Over-representation tests; keep the row names of results with
# p.adjust < 0.05.
c_up <- as.data.frame(enricher(gene = de_up, universe = de_bg,  maxGSSize = 5000, TERM2GENE = genesets2))
c_up <- rownames(subset(c_up, p.adjust < 0.05))

c_dn <- as.data.frame(enricher(gene = de_dn, universe = de_bg,  maxGSSize = 5000, TERM2GENE = genesets2))
c_dn <- rownames(subset(c_dn, p.adjust < 0.05))

Now performing ORA with clusterProfiler, combining up- and down-regulated genes.

# All genes with padj < 0.05, regardless of direction, translated to the
# identifiers in the first column of gt.
sig_acc <- rownames(de)[which(de$padj < 0.05)]
de_de <- unique(gt[rownames(gt) %in% sig_acc, 1])

# Over-representation test on the combined list; keep the row names of
# results with p.adjust < 0.05.
ora_comb <- enricher(gene = de_de, universe = de_bg, maxGSSize = 5000,
                     TERM2GENE = genesets2)
d_de <- as.data.frame(ora_comb)
d_de <- rownames(subset(d_de, p.adjust < 0.05))

Now performing ORA with clusterProfiler using a whole-genome background list.

# Replace the background with every gene symbol in the DEE2 gene annotation.
# This overwrites de_bg for all enricher() calls from here on.
de_bg <- w$GeneInfo$GeneSymbol

# Up-regulated genes against the whole-genome background.
ora_up_wg <- enricher(gene = de_up, universe = de_bg, maxGSSize = 5000,
                      TERM2GENE = genesets2)
f_up <- as.data.frame(ora_up_wg)
f_up <- rownames(subset(f_up, p.adjust < 0.05))

# Down-regulated genes against the whole-genome background.
ora_dn_wg <- enricher(gene = de_dn, universe = de_bg, maxGSSize = 5000,
                      TERM2GENE = genesets2)
f_dn <- as.data.frame(ora_dn_wg)
f_dn <- rownames(subset(f_dn, p.adjust < 0.05))

Now performing ORA (combining up and down gene lists) with clusterProfiler using a whole-genome background list.

# Combined gene list tested against whatever de_bg currently holds; keep the
# row names of results with p.adjust < 0.05.
ora_comb_wg <- enricher(gene = de_de, universe = de_bg, maxGSSize = 5000,
                        TERM2GENE = genesets2)
e_de <- as.data.frame(ora_comb_wg)
e_de <- rownames(subset(e_de, p.adjust < 0.05))

Venn diagram comparison

The Venn (or Euler to be more correct) diagram is useful to visualise the overlaps between sets.

# Graphics parameters: smaller main title and narrow margins.
par(cex.main = 0.5, mar = c(2, 2, 2, 2))

# Separate up/down ORA against ORA on the combined gene list.
v0 <- list("ORA up" = c_up, "ORA dn" = c_dn, "ORA comb" = d_de)
fit_v0 <- euler(v0)
plot(fit_v0, quantities = TRUE, edges = "gray",
     main = "effect of combining up and down regulated genes")

# mitch (FCS) results against separate up/down ORA.
v1 <- list("FCS up" = m_up, "FCS dn" = m_dn, "ORA up" = c_up, "ORA dn" = c_dn)
fit_v1 <- euler(v1)
plot(fit_v1, quantities = TRUE, edges = "gray", main = "FCS compared to ORA")

# ORA with the original background against ORA with the whole-genome
# background (marked with *).
v2 <- list("ORA up" = c_up, "ORA dn" = c_dn, "ORA* up" = f_up, "ORA* dn" = f_dn)
fit_v2 <- euler(v2)
plot(fit_v2, quantities = TRUE, edges = "gray",
     main = "Effect of inappropriate background* (whole genome)")

# Separate up/down ORA against the two combined-list analyses.
vx <- list("ORA up" = c_up, "ORA dn" = c_dn,
           "ORA comb" = d_de, "ORA* comb" = e_de)
fit_vx <- euler(vx)
plot(fit_vx, quantities = TRUE, edges = "gray",
     main = "combining up and down genes and whole genome bg*")

# All six up/down result sets together.
v3 <- list("ORA up" = c_up, "ORA dn" = c_dn,
           "ORA* up" = f_up, "ORA* dn" = f_dn,
           "FCS up" = m_up, "FCS dn" = m_dn)

# Make sure the output directory exists: png() cannot open the file if
# images/ is missing.
dir.create("images", showWarnings = FALSE, recursive = TRUE)

# Save the FCS vs ORA diagram (v1) as a PNG.
png("images/fcs_ora2.png")
plot(euler(v1),quantities = TRUE, edges = "gray", main="FCS vs ORA")
dev.off()

## png 
##   2

# Save the background comparison diagram (v2) as a PNG.
png("images/orabg2.png")
plot(euler(v2),quantities = TRUE, edges = "gray", main="Effect of inappropriate background* (whole genome)")
dev.off()

## png 
##   2

# Save the combined-list comparison diagram (vx) as a PNG. edges = "gray" is
# set so this file matches the on-screen and PDF versions of the same figure.
png("images/oracomb2.png")
plot(euler(vx),quantities = TRUE, edges = "gray", main="combining up and down genes and whole genome bg*")
dev.off()

## png 
##   2

# Save the FCS vs ORA diagram (v1) as a 4 x 4 inch PDF.
pdf("images/fcs_ora2.pdf",width=4,height=4)
plot(euler(v1),quantities = TRUE, edges = "gray", main="FCS vs ORA")
dev.off()

## png 
##   2

# Save the background comparison diagram (v2) as a 4 x 4 inch PDF.
pdf("images/orabg2.pdf",width=4,height=4)
plot(euler(v2),quantities = TRUE, edges = "gray", main="Effect of inappropriate background* (whole genome)")
dev.off()

## png 
##   2

# Save the combined-list comparison diagram (vx) as a 4 x 4 inch PDF.
pdf("images/oracomb2.pdf",width=4,height=4)
plot(euler(vx),quantities = TRUE, edges = "gray", main="combining up and down genes and whole genome bg*")
dev.off()

## png 
##   2

Jaccard calculation

# Jaccard index of two character vectors treated as sets:
# |intersection| / |union|.
jaccard <- function(a, b) {
  length(intersect(a, b)) / length(union(a, b))
}

# The combined ORA results carry no direction, so comparisons against them
# use the plain pathway names and are computed before the direction prefixes
# are added below.
ora_plain <- c(c_up, c_dn)

# ORA vs ORA combined
dc <- jaccard(d_de, ora_plain)

# ORA vs ORA* combined
ec <- jaccard(e_de, ora_plain)

# Tag every pathway name with its direction so that a pathway called "up" by
# one method and "dn" by another is not counted as agreement.
m_up <- gsub("^","up ",m_up)
m_dn <- gsub("^","dn ",m_dn)
m_de <- union(m_up,m_dn)

c_up <- gsub("^","up ",c_up)
c_dn <- gsub("^","dn ",c_dn)
c_de <- union(c_up,c_dn)

f_up <- gsub("^","up ",f_up)
f_dn <- gsub("^","dn ",f_dn)
f_de <- union(f_up,f_dn)

# FCS vs ORA -- computed on the direction-tagged sets, in the same way as the
# two comparisons below (it was previously computed on untagged names).
cm <- jaccard(c_de, m_de)

# ORA vs ORA*
cf <- jaccard(c_de, f_de)

# FCS vs ORA*
mf <- jaccard(m_de, f_de)

dat <- c("FCS vs ORA"=cm,"ORA vs ORA*"=cf,"FCS vs ORA*"=mf, "ORA vs ORA comb"=dc, "ORA vs ORA* comb"=ec)

# Print the named vector of Jaccard indices.
dat

##       FCS vs ORA      ORA vs ORA*      FCS vs ORA*  ORA vs ORA comb 
##        0.5360624        0.3631040        0.5357143        0.2439863 
## ORA vs ORA* comb 
##        0.3706468

# Bar chart of the five Jaccard indices.
barplot(dat,ylab="jaccard metric")

# Save the named vector of Jaccard indices to an RDS file.
saveRDS(dat,file = "ex2dat.rds")

Session information

# Record the R version, platform and package versions used for this run.
sessionInfo()

## R version 4.1.2 (2021-11-01)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.3 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/liblapack.so.3
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] rmdformats_1.0.3            beeswarm_0.4.0             
##  [3] eulerr_6.1.1                mitch_1.5.1                
##  [5] clusterProfiler_4.0.5       DESeq2_1.32.0              
##  [7] SummarizedExperiment_1.22.0 Biobase_2.52.0             
##  [9] MatrixGenerics_1.4.3        matrixStats_0.61.0         
## [11] GenomicRanges_1.44.0        GenomeInfoDb_1.28.4        
## [13] IRanges_2.26.0              S4Vectors_0.30.0           
## [15] BiocGenerics_0.38.0         getDEE2_1.2.0              
## [17] anytime_0.3.9               kableExtra_1.3.4           
## [19] XML_3.99-0.8                reutils_0.2.3              
## [21] vioplot_0.3.7               zoo_1.8-9                  
## [23] sm_2.2-5.7                  wordcloud_2.6              
## [25] RColorBrewer_1.1-2          rsvg_2.1.2                 
## [27] DiagrammeRsvg_0.1           DiagrammeR_1.0.6.1         
## [29] forcats_0.5.1               stringr_1.4.0              
## [31] dplyr_1.0.7                 purrr_0.3.4                
## [33] readr_2.0.2                 tidyr_1.1.4                
## [35] tibble_3.1.5                ggplot2_3.3.5              
## [37] tidyverse_1.3.1            
## 
## loaded via a namespace (and not attached):
##   [1] utf8_1.2.2             tidyselect_1.1.1       RSQLite_2.2.8         
##   [4] AnnotationDbi_1.54.1   htmlwidgets_1.5.4      grid_4.1.2            
##   [7] BiocParallel_1.26.2    scatterpie_0.1.7       munsell_0.5.0         
##  [10] withr_2.4.2            colorspace_2.0-2       GOSemSim_2.18.1       
##  [13] highr_0.9              knitr_1.36             rstudioapi_0.13       
##  [16] DOSE_3.18.3            GenomeInfoDbData_1.2.6 polyclip_1.10-0       
##  [19] bit64_4.0.5            farver_2.1.0           downloader_0.4        
##  [22] vctrs_0.3.8            treeio_1.16.2          generics_0.1.0        
##  [25] xfun_0.26              R6_2.5.1               graphlayouts_0.7.2    
##  [28] locfit_1.5-9.4         bitops_1.0-7           cachem_1.0.6          
##  [31] reshape_0.8.8          fgsea_1.18.0           gridGraphics_0.5-1    
##  [34] DelayedArray_0.18.0    assertthat_0.2.1       promises_1.2.0.1      
##  [37] scales_1.1.1           ggraph_2.0.5           enrichplot_1.12.3     
##  [40] gtable_0.3.0           tidygraph_1.2.0        rlang_0.4.11          
##  [43] genefilter_1.74.0      systemfonts_1.0.2      splines_4.1.2         
##  [46] lazyeval_0.2.2         htm2txt_2.1.1          broom_0.7.9           
##  [49] yaml_2.2.1             reshape2_1.4.4         modelr_0.1.8          
##  [52] backports_1.2.1        httpuv_1.6.3           qvalue_2.24.0         
##  [55] tools_4.1.2            bookdown_0.24          ggplotify_0.1.0       
##  [58] gplots_3.1.1           ellipsis_0.3.2         jquerylib_0.1.4       
##  [61] Rcpp_1.0.7             plyr_1.8.6             visNetwork_2.1.0      
##  [64] zlibbioc_1.38.0        RCurl_1.98-1.5         viridis_0.6.1         
##  [67] cowplot_1.1.1          haven_2.4.3            ggrepel_0.9.1         
##  [70] fs_1.5.0               magrittr_2.0.1         data.table_1.14.2     
##  [73] DO.db_2.9              reprex_2.0.1           hms_1.1.1             
##  [76] patchwork_1.1.1        mime_0.12              evaluate_0.14         
##  [79] xtable_1.8-4           readxl_1.3.1           gridExtra_2.3         
##  [82] compiler_4.1.2         KernSmooth_2.23-20     V8_3.6.0              
##  [85] crayon_1.4.1           shadowtext_0.0.9       htmltools_0.5.2       
##  [88] ggfun_0.0.4            later_1.3.0            tzdb_0.1.2            
##  [91] geneplotter_1.70.0     aplot_0.1.1            lubridate_1.8.0       
##  [94] DBI_1.1.1              tweenr_1.0.2           dbplyr_2.1.1          
##  [97] MASS_7.3-54            Matrix_1.3-4           cli_3.0.1             
## [100] igraph_1.2.6           pkgconfig_2.0.3        xml2_1.3.2            
## [103] ggtree_3.0.4           svglite_2.0.0          annotate_1.70.0       
## [106] bslib_0.3.1            webshot_0.5.2          XVector_0.32.0        
## [109] rvest_1.0.1            yulab.utils_0.0.4      digest_0.6.28         
## [112] Biostrings_2.60.2      polylabelr_0.2.0       rmarkdown_2.11        
## [115] cellranger_1.1.0       fastmatch_1.1-3        tidytree_0.3.6        
## [118] curl_4.3.2             gtools_3.9.2           shiny_1.7.1           
## [121] lifecycle_1.0.1        nlme_3.1-153           jsonlite_1.7.2        
## [124] echarts4r_0.4.2        viridisLite_0.4.0      fansi_0.5.0           
## [127] pillar_1.6.3           lattice_0.20-45        GGally_2.1.2          
## [130] KEGGREST_1.32.0        fastmap_1.1.0          httr_1.4.2            
## [133] survival_3.2-13        GO.db_3.13.0           glue_1.4.2            
## [136] png_0.1-7              bit_4.0.4              ggforce_0.3.3         
## [139] stringi_1.7.5          sass_0.4.0             blob_1.2.2            
## [142] caTools_1.18.2         memoise_2.0.0          ape_5.5