Example gene set analysis: The case of HDAC inhibitor SAHA

Source: https://github.com/markziemann/SurveyEnrichmentMethods

Intro

Here we are performing an analysis of some gene expression data to demonstrate the difference between ORA and FCS methods and to highlight the differences caused by improper background gene set use.

The dataset being used is SRP037718 and we are comparing the cells grown in normal condition (control) to those grown with addition of SAHA (case).

Data are obtained from http://dee2.io/

suppressPackageStartupMessages({
library("getDEE2") 
library("DESeq2")
library("clusterProfiler")
library("mitch")
library("kableExtra")
library("eulerr")
})

Get expression data

I’m using some RNA-seq data looking at the effect of SAHA on HAEC cells.

# Study accession; also passed to the plotting functions as the plot title.
name <- "SRP037718"
mdat <- getDEE2Metadata("hsapiens")

# Select the runs of this study by exact accession match. Reusing `name`
# keeps the accession defined in one place, and %in% (unlike a grep pattern)
# cannot pick up a longer accession that merely contains this one.
samplesheet <- mdat[mdat$SRP_accession %in% name, ]
samplesheet <- samplesheet[order(samplesheet$SRR_accession), ]

# The treatment labels below are hard-coded for six runs in SRR order, so
# stop early if the metadata does not return exactly six runs.
stopifnot(nrow(samplesheet) == 6)
samplesheet$trt <- as.factor(c(1, 1, 1, 0, 0, 0))
s1 <- samplesheet

s1 %>% kbl(caption = "sample sheet") %>% kable_paper("hover", full_width = FALSE)

sample sheet
	SRR_accession	QC_summary	SRX_accession	SRS_accession	SRP_accession	Sample_name	GEO_series	trt
238589	SRR1168225	PASS	SRX469930	SRS557162	SRP037718	GSM1326469	GSE37378	1
238590	SRR1168226	PASS	SRX469931	SRS557163	SRP037718	GSM1326470	GSE37378	1
238591	SRR1168227	PASS	SRX469932	SRS557164	SRP037718	GSM1326471	GSE37378	1
238592	SRR1168228	PASS	SRX469933	SRS557165	SRP037718	GSM1326472	GSE37378	0
238593	SRR1168229	PASS	SRX469934	SRS557166	SRP037718	GSM1326473	GSE37378	0
238594	SRR1168230	PASS	SRX469935	SRS557167	SRP037718	GSM1326474	GSE37378	0

# Fetch the DEE2 data for the runs listed in the sample sheet, passing the
# metadata table obtained earlier through `metadata`.
w<-getDEE2("hsapiens",s1$SRR_accession,metadata=mdat,legacy = TRUE)

## For more information about DEE2 QC metrics, visit
##     https://github.com/markziemann/dee2/blob/master/qc/qc_metrics.md

# Counts table: the Tx2Gene element of the Tx2Gene() result
x <- Tx2Gene(w)$Tx2Gene

# Save the gene table for later: first GeneInfo column plus the row names
# copied into an explicit accession column
gt <- w$GeneInfo[, 1, drop = FALSE]
gt[["accession"]] <- rownames(gt)

# Counts restricted to the runs listed in the sample sheet
x1 <- x[, colnames(x) %in% samplesheet$SRR_accession]

Here we show the number of genes in the annotation set, and the number detected above the detection threshold.

# filter out lowly expressed genes: keep rows whose mean count across the
# selected runs is at least 10
x1<-x1[which(rowSums(x1)/ncol(x1)>=(10)),]
# number of rows in the unfiltered table
nrow(x)

## [1] 39297

nrow(x1)

## [1] 15477

Now we make a multidimensional scaling (MDS) plot to show how similar the samples are to one another. If the control and case samples cluster separately, then it is likely that there will be many differentially expressed genes with FDR<0.05.

# Classical MDS of the distances between samples (the columns of x1, hence
# the transpose); points are coloured by treatment group.
plot(cmdscale(dist(t(x1))), xlab="Coordinate 1", ylab="Coordinate 2", pch=19, col=s1$trt, main="MDS")

Differential expression

Now run DESeq2 for control vs case.

# Build the DESeq2 dataset with treatment as the only design term; the counts
# are rounded before being supplied as the count matrix.
y <- DESeqDataSetFromMatrix(countData = round(x1), colData = s1, design = ~ trt)

## converting counts to integer mode

y <- DESeq(y)

## estimating size factors

## estimating dispersions

## gene-wise dispersion estimates

## mean-dispersion relationship

## final dispersion estimates

## fitting model and testing

# Pull the results table, sort it by ascending p-value and convert it to a
# plain data frame
de <- results(y)
de <- de[order(de$pvalue), ]
de <- as.data.frame(de)
# Keep only the part of each row name that precedes the first "."
rownames(de) <- sub("\\..*$", "", rownames(de))
# Show the top rows as a formatted table
de %>% head() %>% kbl() %>% kable_paper("hover", full_width = FALSE)

	baseMean	log2FoldChange	lfcSE	stat
ENSG00000099250	6148.277	-4.163885	0.1204942	-34.55671
ENSG00000187193	3478.958	4.219910	0.1248838	33.79071
ENSG00000126785	2075.476	-3.356182	0.1085961	-30.90517
ENSG00000166741	1046.262	-5.101344	0.1661858	-30.69662
ENSG00000102010	1153.368	-5.552842	0.1817905	-30.54528
ENSG00000159167	1898.444	7.629754	0.2499001	30.53122

Now let’s have a look at some of the charts showing differential expression. In particular, an MA plot and volcano plot.

# Draw an MA plot (log2 base mean vs log2 fold change) for one contrast.
#
# de             data frame with baseMean, log2FoldChange and padj columns,
#                one row per gene.
# contrast_name  text used as the main title.
#
# All genes are drawn in gray and those with padj < 0.05 are overplotted in
# red. The subtitle reports the number of significant up/down genes and the
# number of rows in `de`. Rows with NA padj are never counted as significant
# because subset() drops rows where the condition is NA.
# Called for its plotting side effect.
maplot <- function(de, contrast_name) {
  sig <- subset(de, padj < 0.05)
  n_up <- nrow(subset(sig, log2FoldChange > 0))
  n_dn <- nrow(subset(sig, log2FoldChange < 0))
  n_det <- nrow(de)
  # paste0 with explicit separators gives evenly punctuated counts
  subheader <- paste0(n_up, " up, ", n_dn, " down, ", n_det, " detected")
  plot(log2(de$baseMean), de$log2FoldChange,
       xlab = "log2 basemean", ylab = "log2 foldchange",
       pch = 19, cex = 0.5, col = "dark gray",
       main = contrast_name, cex.main = 0.7)
  points(log2(sig$baseMean), sig$log2FoldChange,
         pch = 19, cex = 0.5, col = "red")
  mtext(subheader, cex = 0.7)
}

# Draw a volcano plot (log2 fold change vs -log10 adjusted p-value).
#
# de    data frame with log2FoldChange and padj columns, one row per gene.
# name  text used as the main title.
# xlim  x-axis limits; defaults to the previously hard-coded c(-6, 6).
#       Widen it to show genes whose |log2FC| exceeds 6, which are
#       otherwise drawn outside the plotting region.
#
# All genes are drawn in gray and those with padj < 0.05 are overplotted in
# red. The header line reports the number of significant genes, how many are
# up/down, and the number of rows in `de`.
# Called for its plotting side effect.
make_volcano <- function(de, name, xlim = c(-6, 6)) {
    sig <- subset(de, padj < 0.05)
    n_sig <- nrow(sig)
    n_up <- nrow(subset(sig, log2FoldChange > 0))
    n_dn <- nrow(subset(sig, log2FoldChange < 0))
    n_det <- nrow(de)
    header <- paste(n_sig, "@5%FDR,", n_up, "up", n_dn, "dn", n_det, "detected")
    # The y axis shows padj, so it is labelled as the adjusted p-value
    plot(de$log2FoldChange, -log10(de$padj), cex = 0.5, pch = 19,
        col = "darkgray", main = name, xlab = "log2 FC",
        ylab = "-log10 adjusted p-value", xlim = xlim)
    mtext(header)
    grid()
    points(sig$log2FoldChange, -log10(sig$padj), cex = 0.5, pch = 19, col = "red")
}

# `name` holds the study accession and is used as the title of both plots
maplot(de,name)

make_volcano(de,name)

Gene sets from Reactome

In order to perform gene set analysis, we need some gene sets.

# Download and unzip the Reactome GMT only when no local copy exists, so
# repeated runs reuse the file already on disk
if (! file.exists("ReactomePathways.gmt")) {
  download.file("https://reactome.org/download/current/ReactomePathways.gmt.zip", 
    destfile="ReactomePathways.gmt.zip")
  unzip("ReactomePathways.gmt.zip")
}
# Gene sets in the form passed to mitch_calc() further down
genesets<-gmt_import("ReactomePathways.gmt")

FCS with Mitch

Mitch uses rank-ANOVA statistics for enrichment detection.

# Import the DESeq2 table into mitch, supplying the saved gene table so the
# row identifiers can be mapped (presumably accession -> gene symbol; confirm
# against the first GeneInfo column)
m <- mitch_import(de,DEtype = "DEseq2", geneTable = gt)

## The input is a single dataframe; one contrast only. Converting
##         it to a list for you.

## Note: Mean no. genes in input = 15477

## Note: no. genes in output = 14488

## Note: estimated proportion of input genes in output = 0.936

mres <- mitch_calc(m,genesets = genesets)

## Note: When prioritising by significance (ie: small
##             p-values), large effect sizes might be missed.

# First column of the enrichment table for gene sets with adjusted ANOVA
# p-value < 0.05, split by the sign of s.dist
m_up <- mres$enrichment_result[
  which(with(mres$enrichment_result, p.adjustANOVA < 0.05 & s.dist > 0)), 1]
m_dn <- mres$enrichment_result[
  which(with(mres$enrichment_result, p.adjustANOVA < 0.05 & s.dist < 0)), 1]
message("Number of up-regulated pathways: ", length(m_up))

## Number of up-regulated pathways: 49

message("Number of down-regulated pathways: ", length(m_dn))

## Number of down-regulated pathways: 266

head(mres$enrichment_result,10)  %>% kbl() %>% kable_paper("hover", full_width = F)

	set	setSize	s.dist
624	Metabolism of RNA	661	-0.2779789
1283	Translation	273	-0.3429991
149	Cell Cycle	599	-0.2319664
151	Cell Cycle, Mitotic	484	-0.2524928
1365	rRNA processing	208	-0.3505784
150	Cell Cycle Checkpoints	251	-0.3161425
636	Metabolism of proteins	1562	-0.1313830
1367	rRNA processing in the nucleus and cytosol	187	-0.3472750
728	Nonsense Mediated Decay (NMD) enhanced by the Exon Junction Complex (EJC)	113	-0.4428306
730	Nonsense-Mediated Decay (NMD)	113	-0.4428306

ORA with clusterprofiler

Clusterprofiler uses a hypergeometric test. Firstly I will conduct the analysis separately for up and down regulated genes and with the correct background (as intended by the developers).

# Gene sets in the TERM2GENE form passed to enricher()
genesets2 <- read.gmt("ReactomePathways.gmt")

# Significant (FDR < 0.05) up-regulated genes. The two conditions must be
# joined with `&`: the third positional argument of subset() is `select`
# (column selection), so passing the FDR test there filtered no rows and
# every gene with a positive fold change ended up in the list.
de_up <- rownames(subset(de, log2FoldChange > 0 & padj < 0.05))
de_up <- unique(gt[which(rownames(gt) %in% de_up), 1])

# Significant (FDR < 0.05) down-regulated genes
de_dn <- rownames(subset(de, log2FoldChange < 0 & padj < 0.05))
de_dn <- unique(gt[which(rownames(gt) %in% de_dn), 1])

# Background: every gene in the differential expression table, i.e. the genes
# that passed the expression filter
de_bg <- rownames(de)
de_bg <- unique(gt[which(rownames(gt) %in% de_bg), 1])

# Over-representation tests, keeping the names of sets with adjusted p < 0.05
c_up <- as.data.frame(enricher(gene = de_up, universe = de_bg, maxGSSize = 5000, TERM2GENE = genesets2))
c_up <- rownames(subset(c_up, p.adjust < 0.05))

c_dn <- as.data.frame(enricher(gene = de_dn, universe = de_bg, maxGSSize = 5000, TERM2GENE = genesets2))
c_dn <- rownames(subset(c_dn, p.adjust < 0.05))

Now performing ORA with clusterprofiler combining up and down.

# All genes with FDR < 0.05 regardless of direction, translated to the
# identifiers in the first gene-table column
de_de <- rownames(de)[which(de$padj < 0.05)]
de_de <- unique(gt[rownames(gt) %in% de_de, 1])

# One over-representation test on the combined list; keep the names of sets
# with adjusted p < 0.05
d_de <- enricher(gene = de_de, universe = de_bg, maxGSSize = 5000, TERM2GENE = genesets2)
d_de <- as.data.frame(d_de)
d_de <- rownames(subset(d_de, p.adjust < 0.05))

Now performing ORA with clusterprofiler with whole genome background list

# NOTE: this overwrites de_bg. From here on the background is every gene
# symbol in the annotation rather than only the genes in the DE table.
de_bg <- w$GeneInfo$GeneSymbol

# Same up/down gene lists as before, tested against the larger background
f_up <- as.data.frame(enricher(gene = de_up, universe = de_bg,  maxGSSize = 5000, TERM2GENE = genesets2))
f_up <- rownames(subset(f_up, p.adjust < 0.05))
       
f_dn <- as.data.frame(enricher(gene = de_dn, universe = de_bg, maxGSSize = 5000, TERM2GENE = genesets2))
f_dn <- rownames(subset(f_dn, p.adjust < 0.05))

Now performing ORA (combining up and down gene lists) with clusterprofiler with whole genome background list

# Combined gene list tested against whatever de_bg currently holds (the
# whole-genome symbol list at this point in the script)
e_de <- as.data.frame(enricher(gene = de_de, universe = de_bg, maxGSSize = 5000, TERM2GENE = genesets2))
e_de <- rownames(subset(e_de, p.adjust < 0.05))

Venn diagram comparison

The Venn (or Euler to be more correct) diagram is useful to visualise the overlaps between sets.

# Smaller title text and narrow margins for the base graphics device
par(cex.main=0.5)

par(mar=c(2,2,2,2))

# Separate up/down ORA results against the combined ORA result
v0 <- list("ORA up"=c_up,"ORA dn"=c_dn,
           "ORA comb" = d_de)

plot(euler(v0),quantities = TRUE, edges = "gray", main="effect of combining up and down regulated genes")

## Warning in colSums(id & !empty) == 0 | merged_sets: longer object length is not
## a multiple of shorter object length

# FCS (mitch) significant sets against ORA significant sets
v1 <- list("FCS up"=m_up, "FCS dn"=m_dn,
           "ORA up"=c_up,"ORA dn"=c_dn)
  
plot(euler(v1),quantities = TRUE, edges = "gray", main="FCS compared to ORA")

# ORA with the detected-gene background against ORA with the whole-genome
# background (marked with *)
v2 <- list("ORA up"=c_up,"ORA dn"=c_dn, 
           "ORA* up"=f_up,"ORA* dn"=f_dn )

plot(euler(v2),quantities = TRUE, edges = "gray", main="Effect of inappropriate background* (whole genome)")

# Separate ORA results against both combined-list ORA results
vx <- list("ORA up"=c_up,"ORA dn"=c_dn,
           "ORA comb" = d_de, "ORA* comb" = e_de)

plot(euler(vx),quantities = TRUE, edges = "gray", main="combining up and down genes and whole genome bg*")

## Warning in colSums(id & !empty) == 0 | merged_sets: longer object length is not
## a multiple of shorter object length

# All six result lists together. NOTE(review): v3 is not passed to any plot
# in the code shown here - confirm whether it is still needed.
v3 <- list("ORA up"=c_up,"ORA dn"=c_dn, 
           "ORA* up"=f_up,"ORA* dn"=f_dn ,
           "FCS up"=m_up, "FCS dn"=m_dn)

# The figures are written under images/; create the directory first so the
# graphics device can open its file on a fresh checkout. showWarnings = FALSE
# keeps this silent when the directory already exists.
dir.create("images", showWarnings = FALSE, recursive = TRUE)

# PNG copy of the FCS vs ORA comparison
png("images/fcs_ora3.png")
plot(euler(v1),quantities = TRUE, edges = "gray", main="FCS vs ORA")
dev.off()

## png 
##   2

# PNG copy of the background comparison
png("images/orabg3.png")
plot(euler(v2),quantities = TRUE, edges = "gray", main="Effect of inappropriate background* (whole genome)")
dev.off()

## png 
##   2

# PNG copy of the combined-list comparison, drawn with gray edges like the
# on-screen and PDF versions of the same figure
png("images/oracomb3.png")
plot(euler(vx),quantities = TRUE, edges = "gray", main="combining up and down genes and whole genome bg*")

## Warning in colSums(id & !empty) == 0 | merged_sets: longer object length is not
## a multiple of shorter object length

dev.off()

## png 
##   2

# 4 x 4 inch PDF copy of the FCS vs ORA comparison
pdf("images/fcs_ora3.pdf",width=4,height=4)
plot(euler(v1),quantities = TRUE, edges = "gray", main="FCS vs ORA")
dev.off()

## png 
##   2

# 4 x 4 inch PDF copy of the background comparison
pdf("images/orabg3.pdf",width=4,height=4)
plot(euler(v2),quantities = TRUE, edges = "gray", main="Effect of inappropriate background* (whole genome)")
dev.off()

## png 
##   2

# 4 x 4 inch PDF copy of the combined-list comparison
pdf("images/oracomb3.pdf",width=4,height=4)
plot(euler(vx),quantities = TRUE, edges = "gray", main="combining up and down genes and whole genome bg*")

## Warning in colSums(id & !empty) == 0 | merged_sets: longer object length is not
## a multiple of shorter object length

dev.off()

## png 
##   2

Jaccard calculation

# Jaccard index of two character vectors: size of the intersection divided
# by size of the union (duplicates are ignored by intersect/union).
jaccard <- function(a, b) {
  length(intersect(a, b)) / length(union(a, b))
}

# ORA vs ORA combined. The combined analysis has no direction, so the
# separate up/down results are pooled by pathway name only.
dc <- jaccard(d_de, c(c_up, c_dn))

# ORA vs ORA* combined
ec <- jaccard(e_de, c(c_up, c_dn))

# Prefix each pathway with its direction so that a pathway called "up" by one
# method and "dn" by another is not counted as agreement.
m_up <- gsub("^","up ",m_up)
m_dn <- gsub("^","dn ",m_dn)
m_de <- union(m_up,m_dn)

c_up <- gsub("^","up ",c_up)
c_dn <- gsub("^","dn ",c_dn)
c_de <- union(c_up,c_dn)

f_up <- gsub("^","up ",f_up)
f_dn <- gsub("^","dn ",f_dn)
f_de <- union(f_up,f_dn)

# FCS vs ORA. Computed on the direction-prefixed names, like the two
# comparisons below; previously it used bare pathway names, so opposite
# calls for the same pathway inflated the overlap.
cm <- jaccard(c_de, m_de)

# ORA vs ORA*
cf <- jaccard(c_de, f_de)

# FCS vs ORA*
mf <- jaccard(m_de, f_de)

dat <- c("FCS vs ORA"=cm,"ORA vs ORA*"=cf,"FCS vs ORA*"=mf, "ORA vs ORA comb"=dc, "ORA vs ORA* comb"=ec)

dat

##       FCS vs ORA      ORA vs ORA*      FCS vs ORA*  ORA vs ORA comb 
##        0.6645963        0.4019608        0.4871324        0.0000000 
## ORA vs ORA* comb 
##        0.2492308

# Bar chart of the five Jaccard values, then save them for reuse elsewhere
barplot(dat,ylab="jaccard metric")

saveRDS(dat,file = "ex3dat.rds")

Session information

sessionInfo()

## R version 4.1.2 (2021-11-01)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.3 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/liblapack.so.3
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] rmdformats_1.0.3            beeswarm_0.4.0             
##  [3] eulerr_6.1.1                mitch_1.5.1                
##  [5] clusterProfiler_4.0.5       DESeq2_1.32.0              
##  [7] SummarizedExperiment_1.22.0 Biobase_2.52.0             
##  [9] MatrixGenerics_1.4.3        matrixStats_0.61.0         
## [11] GenomicRanges_1.44.0        GenomeInfoDb_1.28.4        
## [13] IRanges_2.26.0              S4Vectors_0.30.0           
## [15] BiocGenerics_0.38.0         getDEE2_1.2.0              
## [17] anytime_0.3.9               kableExtra_1.3.4           
## [19] XML_3.99-0.8                reutils_0.2.3              
## [21] vioplot_0.3.7               zoo_1.8-9                  
## [23] sm_2.2-5.7                  wordcloud_2.6              
## [25] RColorBrewer_1.1-2          rsvg_2.1.2                 
## [27] DiagrammeRsvg_0.1           DiagrammeR_1.0.6.1         
## [29] forcats_0.5.1               stringr_1.4.0              
## [31] dplyr_1.0.7                 purrr_0.3.4                
## [33] readr_2.0.2                 tidyr_1.1.4                
## [35] tibble_3.1.5                ggplot2_3.3.5              
## [37] tidyverse_1.3.1            
## 
## loaded via a namespace (and not attached):
##   [1] utf8_1.2.2             tidyselect_1.1.1       RSQLite_2.2.8         
##   [4] AnnotationDbi_1.54.1   htmlwidgets_1.5.4      grid_4.1.2            
##   [7] BiocParallel_1.26.2    scatterpie_0.1.7       munsell_0.5.0         
##  [10] withr_2.4.2            colorspace_2.0-2       GOSemSim_2.18.1       
##  [13] highr_0.9              knitr_1.36             rstudioapi_0.13       
##  [16] DOSE_3.18.3            GenomeInfoDbData_1.2.6 polyclip_1.10-0       
##  [19] bit64_4.0.5            farver_2.1.0           downloader_0.4        
##  [22] vctrs_0.3.8            treeio_1.16.2          generics_0.1.0        
##  [25] xfun_0.26              R6_2.5.1               graphlayouts_0.7.2    
##  [28] locfit_1.5-9.4         bitops_1.0-7           cachem_1.0.6          
##  [31] reshape_0.8.8          fgsea_1.18.0           gridGraphics_0.5-1    
##  [34] DelayedArray_0.18.0    assertthat_0.2.1       promises_1.2.0.1      
##  [37] scales_1.1.1           ggraph_2.0.5           enrichplot_1.12.3     
##  [40] gtable_0.3.0           tidygraph_1.2.0        rlang_0.4.11          
##  [43] genefilter_1.74.0      systemfonts_1.0.2      splines_4.1.2         
##  [46] lazyeval_0.2.2         htm2txt_2.1.1          broom_0.7.9           
##  [49] yaml_2.2.1             reshape2_1.4.4         modelr_0.1.8          
##  [52] backports_1.2.1        httpuv_1.6.3           qvalue_2.24.0         
##  [55] tools_4.1.2            bookdown_0.24          ggplotify_0.1.0       
##  [58] gplots_3.1.1           ellipsis_0.3.2         jquerylib_0.1.4       
##  [61] Rcpp_1.0.7             plyr_1.8.6             visNetwork_2.1.0      
##  [64] zlibbioc_1.38.0        RCurl_1.98-1.5         viridis_0.6.1         
##  [67] cowplot_1.1.1          haven_2.4.3            ggrepel_0.9.1         
##  [70] fs_1.5.0               magrittr_2.0.1         data.table_1.14.2     
##  [73] DO.db_2.9              reprex_2.0.1           hms_1.1.1             
##  [76] patchwork_1.1.1        mime_0.12              evaluate_0.14         
##  [79] xtable_1.8-4           readxl_1.3.1           gridExtra_2.3         
##  [82] compiler_4.1.2         KernSmooth_2.23-20     V8_3.6.0              
##  [85] crayon_1.4.1           shadowtext_0.0.9       htmltools_0.5.2       
##  [88] ggfun_0.0.4            later_1.3.0            tzdb_0.1.2            
##  [91] geneplotter_1.70.0     aplot_0.1.1            lubridate_1.8.0       
##  [94] DBI_1.1.1              tweenr_1.0.2           dbplyr_2.1.1          
##  [97] MASS_7.3-54            Matrix_1.3-4           cli_3.0.1             
## [100] igraph_1.2.6           pkgconfig_2.0.3        xml2_1.3.2            
## [103] ggtree_3.0.4           svglite_2.0.0          annotate_1.70.0       
## [106] bslib_0.3.1            webshot_0.5.2          XVector_0.32.0        
## [109] rvest_1.0.1            yulab.utils_0.0.4      digest_0.6.28         
## [112] Biostrings_2.60.2      polylabelr_0.2.0       rmarkdown_2.11        
## [115] cellranger_1.1.0       fastmatch_1.1-3        tidytree_0.3.6        
## [118] curl_4.3.2             gtools_3.9.2           shiny_1.7.1           
## [121] lifecycle_1.0.1        nlme_3.1-153           jsonlite_1.7.2        
## [124] echarts4r_0.4.2        viridisLite_0.4.0      fansi_0.5.0           
## [127] pillar_1.6.3           lattice_0.20-45        GGally_2.1.2          
## [130] KEGGREST_1.32.0        fastmap_1.1.0          httr_1.4.2            
## [133] survival_3.2-13        GO.db_3.13.0           glue_1.4.2            
## [136] png_0.1-7              bit_4.0.4              ggforce_0.3.3         
## [139] stringi_1.7.5          sass_0.4.0             blob_1.2.2            
## [142] caTools_1.18.2         memoise_2.0.0          ape_5.5