Source: https://github.com/markziemann/combined_enrichment

Intro

This script is designed to aggregate the results from seven RNA-seq enrichment analyses.

The purpose here is to get some idea as to how consistent the results are across several datasets.

Data was analysed with mitch (representing FCS) and clusterprofiler (representing ORA).

Here are the datasets considered.

Dataset SRA accesion genes in annotation set genes detected genes differentially expressed
1. SRP128998 39297 15635 3472
2. SRP038101 39297 13926 3589
3. SRP096178 39297 15477 9488
4. SRP038101 39297 15607 5150
5. SRP247621 39297 14288 230
6. SRP253951 39297 15182 8588
7. SRP068733 39297 14255 7365
suppressPackageStartupMessages({
  library("kableExtra")
  library("beeswarm")
})

Execute the examples

Here I’m running the scripts to analyse each of the example datasets shown in the above table.

Only execute the examples if file ex7dat.rds does not exist.

if ( ! file.exists("ex7dat.rds") ) {
  rmarkdown::render("example1.Rmd")
  rmarkdown::render("example2.Rmd")
  rmarkdown::render("example3.Rmd")
  rmarkdown::render("example4.Rmd")
  rmarkdown::render("example5.Rmd")
  rmarkdown::render("example6.Rmd")
  rmarkdown::render("example7.Rmd")
}

Summarise

Here I’m summarising the results of the five example data sets. Let’s start with FCS and then ORA. The different categories are described below:

  • dn: identified as downregulated by sep-DE

  • up: identified as upregulated by sep-DE

  • sep: up+dn as identified by sep-DE

  • com: identified as enriched by all-DE

  • all: identified by all-DE and/or sep-DE

exdat=NULL
ex1dat <- readRDS("ex1dat.rds")
ex2dat <- readRDS("ex2dat.rds")
ex3dat <- readRDS("ex3dat.rds")
ex4dat <- readRDS("ex4dat.rds")
ex5dat <- readRDS("ex5dat.rds")
ex6dat <- readRDS("ex6dat.rds")
ex7dat <- readRDS("ex7dat.rds")

msummary <- function(dat) {
  m_all  <- unique(c(dat$FCS_up ,dat$FCS_dn , dat$FCS_com ))
  m_sep_rec <- length(union(dat$FCS_up, dat$FCS_dn)) / length(m_all)
  m_com_rec <- length(dat$FCS_com) / length(m_all)
  res <- c("sep-DE up"=length(dat$FCS_up),
    "sep-DE dn"=length(dat$FCS_dn),
    "sep-DE up+dn"=length(union(dat$FCS_up,dat$FCS_dn)),
    "all-DE"=length(dat$FCS_com),
    "all-DE + sep-DE"=length(m_all),
    "sep-DE recall"=signif(m_sep_rec,3),
    "all-DE recall"=signif(m_com_rec,3),
    "sep-DE only"=length(setdiff(union(dat$FCS_up,dat$FCS_dn),dat$FCS_com )),
    "all-DE only"=length(setdiff(dat$FCS_com , union(dat$FCS_up,dat$FCS_dn))) )
  return(res)
}

mres <- lapply(list(ex1dat,ex2dat,ex3dat,ex4dat,ex5dat,ex6dat,ex7dat),msummary)
mres <- do.call(rbind,mres)
rownames(mres) <- c("SRP128998","SRP038101","SRP096178",
  "SRP038101","SRP247621","SRP253951","SRP068733")
mres %>% kbl(caption="differentially expressed pathway detection using separated or combined FCS analysis for seven studies") %>% kable_paper("hover", full_width = F)
differentially expressed pathway detection using separated or combined FCS analysis for seven studies
sep-DE up sep-DE dn sep-DE up+dn all-DE all-DE + sep-DE sep-DE recall all-DE recall sep-DE only all-DE only
SRP128998 96 317 413 55 431 0.958 0.128 376 18
SRP038101 192 317 509 209 566 0.899 0.369 357 57
SRP096178 49 266 315 61 351 0.897 0.174 290 36
SRP038101 308 44 352 402 508 0.693 0.791 106 156
SRP247621 69 156 225 34 241 0.934 0.141 207 16
SRP253951 98 322 420 352 568 0.739 0.620 216 148
SRP068733 73 183 256 225 348 0.736 0.647 123 92
# list format
mresl <- lapply(1:ncol(mres),function(i) {mres[,i]} )
names(mresl) <- colnames(mres)

osummary <- function(dat) {
  ORA_all  <- unique(c(dat$ORA_up ,dat$ORA_dn , dat$ORA_com ))
  ORA_sep_rec <- length(union(dat$ORA_up, dat$ORA_dn)) / length(ORA_all)
  ORA_com_rec <- length(dat$ORA_com) / length(ORA_all)
  res <- c("sep-DE up"=length(dat$ORA_up),
    "sep-DE dn"=length(dat$ORA_dn),
    "sep-DE up+dn"=length(union(dat$ORA_up,dat$ORA_dn)),
    "all-DE"=length(dat$ORA_com),
    "all-DE + sep-DE"=length(ORA_all),
    "sep-DE recall"=signif(ORA_sep_rec,3),
    "all-DE recall"=signif(ORA_com_rec,3),
    "sep-DE only"=length(setdiff(union(dat$ORA_up,dat$ORA_dn),dat$ORA_com )),
    "all-DE only"=length(setdiff(dat$ORA_com , union(dat$ORA_up,dat$ORA_dn))) )
  return(res)
}

ores <- lapply(list(ex1dat,ex2dat,ex3dat,ex4dat,ex5dat,ex6dat,ex7dat),osummary)
ores <- do.call(rbind,ores)
rownames(ores) <- c("SRP128998","SRP038101","SRP096178",
  "SRP038101","SRP247621","SRP253951","SRP068733")
ores %>% kbl(caption="differentially expressed pathway detection using separated or combined ORA analysis for seven studies") %>% kable_paper("hover", full_width = F)
differentially expressed pathway detection using separated or combined ORA analysis for seven studies
sep-DE up sep-DE dn sep-DE up+dn all-DE all-DE + sep-DE sep-DE recall all-DE recall sep-DE only all-DE only
SRP128998 47 33 80 23 85 0.941 0.271 62 5
SRP038101 68 191 258 83 259 0.996 0.320 176 1
SRP096178 26 120 146 0 146 1.000 0.000 146 0
SRP038101 271 38 303 181 329 0.921 0.550 148 26
SRP247621 4 1 5 3 6 0.833 0.500 3 1
SRP253951 89 249 338 52 344 0.983 0.151 292 6
SRP068733 29 149 178 76 193 0.922 0.394 117 15
# list format
oresl <- lapply(1:ncol(ores),function(i) {ores[,i]} )
names(oresl) <- colnames(ores)
means <- colMeans(mres)
medians <- apply(mres,2,median)
sds <- apply(mres,2,sd)
summstats <- rbind(means,medians,sds)
summstats %>% kbl(caption="summary statistics of FCS results") %>% kable_paper("hover", full_width = F)
summary statistics of FCS results
sep-DE up sep-DE dn sep-DE up+dn all-DE all-DE + sep-DE sep-DE recall all-DE recall sep-DE only all-DE only
means 126.42857 229.2857 355.71429 191.1429 430.4286 0.8365714 0.4100000 239.2857 74.71429
medians 96.00000 266.0000 352.00000 209.0000 431.0000 0.8970000 0.3690000 216.0000 57.00000
sds 92.33067 105.8107 99.67566 148.3053 124.0361 0.1095702 0.2752853 106.4514 58.81245
means <- colMeans(ores)
medians <- apply(ores,2,median)
sds <- apply(ores,2,sd)
summstats <- rbind(means,medians,sds)
summstats %>% kbl(caption="summary statistics of ORA results") %>% kable_paper("hover", full_width = F)
summary statistics of ORA results
sep-DE up sep-DE dn sep-DE up+dn all-DE all-DE + sep-DE sep-DE recall all-DE recall sep-DE only all-DE only
means 76.28571 111.57143 186.8571 59.71429 194.5714 0.9422857 0.3122857 134.85714 7.714286
medians 47.00000 120.00000 178.0000 52.00000 193.0000 0.9410000 0.3200000 146.00000 5.000000
sds 90.36171 91.72396 120.8531 62.85887 125.4842 0.0587132 0.1930982 91.00994 9.551863

Here let’s plot the jaccard scores. First as a boxplot and then overlay the dataset studies over the top.

Mitch results.

cols <- c("#CC79A7","#56B4E9","#E69F00","#F0E442","#009E73","#0072B2","#D55E00")
coll <- list(cols, cols, cols, cols, cols)
par(mar=c(10,10,3,1))
boxplot(mres[,5:1],yaxt="n",col="white",horizontal=TRUE,names = gsub("FCS ","", 
  colnames(mres)[5:1]),main="no. gene sets identified",ylim=c(0,600))
beeswarm(rev(mresl[1:5]) , horizontal=TRUE,pwcol = coll, cex=1.5,pch=19, add = TRUE, main="FCS")
legend("topright", legend=c("SRP128998", "SRP038101", "SRP096178",
  "SRP038101","SRP247621","SRP253951","SRP068733"), pch=19,  col=cols, cex=1,pt.cex = 1.5)
text(rep(-130,5),1:5,names(mresl)[5:1],xpd=NA)

par(mar=c(5,10,2,1))
par(mfrow=c(2,1))
coll <- list(cols, cols)
mynames <- sub(" recall","",gsub("FCS ","",colnames(mres)[7:6]))
boxplot(mres[,7:6],yaxt="n",col="white", names = mynames, horizontal = TRUE,
  main="proportion of gene sets identified", ylim=c(0,1))
beeswarm(mresl[7:6] , pwcol = coll, cex=1.5,pch=19, horizontal=TRUE, add = TRUE)
text(rep(-0.15,2),1:2,gsub("recall","",names(mresl)[7:6]),xpd=NA)


coll <- list(cols, cols)
mynames <- colnames(mres)[9:8]
boxplot(mres[,9:8], yaxt="n", col="white", names = mynames, horizontal=TRUE, 
  main="no. gene sets unique to each approach", ylim=c(0,400))
beeswarm(mresl[9:8] , pwcol = coll, cex=1.5,pch=19, add = TRUE, horizontal=TRUE)
text(rep(-60,2),1:2,names(mresl)[9:8],xpd=NA)

par(mfrow=c(1,1))

pdf("jacbox_fcs.pdf")

cols <- c("#CC79A7","#56B4E9","#E69F00","#F0E442","#009E73","#0072B2","#D55E00")
coll <- list(cols, cols, cols, cols, cols)
par(mar=c(10,10,3,1))
boxplot(mres[,5:1],yaxt="n",col="white",horizontal=TRUE,names = gsub("FCS ","",
  colnames(mres)[5:1]),main="no. gene sets identified",ylim=c(0,600))
beeswarm(rev(mresl[1:5]) , horizontal=TRUE,pwcol = coll, cex=1.5,pch=19, add = TRUE, main="FCS")
legend("topright", legend=c("SRP128998", "SRP038101", "SRP096178",
  "SRP038101","SRP247621","SRP253951","SRP068733"), pch=19,  col=cols, cex=1,pt.cex = 1.5)
text(rep(-130,5),1:5,names(mresl)[5:1],xpd=NA)

par(mar=c(5,10,2,1))
par(mfrow=c(2,1))
coll <- list(cols, cols)
mynames <- sub(" recall","",gsub("FCS ","",colnames(mres)[7:6]))
boxplot(mres[,7:6],yaxt="n",col="white", names = mynames, horizontal = TRUE,
  main="proportion of gene sets identified", ylim=c(0,1))
beeswarm(mresl[7:6] , pwcol = coll, cex=1.5,pch=19, horizontal=TRUE, add = TRUE)
text(rep(-0.15,2),1:2,gsub("recall","",names(mresl)[7:6]),xpd=NA)


coll <- list(cols, cols)
mynames <- colnames(mres)[9:8]
boxplot(mres[,9:8], yaxt="n", col="white", names = mynames, horizontal=TRUE,
  main="no. gene sets unique to each approach", ylim=c(0,400))
beeswarm(mresl[9:8] , pwcol = coll, cex=1.5,pch=19, add = TRUE, horizontal=TRUE)
text(rep(-60,2),1:2,names(mresl)[9:8],xpd=NA)

dev.off()
## png 
##   2

ORA results.

cols <- c("#CC79A7","#56B4E9","#E69F00","#F0E442","#009E73","#0072B2","#D55E00")
coll <- list(cols, cols, cols, cols, cols)
par(mar=c(10,10,3,1))
boxplot(ores[,5:1],yaxt="n",col="white",horizontal=TRUE,names = gsub("FCS ","",
  colnames(ores)[5:1]),main="no. gene sets identified",ylim=c(0,400))
beeswarm(rev(oresl[1:5]) , horizontal=TRUE,pwcol = coll, cex=1.5,pch=19, add = TRUE, main="FCS")
legend("topright", legend=c("SRP128998", "SRP038101", "SRP096178",
  "SRP038101","SRP247621","SRP253951","SRP068733"), pch=19,  col=cols, cex=1,pt.cex = 1.5)
text(rep(-100,5),1:5,names(oresl)[5:1],xpd=NA)

par(mar=c(5,10,2,1))
par(mfrow=c(2,1))
coll <- list(cols, cols)
mynames <- sub(" recall","",gsub("FCS ","",colnames(ores)[7:6]))
boxplot(ores[,7:6],yaxt="n",col="white", names = mynames, horizontal = TRUE,
  main="proportion of gene sets identified", ylim=c(0,1))
beeswarm(oresl[7:6] , pwcol = coll, cex=1.5,pch=19, horizontal=TRUE, add = TRUE)
text(rep(-0.15,2),1:2,gsub("recall","",names(oresl)[7:6]),xpd=NA)


coll <- list(cols, cols)
mynames <- colnames(ores)[9:8]
boxplot(ores[,9:8], yaxt="n", col="white", names = mynames, horizontal=TRUE,
  main="no. gene sets unique to each approach", ylim=c(0,300))
beeswarm(oresl[9:8] , pwcol = coll, cex=1.5,pch=19, add = TRUE, horizontal=TRUE)
text(rep(-60,2),1:2,names(oresl)[9:8],xpd=NA)

par(mfrow=c(1,1))

pdf("jacbox_ora.pdf")

cols <- c("#CC79A7","#56B4E9","#E69F00","#F0E442","#009E73","#0072B2","#D55E00")
coll <- list(cols, cols, cols, cols, cols)
par(mar=c(10,10,3,1))
boxplot(ores[,5:1],yaxt="n",col="white",horizontal=TRUE,names = gsub("FCS ","",
  colnames(ores)[5:1]),main="no. gene sets identified",ylim=c(0,400))
beeswarm(rev(oresl[1:5]) , horizontal=TRUE,pwcol = coll, cex=1.5,pch=19, add = TRUE, main="FCS")
legend("topright", legend=c("SRP128998", "SRP038101", "SRP096178",
  "SRP038101","SRP247621","SRP253951","SRP068733"), pch=19,  col=cols, cex=1,pt.cex = 1.5)
text(rep(-100,5),1:5,names(oresl)[5:1],xpd=NA)

par(mar=c(5,10,2,1))
par(mfrow=c(2,1))
coll <- list(cols, cols)
mynames <- sub(" recall","",gsub("FCS ","",colnames(ores)[7:6]))
boxplot(ores[,7:6],yaxt="n",col="white", names = mynames, horizontal = TRUE,
  main="proportion of gene sets identified", ylim=c(0,1))
beeswarm(oresl[7:6] , pwcol = coll, cex=1.5,pch=19, horizontal=TRUE, add = TRUE)
text(rep(-0.15,2),1:2,gsub("recall","",names(oresl)[7:6]),xpd=NA)


coll <- list(cols, cols)
mynames <- colnames(ores)[9:8]
boxplot(ores[,9:8], yaxt="n", col="white", names = mynames, horizontal=TRUE,
  main="no. gene sets unique to each approach", ylim=c(0,300))
beeswarm(oresl[9:8] , pwcol = coll, cex=1.5,pch=19, add = TRUE, horizontal=TRUE)
text(rep(-60,2),1:2,names(oresl)[9:8],xpd=NA)

par(mfrow=c(1,1))

dev.off()
## png 
##   2

Conclusions

Session information

sessionInfo()
## R version 4.2.0 (2022-04-22)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
## 
## locale:
##  [1] LC_CTYPE=en_AU.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_AU.UTF-8        LC_COLLATE=en_AU.UTF-8    
##  [5] LC_MONETARY=en_AU.UTF-8    LC_MESSAGES=en_AU.UTF-8   
##  [7] LC_PAPER=en_AU.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] beeswarm_0.4.0   kableExtra_1.3.4
## 
## loaded via a namespace (and not attached):
##  [1] rstudioapi_0.13   knitr_1.39        xml2_1.3.3        magrittr_2.0.3   
##  [5] rvest_1.0.2       munsell_0.5.0     viridisLite_0.4.0 colorspace_2.0-3 
##  [9] R6_2.5.1          rlang_1.0.2       fastmap_1.1.0     highr_0.9        
## [13] stringr_1.4.0     httr_1.4.3        tools_4.2.0       webshot_0.5.3    
## [17] xfun_0.30         cli_3.3.0         jquerylib_0.1.4   systemfonts_1.0.4
## [21] htmltools_0.5.2   yaml_2.3.5        digest_0.6.29     lifecycle_1.0.1  
## [25] sass_0.4.1        glue_1.6.2        evaluate_0.15     rmarkdown_2.14   
## [29] stringi_1.7.6     compiler_4.2.0    bslib_0.3.1       scales_1.2.0     
## [33] svglite_2.1.0     jsonlite_1.8.0