Source: https://github.com/markziemann/combined_enrichment
This script is designed to aggregate the results from seven RNA-seq enrichment analyses.
The purpose here is to get some idea as to how consistent the results are across several datasets.
Data was analysed with mitch (representing FCS) and clusterprofiler (representing ORA).
Here are the datasets considered.
Dataset | SRA accesion | genes in annotation set | genes detected | genes differentially expressed |
---|---|---|---|---|
1. | SRP128998 | 39297 | 15635 | 3472 |
2. | SRP038101 | 39297 | 13926 | 3589 |
3. | SRP096178 | 39297 | 15477 | 9488 |
4. | SRP038101 | 39297 | 15607 | 5150 |
5. | SRP247621 | 39297 | 14288 | 230 |
6. | SRP253951 | 39297 | 15182 | 8588 |
7. | SRP068733 | 39297 | 14255 | 7365 |
suppressPackageStartupMessages({
library("kableExtra")
library("beeswarm")
})
Here I’m running the scripts to analyse each of the example datasets shown in the above table.
Only execute the examples if file ex7dat.rds
does not exist.
if ( ! file.exists("ex7dat.rds") ) {
rmarkdown::render("example1.Rmd")
rmarkdown::render("example2.Rmd")
rmarkdown::render("example3.Rmd")
rmarkdown::render("example4.Rmd")
rmarkdown::render("example5.Rmd")
rmarkdown::render("example6.Rmd")
rmarkdown::render("example7.Rmd")
}
Here I’m summarising the results of the five example data sets. Let’s start with FCS and then ORA. The different categories are described below:
dn: identified as downregulated by sep-DE
up: identified as upregulated by sep-DE
sep: up+dn as identified by sep-DE
com: identified as enriched by all-DE
all: identified by all-DE and/or sep-DE
exdat=NULL
ex1dat <- readRDS("ex1dat.rds")
ex2dat <- readRDS("ex2dat.rds")
ex3dat <- readRDS("ex3dat.rds")
ex4dat <- readRDS("ex4dat.rds")
ex5dat <- readRDS("ex5dat.rds")
ex6dat <- readRDS("ex6dat.rds")
ex7dat <- readRDS("ex7dat.rds")
msummary <- function(dat) {
m_all <- unique(c(dat$FCS_up ,dat$FCS_dn , dat$FCS_com ))
m_sep_rec <- length(union(dat$FCS_up, dat$FCS_dn)) / length(m_all)
m_com_rec <- length(dat$FCS_com) / length(m_all)
res <- c("sep-DE up"=length(dat$FCS_up),
"sep-DE dn"=length(dat$FCS_dn),
"sep-DE up+dn"=length(union(dat$FCS_up,dat$FCS_dn)),
"all-DE"=length(dat$FCS_com),
"all-DE + sep-DE"=length(m_all),
"sep-DE recall"=signif(m_sep_rec,3),
"all-DE recall"=signif(m_com_rec,3),
"sep-DE only"=length(setdiff(union(dat$FCS_up,dat$FCS_dn),dat$FCS_com )),
"all-DE only"=length(setdiff(dat$FCS_com , union(dat$FCS_up,dat$FCS_dn))) )
return(res)
}
mres <- lapply(list(ex1dat,ex2dat,ex3dat,ex4dat,ex5dat,ex6dat,ex7dat),msummary)
mres <- do.call(rbind,mres)
rownames(mres) <- c("SRP128998","SRP038101","SRP096178",
"SRP038101","SRP247621","SRP253951","SRP068733")
mres %>% kbl(caption="differentially expressed pathway detection using separated or combined FCS analysis for seven studies") %>% kable_paper("hover", full_width = F)
sep-DE up | sep-DE dn | sep-DE up+dn | all-DE | all-DE + sep-DE | sep-DE recall | all-DE recall | sep-DE only | all-DE only | |
---|---|---|---|---|---|---|---|---|---|
SRP128998 | 96 | 317 | 413 | 55 | 431 | 0.958 | 0.128 | 376 | 18 |
SRP038101 | 192 | 317 | 509 | 209 | 566 | 0.899 | 0.369 | 357 | 57 |
SRP096178 | 49 | 266 | 315 | 61 | 351 | 0.897 | 0.174 | 290 | 36 |
SRP038101 | 308 | 44 | 352 | 402 | 508 | 0.693 | 0.791 | 106 | 156 |
SRP247621 | 69 | 156 | 225 | 34 | 241 | 0.934 | 0.141 | 207 | 16 |
SRP253951 | 98 | 322 | 420 | 352 | 568 | 0.739 | 0.620 | 216 | 148 |
SRP068733 | 73 | 183 | 256 | 225 | 348 | 0.736 | 0.647 | 123 | 92 |
# list format
mresl <- lapply(1:ncol(mres),function(i) {mres[,i]} )
names(mresl) <- colnames(mres)
osummary <- function(dat) {
ORA_all <- unique(c(dat$ORA_up ,dat$ORA_dn , dat$ORA_com ))
ORA_sep_rec <- length(union(dat$ORA_up, dat$ORA_dn)) / length(ORA_all)
ORA_com_rec <- length(dat$ORA_com) / length(ORA_all)
res <- c("sep-DE up"=length(dat$ORA_up),
"sep-DE dn"=length(dat$ORA_dn),
"sep-DE up+dn"=length(union(dat$ORA_up,dat$ORA_dn)),
"all-DE"=length(dat$ORA_com),
"all-DE + sep-DE"=length(ORA_all),
"sep-DE recall"=signif(ORA_sep_rec,3),
"all-DE recall"=signif(ORA_com_rec,3),
"sep-DE only"=length(setdiff(union(dat$ORA_up,dat$ORA_dn),dat$ORA_com )),
"all-DE only"=length(setdiff(dat$ORA_com , union(dat$ORA_up,dat$ORA_dn))) )
return(res)
}
ores <- lapply(list(ex1dat,ex2dat,ex3dat,ex4dat,ex5dat,ex6dat,ex7dat),osummary)
ores <- do.call(rbind,ores)
rownames(ores) <- c("SRP128998","SRP038101","SRP096178",
"SRP038101","SRP247621","SRP253951","SRP068733")
ores %>% kbl(caption="differentially expressed pathway detection using separated or combined ORA analysis for seven studies") %>% kable_paper("hover", full_width = F)
sep-DE up | sep-DE dn | sep-DE up+dn | all-DE | all-DE + sep-DE | sep-DE recall | all-DE recall | sep-DE only | all-DE only | |
---|---|---|---|---|---|---|---|---|---|
SRP128998 | 47 | 33 | 80 | 23 | 85 | 0.941 | 0.271 | 62 | 5 |
SRP038101 | 68 | 191 | 258 | 83 | 259 | 0.996 | 0.320 | 176 | 1 |
SRP096178 | 26 | 120 | 146 | 0 | 146 | 1.000 | 0.000 | 146 | 0 |
SRP038101 | 271 | 38 | 303 | 181 | 329 | 0.921 | 0.550 | 148 | 26 |
SRP247621 | 4 | 1 | 5 | 3 | 6 | 0.833 | 0.500 | 3 | 1 |
SRP253951 | 89 | 249 | 338 | 52 | 344 | 0.983 | 0.151 | 292 | 6 |
SRP068733 | 29 | 149 | 178 | 76 | 193 | 0.922 | 0.394 | 117 | 15 |
# list format
oresl <- lapply(1:ncol(ores),function(i) {ores[,i]} )
names(oresl) <- colnames(ores)
means <- colMeans(mres)
medians <- apply(mres,2,median)
sds <- apply(mres,2,sd)
summstats <- rbind(means,medians,sds)
summstats %>% kbl(caption="summary statistics of FCS results") %>% kable_paper("hover", full_width = F)
sep-DE up | sep-DE dn | sep-DE up+dn | all-DE | all-DE + sep-DE | sep-DE recall | all-DE recall | sep-DE only | all-DE only | |
---|---|---|---|---|---|---|---|---|---|
means | 126.42857 | 229.2857 | 355.71429 | 191.1429 | 430.4286 | 0.8365714 | 0.4100000 | 239.2857 | 74.71429 |
medians | 96.00000 | 266.0000 | 352.00000 | 209.0000 | 431.0000 | 0.8970000 | 0.3690000 | 216.0000 | 57.00000 |
sds | 92.33067 | 105.8107 | 99.67566 | 148.3053 | 124.0361 | 0.1095702 | 0.2752853 | 106.4514 | 58.81245 |
means <- colMeans(ores)
medians <- apply(ores,2,median)
sds <- apply(ores,2,sd)
summstats <- rbind(means,medians,sds)
summstats %>% kbl(caption="summary statistics of ORA results") %>% kable_paper("hover", full_width = F)
sep-DE up | sep-DE dn | sep-DE up+dn | all-DE | all-DE + sep-DE | sep-DE recall | all-DE recall | sep-DE only | all-DE only | |
---|---|---|---|---|---|---|---|---|---|
means | 76.28571 | 111.57143 | 186.8571 | 59.71429 | 194.5714 | 0.9422857 | 0.3122857 | 134.85714 | 7.714286 |
medians | 47.00000 | 120.00000 | 178.0000 | 52.00000 | 193.0000 | 0.9410000 | 0.3200000 | 146.00000 | 5.000000 |
sds | 90.36171 | 91.72396 | 120.8531 | 62.85887 | 125.4842 | 0.0587132 | 0.1930982 | 91.00994 | 9.551863 |
Here let’s plot the jaccard scores. First as a boxplot and then overlay the dataset studies over the top.
Mitch results.
cols <- c("#CC79A7","#56B4E9","#E69F00","#F0E442","#009E73","#0072B2","#D55E00")
coll <- list(cols, cols, cols, cols, cols)
par(mar=c(10,10,3,1))
boxplot(mres[,5:1],yaxt="n",col="white",horizontal=TRUE,names = gsub("FCS ","",
colnames(mres)[5:1]),main="no. gene sets identified",ylim=c(0,600))
beeswarm(rev(mresl[1:5]) , horizontal=TRUE,pwcol = coll, cex=1.5,pch=19, add = TRUE, main="FCS")
legend("topright", legend=c("SRP128998", "SRP038101", "SRP096178",
"SRP038101","SRP247621","SRP253951","SRP068733"), pch=19, col=cols, cex=1,pt.cex = 1.5)
text(rep(-130,5),1:5,names(mresl)[5:1],xpd=NA)
par(mar=c(5,10,2,1))
par(mfrow=c(2,1))
coll <- list(cols, cols)
mynames <- sub(" recall","",gsub("FCS ","",colnames(mres)[7:6]))
boxplot(mres[,7:6],yaxt="n",col="white", names = mynames, horizontal = TRUE,
main="proportion of gene sets identified", ylim=c(0,1))
beeswarm(mresl[7:6] , pwcol = coll, cex=1.5,pch=19, horizontal=TRUE, add = TRUE)
text(rep(-0.15,2),1:2,gsub("recall","",names(mresl)[7:6]),xpd=NA)
coll <- list(cols, cols)
mynames <- colnames(mres)[9:8]
boxplot(mres[,9:8], yaxt="n", col="white", names = mynames, horizontal=TRUE,
main="no. gene sets unique to each approach", ylim=c(0,400))
beeswarm(mresl[9:8] , pwcol = coll, cex=1.5,pch=19, add = TRUE, horizontal=TRUE)
text(rep(-60,2),1:2,names(mresl)[9:8],xpd=NA)
par(mfrow=c(1,1))
pdf("jacbox_fcs.pdf")
cols <- c("#CC79A7","#56B4E9","#E69F00","#F0E442","#009E73","#0072B2","#D55E00")
coll <- list(cols, cols, cols, cols, cols)
par(mar=c(10,10,3,1))
boxplot(mres[,5:1],yaxt="n",col="white",horizontal=TRUE,names = gsub("FCS ","",
colnames(mres)[5:1]),main="no. gene sets identified",ylim=c(0,600))
beeswarm(rev(mresl[1:5]) , horizontal=TRUE,pwcol = coll, cex=1.5,pch=19, add = TRUE, main="FCS")
legend("topright", legend=c("SRP128998", "SRP038101", "SRP096178",
"SRP038101","SRP247621","SRP253951","SRP068733"), pch=19, col=cols, cex=1,pt.cex = 1.5)
text(rep(-130,5),1:5,names(mresl)[5:1],xpd=NA)
par(mar=c(5,10,2,1))
par(mfrow=c(2,1))
coll <- list(cols, cols)
mynames <- sub(" recall","",gsub("FCS ","",colnames(mres)[7:6]))
boxplot(mres[,7:6],yaxt="n",col="white", names = mynames, horizontal = TRUE,
main="proportion of gene sets identified", ylim=c(0,1))
beeswarm(mresl[7:6] , pwcol = coll, cex=1.5,pch=19, horizontal=TRUE, add = TRUE)
text(rep(-0.15,2),1:2,gsub("recall","",names(mresl)[7:6]),xpd=NA)
coll <- list(cols, cols)
mynames <- colnames(mres)[9:8]
boxplot(mres[,9:8], yaxt="n", col="white", names = mynames, horizontal=TRUE,
main="no. gene sets unique to each approach", ylim=c(0,400))
beeswarm(mresl[9:8] , pwcol = coll, cex=1.5,pch=19, add = TRUE, horizontal=TRUE)
text(rep(-60,2),1:2,names(mresl)[9:8],xpd=NA)
dev.off()
## png
## 2
ORA results.
cols <- c("#CC79A7","#56B4E9","#E69F00","#F0E442","#009E73","#0072B2","#D55E00")
coll <- list(cols, cols, cols, cols, cols)
par(mar=c(10,10,3,1))
boxplot(ores[,5:1],yaxt="n",col="white",horizontal=TRUE,names = gsub("FCS ","",
colnames(ores)[5:1]),main="no. gene sets identified",ylim=c(0,400))
beeswarm(rev(oresl[1:5]) , horizontal=TRUE,pwcol = coll, cex=1.5,pch=19, add = TRUE, main="FCS")
legend("topright", legend=c("SRP128998", "SRP038101", "SRP096178",
"SRP038101","SRP247621","SRP253951","SRP068733"), pch=19, col=cols, cex=1,pt.cex = 1.5)
text(rep(-100,5),1:5,names(oresl)[5:1],xpd=NA)
par(mar=c(5,10,2,1))
par(mfrow=c(2,1))
coll <- list(cols, cols)
mynames <- sub(" recall","",gsub("FCS ","",colnames(ores)[7:6]))
boxplot(ores[,7:6],yaxt="n",col="white", names = mynames, horizontal = TRUE,
main="proportion of gene sets identified", ylim=c(0,1))
beeswarm(oresl[7:6] , pwcol = coll, cex=1.5,pch=19, horizontal=TRUE, add = TRUE)
text(rep(-0.15,2),1:2,gsub("recall","",names(oresl)[7:6]),xpd=NA)
coll <- list(cols, cols)
mynames <- colnames(ores)[9:8]
boxplot(ores[,9:8], yaxt="n", col="white", names = mynames, horizontal=TRUE,
main="no. gene sets unique to each approach", ylim=c(0,300))
beeswarm(oresl[9:8] , pwcol = coll, cex=1.5,pch=19, add = TRUE, horizontal=TRUE)
text(rep(-60,2),1:2,names(oresl)[9:8],xpd=NA)
par(mfrow=c(1,1))
pdf("jacbox_ora.pdf")
cols <- c("#CC79A7","#56B4E9","#E69F00","#F0E442","#009E73","#0072B2","#D55E00")
coll <- list(cols, cols, cols, cols, cols)
par(mar=c(10,10,3,1))
boxplot(ores[,5:1],yaxt="n",col="white",horizontal=TRUE,names = gsub("FCS ","",
colnames(ores)[5:1]),main="no. gene sets identified",ylim=c(0,400))
beeswarm(rev(oresl[1:5]) , horizontal=TRUE,pwcol = coll, cex=1.5,pch=19, add = TRUE, main="FCS")
legend("topright", legend=c("SRP128998", "SRP038101", "SRP096178",
"SRP038101","SRP247621","SRP253951","SRP068733"), pch=19, col=cols, cex=1,pt.cex = 1.5)
text(rep(-100,5),1:5,names(oresl)[5:1],xpd=NA)
par(mar=c(5,10,2,1))
par(mfrow=c(2,1))
coll <- list(cols, cols)
mynames <- sub(" recall","",gsub("FCS ","",colnames(ores)[7:6]))
boxplot(ores[,7:6],yaxt="n",col="white", names = mynames, horizontal = TRUE,
main="proportion of gene sets identified", ylim=c(0,1))
beeswarm(oresl[7:6] , pwcol = coll, cex=1.5,pch=19, horizontal=TRUE, add = TRUE)
text(rep(-0.15,2),1:2,gsub("recall","",names(oresl)[7:6]),xpd=NA)
coll <- list(cols, cols)
mynames <- colnames(ores)[9:8]
boxplot(ores[,9:8], yaxt="n", col="white", names = mynames, horizontal=TRUE,
main="no. gene sets unique to each approach", ylim=c(0,300))
beeswarm(oresl[9:8] , pwcol = coll, cex=1.5,pch=19, add = TRUE, horizontal=TRUE)
text(rep(-60,2),1:2,names(oresl)[9:8],xpd=NA)
par(mfrow=c(1,1))
dev.off()
## png
## 2
sessionInfo()
## R version 4.2.0 (2022-04-22)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.4 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
##
## locale:
## [1] LC_CTYPE=en_AU.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_AU.UTF-8 LC_COLLATE=en_AU.UTF-8
## [5] LC_MONETARY=en_AU.UTF-8 LC_MESSAGES=en_AU.UTF-8
## [7] LC_PAPER=en_AU.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] beeswarm_0.4.0 kableExtra_1.3.4
##
## loaded via a namespace (and not attached):
## [1] rstudioapi_0.13 knitr_1.39 xml2_1.3.3 magrittr_2.0.3
## [5] rvest_1.0.2 munsell_0.5.0 viridisLite_0.4.0 colorspace_2.0-3
## [9] R6_2.5.1 rlang_1.0.2 fastmap_1.1.0 highr_0.9
## [13] stringr_1.4.0 httr_1.4.3 tools_4.2.0 webshot_0.5.3
## [17] xfun_0.30 cli_3.3.0 jquerylib_0.1.4 systemfonts_1.0.4
## [21] htmltools_0.5.2 yaml_2.3.5 digest_0.6.29 lifecycle_1.0.1
## [25] sass_0.4.1 glue_1.6.2 evaluate_0.15 rmarkdown_2.14
## [29] stringi_1.7.6 compiler_4.2.0 bslib_0.3.1 scales_1.2.0
## [33] svglite_2.1.0 jsonlite_1.8.0