Source: https://github.com/markziemann/background
This guide is a Rmarkdown script that conducts differential expression and enrichment analysis, which are very popular workflows for transcriptome data.
The goal of this work is to understand how ClusterProfiler manages the background list, as we have observed some weird behaviour. In particular we see that when we provide a custom background, those genes do not appear in the final analysis unless they are also included in the gene list.
In the code chunk below called libs
, you can add and remove required R library dependancies. Check that the libraries listed here match the Dockerfile, otherwise you might get errors.
suppressPackageStartupMessages({
library("getDEE2")
library("DESeq2")
library("kableExtra")
library("clusterProfiler")
library("fgsea")
library("eulerr")
library("gplots")
library("parallel")
})
For this guide I will be using bulk RNA-seq data from several previous studies.
Gene sets were obtained from MSigDB.
myfiles <- list.files("../dataprep",pattern="*Rds",full.names=TRUE)
d <- lapply(myfiles,readRDS)
names(d) <- basename(myfiles)
mygmts <- list.files("../gmt",pattern="*gmt",full.names=TRUE)
gs <- lapply(mygmts,gmtPathways)
names(gs) <- basename(mygmts)
names(gs)
## [1] "c2.cp.kegg_medicus.v2023.2.Hs.symbols.gmt"
## [2] "c2.cp.reactome.v2023.2.Hs.symbols.gmt"
## [3] "c2.cp.wikipathways.v2023.2.Hs.symbols.gmt"
## [4] "c3.mir.mirdb.v2023.2.Hs.symbols.gmt"
## [5] "c3.tft.gtrd.v2023.2.Hs.symbols.gmt"
## [6] "c5.go.v2023.2.Hs.symbols.gmt"
## [7] "c5.hpo.v2023.2.Hs.symbols.gmt"
## [8] "c8.all.v2023.2.Hs.symbols.gmt"
## [9] "h.all.v2023.2.Hs.symbols.gmt"
names(gs) <- c("KEGG", "Reactome", "Wikipathways", "miR targets", "TFT GTRD",
"GO", "HPO", "Cellmarkers", "Hallmark" )
# number of sets
lapply(gs,length)
## $KEGG
## [1] 619
##
## $Reactome
## [1] 1692
##
## $Wikipathways
## [1] 791
##
## $`miR targets`
## [1] 2377
##
## $`TFT GTRD`
## [1] 505
##
## $GO
## [1] 10461
##
## $HPO
## [1] 5547
##
## $Cellmarkers
## [1] 830
##
## $Hallmark
## [1] 50
# set size summary
lapply(gs,function(x) {
y <- unlist(lapply(x, length ))
summary(y)
} )
## $KEGG
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.00 8.00 13.00 15.59 19.00 92.00
##
## $Reactome
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.00 11.00 23.00 56.28 57.00 1463.00
##
## $Wikipathways
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.00 14.00 28.00 42.92 53.00 438.00
##
## $`miR targets`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.0 59.0 103.0 156.5 186.0 1071.0
##
## $`TFT GTRD`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.0 55.0 287.0 509.3 880.0 1999.0
##
## $GO
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.00 9.00 18.00 79.83 55.00 1994.00
##
## $HPO
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.00 10.00 23.00 91.32 74.00 1735.00
##
## $Cellmarkers
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.0 52.0 115.0 180.5 223.2 1770.0
##
## $Hallmark
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 32.0 98.0 180.0 146.4 200.0 200.0
# gene coverage
lapply(gs,function(x) {
y <- unique(unlist(x))
length(y)
} )
## $KEGG
## [1] 2727
##
## $Reactome
## [1] 11155
##
## $Wikipathways
## [1] 8313
##
## $`miR targets`
## [1] 16655
##
## $`TFT GTRD`
## [1] 26807
##
## $GO
## [1] 19428
##
## $HPO
## [1] 4997
##
## $Cellmarkers
## [1] 20418
##
## $Hallmark
## [1] 4384
# total number of annotations
lapply(gs, function(x) {
sum(unlist(lapply(x,length)))
} )
## $KEGG
## [1] 9651
##
## $Reactome
## [1] 95226
##
## $Wikipathways
## [1] 33947
##
## $`miR targets`
## [1] 372061
##
## $`TFT GTRD`
## [1] 257181
##
## $GO
## [1] 835146
##
## $HPO
## [1] 506552
##
## $Cellmarkers
## [1] 149827
##
## $Hallmark
## [1] 7322
Here is a function that conducts over-representation analysis with ClusterProfiler. Genes that are not annotated as having any gene set category are discarded.
# df = deseq output
# df must have "ensID geneSymbol" as rownames
# gs = gene sets
# gs format is list of symbols
orabad <- function( df, gs ) {
def <- df
gs1 <- stack(gs)
colnames(gs1) <- c("gene","term")
gs1 <- gs1[,c(2,1)]
defup <- rownames(subset(def,padj<=0.05 & log2FoldChange>0))
defup <- defup[lapply(strsplit(defup," "),length)==2]
defup <- unique(sapply(strsplit(defup," "),"[[",2))
if ( length(defup)<250 ) {
defup <- head(rownames(subset(def,log2FoldChange>0)),250)
defup <- defup[lapply(strsplit(defup," "),length)==2]
defup <- unique(sapply(strsplit(defup," "),"[[",2))
}
defdn <- rownames(subset(def,padj<=0.05 & log2FoldChange<0))
defdn <- defdn[lapply(strsplit(defdn," "),length)==2]
defdn <- unique(sapply(strsplit(defdn," "),"[[",2))
if ( length(defdn)<250 ) {
defdn <- head(rownames(subset(def,log2FoldChange<0)),250)
defdn <- defdn[lapply(strsplit(defdn," "),length)==2]
defdn <- unique(sapply(strsplit(defdn," "),"[[",2))
}
bg <- rownames(def)
bg <- bg[lapply(strsplit(bg," "),length)==2]
bg <- unique(sapply(strsplit(bg," "),"[[",2))
ora_up <- as.data.frame(enricher(gene = defup ,
universe = bg, minGSSize = 5, maxGSSize = 500000, TERM2GENE = gs1,
pAdjustMethod="fdr", pvalueCutoff = 1, qvalueCutoff = 1 ))
ora_up$geneID <- NULL
ora_ups <- rownames(ora_up)
gr <- as.numeric(sapply(strsplit(ora_up$GeneRatio,"/"),"[[",1)) /
as.numeric(sapply(strsplit(ora_up$GeneRatio,"/"),"[[",2))
br <- as.numeric(sapply(strsplit(ora_up$BgRatio,"/"),"[[",1)) /
as.numeric(sapply(strsplit(ora_up$BgRatio,"/"),"[[",2))
ora_up$es <- gr/br
ora_up <- ora_up[order(-ora_up$es),]
ora_up$Description=NULL
ora_dn <- as.data.frame(enricher(gene = defdn ,
universe = bg, minGSSize = 5, maxGSSize = 500000, TERM2GENE = gs1,
pAdjustMethod="fdr", pvalueCutoff = 1, qvalueCutoff = 1 ))
ora_dn$geneID <- NULL
ora_dns <- rownames(ora_dn)
gr <- as.numeric(sapply(strsplit(ora_dn$GeneRatio,"/"),"[[",1)) /
as.numeric(sapply(strsplit(ora_dn$GeneRatio,"/"),"[[",2))
br <- as.numeric(sapply(strsplit(ora_dn$BgRatio,"/"),"[[",1)) /
as.numeric(sapply(strsplit(ora_dn$BgRatio,"/"),"[[",2))
ora_dn$es <- gr/br
ora_dn <- ora_dn[order(-ora_dn$es),]
ora_dn$Description=NULL
result <- list("orabad_up"=ora_up,"orabad_dn"=ora_dn)
return(result)
}
This function fixes the problem with excluding un-annotated genes.
oragood <- function( df, gs ) {
def <- df
gs1 <- stack(gs)
colnames(gs1) <- c("gene","term")
gs1 <- gs1[,c(2,1)]
defup <- rownames(subset(def,padj<=0.05 & log2FoldChange>0))
defup <- defup[lapply(strsplit(defup," "),length)==2]
defup <- unique(sapply(strsplit(defup," "),"[[",2))
if ( length(defup)<250 ) {
defup <- head(rownames(subset(def,log2FoldChange>0)),250)
defup <- defup[lapply(strsplit(defup," "),length)==2]
defup <- unique(sapply(strsplit(defup," "),"[[",2))
}
defdn <- rownames(subset(def,padj<=0.05 & log2FoldChange<0))
defdn <- defdn[lapply(strsplit(defdn," "),length)==2]
defdn <- unique(sapply(strsplit(defdn," "),"[[",2))
if ( length(defdn)<250 ) {
defdn <- head(rownames(subset(def,log2FoldChange<0)),250)
defdn <- defdn[lapply(strsplit(defdn," "),length)==2]
defdn <- unique(sapply(strsplit(defdn," "),"[[",2))
}
bg <- rownames(def)
bg <- bg[lapply(strsplit(bg," "),length)==2]
bg <- unique(sapply(strsplit(bg," "),"[[",2))
bgdf <- data.frame("background",bg)
names(bgdf) <- c("term","gene")
gs1 <- rbind(gs1,bgdf)
ora_up <- as.data.frame(enricher(gene = defup ,
universe = bg, minGSSize = 5, maxGSSize = 500000, TERM2GENE = gs1,
pAdjustMethod="fdr", pvalueCutoff = 1, qvalueCutoff = 1 ))
ora_up$geneID <- NULL
ora_ups <- rownames(ora_up)
gr <- as.numeric(sapply(strsplit(ora_up$GeneRatio,"/"),"[[",1)) /
as.numeric(sapply(strsplit(ora_up$GeneRatio,"/"),"[[",2))
br <- as.numeric(sapply(strsplit(ora_up$BgRatio,"/"),"[[",1)) /
as.numeric(sapply(strsplit(ora_up$BgRatio,"/"),"[[",2))
ora_up$es <- gr/br
ora_up <- ora_up[order(-ora_up$es),]
ora_up$Description=NULL
ora_dn <- as.data.frame(enricher(gene = defdn ,
universe = bg, minGSSize = 5, maxGSSize = 500000, TERM2GENE = gs1,
pAdjustMethod="fdr", pvalueCutoff = 1, qvalueCutoff = 1 ))
ora_dn$geneID <- NULL
ora_dns <- rownames(ora_dn)
gr <- as.numeric(sapply(strsplit(ora_dn$GeneRatio,"/"),"[[",1)) /
as.numeric(sapply(strsplit(ora_dn$GeneRatio,"/"),"[[",2))
br <- as.numeric(sapply(strsplit(ora_dn$BgRatio,"/"),"[[",1)) /
as.numeric(sapply(strsplit(ora_dn$BgRatio,"/"),"[[",2))
ora_dn$es <- gr/br
ora_dn <- ora_dn[order(-ora_dn$es),]
ora_dn$Description=NULL
result <- list("oragood_up"=ora_up,"oragood_dn"=ora_dn)
return(result)
}
# bad=ORA analysis, a list with results of up and down regulated pathways
# good=same as bad, except fixed the background problem
compare_ora_plots <- function(bad,good,libname=NULL){
badup <- bad$orabad_up
baddn <- bad$orabad_dn
goodup <- good$oragood_up
gooddn <- good$oragood_dn
mup <- merge(badup,goodup,by=0)
MAX=max(c(mup$es.x,mup$es.y))
UPHEADER=paste(libname,"up")
DNHEADER=paste(libname,"dn")
par(mfrow=c(2,2))
plot(mup$es.x, mup$es.y,
xlim=c(0,MAX), ylim=c(0,MAX),
xlab="original ES", ylab="corrected ES", main=UPHEADER)
grid()
abline(a = 0, b = 1,lwd=2,lty=2)
MAX=max(-log10(c(mup$pvalue.x,mup$pvalue.y)))
plot(-log10(mup$pvalue.x), -log10(mup$pvalue.y),
xlim=c(0,MAX), ylim=c(0,MAX),
xlab="original -log10 p-value", ylab="corrected -log10 p-value", main=UPHEADER)
grid()
abline(a = 0, b = 1,lwd=2,lty=2)
mdn <- merge(baddn,gooddn,by=0)
MAX=max(c(mdn$es.x,mdn$es.y))
plot(mdn$es.x, mdn$es.y,
xlim=c(0,MAX), ylim=c(0,MAX),
xlab="original ES", ylab="corrected ES", main=DNHEADER)
grid()
abline(a = 0, b = 1,lwd=2,lty=2)
MAX=max(-log10(c(mdn$pvalue.x,mdn$pvalue.y)))
plot(-log10(mdn$pvalue.x), -log10(mdn$pvalue.y),
xlim=c(0,MAX), ylim=c(0,MAX),
xlab="original -log10 p-value", ylab="corrected -log10 p-value", main=DNHEADER)
grid()
abline(a = 0, b = 1,lwd=2,lty=2)
}
compare_ora <- function(bad,good){
badup <- bad$orabad_up
baddn <- bad$orabad_dn
goodup <- good$oragood_up
gooddn <- good$oragood_dn
mup <- merge(badup,goodup,by=0)
mdn <- merge(baddn,gooddn,by=0)
bup <- subset(mup,p.adjust.x<0.05)$Row.names
gup <- subset(mup,p.adjust.y<0.05)$Row.names
bdn <- subset(mdn,p.adjust.x<0.05)$Row.names
gdn <- subset(mdn,p.adjust.y<0.05)$Row.names
sig_bad <- unique(c(bup,bdn))
sig_good <- unique(c(gup,gdn))
nsig_bad <- length(sig_bad)
nsig_good <- length(sig_good)
jac <- length(intersect(sig_bad,sig_good)) / length(union(sig_bad,sig_good))
result=c("nsig_bad"=nsig_bad,"nsig_good"=nsig_good,"jaccard"=jac)
return(result)
}
params <- expand.grid(1:length(d),1:length(gs))
params2 <- expand.grid(names(d),names(gs))
colnames(params) <- c("d","gs")
ora_dat <- mclapply(1:nrow(params), function(i) {
j=params[i,1]
k=params[i,2]
bad <- orabad(d[[j]],gs[[k]])
good <- oragood(d[[j]],gs[[k]])
res <- compare_ora(bad,good)
return(res)
} , mc.cores=8)
ora_dat <- do.call(rbind,ora_dat)
res1 <- cbind(params2,ora_dat)
res1 %>%
kbl(row.names = TRUE, caption="all results") %>%
kable_paper("hover", full_width = F)
Var1 | Var2 | nsig_bad | nsig_good | jaccard | |
---|---|---|---|---|---|
1 | bulkrna1.Rds | KEGG | 9 | 19 | 0.4736842 |
2 | bulkrna2.Rds | KEGG | 25 | 59 | 0.4237288 |
3 | bulkrna3.Rds | KEGG | 2 | 6 | 0.3333333 |
4 | bulkrna4.Rds | KEGG | 14 | 24 | 0.5833333 |
5 | bulkrna5.Rds | KEGG | 0 | 0 | NaN |
6 | bulkrna6.Rds | KEGG | 47 | 56 | 0.8392857 |
7 | bulkrna7.Rds | KEGG | 9 | 38 | 0.2368421 |
8 | bulkrna1.Rds | Reactome | 64 | 102 | 0.6274510 |
9 | bulkrna2.Rds | Reactome | 239 | 351 | 0.6809117 |
10 | bulkrna3.Rds | Reactome | 94 | 159 | 0.5617284 |
11 | bulkrna4.Rds | Reactome | 207 | 307 | 0.6742671 |
12 | bulkrna5.Rds | Reactome | 0 | 0 | NaN |
13 | bulkrna6.Rds | Reactome | 254 | 334 | 0.7604790 |
14 | bulkrna7.Rds | Reactome | 185 | 240 | 0.7708333 |
15 | bulkrna1.Rds | Wikipathways | 33 | 46 | 0.7173913 |
16 | bulkrna2.Rds | Wikipathways | 108 | 196 | 0.5510204 |
17 | bulkrna3.Rds | Wikipathways | 8 | 12 | 0.6666667 |
18 | bulkrna4.Rds | Wikipathways | 13 | 50 | 0.2600000 |
19 | bulkrna5.Rds | Wikipathways | 1 | 1 | 1.0000000 |
20 | bulkrna6.Rds | Wikipathways | 82 | 97 | 0.8453608 |
21 | bulkrna7.Rds | Wikipathways | 35 | 117 | 0.2991453 |
22 | bulkrna1.Rds | miR targets | 4 | 81 | 0.0493827 |
23 | bulkrna2.Rds | miR targets | 40 | 111 | 0.3603604 |
24 | bulkrna3.Rds | miR targets | 21 | 38 | 0.5526316 |
25 | bulkrna4.Rds | miR targets | 272 | 634 | 0.4290221 |
26 | bulkrna5.Rds | miR targets | 1 | 1 | 1.0000000 |
27 | bulkrna6.Rds | miR targets | 1166 | 1365 | 0.8393895 |
28 | bulkrna7.Rds | miR targets | 360 | 544 | 0.6617647 |
29 | bulkrna1.Rds | TFT GTRD | 52 | 85 | 0.6117647 |
30 | bulkrna2.Rds | TFT GTRD | 98 | 140 | 0.7000000 |
31 | bulkrna3.Rds | TFT GTRD | 108 | 135 | 0.8000000 |
32 | bulkrna4.Rds | TFT GTRD | 72 | 115 | 0.6260870 |
33 | bulkrna5.Rds | TFT GTRD | 1 | 1 | 1.0000000 |
34 | bulkrna6.Rds | TFT GTRD | 147 | 173 | 0.8497110 |
35 | bulkrna7.Rds | TFT GTRD | 129 | 153 | 0.8431373 |
36 | bulkrna1.Rds | GO | 485 | 707 | 0.6859972 |
37 | bulkrna2.Rds | GO | 1843 | 2168 | 0.8500923 |
38 | bulkrna3.Rds | GO | 686 | 915 | 0.7497268 |
39 | bulkrna4.Rds | GO | 390 | 707 | 0.5516266 |
40 | bulkrna5.Rds | GO | 20 | 65 | 0.3076923 |
41 | bulkrna6.Rds | GO | 588 | 789 | 0.7452471 |
42 | bulkrna7.Rds | GO | 1263 | 1544 | 0.8180052 |
43 | bulkrna1.Rds | HPO | 2 | 42 | 0.0476190 |
44 | bulkrna2.Rds | HPO | 380 | 988 | 0.3846154 |
45 | bulkrna3.Rds | HPO | 92 | 241 | 0.3817427 |
46 | bulkrna4.Rds | HPO | 24 | 296 | 0.0810811 |
47 | bulkrna5.Rds | HPO | 0 | 25 | 0.0000000 |
48 | bulkrna6.Rds | HPO | 17 | 265 | 0.0641509 |
49 | bulkrna7.Rds | HPO | 186 | 449 | 0.4142539 |
50 | bulkrna1.Rds | Cellmarkers | 139 | 170 | 0.8176471 |
51 | bulkrna2.Rds | Cellmarkers | 439 | 477 | 0.9203354 |
52 | bulkrna3.Rds | Cellmarkers | 345 | 382 | 0.9031414 |
53 | bulkrna4.Rds | Cellmarkers | 245 | 286 | 0.8566434 |
54 | bulkrna5.Rds | Cellmarkers | 0 | 0 | NaN |
55 | bulkrna6.Rds | Cellmarkers | 267 | 297 | 0.8989899 |
56 | bulkrna7.Rds | Cellmarkers | 448 | 491 | 0.9085366 |
57 | bulkrna1.Rds | Hallmark | 7 | 20 | 0.3500000 |
58 | bulkrna2.Rds | Hallmark | 25 | 38 | 0.6578947 |
59 | bulkrna3.Rds | Hallmark | 18 | 21 | 0.7727273 |
60 | bulkrna4.Rds | Hallmark | 13 | 27 | 0.4814815 |
61 | bulkrna5.Rds | Hallmark | 0 | 3 | 0.0000000 |
62 | bulkrna6.Rds | Hallmark | 28 | 37 | 0.7567568 |
63 | bulkrna7.Rds | Hallmark | 18 | 34 | 0.4444444 |
summary(res1$nsig_bad)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 13.0 52.0 188.4 242.0 1843.0
summary(res1$nsig_good)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 35.5 115.0 274.2 320.5 2168.0
summary(res1$jaccard)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0000 0.4068 0.6598 0.5913 0.8177 1.0000 3
# NUMBER OF SIGNIFICANT SETS
ns1 <- matrix( res1$nsig_bad , ncol=9 )
rownames(ns1) <- names(d)
colnames(ns1) <- names(gs)
ns1 %>%
kbl(row.names = TRUE, caption="significant genesets in original analysis") %>%
kable_paper("hover", full_width = F)
KEGG | Reactome | Wikipathways | miR targets | TFT GTRD | GO | HPO | Cellmarkers | Hallmark | |
---|---|---|---|---|---|---|---|---|---|
bulkrna1.Rds | 9 | 64 | 33 | 4 | 52 | 485 | 2 | 139 | 7 |
bulkrna2.Rds | 25 | 239 | 108 | 40 | 98 | 1843 | 380 | 439 | 25 |
bulkrna3.Rds | 2 | 94 | 8 | 21 | 108 | 686 | 92 | 345 | 18 |
bulkrna4.Rds | 14 | 207 | 13 | 272 | 72 | 390 | 24 | 245 | 13 |
bulkrna5.Rds | 0 | 0 | 1 | 1 | 1 | 20 | 0 | 0 | 0 |
bulkrna6.Rds | 47 | 254 | 82 | 1166 | 147 | 588 | 17 | 267 | 28 |
bulkrna7.Rds | 9 | 185 | 35 | 360 | 129 | 1263 | 186 | 448 | 18 |
ns1 <- apply(ns1,2,function(x) { x <- x[!is.na(x)] ; x <- x[is.finite(x)] ; mean(x) } )
ns2 <- matrix( res1$nsig_good , ncol=9 )
rownames(ns2) <- names(d)
colnames(ns2) <- names(gs)
ns2 %>%
kbl(row.names = TRUE, caption="significant genesets in corrected analysis") %>%
kable_paper("hover", full_width = F)
KEGG | Reactome | Wikipathways | miR targets | TFT GTRD | GO | HPO | Cellmarkers | Hallmark | |
---|---|---|---|---|---|---|---|---|---|
bulkrna1.Rds | 19 | 102 | 46 | 81 | 85 | 707 | 42 | 170 | 20 |
bulkrna2.Rds | 59 | 351 | 196 | 111 | 140 | 2168 | 988 | 477 | 38 |
bulkrna3.Rds | 6 | 159 | 12 | 38 | 135 | 915 | 241 | 382 | 21 |
bulkrna4.Rds | 24 | 307 | 50 | 634 | 115 | 707 | 296 | 286 | 27 |
bulkrna5.Rds | 0 | 0 | 1 | 1 | 1 | 65 | 25 | 0 | 3 |
bulkrna6.Rds | 56 | 334 | 97 | 1365 | 173 | 789 | 265 | 297 | 37 |
bulkrna7.Rds | 38 | 240 | 117 | 544 | 153 | 1544 | 449 | 491 | 34 |
ns2 <- apply(ns2,2,function(x) { x <- x[!is.na(x)] ; x <- x[is.finite(x)] ; mean(x) } )
ns2
## KEGG Reactome Wikipathways miR targets TFT GTRD GO
## 28.85714 213.28571 74.14286 396.28571 114.57143 985.00000
## HPO Cellmarkers Hallmark
## 329.42857 300.42857 25.71429
df <- data.frame(ns1,ns2)
df <- df[order(df$ns1),]
par(mar=c(c(5.1, 7.1, 4.1, 2.1) ))
barplot(t(df),beside=TRUE,horiz=TRUE,las=1,xlim=c(0,1250),
xlab="no. significant sets", main="Effect of correcting the background bug",
col=c("gray30","gray80"))
text(df$ns1+50,((1:nrow(df))*3)-1.5,labels=signif(df[,1],3))
text(df$ns2+50,((1:nrow(df))*3)-0.5,labels=signif(df[,2],3))
#abline(v=seq(100,1000,100),lty=2,lwd=0.5,col="gray")
legend("bottomright", inset=.02, legend=c("corrected","original"),
fill=c("gray80","gray30"), horiz=FALSE, cex=1.1)
mylabels <- gsub("$","%",gsub("^","+",signif(((df$ns2/df$ns1)-1)*100,3)))
text(df$ns2+160,((1:nrow(df))*3)-1,labels=mylabels,cex=1.1)
# JACCARD
jac <- matrix( res1$jaccard , ncol=9 )
rownames(jac) <- names(d)
colnames(jac) <- names(gs)
signif(jac,3) %>%
kbl(row.names = TRUE, caption="Jaccard similarity Corrected:Original") %>%
kable_paper("hover", full_width = F)
KEGG | Reactome | Wikipathways | miR targets | TFT GTRD | GO | HPO | Cellmarkers | Hallmark | |
---|---|---|---|---|---|---|---|---|---|
bulkrna1.Rds | 0.474 | 0.627 | 0.717 | 0.0494 | 0.612 | 0.686 | 0.0476 | 0.818 | 0.350 |
bulkrna2.Rds | 0.424 | 0.681 | 0.551 | 0.3600 | 0.700 | 0.850 | 0.3850 | 0.920 | 0.658 |
bulkrna3.Rds | 0.333 | 0.562 | 0.667 | 0.5530 | 0.800 | 0.750 | 0.3820 | 0.903 | 0.773 |
bulkrna4.Rds | 0.583 | 0.674 | 0.260 | 0.4290 | 0.626 | 0.552 | 0.0811 | 0.857 | 0.481 |
bulkrna5.Rds | NaN | NaN | 1.000 | 1.0000 | 1.000 | 0.308 | 0.0000 | NaN | 0.000 |
bulkrna6.Rds | 0.839 | 0.760 | 0.845 | 0.8390 | 0.850 | 0.745 | 0.0642 | 0.899 | 0.757 |
bulkrna7.Rds | 0.237 | 0.771 | 0.299 | 0.6620 | 0.843 | 0.818 | 0.4140 | 0.909 | 0.444 |
jac2 <- apply(jac,2,function(x) { x <- x[!is.na(x)] ; x <- x[is.finite(x)] ; mean(x) } )
jac2
## KEGG Reactome Wikipathways miR targets TFT GTRD GO
## 0.4817013 0.6792784 0.6199406 0.5560787 0.7758143 0.6726268
## HPO Cellmarkers Hallmark
## 0.1962090 0.8842156 0.4947578
barplot(sort(jac2),horiz=TRUE,las=1,xlim=c(0,1),
main="Jaccard similarity Corrected:Original",xlab="Jaccard index")
For bulk RNA-seq data set #1, run enrichment with all the gene set libraries.
names(gs)
## [1] "KEGG" "Reactome" "Wikipathways" "miR targets" "TFT GTRD"
## [6] "GO" "HPO" "Cellmarkers" "Hallmark"
lapply(1:length(gs), function(i) {
bad <- orabad(d[[1]],gs[[i]])
good <- oragood(d[[1]],gs[[i]])
compare_ora_plots(bad,good,libname=names(gs)[i])
} )
## [[1]]
## NULL
##
## [[2]]
## NULL
##
## [[3]]
## NULL
##
## [[4]]
## NULL
##
## [[5]]
## NULL
##
## [[6]]
## NULL
##
## [[7]]
## NULL
##
## [[8]]
## NULL
##
## [[9]]
## NULL
For reproducibility
sessionInfo()
## R version 4.4.0 (2024-04-24)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 22.04.4 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Australia/Melbourne
## tzcode source: system (glibc)
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] gplots_3.1.3.1 eulerr_7.0.2
## [3] fgsea_1.30.0 clusterProfiler_4.12.0
## [5] kableExtra_1.4.0 DESeq2_1.44.0
## [7] SummarizedExperiment_1.34.0 Biobase_2.64.0
## [9] MatrixGenerics_1.16.0 matrixStats_1.3.0
## [11] GenomicRanges_1.56.0 GenomeInfoDb_1.40.0
## [13] IRanges_2.38.0 S4Vectors_0.42.0
## [15] BiocGenerics_0.50.0 getDEE2_1.14.0
##
## loaded via a namespace (and not attached):
## [1] RColorBrewer_1.1-3 rstudioapi_0.16.0 jsonlite_1.8.8
## [4] magrittr_2.0.3 farver_2.1.2 rmarkdown_2.27
## [7] fs_1.6.4 zlibbioc_1.50.0 vctrs_0.6.5
## [10] memoise_2.0.1.9000 ggtree_3.12.0 htmltools_0.5.8.1
## [13] S4Arrays_1.4.0 SparseArray_1.4.3 gridGraphics_0.5-1
## [16] sass_0.4.9 KernSmooth_2.23-24 bslib_0.7.0
## [19] plyr_1.8.9 cachem_1.1.0 igraph_2.0.3
## [22] lifecycle_1.0.4 pkgconfig_2.0.3 gson_0.1.0
## [25] Matrix_1.7-0 R6_2.5.1 fastmap_1.2.0
## [28] GenomeInfoDbData_1.2.12 digest_0.6.35 aplot_0.2.2
## [31] enrichplot_1.24.0 colorspace_2.1-0 patchwork_1.2.0
## [34] AnnotationDbi_1.66.0 RSQLite_2.3.7 fansi_1.0.6
## [37] httr_1.4.7 polyclip_1.10-6 abind_1.4-5
## [40] compiler_4.4.0 bit64_4.0.5 withr_3.0.0
## [43] BiocParallel_1.38.0 viridis_0.6.5 DBI_1.2.3
## [46] highr_0.11 ggforce_0.4.2 MASS_7.3-60.2
## [49] DelayedArray_0.30.1 HDO.db_0.99.1 caTools_1.18.2
## [52] gtools_3.9.5 tools_4.4.0 scatterpie_0.2.2
## [55] ape_5.8 glue_1.7.0 nlme_3.1-164
## [58] GOSemSim_2.30.0 shadowtext_0.1.3 grid_4.4.0
## [61] reshape2_1.4.4 generics_0.1.3 gtable_0.3.5
## [64] tidyr_1.3.1 data.table_1.15.4 tidygraph_1.3.1
## [67] xml2_1.3.6 utf8_1.2.4 XVector_0.44.0
## [70] ggrepel_0.9.5 pillar_1.9.0 stringr_1.5.1
## [73] yulab.utils_0.1.4 splines_4.4.0 dplyr_1.1.4
## [76] tweenr_2.0.3 treeio_1.28.0 lattice_0.22-6
## [79] bit_4.0.5 tidyselect_1.2.1 GO.db_3.19.1
## [82] locfit_1.5-9.9 Biostrings_2.72.0 knitr_1.47
## [85] gridExtra_2.3 svglite_2.1.3 xfun_0.44
## [88] graphlayouts_1.1.1 stringi_1.8.4 UCSC.utils_1.0.0
## [91] lazyeval_0.2.2 ggfun_0.1.5 yaml_2.3.8
## [94] evaluate_0.23 codetools_0.2-20 ggraph_2.2.1
## [97] tibble_3.2.1 qvalue_2.36.0 ggplotify_0.1.2
## [100] cli_3.6.2 systemfonts_1.1.0 munsell_0.5.1
## [103] jquerylib_0.1.4 Rcpp_1.0.12 png_0.1-8
## [106] ggplot2_3.5.1 blob_1.2.4 DOSE_3.30.1
## [109] htm2txt_2.2.2 bitops_1.0-7 viridisLite_0.4.2
## [112] tidytree_0.4.6 scales_1.3.0 purrr_1.0.2
## [115] crayon_1.5.2 rlang_1.1.4 cowplot_1.1.3
## [118] fastmatch_1.1-4 KEGGREST_1.44.0