Source: https://github.com/markziemann/background

Introduction

This guide is a Rmarkdown script that conducts differential expression and enrichment analysis, which are very popular workflows for transcriptome data.

The goal of this work is to understand how ClusterProfiler manages the background list, as we have observed some weird behaviour. In particular we see that when we provide a custom background, those genes do not appear in the final analysis unless they are also included in the gene list.

In the code chunk below called libs, you can add and remove required R library dependancies. Check that the libraries listed here match the Dockerfile, otherwise you might get errors.

suppressPackageStartupMessages({
  library("getDEE2")
  library("DESeq2")
  library("kableExtra")
  library("clusterProfiler")
  library("fgsea")
  library("eulerr")
  library("gplots")
  library("parallel")
})

For this guide I will be using bulk RNA-seq data from several previous studies.

Load data

Gene sets were obtained from MSigDB.

myfiles <- list.files("../dataprep",pattern="*Rds",full.names=TRUE)
d <- lapply(myfiles,readRDS)
names(d) <- basename(myfiles)

mygmts <- list.files("../gmt",pattern="*gmt",full.names=TRUE)
gs <- lapply(mygmts,gmtPathways)
names(gs) <- basename(mygmts)

names(gs)
## [1] "c2.cp.kegg_medicus.v2023.2.Hs.symbols.gmt"
## [2] "c2.cp.reactome.v2023.2.Hs.symbols.gmt"    
## [3] "c2.cp.wikipathways.v2023.2.Hs.symbols.gmt"
## [4] "c3.mir.mirdb.v2023.2.Hs.symbols.gmt"      
## [5] "c3.tft.gtrd.v2023.2.Hs.symbols.gmt"       
## [6] "c5.go.v2023.2.Hs.symbols.gmt"             
## [7] "c5.hpo.v2023.2.Hs.symbols.gmt"            
## [8] "c8.all.v2023.2.Hs.symbols.gmt"            
## [9] "h.all.v2023.2.Hs.symbols.gmt"
names(gs) <- c("KEGG", "Reactome", "Wikipathways", "miR targets", "TFT GTRD",
  "GO", "HPO", "Cellmarkers", "Hallmark" )

Basic geneset analysis

# number of sets
lapply(gs,length)
## $KEGG
## [1] 619
## 
## $Reactome
## [1] 1692
## 
## $Wikipathways
## [1] 791
## 
## $`miR targets`
## [1] 2377
## 
## $`TFT GTRD`
## [1] 505
## 
## $GO
## [1] 10461
## 
## $HPO
## [1] 5547
## 
## $Cellmarkers
## [1] 830
## 
## $Hallmark
## [1] 50
# set size summary
lapply(gs,function(x) {
  y <- unlist(lapply(x, length ))
  summary(y)
} )
## $KEGG
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    5.00    8.00   13.00   15.59   19.00   92.00 
## 
## $Reactome
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    5.00   11.00   23.00   56.28   57.00 1463.00 
## 
## $Wikipathways
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    5.00   14.00   28.00   42.92   53.00  438.00 
## 
## $`miR targets`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     5.0    59.0   103.0   156.5   186.0  1071.0 
## 
## $`TFT GTRD`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     5.0    55.0   287.0   509.3   880.0  1999.0 
## 
## $GO
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    5.00    9.00   18.00   79.83   55.00 1994.00 
## 
## $HPO
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    5.00   10.00   23.00   91.32   74.00 1735.00 
## 
## $Cellmarkers
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     5.0    52.0   115.0   180.5   223.2  1770.0 
## 
## $Hallmark
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    32.0    98.0   180.0   146.4   200.0   200.0
# gene coverage
lapply(gs,function(x) {
  y <- unique(unlist(x))
  length(y)
} )
## $KEGG
## [1] 2727
## 
## $Reactome
## [1] 11155
## 
## $Wikipathways
## [1] 8313
## 
## $`miR targets`
## [1] 16655
## 
## $`TFT GTRD`
## [1] 26807
## 
## $GO
## [1] 19428
## 
## $HPO
## [1] 4997
## 
## $Cellmarkers
## [1] 20418
## 
## $Hallmark
## [1] 4384
# total number of annotations
lapply(gs, function(x) {
  sum(unlist(lapply(x,length)))
} )
## $KEGG
## [1] 9651
## 
## $Reactome
## [1] 95226
## 
## $Wikipathways
## [1] 33947
## 
## $`miR targets`
## [1] 372061
## 
## $`TFT GTRD`
## [1] 257181
## 
## $GO
## [1] 835146
## 
## $HPO
## [1] 506552
## 
## $Cellmarkers
## [1] 149827
## 
## $Hallmark
## [1] 7322

Enrichment with Clusterprofiler

Here is a function that conducts over-representation analysis with ClusterProfiler. Genes that are not annotated as having any gene set category are discarded.

# df = deseq output
# df must have "ensID geneSymbol" as rownames

# gs = gene sets
# gs format is list of symbols
orabad <- function( df, gs ) {

  def <- df

  gs1 <- stack(gs)
  colnames(gs1) <- c("gene","term")
  gs1 <- gs1[,c(2,1)]

  defup <- rownames(subset(def,padj<=0.05 & log2FoldChange>0))
  defup <- defup[lapply(strsplit(defup," "),length)==2]
  defup <- unique(sapply(strsplit(defup," "),"[[",2))

  if ( length(defup)<250 ) {
    defup <- head(rownames(subset(def,log2FoldChange>0)),250)
    defup <- defup[lapply(strsplit(defup," "),length)==2]
    defup <- unique(sapply(strsplit(defup," "),"[[",2))
  }

  defdn <- rownames(subset(def,padj<=0.05 & log2FoldChange<0))
  defdn <- defdn[lapply(strsplit(defdn," "),length)==2]
  defdn <- unique(sapply(strsplit(defdn," "),"[[",2))

  if ( length(defdn)<250 ) {
    defdn <- head(rownames(subset(def,log2FoldChange<0)),250)
    defdn <- defdn[lapply(strsplit(defdn," "),length)==2]
    defdn <- unique(sapply(strsplit(defdn," "),"[[",2))
  }

  bg <- rownames(def)
  bg <- bg[lapply(strsplit(bg," "),length)==2]
  bg <- unique(sapply(strsplit(bg," "),"[[",2))

  ora_up <- as.data.frame(enricher(gene = defup ,
    universe = bg, minGSSize = 5, maxGSSize = 500000, TERM2GENE = gs1,
    pAdjustMethod="fdr",  pvalueCutoff = 1, qvalueCutoff = 1  ))

  ora_up$geneID <- NULL
  ora_ups <- rownames(ora_up)

  gr <- as.numeric(sapply(strsplit(ora_up$GeneRatio,"/"),"[[",1)) /
    as.numeric(sapply(strsplit(ora_up$GeneRatio,"/"),"[[",2))

  br <- as.numeric(sapply(strsplit(ora_up$BgRatio,"/"),"[[",1)) /
    as.numeric(sapply(strsplit(ora_up$BgRatio,"/"),"[[",2))

  ora_up$es <- gr/br
  ora_up <- ora_up[order(-ora_up$es),]
  ora_up$Description=NULL

  ora_dn <- as.data.frame(enricher(gene = defdn ,
    universe = bg, minGSSize = 5, maxGSSize = 500000, TERM2GENE = gs1,
    pAdjustMethod="fdr",  pvalueCutoff = 1, qvalueCutoff = 1  ))

  ora_dn$geneID <- NULL
  ora_dns <- rownames(ora_dn)

  gr <- as.numeric(sapply(strsplit(ora_dn$GeneRatio,"/"),"[[",1)) /
    as.numeric(sapply(strsplit(ora_dn$GeneRatio,"/"),"[[",2))

  br <- as.numeric(sapply(strsplit(ora_dn$BgRatio,"/"),"[[",1)) /
    as.numeric(sapply(strsplit(ora_dn$BgRatio,"/"),"[[",2))

  ora_dn$es <- gr/br
  ora_dn <- ora_dn[order(-ora_dn$es),]
  ora_dn$Description=NULL

  result <- list("orabad_up"=ora_up,"orabad_dn"=ora_dn)
  return(result)
}

This function fixes the problem with excluding un-annotated genes.

oragood <- function( df, gs ) {

  def <- df

  gs1 <- stack(gs)
  colnames(gs1) <- c("gene","term")
  gs1 <- gs1[,c(2,1)]

  defup <- rownames(subset(def,padj<=0.05 & log2FoldChange>0))
  defup <- defup[lapply(strsplit(defup," "),length)==2]
  defup <- unique(sapply(strsplit(defup," "),"[[",2))

  if ( length(defup)<250 ) {
    defup <- head(rownames(subset(def,log2FoldChange>0)),250)
    defup <- defup[lapply(strsplit(defup," "),length)==2]
    defup <- unique(sapply(strsplit(defup," "),"[[",2))
  }

  defdn <- rownames(subset(def,padj<=0.05 & log2FoldChange<0))
  defdn <- defdn[lapply(strsplit(defdn," "),length)==2]
  defdn <- unique(sapply(strsplit(defdn," "),"[[",2))

  if ( length(defdn)<250 ) {
    defdn <- head(rownames(subset(def,log2FoldChange<0)),250)
    defdn <- defdn[lapply(strsplit(defdn," "),length)==2]
    defdn <- unique(sapply(strsplit(defdn," "),"[[",2))
  }

  bg <- rownames(def)
  bg <- bg[lapply(strsplit(bg," "),length)==2]
  bg <- unique(sapply(strsplit(bg," "),"[[",2))

  bgdf <- data.frame("background",bg)
  names(bgdf) <- c("term","gene")
  gs1 <- rbind(gs1,bgdf)

  ora_up <- as.data.frame(enricher(gene = defup ,
    universe = bg, minGSSize = 5, maxGSSize = 500000, TERM2GENE = gs1,
    pAdjustMethod="fdr",  pvalueCutoff = 1, qvalueCutoff = 1  ))

  ora_up$geneID <- NULL
  ora_ups <- rownames(ora_up)

  gr <- as.numeric(sapply(strsplit(ora_up$GeneRatio,"/"),"[[",1)) /
    as.numeric(sapply(strsplit(ora_up$GeneRatio,"/"),"[[",2))

  br <- as.numeric(sapply(strsplit(ora_up$BgRatio,"/"),"[[",1)) /
    as.numeric(sapply(strsplit(ora_up$BgRatio,"/"),"[[",2))

  ora_up$es <- gr/br
  ora_up <- ora_up[order(-ora_up$es),]
  ora_up$Description=NULL

  ora_dn <- as.data.frame(enricher(gene = defdn ,
    universe = bg, minGSSize = 5, maxGSSize = 500000, TERM2GENE = gs1,
    pAdjustMethod="fdr",  pvalueCutoff = 1, qvalueCutoff = 1  ))

  ora_dn$geneID <- NULL
  ora_dns <- rownames(ora_dn)

  gr <- as.numeric(sapply(strsplit(ora_dn$GeneRatio,"/"),"[[",1)) /
    as.numeric(sapply(strsplit(ora_dn$GeneRatio,"/"),"[[",2))

  br <- as.numeric(sapply(strsplit(ora_dn$BgRatio,"/"),"[[",1)) /
    as.numeric(sapply(strsplit(ora_dn$BgRatio,"/"),"[[",2))

  ora_dn$es <- gr/br
  ora_dn <- ora_dn[order(-ora_dn$es),]
  ora_dn$Description=NULL

  result <- list("oragood_up"=ora_up,"oragood_dn"=ora_dn)
  return(result)
}

Compare the results of bad and good analysis

# bad=ORA analysis, a list with results of up and down regulated pathways
# good=same as bad, except fixed the background problem
compare_ora_plots <- function(bad,good,libname=NULL){
  badup <- bad$orabad_up
  baddn <- bad$orabad_dn
  goodup <- good$oragood_up
  gooddn <- good$oragood_dn
  mup <- merge(badup,goodup,by=0)
  MAX=max(c(mup$es.x,mup$es.y))
  UPHEADER=paste(libname,"up")
  DNHEADER=paste(libname,"dn")

  par(mfrow=c(2,2))

  plot(mup$es.x, mup$es.y,
    xlim=c(0,MAX), ylim=c(0,MAX),
    xlab="original ES", ylab="corrected ES", main=UPHEADER)
  grid()
  abline(a = 0, b = 1,lwd=2,lty=2)

  MAX=max(-log10(c(mup$pvalue.x,mup$pvalue.y)))

  plot(-log10(mup$pvalue.x), -log10(mup$pvalue.y),
    xlim=c(0,MAX), ylim=c(0,MAX),
    xlab="original -log10 p-value", ylab="corrected -log10 p-value", main=UPHEADER)
  grid()
  abline(a = 0, b = 1,lwd=2,lty=2)

  mdn <- merge(baddn,gooddn,by=0)
  MAX=max(c(mdn$es.x,mdn$es.y))

  plot(mdn$es.x, mdn$es.y,
    xlim=c(0,MAX), ylim=c(0,MAX),
    xlab="original ES", ylab="corrected ES", main=DNHEADER)
  grid()
  abline(a = 0, b = 1,lwd=2,lty=2)

  MAX=max(-log10(c(mdn$pvalue.x,mdn$pvalue.y)))

  plot(-log10(mdn$pvalue.x), -log10(mdn$pvalue.y),
    xlim=c(0,MAX), ylim=c(0,MAX),
    xlab="original -log10 p-value", ylab="corrected -log10 p-value", main=DNHEADER)
  grid()
  abline(a = 0, b = 1,lwd=2,lty=2)
}

compare_ora <- function(bad,good){
  badup <- bad$orabad_up
  baddn <- bad$orabad_dn
  goodup <- good$oragood_up
  gooddn <- good$oragood_dn
  mup <- merge(badup,goodup,by=0)
  mdn <- merge(baddn,gooddn,by=0)
  bup <- subset(mup,p.adjust.x<0.05)$Row.names
  gup <- subset(mup,p.adjust.y<0.05)$Row.names
  bdn <- subset(mdn,p.adjust.x<0.05)$Row.names
  gdn <- subset(mdn,p.adjust.y<0.05)$Row.names
  sig_bad <- unique(c(bup,bdn))
  sig_good <- unique(c(gup,gdn))
  nsig_bad <- length(sig_bad)
  nsig_good <- length(sig_good)
  jac <- length(intersect(sig_bad,sig_good)) / length(union(sig_bad,sig_good))
  result=c("nsig_bad"=nsig_bad,"nsig_good"=nsig_good,"jaccard"=jac)
  return(result)
}

Run some analyses

params <- expand.grid(1:length(d),1:length(gs))
params2 <- expand.grid(names(d),names(gs))

colnames(params) <- c("d","gs")

ora_dat <- mclapply(1:nrow(params), function(i) {
  j=params[i,1]
  k=params[i,2]
  bad <- orabad(d[[j]],gs[[k]])
  good <- oragood(d[[j]],gs[[k]])
  res <- compare_ora(bad,good)
  return(res)
} , mc.cores=8)

ora_dat <- do.call(rbind,ora_dat)

res1 <- cbind(params2,ora_dat)

res1 %>%
  kbl(row.names = TRUE, caption="all results") %>%
  kable_paper("hover", full_width = F)
all results
Var1 Var2 nsig_bad nsig_good jaccard
1 bulkrna1.Rds KEGG 9 19 0.4736842
2 bulkrna2.Rds KEGG 25 59 0.4237288
3 bulkrna3.Rds KEGG 2 6 0.3333333
4 bulkrna4.Rds KEGG 14 24 0.5833333
5 bulkrna5.Rds KEGG 0 0 NaN
6 bulkrna6.Rds KEGG 47 56 0.8392857
7 bulkrna7.Rds KEGG 9 38 0.2368421
8 bulkrna1.Rds Reactome 64 102 0.6274510
9 bulkrna2.Rds Reactome 239 351 0.6809117
10 bulkrna3.Rds Reactome 94 159 0.5617284
11 bulkrna4.Rds Reactome 207 307 0.6742671
12 bulkrna5.Rds Reactome 0 0 NaN
13 bulkrna6.Rds Reactome 254 334 0.7604790
14 bulkrna7.Rds Reactome 185 240 0.7708333
15 bulkrna1.Rds Wikipathways 33 46 0.7173913
16 bulkrna2.Rds Wikipathways 108 196 0.5510204
17 bulkrna3.Rds Wikipathways 8 12 0.6666667
18 bulkrna4.Rds Wikipathways 13 50 0.2600000
19 bulkrna5.Rds Wikipathways 1 1 1.0000000
20 bulkrna6.Rds Wikipathways 82 97 0.8453608
21 bulkrna7.Rds Wikipathways 35 117 0.2991453
22 bulkrna1.Rds miR targets 4 81 0.0493827
23 bulkrna2.Rds miR targets 40 111 0.3603604
24 bulkrna3.Rds miR targets 21 38 0.5526316
25 bulkrna4.Rds miR targets 272 634 0.4290221
26 bulkrna5.Rds miR targets 1 1 1.0000000
27 bulkrna6.Rds miR targets 1166 1365 0.8393895
28 bulkrna7.Rds miR targets 360 544 0.6617647
29 bulkrna1.Rds TFT GTRD 52 85 0.6117647
30 bulkrna2.Rds TFT GTRD 98 140 0.7000000
31 bulkrna3.Rds TFT GTRD 108 135 0.8000000
32 bulkrna4.Rds TFT GTRD 72 115 0.6260870
33 bulkrna5.Rds TFT GTRD 1 1 1.0000000
34 bulkrna6.Rds TFT GTRD 147 173 0.8497110
35 bulkrna7.Rds TFT GTRD 129 153 0.8431373
36 bulkrna1.Rds GO 485 707 0.6859972
37 bulkrna2.Rds GO 1843 2168 0.8500923
38 bulkrna3.Rds GO 686 915 0.7497268
39 bulkrna4.Rds GO 390 707 0.5516266
40 bulkrna5.Rds GO 20 65 0.3076923
41 bulkrna6.Rds GO 588 789 0.7452471
42 bulkrna7.Rds GO 1263 1544 0.8180052
43 bulkrna1.Rds HPO 2 42 0.0476190
44 bulkrna2.Rds HPO 380 988 0.3846154
45 bulkrna3.Rds HPO 92 241 0.3817427
46 bulkrna4.Rds HPO 24 296 0.0810811
47 bulkrna5.Rds HPO 0 25 0.0000000
48 bulkrna6.Rds HPO 17 265 0.0641509
49 bulkrna7.Rds HPO 186 449 0.4142539
50 bulkrna1.Rds Cellmarkers 139 170 0.8176471
51 bulkrna2.Rds Cellmarkers 439 477 0.9203354
52 bulkrna3.Rds Cellmarkers 345 382 0.9031414
53 bulkrna4.Rds Cellmarkers 245 286 0.8566434
54 bulkrna5.Rds Cellmarkers 0 0 NaN
55 bulkrna6.Rds Cellmarkers 267 297 0.8989899
56 bulkrna7.Rds Cellmarkers 448 491 0.9085366
57 bulkrna1.Rds Hallmark 7 20 0.3500000
58 bulkrna2.Rds Hallmark 25 38 0.6578947
59 bulkrna3.Rds Hallmark 18 21 0.7727273
60 bulkrna4.Rds Hallmark 13 27 0.4814815
61 bulkrna5.Rds Hallmark 0 3 0.0000000
62 bulkrna6.Rds Hallmark 28 37 0.7567568
63 bulkrna7.Rds Hallmark 18 34 0.4444444
summary(res1$nsig_bad)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0    13.0    52.0   188.4   242.0  1843.0
summary(res1$nsig_good)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0    35.5   115.0   274.2   320.5  2168.0
summary(res1$jaccard)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  0.0000  0.4068  0.6598  0.5913  0.8177  1.0000       3
# NUMBER OF SIGNIFICANT SETS
ns1 <- matrix( res1$nsig_bad , ncol=9 )
rownames(ns1) <- names(d)
colnames(ns1) <- names(gs)

ns1 %>%
  kbl(row.names = TRUE, caption="significant genesets in original analysis") %>%
  kable_paper("hover", full_width = F)
significant genesets in original analysis
KEGG Reactome Wikipathways miR targets TFT GTRD GO HPO Cellmarkers Hallmark
bulkrna1.Rds 9 64 33 4 52 485 2 139 7
bulkrna2.Rds 25 239 108 40 98 1843 380 439 25
bulkrna3.Rds 2 94 8 21 108 686 92 345 18
bulkrna4.Rds 14 207 13 272 72 390 24 245 13
bulkrna5.Rds 0 0 1 1 1 20 0 0 0
bulkrna6.Rds 47 254 82 1166 147 588 17 267 28
bulkrna7.Rds 9 185 35 360 129 1263 186 448 18
ns1 <- apply(ns1,2,function(x) { x <- x[!is.na(x)] ; x <- x[is.finite(x)] ;  mean(x) } )

ns2 <- matrix( res1$nsig_good , ncol=9 )
rownames(ns2) <- names(d)
colnames(ns2) <- names(gs)

ns2 %>%
  kbl(row.names = TRUE, caption="significant genesets in corrected analysis") %>%
  kable_paper("hover", full_width = F)
significant genesets in corrected analysis
KEGG Reactome Wikipathways miR targets TFT GTRD GO HPO Cellmarkers Hallmark
bulkrna1.Rds 19 102 46 81 85 707 42 170 20
bulkrna2.Rds 59 351 196 111 140 2168 988 477 38
bulkrna3.Rds 6 159 12 38 135 915 241 382 21
bulkrna4.Rds 24 307 50 634 115 707 296 286 27
bulkrna5.Rds 0 0 1 1 1 65 25 0 3
bulkrna6.Rds 56 334 97 1365 173 789 265 297 37
bulkrna7.Rds 38 240 117 544 153 1544 449 491 34
ns2 <- apply(ns2,2,function(x) { x <- x[!is.na(x)] ; x <- x[is.finite(x)] ;  mean(x) } )
ns2
##         KEGG     Reactome Wikipathways  miR targets     TFT GTRD           GO 
##     28.85714    213.28571     74.14286    396.28571    114.57143    985.00000 
##          HPO  Cellmarkers     Hallmark 
##    329.42857    300.42857     25.71429
df <- data.frame(ns1,ns2)
df <- df[order(df$ns1),]

par(mar=c(c(5.1, 7.1, 4.1, 2.1) ))
barplot(t(df),beside=TRUE,horiz=TRUE,las=1,xlim=c(0,1250),
  xlab="no. significant sets", main="Effect of correcting the background bug",
  col=c("gray30","gray80"))
text(df$ns1+50,((1:nrow(df))*3)-1.5,labels=signif(df[,1],3))
text(df$ns2+50,((1:nrow(df))*3)-0.5,labels=signif(df[,2],3))
#abline(v=seq(100,1000,100),lty=2,lwd=0.5,col="gray")
legend("bottomright", inset=.02, legend=c("corrected","original"),
  fill=c("gray80","gray30"), horiz=FALSE, cex=1.1)

mylabels <- gsub("$","%",gsub("^","+",signif(((df$ns2/df$ns1)-1)*100,3)))
text(df$ns2+160,((1:nrow(df))*3)-1,labels=mylabels,cex=1.1)

# JACCARD
jac <- matrix( res1$jaccard , ncol=9 )
rownames(jac) <- names(d)
colnames(jac) <- names(gs)

signif(jac,3) %>%
  kbl(row.names = TRUE, caption="Jaccard similarity Corrected:Original") %>%
  kable_paper("hover", full_width = F)
Jaccard similarity Corrected:Original
KEGG Reactome Wikipathways miR targets TFT GTRD GO HPO Cellmarkers Hallmark
bulkrna1.Rds 0.474 0.627 0.717 0.0494 0.612 0.686 0.0476 0.818 0.350
bulkrna2.Rds 0.424 0.681 0.551 0.3600 0.700 0.850 0.3850 0.920 0.658
bulkrna3.Rds 0.333 0.562 0.667 0.5530 0.800 0.750 0.3820 0.903 0.773
bulkrna4.Rds 0.583 0.674 0.260 0.4290 0.626 0.552 0.0811 0.857 0.481
bulkrna5.Rds NaN NaN 1.000 1.0000 1.000 0.308 0.0000 NaN 0.000
bulkrna6.Rds 0.839 0.760 0.845 0.8390 0.850 0.745 0.0642 0.899 0.757
bulkrna7.Rds 0.237 0.771 0.299 0.6620 0.843 0.818 0.4140 0.909 0.444
jac2 <- apply(jac,2,function(x) { x <- x[!is.na(x)] ; x <- x[is.finite(x)] ;  mean(x) } )
jac2
##         KEGG     Reactome Wikipathways  miR targets     TFT GTRD           GO 
##    0.4817013    0.6792784    0.6199406    0.5560787    0.7758143    0.6726268 
##          HPO  Cellmarkers     Hallmark 
##    0.1962090    0.8842156    0.4947578
barplot(sort(jac2),horiz=TRUE,las=1,xlim=c(0,1),
  main="Jaccard similarity Corrected:Original",xlab="Jaccard index")

Make charts showing changes in pvalues

For bulk RNA-seq data set #1, run enrichment with all the gene set libraries.

names(gs)
## [1] "KEGG"         "Reactome"     "Wikipathways" "miR targets"  "TFT GTRD"    
## [6] "GO"           "HPO"          "Cellmarkers"  "Hallmark"
lapply(1:length(gs), function(i) {
  bad <- orabad(d[[1]],gs[[i]])
  good <- oragood(d[[1]],gs[[i]])
  compare_ora_plots(bad,good,libname=names(gs)[i])
} )

## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL
## 
## [[5]]
## NULL
## 
## [[6]]
## NULL
## 
## [[7]]
## NULL
## 
## [[8]]
## NULL
## 
## [[9]]
## NULL

Session information

For reproducibility

sessionInfo()
## R version 4.4.0 (2024-04-24)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 22.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0 
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Australia/Melbourne
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] gplots_3.1.3.1              eulerr_7.0.2               
##  [3] fgsea_1.30.0                clusterProfiler_4.12.0     
##  [5] kableExtra_1.4.0            DESeq2_1.44.0              
##  [7] SummarizedExperiment_1.34.0 Biobase_2.64.0             
##  [9] MatrixGenerics_1.16.0       matrixStats_1.3.0          
## [11] GenomicRanges_1.56.0        GenomeInfoDb_1.40.0        
## [13] IRanges_2.38.0              S4Vectors_0.42.0           
## [15] BiocGenerics_0.50.0         getDEE2_1.14.0             
## 
## loaded via a namespace (and not attached):
##   [1] RColorBrewer_1.1-3      rstudioapi_0.16.0       jsonlite_1.8.8         
##   [4] magrittr_2.0.3          farver_2.1.2            rmarkdown_2.27         
##   [7] fs_1.6.4                zlibbioc_1.50.0         vctrs_0.6.5            
##  [10] memoise_2.0.1.9000      ggtree_3.12.0           htmltools_0.5.8.1      
##  [13] S4Arrays_1.4.0          SparseArray_1.4.3       gridGraphics_0.5-1     
##  [16] sass_0.4.9              KernSmooth_2.23-24      bslib_0.7.0            
##  [19] plyr_1.8.9              cachem_1.1.0            igraph_2.0.3           
##  [22] lifecycle_1.0.4         pkgconfig_2.0.3         gson_0.1.0             
##  [25] Matrix_1.7-0            R6_2.5.1                fastmap_1.2.0          
##  [28] GenomeInfoDbData_1.2.12 digest_0.6.35           aplot_0.2.2            
##  [31] enrichplot_1.24.0       colorspace_2.1-0        patchwork_1.2.0        
##  [34] AnnotationDbi_1.66.0    RSQLite_2.3.7           fansi_1.0.6            
##  [37] httr_1.4.7              polyclip_1.10-6         abind_1.4-5            
##  [40] compiler_4.4.0          bit64_4.0.5             withr_3.0.0            
##  [43] BiocParallel_1.38.0     viridis_0.6.5           DBI_1.2.3              
##  [46] highr_0.11              ggforce_0.4.2           MASS_7.3-60.2          
##  [49] DelayedArray_0.30.1     HDO.db_0.99.1           caTools_1.18.2         
##  [52] gtools_3.9.5            tools_4.4.0             scatterpie_0.2.2       
##  [55] ape_5.8                 glue_1.7.0              nlme_3.1-164           
##  [58] GOSemSim_2.30.0         shadowtext_0.1.3        grid_4.4.0             
##  [61] reshape2_1.4.4          generics_0.1.3          gtable_0.3.5           
##  [64] tidyr_1.3.1             data.table_1.15.4       tidygraph_1.3.1        
##  [67] xml2_1.3.6              utf8_1.2.4              XVector_0.44.0         
##  [70] ggrepel_0.9.5           pillar_1.9.0            stringr_1.5.1          
##  [73] yulab.utils_0.1.4       splines_4.4.0           dplyr_1.1.4            
##  [76] tweenr_2.0.3            treeio_1.28.0           lattice_0.22-6         
##  [79] bit_4.0.5               tidyselect_1.2.1        GO.db_3.19.1           
##  [82] locfit_1.5-9.9          Biostrings_2.72.0       knitr_1.47             
##  [85] gridExtra_2.3           svglite_2.1.3           xfun_0.44              
##  [88] graphlayouts_1.1.1      stringi_1.8.4           UCSC.utils_1.0.0       
##  [91] lazyeval_0.2.2          ggfun_0.1.5             yaml_2.3.8             
##  [94] evaluate_0.23           codetools_0.2-20        ggraph_2.2.1           
##  [97] tibble_3.2.1            qvalue_2.36.0           ggplotify_0.1.2        
## [100] cli_3.6.2               systemfonts_1.1.0       munsell_0.5.1          
## [103] jquerylib_0.1.4         Rcpp_1.0.12             png_0.1-8              
## [106] ggplot2_3.5.1           blob_1.2.4              DOSE_3.30.1            
## [109] htm2txt_2.2.2           bitops_1.0-7            viridisLite_0.4.2      
## [112] tidytree_0.4.6          scales_1.3.0            purrr_1.0.2            
## [115] crayon_1.5.2            rlang_1.1.4             cowplot_1.1.3          
## [118] fastmatch_1.1-4         KEGGREST_1.44.0