Examining background gene lists in clusterProfiler

Source: https://github.com/markziemann/background

Introduction

This guide is a Rmarkdown script that conducts differential expression and enrichment analysis, which are very popular workflows for transcriptome data.

The goal of this work is to understand how ClusterProfiler manages the background list, as we have observed some weird behaviour. In particular we see that when we provide a custom background, those genes do not appear in the final analysis unless they are also included in the gene list.

In the code chunk below called libs, you can add and remove required R library dependancies. Check that the libraries listed here match the Dockerfile, otherwise you might get errors.

suppressPackageStartupMessages({
  library("getDEE2")
  library("DESeq2")
  library("kableExtra")
  library("clusterProfiler")
  library("fgsea")
  library("eulerr")
  library("gplots")
  library("parallel")
})

For this guide I will be using bulk RNA-seq data from several previous studies.

Load data

Gene sets were obtained from MSigDB.

myfiles <- list.files("../dataprep",pattern="*Rds",full.names=TRUE)
d <- lapply(myfiles,readRDS)
names(d) <- basename(myfiles)

mygmts <- list.files("../gmt",pattern="*gmt",full.names=TRUE)
gs <- lapply(mygmts,gmtPathways)
names(gs) <- basename(mygmts)

names(gs)

## [1] "c2.cp.kegg_medicus.v2023.2.Hs.symbols.gmt"
## [2] "c2.cp.reactome.v2023.2.Hs.symbols.gmt"    
## [3] "c2.cp.wikipathways.v2023.2.Hs.symbols.gmt"
## [4] "c3.mir.mirdb.v2023.2.Hs.symbols.gmt"      
## [5] "c3.tft.gtrd.v2023.2.Hs.symbols.gmt"       
## [6] "c5.go.v2023.2.Hs.symbols.gmt"             
## [7] "c5.hpo.v2023.2.Hs.symbols.gmt"            
## [8] "c8.all.v2023.2.Hs.symbols.gmt"            
## [9] "h.all.v2023.2.Hs.symbols.gmt"

names(gs) <- c("KEGG", "Reactome", "Wikipathways", "miR targets", "TFT GTRD",
  "GO", "HPO", "Cellmarkers", "Hallmark" )

Basic geneset analysis

# number of sets
lapply(gs,length)

## $KEGG
## [1] 619
## 
## $Reactome
## [1] 1692
## 
## $Wikipathways
## [1] 791
## 
## $`miR targets`
## [1] 2377
## 
## $`TFT GTRD`
## [1] 505
## 
## $GO
## [1] 10461
## 
## $HPO
## [1] 5547
## 
## $Cellmarkers
## [1] 830
## 
## $Hallmark
## [1] 50

# set size summary
lapply(gs,function(x) {
  y <- unlist(lapply(x, length ))
  summary(y)
} )

## $KEGG
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    5.00    8.00   13.00   15.59   19.00   92.00 
## 
## $Reactome
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    5.00   11.00   23.00   56.28   57.00 1463.00 
## 
## $Wikipathways
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    5.00   14.00   28.00   42.92   53.00  438.00 
## 
## $`miR targets`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     5.0    59.0   103.0   156.5   186.0  1071.0 
## 
## $`TFT GTRD`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     5.0    55.0   287.0   509.3   880.0  1999.0 
## 
## $GO
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    5.00    9.00   18.00   79.83   55.00 1994.00 
## 
## $HPO
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    5.00   10.00   23.00   91.32   74.00 1735.00 
## 
## $Cellmarkers
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     5.0    52.0   115.0   180.5   223.2  1770.0 
## 
## $Hallmark
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    32.0    98.0   180.0   146.4   200.0   200.0

# gene coverage
lapply(gs,function(x) {
  y <- unique(unlist(x))
  length(y)
} )

## $KEGG
## [1] 2727
## 
## $Reactome
## [1] 11155
## 
## $Wikipathways
## [1] 8313
## 
## $`miR targets`
## [1] 16655
## 
## $`TFT GTRD`
## [1] 26807
## 
## $GO
## [1] 19428
## 
## $HPO
## [1] 4997
## 
## $Cellmarkers
## [1] 20418
## 
## $Hallmark
## [1] 4384

# total number of annotations
lapply(gs, function(x) {
  sum(unlist(lapply(x,length)))
} )

## $KEGG
## [1] 9651
## 
## $Reactome
## [1] 95226
## 
## $Wikipathways
## [1] 33947
## 
## $`miR targets`
## [1] 372061
## 
## $`TFT GTRD`
## [1] 257181
## 
## $GO
## [1] 835146
## 
## $HPO
## [1] 506552
## 
## $Cellmarkers
## [1] 149827
## 
## $Hallmark
## [1] 7322

Enrichment with Clusterprofiler

Here is a function that conducts over-representation analysis with ClusterProfiler. Genes that are not annotated as having any gene set category are discarded.

# df = deseq output
# df must have "ensID geneSymbol" as rownames

# gs = gene sets
# gs format is list of symbols
orabad <- function( df, gs ) {

  def <- df

  gs1 <- stack(gs)
  colnames(gs1) <- c("gene","term")
  gs1 <- gs1[,c(2,1)]

  defup <- rownames(subset(def,padj<=0.05 & log2FoldChange>0))
  defup <- defup[lapply(strsplit(defup," "),length)==2]
  defup <- unique(sapply(strsplit(defup," "),"[[",2))

  if ( length(defup)<250 ) {
    defup <- head(rownames(subset(def,log2FoldChange>0)),250)
    defup <- defup[lapply(strsplit(defup," "),length)==2]
    defup <- unique(sapply(strsplit(defup," "),"[[",2))
  }

  defdn <- rownames(subset(def,padj<=0.05 & log2FoldChange<0))
  defdn <- defdn[lapply(strsplit(defdn," "),length)==2]
  defdn <- unique(sapply(strsplit(defdn," "),"[[",2))

  if ( length(defdn)<250 ) {
    defdn <- head(rownames(subset(def,log2FoldChange<0)),250)
    defdn <- defdn[lapply(strsplit(defdn," "),length)==2]
    defdn <- unique(sapply(strsplit(defdn," "),"[[",2))
  }

  bg <- rownames(def)
  bg <- bg[lapply(strsplit(bg," "),length)==2]
  bg <- unique(sapply(strsplit(bg," "),"[[",2))

  ora_up <- as.data.frame(enricher(gene = defup ,
    universe = bg, minGSSize = 5, maxGSSize = 500000, TERM2GENE = gs1,
    pAdjustMethod="fdr",  pvalueCutoff = 1, qvalueCutoff = 1  ))

  ora_up$geneID <- NULL
  ora_ups <- rownames(ora_up)

  gr <- as.numeric(sapply(strsplit(ora_up$GeneRatio,"/"),"[[",1)) /
    as.numeric(sapply(strsplit(ora_up$GeneRatio,"/"),"[[",2))

  br <- as.numeric(sapply(strsplit(ora_up$BgRatio,"/"),"[[",1)) /
    as.numeric(sapply(strsplit(ora_up$BgRatio,"/"),"[[",2))

  ora_up$es <- gr/br
  ora_up <- ora_up[order(-ora_up$es),]
  ora_up$Description=NULL

  ora_dn <- as.data.frame(enricher(gene = defdn ,
    universe = bg, minGSSize = 5, maxGSSize = 500000, TERM2GENE = gs1,
    pAdjustMethod="fdr",  pvalueCutoff = 1, qvalueCutoff = 1  ))

  ora_dn$geneID <- NULL
  ora_dns <- rownames(ora_dn)

  gr <- as.numeric(sapply(strsplit(ora_dn$GeneRatio,"/"),"[[",1)) /
    as.numeric(sapply(strsplit(ora_dn$GeneRatio,"/"),"[[",2))

  br <- as.numeric(sapply(strsplit(ora_dn$BgRatio,"/"),"[[",1)) /
    as.numeric(sapply(strsplit(ora_dn$BgRatio,"/"),"[[",2))

  ora_dn$es <- gr/br
  ora_dn <- ora_dn[order(-ora_dn$es),]
  ora_dn$Description=NULL

  result <- list("orabad_up"=ora_up,"orabad_dn"=ora_dn)
  return(result)
}

This function fixes the problem with excluding un-annotated genes.

oragood <- function( df, gs ) {

  def <- df

  gs1 <- stack(gs)
  colnames(gs1) <- c("gene","term")
  gs1 <- gs1[,c(2,1)]

  defup <- rownames(subset(def,padj<=0.05 & log2FoldChange>0))
  defup <- defup[lapply(strsplit(defup," "),length)==2]
  defup <- unique(sapply(strsplit(defup," "),"[[",2))

  if ( length(defup)<250 ) {
    defup <- head(rownames(subset(def,log2FoldChange>0)),250)
    defup <- defup[lapply(strsplit(defup," "),length)==2]
    defup <- unique(sapply(strsplit(defup," "),"[[",2))
  }

  defdn <- rownames(subset(def,padj<=0.05 & log2FoldChange<0))
  defdn <- defdn[lapply(strsplit(defdn," "),length)==2]
  defdn <- unique(sapply(strsplit(defdn," "),"[[",2))

  if ( length(defdn)<250 ) {
    defdn <- head(rownames(subset(def,log2FoldChange<0)),250)
    defdn <- defdn[lapply(strsplit(defdn," "),length)==2]
    defdn <- unique(sapply(strsplit(defdn," "),"[[",2))
  }

  bg <- rownames(def)
  bg <- bg[lapply(strsplit(bg," "),length)==2]
  bg <- unique(sapply(strsplit(bg," "),"[[",2))

  bgdf <- data.frame("background",bg)
  names(bgdf) <- c("term","gene")
  gs1 <- rbind(gs1,bgdf)

  ora_up <- as.data.frame(enricher(gene = defup ,
    universe = bg, minGSSize = 5, maxGSSize = 500000, TERM2GENE = gs1,
    pAdjustMethod="fdr",  pvalueCutoff = 1, qvalueCutoff = 1  ))

  ora_up$geneID <- NULL
  ora_ups <- rownames(ora_up)

  gr <- as.numeric(sapply(strsplit(ora_up$GeneRatio,"/"),"[[",1)) /
    as.numeric(sapply(strsplit(ora_up$GeneRatio,"/"),"[[",2))

  br <- as.numeric(sapply(strsplit(ora_up$BgRatio,"/"),"[[",1)) /
    as.numeric(sapply(strsplit(ora_up$BgRatio,"/"),"[[",2))

  ora_up$es <- gr/br
  ora_up <- ora_up[order(-ora_up$es),]
  ora_up$Description=NULL

  ora_dn <- as.data.frame(enricher(gene = defdn ,
    universe = bg, minGSSize = 5, maxGSSize = 500000, TERM2GENE = gs1,
    pAdjustMethod="fdr",  pvalueCutoff = 1, qvalueCutoff = 1  ))

  ora_dn$geneID <- NULL
  ora_dns <- rownames(ora_dn)

  gr <- as.numeric(sapply(strsplit(ora_dn$GeneRatio,"/"),"[[",1)) /
    as.numeric(sapply(strsplit(ora_dn$GeneRatio,"/"),"[[",2))

  br <- as.numeric(sapply(strsplit(ora_dn$BgRatio,"/"),"[[",1)) /
    as.numeric(sapply(strsplit(ora_dn$BgRatio,"/"),"[[",2))

  ora_dn$es <- gr/br
  ora_dn <- ora_dn[order(-ora_dn$es),]
  ora_dn$Description=NULL

  result <- list("oragood_up"=ora_up,"oragood_dn"=ora_dn)
  return(result)
}

Compare the results of bad and good analysis

# bad=ORA analysis, a list with results of up and down regulated pathways
# good=same as bad, except fixed the background problem
compare_ora_plots <- function(bad,good,libname=NULL){
  badup <- bad$orabad_up
  baddn <- bad$orabad_dn
  goodup <- good$oragood_up
  gooddn <- good$oragood_dn
  mup <- merge(badup,goodup,by=0)
  MAX=max(c(mup$es.x,mup$es.y))
  UPHEADER=paste(libname,"up")
  DNHEADER=paste(libname,"dn")

  par(mfrow=c(2,2))

  plot(mup$es.x, mup$es.y,
    xlim=c(0,MAX), ylim=c(0,MAX),
    xlab="original ES", ylab="corrected ES", main=UPHEADER)
  grid()
  abline(a = 0, b = 1,lwd=2,lty=2)

  MAX=max(-log10(c(mup$pvalue.x,mup$pvalue.y)))

  plot(-log10(mup$pvalue.x), -log10(mup$pvalue.y),
    xlim=c(0,MAX), ylim=c(0,MAX),
    xlab="original -log10 p-value", ylab="corrected -log10 p-value", main=UPHEADER)
  grid()
  abline(a = 0, b = 1,lwd=2,lty=2)

  mdn <- merge(baddn,gooddn,by=0)
  MAX=max(c(mdn$es.x,mdn$es.y))

  plot(mdn$es.x, mdn$es.y,
    xlim=c(0,MAX), ylim=c(0,MAX),
    xlab="original ES", ylab="corrected ES", main=DNHEADER)
  grid()
  abline(a = 0, b = 1,lwd=2,lty=2)

  MAX=max(-log10(c(mdn$pvalue.x,mdn$pvalue.y)))

  plot(-log10(mdn$pvalue.x), -log10(mdn$pvalue.y),
    xlim=c(0,MAX), ylim=c(0,MAX),
    xlab="original -log10 p-value", ylab="corrected -log10 p-value", main=DNHEADER)
  grid()
  abline(a = 0, b = 1,lwd=2,lty=2)
}

compare_ora <- function(bad,good){
  badup <- bad$orabad_up
  baddn <- bad$orabad_dn
  goodup <- good$oragood_up
  gooddn <- good$oragood_dn
  mup <- merge(badup,goodup,by=0)
  mdn <- merge(baddn,gooddn,by=0)
  bup <- subset(mup,p.adjust.x<0.05)$Row.names
  gup <- subset(mup,p.adjust.y<0.05)$Row.names
  bdn <- subset(mdn,p.adjust.x<0.05)$Row.names
  gdn <- subset(mdn,p.adjust.y<0.05)$Row.names
  sig_bad <- unique(c(bup,bdn))
  sig_good <- unique(c(gup,gdn))
  nsig_bad <- length(sig_bad)
  nsig_good <- length(sig_good)
  jac <- length(intersect(sig_bad,sig_good)) / length(union(sig_bad,sig_good))
  result=c("nsig_bad"=nsig_bad,"nsig_good"=nsig_good,"jaccard"=jac)
  return(result)
}

Run some analyses

params <- expand.grid(1:length(d),1:length(gs))
params2 <- expand.grid(names(d),names(gs))

colnames(params) <- c("d","gs")

ora_dat <- mclapply(1:nrow(params), function(i) {
  j=params[i,1]
  k=params[i,2]
  bad <- orabad(d[[j]],gs[[k]])
  good <- oragood(d[[j]],gs[[k]])
  res <- compare_ora(bad,good)
  return(res)
} , mc.cores=8)

ora_dat <- do.call(rbind,ora_dat)

res1 <- cbind(params2,ora_dat)

res1 %>%
  kbl(row.names = TRUE, caption="all results") %>%
  kable_paper("hover", full_width = F)

all results
	Var1	Var2	nsig_bad	nsig_good	jaccard
1	bulkrna1.Rds	KEGG	9	19	0.4736842
2	bulkrna2.Rds	KEGG	25	59	0.4237288
3	bulkrna3.Rds	KEGG	2	6	0.3333333
4	bulkrna4.Rds	KEGG	14	24	0.5833333
5	bulkrna5.Rds	KEGG	0	0	NaN
6	bulkrna6.Rds	KEGG	47	56	0.8392857
7	bulkrna7.Rds	KEGG	9	38	0.2368421
8	bulkrna1.Rds	Reactome	64	102	0.6274510
9	bulkrna2.Rds	Reactome	239	351	0.6809117
10	bulkrna3.Rds	Reactome	94	159	0.5617284
11	bulkrna4.Rds	Reactome	207	307	0.6742671
12	bulkrna5.Rds	Reactome	0	0	NaN
13	bulkrna6.Rds	Reactome	254	334	0.7604790
14	bulkrna7.Rds	Reactome	185	240	0.7708333
15	bulkrna1.Rds	Wikipathways	33	46	0.7173913
16	bulkrna2.Rds	Wikipathways	108	196	0.5510204
17	bulkrna3.Rds	Wikipathways	8	12	0.6666667
18	bulkrna4.Rds	Wikipathways	13	50	0.2600000
19	bulkrna5.Rds	Wikipathways	1	1	1.0000000
20	bulkrna6.Rds	Wikipathways	82	97	0.8453608
21	bulkrna7.Rds	Wikipathways	35	117	0.2991453
22	bulkrna1.Rds	miR targets	4	81	0.0493827
23	bulkrna2.Rds	miR targets	40	111	0.3603604
24	bulkrna3.Rds	miR targets	21	38	0.5526316
25	bulkrna4.Rds	miR targets	272	634	0.4290221
26	bulkrna5.Rds	miR targets	1	1	1.0000000
27	bulkrna6.Rds	miR targets	1166	1365	0.8393895
28	bulkrna7.Rds	miR targets	360	544	0.6617647
29	bulkrna1.Rds	TFT GTRD	52	85	0.6117647
30	bulkrna2.Rds	TFT GTRD	98	140	0.7000000
31	bulkrna3.Rds	TFT GTRD	108	135	0.8000000
32	bulkrna4.Rds	TFT GTRD	72	115	0.6260870
33	bulkrna5.Rds	TFT GTRD	1	1	1.0000000
34	bulkrna6.Rds	TFT GTRD	147	173	0.8497110
35	bulkrna7.Rds	TFT GTRD	129	153	0.8431373
36	bulkrna1.Rds	GO	485	707	0.6859972
37	bulkrna2.Rds	GO	1843	2168	0.8500923
38	bulkrna3.Rds	GO	686	915	0.7497268
39	bulkrna4.Rds	GO	390	707	0.5516266
40	bulkrna5.Rds	GO	20	65	0.3076923
41	bulkrna6.Rds	GO	588	789	0.7452471
42	bulkrna7.Rds	GO	1263	1544	0.8180052
43	bulkrna1.Rds	HPO	2	42	0.0476190
44	bulkrna2.Rds	HPO	380	988	0.3846154
45	bulkrna3.Rds	HPO	92	241	0.3817427
46	bulkrna4.Rds	HPO	24	296	0.0810811
47	bulkrna5.Rds	HPO	0	25	0.0000000
48	bulkrna6.Rds	HPO	17	265	0.0641509
49	bulkrna7.Rds	HPO	186	449	0.4142539
50	bulkrna1.Rds	Cellmarkers	139	170	0.8176471
51	bulkrna2.Rds	Cellmarkers	439	477	0.9203354
52	bulkrna3.Rds	Cellmarkers	345	382	0.9031414
53	bulkrna4.Rds	Cellmarkers	245	286	0.8566434
54	bulkrna5.Rds	Cellmarkers	0	0	NaN
55	bulkrna6.Rds	Cellmarkers	267	297	0.8989899
56	bulkrna7.Rds	Cellmarkers	448	491	0.9085366
57	bulkrna1.Rds	Hallmark	7	20	0.3500000
58	bulkrna2.Rds	Hallmark	25	38	0.6578947
59	bulkrna3.Rds	Hallmark	18	21	0.7727273
60	bulkrna4.Rds	Hallmark	13	27	0.4814815
61	bulkrna5.Rds	Hallmark	0	3	0.0000000
62	bulkrna6.Rds	Hallmark	28	37	0.7567568
63	bulkrna7.Rds	Hallmark	18	34	0.4444444

summary(res1$nsig_bad)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0    13.0    52.0   188.4   242.0  1843.0

summary(res1$nsig_good)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0    35.5   115.0   274.2   320.5  2168.0

summary(res1$jaccard)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  0.0000  0.4068  0.6598  0.5913  0.8177  1.0000       3

# NUMBER OF SIGNIFICANT SETS
ns1 <- matrix( res1$nsig_bad , ncol=9 )
rownames(ns1) <- names(d)
colnames(ns1) <- names(gs)

ns1 %>%
  kbl(row.names = TRUE, caption="significant genesets in original analysis") %>%
  kable_paper("hover", full_width = F)

significant genesets in original analysis
	KEGG	Reactome	Wikipathways	miR targets	TFT GTRD	GO	HPO	Cellmarkers	Hallmark
bulkrna1.Rds	9	64	33	4	52	485	2	139	7
bulkrna2.Rds	25	239	108	40	98	1843	380	439	25
bulkrna3.Rds	2	94	8	21	108	686	92	345	18
bulkrna4.Rds	14	207	13	272	72	390	24	245	13
bulkrna5.Rds	0	0	1	1	1	20	0	0	0
bulkrna6.Rds	47	254	82	1166	147	588	17	267	28
bulkrna7.Rds	9	185	35	360	129	1263	186	448	18

ns1 <- apply(ns1,2,function(x) { x <- x[!is.na(x)] ; x <- x[is.finite(x)] ;  mean(x) } )

ns2 <- matrix( res1$nsig_good , ncol=9 )
rownames(ns2) <- names(d)
colnames(ns2) <- names(gs)

ns2 %>%
  kbl(row.names = TRUE, caption="significant genesets in corrected analysis") %>%
  kable_paper("hover", full_width = F)

significant genesets in corrected analysis
	KEGG	Reactome	Wikipathways	miR targets	TFT GTRD	GO	HPO	Cellmarkers	Hallmark
bulkrna1.Rds	19	102	46	81	85	707	42	170	20
bulkrna2.Rds	59	351	196	111	140	2168	988	477	38
bulkrna3.Rds	6	159	12	38	135	915	241	382	21
bulkrna4.Rds	24	307	50	634	115	707	296	286	27
bulkrna5.Rds	0	0	1	1	1	65	25	0	3
bulkrna6.Rds	56	334	97	1365	173	789	265	297	37
bulkrna7.Rds	38	240	117	544	153	1544	449	491	34

ns2 <- apply(ns2,2,function(x) { x <- x[!is.na(x)] ; x <- x[is.finite(x)] ;  mean(x) } )
ns2

##         KEGG     Reactome Wikipathways  miR targets     TFT GTRD           GO 
##     28.85714    213.28571     74.14286    396.28571    114.57143    985.00000 
##          HPO  Cellmarkers     Hallmark 
##    329.42857    300.42857     25.71429

df <- data.frame(ns1,ns2)
df <- df[order(df$ns1),]

par(mar=c(c(5.1, 7.1, 4.1, 2.1) ))
barplot(t(df),beside=TRUE,horiz=TRUE,las=1,xlim=c(0,1250),
  xlab="no. significant sets", main="Effect of correcting the background bug",
  col=c("gray30","gray80"))
text(df$ns1+50,((1:nrow(df))*3)-1.5,labels=signif(df[,1],3))
text(df$ns2+50,((1:nrow(df))*3)-0.5,labels=signif(df[,2],3))
#abline(v=seq(100,1000,100),lty=2,lwd=0.5,col="gray")
legend("bottomright", inset=.02, legend=c("corrected","original"),
  fill=c("gray80","gray30"), horiz=FALSE, cex=1.1)

mylabels <- gsub("$","%",gsub("^","+",signif(((df$ns2/df$ns1)-1)*100,3)))
text(df$ns2+160,((1:nrow(df))*3)-1,labels=mylabels,cex=1.1)

# JACCARD
jac <- matrix( res1$jaccard , ncol=9 )
rownames(jac) <- names(d)
colnames(jac) <- names(gs)

signif(jac,3) %>%
  kbl(row.names = TRUE, caption="Jaccard similarity Corrected:Original") %>%
  kable_paper("hover", full_width = F)

Jaccard similarity Corrected:Original
	KEGG	Reactome	Wikipathways	miR targets	TFT GTRD	GO	HPO	Cellmarkers	Hallmark
bulkrna1.Rds	0.474	0.627	0.717	0.0494	0.612	0.686	0.0476	0.818	0.350
bulkrna2.Rds	0.424	0.681	0.551	0.3600	0.700	0.850	0.3850	0.920	0.658
bulkrna3.Rds	0.333	0.562	0.667	0.5530	0.800	0.750	0.3820	0.903	0.773
bulkrna4.Rds	0.583	0.674	0.260	0.4290	0.626	0.552	0.0811	0.857	0.481
bulkrna5.Rds	NaN	NaN	1.000	1.0000	1.000	0.308	0.0000	NaN	0.000
bulkrna6.Rds	0.839	0.760	0.845	0.8390	0.850	0.745	0.0642	0.899	0.757
bulkrna7.Rds	0.237	0.771	0.299	0.6620	0.843	0.818	0.4140	0.909	0.444

jac2 <- apply(jac,2,function(x) { x <- x[!is.na(x)] ; x <- x[is.finite(x)] ;  mean(x) } )
jac2

##         KEGG     Reactome Wikipathways  miR targets     TFT GTRD           GO 
##    0.4817013    0.6792784    0.6199406    0.5560787    0.7758143    0.6726268 
##          HPO  Cellmarkers     Hallmark 
##    0.1962090    0.8842156    0.4947578

barplot(sort(jac2),horiz=TRUE,las=1,xlim=c(0,1),
  main="Jaccard similarity Corrected:Original",xlab="Jaccard index")

Make charts showing changes in pvalues

For bulk RNA-seq data set #1, run enrichment with all the gene set libraries.

names(gs)

## [1] "KEGG"         "Reactome"     "Wikipathways" "miR targets"  "TFT GTRD"    
## [6] "GO"           "HPO"          "Cellmarkers"  "Hallmark"

lapply(1:length(gs), function(i) {
  bad <- orabad(d[[1]],gs[[i]])
  good <- oragood(d[[1]],gs[[i]])
  compare_ora_plots(bad,good,libname=names(gs)[i])
} )

## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL
## 
## [[5]]
## NULL
## 
## [[6]]
## NULL
## 
## [[7]]
## NULL
## 
## [[8]]
## NULL
## 
## [[9]]
## NULL

Session information

For reproducibility

sessionInfo()

## R version 4.4.0 (2024-04-24)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 22.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0 
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Australia/Melbourne
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] gplots_3.1.3.1              eulerr_7.0.2               
##  [3] fgsea_1.30.0                clusterProfiler_4.12.0     
##  [5] kableExtra_1.4.0            DESeq2_1.44.0              
##  [7] SummarizedExperiment_1.34.0 Biobase_2.64.0             
##  [9] MatrixGenerics_1.16.0       matrixStats_1.3.0          
## [11] GenomicRanges_1.56.0        GenomeInfoDb_1.40.0        
## [13] IRanges_2.38.0              S4Vectors_0.42.0           
## [15] BiocGenerics_0.50.0         getDEE2_1.14.0             
## 
## loaded via a namespace (and not attached):
##   [1] RColorBrewer_1.1-3      rstudioapi_0.16.0       jsonlite_1.8.8         
##   [4] magrittr_2.0.3          farver_2.1.2            rmarkdown_2.27         
##   [7] fs_1.6.4                zlibbioc_1.50.0         vctrs_0.6.5            
##  [10] memoise_2.0.1.9000      ggtree_3.12.0           htmltools_0.5.8.1      
##  [13] S4Arrays_1.4.0          SparseArray_1.4.3       gridGraphics_0.5-1     
##  [16] sass_0.4.9              KernSmooth_2.23-24      bslib_0.7.0            
##  [19] plyr_1.8.9              cachem_1.1.0            igraph_2.0.3           
##  [22] lifecycle_1.0.4         pkgconfig_2.0.3         gson_0.1.0             
##  [25] Matrix_1.7-0            R6_2.5.1                fastmap_1.2.0          
##  [28] GenomeInfoDbData_1.2.12 digest_0.6.35           aplot_0.2.2            
##  [31] enrichplot_1.24.0       colorspace_2.1-0        patchwork_1.2.0        
##  [34] AnnotationDbi_1.66.0    RSQLite_2.3.7           fansi_1.0.6            
##  [37] httr_1.4.7              polyclip_1.10-6         abind_1.4-5            
##  [40] compiler_4.4.0          bit64_4.0.5             withr_3.0.0            
##  [43] BiocParallel_1.38.0     viridis_0.6.5           DBI_1.2.3              
##  [46] highr_0.11              ggforce_0.4.2           MASS_7.3-60.2          
##  [49] DelayedArray_0.30.1     HDO.db_0.99.1           caTools_1.18.2         
##  [52] gtools_3.9.5            tools_4.4.0             scatterpie_0.2.2       
##  [55] ape_5.8                 glue_1.7.0              nlme_3.1-164           
##  [58] GOSemSim_2.30.0         shadowtext_0.1.3        grid_4.4.0             
##  [61] reshape2_1.4.4          generics_0.1.3          gtable_0.3.5           
##  [64] tidyr_1.3.1             data.table_1.15.4       tidygraph_1.3.1        
##  [67] xml2_1.3.6              utf8_1.2.4              XVector_0.44.0         
##  [70] ggrepel_0.9.5           pillar_1.9.0            stringr_1.5.1          
##  [73] yulab.utils_0.1.4       splines_4.4.0           dplyr_1.1.4            
##  [76] tweenr_2.0.3            treeio_1.28.0           lattice_0.22-6         
##  [79] bit_4.0.5               tidyselect_1.2.1        GO.db_3.19.1           
##  [82] locfit_1.5-9.9          Biostrings_2.72.0       knitr_1.47             
##  [85] gridExtra_2.3           svglite_2.1.3           xfun_0.44              
##  [88] graphlayouts_1.1.1      stringi_1.8.4           UCSC.utils_1.0.0       
##  [91] lazyeval_0.2.2          ggfun_0.1.5             yaml_2.3.8             
##  [94] evaluate_0.23           codetools_0.2-20        ggraph_2.2.1           
##  [97] tibble_3.2.1            qvalue_2.36.0           ggplotify_0.1.2        
## [100] cli_3.6.2               systemfonts_1.1.0       munsell_0.5.1          
## [103] jquerylib_0.1.4         Rcpp_1.0.12             png_0.1-8              
## [106] ggplot2_3.5.1           blob_1.2.4              DOSE_3.30.1            
## [109] htm2txt_2.2.2           bitops_1.0-7            viridisLite_0.4.2      
## [112] tidytree_0.4.6          scales_1.3.0            purrr_1.0.2            
## [115] crayon_1.5.2            rlang_1.1.4             cowplot_1.1.3          
## [118] fastmatch_1.1-4         KEGGREST_1.44.0

Examining background gene lists in clusterProfiler - RNA-seq example

Mark Ziemann & Anusuiya Bora

2024-06-05