Source: https://github.com/markziemann/background

Introduction

This guide is a Rmarkdown script that conducts differential expression and enrichment analysis, which are very popular workflows for transcriptome data.

The goal of this work is to understand how ClusterProfiler manages the background list, as we have observed some weird behaviour. In particular we see that when we provide a custom background, those genes do not appear in the final analysis unless they are also included in the gene list.

In the code chunk below called libs, you can add and remove required R library dependancies. Check that the libraries listed here match the Dockerfile, otherwise you might get errors.

suppressPackageStartupMessages({
  library("getDEE2")
  library("DESeq2")
  library("kableExtra")
  library("clusterProfiler")
  library("fgsea")
  library("eulerr")
  library("gplots")
  library("parallel")
})

For this guide I will be using bulk RNA-seq data from several previous studies.

Load data

Gene sets were obtained from MSigDB.

myfiles <- list.files("../dataprep",pattern="*Rds",full.names=TRUE)
d <- lapply(myfiles,readRDS)
names(d) <- basename(myfiles)

mygmts <- list.files("../gmt",pattern="*gmt",full.names=TRUE)
gs <- lapply(mygmts,gmtPathways)
names(gs) <- basename(mygmts)

names(gs)
## [1] "c2.cp.kegg_medicus.v2023.2.Hs.symbols.gmt"
## [2] "c2.cp.reactome.v2023.2.Hs.symbols.gmt"    
## [3] "c2.cp.wikipathways.v2023.2.Hs.symbols.gmt"
## [4] "c3.mir.mirdb.v2023.2.Hs.symbols.gmt"      
## [5] "c3.tft.gtrd.v2023.2.Hs.symbols.gmt"       
## [6] "c5.go.v2023.2.Hs.symbols.gmt"             
## [7] "c5.hpo.v2023.2.Hs.symbols.gmt"            
## [8] "c8.all.v2023.2.Hs.symbols.gmt"            
## [9] "h.all.v2023.2.Hs.symbols.gmt"
names(gs) <- c("KEGG", "Reactome", "Wikipathways", "miR targets", "TFT GTRD",
  "GO", "HPO", "Cellmarkers", "Hallmark" )

Basic geneset analysis

# number of sets
num <- unlist(lapply(gs,length))
num
##         KEGG     Reactome Wikipathways  miR targets     TFT GTRD           GO 
##          619         1692          791         2377          505        10461 
##          HPO  Cellmarkers     Hallmark 
##         5547          830           50
# set size summary
sz <- unlist(lapply(gs,function(x) {
  y <- unlist(lapply(x, length ))
  summary(y)
} ))
sz <- t(matrix(sz,ncol=9))
rownames(sz) <- names(gs)
colnames(sz) <- names(summary(c(1,2,3,4)))
sz
##              Min. 1st Qu. Median      Mean 3rd Qu. Max.
## KEGG            5       8     13  15.59128   19.00   92
## Reactome        5      11     23  56.28014   57.00 1463
## Wikipathways    5      14     28  42.91656   53.00  438
## miR targets     5      59    103 156.52545  186.00 1071
## TFT GTRD        5      55    287 509.26931  880.00 1999
## GO              5       9     18  79.83424   55.00 1994
## HPO             5      10     23  91.31999   74.00 1735
## Cellmarkers     5      52    115 180.51446  223.25 1770
## Hallmark       32      98    180 146.44000  200.00  200
# gene coverage
cov <- unlist(lapply(gs,function(x) {
  y <- unique(unlist(x))
  length(y)
} ))
cov
##         KEGG     Reactome Wikipathways  miR targets     TFT GTRD           GO 
##         2727        11155         8313        16655        26807        19428 
##          HPO  Cellmarkers     Hallmark 
##         4997        20418         4384
# total number of annotations
tot <- unlist(lapply(gs, function(x) {
  sum(unlist(lapply(x,length)))
} ))
tot
##         KEGG     Reactome Wikipathways  miR targets     TFT GTRD           GO 
##         9651        95226        33947       372061       257181       835146 
##          HPO  Cellmarkers     Hallmark 
##       506552       149827         7322

First test

x1 <- readRDS("../dataprep/bulkrna1.Rds")
head(x1)
##                         baseMean log2FoldChange     lfcSE       stat
## ENSG00000000003 TSPAN6 3754.3819    -0.10925392 0.1417469 -0.7707674
## ENSG00000000419 DPM1   1718.6738    -0.06109763 0.1477350 -0.4135622
## ENSG00000000457 SCYL3   260.6222     0.28894664 0.2166482  1.3337138
## ENSG00000000460 FIRRM   327.1207     0.57058478 0.2019567  2.8252821
## ENSG00000001036 FUCA2  6164.1606    -0.47524943 0.1136307 -4.1824048
## ENSG00000001084 GCLC   4296.9247    -0.68760144 0.1235878 -5.5636659
##                              pvalue         padj
## ENSG00000000003 TSPAN6 4.408448e-01 6.243711e-01
## ENSG00000000419 DPM1   6.791948e-01 8.109114e-01
## ENSG00000000457 SCYL3  1.822977e-01 3.508604e-01
## ENSG00000000460 FIRRM  4.723901e-03 2.518387e-02
## ENSG00000001036 FUCA2  2.884418e-05 4.256904e-04
## ENSG00000001084 GCLC   2.641654e-08 1.092526e-06
# remove genes without symbols
x1f <- x1[lapply(strsplit(rownames(x1) , " "),length)>1,]

dim(x1)
## [1] 15520     6
dim(x1f)
## [1] 15104     6
# about 1000 genes are lost after removing redundant gene names
bg <- unique(sapply(strsplit(rownames(x1f) , " "),"[[",2))
length(bg)
## [1] 14144
writeLines(bg,"bg.txt")

up <-rownames(subset(x1f, log2FoldChange > 0 & padj < 0.05 ))
up <- unique(sapply(strsplit(up," "),"[[",2))
length(up)
## [1] 1469
writeLines(up,"fg.txt")

dn <-rownames(subset(x1f, log2FoldChange < 0 & padj < 0.05 ))
dn <- unique(sapply(strsplit(dn," "),"[[",2))
length(dn)
## [1] 1801
# kegg gene set
kegg <- gs[[1]]
kegg <- stack(kegg)
colnames(kegg) <- c("gene","term")
kegg <- kegg[,c(2,1)]
kegg$term <- gsub("KEGG_MEDICUS_","",kegg$term)
kegg$term <- gsub("_"," ",kegg$term)

ora_up <- as.data.frame(enricher(gene = up ,
    universe = bg, minGSSize = 5, maxGSSize = 500000, TERM2GENE = kegg,
    pAdjustMethod="fdr",  pvalueCutoff = 1, qvalueCutoff = 1  ))

 gr <- as.numeric(sapply(strsplit(ora_up$GeneRatio,"/"),"[[",1)) /
    as.numeric(sapply(strsplit(ora_up$GeneRatio,"/"),"[[",2))

  br <- as.numeric(sapply(strsplit(ora_up$BgRatio,"/"),"[[",1)) /
    as.numeric(sapply(strsplit(ora_up$BgRatio,"/"),"[[",2))

  ora_up$es <- gr/br

ora_up$geneID = ora_up$ID = ora_up$Description = NULL

ora_up_kegg <- ora_up

head(ora_up,20) %>%
  kbl(row.names = TRUE, caption="Upregulated KEGGs in SRP128998") %>%
  kable_paper("hover", full_width = F)
Upregulated KEGGs in SRP128998
GeneRatio BgRatio pvalue p.adjust qvalue Count es
REFERENCE ORIGIN UNWINDING AND ELONGATION 15/268 30/2120 0.0000006 0.0002128 0.0002027 15 3.955224
REFERENCE MEVALONATE PATHWAY 8/268 11/2120 0.0000069 0.0012031 0.0011460 8 5.753053
REFERENCE CLASSICAL PATHWAY OF COMPLEMENT CASCADE C4 C2 TO C3 CONVERTASE FORMATION 5/268 5/2120 0.0000312 0.0024834 0.0023655 5 7.910448
REFERENCE TRAIP DEPENDENT REPLISOME DISASSEMBLY 9/268 16/2120 0.0000370 0.0024834 0.0023655 9 4.449627
REFERENCE RAD51 DSDNA DESTABILIZATION 7/268 10/2120 0.0000410 0.0024834 0.0023655 7 5.537313
REFERENCE CHOLESTEROL BIOSYNTHESIS 8/268 13/2120 0.0000429 0.0024834 0.0023655 8 4.867968
REFERENCE ORGANIZATION OF THE OUTER KINETOCHORE 9/268 19/2120 0.0002110 0.0093540 0.0089100 9 3.747054
REFERENCE DEPHOSPHORYLATION OF KINETOCHORE 7/268 12/2120 0.0002157 0.0093540 0.0089100 7 4.614428
REFERENCE PRE IC FORMATION 10/268 27/2120 0.0010349 0.0399023 0.0380080 10 2.929795
REFERENCE CENPE INTERACTION WITH NDC80 COMPLEX 6/268 12/2120 0.0018369 0.0637399 0.0607139 6 3.955224
REFERENCE CDC25 CELL CYCLE G2 M 5/268 9/2120 0.0025467 0.0754786 0.0718953 5 4.394693
REFERENCE DNA REPLICATION TERMINATION 8/268 21/2120 0.0027149 0.0754786 0.0718953 8 3.013504
REFERENCE COMMON PATHWAY OF COMPLEMENT CASCADE MAC FORMATION 4/268 6/2120 0.0030452 0.0754786 0.0718953 4 5.273632
REFERENCE LECTIN PATHWAY OF COAGULATION CASCADE FIBRINOGEN TO FIBRIN 4/268 6/2120 0.0030452 0.0754786 0.0718953 4 5.273632
REFERENCE HOMOLOGOUS RECOMBINATION 10/268 32/2120 0.0044335 0.0991532 0.0944460 10 2.472015
REFERENCE RECRUITMENT AND FORMATION OF THE MCC 5/268 10/2120 0.0045719 0.0991532 0.0944460 5 3.955224
REFERENCE LECTIN PATHWAY OF COMPLEMENT CASCADE C4 C2 TO C3 CONVERTASE FORMATION 4/268 7/2120 0.0064025 0.1306854 0.1244811 4 4.520256
REFERENCE HOMOLOGOUS RECOMBINATION IN ICLR 6/268 15/2120 0.0071439 0.1377178 0.1311797 6 3.164179
REFERENCE BREAK INDUCED REPLICATION 4/268 8/2120 0.0115435 0.2002801 0.1907719 4 3.955224
REFERENCE REGULATION OF COMPLEMENT CASCADE MAC INHIBITION 4/268 8/2120 0.0115435 0.2002801 0.1907719 4 3.955224
# GO gene set
go <- gs[[6]]
go <- stack(go)
colnames(go) <- c("gene","term")
go <- go[,c(2,1)]
go$term <- gsub("^GO","",go$term)
go$term <- gsub("_"," ",go$term)

ora_up <- as.data.frame(enricher(gene = up ,
    universe = bg, minGSSize = 5, maxGSSize = 500000, TERM2GENE = go,
    pAdjustMethod="fdr",  pvalueCutoff = 1, qvalueCutoff = 1  ))

 gr <- as.numeric(sapply(strsplit(ora_up$GeneRatio,"/"),"[[",1)) /
    as.numeric(sapply(strsplit(ora_up$GeneRatio,"/"),"[[",2))

  br <- as.numeric(sapply(strsplit(ora_up$BgRatio,"/"),"[[",1)) /
    as.numeric(sapply(strsplit(ora_up$BgRatio,"/"),"[[",2))

  ora_up$es <- gr/br

ora_up$geneID = ora_up$ID = ora_up$Description = NULL

ora_up_go <- ora_up

head(ora_up,20) %>%
  kbl(row.names = TRUE, caption="Upregulated GOs in SRP128998") %>%
  kable_paper("hover", full_width = F)
Upregulated GOs in SRP128998
GeneRatio BgRatio pvalue p.adjust qvalue Count es
BP SMALL MOLECULE METABOLIC PROCESS 232/1315 1465/12365 0.0e+00 0.0000003 0.0000003 232 1.489082
BP ORGANIC ACID METABOLIC PROCESS 138/1315 761/12365 0.0e+00 0.0000003 0.0000003 138 1.705151
BP SMALL MOLECULE BIOSYNTHETIC PROCESS 93/1315 455/12365 0.0e+00 0.0000006 0.0000006 93 1.921940
BP FATTY ACID METABOLIC PROCESS 64/1315 287/12365 0.0e+00 0.0000093 0.0000089 64 2.096846
BP HUMORAL IMMUNE RESPONSE MEDIATED BY CIRCULATING IMMUNOGLOBULIN 16/1315 30/12365 0.0e+00 0.0000116 0.0000111 16 5.014956
BP COMPLEMENT ACTIVATION CLASSICAL PATHWAY 14/1315 24/12365 0.0e+00 0.0000179 0.0000172 14 5.485108
CC BLOOD MICROPARTICLE 29/1315 90/12365 0.0e+00 0.0000230 0.0000221 29 3.029869
BP STEROL BIOSYNTHETIC PROCESS 21/1315 54/12365 1.0e-07 0.0000443 0.0000425 21 3.656738
BP LIPID METABOLIC PROCESS 163/1315 1039/12365 1.0e-07 0.0000798 0.0000765 163 1.475164
BP CELLULAR LIPID METABOLIC PROCESS 126/1315 765/12365 2.0e-07 0.0001667 0.0001598 126 1.548736
BP COMPLEMENT ACTIVATION 15/1315 33/12365 4.0e-07 0.0002210 0.0002118 15 4.274110
BP REGULATION OF MITOTIC NUCLEAR DIVISION 29/1315 101/12365 4.0e-07 0.0002210 0.0002118 29 2.699883
BP MONOCARBOXYLIC ACID METABOLIC PROCESS 84/1315 463/12365 5.0e-07 0.0002843 0.0002725 84 1.705951
BP REGULATION OF NUCLEAR DIVISION 31/1315 117/12365 1.1e-06 0.0005118 0.0004907 31 2.491404
CC COLLAGEN CONTAINING EXTRACELLULAR MATRIX 52/1315 248/12365 1.1e-06 0.0005118 0.0004907 52 1.971605
BP HUMORAL IMMUNE RESPONSE 27/1315 95/12365 1.2e-06 0.0005173 0.0004959 27 2.672443
CC OUTER KINETOCHORE 11/1315 20/12365 1.3e-06 0.0005209 0.0004993 11 5.171673
BP G2 MI TRANSITION OF MEIOTIC CELL CYCLE 6/1315 6/12365 1.4e-06 0.0005224 0.0005008 6 9.403042
CC KNL1 SPC105 COMPLEX 6/1315 6/12365 1.4e-06 0.0005224 0.0005008 6 9.403042
CC EXTERNAL ENCAPSULATING STRUCTURE 60/1315 306/12365 1.8e-06 0.0006187 0.0005931 60 1.843734

Transfer a gene set from GO to KEGG analysis.

pw <- gs[[6]][grep("GOBP_COMPLEMENT_ACTIVATION_CLASSICAL_PATHWAY", names(gs[[6]]))]

pw
## $GOBP_COMPLEMENT_ACTIVATION_CLASSICAL_PATHWAY
##  [1] "C1QA"     "C1QB"     "C1QBP"    "C1QC"     "C1R"      "C1RL"    
##  [7] "C1S"      "C2"       "C3"       "C4A"      "C4B"      "C4BPA"   
## [13] "C4BPB"    "C5"       "C6"       "C7"       "C8A"      "C8B"     
## [19] "C8G"      "C9"       "CD46"     "CD55"     "CFI"      "CLU"     
## [25] "CR1"      "CR1L"     "CR2"      "IGHA1"    "IGHA2"    "IGHE"    
## [31] "IGHG1"    "IGHG2"    "IGHG3"    "IGHG4"    "IGHM"     "MASP2"   
## [37] "MBL2"     "NCR3LG1"  "SERPING1" "SUSD4"    "SVEP1"    "TREM2"
pw <- stack(pw)[,c(2,1)]
colnames(pw) <- c("term","gene")
kegg2 <- rbind(kegg,pw)

ora_up <- as.data.frame(enricher(gene = up ,
    universe = bg, minGSSize = 5, maxGSSize = 500000, TERM2GENE = kegg2,
    pAdjustMethod="fdr",  pvalueCutoff = 1, qvalueCutoff = 1  ))

 gr <- as.numeric(sapply(strsplit(ora_up$GeneRatio,"/"),"[[",1)) /
    as.numeric(sapply(strsplit(ora_up$GeneRatio,"/"),"[[",2))

  br <- as.numeric(sapply(strsplit(ora_up$BgRatio,"/"),"[[",1)) /
    as.numeric(sapply(strsplit(ora_up$BgRatio,"/"),"[[",2))

  ora_up$es <- gr/br

ora_up$geneID = ora_up$ID = ora_up$Description = NULL

ora_up_kegg2 <- ora_up

head(ora_up,20) %>%
  kbl(row.names = TRUE, caption="Upregulated KEGGs in SRP128998 with Complement") %>%
  kable_paper("hover", full_width = F)
Upregulated KEGGs in SRP128998 with Complement
GeneRatio BgRatio pvalue p.adjust qvalue Count es
GOBP_COMPLEMENT_ACTIVATION_CLASSICAL_PATHWAY 14/270 24/2126 0.0000001 0.0000433 0.0000411 14 4.593210
REFERENCE ORIGIN UNWINDING AND ELONGATION 15/270 30/2126 0.0000007 0.0001136 0.0001079 15 3.937037
REFERENCE MEVALONATE PATHWAY 8/270 11/2126 0.0000072 0.0008337 0.0007919 8 5.726599
REFERENCE CLASSICAL PATHWAY OF COMPLEMENT CASCADE C4 C2 TO C3 CONVERTASE FORMATION 5/270 5/2126 0.0000320 0.0022101 0.0020991 5 7.874074
REFERENCE TRAIP DEPENDENT REPLISOME DISASSEMBLY 9/270 16/2126 0.0000385 0.0022101 0.0020991 9 4.429167
REFERENCE RAD51 DSDNA DESTABILIZATION 7/270 10/2126 0.0000423 0.0022101 0.0020991 7 5.511852
REFERENCE CHOLESTEROL BIOSYNTHESIS 8/270 13/2126 0.0000445 0.0022101 0.0020991 8 4.845584
REFERENCE ORGANIZATION OF THE OUTER KINETOCHORE 9/270 19/2126 0.0002188 0.0085920 0.0081606 9 3.729825
REFERENCE DEPHOSPHORYLATION OF KINETOCHORE 7/270 12/2126 0.0002222 0.0085920 0.0081606 7 4.593210
REFERENCE PRE IC FORMATION 10/270 27/2126 0.0010740 0.0373768 0.0355001 10 2.916324
REFERENCE CENPE INTERACTION WITH NDC80 COMPLEX 6/270 12/2126 0.0018828 0.0595650 0.0565742 6 3.937037
REFERENCE CDC25 CELL CYCLE G2 M 5/270 9/2126 0.0026010 0.0719007 0.0682904 5 4.374486
REFERENCE DNA REPLICATION TERMINATION 8/270 21/2126 0.0027977 0.0719007 0.0682904 8 2.999647
REFERENCE COMMON PATHWAY OF COMPLEMENT CASCADE MAC FORMATION 4/270 6/2126 0.0030992 0.0719007 0.0682904 4 5.249383
REFERENCE LECTIN PATHWAY OF COAGULATION CASCADE FIBRINOGEN TO FIBRIN 4/270 6/2126 0.0030992 0.0719007 0.0682904 4 5.249383
REFERENCE HOMOLOGOUS RECOMBINATION 10/270 32/2126 0.0045878 0.0955378 0.0907407 10 2.460648
REFERENCE RECRUITMENT AND FORMATION OF THE MCC 5/270 10/2126 0.0046671 0.0955378 0.0907407 5 3.937037
REFERENCE LECTIN PATHWAY OF COMPLEMENT CASCADE C4 C2 TO C3 CONVERTASE FORMATION 4/270 7/2126 0.0065125 0.1259078 0.1195857 4 4.499471
REFERENCE HOMOLOGOUS RECOMBINATION IN ICLR 6/270 15/2126 0.0073105 0.1338972 0.1271740 6 3.149630
REFERENCE BREAK INDUCED REPLICATION 4/270 8/2126 0.0117359 0.1944810 0.1847158 4 3.937037
message("GO")
## GO
ora_up_go[rownames(ora_up_go)=="BP COMPLEMENT ACTIVATION CLASSICAL PATHWAY",]
##                                            GeneRatio  BgRatio       pvalue
## BP COMPLEMENT ACTIVATION CLASSICAL PATHWAY   14/1315 24/12365 1.553408e-08
##                                                p.adjust       qvalue Count
## BP COMPLEMENT ACTIVATION CLASSICAL PATHWAY 1.794445e-05 1.720195e-05    14
##                                                  es
## BP COMPLEMENT ACTIVATION CLASSICAL PATHWAY 5.485108
message("KEGG + complement")
## KEGG + complement
ora_up_kegg2[rownames(ora_up_kegg2)=="GOBP_COMPLEMENT_ACTIVATION_CLASSICAL_PATHWAY",]
##                                              GeneRatio BgRatio       pvalue
## GOBP_COMPLEMENT_ACTIVATION_CLASSICAL_PATHWAY    14/270 24/2126 1.243003e-07
##                                                  p.adjust       qvalue Count
## GOBP_COMPLEMENT_ACTIVATION_CLASSICAL_PATHWAY 4.325652e-05 4.108453e-05    14
##                                                   es
## GOBP_COMPLEMENT_ACTIVATION_CLASSICAL_PATHWAY 4.59321

Fix KEGG analysis.

bgdf <- data.frame("background",bg)
colnames(bgdf) <- c("term","gene")
keggfix <- rbind(kegg,bgdf)

ora_up <- as.data.frame(enricher(gene = up ,
    universe = bg, minGSSize = 5, maxGSSize = 500000, TERM2GENE = keggfix,
    pAdjustMethod="fdr",  pvalueCutoff = 1, qvalueCutoff = 1  ))

 gr <- as.numeric(sapply(strsplit(ora_up$GeneRatio,"/"),"[[",1)) /
    as.numeric(sapply(strsplit(ora_up$GeneRatio,"/"),"[[",2))

  br <- as.numeric(sapply(strsplit(ora_up$BgRatio,"/"),"[[",1)) /
    as.numeric(sapply(strsplit(ora_up$BgRatio,"/"),"[[",2))

  ora_up$es <- gr/br

ora_up$geneID = ora_up$ID = ora_up$Description = NULL

ora_up_keggfix <- ora_up

head(ora_up,20) %>%
  kbl(row.names = TRUE, caption="Upregulated KEGG fixed in SRP128998 with Complement") %>%
  kable_paper("hover", full_width = F)
Upregulated KEGG fixed in SRP128998 with Complement
GeneRatio BgRatio pvalue p.adjust qvalue Count es
REFERENCE ORIGIN UNWINDING AND ELONGATION 15/1469 30/14144 0.0000001 0.0000196 0.0000179 15 4.814159
REFERENCE MEVALONATE PATHWAY 8/1469 11/14144 0.0000016 0.0002863 0.0002615 8 7.002414
REFERENCE TRAIP DEPENDENT REPLISOME DISASSEMBLY 9/1469 16/14144 0.0000080 0.0006967 0.0006364 9 5.415929
REFERENCE CHOLESTEROL BIOSYNTHESIS 8/1469 13/14144 0.0000106 0.0006967 0.0006364 8 5.925119
REFERENCE RAD51 DSDNA DESTABILIZATION 7/1469 10/14144 0.0000116 0.0006967 0.0006364 7 6.739823
REFERENCE CLASSICAL PATHWAY OF COMPLEMENT CASCADE C4 C2 TO C3 CONVERTASE FORMATION 5/1469 5/14144 0.0000120 0.0006967 0.0006364 5 9.628319
REFERENCE ORGANIZATION OF THE OUTER KINETOCHORE 9/1469 19/14144 0.0000481 0.0023924 0.0021854 9 4.560783
REFERENCE DEPHOSPHORYLATION OF KINETOCHORE 7/1469 12/14144 0.0000635 0.0027637 0.0025246 7 5.616519
REFERENCE PRE IC FORMATION 10/1469 27/14144 0.0002272 0.0087860 0.0080259 10 3.566044
REFERENCE CENPE INTERACTION WITH NDC80 COMPLEX 6/1469 12/14144 0.0006602 0.0229733 0.0209859 6 4.814159
REFERENCE DNA REPLICATION TERMINATION 8/1469 21/14144 0.0007817 0.0247290 0.0225897 8 3.667931
REFERENCE CDC25 CELL CYCLE G2 M 5/1469 9/14144 0.0010565 0.0286777 0.0261968 5 5.349066
REFERENCE HOMOLOGOUS RECOMBINATION 10/1469 32/14144 0.0010713 0.0286777 0.0261968 10 3.008850
REFERENCE COMMON PATHWAY OF COMPLEMENT CASCADE MAC FORMATION 4/1469 6/14144 0.0014632 0.0339453 0.0310087 4 6.418879
REFERENCE LECTIN PATHWAY OF COAGULATION CASCADE FIBRINOGEN TO FIBRIN 4/1469 6/14144 0.0014632 0.0339453 0.0310087 4 6.418879
REFERENCE RECRUITMENT AND FORMATION OF THE MCC 5/1469 10/14144 0.0019326 0.0420348 0.0383984 5 4.814159
REFERENCE HOMOLOGOUS RECOMBINATION IN ICLR 6/1469 15/14144 0.0027178 0.0556340 0.0508212 6 3.851327
REFERENCE LECTIN PATHWAY OF COMPLEMENT CASCADE C4 C2 TO C3 CONVERTASE FORMATION 4/1469 7/14144 0.0031332 0.0605748 0.0553345 4 5.501896
REFERENCE BREAK INDUCED REPLICATION 4/1469 8/14144 0.0057529 0.1000998 0.0914402 4 4.814159
REFERENCE REGULATION OF COMPLEMENT CASCADE MAC INHIBITION 4/1469 8/14144 0.0057529 0.1000998 0.0914402 4 4.814159
message("Original")
## Original
ora_up_kegg[which(rownames(ora_up_kegg)=="REFERENCE CLASSICAL PATHWAY OF COMPLEMENT CASCADE C4 C2 TO C3 CONVERTASE FORMATION"),]
##                                                                                    GeneRatio
## REFERENCE CLASSICAL PATHWAY OF COMPLEMENT CASCADE C4 C2 TO C3 CONVERTASE FORMATION     5/268
##                                                                                    BgRatio
## REFERENCE CLASSICAL PATHWAY OF COMPLEMENT CASCADE C4 C2 TO C3 CONVERTASE FORMATION  5/2120
##                                                                                          pvalue
## REFERENCE CLASSICAL PATHWAY OF COMPLEMENT CASCADE C4 C2 TO C3 CONVERTASE FORMATION 3.124267e-05
##                                                                                       p.adjust
## REFERENCE CLASSICAL PATHWAY OF COMPLEMENT CASCADE C4 C2 TO C3 CONVERTASE FORMATION 0.002483402
##                                                                                         qvalue
## REFERENCE CLASSICAL PATHWAY OF COMPLEMENT CASCADE C4 C2 TO C3 CONVERTASE FORMATION 0.002365504
##                                                                                    Count
## REFERENCE CLASSICAL PATHWAY OF COMPLEMENT CASCADE C4 C2 TO C3 CONVERTASE FORMATION     5
##                                                                                          es
## REFERENCE CLASSICAL PATHWAY OF COMPLEMENT CASCADE C4 C2 TO C3 CONVERTASE FORMATION 7.910448
message("Fixed")
## Fixed
ora_up_keggfix[which(rownames(ora_up_keggfix)=="REFERENCE CLASSICAL PATHWAY OF COMPLEMENT CASCADE C4 C2 TO C3 CONVERTASE FORMATION"),]
##                                                                                    GeneRatio
## REFERENCE CLASSICAL PATHWAY OF COMPLEMENT CASCADE C4 C2 TO C3 CONVERTASE FORMATION    5/1469
##                                                                                    BgRatio
## REFERENCE CLASSICAL PATHWAY OF COMPLEMENT CASCADE C4 C2 TO C3 CONVERTASE FORMATION 5/14144
##                                                                                          pvalue
## REFERENCE CLASSICAL PATHWAY OF COMPLEMENT CASCADE C4 C2 TO C3 CONVERTASE FORMATION 1.201145e-05
##                                                                                       p.adjust
## REFERENCE CLASSICAL PATHWAY OF COMPLEMENT CASCADE C4 C2 TO C3 CONVERTASE FORMATION 0.000696664
##                                                                                          qvalue
## REFERENCE CLASSICAL PATHWAY OF COMPLEMENT CASCADE C4 C2 TO C3 CONVERTASE FORMATION 0.0006363961
##                                                                                    Count
## REFERENCE CLASSICAL PATHWAY OF COMPLEMENT CASCADE C4 C2 TO C3 CONVERTASE FORMATION     5
##                                                                                          es
## REFERENCE CLASSICAL PATHWAY OF COMPLEMENT CASCADE C4 C2 TO C3 CONVERTASE FORMATION 9.628319

Enrichment with Clusterprofiler

Here is a function that conducts over-representation analysis with ClusterProfiler. Genes that are not annotated as having any gene set category are discarded.

# df = deseq output
# df must have "ensID geneSymbol" as rownames

# gs = gene sets
# gs format is list of symbols
orabad <- function( df, gs ) {

  def <- df

  gs1 <- stack(gs)
  colnames(gs1) <- c("gene","term")
  gs1 <- gs1[,c(2,1)]

  defup <- rownames(subset(def,padj<=0.05 & log2FoldChange>0))
  defup <- defup[lapply(strsplit(defup," "),length)==2]
  defup <- unique(sapply(strsplit(defup," "),"[[",2))

  if ( length(defup)<200 ) {
    defup <- head(rownames(subset(def,log2FoldChange>0)),200)
    defup <- defup[lapply(strsplit(defup," "),length)==2]
    defup <- unique(sapply(strsplit(defup," "),"[[",2))
  }

  defdn <- rownames(subset(def,padj<=0.05 & log2FoldChange<0))
  defdn <- defdn[lapply(strsplit(defdn," "),length)==2]
  defdn <- unique(sapply(strsplit(defdn," "),"[[",2))

  if ( length(defdn)<200 ) {
    defdn <- head(rownames(subset(def,log2FoldChange<0)),200)
    defdn <- defdn[lapply(strsplit(defdn," "),length)==2]
    defdn <- unique(sapply(strsplit(defdn," "),"[[",2))
  }

  bg <- rownames(def)
  bg <- bg[lapply(strsplit(bg," "),length)==2]
  bg <- unique(sapply(strsplit(bg," "),"[[",2))

  ora_up <- as.data.frame(enricher(gene = defup ,
    universe = bg, minGSSize = 5, maxGSSize = 500000, TERM2GENE = gs1,
    pAdjustMethod="fdr",  pvalueCutoff = 1, qvalueCutoff = 1  ))

  ora_up$geneID <- NULL
  ora_ups <- rownames(ora_up)

  gr <- as.numeric(sapply(strsplit(ora_up$GeneRatio,"/"),"[[",1)) /
    as.numeric(sapply(strsplit(ora_up$GeneRatio,"/"),"[[",2))

  br <- as.numeric(sapply(strsplit(ora_up$BgRatio,"/"),"[[",1)) /
    as.numeric(sapply(strsplit(ora_up$BgRatio,"/"),"[[",2))

  ora_up$es <- gr/br
  ora_up <- ora_up[order(-ora_up$es),]
  ora_up$Description=NULL

  ora_dn <- as.data.frame(enricher(gene = defdn ,
    universe = bg, minGSSize = 5, maxGSSize = 500000, TERM2GENE = gs1,
    pAdjustMethod="fdr",  pvalueCutoff = 1, qvalueCutoff = 1  ))

  ora_dn$geneID <- NULL
  ora_dns <- rownames(ora_dn)

  gr <- as.numeric(sapply(strsplit(ora_dn$GeneRatio,"/"),"[[",1)) /
    as.numeric(sapply(strsplit(ora_dn$GeneRatio,"/"),"[[",2))

  br <- as.numeric(sapply(strsplit(ora_dn$BgRatio,"/"),"[[",1)) /
    as.numeric(sapply(strsplit(ora_dn$BgRatio,"/"),"[[",2))

  ora_dn$es <- gr/br
  ora_dn <- ora_dn[order(-ora_dn$es),]
  ora_dn$Description=NULL

  result <- list("orabad_up"=ora_up,"orabad_dn"=ora_dn)
  return(result)
}

This function fixes the problem with excluding un-annotated genes.

oragood <- function( df, gs ) {

  def <- df

  gs1 <- stack(gs)
  colnames(gs1) <- c("gene","term")
  gs1 <- gs1[,c(2,1)]

  defup <- rownames(subset(def,padj<=0.05 & log2FoldChange>0))
  defup <- defup[lapply(strsplit(defup," "),length)==2]
  defup <- unique(sapply(strsplit(defup," "),"[[",2))

  if ( length(defup)<200 ) {
    defup <- head(rownames(subset(def,log2FoldChange>0)),200)
    defup <- defup[lapply(strsplit(defup," "),length)==2]
    defup <- unique(sapply(strsplit(defup," "),"[[",2))
  }

  defdn <- rownames(subset(def,padj<=0.05 & log2FoldChange<0))
  defdn <- defdn[lapply(strsplit(defdn," "),length)==2]
  defdn <- unique(sapply(strsplit(defdn," "),"[[",2))

  if ( length(defdn)<200 ) {
    defdn <- head(rownames(subset(def,log2FoldChange<0)),200)
    defdn <- defdn[lapply(strsplit(defdn," "),length)==2]
    defdn <- unique(sapply(strsplit(defdn," "),"[[",2))
  }

  bg <- rownames(def)
  bg <- bg[lapply(strsplit(bg," "),length)==2]
  bg <- unique(sapply(strsplit(bg," "),"[[",2))

  bgdf <- data.frame("background",bg)
  names(bgdf) <- c("term","gene")
  gs1 <- rbind(gs1,bgdf)

  ora_up <- as.data.frame(enricher(gene = defup ,
    universe = bg, minGSSize = 5, maxGSSize = 500000, TERM2GENE = gs1,
    pAdjustMethod="fdr",  pvalueCutoff = 1, qvalueCutoff = 1  ))

  ora_up$geneID <- NULL
  ora_ups <- rownames(ora_up)

  gr <- as.numeric(sapply(strsplit(ora_up$GeneRatio,"/"),"[[",1)) /
    as.numeric(sapply(strsplit(ora_up$GeneRatio,"/"),"[[",2))

  br <- as.numeric(sapply(strsplit(ora_up$BgRatio,"/"),"[[",1)) /
    as.numeric(sapply(strsplit(ora_up$BgRatio,"/"),"[[",2))

  ora_up$es <- gr/br
  ora_up <- ora_up[order(-ora_up$es),]
  ora_up$Description=NULL

  ora_dn <- as.data.frame(enricher(gene = defdn ,
    universe = bg, minGSSize = 5, maxGSSize = 500000, TERM2GENE = gs1,
    pAdjustMethod="fdr",  pvalueCutoff = 1, qvalueCutoff = 1  ))

  ora_dn$geneID <- NULL
  ora_dns <- rownames(ora_dn)

  gr <- as.numeric(sapply(strsplit(ora_dn$GeneRatio,"/"),"[[",1)) /
    as.numeric(sapply(strsplit(ora_dn$GeneRatio,"/"),"[[",2))

  br <- as.numeric(sapply(strsplit(ora_dn$BgRatio,"/"),"[[",1)) /
    as.numeric(sapply(strsplit(ora_dn$BgRatio,"/"),"[[",2))

  ora_dn$es <- gr/br
  ora_dn <- ora_dn[order(-ora_dn$es),]
  ora_dn$Description=NULL

  result <- list("oragood_up"=ora_up,"oragood_dn"=ora_dn)
  return(result)
}

Compare the results of bad and good analysis

# bad=ORA analysis, a list with results of up and down regulated pathways
# good=same as bad, except fixed the background problem
compare_ora_plots <- function(bad,good,libname=NULL){
  badup <- bad$orabad_up
  baddn <- bad$orabad_dn
  goodup <- good$oragood_up
  gooddn <- good$oragood_dn
  mup <- merge(badup,goodup,by=0)
  MAX=max(c(mup$es.x,mup$es.y))
  UPHEADER=paste(libname,"up")
  DNHEADER=paste(libname,"dn")

  par(mfrow=c(2,2))

  plot(mup$es.x, mup$es.y,
    xlim=c(0,MAX), ylim=c(0,MAX),
    xlab="original ES", ylab="corrected ES", main=UPHEADER)
  grid()
  abline(a = 0, b = 1,lwd=2,lty=2)

  MAX=max(-log10(c(mup$pvalue.x,mup$pvalue.y)))

  plot(-log10(mup$pvalue.x), -log10(mup$pvalue.y),
    xlim=c(0,MAX), ylim=c(0,MAX),
    xlab="original -log10 p-value", ylab="corrected -log10 p-value", main=UPHEADER)
  grid()
  abline(a = 0, b = 1,lwd=2,lty=2)

  mdn <- merge(baddn,gooddn,by=0)
  MAX=max(c(mdn$es.x,mdn$es.y))

  plot(mdn$es.x, mdn$es.y,
    xlim=c(0,MAX), ylim=c(0,MAX),
    xlab="original ES", ylab="corrected ES", main=DNHEADER)
  grid()
  abline(a = 0, b = 1,lwd=2,lty=2)

  MAX=max(-log10(c(mdn$pvalue.x,mdn$pvalue.y)))

  plot(-log10(mdn$pvalue.x), -log10(mdn$pvalue.y),
    xlim=c(0,MAX), ylim=c(0,MAX),
    xlab="original -log10 p-value", ylab="corrected -log10 p-value", main=DNHEADER)
  grid()
  abline(a = 0, b = 1,lwd=2,lty=2)
}

compare_ora <- function(bad,good){
  badup <- bad$orabad_up
  baddn <- bad$orabad_dn
  goodup <- good$oragood_up
  gooddn <- good$oragood_dn
  mup <- merge(badup,goodup,by=0)
  mdn <- merge(baddn,gooddn,by=0)
  bup <- subset(mup,p.adjust.x<0.05)$Row.names
  gup <- subset(mup,p.adjust.y<0.05)$Row.names
  bdn <- subset(mdn,p.adjust.x<0.05)$Row.names
  gdn <- subset(mdn,p.adjust.y<0.05)$Row.names
  sig_bad <- unique(c(bup,bdn))
  sig_good <- unique(c(gup,gdn))
  nsig_bad <- length(sig_bad)
  nsig_good <- length(sig_good)
  jac <- length(intersect(sig_bad,sig_good)) / length(union(sig_bad,sig_good))
  result=c("nsig_bad"=nsig_bad,"nsig_good"=nsig_good,"Jaccard"=jac)
  return(result)
}

Run some analyses

params <- expand.grid(1:length(d),1:length(gs))
params2 <- expand.grid(names(d),names(gs))

colnames(params) <- c("d","gs")

ora_dat <- mclapply(1:nrow(params), function(i) {
  j=params[i,1]
  k=params[i,2]
  bad <- orabad(d[[j]],gs[[k]])
  good <- oragood(d[[j]],gs[[k]])
  res <- compare_ora(bad,good)
  return(res)
} , mc.cores=8)

ora_dat <- do.call(rbind,ora_dat)

res1 <- cbind(params2,ora_dat)

res1 %>%
  kbl(row.names = TRUE, caption="all results") %>%
  kable_paper("hover", full_width = F)
all results
Var1 Var2 nsig_bad nsig_good Jaccard
1 bulkrna1.Rds KEGG 9 19 0.4736842
2 bulkrna2.Rds KEGG 25 59 0.4237288
3 bulkrna3.Rds KEGG 2 6 0.3333333
4 bulkrna4.Rds KEGG 14 24 0.5833333
5 bulkrna5.Rds KEGG 0 0 NaN
6 bulkrna6.Rds KEGG 47 56 0.8392857
7 bulkrna7.Rds KEGG 9 38 0.2368421
8 bulkrna1.Rds Reactome 64 102 0.6274510
9 bulkrna2.Rds Reactome 239 351 0.6809117
10 bulkrna3.Rds Reactome 94 159 0.5617284
11 bulkrna4.Rds Reactome 207 307 0.6742671
12 bulkrna5.Rds Reactome 0 0 NaN
13 bulkrna6.Rds Reactome 254 334 0.7604790
14 bulkrna7.Rds Reactome 185 240 0.7708333
15 bulkrna1.Rds Wikipathways 33 46 0.7173913
16 bulkrna2.Rds Wikipathways 108 196 0.5510204
17 bulkrna3.Rds Wikipathways 8 12 0.6666667
18 bulkrna4.Rds Wikipathways 13 50 0.2600000
19 bulkrna5.Rds Wikipathways 1 1 1.0000000
20 bulkrna6.Rds Wikipathways 82 97 0.8453608
21 bulkrna7.Rds Wikipathways 35 117 0.2991453
22 bulkrna1.Rds miR targets 4 81 0.0493827
23 bulkrna2.Rds miR targets 40 111 0.3603604
24 bulkrna3.Rds miR targets 21 38 0.5526316
25 bulkrna4.Rds miR targets 272 634 0.4290221
26 bulkrna5.Rds miR targets 0 0 NaN
27 bulkrna6.Rds miR targets 1166 1365 0.8393895
28 bulkrna7.Rds miR targets 360 544 0.6617647
29 bulkrna1.Rds TFT GTRD 52 85 0.6117647
30 bulkrna2.Rds TFT GTRD 98 140 0.7000000
31 bulkrna3.Rds TFT GTRD 108 135 0.8000000
32 bulkrna4.Rds TFT GTRD 72 115 0.6260870
33 bulkrna5.Rds TFT GTRD 1 1 1.0000000
34 bulkrna6.Rds TFT GTRD 147 173 0.8497110
35 bulkrna7.Rds TFT GTRD 129 153 0.8431373
36 bulkrna1.Rds GO 485 707 0.6859972
37 bulkrna2.Rds GO 1843 2168 0.8500923
38 bulkrna3.Rds GO 686 915 0.7497268
39 bulkrna4.Rds GO 390 707 0.5516266
40 bulkrna5.Rds GO 6 48 0.1250000
41 bulkrna6.Rds GO 588 789 0.7452471
42 bulkrna7.Rds GO 1263 1544 0.8180052
43 bulkrna1.Rds HPO 2 42 0.0476190
44 bulkrna2.Rds HPO 380 988 0.3846154
45 bulkrna3.Rds HPO 92 241 0.3817427
46 bulkrna4.Rds HPO 24 296 0.0810811
47 bulkrna5.Rds HPO 0 1 0.0000000
48 bulkrna6.Rds HPO 17 265 0.0641509
49 bulkrna7.Rds HPO 186 449 0.4142539
50 bulkrna1.Rds Cellmarkers 139 170 0.8176471
51 bulkrna2.Rds Cellmarkers 439 477 0.9203354
52 bulkrna3.Rds Cellmarkers 345 382 0.9031414
53 bulkrna4.Rds Cellmarkers 245 286 0.8566434
54 bulkrna5.Rds Cellmarkers 0 1 0.0000000
55 bulkrna6.Rds Cellmarkers 267 297 0.8989899
56 bulkrna7.Rds Cellmarkers 448 491 0.9085366
57 bulkrna1.Rds Hallmark 7 20 0.3500000
58 bulkrna2.Rds Hallmark 25 38 0.6578947
59 bulkrna3.Rds Hallmark 18 21 0.7727273
60 bulkrna4.Rds Hallmark 13 27 0.4814815
61 bulkrna5.Rds Hallmark 0 2 0.0000000
62 bulkrna6.Rds Hallmark 28 37 0.7567568
63 bulkrna7.Rds Hallmark 18 34 0.4444444
summary(res1$nsig_bad)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0    11.0    52.0   188.1   242.0  1843.0
summary(res1$nsig_good)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0    35.5   115.0   273.5   320.5  2168.0
summary(res1$Jaccard)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  0.0000  0.3839  0.6427  0.5716  0.8044  1.0000       3
# NUMBER OF SIGNIFICANT SETS
ns1 <- matrix( res1$nsig_bad , ncol=9 )
rownames(ns1) <- names(d)
colnames(ns1) <- names(gs)

ns1 %>%
  kbl(row.names = TRUE, caption="significant genesets in original analysis") %>%
  kable_paper("hover", full_width = F)
significant genesets in original analysis
KEGG Reactome Wikipathways miR targets TFT GTRD GO HPO Cellmarkers Hallmark
bulkrna1.Rds 9 64 33 4 52 485 2 139 7
bulkrna2.Rds 25 239 108 40 98 1843 380 439 25
bulkrna3.Rds 2 94 8 21 108 686 92 345 18
bulkrna4.Rds 14 207 13 272 72 390 24 245 13
bulkrna5.Rds 0 0 1 0 1 6 0 0 0
bulkrna6.Rds 47 254 82 1166 147 588 17 267 28
bulkrna7.Rds 9 185 35 360 129 1263 186 448 18
ns1 <- apply(ns1,2,function(x) { x <- x[!is.na(x)] ; x <- x[is.finite(x)] ;  mean(x) } )

ns2 <- matrix( res1$nsig_good , ncol=9 )
rownames(ns2) <- names(d)
colnames(ns2) <- names(gs)

ns2 %>%
  kbl(row.names = TRUE, caption="significant genesets in corrected analysis") %>%
  kable_paper("hover", full_width = F)
significant genesets in corrected analysis
KEGG Reactome Wikipathways miR targets TFT GTRD GO HPO Cellmarkers Hallmark
bulkrna1.Rds 19 102 46 81 85 707 42 170 20
bulkrna2.Rds 59 351 196 111 140 2168 988 477 38
bulkrna3.Rds 6 159 12 38 135 915 241 382 21
bulkrna4.Rds 24 307 50 634 115 707 296 286 27
bulkrna5.Rds 0 0 1 0 1 48 1 1 2
bulkrna6.Rds 56 334 97 1365 173 789 265 297 37
bulkrna7.Rds 38 240 117 544 153 1544 449 491 34
ns2 <- apply(ns2,2,function(x) { x <- x[!is.na(x)] ; x <- x[is.finite(x)] ;  mean(x) } )
ns2
##         KEGG     Reactome Wikipathways  miR targets     TFT GTRD           GO 
##     28.85714    213.28571     74.14286    396.14286    114.57143    982.57143 
##          HPO  Cellmarkers     Hallmark 
##    326.00000    300.57143     25.57143
df <- data.frame(ns1,ns2)
df <- df[order(df$ns1),]


par(mar=c(c(5.1, 7.1, 4.1, 2.1) ))
barplot(t(df),beside=TRUE,horiz=TRUE,las=1,xlim=c(0,1250),
  xlab="no. significant sets",
  col=c("gray30","gray80"))
text(df$ns1+50,((1:nrow(df))*3)-1.5,labels=signif(df[,1],3))
text(df$ns2+50,((1:nrow(df))*3)-0.5,labels=signif(df[,2],3))
#abline(v=seq(100,1000,100),lty=2,lwd=0.5,col="gray")
legend("bottomright", inset=.02, legend=c("corrected","original"),
  fill=c("gray80","gray30"), horiz=FALSE, cex=1.1)

mylabels <- gsub("$","%",gsub("^","+",signif(((df$ns2/df$ns1)-1)*100,3)))
text(df$ns2+160,((1:nrow(df))*3)-1,labels=mylabels,cex=1.1)

png("fig1_ORAsig.png")
par(mar=c(c(5.1, 7.1, 2.1, 2.1) ))
barplot(t(df),beside=TRUE,horiz=TRUE,las=1,xlim=c(0,1250),
  xlab="no. significant sets",
  col=c("gray30","gray80"))
text(df$ns1+50,((1:nrow(df))*3)-1.5,labels=signif(df[,1],3))
text(df$ns2+50,((1:nrow(df))*3)-0.5,labels=signif(df[,2],3))
#abline(v=seq(100,1000,100),lty=2,lwd=0.5,col="gray")
legend("bottomright", inset=.02, legend=c("corrected","original"),
  fill=c("gray80","gray30"), horiz=FALSE, cex=1.1)

mylabels <- gsub("$","%",gsub("^","+",signif(((df$ns2/df$ns1)-1)*100,3)))
text(df$ns2+160,((1:nrow(df))*3)-1,labels=mylabels,cex=1.1)
dev.off()
## png 
##   2
# JACCARD
jac <- matrix( res1$Jaccard , ncol=9 )
rownames(jac) <- names(d)
colnames(jac) <- names(gs)

signif(jac,3) %>%
  kbl(row.names = TRUE, caption="Jaccard similarity Corrected:Original") %>%
  kable_paper("hover", full_width = F)
Jaccard similarity Corrected:Original
KEGG Reactome Wikipathways miR targets TFT GTRD GO HPO Cellmarkers Hallmark
bulkrna1.Rds 0.474 0.627 0.717 0.0494 0.612 0.686 0.0476 0.818 0.350
bulkrna2.Rds 0.424 0.681 0.551 0.3600 0.700 0.850 0.3850 0.920 0.658
bulkrna3.Rds 0.333 0.562 0.667 0.5530 0.800 0.750 0.3820 0.903 0.773
bulkrna4.Rds 0.583 0.674 0.260 0.4290 0.626 0.552 0.0811 0.857 0.481
bulkrna5.Rds NaN NaN 1.000 NaN 1.000 0.125 0.0000 0.000 0.000
bulkrna6.Rds 0.839 0.760 0.845 0.8390 0.850 0.745 0.0642 0.899 0.757
bulkrna7.Rds 0.237 0.771 0.299 0.6620 0.843 0.818 0.4140 0.909 0.444
jac2 <- apply(jac,2,function(x) { x <- x[!is.na(x)] ; x <- x[is.finite(x)] ;  mean(x) } )
jac2
##         KEGG     Reactome Wikipathways  miR targets     TFT GTRD           GO 
##    0.4817013    0.6792784    0.6199406    0.4820918    0.7758143    0.6465279 
##          HPO  Cellmarkers     Hallmark 
##    0.1962090    0.7578991    0.4947578
par(mar=c(c(5.1, 7.1, 2.1, 2.1) ))
barplot(sort(jac2),horiz=TRUE,las=1,xlim=c(0,1), xlab="Jaccard index")

par(mar=c(c(5.1, 5.1, 2.1, 2.1) ))
plot(cov,jac2,xlim=c(0,30000),ylim=c(0,1),pch=19,col="lightgray",cex=3,
  xlab="gene coverage",ylab="Jaccard index")
text(cov,jac2,labels=names(cov))
mylm <- lm(jac2~cov)
abline(mylm,lty=2,lwd=2)
SLOPE=unname(signif(mylm$coefficients[2],3))
INT=unname(signif(mylm$coefficients[1],3))
PVAL=signif(summary(mylm)$coefficients[4],3)
RSQ=signif(summary(mylm)$r.squared,3)
HEADER=paste("Rsq =",RSQ,"; p-val =",PVAL)
mtext(HEADER)

png("fig1_ORAjac.png")
par(mar=c(c(5.1, 5.1, 2.1, 2.1) ))
plot(cov,jac2,xlim=c(0,30000),ylim=c(0,1),pch=19,col="lightgray",cex=3,
  xlab="gene coverage",ylab="Jaccard index")
text(cov,jac2,labels=names(cov))
mylm <- lm(jac2~cov)
abline(mylm,lty=2,lwd=2)
SLOPE=unname(signif(mylm$coefficients[2],3))
INT=unname(signif(mylm$coefficients[1],3))
PVAL=signif(summary(mylm)$coefficients[4],3)
RSQ=signif(summary(mylm)$r.squared,3)
HEADER=paste("Rsq =",RSQ,"; p-val =",PVAL)
mtext(HEADER)
dev.off()
## png 
##   2

Make charts showing changes in pvalues

For bulk RNA-seq data set #1, run enrichment with all the gene set libraries.

names(gs)
## [1] "KEGG"         "Reactome"     "Wikipathways" "miR targets"  "TFT GTRD"    
## [6] "GO"           "HPO"          "Cellmarkers"  "Hallmark"
lapply(1:length(gs), function(i) {
  bad <- orabad(d[[1]],gs[[i]])
  good <- oragood(d[[1]],gs[[i]])
  compare_ora_plots(bad,good,libname=names(gs)[i])
} )

## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL
## 
## [[5]]
## NULL
## 
## [[6]]
## NULL
## 
## [[7]]
## NULL
## 
## [[8]]
## NULL
## 
## [[9]]
## NULL

Session information

For reproducibility

sessionInfo()
## R version 4.4.0 (2024-04-24)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 22.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so;  LAPACK version 3.10.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Etc/UTC
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] gplots_3.1.3.1              eulerr_7.0.2               
##  [3] fgsea_1.30.0                clusterProfiler_4.12.0     
##  [5] kableExtra_1.4.0            DESeq2_1.44.0              
##  [7] SummarizedExperiment_1.34.0 Biobase_2.64.0             
##  [9] MatrixGenerics_1.16.0       matrixStats_1.3.0          
## [11] GenomicRanges_1.56.0        GenomeInfoDb_1.40.1        
## [13] IRanges_2.38.0              S4Vectors_0.42.0           
## [15] BiocGenerics_0.50.0         getDEE2_1.14.0             
## 
## loaded via a namespace (and not attached):
##   [1] RColorBrewer_1.1-3      rstudioapi_0.16.0       jsonlite_1.8.8         
##   [4] magrittr_2.0.3          farver_2.1.2            rmarkdown_2.26         
##   [7] fs_1.6.4                zlibbioc_1.50.0         vctrs_0.6.5            
##  [10] memoise_2.0.1           ggtree_3.12.0           htmltools_0.5.8.1      
##  [13] S4Arrays_1.4.1          SparseArray_1.4.8       gridGraphics_0.5-1     
##  [16] sass_0.4.9              KernSmooth_2.23-22      bslib_0.7.0            
##  [19] plyr_1.8.9              cachem_1.0.8            igraph_2.0.3           
##  [22] lifecycle_1.0.4         pkgconfig_2.0.3         gson_0.1.0             
##  [25] Matrix_1.7-0            R6_2.5.1                fastmap_1.1.1          
##  [28] GenomeInfoDbData_1.2.12 digest_0.6.35           aplot_0.2.2            
##  [31] enrichplot_1.24.0       colorspace_2.1-0        patchwork_1.2.0        
##  [34] AnnotationDbi_1.66.0    RSQLite_2.3.7           fansi_1.0.6            
##  [37] httr_1.4.7              polyclip_1.10-6         abind_1.4-5            
##  [40] compiler_4.4.0          bit64_4.0.5             withr_3.0.0            
##  [43] BiocParallel_1.38.0     viridis_0.6.5           DBI_1.2.3              
##  [46] highr_0.10              ggforce_0.4.2           MASS_7.3-60.2          
##  [49] DelayedArray_0.30.1     HDO.db_0.99.1           caTools_1.18.2         
##  [52] gtools_3.9.5            tools_4.4.0             scatterpie_0.2.3       
##  [55] ape_5.8                 glue_1.7.0              nlme_3.1-164           
##  [58] GOSemSim_2.30.0         shadowtext_0.1.3        grid_4.4.0             
##  [61] reshape2_1.4.4          generics_0.1.3          gtable_0.3.5           
##  [64] tidyr_1.3.1             data.table_1.15.4       tidygraph_1.3.1        
##  [67] xml2_1.3.6              utf8_1.2.4              XVector_0.44.0         
##  [70] ggrepel_0.9.5           pillar_1.9.0            stringr_1.5.1          
##  [73] yulab.utils_0.1.4       splines_4.4.0           dplyr_1.1.4            
##  [76] tweenr_2.0.3            treeio_1.28.0           lattice_0.22-6         
##  [79] bit_4.0.5               tidyselect_1.2.1        GO.db_3.19.1           
##  [82] locfit_1.5-9.9          Biostrings_2.72.1       knitr_1.46             
##  [85] gridExtra_2.3           svglite_2.1.3           xfun_0.43              
##  [88] graphlayouts_1.1.1      stringi_1.8.4           UCSC.utils_1.0.0       
##  [91] lazyeval_0.2.2          ggfun_0.1.5             yaml_2.3.8             
##  [94] evaluate_0.23           codetools_0.2-20        ggraph_2.2.1           
##  [97] tibble_3.2.1            qvalue_2.36.0           ggplotify_0.1.2        
## [100] cli_3.6.2               systemfonts_1.0.6       munsell_0.5.1          
## [103] jquerylib_0.1.4         Rcpp_1.0.12             png_0.1-8              
## [106] ggplot2_3.5.1           blob_1.2.4              DOSE_3.30.1            
## [109] htm2txt_2.2.2           bitops_1.0-7            viridisLite_0.4.2      
## [112] tidytree_0.4.6          scales_1.3.0            purrr_1.0.2            
## [115] crayon_1.5.2            rlang_1.1.3             cowplot_1.1.3          
## [118] fastmatch_1.1-4         KEGGREST_1.44.0