Source: https://github.com/markziemann/background
This guide is a Rmarkdown script that conducts differential expression and enrichment analysis, which are very popular workflows for transcriptome data.
The goal of this work is to understand how ClusterProfiler manages the background list, as we have observed some weird behaviour. In particular we see that when we provide a custom background, those genes do not appear in the final analysis unless they are also included in the gene list.
In the code chunk below called libs, you can add and
remove required R library dependancies. Check that the libraries listed
here match the Dockerfile, otherwise you might get errors.
suppressPackageStartupMessages({
  library("getDEE2")
  library("DESeq2")
  library("kableExtra")
  library("clusterProfiler")
  library("fgsea")
  library("eulerr")
  library("gplots")
  library("parallel")
})
For this guide I will be using bulk RNA-seq data from several previous studies.
Gene sets were obtained from MSigDB.
myfiles <- list.files("../dataprep",pattern="*Rds",full.names=TRUE)
d <- lapply(myfiles,readRDS)
names(d) <- basename(myfiles)
mygmts <- list.files("../gmt",pattern="*gmt",full.names=TRUE)
gs <- lapply(mygmts,gmtPathways)
names(gs) <- basename(mygmts)
names(gs)
## [1] "c2.cp.kegg_medicus.v2023.2.Hs.symbols.gmt"
## [2] "c2.cp.reactome.v2023.2.Hs.symbols.gmt"    
## [3] "c2.cp.wikipathways.v2023.2.Hs.symbols.gmt"
## [4] "c3.mir.mirdb.v2023.2.Hs.symbols.gmt"      
## [5] "c3.tft.gtrd.v2023.2.Hs.symbols.gmt"       
## [6] "c5.go.v2023.2.Hs.symbols.gmt"             
## [7] "c5.hpo.v2023.2.Hs.symbols.gmt"            
## [8] "c8.all.v2023.2.Hs.symbols.gmt"            
## [9] "h.all.v2023.2.Hs.symbols.gmt"
names(gs) <- c("KEGG", "Reactome", "Wikipathways", "miR targets", "TFT GTRD",
  "GO", "HPO", "Cellmarkers", "Hallmark" )
# number of sets
num <- unlist(lapply(gs,length))
num
##         KEGG     Reactome Wikipathways  miR targets     TFT GTRD           GO 
##          619         1692          791         2377          505        10461 
##          HPO  Cellmarkers     Hallmark 
##         5547          830           50
# set size summary
sz <- unlist(lapply(gs,function(x) {
  y <- unlist(lapply(x, length ))
  summary(y)
} ))
sz <- t(matrix(sz,ncol=9))
rownames(sz) <- names(gs)
colnames(sz) <- names(summary(c(1,2,3,4)))
sz
##              Min. 1st Qu. Median      Mean 3rd Qu. Max.
## KEGG            5       8     13  15.59128   19.00   92
## Reactome        5      11     23  56.28014   57.00 1463
## Wikipathways    5      14     28  42.91656   53.00  438
## miR targets     5      59    103 156.52545  186.00 1071
## TFT GTRD        5      55    287 509.26931  880.00 1999
## GO              5       9     18  79.83424   55.00 1994
## HPO             5      10     23  91.31999   74.00 1735
## Cellmarkers     5      52    115 180.51446  223.25 1770
## Hallmark       32      98    180 146.44000  200.00  200
# gene coverage
cov <- unlist(lapply(gs,function(x) {
  y <- unique(unlist(x))
  length(y)
} ))
cov
##         KEGG     Reactome Wikipathways  miR targets     TFT GTRD           GO 
##         2727        11155         8313        16655        26807        19428 
##          HPO  Cellmarkers     Hallmark 
##         4997        20418         4384
# total number of annotations
tot <- unlist(lapply(gs, function(x) {
  sum(unlist(lapply(x,length)))
} ))
tot
##         KEGG     Reactome Wikipathways  miR targets     TFT GTRD           GO 
##         9651        95226        33947       372061       257181       835146 
##          HPO  Cellmarkers     Hallmark 
##       506552       149827         7322
x1 <- readRDS("../dataprep/bulkrna1.Rds")
head(x1)
##                         baseMean log2FoldChange     lfcSE       stat
## ENSG00000000003 TSPAN6 3754.3819    -0.10925392 0.1417469 -0.7707674
## ENSG00000000419 DPM1   1718.6738    -0.06109763 0.1477350 -0.4135622
## ENSG00000000457 SCYL3   260.6222     0.28894664 0.2166482  1.3337138
## ENSG00000000460 FIRRM   327.1207     0.57058478 0.2019567  2.8252821
## ENSG00000001036 FUCA2  6164.1606    -0.47524943 0.1136307 -4.1824048
## ENSG00000001084 GCLC   4296.9247    -0.68760144 0.1235878 -5.5636659
##                              pvalue         padj
## ENSG00000000003 TSPAN6 4.408448e-01 6.243711e-01
## ENSG00000000419 DPM1   6.791948e-01 8.109114e-01
## ENSG00000000457 SCYL3  1.822977e-01 3.508604e-01
## ENSG00000000460 FIRRM  4.723901e-03 2.518387e-02
## ENSG00000001036 FUCA2  2.884418e-05 4.256904e-04
## ENSG00000001084 GCLC   2.641654e-08 1.092526e-06
# remove genes without symbols
x1f <- x1[lapply(strsplit(rownames(x1) , " "),length)>1,]
dim(x1)
## [1] 15520     6
dim(x1f)
## [1] 15104     6
# about 1000 genes are lost after removing redundant gene names
bg <- unique(sapply(strsplit(rownames(x1f) , " "),"[[",2))
length(bg)
## [1] 14144
up <-rownames(subset(x1f, log2FoldChange > 0 & padj < 0.05 ))
up <- unique(sapply(strsplit(up," "),"[[",2))
length(up)
## [1] 1469
dn <-rownames(subset(x1f, log2FoldChange < 0 & padj < 0.05 ))
dn <- unique(sapply(strsplit(dn," "),"[[",2))
length(dn)
## [1] 1801
# kegg gene set
kegg <- gs[[1]]
kegg <- stack(kegg)
colnames(kegg) <- c("gene","term")
kegg <- kegg[,c(2,1)]
kegg$term <- gsub("KEGG_MEDICUS_","",kegg$term)
kegg$term <- gsub("_"," ",kegg$term)
ora_up <- as.data.frame(enricher(gene = up ,
    universe = bg, minGSSize = 5, maxGSSize = 500000, TERM2GENE = kegg,
    pAdjustMethod="fdr",  pvalueCutoff = 1, qvalueCutoff = 1  ))
 gr <- as.numeric(sapply(strsplit(ora_up$GeneRatio,"/"),"[[",1)) /
    as.numeric(sapply(strsplit(ora_up$GeneRatio,"/"),"[[",2))
  br <- as.numeric(sapply(strsplit(ora_up$BgRatio,"/"),"[[",1)) /
    as.numeric(sapply(strsplit(ora_up$BgRatio,"/"),"[[",2))
  ora_up$es <- gr/br
ora_up$geneID = ora_up$ID = ora_up$Description = NULL
ora_up_kegg <- ora_up
head(ora_up,20) %>%
  kbl(row.names = TRUE, caption="Upregulated KEGGs in SRP128998") %>%
  kable_paper("hover", full_width = F)
| GeneRatio | BgRatio | pvalue | p.adjust | qvalue | Count | es | |
|---|---|---|---|---|---|---|---|
| REFERENCE ORIGIN UNWINDING AND ELONGATION | 15/268 | 30/2120 | 0.0000006 | 0.0002128 | 0.0002027 | 15 | 3.955224 | 
| REFERENCE MEVALONATE PATHWAY | 8/268 | 11/2120 | 0.0000069 | 0.0012031 | 0.0011460 | 8 | 5.753053 | 
| REFERENCE CLASSICAL PATHWAY OF COMPLEMENT CASCADE C4 C2 TO C3 CONVERTASE FORMATION | 5/268 | 5/2120 | 0.0000312 | 0.0024834 | 0.0023655 | 5 | 7.910448 | 
| REFERENCE TRAIP DEPENDENT REPLISOME DISASSEMBLY | 9/268 | 16/2120 | 0.0000370 | 0.0024834 | 0.0023655 | 9 | 4.449627 | 
| REFERENCE RAD51 DSDNA DESTABILIZATION | 7/268 | 10/2120 | 0.0000410 | 0.0024834 | 0.0023655 | 7 | 5.537313 | 
| REFERENCE CHOLESTEROL BIOSYNTHESIS | 8/268 | 13/2120 | 0.0000429 | 0.0024834 | 0.0023655 | 8 | 4.867968 | 
| REFERENCE ORGANIZATION OF THE OUTER KINETOCHORE | 9/268 | 19/2120 | 0.0002110 | 0.0093540 | 0.0089100 | 9 | 3.747054 | 
| REFERENCE DEPHOSPHORYLATION OF KINETOCHORE | 7/268 | 12/2120 | 0.0002157 | 0.0093540 | 0.0089100 | 7 | 4.614428 | 
| REFERENCE PRE IC FORMATION | 10/268 | 27/2120 | 0.0010349 | 0.0399023 | 0.0380080 | 10 | 2.929795 | 
| REFERENCE CENPE INTERACTION WITH NDC80 COMPLEX | 6/268 | 12/2120 | 0.0018369 | 0.0637399 | 0.0607139 | 6 | 3.955224 | 
| REFERENCE CDC25 CELL CYCLE G2 M | 5/268 | 9/2120 | 0.0025467 | 0.0754786 | 0.0718953 | 5 | 4.394693 | 
| REFERENCE DNA REPLICATION TERMINATION | 8/268 | 21/2120 | 0.0027149 | 0.0754786 | 0.0718953 | 8 | 3.013504 | 
| REFERENCE COMMON PATHWAY OF COMPLEMENT CASCADE MAC FORMATION | 4/268 | 6/2120 | 0.0030452 | 0.0754786 | 0.0718953 | 4 | 5.273632 | 
| REFERENCE LECTIN PATHWAY OF COAGULATION CASCADE FIBRINOGEN TO FIBRIN | 4/268 | 6/2120 | 0.0030452 | 0.0754786 | 0.0718953 | 4 | 5.273632 | 
| REFERENCE HOMOLOGOUS RECOMBINATION | 10/268 | 32/2120 | 0.0044335 | 0.0991532 | 0.0944460 | 10 | 2.472015 | 
| REFERENCE RECRUITMENT AND FORMATION OF THE MCC | 5/268 | 10/2120 | 0.0045719 | 0.0991532 | 0.0944460 | 5 | 3.955224 | 
| REFERENCE LECTIN PATHWAY OF COMPLEMENT CASCADE C4 C2 TO C3 CONVERTASE FORMATION | 4/268 | 7/2120 | 0.0064025 | 0.1306854 | 0.1244811 | 4 | 4.520256 | 
| REFERENCE HOMOLOGOUS RECOMBINATION IN ICLR | 6/268 | 15/2120 | 0.0071439 | 0.1377178 | 0.1311797 | 6 | 3.164179 | 
| REFERENCE BREAK INDUCED REPLICATION | 4/268 | 8/2120 | 0.0115435 | 0.2002801 | 0.1907719 | 4 | 3.955224 | 
| REFERENCE REGULATION OF COMPLEMENT CASCADE MAC INHIBITION | 4/268 | 8/2120 | 0.0115435 | 0.2002801 | 0.1907719 | 4 | 3.955224 | 
# GO gene set
go <- gs[[6]]
go <- stack(go)
colnames(go) <- c("gene","term")
go <- go[,c(2,1)]
go$term <- gsub("^GO","",go$term)
go$term <- gsub("_"," ",go$term)
ora_up <- as.data.frame(enricher(gene = up ,
    universe = bg, minGSSize = 5, maxGSSize = 500000, TERM2GENE = go,
    pAdjustMethod="fdr",  pvalueCutoff = 1, qvalueCutoff = 1  ))
 gr <- as.numeric(sapply(strsplit(ora_up$GeneRatio,"/"),"[[",1)) /
    as.numeric(sapply(strsplit(ora_up$GeneRatio,"/"),"[[",2))
  br <- as.numeric(sapply(strsplit(ora_up$BgRatio,"/"),"[[",1)) /
    as.numeric(sapply(strsplit(ora_up$BgRatio,"/"),"[[",2))
  ora_up$es <- gr/br
ora_up$geneID = ora_up$ID = ora_up$Description = NULL
ora_up_go <- ora_up
head(ora_up,20) %>%
  kbl(row.names = TRUE, caption="Upregulated GOs in SRP128998") %>%
  kable_paper("hover", full_width = F)
| GeneRatio | BgRatio | pvalue | p.adjust | qvalue | Count | es | |
|---|---|---|---|---|---|---|---|
| BP SMALL MOLECULE METABOLIC PROCESS | 232/1315 | 1465/12365 | 0.0e+00 | 0.0000003 | 0.0000003 | 232 | 1.489082 | 
| BP ORGANIC ACID METABOLIC PROCESS | 138/1315 | 761/12365 | 0.0e+00 | 0.0000003 | 0.0000003 | 138 | 1.705151 | 
| BP SMALL MOLECULE BIOSYNTHETIC PROCESS | 93/1315 | 455/12365 | 0.0e+00 | 0.0000006 | 0.0000006 | 93 | 1.921940 | 
| BP FATTY ACID METABOLIC PROCESS | 64/1315 | 287/12365 | 0.0e+00 | 0.0000093 | 0.0000089 | 64 | 2.096846 | 
| BP HUMORAL IMMUNE RESPONSE MEDIATED BY CIRCULATING IMMUNOGLOBULIN | 16/1315 | 30/12365 | 0.0e+00 | 0.0000116 | 0.0000111 | 16 | 5.014956 | 
| BP COMPLEMENT ACTIVATION CLASSICAL PATHWAY | 14/1315 | 24/12365 | 0.0e+00 | 0.0000179 | 0.0000172 | 14 | 5.485108 | 
| CC BLOOD MICROPARTICLE | 29/1315 | 90/12365 | 0.0e+00 | 0.0000230 | 0.0000221 | 29 | 3.029869 | 
| BP STEROL BIOSYNTHETIC PROCESS | 21/1315 | 54/12365 | 1.0e-07 | 0.0000443 | 0.0000425 | 21 | 3.656738 | 
| BP LIPID METABOLIC PROCESS | 163/1315 | 1039/12365 | 1.0e-07 | 0.0000798 | 0.0000765 | 163 | 1.475164 | 
| BP CELLULAR LIPID METABOLIC PROCESS | 126/1315 | 765/12365 | 2.0e-07 | 0.0001667 | 0.0001598 | 126 | 1.548736 | 
| BP COMPLEMENT ACTIVATION | 15/1315 | 33/12365 | 4.0e-07 | 0.0002210 | 0.0002118 | 15 | 4.274110 | 
| BP REGULATION OF MITOTIC NUCLEAR DIVISION | 29/1315 | 101/12365 | 4.0e-07 | 0.0002210 | 0.0002118 | 29 | 2.699883 | 
| BP MONOCARBOXYLIC ACID METABOLIC PROCESS | 84/1315 | 463/12365 | 5.0e-07 | 0.0002843 | 0.0002725 | 84 | 1.705951 | 
| BP REGULATION OF NUCLEAR DIVISION | 31/1315 | 117/12365 | 1.1e-06 | 0.0005118 | 0.0004907 | 31 | 2.491404 | 
| CC COLLAGEN CONTAINING EXTRACELLULAR MATRIX | 52/1315 | 248/12365 | 1.1e-06 | 0.0005118 | 0.0004907 | 52 | 1.971605 | 
| BP HUMORAL IMMUNE RESPONSE | 27/1315 | 95/12365 | 1.2e-06 | 0.0005173 | 0.0004959 | 27 | 2.672443 | 
| CC OUTER KINETOCHORE | 11/1315 | 20/12365 | 1.3e-06 | 0.0005209 | 0.0004993 | 11 | 5.171673 | 
| BP G2 MI TRANSITION OF MEIOTIC CELL CYCLE | 6/1315 | 6/12365 | 1.4e-06 | 0.0005224 | 0.0005008 | 6 | 9.403042 | 
| CC KNL1 SPC105 COMPLEX | 6/1315 | 6/12365 | 1.4e-06 | 0.0005224 | 0.0005008 | 6 | 9.403042 | 
| CC EXTERNAL ENCAPSULATING STRUCTURE | 60/1315 | 306/12365 | 1.8e-06 | 0.0006187 | 0.0005931 | 60 | 1.843734 | 
Transfer a gene set from GO to KEGG analysis.
pw <- gs[[6]][grep("GOBP_COMPLEMENT_ACTIVATION_CLASSICAL_PATHWAY", names(gs[[6]]))]
pw
## $GOBP_COMPLEMENT_ACTIVATION_CLASSICAL_PATHWAY
##  [1] "C1QA"     "C1QB"     "C1QBP"    "C1QC"     "C1R"      "C1RL"    
##  [7] "C1S"      "C2"       "C3"       "C4A"      "C4B"      "C4BPA"   
## [13] "C4BPB"    "C5"       "C6"       "C7"       "C8A"      "C8B"     
## [19] "C8G"      "C9"       "CD46"     "CD55"     "CFI"      "CLU"     
## [25] "CR1"      "CR1L"     "CR2"      "IGHA1"    "IGHA2"    "IGHE"    
## [31] "IGHG1"    "IGHG2"    "IGHG3"    "IGHG4"    "IGHM"     "MASP2"   
## [37] "MBL2"     "NCR3LG1"  "SERPING1" "SUSD4"    "SVEP1"    "TREM2"
pw <- stack(pw)[,c(2,1)]
colnames(pw) <- c("term","gene")
kegg2 <- rbind(kegg,pw)
ora_up <- as.data.frame(enricher(gene = up ,
    universe = bg, minGSSize = 5, maxGSSize = 500000, TERM2GENE = kegg2,
    pAdjustMethod="fdr",  pvalueCutoff = 1, qvalueCutoff = 1  ))
 gr <- as.numeric(sapply(strsplit(ora_up$GeneRatio,"/"),"[[",1)) /
    as.numeric(sapply(strsplit(ora_up$GeneRatio,"/"),"[[",2))
  br <- as.numeric(sapply(strsplit(ora_up$BgRatio,"/"),"[[",1)) /
    as.numeric(sapply(strsplit(ora_up$BgRatio,"/"),"[[",2))
  ora_up$es <- gr/br
ora_up$geneID = ora_up$ID = ora_up$Description = NULL
ora_up_kegg2 <- ora_up
head(ora_up,20) %>%
  kbl(row.names = TRUE, caption="Upregulated KEGGs in SRP128998 with Complement") %>%
  kable_paper("hover", full_width = F)
| GeneRatio | BgRatio | pvalue | p.adjust | qvalue | Count | es | |
|---|---|---|---|---|---|---|---|
| GOBP_COMPLEMENT_ACTIVATION_CLASSICAL_PATHWAY | 14/270 | 24/2126 | 0.0000001 | 0.0000433 | 0.0000411 | 14 | 4.593210 | 
| REFERENCE ORIGIN UNWINDING AND ELONGATION | 15/270 | 30/2126 | 0.0000007 | 0.0001136 | 0.0001079 | 15 | 3.937037 | 
| REFERENCE MEVALONATE PATHWAY | 8/270 | 11/2126 | 0.0000072 | 0.0008337 | 0.0007919 | 8 | 5.726599 | 
| REFERENCE CLASSICAL PATHWAY OF COMPLEMENT CASCADE C4 C2 TO C3 CONVERTASE FORMATION | 5/270 | 5/2126 | 0.0000320 | 0.0022101 | 0.0020991 | 5 | 7.874074 | 
| REFERENCE TRAIP DEPENDENT REPLISOME DISASSEMBLY | 9/270 | 16/2126 | 0.0000385 | 0.0022101 | 0.0020991 | 9 | 4.429167 | 
| REFERENCE RAD51 DSDNA DESTABILIZATION | 7/270 | 10/2126 | 0.0000423 | 0.0022101 | 0.0020991 | 7 | 5.511852 | 
| REFERENCE CHOLESTEROL BIOSYNTHESIS | 8/270 | 13/2126 | 0.0000445 | 0.0022101 | 0.0020991 | 8 | 4.845584 | 
| REFERENCE ORGANIZATION OF THE OUTER KINETOCHORE | 9/270 | 19/2126 | 0.0002188 | 0.0085920 | 0.0081606 | 9 | 3.729825 | 
| REFERENCE DEPHOSPHORYLATION OF KINETOCHORE | 7/270 | 12/2126 | 0.0002222 | 0.0085920 | 0.0081606 | 7 | 4.593210 | 
| REFERENCE PRE IC FORMATION | 10/270 | 27/2126 | 0.0010740 | 0.0373768 | 0.0355001 | 10 | 2.916324 | 
| REFERENCE CENPE INTERACTION WITH NDC80 COMPLEX | 6/270 | 12/2126 | 0.0018828 | 0.0595650 | 0.0565742 | 6 | 3.937037 | 
| REFERENCE CDC25 CELL CYCLE G2 M | 5/270 | 9/2126 | 0.0026010 | 0.0719007 | 0.0682904 | 5 | 4.374486 | 
| REFERENCE DNA REPLICATION TERMINATION | 8/270 | 21/2126 | 0.0027977 | 0.0719007 | 0.0682904 | 8 | 2.999647 | 
| REFERENCE COMMON PATHWAY OF COMPLEMENT CASCADE MAC FORMATION | 4/270 | 6/2126 | 0.0030992 | 0.0719007 | 0.0682904 | 4 | 5.249383 | 
| REFERENCE LECTIN PATHWAY OF COAGULATION CASCADE FIBRINOGEN TO FIBRIN | 4/270 | 6/2126 | 0.0030992 | 0.0719007 | 0.0682904 | 4 | 5.249383 | 
| REFERENCE HOMOLOGOUS RECOMBINATION | 10/270 | 32/2126 | 0.0045878 | 0.0955378 | 0.0907407 | 10 | 2.460648 | 
| REFERENCE RECRUITMENT AND FORMATION OF THE MCC | 5/270 | 10/2126 | 0.0046671 | 0.0955378 | 0.0907407 | 5 | 3.937037 | 
| REFERENCE LECTIN PATHWAY OF COMPLEMENT CASCADE C4 C2 TO C3 CONVERTASE FORMATION | 4/270 | 7/2126 | 0.0065125 | 0.1259078 | 0.1195857 | 4 | 4.499471 | 
| REFERENCE HOMOLOGOUS RECOMBINATION IN ICLR | 6/270 | 15/2126 | 0.0073105 | 0.1338972 | 0.1271740 | 6 | 3.149630 | 
| REFERENCE BREAK INDUCED REPLICATION | 4/270 | 8/2126 | 0.0117359 | 0.1944810 | 0.1847158 | 4 | 3.937037 | 
message("GO")
## GO
ora_up_go[rownames(ora_up_go)=="BP COMPLEMENT ACTIVATION CLASSICAL PATHWAY",]
##                                            GeneRatio  BgRatio       pvalue
## BP COMPLEMENT ACTIVATION CLASSICAL PATHWAY   14/1315 24/12365 1.553408e-08
##                                                p.adjust       qvalue Count
## BP COMPLEMENT ACTIVATION CLASSICAL PATHWAY 1.794445e-05 1.720195e-05    14
##                                                  es
## BP COMPLEMENT ACTIVATION CLASSICAL PATHWAY 5.485108
message("KEGG + complement")
## KEGG + complement
ora_up_kegg2[rownames(ora_up_kegg2)=="GOBP_COMPLEMENT_ACTIVATION_CLASSICAL_PATHWAY",]
##                                              GeneRatio BgRatio       pvalue
## GOBP_COMPLEMENT_ACTIVATION_CLASSICAL_PATHWAY    14/270 24/2126 1.243003e-07
##                                                  p.adjust       qvalue Count
## GOBP_COMPLEMENT_ACTIVATION_CLASSICAL_PATHWAY 4.325652e-05 4.108453e-05    14
##                                                   es
## GOBP_COMPLEMENT_ACTIVATION_CLASSICAL_PATHWAY 4.59321
Fix KEGG analysis.
bgdf <- data.frame("background",bg)
colnames(bgdf) <- c("term","gene")
keggfix <- rbind(kegg,bgdf)
ora_up <- as.data.frame(enricher(gene = up ,
    universe = bg, minGSSize = 5, maxGSSize = 500000, TERM2GENE = keggfix,
    pAdjustMethod="fdr",  pvalueCutoff = 1, qvalueCutoff = 1  ))
 gr <- as.numeric(sapply(strsplit(ora_up$GeneRatio,"/"),"[[",1)) /
    as.numeric(sapply(strsplit(ora_up$GeneRatio,"/"),"[[",2))
  br <- as.numeric(sapply(strsplit(ora_up$BgRatio,"/"),"[[",1)) /
    as.numeric(sapply(strsplit(ora_up$BgRatio,"/"),"[[",2))
  ora_up$es <- gr/br
ora_up$geneID = ora_up$ID = ora_up$Description = NULL
ora_up_keggfix <- ora_up
head(ora_up,20) %>%
  kbl(row.names = TRUE, caption="Upregulated KEGG fixed in SRP128998 with Complement") %>%
  kable_paper("hover", full_width = F)
| GeneRatio | BgRatio | pvalue | p.adjust | qvalue | Count | es | |
|---|---|---|---|---|---|---|---|
| REFERENCE ORIGIN UNWINDING AND ELONGATION | 15/1469 | 30/14144 | 0.0000001 | 0.0000196 | 0.0000179 | 15 | 4.814159 | 
| REFERENCE MEVALONATE PATHWAY | 8/1469 | 11/14144 | 0.0000016 | 0.0002863 | 0.0002615 | 8 | 7.002414 | 
| REFERENCE TRAIP DEPENDENT REPLISOME DISASSEMBLY | 9/1469 | 16/14144 | 0.0000080 | 0.0006967 | 0.0006364 | 9 | 5.415929 | 
| REFERENCE CHOLESTEROL BIOSYNTHESIS | 8/1469 | 13/14144 | 0.0000106 | 0.0006967 | 0.0006364 | 8 | 5.925119 | 
| REFERENCE RAD51 DSDNA DESTABILIZATION | 7/1469 | 10/14144 | 0.0000116 | 0.0006967 | 0.0006364 | 7 | 6.739823 | 
| REFERENCE CLASSICAL PATHWAY OF COMPLEMENT CASCADE C4 C2 TO C3 CONVERTASE FORMATION | 5/1469 | 5/14144 | 0.0000120 | 0.0006967 | 0.0006364 | 5 | 9.628319 | 
| REFERENCE ORGANIZATION OF THE OUTER KINETOCHORE | 9/1469 | 19/14144 | 0.0000481 | 0.0023924 | 0.0021854 | 9 | 4.560783 | 
| REFERENCE DEPHOSPHORYLATION OF KINETOCHORE | 7/1469 | 12/14144 | 0.0000635 | 0.0027637 | 0.0025246 | 7 | 5.616519 | 
| REFERENCE PRE IC FORMATION | 10/1469 | 27/14144 | 0.0002272 | 0.0087860 | 0.0080259 | 10 | 3.566044 | 
| REFERENCE CENPE INTERACTION WITH NDC80 COMPLEX | 6/1469 | 12/14144 | 0.0006602 | 0.0229733 | 0.0209859 | 6 | 4.814159 | 
| REFERENCE DNA REPLICATION TERMINATION | 8/1469 | 21/14144 | 0.0007817 | 0.0247290 | 0.0225897 | 8 | 3.667931 | 
| REFERENCE CDC25 CELL CYCLE G2 M | 5/1469 | 9/14144 | 0.0010565 | 0.0286777 | 0.0261968 | 5 | 5.349066 | 
| REFERENCE HOMOLOGOUS RECOMBINATION | 10/1469 | 32/14144 | 0.0010713 | 0.0286777 | 0.0261968 | 10 | 3.008850 | 
| REFERENCE COMMON PATHWAY OF COMPLEMENT CASCADE MAC FORMATION | 4/1469 | 6/14144 | 0.0014632 | 0.0339453 | 0.0310087 | 4 | 6.418879 | 
| REFERENCE LECTIN PATHWAY OF COAGULATION CASCADE FIBRINOGEN TO FIBRIN | 4/1469 | 6/14144 | 0.0014632 | 0.0339453 | 0.0310087 | 4 | 6.418879 | 
| REFERENCE RECRUITMENT AND FORMATION OF THE MCC | 5/1469 | 10/14144 | 0.0019326 | 0.0420348 | 0.0383984 | 5 | 4.814159 | 
| REFERENCE HOMOLOGOUS RECOMBINATION IN ICLR | 6/1469 | 15/14144 | 0.0027178 | 0.0556340 | 0.0508212 | 6 | 3.851327 | 
| REFERENCE LECTIN PATHWAY OF COMPLEMENT CASCADE C4 C2 TO C3 CONVERTASE FORMATION | 4/1469 | 7/14144 | 0.0031332 | 0.0605748 | 0.0553345 | 4 | 5.501896 | 
| REFERENCE BREAK INDUCED REPLICATION | 4/1469 | 8/14144 | 0.0057529 | 0.1000998 | 0.0914402 | 4 | 4.814159 | 
| REFERENCE REGULATION OF COMPLEMENT CASCADE MAC INHIBITION | 4/1469 | 8/14144 | 0.0057529 | 0.1000998 | 0.0914402 | 4 | 4.814159 | 
message("Original")
## Original
ora_up_kegg[which(rownames(ora_up_kegg)=="REFERENCE CLASSICAL PATHWAY OF COMPLEMENT CASCADE C4 C2 TO C3 CONVERTASE FORMATION"),]
##                                                                                    GeneRatio
## REFERENCE CLASSICAL PATHWAY OF COMPLEMENT CASCADE C4 C2 TO C3 CONVERTASE FORMATION     5/268
##                                                                                    BgRatio
## REFERENCE CLASSICAL PATHWAY OF COMPLEMENT CASCADE C4 C2 TO C3 CONVERTASE FORMATION  5/2120
##                                                                                          pvalue
## REFERENCE CLASSICAL PATHWAY OF COMPLEMENT CASCADE C4 C2 TO C3 CONVERTASE FORMATION 3.124267e-05
##                                                                                       p.adjust
## REFERENCE CLASSICAL PATHWAY OF COMPLEMENT CASCADE C4 C2 TO C3 CONVERTASE FORMATION 0.002483402
##                                                                                         qvalue
## REFERENCE CLASSICAL PATHWAY OF COMPLEMENT CASCADE C4 C2 TO C3 CONVERTASE FORMATION 0.002365504
##                                                                                    Count
## REFERENCE CLASSICAL PATHWAY OF COMPLEMENT CASCADE C4 C2 TO C3 CONVERTASE FORMATION     5
##                                                                                          es
## REFERENCE CLASSICAL PATHWAY OF COMPLEMENT CASCADE C4 C2 TO C3 CONVERTASE FORMATION 7.910448
message("Fixed")
## Fixed
ora_up_keggfix[which(rownames(ora_up_keggfix)=="REFERENCE CLASSICAL PATHWAY OF COMPLEMENT CASCADE C4 C2 TO C3 CONVERTASE FORMATION"),]
##                                                                                    GeneRatio
## REFERENCE CLASSICAL PATHWAY OF COMPLEMENT CASCADE C4 C2 TO C3 CONVERTASE FORMATION    5/1469
##                                                                                    BgRatio
## REFERENCE CLASSICAL PATHWAY OF COMPLEMENT CASCADE C4 C2 TO C3 CONVERTASE FORMATION 5/14144
##                                                                                          pvalue
## REFERENCE CLASSICAL PATHWAY OF COMPLEMENT CASCADE C4 C2 TO C3 CONVERTASE FORMATION 1.201145e-05
##                                                                                       p.adjust
## REFERENCE CLASSICAL PATHWAY OF COMPLEMENT CASCADE C4 C2 TO C3 CONVERTASE FORMATION 0.000696664
##                                                                                          qvalue
## REFERENCE CLASSICAL PATHWAY OF COMPLEMENT CASCADE C4 C2 TO C3 CONVERTASE FORMATION 0.0006363961
##                                                                                    Count
## REFERENCE CLASSICAL PATHWAY OF COMPLEMENT CASCADE C4 C2 TO C3 CONVERTASE FORMATION     5
##                                                                                          es
## REFERENCE CLASSICAL PATHWAY OF COMPLEMENT CASCADE C4 C2 TO C3 CONVERTASE FORMATION 9.628319
Here is a function that conducts over-representation analysis with ClusterProfiler. Genes that are not annotated as having any gene set category are discarded.
# df = deseq output
# df must have "ensID geneSymbol" as rownames
# gs = gene sets
# gs format is list of symbols
orabad <- function( df, gs ) {
  def <- df
  gs1 <- stack(gs)
  colnames(gs1) <- c("gene","term")
  gs1 <- gs1[,c(2,1)]
  defup <- rownames(subset(def,padj<=0.05 & log2FoldChange>0))
  defup <- defup[lapply(strsplit(defup," "),length)==2]
  defup <- unique(sapply(strsplit(defup," "),"[[",2))
  if ( length(defup)<200 ) {
    defup <- head(rownames(subset(def,log2FoldChange>0)),200)
    defup <- defup[lapply(strsplit(defup," "),length)==2]
    defup <- unique(sapply(strsplit(defup," "),"[[",2))
  }
  defdn <- rownames(subset(def,padj<=0.05 & log2FoldChange<0))
  defdn <- defdn[lapply(strsplit(defdn," "),length)==2]
  defdn <- unique(sapply(strsplit(defdn," "),"[[",2))
  if ( length(defdn)<200 ) {
    defdn <- head(rownames(subset(def,log2FoldChange<0)),200)
    defdn <- defdn[lapply(strsplit(defdn," "),length)==2]
    defdn <- unique(sapply(strsplit(defdn," "),"[[",2))
  }
  bg <- rownames(def)
  bg <- bg[lapply(strsplit(bg," "),length)==2]
  bg <- unique(sapply(strsplit(bg," "),"[[",2))
  ora_up <- as.data.frame(enricher(gene = defup ,
    universe = bg, minGSSize = 5, maxGSSize = 500000, TERM2GENE = gs1,
    pAdjustMethod="fdr",  pvalueCutoff = 1, qvalueCutoff = 1  ))
  ora_up$geneID <- NULL
  ora_ups <- rownames(ora_up)
  gr <- as.numeric(sapply(strsplit(ora_up$GeneRatio,"/"),"[[",1)) /
    as.numeric(sapply(strsplit(ora_up$GeneRatio,"/"),"[[",2))
  br <- as.numeric(sapply(strsplit(ora_up$BgRatio,"/"),"[[",1)) /
    as.numeric(sapply(strsplit(ora_up$BgRatio,"/"),"[[",2))
  ora_up$es <- gr/br
  ora_up <- ora_up[order(-ora_up$es),]
  ora_up$Description=NULL
  ora_dn <- as.data.frame(enricher(gene = defdn ,
    universe = bg, minGSSize = 5, maxGSSize = 500000, TERM2GENE = gs1,
    pAdjustMethod="fdr",  pvalueCutoff = 1, qvalueCutoff = 1  ))
  ora_dn$geneID <- NULL
  ora_dns <- rownames(ora_dn)
  gr <- as.numeric(sapply(strsplit(ora_dn$GeneRatio,"/"),"[[",1)) /
    as.numeric(sapply(strsplit(ora_dn$GeneRatio,"/"),"[[",2))
  br <- as.numeric(sapply(strsplit(ora_dn$BgRatio,"/"),"[[",1)) /
    as.numeric(sapply(strsplit(ora_dn$BgRatio,"/"),"[[",2))
  ora_dn$es <- gr/br
  ora_dn <- ora_dn[order(-ora_dn$es),]
  ora_dn$Description=NULL
  result <- list("orabad_up"=ora_up,"orabad_dn"=ora_dn)
  return(result)
}
This function fixes the problem with excluding un-annotated genes.
oragood <- function( df, gs ) {
  def <- df
  gs1 <- stack(gs)
  colnames(gs1) <- c("gene","term")
  gs1 <- gs1[,c(2,1)]
  defup <- rownames(subset(def,padj<=0.05 & log2FoldChange>0))
  defup <- defup[lapply(strsplit(defup," "),length)==2]
  defup <- unique(sapply(strsplit(defup," "),"[[",2))
  if ( length(defup)<200 ) {
    defup <- head(rownames(subset(def,log2FoldChange>0)),200)
    defup <- defup[lapply(strsplit(defup," "),length)==2]
    defup <- unique(sapply(strsplit(defup," "),"[[",2))
  }
  defdn <- rownames(subset(def,padj<=0.05 & log2FoldChange<0))
  defdn <- defdn[lapply(strsplit(defdn," "),length)==2]
  defdn <- unique(sapply(strsplit(defdn," "),"[[",2))
  if ( length(defdn)<200 ) {
    defdn <- head(rownames(subset(def,log2FoldChange<0)),200)
    defdn <- defdn[lapply(strsplit(defdn," "),length)==2]
    defdn <- unique(sapply(strsplit(defdn," "),"[[",2))
  }
  bg <- rownames(def)
  bg <- bg[lapply(strsplit(bg," "),length)==2]
  bg <- unique(sapply(strsplit(bg," "),"[[",2))
  bgdf <- data.frame("background",bg)
  names(bgdf) <- c("term","gene")
  gs1 <- rbind(gs1,bgdf)
  ora_up <- as.data.frame(enricher(gene = defup ,
    universe = bg, minGSSize = 5, maxGSSize = 500000, TERM2GENE = gs1,
    pAdjustMethod="fdr",  pvalueCutoff = 1, qvalueCutoff = 1  ))
  ora_up$geneID <- NULL
  ora_ups <- rownames(ora_up)
  gr <- as.numeric(sapply(strsplit(ora_up$GeneRatio,"/"),"[[",1)) /
    as.numeric(sapply(strsplit(ora_up$GeneRatio,"/"),"[[",2))
  br <- as.numeric(sapply(strsplit(ora_up$BgRatio,"/"),"[[",1)) /
    as.numeric(sapply(strsplit(ora_up$BgRatio,"/"),"[[",2))
  ora_up$es <- gr/br
  ora_up <- ora_up[order(-ora_up$es),]
  ora_up$Description=NULL
  ora_dn <- as.data.frame(enricher(gene = defdn ,
    universe = bg, minGSSize = 5, maxGSSize = 500000, TERM2GENE = gs1,
    pAdjustMethod="fdr",  pvalueCutoff = 1, qvalueCutoff = 1  ))
  ora_dn$geneID <- NULL
  ora_dns <- rownames(ora_dn)
  gr <- as.numeric(sapply(strsplit(ora_dn$GeneRatio,"/"),"[[",1)) /
    as.numeric(sapply(strsplit(ora_dn$GeneRatio,"/"),"[[",2))
  br <- as.numeric(sapply(strsplit(ora_dn$BgRatio,"/"),"[[",1)) /
    as.numeric(sapply(strsplit(ora_dn$BgRatio,"/"),"[[",2))
  ora_dn$es <- gr/br
  ora_dn <- ora_dn[order(-ora_dn$es),]
  ora_dn$Description=NULL
  result <- list("oragood_up"=ora_up,"oragood_dn"=ora_dn)
  return(result)
}
# bad=ORA analysis, a list with results of up and down regulated pathways
# good=same as bad, except fixed the background problem
compare_ora_plots <- function(bad,good,libname=NULL){
  badup <- bad$orabad_up
  baddn <- bad$orabad_dn
  goodup <- good$oragood_up
  gooddn <- good$oragood_dn
  mup <- merge(badup,goodup,by=0)
  MAX=max(c(mup$es.x,mup$es.y))
  UPHEADER=paste(libname,"up")
  DNHEADER=paste(libname,"dn")
  par(mfrow=c(2,2))
  plot(mup$es.x, mup$es.y,
    xlim=c(0,MAX), ylim=c(0,MAX),
    xlab="original ES", ylab="corrected ES", main=UPHEADER)
  grid()
  abline(a = 0, b = 1,lwd=2,lty=2)
  MAX=max(-log10(c(mup$pvalue.x,mup$pvalue.y)))
  plot(-log10(mup$pvalue.x), -log10(mup$pvalue.y),
    xlim=c(0,MAX), ylim=c(0,MAX),
    xlab="original -log10 p-value", ylab="corrected -log10 p-value", main=UPHEADER)
  grid()
  abline(a = 0, b = 1,lwd=2,lty=2)
  mdn <- merge(baddn,gooddn,by=0)
  MAX=max(c(mdn$es.x,mdn$es.y))
  plot(mdn$es.x, mdn$es.y,
    xlim=c(0,MAX), ylim=c(0,MAX),
    xlab="original ES", ylab="corrected ES", main=DNHEADER)
  grid()
  abline(a = 0, b = 1,lwd=2,lty=2)
  MAX=max(-log10(c(mdn$pvalue.x,mdn$pvalue.y)))
  plot(-log10(mdn$pvalue.x), -log10(mdn$pvalue.y),
    xlim=c(0,MAX), ylim=c(0,MAX),
    xlab="original -log10 p-value", ylab="corrected -log10 p-value", main=DNHEADER)
  grid()
  abline(a = 0, b = 1,lwd=2,lty=2)
}
compare_ora <- function(bad,good){
  badup <- bad$orabad_up
  baddn <- bad$orabad_dn
  goodup <- good$oragood_up
  gooddn <- good$oragood_dn
  mup <- merge(badup,goodup,by=0)
  mdn <- merge(baddn,gooddn,by=0)
  bup <- subset(mup,p.adjust.x<0.05)$Row.names
  gup <- subset(mup,p.adjust.y<0.05)$Row.names
  bdn <- subset(mdn,p.adjust.x<0.05)$Row.names
  gdn <- subset(mdn,p.adjust.y<0.05)$Row.names
  sig_bad <- unique(c(bup,bdn))
  sig_good <- unique(c(gup,gdn))
  nsig_bad <- length(sig_bad)
  nsig_good <- length(sig_good)
  jac <- length(intersect(sig_bad,sig_good)) / length(union(sig_bad,sig_good))
  result=c("nsig_bad"=nsig_bad,"nsig_good"=nsig_good,"Jaccard"=jac)
  return(result)
}
params <- expand.grid(1:length(d),1:length(gs))
params2 <- expand.grid(names(d),names(gs))
colnames(params) <- c("d","gs")
ora_dat <- mclapply(1:nrow(params), function(i) {
  j=params[i,1]
  k=params[i,2]
  bad <- orabad(d[[j]],gs[[k]])
  good <- oragood(d[[j]],gs[[k]])
  res <- compare_ora(bad,good)
  return(res)
} , mc.cores=8)
ora_dat <- do.call(rbind,ora_dat)
res1 <- cbind(params2,ora_dat)
res1 %>%
  kbl(row.names = TRUE, caption="all results") %>%
  kable_paper("hover", full_width = F)
| Var1 | Var2 | nsig_bad | nsig_good | Jaccard | |
|---|---|---|---|---|---|
| 1 | bulkrna1.Rds | KEGG | 9 | 19 | 0.4736842 | 
| 2 | bulkrna2.Rds | KEGG | 25 | 59 | 0.4237288 | 
| 3 | bulkrna3.Rds | KEGG | 2 | 6 | 0.3333333 | 
| 4 | bulkrna4.Rds | KEGG | 14 | 24 | 0.5833333 | 
| 5 | bulkrna5.Rds | KEGG | 0 | 0 | NaN | 
| 6 | bulkrna6.Rds | KEGG | 47 | 56 | 0.8392857 | 
| 7 | bulkrna7.Rds | KEGG | 9 | 38 | 0.2368421 | 
| 8 | bulkrna1.Rds | Reactome | 64 | 102 | 0.6274510 | 
| 9 | bulkrna2.Rds | Reactome | 239 | 351 | 0.6809117 | 
| 10 | bulkrna3.Rds | Reactome | 94 | 159 | 0.5617284 | 
| 11 | bulkrna4.Rds | Reactome | 207 | 307 | 0.6742671 | 
| 12 | bulkrna5.Rds | Reactome | 0 | 0 | NaN | 
| 13 | bulkrna6.Rds | Reactome | 254 | 334 | 0.7604790 | 
| 14 | bulkrna7.Rds | Reactome | 185 | 240 | 0.7708333 | 
| 15 | bulkrna1.Rds | Wikipathways | 33 | 46 | 0.7173913 | 
| 16 | bulkrna2.Rds | Wikipathways | 108 | 193 | 0.5595855 | 
| 17 | bulkrna3.Rds | Wikipathways | 8 | 12 | 0.6666667 | 
| 18 | bulkrna4.Rds | Wikipathways | 13 | 50 | 0.2600000 | 
| 19 | bulkrna5.Rds | Wikipathways | 1 | 1 | 1.0000000 | 
| 20 | bulkrna6.Rds | Wikipathways | 82 | 97 | 0.8453608 | 
| 21 | bulkrna7.Rds | Wikipathways | 35 | 117 | 0.2991453 | 
| 22 | bulkrna1.Rds | miR targets | 4 | 81 | 0.0493827 | 
| 23 | bulkrna2.Rds | miR targets | 40 | 111 | 0.3603604 | 
| 24 | bulkrna3.Rds | miR targets | 21 | 38 | 0.5526316 | 
| 25 | bulkrna4.Rds | miR targets | 281 | 635 | 0.4425197 | 
| 26 | bulkrna5.Rds | miR targets | 0 | 0 | NaN | 
| 27 | bulkrna6.Rds | miR targets | 1166 | 1365 | 0.8393895 | 
| 28 | bulkrna7.Rds | miR targets | 360 | 544 | 0.6617647 | 
| 29 | bulkrna1.Rds | TFT GTRD | 52 | 85 | 0.6117647 | 
| 30 | bulkrna2.Rds | TFT GTRD | 98 | 140 | 0.7000000 | 
| 31 | bulkrna3.Rds | TFT GTRD | 108 | 135 | 0.8000000 | 
| 32 | bulkrna4.Rds | TFT GTRD | 71 | 115 | 0.6173913 | 
| 33 | bulkrna5.Rds | TFT GTRD | 1 | 1 | 1.0000000 | 
| 34 | bulkrna6.Rds | TFT GTRD | 147 | 173 | 0.8497110 | 
| 35 | bulkrna7.Rds | TFT GTRD | 129 | 153 | 0.8431373 | 
| 36 | bulkrna1.Rds | GO | 485 | 707 | 0.6859972 | 
| 37 | bulkrna2.Rds | GO | 1843 | 2168 | 0.8500923 | 
| 38 | bulkrna3.Rds | GO | 686 | 915 | 0.7497268 | 
| 39 | bulkrna4.Rds | GO | 390 | 709 | 0.5500705 | 
| 40 | bulkrna5.Rds | GO | 6 | 48 | 0.1250000 | 
| 41 | bulkrna6.Rds | GO | 588 | 789 | 0.7452471 | 
| 42 | bulkrna7.Rds | GO | 1263 | 1544 | 0.8180052 | 
| 43 | bulkrna1.Rds | HPO | 2 | 42 | 0.0476190 | 
| 44 | bulkrna2.Rds | HPO | 380 | 988 | 0.3846154 | 
| 45 | bulkrna3.Rds | HPO | 92 | 241 | 0.3817427 | 
| 46 | bulkrna4.Rds | HPO | 24 | 283 | 0.0848057 | 
| 47 | bulkrna5.Rds | HPO | 0 | 1 | 0.0000000 | 
| 48 | bulkrna6.Rds | HPO | 17 | 265 | 0.0641509 | 
| 49 | bulkrna7.Rds | HPO | 186 | 449 | 0.4142539 | 
| 50 | bulkrna1.Rds | Cellmarkers | 139 | 170 | 0.8176471 | 
| 51 | bulkrna2.Rds | Cellmarkers | 439 | 477 | 0.9203354 | 
| 52 | bulkrna3.Rds | Cellmarkers | 345 | 382 | 0.9031414 | 
| 53 | bulkrna4.Rds | Cellmarkers | 244 | 286 | 0.8531469 | 
| 54 | bulkrna5.Rds | Cellmarkers | 0 | 1 | 0.0000000 | 
| 55 | bulkrna6.Rds | Cellmarkers | 267 | 297 | 0.8989899 | 
| 56 | bulkrna7.Rds | Cellmarkers | 448 | 491 | 0.9085366 | 
| 57 | bulkrna1.Rds | Hallmark | 7 | 20 | 0.3500000 | 
| 58 | bulkrna2.Rds | Hallmark | 25 | 38 | 0.6578947 | 
| 59 | bulkrna3.Rds | Hallmark | 18 | 21 | 0.7727273 | 
| 60 | bulkrna4.Rds | Hallmark | 13 | 27 | 0.4814815 | 
| 61 | bulkrna5.Rds | Hallmark | 0 | 2 | 0.0000000 | 
| 62 | bulkrna6.Rds | Hallmark | 28 | 37 | 0.7567568 | 
| 63 | bulkrna7.Rds | Hallmark | 18 | 34 | 0.4444444 | 
summary(res1$nsig_bad)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0    11.0    52.0   188.3   241.5  1843.0
summary(res1$nsig_good)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0    35.5   115.0   273.3   320.5  2168.0
summary(res1$Jaccard)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  0.0000  0.3839  0.6427  0.5718  0.8044  1.0000       3
# NUMBER OF SIGNIFICANT SETS
ns1 <- matrix( res1$nsig_bad , ncol=9 )
rownames(ns1) <- names(d)
colnames(ns1) <- names(gs)
ns1 %>%
  kbl(row.names = TRUE, caption="significant genesets in original analysis") %>%
  kable_paper("hover", full_width = F)
| KEGG | Reactome | Wikipathways | miR targets | TFT GTRD | GO | HPO | Cellmarkers | Hallmark | |
|---|---|---|---|---|---|---|---|---|---|
| bulkrna1.Rds | 9 | 64 | 33 | 4 | 52 | 485 | 2 | 139 | 7 | 
| bulkrna2.Rds | 25 | 239 | 108 | 40 | 98 | 1843 | 380 | 439 | 25 | 
| bulkrna3.Rds | 2 | 94 | 8 | 21 | 108 | 686 | 92 | 345 | 18 | 
| bulkrna4.Rds | 14 | 207 | 13 | 281 | 71 | 390 | 24 | 244 | 13 | 
| bulkrna5.Rds | 0 | 0 | 1 | 0 | 1 | 6 | 0 | 0 | 0 | 
| bulkrna6.Rds | 47 | 254 | 82 | 1166 | 147 | 588 | 17 | 267 | 28 | 
| bulkrna7.Rds | 9 | 185 | 35 | 360 | 129 | 1263 | 186 | 448 | 18 | 
ns1 <- apply(ns1,2,function(x) { x <- x[!is.na(x)] ; x <- x[is.finite(x)] ;  mean(x) } )
ns2 <- matrix( res1$nsig_good , ncol=9 )
rownames(ns2) <- names(d)
colnames(ns2) <- names(gs)
ns2 %>%
  kbl(row.names = TRUE, caption="significant genesets in corrected analysis") %>%
  kable_paper("hover", full_width = F)
| KEGG | Reactome | Wikipathways | miR targets | TFT GTRD | GO | HPO | Cellmarkers | Hallmark | |
|---|---|---|---|---|---|---|---|---|---|
| bulkrna1.Rds | 19 | 102 | 46 | 81 | 85 | 707 | 42 | 170 | 20 | 
| bulkrna2.Rds | 59 | 351 | 193 | 111 | 140 | 2168 | 988 | 477 | 38 | 
| bulkrna3.Rds | 6 | 159 | 12 | 38 | 135 | 915 | 241 | 382 | 21 | 
| bulkrna4.Rds | 24 | 307 | 50 | 635 | 115 | 709 | 283 | 286 | 27 | 
| bulkrna5.Rds | 0 | 0 | 1 | 0 | 1 | 48 | 1 | 1 | 2 | 
| bulkrna6.Rds | 56 | 334 | 97 | 1365 | 173 | 789 | 265 | 297 | 37 | 
| bulkrna7.Rds | 38 | 240 | 117 | 544 | 153 | 1544 | 449 | 491 | 34 | 
ns2 <- apply(ns2,2,function(x) { x <- x[!is.na(x)] ; x <- x[is.finite(x)] ;  mean(x) } )
ns2
##         KEGG     Reactome Wikipathways  miR targets     TFT GTRD           GO 
##     28.85714    213.28571     73.71429    396.28571    114.57143    982.85714 
##          HPO  Cellmarkers     Hallmark 
##    324.14286    300.57143     25.57143
df <- data.frame(ns1,ns2)
df <- df[order(df$ns1),]
par(mar=c(c(5.1, 7.1, 4.1, 2.1) ))
barplot(t(df),beside=TRUE,horiz=TRUE,las=1,xlim=c(0,1250),
  xlab="no. significant sets",
  col=c("gray30","gray80"))
text(df$ns1+50,((1:nrow(df))*3)-1.5,labels=signif(df[,1],3))
text(df$ns2+50,((1:nrow(df))*3)-0.5,labels=signif(df[,2],3))
#abline(v=seq(100,1000,100),lty=2,lwd=0.5,col="gray")
legend("bottomright", inset=.02, legend=c("corrected","original"),
  fill=c("gray80","gray30"), horiz=FALSE, cex=1.1)
mylabels <- gsub("$","%",gsub("^","+",signif(((df$ns2/df$ns1)-1)*100,3)))
text(df$ns2+160,((1:nrow(df))*3)-1,labels=mylabels,cex=1.1)
png("fig1_ORAsig.png")
par(mar=c(c(5.1, 7.1, 2.1, 2.1) ))
barplot(t(df),beside=TRUE,horiz=TRUE,las=1,xlim=c(0,1250),
  xlab="no. significant sets",
  col=c("gray30","gray80"))
text(df$ns1+50,((1:nrow(df))*3)-1.5,labels=signif(df[,1],3))
text(df$ns2+50,((1:nrow(df))*3)-0.5,labels=signif(df[,2],3))
#abline(v=seq(100,1000,100),lty=2,lwd=0.5,col="gray")
legend("bottomright", inset=.02, legend=c("corrected","original"),
  fill=c("gray80","gray30"), horiz=FALSE, cex=1.1)
mylabels <- gsub("$","%",gsub("^","+",signif(((df$ns2/df$ns1)-1)*100,3)))
text(df$ns2+160,((1:nrow(df))*3)-1,labels=mylabels,cex=1.1)
dev.off()
## png 
##   2
# JACCARD
jac <- matrix( res1$Jaccard , ncol=9 )
rownames(jac) <- names(d)
colnames(jac) <- names(gs)
signif(jac,3) %>%
  kbl(row.names = TRUE, caption="Jaccard similarity Corrected:Original") %>%
  kable_paper("hover", full_width = F)
| KEGG | Reactome | Wikipathways | miR targets | TFT GTRD | GO | HPO | Cellmarkers | Hallmark | |
|---|---|---|---|---|---|---|---|---|---|
| bulkrna1.Rds | 0.474 | 0.627 | 0.717 | 0.0494 | 0.612 | 0.686 | 0.0476 | 0.818 | 0.350 | 
| bulkrna2.Rds | 0.424 | 0.681 | 0.560 | 0.3600 | 0.700 | 0.850 | 0.3850 | 0.920 | 0.658 | 
| bulkrna3.Rds | 0.333 | 0.562 | 0.667 | 0.5530 | 0.800 | 0.750 | 0.3820 | 0.903 | 0.773 | 
| bulkrna4.Rds | 0.583 | 0.674 | 0.260 | 0.4430 | 0.617 | 0.550 | 0.0848 | 0.853 | 0.481 | 
| bulkrna5.Rds | NaN | NaN | 1.000 | NaN | 1.000 | 0.125 | 0.0000 | 0.000 | 0.000 | 
| bulkrna6.Rds | 0.839 | 0.760 | 0.845 | 0.8390 | 0.850 | 0.745 | 0.0642 | 0.899 | 0.757 | 
| bulkrna7.Rds | 0.237 | 0.771 | 0.299 | 0.6620 | 0.843 | 0.818 | 0.4140 | 0.909 | 0.444 | 
jac2 <- apply(jac,2,function(x) { x <- x[!is.na(x)] ; x <- x[is.finite(x)] ;  mean(x) } )
jac2
##         KEGG     Reactome Wikipathways  miR targets     TFT GTRD           GO 
##    0.4817013    0.6792784    0.6211642    0.4843414    0.7745720    0.6463056 
##          HPO  Cellmarkers     Hallmark 
##    0.1967411    0.7573996    0.4947578
par(mar=c(c(5.1, 7.1, 2.1, 2.1) ))
barplot(sort(jac2),horiz=TRUE,las=1,xlim=c(0,1), xlab="Jaccard index")
par(mar=c(c(5.1, 5.1, 2.1, 2.1) ))
plot(cov,jac2,xlim=c(0,30000),ylim=c(0,1),pch=19,col="lightgray",cex=3,
  xlab="gene coverage",ylab="Jaccard index")
text(cov,jac2,labels=names(cov))
mylm <- lm(jac2~cov)
abline(mylm,lty=2,lwd=2)
SLOPE=unname(signif(mylm$coefficients[2],3))
INT=unname(signif(mylm$coefficients[1],3))
PVAL=signif(summary(mylm)$coefficients[4],3)
RSQ=signif(summary(mylm)$r.squared,3)
HEADER=paste("Rsq =",RSQ,"; p-val =",PVAL)
mtext(HEADER)
png("fig1_ORAjac.png")
par(mar=c(c(5.1, 5.1, 2.1, 2.1) ))
plot(cov,jac2,xlim=c(0,30000),ylim=c(0,1),pch=19,col="lightgray",cex=3,
  xlab="gene coverage",ylab="Jaccard index")
text(cov,jac2,labels=names(cov))
mylm <- lm(jac2~cov)
abline(mylm,lty=2,lwd=2)
SLOPE=unname(signif(mylm$coefficients[2],3))
INT=unname(signif(mylm$coefficients[1],3))
PVAL=signif(summary(mylm)$coefficients[4],3)
RSQ=signif(summary(mylm)$r.squared,3)
HEADER=paste("Rsq =",RSQ,"; p-val =",PVAL)
mtext(HEADER)
dev.off()
## png 
##   2
For bulk RNA-seq data set #1, run enrichment with all the gene set libraries.
names(gs)
## [1] "KEGG"         "Reactome"     "Wikipathways" "miR targets"  "TFT GTRD"    
## [6] "GO"           "HPO"          "Cellmarkers"  "Hallmark"
lapply(1:length(gs), function(i) {
  bad <- orabad(d[[1]],gs[[i]])
  good <- oragood(d[[1]],gs[[i]])
  compare_ora_plots(bad,good,libname=names(gs)[i])
} )
## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL
## 
## [[5]]
## NULL
## 
## [[6]]
## NULL
## 
## [[7]]
## NULL
## 
## [[8]]
## NULL
## 
## [[9]]
## NULL
For reproducibility
sessionInfo()
## R version 4.4.0 (2024-04-24)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 22.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so;  LAPACK version 3.10.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Etc/UTC
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] gplots_3.1.3.1              fgsea_1.30.0               
##  [3] biomaRt_2.60.0              eulerr_7.0.2               
##  [5] kableExtra_1.4.0            mitch_1.16.0               
##  [7] clusterProfiler_4.12.0      DESeq2_1.44.0              
##  [9] SummarizedExperiment_1.34.0 Biobase_2.64.0             
## [11] MatrixGenerics_1.16.0       matrixStats_1.3.0          
## [13] GenomicRanges_1.56.0        GenomeInfoDb_1.40.1        
## [15] IRanges_2.38.0              S4Vectors_0.42.0           
## [17] BiocGenerics_0.50.0         getDEE2_1.14.0             
## 
## loaded via a namespace (and not attached):
##   [1] RColorBrewer_1.1-3      rstudioapi_0.16.0       jsonlite_1.8.8         
##   [4] magrittr_2.0.3          farver_2.1.2            rmarkdown_2.26         
##   [7] fs_1.6.4                zlibbioc_1.50.0         vctrs_0.6.5            
##  [10] memoise_2.0.1           ggtree_3.12.0           progress_1.2.3         
##  [13] htmltools_0.5.8.1       S4Arrays_1.4.1          curl_5.2.1             
##  [16] SparseArray_1.4.8       gridGraphics_0.5-1      sass_0.4.9             
##  [19] KernSmooth_2.23-22      bslib_0.7.0             htmlwidgets_1.6.4      
##  [22] httr2_1.0.1             echarts4r_0.4.5         plyr_1.8.9             
##  [25] cachem_1.0.8            igraph_2.0.3            mime_0.12              
##  [28] lifecycle_1.0.4         pkgconfig_2.0.3         Matrix_1.7-0           
##  [31] R6_2.5.1                fastmap_1.1.1           gson_0.1.0             
##  [34] shiny_1.8.1.1           GenomeInfoDbData_1.2.12 digest_0.6.35          
##  [37] aplot_0.2.2             enrichplot_1.24.0       GGally_2.2.1           
##  [40] colorspace_2.1-0        patchwork_1.2.0         AnnotationDbi_1.66.0   
##  [43] RSQLite_2.3.7           filelock_1.0.3          fansi_1.0.6            
##  [46] httr_1.4.7              polyclip_1.10-6         abind_1.4-5            
##  [49] compiler_4.4.0          bit64_4.0.5             withr_3.0.0            
##  [52] BiocParallel_1.38.0     viridis_0.6.5           DBI_1.2.3              
##  [55] ggstats_0.6.0           highr_0.10              ggforce_0.4.2          
##  [58] MASS_7.3-60.2           rappdirs_0.3.3          DelayedArray_0.30.1    
##  [61] HDO.db_0.99.1           caTools_1.18.2          gtools_3.9.5           
##  [64] tools_4.4.0             beeswarm_0.4.0          ape_5.8                
##  [67] scatterpie_0.2.3        httpuv_1.6.15           glue_1.7.0             
##  [70] promises_1.3.0          nlme_3.1-164            GOSemSim_2.30.0        
##  [73] grid_4.4.0              shadowtext_0.1.3        reshape2_1.4.4         
##  [76] generics_0.1.3          gtable_0.3.5            tidyr_1.3.1            
##  [79] hms_1.1.3               data.table_1.15.4       xml2_1.3.6             
##  [82] tidygraph_1.3.1         utf8_1.2.4              XVector_0.44.0         
##  [85] ggrepel_0.9.5           pillar_1.9.0            stringr_1.5.1          
##  [88] yulab.utils_0.1.4       later_1.3.2             splines_4.4.0          
##  [91] dplyr_1.1.4             tweenr_2.0.3            BiocFileCache_2.12.0   
##  [94] treeio_1.28.0           lattice_0.22-6          bit_4.0.5              
##  [97] tidyselect_1.2.1        GO.db_3.19.1            locfit_1.5-9.9         
## [100] Biostrings_2.72.1       knitr_1.46              gridExtra_2.3          
## [103] svglite_2.1.3           xfun_0.43               graphlayouts_1.1.1     
## [106] stringi_1.8.4           UCSC.utils_1.0.0        lazyeval_0.2.2         
## [109] ggfun_0.1.5             yaml_2.3.8              evaluate_0.23          
## [112] codetools_0.2-20        ggraph_2.2.1            tibble_3.2.1           
## [115] qvalue_2.36.0           ggplotify_0.1.2         cli_3.6.2              
## [118] systemfonts_1.0.6       xtable_1.8-4            munsell_0.5.1          
## [121] jquerylib_0.1.4         Rcpp_1.0.12             dbplyr_2.5.0           
## [124] png_0.1-8               ggplot2_3.5.1           blob_1.2.4             
## [127] prettyunits_1.2.0       DOSE_3.30.1             htm2txt_2.2.2          
## [130] bitops_1.0-7            viridisLite_0.4.2       tidytree_0.4.6         
## [133] scales_1.3.0            purrr_1.0.2             crayon_1.5.2           
## [136] rlang_1.1.3             cowplot_1.1.3           fastmatch_1.1-4        
## [139] KEGGREST_1.44.0