Source: https://github.com/markziemann/background
This guide is a Rmarkdown script that conducts differential expression and enrichment analysis, which are very popular workflows for transcriptome data.
The goal of this work is to understand how ClusterProfiler manages the background list, as we have observed some weird behaviour. In particular we see that when we provide a custom background, those genes do not appear in the final analysis unless they are also included in the gene list.
In the code chunk below called libs
, you can add and
remove required R library dependancies. Check that the libraries listed
here match the Dockerfile, otherwise you might get errors.
suppressPackageStartupMessages({
library("getDEE2")
library("DESeq2")
library("kableExtra")
library("clusterProfiler")
library("fgsea")
library("eulerr")
library("gplots")
library("parallel")
})
For this guide I will be using bulk RNA-seq data from several previous studies.
Gene sets were obtained from MSigDB.
myfiles <- list.files("../dataprep",pattern="*Rds",full.names=TRUE)
d <- lapply(myfiles,readRDS)
names(d) <- basename(myfiles)
mygmts <- list.files("../gmt",pattern="*gmt",full.names=TRUE)
gs <- lapply(mygmts,gmtPathways)
names(gs) <- basename(mygmts)
names(gs)
## [1] "c2.cp.kegg_medicus.v2023.2.Hs.symbols.gmt"
## [2] "c2.cp.reactome.v2023.2.Hs.symbols.gmt"
## [3] "c2.cp.wikipathways.v2023.2.Hs.symbols.gmt"
## [4] "c3.mir.mirdb.v2023.2.Hs.symbols.gmt"
## [5] "c3.tft.gtrd.v2023.2.Hs.symbols.gmt"
## [6] "c5.go.v2023.2.Hs.symbols.gmt"
## [7] "c5.hpo.v2023.2.Hs.symbols.gmt"
## [8] "c8.all.v2023.2.Hs.symbols.gmt"
## [9] "h.all.v2023.2.Hs.symbols.gmt"
names(gs) <- c("KEGG", "Reactome", "Wikipathways", "miR targets", "TFT GTRD",
"GO", "HPO", "Cellmarkers", "Hallmark" )
# number of sets
num <- unlist(lapply(gs,length))
num
## KEGG Reactome Wikipathways miR targets TFT GTRD GO
## 619 1692 791 2377 505 10461
## HPO Cellmarkers Hallmark
## 5547 830 50
# set size summary
sz <- unlist(lapply(gs,function(x) {
y <- unlist(lapply(x, length ))
summary(y)
} ))
sz <- t(matrix(sz,ncol=9))
rownames(sz) <- names(gs)
colnames(sz) <- names(summary(c(1,2,3,4)))
sz
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## KEGG 5 8 13 15.59128 19.00 92
## Reactome 5 11 23 56.28014 57.00 1463
## Wikipathways 5 14 28 42.91656 53.00 438
## miR targets 5 59 103 156.52545 186.00 1071
## TFT GTRD 5 55 287 509.26931 880.00 1999
## GO 5 9 18 79.83424 55.00 1994
## HPO 5 10 23 91.31999 74.00 1735
## Cellmarkers 5 52 115 180.51446 223.25 1770
## Hallmark 32 98 180 146.44000 200.00 200
# gene coverage
cov <- unlist(lapply(gs,function(x) {
y <- unique(unlist(x))
length(y)
} ))
cov
## KEGG Reactome Wikipathways miR targets TFT GTRD GO
## 2727 11155 8313 16655 26807 19428
## HPO Cellmarkers Hallmark
## 4997 20418 4384
# total number of annotations
tot <- unlist(lapply(gs, function(x) {
sum(unlist(lapply(x,length)))
} ))
tot
## KEGG Reactome Wikipathways miR targets TFT GTRD GO
## 9651 95226 33947 372061 257181 835146
## HPO Cellmarkers Hallmark
## 506552 149827 7322
x1 <- readRDS("../dataprep/bulkrna1.Rds")
head(x1)
## baseMean log2FoldChange lfcSE stat
## ENSG00000000003 TSPAN6 3754.3819 -0.10925392 0.1417469 -0.7707674
## ENSG00000000419 DPM1 1718.6738 -0.06109763 0.1477350 -0.4135622
## ENSG00000000457 SCYL3 260.6222 0.28894664 0.2166482 1.3337138
## ENSG00000000460 FIRRM 327.1207 0.57058478 0.2019567 2.8252821
## ENSG00000001036 FUCA2 6164.1606 -0.47524943 0.1136307 -4.1824048
## ENSG00000001084 GCLC 4296.9247 -0.68760144 0.1235878 -5.5636659
## pvalue padj
## ENSG00000000003 TSPAN6 4.408448e-01 6.243711e-01
## ENSG00000000419 DPM1 6.791948e-01 8.109114e-01
## ENSG00000000457 SCYL3 1.822977e-01 3.508604e-01
## ENSG00000000460 FIRRM 4.723901e-03 2.518387e-02
## ENSG00000001036 FUCA2 2.884418e-05 4.256904e-04
## ENSG00000001084 GCLC 2.641654e-08 1.092526e-06
# remove genes without symbols
x1f <- x1[lapply(strsplit(rownames(x1) , " "),length)>1,]
dim(x1)
## [1] 15520 6
dim(x1f)
## [1] 15104 6
# about 1000 genes are lost after removing redundant gene names
bg <- unique(sapply(strsplit(rownames(x1f) , " "),"[[",2))
length(bg)
## [1] 14144
up <-rownames(subset(x1f, log2FoldChange > 0 & padj < 0.05 ))
up <- unique(sapply(strsplit(up," "),"[[",2))
length(up)
## [1] 1469
dn <-rownames(subset(x1f, log2FoldChange < 0 & padj < 0.05 ))
dn <- unique(sapply(strsplit(dn," "),"[[",2))
length(dn)
## [1] 1801
# kegg gene set
kegg <- gs[[1]]
kegg <- stack(kegg)
colnames(kegg) <- c("gene","term")
kegg <- kegg[,c(2,1)]
kegg$term <- gsub("KEGG_MEDICUS_","",kegg$term)
kegg$term <- gsub("_"," ",kegg$term)
ora_up <- as.data.frame(enricher(gene = up ,
universe = bg, minGSSize = 5, maxGSSize = 500000, TERM2GENE = kegg,
pAdjustMethod="fdr", pvalueCutoff = 1, qvalueCutoff = 1 ))
gr <- as.numeric(sapply(strsplit(ora_up$GeneRatio,"/"),"[[",1)) /
as.numeric(sapply(strsplit(ora_up$GeneRatio,"/"),"[[",2))
br <- as.numeric(sapply(strsplit(ora_up$BgRatio,"/"),"[[",1)) /
as.numeric(sapply(strsplit(ora_up$BgRatio,"/"),"[[",2))
ora_up$es <- gr/br
ora_up$geneID = ora_up$ID = ora_up$Description = NULL
ora_up_kegg <- ora_up
head(ora_up,20) %>%
kbl(row.names = TRUE, caption="Upregulated KEGGs in SRP128998") %>%
kable_paper("hover", full_width = F)
GeneRatio | BgRatio | pvalue | p.adjust | qvalue | Count | es | |
---|---|---|---|---|---|---|---|
REFERENCE ORIGIN UNWINDING AND ELONGATION | 15/268 | 30/2120 | 0.0000006 | 0.0002128 | 0.0002027 | 15 | 3.955224 |
REFERENCE MEVALONATE PATHWAY | 8/268 | 11/2120 | 0.0000069 | 0.0012031 | 0.0011460 | 8 | 5.753053 |
REFERENCE CLASSICAL PATHWAY OF COMPLEMENT CASCADE C4 C2 TO C3 CONVERTASE FORMATION | 5/268 | 5/2120 | 0.0000312 | 0.0024834 | 0.0023655 | 5 | 7.910448 |
REFERENCE TRAIP DEPENDENT REPLISOME DISASSEMBLY | 9/268 | 16/2120 | 0.0000370 | 0.0024834 | 0.0023655 | 9 | 4.449627 |
REFERENCE RAD51 DSDNA DESTABILIZATION | 7/268 | 10/2120 | 0.0000410 | 0.0024834 | 0.0023655 | 7 | 5.537313 |
REFERENCE CHOLESTEROL BIOSYNTHESIS | 8/268 | 13/2120 | 0.0000429 | 0.0024834 | 0.0023655 | 8 | 4.867968 |
REFERENCE ORGANIZATION OF THE OUTER KINETOCHORE | 9/268 | 19/2120 | 0.0002110 | 0.0093540 | 0.0089100 | 9 | 3.747054 |
REFERENCE DEPHOSPHORYLATION OF KINETOCHORE | 7/268 | 12/2120 | 0.0002157 | 0.0093540 | 0.0089100 | 7 | 4.614428 |
REFERENCE PRE IC FORMATION | 10/268 | 27/2120 | 0.0010349 | 0.0399023 | 0.0380080 | 10 | 2.929795 |
REFERENCE CENPE INTERACTION WITH NDC80 COMPLEX | 6/268 | 12/2120 | 0.0018369 | 0.0637399 | 0.0607139 | 6 | 3.955224 |
REFERENCE CDC25 CELL CYCLE G2 M | 5/268 | 9/2120 | 0.0025467 | 0.0754786 | 0.0718953 | 5 | 4.394693 |
REFERENCE DNA REPLICATION TERMINATION | 8/268 | 21/2120 | 0.0027149 | 0.0754786 | 0.0718953 | 8 | 3.013504 |
REFERENCE COMMON PATHWAY OF COMPLEMENT CASCADE MAC FORMATION | 4/268 | 6/2120 | 0.0030452 | 0.0754786 | 0.0718953 | 4 | 5.273632 |
REFERENCE LECTIN PATHWAY OF COAGULATION CASCADE FIBRINOGEN TO FIBRIN | 4/268 | 6/2120 | 0.0030452 | 0.0754786 | 0.0718953 | 4 | 5.273632 |
REFERENCE HOMOLOGOUS RECOMBINATION | 10/268 | 32/2120 | 0.0044335 | 0.0991532 | 0.0944460 | 10 | 2.472015 |
REFERENCE RECRUITMENT AND FORMATION OF THE MCC | 5/268 | 10/2120 | 0.0045719 | 0.0991532 | 0.0944460 | 5 | 3.955224 |
REFERENCE LECTIN PATHWAY OF COMPLEMENT CASCADE C4 C2 TO C3 CONVERTASE FORMATION | 4/268 | 7/2120 | 0.0064025 | 0.1306854 | 0.1244811 | 4 | 4.520256 |
REFERENCE HOMOLOGOUS RECOMBINATION IN ICLR | 6/268 | 15/2120 | 0.0071439 | 0.1377178 | 0.1311797 | 6 | 3.164179 |
REFERENCE BREAK INDUCED REPLICATION | 4/268 | 8/2120 | 0.0115435 | 0.2002801 | 0.1907719 | 4 | 3.955224 |
REFERENCE REGULATION OF COMPLEMENT CASCADE MAC INHIBITION | 4/268 | 8/2120 | 0.0115435 | 0.2002801 | 0.1907719 | 4 | 3.955224 |
# GO gene set
go <- gs[[6]]
go <- stack(go)
colnames(go) <- c("gene","term")
go <- go[,c(2,1)]
go$term <- gsub("^GO","",go$term)
go$term <- gsub("_"," ",go$term)
ora_up <- as.data.frame(enricher(gene = up ,
universe = bg, minGSSize = 5, maxGSSize = 500000, TERM2GENE = go,
pAdjustMethod="fdr", pvalueCutoff = 1, qvalueCutoff = 1 ))
gr <- as.numeric(sapply(strsplit(ora_up$GeneRatio,"/"),"[[",1)) /
as.numeric(sapply(strsplit(ora_up$GeneRatio,"/"),"[[",2))
br <- as.numeric(sapply(strsplit(ora_up$BgRatio,"/"),"[[",1)) /
as.numeric(sapply(strsplit(ora_up$BgRatio,"/"),"[[",2))
ora_up$es <- gr/br
ora_up$geneID = ora_up$ID = ora_up$Description = NULL
ora_up_go <- ora_up
head(ora_up,20) %>%
kbl(row.names = TRUE, caption="Upregulated GOs in SRP128998") %>%
kable_paper("hover", full_width = F)
GeneRatio | BgRatio | pvalue | p.adjust | qvalue | Count | es | |
---|---|---|---|---|---|---|---|
BP SMALL MOLECULE METABOLIC PROCESS | 232/1315 | 1465/12365 | 0.0e+00 | 0.0000003 | 0.0000003 | 232 | 1.489082 |
BP ORGANIC ACID METABOLIC PROCESS | 138/1315 | 761/12365 | 0.0e+00 | 0.0000003 | 0.0000003 | 138 | 1.705151 |
BP SMALL MOLECULE BIOSYNTHETIC PROCESS | 93/1315 | 455/12365 | 0.0e+00 | 0.0000006 | 0.0000006 | 93 | 1.921940 |
BP FATTY ACID METABOLIC PROCESS | 64/1315 | 287/12365 | 0.0e+00 | 0.0000093 | 0.0000089 | 64 | 2.096846 |
BP HUMORAL IMMUNE RESPONSE MEDIATED BY CIRCULATING IMMUNOGLOBULIN | 16/1315 | 30/12365 | 0.0e+00 | 0.0000116 | 0.0000111 | 16 | 5.014956 |
BP COMPLEMENT ACTIVATION CLASSICAL PATHWAY | 14/1315 | 24/12365 | 0.0e+00 | 0.0000179 | 0.0000172 | 14 | 5.485108 |
CC BLOOD MICROPARTICLE | 29/1315 | 90/12365 | 0.0e+00 | 0.0000230 | 0.0000221 | 29 | 3.029869 |
BP STEROL BIOSYNTHETIC PROCESS | 21/1315 | 54/12365 | 1.0e-07 | 0.0000443 | 0.0000425 | 21 | 3.656738 |
BP LIPID METABOLIC PROCESS | 163/1315 | 1039/12365 | 1.0e-07 | 0.0000798 | 0.0000765 | 163 | 1.475164 |
BP CELLULAR LIPID METABOLIC PROCESS | 126/1315 | 765/12365 | 2.0e-07 | 0.0001667 | 0.0001598 | 126 | 1.548736 |
BP COMPLEMENT ACTIVATION | 15/1315 | 33/12365 | 4.0e-07 | 0.0002210 | 0.0002118 | 15 | 4.274110 |
BP REGULATION OF MITOTIC NUCLEAR DIVISION | 29/1315 | 101/12365 | 4.0e-07 | 0.0002210 | 0.0002118 | 29 | 2.699883 |
BP MONOCARBOXYLIC ACID METABOLIC PROCESS | 84/1315 | 463/12365 | 5.0e-07 | 0.0002843 | 0.0002725 | 84 | 1.705951 |
BP REGULATION OF NUCLEAR DIVISION | 31/1315 | 117/12365 | 1.1e-06 | 0.0005118 | 0.0004907 | 31 | 2.491404 |
CC COLLAGEN CONTAINING EXTRACELLULAR MATRIX | 52/1315 | 248/12365 | 1.1e-06 | 0.0005118 | 0.0004907 | 52 | 1.971605 |
BP HUMORAL IMMUNE RESPONSE | 27/1315 | 95/12365 | 1.2e-06 | 0.0005173 | 0.0004959 | 27 | 2.672443 |
CC OUTER KINETOCHORE | 11/1315 | 20/12365 | 1.3e-06 | 0.0005209 | 0.0004993 | 11 | 5.171673 |
BP G2 MI TRANSITION OF MEIOTIC CELL CYCLE | 6/1315 | 6/12365 | 1.4e-06 | 0.0005224 | 0.0005008 | 6 | 9.403042 |
CC KNL1 SPC105 COMPLEX | 6/1315 | 6/12365 | 1.4e-06 | 0.0005224 | 0.0005008 | 6 | 9.403042 |
CC EXTERNAL ENCAPSULATING STRUCTURE | 60/1315 | 306/12365 | 1.8e-06 | 0.0006187 | 0.0005931 | 60 | 1.843734 |
Transfer a gene set from GO to KEGG analysis.
pw <- gs[[6]][grep("GOBP_COMPLEMENT_ACTIVATION_CLASSICAL_PATHWAY", names(gs[[6]]))]
pw
## $GOBP_COMPLEMENT_ACTIVATION_CLASSICAL_PATHWAY
## [1] "C1QA" "C1QB" "C1QBP" "C1QC" "C1R" "C1RL"
## [7] "C1S" "C2" "C3" "C4A" "C4B" "C4BPA"
## [13] "C4BPB" "C5" "C6" "C7" "C8A" "C8B"
## [19] "C8G" "C9" "CD46" "CD55" "CFI" "CLU"
## [25] "CR1" "CR1L" "CR2" "IGHA1" "IGHA2" "IGHE"
## [31] "IGHG1" "IGHG2" "IGHG3" "IGHG4" "IGHM" "MASP2"
## [37] "MBL2" "NCR3LG1" "SERPING1" "SUSD4" "SVEP1" "TREM2"
pw <- stack(pw)[,c(2,1)]
colnames(pw) <- c("term","gene")
kegg2 <- rbind(kegg,pw)
ora_up <- as.data.frame(enricher(gene = up ,
universe = bg, minGSSize = 5, maxGSSize = 500000, TERM2GENE = kegg2,
pAdjustMethod="fdr", pvalueCutoff = 1, qvalueCutoff = 1 ))
gr <- as.numeric(sapply(strsplit(ora_up$GeneRatio,"/"),"[[",1)) /
as.numeric(sapply(strsplit(ora_up$GeneRatio,"/"),"[[",2))
br <- as.numeric(sapply(strsplit(ora_up$BgRatio,"/"),"[[",1)) /
as.numeric(sapply(strsplit(ora_up$BgRatio,"/"),"[[",2))
ora_up$es <- gr/br
ora_up$geneID = ora_up$ID = ora_up$Description = NULL
ora_up_kegg2 <- ora_up
head(ora_up,20) %>%
kbl(row.names = TRUE, caption="Upregulated KEGGs in SRP128998 with Complement") %>%
kable_paper("hover", full_width = F)
GeneRatio | BgRatio | pvalue | p.adjust | qvalue | Count | es | |
---|---|---|---|---|---|---|---|
GOBP_COMPLEMENT_ACTIVATION_CLASSICAL_PATHWAY | 14/270 | 24/2126 | 0.0000001 | 0.0000433 | 0.0000411 | 14 | 4.593210 |
REFERENCE ORIGIN UNWINDING AND ELONGATION | 15/270 | 30/2126 | 0.0000007 | 0.0001136 | 0.0001079 | 15 | 3.937037 |
REFERENCE MEVALONATE PATHWAY | 8/270 | 11/2126 | 0.0000072 | 0.0008337 | 0.0007919 | 8 | 5.726599 |
REFERENCE CLASSICAL PATHWAY OF COMPLEMENT CASCADE C4 C2 TO C3 CONVERTASE FORMATION | 5/270 | 5/2126 | 0.0000320 | 0.0022101 | 0.0020991 | 5 | 7.874074 |
REFERENCE TRAIP DEPENDENT REPLISOME DISASSEMBLY | 9/270 | 16/2126 | 0.0000385 | 0.0022101 | 0.0020991 | 9 | 4.429167 |
REFERENCE RAD51 DSDNA DESTABILIZATION | 7/270 | 10/2126 | 0.0000423 | 0.0022101 | 0.0020991 | 7 | 5.511852 |
REFERENCE CHOLESTEROL BIOSYNTHESIS | 8/270 | 13/2126 | 0.0000445 | 0.0022101 | 0.0020991 | 8 | 4.845584 |
REFERENCE ORGANIZATION OF THE OUTER KINETOCHORE | 9/270 | 19/2126 | 0.0002188 | 0.0085920 | 0.0081606 | 9 | 3.729825 |
REFERENCE DEPHOSPHORYLATION OF KINETOCHORE | 7/270 | 12/2126 | 0.0002222 | 0.0085920 | 0.0081606 | 7 | 4.593210 |
REFERENCE PRE IC FORMATION | 10/270 | 27/2126 | 0.0010740 | 0.0373768 | 0.0355001 | 10 | 2.916324 |
REFERENCE CENPE INTERACTION WITH NDC80 COMPLEX | 6/270 | 12/2126 | 0.0018828 | 0.0595650 | 0.0565742 | 6 | 3.937037 |
REFERENCE CDC25 CELL CYCLE G2 M | 5/270 | 9/2126 | 0.0026010 | 0.0719007 | 0.0682904 | 5 | 4.374486 |
REFERENCE DNA REPLICATION TERMINATION | 8/270 | 21/2126 | 0.0027977 | 0.0719007 | 0.0682904 | 8 | 2.999647 |
REFERENCE COMMON PATHWAY OF COMPLEMENT CASCADE MAC FORMATION | 4/270 | 6/2126 | 0.0030992 | 0.0719007 | 0.0682904 | 4 | 5.249383 |
REFERENCE LECTIN PATHWAY OF COAGULATION CASCADE FIBRINOGEN TO FIBRIN | 4/270 | 6/2126 | 0.0030992 | 0.0719007 | 0.0682904 | 4 | 5.249383 |
REFERENCE HOMOLOGOUS RECOMBINATION | 10/270 | 32/2126 | 0.0045878 | 0.0955378 | 0.0907407 | 10 | 2.460648 |
REFERENCE RECRUITMENT AND FORMATION OF THE MCC | 5/270 | 10/2126 | 0.0046671 | 0.0955378 | 0.0907407 | 5 | 3.937037 |
REFERENCE LECTIN PATHWAY OF COMPLEMENT CASCADE C4 C2 TO C3 CONVERTASE FORMATION | 4/270 | 7/2126 | 0.0065125 | 0.1259078 | 0.1195857 | 4 | 4.499471 |
REFERENCE HOMOLOGOUS RECOMBINATION IN ICLR | 6/270 | 15/2126 | 0.0073105 | 0.1338972 | 0.1271740 | 6 | 3.149630 |
REFERENCE BREAK INDUCED REPLICATION | 4/270 | 8/2126 | 0.0117359 | 0.1944810 | 0.1847158 | 4 | 3.937037 |
message("GO")
## GO
ora_up_go[rownames(ora_up_go)=="BP COMPLEMENT ACTIVATION CLASSICAL PATHWAY",]
## GeneRatio BgRatio pvalue
## BP COMPLEMENT ACTIVATION CLASSICAL PATHWAY 14/1315 24/12365 1.553408e-08
## p.adjust qvalue Count
## BP COMPLEMENT ACTIVATION CLASSICAL PATHWAY 1.794445e-05 1.720195e-05 14
## es
## BP COMPLEMENT ACTIVATION CLASSICAL PATHWAY 5.485108
message("KEGG + complement")
## KEGG + complement
ora_up_kegg2[rownames(ora_up_kegg2)=="GOBP_COMPLEMENT_ACTIVATION_CLASSICAL_PATHWAY",]
## GeneRatio BgRatio pvalue
## GOBP_COMPLEMENT_ACTIVATION_CLASSICAL_PATHWAY 14/270 24/2126 1.243003e-07
## p.adjust qvalue Count
## GOBP_COMPLEMENT_ACTIVATION_CLASSICAL_PATHWAY 4.325652e-05 4.108453e-05 14
## es
## GOBP_COMPLEMENT_ACTIVATION_CLASSICAL_PATHWAY 4.59321
Fix KEGG analysis.
bgdf <- data.frame("background",bg)
colnames(bgdf) <- c("term","gene")
keggfix <- rbind(kegg,bgdf)
ora_up <- as.data.frame(enricher(gene = up ,
universe = bg, minGSSize = 5, maxGSSize = 500000, TERM2GENE = keggfix,
pAdjustMethod="fdr", pvalueCutoff = 1, qvalueCutoff = 1 ))
gr <- as.numeric(sapply(strsplit(ora_up$GeneRatio,"/"),"[[",1)) /
as.numeric(sapply(strsplit(ora_up$GeneRatio,"/"),"[[",2))
br <- as.numeric(sapply(strsplit(ora_up$BgRatio,"/"),"[[",1)) /
as.numeric(sapply(strsplit(ora_up$BgRatio,"/"),"[[",2))
ora_up$es <- gr/br
ora_up$geneID = ora_up$ID = ora_up$Description = NULL
ora_up_keggfix <- ora_up
head(ora_up,20) %>%
kbl(row.names = TRUE, caption="Upregulated KEGG fixed in SRP128998 with Complement") %>%
kable_paper("hover", full_width = F)
GeneRatio | BgRatio | pvalue | p.adjust | qvalue | Count | es | |
---|---|---|---|---|---|---|---|
REFERENCE ORIGIN UNWINDING AND ELONGATION | 15/1469 | 30/14144 | 0.0000001 | 0.0000196 | 0.0000179 | 15 | 4.814159 |
REFERENCE MEVALONATE PATHWAY | 8/1469 | 11/14144 | 0.0000016 | 0.0002863 | 0.0002615 | 8 | 7.002414 |
REFERENCE TRAIP DEPENDENT REPLISOME DISASSEMBLY | 9/1469 | 16/14144 | 0.0000080 | 0.0006967 | 0.0006364 | 9 | 5.415929 |
REFERENCE CHOLESTEROL BIOSYNTHESIS | 8/1469 | 13/14144 | 0.0000106 | 0.0006967 | 0.0006364 | 8 | 5.925119 |
REFERENCE RAD51 DSDNA DESTABILIZATION | 7/1469 | 10/14144 | 0.0000116 | 0.0006967 | 0.0006364 | 7 | 6.739823 |
REFERENCE CLASSICAL PATHWAY OF COMPLEMENT CASCADE C4 C2 TO C3 CONVERTASE FORMATION | 5/1469 | 5/14144 | 0.0000120 | 0.0006967 | 0.0006364 | 5 | 9.628319 |
REFERENCE ORGANIZATION OF THE OUTER KINETOCHORE | 9/1469 | 19/14144 | 0.0000481 | 0.0023924 | 0.0021854 | 9 | 4.560783 |
REFERENCE DEPHOSPHORYLATION OF KINETOCHORE | 7/1469 | 12/14144 | 0.0000635 | 0.0027637 | 0.0025246 | 7 | 5.616519 |
REFERENCE PRE IC FORMATION | 10/1469 | 27/14144 | 0.0002272 | 0.0087860 | 0.0080259 | 10 | 3.566044 |
REFERENCE CENPE INTERACTION WITH NDC80 COMPLEX | 6/1469 | 12/14144 | 0.0006602 | 0.0229733 | 0.0209859 | 6 | 4.814159 |
REFERENCE DNA REPLICATION TERMINATION | 8/1469 | 21/14144 | 0.0007817 | 0.0247290 | 0.0225897 | 8 | 3.667931 |
REFERENCE CDC25 CELL CYCLE G2 M | 5/1469 | 9/14144 | 0.0010565 | 0.0286777 | 0.0261968 | 5 | 5.349066 |
REFERENCE HOMOLOGOUS RECOMBINATION | 10/1469 | 32/14144 | 0.0010713 | 0.0286777 | 0.0261968 | 10 | 3.008850 |
REFERENCE COMMON PATHWAY OF COMPLEMENT CASCADE MAC FORMATION | 4/1469 | 6/14144 | 0.0014632 | 0.0339453 | 0.0310087 | 4 | 6.418879 |
REFERENCE LECTIN PATHWAY OF COAGULATION CASCADE FIBRINOGEN TO FIBRIN | 4/1469 | 6/14144 | 0.0014632 | 0.0339453 | 0.0310087 | 4 | 6.418879 |
REFERENCE RECRUITMENT AND FORMATION OF THE MCC | 5/1469 | 10/14144 | 0.0019326 | 0.0420348 | 0.0383984 | 5 | 4.814159 |
REFERENCE HOMOLOGOUS RECOMBINATION IN ICLR | 6/1469 | 15/14144 | 0.0027178 | 0.0556340 | 0.0508212 | 6 | 3.851327 |
REFERENCE LECTIN PATHWAY OF COMPLEMENT CASCADE C4 C2 TO C3 CONVERTASE FORMATION | 4/1469 | 7/14144 | 0.0031332 | 0.0605748 | 0.0553345 | 4 | 5.501896 |
REFERENCE BREAK INDUCED REPLICATION | 4/1469 | 8/14144 | 0.0057529 | 0.1000998 | 0.0914402 | 4 | 4.814159 |
REFERENCE REGULATION OF COMPLEMENT CASCADE MAC INHIBITION | 4/1469 | 8/14144 | 0.0057529 | 0.1000998 | 0.0914402 | 4 | 4.814159 |
message("Original")
## Original
ora_up_kegg[which(rownames(ora_up_kegg)=="REFERENCE CLASSICAL PATHWAY OF COMPLEMENT CASCADE C4 C2 TO C3 CONVERTASE FORMATION"),]
## GeneRatio
## REFERENCE CLASSICAL PATHWAY OF COMPLEMENT CASCADE C4 C2 TO C3 CONVERTASE FORMATION 5/268
## BgRatio
## REFERENCE CLASSICAL PATHWAY OF COMPLEMENT CASCADE C4 C2 TO C3 CONVERTASE FORMATION 5/2120
## pvalue
## REFERENCE CLASSICAL PATHWAY OF COMPLEMENT CASCADE C4 C2 TO C3 CONVERTASE FORMATION 3.124267e-05
## p.adjust
## REFERENCE CLASSICAL PATHWAY OF COMPLEMENT CASCADE C4 C2 TO C3 CONVERTASE FORMATION 0.002483402
## qvalue
## REFERENCE CLASSICAL PATHWAY OF COMPLEMENT CASCADE C4 C2 TO C3 CONVERTASE FORMATION 0.002365504
## Count
## REFERENCE CLASSICAL PATHWAY OF COMPLEMENT CASCADE C4 C2 TO C3 CONVERTASE FORMATION 5
## es
## REFERENCE CLASSICAL PATHWAY OF COMPLEMENT CASCADE C4 C2 TO C3 CONVERTASE FORMATION 7.910448
message("Fixed")
## Fixed
ora_up_keggfix[which(rownames(ora_up_keggfix)=="REFERENCE CLASSICAL PATHWAY OF COMPLEMENT CASCADE C4 C2 TO C3 CONVERTASE FORMATION"),]
## GeneRatio
## REFERENCE CLASSICAL PATHWAY OF COMPLEMENT CASCADE C4 C2 TO C3 CONVERTASE FORMATION 5/1469
## BgRatio
## REFERENCE CLASSICAL PATHWAY OF COMPLEMENT CASCADE C4 C2 TO C3 CONVERTASE FORMATION 5/14144
## pvalue
## REFERENCE CLASSICAL PATHWAY OF COMPLEMENT CASCADE C4 C2 TO C3 CONVERTASE FORMATION 1.201145e-05
## p.adjust
## REFERENCE CLASSICAL PATHWAY OF COMPLEMENT CASCADE C4 C2 TO C3 CONVERTASE FORMATION 0.000696664
## qvalue
## REFERENCE CLASSICAL PATHWAY OF COMPLEMENT CASCADE C4 C2 TO C3 CONVERTASE FORMATION 0.0006363961
## Count
## REFERENCE CLASSICAL PATHWAY OF COMPLEMENT CASCADE C4 C2 TO C3 CONVERTASE FORMATION 5
## es
## REFERENCE CLASSICAL PATHWAY OF COMPLEMENT CASCADE C4 C2 TO C3 CONVERTASE FORMATION 9.628319
Here is a function that conducts over-representation analysis with ClusterProfiler. Genes that are not annotated as having any gene set category are discarded.
# df = deseq output
# df must have "ensID geneSymbol" as rownames
# gs = gene sets
# gs format is list of symbols
orabad <- function( df, gs ) {
def <- df
gs1 <- stack(gs)
colnames(gs1) <- c("gene","term")
gs1 <- gs1[,c(2,1)]
defup <- rownames(subset(def,padj<=0.05 & log2FoldChange>0))
defup <- defup[lapply(strsplit(defup," "),length)==2]
defup <- unique(sapply(strsplit(defup," "),"[[",2))
if ( length(defup)<200 ) {
defup <- head(rownames(subset(def,log2FoldChange>0)),200)
defup <- defup[lapply(strsplit(defup," "),length)==2]
defup <- unique(sapply(strsplit(defup," "),"[[",2))
}
defdn <- rownames(subset(def,padj<=0.05 & log2FoldChange<0))
defdn <- defdn[lapply(strsplit(defdn," "),length)==2]
defdn <- unique(sapply(strsplit(defdn," "),"[[",2))
if ( length(defdn)<200 ) {
defdn <- head(rownames(subset(def,log2FoldChange<0)),200)
defdn <- defdn[lapply(strsplit(defdn," "),length)==2]
defdn <- unique(sapply(strsplit(defdn," "),"[[",2))
}
bg <- rownames(def)
bg <- bg[lapply(strsplit(bg," "),length)==2]
bg <- unique(sapply(strsplit(bg," "),"[[",2))
ora_up <- as.data.frame(enricher(gene = defup ,
universe = bg, minGSSize = 5, maxGSSize = 500000, TERM2GENE = gs1,
pAdjustMethod="fdr", pvalueCutoff = 1, qvalueCutoff = 1 ))
ora_up$geneID <- NULL
ora_ups <- rownames(ora_up)
gr <- as.numeric(sapply(strsplit(ora_up$GeneRatio,"/"),"[[",1)) /
as.numeric(sapply(strsplit(ora_up$GeneRatio,"/"),"[[",2))
br <- as.numeric(sapply(strsplit(ora_up$BgRatio,"/"),"[[",1)) /
as.numeric(sapply(strsplit(ora_up$BgRatio,"/"),"[[",2))
ora_up$es <- gr/br
ora_up <- ora_up[order(-ora_up$es),]
ora_up$Description=NULL
ora_dn <- as.data.frame(enricher(gene = defdn ,
universe = bg, minGSSize = 5, maxGSSize = 500000, TERM2GENE = gs1,
pAdjustMethod="fdr", pvalueCutoff = 1, qvalueCutoff = 1 ))
ora_dn$geneID <- NULL
ora_dns <- rownames(ora_dn)
gr <- as.numeric(sapply(strsplit(ora_dn$GeneRatio,"/"),"[[",1)) /
as.numeric(sapply(strsplit(ora_dn$GeneRatio,"/"),"[[",2))
br <- as.numeric(sapply(strsplit(ora_dn$BgRatio,"/"),"[[",1)) /
as.numeric(sapply(strsplit(ora_dn$BgRatio,"/"),"[[",2))
ora_dn$es <- gr/br
ora_dn <- ora_dn[order(-ora_dn$es),]
ora_dn$Description=NULL
result <- list("orabad_up"=ora_up,"orabad_dn"=ora_dn)
return(result)
}
This function fixes the problem with excluding un-annotated genes.
oragood <- function( df, gs ) {
def <- df
gs1 <- stack(gs)
colnames(gs1) <- c("gene","term")
gs1 <- gs1[,c(2,1)]
defup <- rownames(subset(def,padj<=0.05 & log2FoldChange>0))
defup <- defup[lapply(strsplit(defup," "),length)==2]
defup <- unique(sapply(strsplit(defup," "),"[[",2))
if ( length(defup)<200 ) {
defup <- head(rownames(subset(def,log2FoldChange>0)),200)
defup <- defup[lapply(strsplit(defup," "),length)==2]
defup <- unique(sapply(strsplit(defup," "),"[[",2))
}
defdn <- rownames(subset(def,padj<=0.05 & log2FoldChange<0))
defdn <- defdn[lapply(strsplit(defdn," "),length)==2]
defdn <- unique(sapply(strsplit(defdn," "),"[[",2))
if ( length(defdn)<200 ) {
defdn <- head(rownames(subset(def,log2FoldChange<0)),200)
defdn <- defdn[lapply(strsplit(defdn," "),length)==2]
defdn <- unique(sapply(strsplit(defdn," "),"[[",2))
}
bg <- rownames(def)
bg <- bg[lapply(strsplit(bg," "),length)==2]
bg <- unique(sapply(strsplit(bg," "),"[[",2))
bgdf <- data.frame("background",bg)
names(bgdf) <- c("term","gene")
gs1 <- rbind(gs1,bgdf)
ora_up <- as.data.frame(enricher(gene = defup ,
universe = bg, minGSSize = 5, maxGSSize = 500000, TERM2GENE = gs1,
pAdjustMethod="fdr", pvalueCutoff = 1, qvalueCutoff = 1 ))
ora_up$geneID <- NULL
ora_ups <- rownames(ora_up)
gr <- as.numeric(sapply(strsplit(ora_up$GeneRatio,"/"),"[[",1)) /
as.numeric(sapply(strsplit(ora_up$GeneRatio,"/"),"[[",2))
br <- as.numeric(sapply(strsplit(ora_up$BgRatio,"/"),"[[",1)) /
as.numeric(sapply(strsplit(ora_up$BgRatio,"/"),"[[",2))
ora_up$es <- gr/br
ora_up <- ora_up[order(-ora_up$es),]
ora_up$Description=NULL
ora_dn <- as.data.frame(enricher(gene = defdn ,
universe = bg, minGSSize = 5, maxGSSize = 500000, TERM2GENE = gs1,
pAdjustMethod="fdr", pvalueCutoff = 1, qvalueCutoff = 1 ))
ora_dn$geneID <- NULL
ora_dns <- rownames(ora_dn)
gr <- as.numeric(sapply(strsplit(ora_dn$GeneRatio,"/"),"[[",1)) /
as.numeric(sapply(strsplit(ora_dn$GeneRatio,"/"),"[[",2))
br <- as.numeric(sapply(strsplit(ora_dn$BgRatio,"/"),"[[",1)) /
as.numeric(sapply(strsplit(ora_dn$BgRatio,"/"),"[[",2))
ora_dn$es <- gr/br
ora_dn <- ora_dn[order(-ora_dn$es),]
ora_dn$Description=NULL
result <- list("oragood_up"=ora_up,"oragood_dn"=ora_dn)
return(result)
}
# bad=ORA analysis, a list with results of up and down regulated pathways
# good=same as bad, except fixed the background problem
compare_ora_plots <- function(bad,good,libname=NULL){
badup <- bad$orabad_up
baddn <- bad$orabad_dn
goodup <- good$oragood_up
gooddn <- good$oragood_dn
mup <- merge(badup,goodup,by=0)
MAX=max(c(mup$es.x,mup$es.y))
UPHEADER=paste(libname,"up")
DNHEADER=paste(libname,"dn")
par(mfrow=c(2,2))
plot(mup$es.x, mup$es.y,
xlim=c(0,MAX), ylim=c(0,MAX),
xlab="original ES", ylab="corrected ES", main=UPHEADER)
grid()
abline(a = 0, b = 1,lwd=2,lty=2)
MAX=max(-log10(c(mup$pvalue.x,mup$pvalue.y)))
plot(-log10(mup$pvalue.x), -log10(mup$pvalue.y),
xlim=c(0,MAX), ylim=c(0,MAX),
xlab="original -log10 p-value", ylab="corrected -log10 p-value", main=UPHEADER)
grid()
abline(a = 0, b = 1,lwd=2,lty=2)
mdn <- merge(baddn,gooddn,by=0)
MAX=max(c(mdn$es.x,mdn$es.y))
plot(mdn$es.x, mdn$es.y,
xlim=c(0,MAX), ylim=c(0,MAX),
xlab="original ES", ylab="corrected ES", main=DNHEADER)
grid()
abline(a = 0, b = 1,lwd=2,lty=2)
MAX=max(-log10(c(mdn$pvalue.x,mdn$pvalue.y)))
plot(-log10(mdn$pvalue.x), -log10(mdn$pvalue.y),
xlim=c(0,MAX), ylim=c(0,MAX),
xlab="original -log10 p-value", ylab="corrected -log10 p-value", main=DNHEADER)
grid()
abline(a = 0, b = 1,lwd=2,lty=2)
}
compare_ora <- function(bad,good){
badup <- bad$orabad_up
baddn <- bad$orabad_dn
goodup <- good$oragood_up
gooddn <- good$oragood_dn
mup <- merge(badup,goodup,by=0)
mdn <- merge(baddn,gooddn,by=0)
bup <- subset(mup,p.adjust.x<0.05)$Row.names
gup <- subset(mup,p.adjust.y<0.05)$Row.names
bdn <- subset(mdn,p.adjust.x<0.05)$Row.names
gdn <- subset(mdn,p.adjust.y<0.05)$Row.names
sig_bad <- unique(c(bup,bdn))
sig_good <- unique(c(gup,gdn))
nsig_bad <- length(sig_bad)
nsig_good <- length(sig_good)
jac <- length(intersect(sig_bad,sig_good)) / length(union(sig_bad,sig_good))
result=c("nsig_bad"=nsig_bad,"nsig_good"=nsig_good,"Jaccard"=jac)
return(result)
}
params <- expand.grid(1:length(d),1:length(gs))
params2 <- expand.grid(names(d),names(gs))
colnames(params) <- c("d","gs")
ora_dat <- mclapply(1:nrow(params), function(i) {
j=params[i,1]
k=params[i,2]
bad <- orabad(d[[j]],gs[[k]])
good <- oragood(d[[j]],gs[[k]])
res <- compare_ora(bad,good)
return(res)
} , mc.cores=8)
ora_dat <- do.call(rbind,ora_dat)
res1 <- cbind(params2,ora_dat)
res1 %>%
kbl(row.names = TRUE, caption="all results") %>%
kable_paper("hover", full_width = F)
Var1 | Var2 | nsig_bad | nsig_good | Jaccard | |
---|---|---|---|---|---|
1 | bulkrna1.Rds | KEGG | 9 | 19 | 0.4736842 |
2 | bulkrna2.Rds | KEGG | 25 | 59 | 0.4237288 |
3 | bulkrna3.Rds | KEGG | 2 | 6 | 0.3333333 |
4 | bulkrna4.Rds | KEGG | 14 | 24 | 0.5833333 |
5 | bulkrna5.Rds | KEGG | 0 | 0 | NaN |
6 | bulkrna6.Rds | KEGG | 47 | 56 | 0.8392857 |
7 | bulkrna7.Rds | KEGG | 9 | 38 | 0.2368421 |
8 | bulkrna1.Rds | Reactome | 64 | 102 | 0.6274510 |
9 | bulkrna2.Rds | Reactome | 239 | 351 | 0.6809117 |
10 | bulkrna3.Rds | Reactome | 94 | 159 | 0.5617284 |
11 | bulkrna4.Rds | Reactome | 207 | 307 | 0.6742671 |
12 | bulkrna5.Rds | Reactome | 0 | 0 | NaN |
13 | bulkrna6.Rds | Reactome | 254 | 334 | 0.7604790 |
14 | bulkrna7.Rds | Reactome | 185 | 240 | 0.7708333 |
15 | bulkrna1.Rds | Wikipathways | 33 | 46 | 0.7173913 |
16 | bulkrna2.Rds | Wikipathways | 108 | 193 | 0.5595855 |
17 | bulkrna3.Rds | Wikipathways | 8 | 12 | 0.6666667 |
18 | bulkrna4.Rds | Wikipathways | 13 | 50 | 0.2600000 |
19 | bulkrna5.Rds | Wikipathways | 1 | 1 | 1.0000000 |
20 | bulkrna6.Rds | Wikipathways | 82 | 97 | 0.8453608 |
21 | bulkrna7.Rds | Wikipathways | 35 | 117 | 0.2991453 |
22 | bulkrna1.Rds | miR targets | 4 | 81 | 0.0493827 |
23 | bulkrna2.Rds | miR targets | 40 | 111 | 0.3603604 |
24 | bulkrna3.Rds | miR targets | 21 | 38 | 0.5526316 |
25 | bulkrna4.Rds | miR targets | 281 | 635 | 0.4425197 |
26 | bulkrna5.Rds | miR targets | 0 | 0 | NaN |
27 | bulkrna6.Rds | miR targets | 1166 | 1365 | 0.8393895 |
28 | bulkrna7.Rds | miR targets | 360 | 544 | 0.6617647 |
29 | bulkrna1.Rds | TFT GTRD | 52 | 85 | 0.6117647 |
30 | bulkrna2.Rds | TFT GTRD | 98 | 140 | 0.7000000 |
31 | bulkrna3.Rds | TFT GTRD | 108 | 135 | 0.8000000 |
32 | bulkrna4.Rds | TFT GTRD | 71 | 115 | 0.6173913 |
33 | bulkrna5.Rds | TFT GTRD | 1 | 1 | 1.0000000 |
34 | bulkrna6.Rds | TFT GTRD | 147 | 173 | 0.8497110 |
35 | bulkrna7.Rds | TFT GTRD | 129 | 153 | 0.8431373 |
36 | bulkrna1.Rds | GO | 485 | 707 | 0.6859972 |
37 | bulkrna2.Rds | GO | 1843 | 2168 | 0.8500923 |
38 | bulkrna3.Rds | GO | 686 | 915 | 0.7497268 |
39 | bulkrna4.Rds | GO | 390 | 709 | 0.5500705 |
40 | bulkrna5.Rds | GO | 6 | 48 | 0.1250000 |
41 | bulkrna6.Rds | GO | 588 | 789 | 0.7452471 |
42 | bulkrna7.Rds | GO | 1263 | 1544 | 0.8180052 |
43 | bulkrna1.Rds | HPO | 2 | 42 | 0.0476190 |
44 | bulkrna2.Rds | HPO | 380 | 988 | 0.3846154 |
45 | bulkrna3.Rds | HPO | 92 | 241 | 0.3817427 |
46 | bulkrna4.Rds | HPO | 24 | 283 | 0.0848057 |
47 | bulkrna5.Rds | HPO | 0 | 1 | 0.0000000 |
48 | bulkrna6.Rds | HPO | 17 | 265 | 0.0641509 |
49 | bulkrna7.Rds | HPO | 186 | 449 | 0.4142539 |
50 | bulkrna1.Rds | Cellmarkers | 139 | 170 | 0.8176471 |
51 | bulkrna2.Rds | Cellmarkers | 439 | 477 | 0.9203354 |
52 | bulkrna3.Rds | Cellmarkers | 345 | 382 | 0.9031414 |
53 | bulkrna4.Rds | Cellmarkers | 244 | 286 | 0.8531469 |
54 | bulkrna5.Rds | Cellmarkers | 0 | 1 | 0.0000000 |
55 | bulkrna6.Rds | Cellmarkers | 267 | 297 | 0.8989899 |
56 | bulkrna7.Rds | Cellmarkers | 448 | 491 | 0.9085366 |
57 | bulkrna1.Rds | Hallmark | 7 | 20 | 0.3500000 |
58 | bulkrna2.Rds | Hallmark | 25 | 38 | 0.6578947 |
59 | bulkrna3.Rds | Hallmark | 18 | 21 | 0.7727273 |
60 | bulkrna4.Rds | Hallmark | 13 | 27 | 0.4814815 |
61 | bulkrna5.Rds | Hallmark | 0 | 2 | 0.0000000 |
62 | bulkrna6.Rds | Hallmark | 28 | 37 | 0.7567568 |
63 | bulkrna7.Rds | Hallmark | 18 | 34 | 0.4444444 |
summary(res1$nsig_bad)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 11.0 52.0 188.3 241.5 1843.0
summary(res1$nsig_good)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 35.5 115.0 273.3 320.5 2168.0
summary(res1$Jaccard)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0000 0.3839 0.6427 0.5718 0.8044 1.0000 3
# NUMBER OF SIGNIFICANT SETS
ns1 <- matrix( res1$nsig_bad , ncol=9 )
rownames(ns1) <- names(d)
colnames(ns1) <- names(gs)
ns1 %>%
kbl(row.names = TRUE, caption="significant genesets in original analysis") %>%
kable_paper("hover", full_width = F)
KEGG | Reactome | Wikipathways | miR targets | TFT GTRD | GO | HPO | Cellmarkers | Hallmark | |
---|---|---|---|---|---|---|---|---|---|
bulkrna1.Rds | 9 | 64 | 33 | 4 | 52 | 485 | 2 | 139 | 7 |
bulkrna2.Rds | 25 | 239 | 108 | 40 | 98 | 1843 | 380 | 439 | 25 |
bulkrna3.Rds | 2 | 94 | 8 | 21 | 108 | 686 | 92 | 345 | 18 |
bulkrna4.Rds | 14 | 207 | 13 | 281 | 71 | 390 | 24 | 244 | 13 |
bulkrna5.Rds | 0 | 0 | 1 | 0 | 1 | 6 | 0 | 0 | 0 |
bulkrna6.Rds | 47 | 254 | 82 | 1166 | 147 | 588 | 17 | 267 | 28 |
bulkrna7.Rds | 9 | 185 | 35 | 360 | 129 | 1263 | 186 | 448 | 18 |
ns1 <- apply(ns1,2,function(x) { x <- x[!is.na(x)] ; x <- x[is.finite(x)] ; mean(x) } )
ns2 <- matrix( res1$nsig_good , ncol=9 )
rownames(ns2) <- names(d)
colnames(ns2) <- names(gs)
ns2 %>%
kbl(row.names = TRUE, caption="significant genesets in corrected analysis") %>%
kable_paper("hover", full_width = F)
KEGG | Reactome | Wikipathways | miR targets | TFT GTRD | GO | HPO | Cellmarkers | Hallmark | |
---|---|---|---|---|---|---|---|---|---|
bulkrna1.Rds | 19 | 102 | 46 | 81 | 85 | 707 | 42 | 170 | 20 |
bulkrna2.Rds | 59 | 351 | 193 | 111 | 140 | 2168 | 988 | 477 | 38 |
bulkrna3.Rds | 6 | 159 | 12 | 38 | 135 | 915 | 241 | 382 | 21 |
bulkrna4.Rds | 24 | 307 | 50 | 635 | 115 | 709 | 283 | 286 | 27 |
bulkrna5.Rds | 0 | 0 | 1 | 0 | 1 | 48 | 1 | 1 | 2 |
bulkrna6.Rds | 56 | 334 | 97 | 1365 | 173 | 789 | 265 | 297 | 37 |
bulkrna7.Rds | 38 | 240 | 117 | 544 | 153 | 1544 | 449 | 491 | 34 |
ns2 <- apply(ns2,2,function(x) { x <- x[!is.na(x)] ; x <- x[is.finite(x)] ; mean(x) } )
ns2
## KEGG Reactome Wikipathways miR targets TFT GTRD GO
## 28.85714 213.28571 73.71429 396.28571 114.57143 982.85714
## HPO Cellmarkers Hallmark
## 324.14286 300.57143 25.57143
df <- data.frame(ns1,ns2)
df <- df[order(df$ns1),]
par(mar=c(c(5.1, 7.1, 4.1, 2.1) ))
barplot(t(df),beside=TRUE,horiz=TRUE,las=1,xlim=c(0,1250),
xlab="no. significant sets",
col=c("gray30","gray80"))
text(df$ns1+50,((1:nrow(df))*3)-1.5,labels=signif(df[,1],3))
text(df$ns2+50,((1:nrow(df))*3)-0.5,labels=signif(df[,2],3))
#abline(v=seq(100,1000,100),lty=2,lwd=0.5,col="gray")
legend("bottomright", inset=.02, legend=c("corrected","original"),
fill=c("gray80","gray30"), horiz=FALSE, cex=1.1)
mylabels <- gsub("$","%",gsub("^","+",signif(((df$ns2/df$ns1)-1)*100,3)))
text(df$ns2+160,((1:nrow(df))*3)-1,labels=mylabels,cex=1.1)
png("fig1_ORAsig.png")
par(mar=c(c(5.1, 7.1, 2.1, 2.1) ))
barplot(t(df),beside=TRUE,horiz=TRUE,las=1,xlim=c(0,1250),
xlab="no. significant sets",
col=c("gray30","gray80"))
text(df$ns1+50,((1:nrow(df))*3)-1.5,labels=signif(df[,1],3))
text(df$ns2+50,((1:nrow(df))*3)-0.5,labels=signif(df[,2],3))
#abline(v=seq(100,1000,100),lty=2,lwd=0.5,col="gray")
legend("bottomright", inset=.02, legend=c("corrected","original"),
fill=c("gray80","gray30"), horiz=FALSE, cex=1.1)
mylabels <- gsub("$","%",gsub("^","+",signif(((df$ns2/df$ns1)-1)*100,3)))
text(df$ns2+160,((1:nrow(df))*3)-1,labels=mylabels,cex=1.1)
dev.off()
## png
## 2
# JACCARD
jac <- matrix( res1$Jaccard , ncol=9 )
rownames(jac) <- names(d)
colnames(jac) <- names(gs)
signif(jac,3) %>%
kbl(row.names = TRUE, caption="Jaccard similarity Corrected:Original") %>%
kable_paper("hover", full_width = F)
KEGG | Reactome | Wikipathways | miR targets | TFT GTRD | GO | HPO | Cellmarkers | Hallmark | |
---|---|---|---|---|---|---|---|---|---|
bulkrna1.Rds | 0.474 | 0.627 | 0.717 | 0.0494 | 0.612 | 0.686 | 0.0476 | 0.818 | 0.350 |
bulkrna2.Rds | 0.424 | 0.681 | 0.560 | 0.3600 | 0.700 | 0.850 | 0.3850 | 0.920 | 0.658 |
bulkrna3.Rds | 0.333 | 0.562 | 0.667 | 0.5530 | 0.800 | 0.750 | 0.3820 | 0.903 | 0.773 |
bulkrna4.Rds | 0.583 | 0.674 | 0.260 | 0.4430 | 0.617 | 0.550 | 0.0848 | 0.853 | 0.481 |
bulkrna5.Rds | NaN | NaN | 1.000 | NaN | 1.000 | 0.125 | 0.0000 | 0.000 | 0.000 |
bulkrna6.Rds | 0.839 | 0.760 | 0.845 | 0.8390 | 0.850 | 0.745 | 0.0642 | 0.899 | 0.757 |
bulkrna7.Rds | 0.237 | 0.771 | 0.299 | 0.6620 | 0.843 | 0.818 | 0.4140 | 0.909 | 0.444 |
jac2 <- apply(jac,2,function(x) { x <- x[!is.na(x)] ; x <- x[is.finite(x)] ; mean(x) } )
jac2
## KEGG Reactome Wikipathways miR targets TFT GTRD GO
## 0.4817013 0.6792784 0.6211642 0.4843414 0.7745720 0.6463056
## HPO Cellmarkers Hallmark
## 0.1967411 0.7573996 0.4947578
par(mar=c(c(5.1, 7.1, 2.1, 2.1) ))
barplot(sort(jac2),horiz=TRUE,las=1,xlim=c(0,1), xlab="Jaccard index")
par(mar=c(c(5.1, 5.1, 2.1, 2.1) ))
plot(cov,jac2,xlim=c(0,30000),ylim=c(0,1),pch=19,col="lightgray",cex=3,
xlab="gene coverage",ylab="Jaccard index")
text(cov,jac2,labels=names(cov))
mylm <- lm(jac2~cov)
abline(mylm,lty=2,lwd=2)
SLOPE=unname(signif(mylm$coefficients[2],3))
INT=unname(signif(mylm$coefficients[1],3))
PVAL=signif(summary(mylm)$coefficients[4],3)
RSQ=signif(summary(mylm)$r.squared,3)
HEADER=paste("Rsq =",RSQ,"; p-val =",PVAL)
mtext(HEADER)
png("fig1_ORAjac.png")
par(mar=c(c(5.1, 5.1, 2.1, 2.1) ))
plot(cov,jac2,xlim=c(0,30000),ylim=c(0,1),pch=19,col="lightgray",cex=3,
xlab="gene coverage",ylab="Jaccard index")
text(cov,jac2,labels=names(cov))
mylm <- lm(jac2~cov)
abline(mylm,lty=2,lwd=2)
SLOPE=unname(signif(mylm$coefficients[2],3))
INT=unname(signif(mylm$coefficients[1],3))
PVAL=signif(summary(mylm)$coefficients[4],3)
RSQ=signif(summary(mylm)$r.squared,3)
HEADER=paste("Rsq =",RSQ,"; p-val =",PVAL)
mtext(HEADER)
dev.off()
## png
## 2
For bulk RNA-seq data set #1, run enrichment with all the gene set libraries.
names(gs)
## [1] "KEGG" "Reactome" "Wikipathways" "miR targets" "TFT GTRD"
## [6] "GO" "HPO" "Cellmarkers" "Hallmark"
lapply(1:length(gs), function(i) {
bad <- orabad(d[[1]],gs[[i]])
good <- oragood(d[[1]],gs[[i]])
compare_ora_plots(bad,good,libname=names(gs)[i])
} )
## [[1]]
## NULL
##
## [[2]]
## NULL
##
## [[3]]
## NULL
##
## [[4]]
## NULL
##
## [[5]]
## NULL
##
## [[6]]
## NULL
##
## [[7]]
## NULL
##
## [[8]]
## NULL
##
## [[9]]
## NULL
For reproducibility
sessionInfo()
## R version 4.4.0 (2024-04-24)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 22.04.4 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so; LAPACK version 3.10.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Etc/UTC
## tzcode source: system (glibc)
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] gplots_3.1.3.1 fgsea_1.30.0
## [3] biomaRt_2.60.0 eulerr_7.0.2
## [5] kableExtra_1.4.0 mitch_1.16.0
## [7] clusterProfiler_4.12.0 DESeq2_1.44.0
## [9] SummarizedExperiment_1.34.0 Biobase_2.64.0
## [11] MatrixGenerics_1.16.0 matrixStats_1.3.0
## [13] GenomicRanges_1.56.0 GenomeInfoDb_1.40.1
## [15] IRanges_2.38.0 S4Vectors_0.42.0
## [17] BiocGenerics_0.50.0 getDEE2_1.14.0
##
## loaded via a namespace (and not attached):
## [1] RColorBrewer_1.1-3 rstudioapi_0.16.0 jsonlite_1.8.8
## [4] magrittr_2.0.3 farver_2.1.2 rmarkdown_2.26
## [7] fs_1.6.4 zlibbioc_1.50.0 vctrs_0.6.5
## [10] memoise_2.0.1 ggtree_3.12.0 progress_1.2.3
## [13] htmltools_0.5.8.1 S4Arrays_1.4.1 curl_5.2.1
## [16] SparseArray_1.4.8 gridGraphics_0.5-1 sass_0.4.9
## [19] KernSmooth_2.23-22 bslib_0.7.0 htmlwidgets_1.6.4
## [22] httr2_1.0.1 echarts4r_0.4.5 plyr_1.8.9
## [25] cachem_1.0.8 igraph_2.0.3 mime_0.12
## [28] lifecycle_1.0.4 pkgconfig_2.0.3 Matrix_1.7-0
## [31] R6_2.5.1 fastmap_1.1.1 gson_0.1.0
## [34] shiny_1.8.1.1 GenomeInfoDbData_1.2.12 digest_0.6.35
## [37] aplot_0.2.2 enrichplot_1.24.0 GGally_2.2.1
## [40] colorspace_2.1-0 patchwork_1.2.0 AnnotationDbi_1.66.0
## [43] RSQLite_2.3.7 filelock_1.0.3 fansi_1.0.6
## [46] httr_1.4.7 polyclip_1.10-6 abind_1.4-5
## [49] compiler_4.4.0 bit64_4.0.5 withr_3.0.0
## [52] BiocParallel_1.38.0 viridis_0.6.5 DBI_1.2.3
## [55] ggstats_0.6.0 highr_0.10 ggforce_0.4.2
## [58] MASS_7.3-60.2 rappdirs_0.3.3 DelayedArray_0.30.1
## [61] HDO.db_0.99.1 caTools_1.18.2 gtools_3.9.5
## [64] tools_4.4.0 beeswarm_0.4.0 ape_5.8
## [67] scatterpie_0.2.3 httpuv_1.6.15 glue_1.7.0
## [70] promises_1.3.0 nlme_3.1-164 GOSemSim_2.30.0
## [73] grid_4.4.0 shadowtext_0.1.3 reshape2_1.4.4
## [76] generics_0.1.3 gtable_0.3.5 tidyr_1.3.1
## [79] hms_1.1.3 data.table_1.15.4 xml2_1.3.6
## [82] tidygraph_1.3.1 utf8_1.2.4 XVector_0.44.0
## [85] ggrepel_0.9.5 pillar_1.9.0 stringr_1.5.1
## [88] yulab.utils_0.1.4 later_1.3.2 splines_4.4.0
## [91] dplyr_1.1.4 tweenr_2.0.3 BiocFileCache_2.12.0
## [94] treeio_1.28.0 lattice_0.22-6 bit_4.0.5
## [97] tidyselect_1.2.1 GO.db_3.19.1 locfit_1.5-9.9
## [100] Biostrings_2.72.1 knitr_1.46 gridExtra_2.3
## [103] svglite_2.1.3 xfun_0.43 graphlayouts_1.1.1
## [106] stringi_1.8.4 UCSC.utils_1.0.0 lazyeval_0.2.2
## [109] ggfun_0.1.5 yaml_2.3.8 evaluate_0.23
## [112] codetools_0.2-20 ggraph_2.2.1 tibble_3.2.1
## [115] qvalue_2.36.0 ggplotify_0.1.2 cli_3.6.2
## [118] systemfonts_1.0.6 xtable_1.8-4 munsell_0.5.1
## [121] jquerylib_0.1.4 Rcpp_1.0.12 dbplyr_2.5.0
## [124] png_0.1-8 ggplot2_3.5.1 blob_1.2.4
## [127] prettyunits_1.2.0 DOSE_3.30.1 htm2txt_2.2.2
## [130] bitops_1.0-7 viridisLite_0.4.2 tidytree_0.4.6
## [133] scales_1.3.0 purrr_1.0.2 crayon_1.5.2
## [136] rlang_1.1.3 cowplot_1.1.3 fastmatch_1.1-4
## [139] KEGGREST_1.44.0