Source: https://github.com/markziemann/10mistakes

Introduction

#knitr::opts_chunk$set(dev = 'svg') # set output device to svg

suppressPackageStartupMessages({
  library("getDEE2")
  library("DESeq2")
  library("kableExtra")
  library("clusterProfiler")
  library("eulerr")
  library("gplots")
  library("mitch")
  library("fgsea")
  library("RhpcBLASctl")
  library("parallel")
})

RhpcBLASctl::blas_set_num_threads(1)

For this guide I will be using bulk RNA-seq data from a previous study, which is deposited at NCBI GEO and SRA under accession numbers: GSE55123 and SRP038101 (Lund et al, 2014). The experiment is designed to investigate the effect of Azacitidine treatment on AML3 cells.

The raw data have been processed by the DEE2 project, and the summary gene expression counts are available at the dee2.io website, and programmatically with the getDEE2 bioconductor package (Ziemann et al, 2019).

The gene counts have also been deposited to the /data folder in the example.Rdata file in case the DEE2 resource becomes unavailable. To import it, use the command: load("data/example.Rdata")

Alternatively, you could fetch data from another resource like NCBI GEO, Zenodo or from the host storage drive.

mdat <- getDEE2Metadata("hsapiens")

# get sample sheet
ss <- subset(mdat,SRP_accession=="SRP038101")

# fetch the whole set of RNA-seq data
x <- getDEE2("hsapiens",ss$SRR_accession , metadata=mdat, legacy=TRUE)
## For more information about DEE2 QC metrics, visit
##     https://github.com/markziemann/dee2/blob/master/qc/qc_metrics.md
mx <- x$GeneCounts
rownames(mx) <- paste(rownames(mx),x$GeneInfo$GeneSymbol)
dim(mx)
## [1] 58302     6
# aza no filtering
ss$trt <- grepl("Treated",ss$Experiment_title)

ss %>%
  kbl(caption="sample sheet for Aza treatment in AML3 cells") %>%
  kable_paper("hover", full_width = F)
sample sheet for Aza treatment in AML3 cells
SRR_accession QC_summary SRX_accession SRS_accession SRP_accession Experiment_title GEO_series trt
163028 SRR1171523 PASS SRX472607 SRS559064 SRP038101 GSM1329859: Untreated.1; Homo sapiens; RNA-Seq GSE55123 FALSE
163029 SRR1171524 WARN(3,4) SRX472608 SRS559066 SRP038101 GSM1329860: Untreated.2; Homo sapiens; RNA-Seq GSE55123 FALSE
163030 SRR1171525 WARN(3,4) SRX472609 SRS559065 SRP038101 GSM1329861: Untreated.3; Homo sapiens; RNA-Seq GSE55123 FALSE
163031 SRR1171526 WARN(3,4) SRX472610 SRS559068 SRP038101 GSM1329862: Treated.1; Homo sapiens; RNA-Seq GSE55123 TRUE
163032 SRR1171527 WARN(3,4) SRX472611 SRS559067 SRP038101 GSM1329863: Treated.2; Homo sapiens; RNA-Seq GSE55123 TRUE
163033 SRR1171528 WARN(3,4) SRX472612 SRS559069 SRP038101 GSM1329864: Treated.3; Homo sapiens; RNA-Seq GSE55123 TRUE

Data quality control

QC is important, even if you are using public transcriptome data. For RNA-seq it is a good idea to show the number of reads for each sample.

par(mar=c(5,7,5,1))
barplot(rev(colSums(mx)),horiz=TRUE,las=1,main="number of reads per sample in SRP038101")

Now make a MDS plot.

mds <- cmdscale(dist(t(mx)))

# expand plot area
XMIN=min(mds[,1])*1.3
XMAX=max(mds[,1])*1.3
YMIN=min(mds[,2])*1.3
YMAX=max(mds[,2])*1.3

cols <- as.character(grepl("Treated",ss$Experiment_title))
cols <- gsub("FALSE","lightblue",cols)
cols <- gsub("TRUE","pink",cols)
plot(mds, xlab="Coordinate 1", ylab="Coordinate 2",
  xlim=c(XMIN,XMAX),ylim=c(YMIN,YMAX),
  pch=19,cex=2,col=cols, main="MDS plot")
text(cmdscale(dist(t(mx))), labels=colnames(mx) )

Differential expression analysis

I will be using DESeq2 for DE analysis, the count matrix is prefiltered using a detection threshold of 10 reads per sample across all samples. All genes that meet the detection threshold will comprise the background list. The first 6 rows of the count matrix are shows.

mxf <- mx[which(rowMeans(mx)>=10),]
dim(mxf)
## [1] 13168     6
head(mxf,6) %>%
  kbl(caption="Count matrix format") %>%
  kable_paper("hover", full_width = F)
Count matrix format
SRR1171523 SRR1171524 SRR1171525 SRR1171526 SRR1171527 SRR1171528
ENSG00000225630 MTND2P28 494 396 340 333 415 418
ENSG00000237973 MTCO1P12 52 39 40 30 37 29
ENSG00000248527 MTATP6P1 853 544 537 582 702 716
ENSG00000228327 AL669831.1 17 13 21 21 22 12
ENSG00000228794 LINC01128 42 27 30 32 40 23
ENSG00000230699 AL645608.3 20 11 13 10 15 22
dds <- DESeqDataSetFromMatrix(countData = mxf , colData = ss, design = ~ trt )
res <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
z <- results(res)
vsd <- vst(dds, blind=FALSE)
zz <-cbind(as.data.frame(z),mxf)
def <-as.data.frame(zz[order(zz$pvalue),])

head(def,10) %>%
  kbl(caption="Top DE genes for Aza treatment") %>%
  kable_paper("hover", full_width = F)
Top DE genes for Aza treatment
baseMean log2FoldChange lfcSE stat pvalue padj SRR1171523 SRR1171524 SRR1171525 SRR1171526 SRR1171527 SRR1171528
ENSG00000165949 IFI27 1960.1970 -3.384492 0.0938869 -36.04861 0 0 4689 3583 2758 309 384 334
ENSG00000090382 LYZ 7596.0299 -1.650342 0.0561143 -29.41036 0 0 14212 10237 10789 3476 3764 3738
ENSG00000115461 IGFBP5 531.2217 -5.071157 0.1795239 -28.24781 0 0 1320 823 1025 23 26 43
ENSG00000157601 MX1 827.1511 -2.877795 0.1047823 -27.46450 0 0 1732 1501 1206 184 223 186
ENSG00000111331 OAS3 2127.2010 -2.661214 0.0972124 -27.37525 0 0 4204 3977 2972 562 614 560
ENSG00000070915 SLC12A3 424.5509 -3.374852 0.1298671 -25.98697 0 0 1012 721 653 63 85 76
ENSG00000234745 HLA-B 3197.0159 -1.431566 0.0604169 -23.69479 0 0 6085 4256 4023 1590 1872 1719
ENSG00000137965 IFI44 409.0957 -2.978581 0.1319352 -22.57608 0 0 829 740 635 76 111 89
ENSG00000204525 HLA-C 1631.6421 -1.461550 0.0660214 -22.13750 0 0 3112 2150 2106 791 923 891
ENSG00000110042 DTX4 524.1318 -2.470219 0.1173182 -21.05572 0 0 1166 883 688 166 168 145

Make a smear plot.

sigf <- subset(def,padj<=0.05)

DET=nrow(mxf)
NSIG=nrow(sigf)
NUP=nrow(subset(sigf,log2FoldChange>0))
NDN=nrow(subset(sigf,log2FoldChange<0))

HEADER=paste(DET,"detected genes;",NSIG,"w/FDR<0.05;",NUP,"up;",NDN,"down")

plot(log10(def$baseMean) ,def$log2FoldChange,
  cex=0.6,pch=19,col="darkgray",
  main="5-azacitidine treatment in AML3 cells",
  xlab="log10(basemean)",ylab="log2(fold change)")

points(log10(sigf$baseMean) ,sigf$log2FoldChange,
  cex=0.6,pch=19,col="red")

mtext(HEADER)

In the next sections I will run enrichment analysis with over-representation test and compare it to functional class scoring.

Get update gene symbols

Let’s see if many of the gene symbols have changed.

Comparing Ensembl release 90 to GENCODE Release 49 (GRCh38.p14) which corresponds to Ensembl 115 (Sept 2025).

So v90 had 58k genes and v115 has 86k which is a big increase. The v90 has 1609 genes that have been removed an v115 has 29676 that have been added. There are 56693 common.

From these common ones, 20456 have different gene names. I think the approach to naming these putative genes has changed a lot. Lots of CXorfXX genes have changed, not just AL358472.5 type names.

gt <- read.table("https://dee2.io/data/hsapiens/hsa_gene_info.tsv",header=TRUE)
head(gt)
##            GeneID  GeneSymbol mean median longest_isoform merged
## 1 ENSG00000223972     DDX11L1 1144   1144            1657   1735
## 2 ENSG00000227232      WASH7P 1351   1351            1351   1351
## 3 ENSG00000278267   MIR6859-1   68     68              68     68
## 4 ENSG00000243485 MIR1302-2HG  623    623             712   1021
## 5 ENSG00000284332   MIR1302-2  138    138             138    138
## 6 ENSG00000237613     FAM138A  888    888            1187   1219
gtnew <- read.table("ref/gencode.v49.genenames.tsv")

v1 <- list("Ens90"=gt$GeneID, "Ens115"=gtnew$V1)
plot(euler(v1),quantities = list(cex = 2), labels = list(cex = 2))

length(gt$GeneID)
## [1] 58302
length(gtnew$V1)
## [1] 86369
length(intersect(gt$GeneID,gtnew$V1))
## [1] 56693
length(setdiff(gt$GeneID,gtnew$V1))
## [1] 1609
length(setdiff(gtnew$V1,gt$GeneID))
## [1] 29676
gtm <- merge(gt,gtnew,by.x="GeneID",by.y="V1")
nrow(gtm)
## [1] 56693
table(gtm$GeneSymbol == gtm$V2)
## 
## FALSE  TRUE 
## 20456 36237
head(gtm[which(gtm$GeneSymbol != gtm$V2),],10)
##              GeneID GeneSymbol mean median longest_isoform merged       V2
## 5   ENSG00000000460   C1orf112 2430   2661            4355   5967    FIRRM
## 88  ENSG00000005238    FAM214B 2761   3016            5771   6409    ATOSB
## 115 ENSG00000006015   C19orf60 1256    960            2465   3370   REX1BD
## 171 ENSG00000007202   KIAA0100 1954    579            7407   9235    BLTP2
## 307 ENSG00000011638    TMEM159 1163   1378            2014   2558    LDAF1
## 349 ENSG00000014257       ACPP 1054    581            3125   4193     ACP3
## 381 ENSG00000018607     ZNF806 1774   1774            1774   1774 ZNF285CP
## 382 ENSG00000018610    CXorf56 1345   1089            2372   2580   STEEP1
## 415 ENSG00000022277     RTFDC1 1191    852            2212   4063     RTF2
## 474 ENSG00000029153     ARNTL2 2289   1888            6785   6983    BMAL2
tail(gtm[which(gtm$GeneSymbol != gtm$V2),],10)
##                GeneID  GeneSymbol mean median longest_isoform merged
## 56684 ENSG00000284738  AL358472.5 1260   1260            1267   1267
## 56685 ENSG00000284739  BX005132.2  464    464             464    464
## 56686 ENSG00000284740  AL645728.2  443    443             660    660
## 56687 ENSG00000284742  AL163152.1  374    374             374    374
## 56688 ENSG00000284743  AL139254.3  674    674             674    674
## 56689 ENSG00000284744  AL591163.1  987    987             987    987
## 56690 ENSG00000284745  AL589702.1 1325   1325            1509   1654
## 56691 ENSG00000284746 AC068587.10  219    219             219    219
## 56692 ENSG00000284747  AL034417.4 2168   2168            2236   4116
## 56693 ENSG00000284748  AL513220.1 1326   1326            1326   1326
##                    V2
## 56684 ENSG00000284738
## 56685 ENSG00000284739
## 56686 ENSG00000284740
## 56687         OR11P1P
## 56688 ENSG00000284743
## 56689 ENSG00000284744
## 56690 ENSG00000284745
## 56691 ENSG00000284746
## 56692 ENSG00000284747
## 56693 ENSG00000284748

Get gene sets

# from https://reactome.org/download/current/ReactomePathways.gmt.zip
genesets2 <- read.gmt("ref/ReactomePathways_2025-09-02.gmt")
gsets <- gmtPathways("ref/ReactomePathways_2025-09-02.gmt")

Mistake 1 - Raw p-values

Here we are discarding the old gene symbols in favour of the new ones.

up <- rownames(subset(def, log2FoldChange>0 & padj<0.05))
up <- sapply(strsplit(up," "),'[[',1)
up <- unique(gtnew[match(up,gtnew$V1),2])
up <- up[which(!is.na(up))]

dn <- rownames(subset(def, log2FoldChange<0 & padj<0.05))
dn <- sapply(strsplit(dn," "),'[[',1)
dn <- unique(gtnew[match(dn,gtnew$V1),2])
dn <- dn[which(!is.na(dn))]

bg <- rownames(def)
bg <- sapply(strsplit(bg," "),'[[',1)
bg <- unique(gtnew[match(bg,gtnew$V1),2])
bg <- bg[which(!is.na(bg))]

lapply(list(up,dn,bg),length)
## [[1]]
## [1] 1667
## 
## [[2]]
## [1] 1923
## 
## [[3]]
## [1] 13063
fres_up <- fora(pathways=gsets, genes=up, universe=bg, minSize = 5, maxSize = Inf)
nrow(fres_up)
## [1] 1840
nrow(subset(fres_up,padj<0.05))
## [1] 224
nrow(subset(fres_up,pval<0.05))
## [1] 351
fres_up_fdr <- subset(fres_up,padj<0.05)$pathway
fres_up_nom <- subset(fres_up,pval<0.05)$pathway

fres_dn <- fora(pathways=gsets, genes=dn, universe=bg, minSize = 5, maxSize = Inf)
nrow(subset(fres_dn,padj<0.05))
## [1] 128
nrow(subset(fres_dn,pval<0.05))
## [1] 352
fres_dn_fdr <- subset(fres_dn,padj<0.05)$pathway
fres_dn_nom <- subset(fres_dn,pval<0.05)$pathway

v1 <- list("up p"=fres_up_nom, "up FDR"=fres_up_fdr,
  "dn p"=fres_dn_nom, "dn FDR"=fres_dn_fdr)

par(mar=c(1,1,1,1))
par(mfrow=c(1,1))
plot(euler(v1),quantities = list(cex = 2), labels = list(cex = 2))

fres_fdr <- union(fres_dn_fdr,fres_up_fdr)
fres_nom <- union(fres_dn_nom,fres_up_nom)

v1 <- list("p"=fres_nom, "FDR"=fres_fdr)
plot(euler(v1),quantities = list(cex = 2), labels = list(cex = 2))

unlink("img/1a.png")
png("img/1a.png", width = 480, height = 480)
plot(euler(v1),quantities = list(cex = 2), labels = list(cex = 2))
dev.off()
## X11cairo 
##        2
par(mfrow=c(1,1))

par(mar=c(5.1,4.1,4.1,2.1))

Demonstrate with simulation. Select 1000 genes from the background list and run.

seeds <- seq(from=100,to=100*100,by=100)

m1sim <- mclapply(seeds, function(seed) {
  set.seed(seed)
  rand <- sample(x=bg,size=1000,replace=FALSE)
  fres_rand <- fora(pathways=gsets, genes=rand, universe=bg, minSize = 5, maxSize = Inf)
  nom <- nrow(subset(fres_rand,pval<0.05))
  fdr <- nrow(subset(fres_rand,padj<0.05))
  c(nom,fdr)
},mc.cores=8)

m1sim <- do.call(rbind,m1sim)
colnames(m1sim) <- c("p","FDR")

boxplot(m1sim,main="Random gene sets",ylab="no. gene sets meeting \"significance\" threshold <0.05")
beeswarm(list(mean(m1sim[,1]), mean(m1sim[,2])),add=TRUE,pch=4,cex=2)
lab1 <- paste("Mean=",signif(mean(m1sim[,1]) , 3),sep="")
lab2 <- paste("Mean=",signif(mean(m1sim[,2]) , 2),sep="")
text(c(1.2,2.2), c( mean(m1sim[,1])+5, mean(m1sim[,2])+5 ), labels=c(lab1,lab2) )

unlink("img/1b.png")
png("img/1b.png", width = 480, height = 480)
boxplot(m1sim,main="Random gene sets",ylab="no. gene sets meeting \"significance\" threshold <0.05")
beeswarm(list(mean(m1sim[,1]), mean(m1sim[,2])),add=TRUE,pch=4,cex=2)
lab1 <- paste("Mean=",signif(mean(m1sim[,1]) , 3),sep="")
lab2 <- paste("Mean=",signif(mean(m1sim[,2]) , 2),sep="")
text(c(1.2,2.2), c( mean(m1sim[,1])+5, mean(m1sim[,2])+5 ), labels=c(lab1,lab2) )
dev.off()
## X11cairo 
##        2
par(mfrow=c(1,1))

par(mar=c(5.1,4.1,4.1,2.1))

Mistake 2 - Background

Will try FORA and mitch.

Reporting criteria Method/resource used
Origin of gene sets Reactome (2023-03-06)
Tool used FORA (check fgsea version at foot of report)
Statistical test used hypergeometric
P-value correction for multiple comparisons FDR method
Background list Genes with >=10 reads per sample on average across all samples
Gene Selection DESeq2 FDR<0.05 and log2FC>0 or log2FC<0
Data availability via DEE2 at accession SRP038101 (human)
Other parameters Min gene set size of 5
# up dn and bg are defines above
all <- unique(gtm$V2)

lapply(list(up,dn,bg,all),length)
## [[1]]
## [1] 1667
## 
## [[2]]
## [1] 1923
## 
## [[3]]
## [1] 13063
## 
## [[4]]
## [1] 55503
fres_up_bg <- fora(pathways=gsets, genes=up, universe=bg, minSize = 5, maxSize = Inf)
fres_up_bg <- subset(fres_up_bg,padj<0.05)$pathway
length(fres_up_bg)
## [1] 224
fres_up_all <- fora(pathways=gsets, genes=up, universe=all, minSize = 5, maxSize = Inf)
fres_up_all <- subset(fres_up_all,padj<0.05)$pathway
length(fres_up_all)
## [1] 744
fres_dn_bg <- fora(pathways=gsets, genes=dn, universe=bg, minSize = 5, maxSize = Inf)
fres_dn_bg <- subset(fres_dn_bg,padj<0.05)$pathway
length(fres_dn_bg)
## [1] 128
fres_dn_all <- fora(pathways=gsets, genes=dn, universe=all, minSize = 5, maxSize = Inf)
fres_dn_all <- subset(fres_dn_all,padj<0.05)$pathway
length(fres_dn_all)
## [1] 812
v1 <- list("up"=fres_up_bg, "up*"=fres_up_all, "dn"=fres_dn_bg, "dn*"=fres_dn_all)

par(mar=c(1,1,1,1))
par(mfrow=c(1,1))
plot(euler(v1),quantities = list(cex = 2), labels = list(cex = 2))

v2 <- list("bg"=union(fres_up_bg,fres_dn_bg),
  "bg*"=union(fres_up_all, fres_dn_all) )

plot(euler(v2),quantities = list(cex = 2), labels = list(cex = 2))

par(mar=c(1,1,1,1))
unlink("img/2a.png")
png("img/2a.png", width = 480, height = 480)
plot(euler(v2),quantities = list(cex = 2), labels = list(cex = 2))
dev.off()
## X11cairo 
##        2
par(mfrow=c(1,1))

par(mar=c(5.1,4.1,4.1,2.1))

Mistake 2 simulation. select genes randomly from BG list and compare to “all”.

seeds <- seq(from=100,to=100*100,by=100)

m2sim <- mclapply(seeds, function(seed) {
  set.seed(seed)
  rand <- sample(x=bg,size=1000,replace=FALSE)
  sim_all <- fora(pathways=gsets, genes=rand, universe=all, minSize = 5, maxSize = Inf)
  sim_bg <- fora(pathways=gsets, genes=rand, universe=bg, minSize = 5, maxSize = Inf)
  sig_all <- nrow(subset(sim_all,padj<0.05))
  sig_bg <- nrow(subset(sim_bg,padj<0.05))
  c(sig_all,sig_bg)
},mc.cores=8)

m2sim <- do.call(rbind,m2sim)
colnames(m2sim) <- c("Whole genome background","Correct background")

par(mar=c(5.1,4.1,4.1,2.1))

boxplot(m2sim,main="Random gene sets drawn from BG",ylab="no. gene sets FDR<0.05")
mtext("100 simulations")
beeswarm(list(mean(m2sim[,1]), mean(m2sim[,2])),add=TRUE,pch=4,cex=2)
lab1 <- paste("Mean=",signif(mean(m2sim[,1]) , 3),sep="")
lab2 <- paste("Mean=",signif(mean(m2sim[,2]) , 2),sep="")
text(c(1.2,2.2), c( mean(m2sim[,1])+15, mean(m2sim[,2])+15 ), labels=c(lab1,lab2) )

par(mar=c(5.1,4.1,4.1,2.1))
unlink("img/2b.png")
png("img/2b.png", width = 480, height = 480)
boxplot(m2sim,main="Random gene sets drawn from BG",ylab="no. gene sets FDR<0.05")
mtext("100 simulations")
beeswarm(list(mean(m2sim[,1]), mean(m2sim[,2])),add=TRUE,pch=4,cex=2)
lab1 <- paste("Mean=",signif(mean(m2sim[,1]) , 3),sep="")
lab2 <- paste("Mean=",signif(mean(m2sim[,2]) , 2),sep="")
text(c(1.2,2.2), c( mean(m2sim[,1])+15, mean(m2sim[,2])+15 ), labels=c(lab1,lab2) )
dev.off()
## X11cairo 
##        2
par(mfrow=c(1,1))

par(mar=c(5.1,4.1,4.1,2.1))

In the simulation, which gene sets falsely come up most?

There’s 37 pathways that always come up.

m2simpw <- mclapply(seeds, function(seed) {
  set.seed(seed)
  rand <- sample(x=bg,size=1000,replace=FALSE)
  sim_all <- fora(pathways=gsets, genes=rand, universe=all, minSize = 5, maxSize = Inf)
  sig_all <- subset(sim_all,padj<0.05)$pathway
  return(sig_all)
},mc.cores=8)

m2tbl <- table(unlist((m2simpw)))
m2pw <- table(m2tbl)

plot(table(m2tbl),type="b",
  xlab="no. appearances as false positive, from 100 simulations",
  ylab="no. gene sets",
  main="1000 random genes drawn from expressed genes"
)
set.seed(42) ; m2selected <- sample(names(which(m2tbl==100)),20)
text(100,seq(from=40,to=120,by=(120-40)/20),labels=m2selected,cex=0.8,pos=2)

par(mar=c(5.1,4.1,4.1,2.1))
unlink("img/2c.png")
png("img/2c.png", width = 480, height = 480)
plot(table(m2tbl),type="b",
  xlab="no. appearances as false positive, from 100 simulations",
  ylab="no. gene sets",
  main="1000 random genes drawn from expressed genes"
)
set.seed(42) ; m2selected <- sample(names(which(m2tbl==100)),20)
text(100,seq(from=40,to=120,by=(120-40)/20),labels=m2selected,cex=0.8,pos=2)
dev.off()
## X11cairo 
##        2
par(mfrow=c(1,1))
par(mar=c(5.1,4.1,4.1,2.1))

Mistake 4 - prioritisation by p-values

Reporting criteria Method/resource used
Origin of gene sets Reactome (2023-03-06)
Tool used mitch (check version at foot of report)
Statistical test used Rank ANOVA (Kruskal Wallis)
P-value correction for multiple comparisons FDR method
Background list Genes with >=10 reads per sample on average across all samples
Gene Scoring DESeq2 Wald stat
Data availability via DEE2 at accession SRP038101 (human)
Other parameters Min gene set size of 5
def2 <- def
rownames(def2) <- sapply(strsplit(rownames(def)," "),'[[',1)

m <- mitch_import(x=def2,DEtype="deseq2",geneTable=gtnew)
## The input is a single dataframe; one contrast only. Converting
##         it to a list for you.
## Note: Mean no. genes in input = 13168
## Note: no. genes in output = 13063
## Note: estimated proportion of input genes in output = 0.992
mres <- mitch_calc(x=m,genesets=gsets,minsetsize=5,cores=4,priority="significance")
## Note: When prioritising by significance (ie: small
##             p-values), large effect sizes might be missed.
mres_sig_up <- head(subset(mres$enrichment_result,s.dist>0),10)
mres_sig_dn <- head(subset(mres$enrichment_result,s.dist<0),10)

mres_sig_up_tbl <- apply(data.frame(mres_sig_up[,1:2],apply(mres_sig_up[,3:5],2,signif,2)),2,as.character)
mres_sig_dn_tbl <- apply(data.frame(mres_sig_dn[,1:2],apply(mres_sig_dn[,3:5],2,signif,2)),2,as.character)

mres_sig_up_tbl %>%
  kbl(caption="Top upregulated pathways when prioritised by FDR",row.names=FALSE) %>%
  kable_paper("hover", full_width = F)
Top upregulated pathways when prioritised by FDR
set setSize pANOVA s.dist p.adjustANOVA
Cell Cycle 589 1.4e-47 0.35 2.6e-44
Metabolism of RNA 725 4.4e-46 0.31 4.0e-43
Cell Cycle, Mitotic 479 1.5e-39 0.35 9.1e-37
Cell Cycle Checkpoints 246 2.1e-27 0.40 9.6e-25
M Phase 336 9.5e-27 0.34 2.9e-24
Mitotic Prometaphase 189 4.6e-25 0.44 1.1e-22
Mitotic Metaphase and Anaphase 208 8.9e-24 0.40 1.8e-21
Mitotic Anaphase 207 2.3e-23 0.40 4.3e-21
Processing of Capped Intron-Containing Pre-mRNA 273 6.6e-23 0.35 1.1e-20
Resolution of Sister Chromatid Cohesion 114 1.0e-20 0.51 1.6e-18
mres_sig_dn_tbl %>%
  kbl(caption="Top downregulated pathways when prioritised by FDR",row.names=FALSE) %>%
  kable_paper("hover", full_width = F)
Top downregulated pathways when prioritised by FDR
set setSize pANOVA s.dist p.adjustANOVA
Innate Immune System 747 4.3e-27 -0.23 1.6e-24
Immune System 1434 5.8e-26 -0.17 1.5e-23
Interferon alpha/beta signaling 60 1.6e-19 -0.67 2.3e-17
Neutrophil degranulation 382 1.4e-18 -0.26 1.7e-16
Interferon gamma signaling 67 1.8e-15 -0.56 1.6e-13
Extracellular matrix organization 154 1.4e-13 -0.35 8.5e-12
Cytokine Signaling in Immune system 567 2.1e-12 -0.17 1.0e-10
Platelet activation, signaling and aggregation 174 9.1e-12 -0.30 3.6e-10
Signaling by Interleukins 327 2.0e-10 -0.21 5.7e-09
Immunoregulatory interactions between a Lymphoid and a non-Lymphoid cell 60 5.1e-10 -0.46 1.3e-08
mres <- mitch_calc(x=m,genesets=gsets,minsetsize=5,cores=4,priority="effect")
## Note: Enrichments with large effect sizes may not be
##             statistically significant.
mres_es_up <- head(subset(mres$enrichment_result,s.dist>0 & p.adjustANOVA < 0.05),10)
mres_es_dn <- head(subset(mres$enrichment_result,s.dist<0 & p.adjustANOVA < 0.05),10)

mres_es_up_tbl <- apply(data.frame(mres_es_up[,1:2],apply(mres_es_up[,3:5],2,signif,2)),2,as.character)
mres_es_dn_tbl <- apply(data.frame(mres_es_dn[,1:2],apply(mres_es_dn[,3:5],2,signif,2)),2,as.character)

mres_es_up_tbl %>%
  kbl(caption="Top upregulated pathways when prioritised by ES (discarding any with FDR>0.05)",row.names=FALSE) %>%
  kable_paper("hover", full_width = F)
Top upregulated pathways when prioritised by ES (discarding any with FDR>0.05)
set setSize pANOVA s.dist p.adjustANOVA
Activation of NOXA and translocation to mitochondria 5 1.1e-03 0.84 6.9e-03
Condensation of Prometaphase Chromosomes 11 2.5e-06 0.82 3.5e-05
Postmitotic nuclear pore complex (NPC) reformation 27 1.8e-11 0.75 6.4e-10
Phosphorylation of Emi1 6 1.6e-03 0.75 8.8e-03
Interactions of Rev with host cellular proteins 37 6.8e-15 0.74 5.2e-13
Nuclear import of Rev protein 34 1.7e-13 0.73 9.9e-12
Rev-mediated nuclear export of HIV RNA 35 1.0e-13 0.73 6.3e-12
Transport of Ribonucleoproteins into the Host Nucleus 32 2.1e-12 0.72 1.0e-10
Export of Viral Ribonucleoproteins from Nucleus 32 2.9e-12 0.71 1.3e-10
NEP/NS2 Interacts with the Cellular Export Machinery 32 2.9e-12 0.71 1.3e-10
mres_es_dn_tbl %>%
  kbl(caption="Top upregulated pathways when prioritised by ES (discarding any with FDR>0.05)",row.names=FALSE) %>%
  kable_paper("hover", full_width = F)
Top upregulated pathways when prioritised by ES (discarding any with FDR>0.05)
set setSize pANOVA s.dist p.adjustANOVA
Prostanoid ligand receptors 5 9.7e-04 -0.85 0.00620
Interleukin-9 signaling 6 6.0e-04 -0.81 0.00430
Interleukin-2 signaling 7 3.5e-04 -0.78 0.00270
Interleukin-21 signaling 8 2.6e-04 -0.75 0.00220
Specification of primordial germ cells 6 1.7e-03 -0.74 0.00960
Fatty Acids bound to GPR40 (FFAR1) regulate insulin secretion 6 1.8e-03 -0.74 0.00990
Scavenging by Class A Receptors 5 4.5e-03 -0.73 0.02100
Biosynthesis of Lipoxins (LX) 5 6.9e-03 -0.70 0.02900
Interleukin-20 family signaling 12 3.3e-05 -0.69 0.00038
Epithelial-Mesenchymal Transition (EMT) during gastrulation 5 7.7e-03 -0.69 0.03100
m4tbls <- list("esup"=mres_es_up_tbl,"esdn"=mres_es_dn_tbl,"sigup"=mres_sig_up_tbl,"sigdn"=mres_sig_dn_tbl)
saveRDS(m4tbls,"m4tbls.Rds")

Make a scatterplot

mres2 <- mres$enrichment_result

sig <- subset(mres2,p.adjustANOVA<0.05)
plot(abs(mres2$s.dist),-log10(mres2$pANOVA),pch=19,cex=0.6,frame.plot=FALSE,
  xlab="enrichment score (absolute)", ylab="significance [-log10(p-value)]",
  xlim=c(0,0.9),ylim=c(0,50))
points(abs(sig$s.dist),-log10(sig$pANOVA),pch=19,cex=0.61,col="red")

par(mar=c(5.1,4.1,4.1,2.1))
unlink("img/4a.png")
png("img/4a.png", width = 480, height = 480)
plot(abs(mres2$s.dist),-log10(mres2$pANOVA),pch=19,cex=0.6,frame.plot=FALSE,
  xlab="enrichment score (absolute)", ylab="significance [-log10(p-value)]",
  xlim=c(0,0.9),ylim=c(0,50))
points(abs(sig$s.dist),-log10(sig$pANOVA),pch=19,cex=0.61,col="red")
dev.off()
## X11cairo 
##        2
par(mfrow=c(1,1))
par(mar=c(5.1,4.1,4.1,2.1))

Make a barplot

par(mar=c(2,20,2,0))

mres_sig_up_vals <- -log(mres_sig_up$pANOVA)
names(mres_sig_up_vals) <- mres_sig_up$set

mres_sig_dn_vals <- -log(mres_sig_dn$pANOVA)
names(mres_sig_dn_vals) <- mres_sig_dn$set

mres_es_up_vals <- mres_es_up$s.dist
names(mres_es_up_vals) <- mres_es_up$set

mres_es_dn_vals <- -1 * mres_es_dn$s.dist
names(mres_es_dn_vals) <- mres_es_dn$set

par(mfrow=c(2,1))

barplot( mres_sig_up_vals,
  horiz=TRUE,las=1,cex.names=0.65,col="gray",
  main="Up Sig (-log10 p-value)",
  xlab="-log10 p-value",
  cex.main=0.9)

barplot( mres_es_up_vals,
  horiz=TRUE,las=1,cex.names=0.65,col="gray",
  main="Up ES",
  xlab="ES",
  cex.main=0.9)

barplot( mres_sig_dn_vals,
  horiz=TRUE,las=1,cex.names=0.65,col="gray",
  main="Down Sig (-log10 p-value)",
  xlab="-log10 p-value",
  cex.main=0.9)

barplot( mres_es_dn_vals,
  horiz=TRUE,las=1,cex.names=0.65,col="gray",
  main="Down ES",
  xlab="-ES",
  cex.main=0.9)

par(mfrow=c(1,1))
par(mar=c(5.1,4.1,4.1,2.1))

Mistake 5 - gene set list size too small or long

up <- rownames(subset(def, log2FoldChange>0 & padj<0.05))
up <- sapply(strsplit(up," "),'[[',1)
up <- unique(gtnew[match(up,gtnew$V1),2])
up <- up[which(!is.na(up))]

dn <- rownames(subset(def, log2FoldChange<0 & padj<0.05))
dn <- sapply(strsplit(dn," "),'[[',1)
dn <- unique(gtnew[match(dn,gtnew$V1),2])
dn <- dn[which(!is.na(dn))]

bg <- rownames(def)
bg <- sapply(strsplit(bg," "),'[[',1)
bg <- unique(gtnew[match(bg,gtnew$V1),2])
bg <- bg[which(!is.na(bg))]

lapply(list(up,dn,bg),length)
## [[1]]
## [1] 1667
## 
## [[2]]
## [1] 1923
## 
## [[3]]
## [1] 13063
sizes <- c(50,100,200,300,400,500,750,1000,1500,2000,2500,3000,4000,5000,6000,7000)

fup <- nrow(subset(fora(pathways=gsets, genes=up, universe=bg, minSize = 5, maxSize = Inf),padj<0.05))
fdn <- nrow(subset(fora(pathways=gsets, genes=dn, universe=bg, minSize = 5, maxSize = Inf),padj<0.05))

nups <- unlist(lapply(sizes, function(i) {
  ids <- rownames(subset(def2, log2FoldChange>0))
  ids <- head(ids,i)
  genes <- unique(gtnew[match(ids,gtnew$V1),2])
  fres <- fora(pathways=gsets, genes=genes, universe=bg, minSize = 5, maxSize = Inf)
  nrow(subset(fres,padj<0.05))
}))
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
ndns <- unlist(lapply(sizes, function(i) {
  ids <- rownames(subset(def2, log2FoldChange<0))
  ids <- head(ids,i)
  genes <- unique(gtnew[match(ids,gtnew$V1),2])
  fres <- fora(pathways=gsets, genes=genes, universe=bg, minSize = 5, maxSize = Inf)
  nrow(subset(fres,padj<0.05))
}))
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
df <- data.frame("up"=nups,"dn"=ndns,row.names=sizes)
df$tot <- df$up + df$dn
df
##       up  dn tot
## 50     0  31  31
## 100    0  40  40
## 200    8  46  54
## 300   46  51  97
## 400   88  54 142
## 500  116  72 188
## 750  189 107 296
## 1000 202 120 322
## 1500 227 118 345
## 2000 256 143 399
## 2500 283 172 455
## 3000 282 159 441
## 4000 263 143 406
## 5000 228 102 330
## 6000 192  96 288
## 7000 165  57 222
plot(sizes,df$up,type="b",col="red",ylab="no. gene sets with FDR<0.05",xlab="gene list length")
lines(sizes,df$dn,type="b",col="blue")

plot(sizes,df$up,type="b",col="red",ylab="no. gene sets with FDR<0.05",xlab="gene list length",
  xlim=c(0,500), ylim=c(0,125))
lines(sizes,df$dn,type="b",col="blue")

par(mfrow=c(1,2))
plot(sizes,df$up,type="b",col="red",ylab="no. gene sets with FDR<0.05",xlab="gene list length")
lines(sizes,df$dn,type="b",col="blue")

plot(sizes,df$up,type="b",col="red",ylab="no. gene sets with FDR<0.05",xlab="gene list length",
  xlim=c(0,500), ylim=c(0,125))
lines(sizes,df$dn,type="b",col="blue")

unlink("img/5m.png")
png("img/5m.png", width = 700, height = 480)
par(mfrow=c(1,2))
plot(sizes,df$up,type="b",col="red",ylab="no. gene sets with FDR<0.05",xlab="gene list length")
lines(sizes,df$dn,type="b",col="blue")
plot(sizes,df$up,type="b",col="red",ylab="no. gene sets with FDR<0.05",xlab="gene list length",
  xlim=c(0,500), ylim=c(0,125))
lines(sizes,df$dn,type="b",col="blue")
dev.off()
## X11cairo 
##        2
par(mfrow=c(1,1))

Mistake 6 - combining up and downregulated genes into the same test.

updn <- unique(c(up,dn))

fres_updn_bg <- fora(pathways=gsets, genes=updn, universe=bg, minSize = 5, maxSize = Inf)
fres_updn_bg <- subset(fres_updn_bg,padj<0.05)$pathway

length(fres_updn_bg)
## [1] 166
length(fres_up_bg)
## [1] 224
length(fres_dn_bg)
## [1] 128
length(unique(c(fres_up_bg,fres_dn_bg)))
## [1] 351
v1 <- list("up"=fres_up_bg,"dn"=fres_dn_bg,"combined"=fres_updn_bg)
plot(euler(v1),quantities = list(cex = 2), labels = list(cex = 2))

unlink("img/6m. png")
png("img/6m.png", width = 480, height = 300)
plot(euler(v1),quantities = list(cex = 2), labels = list(cex = 2))
dev.off()
## X11cairo 
##        2

Mistake 8 - using shallow gene annotations

msdb <- gmtPathways("ref/msigdb/msigdb.v2025.1.Hs.symbols.gmt")
kegg <- msdb[grep("KEGG",names(msdb))]
kegg_med <- msdb[grep("KEGG_MED",names(msdb))]
kegg <- kegg[grep("KEGG_MED",names(kegg),invert=TRUE)] #separate KEGG LEG from KEGG MED
gobp <- msdb[grep("GOBP",names(msdb))]
reactome <- msdb[grep("REACTOME",names(msdb))]
gslist <- list("KEGG"=kegg,"KEGGM"=kegg_med,"Reactome"=reactome,"GOBP"=gobp,"MSigDB"=msdb)

# no. gene sets
d1 <- unlist( lapply(gslist,length) )

# total number of annotations
d2 <- unlist( lapply(gslist,function(i) { length(unlist(i)) } ) )

# median gene set size
d3 <- unlist(lapply(gslist,function(i) { median(unlist(lapply(i,length))) } ))

# number of genes with one or more functions
d4 <- unlist( lapply(gslist,function(i) { length(unique(unlist(i))) } ))

kegg_up <- fora(pathways=kegg, genes=up, universe=bg, minSize = 5, maxSize = Inf)
kegg_dn <- fora(pathways=kegg, genes=dn, universe=bg, minSize = 5, maxSize = Inf)
nrow(subset(kegg_up,padj<0.05))
## [1] 11
nrow(subset(kegg_dn,padj<0.05))
## [1] 51
sum(nrow(subset(kegg_up,padj<0.05)), nrow(subset(kegg_dn,padj<0.05) ))
## [1] 62
kegg_med_up <- fora(pathways=kegg_med, genes=up, universe=bg, minSize = 5, maxSize = Inf)
kegg_med_dn <- fora(pathways=kegg_med, genes=dn, universe=bg, minSize = 5, maxSize = Inf)
nrow(subset(kegg_med_up,padj<0.05))
## [1] 20
nrow(subset(kegg_med_dn,padj<0.05))
## [1] 18
sum(nrow(subset(kegg_med_up,padj<0.05)), nrow(subset(kegg_med_dn,padj<0.05) ))
## [1] 38
reactome_up <- fora(pathways=reactome, genes=up, universe=bg, minSize = 5, maxSize = Inf)
reactome_dn <- fora(pathways=reactome, genes=dn, universe=bg, minSize = 5, maxSize = Inf)
nrow(subset(reactome_up,padj<0.05))
## [1] 165
nrow(subset(reactome_dn,padj<0.05))
## [1] 117
sum(nrow(subset(reactome_up,padj<0.05)), nrow(subset(reactome_dn,padj<0.05) ))
## [1] 282
gobp_up <- fora(pathways=gobp, genes=up, universe=bg, minSize = 5, maxSize = Inf)
gobp_dn <- fora(pathways=gobp, genes=dn, universe=bg, minSize = 5, maxSize = Inf)
nrow(subset(gobp_up,padj<0.05))
## [1] 340
nrow(subset(gobp_dn,padj<0.05))
## [1] 1416
sum(nrow(subset(gobp_up,padj<0.05)), nrow(subset(gobp_dn,padj<0.05) ))
## [1] 1756
msdb_up <- fora(pathways=msdb, genes=up, universe=bg, minSize = 5, maxSize = Inf)
msdb_dn <- fora(pathways=msdb, genes=dn, universe=bg, minSize = 5, maxSize = Inf)
nrow(subset(msdb_up,padj<0.05))
## [1] 3214
nrow(subset(msdb_dn,padj<0.05))
## [1] 7217
sum(nrow(subset(msdb_up,padj<0.05)), nrow(subset(msdb_dn,padj<0.05) ))
## [1] 10431
up_pw <- c(nrow(subset(kegg_up,padj<0.05)),nrow(subset(kegg_med_up,padj<0.05)),
  nrow(subset(reactome_up,padj<0.05)), nrow(subset(gobp_up,padj<0.05)),
  nrow(subset(msdb_up,padj<0.05)) )

dn_pw <- c(nrow(subset(kegg_dn,padj<0.05)), nrow(subset(kegg_med_dn,padj<0.05)),
  nrow(subset(reactome_dn,padj<0.05)), nrow(subset(gobp_dn,padj<0.05)),
  nrow(subset(msdb_dn,padj<0.05)) )

m8tbl <- data.frame(d1,d2,d3,d4,up_pw, dn_pw)

colnames(m8tbl) <- c("No. gene sets","Total no. annotations","Median gene set size",
  "No. genes with ≥1 annotation","Up-regulated","Down-regulated")

m8tbl %>%
  kbl(caption="Table XX. Gene set metrics, and number of significantly differentially regulated pathways (FDR<0.05).") %>%
  kable_paper("hover", full_width = F)
Table XX. Gene set metrics, and number of significantly differentially regulated pathways (FDR<0.05).
No. gene sets Total no. annotations Median gene set size No. genes with ≥1 annotation Up-regulated Down-regulated
KEGG 186 12800 52.5 5245 11 51
KEGGM 658 9662 11.5 2788 20 18
Reactome 1787 97544 23.0 11369 165 117
GOBP 7583 616242 20.0 18000 340 1416
MSigDB 35134 4089406 47.0 43351 3214 7217
saveRDS(m8tbl,"m8tbl.Rds")

Mistake 9 - outated gene sets

par(mar=c(5.1,5.1,4.1,2.1))

reactome_files <- list.files("ref/msigdb/reactome/",full.names=TRUE)
reactome_files <- reactome_files[c(6:20,1:5)]
reactome_files <- c(reactome_files,"ref/ReactomePathways_2025-09-02.gmt")

reactomes <- lapply(reactome_files,gmtPathways)

names(reactomes) <- c("2010_09","2012_09","2013_05","2015_04","2016_01","2016_09","2017_04",
  "2017_10","2018_07","2019_08","2020_03","2020_09","2021_03","2021_04","2022_01","2022_09",
  "2023_03","2023_10","2024_08","2025_06","2025_09")

nogs <- unlist(lapply(reactomes,length))
nogs
## 2010_09 2012_09 2013_05 2015_04 2016_01 2016_09 2017_04 2017_10 2018_07 2019_08 
##     430     674     674     674     674     674     674     674     674    1499 
## 2020_03 2020_09 2021_03 2021_04 2022_01 2022_09 2023_03 2023_10 2024_08 2025_06 
##    1532    1554    1569    1604    1615    1635    1654    1692    1736    1787 
## 2025_09 
##    2785
# remove sets with 4 or fewer genes
reactomes_filt <- lapply(reactomes,function(gs) {
  l <- lapply(gs,length)
  gs[which(l>=5)]
})
nogs <- unlist(lapply(reactomes_filt,length))
nogs
## 2010_09 2012_09 2013_05 2015_04 2016_01 2016_09 2017_04 2017_10 2018_07 2019_08 
##     430     674     674     674     674     674     674     674     674    1499 
## 2020_03 2020_09 2021_03 2021_04 2022_01 2022_09 2023_03 2023_10 2024_08 2025_06 
##    1532    1554    1569    1604    1615    1635    1654    1692    1736    1787 
## 2025_09 
##    2159
barplot(nogs,las=2,ylim=c(0,2500),ylab="no. gene sets",main="Reactome gene sets over time",
  border=NA)
lapply(seq(0,2500,500),function(h) { abline(h=h,lty=2,col="gray") })

## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL
## 
## [[5]]
## NULL
## 
## [[6]]
## NULL
unlink("img/9m.png")
png("img/9m.png", width = 480, height = 300)
barplot(nogs,las=2,ylim=c(0,2500),ylab="no. gene sets",main="Reactome gene sets over time",
  border=NA,col="darkgray")
lapply(seq(0,2500,500),function(h) { abline(h=h,lty=2,col="lightgray") })
## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL
## 
## [[5]]
## NULL
## 
## [[6]]
## NULL
dev.off()
## X11cairo 
##        2
reactomes <- reactomes_filt

tot_anno <- unlist(lapply(reactomes,function(gs) {
  length(unlist(gs))
}))
tot_anno
## 2010_09 2012_09 2013_05 2015_04 2016_01 2016_09 2017_04 2017_10 2018_07 2019_08 
##   21362   37601   37601   37601   37601   37601   37601   37601   37601   85647 
## 2020_03 2020_09 2021_03 2021_04 2022_01 2022_09 2023_03 2023_10 2024_08 2025_06 
##   87352   88301   86752   89457   89476   92250   92780   95226   97590   97544 
## 2025_09 
##  134094
barplot(tot_anno,las=2,ylim=c(0,140000),main="Total no. annotations")

gene_coverage <- unlist(lapply(reactomes,function(gs) {
  length(unique(unlist(gs)))
}))
gene_coverage
## 2010_09 2012_09 2013_05 2015_04 2016_01 2016_09 2017_04 2017_10 2018_07 2019_08 
##    4159    6025    6025    6025    6025    6025    6025    6025    6025   10718 
## 2020_03 2020_09 2021_03 2021_04 2022_01 2022_09 2023_03 2023_10 2024_08 2025_06 
##   10807   10821   10826   10968   10646   11013   11065   11155   11290   11369 
## 2025_09 
##   11886
barplot(gene_coverage,las=2,ylim=c(0,12000),ylab="gene coverage")

Session information

For reproducibility


Click HERE to show session info

save.image("analysis.Rdata")
sessionInfo()
## R version 4.5.1 (2025-06-13)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 24.04.3 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so;  LAPACK version 3.12.0
## 
## locale:
##  [1] LC_CTYPE=en_AU.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_AU.UTF-8        LC_COLLATE=en_AU.UTF-8    
##  [5] LC_MONETARY=en_AU.UTF-8    LC_MESSAGES=en_AU.UTF-8   
##  [7] LC_PAPER=en_AU.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Australia/Melbourne
## tzcode source: system (glibc)
## 
## attached base packages:
##  [1] grid      parallel  stats4    stats     graphics  grDevices utils    
##  [8] datasets  methods   base     
## 
## other attached packages:
##  [1] png_0.1-8                   beeswarm_0.4.0             
##  [3] RhpcBLASctl_0.23-42         fgsea_1.34.2               
##  [5] mitch_1.20.0                gplots_3.2.0               
##  [7] eulerr_7.0.2                clusterProfiler_4.16.0     
##  [9] kableExtra_1.4.0            DESeq2_1.48.1              
## [11] SummarizedExperiment_1.38.1 Biobase_2.68.0             
## [13] MatrixGenerics_1.20.0       matrixStats_1.5.0          
## [15] GenomicRanges_1.60.0        GenomeInfoDb_1.44.1        
## [17] IRanges_2.42.0              S4Vectors_0.46.0           
## [19] BiocGenerics_0.54.0         generics_0.1.4             
## [21] getDEE2_1.18.0             
## 
## loaded via a namespace (and not attached):
##   [1] RColorBrewer_1.1-3      rstudioapi_0.17.1       jsonlite_2.0.0         
##   [4] magrittr_2.0.3          ggtangle_0.0.7          farver_2.1.2           
##   [7] rmarkdown_2.29          fs_1.6.6                vctrs_0.6.5            
##  [10] memoise_2.0.1           ggtree_3.16.3           htmltools_0.5.8.1      
##  [13] S4Arrays_1.8.1          SparseArray_1.8.1       gridGraphics_0.5-1     
##  [16] sass_0.4.10             KernSmooth_2.23-26      bslib_0.9.0            
##  [19] htmlwidgets_1.6.4       echarts4r_0.4.5         plyr_1.8.9             
##  [22] cachem_1.1.0            igraph_2.1.4            mime_0.13              
##  [25] lifecycle_1.0.4         pkgconfig_2.0.3         Matrix_1.7-3           
##  [28] R6_2.6.1                fastmap_1.2.0           gson_0.1.0             
##  [31] shiny_1.11.1            GenomeInfoDbData_1.2.14 digest_0.6.37          
##  [34] aplot_0.2.8             enrichplot_1.28.4       colorspace_2.1-1       
##  [37] GGally_2.3.0            patchwork_1.3.1         AnnotationDbi_1.70.0   
##  [40] textshaping_1.0.1       RSQLite_2.4.2           polyclip_1.10-7        
##  [43] httr_1.4.7              abind_1.4-8             compiler_4.5.1         
##  [46] bit64_4.6.0-1           S7_0.2.0                BiocParallel_1.42.1    
##  [49] DBI_1.2.3               ggstats_0.10.0          R.utils_2.13.0         
##  [52] MASS_7.3-65             DelayedArray_0.34.1     gtools_3.9.5           
##  [55] caTools_1.18.3          tools_4.5.1             ape_5.8-1              
##  [58] httpuv_1.6.16           R.oo_1.27.1             glue_1.8.0             
##  [61] promises_1.3.3          nlme_3.1-168            GOSemSim_2.34.0        
##  [64] polylabelr_0.3.0        reshape2_1.4.4          gtable_0.3.6           
##  [67] rmdformats_1.0.4        R.methodsS3_1.8.2       tidyr_1.3.1            
##  [70] data.table_1.17.8       xml2_1.3.8              XVector_0.48.0         
##  [73] ggrepel_0.9.6           pillar_1.11.0           stringr_1.5.1          
##  [76] yulab.utils_0.2.0       later_1.4.2             splines_4.5.1          
##  [79] dplyr_1.1.4             treeio_1.32.0           lattice_0.22-7         
##  [82] bit_4.6.0               tidyselect_1.2.1        GO.db_3.21.0           
##  [85] locfit_1.5-9.12         Biostrings_2.76.0       knitr_1.50             
##  [88] gridExtra_2.3           bookdown_0.43           svglite_2.2.1          
##  [91] xfun_0.52               stringi_1.8.7           UCSC.utils_1.4.0       
##  [94] statnet.common_4.12.0   lazyeval_0.2.2          ggfun_0.2.0            
##  [97] yaml_2.3.10             evaluate_1.0.4          codetools_0.2-20       
## [100] tibble_3.3.0            qvalue_2.40.0           ggplotify_0.1.2        
## [103] cli_3.6.5               xtable_1.8-4            systemfonts_1.2.3      
## [106] jquerylib_0.1.4         network_1.19.0          dichromat_2.0-0.1      
## [109] Rcpp_1.1.0              coda_0.19-4.1           ggplot2_3.5.2          
## [112] blob_1.2.4              DOSE_4.2.0              htm2txt_2.2.2          
## [115] bitops_1.0-9            viridisLite_0.4.2       tidytree_0.4.6         
## [118] scales_1.4.0            purrr_1.1.0             crayon_1.5.3           
## [121] rlang_1.1.6             cowplot_1.2.0           fastmatch_1.1-6        
## [124] KEGGREST_1.48.1