Source: https://github.com/markziemann/10mistakes

Introduction

#knitr::opts_chunk$set(dev = 'svg') # set output device to svg

suppressPackageStartupMessages({
  library("getDEE2")
  library("DESeq2")
  library("kableExtra")
  library("eulerr")
  library("gplots")
  library("mitch")
  library("fgsea")
  library("RhpcBLASctl")
  library("parallel")
  library("beeswarm")
  library("vioplot")
})

RhpcBLASctl::blas_set_num_threads(1)

if ( ! dir.exists("img") ) {
  dir.create("img")
}

For this guide I will be using bulk RNA-seq data from a previous study, which is deposited at NCBI GEO and SRA under accession numbers: GSE55123 and SRP038101 (Lund et al, 2014). The experiment is designed to investigate the effect of Azacitidine treatment on AML3 cells.

The raw data have been processed by the DEE2 project, and the summary gene expression counts are available at the dee2.io website, and programmatically with the getDEE2 bioconductor package (Ziemann et al, 2019).

The gene counts have also been deposited to the /data folder in the example.Rdata file in case the DEE2 resource becomes unavailable. To import it, use the command: load("data/example.Rdata")

Alternatively, you could fetch data from another resource like NCBI GEO, Zenodo or from the host storage drive.

mdat <- getDEE2Metadata("hsapiens")

# get sample sheet
ss <- subset(mdat,SRP_accession=="SRP038101")

# fetch the whole set of RNA-seq data
x <- getDEE2("hsapiens",ss$SRR_accession , metadata=mdat, legacy=TRUE)
## For more information about DEE2 QC metrics, visit
##     https://github.com/markziemann/dee2/blob/master/qc/qc_metrics.md
mx <- x$GeneCounts
rownames(mx) <- paste(rownames(mx),x$GeneInfo$GeneSymbol)
dim(mx)
## [1] 58302     6
# aza no filtering
ss$trt <- grepl("Treated",ss$Experiment_title)

ss %>%
  kbl(caption="sample sheet for Aza treatment in AML3 cells") %>%
  kable_paper("hover", full_width = F)
sample sheet for Aza treatment in AML3 cells
SRR_accession QC_summary SRX_accession SRS_accession SRP_accession Experiment_title GEO_series trt
168876 SRR1171523 PASS SRX472607 SRS559064 SRP038101 GSM1329859: Untreated.1; Homo sapiens; RNA-Seq GSE55123 FALSE
168877 SRR1171524 WARN(3,4) SRX472608 SRS559066 SRP038101 GSM1329860: Untreated.2; Homo sapiens; RNA-Seq GSE55123 FALSE
168878 SRR1171525 WARN(3,4) SRX472609 SRS559065 SRP038101 GSM1329861: Untreated.3; Homo sapiens; RNA-Seq GSE55123 FALSE
168879 SRR1171526 WARN(3,4) SRX472610 SRS559068 SRP038101 GSM1329862: Treated.1; Homo sapiens; RNA-Seq GSE55123 TRUE
168880 SRR1171527 WARN(3,4) SRX472611 SRS559067 SRP038101 GSM1329863: Treated.2; Homo sapiens; RNA-Seq GSE55123 TRUE
168881 SRR1171528 WARN(3,4) SRX472612 SRS559069 SRP038101 GSM1329864: Treated.3; Homo sapiens; RNA-Seq GSE55123 TRUE

Data quality control

QC is important, even if you are using public transcriptome data. For RNA-seq it is a good idea to show the number of reads for each sample.

par(mar=c(5,7,5,1))
barplot(rev(colSums(mx)),horiz=TRUE,las=1,main="number of reads per sample in SRP038101")

Now make a MDS plot.

mds <- cmdscale(dist(t(mx)))

# expand plot area
XMIN=min(mds[,1])*1.3
XMAX=max(mds[,1])*1.3
YMIN=min(mds[,2])*1.3
YMAX=max(mds[,2])*1.3

cols <- as.character(grepl("Treated",ss$Experiment_title))
cols <- gsub("FALSE","lightblue",cols)
cols <- gsub("TRUE","pink",cols)
plot(mds, xlab="Coordinate 1", ylab="Coordinate 2",
  xlim=c(XMIN,XMAX),ylim=c(YMIN,YMAX),
  pch=19,cex=2,col=cols, main="MDS plot")
text(cmdscale(dist(t(mx))), labels=colnames(mx) )

Differential expression analysis

We will be using DESeq2 for DE analysis.

The count matrix is prefiltered using a detection threshold of 10 reads per sample across all samples. All genes that meet the detection threshold will comprise the background list.

We will also repeat this without a detection threshold.

The first 6 rows of the count matrix are shown.

mxf <- mx[which(rowMeans(mx)>=10),]
dim(mxf)
## [1] 13168     6
head(mxf,6) %>%
  kbl(caption="Count matrix format") %>%
  kable_paper("hover", full_width = F)
Count matrix format
SRR1171523 SRR1171524 SRR1171525 SRR1171526 SRR1171527 SRR1171528
ENSG00000225630 MTND2P28 494 396 340 333 415 418
ENSG00000237973 MTCO1P12 52 39 40 30 37 29
ENSG00000248527 MTATP6P1 853 544 537 582 702 716
ENSG00000228327 AL669831.1 17 13 21 21 22 12
ENSG00000228794 LINC01128 42 27 30 32 40 23
ENSG00000230699 AL645608.3 20 11 13 10 15 22
dds <- DESeqDataSetFromMatrix(countData = mxf , colData = ss, design = ~ trt )
res <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
z <- results(res)
vsd <- vst(dds, blind=FALSE)
zz <-cbind(as.data.frame(z),mxf)
def <-as.data.frame(zz[order(zz$pvalue),])

head(def,10) %>%
  kbl(caption="Top DE genes for Aza treatment") %>%
  kable_paper("hover", full_width = F)
Top DE genes for Aza treatment
baseMean log2FoldChange lfcSE stat pvalue padj SRR1171523 SRR1171524 SRR1171525 SRR1171526 SRR1171527 SRR1171528
ENSG00000165949 IFI27 1960.1970 -3.384492 0.0938869 -36.04861 0 0 4689 3583 2758 309 384 334
ENSG00000090382 LYZ 7596.0299 -1.650342 0.0561143 -29.41036 0 0 14212 10237 10789 3476 3764 3738
ENSG00000115461 IGFBP5 531.2217 -5.071157 0.1795239 -28.24781 0 0 1320 823 1025 23 26 43
ENSG00000157601 MX1 827.1511 -2.877795 0.1047823 -27.46450 0 0 1732 1501 1206 184 223 186
ENSG00000111331 OAS3 2127.2010 -2.661214 0.0972124 -27.37525 0 0 4204 3977 2972 562 614 560
ENSG00000070915 SLC12A3 424.5509 -3.374852 0.1298671 -25.98697 0 0 1012 721 653 63 85 76
ENSG00000234745 HLA-B 3197.0159 -1.431566 0.0604169 -23.69479 0 0 6085 4256 4023 1590 1872 1719
ENSG00000137965 IFI44 409.0957 -2.978581 0.1319352 -22.57608 0 0 829 740 635 76 111 89
ENSG00000204525 HLA-C 1631.6421 -1.461550 0.0660214 -22.13750 0 0 3112 2150 2106 791 923 891
ENSG00000110042 DTX4 524.1318 -2.470219 0.1173182 -21.05572 0 0 1166 883 688 166 168 145

Make a smear plot.

sigf <- subset(def,padj<=0.01)

DET=nrow(mxf)
NSIG=nrow(sigf)
NUP=nrow(subset(sigf,log2FoldChange>0))
NDN=nrow(subset(sigf,log2FoldChange<0))

HEADER=paste(DET,"detected genes;",NSIG,"w/FDR<0.01;",NUP,"up;",NDN,"down")

plot(log10(def$baseMean) ,def$log2FoldChange,
  cex=0.6,pch=19,col="darkgray",
  main="5-azacitidine treatment in AML3 cells",
  xlab="log10(basemean)",ylab="log2(fold change)")

points(log10(sigf$baseMean) ,sigf$log2FoldChange,
  cex=0.6,pch=19,col="red")

mtext(HEADER)

In the next sections I will run enrichment analysis with over-representation test and compare it to functional class scoring.

Get update gene symbols

Let’s see if many of the gene symbols have changed.

Comparing Ensembl release 90 to GENCODE Release 49 (GRCh38.p14) which corresponds to Ensembl 115 (Sept 2025).

So v90 had 58k genes and v115 has 86k which is a big increase. The v90 has 1609 genes that have been removed an v115 has 29676 that have been added. There are 56693 common.

From these common ones, 20456 have different gene names. I think the approach to naming these putative genes has changed a lot. Lots of CXorfXX genes have changed, not just AL358472.5 type names.

In the next chunk, I’ll show a few lines of the older source gene names.

There is also an Euler diagram of the gene symbol repertoires.

gt <- read.table("https://dee2.io/data/hsapiens/hsa_gene_info.tsv",header=TRUE)
head(gt)
##            GeneID  GeneSymbol mean median longest_isoform merged
## 1 ENSG00000223972     DDX11L1 1144   1144            1657   1735
## 2 ENSG00000227232      WASH7P 1351   1351            1351   1351
## 3 ENSG00000278267   MIR6859-1   68     68              68     68
## 4 ENSG00000243485 MIR1302-2HG  623    623             712   1021
## 5 ENSG00000284332   MIR1302-2  138    138             138    138
## 6 ENSG00000237613     FAM138A  888    888            1187   1219
gtnew <- read.table("ref/gencode.v49.genenames.tsv")

v1 <- list("Ens90"=gt$GeneID, "Ens115"=gtnew$V1)
plot(euler(v1),quantities = list(cex = 2), labels = list(cex = 2))

message("Number of unique genes in the original gene table:")
## Number of unique genes in the original gene table:
length(gt$GeneID)
## [1] 58302
message("Number of unique genes in the new gene annotation set:")
## Number of unique genes in the new gene annotation set:
length(gtnew$V1)
## [1] 86369
message("Number of shared gene IDs between new and old annotations:")
## Number of shared gene IDs between new and old annotations:
length(intersect(gt$GeneID,gtnew$V1))
## [1] 56693
message("Number of gene IDs specific to the old annotation:")
## Number of gene IDs specific to the old annotation:
length(setdiff(gt$GeneID,gtnew$V1))
## [1] 1609
message("Number of gene IDs specific to the new annotation:")
## Number of gene IDs specific to the new annotation:
length(setdiff(gtnew$V1,gt$GeneID))
## [1] 29676
message("Here is the mapping table for old-new gene symbols:")
## Here is the mapping table for old-new gene symbols:
gtm <- merge(gt,gtnew,by.x="GeneID",by.y="V1")

message("Number of rows:")
## Number of rows:
nrow(gtm)
## [1] 56693
message("How many of these gene symbols are the same?")
## How many of these gene symbols are the same?
table(gtm$GeneSymbol == gtm$V2)
## 
## FALSE  TRUE 
## 20456 36237

Get gene sets

Reading in Reactome gene sets.

# from https://reactome.org/download/current/ReactomePathways.gmt.zip
gsets <- gmtPathways("ref/ReactomePathways_2025-09-02.gmt")

Mistake 1 - Raw p-values

First, discard old gene symbols in favour of the new ones.

In this analysis, we are doing ORA with fora, and comparing the results of raw p-values versus using FDR correction.

Gene list is limited to 1000 as per mistake 5.

This corresponds to the results shown in Figure 1B.

up <- head(rownames(subset(def, log2FoldChange>0 & padj<0.01)),1004)
up <- sapply(strsplit(up," "),'[[',1)
up <- unique(gtnew[match(up,gtnew$V1),2])
up <- up[which(!is.na(up))]

dn <- head(rownames(subset(def, log2FoldChange<0 & padj<0.01)),1001)
dn <- sapply(strsplit(dn," "),'[[',1)
dn <- unique(gtnew[match(dn,gtnew$V1),2])
dn <- dn[which(!is.na(dn))]

bg <- rownames(def)
bg <- sapply(strsplit(bg," "),'[[',1)
bg <- unique(gtnew[match(bg,gtnew$V1),2])
bg <- bg[which(!is.na(bg))]

message("length of up, down and background sets")
## length of up, down and background sets
lapply(list(up,dn,bg),length)
## [[1]]
## [1] 1000
## 
## [[2]]
## [1] 1000
## 
## [[3]]
## [1] 13063
fres_up <- fora(pathways=gsets, genes=up, universe=bg, minSize = 5, maxSize = Inf)

message("number of gene sets in the results")
## number of gene sets in the results
nrow(fres_up)
## [1] 1840
message("number of up-regulated gene sets with FDR<0.01")
## number of up-regulated gene sets with FDR<0.01
nrow(subset(fres_up,padj<0.01))
## [1] 143
message("number of up-regulated gene sets with p<0.01")
## number of up-regulated gene sets with p<0.01
nrow(subset(fres_up,pval<0.01))
## [1] 223
fres_up_fdr <- subset(fres_up,padj<0.01)$pathway
fres_up_nom <- subset(fres_up,pval<0.01)$pathway

fres_dn <- fora(pathways=gsets, genes=dn, universe=bg, minSize = 5, maxSize = Inf)
message("number of down-regulated gene sets with FDR<0.01")
## number of down-regulated gene sets with FDR<0.01
nrow(subset(fres_dn,padj<0.01))
## [1] 67
message("number of down-regulated gene sets with p<0.01")
## number of down-regulated gene sets with p<0.01
nrow(subset(fres_dn,pval<0.01))
## [1] 169
fres_dn_fdr <- subset(fres_dn,padj<0.01)$pathway
fres_dn_nom <- subset(fres_dn,pval<0.01)$pathway

v1 <- list("up p"=fres_up_nom, "up FDR"=fres_up_fdr,
  "dn p"=fres_dn_nom, "dn FDR"=fres_dn_fdr)

par(mar=c(1,1,1,1))
par(mfrow=c(1,1))
plot(euler(v1),quantities = list(cex = 2), labels = list(cex = 2))

fres_fdr <- union(fres_dn_fdr,fres_up_fdr)
fres_nom <- union(fres_dn_nom,fres_up_nom)

v1 <- list("p"=fres_nom, "FDR"=fres_fdr)
plot(euler(v1),quantities = list(cex = 2), labels = list(cex = 2))

unlink("img/1b.png")
png("img/1b.png", width = 480, height = 400)
plot(euler(v1),quantities = list(cex = 2), labels = list(cex = 2))
dev.off()
## png 
##   2
par(mfrow=c(1,1))

unlink("img/1b.pdf")
pdf("img/1b.pdf", width = 5, height = 5)
plot(euler(v1),quantities = list(cex = 2), labels = list(cex = 2))
dev.off()
## png 
##   2
par(mfrow=c(1,1))

par(mar=c(5.1,4.1,4.1,2.1))

That analysis shows how a big difference between FDR and raw p-values, but it doesn’t prove that the p-value hits are false positives.

Here, we can prove they are false positives by randomly selecting 1000 genes from the list of detected ones, and testing them for enrichment.

When doing enrichment analysis of a random list of 1000 genes, we got an average of 51.6 pathways with p<0.01, while after p-value correction, that number was just 0.16 with FDR<0.01. This correponds to the data shown in Figure 1A.

seeds <- seq(from=100,to=100*100,by=100)

m1sim <- mclapply(seeds, function(seed) {
  set.seed(seed)
  rand <- sample(x=bg,size=1000,replace=FALSE)
  fres_rand <- fora(pathways=gsets, genes=rand, universe=bg, minSize = 5, maxSize = Inf)
  nom <- nrow(subset(fres_rand,pval<0.01))
  fdr <- nrow(subset(fres_rand,padj<0.01))
  c(nom,fdr)
},mc.cores=8)

m1sim <- do.call(rbind,m1sim)
colnames(m1sim) <- c("p<0.01","FDR<0.01")

boxplot(m1sim,main="Random gene sets (100 repetitions)",ylab="no. gene sets meeting \"significance\" threshold <0.01",frame=FALSE)
beeswarm(list(mean(m1sim[,1]), mean(m1sim[,2])),add=TRUE,pch=4,cex=2)
lab1 <- paste("Mean=",signif(mean(m1sim[,1]) , 3),sep="")
lab2 <- paste("Mean=",signif(mean(m1sim[,2]) , 2),sep="")
text(c(1.2,2.2), c( mean(m1sim[,1])+5, mean(m1sim[,2])+5 ), labels=c(lab1,lab2) )

unlink("img/1a.png")
png("img/1a.png", width = 480, height = 480)
boxplot(m1sim,main="Random gene sets (100 repetitions)",ylab="no. gene sets meeting \"significance\" threshold <0.01",frame=FALSE)
beeswarm(list(mean(m1sim[,1]), mean(m1sim[,2])),add=TRUE,pch=4,cex=2)
lab1 <- paste("Mean=",signif(mean(m1sim[,1]) , 3),sep="")
lab2 <- paste("Mean=",signif(mean(m1sim[,2]) , 2),sep="")
text(c(1.2,2.2), c( mean(m1sim[,1])+5, mean(m1sim[,2])+5 ), labels=c(lab1,lab2) )
dev.off()
## png 
##   2
par(mfrow=c(1,1))

unlink("img/1a.pdf")
pdf("img/1a.pdf", width = 5, height = 5)
boxplot(m1sim,main="Random gene sets (100 repetitions)",ylab="no. gene sets meeting \"significance\" threshold <0.01",frame=FALSE)
beeswarm(list(mean(m1sim[,1]), mean(m1sim[,2])),add=TRUE,pch=4,cex=2)
lab1 <- paste("Mean=",signif(mean(m1sim[,1]) , 3),sep="")
lab2 <- paste("Mean=",signif(mean(m1sim[,2]) , 2),sep="")
text(c(1.2,2.2), c( mean(m1sim[,1])+5, mean(m1sim[,2])+5 ), labels=c(lab1,lab2) )
dev.off()
## png 
##   2
par(mfrow=c(1,1))

par(mar=c(5.1,4.1,4.1,2.1))

Mistake 2 - Background

Will try FORA.

Reporting criteria Method/resource used
Origin of gene sets Reactome (2023-03-06)
Tool used FORA (check fgsea version at foot of report)
Statistical test used hypergeometric
P-value correction for multiple comparisons FDR method
Background list Genes with >=10 reads per sample on average across all samples
Gene Selection DESeq2 FDR<0.01 and log2FC>0 or log2FC<0
Data availability via DEE2 at accession SRP038101 (human)
Other parameters Min gene set size of 5
message("Number of genes in the count matrix")
## Number of genes in the count matrix
nrow(x$GeneCounts)
## [1] 58302
rowmeans <- rowMeans(x$GeneCounts)

message("Number of genes in the count matrix with more than 10 reads per gene on average.")
## Number of genes in the count matrix with more than 10 reads per gene on average.
length(rowmeans[which(rowmeans>=10)])
## [1] 13168
message("Number of genes in the count matrix with less than 10 reads per gene on average.")
## Number of genes in the count matrix with less than 10 reads per gene on average.
length(rowmeans[which(rowmeans<10)])
## [1] 45134
message("Proportion of genes below the detection threshold.")
## Proportion of genes below the detection threshold.
length(rowmeans[which(rowmeans<10)]) / length(rowmeans)
## [1] 0.7741415
message("Proportion of genes above the detection threshold.")
## Proportion of genes above the detection threshold.
( 1- sum(rowmeans[which(rowmeans>=10)]) / sum(rowmeans) ) * 100
## [1] 0.1917788
# UNFILT
unfilt <- x$GeneCounts

message("Run deseq2 for differential expression with and without filtering.")
## Run deseq2 for differential expression with and without filtering.
dds_unfilt <- DESeqDataSetFromMatrix(countData = unfilt , colData = ss, design = ~ trt )
res_unfilt <- DESeq(dds_unfilt)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
z_unfilt <- results(res_unfilt)
vsd_unfilt <- vst(dds_unfilt, blind=FALSE)
zz_unfilt <-cbind(as.data.frame(z_unfilt),unfilt)
def_unfilt <-as.data.frame(zz_unfilt[order(zz_unfilt$pvalue),])

message("No significant genes without filtering low genes")
## No significant genes without filtering low genes
nrow(subset(def_unfilt,padj<0.01))
## [1] 2409
sigf <- subset(def_unfilt,padj<=0.01)
DET=nrow(def_unfilt)
NSIG=nrow(sigf)
NUP=nrow(subset(sigf,log2FoldChange>0))
NDN=nrow(subset(sigf,log2FoldChange<0))
HEADER=paste(DET,"included genes;",NSIG,"w/FDR<0.01;",NUP,"up;",NDN,"down")
plot(log10(def_unfilt$baseMean) ,def_unfilt$log2FoldChange,
  cex=0.6,pch=19,col="darkgray",
  main="5-azacitidine treatment in AML3 cells",
  xlab="log10(basemean)",ylab="log2(fold change)")
points(log10(sigf$baseMean) ,sigf$log2FoldChange,
  cex=0.6,pch=19,col="red")
mtext(HEADER)
abline(v=1,lwd=2,lty=2,col="blue")
abline(v=log10(5),lwd=2,lty=2,col="blue")

# FILT

filt <- unfilt[which(rowMeans(unfilt)>=10),]
dds_filt <- DESeqDataSetFromMatrix(countData = filt , colData = ss, design = ~ trt )
res_filt <- DESeq(dds_filt)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
z_filt <- results(res_filt)
vsd_filt <- vst(dds_filt, blind=FALSE)
zz_filt <-cbind(as.data.frame(z_filt),filt)
def_filt <-as.data.frame(zz_filt[order(zz_filt$pvalue),])

message("No significant genes with filtering low genes")
## No significant genes with filtering low genes
nrow(subset(def_filt,padj<0.01))
## [1] 2498
sigf <- subset(def_filt,padj<=0.01)
DET=nrow(def_filt)
NSIG=nrow(sigf)
NUP=nrow(subset(sigf,log2FoldChange>0))
NDN=nrow(subset(sigf,log2FoldChange<0))
HEADER=paste(DET,"included genes;",NSIG,"w/FDR<0.01;",NUP,"up;",NDN,"down")
plot(log10(def_filt$baseMean) ,def_filt$log2FoldChange,
  cex=0.6,pch=19,col="darkgray",
  main="5-azacitidine treatment in AML3 cells",
  xlab="log10(basemean)",ylab="log2(fold change)")
points(log10(sigf$baseMean) ,sigf$log2FoldChange,
  cex=0.6,pch=19,col="red")
mtext(HEADER)

plot(log10(def_unfilt$baseMean),abs(def_unfilt$lfcSE),
  xlab="log10(baseMean)",ylab="log2FC standard error",
  pch=19,cex=0.5,col="black")
sigf <- subset(def_unfilt,padj<=0.01)
points(log10(sigf$baseMean) ,sigf$lfcSE,
  cex=0.5,pch=19,col="red")
abline(v=1,col="blue",lwd=2,lty=2)
abline(v=log10(5),col="blue",lwd=2,lty=2)

Now generate the PDF version of the charts.

unlink("img/smears.pdf")
pdf("img/smears.pdf", width = 8, height = 5)
par(mfrow=c(1,2))

sigf <- subset(def_unfilt,padj<=0.01)
DET=nrow(def_unfilt)
NSIG=nrow(sigf)
NUP=nrow(subset(sigf,log2FoldChange>0))
NDN=nrow(subset(sigf,log2FoldChange<0))
HEADER=paste(DET,"included genes;",NSIG,"w/FDR<0.01;",NUP,"up;",NDN,"down")
plot(log10(def_unfilt$baseMean) ,def_unfilt$log2FoldChange,
  cex=0.6,pch=19,col="darkgray",
  main="Unfiltered",
  xlab="log10(basemean)",ylab="log2(fold change)")
abline(v=1,lwd=2,lty=2,col="blue")
points(log10(sigf$baseMean) ,sigf$log2FoldChange,
  cex=0.6,pch=19,col="red")
mtext(HEADER,cex=0.75)

sigf <- subset(def_filt,padj<=0.01)
DET=nrow(def_filt)
NSIG=nrow(sigf)
NUP=nrow(subset(sigf,log2FoldChange>0))
NDN=nrow(subset(sigf,log2FoldChange<0))
HEADER=paste(DET,"included genes;",NSIG,"w/FDR<0.01;",NUP,"up;",NDN,"down")
plot(log10(def_filt$baseMean) ,def_filt$log2FoldChange,
  cex=0.6,pch=19,col="darkgray",
  main="Filtered",
  xlab="log10(basemean)",ylab="log2(fold change)")
points(log10(sigf$baseMean) ,sigf$log2FoldChange,
  cex=0.6,pch=19,col="red")
mtext(HEADER,cex=0.75)

dev.off()
## png 
##   2

Now have a look at the number of genes that are statistically significant below the detection threshold. Less than 1% of genes are under the detection threshold are significant. In any case they are unlikely to be detected as significant using qPCR.

LOW=length( which(def_unfilt$baseMean<10 ) )
LOW
## [1] 45170
LOWSIG=length( which(def_unfilt$baseMean<10 & def_unfilt$padj < 0.01) )
LOWSIG
## [1] 69
LOW - LOWSIG
## [1] 45101
LOWSIG / LOW
## [1] 0.001527563
LOWSIG / LOW  * 100
## [1] 0.1527563
log10basemean <- log10(def_unfilt$baseMean+0.01)
par(mfrow=c(2,1))
par(mar=c(4.1,4.1,2.1,0.5))
hist(log10basemean,xlab="log10(baseMean)")

vals <- seq(-2,4)
vres <- lapply(vals, function(i) {
  lev0=i
  lev1=i+1
  fdr <- def_unfilt[which(log10basemean >= lev0 & log10basemean < lev1 ),"padj"]
  fdr[is.na(fdr)] <- 1
  fdr<0.01
})

lapply(vres,table)
## [[1]]
## 
## FALSE 
## 30548 
## 
## [[2]]
## 
## FALSE 
##  8667 
## 
## [[3]]
## 
## FALSE  TRUE 
##  5886    69 
## 
## [[4]]
## 
## FALSE  TRUE 
##  3375   326 
## 
## [[5]]
## 
## FALSE  TRUE 
##  5323  1054 
## 
## [[6]]
## 
## FALSE  TRUE 
##  1998   927 
## 
## [[7]]
## 
## FALSE  TRUE 
##    96    33
sigproportions <- unlist(lapply(vres,function(x) { length(which(x==TRUE)) / length(x) } ))
plot(vals+0.5,sigproportions,xlim=c(-2,5),cex=1.3,bty="none",
  ylab="proportion of genes with FDR<0.01",
  xlab="log10(baseMean) bin")
grid()

It is interesting that genes with high expression are over-represented.

Let’s see if adding a log2FC threshold improves it by flattening the curve.

There is a big difference caused by adding the log2FC filter. At a value of 1.0, genes with middle expression are over-represented.

At a value of 0.35 it looks better, but middle expressed genes are still over-represented compared to highly expressed genes.

par(mfrow=c(2,1))
par(mar=c(4.1,4.1,2.1,0.5))
hist(log10basemean,xlab="log10(baseMean)")

vals <- seq(-2,4)
vres <- lapply(vals, function(i) {
  lev0=i
  lev1=i+1
  y <- def_unfilt[which(log10basemean >= lev0 & log10basemean < lev1 ),c("log2FoldChange","padj")]
  y$padj[is.na(y$padj)] <- 1
  abs(y$log2FoldChange)>=1 & y$padj < 0.01
})

lapply(vres,table)
## [[1]]
## 
## FALSE 
## 30548 
## 
## [[2]]
## 
## FALSE 
##  8667 
## 
## [[3]]
## 
## FALSE  TRUE 
##  5886    69 
## 
## [[4]]
## 
## FALSE  TRUE 
##  3431   270 
## 
## [[5]]
## 
## FALSE  TRUE 
##  6241   136 
## 
## [[6]]
## 
## FALSE  TRUE 
##  2908    17 
## 
## [[7]]
## 
## FALSE  TRUE 
##   128     1
sigproportions <- unlist(lapply(vres,function(x) { length(which(x==TRUE)) / length(x) } ))
plot(vals+0.5,sigproportions,cex=1.3,bty="none",
  xlim=c(-2,5),
  ylab="proportion of genes with FDR<0.01",
  xlab="log10(baseMean) bin")
grid()

par(mfrow=c(2,1))
par(mar=c(4.1,4.1,2.1,0.5))
hist(log10basemean,xlab="log10(baseMean)")

vals <- seq(-2,4)
vres <- lapply(vals, function(i) {
  lev0=i
  lev1=i+1
  y <- def_unfilt[which(log10basemean >= lev0 & log10basemean < lev1 ),c("log2FoldChange","padj")]
  y$padj[is.na(y$padj)] <- 1
  abs(y$log2FoldChange)>=0.35 & y$padj < 0.01
})

lapply(vres,table)
## [[1]]
## 
## FALSE 
## 30548 
## 
## [[2]]
## 
## FALSE 
##  8667 
## 
## [[3]]
## 
## FALSE  TRUE 
##  5886    69 
## 
## [[4]]
## 
## FALSE  TRUE 
##  3375   326 
## 
## [[5]]
## 
## FALSE  TRUE 
##  5514   863 
## 
## [[6]]
## 
## FALSE  TRUE 
##  2601   324 
## 
## [[7]]
## 
## FALSE  TRUE 
##   121     8
sigproportions <- unlist(lapply(vres,function(x) { length(which(x==TRUE)) / length(x) } ))
plot(vals+0.5,sigproportions,cex=1.3,bty="none",
  xlim=c(-2,5),
  ylab="proportion of genes with FDR<0.01",
  xlab="log10(baseMean) bin")
grid()

Run enrichment tests to see whether certain baseline expression levels are over-represented.

gl6 <- sapply(strsplit(names(which(rowMeans(mx)>10000 ))," "),"[[",1)
gl5 <- sapply(strsplit(names(which(rowMeans(mx)>1000 & rowMeans(mx)<=10000))," "),"[[",1)
gl4 <- sapply(strsplit(names(which(rowMeans(mx)>100 & rowMeans(mx)<=1000 ))," "),"[[",1)
gl3 <- sapply(strsplit(names(which(rowMeans(mx)>10 & rowMeans(mx)<=100 ))," "),"[[",1)
gl2 <- sapply(strsplit(names(which(rowMeans(mx)>1 & rowMeans(mx)<=10 ))," "),"[[",1)
gl1 <- sapply(strsplit(names(which(rowMeans(mx)>0 & rowMeans(mx)<=1 ))," "),"[[",1)
gl0 <- sapply(strsplit(names(which(rowMeans(mx)==0 ))," "),"[[",1)

gl_gsets <- list("GL0"=gl0,"GL1"=gl1,"GL2"=gl2,"GL3"=gl3,"GL4"=gl4,"GL5"=gl5,"GL6"=gl6)

lapply(gl_gsets,length)
## $GL0
## [1] 30548
## 
## $GL1
## [1] 8970
## 
## $GL2
## [1] 5647
## 
## $GL3
## [1] 3683
## 
## $GL4
## [1] 6366
## 
## $GL5
## [1] 2958
## 
## $GL6
## [1] 130
glup <- head(rownames(subset(def_filt,padj<0.01 & log2FoldChange > 0)),750)
gldn <- head(rownames(subset(def_filt,padj<0.01 & log2FoldChange < 0)),750)
glbg <- rownames(def_filt)

gl_res_up <- fora(pathways=gl_gsets, genes=glup, universe=glbg, minSize = 5, maxSize = Inf)
gl_res_dn <- fora(pathways=gl_gsets, genes=gldn, universe=glbg, minSize = 5, maxSize = Inf)

gl_res_up %>%
  kbl(caption="Enrichment of up genes with basemean levels",row.names=FALSE) %>%
  kable_paper("hover", full_width = F)
Enrichment of up genes with basemean levels
pathway pval padj foldEnrichment overlap size overlapGenes
GL5 0.0000000 0.0000000 2.3742168 400 2958 ENSG0000….
GL6 0.0004047 0.0010119 2.4310154 18 130 ENSG0000….
GL4 0.9999918 1.0000000 0.8439434 306 6366 ENSG0000….
GL2 1.0000000 1.0000000 0.0000000 0 31
GL3 1.0000000 1.0000000 0.1239453 26 3683 ENSG0000….
gl_res_dn %>%
  kbl(caption="Enrichment of down genes with basemean levels",row.names=FALSE) %>%
  kable_paper("hover", full_width = F)
Enrichment of down genes with basemean levels
pathway pval padj foldEnrichment overlap size overlapGenes
GL5 0.0002892 0.0014458 1.2345927 208 2958 ENSG0000….
GL4 0.2756325 0.5367058 1.0232125 371 6366 ENSG0000….
GL6 0.3220235 0.5367058 1.2155077 9 130 ENSG0000….
GL3 0.9999827 1.0000000 0.7722748 162 3683 ENSG0000….
GL2 1.0000000 1.0000000 0.0000000 0 31
glstat <- def_filt$stat

names(glstat) <- rownames(def_filt)

fgseaRes <- fgsea(pathways = gl_gsets,
                  stats    = glstat,
                  minSize  = 5,
                  maxSize  = 50000,
                  nPermSimple = 1000)
## Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
## minSize, : There were 1 pathways for which P-values were not calculated
## properly due to unbalanced (positive and negative) gene-level statistic values.
## For such pathways pval, padj, NES, log2err are set to NA. You can try to
## increase the value of the argument nPermSimple (for example set it nPermSimple
## = 10000)
head(fgseaRes[order(pval), ]) %>%
  kbl(caption="FGSEA enrichment of genes with basemean levels",row.names=FALSE) %>%
  kable_paper("hover", full_width = F)
FGSEA enrichment of genes with basemean levels
pathway pval padj log2err ES NES size leadingEdge
GL3 0.0000000 0.0000000 1.1239150 -0.3447853 -1.454199 3683 ENSG0000….
GL4 0.0759241 0.1500603 0.1596467 -0.2504214 -1.057780 6366 ENSG0000….
GL6 0.1125452 0.1500603 0.2878051 0.2653934 1.171394 130 ENSG0000….
GL2 0.2656489 0.2656489 0.0995791 -0.4068285 -1.175543 31 ENSG0000….
GL5 NA NA NA 0.3259623 NA 2958 ENSG0000….
library(mitch)
yy <- mitch_import(def_filt,"deseq2")
## The input is a single dataframe; one contrast only. Converting
##         it to a list for you.
## Note: Mean no. genes in input = 13168
## Note: no. genes in output = 13168
## Note: estimated proportion of input genes in output = 1
yy_res <- mitch_calc(yy,gl_gsets,priority="significance",cores=8)
## Note: When prioritising by significance (ie: small
##             p-values), large effect sizes might be missed.
yy_res$enrichment_result %>%
  kbl(caption="Mitch enrichment of down genes with basemean levels",row.names=FALSE) %>%
  kable_paper("hover", full_width = F)
Mitch enrichment of down genes with basemean levels
set setSize pANOVA s.dist p.adjustANOVA
GL3 3683 0.0000000 -0.1935818 0.0000000
GL5 2958 0.0000000 0.2015515 0.0000000
GL2 31 0.0740629 -0.1854477 0.1234381
GL4 6366 0.1091896 0.0161283 0.1364870
GL6 130 0.5428393 0.0309698 0.5428393

The violin chart is used to understand whether the distribution of gene expression levels of the background gene list (NS) is similar to the distributrion of DEGs. If there is a big difference, it is bad, because we already know that genes with high expression in a cell type are likely to be very different ontologies when compared to lower expressed genes. In the unfiltered pair of violin charts, we see that there is a big difference in the distributions indicating that a lot of the enrichment results could be solely due to basemean expression levels, not related to the up or down regulation at all. When filtering is applied, we see that the difference between DEG and NS distributions is smaller.

deg_unfilt <- subset(def_unfilt,padj<0.01)
ns_unfilt <- subset(def_unfilt,padj>0.01)

rowmeans_deg_unfilt <- rowMeans(deg_unfilt[,7:12])
rowmeans_ns_unfilt <- rowMeans(ns_unfilt[,7:12])

viodata <- list("DEG"=log10(rowmeans_deg_unfilt+0.01),"NS"=log10(rowmeans_ns_unfilt+0.01))

deg_filt <- subset(def_filt,padj<0.01)
ns_filt <- subset(def_filt,padj>0.01)

rowmeans_deg_filt <- rowMeans(deg_filt[,7:12])
rowmeans_ns_filt <- rowMeans(ns_filt[,7:12])

message("Distribution of gene expression levels")
## Distribution of gene expression levels
viodata2 <- list("DEG unfilt"=log10(rowmeans_deg_unfilt+0.01),
  "NS unfilt"=log10(rowmeans_ns_unfilt+0.01),
  "DEG filt"=log10(rowmeans_deg_filt+0.01),
  "NS filt"=log10(rowmeans_ns_filt+0.01))

vioplot(viodata2,ylab="log10 mean expression",las=2,side="left")

lapply(viodata2,summary)
## $`DEG unfilt`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.5242  2.3205  2.8527  2.7249  3.2409  4.7594 
## 
## $`NS unfilt`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.502   1.243   2.221   2.077   2.799   4.901 
## 
## $`DEG filt`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   2.306   2.807   2.725   3.213   4.759 
## 
## $`NS filt`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   1.802   2.470   2.374   2.906   4.901

Mistake 2 - example dataset analysis

In this next analysis, we run parallel fora tests, one uses the correct background made up of detected genes (bg), while the other one uses all genes in the genome (marked with *).

The second euler diagram shown below corresponds to Figure 2C.

# up dn and bg are defined above
all <- unique(gtm$V2)

message("no. genes in each of the lists up, down, bg and all.")
## no. genes in each of the lists up, down, bg and all.
lapply(list(up,dn,bg,all),length)
## [[1]]
## [1] 1000
## 
## [[2]]
## [1] 1000
## 
## [[3]]
## [1] 13063
## 
## [[4]]
## [1] 55503
fres_up_bg <- fora(pathways=gsets, genes=up, universe=bg, minSize = 5, maxSize = Inf)
fres_up_bg <- subset(fres_up_bg,padj<0.01)$pathway
message("no up-regulated pathways with the correct background.")
## no up-regulated pathways with the correct background.
length(fres_up_bg)
## [1] 143
fres_up_all <- fora(pathways=gsets, genes=up, universe=all, minSize = 5, maxSize = Inf)
fres_up_all <- subset(fres_up_all,padj<0.01)$pathway
message("no up-regulated pathways with wg background.")
## no up-regulated pathways with wg background.
length(fres_up_all)
## [1] 528
fres_dn_bg <- fora(pathways=gsets, genes=dn, universe=bg, minSize = 5, maxSize = Inf)
fres_dn_bg <- subset(fres_dn_bg,padj<0.01)$pathway
message("no down-regulated pathways with the correct background.")
## no down-regulated pathways with the correct background.
length(fres_dn_bg)
## [1] 67
fres_dn_all <- fora(pathways=gsets, genes=dn, universe=all, minSize = 5, maxSize = Inf)
fres_dn_all <- subset(fres_dn_all,padj<0.01)$pathway
message("no down-regulated pathways with wg background.")
## no down-regulated pathways with wg background.
length(fres_dn_all)
## [1] 404
v1 <- list("up"=fres_up_bg, "up*"=fres_up_all, "dn"=fres_dn_bg, "dn*"=fres_dn_all)
message("no. significant pathways with correct and incorrect* background with direction specified.")
## no. significant pathways with correct and incorrect* background with direction specified.
par(mar=c(1,1,1,1))
par(mfrow=c(1,1))
plot(euler(v1),quantities = list(cex = 2), labels = list(cex = 2))

v2 <- list("correct background"=union(fres_up_bg,fres_dn_bg),
  "incorrect background"=union(fres_up_all, fres_dn_all) )

message("no. significant pathways with correct and incorrect* background. Direction not specified.")
## no. significant pathways with correct and incorrect* background. Direction not specified.
plot(euler(v2),quantities = list(cex = 2), labels = list(cex = 2))

par(mar=c(1,1,1,1))
unlink("img/2c.png")
png("img/2c.png", width = 480, height = 400)
plot(euler(v2),quantities = list(cex = 2), labels = list(cex = 2))
dev.off()
## png 
##   2
par(mfrow=c(1,1))

par(mar=c(1,1,1,1))
unlink("img/2c.pdf")
pdf("img/2c.pdf", width = 5, height = 5)
plot(euler(v2),quantities = list(cex = 2), labels = list(cex = 2))
dev.off()
## png 
##   2
par(mfrow=c(1,1))

par(mar=c(5.1,4.1,4.1,2.1))

Mistake 2 - simulation analysis

Here, we select genes randomly from the genes above the detection threshold (bg list) and compare to whole genome background.

There are ~444 gene sets significant on average with this approach, as the genes expressed in AML3 cells are distinctive compared to the whole genome annotation.

When using the correct background, a mean of 0.16 was obtained. This shows that false positives can be nearly eliminated by using this detection threshold.

This result corresponds to Figure 2A.

seeds <- seq(from=100,to=100*100,by=100)

m2sim <- mclapply(seeds, function(seed) {
  set.seed(seed)
  rand <- sample(x=bg,size=1000,replace=FALSE)
  sim_all <- fora(pathways=gsets, genes=rand, universe=all, minSize = 5, maxSize = Inf)
  sim_bg <- fora(pathways=gsets, genes=rand, universe=bg, minSize = 5, maxSize = Inf)
  sig_all <- nrow(subset(sim_all,padj<0.01))
  sig_bg <- nrow(subset(sim_bg,padj<0.01))
  c(sig_all,sig_bg)
},mc.cores=8)

m2sim <- do.call(rbind,m2sim)
colnames(m2sim) <- c("Whole genome background","Correct background")

par(mar=c(5.1,4.1,4.1,2.1))

boxplot(m2sim,main="Random gene sets drawn from BG",ylab="no. gene sets FDR<0.01",frame=FALSE)
mtext("100 simulations")
beeswarm(list(mean(m2sim[,1]), mean(m2sim[,2])),add=TRUE,pch=4,cex=2)
lab1 <- paste("Mean=",signif(mean(m2sim[,1]) , 3),sep="")
lab2 <- paste("Mean=",signif(mean(m2sim[,2]) , 2),sep="")
text(c(1.2,2.2), c( mean(m2sim[,1])+15, mean(m2sim[,2])+15 ), labels=c(lab1,lab2) )

par(mar=c(5.1,4.1,4.1,2.1))
unlink("img/2a.png")
png("img/2a.png", width = 480, height = 480)
boxplot(m2sim,main="Random gene sets drawn from BG",ylab="no. gene sets FDR<0.01",frame=FALSE)
mtext("100 simulations")
beeswarm(list(mean(m2sim[,1]), mean(m2sim[,2])),add=TRUE,pch=4,cex=2)
lab1 <- paste("Mean=",signif(mean(m2sim[,1]) , 3),sep="")
lab2 <- paste("Mean=",signif(mean(m2sim[,2]) , 2),sep="")
text(c(1.2,2.2), c( mean(m2sim[,1])+15, mean(m2sim[,2])+15 ), labels=c(lab1,lab2) )
dev.off()
## png 
##   2
par(mfrow=c(1,1))

par(mar=c(5.1,4.1,4.1,2.1))
unlink("img/2a.pdf")
pdf("img/2a.pdf", width = 5, height = 5)
boxplot(m2sim,main="Random gene sets drawn from BG",ylab="no. gene sets FDR<0.01",frame=FALSE)
mtext("100 simulations")
beeswarm(list(mean(m2sim[,1]), mean(m2sim[,2])),add=TRUE,pch=4,cex=2)
lab1 <- paste("Mean=",signif(mean(m2sim[,1]) , 3),sep="")
lab2 <- paste("Mean=",signif(mean(m2sim[,2]) , 2),sep="")
text(c(1.2,2.2), c( mean(m2sim[,1])+15, mean(m2sim[,2])+15 ), labels=c(lab1,lab2) )
dev.off()
## png 
##   2
par(mfrow=c(1,1))

par(mar=c(5.1,4.1,4.1,2.1))

In the simulation, which gene sets falsely come up most?

There’s 37 pathways that came up 100/100 simulations. This corresponds to Figure 2B.

m2simpw <- mclapply(seeds, function(seed) {
  set.seed(seed)
  rand <- sample(x=bg,size=1000,replace=FALSE)
  sim_all <- fora(pathways=gsets, genes=rand, universe=all, minSize = 5, maxSize = Inf)
  sig_all <- subset(sim_all,padj<0.01)$pathway
  return(sig_all)
},mc.cores=8)

m2tbl <- table(unlist((m2simpw)))
m2pw <- table(m2tbl)

plot(table(m2tbl),type="b",
  xlab="no. appearances as false positive, from 100 simulations",
  ylab="no. gene sets",
  main="1000 random genes drawn from expressed genes",
  bty="n"
)
set.seed(42) ; m2selected <- sample(names(which(m2tbl==100)),20)
text(100,seq(from=40,to=120,by=(120-40)/20),labels=m2selected,cex=0.8,pos=2)

par(mar=c(5.1,4.1,4.1,2.1))
unlink("img/2b.png")
png("img/2b.png", width = 480, height = 480)
plot(table(m2tbl),type="b",
  xlab="no. appearances as false positive, from 100 simulations",
  ylab="no. gene sets",
  main="1000 random genes drawn from expressed genes",
  bty="n"
)
set.seed(42) ; m2selected <- sample(names(which(m2tbl==100)),20)
text(100,seq(from=40,to=120,by=(120-40)/20),labels=m2selected,cex=0.8,pos=2)
dev.off()
## png 
##   2
par(mfrow=c(1,1))

par(mar=c(5.1,4.1,4.1,2.1))
unlink("img/2b.pdf")
pdf("img/2b.pdf", width = 5, height = 5)
plot(table(m2tbl),type="b",
  xlab="no. appearances as false positive, from 100 simulations",
  ylab="no. gene sets",
  main="1000 random genes drawn from expressed genes",
  bty="n"
)
set.seed(42) ; m2selected <- sample(names(which(m2tbl==100)),20)
text(100,seq(from=40,to=120,by=(120-40)/20),labels=m2selected,cex=0.7,pos=2)
dev.off()
## png 
##   2
par(mfrow=c(1,1))

Mistake 3 and 4 - not reporting enrichment scores and prioritisation only by p-values

Reporting criteria Method/resource used
Origin of gene sets Reactome (2023-03-06)
Tool used mitch (check version at foot of report)
Statistical test used Rank ANOVA (Kruskal Wallis)
P-value correction for multiple comparisons FDR method
Background list Genes with >=10 reads per sample on average across all samples
Gene Scoring DESeq2 Wald stat
Data availability via DEE2 at accession SRP038101 (human)
Other parameters Min gene set size of 5

This scatterplot shows how prioritising by FDR can lead to missing the gene sets with the biggest effect sizes. This corresponds to Figure 3.

def2 <- def
rownames(def2) <- sapply(strsplit(rownames(def)," "),'[[',1)

m <- mitch_import(x=def2,DEtype="deseq2",geneTable=gtnew)
## The input is a single dataframe; one contrast only. Converting
##         it to a list for you.
## Note: Mean no. genes in input = 13168
## Note: no. genes in output = 13063
## Note: estimated proportion of input genes in output = 0.992
mres <- mitch_calc(x=m,genesets=gsets,minsetsize=5,cores=4,priority="significance")
## Note: When prioritising by significance (ie: small
##             p-values), large effect sizes might be missed.
message("No. significant gene sets")
## No. significant gene sets
nrow(subset(mres$enrichment_result,p.adjustANOVA<0.01))
## [1] 336
message("No. gene sets included")
## No. gene sets included
nrow(subset(mres$enrichment_result,p.adjustANOVA<1))
## [1] 1840
mresdf <- mres$enrichment_result

sig <- subset(mresdf,p.adjustANOVA<0.01)

plot(abs(mresdf$s.dist),-log10(mresdf$pANOVA),
  main="enrichment scores versus significance",
  bty="n",pch=19,cex=1,col="darkgray",
  xlab="absolute enrichment score",
  ylab="-log10(p-value)",
  xlim=c(0,0.95))
points(abs(sig$s.dist),-log10(sig$pANOVA),
  pch=19,cex=1,col="red")
points(c(0.35,0.8),c(42,8),cex=17,pch=0)
text(c(0.35,0.8),c(42,8),c("generic","specific"))

par(mar=c(5.1,4.1,4.1,2.1))
unlink("img/3a.png")
png("img/3a.png", width = 480, height = 480)
plot(abs(mresdf$s.dist),-log10(mresdf$pANOVA),
  main="enrichment scores versus significance",
  bty="n",pch=19,cex=1,col="darkgray",
  xlab="absolute enrichment score",
  ylab="-log10(p-value)",
  xlim=c(0,0.95))
points(abs(sig$s.dist),-log10(sig$pANOVA),
  pch=19,cex=1,col="red")
points(c(0.35,0.8),c(41,8),cex=17,pch=0)
text(c(0.35,0.8),c(41,8),c("generic","specific"))
dev.off()
## png 
##   2
par(mfrow=c(1,1))
par(mar=c(5.1,4.1,4.1,2.1))

par(mar=c(5.1,4.1,4.1,2.1))
unlink("img/3a.pdf")
pdf("img/3a.pdf", width = 5, height = 5)
plot(abs(mresdf$s.dist),-log10(mresdf$pANOVA),
  main="enrichment scores versus significance",
  bty="n",pch=19,cex=1,col="darkgray",
  xlab="absolute enrichment score",
  ylab="-log10(p-value)",
  xlim=c(0,0.95))
points(abs(sig$s.dist),-log10(sig$pANOVA),
  pch=19,cex=1,col="red")
points(c(0.35,0.8),c(41,8),cex=17,pch=0)
text(c(0.35,0.8),c(41,8),c("generic","specific"))
dev.off()
## png 
##   2
par(mfrow=c(1,1))
par(mar=c(5.1,4.1,4.1,2.1))

Now, let’s take a look at the top results when prioritising with FDR only and with enrichment score.

FDR prioritisation emphasises larger and more generic functions.

ES prioritisation emphasises smaller, more specific categories with larger effect sizes.

To prove this, I go back to the DE table and fetch the log2 fold changes.

These tables correspond to Table 1 and 2.

mres_sig_up <- head(subset(mres$enrichment_result,s.dist>0),10)
mres_sig_dn <- head(subset(mres$enrichment_result,s.dist<0),10)

# get log2fc values
mres_sig_up$mean_lfc <- sapply(mres_sig_up$set[1:10], function(y) {
  genes <- gsets[[which(names(gsets) == y)]]
  gid <- unique(gtnew[match(genes,gtnew$V2),1])
  lfc <- def2[which(rownames(def2) %in% gid),"log2FoldChange"]
  return(mean(lfc))
})

mres_sig_dn$mean_lfc <- sapply(mres_sig_dn$set[1:10], function(y) {
  genes <- gsets[[which(names(gsets) == y)]]
  gid <- unique(gtnew[match(genes,gtnew$V2),1])
  lfc <- def2[which(rownames(def2) %in% gid),"log2FoldChange"]
  return(mean(lfc))
})

mres_sig_up_tbl <- apply(data.frame(mres_sig_up[,1:2],
  apply(mres_sig_up[,3:6],2,signif,2)),2,as.character)

mres_sig_dn_tbl <- apply(data.frame(mres_sig_dn[,1:2],
  apply(mres_sig_dn[,3:6],2,signif,2)),2,as.character)

mres_sig_up_tbl %>%
  kbl(caption="Top upregulated pathways when prioritised by FDR. ",row.names=FALSE) %>%
  kable_paper("hover", full_width = F)
Top upregulated pathways when prioritised by FDR.
set setSize pANOVA s.dist p.adjustANOVA mean_lfc
Cell Cycle 589 1.4e-47 0.35 2.6e-44 0.069
Metabolism of RNA 725 4.4e-46 0.31 4.0e-43 0.079
Cell Cycle, Mitotic 479 1.5e-39 0.35 9.1e-37 0.066
Cell Cycle Checkpoints 246 2.1e-27 0.40 9.6e-25 0.081
M Phase 336 9.5e-27 0.34 2.9e-24 0.065
Mitotic Prometaphase 189 4.6e-25 0.44 1.1e-22 0.120
Mitotic Metaphase and Anaphase 208 8.9e-24 0.40 1.8e-21 0.110
Mitotic Anaphase 207 2.3e-23 0.40 4.3e-21 0.110
Processing of Capped Intron-Containing Pre-mRNA 273 6.6e-23 0.35 1.1e-20 0.084
Resolution of Sister Chromatid Cohesion 114 1.0e-20 0.51 1.6e-18 0.140
mres_sig_dn_tbl %>%
  kbl(caption="Top downregulated pathways when prioritised by FDR",row.names=FALSE) %>%
  kable_paper("hover", full_width = F)
Top downregulated pathways when prioritised by FDR
set setSize pANOVA s.dist p.adjustANOVA mean_lfc
Innate Immune System 747 4.3e-27 -0.23 1.6e-24 -0.23
Immune System 1434 5.8e-26 -0.17 1.5e-23 -0.23
Interferon alpha/beta signaling 60 1.6e-19 -0.67 2.3e-17 -1.00
Neutrophil degranulation 382 1.4e-18 -0.26 1.7e-16 -0.27
Interferon gamma signaling 67 1.8e-15 -0.56 1.6e-13 -0.64
Extracellular matrix organization 154 1.4e-13 -0.35 8.5e-12 -0.48
Cytokine Signaling in Immune system 567 2.1e-12 -0.17 1.0e-10 -0.30
Platelet activation, signaling and aggregation 174 9.1e-12 -0.30 3.6e-10 -0.26
Signaling by Interleukins 327 2.0e-10 -0.21 5.7e-09 -0.25
Immunoregulatory interactions between a Lymphoid and a non-Lymphoid cell 60 5.1e-10 -0.46 1.3e-08 -0.70
mres <- mitch_calc(x=m,genesets=gsets,minsetsize=5,cores=4,priority="effect")
## Note: Enrichments with large effect sizes may not be
##             statistically significant.
mres_es_up <- head(subset(mres$enrichment_result,s.dist>0 & p.adjustANOVA < 0.01),10)
mres_es_dn <- head(subset(mres$enrichment_result,s.dist<0 & p.adjustANOVA < 0.01),10)

# get log2fc values
mres_es_up$mean_lfc <- sapply(mres_es_up$set[1:10], function(y) {
  genes <- gsets[[which(names(gsets) == y)]]
  gid <- unique(gtnew[match(genes,gtnew$V2),1])
  lfc <- def2[which(rownames(def2) %in% gid),"log2FoldChange"]
  return(mean(lfc))
})

mres_es_dn$mean_lfc <- sapply(mres_es_dn$set[1:10], function(y) {
  genes <- gsets[[which(names(gsets) == y)]]
  gid <- unique(gtnew[match(genes,gtnew$V2),1])
  lfc <- def2[which(rownames(def2) %in% gid),"log2FoldChange"]
  return(mean(lfc))
})

mres_es_up_tbl <- apply(data.frame(mres_es_up[,1:2],
  apply(mres_es_up[,3:6],2,signif,2)),2,as.character)

mres_es_dn_tbl <- apply(data.frame(mres_es_dn[,1:2],
  apply(mres_es_dn[,3:6],2,signif,2)),2,as.character)

mres_es_up_tbl %>%
  kbl(caption="Top upregulated pathways when prioritised by ES (discarding any with FDR>0.01)",row.names=FALSE) %>%
  kable_paper("hover", full_width = F)
Top upregulated pathways when prioritised by ES (discarding any with FDR>0.01)
set setSize pANOVA s.dist p.adjustANOVA mean_lfc
Activation of NOXA and translocation to mitochondria 5 1.1e-03 0.84 6.9e-03 0.26
Condensation of Prometaphase Chromosomes 11 2.5e-06 0.82 3.5e-05 0.26
Postmitotic nuclear pore complex (NPC) reformation 27 1.8e-11 0.75 6.4e-10 0.21
Phosphorylation of Emi1 6 1.6e-03 0.75 8.8e-03 0.24
Interactions of Rev with host cellular proteins 37 6.8e-15 0.74 5.2e-13 0.20
Nuclear import of Rev protein 34 1.7e-13 0.73 9.9e-12 0.20
Rev-mediated nuclear export of HIV RNA 35 1.0e-13 0.73 6.3e-12 0.20
Transport of Ribonucleoproteins into the Host Nucleus 32 2.1e-12 0.72 1.0e-10 0.20
Export of Viral Ribonucleoproteins from Nucleus 32 2.9e-12 0.71 1.3e-10 0.19
NEP/NS2 Interacts with the Cellular Export Machinery 32 2.9e-12 0.71 1.3e-10 0.19
mres_es_dn_tbl %>%
  kbl(caption="Top upregulated pathways when prioritised by ES (discarding any with FDR>0.01)",row.names=FALSE) %>%
  kable_paper("hover", full_width = F)
Top upregulated pathways when prioritised by ES (discarding any with FDR>0.01)
set setSize pANOVA s.dist p.adjustANOVA mean_lfc
Prostanoid ligand receptors 5 9.7e-04 -0.85 6.2e-03 -1.90
Interleukin-9 signaling 6 6.0e-04 -0.81 4.3e-03 -1.00
Interleukin-2 signaling 7 3.5e-04 -0.78 2.7e-03 -0.76
Interleukin-21 signaling 8 2.6e-04 -0.75 2.2e-03 -0.93
Specification of primordial germ cells 6 1.7e-03 -0.74 9.6e-03 -0.69
Fatty Acids bound to GPR40 (FFAR1) regulate insulin secretion 6 1.8e-03 -0.74 9.9e-03 -0.60
Interleukin-20 family signaling 12 3.3e-05 -0.69 3.8e-04 -0.79
Interferon alpha/beta signaling 60 1.6e-19 -0.67 2.3e-17 -1.00
Cross-presentation of particulate exogenous antigens (phagosomes) 8 1.2e-03 -0.66 7.1e-03 -0.38
TNFs bind their physiological receptors 10 4.8e-04 -0.64 3.5e-03 -0.49
m4tbls <- list("esup"=mres_es_up_tbl,"esdn"=mres_es_dn_tbl,"sigup"=mres_sig_up_tbl,"sigdn"=mres_sig_dn_tbl)
saveRDS(m4tbls,"m4tbls.Rds")


message("Are mean log2FC values for ES prioritisation bigger than FDR approach?")
## Are mean log2FC values for ES prioritisation bigger than FDR approach?
message("Are mean log2FC values for ES prioritisation (UP)")
## Are mean log2FC values for ES prioritisation (UP)
mean(abs(mres_es_up$mean_lfc ))
## [1] 0.2169444
message("Are mean log2FC values for FDR prioritisation (UP)")
## Are mean log2FC values for FDR prioritisation (UP)
mean(abs(mres_sig_up$mean_lfc ))
## [1] 0.0914005
message("ES:FDR ratio (UP)")
## ES:FDR ratio (UP)
mean(abs(mres_es_up$mean_lfc )) / mean(abs(mres_sig_up$mean_lfc ))
## [1] 2.373559
message("Are mean log2FC values for ES prioritisation (DOWN)")
## Are mean log2FC values for ES prioritisation (DOWN)
mean(abs(mres_es_dn$mean_lfc ))
## [1] 0.8562915
message("Are mean log2FC values for FDR prioritisation (DOWN)")
## Are mean log2FC values for FDR prioritisation (DOWN)
mean(abs(mres_sig_dn$mean_lfc ))
## [1] 0.439364
message("ES:FDR ratio (DOWN)")
## ES:FDR ratio (DOWN)
mean(abs(mres_es_dn$mean_lfc )) / mean(abs(mres_sig_dn$mean_lfc ))
## [1] 1.948934

Make a barplot

par(mar=c(2,20,2,0))

mres_sig_up_vals <- -log(mres_sig_up$pANOVA)
names(mres_sig_up_vals) <- mres_sig_up$set

mres_sig_dn_vals <- -log(mres_sig_dn$pANOVA)
names(mres_sig_dn_vals) <- mres_sig_dn$set

mres_es_up_vals <- mres_es_up$s.dist
names(mres_es_up_vals) <- mres_es_up$set

mres_es_dn_vals <- -1 * mres_es_dn$s.dist
names(mres_es_dn_vals) <- mres_es_dn$set

par(mfrow=c(2,1))

barplot( mres_sig_up_vals,
  horiz=TRUE,las=1,cex.names=0.65,col="gray",
  main="Up Sig (-log10 p-value)",
  xlab="-log10 p-value",
  cex.main=0.9)

barplot( mres_es_up_vals,
  horiz=TRUE,las=1,cex.names=0.65,col="gray",
  main="Up ES",
  xlab="ES",
  cex.main=0.9)

barplot( mres_sig_dn_vals,
  horiz=TRUE,las=1,cex.names=0.65,col="gray",
  main="Down Sig (-log10 p-value)",
  xlab="-log10 p-value",
  cex.main=0.9)

barplot( mres_es_dn_vals,
  horiz=TRUE,las=1,cex.names=0.65,col="gray",
  main="Down ES",
  xlab="-ES",
  cex.main=0.9)

par(mfrow=c(1,1))
par(mar=c(5.1,4.1,4.1,2.1))

Mistake 5 - gene set list size too small or long

ORA is known to have variable results when using gene lists of different length. I will vary the foreground gene length from 50 to 7000 and see how many significant hits I get.

up <- rownames(subset(def, log2FoldChange>0 & padj<0.01))
up <- sapply(strsplit(up," "),'[[',1)
up <- unique(gtnew[match(up,gtnew$V1),2])
up <- up[which(!is.na(up))]

dn <- rownames(subset(def, log2FoldChange<0 & padj<0.01))
dn <- sapply(strsplit(dn," "),'[[',1)
dn <- unique(gtnew[match(dn,gtnew$V1),2])
dn <- dn[which(!is.na(dn))]

bg <- rownames(def)
bg <- sapply(strsplit(bg," "),'[[',1)
bg <- unique(gtnew[match(bg,gtnew$V1),2])
bg <- bg[which(!is.na(bg))]

message("Length of up, down and background gene lists. Only for context.")
## Length of up, down and background gene lists. Only for context.
lapply(list(up,dn,bg),length)
## [[1]]
## [1] 1062
## 
## [[2]]
## [1] 1431
## 
## [[3]]
## [1] 13063
sizes <- c(50,100,200,300,400,500,600,700,800,900,1000,1500,2000,2500,3000,4000,5000,6000,7000)

fup <- nrow(subset(fora(pathways=gsets, genes=up, universe=bg, minSize = 5, maxSize = Inf),padj<0.01))
fdn <- nrow(subset(fora(pathways=gsets, genes=dn, universe=bg, minSize = 5, maxSize = Inf),padj<0.01))

nups <- unlist(lapply(sizes, function(i) {
  ids <- rownames(subset(def2, log2FoldChange>0))
  ids <- head(ids,i)
  genes <- unique(gtnew[match(ids,gtnew$V1),2])
  fres <- fora(pathways=gsets, genes=genes, universe=bg, minSize = 5, maxSize = Inf)
  nrow(subset(fres,padj<0.01))
}))
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
ndns <- unlist(lapply(sizes, function(i) {
  ids <- rownames(subset(def2, log2FoldChange<0))
  ids <- head(ids,i)
  genes <- unique(gtnew[match(ids,gtnew$V1),2])
  fres <- fora(pathways=gsets, genes=genes, universe=bg, minSize = 5, maxSize = Inf)
  nrow(subset(fres,padj<0.01))
}))
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
df <- data.frame("up"=nups,"dn"=ndns,row.names=sizes)
df$tot <- df$up + df$dn
df %>%
  kbl(caption="Table XX. Number of differentially regulated gene sets when varying the foreground gene list length.") %>%
  kable_paper("hover", full_width = F)
Table XX. Number of differentially regulated gene sets when varying the foreground gene list length.
up dn tot
50 0 23 23
100 0 32 32
200 2 39 41
300 23 43 66
400 52 36 88
500 74 37 111
600 97 42 139
700 123 50 173
800 117 56 173
900 127 64 191
1000 143 67 210
1500 169 66 235
2000 190 61 251
2500 205 85 290
3000 220 74 294
4000 202 78 280
5000 160 43 203
6000 134 37 171
7000 130 20 150
dn_fes <- lapply(sizes, function(i) {
  ids <- rownames(subset(def2, log2FoldChange<0))
  ids <- head(ids,i)
  genes <- unique(gtnew[match(ids,gtnew$V1),2])
  fres <- fora(pathways=gsets, genes=genes, universe=bg, minSize = 5, maxSize = Inf)
  subset(fres,padj<0.01)$foldEnrichment
})
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
up_fes <- lapply(sizes, function(i) {
  ids <- rownames(subset(def2, log2FoldChange>0))
  ids <- head(ids,i)
  genes <- unique(gtnew[match(ids,gtnew$V1),2])
  fres <- fora(pathways=gsets, genes=genes, universe=bg, minSize = 5, maxSize = Inf)
  subset(fres,padj<0.01)$foldEnrichment
})
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
## Warning in fora(pathways = gsets, genes = genes, universe = bg, minSize = 5, :
## Not all of the input genes belong to the universe, such genes were removed
fes_df <- data.frame(sapply(up_fes,mean),sapply(dn_fes,mean))
rownames(fes_df) <- sizes
fes_df %>%
  kbl(caption="Table XX. Mean enrichment scores of differentially regulated gene sets when varying the foreground gene list length.") %>%
  kable_paper("hover", full_width = F)
Table XX. Mean enrichment scores of differentially regulated gene sets when varying the foreground gene list length.
sapply.up_fes..mean. sapply.dn_fes..mean.
50 NaN 27.185294
100 NaN 17.490674
200 3.397553 10.406799
300 7.020874 8.611209
400 6.515221 6.882359
500 5.128102 5.978592
600 4.428938 5.260855
700 4.215175 4.463295
800 3.773542 4.167073
900 3.613126 4.228704
1000 3.600386 3.836674
1500 3.080299 3.460900
2000 2.933832 2.872457
2500 2.650952 2.540473
3000 2.365507 2.226896
4000 2.050420 1.888231
5000 1.801817 1.644650
6000 1.670269 1.516452
7000 1.580148 1.426359
nups3 <- unlist(lapply(up_fes,function(x) { length(which(x>=3)) }))
ndns3 <- unlist(lapply(dn_fes,function(x) { length(which(x>=3)) }))
df3 <- data.frame("up3"=nups3,"dn3"=ndns3,row.names=sizes)
df3$tot3 <- df3$up3 + df3$dn3
df3 %>%
  kbl(caption="Table XX. No differentially regulated gene sets meeting the FES threshold of 3 when varying the foreground gene list length.") %>%
  kable_paper("hover", full_width = F)
Table XX. No differentially regulated gene sets meeting the FES threshold of 3 when varying the foreground gene list length.
up3 dn3 tot3
50 0 23 23
100 0 31 31
200 2 36 38
300 23 36 59
400 45 29 74
500 64 28 92
600 81 31 112
700 101 31 132
800 90 33 123
900 83 40 123
1000 90 36 126
1500 72 35 107
2000 80 23 103
2500 60 18 78
3000 34 9 43
4000 1 0 1
5000 0 0 0
6000 0 0 0
7000 0 0 0

Now to make a line chart of this data. The first chart is the whole range of data, and the second is a zoom in to 50 to 500 genes. The line chart shown below corresponds to Figure 4.

plot(1:nrow(df),df$up,type="b",col="red", pch=19, las=1,
  ylab="no. gene sets with FDR<0.01",xlab="gene list length",
  bty="n",xaxt = "n")
axis(1, at = 1:nrow(df), labels=sizes, cex.axis=1,las=2)
lines(1:nrow(df),df$dn,type="b",col="blue",pch=19)
lines(1:nrow(df3),df3$dn3,type="b",col="lightblue",pch=19)
lines(1:nrow(df3),df3$up3,type="b",col="pink",pch=19)
legend("topleft", legend=c("Up", "Down","Up,FES>3","Down,FES>3"),
       col=c("red", "blue","pink","lightblue"), lty=1, cex=1,pch=19)
grid()

unlink("img/5m.png")
png("img/5m.png", width = 540, height = 480)
plot(1:nrow(df),df$up,type="b",col="red", pch=19, las=1,
  ylab="no. gene sets with FDR<0.01",xlab="gene list length",
  bty="n",xaxt = "n")
axis(1, at = 1:nrow(df), labels=sizes, cex.axis=1,las=2)
lines(1:nrow(df),df$dn,type="b",col="blue",pch=19)
lines(1:nrow(df3),df3$dn3,type="b",col="lightblue",pch=19)
lines(1:nrow(df3),df3$up3,type="b",col="pink",pch=19)
legend("topleft", legend=c("Up", "Down","Up,FES>3","Down,FES>3"),
       col=c("red", "blue","pink","lightblue"), lty=1, cex=1,pch=19)
grid()
dev.off()
## png 
##   2
unlink("img/5m.pdf")
pdf("img/5m.pdf", width = 7, height = 5)
plot(1:nrow(df),df$up,type="b",col="red", pch=19, las=1,
  ylab="no. gene sets with FDR<0.01",xlab="gene list length",
  bty="n",xaxt = "n")
axis(1, at = 1:nrow(df), labels=sizes, cex.axis=1,las=2)
lines(1:nrow(df),df$dn,type="b",col="blue",pch=19)
lines(1:nrow(df3),df3$dn3,type="b",col="lightblue",pch=19)
lines(1:nrow(df3),df3$up3,type="b",col="pink",pch=19)
legend("topleft", legend=c("Up", "Down","Up,FES>3","Down,FES>3"),
       col=c("red", "blue","pink","lightblue"), lty=1, cex=1,pch=19)
grid()
dev.off()
## png 
##   2

Mistake 6 - combining up and downregulated genes into the same test.

Combining up and down into a single test can reduce the number of results significantly. This is because gene sets show typically have a predominantly pattern of up or down regulation but not both. Gene list length is 750 due to findings for mistake 5. The combined approach discovers only a few unique sets. The euler plot shown below corresponds to Figure 5.

up <- head(rownames(subset(def, log2FoldChange>0)),752)
up <- sapply(strsplit(up," "),'[[',1)
up <- unique(gtnew[match(up,gtnew$V1),2])
up <- up[which(!is.na(up))]
message("No. up genes.")
## No. up genes.
length(up)
## [1] 750
dn <- head(rownames(subset(def, log2FoldChange<0)),750)
dn <- sapply(strsplit(dn," "),'[[',1)
dn <- unique(gtnew[match(dn,gtnew$V1),2])
dn <- dn[which(!is.na(dn))]
message("No. down genes.")
## No. down genes.
length(dn)
## [1] 750
message("No. unique DE genes.")
## No. unique DE genes.
updn <- head(rownames(def),751)
updn <- sapply(strsplit(updn," "),'[[',1)
updn <- unique(gtnew[match(updn,gtnew$V1),2])
updn <- updn[which(!is.na(updn))]
length(updn)
## [1] 750
message("no. significant sets using the combined test:")
## no. significant sets using the combined test:
fres_updn_bg <- fora(pathways=gsets, genes=updn, universe=bg, minSize = 5, maxSize = Inf)
fres_updn_bg <- subset(fres_updn_bg,padj<0.01)$pathway
length(fres_updn_bg)
## [1] 31
message("no. significant upregulated sets using the separate test:")
## no. significant upregulated sets using the separate test:
fres_up_bg <- fora(pathways=gsets, genes=up, universe=bg, minSize = 5, maxSize = Inf)
fres_up_bg <- subset(fres_up_bg,padj<0.01)$pathway
length(fres_up_bg)
## [1] 117
message("no. significant downregulated sets using the separate test:")
## no. significant downregulated sets using the separate test:
fres_dn_bg <- fora(pathways=gsets, genes=dn, universe=bg, minSize = 5, maxSize = Inf)
fres_dn_bg <- subset(fres_dn_bg,padj<0.01)$pathway
length(fres_dn_bg)
## [1] 56
message("no. significant sets using the separate tests:")
## no. significant sets using the separate tests:
length(unique(c(fres_up_bg,fres_dn_bg)))
## [1] 171
v1 <- list("up"=fres_up_bg,"dn"=fres_dn_bg,"combined"=fres_updn_bg)
plot(euler(v1),quantities = list(cex = 2), labels = list(cex = 2))

unlink("img/6m. png")
png("img/6m.png", width = 480, height = 300)
plot(euler(v1),quantities = list(cex = 2), labels = list(cex = 2))
dev.off()
## png 
##   2
unlink("img/6m. pdf")
pdf("img/6m.pdf", width = 5, height = 4)
plot(euler(v1),quantities = list(cex = 2), labels = list(cex = 2))
dev.off()
## png 
##   2

Mistake 7 - using shallow gene annotations

Here, the idea is to show that the gene set database being used makes a huge difference to the quality of the results. In particular, the more comprehensive the gene sets, the more comprehensive the pathway enrichment will be. This corresponds to Table 3.

message("no. up genes for ORA test")
## no. up genes for ORA test
length(up)
## [1] 750
message("no. dn genes for ORA test")
## no. dn genes for ORA test
length(dn)
## [1] 750
message("no. BG genes for ORA test")
## no. BG genes for ORA test
length(bg)
## [1] 13063
msdb <- gmtPathways("ref/msigdb/msigdb.v2025.1.Hs.symbols.gmt")
kegg <- msdb[grep("KEGG",names(msdb))]
kegg_med <- msdb[grep("KEGG_MED",names(msdb))]
kegg <- kegg[grep("KEGG_MED",names(kegg),invert=TRUE)] #separate KEGG LEG from KEGG MED
gobp <- msdb[grep("GOBP",names(msdb))]
reactome <- msdb[grep("REACTOME",names(msdb))]
gslist <- list("KEGG"=kegg,"KEGGM"=kegg_med,"Reactome"=reactome,"GOBP"=gobp,"MSigDB"=msdb)

# no. gene sets
d1 <- unlist( lapply(gslist,length) )

# total number of annotations
d2 <- unlist( lapply(gslist,function(i) { length(unlist(i)) } ) )

# median gene set size
d3 <- unlist(lapply(gslist,function(i) { median(unlist(lapply(i,length))) } ))

# number of genes with one or more functions
d4 <- unlist( lapply(gslist,function(i) { length(unique(unlist(i))) } ))

kegg_up <- fora(pathways=kegg, genes=up, universe=bg, minSize = 5, maxSize = Inf)
kegg_dn <- fora(pathways=kegg, genes=dn, universe=bg, minSize = 5, maxSize = Inf)

message("no. significantly upregulated KEGG pathways")
## no. significantly upregulated KEGG pathways
nrow(subset(kegg_up,padj<0.01))
## [1] 3
message("no. significantly downregulated KEGG pathways")
## no. significantly downregulated KEGG pathways
nrow(subset(kegg_dn,padj<0.01))
## [1] 23
message("no. significantly dysregulated KEGG pathways")
## no. significantly dysregulated KEGG pathways
sum(nrow(subset(kegg_up,padj<0.01)), nrow(subset(kegg_dn,padj<0.01) ))
## [1] 26
kegg_med_up <- fora(pathways=kegg_med, genes=up, universe=bg, minSize = 5, maxSize = Inf)
kegg_med_dn <- fora(pathways=kegg_med, genes=dn, universe=bg, minSize = 5, maxSize = Inf)
message("no. significantly upregulated KEGG medicus pathways")
## no. significantly upregulated KEGG medicus pathways
nrow(subset(kegg_med_up,padj<0.01))
## [1] 2
message("no. significantly downregulated KEGG medicus pathways")
## no. significantly downregulated KEGG medicus pathways
nrow(subset(kegg_med_dn,padj<0.01))
## [1] 5
message("no. significantly dysregulated KEGG medicus pathways")
## no. significantly dysregulated KEGG medicus pathways
sum(nrow(subset(kegg_med_up,padj<0.01)), nrow(subset(kegg_med_dn,padj<0.01) ))
## [1] 7
reactome_up <- fora(pathways=reactome, genes=up, universe=bg, minSize = 5, maxSize = Inf)
reactome_dn <- fora(pathways=reactome, genes=dn, universe=bg, minSize = 5, maxSize = Inf)
message("no. significantly upregulated Reactome pathways")
## no. significantly upregulated Reactome pathways
nrow(subset(reactome_up,padj<0.01))
## [1] 79
message("no. significantly downregulated Reactome pathways")
## no. significantly downregulated Reactome pathways
nrow(subset(reactome_dn,padj<0.01))
## [1] 45
message("no. significantly dysregulated Reactome pathways")
## no. significantly dysregulated Reactome pathways
sum(nrow(subset(reactome_up,padj<0.01)), nrow(subset(reactome_dn,padj<0.01) ))
## [1] 124
gobp_up <- fora(pathways=gobp, genes=up, universe=bg, minSize = 5, maxSize = Inf)
gobp_dn <- fora(pathways=gobp, genes=dn, universe=bg, minSize = 5, maxSize = Inf)
message("no. significantly upregulated GO BP terms")
## no. significantly upregulated GO BP terms
nrow(subset(gobp_up,padj<0.01))
## [1] 176
message("no. significantly upregulated GO BP terms")
## no. significantly upregulated GO BP terms
nrow(subset(gobp_dn,padj<0.01))
## [1] 624
message("no. significantly dysregulated GO BP terms")
## no. significantly dysregulated GO BP terms
sum(nrow(subset(gobp_up,padj<0.01)), nrow(subset(gobp_dn,padj<0.01) ))
## [1] 800
msdb_up <- fora(pathways=msdb, genes=up, universe=bg, minSize = 5, maxSize = Inf)
msdb_dn <- fora(pathways=msdb, genes=dn, universe=bg, minSize = 5, maxSize = Inf)

message("no. significantly upregulated MSigDB gene sets")
## no. significantly upregulated MSigDB gene sets
nrow(subset(msdb_up,padj<0.01))
## [1] 1600
message("no. significantly downregulated MSigDB gene sets")
## no. significantly downregulated MSigDB gene sets
nrow(subset(msdb_dn,padj<0.01))
## [1] 3766
message("no. significantly dysregulated MSigDB gene sets")
## no. significantly dysregulated MSigDB gene sets
sum(nrow(subset(msdb_up,padj<0.01)), nrow(subset(msdb_dn,padj<0.01) ))
## [1] 5366
up_pw <- c(nrow(subset(kegg_up,padj<0.01)),nrow(subset(kegg_med_up,padj<0.01)),
  nrow(subset(reactome_up,padj<0.01)), nrow(subset(gobp_up,padj<0.01)),
  nrow(subset(msdb_up,padj<0.01)) )

dn_pw <- c(nrow(subset(kegg_dn,padj<0.01)), nrow(subset(kegg_med_dn,padj<0.01)),
  nrow(subset(reactome_dn,padj<0.01)), nrow(subset(gobp_dn,padj<0.01)),
  nrow(subset(msdb_dn,padj<0.01)) )

m8tbl <- data.frame(d1,d2,d3,d4,up_pw, dn_pw)

colnames(m8tbl) <- c("No. gene sets","Total no. annotations","Median gene set size",
  "No. genes with ≥1 annotation","Up-regulated","Down-regulated")

m8tbl %>%
  kbl(caption="Table XX. Gene set metrics, and number of significantly differentially regulated pathways (FDR<0.01).") %>%
  kable_paper("hover", full_width = F)
Table XX. Gene set metrics, and number of significantly differentially regulated pathways (FDR<0.01).
No. gene sets Total no. annotations Median gene set size No. genes with ≥1 annotation Up-regulated Down-regulated
KEGG 186 12800 52.5 5245 3 23
KEGGM 658 9662 11.5 2788 2 5
Reactome 1787 97544 23.0 11369 79 45
GOBP 7583 616242 20.0 18000 176 624
MSigDB 35134 4089406 47.0 43351 1600 3766
saveRDS(m8tbl,"m8tbl.Rds")

Mistake 8 - outated gene sets

Here, we want to show how the depth of the gene set library increases over time. To achieve this, I downloaded all the Reactome gene sets that were available in MSigDB going back to 2010. I also included the most recent gene set library from Reactome directly - the one we used above. There are a few ways to show the gene set library size including number of gene sets, mean set size, total number of annotations, and number of unique genes included. The charts below show that the number of gene sets increases over time, and so does the total number of annotations, and the gene coverage. For simplicity, in the article we just show the number of gene sets. This corresponds to Figure 6.

par(mar=c(5.1,5.1,4.1,2.1))

reactome_files <- list.files("ref/msigdb/reactome/",full.names=TRUE)
reactome_files <- reactome_files[c(6:20,1:5)]
reactome_files <- c(reactome_files,"ref/ReactomePathways_2025-09-02.gmt")

reactomes <- lapply(reactome_files,gmtPathways)

names(reactomes) <- c("2010_09","2012_09","2013_05","2015_04","2016_01","2016_09","2017_04",
  "2017_10","2018_07","2019_08","2020_03","2020_09","2021_03","2021_04","2022_01","2022_09",
  "2023_03","2023_10","2024_08","2025_06","2025_09")

nogs <- unlist(lapply(reactomes,length))
nogs
## 2010_09 2012_09 2013_05 2015_04 2016_01 2016_09 2017_04 2017_10 2018_07 2019_08 
##     430     674     674     674     674     674     674     674     674    1499 
## 2020_03 2020_09 2021_03 2021_04 2022_01 2022_09 2023_03 2023_10 2024_08 2025_06 
##    1532    1554    1569    1604    1615    1635    1654    1692    1736    1787 
## 2025_09 
##    2785
# remove sets with 4 or fewer genes
reactomes_filt <- lapply(reactomes,function(gs) {
  l <- lapply(gs,length)
  gs[which(l>=5)]
})
nogs <- unlist(lapply(reactomes_filt,length))
nogs
## 2010_09 2012_09 2013_05 2015_04 2016_01 2016_09 2017_04 2017_10 2018_07 2019_08 
##     430     674     674     674     674     674     674     674     674    1499 
## 2020_03 2020_09 2021_03 2021_04 2022_01 2022_09 2023_03 2023_10 2024_08 2025_06 
##    1532    1554    1569    1604    1615    1635    1654    1692    1736    1787 
## 2025_09 
##    2159
barplot(nogs,las=2,ylim=c(0,2500),ylab="no. gene sets",main="Reactome gene sets over time",
  border=NA)
lapply(seq(0,2500,500),function(h) { abline(h=h,lty=2,col="gray") })

## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL
## 
## [[5]]
## NULL
## 
## [[6]]
## NULL
unlink("img/9m.png")
png("img/9m.png", width = 480, height = 300)
barplot(nogs,las=2,ylim=c(0,2500),ylab="no. gene sets",main="Reactome gene sets over time",
  border=NA,col="darkgray")
lapply(seq(0,2500,500),function(h) { abline(h=h,lty=2,col="lightgray") })
## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL
## 
## [[5]]
## NULL
## 
## [[6]]
## NULL
dev.off()
## png 
##   2
unlink("img/9m.pdf")
pdf("img/9m.pdf", width = 5, height = 4)
barplot(nogs,las=2,ylim=c(0,2500),ylab="no. gene sets",main="Reactome gene sets over time",
  border=NA,col="darkgray")
lapply(seq(0,2500,500),function(h) { abline(h=h,lty=2,col="lightgray") })
## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL
## 
## [[5]]
## NULL
## 
## [[6]]
## NULL
dev.off()
## png 
##   2
reactomes <- reactomes_filt

tot_anno <- unlist(lapply(reactomes,function(gs) {
  length(unlist(gs))
}))
tot_anno
## 2010_09 2012_09 2013_05 2015_04 2016_01 2016_09 2017_04 2017_10 2018_07 2019_08 
##   21362   37601   37601   37601   37601   37601   37601   37601   37601   85647 
## 2020_03 2020_09 2021_03 2021_04 2022_01 2022_09 2023_03 2023_10 2024_08 2025_06 
##   87352   88301   86752   89457   89476   92250   92780   95226   97590   97544 
## 2025_09 
##  134094
barplot(tot_anno,las=2,ylim=c(0,140000),main="Total no. annotations")

gene_coverage <- unlist(lapply(reactomes,function(gs) {
  length(unique(unlist(gs)))
}))
gene_coverage
## 2010_09 2012_09 2013_05 2015_04 2016_01 2016_09 2017_04 2017_10 2018_07 2019_08 
##    4159    6025    6025    6025    6025    6025    6025    6025    6025   10718 
## 2020_03 2020_09 2021_03 2021_04 2022_01 2022_09 2023_03 2023_10 2024_08 2025_06 
##   10807   10821   10826   10968   10646   11013   11065   11155   11290   11369 
## 2025_09 
##   11886
barplot(gene_coverage,las=2,ylim=c(0,12000),ylab="gene coverage")

Session information

For reproducibility


Click HERE to show session info

save.image("analysis.Rdata")
sessionInfo()
## R version 4.5.1 (2025-06-13)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 24.04.2 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so;  LAPACK version 3.12.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Etc/UTC
## tzcode source: system (glibc)
## 
## attached base packages:
##  [1] parallel  stats4    grid      stats     graphics  grDevices utils    
##  [8] datasets  methods   base     
## 
## other attached packages:
##  [1] vioplot_0.5.1               zoo_1.8-14                 
##  [3] sm_2.2-6.0                  beeswarm_0.4.0             
##  [5] RhpcBLASctl_0.23-42         fgsea_1.34.2               
##  [7] mitch_1.20.0                gplots_3.2.0               
##  [9] eulerr_7.0.4                DESeq2_1.48.2              
## [11] SummarizedExperiment_1.38.1 Biobase_2.68.0             
## [13] MatrixGenerics_1.20.0       matrixStats_1.5.0          
## [15] GenomicRanges_1.60.0        GenomeInfoDb_1.44.3        
## [17] IRanges_2.42.0              S4Vectors_0.46.0           
## [19] BiocGenerics_0.54.1         generics_0.1.4             
## [21] getDEE2_1.18.0              png_0.1-8                  
## [23] kableExtra_1.4.0           
## 
## loaded via a namespace (and not attached):
##  [1] bitops_1.0-9            gridExtra_2.3           echarts4r_0.4.6        
##  [4] rlang_1.1.6             magrittr_2.0.3          compiler_4.5.1         
##  [7] polylabelr_0.3.0        systemfonts_1.3.1       vctrs_0.6.5            
## [10] reshape2_1.4.4          htm2txt_2.2.2           stringr_1.5.1          
## [13] pkgconfig_2.0.3         crayon_1.5.3            fastmap_1.2.0          
## [16] XVector_0.48.0          rmdformats_1.0.4        caTools_1.18.3         
## [19] promises_1.3.3          rmarkdown_2.29          UCSC.utils_1.4.0       
## [22] network_1.19.0          purrr_1.1.0             xfun_0.53              
## [25] cachem_1.1.0            jsonlite_2.0.0          later_1.4.4            
## [28] DelayedArray_0.34.1     BiocParallel_1.42.2     R6_2.6.1               
## [31] bslib_0.9.0             stringi_1.8.7           RColorBrewer_1.1-3     
## [34] GGally_2.4.0            jquerylib_0.1.4         Rcpp_1.1.0             
## [37] bookdown_0.45           knitr_1.50              httpuv_1.6.16          
## [40] Matrix_1.7-4            tidyselect_1.2.1        rstudioapi_0.17.1      
## [43] abind_1.4-8             yaml_2.3.10             codetools_0.2-20       
## [46] lattice_0.22-7          tibble_3.3.0            plyr_1.8.9             
## [49] shiny_1.11.1            S7_0.2.0                coda_0.19-4.1          
## [52] evaluate_1.0.5          polyclip_1.10-7         ggstats_0.11.0         
## [55] xml2_1.4.0              pillar_1.11.0           KernSmooth_2.23-26     
## [58] ggplot2_4.0.0           scales_1.4.0            gtools_3.9.5           
## [61] xtable_1.8-4            glue_1.8.0              tools_4.5.1            
## [64] data.table_1.17.8       locfit_1.5-9.12         fastmatch_1.1-6        
## [67] cowplot_1.2.0           tidyr_1.3.1             GenomeInfoDbData_1.2.14
## [70] cli_3.6.5               textshaping_1.0.1       S4Arrays_1.8.1         
## [73] viridisLite_0.4.2       svglite_2.2.2           dplyr_1.1.4            
## [76] gtable_0.3.6            sass_0.4.10             digest_0.6.37          
## [79] SparseArray_1.8.1       htmlwidgets_1.6.4       farver_2.1.2           
## [82] htmltools_0.5.8.1       lifecycle_1.0.4         httr_1.4.7             
## [85] statnet.common_4.12.0   mime_0.13               MASS_7.3-65