Source: https://github.com/markziemann/pathway-workshop

Introduction

#knitr::opts_chunk$set(dev = 'svg') # set output device to svg

suppressPackageStartupMessages({
  library("getDEE2")
  library("DESeq2")
  library("kableExtra")
  library("eulerr")
  library("gplots")
  library("mitch")
  library("fgsea")
  library("RhpcBLASctl")
  library("parallel")
  library("beeswarm")
  library("vioplot")
})

RhpcBLASctl::blas_set_num_threads(1)

For this guide I will be using bulk RNA-seq data from a previous study, which is deposited at NCBI GEO and SRA under accession numbers: GSE55123 and SRP038101 (Lund et al, 2014). The experiment is designed to investigate the effect of Azacitidine treatment on AML3 cells.

The raw data have been processed by the DEE2 project, and the summary gene expression counts are available at the dee2.io website, and programmatically with the getDEE2 bioconductor package (Ziemann et al, 2019).

The gene counts have also been deposited to the /data folder in the example.Rdata file in case the DEE2 resource becomes unavailable. To import it, use the command: load("data/example.Rdata")

Alternatively, you could fetch data from another resource like NCBI GEO, Zenodo or from the host storage drive.

mdat <- getDEE2Metadata("hsapiens")

# get sample sheet
ss <- subset(mdat,SRP_accession=="SRP038101")

# fetch the whole set of RNA-seq data
x <- getDEE2("hsapiens",ss$SRR_accession , metadata=mdat, legacy=TRUE)
## For more information about DEE2 QC metrics, visit
##     https://github.com/markziemann/dee2/blob/master/qc/qc_metrics.md
mx <- x$GeneCounts
rownames(mx) <- paste(rownames(mx),x$GeneInfo$GeneSymbol)
dim(mx)
## [1] 58302     6
# aza no filtering
ss$trt <- grepl("Treated",ss$Experiment_title)

ss %>%
  kbl(caption="sample sheet for Aza treatment in AML3 cells") %>%
  kable_paper("hover", full_width = F)
sample sheet for Aza treatment in AML3 cells
SRR_accession QC_summary SRX_accession SRS_accession SRP_accession Experiment_title GEO_series trt
235513 SRR1171523 PASS SRX472607 SRS559064 SRP038101 GSM1329859: Untreated.1; Homo sapiens; RNA-Seq GSE55123 FALSE
235514 SRR1171524 WARN(3,4) SRX472608 SRS559066 SRP038101 GSM1329860: Untreated.2; Homo sapiens; RNA-Seq GSE55123 FALSE
235515 SRR1171525 WARN(3,4) SRX472609 SRS559065 SRP038101 GSM1329861: Untreated.3; Homo sapiens; RNA-Seq GSE55123 FALSE
235516 SRR1171526 WARN(3,4) SRX472610 SRS559068 SRP038101 GSM1329862: Treated.1; Homo sapiens; RNA-Seq GSE55123 TRUE
235517 SRR1171527 WARN(3,4) SRX472611 SRS559067 SRP038101 GSM1329863: Treated.2; Homo sapiens; RNA-Seq GSE55123 TRUE
235518 SRR1171528 WARN(3,4) SRX472612 SRS559069 SRP038101 GSM1329864: Treated.3; Homo sapiens; RNA-Seq GSE55123 TRUE

Data quality control

QC is important, even if you are using public transcriptome data. For RNA-seq it is a good idea to show the number of reads for each sample.

colSums(mx)
## SRR1171523 SRR1171524 SRR1171525 SRR1171526 SRR1171527 SRR1171528 
##   15529931   11209272   10994666   11350703   12979401   12674207
par(mar=c(5,7,5,1))
barplot(rev(colSums(mx)),horiz=TRUE,las=1,main="number of reads per sample in SRP038101")

Now make a MDS plot.

mds <- cmdscale(dist(t(mx)))

# expand plot area
XMIN=min(mds[,1])*1.3
XMAX=max(mds[,1])*1.3
YMIN=min(mds[,2])*1.3
YMAX=max(mds[,2])*1.3

cols <- as.character(grepl("Treated",ss$Experiment_title))
cols <- gsub("FALSE","lightblue",cols)
cols <- gsub("TRUE","pink",cols)
plot(mds, xlab="Coordinate 1", ylab="Coordinate 2",
  xlim=c(XMIN,XMAX),ylim=c(YMIN,YMAX),
  pch=19,cex=2,col=cols, main="MDS plot")
text(cmdscale(dist(t(mx))), labels=colnames(mx) )

Differential expression analysis

We will be using DESeq2 for DE analysis.

The count matrix is prefiltered using a detection threshold of 10 reads per sample across all samples. All genes that meet the detection threshold will comprise the background list.

We will also repeat this without a detection threshold.

The first 6 rows of the count matrix are shown.

mxf <- mx[which(rowMeans(mx)>=10),]
dim(mxf)
## [1] 13168     6
head(mxf,6) %>%
  kbl(caption="Count matrix format") %>%
  kable_paper("hover", full_width = F)
Count matrix format
SRR1171523 SRR1171524 SRR1171525 SRR1171526 SRR1171527 SRR1171528
ENSG00000225630 MTND2P28 494 396 340 333 415 418
ENSG00000237973 MTCO1P12 52 39 40 30 37 29
ENSG00000248527 MTATP6P1 853 544 537 582 702 716
ENSG00000228327 AL669831.1 17 13 21 21 22 12
ENSG00000228794 LINC01128 42 27 30 32 40 23
ENSG00000230699 AL645608.3 20 11 13 10 15 22
dds <- DESeqDataSetFromMatrix(countData = mxf , colData = ss, design = ~ trt )
res <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
z <- results(res)
vsd <- vst(dds, blind=FALSE)
zz <-cbind(as.data.frame(z),mxf)
def <-as.data.frame(zz[order(zz$pvalue),])

head(def,10) %>%
  kbl(caption="Top DE genes for Aza treatment") %>%
  kable_paper("hover", full_width = F)
Top DE genes for Aza treatment
baseMean log2FoldChange lfcSE stat pvalue padj SRR1171523 SRR1171524 SRR1171525 SRR1171526 SRR1171527 SRR1171528
ENSG00000165949 IFI27 1960.1970 -3.384492 0.0938869 -36.04861 0 0 4689 3583 2758 309 384 334
ENSG00000090382 LYZ 7596.0299 -1.650342 0.0561143 -29.41036 0 0 14212 10237 10789 3476 3764 3738
ENSG00000115461 IGFBP5 531.2217 -5.071157 0.1795239 -28.24781 0 0 1320 823 1025 23 26 43
ENSG00000157601 MX1 827.1511 -2.877795 0.1047823 -27.46450 0 0 1732 1501 1206 184 223 186
ENSG00000111331 OAS3 2127.2010 -2.661214 0.0972124 -27.37525 0 0 4204 3977 2972 562 614 560
ENSG00000070915 SLC12A3 424.5509 -3.374852 0.1298671 -25.98697 0 0 1012 721 653 63 85 76
ENSG00000234745 HLA-B 3197.0159 -1.431566 0.0604169 -23.69479 0 0 6085 4256 4023 1590 1872 1719
ENSG00000137965 IFI44 409.0957 -2.978581 0.1319352 -22.57608 0 0 829 740 635 76 111 89
ENSG00000204525 HLA-C 1631.6421 -1.461550 0.0660214 -22.13750 0 0 3112 2150 2106 791 923 891
ENSG00000110042 DTX4 524.1318 -2.470219 0.1173182 -21.05572 0 0 1166 883 688 166 168 145

Make a smear plot.

sigf <- subset(def,padj<=0.01)

DET=nrow(mxf)
NSIG=nrow(sigf)
NUP=nrow(subset(sigf,log2FoldChange>0))
NDN=nrow(subset(sigf,log2FoldChange<0))

HEADER=paste(DET,"detected genes;",NSIG,"w/FDR<0.01;",NUP,"up;",NDN,"down")

plot(log10(def$baseMean) ,def$log2FoldChange,
  cex=0.6,pch=19,col="darkgray",
  main="5-azacitidine treatment in AML3 cells",
  xlab="log10(basemean)",ylab="log2(fold change)")

points(log10(sigf$baseMean) ,sigf$log2FoldChange,
  cex=0.6,pch=19,col="red")

mtext(HEADER)

In the next sections I will run enrichment analysis with over-representation test and compare it to functional class scoring.

Get update gene symbols

Let’s see if many of the gene symbols have changed.

Comparing Ensembl release 90 to GENCODE Release 49 (GRCh38.p14) which corresponds to Ensembl 115 (Sept 2025).

So v90 had 58k genes and v115 has 86k which is a big increase. The v90 has 1609 genes that have been removed an v115 has 29676 that have been added. There are 56693 common.

From these common ones, 20456 have different gene names. I think the approach to naming these putative genes has changed a lot. Lots of CXorfXX genes have changed, not just AL358472.5 type names.

In the next chunk, I’ll show a few lines of the older source gene names.

There is also an Euler diagram of the gene symbol repertoires.

gt <- read.table("https://dee2.io/data/hsapiens/hsa_gene_info.tsv",header=TRUE)
head(gt)
##            GeneID  GeneSymbol mean median longest_isoform merged
## 1 ENSG00000223972     DDX11L1 1144   1144            1657   1735
## 2 ENSG00000227232      WASH7P 1351   1351            1351   1351
## 3 ENSG00000278267   MIR6859-1   68     68              68     68
## 4 ENSG00000243485 MIR1302-2HG  623    623             712   1021
## 5 ENSG00000284332   MIR1302-2  138    138             138    138
## 6 ENSG00000237613     FAM138A  888    888            1187   1219
gtnew <- read.table("ref/gencode.v49.genenames.tsv")

v1 <- list("Ens90"=gt$GeneID, "Ens115"=gtnew$V1)
plot(euler(v1),quantities = list(cex = 2), labels = list(cex = 2))

message("Number of unique genes in the original gene table:")
## Number of unique genes in the original gene table:
length(gt$GeneID)
## [1] 58302
message("Number of unique genes in the new gene annotation set:")
## Number of unique genes in the new gene annotation set:
length(gtnew$V1)
## [1] 86369
message("Number of shared gene IDs between new and old annotations:")
## Number of shared gene IDs between new and old annotations:
length(intersect(gt$GeneID,gtnew$V1))
## [1] 56693
message("Number of gene IDs specific to the old annotation:")
## Number of gene IDs specific to the old annotation:
length(setdiff(gt$GeneID,gtnew$V1))
## [1] 1609
message("Number of gene IDs specific to the new annotation:")
## Number of gene IDs specific to the new annotation:
length(setdiff(gtnew$V1,gt$GeneID))
## [1] 29676
message("Here is the mapping table for old-new gene symbols:")
## Here is the mapping table for old-new gene symbols:
gtm <- merge(gt,gtnew,by.x="GeneID",by.y="V1")

message("Number of rows:")
## Number of rows:
nrow(gtm)
## [1] 56693
message("How many of these gene symbols are the same?")
## How many of these gene symbols are the same?
table(gtm$GeneSymbol == gtm$V2)
## 
## FALSE  TRUE 
## 20456 36237

Save data

dir.create("data")
## Warning in dir.create("data"): 'data' already exists
detsv <- cbind(rownames(def),def)
colnames(detsv)[1] <- "GeneID"
write.table(detsv,"data/de.tsv",sep="\t",row.names=FALSE)
save.image("data/de.Rdata")
saveRDS(def,"data/de.Rds")

Session information

sessionInfo()
## R version 4.5.3 (2026-03-11)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 24.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so;  LAPACK version 3.12.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Australia/Melbourne
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] vioplot_0.5.1               zoo_1.8-15                 
##  [3] sm_2.2-6.0                  beeswarm_0.4.0             
##  [5] RhpcBLASctl_0.23-42         fgsea_1.34.2               
##  [7] mitch_1.20.0                gplots_3.3.0               
##  [9] eulerr_7.0.4                kableExtra_1.4.0           
## [11] DESeq2_1.48.2               SummarizedExperiment_1.38.1
## [13] Biobase_2.68.0              MatrixGenerics_1.20.0      
## [15] matrixStats_1.5.0           GenomicRanges_1.60.0       
## [17] GenomeInfoDb_1.44.3         IRanges_2.42.0             
## [19] S4Vectors_0.46.0            BiocGenerics_0.54.1        
## [21] generics_0.1.4              getDEE2_1.18.0             
## 
## loaded via a namespace (and not attached):
##  [1] bitops_1.0-9            gridExtra_2.3           echarts4r_0.5.0        
##  [4] rlang_1.1.7             magrittr_2.0.4          otel_0.2.0             
##  [7] compiler_4.5.3          polylabelr_1.0.0        systemfonts_1.3.1      
## [10] vctrs_0.7.1             reshape2_1.4.5          htm2txt_2.2.2          
## [13] stringr_1.6.0           pkgconfig_2.0.3         crayon_1.5.3           
## [16] fastmap_1.2.0           XVector_0.48.0          caTools_1.18.3         
## [19] promises_1.5.0          rmarkdown_2.30          UCSC.utils_1.4.0       
## [22] network_1.20.0          purrr_1.2.1             xfun_0.56              
## [25] cachem_1.1.0            jsonlite_2.0.0          later_1.4.6            
## [28] DelayedArray_0.34.1     BiocParallel_1.42.2     R6_2.6.1               
## [31] bslib_0.10.0            stringi_1.8.7           RColorBrewer_1.1-3     
## [34] GGally_2.4.0            jquerylib_0.1.4         Rcpp_1.1.1             
## [37] knitr_1.51              httpuv_1.6.16           Matrix_1.7-4           
## [40] tidyselect_1.2.1        rstudioapi_0.18.0       dichromat_2.0-0.1      
## [43] abind_1.4-8             yaml_2.3.12             codetools_0.2-20       
## [46] lattice_0.22-9          tibble_3.3.1            plyr_1.8.9             
## [49] shiny_1.12.1            S7_0.2.1                coda_0.19-4.1          
## [52] evaluate_1.0.5          polyclip_1.10-7         ggstats_0.12.0         
## [55] xml2_1.5.2              pillar_1.11.1           KernSmooth_2.23-26     
## [58] ggplot2_4.0.2           scales_1.4.0            gtools_3.9.5           
## [61] xtable_1.8-4            glue_1.8.0              tools_4.5.3            
## [64] data.table_1.18.2.1     locfit_1.5-9.12         fastmatch_1.1-8        
## [67] cowplot_1.2.0           grid_4.5.3              tidyr_1.3.2            
## [70] GenomeInfoDbData_1.2.14 cli_3.6.5               textshaping_1.0.4      
## [73] S4Arrays_1.8.1          viridisLite_0.4.3       svglite_2.2.2          
## [76] dplyr_1.2.0             gtable_0.3.6            sass_0.4.10            
## [79] digest_0.6.39           SparseArray_1.8.1       htmlwidgets_1.6.4      
## [82] farver_2.1.2            htmltools_0.5.9         lifecycle_1.0.5        
## [85] httr_1.4.8              statnet.common_4.13.0   mime_0.13              
## [88] MASS_7.3-65