Source: https://github.com/markziemann/pathway-workshop
#knitr::opts_chunk$set(dev = 'svg') # set output device to svg
suppressPackageStartupMessages({
library("getDEE2")
library("DESeq2")
library("kableExtra")
library("eulerr")
library("gplots")
library("mitch")
library("fgsea")
library("RhpcBLASctl")
library("parallel")
library("beeswarm")
library("vioplot")
})
RhpcBLASctl::blas_set_num_threads(1)
For this guide I will be using bulk RNA-seq data from a previous study, which is deposited at NCBI GEO and SRA under accession numbers: GSE55123 and SRP038101 (Lund et al, 2014). The experiment is designed to investigate the effect of Azacitidine treatment on AML3 cells.
The raw data have been processed by the DEE2 project, and the summary gene expression counts are available at the dee2.io website, and programmatically with the getDEE2 bioconductor package (Ziemann et al, 2019).
The gene counts have also been deposited to the /data folder in the example.Rdata file in case the DEE2 resource becomes unavailable. To import it, use the command: load("data/example.Rdata")
Alternatively, you could fetch data from another resource like NCBI GEO, Zenodo or from the host storage drive.
mdat <- getDEE2Metadata("hsapiens")
# get sample sheet
ss <- subset(mdat,SRP_accession=="SRP038101")
# fetch the whole set of RNA-seq data
x <- getDEE2("hsapiens",ss$SRR_accession , metadata=mdat, legacy=TRUE)
## For more information about DEE2 QC metrics, visit
## https://github.com/markziemann/dee2/blob/master/qc/qc_metrics.md
mx <- x$GeneCounts
rownames(mx) <- paste(rownames(mx),x$GeneInfo$GeneSymbol)
dim(mx)
## [1] 58302 6
# aza no filtering
ss$trt <- grepl("Treated",ss$Experiment_title)
ss %>%
kbl(caption="sample sheet for Aza treatment in AML3 cells") %>%
kable_paper("hover", full_width = F)
| SRR_accession | QC_summary | SRX_accession | SRS_accession | SRP_accession | Experiment_title | GEO_series | trt | |
|---|---|---|---|---|---|---|---|---|
| 235513 | SRR1171523 | PASS | SRX472607 | SRS559064 | SRP038101 | GSM1329859: Untreated.1; Homo sapiens; RNA-Seq | GSE55123 | FALSE |
| 235514 | SRR1171524 | WARN(3,4) | SRX472608 | SRS559066 | SRP038101 | GSM1329860: Untreated.2; Homo sapiens; RNA-Seq | GSE55123 | FALSE |
| 235515 | SRR1171525 | WARN(3,4) | SRX472609 | SRS559065 | SRP038101 | GSM1329861: Untreated.3; Homo sapiens; RNA-Seq | GSE55123 | FALSE |
| 235516 | SRR1171526 | WARN(3,4) | SRX472610 | SRS559068 | SRP038101 | GSM1329862: Treated.1; Homo sapiens; RNA-Seq | GSE55123 | TRUE |
| 235517 | SRR1171527 | WARN(3,4) | SRX472611 | SRS559067 | SRP038101 | GSM1329863: Treated.2; Homo sapiens; RNA-Seq | GSE55123 | TRUE |
| 235518 | SRR1171528 | WARN(3,4) | SRX472612 | SRS559069 | SRP038101 | GSM1329864: Treated.3; Homo sapiens; RNA-Seq | GSE55123 | TRUE |
QC is important, even if you are using public transcriptome data. For RNA-seq it is a good idea to show the number of reads for each sample.
colSums(mx)
## SRR1171523 SRR1171524 SRR1171525 SRR1171526 SRR1171527 SRR1171528
## 15529931 11209272 10994666 11350703 12979401 12674207
par(mar=c(5,7,5,1))
barplot(rev(colSums(mx)),horiz=TRUE,las=1,main="number of reads per sample in SRP038101")
Now make a MDS plot.
mds <- cmdscale(dist(t(mx)))
# expand plot area
XMIN=min(mds[,1])*1.3
XMAX=max(mds[,1])*1.3
YMIN=min(mds[,2])*1.3
YMAX=max(mds[,2])*1.3
cols <- as.character(grepl("Treated",ss$Experiment_title))
cols <- gsub("FALSE","lightblue",cols)
cols <- gsub("TRUE","pink",cols)
plot(mds, xlab="Coordinate 1", ylab="Coordinate 2",
xlim=c(XMIN,XMAX),ylim=c(YMIN,YMAX),
pch=19,cex=2,col=cols, main="MDS plot")
text(cmdscale(dist(t(mx))), labels=colnames(mx) )
We will be using DESeq2 for DE analysis.
The count matrix is prefiltered using a detection threshold of 10 reads per sample across all samples. All genes that meet the detection threshold will comprise the background list.
We will also repeat this without a detection threshold.
The first 6 rows of the count matrix are shown.
mxf <- mx[which(rowMeans(mx)>=10),]
dim(mxf)
## [1] 13168 6
head(mxf,6) %>%
kbl(caption="Count matrix format") %>%
kable_paper("hover", full_width = F)
| SRR1171523 | SRR1171524 | SRR1171525 | SRR1171526 | SRR1171527 | SRR1171528 | |
|---|---|---|---|---|---|---|
| ENSG00000225630 MTND2P28 | 494 | 396 | 340 | 333 | 415 | 418 |
| ENSG00000237973 MTCO1P12 | 52 | 39 | 40 | 30 | 37 | 29 |
| ENSG00000248527 MTATP6P1 | 853 | 544 | 537 | 582 | 702 | 716 |
| ENSG00000228327 AL669831.1 | 17 | 13 | 21 | 21 | 22 | 12 |
| ENSG00000228794 LINC01128 | 42 | 27 | 30 | 32 | 40 | 23 |
| ENSG00000230699 AL645608.3 | 20 | 11 | 13 | 10 | 15 | 22 |
dds <- DESeqDataSetFromMatrix(countData = mxf , colData = ss, design = ~ trt )
res <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
z <- results(res)
vsd <- vst(dds, blind=FALSE)
zz <-cbind(as.data.frame(z),mxf)
def <-as.data.frame(zz[order(zz$pvalue),])
head(def,10) %>%
kbl(caption="Top DE genes for Aza treatment") %>%
kable_paper("hover", full_width = F)
| baseMean | log2FoldChange | lfcSE | stat | pvalue | padj | SRR1171523 | SRR1171524 | SRR1171525 | SRR1171526 | SRR1171527 | SRR1171528 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ENSG00000165949 IFI27 | 1960.1970 | -3.384492 | 0.0938869 | -36.04861 | 0 | 0 | 4689 | 3583 | 2758 | 309 | 384 | 334 |
| ENSG00000090382 LYZ | 7596.0299 | -1.650342 | 0.0561143 | -29.41036 | 0 | 0 | 14212 | 10237 | 10789 | 3476 | 3764 | 3738 |
| ENSG00000115461 IGFBP5 | 531.2217 | -5.071157 | 0.1795239 | -28.24781 | 0 | 0 | 1320 | 823 | 1025 | 23 | 26 | 43 |
| ENSG00000157601 MX1 | 827.1511 | -2.877795 | 0.1047823 | -27.46450 | 0 | 0 | 1732 | 1501 | 1206 | 184 | 223 | 186 |
| ENSG00000111331 OAS3 | 2127.2010 | -2.661214 | 0.0972124 | -27.37525 | 0 | 0 | 4204 | 3977 | 2972 | 562 | 614 | 560 |
| ENSG00000070915 SLC12A3 | 424.5509 | -3.374852 | 0.1298671 | -25.98697 | 0 | 0 | 1012 | 721 | 653 | 63 | 85 | 76 |
| ENSG00000234745 HLA-B | 3197.0159 | -1.431566 | 0.0604169 | -23.69479 | 0 | 0 | 6085 | 4256 | 4023 | 1590 | 1872 | 1719 |
| ENSG00000137965 IFI44 | 409.0957 | -2.978581 | 0.1319352 | -22.57608 | 0 | 0 | 829 | 740 | 635 | 76 | 111 | 89 |
| ENSG00000204525 HLA-C | 1631.6421 | -1.461550 | 0.0660214 | -22.13750 | 0 | 0 | 3112 | 2150 | 2106 | 791 | 923 | 891 |
| ENSG00000110042 DTX4 | 524.1318 | -2.470219 | 0.1173182 | -21.05572 | 0 | 0 | 1166 | 883 | 688 | 166 | 168 | 145 |
Make a smear plot.
sigf <- subset(def,padj<=0.01)
DET=nrow(mxf)
NSIG=nrow(sigf)
NUP=nrow(subset(sigf,log2FoldChange>0))
NDN=nrow(subset(sigf,log2FoldChange<0))
HEADER=paste(DET,"detected genes;",NSIG,"w/FDR<0.01;",NUP,"up;",NDN,"down")
plot(log10(def$baseMean) ,def$log2FoldChange,
cex=0.6,pch=19,col="darkgray",
main="5-azacitidine treatment in AML3 cells",
xlab="log10(basemean)",ylab="log2(fold change)")
points(log10(sigf$baseMean) ,sigf$log2FoldChange,
cex=0.6,pch=19,col="red")
mtext(HEADER)
In the next sections I will run enrichment analysis with over-representation test and compare it to functional class scoring.
Let’s see if many of the gene symbols have changed.
Comparing Ensembl release 90 to GENCODE Release 49 (GRCh38.p14) which corresponds to Ensembl 115 (Sept 2025).
So v90 had 58k genes and v115 has 86k which is a big increase. The v90 has 1609 genes that have been removed an v115 has 29676 that have been added. There are 56693 common.
From these common ones, 20456 have different gene names. I think the approach to naming these putative genes has changed a lot. Lots of CXorfXX genes have changed, not just AL358472.5 type names.
In the next chunk, I’ll show a few lines of the older source gene names.
There is also an Euler diagram of the gene symbol repertoires.
gt <- read.table("https://dee2.io/data/hsapiens/hsa_gene_info.tsv",header=TRUE)
head(gt)
## GeneID GeneSymbol mean median longest_isoform merged
## 1 ENSG00000223972 DDX11L1 1144 1144 1657 1735
## 2 ENSG00000227232 WASH7P 1351 1351 1351 1351
## 3 ENSG00000278267 MIR6859-1 68 68 68 68
## 4 ENSG00000243485 MIR1302-2HG 623 623 712 1021
## 5 ENSG00000284332 MIR1302-2 138 138 138 138
## 6 ENSG00000237613 FAM138A 888 888 1187 1219
gtnew <- read.table("ref/gencode.v49.genenames.tsv")
v1 <- list("Ens90"=gt$GeneID, "Ens115"=gtnew$V1)
plot(euler(v1),quantities = list(cex = 2), labels = list(cex = 2))
message("Number of unique genes in the original gene table:")
## Number of unique genes in the original gene table:
length(gt$GeneID)
## [1] 58302
message("Number of unique genes in the new gene annotation set:")
## Number of unique genes in the new gene annotation set:
length(gtnew$V1)
## [1] 86369
message("Number of shared gene IDs between new and old annotations:")
## Number of shared gene IDs between new and old annotations:
length(intersect(gt$GeneID,gtnew$V1))
## [1] 56693
message("Number of gene IDs specific to the old annotation:")
## Number of gene IDs specific to the old annotation:
length(setdiff(gt$GeneID,gtnew$V1))
## [1] 1609
message("Number of gene IDs specific to the new annotation:")
## Number of gene IDs specific to the new annotation:
length(setdiff(gtnew$V1,gt$GeneID))
## [1] 29676
message("Here is the mapping table for old-new gene symbols:")
## Here is the mapping table for old-new gene symbols:
gtm <- merge(gt,gtnew,by.x="GeneID",by.y="V1")
message("Number of rows:")
## Number of rows:
nrow(gtm)
## [1] 56693
message("How many of these gene symbols are the same?")
## How many of these gene symbols are the same?
table(gtm$GeneSymbol == gtm$V2)
##
## FALSE TRUE
## 20456 36237
dir.create("data")
## Warning in dir.create("data"): 'data' already exists
detsv <- cbind(rownames(def),def)
colnames(detsv)[1] <- "GeneID"
write.table(detsv,"data/de.tsv",sep="\t",row.names=FALSE)
save.image("data/de.Rdata")
saveRDS(def,"data/de.Rds")
sessionInfo()
## R version 4.5.3 (2026-03-11)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 24.04.4 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so; LAPACK version 3.12.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Australia/Melbourne
## tzcode source: system (glibc)
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] vioplot_0.5.1 zoo_1.8-15
## [3] sm_2.2-6.0 beeswarm_0.4.0
## [5] RhpcBLASctl_0.23-42 fgsea_1.34.2
## [7] mitch_1.20.0 gplots_3.3.0
## [9] eulerr_7.0.4 kableExtra_1.4.0
## [11] DESeq2_1.48.2 SummarizedExperiment_1.38.1
## [13] Biobase_2.68.0 MatrixGenerics_1.20.0
## [15] matrixStats_1.5.0 GenomicRanges_1.60.0
## [17] GenomeInfoDb_1.44.3 IRanges_2.42.0
## [19] S4Vectors_0.46.0 BiocGenerics_0.54.1
## [21] generics_0.1.4 getDEE2_1.18.0
##
## loaded via a namespace (and not attached):
## [1] bitops_1.0-9 gridExtra_2.3 echarts4r_0.5.0
## [4] rlang_1.1.7 magrittr_2.0.4 otel_0.2.0
## [7] compiler_4.5.3 polylabelr_1.0.0 systemfonts_1.3.1
## [10] vctrs_0.7.1 reshape2_1.4.5 htm2txt_2.2.2
## [13] stringr_1.6.0 pkgconfig_2.0.3 crayon_1.5.3
## [16] fastmap_1.2.0 XVector_0.48.0 caTools_1.18.3
## [19] promises_1.5.0 rmarkdown_2.30 UCSC.utils_1.4.0
## [22] network_1.20.0 purrr_1.2.1 xfun_0.56
## [25] cachem_1.1.0 jsonlite_2.0.0 later_1.4.6
## [28] DelayedArray_0.34.1 BiocParallel_1.42.2 R6_2.6.1
## [31] bslib_0.10.0 stringi_1.8.7 RColorBrewer_1.1-3
## [34] GGally_2.4.0 jquerylib_0.1.4 Rcpp_1.1.1
## [37] knitr_1.51 httpuv_1.6.16 Matrix_1.7-4
## [40] tidyselect_1.2.1 rstudioapi_0.18.0 dichromat_2.0-0.1
## [43] abind_1.4-8 yaml_2.3.12 codetools_0.2-20
## [46] lattice_0.22-9 tibble_3.3.1 plyr_1.8.9
## [49] shiny_1.12.1 S7_0.2.1 coda_0.19-4.1
## [52] evaluate_1.0.5 polyclip_1.10-7 ggstats_0.12.0
## [55] xml2_1.5.2 pillar_1.11.1 KernSmooth_2.23-26
## [58] ggplot2_4.0.2 scales_1.4.0 gtools_3.9.5
## [61] xtable_1.8-4 glue_1.8.0 tools_4.5.3
## [64] data.table_1.18.2.1 locfit_1.5-9.12 fastmatch_1.1-8
## [67] cowplot_1.2.0 grid_4.5.3 tidyr_1.3.2
## [70] GenomeInfoDbData_1.2.14 cli_3.6.5 textshaping_1.0.4
## [73] S4Arrays_1.8.1 viridisLite_0.4.3 svglite_2.2.2
## [76] dplyr_1.2.0 gtable_0.3.6 sass_0.4.10
## [79] digest_0.6.39 SparseArray_1.8.1 htmlwidgets_1.6.4
## [82] farver_2.1.2 htmltools_0.5.9 lifecycle_1.0.5
## [85] httr_1.4.8 statnet.common_4.13.0 mime_0.13
## [88] MASS_7.3-65