Pathway workshop - Data preparation

Source: https://github.com/markziemann/pathway-workshop

Introduction

#knitr::opts_chunk$set(dev = 'svg') # set output device to svg

suppressPackageStartupMessages({
  library("getDEE2")
  library("DESeq2")
  library("kableExtra")
  library("eulerr")
  library("gplots")
  library("mitch")
  library("fgsea")
  library("RhpcBLASctl")
  library("parallel")
  library("beeswarm")
  library("vioplot")
})

# Restrict BLAS to a single thread.
# NOTE(review): presumably to avoid oversubscribing cores when parallel
# workers are used elsewhere in the workshop -- confirm.
RhpcBLASctl::blas_set_num_threads(1)

For this guide I will be using bulk RNA-seq data from a previous study, which is deposited at NCBI GEO and SRA under accession numbers: GSE55123 and SRP038101 (Lund et al, 2014). The experiment is designed to investigate the effect of Azacitidine treatment on AML3 cells.

The raw data have been processed by the DEE2 project, and the summary gene expression counts are available at the dee2.io website, and programmatically with the getDEE2 bioconductor package (Ziemann et al, 2019).

The gene counts have also been deposited to the /data folder in the example.Rdata file in case the DEE2 resource becomes unavailable. To import it, use the command: load("data/example.Rdata")

Alternatively, you could fetch data from another resource like NCBI GEO, Zenodo or from the host storage drive.

# Download the DEE2 metadata table for "hsapiens" and keep only the rows that
# belong to study SRP038101; this subset is the sample sheet.
mdat <- getDEE2Metadata("hsapiens")
ss <- mdat[which(mdat$SRP_accession == "SRP038101"), ]

# Retrieve the data for every run listed in the sample sheet
x <- getDEE2(
  "hsapiens",
  ss$SRR_accession,
  metadata = mdat,
  legacy = TRUE
)

## For more information about DEE2 QC metrics, visit
##     https://github.com/markziemann/dee2/blob/master/qc/qc_metrics.md

# Gene-level count matrix; each row name becomes "<row name> <gene symbol>".
# NOTE(review): this relies on GeneInfo being in the same row order as
# GeneCounts -- confirm.
mx <- x[["GeneCounts"]]
rownames(mx) <- paste(rownames(mx), x[["GeneInfo"]][["GeneSymbol"]])
dim(mx)

## [1] 58302     6

# Flag the treated samples: trt is TRUE when the experiment title contains
# "Treated". grepl() is case-sensitive, so "Untreated" titles do not match.
ss$trt <- grepl("Treated", ss$Experiment_title)

# Render the sample sheet as a table. FALSE is spelled out because the
# shorthand F is an ordinary variable that can be reassigned.
ss %>%
  kbl(caption = "sample sheet for Aza treatment in AML3 cells") %>%
  kable_paper("hover", full_width = FALSE)

sample sheet for Aza treatment in AML3 cells
	SRR_accession	QC_summary	SRX_accession	SRS_accession	SRP_accession	Experiment_title	GEO_series	trt
235513	SRR1171523	PASS	SRX472607	SRS559064	SRP038101	GSM1329859: Untreated.1; Homo sapiens; RNA-Seq	GSE55123	FALSE
235514	SRR1171524	WARN(3,4)	SRX472608	SRS559066	SRP038101	GSM1329860: Untreated.2; Homo sapiens; RNA-Seq	GSE55123	FALSE
235515	SRR1171525	WARN(3,4)	SRX472609	SRS559065	SRP038101	GSM1329861: Untreated.3; Homo sapiens; RNA-Seq	GSE55123	FALSE
235516	SRR1171526	WARN(3,4)	SRX472610	SRS559068	SRP038101	GSM1329862: Treated.1; Homo sapiens; RNA-Seq	GSE55123	TRUE
235517	SRR1171527	WARN(3,4)	SRX472611	SRS559067	SRP038101	GSM1329863: Treated.2; Homo sapiens; RNA-Seq	GSE55123	TRUE
235518	SRR1171528	WARN(3,4)	SRX472612	SRS559069	SRP038101	GSM1329864: Treated.3; Homo sapiens; RNA-Seq	GSE55123	TRUE

Data quality control

QC is important, even if you are using public transcriptome data. For RNA-seq it is a good idea to show the number of reads for each sample.

# Total count per sample (column sums of the count matrix)
colSums(mx)

## SRR1171523 SRR1171524 SRR1171525 SRR1171526 SRR1171527 SRR1171528 
##   15529931   11209272   10994666   11350703   12979401   12674207

# Same totals as a horizontal bar chart. The sample order is reversed before
# plotting, and the left margin is widened (second value of mar).
par(mar = c(5, 7, 5, 1))
barplot(
  height = rev(colSums(mx)),
  horiz = TRUE,
  las = 1,
  main = "number of reads per sample in SRP038101"
)

Now make an MDS plot.

# Classical multidimensional scaling of the samples: the count matrix is
# transposed so that dist() measures distances between samples (columns of mx).
mds <- cmdscale(dist(t(mx)))

# Expand the plot area: scale the extreme coordinates by 1.3
XMIN <- min(mds[, 1]) * 1.3
XMAX <- max(mds[, 1]) * 1.3
YMIN <- min(mds[, 2]) * 1.3
YMAX <- max(mds[, 2]) * 1.3

# Point colours: pink when the title contains "Treated", light blue otherwise
cols <- ifelse(grepl("Treated", ss$Experiment_title), "pink", "lightblue")

plot(mds, xlab = "Coordinate 1", ylab = "Coordinate 2",
  xlim = c(XMIN, XMAX), ylim = c(YMIN, YMAX),
  pch = 19, cex = 2, col = cols, main = "MDS plot")

# Label the points using the coordinates computed above, so the labels always
# sit on the plotted points and the scaling is not run a second time.
text(mds, labels = colnames(mx))

Differential expression analysis

We will be using DESeq2 for DE analysis.

The count matrix is prefiltered using a detection threshold of a mean of at least 10 reads per sample across all samples. All genes that meet the detection threshold will comprise the background list.

We will also repeat this without a detection threshold.

The first 6 rows of the count matrix are shown.

# Detection filter: keep the genes whose mean count across all samples is at
# least 10. The surviving genes form the background set for later analyses.
mxf <- mx[which(rowMeans(mx) >= 10), ]
dim(mxf)

## [1] 13168     6

# Show the first six rows of the filtered matrix. FALSE is spelled out because
# the shorthand F is an ordinary variable that can be reassigned.
head(mxf, 6) %>%
  kbl(caption = "Count matrix format") %>%
  kable_paper("hover", full_width = FALSE)

Count matrix format
	SRR1171523	SRR1171524	SRR1171525	SRR1171526	SRR1171527	SRR1171528
ENSG00000225630 MTND2P28	494	396	340	333	415	418
ENSG00000237973 MTCO1P12	52	39	40	30	37	29
ENSG00000248527 MTATP6P1	853	544	537	582	702	716
ENSG00000228327 AL669831.1	17	13	21	21	22	12
ENSG00000228794 LINC01128	42	27	30	32	40	23
ENSG00000230699 AL645608.3	20	11	13	10	15	22

# Build the DESeq2 data set from the filtered counts and the sample sheet,
# modelling expression as a function of the treatment flag, then fit it.
dds <- DESeqDataSetFromMatrix(
  countData = mxf,
  colData = ss,
  design = ~trt
)
res <- DESeq(dds)

## estimating size factors

## estimating dispersions

## gene-wise dispersion estimates

## mean-dispersion relationship

## final dispersion estimates

## fitting model and testing

# Pull the results table out of the fitted DESeq2 object
z <- results(res)

# Variance-stabilised expression values; blind = FALSE means the
# transformation is not blinded to the experimental design.
# NOTE(review): vsd is not used again in this section -- confirm it is needed.
vsd <- vst(dds, blind = FALSE)

# Put the filtered counts next to the statistics and sort by p-value, smallest
# first. cbind() on a data frame already returns a data frame, so no further
# conversion is needed.
zz <- cbind(as.data.frame(z), mxf)
def <- zz[order(zz$pvalue), ]

head(def, 10) %>%
  kbl(caption = "Top DE genes for Aza treatment") %>%
  kable_paper("hover", full_width = FALSE)

Top DE genes for Aza treatment
	baseMean	log2FoldChange	lfcSE	stat	SRR1171523	SRR1171524	SRR1171525	SRR1171526	SRR1171527	SRR1171528
ENSG00000165949 IFI27	1960.1970	-3.384492	0.0938869	-36.04861	4689	3583	2758	309	384	334
ENSG00000090382 LYZ	7596.0299	-1.650342	0.0561143	-29.41036	14212	10237	10789	3476	3764	3738
ENSG00000115461 IGFBP5	531.2217	-5.071157	0.1795239	-28.24781	1320	823	1025	23	26	43
ENSG00000157601 MX1	827.1511	-2.877795	0.1047823	-27.46450	1732	1501	1206	184	223	186
ENSG00000111331 OAS3	2127.2010	-2.661214	0.0972124	-27.37525	4204	3977	2972	562	614	560
ENSG00000070915 SLC12A3	424.5509	-3.374852	0.1298671	-25.98697	1012	721	653	63	85	76
ENSG00000234745 HLA-B	3197.0159	-1.431566	0.0604169	-23.69479	6085	4256	4023	1590	1872	1719
ENSG00000137965 IFI44	409.0957	-2.978581	0.1319352	-22.57608	829	740	635	76	111	89
ENSG00000204525 HLA-C	1631.6421	-1.461550	0.0660214	-22.13750	3112	2150	2106	791	923	891
ENSG00000110042 DTX4	524.1318	-2.470219	0.1173182	-21.05572	1166	883	688	166	168	145

Make a smear plot.

# Genes with an adjusted p-value of 0.01 or less; rows whose padj is missing
# are left out.
sigf <- def[which(def$padj <= 0.01), ]

# Tallies reported in the line above the plot
DET <- nrow(mxf)
NSIG <- nrow(sigf)
NUP <- sum(sigf$log2FoldChange > 0, na.rm = TRUE)
NDN <- sum(sigf$log2FoldChange < 0, na.rm = TRUE)

HEADER <- paste(
  DET, "detected genes;",
  NSIG, "w/FDR<0.01;",
  NUP, "up;",
  NDN, "down"
)

# Every detected gene in gray: mean expression (log10) against log2 fold change
plot(
  x = log10(def$baseMean),
  y = def$log2FoldChange,
  cex = 0.6,
  pch = 19,
  col = "darkgray",
  main = "5-azacitidine treatment in AML3 cells",
  xlab = "log10(basemean)",
  ylab = "log2(fold change)"
)

# Overlay the significant genes in red
points(
  x = log10(sigf$baseMean),
  y = sigf$log2FoldChange,
  cex = 0.6,
  pch = 19,
  col = "red"
)

mtext(HEADER)

In the next sections I will run enrichment analysis with over-representation test and compare it to functional class scoring.

Get updated gene symbols

Let’s see if many of the gene symbols have changed.

Comparing Ensembl release 90 to GENCODE Release 49 (GRCh38.p14) which corresponds to Ensembl 115 (Sept 2025).

So v90 had 58k genes and v115 has 86k, which is a big increase. v90 has 1609 genes that have since been removed and v115 has 29676 that have been added. There are 56693 in common.

From these common ones, 20456 have different gene names. I think the approach to naming these putative genes has changed a lot. Lots of CXorfXX genes have changed, not just AL358472.5 type names.

In the next chunk, I’ll show a few lines of the older source gene names.

There is also an Euler diagram comparing the gene ID repertoires of the two annotation releases.

# Older gene information table hosted at dee2.io; the first line of the file
# supplies the column names.
gt <- read.table(
  file = "https://dee2.io/data/hsapiens/hsa_gene_info.tsv",
  header = TRUE
)
head(gt)

##            GeneID  GeneSymbol mean median longest_isoform merged
## 1 ENSG00000223972     DDX11L1 1144   1144            1657   1735
## 2 ENSG00000227232      WASH7P 1351   1351            1351   1351
## 3 ENSG00000278267   MIR6859-1   68     68              68     68
## 4 ENSG00000243485 MIR1302-2HG  623    623             712   1021
## 5 ENSG00000284332   MIR1302-2  138    138             138    138
## 6 ENSG00000237613     FAM138A  888    888            1187   1219

# Newer gene-name table, read without a header row, so its columns get the
# default names V1, V2, ...
gtnew <- read.table(file = "ref/gencode.v49.genenames.tsv")

# Euler diagram of the gene ID sets of the two annotations
v1 <- list(Ens90 = gt$GeneID, Ens115 = gtnew$V1)
plot(
  euler(v1),
  quantities = list(cex = 2),
  labels = list(cex = 2)
)

# Sizes of the two gene ID sets and of their overlap / differences.
# Note: length() counts entries, so the first two figures equal the number of
# unique genes only when the ID columns contain no duplicates.
message("Number of unique genes in the original gene table:")

## Number of unique genes in the original gene table:

length(gt[["GeneID"]])

## [1] 58302

message("Number of unique genes in the new gene annotation set:")

## Number of unique genes in the new gene annotation set:

length(gtnew[["V1"]])

## [1] 86369

# IDs present in both annotations
message("Number of shared gene IDs between new and old annotations:")

## Number of shared gene IDs between new and old annotations:

length(intersect(gt[["GeneID"]], gtnew[["V1"]]))

## [1] 56693

# IDs found only in the older table
message("Number of gene IDs specific to the old annotation:")

## Number of gene IDs specific to the old annotation:

length(setdiff(gt[["GeneID"]], gtnew[["V1"]]))

## [1] 1609

# IDs found only in the newer table
message("Number of gene IDs specific to the new annotation:")

## Number of gene IDs specific to the new annotation:

length(setdiff(gtnew[["V1"]], gt[["GeneID"]]))

## [1] 29676

message("Here is the mapping table for old-new gene symbols:")

## Here is the mapping table for old-new gene symbols:

# Join the two tables on the gene ID; merge() keeps only the IDs present in
# both, so each row pairs the old symbol (GeneSymbol) with the new one (V2).
gtm <- merge(
  x = gt,
  y = gtnew,
  by.x = "GeneID",
  by.y = "V1"
)

message("Number of rows:")

## Number of rows:

nrow(gtm)

## [1] 56693

message("How many of these gene symbols are the same?")

## How many of these gene symbols are the same?

# TRUE = symbol unchanged between annotations, FALSE = symbol differs
table(gtm[["GeneSymbol"]] == gtm[["V2"]])

## 
## FALSE  TRUE 
## 20456 36237

## 
## FALSE  TRUE 
## 20456 36237

Save data

# Create the output folder only when it is missing, so that re-running the
# report does not raise an "already exists" warning.
if (!dir.exists("data")) {
  dir.create("data")
}

# Tab-separated DE table with the row names exposed as a leading GeneID column
detsv <- cbind(rownames(def), def)
colnames(detsv)[1] <- "GeneID"
write.table(detsv, "data/de.tsv", sep = "\t", row.names = FALSE)

# Save the whole workspace, plus the DE table on its own as an RDS file
save.image("data/de.Rdata")
saveRDS(def, "data/de.Rds")

Session information

sessionInfo()

## R version 4.5.3 (2026-03-11)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 24.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so;  LAPACK version 3.12.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Australia/Melbourne
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] vioplot_0.5.1               zoo_1.8-15                 
##  [3] sm_2.2-6.0                  beeswarm_0.4.0             
##  [5] RhpcBLASctl_0.23-42         fgsea_1.34.2               
##  [7] mitch_1.20.0                gplots_3.3.0               
##  [9] eulerr_7.0.4                kableExtra_1.4.0           
## [11] DESeq2_1.48.2               SummarizedExperiment_1.38.1
## [13] Biobase_2.68.0              MatrixGenerics_1.20.0      
## [15] matrixStats_1.5.0           GenomicRanges_1.60.0       
## [17] GenomeInfoDb_1.44.3         IRanges_2.42.0             
## [19] S4Vectors_0.46.0            BiocGenerics_0.54.1        
## [21] generics_0.1.4              getDEE2_1.18.0             
## 
## loaded via a namespace (and not attached):
##  [1] bitops_1.0-9            gridExtra_2.3           echarts4r_0.5.0        
##  [4] rlang_1.1.7             magrittr_2.0.4          otel_0.2.0             
##  [7] compiler_4.5.3          polylabelr_1.0.0        systemfonts_1.3.1      
## [10] vctrs_0.7.1             reshape2_1.4.5          htm2txt_2.2.2          
## [13] stringr_1.6.0           pkgconfig_2.0.3         crayon_1.5.3           
## [16] fastmap_1.2.0           XVector_0.48.0          caTools_1.18.3         
## [19] promises_1.5.0          rmarkdown_2.30          UCSC.utils_1.4.0       
## [22] network_1.20.0          purrr_1.2.1             xfun_0.56              
## [25] cachem_1.1.0            jsonlite_2.0.0          later_1.4.6            
## [28] DelayedArray_0.34.1     BiocParallel_1.42.2     R6_2.6.1               
## [31] bslib_0.10.0            stringi_1.8.7           RColorBrewer_1.1-3     
## [34] GGally_2.4.0            jquerylib_0.1.4         Rcpp_1.1.1             
## [37] knitr_1.51              httpuv_1.6.16           Matrix_1.7-4           
## [40] tidyselect_1.2.1        rstudioapi_0.18.0       dichromat_2.0-0.1      
## [43] abind_1.4-8             yaml_2.3.12             codetools_0.2-20       
## [46] lattice_0.22-9          tibble_3.3.1            plyr_1.8.9             
## [49] shiny_1.12.1            S7_0.2.1                coda_0.19-4.1          
## [52] evaluate_1.0.5          polyclip_1.10-7         ggstats_0.12.0         
## [55] xml2_1.5.2              pillar_1.11.1           KernSmooth_2.23-26     
## [58] ggplot2_4.0.2           scales_1.4.0            gtools_3.9.5           
## [61] xtable_1.8-4            glue_1.8.0              tools_4.5.3            
## [64] data.table_1.18.2.1     locfit_1.5-9.12         fastmatch_1.1-8        
## [67] cowplot_1.2.0           grid_4.5.3              tidyr_1.3.2            
## [70] GenomeInfoDbData_1.2.14 cli_3.6.5               textshaping_1.0.4      
## [73] S4Arrays_1.8.1          viridisLite_0.4.3       svglite_2.2.2          
## [76] dplyr_1.2.0             gtable_0.3.6            sass_0.4.10            
## [79] digest_0.6.39           SparseArray_1.8.1       htmlwidgets_1.6.4      
## [82] farver_2.1.2            htmltools_0.5.9         lifecycle_1.0.5        
## [85] httr_1.4.8              statnet.common_4.13.0   mime_0.13              
## [88] MASS_7.3-65