Example workflow for bulk RNA-seq analysis

Source: https://github.com/markziemann/background

Intro

Here we are performing an analysis of some gene expression data to demonstrate the difference between ORA and FCS methods and to highlight the differences caused by improper background gene set use.

The dataset being used is SRP128998 and we are comparing the cells grown in normal glucose condition (control) to the high glucose condition (case).

Data are obtained from http://dee2.io/

suppressPackageStartupMessages({
  library("kableExtra")
  library("eulerr")
  library("gplots")
  library("getDEE2")
  library("DESeq2")
})

Get expression data and make an MDS plot

name = "SRP128998"
mdat <- getDEE2Metadata("hsapiens")
samplesheet <- mdat[grep("SRP128998",mdat$SRP_accession),]
samplesheet <- samplesheet[order(samplesheet$SRR_accession),]
samplesheet$trt <- as.factor(c(1,1,1,1,1,1,0,0,0,0,0,0))
samplesheet$VPA <- as.factor(c(0,0,0,1,1,1,0,0,0,1,1,1))
s1 <- subset(samplesheet,VPA==0)

s1 %>%
  kbl(caption = "sample sheet") %>%
  kable_paper("hover", full_width = F)

sample sheet
	SRR_accession	QC_summary	SRX_accession	SRS_accession	SRP_accession	Experiment_title	GEO_series	trt
406940	SRR6467479	PASS	SRX3557428	SRS2830728	SRP128998	GSM2932791: high glucose replicate 1; Homo sapiens; RNA-Seq	GSE109140	1
406941	SRR6467480	PASS	SRX3557429	SRS2830730	SRP128998	GSM2932792: high glucose replicate 2; Homo sapiens; RNA-Seq	GSE109140	1
406942	SRR6467481	PASS	SRX3557430	SRS2830729	SRP128998	GSM2932793: high glucose replicate 3; Homo sapiens; RNA-Seq	GSE109140	1
406946	SRR6467485	PASS	SRX3557434	SRS2830733	SRP128998	GSM2932797: low glucose replicate 1; Homo sapiens; RNA-Seq	GSE109140	0
406947	SRR6467486	PASS	SRX3557435	SRS2830734	SRP128998	GSM2932798: low glucose replicate 2; Homo sapiens; RNA-Seq	GSE109140	0
406948	SRR6467487	PASS	SRX3557436	SRS2830735	SRP128998	GSM2932799: low glucose replicate 3; Homo sapiens; RNA-Seq	GSE109140	0

w <- getDEE2("hsapiens", samplesheet$SRR_accession,
  metadata=mdat,legacy = TRUE)

## For more information about DEE2 QC metrics, visit
##     https://github.com/markziemann/dee2/blob/master/qc/qc_metrics.md

x <- Tx2Gene(w)
x <- x$Tx2Gene

# table of gene symbols
gt <- w$GeneInfo[,1,drop=FALSE]
gt$accession <- rownames(gt)

# fix gene symbols
rownames(x) <- sapply(strsplit(rownames(x),"\\."),"[[",1)
x  <- merge(gt,x,by=0)
rownames(x) <- paste(x$Row.names, x$GeneSymbol)
x <- x[,-c(1:3)]

# counts
x1 <- x[,which(colnames(x) %in% s1$SRR_accession)]

colnames(x1) <- c("HG1","HG2","HG3","NG1","NG2","NG3")

head(x1) %>%
  kbl(caption = "counts") %>%
  kable_paper("hover", full_width = F)

counts
	HG1	HG2	HG3	NG1	NG2	NG3
ENSG00000000003 TSPAN6	3456.1659	3182.2568	3967.1044	2581.3915	4847.4390	5116.2807
ENSG00000000005 TNMD	1.0000	0.0000	0.0000	1.0000	0.0000	0.0000
ENSG00000000419 DPM1	1712.0049	1412.9984	1807.9974	1200.9997	2243.9995	2188.0024
ENSG00000000457 SCYL3	276.2935	269.0275	295.5115	156.4475	253.5052	342.9771
ENSG00000000460 C1orf112	324.7927	381.3270	445.5202	168.1013	326.5366	353.9981
ENSG00000000938 FGR	0.0000	1.0000	0.0000	0.0000	0.0000	0.0000

Here show the number of genes in the annotation set, and those detected above the detection threshold.

# filter out lowly expressed genes
x1 <- x1[which(rowSums(x1)/ncol(x1)>=(10)),]
nrow(x)

## [1] 34947

nrow(x1)

## [1] 14540

Now multidimensional scaling (MDS) plot to show the correlation between the datasets. If the control and case datasets are clustered separately, then it is likely that there will be many differentially expressed genes with FDR<0.05.

plot(cmdscale(dist(t(x1))), xlab="Coordinate 1", ylab="Coordinate 2", pch=19, col=s1$trt, main="MDS")

Differential expression

Now run DESeq2 for control vs case.

y <- DESeqDataSetFromMatrix(countData = round(x1), colData = s1, design = ~ trt)

## converting counts to integer mode

y <- DESeq(y)

## estimating size factors

## estimating dispersions

## gene-wise dispersion estimates

## mean-dispersion relationship

## final dispersion estimates

## fitting model and testing

de <- results(y)
de <- as.data.frame(de[order(de$pvalue),])
#rownames(de) <- sapply(strsplit(rownames(de),"\\."),"[[",1)
head(de) %>% kbl() %>% kable_paper("hover", full_width = F)

	baseMean	log2FoldChange	lfcSE	stat
ENSG00000145050 MANF	5844.701	-2.755134	0.1532715	-17.97551
ENSG00000128228 SDF2L1	1678.055	-2.837724	0.1876598	-15.12164
ENSG00000149131 SERPING1	1346.796	2.160230	0.1435352	15.05018
ENSG00000044574 HSPA5	124977.728	-2.035024	0.1370717	-14.84642
ENSG00000179218 CALR	78846.627	-2.228966	0.1597371	-13.95396
ENSG00000090520 DNAJB11	6756.034	-2.139771	0.1547021	-13.83156

Now let’s have a look at some of the charts showing differential expression. In particular, an MA plot and volcano plot.

maplot <- function(de,contrast_name) {
  sig <-subset(de, padj < 0.05 )
  up <-rownames(subset(de, padj < 0.05 & log2FoldChange > 0))
  dn <-rownames(subset(de, padj < 0.05 & log2FoldChange < 0))
  GENESUP <- length(up)
  GENESDN <- length(dn)
  DET=nrow(de)
  SUBHEADER = paste(GENESUP, "up, ", GENESDN, "down", DET, "detected")
  ns <-subset(de, padj > 0.05 )
  plot(log2(de$baseMean),de$log2FoldChange, 
       xlab="log2 basemean", ylab="log2 foldchange",
       pch=19, cex=0.5, col="dark gray",
       main=contrast_name, cex.main=0.7)
  points(log2(sig$baseMean),sig$log2FoldChange,
         pch=19, cex=0.5, col="red")
  mtext(SUBHEADER,cex = 0.7)
}

make_volcano <- function(de,name) {
    sig <- subset(de,padj<0.05)
    N_SIG=nrow(sig)
    N_UP=nrow(subset(sig,log2FoldChange>0))
    N_DN=nrow(subset(sig,log2FoldChange<0))
    DET=nrow(de)
    HEADER=paste(N_SIG,"@5%FDR,", N_UP, "up", N_DN, "dn", DET, "detected")
    plot(de$log2FoldChange,-log10(de$padj),cex=0.5,pch=19,col="darkgray",
        main=name, xlab="log2 FC", ylab="-log10 pval", xlim=c(-6,6))
    mtext(HEADER)
    grid()
    points(sig$log2FoldChange,-log10(sig$padj),cex=0.5,pch=19,col="red")
}

maplot(de,name)

make_volcano(de,name)

Heatmap of top genes.

colfunc <- colorRampPalette(c("blue", "white", "red"))
topgenes <- rownames(head(de,30))
rpm <- apply(x1,2,function(x) {x / sum(x) * 1e6 } )
top <- rpm[which(rownames(rpm) %in% topgenes),]
heatmap.2(top,trace="none",col=colfunc(25),scale="row", margins = c(10,20))

Save table

saveRDS(de,"workflow.Rds")

Session information

sessionInfo()

## R version 4.4.0 (2024-04-24)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 22.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so;  LAPACK version 3.10.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Etc/UTC
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] DESeq2_1.44.0               SummarizedExperiment_1.34.0
##  [3] Biobase_2.64.0              MatrixGenerics_1.16.0      
##  [5] matrixStats_1.3.0           GenomicRanges_1.56.0       
##  [7] GenomeInfoDb_1.40.1         IRanges_2.38.0             
##  [9] S4Vectors_0.42.0            BiocGenerics_0.50.0        
## [11] getDEE2_1.14.0              gplots_3.1.3.1             
## [13] eulerr_7.0.2                kableExtra_1.4.0           
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.5            xfun_0.43               bslib_0.7.0            
##  [4] ggplot2_3.5.1           caTools_1.18.2          lattice_0.22-6         
##  [7] vctrs_0.6.5             tools_4.4.0             bitops_1.0-7           
## [10] parallel_4.4.0          fansi_1.0.6             tibble_3.2.1           
## [13] highr_0.10              pkgconfig_2.0.3         Matrix_1.7-0           
## [16] KernSmooth_2.23-22      lifecycle_1.0.4         GenomeInfoDbData_1.2.12
## [19] compiler_4.4.0          stringr_1.5.1           munsell_0.5.1          
## [22] htm2txt_2.2.2           codetools_0.2-20        htmltools_0.5.8.1      
## [25] sass_0.4.9              yaml_2.3.8              pillar_1.9.0           
## [28] crayon_1.5.2            jquerylib_0.1.4         BiocParallel_1.38.0    
## [31] cachem_1.0.8            DelayedArray_0.30.1     abind_1.4-5            
## [34] gtools_3.9.5            locfit_1.5-9.9          digest_0.6.35          
## [37] stringi_1.8.4           fastmap_1.1.1           grid_4.4.0             
## [40] colorspace_2.1-0        cli_3.6.2               SparseArray_1.4.8      
## [43] magrittr_2.0.3          S4Arrays_1.4.1          utf8_1.2.4             
## [46] scales_1.3.0            UCSC.utils_1.0.0        rmarkdown_2.26         
## [49] XVector_0.44.0          httr_1.4.7              evaluate_0.23          
## [52] knitr_1.46              viridisLite_0.4.2       rlang_1.1.3            
## [55] Rcpp_1.0.12             glue_1.7.0              xml2_1.3.6             
## [58] svglite_2.1.3           rstudioapi_0.16.0       jsonlite_1.8.8         
## [61] R6_2.5.1                systemfonts_1.0.6       zlibbioc_1.50.0