Source: https://github.com/markziemann/background

Intro

Here we are performing an analysis of some gene expression data to demonstrate the difference between ORA and FCS methods and to highlight the differences caused by improper background gene set use.

The dataset being used is SRP128998 and we are comparing the cells grown in normal glucose condition (control) to the high glucose condition (case).

Data are obtained from http://dee2.io/

suppressPackageStartupMessages({
  library("kableExtra")
  library("eulerr")
  library("gplots")
  library("getDEE2")
  library("DESeq2")
})

Get expression data and make an MDS plot

name = "SRP128998"
mdat <- getDEE2Metadata("hsapiens")
samplesheet <- mdat[grep("SRP128998",mdat$SRP_accession),]
samplesheet <- samplesheet[order(samplesheet$SRR_accession),]
samplesheet$trt <- as.factor(c(1,1,1,1,1,1,0,0,0,0,0,0))
samplesheet$VPA <- as.factor(c(0,0,0,1,1,1,0,0,0,1,1,1))
s1 <- subset(samplesheet,VPA==0)

s1 %>%
  kbl(caption = "sample sheet") %>%
  kable_paper("hover", full_width = F)
sample sheet
SRR_accession QC_summary SRX_accession SRS_accession SRP_accession Experiment_title GEO_series trt VPA
406940 SRR6467479 PASS SRX3557428 SRS2830728 SRP128998 GSM2932791: high glucose replicate 1; Homo sapiens; RNA-Seq GSE109140 1 0
406941 SRR6467480 PASS SRX3557429 SRS2830730 SRP128998 GSM2932792: high glucose replicate 2; Homo sapiens; RNA-Seq GSE109140 1 0
406942 SRR6467481 PASS SRX3557430 SRS2830729 SRP128998 GSM2932793: high glucose replicate 3; Homo sapiens; RNA-Seq GSE109140 1 0
406946 SRR6467485 PASS SRX3557434 SRS2830733 SRP128998 GSM2932797: low glucose replicate 1; Homo sapiens; RNA-Seq GSE109140 0 0
406947 SRR6467486 PASS SRX3557435 SRS2830734 SRP128998 GSM2932798: low glucose replicate 2; Homo sapiens; RNA-Seq GSE109140 0 0
406948 SRR6467487 PASS SRX3557436 SRS2830735 SRP128998 GSM2932799: low glucose replicate 3; Homo sapiens; RNA-Seq GSE109140 0 0
w <- getDEE2("hsapiens", samplesheet$SRR_accession,
  metadata=mdat,legacy = TRUE)
## For more information about DEE2 QC metrics, visit
##     https://github.com/markziemann/dee2/blob/master/qc/qc_metrics.md
x <- Tx2Gene(w)
x <- x$Tx2Gene

# table of gene symbols
gt <- w$GeneInfo[,1,drop=FALSE]
gt$accession <- rownames(gt)

# fix gene symbols
rownames(x) <- sapply(strsplit(rownames(x),"\\."),"[[",1)
x  <- merge(gt,x,by=0)
rownames(x) <- paste(x$Row.names, x$GeneSymbol)
x <- x[,-c(1:3)]

# counts
x1 <- x[,which(colnames(x) %in% s1$SRR_accession)]

colnames(x1) <- c("HG1","HG2","HG3","NG1","NG2","NG3")

head(x1) %>%
  kbl(caption = "counts") %>%
  kable_paper("hover", full_width = F)
counts
HG1 HG2 HG3 NG1 NG2 NG3
ENSG00000000003 TSPAN6 3456.1659 3182.2568 3967.1044 2581.3915 4847.4390 5116.2807
ENSG00000000005 TNMD 1.0000 0.0000 0.0000 1.0000 0.0000 0.0000
ENSG00000000419 DPM1 1712.0049 1412.9984 1807.9974 1200.9997 2243.9995 2188.0024
ENSG00000000457 SCYL3 276.2935 269.0275 295.5115 156.4475 253.5052 342.9771
ENSG00000000460 C1orf112 324.7927 381.3270 445.5202 168.1013 326.5366 353.9981
ENSG00000000938 FGR 0.0000 1.0000 0.0000 0.0000 0.0000 0.0000

Here show the number of genes in the annotation set, and those detected above the detection threshold.

# filter out lowly expressed genes
x1 <- x1[which(rowSums(x1)/ncol(x1)>=(10)),]
nrow(x)
## [1] 34947
nrow(x1)
## [1] 14540

Now multidimensional scaling (MDS) plot to show the correlation between the datasets. If the control and case datasets are clustered separately, then it is likely that there will be many differentially expressed genes with FDR<0.05.

plot(cmdscale(dist(t(x1))), xlab="Coordinate 1", ylab="Coordinate 2", pch=19, col=s1$trt, main="MDS")

Differential expression

Now run DESeq2 for control vs case.

y <- DESeqDataSetFromMatrix(countData = round(x1), colData = s1, design = ~ trt)
## converting counts to integer mode
y <- DESeq(y)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
de <- results(y)
de <- as.data.frame(de[order(de$pvalue),])
#rownames(de) <- sapply(strsplit(rownames(de),"\\."),"[[",1)
head(de) %>% kbl() %>% kable_paper("hover", full_width = F)
baseMean log2FoldChange lfcSE stat pvalue padj
ENSG00000145050 MANF 5844.701 -2.755134 0.1532715 -17.97551 0 0
ENSG00000128228 SDF2L1 1678.055 -2.837724 0.1876598 -15.12164 0 0
ENSG00000149131 SERPING1 1346.796 2.160230 0.1435352 15.05018 0 0
ENSG00000044574 HSPA5 124977.728 -2.035024 0.1370717 -14.84642 0 0
ENSG00000179218 CALR 78846.627 -2.228966 0.1597371 -13.95396 0 0
ENSG00000090520 DNAJB11 6756.034 -2.139771 0.1547021 -13.83156 0 0

Now let’s have a look at some of the charts showing differential expression. In particular, an MA plot and volcano plot.

maplot <- function(de,contrast_name) {
  sig <-subset(de, padj < 0.05 )
  up <-rownames(subset(de, padj < 0.05 & log2FoldChange > 0))
  dn <-rownames(subset(de, padj < 0.05 & log2FoldChange < 0))
  GENESUP <- length(up)
  GENESDN <- length(dn)
  DET=nrow(de)
  SUBHEADER = paste(GENESUP, "up, ", GENESDN, "down", DET, "detected")
  ns <-subset(de, padj > 0.05 )
  plot(log2(de$baseMean),de$log2FoldChange, 
       xlab="log2 basemean", ylab="log2 foldchange",
       pch=19, cex=0.5, col="dark gray",
       main=contrast_name, cex.main=0.7)
  points(log2(sig$baseMean),sig$log2FoldChange,
         pch=19, cex=0.5, col="red")
  mtext(SUBHEADER,cex = 0.7)
}

make_volcano <- function(de,name) {
    sig <- subset(de,padj<0.05)
    N_SIG=nrow(sig)
    N_UP=nrow(subset(sig,log2FoldChange>0))
    N_DN=nrow(subset(sig,log2FoldChange<0))
    DET=nrow(de)
    HEADER=paste(N_SIG,"@5%FDR,", N_UP, "up", N_DN, "dn", DET, "detected")
    plot(de$log2FoldChange,-log10(de$padj),cex=0.5,pch=19,col="darkgray",
        main=name, xlab="log2 FC", ylab="-log10 pval", xlim=c(-6,6))
    mtext(HEADER)
    grid()
    points(sig$log2FoldChange,-log10(sig$padj),cex=0.5,pch=19,col="red")
}

maplot(de,name)

make_volcano(de,name)

Heatmap of top genes.

colfunc <- colorRampPalette(c("blue", "white", "red"))
topgenes <- rownames(head(de,30))
rpm <- apply(x1,2,function(x) {x / sum(x) * 1e6 } )
top <- rpm[which(rownames(rpm) %in% topgenes),]
heatmap.2(top,trace="none",col=colfunc(25),scale="row", margins = c(10,20))

Save table

saveRDS(de,"workflow.Rds")

Session information

sessionInfo()
## R version 4.4.0 (2024-04-24)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 22.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so;  LAPACK version 3.10.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Etc/UTC
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] DESeq2_1.44.0               SummarizedExperiment_1.34.0
##  [3] Biobase_2.64.0              MatrixGenerics_1.16.0      
##  [5] matrixStats_1.3.0           GenomicRanges_1.56.0       
##  [7] GenomeInfoDb_1.40.1         IRanges_2.38.0             
##  [9] S4Vectors_0.42.0            BiocGenerics_0.50.0        
## [11] getDEE2_1.14.0              gplots_3.1.3.1             
## [13] eulerr_7.0.2                kableExtra_1.4.0           
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.5            xfun_0.43               bslib_0.7.0            
##  [4] ggplot2_3.5.1           caTools_1.18.2          lattice_0.22-6         
##  [7] vctrs_0.6.5             tools_4.4.0             bitops_1.0-7           
## [10] parallel_4.4.0          fansi_1.0.6             tibble_3.2.1           
## [13] highr_0.10              pkgconfig_2.0.3         Matrix_1.7-0           
## [16] KernSmooth_2.23-22      lifecycle_1.0.4         GenomeInfoDbData_1.2.12
## [19] compiler_4.4.0          stringr_1.5.1           munsell_0.5.1          
## [22] htm2txt_2.2.2           codetools_0.2-20        htmltools_0.5.8.1      
## [25] sass_0.4.9              yaml_2.3.8              pillar_1.9.0           
## [28] crayon_1.5.2            jquerylib_0.1.4         BiocParallel_1.38.0    
## [31] cachem_1.0.8            DelayedArray_0.30.1     abind_1.4-5            
## [34] gtools_3.9.5            locfit_1.5-9.9          digest_0.6.35          
## [37] stringi_1.8.4           fastmap_1.1.1           grid_4.4.0             
## [40] colorspace_2.1-0        cli_3.6.2               SparseArray_1.4.8      
## [43] magrittr_2.0.3          S4Arrays_1.4.1          utf8_1.2.4             
## [46] scales_1.3.0            UCSC.utils_1.0.0        rmarkdown_2.26         
## [49] XVector_0.44.0          httr_1.4.7              evaluate_0.23          
## [52] knitr_1.46              viridisLite_0.4.2       rlang_1.1.3            
## [55] Rcpp_1.0.12             glue_1.7.0              xml2_1.3.6             
## [58] svglite_2.1.3           rstudioapi_0.16.0       jsonlite_1.8.8         
## [61] R6_2.5.1                systemfonts_1.0.6       zlibbioc_1.50.0