Example bulk RNA-seq analysis 1

Source: https://github.com/markziemann/background

Intro

Here we analyse some gene expression data to demonstrate the difference between over-representation analysis (ORA) and functional class scoring (FCS) methods, and to highlight the differences caused by using an improper background gene set.

The dataset being used is SRP128998 and we are comparing the cells grown in normal glucose condition (control) to the high glucose condition (case).

Data are obtained from http://dee2.io/

suppressPackageStartupMessages({
  library("getDEE2")
  library("DESeq2")
  library("clusterProfiler")
  library("mitch")
  library("kableExtra")
  library("eulerr")
  library("biomaRt")
})

Get expression data and make an MDS plot

I’m using some RNA-seq data looking at the effect of hyperglycemia on hepatocytes.

# Study accession; also passed to the plotting functions below as the title.
name <- "SRP128998"
mdat <- getDEE2Metadata("hsapiens")

# Keep the runs that belong to this study and sort them by run accession so
# the positional group labels below line up with the intended runs.
samplesheet <- mdat[grep(name, mdat$SRP_accession, fixed = TRUE), ]
samplesheet <- samplesheet[order(samplesheet$SRR_accession), ]

# The labels below are positional and written for exactly 12 runs. Assigning a
# 12-element vector to a data frame with 24, 36, ... rows would be recycled
# without any error, so fail early if the run count is not what is expected.
stopifnot(nrow(samplesheet) == 12L)

# trt: 1 = high glucose, 0 = low glucose (see the experiment titles).
samplesheet$trt <- as.factor(c(1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0))
# VPA: flags runs 4-6 and 10-12 of the sorted sheet.
# NOTE(review): presumably valproic acid treatment; confirm against metadata.
samplesheet$VPA <- as.factor(c(0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1))

# Only the runs with VPA == 0 are analysed here.
s1 <- subset(samplesheet, VPA == 0)

# Display the subsetted sample sheet as a formatted table.
# FALSE is spelled out because F is an ordinary variable that can be reassigned.
s1 %>%
  kbl(caption = "sample sheet") %>%
  kable_paper("hover", full_width = FALSE)

sample sheet
	SRR_accession	QC_summary	SRX_accession	SRS_accession	SRP_accession	Experiment_title	GEO_series	trt
406940	SRR6467479	PASS	SRX3557428	SRS2830728	SRP128998	GSM2932791: high glucose replicate 1; Homo sapiens; RNA-Seq	GSE109140	1
406941	SRR6467480	PASS	SRX3557429	SRS2830730	SRP128998	GSM2932792: high glucose replicate 2; Homo sapiens; RNA-Seq	GSE109140	1
406942	SRR6467481	PASS	SRX3557430	SRS2830729	SRP128998	GSM2932793: high glucose replicate 3; Homo sapiens; RNA-Seq	GSE109140	1
406946	SRR6467485	PASS	SRX3557434	SRS2830733	SRP128998	GSM2932797: low glucose replicate 1; Homo sapiens; RNA-Seq	GSE109140	0
406947	SRR6467486	PASS	SRX3557435	SRS2830734	SRP128998	GSM2932798: low glucose replicate 2; Homo sapiens; RNA-Seq	GSE109140	0
406948	SRR6467487	PASS	SRX3557436	SRS2830735	SRP128998	GSM2932799: low glucose replicate 3; Homo sapiens; RNA-Seq	GSE109140	0

# Fetch the DEE2 data for every run listed in the full sample sheet
# (all runs of the study, not only the subset kept in s1).
w <- getDEE2(
  "hsapiens",
  samplesheet$SRR_accession,
  metadata = mdat,
  legacy = TRUE
)

## For more information about DEE2 QC metrics, visit
##     https://github.com/markziemann/dee2/blob/master/qc/qc_metrics.md

# Add the gene-level table to the DEE2 object and keep only that table.
x <- Tx2Gene(w)
x <- x$Tx2Gene

# save the gene table for later
gt <- w$GeneInfo[, 1, drop = FALSE]
gt$accession <- rownames(gt)

# counts: select the runs in s1 by name and in s1's row order, so the count
# columns are guaranteed to line up with the rows of the colData given to
# DESeq2. The row names of s1 are not run accessions, so DESeq2 cannot detect
# an ordering mismatch itself. Stop if a run is missing rather than silently
# dropping it.
runs <- as.character(s1$SRR_accession)
stopifnot(all(runs %in% colnames(x)))
x1 <- x[, runs, drop = FALSE]

Here we show the number of genes in the annotation set and the number detected above the detection threshold.

# Keep only genes whose mean count across the selected runs is at least 10,
# then print the number of genes in the unfiltered table for comparison.
x1 <- x1[
  which(rowSums(x1) / ncol(x1) >= 10),
]
nrow(x)

## [1] 39297

# number of genes remaining after the expression filter
nrow(x1)

## [1] 15635

Now we make a multidimensional scaling (MDS) plot to show the similarity between the samples. If the control and case samples cluster separately, then it is likely that there will be many differentially expressed genes with FDR<0.05.

# MDS of the distances between samples (the columns of x1), with each point
# coloured by its treatment group.
plot(
  cmdscale(dist(t(x1))),
  xlab = "Coordinate 1",
  ylab = "Coordinate 2",
  pch = 19,
  col = s1$trt,
  main = "MDS"
)

Differential expression

Now run DESeq2 for control vs case.

# Build the DESeq2 data set from the rounded counts, with treatment group as
# the only term in the design.
y <- DESeqDataSetFromMatrix(
  countData = round(x1),
  colData = s1,
  design = ~ trt
)

## converting counts to integer mode

# run the DESeq2 pipeline (size factors, dispersions, model fitting and testing)
y <- DESeq(y)

## estimating size factors

## estimating dispersions

## gene-wise dispersion estimates

## mean-dispersion relationship

## final dispersion estimates

## fitting model and testing

# Extract the DESeq2 results and rank genes from smallest to largest p-value.
de <- results(y)
de <- as.data.frame(de[order(de$pvalue), ])

# Strip everything from the first "." onwards from each gene ID.
# sub() is vectorised and always returns a character vector, whereas the
# return type of sapply() depends on its input.
rownames(de) <- sub("\\..*$", "", rownames(de))

# Show the top-ranked genes.
head(de) %>%
  kbl() %>%
  kable_paper("hover", full_width = FALSE)

	baseMean	log2FoldChange	lfcSE	stat
ENSG00000145050	5839.731	-2.753692	0.1518338	-18.13623
ENSG00000149131	1346.633	2.161115	0.1427218	15.14215
ENSG00000044574	124889.027	-2.033391	0.1343040	-15.14021
ENSG00000128228	1676.368	-2.836358	0.1895729	-14.96183
ENSG00000179218	78785.663	-2.227516	0.1586383	-14.04148
ENSG00000090520	6751.044	-2.138112	0.1538129	-13.90073

Now let’s have a look at some of the charts showing differential expression. In particular, an MA plot and volcano plot.

# Draw an MA plot (log2 base mean against log2 fold change).
#
# de:            data frame with columns baseMean, log2FoldChange and padj,
#                one row per gene.
# contrast_name: text used as the plot title.
#
# All genes are drawn in dark gray and genes with padj < 0.05 are overplotted
# in red. The subtitle gives the number of significant genes with positive and
# negative fold change and the total number of rows in `de`.
# Called for its side effect of drawing on the active graphics device.
maplot <- function(de, contrast_name) {
  # subset() treats an NA condition as FALSE, so genes whose padj is NA are
  # never counted or highlighted as significant.
  sig <- subset(de, padj < 0.05)
  n_up <- nrow(subset(sig, log2FoldChange > 0))
  n_dn <- nrow(subset(sig, log2FoldChange < 0))
  n_det <- nrow(de)
  subheader <- paste0(n_up, " up, ", n_dn, " down, ", n_det, " detected")
  plot(
    log2(de$baseMean), de$log2FoldChange,
    xlab = "log2 basemean", ylab = "log2 foldchange",
    pch = 19, cex = 0.5, col = "dark gray",
    main = contrast_name, cex.main = 0.7
  )
  points(
    log2(sig$baseMean), sig$log2FoldChange,
    pch = 19, cex = 0.5, col = "red"
  )
  mtext(subheader, cex = 0.7)
}

# Draw a volcano plot (log2 fold change against -log10 adjusted p-value).
#
# de:   data frame with columns log2FoldChange and padj, one row per gene.
# name: text used as the plot title.
#
# All genes are drawn in dark gray and genes with padj < 0.05 are overplotted
# in red. The header line gives the number of significant genes, how many of
# them have positive or negative fold change, and the total number of rows in
# `de`. The x axis is fixed to [-6, 6], so genes with a larger absolute log2
# fold change are not visible.
# Called for its side effect of drawing on the active graphics device.
make_volcano <- function(de, name) {
  # subset() treats an NA condition as FALSE, so genes whose padj is NA are
  # never counted or highlighted as significant.
  sig <- subset(de, padj < 0.05)
  n_sig <- nrow(sig)
  n_up <- nrow(subset(sig, log2FoldChange > 0))
  n_dn <- nrow(subset(sig, log2FoldChange < 0))
  n_det <- nrow(de)
  header <- paste(n_sig, "@5%FDR,", n_up, "up", n_dn, "dn", n_det, "detected")
  # The y axis shows the adjusted p-value (padj), so it is labelled as such.
  plot(
    de$log2FoldChange, -log10(de$padj),
    cex = 0.5, pch = 19, col = "darkgray",
    main = name, xlab = "log2 FC", ylab = "-log10 adjusted p-value",
    xlim = c(-6, 6)
  )
  mtext(header)
  grid()
  points(
    sig$log2FoldChange, -log10(sig$padj),
    cex = 0.5, pch = 19, col = "red"
  )
}

# MA plot of the differential expression results, titled with the study ID.
maplot(de, contrast_name = name)

# Volcano plot of the same results.
make_volcano(de, name = name)

Next, add gene symbols to the results table.

# Select the human gene dataset of the Ensembl BioMart.
mart <- useDataset(
  "hsapiens_gene_ensembl",
  useMart("ensembl")
)

# Look up the HGNC symbol for each gene ID in the results table.
genes <- getBM(
  attributes = c("ensembl_gene_id", "hgnc_symbol"),
  filters = "ensembl_gene_id",
  values = rownames(de),
  mart = mart
)

# Join on the gene ID (the row names of de). merge() keeps only genes present
# in both tables. The new row names are "<gene ID> <symbol>", after which the
# two helper columns are removed.
m <- merge(de, genes, by.x = 0, by.y = "ensembl_gene_id")
rownames(m) <- paste(m$Row.names, m$hgnc_symbol)
m$hgnc_symbol <- NULL
m$Row.names <- NULL
# dimensions of the results table before the gene symbols were merged in
dim(de)

## [1] 15635     6

# dimensions of the merged table (only genes matched in the merge are kept)
dim(m)

## [1] 15520     6

Save table

# Write the annotated results table to an RDS file in the working directory.
saveRDS(m, file = "bulkrna1.Rds")

Session information

# print the R version, platform and package versions used for this analysis
sessionInfo()

## R version 4.4.0 (2024-04-24)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 22.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0 
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
## 
## locale:
##  [1] LC_CTYPE=en_AU.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_AU.UTF-8        LC_COLLATE=en_AU.UTF-8    
##  [5] LC_MONETARY=en_AU.UTF-8    LC_MESSAGES=en_AU.UTF-8   
##  [7] LC_PAPER=en_AU.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Australia/Melbourne
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] biomaRt_2.58.0              eulerr_7.0.2               
##  [3] kableExtra_1.4.0            mitch_1.14.0               
##  [5] clusterProfiler_4.10.0      DESeq2_1.42.0              
##  [7] SummarizedExperiment_1.32.0 Biobase_2.62.0             
##  [9] MatrixGenerics_1.14.0       matrixStats_1.3.0          
## [11] GenomicRanges_1.54.1        GenomeInfoDb_1.38.5        
## [13] IRanges_2.36.0              S4Vectors_0.40.2           
## [15] BiocGenerics_0.48.1         getDEE2_1.12.0             
## 
## loaded via a namespace (and not attached):
##   [1] splines_4.4.0           later_1.3.2             bitops_1.0-7           
##   [4] ggplotify_0.1.2         filelock_1.0.3          tibble_3.2.1           
##   [7] polyclip_1.10-6         XML_3.99-0.16.1         lifecycle_1.0.4        
##  [10] lattice_0.22-6          MASS_7.3-60.2           magrittr_2.0.3         
##  [13] sass_0.4.9              rmarkdown_2.26          jquerylib_0.1.4        
##  [16] yaml_2.3.8              httpuv_1.6.15           cowplot_1.1.3          
##  [19] DBI_1.2.2               RColorBrewer_1.1-3      abind_1.4-5            
##  [22] zlibbioc_1.48.0         purrr_1.0.2             ggraph_2.2.1           
##  [25] RCurl_1.98-1.14         yulab.utils_0.1.4       tweenr_2.0.3           
##  [28] rappdirs_0.3.3          GenomeInfoDbData_1.2.11 enrichplot_1.22.0      
##  [31] ggrepel_0.9.5           tidytree_0.4.6          svglite_2.1.3          
##  [34] codetools_0.2-20        DelayedArray_0.28.0     DOSE_3.28.2            
##  [37] xml2_1.3.6              ggforce_0.4.2           tidyselect_1.2.1       
##  [40] aplot_0.2.2             farver_2.1.1            viridis_0.6.5          
##  [43] BiocFileCache_2.10.1    jsonlite_1.8.8          tidygraph_1.3.1        
##  [46] systemfonts_1.0.6       tools_4.4.0             progress_1.2.3         
##  [49] treeio_1.26.0           Rcpp_1.0.12             glue_1.7.0             
##  [52] gridExtra_2.3           SparseArray_1.2.3       xfun_0.43              
##  [55] qvalue_2.34.0           dplyr_1.1.4             withr_3.0.0            
##  [58] fastmap_1.1.1           GGally_2.2.1            fansi_1.0.6            
##  [61] caTools_1.18.2          digest_0.6.35           R6_2.5.1               
##  [64] mime_0.12               gridGraphics_0.5-1      colorspace_2.1-0       
##  [67] GO.db_3.18.0            gtools_3.9.5            RSQLite_2.3.6          
##  [70] utf8_1.2.4              tidyr_1.3.1             generics_0.1.3         
##  [73] data.table_1.15.4       prettyunits_1.2.0       graphlayouts_1.1.1     
##  [76] httr_1.4.7              htmlwidgets_1.6.4       S4Arrays_1.2.0         
##  [79] scatterpie_0.2.2        ggstats_0.6.0           pkgconfig_2.0.3        
##  [82] gtable_0.3.5            blob_1.2.4              XVector_0.42.0         
##  [85] shadowtext_0.1.3        htmltools_0.5.8.1       fgsea_1.28.0           
##  [88] echarts4r_0.4.5         scales_1.3.0            png_0.1-8              
##  [91] ggfun_0.1.4             knitr_1.46              rstudioapi_0.16.0      
##  [94] reshape2_1.4.4          nlme_3.1-164            curl_5.2.1             
##  [97] cachem_1.0.8            stringr_1.5.1           KernSmooth_2.23-22     
## [100] parallel_4.4.0          HDO.db_0.99.1           AnnotationDbi_1.64.1   
## [103] pillar_1.9.0            grid_4.4.0              vctrs_0.6.5            
## [106] gplots_3.1.3.1          promises_1.3.0          dbplyr_2.5.0           
## [109] xtable_1.8-4            beeswarm_0.4.0          evaluate_0.23          
## [112] cli_3.6.2               locfit_1.5-9.9          compiler_4.4.0         
## [115] rlang_1.1.3             crayon_1.5.2            plyr_1.8.9             
## [118] fs_1.6.4                stringi_1.8.3           viridisLite_0.4.2      
## [121] BiocParallel_1.36.0     htm2txt_2.2.2           munsell_0.5.1          
## [124] Biostrings_2.70.1       lazyeval_0.2.2          GOSemSim_2.28.0        
## [127] Matrix_1.7-0            hms_1.1.3               patchwork_1.2.0        
## [130] bit64_4.0.5             ggplot2_3.5.1           KEGGREST_1.42.0        
## [133] shiny_1.8.1.1           highr_0.10              igraph_2.0.3           
## [136] memoise_2.0.1           bslib_0.7.0             ggtree_3.10.0          
## [139] fastmatch_1.1-4         bit_4.0.5               ape_5.8                
## [142] gson_0.1.0