Example bulk RNA-seq analysis 7

Source: https://github.com/markziemann/background

Intro

Here we analyse a gene expression dataset to demonstrate the difference between over-representation analysis (ORA) and functional class scoring (FCS) methods, and to highlight the differences caused by using an improper background gene set.

The dataset used is SRP068733. We compare healthy endothelial cells treated with a scrambled siRNA to cells treated with a p300-targeting siRNA.

Data are obtained from http://dee2.io/

suppressPackageStartupMessages({
  library("getDEE2")
  library("DESeq2")
  library("clusterProfiler")
  library("mitch")
  library("kableExtra")
  library("eulerr")
  library("biomaRt")
})

Get expression data and make an MDS plot

# Study accession; it selects the runs below and is reused later as the plot title.
name <- "SRP068733"

# Fetch the DEE2 metadata table for human and keep the rows belonging to this study.
mdat <- getDEE2Metadata("hsapiens")
samplesheet <- mdat[grep(name, mdat$SRP_accession), ]

# Sort by run accession so that the positional treatment labels assigned below
# always line up with the same runs.
samplesheet <- samplesheet[order(samplesheet$SRR_accession), ]

# The six runs used in this comparison.
SRRvec <- c(
  "SRR3112216", "SRR3112217", "SRR3112218",
  "SRR3112219", "SRR3112220", "SRR3112221"
)
samplesheet <- samplesheet[samplesheet$SRR_accession %in% SRRvec, ]

# Treatment factor: 0 for the first three runs (after sorting), 1 for the last three.
samplesheet$trt <- factor(rep(c(0, 1), each = 3))
s1 <- samplesheet

# Render the sample sheet as a table. FALSE is spelled out because `F` is an
# ordinary variable that can be reassigned.
s1 %>%
  kbl(caption = "sample sheet") %>%
  kable_paper("hover", full_width = FALSE)

sample sheet
	SRR_accession	QC_summary	SRX_accession	SRS_accession	SRP_accession	Experiment_title	GEO_series	trt
263021	SRR3112216	PASS	SRX1540348	SRS1256815	SRP068733	GSM2044428: Healthy_NT_siRNA_R1; Homo sapiens; RNA-Seq	GSE77108	0
263022	SRR3112217	PASS	SRX1540349	SRS1256814	SRP068733	GSM2044429: Healthy_NT_siRNA_R2; Homo sapiens; RNA-Seq	GSE77108	0
263023	SRR3112218	PASS	SRX1540350	SRS1256812	SRP068733	GSM2044430: Healthy_NT_siRNA_R3; Homo sapiens; RNA-Seq	GSE77108	0
263024	SRR3112219	PASS	SRX1540351	SRS1256813	SRP068733	GSM2044431: Healthy_EP300_siRNA_R1; Homo sapiens; RNA-Seq	GSE77108	1
263025	SRR3112220	PASS	SRX1540352	SRS1256811	SRP068733	GSM2044432: Healthy_EP300_siRNA_R2; Homo sapiens; RNA-Seq	GSE77108	1
263026	SRR3112221	PASS	SRX1540353	SRS1256810	SRP068733	GSM2044433: Healthy_EP300_siRNA_R3; Homo sapiens; RNA-Seq	GSE77108	1

# Fetch the DEE2 data for the runs in SRRvec, passing in the metadata table
# that was already retrieved so it is reused here.
# NOTE(review): the effect of legacy = TRUE is not visible from this file -
# confirm against the getDEE2 documentation.
w <- getDEE2("hsapiens",SRRvec,metadata=mdat,legacy = TRUE)

## For more information about DEE2 QC metrics, visit
##     https://github.com/markziemann/dee2/blob/master/qc/qc_metrics.md

# Run Tx2Gene() on the DEE2 object and keep only its `Tx2Gene` element.
x <- Tx2Gene(w)$Tx2Gene

# Gene table kept for later use: the first GeneInfo column, plus the row names
# copied into an explicit `accession` column.
gt <- w$GeneInfo[, 1, drop = FALSE]
gt$accession <- rownames(gt)

# Count columns restricted to the runs listed in the sample sheet.
x1 <- x[, colnames(x) %in% samplesheet$SRR_accession]

Here we show the number of genes in the annotation set and the number detected above the detection threshold.

# Drop lowly expressed genes: keep rows whose mean count across all samples
# is at least 10.
min_mean_count <- 10
x1 <- x1[which(rowSums(x1) / ncol(x1) >= min_mean_count), ]

# Number of genes before filtering.
nrow(x)

## [1] 39297

nrow(x1)

## [1] 14255

Now we make a multidimensional scaling (MDS) plot to show how similar the samples are to each other. If the control and case samples cluster separately, then it is likely that there will be many differentially expressed genes with FDR<0.05.

# Classical MDS (cmdscale) of the distances between samples: x1 is transposed so
# that each sample becomes a row for dist(). Points are coloured by the trt
# factor of the sample sheet.
# NOTE(review): the colours assume the rows of s1 are in the same order as the
# columns of x1 - confirm.
plot(cmdscale(dist(t(x1))), xlab="Coordinate 1", ylab="Coordinate 2", pch=19, col=s1$trt, main="MDS")

Differential expression

Now run DESeq2 for control vs case.

# Build the DESeq2 dataset from the filtered counts, with the sample sheet as
# column data and treatment as the only term in the design. The counts are
# rounded before being handed over.
y <- DESeqDataSetFromMatrix(countData = round(x1), colData = s1, design = ~ trt)

## converting counts to integer mode

# Run the DESeq2 analysis on the dataset; the object is overwritten with the fitted result.
y <- DESeq(y)

## estimating size factors

## estimating dispersions

## gene-wise dispersion estimates

## mean-dispersion relationship

## final dispersion estimates

## fitting model and testing

# Extract the DESeq2 results and order the table from smallest to largest p-value.
de <- results(y)
de <- as.data.frame(de[order(de$pvalue), ])

# Keep only the part of each row name before the first "." (drops the version
# suffix of the gene accession). vapply() is used instead of sapply() so the
# result is always a character vector, whatever the input looks like.
rownames(de) <- vapply(strsplit(rownames(de), "\\."), "[[", character(1), 1)

# Show the top of the table.
head(de) %>%
  kbl() %>%
  kable_paper("hover", full_width = FALSE)

	baseMean	log2FoldChange	lfcSE	stat
ENSG00000049449	10375.678	-1.319179	0.0346992	-38.01753
ENSG00000065308	9062.133	-1.354569	0.0335259	-40.40362
ENSG00000066056	10429.917	-1.273349	0.0333485	-38.18314
ENSG00000068001	6526.375	-1.635372	0.0423455	-38.61975
ENSG00000076706	25433.097	-1.625573	0.0301945	-53.83679
ENSG00000087245	19077.593	-1.541325	0.0312561	-49.31281

Now let’s have a look at some of the charts showing differential expression. In particular, an MA plot and volcano plot.

# Draw an MA plot (log2 base mean against log2 fold change) for a differential
# expression table.
#
# Arguments:
#   de            data frame with `baseMean`, `log2FoldChange` and `padj` columns.
#   contrast_name string used as the plot title.
#
# All genes are drawn in grey and genes with padj < 0.05 are overdrawn in red.
# A sub-header reports the number of significant up- and down-regulated genes
# and the total number of rows in `de`. Called for its plotting side effect.
maplot <- function(de, contrast_name) {
  # subset() drops rows whose condition is NA, so genes without an adjusted
  # p-value are never counted as significant.
  sig <- subset(de, padj < 0.05)
  n_up <- nrow(subset(sig, log2FoldChange > 0))
  n_dn <- nrow(subset(sig, log2FoldChange < 0))
  n_detected <- nrow(de)

  # Consistent separators between the three counts.
  subheader <- paste0(n_up, " up, ", n_dn, " down, ", n_detected, " detected")

  plot(log2(de$baseMean), de$log2FoldChange,
       xlab = "log2 basemean", ylab = "log2 foldchange",
       pch = 19, cex = 0.5, col = "dark gray",
       main = contrast_name, cex.main = 0.7)
  points(log2(sig$baseMean), sig$log2FoldChange,
         pch = 19, cex = 0.5, col = "red")
  mtext(subheader, cex = 0.7)
}

# Draw a volcano plot (log2 fold change against -log10 adjusted p-value) for a
# differential expression table.
#
# Arguments:
#   de    data frame with `log2FoldChange` and `padj` columns.
#   name  string used as the plot title.
#
# All genes are drawn in grey and genes with padj < 0.05 are overdrawn in red.
# The header reports the number of significant genes, how many go up and down,
# and the total number of rows in `de`. The x axis is fixed to [-6, 6].
# Called for its plotting side effect.
make_volcano <- function(de, name) {
  # subset() drops rows whose condition is NA, so genes without an adjusted
  # p-value are never counted as significant.
  sig <- subset(de, padj < 0.05)
  n_sig <- nrow(sig)
  n_up <- nrow(subset(sig, log2FoldChange > 0))
  n_dn <- nrow(subset(sig, log2FoldChange < 0))
  n_detected <- nrow(de)
  header <- paste(n_sig, "@5%FDR,", n_up, "up", n_dn, "dn", n_detected, "detected")

  # An adjusted p-value of exactly 0 (numeric underflow for very strong effects)
  # gives -log10() = Inf, and such points are silently left off the plot. Floor
  # them at the smallest non-zero value so the most significant genes stay
  # visible. NA values are left as NA.
  positive <- de$padj[!is.na(de$padj) & de$padj > 0]
  padj_floor <- if (length(positive) > 0) min(positive) else .Machine$double.xmin
  neg_log10 <- function(p) -log10(pmax(p, padj_floor))

  # The y axis shows the adjusted p-value (padj), so it is labelled as such.
  plot(de$log2FoldChange, neg_log10(de$padj), cex = 0.5, pch = 19, col = "darkgray",
       main = name, xlab = "log2 FC", ylab = "-log10 adjusted p-value",
       xlim = c(-6, 6))
  mtext(header)
  grid()
  points(sig$log2FoldChange, neg_log10(sig$padj), cex = 0.5, pch = 19, col = "red")
}

# MA plot of the differential expression table, titled with the study accession in `name`.
maplot(de,name)

# Volcano plot of the same table.
make_volcano(de,name)

Next, we add gene symbols to the results table.

# Connect to the "ensembl" mart and select the human gene dataset.
mart <- useDataset(
  dataset = "hsapiens_gene_ensembl",
  mart = useMart(biomart = "ensembl")
)

# Query the HGNC symbol for every gene ID that appears as a row name of `de`.
genes <- getBM(
  attributes = c("ensembl_gene_id", "hgnc_symbol"),
  filters = "ensembl_gene_id",
  values = rownames(de),
  mart = mart
)

## Batch submitting query [=========>---------------------] 33% eta: 14sBatch
## submitting query [====================>----------] 67% eta: 6s

# Join the results (keyed by row name) to the symbol table. merge() keeps only
# the gene IDs present in both tables.
m <- merge(de, genes, by.x = "row.names", by.y = "ensembl_gene_id")

# Label each row "<gene ID> <symbol>", then drop the two columns that were only
# needed to build that label.
rownames(m) <- paste(m$Row.names, m$hgnc_symbol)
m[c("Row.names", "hgnc_symbol")] <- NULL

# Size of the table before the join, for comparison with the merged table.
dim(de)

## [1] 14255     6

dim(m)

## [1] 14155     6

Save table

# Write the annotated results table to an RDS file in the working directory.
saveRDS(m,"bulkrna7.Rds")

Session information

# Print the R version and the attached/loaded package versions for this run.
sessionInfo()

## R version 4.4.0 (2024-04-24)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 22.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0 
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
## 
## locale:
##  [1] LC_CTYPE=en_AU.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_AU.UTF-8        LC_COLLATE=en_AU.UTF-8    
##  [5] LC_MONETARY=en_AU.UTF-8    LC_MESSAGES=en_AU.UTF-8   
##  [7] LC_PAPER=en_AU.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Australia/Melbourne
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] biomaRt_2.58.0              eulerr_7.0.2               
##  [3] kableExtra_1.4.0            mitch_1.14.0               
##  [5] clusterProfiler_4.10.0      DESeq2_1.42.0              
##  [7] SummarizedExperiment_1.32.0 Biobase_2.62.0             
##  [9] MatrixGenerics_1.14.0       matrixStats_1.3.0          
## [11] GenomicRanges_1.54.1        GenomeInfoDb_1.38.5        
## [13] IRanges_2.36.0              S4Vectors_0.40.2           
## [15] BiocGenerics_0.48.1         getDEE2_1.12.0             
## 
## loaded via a namespace (and not attached):
##   [1] splines_4.4.0           later_1.3.2             bitops_1.0-7           
##   [4] ggplotify_0.1.2         filelock_1.0.3          tibble_3.2.1           
##   [7] polyclip_1.10-6         XML_3.99-0.16.1         lifecycle_1.0.4        
##  [10] lattice_0.22-6          MASS_7.3-60.2           magrittr_2.0.3         
##  [13] sass_0.4.9              rmarkdown_2.26          jquerylib_0.1.4        
##  [16] yaml_2.3.8              httpuv_1.6.15           cowplot_1.1.3          
##  [19] DBI_1.2.2               RColorBrewer_1.1-3      abind_1.4-5            
##  [22] zlibbioc_1.48.0         purrr_1.0.2             ggraph_2.2.1           
##  [25] RCurl_1.98-1.14         yulab.utils_0.1.4       tweenr_2.0.3           
##  [28] rappdirs_0.3.3          GenomeInfoDbData_1.2.11 enrichplot_1.22.0      
##  [31] ggrepel_0.9.5           tidytree_0.4.6          svglite_2.1.3          
##  [34] codetools_0.2-20        DelayedArray_0.28.0     DOSE_3.28.2            
##  [37] xml2_1.3.6              ggforce_0.4.2           tidyselect_1.2.1       
##  [40] aplot_0.2.2             farver_2.1.1            viridis_0.6.5          
##  [43] BiocFileCache_2.10.1    jsonlite_1.8.8          tidygraph_1.3.1        
##  [46] systemfonts_1.0.6       tools_4.4.0             progress_1.2.3         
##  [49] treeio_1.26.0           Rcpp_1.0.12             glue_1.7.0             
##  [52] gridExtra_2.3           SparseArray_1.2.3       xfun_0.43              
##  [55] qvalue_2.34.0           dplyr_1.1.4             withr_3.0.0            
##  [58] fastmap_1.1.1           GGally_2.2.1            fansi_1.0.6            
##  [61] caTools_1.18.2          digest_0.6.35           R6_2.5.1               
##  [64] mime_0.12               gridGraphics_0.5-1      colorspace_2.1-0       
##  [67] GO.db_3.18.0            gtools_3.9.5            RSQLite_2.3.6          
##  [70] utf8_1.2.4              tidyr_1.3.1             generics_0.1.3         
##  [73] data.table_1.15.4       prettyunits_1.2.0       graphlayouts_1.1.1     
##  [76] httr_1.4.7              htmlwidgets_1.6.4       S4Arrays_1.2.0         
##  [79] scatterpie_0.2.2        ggstats_0.6.0           pkgconfig_2.0.3        
##  [82] gtable_0.3.5            blob_1.2.4              XVector_0.42.0         
##  [85] shadowtext_0.1.3        htmltools_0.5.8.1       fgsea_1.28.0           
##  [88] echarts4r_0.4.5         scales_1.3.0            png_0.1-8              
##  [91] ggfun_0.1.4             knitr_1.46              rstudioapi_0.16.0      
##  [94] reshape2_1.4.4          nlme_3.1-164            curl_5.2.1             
##  [97] cachem_1.0.8            stringr_1.5.1           KernSmooth_2.23-22     
## [100] parallel_4.4.0          HDO.db_0.99.1           AnnotationDbi_1.64.1   
## [103] pillar_1.9.0            grid_4.4.0              vctrs_0.6.5            
## [106] gplots_3.1.3.1          promises_1.3.0          dbplyr_2.5.0           
## [109] xtable_1.8-4            beeswarm_0.4.0          evaluate_0.23          
## [112] cli_3.6.2               locfit_1.5-9.9          compiler_4.4.0         
## [115] rlang_1.1.3             crayon_1.5.2            plyr_1.8.9             
## [118] fs_1.6.4                stringi_1.8.3           viridisLite_0.4.2      
## [121] BiocParallel_1.36.0     htm2txt_2.2.2           munsell_0.5.1          
## [124] Biostrings_2.70.1       lazyeval_0.2.2          GOSemSim_2.28.0        
## [127] Matrix_1.7-0            hms_1.1.3               patchwork_1.2.0        
## [130] bit64_4.0.5             ggplot2_3.5.1           KEGGREST_1.42.0        
## [133] shiny_1.8.1.1           highr_0.10              igraph_2.0.3           
## [136] memoise_2.0.1           bslib_0.7.0             ggtree_3.10.0          
## [139] fastmatch_1.1-4         bit_4.0.5               ape_5.8                
## [142] gson_0.1.0