Examining correlation structure in RNA-seq data

Source: https://github.com/markziemann/combined_enrichment

Intro

This script is designed to fetch GTEXv8 RNA-seq expression data in order to assess the correlation of genes within a gene set.

suppressPackageStartupMessages({
  library("kableExtra")
  library("beeswarm")
  library("getDEE2")
  library("mitch")
  library("gplots")
  library("Hmisc")
  library("parallel")
  library("vioplot")
})

Fetch data

First get gene expression data.

DATURL="https://storage.googleapis.com/gtex_analysis_v8/rna_seq_data/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_reads.gct.gz"
if (! file.exists("GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_reads.gct.gz") ) {
  download.file(DATURL,destfile="GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_reads.gct.gz")
  tmp <- readLines("GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_reads.gct.gz")
  tmp <- tmp[3:length(tmp)]
  writeLines(tmp,con="GTEx_data.tsv")
}
mx <- read.table("GTEx_data.tsv",header=TRUE,row.names=1)
mx[1:6,1:4]

##                   Description GTEX.1117F.0226.SM.5GZZ7 GTEX.1117F.0426.SM.5EGHI
## ENSG00000223972.5     DDX11L1                        0                        0
## ENSG00000227232.5      WASH7P                      187                      109
## ENSG00000278267.1   MIR6859-1                        0                        0
## ENSG00000243485.5 MIR1302-2HG                        1                        0
## ENSG00000237613.2     FAM138A                        0                        0
## ENSG00000268020.3      OR4G4P                        0                        1
##                   GTEX.1117F.0526.SM.5EGHJ
## ENSG00000223972.5                        0
## ENSG00000227232.5                      143
## ENSG00000278267.1                        1
## ENSG00000243485.5                        0
## ENSG00000237613.2                        0
## ENSG00000268020.3                        0

dim(mx)

## [1] 56200 17383

# downsample 50 samples only
# for testing only
mx <- mx[,1:50]

mx2 <- aggregate(. ~ Description,mx,sum)
mx2[1:6,1:4]

##   Description GTEX.1117F.0226.SM.5GZZ7 GTEX.1117F.0426.SM.5EGHI
## 1   5_8S_rRNA                        0                        0
## 2     5S_rRNA                        2                        0
## 3         7SK                        0                        0
## 4        A1BG                      230                       37
## 5    A1BG-AS1                       37                        6
## 6        A1CF                        5                        8
##   GTEX.1117F.0526.SM.5EGHJ
## 1                        0
## 2                        1
## 3                        1
## 4                      391
## 5                       19
## 6                        1

dim(mx2)

## [1] 54592    50

# filter based on detection threshold
rownames(mx2) <- mx2[,1]
mx2[,1]=NULL
mx2 <- mx2[which(rowMeans(mx2)>=10),]
dim(mx2)

## [1] 24760    49

# normalise
mx2 <- mx2/colSums(mx2) * 1000000
rpm <- mx2
# tidy up some files
remove(mx)

Next get pathways.

if (! file.exists("ReactomePathways.gmt")) {
  download.file("https://reactome.org/download/current/ReactomePathways.gmt.zip", 
    destfile="ReactomePathways.gmt.zip")
  unzip("ReactomePathways.gmt.zip")
}
genesets<-gmt_import("ReactomePathways.gmt")

Assess the correlation structure

# ++++++++++++++++++++++++++++
# flattenCorrMatrix
# ++++++++++++++++++++++++++++
# cormat : matrix of the correlation coefficients
# pmat : matrix of the correlation p-values
flattenCorrMatrix <- function(cormat, pmat) {
  ut <- upper.tri(cormat)
  data.frame(
    row = rownames(cormat)[row(cormat)[ut]],
    column = rownames(cormat)[col(cormat)[ut]],
    cor  =(cormat)[ut],
    p = pmat[ut]
    )
}

res <- mclapply(1:length(genesets), function(i){
  gs <- genesets[[i]]
  len <- length(gs)
  x <- rpm[which(rownames(rpm) %in% gs),]
  if(nrow(x)>=10) {
    res <- rcorr(as.matrix(t(x)),type="spearman")
    res <- flattenCorrMatrix(res$r, res$P)
    # randomly select gene ids as a comparison
    y <- rpm[sample(which(! rownames(rpm) %in% gs),size=len,replace=FALSE),]
    res2 <- rcorr(as.matrix(t(y)),type="spearman")
    res2 <- flattenCorrMatrix(res2$r, res2$P)
    cormax <- max(c(res$cor,res2$cor))
    cormin <- min(c(res$cor,res2$cor))
    setname <- names(genesets[i])
    result <- list("inset"=res$cor, "rand"=res2$cor,"setname"=setname)
    return(result) 
  }
}, mc.cores=16)

looks like some empty results. Let’s get rid of those now.

str(head(res))

## List of 6
##  $ : NULL
##  $ : NULL
##  $ :List of 3
##   ..$ inset  : num [1:325] 0.4323 0.358 0.3406 0.4211 0.0626 ...
##   ..$ rand   : num [1:325] 0.195 0.119 0.395 0.223 0.444 ...
##   ..$ setname: chr "A tetrasaccharide linker sequence is required for GAG synthesis"
##  $ :List of 3
##   ..$ inset  : num [1:2926] -0.2965 -0.042 -0.0864 0.1843 0.3094 ...
##   ..$ rand   : num [1:3003] 0.0633 0.0826 0.0334 0.3253 0.1114 ...
##   ..$ setname: chr "ABC transporter disorders"
##  $ :List of 3
##   ..$ inset  : num [1:153] 0.000766 0.137041 0.021281 0.007041 -0.0864 ...
##   ..$ rand   : num [1:153] 0.521 0.118 0.295 0.212 0.304 ...
##   ..$ setname: chr "ABC transporters in lipid homeostasis"
##  $ :List of 3
##   ..$ inset  : num [1:5151] 0.000766 0.137041 0.021281 0.007041 -0.0864 ...
##   ..$ rand   : num [1:5253] -0.0656 0.1289 -0.3052 0.3258 0.0548 ...
##   ..$ setname: chr "ABC-family proteins mediated transport"

res <- res[which(lapply(res,length)>0)]
str(head(res))

## List of 6
##  $ :List of 3
##   ..$ inset  : num [1:325] 0.4323 0.358 0.3406 0.4211 0.0626 ...
##   ..$ rand   : num [1:325] 0.195 0.119 0.395 0.223 0.444 ...
##   ..$ setname: chr "A tetrasaccharide linker sequence is required for GAG synthesis"
##  $ :List of 3
##   ..$ inset  : num [1:2926] -0.2965 -0.042 -0.0864 0.1843 0.3094 ...
##   ..$ rand   : num [1:3003] 0.0633 0.0826 0.0334 0.3253 0.1114 ...
##   ..$ setname: chr "ABC transporter disorders"
##  $ :List of 3
##   ..$ inset  : num [1:153] 0.000766 0.137041 0.021281 0.007041 -0.0864 ...
##   ..$ rand   : num [1:153] 0.521 0.118 0.295 0.212 0.304 ...
##   ..$ setname: chr "ABC transporters in lipid homeostasis"
##  $ :List of 3
##   ..$ inset  : num [1:5151] 0.000766 0.137041 0.021281 0.007041 -0.0864 ...
##   ..$ rand   : num [1:5253] -0.0656 0.1289 -0.3052 0.3258 0.0548 ...
##   ..$ setname: chr "ABC-family proteins mediated transport"
##  $ :List of 3
##   ..$ inset  : num [1:5356] 0.11 -0.0119 -0.1712 0.1566 -0.1986 ...
##   ..$ rand   : num [1:8911] -0.2247 0.1902 0.0342 0.229 -0.2036 ...
##   ..$ setname: chr "ADORA2B mediated anti-inflammatory cytokines production"
##  $ :List of 3
##   ..$ inset  : num [1:276] 0.376 0.138 0.286 0.361 0.595 ...
##   ..$ rand   : num [1:300] 0.09561 -0.00112 0.31929 0.60418 0.38806 ...
##   ..$ setname: chr "ADP signalling through P2Y purinoceptor 1"

We can visualise the spearman correlation coefficients here.

z <- lapply(1:6, function(i){
setname <- res[[i]]$setname
boxplot(res[[i]][1:2],col="white",ylab="Spearman rho",main=setname)
beeswarm(res[[i]][1:2],add=TRUE,pch=19)
})

We can run a t-test to quantify the difference in distributions.

res2 <- lapply(1:length(res), function(i) {
inset <- res[[i]]$inset
rand <- res[[i]]$rand
tt <- t.test(x=inset,y=rand)
return(c("stat"=tt$statistic,"pvalue"=tt$p.value))
})

res2 <- do.call(rbind,res2)

rownames(res2) <- sapply(res,"[[",3)
summary(res2)

##      stat.t            pvalue       
##  Min.   :-85.483   Min.   :0.00000  
##  1st Qu.:  1.449   1st Qu.:0.00000  
##  Median :  6.348   Median :0.00000  
##  Mean   : 14.944   Mean   :0.08360  
##  3rd Qu.: 19.635   3rd Qu.:0.01501  
##  Max.   :307.768   Max.   :0.99906

Now we can look at the positive and negatively correlated gene sets.

poscor <- head(res2[order(-res2[,1]),],10) 
poscor %>% kbl(caption="most correlated gene sets") %>% kable_paper("hover", full_width = F)

most correlated gene sets
	stat.t	pvalue
Gene expression (Transcription)	307.7681	0
Metabolism of RNA	280.2620	0
RNA Polymerase II Transcription	279.9882	0
Generic Transcription Pathway	229.6446	0
Immune System	212.8620	0
Cell Cycle	207.1243	0
Cell Cycle, Mitotic	163.8820	0
Processing of Capped Intron-Containing Pre-mRNA	153.3164	0
M Phase	130.0567	0
Cytokine Signaling in Immune system	126.8020	0

poscor_res <- res[which(sapply(res,"[[",3) %in% rownames(poscor))]

Vioplot of positively correlated sets.

z <- lapply(1:length(poscor_res), function(i){
setname <- poscor_res[[i]]$setname
gset <- genesets[[which(names(genesets) ==setname)]]
n <- length(which(gset %in% rownames(rpm)))
vioplot(poscor_res[[i]][1:2] , ylab="Spearman rho",main=setname  )
mtext(paste("n=",n))
})

Heatmap of pos cor sets.

z <- lapply(1:10, function(i) {
setname <- rownames(poscor)[i]
gset <- genesets[[which(names(genesets) ==setname)]]
mx <- rpm[which(rownames(rpm) %in% gset),]
heatmap.2(as.matrix(mx),trace="none",scale="row",main=setname, margin=c(5,7))
heatmap.2(cor(t(mx)),trace="none",scale="none",main=setname,margin=c(5,7))
})

Now focus on negatively correlated sets.

anticor <- head(res2[order(res2[,1]),],10)
anticor %>% kbl(caption="most anti-correlated gene sets") %>% kable_paper("hover", full_width = F)

most anti-correlated gene sets
	stat.t	pvalue
Metabolism	-85.48251	0
Developmental Biology	-61.56000	0
Transport of small molecules	-47.89319	0
GPCR ligand binding	-32.89906	0
Sensory Perception	-26.55481	0
Muscle contraction	-18.78015	0
Class A/1 (Rhodopsin-like receptors)	-18.27155	0
Peptide ligand-binding receptors	-16.83629	0
Transport of inorganic cations/anions and amino acids/oligopeptides	-15.09171	0
Cardiac conduction	-15.00643	0

anticor_res <- res[which(sapply(res,"[[",3) %in% rownames(anticor))]

z <- lapply(1:length(anticor_res), function(i){
setname <- anticor_res[[i]]$setname
gset <- genesets[[which(names(genesets) ==setname)]]
n <- length(which(gset %in% rownames(rpm)))
vioplot(anticor_res[[i]][1:2] , ylab="Spearman rho",main=setname  )
mtext(paste("n=",n))
})

Heatmap of neg cor sets.

z <- lapply(1:10, function(i) {
setname <- rownames(anticor)[i]
gset <- genesets[[which(names(genesets) ==setname)]]
mx <- rpm[which(rownames(rpm) %in% gset),]
heatmap.2(as.matrix(mx),trace="none",scale="row",main=setname,margin=c(5,7))
heatmap.2(cor(t(mx)),trace="none",scale="none",main=setname,margin=c(5,7))
})

Now visualise the test statistics found.

par(mfrow=c(2,1))
hist(res2[,"stat.t"],breaks=100,main="",xlab="test statistic")
boxplot(res2[,"stat.t"],horizontal=TRUE)

hist(res2[,"stat.t"],xlim=c(-50,50),breaks=100,main="",xlab="test statistic")
boxplot(res2[,"stat.t"],ylim=c(-50,50),horizontal=TRUE)

par(mfrow=c(1,1))

pos <- length( which(res2[,1]>0 & res2[,2] < 0.05) )
neg <- length( which(res2[,1]<0 & res2[,2] < 0.05) )
none <- length( which(res2[,2] > 0.05) )
bars <- c("Pos"=pos,"None"=none,"Neg"=neg)
barplot(bars,ylab="no. gene sets with correlated expression",ylim=c(0,max(bars)*1.3))
text(((1:length(bars))-0.5)*1.25,bars+50,labels=bars)

Conclusions

This work confirms the observations by Hong, that genes in a set are on the whole, most gene sets positively correlated members.

Some gene sets are highly correlated, while others are heterogenous, or slightly anticorrelated.

We can try to reproduce this correlation structure in simulated data.

Session information

sessionInfo()

## R version 4.1.2 (2021-11-01)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.3 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] vioplot_0.3.7    zoo_1.8-9        sm_2.2-5.7       Hmisc_4.6-0     
##  [5] ggplot2_3.3.5    Formula_1.2-4    survival_3.2-13  lattice_0.20-45 
##  [9] gplots_3.1.1     mitch_1.4.1      getDEE2_1.2.0    beeswarm_0.4.0  
## [13] kableExtra_1.3.4
## 
## loaded via a namespace (and not attached):
##   [1] bitops_1.0-7                matrixStats_0.61.0         
##   [3] webshot_0.5.2               RColorBrewer_1.1-2         
##   [5] httr_1.4.2                  GenomeInfoDb_1.28.4        
##   [7] backports_1.4.1             tools_4.1.2                
##   [9] bslib_0.3.1                 utf8_1.2.2                 
##  [11] R6_2.5.1                    rpart_4.1.16               
##  [13] KernSmooth_2.23-20          DBI_1.1.2                  
##  [15] BiocGenerics_0.38.0         colorspace_2.0-2           
##  [17] nnet_7.3-17                 withr_2.4.3                
##  [19] htm2txt_2.1.1               tidyselect_1.1.1           
##  [21] gridExtra_2.3               GGally_2.1.2               
##  [23] compiler_4.1.2              cli_3.1.1                  
##  [25] rvest_1.0.2                 Biobase_2.52.0             
##  [27] htmlTable_2.4.0             xml2_1.3.3                 
##  [29] DelayedArray_0.18.0         sass_0.4.0                 
##  [31] checkmate_2.0.0             caTools_1.18.2             
##  [33] scales_1.1.1                systemfonts_1.0.3          
##  [35] stringr_1.4.0               digest_0.6.29              
##  [37] foreign_0.8-82              rmarkdown_2.11             
##  [39] svglite_2.0.0               XVector_0.32.0             
##  [41] jpeg_0.1-9                  base64enc_0.1-3            
##  [43] pkgconfig_2.0.3             htmltools_0.5.2            
##  [45] MatrixGenerics_1.4.3        highr_0.9                  
##  [47] fastmap_1.1.0               htmlwidgets_1.5.4          
##  [49] rlang_1.0.0                 rstudioapi_0.13            
##  [51] shiny_1.7.1                 jquerylib_0.1.4            
##  [53] generics_0.1.1              jsonlite_1.7.3             
##  [55] gtools_3.9.2                dplyr_1.0.7                
##  [57] RCurl_1.98-1.5              magrittr_2.0.2             
##  [59] GenomeInfoDbData_1.2.6      Matrix_1.4-0               
##  [61] Rcpp_1.0.8                  munsell_0.5.0              
##  [63] S4Vectors_0.30.2            fansi_1.0.2                
##  [65] lifecycle_1.0.1             stringi_1.7.6              
##  [67] yaml_2.2.2                  MASS_7.3-55                
##  [69] SummarizedExperiment_1.22.0 zlibbioc_1.38.0            
##  [71] plyr_1.8.6                  grid_4.1.2                 
##  [73] promises_1.2.0.1            crayon_1.4.2               
##  [75] echarts4r_0.4.3             splines_4.1.2              
##  [77] knitr_1.37                  pillar_1.6.5               
##  [79] GenomicRanges_1.44.0        reshape2_1.4.4             
##  [81] stats4_4.1.2                glue_1.6.1                 
##  [83] evaluate_0.14               latticeExtra_0.6-29        
##  [85] data.table_1.14.2           png_0.1-7                  
##  [87] vctrs_0.3.8                 httpuv_1.6.5               
##  [89] gtable_0.3.0                purrr_0.3.4                
##  [91] reshape_0.8.8               assertthat_0.2.1           
##  [93] xfun_0.29                   mime_0.12                  
##  [95] xtable_1.8-4                later_1.3.0                
##  [97] viridisLite_0.4.0           tibble_3.1.6               
##  [99] IRanges_2.26.0              cluster_2.1.2              
## [101] ellipsis_0.3.2