Examining correlation structure in RNA-seq data

Source: https://github.com/markziemann/combined_enrichment

Intro

This script is designed to fetch GTEXv8 RNA-seq expression data in order to assess the correlation of genes within a gene set.

suppressPackageStartupMessages({
#  library("kableExtra")
  library("beeswarm")
#  library("getDEE2")
  library("mitch")
  library("gplots")
  library("Hmisc")
  library("parallel")
  library("vioplot")
})

Fetch data

First get gene expression data.

DATURL="https://storage.googleapis.com/gtex_analysis_v8/rna_seq_data/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_reads.gct.gz"
if (! file.exists("GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_reads.gct.gz") ) {
  download.file(DATURL,destfile="GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_reads.gct.gz")
  tmp <- readLines("GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_reads.gct.gz")
  tmp <- tmp[3:length(tmp)]
  writeLines(tmp,con="GTEx_data.tsv")
}

# import data
# GTEx_data2.tsv is the cut-down version for testing
# GTEx_data.tsv is the main dataset
#mx <- read.table("GTEx_data2.tsv",header=TRUE,row.names=1)
mx <- read.table("GTEx_data.tsv",header=TRUE,row.names=1)
mx[1:6,1:4]

##                   Description GTEX.1117F.0226.SM.5GZZ7 GTEX.1117F.0426.SM.5EGHI
## ENSG00000223972.5     DDX11L1                        0                        0
## ENSG00000227232.5      WASH7P                      187                      109
## ENSG00000278267.1   MIR6859-1                        0                        0
## ENSG00000243485.5 MIR1302-2HG                        1                        0
## ENSG00000237613.2     FAM138A                        0                        0
## ENSG00000268020.3      OR4G4P                        0                        1
##                   GTEX.1117F.0526.SM.5EGHJ
## ENSG00000223972.5                        0
## ENSG00000227232.5                      143
## ENSG00000278267.1                        1
## ENSG00000243485.5                        0
## ENSG00000237613.2                        0
## ENSG00000268020.3                        0

dim(mx)

## [1] 56200 17383

# downsample 50 samples only
# for testing only
#mx <- mx[,1:50]

mx2 <- aggregate(. ~ Description,mx,sum)
mx2[1:6,1:4]

##   Description GTEX.1117F.0226.SM.5GZZ7 GTEX.1117F.0426.SM.5EGHI
## 1   5_8S_rRNA                        0                        0
## 2     5S_rRNA                        2                        0
## 3         7SK                        0                        0
## 4        A1BG                      230                       37
## 5    A1BG-AS1                       37                        6
## 6        A1CF                        5                        8
##   GTEX.1117F.0526.SM.5EGHJ
## 1                        0
## 2                        1
## 3                        1
## 4                      391
## 5                       19
## 6                        1

dim(mx2)

## [1] 54592 17383

# filter based on detection threshold
rownames(mx2) <- mx2[,1]
mx2[,1]=NULL
mx2 <- mx2[which(rowMeans(mx2)>=10),]
dim(mx2)

## [1] 24807 17382

# normalise
mx2 <- mx2/colSums(mx2) * 1000000
rpm <- mx2
# tidy up some files
remove(mx)

Next get pathways.

if (! file.exists("ReactomePathways.gmt")) {
  download.file("https://reactome.org/download/current/ReactomePathways.gmt.zip", 
    destfile="ReactomePathways.gmt.zip")
  unzip("ReactomePathways.gmt.zip")
}
genesets<-gmt_import("ReactomePathways.gmt")

Assess the correlation structure

# ++++++++++++++++++++++++++++
# flattenCorrMatrix
# ++++++++++++++++++++++++++++
# cormat : matrix of the correlation coefficients
# pmat : matrix of the correlation p-values
flattenCorrMatrix <- function(cormat, pmat) {
  ut <- upper.tri(cormat)
  data.frame(
    row = rownames(cormat)[row(cormat)[ut]],
    column = rownames(cormat)[col(cormat)[ut]],
    cor  =(cormat)[ut],
    p = pmat[ut]
    )
}

res <- mclapply(1:length(genesets), function(i){
  gs <- genesets[[i]]
  len <- length(gs)
  x <- rpm[which(rownames(rpm) %in% gs),]
  if(nrow(x)>=5) {
    res <- rcorr(as.matrix(t(x)),type="spearman")
    res <- flattenCorrMatrix(res$r, res$P)
    # randomly select gene ids as a comparison
    y <- rpm[sample(which(! rownames(rpm) %in% gs),size=len,replace=FALSE),]
    res2 <- rcorr(as.matrix(t(y)),type="spearman")
    res2 <- flattenCorrMatrix(res2$r, res2$P)
    cormax <- max(c(res$cor,res2$cor))
    cormin <- min(c(res$cor,res2$cor))
    setname <- names(genesets[i])
    result <- list("inset"=res$cor, "rand"=res2$cor,"setname"=setname)
    return(result) 
  }
}, mc.cores=round(detectCores()*3/4))

looks like some empty results. Let’s get rid of those now.

str(head(res))

## List of 6
##  $ :List of 3
##   ..$ inset  : num [1:21] 0.0945 0.3481 0.0512 0.3507 -0.1102 ...
##   ..$ rand   : num [1:78] 0.514 -0.261 -0.428 0.352 0.208 ...
##   ..$ setname: chr "2-LTR circle formation"
##  $ : NULL
##  $ :List of 3
##   ..$ inset  : num [1:325] 0.373 0.192 0.134 0.299 0.17 ...
##   ..$ rand   : num [1:325] -0.169 -0.145 0.272 0.119 0.252 ...
##   ..$ setname: chr "A tetrasaccharide linker sequence is required for GAG synthesis"
##  $ :List of 3
##   ..$ inset  : num [1:2926] -0.00235 0.00883 0.05399 0.31089 0.18105 ...
##   ..$ rand   : num [1:3003] 0.0585 0.1892 0.0518 0.1463 -0.0135 ...
##   ..$ setname: chr "ABC transporter disorders"
##  $ :List of 3
##   ..$ inset  : num [1:153] -0.00225 -0.03581 0.00353 0.14359 0.05399 ...
##   ..$ rand   : num [1:153] 0.07153 0.06565 0.2078 -0.00457 0.02511 ...
##   ..$ setname: chr "ABC transporters in lipid homeostasis"
##  $ :List of 3
##   ..$ inset  : num [1:5151] -0.00225 -0.03581 0.00353 0.14359 0.05399 ...
##   ..$ rand   : num [1:5253] 0.0379 0.1782 0.147 0.2789 0.2374 ...
##   ..$ setname: chr "ABC-family proteins mediated transport"

res <- res[which(lapply(res,length)>0)]
str(head(res))

## List of 6
##  $ :List of 3
##   ..$ inset  : num [1:21] 0.0945 0.3481 0.0512 0.3507 -0.1102 ...
##   ..$ rand   : num [1:78] 0.514 -0.261 -0.428 0.352 0.208 ...
##   ..$ setname: chr "2-LTR circle formation"
##  $ :List of 3
##   ..$ inset  : num [1:325] 0.373 0.192 0.134 0.299 0.17 ...
##   ..$ rand   : num [1:325] -0.169 -0.145 0.272 0.119 0.252 ...
##   ..$ setname: chr "A tetrasaccharide linker sequence is required for GAG synthesis"
##  $ :List of 3
##   ..$ inset  : num [1:2926] -0.00235 0.00883 0.05399 0.31089 0.18105 ...
##   ..$ rand   : num [1:3003] 0.0585 0.1892 0.0518 0.1463 -0.0135 ...
##   ..$ setname: chr "ABC transporter disorders"
##  $ :List of 3
##   ..$ inset  : num [1:153] -0.00225 -0.03581 0.00353 0.14359 0.05399 ...
##   ..$ rand   : num [1:153] 0.07153 0.06565 0.2078 -0.00457 0.02511 ...
##   ..$ setname: chr "ABC transporters in lipid homeostasis"
##  $ :List of 3
##   ..$ inset  : num [1:5151] -0.00225 -0.03581 0.00353 0.14359 0.05399 ...
##   ..$ rand   : num [1:5253] 0.0379 0.1782 0.147 0.2789 0.2374 ...
##   ..$ setname: chr "ABC-family proteins mediated transport"
##  $ :List of 3
##   ..$ inset  : num [1:6328] 0.42661 0.09581 0.03123 0.00863 -0.12751 ...
##   ..$ rand   : num [1:8911] 0.233 -0.0342 0.1648 -0.1126 0.0139 ...
##   ..$ setname: chr "ADORA2B mediated anti-inflammatory cytokines production"

length(res)

## [1] 1894

We can visualise the spearman correlation coefficients here.

z <- lapply(1:6, function(i){
  setname <- res[[i]]$setname
  data <- res[[i]][1:2]
  vioplot(data,col="white",ylab="Spearman rho",main=setname)

  data$inset <- sample(x=data$inset,size=1000,replace=TRUE)
  data$rand <- sample(x=data$rand,size=1000,replace=TRUE)
  boxplot(data,col="white",ylab="Spearman rho",main=setname, cex=0)
  beeswarm(data,add=TRUE,pch=19,cex=0.6)
})

We can run a t-test to quantify the difference in distributions.

res2 <- lapply(1:length(res), function(i) {
  inset <- res[[i]]$inset
  rand <- res[[i]]$rand
  tt <- t.test(x=inset,y=rand)
  return(c("stat"=tt$statistic,"pvalue"=tt$p.value))
})

res2 <- do.call(rbind,res2)

rownames(res2) <- sapply(res,"[[",3)
summary(res2)

##      stat.t            pvalue         
##  Min.   :-65.062   Min.   :0.0000000  
##  1st Qu.:  1.395   1st Qu.:0.0000000  
##  Median :  5.053   Median :0.0000002  
##  Mean   : 16.077   Mean   :0.1048890  
##  3rd Qu.: 17.584   3rd Qu.:0.0509141  
##  Max.   :487.644   Max.   :0.9957407

output the results

write.table(res2,file="cor.tsv",quote=FALSE,sep="\t")

Now we can look at the positive and negatively correlated gene sets.

poscor <- head(res2[order(-res2[,1]),],10) 
poscor #%>% kbl(caption="most correlated gene sets") %>% kable_paper("hover", full_width = F)

##                                                   stat.t pvalue
## Gene expression (Transcription)                 487.6439      0
## RNA Polymerase II Transcription                 397.3404      0
## Metabolism of RNA                               381.9847      0
## Generic Transcription Pathway                   308.9746      0
## Metabolism of proteins                          289.2594      0
## Cell Cycle                                      256.5527      0
## Post-translational protein modification         215.5013      0
## Cell Cycle, Mitotic                             214.0735      0
## Processing of Capped Intron-Containing Pre-mRNA 201.0746      0
## Cellular responses to stress                    197.0605      0

poscor_res <- res[which(sapply(res,"[[",3) %in% rownames(poscor))]

Vioplot of positively correlated sets.

z <- lapply(1:length(poscor_res), function(i){
  setname <- poscor_res[[i]]$setname
  gset <- genesets[[which(names(genesets) ==setname)]]
  n <- length(which(gset %in% rownames(rpm)))
  data <- poscor_res[[i]][1:2]

  vioplot(data , ylab="Spearman rho",main=setname  )
  grid()
  mtext(paste("n=",n))

  data$inset <- sample(x=data$inset,size=1000,replace=TRUE)
  data$rand <- sample(x=data$rand,size=1000,replace=TRUE)
  boxplot(data,col="white",ylab="Spearman rho",main=setname,cex=0)
  grid()
  beeswarm(data,add=TRUE,pch=19,cex=0.6)
  mtext(paste("n=",n))
})

pdf("posvio.pdf",height=5,width=5)
z <- lapply(1:length(poscor_res), function(i){
  setname <- poscor_res[[i]]$setname
  gset <- genesets[[which(names(genesets) ==setname)]]
  n <- length(which(gset %in% rownames(rpm)))
  data <- poscor_res[[i]][1:2]

  vioplot(data , ylab="Spearman rho",main=setname  )
  grid()
  mtext(paste("n=",n))

  data$inset <- sample(x=data$inset,size=1000,replace=TRUE)
  data$rand <- sample(x=data$rand,size=1000,replace=TRUE)
  boxplot(data,col="white",ylab="Spearman rho",main=setname,cex=0)
  grid()
  beeswarm(data,add=TRUE,pch=19,cex=0.6)
  mtext(paste("n=",n))
})
dev.off()

## png 
##   2

Heatmap of pos cor sets.

z <- lapply(1:10, function(i) {
  setname <- rownames(poscor)[i]
  gset <- genesets[[which(names(genesets) ==setname)]]
  mx <- rpm[which(rownames(rpm) %in% gset),]
  heatmap.2(as.matrix(mx),trace="none",scale="row",main=setname, margin=c(5,7))
  heatmap.2(cor(t(mx)),trace="none",scale="none",main=setname,margin=c(5,7))
})

Now focus on negatively correlated sets.

negcor <- head(res2[order(res2[,1]),],10)
negcor #%>% kbl(caption="most anti-correlated gene sets") %>% kable_paper("hover", full_width = F)

##                                         stat.t        pvalue
## Signaling by GPCR                    -65.06250  0.000000e+00
## GPCR downstream signalling           -45.09572  0.000000e+00
## Hemostasis                           -42.62390  0.000000e+00
## GPCR ligand binding                  -40.57495  0.000000e+00
## Transport of small molecules         -40.36903  0.000000e+00
## Class A/1 (Rhodopsin-like receptors) -35.30746 4.895173e-271
## Metabolism                           -32.64372 1.050881e-233
## Developmental Biology                -31.72911 7.889720e-221
## Peptide ligand-binding receptors     -31.52014 2.918567e-214
## SLC-mediated transmembrane transport -30.88762 7.364697e-208

negcor_res <- res[which(sapply(res,"[[",3) %in% rownames(negcor))]

z <- lapply(1:length(negcor_res), function(i){
  setname <- negcor_res[[i]]$setname
  gset <- genesets[[which(names(genesets) ==setname)]]
  n <- length(which(gset %in% rownames(rpm)))
  data <- negcor_res[[i]][1:2]

  vioplot(data , ylab="Spearman rho",main=setname  )
  grid()
  mtext(paste("n=",n))

  data$inset <- sample(x=data$inset,size=1000,replace=TRUE)
  data$rand <- sample(x=data$rand,size=1000,replace=TRUE)
  boxplot(data,col="white",ylab="Spearman rho",main=setname,cex=0)
  grid()
  beeswarm(data,add=TRUE,pch=19,cex=0.6)
  mtext(paste("n=",n))
})

pdf("negvio.pdf",height=5,width=5)
z <- lapply(1:length(negcor_res), function(i){
  setname <- negcor_res[[i]]$setname
  gset <- genesets[[which(names(genesets) ==setname)]]
  n <- length(which(gset %in% rownames(rpm)))
  data <- negcor_res[[i]][1:2]

  vioplot(data , ylab="Spearman rho",main=setname  )
  grid()
  mtext(paste("n=",n))

  data$inset <- sample(x=data$inset,size=1000,replace=TRUE)
  data$rand <- sample(x=data$rand,size=1000,replace=TRUE)
  boxplot(data,col="white",ylab="Spearman rho",main=setname,cex=0)
  grid()
  beeswarm(data,add=TRUE,pch=19,cex=0.6)
  mtext(paste("n=",n))
})
dev.off()

## png 
##   2

Heatmap of neg cor sets.

z <- lapply(1:10, function(i) {
  setname <- rownames(negcor)[i]
  gset <- genesets[[which(names(genesets) ==setname)]]
  mx <- rpm[which(rownames(rpm) %in% gset),]
  heatmap.2(as.matrix(mx),trace="none",scale="row",main=setname,margin=c(5,7))
  heatmap.2(cor(t(mx)),trace="none",scale="none",main=setname,margin=c(5,7))
})

Now visualise the test statistics found.

res2 <- read.table("cor.tsv",sep="\t")
tstats <- res2[,"stat.t"]

par(mfrow=c(2,1))
hist(tstats,breaks=100,main="",xlab="test statistic")
grid()
hist(tstats,xlim=c(-50,50),breaks=100,main="",xlab="test statistic")
grid()

pdf("hist.pdf",height=5,width=5)
  par(mfrow=c(2,1))
  hist(tstats,breaks=100,main="",xlab="test statistic")
  grid()
  hist(tstats,xlim=c(-50,50),breaks=100,main="",xlab="test statistic")
  grid()
dev.off()

## png 
##   2

boxplot(tstats,horizontal=TRUE,col="white",cex=0, xlab="t-test statistic")
grid()
beeswarm(tstats,add=TRUE,pch=19,cex=0.4,horiz=TRUE)
boxplot(tstats,horizontal=TRUE,col="white",cex=0,ylim=c(-50,50),xlab="t-test statistic")
grid()
beeswarm(tstats,add=TRUE,pch=19,cex=0.4,horiz=TRUE)

pdf("tstat_swarm.pdf",height=5,width=5)
  par(mfrow=c(2,1))
  boxplot(tstats,horizontal=TRUE,col="white",cex=0, xlab="t-test statistic")
  grid()
  beeswarm(tstats,add=TRUE,pch=19,cex=0.4,horiz=TRUE)
  boxplot(tstats,horizontal=TRUE,col="white",cex=0,ylim=c(-50,50),xlab="t-test statistic")
  grid()
  beeswarm(tstats,add=TRUE,pch=19,cex=0.4,horiz=TRUE)
dev.off()

## png 
##   2

par(mfrow=c(1,1))

pos <- length( which(res2[,1]>0 & res2[,2] < 0.05) )
neg <- length( which(res2[,1]<0 & res2[,2] < 0.05) )
none <- length( which(res2[,2] > 0.05) )
bars <- c("Pos"=pos,"None"=none,"Neg"=neg)
barplot(bars,ylab="no. gene sets with correlated expression",ylim=c(0,max(bars)*1.3))
text(((1:length(bars))-0.5)*1.25,bars+50,labels=bars)

pdf("bars.pdf",height=4,width=4)
  barplot(bars,ylab="no. gene sets with correlated expression",ylim=c(0,max(bars)*1.3))
  text(((1:length(bars))-0.5)*1.25,bars+50,labels=bars)
dev.off()

## png 
##   2

Conclusions

This work confirms the observations by Hong, that genes in a set are on the whole, most gene sets positively correlated members.

Some gene sets are highly correlated, while others are heterogenous, or slightly anticorrelated.

We can try to reproduce this correlation structure in simulated data.

Session information

sessionInfo()

## R version 4.0.1 (2020-06-06)
## Platform: x86_64-conda_cos6-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.3 LTS
## 
## Matrix products: default
## BLAS/LAPACK: /ceph-g/opt/miniconda3/envs/R40/lib/libopenblasp-r0.3.9.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] vioplot_0.3.7   zoo_1.8-8       sm_2.2-5.7      Hmisc_4.6-0    
##  [5] ggplot2_3.3.5   Formula_1.2-4   survival_3.2-3  lattice_0.20-41
##  [9] gplots_3.1.1    mitch_1.0.10    beeswarm_0.4.0 
## 
## loaded via a namespace (and not attached):
##  [1] splines_4.0.1       gtools_3.9.2        shiny_1.4.0.2      
##  [4] assertthat_0.2.1    latticeExtra_0.6-29 blob_1.2.1         
##  [7] yaml_2.2.1          backports_1.1.8     pillar_1.7.0       
## [10] glue_1.4.1          digest_0.6.25       RColorBrewer_1.1-2 
## [13] promises_1.1.1      checkmate_2.0.0     colorspace_1.4-1   
## [16] htmltools_0.5.0     httpuv_1.5.4        Matrix_1.2-18      
## [19] plyr_1.8.6          pkgconfig_2.0.3     purrr_0.3.4        
## [22] xtable_1.8-4        scales_1.1.1        jpeg_0.1-9         
## [25] later_1.1.0.1       tibble_3.0.1        htmlTable_2.4.0    
## [28] echarts4r_0.3.3     generics_0.0.2      ellipsis_0.3.2     
## [31] withr_2.2.0         nnet_7.3-14         cli_3.1.1          
## [34] magrittr_1.5        crayon_1.3.4        mime_0.9           
## [37] evaluate_0.14       GGally_2.1.2        fansi_0.4.1        
## [40] MASS_7.3-51.6       foreign_0.8-80      data.table_1.12.8  
## [43] tools_4.0.1         lifecycle_1.0.1     stringr_1.4.0      
## [46] munsell_0.5.0       cluster_2.1.0       compiler_4.0.1     
## [49] caTools_1.18.2      rlang_1.0.0         grid_4.0.1         
## [52] rstudioapi_0.11     htmlwidgets_1.5.1   bitops_1.0-7       
## [55] base64enc_0.1-3     rmarkdown_2.3       gtable_0.3.0       
## [58] DBI_1.1.0           reshape_0.8.8       reshape2_1.4.4     
## [61] R6_2.4.1            gridExtra_2.3       knitr_1.28         
## [64] dplyr_1.0.8         fastmap_1.0.1       utf8_1.1.4         
## [67] KernSmooth_2.23-17  stringi_1.4.6       Rcpp_1.0.8         
## [70] vctrs_0.3.8         rpart_4.1-15        png_0.1-7          
## [73] tidyselect_1.1.1    xfun_0.15