Source: https://github.com/markziemann/combined_enrichment

Intro

This script is designed to fetch GTEXv8 RNA-seq expression data in order to assess the correlation of genes within a gene set.

suppressPackageStartupMessages({
#  library("kableExtra")
  library("beeswarm")
#  library("getDEE2")
  library("mitch")
  library("gplots")
  library("Hmisc")
  library("parallel")
  library("vioplot")
})

Fetch data

First get gene expression data.

DATURL="https://storage.googleapis.com/gtex_analysis_v8/rna_seq_data/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_reads.gct.gz"
if (! file.exists("GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_reads.gct.gz") ) {
  download.file(DATURL,destfile="GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_reads.gct.gz")
  tmp <- readLines("GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_reads.gct.gz")
  tmp <- tmp[3:length(tmp)]
  writeLines(tmp,con="GTEx_data.tsv")
}

# import data
# GTEx_data2.tsv is the cut-down version for testing
# GTEx_data.tsv is the main dataset
#mx <- read.table("GTEx_data2.tsv",header=TRUE,row.names=1)
mx <- read.table("GTEx_data.tsv",header=TRUE,row.names=1)
mx[1:6,1:4]
##                   Description GTEX.1117F.0226.SM.5GZZ7 GTEX.1117F.0426.SM.5EGHI
## ENSG00000223972.5     DDX11L1                        0                        0
## ENSG00000227232.5      WASH7P                      187                      109
## ENSG00000278267.1   MIR6859-1                        0                        0
## ENSG00000243485.5 MIR1302-2HG                        1                        0
## ENSG00000237613.2     FAM138A                        0                        0
## ENSG00000268020.3      OR4G4P                        0                        1
##                   GTEX.1117F.0526.SM.5EGHJ
## ENSG00000223972.5                        0
## ENSG00000227232.5                      143
## ENSG00000278267.1                        1
## ENSG00000243485.5                        0
## ENSG00000237613.2                        0
## ENSG00000268020.3                        0
dim(mx)
## [1] 56200 17383
# downsample 50 samples only
# for testing only
#mx <- mx[,1:50]

mx2 <- aggregate(. ~ Description,mx,sum)
mx2[1:6,1:4]
##   Description GTEX.1117F.0226.SM.5GZZ7 GTEX.1117F.0426.SM.5EGHI
## 1   5_8S_rRNA                        0                        0
## 2     5S_rRNA                        2                        0
## 3         7SK                        0                        0
## 4        A1BG                      230                       37
## 5    A1BG-AS1                       37                        6
## 6        A1CF                        5                        8
##   GTEX.1117F.0526.SM.5EGHJ
## 1                        0
## 2                        1
## 3                        1
## 4                      391
## 5                       19
## 6                        1
dim(mx2)
## [1] 54592 17383
# filter based on detection threshold
rownames(mx2) <- mx2[,1]
mx2[,1]=NULL
mx2 <- mx2[which(rowMeans(mx2)>=10),]
dim(mx2)
## [1] 24807 17382
# normalise
mx2 <- mx2/colSums(mx2) * 1000000
rpm <- mx2
# tidy up some files
remove(mx)

Next get pathways.

if (! file.exists("ReactomePathways.gmt")) {
  download.file("https://reactome.org/download/current/ReactomePathways.gmt.zip", 
    destfile="ReactomePathways.gmt.zip")
  unzip("ReactomePathways.gmt.zip")
}
genesets<-gmt_import("ReactomePathways.gmt")

Assess the correlation structure

# ++++++++++++++++++++++++++++
# flattenCorrMatrix
# ++++++++++++++++++++++++++++
# cormat : matrix of the correlation coefficients
# pmat : matrix of the correlation p-values
flattenCorrMatrix <- function(cormat, pmat) {
  ut <- upper.tri(cormat)
  data.frame(
    row = rownames(cormat)[row(cormat)[ut]],
    column = rownames(cormat)[col(cormat)[ut]],
    cor  =(cormat)[ut],
    p = pmat[ut]
    )
}
res <- mclapply(1:length(genesets), function(i){
  gs <- genesets[[i]]
  len <- length(gs)
  x <- rpm[which(rownames(rpm) %in% gs),]
  if(nrow(x)>=5) {
    res <- rcorr(as.matrix(t(x)),type="spearman")
    res <- flattenCorrMatrix(res$r, res$P)
    # randomly select gene ids as a comparison
    y <- rpm[sample(which(! rownames(rpm) %in% gs),size=len,replace=FALSE),]
    res2 <- rcorr(as.matrix(t(y)),type="spearman")
    res2 <- flattenCorrMatrix(res2$r, res2$P)
    cormax <- max(c(res$cor,res2$cor))
    cormin <- min(c(res$cor,res2$cor))
    setname <- names(genesets[i])
    result <- list("inset"=res$cor, "rand"=res2$cor,"setname"=setname)
    return(result) 
  }
}, mc.cores=round(detectCores()*3/4))

looks like some empty results. Let’s get rid of those now.

str(head(res))
## List of 6
##  $ :List of 3
##   ..$ inset  : num [1:21] 0.0945 0.3481 0.0512 0.3507 -0.1102 ...
##   ..$ rand   : num [1:78] 0.514 -0.261 -0.428 0.352 0.208 ...
##   ..$ setname: chr "2-LTR circle formation"
##  $ : NULL
##  $ :List of 3
##   ..$ inset  : num [1:325] 0.373 0.192 0.134 0.299 0.17 ...
##   ..$ rand   : num [1:325] -0.169 -0.145 0.272 0.119 0.252 ...
##   ..$ setname: chr "A tetrasaccharide linker sequence is required for GAG synthesis"
##  $ :List of 3
##   ..$ inset  : num [1:2926] -0.00235 0.00883 0.05399 0.31089 0.18105 ...
##   ..$ rand   : num [1:3003] 0.0585 0.1892 0.0518 0.1463 -0.0135 ...
##   ..$ setname: chr "ABC transporter disorders"
##  $ :List of 3
##   ..$ inset  : num [1:153] -0.00225 -0.03581 0.00353 0.14359 0.05399 ...
##   ..$ rand   : num [1:153] 0.07153 0.06565 0.2078 -0.00457 0.02511 ...
##   ..$ setname: chr "ABC transporters in lipid homeostasis"
##  $ :List of 3
##   ..$ inset  : num [1:5151] -0.00225 -0.03581 0.00353 0.14359 0.05399 ...
##   ..$ rand   : num [1:5253] 0.0379 0.1782 0.147 0.2789 0.2374 ...
##   ..$ setname: chr "ABC-family proteins mediated transport"
res <- res[which(lapply(res,length)>0)]
str(head(res))
## List of 6
##  $ :List of 3
##   ..$ inset  : num [1:21] 0.0945 0.3481 0.0512 0.3507 -0.1102 ...
##   ..$ rand   : num [1:78] 0.514 -0.261 -0.428 0.352 0.208 ...
##   ..$ setname: chr "2-LTR circle formation"
##  $ :List of 3
##   ..$ inset  : num [1:325] 0.373 0.192 0.134 0.299 0.17 ...
##   ..$ rand   : num [1:325] -0.169 -0.145 0.272 0.119 0.252 ...
##   ..$ setname: chr "A tetrasaccharide linker sequence is required for GAG synthesis"
##  $ :List of 3
##   ..$ inset  : num [1:2926] -0.00235 0.00883 0.05399 0.31089 0.18105 ...
##   ..$ rand   : num [1:3003] 0.0585 0.1892 0.0518 0.1463 -0.0135 ...
##   ..$ setname: chr "ABC transporter disorders"
##  $ :List of 3
##   ..$ inset  : num [1:153] -0.00225 -0.03581 0.00353 0.14359 0.05399 ...
##   ..$ rand   : num [1:153] 0.07153 0.06565 0.2078 -0.00457 0.02511 ...
##   ..$ setname: chr "ABC transporters in lipid homeostasis"
##  $ :List of 3
##   ..$ inset  : num [1:5151] -0.00225 -0.03581 0.00353 0.14359 0.05399 ...
##   ..$ rand   : num [1:5253] 0.0379 0.1782 0.147 0.2789 0.2374 ...
##   ..$ setname: chr "ABC-family proteins mediated transport"
##  $ :List of 3
##   ..$ inset  : num [1:6328] 0.42661 0.09581 0.03123 0.00863 -0.12751 ...
##   ..$ rand   : num [1:8911] 0.233 -0.0342 0.1648 -0.1126 0.0139 ...
##   ..$ setname: chr "ADORA2B mediated anti-inflammatory cytokines production"
length(res)
## [1] 1894

We can visualise the spearman correlation coefficients here.

z <- lapply(1:6, function(i){
  setname <- res[[i]]$setname
  data <- res[[i]][1:2]
  vioplot(data,col="white",ylab="Spearman rho",main=setname)

  data$inset <- sample(x=data$inset,size=1000,replace=TRUE)
  data$rand <- sample(x=data$rand,size=1000,replace=TRUE)
  boxplot(data,col="white",ylab="Spearman rho",main=setname, cex=0)
  beeswarm(data,add=TRUE,pch=19,cex=0.6)
})