Source: https://github.com/markziemann/combined_enrichment
This script is designed to fetch GTEXv8 RNA-seq expression data in order to assess the correlation of genes within a gene set.
suppressPackageStartupMessages({
# library("kableExtra")
library("beeswarm")
# library("getDEE2")
library("mitch")
library("gplots")
library("Hmisc")
library("parallel")
library("vioplot")
})
First get gene expression data.
DATURL="https://storage.googleapis.com/gtex_analysis_v8/rna_seq_data/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_reads.gct.gz"
if (! file.exists("GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_reads.gct.gz") ) {
download.file(DATURL,destfile="GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_reads.gct.gz")
tmp <- readLines("GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_reads.gct.gz")
tmp <- tmp[3:length(tmp)]
writeLines(tmp,con="GTEx_data.tsv")
}
# import data
# GTEx_data2.tsv is the cut-down version for testing
# GTEx_data.tsv is the main dataset
#mx <- read.table("GTEx_data2.tsv",header=TRUE,row.names=1)
mx <- read.table("GTEx_data.tsv",header=TRUE,row.names=1)
mx[1:6,1:4]
## Description GTEX.1117F.0226.SM.5GZZ7 GTEX.1117F.0426.SM.5EGHI
## ENSG00000223972.5 DDX11L1 0 0
## ENSG00000227232.5 WASH7P 187 109
## ENSG00000278267.1 MIR6859-1 0 0
## ENSG00000243485.5 MIR1302-2HG 1 0
## ENSG00000237613.2 FAM138A 0 0
## ENSG00000268020.3 OR4G4P 0 1
## GTEX.1117F.0526.SM.5EGHJ
## ENSG00000223972.5 0
## ENSG00000227232.5 143
## ENSG00000278267.1 1
## ENSG00000243485.5 0
## ENSG00000237613.2 0
## ENSG00000268020.3 0
dim(mx)
## [1] 56200 17383
# downsample 50 samples only
# for testing only
#mx <- mx[,1:50]
mx2 <- aggregate(. ~ Description,mx,sum)
mx2[1:6,1:4]
## Description GTEX.1117F.0226.SM.5GZZ7 GTEX.1117F.0426.SM.5EGHI
## 1 5_8S_rRNA 0 0
## 2 5S_rRNA 2 0
## 3 7SK 0 0
## 4 A1BG 230 37
## 5 A1BG-AS1 37 6
## 6 A1CF 5 8
## GTEX.1117F.0526.SM.5EGHJ
## 1 0
## 2 1
## 3 1
## 4 391
## 5 19
## 6 1
dim(mx2)
## [1] 54592 17383
# filter based on detection threshold
rownames(mx2) <- mx2[,1]
mx2[,1]=NULL
mx2 <- mx2[which(rowMeans(mx2)>=10),]
dim(mx2)
## [1] 24807 17382
# normalise
mx2 <- mx2/colSums(mx2) * 1000000
rpm <- mx2
# tidy up some files
remove(mx)
Next get pathways.
if (! file.exists("ReactomePathways.gmt")) {
download.file("https://reactome.org/download/current/ReactomePathways.gmt.zip",
destfile="ReactomePathways.gmt.zip")
unzip("ReactomePathways.gmt.zip")
}
genesets<-gmt_import("ReactomePathways.gmt")
# ++++++++++++++++++++++++++++
# flattenCorrMatrix
# ++++++++++++++++++++++++++++
# cormat : matrix of the correlation coefficients
# pmat : matrix of the correlation p-values
flattenCorrMatrix <- function(cormat, pmat) {
ut <- upper.tri(cormat)
data.frame(
row = rownames(cormat)[row(cormat)[ut]],
column = rownames(cormat)[col(cormat)[ut]],
cor =(cormat)[ut],
p = pmat[ut]
)
}
res <- mclapply(1:length(genesets), function(i){
gs <- genesets[[i]]
len <- length(gs)
x <- rpm[which(rownames(rpm) %in% gs),]
if(nrow(x)>=5) {
res <- rcorr(as.matrix(t(x)),type="spearman")
res <- flattenCorrMatrix(res$r, res$P)
# randomly select gene ids as a comparison
y <- rpm[sample(which(! rownames(rpm) %in% gs),size=len,replace=FALSE),]
res2 <- rcorr(as.matrix(t(y)),type="spearman")
res2 <- flattenCorrMatrix(res2$r, res2$P)
cormax <- max(c(res$cor,res2$cor))
cormin <- min(c(res$cor,res2$cor))
setname <- names(genesets[i])
result <- list("inset"=res$cor, "rand"=res2$cor,"setname"=setname)
return(result)
}
}, mc.cores=round(detectCores()*3/4))
looks like some empty results. Let’s get rid of those now.
str(head(res))
## List of 6
## $ :List of 3
## ..$ inset : num [1:21] 0.0945 0.3481 0.0512 0.3507 -0.1102 ...
## ..$ rand : num [1:78] 0.514 -0.261 -0.428 0.352 0.208 ...
## ..$ setname: chr "2-LTR circle formation"
## $ : NULL
## $ :List of 3
## ..$ inset : num [1:325] 0.373 0.192 0.134 0.299 0.17 ...
## ..$ rand : num [1:325] -0.169 -0.145 0.272 0.119 0.252 ...
## ..$ setname: chr "A tetrasaccharide linker sequence is required for GAG synthesis"
## $ :List of 3
## ..$ inset : num [1:2926] -0.00235 0.00883 0.05399 0.31089 0.18105 ...
## ..$ rand : num [1:3003] 0.0585 0.1892 0.0518 0.1463 -0.0135 ...
## ..$ setname: chr "ABC transporter disorders"
## $ :List of 3
## ..$ inset : num [1:153] -0.00225 -0.03581 0.00353 0.14359 0.05399 ...
## ..$ rand : num [1:153] 0.07153 0.06565 0.2078 -0.00457 0.02511 ...
## ..$ setname: chr "ABC transporters in lipid homeostasis"
## $ :List of 3
## ..$ inset : num [1:5151] -0.00225 -0.03581 0.00353 0.14359 0.05399 ...
## ..$ rand : num [1:5253] 0.0379 0.1782 0.147 0.2789 0.2374 ...
## ..$ setname: chr "ABC-family proteins mediated transport"
res <- res[which(lapply(res,length)>0)]
str(head(res))
## List of 6
## $ :List of 3
## ..$ inset : num [1:21] 0.0945 0.3481 0.0512 0.3507 -0.1102 ...
## ..$ rand : num [1:78] 0.514 -0.261 -0.428 0.352 0.208 ...
## ..$ setname: chr "2-LTR circle formation"
## $ :List of 3
## ..$ inset : num [1:325] 0.373 0.192 0.134 0.299 0.17 ...
## ..$ rand : num [1:325] -0.169 -0.145 0.272 0.119 0.252 ...
## ..$ setname: chr "A tetrasaccharide linker sequence is required for GAG synthesis"
## $ :List of 3
## ..$ inset : num [1:2926] -0.00235 0.00883 0.05399 0.31089 0.18105 ...
## ..$ rand : num [1:3003] 0.0585 0.1892 0.0518 0.1463 -0.0135 ...
## ..$ setname: chr "ABC transporter disorders"
## $ :List of 3
## ..$ inset : num [1:153] -0.00225 -0.03581 0.00353 0.14359 0.05399 ...
## ..$ rand : num [1:153] 0.07153 0.06565 0.2078 -0.00457 0.02511 ...
## ..$ setname: chr "ABC transporters in lipid homeostasis"
## $ :List of 3
## ..$ inset : num [1:5151] -0.00225 -0.03581 0.00353 0.14359 0.05399 ...
## ..$ rand : num [1:5253] 0.0379 0.1782 0.147 0.2789 0.2374 ...
## ..$ setname: chr "ABC-family proteins mediated transport"
## $ :List of 3
## ..$ inset : num [1:6328] 0.42661 0.09581 0.03123 0.00863 -0.12751 ...
## ..$ rand : num [1:8911] 0.233 -0.0342 0.1648 -0.1126 0.0139 ...
## ..$ setname: chr "ADORA2B mediated anti-inflammatory cytokines production"
length(res)
## [1] 1894
We can visualise the spearman correlation coefficients here.
z <- lapply(1:6, function(i){
setname <- res[[i]]$setname
data <- res[[i]][1:2]
vioplot(data,col="white",ylab="Spearman rho",main=setname)
data$inset <- sample(x=data$inset,size=1000,replace=TRUE)
data$rand <- sample(x=data$rand,size=1000,replace=TRUE)
boxplot(data,col="white",ylab="Spearman rho",main=setname, cex=0)
beeswarm(data,add=TRUE,pch=19,cex=0.6)
})
We can run a t-test to quantify the difference in distributions.
res2 <- lapply(1:length(res), function(i) {
inset <- res[[i]]$inset
rand <- res[[i]]$rand
tt <- t.test(x=inset,y=rand)
return(c("stat"=tt$statistic,"pvalue"=tt$p.value))
})
res2 <- do.call(rbind,res2)
rownames(res2) <- sapply(res,"[[",3)
summary(res2)
## stat.t pvalue
## Min. :-65.062 Min. :0.0000000
## 1st Qu.: 1.395 1st Qu.:0.0000000
## Median : 5.053 Median :0.0000002
## Mean : 16.077 Mean :0.1048890
## 3rd Qu.: 17.584 3rd Qu.:0.0509141
## Max. :487.644 Max. :0.9957407
output the results
write.table(res2,file="cor.tsv",quote=FALSE,sep="\t")
Now we can look at the positive and negatively correlated gene sets.
poscor <- head(res2[order(-res2[,1]),],10)
poscor #%>% kbl(caption="most correlated gene sets") %>% kable_paper("hover", full_width = F)
## stat.t pvalue
## Gene expression (Transcription) 487.6439 0
## RNA Polymerase II Transcription 397.3404 0
## Metabolism of RNA 381.9847 0
## Generic Transcription Pathway 308.9746 0
## Metabolism of proteins 289.2594 0
## Cell Cycle 256.5527 0
## Post-translational protein modification 215.5013 0
## Cell Cycle, Mitotic 214.0735 0
## Processing of Capped Intron-Containing Pre-mRNA 201.0746 0
## Cellular responses to stress 197.0605 0
poscor_res <- res[which(sapply(res,"[[",3) %in% rownames(poscor))]
Vioplot of positively correlated sets.
z <- lapply(1:length(poscor_res), function(i){
setname <- poscor_res[[i]]$setname
gset <- genesets[[which(names(genesets) ==setname)]]
n <- length(which(gset %in% rownames(rpm)))
data <- poscor_res[[i]][1:2]
vioplot(data , ylab="Spearman rho",main=setname )
grid()
mtext(paste("n=",n))
data$inset <- sample(x=data$inset,size=1000,replace=TRUE)
data$rand <- sample(x=data$rand,size=1000,replace=TRUE)
boxplot(data,col="white",ylab="Spearman rho",main=setname,cex=0)
grid()
beeswarm(data,add=TRUE,pch=19,cex=0.6)
mtext(paste("n=",n))
})
pdf("posvio.pdf",height=5,width=5)
z <- lapply(1:length(poscor_res), function(i){
setname <- poscor_res[[i]]$setname
gset <- genesets[[which(names(genesets) ==setname)]]
n <- length(which(gset %in% rownames(rpm)))
data <- poscor_res[[i]][1:2]
vioplot(data , ylab="Spearman rho",main=setname )
grid()
mtext(paste("n=",n))
data$inset <- sample(x=data$inset,size=1000,replace=TRUE)
data$rand <- sample(x=data$rand,size=1000,replace=TRUE)
boxplot(data,col="white",ylab="Spearman rho",main=setname,cex=0)
grid()
beeswarm(data,add=TRUE,pch=19,cex=0.6)
mtext(paste("n=",n))
})
dev.off()
## png
## 2
Heatmap of pos cor sets.
z <- lapply(1:10, function(i) {
setname <- rownames(poscor)[i]
gset <- genesets[[which(names(genesets) ==setname)]]
mx <- rpm[which(rownames(rpm) %in% gset),]
heatmap.2(as.matrix(mx),trace="none",scale="row",main=setname, margin=c(5,7))
heatmap.2(cor(t(mx)),trace="none",scale="none",main=setname,margin=c(5,7))
})
Now focus on negatively correlated sets.
negcor <- head(res2[order(res2[,1]),],10)
negcor #%>% kbl(caption="most anti-correlated gene sets") %>% kable_paper("hover", full_width = F)
## stat.t pvalue
## Signaling by GPCR -65.06250 0.000000e+00
## GPCR downstream signalling -45.09572 0.000000e+00
## Hemostasis -42.62390 0.000000e+00
## GPCR ligand binding -40.57495 0.000000e+00
## Transport of small molecules -40.36903 0.000000e+00
## Class A/1 (Rhodopsin-like receptors) -35.30746 4.895173e-271
## Metabolism -32.64372 1.050881e-233
## Developmental Biology -31.72911 7.889720e-221
## Peptide ligand-binding receptors -31.52014 2.918567e-214
## SLC-mediated transmembrane transport -30.88762 7.364697e-208
negcor_res <- res[which(sapply(res,"[[",3) %in% rownames(negcor))]
z <- lapply(1:length(negcor_res), function(i){
setname <- negcor_res[[i]]$setname
gset <- genesets[[which(names(genesets) ==setname)]]
n <- length(which(gset %in% rownames(rpm)))
data <- negcor_res[[i]][1:2]
vioplot(data , ylab="Spearman rho",main=setname )
grid()
mtext(paste("n=",n))
data$inset <- sample(x=data$inset,size=1000,replace=TRUE)
data$rand <- sample(x=data$rand,size=1000,replace=TRUE)
boxplot(data,col="white",ylab="Spearman rho",main=setname,cex=0)
grid()
beeswarm(data,add=TRUE,pch=19,cex=0.6)
mtext(paste("n=",n))
})
pdf("negvio.pdf",height=5,width=5)
z <- lapply(1:length(negcor_res), function(i){
setname <- negcor_res[[i]]$setname
gset <- genesets[[which(names(genesets) ==setname)]]
n <- length(which(gset %in% rownames(rpm)))
data <- negcor_res[[i]][1:2]
vioplot(data , ylab="Spearman rho",main=setname )
grid()
mtext(paste("n=",n))
data$inset <- sample(x=data$inset,size=1000,replace=TRUE)
data$rand <- sample(x=data$rand,size=1000,replace=TRUE)
boxplot(data,col="white",ylab="Spearman rho",main=setname,cex=0)
grid()
beeswarm(data,add=TRUE,pch=19,cex=0.6)
mtext(paste("n=",n))
})
dev.off()
## png
## 2
Heatmap of neg cor sets.
z <- lapply(1:10, function(i) {
setname <- rownames(negcor)[i]
gset <- genesets[[which(names(genesets) ==setname)]]
mx <- rpm[which(rownames(rpm) %in% gset),]
heatmap.2(as.matrix(mx),trace="none",scale="row",main=setname,margin=c(5,7))
heatmap.2(cor(t(mx)),trace="none",scale="none",main=setname,margin=c(5,7))
})
Now visualise the test statistics found.
res2 <- read.table("cor.tsv",sep="\t")
tstats <- res2[,"stat.t"]
par(mfrow=c(2,1))
hist(tstats,breaks=100,main="",xlab="test statistic")
grid()
hist(tstats,xlim=c(-50,50),breaks=100,main="",xlab="test statistic")
grid()
pdf("hist.pdf",height=5,width=5)
par(mfrow=c(2,1))
hist(tstats,breaks=100,main="",xlab="test statistic")
grid()
hist(tstats,xlim=c(-50,50),breaks=100,main="",xlab="test statistic")
grid()
dev.off()
## png
## 2
boxplot(tstats,horizontal=TRUE,col="white",cex=0, xlab="t-test statistic")
grid()
beeswarm(tstats,add=TRUE,pch=19,cex=0.4,horiz=TRUE)
boxplot(tstats,horizontal=TRUE,col="white",cex=0,ylim=c(-50,50),xlab="t-test statistic")
grid()
beeswarm(tstats,add=TRUE,pch=19,cex=0.4,horiz=TRUE)
pdf("tstat_swarm.pdf",height=5,width=5)
par(mfrow=c(2,1))
boxplot(tstats,horizontal=TRUE,col="white",cex=0, xlab="t-test statistic")
grid()
beeswarm(tstats,add=TRUE,pch=19,cex=0.4,horiz=TRUE)
boxplot(tstats,horizontal=TRUE,col="white",cex=0,ylim=c(-50,50),xlab="t-test statistic")
grid()
beeswarm(tstats,add=TRUE,pch=19,cex=0.4,horiz=TRUE)
dev.off()
## png
## 2
par(mfrow=c(1,1))
pos <- length( which(res2[,1]>0 & res2[,2] < 0.05) )
neg <- length( which(res2[,1]<0 & res2[,2] < 0.05) )
none <- length( which(res2[,2] > 0.05) )
bars <- c("Pos"=pos,"None"=none,"Neg"=neg)
barplot(bars,ylab="no. gene sets with correlated expression",ylim=c(0,max(bars)*1.3))
text(((1:length(bars))-0.5)*1.25,bars+50,labels=bars)
pdf("bars.pdf",height=4,width=4)
barplot(bars,ylab="no. gene sets with correlated expression",ylim=c(0,max(bars)*1.3))
text(((1:length(bars))-0.5)*1.25,bars+50,labels=bars)
dev.off()
## png
## 2
This work confirms the observations by Hong, that genes in a set are on the whole, most gene sets positively correlated members.
Some gene sets are highly correlated, while others are heterogenous, or slightly anticorrelated.
We can try to reproduce this correlation structure in simulated data.
sessionInfo()
## R version 4.0.1 (2020-06-06)
## Platform: x86_64-conda_cos6-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.3 LTS
##
## Matrix products: default
## BLAS/LAPACK: /ceph-g/opt/miniconda3/envs/R40/lib/libopenblasp-r0.3.9.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] vioplot_0.3.7 zoo_1.8-8 sm_2.2-5.7 Hmisc_4.6-0
## [5] ggplot2_3.3.5 Formula_1.2-4 survival_3.2-3 lattice_0.20-41
## [9] gplots_3.1.1 mitch_1.0.10 beeswarm_0.4.0
##
## loaded via a namespace (and not attached):
## [1] splines_4.0.1 gtools_3.9.2 shiny_1.4.0.2
## [4] assertthat_0.2.1 latticeExtra_0.6-29 blob_1.2.1
## [7] yaml_2.2.1 backports_1.1.8 pillar_1.7.0
## [10] glue_1.4.1 digest_0.6.25 RColorBrewer_1.1-2
## [13] promises_1.1.1 checkmate_2.0.0 colorspace_1.4-1
## [16] htmltools_0.5.0 httpuv_1.5.4 Matrix_1.2-18
## [19] plyr_1.8.6 pkgconfig_2.0.3 purrr_0.3.4
## [22] xtable_1.8-4 scales_1.1.1 jpeg_0.1-9
## [25] later_1.1.0.1 tibble_3.0.1 htmlTable_2.4.0
## [28] echarts4r_0.3.3 generics_0.0.2 ellipsis_0.3.2
## [31] withr_2.2.0 nnet_7.3-14 cli_3.1.1
## [34] magrittr_1.5 crayon_1.3.4 mime_0.9
## [37] evaluate_0.14 GGally_2.1.2 fansi_0.4.1
## [40] MASS_7.3-51.6 foreign_0.8-80 data.table_1.12.8
## [43] tools_4.0.1 lifecycle_1.0.1 stringr_1.4.0
## [46] munsell_0.5.0 cluster_2.1.0 compiler_4.0.1
## [49] caTools_1.18.2 rlang_1.0.0 grid_4.0.1
## [52] rstudioapi_0.11 htmlwidgets_1.5.1 bitops_1.0-7
## [55] base64enc_0.1-3 rmarkdown_2.3 gtable_0.3.0
## [58] DBI_1.1.0 reshape_0.8.8 reshape2_1.4.4
## [61] R6_2.4.1 gridExtra_2.3 knitr_1.28
## [64] dplyr_1.0.8 fastmap_1.0.1 utf8_1.1.4
## [67] KernSmooth_2.23-17 stringi_1.4.6 Rcpp_1.0.8
## [70] vctrs_0.3.8 rpart_4.1-15 png_0.1-7
## [73] tidyselect_1.1.1 xfun_0.15