Source: https://github.com/markziemann/combined_enrichment
This script is designed to fetch GTEXv8 RNA-seq expression data in order to assess the correlation of genes within a gene set.
suppressPackageStartupMessages({
library("kableExtra")
library("beeswarm")
library("getDEE2")
library("mitch")
library("gplots")
library("Hmisc")
library("parallel")
library("vioplot")
})
First get gene expression data.
DATURL="https://storage.googleapis.com/gtex_analysis_v8/rna_seq_data/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_reads.gct.gz"
if (! file.exists("GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_reads.gct.gz") ) {
download.file(DATURL,destfile="GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_reads.gct.gz")
tmp <- readLines("GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_reads.gct.gz")
tmp <- tmp[3:length(tmp)]
writeLines(tmp,con="GTEx_data.tsv")
}
mx <- read.table("GTEx_data.tsv",header=TRUE,row.names=1)
mx[1:6,1:4]
## Description GTEX.1117F.0226.SM.5GZZ7 GTEX.1117F.0426.SM.5EGHI
## ENSG00000223972.5 DDX11L1 0 0
## ENSG00000227232.5 WASH7P 187 109
## ENSG00000278267.1 MIR6859-1 0 0
## ENSG00000243485.5 MIR1302-2HG 1 0
## ENSG00000237613.2 FAM138A 0 0
## ENSG00000268020.3 OR4G4P 0 1
## GTEX.1117F.0526.SM.5EGHJ
## ENSG00000223972.5 0
## ENSG00000227232.5 143
## ENSG00000278267.1 1
## ENSG00000243485.5 0
## ENSG00000237613.2 0
## ENSG00000268020.3 0
dim(mx)
## [1] 56200 17383
# downsample 50 samples only
# for testing only
mx <- mx[,1:50]
mx2 <- aggregate(. ~ Description,mx,sum)
mx2[1:6,1:4]
## Description GTEX.1117F.0226.SM.5GZZ7 GTEX.1117F.0426.SM.5EGHI
## 1 5_8S_rRNA 0 0
## 2 5S_rRNA 2 0
## 3 7SK 0 0
## 4 A1BG 230 37
## 5 A1BG-AS1 37 6
## 6 A1CF 5 8
## GTEX.1117F.0526.SM.5EGHJ
## 1 0
## 2 1
## 3 1
## 4 391
## 5 19
## 6 1
dim(mx2)
## [1] 54592 50
# filter based on detection threshold
rownames(mx2) <- mx2[,1]
mx2[,1]=NULL
mx2 <- mx2[which(rowMeans(mx2)>=10),]
dim(mx2)
## [1] 24760 49
# normalise
mx2 <- mx2/colSums(mx2) * 1000000
rpm <- mx2
# tidy up some files
remove(mx)
Next get pathways.
if (! file.exists("ReactomePathways.gmt")) {
download.file("https://reactome.org/download/current/ReactomePathways.gmt.zip",
destfile="ReactomePathways.gmt.zip")
unzip("ReactomePathways.gmt.zip")
}
genesets<-gmt_import("ReactomePathways.gmt")
# ++++++++++++++++++++++++++++
# flattenCorrMatrix
# ++++++++++++++++++++++++++++
# cormat : matrix of the correlation coefficients
# pmat : matrix of the correlation p-values
flattenCorrMatrix <- function(cormat, pmat) {
ut <- upper.tri(cormat)
data.frame(
row = rownames(cormat)[row(cormat)[ut]],
column = rownames(cormat)[col(cormat)[ut]],
cor =(cormat)[ut],
p = pmat[ut]
)
}
res <- mclapply(1:length(genesets), function(i){
gs <- genesets[[i]]
len <- length(gs)
x <- rpm[which(rownames(rpm) %in% gs),]
if(nrow(x)>=10) {
res <- rcorr(as.matrix(t(x)),type="spearman")
res <- flattenCorrMatrix(res$r, res$P)
# randomly select gene ids as a comparison
y <- rpm[sample(which(! rownames(rpm) %in% gs),size=len,replace=FALSE),]
res2 <- rcorr(as.matrix(t(y)),type="spearman")
res2 <- flattenCorrMatrix(res2$r, res2$P)
cormax <- max(c(res$cor,res2$cor))
cormin <- min(c(res$cor,res2$cor))
setname <- names(genesets[i])
result <- list("inset"=res$cor, "rand"=res2$cor,"setname"=setname)
return(result)
}
}, mc.cores=16)
looks like some empty results. Let’s get rid of those now.
str(head(res))
## List of 6
## $ : NULL
## $ : NULL
## $ :List of 3
## ..$ inset : num [1:325] 0.4323 0.358 0.3406 0.4211 0.0626 ...
## ..$ rand : num [1:325] 0.195 0.119 0.395 0.223 0.444 ...
## ..$ setname: chr "A tetrasaccharide linker sequence is required for GAG synthesis"
## $ :List of 3
## ..$ inset : num [1:2926] -0.2965 -0.042 -0.0864 0.1843 0.3094 ...
## ..$ rand : num [1:3003] 0.0633 0.0826 0.0334 0.3253 0.1114 ...
## ..$ setname: chr "ABC transporter disorders"
## $ :List of 3
## ..$ inset : num [1:153] 0.000766 0.137041 0.021281 0.007041 -0.0864 ...
## ..$ rand : num [1:153] 0.521 0.118 0.295 0.212 0.304 ...
## ..$ setname: chr "ABC transporters in lipid homeostasis"
## $ :List of 3
## ..$ inset : num [1:5151] 0.000766 0.137041 0.021281 0.007041 -0.0864 ...
## ..$ rand : num [1:5253] -0.0656 0.1289 -0.3052 0.3258 0.0548 ...
## ..$ setname: chr "ABC-family proteins mediated transport"
res <- res[which(lapply(res,length)>0)]
str(head(res))
## List of 6
## $ :List of 3
## ..$ inset : num [1:325] 0.4323 0.358 0.3406 0.4211 0.0626 ...
## ..$ rand : num [1:325] 0.195 0.119 0.395 0.223 0.444 ...
## ..$ setname: chr "A tetrasaccharide linker sequence is required for GAG synthesis"
## $ :List of 3
## ..$ inset : num [1:2926] -0.2965 -0.042 -0.0864 0.1843 0.3094 ...
## ..$ rand : num [1:3003] 0.0633 0.0826 0.0334 0.3253 0.1114 ...
## ..$ setname: chr "ABC transporter disorders"
## $ :List of 3
## ..$ inset : num [1:153] 0.000766 0.137041 0.021281 0.007041 -0.0864 ...
## ..$ rand : num [1:153] 0.521 0.118 0.295 0.212 0.304 ...
## ..$ setname: chr "ABC transporters in lipid homeostasis"
## $ :List of 3
## ..$ inset : num [1:5151] 0.000766 0.137041 0.021281 0.007041 -0.0864 ...
## ..$ rand : num [1:5253] -0.0656 0.1289 -0.3052 0.3258 0.0548 ...
## ..$ setname: chr "ABC-family proteins mediated transport"
## $ :List of 3
## ..$ inset : num [1:5356] 0.11 -0.0119 -0.1712 0.1566 -0.1986 ...
## ..$ rand : num [1:8911] -0.2247 0.1902 0.0342 0.229 -0.2036 ...
## ..$ setname: chr "ADORA2B mediated anti-inflammatory cytokines production"
## $ :List of 3
## ..$ inset : num [1:276] 0.376 0.138 0.286 0.361 0.595 ...
## ..$ rand : num [1:300] 0.09561 -0.00112 0.31929 0.60418 0.38806 ...
## ..$ setname: chr "ADP signalling through P2Y purinoceptor 1"
We can visualise the spearman correlation coefficients here.
z <- lapply(1:6, function(i){
setname <- res[[i]]$setname
boxplot(res[[i]][1:2],col="white",ylab="Spearman rho",main=setname)
beeswarm(res[[i]][1:2],add=TRUE,pch=19)
})
We can run a t-test to quantify the difference in distributions.
res2 <- lapply(1:length(res), function(i) {
inset <- res[[i]]$inset
rand <- res[[i]]$rand
tt <- t.test(x=inset,y=rand)
return(c("stat"=tt$statistic,"pvalue"=tt$p.value))
})
res2 <- do.call(rbind,res2)
rownames(res2) <- sapply(res,"[[",3)
summary(res2)
## stat.t pvalue
## Min. :-85.483 Min. :0.00000
## 1st Qu.: 1.449 1st Qu.:0.00000
## Median : 6.348 Median :0.00000
## Mean : 14.944 Mean :0.08360
## 3rd Qu.: 19.635 3rd Qu.:0.01501
## Max. :307.768 Max. :0.99906
Now we can look at the positive and negatively correlated gene sets.
poscor <- head(res2[order(-res2[,1]),],10)
poscor %>% kbl(caption="most correlated gene sets") %>% kable_paper("hover", full_width = F)
stat.t | pvalue | |
---|---|---|
Gene expression (Transcription) | 307.7681 | 0 |
Metabolism of RNA | 280.2620 | 0 |
RNA Polymerase II Transcription | 279.9882 | 0 |
Generic Transcription Pathway | 229.6446 | 0 |
Immune System | 212.8620 | 0 |
Cell Cycle | 207.1243 | 0 |
Cell Cycle, Mitotic | 163.8820 | 0 |
Processing of Capped Intron-Containing Pre-mRNA | 153.3164 | 0 |
M Phase | 130.0567 | 0 |
Cytokine Signaling in Immune system | 126.8020 | 0 |
poscor_res <- res[which(sapply(res,"[[",3) %in% rownames(poscor))]
Vioplot of positively correlated sets.
z <- lapply(1:length(poscor_res), function(i){
setname <- poscor_res[[i]]$setname
gset <- genesets[[which(names(genesets) ==setname)]]
n <- length(which(gset %in% rownames(rpm)))
vioplot(poscor_res[[i]][1:2] , ylab="Spearman rho",main=setname )
mtext(paste("n=",n))
})
Heatmap of pos cor sets.
z <- lapply(1:10, function(i) {
setname <- rownames(poscor)[i]
gset <- genesets[[which(names(genesets) ==setname)]]
mx <- rpm[which(rownames(rpm) %in% gset),]
heatmap.2(as.matrix(mx),trace="none",scale="row",main=setname, margin=c(5,7))
heatmap.2(cor(t(mx)),trace="none",scale="none",main=setname,margin=c(5,7))
})
Now focus on negatively correlated sets.
anticor <- head(res2[order(res2[,1]),],10)
anticor %>% kbl(caption="most anti-correlated gene sets") %>% kable_paper("hover", full_width = F)
stat.t | pvalue | |
---|---|---|
Metabolism | -85.48251 | 0 |
Developmental Biology | -61.56000 | 0 |
Transport of small molecules | -47.89319 | 0 |
GPCR ligand binding | -32.89906 | 0 |
Sensory Perception | -26.55481 | 0 |
Muscle contraction | -18.78015 | 0 |
Class A/1 (Rhodopsin-like receptors) | -18.27155 | 0 |
Peptide ligand-binding receptors | -16.83629 | 0 |
Transport of inorganic cations/anions and amino acids/oligopeptides | -15.09171 | 0 |
Cardiac conduction | -15.00643 | 0 |
anticor_res <- res[which(sapply(res,"[[",3) %in% rownames(anticor))]
z <- lapply(1:length(anticor_res), function(i){
setname <- anticor_res[[i]]$setname
gset <- genesets[[which(names(genesets) ==setname)]]
n <- length(which(gset %in% rownames(rpm)))
vioplot(anticor_res[[i]][1:2] , ylab="Spearman rho",main=setname )
mtext(paste("n=",n))
})
Heatmap of neg cor sets.
z <- lapply(1:10, function(i) {
setname <- rownames(anticor)[i]
gset <- genesets[[which(names(genesets) ==setname)]]
mx <- rpm[which(rownames(rpm) %in% gset),]
heatmap.2(as.matrix(mx),trace="none",scale="row",main=setname,margin=c(5,7))
heatmap.2(cor(t(mx)),trace="none",scale="none",main=setname,margin=c(5,7))
})
Now visualise the test statistics found.
par(mfrow=c(2,1))
hist(res2[,"stat.t"],breaks=100,main="",xlab="test statistic")
boxplot(res2[,"stat.t"],horizontal=TRUE)
hist(res2[,"stat.t"],xlim=c(-50,50),breaks=100,main="",xlab="test statistic")
boxplot(res2[,"stat.t"],ylim=c(-50,50),horizontal=TRUE)
par(mfrow=c(1,1))
pos <- length( which(res2[,1]>0 & res2[,2] < 0.05) )
neg <- length( which(res2[,1]<0 & res2[,2] < 0.05) )
none <- length( which(res2[,2] > 0.05) )
bars <- c("Pos"=pos,"None"=none,"Neg"=neg)
barplot(bars,ylab="no. gene sets with correlated expression",ylim=c(0,max(bars)*1.3))
text(((1:length(bars))-0.5)*1.25,bars+50,labels=bars)
This work confirms the observations by Hong, that genes in a set are on the whole, most gene sets positively correlated members.
Some gene sets are highly correlated, while others are heterogenous, or slightly anticorrelated.
We can try to reproduce this correlation structure in simulated data.
sessionInfo()
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.3 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] vioplot_0.3.7 zoo_1.8-9 sm_2.2-5.7 Hmisc_4.6-0
## [5] ggplot2_3.3.5 Formula_1.2-4 survival_3.2-13 lattice_0.20-45
## [9] gplots_3.1.1 mitch_1.4.1 getDEE2_1.2.0 beeswarm_0.4.0
## [13] kableExtra_1.3.4
##
## loaded via a namespace (and not attached):
## [1] bitops_1.0-7 matrixStats_0.61.0
## [3] webshot_0.5.2 RColorBrewer_1.1-2
## [5] httr_1.4.2 GenomeInfoDb_1.28.4
## [7] backports_1.4.1 tools_4.1.2
## [9] bslib_0.3.1 utf8_1.2.2
## [11] R6_2.5.1 rpart_4.1.16
## [13] KernSmooth_2.23-20 DBI_1.1.2
## [15] BiocGenerics_0.38.0 colorspace_2.0-2
## [17] nnet_7.3-17 withr_2.4.3
## [19] htm2txt_2.1.1 tidyselect_1.1.1
## [21] gridExtra_2.3 GGally_2.1.2
## [23] compiler_4.1.2 cli_3.1.1
## [25] rvest_1.0.2 Biobase_2.52.0
## [27] htmlTable_2.4.0 xml2_1.3.3
## [29] DelayedArray_0.18.0 sass_0.4.0
## [31] checkmate_2.0.0 caTools_1.18.2
## [33] scales_1.1.1 systemfonts_1.0.3
## [35] stringr_1.4.0 digest_0.6.29
## [37] foreign_0.8-82 rmarkdown_2.11
## [39] svglite_2.0.0 XVector_0.32.0
## [41] jpeg_0.1-9 base64enc_0.1-3
## [43] pkgconfig_2.0.3 htmltools_0.5.2
## [45] MatrixGenerics_1.4.3 highr_0.9
## [47] fastmap_1.1.0 htmlwidgets_1.5.4
## [49] rlang_1.0.0 rstudioapi_0.13
## [51] shiny_1.7.1 jquerylib_0.1.4
## [53] generics_0.1.1 jsonlite_1.7.3
## [55] gtools_3.9.2 dplyr_1.0.7
## [57] RCurl_1.98-1.5 magrittr_2.0.2
## [59] GenomeInfoDbData_1.2.6 Matrix_1.4-0
## [61] Rcpp_1.0.8 munsell_0.5.0
## [63] S4Vectors_0.30.2 fansi_1.0.2
## [65] lifecycle_1.0.1 stringi_1.7.6
## [67] yaml_2.2.2 MASS_7.3-55
## [69] SummarizedExperiment_1.22.0 zlibbioc_1.38.0
## [71] plyr_1.8.6 grid_4.1.2
## [73] promises_1.2.0.1 crayon_1.4.2
## [75] echarts4r_0.4.3 splines_4.1.2
## [77] knitr_1.37 pillar_1.6.5
## [79] GenomicRanges_1.44.0 reshape2_1.4.4
## [81] stats4_4.1.2 glue_1.6.1
## [83] evaluate_0.14 latticeExtra_0.6-29
## [85] data.table_1.14.2 png_0.1-7
## [87] vctrs_0.3.8 httpuv_1.6.5
## [89] gtable_0.3.0 purrr_0.3.4
## [91] reshape_0.8.8 assertthat_0.2.1
## [93] xfun_0.29 mime_0.12
## [95] xtable_1.8-4 later_1.3.0
## [97] viridisLite_0.4.0 tibble_3.1.6
## [99] IRanges_2.26.0 cluster_2.1.2
## [101] ellipsis_0.3.2