Source: https://github.com/markziemann/combined_enrichment
This script is designed to fetch Illumina Human Body Map 2.0 Project gene expression data in order to assess the correlation of genes within a gene set.
The data is being fetched using getDEE2 with the accession number ERP000546.
suppressPackageStartupMessages({
library("kableExtra")
library("beeswarm")
library("getDEE2")
library("mitch")
library("gplots")
library("Hmisc")
library("parallel")
library("vioplot")
})
First get gene expression data.
mdat <- getDEE2Metadata(species="hsapiens")
mdat2 <- mdat[which(mdat$SRP_accession == "ERP000546"),]
dat <- getDEE2::getDEE2(species="hsapiens",
metadata=mdat,
SRRvec=mdat2$SRR_accession,
legacy=TRUE)
## For more information about DEE2 QC metrics, visit
## https://github.com/markziemann/dee2/blob/master/qc/qc_metrics.md
dat <- Tx2Gene(dat)
names(dat)
## [1] "Tx2Gene" "GeneCounts" "TxCounts" "GeneInfo"
## [5] "TxInfo" "QcMx" "MetadataSummary" "MetadataFull"
## [9] "absent"
head(dat$Tx2Gene)
## ERR030856 ERR030857 ERR030858 ERR030859 ERR030860 ERR030861
## ENSG00000000003.14 3143.8387 3179.4595 3098.9818 4033.8767 4176.7202 4127.6651
## ENSG00000000005.5 354.0004 340.0000 297.9998 435.9999 472.0003 443.0004
## ENSG00000000419.12 2062.9960 2049.0021 1959.9971 2690.0033 2735.0004 2686.9951
## ENSG00000000457.13 909.4659 886.7095 917.8299 1135.5987 1108.9460 1034.7060
## ENSG00000000460.16 275.6274 276.8625 282.7185 426.6796 408.1430 414.8135
## ENSG00000000938.12 1691.9997 1753.0005 1728.0012 2526.9949 2436.9969 2478.0045
## ERR030862 ERR030863 ERR030864 ERR030865 ERR030866 ERR030867
## ENSG00000000003.14 1706.9480 1695.3120 3123.7089 2968.2921 4118.8537 4126.2909
## ENSG00000000005.5 261.0000 278.9998 341.0004 321.0003 445.9996 459.9998
## ENSG00000000419.12 981.0004 932.0000 1994.9960 1982.0039 2701.0043 2586.9994
## ENSG00000000457.13 606.4480 582.5506 898.3052 861.0053 1035.4084 1049.4601
## ENSG00000000460.16 267.3197 286.0011 312.7018 275.3162 424.6966 406.5873
## ENSG00000000938.12 605.9999 567.0011 1689.9998 1747.0004 2446.0039 2218.0047
## ERR030868 ERR030869 ERR030870 ERR030871 ERR030872 ERR030873
## ENSG00000000003.14 1712.4254 1712.1002 1734.4042 1728.1194 4094.4162 5054.86170
## ENSG00000000005.5 287.9997 303.0000 313.9996 299.0002 19.0000 63.99997
## ENSG00000000419.12 918.9991 978.0006 983.0003 944.0004 3164.9993 3053.00420
## ENSG00000000457.13 560.1490 592.2570 619.6080 611.2100 1815.1573 1412.01660
## ENSG00000000460.16 287.2886 272.7070 260.8160 306.4214 371.5888 2456.60933
## ENSG00000000938.12 657.0007 616.9997 567.0000 574.0007 410.0004 168.99990
## ERR030874 ERR030875 ERR030876 ERR030877 ERR030878
## ENSG00000000003.14 5621.8377 47.47595 154.49620 3804.4443 1400.91460
## ENSG00000000005.5 48.0000 0.00000 5.00000 39.0000 26.00005
## ENSG00000000419.12 2275.9966 2318.00440 1373.00030 2457.0010 1780.00456
## ENSG00000000457.13 2049.0537 1895.20300 602.13290 1886.5268 1274.19650
## ENSG00000000460.16 361.1265 287.98750 56.47252 434.8744 371.42983
## ENSG00000000938.12 284.9998 33755.03500 156.99970 363.9998 2146.99930
## ERR030879 ERR030880 ERR030881 ERR030882 ERR030883 ERR030884
## ENSG00000000003.14 3143.2243 5153.0686 1323.4207 1199.901 3921.2988 3425.4353
## ENSG00000000005.5 7.0000 1439.9950 394.9998 8.000 3730.9960 147.0001
## ENSG00000000419.12 1782.0005 1321.9988 1792.0047 1476.001 1638.0040 1894.0023
## ENSG00000000457.13 568.9865 604.6376 1224.8002 1002.883 1315.7212 767.9053
## ENSG00000000460.16 211.4579 316.5365 328.8774 191.897 285.5377 205.0773
## ENSG00000000938.12 5467.0051 1879.9965 1256.0010 154.000 463.0002 273.0002
## ERR030885 ERR030886 ERR030887 ERR030888 ERR030889 ERR030890
## ENSG00000000003.14 3553.0532 665.06501 8635.4253 5134.5327 1239.1399 1013.6777
## ENSG00000000005.5 12.0000 20.99998 0.0000 1474.9970 429.0000 13.0000
## ENSG00000000419.12 2368.9999 1682.99840 1238.9999 1403.0001 1797.9971 1303.0039
## ENSG00000000457.13 1266.7593 587.17850 938.1669 697.8841 1295.6834 830.8925
## ENSG00000000460.16 318.6973 103.88349 366.1106 286.0438 274.5181 143.6693
## ENSG00000000938.12 520.0005 340.99960 437.9999 1797.0012 1264.9988 144.0003
## ERR030891 ERR030892 ERR030893 ERR030894 ERR030895 ERR030896
## ENSG00000000003.14 4246.4435 3337.4468 3646.36420 657.2884 8192.8496 3149.9916
## ENSG00000000005.5 3797.9970 137.9997 19.00003 22.0000 0.0000 9.0000
## ENSG00000000419.12 1738.9967 1860.9952 2296.00490 1518.9958 1135.9995 1814.0026
## ENSG00000000457.13 1419.9085 697.8773 1323.05670 458.2305 844.6574 545.3480
## ENSG00000000460.16 278.6460 186.2239 260.77698 112.5909 303.0254 181.0319
## ENSG00000000938.12 489.9995 249.0004 502.99990 294.9997 392.9999 5604.9922
## ERR030897 ERR030898 ERR030899 ERR030900 ERR030901
## ENSG00000000003.14 1342.6442 3948.9691 150.3535 57.2157 5561.4037
## ENSG00000000005.5 18.0000 54.0000 8.0000 0.0000 32.0000
## ENSG00000000419.12 1800.9996 2591.0032 1391.9993 2437.9960 2359.0036
## ENSG00000000457.13 1277.7472 1906.5170 622.7953 1999.0180 2051.2105
## ENSG00000000460.16 398.6739 534.0472 62.2850 311.3919 409.2937
## ENSG00000000938.12 2098.0029 396.0002 160.9997 35409.9590 319.0000
## ERR030902 ERR030903
## ENSG00000000003.14 5269.21850 3951.7308
## ENSG00000000005.5 68.99997 17.0000
## ENSG00000000419.12 3242.99950 3176.9996
## ENSG00000000457.13 1365.63068 1770.8321
## ENSG00000000460.16 2506.72304 358.5619
## ENSG00000000938.12 158.99989 396.0001
head(dat$GeneInfo)
## GeneSymbol mean median longest_isoform merged
## ENSG00000223972 DDX11L1 973 632 1657 1735
## ENSG00000227232 WASH7P 1351 1351 1351 1351
## ENSG00000278267 MIR6859-1 68 68 68 68
## ENSG00000243485 MIR1302-2HG 641 712 712 1021
## ENSG00000284332 MIR1302-2 138 138 138 138
## ENSG00000237613 FAM138A 948 1187 1187 1219
tx2gene <- dat$Tx2Gene
rownames(tx2gene) <- sapply(strsplit(rownames(tx2gene),"\\."),"[[",1)
# filter based on detection threshold
tx2gene <- tx2gene[which(rowMeans(tx2gene)>=10),]
# normalise
rpm <- tx2gene/colSums(tx2gene) * 1000000
Next get pathways.
if (! file.exists("ReactomePathways.gmt")) {
download.file("https://reactome.org/download/current/ReactomePathways.gmt.zip",
destfile="ReactomePathways.gmt.zip")
unzip("ReactomePathways.gmt.zip")
}
genesets<-gmt_import("ReactomePathways.gmt")
# ++++++++++++++++++++++++++++
# flattenCorrMatrix
# ++++++++++++++++++++++++++++
# cormat : matrix of the correlation coefficients
# pmat : matrix of the correlation p-values
flattenCorrMatrix <- function(cormat, pmat) {
ut <- upper.tri(cormat)
data.frame(
row = rownames(cormat)[row(cormat)[ut]],
column = rownames(cormat)[col(cormat)[ut]],
cor =(cormat)[ut],
p = pmat[ut]
)
}
res <- mclapply(1:length(genesets), function(i){
gs <- genesets[[i]]
len <- length(gs)
gsname <- names(genesets[i])
gene_accessions <- rownames(dat$GeneInfo[which(dat$GeneInfo$GeneSymbol %in% gs),])
x <- rpm[which(rownames(rpm) %in% gene_accessions),]
if(nrow(x)>=10) {
res <- rcorr(as.matrix(t(x)),type="spearman")
res <- flattenCorrMatrix(res$r, res$P)
# randomly select gene ids as a comparison
gene_accessions2 <- rownames(dat$GeneInfo[which(! dat$GeneInfo$GeneSymbol %in% gs),])
# filter for those expressed in the dataset and randomly select n
gene_accessions2 <- gene_accessions2[sample(which(gene_accessions2 %in% rownames(rpm) ),size=len, replace=FALSE)]
y <- rpm[which(rownames(rpm) %in% gene_accessions2),]
res2 <- rcorr(as.matrix(t(y)),type="spearman")
res2 <- flattenCorrMatrix(res2$r, res2$P)
cormax <- max(c(res$cor,res2$cor))
cormin <- min(c(res$cor,res2$cor))
setname <- names(genesets[i])
result <- list("inset"=res$cor, "rand"=res2$cor,"setname"=setname)
return(result)
}
}, mc.cores=16)
looks like some empty results. Let’s get rid of those now.
str(head(res))
## List of 6
## $ : NULL
## $ : NULL
## $ :List of 3
## ..$ inset : num [1:325] 0.3789 0.2613 0.6976 -0.0634 -0.0417 ...
## ..$ rand : num [1:325] 0.519 0.262 0.427 0.183 0.295 ...
## ..$ setname: chr "A tetrasaccharide linker sequence is required for GAG synthesis"
## $ :List of 3
## ..$ inset : num [1:2926] 0.165 0.209 0.208 -0.147 0.133 ...
## ..$ rand : num [1:3003] 0.1748 0.1771 0.2757 -0.0277 0.2128 ...
## ..$ setname: chr "ABC transporter disorders"
## $ :List of 3
## ..$ inset : num [1:153] -0.1573 -0.0426 0.2388 0.0987 0.3428 ...
## ..$ rand : num [1:153] 0.196 0.517 0.192 0.64 0.118 ...
## ..$ setname: chr "ABC transporters in lipid homeostasis"
## $ :List of 3
## ..$ inset : num [1:5151] 0.153 0.165 0.183 -0.147 0.408 ...
## ..$ rand : num [1:5253] -0.195 0.103 0.452 0.142 0.389 ...
## ..$ setname: chr "ABC-family proteins mediated transport"
res <- res[which(lapply(res,length)>0)]
str(head(res))
## List of 6
## $ :List of 3
## ..$ inset : num [1:325] 0.3789 0.2613 0.6976 -0.0634 -0.0417 ...
## ..$ rand : num [1:325] 0.519 0.262 0.427 0.183 0.295 ...
## ..$ setname: chr "A tetrasaccharide linker sequence is required for GAG synthesis"
## $ :List of 3
## ..$ inset : num [1:2926] 0.165 0.209 0.208 -0.147 0.133 ...
## ..$ rand : num [1:3003] 0.1748 0.1771 0.2757 -0.0277 0.2128 ...
## ..$ setname: chr "ABC transporter disorders"
## $ :List of 3
## ..$ inset : num [1:153] -0.1573 -0.0426 0.2388 0.0987 0.3428 ...
## ..$ rand : num [1:153] 0.196 0.517 0.192 0.64 0.118 ...
## ..$ setname: chr "ABC transporters in lipid homeostasis"
## $ :List of 3
## ..$ inset : num [1:5151] 0.153 0.165 0.183 -0.147 0.408 ...
## ..$ rand : num [1:5253] -0.195 0.103 0.452 0.142 0.389 ...
## ..$ setname: chr "ABC-family proteins mediated transport"
## $ :List of 3
## ..$ inset : num [1:4560] 0.0769 -0.0967 0.3501 0.155 0.4418 ...
## ..$ rand : num [1:8911] 0.379 0.452 0.314 0.268 0.491 ...
## ..$ setname: chr "ADORA2B mediated anti-inflammatory cytokines production"
## $ :List of 3
## ..$ inset : num [1:231] 0.1843 0.5432 0.3252 0.0751 0.0988 ...
## ..$ rand : num [1:300] 0.2987 -0.0332 -0.2048 0.5384 0.3387 ...
## ..$ setname: chr "ADP signalling through P2Y purinoceptor 1"
We can visualise the spearman correlation coefficients here.
z <- lapply(1:6, function(i){
setname <- res[[i]]$setname
boxplot(res[[i]][1:2],col="white",ylab="Spearman rho",main=setname)
beeswarm(res[[i]][1:2],add=TRUE,pch=19)
})
We can run a t-test to quantify the difference in distributions.
res2 <- lapply(1:length(res), function(i) {
inset <- res[[i]]$inset
rand <- res[[i]]$rand
tt <- t.test(x=inset,y=rand)
return(c("stat"=tt$statistic,"pvalue"=tt$p.value))
})
res2 <- do.call(rbind,res2)
rownames(res2) <- sapply(res,"[[",3)
summary(res2)
## stat.t pvalue
## Min. :-72.945 Min. :0.00000
## 1st Qu.: 1.908 1st Qu.:0.00000
## Median : 5.663 Median :0.00000
## Mean : 11.700 Mean :0.09054
## 3rd Qu.: 14.034 3rd Qu.:0.01619
## Max. :278.538 Max. :0.99620
Now we can look at the positive and negatively correlated gene sets.
poscor <- head(res2[order(-res2[,1]),],10)
poscor %>% kbl(caption="most correlated gene sets") %>% kable_paper("hover", full_width = F)
stat.t | pvalue | |
---|---|---|
Gene expression (Transcription) | 278.5375 | 0 |
RNA Polymerase II Transcription | 230.8156 | 0 |
Metabolism of RNA | 200.2035 | 0 |
Generic Transcription Pathway | 183.8911 | 0 |
Immune System | 147.8624 | 0 |
Immunoregulatory interactions between a Lymphoid and a non-Lymphoid cell | 132.6570 | 0 |
Processing of Capped Intron-Containing Pre-mRNA | 131.1494 | 0 |
Innate Immune System | 126.7056 | 0 |
Cell Cycle | 110.9368 | 0 |
mRNA Splicing | 101.2100 | 0 |
poscor_res <- res[which(sapply(res,"[[",3) %in% rownames(poscor))]
Vioplot of positively correlated sets.
z <- lapply(1:length(poscor_res), function(i){
setname <- poscor_res[[i]]$setname
gset <- genesets[[which(names(genesets) ==setname)]]
eset <- rownames(dat$GeneInfo[which(dat$GeneInfo$GeneSymbol %in% gset),])
n <- length(which(eset %in% rownames(rpm)))
vioplot(poscor_res[[i]][1:2] , ylab="Spearman rho",main=setname )
mtext(paste("n=",n))
})
Heatmap of pos cor sets.
z <- lapply(1:10, function(i) {
setname <- rownames(poscor)[i]
gset <- genesets[[which(names(genesets) ==setname)]]
eset <- rownames(dat$GeneInfo[which(dat$GeneInfo$GeneSymbol %in% gset),])
mx <- rpm[which(rownames(rpm) %in% eset),]
heatmap.2(as.matrix(mx),trace="none",scale="row",main=setname, margin=c(5,7))
})
Now focus on negatively correlated sets.
anticor <- head(res2[order(res2[,1]),],10)
anticor %>% kbl(caption="most anti-correlated gene sets") %>% kable_paper("hover", full_width = F)
stat.t | pvalue | |
---|---|---|
Metabolism | -72.945185 | 0 |
Transport of small molecules | -48.968055 | 0 |
Metabolism of lipids | -19.686980 | 0 |
Sensory Perception | -14.657510 | 0 |
Regulation of lipid metabolism by PPARalpha | -10.353812 | 0 |
Platelet degranulation | -9.441238 | 0 |
Oncogenic MAPK signaling | -7.837527 | 0 |
IGF1R signaling cascade | -7.033656 | 0 |
ABC-family proteins mediated transport | -6.036792 | 0 |
BMAL1:CLOCK,NPAS2 activates circadian gene expression | -5.840983 | 0 |
anticor_res <- res[which(sapply(res,"[[",3) %in% rownames(anticor))]
z <- lapply(1:length(anticor_res), function(i){
setname <- anticor_res[[i]]$setname
gset <- genesets[[which(names(genesets) ==setname)]]
eset <- rownames(dat$GeneInfo[which(dat$GeneInfo$GeneSymbol %in% gset),])
n <- length(which(eset %in% rownames(rpm)))
vioplot(anticor_res[[i]][1:2] , ylab="Spearman rho",main=setname )
mtext(paste("n=",n))
})
Heatmap of neg cor sets.
z <- lapply(1:10, function(i) {
setname <- rownames(anticor)[i]
gset <- genesets[[which(names(genesets) ==setname)]]
eset <- rownames(dat$GeneInfo[which(dat$GeneInfo$GeneSymbol %in% gset),])
mx <- rpm[which(rownames(rpm) %in% eset),]
heatmap.2(as.matrix(mx),trace="none",scale="row",main=setname,margin=c(5,7))
})
Now visualise the test statistics found.
par(mfrow=c(2,1))
hist(res2[,"stat.t"],breaks=100,main="",xlab="test statistic")
boxplot(res2[,"stat.t"],horizontal=TRUE)
hist(res2[,"stat.t"],xlim=c(-50,50),breaks=100,main="",xlab="test statistic")
boxplot(res2[,"stat.t"],ylim=c(-50,50),horizontal=TRUE)
par(mfrow=c(1,1))
pos <- length( which(res2[,1]>0 & res2[,2] < 0.05) )
neg <- length( which(res2[,1]<0 & res2[,2] < 0.05) )
none <- length( which(res2[,2] > 0.05) )
bars <- c("Pos"=pos,"None"=none,"Neg"=neg)
barplot(bars,ylab="no. gene sets with correlated expression",ylim=c(0,max(bars)*1.3))
text(((1:length(bars))-0.5)*1.25,bars+50,labels=bars)
This work confirms the observations by Hong, that genes in a set are on the whole, most gene sets positively correlated members.
Some gene sets are highly correlated, while others are heterogenous, or slightly anticorrelated.
We can try to reproduce this correlation structure in simulated data.
sessionInfo()
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.3 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
##
## locale:
## [1] LC_CTYPE=en_AU.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_AU.UTF-8 LC_COLLATE=en_AU.UTF-8
## [5] LC_MONETARY=en_AU.UTF-8 LC_MESSAGES=en_AU.UTF-8
## [7] LC_PAPER=en_AU.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] vioplot_0.3.7 zoo_1.8-9 sm_2.2-5.7 Hmisc_4.6-0
## [5] ggplot2_3.3.5 Formula_1.2-4 survival_3.2-13 lattice_0.20-45
## [9] gplots_3.1.1 mitch_1.4.1 getDEE2_1.2.0 beeswarm_0.4.0
## [13] kableExtra_1.3.4
##
## loaded via a namespace (and not attached):
## [1] bitops_1.0-7 matrixStats_0.61.0
## [3] webshot_0.5.2 RColorBrewer_1.1-2
## [5] httr_1.4.2 GenomeInfoDb_1.28.4
## [7] bslib_0.3.1 backports_1.4.1
## [9] tools_4.1.2 utf8_1.2.2
## [11] R6_2.5.1 rpart_4.1-15
## [13] KernSmooth_2.23-20 DBI_1.1.2
## [15] BiocGenerics_0.38.0 colorspace_2.0-2
## [17] nnet_7.3-16 htm2txt_2.1.1
## [19] withr_2.4.3 tidyselect_1.1.1
## [21] gridExtra_2.3 GGally_2.1.2
## [23] compiler_4.1.2 rvest_1.0.2
## [25] Biobase_2.52.0 htmlTable_2.4.0
## [27] xml2_1.3.3 DelayedArray_0.18.0
## [29] sass_0.4.0 checkmate_2.0.0
## [31] caTools_1.18.2 scales_1.1.1
## [33] systemfonts_1.0.3 stringr_1.4.0
## [35] digest_0.6.29 foreign_0.8-81
## [37] rmarkdown_2.11 svglite_2.0.0
## [39] XVector_0.32.0 base64enc_0.1-3
## [41] jpeg_0.1-9 pkgconfig_2.0.3
## [43] htmltools_0.5.2 MatrixGenerics_1.4.3
## [45] highr_0.9 fastmap_1.1.0
## [47] htmlwidgets_1.5.4 rlang_0.4.12
## [49] rstudioapi_0.13 shiny_1.7.1
## [51] jquerylib_0.1.4 generics_0.1.1
## [53] jsonlite_1.7.2 gtools_3.9.2
## [55] dplyr_1.0.7 RCurl_1.98-1.5
## [57] magrittr_2.0.1 GenomeInfoDbData_1.2.6
## [59] Matrix_1.4-0 Rcpp_1.0.7
## [61] munsell_0.5.0 S4Vectors_0.30.2
## [63] fansi_1.0.0 lifecycle_1.0.1
## [65] yaml_2.2.1 stringi_1.7.6
## [67] MASS_7.3-54 SummarizedExperiment_1.22.0
## [69] zlibbioc_1.38.0 plyr_1.8.6
## [71] grid_4.1.2 promises_1.2.0.1
## [73] crayon_1.4.2 echarts4r_0.4.3
## [75] splines_4.1.2 knitr_1.37
## [77] pillar_1.6.4 GenomicRanges_1.44.0
## [79] reshape2_1.4.4 stats4_4.1.2
## [81] glue_1.6.0 evaluate_0.14
## [83] latticeExtra_0.6-29 data.table_1.14.2
## [85] vctrs_0.3.8 png_0.1-7
## [87] httpuv_1.6.5 gtable_0.3.0
## [89] purrr_0.3.4 reshape_0.8.8
## [91] assertthat_0.2.1 xfun_0.29
## [93] mime_0.12 xtable_1.8-4
## [95] later_1.3.0 viridisLite_0.4.0
## [97] tibble_3.1.6 IRanges_2.26.0
## [99] cluster_2.1.2 ellipsis_0.3.2