Examining correlation structure in RNA-seq data

Source: https://github.com/markziemann/combined_enrichment

Intro

This script is designed to fetch Illumina Human Body Map 2.0 Project gene expression data in order to assess the correlation of genes within a gene set.

The data is being fetched using getDEE2 with the accession number ERP000546.

suppressPackageStartupMessages({
  library("kableExtra")
  library("beeswarm")
  library("getDEE2")
  library("mitch")
  library("gplots")
  library("Hmisc")
  library("parallel")
  library("vioplot")
})

Fetch data

First get gene expression data.

mdat <- getDEE2Metadata(species="hsapiens")
mdat2 <- mdat[which(mdat$SRP_accession == "ERP000546"),]
dat <- getDEE2::getDEE2(species="hsapiens",
  metadata=mdat,
  SRRvec=mdat2$SRR_accession,
  legacy=TRUE)

## For more information about DEE2 QC metrics, visit
##     https://github.com/markziemann/dee2/blob/master/qc/qc_metrics.md

dat <- Tx2Gene(dat)
names(dat)

## [1] "Tx2Gene"         "GeneCounts"      "TxCounts"        "GeneInfo"       
## [5] "TxInfo"          "QcMx"            "MetadataSummary" "MetadataFull"   
## [9] "absent"

head(dat$Tx2Gene)

##                    ERR030856 ERR030857 ERR030858 ERR030859 ERR030860 ERR030861
## ENSG00000000003.14 3143.8387 3179.4595 3098.9818 4033.8767 4176.7202 4127.6651
## ENSG00000000005.5   354.0004  340.0000  297.9998  435.9999  472.0003  443.0004
## ENSG00000000419.12 2062.9960 2049.0021 1959.9971 2690.0033 2735.0004 2686.9951
## ENSG00000000457.13  909.4659  886.7095  917.8299 1135.5987 1108.9460 1034.7060
## ENSG00000000460.16  275.6274  276.8625  282.7185  426.6796  408.1430  414.8135
## ENSG00000000938.12 1691.9997 1753.0005 1728.0012 2526.9949 2436.9969 2478.0045
##                    ERR030862 ERR030863 ERR030864 ERR030865 ERR030866 ERR030867
## ENSG00000000003.14 1706.9480 1695.3120 3123.7089 2968.2921 4118.8537 4126.2909
## ENSG00000000005.5   261.0000  278.9998  341.0004  321.0003  445.9996  459.9998
## ENSG00000000419.12  981.0004  932.0000 1994.9960 1982.0039 2701.0043 2586.9994
## ENSG00000000457.13  606.4480  582.5506  898.3052  861.0053 1035.4084 1049.4601
## ENSG00000000460.16  267.3197  286.0011  312.7018  275.3162  424.6966  406.5873
## ENSG00000000938.12  605.9999  567.0011 1689.9998 1747.0004 2446.0039 2218.0047
##                    ERR030868 ERR030869 ERR030870 ERR030871 ERR030872  ERR030873
## ENSG00000000003.14 1712.4254 1712.1002 1734.4042 1728.1194 4094.4162 5054.86170
## ENSG00000000005.5   287.9997  303.0000  313.9996  299.0002   19.0000   63.99997
## ENSG00000000419.12  918.9991  978.0006  983.0003  944.0004 3164.9993 3053.00420
## ENSG00000000457.13  560.1490  592.2570  619.6080  611.2100 1815.1573 1412.01660
## ENSG00000000460.16  287.2886  272.7070  260.8160  306.4214  371.5888 2456.60933
## ENSG00000000938.12  657.0007  616.9997  567.0000  574.0007  410.0004  168.99990
##                    ERR030874   ERR030875  ERR030876 ERR030877  ERR030878
## ENSG00000000003.14 5621.8377    47.47595  154.49620 3804.4443 1400.91460
## ENSG00000000005.5    48.0000     0.00000    5.00000   39.0000   26.00005
## ENSG00000000419.12 2275.9966  2318.00440 1373.00030 2457.0010 1780.00456
## ENSG00000000457.13 2049.0537  1895.20300  602.13290 1886.5268 1274.19650
## ENSG00000000460.16  361.1265   287.98750   56.47252  434.8744  371.42983
## ENSG00000000938.12  284.9998 33755.03500  156.99970  363.9998 2146.99930
##                    ERR030879 ERR030880 ERR030881 ERR030882 ERR030883 ERR030884
## ENSG00000000003.14 3143.2243 5153.0686 1323.4207  1199.901 3921.2988 3425.4353
## ENSG00000000005.5     7.0000 1439.9950  394.9998     8.000 3730.9960  147.0001
## ENSG00000000419.12 1782.0005 1321.9988 1792.0047  1476.001 1638.0040 1894.0023
## ENSG00000000457.13  568.9865  604.6376 1224.8002  1002.883 1315.7212  767.9053
## ENSG00000000460.16  211.4579  316.5365  328.8774   191.897  285.5377  205.0773
## ENSG00000000938.12 5467.0051 1879.9965 1256.0010   154.000  463.0002  273.0002
##                    ERR030885  ERR030886 ERR030887 ERR030888 ERR030889 ERR030890
## ENSG00000000003.14 3553.0532  665.06501 8635.4253 5134.5327 1239.1399 1013.6777
## ENSG00000000005.5    12.0000   20.99998    0.0000 1474.9970  429.0000   13.0000
## ENSG00000000419.12 2368.9999 1682.99840 1238.9999 1403.0001 1797.9971 1303.0039
## ENSG00000000457.13 1266.7593  587.17850  938.1669  697.8841 1295.6834  830.8925
## ENSG00000000460.16  318.6973  103.88349  366.1106  286.0438  274.5181  143.6693
## ENSG00000000938.12  520.0005  340.99960  437.9999 1797.0012 1264.9988  144.0003
##                    ERR030891 ERR030892  ERR030893 ERR030894 ERR030895 ERR030896
## ENSG00000000003.14 4246.4435 3337.4468 3646.36420  657.2884 8192.8496 3149.9916
## ENSG00000000005.5  3797.9970  137.9997   19.00003   22.0000    0.0000    9.0000
## ENSG00000000419.12 1738.9967 1860.9952 2296.00490 1518.9958 1135.9995 1814.0026
## ENSG00000000457.13 1419.9085  697.8773 1323.05670  458.2305  844.6574  545.3480
## ENSG00000000460.16  278.6460  186.2239  260.77698  112.5909  303.0254  181.0319
## ENSG00000000938.12  489.9995  249.0004  502.99990  294.9997  392.9999 5604.9922
##                    ERR030897 ERR030898 ERR030899  ERR030900 ERR030901
## ENSG00000000003.14 1342.6442 3948.9691  150.3535    57.2157 5561.4037
## ENSG00000000005.5    18.0000   54.0000    8.0000     0.0000   32.0000
## ENSG00000000419.12 1800.9996 2591.0032 1391.9993  2437.9960 2359.0036
## ENSG00000000457.13 1277.7472 1906.5170  622.7953  1999.0180 2051.2105
## ENSG00000000460.16  398.6739  534.0472   62.2850   311.3919  409.2937
## ENSG00000000938.12 2098.0029  396.0002  160.9997 35409.9590  319.0000
##                     ERR030902 ERR030903
## ENSG00000000003.14 5269.21850 3951.7308
## ENSG00000000005.5    68.99997   17.0000
## ENSG00000000419.12 3242.99950 3176.9996
## ENSG00000000457.13 1365.63068 1770.8321
## ENSG00000000460.16 2506.72304  358.5619
## ENSG00000000938.12  158.99989  396.0001

head(dat$GeneInfo)

##                  GeneSymbol mean median longest_isoform merged
## ENSG00000223972     DDX11L1  973    632            1657   1735
## ENSG00000227232      WASH7P 1351   1351            1351   1351
## ENSG00000278267   MIR6859-1   68     68              68     68
## ENSG00000243485 MIR1302-2HG  641    712             712   1021
## ENSG00000284332   MIR1302-2  138    138             138    138
## ENSG00000237613     FAM138A  948   1187            1187   1219

tx2gene <- dat$Tx2Gene
rownames(tx2gene) <- sapply(strsplit(rownames(tx2gene),"\\."),"[[",1)

# filter based on detection threshold
tx2gene <- tx2gene[which(rowMeans(tx2gene)>=10),]

# normalise
rpm <- tx2gene/colSums(tx2gene) * 1000000

Next get pathways.

if (! file.exists("ReactomePathways.gmt")) {
  download.file("https://reactome.org/download/current/ReactomePathways.gmt.zip", 
    destfile="ReactomePathways.gmt.zip")
  unzip("ReactomePathways.gmt.zip")
}
genesets<-gmt_import("ReactomePathways.gmt")

Assess the correlation structure

# ++++++++++++++++++++++++++++
# flattenCorrMatrix
# ++++++++++++++++++++++++++++
# cormat : matrix of the correlation coefficients
# pmat : matrix of the correlation p-values
flattenCorrMatrix <- function(cormat, pmat) {
  ut <- upper.tri(cormat)
  data.frame(
    row = rownames(cormat)[row(cormat)[ut]],
    column = rownames(cormat)[col(cormat)[ut]],
    cor  =(cormat)[ut],
    p = pmat[ut]
    )
}

res <- mclapply(1:length(genesets), function(i){
gs <- genesets[[i]]
len <- length(gs)
gsname <- names(genesets[i])
gene_accessions <- rownames(dat$GeneInfo[which(dat$GeneInfo$GeneSymbol %in% gs),])
x <- rpm[which(rownames(rpm) %in% gene_accessions),]
if(nrow(x)>=10) {
  res <- rcorr(as.matrix(t(x)),type="spearman")
  res <- flattenCorrMatrix(res$r, res$P)
  # randomly select gene ids as a comparison
  gene_accessions2 <- rownames(dat$GeneInfo[which(! dat$GeneInfo$GeneSymbol %in% gs),])
  # filter for those expressed in the dataset and randomly select n
  gene_accessions2 <- gene_accessions2[sample(which(gene_accessions2 %in% rownames(rpm) ),size=len, replace=FALSE)]
  y <- rpm[which(rownames(rpm) %in% gene_accessions2),]
  res2 <- rcorr(as.matrix(t(y)),type="spearman")
  res2 <- flattenCorrMatrix(res2$r, res2$P)
  cormax <- max(c(res$cor,res2$cor))
  cormin <- min(c(res$cor,res2$cor))
  setname <- names(genesets[i])
  result <- list("inset"=res$cor, "rand"=res2$cor,"setname"=setname)
  return(result) 
}
}, mc.cores=16)

looks like some empty results. Let’s get rid of those now.

str(head(res))

## List of 6
##  $ : NULL
##  $ : NULL
##  $ :List of 3
##   ..$ inset  : num [1:325] 0.3789 0.2613 0.6976 -0.0634 -0.0417 ...
##   ..$ rand   : num [1:325] 0.519 0.262 0.427 0.183 0.295 ...
##   ..$ setname: chr "A tetrasaccharide linker sequence is required for GAG synthesis"
##  $ :List of 3
##   ..$ inset  : num [1:2926] 0.165 0.209 0.208 -0.147 0.133 ...
##   ..$ rand   : num [1:3003] 0.1748 0.1771 0.2757 -0.0277 0.2128 ...
##   ..$ setname: chr "ABC transporter disorders"
##  $ :List of 3
##   ..$ inset  : num [1:153] -0.1573 -0.0426 0.2388 0.0987 0.3428 ...
##   ..$ rand   : num [1:153] 0.196 0.517 0.192 0.64 0.118 ...
##   ..$ setname: chr "ABC transporters in lipid homeostasis"
##  $ :List of 3
##   ..$ inset  : num [1:5151] 0.153 0.165 0.183 -0.147 0.408 ...
##   ..$ rand   : num [1:5253] -0.195 0.103 0.452 0.142 0.389 ...
##   ..$ setname: chr "ABC-family proteins mediated transport"

res <- res[which(lapply(res,length)>0)]
str(head(res))

## List of 6
##  $ :List of 3
##   ..$ inset  : num [1:325] 0.3789 0.2613 0.6976 -0.0634 -0.0417 ...
##   ..$ rand   : num [1:325] 0.519 0.262 0.427 0.183 0.295 ...
##   ..$ setname: chr "A tetrasaccharide linker sequence is required for GAG synthesis"
##  $ :List of 3
##   ..$ inset  : num [1:2926] 0.165 0.209 0.208 -0.147 0.133 ...
##   ..$ rand   : num [1:3003] 0.1748 0.1771 0.2757 -0.0277 0.2128 ...
##   ..$ setname: chr "ABC transporter disorders"
##  $ :List of 3
##   ..$ inset  : num [1:153] -0.1573 -0.0426 0.2388 0.0987 0.3428 ...
##   ..$ rand   : num [1:153] 0.196 0.517 0.192 0.64 0.118 ...
##   ..$ setname: chr "ABC transporters in lipid homeostasis"
##  $ :List of 3
##   ..$ inset  : num [1:5151] 0.153 0.165 0.183 -0.147 0.408 ...
##   ..$ rand   : num [1:5253] -0.195 0.103 0.452 0.142 0.389 ...
##   ..$ setname: chr "ABC-family proteins mediated transport"
##  $ :List of 3
##   ..$ inset  : num [1:4560] 0.0769 -0.0967 0.3501 0.155 0.4418 ...
##   ..$ rand   : num [1:8911] 0.379 0.452 0.314 0.268 0.491 ...
##   ..$ setname: chr "ADORA2B mediated anti-inflammatory cytokines production"
##  $ :List of 3
##   ..$ inset  : num [1:231] 0.1843 0.5432 0.3252 0.0751 0.0988 ...
##   ..$ rand   : num [1:300] 0.2987 -0.0332 -0.2048 0.5384 0.3387 ...
##   ..$ setname: chr "ADP signalling through P2Y purinoceptor 1"

We can visualise the spearman correlation coefficients here.

z <- lapply(1:6, function(i){
setname <- res[[i]]$setname
boxplot(res[[i]][1:2],col="white",ylab="Spearman rho",main=setname)
beeswarm(res[[i]][1:2],add=TRUE,pch=19)
})

We can run a t-test to quantify the difference in distributions.

res2 <- lapply(1:length(res), function(i) {
inset <- res[[i]]$inset
rand <- res[[i]]$rand
tt <- t.test(x=inset,y=rand)
return(c("stat"=tt$statistic,"pvalue"=tt$p.value))
})

res2 <- do.call(rbind,res2)

rownames(res2) <- sapply(res,"[[",3)
summary(res2)

##      stat.t            pvalue       
##  Min.   :-72.945   Min.   :0.00000  
##  1st Qu.:  1.908   1st Qu.:0.00000  
##  Median :  5.663   Median :0.00000  
##  Mean   : 11.700   Mean   :0.09054  
##  3rd Qu.: 14.034   3rd Qu.:0.01619  
##  Max.   :278.538   Max.   :0.99620

Now we can look at the positive and negatively correlated gene sets.

poscor <- head(res2[order(-res2[,1]),],10) 
poscor %>% kbl(caption="most correlated gene sets") %>% kable_paper("hover", full_width = F)

most correlated gene sets
	stat.t	pvalue
Gene expression (Transcription)	278.5375	0
RNA Polymerase II Transcription	230.8156	0
Metabolism of RNA	200.2035	0
Generic Transcription Pathway	183.8911	0
Immune System	147.8624	0
Immunoregulatory interactions between a Lymphoid and a non-Lymphoid cell	132.6570	0
Processing of Capped Intron-Containing Pre-mRNA	131.1494	0
Innate Immune System	126.7056	0
Cell Cycle	110.9368	0
mRNA Splicing	101.2100	0

poscor_res <- res[which(sapply(res,"[[",3) %in% rownames(poscor))]

Vioplot of positively correlated sets.

z <- lapply(1:length(poscor_res), function(i){
setname <- poscor_res[[i]]$setname
gset <- genesets[[which(names(genesets) ==setname)]]
eset <- rownames(dat$GeneInfo[which(dat$GeneInfo$GeneSymbol %in% gset),])
n <- length(which(eset %in% rownames(rpm)))
vioplot(poscor_res[[i]][1:2] , ylab="Spearman rho",main=setname  )
mtext(paste("n=",n))
})

Heatmap of pos cor sets.

z <- lapply(1:10, function(i) {
setname <- rownames(poscor)[i]
gset <- genesets[[which(names(genesets) ==setname)]]
eset <- rownames(dat$GeneInfo[which(dat$GeneInfo$GeneSymbol %in% gset),])
mx <- rpm[which(rownames(rpm) %in% eset),]
heatmap.2(as.matrix(mx),trace="none",scale="row",main=setname, margin=c(5,7))
})

Now focus on negatively correlated sets.

anticor <- head(res2[order(res2[,1]),],10)
anticor %>% kbl(caption="most anti-correlated gene sets") %>% kable_paper("hover", full_width = F)

most anti-correlated gene sets
	stat.t	pvalue
Metabolism	-72.945185	0
Transport of small molecules	-48.968055	0
Metabolism of lipids	-19.686980	0
Sensory Perception	-14.657510	0
Regulation of lipid metabolism by PPARalpha	-10.353812	0
Platelet degranulation	-9.441238	0
Oncogenic MAPK signaling	-7.837527	0
IGF1R signaling cascade	-7.033656	0
ABC-family proteins mediated transport	-6.036792	0
BMAL1:CLOCK,NPAS2 activates circadian gene expression	-5.840983	0

anticor_res <- res[which(sapply(res,"[[",3) %in% rownames(anticor))]

z <- lapply(1:length(anticor_res), function(i){
setname <- anticor_res[[i]]$setname
gset <- genesets[[which(names(genesets) ==setname)]]
eset <- rownames(dat$GeneInfo[which(dat$GeneInfo$GeneSymbol %in% gset),])
n <- length(which(eset %in% rownames(rpm)))
vioplot(anticor_res[[i]][1:2] , ylab="Spearman rho",main=setname  )
mtext(paste("n=",n))
})

Heatmap of neg cor sets.

z <- lapply(1:10, function(i) {
setname <- rownames(anticor)[i]
gset <- genesets[[which(names(genesets) ==setname)]]
eset <- rownames(dat$GeneInfo[which(dat$GeneInfo$GeneSymbol %in% gset),])
mx <- rpm[which(rownames(rpm) %in% eset),]
heatmap.2(as.matrix(mx),trace="none",scale="row",main=setname,margin=c(5,7))
})

Now visualise the test statistics found.

par(mfrow=c(2,1))
hist(res2[,"stat.t"],breaks=100,main="",xlab="test statistic")
boxplot(res2[,"stat.t"],horizontal=TRUE)

hist(res2[,"stat.t"],xlim=c(-50,50),breaks=100,main="",xlab="test statistic")
boxplot(res2[,"stat.t"],ylim=c(-50,50),horizontal=TRUE)

par(mfrow=c(1,1))

pos <- length( which(res2[,1]>0 & res2[,2] < 0.05) )
neg <- length( which(res2[,1]<0 & res2[,2] < 0.05) )
none <- length( which(res2[,2] > 0.05) )
bars <- c("Pos"=pos,"None"=none,"Neg"=neg)
barplot(bars,ylab="no. gene sets with correlated expression",ylim=c(0,max(bars)*1.3))
text(((1:length(bars))-0.5)*1.25,bars+50,labels=bars)

Conclusions

This work confirms the observations by Hong, that genes in a set are on the whole, most gene sets positively correlated members.

Some gene sets are highly correlated, while others are heterogenous, or slightly anticorrelated.

We can try to reproduce this correlation structure in simulated data.

Session information

sessionInfo()

## R version 4.1.2 (2021-11-01)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.3 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
## 
## locale:
##  [1] LC_CTYPE=en_AU.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_AU.UTF-8        LC_COLLATE=en_AU.UTF-8    
##  [5] LC_MONETARY=en_AU.UTF-8    LC_MESSAGES=en_AU.UTF-8   
##  [7] LC_PAPER=en_AU.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] vioplot_0.3.7    zoo_1.8-9        sm_2.2-5.7       Hmisc_4.6-0     
##  [5] ggplot2_3.3.5    Formula_1.2-4    survival_3.2-13  lattice_0.20-45 
##  [9] gplots_3.1.1     mitch_1.4.1      getDEE2_1.2.0    beeswarm_0.4.0  
## [13] kableExtra_1.3.4
## 
## loaded via a namespace (and not attached):
##   [1] bitops_1.0-7                matrixStats_0.61.0         
##   [3] webshot_0.5.2               RColorBrewer_1.1-2         
##   [5] httr_1.4.2                  GenomeInfoDb_1.28.4        
##   [7] bslib_0.3.1                 backports_1.4.1            
##   [9] tools_4.1.2                 utf8_1.2.2                 
##  [11] R6_2.5.1                    rpart_4.1-15               
##  [13] KernSmooth_2.23-20          DBI_1.1.2                  
##  [15] BiocGenerics_0.38.0         colorspace_2.0-2           
##  [17] nnet_7.3-16                 htm2txt_2.1.1              
##  [19] withr_2.4.3                 tidyselect_1.1.1           
##  [21] gridExtra_2.3               GGally_2.1.2               
##  [23] compiler_4.1.2              rvest_1.0.2                
##  [25] Biobase_2.52.0              htmlTable_2.4.0            
##  [27] xml2_1.3.3                  DelayedArray_0.18.0        
##  [29] sass_0.4.0                  checkmate_2.0.0            
##  [31] caTools_1.18.2              scales_1.1.1               
##  [33] systemfonts_1.0.3           stringr_1.4.0              
##  [35] digest_0.6.29               foreign_0.8-81             
##  [37] rmarkdown_2.11              svglite_2.0.0              
##  [39] XVector_0.32.0              base64enc_0.1-3            
##  [41] jpeg_0.1-9                  pkgconfig_2.0.3            
##  [43] htmltools_0.5.2             MatrixGenerics_1.4.3       
##  [45] highr_0.9                   fastmap_1.1.0              
##  [47] htmlwidgets_1.5.4           rlang_0.4.12               
##  [49] rstudioapi_0.13             shiny_1.7.1                
##  [51] jquerylib_0.1.4             generics_0.1.1             
##  [53] jsonlite_1.7.2              gtools_3.9.2               
##  [55] dplyr_1.0.7                 RCurl_1.98-1.5             
##  [57] magrittr_2.0.1              GenomeInfoDbData_1.2.6     
##  [59] Matrix_1.4-0                Rcpp_1.0.7                 
##  [61] munsell_0.5.0               S4Vectors_0.30.2           
##  [63] fansi_1.0.0                 lifecycle_1.0.1            
##  [65] yaml_2.2.1                  stringi_1.7.6              
##  [67] MASS_7.3-54                 SummarizedExperiment_1.22.0
##  [69] zlibbioc_1.38.0             plyr_1.8.6                 
##  [71] grid_4.1.2                  promises_1.2.0.1           
##  [73] crayon_1.4.2                echarts4r_0.4.3            
##  [75] splines_4.1.2               knitr_1.37                 
##  [77] pillar_1.6.4                GenomicRanges_1.44.0       
##  [79] reshape2_1.4.4              stats4_4.1.2               
##  [81] glue_1.6.0                  evaluate_0.14              
##  [83] latticeExtra_0.6-29         data.table_1.14.2          
##  [85] vctrs_0.3.8                 png_0.1-7                  
##  [87] httpuv_1.6.5                gtable_0.3.0               
##  [89] purrr_0.3.4                 reshape_0.8.8              
##  [91] assertthat_0.2.1            xfun_0.29                  
##  [93] mime_0.12                   xtable_1.8-4               
##  [95] later_1.3.0                 viridisLite_0.4.0          
##  [97] tibble_3.1.6                IRanges_2.26.0             
##  [99] cluster_2.1.2               ellipsis_0.3.2