Source: https://github.com/markziemann/impute2/
Here the idea is to examine the variant assumptions. To quantify the number that are intergenic or not assigned to known protein coding genes.
suppressPackageStartupMessages({
library("tictoc")
library("parallel")
library("vioplot")
library("fgsea")
library("reshape2")
library("eulerr")
library("HGNChelper")
library("kableExtra")
library("beeswarm")
library("gplots")
library("gridExtra")
library("png")
library("parallel")
library("RhpcBLASctl")
library("tictoc")
library("vioplot")
})
Read in data:
Ensembl gene GTF/gene with biotypes
Ontologies
GWASdb: a database for human genetic variants identified by genome-wide association studies.
#URL: https://www.ebi.ac.uk/gwas/api/search/downloads/full
gwas <- read.table("gwas_catalog_v1.0-associations_e113_r2025-01-30.tsv",sep="\t",header=TRUE,fill=NA,
row.names=NULL,quote="")
gs <- fgsea::gmtPathways("msigdb.v2024.1.Hs.symbols.gmt")
context <- gwas$CONTEXT
contextcnt <- sort(table(gsub(" ","",unlist(strsplit(context,";")))))
contextcnt
##
## 3_prime_UTR_variantxregulatory_region_variant
## 1
## intron_variantxstop_gained
## 1
## regulatory_region_variantx3_prime_UTR_variant
## 1
## splice_donor_5th_base_variantxintergenic_variant
## 1
## splice_polypyrimidine_tract_variantxintergenic_variant
## 1
## splice_region_variantxintergenic_variant
## 1
## stop_gainedxintergenic_variant
## 1
## stop_gainedxintron_variant
## 1
## synonymous_variantx3_prime_UTR_variant
## 1
## intergenic_variantx5_prime_UTR_variant
## 2
## intergenic_variantxstop_gained
## 2
## intron_variantxsplice_region_variant
## 2
## intron_variantxsynonymous_variant
## 2
## intergenic_variantxTF_binding_site_variant
## 3
## stop_retained_variant
## 3
## synonymous_variantxintron_variant
## 3
## 5_prime_UTR_variantxintergenic_variant
## 4
## protein_altering_variant
## 4
## intergenic_variantxmissense_variant
## 5
## intergenic_variantxsynonymous_variant
## 6
## regulatory_region_variantxregulatory_region_variant
## 7
## intron_variantxmissense_variant
## 9
## missense_variantxintron_variant
## 9
## TFBS_ablation
## 9
## intron_variantx3_prime_UTR_variant
## 10
## missense_variantxintergenic_variant
## 11
## intergenic_variantx3_prime_UTR_variant
## 14
## synonymous_variantxintergenic_variant
## 15
## 3_prime_UTR_variantxintron_variant
## 16
## 3_prime_UTR_variantxintergenic_variant
## 26
## stop_lost
## 27
## regulatory_region_variantxintron_variant
## 30
## inframe_deletion
## 36
## inframe_insertion
## 36
## intron_variantxregulatory_region_variant
## 40
## intergenic_variantxregulatory_region_variant
## 54
## start_lost
## 54
## regulatory_region_variantxintergenic_variant
## 57
## splice_acceptor_variant
## 95
## splice_donor_5th_base_variant
## 141
## splice_donor_variant
## 217
## splice_donor_region_variant
## 246
## frameshift_variant
## 314
## intron_variantxintron_variant
## 411
## intergenic_variantxintron_variant
## 588
## intron_variantxintergenic_variant
## 627
## intergenic_variantxintergenic_variant
## 765
## splice_polypyrimidine_tract_variant
## 920
## splice_region_variant
## 1002
## stop_gained
## 2420
## TF_binding_site_variant
## 2653
## 5_prime_UTR_variant
## 5013
## synonymous_variant
## 5410
## 3_prime_UTR_variant
## 20424
## regulatory_region_variant
## 29626
## missense_variant
## 40442
## intergenic_variant
## 259591
## intron_variant
## 301421
contextcnt/sum(contextcnt) * 100
##
## 3_prime_UTR_variantxregulatory_region_variant
## 1.486257e-04
## intron_variantxstop_gained
## 1.486257e-04
## regulatory_region_variantx3_prime_UTR_variant
## 1.486257e-04
## splice_donor_5th_base_variantxintergenic_variant
## 1.486257e-04
## splice_polypyrimidine_tract_variantxintergenic_variant
## 1.486257e-04
## splice_region_variantxintergenic_variant
## 1.486257e-04
## stop_gainedxintergenic_variant
## 1.486257e-04
## stop_gainedxintron_variant
## 1.486257e-04
## synonymous_variantx3_prime_UTR_variant
## 1.486257e-04
## intergenic_variantx5_prime_UTR_variant
## 2.972515e-04
## intergenic_variantxstop_gained
## 2.972515e-04
## intron_variantxsplice_region_variant
## 2.972515e-04
## intron_variantxsynonymous_variant
## 2.972515e-04
## intergenic_variantxTF_binding_site_variant
## 4.458772e-04
## stop_retained_variant
## 4.458772e-04
## synonymous_variantxintron_variant
## 4.458772e-04
## 5_prime_UTR_variantxintergenic_variant
## 5.945029e-04
## protein_altering_variant
## 5.945029e-04
## intergenic_variantxmissense_variant
## 7.431287e-04
## intergenic_variantxsynonymous_variant
## 8.917544e-04
## regulatory_region_variantxregulatory_region_variant
## 1.040380e-03
## intron_variantxmissense_variant
## 1.337632e-03
## missense_variantxintron_variant
## 1.337632e-03
## TFBS_ablation
## 1.337632e-03
## intron_variantx3_prime_UTR_variant
## 1.486257e-03
## missense_variantxintergenic_variant
## 1.634883e-03
## intergenic_variantx3_prime_UTR_variant
## 2.080760e-03
## synonymous_variantxintergenic_variant
## 2.229386e-03
## 3_prime_UTR_variantxintron_variant
## 2.378012e-03
## 3_prime_UTR_variantxintergenic_variant
## 3.864269e-03
## stop_lost
## 4.012895e-03
## regulatory_region_variantxintron_variant
## 4.458772e-03
## inframe_deletion
## 5.350526e-03
## inframe_insertion
## 5.350526e-03
## intron_variantxregulatory_region_variant
## 5.945029e-03
## intergenic_variantxregulatory_region_variant
## 8.025790e-03
## start_lost
## 8.025790e-03
## regulatory_region_variantxintergenic_variant
## 8.471667e-03
## splice_acceptor_variant
## 1.411944e-02
## splice_donor_5th_base_variant
## 2.095623e-02
## splice_donor_variant
## 3.225178e-02
## splice_donor_region_variant
## 3.656193e-02
## frameshift_variant
## 4.666848e-02
## intron_variantxintron_variant
## 6.108518e-02
## intergenic_variantxintron_variant
## 8.739193e-02
## intron_variantxintergenic_variant
## 9.318833e-02
## intergenic_variantxintergenic_variant
## 1.136987e-01
## splice_polypyrimidine_tract_variant
## 1.367357e-01
## splice_region_variant
## 1.489230e-01
## stop_gained
## 3.596743e-01
## TF_binding_site_variant
## 3.943041e-01
## 5_prime_UTR_variant
## 7.450608e-01
## synonymous_variant
## 8.040652e-01
## 3_prime_UTR_variant
## 3.035532e+00
## regulatory_region_variant
## 4.403186e+00
## missense_variant
## 6.010722e+00
## intergenic_variant
## 3.858190e+01
## intron_variant
## 4.479892e+01
OTHER=sum(contextcnt[1:(length(contextcnt)-11)])
contextcnt2 <- tail(contextcnt,10)
contextcnt2 <- c("other"=OTHER, tail(contextcnt,10) )
par(mar=c(5.1, 16.1, 4.1, 2.1) )
barplot(tail(contextcnt,10),horiz=TRUE,las=1, xlab="no. variants")
mtext("GWAS Catalog variant context")
barplot(contextcnt2/1000,horiz=TRUE,las=1,xlab="no. variants (thousands)")
mtext("GWAS Catalog variant context")
x <- readLines("gene_biotype.tsv")
x <- strsplit(x,"\t")
# munge
y <- lapply(x,function(y) {
if ( length(y) == 2 ) {
OUT=c(y[1],y[1],y[2])
} else {
OUT=y
}
return( OUT )
})
z <- as.data.frame(do.call(rbind,y))
z$V3 <- gsub("_"," ",z$V3)
head(z)
## V1 V2 V3
## 1 ENSG00000142611 PRDM16 protein coding
## 2 ENSG00000284616 ENSG00000284616 lncRNA
## 3 ENSG00000157911 PEX10 protein coding
## 4 ENSG00000260972 ENSG00000260972 lncRNA
## 5 ENSG00000224340 RPL21P21 processed pseudogene
## 6 ENSG00000229280 EEF1DP6 processed pseudogene
gwas2 <- subset(gwas,SNP_GENE_IDS!="")
mygenes <- gsub(" ","",unlist(strsplit(gwas2$SNP_GENE_IDS,",")))
mygenes <- as.data.frame( mygenes)
colnames(mygenes) = "gene"
mybt <- merge(mygenes,z,by.x="gene",by.y="V1",all.x=TRUE)
gwas_bt <- sort(table(mybt$V3))
par(mar=c(5.1, 16.1, 4.1, 2.1) )
barplot(gwas_bt,horiz=TRUE,las=1, xlab="no. associations")
mtext("GWAS Catalog gene annotation")
OTHER <- sum(gwas_bt[1:(length(gwas_bt)-2)])
gwas_bt2 <- c("other"=OTHER,gwas_bt[(length(gwas_bt)-1):length(gwas_bt)])
par(mar=c(5.1, 10.1, 4.1, 2.1) )
barplot(gwas_bt2/1000,horiz=TRUE,las=1,main="GWAS Catalog gene annotation", xlab="no. associations (thousands)")
See what fraction of GWAS genes have gene ontologies (biological process).
66% of GWAS genes have a biological process (13599 / ( 13599 + 6988 )= 0.66 ). We can help with identifying the other ones.
gobp <- gs[grep("^GOBP",names(gs))]
gobp_genes <- unique(unlist(gobp))
length(gobp_genes)
## [1] 17951
mygenes <- gsub(" ","",unlist(strsplit(gwas2$SNP_GENE_IDS,",")))
mygenes <- unique(z[which(z$V1 %in% mygenes),2])
table(mygenes %in% gobp_genes)
##
## FALSE TRUE
## 6988 13599
Learning about the GWAS genes with and without GO annotations.
mygenes_ingo <- mygenes[which(mygenes %in% gobp_genes)]
mygenes_nogo <- mygenes[which(! mygenes %in% gobp_genes)]
mygenes_nogo_bt <- sort(table(z[which(z$V2 %in% mygenes_nogo),"V3"]))
mygenes_nogo_bt
##
## IG J gene TR V pseudogene
## 1 1
## vault RNA IG C pseudogene
## 1 4
## rRNA TEC
## 5 5
## IG V pseudogene rRNA pseudogene
## 6 14
## unitary pseudogene miRNA
## 14 23
## snoRNA snRNA
## 50 70
## transcribed unitary pseudogene transcribed processed pseudogene
## 74 87
## unprocessed pseudogene transcribed unprocessed pseudogene
## 135 359
## processed pseudogene misc RNA
## 432 950
## protein coding lncRNA
## 2087 3892
sum(mygenes_nogo_bt)
## [1] 8210
OTHER=sum(mygenes_nogo_bt[1:(length(mygenes_nogo_bt)-10)])
mygenes_nogo_bt2 <- c("other"=OTHER,tail(mygenes_nogo_bt,10))
par(mar=c(5.1, 16.1, 4.1, 2.1) )
barplot(tail(mygenes_nogo_bt,10),horiz=TRUE,las=1,xlab="no. genes")
mtext("Biotypes of GWAS genes without GO BP annotations")
grid()
barplot(mygenes_nogo_bt2,horiz=TRUE,las=1,xlab="no. genes",xlim=c(0,4000))
mtext("Biotypes of GWAS genes without GO BP annotations")
grid()
pdf("variant_annotation_barplot1.pdf")
par(mar=c(5.1, 16.1, 4.1, 2.1) )
barplot(mygenes_nogo_bt2,horiz=TRUE,las=1,xlab="no. genes",xlim=c(0,4000))
mtext("Biotypes of GWAS genes without GO BP annotations")
grid()
dev.off()
## png
## 2
mygenes_nogo_bt2/sum(mygenes_nogo_bt) *100
## other snoRNA
## 0.9013398 0.6090134
## snRNA transcribed unitary pseudogene
## 0.8526188 0.9013398
## transcribed processed pseudogene unprocessed pseudogene
## 1.0596833 1.6443362
## transcribed unprocessed pseudogene processed pseudogene
## 4.3727162 5.2618758
## misc RNA protein coding
## 11.5712546 25.4202192
## lncRNA
## 47.4056029
# NOW IN GO
mygenes_ingo_bt <- sort(table(z[which(z$V2 %in% mygenes_ingo),"V3"]))
mygenes_ingo_bt
##
## TEC TR C gene
## 1 1
## unprocessed pseudogene scaRNA
## 1 2
## processed pseudogene TR V gene
## 5 5
## transcribed processed pseudogene IG C gene
## 5 9
## miRNA IG V gene
## 10 12
## transcribed unitary pseudogene transcribed unprocessed pseudogene
## 19 28
## snoRNA lncRNA
## 34 90
## protein coding
## 13452
sum(mygenes_ingo_bt)
## [1] 13674
OTHER=sum(mygenes_ingo_bt[1:(length(mygenes_ingo_bt)-10)])
mygenes_ingo_bt2 <- c("other"=OTHER,tail(mygenes_ingo_bt,10))
par(mar=c(5.1, 16.1, 4.1, 2.1) )
barplot(tail(mygenes_ingo_bt,10),horiz=TRUE,las=1,xlab="no. genes",xlim=c(0,15000))
mtext("Biotypes of GWAS genes with GO BP annotations")
grid()
barplot(mygenes_ingo_bt2,horiz=TRUE,las=1,xlab="no. genes",xlim=c(0,15000))
grid()
mtext("Biotypes of GWAS genes with GO BP annotations")
pdf("variant_annotation_barplot2.pdf")
par(mar=c(5.1, 16.1, 4.1, 2.1) )
barplot(mygenes_ingo_bt2,horiz=TRUE,las=1,xlab="no. genes",xlim=c(0,15000))
grid()
mtext("Biotypes of GWAS genes with GO BP annotations")
dev.off()
## png
## 2
mygenes_ingo_bt2/sum(mygenes_ingo_bt) *100
## other TR V gene
## 0.07313149 0.03656575
## transcribed processed pseudogene IG C gene
## 0.03656575 0.06581834
## miRNA IG V gene
## 0.07313149 0.08775779
## transcribed unitary pseudogene transcribed unprocessed pseudogene
## 0.13894983 0.20476817
## snoRNA lncRNA
## 0.24864707 0.65818341
## protein coding
## 98.37648091
sessionInfo()
## R version 4.4.2 (2024-10-31)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 24.04.1 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.12.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.12.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Australia/Melbourne
## tzcode source: system (glibc)
##
## attached base packages:
## [1] parallel stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] RhpcBLASctl_0.23-42 png_0.1-8 gridExtra_2.3
## [4] gplots_3.2.0 beeswarm_0.4.0 kableExtra_1.4.0
## [7] HGNChelper_0.8.15 eulerr_7.0.2 reshape2_1.4.4
## [10] fgsea_1.30.0 vioplot_0.5.0 zoo_1.8-12
## [13] sm_2.2-6.0 tictoc_1.2.1
##
## loaded via a namespace (and not attached):
## [1] fastmatch_1.1-6 gtable_0.3.6 xfun_0.49
## [4] bslib_0.8.0 ggplot2_3.5.1 caTools_1.18.3
## [7] lattice_0.22-6 vctrs_0.6.5 tools_4.4.2
## [10] bitops_1.0-9 generics_0.1.3 tibble_3.2.1
## [13] pkgconfig_2.0.3 Matrix_1.7-1 KernSmooth_2.23-24
## [16] data.table_1.16.4 polylabelr_0.3.0 lifecycle_1.0.4
## [19] compiler_4.4.2 stringr_1.5.1 munsell_0.5.1
## [22] codetools_0.2-20 htmltools_0.5.8.1 sass_0.4.9
## [25] yaml_2.3.10 pillar_1.10.0 jquerylib_0.1.4
## [28] BiocParallel_1.38.0 cachem_1.1.0 gtools_3.9.5
## [31] tidyselect_1.2.1 digest_0.6.37 stringi_1.8.4
## [34] dplyr_1.1.4 cowplot_1.1.3 polyclip_1.10-7
## [37] fastmap_1.2.0 grid_4.4.2 colorspace_2.1-1
## [40] cli_3.6.3 magrittr_2.0.3 scales_1.3.0
## [43] rmarkdown_2.29 evaluate_1.0.1 knitr_1.49
## [46] viridisLite_0.4.2 rlang_1.1.4 Rcpp_1.0.13-1
## [49] glue_1.8.0 xml2_1.3.6 splitstackshape_1.4.8
## [52] svglite_2.1.3 rstudioapi_0.17.1 jsonlite_1.8.9
## [55] R6_2.5.1 plyr_1.8.9 systemfonts_1.1.0