Source: https://github.com/markziemann/impute2/
suppressPackageStartupMessages({
library("tictoc")
library("parallel")
library("vioplot")
library("fgsea")
library("reshape2")
library("eulerr")
library("HGNChelper")
library("kableExtra")
library("beeswarm")
library("gplots")
library("gridExtra")
library("png")
library("parallel")
library("RhpcBLASctl")
library("tictoc")
library("vioplot")
})
Read in data:
Ensembl gene GTF/gene symbols
Ensembl gene biotypes
Ontologies
GWASdb: a database for human genetic variants identified by genome-wide association studies.
GTF="Homo_sapiens.GRCh38.113.gtf.gz"
if ( ! file.exists(GTF) ) {
download.file("https://ftp.ensembl.org/pub/release-113/gtf/homo_sapiens/Homo_sapiens.GRCh38.113.gtf.gz",
destfile=GTF)
}
zcat Homo_sapiens.GRCh38.113.gtf.gz \
| awk '$3=="gene"' \
| cut -d '"' -f2,6,8,10 \
| sed 's/"/\t/g' \
| sed 's/ensembl_havana\t//' \
| sed 's/havana//' \
| sed 's/_tagene//' \
| sed 's/ensembl//' \
| sed 's/mirbase//' \
| sed 's/insdc//' \
| sed 's/\t\t/\t/' \
| sed 's/_\t//' > gene_biotype.tsv
Gene biotypes.
x <- readLines("gene_biotype.tsv")
x <- strsplit(x,"\t")
# munge
y <- lapply(x,function(y) {
if ( length(y) == 2 ) {
OUT=c(y[1],y[1],y[2])
} else {
OUT=y
}
return( OUT )
})
z <- as.data.frame(do.call(rbind,y))
z$V3 <- gsub("_"," ",z$V3)
Genesets.
gs <- fgsea::gmtPathways("msigdb.v2024.1.Hs.symbols.gmt")
gobp <- gs[grep("^GOBP",names(gs))]
reac <- gs[grep("^REACTOME",names(gs))]
message("total genes")
## total genes
dim(z)
## [1] 78932 3
message("no. named genes")
## no. named genes
length(which(z$V2!=""))
## [1] 78932
message("no. unnamed genes")
## no. unnamed genes
length(which(z$V2==""))
## [1] 0
bt <- sort(table(z$V3))
par(mar=c(4.1, 16.1, 2.1, 2.1) )
barplot(bt,horiz=TRUE,las=1,cex.names=0.8,xlab="no. genes",main="Ensembl Biotype")
bt
##
## IG pseudogene scRNA
## 1 1
## Mt rRNA translated processed pseudogene
## 2 2
## IG J pseudogene pseudogene
## 3 4
## TR J pseudogene vault RNA
## 4 4
## sRNA TR D gene
## 5 5
## TR C gene ribozyme
## 6 8
## IG C pseudogene IG C gene
## 9 14
## IG J gene artifact
## 18 19
## Mt tRNA TR V pseudogene
## 22 33
## IG D gene scaRNA
## 37 49
## rRNA TR J gene
## 53 79
## unitary pseudogene TR V gene
## 79 107
## IG V gene transcribed unitary pseudogene
## 145 181
## IG V pseudogene rRNA pseudogene
## 187 497
## snoRNA TEC
## 942 1020
## transcribed processed pseudogene transcribed unprocessed pseudogene
## 1159 1600
## miRNA snRNA
## 1879 1910
## unprocessed pseudogene misc RNA
## 1957 2217
## processed pseudogene protein coding
## 9490 20116
## lncRNA
## 35068
bt/sum(bt)* 100
##
## IG pseudogene scRNA
## 0.001266913 0.001266913
## Mt rRNA translated processed pseudogene
## 0.002533827 0.002533827
## IG J pseudogene pseudogene
## 0.003800740 0.005067653
## TR J pseudogene vault RNA
## 0.005067653 0.005067653
## sRNA TR D gene
## 0.006334566 0.006334566
## TR C gene ribozyme
## 0.007601480 0.010135306
## IG C pseudogene IG C gene
## 0.011402220 0.017736786
## IG J gene artifact
## 0.022804439 0.024071353
## Mt tRNA TR V pseudogene
## 0.027872092 0.041808139
## IG D gene scaRNA
## 0.046875792 0.062078751
## rRNA TR J gene
## 0.067146405 0.100086150
## unitary pseudogene TR V gene
## 0.100086150 0.135559722
## IG V gene transcribed unitary pseudogene
## 0.183702427 0.229311306
## IG V pseudogene rRNA pseudogene
## 0.236912786 0.629655906
## snoRNA TEC
## 1.193432321 1.292251558
## transcribed processed pseudogene transcribed unprocessed pseudogene
## 1.468352506 2.027061268
## miRNA snRNA
## 2.380530077 2.419804389
## unprocessed pseudogene misc RNA
## 2.479349313 2.808746769
## processed pseudogene protein coding
## 12.023007145 25.485227791
## lncRNA
## 44.428115340
par(mar=c(4.1, 16.1, 2.1, 2.1) )
barplot(tail(bt,10),horiz=TRUE,las=1,cex.names=1,xlab="no. genes",main="Ensembl Biotype")
gobp_genes <- unique(unname(unlist(gobp)))
message("no. genes known to GO")
## no. genes known to GO
length(gobp_genes)
## [1] 17951
ensg_genes <- unique(apply(z,1,function(x) {
if ( x[2]=="" ) {
NAME=x[1]
} else {
NAME=x[2]
}
return(NAME)
} ))
length(ensg_genes)
## [1] 77307
str(intersect(ensg_genes,gobp_genes))
## chr [1:17921] "PRDM16" "PEX10" "PEX14" "PLCH2" "SPSB1" "SLC2A5" "NMNAT1" ...
v1 <- list("Ensembl"=ensg_genes, "GO BP"=gobp_genes)
plot(euler(v1),quantities = TRUE)
pdf("ontologies_euler.pdf")
plot(euler(v1),quantities = TRUE)
dev.off()
## png
## 2
message("Ensembl genes in GO BP")
## Ensembl genes in GO BP
length(intersect(ensg_genes,gobp_genes))
## [1] 17921
itx <- intersect(ensg_genes,gobp_genes)
itx_bt <- z[which(z$V1 %in% itx | z$V2 %in% itx ),3]
itx_bt <- sort(table(itx_bt))
itx_bt/sum(itx_bt) * 100
## itx_bt
## IG C pseudogene IG D gene
## 0.005536792 0.005536792
## Mt tRNA ribozyme
## 0.005536792 0.005536792
## TR V pseudogene translated processed pseudogene
## 0.005536792 0.005536792
## IG V pseudogene TR D gene
## 0.011073584 0.011073584
## unitary pseudogene artifact
## 0.011073584 0.016610376
## IG J gene misc RNA
## 0.016610376 0.016610376
## TEC TR C gene
## 0.016610376 0.033220752
## IG C gene TR J gene
## 0.077515088 0.094125464
## scaRNA unprocessed pseudogene
## 0.110735840 0.116272632
## transcribed processed pseudogene transcribed unitary pseudogene
## 0.127346216 0.127346216
## snRNA processed pseudogene
## 0.149493384 0.166103759
## transcribed unprocessed pseudogene TR V gene
## 0.376501855 0.564752782
## IG V gene lncRNA
## 0.686562206 0.725319750
## snoRNA miRNA
## 2.569071480 3.106140302
## protein coding
## 90.836609269
par(mar=c(5.1, 16.1, 4.1, 2.1) )
barplot(tail(sort(itx_bt),10),horiz=TRUE,las=1,xlab="no. genes",
xlim=c(0,18000),main="GO BP Biotype")
pdf("ontologies_bar1.pdf")
barplot(tail(sort(itx_bt),10),horiz=TRUE,las=1,xlab="no. genes",
xlim=c(0,18000),main="GO BP Biotype")
grid()
dev.off()
## png
## 2
message("Ensembl genes not in GO BP")
## Ensembl genes not in GO BP
length(setdiff(ensg_genes,gobp_genes))
## [1] 59386
sdiff <- setdiff(ensg_genes,gobp_genes)
sdiff_bt <- z[which(z$V1 %in% sdiff | z$V2 %in% sdiff ),3]
sdiff_bt <- sort(table(sdiff_bt))
sdiff_bt/sum(sdiff_bt) * 100
## sdiff_bt
## IG pseudogene scRNA
## 0.001642818 0.001642818
## translated processed pseudogene Mt rRNA
## 0.001642818 0.003285637
## IG J pseudogene TR D gene
## 0.004928455 0.004928455
## pseudogene TR J pseudogene
## 0.006571274 0.006571274
## vault RNA sRNA
## 0.006571274 0.008214092
## TR V gene ribozyme
## 0.008214092 0.011499729
## IG C pseudogene IG J gene
## 0.013142547 0.024642276
## artifact IG V gene
## 0.026285095 0.034499187
## Mt tRNA scaRNA
## 0.034499187 0.047641734
## TR V pseudogene IG D gene
## 0.052570189 0.059141463
## rRNA TR J gene
## 0.087069376 0.101854742
## unitary pseudogene transcribed unitary pseudogene
## 0.126497018 0.259565310
## IG V pseudogene snoRNA
## 0.303921408 0.785267204
## rRNA pseudogene TEC
## 0.816480754 1.670746332
## transcribed processed pseudogene miRNA
## 1.866241724 2.165234677
## transcribed unprocessed pseudogene snRNA
## 2.516797818 3.093427084
## unprocessed pseudogene misc RNA
## 3.180496460 3.637199980
## protein coding processed pseudogene
## 6.094856336 15.541062246
## lncRNA
## 57.395147114
par(mar=c(5.1, 16.1, 4.1, 2.1) )
barplot(tail(sort(sdiff_bt),10),horiz=TRUE,las=1,cex.names=1,
xlim=c(0,35000),
xlab="no. genes",main="Ensembl genes without GO BP Biotype")
grid()
pdf("ontologies_bar2.pdf")
barplot(tail(sort(sdiff_bt),10),horiz=TRUE,las=1,cex.names=1,
xlim=c(0,35000),
xlab="no. genes",main="Ensembl genes without GO BP Biotype")
grid()
dev.off()
## png
## 2
message("GO BP genes not in Ensembl")
## GO BP genes not in Ensembl
length(setdiff(gobp_genes,ensg_genes))
## [1] 30
setdiff(gobp_genes,ensg_genes)
## [1] "MRPS36" "HLA-DRB3" "KIR2DS2" "HLA-DRB4" "LILRA3"
## [6] "PRSS3P2" "GSTT1" "SLC66A1L" "PRAMEF22" "KIR2DS3"
## [11] "GTF2H2C_2" "C12orf4" "KIR3DS1" "PRPF4B" "CCDC113"
## [16] "CCDC96" "TSPY26P" "LOC101929130" "TAS2R45" "OR8U8"
## [21] "OR8U9" "OR9G9" "C21orf62" "KIR2DL2" "KIR2DL5A"
## [26] "KIR2DL5B" "KIR2DS1" "KIR2DS5" "MSL3P1" "C2orf83"
par(mar=c(5.1, 4.1, 4.1, 2.1) )
sessionInfo()
## R version 4.4.2 (2024-10-31)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 24.04.1 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.12.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.12.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Australia/Melbourne
## tzcode source: system (glibc)
##
## attached base packages:
## [1] parallel stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] RhpcBLASctl_0.23-42 png_0.1-8 gridExtra_2.3
## [4] gplots_3.2.0 beeswarm_0.4.0 kableExtra_1.4.0
## [7] HGNChelper_0.8.15 eulerr_7.0.2 reshape2_1.4.4
## [10] fgsea_1.30.0 vioplot_0.5.0 zoo_1.8-12
## [13] sm_2.2-6.0 tictoc_1.2.1
##
## loaded via a namespace (and not attached):
## [1] fastmatch_1.1-6 gtable_0.3.6 xfun_0.49
## [4] bslib_0.8.0 ggplot2_3.5.1 caTools_1.18.3
## [7] lattice_0.22-6 vctrs_0.6.5 tools_4.4.2
## [10] bitops_1.0-9 generics_0.1.3 tibble_3.2.1
## [13] pkgconfig_2.0.3 Matrix_1.7-1 KernSmooth_2.23-24
## [16] data.table_1.16.4 polylabelr_0.3.0 lifecycle_1.0.4
## [19] compiler_4.4.2 stringr_1.5.1 munsell_0.5.1
## [22] codetools_0.2-20 htmltools_0.5.8.1 sass_0.4.9
## [25] yaml_2.3.10 pillar_1.10.0 jquerylib_0.1.4
## [28] BiocParallel_1.38.0 cachem_1.1.0 gtools_3.9.5
## [31] tidyselect_1.2.1 digest_0.6.37 stringi_1.8.4
## [34] dplyr_1.1.4 cowplot_1.1.3 polyclip_1.10-7
## [37] fastmap_1.2.0 grid_4.4.2 colorspace_2.1-1
## [40] cli_3.6.3 magrittr_2.0.3 scales_1.3.0
## [43] rmarkdown_2.29 evaluate_1.0.1 knitr_1.49
## [46] viridisLite_0.4.2 rlang_1.1.4 Rcpp_1.0.13-1
## [49] glue_1.8.0 xml2_1.3.6 splitstackshape_1.4.8
## [52] svglite_2.1.3 rstudioapi_0.17.1 jsonlite_1.8.9
## [55] R6_2.5.1 plyr_1.8.9 systemfonts_1.1.0