Here I will be examining dinucleotide profiles in human genes.
suppressPackageStartupMessages({
library("parallel")
library("vioplot")
library("beeswarm")
library("kableExtra")
library("seqinr")
library("gplots")
library(rtracklayer)
})
Load data.
gtfpath="https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/genes/hg38.refGene.gtf.gz"
gtf <- readGFF(gtfpath)
if ( !file.exists("hg38.2bit") ) {
download.file("https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.2bit",destfile="hg38.2bit")
}
seq <- rtracklayer::TwoBitFile("hg38.2bit")
# need to extract the sequences
# first make a df of genes
genes <- unique(gtf[,c("gene_id","transcript_id")])
# get primary transcript id
tx <- sapply(unique(genes$gene_id),function(g){ genes[which(genes$gene_id==g),2][1] } )
# get exons of primary tx
exons <- gtf[gtf$transcript_id %in% tx,]
exons <- subset(exons,type=="exon")
exons <- exons[grep("_",exons$seqid,invert=TRUE),]
calc mono and di-nucleotide counts.
#geneseq <- getSeq(seq, GRanges(exons))
cores <- detectCores()
myseqs <- mclapply(unique(exons$gene_name),function(g) {
myseq <- getSeq(seq,GRanges(exons[which(exons$gene_name==g),c("seqid","start","end")]))
myseq <- as.character(unlist(myseq))
s2c(myseq)
} , mc.cores=cores)
mylen <- lapply(myseqs,length)
mono <- mclapply(myseqs,function(x) {
seqinr::count(x,wordsize=1,alphabet=c("A","G","T","C"))
} , mc.cores=cores)
dinuc <- mclapply(myseqs,function(x) {
seqinr::count(x,wordsize=2,alphabet=c("A","G","T","C"))
} , mc.cores=cores)
freq <- mclapply(1:length(myseqs),function(i){
AA <- dinuc[[i]][1] / ( mono[[i]][1] * mono[[i]][1] ) * mylen[[i]]
AC <- dinuc[[i]][2] / ( mono[[i]][1] * mono[[i]][2] ) * mylen[[i]]
AG <- dinuc[[i]][3] / ( mono[[i]][1] * mono[[i]][3] ) * mylen[[i]]
AT <- dinuc[[i]][4] / ( mono[[i]][1] * mono[[i]][4] ) * mylen[[i]]
CA <- dinuc[[i]][5] / ( mono[[i]][2] * mono[[i]][1] ) * mylen[[i]]
CC <- dinuc[[i]][6] / ( mono[[i]][2] * mono[[i]][2] ) * mylen[[i]]
CG <- dinuc[[i]][7] / ( mono[[i]][2] * mono[[i]][3] ) * mylen[[i]]
CT <- dinuc[[i]][8] / ( mono[[i]][2] * mono[[i]][4] ) * mylen[[i]]
GA <- dinuc[[i]][9] / ( mono[[i]][3] * mono[[i]][1] ) * mylen[[i]]
GC <- dinuc[[i]][10] / ( mono[[i]][3] * mono[[i]][2] ) * mylen[[i]]
GG <- dinuc[[i]][11] / ( mono[[i]][3] * mono[[i]][3] ) * mylen[[i]]
GT <- dinuc[[i]][12] / ( mono[[i]][3] * mono[[i]][4] ) * mylen[[i]]
TA <- dinuc[[i]][13] / ( mono[[i]][4] * mono[[i]][1] ) * mylen[[i]]
TC <- dinuc[[i]][14] / ( mono[[i]][4] * mono[[i]][2] ) * mylen[[i]]
TG <- dinuc[[i]][15] / ( mono[[i]][4] * mono[[i]][3] ) * mylen[[i]]
TT <- dinuc[[i]][16] / ( mono[[i]][4] * mono[[i]][4] ) * mylen[[i]]
c(AA,AC,AG,AT,CA,CC,CG,CT,GA,GC,GG,GT,TA,TC,TG,TT)
},mc.cores=cores)
df <- do.call(rbind,freq)
HEADER=paste("dinucleotide frequency for exons of",length(unique(exons$gene_name)),"genes")
vioplot(df,ylim=c(0,5),cex.names=0.8)
mtext(HEADER)
grid()
sessionInfo()
## R version 4.2.1 (2022-06-23)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.4 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
##
## locale:
## [1] LC_CTYPE=en_AU.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_AU.UTF-8 LC_COLLATE=en_AU.UTF-8
## [5] LC_MONETARY=en_AU.UTF-8 LC_MESSAGES=en_AU.UTF-8
## [7] LC_PAPER=en_AU.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] rtracklayer_1.56.1 GenomicRanges_1.48.0 GenomeInfoDb_1.32.2
## [4] IRanges_2.30.0 S4Vectors_0.34.0 BiocGenerics_0.42.0
## [7] gplots_3.1.3 seqinr_4.2-16 kableExtra_1.3.4
## [10] beeswarm_0.4.0 vioplot_0.3.7 zoo_1.8-10
## [13] sm_2.2-5.7.1
##
## loaded via a namespace (and not attached):
## [1] svglite_2.1.0 lattice_0.20-45
## [3] Rsamtools_2.12.0 Biostrings_2.64.0
## [5] gtools_3.9.3 digest_0.6.29
## [7] R6_2.5.1 evaluate_0.15
## [9] highr_0.9 httr_1.4.3
## [11] zlibbioc_1.42.0 rlang_1.0.4
## [13] rstudioapi_0.13 jquerylib_0.1.4
## [15] Matrix_1.4-1 rmarkdown_2.14
## [17] webshot_0.5.3 BiocParallel_1.30.3
## [19] stringr_1.4.0 RCurl_1.98-1.7
## [21] munsell_0.5.0 DelayedArray_0.22.0
## [23] compiler_4.2.1 xfun_0.31
## [25] systemfonts_1.0.4 tcltk_4.2.1
## [27] htmltools_0.5.3 SummarizedExperiment_1.26.1
## [29] GenomeInfoDbData_1.2.8 codetools_0.2-18
## [31] matrixStats_0.62.0 XML_3.99-0.10
## [33] viridisLite_0.4.0 crayon_1.5.1
## [35] GenomicAlignments_1.32.0 MASS_7.3-58
## [37] bitops_1.0-7 grid_4.2.1
## [39] jsonlite_1.8.0 lifecycle_1.0.1
## [41] magrittr_2.0.3 scales_1.2.0
## [43] KernSmooth_2.23-20 cachem_1.0.6
## [45] cli_3.3.0 stringi_1.7.8
## [47] XVector_0.36.0 bslib_0.4.0
## [49] xml2_1.3.3 rjson_0.2.21
## [51] restfulr_0.0.15 tools_4.2.1
## [53] ade4_1.7-19 Biobase_2.56.0
## [55] glue_1.6.2 MatrixGenerics_1.8.1
## [57] fastmap_1.1.0 yaml_2.3.5
## [59] colorspace_2.0-3 caTools_1.18.2
## [61] rvest_1.0.2 knitr_1.39
## [63] sass_0.4.2 BiocIO_1.6.0