Introduction

Here I will be examining dinucleotide profiles in human genes.

suppressPackageStartupMessages({
  library("parallel")
  library("vioplot")
  library("beeswarm")
  library("kableExtra")
  library("seqinr")
  library("gplots")
  library(rtracklayer)
})

Load data.

gtfpath="https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/genes/hg38.refGene.gtf.gz"
gtf <- readGFF(gtfpath)

if ( !file.exists("hg38.2bit") ) {
  download.file("https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.2bit",destfile="hg38.2bit") 
}

seq <- rtracklayer::TwoBitFile("hg38.2bit")

# need to extract the sequences
# first make a df of genes
genes <- unique(gtf[,c("gene_id","transcript_id")])
# get primary transcript id
tx <- sapply(unique(genes$gene_id),function(g){ genes[which(genes$gene_id==g),2][1] } )
# get exons of primary tx
exons <- gtf[gtf$transcript_id %in% tx,]
exons <- subset(exons,type=="exon")
exons <- exons[grep("_",exons$seqid,invert=TRUE),]

calc mono and di-nucleotide counts.

#geneseq <- getSeq(seq, GRanges(exons))
cores <- detectCores()

myseqs <- mclapply(unique(exons$gene_name),function(g) {
  myseq <- getSeq(seq,GRanges(exons[which(exons$gene_name==g),c("seqid","start","end")]))
  myseq <- as.character(unlist(myseq))
  s2c(myseq)
} , mc.cores=cores)

mylen <- lapply(myseqs,length)

mono <- mclapply(myseqs,function(x) {
  seqinr::count(x,wordsize=1,alphabet=c("A","G","T","C"))
} , mc.cores=cores)

dinuc <- mclapply(myseqs,function(x) {
  seqinr::count(x,wordsize=2,alphabet=c("A","G","T","C"))
} , mc.cores=cores)

freq <- mclapply(1:length(myseqs),function(i){
  AA <- dinuc[[i]][1] / ( mono[[i]][1] * mono[[i]][1] ) * mylen[[i]]
  AC <- dinuc[[i]][2] / ( mono[[i]][1] * mono[[i]][2] ) * mylen[[i]]
  AG <- dinuc[[i]][3] / ( mono[[i]][1] * mono[[i]][3] ) * mylen[[i]]
  AT <- dinuc[[i]][4] / ( mono[[i]][1] * mono[[i]][4] ) * mylen[[i]]
  CA <- dinuc[[i]][5] / ( mono[[i]][2] * mono[[i]][1] ) * mylen[[i]]
  CC <- dinuc[[i]][6] / ( mono[[i]][2] * mono[[i]][2] ) * mylen[[i]]
  CG <- dinuc[[i]][7] / ( mono[[i]][2] * mono[[i]][3] ) * mylen[[i]]
  CT <- dinuc[[i]][8] / ( mono[[i]][2] * mono[[i]][4] ) * mylen[[i]]
  GA <- dinuc[[i]][9] / ( mono[[i]][3] * mono[[i]][1] ) * mylen[[i]]
  GC <- dinuc[[i]][10] / ( mono[[i]][3] * mono[[i]][2] ) * mylen[[i]]
  GG <- dinuc[[i]][11] / ( mono[[i]][3] * mono[[i]][3] ) * mylen[[i]]
  GT <- dinuc[[i]][12] / ( mono[[i]][3] * mono[[i]][4] ) * mylen[[i]]
  TA <- dinuc[[i]][13] / ( mono[[i]][4] * mono[[i]][1] ) * mylen[[i]]
  TC <- dinuc[[i]][14] / ( mono[[i]][4] * mono[[i]][2] ) * mylen[[i]]
  TG <- dinuc[[i]][15] / ( mono[[i]][4] * mono[[i]][3] ) * mylen[[i]]
  TT <- dinuc[[i]][16] / ( mono[[i]][4] * mono[[i]][4] ) * mylen[[i]]
  c(AA,AC,AG,AT,CA,CC,CG,CT,GA,GC,GG,GT,TA,TC,TG,TT)
},mc.cores=cores)

df <- do.call(rbind,freq)
HEADER=paste("dinucleotide frequency for exons of",length(unique(exons$gene_name)),"genes")
vioplot(df,ylim=c(0,5),cex.names=0.8)
mtext(HEADER)
grid()

Session info

sessionInfo()
## R version 4.2.1 (2022-06-23)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
## 
## locale:
##  [1] LC_CTYPE=en_AU.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_AU.UTF-8        LC_COLLATE=en_AU.UTF-8    
##  [5] LC_MONETARY=en_AU.UTF-8    LC_MESSAGES=en_AU.UTF-8   
##  [7] LC_PAPER=en_AU.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] rtracklayer_1.56.1   GenomicRanges_1.48.0 GenomeInfoDb_1.32.2 
##  [4] IRanges_2.30.0       S4Vectors_0.34.0     BiocGenerics_0.42.0 
##  [7] gplots_3.1.3         seqinr_4.2-16        kableExtra_1.3.4    
## [10] beeswarm_0.4.0       vioplot_0.3.7        zoo_1.8-10          
## [13] sm_2.2-5.7.1        
## 
## loaded via a namespace (and not attached):
##  [1] svglite_2.1.0               lattice_0.20-45            
##  [3] Rsamtools_2.12.0            Biostrings_2.64.0          
##  [5] gtools_3.9.3                digest_0.6.29              
##  [7] R6_2.5.1                    evaluate_0.15              
##  [9] highr_0.9                   httr_1.4.3                 
## [11] zlibbioc_1.42.0             rlang_1.0.4                
## [13] rstudioapi_0.13             jquerylib_0.1.4            
## [15] Matrix_1.4-1                rmarkdown_2.14             
## [17] webshot_0.5.3               BiocParallel_1.30.3        
## [19] stringr_1.4.0               RCurl_1.98-1.7             
## [21] munsell_0.5.0               DelayedArray_0.22.0        
## [23] compiler_4.2.1              xfun_0.31                  
## [25] systemfonts_1.0.4           tcltk_4.2.1                
## [27] htmltools_0.5.3             SummarizedExperiment_1.26.1
## [29] GenomeInfoDbData_1.2.8      codetools_0.2-18           
## [31] matrixStats_0.62.0          XML_3.99-0.10              
## [33] viridisLite_0.4.0           crayon_1.5.1               
## [35] GenomicAlignments_1.32.0    MASS_7.3-58                
## [37] bitops_1.0-7                grid_4.2.1                 
## [39] jsonlite_1.8.0              lifecycle_1.0.1            
## [41] magrittr_2.0.3              scales_1.2.0               
## [43] KernSmooth_2.23-20          cachem_1.0.6               
## [45] cli_3.3.0                   stringi_1.7.8              
## [47] XVector_0.36.0              bslib_0.4.0                
## [49] xml2_1.3.3                  rjson_0.2.21               
## [51] restfulr_0.0.15             tools_4.2.1                
## [53] ade4_1.7-19                 Biobase_2.56.0             
## [55] glue_1.6.2                  MatrixGenerics_1.8.1       
## [57] fastmap_1.1.0               yaml_2.3.5                 
## [59] colorspace_2.0-3            caTools_1.18.2             
## [61] rvest_1.0.2                 knitr_1.39                 
## [63] sass_0.4.2                  BiocIO_1.6.0