Introduction

Here I want to download gene expression data for human and fly and rank the genes by their overall gene expression. The data will be fetched from DEE2.io with accession numbers SRP002072 (fly) and ERP000546 (human). Then we will look at their dinucleotide profiles in high, medium and low expressed genes.

suppressPackageStartupMessages({
  library("R.utils")
  library("getDEE2")
  library("kableExtra")
  library("seqinr")
  library("gplots")

})

Drosophila

Study SRP002072 was published in 2011 as the first transcriptome of D. melanogaster, sampling 10 developmental stages.

mdat <- getDEE2Metadata("dmelanogaster")

mdat1 <- mdat[which(mdat$SRP_accession %in% "SRP002072"),]

SRRvec <- as.vector(mdat1$SRR_accession)

x <- getDEE2("dmelanogaster",SRRvec,metadata=mdat,counts="GeneCounts",legacy=TRUE)
## For more information about DEE2 QC metrics, visit
##     https://github.com/markziemann/dee2/blob/master/qc/qc_metrics.md
head(x$GeneCounts) %>%
  kbl(caption="sample of gene expression data") %>%
  kable_paper("hover", full_width = F)
sample of gene expression data
SRR040803 SRR042295 SRR042296 SRR042297 SRR042298 SRR042412 SRR042423 SRR042489 SRR042490 SRR058881 SRR058885 SRR059066
FBgn0085804 1 0 0 0 0 0 0 0 0 0 0 0
FBgn0267431 0 0 0 0 1 3 8 0 0 0 0 0
FBgn0039987 0 0 0 0 0 0 0 0 1 0 0 0
FBgn0058182 0 0 0 0 0 0 0 0 0 0 0 0
FBgn0267430 1 0 0 4 3 4 3 0 1 0 1 2
FBgn0266747 0 0 0 2 2 0 3 0 0 0 0 0
xx <- x$GeneCounts

rpm <- apply(xx,2,function(x) {x/sum(x)} ) * 1000000

lrpm <- log10(rpm+1)

means <- rowMeans(lrpm)

hist(means)

Now separate high, medium and low expressed genes into text files.

means <- means[order(means)]

qnt <- rev(quantile(means,seq(0,1,1/3)))
qnt <- qnt[2:length(qnt)]

high <- names(which(means > qnt[1]))
med <- names(which(means < qnt[1] & means > qnt[2] ))
low <- names(which(means < qnt[2]))

str(high)
##  chr [1:5853] "FBgn0053505" "FBgn0036005" "FBgn0021875" "FBgn0036090" ...
str(med)
##  chr [1:5853] "FBgn0036289" "FBgn0034622" "FBgn0264702" "FBgn0028658" ...
str(low)
##  chr [1:5853] "FBgn0058182" "FBgn0262214" "FBgn0040687" "FBgn0264983" ...
writeLines(high,con="fly_high.txt")
writeLines(med,con="fly_med.txt")
writeLines(low,con="fly_low.txt")

Now get the gene sequences and load into R.

if ( ! file.exists("fly_cdna.fa.gz") ) {
  download.file("ftp://ftp.ensembl.org/pub/release-90/fasta/drosophila_melanogaster/cdna/Drosophila_melanogaster.BDGP6.cdna.all.fa.gz", 
    destfile="fly_cdna.fa.gz")
}

fa <- read.fasta("fly_cdna.fa.gz")

Now the tricky part is to select one representative transcript isoform for each gene (the first one in the fasta file).

seqinfo <- unlist(getAnnot(fa))

genes <- gsub("gene:","",sapply(strsplit(seqinfo," "),"[[",4))

mygene <- unique(genes)[1]

myseqs <- lapply(unique(genes),function(g) {
  myseq <- head(fa[which(genes %in% g)],1)
})

Great. Now need to group the sequences by their membership in high, medium or low groups.

seqinfo <- unlist(getAnnot(myseqs))

genes <- gsub("gene:","",sapply(strsplit(seqinfo," "),"[[",4))

fa_high <- myseqs[which(genes %in% high)]

fa_med <- myseqs[which(genes %in% med)]

fa_low <- myseqs[which(genes %in% low)]

Now here I can show the nucleotide content of high, medium and low expressed genes.

count_high1 <- seqinr::count(seq=unlist(fa_high),wordsize=1,freq=TRUE)
count_med1 <- seqinr::count(seq=unlist(fa_med),wordsize=1,freq=TRUE)
count_low1 <- seqinr::count(seq=unlist(fa_low),wordsize=1,freq=TRUE)
res1 <- rbind(count_low1, count_med1, count_high1)
res1 %>%
  kbl(caption="comparison of nucleotide content of high, medium and low expressed genes in drosophila") %>%
  kable_paper("hover", full_width = F)
comparison of nucleotide content of high, medium and low expressed genes in drosophila
a c g t
count_low1 0.2834623 0.2338232 0.2323716 0.2503429
count_med1 0.2776779 0.2447269 0.2417666 0.2358285
count_high1 0.2760666 0.2526660 0.2478667 0.2234006

Now here I can show the dinucleotide content of high, medium and low expressed genes.

count_high2 <- seqinr::count(seq=unlist(fa_high),wordsize=2,freq=TRUE)
count_med2 <- seqinr::count(seq=unlist(fa_med),wordsize=2,freq=TRUE)
count_low2 <- seqinr::count(seq=unlist(fa_low),wordsize=2,freq=TRUE)
res2 <- rbind(count_low2, count_med2, count_high2)
res2 %>%
  kbl(caption="comparison of dinucleotide content of high, medium and low expressed genes in drosophila") %>%
  kable_paper("hover", full_width = F)
comparison of dinucleotide content of high, medium and low expressed genes in drosophila
aa ac ag at ca cc cg ct ga gc gg gt ta tc tg tt
count_low2 0.0911473 0.0576155 0.0617578 0.0729413 0.0737129 0.0550199 0.0484969 0.0565936 0.0678081 0.0620639 0.0539476 0.0485522 0.0507936 0.0591241 0.0681694 0.0722560
count_med2 0.0861690 0.0587789 0.0638489 0.0688812 0.0760625 0.0597220 0.0536413 0.0553011 0.0685853 0.0682812 0.0570483 0.0478518 0.0468612 0.0579449 0.0672280 0.0637945
count_high2 0.0838555 0.0615068 0.0664435 0.0642608 0.0794284 0.0606216 0.0592451 0.0533710 0.0688855 0.0745777 0.0568739 0.0475297 0.0438973 0.0559600 0.0653042 0.0582392

Super. Now we can make a chart of dinucleotide differences.

ymin=min(res2)
ymax=max(res2)
plot(res2[1,],xaxt = "n",pch=19,col="blue",cex=2,
  ylim=c(ymin,ymax), xlab="dinucleotide",ylab="frequency")
axis(1, at=1:ncol(res2), labels=colnames(res2))
points(res2[2,],pch=19,col="gray",cex=2)
points(res2[3,],pch=19,col="red",cex=2)

Now we can look at whether the dinucleotides are more highly represented in higher expressed genes as compared to lowly expressed.

rep <- apply(res2,2,function(x) { log2(x[3]/x[1]) } )

barplot(rep,ylab="log2 fold enrichment in higher expressed genes")

save.image("fly.Rdata")

Now look at the amino acid composition starting with CDS sequence.

Select one representative CDS per gene.

if ( ! file.exists("fly_cds.fa.gz") ) {
  download.file("ftp://ftp.ensembl.org/pub/release-90/fasta/drosophila_melanogaster/cds/Drosophila_melanogaster.BDGP6.cds.all.fa.gz",
    destfile="fly_cds.fa.gz")
}
fa <- read.fasta("fly_cds.fa.gz")
seqinfo <- unlist(getAnnot(fa))
genes <- gsub("gene:","",sapply(strsplit(seqinfo," "),"[[",4))
mygene <- unique(genes)[1]

myseqs <- lapply(unique(genes),function(g) {
  myseq <- head(fa[which(genes %in% g)],1)
})

seqinfo <- unlist(getAnnot(myseqs))
genes <- gsub("gene:","",sapply(strsplit(seqinfo," "),"[[",4))
fa_high <- myseqs[which(genes %in% high)]
fa_med <- myseqs[which(genes %in% med)]
fa_low <- myseqs[which(genes %in% low)]
codon_high <- uco(unlist(fa_high),index="rscu",as.data.frame=TRUE)
codon_high <- codon_high[order(codon_high$AA),]
codon_med <- uco(unlist(fa_med),index="rscu",as.data.frame=TRUE)
codon_med <- codon_med[order(codon_med$AA),]
codon_low <- uco(unlist(fa_low),index="rscu",as.data.frame=TRUE)
codon_low <- codon_low[order(codon_low$AA),]
codons <- data.frame(codon_low$RSCU,  codon_med$RSCU, codon_high$RSCU )
rownames(codons) <- paste(codon_low$AA, codon_low$codon)

codons %>%
  kbl(caption="RSCU of high med and low expressed genes") %>%
  kable_paper("hover", full_width = F)
RSCU of high med and low expressed genes
codon_low.RSCU codon_med.RSCU codon_high.RSCU
Ala gca 1.2510629 1.0151353 1.1222023
Ala gcc 0.9709917 1.3096806 1.1620967
Ala gcg 0.8218662 0.7652104 0.7948555
Ala gct 0.9560792 0.9099737 0.9208455
Arg aga 1.3272054 0.9022233 0.8097677
Arg agg 1.3457421 0.8684332 0.7293770
Arg cga 0.9565643 1.3277980 1.5022731
Arg cgc 0.8523186 1.1568146 1.1711491
Arg cgg 0.9707880 1.0044904 1.0126115
Arg cgt 0.5473816 0.7402405 0.7748217
Asn aac 0.9423953 1.0036371 1.0851978
Asn aat 1.0576047 0.9963629 0.9148022
Asp gac 0.8443794 0.8480066 0.9193470
Asp gat 1.1556206 1.1519934 1.0806530
Cys tgc 1.2084094 1.2503330 1.2471711
Cys tgt 0.7915906 0.7496670 0.7528289
Gln caa 1.1050071 0.9666172 1.0666840
Gln cag 0.8949929 1.0333828 0.9333160
Glu gaa 1.1310259 0.9421185 0.9839191
Glu gag 0.8689741 1.0578815 1.0160809
Gly gga 1.4129060 1.4493083 1.4782987
Gly ggc 1.1043165 1.2220502 1.2258739
Gly ggg 0.7425332 0.5616481 0.5659289
Gly ggt 0.7402442 0.7669934 0.7298985
His cac 0.9775165 1.0275372 1.0155336
His cat 1.0224835 0.9724628 0.9844664
Ile ata 0.8607845 0.6647587 0.5822379
Ile atc 1.1557099 1.3134994 1.4254683
Ile att 0.9835056 1.0217420 0.9922939
Leu cta 0.7348677 0.7591808 0.8007587
Leu ctc 0.8430560 1.0434966 1.1661689
Leu ctg 1.2889699 1.8378223 1.7453892
Leu ctt 0.9009241 0.9381465 1.0486501
Leu tta 0.9334387 0.4374372 0.4034263
Leu ttg 1.2987436 0.9839166 0.8356068
Lys aaa 1.1367615 0.8858341 0.8821693
Lys aag 0.8632385 1.1141659 1.1178307
Met atg 1.0000000 1.0000000 1.0000000
Phe ttc 0.9867888 1.0959713 1.1461263
Phe ttt 1.0132112 0.9040287 0.8538737
Pro cca 1.5317222 1.3250192 1.3829790
Pro ccc 0.7008055 0.8862955 0.7845434
Pro ccg 0.8703987 0.9380444 0.9489809
Pro cct 0.8970735 0.8506410 0.8834967
Ser agc 1.1771170 1.2859666 1.3740698
Ser agt 0.9051089 0.8365642 0.7588951
Ser tca 1.2002650 0.8754543 0.9097699
Ser tcc 0.9203071 1.2468006 1.1696108
Ser tcg 0.8883520 0.9593343 0.9682205
Ser tct 0.9088500 0.7958801 0.8194339
Stp taa 0.9414922 0.9103797 0.9010691
Stp tag 0.5427760 0.5303205 0.4967124
Stp tga 1.5157319 1.5592998 1.6022184
Thr aca 1.3133425 0.9885695 1.0494975
Thr acc 0.8188716 1.2281014 1.1612207
Thr acg 0.8688353 0.9026018 0.9395052
Thr act 0.9989507 0.8807272 0.8497766
Trp tgg 1.0000000 1.0000000 1.0000000
Tyr tac 0.9453102 1.0919401 1.1099930
Tyr tat 1.0546898 0.9080599 0.8900070
Val gta 0.6523197 0.5568126 0.5847671
Val gtc 1.0295349 0.9879666 1.0574421
Val gtg 1.3413280 1.5175623 1.4025872
Val gtt 0.9768174 0.9376584 0.9552036
ymin=min(codons)
ymax=max(codons)
par(mar=c(8,5,3,1))
plot(codons[,1],xaxt = "n",pch=19,col="blue",cex=1,
  ylim=c(ymin,ymax),ylab="RSCU",xlab="")
axis(1, at=1:nrow(codons), labels=rownames(codons),las=2,cex.axis=0.5)
points(codons[,2],pch=19,col="gray",cex=1)
points(codons[,3],pch=19,col="red",cex=1)
grid()

Now take a look at the amino acids encoded.

pep_high <- aggregate(eff ~ AA,codon_high,sum)
aa <- as.character(pep_high[,1])
pep_high <- pep_high[,2]
pep_med <- aggregate(eff ~ AA,codon_med,sum)
pep_med <- pep_med[,2]
pep_low <- aggregate(eff ~ AA,codon_low,sum)
pep_low <- pep_low[,2]
pep <- cbind(pep_high,pep_med,pep_low)
rownames(pep) <- aa
pep <- pep[which(rownames(pep)!="Stp"),]
pep <- apply(pep,2,function(x) {x/sum(x)} )

pep %>%
  kbl(caption="Amino acid frequency in high med and low expressed genes") %>%
  kable_paper("hover", full_width = F)
Amino acid frequency in high med and low expressed genes
pep_high pep_med pep_low
Ala 0.0884104 0.0774428 0.0667674
Arg 0.0956875 0.0819181 0.0914341
Asn 0.0321774 0.0370424 0.0346905
Asp 0.0391127 0.0408682 0.0277041
Cys 0.0299473 0.0291160 0.0367840
Gln 0.0573575 0.0509123 0.0382341
Glu 0.0471448 0.0499169 0.0359995
Gly 0.0822118 0.0725126 0.0513163
His 0.0398148 0.0350778 0.0271181
Ile 0.0365243 0.0435076 0.0531580
Leu 0.0794041 0.0869148 0.0867061
Lys 0.0373828 0.0427372 0.0445718
Met 0.0144138 0.0184485 0.0225747
Phe 0.0257892 0.0324793 0.0372594
Pro 0.0631918 0.0621991 0.0685714
Ser 0.0882392 0.0887488 0.1076544
Thr 0.0549269 0.0554685 0.0693000
Trp 0.0185724 0.0179974 0.0290857
Tyr 0.0209107 0.0256307 0.0239591
Val 0.0487807 0.0510609 0.0471113
ymin=min(pep)
ymax=max(pep)
plot(pep[,3],xaxt = "n",pch=19,col="blue",cex=2,
  ylim=c(ymin,ymax), xlab="amino acid",ylab="frequency")
axis(1, at=1:nrow(pep), labels=rownames(pep),las=2)
points(pep[,2],pch=19,col="gray",cex=2)
points(pep[,1],pch=19,col="red",cex=2)

Human

The study ERP000546 consists of different human tissues.

mdat <- getDEE2Metadata("hsapiens")

mdat1 <- mdat[which(mdat$SRP_accession %in% "ERP000546"),]

SRRvec <- as.vector(mdat1$SRR_accession)

x <- getDEE2("hsapiens",SRRvec,metadata=mdat,counts="GeneCounts",legacy=TRUE)
## For more information about DEE2 QC metrics, visit
##     https://github.com/markziemann/dee2/blob/master/qc/qc_metrics.md
head(x$GeneCounts) %>%
  kbl(caption="sample of gene expression data") %>%
  kable_paper("hover", full_width = F)
sample of gene expression data
ERR030856 ERR030857 ERR030858 ERR030859 ERR030860 ERR030861 ERR030862 ERR030863 ERR030864 ERR030865 ERR030866 ERR030867 ERR030868 ERR030869 ERR030870 ERR030871 ERR030872 ERR030873 ERR030874 ERR030875 ERR030876 ERR030877 ERR030878 ERR030879 ERR030880 ERR030881 ERR030882 ERR030883 ERR030884 ERR030885 ERR030886 ERR030887 ERR030888 ERR030889 ERR030890 ERR030891 ERR030892 ERR030893 ERR030894 ERR030895 ERR030896 ERR030897 ERR030898 ERR030899 ERR030900 ERR030901 ERR030902 ERR030903
ENSG00000223972 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 16 0 7 0 0 0 0 0 0 2 1 0 0 0 0 0 0 2 4 0 0 0 0 0 0 0 0 0 0 3 0
ENSG00000227232 9 6 7 8 12 12 8 4 16 10 10 14 9 5 1 12 31 73 85 18 8 91 114 9 23 70 27 29 37 12 5 25 1 6 1 1 4 0 0 0 1 15 2 0 0 29 8 1
ENSG00000278267 0 1 0 2 3 0 2 0 0 1 0 0 1 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 4 1 0 6 3 0 1 1 8 11 0 2 9 5 0
ENSG00000243485 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0
ENSG00000284332 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
ENSG00000237613 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
xx <- x$GeneCounts

rpm <- apply(xx,2,function(x) {x/sum(x)} ) * 1000000

lrpm <- log10(rpm+1)

means <- rowMeans(lrpm)

hist(means)

Now separate high, medium and low expressed genes into text files.

means <- means[order(means)]

qnt <- rev(quantile(means,seq(0,1,1/3)))
qnt <- qnt[2:length(qnt)]

high <- names(which(means > qnt[1]))
med <- names(which(means < qnt[1] & means > qnt[2] ))
low <- names(which(means < qnt[2]))

str(high)
##  chr [1:19434] "ENSG00000230076" "ENSG00000282995" "ENSG00000236393" ...
str(med)
##  chr [1:19434] "ENSG00000264712" "ENSG00000275875" "ENSG00000185962" ...
str(low)
##  chr [1:19434] "ENSG00000284332" "ENSG00000237613" "ENSG00000268020" ...
writeLines(high,con="hum_high.txt")
writeLines(med,con="hum_med.txt")
writeLines(low,con="hum_low.txt")
if ( ! file.exists("hum_cdna.fa.gz") ) {
  download.file("ftp://ftp.ensembl.org/pub/release-90/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz",
    destfile="hum_cdna.fa.gz")
}

fa <- read.fasta("hum_cdna.fa.gz")

Now the tricky part is to select one representative transcript isoform for each gene (the first one in the fasta file).

seqinfo <- unlist(getAnnot(fa))

genes <- gsub("gene:","",sapply(strsplit(seqinfo," "),"[[",4))

mygene <- unique(genes)[1]

myseqs <- lapply(unique(genes),function(g) {
  myseq <- head(fa[which(genes %in% g)],1)
})

Great. Now need to group the sequences by their membership in high, medium or low groups.

seqinfo <- unlist(getAnnot(myseqs))

genes <- gsub("gene:","",sapply(strsplit(seqinfo," "),"[[",4))
genes <- sapply(strsplit(genes,"\\."),"[[",1)

fa_high <- myseqs[which(genes %in% high)]

fa_med <- myseqs[which(genes %in% med)]

fa_low <- myseqs[which(genes %in% low)]
count_high1 <- seqinr::count(seq=unlist(fa_high),wordsize=1,freq=TRUE)
count_med1 <- seqinr::count(seq=unlist(fa_med),wordsize=1,freq=TRUE)
count_low1 <- seqinr::count(seq=unlist(fa_low),wordsize=1,freq=TRUE)
res1 <- rbind(count_low1, count_med1, count_high1)
res1 %>%
  kbl(caption="comparison of nucleotide content of high, medium and low expressed genes in human") %>%
  kable_paper("hover", full_width = F)
comparison of nucleotide content of high, medium and low expressed genes in human
a c g t
count_low1 0.2743642 0.2398423 0.2378671 0.2479265
count_med1 0.2693717 0.2451252 0.2480154 0.2374877
count_high1 0.2620743 0.2406433 0.2455658 0.2517167

Now here I can show the dinucleotide content of high, medium and low expressed genes.

count_high2 <- seqinr::count(seq=unlist(fa_high),wordsize=2,freq=TRUE)
count_med2 <- seqinr::count(seq=unlist(fa_med),wordsize=2,freq=TRUE)
count_low2 <- seqinr::count(seq=unlist(fa_low),wordsize=2,freq=TRUE)
res2 <- rbind(count_low2, count_med2, count_high2)
res2 %>%
  kbl(caption="comparison of dinucleotide content of high, medium and low expressed genes in human") %>%
  kable_paper("hover", full_width = F)
comparison of dinucleotide content of high, medium and low expressed genes in human
aa ac ag at ca cc cg ct ga gc gg gt ta tc tg tt
count_low2 0.0814479 0.0540638 0.0766824 0.0621701 0.0802701 0.0679467 0.0181863 0.0734392 0.0696699 0.0578420 0.0628600 0.0474950 0.0429762 0.0599897 0.0801384 0.0648222
count_med2 0.0799966 0.0535861 0.0783864 0.0574026 0.0783768 0.0714370 0.0232857 0.0720257 0.0714253 0.0616405 0.0680461 0.0469035 0.0395730 0.0584617 0.0782971 0.0611559
count_high2 0.0779429 0.0509775 0.0749316 0.0582223 0.0727886 0.0705796 0.0262510 0.0710241 0.0671368 0.0615443 0.0677325 0.0491522 0.0442059 0.0575419 0.0766507 0.0733181

Super. Now we can make a chart of dinucleotide differences.

ymin=min(res2)
ymax=max(res2)
plot(res2[1,],xaxt = "n",pch=19,col="blue",cex=2,
  ylim=c(ymin,ymax), xlab="dinucleotide",ylab="frequency")
axis(1, at=1:ncol(res2), labels=colnames(res2))
points(res2[2,],pch=19,col="gray",cex=2)
points(res2[3,],pch=19,col="red",cex=2)

Now we can look at whether the dinucleotides are more highly represented in higher expressed genes as compared to lowly expressed.

rep <- apply(res2,2,function(x) { log2(x[3]/x[1]) } )

barplot(rep,ylab="log2 fold enrichment in higher expressed genes")

save.image("hum.Rdata")

Now look at the amino acid composition starting with CDS sequence.

Select one representative CDS per gene.

if ( ! file.exists("hum_cds.fa.gz") ) {
  download.file("ftp://ftp.ensembl.org/pub/release-90/fasta/homo_sapiens/cds/Homo_sapiens.GRCh38.cds.all.fa.gz",
    destfile="hum_cds.fa.gz")
}

fa <- read.fasta("hum_cds.fa.gz")
seqinfo <- unlist(getAnnot(fa))
genes <- gsub("gene:","",sapply(strsplit(seqinfo," "),"[[",4))
mygene <- unique(genes)[1]

myseqs <- lapply(unique(genes),function(g) {
  myseq <- head(fa[which(genes %in% g)],1)
})

seqinfo <- unlist(getAnnot(myseqs))
genes <- gsub("gene:","",sapply(strsplit(seqinfo," "),"[[",4))
genes <- sapply(strsplit(genes,"\\."),"[[",1)

fa_high <- myseqs[which(genes %in% high)]
fa_med <- myseqs[which(genes %in% med)]
fa_low <- myseqs[which(genes %in% low)]
codon_high <- uco(unlist(fa_high),index="rscu",as.data.frame=TRUE)
codon_high <- codon_high[order(codon_high$AA),]
codon_med <- uco(unlist(fa_med),index="rscu",as.data.frame=TRUE)
codon_med <- codon_med[order(codon_med$AA),]
codon_low <- uco(unlist(fa_low),index="rscu",as.data.frame=TRUE)
codon_low <- codon_low[order(codon_low$AA),]
codons <- data.frame(codon_low$RSCU,  codon_med$RSCU, codon_high$RSCU )
rownames(codons) <- paste(codon_low$AA, codon_low$codon)

codons %>%
  kbl(caption="RSCU of high med and low expressed genes") %>%
  kable_paper("hover", full_width = F)
RSCU of high med and low expressed genes
codon_low.RSCU codon_med.RSCU codon_high.RSCU
Ala gca 0.9894397 1.0112743 1.1061142
Ala gcc 1.2810703 1.2707586 1.2599620
Ala gcg 0.4127927 0.5870928 0.5284923
Ala gct 1.3166974 1.1308744 1.1054315
Arg aga 1.8170871 1.7345745 1.9297726
Arg agg 1.5844589 1.6060744 1.5509658
Arg cga 0.5602116 0.5330014 0.5761040
Arg cgc 0.7542718 0.8236732 0.7007051
Arg cgg 0.7830757 0.8612334 0.8130682
Arg cgt 0.5008950 0.4414429 0.4293843
Asn aac 0.9717834 1.0492910 1.0109507
Asn aat 1.0282166 0.9507090 0.9890493
Asp gac 1.0123552 1.0837037 1.0267129
Asp gat 0.9876448 0.9162963 0.9732871
Cys tgc 1.0506312 1.1620409 1.1297790
Cys tgt 0.9493688 0.8379591 0.8702210
Gln caa 0.8219964 0.7898453 0.8036519
Gln cag 1.1780036 1.2101547 1.1963481
Glu gaa 1.0000000 0.9595339 1.0183375
Glu gag 1.0000000 1.0404661 0.9816625
Gly gga 1.2287912 1.2202103 1.3320889
Gly ggc 1.1710864 1.1647688 1.1169796
Gly ggg 0.9001645 0.9941510 0.9307848
Gly ggt 0.6999579 0.6208699 0.6201468
His cac 0.9883525 1.0368078 1.0358240
His cat 1.0116475 0.9631922 0.9641760
Ile ata 0.6505044 0.6747759 0.7291258
Ile atc 1.2772926 1.3303590 1.1901218
Ile att 1.0722030 0.9948651 1.0807524
Leu cta 0.6181735 0.6188255 0.5953331
Leu ctc 1.2455255 1.2731062 1.1491734
Leu ctg 1.7787976 1.9501849 1.8460329
Leu ctt 1.1194738 1.0137119 1.0006725
Leu tta 0.4610678 0.4162357 0.5293014
Leu ttg 0.7769619 0.7279359 0.8794867
Lys aaa 0.9711698 0.9459468 0.9773664
Lys aag 1.0288302 1.0540532 1.0226336
Met atg 1.0000000 1.0000000 1.0000000
Phe ttc 1.0611183 1.1127826 1.0478631
Phe ttt 0.9388817 0.8872174 0.9521369
Pro cca 1.1692184 1.2061131 1.2417976
Pro ccc 1.0425810 1.0802272 1.0816364
Pro ccg 0.4418238 0.5586692 0.5644940
Pro cct 1.3463768 1.1549905 1.1120719
Ser agc 1.1761974 1.3679601 1.4073696
Ser agt 0.7321352 0.7564830 0.8844682
Ser tca 1.1710379 1.1633220 1.1620048
Ser tcc 1.2848912 1.2285899 1.1382656
Ser tcg 0.3088830 0.3945775 0.3946817
Ser tct 1.3268553 1.0890674 1.0132101
Stp taa 0.6730258 0.6546046 0.6951968
Stp tag 0.4826195 0.4963140 0.4924823
Stp tga 1.8443548 1.8490813 1.8123209
Thr aca 1.2374807 1.2548078 1.3041955
Thr acc 1.2908112 1.2700880 1.2153266
Thr acg 0.4061650 0.5185654 0.5009106
Thr act 1.0655432 0.9565388 0.9795673
Trp tgg 1.0000000 1.0000000 1.0000000
Tyr tac 0.9635147 1.0826873 1.0380065
Tyr tat 1.0364853 0.9173127 0.9619935
Val gta 0.5458107 0.5146289 0.5822500
Val gtc 1.0193093 1.0615461 0.9626327
Val gtg 1.5821347 1.6156262 1.5949046
Val gtt 0.8527453 0.8081988 0.8602127
ymin=min(codons)
ymax=max(codons)
par(mar=c(8,5,3,1))
plot(codons[,1],xaxt = "n",pch=19,col="blue",cex=1,
  ylim=c(ymin,ymax),ylab="RSCU",xlab="")
axis(1, at=1:nrow(codons), labels=rownames(codons),las=2,cex.axis=0.5)
points(codons[,2],pch=19,col="gray",cex=1)
points(codons[,3],pch=19,col="red",cex=1)
grid()

Now take a look at the amino acids encoded.

pep_high <- aggregate(eff ~ AA,codon_high,sum)
aa <- as.character(pep_high[,1])
pep_high <- pep_high[,2]
pep_med <- aggregate(eff ~ AA,codon_med,sum)
pep_med <- pep_med[,2]
pep_low <- aggregate(eff ~ AA,codon_low,sum)
pep_low <- pep_low[,2]
pep <- cbind(pep_high,pep_med,pep_low)
rownames(pep) <- aa
pep <- pep[which(rownames(pep)!="Stp"),]
pep <- apply(pep,2,function(x) {x/sum(x)} )

pep %>%
  kbl(caption="Amino acid frequency in high med and low expressed genes") %>%
  kable_paper("hover", full_width = F)
Amino acid frequency in high med and low expressed genes
pep_high pep_med pep_low
Ala 0.0718611 0.0774976 0.0738341
Arg 0.0814834 0.0839017 0.0682041
Asn 0.0283601 0.0237934 0.0254697
Asp 0.0308189 0.0269745 0.0287467
Cys 0.0349691 0.0380265 0.0421988
Gln 0.0493456 0.0484853 0.0445047
Glu 0.0488614 0.0424199 0.0415329
Gly 0.0751684 0.0794441 0.0725133
His 0.0326326 0.0343474 0.0378785
Ile 0.0345460 0.0293621 0.0368546
Leu 0.0941096 0.0972487 0.1088324
Lys 0.0469462 0.0373357 0.0367658
Met 0.0189075 0.0169432 0.0190433
Phe 0.0302083 0.0285884 0.0362747
Pro 0.0791495 0.0913772 0.0828244
Ser 0.0944282 0.0973624 0.0968037
Thr 0.0566372 0.0566146 0.0520299
Trp 0.0251918 0.0282016 0.0241323
Tyr 0.0195021 0.0172176 0.0219790
Val 0.0468730 0.0448581 0.0495770
ymin=min(pep)
ymax=max(pep)
plot(pep[,3],xaxt = "n",pch=19,col="blue",cex=2,
  ylim=c(ymin,ymax), xlab="amino acid",ylab="frequency")
axis(1, at=1:nrow(pep), labels=rownames(pep),las=2)
points(pep[,2],pch=19,col="gray",cex=2)
points(pep[,1],pch=19,col="red",cex=2)

Session

For reproducibility.

sessionInfo()
## R version 4.2.0 (2022-04-22)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
## 
## locale:
##  [1] LC_CTYPE=en_AU.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_AU.UTF-8        LC_COLLATE=en_AU.UTF-8    
##  [5] LC_MONETARY=en_AU.UTF-8    LC_MESSAGES=en_AU.UTF-8   
##  [7] LC_PAPER=en_AU.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] gplots_3.1.3      seqinr_4.2-16     kableExtra_1.3.4  getDEE2_1.6.0    
## [5] R.utils_2.11.0    R.oo_1.25.0       R.methodsS3_1.8.2
## 
## loaded via a namespace (and not attached):
##  [1] SummarizedExperiment_1.26.1 gtools_3.9.2.2             
##  [3] xfun_0.31                   bslib_0.3.1                
##  [5] lattice_0.20-45             colorspace_2.0-3           
##  [7] htmltools_0.5.2             stats4_4.2.0               
##  [9] viridisLite_0.4.0           yaml_2.3.5                 
## [11] rlang_1.0.2                 jquerylib_0.1.4            
## [13] glue_1.6.2                  BiocGenerics_0.42.0        
## [15] matrixStats_0.62.0          GenomeInfoDbData_1.2.8     
## [17] lifecycle_1.0.1             stringr_1.4.0              
## [19] zlibbioc_1.42.0             MatrixGenerics_1.8.0       
## [21] munsell_0.5.0               htm2txt_2.2.2              
## [23] rvest_1.0.2                 caTools_1.18.2             
## [25] evaluate_0.15               Biobase_2.56.0             
## [27] knitr_1.39                  IRanges_2.30.0             
## [29] fastmap_1.1.0               GenomeInfoDb_1.32.2        
## [31] highr_0.9                   KernSmooth_2.23-20         
## [33] scales_1.2.0                DelayedArray_0.22.0        
## [35] S4Vectors_0.34.0            webshot_0.5.3              
## [37] jsonlite_1.8.0              XVector_0.36.0             
## [39] systemfonts_1.0.4           digest_0.6.29              
## [41] stringi_1.7.6               ade4_1.7-19                
## [43] GenomicRanges_1.48.0        grid_4.2.0                 
## [45] cli_3.3.0                   tools_4.2.0                
## [47] bitops_1.0-7                magrittr_2.0.3             
## [49] sass_0.4.1                  RCurl_1.98-1.7             
## [51] MASS_7.3-57                 Matrix_1.4-1               
## [53] xml2_1.3.3                  rmarkdown_2.14             
## [55] svglite_2.1.0               httr_1.4.3                 
## [57] rstudioapi_0.13             R6_2.5.1                   
## [59] compiler_4.2.0