Here I want to download gene expression data for human and fly and rank the genes by their overall gene expression. The data will be fetched from DEE2.io with accession numbers SRP002072 (fly) and ERP000546 (human). Then we will look at their dinucleotide profiles in high, medium and low expressed genes.
suppressPackageStartupMessages({
library("R.utils")
library("getDEE2")
library("kableExtra")
library("seqinr")
library("gplots")
})
Study SRP002072 was published in 2011 as the first transcriptome of D. melanogaster, sampling 10 developmental stages.
mdat <- getDEE2Metadata("dmelanogaster")
mdat1 <- mdat[which(mdat$SRP_accession %in% "SRP002072"),]
SRRvec <- as.vector(mdat1$SRR_accession)
x <- getDEE2("dmelanogaster",SRRvec,metadata=mdat,counts="GeneCounts",legacy=TRUE)
## For more information about DEE2 QC metrics, visit
## https://github.com/markziemann/dee2/blob/master/qc/qc_metrics.md
head(x$GeneCounts) %>%
kbl(caption="sample of gene expression data") %>%
kable_paper("hover", full_width = F)
SRR040803 | SRR042295 | SRR042296 | SRR042297 | SRR042298 | SRR042412 | SRR042423 | SRR042489 | SRR042490 | SRR058881 | SRR058885 | SRR059066 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
FBgn0085804 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
FBgn0267431 | 0 | 0 | 0 | 0 | 1 | 3 | 8 | 0 | 0 | 0 | 0 | 0 |
FBgn0039987 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
FBgn0058182 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
FBgn0267430 | 1 | 0 | 0 | 4 | 3 | 4 | 3 | 0 | 1 | 0 | 1 | 2 |
FBgn0266747 | 0 | 0 | 0 | 2 | 2 | 0 | 3 | 0 | 0 | 0 | 0 | 0 |
xx <- x$GeneCounts
rpm <- apply(xx,2,function(x) {x/sum(x)} ) * 1000000
lrpm <- log10(rpm+1)
means <- rowMeans(lrpm)
hist(means)
Now separate high, medium and low expressed genes into text files.
means <- means[order(means)]
qnt <- rev(quantile(means,seq(0,1,1/3)))
qnt <- qnt[2:length(qnt)]
high <- names(which(means > qnt[1]))
med <- names(which(means < qnt[1] & means > qnt[2] ))
low <- names(which(means < qnt[2]))
str(high)
## chr [1:5853] "FBgn0053505" "FBgn0036005" "FBgn0021875" "FBgn0036090" ...
str(med)
## chr [1:5853] "FBgn0036289" "FBgn0034622" "FBgn0264702" "FBgn0028658" ...
str(low)
## chr [1:5853] "FBgn0058182" "FBgn0262214" "FBgn0040687" "FBgn0264983" ...
writeLines(high,con="fly_high.txt")
writeLines(med,con="fly_med.txt")
writeLines(low,con="fly_low.txt")
Now get the gene sequences and load into R.
if ( ! file.exists("fly_cdna.fa.gz") ) {
download.file("ftp://ftp.ensembl.org/pub/release-90/fasta/drosophila_melanogaster/cdna/Drosophila_melanogaster.BDGP6.cdna.all.fa.gz",
destfile="fly_cdna.fa.gz")
}
fa <- read.fasta("fly_cdna.fa.gz")
Now the tricky part is to select one representative transcript isoform for each gene (the first one in the fasta file).
seqinfo <- unlist(getAnnot(fa))
genes <- gsub("gene:","",sapply(strsplit(seqinfo," "),"[[",4))
mygene <- unique(genes)[1]
myseqs <- lapply(unique(genes),function(g) {
myseq <- head(fa[which(genes %in% g)],1)
})
Great. Now need to group the sequences by their membership in high, medium or low groups.
seqinfo <- unlist(getAnnot(myseqs))
genes <- gsub("gene:","",sapply(strsplit(seqinfo," "),"[[",4))
fa_high <- myseqs[which(genes %in% high)]
fa_med <- myseqs[which(genes %in% med)]
fa_low <- myseqs[which(genes %in% low)]
Now here I can show the nucleotide content of high, medium and low expressed genes.
count_high1 <- seqinr::count(seq=unlist(fa_high),wordsize=1,freq=TRUE)
count_med1 <- seqinr::count(seq=unlist(fa_med),wordsize=1,freq=TRUE)
count_low1 <- seqinr::count(seq=unlist(fa_low),wordsize=1,freq=TRUE)
res1 <- rbind(count_low1, count_med1, count_high1)
res1 %>%
kbl(caption="comparison of nucleotide content of high, medium and low expressed genes in drosophila") %>%
kable_paper("hover", full_width = F)
a | c | g | t | |
---|---|---|---|---|
count_low1 | 0.2834623 | 0.2338232 | 0.2323716 | 0.2503429 |
count_med1 | 0.2776779 | 0.2447269 | 0.2417666 | 0.2358285 |
count_high1 | 0.2760666 | 0.2526660 | 0.2478667 | 0.2234006 |
Now here I can show the dinucleotide content of high, medium and low expressed genes.
count_high2 <- seqinr::count(seq=unlist(fa_high),wordsize=2,freq=TRUE)
count_med2 <- seqinr::count(seq=unlist(fa_med),wordsize=2,freq=TRUE)
count_low2 <- seqinr::count(seq=unlist(fa_low),wordsize=2,freq=TRUE)
res2 <- rbind(count_low2, count_med2, count_high2)
res2 %>%
kbl(caption="comparison of dinucleotide content of high, medium and low expressed genes in drosophila") %>%
kable_paper("hover", full_width = F)
aa | ac | ag | at | ca | cc | cg | ct | ga | gc | gg | gt | ta | tc | tg | tt | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count_low2 | 0.0911473 | 0.0576155 | 0.0617578 | 0.0729413 | 0.0737129 | 0.0550199 | 0.0484969 | 0.0565936 | 0.0678081 | 0.0620639 | 0.0539476 | 0.0485522 | 0.0507936 | 0.0591241 | 0.0681694 | 0.0722560 |
count_med2 | 0.0861690 | 0.0587789 | 0.0638489 | 0.0688812 | 0.0760625 | 0.0597220 | 0.0536413 | 0.0553011 | 0.0685853 | 0.0682812 | 0.0570483 | 0.0478518 | 0.0468612 | 0.0579449 | 0.0672280 | 0.0637945 |
count_high2 | 0.0838555 | 0.0615068 | 0.0664435 | 0.0642608 | 0.0794284 | 0.0606216 | 0.0592451 | 0.0533710 | 0.0688855 | 0.0745777 | 0.0568739 | 0.0475297 | 0.0438973 | 0.0559600 | 0.0653042 | 0.0582392 |
Super. Now we can make a chart of dinucleotide differences.
ymin=min(res2)
ymax=max(res2)
plot(res2[1,],xaxt = "n",pch=19,col="blue",cex=2,
ylim=c(ymin,ymax), xlab="dinucleotide",ylab="frequency")
axis(1, at=1:ncol(res2), labels=colnames(res2))
points(res2[2,],pch=19,col="gray",cex=2)
points(res2[3,],pch=19,col="red",cex=2)
Now we can look at whether the dinucleotides are more highly represented in higher expressed genes as compared to lowly expressed.
rep <- apply(res2,2,function(x) { log2(x[3]/x[1]) } )
barplot(rep,ylab="log2 fold enrichment in higher expressed genes")
save.image("fly.Rdata")
Now look at the amino acid composition starting with CDS sequence.
Select one representative CDS per gene.
if ( ! file.exists("fly_cds.fa.gz") ) {
download.file("ftp://ftp.ensembl.org/pub/release-90/fasta/drosophila_melanogaster/cds/Drosophila_melanogaster.BDGP6.cds.all.fa.gz",
destfile="fly_cds.fa.gz")
}
fa <- read.fasta("fly_cds.fa.gz")
seqinfo <- unlist(getAnnot(fa))
genes <- gsub("gene:","",sapply(strsplit(seqinfo," "),"[[",4))
mygene <- unique(genes)[1]
myseqs <- lapply(unique(genes),function(g) {
myseq <- head(fa[which(genes %in% g)],1)
})
seqinfo <- unlist(getAnnot(myseqs))
genes <- gsub("gene:","",sapply(strsplit(seqinfo," "),"[[",4))
fa_high <- myseqs[which(genes %in% high)]
fa_med <- myseqs[which(genes %in% med)]
fa_low <- myseqs[which(genes %in% low)]
codon_high <- uco(unlist(fa_high),index="rscu",as.data.frame=TRUE)
codon_high <- codon_high[order(codon_high$AA),]
codon_med <- uco(unlist(fa_med),index="rscu",as.data.frame=TRUE)
codon_med <- codon_med[order(codon_med$AA),]
codon_low <- uco(unlist(fa_low),index="rscu",as.data.frame=TRUE)
codon_low <- codon_low[order(codon_low$AA),]
codons <- data.frame(codon_low$RSCU, codon_med$RSCU, codon_high$RSCU )
rownames(codons) <- paste(codon_low$AA, codon_low$codon)
codons %>%
kbl(caption="RSCU of high med and low expressed genes") %>%
kable_paper("hover", full_width = F)
codon_low.RSCU | codon_med.RSCU | codon_high.RSCU | |
---|---|---|---|
Ala gca | 1.2510629 | 1.0151353 | 1.1222023 |
Ala gcc | 0.9709917 | 1.3096806 | 1.1620967 |
Ala gcg | 0.8218662 | 0.7652104 | 0.7948555 |
Ala gct | 0.9560792 | 0.9099737 | 0.9208455 |
Arg aga | 1.3272054 | 0.9022233 | 0.8097677 |
Arg agg | 1.3457421 | 0.8684332 | 0.7293770 |
Arg cga | 0.9565643 | 1.3277980 | 1.5022731 |
Arg cgc | 0.8523186 | 1.1568146 | 1.1711491 |
Arg cgg | 0.9707880 | 1.0044904 | 1.0126115 |
Arg cgt | 0.5473816 | 0.7402405 | 0.7748217 |
Asn aac | 0.9423953 | 1.0036371 | 1.0851978 |
Asn aat | 1.0576047 | 0.9963629 | 0.9148022 |
Asp gac | 0.8443794 | 0.8480066 | 0.9193470 |
Asp gat | 1.1556206 | 1.1519934 | 1.0806530 |
Cys tgc | 1.2084094 | 1.2503330 | 1.2471711 |
Cys tgt | 0.7915906 | 0.7496670 | 0.7528289 |
Gln caa | 1.1050071 | 0.9666172 | 1.0666840 |
Gln cag | 0.8949929 | 1.0333828 | 0.9333160 |
Glu gaa | 1.1310259 | 0.9421185 | 0.9839191 |
Glu gag | 0.8689741 | 1.0578815 | 1.0160809 |
Gly gga | 1.4129060 | 1.4493083 | 1.4782987 |
Gly ggc | 1.1043165 | 1.2220502 | 1.2258739 |
Gly ggg | 0.7425332 | 0.5616481 | 0.5659289 |
Gly ggt | 0.7402442 | 0.7669934 | 0.7298985 |
His cac | 0.9775165 | 1.0275372 | 1.0155336 |
His cat | 1.0224835 | 0.9724628 | 0.9844664 |
Ile ata | 0.8607845 | 0.6647587 | 0.5822379 |
Ile atc | 1.1557099 | 1.3134994 | 1.4254683 |
Ile att | 0.9835056 | 1.0217420 | 0.9922939 |
Leu cta | 0.7348677 | 0.7591808 | 0.8007587 |
Leu ctc | 0.8430560 | 1.0434966 | 1.1661689 |
Leu ctg | 1.2889699 | 1.8378223 | 1.7453892 |
Leu ctt | 0.9009241 | 0.9381465 | 1.0486501 |
Leu tta | 0.9334387 | 0.4374372 | 0.4034263 |
Leu ttg | 1.2987436 | 0.9839166 | 0.8356068 |
Lys aaa | 1.1367615 | 0.8858341 | 0.8821693 |
Lys aag | 0.8632385 | 1.1141659 | 1.1178307 |
Met atg | 1.0000000 | 1.0000000 | 1.0000000 |
Phe ttc | 0.9867888 | 1.0959713 | 1.1461263 |
Phe ttt | 1.0132112 | 0.9040287 | 0.8538737 |
Pro cca | 1.5317222 | 1.3250192 | 1.3829790 |
Pro ccc | 0.7008055 | 0.8862955 | 0.7845434 |
Pro ccg | 0.8703987 | 0.9380444 | 0.9489809 |
Pro cct | 0.8970735 | 0.8506410 | 0.8834967 |
Ser agc | 1.1771170 | 1.2859666 | 1.3740698 |
Ser agt | 0.9051089 | 0.8365642 | 0.7588951 |
Ser tca | 1.2002650 | 0.8754543 | 0.9097699 |
Ser tcc | 0.9203071 | 1.2468006 | 1.1696108 |
Ser tcg | 0.8883520 | 0.9593343 | 0.9682205 |
Ser tct | 0.9088500 | 0.7958801 | 0.8194339 |
Stp taa | 0.9414922 | 0.9103797 | 0.9010691 |
Stp tag | 0.5427760 | 0.5303205 | 0.4967124 |
Stp tga | 1.5157319 | 1.5592998 | 1.6022184 |
Thr aca | 1.3133425 | 0.9885695 | 1.0494975 |
Thr acc | 0.8188716 | 1.2281014 | 1.1612207 |
Thr acg | 0.8688353 | 0.9026018 | 0.9395052 |
Thr act | 0.9989507 | 0.8807272 | 0.8497766 |
Trp tgg | 1.0000000 | 1.0000000 | 1.0000000 |
Tyr tac | 0.9453102 | 1.0919401 | 1.1099930 |
Tyr tat | 1.0546898 | 0.9080599 | 0.8900070 |
Val gta | 0.6523197 | 0.5568126 | 0.5847671 |
Val gtc | 1.0295349 | 0.9879666 | 1.0574421 |
Val gtg | 1.3413280 | 1.5175623 | 1.4025872 |
Val gtt | 0.9768174 | 0.9376584 | 0.9552036 |
ymin=min(codons)
ymax=max(codons)
par(mar=c(8,5,3,1))
plot(codons[,1],xaxt = "n",pch=19,col="blue",cex=1,
ylim=c(ymin,ymax),ylab="RSCU",xlab="")
axis(1, at=1:nrow(codons), labels=rownames(codons),las=2,cex.axis=0.5)
points(codons[,2],pch=19,col="gray",cex=1)
points(codons[,3],pch=19,col="red",cex=1)
grid()
Now take a look at the amino acids encoded.
pep_high <- aggregate(eff ~ AA,codon_high,sum)
aa <- as.character(pep_high[,1])
pep_high <- pep_high[,2]
pep_med <- aggregate(eff ~ AA,codon_med,sum)
pep_med <- pep_med[,2]
pep_low <- aggregate(eff ~ AA,codon_low,sum)
pep_low <- pep_low[,2]
pep <- cbind(pep_high,pep_med,pep_low)
rownames(pep) <- aa
pep <- pep[which(rownames(pep)!="Stp"),]
pep <- apply(pep,2,function(x) {x/sum(x)} )
pep %>%
kbl(caption="Amino acid frequency in high med and low expressed genes") %>%
kable_paper("hover", full_width = F)
pep_high | pep_med | pep_low | |
---|---|---|---|
Ala | 0.0884104 | 0.0774428 | 0.0667674 |
Arg | 0.0956875 | 0.0819181 | 0.0914341 |
Asn | 0.0321774 | 0.0370424 | 0.0346905 |
Asp | 0.0391127 | 0.0408682 | 0.0277041 |
Cys | 0.0299473 | 0.0291160 | 0.0367840 |
Gln | 0.0573575 | 0.0509123 | 0.0382341 |
Glu | 0.0471448 | 0.0499169 | 0.0359995 |
Gly | 0.0822118 | 0.0725126 | 0.0513163 |
His | 0.0398148 | 0.0350778 | 0.0271181 |
Ile | 0.0365243 | 0.0435076 | 0.0531580 |
Leu | 0.0794041 | 0.0869148 | 0.0867061 |
Lys | 0.0373828 | 0.0427372 | 0.0445718 |
Met | 0.0144138 | 0.0184485 | 0.0225747 |
Phe | 0.0257892 | 0.0324793 | 0.0372594 |
Pro | 0.0631918 | 0.0621991 | 0.0685714 |
Ser | 0.0882392 | 0.0887488 | 0.1076544 |
Thr | 0.0549269 | 0.0554685 | 0.0693000 |
Trp | 0.0185724 | 0.0179974 | 0.0290857 |
Tyr | 0.0209107 | 0.0256307 | 0.0239591 |
Val | 0.0487807 | 0.0510609 | 0.0471113 |
ymin=min(pep)
ymax=max(pep)
plot(pep[,3],xaxt = "n",pch=19,col="blue",cex=2,
ylim=c(ymin,ymax), xlab="amino acid",ylab="frequency")
axis(1, at=1:nrow(pep), labels=rownames(pep),las=2)
points(pep[,2],pch=19,col="gray",cex=2)
points(pep[,1],pch=19,col="red",cex=2)
The study ERP000546 consists of different human tissues.
mdat <- getDEE2Metadata("hsapiens")
mdat1 <- mdat[which(mdat$SRP_accession %in% "ERP000546"),]
SRRvec <- as.vector(mdat1$SRR_accession)
x <- getDEE2("hsapiens",SRRvec,metadata=mdat,counts="GeneCounts",legacy=TRUE)
## For more information about DEE2 QC metrics, visit
## https://github.com/markziemann/dee2/blob/master/qc/qc_metrics.md
head(x$GeneCounts) %>%
kbl(caption="sample of gene expression data") %>%
kable_paper("hover", full_width = F)
ERR030856 | ERR030857 | ERR030858 | ERR030859 | ERR030860 | ERR030861 | ERR030862 | ERR030863 | ERR030864 | ERR030865 | ERR030866 | ERR030867 | ERR030868 | ERR030869 | ERR030870 | ERR030871 | ERR030872 | ERR030873 | ERR030874 | ERR030875 | ERR030876 | ERR030877 | ERR030878 | ERR030879 | ERR030880 | ERR030881 | ERR030882 | ERR030883 | ERR030884 | ERR030885 | ERR030886 | ERR030887 | ERR030888 | ERR030889 | ERR030890 | ERR030891 | ERR030892 | ERR030893 | ERR030894 | ERR030895 | ERR030896 | ERR030897 | ERR030898 | ERR030899 | ERR030900 | ERR030901 | ERR030902 | ERR030903 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
ENSG00000223972 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 16 | 0 | 7 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0 |
ENSG00000227232 | 9 | 6 | 7 | 8 | 12 | 12 | 8 | 4 | 16 | 10 | 10 | 14 | 9 | 5 | 1 | 12 | 31 | 73 | 85 | 18 | 8 | 91 | 114 | 9 | 23 | 70 | 27 | 29 | 37 | 12 | 5 | 25 | 1 | 6 | 1 | 1 | 4 | 0 | 0 | 0 | 1 | 15 | 2 | 0 | 0 | 29 | 8 | 1 |
ENSG00000278267 | 0 | 1 | 0 | 2 | 3 | 0 | 2 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 4 | 1 | 0 | 6 | 3 | 0 | 1 | 1 | 8 | 11 | 0 | 2 | 9 | 5 | 0 |
ENSG00000243485 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
ENSG00000284332 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
ENSG00000237613 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
xx <- x$GeneCounts
rpm <- apply(xx,2,function(x) {x/sum(x)} ) * 1000000
lrpm <- log10(rpm+1)
means <- rowMeans(lrpm)
hist(means)
Now separate high, medium and low expressed genes into text files.
means <- means[order(means)]
qnt <- rev(quantile(means,seq(0,1,1/3)))
qnt <- qnt[2:length(qnt)]
high <- names(which(means > qnt[1]))
med <- names(which(means < qnt[1] & means > qnt[2] ))
low <- names(which(means < qnt[2]))
str(high)
## chr [1:19434] "ENSG00000230076" "ENSG00000282995" "ENSG00000236393" ...
str(med)
## chr [1:19434] "ENSG00000264712" "ENSG00000275875" "ENSG00000185962" ...
str(low)
## chr [1:19434] "ENSG00000284332" "ENSG00000237613" "ENSG00000268020" ...
writeLines(high,con="hum_high.txt")
writeLines(med,con="hum_med.txt")
writeLines(low,con="hum_low.txt")
if ( ! file.exists("hum_cdna.fa.gz") ) {
download.file("ftp://ftp.ensembl.org/pub/release-90/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz",
destfile="hum_cdna.fa.gz")
}
fa <- read.fasta("hum_cdna.fa.gz")
Now the tricky part is to select one representative transcript isoform for each gene (the first one in the fasta file).
seqinfo <- unlist(getAnnot(fa))
genes <- gsub("gene:","",sapply(strsplit(seqinfo," "),"[[",4))
mygene <- unique(genes)[1]
myseqs <- lapply(unique(genes),function(g) {
myseq <- head(fa[which(genes %in% g)],1)
})
Great. Now need to group the sequences by their membership in high, medium or low groups.
seqinfo <- unlist(getAnnot(myseqs))
genes <- gsub("gene:","",sapply(strsplit(seqinfo," "),"[[",4))
genes <- sapply(strsplit(genes,"\\."),"[[",1)
fa_high <- myseqs[which(genes %in% high)]
fa_med <- myseqs[which(genes %in% med)]
fa_low <- myseqs[which(genes %in% low)]
count_high1 <- seqinr::count(seq=unlist(fa_high),wordsize=1,freq=TRUE)
count_med1 <- seqinr::count(seq=unlist(fa_med),wordsize=1,freq=TRUE)
count_low1 <- seqinr::count(seq=unlist(fa_low),wordsize=1,freq=TRUE)
res1 <- rbind(count_low1, count_med1, count_high1)
res1 %>%
kbl(caption="comparison of nucleotide content of high, medium and low expressed genes in human") %>%
kable_paper("hover", full_width = F)
a | c | g | t | |
---|---|---|---|---|
count_low1 | 0.2743642 | 0.2398423 | 0.2378671 | 0.2479265 |
count_med1 | 0.2693717 | 0.2451252 | 0.2480154 | 0.2374877 |
count_high1 | 0.2620743 | 0.2406433 | 0.2455658 | 0.2517167 |
Now here I can show the dinucleotide content of high, medium and low expressed genes.
count_high2 <- seqinr::count(seq=unlist(fa_high),wordsize=2,freq=TRUE)
count_med2 <- seqinr::count(seq=unlist(fa_med),wordsize=2,freq=TRUE)
count_low2 <- seqinr::count(seq=unlist(fa_low),wordsize=2,freq=TRUE)
res2 <- rbind(count_low2, count_med2, count_high2)
res2 %>%
kbl(caption="comparison of dinucleotide content of high, medium and low expressed genes in human") %>%
kable_paper("hover", full_width = F)
aa | ac | ag | at | ca | cc | cg | ct | ga | gc | gg | gt | ta | tc | tg | tt | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count_low2 | 0.0814479 | 0.0540638 | 0.0766824 | 0.0621701 | 0.0802701 | 0.0679467 | 0.0181863 | 0.0734392 | 0.0696699 | 0.0578420 | 0.0628600 | 0.0474950 | 0.0429762 | 0.0599897 | 0.0801384 | 0.0648222 |
count_med2 | 0.0799966 | 0.0535861 | 0.0783864 | 0.0574026 | 0.0783768 | 0.0714370 | 0.0232857 | 0.0720257 | 0.0714253 | 0.0616405 | 0.0680461 | 0.0469035 | 0.0395730 | 0.0584617 | 0.0782971 | 0.0611559 |
count_high2 | 0.0779429 | 0.0509775 | 0.0749316 | 0.0582223 | 0.0727886 | 0.0705796 | 0.0262510 | 0.0710241 | 0.0671368 | 0.0615443 | 0.0677325 | 0.0491522 | 0.0442059 | 0.0575419 | 0.0766507 | 0.0733181 |
Super. Now we can make a chart of dinucleotide differences.
ymin=min(res2)
ymax=max(res2)
plot(res2[1,],xaxt = "n",pch=19,col="blue",cex=2,
ylim=c(ymin,ymax), xlab="dinucleotide",ylab="frequency")
axis(1, at=1:ncol(res2), labels=colnames(res2))
points(res2[2,],pch=19,col="gray",cex=2)
points(res2[3,],pch=19,col="red",cex=2)
Now we can look at whether the dinucleotides are more highly represented in higher expressed genes as compared to lowly expressed.
rep <- apply(res2,2,function(x) { log2(x[3]/x[1]) } )
barplot(rep,ylab="log2 fold enrichment in higher expressed genes")
save.image("hum.Rdata")
Now look at the amino acid composition starting with CDS sequence.
Select one representative CDS per gene.
if ( ! file.exists("hum_cds.fa.gz") ) {
download.file("ftp://ftp.ensembl.org/pub/release-90/fasta/homo_sapiens/cds/Homo_sapiens.GRCh38.cds.all.fa.gz",
destfile="hum_cds.fa.gz")
}
fa <- read.fasta("hum_cds.fa.gz")
seqinfo <- unlist(getAnnot(fa))
genes <- gsub("gene:","",sapply(strsplit(seqinfo," "),"[[",4))
mygene <- unique(genes)[1]
myseqs <- lapply(unique(genes),function(g) {
myseq <- head(fa[which(genes %in% g)],1)
})
seqinfo <- unlist(getAnnot(myseqs))
genes <- gsub("gene:","",sapply(strsplit(seqinfo," "),"[[",4))
genes <- sapply(strsplit(genes,"\\."),"[[",1)
fa_high <- myseqs[which(genes %in% high)]
fa_med <- myseqs[which(genes %in% med)]
fa_low <- myseqs[which(genes %in% low)]
codon_high <- uco(unlist(fa_high),index="rscu",as.data.frame=TRUE)
codon_high <- codon_high[order(codon_high$AA),]
codon_med <- uco(unlist(fa_med),index="rscu",as.data.frame=TRUE)
codon_med <- codon_med[order(codon_med$AA),]
codon_low <- uco(unlist(fa_low),index="rscu",as.data.frame=TRUE)
codon_low <- codon_low[order(codon_low$AA),]
codons <- data.frame(codon_low$RSCU, codon_med$RSCU, codon_high$RSCU )
rownames(codons) <- paste(codon_low$AA, codon_low$codon)
codons %>%
kbl(caption="RSCU of high med and low expressed genes") %>%
kable_paper("hover", full_width = F)
codon_low.RSCU | codon_med.RSCU | codon_high.RSCU | |
---|---|---|---|
Ala gca | 0.9894397 | 1.0112743 | 1.1061142 |
Ala gcc | 1.2810703 | 1.2707586 | 1.2599620 |
Ala gcg | 0.4127927 | 0.5870928 | 0.5284923 |
Ala gct | 1.3166974 | 1.1308744 | 1.1054315 |
Arg aga | 1.8170871 | 1.7345745 | 1.9297726 |
Arg agg | 1.5844589 | 1.6060744 | 1.5509658 |
Arg cga | 0.5602116 | 0.5330014 | 0.5761040 |
Arg cgc | 0.7542718 | 0.8236732 | 0.7007051 |
Arg cgg | 0.7830757 | 0.8612334 | 0.8130682 |
Arg cgt | 0.5008950 | 0.4414429 | 0.4293843 |
Asn aac | 0.9717834 | 1.0492910 | 1.0109507 |
Asn aat | 1.0282166 | 0.9507090 | 0.9890493 |
Asp gac | 1.0123552 | 1.0837037 | 1.0267129 |
Asp gat | 0.9876448 | 0.9162963 | 0.9732871 |
Cys tgc | 1.0506312 | 1.1620409 | 1.1297790 |
Cys tgt | 0.9493688 | 0.8379591 | 0.8702210 |
Gln caa | 0.8219964 | 0.7898453 | 0.8036519 |
Gln cag | 1.1780036 | 1.2101547 | 1.1963481 |
Glu gaa | 1.0000000 | 0.9595339 | 1.0183375 |
Glu gag | 1.0000000 | 1.0404661 | 0.9816625 |
Gly gga | 1.2287912 | 1.2202103 | 1.3320889 |
Gly ggc | 1.1710864 | 1.1647688 | 1.1169796 |
Gly ggg | 0.9001645 | 0.9941510 | 0.9307848 |
Gly ggt | 0.6999579 | 0.6208699 | 0.6201468 |
His cac | 0.9883525 | 1.0368078 | 1.0358240 |
His cat | 1.0116475 | 0.9631922 | 0.9641760 |
Ile ata | 0.6505044 | 0.6747759 | 0.7291258 |
Ile atc | 1.2772926 | 1.3303590 | 1.1901218 |
Ile att | 1.0722030 | 0.9948651 | 1.0807524 |
Leu cta | 0.6181735 | 0.6188255 | 0.5953331 |
Leu ctc | 1.2455255 | 1.2731062 | 1.1491734 |
Leu ctg | 1.7787976 | 1.9501849 | 1.8460329 |
Leu ctt | 1.1194738 | 1.0137119 | 1.0006725 |
Leu tta | 0.4610678 | 0.4162357 | 0.5293014 |
Leu ttg | 0.7769619 | 0.7279359 | 0.8794867 |
Lys aaa | 0.9711698 | 0.9459468 | 0.9773664 |
Lys aag | 1.0288302 | 1.0540532 | 1.0226336 |
Met atg | 1.0000000 | 1.0000000 | 1.0000000 |
Phe ttc | 1.0611183 | 1.1127826 | 1.0478631 |
Phe ttt | 0.9388817 | 0.8872174 | 0.9521369 |
Pro cca | 1.1692184 | 1.2061131 | 1.2417976 |
Pro ccc | 1.0425810 | 1.0802272 | 1.0816364 |
Pro ccg | 0.4418238 | 0.5586692 | 0.5644940 |
Pro cct | 1.3463768 | 1.1549905 | 1.1120719 |
Ser agc | 1.1761974 | 1.3679601 | 1.4073696 |
Ser agt | 0.7321352 | 0.7564830 | 0.8844682 |
Ser tca | 1.1710379 | 1.1633220 | 1.1620048 |
Ser tcc | 1.2848912 | 1.2285899 | 1.1382656 |
Ser tcg | 0.3088830 | 0.3945775 | 0.3946817 |
Ser tct | 1.3268553 | 1.0890674 | 1.0132101 |
Stp taa | 0.6730258 | 0.6546046 | 0.6951968 |
Stp tag | 0.4826195 | 0.4963140 | 0.4924823 |
Stp tga | 1.8443548 | 1.8490813 | 1.8123209 |
Thr aca | 1.2374807 | 1.2548078 | 1.3041955 |
Thr acc | 1.2908112 | 1.2700880 | 1.2153266 |
Thr acg | 0.4061650 | 0.5185654 | 0.5009106 |
Thr act | 1.0655432 | 0.9565388 | 0.9795673 |
Trp tgg | 1.0000000 | 1.0000000 | 1.0000000 |
Tyr tac | 0.9635147 | 1.0826873 | 1.0380065 |
Tyr tat | 1.0364853 | 0.9173127 | 0.9619935 |
Val gta | 0.5458107 | 0.5146289 | 0.5822500 |
Val gtc | 1.0193093 | 1.0615461 | 0.9626327 |
Val gtg | 1.5821347 | 1.6156262 | 1.5949046 |
Val gtt | 0.8527453 | 0.8081988 | 0.8602127 |
ymin=min(codons)
ymax=max(codons)
par(mar=c(8,5,3,1))
plot(codons[,1],xaxt = "n",pch=19,col="blue",cex=1,
ylim=c(ymin,ymax),ylab="RSCU",xlab="")
axis(1, at=1:nrow(codons), labels=rownames(codons),las=2,cex.axis=0.5)
points(codons[,2],pch=19,col="gray",cex=1)
points(codons[,3],pch=19,col="red",cex=1)
grid()
Now take a look at the amino acids encoded.
pep_high <- aggregate(eff ~ AA,codon_high,sum)
aa <- as.character(pep_high[,1])
pep_high <- pep_high[,2]
pep_med <- aggregate(eff ~ AA,codon_med,sum)
pep_med <- pep_med[,2]
pep_low <- aggregate(eff ~ AA,codon_low,sum)
pep_low <- pep_low[,2]
pep <- cbind(pep_high,pep_med,pep_low)
rownames(pep) <- aa
pep <- pep[which(rownames(pep)!="Stp"),]
pep <- apply(pep,2,function(x) {x/sum(x)} )
pep %>%
kbl(caption="Amino acid frequency in high med and low expressed genes") %>%
kable_paper("hover", full_width = F)
pep_high | pep_med | pep_low | |
---|---|---|---|
Ala | 0.0718611 | 0.0774976 | 0.0738341 |
Arg | 0.0814834 | 0.0839017 | 0.0682041 |
Asn | 0.0283601 | 0.0237934 | 0.0254697 |
Asp | 0.0308189 | 0.0269745 | 0.0287467 |
Cys | 0.0349691 | 0.0380265 | 0.0421988 |
Gln | 0.0493456 | 0.0484853 | 0.0445047 |
Glu | 0.0488614 | 0.0424199 | 0.0415329 |
Gly | 0.0751684 | 0.0794441 | 0.0725133 |
His | 0.0326326 | 0.0343474 | 0.0378785 |
Ile | 0.0345460 | 0.0293621 | 0.0368546 |
Leu | 0.0941096 | 0.0972487 | 0.1088324 |
Lys | 0.0469462 | 0.0373357 | 0.0367658 |
Met | 0.0189075 | 0.0169432 | 0.0190433 |
Phe | 0.0302083 | 0.0285884 | 0.0362747 |
Pro | 0.0791495 | 0.0913772 | 0.0828244 |
Ser | 0.0944282 | 0.0973624 | 0.0968037 |
Thr | 0.0566372 | 0.0566146 | 0.0520299 |
Trp | 0.0251918 | 0.0282016 | 0.0241323 |
Tyr | 0.0195021 | 0.0172176 | 0.0219790 |
Val | 0.0468730 | 0.0448581 | 0.0495770 |
ymin=min(pep)
ymax=max(pep)
plot(pep[,3],xaxt = "n",pch=19,col="blue",cex=2,
ylim=c(ymin,ymax), xlab="amino acid",ylab="frequency")
axis(1, at=1:nrow(pep), labels=rownames(pep),las=2)
points(pep[,2],pch=19,col="gray",cex=2)
points(pep[,1],pch=19,col="red",cex=2)
For reproducibility.
sessionInfo()
## R version 4.2.0 (2022-04-22)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.4 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
##
## locale:
## [1] LC_CTYPE=en_AU.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_AU.UTF-8 LC_COLLATE=en_AU.UTF-8
## [5] LC_MONETARY=en_AU.UTF-8 LC_MESSAGES=en_AU.UTF-8
## [7] LC_PAPER=en_AU.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] gplots_3.1.3 seqinr_4.2-16 kableExtra_1.3.4 getDEE2_1.6.0
## [5] R.utils_2.11.0 R.oo_1.25.0 R.methodsS3_1.8.2
##
## loaded via a namespace (and not attached):
## [1] SummarizedExperiment_1.26.1 gtools_3.9.2.2
## [3] xfun_0.31 bslib_0.3.1
## [5] lattice_0.20-45 colorspace_2.0-3
## [7] htmltools_0.5.2 stats4_4.2.0
## [9] viridisLite_0.4.0 yaml_2.3.5
## [11] rlang_1.0.2 jquerylib_0.1.4
## [13] glue_1.6.2 BiocGenerics_0.42.0
## [15] matrixStats_0.62.0 GenomeInfoDbData_1.2.8
## [17] lifecycle_1.0.1 stringr_1.4.0
## [19] zlibbioc_1.42.0 MatrixGenerics_1.8.0
## [21] munsell_0.5.0 htm2txt_2.2.2
## [23] rvest_1.0.2 caTools_1.18.2
## [25] evaluate_0.15 Biobase_2.56.0
## [27] knitr_1.39 IRanges_2.30.0
## [29] fastmap_1.1.0 GenomeInfoDb_1.32.2
## [31] highr_0.9 KernSmooth_2.23-20
## [33] scales_1.2.0 DelayedArray_0.22.0
## [35] S4Vectors_0.34.0 webshot_0.5.3
## [37] jsonlite_1.8.0 XVector_0.36.0
## [39] systemfonts_1.0.4 digest_0.6.29
## [41] stringi_1.7.6 ade4_1.7-19
## [43] GenomicRanges_1.48.0 grid_4.2.0
## [45] cli_3.3.0 tools_4.2.0
## [47] bitops_1.0-7 magrittr_2.0.3
## [49] sass_0.4.1 RCurl_1.98-1.7
## [51] MASS_7.3-57 Matrix_1.4-1
## [53] xml2_1.3.3 rmarkdown_2.14
## [55] svglite_2.1.0 httr_1.4.3
## [57] rstudioapi_0.13 R6_2.5.1
## [59] compiler_4.2.0