Source: TBA
Here we analyse the effect of VLCAD mutation.
Reads were mapped to the human transcriptome version 47 from GENCODE genes (gencode.v47.transcripts.fa).
Genes with mean counts > 10 are classified as detected.
Differential expression is conducted with DESeq2.
Pathway enrichment analysis is conducted with mitch.
Gene sets were obtained from the gene ontology database (q3 2023). Biological process sets were used.
suppressPackageStartupMessages({
library("zoo")
library("dplyr")
library("reshape2")
library("DESeq2")
library("gplots")
library("MASS")
library("mitch")
library("eulerr")
library("kableExtra")
library("beeswarm")
library("network")
})
knitr::opts_chunk$set(dev = 'svg') # set output device to svg
tmp <- read.table("3col.tsv.gz",header=F)
x <- as.matrix(acast(tmp, V2~V1, value.var="V3", fun.aggregate = sum))
x <- as.data.frame(x)
accession <- sapply((strsplit(rownames(x),"\\|")),"[[",2)
symbol<-sapply((strsplit(rownames(x),"\\|")),"[[",6)
x$geneid <- paste(accession,symbol)
xx <- aggregate(. ~ geneid,x,sum)
rownames(xx) <- xx$geneid
xx$geneid = NULL
xx <- round(xx)
head(xx)
## Ctl1 Ctl2 Ctl3 Ctl4 Mut1 Mut2 Mut3 Mut4
## ENSG00000000003.16 TSPAN6 4228 3204 2812 3362 2725 2330 2284 2711
## ENSG00000000005.6 TNMD 1 4 2 0 0 0 0 0
## ENSG00000000419.14 DPM1 2538 2211 1837 1867 2632 2677 2873 3146
## ENSG00000000457.14 SCYL3 526 440 444 607 360 385 519 605
## ENSG00000000460.17 FIRRM 1572 1325 652 744 1174 1142 674 870
## ENSG00000000938.13 FGR 0 0 1 3 0 1 0 4
ss <- data.frame(colnames(xx))
ss$case <- as.numeric(grepl("Mut",ss[,1]))
rownames(ss) <- ss[,1]
ss[,1] = NULL
ss
## case
## Ctl1 0
## Ctl2 0
## Ctl3 0
## Ctl4 0
## Mut1 1
## Mut2 1
## Mut3 1
## Mut4 1
Here I’ll look at a few different quality control measures.
par(mar=c(5,8,3,1))
barplot(colSums(xx),horiz=TRUE,las=1,xlab="num reads")
colSums(xx)
## Ctl1 Ctl2 Ctl3 Ctl4 Mut1 Mut2 Mut3 Mut4
## 59500246 47803671 50526219 52506392 50626163 51905681 59571523 68184815
We have 47M to 68M assigned reads, which is very good.
All sample cluster with others from the same group. Ctl and Mut samples are distinguished on dimension 2. No outliers recorded.
cols <- c(rep("tan1",4),rep("violet",4))
plot(cmdscale(dist(t(xx))), xlab="Coordinate 1", ylab="Coordinate 2",
type = "p",bty="n",pch=19, cex=4, col=cols)
text(cmdscale(dist(t(xx))), labels=colnames(xx) )
Correlation heatmap shows a slight batch effect with batch A consisting of replicates 1 and 2, while batch B consists of replicates 3 and 4.
heatmap.2(cor(xx),trace="n",main="Pearson correlation heatmap")
cor(xx) %>% kbl(caption = "Pearson correlation coefficients") %>% kable_paper("hover", full_width = F)
Ctl1 | Ctl2 | Ctl3 | Ctl4 | Mut1 | Mut2 | Mut3 | Mut4 | |
---|---|---|---|---|---|---|---|---|
Ctl1 | 1.0000000 | 0.9955678 | 0.9641994 | 0.9673286 | 0.9572850 | 0.9613203 | 0.9463720 | 0.9518116 |
Ctl2 | 0.9955678 | 1.0000000 | 0.9622280 | 0.9663104 | 0.9576128 | 0.9636163 | 0.9494607 | 0.9514847 |
Ctl3 | 0.9641994 | 0.9622280 | 1.0000000 | 0.9967999 | 0.9069284 | 0.9241674 | 0.9663318 | 0.9752521 |
Ctl4 | 0.9673286 | 0.9663104 | 0.9967999 | 1.0000000 | 0.9159501 | 0.9308417 | 0.9717279 | 0.9798979 |
Mut1 | 0.9572850 | 0.9576128 | 0.9069284 | 0.9159501 | 1.0000000 | 0.9938098 | 0.9502892 | 0.9428478 |
Mut2 | 0.9613203 | 0.9636163 | 0.9241674 | 0.9308417 | 0.9938098 | 1.0000000 | 0.9654085 | 0.9573851 |
Mut3 | 0.9463720 | 0.9494607 | 0.9663318 | 0.9717279 | 0.9502892 | 0.9654085 | 1.0000000 | 0.9971573 |
Mut4 | 0.9518116 | 0.9514847 | 0.9752521 | 0.9798979 | 0.9428478 | 0.9573851 | 0.9971573 | 1.0000000 |
cor(xx,method="spearman") %>% kbl(caption = "Spearman correlation coefficients") %>% kable_paper("hover", full_width = F)
Ctl1 | Ctl2 | Ctl3 | Ctl4 | Mut1 | Mut2 | Mut3 | Mut4 | |
---|---|---|---|---|---|---|---|---|
Ctl1 | 1.0000000 | 0.8799723 | 0.8726250 | 0.8731369 | 0.8687678 | 0.8722935 | 0.8650790 | 0.8656965 |
Ctl2 | 0.8799723 | 1.0000000 | 0.8702726 | 0.8727027 | 0.8678268 | 0.8726946 | 0.8606341 | 0.8645307 |
Ctl3 | 0.8726250 | 0.8702726 | 1.0000000 | 0.8822079 | 0.8580223 | 0.8633952 | 0.8725751 | 0.8759524 |
Ctl4 | 0.8731369 | 0.8727027 | 0.8822079 | 1.0000000 | 0.8599849 | 0.8662153 | 0.8741373 | 0.8765094 |
Mut1 | 0.8687678 | 0.8678268 | 0.8580223 | 0.8599849 | 1.0000000 | 0.8796945 | 0.8659710 | 0.8681118 |
Mut2 | 0.8722935 | 0.8726946 | 0.8633952 | 0.8662153 | 0.8796945 | 1.0000000 | 0.8726103 | 0.8744646 |
Mut3 | 0.8650790 | 0.8606341 | 0.8725751 | 0.8741373 | 0.8659710 | 0.8726103 | 1.0000000 | 0.8884237 |
Mut4 | 0.8656965 | 0.8645307 | 0.8759524 | 0.8765094 | 0.8681118 | 0.8744646 | 0.8884237 | 1.0000000 |
First we will look at control vs mutant with no filtering for low reads. Then we remove genes with fewer than 10 reads per sample on average and rerun DESeq2.
dds <- DESeqDataSetFromMatrix(countData = xx , colData = ss, design = ~ case )
## converting counts to integer mode
## the design formula contains one or more numeric variables with integer values,
## specifying a model with increasing fold change for higher values.
## did you mean for this to be a factor? if so, first convert
## this variable to a factor using the factor() function
res <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
z<- results(res)
vsd <- vst(dds, blind=FALSE)
zz <- cbind(as.data.frame(z),assay(vsd))
dge <- as.data.frame(zz[order(zz$pvalue),])
head(dge,20) %>% kbl(caption = "Top gene expression changes caused by VLCAD mutation") %>% kable_paper("hover", full_width = F)
baseMean | log2FoldChange | lfcSE | stat | pvalue | padj | Ctl1 | Ctl2 | Ctl3 | Ctl4 | Mut1 | Mut2 | Mut3 | Mut4 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
ENSG00000162511.8 LAPTM5 | 987.8437 | -3.986356 | 0.1755532 | -22.70739 | 0 | 0 | 11.002220 | 10.885389 | 10.899161 | 10.841009 | 7.254068 | 7.564944 | 7.254276 | 7.671797 |
ENSG00000272975.3 MYHAS | 388.3574 | -4.195153 | 0.1956555 | -21.44153 | 0 | 0 | 9.483432 | 9.555854 | 9.850651 | 9.649723 | 6.499000 | 6.621006 | 6.398233 | 6.615835 |
ENSG00000152128.13 TMEM163 | 854.4807 | -3.990912 | 0.1867547 | -21.36981 | 0 | 0 | 10.564008 | 10.434528 | 10.843404 | 10.928707 | 7.153614 | 7.242889 | 7.399430 | 7.399775 |
ENSG00000116574.6 RHOU | 1158.7330 | -2.487046 | 0.1190345 | -20.89349 | 0 | 0 | 11.068257 | 10.953967 | 10.955235 | 10.968457 | 8.534667 | 8.751548 | 8.631342 | 8.806695 |
ENSG00000100311.17 PDGFB | 762.4043 | -4.200241 | 0.2158178 | -19.46197 | 0 | 0 | 10.818451 | 10.346682 | 10.679375 | 10.331647 | 6.809802 | 7.232135 | 7.001269 | 7.200800 |
ENSG00000100678.20 SLC8A3 | 358.9813 | -3.566660 | 0.1921649 | -18.56042 | 0 | 0 | 9.529756 | 9.369735 | 9.550313 | 9.546351 | 6.479031 | 6.830202 | 6.937598 | 6.825018 |
ENSG00000148798.11 INA | 449.6321 | -3.871214 | 0.2123022 | -18.23446 | 0 | 0 | 9.903409 | 10.088926 | 9.662117 | 9.569548 | 6.911927 | 6.603283 | 6.915641 | 6.770452 |
ENSG00000089250.20 NOS1 | 250.7947 | -5.564540 | 0.3085129 | -18.03665 | 0 | 0 | 8.786950 | 9.018045 | 9.351096 | 9.221984 | 5.899032 | 5.929534 | 5.732877 | 5.658486 |
ENSG00000054983.17 GALC | 254.7479 | -7.177380 | 0.4290953 | -16.72677 | 0 | 0 | 8.907264 | 8.988819 | 9.232172 | 9.411614 | 5.449783 | 5.578676 | 5.324774 | 5.490048 |
ENSG00000009709.13 PAX7 | 127.3333 | -4.443382 | 0.2695849 | -16.48231 | 0 | 0 | 8.195813 | 8.334413 | 8.292938 | 8.153813 | 5.736095 | 5.963757 | 5.834389 | 5.790457 |
ENSG00000183770.7 FOXL2 | 142.6765 | -3.360216 | 0.2063866 | -16.28117 | 0 | 0 | 8.272609 | 8.349909 | 8.328173 | 8.337779 | 6.328065 | 6.320680 | 6.168496 | 6.148784 |
ENSG00000133019.12 CHRM3 | 282.6068 | -2.720740 | 0.1754015 | -15.51149 | 0 | 0 | 9.015629 | 8.965668 | 9.216256 | 9.224424 | 6.839849 | 7.119456 | 7.011578 | 7.101723 |
ENSG00000154783.12 FGD5 | 355.7793 | -2.851012 | 0.1861684 | -15.31416 | 0 | 0 | 9.277233 | 9.394671 | 9.596083 | 9.411614 | 6.897841 | 7.285051 | 7.296208 | 7.118768 |
ENSG00000132164.10 SLC6A11 | 106.1048 | -4.364647 | 0.2910877 | -14.99427 | 0 | 0 | 8.012600 | 7.920884 | 8.148147 | 8.040793 | 5.780323 | 5.856464 | 5.865360 | 5.620831 |
ENSG00000206432.4 TMEM200C | 135.5014 | -2.709658 | 0.1868406 | -14.50251 | 0 | 0 | 8.134126 | 8.186723 | 8.246306 | 8.253617 | 6.395351 | 6.450698 | 6.512449 | 6.466853 |
ENSG00000204644.10 ZFP57 | 750.2561 | -1.666633 | 0.1167461 | -14.27570 | 0 | 0 | 10.223624 | 10.203270 | 10.143080 | 10.355170 | 8.664944 | 8.769525 | 8.634497 | 8.789064 |
ENSG00000112414.15 ADGRG6 | 508.4714 | -2.799366 | 0.1985637 | -14.09807 | 0 | 0 | 9.579926 | 10.266681 | 9.843772 | 9.799432 | 7.434690 | 7.564944 | 7.544218 | 7.596921 |
ENSG00000106236.4 NPTX2 | 1200.4620 | -3.000108 | 0.2134385 | -14.05608 | 0 | 0 | 11.058195 | 10.995229 | 11.313973 | 11.016301 | 7.845060 | 8.377118 | 8.550102 | 8.528409 |
ENSG00000162426.16 SLC45A1 | 112.7501 | -3.759645 | 0.2686451 | -13.99484 | 0 | 0 | 8.099755 | 7.804630 | 8.119620 | 8.243949 | 6.034266 | 5.856464 | 5.978500 | 6.022722 |
ENSG00000134873.10 CLDN10 | 285.7763 | -2.704238 | 0.1932477 | -13.99364 | 0 | 0 | 9.221829 | 9.147062 | 9.096704 | 9.020378 | 6.731291 | 7.021709 | 7.147845 | 7.200800 |
nrow(dge)
## [1] 78724
nrow(subset(dge,padj<0.05))
## [1] 3655
dge[grep("ACADVL",rownames(dge)),]
## baseMean log2FoldChange lfcSE stat
## ENSG00000072778.20 ACADVL 13984.35 -1.757624 0.262024 -6.707873
## pvalue padj Ctl1 Ctl2 Ctl3
## ENSG00000072778.20 ACADVL 1.974819e-11 1.528463e-09 14.33474 13.9524 14.6743
## Ctl4 Mut1 Mut2 Mut3 Mut4
## ENSG00000072778.20 ACADVL 14.54686 12.13382 12.30005 12.90259 13.06871
xxf <- xx[rowMeans(xx)>=10,]
dim(xx) ; dim(xxf)
## [1] 78724 8
## [1] 19603 8
dds <- DESeqDataSetFromMatrix(countData = xxf , colData = ss, design = ~ case )
## converting counts to integer mode
## the design formula contains one or more numeric variables with integer values,
## specifying a model with increasing fold change for higher values.
## did you mean for this to be a factor? if so, first convert
## this variable to a factor using the factor() function
res <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
z<- results(res)
vsd <- vst(dds, blind=FALSE)
zz <- cbind(as.data.frame(z),assay(vsd))
dge <- as.data.frame(zz[order(zz$pvalue),])
head(dge,20) %>% kbl(caption = "Top gene expression changes caused by VLCAD mutation") %>% kable_paper("hover", full_width = F)
baseMean | log2FoldChange | lfcSE | stat | pvalue | padj | Ctl1 | Ctl2 | Ctl3 | Ctl4 | Mut1 | Mut2 | Mut3 | Mut4 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
ENSG00000162511.8 LAPTM5 | 988.1810 | -3.977558 | 0.1624486 | -24.48502 | 0 | 0 | 11.013483 | 10.895831 | 10.910941 | 10.858916 | 7.387102 | 7.674627 | 7.392785 | 7.778289 |
ENSG00000116574.6 RHOU | 1159.8102 | -2.478028 | 0.1017890 | -24.34475 | 0 | 0 | 11.078995 | 10.963816 | 10.966535 | 10.985276 | 8.601506 | 8.806630 | 8.702114 | 8.863824 |
ENSG00000272975.3 MYHAS | 388.4941 | -4.186664 | 0.1847116 | -22.66595 | 0 | 0 | 9.515786 | 9.584977 | 9.875569 | 9.683639 | 6.691628 | 6.800557 | 6.603675 | 6.797969 |
ENSG00000152128.13 TMEM163 | 855.0084 | -3.981991 | 0.1799082 | -22.13346 | 0 | 0 | 10.579410 | 10.449616 | 10.855680 | 10.945856 | 7.293661 | 7.373581 | 7.528560 | 7.522791 |
ENSG00000100311.17 PDGFB | 762.5890 | -4.191593 | 0.2090203 | -20.05352 | 0 | 0 | 10.831306 | 10.362846 | 10.693220 | 10.354940 | 6.975994 | 7.363579 | 7.157500 | 7.337194 |
ENSG00000100678.20 SLC8A3 | 359.1927 | -3.556739 | 0.1795828 | -19.80556 | 0 | 0 | 9.561124 | 9.402958 | 9.580921 | 9.582313 | 6.673445 | 6.992103 | 7.098566 | 6.989704 |
ENSG00000148798.11 INA | 449.7423 | -3.861696 | 0.2053141 | -18.80873 | 0 | 0 | 9.927773 | 10.108623 | 9.690482 | 9.605039 | 7.070008 | 6.784385 | 7.078268 | 6.939574 |
ENSG00000089250.20 NOS1 | 250.8596 | -5.556768 | 0.3047282 | -18.23516 | 0 | 0 | 8.837995 | 9.060394 | 9.386105 | 9.265278 | 6.149451 | 6.175384 | 6.002635 | 5.934002 |
ENSG00000183770.7 FOXL2 | 142.7567 | -3.353550 | 0.1913430 | -17.52638 | 0 | 0 | 8.343122 | 8.415647 | 8.396021 | 8.409836 | 6.536307 | 6.527617 | 6.395099 | 6.373976 |
ENSG00000009709.13 PAX7 | 127.3592 | -4.434877 | 0.2575833 | -17.21726 | 0 | 0 | 8.269724 | 8.400802 | 8.362287 | 8.233832 | 6.003486 | 6.206073 | 6.093781 | 6.052004 |
ENSG00000054983.17 GALC | 254.8315 | -7.169298 | 0.4262001 | -16.82143 | 0 | 0 | 8.954509 | 9.032016 | 9.270073 | 9.450447 | 5.747955 | 5.861942 | 5.637599 | 5.783747 |
ENSG00000133019.12 CHRM3 | 282.8664 | -2.711812 | 0.1633780 | -16.59839 | 0 | 0 | 9.059671 | 9.009547 | 9.254560 | 9.267658 | 7.003624 | 7.258975 | 7.167052 | 7.245191 |
ENSG00000204644.10 ZFP57 | 751.3959 | -1.658220 | 0.1008664 | -16.43976 | 0 | 0 | 10.243168 | 10.221327 | 10.163403 | 10.378174 | 8.727173 | 8.824016 | 8.705163 | 8.846753 |
ENSG00000154783.12 FGD5 | 356.0567 | -2.841880 | 0.1756942 | -16.17515 | 0 | 0 | 9.314336 | 9.427317 | 9.625754 | 9.450447 | 7.057023 | 7.412828 | 7.431948 | 7.260999 |
ENSG00000206432.4 TMEM200C | 135.6225 | -2.700161 | 0.1726686 | -15.63782 | 0 | 0 | 8.210869 | 8.259598 | 8.317683 | 8.329221 | 6.597360 | 6.645493 | 6.707842 | 6.662130 |
ENSG00000132164.10 SLC6A11 | 106.1440 | -4.356983 | 0.2804464 | -15.53589 | 0 | 0 | 8.095188 | 8.006750 | 8.223958 | 8.126094 | 6.043063 | 6.109937 | 6.121624 | 5.900381 |
ENSG00000136425.14 CIB2 | 560.2324 | -2.093824 | 0.1381385 | -15.15743 | 0 | 0 | 9.964635 | 9.951553 | 10.006036 | 9.865507 | 8.086272 | 8.081641 | 8.064365 | 8.403856 |
ENSG00000134873.10 CLDN10 | 286.0041 | -2.694147 | 0.1820120 | -14.80202 | 0 | 0 | 9.260313 | 9.185839 | 9.138162 | 9.069007 | 6.903915 | 7.168526 | 7.293595 | 7.337194 |
ENSG00000112414.15 ADGRG6 | 508.8023 | -2.790367 | 0.1916111 | -14.56266 | 0 | 0 | 9.610256 | 10.283879 | 9.868809 | 9.830607 | 7.555822 | 7.674627 | 7.664559 | 7.707762 |
ENSG00000028137.19 TNFRSF1B | 466.1307 | -1.894338 | 0.1312207 | -14.43627 | 0 | 0 | 9.762336 | 9.569750 | 9.733929 | 9.597504 | 7.935833 | 8.210393 | 8.015367 | 8.178466 |
write.table(dge,file="deseq2_mut.tsv",quote=FALSE,sep='\t')
nrow(dge)
## [1] 19603
nrow(subset(dge,padj<0.05))
## [1] 4095
dge[grep("ACADVL",rownames(dge)),]
## baseMean log2FoldChange lfcSE stat
## ENSG00000072778.20 ACADVL 14007.45 -1.748639 0.2636304 -6.632919
## pvalue padj Ctl1 Ctl2 Ctl3
## ENSG00000072778.20 ACADVL 3.291119e-11 1.557717e-09 14.33522 13.95137 14.67416
## Ctl4 Mut1 Mut2 Mut3 Mut4
## ENSG00000072778.20 ACADVL 14.55264 12.14702 12.30763 12.92246 13.07882
maplot <- function(de,contrast_name) {
sig <-subset(de, padj < 0.05 )
up <-rownames(subset(de, padj < 0.05 & log2FoldChange > 0))
dn <-rownames(subset(de, padj < 0.05 & log2FoldChange < 0))
GENESUP <- length(up)
GENESDN <- length(dn)
DET=nrow(de)
SUBHEADER = paste(GENESUP, "up, ", GENESDN, "down", DET, "detected")
ns <-subset(de, padj > 0.05 )
plot(log2(de$baseMean),de$log2FoldChange,
xlab="log2 basemean", ylab="log2 foldchange",
pch=19, cex=0.5, col="dark gray",
main=contrast_name, cex.main=1)
points(log2(sig$baseMean),sig$log2FoldChange,
pch=19, cex=0.5, col="red")
mtext(SUBHEADER,cex = 1)
}
make_volcano <- function(de,name) {
sig <- subset(de,padj<0.05)
N_SIG=nrow(sig)
N_UP=nrow(subset(sig,log2FoldChange>0))
N_DN=nrow(subset(sig,log2FoldChange<0))
DET=nrow(de)
HEADER=paste(N_SIG,"@5%FDR,", N_UP, "up", N_DN, "dn", DET, "detected")
plot(de$log2FoldChange,-log10(de$pval),cex=0.5,pch=19,col="darkgray",
main=name, xlab="log2 FC", ylab="-log10 pval", xlim=c(-6,6))
mtext(HEADER)
grid()
points(sig$log2FoldChange,-log10(sig$pval),cex=0.5,pch=19,col="red")
}
make_heatmap <- function(de,name,myss,mx,n=30){
colfunc <- colorRampPalette(c("blue", "white", "red"))
values <- myss$quickdash
f <- colorRamp(c("yellow", "orange"))
rr <- range(values)
svals <- (values-rr[1])/diff(rr)
colcols <- rgb(f(svals)/255)
mxn <- mx/rowSums(mx)*1000000
x <- mxn[which(rownames(mxn) %in% rownames(head(de,n))),]
heatmap.2(as.matrix(x),trace="none",col=colfunc(25),scale="row", margins = c(7,15), cexRow=0.9, cexCol=1.4,
main=paste("Top ranked",n,"genes in",name) )
}
maplot(dge,"ctrl vs mut")