Source codes: https://github.com/markziemann/tohidul_rnaseq
Here we have n=3 control (H2O; “H”) and n=3 seaweed based fertiliser treatments:
S80 ANDP
S94 DP
S93 AN
Reads underwent quality trimming using Skewer (Jiang et al, 2014). I mapped the reads to the Arabidopsis transcriptome (TAIR10/Ensembl47) using Kallisto (Bray et al, 2016). Expression counts were loaded into R and then DE analysis was performed with DESeq2 (Love et al, 2014). Enrichment analysis was performed using Plant Reactome genesets with the Mitch package (Kaspi & Ziemann 2020).
suppressPackageStartupMessages({
library("tidyverse")
library("reshape2")
library("gplots")
library("DESeq2")
library("mitch")
library("kableExtra")
library("eulerr")
library("UpSetR")
library("vioplot")
library("clusterProfiler")
})
Here we load the data in from the aligner. Take note that the columns are arranged in order.
tmp <- read.table("3col.tsv.gz")
x <- as.data.frame(acast(tmp, V2~V1, value.var="V3"))
x$gene <- sapply(strsplit(rownames(x),"\\."),"[[",1)
xx <- aggregate(. ~ gene, x, sum)
rownames(xx) <- xx$gene
xx$gene = NULL
xx <- xx[,order(colnames(xx))]
The sample sheet is read in and put in order as well.
ss <- read.table("samplesheet.tsv",header=TRUE)
ss <- ss[order(ss$Run),]
ss$label <- gsub("-0hpi","",ss$SampleName)
# check that names are in order
colnames(xx) == ss$Run
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
# change data header
colnames(xx) <- ss$label
kbl(ss) %>% kable_styling()
Run | SampleName | Treatment | label | |
---|---|---|---|---|
6 | SRR11404271 | S80-0hpi_R3 | Seaweed_16280 | S80_R3 |
5 | SRR11404272 | S80-0hpi_R2 | Seaweed_16280 | S80_R2 |
4 | SRR11404273 | S80-0hpi_R1 | Seaweed_16280 | S80_R1 |
12 | SRR11404287 | S94-0hpi_R3 | Seaweed_15294 | S94_R3 |
11 | SRR11404288 | S94-0hpi_R2 | Seaweed_15294 | S94_R2 |
10 | SRR11404289 | S94-0hpi_R1 | Seaweed_15294 | S94_R1 |
3 | SRR11404301 | H-0hpi_R3 | Water | H_R3 |
9 | SRR11404304 | S93-0hpi_R3 | Seaweed_15293 | S93_R3 |
8 | SRR11404305 | S93-0hpi_R2 | Seaweed_15293 | S93_R2 |
7 | SRR11404306 | S93-0hpi_R1 | Seaweed_15293 | S93_R1 |
2 | SRR11404312 | H-0hpi_R2 | Water | H_R2 |
1 | SRR11404313 | H-0hpi_R1 | Water | H_R1 |
MDS is just like PCA. The more similar (correlated) the data sets are the closer they will appear on the scatterplot.
cols <- as.numeric(factor(ss$Treatment))
plot(cmdscale(dist(t(xx))), xlab="Coordinate 1", ylab="Coordinate 2",
col=cols, cex=4 , main="MDS plot")
text(cmdscale(dist(t(xx))), labels=ss$label )
#colfunc <- colorRampPalette(c("white", "yellow", "orange" , "red", "darkred"))
colfunc <- colorRampPalette(c("white","blue"))
heatmap.2(cor(xx,method="pearson"),col=colfunc(25),trace="none",
scale="none",margin=c(10,10),main="Pearson correlation")
heatmap.2(cor(xx,method="spearman"),col=colfunc(25),trace="none",
scale="none",margin=c(10,10),main="Spearman correlation")
run_de <- function(ss,xx){
xx <- xx[which(rowMeans(xx)>10),]
y <- round(xx)
# MDS
mds <- cmdscale(dist(t(y)))
XMAX=max(mds[,1])*1.1
XMIN=min(mds[,1])*1.1
plot( mds , xlab="Coordinate 1", ylab="Coordinate 2",
type = "n" , xlim=c(XMIN,XMAX),main="MDS plot",bty="n")
text(mds, labels=colnames(y) )
# DE
dds <- DESeqDataSetFromMatrix(countData=y, colData = ss, design = ~ trt)
dds <- DESeq(dds)
de <- DESeq2::results(dds)
de <- de[order(de$pvalue),]
up <- rownames(subset(de, log2FoldChange>0 & padj<0.05 ))
dn <- rownames(subset(de, log2FoldChange<0 & padj<0.05 ))
str(up)
str(dn)
# MA plot
sig <-subset(de, padj < 0.05 )
GENESUP <- length(up)
GENESDN <- length(dn)
SUBHEADER = paste(GENESUP, "up, ", GENESDN, "down")
ns <-subset(de, padj > 0.05 )
plot(log2(de$baseMean),de$log2FoldChange,
xlab="log2 basemean", ylab="log2(fold change)",
pch=19, cex=1, col="dark gray",
ylim = c(-6,6),
main="smear plot")
points(log2(sig$baseMean),sig$log2FoldChange,
pch=19, cex=1, col="red")
mtext(SUBHEADER)
# Volcano
NLAB=12
plot(de$log2FoldChange, -log10(de$pvalue) ,
xlab="-log2(fold change)", ylab="-log10(p-value)",
xlim = c(-6,6),
pch=19, cex=1, col="dark gray",
main="smear plot")
grid()
mtext(SUBHEADER)
points(sig$log2FoldChange, -log10(sig$pvalue) ,
pch=19, cex=1, col="red")
text(head(sig$log2FoldChange,NLAB), head(-log10(sig$pvalue),NLAB)+0.5 ,
labels=head(rownames(sig),NLAB) , cex=1 )
# organellar genomes
chl <- de[grep("ATCG",rownames(de)),]
mit <- de[grep("ATMG",rownames(de)),]
NLAB=12
plot(de$log2FoldChange, -log10(de$pvalue) ,
xlab="-log2(fold change)", ylab="-log10(p-value)",
xlim = c(-6,6),
pch=19, cex=1, col="dark gray",
main="smear plot: organellar genomes")
grid()
abline(h=tail(-log10(sig$pvalue),1),lty=2,lwd=2)
mtext("Grey: nuclear, Red: plastid, Blue: mito")
points(chl$log2FoldChange, -log10(chl$pvalue) ,
pch=19, cex=1, col="red")
points(mit$log2FoldChange, -log10(mit$pvalue) ,
pch=19, cex=1, col="blue")
# summary plot
chl_up <- nrow(subset(chl,padj<0.05 & log2FoldChange>1))
chl_dn <- nrow(subset(chl,padj<0.05 & log2FoldChange<1))
mit_up <- nrow(subset(mit,padj<0.05 & log2FoldChange>1))
mit_dn <- nrow(subset(mit,padj<0.05 & log2FoldChange<1))
sig_up <- length(up)
sig_dn <- length(dn)
n_up <- sig_up - ( chl_up + mit_up )
n_dn <- sig_dn - ( chl_dn + mit_dn )
# barplot
par(mar=c(5,10,2,2))
mycounts <- c("nuclear up"=sig_up,
"chl up"=chl_up,
"mit up"=mit_up,
"nuclear dn"=sig_dn,
"chl dn"=chl_dn,
"mit dn"=mit_dn)
mycounts
barplot(mycounts,
horiz=TRUE,
las=1,
main="number of DE genes")
par(mar=c(5.1,4.1,4.1,2.1))
# heatmap
yn <- y/colSums(y)*1000000
yf <- yn[which(rownames(yn) %in% rownames(de)[1:50]),]
mycols <- gsub("0","yellow",ss$trt)
mycols <- gsub("1","orange",mycols)
colfunc <- colorRampPalette(c("blue", "white", "red"))
heatmap.2( as.matrix(yf), col=colfunc(25),scale="row",
ColSideColors =mycols ,trace="none",
margin = c(10,10), cexRow=0.6, cexCol=0.8 , main="Top 50 genes by p-val")
mtext("yellow=ctrl, orange=trt")
return(de)
}
mitch_barplot <- function(res){
sig <- res$enrichment_result
sig <- head( sig[order(sig$pANOVA),] ,30)
sig <- sig[order(sig$s.dist),]
par(mar=c(3,25,1,1)); barplot(sig$s.dist,horiz=TRUE,las=2,cex.names = 0.6,cex.axis = 0.6,
names.arg=sig$set,main="Enrichment score") ;grid()
}
ss80 <- subset(ss,Treatment=="Seaweed_16280"|Treatment=="Water")
ss80$trt <- as.numeric(grepl("Seaweed",ss80$Treatment))
xx80 <- xx[,which(colnames(xx) %in% ss80$label)]
Here, were using DESeq2 to perform differential expression analysis to understand gene expression changes caused by the different formulations. Enrichment analysis is performed at the end using mitch with all 4 timepoints.
de80 <- run_de(ss80,xx80)
## converting counts to integer mode
## the design formula contains one or more numeric variables with integer values,
## specifying a model with increasing fold change for higher values.
## did you mean for this to be a factor? if so, first convert
## this variable to a factor using the factor() function
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## chr [1:729] "AT1G65970" "AT4G06665" "AT3G11510" "AT5G64100" "AT1G24996" ...
## chr [1:730] "AT2G47450" "AT1G55673" "AT2G22510" "AT2G32870" "AT2G18328" ...