Source codes: https://github.com/markziemann/tohidul_rnaseq
Here we have n=3 control (H2O; “H”) and n=3 seaweed based fertiliser treatments:
S80 ANDP
S94 DP
S93 AN
Reads underwent quality trimming using Skewer (Jiang et al, 2014). I mapped the reads to the Arabidopsis transcriptome (TAIR10/Ensembl47) using Kallisto (Bray et al, 2016). Expression counts were loaded into R and then DE analysis was performed with DESeq2 (Love et al, 2014). Enrichment analysis was performed using Plant Reactome genesets with the Mitch package (Kaspi & Ziemann 2020).
suppressPackageStartupMessages({
library("reshape2")
library("gplots")
library("DESeq2")
library("mitch")
library("kableExtra")
library("eulerr")
library("UpSetR")
})
Here we load the data in from the aligner. Take note that the columns are arranged in order.
tmp <- read.table("3col.tsv.gz")
x <- as.data.frame(acast(tmp, V2~V1, value.var="V3"))
x$gene <- sapply(strsplit(rownames(x),"\\."),"[[",1)
xx <- aggregate(. ~ gene, x, sum)
rownames(xx) <- xx$gene
xx$gene = NULL
xx <- xx[,order(colnames(xx))]
The sample sheet is read in and put in order as well.
ss <- read.table("samplesheet.tsv",header=TRUE)
ss <- ss[order(ss$Run),]
ss$label <- gsub("-0hpi","",ss$SampleName)
# check that names are in order
colnames(xx) == ss$Run
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
# change data header
colnames(xx) <- ss$label
kbl(ss) %>% kable_styling()
Run | SampleName | Treatment | label | |
---|---|---|---|---|
6 | SRR11404271 | S80-0hpi_R3 | Seaweed_16280 | S80_R3 |
5 | SRR11404272 | S80-0hpi_R2 | Seaweed_16280 | S80_R2 |
4 | SRR11404273 | S80-0hpi_R1 | Seaweed_16280 | S80_R1 |
12 | SRR11404287 | S94-0hpi_R3 | Seaweed_15294 | S94_R3 |
11 | SRR11404288 | S94-0hpi_R2 | Seaweed_15294 | S94_R2 |
10 | SRR11404289 | S94-0hpi_R1 | Seaweed_15294 | S94_R1 |
3 | SRR11404301 | H-0hpi_R3 | Water | H_R3 |
9 | SRR11404304 | S93-0hpi_R3 | Seaweed_15293 | S93_R3 |
8 | SRR11404305 | S93-0hpi_R2 | Seaweed_15293 | S93_R2 |
7 | SRR11404306 | S93-0hpi_R1 | Seaweed_15293 | S93_R1 |
2 | SRR11404312 | H-0hpi_R2 | Water | H_R2 |
1 | SRR11404313 | H-0hpi_R1 | Water | H_R1 |
MDS is just like PCA. The more similar (correlated) the data sets are the closer they will appear on the scatterplot.
cols <- as.numeric(factor(ss$Treatment))
plot(cmdscale(dist(t(xx))), xlab="Coordinate 1", ylab="Coordinate 2",
col=cols, cex=4 , main="MDS plot")
text(cmdscale(dist(t(xx))), labels=ss$label )
#colfunc <- colorRampPalette(c("white", "yellow", "orange" , "red", "darkred"))
colfunc <- colorRampPalette(c("white","blue"))
heatmap.2(cor(xx,method="pearson"),col=colfunc(25),trace="none",
scale="none",margin=c(10,10),main="Pearson correlation")
heatmap.2(cor(xx,method="spearman"),col=colfunc(25),trace="none",
scale="none",margin=c(10,10),main="Spearman correlation")
run_de <- function(ss,xx){
xx <- xx[which(rowMeans(xx)>10),]
y <- round(xx)
# MDS
mds <- cmdscale(dist(t(y)))
XMAX=max(mds[,1])*1.1
XMIN=min(mds[,1])*1.1
plot( mds , xlab="Coordinate 1", ylab="Coordinate 2",
type = "n" , xlim=c(XMIN,XMAX),main="MDS plot",bty="n")
text(mds, labels=colnames(y) )
# DE
dds <- DESeqDataSetFromMatrix(countData=y, colData = ss, design = ~ trt)
dds <- DESeq(dds)
de <- DESeq2::results(dds)
de <- de[order(de$pvalue),]
up <- rownames(subset(de, log2FoldChange>0 & padj<0.05 ))
dn <- rownames(subset(de, log2FoldChange<0 & padj<0.05 ))
str(up)
str(dn)
# MA plot
sig <-subset(de, padj < 0.05 )
GENESUP <- length(up)
GENESDN <- length(dn)
SUBHEADER = paste(GENESUP, "up, ", GENESDN, "down")
ns <-subset(de, padj > 0.05 )
plot(log2(de$baseMean),de$log2FoldChange,
xlab="log2 basemean", ylab="log2 foldchange",
pch=19, cex=0.5, col="dark gray",
main="smear plot")
points(log2(sig$baseMean),sig$log2FoldChange,
pch=19, cex=0.5, col="red")
mtext(SUBHEADER)
# heatmap
yn <- y/colSums(y)*1000000
yf <- yn[which(rownames(yn) %in% rownames(de)[1:50]),]
mycols <- gsub("0","yellow",ss$trt)
mycols <- gsub("1","orange",mycols)
colfunc <- colorRampPalette(c("blue", "white", "red"))
heatmap.2( as.matrix(yf), col=colfunc(25),scale="row",
ColSideColors =mycols ,trace="none",
margin = c(10,10), cexRow=0.6, cexCol=0.8 , main="Top 50 genes by p-val")
mtext("yellow=ctrl, orange=trt")
return(de)
}
mitch_barplot <- function(res){
sig <- res$enrichment_result
sig <- head( sig[order(sig$pANOVA),] ,30)
sig <- sig[order(sig$s.dist),]
par(mar=c(3,25,1,1)); barplot(sig$s.dist,horiz=TRUE,las=2,cex.names = 0.6,cex.axis = 0.6,
names.arg=sig$set,main="Enrichment score") ;grid()
}
ss93 <- subset(ss,Treatment=="Seaweed_15293"|Treatment=="Water")
ss93$trt <- as.numeric(grepl("Seaweed",ss93$Treatment))
xx93 <- xx[,which(colnames(xx) %in% ss93$label)]
ss94 <- subset(ss,Treatment=="Seaweed_15294"|Treatment=="Water")
ss94$trt <- as.numeric(grepl("Seaweed",ss94$Treatment))
xx94 <- xx[,which(colnames(xx) %in% ss94$label)]
ss80 <- subset(ss,Treatment=="Seaweed_16280"|Treatment=="Water")
ss80$trt <- as.numeric(grepl("Seaweed",ss80$Treatment))
xx80 <- xx[,which(colnames(xx) %in% ss80$label)]
Here, were using DESeq2 to perform differential expression analysis to understand gene expression changes caused by the different formulations. Enrichment analysis is performed at the end using mitch with all 4 timepoints.
de93 <- run_de(ss93,xx93)
## converting counts to integer mode
## the design formula contains one or more numeric variables with integer values,
## specifying a model with increasing fold change for higher values.
## did you mean for this to be a factor? if so, first convert
## this variable to a factor using the factor() function
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## chr [1:131] "ATCG00650" "ATCG00270" "ATCG00450" "ATCG00090" "AT2G29380" ...
## chr [1:74] "AT1G07550" "AT4G15393" "AT1G61480" "AT4G13420" "AT5G42600" ...