Source codes: https://github.com/markziemann/tohidul_rnaseq

Background

Here we have n=3 control (H2O; “H”) and n=3 seaweed based fertiliser treatments:

Reads underwent quality trimming using Skewer (Jiang et al, 2014). I mapped the reads to the Arabidopsis transcriptome (TAIR10/Ensembl47) using Kallisto (Bray et al, 2016). Expression counts were loaded into R and then DE analysis was performed with DESeq2 (Love et al, 2014). Enrichment analysis was performed using Plant Reactome genesets with the Mitch package (Kaspi & Ziemann 2020).

suppressPackageStartupMessages({
  library("reshape2")
  library("gplots")
  library("DESeq2")
  library("mitch")
  library("kableExtra")
  library("eulerr")
  library("UpSetR")

})

Load data

Here we load the data in from the aligner. Take note that the columns are arranged in order.

tmp <- read.table("3col.tsv.gz")
x <- as.data.frame(acast(tmp, V2~V1, value.var="V3"))
x$gene <- sapply(strsplit(rownames(x),"\\."),"[[",1)
xx <- aggregate(. ~ gene, x, sum)
rownames(xx) <- xx$gene
xx$gene = NULL
xx <- xx[,order(colnames(xx))]

Sample sheet

The sample sheet is read in and put in order as well.

ss <- read.table("samplesheet.tsv",header=TRUE)
ss <- ss[order(ss$Run),]
ss$label <- gsub("-0hpi","",ss$SampleName)
# check that names are in order
colnames(xx) == ss$Run
##  [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
# change data header
colnames(xx) <- ss$label
kbl(ss) %>% kable_styling()
Run SampleName Treatment label
6 SRR11404271 S80-0hpi_R3 Seaweed_16280 S80_R3
5 SRR11404272 S80-0hpi_R2 Seaweed_16280 S80_R2
4 SRR11404273 S80-0hpi_R1 Seaweed_16280 S80_R1
12 SRR11404287 S94-0hpi_R3 Seaweed_15294 S94_R3
11 SRR11404288 S94-0hpi_R2 Seaweed_15294 S94_R2
10 SRR11404289 S94-0hpi_R1 Seaweed_15294 S94_R1
3 SRR11404301 H-0hpi_R3 Water H_R3
9 SRR11404304 S93-0hpi_R3 Seaweed_15293 S93_R3
8 SRR11404305 S93-0hpi_R2 Seaweed_15293 S93_R2
7 SRR11404306 S93-0hpi_R1 Seaweed_15293 S93_R1
2 SRR11404312 H-0hpi_R2 Water H_R2
1 SRR11404313 H-0hpi_R1 Water H_R1

MDS

MDS is just like PCA. The more similar (correlated) the data sets are the closer they will appear on the scatterplot.

cols <- as.numeric(factor(ss$Treatment))
plot(cmdscale(dist(t(xx))), xlab="Coordinate 1", ylab="Coordinate 2", 
  col=cols, cex=4 , main="MDS plot")
text(cmdscale(dist(t(xx))), labels=ss$label )

#colfunc <- colorRampPalette(c("white", "yellow", "orange" ,  "red", "darkred"))
colfunc <- colorRampPalette(c("white","blue"))

heatmap.2(cor(xx,method="pearson"),col=colfunc(25),trace="none",
  scale="none",margin=c(10,10),main="Pearson correlation")

heatmap.2(cor(xx,method="spearman"),col=colfunc(25),trace="none",
  scale="none",margin=c(10,10),main="Spearman correlation")

Functions

run_de <- function(ss,xx){
xx <- xx[which(rowMeans(xx)>10),]
y <- round(xx)
# MDS
mds <- cmdscale(dist(t(y)))
XMAX=max(mds[,1])*1.1
XMIN=min(mds[,1])*1.1
plot( mds , xlab="Coordinate 1", ylab="Coordinate 2",
  type = "n" , xlim=c(XMIN,XMAX),main="MDS plot",bty="n")
text(mds, labels=colnames(y) )
# DE
dds <- DESeqDataSetFromMatrix(countData=y, colData = ss, design = ~ trt)
dds <- DESeq(dds)
de <- DESeq2::results(dds)
de <- de[order(de$pvalue),]
up <- rownames(subset(de, log2FoldChange>0 & padj<0.05 ))
dn <- rownames(subset(de, log2FoldChange<0 & padj<0.05 ))
str(up)
str(dn)
# MA plot
sig <-subset(de, padj < 0.05 )
GENESUP <- length(up)
GENESDN <- length(dn)
SUBHEADER = paste(GENESUP, "up, ", GENESDN, "down")
ns <-subset(de, padj > 0.05 )
plot(log2(de$baseMean),de$log2FoldChange,
     xlab="log2 basemean", ylab="log2 foldchange",
     pch=19, cex=0.5, col="dark gray",
     main="smear plot")
points(log2(sig$baseMean),sig$log2FoldChange,
       pch=19, cex=0.5, col="red")
mtext(SUBHEADER)
# heatmap
yn <- y/colSums(y)*1000000
yf <- yn[which(rownames(yn) %in% rownames(de)[1:50]),]
mycols <- gsub("0","yellow",ss$trt)
mycols <- gsub("1","orange",mycols)
colfunc <- colorRampPalette(c("blue", "white", "red"))
heatmap.2(  as.matrix(yf), col=colfunc(25),scale="row",
    ColSideColors =mycols ,trace="none",
    margin = c(10,10), cexRow=0.6, cexCol=0.8 , main="Top 50 genes by p-val")
mtext("yellow=ctrl, orange=trt")
return(de)
}


mitch_barplot <- function(res){
  sig <- res$enrichment_result
  sig <- head( sig[order(sig$pANOVA),] ,30)
  sig <- sig[order(sig$s.dist),]
  par(mar=c(3,25,1,1)); barplot(sig$s.dist,horiz=TRUE,las=2,cex.names = 0.6,cex.axis = 0.6,
    names.arg=sig$set,main="Enrichment score") ;grid()
}

Split data into contrast groups

ss93 <- subset(ss,Treatment=="Seaweed_15293"|Treatment=="Water")
ss93$trt <- as.numeric(grepl("Seaweed",ss93$Treatment))
xx93 <- xx[,which(colnames(xx) %in% ss93$label)]

ss94 <- subset(ss,Treatment=="Seaweed_15294"|Treatment=="Water")
ss94$trt <- as.numeric(grepl("Seaweed",ss94$Treatment))
xx94 <- xx[,which(colnames(xx) %in% ss94$label)]

ss80 <- subset(ss,Treatment=="Seaweed_16280"|Treatment=="Water")
ss80$trt <- as.numeric(grepl("Seaweed",ss80$Treatment))
xx80 <- xx[,which(colnames(xx) %in% ss80$label)]

DE

Here, were using DESeq2 to perform differential expression analysis to understand gene expression changes caused by the different formulations. Enrichment analysis is performed at the end using mitch with all 4 timepoints.

de93 <- run_de(ss93,xx93)
## converting counts to integer mode
##   the design formula contains one or more numeric variables with integer values,
##   specifying a model with increasing fold change for higher values.
##   did you mean for this to be a factor? if so, first convert
##   this variable to a factor using the factor() function
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing

##  chr [1:131] "ATCG00650" "ATCG00270" "ATCG00450" "ATCG00090" "AT2G29380" ...
##  chr [1:74] "AT1G07550" "AT4G15393" "AT1G61480" "AT4G13420" "AT5G42600" ...