Source codes: https://github.com/markziemann/tohidul_rnaseq

Background

Here we have n=3 control (H2O; “H”) and n=3 seaweed based fertiliser treatments:

Reads underwent quality trimming using Skewer (Jiang et al, 2014). I mapped the reads to the Arabidopsis transcriptome (TAIR10/Ensembl47) using Kallisto (Bray et al, 2016). Expression counts were loaded into R and then DE analysis was performed with DESeq2 (Love et al, 2014). Enrichment analysis was performed using Plant Reactome genesets with the Mitch package (Kaspi & Ziemann 2020).

suppressPackageStartupMessages({
  library("tidyverse")
  library("reshape2")
  library("gplots")
  library("DESeq2")
  library("mitch")
  library("kableExtra")
  library("eulerr")
  library("UpSetR")
  library("vioplot")
  library("clusterProfiler")
})

Load data

Here we load the data in from the aligner. Take note that the columns are arranged in order.

tmp <- read.table("3col.tsv.gz")
x <- as.data.frame(acast(tmp, V2~V1, value.var="V3"))
x$gene <- sapply(strsplit(rownames(x),"\\."),"[[",1)
xx <- aggregate(. ~ gene, x, sum)
rownames(xx) <- xx$gene
xx$gene = NULL
xx <- xx[,order(colnames(xx))]

Sample sheet

The sample sheet is read in and put in order as well.

ss <- read.table("samplesheet.tsv",header=TRUE)
ss <- ss[order(ss$Run),]
ss$label <- gsub("-0hpi","",ss$SampleName)
# check that names are in order
colnames(xx) == ss$Run
##  [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
# change data header
colnames(xx) <- ss$label
kbl(ss) %>% kable_styling()
Run SampleName Treatment label
6 SRR11404271 S80-0hpi_R3 Seaweed_16280 S80_R3
5 SRR11404272 S80-0hpi_R2 Seaweed_16280 S80_R2
4 SRR11404273 S80-0hpi_R1 Seaweed_16280 S80_R1
12 SRR11404287 S94-0hpi_R3 Seaweed_15294 S94_R3
11 SRR11404288 S94-0hpi_R2 Seaweed_15294 S94_R2
10 SRR11404289 S94-0hpi_R1 Seaweed_15294 S94_R1
3 SRR11404301 H-0hpi_R3 Water H_R3
9 SRR11404304 S93-0hpi_R3 Seaweed_15293 S93_R3
8 SRR11404305 S93-0hpi_R2 Seaweed_15293 S93_R2
7 SRR11404306 S93-0hpi_R1 Seaweed_15293 S93_R1
2 SRR11404312 H-0hpi_R2 Water H_R2
1 SRR11404313 H-0hpi_R1 Water H_R1

MDS

MDS is just like PCA. The more similar (correlated) the data sets are the closer they will appear on the scatterplot.

cols <- as.numeric(factor(ss$Treatment))
plot(cmdscale(dist(t(xx))), xlab="Coordinate 1", ylab="Coordinate 2", 
  col=cols, cex=4 , main="MDS plot")
text(cmdscale(dist(t(xx))), labels=ss$label )

#colfunc <- colorRampPalette(c("white", "yellow", "orange" ,  "red", "darkred"))
colfunc <- colorRampPalette(c("white","blue"))

heatmap.2(cor(xx,method="pearson"),col=colfunc(25),trace="none",
  scale="none",margin=c(10,10),main="Pearson correlation")

heatmap.2(cor(xx,method="spearman"),col=colfunc(25),trace="none",
  scale="none",margin=c(10,10),main="Spearman correlation")

Functions

run_de <- function(ss,xx){
xx <- xx[which(rowMeans(xx)>10),]
y <- round(xx)
# MDS
mds <- cmdscale(dist(t(y)))
XMAX=max(mds[,1])*1.1
XMIN=min(mds[,1])*1.1
plot( mds , xlab="Coordinate 1", ylab="Coordinate 2",
  type = "n" , xlim=c(XMIN,XMAX),main="MDS plot",bty="n")
text(mds, labels=colnames(y) )
# DE
dds <- DESeqDataSetFromMatrix(countData=y, colData = ss, design = ~ trt)
dds <- DESeq(dds)
de <- DESeq2::results(dds)
de <- de[order(de$pvalue),]
up <- rownames(subset(de, log2FoldChange>0 & padj<0.05 ))
dn <- rownames(subset(de, log2FoldChange<0 & padj<0.05 ))
str(up)
str(dn)

# MA plot
sig <-subset(de, padj < 0.05 )
GENESUP <- length(up)
GENESDN <- length(dn)
SUBHEADER = paste(GENESUP, "up, ", GENESDN, "down")
ns <-subset(de, padj > 0.05 )
plot(log2(de$baseMean),de$log2FoldChange,
     xlab="log2 basemean", ylab="log2(fold change)",
     pch=19, cex=1, col="dark gray",
     ylim = c(-6,6),
     main="smear plot")
points(log2(sig$baseMean),sig$log2FoldChange,
      pch=19, cex=1, col="red")
mtext(SUBHEADER)

# Volcano
NLAB=12
plot(de$log2FoldChange, -log10(de$pvalue) ,
    xlab="-log2(fold change)", ylab="-log10(p-value)",
    xlim = c(-6,6),
    pch=19, cex=1, col="dark gray",
    main="smear plot")
    grid()
mtext(SUBHEADER)
points(sig$log2FoldChange, -log10(sig$pvalue) ,
     pch=19, cex=1, col="red")
text(head(sig$log2FoldChange,NLAB), head(-log10(sig$pvalue),NLAB)+0.5 , 
    labels=head(rownames(sig),NLAB) , cex=1 )

# organellar genomes
chl <- de[grep("ATCG",rownames(de)),]
mit <- de[grep("ATMG",rownames(de)),]
NLAB=12 
plot(de$log2FoldChange, -log10(de$pvalue) ,
    xlab="-log2(fold change)", ylab="-log10(p-value)",
    xlim = c(-6,6),
    pch=19, cex=1, col="dark gray",
    main="smear plot: organellar genomes")
    grid()
    abline(h=tail(-log10(sig$pvalue),1),lty=2,lwd=2)
mtext("Grey: nuclear, Red: plastid, Blue: mito")
points(chl$log2FoldChange, -log10(chl$pvalue) ,
     pch=19, cex=1, col="red")
points(mit$log2FoldChange, -log10(mit$pvalue) ,
     pch=19, cex=1, col="blue")

# summary plot
chl_up <- nrow(subset(chl,padj<0.05 & log2FoldChange>1))
chl_dn <- nrow(subset(chl,padj<0.05 & log2FoldChange<1))
mit_up <- nrow(subset(mit,padj<0.05 & log2FoldChange>1))
mit_dn <- nrow(subset(mit,padj<0.05 & log2FoldChange<1))
sig_up <- length(up)
sig_dn <- length(dn)
n_up <- sig_up - ( chl_up + mit_up )
n_dn <- sig_dn - ( chl_dn + mit_dn )

# barplot
par(mar=c(5,10,2,2))
mycounts <- c("nuclear up"=sig_up,
    "chl up"=chl_up,
    "mit up"=mit_up,
    "nuclear dn"=sig_dn,
    "chl dn"=chl_dn,
    "mit dn"=mit_dn)
mycounts
barplot(mycounts,
    horiz=TRUE,
    las=1,
    main="number of DE genes")
par(mar=c(5.1,4.1,4.1,2.1))

# heatmap
yn <- y/colSums(y)*1000000
yf <- yn[which(rownames(yn) %in% rownames(de)[1:50]),]
mycols <- gsub("0","yellow",ss$trt)
mycols <- gsub("1","orange",mycols)
colfunc <- colorRampPalette(c("blue", "white", "red"))
heatmap.2(  as.matrix(yf), col=colfunc(25),scale="row",
    ColSideColors =mycols ,trace="none",
    margin = c(10,10), cexRow=0.6, cexCol=0.8 , main="Top 50 genes by p-val")
mtext("yellow=ctrl, orange=trt")
return(de)
}

mitch_barplot <- function(res){
  sig <- res$enrichment_result
  sig <- head( sig[order(sig$pANOVA),] ,30)
  sig <- sig[order(sig$s.dist),]
  par(mar=c(3,25,1,1)); barplot(sig$s.dist,horiz=TRUE,las=2,cex.names = 0.6,cex.axis = 0.6,
    names.arg=sig$set,main="Enrichment score") ;grid()
}

Split data into contrast groups

ss80 <- subset(ss,Treatment=="Seaweed_16280"|Treatment=="Water")
ss80$trt <- as.numeric(grepl("Seaweed",ss80$Treatment))
xx80 <- xx[,which(colnames(xx) %in% ss80$label)]

DE

Here, were using DESeq2 to perform differential expression analysis to understand gene expression changes caused by the different formulations. Enrichment analysis is performed at the end using mitch with all 4 timepoints.

de80 <- run_de(ss80,xx80)
## converting counts to integer mode
##   the design formula contains one or more numeric variables with integer values,
##   specifying a model with increasing fold change for higher values.
##   did you mean for this to be a factor? if so, first convert
##   this variable to a factor using the factor() function
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing

##  chr [1:729] "AT1G65970" "AT4G06665" "AT3G11510" "AT5G64100" "AT1G24996" ...
##  chr [1:730] "AT2G47450" "AT1G55673" "AT2G22510" "AT2G32870" "AT2G18328" ...