Source codes: https://github.com/markziemann/tohidul_rnaseq

Background

Here we have n=3 control (H2O; “H”) and n=3 seaweed based fertiliser treatments:

Reads underwent quality trimming using Skewer (Jiang et al, 2014). I mapped the reads to the Arabidopsis transcriptome (TAIR10/Ensembl47) using Kallisto (Bray et al, 2016). Expression counts were loaded into R and then DE analysis was performed with DESeq2 (Love et al, 2014). Enrichment analysis was performed using Plant Reactome genesets with the Mitch package (Kaspi & Ziemann 2020).

suppressPackageStartupMessages({
  library("tidyverse")
  library("reshape2")
  library("gplots")
  library("DESeq2")
  library("mitch")
  library("kableExtra")
  library("eulerr")
  library("UpSetR")
  library("vioplot")
  library("clusterProfiler")
})

Load data

Here we load the data in from the aligner. Take note that the columns are arranged in order.

tmp <- read.table("3col.tsv.gz")
x <- as.data.frame(acast(tmp, V2~V1, value.var="V3"))
x$gene <- sapply(strsplit(rownames(x),"\\."),"[[",1)
xx <- aggregate(. ~ gene, x, sum)
rownames(xx) <- xx$gene
xx$gene = NULL
xx <- xx[,order(colnames(xx))]

Sample sheet

The sample sheet is read in and put in order as well.

ss <- read.table("samplesheet.tsv",header=TRUE)
ss <- ss[order(ss$Run),]
ss$label <- gsub("-0hpi","",ss$SampleName)
# check that names are in order
colnames(xx) == ss$Run
##  [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
# change data header
colnames(xx) <- ss$label
kbl(ss) %>% kable_styling()
Run SampleName Treatment label
6 SRR11404271 S80-0hpi_R3 Seaweed_16280 S80_R3
5 SRR11404272 S80-0hpi_R2 Seaweed_16280 S80_R2
4 SRR11404273 S80-0hpi_R1 Seaweed_16280 S80_R1
12 SRR11404287 S94-0hpi_R3 Seaweed_15294 S94_R3
11 SRR11404288 S94-0hpi_R2 Seaweed_15294 S94_R2
10 SRR11404289 S94-0hpi_R1 Seaweed_15294 S94_R1
3 SRR11404301 H-0hpi_R3 Water H_R3
9 SRR11404304 S93-0hpi_R3 Seaweed_15293 S93_R3
8 SRR11404305 S93-0hpi_R2 Seaweed_15293 S93_R2
7 SRR11404306 S93-0hpi_R1 Seaweed_15293 S93_R1
2 SRR11404312 H-0hpi_R2 Water H_R2
1 SRR11404313 H-0hpi_R1 Water H_R1

MDS

MDS is just like PCA. The more similar (correlated) the data sets are the closer they will appear on the scatterplot.

cols <- as.numeric(factor(ss$Treatment))
plot(cmdscale(dist(t(xx))), xlab="Coordinate 1", ylab="Coordinate 2", 
  col=cols, cex=4 , main="MDS plot")
text(cmdscale(dist(t(xx))), labels=ss$label )

#colfunc <- colorRampPalette(c("white", "yellow", "orange" ,  "red", "darkred"))
colfunc <- colorRampPalette(c("white","blue"))

heatmap.2(cor(xx,method="pearson"),col=colfunc(25),trace="none",
  scale="none",margin=c(10,10),main="Pearson correlation")

heatmap.2(cor(xx,method="spearman"),col=colfunc(25),trace="none",
  scale="none",margin=c(10,10),main="Spearman correlation")

Functions

run_de <- function(ss,xx){
xx <- xx[which(rowMeans(xx)>10),]
y <- round(xx)
# MDS
mds <- cmdscale(dist(t(y)))
XMAX=max(mds[,1])*1.1
XMIN=min(mds[,1])*1.1
plot( mds , xlab="Coordinate 1", ylab="Coordinate 2",
  type = "n" , xlim=c(XMIN,XMAX),main="MDS plot",bty="n")
text(mds, labels=colnames(y) )
# DE
dds <- DESeqDataSetFromMatrix(countData=y, colData = ss, design = ~ trt)
dds <- DESeq(dds)
de <- DESeq2::results(dds)
de <- de[order(de$pvalue),]
up <- rownames(subset(de, log2FoldChange>0 & padj<0.05 ))
dn <- rownames(subset(de, log2FoldChange<0 & padj<0.05 ))
str(up)
str(dn)

# MA plot
sig <-subset(de, padj < 0.05 )
GENESUP <- length(up)
GENESDN <- length(dn)
SUBHEADER = paste(GENESUP, "up, ", GENESDN, "down")
ns <-subset(de, padj > 0.05 )
plot(log2(de$baseMean),de$log2FoldChange,
     xlab="log2 basemean", ylab="log2(fold change)",
     pch=19, cex=1, col="dark gray",
     ylim = c(-6,6),
     main="smear plot")
points(log2(sig$baseMean),sig$log2FoldChange,
      pch=19, cex=1, col="red")
mtext(SUBHEADER)

# Volcano
NLAB=12
plot(de$log2FoldChange, -log10(de$pvalue) ,
    xlab="-log2(fold change)", ylab="-log10(p-value)",
    xlim = c(-6,6),
    pch=19, cex=1, col="dark gray",
    main="smear plot")
    grid()
mtext(SUBHEADER)
points(sig$log2FoldChange, -log10(sig$pvalue) ,
     pch=19, cex=1, col="red")
text(head(sig$log2FoldChange,NLAB), head(-log10(sig$pvalue),NLAB)+0.5 , 
    labels=head(rownames(sig),NLAB) , cex=1 )

# organellar genomes
chl <- de[grep("ATCG",rownames(de)),]
mit <- de[grep("ATMG",rownames(de)),]
NLAB=12 
plot(de$log2FoldChange, -log10(de$pvalue) ,
    xlab="-log2(fold change)", ylab="-log10(p-value)",
    xlim = c(-6,6),
    pch=19, cex=1, col="dark gray",
    main="smear plot: organellar genomes")
    grid()
    abline(h=tail(-log10(sig$pvalue),1),lty=2,lwd=2)
mtext("Grey: nuclear, Red: plastid, Blue: mito")
points(chl$log2FoldChange, -log10(chl$pvalue) ,
     pch=19, cex=1, col="red")
points(mit$log2FoldChange, -log10(mit$pvalue) ,
     pch=19, cex=1, col="blue")

# summary plot
chl_up <- nrow(subset(chl,padj<0.05 & log2FoldChange>1))
chl_dn <- nrow(subset(chl,padj<0.05 & log2FoldChange<1))
mit_up <- nrow(subset(mit,padj<0.05 & log2FoldChange>1))
mit_dn <- nrow(subset(mit,padj<0.05 & log2FoldChange<1))
sig_up <- length(up)
sig_dn <- length(dn)
n_up <- sig_up - ( chl_up + mit_up )
n_dn <- sig_dn - ( chl_dn + mit_dn )

# barplot
par(mar=c(5,10,2,2))
mycounts <- c("nuclear up"=sig_up,
    "chl up"=chl_up,
    "mit up"=mit_up,
    "nuclear dn"=sig_dn,
    "chl dn"=chl_dn,
    "mit dn"=mit_dn)
mycounts
barplot(mycounts,
    horiz=TRUE,
    las=1,
    main="number of DE genes")
par(mar=c(5.1,4.1,4.1,2.1))

# heatmap
yn <- y/colSums(y)*1000000
yf <- yn[which(rownames(yn) %in% rownames(de)[1:50]),]
mycols <- gsub("0","yellow",ss$trt)
mycols <- gsub("1","orange",mycols)
colfunc <- colorRampPalette(c("blue", "white", "red"))
heatmap.2(  as.matrix(yf), col=colfunc(25),scale="row",
    ColSideColors =mycols ,trace="none",
    margin = c(10,10), cexRow=0.6, cexCol=0.8 , main="Top 50 genes by p-val")
mtext("yellow=ctrl, orange=trt")
return(de)
}

mitch_barplot <- function(res){
  sig <- res$enrichment_result
  sig <- head( sig[order(sig$pANOVA),] ,30)
  sig <- sig[order(sig$s.dist),]
  par(mar=c(3,25,1,1)); barplot(sig$s.dist,horiz=TRUE,las=2,cex.names = 0.6,cex.axis = 0.6,
    names.arg=sig$set,main="Enrichment score") ;grid()
}

Split data into contrast groups

ss80 <- subset(ss,Treatment=="Seaweed_16280"|Treatment=="Water")
ss80$trt <- as.numeric(grepl("Seaweed",ss80$Treatment))
xx80 <- xx[,which(colnames(xx) %in% ss80$label)]

DE

Here, were using DESeq2 to perform differential expression analysis to understand gene expression changes caused by the different formulations. Enrichment analysis is performed at the end using mitch with all 4 timepoints.

de80 <- run_de(ss80,xx80)
## converting counts to integer mode
##   the design formula contains one or more numeric variables with integer values,
##   specifying a model with increasing fold change for higher values.
##   did you mean for this to be a factor? if so, first convert
##   this variable to a factor using the factor() function
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing

##  chr [1:729] "AT1G65970" "AT4G06665" "AT3G11510" "AT5G64100" "AT1G24996" ...
##  chr [1:730] "AT2G47450" "AT1G55673" "AT2G22510" "AT2G32870" "AT2G18328" ...

de80_top <- head(as.data.frame(de80),40)
kbl(de80_top[order(-de80_top$log2FoldChange),]) %>% kable_styling()
baseMean log2FoldChange lfcSE stat pvalue padj
AT4G06665 132.02756 10.6703428 1.2214435 8.735846 0 0
AT2G08750 37.84520 8.8747367 1.2919905 6.869042 0 0
AT1G24996 323.82079 4.3818892 0.5618073 7.799630 0 0
AT3G20810 1371.83363 3.1462473 0.4775527 6.588273 0 0
AT5G06760 1048.96459 2.9414243 0.4414667 6.662845 0 0
AT4G33905 330.48540 2.1883867 0.3352305 6.528005 0 0
AT1G65970 319.83244 1.8879852 0.2028938 9.305287 0 0
AT3G48510 71.68292 1.7527497 0.2646986 6.621681 0 0
AT1G19250 177.80497 1.3027098 0.2005740 6.494908 0 0
AT3G14440 696.62229 1.1073515 0.1674480 6.613105 0 0
AT5G03345 831.78940 0.9323214 0.1398791 6.665195 0 0
AT5G64100 3886.71544 0.9061570 0.1091747 8.300067 0 0
AT4G12600 1344.31545 0.8738919 0.1317509 6.632912 0 0
AT3G11510 6114.57364 0.8224084 0.0964760 8.524483 0 0
AT5G42090 3764.73635 0.7636518 0.1042944 7.322079 0 0
AT1G14980 1578.18877 0.7608785 0.1126658 6.753414 0 0
AT3G22230 3405.70403 0.7473244 0.0963363 7.757453 0 0
AT3G09735 1220.96912 0.7320855 0.0981377 7.459776 0 0
AT2G44310 1560.18430 0.6872796 0.0950144 7.233422 0 0
AT3G18740 3328.73251 0.6726190 0.1004285 6.697488 0 0
AT4G09320 6585.58229 0.6659963 0.0894218 7.447803 0 0
AT4G20150 3164.43679 0.6605573 0.0950525 6.949394 0 0
AT5G02870 7649.88412 0.6494672 0.0860326 7.549084 0 0
AT1G55330 2376.62233 0.6279042 0.0917265 6.845395 0 0
AT3G53430 5234.77439 0.6126416 0.0824943 7.426474 0 0
AT5G57660 1896.01283 -0.7450316 0.1135806 -6.559499 0 0
AT2G32870 1214.28389 -0.8573352 0.1021669 -8.391513 0 0
AT3G47470 34652.01704 -0.8809525 0.1335732 -6.595277 0 0
AT3G15353 23919.22730 -0.9793861 0.1511211 -6.480802 0 0
AT2G18328 689.16212 -1.0454080 0.1455697 -7.181493 0 0
AT4G39720 211.17876 -1.0949131 0.1686647 -6.491655 0 0
AT2G47450 3594.05175 -1.1612794 0.1044117 -11.122115 0 0
AT5G48490 1040.34652 -1.2382333 0.1779417 -6.958647 0 0
AT5G54270 14956.45719 -1.2741526 0.1886138 -6.755352 0 0
AT1G64370 5582.52780 -1.2768304 0.1829606 -6.978719 0 0
AT4G38080 3041.08385 -1.4147394 0.1978268 -7.151404 0 0
AT2G22510 1187.84821 -1.5712553 0.1672951 -9.392116 0 0
AT3G02400 210.64282 -2.0573297 0.3175210 -6.479350 0 0
AT1G15830 157.00814 -2.1186271 0.3203694 -6.613077 0 0
AT1G55673 735.08530 -12.7838910 1.2309372 -10.385494 0 0
write.table(de80,file="de80.tsv",quote=FALSE,sep="\t")

de80_up <- rownames(subset(de80,padj<0.05&log2FoldChange>0))
de80_dn <- rownames(subset(de80,padj<0.05&log2FoldChange<0))

Pathway analysis

Mapman pathways last modified in 2012 and used in the previous RNA-seq analysis.

genesets <- gmt_import("../baseline/ref/Ath_AGI_LOCUS_TAIR10_Aug2012.txt.gmt")

gt <- read.table("../baseline/ref/Arabidopsis_thaliana.TAIR10.46.geneaccession2symbol.tsv",
    fill=TRUE) 

m <- mitch_import(x=as.data.frame(de80),DEtype="deseq2",geneTable=gt)
## The input is a single dataframe; one contrast only. Converting
##         it to a list for you.
## Note: Mean no. genes in input = 21761
## Note: no. genes in output = 21471
## Note: estimated proportion of input genes in output = 0.987
# significance
res_sig <- mitch_calc(x=m,genesets=genesets,priority="significance")
## Note: When prioritising by significance (ie: small
##             p-values), large effect sizes might be missed.
top <- head(subset(res_sig$enrichment_result,p.adjustANOVA<0.05),50)
kbl(head(top,30)) %>% kable_styling()
set setSize pANOVA s.dist p.adjustANOVA
91 NOT_ASSIGNED_NO_ONTOLOGY_PENTATRICOPEPTIDE_(PPR)_REPEAT-CONTAINING_PROTEIN 418 0.0000000 0.1877951 0.0000000
114 PROTEIN_DEGRADATION_UBIQUITIN_PROTEASOM 60 0.0000000 0.4827129 0.0000000
8 CELL_ORGANISATION 345 0.0000000 -0.1881141 0.0000001
243 STRESS_BIOTIC_PR-PROTEINS 172 0.0000000 -0.2651555 0.0000001
25 DNA_SYNTHESIS/CHROMATIN_STRUCTURE 204 0.0000000 -0.2396256 0.0000002
89 NOT_ASSIGNED_NO_ONTOLOGY_GLYCINE_RICH_PROTEINS 55 0.0000000 -0.4532720 0.0000003
92 NOT_ASSIGNED_NO_ONTOLOGY_PROLINE_RICH_FAMILY 39 0.0000000 -0.5225783 0.0000006
112 PROTEIN_DEGRADATION_UBIQUITIN_E3_SCF_FBOX 253 0.0000000 0.2050586 0.0000006
13 CELL_WALL_CELL_WALL_PROTEINS_HRGP 14 0.0000002 -0.8049854 0.0000054
187 RNA_REGULATION_OF_TRANSCRIPTION_MYB-RELATED_TRANSCRIPTION_FACTOR_FAMILY 42 0.0000003 -0.4550153 0.0000089
197 RNA_REGULATION_OF_TRANSCRIPTION_TRIHELIX,_TRIPLE-HELIX_TRANSCRIPTION_FACTOR_FAMILY 29 0.0000009 -0.5265238 0.0000222
186 RNA_REGULATION_OF_TRANSCRIPTION_MYB_DOMAIN_TRANSCRIPTION_FACTOR_FAMILY 121 0.0000019 -0.2507256 0.0000421
202 RNA_RNA_BINDING 164 0.0000022 -0.2140846 0.0000458
179 RNA_REGULATION_OF_TRANSCRIPTION_HB,HOMEOBOX_TRANSCRIPTION_FACTOR_FAMILY 67 0.0000025 -0.3323929 0.0000482
199 RNA_REGULATION_OF_TRANSCRIPTION_WRKY_DOMAIN_TRANSCRIPTION_FACTOR_FAMILY 62 0.0000050 -0.3352826 0.0000881
215 SIGNALLING_LIGHT 89 0.0000090 -0.2722420 0.0001501
173 RNA_REGULATION_OF_TRANSCRIPTION_CCAAT_BOX_BINDING_FACTOR_FAMILY,_HAP2 10 0.0000113 -0.8018452 0.0001767
20 DEVELOPMENT_LATE_EMBRYOGENESIS_ABUNDANT 18 0.0000186 0.5826691 0.0002765
88 NOT_ASSIGNED_NO_ONTOLOGY_FORMIN_HOMOLOGY_2_DOMAIN-CONTAINING_PROTEIN 16 0.0000421 -0.5913190 0.0005912
99 PROTEIN_ASSEMBLY_AND_COFACTOR_LIGATION 23 0.0000446 0.4916157 0.0005960
154 REDOX_THIOREDOXIN_PDIL 13 0.0000516 0.6483291 0.0006559
14 CELL_WALL_CELL_WALL_PROTEINS_LRR 17 0.0000540 -0.5654999 0.0006559
126 PROTEIN_SYNTHESIS_INITIATION 77 0.0000566 0.2653602 0.0006574
90 NOT_ASSIGNED_NO_ONTOLOGY_HYDROXYPROLINE_RICH_PROTEINS 53 0.0000647 -0.3171790 0.0006936
222 SIGNALLING_RECEPTOR_KINASES_DUF_26 40 0.0000670 -0.3642131 0.0006936
175 RNA_REGULATION_OF_TRANSCRIPTION_G2-LIKE_TRANSCRIPTION_FACTOR_FAMILY,_GARP 38 0.0000675 -0.3734919 0.0006936
125 PROTEIN_SYNTHESIS_ELONGATION 29 0.0000788 0.4235001 0.0007794
174 RNA_REGULATION_OF_TRANSCRIPTION_CHROMATIN_REMODELING_FACTORS 35 0.0000935 -0.3815477 0.0008912
171 RNA_REGULATION_OF_TRANSCRIPTION_C2H2_ZINC_FINGER_FAMILY 100 0.0001248 -0.2219952 0.0011488
18 CELL_WALL_MODIFICATION 55 0.0001333 0.2977587 0.0011863
mitch_barplot(res_sig)

# effect size
res_eff <- mitch_calc(x=m,genesets=genesets,priority="effect")
## Note: Enrichments with large effect sizes may not be
##             statistically significant.
top <- head(subset(res_eff$enrichment_result,p.adjustANOVA<0.05),50)
kbl(head(top,30)) %>% kable_styling()
set setSize pANOVA s.dist p.adjustANOVA
13 CELL_WALL_CELL_WALL_PROTEINS_HRGP 14 0.0000002 -0.8049854 0.0000054
173 RNA_REGULATION_OF_TRANSCRIPTION_CCAAT_BOX_BINDING_FACTOR_FAMILY,_HAP2 10 0.0000113 -0.8018452 0.0001767
154 REDOX_THIOREDOXIN_PDIL 13 0.0000516 0.6483291 0.0006559
88 NOT_ASSIGNED_NO_ONTOLOGY_FORMIN_HOMOLOGY_2_DOMAIN-CONTAINING_PROTEIN 16 0.0000421 -0.5913190 0.0005912
253 TRANSPORT_MAJOR_INTRINSIC_PROTEINS_PIP 13 0.0002226 0.5912875 0.0017483
20 DEVELOPMENT_LATE_EMBRYOGENESIS_ABUNDANT 18 0.0000186 0.5826691 0.0002765
14 CELL_WALL_CELL_WALL_PROTEINS_LRR 17 0.0000540 -0.5654999 0.0006559
161 RNA_REGULATION_OF_TRANSCRIPTION_ARF,_AUXIN_RESPONSE_FACTOR_FAMILY 16 0.0001897 -0.5388662 0.0015347
189 RNA_REGULATION_OF_TRANSCRIPTION_NUCLEOSOME/CHROMATIN_ASSEMBLY_FACTOR_GROUP 13 0.0008377 -0.5349556 0.0053377
197 RNA_REGULATION_OF_TRANSCRIPTION_TRIHELIX,_TRIPLE-HELIX_TRANSCRIPTION_FACTOR_FAMILY 29 0.0000009 -0.5265238 0.0000222
145 PS_LIGHTREACTION_PHOTOSYSTEM_II_LHC-II 14 0.0007003 -0.5231393 0.0046742
92 NOT_ASSIGNED_NO_ONTOLOGY_PROLINE_RICH_FAMILY 39 0.0000000 -0.5225783 0.0000006
244 STRESS_BIOTIC_PR-PROTEINS_PLANT_DEFENSINS 15 0.0005973 0.5118755 0.0041850
139 PROTEIN_TARGETING_SECRETORY_PATHWAY_ER 14 0.0010249 0.5068010 0.0062194
193 RNA_REGULATION_OF_TRANSCRIPTION_PWWP_DOMAIN_PROTEIN 11 0.0036743 -0.5057782 0.0188663
209 SECONDARY_METABOLISM_SIMPLE_PHENOLS 16 0.0006058 -0.4950827 0.0041850
99 PROTEIN_ASSEMBLY_AND_COFACTOR_LIGATION 23 0.0000446 0.4916157 0.0005960
235 SIGNALLING_RECEPTOR_KINASES_WHEAT_LRK10_LIKE 12 0.0032494 -0.4906333 0.0173518
114 PROTEIN_DEGRADATION_UBIQUITIN_PROTEASOM 60 0.0000000 0.4827129 0.0000000
38 HORMONE_METABOLISM_GIBBERELIN_INDUCED-REGULATED-RESPONSIVE-ACTIVATED 12 0.0038992 0.4811501 0.0196430
232 SIGNALLING_RECEPTOR_KINASES_PROLINE_EXTENSIN_LIKE 15 0.0013746 -0.4771564 0.0078087
110 PROTEIN_DEGRADATION_UBIQUITIN_E3_BTB/POZ_CULLIN3_BTB/POZ 10 0.0106845 0.4661852 0.0413444
133 PROTEIN_SYNTHESIS_RIBOSOME_BIOGENESIS_PRE-RRNA_PROCESSING_AND_MODIFICATIONS_SNORNAS 16 0.0012692 0.4653228 0.0073670
211 SIGNALLING_14-3-3_PROTEINS 11 0.0077265 0.4638143 0.0324841
187 RNA_REGULATION_OF_TRANSCRIPTION_MYB-RELATED_TRANSCRIPTION_FACTOR_FAMILY 42 0.0000003 -0.4550153 0.0000089
89 NOT_ASSIGNED_NO_ONTOLOGY_GLYCINE_RICH_PROTEINS 55 0.0000000 -0.4532720 0.0000003
125 PROTEIN_SYNTHESIS_ELONGATION 29 0.0000788 0.4235001 0.0007794
144 PS_LIGHTREACTION_NADH_DH 12 0.0130143 0.4140066 0.0476001
26 DNA_SYNTHESIS/CHROMATIN_STRUCTURE_HISTONE_CORE_H2A 14 0.0090832 0.4026858 0.0367458
136 PROTEIN_TARGETING_MITOCHONDRIA 32 0.0001536 0.3865648 0.0012818
mitch_barplot(res_eff)

Bubble plot with top 50, 30 and 20 gene sets.

My_Theme = theme(
  axis.title.x = element_text(size = 16),
  axis.text.y = element_text(size = 8),
  axis.text.x = element_text(size = 8),
  axis.title.y = element_text(size = 16))

top <- head(res_eff$enrichment_result,50)
top <- top[order(top$s.dist),]
top$set <- factor(top$set, levels = top$set[order(top$s.dist)])
ggplot(top, aes(s.dist, set, size = setSize)) + geom_point(aes(colour=-log10(top$p.adjustANOVA))) + My_Theme

top <- head(res_eff$enrichment_result,30)
top <- top[order(top$s.dist),]
top$set <- factor(top$set, levels = top$set[order(top$s.dist)])
ggplot(top, aes(s.dist, set, size = setSize)) + geom_point(aes(colour=-log10(top$p.adjustANOVA))) + My_Theme

top <- head(res_eff$enrichment_result,20)
top <- top[order(top$s.dist),]
top$set <- factor(top$set, levels = top$set[order(top$s.dist)])
ggplot(top, aes(s.dist, set, size = setSize)) + geom_point(aes(colour=-log10(top$p.adjustANOVA))) + My_Theme

Heatmaps of a few top ranked gene sets by effect size.

mysets <- res_eff$input_genesets[which(names(res_eff$input_genesets) %in% head(res_eff$enrichment_result$set,10))]

xxx <- xx[,c(grep("S80",colnames(xx)),grep("H",colnames(xx)))]
xxx <- xxx / colSums(xxx) * 1e6

colfunc <- colorRampPalette(c("blue", "white", "red"))

plots <- lapply(1:length(mysets), function(x) {
  at <- gt[which(gt$V2 %in% mysets[[x]]),1]
  my_genes  <- xxx[which(rownames(xxx) %in% at),]
  if(nrow(my_genes)>2) {
    heatmap.2(as.matrix(my_genes),trace="none",scale="row",
      margin=c(8,10),main=names(mysets[x]),
      col=colfunc(25))
  }
})

Heatmaps of a few top ranked gene sets by small p-values.

mysets <- res_eff$input_genesets[which(names(res_eff$input_genesets) %in% head(res_sig$enrichment_result$set,10))]

xxx <- xx[,c(grep("S80",colnames(xx)),grep("H",colnames(xx)))]
xxx <- xxx / colSums(xxx) * 1e6

colfunc <- colorRampPalette(c("blue", "white", "red"))

plots <- lapply(1:length(mysets), function(x) {
  at <- gt[which(gt$V2 %in% mysets[[x]]),1]
  my_genes  <- xxx[which(rownames(xxx) %in% at),]
  if(nrow(my_genes)>2) {
    heatmap.2(as.matrix(my_genes),trace="none",scale="row",
      margin=c(8,10),main=names(mysets[x]),
      col=colfunc(25))
  }
})

Gene sets of interest

mysets <- c("TRANSPORT_CALCIUM",
  "REDOX_THIOREDOXIN_PDIL",
  "CELL_WALL_CELLULOSE_SYNTHESIS",
  "PROTEIN_TARGETING_CHLOROPLAST",
  "MITOCHONDRIAL_ELECTRON_TRANSPORT_/_ATP_SYNTHESIS_CYTOCHROME_C")

mysets <- genesets[which(names(genesets) %in% mysets)]
 
myres <- mitch_calc(x=m,genesets=mysets,priority="effect")
## Note: Enrichments with large effect sizes may not be
##             statistically significant.
par(mar=c(5,20,5,2))
vioplot(myres$detailed_sets,horizontal=TRUE,las=2,side="right",cex.axis=0.7)
grid()

Heat map of the gene sets of interest. Values plotted are the CPM after library size correction.

xxx <- xx[,c(grep("S80",colnames(xx)),grep("H",colnames(xx)))]
xxx <- xxx / colSums(xxx) * 1e6

colfunc <- colorRampPalette(c("blue", "white", "red"))

plots <- lapply(1:length(mysets), function(x) {
  at <- gt[which(gt$V2 %in% mysets[[x]]),1]
  my_genes  <- xxx[which(rownames(xxx) %in% at),]
  if(nrow(my_genes)>2) {
    heatmap.2(as.matrix(my_genes),trace="none",scale="row",
      margin=c(8,10),main=names(mysets[x]),
      col=colfunc(25))
  }
})

Mitch reports

unlink("mapman_report_de80_sig.html")
capture.output(
    mitch_report(res_sig,outfile=paste("mapman_report_de80_sig.html"))
    , file = "/dev/null", append = FALSE,
    type = c("output", "message"), split = FALSE)
## Dataset saved as " /tmp/Rtmp1TDkcW/mapman_report_de80_sig.rds ".
## 
## 
## processing file: mitch.Rmd
## Contour plot does not apply to unidimensional analysis.
## output file: /mnt/bfx4/tohidul/baseline3/mitch.knit.md
## 
## Output created: /tmp/Rtmp1TDkcW/mitch_report.html
unlink("mapman_report_de80_eff.html")
capture.output(
    mitch_report(res_eff,outfile=paste("mapman_report_de80_eff.html"))
    , file = "/dev/null", append = FALSE,
    type = c("output", "message"), split = FALSE)
## Dataset saved as " /tmp/Rtmp1TDkcW/mapman_report_de80_eff.rds ".
## 
## 
## processing file: mitch.Rmd
## Contour plot does not apply to unidimensional analysis.
## output file: /mnt/bfx4/tohidul/baseline3/mitch.knit.md
## 
## Output created: /tmp/Rtmp1TDkcW/mitch_report.html

Make PDFs

pdf("de80.pdf")
de80 <- run_de(ss80,xx80)
## converting counts to integer mode
##   the design formula contains one or more numeric variables with integer values,
##   specifying a model with increasing fold change for higher values.
##   did you mean for this to be a factor? if so, first convert
##   this variable to a factor using the factor() function
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
##  chr [1:729] "AT1G65970" "AT4G06665" "AT3G11510" "AT5G64100" "AT1G24996" ...
##  chr [1:730] "AT2G47450" "AT1G55673" "AT2G22510" "AT2G32870" "AT2G18328" ...
mitch_barplot(res_eff)
par(mar=c(5,20,5,2))
vioplot(myres$detailed_sets,horizontal=TRUE,las=2,side="right",cex.axis=0.7)
dev.off()
## X11cairo 
##        2

Session information

So you know what version of R and packages was used.

sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.2 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] pkgload_1.2.0               GGally_2.1.1               
##  [3] beeswarm_0.3.1              gtools_3.8.2               
##  [5] echarts4r_0.4.0             forcats_0.5.1              
##  [7] stringr_1.4.0               dplyr_1.0.5                
##  [9] purrr_0.3.4                 readr_1.4.0                
## [11] tidyr_1.1.3                 tibble_3.1.0               
## [13] ggplot2_3.3.3               tidyverse_1.3.0            
## [15] clusterProfiler_3.18.1      vioplot_0.3.5              
## [17] zoo_1.8-9                   sm_2.2-5.6                 
## [19] UpSetR_1.4.0                eulerr_6.1.0               
## [21] kableExtra_1.3.4            mitch_1.2.2                
## [23] DESeq2_1.30.0               SummarizedExperiment_1.20.0
## [25] Biobase_2.50.0              MatrixGenerics_1.2.0       
## [27] matrixStats_0.58.0          GenomicRanges_1.42.0       
## [29] GenomeInfoDb_1.26.0         IRanges_2.24.0             
## [31] S4Vectors_0.28.0            BiocGenerics_0.36.0        
## [33] gplots_3.1.1                reshape2_1.4.4             
## 
## loaded via a namespace (and not attached):
##   [1] readxl_1.3.1           backports_1.2.1        shadowtext_0.0.7      
##   [4] fastmatch_1.1-0        systemfonts_1.0.1      plyr_1.8.6            
##   [7] igraph_1.2.6           splines_4.0.3          BiocParallel_1.24.1   
##  [10] digest_0.6.27          htmltools_0.5.1.1      GOSemSim_2.16.1       
##  [13] viridis_0.6.0          GO.db_3.12.1           fansi_0.4.2           
##  [16] magrittr_2.0.1         memoise_2.0.0          annotate_1.68.0       
##  [19] graphlayouts_0.7.1     modelr_0.1.8           svglite_2.0.0         
##  [22] enrichplot_1.10.2      colorspace_2.0-0       blob_1.2.1            
##  [25] rvest_1.0.0            ggrepel_0.9.1          haven_2.3.1           
##  [28] xfun_0.22              tcltk_4.0.3            crayon_1.4.1          
##  [31] RCurl_1.98-1.3         jsonlite_1.7.2         scatterpie_0.1.5      
##  [34] genefilter_1.72.0      survival_3.2-10        glue_1.4.2            
##  [37] polyclip_1.10-0        gtable_0.3.0           zlibbioc_1.36.0       
##  [40] XVector_0.30.0         webshot_0.5.2          DelayedArray_0.16.0   
##  [43] scales_1.1.1           DOSE_3.16.0            DBI_1.1.1             
##  [46] Rcpp_1.0.6             viridisLite_0.4.0      xtable_1.8-4          
##  [49] bit_4.0.4              htmlwidgets_1.5.3      httr_1.4.2            
##  [52] fgsea_1.16.0           RColorBrewer_1.1-2     ellipsis_0.3.1        
##  [55] pkgconfig_2.0.3        reshape_0.8.8          XML_3.99-0.6          
##  [58] farver_2.1.0           dbplyr_2.1.0           sass_0.3.1            
##  [61] locfit_1.5-9.4         utf8_1.2.1             labeling_0.4.2        
##  [64] tidyselect_1.1.0       rlang_0.4.10           later_1.1.0.1         
##  [67] AnnotationDbi_1.52.0   cellranger_1.1.0       munsell_0.5.0         
##  [70] tools_4.0.3            cachem_1.0.4           cli_2.3.1             
##  [73] downloader_0.4         generics_0.1.0         RSQLite_2.2.4         
##  [76] broom_0.7.5            evaluate_0.14          fastmap_1.1.0         
##  [79] yaml_2.2.1             fs_1.5.0               knitr_1.31            
##  [82] bit64_4.0.5            tidygraph_1.2.0        caTools_1.18.1        
##  [85] ggraph_2.0.5           mime_0.10              DO.db_2.9             
##  [88] xml2_1.3.2             compiler_4.0.3         rstudioapi_0.13       
##  [91] testthat_3.0.2         reprex_1.0.0           tweenr_1.0.2          
##  [94] geneplotter_1.68.0     bslib_0.2.4            stringi_1.5.3         
##  [97] ps_1.6.0               highr_0.8              desc_1.3.0            
## [100] lattice_0.20-41        Matrix_1.3-2           vctrs_0.3.6           
## [103] pillar_1.5.1           lifecycle_1.0.0        BiocManager_1.30.10   
## [106] jquerylib_0.1.3        data.table_1.14.0      cowplot_1.1.1         
## [109] bitops_1.0-6           httpuv_1.5.5           qvalue_2.22.0         
## [112] R6_2.5.0               promises_1.2.0.1       KernSmooth_2.23-18    
## [115] gridExtra_2.3          MASS_7.3-53.1          assertthat_0.2.1      
## [118] rprojroot_2.0.2        withr_2.4.1            GenomeInfoDbData_1.2.4
## [121] hms_1.0.0              grid_4.0.3             rmarkdown_2.7         
## [124] rvcheck_0.1.8          ggforce_0.3.3          lubridate_1.7.10      
## [127] shiny_1.6.0

References

Bray NL, Pimentel H, Melsted P, Pachter L. Near-optimal probabilistic RNA-seq quantification [published correction appears in Nat Biotechnol. 2016 Aug 9;34(8):888]. Nat Biotechnol. 2016;34(5):525-527. doi:10.1038/nbt.3519

Jiang H, Lei R, Ding SW, Zhu S. Skewer: a fast and accurate adapter trimmer for next-generation sequencing paired-end reads. BMC Bioinformatics. 2014;15:182. Published 2014 Jun 12. doi:10.1186/1471-2105-15-182

Love MI, Huber W, Anders S. Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2. Genome Biol. 2014;15(12):550. doi:10.1186/s13059-014-0550-8

Kaspi A, Ziemann M. mitch: multi-contrast pathway enrichment for multi-omics and single-cell profiling data. BMC Genomics. 2020;21(1):447. Published 2020 Jun 29. doi:10.1186/s12864-020-06856-9