Source codes: https://github.com/markziemann/tohidul_rnaseq

Background

Here we have n=3 control (H2O; "H") and n=3 seaweed based fertiliser treatment ("80"). Arabidopsis RNA samples. Reads underwent quality trimming using Skewer (Jiang et al, 2014). I mapped the reads to the Arabidopsis transcriptome (TAIR10/Ensembl47) using Kallisto (Bray et al, 2016). Expression counts were loaded into R and then DE analysis was performed with DESeq2 (Love et al, 2014). Enrichment analysis was performed using Plant Reactome genesets with the Mitch package (Kaspi & Ziemann 2020).

suppressPackageStartupMessages({
    library("reshape2")
    library("gplots")
    library("DESeq2")
    library("mitch")
})

Load data

Here we load the data in from the aligner.

tmp <- read.table("3col.tsv")
x <- as.data.frame(acast(tmp, V2~V1, value.var="V3"))
x$gene <- sapply(strsplit(rownames(x),"\\."),"[[",1)
xx <- aggregate(. ~ gene, x, sum)
rownames(xx) <- xx$gene
xx$gene = NULL

MDS

MDS is just like PCA. The more similar (correlated) the data sets are the closer they will appear on the scatterplot.

plot(cmdscale(dist(t(xx))), xlab="Coordinate 1", ylab="Coordinate 2", 
  type = "n" , main="MDS plot")
text(cmdscale(dist(t(xx))), labels=colnames(xx) )

## DE

ss <- data.frame(colnames(xx))
rownames(ss) <- ss[,1]
ss$trt <- as.numeric(grepl("80",ss[,1]))
ss[,1]=NULL
xx <- round(xx)
dds <- DESeqDataSetFromMatrix(countData=xx, colData = ss, design = ~ trt)
## converting counts to integer mode
##   the design formula contains one or more numeric variables with integer values,
##   specifying a model with increasing fold change for higher values.
##   did you mean for this to be a factor? if so, first convert
##   this variable to a factor using the factor() function
dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
de <- DESeq2::results(dds)
de <- de[order(de$pvalue),]

head(de)
## log2 fold change (MLE): trt 
## Wald test p-value: trt 
## DataFrame with 6 rows and 6 columns
##            baseMean log2FoldChange     lfcSE      stat      pvalue        padj
##           <numeric>      <numeric> <numeric> <numeric>   <numeric>   <numeric>
## AT2G47450  3593.508      -1.164612 0.1048295 -11.10958 1.12685e-28 2.36266e-24
## AT1G55673   738.241     -12.792726 1.2289515 -10.40946 2.24484e-25 2.35338e-21
## AT2G22510  1187.885      -1.574584 0.1676953  -9.38956 6.02541e-21 4.21116e-17
## AT1G65970   319.484       1.884921 0.2055918   9.16827 4.80681e-20 2.51961e-16
## AT4G06665   132.199      10.671560 1.2215886   8.73581 2.41926e-18 1.01449e-14
## AT3G11510  6108.350       0.818938 0.0945727   8.65935 4.74441e-18 1.65794e-14
# define up and down-regulated gene lists
up <- rownames(subset(de, log2FoldChange>0 & padj<0.05 ))
dn <- rownames(subset(de, log2FoldChange<0 & padj<0.05 ))
str(up)
##  chr [1:715] "AT1G65970" "AT4G06665" "AT3G11510" "AT5G64100" "AT3G22230" ...
str(dn)
##  chr [1:727] "AT2G47450" "AT1G55673" "AT2G22510" "AT2G32870" "AT4G38080" ...
# MA plot
sig <-subset(de, padj < 0.05 )
GENESUP <- length(up)
GENESDN <- length(dn)
SUBHEADER = paste(GENESUP, "up, ", GENESDN, "down")
ns <-subset(de, padj > 0.05 )
plot(log2(de$baseMean),de$log2FoldChange, 
     xlab="log2 basemean", ylab="log2 foldchange",
     pch=19, cex=0.5, col="dark gray",
     main="smear plot")
points(log2(sig$baseMean),sig$log2FoldChange,
       pch=19, cex=0.5, col="red")
mtext(SUBHEADER)

# heatmap
xn <- xx/colSums(xx)*1000000
xf <- xn[which(rownames(xn) %in% rownames(de)[1:50]),]

colfunc <- colorRampPalette(c("blue", "white", "red"))
heatmap.2(  as.matrix(xf), col=colfunc(25),scale="row", trace="none",
    margins = c(6,6), cexRow=.4, main="Top 50 genes by p-val")

Pathway analysis

First using plant Reactome downloaded 8th July 2020.

genesets <- gmt_import("plant_reactome.gmt")

m <- mitch_import(x=data.frame(de),DEtype="deseq2")
## The input is a single dataframe; one contrast only. Converting
##         it to a list for you.
## Note: Mean no. genes in input = 32833
## Note: no. genes in output = 32833
## Note: estimated proportion of input genes in output = 1
capture.output(
    res <- mitch_calc(x=m,genesets=genesets)
    , file = "/dev/null", append = FALSE,
    type = c("output", "message"), split = FALSE)
## Note: When prioritising by significance (ie: small
##             p-values), large effect sizes might be missed.
head(res$enrichment_result,10)
##                                                     set setSize       pANOVA
## 24                      R-ATH-1119533_TCA_cycle_(plant)      28 0.0001099788
## 23                           R-ATH-1119519_Calvin_cycle      28 0.0001792785
## 21                R-ATH-1119494_Tryptophan_biosynthesis      19 0.0079117258
## 14               R-ATH-1119452_Galactose_degradation_II      17 0.0184469868
## 43               R-ATH-9639861_Development_of_root_hair      14 0.0307119250
## 37           R-ATH-8879007_Response_to_cold_temperature      20 0.0313007847
## 5                 R-ATH-1119325_Sphingolipid_metabolism      14 0.0469060788
## 40                    R-ATH-9609573_Tricin_biosynthesis      25 0.0507422284
## 15 R-ATH-1119460_Isoleucine_biosynthesis_from_threonine      11 0.0546246815
## 1                   R-ATH-1119273_Lysine_biosynthesis_I      18 0.0550900671
##        s.dist p.adjustANOVA
## 24  0.4221411   0.004033766
## 23  0.4089485   0.004033766
## 21  0.3518799   0.118675888
## 14  0.3300739   0.207528602
## 43  0.3334974   0.212296495
## 37 -0.2780771   0.212296495
## 5   0.3066999   0.212296495
## 40 -0.2256881   0.212296495
## 15  0.3346177   0.212296495
## 1   0.2611130   0.212296495
#unlink("reactome_report.html")
#capture.output(
#    mitch_report(res,outfile=paste("reactome_report.html"))
#    , file = "/dev/null", append = FALSE,
#    type = c("output", "message"), split = FALSE)

Next with mapman pathways last modified in 2012 and used in the previous RNA-seq analysis.

genesets <- gmt_import("../ref/Ath_AGI_LOCUS_TAIR10_Aug2012.txt.gmt")
gt <- read.table("../ref/Arabidopsis_thaliana.TAIR10.46.geneaccession2symbol.tsv",
    fill=TRUE) 
m <- mitch_import(x=data.frame(de),DEtype="deseq2",geneTable=gt)
## The input is a single dataframe; one contrast only. Converting
##         it to a list for you.
## Note: Mean no. genes in input = 32833
## Note: no. genes in output = 28250
## Note: estimated proportion of input genes in output = 0.86
capture.output(
    res <- mitch_calc(x=m,genesets=genesets)
    , file = "/dev/null", append = FALSE,
    type = c("output", "message"), split = FALSE)
## Note: When prioritising by significance (ie: small
##             p-values), large effect sizes might be missed.
head(res$enrichment_result,30)
##                                                                                    set
## 125                                          PROTEIN_DEGRADATION_UBIQUITIN_E3_SCF_FBOX
## 9                                                                    CELL_ORGANISATION
## 127                                            PROTEIN_DEGRADATION_UBIQUITIN_PROTEASOM
## 27                                                   DNA_SYNTHESIS/CHROMATIN_STRUCTURE
## 103                                       NOT_ASSIGNED_NO_ONTOLOGY_PROLINE_RICH_FAMILY
## 219 RNA_REGULATION_OF_TRANSCRIPTION_TRIHELIX,_TRIPLE-HELIX_TRANSCRIPTION_FACTOR_FAMILY
## 14                                                   CELL_WALL_CELL_WALL_PROTEINS_HRGP
## 239                                                                   SIGNALLING_LIGHT
## 102         NOT_ASSIGNED_NO_ONTOLOGY_PENTATRICOPEPTIDE_(PPR)_REPEAT-CONTAINING_PROTEIN
## 209            RNA_REGULATION_OF_TRANSCRIPTION_MYB-RELATED_TRANSCRIPTION_FACTOR_FAMILY
## 224                                                                    RNA_RNA_BINDING
## 247                                                 SIGNALLING_RECEPTOR_KINASES_DUF_26
## 98                                      NOT_ASSIGNED_NO_ONTOLOGY_GLYCINE_RICH_PROTEINS
## 270                                          STRESS_BIOTIC_PR-PROTEINS_PLANT_DEFENSINS
## 221            RNA_REGULATION_OF_TRANSCRIPTION_WRKY_DOMAIN_TRANSCRIPTION_FACTOR_FAMILY
## 192              RNA_REGULATION_OF_TRANSCRIPTION_CCAAT_BOX_BINDING_FACTOR_FAMILY,_HAP2
## 208             RNA_REGULATION_OF_TRANSCRIPTION_MYB_DOMAIN_TRANSCRIPTION_FACTOR_FAMILY
## 195                       RNA_REGULATION_OF_TRANSCRIPTION_CHROMATIN_REMODELING_FACTORS
## 269                                                          STRESS_BIOTIC_PR-PROTEINS
## 15                                                    CELL_WALL_CELL_WALL_PROTEINS_LRR
## 97                NOT_ASSIGNED_NO_ONTOLOGY_FORMIN_HOMOLOGY_2_DOMAIN-CONTAINING_PROTEIN
## 112                                             PROTEIN_ASSEMBLY_AND_COFACTOR_LIGATION
## 172                                                             REDOX_THIOREDOXIN_PDIL
## 188                    RNA_REGULATION_OF_TRANSCRIPTION_C2C2(ZN)_DOF_ZINC_FINGER_FAMILY
## 185                 RNA_REGULATION_OF_TRANSCRIPTION_BHLH,BASIC_HELIX-LOOP-HELIX_FAMILY
## 201            RNA_REGULATION_OF_TRANSCRIPTION_HB,HOMEOBOX_TRANSCRIPTION_FACTOR_FAMILY
## 20                                                              CELL_WALL_MODIFICATION
## 22                                             DEVELOPMENT_LATE_EMBRYOGENESIS_ABUNDANT
## 281                                             TRANSPORT_MAJOR_INTRINSIC_PROTEINS_PIP
## 138                                                       PROTEIN_SYNTHESIS_ELONGATION
##     setSize       pANOVA     s.dist p.adjustANOVA
## 125     556 2.011940e-12  0.1738692  5.935222e-10
## 9       382 1.761914e-11 -0.1999235  2.598824e-09
## 127      60 2.132561e-10  0.4737602  2.097018e-08
## 27      225 4.682686e-10 -0.2406545  3.453481e-08
## 103      45 4.363621e-09 -0.5054549  2.574536e-07
## 219      30 1.166248e-08 -0.6014812  5.734053e-07
## 14       15 3.674806e-08 -0.8207804  1.548668e-06
## 239      92 8.670408e-08 -0.3226421  3.197213e-06
## 102     427 1.128361e-07  0.1493144  3.698515e-06
## 209      45 1.360896e-07 -0.4538722  4.014643e-06
## 224     178 3.989184e-07 -0.2200287  1.069827e-05
## 247      46 1.336056e-06 -0.4117871  3.284472e-05
## 98       79 1.684406e-06 -0.3113971  3.822306e-05
## 270      86 1.840081e-06  0.2973844  3.877312e-05
## 221      72 1.975199e-06 -0.3239594  3.884558e-05
## 192      10 3.980147e-06 -0.8421246  6.831554e-05
## 208     159 4.102042e-06 -0.2114637  6.831554e-05
## 195      37 4.173855e-06 -0.4370709  6.831554e-05
## 269     243 4.399984e-06 -0.1707664  6.831554e-05
## 15       19 1.065272e-05 -0.5833695  1.571276e-04
## 97       19 1.241390e-05 -0.5789604  1.743858e-04
## 112      24 1.896575e-05  0.5041658  2.543134e-04
## 172      13 4.794537e-05  0.6510634  6.149515e-04
## 188      33 6.718656e-05 -0.4008018  8.258347e-04
## 185     145 1.013444e-04 -0.1868451  1.195863e-03
## 201      86 1.345658e-04 -0.2380344  1.526805e-03
## 20       62 1.399250e-04  0.2795178  1.528811e-03
## 22       26 1.797492e-04  0.4242952  1.893786e-03
## 281      14 1.954453e-04  0.5748437  1.988151e-03
## 138      30 2.028307e-04  0.3918167  1.994502e-03
unlink("mapman_report.html")
capture.output(
    mitch_report(res,outfile=paste("mapman_report.html"))
    , file = "/dev/null", append = FALSE,
    type = c("output", "message"), split = FALSE)
## Dataset saved as " /tmp/RtmpNvXGhB/./mapman_report.RData ".
## 
## 
## processing file: mitch.Rmd
## Contour plot does not apply to unidimensional analysis.
## output file: /mnt/bfx4/tohidul/baseline/dge/mitch.knit.md
## 
## Output created: /tmp/RtmpNvXGhB/mitch_report.html

Session information

So you know what version of R and packages was used.

sessionInfo()
## R version 4.0.2 (2020-06-22)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] pkgload_1.1.0               GGally_2.0.0               
##  [3] ggplot2_3.3.2               beeswarm_0.2.3             
##  [5] gtools_3.8.2                tibble_3.0.1               
##  [7] dplyr_1.0.0                 echarts4r_0.3.2            
##  [9] gplots_3.0.3                mitch_1.0.6                
## [11] DESeq2_1.28.1               SummarizedExperiment_1.18.1
## [13] DelayedArray_0.14.0         matrixStats_0.56.0         
## [15] Biobase_2.48.0              GenomicRanges_1.40.0       
## [17] GenomeInfoDb_1.24.2         IRanges_2.22.2             
## [19] S4Vectors_0.26.1            BiocGenerics_0.34.0        
## [21] reshape2_1.4.4             
## 
## loaded via a namespace (and not attached):
##  [1] bitops_1.0-6           bit64_0.9-7            RColorBrewer_1.1-2    
##  [4] rprojroot_1.3-2        tools_4.0.2            backports_1.1.8       
##  [7] R6_2.4.1               KernSmooth_2.23-17     DBI_1.1.0             
## [10] colorspace_1.4-1       withr_2.2.0            tidyselect_1.1.0      
## [13] gridExtra_2.3          bit_1.1-15.2           compiler_4.0.2        
## [16] desc_1.2.0             caTools_1.18.0         scales_1.1.1          
## [19] genefilter_1.70.0      stringr_1.4.0          digest_0.6.25         
## [22] rmarkdown_2.3          XVector_0.28.0         pkgconfig_2.0.3       
## [25] htmltools_0.5.0        fastmap_1.0.1          highr_0.8             
## [28] htmlwidgets_1.5.1      rlang_0.4.6            RSQLite_2.2.0         
## [31] shiny_1.5.0            generics_0.0.2         jsonlite_1.7.0        
## [34] BiocParallel_1.22.0    RCurl_1.98-1.2         magrittr_1.5          
## [37] GenomeInfoDbData_1.2.3 Matrix_1.2-18          Rcpp_1.0.4.6          
## [40] munsell_0.5.0          lifecycle_0.2.0        stringi_1.4.6         
## [43] yaml_2.2.1             MASS_7.3-51.6          zlibbioc_1.34.0       
## [46] plyr_1.8.6             grid_4.0.2             blob_1.2.1            
## [49] gdata_2.18.0           promises_1.1.1         crayon_1.3.4          
## [52] lattice_0.20-41        splines_4.0.2          annotate_1.66.0       
## [55] locfit_1.5-9.4         knitr_1.29             pillar_1.4.4          
## [58] geneplotter_1.66.0     XML_3.99-0.3           glue_1.4.1            
## [61] evaluate_0.14          vctrs_0.3.1            httpuv_1.5.4          
## [64] testthat_2.3.2         gtable_0.3.0           purrr_0.3.4           
## [67] reshape_0.8.8          assertthat_0.2.1       xfun_0.15             
## [70] mime_0.9               xtable_1.8-4           later_1.1.0.1         
## [73] survival_3.2-3         pbmcapply_1.5.0        AnnotationDbi_1.50.1  
## [76] memoise_1.1.0          ellipsis_0.3.1

References

Bray NL, Pimentel H, Melsted P, Pachter L. Near-optimal probabilistic RNA-seq quantification [published correction appears in Nat Biotechnol. 2016 Aug 9;34(8):888]. Nat Biotechnol. 2016;34(5):525-527. doi:10.1038/nbt.3519

Jiang H, Lei R, Ding SW, Zhu S. Skewer: a fast and accurate adapter trimmer for next-generation sequencing paired-end reads. BMC Bioinformatics. 2014;15:182. Published 2014 Jun 12. doi:10.1186/1471-2105-15-182

Love MI, Huber W, Anders S. Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2. Genome Biol. 2014;15(12):550. doi:10.1186/s13059-014-0550-8

Kaspi A, Ziemann M. mitch: multi-contrast pathway enrichment for multi-omics and single-cell profiling data. BMC Genomics. 2020;21(1):447. Published 2020 Jun 29. doi:10.1186/s12864-020-06856-9