Source codes: https://github.com/markziemann/tohidul_rnaseq
Here we have n=3 control (H2O; "H") and n=3 seaweed based fertiliser treatment ("80"). Arabidopsis RNA samples. Reads underwent quality trimming using Skewer (Jiang et al, 2014). I mapped the reads to the Arabidopsis transcriptome (TAIR10/Ensembl47) using Kallisto (Bray et al, 2016). Expression counts were loaded into R and then DE analysis was performed with DESeq2 (Love et al, 2014). Enrichment analysis was performed using Plant Reactome genesets with the Mitch package (Kaspi & Ziemann 2020).
suppressPackageStartupMessages({
library("reshape2")
library("gplots")
library("DESeq2")
library("mitch")
})
Here we load the data in from the aligner.
tmp <- read.table("3col.tsv")
x <- as.data.frame(acast(tmp, V2~V1, value.var="V3"))
x$gene <- sapply(strsplit(rownames(x),"\\."),"[[",1)
xx <- aggregate(. ~ gene, x, sum)
rownames(xx) <- xx$gene
xx$gene = NULL
MDS is just like PCA. The more similar (correlated) the data sets are the closer they will appear on the scatterplot.
plot(cmdscale(dist(t(xx))), xlab="Coordinate 1", ylab="Coordinate 2",
type = "n" , main="MDS plot")
text(cmdscale(dist(t(xx))), labels=colnames(xx) )
## DE
ss <- data.frame(colnames(xx))
rownames(ss) <- ss[,1]
ss$trt <- as.numeric(grepl("80",ss[,1]))
ss[,1]=NULL
xx <- round(xx)
dds <- DESeqDataSetFromMatrix(countData=xx, colData = ss, design = ~ trt)
## converting counts to integer mode
## the design formula contains one or more numeric variables with integer values,
## specifying a model with increasing fold change for higher values.
## did you mean for this to be a factor? if so, first convert
## this variable to a factor using the factor() function
dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
de <- DESeq2::results(dds)
de <- de[order(de$pvalue),]
head(de)
## log2 fold change (MLE): trt
## Wald test p-value: trt
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## AT2G47450 3593.508 -1.164612 0.1048295 -11.10958 1.12685e-28 2.36266e-24
## AT1G55673 738.241 -12.792726 1.2289515 -10.40946 2.24484e-25 2.35338e-21
## AT2G22510 1187.885 -1.574584 0.1676953 -9.38956 6.02541e-21 4.21116e-17
## AT1G65970 319.484 1.884921 0.2055918 9.16827 4.80681e-20 2.51961e-16
## AT4G06665 132.199 10.671560 1.2215886 8.73581 2.41926e-18 1.01449e-14
## AT3G11510 6108.350 0.818938 0.0945727 8.65935 4.74441e-18 1.65794e-14
# define up and down-regulated gene lists
up <- rownames(subset(de, log2FoldChange>0 & padj<0.05 ))
dn <- rownames(subset(de, log2FoldChange<0 & padj<0.05 ))
str(up)
## chr [1:715] "AT1G65970" "AT4G06665" "AT3G11510" "AT5G64100" "AT3G22230" ...
str(dn)
## chr [1:727] "AT2G47450" "AT1G55673" "AT2G22510" "AT2G32870" "AT4G38080" ...
# MA plot
sig <-subset(de, padj < 0.05 )
GENESUP <- length(up)
GENESDN <- length(dn)
SUBHEADER = paste(GENESUP, "up, ", GENESDN, "down")
ns <-subset(de, padj > 0.05 )
plot(log2(de$baseMean),de$log2FoldChange,
xlab="log2 basemean", ylab="log2 foldchange",
pch=19, cex=0.5, col="dark gray",
main="smear plot")
points(log2(sig$baseMean),sig$log2FoldChange,
pch=19, cex=0.5, col="red")
mtext(SUBHEADER)
# heatmap
xn <- xx/colSums(xx)*1000000
xf <- xn[which(rownames(xn) %in% rownames(de)[1:50]),]
colfunc <- colorRampPalette(c("blue", "white", "red"))
heatmap.2( as.matrix(xf), col=colfunc(25),scale="row", trace="none",
margins = c(6,6), cexRow=.4, main="Top 50 genes by p-val")
First using plant Reactome downloaded 8th July 2020.
genesets <- gmt_import("plant_reactome.gmt")
m <- mitch_import(x=data.frame(de),DEtype="deseq2")
## The input is a single dataframe; one contrast only. Converting
## it to a list for you.
## Note: Mean no. genes in input = 32833
## Note: no. genes in output = 32833
## Note: estimated proportion of input genes in output = 1
capture.output(
res <- mitch_calc(x=m,genesets=genesets)
, file = "/dev/null", append = FALSE,
type = c("output", "message"), split = FALSE)
## Note: When prioritising by significance (ie: small
## p-values), large effect sizes might be missed.
head(res$enrichment_result,10)
## set setSize pANOVA
## 24 R-ATH-1119533_TCA_cycle_(plant) 28 0.0001099788
## 23 R-ATH-1119519_Calvin_cycle 28 0.0001792785
## 21 R-ATH-1119494_Tryptophan_biosynthesis 19 0.0079117258
## 14 R-ATH-1119452_Galactose_degradation_II 17 0.0184469868
## 43 R-ATH-9639861_Development_of_root_hair 14 0.0307119250
## 37 R-ATH-8879007_Response_to_cold_temperature 20 0.0313007847
## 5 R-ATH-1119325_Sphingolipid_metabolism 14 0.0469060788
## 40 R-ATH-9609573_Tricin_biosynthesis 25 0.0507422284
## 15 R-ATH-1119460_Isoleucine_biosynthesis_from_threonine 11 0.0546246815
## 1 R-ATH-1119273_Lysine_biosynthesis_I 18 0.0550900671
## s.dist p.adjustANOVA
## 24 0.4221411 0.004033766
## 23 0.4089485 0.004033766
## 21 0.3518799 0.118675888
## 14 0.3300739 0.207528602
## 43 0.3334974 0.212296495
## 37 -0.2780771 0.212296495
## 5 0.3066999 0.212296495
## 40 -0.2256881 0.212296495
## 15 0.3346177 0.212296495
## 1 0.2611130 0.212296495
#unlink("reactome_report.html")
#capture.output(
# mitch_report(res,outfile=paste("reactome_report.html"))
# , file = "/dev/null", append = FALSE,
# type = c("output", "message"), split = FALSE)
Next with mapman pathways last modified in 2012 and used in the previous RNA-seq analysis.
genesets <- gmt_import("../ref/Ath_AGI_LOCUS_TAIR10_Aug2012.txt.gmt")
gt <- read.table("../ref/Arabidopsis_thaliana.TAIR10.46.geneaccession2symbol.tsv",
fill=TRUE)
m <- mitch_import(x=data.frame(de),DEtype="deseq2",geneTable=gt)
## The input is a single dataframe; one contrast only. Converting
## it to a list for you.
## Note: Mean no. genes in input = 32833
## Note: no. genes in output = 28250
## Note: estimated proportion of input genes in output = 0.86
capture.output(
res <- mitch_calc(x=m,genesets=genesets)
, file = "/dev/null", append = FALSE,
type = c("output", "message"), split = FALSE)
## Note: When prioritising by significance (ie: small
## p-values), large effect sizes might be missed.
head(res$enrichment_result,30)
## set
## 125 PROTEIN_DEGRADATION_UBIQUITIN_E3_SCF_FBOX
## 9 CELL_ORGANISATION
## 127 PROTEIN_DEGRADATION_UBIQUITIN_PROTEASOM
## 27 DNA_SYNTHESIS/CHROMATIN_STRUCTURE
## 103 NOT_ASSIGNED_NO_ONTOLOGY_PROLINE_RICH_FAMILY
## 219 RNA_REGULATION_OF_TRANSCRIPTION_TRIHELIX,_TRIPLE-HELIX_TRANSCRIPTION_FACTOR_FAMILY
## 14 CELL_WALL_CELL_WALL_PROTEINS_HRGP
## 239 SIGNALLING_LIGHT
## 102 NOT_ASSIGNED_NO_ONTOLOGY_PENTATRICOPEPTIDE_(PPR)_REPEAT-CONTAINING_PROTEIN
## 209 RNA_REGULATION_OF_TRANSCRIPTION_MYB-RELATED_TRANSCRIPTION_FACTOR_FAMILY
## 224 RNA_RNA_BINDING
## 247 SIGNALLING_RECEPTOR_KINASES_DUF_26
## 98 NOT_ASSIGNED_NO_ONTOLOGY_GLYCINE_RICH_PROTEINS
## 270 STRESS_BIOTIC_PR-PROTEINS_PLANT_DEFENSINS
## 221 RNA_REGULATION_OF_TRANSCRIPTION_WRKY_DOMAIN_TRANSCRIPTION_FACTOR_FAMILY
## 192 RNA_REGULATION_OF_TRANSCRIPTION_CCAAT_BOX_BINDING_FACTOR_FAMILY,_HAP2
## 208 RNA_REGULATION_OF_TRANSCRIPTION_MYB_DOMAIN_TRANSCRIPTION_FACTOR_FAMILY
## 195 RNA_REGULATION_OF_TRANSCRIPTION_CHROMATIN_REMODELING_FACTORS
## 269 STRESS_BIOTIC_PR-PROTEINS
## 15 CELL_WALL_CELL_WALL_PROTEINS_LRR
## 97 NOT_ASSIGNED_NO_ONTOLOGY_FORMIN_HOMOLOGY_2_DOMAIN-CONTAINING_PROTEIN
## 112 PROTEIN_ASSEMBLY_AND_COFACTOR_LIGATION
## 172 REDOX_THIOREDOXIN_PDIL
## 188 RNA_REGULATION_OF_TRANSCRIPTION_C2C2(ZN)_DOF_ZINC_FINGER_FAMILY
## 185 RNA_REGULATION_OF_TRANSCRIPTION_BHLH,BASIC_HELIX-LOOP-HELIX_FAMILY
## 201 RNA_REGULATION_OF_TRANSCRIPTION_HB,HOMEOBOX_TRANSCRIPTION_FACTOR_FAMILY
## 20 CELL_WALL_MODIFICATION
## 22 DEVELOPMENT_LATE_EMBRYOGENESIS_ABUNDANT
## 281 TRANSPORT_MAJOR_INTRINSIC_PROTEINS_PIP
## 138 PROTEIN_SYNTHESIS_ELONGATION
## setSize pANOVA s.dist p.adjustANOVA
## 125 556 2.011940e-12 0.1738692 5.935222e-10
## 9 382 1.761914e-11 -0.1999235 2.598824e-09
## 127 60 2.132561e-10 0.4737602 2.097018e-08
## 27 225 4.682686e-10 -0.2406545 3.453481e-08
## 103 45 4.363621e-09 -0.5054549 2.574536e-07
## 219 30 1.166248e-08 -0.6014812 5.734053e-07
## 14 15 3.674806e-08 -0.8207804 1.548668e-06
## 239 92 8.670408e-08 -0.3226421 3.197213e-06
## 102 427 1.128361e-07 0.1493144 3.698515e-06
## 209 45 1.360896e-07 -0.4538722 4.014643e-06
## 224 178 3.989184e-07 -0.2200287 1.069827e-05
## 247 46 1.336056e-06 -0.4117871 3.284472e-05
## 98 79 1.684406e-06 -0.3113971 3.822306e-05
## 270 86 1.840081e-06 0.2973844 3.877312e-05
## 221 72 1.975199e-06 -0.3239594 3.884558e-05
## 192 10 3.980147e-06 -0.8421246 6.831554e-05
## 208 159 4.102042e-06 -0.2114637 6.831554e-05
## 195 37 4.173855e-06 -0.4370709 6.831554e-05
## 269 243 4.399984e-06 -0.1707664 6.831554e-05
## 15 19 1.065272e-05 -0.5833695 1.571276e-04
## 97 19 1.241390e-05 -0.5789604 1.743858e-04
## 112 24 1.896575e-05 0.5041658 2.543134e-04
## 172 13 4.794537e-05 0.6510634 6.149515e-04
## 188 33 6.718656e-05 -0.4008018 8.258347e-04
## 185 145 1.013444e-04 -0.1868451 1.195863e-03
## 201 86 1.345658e-04 -0.2380344 1.526805e-03
## 20 62 1.399250e-04 0.2795178 1.528811e-03
## 22 26 1.797492e-04 0.4242952 1.893786e-03
## 281 14 1.954453e-04 0.5748437 1.988151e-03
## 138 30 2.028307e-04 0.3918167 1.994502e-03
unlink("mapman_report.html")
capture.output(
mitch_report(res,outfile=paste("mapman_report.html"))
, file = "/dev/null", append = FALSE,
type = c("output", "message"), split = FALSE)
## Dataset saved as " /tmp/RtmpNvXGhB/./mapman_report.RData ".
##
##
## processing file: mitch.Rmd
## Contour plot does not apply to unidimensional analysis.
## output file: /mnt/bfx4/tohidul/baseline/dge/mitch.knit.md
##
## Output created: /tmp/RtmpNvXGhB/mitch_report.html
So you know what version of R and packages was used.
sessionInfo()
## R version 4.0.2 (2020-06-22)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.4 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] pkgload_1.1.0 GGally_2.0.0
## [3] ggplot2_3.3.2 beeswarm_0.2.3
## [5] gtools_3.8.2 tibble_3.0.1
## [7] dplyr_1.0.0 echarts4r_0.3.2
## [9] gplots_3.0.3 mitch_1.0.6
## [11] DESeq2_1.28.1 SummarizedExperiment_1.18.1
## [13] DelayedArray_0.14.0 matrixStats_0.56.0
## [15] Biobase_2.48.0 GenomicRanges_1.40.0
## [17] GenomeInfoDb_1.24.2 IRanges_2.22.2
## [19] S4Vectors_0.26.1 BiocGenerics_0.34.0
## [21] reshape2_1.4.4
##
## loaded via a namespace (and not attached):
## [1] bitops_1.0-6 bit64_0.9-7 RColorBrewer_1.1-2
## [4] rprojroot_1.3-2 tools_4.0.2 backports_1.1.8
## [7] R6_2.4.1 KernSmooth_2.23-17 DBI_1.1.0
## [10] colorspace_1.4-1 withr_2.2.0 tidyselect_1.1.0
## [13] gridExtra_2.3 bit_1.1-15.2 compiler_4.0.2
## [16] desc_1.2.0 caTools_1.18.0 scales_1.1.1
## [19] genefilter_1.70.0 stringr_1.4.0 digest_0.6.25
## [22] rmarkdown_2.3 XVector_0.28.0 pkgconfig_2.0.3
## [25] htmltools_0.5.0 fastmap_1.0.1 highr_0.8
## [28] htmlwidgets_1.5.1 rlang_0.4.6 RSQLite_2.2.0
## [31] shiny_1.5.0 generics_0.0.2 jsonlite_1.7.0
## [34] BiocParallel_1.22.0 RCurl_1.98-1.2 magrittr_1.5
## [37] GenomeInfoDbData_1.2.3 Matrix_1.2-18 Rcpp_1.0.4.6
## [40] munsell_0.5.0 lifecycle_0.2.0 stringi_1.4.6
## [43] yaml_2.2.1 MASS_7.3-51.6 zlibbioc_1.34.0
## [46] plyr_1.8.6 grid_4.0.2 blob_1.2.1
## [49] gdata_2.18.0 promises_1.1.1 crayon_1.3.4
## [52] lattice_0.20-41 splines_4.0.2 annotate_1.66.0
## [55] locfit_1.5-9.4 knitr_1.29 pillar_1.4.4
## [58] geneplotter_1.66.0 XML_3.99-0.3 glue_1.4.1
## [61] evaluate_0.14 vctrs_0.3.1 httpuv_1.5.4
## [64] testthat_2.3.2 gtable_0.3.0 purrr_0.3.4
## [67] reshape_0.8.8 assertthat_0.2.1 xfun_0.15
## [70] mime_0.9 xtable_1.8-4 later_1.1.0.1
## [73] survival_3.2-3 pbmcapply_1.5.0 AnnotationDbi_1.50.1
## [76] memoise_1.1.0 ellipsis_0.3.1
Bray NL, Pimentel H, Melsted P, Pachter L. Near-optimal probabilistic RNA-seq quantification [published correction appears in Nat Biotechnol. 2016 Aug 9;34(8):888]. Nat Biotechnol. 2016;34(5):525-527. doi:10.1038/nbt.3519
Jiang H, Lei R, Ding SW, Zhu S. Skewer: a fast and accurate adapter trimmer for next-generation sequencing paired-end reads. BMC Bioinformatics. 2014;15:182. Published 2014 Jun 12. doi:10.1186/1471-2105-15-182
Love MI, Huber W, Anders S. Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2. Genome Biol. 2014;15(12):550. doi:10.1186/s13059-014-0550-8
Kaspi A, Ziemann M. mitch: multi-contrast pathway enrichment for multi-omics and single-cell profiling data. BMC Genomics. 2020;21(1):447. Published 2020 Jun 29. doi:10.1186/s12864-020-06856-9