Source codes: https://github.com/markziemann/tohidul_rnaseq
Here we have n=3 control (H2O; “H”) and n=3 seaweed based fertiliser treatments:
S80 ANDP
S94 DP
S93 AN
Reads underwent quality trimming using Skewer (Jiang et al, 2014). I mapped the reads to the Arabidopsis transcriptome (TAIR10/Ensembl47) using Kallisto (Bray et al, 2016). Expression counts were loaded into R and then DE analysis was performed with DESeq2 (Love et al, 2014). Enrichment analysis was performed using Plant Reactome genesets with the Mitch package (Kaspi & Ziemann 2020).
suppressPackageStartupMessages({
library("tidyverse")
library("reshape2")
library("gplots")
library("DESeq2")
library("mitch")
library("kableExtra")
library("eulerr")
library("UpSetR")
library("vioplot")
library("clusterProfiler")
})
Here we load the data in from the aligner. Take note that the columns are arranged in order.
tmp <- read.table("3col.tsv.gz")
x <- as.data.frame(acast(tmp, V2~V1, value.var="V3"))
x$gene <- sapply(strsplit(rownames(x),"\\."),"[[",1)
xx <- aggregate(. ~ gene, x, sum)
rownames(xx) <- xx$gene
xx$gene = NULL
xx <- xx[,order(colnames(xx))]
The sample sheet is read in and put in order as well.
ss <- read.table("samplesheet.tsv",header=TRUE)
ss <- ss[order(ss$Run),]
ss$label <- gsub("-0hpi","",ss$SampleName)
# check that names are in order
colnames(xx) == ss$Run
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
# change data header
colnames(xx) <- ss$label
kbl(ss) %>% kable_styling()
Run | SampleName | Treatment | label | |
---|---|---|---|---|
6 | SRR11404271 | S80-0hpi_R3 | Seaweed_16280 | S80_R3 |
5 | SRR11404272 | S80-0hpi_R2 | Seaweed_16280 | S80_R2 |
4 | SRR11404273 | S80-0hpi_R1 | Seaweed_16280 | S80_R1 |
12 | SRR11404287 | S94-0hpi_R3 | Seaweed_15294 | S94_R3 |
11 | SRR11404288 | S94-0hpi_R2 | Seaweed_15294 | S94_R2 |
10 | SRR11404289 | S94-0hpi_R1 | Seaweed_15294 | S94_R1 |
3 | SRR11404301 | H-0hpi_R3 | Water | H_R3 |
9 | SRR11404304 | S93-0hpi_R3 | Seaweed_15293 | S93_R3 |
8 | SRR11404305 | S93-0hpi_R2 | Seaweed_15293 | S93_R2 |
7 | SRR11404306 | S93-0hpi_R1 | Seaweed_15293 | S93_R1 |
2 | SRR11404312 | H-0hpi_R2 | Water | H_R2 |
1 | SRR11404313 | H-0hpi_R1 | Water | H_R1 |
MDS is just like PCA. The more similar (correlated) the data sets are the closer they will appear on the scatterplot.
cols <- as.numeric(factor(ss$Treatment))
plot(cmdscale(dist(t(xx))), xlab="Coordinate 1", ylab="Coordinate 2",
col=cols, cex=4 , main="MDS plot")
text(cmdscale(dist(t(xx))), labels=ss$label )
#colfunc <- colorRampPalette(c("white", "yellow", "orange" , "red", "darkred"))
colfunc <- colorRampPalette(c("white","blue"))
heatmap.2(cor(xx,method="pearson"),col=colfunc(25),trace="none",
scale="none",margin=c(10,10),main="Pearson correlation")
heatmap.2(cor(xx,method="spearman"),col=colfunc(25),trace="none",
scale="none",margin=c(10,10),main="Spearman correlation")
run_de <- function(ss,xx){
xx <- xx[which(rowMeans(xx)>10),]
y <- round(xx)
# MDS
mds <- cmdscale(dist(t(y)))
XMAX=max(mds[,1])*1.1
XMIN=min(mds[,1])*1.1
plot( mds , xlab="Coordinate 1", ylab="Coordinate 2",
type = "n" , xlim=c(XMIN,XMAX),main="MDS plot",bty="n")
text(mds, labels=colnames(y) )
# DE
dds <- DESeqDataSetFromMatrix(countData=y, colData = ss, design = ~ trt)
dds <- DESeq(dds)
de <- DESeq2::results(dds)
de <- de[order(de$pvalue),]
up <- rownames(subset(de, log2FoldChange>0 & padj<0.05 ))
dn <- rownames(subset(de, log2FoldChange<0 & padj<0.05 ))
str(up)
str(dn)
# MA plot
sig <-subset(de, padj < 0.05 )
GENESUP <- length(up)
GENESDN <- length(dn)
SUBHEADER = paste(GENESUP, "up, ", GENESDN, "down")
ns <-subset(de, padj > 0.05 )
plot(log2(de$baseMean),de$log2FoldChange,
xlab="log2 basemean", ylab="log2(fold change)",
pch=19, cex=1, col="dark gray",
ylim = c(-6,6),
main="smear plot")
points(log2(sig$baseMean),sig$log2FoldChange,
pch=19, cex=1, col="red")
mtext(SUBHEADER)
# Volcano
NLAB=12
plot(de$log2FoldChange, -log10(de$pvalue) ,
xlab="-log2(fold change)", ylab="-log10(p-value)",
xlim = c(-6,6),
pch=19, cex=1, col="dark gray",
main="smear plot")
grid()
mtext(SUBHEADER)
points(sig$log2FoldChange, -log10(sig$pvalue) ,
pch=19, cex=1, col="red")
text(head(sig$log2FoldChange,NLAB), head(-log10(sig$pvalue),NLAB)+0.5 ,
labels=head(rownames(sig),NLAB) , cex=1 )
# organellar genomes
chl <- de[grep("ATCG",rownames(de)),]
mit <- de[grep("ATMG",rownames(de)),]
NLAB=12
plot(de$log2FoldChange, -log10(de$pvalue) ,
xlab="-log2(fold change)", ylab="-log10(p-value)",
xlim = c(-6,6),
pch=19, cex=1, col="dark gray",
main="smear plot: organellar genomes")
grid()
abline(h=tail(-log10(sig$pvalue),1),lty=2,lwd=2)
mtext("Grey: nuclear, Red: plastid, Blue: mito")
points(chl$log2FoldChange, -log10(chl$pvalue) ,
pch=19, cex=1, col="red")
points(mit$log2FoldChange, -log10(mit$pvalue) ,
pch=19, cex=1, col="blue")
# summary plot
chl_up <- nrow(subset(chl,padj<0.05 & log2FoldChange>1))
chl_dn <- nrow(subset(chl,padj<0.05 & log2FoldChange<1))
mit_up <- nrow(subset(mit,padj<0.05 & log2FoldChange>1))
mit_dn <- nrow(subset(mit,padj<0.05 & log2FoldChange<1))
sig_up <- length(up)
sig_dn <- length(dn)
n_up <- sig_up - ( chl_up + mit_up )
n_dn <- sig_dn - ( chl_dn + mit_dn )
# barplot
par(mar=c(5,10,2,2))
mycounts <- c("nuclear up"=sig_up,
"chl up"=chl_up,
"mit up"=mit_up,
"nuclear dn"=sig_dn,
"chl dn"=chl_dn,
"mit dn"=mit_dn)
mycounts
barplot(mycounts,
horiz=TRUE,
las=1,
main="number of DE genes")
par(mar=c(5.1,4.1,4.1,2.1))
# heatmap
yn <- y/colSums(y)*1000000
yf <- yn[which(rownames(yn) %in% rownames(de)[1:50]),]
mycols <- gsub("0","yellow",ss$trt)
mycols <- gsub("1","orange",mycols)
colfunc <- colorRampPalette(c("blue", "white", "red"))
heatmap.2( as.matrix(yf), col=colfunc(25),scale="row",
ColSideColors =mycols ,trace="none",
margin = c(10,10), cexRow=0.6, cexCol=0.8 , main="Top 50 genes by p-val")
mtext("yellow=ctrl, orange=trt")
return(de)
}
mitch_barplot <- function(res){
sig <- res$enrichment_result
sig <- head( sig[order(sig$pANOVA),] ,30)
sig <- sig[order(sig$s.dist),]
par(mar=c(3,25,1,1)); barplot(sig$s.dist,horiz=TRUE,las=2,cex.names = 0.6,cex.axis = 0.6,
names.arg=sig$set,main="Enrichment score") ;grid()
}
ss80 <- subset(ss,Treatment=="Seaweed_16280"|Treatment=="Water")
ss80$trt <- as.numeric(grepl("Seaweed",ss80$Treatment))
xx80 <- xx[,which(colnames(xx) %in% ss80$label)]
Here, were using DESeq2 to perform differential expression analysis to understand gene expression changes caused by the different formulations. Enrichment analysis is performed at the end using mitch with all 4 timepoints.
de80 <- run_de(ss80,xx80)
## converting counts to integer mode
## the design formula contains one or more numeric variables with integer values,
## specifying a model with increasing fold change for higher values.
## did you mean for this to be a factor? if so, first convert
## this variable to a factor using the factor() function
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## chr [1:729] "AT1G65970" "AT4G06665" "AT3G11510" "AT5G64100" "AT1G24996" ...
## chr [1:730] "AT2G47450" "AT1G55673" "AT2G22510" "AT2G32870" "AT2G18328" ...
de80_top <- head(as.data.frame(de80),40)
kbl(de80_top[order(-de80_top$log2FoldChange),]) %>% kable_styling()
baseMean | log2FoldChange | lfcSE | stat | pvalue | padj | |
---|---|---|---|---|---|---|
AT4G06665 | 132.02756 | 10.6703428 | 1.2214435 | 8.735846 | 0 | 0 |
AT2G08750 | 37.84520 | 8.8747367 | 1.2919905 | 6.869042 | 0 | 0 |
AT1G24996 | 323.82079 | 4.3818892 | 0.5618073 | 7.799630 | 0 | 0 |
AT3G20810 | 1371.83363 | 3.1462473 | 0.4775527 | 6.588273 | 0 | 0 |
AT5G06760 | 1048.96459 | 2.9414243 | 0.4414667 | 6.662845 | 0 | 0 |
AT4G33905 | 330.48540 | 2.1883867 | 0.3352305 | 6.528005 | 0 | 0 |
AT1G65970 | 319.83244 | 1.8879852 | 0.2028938 | 9.305287 | 0 | 0 |
AT3G48510 | 71.68292 | 1.7527497 | 0.2646986 | 6.621681 | 0 | 0 |
AT1G19250 | 177.80497 | 1.3027098 | 0.2005740 | 6.494908 | 0 | 0 |
AT3G14440 | 696.62229 | 1.1073515 | 0.1674480 | 6.613105 | 0 | 0 |
AT5G03345 | 831.78940 | 0.9323214 | 0.1398791 | 6.665195 | 0 | 0 |
AT5G64100 | 3886.71544 | 0.9061570 | 0.1091747 | 8.300067 | 0 | 0 |
AT4G12600 | 1344.31545 | 0.8738919 | 0.1317509 | 6.632912 | 0 | 0 |
AT3G11510 | 6114.57364 | 0.8224084 | 0.0964760 | 8.524483 | 0 | 0 |
AT5G42090 | 3764.73635 | 0.7636518 | 0.1042944 | 7.322079 | 0 | 0 |
AT1G14980 | 1578.18877 | 0.7608785 | 0.1126658 | 6.753414 | 0 | 0 |
AT3G22230 | 3405.70403 | 0.7473244 | 0.0963363 | 7.757453 | 0 | 0 |
AT3G09735 | 1220.96912 | 0.7320855 | 0.0981377 | 7.459776 | 0 | 0 |
AT2G44310 | 1560.18430 | 0.6872796 | 0.0950144 | 7.233422 | 0 | 0 |
AT3G18740 | 3328.73251 | 0.6726190 | 0.1004285 | 6.697488 | 0 | 0 |
AT4G09320 | 6585.58229 | 0.6659963 | 0.0894218 | 7.447803 | 0 | 0 |
AT4G20150 | 3164.43679 | 0.6605573 | 0.0950525 | 6.949394 | 0 | 0 |
AT5G02870 | 7649.88412 | 0.6494672 | 0.0860326 | 7.549084 | 0 | 0 |
AT1G55330 | 2376.62233 | 0.6279042 | 0.0917265 | 6.845395 | 0 | 0 |
AT3G53430 | 5234.77439 | 0.6126416 | 0.0824943 | 7.426474 | 0 | 0 |
AT5G57660 | 1896.01283 | -0.7450316 | 0.1135806 | -6.559499 | 0 | 0 |
AT2G32870 | 1214.28389 | -0.8573352 | 0.1021669 | -8.391513 | 0 | 0 |
AT3G47470 | 34652.01704 | -0.8809525 | 0.1335732 | -6.595277 | 0 | 0 |
AT3G15353 | 23919.22730 | -0.9793861 | 0.1511211 | -6.480802 | 0 | 0 |
AT2G18328 | 689.16212 | -1.0454080 | 0.1455697 | -7.181493 | 0 | 0 |
AT4G39720 | 211.17876 | -1.0949131 | 0.1686647 | -6.491655 | 0 | 0 |
AT2G47450 | 3594.05175 | -1.1612794 | 0.1044117 | -11.122115 | 0 | 0 |
AT5G48490 | 1040.34652 | -1.2382333 | 0.1779417 | -6.958647 | 0 | 0 |
AT5G54270 | 14956.45719 | -1.2741526 | 0.1886138 | -6.755352 | 0 | 0 |
AT1G64370 | 5582.52780 | -1.2768304 | 0.1829606 | -6.978719 | 0 | 0 |
AT4G38080 | 3041.08385 | -1.4147394 | 0.1978268 | -7.151404 | 0 | 0 |
AT2G22510 | 1187.84821 | -1.5712553 | 0.1672951 | -9.392116 | 0 | 0 |
AT3G02400 | 210.64282 | -2.0573297 | 0.3175210 | -6.479350 | 0 | 0 |
AT1G15830 | 157.00814 | -2.1186271 | 0.3203694 | -6.613077 | 0 | 0 |
AT1G55673 | 735.08530 | -12.7838910 | 1.2309372 | -10.385494 | 0 | 0 |
write.table(de80,file="de80.tsv",quote=FALSE,sep="\t")
de80_up <- rownames(subset(de80,padj<0.05&log2FoldChange>0))
de80_dn <- rownames(subset(de80,padj<0.05&log2FoldChange<0))
Mapman pathways last modified in 2012 and used in the previous RNA-seq analysis.
genesets <- gmt_import("../baseline/ref/Ath_AGI_LOCUS_TAIR10_Aug2012.txt.gmt")
gt <- read.table("../baseline/ref/Arabidopsis_thaliana.TAIR10.46.geneaccession2symbol.tsv",
fill=TRUE)
m <- mitch_import(x=as.data.frame(de80),DEtype="deseq2",geneTable=gt)
## The input is a single dataframe; one contrast only. Converting
## it to a list for you.
## Note: Mean no. genes in input = 21761
## Note: no. genes in output = 21471
## Note: estimated proportion of input genes in output = 0.987
# significance
res_sig <- mitch_calc(x=m,genesets=genesets,priority="significance")
## Note: When prioritising by significance (ie: small
## p-values), large effect sizes might be missed.
top <- head(subset(res_sig$enrichment_result,p.adjustANOVA<0.05),50)
kbl(head(top,30)) %>% kable_styling()
set | setSize | pANOVA | s.dist | p.adjustANOVA | |
---|---|---|---|---|---|
91 | NOT_ASSIGNED_NO_ONTOLOGY_PENTATRICOPEPTIDE_(PPR)_REPEAT-CONTAINING_PROTEIN | 418 | 0.0000000 | 0.1877951 | 0.0000000 |
114 | PROTEIN_DEGRADATION_UBIQUITIN_PROTEASOM | 60 | 0.0000000 | 0.4827129 | 0.0000000 |
8 | CELL_ORGANISATION | 345 | 0.0000000 | -0.1881141 | 0.0000001 |
243 | STRESS_BIOTIC_PR-PROTEINS | 172 | 0.0000000 | -0.2651555 | 0.0000001 |
25 | DNA_SYNTHESIS/CHROMATIN_STRUCTURE | 204 | 0.0000000 | -0.2396256 | 0.0000002 |
89 | NOT_ASSIGNED_NO_ONTOLOGY_GLYCINE_RICH_PROTEINS | 55 | 0.0000000 | -0.4532720 | 0.0000003 |
92 | NOT_ASSIGNED_NO_ONTOLOGY_PROLINE_RICH_FAMILY | 39 | 0.0000000 | -0.5225783 | 0.0000006 |
112 | PROTEIN_DEGRADATION_UBIQUITIN_E3_SCF_FBOX | 253 | 0.0000000 | 0.2050586 | 0.0000006 |
13 | CELL_WALL_CELL_WALL_PROTEINS_HRGP | 14 | 0.0000002 | -0.8049854 | 0.0000054 |
187 | RNA_REGULATION_OF_TRANSCRIPTION_MYB-RELATED_TRANSCRIPTION_FACTOR_FAMILY | 42 | 0.0000003 | -0.4550153 | 0.0000089 |
197 | RNA_REGULATION_OF_TRANSCRIPTION_TRIHELIX,_TRIPLE-HELIX_TRANSCRIPTION_FACTOR_FAMILY | 29 | 0.0000009 | -0.5265238 | 0.0000222 |
186 | RNA_REGULATION_OF_TRANSCRIPTION_MYB_DOMAIN_TRANSCRIPTION_FACTOR_FAMILY | 121 | 0.0000019 | -0.2507256 | 0.0000421 |
202 | RNA_RNA_BINDING | 164 | 0.0000022 | -0.2140846 | 0.0000458 |
179 | RNA_REGULATION_OF_TRANSCRIPTION_HB,HOMEOBOX_TRANSCRIPTION_FACTOR_FAMILY | 67 | 0.0000025 | -0.3323929 | 0.0000482 |
199 | RNA_REGULATION_OF_TRANSCRIPTION_WRKY_DOMAIN_TRANSCRIPTION_FACTOR_FAMILY | 62 | 0.0000050 | -0.3352826 | 0.0000881 |
215 | SIGNALLING_LIGHT | 89 | 0.0000090 | -0.2722420 | 0.0001501 |
173 | RNA_REGULATION_OF_TRANSCRIPTION_CCAAT_BOX_BINDING_FACTOR_FAMILY,_HAP2 | 10 | 0.0000113 | -0.8018452 | 0.0001767 |
20 | DEVELOPMENT_LATE_EMBRYOGENESIS_ABUNDANT | 18 | 0.0000186 | 0.5826691 | 0.0002765 |
88 | NOT_ASSIGNED_NO_ONTOLOGY_FORMIN_HOMOLOGY_2_DOMAIN-CONTAINING_PROTEIN | 16 | 0.0000421 | -0.5913190 | 0.0005912 |
99 | PROTEIN_ASSEMBLY_AND_COFACTOR_LIGATION | 23 | 0.0000446 | 0.4916157 | 0.0005960 |
154 | REDOX_THIOREDOXIN_PDIL | 13 | 0.0000516 | 0.6483291 | 0.0006559 |
14 | CELL_WALL_CELL_WALL_PROTEINS_LRR | 17 | 0.0000540 | -0.5654999 | 0.0006559 |
126 | PROTEIN_SYNTHESIS_INITIATION | 77 | 0.0000566 | 0.2653602 | 0.0006574 |
90 | NOT_ASSIGNED_NO_ONTOLOGY_HYDROXYPROLINE_RICH_PROTEINS | 53 | 0.0000647 | -0.3171790 | 0.0006936 |
222 | SIGNALLING_RECEPTOR_KINASES_DUF_26 | 40 | 0.0000670 | -0.3642131 | 0.0006936 |
175 | RNA_REGULATION_OF_TRANSCRIPTION_G2-LIKE_TRANSCRIPTION_FACTOR_FAMILY,_GARP | 38 | 0.0000675 | -0.3734919 | 0.0006936 |
125 | PROTEIN_SYNTHESIS_ELONGATION | 29 | 0.0000788 | 0.4235001 | 0.0007794 |
174 | RNA_REGULATION_OF_TRANSCRIPTION_CHROMATIN_REMODELING_FACTORS | 35 | 0.0000935 | -0.3815477 | 0.0008912 |
171 | RNA_REGULATION_OF_TRANSCRIPTION_C2H2_ZINC_FINGER_FAMILY | 100 | 0.0001248 | -0.2219952 | 0.0011488 |
18 | CELL_WALL_MODIFICATION | 55 | 0.0001333 | 0.2977587 | 0.0011863 |
mitch_barplot(res_sig)
# effect size
res_eff <- mitch_calc(x=m,genesets=genesets,priority="effect")
## Note: Enrichments with large effect sizes may not be
## statistically significant.
top <- head(subset(res_eff$enrichment_result,p.adjustANOVA<0.05),50)
kbl(head(top,30)) %>% kable_styling()
set | setSize | pANOVA | s.dist | p.adjustANOVA | |
---|---|---|---|---|---|
13 | CELL_WALL_CELL_WALL_PROTEINS_HRGP | 14 | 0.0000002 | -0.8049854 | 0.0000054 |
173 | RNA_REGULATION_OF_TRANSCRIPTION_CCAAT_BOX_BINDING_FACTOR_FAMILY,_HAP2 | 10 | 0.0000113 | -0.8018452 | 0.0001767 |
154 | REDOX_THIOREDOXIN_PDIL | 13 | 0.0000516 | 0.6483291 | 0.0006559 |
88 | NOT_ASSIGNED_NO_ONTOLOGY_FORMIN_HOMOLOGY_2_DOMAIN-CONTAINING_PROTEIN | 16 | 0.0000421 | -0.5913190 | 0.0005912 |
253 | TRANSPORT_MAJOR_INTRINSIC_PROTEINS_PIP | 13 | 0.0002226 | 0.5912875 | 0.0017483 |
20 | DEVELOPMENT_LATE_EMBRYOGENESIS_ABUNDANT | 18 | 0.0000186 | 0.5826691 | 0.0002765 |
14 | CELL_WALL_CELL_WALL_PROTEINS_LRR | 17 | 0.0000540 | -0.5654999 | 0.0006559 |
161 | RNA_REGULATION_OF_TRANSCRIPTION_ARF,_AUXIN_RESPONSE_FACTOR_FAMILY | 16 | 0.0001897 | -0.5388662 | 0.0015347 |
189 | RNA_REGULATION_OF_TRANSCRIPTION_NUCLEOSOME/CHROMATIN_ASSEMBLY_FACTOR_GROUP | 13 | 0.0008377 | -0.5349556 | 0.0053377 |
197 | RNA_REGULATION_OF_TRANSCRIPTION_TRIHELIX,_TRIPLE-HELIX_TRANSCRIPTION_FACTOR_FAMILY | 29 | 0.0000009 | -0.5265238 | 0.0000222 |
145 | PS_LIGHTREACTION_PHOTOSYSTEM_II_LHC-II | 14 | 0.0007003 | -0.5231393 | 0.0046742 |
92 | NOT_ASSIGNED_NO_ONTOLOGY_PROLINE_RICH_FAMILY | 39 | 0.0000000 | -0.5225783 | 0.0000006 |
244 | STRESS_BIOTIC_PR-PROTEINS_PLANT_DEFENSINS | 15 | 0.0005973 | 0.5118755 | 0.0041850 |
139 | PROTEIN_TARGETING_SECRETORY_PATHWAY_ER | 14 | 0.0010249 | 0.5068010 | 0.0062194 |
193 | RNA_REGULATION_OF_TRANSCRIPTION_PWWP_DOMAIN_PROTEIN | 11 | 0.0036743 | -0.5057782 | 0.0188663 |
209 | SECONDARY_METABOLISM_SIMPLE_PHENOLS | 16 | 0.0006058 | -0.4950827 | 0.0041850 |
99 | PROTEIN_ASSEMBLY_AND_COFACTOR_LIGATION | 23 | 0.0000446 | 0.4916157 | 0.0005960 |
235 | SIGNALLING_RECEPTOR_KINASES_WHEAT_LRK10_LIKE | 12 | 0.0032494 | -0.4906333 | 0.0173518 |
114 | PROTEIN_DEGRADATION_UBIQUITIN_PROTEASOM | 60 | 0.0000000 | 0.4827129 | 0.0000000 |
38 | HORMONE_METABOLISM_GIBBERELIN_INDUCED-REGULATED-RESPONSIVE-ACTIVATED | 12 | 0.0038992 | 0.4811501 | 0.0196430 |
232 | SIGNALLING_RECEPTOR_KINASES_PROLINE_EXTENSIN_LIKE | 15 | 0.0013746 | -0.4771564 | 0.0078087 |
110 | PROTEIN_DEGRADATION_UBIQUITIN_E3_BTB/POZ_CULLIN3_BTB/POZ | 10 | 0.0106845 | 0.4661852 | 0.0413444 |
133 | PROTEIN_SYNTHESIS_RIBOSOME_BIOGENESIS_PRE-RRNA_PROCESSING_AND_MODIFICATIONS_SNORNAS | 16 | 0.0012692 | 0.4653228 | 0.0073670 |
211 | SIGNALLING_14-3-3_PROTEINS | 11 | 0.0077265 | 0.4638143 | 0.0324841 |
187 | RNA_REGULATION_OF_TRANSCRIPTION_MYB-RELATED_TRANSCRIPTION_FACTOR_FAMILY | 42 | 0.0000003 | -0.4550153 | 0.0000089 |
89 | NOT_ASSIGNED_NO_ONTOLOGY_GLYCINE_RICH_PROTEINS | 55 | 0.0000000 | -0.4532720 | 0.0000003 |
125 | PROTEIN_SYNTHESIS_ELONGATION | 29 | 0.0000788 | 0.4235001 | 0.0007794 |
144 | PS_LIGHTREACTION_NADH_DH | 12 | 0.0130143 | 0.4140066 | 0.0476001 |
26 | DNA_SYNTHESIS/CHROMATIN_STRUCTURE_HISTONE_CORE_H2A | 14 | 0.0090832 | 0.4026858 | 0.0367458 |
136 | PROTEIN_TARGETING_MITOCHONDRIA | 32 | 0.0001536 | 0.3865648 | 0.0012818 |
mitch_barplot(res_eff)
Bubble plot with top 50, 30 and 20 gene sets.
My_Theme = theme(
axis.title.x = element_text(size = 16),
axis.text.y = element_text(size = 8),
axis.text.x = element_text(size = 8),
axis.title.y = element_text(size = 16))
top <- head(res_eff$enrichment_result,50)
top <- top[order(top$s.dist),]
top$set <- factor(top$set, levels = top$set[order(top$s.dist)])
ggplot(top, aes(s.dist, set, size = setSize)) + geom_point(aes(colour=-log10(top$p.adjustANOVA))) + My_Theme
top <- head(res_eff$enrichment_result,30)
top <- top[order(top$s.dist),]
top$set <- factor(top$set, levels = top$set[order(top$s.dist)])
ggplot(top, aes(s.dist, set, size = setSize)) + geom_point(aes(colour=-log10(top$p.adjustANOVA))) + My_Theme
top <- head(res_eff$enrichment_result,20)
top <- top[order(top$s.dist),]
top$set <- factor(top$set, levels = top$set[order(top$s.dist)])
ggplot(top, aes(s.dist, set, size = setSize)) + geom_point(aes(colour=-log10(top$p.adjustANOVA))) + My_Theme
Heatmaps of a few top ranked gene sets by effect size.
mysets <- res_eff$input_genesets[which(names(res_eff$input_genesets) %in% head(res_eff$enrichment_result$set,10))]
xxx <- xx[,c(grep("S80",colnames(xx)),grep("H",colnames(xx)))]
xxx <- xxx / colSums(xxx) * 1e6
colfunc <- colorRampPalette(c("blue", "white", "red"))
plots <- lapply(1:length(mysets), function(x) {
at <- gt[which(gt$V2 %in% mysets[[x]]),1]
my_genes <- xxx[which(rownames(xxx) %in% at),]
if(nrow(my_genes)>2) {
heatmap.2(as.matrix(my_genes),trace="none",scale="row",
margin=c(8,10),main=names(mysets[x]),
col=colfunc(25))
}
})
Heatmaps of a few top ranked gene sets by small p-values.
mysets <- res_eff$input_genesets[which(names(res_eff$input_genesets) %in% head(res_sig$enrichment_result$set,10))]
xxx <- xx[,c(grep("S80",colnames(xx)),grep("H",colnames(xx)))]
xxx <- xxx / colSums(xxx) * 1e6
colfunc <- colorRampPalette(c("blue", "white", "red"))
plots <- lapply(1:length(mysets), function(x) {
at <- gt[which(gt$V2 %in% mysets[[x]]),1]
my_genes <- xxx[which(rownames(xxx) %in% at),]
if(nrow(my_genes)>2) {
heatmap.2(as.matrix(my_genes),trace="none",scale="row",
margin=c(8,10),main=names(mysets[x]),
col=colfunc(25))
}
})
Gene sets of interest
mysets <- c("TRANSPORT_CALCIUM",
"REDOX_THIOREDOXIN_PDIL",
"CELL_WALL_CELLULOSE_SYNTHESIS",
"PROTEIN_TARGETING_CHLOROPLAST",
"MITOCHONDRIAL_ELECTRON_TRANSPORT_/_ATP_SYNTHESIS_CYTOCHROME_C")
mysets <- genesets[which(names(genesets) %in% mysets)]
myres <- mitch_calc(x=m,genesets=mysets,priority="effect")
## Note: Enrichments with large effect sizes may not be
## statistically significant.
par(mar=c(5,20,5,2))
vioplot(myres$detailed_sets,horizontal=TRUE,las=2,side="right",cex.axis=0.7)
grid()
Heat map of the gene sets of interest. Values plotted are the CPM after library size correction.
xxx <- xx[,c(grep("S80",colnames(xx)),grep("H",colnames(xx)))]
xxx <- xxx / colSums(xxx) * 1e6
colfunc <- colorRampPalette(c("blue", "white", "red"))
plots <- lapply(1:length(mysets), function(x) {
at <- gt[which(gt$V2 %in% mysets[[x]]),1]
my_genes <- xxx[which(rownames(xxx) %in% at),]
if(nrow(my_genes)>2) {
heatmap.2(as.matrix(my_genes),trace="none",scale="row",
margin=c(8,10),main=names(mysets[x]),
col=colfunc(25))
}
})
unlink("mapman_report_de80_sig.html")
capture.output(
mitch_report(res_sig,outfile=paste("mapman_report_de80_sig.html"))
, file = "/dev/null", append = FALSE,
type = c("output", "message"), split = FALSE)
## Dataset saved as " /tmp/Rtmp1TDkcW/mapman_report_de80_sig.rds ".
##
##
## processing file: mitch.Rmd
## Contour plot does not apply to unidimensional analysis.
## output file: /mnt/bfx4/tohidul/baseline3/mitch.knit.md
##
## Output created: /tmp/Rtmp1TDkcW/mitch_report.html
unlink("mapman_report_de80_eff.html")
capture.output(
mitch_report(res_eff,outfile=paste("mapman_report_de80_eff.html"))
, file = "/dev/null", append = FALSE,
type = c("output", "message"), split = FALSE)
## Dataset saved as " /tmp/Rtmp1TDkcW/mapman_report_de80_eff.rds ".
##
##
## processing file: mitch.Rmd
## Contour plot does not apply to unidimensional analysis.
## output file: /mnt/bfx4/tohidul/baseline3/mitch.knit.md
##
## Output created: /tmp/Rtmp1TDkcW/mitch_report.html
pdf("de80.pdf")
de80 <- run_de(ss80,xx80)
## converting counts to integer mode
## the design formula contains one or more numeric variables with integer values,
## specifying a model with increasing fold change for higher values.
## did you mean for this to be a factor? if so, first convert
## this variable to a factor using the factor() function
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## chr [1:729] "AT1G65970" "AT4G06665" "AT3G11510" "AT5G64100" "AT1G24996" ...
## chr [1:730] "AT2G47450" "AT1G55673" "AT2G22510" "AT2G32870" "AT2G18328" ...
mitch_barplot(res_eff)
par(mar=c(5,20,5,2))
vioplot(myres$detailed_sets,horizontal=TRUE,las=2,side="right",cex.axis=0.7)
dev.off()
## X11cairo
## 2
So you know what version of R and packages was used.
sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.2 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] pkgload_1.2.0 GGally_2.1.1
## [3] beeswarm_0.3.1 gtools_3.8.2
## [5] echarts4r_0.4.0 forcats_0.5.1
## [7] stringr_1.4.0 dplyr_1.0.5
## [9] purrr_0.3.4 readr_1.4.0
## [11] tidyr_1.1.3 tibble_3.1.0
## [13] ggplot2_3.3.3 tidyverse_1.3.0
## [15] clusterProfiler_3.18.1 vioplot_0.3.5
## [17] zoo_1.8-9 sm_2.2-5.6
## [19] UpSetR_1.4.0 eulerr_6.1.0
## [21] kableExtra_1.3.4 mitch_1.2.2
## [23] DESeq2_1.30.0 SummarizedExperiment_1.20.0
## [25] Biobase_2.50.0 MatrixGenerics_1.2.0
## [27] matrixStats_0.58.0 GenomicRanges_1.42.0
## [29] GenomeInfoDb_1.26.0 IRanges_2.24.0
## [31] S4Vectors_0.28.0 BiocGenerics_0.36.0
## [33] gplots_3.1.1 reshape2_1.4.4
##
## loaded via a namespace (and not attached):
## [1] readxl_1.3.1 backports_1.2.1 shadowtext_0.0.7
## [4] fastmatch_1.1-0 systemfonts_1.0.1 plyr_1.8.6
## [7] igraph_1.2.6 splines_4.0.3 BiocParallel_1.24.1
## [10] digest_0.6.27 htmltools_0.5.1.1 GOSemSim_2.16.1
## [13] viridis_0.6.0 GO.db_3.12.1 fansi_0.4.2
## [16] magrittr_2.0.1 memoise_2.0.0 annotate_1.68.0
## [19] graphlayouts_0.7.1 modelr_0.1.8 svglite_2.0.0
## [22] enrichplot_1.10.2 colorspace_2.0-0 blob_1.2.1
## [25] rvest_1.0.0 ggrepel_0.9.1 haven_2.3.1
## [28] xfun_0.22 tcltk_4.0.3 crayon_1.4.1
## [31] RCurl_1.98-1.3 jsonlite_1.7.2 scatterpie_0.1.5
## [34] genefilter_1.72.0 survival_3.2-10 glue_1.4.2
## [37] polyclip_1.10-0 gtable_0.3.0 zlibbioc_1.36.0
## [40] XVector_0.30.0 webshot_0.5.2 DelayedArray_0.16.0
## [43] scales_1.1.1 DOSE_3.16.0 DBI_1.1.1
## [46] Rcpp_1.0.6 viridisLite_0.4.0 xtable_1.8-4
## [49] bit_4.0.4 htmlwidgets_1.5.3 httr_1.4.2
## [52] fgsea_1.16.0 RColorBrewer_1.1-2 ellipsis_0.3.1
## [55] pkgconfig_2.0.3 reshape_0.8.8 XML_3.99-0.6
## [58] farver_2.1.0 dbplyr_2.1.0 sass_0.3.1
## [61] locfit_1.5-9.4 utf8_1.2.1 labeling_0.4.2
## [64] tidyselect_1.1.0 rlang_0.4.10 later_1.1.0.1
## [67] AnnotationDbi_1.52.0 cellranger_1.1.0 munsell_0.5.0
## [70] tools_4.0.3 cachem_1.0.4 cli_2.3.1
## [73] downloader_0.4 generics_0.1.0 RSQLite_2.2.4
## [76] broom_0.7.5 evaluate_0.14 fastmap_1.1.0
## [79] yaml_2.2.1 fs_1.5.0 knitr_1.31
## [82] bit64_4.0.5 tidygraph_1.2.0 caTools_1.18.1
## [85] ggraph_2.0.5 mime_0.10 DO.db_2.9
## [88] xml2_1.3.2 compiler_4.0.3 rstudioapi_0.13
## [91] testthat_3.0.2 reprex_1.0.0 tweenr_1.0.2
## [94] geneplotter_1.68.0 bslib_0.2.4 stringi_1.5.3
## [97] ps_1.6.0 highr_0.8 desc_1.3.0
## [100] lattice_0.20-41 Matrix_1.3-2 vctrs_0.3.6
## [103] pillar_1.5.1 lifecycle_1.0.0 BiocManager_1.30.10
## [106] jquerylib_0.1.3 data.table_1.14.0 cowplot_1.1.1
## [109] bitops_1.0-6 httpuv_1.5.5 qvalue_2.22.0
## [112] R6_2.5.0 promises_1.2.0.1 KernSmooth_2.23-18
## [115] gridExtra_2.3 MASS_7.3-53.1 assertthat_0.2.1
## [118] rprojroot_2.0.2 withr_2.4.1 GenomeInfoDbData_1.2.4
## [121] hms_1.0.0 grid_4.0.3 rmarkdown_2.7
## [124] rvcheck_0.1.8 ggforce_0.3.3 lubridate_1.7.10
## [127] shiny_1.6.0
Bray NL, Pimentel H, Melsted P, Pachter L. Near-optimal probabilistic RNA-seq quantification [published correction appears in Nat Biotechnol. 2016 Aug 9;34(8):888]. Nat Biotechnol. 2016;34(5):525-527. doi:10.1038/nbt.3519
Jiang H, Lei R, Ding SW, Zhu S. Skewer: a fast and accurate adapter trimmer for next-generation sequencing paired-end reads. BMC Bioinformatics. 2014;15:182. Published 2014 Jun 12. doi:10.1186/1471-2105-15-182
Love MI, Huber W, Anders S. Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2. Genome Biol. 2014;15(12):550. doi:10.1186/s13059-014-0550-8
Kaspi A, Ziemann M. mitch: multi-contrast pathway enrichment for multi-omics and single-cell profiling data. BMC Genomics. 2020;21(1):447. Published 2020 Jun 29. doi:10.1186/s12864-020-06856-9