The goal here is to perform enrichment analysis with the human and mouse DUX data.
library("mitch")
library("DESeq2")
library("edgeR")
library("eulerr")
library("UpSetR")
library("gplots")
MCM: strain: ACTA1-MCM/+; treatment: no tamoxifen (CONTROL)
FLExD: strain: FLExDUX4/+treatment: no tamoxifen (DO NOT USE)
dTGM: strain: FLExDUX4/+;MCM/+ treatment: one 5 mg/kg IP injection of tamoxifen (CASE - MILD)
dTGS: FLExDUX4/+;MCM/+ treatment: two 10 mg/kg IP injections of tamoxifen on consecutive days (CASE - SEVERE)
# mouse GSE122562
#URL="https://ftp.ncbi.nlm.nih.gov/geo/series/GSE122nnn/GSE122562/suppl/GSE122562_gene_counts$
j <- read.table("GSE122562_gene_counts_table.tsv.gz",header=TRUE,row.names=1)
j[,1]<-NULL
ss <- as.data.frame(colnames(j))
ss$mcm <- grepl("MCM",ss[,1])*1
ss$flexd <- grepl("FLExD",ss[,1])*1
ss$dtgm <- grepl("dTGM",ss[,1])*1
ss$dtgs <- grepl("dTGS",ss[,1])*1
rownames(ss) <- ss[,1]
ss[,1]<-NULL
# here set the contrast
sss <- subset(ss,mcm==1|dtgm==1)
design <- model.matrix(~sss$dtgm)
rownames(design) <- rownames(sss)
jj <- j[,which(colnames(j) %in% rownames(design) )]
jj <- jj[which(rowMeans(jj)>10),]
z <- DGEList(counts=jj)
z <- calcNormFactors(z)
z <- estimateDisp(z, design,robust=TRUE,prior.df=1)
fit <- glmFit(z, design)
lrt<-glmLRT(fit)
dge<-as.data.frame(topTags(lrt,n=Inf))
dge$dispersion<-lrt$dispersion
dge<-merge(dge,lrt$fitted.values,by='row.names')
rownames(dge)=dge$Row.names
dge$Row.names=NULL
dge<-dge[order(dge$PValue),]
head(dge,10)
## logFC logCPM LR PValue FDR
## ENSMUSG00000040322 4.145430 5.524643 1519.4858 0.000000e+00 0.000000e+00
## ENSMUSG00000020437 5.432810 5.174086 1224.5266 2.851232e-268 2.142131e-264
## ENSMUSG00000024542 3.011320 4.212642 1220.4921 2.146997e-267 1.075359e-263
## ENSMUSG00000068011 6.345711 4.265444 1127.4326 3.602781e-247 1.353385e-243
## ENSMUSG00000032122 5.995664 4.925190 1100.8525 2.155748e-241 6.478453e-238
## ENSMUSG00000041346 3.280202 4.239581 1091.7976 2.002776e-239 5.015619e-236
## ENSMUSG00000036862 3.382167 5.069340 1003.9379 2.502092e-220 5.370918e-217
## ENSMUSG00000026734 11.103505 2.353322 975.5494 3.706949e-214 6.962577e-211
## ENSMUSG00000045659 4.679723 3.771261 949.1380 2.042257e-208 3.409661e-205
## ENSMUSG00000044447 4.194787 4.501589 922.5110 1.253915e-202 1.884132e-199
## dispersion MCM.1127 MCM.1246 MCM.1349 dTGM.1236
## ENSMUSG00000040322 9.765625e-05 106.953690 115.04141 116.90420 3086.0216
## ENSMUSG00000020437 6.420286e-03 35.321164 37.99211 38.60729 2492.0470
## ENSMUSG00000024542 1.366409e-02 88.027714 94.68427 96.21743 1157.3595
## ENSMUSG00000068011 4.623586e-02 9.935764 10.68710 10.86014 1328.8862
## ENSMUSG00000032122 1.382939e-02 20.184843 21.71120 22.06275 2107.9158
## ENSMUSG00000041346 5.424354e-03 75.786040 81.51689 82.83684 1200.7650
## ENSMUSG00000036862 8.246348e-03 127.368057 136.99948 139.21783 2164.8470
## ENSMUSG00000026734 1.053648e-02 0.000000 0.00000 0.00000 341.6463
## ENSMUSG00000045659 1.500921e-02 21.781858 23.42898 23.80835 913.2603
## ENSMUSG00000044447 1.449207e-02 50.481515 54.29887 55.17810 1508.7009
## dTGM.1308 dTGM.1331
## ENSMUSG00000040322 2825.7735 2985.2536
## ENSMUSG00000020437 2281.8895 2410.6741
## ENSMUSG00000024542 1059.7579 1119.5682
## ENSMUSG00000068011 1216.8195 1285.4940
## ENSMUSG00000032122 1930.1525 2039.0859
## ENSMUSG00000041346 1099.5029 1161.5564
## ENSMUSG00000036862 1982.2827 2094.1581
## ENSMUSG00000026734 312.8349 330.4906
## ENSMUSG00000045659 836.2439 883.4396
## ENSMUSG00000044447 1381.4702 1459.4372
jm <- dge
sss <- subset(ss,mcm==1|dtgs==1)
design <- model.matrix(~sss$dtgs)
rownames(design) <- rownames(sss)
jj <- j[,which(colnames(j) %in% rownames(design) )]
jj <- jj[which(rowMeans(jj)>10),]
z <- DGEList(counts=jj)
z <- calcNormFactors(z)
z <- estimateDisp(z, design,robust=TRUE,prior.df=1)
fit <- glmFit(z, design)
lrt<-glmLRT(fit)
dge<-as.data.frame(topTags(lrt,n=Inf))
dge$dispersion<-lrt$dispersion
dge<-merge(dge,lrt$fitted.values,by='row.names')
rownames(dge)=dge$Row.names
dge$Row.names=NULL
dge<-dge[order(dge$PValue),]
head(dge,10)
## logFC logCPM LR PValue FDR
## ENSMUSG00000091747 4.823213 5.166640 1497.693 0.000000e+00 0.000000e+00
## ENSMUSG00000015950 4.166462 4.573994 1416.039 6.871679e-310 5.411791e-306
## ENSMUSG00000020914 6.852232 5.797438 1376.245 3.050561e-301 1.601646e-297
## ENSMUSG00000034768 -3.849882 4.480923 1351.926 5.876642e-296 2.314075e-292
## ENSMUSG00000001131 6.137768 4.449557 1318.287 1.199992e-288 3.392601e-285
## ENSMUSG00000067455 5.348949 4.239762 1318.139 1.292337e-288 3.392601e-285
## ENSMUSG00000046345 -6.893317 4.273700 1280.119 2.363838e-280 5.318973e-277
## ENSMUSG00000038777 -3.024598 4.951953 1248.254 1.988524e-273 3.915156e-270
## ENSMUSG00000062727 7.368636 4.605333 1242.285 3.942227e-272 6.899335e-269
## ENSMUSG00000064288 5.360316 4.050966 1204.621 6.039794e-264 9.513280e-261
## dispersion MCM.1127 MCM.1246 MCM.1349 dTGS.1047
## ENSMUSG00000091747 0.007308578 41.103312 44.205182 45.086409 2668.09650
## ENSMUSG00000015950 0.007362846 41.806168 44.961078 45.857374 1721.21140
## ENSMUSG00000020914 0.009077381 16.055621 17.267262 17.611483 4265.89294
## ENSMUSG00000034768 0.065224750 731.422240 786.619167 802.300364 115.94310
## ENSMUSG00000001131 0.001835045 10.080105 10.840802 11.056913 1636.66914
## ENSMUSG00000067455 0.023280503 14.886713 16.010142 16.329303 1395.64270
## ENSMUSG00000046345 0.018311514 676.562091 727.618986 742.124019 12.85318
## ENSMUSG00000038777 0.012181762 958.006760 1030.302933 1050.841949 269.30376
## ENSMUSG00000062727 0.011519154 4.794572 5.156395 5.259188 1842.33886
## ENSMUSG00000064288 0.028101258 12.890427 13.863206 14.139568 1218.99264
## dTGS.1066 dTGS.1079
## ENSMUSG00000091747 2561.92516 2524.21890
## ENSMUSG00000015950 1652.71938 1628.39476
## ENSMUSG00000020914 4096.14062 4035.85387
## ENSMUSG00000034768 111.32938 109.69084
## ENSMUSG00000001131 1571.54130 1548.41145
## ENSMUSG00000067455 1340.10601 1320.38241
## ENSMUSG00000046345 12.34171 12.16007
## ENSMUSG00000038777 258.58738 254.78150
## ENSMUSG00000062727 1769.02683 1742.99040
## ENSMUSG00000064288 1170.48537 1153.25823
js <- dge
Now to see how similar these contrasts are. The overlap is really high, therefore I suggest only using the moderate contrast in downstream mitch analysis.
jm_up <- rownames(subset(jm,FDR<0.05 & logFC>0))
jm_dn <- rownames(subset(jm,FDR<0.05 & logFC<0))
js_up <- rownames(subset(js,FDR<0.05 & logFC>0))
js_dn <- rownames(subset(js,FDR<0.05 & logFC<0))
v1 <- list("mod up"=jm_up, "mod dn"=jm_dn,
"sev up"=js_up,"sev dn"=js_dn)
plot(euler(v1),quantities = TRUE)
# human
h <- read.table("dux2_edgeR_edger.tsv",header=TRUE,row.names=1)
rownames(h) <- sapply(strsplit(rownames(h),"_"),"[[",1)
# mouse simple
# we're not using the factorial
ms <- read.table("EVvsDUX_wDOX_paired.tsv",header=TRUE,row.names=1)
rownames(ms) <- sapply(strsplit(rownames(ms),"_"),"[[",1)
# ortholog mapping
orth <- read.table("mouse2human.txt.sort")
# gene name mapping human
gth <- orth[,c(1,3)]
# gene name mapping human
gtm <- orth[,c(2,3)]
jm_up <- rownames(subset(jm,FDR<0.05 & logFC>0))
jm_dn <- rownames(subset(jm,FDR<0.05 & logFC<0))
h_up <- rownames(subset(h,FDR<0.05 & logFC>0))
h_dn <- rownames(subset(h,FDR<0.05 & logFC<0))
ms_up <- rownames(subset(ms,FDR<0.05 & logFC>0))
ms_dn <- rownames(subset(ms,FDR<0.05 & logFC<0))
l <- list("jm_up"=jm_up,"jm_dn"=jm_dn,"h_up"=h_up,"h_dn"=h_dn,"ms_up"=ms_up,"ms_dn"=ms_dn)
barplot(unlist(lapply(l,length)),main='no. DEGs')
# here are the values
unlist(lapply(l,length))
## jm_up jm_dn h_up h_dn ms_up ms_dn
## 4744 4253 3578 3619 1902 1495
# convert lists to mouse gene names
jm_up <- unique(gtm[which(gtm$V2 %in% jm_up ),2])
jm_dn <- unique(gtm[which(gtm$V2 %in% jm_dn ),2])
h_up <- unique(gth[which(gth$V1 %in% h_up ),2])
h_dn <- unique(gth[which(gth$V1 %in% h_dn ),2])
ms_up <- unique(gtm[which(gtm$V2 %in% ms_up ),2])
ms_dn <- unique(gtm[which(gtm$V2 %in% ms_dn ),2])
v1 <- list("mod up"=jm_up,
"mod dn"=jm_dn,
"h up"=h_up,
"h dn"=h_dn,
"ms up"=ms_up,
"ms dn"=ms_dn )
plot(euler(v1),quantities = TRUE)
upset(fromList(v1), order.by = "freq", nsets=6)
This is a bit confusing, so we will do separate Venns, comparing each mouse data with the human data. We can see that the concordance with human is higher with the Watt data as compared to the Jones data. Here, I define concordance ratio as the number of FDR genes that are commonly dysregulated divided by the number of genes with discordant expression.
v1 <- list("mod up"=jm_up,
"mod dn"=jm_dn,
"h up"=h_up,
"h dn"=h_dn)
plot(euler(v1),quantities = TRUE)
j_concordance <- (length(intersect(h_up,jm_up))+length(intersect(h_dn,jm_dn))) /
(length(intersect(h_up,jm_dn))+length(intersect(h_dn,jm_up)))
j_concordance
## [1] 1.09033
v1 <- list( "h up"=h_up,
"h dn"=h_dn,
"ms up"=ms_up,
"ms dn"=ms_dn )
plot(euler(v1),quantities = TRUE)
ms_concordance <- (length(intersect(h_up,ms_up))+length(intersect(h_dn,ms_dn))) /
(length(intersect(h_up,ms_dn))+length(intersect(h_dn,ms_up)))
ms_concordance
## [1] 1.899254
barplot(c(j_concordance,ms_concordance),names.arg=c("jones","watt"),main="concordance ratio")
#download.file("https://reactome.org/download/current/ReactomePathways.gmt.zip", destfile="ReactomePathways.gmt.zip")
#unzip("ReactomePathways.gmt.zip")
genesets <- gmt_import("ReactomePathways.gmt")
The mitch_import function creates
hh <- mitch_import(x=h,DEtype="edger",geneTable=gth)
## The input is a single dataframe; one contrast only. Converting
## it to a list for you.
## Note: Mean no. genes in input = 15664
## Note: no. genes in output = 12281
## Note: estimated proportion of input genes in output = 0.784
mm <- list("ms"=ms,"jm"=jm)
mmm <- mitch_import(x=mm,DEtype="edger",geneTable=gtm)
## Note: Mean no. genes in input = 13889
## Note: no. genes in output = 11677
## Note: estimated proportion of input genes in output = 0.841
mg <- merge(hh,mmm,by=0)
rownames(mg) <- mg[,1]
mg[,1]<-NULL
colnames(mg) <- c("human","mouse","jmod")
dim(mg)
## [1] 10591 3
head(mg)
## human mouse jmod
## A4GALT -14.988939 0.8182771 5.5001019
## AAAS -2.435804 0.3877972 0.7776159
## AACS -1.581711 4.1420212 11.1737053
## AAED1 -1.286093 -0.8848853 -39.4294484
## AAGAB -1.722220 -1.3839648 0.2777522
## AAK1 14.833940 0.4269837 -1.5398326
mg <- apply(mg,2, function(x) {
x[which(x==Inf)] <- 300 ; x[which(x==-Inf)] <- -300 ; return(x)
} )
# pearson
round(cor(mg,method='p'),3)
## human mouse jmod
## human 1.000 0.177 -0.003
## mouse 0.177 1.000 0.123
## jmod -0.003 0.123 1.000
# spearman
round(cor(mg,method='s'),3)
## human mouse jmod
## human 1.000 0.169 0.032
## mouse 0.169 1.000 0.487
## jmod 0.032 0.487 1.000
colfunc <- colorRampPalette(c("white", "red"))
heatmap.2(cor(mg,method='s'),
cellnote=round(cor(mg,method='s'),3),
trace="none",scale="none",margins=c(15,15),
col=colfunc(25), notecex=2 , main="Spearman")
Here we run mitch unidimensional and then create Venn diagrams of the FDR gene sets.
hmm <- mitch_import( h,DEtype="edger",geneTable=gth )
## The input is a single dataframe; one contrast only. Converting
## it to a list for you.
## Note: Mean no. genes in input = 15664
## Note: no. genes in output = 12281
## Note: estimated proportion of input genes in output = 0.784
hres <- mitch_calc(x=hmm,genesets=genesets,priority="effect",resrows=100)
## Note: Enrichments with large effect sizes may not be
## statistically significant.
hres_up <- subset(hres$enrichment_result,p.adjustANOVA<0.02 & s.dist > 0)$set
hres_dn <- subset(hres$enrichment_result,p.adjustANOVA<0.02 & s.dist < 0)$set
jmm <- mitch_import( jm,DEtype="edger",geneTable=gtm )
## The input is a single dataframe; one contrast only. Converting
## it to a list for you.
## Note: Mean no. genes in input = 15026
## Note: no. genes in output = 12858
## Note: estimated proportion of input genes in output = 0.856
jres <- mitch_calc(x=jmm,genesets=genesets,priority="effect",resrows=100)
## Note: Enrichments with large effect sizes may not be
## statistically significant.
jres_up <- subset(jres$enrichment_result,p.adjustANOVA<0.02 & s.dist > 0)$set
jres_dn <- subset(jres$enrichment_result,p.adjustANOVA<0.02 & s.dist < 0)$set
msm <- mitch_import( ms,DEtype="edger",geneTable=gtm )
## The input is a single dataframe; one contrast only. Converting
## it to a list for you.
## Note: Mean no. genes in input = 12752
## Note: no. genes in output = 11830
## Note: estimated proportion of input genes in output = 0.928
mres <- mitch_calc(x=msm,genesets=genesets,priority="effect",resrows=100)
## Note: Enrichments with large effect sizes may not be
## statistically significant.
mres_up <- subset(mres$enrichment_result,p.adjustANOVA<0.02 & s.dist > 0)$set
mres_dn <- subset(mres$enrichment_result,p.adjustANOVA<0.02 & s.dist < 0)$set
v1 <- list("h up"=hres_up,
"h dn"=hres_dn,
"j up"=jres_up,
"j dn"=jres_dn)
plot(euler(v1),quantities = TRUE)
v1 <- list("h up"=hres_up,
"h dn"=hres_dn,
"m up"=mres_up,
"m dn"=mres_dn)
plot(euler(v1),quantities = TRUE)
Here we run multivariate mitch with different prioritisation schemes.
# effect size
res1 <- mitch_calc(x=mg,genesets=genesets,priority="effect",resrows=100)
## Note: Enrichments with large effect sizes may not be
## statistically significant.
x <- res1$enrichment_result[1:100,c(4:6)]
rownames(x) <- res1$enrichment_result$set[1:nrow(x)]
colfunc <- colorRampPalette(c("blue","white", "red"))
heatmap.2(as.matrix(x),col=colfunc(30),margins=c(5,25),trace="none",
scale="none",cexCol=1,cexRow=0.25)
# discordance
res2 <- mitch_calc(x=mg,genesets=genesets,priority="SD",resrows=100)
## Note: Prioritisation by SD after selecting sets with
## p.adjustMANOVA<=0.05.
x <- res2$enrichment_result[1:100,c(4:6)]
rownames(x) <- res2$enrichment_result$set[1:nrow(x)]
colfunc <- colorRampPalette(c("blue","white", "red"))
heatmap.2(as.matrix(x),col=colfunc(30),margins=c(5,25),trace="none",
scale="none",cexCol=1, cexRow=0.25)
# significance
res3 <- mitch_calc(x=mg,genesets=genesets,priority="significance",resrows=100)
## Note: When prioritising by significance (ie: small
## p-values), large effect sizes might be missed.
x <- res3$enrichment_result[1:100,c(4:6)]
rownames(x) <- res3$enrichment_result$set[1:nrow(x)]
colfunc <- colorRampPalette(c("blue","white", "red"))
heatmap.2(as.matrix(x),col=colfunc(30),margins=c(5,25),trace="none",
scale="none",cexCol=1, cexRow=0.25)
unlink("mitch_plots_simple_eff.html")
capture.output(
mitch_report(res1,"mitch_plots_simple_eff.html")
,file = "/dev/null", append = FALSE,
type = c("output", "message"), split = FALSE)
## Dataset saved as " /tmp/RtmprhnUTX/mitch_plots_simple_eff.rds ".
##
##
## processing file: mitch.Rmd
## output file: /mnt/bfx6/bfx/kevinwatt/dux_mitch/mitch.knit.md
##
## Output created: /tmp/RtmprhnUTX/mitch_report.html
#res2 <- mitch_calc(x=mg,genesets=genesets,priority="significance")
#head(res2$enrichment_result,20)
#unlink("mitch_plots_simple_sig.html")
#mitch_report(res2,"mitch_plots_simple_sig.html")
sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.5 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] pkgload_1.1.0 GGally_2.0.0
## [3] ggplot2_3.3.2 reshape2_1.4.4
## [5] beeswarm_0.2.3 gtools_3.8.2
## [7] tibble_3.0.4 dplyr_1.0.2
## [9] echarts4r_0.3.3 gplots_3.1.0
## [11] UpSetR_1.4.0 eulerr_6.1.0
## [13] edgeR_3.30.3 limma_3.44.3
## [15] DESeq2_1.28.1 SummarizedExperiment_1.18.2
## [17] DelayedArray_0.14.1 matrixStats_0.57.0
## [19] Biobase_2.48.0 GenomicRanges_1.40.0
## [21] GenomeInfoDb_1.24.2 IRanges_2.22.2
## [23] S4Vectors_0.26.1 BiocGenerics_0.34.0
## [25] mitch_1.0.10
##
## loaded via a namespace (and not attached):
## [1] bitops_1.0-6 bit64_4.0.5 RColorBrewer_1.1-2
## [4] rprojroot_1.3-2 backports_1.1.10 tools_4.0.3
## [7] R6_2.4.1 KernSmooth_2.23-17 DBI_1.1.0
## [10] colorspace_1.4-1 withr_2.3.0 tidyselect_1.1.0
## [13] gridExtra_2.3 bit_4.0.4 compiler_4.0.3
## [16] desc_1.2.0 labeling_0.4.2 caTools_1.18.0
## [19] scales_1.1.1 genefilter_1.70.0 stringr_1.4.0
## [22] digest_0.6.26 rmarkdown_2.5 XVector_0.28.0
## [25] pkgconfig_2.0.3 htmltools_0.5.0 highr_0.8
## [28] fastmap_1.0.1 htmlwidgets_1.5.2 rlang_0.4.8
## [31] RSQLite_2.2.1 shiny_1.5.0 generics_0.0.2
## [34] farver_2.0.3 jsonlite_1.7.1 BiocParallel_1.22.0
## [37] RCurl_1.98-1.2 magrittr_1.5 GenomeInfoDbData_1.2.3
## [40] Matrix_1.2-18 Rcpp_1.0.5 munsell_0.5.0
## [43] lifecycle_0.2.0 stringi_1.5.3 yaml_2.2.1
## [46] MASS_7.3-53 zlibbioc_1.34.0 plyr_1.8.6
## [49] grid_4.0.3 blob_1.2.1 promises_1.1.1
## [52] crayon_1.3.4 lattice_0.20-41 splines_4.0.3
## [55] annotate_1.66.0 polylabelr_0.2.0 locfit_1.5-9.4
## [58] knitr_1.30 pillar_1.4.6 geneplotter_1.66.0
## [61] XML_3.99-0.5 glue_1.4.2 evaluate_0.14
## [64] vctrs_0.3.4 httpuv_1.5.4 testthat_2.3.2
## [67] gtable_0.3.0 purrr_0.3.4 polyclip_1.10-0
## [70] assertthat_0.2.1 reshape_0.8.8 xfun_0.18
## [73] mime_0.9 xtable_1.8-4 later_1.1.0.1
## [76] survival_3.2-7 AnnotationDbi_1.50.3 memoise_1.1.0
## [79] ellipsis_0.3.1