Introduction

The goal here is to perform enrichment analysis with the human and mouse DUX data.

library("mitch")
library("DESeq2")
library("edgeR")
library("eulerr")
library("UpSetR")
library("gplots")

Prepare Jones data

MCM: strain: ACTA1-MCM/+; treatment: no tamoxifen (CONTROL)
FLExD: strain: FLExDUX4/+treatment: no tamoxifen (DO NOT USE)
dTGM: strain: FLExDUX4/+;MCM/+ treatment: one 5 mg/kg IP injection of tamoxifen (CASE - MILD)
dTGS: FLExDUX4/+;MCM/+ treatment: two 10 mg/kg IP injections of tamoxifen on consecutive days (CASE - SEVERE)

# mouse GSE122562
#URL="https://ftp.ncbi.nlm.nih.gov/geo/series/GSE122nnn/GSE122562/suppl/GSE122562_gene_counts$
j <- read.table("GSE122562_gene_counts_table.tsv.gz",header=TRUE,row.names=1)
j[,1]<-NULL
ss <- as.data.frame(colnames(j))
ss$mcm <- grepl("MCM",ss[,1])*1
ss$flexd <- grepl("FLExD",ss[,1])*1
ss$dtgm <- grepl("dTGM",ss[,1])*1
ss$dtgs <- grepl("dTGS",ss[,1])*1
rownames(ss) <- ss[,1]
ss[,1]<-NULL
# here set the contrast
sss <- subset(ss,mcm==1|dtgm==1)
design <- model.matrix(~sss$dtgm)
rownames(design) <- rownames(sss)
jj <- j[,which(colnames(j) %in% rownames(design) )]
jj <- jj[which(rowMeans(jj)>10),]
z <- DGEList(counts=jj)
z <- calcNormFactors(z)
z <- estimateDisp(z, design,robust=TRUE,prior.df=1)
fit <- glmFit(z, design)
lrt<-glmLRT(fit)
dge<-as.data.frame(topTags(lrt,n=Inf))
dge$dispersion<-lrt$dispersion
dge<-merge(dge,lrt$fitted.values,by='row.names')
rownames(dge)=dge$Row.names
dge$Row.names=NULL
dge<-dge[order(dge$PValue),]
head(dge,10)

##                        logFC   logCPM        LR        PValue           FDR
## ENSMUSG00000040322  4.145430 5.524643 1519.4858  0.000000e+00  0.000000e+00
## ENSMUSG00000020437  5.432810 5.174086 1224.5266 2.851232e-268 2.142131e-264
## ENSMUSG00000024542  3.011320 4.212642 1220.4921 2.146997e-267 1.075359e-263
## ENSMUSG00000068011  6.345711 4.265444 1127.4326 3.602781e-247 1.353385e-243
## ENSMUSG00000032122  5.995664 4.925190 1100.8525 2.155748e-241 6.478453e-238
## ENSMUSG00000041346  3.280202 4.239581 1091.7976 2.002776e-239 5.015619e-236
## ENSMUSG00000036862  3.382167 5.069340 1003.9379 2.502092e-220 5.370918e-217
## ENSMUSG00000026734 11.103505 2.353322  975.5494 3.706949e-214 6.962577e-211
## ENSMUSG00000045659  4.679723 3.771261  949.1380 2.042257e-208 3.409661e-205
## ENSMUSG00000044447  4.194787 4.501589  922.5110 1.253915e-202 1.884132e-199
##                      dispersion   MCM.1127  MCM.1246  MCM.1349 dTGM.1236
## ENSMUSG00000040322 9.765625e-05 106.953690 115.04141 116.90420 3086.0216
## ENSMUSG00000020437 6.420286e-03  35.321164  37.99211  38.60729 2492.0470
## ENSMUSG00000024542 1.366409e-02  88.027714  94.68427  96.21743 1157.3595
## ENSMUSG00000068011 4.623586e-02   9.935764  10.68710  10.86014 1328.8862
## ENSMUSG00000032122 1.382939e-02  20.184843  21.71120  22.06275 2107.9158
## ENSMUSG00000041346 5.424354e-03  75.786040  81.51689  82.83684 1200.7650
## ENSMUSG00000036862 8.246348e-03 127.368057 136.99948 139.21783 2164.8470
## ENSMUSG00000026734 1.053648e-02   0.000000   0.00000   0.00000  341.6463
## ENSMUSG00000045659 1.500921e-02  21.781858  23.42898  23.80835  913.2603
## ENSMUSG00000044447 1.449207e-02  50.481515  54.29887  55.17810 1508.7009
##                    dTGM.1308 dTGM.1331
## ENSMUSG00000040322 2825.7735 2985.2536
## ENSMUSG00000020437 2281.8895 2410.6741
## ENSMUSG00000024542 1059.7579 1119.5682
## ENSMUSG00000068011 1216.8195 1285.4940
## ENSMUSG00000032122 1930.1525 2039.0859
## ENSMUSG00000041346 1099.5029 1161.5564
## ENSMUSG00000036862 1982.2827 2094.1581
## ENSMUSG00000026734  312.8349  330.4906
## ENSMUSG00000045659  836.2439  883.4396
## ENSMUSG00000044447 1381.4702 1459.4372

jm <- dge

sss <- subset(ss,mcm==1|dtgs==1)
design <- model.matrix(~sss$dtgs)
rownames(design) <- rownames(sss)
jj <- j[,which(colnames(j) %in% rownames(design) )]
jj <- jj[which(rowMeans(jj)>10),]
z <- DGEList(counts=jj)
z <- calcNormFactors(z)
z <- estimateDisp(z, design,robust=TRUE,prior.df=1)
fit <- glmFit(z, design)
lrt<-glmLRT(fit)
dge<-as.data.frame(topTags(lrt,n=Inf))
dge$dispersion<-lrt$dispersion
dge<-merge(dge,lrt$fitted.values,by='row.names')
rownames(dge)=dge$Row.names
dge$Row.names=NULL
dge<-dge[order(dge$PValue),]
head(dge,10)

##                        logFC   logCPM       LR        PValue           FDR
## ENSMUSG00000091747  4.823213 5.166640 1497.693  0.000000e+00  0.000000e+00
## ENSMUSG00000015950  4.166462 4.573994 1416.039 6.871679e-310 5.411791e-306
## ENSMUSG00000020914  6.852232 5.797438 1376.245 3.050561e-301 1.601646e-297
## ENSMUSG00000034768 -3.849882 4.480923 1351.926 5.876642e-296 2.314075e-292
## ENSMUSG00000001131  6.137768 4.449557 1318.287 1.199992e-288 3.392601e-285
## ENSMUSG00000067455  5.348949 4.239762 1318.139 1.292337e-288 3.392601e-285
## ENSMUSG00000046345 -6.893317 4.273700 1280.119 2.363838e-280 5.318973e-277
## ENSMUSG00000038777 -3.024598 4.951953 1248.254 1.988524e-273 3.915156e-270
## ENSMUSG00000062727  7.368636 4.605333 1242.285 3.942227e-272 6.899335e-269
## ENSMUSG00000064288  5.360316 4.050966 1204.621 6.039794e-264 9.513280e-261
##                     dispersion   MCM.1127    MCM.1246    MCM.1349  dTGS.1047
## ENSMUSG00000091747 0.007308578  41.103312   44.205182   45.086409 2668.09650
## ENSMUSG00000015950 0.007362846  41.806168   44.961078   45.857374 1721.21140
## ENSMUSG00000020914 0.009077381  16.055621   17.267262   17.611483 4265.89294
## ENSMUSG00000034768 0.065224750 731.422240  786.619167  802.300364  115.94310
## ENSMUSG00000001131 0.001835045  10.080105   10.840802   11.056913 1636.66914
## ENSMUSG00000067455 0.023280503  14.886713   16.010142   16.329303 1395.64270
## ENSMUSG00000046345 0.018311514 676.562091  727.618986  742.124019   12.85318
## ENSMUSG00000038777 0.012181762 958.006760 1030.302933 1050.841949  269.30376
## ENSMUSG00000062727 0.011519154   4.794572    5.156395    5.259188 1842.33886
## ENSMUSG00000064288 0.028101258  12.890427   13.863206   14.139568 1218.99264
##                     dTGS.1066  dTGS.1079
## ENSMUSG00000091747 2561.92516 2524.21890
## ENSMUSG00000015950 1652.71938 1628.39476
## ENSMUSG00000020914 4096.14062 4035.85387
## ENSMUSG00000034768  111.32938  109.69084
## ENSMUSG00000001131 1571.54130 1548.41145
## ENSMUSG00000067455 1340.10601 1320.38241
## ENSMUSG00000046345   12.34171   12.16007
## ENSMUSG00000038777  258.58738  254.78150
## ENSMUSG00000062727 1769.02683 1742.99040
## ENSMUSG00000064288 1170.48537 1153.25823

js <- dge

Now to see how similar these contrasts are. The overlap is really high, therefore I suggest only using the moderate contrast in downstream mitch analysis.

jm_up <- rownames(subset(jm,FDR<0.05 & logFC>0))
jm_dn <- rownames(subset(jm,FDR<0.05 & logFC<0))

js_up <- rownames(subset(js,FDR<0.05 & logFC>0))
js_dn <- rownames(subset(js,FDR<0.05 & logFC<0))

v1 <- list("mod up"=jm_up, "mod dn"=jm_dn,
  "sev up"=js_up,"sev dn"=js_dn)

plot(euler(v1),quantities = TRUE)

Read in Watt data

# human 
h <- read.table("dux2_edgeR_edger.tsv",header=TRUE,row.names=1)
rownames(h) <- sapply(strsplit(rownames(h),"_"),"[[",1)

# mouse simple
# we're not using the factorial
ms <- read.table("EVvsDUX_wDOX_paired.tsv",header=TRUE,row.names=1)
rownames(ms) <- sapply(strsplit(rownames(ms),"_"),"[[",1)

# ortholog mapping
orth <- read.table("mouse2human.txt.sort")

# gene name mapping human
gth <- orth[,c(1,3)]

# gene name mapping human
gtm <- orth[,c(2,3)]

Read in Watt data

jm_up <- rownames(subset(jm,FDR<0.05 & logFC>0))
jm_dn <- rownames(subset(jm,FDR<0.05 & logFC<0))

h_up <- rownames(subset(h,FDR<0.05 & logFC>0))
h_dn <- rownames(subset(h,FDR<0.05 & logFC<0))

ms_up <- rownames(subset(ms,FDR<0.05 & logFC>0))
ms_dn <- rownames(subset(ms,FDR<0.05 & logFC<0))

l <- list("jm_up"=jm_up,"jm_dn"=jm_dn,"h_up"=h_up,"h_dn"=h_dn,"ms_up"=ms_up,"ms_dn"=ms_dn)
barplot(unlist(lapply(l,length)),main='no. DEGs')

# here are the values
unlist(lapply(l,length))

## jm_up jm_dn  h_up  h_dn ms_up ms_dn 
##  4744  4253  3578  3619  1902  1495

# convert lists to mouse gene names
jm_up <- unique(gtm[which(gtm$V2 %in% jm_up ),2])
jm_dn <- unique(gtm[which(gtm$V2 %in% jm_dn ),2])

h_up <- unique(gth[which(gth$V1 %in% h_up ),2])
h_dn <- unique(gth[which(gth$V1 %in% h_dn ),2])

ms_up <- unique(gtm[which(gtm$V2 %in% ms_up ),2])
ms_dn <- unique(gtm[which(gtm$V2 %in% ms_dn ),2])

v1 <- list("mod up"=jm_up, 
  "mod dn"=jm_dn,
  "h up"=h_up,
  "h dn"=h_dn,
  "ms up"=ms_up,
  "ms dn"=ms_dn )

plot(euler(v1),quantities = TRUE)

upset(fromList(v1), order.by = "freq", nsets=6)

This is a bit confusing, so we will do separate Venns, comparing each mouse data with the human data. We can see that the concordance with human is higher with the Watt data as compared to the Jones data. Here, I define concordance ratio as the number of FDR genes that are commonly dysregulated divided by the number of genes with discordant expression.

v1 <- list("mod up"=jm_up,
  "mod dn"=jm_dn,
  "h up"=h_up,
  "h dn"=h_dn)

plot(euler(v1),quantities = TRUE)

j_concordance <- (length(intersect(h_up,jm_up))+length(intersect(h_dn,jm_dn)))  / 
  (length(intersect(h_up,jm_dn))+length(intersect(h_dn,jm_up)))

j_concordance

## [1] 1.09033

v1 <- list( "h up"=h_up,
  "h dn"=h_dn,
  "ms up"=ms_up,
  "ms dn"=ms_dn )

plot(euler(v1),quantities = TRUE)

ms_concordance <- (length(intersect(h_up,ms_up))+length(intersect(h_dn,ms_dn)))  /  
  (length(intersect(h_up,ms_dn))+length(intersect(h_dn,ms_up)))

ms_concordance

## [1] 1.899254

barplot(c(j_concordance,ms_concordance),names.arg=c("jones","watt"),main="concordance ratio")

Get gene sets

#download.file("https://reactome.org/download/current/ReactomePathways.gmt.zip", destfile="ReactomePathways.gmt.zip")
#unzip("ReactomePathways.gmt.zip")
genesets <- gmt_import("ReactomePathways.gmt")

Prepare data for running mitch

The mitch_import function creates

hh <- mitch_import(x=h,DEtype="edger",geneTable=gth)

## The input is a single dataframe; one contrast only. Converting
##         it to a list for you.

## Note: Mean no. genes in input = 15664

## Note: no. genes in output = 12281

## Note: estimated proportion of input genes in output = 0.784

mm <- list("ms"=ms,"jm"=jm)
mmm <- mitch_import(x=mm,DEtype="edger",geneTable=gtm)

## Note: Mean no. genes in input = 13889

## Note: no. genes in output = 11677

## Note: estimated proportion of input genes in output = 0.841

mg <- merge(hh,mmm,by=0)
rownames(mg) <- mg[,1]
mg[,1]<-NULL
colnames(mg) <- c("human","mouse","jmod")
dim(mg)

## [1] 10591     3

head(mg)

##             human      mouse        jmod
## A4GALT -14.988939  0.8182771   5.5001019
## AAAS    -2.435804  0.3877972   0.7776159
## AACS    -1.581711  4.1420212  11.1737053
## AAED1   -1.286093 -0.8848853 -39.4294484
## AAGAB   -1.722220 -1.3839648   0.2777522
## AAK1    14.833940  0.4269837  -1.5398326

mg <- apply(mg,2, function(x) {
  x[which(x==Inf)] <- 300 ;  x[which(x==-Inf)] <- -300   ; return(x) 
} )

# pearson
round(cor(mg,method='p'),3)

##        human mouse   jmod
## human  1.000 0.177 -0.003
## mouse  0.177 1.000  0.123
## jmod  -0.003 0.123  1.000

# spearman
round(cor(mg,method='s'),3)

##       human mouse  jmod
## human 1.000 0.169 0.032
## mouse 0.169 1.000 0.487
## jmod  0.032 0.487 1.000

colfunc <- colorRampPalette(c("white", "red"))
heatmap.2(cor(mg,method='s'),
  cellnote=round(cor(mg,method='s'),3),
  trace="none",scale="none",margins=c(15,15),
  col=colfunc(25), notecex=2 , main="Spearman")

Here we run mitch unidimensional and then create Venn diagrams of the FDR gene sets.

hmm <- mitch_import( h,DEtype="edger",geneTable=gth )

## The input is a single dataframe; one contrast only. Converting
##         it to a list for you.

## Note: Mean no. genes in input = 15664

## Note: no. genes in output = 12281

## Note: estimated proportion of input genes in output = 0.784

hres <- mitch_calc(x=hmm,genesets=genesets,priority="effect",resrows=100)

## Note: Enrichments with large effect sizes may not be
##             statistically significant.

hres_up <- subset(hres$enrichment_result,p.adjustANOVA<0.02 & s.dist > 0)$set
hres_dn <- subset(hres$enrichment_result,p.adjustANOVA<0.02 & s.dist < 0)$set

jmm <- mitch_import( jm,DEtype="edger",geneTable=gtm )

## The input is a single dataframe; one contrast only. Converting
##         it to a list for you.

## Note: Mean no. genes in input = 15026

## Note: no. genes in output = 12858

## Note: estimated proportion of input genes in output = 0.856

jres <- mitch_calc(x=jmm,genesets=genesets,priority="effect",resrows=100)

## Note: Enrichments with large effect sizes may not be
##             statistically significant.

jres_up <- subset(jres$enrichment_result,p.adjustANOVA<0.02 & s.dist > 0)$set
jres_dn <- subset(jres$enrichment_result,p.adjustANOVA<0.02 & s.dist < 0)$set

msm <- mitch_import( ms,DEtype="edger",geneTable=gtm )

## The input is a single dataframe; one contrast only. Converting
##         it to a list for you.

## Note: Mean no. genes in input = 12752

## Note: no. genes in output = 11830

## Note: estimated proportion of input genes in output = 0.928

mres <- mitch_calc(x=msm,genesets=genesets,priority="effect",resrows=100)

## Note: Enrichments with large effect sizes may not be
##             statistically significant.

mres_up <- subset(mres$enrichment_result,p.adjustANOVA<0.02 & s.dist > 0)$set
mres_dn <- subset(mres$enrichment_result,p.adjustANOVA<0.02 & s.dist < 0)$set

v1 <- list("h up"=hres_up,
  "h dn"=hres_dn,
  "j up"=jres_up,
  "j dn"=jres_dn)

plot(euler(v1),quantities = TRUE)

v1 <- list("h up"=hres_up,
  "h dn"=hres_dn,
  "m up"=mres_up,
  "m dn"=mres_dn)

plot(euler(v1),quantities = TRUE)

Here we run multivariate mitch with different prioritisation schemes.

# effect size
res1 <- mitch_calc(x=mg,genesets=genesets,priority="effect",resrows=100)

## Note: Enrichments with large effect sizes may not be 
##             statistically significant.

x <- res1$enrichment_result[1:100,c(4:6)]
rownames(x) <- res1$enrichment_result$set[1:nrow(x)]

colfunc <- colorRampPalette(c("blue","white", "red"))

heatmap.2(as.matrix(x),col=colfunc(30),margins=c(5,25),trace="none",
  scale="none",cexCol=1,cexRow=0.25)

# discordance
res2 <- mitch_calc(x=mg,genesets=genesets,priority="SD",resrows=100)

## Note: Prioritisation by SD after selecting sets with 
##             p.adjustMANOVA<=0.05.

x <- res2$enrichment_result[1:100,c(4:6)]
rownames(x) <- res2$enrichment_result$set[1:nrow(x)]

colfunc <- colorRampPalette(c("blue","white", "red"))

heatmap.2(as.matrix(x),col=colfunc(30),margins=c(5,25),trace="none",
  scale="none",cexCol=1, cexRow=0.25)

# significance
res3 <- mitch_calc(x=mg,genesets=genesets,priority="significance",resrows=100)

## Note: When prioritising by significance (ie: small 
##             p-values), large effect sizes might be missed.

x <- res3$enrichment_result[1:100,c(4:6)]
rownames(x) <- res3$enrichment_result$set[1:nrow(x)]

colfunc <- colorRampPalette(c("blue","white", "red"))

heatmap.2(as.matrix(x),col=colfunc(30),margins=c(5,25),trace="none",
  scale="none",cexCol=1, cexRow=0.25)

unlink("mitch_plots_simple_eff.html")

capture.output(
    mitch_report(res1,"mitch_plots_simple_eff.html")
  ,file = "/dev/null", append = FALSE,
  type = c("output", "message"), split = FALSE)

## Dataset saved as " /tmp/RtmprhnUTX/mitch_plots_simple_eff.rds ".

## 
## 
## processing file: mitch.Rmd

## output file: /mnt/bfx6/bfx/kevinwatt/dux_mitch/mitch.knit.md

## 
## Output created: /tmp/RtmprhnUTX/mitch_report.html

#res2 <- mitch_calc(x=mg,genesets=genesets,priority="significance")
#head(res2$enrichment_result,20)
#unlink("mitch_plots_simple_sig.html")
#mitch_report(res2,"mitch_plots_simple_sig.html")

Session information

sessionInfo()

## R version 4.0.3 (2020-10-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.5 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] pkgload_1.1.0               GGally_2.0.0               
##  [3] ggplot2_3.3.2               reshape2_1.4.4             
##  [5] beeswarm_0.2.3              gtools_3.8.2               
##  [7] tibble_3.0.4                dplyr_1.0.2                
##  [9] echarts4r_0.3.3             gplots_3.1.0               
## [11] UpSetR_1.4.0                eulerr_6.1.0               
## [13] edgeR_3.30.3                limma_3.44.3               
## [15] DESeq2_1.28.1               SummarizedExperiment_1.18.2
## [17] DelayedArray_0.14.1         matrixStats_0.57.0         
## [19] Biobase_2.48.0              GenomicRanges_1.40.0       
## [21] GenomeInfoDb_1.24.2         IRanges_2.22.2             
## [23] S4Vectors_0.26.1            BiocGenerics_0.34.0        
## [25] mitch_1.0.10               
## 
## loaded via a namespace (and not attached):
##  [1] bitops_1.0-6           bit64_4.0.5            RColorBrewer_1.1-2    
##  [4] rprojroot_1.3-2        backports_1.1.10       tools_4.0.3           
##  [7] R6_2.4.1               KernSmooth_2.23-17     DBI_1.1.0             
## [10] colorspace_1.4-1       withr_2.3.0            tidyselect_1.1.0      
## [13] gridExtra_2.3          bit_4.0.4              compiler_4.0.3        
## [16] desc_1.2.0             labeling_0.4.2         caTools_1.18.0        
## [19] scales_1.1.1           genefilter_1.70.0      stringr_1.4.0         
## [22] digest_0.6.26          rmarkdown_2.5          XVector_0.28.0        
## [25] pkgconfig_2.0.3        htmltools_0.5.0        highr_0.8             
## [28] fastmap_1.0.1          htmlwidgets_1.5.2      rlang_0.4.8           
## [31] RSQLite_2.2.1          shiny_1.5.0            generics_0.0.2        
## [34] farver_2.0.3           jsonlite_1.7.1         BiocParallel_1.22.0   
## [37] RCurl_1.98-1.2         magrittr_1.5           GenomeInfoDbData_1.2.3
## [40] Matrix_1.2-18          Rcpp_1.0.5             munsell_0.5.0         
## [43] lifecycle_0.2.0        stringi_1.5.3          yaml_2.2.1            
## [46] MASS_7.3-53            zlibbioc_1.34.0        plyr_1.8.6            
## [49] grid_4.0.3             blob_1.2.1             promises_1.1.1        
## [52] crayon_1.3.4           lattice_0.20-41        splines_4.0.3         
## [55] annotate_1.66.0        polylabelr_0.2.0       locfit_1.5-9.4        
## [58] knitr_1.30             pillar_1.4.6           geneplotter_1.66.0    
## [61] XML_3.99-0.5           glue_1.4.2             evaluate_0.14         
## [64] vctrs_0.3.4            httpuv_1.5.4           testthat_2.3.2        
## [67] gtable_0.3.0           purrr_0.3.4            polyclip_1.10-0       
## [70] assertthat_0.2.1       reshape_0.8.8          xfun_0.18             
## [73] mime_0.9               xtable_1.8-4           later_1.1.0.1         
## [76] survival_3.2-7         AnnotationDbi_1.50.3   memoise_1.1.0         
## [79] ellipsis_0.3.1

Multi-contrast enrichment analysis of DUX data

Mark Ziemann

2020-10-29

Introduction

Prepare Jones data

Read in Watt data

Read in Watt data

Get gene sets

Prepare data for running mitch

Session information