Here the purpose is to demonstrate how the choice of DE tool can impact the results of a study.
Source code: Source code: https://github.com/markziemann/de_compare
View result: http://118.138.234.73/public/blog/deg_compare.html
library("edgeR")
library("DESeq2")
library("eulerr")
library("UpSetR")
This is real data from an ATAC-seq experiment. I have scrubbed the sample and gene names.
x <- read.table("counts.mx",row.names=1,header=TRUE)
xx <- x[which(rowMeans(x)>10),]
str(xx)
## 'data.frame': 35978 obs. of 6 variables:
## $ C1: int 133 126 183 74 207 369 29 51 69 82 ...
## $ C2: int 96 88 111 70 163 239 11 15 44 76 ...
## $ C3: int 278 239 386 172 356 730 38 50 143 197 ...
## $ T1: int 288 264 343 177 353 708 20 45 155 199 ...
## $ T2: int 288 279 394 190 374 683 28 48 152 193 ...
## $ T3: int 244 214 279 137 312 542 35 53 137 165 ...
dim(xx)
## [1] 35978 6
ss <- as.data.frame(colnames(xx))
ss$trt <- as.integer(grepl("T",ss[,1]))
rownames(ss) <- ss[,1]
ss[,1]=NULL
ss
## trt
## C1 0
## C2 0
## C3 0
## T1 1
## T2 1
## T3 1
This will give us an idea about which samples are similar/different.
plotMDS(xx, labels=rownames(ss),col=as.integer(as.factor(ss$trt)),main="MDS")
This was the method that was used in the group when I first got involved in this project.
design <- model.matrix(~ss$trt)
rownames(design) <- rownames(ss)
z <- DGEList(counts=xx)
z <- calcNormFactors(z)
z <- estimateDisp(z, design,robust=TRUE,prior.df=1)
fit <- glmFit(z, design)
lrt<-glmLRT(fit)
dge<-as.data.frame(topTags(lrt,n=Inf))
dge$dispersion<-lrt$dispersion
dge<-merge(dge,lrt$fitted.values,by='row.names')
rownames(dge)=dge$Row.names
dge$Row.names=NULL
dge<-dge[order(dge$PValue),]
head(dge,10)
## logFC logCPM LR PValue FDR dispersion
## Gene10678 -0.9990238 5.403241 124.26043 7.388163e-29 2.658113e-24 9.765625e-05
## Gene34968 -0.8936960 5.630466 75.95619 2.900284e-18 5.217322e-14 9.765625e-05
## Gene21042 -0.9610760 4.410686 72.75256 1.469672e-17 1.762529e-13 5.355870e-03
## Gene25117 -0.7547172 5.256924 60.82050 6.252332e-15 5.623660e-11 6.442848e-03
## Gene26750 -0.9556798 4.128865 59.10887 1.491849e-14 1.073475e-10 4.890034e-03
## Gene17006 -0.7934496 5.548893 55.86682 7.755048e-14 4.650185e-10 2.456894e-03
## Gene15204 -1.0739619 7.077392 51.95923 5.666496e-13 2.912417e-09 5.161940e-02
## Gene3841 -0.9587878 5.175396 48.75033 2.907097e-12 1.307394e-08 6.131114e-02
## Gene40878 -0.6056546 5.461936 48.33269 3.597060e-12 1.437945e-08 9.765625e-05
## Gene17793 -0.6482170 4.909470 47.06795 6.856794e-12 2.466937e-08 3.948173e-03
## C1 C2 C3 T1 T2 T3
## Gene10678 210.6868 135.95119 392.1378 200.62781 207.78752 174.56741
## Gene34968 238.6822 154.01593 444.2438 244.52442 253.25064 212.76210
## Gene21042 104.9374 67.71357 195.3131 102.55419 106.21399 89.23299
## Gene25117 176.4899 113.88470 328.4892 199.08994 206.19477 173.22930
## Gene26750 86.6584 55.91860 161.2917 84.99487 88.02804 73.95453
## Gene17006 220.2282 142.10803 409.8966 241.85830 250.48938 210.44230
## Gene15204 674.5608 435.27805 1255.5166 609.99820 631.76691 530.76294
## Gene3841 177.4402 114.49795 330.2581 173.73967 179.93983 151.17189
## Gene40878 194.6619 125.61071 362.3117 243.51498 252.20518 211.88378
## Gene17793 135.7312 87.58408 252.6276 164.83803 170.72052 143.42652
dge_edger <- dge
sig <- subset(dge_edger,FDR<0.05)
dge_edger_up <- rownames(subset(sig,logFC>0))
dge_edger_dn <- rownames(subset(sig,logFC<0))
length(dge_edger_up)
## [1] 297
length(dge_edger_dn)
## [1] 589
This is a "new" edgeR method which is more conservative.
design <- model.matrix(~ss$trt)
rownames(design) <- rownames(ss)
z <- DGEList(counts=xx)
z <- calcNormFactors(z)
z <- estimateDisp(z, design,robust=TRUE,prior.df=1)
fit <- glmQLFit(z, design)
lrt <- glmQLFTest(fit)
dge<-as.data.frame(topTags(lrt,n=Inf))
dge$dispersion<-lrt$dispersion
dge<-merge(dge,lrt$fitted.values,by='row.names')
rownames(dge)=dge$Row.names
dge$Row.names=NULL
dge<-dge[order(dge$PValue),]
head(dge,10)
## logFC logCPM F PValue FDR dispersion
## Gene3219 -1.3438017 6.368508 94.55504 2.016344e-12 7.254402e-08 0.008468683
## Gene6678 -1.3997961 6.419712 85.59495 8.706031e-12 1.566128e-07 0.008569245
## Gene32136 -1.3710207 7.071626 72.89145 8.359661e-11 8.109481e-07 0.008310851
## Gene38166 -1.1578667 6.035898 71.88163 1.011420e-10 8.109481e-07 0.009763449
## Gene38199 -1.2171883 6.390173 71.23243 1.144245e-10 8.109481e-07 0.008445606
## Gene9230 -1.2636503 6.581057 70.35900 1.352407e-10 8.109481e-07 0.009916487
## Gene17647 -1.1423462 6.120403 65.24816 3.695318e-10 1.899288e-06 0.037033149
## Gene29009 -1.2241973 7.417536 62.45668 6.532462e-10 2.812151e-06 0.022572552
## Gene10678 -0.9882287 5.403241 61.83763 7.427704e-10 2.812151e-06 0.010443288
## Gene1838 -1.1494299 5.182201 61.59287 7.816306e-10 2.812151e-06 0.009346355
## C1 C2 C3 T1 T2 T3
## Gene3219 440.0001 283.9216 818.9438 329.9613 341.7365 287.1013
## Gene6678 460.6099 297.2206 857.3035 332.2650 344.1224 289.1057
## Gene32136 716.6225 462.4195 1333.8033 527.4162 546.2379 458.9079
## Gene38166 336.1733 216.9246 625.6977 286.7744 297.0084 249.5241
## Gene38199 434.5877 280.4291 808.8700 355.8104 368.5080 309.5927
## Gene9230 500.3624 322.8720 931.2923 396.6872 410.8436 345.1598
## Gene17647 354.9538 229.0432 660.6526 306.0764 316.9992 266.3188
## Gene29009 881.2980 568.6806 1640.3032 718.1402 743.7681 624.8579
## Gene10678 209.1027 134.9290 389.1893 200.6154 207.7747 174.5566
## Gene1838 186.8167 120.5483 347.7098 160.2616 165.9808 139.4446
dge_edgerql <- dge
sig <- subset(dge_edgerql,FDR<0.05)
dge_edgerql_up <- rownames(subset(sig,logFC>0))
dge_edgerql_dn <- rownames(subset(sig,logFC<0))
length(dge_edgerql_up)
## [1] 15
length(dge_edgerql_dn)
## [1] 204
This is the method that I prefer using these days.
dds <- DESeqDataSetFromMatrix(countData = xx , colData = ss, design = ~ trt )
## the design formula contains one or more numeric variables with integer values,
## specifying a model with increasing fold change for higher values.
## did you mean for this to be a factor? if so, first convert
## this variable to a factor using the factor() function
res <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
z<- results(res)
vsd <- vst(dds, blind=FALSE)
zz<-cbind(as.data.frame(z),assay(vsd))
dge<-as.data.frame(zz[order(zz$pvalue),])
head(dge,10)
## baseMean log2FoldChange lfcSE stat pvalue
## Gene3219 427.3872 -1.346014 0.1595261 -8.437579 3.239786e-17
## Gene15204 693.6957 -1.077427 0.1350267 -7.979363 1.470899e-15
## Gene40770 1991.4646 -1.242463 0.1579345 -7.866947 3.634010e-15
## Gene17874 909.5136 -1.190549 0.1525483 -7.804408 5.978164e-15
## Gene3100 1052.7539 -1.148181 0.1477635 -7.770398 7.823988e-15
## Gene29009 877.0216 -1.227448 0.1603871 -7.653029 1.962989e-14
## Gene31212 1682.6569 -1.154134 0.1538944 -7.499519 6.405235e-14
## Gene38166 342.7155 -1.167876 0.1646295 -7.093966 1.303219e-12
## Gene38199 434.3977 -1.222031 0.1722766 -7.093425 1.308326e-12
## Gene3365 534.8301 -1.066922 0.1537001 -6.941585 3.877255e-12
## padj C1 C2 C3 T1 T2
## Gene3219 5.328476e-13 9.521740 9.578793 9.574391 8.259974 8.615183
## Gene15204 1.209594e-11 10.068616 10.138880 10.038921 9.016384 9.126365
## Gene40770 1.992285e-11 11.671917 11.541197 11.341701 9.953281 10.456535
## Gene17874 2.458072e-11 10.530428 10.451598 10.396047 9.083131 9.510534
## Gene3100 2.573623e-11 10.740486 10.683742 10.483736 9.341947 9.634867
## Gene29009 5.380879e-11 10.558932 10.359512 10.340573 8.999707 9.451396
## Gene31212 1.504956e-10 11.427866 11.340733 11.006168 9.901164 10.205028
## Gene38166 2.390894e-09 9.205176 9.467670 9.089895 8.288780 8.337280
## Gene38199 2.390894e-09 9.521740 9.639146 9.481741 8.267237 8.733850
## Gene3365 6.376921e-09 9.684369 9.820119 9.765847 8.617673 8.973379
## T3
## Gene3219 8.733535
## Gene15204 9.403446
## Gene40770 10.629467
## Gene17874 9.663835
## Gene3100 9.879878
## Gene29009 9.611707
## Gene31212 10.473437
## Gene38166 8.607571
## Gene38199 8.810524
## Gene3365 9.106249
dge_deseq2 <- dge
sig <- subset(dge,padj<0.05)
dge_deseq2_up <- rownames(subset(sig,log2FoldChange>0))
dge_deseq2_dn <- rownames(subset(sig,log2FoldChange<0))
length(dge_deseq2_up)
## [1] 3
length(dge_deseq2_dn)
## [1] 113
This was suggested to me.
design <- model.matrix(~ss$trt)
v <- voom(xx, design, plot=FALSE)
vfit <- lmFit(v, design)
efit <- eBayes(vfit)
dge <- topTable(efit,n=Inf)
## Removing intercept from test coefficients
head(dge,10)
## logFC AveExpr t P.Value adj.P.Val B
## Gene7673 -1.407783 7.087248 -9.476003 1.211355e-16 4.358211e-12 26.81354
## Gene32136 -1.284966 6.903733 -9.230921 4.942507e-16 8.891075e-12 25.48687
## Gene3219 -1.228192 6.229800 -8.893929 3.374026e-15 4.046356e-11 23.62204
## Gene22288 -1.232098 8.771117 -8.605111 1.726611e-14 1.553001e-10 22.10257
## Gene36720 -1.169483 7.222326 -8.528768 2.652132e-14 1.908368e-10 21.75480
## Gene3636 -1.179525 8.526491 -8.427486 4.679300e-14 2.805864e-10 21.18847
## Gene9230 -1.170781 6.444837 -8.371422 6.402004e-14 3.290447e-10 20.88786
## Gene6678 -1.197501 6.238507 -8.305173 9.264912e-14 4.166662e-10 20.52090
## Gene29009 -1.108660 7.286103 -8.249198 1.265251e-13 5.057912e-10 20.28557
## Gene40770 -1.122954 8.464626 -8.214370 1.535483e-13 5.524361e-10 20.07710
dge_voom <- dge
sig <- subset(dge_voom,adj.P.Val<0.05)
dge_voom_up <- rownames(subset(sig,logFC>0))
dge_voom_dn <- rownames(subset(sig,logFC<0))
length(dge_voom_up)
## [1] 100
length(dge_voom_dn)
## [1] 128
With the "eulerr" project.
v1 <- list("edgeR up"=dge_edger_up, "edgeR dn"=dge_edger_dn,
"DESeq2 up"=dge_deseq2_up,"DESeq2 dn"=dge_deseq2_dn)
plot(euler(v1),quantities = TRUE)
v1 <- list("edgeR up"=dge_edger_up, "edgeR dn"=dge_edger_dn,
"DESeq2 up"=dge_deseq2_up,"DESeq2 dn"=dge_deseq2_dn,
"edgeR_QL up"=dge_edgerql_up, "edgeR_QL dn"=dge_edgerql_dn)
plot(euler(v1),quantities = TRUE)
v1 <- list("edgeR up"=dge_edger_up, "edgeR dn"=dge_edger_dn,
"DESeq2 up"=dge_deseq2_up,"DESeq2 dn"=dge_deseq2_dn,
"Voom up"=dge_voom_up, "Voom dn"=dge_voom_dn)
plot(euler(v1),quantities = TRUE)
This should demonstrate the overlaps between different sets better.
v1 <- list("edgeR up"=dge_edger_up,
"DESeq2 up"=dge_deseq2_up,
"Voom up"=dge_voom_up,
"edgeR_QL up"=dge_edgerql_up)
upset(fromList(v1), order.by = "freq")
v1 <- list("edgeR dn"=dge_edger_dn,
"DESeq2 dn"=dge_deseq2_dn,
"Voom dn"=dge_voom_dn,
"edgeR_QL dn"=dge_edgerql_dn)
upset(fromList(v1), order.by = "freq")
sessionInfo()
## R version 4.0.2 (2020-06-22)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.5 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] UpSetR_1.4.0 eulerr_6.1.0
## [3] DESeq2_1.28.1 SummarizedExperiment_1.18.2
## [5] DelayedArray_0.14.1 matrixStats_0.57.0
## [7] Biobase_2.48.0 GenomicRanges_1.40.0
## [9] GenomeInfoDb_1.24.2 IRanges_2.22.2
## [11] S4Vectors_0.26.1 BiocGenerics_0.34.0
## [13] edgeR_3.30.3 limma_3.44.3
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.5 locfit_1.5-9.4 lattice_0.20-41
## [4] digest_0.6.25 plyr_1.8.6 R6_2.4.1
## [7] RSQLite_2.2.1 evaluate_0.14 ggplot2_3.3.2
## [10] pillar_1.4.6 zlibbioc_1.34.0 rlang_0.4.7
## [13] annotate_1.66.0 blob_1.2.1 Matrix_1.2-18
## [16] rmarkdown_2.4 labeling_0.3 splines_4.0.2
## [19] BiocParallel_1.22.0 geneplotter_1.66.0 stringr_1.4.0
## [22] polyclip_1.10-0 RCurl_1.98-1.2 bit_4.0.4
## [25] munsell_0.5.0 polylabelr_0.2.0 compiler_4.0.2
## [28] xfun_0.18 pkgconfig_2.0.3 htmltools_0.5.0
## [31] tidyselect_1.1.0 gridExtra_2.3 tibble_3.0.3
## [34] GenomeInfoDbData_1.2.3 XML_3.99-0.5 crayon_1.3.4
## [37] dplyr_1.0.2 bitops_1.0-6 grid_4.0.2
## [40] xtable_1.8-4 gtable_0.3.0 lifecycle_0.2.0
## [43] DBI_1.1.0 magrittr_1.5 scales_1.1.1
## [46] stringi_1.5.3 farver_2.0.3 XVector_0.28.0
## [49] genefilter_1.70.0 ellipsis_0.3.1 vctrs_0.3.4
## [52] generics_0.0.2 RColorBrewer_1.1-2 tools_4.0.2
## [55] bit64_4.0.5 glue_1.4.2 purrr_0.3.4
## [58] survival_3.2-7 yaml_2.2.1 AnnotationDbi_1.50.3
## [61] colorspace_1.4-1 memoise_1.1.0 knitr_1.30