Intro

Here the purpose is to demonstrate how the choice of DE tool can impact the results of a study.

Source code: Source code: https://github.com/markziemann/de_compare

View result: http://118.138.234.73/public/blog/deg_compare.html

Libraries

library("edgeR")
library("DESeq2")
library("eulerr")
library("UpSetR")

Input data

This is real data from an ATAC-seq experiment. I have scrubbed the sample and gene names.

x <- read.table("counts.mx",row.names=1,header=TRUE)
xx <- x[which(rowMeans(x)>10),]
str(xx)
## 'data.frame':    35978 obs. of  6 variables:
##  $ C1: int  133 126 183 74 207 369 29 51 69 82 ...
##  $ C2: int  96 88 111 70 163 239 11 15 44 76 ...
##  $ C3: int  278 239 386 172 356 730 38 50 143 197 ...
##  $ T1: int  288 264 343 177 353 708 20 45 155 199 ...
##  $ T2: int  288 279 394 190 374 683 28 48 152 193 ...
##  $ T3: int  244 214 279 137 312 542 35 53 137 165 ...
dim(xx)
## [1] 35978     6
ss <- as.data.frame(colnames(xx))
ss$trt <- as.integer(grepl("T",ss[,1]))
rownames(ss) <- ss[,1]
ss[,1]=NULL
ss
##    trt
## C1   0
## C2   0
## C3   0
## T1   1
## T2   1
## T3   1

MDS analysis

This will give us an idea about which samples are similar/different.

plotMDS(xx, labels=rownames(ss),col=as.integer(as.factor(ss$trt)),main="MDS")

EdgeR LRT

This was the method that was used in the group when I first got involved in this project.

design <- model.matrix(~ss$trt)
rownames(design) <- rownames(ss)
z <- DGEList(counts=xx)
z <- calcNormFactors(z)
z <- estimateDisp(z, design,robust=TRUE,prior.df=1)
fit <- glmFit(z, design)
lrt<-glmLRT(fit)
dge<-as.data.frame(topTags(lrt,n=Inf))
dge$dispersion<-lrt$dispersion
dge<-merge(dge,lrt$fitted.values,by='row.names')
rownames(dge)=dge$Row.names
dge$Row.names=NULL
dge<-dge[order(dge$PValue),]
head(dge,10)
##                logFC   logCPM        LR       PValue          FDR   dispersion
## Gene10678 -0.9990238 5.403241 124.26043 7.388163e-29 2.658113e-24 9.765625e-05
## Gene34968 -0.8936960 5.630466  75.95619 2.900284e-18 5.217322e-14 9.765625e-05
## Gene21042 -0.9610760 4.410686  72.75256 1.469672e-17 1.762529e-13 5.355870e-03
## Gene25117 -0.7547172 5.256924  60.82050 6.252332e-15 5.623660e-11 6.442848e-03
## Gene26750 -0.9556798 4.128865  59.10887 1.491849e-14 1.073475e-10 4.890034e-03
## Gene17006 -0.7934496 5.548893  55.86682 7.755048e-14 4.650185e-10 2.456894e-03
## Gene15204 -1.0739619 7.077392  51.95923 5.666496e-13 2.912417e-09 5.161940e-02
## Gene3841  -0.9587878 5.175396  48.75033 2.907097e-12 1.307394e-08 6.131114e-02
## Gene40878 -0.6056546 5.461936  48.33269 3.597060e-12 1.437945e-08 9.765625e-05
## Gene17793 -0.6482170 4.909470  47.06795 6.856794e-12 2.466937e-08 3.948173e-03
##                 C1        C2        C3        T1        T2        T3
## Gene10678 210.6868 135.95119  392.1378 200.62781 207.78752 174.56741
## Gene34968 238.6822 154.01593  444.2438 244.52442 253.25064 212.76210
## Gene21042 104.9374  67.71357  195.3131 102.55419 106.21399  89.23299
## Gene25117 176.4899 113.88470  328.4892 199.08994 206.19477 173.22930
## Gene26750  86.6584  55.91860  161.2917  84.99487  88.02804  73.95453
## Gene17006 220.2282 142.10803  409.8966 241.85830 250.48938 210.44230
## Gene15204 674.5608 435.27805 1255.5166 609.99820 631.76691 530.76294
## Gene3841  177.4402 114.49795  330.2581 173.73967 179.93983 151.17189
## Gene40878 194.6619 125.61071  362.3117 243.51498 252.20518 211.88378
## Gene17793 135.7312  87.58408  252.6276 164.83803 170.72052 143.42652
dge_edger <- dge
sig <- subset(dge_edger,FDR<0.05)
dge_edger_up <- rownames(subset(sig,logFC>0))
dge_edger_dn <- rownames(subset(sig,logFC<0))
length(dge_edger_up)
## [1] 297
length(dge_edger_dn)
## [1] 589

EdgeR QL

This is a "new" edgeR method which is more conservative.

design <- model.matrix(~ss$trt)
rownames(design) <- rownames(ss)
z <- DGEList(counts=xx)
z <- calcNormFactors(z)
z <- estimateDisp(z, design,robust=TRUE,prior.df=1)
fit <- glmQLFit(z, design)
lrt <- glmQLFTest(fit)
dge<-as.data.frame(topTags(lrt,n=Inf))
dge$dispersion<-lrt$dispersion
dge<-merge(dge,lrt$fitted.values,by='row.names')
rownames(dge)=dge$Row.names
dge$Row.names=NULL
dge<-dge[order(dge$PValue),]
head(dge,10)
##                logFC   logCPM        F       PValue          FDR  dispersion
## Gene3219  -1.3438017 6.368508 94.55504 2.016344e-12 7.254402e-08 0.008468683
## Gene6678  -1.3997961 6.419712 85.59495 8.706031e-12 1.566128e-07 0.008569245
## Gene32136 -1.3710207 7.071626 72.89145 8.359661e-11 8.109481e-07 0.008310851
## Gene38166 -1.1578667 6.035898 71.88163 1.011420e-10 8.109481e-07 0.009763449
## Gene38199 -1.2171883 6.390173 71.23243 1.144245e-10 8.109481e-07 0.008445606
## Gene9230  -1.2636503 6.581057 70.35900 1.352407e-10 8.109481e-07 0.009916487
## Gene17647 -1.1423462 6.120403 65.24816 3.695318e-10 1.899288e-06 0.037033149
## Gene29009 -1.2241973 7.417536 62.45668 6.532462e-10 2.812151e-06 0.022572552
## Gene10678 -0.9882287 5.403241 61.83763 7.427704e-10 2.812151e-06 0.010443288
## Gene1838  -1.1494299 5.182201 61.59287 7.816306e-10 2.812151e-06 0.009346355
##                 C1       C2        C3       T1       T2       T3
## Gene3219  440.0001 283.9216  818.9438 329.9613 341.7365 287.1013
## Gene6678  460.6099 297.2206  857.3035 332.2650 344.1224 289.1057
## Gene32136 716.6225 462.4195 1333.8033 527.4162 546.2379 458.9079
## Gene38166 336.1733 216.9246  625.6977 286.7744 297.0084 249.5241
## Gene38199 434.5877 280.4291  808.8700 355.8104 368.5080 309.5927
## Gene9230  500.3624 322.8720  931.2923 396.6872 410.8436 345.1598
## Gene17647 354.9538 229.0432  660.6526 306.0764 316.9992 266.3188
## Gene29009 881.2980 568.6806 1640.3032 718.1402 743.7681 624.8579
## Gene10678 209.1027 134.9290  389.1893 200.6154 207.7747 174.5566
## Gene1838  186.8167 120.5483  347.7098 160.2616 165.9808 139.4446
dge_edgerql <- dge
sig <- subset(dge_edgerql,FDR<0.05)
dge_edgerql_up <- rownames(subset(sig,logFC>0))
dge_edgerql_dn <- rownames(subset(sig,logFC<0))
length(dge_edgerql_up)
## [1] 15
length(dge_edgerql_dn)
## [1] 204

DESeq2

This is the method that I prefer using these days.

dds <- DESeqDataSetFromMatrix(countData = xx , colData = ss, design = ~ trt )
##   the design formula contains one or more numeric variables with integer values,
##   specifying a model with increasing fold change for higher values.
##   did you mean for this to be a factor? if so, first convert
##   this variable to a factor using the factor() function
res <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
z<- results(res)
vsd <- vst(dds, blind=FALSE)
zz<-cbind(as.data.frame(z),assay(vsd))
dge<-as.data.frame(zz[order(zz$pvalue),])
head(dge,10)
##            baseMean log2FoldChange     lfcSE      stat       pvalue
## Gene3219   427.3872      -1.346014 0.1595261 -8.437579 3.239786e-17
## Gene15204  693.6957      -1.077427 0.1350267 -7.979363 1.470899e-15
## Gene40770 1991.4646      -1.242463 0.1579345 -7.866947 3.634010e-15
## Gene17874  909.5136      -1.190549 0.1525483 -7.804408 5.978164e-15
## Gene3100  1052.7539      -1.148181 0.1477635 -7.770398 7.823988e-15
## Gene29009  877.0216      -1.227448 0.1603871 -7.653029 1.962989e-14
## Gene31212 1682.6569      -1.154134 0.1538944 -7.499519 6.405235e-14
## Gene38166  342.7155      -1.167876 0.1646295 -7.093966 1.303219e-12
## Gene38199  434.3977      -1.222031 0.1722766 -7.093425 1.308326e-12
## Gene3365   534.8301      -1.066922 0.1537001 -6.941585 3.877255e-12
##                   padj        C1        C2        C3       T1        T2
## Gene3219  5.328476e-13  9.521740  9.578793  9.574391 8.259974  8.615183
## Gene15204 1.209594e-11 10.068616 10.138880 10.038921 9.016384  9.126365
## Gene40770 1.992285e-11 11.671917 11.541197 11.341701 9.953281 10.456535
## Gene17874 2.458072e-11 10.530428 10.451598 10.396047 9.083131  9.510534
## Gene3100  2.573623e-11 10.740486 10.683742 10.483736 9.341947  9.634867
## Gene29009 5.380879e-11 10.558932 10.359512 10.340573 8.999707  9.451396
## Gene31212 1.504956e-10 11.427866 11.340733 11.006168 9.901164 10.205028
## Gene38166 2.390894e-09  9.205176  9.467670  9.089895 8.288780  8.337280
## Gene38199 2.390894e-09  9.521740  9.639146  9.481741 8.267237  8.733850
## Gene3365  6.376921e-09  9.684369  9.820119  9.765847 8.617673  8.973379
##                  T3
## Gene3219   8.733535
## Gene15204  9.403446
## Gene40770 10.629467
## Gene17874  9.663835
## Gene3100   9.879878
## Gene29009  9.611707
## Gene31212 10.473437
## Gene38166  8.607571
## Gene38199  8.810524
## Gene3365   9.106249
dge_deseq2 <- dge
sig <- subset(dge,padj<0.05)
dge_deseq2_up <- rownames(subset(sig,log2FoldChange>0))
dge_deseq2_dn <- rownames(subset(sig,log2FoldChange<0))
length(dge_deseq2_up)
## [1] 3
length(dge_deseq2_dn)
## [1] 113

Voom/Limma

This was suggested to me.

design <- model.matrix(~ss$trt)
v <- voom(xx, design, plot=FALSE)
vfit <- lmFit(v, design)
efit <- eBayes(vfit)
dge <- topTable(efit,n=Inf)
## Removing intercept from test coefficients
head(dge,10)
##               logFC  AveExpr         t      P.Value    adj.P.Val        B
## Gene7673  -1.407783 7.087248 -9.476003 1.211355e-16 4.358211e-12 26.81354
## Gene32136 -1.284966 6.903733 -9.230921 4.942507e-16 8.891075e-12 25.48687
## Gene3219  -1.228192 6.229800 -8.893929 3.374026e-15 4.046356e-11 23.62204
## Gene22288 -1.232098 8.771117 -8.605111 1.726611e-14 1.553001e-10 22.10257
## Gene36720 -1.169483 7.222326 -8.528768 2.652132e-14 1.908368e-10 21.75480
## Gene3636  -1.179525 8.526491 -8.427486 4.679300e-14 2.805864e-10 21.18847
## Gene9230  -1.170781 6.444837 -8.371422 6.402004e-14 3.290447e-10 20.88786
## Gene6678  -1.197501 6.238507 -8.305173 9.264912e-14 4.166662e-10 20.52090
## Gene29009 -1.108660 7.286103 -8.249198 1.265251e-13 5.057912e-10 20.28557
## Gene40770 -1.122954 8.464626 -8.214370 1.535483e-13 5.524361e-10 20.07710
dge_voom <- dge
sig <- subset(dge_voom,adj.P.Val<0.05)
dge_voom_up <- rownames(subset(sig,logFC>0))
dge_voom_dn <- rownames(subset(sig,logFC<0))
length(dge_voom_up)
## [1] 100
length(dge_voom_dn)
## [1] 128

Venn diagram

With the "eulerr" project.

v1 <- list("edgeR up"=dge_edger_up, "edgeR dn"=dge_edger_dn,
  "DESeq2 up"=dge_deseq2_up,"DESeq2 dn"=dge_deseq2_dn)
plot(euler(v1),quantities = TRUE)

v1 <- list("edgeR up"=dge_edger_up, "edgeR dn"=dge_edger_dn,
  "DESeq2 up"=dge_deseq2_up,"DESeq2 dn"=dge_deseq2_dn,
  "edgeR_QL up"=dge_edgerql_up, "edgeR_QL dn"=dge_edgerql_dn)
plot(euler(v1),quantities = TRUE)

v1 <- list("edgeR up"=dge_edger_up, "edgeR dn"=dge_edger_dn,
  "DESeq2 up"=dge_deseq2_up,"DESeq2 dn"=dge_deseq2_dn,
  "Voom up"=dge_voom_up, "Voom dn"=dge_voom_dn)
plot(euler(v1),quantities = TRUE)

Upset plot

This should demonstrate the overlaps between different sets better.

v1 <- list("edgeR up"=dge_edger_up, 
  "DESeq2 up"=dge_deseq2_up,
  "Voom up"=dge_voom_up, 
  "edgeR_QL up"=dge_edgerql_up)

upset(fromList(v1), order.by = "freq")

v1 <- list("edgeR dn"=dge_edger_dn,
  "DESeq2 dn"=dge_deseq2_dn,
  "Voom dn"=dge_voom_dn,
  "edgeR_QL dn"=dge_edgerql_dn)

upset(fromList(v1), order.by = "freq")

Session information

sessionInfo()
## R version 4.0.2 (2020-06-22)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.5 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] UpSetR_1.4.0                eulerr_6.1.0               
##  [3] DESeq2_1.28.1               SummarizedExperiment_1.18.2
##  [5] DelayedArray_0.14.1         matrixStats_0.57.0         
##  [7] Biobase_2.48.0              GenomicRanges_1.40.0       
##  [9] GenomeInfoDb_1.24.2         IRanges_2.22.2             
## [11] S4Vectors_0.26.1            BiocGenerics_0.34.0        
## [13] edgeR_3.30.3                limma_3.44.3               
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.5             locfit_1.5-9.4         lattice_0.20-41       
##  [4] digest_0.6.25          plyr_1.8.6             R6_2.4.1              
##  [7] RSQLite_2.2.1          evaluate_0.14          ggplot2_3.3.2         
## [10] pillar_1.4.6           zlibbioc_1.34.0        rlang_0.4.7           
## [13] annotate_1.66.0        blob_1.2.1             Matrix_1.2-18         
## [16] rmarkdown_2.4          labeling_0.3           splines_4.0.2         
## [19] BiocParallel_1.22.0    geneplotter_1.66.0     stringr_1.4.0         
## [22] polyclip_1.10-0        RCurl_1.98-1.2         bit_4.0.4             
## [25] munsell_0.5.0          polylabelr_0.2.0       compiler_4.0.2        
## [28] xfun_0.18              pkgconfig_2.0.3        htmltools_0.5.0       
## [31] tidyselect_1.1.0       gridExtra_2.3          tibble_3.0.3          
## [34] GenomeInfoDbData_1.2.3 XML_3.99-0.5           crayon_1.3.4          
## [37] dplyr_1.0.2            bitops_1.0-6           grid_4.0.2            
## [40] xtable_1.8-4           gtable_0.3.0           lifecycle_0.2.0       
## [43] DBI_1.1.0              magrittr_1.5           scales_1.1.1          
## [46] stringi_1.5.3          farver_2.0.3           XVector_0.28.0        
## [49] genefilter_1.70.0      ellipsis_0.3.1         vctrs_0.3.4           
## [52] generics_0.0.2         RColorBrewer_1.1-2     tools_4.0.2           
## [55] bit64_4.0.5            glue_1.4.2             purrr_0.3.4           
## [58] survival_3.2-7         yaml_2.2.1             AnnotationDbi_1.50.3  
## [61] colorspace_1.4-1       memoise_1.1.0          knitr_1.30