Source: TBA

Intro

How many parallel threads should be used for pathway enrichment analysis?

AMD Ryzen Threadripper 1900X 8-Core Processor (16 parallel threads).

BiocManager::install(c("mitch","fgsea"))
## Bioconductor version 3.11 (BiocManager 1.30.10), R 4.0.0 (2020-04-24)
## Installing package(s) 'mitch', 'fgsea'
## also installing the dependencies 'colorspace', 'formatR', 'hms', 'farver', 'labeling', 'munsell', 'viridisLite', 'bitops', 'generics', 'tidyselect', 'tidyr', 'httpuv', 'xtable', 'sourcetools', 'fastmap', 'lambda.r', 'futile.options', 'gtable', 'progress', 'RColorBrewer', 'reshape', 'scales', 'isoband', 'gtools', 'gdata', 'caTools', 'dplyr', 'countrycode', 'd3r', 'broom', 'shiny', 'corrplot', 'data.tree', 'futile.logger', 'snow', 'plyr', 'reshape2', 'GGally', 'gridExtra', 'ggplot2', 'gplots', 'beeswarm', 'echarts4r', 'data.table', 'BiocParallel', 'fastmatch'
install.packages(c("tictoc","RhpcBLASctl","peakRAM"))
## Installing packages into '/usr/local/lib/R/site-library'
## (as 'lib' is unspecified)
library("mitch")
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library("fgsea")
library("tictoc")
## Warning: package 'tictoc' was built under R version 4.0.3
library("RhpcBLASctl")
## Warning: package 'RhpcBLASctl' was built under R version 4.0.3
library("peakRAM")
blas_set_num_threads(1)

Get gene expression data

download.file("https://ziemann-lab.net/public/fgseatest/de.Rds",
  "de.Rds")

de <- readRDS("de.Rds")
head(de)
##                          baseMean log2FoldChange      lfcSE      stat
## ENSG00000165949 IFI27   1960.1970      -3.384492 0.09388689 -36.04861
## ENSG00000090382 LYZ     7596.0299      -1.650342 0.05611430 -29.41036
## ENSG00000115461 IGFBP5   531.2217      -5.071157 0.17952391 -28.24781
## ENSG00000157601 MX1      827.1511      -2.877795 0.10478234 -27.46450
## ENSG00000111331 OAS3    2127.2010      -2.661214 0.09721242 -27.37525
## ENSG00000070915 SLC12A3  424.5509      -3.374852 0.12986708 -25.98697
##                                pvalue          padj SRR1171523 SRR1171524
## ENSG00000165949 IFI27   1.450013e-284 1.909377e-280   12.05759   12.12946
## ENSG00000090382 LYZ     4.048160e-190 2.665308e-186   13.52939   13.52615
## ENSG00000115461 IGFBP5  1.514307e-175 6.646797e-172   10.60714   10.46316
## ENSG00000157601 MX1     4.663288e-166 1.535154e-162   10.88831   11.08737
## ENSG00000111331 OAS3    5.406541e-165 1.423867e-161   11.92053   12.26289
## ENSG00000070915 SLC12A3 6.951548e-149 1.525633e-145   10.35061   10.33824
##                         SRR1171525 SRR1171526 SRR1171527 SRR1171528
## ENSG00000165949 IFI27     11.82385   9.646471   9.705799   9.623453
## ENSG00000090382 LYZ       13.62313  12.080100  12.012891  12.031277
## ENSG00000115461 IGFBP5    10.69892   8.568916   8.566744   8.693134
## ENSG00000157601 MX1       10.86873   9.322793   9.356473   9.267699
## ENSG00000111331 OAS3      11.91655  10.108651  10.070989  10.012229
## ENSG00000070915 SLC12A3   10.26395   8.844934   8.904787   8.871748

Get pathways

download.file("https://ziemann-lab.net/public/fgseatest/c5.go.v2023.2.Hs.symbols.gmt",
  "c5.go.v2023.2.Hs.symbols.gmt")

pw <- gmt_import("c5.go.v2023.2.Hs.symbols.gmt")

Mitch

gt <- data.frame(rownames(de))
gt$g <- sapply(strsplit(gt[,1]," "),"[[",2)

m <- mitch_import(x=de,DEtype="deseq2",geneTable=gt)
## The input is a single dataframe; one contrast only. Converting
##         it to a list for you.
## Note: Mean no. genes in input = 13168
## Note: no. genes in output = 13164
## Note: estimated proportion of input genes in output = 1
corerange <- 1:16

mres <- lapply(corerange, function(cores) {
  tic()
  mres <- mitch_calc(x=m,genesets=pw,cores=cores)
  toc()
} )
## Note: When prioritising by significance (ie: small
##             p-values), large effect sizes might be missed.
## 44.708 sec elapsed
## Note: When prioritising by significance (ie: small
##             p-values), large effect sizes might be missed.
## 25.097 sec elapsed
## Note: When prioritising by significance (ie: small
##             p-values), large effect sizes might be missed.
## 20.425 sec elapsed
## Note: When prioritising by significance (ie: small
##             p-values), large effect sizes might be missed.
## 17.815 sec elapsed
## Note: When prioritising by significance (ie: small
##             p-values), large effect sizes might be missed.
## 13.9 sec elapsed
## Note: When prioritising by significance (ie: small
##             p-values), large effect sizes might be missed.
## 15.201 sec elapsed
## Note: When prioritising by significance (ie: small
##             p-values), large effect sizes might be missed.
## 13.549 sec elapsed
## Note: When prioritising by significance (ie: small
##             p-values), large effect sizes might be missed.
## 12.972 sec elapsed
## Note: When prioritising by significance (ie: small
##             p-values), large effect sizes might be missed.
## 16.291 sec elapsed
## Note: When prioritising by significance (ie: small
##             p-values), large effect sizes might be missed.
## 15.545 sec elapsed
## Note: When prioritising by significance (ie: small
##             p-values), large effect sizes might be missed.
## 13.042 sec elapsed
## Note: When prioritising by significance (ie: small
##             p-values), large effect sizes might be missed.
## 14.536 sec elapsed
## Note: When prioritising by significance (ie: small
##             p-values), large effect sizes might be missed.
## 13.099 sec elapsed
## Note: When prioritising by significance (ie: small
##             p-values), large effect sizes might be missed.
## 14.716 sec elapsed
## Note: When prioritising by significance (ie: small
##             p-values), large effect sizes might be missed.
## 11.544 sec elapsed
## Note: When prioritising by significance (ie: small
##             p-values), large effect sizes might be missed.
## 14.431 sec elapsed
peakRAM(mxres <- mitch_calc(x=m,genesets=pw,cores=1))
## Note: When prioritising by significance (ie: small
##             p-values), large effect sizes might be missed.
##                                Function_Call Elapsed_Time_sec
## 1 mxres<-mitch_calc(x=m,genesets=pw,cores=1)            41.16
##   Total_RAM_Used_MiB Peak_RAM_Used_MiB
## 1                  1             108.5
mres <- do.call(rbind,lapply(mres,unlist))
mres <- as.numeric(mres[,2]) - as.numeric(mres[,1])
names(mres) <- corerange

mres
##      1      2      3      4      5      6      7      8      9     10     11 
## 44.708 25.097 20.425 17.815 13.900 15.201 13.549 12.972 16.291 15.545 13.042 
##     12     13     14     15     16 
## 14.536 13.099 14.716 11.544 14.431
barplot(mres,ylab="elapsed time in s",xlab="parallel threads", main="mitch")

FGSEA

f <- as.vector(m[,1])
names(f) <- rownames(m)

corerange <- 1:16

fres <- lapply(corerange, function(cores) {
  tic()
  fgseaRes <- fgsea(pathways = pw,
                  stats    = f,
                  minSize  = 10,
                  nproc=cores)
  toc()
} )
## Warning in fgseaMultilevel(...): There were 1 pathways for which P-values were
## not calculated properly due to unbalanced (positive and negative) gene-level
## statistic values.
## Warning in fgseaMultilevel(...): For some pathways, in reality P-values are less
## than 1e-10. You can set the `eps` argument to zero for better estimation.
## 654.316 sec elapsed
## Warning in fgseaMultilevel(...): For some pathways, in reality P-values are less
## than 1e-10. You can set the `eps` argument to zero for better estimation.
## 347.007 sec elapsed
## Warning in fgseaMultilevel(...): There were 3 pathways for which P-values were
## not calculated properly due to unbalanced (positive and negative) gene-level
## statistic values.

## Warning in fgseaMultilevel(...): For some pathways, in reality P-values are less
## than 1e-10. You can set the `eps` argument to zero for better estimation.
## 247.004 sec elapsed
## Warning in fgseaMultilevel(...): For some pathways, in reality P-values are less
## than 1e-10. You can set the `eps` argument to zero for better estimation.
## 215.428 sec elapsed
## Warning in fgseaMultilevel(...): There were 6 pathways for which P-values were
## not calculated properly due to unbalanced (positive and negative) gene-level
## statistic values.

## Warning in fgseaMultilevel(...): For some pathways, in reality P-values are less
## than 1e-10. You can set the `eps` argument to zero for better estimation.
## 118.221 sec elapsed
## Warning in fgseaMultilevel(...): For some pathways, in reality P-values are less
## than 1e-10. You can set the `eps` argument to zero for better estimation.
## 147.009 sec elapsed
## Warning in fgseaMultilevel(...): For some pathways, in reality P-values are less
## than 1e-10. You can set the `eps` argument to zero for better estimation.
## 168.877 sec elapsed
## Warning in fgseaMultilevel(...): There were 3 pathways for which P-values were
## not calculated properly due to unbalanced (positive and negative) gene-level
## statistic values.

## Warning in fgseaMultilevel(...): For some pathways, in reality P-values are less
## than 1e-10. You can set the `eps` argument to zero for better estimation.
## 112.069 sec elapsed
## Warning in fgseaMultilevel(...): There were 1 pathways for which P-values were
## not calculated properly due to unbalanced (positive and negative) gene-level
## statistic values.

## Warning in fgseaMultilevel(...): For some pathways, in reality P-values are less
## than 1e-10. You can set the `eps` argument to zero for better estimation.
## 121.509 sec elapsed
## Warning in fgseaMultilevel(...): There were 1 pathways for which P-values were
## not calculated properly due to unbalanced (positive and negative) gene-level
## statistic values.

## Warning in fgseaMultilevel(...): For some pathways, in reality P-values are less
## than 1e-10. You can set the `eps` argument to zero for better estimation.
## 133.855 sec elapsed
## Warning in fgseaMultilevel(...): For some pathways, in reality P-values are less
## than 1e-10. You can set the `eps` argument to zero for better estimation.
## 113.679 sec elapsed
## Warning in fgseaMultilevel(...): There were 2 pathways for which P-values were
## not calculated properly due to unbalanced (positive and negative) gene-level
## statistic values.
## Warning in fgseaMultilevel(...): For some of the pathways the P-values were
## likely overestimated. For such pathways log2err is set to NA.
## Warning in fgseaMultilevel(...): For some pathways, in reality P-values are less
## than 1e-10. You can set the `eps` argument to zero for better estimation.
## 112.87 sec elapsed
## Warning in fgseaMultilevel(...): There were 6 pathways for which P-values were
## not calculated properly due to unbalanced (positive and negative) gene-level
## statistic values.

## Warning in fgseaMultilevel(...): For some pathways, in reality P-values are less
## than 1e-10. You can set the `eps` argument to zero for better estimation.
## 81.215 sec elapsed
## Warning in fgseaMultilevel(...): For some pathways, in reality P-values are less
## than 1e-10. You can set the `eps` argument to zero for better estimation.
## 115.173 sec elapsed
## Warning in fgseaMultilevel(...): There were 1 pathways for which P-values were
## not calculated properly due to unbalanced (positive and negative) gene-level
## statistic values.

## Warning in fgseaMultilevel(...): For some pathways, in reality P-values are less
## than 1e-10. You can set the `eps` argument to zero for better estimation.
## 94.948 sec elapsed
## Warning in fgseaMultilevel(...): There were 4 pathways for which P-values were
## not calculated properly due to unbalanced (positive and negative) gene-level
## statistic values.

## Warning in fgseaMultilevel(...): For some pathways, in reality P-values are less
## than 1e-10. You can set the `eps` argument to zero for better estimation.
## 64.685 sec elapsed
blas_set_num_threads(1)
peakRAM(fgseaRes <- fgsea(pathways = pw,
                  stats    = f,
                  minSize  = 10,
                  nproc=1))
## Warning in fgseaMultilevel(...): There were 6 pathways for which P-values were
## not calculated properly due to unbalanced (positive and negative) gene-level
## statistic values.

## Warning in fgseaMultilevel(...): For some pathways, in reality P-values are less
## than 1e-10. You can set the `eps` argument to zero for better estimation.
##                                             Function_Call Elapsed_Time_sec
## 1 fgseaRes<-fgsea(pathways=pw,stats=f,minSize=10,nproc=1)          507.955
##   Total_RAM_Used_MiB Peak_RAM_Used_MiB
## 1               24.5              89.1
blas_set_num_threads(8)
peakRAM(fgseaRes <- fgsea(pathways = pw,
                  stats    = f,
                  minSize  = 10,
                  nproc=1))
## Warning in fgseaMultilevel(...): For some pathways, in reality P-values are less
## than 1e-10. You can set the `eps` argument to zero for better estimation.
##                                             Function_Call Elapsed_Time_sec
## 1 fgseaRes<-fgsea(pathways=pw,stats=f,minSize=10,nproc=1)          681.537
##   Total_RAM_Used_MiB Peak_RAM_Used_MiB
## 1                8.5             104.5
blas_set_num_threads(1)
peakRAM(fgseaRes <- fgsea(pathways = pw,
                  stats    = f,
                  minSize  = 10,
                  nproc=8))
## Warning in fgseaMultilevel(...): There were 10 pathways for which P-values were
## not calculated properly due to unbalanced (positive and negative) gene-level
## statistic values.
## Warning in fgseaMultilevel(...): For some of the pathways the P-values were
## likely overestimated. For such pathways log2err is set to NA.
## Warning in fgseaMultilevel(...): For some pathways, in reality P-values are less
## than 1e-10. You can set the `eps` argument to zero for better estimation.
##                                             Function_Call Elapsed_Time_sec
## 1 fgseaRes<-fgsea(pathways=pw,stats=f,minSize=10,nproc=8)           76.662
##   Total_RAM_Used_MiB Peak_RAM_Used_MiB
## 1                8.3             104.5
blas_set_num_threads(4)
peakRAM(fgseaRes <- fgsea(pathways = pw,
                  stats    = f,
                  minSize  = 10,
                  nproc=4))
## Warning in fgseaMultilevel(...): For some pathways, in reality P-values are less
## than 1e-10. You can set the `eps` argument to zero for better estimation.
##                                             Function_Call Elapsed_Time_sec
## 1 fgseaRes<-fgsea(pathways=pw,stats=f,minSize=10,nproc=4)          222.283
##   Total_RAM_Used_MiB Peak_RAM_Used_MiB
## 1                8.4             104.5
fres <- do.call(rbind,lapply(fres,unlist))
fres <- as.numeric(fres[,2]) - as.numeric(fres[,1])
names(fres) <- corerange

fres
##       1       2       3       4       5       6       7       8       9      10 
## 654.316 347.007 247.004 215.428 118.221 147.009 168.877 112.069 121.509 133.855 
##      11      12      13      14      15      16 
## 113.679 112.870  81.215 115.173  94.948  64.685
barplot(fres,ylab="elapsed time in s",xlab="parallel threads", main="fgsea")

Session information

sessionInfo()
## R version 4.0.0 (2020-04-24)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.5 LTS
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.2.20.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=C             
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] peakRAM_1.0.2        RhpcBLASctl_0.20-137 tictoc_1.0          
## [4] fgsea_1.14.0         mitch_1.0.10         BiocManager_1.30.10 
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.4.6        lattice_0.20-41     gtools_3.8.2       
##  [4] digest_0.6.25       mime_0.9            R6_2.4.1           
##  [7] plyr_1.8.6          evaluate_0.14       ggplot2_3.3.1      
## [10] pillar_1.4.4        gplots_3.0.3        rlang_0.4.6        
## [13] data.table_1.12.8   gdata_2.18.0        Matrix_1.2-18      
## [16] rmarkdown_2.2       BiocParallel_1.22.0 stringr_1.4.0      
## [19] htmlwidgets_1.5.1   munsell_0.5.0       shiny_1.4.0.2      
## [22] compiler_4.0.0      httpuv_1.5.4        xfun_0.14          
## [25] pkgconfig_2.0.3     htmltools_0.4.0     tidyselect_1.1.0   
## [28] tibble_3.0.1        gridExtra_2.3       reshape_0.8.8      
## [31] crayon_1.3.4        dplyr_1.0.0         later_1.1.0.1      
## [34] MASS_7.3-51.6       bitops_1.0-6        grid_4.0.0         
## [37] xtable_1.8-4        GGally_2.0.0        gtable_0.3.0       
## [40] lifecycle_0.2.0     magrittr_1.5        scales_1.1.1       
## [43] KernSmooth_2.23-17  stringi_1.4.6       reshape2_1.4.4     
## [46] promises_1.1.0      ellipsis_0.3.1      generics_0.0.2     
## [49] vctrs_0.3.1         fastmatch_1.1-0     RColorBrewer_1.1-2 
## [52] tools_4.0.0         glue_1.4.1          beeswarm_0.2.3     
## [55] purrr_0.3.4         parallel_4.0.0      fastmap_1.0.1      
## [58] yaml_2.2.1          colorspace_1.4-1    caTools_1.18.0     
## [61] knitr_1.28          echarts4r_0.2.3