Source codes: https://github.com/markziemann/mitch-gabe

Background

Here we are investigating report of higher than expected rate of false positives (Type 1 errors).

library("readxl")
library("mitch")

Load data

Let’s have a look at the data.

# genesets 
genesets <- gmt_import("cluster_pathway_collection_20201117.txt")
length(genesets)
## [1] 50
hist(unlist(lapply(genesets,length)))

summary(unlist(lapply(genesets,length)))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     9.0    22.0    33.0    34.2    44.5    74.0
x <- readxl::read_xlsx("test_mitch_input_20201214.xlsx")
x <- as.data.frame(x)
hist(x$CNV)

hist(x$Prot)

hist(x$RNAseq)

rownames(x) <- x$Symbol
x$Symbol=NULL
head(res$enrichment_result,30)
##          set setSize      pMANOVA      s.CNV       s.Prot     s.RNAseq
## 5  cluster05      11 3.442435e-10 -0.8045909 -0.671785542 -0.872010273
## 19 cluster19      14 6.922460e-13 -0.8371799 -0.814100404 -0.677644879
## 4  cluster04      52 5.549734e-44 -0.8040039 -0.721977359 -0.702584207
## 16 cluster16      23 4.239314e-18 -0.7174300 -0.734027474 -0.720368032
## 37 cluster37      40 5.973118e-21 -0.8473653 -0.012664671  0.100928144
## 33 cluster33      34 7.187830e-16  0.8446581  0.052541064  0.103362347
## 24 cluster24      74 2.212806e-26 -0.7346197  0.007714928 -0.246415119
## 2  cluster02      22 1.703489e-07  0.7215640  0.043838863  0.143688065
## 17 cluster17      57 1.260102e-20  0.7295507 -0.009350357  0.003640377
## 32 cluster32      53 2.468867e-16 -0.6634860  0.099327040 -0.227064142
## 44 cluster44      46 2.318433e-13  0.6720840  0.117422659  0.135216346
## 35 cluster35      39 8.254056e-12 -0.6744157 -0.115683837 -0.033927174
## 34 cluster34      48 1.208899e-16 -0.6447804  0.048335339  0.190734055
## 47 cluster47      21 5.575601e-05  0.5954777  0.035383011  0.164904565
## 3  cluster03      31 9.892387e-08  0.6103287 -0.024880401  0.072873638
## 26 cluster26      18 3.465889e-04 -0.4781324  0.221237195 -0.269175204
## 21 cluster21      65 1.779561e-12  0.5415852  0.058480243  0.216048632
## 45 cluster45      33 3.401079e-07 -0.5705896  0.028568331 -0.079741963
## 40 cluster40      16 2.535460e-03 -0.4887102  0.080209563 -0.293167060
## 7  cluster07      45 2.733820e-09  0.5565966 -0.034928262  0.067494161
## 38 cluster38      31 1.057369e-05  0.5212972 -0.033257123  0.116716940
## 36 cluster36      42 2.741456e-08  0.4988295  0.104687678 -0.114622588
## 20 cluster20      24 3.498642e-05 -0.4290728  0.221727956  0.184509688
## 12 cluster12      59 9.788472e-09  0.4822655  0.061770473  0.120799926
## 18 cluster18      53 7.696250e-07  0.4213457  0.200965600  0.121713485
## 49 cluster49      14 6.985915e-03  0.3600067 -0.163493935 -0.274258760
## 29 cluster29      39 3.682846e-06 -0.3872240 -0.056913563  0.209378692
## 28 cluster28      17 2.660073e-02  0.4030089 -0.082172266  0.129356172
## 8  cluster08      37 1.899859e-04  0.4203648  0.009999838  0.035265989
## 6  cluster06      34 2.635428e-04 -0.3965675  0.004983855  0.086199635
##           p.CNV       p.Prot     p.RNAseq    s.dist          SD p.adjustMANOVA
## 5  3.871760e-06 1.170496e-04 5.503789e-07 1.3634751 0.101876218   1.297533e-09
## 19 5.873662e-08 1.350399e-07 1.172033e-05 1.3501231 0.086220844   3.392006e-12
## 4  1.163193e-23 2.763547e-19 2.523072e-18 1.2889136 0.053836792   2.719370e-42
## 16 2.755763e-09 1.168168e-09 2.370727e-09 1.2539666 0.008857106   4.154528e-17
## 37 1.657128e-20 8.910289e-01 2.748346e-01 0.8534488 0.517830219   9.756093e-20
## 33 1.472668e-17 5.996013e-01 3.016581e-01 0.8525795 0.443386917   4.402546e-15
## 24 1.367741e-27 9.105434e-01 3.239546e-04 0.7748845 0.377267921   5.421374e-25
## 2  4.942257e-09 7.236402e-01 2.463852e-01 0.7370364 0.365882899   4.637276e-07
## 17 2.161130e-21 9.043809e-01 9.626967e-01 0.7296197 0.422904528   1.543624e-19
## 32 9.334588e-17 2.178458e-01 4.807326e-03 0.7082638 0.382726810   1.728207e-15
## 44 4.062979e-15 1.737990e-01 1.172693e-01 0.6955347 0.315222874   1.262258e-12
## 35 3.822737e-13 2.163349e-01 7.169700e-01 0.6851061 0.348590284   3.370406e-11
## 34 1.488795e-14 5.677050e-01 2.404009e-02 0.6741347 0.446984560   9.872677e-16
## 47 2.487357e-06 7.803114e-01 1.935330e-01 0.6189016 0.293222100   1.138352e-04
## 3  4.706221e-09 8.122059e-01 4.864859e-01 0.6151673 0.342029352   2.851335e-07
## 26 4.675193e-04 1.059621e-01 4.915286e-02 0.5916179 0.358999823   6.289947e-04
## 21 7.840693e-14 4.234498e-01 3.069484e-03 0.5860132 0.246371092   7.927137e-12
## 45 1.657906e-08 7.784860e-01 4.323212e-01 0.5768426 0.319283778   8.771204e-07
## 40 7.431212e-04 5.804534e-01 4.324592e-02 0.5755155 0.289055026   4.437055e-03
## 7  1.406301e-10 6.890307e-01 4.393295e-01 0.5617608 0.316125820   9.568369e-09
## 38 5.853323e-07 7.508096e-01 2.649723e-01 0.5352380 0.286853034   2.355048e-05
## 36 2.839321e-08 2.460589e-01 2.040464e-01 0.5224258 0.310850513   8.395709e-08
## 20 2.952262e-04 6.179521e-02 1.201835e-01 0.5170208 0.365470058   7.453628e-05
## 12 2.343780e-10 4.196737e-01 1.144299e-01 0.5009873 0.227653942   3.197567e-08
## 18 1.543905e-07 1.259754e-02 1.309983e-01 0.4824246 0.155256638   1.885581e-06
## 49 2.014276e-02 2.916011e-01 7.678945e-02 0.4811995 0.338775866   1.104225e-02
## 29 3.356409e-05 5.430935e-01 2.516849e-02 0.4438705 0.298873245   8.593308e-06
## 28 4.169236e-03 5.595532e-01 3.582913e-01 0.4311630 0.243252580   3.724102e-02
## 8  1.134562e-05 9.170640e-01 7.134347e-01 0.4219600 0.229977838   3.723724e-04
## 6  7.145929e-05 9.602856e-01 3.890515e-01 0.4058583 0.258490337   4.966769e-04

It looks as if the values for input data are between 0 and 1, which is indicates that the p-values were used for ranking. This is not the recommended way of using mitch. There should be positive and negative values in the prot and RNAseq data at least, to indicate that some genes are up and down regulated. As it stands, mitch does not do a 1-tailed test. The data also looks to have a skew towards small values for CNV and RNAseq (not sure about protein).

nrow(subset(res$enrichment_result, p.adjustMANOVA<0.05) )
## [1] 37
hist(subset(res$enrichment_result, p.adjustMANOVA<0.05)$s.CNV)

hist(subset(res$enrichment_result, p.adjustMANOVA<0.05)$s.Prot)

hist(subset(res$enrichment_result, p.adjustMANOVA<0.05)$s.RNA)

So there are 37 clusters with FDR<0.05. There is a mixture of “up” and “down” regulated genes.

Session info

sessionInfo()
## R version 4.0.2 (2020-06-22)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.1 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/liblapack.so.3
## 
## locale:
##  [1] LC_CTYPE=en_AU.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_AU.UTF-8        LC_COLLATE=en_AU.UTF-8    
##  [5] LC_MONETARY=en_AU.UTF-8    LC_MESSAGES=en_AU.UTF-8   
##  [7] LC_PAPER=en_AU.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] mitch_1.2.2  readxl_1.3.1
## 
## loaded via a namespace (and not attached):
##  [1] gtools_3.8.2        beeswarm_0.2.3      tidyselect_1.1.0   
##  [4] xfun_0.19           purrr_0.3.4         reshape2_1.4.4     
##  [7] colorspace_2.0-0    vctrs_0.3.5         generics_0.1.0     
## [10] htmltools_0.5.0     yaml_2.2.1          utf8_1.1.4         
## [13] rlang_0.4.9         later_1.1.0.1       pillar_1.4.7       
## [16] glue_1.4.2          echarts4r_0.3.3     RColorBrewer_1.1-2 
## [19] lifecycle_0.2.0     plyr_1.8.6          stringr_1.4.0      
## [22] munsell_0.5.0       gtable_0.3.0        cellranger_1.1.0   
## [25] htmlwidgets_1.5.3   caTools_1.18.0      evaluate_0.14      
## [28] knitr_1.30          GGally_2.0.0        fastmap_1.0.1      
## [31] httpuv_1.5.4        parallel_4.0.2      fansi_0.4.1        
## [34] Rcpp_1.0.5          KernSmooth_2.23-18  xtable_1.8-4       
## [37] promises_1.1.1      scales_1.1.1        BiocManager_1.30.10
## [40] mime_0.9            gplots_3.1.1        gridExtra_2.3      
## [43] ggplot2_3.3.2       digest_0.6.27       stringi_1.5.3      
## [46] dplyr_1.0.2         shiny_1.5.0         grid_4.0.2         
## [49] cli_2.2.0           tools_4.0.2         bitops_1.0-6       
## [52] magrittr_2.0.1      tibble_3.0.4        crayon_1.3.4       
## [55] pkgconfig_2.0.3     ellipsis_0.3.1      MASS_7.3-53        
## [58] assertthat_0.2.1    rmarkdown_2.6       reshape_0.8.8      
## [61] R6_2.5.0            compiler_4.0.2