Source: https://github.com/markziemann/dee2_gene_signatures

Background

The purpose of this analysis is to perform Jaccard analysis with the new gene sets to ensure they are not redundant with those already in MSigDB.

Libraries

suppressPackageStartupMessages({
  library("getDEE2")
  library("mitch")
  library("triwise")
  library("dplyr")
  library("gplots")
  library("reshape2")
  library("network")
})

Import gene sets

Now for MSigDB version 7.2 accessed 27/Sep/2020. There are 31120 sets and 40044 genes.

epi <- gmt_import("epilepsy_genesymbols.gmt")
diab <- gmt_import("diabetes_genesymbols.gmt")
## Warning in gmt_import("diabetes_genesymbols.gmt"): Duplicated gene sets names
## detected
hd <- gmt_import("heartdisease_genesymbols.gmt")
sars <- gmt_import("sarsmers_genesymbols.gmt")
## Warning in gmt_import("sarsmers_genesymbols.gmt"): Duplicated gene sets names
## detected
msigdb <- gmt_import("msigdb.v7.2.symbols.gmt")
length(msigdb)
## [1] 31120

Jaccard overlap

Here is a function that converts list of vectors to a network diagram. There are 2x edges than nodes. Only the edges with highest similarity are retained, as per jaccard. The size of the gene set is proportional to the node size (sqrt).

gsjac <- function(gs1,gs2){
  mclapply(gs1, function(y) {
    l <- lapply(gs2,function(x)  {  length(intersect(x,y )) /  length(union(x,y )) } ) 
    l <- unlist(l)
    lmax <- tail(l[order(l)],1)
    lmax
  },mc.cores=8)    
}


# calculate jaccard with sets already in MSigDB
epi_jac <- gsjac(epi,msigdb)
# any with jaccard above 80% ?
which(epi_jac >0.8)
## named integer(0)
#max
epi_jac[tail(order(unlist(epi_jac)),1)]
## $`SRP061192: non-silencing control in neural progenitor cell lines 690.C5 versus CYFIP1 knockdown: upregulated`
## FISCHER_DREAM_TARGETS 
##             0.2032787
diab_jac <- gsjac(diab,msigdb)
which(diab_jac >0.8)
## named integer(0)
diab_jac[tail(order(unlist(diab_jac)),1)]
## $`SRP045500:Genes expressed differentially in human CD4 with type 1 diabetes: upregulated`
## KEGG_RIBOSOME 
##        0.3125
hd_jac <- gsjac(hd,msigdb)
which(hd_jac >0.8)
## named integer(0)
hd_jac[tail(order(unlist(hd_jac)),1)]
## $`SRP163478:right ventricle of left ventricular heart failure (LV-HF) versus right ventricle of biventricular heart failure (BiV-HF): upregulated`
## HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION 
##                                  0.1343284
sars_jac <- gsjac(sars,msigdb)
which(sars_jac >0.8)
## named integer(0)
sars_jac[tail(order(unlist(sars_jac)),1)]
## $`SRP253951: NHBE mock treatment versus infected with SARS-CoV-2 (Series 1): upregulated`
## BLANCO_MELO_COVID19_BRONCHIAL_EPITHELIAL_CELLS_SARS_COV_2_INFECTION_UP 
##                                                              0.3886926

Session information

sessionInfo()
## R version 4.0.2 (2020-06-22)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.5 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
## 
## locale:
##  [1] LC_CTYPE=en_AU.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_AU.UTF-8        LC_COLLATE=en_AU.UTF-8    
##  [5] LC_MONETARY=en_AU.UTF-8    LC_MESSAGES=en_AU.UTF-8   
##  [7] LC_PAPER=en_AU.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] network_1.16.1              reshape2_1.4.4             
##  [3] dplyr_1.0.2                 triwise_0.99.5             
##  [5] mitch_1.1.12                edgeR_3.30.3               
##  [7] limma_3.44.3                DESeq2_1.28.1              
##  [9] SummarizedExperiment_1.18.2 DelayedArray_0.14.1        
## [11] matrixStats_0.57.0          Biobase_2.48.0             
## [13] GenomicRanges_1.40.0        GenomeInfoDb_1.24.2        
## [15] IRanges_2.22.2              S4Vectors_0.26.1           
## [17] BiocGenerics_0.34.0         getDEE2_0.99.30            
## [19] gplots_3.1.0                ActivePathways_1.0.2       
## [21] devtools_2.3.2              usethis_1.6.3              
## 
## loaded via a namespace (and not attached):
##  [1] colorspace_1.4-1       ellipsis_0.3.1         rprojroot_1.3-2       
##  [4] XVector_0.28.0         fs_1.5.0               remotes_2.2.0         
##  [7] bit64_4.0.5            AnnotationDbi_1.50.3   fansi_0.4.1           
## [10] splines_4.0.2          geneplotter_1.66.0     knitr_1.30            
## [13] pkgload_1.1.0          annotate_1.66.0        shiny_1.5.0           
## [16] compiler_4.0.2         backports_1.1.10       assertthat_0.2.1      
## [19] Matrix_1.2-18          fastmap_1.0.1          cli_2.0.2             
## [22] later_1.1.0.1          htmltools_0.5.0        prettyunits_1.1.1     
## [25] tools_4.0.2            gtable_0.3.0           glue_1.4.2            
## [28] GenomeInfoDbData_1.2.3 Rcpp_1.0.5             vctrs_0.3.4           
## [31] xfun_0.18              stringr_1.4.0          ps_1.4.0              
## [34] testthat_2.3.2         mime_0.9               lifecycle_0.2.0       
## [37] gtools_3.8.2           XML_3.99-0.5           zlibbioc_1.34.0       
## [40] MASS_7.3-53            scales_1.1.1           promises_1.1.1        
## [43] RColorBrewer_1.1-2     yaml_2.2.1             memoise_1.1.0         
## [46] gridExtra_2.3          ggplot2_3.3.2          reshape_0.8.8         
## [49] stringi_1.5.3          RSQLite_2.2.1          genefilter_1.70.0     
## [52] desc_1.2.0             caTools_1.18.0         pkgbuild_1.1.0        
## [55] BiocParallel_1.22.0    echarts4r_0.3.2        rlang_0.4.8           
## [58] pkgconfig_2.0.3        bitops_1.0-6           evaluate_0.14         
## [61] lattice_0.20-41        purrr_0.3.4            htmlwidgets_1.5.2     
## [64] bit_4.0.4              processx_3.4.4         tidyselect_1.1.0      
## [67] GGally_2.0.0           plyr_1.8.6             magrittr_1.5          
## [70] R6_2.4.1               generics_0.0.2         DBI_1.1.0             
## [73] pillar_1.4.6           withr_2.3.0            survival_3.2-7        
## [76] RCurl_1.98-1.2         tibble_3.0.3           crayon_1.3.4          
## [79] htm2txt_2.1.1          KernSmooth_2.23-17     rmarkdown_2.4         
## [82] locfit_1.5-9.4         grid_4.0.2             data.table_1.13.0     
## [85] blob_1.2.1             callr_3.5.0            digest_0.6.25         
## [88] pbmcapply_1.5.0        xtable_1.8-4           httpuv_1.5.4          
## [91] munsell_0.5.0          beeswarm_0.2.3         sessioninfo_1.1.1