miR Enrichment

Source: https://github.com/markziemann/miR-enrichment

Introduction

The predicted targets of microRNAs are used in functional enrichment analysis to justify the potential function of microRNAs, but there are some logical problems with this. In a biological tissue, not all of these targets will be expressed. Also the enrichment analysis requires a background list which is all the genes that can be measured. Not all genes are expressed in a tissue, at least half are silenced, so each enrichment analysis requires a custom background gene list. Unfortunately a custom background list is rarely used. We argue that this causes a dramatic distortion to the results.

Sample sheet

GSE232687

Control mRNA: Mock_IN_1 Mock_IN_2 Mock_IN_3

Case mRNA: RSV_IN_1 RSV_IN_2 RSV_IN_3

Methods

suppressPackageStartupMessages({
    library("DESeq2")
    library("gplots")
    library("mitch")
    library("eulerr")
    library("getDEE2")
    library("kableExtra")
})

Import read counts

Importing RNA-seq data

x <- read.table("GSE231788_SR92_genes_n_regRegions_raw_counts_for_degust_GEO.csv.gz",sep=",",header=TRUE)
x$gene <- paste(x$gene_id,x$gene_symbol)
x$biotype = x$chr = x$gene_symbol = x$gene_id = NULL
xa <- aggregate(. ~ gene, x , sum)
dim(x) ; dim(xa)

## [1] 276287      7

## [1] 271910      7

rownames(xa) <- xa$gene
xa$gene = NULL
table(grepl("ENSR",rownames(xa)))

## 
##  FALSE   TRUE 
##  62331 209579

xarat <- xa[grep("ENSR",rownames(xa)),]
colSums(xarat)

## Mock_IN_1 Mock_IN_2 Mock_IN_3  RSV_IN_1  RSV_IN_2  RSV_IN_3 
##  320779.1  336330.1  260418.1  357631.0  333995.4  440170.4

xah <- xa[grep("ENSG",rownames(xa)),]
colSums(xah)

## Mock_IN_1 Mock_IN_2 Mock_IN_3  RSV_IN_1  RSV_IN_2  RSV_IN_3 
##  17897967  18651940  14635180  14568408  13726321  18383635

head(xah)

##                             Mock_IN_1 Mock_IN_2 Mock_IN_3 RSV_IN_1 RSV_IN_2
## ENSG00000000003.15 TSPAN6      531.00       642       457      255      253
## ENSG00000000005.6 TNMD           0.00         0         0        0        0
## ENSG00000000419.12 DPM1        480.00       556       393      360      327
## ENSG00000000457.14 SCYL3       279.25       277       225      264      244
## ENSG00000000460.17 C1orf112    354.00       404       299      325      328
## ENSG00000000938.13 FGR           0.00         0         0       10        6
##                             RSV_IN_3
## ENSG00000000003.15 TSPAN6        340
## ENSG00000000005.6 TNMD             0
## ENSG00000000419.12 DPM1          438
## ENSG00000000457.14 SCYL3         323
## ENSG00000000460.17 C1orf112      471
## ENSG00000000938.13 FGR             4

xahf <- xah[which(rowMeans(xah)>=10),]
dim(xah) ; dim(xahf)

## [1] 62331     6

## [1] 19579     6

Differential expression

ss <- data.frame("run"=colnames(xahf),"trt"=c(0,0,0,1,1,1))
rownames(ss) <- colnames(xahf)

mds <- cmdscale(dist(t(xahf)))
plot(mds,cex=2,col="gray",pch=19)
text(mds, labels=rownames(mds) ,col="black")

colSums(xahf)

## Mock_IN_1 Mock_IN_2 Mock_IN_3  RSV_IN_1  RSV_IN_2  RSV_IN_3 
##  17860468  18613014  14605280  14526097  13686719  18330576

dds <- DESeqDataSetFromMatrix(countData = round(xahf) , colData = ss , design = ~ trt )

## converting counts to integer mode

##   the design formula contains one or more numeric variables with integer values,
##   specifying a model with increasing fold change for higher values.
##   did you mean for this to be a factor? if so, first convert
##   this variable to a factor using the factor() function

res <- DESeq(dds)

## estimating size factors

## estimating dispersions

## gene-wise dispersion estimates

## mean-dispersion relationship

## final dispersion estimates

## fitting model and testing

z <- results(res)
vsd <- vst(dds, blind=FALSE)
zz <- cbind(as.data.frame(z),assay(vsd))
dge <- as.data.frame(zz[order(zz$pvalue),])
head(dge)

##                             baseMean log2FoldChange      lfcSE      stat pvalue
## ENSG00000002549.12 LAP3     1957.485       3.088694 0.05340720  57.83293      0
## ENSG00000003402.20 CFLAR    4200.253       2.010651 0.03227501  62.29745      0
## ENSG00000004468.13 CD38     1097.075       2.837600 0.06620352  42.86177      0
## ENSG00000005022.6 SLC25A5   4224.912      -1.270876 0.03075653 -41.32053      0
## ENSG00000006118.14 TMEM132A 4916.566       2.947950 0.03372988  87.39875      0
## ENSG00000006459.11 KDM7A    1336.733       2.493349 0.05841696  42.68194      0
##                             padj Mock_IN_1 Mock_IN_2 Mock_IN_3 RSV_IN_1
## ENSG00000002549.12 LAP3        0  11.48364  11.46163  11.42396 12.67572
## ENSG00000003402.20 CFLAR       0  12.11881  12.12104  12.13961 13.25673
## ENSG00000004468.13 CD38        0  11.31782  11.32527  11.33929 12.22754
## ENSG00000005022.6 SLC25A5      0  13.14469  13.16882  13.15033 12.39812
## ENSG00000006118.14 TMEM132A    0  11.90083  11.89150  11.89584 13.54467
## ENSG00000006459.11 KDM7A       0  11.45184  11.42654  11.47735 12.33421
##                             RSV_IN_2 RSV_IN_3
## ENSG00000002549.12 LAP3     12.66564 12.66829
## ENSG00000003402.20 CFLAR    13.27641 13.28575
## ENSG00000004468.13 CD38     12.21938 12.20858
## ENSG00000005022.6 SLC25A5   12.38320 12.40628
## ENSG00000006118.14 TMEM132A 13.52602 13.55476
## ENSG00000006459.11 KDM7A    12.34125 12.32280

ups <- rownames(subset(dge,padj<0.05 & log2FoldChange>0))
dns <- rownames(subset(dge,padj<0.05 & log2FoldChange<0))
lapply(list("UPs"=ups,"DNs"=dns),length)

## $UPs
## [1] 6543
## 
## $DNs
## [1] 6254

nrow(dge)

## [1] 19579

saveRDS(dge,file="GSE232687.Rmd")

Session information

For reproducibility.

sessionInfo()

## R version 4.4.0 (2024-04-24)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 22.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0 
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Australia/Melbourne
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] kableExtra_1.4.0            getDEE2_1.14.0             
##  [3] eulerr_7.0.2                mitch_1.16.0               
##  [5] gplots_3.1.3.1              DESeq2_1.44.0              
##  [7] SummarizedExperiment_1.34.0 Biobase_2.64.0             
##  [9] MatrixGenerics_1.16.0       matrixStats_1.3.0          
## [11] GenomicRanges_1.56.0        GenomeInfoDb_1.40.0        
## [13] IRanges_2.38.0              S4Vectors_0.42.0           
## [15] BiocGenerics_0.50.0        
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.2.1        viridisLite_0.4.2       dplyr_1.1.4            
##  [4] bitops_1.0-7            fastmap_1.2.0           GGally_2.2.1           
##  [7] promises_1.3.0          digest_0.6.35           mime_0.12              
## [10] lifecycle_1.0.4         magrittr_2.0.3          compiler_4.4.0         
## [13] sass_0.4.9              rlang_1.1.4             tools_4.4.0            
## [16] yaml_2.3.8              utf8_1.2.4              knitr_1.47             
## [19] S4Arrays_1.4.0          htmlwidgets_1.6.4       DelayedArray_0.30.1    
## [22] xml2_1.3.6              plyr_1.8.9              RColorBrewer_1.1-3     
## [25] abind_1.4-5             BiocParallel_1.38.0     KernSmooth_2.23-24     
## [28] purrr_1.0.2             grid_4.4.0              fansi_1.0.6            
## [31] caTools_1.18.2          xtable_1.8-4            colorspace_2.1-0       
## [34] ggplot2_3.5.1           MASS_7.3-60.2           scales_1.3.0           
## [37] gtools_3.9.5            cli_3.6.2               rmarkdown_2.27         
## [40] crayon_1.5.2            generics_0.1.3          rstudioapi_0.16.0      
## [43] reshape2_1.4.4          httr_1.4.7              cachem_1.1.0           
## [46] stringr_1.5.1           zlibbioc_1.50.0         parallel_4.4.0         
## [49] XVector_0.44.0          vctrs_0.6.5             Matrix_1.7-0           
## [52] jsonlite_1.8.8          echarts4r_0.4.5         beeswarm_0.4.0         
## [55] systemfonts_1.1.0       locfit_1.5-9.9          jquerylib_0.1.4        
## [58] tidyr_1.3.1             glue_1.7.0              ggstats_0.6.0          
## [61] codetools_0.2-20        stringi_1.8.4           gtable_0.3.5           
## [64] later_1.3.2             UCSC.utils_1.0.0        htm2txt_2.2.2          
## [67] munsell_0.5.1           tibble_3.2.1            pillar_1.9.0           
## [70] htmltools_0.5.8.1       GenomeInfoDbData_1.2.12 R6_2.5.1               
## [73] shiny_1.8.1.1           evaluate_0.23           lattice_0.22-6         
## [76] highr_0.11              bslib_0.7.0             httpuv_1.6.15          
## [79] Rcpp_1.0.12             svglite_2.1.3           gridExtra_2.3          
## [82] SparseArray_1.4.3       xfun_0.44               pkgconfig_2.0.3