Source: https://github.com/markziemann/miR-enrichment
The predicted targets of microRNAs are used in functional enrichment analysis to justify the potential function of microRNAs, but there are some logical problems with this. In a biological tissue, not all of these targets will be expressed. Also the enrichment analysis requires a background list which is all the genes that can be measured. Not all genes are expressed in a tissue, at least half are silenced, so each enrichment analysis requires a custom background gene list. Unfortunately a custom background list is rarely used. We argue that this causes a dramatic distortion to the results.
GSE232687
Control mRNA: Mock_IN_1 Mock_IN_2 Mock_IN_3
Case mRNA: RSV_IN_1 RSV_IN_2 RSV_IN_3
suppressPackageStartupMessages({
library("DESeq2")
library("gplots")
library("mitch")
library("eulerr")
library("getDEE2")
library("kableExtra")
})
Importing RNA-seq data
x <- read.table("GSE231788_SR92_genes_n_regRegions_raw_counts_for_degust_GEO.csv.gz",sep=",",header=TRUE)
x$gene <- paste(x$gene_id,x$gene_symbol)
x$biotype = x$chr = x$gene_symbol = x$gene_id = NULL
xa <- aggregate(. ~ gene, x , sum)
dim(x) ; dim(xa)
## [1] 276287 7
## [1] 271910 7
rownames(xa) <- xa$gene
xa$gene = NULL
table(grepl("ENSR",rownames(xa)))
##
## FALSE TRUE
## 62331 209579
xarat <- xa[grep("ENSR",rownames(xa)),]
colSums(xarat)
## Mock_IN_1 Mock_IN_2 Mock_IN_3 RSV_IN_1 RSV_IN_2 RSV_IN_3
## 320779.1 336330.1 260418.1 357631.0 333995.4 440170.4
xah <- xa[grep("ENSG",rownames(xa)),]
colSums(xah)
## Mock_IN_1 Mock_IN_2 Mock_IN_3 RSV_IN_1 RSV_IN_2 RSV_IN_3
## 17897967 18651940 14635180 14568408 13726321 18383635
head(xah)
## Mock_IN_1 Mock_IN_2 Mock_IN_3 RSV_IN_1 RSV_IN_2
## ENSG00000000003.15 TSPAN6 531.00 642 457 255 253
## ENSG00000000005.6 TNMD 0.00 0 0 0 0
## ENSG00000000419.12 DPM1 480.00 556 393 360 327
## ENSG00000000457.14 SCYL3 279.25 277 225 264 244
## ENSG00000000460.17 C1orf112 354.00 404 299 325 328
## ENSG00000000938.13 FGR 0.00 0 0 10 6
## RSV_IN_3
## ENSG00000000003.15 TSPAN6 340
## ENSG00000000005.6 TNMD 0
## ENSG00000000419.12 DPM1 438
## ENSG00000000457.14 SCYL3 323
## ENSG00000000460.17 C1orf112 471
## ENSG00000000938.13 FGR 4
xahf <- xah[which(rowMeans(xah)>=10),]
dim(xah) ; dim(xahf)
## [1] 62331 6
## [1] 19579 6
ss <- data.frame("run"=colnames(xahf),"trt"=c(0,0,0,1,1,1))
rownames(ss) <- colnames(xahf)
mds <- cmdscale(dist(t(xahf)))
plot(mds,cex=2,col="gray",pch=19)
text(mds, labels=rownames(mds) ,col="black")
colSums(xahf)
## Mock_IN_1 Mock_IN_2 Mock_IN_3 RSV_IN_1 RSV_IN_2 RSV_IN_3
## 17860468 18613014 14605280 14526097 13686719 18330576
dds <- DESeqDataSetFromMatrix(countData = round(xahf) , colData = ss , design = ~ trt )
## converting counts to integer mode
## the design formula contains one or more numeric variables with integer values,
## specifying a model with increasing fold change for higher values.
## did you mean for this to be a factor? if so, first convert
## this variable to a factor using the factor() function
res <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
z <- results(res)
vsd <- vst(dds, blind=FALSE)
zz <- cbind(as.data.frame(z),assay(vsd))
dge <- as.data.frame(zz[order(zz$pvalue),])
head(dge)
## baseMean log2FoldChange lfcSE stat pvalue
## ENSG00000002549.12 LAP3 1957.485 3.088694 0.05340720 57.83293 0
## ENSG00000003402.20 CFLAR 4200.253 2.010651 0.03227501 62.29745 0
## ENSG00000004468.13 CD38 1097.075 2.837600 0.06620352 42.86177 0
## ENSG00000005022.6 SLC25A5 4224.912 -1.270876 0.03075653 -41.32053 0
## ENSG00000006118.14 TMEM132A 4916.566 2.947950 0.03372988 87.39875 0
## ENSG00000006459.11 KDM7A 1336.733 2.493349 0.05841696 42.68194 0
## padj Mock_IN_1 Mock_IN_2 Mock_IN_3 RSV_IN_1
## ENSG00000002549.12 LAP3 0 11.48364 11.46163 11.42396 12.67572
## ENSG00000003402.20 CFLAR 0 12.11881 12.12104 12.13961 13.25673
## ENSG00000004468.13 CD38 0 11.31782 11.32527 11.33929 12.22754
## ENSG00000005022.6 SLC25A5 0 13.14469 13.16882 13.15033 12.39812
## ENSG00000006118.14 TMEM132A 0 11.90083 11.89150 11.89584 13.54467
## ENSG00000006459.11 KDM7A 0 11.45184 11.42654 11.47735 12.33421
## RSV_IN_2 RSV_IN_3
## ENSG00000002549.12 LAP3 12.66564 12.66829
## ENSG00000003402.20 CFLAR 13.27641 13.28575
## ENSG00000004468.13 CD38 12.21938 12.20858
## ENSG00000005022.6 SLC25A5 12.38320 12.40628
## ENSG00000006118.14 TMEM132A 13.52602 13.55476
## ENSG00000006459.11 KDM7A 12.34125 12.32280
ups <- rownames(subset(dge,padj<0.05 & log2FoldChange>0))
dns <- rownames(subset(dge,padj<0.05 & log2FoldChange<0))
lapply(list("UPs"=ups,"DNs"=dns),length)
## $UPs
## [1] 6543
##
## $DNs
## [1] 6254
nrow(dge)
## [1] 19579
saveRDS(dge,file="GSE232687.Rmd")
For reproducibility.
sessionInfo()
## R version 4.4.0 (2024-04-24)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 22.04.4 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Australia/Melbourne
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] kableExtra_1.4.0 getDEE2_1.14.0
## [3] eulerr_7.0.2 mitch_1.16.0
## [5] gplots_3.1.3.1 DESeq2_1.44.0
## [7] SummarizedExperiment_1.34.0 Biobase_2.64.0
## [9] MatrixGenerics_1.16.0 matrixStats_1.3.0
## [11] GenomicRanges_1.56.0 GenomeInfoDb_1.40.0
## [13] IRanges_2.38.0 S4Vectors_0.42.0
## [15] BiocGenerics_0.50.0
##
## loaded via a namespace (and not attached):
## [1] tidyselect_1.2.1 viridisLite_0.4.2 dplyr_1.1.4
## [4] bitops_1.0-7 fastmap_1.2.0 GGally_2.2.1
## [7] promises_1.3.0 digest_0.6.35 mime_0.12
## [10] lifecycle_1.0.4 magrittr_2.0.3 compiler_4.4.0
## [13] sass_0.4.9 rlang_1.1.4 tools_4.4.0
## [16] yaml_2.3.8 utf8_1.2.4 knitr_1.47
## [19] S4Arrays_1.4.0 htmlwidgets_1.6.4 DelayedArray_0.30.1
## [22] xml2_1.3.6 plyr_1.8.9 RColorBrewer_1.1-3
## [25] abind_1.4-5 BiocParallel_1.38.0 KernSmooth_2.23-24
## [28] purrr_1.0.2 grid_4.4.0 fansi_1.0.6
## [31] caTools_1.18.2 xtable_1.8-4 colorspace_2.1-0
## [34] ggplot2_3.5.1 MASS_7.3-60.2 scales_1.3.0
## [37] gtools_3.9.5 cli_3.6.2 rmarkdown_2.27
## [40] crayon_1.5.2 generics_0.1.3 rstudioapi_0.16.0
## [43] reshape2_1.4.4 httr_1.4.7 cachem_1.1.0
## [46] stringr_1.5.1 zlibbioc_1.50.0 parallel_4.4.0
## [49] XVector_0.44.0 vctrs_0.6.5 Matrix_1.7-0
## [52] jsonlite_1.8.8 echarts4r_0.4.5 beeswarm_0.4.0
## [55] systemfonts_1.1.0 locfit_1.5-9.9 jquerylib_0.1.4
## [58] tidyr_1.3.1 glue_1.7.0 ggstats_0.6.0
## [61] codetools_0.2-20 stringi_1.8.4 gtable_0.3.5
## [64] later_1.3.2 UCSC.utils_1.0.0 htm2txt_2.2.2
## [67] munsell_0.5.1 tibble_3.2.1 pillar_1.9.0
## [70] htmltools_0.5.8.1 GenomeInfoDbData_1.2.12 R6_2.5.1
## [73] shiny_1.8.1.1 evaluate_0.23 lattice_0.22-6
## [76] highr_0.11 bslib_0.7.0 httpuv_1.6.15
## [79] Rcpp_1.0.12 svglite_2.1.3 gridExtra_2.3
## [82] SparseArray_1.4.3 xfun_0.44 pkgconfig_2.0.3