Source: https://github.com/markziemann/miR-enrichment
The predicted targets of microRNAs are used in functional enrichment analysis to justify the potential function of microRNAs, but there are some logical problems with this. In a biological tissue, not all of these targets will be expressed. Also the enrichment analysis requires a background list which is all the genes that can be measured. Not all genes are expressed in a tissue, at least half are silenced, so each enrichment analysis requires a custom background gene list. Unfortunately a custom background list is rarely used. We argue that this causes a dramatic distortion to the results.
GSE221179
Control mRNA: SRR22776661 0 SRR22776662 0 SRR22776663 0 SRR22776664 0 SRR22776665 0 SRR22776666 0 SRR22776667 0 SRR22776670 0 SRR22776671 0 SRR22776672 0 SRR22776673 0 SRR22776674 0
Case mRNA: SRR22776652 1 SRR22776653 1 SRR22776654 1 SRR22776655 1 SRR22776656 1 SRR22776657 1 SRR22776658 1 SRR22776659 1 SRR22776668 1 SRR22776669 1
suppressPackageStartupMessages({
library("DESeq2")
library("gplots")
library("mitch")
library("eulerr")
library("getDEE2")
library("kableExtra")
})
Importing RNA-seq data
myfiles <-list.files(".",pattern="ke.tsv",recursive=TRUE)
x <- lapply(myfiles,function(x) {
xx <- read.table(x,header=TRUE,row.names=1)
xx[,3,drop=FALSE]
})
x <- do.call(cbind,x)
colnames(x) <- gsub("_est_counts","",colnames(x))
Need gene symbols to map to the transcripts.
mdat <- getDEE2Metadata("hsapiens")
d <- getDEE2(species="hsapiens",SRRvec="SRR11509477",mdat,outfile="NULL",counts="GeneCounts",legacy=TRUE)
## For more information about DEE2 QC metrics, visit
## https://github.com/markziemann/dee2/blob/master/qc/qc_metrics.md
head(d$TxInfo)
## GeneID GeneSymbol TxLength
## ENST00000434970.2 ENSG00000237235.2 TRDD2 9
## ENST00000448914.1 ENSG00000228985.1 TRDD3 13
## ENST00000415118.1 ENSG00000223997.1 TRDD1 8
## ENST00000631435.1 ENSG00000282253.1 AC239618.6 12
## ENST00000632684.1 ENSG00000282431.1 AC245427.8 12
## ENST00000454908.1 ENSG00000236170.1 IGHD1-1 17
txinfo <- d$TxInfo
Merge txinfo.
xm <- merge(x,txinfo,by=0)
xm$GeneID_symbol <- paste(xm$GeneID,xm$GeneSymbol)
xm$Row.names = xm$GeneID = xm$GeneSymbol = xm$TxLength = NULL
xa <- aggregate(. ~ GeneID_symbol,xm,sum)
rownames(xa) <- xa[,1]
xa[,1] = NULL
xaf <- xa[which(rowMeans(xa)>=10),]
dim(xa) ; dim(xaf)
## [1] 39297 22
## [1] 19829 22
ss <- data.frame("run"=colnames(xaf),"trt"=c(1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0))
rownames(ss) <- ss$run
mds <- cmdscale(dist(t(xaf)))
plot(mds,cex=2,col="gray",pch=19)
text(mds, labels=rownames(mds) ,col="black")
colSums(xaf)
## SRR22776652 SRR22776653 SRR22776654 SRR22776655 SRR22776656 SRR22776657
## 44577031 42578062 40641747 47600899 42095262 43654015
## SRR22776658 SRR22776659 SRR22776661 SRR22776662 SRR22776663 SRR22776664
## 36622281 39514388 51369553 59497937 47135786 46315073
## SRR22776665 SRR22776666 SRR22776667 SRR22776668 SRR22776669 SRR22776670
## 48886845 39436308 46874224 40303736 35022570 38503382
## SRR22776671 SRR22776672 SRR22776673 SRR22776674
## 38573745 32617293 35023134 35252217
dds <- DESeqDataSetFromMatrix(countData = round(xaf) , colData = ss , design = ~ trt )
## converting counts to integer mode
## the design formula contains one or more numeric variables with integer values,
## specifying a model with increasing fold change for higher values.
## did you mean for this to be a factor? if so, first convert
## this variable to a factor using the factor() function
res <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 198 genes
## -- DESeq argument 'minReplicatesForReplace' = 7
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
z <- results(res)
vsd <- vst(dds, blind=FALSE)
zz <- cbind(as.data.frame(z),assay(vsd))
dge <- as.data.frame(zz[order(zz$pvalue),])
head(dge)
## baseMean log2FoldChange lfcSE stat
## ENSG00000184787.18 UBE2G2 2096.44576 0.5986704 0.07093236 8.440018
## ENSG00000159110.19 IFNAR2 732.80690 0.8773440 0.10993724 7.980408
## ENSG00000142207.6 URB1 1725.70139 0.7591352 0.09581493 7.922932
## ENSG00000156304.14 SCAF4 1329.41817 0.6940686 0.08803683 7.883844
## ENSG00000183570.16 PCBP3 351.42031 1.1321849 0.14729229 7.686654
## ENSG00000278599.5 TBC1D3E 12.18373 -22.7625373 2.96976086 -7.664771
## pvalue padj SRR22776652 SRR22776653
## ENSG00000184787.18 UBE2G2 3.172897e-17 6.167794e-13 11.257861 11.358252
## ENSG00000159110.19 IFNAR2 1.458509e-15 1.417598e-11 10.353858 9.854420
## ENSG00000142207.6 URB1 2.319736e-15 1.503112e-11 11.071443 10.989664
## ENSG00000156304.14 SCAF4 3.174612e-15 1.542782e-11 10.844229 10.691194
## ENSG00000183570.16 PCBP3 1.510323e-14 5.804174e-11 9.230514 8.855684
## ENSG00000278599.5 TBC1D3E 1.791504e-14 5.804174e-11 3.587555 3.587555
## SRR22776654 SRR22776655 SRR22776656 SRR22776657
## ENSG00000184787.18 UBE2G2 11.327872 11.277669 11.299512 11.150247
## ENSG00000159110.19 IFNAR2 9.786524 9.678161 9.804457 9.765098
## ENSG00000142207.6 URB1 10.873669 11.081373 11.236143 11.190844
## ENSG00000156304.14 SCAF4 10.566949 10.520513 10.558705 10.542409
## ENSG00000183570.16 PCBP3 8.833820 8.393641 8.801850 8.659869
## ENSG00000278599.5 TBC1D3E 3.587555 3.587555 3.587555 3.587555
## SRR22776658 SRR22776659 SRR22776661 SRR22776662
## ENSG00000184787.18 UBE2G2 11.368094 11.517657 10.776558 10.684557
## ENSG00000159110.19 IFNAR2 10.161978 10.240415 8.953855 8.910633
## ENSG00000142207.6 URB1 10.910375 11.129646 10.578542 10.333626
## ENSG00000156304.14 SCAF4 10.904194 10.794201 9.991966 9.772529
## ENSG00000183570.16 PCBP3 9.753442 9.155070 8.091874 8.018512
## ENSG00000278599.5 TBC1D3E 3.587555 3.587555 3.587555 8.180945
## SRR22776663 SRR22776664 SRR22776665 SRR22776666
## ENSG00000184787.18 UBE2G2 10.676413 10.650871 10.731341 10.911498
## ENSG00000159110.19 IFNAR2 8.929105 9.030954 8.887490 9.211094
## ENSG00000142207.6 URB1 10.446935 10.387348 10.448332 10.526414
## ENSG00000156304.14 SCAF4 9.910626 9.988348 9.978689 10.183032
## ENSG00000183570.16 PCBP3 8.278882 7.742214 7.888101 8.057762
## ENSG00000278599.5 TBC1D3E 3.587555 8.299888 3.587555 3.587555
## SRR22776667 SRR22776668 SRR22776669 SRR22776670
## ENSG00000184787.18 UBE2G2 10.813144 11.325216 11.507736 10.984056
## ENSG00000159110.19 IFNAR2 9.086391 9.901334 9.920457 9.381979
## ENSG00000142207.6 URB1 10.441242 11.373563 11.382013 10.613548
## ENSG00000156304.14 SCAF4 10.032041 10.775505 11.042343 10.297097
## ENSG00000183570.16 PCBP3 7.898055 9.221419 8.951274 7.734237
## ENSG00000278599.5 TBC1D3E 4.073844 3.587555 5.234929 3.587555
## SRR22776671 SRR22776672 SRR22776673 SRR22776674
## ENSG00000184787.18 UBE2G2 10.888516 10.396533 10.573644 10.824061
## ENSG00000159110.19 IFNAR2 8.923631 9.264923 9.040072 9.592876
## ENSG00000142207.6 URB1 10.455225 9.791255 10.205286 10.194882
## ENSG00000156304.14 SCAF4 10.366049 10.155147 9.898820 9.936431
## ENSG00000183570.16 PCBP3 7.874810 7.862555 7.696717 8.371365
## ENSG00000278599.5 TBC1D3E 3.587555 3.587555 8.714839 3.587555
ups <- rownames(subset(dge,padj<0.05 & log2FoldChange>0))
dns <- rownames(subset(dge,padj<0.05 & log2FoldChange<0))
lapply(list("UPs"=ups,"DNs"=dns),length)
## $UPs
## [1] 840
##
## $DNs
## [1] 498
nrow(dge)
## [1] 19829
saveRDS(dge,file="GSE221179.Rds")
For reproducibility.
sessionInfo()
## R version 4.4.0 (2024-04-24)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 22.04.4 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Australia/Melbourne
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] kableExtra_1.4.0 getDEE2_1.14.0
## [3] eulerr_7.0.2 mitch_1.16.0
## [5] gplots_3.1.3.1 DESeq2_1.44.0
## [7] SummarizedExperiment_1.34.0 Biobase_2.64.0
## [9] MatrixGenerics_1.16.0 matrixStats_1.3.0
## [11] GenomicRanges_1.56.0 GenomeInfoDb_1.40.0
## [13] IRanges_2.38.0 S4Vectors_0.42.0
## [15] BiocGenerics_0.50.0
##
## loaded via a namespace (and not attached):
## [1] tidyselect_1.2.1 viridisLite_0.4.2 dplyr_1.1.4
## [4] bitops_1.0-7 fastmap_1.2.0 GGally_2.2.1
## [7] promises_1.3.0 digest_0.6.35 mime_0.12
## [10] lifecycle_1.0.4 magrittr_2.0.3 compiler_4.4.0
## [13] sass_0.4.9 rlang_1.1.4 tools_4.4.0
## [16] yaml_2.3.8 utf8_1.2.4 knitr_1.47
## [19] S4Arrays_1.4.0 htmlwidgets_1.6.4 DelayedArray_0.30.1
## [22] xml2_1.3.6 plyr_1.8.9 RColorBrewer_1.1-3
## [25] abind_1.4-5 BiocParallel_1.38.0 KernSmooth_2.23-24
## [28] purrr_1.0.2 grid_4.4.0 fansi_1.0.6
## [31] caTools_1.18.2 xtable_1.8-4 colorspace_2.1-0
## [34] ggplot2_3.5.1 MASS_7.3-60.2 scales_1.3.0
## [37] gtools_3.9.5 cli_3.6.2 rmarkdown_2.27
## [40] crayon_1.5.2 generics_0.1.3 rstudioapi_0.16.0
## [43] reshape2_1.4.4 httr_1.4.7 cachem_1.1.0
## [46] stringr_1.5.1 zlibbioc_1.50.0 parallel_4.4.0
## [49] XVector_0.44.0 vctrs_0.6.5 Matrix_1.7-0
## [52] jsonlite_1.8.8 echarts4r_0.4.5 beeswarm_0.4.0
## [55] systemfonts_1.1.0 locfit_1.5-9.9 jquerylib_0.1.4
## [58] tidyr_1.3.1 glue_1.7.0 ggstats_0.6.0
## [61] codetools_0.2-20 stringi_1.8.4 gtable_0.3.5
## [64] later_1.3.2 UCSC.utils_1.0.0 htm2txt_2.2.2
## [67] munsell_0.5.1 tibble_3.2.1 pillar_1.9.0
## [70] htmltools_0.5.8.1 GenomeInfoDbData_1.2.12 R6_2.5.1
## [73] shiny_1.8.1.1 evaluate_0.23 lattice_0.22-6
## [76] highr_0.11 bslib_0.7.0 httpuv_1.6.15
## [79] Rcpp_1.0.12 svglite_2.1.3 gridExtra_2.3
## [82] SparseArray_1.4.3 xfun_0.44 pkgconfig_2.0.3