Source: https://github.com/markziemann/miR-enrichment

Introduction

The predicted targets of microRNAs are used in functional enrichment analysis to justify the potential function of microRNAs, but there are some logical problems with this. In a biological tissue, not all of these targets will be expressed. Also the enrichment analysis requires a background list which is all the genes that can be measured. Not all genes are expressed in a tissue, at least half are silenced, so each enrichment analysis requires a custom background gene list. Unfortunately a custom background list is rarely used. We argue that this causes a dramatic distortion to the results.

Sample sheet

GSE221179

Control mRNA: SRR22776661 0 SRR22776662 0 SRR22776663 0 SRR22776664 0 SRR22776665 0 SRR22776666 0 SRR22776667 0 SRR22776670 0 SRR22776671 0 SRR22776672 0 SRR22776673 0 SRR22776674 0

Case mRNA: SRR22776652 1 SRR22776653 1 SRR22776654 1 SRR22776655 1 SRR22776656 1 SRR22776657 1 SRR22776658 1 SRR22776659 1 SRR22776668 1 SRR22776669 1

Methods

suppressPackageStartupMessages({
    library("DESeq2")
    library("gplots")
    library("mitch")
    library("eulerr")
    library("getDEE2")
    library("kableExtra")
})

Import read counts

Importing RNA-seq data

myfiles <-list.files(".",pattern="ke.tsv",recursive=TRUE)

x <- lapply(myfiles,function(x) {
  xx <- read.table(x,header=TRUE,row.names=1)
  xx[,3,drop=FALSE]
})

x <-  do.call(cbind,x)
colnames(x) <- gsub("_est_counts","",colnames(x))

Need gene symbols to map to the transcripts.

mdat <- getDEE2Metadata("hsapiens")
d <- getDEE2(species="hsapiens",SRRvec="SRR11509477",mdat,outfile="NULL",counts="GeneCounts",legacy=TRUE)
## For more information about DEE2 QC metrics, visit
##     https://github.com/markziemann/dee2/blob/master/qc/qc_metrics.md
head(d$TxInfo)
##                              GeneID GeneSymbol TxLength
## ENST00000434970.2 ENSG00000237235.2      TRDD2        9
## ENST00000448914.1 ENSG00000228985.1      TRDD3       13
## ENST00000415118.1 ENSG00000223997.1      TRDD1        8
## ENST00000631435.1 ENSG00000282253.1 AC239618.6       12
## ENST00000632684.1 ENSG00000282431.1 AC245427.8       12
## ENST00000454908.1 ENSG00000236170.1    IGHD1-1       17
txinfo <- d$TxInfo

Merge txinfo.

xm <- merge(x,txinfo,by=0)
xm$GeneID_symbol <- paste(xm$GeneID,xm$GeneSymbol)
xm$Row.names = xm$GeneID = xm$GeneSymbol = xm$TxLength = NULL
xa <- aggregate(. ~ GeneID_symbol,xm,sum)
rownames(xa) <- xa[,1]
xa[,1] = NULL

Differential expression

xaf <- xa[which(rowMeans(xa)>=10),]
dim(xa) ; dim(xaf)
## [1] 39297    22
## [1] 19829    22
ss <- data.frame("run"=colnames(xaf),"trt"=c(1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0))
rownames(ss) <- ss$run

mds <- cmdscale(dist(t(xaf)))
plot(mds,cex=2,col="gray",pch=19)
text(mds, labels=rownames(mds) ,col="black")

colSums(xaf)
## SRR22776652 SRR22776653 SRR22776654 SRR22776655 SRR22776656 SRR22776657 
##    44577031    42578062    40641747    47600899    42095262    43654015 
## SRR22776658 SRR22776659 SRR22776661 SRR22776662 SRR22776663 SRR22776664 
##    36622281    39514388    51369553    59497937    47135786    46315073 
## SRR22776665 SRR22776666 SRR22776667 SRR22776668 SRR22776669 SRR22776670 
##    48886845    39436308    46874224    40303736    35022570    38503382 
## SRR22776671 SRR22776672 SRR22776673 SRR22776674 
##    38573745    32617293    35023134    35252217
dds <- DESeqDataSetFromMatrix(countData = round(xaf) , colData = ss , design = ~ trt )
## converting counts to integer mode
##   the design formula contains one or more numeric variables with integer values,
##   specifying a model with increasing fold change for higher values.
##   did you mean for this to be a factor? if so, first convert
##   this variable to a factor using the factor() function
res <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 198 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
z <- results(res)
vsd <- vst(dds, blind=FALSE)
zz <- cbind(as.data.frame(z),assay(vsd))
dge <- as.data.frame(zz[order(zz$pvalue),])
head(dge)
##                             baseMean log2FoldChange      lfcSE      stat
## ENSG00000184787.18 UBE2G2 2096.44576      0.5986704 0.07093236  8.440018
## ENSG00000159110.19 IFNAR2  732.80690      0.8773440 0.10993724  7.980408
## ENSG00000142207.6 URB1    1725.70139      0.7591352 0.09581493  7.922932
## ENSG00000156304.14 SCAF4  1329.41817      0.6940686 0.08803683  7.883844
## ENSG00000183570.16 PCBP3   351.42031      1.1321849 0.14729229  7.686654
## ENSG00000278599.5 TBC1D3E   12.18373    -22.7625373 2.96976086 -7.664771
##                                 pvalue         padj SRR22776652 SRR22776653
## ENSG00000184787.18 UBE2G2 3.172897e-17 6.167794e-13   11.257861   11.358252
## ENSG00000159110.19 IFNAR2 1.458509e-15 1.417598e-11   10.353858    9.854420
## ENSG00000142207.6 URB1    2.319736e-15 1.503112e-11   11.071443   10.989664
## ENSG00000156304.14 SCAF4  3.174612e-15 1.542782e-11   10.844229   10.691194
## ENSG00000183570.16 PCBP3  1.510323e-14 5.804174e-11    9.230514    8.855684
## ENSG00000278599.5 TBC1D3E 1.791504e-14 5.804174e-11    3.587555    3.587555
##                           SRR22776654 SRR22776655 SRR22776656 SRR22776657
## ENSG00000184787.18 UBE2G2   11.327872   11.277669   11.299512   11.150247
## ENSG00000159110.19 IFNAR2    9.786524    9.678161    9.804457    9.765098
## ENSG00000142207.6 URB1      10.873669   11.081373   11.236143   11.190844
## ENSG00000156304.14 SCAF4    10.566949   10.520513   10.558705   10.542409
## ENSG00000183570.16 PCBP3     8.833820    8.393641    8.801850    8.659869
## ENSG00000278599.5 TBC1D3E    3.587555    3.587555    3.587555    3.587555
##                           SRR22776658 SRR22776659 SRR22776661 SRR22776662
## ENSG00000184787.18 UBE2G2   11.368094   11.517657   10.776558   10.684557
## ENSG00000159110.19 IFNAR2   10.161978   10.240415    8.953855    8.910633
## ENSG00000142207.6 URB1      10.910375   11.129646   10.578542   10.333626
## ENSG00000156304.14 SCAF4    10.904194   10.794201    9.991966    9.772529
## ENSG00000183570.16 PCBP3     9.753442    9.155070    8.091874    8.018512
## ENSG00000278599.5 TBC1D3E    3.587555    3.587555    3.587555    8.180945
##                           SRR22776663 SRR22776664 SRR22776665 SRR22776666
## ENSG00000184787.18 UBE2G2   10.676413   10.650871   10.731341   10.911498
## ENSG00000159110.19 IFNAR2    8.929105    9.030954    8.887490    9.211094
## ENSG00000142207.6 URB1      10.446935   10.387348   10.448332   10.526414
## ENSG00000156304.14 SCAF4     9.910626    9.988348    9.978689   10.183032
## ENSG00000183570.16 PCBP3     8.278882    7.742214    7.888101    8.057762
## ENSG00000278599.5 TBC1D3E    3.587555    8.299888    3.587555    3.587555
##                           SRR22776667 SRR22776668 SRR22776669 SRR22776670
## ENSG00000184787.18 UBE2G2   10.813144   11.325216   11.507736   10.984056
## ENSG00000159110.19 IFNAR2    9.086391    9.901334    9.920457    9.381979
## ENSG00000142207.6 URB1      10.441242   11.373563   11.382013   10.613548
## ENSG00000156304.14 SCAF4    10.032041   10.775505   11.042343   10.297097
## ENSG00000183570.16 PCBP3     7.898055    9.221419    8.951274    7.734237
## ENSG00000278599.5 TBC1D3E    4.073844    3.587555    5.234929    3.587555
##                           SRR22776671 SRR22776672 SRR22776673 SRR22776674
## ENSG00000184787.18 UBE2G2   10.888516   10.396533   10.573644   10.824061
## ENSG00000159110.19 IFNAR2    8.923631    9.264923    9.040072    9.592876
## ENSG00000142207.6 URB1      10.455225    9.791255   10.205286   10.194882
## ENSG00000156304.14 SCAF4    10.366049   10.155147    9.898820    9.936431
## ENSG00000183570.16 PCBP3     7.874810    7.862555    7.696717    8.371365
## ENSG00000278599.5 TBC1D3E    3.587555    3.587555    8.714839    3.587555
ups <- rownames(subset(dge,padj<0.05 & log2FoldChange>0))
dns <- rownames(subset(dge,padj<0.05 & log2FoldChange<0))
lapply(list("UPs"=ups,"DNs"=dns),length)
## $UPs
## [1] 840
## 
## $DNs
## [1] 498
nrow(dge)
## [1] 19829
saveRDS(dge,file="GSE221179.Rds")

Session information

For reproducibility.

sessionInfo()
## R version 4.4.0 (2024-04-24)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 22.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0 
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Australia/Melbourne
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] kableExtra_1.4.0            getDEE2_1.14.0             
##  [3] eulerr_7.0.2                mitch_1.16.0               
##  [5] gplots_3.1.3.1              DESeq2_1.44.0              
##  [7] SummarizedExperiment_1.34.0 Biobase_2.64.0             
##  [9] MatrixGenerics_1.16.0       matrixStats_1.3.0          
## [11] GenomicRanges_1.56.0        GenomeInfoDb_1.40.0        
## [13] IRanges_2.38.0              S4Vectors_0.42.0           
## [15] BiocGenerics_0.50.0        
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.2.1        viridisLite_0.4.2       dplyr_1.1.4            
##  [4] bitops_1.0-7            fastmap_1.2.0           GGally_2.2.1           
##  [7] promises_1.3.0          digest_0.6.35           mime_0.12              
## [10] lifecycle_1.0.4         magrittr_2.0.3          compiler_4.4.0         
## [13] sass_0.4.9              rlang_1.1.4             tools_4.4.0            
## [16] yaml_2.3.8              utf8_1.2.4              knitr_1.47             
## [19] S4Arrays_1.4.0          htmlwidgets_1.6.4       DelayedArray_0.30.1    
## [22] xml2_1.3.6              plyr_1.8.9              RColorBrewer_1.1-3     
## [25] abind_1.4-5             BiocParallel_1.38.0     KernSmooth_2.23-24     
## [28] purrr_1.0.2             grid_4.4.0              fansi_1.0.6            
## [31] caTools_1.18.2          xtable_1.8-4            colorspace_2.1-0       
## [34] ggplot2_3.5.1           MASS_7.3-60.2           scales_1.3.0           
## [37] gtools_3.9.5            cli_3.6.2               rmarkdown_2.27         
## [40] crayon_1.5.2            generics_0.1.3          rstudioapi_0.16.0      
## [43] reshape2_1.4.4          httr_1.4.7              cachem_1.1.0           
## [46] stringr_1.5.1           zlibbioc_1.50.0         parallel_4.4.0         
## [49] XVector_0.44.0          vctrs_0.6.5             Matrix_1.7-0           
## [52] jsonlite_1.8.8          echarts4r_0.4.5         beeswarm_0.4.0         
## [55] systemfonts_1.1.0       locfit_1.5-9.9          jquerylib_0.1.4        
## [58] tidyr_1.3.1             glue_1.7.0              ggstats_0.6.0          
## [61] codetools_0.2-20        stringi_1.8.4           gtable_0.3.5           
## [64] later_1.3.2             UCSC.utils_1.0.0        htm2txt_2.2.2          
## [67] munsell_0.5.1           tibble_3.2.1            pillar_1.9.0           
## [70] htmltools_0.5.8.1       GenomeInfoDbData_1.2.12 R6_2.5.1               
## [73] shiny_1.8.1.1           evaluate_0.23           lattice_0.22-6         
## [76] highr_0.11              bslib_0.7.0             httpuv_1.6.15          
## [79] Rcpp_1.0.12             svglite_2.1.3           gridExtra_2.3          
## [82] SparseArray_1.4.3       xfun_0.44               pkgconfig_2.0.3