Source: https://github.com/markziemann/mesangial_rageko

Introduction

Looking at doing comparison of control and RAGE KO Mouse mesangial cells RNA-seq and miRNA-seq.

Methods

  • RAGE KO data sequenced at Baker. 61 nt length.

  • WT data sequenced at BGI with 91 nt length.

  • Quality trimming with skewer 0.2.2 to ensure 3’ end base quality > 20.

  • microRNA hairpins were downloaded from miRbase

  • Mouse hairpins were extracted from the reference and indexed with kallisto using a seed length of 15 bp.

  • kallisto was run in single end mode to map the reads.

  • Read counts were loaded into R and analysed with DESeq2.

  • OS, R and package versions are shown in the session info section at the end of this report.

suppressPackageStartupMessages({
  library("reshape2")
  library("DESeq2")
  library("gplots")
  library("mitch")
  library("dplyr")
  library("biomaRt")
  library("kableExtra")
})

Data

tmp <- read.table("3col.tsv.gz",header=F)
x <- as.matrix(acast(tmp, V2~V1, value.var="V3", fun.aggregate = sum))
x <- as.data.frame(x)

xx <- round(x)
dim(xx)
## [1] 1234   12
head(xx)
##              miR_rageko_ctrl_1 miR_rageko_ctrl_2 miR_rageko_ctrl_3
## mmu-let-7a-1             20363             20727              9947
## mmu-let-7a-2              1543              1441               691
## mmu-let-7b               42713             48743             23034
## mmu-let-7c-1             37266             34769             21005
## mmu-let-7c-2             74907             92800             46649
## mmu-let-7d                8707              9948              4206
##              miR_rageko_TGF_1 miR_rageko_TGF_2 miR_rageko_TGF_3 miR_wt_ctrl_1
## mmu-let-7a-1            16769            12800            12352       1306100
## mmu-let-7a-2             1489              614              608         44116
## mmu-let-7b              22473            15060            13756       3184290
## mmu-let-7c-1            22296             8455             5260       3453180
## mmu-let-7c-2            57307            32425            30454       1240720
## mmu-let-7d               6126             6035             5399        189697
##              miR_wt_ctrl_2 miR_wt_ctrl_3 miR_wt_TGF_1 miR_wt_TGF_2 miR_wt_TGF_3
## mmu-let-7a-1       1348620       1472040      1669340      1426850      1563840
## mmu-let-7a-2         43526         50652        78374        78690        84683
## mmu-let-7b         3163160       3465470      3348090      2859190      3046660
## mmu-let-7c-1       3466790       3664760      3491620      2982230      3103810
## mmu-let-7c-2       1190810       1395550      1427850      1286160      1329270
## mmu-let-7d          195022        196772       242764       212549       217202
write.table(x=xx,file="miRNA_count_matrix.tsv",sep="\t")

Now quantify the top 10 species. Starting with all samples.

rpm <- apply(xx, 2, function(x){x/sum(x,na.rm=T)}) * 1000000

rpm <- rpm[order(rowMeans(rpm) ),]

rpm <- tail(rpm,10)

# create color palette:
library(RColorBrewer)
n=10
pal <- brewer.pal(n, "Paired")

par(mfrow=c(1,2))
par(mar=c(10,5,3,1))
barplot(rpm,col=pal,las=2)
plot.new()
legend("topleft", legend=rev(rownames(rpm)),fill=rev(pal),bg="white",cex=1)

#restore default setting
par(mar=c(5.1, 4.1, 4.1, 2.1))

Now calculate diversity of each sample.

pc <- apply(xx, 2, function(x){x/sum(x,na.rm=T)}) * 100

res <- lapply(1:ncol(pc),function(i){
  dat <- pc[,i]
  dat <- dat[order(-dat)]
  head(cumsum(dat))
})
names(res) <- colnames(pc)
res
## $miR_rageko_ctrl_1
## mmu-mir-21a mmu-mir-143 mmu-mir-30a mmu-mir-10a mmu-mir-27b  mmu-let-7i 
##    35.04691    50.68179    56.93842    62.08391    65.17234    67.31654 
## 
## $miR_rageko_ctrl_2
## mmu-mir-21a mmu-mir-143 mmu-mir-30a mmu-mir-10a mmu-mir-27b  mmu-let-7i 
##    33.26619    49.71553    55.91072    60.54882    63.63909    65.95557 
## 
## $miR_rageko_ctrl_3
## mmu-mir-21a mmu-mir-143 mmu-mir-30a mmu-mir-10a mmu-mir-27b mmu-mir-30d 
##    33.03525    50.11035    58.16282    62.36956    65.16494    67.86357 
## 
## $miR_rageko_TGF_1
##    mmu-mir-21a    mmu-mir-143    mmu-mir-30a    mmu-mir-10a    mmu-mir-30d 
##       37.08525       51.32986       58.80983       63.22891       66.24793 
## mmu-mir-199a-1 
##       68.14504 
## 
## $miR_rageko_TGF_2
##    mmu-mir-21a    mmu-mir-143    mmu-mir-30a    mmu-mir-10a mmu-mir-199a-1 
##       45.01689       58.06857       63.68706       65.88644       68.07317 
##     mmu-mir-22 
##       70.25966 
## 
## $miR_rageko_TGF_3
##    mmu-mir-21a    mmu-mir-143    mmu-mir-30a     mmu-mir-22 mmu-mir-199a-1 
##       47.72168       62.21030       66.64633       69.00270       71.20109 
##    mmu-mir-10a 
##       73.29187 
## 
## $miR_wt_ctrl_1
## mmu-let-7c-1   mmu-let-7b mmu-let-7a-1 mmu-let-7c-2 mmu-let-7f-2   mmu-let-7e 
##     22.41564     43.08582     51.56412     59.61800     66.69586     73.01094 
## 
## $miR_wt_ctrl_2
## mmu-let-7c-1   mmu-let-7b mmu-let-7a-1 mmu-let-7c-2 mmu-let-7f-2   mmu-let-7e 
##     22.21449     42.48338     51.12507     58.75554     65.99981     72.87769 
## 
## $miR_wt_ctrl_3
## mmu-let-7c-1   mmu-let-7b mmu-let-7a-1 mmu-let-7c-2   mmu-let-7e mmu-let-7f-2 
##     22.04385     42.88896     51.74341     60.13776     67.33343     74.10425 
## 
## $miR_wt_TGF_1
## mmu-let-7c-1   mmu-let-7b   mmu-let-7e mmu-let-7a-1 mmu-let-7f-1 mmu-let-7c-2 
##     19.23706     37.68335     47.70534     56.90256     65.38002     73.24676 
## 
## $miR_wt_TGF_2
## mmu-let-7c-1   mmu-let-7b   mmu-let-7e mmu-let-7a-1 mmu-let-7f-1 mmu-let-7c-2 
##     19.07475     37.36251     47.31784     56.44416     65.47192     73.69838 
## 
## $miR_wt_TGF_3
## mmu-let-7c-1   mmu-let-7b   mmu-let-7e mmu-let-7a-1 mmu-let-7f-1 mmu-let-7c-2 
##     18.82160     37.29665     47.61448     57.09765     66.01360     74.07433

Samplesheet

#colnames(xx) <- sapply(strsplit(colnames(xx),"_"),"[[",1)
mysamples <- colnames(xx)
ko <- as.numeric(grepl("ko",mysamples))
tgf <- as.numeric(grepl("TGF",mysamples))
ss <- data.frame(mysamples,ko,tgf)
rownames(ss) <- mysamples
ss
##                           mysamples ko tgf
## miR_rageko_ctrl_1 miR_rageko_ctrl_1  1   0
## miR_rageko_ctrl_2 miR_rageko_ctrl_2  1   0
## miR_rageko_ctrl_3 miR_rageko_ctrl_3  1   0
## miR_rageko_TGF_1   miR_rageko_TGF_1  1   1
## miR_rageko_TGF_2   miR_rageko_TGF_2  1   1
## miR_rageko_TGF_3   miR_rageko_TGF_3  1   1
## miR_wt_ctrl_1         miR_wt_ctrl_1  0   0
## miR_wt_ctrl_2         miR_wt_ctrl_2  0   0
## miR_wt_ctrl_3         miR_wt_ctrl_3  0   0
## miR_wt_TGF_1           miR_wt_TGF_1  0   1
## miR_wt_TGF_2           miR_wt_TGF_2  0   1
## miR_wt_TGF_3           miR_wt_TGF_3  0   1

QC

TODO: rRNA carryover.

par(mar=c(5,8,3,1))
barplot(colSums(xx),horiz=TRUE,las=1,xlab="num reads",col=ss$cols)
grid()

MDS plot

And correlation heat map.

par(mar=c(5,5,3,3))

mds <- cmdscale(dist(t(xx)))

plot(mds, xlab="Coordinate 1", ylab="Coordinate 2",
  type = "p",bty="n", cex=4 )

text(mds, labels=rownames(mds) ,col="black")

heatmap.2(cor(xx),trace="n",main="Pearson correlation heatmap",margin=c(8,8),cexRow=0.7,cexCol=0.7)

Differential expression

Compare wt control to RAGE KO control.

maplot <- function(de,contrast_name) {
  de <- de[which(!is.na(de$padj)),]
  sig <-subset(de, padj < 0.05 )
  up <-rownames(subset(de, padj < 0.05 & log2FoldChange > 0))
  dn <-rownames(subset(de, padj < 0.05 & log2FoldChange < 0))
  GENESUP <- length(up)
  GENESDN <- length(dn)
  DET=nrow(de)
  SUBHEADER = paste(GENESUP, "up, ", GENESDN, "down", DET, "detected")
  ns <-subset(de, padj > 0.05 )
  plot(log2(de$baseMean),de$log2FoldChange,
       xlab="log2 basemean", ylab="log2 foldchange",
       pch=19, cex=0.5, col="dark gray",
       main=contrast_name, cex.main=1)
  points(log2(sig$baseMean),sig$log2FoldChange,
         pch=19, cex=0.5, col="red")
  mtext(SUBHEADER,cex = 1)
}
make_volcano <- function(de,name) {
    de <- de[which(!is.na(de$padj)),]
    de$pvalue[which(de$pvalue==0)] <- 1e-320
    sig <- subset(de,padj<0.05)
    N_SIG=nrow(sig)
    N_UP=nrow(subset(sig,log2FoldChange>0))
    N_DN=nrow(subset(sig,log2FoldChange<0))
    DET=nrow(de)
    HEADER=paste(N_SIG,"@5%FDR,", N_UP, "up", N_DN, "dn", DET, "detected")
    plot(de$log2FoldChange,-log10(de$pval),cex=0.5,pch=19,col="darkgray",
        main=name, xlab="log2 FC", ylab="-log10 pval")
    mtext(HEADER)
    grid()
    points(sig$log2FoldChange,-log10(sig$pval),cex=0.5,pch=19,col="red")
}
ss1 <- subset(ss,tgf==0)
xx1 <- xx[,which(colnames(xx) %in% rownames(ss1) )]
dim(xx1)
## [1] 1234    6
xx1 <- xx1[which(rowMeans(xx1)>10),]
dim(xx1)
## [1] 436   6
dds <- DESeqDataSetFromMatrix(countData = xx1 , colData = ss1 , design = ~ ko )
## converting counts to integer mode
##   the design formula contains one or more numeric variables with integer values,
##   specifying a model with increasing fold change for higher values.
##   did you mean for this to be a factor? if so, first convert
##   this variable to a factor using the factor() function
res <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
z <- results(res)
vsd <- vst(dds, blind=FALSE,nsub=nrow(xx1))

zz <- cbind(as.data.frame(z),assay(vsd))
dge <- as.data.frame(zz[order(zz$pvalue),])
dge[1:20,1:6] %>% kbl(caption = "Top gene expression differences") %>% 
  kable_paper("hover", full_width = F)
Top gene expression differences
baseMean log2FoldChange lfcSE stat pvalue padj
mmu-let-7a-1 688836.774 -6.405851 0.1409814 -45.43755 0 0
mmu-let-7b 1637945.068 -6.481798 0.1472216 -44.02751 0 0
mmu-let-7c-1 1760492.704 -6.861097 0.1451876 -47.25676 0 0
mmu-let-7e 542795.327 -6.057295 0.1478637 -40.96540 0 0
mmu-let-7k 6972.734 -8.746573 0.1986122 -44.03844 0 0
mmu-mir-183 10684.981 5.936845 0.1475355 40.24012 0 0
mmu-mir-503 7238.823 -7.025117 0.1669449 -42.08044 0 0
mmu-mir-5099 50115.479 5.904955 0.1485551 39.74927 0 0
mmu-mir-21c 3170.357 8.086983 0.2186157 36.99178 0 0
mmu-mir-181d 11207.275 -6.469621 0.1763896 -36.67802 0 0
mmu-mir-10b 51943.154 4.448196 0.1251096 35.55440 0 0
mmu-mir-181b-2 16158.569 -5.081381 0.1431792 -35.48966 0 0
mmu-let-7a-2 23438.059 -5.314162 0.1523568 -34.87973 0 0
mmu-mir-3074-2 12854.203 5.334356 0.1547168 34.47820 0 0
mmu-mir-298 2757.555 -5.573552 0.1623121 -34.33848 0 0
mmu-mir-214 5444.600 5.150779 0.1534990 33.55579 0 0
mmu-let-7d 99318.677 -4.746768 0.1415774 -33.52772 0 0
mmu-mir-1839 17238.902 -4.554979 0.1395659 -32.63676 0 0
mmu-mir-222 125551.730 -4.952649 0.1518771 -32.60957 0 0
mmu-mir-320 103212.812 -4.961718 0.1534058 -32.34374 0 0
d1up <- rownames(subset(dge,padj <= 0.05 & log2FoldChange > 0))
d1dn <- rownames(subset(dge,padj <= 0.05 & log2FoldChange < 0))
write.table(dge,file="miRNA_DE_ctrl_vs_rageko.tsv",quote=FALSE,sep="\t")

maplot(dge,"Cont1: Effect of RAGE KO")

make_volcano(dge,"Cont1: Effect of RAGE KO")

#agerdat <- assay(vsd)[grep("Ager",rownames(assay(vsd))),]
#par(mar=c(5,8,3,1))
#barplot(agerdat,horiz=TRUE,las=1,xlab="Normalised Ager expression",xlim=c(0,10))

Make a heatmap of the top 30 genes.

rpm1 <- apply(xx1, 2, function(x){x/sum(x,na.rm=T)}) * 1000000

colfunc <- colorRampPalette(c("blue", "white", "red"))

rpm2 <- rpm1[which(rownames(rpm1) %in% rownames(head(dge,30))),]

heatmap.2(as.matrix(rpm2),margin=c(8, 22),cexRow=0.85,trace="none",
    cexCol=0.9,col=colfunc(20),scale="row")

SessionInfo

sessionInfo()
## R version 4.2.1 (2022-06-23)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.1 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] RColorBrewer_1.1-3          kableExtra_1.3.4           
##  [3] biomaRt_2.52.0              dplyr_1.0.9                
##  [5] mitch_1.8.0                 gplots_3.1.3               
##  [7] DESeq2_1.36.0               SummarizedExperiment_1.26.1
##  [9] Biobase_2.56.0              MatrixGenerics_1.8.0       
## [11] matrixStats_0.62.0          GenomicRanges_1.48.0       
## [13] GenomeInfoDb_1.32.2         IRanges_2.30.0             
## [15] S4Vectors_0.34.0            BiocGenerics_0.42.0        
## [17] reshape2_1.4.4             
## 
## loaded via a namespace (and not attached):
##   [1] colorspace_2.0-3       ellipsis_0.3.2         XVector_0.36.0        
##   [4] rstudioapi_0.13        bit64_4.0.5            AnnotationDbi_1.58.0  
##   [7] fansi_1.0.3            xml2_1.3.3             codetools_0.2-18      
##  [10] splines_4.2.1          cachem_1.0.6           geneplotter_1.74.0    
##  [13] knitr_1.39             jsonlite_1.8.0         annotate_1.74.0       
##  [16] dbplyr_2.2.1           png_0.1-7              shiny_1.7.1           
##  [19] compiler_4.2.1         httr_1.4.3             assertthat_0.2.1      
##  [22] Matrix_1.4-1           fastmap_1.1.0          cli_3.3.0             
##  [25] later_1.3.0            htmltools_0.5.2        prettyunits_1.1.1     
##  [28] tools_4.2.1            gtable_0.3.0           glue_1.6.2            
##  [31] GenomeInfoDbData_1.2.8 rappdirs_0.3.3         Rcpp_1.0.8.3          
##  [34] jquerylib_0.1.4        vctrs_0.4.1            Biostrings_2.64.0     
##  [37] svglite_2.1.0          xfun_0.31              stringr_1.4.0         
##  [40] rvest_1.0.2            mime_0.12              lifecycle_1.0.1       
##  [43] gtools_3.9.2.2         XML_3.99-0.10          zlibbioc_1.42.0       
##  [46] MASS_7.3-58            scales_1.2.0           hms_1.1.1             
##  [49] promises_1.2.0.1       parallel_4.2.1         yaml_2.3.5            
##  [52] curl_4.3.2             memoise_2.0.1          gridExtra_2.3         
##  [55] ggplot2_3.3.6          sass_0.4.1             reshape_0.8.9         
##  [58] stringi_1.7.6          RSQLite_2.2.14         highr_0.9             
##  [61] genefilter_1.78.0      caTools_1.18.2         filelock_1.0.2        
##  [64] BiocParallel_1.30.3    echarts4r_0.4.4        systemfonts_1.0.4     
##  [67] rlang_1.0.3            pkgconfig_2.0.3        bitops_1.0-7          
##  [70] evaluate_0.15          lattice_0.20-45        purrr_0.3.4           
##  [73] htmlwidgets_1.5.4      bit_4.0.4              tidyselect_1.1.2      
##  [76] GGally_2.1.2           plyr_1.8.7             magrittr_2.0.3        
##  [79] R6_2.5.1               generics_0.1.2         DelayedArray_0.22.0   
##  [82] DBI_1.1.3              pillar_1.7.0           survival_3.4-0        
##  [85] KEGGREST_1.36.2        RCurl_1.98-1.7         tibble_3.1.7          
##  [88] crayon_1.5.1           KernSmooth_2.23-20     utf8_1.2.2            
##  [91] BiocFileCache_2.4.0    rmarkdown_2.14         progress_1.2.2        
##  [94] locfit_1.5-9.5         grid_4.2.1             blob_1.2.3            
##  [97] webshot_0.5.3          digest_0.6.29          xtable_1.8-4          
## [100] httpuv_1.6.5           munsell_0.5.0          viridisLite_0.4.0     
## [103] beeswarm_0.4.0         bslib_0.3.1