Loading and reading the tsv file

#loading and reading in the str file
variants <- readr::read_tsv("ERP_str_res/ERP011529_merge.outlier_locus.tsv", col_names = TRUE,show_col_types =FALSE,skip_empty_rows = TRUE)

#creating a new row showing the subset of repeats that took place only in all samples. 
variants$length <- unlist(lapply(strsplit(variants$counts,","),length))
variants <- subset(variants, length >1)

#extracting the counts column and converting to numeric
df_counts <- do.call(rbind,strsplit(variants$counts,",", fixed=TRUE))

## Warning in (function (..., deparse.level = 1) : number of columns of result is
## not a multiple of vector length (arg 1)

df_counts <- apply(df_counts, MARGIN = 2, FUN = as.numeric)

Plotting heat_map

#deciding on our rownames and colnames
rownames(df_counts) <- paste(variants$contig,variants$start,variants$end,variants$motif)
colnames(df_counts) <- c("g06","g07","g08") 

#extracting the bed files for the repeats regions of this analysis for further investigation on IGV
str_rownames <- rownames(df_counts)
str_regions <- do.call(rbind,strsplit(str_rownames," "))
write.table(x=str_regions,file="ERP_Str_regions.bed",quote=FALSE,sep="\t", col.names = FALSE,row.names = FALSE)

#plotting the heatmap showing the repeats on each sample
colfunc <- colorRampPalette(c("blue", "red"))
library(gplots)

## 
## Attaching package: 'gplots'

## The following object is masked from 'package:stats':
## 
##     lowess

heatmap.2(df_counts, trace = "none", scale = "none", 
  dendrogram='none', Rowv=FALSE, Colv=FALSE, col=colfunc(25),
  margins = c(5,20), cexRow=.8, cexCol=.9,  main="STRs")

Analyzing the CV for column of interest

#creating a function to decide the coefficient variation on each row of variants$counts for the samples.
cv <- function(x) {
  mymean <- mean(x)
  mysd <- sd(x)
  cv <- mysd/mymean
  signif(cv,3)
}
#plotting a graph showing the coefficient variation on each row of variants$counts for the samples
my_cv <- apply(df_counts, MARGIN = 1, FUN = cv)
my_cv[order(my_cv)]

## 2 52069871 52070006 AACCATGGGCTCACAGGAGG 
##                                    0.463 
##              MU069435.1 10347 10704 AAAG 
##                                    0.473 
##                   1 72220797 72221185 AC 
##                                    0.482 
##                    15 6901897 6901971 AC 
##                                    0.591

par(mar =c(5,15,3,1))
barplot(my_cv[order(my_cv)], horiz = TRUE,las =1,xlab = "CV")

Session Information

sessionInfo()

## R version 4.1.2 (2021-11-01)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.3 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
## 
## locale:
##  [1] LC_CTYPE=en_AU.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_AU.UTF-8        LC_COLLATE=en_AU.UTF-8    
##  [5] LC_MONETARY=en_AU.UTF-8    LC_MESSAGES=en_AU.UTF-8   
##  [7] LC_PAPER=en_AU.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] gplots_3.1.1
## 
## loaded via a namespace (and not attached):
##  [1] highr_0.9          pillar_1.6.4       bslib_0.3.1        compiler_4.1.2    
##  [5] jquerylib_0.1.4    bitops_1.0-7       tools_4.1.2        digest_0.6.29     
##  [9] bit_4.0.4          jsonlite_1.7.2     evaluate_0.14      lifecycle_1.0.1   
## [13] tibble_3.1.6       pkgconfig_2.0.3    rlang_0.4.12       yaml_2.2.1        
## [17] parallel_4.1.2     xfun_0.29          fastmap_1.1.0      stringr_1.4.0     
## [21] knitr_1.37         vctrs_0.3.8        sass_0.4.0         gtools_3.9.2      
## [25] caTools_1.18.2     hms_1.1.1          bit64_4.0.5        tidyselect_1.1.1  
## [29] glue_1.6.0         R6_2.5.1           fansi_1.0.0        vroom_1.5.7       
## [33] rmarkdown_2.11     readr_2.1.1        tzdb_0.2.0         purrr_0.3.4       
## [37] magrittr_2.0.1     ellipsis_0.3.2     htmltools_0.5.2    KernSmooth_2.23-20
## [41] utf8_1.2.2         stringi_1.7.6      crayon_1.4.2

Inbred mouse short tandem repeats (STRs) Analysis on Study ERP011529

Kingsley Isaac Ogamanya & Mark Ziemann

2022-02-11

Loading and reading the tsv file

Plotting heat_map

Analyzing the CV for column of interest

Session Information