Loading and reading the tsv file

#loading and reading in the str file
variants <- readr::read_tsv("SRP_str_res/SRP199233_merge.outlier_locus.tsv", col_names = TRUE,show_col_types =FALSE,skip_empty_rows = TRUE)

#creating a new row showing the subset of repeats that took place only in all samples. 
variants$length <- unlist(lapply(strsplit(variants$counts,","),length))
variants <- subset(variants, length >5)

#extracting the counts column and converting to numeric
df_counts <- do.call(rbind,strsplit(variants$counts,",", fixed=TRUE))
df_counts <- apply(df_counts, MARGIN = 2, FUN = as.numeric)

Plotting heat_map

#deciding on our rownames and colnames
rownames(df_counts) <- paste(variants$contig,variants$start,variants$end,variants$motif)
colnames(df_counts) <- c("g275","g276","g277","g278","g279","g280") 

#extracting the bed files for the repeats regions of this analysis for further investigation on IGV
str_rownames <- rownames(df_counts)
str_regions <- do.call(rbind,strsplit(str_rownames," "))
write.table(x=str_regions,file="SRP_Str_regions.bed",quote=FALSE,sep="\t", col.names = FALSE,row.names = FALSE)

#plotting the heatmap showing the repeats on each sample
colfunc <- colorRampPalette(c("blue", "red"))
library(gplots)

## 
## Attaching package: 'gplots'

## The following object is masked from 'package:stats':
## 
##     lowess

heatmap.2(df_counts, trace = "none", scale = "none", 
  dendrogram='none', Rowv=FALSE, Colv=FALSE, col=colfunc(25),
  margins = c(5,20), cexRow=.8, cexCol=.9,  main="STRs")

Analyzing the CV for column of interest

#creating a function to decide the coefficient variation on each row of variants$counts for the samples.
cv <- function(x) {
  mymean <- mean(x)
  mysd <- sd(x)
  cv <- mysd/mymean
  signif(cv,3)
}
#plotting a graph showing the coefficient variation on each row of variants$counts for the samples
my_cv <- apply(df_counts, MARGIN = 1, FUN = cv)
my_cv[order(my_cv)]

##                  4 137374441 137375247 AC 
##                                     0.329 
## JH584304.1 3626 4105 AGAGAGGGAGGCAGGGAGGC 
##                                     0.412 
##            13 100609002 100609501 AGGAGGC 
##                                     0.441 
##        4 20220256 20220627 AATGACGTGATGGC 
##                                     0.472 
##                   11 36197808 36198096 AC 
##                                     0.537 
##               19 23528713 23528895 AGAGGC 
##                                     0.558 
##               16 50507522 50507910 AGAGGC 
##                                     0.564 
##               13 83042079 83042371 ACACAG 
##                                     0.567 
##                1 91877057 91877377 AGAGGC 
##                                     0.594 
##              1 135904396 135904634 AGAGGC 
##                                     0.614 
##                6 90518723 90518812 AGAGGC 
##                                     0.654 
##                7 28520755 28521197 AGAGGC 
##                                     0.667 
##             10 130594858 130594889 AACCCT 
##                                     0.677 
##              8 112532104 112532467 AGAGGC 
##                                     0.708 
##               12 24768510 24768721 AGAGGC 
##                                     0.761 
##                3 92490426 92490877 AGAGGC 
##                                     0.851 
##               11 88750091 88750372 AGAGGC 
##                                     0.920 
##             1 118319071 118319354 AGCAGGC 
##                                     0.946

par(mar =c(5,15,3,1))
barplot(my_cv[order(my_cv)], horiz = TRUE,las =1,xlab = "CV")

Session Information

sessionInfo()

## R version 4.1.2 (2021-11-01)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.3 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
## 
## locale:
##  [1] LC_CTYPE=en_AU.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_AU.UTF-8        LC_COLLATE=en_AU.UTF-8    
##  [5] LC_MONETARY=en_AU.UTF-8    LC_MESSAGES=en_AU.UTF-8   
##  [7] LC_PAPER=en_AU.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] gplots_3.1.1
## 
## loaded via a namespace (and not attached):
##  [1] highr_0.9          pillar_1.6.4       bslib_0.3.1        compiler_4.1.2    
##  [5] jquerylib_0.1.4    bitops_1.0-7       tools_4.1.2        digest_0.6.29     
##  [9] bit_4.0.4          jsonlite_1.7.2     evaluate_0.14      lifecycle_1.0.1   
## [13] tibble_3.1.6       pkgconfig_2.0.3    rlang_0.4.12       yaml_2.2.1        
## [17] parallel_4.1.2     xfun_0.29          fastmap_1.1.0      stringr_1.4.0     
## [21] knitr_1.37         vctrs_0.3.8        sass_0.4.0         gtools_3.9.2      
## [25] caTools_1.18.2     hms_1.1.1          bit64_4.0.5        tidyselect_1.1.1  
## [29] glue_1.6.0         R6_2.5.1           fansi_1.0.0        vroom_1.5.7       
## [33] rmarkdown_2.11     readr_2.1.1        tzdb_0.2.0         purrr_0.3.4       
## [37] magrittr_2.0.1     ellipsis_0.3.2     htmltools_0.5.2    KernSmooth_2.23-20
## [41] utf8_1.2.2         stringi_1.7.6      crayon_1.4.2

Inbred mouse short tandem repeats (STRs) Analysis on Study SRP199233

Kingsley Isaac Ogamanya & Mark Ziemann

2022-02-11

Loading and reading the tsv file

Plotting heat_map

Analyzing the CV for column of interest

Session Information