#loading and reading in the str file
variants <- readr::read_tsv("ERP_str_res/ERP011529_merge.outlier_locus.tsv", col_names = TRUE,show_col_types =FALSE,skip_empty_rows = TRUE)
#creating a new row showing the subset of repeats that took place only in all samples.
variants$length <- unlist(lapply(strsplit(variants$counts,","),length))
variants <- subset(variants, length >1)
#extracting the counts column and converting to numeric
df_counts <- do.call(rbind,strsplit(variants$counts,",", fixed=TRUE))
## Warning in (function (..., deparse.level = 1) : number of columns of result is
## not a multiple of vector length (arg 1)
df_counts <- apply(df_counts, MARGIN = 2, FUN = as.numeric)
#deciding on our rownames and colnames
rownames(df_counts) <- paste(variants$contig,variants$start,variants$end,variants$motif)
colnames(df_counts) <- c("g06","g07","g08")
#extracting the bed files for the repeats regions of this analysis for further investigation on IGV
str_rownames <- rownames(df_counts)
str_regions <- do.call(rbind,strsplit(str_rownames," "))
write.table(x=str_regions,file="ERP_Str_regions.bed",quote=FALSE,sep="\t", col.names = FALSE,row.names = FALSE)
#plotting the heatmap showing the repeats on each sample
colfunc <- colorRampPalette(c("blue", "red"))
library(gplots)
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
heatmap.2(df_counts, trace = "none", scale = "none",
dendrogram='none', Rowv=FALSE, Colv=FALSE, col=colfunc(25),
margins = c(5,20), cexRow=.8, cexCol=.9, main="STRs")
#creating a function to decide the coefficient variation on each row of variants$counts for the samples.
cv <- function(x) {
mymean <- mean(x)
mysd <- sd(x)
cv <- mysd/mymean
signif(cv,3)
}
#plotting a graph showing the coefficient variation on each row of variants$counts for the samples
my_cv <- apply(df_counts, MARGIN = 1, FUN = cv)
my_cv[order(my_cv)]
## 2 52069871 52070006 AACCATGGGCTCACAGGAGG
## 0.463
## MU069435.1 10347 10704 AAAG
## 0.473
## 1 72220797 72221185 AC
## 0.482
## 15 6901897 6901971 AC
## 0.591
par(mar =c(5,15,3,1))
barplot(my_cv[order(my_cv)], horiz = TRUE,las =1,xlab = "CV")
sessionInfo()
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.3 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
##
## locale:
## [1] LC_CTYPE=en_AU.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_AU.UTF-8 LC_COLLATE=en_AU.UTF-8
## [5] LC_MONETARY=en_AU.UTF-8 LC_MESSAGES=en_AU.UTF-8
## [7] LC_PAPER=en_AU.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] gplots_3.1.1
##
## loaded via a namespace (and not attached):
## [1] highr_0.9 pillar_1.6.4 bslib_0.3.1 compiler_4.1.2
## [5] jquerylib_0.1.4 bitops_1.0-7 tools_4.1.2 digest_0.6.29
## [9] bit_4.0.4 jsonlite_1.7.2 evaluate_0.14 lifecycle_1.0.1
## [13] tibble_3.1.6 pkgconfig_2.0.3 rlang_0.4.12 yaml_2.2.1
## [17] parallel_4.1.2 xfun_0.29 fastmap_1.1.0 stringr_1.4.0
## [21] knitr_1.37 vctrs_0.3.8 sass_0.4.0 gtools_3.9.2
## [25] caTools_1.18.2 hms_1.1.1 bit64_4.0.5 tidyselect_1.1.1
## [29] glue_1.6.0 R6_2.5.1 fansi_1.0.0 vroom_1.5.7
## [33] rmarkdown_2.11 readr_2.1.1 tzdb_0.2.0 purrr_0.3.4
## [37] magrittr_2.0.1 ellipsis_0.3.2 htmltools_0.5.2 KernSmooth_2.23-20
## [41] utf8_1.2.2 stringi_1.7.6 crayon_1.4.2