#loading and reading in the str file
variants <- readr::read_tsv("SRP_str_res/SRP199233_merge.outlier_locus.tsv", col_names = TRUE,show_col_types =FALSE,skip_empty_rows = TRUE)
#creating a new row showing the subset of repeats that took place only in all samples.
variants$length <- unlist(lapply(strsplit(variants$counts,","),length))
variants <- subset(variants, length >5)
#extracting the counts column and converting to numeric
df_counts <- do.call(rbind,strsplit(variants$counts,",", fixed=TRUE))
df_counts <- apply(df_counts, MARGIN = 2, FUN = as.numeric)
#deciding on our rownames and colnames
rownames(df_counts) <- paste(variants$contig,variants$start,variants$end,variants$motif)
colnames(df_counts) <- c("g275","g276","g277","g278","g279","g280")
#extracting the bed files for the repeats regions of this analysis for further investigation on IGV
str_rownames <- rownames(df_counts)
str_regions <- do.call(rbind,strsplit(str_rownames," "))
write.table(x=str_regions,file="SRP_Str_regions.bed",quote=FALSE,sep="\t", col.names = FALSE,row.names = FALSE)
#plotting the heatmap showing the repeats on each sample
colfunc <- colorRampPalette(c("blue", "red"))
library(gplots)
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
heatmap.2(df_counts, trace = "none", scale = "none",
dendrogram='none', Rowv=FALSE, Colv=FALSE, col=colfunc(25),
margins = c(5,20), cexRow=.8, cexCol=.9, main="STRs")
#creating a function to decide the coefficient variation on each row of variants$counts for the samples.
cv <- function(x) {
mymean <- mean(x)
mysd <- sd(x)
cv <- mysd/mymean
signif(cv,3)
}
#plotting a graph showing the coefficient variation on each row of variants$counts for the samples
my_cv <- apply(df_counts, MARGIN = 1, FUN = cv)
my_cv[order(my_cv)]
## 4 137374441 137375247 AC
## 0.329
## JH584304.1 3626 4105 AGAGAGGGAGGCAGGGAGGC
## 0.412
## 13 100609002 100609501 AGGAGGC
## 0.441
## 4 20220256 20220627 AATGACGTGATGGC
## 0.472
## 11 36197808 36198096 AC
## 0.537
## 19 23528713 23528895 AGAGGC
## 0.558
## 16 50507522 50507910 AGAGGC
## 0.564
## 13 83042079 83042371 ACACAG
## 0.567
## 1 91877057 91877377 AGAGGC
## 0.594
## 1 135904396 135904634 AGAGGC
## 0.614
## 6 90518723 90518812 AGAGGC
## 0.654
## 7 28520755 28521197 AGAGGC
## 0.667
## 10 130594858 130594889 AACCCT
## 0.677
## 8 112532104 112532467 AGAGGC
## 0.708
## 12 24768510 24768721 AGAGGC
## 0.761
## 3 92490426 92490877 AGAGGC
## 0.851
## 11 88750091 88750372 AGAGGC
## 0.920
## 1 118319071 118319354 AGCAGGC
## 0.946
par(mar =c(5,15,3,1))
barplot(my_cv[order(my_cv)], horiz = TRUE,las =1,xlab = "CV")
sessionInfo()
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.3 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
##
## locale:
## [1] LC_CTYPE=en_AU.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_AU.UTF-8 LC_COLLATE=en_AU.UTF-8
## [5] LC_MONETARY=en_AU.UTF-8 LC_MESSAGES=en_AU.UTF-8
## [7] LC_PAPER=en_AU.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] gplots_3.1.1
##
## loaded via a namespace (and not attached):
## [1] highr_0.9 pillar_1.6.4 bslib_0.3.1 compiler_4.1.2
## [5] jquerylib_0.1.4 bitops_1.0-7 tools_4.1.2 digest_0.6.29
## [9] bit_4.0.4 jsonlite_1.7.2 evaluate_0.14 lifecycle_1.0.1
## [13] tibble_3.1.6 pkgconfig_2.0.3 rlang_0.4.12 yaml_2.2.1
## [17] parallel_4.1.2 xfun_0.29 fastmap_1.1.0 stringr_1.4.0
## [21] knitr_1.37 vctrs_0.3.8 sass_0.4.0 gtools_3.9.2
## [25] caTools_1.18.2 hms_1.1.1 bit64_4.0.5 tidyselect_1.1.1
## [29] glue_1.6.0 R6_2.5.1 fansi_1.0.0 vroom_1.5.7
## [33] rmarkdown_2.11 readr_2.1.1 tzdb_0.2.0 purrr_0.3.4
## [37] magrittr_2.0.1 ellipsis_0.3.2 htmltools_0.5.2 KernSmooth_2.23-20
## [41] utf8_1.2.2 stringi_1.7.6 crayon_1.4.2