Loading and reading the tsv file
#loading and reading in the str file
variants <- readr::read_tsv("ERP_str_res/ERP011529_merge.outlier_locus.tsv", col_names = TRUE,show_col_types =FALSE,skip_empty_rows = TRUE)
#creating a new row showing the subset of repeats that took place only in all samples.
variants$length <- unlist(lapply(strsplit(variants$counts,","),length))
variants <- subset(variants, length >1)
#extracting the counts column and converting to numeric
df_counts <- do.call(rbind,strsplit(variants$counts,",", fixed=TRUE))
## Warning in (function (..., deparse.level = 1) : number of columns of result is
## not a multiple of vector length (arg 1)
df_counts <- apply(df_counts, MARGIN = 2, FUN = as.numeric)
Plotting heat_map
#deciding on our rownames and colnames
rownames(df_counts) <- paste(variants$contig,variants$start,variants$end,variants$motif)
colnames(df_counts) <- c("g06","g07","g08")
#extracting the bed files for the repeats regions of this analysis for further investigation on IGV
str_rownames <- rownames(df_counts)
str_regions <- do.call(rbind,strsplit(str_rownames," "))
write.table(x=str_regions,file="ERP_Str_regions.bed",quote=FALSE,sep="\t", col.names = FALSE,row.names = FALSE)
#plotting the heatmap showing the repeats on each sample
colfunc <- colorRampPalette(c("blue", "red"))
library(gplots)
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
heatmap.2(df_counts, trace = "none", scale = "none",
dendrogram='none', Rowv=FALSE, Colv=FALSE, col=colfunc(25),
margins = c(5,20), cexRow=.8, cexCol=.9, main="STRs")
Analyzing the CV for column of interest
#creating a function to decide the coefficient variation on each row of variants$counts for the samples.
cv <- function(x) {
mymean <- mean(x)
mysd <- sd(x)
cv <- mysd/mymean
signif(cv,3)
}
#plotting a graph showing the coefficient variation on each row of variants$counts for the samples
my_cv <- apply(df_counts, MARGIN = 1, FUN = cv)
my_cv[order(my_cv)]
## 2 52069871 52070006 AACCATGGGCTCACAGGAGG
## 0.463
## MU069435.1 10347 10704 AAAG
## 0.473
## 1 72220797 72221185 AC
## 0.482
## 15 6901897 6901971 AC
## 0.591
par(mar =c(5,15,3,1))
barplot(my_cv[order(my_cv)], horiz = TRUE,las =1,xlab = "CV")