Loading and reading the tsv file
#loading and reading in the str file
variants <- readr::read_tsv("SRP_str_res/SRP199233_merge.outlier_locus.tsv", col_names = TRUE,show_col_types =FALSE,skip_empty_rows = TRUE)
#creating a new row showing the subset of repeats that took place only in all samples.
variants$length <- unlist(lapply(strsplit(variants$counts,","),length))
variants <- subset(variants, length >5)
#extracting the counts column and converting to numeric
df_counts <- do.call(rbind,strsplit(variants$counts,",", fixed=TRUE))
df_counts <- apply(df_counts, MARGIN = 2, FUN = as.numeric)
Plotting heat_map
#deciding on our rownames and colnames
rownames(df_counts) <- paste(variants$contig,variants$start,variants$end,variants$motif)
colnames(df_counts) <- c("g275","g276","g277","g278","g279","g280")
#extracting the bed files for the repeats regions of this analysis for further investigation on IGV
str_rownames <- rownames(df_counts)
str_regions <- do.call(rbind,strsplit(str_rownames," "))
write.table(x=str_regions,file="SRP_Str_regions.bed",quote=FALSE,sep="\t", col.names = FALSE,row.names = FALSE)
#plotting the heatmap showing the repeats on each sample
colfunc <- colorRampPalette(c("blue", "red"))
library(gplots)
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
heatmap.2(df_counts, trace = "none", scale = "none",
dendrogram='none', Rowv=FALSE, Colv=FALSE, col=colfunc(25),
margins = c(5,20), cexRow=.8, cexCol=.9, main="STRs")
Analyzing the CV for column of interest
#creating a function to decide the coefficient variation on each row of variants$counts for the samples.
cv <- function(x) {
mymean <- mean(x)
mysd <- sd(x)
cv <- mysd/mymean
signif(cv,3)
}
#plotting a graph showing the coefficient variation on each row of variants$counts for the samples
my_cv <- apply(df_counts, MARGIN = 1, FUN = cv)
my_cv[order(my_cv)]
## 4 137374441 137375247 AC
## 0.329
## JH584304.1 3626 4105 AGAGAGGGAGGCAGGGAGGC
## 0.412
## 13 100609002 100609501 AGGAGGC
## 0.441
## 4 20220256 20220627 AATGACGTGATGGC
## 0.472
## 11 36197808 36198096 AC
## 0.537
## 19 23528713 23528895 AGAGGC
## 0.558
## 16 50507522 50507910 AGAGGC
## 0.564
## 13 83042079 83042371 ACACAG
## 0.567
## 1 91877057 91877377 AGAGGC
## 0.594
## 1 135904396 135904634 AGAGGC
## 0.614
## 6 90518723 90518812 AGAGGC
## 0.654
## 7 28520755 28521197 AGAGGC
## 0.667
## 10 130594858 130594889 AACCCT
## 0.677
## 8 112532104 112532467 AGAGGC
## 0.708
## 12 24768510 24768721 AGAGGC
## 0.761
## 3 92490426 92490877 AGAGGC
## 0.851
## 11 88750091 88750372 AGAGGC
## 0.920
## 1 118319071 118319354 AGCAGGC
## 0.946
par(mar =c(5,15,3,1))
barplot(my_cv[order(my_cv)], horiz = TRUE,las =1,xlab = "CV")