This code is available at https://github.com/aaronsk7/guppy-methylation
This script analyses fragment insert size by creating a violin plot. This chart illustrates the distribution of insert sizes as well as the median insert size for each sample. In 150 bp paired-end sequencing an insert size of >300 bp is optimal. Each sample has a median of either close to or >300 bp in length and therefore these results indicate cost efficiency and high yield in our profiling.
getwd()
## [1] "/mnt/data/aaron/projects/guppy-methylation"
library(vioplot)
## Loading required package: sm
## Warning in fun(libname, pkgname): couldn't connect to display ":0"
## Package 'sm', version 2.2-5.6: type help(sm) for summary information
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
myfilelist <- list.files("insertlenMD", full.names = TRUE, pattern = "insertlen.txt")
myread <- function(myfile){
x <- as.numeric(readLines(myfile))
x <- head(x,2000000)
x <- x[which(x<2000)]
x <- x[which(x>0)]
x
}
x <- lapply(myfilelist,myread)
mysamplenames <- sapply(strsplit(myfilelist,"/"),"[[",2)
mysamplenames <- sapply(strsplit(mysamplenames,"\\."),"[[",1)
mysamplenames
## [1] "Clear2F-R1" "Clear2F-R2" "Clear2F-R3" "Foundation" "Green3F-R1"
## [6] "Green3F-R2" "Green3F-R3" "Lilac4F-R1" "Lilac4F-R2" "Lilac4F-R3"
names(x) <- mysamplenames
str(x)
## List of 10
## $ Clear2F-R1: num [1:1823714] 392 269 378 272 186 345 463 275 157 433 ...
## $ Clear2F-R2: num [1:1809311] 285 367 309 207 225 183 351 406 417 298 ...
## $ Clear2F-R3: num [1:1818925] 240 304 369 255 450 240 305 240 411 203 ...
## $ Foundation: num [1:1795642] 388 285 283 479 574 445 277 279 757 394 ...
## $ Green3F-R1: num [1:1800843] 523 384 549 623 270 109 156 515 303 264 ...
## $ Green3F-R2: num [1:1811170] 349 374 584 367 311 278 323 265 333 471 ...
## $ Green3F-R3: num [1:1802823] 143 275 266 367 421 442 149 229 320 381 ...
## $ Lilac4F-R1: num [1:1796626] 289 430 448 355 446 469 598 391 721 397 ...
## $ Lilac4F-R2: num [1:1810358] 380 678 319 618 151 392 186 411 425 534 ...
## $ Lilac4F-R3: num [1:1803593] 604 480 497 268 364 529 301 385 555 492 ...
par(mar=c(5,7,4,2))
vioplot(x,main = "Insert length (bp)" , horizontal=TRUE, side="right" , las=1 , ylim=c(0,1000))
## Warning: weights overwritten by binning
## Warning: weights overwritten by binning
## Warning: weights overwritten by binning
## Warning: weights overwritten by binning
## Warning: weights overwritten by binning
## Warning: weights overwritten by binning
## Warning: weights overwritten by binning
## Warning: weights overwritten by binning
## Warning: weights overwritten by binning
## Warning: weights overwritten by binning
Foundation.bam.vcf_meth_average <- read.delim("~/projects/guppy-methylation/methaverageMD/Foundation.bam.vcf_meth_average.tsv",stringsAsFactors = FALSE)
Clear2F.R1.bam.vcf_meth_average <- read.delim("~/projects/guppy-methylation/methaverageMD/Clear2F-R1.bam.vcf_meth_average.tsv", stringsAsFactors = FALSE)
Clear2F.R2.bam.vcf_meth_average <- read.delim("~/projects/guppy-methylation/methaverageMD/Clear2F-R2.bam.vcf_meth_average.tsv", stringsAsFactors = FALSE)
Clear2F.R3.bam.vcf_meth_average <- read.delim("~/projects/guppy-methylation/methaverageMD/Clear2F-R3.bam.vcf_meth_average.tsv", stringsAsFactors = FALSE)
Green3F.R1.bam.vcf_meth_average <- read.delim("~/projects/guppy-methylation/methaverageMD/Green3F-R1.bam.vcf_meth_average.tsv",stringsAsFactors = FALSE)
Green3F.R2.bam.vcf_meth_average <- read.delim("~/projects/guppy-methylation/methaverageMD/Green3F-R2.bam.vcf_meth_average.tsv", stringsAsFactors = FALSE)
Green3F.R3.bam.vcf_meth_average <- read.delim("~/projects/guppy-methylation/methaverageMD/Green3F-R3.bam.vcf_meth_average.tsv", stringsAsFactors = FALSE)
Lilac4F.R1.bam.vcf_meth_average <- read.delim("~/projects/guppy-methylation/methaverageMD/Lilac4F-R1.bam.vcf_meth_average.tsv",stringsAsFactors = FALSE)
Lilac4F.R2.bam.vcf_meth_average <- read.delim("~/projects/guppy-methylation/methaverageMD/Lilac4F-R2.bam.vcf_meth_average.tsv",stringsAsFactors = FALSE)
Lilac4F.R3.bam.vcf_meth_average <- read.delim("~/projects/guppy-methylation/methaverageMD/Lilac4F-R3.bam.vcf_meth_average.tsv",stringsAsFactors = FALSE)
meth_average <- merge(Foundation.bam.vcf_meth_average, Clear2F.R1.bam.vcf_meth_average, all.x = TRUE, all.y = TRUE)
meth_average <- merge(meth_average, Clear2F.R2.bam.vcf_meth_average, all.x = TRUE, all.y = TRUE)
meth_average <- merge(meth_average, Clear2F.R3.bam.vcf_meth_average, all.x = TRUE, all.y = TRUE)
meth_average <- merge(meth_average, Green3F.R1.bam.vcf_meth_average, all.x = TRUE, all.y = TRUE)
meth_average <- merge(meth_average, Green3F.R2.bam.vcf_meth_average, all.x = TRUE, all.y = TRUE)
meth_average <- merge(meth_average, Green3F.R3.bam.vcf_meth_average, all.x = TRUE, all.y = TRUE)
meth_average <- merge(meth_average, Lilac4F.R1.bam.vcf_meth_average, all.x = TRUE, all.y = TRUE)
meth_average <- merge(meth_average, Lilac4F.R2.bam.vcf_meth_average, all.x = TRUE, all.y = TRUE)
meth_average <- merge(meth_average, Lilac4F.R3.bam.vcf_meth_average, all.x = TRUE, all.y = TRUE)
readlen <- read.table("~/projects/guppy-methylation/readlen.txt", quote="\"", comment.char="", stringsAsFactors = FALSE)
General.statistics <- read.delim("~/projects/guppy-methylation/General.statistics.txt", comment.char="#",stringsAsFactors = FALSE)
FastQCcounts <- read.delim("~/projects/guppy-methylation/fastqc_sequence_counts_plot MainQC.txt", stringsAsFactors = FALSE)
This script analyses the chromosomal CpG methylation profiling by creating a bar plot which displays the percentage of CpG methylation present in chromosome 1 of each sample. We expect consistent methylation between the samples and ~80% methylation when using zebrafish as our comparative species. These results are in line with our expectations and therefore indicate quality methylation profiling.
#Isolate chromosome 1 methylation average
chr.meth <- meth_average[grep("LG1$",meth_average$chrm),]
head(chr.meth)
## sample chrm CGn CGb CHGn CHGb CHHn CHHb CHn
## 2629 Clear2F-R1.bam LG1 1111762 73.228% 2755100 0.505% 8042141 0.530% 8042141
## 5297 Clear2F-R2.bam LG1 1120250 73.436% 2762232 0.506% 8070112 0.530% 8070112
## 7962 Clear2F-R3.bam LG1 1120606 72.910% 2768290 0.499% 8085967 0.524% 8085967
## 10640 Foundation.bam LG1 1128289 75.428% 2777525 0.485% 8119362 0.500% 8119362
## 13310 Green3F-R1.bam LG1 1124674 72.852% 2757859 0.516% 8060404 0.533% 8060404
## 15962 Green3F-R2.bam LG1 1119682 73.299% 2761440 0.524% 8068501 0.547% 8068501
## CHb
## 2629 0.524%
## 5297 0.524%
## 7962 0.517%
## 10640 0.496%
## 13310 0.529%
## 15962 0.541%
#Isolate CpG percentage from chromosome 1 methylation average and remove percent sign
chr.meth.cpg <- as.numeric(gsub("%","",chr.meth[,4]))
head(chr.meth.cpg)
## [1] 73.228 73.436 72.910 75.428 72.852 73.299
#Create sample names
mysamplenames <- sapply(strsplit(chr.meth$sample,"\\."),"[[",1)
mysamplenames <- gsub(".bam","",chr.meth[,1])
mysamplenames <- gsub("2F-","",mysamplenames)
mysamplenames <- gsub("3F-","",mysamplenames)
mysamplenames <- gsub("4F-","",mysamplenames)
mysamplenames
## [1] "ClearR1" "ClearR2" "ClearR3" "Foundation" "GreenR1"
## [6] "GreenR2" "GreenR3" "LilacR1" "LilacR2" "LilacR3"
#Attach sample names to percent values
names(chr.meth.cpg) <- mysamplenames
chr.meth.cpg
## ClearR1 ClearR2 ClearR3 Foundation GreenR1 GreenR2 GreenR3
## 73.228 73.436 72.910 75.428 72.852 73.299 71.968
## LilacR1 LilacR2 LilacR3
## 73.316 72.229 73.114
#Create bar chart
par(mar=c(5,6,4,2))
barplot(chr.meth.cpg, main="% Chr1 CpG Methylation", horiz=TRUE, las=1, xlim = c(60,80))
This script creates a bar plot which displays the percentage of CpG methylation present in the mitochondrial chromosome of each sample. The mitochondrial chromosome contains little to no methylation. Each sample had consistently low mtDNA CpG methylation (<0.3%) and is therefore evidence of quality methylation profiling.
mt.meth <- meth_average[grep("MT$",meth_average$chrm),]
mt.meth
## sample chrm CGn CGb CHGn CHGb CHHn CHHb CHn CHb
## 2652 Clear2F-R1.bam MT 789 0.206% 1006 0.187% 4915 0.235% 4915 0.227%
## 5320 Clear2F-R2.bam MT 791 0.200% 1007 0.188% 4912 0.215% 4912 0.211%
## 7985 Clear2F-R3.bam MT 788 0.265% 1009 0.219% 4916 0.282% 4916 0.271%
## 10663 Foundation.bam MT 791 0.245% 1008 0.213% 4913 0.228% 4913 0.226%
## 13333 Green3F-R1.bam MT 790 0.216% 1008 0.206% 4912 0.229% 4912 0.225%
## 15985 Green3F-R2.bam MT 789 0.241% 1007 0.216% 4917 0.266% 4917 0.257%
## 18642 Green3F-R3.bam MT 790 0.219% 1008 0.227% 4915 0.256% 4915 0.251%
## 21301 Lilac4F-R1.bam MT 791 0.227% 1008 0.227% 4913 0.258% 4913 0.253%
## 23965 Lilac4F-R2.bam MT 790 0.202% 1008 0.196% 4919 0.217% 4919 0.213%
## 26603 Lilac4F-R3.bam MT 787 0.202% 1008 0.189% 4916 0.241% 4916 0.232%
mt.meth.cpg <- as.numeric(gsub("%","",mt.meth[,4]))
mt.meth.cpg
## [1] 0.206 0.200 0.265 0.245 0.216 0.241 0.219 0.227 0.202 0.202
mysamplenames
## [1] "ClearR1" "ClearR2" "ClearR3" "Foundation" "GreenR1"
## [6] "GreenR2" "GreenR3" "LilacR1" "LilacR2" "LilacR3"
names(mt.meth.cpg) <- mysamplenames
mt.meth.cpg
## ClearR1 ClearR2 ClearR3 Foundation GreenR1 GreenR2 GreenR3
## 0.206 0.200 0.265 0.245 0.216 0.241 0.219
## LilacR1 LilacR2 LilacR3
## 0.227 0.202 0.202
par(mar=c(5,6,4,2))
barplot(mt.meth.cpg, main = "% Mitochondrial CpG Methylation", horiz = TRUE, las = 1,xlim = c(0,0.3))
This script creates a bar plot which displays the percentage of CpG methylation present in the lambda phage DNA spiked into each sample. Because the lambda phage contains no methylation and the same sample of lambda phage was spiked into each population DNA sample, we expect very low methylation (<5%) and consistency between the samples. Each sample had consistently low CpG methylation (<0.3%) and is therefore evidence of quality methylation profiling.
lp.meth <- meth_average[grep("lambda_phage$",meth_average$chrm),]
lp.meth
## sample chrm CGn CGb CHGn CHGb CHHn CHHb CHn
## 2628 Clear2F-R1.bam lambda_phage 6157 0.222% 6393 0.247% 11392 0.237% 11392
## 5296 Clear2F-R2.bam lambda_phage 6218 0.180% 6447 0.197% 11493 0.166% 11493
## 7961 Clear2F-R3.bam lambda_phage 6221 0.191% 6447 0.197% 11486 0.165% 11486
## 10639 Foundation.bam lambda_phage 6223 0.171% 6447 0.199% 11496 0.170% 11496
## 13309 Green3F-R1.bam lambda_phage 6221 0.258% 6449 0.290% 11497 0.228% 11497
## 15961 Green3F-R2.bam lambda_phage 6220 0.268% 6446 0.286% 11494 0.258% 11494
## 18618 Green3F-R3.bam lambda_phage 6222 0.253% 6449 0.258% 11493 0.228% 11493
## 21277 Lilac4F-R1.bam lambda_phage 6221 0.254% 6448 0.264% 11495 0.238% 11495
## 23941 Lilac4F-R2.bam lambda_phage 6222 0.225% 6447 0.235% 11495 0.235% 11495
## 26579 Lilac4F-R3.bam lambda_phage 6207 0.255% 6434 0.262% 11472 0.234% 11472
## CHb
## 2628 0.240%
## 5296 0.177%
## 7961 0.177%
## 10639 0.180%
## 13309 0.250%
## 15961 0.268%
## 18618 0.239%
## 21277 0.248%
## 23941 0.235%
## 26579 0.244%
lp.meth.cpg <- as.numeric(gsub("%","",lp.meth[,4]))
names(lp.meth.cpg) <- mysamplenames
lp.meth.cpg
## ClearR1 ClearR2 ClearR3 Foundation GreenR1 GreenR2 GreenR3
## 0.222 0.180 0.191 0.171 0.258 0.268 0.253
## LilacR1 LilacR2 LilacR3
## 0.254 0.225 0.255
par(mar=c(5,6,4,2))
barplot(lp.meth.cpg, main = "% Lambda Phage CpG Methylation", horiz = TRUE, las = 1, xlim = c(0,0.3))
This script creates a bar plot which displays the percentage of CpG methylation present in the pUC19 plasmid spiked into each sample. Because the pUC19 plasmid is fully methylated and the same sample of the plasmid was spiked into each population DNA sample, we expect very high methylation (>95%) and consistency between the samples. Each sample had consistently high CpG methylation (>97%) and is therefore evidence of quality methylation profiling.
pUC19.meth <- meth_average[grep("pUC19$",meth_average$chrm),]
pUC19.meth.cpg <- as.numeric(gsub("%","",pUC19.meth[,4]))
names(pUC19.meth.cpg) <- mysamplenames
pUC19.meth.cpg
## ClearR1 ClearR2 ClearR3 Foundation GreenR1 GreenR2 GreenR3
## 99.021 98.715 98.923 99.512 98.746 98.212 97.947
## LilacR1 LilacR2 LilacR3
## 98.428 98.019 97.982
par(mar=c(5,6,4,2))
barplot(pUC19.meth.cpg, main = "% pUC19 CpG Methylation", horiz = TRUE, las = 1, xlim = c(90,100))
This script analyses the chromosomal CpH methylation by creating a bar plot which displays the percentage of CpH methylation present in chromosome 1 of each sample. Because methylation is rare at CpH sites we expect very low methylation in every sample and a consistent percentage of methylation among the samples. Each sample had consistently low chromosomal CpH methylation (<0.6%) and is therefore evidence of quality methylation profiling.
chr.meth.cph <- as.numeric(gsub("%","",chr.meth[,10]))
names(chr.meth.cph) <- mysamplenames
chr.meth.cph
## ClearR1 ClearR2 ClearR3 Foundation GreenR1 GreenR2 GreenR3
## 0.524 0.524 0.517 0.496 0.529 0.541 0.527
## LilacR1 LilacR2 LilacR3
## 0.539 0.499 0.517
par(mar=c(5,6,4,2))
barplot(chr.meth.cph, main = "% Chr1 CpH Methylation", horiz = TRUE, las = 1, xlim = c(0,0.6))
This script creates a bar plot which displays the percentage of CpH methylation present in the mitochondrial chromosome of each sample. The mitochondrial chromosome typically contains little to no methylation. Each sample had consistently low mtDNA CpH methylation (<0.3%) and is therefore evidence of quality methylation profiling.
mt.meth.cph <- as.numeric(gsub("%","",mt.meth[,10]))
names(mt.meth.cph) <- mysamplenames
mt.meth.cph
## ClearR1 ClearR2 ClearR3 Foundation GreenR1 GreenR2 GreenR3
## 0.227 0.211 0.271 0.226 0.225 0.257 0.251
## LilacR1 LilacR2 LilacR3
## 0.253 0.213 0.232
par(mar=c(5,6,4,2))
barplot(mt.meth.cph, main = "% mtDNA CpH Methylation", horiz = TRUE, las = 1,xlim = c(0,0.3))