This code is available at https://github.com/aaronsk7/guppy-methylation

This script analyses fragment insert size by creating a violin plot. This chart illustrates the distribution of insert sizes as well as the median insert size for each sample. In 150 bp paired-end sequencing an insert size of >300 bp is optimal. Each sample has a median of either close to or >300 bp in length and therefore these results indicate cost efficiency and high yield in our profiling.

getwd()
## [1] "/mnt/data/aaron/projects/guppy-methylation"
library(vioplot)
## Loading required package: sm
## Warning in fun(libname, pkgname): couldn't connect to display ":0"
## Package 'sm', version 2.2-5.6: type help(sm) for summary information
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
myfilelist <- list.files("insertlenMD", full.names = TRUE, pattern = "insertlen.txt")

myread <- function(myfile){
  x <- as.numeric(readLines(myfile))
  x <- head(x,2000000)
  x <- x[which(x<2000)]
  x <- x[which(x>0)]
  x
}

x <- lapply(myfilelist,myread)
mysamplenames <- sapply(strsplit(myfilelist,"/"),"[[",2)
mysamplenames <- sapply(strsplit(mysamplenames,"\\."),"[[",1)
mysamplenames
##  [1] "Clear2F-R1" "Clear2F-R2" "Clear2F-R3" "Foundation" "Green3F-R1"
##  [6] "Green3F-R2" "Green3F-R3" "Lilac4F-R1" "Lilac4F-R2" "Lilac4F-R3"
names(x) <- mysamplenames
str(x)
## List of 10
##  $ Clear2F-R1: num [1:1823714] 392 269 378 272 186 345 463 275 157 433 ...
##  $ Clear2F-R2: num [1:1809311] 285 367 309 207 225 183 351 406 417 298 ...
##  $ Clear2F-R3: num [1:1818925] 240 304 369 255 450 240 305 240 411 203 ...
##  $ Foundation: num [1:1795642] 388 285 283 479 574 445 277 279 757 394 ...
##  $ Green3F-R1: num [1:1800843] 523 384 549 623 270 109 156 515 303 264 ...
##  $ Green3F-R2: num [1:1811170] 349 374 584 367 311 278 323 265 333 471 ...
##  $ Green3F-R3: num [1:1802823] 143 275 266 367 421 442 149 229 320 381 ...
##  $ Lilac4F-R1: num [1:1796626] 289 430 448 355 446 469 598 391 721 397 ...
##  $ Lilac4F-R2: num [1:1810358] 380 678 319 618 151 392 186 411 425 534 ...
##  $ Lilac4F-R3: num [1:1803593] 604 480 497 268 364 529 301 385 555 492 ...
par(mar=c(5,7,4,2))
vioplot(x,main = "Insert length (bp)" , horizontal=TRUE,  side="right" , las=1 , ylim=c(0,1000))
## Warning: weights overwritten by binning
## Warning: weights overwritten by binning
## Warning: weights overwritten by binning
## Warning: weights overwritten by binning
## Warning: weights overwritten by binning
## Warning: weights overwritten by binning
## Warning: weights overwritten by binning
## Warning: weights overwritten by binning
## Warning: weights overwritten by binning
## Warning: weights overwritten by binning

Import data sets

Foundation.bam.vcf_meth_average <- read.delim("~/projects/guppy-methylation/methaverageMD/Foundation.bam.vcf_meth_average.tsv",stringsAsFactors = FALSE)
Clear2F.R1.bam.vcf_meth_average <- read.delim("~/projects/guppy-methylation/methaverageMD/Clear2F-R1.bam.vcf_meth_average.tsv", stringsAsFactors = FALSE)
Clear2F.R2.bam.vcf_meth_average <- read.delim("~/projects/guppy-methylation/methaverageMD/Clear2F-R2.bam.vcf_meth_average.tsv", stringsAsFactors = FALSE)
Clear2F.R3.bam.vcf_meth_average <- read.delim("~/projects/guppy-methylation/methaverageMD/Clear2F-R3.bam.vcf_meth_average.tsv", stringsAsFactors = FALSE)
Green3F.R1.bam.vcf_meth_average <- read.delim("~/projects/guppy-methylation/methaverageMD/Green3F-R1.bam.vcf_meth_average.tsv",stringsAsFactors = FALSE)
Green3F.R2.bam.vcf_meth_average <- read.delim("~/projects/guppy-methylation/methaverageMD/Green3F-R2.bam.vcf_meth_average.tsv", stringsAsFactors = FALSE)
Green3F.R3.bam.vcf_meth_average <- read.delim("~/projects/guppy-methylation/methaverageMD/Green3F-R3.bam.vcf_meth_average.tsv", stringsAsFactors = FALSE)
Lilac4F.R1.bam.vcf_meth_average <- read.delim("~/projects/guppy-methylation/methaverageMD/Lilac4F-R1.bam.vcf_meth_average.tsv",stringsAsFactors = FALSE)
Lilac4F.R2.bam.vcf_meth_average <- read.delim("~/projects/guppy-methylation/methaverageMD/Lilac4F-R2.bam.vcf_meth_average.tsv",stringsAsFactors = FALSE)
Lilac4F.R3.bam.vcf_meth_average <- read.delim("~/projects/guppy-methylation/methaverageMD/Lilac4F-R3.bam.vcf_meth_average.tsv",stringsAsFactors = FALSE)
meth_average <- merge(Foundation.bam.vcf_meth_average, Clear2F.R1.bam.vcf_meth_average, all.x = TRUE, all.y = TRUE)
meth_average <- merge(meth_average, Clear2F.R2.bam.vcf_meth_average, all.x = TRUE, all.y = TRUE)
meth_average <- merge(meth_average, Clear2F.R3.bam.vcf_meth_average, all.x = TRUE, all.y = TRUE)
meth_average <- merge(meth_average, Green3F.R1.bam.vcf_meth_average, all.x = TRUE, all.y = TRUE)
meth_average <- merge(meth_average, Green3F.R2.bam.vcf_meth_average, all.x = TRUE, all.y = TRUE)
meth_average <- merge(meth_average, Green3F.R3.bam.vcf_meth_average, all.x = TRUE, all.y = TRUE)
meth_average <- merge(meth_average, Lilac4F.R1.bam.vcf_meth_average, all.x = TRUE, all.y = TRUE)
meth_average <- merge(meth_average, Lilac4F.R2.bam.vcf_meth_average, all.x = TRUE, all.y = TRUE)
meth_average <- merge(meth_average, Lilac4F.R3.bam.vcf_meth_average, all.x = TRUE, all.y = TRUE)
readlen <- read.table("~/projects/guppy-methylation/readlen.txt", quote="\"", comment.char="", stringsAsFactors = FALSE)
General.statistics <- read.delim("~/projects/guppy-methylation/General.statistics.txt", comment.char="#",stringsAsFactors = FALSE)
FastQCcounts <- read.delim("~/projects/guppy-methylation/fastqc_sequence_counts_plot MainQC.txt", stringsAsFactors = FALSE)

This script analyses the chromosomal CpG methylation profiling by creating a bar plot which displays the percentage of CpG methylation present in chromosome 1 of each sample. We expect consistent methylation between the samples and ~80% methylation when using zebrafish as our comparative species. These results are in line with our expectations and therefore indicate quality methylation profiling.

#Isolate chromosome 1 methylation average
chr.meth <- meth_average[grep("LG1$",meth_average$chrm),]
head(chr.meth)
##               sample chrm     CGn     CGb    CHGn   CHGb    CHHn   CHHb     CHn
## 2629  Clear2F-R1.bam  LG1 1111762 73.228% 2755100 0.505% 8042141 0.530% 8042141
## 5297  Clear2F-R2.bam  LG1 1120250 73.436% 2762232 0.506% 8070112 0.530% 8070112
## 7962  Clear2F-R3.bam  LG1 1120606 72.910% 2768290 0.499% 8085967 0.524% 8085967
## 10640 Foundation.bam  LG1 1128289 75.428% 2777525 0.485% 8119362 0.500% 8119362
## 13310 Green3F-R1.bam  LG1 1124674 72.852% 2757859 0.516% 8060404 0.533% 8060404
## 15962 Green3F-R2.bam  LG1 1119682 73.299% 2761440 0.524% 8068501 0.547% 8068501
##          CHb
## 2629  0.524%
## 5297  0.524%
## 7962  0.517%
## 10640 0.496%
## 13310 0.529%
## 15962 0.541%
#Isolate CpG percentage from chromosome 1 methylation average and remove percent sign
chr.meth.cpg <- as.numeric(gsub("%","",chr.meth[,4]))
head(chr.meth.cpg)
## [1] 73.228 73.436 72.910 75.428 72.852 73.299
#Create sample names
mysamplenames <- sapply(strsplit(chr.meth$sample,"\\."),"[[",1)
mysamplenames <- gsub(".bam","",chr.meth[,1])
mysamplenames <- gsub("2F-","",mysamplenames)
mysamplenames <- gsub("3F-","",mysamplenames)
mysamplenames <- gsub("4F-","",mysamplenames)
mysamplenames
##  [1] "ClearR1"    "ClearR2"    "ClearR3"    "Foundation" "GreenR1"   
##  [6] "GreenR2"    "GreenR3"    "LilacR1"    "LilacR2"    "LilacR3"
#Attach sample names to percent values
names(chr.meth.cpg) <- mysamplenames
chr.meth.cpg
##    ClearR1    ClearR2    ClearR3 Foundation    GreenR1    GreenR2    GreenR3 
##     73.228     73.436     72.910     75.428     72.852     73.299     71.968 
##    LilacR1    LilacR2    LilacR3 
##     73.316     72.229     73.114
#Create bar chart
par(mar=c(5,6,4,2))
barplot(chr.meth.cpg, main="% Chr1 CpG Methylation", horiz=TRUE, las=1, xlim = c(60,80))

This script creates a bar plot which displays the percentage of CpG methylation present in the mitochondrial chromosome of each sample. The mitochondrial chromosome contains little to no methylation. Each sample had consistently low mtDNA CpG methylation (<0.3%) and is therefore evidence of quality methylation profiling.

mt.meth <- meth_average[grep("MT$",meth_average$chrm),]
mt.meth
##               sample chrm CGn    CGb CHGn   CHGb CHHn   CHHb  CHn    CHb
## 2652  Clear2F-R1.bam   MT 789 0.206% 1006 0.187% 4915 0.235% 4915 0.227%
## 5320  Clear2F-R2.bam   MT 791 0.200% 1007 0.188% 4912 0.215% 4912 0.211%
## 7985  Clear2F-R3.bam   MT 788 0.265% 1009 0.219% 4916 0.282% 4916 0.271%
## 10663 Foundation.bam   MT 791 0.245% 1008 0.213% 4913 0.228% 4913 0.226%
## 13333 Green3F-R1.bam   MT 790 0.216% 1008 0.206% 4912 0.229% 4912 0.225%
## 15985 Green3F-R2.bam   MT 789 0.241% 1007 0.216% 4917 0.266% 4917 0.257%
## 18642 Green3F-R3.bam   MT 790 0.219% 1008 0.227% 4915 0.256% 4915 0.251%
## 21301 Lilac4F-R1.bam   MT 791 0.227% 1008 0.227% 4913 0.258% 4913 0.253%
## 23965 Lilac4F-R2.bam   MT 790 0.202% 1008 0.196% 4919 0.217% 4919 0.213%
## 26603 Lilac4F-R3.bam   MT 787 0.202% 1008 0.189% 4916 0.241% 4916 0.232%
mt.meth.cpg <- as.numeric(gsub("%","",mt.meth[,4]))
mt.meth.cpg
##  [1] 0.206 0.200 0.265 0.245 0.216 0.241 0.219 0.227 0.202 0.202
mysamplenames
##  [1] "ClearR1"    "ClearR2"    "ClearR3"    "Foundation" "GreenR1"   
##  [6] "GreenR2"    "GreenR3"    "LilacR1"    "LilacR2"    "LilacR3"
names(mt.meth.cpg) <- mysamplenames
mt.meth.cpg
##    ClearR1    ClearR2    ClearR3 Foundation    GreenR1    GreenR2    GreenR3 
##      0.206      0.200      0.265      0.245      0.216      0.241      0.219 
##    LilacR1    LilacR2    LilacR3 
##      0.227      0.202      0.202
par(mar=c(5,6,4,2))
barplot(mt.meth.cpg, main = "% Mitochondrial CpG Methylation", horiz = TRUE, las = 1,xlim = c(0,0.3))

This script creates a bar plot which displays the percentage of CpG methylation present in the lambda phage DNA spiked into each sample. Because the lambda phage contains no methylation and the same sample of lambda phage was spiked into each population DNA sample, we expect very low methylation (<5%) and consistency between the samples. Each sample had consistently low CpG methylation (<0.3%) and is therefore evidence of quality methylation profiling.

lp.meth <- meth_average[grep("lambda_phage$",meth_average$chrm),]
lp.meth
##               sample         chrm  CGn    CGb CHGn   CHGb  CHHn   CHHb   CHn
## 2628  Clear2F-R1.bam lambda_phage 6157 0.222% 6393 0.247% 11392 0.237% 11392
## 5296  Clear2F-R2.bam lambda_phage 6218 0.180% 6447 0.197% 11493 0.166% 11493
## 7961  Clear2F-R3.bam lambda_phage 6221 0.191% 6447 0.197% 11486 0.165% 11486
## 10639 Foundation.bam lambda_phage 6223 0.171% 6447 0.199% 11496 0.170% 11496
## 13309 Green3F-R1.bam lambda_phage 6221 0.258% 6449 0.290% 11497 0.228% 11497
## 15961 Green3F-R2.bam lambda_phage 6220 0.268% 6446 0.286% 11494 0.258% 11494
## 18618 Green3F-R3.bam lambda_phage 6222 0.253% 6449 0.258% 11493 0.228% 11493
## 21277 Lilac4F-R1.bam lambda_phage 6221 0.254% 6448 0.264% 11495 0.238% 11495
## 23941 Lilac4F-R2.bam lambda_phage 6222 0.225% 6447 0.235% 11495 0.235% 11495
## 26579 Lilac4F-R3.bam lambda_phage 6207 0.255% 6434 0.262% 11472 0.234% 11472
##          CHb
## 2628  0.240%
## 5296  0.177%
## 7961  0.177%
## 10639 0.180%
## 13309 0.250%
## 15961 0.268%
## 18618 0.239%
## 21277 0.248%
## 23941 0.235%
## 26579 0.244%
lp.meth.cpg <- as.numeric(gsub("%","",lp.meth[,4]))
names(lp.meth.cpg) <- mysamplenames
lp.meth.cpg
##    ClearR1    ClearR2    ClearR3 Foundation    GreenR1    GreenR2    GreenR3 
##      0.222      0.180      0.191      0.171      0.258      0.268      0.253 
##    LilacR1    LilacR2    LilacR3 
##      0.254      0.225      0.255
par(mar=c(5,6,4,2))
barplot(lp.meth.cpg, main = "% Lambda Phage CpG Methylation", horiz = TRUE, las = 1, xlim = c(0,0.3))

This script creates a bar plot which displays the percentage of CpG methylation present in the pUC19 plasmid spiked into each sample. Because the pUC19 plasmid is fully methylated and the same sample of the plasmid was spiked into each population DNA sample, we expect very high methylation (>95%) and consistency between the samples. Each sample had consistently high CpG methylation (>97%) and is therefore evidence of quality methylation profiling.

pUC19.meth <- meth_average[grep("pUC19$",meth_average$chrm),]
pUC19.meth.cpg <- as.numeric(gsub("%","",pUC19.meth[,4]))
names(pUC19.meth.cpg) <- mysamplenames
pUC19.meth.cpg
##    ClearR1    ClearR2    ClearR3 Foundation    GreenR1    GreenR2    GreenR3 
##     99.021     98.715     98.923     99.512     98.746     98.212     97.947 
##    LilacR1    LilacR2    LilacR3 
##     98.428     98.019     97.982
par(mar=c(5,6,4,2))
barplot(pUC19.meth.cpg, main = "% pUC19 CpG Methylation", horiz = TRUE, las = 1, xlim = c(90,100))

This script analyses the chromosomal CpH methylation by creating a bar plot which displays the percentage of CpH methylation present in chromosome 1 of each sample. Because methylation is rare at CpH sites we expect very low methylation in every sample and a consistent percentage of methylation among the samples. Each sample had consistently low chromosomal CpH methylation (<0.6%) and is therefore evidence of quality methylation profiling.

chr.meth.cph <- as.numeric(gsub("%","",chr.meth[,10]))
names(chr.meth.cph) <- mysamplenames
chr.meth.cph
##    ClearR1    ClearR2    ClearR3 Foundation    GreenR1    GreenR2    GreenR3 
##      0.524      0.524      0.517      0.496      0.529      0.541      0.527 
##    LilacR1    LilacR2    LilacR3 
##      0.539      0.499      0.517
par(mar=c(5,6,4,2))
barplot(chr.meth.cph, main = "% Chr1 CpH Methylation", horiz = TRUE, las = 1, xlim = c(0,0.6))

This script creates a bar plot which displays the percentage of CpH methylation present in the mitochondrial chromosome of each sample. The mitochondrial chromosome typically contains little to no methylation. Each sample had consistently low mtDNA CpH methylation (<0.3%) and is therefore evidence of quality methylation profiling.

mt.meth.cph <- as.numeric(gsub("%","",mt.meth[,10]))
names(mt.meth.cph) <- mysamplenames
mt.meth.cph
##    ClearR1    ClearR2    ClearR3 Foundation    GreenR1    GreenR2    GreenR3 
##      0.227      0.211      0.271      0.226      0.225      0.257      0.251 
##    LilacR1    LilacR2    LilacR3 
##      0.253      0.213      0.232
par(mar=c(5,6,4,2))
barplot(mt.meth.cph, main = "% mtDNA CpH Methylation", horiz = TRUE, las = 1,xlim = c(0,0.3))