This code is available at https://github.com/aaronsk7/guppy-methylation
This script analyses fragment insert size by creating a violin plot. This chart illustrates the distribution of insert sizes as well as the median insert size for each sample. In 150 bp paired-end sequencing an insert size of >300 bp is optimal. Each sample has a median of either close to or >300 bp in length and therefore these results indicate cost efficiency and high yield in our profiling.
getwd()
## [1] "/mnt/data/aaron/projects/guppy-methylation"
library(vioplot)
## Loading required package: sm
## Warning in fun(libname, pkgname): couldn't connect to display ":0"
## Package 'sm', version 2.2-5.6: type help(sm) for summary information
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
myfilelist <- list.files("insertlenMD", full.names = TRUE, pattern = "insertlen.txt")
myread <- function(myfile){
x <- as.numeric(readLines(myfile))
x <- head(x,2000000)
x <- x[which(x<2000)]
x <- x[which(x>0)]
x
}
x <- lapply(myfilelist,myread)
mysamplenames <- sapply(strsplit(myfilelist,"/"),"[[",2)
mysamplenames <- sapply(strsplit(mysamplenames,"\\."),"[[",1)
mysamplenames
## [1] "Clear2F-R1" "Clear2F-R2" "Clear2F-R3" "Foundation" "Green3F-R1"
## [6] "Green3F-R2" "Green3F-R3" "Lilac4F-R1" "Lilac4F-R2" "Lilac4F-R3"
names(x) <- mysamplenames
str(x)
## List of 10
## $ Clear2F-R1: num [1:1823714] 392 269 378 272 186 345 463 275 157 433 ...
## $ Clear2F-R2: num [1:1809311] 285 367 309 207 225 183 351 406 417 298 ...
## $ Clear2F-R3: num [1:1818925] 240 304 369 255 450 240 305 240 411 203 ...
## $ Foundation: num [1:1795642] 388 285 283 479 574 445 277 279 757 394 ...
## $ Green3F-R1: num [1:1800843] 523 384 549 623 270 109 156 515 303 264 ...
## $ Green3F-R2: num [1:1811170] 349 374 584 367 311 278 323 265 333 471 ...
## $ Green3F-R3: num [1:1802823] 143 275 266 367 421 442 149 229 320 381 ...
## $ Lilac4F-R1: num [1:1796626] 289 430 448 355 446 469 598 391 721 397 ...
## $ Lilac4F-R2: num [1:1810358] 380 678 319 618 151 392 186 411 425 534 ...
## $ Lilac4F-R3: num [1:1803593] 604 480 497 268 364 529 301 385 555 492 ...
par(mar=c(5,7,4,2))
vioplot(x,main = "Insert length (bp)" , horizontal=TRUE, side="right" , las=1 , ylim=c(0,1000))
## Warning: weights overwritten by binning
## Warning: weights overwritten by binning
## Warning: weights overwritten by binning
## Warning: weights overwritten by binning
## Warning: weights overwritten by binning
## Warning: weights overwritten by binning
## Warning: weights overwritten by binning
## Warning: weights overwritten by binning
## Warning: weights overwritten by binning
## Warning: weights overwritten by binning
Foundation.bam.vcf_meth_average <- read.delim("~/projects/guppy-methylation/methaverageMD/Foundation.bam.vcf_meth_average.tsv",stringsAsFactors = FALSE)
Clear2F.R1.bam.vcf_meth_average <- read.delim("~/projects/guppy-methylation/methaverageMD/Clear2F-R1.bam.vcf_meth_average.tsv", stringsAsFactors = FALSE)
Clear2F.R2.bam.vcf_meth_average <- read.delim("~/projects/guppy-methylation/methaverageMD/Clear2F-R2.bam.vcf_meth_average.tsv", stringsAsFactors = FALSE)
Clear2F.R3.bam.vcf_meth_average <- read.delim("~/projects/guppy-methylation/methaverageMD/Clear2F-R3.bam.vcf_meth_average.tsv", stringsAsFactors = FALSE)
Green3F.R1.bam.vcf_meth_average <- read.delim("~/projects/guppy-methylation/methaverageMD/Green3F-R1.bam.vcf_meth_average.tsv",stringsAsFactors = FALSE)
Green3F.R2.bam.vcf_meth_average <- read.delim("~/projects/guppy-methylation/methaverageMD/Green3F-R2.bam.vcf_meth_average.tsv", stringsAsFactors = FALSE)
Green3F.R3.bam.vcf_meth_average <- read.delim("~/projects/guppy-methylation/methaverageMD/Green3F-R3.bam.vcf_meth_average.tsv", stringsAsFactors = FALSE)
Lilac4F.R1.bam.vcf_meth_average <- read.delim("~/projects/guppy-methylation/methaverageMD/Lilac4F-R1.bam.vcf_meth_average.tsv",stringsAsFactors = FALSE)
Lilac4F.R2.bam.vcf_meth_average <- read.delim("~/projects/guppy-methylation/methaverageMD/Lilac4F-R2.bam.vcf_meth_average.tsv",stringsAsFactors = FALSE)
Lilac4F.R3.bam.vcf_meth_average <- read.delim("~/projects/guppy-methylation/methaverageMD/Lilac4F-R3.bam.vcf_meth_average.tsv",stringsAsFactors = FALSE)
meth_average <- merge(Foundation.bam.vcf_meth_average, Clear2F.R1.bam.vcf_meth_average, all.x = TRUE, all.y = TRUE)
meth_average <- merge(meth_average, Clear2F.R2.bam.vcf_meth_average, all.x = TRUE, all.y = TRUE)
meth_average <- merge(meth_average, Clear2F.R3.bam.vcf_meth_average, all.x = TRUE, all.y = TRUE)
meth_average <- merge(meth_average, Green3F.R1.bam.vcf_meth_average, all.x = TRUE, all.y = TRUE)
meth_average <- merge(meth_average, Green3F.R2.bam.vcf_meth_average, all.x = TRUE, all.y = TRUE)
meth_average <- merge(meth_average, Green3F.R3.bam.vcf_meth_average, all.x = TRUE, all.y = TRUE)
meth_average <- merge(meth_average, Lilac4F.R1.bam.vcf_meth_average, all.x = TRUE, all.y = TRUE)
meth_average <- merge(meth_average, Lilac4F.R2.bam.vcf_meth_average, all.x = TRUE, all.y = TRUE)
meth_average <- merge(meth_average, Lilac4F.R3.bam.vcf_meth_average, all.x = TRUE, all.y = TRUE)
readlen <- read.table("~/projects/guppy-methylation/readlen.txt", quote="\"", comment.char="", stringsAsFactors = FALSE)
General.statistics <- read.delim("~/projects/guppy-methylation/General.statistics.txt", comment.char="#",stringsAsFactors = FALSE)
FastQCcounts <- read.delim("~/projects/guppy-methylation/fastqc_sequence_counts_plot MainQC.txt", stringsAsFactors = FALSE)
This script analyses the chromosomal CpG methylation profiling by creating a bar plot which displays the percentage of CpG methylation present in chromosome 1 of each sample. We expect consistent methylation between the samples and ~80% methylation when using zebrafish as our comparative species. These results are in line with our expectations and therefore indicate quality methylation profiling.
#Isolate chromosome 1 methylation average
chr.meth <- meth_average[grep("LG1$",meth_average$chrm),]
head(chr.meth)
## sample chrm CGn CGb CHGn CHGb CHHn CHHb CHn
## 2629 Clear2F-R1.bam LG1 1111762 73.228% 2755100 0.505% 8042141 0.530% 8042141
## 5297 Clear2F-R2.bam LG1 1120250 73.436% 2762232 0.506% 8070112 0.530% 8070112
## 7962 Clear2F-R3.bam LG1 1120606 72.910% 2768290 0.499% 8085967 0.524% 8085967
## 10640 Foundation.bam LG1 1128289 75.428% 2777525 0.485% 8119362 0.500% 8119362
## 13310 Green3F-R1.bam LG1 1124674 72.852% 2757859 0.516% 8060404 0.533% 8060404
## 15962 Green3F-R2.bam LG1 1119682 73.299% 2761440 0.524% 8068501 0.547% 8068501
## CHb
## 2629 0.524%
## 5297 0.524%
## 7962 0.517%
## 10640 0.496%
## 13310 0.529%
## 15962 0.541%
#Isolate CpG percentage from chromosome 1 methylation average and remove percent sign
chr.meth.cpg <- as.numeric(gsub("%","",chr.meth[,4]))
head(chr.meth.cpg)
## [1] 73.228 73.436 72.910 75.428 72.852 73.299
#Create sample names
mysamplenames <- sapply(strsplit(chr.meth$sample,"\\."),"[[",1)
mysamplenames <- gsub(".bam","",chr.meth[,1])
mysamplenames <- gsub("2F-","",mysamplenames)
mysamplenames <- gsub("3F-","",mysamplenames)
mysamplenames <- gsub("4F-","",mysamplenames)
mysamplenames
## [1] "ClearR1" "ClearR2" "ClearR3" "Foundation" "GreenR1"
## [6] "GreenR2" "GreenR3" "LilacR1" "LilacR2" "LilacR3"
#Attach sample names to percent values
names(chr.meth.cpg) <- mysamplenames
chr.meth.cpg
## ClearR1 ClearR2 ClearR3 Foundation GreenR1 GreenR2 GreenR3
## 73.228 73.436 72.910 75.428 72.852 73.299 71.968
## LilacR1 LilacR2 LilacR3
## 73.316 72.229 73.114
#Create bar chart
par(mar=c(5,6,4,2))
barplot(chr.meth.cpg, main="% Chr1 CpG Methylation", horiz=TRUE, las=1, xlim = c(60,80))
This script creates a bar plot which displays the percentage of CpG methylation present in the mitochondrial chromosome of each sample. The mitochondrial chromosome contains little to no methylation. Each sample had consistently low mtDNA CpG methylation (<0.3%) and is therefore evidence of quality methylation profiling.
mt.meth <- meth_average[grep("MT$",meth_average$chrm),]
mt.meth
## sample chrm CGn CGb CHGn CHGb CHHn CHHb CHn CHb
## 2652 Clear2F-R1.bam MT 789 0.206% 1006 0.187% 4915 0.235% 4915 0.227%
## 5320 Clear2F-R2.bam MT 791 0.200% 1007 0.188% 4912 0.215% 4912 0.211%
## 7985 Clear2F-R3.bam MT 788 0.265% 1009 0.219% 4916 0.282% 4916 0.271%
## 10663 Foundation.bam MT 791 0.245% 1008 0.213% 4913 0.228% 4913 0.226%
## 13333 Green3F-R1.bam MT 790 0.216% 1008 0.206% 4912 0.229% 4912 0.225%
## 15985 Green3F-R2.bam MT 789 0.241% 1007 0.216% 4917 0.266% 4917 0.257%
## 18642 Green3F-R3.bam MT 790 0.219% 1008 0.227% 4915 0.256% 4915 0.251%
## 21301 Lilac4F-R1.bam MT 791 0.227% 1008 0.227% 4913 0.258% 4913 0.253%
## 23965 Lilac4F-R2.bam MT 790 0.202% 1008 0.196% 4919 0.217% 4919 0.213%
## 26603 Lilac4F-R3.bam MT 787 0.202% 1008 0.189% 4916 0.241% 4916 0.232%
mt.meth.cpg <- as.numeric(gsub("%","",mt.meth[,4]))
mt.meth.cpg
## [1] 0.206 0.200 0.265 0.245 0.216 0.241 0.219 0.227 0.202 0.202
mysamplenames
## [1] "ClearR1" "ClearR2" "ClearR3" "Foundation" "GreenR1"
## [6] "GreenR2" "GreenR3" "LilacR1" "LilacR2" "LilacR3"
names(mt.meth.cpg) <- mysamplenames
mt.meth.cpg
## ClearR1 ClearR2 ClearR3 Foundation GreenR1 GreenR2 GreenR3
## 0.206 0.200 0.265 0.245 0.216 0.241 0.219
## LilacR1 LilacR2 LilacR3
## 0.227 0.202 0.202
par(mar=c(5,6,4,2))
barplot(mt.meth.cpg, main = "% Mitochondrial CpG Methylation", horiz = TRUE, las = 1,xlim = c(0,0.3))
This script creates a bar plot which displays the percentage of CpG methylation present in the lambda phage DNA spiked into each sample. Because the lambda phage contains no methylation and the same sample of lambda phage was spiked into each population DNA sample, we expect very low methylation (<5%) and consistency between the samples. Each sample had consistently low CpG methylation (<0.3%) and is therefore evidence of quality methylation profiling.
lp.meth <- meth_average[grep("lambda_phage$",meth_average$chrm),]
lp.meth
## sample chrm CGn CGb CHGn CHGb CHHn CHHb CHn
## 2628 Clear2F-R1.bam lambda_phage 6157 0.222% 6393 0.247% 11392 0.237% 11392
## 5296 Clear2F-R2.bam lambda_phage 6218 0.180% 6447 0.197% 11493 0.166% 11493
## 7961 Clear2F-R3.bam lambda_phage 6221 0.191% 6447 0.197% 11486 0.165% 11486
## 10639 Foundation.bam lambda_phage 6223 0.171% 6447 0.199% 11496 0.170% 11496
## 13309 Green3F-R1.bam lambda_phage 6221 0.258% 6449 0.290% 11497 0.228% 11497
## 15961 Green3F-R2.bam lambda_phage 6220 0.268% 6446 0.286% 11494 0.258% 11494
## 18618 Green3F-R3.bam lambda_phage 6222 0.253% 6449 0.258% 11493 0.228% 11493
## 21277 Lilac4F-R1.bam lambda_phage 6221 0.254% 6448 0.264% 11495 0.238% 11495
## 23941 Lilac4F-R2.bam lambda_phage 6222 0.225% 6447 0.235% 11495 0.235% 11495
## 26579 Lilac4F-R3.bam lambda_phage 6207 0.255% 6434 0.262% 11472 0.234% 11472
## CHb
## 2628 0.240%
## 5296 0.177%
## 7961 0.177%
## 10639 0.180%
## 13309 0.250%
## 15961 0.268%
## 18618 0.239%
## 21277 0.248%
## 23941 0.235%
## 26579 0.244%
lp.meth.cpg <- as.numeric(gsub("%","",lp.meth[,4]))
names(lp.meth.cpg) <- mysamplenames
lp.meth.cpg
## ClearR1 ClearR2 ClearR3 Foundation GreenR1 GreenR2 GreenR3
## 0.222 0.180 0.191 0.171 0.258 0.268 0.253
## LilacR1 LilacR2 LilacR3
## 0.254 0.225 0.255
par(mar=c(5,6,4,2))
barplot(lp.meth.cpg, main = "% Lambda Phage CpG Methylation", horiz = TRUE, las = 1, xlim = c(0,0.3))
This script creates a bar plot which displays the percentage of CpG methylation present in the pUC19 plasmid spiked into each sample. Because the pUC19 plasmid is fully methylated and the same sample of the plasmid was spiked into each population DNA sample, we expect very high methylation (>95%) and consistency between the samples. Each sample had consistently high CpG methylation (>97%) and is therefore evidence of quality methylation profiling.
pUC19.meth <- meth_average[grep("pUC19$",meth_average$chrm),]
pUC19.meth.cpg <- as.numeric(gsub("%","",pUC19.meth[,4]))
names(pUC19.meth.cpg) <- mysamplenames
pUC19.meth.cpg
## ClearR1 ClearR2 ClearR3 Foundation GreenR1 GreenR2 GreenR3
## 99.021 98.715 98.923 99.512 98.746 98.212 97.947
## LilacR1 LilacR2 LilacR3
## 98.428 98.019 97.982
par(mar=c(5,6,4,2))
barplot(pUC19.meth.cpg, main = "% pUC19 CpG Methylation", horiz = TRUE, las = 1, xlim = c(90,100))
This script analyses the chromosomal CpH methylation by creating a bar plot which displays the percentage of CpH methylation present in chromosome 1 of each sample. Because methylation is rare at CpH sites we expect very low methylation in every sample and a consistent percentage of methylation among the samples. Each sample had consistently low chromosomal CpH methylation (<0.6%) and is therefore evidence of quality methylation profiling.
chr.meth.cph <- as.numeric(gsub("%","",chr.meth[,10]))
names(chr.meth.cph) <- mysamplenames
chr.meth.cph
## ClearR1 ClearR2 ClearR3 Foundation GreenR1 GreenR2 GreenR3
## 0.524 0.524 0.517 0.496 0.529 0.541 0.527
## LilacR1 LilacR2 LilacR3
## 0.539 0.499 0.517
par(mar=c(5,6,4,2))
barplot(chr.meth.cph, main = "% Chr1 CpH Methylation", horiz = TRUE, las = 1, xlim = c(0,0.6))
This script creates a bar plot which displays the percentage of CpH methylation present in the mitochondrial chromosome of each sample. The mitochondrial chromosome typically contains little to no methylation. Each sample had consistently low mtDNA CpH methylation (<0.3%) and is therefore evidence of quality methylation profiling.
mt.meth.cph <- as.numeric(gsub("%","",mt.meth[,10]))
names(mt.meth.cph) <- mysamplenames
mt.meth.cph
## ClearR1 ClearR2 ClearR3 Foundation GreenR1 GreenR2 GreenR3
## 0.227 0.211 0.271 0.226 0.225 0.257 0.251
## LilacR1 LilacR2 LilacR3
## 0.253 0.213 0.232
par(mar=c(5,6,4,2))
barplot(mt.meth.cph, main = "% mtDNA CpH Methylation", horiz = TRUE, las = 1,xlim = c(0,0.3))
This script creates a bar plot which displays the percentage of CpH methylation present in the lambda phage DNA spiked into each sample. Because methylation is rare at CpH sites and the same sample of lambda phage was spiked into each population DNA sample, we expect very low methylation (<5%) and consistency between the samples. Each sample had consistently low CpH methylation (<0.3%) and is therefore evidence of quality methylation profiling.
lp.meth.cph <- as.numeric(gsub("%","",lp.meth[,10]))
names(lp.meth.cph) <- mysamplenames
lp.meth.cph
## ClearR1 ClearR2 ClearR3 Foundation GreenR1 GreenR2 GreenR3
## 0.240 0.177 0.177 0.180 0.250 0.268 0.239
## LilacR1 LilacR2 LilacR3
## 0.248 0.235 0.244
par(mar=c(5,6,4,2))
barplot(lp.meth.cph, main = "% Lambda Phage CpH Methylation", horiz = TRUE, las = 1,xlim = c(0,0.3))
This script creates a bar plot which displays the percentage of CpH methylation present in the pUC19 plasmid spiked into each sample. Because methylation is rare at CpH sites and the same sample of lambda phage was spiked into each population DNA sample, we expect very low methylation (<5%) and consistency between the samples. Each sample had consistently low CpH methylation (<1.4%) with minimal variation between the samples. The analysis of the pUC19 CpH methylation is therefore evidence of quality methylation profiling.
pUC19.meth.cph <- as.numeric(gsub("%","",pUC19.meth[,10]))
names(pUC19.meth.cph) <- mysamplenames
pUC19.meth.cph
## ClearR1 ClearR2 ClearR3 Foundation GreenR1 GreenR2 GreenR3
## 0.567 1.289 0.608 0.730 1.124 0.765 0.832
## LilacR1 LilacR2 LilacR3
## 0.761 0.685 0.943
par(mar=c(5,6,4,2))
barplot(pUC19.meth.cph, main = "% pUC19 Plasmid CpH Methylation", horiz = TRUE, las = 1,xlim = c(0,1.4))
This script creates a bar plot which displays the percentage of GC base pairs present in each of our population DNA samples. Due to the nature of methylome sequencing, only the cytosines which are methylated are those that are left after the enzymatic methyl-sequencing has been performed. We therefore expect significantly reduced GC content when compared to a standard guppy genome sequencing (39.3% GC). Each sample had 21% methylation, which is in line with our expectation of <39.3%.
GSR1 <- General.statistics[grep("R1_001",General.statistics$Sample.Name),]
GSR1GC <- as.numeric(gsub("%","",GSR1[,5]))
names(GSR1GC) <- mysamplenames
GSR1GC
## ClearR1 ClearR2 ClearR3 Foundation GreenR1 GreenR2 GreenR3
## 21 21 21 21 21 21 21
## LilacR1 LilacR2 LilacR3
## 21 21 21
par(mar=c(5,6,4,2))
barplot(GSR1GC, main = "% GC Content", horiz=TRUE, las=1, xlim = c(0,25))
This script creates a bar plot which displays the total reads present in each of our population DNA samples. As these are the results of whole genome enzymatic methyl-sequencing, we expect a very large quantity of reads. One article states 800,000,000 reads (trimmed and mapped) that are 101 bp in length are required for accurate coverage of the human genome when performing bisulfite sequencing. Although we are working with the guppy genome instead, this number of reads can serve as a rough benchmark. Considering our reads are 150 bp in length and the guppy genome is significantly smaller than the human genome (3,096,649,726 bp compared to 731,622,281 bp), our results are relatively close to this rough benchmark and indicate quality coverage of the guppy genome.
#Create additional columns to work out percentage of duplicate reads and percentage of unique reads
FastQCcounts$Total.Reads <- (FastQCcounts$Unique.Reads + FastQCcounts$Duplicate.Reads)
FQCR1 <- FastQCcounts[grep("R1_001",FastQCcounts$Category),]
mysamplenamesR <- sapply(strsplit(FastQCcounts$Category,"\\."),"[[",1)
mysamplenamesR <- gsub("_001","",FastQCcounts[,1])
mysamplenamesR <- gsub("2F-","",mysamplenamesR)
mysamplenamesR <- gsub("3F-","",mysamplenamesR)
mysamplenamesR <- gsub("4F-","",mysamplenamesR)
mysamplenamesR <- gsub("_"," ",mysamplenamesR)
mysamplenamesR
## [1] "ClearR1 R1" "ClearR1 R2" "ClearR2 R1" "ClearR2 R2"
## [5] "ClearR3 R1" "ClearR3 R2" "Foundation R1" "Foundation R2"
## [9] "GreenR1 R1" "GreenR1 R2" "GreenR2 R1" "GreenR2 R2"
## [13] "GreenR3 R1" "GreenR3 R2" "LilacR1 R1" "LilacR1 R2"
## [17] "LilacR2 R1" "LilacR2 R2" "LilacR3 R1" "LilacR3 R2"
mysamplenames
## [1] "ClearR1" "ClearR2" "ClearR3" "Foundation" "GreenR1"
## [6] "GreenR2" "GreenR3" "LilacR1" "LilacR2" "LilacR3"
TR <- FQCR1$Total.Reads
names(TR) <- mysamplenames
TR
## ClearR1 ClearR2 ClearR3 Foundation GreenR1 GreenR2 GreenR3
## 79115494 70710131 72919139 151547956 84386620 71583417 94325467
## LilacR1 LilacR2 LilacR3
## 82688556 71797715 66880152
par(mar=c(5,6,4,2))
barplot(TR, main = "Total Reads", las = 1,horiz=TRUE, xlim = c(40000000,160000000))
This script creates a bar plot which displays the percentage of duplicate reads present in each of our population DNA samples. When analysing the percentage of duplicate reads >40% duplicate reads could be considered high, <40% and >20% could be considered medium and <20% could be considered low. Our samples had a considerably low percentage of duplicate reads. This provides evidence of a high yield in our methylation profiling.
Dup <- FQCR1$Duplicate.Reads
PDup <- (Dup / TR * 100)
PDup
## ClearR1 ClearR2 ClearR3 Foundation GreenR1 GreenR2 GreenR3
## 11.28753 10.35292 10.87860 13.24146 11.41534 10.71456 12.22378
## LilacR1 LilacR2 LilacR3
## 10.85678 10.55580 10.39011
par(mar=c(5,6,4,2))
barplot(PDup, main = "% Duplicate Reads", horiz = TRUE, las = 1, xlim = c(0,15))
This script creates a bar plot which displays the percentage of unique reads present in each of our population DNA samples. The results and expectation of this metric are the inverse of percentage of duplicate reads.
Uni <- FQCR1$Unique.Reads
PUni <- (Uni / TR * 100)
PUni
## ClearR1 ClearR2 ClearR3 Foundation GreenR1 GreenR2 GreenR3
## 88.71247 89.64708 89.12140 86.75854 88.58466 89.28544 87.77622
## LilacR1 LilacR2 LilacR3
## 89.14322 89.44420 89.60989
par(mar=c(5,6,4,2))
barplot(PUni, main = "% Unique Reads", horiz = TRUE, las = 1, xlim = c(86,90))
This script creates a bar plot which displays the mean read length (in bp) of each population DNA sample. We expect the average read length to be ~150 bp. This read length optimises the sequencing process by balancing reading the most base pairs in each fragment while not creating poor quality reads. This is therefore the most cost efficient read length size. Each of our samples average read length is very close to this number. These results indicate cost efficiency and lead to high yield of our methylation profiling.
par(mar = c(5,10,5,5))
RL <- gsub("R1_001.fastq-trimmed-","",readlen[,1])
RL <- gsub(".fastq","",RL)
RL
## [1] "Clear2F-R1_pair1" "Clear2F-R1_pair2" "Clear2F-R2_pair1" "Clear2F-R2_pair2"
## [5] "Clear2F-R3_pair1" "Clear2F-R3_pair2" "Foundation_pair1" "Foundation_pair2"
## [9] "Green3F-R1_pair1" "Green3F-R1_pair2" "Green3F-R2_pair1" "Green3F-R2_pair2"
## [13] "Green3F-R3_pair1" "Green3F-R3_pair2" "Lilac4F-R1_pair1" "Lilac4F-R1_pair2"
## [17] "Lilac4F-R2_pair1" "Lilac4F-R2_pair2" "Lilac4F-R3_pair1" "Lilac4F-R3_pair2"
barplot(readlen$V2, main="Mean Read Length (bp)", names.arg = RL, las=1, horiz=TRUE, xlim = c(148,152))
This script creates a wasted bases analysis and displays the analysis in the form of a table. The table first shows the amount of bp present immediately after sequencing (basecountSTART) and then goes onto show how many bp are removed by each step in the data processing pipeline (trimming and mapping/removing duplicates) as well as how many bases are lost due to reads overlapping. The final column shows how many usable bp are left.
library(RColorBrewer)
library(kableExtra)
coul <- brewer.pal(4, "Pastel1")
#import data
basecount <- read.table("~/projects/guppy-methylation/basecount.txt", quote="\"", comment.char="", stringsAsFactors = FALSE)
#Add read 1 and 2 together in basecount start
basecountSTARTR1 <- basecount[grep("R1_001.fastq.gz",basecount$V2),]
basecountSTARTR2 <- basecount[grep("R2_001.fastq.gz",basecount$V2),]
basecountSTART <- basecountSTARTR1$V3 + basecountSTARTR2$V3
mysamplenames
## [1] "ClearR1" "ClearR2" "ClearR3" "Foundation" "GreenR1"
## [6] "GreenR2" "GreenR3" "LilacR1" "LilacR2" "LilacR3"
names(basecountSTART) <- mysamplenames
basecountSTART
## ClearR1 ClearR2 ClearR3 Foundation GreenR1 GreenR2
## 23892879188 21354459562 22021579978 45767482712 25484759240 21618191934
## GreenR3 LilacR1 LilacR2 LilacR3
## 28486291034 24971943912 21682909930 20197805904
#Add read 1 and 2 together in trimmed reads
basecountAFTERSKEWERR1 <- basecount[grep("pair1",basecount$V2),]
basecountAFTERSKEWERR2 <- basecount[grep("pair2",basecount$V2),]
basecountAFTERSKEWER <- basecountAFTERSKEWERR1$V3 + basecountAFTERSKEWERR2$V3
names(basecountAFTERSKEWER) <- mysamplenames
basecountAFTERSKEWER
## ClearR1 ClearR2 ClearR3 Foundation GreenR1 GreenR2
## 23726688110 21219905068 21864693234 45501760335 25331908492 21492313083
## GreenR3 LilacR1 LilacR2 LilacR3
## 28306309237 24820993518 21547129588 20068129840
#Create "after overlapping reads removed values" by subtracting OVERLAPPING from AFTERMAPPED reads
AFTERMAPPED <- basecount[grep("AFTERMAPPED",basecount$V1),]
OVERLAPPING <- basecount[grep("OVERLAPPING",basecount$V1),]
basecountMAPPED <- AFTERMAPPED$V3
basecountMAPPED
## [1] 8408181829 17501584680 17890535200 37443087526 20760281390 17691297992
## [7] 23076685196 20474902945 17747008807 12891026471
names(basecountMAPPED) <- mysamplenames
basecountMAPPED
## ClearR1 ClearR2 ClearR3 Foundation GreenR1 GreenR2
## 8408181829 17501584680 17890535200 37443087526 20760281390 17691297992
## GreenR3 LilacR1 LilacR2 LilacR3
## 23076685196 20474902945 17747008807 12891026471
basecountOVERLAPPING <- OVERLAPPING$V3
UseableBP <- AFTERMAPPED$V3 - OVERLAPPING$V3
names(UseableBP) <- mysamplenames
UseableBP
## ClearR1 ClearR2 ClearR3 Foundation GreenR1 GreenR2
## 7726370969 16547937507 16292997753 36444018617 20076583515 16813219232
## GreenR3 LilacR1 LilacR2 LilacR3
## 22310598896 19946472787 16739256850 12483562301
basecountTRIMMED <- basecountSTART - basecountAFTERSKEWER
basecountUNMAPPED <- basecountAFTERSKEWER - basecountMAPPED
#Create a data frame containing all vectors created above
DF <- data.frame(basecountSTART,basecountTRIMMED,basecountAFTERSKEWER,basecountUNMAPPED,basecountMAPPED,basecountOVERLAPPING,UseableBP)
DF %>%
kbl(caption = "Wasted Bases Analysis") %>%
kable_classic(full_width = F, html_font = "Cambria")
basecountSTART | basecountTRIMMED | basecountAFTERSKEWER | basecountUNMAPPED | basecountMAPPED | basecountOVERLAPPING | UseableBP | |
---|---|---|---|---|---|---|---|
ClearR1 | 23892879188 | 166191078 | 23726688110 | 15318506281 | 8408181829 | 681810860 | 7726370969 |
ClearR2 | 21354459562 | 134554494 | 21219905068 | 3718320388 | 17501584680 | 953647173 | 16547937507 |
ClearR3 | 22021579978 | 156886744 | 21864693234 | 3974158034 | 17890535200 | 1597537447 | 16292997753 |
Foundation | 45767482712 | 265722377 | 45501760335 | 8058672809 | 37443087526 | 999068909 | 36444018617 |
GreenR1 | 25484759240 | 152850748 | 25331908492 | 4571627102 | 20760281390 | 683697875 | 20076583515 |
GreenR2 | 21618191934 | 125878851 | 21492313083 | 3801015091 | 17691297992 | 878078760 | 16813219232 |
GreenR3 | 28486291034 | 179981797 | 28306309237 | 5229624041 | 23076685196 | 766086300 | 22310598896 |
LilacR1 | 24971943912 | 150950394 | 24820993518 | 4346090573 | 20474902945 | 528430158 | 19946472787 |
LilacR2 | 21682909930 | 135780342 | 21547129588 | 3800120781 | 17747008807 | 1007751957 | 16739256850 |
LilacR3 | 20197805904 | 129676064 | 20068129840 | 7177103369 | 12891026471 | 407464170 | 12483562301 |
This script creates a pie chart which displays what percentage of bases are removed by each data processing step, the percentage of bases removed due to overlapping reads and the percentage of usable bases left. In whole genome sequencing, a loss of <30% of bases can be considered very low. These results indicate high yield and coverage of the genome.
#Create a data frame containing how many bases have been removed and the leftover bases
DF2 <- data.frame(basecountTRIMMED,basecountUNMAPPED,basecountOVERLAPPING,UseableBP)
DF2
## basecountTRIMMED basecountUNMAPPED basecountOVERLAPPING UseableBP
## ClearR1 166191078 15318506281 681810860 7726370969
## ClearR2 134554494 3718320388 953647173 16547937507
## ClearR3 156886744 3974158034 1597537447 16292997753
## Foundation 265722377 8058672809 999068909 36444018617
## GreenR1 152850748 4571627102 683697875 20076583515
## GreenR2 125878851 3801015091 878078760 16813219232
## GreenR3 179981797 5229624041 766086300 22310598896
## LilacR1 150950394 4346090573 528430158 19946472787
## LilacR2 135780342 3800120781 1007751957 16739256850
## LilacR3 129676064 7177103369 407464170 12483562301
#Calculate the percentage of wasted bases in each step and usable bases leftover by dividing each vector value by the total bases in the data frame (to 3 significant values)
colSums(DF2)
## basecountTRIMMED basecountUNMAPPED basecountOVERLAPPING
## 1598472889 59995238469 8503573609
## UseableBP
## 185381018427
labels <- c("Trimmed","Unmapped/Duplicate","Overlapping","UseableBP")
percent <- as.character(signif(colSums(DF2)/sum(colSums(DF2))*100,3))
percent
## [1] "0.626" "23.5" "3.33" "72.6"
labels2 <- paste(labels,percent,"%")
par(mar=c(5,5,4,7))
pie(colSums(DF2),labels = labels2,col = coul)
This script creates a stacked barplot which displays what percentage of bases are removed by each data processing step, the percentage of bases removed due to overlapping reads and the percentage of usable bases left in each of the population DNA samples. The loss of bases was small for most samples, however there were evidently problems mapping reads from the Clear R1 and Lilac R3 samples. We know that the problem was caused due to mapping difficulties as these samples did not contain a large number of duplicate reads.
#Calculate the percentage of bases removed for each sample by each step
DF3 <- DF2/rowSums(DF2)*100
DF3
## basecountTRIMMED basecountUNMAPPED basecountOVERLAPPING UseableBP
## ClearR1 0.6955674 64.11327 2.853615 32.33755
## ClearR2 0.6301002 17.41238 4.465799 77.49172
## ClearR3 0.7124227 18.04665 7.254418 73.98651
## Foundation 0.5805921 17.60786 2.182923 79.62863
## GreenR1 0.5997732 17.93867 2.682772 78.77878
## GreenR2 0.5822820 17.58248 4.061759 77.77348
## GreenR3 0.6318190 18.35839 2.689316 78.32048
## LilacR1 0.6044799 17.40389 2.116095 79.87553
## LilacR2 0.6262090 17.52588 4.647679 77.20023
## LilacR3 0.6420304 35.53407 2.017368 61.80653
par(mar=c(6,5,4,2))
barplot(t(DF3),col=coul, las=2, ylab = "Percent of Bases")
legend(7, 95, legend = c(labels), col = c(coul), cex=0.8, pch = 19)
This script creates a barplot which displays an estimate of the fold coverage of the guppy genome for each DNA population sample. This is done by dividing the number of usable bases in each sample by the number of bases in the guppy genome. The US National Institutes of Health (NIH) Roadmap Epigenomics Project recommends the use of 2 replicates with a combined total coverage of 30×. Combining our 3 replicates, each of our light conditions easily surpass 30x coverage. These results indicate quality coverage of the guppy genome.
coverage <- UseableBP / 731622281
par(mar=c(5,6,4,2))
coverage
## ClearR1 ClearR2 ClearR3 Foundation GreenR1 GreenR2 GreenR3
## 10.56060 22.61814 22.26969 49.81261 27.44119 22.98074 30.49470
## LilacR1 LilacR2 LilacR3
## 27.26335 22.87964 17.06285
barplot(coverage, main = "Fold Coverage", horiz = TRUE, las = 1, xlim = c(0,50))
##Explanation of results These results tell us more about the quality of the sequencing/methylation profiling than methylation differences between the populations.
For example, the reason why we have so many foundation reads when compared to the light conditions is a deliberate sequencing choice rather than a difference between the samples. This is because we are trying to achieve satisfactory coverage of the genome in order have a greater probability of finding statistically significant results in our differential methylation analysis. As we have 3 replicates of each of the light conditions and only 1 foundation sample, it is important to have a greater number of foundation reads to meet these coverage requirements (the combined total of replicate reads is how we derive our statistical power).
This leads onto the differences in duplicate/unique reads between each of the light conditions and the foundation sample. The greater the number of total reads for a sample, the greater the percentage of duplicate reads for that sample. This is essentially because as we cover more of the genome in our sequencing, we are more likely to sequence regions of the genome that have already been sequenced, hence duplicate reads. This is likely the explanation as to why the foundation, green R1 and green R2 have more duplicate reads (they have more total reads).
The charts which depict CpH methylation, mtDNA methylation, lambda phage methylation and pUC19 plasmid methylation are only controls to provide evidence that the enzymatic methyl-seq kit and sequencing process worked well. mtDNA, lambda phage DNA and CpH methylation should all have either exactly or close to 0% methylation and the pUC19 plasmid should have 100% CpG methylation. Any deviations from this in our results are due to noise/inaccuracies. Whole methylome sequencing is not perfect so we expect small noise/inaccuracies.
The only one of these charts that may tell us something about differences in methylation between the light conditions and the foundation population is the chromosome 1 CpG methylation %. It is interesting that the foundation population has ~2% greater overall methylation when compared to the light condition populations. Because of this, I can’t rule out your speculation that methylation is reduced in order to allow faster evolutionary divergence in these populations. From speaking to Mark a while ago about changes in overall % of chromosomal methylation due to these experimental conditions, he said he did not expect to see significant differences, rather we would find the differences at individual loci (or differentially methylated regions on chromsomes).
For reproducibility.
sessionInfo()
## R version 4.1.0 (2021-05-18)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.2 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
##
## locale:
## [1] LC_CTYPE=en_AU.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_AU.UTF-8 LC_COLLATE=en_AU.UTF-8
## [5] LC_MONETARY=en_AU.UTF-8 LC_MESSAGES=en_AU.UTF-8
## [7] LC_PAPER=en_AU.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] kableExtra_1.3.4 RColorBrewer_1.1-2 vioplot_0.3.6 zoo_1.8-9
## [5] sm_2.2-5.6
##
## loaded via a namespace (and not attached):
## [1] bslib_0.2.5.1 compiler_4.1.0 jquerylib_0.1.4 highr_0.9
## [5] tools_4.1.0 digest_0.6.27 jsonlite_1.7.2 evaluate_0.14
## [9] lifecycle_1.0.0 lattice_0.20-44 viridisLite_0.4.0 rlang_0.4.11
## [13] rstudioapi_0.13 yaml_2.2.1 xfun_0.23 stringr_1.4.0
## [17] httr_1.4.2 knitr_1.33 xml2_1.3.2 sass_0.4.0
## [21] systemfonts_1.0.2 webshot_0.5.2 grid_4.1.0 svglite_2.0.0
## [25] glue_1.4.2 R6_2.5.0 tcltk_4.1.0 rmarkdown_2.8
## [29] magrittr_2.0.1 scales_1.1.1 htmltools_0.5.1.1 rvest_1.0.0
## [33] colorspace_2.0-1 stringi_1.6.2 munsell_0.5.0