In this report, I will take you through an analysis of targeted bisulfite sequencing.
The pipeline used the Biscuit pipeline (https://github.com/zhou-lab/biscuit) run by Larry Croft.
Here I (Mark Ziemann) am reading them into R for analysis with methylKit
library("GenomicRanges")
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:base':
##
## expand.grid
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
library("limma")
##
## Attaching package: 'limma'
## The following object is masked from 'package:BiocGenerics':
##
## plotMA
library("methylKit")
LEP chr7:128241036-128241347
TNF chr6:31575210-31575467
NPY1 chr7:24283548-24283843
NPY2 chr7:24283796-24283936
SOCS3 chr17:78358469-78358619
POMC1 chr2:25168373-25168611
POMC2 chr2:25161598-25161833
gene_names <- c("CELF2","PPP2R2D","LARP4B","GAB2","CASC1","UNKNOWN","AL096870")
seqnames <- c("10","10","10","11","12","14","14")
start <- c(10648947,131945290,864894,78358014,25161220,128240193,24283354)
end <- c(10649946,131946289,865893,78359013,25162219,128242193,24284353)
genes <- GRanges(seqnames = seqnames, ranges=IRanges(start,end))
genes$gene_names <- gene_names
file.list <- list.files(".",pattern="methylkit.tsv")
lines <- lapply(file.list,readLines)
file.list <- file.list[which(lapply(lines,length) > 0)]
samplesheet <- read.table("../sample_info/samplesheet_2020-10-15.tsv",sep="\t",header=TRUE)
samplesheet$poolID <- paste("pool",samplesheet$Internal_ID,sep="")
# filter files for those in the sample sheet
file.list <- file.list[which( sapply(strsplit(file.list,"_"),"[[",1) %in% samplesheet$poolID )]
# filter samplesheet for those with files
samplesheet <- samplesheet[which( samplesheet$poolID %in% sapply(strsplit(file.list,"_"),"[[",1) ),]
# reorder the samplesheet
samplenames <- sapply(strsplit(file.list,"_"),"[[",1)
samplesheet <- samplesheet[match(samplenames , samplesheet$poolID),]
samplesheet[,c(25,9)]
## poolID GWG
## 20 pool1 20.0
## 11 pool10 16.0
## 3 pool12 13.0
## 15 pool13 17.0
## 12 pool15 16.0
## 9 pool16 15.0
## 24 pool17 23.0
## 4 pool18 13.0
## 6 pool19 13.4
## 21 pool2 20.0
## 5 pool20 13.0
## 16 pool21 17.0
## 17 pool22 17.0
## 19 pool23 18.0
## 22 pool24 20.0
## 18 pool3 17.5
## 14 pool4 17.0
## 1 pool5 12.0
## 13 pool6 16.3
## 10 pool7 15.9
## 23 pool8 22.0
## 7 pool9 13.8
# only for debugging
#treatment <- sample(c(0,1),nrow(samplesheet),replace=TRUE)
myobj <- methRead( location=as.list(file.list),
sample.id=as.list(samplesheet$poolID),
pipeline="bismarkCoverage",
header=FALSE,
assembly="hg38",
treatment=samplesheet$GWG,
context="CpG",
mincov = 1 )
## Received list of locations.
## Reading file.
## Reading file.
## Reading file.
## Reading file.
## Reading file.
## Reading file.
## Reading file.
## Reading file.
## Reading file.
## Reading file.
## Reading file.
## Reading file.
## Reading file.
## Reading file.
## Reading file.
## Reading file.
## Reading file.
## Reading file.
## Reading file.
## Reading file.
## Reading file.
## Reading file.
#myobj <- methRead( location=as.list(file.list),
# sample.id=as.list(samplesheet$poolID),
# pipeline="bismarkCoverage",
# header=FALSE,
# assembly="hg38",
# treatment=as.vector(scale(samplesheet$GWG)),
# context="CpG",
# mincov = 1 )
myobjf <- filterByCoverage(myobj,lo.count=5,lo.perc=NULL,
hi.count=NULL,hi.perc=NULL)
meth <- unite(myobj, destrand=FALSE)
## uniting...
getCorrelation(meth,plot=TRUE)
## pool1 pool10 pool12 pool13 pool15 pool16 pool17
## pool1 1.0000000 0.5855325 0.6211725 0.7035727 0.7668660 0.8549100 0.8901978
## pool10 0.5855325 1.0000000 0.8277502 0.8383930 0.8320778 0.7996950 0.6576538
## pool12 0.6211725 0.8277502 1.0000000 0.8079562 0.7338048 0.8421786 0.7215105
## pool13 0.7035727 0.8383930 0.8079562 1.0000000 0.9379764 0.8758192 0.7626950
## pool15 0.7668660 0.8320778 0.7338048 0.9379764 1.0000000 0.9174383 0.7404563
## pool16 0.8549100 0.7996950 0.8421786 0.8758192 0.9174383 1.0000000 0.7907152
## pool17 0.8901978 0.6576538 0.7215105 0.7626950 0.7404563 0.7907152 1.0000000
## pool18 0.1748589 0.7707562 0.6112625 0.6453349 0.5833091 0.4197376 0.3447757
## pool19 0.6224652 0.8171979 0.7999923 0.9396694 0.9118933 0.8444395 0.7090226
## pool2 0.3463945 0.8795710 0.6779911 0.8055870 0.7477202 0.6440278 0.3930450
## pool20 0.5208891 0.9408163 0.8196230 0.8581700 0.8753589 0.8159036 0.6277972
## pool21 0.7579485 0.8155952 0.8038122 0.8981965 0.9022269 0.9085613 0.8387728
## pool22 0.3094556 0.9013274 0.7172664 0.7009321 0.7063217 0.6137617 0.4236640
## pool23 0.8240670 0.6227699 0.6457750 0.7027220 0.7217209 0.8155233 0.8696465
## pool24 0.7526162 0.8651175 0.8508769 0.8501262 0.8943713 0.9142065 0.8006848
## pool3 0.4876436 0.8449360 0.8284202 0.8470741 0.8037274 0.7322591 0.6180535
## pool4 0.9577428 0.6424293 0.6612692 0.7653096 0.7566573 0.8280070 0.9091991
## pool5 0.6006245 0.8636081 0.6872542 0.8620436 0.8292901 0.7395731 0.6941259
## pool6 0.7661228 0.7629842 0.7753259 0.7788459 0.8468274 0.8730769 0.8264379
## pool7 0.6351527 0.9219912 0.7136565 0.8637200 0.9272011 0.8594624 0.6301043
## pool8 0.8478806 0.7185273 0.7488135 0.8485363 0.8939829 0.9328516 0.8362455
## pool9 0.8848678 0.7248110 0.6853577 0.8355914 0.8109959 0.8006558 0.9191678
## pool18 pool19 pool2 pool20 pool21 pool22 pool23
## pool1 0.1748589 0.6224652 0.3463945 0.5208891 0.7579485 0.3094556 0.8240670
## pool10 0.7707562 0.8171979 0.8795710 0.9408163 0.8155952 0.9013274 0.6227699
## pool12 0.6112625 0.7999923 0.6779911 0.8196230 0.8038122 0.7172664 0.6457750
## pool13 0.6453349 0.9396694 0.8055870 0.8581700 0.8981965 0.7009321 0.7027220
## pool15 0.5833091 0.9118933 0.7477202 0.8753589 0.9022269 0.7063217 0.7217209
## pool16 0.4197376 0.8444395 0.6440278 0.8159036 0.9085613 0.6137617 0.8155233
## pool17 0.3447757 0.7090226 0.3930450 0.6277972 0.8387728 0.4236640 0.8696465
## pool18 1.0000000 0.7134811 0.8643939 0.7576313 0.5956201 0.9141064 0.2529033
## pool19 0.7134811 1.0000000 0.8005663 0.8829335 0.9285766 0.7637433 0.7185623
## pool2 0.8643939 0.8005663 1.0000000 0.8552567 0.6822761 0.9042154 0.3653855
## pool20 0.7576313 0.8829335 0.8552567 1.0000000 0.8583476 0.9047231 0.6335849
## pool21 0.5956201 0.9285766 0.6822761 0.8583476 1.0000000 0.7084981 0.8623552
## pool22 0.9141064 0.7637433 0.9042154 0.9047231 0.7084981 1.0000000 0.3767424
## pool23 0.2529033 0.7185623 0.3653855 0.6335849 0.8623552 0.3767424 1.0000000
## pool24 0.6027263 0.8623344 0.6984688 0.8831031 0.9207243 0.7594022 0.7495674
## pool3 0.7513578 0.8159299 0.8220134 0.8746839 0.7443903 0.7896510 0.4875307
## pool4 0.3017413 0.6970150 0.4456261 0.5409632 0.8004063 0.3734934 0.8409157
## pool5 0.7661318 0.8946608 0.8276833 0.8638731 0.8609330 0.7978023 0.7328585
## pool6 0.4676847 0.8467592 0.5234620 0.8304438 0.9114582 0.6257081 0.8894927
## pool7 0.6723327 0.8396083 0.8538715 0.9462839 0.8562640 0.8488222 0.6432408
## pool8 0.3592411 0.8251062 0.5680224 0.7847016 0.8746376 0.5137755 0.8523186
## pool9 0.4651253 0.7667664 0.5693550 0.6695418 0.8211306 0.4961955 0.8257139
## pool24 pool3 pool4 pool5 pool6 pool7 pool8
## pool1 0.7526162 0.4876436 0.9577428 0.6006245 0.7661228 0.6351527 0.8478806
## pool10 0.8651175 0.8449360 0.6424293 0.8636081 0.7629842 0.9219912 0.7185273
## pool12 0.8508769 0.8284202 0.6612692 0.6872542 0.7753259 0.7136565 0.7488135
## pool13 0.8501262 0.8470741 0.7653096 0.8620436 0.7788459 0.8637200 0.8485363
## pool15 0.8943713 0.8037274 0.7566573 0.8292901 0.8468274 0.9272011 0.8939829
## pool16 0.9142065 0.7322591 0.8280070 0.7395731 0.8730769 0.8594624 0.9328516
## pool17 0.8006848 0.6180535 0.9091991 0.6941259 0.8264379 0.6301043 0.8362455
## pool18 0.6027263 0.7513578 0.3017413 0.7661318 0.4676847 0.6723327 0.3592411
## pool19 0.8623344 0.8159299 0.6970150 0.8946608 0.8467592 0.8396083 0.8251062
## pool2 0.6984688 0.8220134 0.4456261 0.8276833 0.5234620 0.8538715 0.5680224
## pool20 0.8831031 0.8746839 0.5409632 0.8638731 0.8304438 0.9462839 0.7847016
## pool21 0.9207243 0.7443903 0.8004063 0.8609330 0.9114582 0.8562640 0.8746376
## pool22 0.7594022 0.7896510 0.3734934 0.7978023 0.6257081 0.8488222 0.5137755
## pool23 0.7495674 0.4875307 0.8409157 0.7328585 0.8894927 0.6432408 0.8523186
## pool24 1.0000000 0.8387768 0.7462333 0.7693286 0.8599876 0.8772006 0.8769319
## pool3 0.8387768 1.0000000 0.5209144 0.7205909 0.6392061 0.8064839 0.7500715
## pool4 0.7462333 0.5209144 1.0000000 0.6923533 0.7518557 0.6269833 0.8059069
## pool5 0.7693286 0.7205909 0.6923533 1.0000000 0.8078327 0.8566798 0.7419351
## pool6 0.8599876 0.6392061 0.7518557 0.8078327 1.0000000 0.7886626 0.8763023
## pool7 0.8772006 0.8064839 0.6269833 0.8566798 0.7886626 1.0000000 0.8128085
## pool8 0.8769319 0.7500715 0.8059069 0.7419351 0.8763023 0.8128085 1.0000000
## pool9 0.7875510 0.6909381 0.9195440 0.8220397 0.7787933 0.7204924 0.8607970
## pool9
## pool1 0.8848678
## pool10 0.7248110
## pool12 0.6853577
## pool13 0.8355914
## pool15 0.8109959
## pool16 0.8006558
## pool17 0.9191678
## pool18 0.4651253
## pool19 0.7667664
## pool2 0.5693550
## pool20 0.6695418
## pool21 0.8211306
## pool22 0.4961955
## pool23 0.8257139
## pool24 0.7875510
## pool3 0.6909381
## pool4 0.9195440
## pool5 0.8220397
## pool6 0.7787933
## pool7 0.7204924
## pool8 0.8607970
## pool9 1.0000000
clusterSamples(meth, dist="correlation", method="ward", plot=TRUE)
## The "ward" method has been renamed to "ward.D"; note new "ward.D2"
##
## Call:
## hclust(d = d, method = HCLUST.METHODS[hclust.method])
##
## Cluster method : ward.D
## Distance : pearson
## Number of objects: 22
PCASamples(meth, screeplot=TRUE)
PCASamples(meth)
dm <- calculateDiffMeth(meth)
## more than two groups detected:
## will calculate methylation difference as the difference of max(x) - min(x),
## where x is vector of mean methylation per group per region,but
## the statistical test will remain the same.
dm
## chr start end strand pvalue qvalue meth.diff
## 1 NPY2.110 110 110 * 0.8950765 0 1.2183075
## 2 NPY2.25 25 25 * 0.8512679 0 0.2475339
## 3 NPY2.59 59 59 * 0.8519023 0 0.1910137
## 4 NPY2.64 64 64 * 0.8662033 0 0.5912206
## 5 NPY2.99 99 99 * 0.5398042 0 1.3725651
## 6 POMC2.102 102 102 * 0.4291466 0 16.2284712
## 7 POMC2.26 26 26 * 0.5821303 0 20.5223881
## 8 POMC2.31 31 31 * 0.5336422 0 17.4580637
## 9 POMC2.56 56 56 * 0.6814301 0 17.4580637
## 10 POMC2.64 64 64 * 0.7644441 0 14.2639193
## 11 POMC2.74 74 74 * 0.6307043 0 19.2253332
## 12 POMC2.92 92 92 * 0.6055172 0 21.0746637
## 13 SOCS.113 113 113 * 0.8747546 0 19.7418285
## 14 SOCS.119 119 119 * 0.9645667 0 15.5223881
## 15 SOCS.32 32 32 * 0.9508960 0 11.3118617
## 16 SOCS.38 38 38 * 0.6913183 0 11.3118617
## 17 SOCS.46 46 46 * 0.8330839 0 11.3118617
## 18 SOCS.71 71 71 * 0.4525501 0 13.7931034
dm$pvalue<0.05
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE
min(dm$pvalue)
## [1] 0.4291466
plot(dm$meth.diff,-log10(dm$pvalue) )
hist(dm$meth.diff)
#dm.hyper=getMethylDiff(dm,difference=1,qvalue=0.1,type="hyper")
#dm.hypo=getMethylDiff(dm,difference=1,qvalue=0.1,type="hypo")
z <- samplesheet$GWG
i=0
xx <- apply( percMethylation(meth), 1 , function(x) {
# w <- wilcox.test(z,x)
cor <- cor.test(x,z)
p <- cor$p.value
r <- cor$estimate
i=i+1
HEADER = paste("Site",i," p=",round(p,3)," r=",round(r,3))
plot(z,x,xlab="GWG",ylab="Beta")
mtext(HEADER)
c("p"=p,"r"=r)
})
xx <- as.data.frame(t(xx))
xx$FDR <- p.adjust(xx[,1],method="fdr")
xx <- data.frame(meth$chr,meth$end,xx)
colnames(xx) <- c("seqname","pos","p","r","FDR")
top <- xx[order(xx$p),]
head(top,10)
## seqname pos p r FDR
## 6 POMC2.102 102 0.009066178 -0.5426756 0.1631912
## 8 POMC2.31 31 0.023972013 -0.4793994 0.1737516
## 7 POMC2.26 26 0.032105608 -0.4579239 0.1737516
## 5 NPY2.99 99 0.038611466 -0.4436690 0.1737516
## 12 POMC2.92 92 0.082020612 -0.3789206 0.2380618
## 11 POMC2.74 74 0.083086639 -0.3777060 0.2380618
## 9 POMC2.56 56 0.105286505 -0.3547001 0.2380618
## 18 SOCS.71 71 0.105805244 0.3542072 0.2380618
## 10 POMC2.64 64 0.150281996 -0.3172263 0.3005640
## 16 SOCS.38 38 0.329906182 0.2179321 0.5938311
gr <- GRanges(seqnames = xx$seqname, ranges = IRanges(xx$pos) )
gr$r <- xx$r
gr$p <- xx$p
gr$FDR <- xx$FDR
# lets look at each gene
genes[1]$gene_names
## [1] "CELF2"
ol <- subsetByOverlaps(gr, genes[1])
## Warning in .Seqinfo.mergexy(x, y): The 2 combined objects have no sequence levels in common. (Use
## suppressWarnings() to suppress this warning.)
for ( i in seq_along(genes) ) {
print(genes[i]$gene_names)
ol <- subsetByOverlaps(gr, genes[i])
print( as.data.frame(ol) )
}
## [1] "CELF2"
## Warning in .Seqinfo.mergexy(x, y): The 2 combined objects have no sequence levels in common. (Use
## suppressWarnings() to suppress this warning.)
## [1] seqnames start end width strand r p FDR
## <0 rows> (or 0-length row.names)
## [1] "PPP2R2D"
## Warning in .Seqinfo.mergexy(x, y): The 2 combined objects have no sequence levels in common. (Use
## suppressWarnings() to suppress this warning.)
## [1] seqnames start end width strand r p FDR
## <0 rows> (or 0-length row.names)
## [1] "LARP4B"
## Warning in .Seqinfo.mergexy(x, y): The 2 combined objects have no sequence levels in common. (Use
## suppressWarnings() to suppress this warning.)
## [1] seqnames start end width strand r p FDR
## <0 rows> (or 0-length row.names)
## [1] "GAB2"
## Warning in .Seqinfo.mergexy(x, y): The 2 combined objects have no sequence levels in common. (Use
## suppressWarnings() to suppress this warning.)
## [1] seqnames start end width strand r p FDR
## <0 rows> (or 0-length row.names)
## [1] "CASC1"
## Warning in .Seqinfo.mergexy(x, y): The 2 combined objects have no sequence levels in common. (Use
## suppressWarnings() to suppress this warning.)
## [1] seqnames start end width strand r p FDR
## <0 rows> (or 0-length row.names)
## [1] "UNKNOWN"
## Warning in .Seqinfo.mergexy(x, y): The 2 combined objects have no sequence levels in common. (Use
## suppressWarnings() to suppress this warning.)
## [1] seqnames start end width strand r p FDR
## <0 rows> (or 0-length row.names)
## [1] "AL096870"
## Warning in .Seqinfo.mergexy(x, y): The 2 combined objects have no sequence levels in common. (Use
## suppressWarnings() to suppress this warning.)
## [1] seqnames start end width strand r p FDR
## <0 rows> (or 0-length row.names)
sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.2 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] methylKit_1.16.0 limma_3.46.0 GenomicRanges_1.42.0
## [4] GenomeInfoDb_1.26.0 IRanges_2.24.0 S4Vectors_0.28.0
## [7] BiocGenerics_0.36.0
##
## loaded via a namespace (and not attached):
## [1] mclust_5.4.6 Rcpp_1.0.5
## [3] bdsmatrix_1.3-4 mvtnorm_1.1-1
## [5] lattice_0.20-41 Rsamtools_2.6.0
## [7] Biostrings_2.58.0 gtools_3.8.2
## [9] digest_0.6.27 R6_2.5.0
## [11] plyr_1.8.6 emdbook_1.3.12
## [13] evaluate_0.14 coda_0.19-4
## [15] ggplot2_3.3.2 pillar_1.4.6
## [17] zlibbioc_1.36.0 rlang_0.4.8
## [19] data.table_1.13.2 R.utils_2.10.1
## [21] R.oo_1.24.0 Matrix_1.2-18
## [23] bbmle_1.0.23.1 qvalue_2.22.0
## [25] rmarkdown_2.5 splines_4.0.3
## [27] BiocParallel_1.24.1 stringr_1.4.0
## [29] fastseg_1.36.0 RCurl_1.98-1.2
## [31] munsell_0.5.0 DelayedArray_0.16.0
## [33] rtracklayer_1.50.0 compiler_4.0.3
## [35] numDeriv_2016.8-1.1 xfun_0.19
## [37] pkgconfig_2.0.3 mgcv_1.8-33
## [39] htmltools_0.5.0 SummarizedExperiment_1.20.0
## [41] tidyselect_1.1.0 tibble_3.0.4
## [43] GenomeInfoDbData_1.2.4 matrixStats_0.57.0
## [45] XML_3.99-0.5 crayon_1.3.4
## [47] dplyr_1.0.2 GenomicAlignments_1.26.0
## [49] MASS_7.3-53 bitops_1.0-6
## [51] R.methodsS3_1.8.1 grid_4.0.3
## [53] nlme_3.1-150 gtable_0.3.0
## [55] lifecycle_0.2.0 magrittr_1.5
## [57] scales_1.1.1 KernSmooth_2.23-18
## [59] stringi_1.5.3 XVector_0.30.0
## [61] reshape2_1.4.4 ellipsis_0.3.1
## [63] generics_0.1.0 vctrs_0.3.4
## [65] tools_4.0.3 Biobase_2.50.0
## [67] glue_1.4.2 purrr_0.3.4
## [69] MatrixGenerics_1.2.0 yaml_2.2.1
## [71] colorspace_2.0-0 knitr_1.30