Introduction

In this report, I will take you through an analysis of targeted bisulfite sequencing.

The pipeline used the Biscuit pipeline (https://github.com/zhou-lab/biscuit) run by Larry Croft.

Here I (Mark Ziemann) am reading them into R for analysis with methylKit

library("GenomicRanges")
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:base':
## 
##     expand.grid
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
library("limma")
## 
## Attaching package: 'limma'
## The following object is masked from 'package:BiocGenerics':
## 
##     plotMA
library("methylKit")

Gene regions

LEP chr7:128241036-128241347

TNF chr6:31575210-31575467

NPY1 chr7:24283548-24283843

NPY2 chr7:24283796-24283936

SOCS3 chr17:78358469-78358619

POMC1 chr2:25168373-25168611

POMC2 chr2:25161598-25161833

gene_names <- c("CELF2","PPP2R2D","LARP4B","GAB2","CASC1","UNKNOWN","AL096870")
seqnames <- c("10","10","10","11","12","14","14")
start <- c(10648947,131945290,864894,78358014,25161220,128240193,24283354)
end <- c(10649946,131946289,865893,78359013,25162219,128242193,24284353)

genes <- GRanges(seqnames = seqnames, ranges=IRanges(start,end))
genes$gene_names <- gene_names

Data preparation

file.list <- list.files(".",pattern="methylkit.tsv")

lines <- lapply(file.list,readLines)
file.list <- file.list[which(lapply(lines,length) > 0)]

samplesheet <- read.table("../sample_info/samplesheet_2020-10-15.tsv",sep="\t",header=TRUE)
samplesheet$poolID <- paste("pool",samplesheet$Internal_ID,sep="")

# filter files for those in the sample sheet
file.list <- file.list[which( sapply(strsplit(file.list,"_"),"[[",1) %in% samplesheet$poolID )]

# filter samplesheet for those with files
samplesheet <- samplesheet[which( samplesheet$poolID  %in% sapply(strsplit(file.list,"_"),"[[",1)   ),]

# reorder the samplesheet
samplenames <-  sapply(strsplit(file.list,"_"),"[[",1)
samplesheet <- samplesheet[match(samplenames , samplesheet$poolID),]

samplesheet[,c(25,9)]
##    poolID  GWG
## 20  pool1 20.0
## 11 pool10 16.0
## 3  pool12 13.0
## 15 pool13 17.0
## 12 pool15 16.0
## 9  pool16 15.0
## 24 pool17 23.0
## 4  pool18 13.0
## 6  pool19 13.4
## 21  pool2 20.0
## 5  pool20 13.0
## 16 pool21 17.0
## 17 pool22 17.0
## 19 pool23 18.0
## 22 pool24 20.0
## 18  pool3 17.5
## 14  pool4 17.0
## 1   pool5 12.0
## 13  pool6 16.3
## 10  pool7 15.9
## 23  pool8 22.0
## 7   pool9 13.8
# only for debugging
#treatment <- sample(c(0,1),nrow(samplesheet),replace=TRUE)

myobj <- methRead( location=as.list(file.list),
  sample.id=as.list(samplesheet$poolID),  
  pipeline="bismarkCoverage",
  header=FALSE, 
  assembly="hg38",
  treatment=samplesheet$GWG,
  context="CpG", 
  mincov = 1 )
## Received list of locations.
## Reading file.
## Reading file.
## Reading file.
## Reading file.
## Reading file.
## Reading file.
## Reading file.
## Reading file.
## Reading file.
## Reading file.
## Reading file.
## Reading file.
## Reading file.
## Reading file.
## Reading file.
## Reading file.
## Reading file.
## Reading file.
## Reading file.
## Reading file.
## Reading file.
## Reading file.
#myobj <- methRead( location=as.list(file.list),
#  sample.id=as.list(samplesheet$poolID),
#  pipeline="bismarkCoverage",
#  header=FALSE,
#  assembly="hg38",
#  treatment=as.vector(scale(samplesheet$GWG)),
#  context="CpG",
#  mincov = 1 )

myobjf <- filterByCoverage(myobj,lo.count=5,lo.perc=NULL,
  hi.count=NULL,hi.perc=NULL)

meth <- unite(myobj, destrand=FALSE)
## uniting...

Some basic charts

getCorrelation(meth,plot=TRUE)
##            pool1    pool10    pool12    pool13    pool15    pool16    pool17
## pool1  1.0000000 0.5855325 0.6211725 0.7035727 0.7668660 0.8549100 0.8901978
## pool10 0.5855325 1.0000000 0.8277502 0.8383930 0.8320778 0.7996950 0.6576538
## pool12 0.6211725 0.8277502 1.0000000 0.8079562 0.7338048 0.8421786 0.7215105
## pool13 0.7035727 0.8383930 0.8079562 1.0000000 0.9379764 0.8758192 0.7626950
## pool15 0.7668660 0.8320778 0.7338048 0.9379764 1.0000000 0.9174383 0.7404563
## pool16 0.8549100 0.7996950 0.8421786 0.8758192 0.9174383 1.0000000 0.7907152
## pool17 0.8901978 0.6576538 0.7215105 0.7626950 0.7404563 0.7907152 1.0000000
## pool18 0.1748589 0.7707562 0.6112625 0.6453349 0.5833091 0.4197376 0.3447757
## pool19 0.6224652 0.8171979 0.7999923 0.9396694 0.9118933 0.8444395 0.7090226
## pool2  0.3463945 0.8795710 0.6779911 0.8055870 0.7477202 0.6440278 0.3930450
## pool20 0.5208891 0.9408163 0.8196230 0.8581700 0.8753589 0.8159036 0.6277972
## pool21 0.7579485 0.8155952 0.8038122 0.8981965 0.9022269 0.9085613 0.8387728
## pool22 0.3094556 0.9013274 0.7172664 0.7009321 0.7063217 0.6137617 0.4236640
## pool23 0.8240670 0.6227699 0.6457750 0.7027220 0.7217209 0.8155233 0.8696465
## pool24 0.7526162 0.8651175 0.8508769 0.8501262 0.8943713 0.9142065 0.8006848
## pool3  0.4876436 0.8449360 0.8284202 0.8470741 0.8037274 0.7322591 0.6180535
## pool4  0.9577428 0.6424293 0.6612692 0.7653096 0.7566573 0.8280070 0.9091991
## pool5  0.6006245 0.8636081 0.6872542 0.8620436 0.8292901 0.7395731 0.6941259
## pool6  0.7661228 0.7629842 0.7753259 0.7788459 0.8468274 0.8730769 0.8264379
## pool7  0.6351527 0.9219912 0.7136565 0.8637200 0.9272011 0.8594624 0.6301043
## pool8  0.8478806 0.7185273 0.7488135 0.8485363 0.8939829 0.9328516 0.8362455
## pool9  0.8848678 0.7248110 0.6853577 0.8355914 0.8109959 0.8006558 0.9191678
##           pool18    pool19     pool2    pool20    pool21    pool22    pool23
## pool1  0.1748589 0.6224652 0.3463945 0.5208891 0.7579485 0.3094556 0.8240670
## pool10 0.7707562 0.8171979 0.8795710 0.9408163 0.8155952 0.9013274 0.6227699
## pool12 0.6112625 0.7999923 0.6779911 0.8196230 0.8038122 0.7172664 0.6457750
## pool13 0.6453349 0.9396694 0.8055870 0.8581700 0.8981965 0.7009321 0.7027220
## pool15 0.5833091 0.9118933 0.7477202 0.8753589 0.9022269 0.7063217 0.7217209
## pool16 0.4197376 0.8444395 0.6440278 0.8159036 0.9085613 0.6137617 0.8155233
## pool17 0.3447757 0.7090226 0.3930450 0.6277972 0.8387728 0.4236640 0.8696465
## pool18 1.0000000 0.7134811 0.8643939 0.7576313 0.5956201 0.9141064 0.2529033
## pool19 0.7134811 1.0000000 0.8005663 0.8829335 0.9285766 0.7637433 0.7185623
## pool2  0.8643939 0.8005663 1.0000000 0.8552567 0.6822761 0.9042154 0.3653855
## pool20 0.7576313 0.8829335 0.8552567 1.0000000 0.8583476 0.9047231 0.6335849
## pool21 0.5956201 0.9285766 0.6822761 0.8583476 1.0000000 0.7084981 0.8623552
## pool22 0.9141064 0.7637433 0.9042154 0.9047231 0.7084981 1.0000000 0.3767424
## pool23 0.2529033 0.7185623 0.3653855 0.6335849 0.8623552 0.3767424 1.0000000
## pool24 0.6027263 0.8623344 0.6984688 0.8831031 0.9207243 0.7594022 0.7495674
## pool3  0.7513578 0.8159299 0.8220134 0.8746839 0.7443903 0.7896510 0.4875307
## pool4  0.3017413 0.6970150 0.4456261 0.5409632 0.8004063 0.3734934 0.8409157
## pool5  0.7661318 0.8946608 0.8276833 0.8638731 0.8609330 0.7978023 0.7328585
## pool6  0.4676847 0.8467592 0.5234620 0.8304438 0.9114582 0.6257081 0.8894927
## pool7  0.6723327 0.8396083 0.8538715 0.9462839 0.8562640 0.8488222 0.6432408
## pool8  0.3592411 0.8251062 0.5680224 0.7847016 0.8746376 0.5137755 0.8523186
## pool9  0.4651253 0.7667664 0.5693550 0.6695418 0.8211306 0.4961955 0.8257139
##           pool24     pool3     pool4     pool5     pool6     pool7     pool8
## pool1  0.7526162 0.4876436 0.9577428 0.6006245 0.7661228 0.6351527 0.8478806
## pool10 0.8651175 0.8449360 0.6424293 0.8636081 0.7629842 0.9219912 0.7185273
## pool12 0.8508769 0.8284202 0.6612692 0.6872542 0.7753259 0.7136565 0.7488135
## pool13 0.8501262 0.8470741 0.7653096 0.8620436 0.7788459 0.8637200 0.8485363
## pool15 0.8943713 0.8037274 0.7566573 0.8292901 0.8468274 0.9272011 0.8939829
## pool16 0.9142065 0.7322591 0.8280070 0.7395731 0.8730769 0.8594624 0.9328516
## pool17 0.8006848 0.6180535 0.9091991 0.6941259 0.8264379 0.6301043 0.8362455
## pool18 0.6027263 0.7513578 0.3017413 0.7661318 0.4676847 0.6723327 0.3592411
## pool19 0.8623344 0.8159299 0.6970150 0.8946608 0.8467592 0.8396083 0.8251062
## pool2  0.6984688 0.8220134 0.4456261 0.8276833 0.5234620 0.8538715 0.5680224
## pool20 0.8831031 0.8746839 0.5409632 0.8638731 0.8304438 0.9462839 0.7847016
## pool21 0.9207243 0.7443903 0.8004063 0.8609330 0.9114582 0.8562640 0.8746376
## pool22 0.7594022 0.7896510 0.3734934 0.7978023 0.6257081 0.8488222 0.5137755
## pool23 0.7495674 0.4875307 0.8409157 0.7328585 0.8894927 0.6432408 0.8523186
## pool24 1.0000000 0.8387768 0.7462333 0.7693286 0.8599876 0.8772006 0.8769319
## pool3  0.8387768 1.0000000 0.5209144 0.7205909 0.6392061 0.8064839 0.7500715
## pool4  0.7462333 0.5209144 1.0000000 0.6923533 0.7518557 0.6269833 0.8059069
## pool5  0.7693286 0.7205909 0.6923533 1.0000000 0.8078327 0.8566798 0.7419351
## pool6  0.8599876 0.6392061 0.7518557 0.8078327 1.0000000 0.7886626 0.8763023
## pool7  0.8772006 0.8064839 0.6269833 0.8566798 0.7886626 1.0000000 0.8128085
## pool8  0.8769319 0.7500715 0.8059069 0.7419351 0.8763023 0.8128085 1.0000000
## pool9  0.7875510 0.6909381 0.9195440 0.8220397 0.7787933 0.7204924 0.8607970
##            pool9
## pool1  0.8848678
## pool10 0.7248110
## pool12 0.6853577
## pool13 0.8355914
## pool15 0.8109959
## pool16 0.8006558
## pool17 0.9191678
## pool18 0.4651253
## pool19 0.7667664
## pool2  0.5693550
## pool20 0.6695418
## pool21 0.8211306
## pool22 0.4961955
## pool23 0.8257139
## pool24 0.7875510
## pool3  0.6909381
## pool4  0.9195440
## pool5  0.8220397
## pool6  0.7787933
## pool7  0.7204924
## pool8  0.8607970
## pool9  1.0000000

clusterSamples(meth, dist="correlation", method="ward", plot=TRUE)
## The "ward" method has been renamed to "ward.D"; note new "ward.D2"

## 
## Call:
## hclust(d = d, method = HCLUST.METHODS[hclust.method])
## 
## Cluster method   : ward.D 
## Distance         : pearson 
## Number of objects: 22
PCASamples(meth, screeplot=TRUE)

PCASamples(meth)

Statistical analysis

dm <- calculateDiffMeth(meth)
## more than two groups detected:
##  will calculate methylation difference as the difference of max(x) - min(x),
##  where x is vector of mean methylation per group per region,but 
##  the statistical test will remain the same.
dm
##          chr start end strand    pvalue qvalue  meth.diff
## 1   NPY2.110   110 110      * 0.8950765      0  1.2183075
## 2    NPY2.25    25  25      * 0.8512679      0  0.2475339
## 3    NPY2.59    59  59      * 0.8519023      0  0.1910137
## 4    NPY2.64    64  64      * 0.8662033      0  0.5912206
## 5    NPY2.99    99  99      * 0.5398042      0  1.3725651
## 6  POMC2.102   102 102      * 0.4291466      0 16.2284712
## 7   POMC2.26    26  26      * 0.5821303      0 20.5223881
## 8   POMC2.31    31  31      * 0.5336422      0 17.4580637
## 9   POMC2.56    56  56      * 0.6814301      0 17.4580637
## 10  POMC2.64    64  64      * 0.7644441      0 14.2639193
## 11  POMC2.74    74  74      * 0.6307043      0 19.2253332
## 12  POMC2.92    92  92      * 0.6055172      0 21.0746637
## 13  SOCS.113   113 113      * 0.8747546      0 19.7418285
## 14  SOCS.119   119 119      * 0.9645667      0 15.5223881
## 15   SOCS.32    32  32      * 0.9508960      0 11.3118617
## 16   SOCS.38    38  38      * 0.6913183      0 11.3118617
## 17   SOCS.46    46  46      * 0.8330839      0 11.3118617
## 18   SOCS.71    71  71      * 0.4525501      0 13.7931034
dm$pvalue<0.05
##  [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE
min(dm$pvalue)
## [1] 0.4291466
plot(dm$meth.diff,-log10(dm$pvalue) )

hist(dm$meth.diff)

#dm.hyper=getMethylDiff(dm,difference=1,qvalue=0.1,type="hyper")

#dm.hypo=getMethylDiff(dm,difference=1,qvalue=0.1,type="hypo")

Lets run a simple correlation test

z <- samplesheet$GWG

i=0
xx <- apply( percMethylation(meth), 1 , function(x) {
  #  w <- wilcox.test(z,x)
  cor <- cor.test(x,z)
  p <- cor$p.value
  r <- cor$estimate
  i=i+1 
  HEADER = paste("Site",i," p=",round(p,3)," r=",round(r,3))
  plot(z,x,xlab="GWG",ylab="Beta")
  mtext(HEADER)
  c("p"=p,"r"=r)
})

xx <- as.data.frame(t(xx))
xx$FDR <- p.adjust(xx[,1],method="fdr")

xx <- data.frame(meth$chr,meth$end,xx)
colnames(xx) <- c("seqname","pos","p","r","FDR")

top <- xx[order(xx$p),]
head(top,10)
##      seqname pos           p          r       FDR
## 6  POMC2.102 102 0.009066178 -0.5426756 0.1631912
## 8   POMC2.31  31 0.023972013 -0.4793994 0.1737516
## 7   POMC2.26  26 0.032105608 -0.4579239 0.1737516
## 5    NPY2.99  99 0.038611466 -0.4436690 0.1737516
## 12  POMC2.92  92 0.082020612 -0.3789206 0.2380618
## 11  POMC2.74  74 0.083086639 -0.3777060 0.2380618
## 9   POMC2.56  56 0.105286505 -0.3547001 0.2380618
## 18   SOCS.71  71 0.105805244  0.3542072 0.2380618
## 10  POMC2.64  64 0.150281996 -0.3172263 0.3005640
## 16   SOCS.38  38 0.329906182  0.2179321 0.5938311
gr <- GRanges(seqnames = xx$seqname, ranges = IRanges(xx$pos) )
gr$r <- xx$r
gr$p <- xx$p
gr$FDR <- xx$FDR

# lets look at each gene
genes[1]$gene_names
## [1] "CELF2"
ol <- subsetByOverlaps(gr, genes[1])
## Warning in .Seqinfo.mergexy(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)
for ( i in seq_along(genes) )  {
  print(genes[i]$gene_names)
  ol <- subsetByOverlaps(gr, genes[i])
  print( as.data.frame(ol) )
}
## [1] "CELF2"
## Warning in .Seqinfo.mergexy(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)
## [1] seqnames start    end      width    strand   r        p        FDR     
## <0 rows> (or 0-length row.names)
## [1] "PPP2R2D"
## Warning in .Seqinfo.mergexy(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)
## [1] seqnames start    end      width    strand   r        p        FDR     
## <0 rows> (or 0-length row.names)
## [1] "LARP4B"
## Warning in .Seqinfo.mergexy(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)
## [1] seqnames start    end      width    strand   r        p        FDR     
## <0 rows> (or 0-length row.names)
## [1] "GAB2"
## Warning in .Seqinfo.mergexy(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)
## [1] seqnames start    end      width    strand   r        p        FDR     
## <0 rows> (or 0-length row.names)
## [1] "CASC1"
## Warning in .Seqinfo.mergexy(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)
## [1] seqnames start    end      width    strand   r        p        FDR     
## <0 rows> (or 0-length row.names)
## [1] "UNKNOWN"
## Warning in .Seqinfo.mergexy(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)
## [1] seqnames start    end      width    strand   r        p        FDR     
## <0 rows> (or 0-length row.names)
## [1] "AL096870"
## Warning in .Seqinfo.mergexy(x, y): The 2 combined objects have no sequence levels in common. (Use
##   suppressWarnings() to suppress this warning.)
## [1] seqnames start    end      width    strand   r        p        FDR     
## <0 rows> (or 0-length row.names)

Session information

sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.2 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
## [1] methylKit_1.16.0     limma_3.46.0         GenomicRanges_1.42.0
## [4] GenomeInfoDb_1.26.0  IRanges_2.24.0       S4Vectors_0.28.0    
## [7] BiocGenerics_0.36.0 
## 
## loaded via a namespace (and not attached):
##  [1] mclust_5.4.6                Rcpp_1.0.5                 
##  [3] bdsmatrix_1.3-4             mvtnorm_1.1-1              
##  [5] lattice_0.20-41             Rsamtools_2.6.0            
##  [7] Biostrings_2.58.0           gtools_3.8.2               
##  [9] digest_0.6.27               R6_2.5.0                   
## [11] plyr_1.8.6                  emdbook_1.3.12             
## [13] evaluate_0.14               coda_0.19-4                
## [15] ggplot2_3.3.2               pillar_1.4.6               
## [17] zlibbioc_1.36.0             rlang_0.4.8                
## [19] data.table_1.13.2           R.utils_2.10.1             
## [21] R.oo_1.24.0                 Matrix_1.2-18              
## [23] bbmle_1.0.23.1              qvalue_2.22.0              
## [25] rmarkdown_2.5               splines_4.0.3              
## [27] BiocParallel_1.24.1         stringr_1.4.0              
## [29] fastseg_1.36.0              RCurl_1.98-1.2             
## [31] munsell_0.5.0               DelayedArray_0.16.0        
## [33] rtracklayer_1.50.0          compiler_4.0.3             
## [35] numDeriv_2016.8-1.1         xfun_0.19                  
## [37] pkgconfig_2.0.3             mgcv_1.8-33                
## [39] htmltools_0.5.0             SummarizedExperiment_1.20.0
## [41] tidyselect_1.1.0            tibble_3.0.4               
## [43] GenomeInfoDbData_1.2.4      matrixStats_0.57.0         
## [45] XML_3.99-0.5                crayon_1.3.4               
## [47] dplyr_1.0.2                 GenomicAlignments_1.26.0   
## [49] MASS_7.3-53                 bitops_1.0-6               
## [51] R.methodsS3_1.8.1           grid_4.0.3                 
## [53] nlme_3.1-150                gtable_0.3.0               
## [55] lifecycle_0.2.0             magrittr_1.5               
## [57] scales_1.1.1                KernSmooth_2.23-18         
## [59] stringi_1.5.3               XVector_0.30.0             
## [61] reshape2_1.4.4              ellipsis_0.3.1             
## [63] generics_0.1.0              vctrs_0.3.4                
## [65] tools_4.0.3                 Biobase_2.50.0             
## [67] glue_1.4.2                  purrr_0.3.4                
## [69] MatrixGenerics_1.2.0        yaml_2.2.1                 
## [71] colorspace_2.0-0            knitr_1.30