Introduction

On the 18th May, Chiara shared some Illumina Epic methylation array data on the Deakin sharepoint which I transferred to my Nectar Webserver. There were 16 samples. The sample metadata was provided in the “Groups.csv” file which outlined the two contrasts to be carried out. On the 18th Aug Chiara provided a 3rd contrast:

My general approach here is to use the MissMethyl package (Phipson et al, 2016) to import, normalise, filter and then limma to perform differential methylation analysis (Ritchie et al, 2015).

suppressPackageStartupMessages({
    library("R.utils")
    library("missMethyl")
    library("limma")
    library("minfi")
    library("IlluminaHumanMethylationEPICmanifest")
    library("RColorBrewer")
    library("IlluminaHumanMethylationEPICanno.ilm10b4.hg19")
    library("eulerr")
    library("plyr")
    library("gplots")
    library("reshape2")
    library("beeswarm")
})
# need some annotations
annEPIC <- getAnnotation(IlluminaHumanMethylationEPICanno.ilm10b4.hg19)
myann <- data.frame(annEPIC[,c("UCSC_RefGene_Name","Regulatory_Feature_Group")])
promoters <- grep("Prom",myann$Regulatory_Feature_Group)

Import dataset

baseDir <- "ILMLEPIC-15799"
targets <- read.metharray.sheet(baseDir)
## [read.metharray.sheet] Found the following CSV files:
## [1] "ILMLEPIC-15799/ILMLEPIC-15799_SampleSheet.csv"
head(targets)
##   Sample_Name Sample_Well Sample_Plate Sample_Group Pool_ID  Array        Slide
## 1       BD001         A09            1       Group1    <NA> R01C01 203845510076
## 2       BD002         B09            1       Group1    <NA> R02C01 203845510076
## 3       BD003         C09            1       Group1    <NA> R03C01 203845510076
## 4       BD004         D09            1       Group1    <NA> R04C01 203845510076
## 5       BD005         E09            1       Group1    <NA> R05C01 203845510076
## 6       BD007         F09            1       Group1    <NA> R06C01 203845510076
##                                          Basename
## 1 ILMLEPIC-15799/203845510076/203845510076_R01C01
## 2 ILMLEPIC-15799/203845510076/203845510076_R02C01
## 3 ILMLEPIC-15799/203845510076/203845510076_R03C01
## 4 ILMLEPIC-15799/203845510076/203845510076_R04C01
## 5 ILMLEPIC-15799/203845510076/203845510076_R05C01
## 6 ILMLEPIC-15799/203845510076/203845510076_R06C01
rgSet <- read.metharray.exp(targets = targets)
rgSet
## class: RGChannelSet 
## dim: 1051815 16 
## metadata(0):
## assays(2): Green Red
## rownames(1051815): 1600101 1600111 ... 99810990 99810992
## rowData names(0):
## colnames(16): 203845510076_R01C01 203845510076_R02C01 ...
##   203845510104_R07C01 203845510104_R08C01
## colData names(9): Sample_Name Sample_Well ... Basename filenames
## Annotation
##   array: IlluminaHumanMethylationEPIC
##   annotation: ilm10b4.hg19

Define sample sheet

mygroups <- read.csv("Groups.csv")
samplesheet <- mygroups[,9:11]
colnames(samplesheet)[1] <- "Sample_Name"

# define control and trt groups for contrast 1
ctrl1_grp <- as.character(mygroups[,1])
ctrl1_grp <- ctrl1_grp[which(ctrl1_grp != "")]
ctrl1_grp
##  [1] "BD001" "BD002" "BD003" "BD004" "BD005" "BD007" "BD008" "BD009" "BD010"
## [10] "BD012" "BD013" "BD014"
samplesheet$ctrl1 <- as.numeric(samplesheet$Sample_Name %in% ctrl1_grp)
trt1_grp <- as.character(mygroups[,2])
trt1_grp <- trt1_grp[which(trt1_grp != "")]
trt1_grp
## [1] "CT001" "CT002" "CT004" "CT009"
samplesheet$trt1 <- as.numeric(samplesheet$Sample_Name %in% trt1_grp)

# define control and trt groups for contrast 2
ctrl2_grp <- as.character(mygroups[,5])
ctrl2_grp <- ctrl2_grp[which(ctrl2_grp != "")]
ctrl2_grp
## [1] "BD001" "BD004" "BD005" "BD009" "BD013" "BD014"
samplesheet$ctrl2 <- as.numeric(samplesheet$Sample_Name %in% ctrl2_grp)
trt2_grp <- as.character(mygroups[,6])
trt2_grp <- trt2_grp[which(trt2_grp != "")]
trt2_grp
## [1] "BD002" "BD003" "BD007" "BD008" "BD010" "BD012"
samplesheet$trt2 <- as.numeric(samplesheet$Sample_Name %in% trt2_grp)

samplesheet
##    Sample_Name Age Gender ctrl1 trt1 ctrl2 trt2
## 1        BD001  38      F     1    0     1    0
## 2        BD002  59      F     1    0     0    1
## 3        BD003  36      F     1    0     0    1
## 4        BD004  42      F     1    0     1    0
## 5        BD005  68      M     1    0     1    0
## 6        BD007  23      F     1    0     0    1
## 7        BD008  35      F     1    0     0    1
## 8        BD009  63      M     1    0     1    0
## 9        BD010  38      F     1    0     0    1
## 10       BD012  50      M     1    0     0    1
## 11       BD013  58      F     1    0     1    0
## 12       BD014  66      F     1    0     1    0
## 13       CT001  39      F     0    1     0    0
## 14       CT002  42      F     0    1     0    0
## 15       CT004  55      F     0    1     0    0
## 16       CT009  53      M     0    1     0    0

Normalisation and filtering

mSet <- preprocessRaw(rgSet)
mSetSw <- SWAN(mSet,verbose=FALSE)
## [SWAN] Preparing normalization subset
## EPIC
## [SWAN] Normalizing methylated channel
## [SWAN] Normalizing unmethylated channel
par(mfrow=c(1,2), cex=.7)
densityByProbeType(mSet[,1], main = "Raw")
densityByProbeType(mSetSw[,1], main = "SWAN")

# filtering
# include sex chromosomes
detP <- detectionP(rgSet)
keep <- rowSums(detP < 0.01) == ncol(rgSet)
mSetSw <- mSetSw[keep,]
# exclude sex chromosomes
keep <- !(featureNames(mSetSw) %in% annEPIC$Name[annEPIC$chr %in% c("chrX","chrY")])
mSetFlt <- mSetSw[keep,]

# normalisation include sex chromosomes
meth <- getMeth(mSetSw)
unmeth <- getUnmeth(mSetSw)
Mval <- log2((meth + 100)/(unmeth + 100))
colnames(Mval) <- samplesheet$Sample_Name
beta <- getBeta(mSetSw)
dim(Mval)
## [1] 862639     16
# normalisation exclude sex chromosomes
meth <- getMeth(mSetFlt)
unmeth <- getUnmeth(mSetFlt)
Mval_flt <- log2((meth + 100)/(unmeth + 100))
colnames(Mval_flt) <- samplesheet$Sample_Name
beta_flt <- getBeta(mSetFlt)
dim(Mval_flt)
## [1] 843507     16

Multidimensional scaing analysis

MDS analysis is one of the first things we do with the data so that we can understand the main sources of variation in the study. The LHS MDS plot show that the male samples are clustered to the right of the chart and the female ones to the left. If we exclude sex chromosomes as per the RHS chart, samples labeled “CT” are to the left/bottom of the chart and there doesn’t appear to be any strong clustering of the other samples.

par(mfrow=c(1,2), cex=.7)
colour_palette=brewer.pal(n = length(levels(targets$Sample_Group)), name = "Paired")
## Warning in brewer.pal(n = length(levels(targets$Sample_Group)), name = "Paired"): minimal value for n is 3, returning requested palette with 3 different levels
colors <- colour_palette[as.integer(factor(targets$Sample_Group))]
plotMDS(Mval, labels=targets$Sample_Name,col=colors,main="sex chromosomes included")
legend("bottomright",legend=levels(factor(targets$Sample_Group)),pch=16,cex=1,col=colour_palette)
plotMDS(Mval_flt, labels=targets$Sample_Name,col=colors,main="sex chromosomes excluded")

Contrast 1

For these contrasts, I am comparing ctrl to trt1 samples, adjusting for sex and age. This analysis was performed in parallel with and without exclusion of the sex chromosomes. Exclusion of sex chromosomes did not alter the results with respect to group comparison. There was just one probe with differential methylation between groups (FDR<0.05). Probe cg02096172 had higher methylation in the trt1 group. The probe is situated in a site of DNase hypersensitivity and so could be important in gene regulation, but the probe not located close to any notable genes except a lncRNA gene RP3-429O6.1 (ENSG00000223342.2). There were no significant DMPs associated with age. There were 78 sex-associated sinificant DMPs on autosomes.

# DM analysis contrast 1 include sex chromosomes
ss <- subset(samplesheet,ctrl1==1 | trt1==1)
groups <- factor(ss$trt1)
sex <- factor(ss$Gender,levels=c("M","F"))
age <- ss$Age
design <- model.matrix(~ age + sex + groups)
mxs <- Mval[,which( colnames(Mval) %in% ss$Sample_Name )]
fit.reduced <- lmFit(mxs,design)
fit.reduced <- eBayes(fit.reduced)
summary(decideTests(fit.reduced))
##        (Intercept)    age   sexF groups1
## Down        261760      0   2512       0
## NotSig      145013 862639 854225  862638
## Up          455866      0   5902       1
dm1in <- topTable(fit.reduced,coef=4, number = Inf)
dm1ina <- merge(myann,dm1in,by=0)
dm1ina <- dm1ina[order(dm1ina$P.Value),]
head(dm1ina)
##         Row.names UCSC_RefGene_Name Regulatory_Feature_Group      logFC
## 72658  cg02096172                                             1.9998800
## 802363 cg25676166                                             1.1535353
## 460012 cg13910001     BPIFB6;BPIFB6                          -6.1503594
## 788355 cg25174338             DOCK5                          -3.8352648
## 679817 cg21242009                                             1.6926828
## 485304 cg14626785            GLT1D1                           0.7387053
##           AveExpr         t      P.Value   adj.P.Val         B
## 72658  0.43487012 11.086604 7.580966e-09 0.006539637 -1.568000
## 802363 1.18708227  8.627191 2.316781e-07 0.099927300 -1.837465
## 460012 0.05200183 -8.086297 5.378969e-07 0.154670284 -1.921540
## 788355 1.10004441 -7.773252 8.905147e-07 0.192048180 -1.975752
## 679817 1.20873680  7.138951 2.571624e-06 0.443676710 -2.100033
## 485304 2.45244036  6.969949 3.442288e-06 0.494908585 -2.136776
write.table(dm1ina,file="contrast1_incl.tsv",sep="\t",quote=F)

# DM analysis contrast 1 excl sex chromosomes
ss <- subset(samplesheet,ctrl1==1 | trt1==1)
groups <- factor(ss$trt1)
sex <- factor(ss$Gender,levels=c("M","F"))
age <- ss$Age
design <- model.matrix(~ age + sex + groups)
mxs <- Mval_flt[,which( colnames(Mval_flt) %in% ss$Sample_Name )]
fit.reduced <- lmFit(mxs,design)
fit.reduced <- eBayes(fit.reduced)
summary(decideTests(fit.reduced))
##        (Intercept)    age   sexF groups1
## Down        254523      0     25       0
## NotSig      140947 843507 843430  843506
## Up          448037      0     52       1
dm1ex <- topTable(fit.reduced,coef=4, number = Inf)
dm1exa <- merge(myann,dm1ex,by=0)
dm1exa <- dm1exa[order(dm1exa$P.Value),]
head(dm1exa)
##         Row.names UCSC_RefGene_Name Regulatory_Feature_Group      logFC
## 70880  cg02096172                                             1.9998800
## 784546 cg25676166                                             1.1535353
## 449521 cg13910001     BPIFB6;BPIFB6                          -6.1503594
## 770841 cg25174338             DOCK5                          -3.8352648
## 664535 cg21242009                                             1.6926828
## 474277 cg14626785            GLT1D1                           0.7387053
##           AveExpr         t      P.Value   adj.P.Val         B
## 70880  0.43487012 11.097879 7.345043e-09 0.006195595 -1.538799
## 784546 1.18708227  8.637373 2.252738e-07 0.095010029 -1.810951
## 449521 0.05200183 -8.093001 5.261698e-07 0.147942650 -1.896376
## 770841 1.10004441 -7.779816 8.716418e-07 0.183808997 -1.951146
## 664535 1.20873680  7.145612 2.518732e-06 0.424913576 -2.076609
## 474277 2.45244036  6.979672 3.354306e-06 0.471563455 -2.113013
write.table(dm1exa,file="contrast1_excl.tsv",sep="\t",quote=F)

Contrast 2

In this contrast, there were no statistically significant DMPs between ctrl and trt groups. Fifty two significant autosomal DMPs were associated with sex; none with age.

# DM analysis contrast 2 include sex chromosomes
ss <- subset(samplesheet,ctrl2==1 | trt2==1)
groups <- factor(ss$trt2)
sex <- factor(ss$Gender,levels=c("M","F"))
age <- ss$Age
design <- model.matrix(~ age + sex + groups)
mxs <- Mval[,which( colnames(Mval) %in% ss$Sample_Name )]
fit.reduced <- lmFit(mxs,design)
fit.reduced <- eBayes(fit.reduced)
summary(decideTests(fit.reduced))
##        (Intercept)    age   sexF groups1
## Down        251472      0   1946       0
## NotSig      211726 862639 854921  862639
## Up          399441      0   5772       0
dm2in <- topTable(fit.reduced,coef=4, number = Inf)
dm2ina <- merge(myann,dm2in,by=0)
dm2ina <- dm2ina[order(dm2ina$P.Value),]
head(dm2ina)
##         Row.names    UCSC_RefGene_Name Regulatory_Feature_Group      logFC
## 578705 cg17601621              ZNF705E                           1.9635987
## 753930 cg23957311  ZIC4;ZIC4;ZIC4;ZIC4                           1.1355741
## 549187 cg16658412 MAD1L1;MAD1L1;MAD1L1                          -1.0491241
## 519187 cg15721051                                                1.3925637
## 513096 cg15529950                                                1.0957670
## 152560 cg04457979              KCNQ1DN                           0.8712956
##           AveExpr         t      P.Value adj.P.Val         B
## 578705 -0.8917913  7.803016 4.945687e-06 0.9641669 -3.079328
## 753930  0.9597741  7.359729 8.898893e-06 0.9641669 -3.117379
## 549187  0.3667127 -7.309262 9.528686e-06 0.9641669 -3.122033
## 519187  2.5774496  7.052708 1.355592e-05 0.9641669 -3.146802
## 513096  0.8058672  7.020455 1.417849e-05 0.9641669 -3.150052
## 152560 -2.8341753  7.014847 1.428982e-05 0.9641669 -3.150620
write.table(dm2ina,file="contrast2_incl.tsv",sep="\t",quote=F)

# DM analysis contrast 2 excl sex chromosomes
ss <- subset(samplesheet,ctrl2==1 | trt2==1)
groups <- factor(ss$trt2)
sex <- factor(ss$Gender,levels=c("M","F"))
age <- ss$Age
design <- model.matrix(~ age + sex + groups)
mxs <- Mval_flt[,which( colnames(Mval_flt) %in% ss$Sample_Name )]
fit.reduced <- lmFit(mxs,design)
fit.reduced <- eBayes(fit.reduced)
summary(decideTests(fit.reduced))
##        (Intercept)    age   sexF groups1
## Down        244482      0      7       0
## NotSig      205820 843507 843458  843507
## Up          393205      0     42       0
dm2ex <- topTable(fit.reduced,coef=4, number = Inf)
dm2exa <- merge(myann,dm2ex,by=0)
dm2exa <- dm2exa[order(dm2exa$P.Value),]
head(dm2exa)
##         Row.names    UCSC_RefGene_Name Regulatory_Feature_Group      logFC
## 565534 cg17601621              ZNF705E                           1.9635987
## 737107 cg23957311  ZIC4;ZIC4;ZIC4;ZIC4                           1.1355741
## 536749 cg16658412 MAD1L1;MAD1L1;MAD1L1                          -1.0491241
## 507410 cg15721051                                                1.3925637
## 501457 cg15529950                                                1.0957670
## 148971 cg04457979              KCNQ1DN                           0.8712956
##           AveExpr         t      P.Value adj.P.Val         B
## 565534 -0.8917913  7.809881 4.869007e-06 0.9678096 -3.067209
## 737107  0.9597741  7.370359 8.717863e-06 0.9678096 -3.105192
## 536749  0.3667127 -7.320849 9.322673e-06 0.9678096 -3.109785
## 507410  2.5774496  7.060409 1.333383e-05 0.9678096 -3.135089
## 501457  0.8058672  7.030451 1.390178e-05 0.9678096 -3.138128
## 148971 -2.8341753  7.028432 1.394097e-05 0.9678096 -3.138333
write.table(dm2exa,file="contrast2_excl.tsv",sep="\t",quote=F)

Contrast 3

In this contrast, Group 1 consists of BD001, BD004, BD005, BD009, BD013 and BD014 while group2 consists of BD002, BD003, BD007, BD008, BD010 and BD012

There were no statistically significant DMPs between ctrl and trt groups.

# DM analysis contrast 3 include sex chromosomes
g1 <- c("BD001","BD004","BD009","BD013","BD014")
g2 <- c("BD002","BD003","BD007","BD008","BD010","BD012")

samplesheet$ctrl3 <- as.numeric(samplesheet$Sample_Name %in% g1)
samplesheet$trt3 <- as.numeric(samplesheet$Sample_Name %in% g2)

ss <- subset(samplesheet,ctrl3==1 | trt3==1)
groups <- factor(ss$trt3)
sex <- factor(ss$Gender,levels=c("M","F"))
age <- ss$Age
design <- model.matrix(~ age + sex + groups)
mxs <- Mval[,which( colnames(Mval) %in% ss$Sample_Name )]
fit.reduced <- lmFit(mxs,design)
fit.reduced <- eBayes(fit.reduced)
summary(decideTests(fit.reduced))
##        (Intercept)    age   sexF groups1
## Down        253083      0   1676       0
## NotSig      203414 862639 855416  862639
## Up          406142      0   5547       0
dm3in <- topTable(fit.reduced,coef=4, number = Inf)
dm3ina <- merge(myann,dm3in,by=0)
dm3ina <- dm3ina[order(dm3ina$P.Value),]
head(dm3ina)
##         Row.names   UCSC_RefGene_Name           Regulatory_Feature_Group
## 586473 cg17868694              MOGAT3                                   
## 753930 cg23957311 ZIC4;ZIC4;ZIC4;ZIC4                                   
## 427664 cg12929486            SLC22A16                                   
## 99021  cg02869783                                                       
## 425870 cg12870014            ANKRD13A Gene_Associated_Cell_type_specific
## 578705 cg17601621             ZNF705E                                   
##            logFC    AveExpr         t      P.Value adj.P.Val         B
## 586473  1.273595  0.6801242  8.191467 5.000721e-06 0.9907332 -3.143851
## 753930  1.093862  1.0191519  8.004407 6.241078e-06 0.9907332 -3.156118
## 427664 -1.728111 -2.8122311 -7.772324 8.259518e-06 0.9907332 -3.172293
## 99021   2.548374  0.7912909  7.748371 8.504782e-06 0.9907332 -3.174026
## 425870 -1.172572 -0.5783429 -7.564940 1.066407e-05 0.9907332 -3.187712
## 578705  1.932875 -0.8178293  7.537259 1.103817e-05 0.9907332 -3.189843
write.table(dm3ina,file="contrast3_incl.tsv",sep="\t",quote=F)

# DM analysis contrast 3 excl sex chromosomes
ss <- subset(samplesheet,ctrl3==1 | trt3==1)
groups <- factor(ss$trt3)
sex <- factor(ss$Gender,levels=c("M","F"))
age <- ss$Age
design <- model.matrix(~ age + sex + groups)
mxs <- Mval_flt[,which( colnames(Mval_flt) %in% ss$Sample_Name )]
fit.reduced <- lmFit(mxs,design)
fit.reduced <- eBayes(fit.reduced)
summary(decideTests(fit.reduced))
##        (Intercept)    age   sexF groups1
## Down        246081      0     11       0
## NotSig      197577 843507 843450  843507
## Up          399849      0     46       0
dm3ex <- topTable(fit.reduced,coef=4, number = Inf)
dm3exa <- merge(myann,dm3ex,by=0)
dm3exa <- dm3exa[order(dm3exa$P.Value),]
head(dm3exa)
##         Row.names   UCSC_RefGene_Name           Regulatory_Feature_Group
## 573117 cg17868694              MOGAT3                                   
## 737107 cg23957311 ZIC4;ZIC4;ZIC4;ZIC4                                   
## 417905 cg12929486            SLC22A16                                   
## 96637  cg02869783                                                       
## 416155 cg12870014            ANKRD13A Gene_Associated_Cell_type_specific
## 565534 cg17601621             ZNF705E                                   
##            logFC    AveExpr         t      P.Value adj.P.Val         B
## 573117  1.273595  0.6801242  8.203536 4.866561e-06 0.9934407 -3.131231
## 737107  1.093862  1.0191519  8.017307 6.068452e-06 0.9934407 -3.143529
## 417905 -1.728111 -2.8122311 -7.781910 8.064808e-06 0.9934407 -3.160054
## 96637   2.548374  0.7912909  7.756963 8.314605e-06 0.9934407 -3.161873
## 416155 -1.172572 -0.5783429 -7.576108 1.039434e-05 0.9934407 -3.175468
## 565534  1.932875 -0.8178293  7.546126 1.079016e-05 0.9934407 -3.177793
write.table(dm3exa,file="contrast3_excl.tsv",sep="\t",quote=F)

Potential downstream work

There are a lot of follow-up analyses and visualisations that are possible like heatmaps, volcano charts, beeswarm charts, gene ontology and the like but it would be good have your opinion of this analysis first before we continue with that.

References

Phipson B, Maksimovic J, Oshlack A. missMethyl: an R package for analyzing data from Illumina’s HumanMethylation450 platform. Bioinformatics. 2016;32(2):286‐288. doi:10.1093/bioinformatics/btv560

Ritchie ME, Phipson B, Wu D, et al. limma powers differential expression analyses for RNA-sequencing and microarray studies. Nucleic Acids Res. 2015;43(7):e47. doi:10.1093/nar/gkv007