On the 18th May, Chiara shared some Illumina Epic methylation array data on the Deakin sharepoint which I transferred to my Nectar Webserver. There were 16 samples. The sample metadata was provided in the “Groups.csv” file which outlined the two contrasts to be carried out. On the 18th Aug Chiara provided a 3rd contrast:
Group 1: BD001, BD004, BD005, BD009, BD013,BD014
Group 2: BD002, BD003, BD007, BD008, BD010, BD012
My general approach here is to use the MissMethyl package (Phipson et al, 2016) to import, normalise, filter and then limma to perform differential methylation analysis (Ritchie et al, 2015).
suppressPackageStartupMessages({
library("R.utils")
library("missMethyl")
library("limma")
library("minfi")
library("IlluminaHumanMethylationEPICmanifest")
library("RColorBrewer")
library("IlluminaHumanMethylationEPICanno.ilm10b4.hg19")
library("eulerr")
library("plyr")
library("gplots")
library("reshape2")
library("beeswarm")
})
# need some annotations
annEPIC <- getAnnotation(IlluminaHumanMethylationEPICanno.ilm10b4.hg19)
myann <- data.frame(annEPIC[,c("UCSC_RefGene_Name","Regulatory_Feature_Group")])
promoters <- grep("Prom",myann$Regulatory_Feature_Group)
baseDir <- "ILMLEPIC-15799"
targets <- read.metharray.sheet(baseDir)
## [read.metharray.sheet] Found the following CSV files:
## [1] "ILMLEPIC-15799/ILMLEPIC-15799_SampleSheet.csv"
head(targets)
## Sample_Name Sample_Well Sample_Plate Sample_Group Pool_ID Array Slide
## 1 BD001 A09 1 Group1 <NA> R01C01 203845510076
## 2 BD002 B09 1 Group1 <NA> R02C01 203845510076
## 3 BD003 C09 1 Group1 <NA> R03C01 203845510076
## 4 BD004 D09 1 Group1 <NA> R04C01 203845510076
## 5 BD005 E09 1 Group1 <NA> R05C01 203845510076
## 6 BD007 F09 1 Group1 <NA> R06C01 203845510076
## Basename
## 1 ILMLEPIC-15799/203845510076/203845510076_R01C01
## 2 ILMLEPIC-15799/203845510076/203845510076_R02C01
## 3 ILMLEPIC-15799/203845510076/203845510076_R03C01
## 4 ILMLEPIC-15799/203845510076/203845510076_R04C01
## 5 ILMLEPIC-15799/203845510076/203845510076_R05C01
## 6 ILMLEPIC-15799/203845510076/203845510076_R06C01
rgSet <- read.metharray.exp(targets = targets)
rgSet
## class: RGChannelSet
## dim: 1051815 16
## metadata(0):
## assays(2): Green Red
## rownames(1051815): 1600101 1600111 ... 99810990 99810992
## rowData names(0):
## colnames(16): 203845510076_R01C01 203845510076_R02C01 ...
## 203845510104_R07C01 203845510104_R08C01
## colData names(9): Sample_Name Sample_Well ... Basename filenames
## Annotation
## array: IlluminaHumanMethylationEPIC
## annotation: ilm10b4.hg19
mygroups <- read.csv("Groups.csv")
samplesheet <- mygroups[,9:11]
colnames(samplesheet)[1] <- "Sample_Name"
# define control and trt groups for contrast 1
ctrl1_grp <- as.character(mygroups[,1])
ctrl1_grp <- ctrl1_grp[which(ctrl1_grp != "")]
ctrl1_grp
## [1] "BD001" "BD002" "BD003" "BD004" "BD005" "BD007" "BD008" "BD009" "BD010"
## [10] "BD012" "BD013" "BD014"
samplesheet$ctrl1 <- as.numeric(samplesheet$Sample_Name %in% ctrl1_grp)
trt1_grp <- as.character(mygroups[,2])
trt1_grp <- trt1_grp[which(trt1_grp != "")]
trt1_grp
## [1] "CT001" "CT002" "CT004" "CT009"
samplesheet$trt1 <- as.numeric(samplesheet$Sample_Name %in% trt1_grp)
# define control and trt groups for contrast 2
ctrl2_grp <- as.character(mygroups[,5])
ctrl2_grp <- ctrl2_grp[which(ctrl2_grp != "")]
ctrl2_grp
## [1] "BD001" "BD004" "BD005" "BD009" "BD013" "BD014"
samplesheet$ctrl2 <- as.numeric(samplesheet$Sample_Name %in% ctrl2_grp)
trt2_grp <- as.character(mygroups[,6])
trt2_grp <- trt2_grp[which(trt2_grp != "")]
trt2_grp
## [1] "BD002" "BD003" "BD007" "BD008" "BD010" "BD012"
samplesheet$trt2 <- as.numeric(samplesheet$Sample_Name %in% trt2_grp)
samplesheet
## Sample_Name Age Gender ctrl1 trt1 ctrl2 trt2
## 1 BD001 38 F 1 0 1 0
## 2 BD002 59 F 1 0 0 1
## 3 BD003 36 F 1 0 0 1
## 4 BD004 42 F 1 0 1 0
## 5 BD005 68 M 1 0 1 0
## 6 BD007 23 F 1 0 0 1
## 7 BD008 35 F 1 0 0 1
## 8 BD009 63 M 1 0 1 0
## 9 BD010 38 F 1 0 0 1
## 10 BD012 50 M 1 0 0 1
## 11 BD013 58 F 1 0 1 0
## 12 BD014 66 F 1 0 1 0
## 13 CT001 39 F 0 1 0 0
## 14 CT002 42 F 0 1 0 0
## 15 CT004 55 F 0 1 0 0
## 16 CT009 53 M 0 1 0 0
mSet <- preprocessRaw(rgSet)
mSetSw <- SWAN(mSet,verbose=FALSE)
## [SWAN] Preparing normalization subset
## EPIC
## [SWAN] Normalizing methylated channel
## [SWAN] Normalizing unmethylated channel
par(mfrow=c(1,2), cex=.7)
densityByProbeType(mSet[,1], main = "Raw")
densityByProbeType(mSetSw[,1], main = "SWAN")
# filtering
# include sex chromosomes
detP <- detectionP(rgSet)
keep <- rowSums(detP < 0.01) == ncol(rgSet)
mSetSw <- mSetSw[keep,]
# exclude sex chromosomes
keep <- !(featureNames(mSetSw) %in% annEPIC$Name[annEPIC$chr %in% c("chrX","chrY")])
mSetFlt <- mSetSw[keep,]
# normalisation include sex chromosomes
meth <- getMeth(mSetSw)
unmeth <- getUnmeth(mSetSw)
Mval <- log2((meth + 100)/(unmeth + 100))
colnames(Mval) <- samplesheet$Sample_Name
beta <- getBeta(mSetSw)
dim(Mval)
## [1] 862639 16
# normalisation exclude sex chromosomes
meth <- getMeth(mSetFlt)
unmeth <- getUnmeth(mSetFlt)
Mval_flt <- log2((meth + 100)/(unmeth + 100))
colnames(Mval_flt) <- samplesheet$Sample_Name
beta_flt <- getBeta(mSetFlt)
dim(Mval_flt)
## [1] 843507 16
MDS analysis is one of the first things we do with the data so that we can understand the main sources of variation in the study. The LHS MDS plot show that the male samples are clustered to the right of the chart and the female ones to the left. If we exclude sex chromosomes as per the RHS chart, samples labeled “CT” are to the left/bottom of the chart and there doesn’t appear to be any strong clustering of the other samples.
par(mfrow=c(1,2), cex=.7)
colour_palette=brewer.pal(n = length(levels(targets$Sample_Group)), name = "Paired")
## Warning in brewer.pal(n = length(levels(targets$Sample_Group)), name = "Paired"): minimal value for n is 3, returning requested palette with 3 different levels
colors <- colour_palette[as.integer(factor(targets$Sample_Group))]
plotMDS(Mval, labels=targets$Sample_Name,col=colors,main="sex chromosomes included")
legend("bottomright",legend=levels(factor(targets$Sample_Group)),pch=16,cex=1,col=colour_palette)
plotMDS(Mval_flt, labels=targets$Sample_Name,col=colors,main="sex chromosomes excluded")
For these contrasts, I am comparing ctrl
to trt1
samples, adjusting for sex and age. This analysis was performed in parallel with and without exclusion of the sex chromosomes. Exclusion of sex chromosomes did not alter the results with respect to group comparison. There was just one probe with differential methylation between groups (FDR<0.05). Probe cg02096172 had higher methylation in the trt1 group. The probe is situated in a site of DNase hypersensitivity and so could be important in gene regulation, but the probe not located close to any notable genes except a lncRNA gene RP3-429O6.1 (ENSG00000223342.2). There were no significant DMPs associated with age. There were 78 sex-associated sinificant DMPs on autosomes.
# DM analysis contrast 1 include sex chromosomes
ss <- subset(samplesheet,ctrl1==1 | trt1==1)
groups <- factor(ss$trt1)
sex <- factor(ss$Gender,levels=c("M","F"))
age <- ss$Age
design <- model.matrix(~ age + sex + groups)
mxs <- Mval[,which( colnames(Mval) %in% ss$Sample_Name )]
fit.reduced <- lmFit(mxs,design)
fit.reduced <- eBayes(fit.reduced)
summary(decideTests(fit.reduced))
## (Intercept) age sexF groups1
## Down 261760 0 2512 0
## NotSig 145013 862639 854225 862638
## Up 455866 0 5902 1
dm1in <- topTable(fit.reduced,coef=4, number = Inf)
dm1ina <- merge(myann,dm1in,by=0)
dm1ina <- dm1ina[order(dm1ina$P.Value),]
head(dm1ina)
## Row.names UCSC_RefGene_Name Regulatory_Feature_Group logFC
## 72658 cg02096172 1.9998800
## 802363 cg25676166 1.1535353
## 460012 cg13910001 BPIFB6;BPIFB6 -6.1503594
## 788355 cg25174338 DOCK5 -3.8352648
## 679817 cg21242009 1.6926828
## 485304 cg14626785 GLT1D1 0.7387053
## AveExpr t P.Value adj.P.Val B
## 72658 0.43487012 11.086604 7.580966e-09 0.006539637 -1.568000
## 802363 1.18708227 8.627191 2.316781e-07 0.099927300 -1.837465
## 460012 0.05200183 -8.086297 5.378969e-07 0.154670284 -1.921540
## 788355 1.10004441 -7.773252 8.905147e-07 0.192048180 -1.975752
## 679817 1.20873680 7.138951 2.571624e-06 0.443676710 -2.100033
## 485304 2.45244036 6.969949 3.442288e-06 0.494908585 -2.136776
write.table(dm1ina,file="contrast1_incl.tsv",sep="\t",quote=F)
# DM analysis contrast 1 excl sex chromosomes
ss <- subset(samplesheet,ctrl1==1 | trt1==1)
groups <- factor(ss$trt1)
sex <- factor(ss$Gender,levels=c("M","F"))
age <- ss$Age
design <- model.matrix(~ age + sex + groups)
mxs <- Mval_flt[,which( colnames(Mval_flt) %in% ss$Sample_Name )]
fit.reduced <- lmFit(mxs,design)
fit.reduced <- eBayes(fit.reduced)
summary(decideTests(fit.reduced))
## (Intercept) age sexF groups1
## Down 254523 0 25 0
## NotSig 140947 843507 843430 843506
## Up 448037 0 52 1
dm1ex <- topTable(fit.reduced,coef=4, number = Inf)
dm1exa <- merge(myann,dm1ex,by=0)
dm1exa <- dm1exa[order(dm1exa$P.Value),]
head(dm1exa)
## Row.names UCSC_RefGene_Name Regulatory_Feature_Group logFC
## 70880 cg02096172 1.9998800
## 784546 cg25676166 1.1535353
## 449521 cg13910001 BPIFB6;BPIFB6 -6.1503594
## 770841 cg25174338 DOCK5 -3.8352648
## 664535 cg21242009 1.6926828
## 474277 cg14626785 GLT1D1 0.7387053
## AveExpr t P.Value adj.P.Val B
## 70880 0.43487012 11.097879 7.345043e-09 0.006195595 -1.538799
## 784546 1.18708227 8.637373 2.252738e-07 0.095010029 -1.810951
## 449521 0.05200183 -8.093001 5.261698e-07 0.147942650 -1.896376
## 770841 1.10004441 -7.779816 8.716418e-07 0.183808997 -1.951146
## 664535 1.20873680 7.145612 2.518732e-06 0.424913576 -2.076609
## 474277 2.45244036 6.979672 3.354306e-06 0.471563455 -2.113013
write.table(dm1exa,file="contrast1_excl.tsv",sep="\t",quote=F)
In this contrast, there were no statistically significant DMPs between ctrl and trt groups. Fifty two significant autosomal DMPs were associated with sex; none with age.
# DM analysis contrast 2 include sex chromosomes
ss <- subset(samplesheet,ctrl2==1 | trt2==1)
groups <- factor(ss$trt2)
sex <- factor(ss$Gender,levels=c("M","F"))
age <- ss$Age
design <- model.matrix(~ age + sex + groups)
mxs <- Mval[,which( colnames(Mval) %in% ss$Sample_Name )]
fit.reduced <- lmFit(mxs,design)
fit.reduced <- eBayes(fit.reduced)
summary(decideTests(fit.reduced))
## (Intercept) age sexF groups1
## Down 251472 0 1946 0
## NotSig 211726 862639 854921 862639
## Up 399441 0 5772 0
dm2in <- topTable(fit.reduced,coef=4, number = Inf)
dm2ina <- merge(myann,dm2in,by=0)
dm2ina <- dm2ina[order(dm2ina$P.Value),]
head(dm2ina)
## Row.names UCSC_RefGene_Name Regulatory_Feature_Group logFC
## 578705 cg17601621 ZNF705E 1.9635987
## 753930 cg23957311 ZIC4;ZIC4;ZIC4;ZIC4 1.1355741
## 549187 cg16658412 MAD1L1;MAD1L1;MAD1L1 -1.0491241
## 519187 cg15721051 1.3925637
## 513096 cg15529950 1.0957670
## 152560 cg04457979 KCNQ1DN 0.8712956
## AveExpr t P.Value adj.P.Val B
## 578705 -0.8917913 7.803016 4.945687e-06 0.9641669 -3.079328
## 753930 0.9597741 7.359729 8.898893e-06 0.9641669 -3.117379
## 549187 0.3667127 -7.309262 9.528686e-06 0.9641669 -3.122033
## 519187 2.5774496 7.052708 1.355592e-05 0.9641669 -3.146802
## 513096 0.8058672 7.020455 1.417849e-05 0.9641669 -3.150052
## 152560 -2.8341753 7.014847 1.428982e-05 0.9641669 -3.150620
write.table(dm2ina,file="contrast2_incl.tsv",sep="\t",quote=F)
# DM analysis contrast 2 excl sex chromosomes
ss <- subset(samplesheet,ctrl2==1 | trt2==1)
groups <- factor(ss$trt2)
sex <- factor(ss$Gender,levels=c("M","F"))
age <- ss$Age
design <- model.matrix(~ age + sex + groups)
mxs <- Mval_flt[,which( colnames(Mval_flt) %in% ss$Sample_Name )]
fit.reduced <- lmFit(mxs,design)
fit.reduced <- eBayes(fit.reduced)
summary(decideTests(fit.reduced))
## (Intercept) age sexF groups1
## Down 244482 0 7 0
## NotSig 205820 843507 843458 843507
## Up 393205 0 42 0
dm2ex <- topTable(fit.reduced,coef=4, number = Inf)
dm2exa <- merge(myann,dm2ex,by=0)
dm2exa <- dm2exa[order(dm2exa$P.Value),]
head(dm2exa)
## Row.names UCSC_RefGene_Name Regulatory_Feature_Group logFC
## 565534 cg17601621 ZNF705E 1.9635987
## 737107 cg23957311 ZIC4;ZIC4;ZIC4;ZIC4 1.1355741
## 536749 cg16658412 MAD1L1;MAD1L1;MAD1L1 -1.0491241
## 507410 cg15721051 1.3925637
## 501457 cg15529950 1.0957670
## 148971 cg04457979 KCNQ1DN 0.8712956
## AveExpr t P.Value adj.P.Val B
## 565534 -0.8917913 7.809881 4.869007e-06 0.9678096 -3.067209
## 737107 0.9597741 7.370359 8.717863e-06 0.9678096 -3.105192
## 536749 0.3667127 -7.320849 9.322673e-06 0.9678096 -3.109785
## 507410 2.5774496 7.060409 1.333383e-05 0.9678096 -3.135089
## 501457 0.8058672 7.030451 1.390178e-05 0.9678096 -3.138128
## 148971 -2.8341753 7.028432 1.394097e-05 0.9678096 -3.138333
write.table(dm2exa,file="contrast2_excl.tsv",sep="\t",quote=F)
In this contrast, Group 1 consists of BD001, BD004, BD005, BD009, BD013 and BD014 while group2 consists of BD002, BD003, BD007, BD008, BD010 and BD012
There were no statistically significant DMPs between ctrl and trt groups.
# DM analysis contrast 3 include sex chromosomes
g1 <- c("BD001","BD004","BD009","BD013","BD014")
g2 <- c("BD002","BD003","BD007","BD008","BD010","BD012")
samplesheet$ctrl3 <- as.numeric(samplesheet$Sample_Name %in% g1)
samplesheet$trt3 <- as.numeric(samplesheet$Sample_Name %in% g2)
ss <- subset(samplesheet,ctrl3==1 | trt3==1)
groups <- factor(ss$trt3)
sex <- factor(ss$Gender,levels=c("M","F"))
age <- ss$Age
design <- model.matrix(~ age + sex + groups)
mxs <- Mval[,which( colnames(Mval) %in% ss$Sample_Name )]
fit.reduced <- lmFit(mxs,design)
fit.reduced <- eBayes(fit.reduced)
summary(decideTests(fit.reduced))
## (Intercept) age sexF groups1
## Down 253083 0 1676 0
## NotSig 203414 862639 855416 862639
## Up 406142 0 5547 0
dm3in <- topTable(fit.reduced,coef=4, number = Inf)
dm3ina <- merge(myann,dm3in,by=0)
dm3ina <- dm3ina[order(dm3ina$P.Value),]
head(dm3ina)
## Row.names UCSC_RefGene_Name Regulatory_Feature_Group
## 586473 cg17868694 MOGAT3
## 753930 cg23957311 ZIC4;ZIC4;ZIC4;ZIC4
## 427664 cg12929486 SLC22A16
## 99021 cg02869783
## 425870 cg12870014 ANKRD13A Gene_Associated_Cell_type_specific
## 578705 cg17601621 ZNF705E
## logFC AveExpr t P.Value adj.P.Val B
## 586473 1.273595 0.6801242 8.191467 5.000721e-06 0.9907332 -3.143851
## 753930 1.093862 1.0191519 8.004407 6.241078e-06 0.9907332 -3.156118
## 427664 -1.728111 -2.8122311 -7.772324 8.259518e-06 0.9907332 -3.172293
## 99021 2.548374 0.7912909 7.748371 8.504782e-06 0.9907332 -3.174026
## 425870 -1.172572 -0.5783429 -7.564940 1.066407e-05 0.9907332 -3.187712
## 578705 1.932875 -0.8178293 7.537259 1.103817e-05 0.9907332 -3.189843
write.table(dm3ina,file="contrast3_incl.tsv",sep="\t",quote=F)
# DM analysis contrast 3 excl sex chromosomes
ss <- subset(samplesheet,ctrl3==1 | trt3==1)
groups <- factor(ss$trt3)
sex <- factor(ss$Gender,levels=c("M","F"))
age <- ss$Age
design <- model.matrix(~ age + sex + groups)
mxs <- Mval_flt[,which( colnames(Mval_flt) %in% ss$Sample_Name )]
fit.reduced <- lmFit(mxs,design)
fit.reduced <- eBayes(fit.reduced)
summary(decideTests(fit.reduced))
## (Intercept) age sexF groups1
## Down 246081 0 11 0
## NotSig 197577 843507 843450 843507
## Up 399849 0 46 0
dm3ex <- topTable(fit.reduced,coef=4, number = Inf)
dm3exa <- merge(myann,dm3ex,by=0)
dm3exa <- dm3exa[order(dm3exa$P.Value),]
head(dm3exa)
## Row.names UCSC_RefGene_Name Regulatory_Feature_Group
## 573117 cg17868694 MOGAT3
## 737107 cg23957311 ZIC4;ZIC4;ZIC4;ZIC4
## 417905 cg12929486 SLC22A16
## 96637 cg02869783
## 416155 cg12870014 ANKRD13A Gene_Associated_Cell_type_specific
## 565534 cg17601621 ZNF705E
## logFC AveExpr t P.Value adj.P.Val B
## 573117 1.273595 0.6801242 8.203536 4.866561e-06 0.9934407 -3.131231
## 737107 1.093862 1.0191519 8.017307 6.068452e-06 0.9934407 -3.143529
## 417905 -1.728111 -2.8122311 -7.781910 8.064808e-06 0.9934407 -3.160054
## 96637 2.548374 0.7912909 7.756963 8.314605e-06 0.9934407 -3.161873
## 416155 -1.172572 -0.5783429 -7.576108 1.039434e-05 0.9934407 -3.175468
## 565534 1.932875 -0.8178293 7.546126 1.079016e-05 0.9934407 -3.177793
write.table(dm3exa,file="contrast3_excl.tsv",sep="\t",quote=F)
There are a lot of follow-up analyses and visualisations that are possible like heatmaps, volcano charts, beeswarm charts, gene ontology and the like but it would be good have your opinion of this analysis first before we continue with that.
Phipson B, Maksimovic J, Oshlack A. missMethyl: an R package for analyzing data from Illumina’s HumanMethylation450 platform. Bioinformatics. 2016;32(2):286‐288. doi:10.1093/bioinformatics/btv560
Ritchie ME, Phipson B, Wu D, et al. limma powers differential expression analyses for RNA-sequencing and microarray studies. Nucleic Acids Res. 2015;43(7):e47. doi:10.1093/nar/gkv007