Source: https://github.com/markziemann/asd_meth

Introduction

Here I will be running a comparison of differential methylation data from guthrie and fresh blood samples.

Here are the files that I’m using:

  • ASD_blood_top_dmps_onADOS_withintw_Nov27_limma.csv

  • ASD_guthrie_top_dmps_onADOS_withintw_Nov27_limma.csv

suppressPackageStartupMessages({
  library("parallel")
  library("mitch")
  library("IlluminaHumanMethylationEPICanno.ilm10b4.hg19")
  source("https://raw.githubusercontent.com/markziemann/gmea/main/meth_functions.R")
  library("data.table")
  library("kableExtra")
  library("eulerr")
  library("tictoc")
})

Load data

bl_design <- readRDS(file="bl_design.rds")
bl_mvals <- readRDS(file="bl_mvals.rds")

gu_design <- readRDS(file="gu_design.rds")
gu_mvals <- readRDS(file="gu_mvals.rds")

Probe sets

For each gene, extract out the probes.

anno <- getAnnotation(IlluminaHumanMethylationEPICanno.ilm10b4.hg19)
myann <- data.frame(anno[,c("UCSC_RefGene_Name","Regulatory_Feature_Group","Islands_Name","Relation_to_Island")])
promoters <- grep("Prom",myann$Regulatory_Feature_Group)
promoters <- myann[promoters,]
gp <- myann[,"UCSC_RefGene_Name",drop=FALSE]
gp2 <- strsplit(gp$UCSC_RefGene_Name,";")
names(gp2) <- rownames(gp)
sets <- split(rep(names(gp2), lengths(gp2)), unlist(gp2))
summary(unlist(lapply(sets,length)))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    9.00   24.00   49.68   55.00 6778.00

GMEA functions

This is an enrichment technique.

# get the median, mean and 1-sample t-test result for each gene
pmea <- function(mval,design,sets,cores=2) {
  fit <- lmFit(mval, design)
  fit <- eBayes(fit)
  top <- topTable(fit,coef=ncol(design),num=Inf, sort.by = "P")
  l <- mclapply(seq(1,length(sets)), function(i) {
    g <- names(sets[i])
    tstats <- top[rownames(top) %in% sets[[i]],"t"]
    myn <- length(tstats)
    mymean <- mean(tstats)
    mymedian <- median(tstats)
    if ( length(tstats) < 2 ) {
      pval=1
    } else {
      wtselfcont <- t.test(tstats)
      pval=wtselfcont$p.value
    }
    c("gene"=g,"nprobes"=myn,"mean"=mymean,"median"=mymedian,
      "P.Value"=pval)
  } , mc.cores=cores)

  df <- do.call(rbind, l)
  rownames(df) <- df[,1]
  df <- df[,-1]
  tmp <- apply(df,2,as.numeric)
  rownames(tmp) <- rownames(df)
  df <- as.data.frame(tmp)
  df$sig <- -log10(df[,4])
  df <- df[order(-df$sig),]
  df$FDR <- p.adjust(df$P.Value)
  out <- list("df"=df,"toptable"=top)
  return(out)
}
# pmea_res <- pmea(mval=mval,design=design,sets=head(sets,20),cores=detectCores()/2)

# Run the fry test for each gene. This is a more conservative test.
run_fry <- function(mval,design,sets,cores=2) {
  split_sets <- split(sets, ceiling(seq_along(sets)/200))
  fry_l <- mclapply(split_sets,function(l) {
    fry(y=mval, index = l, design = design,
      contrast = ncol(design) )
  } , mc.cores=cores )
  fry_res <- do.call(rbind,fry_l)
  rownames(fry_res) <- sub("\\.","@",rownames(fry_res))
  rownames(fry_res) <- sapply(strsplit(rownames(fry_res),"@"),"[[",2)
  fry_res[is.na(fry_res$PValue),"PValue"] <- 1
  fry_res <- fry_res[order(fry_res$PValue),]
  fry_res$FDR <- p.adjust(fry_res$PValue,method="fdr")
  return(fry_res)
}
#fry_res <- run_fry(mval=mval,design=design,sets=sets,cores=cores)

# main function to perform 1-sample t-test and fry test and merge the results.
main <- function(mval,design,sets,cores=2){
  pmea_res <- pmea(mval=mval,design=design,sets=sets,cores=cores)
  pmea_df <- pmea_res[[1]]
  limma_df <- pmea_res[[2]]
  fry_res <- run_fry(mval=mval,design=design,sets=sets,cores=cores)
  m <- merge(pmea_df,fry_res,by=0)
  rownames(m) <- m$Row.names
  m$Row.names = NULL
  m <- m[,c("nprobes","median","PValue","FDR.y")]
  colnames(m) <- c("nprobes","median","PValue","FDR")
  m <- m[order(m$PValue),]
  out <- list("res"=m,"limma_df"=limma_df)
  return(out)
}
#res <- main(mval,design,sets,cores=detectCores()/2)

probe_bias <- function(res) {
  res$sig <- -log10(res$PValue)
  sig <- subset(res,FDR < 0.05)
  plot(res$nprobes,res$sig,log="x",
    pch=19,cex=0.6,
    xlab="no. probes",ylab="-log10(p-value)")
  points(sig$nprobes,sig$sig,col="red",pch=19,cex=0.62)
  SIG = nrow(sig)
  TOT = nrow(res)
  HEADER <- paste(TOT, "genes in total.", SIG, "with FDR<0.05.")
  mtext(HEADER)
}

volcano_plot <- function(res) {
  res$sig <- -log10(res$PValue)
  sig <- subset(res,FDR < 0.05)
  plot(res$median,res$sig,
    pch=19,cex=0.6,
    xlab="median t-statistic",ylab="-log10(p-value)")
  points(sig$median,sig$sig,col="red",pch=19,cex=0.62)
  SIG = nrow(sig)
  UP = nrow(subset(sig,median>0))
  DN = nrow(subset(sig,median<0))
  TOT = nrow(res)
  HEADER <- paste(TOT, "genes in total.", SIG, "with FDR<0.05;",DN,"down,",UP,"up")
  mtext(HEADER)
}

gmea_boxplot <- function(res,sets,n=50) {
  df <- res[[1]]
  limma_df <- res[[2]]
  # smallest pval
  par(mfrow=c(1,2))
  gs <- head(rownames(df),n)
  mysets <- sets[names(sets) %in% gs]
  tstats <- lapply(mysets, function(set) {
    limma_df[rownames(limma_df) %in% set,"t"]
  })
  tstats <- tstats[order(unlist(lapply(tstats,median)))]
  boxplot(tstats,horizontal=TRUE,las=1,
    main="smallest p-val",cex.axis=0.6,
    xlab="t-statistic")
  grid()
  # biggest effect size (median)
  sig <- subset(df,FDR < 0.05)
  gs <- head(rownames(sig[order(-abs(sig$median)),]),n)
  if ( length(gs) >2 ) {
    tstats <- lapply(gs, function(g) {
      df[which(df$genes==g),"tvals"]
    })
    names(tstats) <- gs
    tstats <- tstats[order(unlist(lapply(tstats,median)))]
    boxplot(tstats,horizontal=TRUE,las=1,
      main="biggest effect size(median)",cex.axis=0.6,
      xlab="t-statistic")
    grid()
  } else {
    plot(1)
    mtext("too few significant genes found")
  }
    par(mfrow=c(1,1))
}

GMEA for blood

Effect size is estimated with the median t-statistic for each gene. Significance is estimated using the fry test. Results are merged for the user.

  • PMEA step 196.063 sec elapsed on 8 cores

  • Fry step 293.569 sec elapsed on 8 cores

  • Fry single core 1205.519 sec

bl_design <- readRDS(file="bl_design.rds")
bl_mvals <- readRDS(file="bl_mvals.rds")

tic()
res_bl <- main(mval=bl_mvals,design=bl_design,sets,cores=detectCores()/2)
res_bl_df <- res_bl[[1]] # enrichment results
res_bl_top <- res_bl[[2]] # limma results
toc() ## 502-522 sec elapsed on 8 cores (8.3-8.7 mins)
## 509.323 sec elapsed
head(res_bl_df,30) %>%
  kbl(caption="Top DM genes in blood identified with GMEA") %>%
  kable_paper("hover", full_width = F)
Top DM genes in blood identified with GMEA
nprobes median PValue FDR
ANKRD29 31 -0.5119264 0.0000190 0.5185575
NUP43 20 0.5781906 0.0001144 0.7930777
CXADRP3 2 -2.2005853 0.0004065 0.7930777
LOC101926941 18 -0.5469809 0.0004103 0.7930777
OR10W1 6 -1.7758388 0.0007027 0.7930777
ERO1LB 13 -0.7489410 0.0007159 0.7930777
ATP5I 24 -0.5934763 0.0008169 0.7930777
C12orf10 29 -0.8328921 0.0008862 0.7930777
LOC101929284 6 -2.0125155 0.0009618 0.7930777
GNL2 20 -0.8487822 0.0009900 0.7930777
C9orf89 24 -0.7213163 0.0012176 0.7930777
SLC52A2 6 -0.8921214 0.0013174 0.7930777
TRPC1 25 -0.2836119 0.0014168 0.7930777
POLR3G 22 -0.6011733 0.0014716 0.7930777
TTC29 29 -1.0539440 0.0015060 0.7930777
LOC101928663 2 -1.9067665 0.0015507 0.7930777
CEP135 25 -0.5072602 0.0016338 0.7930777
LMO4 29 -0.7910709 0.0016870 0.7930777
LINC01220 2 -1.4641912 0.0016962 0.7930777
SURF1 17 -0.3737872 0.0017982 0.7930777
C5orf24 22 -0.5231647 0.0018726 0.7930777
RELN 63 -0.6171020 0.0018891 0.7930777
ZCCHC2 30 -0.5628906 0.0018896 0.7930777
C3orf54 8 -0.9636677 0.0018978 0.7930777
CPLX4 13 -1.4227283 0.0019430 0.7930777
SNORA28 5 -0.9776395 0.0019468 0.7930777
NOL3 20 -0.7747351 0.0022516 0.7930777
C1orf114 12 -0.7054945 0.0022650 0.7930777
SNF8 27 -0.5223357 0.0023644 0.7930777
LOC101928823 11 0.5870690 0.0024001 0.7930777

Now some visualisations.

volcano_plot(res_bl_df)

probe_bias(res_bl_df)
## Warning in xy.coords(x, y, xlabel, ylabel, log): 1625 x values <= 0 omitted from
## logarithmic plot

gmea_boxplot(res_bl,sets)

GMEA for guthrie

gu_design <- readRDS(file="gu_design.rds")
gu_mvals <- readRDS(file="gu_mvals.rds")

tic()
res_gu <- main(mval=gu_mvals,design=gu_design,sets,cores=detectCores()/2)
res_gu_df <- res_gu[[1]] # enrichment results
res_gu_top <- res_gu[[2]] # limma results
toc() ## 502-522 sec elapsed on 8 cores (8.3-8.7 mins)
## 502.767 sec elapsed
head(res_gu_df,30) %>%
  kbl(caption="Top DM genes in Guthrie card identified with GMEA") %>%
  kable_paper("hover", full_width = F)
Top DM genes in Guthrie card identified with GMEA
nprobes median PValue FDR
MIR3917 5 -0.8808877 0.0000710 1
ARL5A 15 1.0478366 0.0001918 1
ADAMTS1 19 0.6102581 0.0002838 1
CHMP1B 19 0.4557641 0.0004875 1
FOXD1 11 0.7472067 0.0005569 1
ZNF322B 1 -4.0851785 0.0006000 1
NAA30 21 -0.3732929 0.0006911 1
ALPP 14 -0.6916978 0.0009992 1
CAAP1 7 0.7857603 0.0010091 1
RPS14 21 0.4654341 0.0010451 1
LEPROTL1 19 -0.4020884 0.0010934 1
ZNF226 17 0.7738140 0.0016195 1
MEOX1 28 -0.8202385 0.0017083 1
SFRS12IP1 16 0.8951516 0.0018488 1
MIR196A2 9 0.6807623 0.0021186 1
PCP4 17 -0.7132310 0.0022340 1
SEC14L4 21 -0.8930770 0.0025559 1
GZMA 7 0.4791645 0.0026386 1
ARL16 20 0.4894063 0.0027294 1
LINC00866 8 0.9125732 0.0029358 1
RECQL 20 0.4446618 0.0029434 1
GNG10 10 -0.8812882 0.0036663 1
LSM1 26 0.6178880 0.0036880 1
CSN1S2B 4 -1.8839162 0.0037939 1
LOC441046 11 0.5893277 0.0038516 1
ZNF600 5 0.5349287 0.0038972 1
LOC286135 4 -1.5617343 0.0039589 1
C1QTNF9B-AS1 3 1.7431989 0.0040010 1
TMEM223 14 1.0364105 0.0040902 1
LOC101928766 8 -1.5651393 0.0043081 1

Now some visualisations.

volcano_plot(res_gu_df)

probe_bias(res_gu_df)
## Warning in xy.coords(x, y, xlabel, ylabel, log): 1648 x values <= 0 omitted from
## logarithmic plot

gmea_boxplot(res_gu,sets)

Comparison of blood and guthrie card

res_bl_df$bl <- res_bl_df$median
res_gu_df$gu <- res_gu_df$median

m1 <- merge(res_bl_df,res_gu_df,by=0)

plot(m1$bl,m1$gu,xlab="bl",ylab="gu") ; grid()

plot(rank(m1$bl),rank(m1$gu),xlab="bl rank",ylab="gu rank",cex=0.6,pch=19) ; grid()

Mitch enrichment analysis

genesets <- gmt_import("ReactomePathways.gmt")

Here I’m using the median t-statistic for downstream enrichment. Pathways are from REACTOME and analysis using mitch.

#res_bl_df$bl <- sign(res_bl_df$median) * -log10(res_bl_df$PValue)
#res_gu_df$gu <- sign(res_gu_df$median) * -log10(res_gu_df$PValue)

#res_bl_df$bl <- res_bl_df$median * -log10(res_bl_df$PValue)
#res_gu_df$gu <- res_gu_df$median * -log10(res_gu_df$PValue)

res_bl_df$bl <- res_bl_df$median
res_gu_df$gu <- res_gu_df$median

bl <-  res_bl_df[,"bl",drop=FALSE]
mitch_bl <- mitch_calc(bl,genesets,priority="effect")
head( mitch_bl$enrichment_result,30) %>%
  kbl(caption="BLOOD: Top effect size pathways found with mitch") %>%
  kable_paper("hover", full_width = F)

sig <- subset(mitch_bl$enrichment_result,`p.adjustANOVA`<0.05)
head(sig[order(-abs(sig$s.dist)),],30) %>%
  kbl(caption="BLOOD: Top effect size pathways found with mitch after 1% FDR filtering") %>%
  kable_paper("hover", full_width = F)

gu <-  res_gu_df[,"gu",drop=FALSE]
mitch_gu <- mitch_calc(gu,genesets,priority="effect")
head( mitch_gu$enrichment_result,30) %>%
  kbl(caption="GUTHRIE: Top effect size pathways found with mitch") %>%
  kable_paper("hover", full_width = F)

sig <- subset(mitch_gu$enrichment_result,`p.adjustANOVA`<0.05)
head(sig[order(-abs(sig$s.dist)),],30) %>%
  kbl(caption="GUTHRIE: Top effect size pathways found with mitch after 1% FDR filtering") %>%
  kable_paper("hover", full_width = F)

Mitch joint analysis of blood and guthrie card.

As using the median value is apparently the most sensitive, we’ll go with that.

m <- merge(res_bl_df,res_gu_df,by=0)
rownames(m) <- m$Row.names
m2 <- m[,c("bl","gu")]
m2[is.na(m2)] <- 0

res <- mitch_calc(m2, genesets, priority="effect")
## Note: Enrichments with large effect sizes may not be 
##             statistically significant.
#mitch_report(res,"blgu_mitch.html",overwrite=TRUE)

sig <- subset(res$enrichment_result,p.adjustMANOVA<0.01)

head(sig[order(-abs(sig$s.dist)),],30) %>%
  kbl(caption="Top pathways found with mitch (effect size after 1% FDR filtering)") %>%
  kable_paper("hover", full_width = F)
Top pathways found with mitch (effect size after 1% FDR filtering)
set setSize pMANOVA s.bl s.gu p.bl p.gu s.dist SD p.adjustMANOVA
215 Class C/3 (Metabotropic glutamate/pheromone receptors) 39 0.0000000 -0.5535374 -0.4631703 0.0000000 0.0000006 0.7217551 0.0638992 0.0000000
1226 Scavenging of heme from plasma 12 0.0012104 -0.5783916 -0.3369163 0.0005201 0.0432531 0.6693649 0.1707488 0.0094608
525 Glucuronidation 25 0.0000020 -0.5145865 -0.4138337 0.0000084 0.0003398 0.6603466 0.0712430 0.0000371
386 Eicosanoids 12 0.0000826 0.6314834 -0.1797248 0.0001514 0.2809479 0.6565609 0.5736108 0.0009077
1237 Sensory perception of sweet, bitter, and umami (glutamate) taste 41 0.0000000 -0.4796558 -0.3514032 0.0000001 0.0000982 0.5946040 0.0906882 0.0000007
1184 SARS-CoV-1 modulates host translation machinery 37 0.0000001 0.5115820 0.2972726 0.0000001 0.0017461 0.5916816 0.1515396 0.0000027
97 Aspirin ADME 44 0.0000000 -0.3954512 -0.4111515 0.0000056 0.0000023 0.5704623 0.0111018 0.0000011
1238 Sensory perception of taste 47 0.0000002 -0.3889940 -0.3483818 0.0000039 0.0000356 0.5221936 0.0287172 0.0000056
920 Peptide chain elongation 90 0.0000000 0.4357724 0.2799858 0.0000000 0.0000043 0.5179668 0.1101577 0.0000000
402 Eukaryotic Translation Elongation 94 0.0000000 0.4311023 0.2734195 0.0000000 0.0000045 0.5104972 0.1114986 0.0000000
404 Eukaryotic Translation Termination 94 0.0000000 0.4245282 0.2580058 0.0000000 0.0000152 0.4967808 0.1177491 0.0000000
1521 Viral mRNA Translation 90 0.0000000 0.4141734 0.2677845 0.0000000 0.0000111 0.4932020 0.1035126 0.0000000
1230 Selenocysteine synthesis 93 0.0000000 0.4216390 0.2550454 0.0000000 0.0000210 0.4927754 0.1177994 0.0000000
1165 Response of EIF2AK4 (GCN2) to amino acid deficiency 102 0.0000000 0.4063276 0.2485421 0.0000000 0.0000142 0.4763143 0.1115712 0.0000000
1193 SCF(Skp2)-mediated degradation of p27/p21 59 0.0000001 0.3168824 0.3523437 0.0000253 0.0000028 0.4738782 0.0250749 0.0000037
845 Nonsense Mediated Decay (NMD) independent of the Exon Junction Complex (EJC) 96 0.0000000 0.3971799 0.2574427 0.0000000 0.0000129 0.4733166 0.0988091 0.0000000
456 Formation of a pool of free 40S subunits 102 0.0000000 0.4022035 0.2321057 0.0000000 0.0000505 0.4643714 0.1202773 0.0000000
406 Expression and translocation of olfactory receptors 373 0.0000000 -0.2890609 -0.3592830 0.0000000 0.0000000 0.4611296 0.0496545 0.0000000
1190 SARS-CoV-2 modulates host translation machinery 50 0.0000023 0.3849359 0.2514183 0.0000025 0.0020918 0.4597682 0.0944112 0.0000426
462 Formation of the ternary complex, and subsequently, the 43S complex 52 0.0000009 0.4106796 0.2008106 0.0000003 0.0122091 0.4571462 0.1483998 0.0000187
869 Olfactory Signaling Pathway 381 0.0000000 -0.2800677 -0.3549053 0.0000000 0.0000000 0.4521014 0.0529182 0.0000000
108 Autodegradation of Cdh1 by Cdh1:APC/C 63 0.0000002 0.2827759 0.3523605 0.0001028 0.0000013 0.4517965 0.0492038 0.0000052
501 GTP hydrolysis and joining of the 60S ribosomal subunit 113 0.0000000 0.3860850 0.2220797 0.0000000 0.0000448 0.4453999 0.1159693 0.0000000
1235 Senescence-Associated Secretory Phenotype (SASP) 54 0.0000017 0.2228081 0.3841057 0.0046047 0.0000010 0.4440502 0.1140546 0.0000323
500 GSK3B and BTRC:CUL1-mediated-degradation of NFE2L2 51 0.0000081 0.2876817 0.3301279 0.0003769 0.0000449 0.4378872 0.0300140 0.0001238
677 L13a-mediated translational silencing of Ceruloplasmin expression 112 0.0000000 0.3787727 0.2183978 0.0000000 0.0000644 0.4372257 0.1134022 0.0000000
1208 SRP-dependent cotranslational protein targeting to membrane 112 0.0000000 0.3623194 0.2399313 0.0000000 0.0000113 0.4345600 0.0865415 0.0000000
167 Cap-dependent Translation Initiation 120 0.0000000 0.3753973 0.2177730 0.0000000 0.0000373 0.4339911 0.1114572 0.0000000
403 Eukaryotic Translation Initiation 120 0.0000000 0.3753973 0.2177730 0.0000000 0.0000373 0.4339911 0.1114572 0.0000000
574 Hh mutants are degraded by ERAD 55 0.0000043 0.3166375 0.2960356 0.0000482 0.0001450 0.4334702 0.0145678 0.0000698

Camera enrichment analysis

Camera is designed to control for correlation bias.

Using the median value gives better results than the directional significance or the combined metric.

Blood dataset.

Need to filter by FDR then sort by effect size.

stat <- res_bl_df$median
#stat <- res_bl_df$median * -log10(res_bl_df$PValue)
#stat <- sign(res_bl_df$median) * -log10(res_bl_df$PValue)

names(stat) <- rownames(res_bl_df)
stat[is.na(stat)] <- 0

tic()
cres <- cameraPR(statistic=stat, index=genesets, use.ranks = FALSE, inter.gene.cor=0.01, sort = TRUE)
cres <- subset(cres,NGenes>4)
cres$FDR <- p.adjust(cres$PValue,method="fdr")
mymedians <- sapply(1:nrow(cres),function(i) {
  myset <- genesets[[which(names(genesets) == rownames(cres)[i])]]
  mystats <- stat[names(stat) %in% myset]
  median(mystats)
})
cres$median <- mymedians
toc() # 1.0 sec
## 3.938 sec elapsed
sig <- subset(cres,FDR<0.05)

head(sig[order(-abs(sig$median)),],20) %>%
  kbl(caption = "BLOOD: Top effect size pathways found with CAMERA after 5% FDR filtering") %>%
  kable_paper("hover", full_width = F)
BLOOD: Top effect size pathways found with CAMERA after 5% FDR filtering
NGenes Direction PValue FDR median
Class C/3 (Metabotropic glutamate/pheromone receptors) 39 Down 0.0000001 0.0000545 -0.8029345
Expression and translocation of olfactory receptors 373 Down 0.0000000 0.0000455 -0.7871209
Olfactory Signaling Pathway 381 Down 0.0000001 0.0000513 -0.7598832
Sensory perception of sweet, bitter, and umami (glutamate) taste 41 Down 0.0000028 0.0013483 -0.7224652
Aspirin ADME 44 Down 0.0000179 0.0057827 -0.6442641
Glucuronidation 25 Down 0.0000066 0.0025428 -0.6094985
Sensory perception of taste 47 Down 0.0000556 0.0087017 -0.5983857
Sensory Perception 597 Down 0.0000217 0.0060304 -0.5493889
mRNA Splicing - Major Pathway 180 Up 0.0008601 0.0491044 -0.2019545
mRNA Splicing 191 Up 0.0003291 0.0245677 -0.1806142
Translation 273 Up 0.0007177 0.0427201 -0.1612735
rRNA processing 196 Up 0.0004073 0.0272629 -0.1403788
rRNA processing in the nucleus and cytosol 189 Up 0.0002143 0.0180814 -0.1319654
Influenza Viral RNA Transcription and Replication 135 Up 0.0005323 0.0344376 -0.1319654
Regulation of expression of SLITs and ROBOs 169 Up 0.0006033 0.0377752 -0.1319654
Major pathway of rRNA processing in the nucleolus and cytosol 179 Up 0.0001340 0.0123885 -0.1253995
Selenoamino acid metabolism 109 Up 0.0002432 0.0196652 -0.1225540
Nonsense Mediated Decay (NMD) enhanced by the Exon Junction Complex (EJC) 116 Up 0.0003609 0.0250156 -0.1165573
Nonsense-Mediated Decay (NMD) 116 Up 0.0003609 0.0250156 -0.1165573
SRP-dependent cotranslational protein targeting to membrane 112 Up 0.0002636 0.0204628 -0.0994079
cres_bl <- cres

Now drilling down to the top set.

myset <- rownames(head(sig[order(-abs(sig$median)),],1))
myset_members <- genesets[[which(names(genesets) == myset)]]
mystats <- stat[which(names(stat) %in% myset_members)]
myl <- list("allgenes"=stat,myset=mystats)
boxplot(myl,cex=0)

library(vioplot)
vioplot(myl,main=myset)

mygenes <- head(mystats[order(mystats)],5)

mygenes
##    TAS2R7   TAS2R42   TAS2R20    TAS2R8   TAS2R10 
## -1.561500 -1.484759 -1.481268 -1.469482 -1.466507
myprobes <- unique(unlist(sets[which(names(sets) %in% names(mygenes))]))

myprobedat <- res_bl_top[which(rownames(res_bl_top) %in% myprobes),]

myprobedat %>%
  kbl(caption = "Probe data for top 5 genes") %>%
  kable_paper("hover", full_width = F)
Probe data for top 5 genes
logFC AveExpr t P.Value adj.P.Val B
cg04365308 -0.0674324 1.694837 -2.3435291 0.0373337 0.8764716 -4.000577
cg08370500 -0.0473214 2.332283 -2.2296326 0.0458421 0.8764716 -4.188498
cg12424907 -0.0360924 4.373987 -2.1902039 0.0491959 0.8764716 -4.252725
cg23740316 -0.1100420 2.641863 -2.0122391 0.0674204 0.8764716 -4.536516
cg24927211 -0.0448067 3.556420 -1.8871781 0.0837912 0.8764716 -4.729120
cg01293346 -0.0759929 2.345302 -1.8269330 0.0929118 0.8764716 -4.819616
cg12252888 -0.0817775 3.125413 -1.7335358 0.1088358 0.8764716 -4.956686
cg25023291 -0.0527868 3.523094 -1.7269106 0.1100533 0.8764716 -4.966253
cg12791554 -0.0300006 4.065175 -1.7103908 0.1131420 0.8764716 -4.990014
cg16360649 -0.0320249 3.748225 -1.6736132 0.1202966 0.8764716 -5.042424
cg01629530 -0.0730674 3.193472 -1.6092861 0.1337745 0.8764716 -5.132415
cg06533934 -0.0881083 3.025818 -1.5634348 0.1441702 0.8764716 -5.195201
cg18573244 -0.0538251 3.151457 -1.5595655 0.1450787 0.8764716 -5.200446
cg03138863 -0.0558872 3.697740 -1.4909276 0.1620344 0.8764716 -5.292062
cg19120125 -0.0525759 3.503665 -1.4420871 0.1751047 0.8764716 -5.355556
cg08507270 -0.0247891 3.927859 -1.3296783 0.2085786 0.8764716 -5.496014
cg22791643 -0.0344284 3.384616 -1.2521461 0.2345951 0.8764716 -5.588013
cg11820113 -0.0708003 3.078575 -1.2426070 0.2379687 0.8764716 -5.599044
cg00260387 -0.0186897 3.806383 -1.1481088 0.2735059 0.8764716 -5.704783
cg21156620 -0.0405635 3.794757 -1.0565693 0.3117102 0.8764716 -5.800833
cg06496654 -0.0402366 3.639450 -0.9294281 0.3711585 0.8775414 -5.923195
cg00777560 0.0092525 3.973264 0.6908795 0.5029107 0.8991986 -6.115430
cg15428792 -0.0097955 3.713699 -0.6539128 0.5256174 0.9043415 -6.140602
cg23342872 -0.0411683 2.801241 -0.5857910 0.5689721 0.9139438 -6.183606
cg20769111 -0.0061385 3.899344 -0.4543923 0.6577328 0.9341233 -6.253858
cg18657559 -0.0052775 3.624314 -0.3734269 0.7154057 0.9469881 -6.288608

Guthrie dataset.

stat <- res_gu_df$median
#stat <- res_gu_df$median * -log10(res_gu_df$PValue)
#stat <- sign(res_gu_df$median) * -log10(res_gu_df$PValue)

names(stat) <- rownames(res_gu_df)
stat[is.na(stat)] <- 0

tic()
cres <- cameraPR(statistic=stat, index=genesets, use.ranks = FALSE, inter.gene.cor=0.01, sort = TRUE)
cres <- subset(cres,NGenes>4)
cres$FDR <- p.adjust(cres$PValue,method="fdr")
mymedians <- sapply(1:nrow(cres),function(i) {
  myset <- genesets[[which(names(genesets) == rownames(cres)[i])]]
  mystats <- stat[names(stat) %in% myset]
  median(mystats)
})
cres$median <- mymedians
toc() # 1.0 sec
## 3.911 sec elapsed
sig <- subset(cres,FDR<0.05)

head(sig[order(-abs(sig$median)),],20) %>%
  kbl(caption = "GUTHRIE: Top effect size pathways found with CAMERA after 5% FDR filtering") %>%
  kable_paper("hover", full_width = F)
GUTHRIE: Top effect size pathways found with CAMERA after 5% FDR filtering
NGenes Direction PValue FDR median
Class C/3 (Metabotropic glutamate/pheromone receptors) 39 Down 2.76e-05 0.0134055 -0.4694532
Expression and translocation of olfactory receptors 373 Down 1.00e-07 0.0000651 -0.4201781
Olfactory Signaling Pathway 381 Down 1.00e-07 0.0000651 -0.4139893
Sensory Perception 597 Down 7.20e-06 0.0046746 -0.3380598
cres_gu <- cres

Compare CAMERA data

Compare median set score for both contrasts.

In the graph below, the larger points represent larger gene sets.

Red fill means significant in blood contrast.

Blue ring means significant in guthrie contrast.

m4 <- merge(cres_bl,cres_gu,by=0)
sigx <- subset(m4,FDR.x<0.05)
sigy <- subset(m4,FDR.y<0.05)

plot(m4$median.x,m4$median.y,col="gray",pch=19,cex=log10(m4$NGenes.x),
  xlab="blood",ylab="Guthrie",main="median t-stat")
grid()
points(sigx$median.x,sigx$median.y,col="red",pch=19,cex=log10(sigx$NGenes.x))
points(sigy$median.x,sigy$median.y,col="blue",pch=1,cex=log10(sigy$NGenes.x))

subset(m4,FDR.x<0.05 & FDR.y<0.05 ) %>%
  kbl(caption = "Pathways significant in blood (x) and guthrie cards (y) at 5%FDR") %>%
  kable_paper("hover", full_width = F)
Pathways significant in blood (x) and guthrie cards (y) at 5%FDR
Row.names NGenes.x Direction.x PValue.x FDR.x median.x NGenes.y Direction.y PValue.y FDR.y median.y
262 Class C/3 (Metabotropic glutamate/pheromone receptors) 39 Down 1.00e-07 0.0000545 -0.8029345 39 Down 2.76e-05 0.0134055 -0.4694532
522 Expression and translocation of olfactory receptors 373 Down 0.00e+00 0.0000455 -0.7871209 373 Down 1.00e-07 0.0000651 -0.4201781
1099 Olfactory Signaling Pathway 381 Down 1.00e-07 0.0000513 -0.7598832 381 Down 1.00e-07 0.0000651 -0.4139893
1517 Sensory Perception 597 Down 2.17e-05 0.0060304 -0.5493889 597 Down 7.20e-06 0.0046746 -0.3380598

Distinguishing ASD from birth

Here I will use MDS plot to cluster samples.

First I will use all probes, then I will focus on a subset of candidate probes.

plot(cmdscale(dist(t(bl_mvals))), xlab="Coordinate 1", ylab="Coordinate 2",
  type = "n",main="Blood")
mtext("Numbers indicate ADOS score")
text(cmdscale(dist(t(bl_mvals))), labels=bl_design[,"ADOS"], )

plot(cmdscale(dist(t(gu_mvals))), xlab="Coordinate 1", ylab="Coordinate 2", 
  type = "n",main="Guthrie card")
mtext("Numbers indicate ADOS score")
text(cmdscale(dist(t(gu_mvals))), labels=bl_design[,"ADOS"], )

There is no apparent clustering by ADOS score, indicating a subtle effect.

Now I will examine the probes associated with the pathway of interest.

myprobes
##  [1] "cg03138863" "cg19120125" "cg22791643" "cg00260387" "cg12791554"
##  [6] "cg15428792" "cg12424907" "cg01293346" "cg25023291" "cg20769111"
## [11] "cg11820113" "cg04365308" "cg24927211" "cg18573244" "cg06496654"
## [16] "cg00777560" "cg18657559" "cg16360649" "cg08370500" "cg06533934"
## [21] "cg23342872" "cg21156620" "cg12252888" "cg23740316" "cg01629530"
## [26] "cg08507270"
myprobedat <- bl_mvals[which(rownames(bl_mvals) %in% myprobes),]
plot(cmdscale(dist(t(myprobedat))), xlab="Coordinate 1", ylab="Coordinate 2",
  type = "n",main="Blood")
text(cmdscale(dist(t(myprobedat))), labels=bl_design[,"ADOS"], )

myprobedat <- gu_mvals[which(rownames(gu_mvals) %in% myprobes),]
plot(cmdscale(dist(t(myprobedat))), xlab="Coordinate 1", ylab="Coordinate 2",
  type = "n",main="Guthrie card")
text(cmdscale(dist(t(myprobedat))), labels=bl_design[,"ADOS"], )

Again, there is no apparent clustering by ADOS score. This indicates that the top probes cannot be used diagnostically.

bl_lim <- read.csv("ASD_blood_top_dmps_onADOS_withintw_Nov27_limma.csv")
myprobes2 <- head(bl_lim$Name,50)
myprobedat2 <- bl_mvals[which(rownames(bl_mvals) %in% myprobes2),]
plot(cmdscale(dist(t(myprobedat2))), xlab="Coordinate 1", ylab="Coordinate 2",
  type = "n",main="Blood")
text(cmdscale(dist(t(myprobedat2))), labels=bl_design[,"ADOS"], )

gu_lim <- read.csv("ASD_guthrie_top_dmps_onADOS_withintw_Nov27_limma.csv")
myprobes2 <- head(gu_lim$Name,50)
myprobedat2 <- gu_mvals[which(rownames(gu_mvals) %in% myprobes2),]
plot(cmdscale(dist(t(myprobedat2))), xlab="Coordinate 1", ylab="Coordinate 2",
  type = "n", main="Guthrie")
text(cmdscale(dist(t(myprobedat2))), labels=bl_design[,"ADOS"], )

MDS analysis of top probes identified without correction for covariates.

bl_design2 <- bl_design[,c(1,10)]
fit <- lmFit(bl_mvals, bl_design2)
fit <- eBayes(fit)
summary(decideTests(fit))
##        (Intercept)   ADOS
## Down        248755      0
## NotSig       39844 802647
## Up          514048      0
top <- topTable(fit,coef="ADOS",num=Inf, sort.by = "P")
head(top)
##                  logFC   AveExpr         t      P.Value adj.P.Val        B
## cg13784557  0.04779472 -4.739442  5.456349 2.595342e-05 0.9999998 2.647442
## cg03716591  0.08873078 -3.321034  5.282177 3.833417e-05 0.9999998 2.267504
## cg11078128 -0.05723514  2.941155 -5.154247 5.114886e-05 0.9999998 1.986608
## cg06091288  0.44586721 -1.482468  5.126156 5.450383e-05 0.9999998 1.924737
## cg00320749  0.07116909 -3.962859  5.116515 5.570628e-05 0.9999998 1.903487
## cg00572586 -0.07365197 -3.390343 -5.061720 6.307166e-05 0.9999998 1.782568
myprobes3 <- rownames(top)[1:20]

myprobedat3 <- bl_mvals[which(rownames(bl_mvals) %in% myprobes3),]
plot(cmdscale(dist(t(myprobedat3))), xlab="Coordinate 1", ylab="Coordinate 2",
  type = "n",main="Blood")
text(cmdscale(dist(t(myprobedat3))), labels=bl_design[,"ADOS"], )

gu_design2 <- gu_design[,c(1,12)]
fit <- lmFit(gu_mvals, gu_design2)
fit <- eBayes(fit)
summary(decideTests(fit))
##        (Intercept)   ADOS
## Down        251749      0
## NotSig       57584 790658
## Up          481325      0
top <- topTable(fit,coef="ADOS",num=Inf, sort.by = "P")
head(top)
##                  logFC    AveExpr         t      P.Value adj.P.Val        B
## cg24046505  0.05799757 -1.9843389  5.232968 2.418307e-05 0.9999981 2.662615
## cg04098297  0.06458003  2.5127605  5.024096 4.082524e-05 0.9999981 2.158205
## cg00779672  0.06285769  2.1754652  4.996222 4.378776e-05 0.9999981 2.090748
## cg09303399 -0.08027977 -0.3905299 -4.884562 5.799307e-05 0.9999981 1.820269
## cg15310518 -0.04903224 -0.3222181 -4.883002 5.822148e-05 0.9999981 1.816486
## cg01692639 -0.05839269 -3.8720322 -4.783392 7.483579e-05 0.9999981 1.574938
myprobes3 <- rownames(top)[1:20]

myprobedat3 <- gu_mvals[which(rownames(gu_mvals) %in% myprobes3),]
plot(cmdscale(dist(t(myprobedat3))), xlab="Coordinate 1", ylab="Coordinate 2", 
  type = "n",main="Guthrie")
text(cmdscale(dist(t(myprobedat3))), labels=bl_design[,"ADOS"], )

Again it was unsuccessful. ## Session information

For reproducibility

sessionInfo()
## R version 4.2.1 (2022-06-23)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.1 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
##  [1] grid      stats4    parallel  stats     graphics  grDevices utils    
##  [8] datasets  methods   base     
## 
## other attached packages:
##  [1] vioplot_0.3.7                                      
##  [2] zoo_1.8-11                                         
##  [3] sm_2.2-5.7.1                                       
##  [4] tictoc_1.1                                         
##  [5] kableExtra_1.3.4                                   
##  [6] data.table_1.14.2                                  
##  [7] ENmix_1.32.0                                       
##  [8] doParallel_1.0.17                                  
##  [9] qqman_0.1.8                                        
## [10] RCircos_1.2.2                                      
## [11] beeswarm_0.4.0                                     
## [12] forestplot_3.1.0                                   
## [13] abind_1.4-5                                        
## [14] checkmate_2.1.0                                    
## [15] reshape2_1.4.4                                     
## [16] gplots_3.1.3                                       
## [17] eulerr_6.1.1                                       
## [18] GEOquery_2.64.2                                    
## [19] RColorBrewer_1.1-3                                 
## [20] IlluminaHumanMethylation450kmanifest_0.4.0         
## [21] topconfects_1.12.0                                 
## [22] DMRcatedata_2.14.0                                 
## [23] ExperimentHub_2.4.0                                
## [24] AnnotationHub_3.4.0                                
## [25] BiocFileCache_2.4.0                                
## [26] dbplyr_2.2.1                                       
## [27] DMRcate_2.10.0                                     
## [28] limma_3.52.4                                       
## [29] missMethyl_1.30.0                                  
## [30] IlluminaHumanMethylation450kanno.ilmn12.hg19_0.6.1 
## [31] R.utils_2.12.0                                     
## [32] R.oo_1.25.0                                        
## [33] R.methodsS3_1.8.2                                  
## [34] plyr_1.8.7                                         
## [35] IlluminaHumanMethylationEPICanno.ilm10b4.hg19_0.6.0
## [36] minfi_1.42.0                                       
## [37] bumphunter_1.38.0                                  
## [38] locfit_1.5-9.6                                     
## [39] iterators_1.0.14                                   
## [40] foreach_1.5.2                                      
## [41] Biostrings_2.64.1                                  
## [42] XVector_0.36.0                                     
## [43] SummarizedExperiment_1.26.1                        
## [44] Biobase_2.56.0                                     
## [45] MatrixGenerics_1.8.1                               
## [46] matrixStats_0.62.0                                 
## [47] GenomicRanges_1.48.0                               
## [48] GenomeInfoDb_1.32.4                                
## [49] IRanges_2.30.1                                     
## [50] S4Vectors_0.34.0                                   
## [51] BiocGenerics_0.42.0                                
## [52] mitch_1.8.0                                        
## 
## loaded via a namespace (and not attached):
##   [1] rappdirs_0.3.3                rtracklayer_1.56.1           
##   [3] GGally_2.1.2                  tidyr_1.2.1                  
##   [5] ggplot2_3.3.6                 bit64_4.0.5                  
##   [7] knitr_1.40                    DelayedArray_0.22.0          
##   [9] rpart_4.1.16                  KEGGREST_1.36.3              
##  [11] RCurl_1.98-1.9                AnnotationFilter_1.20.0      
##  [13] generics_0.1.3                GenomicFeatures_1.48.4       
##  [15] preprocessCore_1.58.0         RSQLite_2.2.18               
##  [17] bit_4.0.4                     tzdb_0.3.0                   
##  [19] webshot_0.5.4                 xml2_1.3.3                   
##  [21] httpuv_1.6.6                  assertthat_0.2.1             
##  [23] xfun_0.33                     hms_1.1.2                    
##  [25] jquerylib_0.1.4               evaluate_0.17                
##  [27] promises_1.2.0.1              fansi_1.0.3                  
##  [29] restfulr_0.0.15               scrime_1.3.5                 
##  [31] progress_1.2.2                caTools_1.18.2               
##  [33] readxl_1.4.1                  DBI_1.1.3                    
##  [35] geneplotter_1.74.0            htmlwidgets_1.5.4            
##  [37] reshape_0.8.9                 purrr_0.3.5                  
##  [39] ellipsis_0.3.2                dplyr_1.0.10                 
##  [41] backports_1.4.1               permute_0.9-7                
##  [43] calibrate_1.7.7               annotate_1.74.0              
##  [45] biomaRt_2.52.0                deldir_1.0-6                 
##  [47] sparseMatrixStats_1.8.0       vctrs_0.4.2                  
##  [49] ensembldb_2.20.2              cachem_1.0.6                 
##  [51] Gviz_1.40.1                   BSgenome_1.64.0              
##  [53] GenomicAlignments_1.32.1      prettyunits_1.1.1            
##  [55] mclust_5.4.10                 svglite_2.1.0                
##  [57] cluster_2.1.4                 RPMM_1.25                    
##  [59] lazyeval_0.2.2                crayon_1.5.2                 
##  [61] genefilter_1.78.0             edgeR_3.38.4                 
##  [63] pkgconfig_2.0.3               nlme_3.1-160                 
##  [65] ProtGenerics_1.28.0           nnet_7.3-18                  
##  [67] rlang_1.0.6                   lifecycle_1.0.3              
##  [69] filelock_1.0.2                dichromat_2.0-0.1            
##  [71] cellranger_1.1.0              rngtools_1.5.2               
##  [73] base64_2.0.1                  Matrix_1.5-1                 
##  [75] Rhdf5lib_1.18.2               base64enc_0.1-3              
##  [77] viridisLite_0.4.1             png_0.1-7                    
##  [79] rjson_0.2.21                  bitops_1.0-7                 
##  [81] KernSmooth_2.23-20            rhdf5filters_1.8.0           
##  [83] blob_1.2.3                    DelayedMatrixStats_1.18.1    
##  [85] doRNG_1.8.2                   stringr_1.4.1                
##  [87] nor1mix_1.3-0                 readr_2.1.3                  
##  [89] jpeg_0.1-9                    scales_1.2.1                 
##  [91] memoise_2.0.1                 magrittr_2.0.3               
##  [93] zlibbioc_1.42.0               compiler_4.2.1               
##  [95] BiocIO_1.6.0                  illuminaio_0.38.0            
##  [97] Rsamtools_2.12.0              cli_3.4.1                    
##  [99] DSS_2.44.0                    htmlTable_2.4.1              
## [101] Formula_1.2-4                 MASS_7.3-58.1                
## [103] tidyselect_1.2.0              stringi_1.7.8                
## [105] highr_0.9                     yaml_2.3.5                   
## [107] askpass_1.1                   latticeExtra_0.6-30          
## [109] sass_0.4.2                    VariantAnnotation_1.42.1     
## [111] tools_4.2.1                   rstudioapi_0.14              
## [113] foreign_0.8-83                bsseq_1.32.0                 
## [115] gridExtra_2.3                 digest_0.6.29                
## [117] BiocManager_1.30.18           shiny_1.7.2                  
## [119] quadprog_1.5-8                Rcpp_1.0.9                   
## [121] siggenes_1.70.0               BiocVersion_3.15.2           
## [123] later_1.3.0                   org.Hs.eg.db_3.15.0          
## [125] httr_1.4.4                    AnnotationDbi_1.58.0         
## [127] biovizBase_1.44.0             colorspace_2.0-3             
## [129] rvest_1.0.3                   XML_3.99-0.11                
## [131] splines_4.2.1                 statmod_1.4.37               
## [133] multtest_2.52.0               systemfonts_1.0.4            
## [135] xtable_1.8-4                  jsonlite_1.8.2               
## [137] dynamicTreeCut_1.63-1         R6_2.5.1                     
## [139] echarts4r_0.4.4               Hmisc_4.7-1                  
## [141] pillar_1.8.1                  htmltools_0.5.3              
## [143] mime_0.12                     glue_1.6.2                   
## [145] fastmap_1.1.0                 BiocParallel_1.30.4          
## [147] interactiveDisplayBase_1.34.0 beanplot_1.3.1               
## [149] codetools_0.2-18              utf8_1.2.2                   
## [151] lattice_0.20-45               bslib_0.4.0                  
## [153] tibble_3.1.8                  curl_4.3.3                   
## [155] gtools_3.9.3                  openssl_2.0.3                
## [157] interp_1.1-3                  survival_3.4-0               
## [159] rmarkdown_2.17                munsell_0.5.0                
## [161] rhdf5_2.40.0                  GenomeInfoDbData_1.2.8       
## [163] HDF5Array_1.24.2              impute_1.70.0                
## [165] gtable_0.3.1