Source: https://github.com/markziemann/asd_meth

Introduction

Here I will be running a comparison of differential methylation data from guthrie and fresh blood samples.

Here are the files that I’m using:

  • limma_blood_SRS.csv

  • limma_guthrie_SRS.csv

suppressPackageStartupMessages({
  library("parallel")
  library("mitch")
  library("IlluminaHumanMethylationEPICanno.ilm10b4.hg19")
  source("https://raw.githubusercontent.com/markziemann/gmea/main/meth_functions.R")
  library("data.table")
  library("kableExtra")
  library("eulerr")
  library("RIdeogram")
  library("GenomicRanges")
  library("tictoc")
  library("globaltest")
  library("ebGSEA")
  data("dualmap850kEID")
})

source("meth_functions.R")

Load data

bl_mvals <- readRDS(file="bl_mvals.rds")
gu_mvals <- readRDS(file="gu_mvals.rds")

bl_design <- readRDS(file="bl_design_srs.rds")
gu_design <- readRDS(file="gu_design_srs.rds")

bl_lim <- read.csv("limma_blood_SRS.csv")
gu_lim <- read.csv("limma_guthrie_SRS.csv")

Probe sets

For each gene, extract out the probes.

anno <- getAnnotation(IlluminaHumanMethylationEPICanno.ilm10b4.hg19)
myann <- data.frame(anno[,c("UCSC_RefGene_Name","Regulatory_Feature_Group","Islands_Name","Relation_to_Island")])
promoters <- grep("Prom",myann$Regulatory_Feature_Group)
promoters <- myann[promoters,]
gp <- myann[,"UCSC_RefGene_Name",drop=FALSE]
gp2 <- strsplit(gp$UCSC_RefGene_Name,";")
names(gp2) <- rownames(gp)
sets <- split(rep(names(gp2), lengths(gp2)), unlist(gp2))
summary(unlist(lapply(sets,length)))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    9.00   24.00   49.68   55.00 6778.00

Gene level enrichment for blood

Effect size is estimated with the median t-statistic for each gene. Significance is estimated using the fry test. Results are merged for the user.

  • PMEA step 196.063 sec elapsed on 8 cores

  • Fry step 293.569 sec elapsed on 8 cores

  • Fry single core 1205.519 sec

tic()
res_bl <- main(mval=bl_mvals,design=bl_design,sets,cores=detectCores()/2)
res_bl[[1]] <- subset(res_bl[[1]],nprobes!=0) # remove genes with no probes obvs
res_bl_df <- res_bl[[1]] # enrichment results
res_bl_top <- res_bl[[2]] # limma results
toc() ## 502-522 sec elapsed on 8 cores (8.3-8.7 mins)
## 539.726 sec elapsed
head(res_bl_df,30) %>%
  kbl(caption="Top DM genes in blood identified with GMEA") %>%
  kable_paper("hover", full_width = F)
Top DM genes in blood identified with GMEA
nprobes median PValue FDR
LINC01549 6 -2.5026046 0.0000001 0.0033993
TGFB1 31 -0.8897149 0.0000079 0.1074072
BDNFOS 28 -0.6286689 0.0000135 0.1228381
ADIPOQ 13 -1.4827206 0.0000351 0.2404511
ACTL10 2 -2.7461868 0.0000600 0.2865860
SAMD12 99 -0.8662545 0.0000801 0.2865860
BECN1 15 -0.5574773 0.0001319 0.2865860
KIAA0101 25 -0.9269653 0.0001548 0.2865860
SUCLG2-AS1 12 -1.8436129 0.0001584 0.2865860
RPS28 13 -1.0320093 0.0001844 0.2865860
LOC102723385 9 -1.1146822 0.0001898 0.2865860
AVL9 27 -0.5960707 0.0001945 0.2865860
LOC100505915 1 -2.3163742 0.0002118 0.2865860
FNDC3A 51 -0.4524984 0.0002358 0.2865860
LOC101926942 15 -1.4415014 0.0002459 0.2865860
SLC34A2 31 -0.7921352 0.0002493 0.2865860
COQ9 25 -0.5148982 0.0002580 0.2865860
POLD2 25 -0.9556809 0.0002731 0.2865860
TAS2R1 7 -1.7104058 0.0003036 0.2865860
C16orf95 14 -0.9898711 0.0003096 0.2865860
CSE1L 24 -0.9631989 0.0003199 0.2865860
FTSJD1 18 -0.0275149 0.0003283 0.2865860
KIAA0406 9 -0.9941050 0.0003312 0.2865860
NCAPD2 47 -0.8817047 0.0003373 0.2865860
MIR128-2 8 -1.0550178 0.0003495 0.2865860
OR4D9 4 -2.6849300 0.0003725 0.2865860
SCLT1 35 -0.5546938 0.0003955 0.2865860
POLR2L 26 -0.7330132 0.0004026 0.2865860
OR10G8 4 -2.0077257 0.0004043 0.2865860
FKTN 15 -0.5731241 0.0004778 0.2865860
par(mfrow=c(2,1))
hist(res_bl_top$t,xlab="probe t-stat",main="blood at assessment") ; grid() ; abline(v=0,lty=1,lwd=3)
hist(res_bl_df$median,xlab="gene median t-stat",main="blood at assessment") ; grid() ; abline(v=0,lty=1,lwd=3)

pdf("hist_bl_srs.pdf")
par(mfrow=c(2,1))
hist(res_bl_top$t,xlab="probe t-stat",main="blood at assessment") ; grid() ; abline(v=0,lty=1,lwd=3)
plot(1)
hist(res_bl_df$median,xlab="gene median t-stat",main="blood at assessment") ; grid() ; abline(v=0,lty=1,lwd=3)
plot(1)
dev.off()
## png 
##   2
par(mfrow=c(1,1))

Now some visualisations.

volcano_plot(res_bl_df)

probe_bias(res_bl_df)

gmea_boxplot(res_bl,sets)

Gene level enrichment for guthrie cards

tic()
res_gu <- main(mval=gu_mvals,design=gu_design,sets,cores=detectCores()/2)
res_gu[[1]] <- subset(res_gu[[1]],nprobes!=0) # remove genes with no probes obvs
res_gu_df <- res_gu[[1]] # enrichment results
res_gu_top <- res_gu[[2]] # limma results
toc() ## 502-522 sec elapsed on 8 cores (8.3-8.7 mins)
## 504.332 sec elapsed
head(res_gu_df,30) %>%
  kbl(caption="Top DM genes in Guthrie card identified with GMEA") %>%
  kable_paper("hover", full_width = F)
Top DM genes in Guthrie card identified with GMEA
nprobes median PValue FDR
UVSSA 30 -0.6325755 0.0000440 0.4886931
LRRC14 25 -0.5386472 0.0001297 0.4886931
ZNF841 13 -1.1693217 0.0001956 0.4886931
CASC3 25 -1.2207033 0.0002125 0.4886931
CDC45L 20 0.7373682 0.0002735 0.4886931
LOC146481 2 -2.6038389 0.0003302 0.4886931
FLJ10661 2 -1.0801275 0.0003669 0.4886931
RABGGTB 21 1.1485658 0.0003707 0.4886931
ULBP3 11 -1.1513633 0.0005414 0.4886931
LIG3 21 -1.5030529 0.0006842 0.4886931
VAX2 47 -0.2413132 0.0007528 0.4886931
PCDHGB8P 9 -1.4936761 0.0007757 0.4886931
KLK10 30 -0.9561015 0.0007910 0.4886931
PAK1IP1 17 0.9541923 0.0008063 0.4886931
SNORD116-6 4 -2.0349578 0.0008536 0.4886931
ADGRE3 5 -0.7253847 0.0009441 0.4886931
PITHD1 9 -1.3180302 0.0009605 0.4886931
KHK 16 -0.8463671 0.0009606 0.4886931
LOC101929718 5 -1.8203229 0.0009695 0.4886931
SMIM7 7 -0.9490317 0.0010314 0.4886931
C21orf96 3 1.3066857 0.0011044 0.4886931
PGBD1 15 1.0282768 0.0011314 0.4886931
PDIA2 16 -0.9958923 0.0011354 0.4886931
ZNF597 17 -1.1571374 0.0011364 0.4886931
OR2D2 4 -1.8899851 0.0011632 0.4886931
LRTM2 34 -0.5260050 0.0011720 0.4886931
PIP5K1P1 1 -3.2676826 0.0011815 0.4886931
SRGAP3-AS3 1 4.5075604 0.0012416 0.4886931
PLEKHH3 30 -0.2714077 0.0012747 0.4886931
LOC101926941 18 -0.3424023 0.0013605 0.4886931
par(mfrow=c(2,1))
hist(res_gu_top$t,xlab="probe t-stat",main="neonatal Guthrie card") ; grid() ; abline(v=0,lty=1,lwd=3)
hist(res_gu_df$median,xlab="gene median t-stat",main="neonatal Guthrie card") ; grid() ; abline(v=0,lty=1,lwd=3)

pdf("hist_gu_srs.pdf")
par(mfrow=c(2,1))
hist(res_gu_top$t,xlab="probe t-stat",main="neonatal Guthrie card") ; grid() ; abline(v=0,lty=1,lwd=3)
plot(1)
hist(res_gu_df$median,xlab="gene median t-stat",main="neonatal Guthrie card") ; grid() ; abline(v=0,lty=1,lwd=3)
plot(1)
dev.off()
## png 
##   2
par(mfrow=c(1,1))

Now some visualisations.

volcano_plot(res_gu_df)

probe_bias(res_gu_df)

gmea_boxplot(res_gu,sets)

Comparison of blood and guthrie card

Now compare blood and guthrie card

res_bl_df$bl <- res_bl_df$median
res_gu_df$gu <- res_gu_df$median
m1 <- merge(res_bl_df,res_gu_df,by=0)
m2 <- m1[,c("Row.names","bl","gu")]
rownames(m2) <- m2$Row.names
m2$Row.names=NULL

plot(m2$bl,m2$gu,xlab="bl",ylab="gu",
  col = rgb(red = 0.6, green = 0.6, blue = 0.6, alpha = 0.5) ,
  pch=19,cex=0.9)
grid()
abline(v=0,lty=1,lwd=3,col="black") ; abline(h=0,lty=1,lwd=3,col="black")
mtext("aggregated t-statistic for each gene (median)")

#plot(rank(m1$bl),rank(m1$gu),xlab="bl rank",ylab="gu rank",cex=0.6,pch=19) ; grid()
rm2 <- apply(m2,2,rank)
mydiff <- apply(m2,2,function(x) { length(which(x<0)) } )
rm3 <- rm2
rm3[,1] <- rm3[,1] - mydiff[1]
rm3[,2] <- rm3[,2] - mydiff[2]
rnk <- rm3
    palette <- colorRampPalette(c("white", "yellow", "orange", "red",
        "darkred", "black"))
    xmin = min(rnk[, 1])
    xmax = max(rnk[, 1])
    ymin = min(rnk[, 2])
    ymax = max(rnk[, 2])
    k <- MASS::kde2d(rnk[, 1], rnk[, 2])
    X_AXIS = paste("Rank in contrast", colnames(rnk)[1])
    Y_AXIS = paste("Rank in contrast", colnames(rnk)[2])
    filled.contour(k, xlim = c(xmin, xmax), ylim = c(ymin, ymax), 
        color.palette = palette, plot.title = {
            abline(v = 0, h = 0, lty = 2, lwd = 2, col = "blue")
            title(main = "Rank-rank plot of all genes", xlab = X_AXIS, 
            ylab = Y_AXIS)
        })

pdf("genedens_srs.pdf")
par(mfrow=c(2,2))
plot(1)
plot(1)

plot(m2$bl,m2$gu,xlab="bl",ylab="gu",
  col = rgb(red = 0.6, green = 0.6, blue = 0.6, alpha = 0.5) ,
  pch=19,cex=0.9)
grid()
abline(v=0, h=0, lty=2, lwd=2, col="blue")
mtext("aggregated t-statistic for each gene (median)")

    filled.contour(k, xlim = c(xmin, xmax), ylim = c(ymin, ymax), 
        color.palette = palette, plot.title = {
            abline(v = 0, h = 0, lty = 2, lwd = 2, col = "blue")
            title(main = "Rank-rank plot of all genes", xlab = X_AXIS, 
            ylab = Y_AXIS)
        })

dev.off()
## png 
##   2

Now we need a table of genes with top scores.

m3 <- m2
m3$med <- apply(rnk,1,median)
mg <- merge(res_gu_df,res_bl_df,by=0)
mg$gu = mg$bl = NULL
mg$med <- m3$med
mg2 <- data.frame(mg,rnk)
mg2$med <- apply(rnk,1,median)
mg2 <- mg2[order(mg2$med),]
mg2 <-subset(mg2,nprobes.x>=5)

head(mg2,20) %>%
  kbl(caption="Genes with coordinated hypomethylation in neonatal Guthrie card and blood at assessment") %>%
  kable_paper("hover", full_width = F)
Genes with coordinated hypomethylation in neonatal Guthrie card and blood at assessment
Row.names nprobes.x median.x PValue.x FDR.x nprobes.y median.y PValue.y FDR.y med bl gu
LOC102467212 LOC102467212 5 -2.524296 0.0318950 0.4886931 5 -2.569818 0.0471187 0.2865860 -21435.0 -21723 -21147
UGT2B4 UGT2B4 5 -2.178673 0.0387101 0.4886931 5 -2.765371 0.0472124 0.2865860 -21376.5 -21810 -20943
MCART2 MCART2 5 -2.613084 0.0212015 0.4886931 5 -2.380534 0.0459839 0.2865860 -21371.0 -21585 -21157
SNORD114-24 SNORD114-24 6 -2.314620 0.0616035 0.4886931 7 -2.462636 0.0421217 0.2865860 -21354.5 -21662 -21047
KLHL41 KLHL41 6 -2.459503 0.0292898 0.4886931 6 -2.356337 0.0400779 0.2865860 -21342.0 -21560 -21124
SNORD18C SNORD18C 5 -2.283179 0.0458124 0.4886931 5 -2.441662 0.0402497 0.2865860 -21332.0 -21637 -21027
MIR1251 MIR1251 6 -2.130960 0.0494742 0.4886931 6 -2.568050 0.0389387 0.2865860 -21308.0 -21721 -20895
OR51L1 OR51L1 6 -2.147516 0.0754485 0.4886931 7 -2.514377 0.0564392 0.2865860 -21308.0 -21700 -20916
NAALADL2-AS2 NAALADL2-AS2 7 -2.103600 0.0538177 0.4886931 7 -2.599232 0.0054402 0.2865860 -21302.0 -21736 -20868
KRTAP3-1 KRTAP3-1 7 -2.516892 0.0500910 0.4886931 7 -2.239282 0.0278460 0.2865860 -21265.5 -21388 -21143
OSTN OSTN 8 -2.162278 0.0857883 0.4886931 8 -2.348182 0.0171933 0.2865860 -21243.0 -21553 -20933
MUC7 MUC7 11 -2.066699 0.0259471 0.4886931 11 -2.458572 0.0182413 0.2865860 -21241.0 -21656 -20826
SNORD127 SNORD127 5 -2.134110 0.0814640 0.4886931 6 -2.370252 0.0361055 0.2865860 -21237.0 -21572 -20902
ANXA10 ANXA10 15 -2.055298 0.1010785 0.4886931 15 -2.386332 0.0325550 0.2865860 -21202.0 -21591 -20813
SAMSN1-AS1 SAMSN1-AS1 12 -2.254566 0.0536639 0.4886931 12 -2.245134 0.0451714 0.2865860 -21201.5 -21398 -21005
LINC00613 LINC00613 5 -1.917017 0.0424680 0.4886931 6 -2.659818 0.0103418 0.2865860 -21189.0 -21761 -20617
LOC102723895 LOC102723895 5 -2.061583 0.0699891 0.4886931 5 -2.340576 0.0414558 0.2865860 -21183.5 -21545 -20822
SMR3B SMR3B 7 -2.025399 0.1021372 0.4886931 8 -2.386086 0.0126740 0.2865860 -21183.0 -21590 -20776
MIR597 MIR597 6 -2.046555 0.0547913 0.4886931 6 -2.350004 0.0772068 0.2887317 -21180.5 -21554 -20807
PTH PTH 5 -1.878701 0.0549941 0.4886931 6 -2.680792 0.0139512 0.2865860 -21153.5 -21773 -20534
mg2 <- mg2[order(-mg2$med),]

head(mg2,20) %>%
  kbl(caption="Genes with coordinated hypermethylation in neonatal Guthrie card and blood at assessment") %>%
  kable_paper("hover", full_width = F)
Genes with coordinated hypermethylation in neonatal Guthrie card and blood at assessment
Row.names nprobes.x median.x PValue.x FDR.x nprobes.y median.y PValue.y FDR.y med bl gu
POLR2K POLR2K 12 1.5974501 0.0216809 0.4886931 12 1.6304812 0.1551145 0.3294182 4095.50 3746 4445.0
LOC401397 LOC401397 5 1.3663313 0.0886975 0.4886931 5 1.4053311 0.1937340 0.3619354 4047.00 3684 4410.0
CLEC2B CLEC2B 15 1.2678357 0.1279567 0.4886931 20 1.3708202 0.3767317 0.5275516 4028.00 3670 4386.0
BPNT1 BPNT1 15 1.2506800 0.0574379 0.4886931 15 1.3462461 0.1593833 0.3332763 4020.00 3664 4376.0
CD69 CD69 7 1.0151016 0.1777060 0.4973179 7 1.7931356 0.0630855 0.2866236 4015.50 3774 4257.0
FLJ35776 FLJ35776 9 1.7095167 0.0198563 0.4886931 9 1.1161680 0.0971776 0.2955616 4008.50 3565 4452.0
C9orf23 C9orf23 5 0.9415172 0.0853870 0.4886931 5 1.7638443 0.0947240 0.2945822 3997.00 3771 4223.0
LOC100507506 LOC100507506 5 1.1556880 0.0344334 0.4886931 6 1.2427962 0.0049753 0.2865860 3984.75 3624 4345.5
LINC01578 LINC01578 7 0.8530072 0.0949374 0.4886931 10 1.9361697 0.1281344 0.3123626 3972.00 3791 4153.0
CARD6 CARD6 5 1.3174977 0.2218945 0.5171994 7 1.0164555 0.5445156 0.6849372 3953.00 3508 4398.0
MMRN1 MMRN1 9 1.0787404 0.0895686 0.4886931 11 1.0654602 0.1464934 0.3241524 3918.50 3538 4299.0
HIST1H2BM HIST1H2BM 5 0.9559987 0.6290709 0.8217834 5 1.1911537 0.2331937 0.3964901 3918.00 3605 4231.0
PGBD1 PGBD1 15 1.0282768 0.0011314 0.4886931 15 1.1214455 0.3608782 0.5137914 3917.50 3568 4267.0
HIST1H2AJ HIST1H2AJ 6 1.0234577 0.1924258 0.5026517 6 1.1043844 0.2436277 0.4052175 3911.00 3558 4264.0
LOC101928068 LOC101928068 12 1.1152349 0.0208072 0.4886931 12 0.9822378 0.3031278 0.4593669 3900.00 3483 4317.0
KRT10 KRT10 14 0.7866576 0.7227387 0.8894790 15 1.2882296 0.4767578 0.6219786 3866.00 3639 4093.0
MIR484 MIR484 9 0.9312055 0.2361829 0.5244028 9 1.0258556 0.1504667 0.3270610 3865.50 3514 4217.0
LOC101928489 LOC101928489 7 0.8210244 0.1200014 0.4886931 9 1.1482797 0.1697430 0.3420855 3853.00 3585 4121.0
LINC01431 LINC01431 5 0.8192003 0.3553782 0.6091180 5 1.1413824 0.3648748 0.5171674 3849.00 3580 4118.0
LRRIQ1 LRRIQ1 18 0.8055837 0.1873937 0.5009177 19 1.1270223 0.1762698 0.3472203 3840.50 3573 4108.0

Gene set level enrichment

genesets <- gmt_import("ReactomePathways.gmt")

cores = 8

Mitch enrichment analysis

Here I’m using the median t-statistic for downstream enrichment. Pathways are from REACTOME and analysis using mitch.

res_bl_df$bl <- res_bl_df$median
res_gu_df$gu <- res_gu_df$median

bl <-  res_bl_df[,"bl",drop=FALSE]
mitch_bl <- mitch_calc(bl,genesets,priority="effect")
head( mitch_bl$enrichment_result,30) %>%
  kbl(caption="BLOOD: Top effect size pathways found with mitch") %>%
  kable_paper("hover", full_width = F)

sig <- subset(mitch_bl$enrichment_result,`p.adjustANOVA`<0.05)
head(sig[order(-abs(sig$s.dist)),],30) %>%
  kbl(caption="BLOOD: Top effect size pathways found with mitch after 1% FDR filtering") %>%
  kable_paper("hover", full_width = F)

gu <-  res_gu_df[,"gu",drop=FALSE]
mitch_gu <- mitch_calc(gu,genesets,priority="effect")
head( mitch_gu$enrichment_result,30) %>%
  kbl(caption="GUTHRIE: Top effect size pathways found with mitch") %>%
  kable_paper("hover", full_width = F)

sig <- subset(mitch_gu$enrichment_result,`p.adjustANOVA`<0.05)
head(sig[order(-abs(sig$s.dist)),],30) %>%
  kbl(caption="GUTHRIE: Top effect size pathways found with mitch after 1% FDR filtering") %>%
  kable_paper("hover", full_width = F)

Mitch joint analysis of blood and guthrie card. As using the median value is apparently the most sensitive, we’ll go with that.

m <- merge(res_bl_df,res_gu_df,by=0)
rownames(m) <- m$Row.names
m2 <- m[,c("bl","gu")]
m2[is.na(m2)] <- 0
res <- mitch_calc(m2, genesets, priority="effect")
## Note: Enrichments with large effect sizes may not be 
##             statistically significant.
mitch_report(res,"blgu_mitch.html",overwrite=TRUE)
## Note: overwriting existing report
## Dataset saved as " /tmp/RtmpJmgNI1/blgu_mitch.rds ".
## 
## 
## processing file: mitch.Rmd
## 
  |                                               
  |                                         |   0%
  |                                               
  |.                                        |   3%                             
  |                                               
  |..                                       |   6% (checklibraries)            
  |                                               
  |....                                     |   9%                             
  |                                               
  |.....                                    |  12% (peek)                      
  |                                               
  |......                                   |  15%                             
  |                                               
  |.......                                  |  18% (metrics)                   
  |                                               
  |........                                 |  21%                             
  |                                               
  |..........                               |  24% (scatterplot)               
  |                                               
  |...........                              |  26%                             
  |                                               
  |............                             |  29% (contourplot)               
  |                                               
  |.............                            |  32%                             
  |                                               
  |..............                           |  35% (input_geneset_metrics1)    
  |                                               
  |................                         |  38%                             
  |                                               
  |.................                        |  41% (input_geneset_metrics2)    
  |                                               
  |..................                       |  44%                             
  |                                               
  |...................                      |  47% (input_geneset_metrics3)    
  |                                               
  |....................                     |  50%                             
  |                                               
  |......................                   |  53% (echart1d)                  
  |                                               
  |.......................                  |  56% (echart2d)                  
  |                                               
  |........................                 |  59%                             
  |                                               
  |.........................                |  62% (heatmap)                   
  |                                               
  |...........................              |  65%                             
  |                                               
  |............................             |  68% (effectsize)                
  |                                               
  |.............................            |  71%                             
  |                                               
  |..............................           |  74% (results_table)             
  |                                               
  |...............................          |  76%                             
  |                                               
  |.................................        |  79% (results_table_complete)    
  |                                               
  |..................................       |  82%                             
  |                                               
  |...................................      |  85% (detailed_geneset_reports1d)
  |                                               
  |....................................     |  88%                             
  |                                               
  |.....................................    |  91% (detailed_geneset_reports2d)
  |                                               
  |.......................................  |  94%                             
  |                                               
  |........................................ |  97% (session_info)              
  |                                               
  |.........................................| 100%                             
## output file: /home/mdz/projects/asd_meth/mitch.knit.md
## /usr/bin/pandoc +RTS -K512m -RTS /home/mdz/projects/asd_meth/mitch.knit.md --to html4 --from markdown+autolink_bare_uris+tex_math_single_backslash --output /tmp/RtmpJmgNI1/mitch_report.html --lua-filter /usr/local/lib/R/site-library/rmarkdown/rmarkdown/lua/pagebreak.lua --lua-filter /usr/local/lib/R/site-library/rmarkdown/rmarkdown/lua/latex-div.lua --self-contained --variable bs3=TRUE --section-divs --template /usr/local/lib/R/site-library/rmarkdown/rmd/h/default.html --no-highlight --variable highlightjs=1 --variable theme=bootstrap --mathjax --variable 'mathjax-url=https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML' --include-in-header /tmp/RtmpJmgNI1/rmarkdown-str3a6ca0151f15f9.html
## 
## Output created: /tmp/RtmpJmgNI1/mitch_report.html
## [1] TRUE
mitch_plots(res=res,outfile="blgu_mitch_srs.pdf")
## png 
##   2
sig <- subset(res$enrichment_result,p.adjustMANOVA<0.01)

head(sig[order(-abs(sig$s.dist)),],30) %>%
  kbl(caption="Top pathways found with mitch (effect size after 1% FDR filtering)") %>%
  kable_paper("hover", full_width = F)
Top pathways found with mitch (effect size after 1% FDR filtering)
set setSize pMANOVA s.bl s.gu p.bl p.gu s.dist SD p.adjustMANOVA
212 Class C/3 (Metabotropic glutamate/pheromone receptors) 39 0.0000000 -0.5451502 -0.5830210 0.0000000 0.0000000 0.7981868 0.0267787 0.0000000
1163 SARS-CoV-1 modulates host translation machinery 34 0.0000000 0.5282872 0.4611332 0.0000001 0.0000032 0.7012355 0.0474850 0.0000006
516 Glucuronidation 25 0.0000130 -0.4604492 -0.4625853 0.0000672 0.0000621 0.6526857 0.0015105 0.0001826
1215 Sensory perception of sweet, bitter, and umami (glutamate) taste 41 0.0000000 -0.4609162 -0.4402584 0.0000003 0.0000011 0.6373941 0.0146073 0.0000007
399 Expression and translocation of olfactory receptors 356 0.0000000 -0.2941643 -0.5554542 0.0000000 0.0000000 0.6285395 0.1847599 0.0000000
903 Peptide chain elongation 84 0.0000000 0.4654301 0.4106251 0.0000000 0.0000000 0.6206756 0.0387530 0.0000000
853 Olfactory Signaling Pathway 363 0.0000000 -0.2896102 -0.5460811 0.0000000 0.0000000 0.6181251 0.1813523 0.0000000
882 PINK1-PRKN Mediated Mitophagy 21 0.0002321 0.3880781 0.4708050 0.0020759 0.0001872 0.6101327 0.0584967 0.0022567
395 Eukaryotic Translation Elongation 88 0.0000000 0.4600153 0.3959829 0.0000000 0.0000000 0.6069733 0.0452778 0.0000000
397 Eukaryotic Translation Termination 87 0.0000000 0.4677258 0.3850339 0.0000000 0.0000000 0.6058206 0.0584720 0.0000000
1496 Viral mRNA Translation 84 0.0000000 0.4489998 0.3894230 0.0000000 0.0000000 0.5943493 0.0421271 0.0000000
1145 Response of EIF2AK4 (GCN2) to amino acid deficiency 95 0.0000000 0.4456546 0.3884211 0.0000000 0.0000000 0.5911674 0.0404702 0.0000000
1208 Selenocysteine synthesis 87 0.0000000 0.4537698 0.3787435 0.0000000 0.0000000 0.5910615 0.0530516 0.0000000
829 Nonsense Mediated Decay (NMD) independent of the Exon Junction Complex (EJC) 89 0.0000000 0.4534486 0.3664661 0.0000000 0.0000000 0.5830206 0.0615059 0.0000000
13 APC/C:Cdc20 mediated degradation of Cyclin B 24 0.0001698 0.3255326 0.4704269 0.0057622 0.0000660 0.5720777 0.1024558 0.0017045
1169 SARS-CoV-2 modulates host translation machinery 46 0.0000002 0.4010035 0.4027964 0.0000025 0.0000023 0.5683738 0.0012677 0.0000038
449 Formation of a pool of free 40S subunits 94 0.0000000 0.4143879 0.3740486 0.0000000 0.0000000 0.5582380 0.0285242 0.0000000
455 Formation of the ternary complex, and subsequently, the 43S complex 46 0.0000005 0.3901543 0.3840585 0.0000047 0.0000065 0.5474681 0.0043104 0.0000101
1187 SRP-dependent cotranslational protein targeting to membrane 105 0.0000000 0.4117743 0.3583721 0.0000000 0.0000000 0.5458833 0.0377611 0.0000000
11 APC-Cdc20 mediated degradation of Nek2A 26 0.0002838 0.3320658 0.4252792 0.0033750 0.0001738 0.5395647 0.0659119 0.0027084
753 Mitophagy 27 0.0001868 0.3116696 0.4385362 0.0050531 0.0000797 0.5380074 0.0897082 0.0018629
493 GTP hydrolysis and joining of the 60S ribosomal subunit 104 0.0000000 0.4096352 0.3482938 0.0000000 0.0000000 0.5376891 0.0433749 0.0000000
1216 Sensory perception of taste 47 0.0000009 -0.3766502 -0.3721610 0.0000079 0.0000101 0.5294990 0.0031744 0.0000182
1207 Selenoamino acid metabolism 103 0.0000000 0.4053365 0.3379090 0.0000000 0.0000000 0.5277122 0.0476784 0.0000000
663 L13a-mediated translational silencing of Ceruloplasmin expression 103 0.0000000 0.3922314 0.3478971 0.0000000 0.0000000 0.5242880 0.0313491 0.0000000
828 Nonsense Mediated Decay (NMD) enhanced by the Exon Junction Complex (EJC) 108 0.0000000 0.4089723 0.3178832 0.0000000 0.0000000 0.5179846 0.0644098 0.0000000
830 Nonsense-Mediated Decay (NMD) 108 0.0000000 0.4089723 0.3178832 0.0000000 0.0000000 0.5179846 0.0644098 0.0000000
703 Major pathway of rRNA processing in the nucleolus and cytosol 171 0.0000000 0.3931928 0.3320525 0.0000000 0.0000000 0.5146450 0.0432328 0.0000000
165 Cap-dependent Translation Initiation 111 0.0000000 0.4008522 0.3214529 0.0000000 0.0000000 0.5138234 0.0561438 0.0000000
396 Eukaryotic Translation Initiation 111 0.0000000 0.4008522 0.3214529 0.0000000 0.0000000 0.5138234 0.0561438 0.0000000

Camera enrichment analysis

Camera is designed to control for correlation bias. Using the median value gives better results than the directional significance or the combined metric. Blood dataset. Need to filter by FDR then sort by effect size.

stat <- res_bl_df$median
names(stat) <- rownames(res_bl_df)
stat[is.na(stat)] <- 0

tic()
cres <- cameraPR(statistic=stat, index=genesets, use.ranks = FALSE, inter.gene.cor=0.01, sort = TRUE)
cres <- subset(cres,NGenes>4)
cres$FDR <- p.adjust(cres$PValue,method="fdr")
mymedians <- sapply(1:nrow(cres),function(i) {
  myset <- genesets[[which(names(genesets) == rownames(cres)[i])]]
  mystats <- stat[names(stat) %in% myset]
  median(mystats)
})
cres$median <- mymedians
toc() # 1.0 sec
## 3.16 sec elapsed
sig <- subset(cres,FDR<0.05)

head(sig[order(-abs(sig$median)),],20) %>%
  kbl(caption = "BLOOD: Top effect size pathways found with CAMERA after 5% FDR filtering") %>%
  kable_paper("hover", full_width = F)
BLOOD: Top effect size pathways found with CAMERA after 5% FDR filtering
NGenes Direction PValue FDR median
Class C/3 (Metabotropic glutamate/pheromone receptors) 39 Down 0.0000000 0.0000057 -1.2258376
Expression and translocation of olfactory receptors 359 Down 0.0000000 0.0000190 -1.1896032
Olfactory Signaling Pathway 366 Down 0.0000000 0.0000212 -1.1200292
Sensory perception of sweet, bitter, and umami (glutamate) taste 41 Down 0.0000006 0.0002748 -1.0639136
Glucuronidation 25 Down 0.0000240 0.0018509 -1.0364457
Sensory perception of taste 47 Down 0.0000366 0.0026212 -0.8953911
Aspirin ADME 44 Down 0.0001047 0.0067418 -0.8876389
Sensory Perception 570 Down 0.0000025 0.0006145 -0.8639039
mRNA Splicing 184 Up 0.0006322 0.0339260 -0.3679321
Metabolism of RNA 639 Up 0.0002738 0.0160316 -0.3612638
Translation 262 Up 0.0000225 0.0018133 -0.2951548
Influenza Infection 148 Up 0.0001764 0.0106512 -0.2896168
Cellular response to starvation 147 Up 0.0005261 0.0298921 -0.2731911
Regulation of expression of SLITs and ROBOs 159 Up 0.0000942 0.0062748 -0.2706523
Influenza Viral RNA Transcription and Replication 129 Up 0.0001203 0.0074956 -0.2338972
rRNA processing 186 Up 0.0000147 0.0013409 -0.2119498
Selenoamino acid metabolism 103 Up 0.0000160 0.0013409 -0.2013123
rRNA processing in the nucleus and cytosol 180 Up 0.0000073 0.0010092 -0.2000303
Cap-dependent Translation Initiation 111 Up 0.0000142 0.0013409 -0.1987482
Eukaryotic Translation Initiation 111 Up 0.0000142 0.0013409 -0.1987482
nrow(subset(cres,FDR<0.05 & Direction=="Up"))
## [1] 28
nrow(subset(cres,FDR<0.05 & Direction=="Down"))
## [1] 8
cres_bl <- cres

Now drilling down to the top set.

myset <- rownames(head(sig[order(-abs(sig$median)),],1))
myset_members <- genesets[[which(names(genesets) == myset)]]
mystats <- stat[which(names(stat) %in% myset_members)]
myl <- list("allgenes"=stat,myset=mystats)
boxplot(myl,cex=0)

library(vioplot)
vioplot(myl,main=myset)

mygenes <- head(mystats[order(mystats)],5)
mygenes
##   TAS2R43   TAS2R42   TAS2R10   TAS2R16   TAS2R31 
## -3.812601 -2.459073 -2.338374 -2.231421 -2.230518
myprobes <- unique(unlist(sets[which(names(sets) %in% names(mygenes))]))
myprobedat <- res_bl_top[which(rownames(res_bl_top) %in% myprobes),]

myprobedat %>%
  kbl(caption = "Probe data for top 5 genes") %>%
  kable_paper("hover", full_width = F)
Probe data for top 5 genes
logFC AveExpr t P.Value adj.P.Val B
cg25106783 -0.0119247 1.725882 -3.812601 0.0023820 0.3811320 -2.607251
cg21772733 -0.0257518 2.680955 -3.585432 0.0036252 0.3811320 -3.035151
cg19660744 -0.0159362 2.355372 -3.327009 0.0058693 0.3811320 -3.522686
cg04365308 -0.0191481 1.694837 -3.000682 0.0108206 0.3811320 -4.135243
cg25023291 -0.0171509 3.523094 -2.579486 0.0237744 0.3811320 -4.909667
cg19120125 -0.0194008 3.503665 -2.497526 0.0276721 0.3811320 -5.056775
cg23693756 -0.0209881 4.191329 -2.411825 0.0324078 0.3811320 -5.208933
cg03640110 -0.0065665 2.806709 -2.340778 0.0369167 0.3811320 -5.333636
cg11820113 -0.0284144 3.078575 -2.338660 0.0370599 0.3811320 -5.337332
cg27595123 -0.0196442 3.432135 -2.231421 0.0450474 0.3811320 -5.522736
cg03138863 -0.0182417 3.697740 -2.179221 0.0495026 0.3811320 -5.611665
cg04262640 0.0104372 1.687506 2.112945 0.0557586 0.3812237 -5.723225
cg02647103 -0.0083841 4.087822 -2.049212 0.0624666 0.3836623 -5.828983
cg20769111 -0.0036359 3.899344 -1.150929 0.2717120 0.5909978 -7.098949
cg23719342 0.0037343 2.657664 1.144235 0.2743727 0.5931892 -7.106438
cg12150401 0.0062786 2.620843 1.082521 0.2998506 0.6147949 -7.173796

Guthrie dataset.

stat <- res_gu_df$median
names(stat) <- rownames(res_gu_df)
stat[is.na(stat)] <- 0

tic()
cres <- cameraPR(statistic=stat, index=genesets, use.ranks = FALSE, inter.gene.cor=0.01, sort = TRUE)
cres <- subset(cres,NGenes>4)
cres$FDR <- p.adjust(cres$PValue,method="fdr")
mymedians <- sapply(1:nrow(cres),function(i) {
  myset <- genesets[[which(names(genesets) == rownames(cres)[i])]]
  mystats <- stat[names(stat) %in% myset]
  median(mystats)
})
cres$median <- mymedians
toc() # 1.0 sec
## 3.093 sec elapsed
sig <- subset(cres,FDR<0.05)

head(sig[order(-abs(sig$median)),],20) %>%
  kbl(caption = "GUTHRIE: Top effect size pathways found with CAMERA after 5% FDR filtering") %>%
  kable_paper("hover", full_width = F)
GUTHRIE: Top effect size pathways found with CAMERA after 5% FDR filtering
NGenes Direction PValue FDR median
Expression and translocation of olfactory receptors 356 Down 0.0000000 0.0000000 -1.3164834
Olfactory Signaling Pathway 363 Down 0.0000000 0.0000000 -1.2999375
Class C/3 (Metabotropic glutamate/pheromone receptors) 39 Down 0.0000000 0.0000002 -1.2347573
Sensory perception of sweet, bitter, and umami (glutamate) taste 41 Down 0.0000010 0.0003698 -1.0700847
Sensory Perception 567 Down 0.0000000 0.0000000 -1.0248355
Beta defensins 28 Down 0.0009813 0.0309614 -1.0050158
Glucuronidation 25 Down 0.0002296 0.0136609 -0.9829157
Aspirin ADME 44 Down 0.0001249 0.0113768 -0.9594742
Sensory perception of taste 47 Down 0.0000149 0.0047837 -0.8245901
Keratinization 213 Down 0.0001524 0.0117795 -0.7342223
Metabolism of RNA 639 Up 0.0003528 0.0158497 -0.2763744
Processing of Capped Intron-Containing Pre-mRNA 232 Up 0.0003778 0.0158680 -0.2669994
mRNA Splicing - Major Pathway 174 Up 0.0004265 0.0171658 -0.2555693
Signaling by ROBO receptors 203 Up 0.0016786 0.0444245 -0.2506617
mRNA Splicing 184 Up 0.0001875 0.0132786 -0.2504909
Influenza Infection 148 Up 0.0002333 0.0136609 -0.2325273
Cellular response to starvation 147 Up 0.0002830 0.0147845 -0.2245432
TNFR2 non-canonical NF-kB pathway 94 Up 0.0015462 0.0420748 -0.2240555
rRNA processing 186 Up 0.0001354 0.0113768 -0.2216474
Translation 262 Up 0.0000694 0.0083851 -0.2208027
cres_gu <- cres

Compare CAMERA data

Compare median set score for both contrasts. In the graph below, the larger points represent larger gene sets. Red fill means significant in blood contrast. Blue ring means significant in guthrie contrast.

m4 <- merge(cres_bl,cres_gu,by=0)
sigx <- subset(m4,FDR.x<0.05)
sigy <- subset(m4,FDR.y<0.05)

plot(m4$median.x,m4$median.y,col="gray",pch=19,cex=log10(m4$NGenes.x),
  xlab="blood",ylab="Guthrie",main="median t-stat")
grid()
points(sigx$median.x,sigx$median.y,col="red",pch=19,cex=log10(sigx$NGenes.x))
points(sigy$median.x,sigy$median.y,col="blue",pch=1,cex=log10(sigy$NGenes.x))
abline(v=0,h=0,lty=2,lwd=3)

par(mfrow=c(1,1))
pdf("camera_pw_srs.pdf")

plot(m4$median.x,m4$median.y,col="gray",pch=19,cex=log10(m4$NGenes.x),
  xlab="blood",ylab="Guthrie",main="median t-stat")
grid()
points(sigx$median.x,sigx$median.y,col="red",pch=19,cex=log10(sigx$NGenes.x))
points(sigy$median.x,sigy$median.y,col="blue",pch=1,cex=log10(sigy$NGenes.x))
abline(v=0,h=0,lty=2,lwd=3)

dev.off()
## png 
##   2
subset(m4,FDR.x<0.05 & FDR.y<0.05 ) %>%
  kbl(caption = "Pathways significant in blood (x) and guthrie cards (y) at 5%FDR") %>%
  kable_paper("hover", full_width = F)
Pathways significant in blood (x) and guthrie cards (y) at 5%FDR
Row.names NGenes.x Direction.x PValue.x FDR.x median.x NGenes.y Direction.y PValue.y FDR.y median.y
128 Aspirin ADME 44 Down 0.0001047 0.0067418 -0.8876389 44 Down 0.0001249 0.0113768 -0.9594742
198 Cap-dependent Translation Initiation 111 Up 0.0000142 0.0013409 -0.1987482 111 Up 0.0002531 0.0139721 -0.1904512
235 Cellular response to starvation 147 Up 0.0005261 0.0298921 -0.2731911 147 Up 0.0002830 0.0147845 -0.2245432
262 Class C/3 (Metabotropic glutamate/pheromone receptors) 39 Down 0.0000000 0.0000057 -1.2258376 39 Down 0.0000000 0.0000002 -1.2347573
518 Eukaryotic Translation Elongation 88 Up 0.0000017 0.0005528 -0.1602255 88 Up 0.0000311 0.0066684 -0.1161857
519 Eukaryotic Translation Initiation 111 Up 0.0000142 0.0013409 -0.1987482 111 Up 0.0002531 0.0139721 -0.1904512
520 Eukaryotic Translation Termination 87 Up 0.0000020 0.0005528 -0.1595887 87 Up 0.0000490 0.0078878 -0.1576043
522 Expression and translocation of olfactory receptors 359 Down 0.0000000 0.0000190 -1.1896032 356 Down 0.0000000 0.0000000 -1.3164834
566 Formation of a pool of free 40S subunits 94 Up 0.0000157 0.0013409 -0.1872828 94 Up 0.0000685 0.0083851 -0.1161857
652 Glucuronidation 25 Down 0.0000240 0.0018509 -1.0364457 25 Down 0.0002296 0.0136609 -0.9829157
683 GTP hydrolysis and joining of the 60S ribosomal subunit 104 Up 0.0000113 0.0012171 -0.1949544 104 Up 0.0001122 0.0113768 -0.1677858
747 Influenza Infection 148 Up 0.0001764 0.0106512 -0.2896168 148 Up 0.0002333 0.0136609 -0.2325273
748 Influenza Viral RNA Transcription and Replication 129 Up 0.0001203 0.0074956 -0.2338972 129 Up 0.0001351 0.0113768 -0.1904512
838 L13a-mediated translational silencing of Ceruloplasmin expression 103 Up 0.0000262 0.0019493 -0.1987482 103 Up 0.0001181 0.0113768 -0.1627180
874 Major pathway of rRNA processing in the nucleolus and cytosol 171 Up 0.0000039 0.0006292 -0.1962573 171 Up 0.0000405 0.0078289 -0.1887912
925 Metabolism of RNA 639 Up 0.0002738 0.0160316 -0.3612638 639 Up 0.0003528 0.0158497 -0.2763744
976 mRNA Splicing 184 Up 0.0006322 0.0339260 -0.3679321 184 Up 0.0001875 0.0132786 -0.2504909
1048 Nonsense Mediated Decay (NMD) enhanced by the Exon Junction Complex (EJC) 108 Up 0.0000113 0.0012171 -0.1975028 108 Up 0.0003413 0.0157014 -0.1844352
1049 Nonsense Mediated Decay (NMD) independent of the Exon Junction Complex (EJC) 89 Up 0.0000035 0.0006223 -0.1595887 89 Up 0.0000974 0.0104507 -0.1627180
1050 Nonsense-Mediated Decay (NMD) 108 Up 0.0000113 0.0012171 -0.1975028 108 Up 0.0003413 0.0157014 -0.1844352
1096 Olfactory Signaling Pathway 366 Down 0.0000000 0.0000212 -1.1200292 363 Down 0.0000000 0.0000000 -1.2999375
1137 Peptide chain elongation 84 Up 0.0000020 0.0005528 -0.1472451 84 Up 0.0000216 0.0059662 -0.1046895
1305 Regulation of expression of SLITs and ROBOs 159 Up 0.0000942 0.0062748 -0.2706523 159 Up 0.0001467 0.0117795 -0.2042136
1373 Response of EIF2AK4 (GCN2) to amino acid deficiency 95 Up 0.0000031 0.0006223 -0.1608623 95 Up 0.0000252 0.0060894 -0.1576043
1455 rRNA processing 186 Up 0.0000147 0.0013409 -0.2119498 186 Up 0.0001354 0.0113768 -0.2216474
1457 rRNA processing in the nucleus and cytosol 180 Up 0.0000073 0.0010092 -0.2000303 180 Up 0.0000548 0.0081486 -0.1952109
1485 SARS-CoV-1 modulates host translation machinery 34 Up 0.0000436 0.0030115 -0.0347767 34 Up 0.0001977 0.0132786 -0.0925408
1491 SARS-CoV-2 modulates host translation machinery 46 Up 0.0005773 0.0318688 -0.1602255 46 Up 0.0003976 0.0163437 -0.1614991
1502 Selenoamino acid metabolism 103 Up 0.0000160 0.0013409 -0.2013123 103 Up 0.0002067 0.0132786 -0.1784191
1503 Selenocysteine synthesis 87 Up 0.0000034 0.0006223 -0.1608623 87 Up 0.0000630 0.0083851 -0.1576043
1512 Sensory Perception 570 Down 0.0000025 0.0006145 -0.8639039 567 Down 0.0000000 0.0000000 -1.0248355
1514 Sensory perception of sweet, bitter, and umami (glutamate) taste 41 Down 0.0000006 0.0002748 -1.0639136 41 Down 0.0000010 0.0003698 -1.0700847
1515 Sensory perception of taste 47 Down 0.0000366 0.0026212 -0.8953911 47 Down 0.0000149 0.0047837 -0.8245901
1659 SRP-dependent cotranslational protein targeting to membrane 105 Up 0.0000110 0.0012171 -0.1731639 105 Up 0.0000792 0.0090046 -0.1627180
1831 Translation 262 Up 0.0000225 0.0018133 -0.2951548 262 Up 0.0000694 0.0083851 -0.2208027
1905 Viral mRNA Translation 84 Up 0.0000048 0.0007102 -0.1602255 84 Up 0.0000484 0.0078878 -0.1383075

make scatterplots of the top sets

XMAX=max(res$ranked_profile[,1])
XMIN=min(res$ranked_profile[,1])
YMAX=max(res$ranked_profile[,2])
YMIN=min(res$ranked_profile[,2])
plot(res$detailed_sets[[1]], pch=19,cex=2,col="lightgray",
  xlim=c(XMIN,XMAX),ylim=c(YMIN,YMAX),
  xlab="blood",ylab="Guthrie")
text(res$detailed_sets[[1]],labels=rownames(res$detailed_sets[[1]]))
mtext(names(res$detailed_sets)[[1]])
grid() ; abline(v=0,h=0,lty=2,lwd=3)

pdf("c3_metabotropic_srs.pdf")

plot(res$detailed_sets[[1]], pch=19,cex=2,col="lightgray",
  xlim=c(XMIN,XMAX),ylim=c(YMIN,YMAX), 
  xlab="blood",ylab="Guthrie")
text(res$detailed_sets[[1]],labels=rownames(res$detailed_sets[[1]]))
mtext(names(res$detailed_sets)[[1]])
grid() ; abline(v=0,h=0,lty=2,lwd=3)

dev.off()
## png 
##   2

Distinguishing ASD from birth

Here I will use MDS plot to cluster samples.

First I will use all probes, then I will focus on a subset of candidate probes.

plot(cmdscale(dist(t(bl_mvals))), xlab="Coordinate 1", ylab="Coordinate 2",
  type = "n",main="Blood")
mtext("Numbers indicate SRS score")
text(cmdscale(dist(t(bl_mvals))), labels=bl_design[,ncol(bl_design)], )

plot(cmdscale(dist(t(gu_mvals))), xlab="Coordinate 1", ylab="Coordinate 2", 
  type = "n",main="Guthrie card")
mtext("Numbers indicate SRS score")
text(cmdscale(dist(t(gu_mvals))), labels=gu_design[,ncol(gu_design)], )

There is no apparent clustering by SRS score, indicating a subtle effect.

Now I will examine the probes associated with the pathway of interest.

myprobes
##  [1] "cg03138863" "cg19120125" "cg19660744" "cg23719342" "cg04262640"
##  [6] "cg03640110" "cg21772733" "cg12150401" "cg27595123" "cg02647103"
## [11] "cg23693756" "cg21752292" "cg06352932" "cg08555389" "cg23146713"
## [16] "cg22505295" "cg25023291" "cg20769111" "cg11820113" "cg04365308"
## [21] "cg03989876" "cg20418818" "cg17773562" "cg25106783"
myprobedat <- bl_mvals[which(rownames(bl_mvals) %in% myprobes),]
plot(cmdscale(dist(t(myprobedat))), xlab="Coordinate 1", ylab="Coordinate 2",
  type = "n",main="Blood")
text(cmdscale(dist(t(myprobedat))), labels=bl_design[,ncol(bl_design)], )

myprobedat <- gu_mvals[which(rownames(gu_mvals) %in% myprobes),]
plot(cmdscale(dist(t(myprobedat))), xlab="Coordinate 1", ylab="Coordinate 2",
  type = "n",main="Guthrie card")
text(cmdscale(dist(t(myprobedat))), labels=gu_design[,ncol(gu_design)], )

Again, there is no apparent clustering by SRS score. This indicates that the top probes cannot be used diagnostically.

myprobes2 <- head(bl_lim$Name,50)
myprobedat2 <- bl_mvals[which(rownames(bl_mvals) %in% myprobes2),]
plot(cmdscale(dist(t(myprobedat2))), xlab="Coordinate 1", ylab="Coordinate 2",
  type = "n",main="Blood")
text(cmdscale(dist(t(myprobedat2))), labels=bl_design[,ncol(bl_design)], )

myprobes2 <- head(gu_lim$Name,50)
myprobedat2 <- gu_mvals[which(rownames(gu_mvals) %in% myprobes2),]
plot(cmdscale(dist(t(myprobedat2))), xlab="Coordinate 1", ylab="Coordinate 2",
  type = "n", main="Guthrie")
text(cmdscale(dist(t(myprobedat2))), labels=gu_design[,ncol(gu_design)], )

MDS analysis of top probes identified without correction for covariates.

bl_design2 <- bl_design[,c(1,10)]
fit <- lmFit(bl_mvals, bl_design2)
fit <- eBayes(fit)
summary(decideTests(fit))
##        (Intercept)    SRS
## Down        249924      0
## NotSig       43750 802647
## Up          508973      0
top <- topTable(fit,coef="SRS",num=Inf, sort.by = "P")
head(top)
##                  logFC    AveExpr         t      P.Value adj.P.Val        B
## cg22711187 -0.02399382 -1.4864149 -7.327390 4.935339e-07 0.2728643 5.207787
## cg07189381 -0.01519054 -1.2751111 -7.158354 6.926881e-07 0.2728643 4.858097
## cg19504245 -0.01794803 -0.7009587 -6.967758 1.019867e-06 0.2728643 4.459340
## cg18649823 -0.01787430 -1.9170803 -6.783711 1.488640e-06 0.2987132 4.069840
## cg25910261 -0.01775813 -2.4706708 -6.456290 2.950027e-06 0.3074610 3.366359
## cg22970003 -0.01392398 -2.4477684 -6.445506 3.017959e-06 0.3074610 3.342964
myprobes3 <- rownames(top)[1:20]

myprobedat3 <- bl_mvals[which(rownames(bl_mvals) %in% myprobes3),]
plot(cmdscale(dist(t(myprobedat3))), xlab="Coordinate 1", ylab="Coordinate 2",
  type = "n",main="Blood")
text(cmdscale(dist(t(myprobedat3))), labels=bl_design[,ncol(bl_design)], )

gu_design2 <- gu_design[,c(1,12)]
fit <- lmFit(gu_mvals, gu_design2)
fit <- eBayes(fit)
summary(decideTests(fit))
##        (Intercept)    SRS
## Down        249193      0
## NotSig       63837 790658
## Up          477628      0
top <- topTable(fit,coef="SRS",num=Inf, sort.by = "P")
head(top)
##                  logFC     AveExpr         t      P.Value adj.P.Val        B
## cg05234269  0.07333686  0.05758872  6.854547 4.797155e-07 0.1645070 5.151495
## cg07008989  0.01437187  1.59247177  6.839105 4.973160e-07 0.1645070 5.114701
## cg06856169 -0.06081367 -2.94113545 -6.742016 6.241903e-07 0.1645070 4.882733
## cg14421879  0.03266203  1.56334573  6.546365 9.901721e-07 0.1957219 4.412049
## cg13945667  0.02422840  1.89268454  6.427298 1.314134e-06 0.2078062 4.123568
## cg01543583  0.07044844  0.03847869  6.158183 2.506673e-06 0.3303201 3.466196
myprobes3 <- rownames(top)[1:20]

myprobedat3 <- gu_mvals[which(rownames(gu_mvals) %in% myprobes3),]
plot(cmdscale(dist(t(myprobedat3))), xlab="Coordinate 1", ylab="Coordinate 2", 
  type = "n",main="Guthrie")
text(cmdscale(dist(t(myprobedat3))), labels=gu_design[,ncol(gu_design)], )

Again it was unsuccessful.

Try ebGSEA

ebGSEA uses globaltest to identify DMGs from infinium data. Some points to note:

  • only 1 sided test - does not distinguish between up and down methylated genes.

  • cannot use complex experiment designs.

Therefore I modify how the ebGSEA code uses globaltest.

item mapEIDto850k.lv : A list mapping Entrez Gene ID to 850k probes

#bl_gt <- doGT(bl_design,bl_mvals,array="850k",ncores=16)
data.m <- bl_mvals

subsets <- mclapply(mapEIDto850k.lv,intersect,rownames(data.m),mc.cores = 16)
nrep.v <- unlist(lapply(subsets,length))
selG.idx <- which(nrep.v>0)

gt.o <- gt(response=bl_design[,ncol(bl_design)], alternative=t(data.m), model="linear",
  directional = TRUE, standardize = FALSE, permutations = 0,
  subsets=subsets[selG.idx],trace=F)

gt_res <- result(gt.o)
gt_res <- gt_res[order(gt_res$`p-value`),]

Gene set level enrichment - SRS

genesets <- gmt_import("ReactomePathways.gmt")
cores = 8

Aggregate mval to genes and examine pathway enrichment in blood and guthries.

tic()
bl_gsmea <- gsmea(mval=bl_mvals,design=bl_design,probesets=sets,genesets=genesets,cores=8)
toc()
## 195.916 sec elapsed
bl_gsmea$res <- subset(bl_gsmea$res,nprobes>=5)
bl_gsmea$res$FDR <- p.adjust(bl_gsmea$res$PValue,method="fdr")
head(bl_gsmea$res,20)  %>%  kbl(caption="Top DM pathways in blood identified with GSMEA") %>%
  kable_paper("hover", full_width = F)
Top DM pathways in blood identified with GSMEA
nprobes median PValue FDR
SUMO is proteolytically processed 6 -2.3995010 0.0000423 0.0817359
Toxicity of botulinum toxin type D (botD) 5 -1.7695257 0.0003440 0.1289158
Toxicity of botulinum toxin type F (botF) 5 -1.7695257 0.0003440 0.1289158
Processing and activation of SUMO 10 -1.3751097 0.0007265 0.1289158
Erythropoietin activates RAS 14 -0.7728304 0.0011631 0.1289158
Protein repair 6 -1.7673190 0.0013757 0.1289158
Mismatch Repair 15 -1.2800211 0.0014318 0.1289158
The NLRP3 inflammasome 16 -0.6760697 0.0017118 0.1289158
Sealing of the nuclear envelope (NE) by ESCRT-III 31 -0.7628404 0.0018110 0.1289158
Transcription-Coupled Nucleotide Excision Repair (TC-NER) 76 -0.6721243 0.0018726 0.1289158
Mismatch repair (MMR) directed by MSH2:MSH6 (MutSalpha) 14 -1.1036871 0.0019551 0.1289158
IRF3-mediated induction of type I IFN 11 -1.2977315 0.0019618 0.1289158
TP53 Regulates Transcription of Genes Involved in G2 Cell Cycle Arrest 18 -1.3477562 0.0019681 0.1289158
Role of ABL in ROBO-SLIT signaling 8 -2.7190463 0.0021156 0.1289158
Mismatch repair (MMR) directed by MSH2:MSH3 (MutSbeta) 14 -1.3146374 0.0023310 0.1289158
Cooperation of Prefoldin and TriC/CCT in actin and tubulin folding 32 -0.4549011 0.0023927 0.1289158
Formation of tubulin folding intermediates by CCT/TriC 26 -0.3962496 0.0025369 0.1289158
DNA Replication 125 -0.5454303 0.0026080 0.1289158
DNA Replication Pre-Initiation 97 -0.5259152 0.0026559 0.1289158
G2/M Checkpoints 129 -0.6460484 0.0026652 0.1289158
tic()
gu_gsmea <- gsmea(mval=gu_mvals,design=gu_design,probesets=sets,genesets=genesets,cores=8)
toc()
## 194.224 sec elapsed
gu_gsmea$res <- subset(gu_gsmea$res,nprobes>=5)
gu_gsmea$res$FDR <- p.adjust(gu_gsmea$res$PValue,method="fdr")
head(gu_gsmea$res,20)  %>%  kbl(caption="Top DM pathways in Guthrie card identified with GSMEA") %>%
  kable_paper("hover", full_width = F)
Top DM pathways in Guthrie card identified with GSMEA
nprobes median PValue FDR
Sensory perception of salty taste 6 -1.8108499 0.0011797 0.3630927
Diseases associated with surfactant metabolism 9 -1.8530526 0.0023157 0.3630927
Downregulation of ERBB4 signaling 9 -1.5145687 0.0025100 0.3630927
Sperm Motility And Taxes 9 -1.3991157 0.0057239 0.3630927
Methionine salvage pathway 6 -1.1882966 0.0060100 0.3630927
Signaling by FLT3 ITD and TKD mutants 16 -1.1129325 0.0063278 0.3630927
FLT3 signaling in disease 28 -1.0330410 0.0063792 0.3630927
InlA-mediated entry of Listeria monocytogenes into host cells 9 -1.0855889 0.0064292 0.3630927
HDL remodeling 10 -1.6388577 0.0074630 0.3630927
Deactivation of the beta-catenin transactivating complex 39 -0.8674579 0.0080621 0.3630927
Regulation of NF-kappa B signaling 17 -0.2902493 0.0099253 0.3630927
GLI proteins bind promoters of Hh responsive genes to promote transcription 7 -1.7271067 0.0101291 0.3630927
GRB2 events in ERBB2 signaling 16 -1.8437571 0.0101398 0.3630927
RAS signaling downstream of NF1 loss-of-function variants 7 -1.1205635 0.0102577 0.3630927
SHC1 events in ERBB4 signaling 14 -1.3062740 0.0103326 0.3630927
SOS-mediated signalling 7 -1.1205635 0.0105984 0.3630927
Signaling by FLT3 fusion proteins 19 -1.1205635 0.0113510 0.3630927
FBXW7 Mutants and NOTCH1 in Cancer 5 -0.9224496 0.0116338 0.3630927
Loss of Function of FBXW7 in Cancer and NOTCH1 Signaling 5 -0.9224496 0.0116338 0.3630927
Processing and activation of SUMO 10 -0.8989401 0.0122744 0.3630927

Session information

For reproducibility

sessionInfo()
## R version 4.2.2 Patched (2022-11-10 r83330)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.2 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
##  [1] grid      stats4    parallel  stats     graphics  grDevices utils    
##  [8] datasets  methods   base     
## 
## other attached packages:
##  [1] vioplot_0.4.0                                      
##  [2] zoo_1.8-11                                         
##  [3] sm_2.2-5.7.1                                       
##  [4] pkgload_1.3.2                                      
##  [5] GGally_2.1.2                                       
##  [6] ggplot2_3.4.1                                      
##  [7] gtools_3.9.4                                       
##  [8] tibble_3.2.0                                       
##  [9] dplyr_1.1.0                                        
## [10] echarts4r_0.4.4                                    
## [11] ebGSEA_0.1.0                                       
## [12] globaltest_5.52.0                                  
## [13] survival_3.5-5                                     
## [14] tictoc_1.1                                         
## [15] RIdeogram_0.2.2                                    
## [16] kableExtra_1.3.4                                   
## [17] data.table_1.14.8                                  
## [18] ENmix_1.34.0                                       
## [19] doParallel_1.0.17                                  
## [20] qqman_0.1.8                                        
## [21] RCircos_1.2.2                                      
## [22] beeswarm_0.4.0                                     
## [23] forestplot_3.1.1                                   
## [24] abind_1.4-5                                        
## [25] checkmate_2.1.0                                    
## [26] reshape2_1.4.4                                     
## [27] gplots_3.1.3                                       
## [28] eulerr_7.0.0                                       
## [29] GEOquery_2.66.0                                    
## [30] RColorBrewer_1.1-3                                 
## [31] IlluminaHumanMethylation450kmanifest_0.4.0         
## [32] topconfects_1.14.0                                 
## [33] DMRcatedata_2.16.0                                 
## [34] ExperimentHub_2.6.0                                
## [35] AnnotationHub_3.6.0                                
## [36] BiocFileCache_2.6.0                                
## [37] dbplyr_2.3.1                                       
## [38] DMRcate_2.12.0                                     
## [39] limma_3.54.0                                       
## [40] missMethyl_1.32.0                                  
## [41] IlluminaHumanMethylation450kanno.ilmn12.hg19_0.6.1 
## [42] R.utils_2.12.2                                     
## [43] R.oo_1.25.0                                        
## [44] R.methodsS3_1.8.2                                  
## [45] plyr_1.8.8                                         
## [46] IlluminaHumanMethylationEPICanno.ilm10b4.hg19_0.6.0
## [47] minfi_1.44.0                                       
## [48] bumphunter_1.40.0                                  
## [49] locfit_1.5-9.7                                     
## [50] iterators_1.0.14                                   
## [51] foreach_1.5.2                                      
## [52] Biostrings_2.66.0                                  
## [53] XVector_0.38.0                                     
## [54] SummarizedExperiment_1.28.0                        
## [55] Biobase_2.58.0                                     
## [56] MatrixGenerics_1.10.0                              
## [57] matrixStats_0.63.0                                 
## [58] GenomicRanges_1.50.2                               
## [59] GenomeInfoDb_1.34.6                                
## [60] IRanges_2.32.0                                     
## [61] S4Vectors_0.36.1                                   
## [62] BiocGenerics_0.44.0                                
## [63] mitch_1.10.0                                       
## 
## loaded via a namespace (and not attached):
##   [1] rappdirs_0.3.3                rtracklayer_1.58.0           
##   [3] tidyr_1.3.0                   bit64_4.0.5                  
##   [5] knitr_1.42                    DelayedArray_0.24.0          
##   [7] rpart_4.1.19                  KEGGREST_1.38.0              
##   [9] RCurl_1.98-1.10               AnnotationFilter_1.22.0      
##  [11] generics_0.1.3                GenomicFeatures_1.50.3       
##  [13] preprocessCore_1.60.2         RSQLite_2.3.0                
##  [15] bit_4.0.5                     tzdb_0.3.0                   
##  [17] webshot_0.5.4                 xml2_1.3.3                   
##  [19] httpuv_1.6.9                  assertthat_0.2.1             
##  [21] xfun_0.37                     hms_1.1.2                    
##  [23] jquerylib_0.1.4               evaluate_0.20                
##  [25] promises_1.2.0.1              fansi_1.0.4                  
##  [27] restfulr_0.0.15               scrime_1.3.5                 
##  [29] progress_1.2.2                caTools_1.18.2               
##  [31] readxl_1.4.2                  DBI_1.1.3                    
##  [33] geneplotter_1.76.0            htmlwidgets_1.6.2            
##  [35] reshape_0.8.9                 purrr_1.0.1                  
##  [37] ellipsis_0.3.2                backports_1.4.1              
##  [39] permute_0.9-7                 calibrate_1.7.7              
##  [41] grImport2_0.2-0               annotate_1.76.0              
##  [43] biomaRt_2.54.0                deldir_1.0-6                 
##  [45] sparseMatrixStats_1.10.0      vctrs_0.6.0                  
##  [47] ensembldb_2.22.0              withr_2.5.0                  
##  [49] cachem_1.0.7                  Gviz_1.42.0                  
##  [51] BSgenome_1.66.2               GenomicAlignments_1.34.0     
##  [53] prettyunits_1.1.1             mclust_6.0.0                 
##  [55] svglite_2.1.1                 cluster_2.1.4                
##  [57] RPMM_1.25                     lazyeval_0.2.2               
##  [59] crayon_1.5.2                  genefilter_1.80.3            
##  [61] labeling_0.4.2                edgeR_3.40.2                 
##  [63] pkgconfig_2.0.3               nlme_3.1-162                 
##  [65] ProtGenerics_1.30.0           nnet_7.3-18                  
##  [67] rlang_1.1.0                   lifecycle_1.0.3              
##  [69] filelock_1.0.2                dichromat_2.0-0.1            
##  [71] rsvg_2.4.0                    cellranger_1.1.0             
##  [73] rngtools_1.5.2                base64_2.0.1                 
##  [75] Matrix_1.5-3                  Rhdf5lib_1.20.0              
##  [77] base64enc_0.1-3               viridisLite_0.4.1            
##  [79] png_0.1-8                     rjson_0.2.21                 
##  [81] bitops_1.0-7                  KernSmooth_2.23-20           
##  [83] rhdf5filters_1.10.0           blob_1.2.4                   
##  [85] DelayedMatrixStats_1.20.0     doRNG_1.8.6                  
##  [87] stringr_1.5.0                 nor1mix_1.3-0                
##  [89] readr_2.1.4                   jpeg_0.1-10                  
##  [91] scales_1.2.1                  memoise_2.0.1                
##  [93] magrittr_2.0.3                zlibbioc_1.44.0              
##  [95] compiler_4.2.2                BiocIO_1.8.0                 
##  [97] illuminaio_0.40.0             Rsamtools_2.14.0             
##  [99] cli_3.6.0                     DSS_2.46.0                   
## [101] htmlTable_2.4.1               Formula_1.2-5                
## [103] MASS_7.3-58.3                 tidyselect_1.2.0             
## [105] stringi_1.7.12                highr_0.10                   
## [107] yaml_2.3.7                    askpass_1.1                  
## [109] latticeExtra_0.6-30           sass_0.4.5                   
## [111] VariantAnnotation_1.44.0      tools_4.2.2                  
## [113] rstudioapi_0.14               foreign_0.8-84               
## [115] bsseq_1.34.0                  gridExtra_2.3                
## [117] farver_2.1.1                  digest_0.6.31                
## [119] BiocManager_1.30.20           shiny_1.7.4                  
## [121] quadprog_1.5-8                Rcpp_1.0.10                  
## [123] siggenes_1.72.0               BiocVersion_3.16.0           
## [125] later_1.3.0                   org.Hs.eg.db_3.16.0          
## [127] httr_1.4.5                    AnnotationDbi_1.60.0         
## [129] biovizBase_1.46.0             colorspace_2.1-0             
## [131] rvest_1.0.3                   XML_3.99-0.13                
## [133] splines_4.2.2                 statmod_1.5.0                
## [135] kpmt_0.1.0                    multtest_2.54.0              
## [137] systemfonts_1.0.4             xtable_1.8-4                 
## [139] jsonlite_1.8.4                dynamicTreeCut_1.63-1        
## [141] R6_2.5.1                      Hmisc_5.0-1                  
## [143] pillar_1.8.1                  htmltools_0.5.4              
## [145] mime_0.12                     glue_1.6.2                   
## [147] fastmap_1.1.1                 BiocParallel_1.32.5          
## [149] interactiveDisplayBase_1.36.0 beanplot_1.3.1               
## [151] codetools_0.2-19              utf8_1.2.3                   
## [153] lattice_0.20-45               bslib_0.4.2                  
## [155] curl_5.0.0                    openssl_2.0.6                
## [157] interp_1.1-3                  rmarkdown_2.20               
## [159] munsell_0.5.0                 rhdf5_2.42.0                 
## [161] GenomeInfoDbData_1.2.9        HDF5Array_1.26.0             
## [163] impute_1.72.3                 gtable_0.3.2