Source: https://github.com/markziemann/combined_enrichment
Here we are performing an analysis of some gene expression data to demonstrate the difference between ORA and FCS methods and to highlight the differences caused by improper background gene set use.
The dataset being used is SRP247621 and we are comparing the fibroblast cells from LHON patients (case) versus healthy controls.
Data are obtained from http://dee2.io/
suppressPackageStartupMessages({
library("getDEE2")
library("DESeq2")
library("clusterProfiler")
library("mitch")
library("kableExtra")
library("eulerr")
})
I’m using some RNA-seq data looking at the difference in fibroblast gene expression between control and LHON patients.
name="SRP247621"
mdat<-getDEE2Metadata("hsapiens")
samplesheet <- mdat[grep("SRP247621",mdat$SRP_accession),]
samplesheet<-samplesheet[order(samplesheet$SRR_accession),]
samplesheet$trt<-as.factor(c(1,1,1,2,2,2,0,0,0)) # exclude carriers for simplicity
samplesheet <- samplesheet[which(samplesheet$trt!=2),]
s1 <- samplesheet
s1 %>% kbl(caption = "sample sheet") %>% kable_paper("hover", full_width = F)
SRR_accession | QC_summary | SRX_accession | SRS_accession | SRP_accession | Sample_name | GEO_series | Library_name | trt | |
---|---|---|---|---|---|---|---|---|---|
204421 | SRR11040359 | PASS | SRX7692166 | SRS6118270 | SRP247621 | GSM4300731 | GSE144914 | 1 | |
204422 | SRR11040360 | WARN(3,4,6) | SRX7692167 | SRS6118272 | SRP247621 | GSM4300732 | GSE144914 | 1 | |
204423 | SRR11040361 | PASS | SRX7692168 | SRS6118271 | SRP247621 | GSM4300733 | GSE144914 | 1 | |
204427 | SRR11040365 | PASS | SRX7692172 | SRS6118276 | SRP247621 | GSM4300737 | GSE144914 | 0 | |
204428 | SRR11040366 | PASS | SRX7692173 | SRS6118277 | SRP247621 | GSM4300738 | GSE144914 | 0 | |
204429 | SRR11040367 | PASS | SRX7692174 | SRS6118278 | SRP247621 | GSM4300739 | GSE144914 | 0 |
w<-getDEE2("hsapiens",samplesheet$SRR_accession,metadata=mdat,legacy = TRUE)
## For more information about DEE2 QC metrics, visit
## https://github.com/markziemann/dee2/blob/master/qc/qc_metrics.md
x<-Tx2Gene(w)
x<-x$Tx2Gene
# save the genetable for later
gt<-w$GeneInfo[,1,drop=FALSE]
gt$accession<-rownames(gt)
# counts
x1<-x[,which(colnames(x) %in% samplesheet$SRR_accession)]
Here show the number of genes in the annotation set, and those detected above the detection threshold.
# filter out lowly expressed genes
x1<-x1[which(rowSums(x1)/ncol(x1)>=(10)),]
nrow(x)
## [1] 39297
nrow(x1)
## [1] 14288
Now multidimensional scaling (MDS) plot to show the correlation between the datasets. If the control and case datasets are clustered separately, then it is likely that there will be many differentially expressed genes with FDR<0.05.
plot(cmdscale(dist(t(x1))), xlab="Coordinate 1", ylab="Coordinate 2", pch=19, col=s1$trt, main="MDS")
Now run DESeq2 for control vs case.
y <- DESeqDataSetFromMatrix(countData = round(x1), colData = s1, design = ~ trt)
## converting counts to integer mode
## factor levels were dropped which had no samples
y <- DESeq(y)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
de <- results(y)
de<-as.data.frame(de[order(de$pvalue),])
rownames(de)<-sapply(strsplit(rownames(de),"\\."),"[[",1)
head(de) %>% kbl() %>% kable_paper("hover", full_width = F)
baseMean | log2FoldChange | lfcSE | stat | pvalue | padj | |
---|---|---|---|---|---|---|
ENSG00000233948 | 156.47249 | -2.955957 | 0.3146608 | -9.394106 | 0 | 0e+00 |
ENSG00000103326 | 241.05789 | 1.543670 | 0.2180326 | 7.079999 | 0 | 0e+00 |
ENSG00000268861 | 72.30512 | -6.854266 | 1.0078846 | -6.800645 | 0 | 0e+00 |
ENSG00000175221 | 419.46036 | 1.246384 | 0.1836166 | 6.787967 | 0 | 0e+00 |
ENSG00000273542 | 17.71562 | 21.242888 | 3.1581883 | 6.726289 | 0 | 0e+00 |
ENSG00000168140 | 1121.18373 | 1.113196 | 0.1771604 | 6.283553 | 0 | 7e-07 |
Now let’s have a look at some of the charts showing differential expression. In particular, an MA plot and volcano plot.
maplot <- function(de,contrast_name) {
sig <-subset(de, padj < 0.05 )
up <-rownames(subset(de, padj < 0.05 & log2FoldChange > 0))
dn <-rownames(subset(de, padj < 0.05 & log2FoldChange < 0))
GENESUP <- length(up)
GENESDN <- length(dn)
DET=nrow(de)
SUBHEADER = paste(GENESUP, "up, ", GENESDN, "down", DET, "detected")
ns <-subset(de, padj > 0.05 )
plot(log2(de$baseMean),de$log2FoldChange,
xlab="log2 basemean", ylab="log2 foldchange",
pch=19, cex=0.5, col="dark gray",
main=contrast_name, cex.main=0.7)
points(log2(sig$baseMean),sig$log2FoldChange,
pch=19, cex=0.5, col="red")
mtext(SUBHEADER,cex = 0.7)
}
make_volcano <- function(de,name) {
sig <- subset(de,padj<0.05)
N_SIG=nrow(sig)
N_UP=nrow(subset(sig,log2FoldChange>0))
N_DN=nrow(subset(sig,log2FoldChange<0))
DET=nrow(de)
HEADER=paste(N_SIG,"@5%FDR,", N_UP, "up", N_DN, "dn", DET, "detected")
plot(de$log2FoldChange,-log10(de$padj),cex=0.5,pch=19,col="darkgray",
main=name, xlab="log2 FC", ylab="-log10 pval", xlim=c(-6,6))
mtext(HEADER)
grid()
points(sig$log2FoldChange,-log10(sig$padj),cex=0.5,pch=19,col="red")
}
maplot(de,name)
make_volcano(de,name)
In order to perform gene set analysis, we need some gene sets.
if (! file.exists("ReactomePathways.gmt")) {
download.file("https://reactome.org/download/current/ReactomePathways.gmt.zip",
destfile="ReactomePathways.gmt.zip")
unzip("ReactomePathways.gmt.zip")
}
genesets<-gmt_import("ReactomePathways.gmt")
Mitch uses rank-ANOVA statistics for enrichment detection.
Here I’m using the standard approach
m <- mitch_import(de,DEtype = "DEseq2", geneTable = gt)
## The input is a single dataframe; one contrast only. Converting
## it to a list for you.
## Note: Mean no. genes in input = 14288
## Note: no. genes in output = 13355
## Note: estimated proportion of input genes in output = 0.935
msep <- mitch_calc(m,genesets = genesets)
## Note: When prioritising by significance (ie: small
## p-values), large effect sizes might be missed.
ms_up <- subset(msep$enrichment_result,p.adjustANOVA<0.05 & s.dist > 0)[,1]
ms_dn <- subset(msep$enrichment_result,p.adjustANOVA<0.05 & s.dist < 0)[,1]
message(paste("Number of up-regulated pathways:",length(ms_up) ))
## Number of up-regulated pathways: 69
message(paste("Number of down-regulated pathways:",length(ms_dn) ))
## Number of down-regulated pathways: 156
head(msep$enrichment_result,10) #%>% kbl() %>% kable_paper("hover", full_width = F)
## set setSize pANOVA
## 77 Asparagine N-linked glycosylation 251 5.574217e-11
## 242 Defective CFTR causes cystic fibrosis 57 1.306378e-09
## 315 ER to Golgi Anterograde Transport 124 3.370379e-09
## 1305 Vif-mediated degradation of APOBEC3G 52 5.178549e-09
## 2 ABC transporter disorders 65 5.233657e-09
## 696 Negative regulation of NOTCH4 signaling 53 7.892847e-09
## 932 Regulation of Apoptosis 51 1.019099e-08
## 1312 Vpu mediated degradation of CD4 49 1.379079e-08
## 478 Hh mutants abrogate ligand secretion 54 1.866448e-08
## 1293 Ubiquitin-dependent degradation of Cyclin D 50 2.355232e-08
## s.dist p.adjustANOVA
## 77 -0.2409732 7.536341e-08
## 242 -0.4646418 8.831112e-07
## 315 -0.3078072 1.415181e-06
## 1305 -0.4683385 1.415181e-06
## 2 -0.4189732 1.415181e-06
## 696 -0.4583167 1.778522e-06
## 932 -0.4636878 1.968316e-06
## 1312 -0.4687681 2.330644e-06
## 478 -0.4425343 2.803820e-06
## 1293 -0.4565351 3.184274e-06
#mitch_report(msep,outfile="mitch_separate.html",overwrite=TRUE)
Here I’m using the combined approach.
mcom <- mitch_calc(abs(m),genesets = genesets)
## Note: When prioritising by significance (ie: small
## p-values), large effect sizes might be missed.
mc_up <- subset(mcom$enrichment_result,p.adjustANOVA<0.05 & s.dist > 0)[,1]
mc_dn <- subset(mcom$enrichment_result,p.adjustANOVA<0.05 & s.dist < 0)[,1]
message(paste("Number of up-regulated pathways:",length(mc_up) ))
## Number of up-regulated pathways: 34
message(paste("Number of down-regulated pathways:",length(mc_dn) ))
## Number of down-regulated pathways: 1
head(mcom$enrichment_result,10) #%>% kbl() %>% kable_paper("hover", full_width = F)
## set
## 990 Respiratory electron transport, ATP synthesis by chemiosmotic coupling, and heat production by uncoupling proteins.
## 989 Respiratory electron transport
## 831 Protein-protein interactions at synapses
## 1350 tRNA processing in the mitochondrion
## 1185 TCF dependent signaling in response to WNT
## 190 Complex I biogenesis
## 1224 The citric acid (TCA) cycle and respiratory electron transport
## 1235 Toll Like Receptor 4 (TLR4) Cascade
## 108 C-type lectin receptors (CLRs)
## 1138 Signaling by WNT
## setSize pANOVA s.dist p.adjustANOVA
## 990 107 4.058993e-06 0.2581705 0.005487758
## 989 103 1.551183e-05 0.2467200 0.010485995
## 831 54 2.557534e-05 0.3313340 0.011525951
## 1350 17 8.624254e-05 0.5500428 0.029149977
## 1185 151 1.781581e-04 0.1770766 0.040270892
## 190 57 2.472321e-04 0.2808337 0.040270892
## 1224 153 2.756356e-04 0.1707238 0.040270892
## 1235 108 3.764216e-04 0.1983356 0.040270892
## 108 109 3.909377e-04 0.1968785 0.040270892
## 1138 225 3.916582e-04 0.1376167 0.040270892
#mitch_report(mcom,outfile="mitch_combined.html",overwrite=TRUE)
Let’s look at the significant ones based on the combined analysis. There weren’t many gene sets classe d as significant. Let’s see how many have a direction agnostic enrichment score which is larger in magnitude than the direction informed enrichment score. There are only 11 such sets which would benefit from such combined analysis.
Euler diagram of the significant pathways found with each approach.
l0 <- list("sep up"=ms_up,"sep dn"=ms_dn,"comb up"=mc_up,"comb dn"=mc_dn)
par(cex.main=0.5)
plot(euler(l0),quantities = TRUE, edges = "gray", main="FCS: combined vs separated")
length(ms_up)
## [1] 69
length(ms_dn)
## [1] 156
length(ms_up)+length(ms_dn)
## [1] 225
length(mc_up)
## [1] 34
length(mc_dn)
## [1] 1
length(mc_up)+length(mc_dn)
## [1] 35
( length(ms_up)+length(ms_dn) ) / ( length(mc_up)+length(mc_dn) )
## [1] 6.428571
List gene sets which are specific to each approach.
ms <- c(ms_up,ms_dn)
# in sep but not comb
setdiff(ms,mc_up)
## [1] "Signal Transduction"
## [2] "Developmental Biology"
## [3] "Extracellular matrix organization"
## [4] "Heparan sulfate/heparin (HS-GAG) metabolism"
## [5] "A tetrasaccharide linker sequence is required for GAG synthesis"
## [6] "Chondroitin sulfate/dermatan sulfate metabolism"
## [7] "RORA activates gene expression"
## [8] "Mitotic Prometaphase"
## [9] "Assembly of collagen fibrils and other multimeric structures"
## [10] "CDC42 GTPase cycle"
## [11] "Defective B3GALT6 causes EDSP2 and SEMDJL1"
## [12] "Chromatin modifying enzymes"
## [13] "Chromatin organization"
## [14] "Defective B4GALT7 causes EDS, progeroid type"
## [15] "PKMTs methylate histone lysines"
## [16] "SEMA3A-Plexin repulsion signaling by inhibiting Integrin adhesion"
## [17] "Defective B3GAT3 causes JDSSDHD"
## [18] "Plasma lipoprotein remodeling"
## [19] "Pre-NOTCH Transcription and Translation"
## [20] "Centrosome maturation"
## [21] "Recruitment of mitotic centrosome proteins and complexes"
## [22] "HS-GAG degradation"
## [23] "RUNX1 regulates genes involved in megakaryocyte differentiation and platelet function"
## [24] "Collagen biosynthesis and modifying enzymes"
## [25] "Collagen formation"
## [26] "Signaling by Rho GTPases"
## [27] "Collagen chain trimerization"
## [28] "NCAM1 interactions"
## [29] "Recruitment of NuMA to mitotic centrosomes"
## [30] "Kinesins"
## [31] "Heme signaling"
## [32] "FOXO-mediated transcription of cell death genes"
## [33] "RHOA GTPase cycle"
## [34] "RHOB GTPase cycle"
## [35] "Signaling by Rho GTPases, Miro GTPases and RHOBTB3"
## [36] "Nervous system development"
## [37] "Transcriptional regulation of white adipocyte differentiation"
## [38] "NCAM signaling for neurite out-growth"
## [39] "Activation of SMO"
## [40] "ECM proteoglycans"
## [41] "Axon guidance"
## [42] "Unwinding of DNA"
## [43] "Resolution of D-loop Structures through Holliday Junction Intermediates"
## [44] "Regulation of TP53 Activity through Acetylation"
## [45] "Resolution of D-Loop Structures"
## [46] "Neurexins and neuroligins"
## [47] "Ephrin signaling"
## [48] "Plasma lipoprotein assembly, remodeling, and clearance"
## [49] "Semaphorin interactions"
## [50] "Glycosaminoglycan metabolism"
## [51] "RHO GTPases Activate Formins"
## [52] "SUMO E3 ligases SUMOylate target proteins"
## [53] "RAC1 GTPase cycle"
## [54] "RHO GTPase cycle"
## [55] "AURKA Activation by TPX2"
## [56] "Signaling by PDGF"
## [57] "Signaling by Receptor Tyrosine Kinases"
## [58] "Activation of HOX genes during differentiation"
## [59] "Activation of anterior HOX genes in hindbrain development during early embryogenesis"
## [60] "RHOC GTPase cycle"
## [61] "Pre-NOTCH Expression and Processing"
## [62] "Neuronal System"
## [63] "Sema3A PAK dependent Axon repulsion"
## [64] "Immunoregulatory interactions between a Lymphoid and a non-Lymphoid cell"
## [65] "RHO GTPase Effectors"
## [66] "Nuclear Receptor transcription pathway"
## [67] "Formation of the beta-catenin:TCF transactivating complex"
## [68] "Asparagine N-linked glycosylation"
## [69] "Defective CFTR causes cystic fibrosis"
## [70] "Vif-mediated degradation of APOBEC3G"
## [71] "ABC transporter disorders"
## [72] "Negative regulation of NOTCH4 signaling"
## [73] "Regulation of Apoptosis"
## [74] "Vpu mediated degradation of CD4"
## [75] "Hh mutants abrogate ligand secretion"
## [76] "Ubiquitin-dependent degradation of Cyclin D"
## [77] "Hh mutants are degraded by ERAD"
## [78] "Autodegradation of the E3 ubiquitin ligase COP1"
## [79] "SCF(Skp2)-mediated degradation of p27/p21"
## [80] "Stabilization of p53"
## [81] "Activation of NF-kappaB in B cells"
## [82] "Regulation of RUNX3 expression and activity"
## [83] "Autodegradation of Cdh1 by Cdh1:APC/C"
## [84] "Hedgehog ligand biogenesis"
## [85] "ER-Phagosome pathway"
## [86] "FCERI mediated NF-kB activation"
## [87] "Ubiquitin Mediated Degradation of Phosphorylated Cdc25A"
## [88] "p53-Independent DNA Damage Response"
## [89] "p53-Independent G1/S DNA damage checkpoint"
## [90] "Degradation of GLI1 by the proteasome"
## [91] "NIK-->noncanonical NF-kB signaling"
## [92] "Regulation of activated PAK-2p34 by proteasome mediated degradation"
## [93] "APC/C:Cdc20 mediated degradation of Securin"
## [94] "Degradation of DVL"
## [95] "FBXL7 down-regulates AURKA during mitotic entry and in early mitosis"
## [96] "SCF-beta-TrCP mediated degradation of Emi1"
## [97] "Regulation of ornithine decarboxylase (ODC)"
## [98] "Degradation of AXIN"
## [99] "Metabolism of proteins"
## [100] "Downstream signaling events of B Cell Receptor (BCR)"
## [101] "Degradation of GLI2 by the proteasome"
## [102] "Antigen processing-Cross presentation"
## [103] "Downstream TCR signaling"
## [104] "GLI3 is processed to GLI3R by the proteasome"
## [105] "Transport to the Golgi and subsequent modification"
## [106] "Dectin-1 mediated noncanonical NF-kB signaling"
## [107] "ROS sensing by NFE2L2"
## [108] "Metabolism of polyamines"
## [109] "p53-Dependent G1 DNA Damage Response"
## [110] "p53-Dependent G1/S DNA damage checkpoint"
## [111] "Cdc20:Phospho-APC/C mediated degradation of Cyclin A"
## [112] "CDT1 association with the CDC6:ORC:origin complex"
## [113] "Oxygen-dependent proline hydroxylation of Hypoxia-inducible Factor Alpha"
## [114] "CDK-mediated phosphorylation and removal of Cdc6"
## [115] "Class I MHC mediated antigen processing & presentation"
## [116] "Metabolism"
## [117] "APC:Cdc20 mediated degradation of cell cycle proteins prior to satisfation of the cell cycle checkpoint"
## [118] "RUNX1 regulates transcription of genes involved in differentiation of HSCs"
## [119] "G1/S DNA Damage Checkpoints"
## [120] "Cross-presentation of soluble exogenous antigens (endosomes)"
## [121] "Regulation of RUNX2 expression and activity"
## [122] "Regulation of HMOX1 expression and activity"
## [123] "APC/C:Cdc20 mediated degradation of mitotic proteins"
## [124] "ABC-family proteins mediated transport"
## [125] "The role of GTSE1 in G2/M progression after G2 checkpoint"
## [126] "APC/C:Cdh1 mediated degradation of Cdc20 and other APC/C:Cdh1 targeted proteins in late mitosis/early G1"
## [127] "Regulation of APC/C activators between G1/S and early anaphase"
## [128] "AUF1 (hnRNP D0) binds and destabilizes mRNA"
## [129] "Neddylation"
## [130] "Cyclin E associated events during G1/S transition"
## [131] "Activation of APC/C and APC/C:Cdc20 mediated degradation of mitotic proteins"
## [132] "Antigen processing: Ubiquitination & Proteasome degradation"
## [133] "Disorders of transmembrane transporters"
## [134] "Peroxisomal protein import"
## [135] "Apoptosis"
## [136] "Regulation of RAS by GAPs"
## [137] "TCR signaling"
## [138] "Regulation of PTEN stability and activity"
## [139] "CLEC7A (Dectin-1) signaling"
## [140] "COPII-mediated vesicle transport"
## [141] "Interleukin-1 signaling"
## [142] "Protein localization"
## [143] "Signaling by NOTCH4"
## [144] "APC/C-mediated degradation of cell cycle proteins"
## [145] "Regulation of mitotic cell cycle"
## [146] "TNFR2 non-canonical NF-kB pathway"
## [147] "Programmed Cell Death"
## [148] "Host Interactions of HIV factors"
## [149] "Asymmetric localization of PCP proteins"
## [150] "Cellular response to hypoxia"
## [151] "Cyclin A:Cdk2-associated events at S phase entry"
## [152] "Signaling by the B Cell Receptor (BCR)"
## [153] "HIV Infection"
## [154] "Orc1 removal from chromatin"
## [155] "Cargo concentration in the ER"
## [156] "Fc epsilon receptor (FCERI) signaling"
## [157] "Switching of origins to a post-replicative state"
## [158] "Formation of Incision Complex in GG-NER"
## [159] "Iron uptake and transport"
## [160] "Interleukin-1 family signaling"
## [161] "Translation"
## [162] "Phase II - Conjugation of compounds"
## [163] "PINK1-PRKN Mediated Mitophagy"
## [164] "Assembly of the pre-replicative complex"
## [165] "Macroautophagy"
## [166] "Post-translational protein modification"
## [167] "Ub-specific processing proteases"
## [168] "Regulation of mRNA stability by proteins that bind AU-rich elements"
## [169] "DNA Replication Pre-Initiation"
## [170] "Cytoprotection by HMOX1"
## [171] "Membrane Trafficking"
## [172] "Innate Immune System"
## [173] "Metabolism of non-coding RNA"
## [174] "snRNP Assembly"
## [175] "Immune System"
## [176] "Autophagy"
## [177] "G1/S Transition"
## [178] "Synthesis of DNA"
## [179] "Metabolism of amino acids and derivatives"
## [180] "UCH proteinases"
## [181] "Neutrophil degranulation"
## [182] "Glutamate and glutamine metabolism"
## [183] "ER Quality Control Compartment (ERQC)"
## [184] "Adaptive Immune System"
## [185] "Global Genome Nucleotide Excision Repair (GG-NER)"
## [186] "Cytochrome c-mediated apoptotic response"
## [187] "Selective autophagy"
## [188] "COPI-independent Golgi-to-ER retrograde traffic"
## [189] "MAPK6/MAPK4 signaling"
## [190] "Vesicle-mediated transport"
## [191] "Dual Incision in GG-NER"
## [192] "Hedgehog 'off' state"
## [193] "Defects in vitamin and cofactor metabolism"
## [194] "Transport of small molecules"
## [195] "DNA Replication"
## [196] "Pexophagy"
## [197] "MAP3K8 (TPL2)-dependent MAPK1/3 activation"
## [198] "Mitotic G1 phase and G1/S transition"
## [199] "Formation of apoptosome"
## [200] "Regulation of the apoptosome activity"
## [201] "MyD88-independent TLR4 cascade"
## [202] "TRIF(TICAM1)-mediated TLR4 signaling"
## [203] "Synthesis of PIPs at the early endosome membrane"
## [204] "HSP90 chaperone cycle for steroid hormone receptors (SHR) in the presence of ligand"
## [205] "Hedgehog 'on' state"
## [206] "NRIF signals cell death from the nucleus"
## [207] "PCP/CE pathway"
# in comb but not sep
setdiff(mc_up,ms)
## [1] "TCF dependent signaling in response to WNT"
## [2] "Complex I biogenesis"
## [3] "Toll Like Receptor 4 (TLR4) Cascade"
## [4] "Signaling by WNT"
## [5] "Transcriptional regulation by RUNX1"
## [6] "MyD88 dependent cascade initiated on endosome"
## [7] "TRAF6 mediated induction of NFkB and MAP kinases upon TLR7/8 or 9 activation"
## [8] "Toll Like Receptor 7/8 (TLR7/8) Cascade"
## [9] "Toll Like Receptor 9 (TLR9) Cascade"
## [10] "MyD88:MAL(TIRAP) cascade initiated on plasma membrane"
## [11] "Toll Like Receptor 2 (TLR2) Cascade"
## [12] "Toll Like Receptor TLR1:TLR2 Cascade"
## [13] "Toll Like Receptor TLR6:TLR2 Cascade"
## [14] "Cellular responses to stress"
## [15] "Toll-like Receptor Cascades"
## [16] "Cellular responses to stimuli"
# intersection
intersect(mc_up,ms)
## [1] "Respiratory electron transport, ATP synthesis by chemiosmotic coupling, and heat production by uncoupling proteins."
## [2] "Respiratory electron transport"
## [3] "Protein-protein interactions at synapses"
## [4] "tRNA processing in the mitochondrion"
## [5] "The citric acid (TCA) cycle and respiratory electron transport"
## [6] "C-type lectin receptors (CLRs)"
## [7] "Mitochondrial translation termination"
## [8] "IKK complex recruitment mediated by RIP1"
## [9] "ER to Golgi Anterograde Transport"
## [10] "Mitochondrial translation initiation"
## [11] "Formation of TC-NER Pre-Incision Complex"
## [12] "COPI-mediated anterograde transport"
## [13] "Cellular response to chemical stress"
## [14] "Mitochondrial translation"
## [15] "TICAM1, RIP1-mediated IKK complex recruitment"
## [16] "Transcriptional regulation by RUNX2"
## [17] "Mitochondrial translation elongation"
## [18] "Degradation of beta-catenin by the destruction complex"
If we consider both strategies to be valid, then we can define the significant sets as dysregulated. We can calculate the percent sentitivity of both approaches.
all <- unique(c(ms_up,ms_dn,mc_up))
message("Sensitivity: separate only")
## Sensitivity: separate only
(length(ms_up)+length(ms_dn))/length(all)
## [1] 0.93361
message("Sensitivity: combined only")
## Sensitivity: combined only
length(mc_up)/length(all)
## [1] 0.1410788
Clusterprofiler uses a hypergeometric test. Firstly I will conduct the analysis separately for up and down regulated genes and with the correct backgound (as intended by the developers).
genesets2 <- read.gmt("ReactomePathways.gmt")
de_up <- rownames(subset(de, padj<0.05 & log2FoldChange > 0))
de_up <- unique(gt[which(rownames(gt) %in% de_up),1])
de_dn <- rownames(subset(de, padj<0.05 & log2FoldChange < 0))
de_dn <- unique(gt[which(rownames(gt) %in% de_dn),1])
de_bg <- rownames(de)
de_bg <- unique(gt[which(rownames(gt) %in% de_bg),1])
o_up <- as.data.frame(enricher(gene = de_up, universe = de_bg, maxGSSize = 5000, TERM2GENE = genesets2, pAdjustMethod="fdr"))
o_up <- rownames(subset(o_up, p.adjust < 0.05))
o_dn <- as.data.frame(enricher(gene = de_dn, universe = de_bg, maxGSSize = 5000, TERM2GENE = genesets2, pAdjustMethod="fdr"))
o_dn <- rownames(subset(o_dn, p.adjust < 0.05))
o_com <- as.data.frame(enricher(gene = union(de_up,de_dn), universe = de_bg, maxGSSize = 5000, TERM2GENE = genesets2, pAdjustMethod="fdr"))
o_com <- rownames(subset(o_com, p.adjust < 0.05))
length(o_up)
## [1] 4
length(o_dn)
## [1] 1
length(o_up) + length(o_dn)
## [1] 5
length(o_com)
## [1] 3
( length(o_up) + length(o_dn) ) / length(o_com)
## [1] 1.666667
all <- unique(c(o_up,o_dn,o_com))
message("Sensitivity: separate only")
## Sensitivity: separate only
(length(o_up)+length(o_dn))/length(all)
## [1] 0.8333333
message("Sensitivity: combined only")
## Sensitivity: combined only
length(o_com)/length(all)
## [1] 0.5
Euler diagram of the significant pathways found with each approach.
l2 <- list("sep up"=o_up,"sep dn"=o_dn,"comb"=o_com)
plot(euler(l2),quantities = TRUE, edges = "gray", main="ORA: combined vs separated")
List gene sets which are specific to each approach.
o_sep <- c(o_up,o_dn)
# in sep but not comb
setdiff(o_sep,o_com)
## [1] "Assembly of collagen fibrils and other multimeric structures"
## [2] "Collagen formation"
## [3] "TRP channels"
# in comb but not sep
setdiff(o_com,o_sep)
## [1] "Synaptic adhesion-like molecules"
# intersection
intersect(mc_up,ms)
## [1] "Respiratory electron transport, ATP synthesis by chemiosmotic coupling, and heat production by uncoupling proteins."
## [2] "Respiratory electron transport"
## [3] "Protein-protein interactions at synapses"
## [4] "tRNA processing in the mitochondrion"
## [5] "The citric acid (TCA) cycle and respiratory electron transport"
## [6] "C-type lectin receptors (CLRs)"
## [7] "Mitochondrial translation termination"
## [8] "IKK complex recruitment mediated by RIP1"
## [9] "ER to Golgi Anterograde Transport"
## [10] "Mitochondrial translation initiation"
## [11] "Formation of TC-NER Pre-Incision Complex"
## [12] "COPI-mediated anterograde transport"
## [13] "Cellular response to chemical stress"
## [14] "Mitochondrial translation"
## [15] "TICAM1, RIP1-mediated IKK complex recruitment"
## [16] "Transcriptional regulation by RUNX2"
## [17] "Mitochondrial translation elongation"
## [18] "Degradation of beta-catenin by the destruction complex"
par(cex.main=0.5)
par(mar=c(2,2,2,2))
l3 <- list("ORA up"=o_up,"ORA dn"=o_dn,"ORA comb"=o_com,
"FCS up"=ms_up,"FCS dn"=ms_dn,"FCS comb"=mc_up)
plot(euler(l3),quantities = TRUE, edges = "gray", main="FCS compared to ORA")
dat <- list( "FCS_up"=ms_up,
"FCS_dn"=ms_dn,
"FCS_com"=mc_up,
"ORA_up"= o_up,
"ORA_dn"=o_dn,
"ORA_com"=o_com)
str(dat)
## List of 6
## $ FCS_up : chr [1:69] "Signal Transduction" "Developmental Biology" "Extracellular matrix organization" "Heparan sulfate/heparin (HS-GAG) metabolism" ...
## $ FCS_dn : chr [1:156] "Asparagine N-linked glycosylation" "Defective CFTR causes cystic fibrosis" "ER to Golgi Anterograde Transport" "Vif-mediated degradation of APOBEC3G" ...
## $ FCS_com: chr [1:34] "Respiratory electron transport, ATP synthesis by chemiosmotic coupling, and heat production by uncoupling proteins." "Respiratory electron transport" "Protein-protein interactions at synapses" "tRNA processing in the mitochondrion" ...
## $ ORA_up : chr [1:4] "Extracellular matrix organization" "Collagen chain trimerization" "Assembly of collagen fibrils and other multimeric structures" "Collagen formation"
## $ ORA_dn : chr "TRP channels"
## $ ORA_com: chr [1:3] "Extracellular matrix organization" "Synaptic adhesion-like molecules" "Collagen chain trimerization"
saveRDS(dat,file = "ex5dat.rds")
For mitch, it would appear that performing direction informed (DI) analysis clearly yields more differentially regulated pathways (413) as compared to the direction agnostic (DA) method (55).
That being said, there were 18 pathways identified only in the DA method that appeared to be related to the physiology of the model. These gene sets are likely to contain a mix of genes affected by the stimulus in different ways - for example a mix of up and downregulated genes. Are these really real? Not sure.
This pattern was consistent with ORA, where 80 sets were identified with separate analysis and only 23 with the combined analysis.
When comparing ORA to FCS, we found that FCS identified many more sets than ORA. In fact all gene sets that were identified by ORA were also identified by FCS, except for 3 that were specific to the ORA up set.
Let’s look at those now.
myfcs <- c(ms_up, mc_up)
setdiff(o_up,myfcs)
## character(0)
sessionInfo()
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.3 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
##
## locale:
## [1] LC_CTYPE=en_AU.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_AU.UTF-8 LC_COLLATE=en_AU.UTF-8
## [5] LC_MONETARY=en_AU.UTF-8 LC_MESSAGES=en_AU.UTF-8
## [7] LC_PAPER=en_AU.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] eulerr_6.1.1 mitch_1.4.1
## [3] clusterProfiler_4.0.5 DESeq2_1.32.0
## [5] SummarizedExperiment_1.22.0 Biobase_2.52.0
## [7] MatrixGenerics_1.4.3 matrixStats_0.61.0
## [9] GenomicRanges_1.44.0 GenomeInfoDb_1.28.4
## [11] IRanges_2.26.0 S4Vectors_0.30.2
## [13] BiocGenerics_0.38.0 getDEE2_1.2.0
## [15] beeswarm_0.4.0 kableExtra_1.3.4
##
## loaded via a namespace (and not attached):
## [1] shadowtext_0.1.1 fastmatch_1.1-3 systemfonts_1.0.3
## [4] plyr_1.8.6 igraph_1.2.11 lazyeval_0.2.2
## [7] polylabelr_0.2.0 splines_4.1.2 BiocParallel_1.26.2
## [10] ggplot2_3.3.5 digest_0.6.29 yulab.utils_0.0.4
## [13] htmltools_0.5.2 GOSemSim_2.18.1 viridis_0.6.2
## [16] GO.db_3.13.0 fansi_1.0.0 magrittr_2.0.1
## [19] memoise_2.0.1 Biostrings_2.60.2 annotate_1.70.0
## [22] graphlayouts_0.8.0 svglite_2.0.0 enrichplot_1.12.3
## [25] colorspace_2.0-2 blob_1.2.2 rvest_1.0.2
## [28] ggrepel_0.9.1 xfun_0.29 dplyr_1.0.7
## [31] crayon_1.4.2 RCurl_1.98-1.5 jsonlite_1.7.2
## [34] scatterpie_0.1.7 genefilter_1.74.1 survival_3.2-13
## [37] ape_5.6-1 glue_1.6.0 polyclip_1.10-0
## [40] gtable_0.3.0 zlibbioc_1.38.0 XVector_0.32.0
## [43] webshot_0.5.2 htm2txt_2.1.1 DelayedArray_0.18.0
## [46] scales_1.1.1 DOSE_3.18.3 DBI_1.1.2
## [49] GGally_2.1.2 Rcpp_1.0.7 viridisLite_0.4.0
## [52] xtable_1.8-4 gridGraphics_0.5-1 tidytree_0.3.7
## [55] bit_4.0.4 htmlwidgets_1.5.4 httr_1.4.2
## [58] fgsea_1.18.0 gplots_3.1.1 RColorBrewer_1.1-2
## [61] ellipsis_0.3.2 reshape_0.8.8 pkgconfig_2.0.3
## [64] XML_3.99-0.8 farver_2.1.0 sass_0.4.0
## [67] locfit_1.5-9.4 utf8_1.2.2 later_1.3.0
## [70] ggplotify_0.1.0 tidyselect_1.1.1 rlang_0.4.12
## [73] reshape2_1.4.4 AnnotationDbi_1.54.1 munsell_0.5.0
## [76] tools_4.1.2 cachem_1.0.6 downloader_0.4
## [79] generics_0.1.1 RSQLite_2.2.9 evaluate_0.14
## [82] stringr_1.4.0 fastmap_1.1.0 yaml_2.2.1
## [85] ggtree_3.0.4 knitr_1.37 bit64_4.0.5
## [88] tidygraph_1.2.0 caTools_1.18.2 purrr_0.3.4
## [91] KEGGREST_1.32.0 ggraph_2.0.5 nlme_3.1-153
## [94] mime_0.12 aplot_0.1.2 DO.db_2.9
## [97] xml2_1.3.3 compiler_4.1.2 rstudioapi_0.13
## [100] png_0.1-7 treeio_1.16.2 tibble_3.1.6
## [103] tweenr_1.0.2 geneplotter_1.70.0 bslib_0.3.1
## [106] stringi_1.7.6 highr_0.9 lattice_0.20-45
## [109] Matrix_1.4-0 vctrs_0.3.8 pillar_1.6.4
## [112] lifecycle_1.0.1 jquerylib_0.1.4 data.table_1.14.2
## [115] cowplot_1.1.1 bitops_1.0-7 httpuv_1.6.5
## [118] patchwork_1.1.1 qvalue_2.24.0 R6_2.5.1
## [121] promises_1.2.0.1 KernSmooth_2.23-20 echarts4r_0.4.3
## [124] gridExtra_2.3 gtools_3.9.2 MASS_7.3-54
## [127] assertthat_0.2.1 GenomeInfoDbData_1.2.6 grid_4.1.2
## [130] ggfun_0.0.4 tidyr_1.1.4 rmarkdown_2.11
## [133] ggforce_0.3.3 shiny_1.7.1