Source: https://github.com/markziemann/background/
Here, we look at the sensitivity of pathway analysis of RNA-seq data.
suppressPackageStartupMessages({
library("DESeq2")
library("eulerr")
library("HGNChelper")
library("mitch")
library("kableExtra")
library("beeswarm")
library("gplots")
library("gridExtra")
library("png")
library("fgsea")
library("parallel")
library("RhpcBLASctl")
library("edgeR")
library("vioplot")
})
CORES=16
Reactome pathways were downloaded on the 14th Sept 2023 from MsigDB.
gs <- gmt_import("c2.cp.reactome.v2023.2.Hs.symbols.gmt")
names(gs) <- gsub("REACTOME_","",names(gs))
names(gs) <- gsub("_"," ",names(gs))
From GEO https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE158420 GSE158420_counts.txt.gz
if (!file.exists("GSE158420_counts.txt.gz")) {
download.file("https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE158420&format=file&file=GSE158420%5Fcounts%2Etxt%2Egz",
destfile="GSE158420_counts.txt.gz")
}
x <- read.table("GSE158420_counts.txt.gz",row.names=NULL)
dim(x)
## [1] 18747 75
x <- aggregate(. ~ row.names,x,sum)
dim(x)
## [1] 18745 75
#RUFKM
head(x)
## row.names X10N X10T X11N X11T X12N X12T X14N X14T X15N X15T X16N X16T X17N
## 1 1-Dec 0 31 0 1 0 1 0 0 0 0 0 0 0
## 2 1-Mar 431 461 478 98 862 635 489 499 543 608 707 498 700
## 3 1-Sep 160 131 280 131 132 383 381 124 114 217 258 886 302
## 4 10-Mar 286 217 130 58 2 18 19 39 417 68 12 90 9
## 5 10-Sep 3541 4311 2174 3675 2856 1864 3306 1546 3507 2483 3568 1294 3658
## 6 11-Mar 0 40 6 1 0 0 0 0 0 1 0 0 0
## X17T X18N X18T X19N X19T X20N X20T X21N X21T X22N X22T X23N X23T X24N X24T
## 1 1 0 79 0 18 0 1 0 1 0 0 0 1 0 2
## 2 411 607 446 608 463 361 488 615 350 673 1838 297 262 762 529
## 3 162 133 279 138 88 239 533 212 53 258 126 197 229 137 236
## 4 122 123 313 70 127 1481 1010 97 7 94 21 588 694 92 24
## 5 542 3337 1050 4527 1865 2656 1314 4449 1060 3196 400 3499 1709 3280 3528
## 6 2 1 38 0 18 0 0 1 3 0 3 1 2 0 5
## X25N X25T X26N X26T X27N X27T X28N X28T X29N X29T X2N X2T X30N X30T X31N
## 1 1 0 0 3 0 2 0 2 0 0 0 0 0 0 0
## 2 534 860 451 1110 798 380 572 336 289 514 575 257 464 483 535
## 3 132 66 98 56 573 190 211 75 79 205 151 147 183 144 212
## 4 87 23 126 368 192 114 441 5 2242 19 23 80 616 24 589
## 5 3463 2832 2893 3445 2442 3703 3250 5465 2028 1971 3502 2354 2605 934 3291
## 6 4 3 0 3 0 4 0 0 2 1 0 0 1 3 1
## X31T X32N X32T X33N X33T X34N X34T X35N X35T X36N X36T X37N X37T X38N X38T
## 1 2 0 0 0 0 1 0 0 1 0 1 0 0 0 4
## 2 478 311 280 630 437 713 334 643 789 286 339 488 320 427 507
## 3 144 302 1086 231 363 185 290 125 199 61 355 59 121 92 203
## 4 28 48 27 4 11 956 13 640 63 654 19 1234 2076 380 185
## 5 4226 3135 1416 3072 1702 3356 2515 3034 2532 3678 502 1512 1324 2684 2542
## 6 5 0 0 0 1 1 0 0 1 1 6 0 6 0 9
## X39N X39T X3N X3T X4N X4T X5N X5T X6N X6T X7N X7T X8N X8T X9N
## 1 0 1 1 3 0 2 0 8 2 0 1 1 17 2 0
## 2 695 458 628 287 305 255 801 364 626 320 638 412 264 436 504
## 3 329 174 120 114 119 59 144 44 110 184 181 327 101 217 191
## 4 43 135 11 63 323 33 78 63 28 6 323 16 185 168 76
## 5 1770 1446 2921 1688 2692 2447 3367 4396 3830 2841 2623 3671 1614 2102 3507
## 6 1 2 0 10 2 3 0 1 0 0 2 1 7 9 0
## X9T
## 1 2
## 2 317
## 3 48
## 4 7
## 5 2482
## 6 5
There are some gene name errors, so I will use HGNChelper to repair them.
human=x$row.names
new.hgnc.table <- readRDS("new.hgnc.table.rds")
fix <- checkGeneSymbols(human,map=new.hgnc.table)
## Warning in checkGeneSymbols(human, map = new.hgnc.table): Human gene symbols
## should be all upper-case except for the 'orf' in open reading frames. The case
## of some letters was corrected.
## Warning in checkGeneSymbols(human, map = new.hgnc.table): x contains
## non-approved gene symbols
x$gene <- fix$Suggested.Symbol
x$row.names=NULL
x <- aggregate(. ~ gene,x,sum)
dim(x)
## [1] 18704 75
row.names(x)=x$gene
x$gene=NULL
head(x)
## X10N X10T X11N X11T X12N X12T X14N X14T X15N X15T X16N
## A1BG 120 217 148 39 73 87 146 95 88 69 72
## A1CF 8 29 22 2 0 1 4 0 2 3 1
## A2M 89330 6096 39181 3282 126977 89739 108730 40523 160543 179385 155740
## A2ML1 16 1307 50 2729 26 19 31 393 20 16 21
## A3GALT2 5 49 21 3 2 0 1 5 1 4 5
## A4GALT 304 1138 350 1439 219 149 178 809 446 150 300
## X16T X17N X17T X18N X18T X19N X19T X20N X20T X21N X21T
## A1BG 121 63 30 113 279 43 137 152 129 56 19
## A1CF 0 3 9 4 104 0 46 6 3 1 1
## A2M 39899 129176 16554 109697 15992 125658 3504 106485 59452 194434 10592
## A2ML1 17 15 114 13 591 10 5830 16 38 15 29
## A3GALT2 4 2 3 13 76 3 53 17 5 14 6
## A4GALT 249 203 286 296 787 209 1814 335 554 220 311
## X22N X22T X23N X23T X24N X24T X25N X25T X26N X26T X27N
## A1BG 92 12 106 85 68 101 107 15 50 46 110
## A1CF 1 10 2 5 3 1 3 7 2 5 1
## A2M 127707 7544 129002 107685 84494 8974 125875 3304 133114 23805 124283
## A2ML1 13 146 13 24 16 298 96 2730 7 257 9
## A3GALT2 1 6 4 1 8 11 17 0 4 13 9
## A4GALT 248 115 291 567 177 728 248 536 284 445 235
## X27T X28N X28T X29N X29T X2N X2T X30N X30T X31N X31T
## A1BG 155 143 64 204 125 82 117 97 127 115 39
## A1CF 1 2 4 3 4 0 0 2 6 1 1
## A2M 16776 163693 15056 56772 32450 110818 57261 242936 22835 157038 5604
## A2ML1 3891 74 67 19 50 8 11 8 28 31 93
## A3GALT2 16 6 1 11 6 6 6 19 1 10 8
## A4GALT 1059 586 3534 808 461 280 1505 723 236 442 481
## X32N X32T X33N X33T X34N X34T X35N X35T X36N X36T X37N
## A1BG 82 105 116 104 119 63 101 111 58 90 107
## A1CF 0 0 3 2 3 1 7 4 1 3 4
## A2M 158632 11656 64662 92680 122607 84313 190327 145504 148968 7674 38901
## A2ML1 6 130 42 9 17 31 22 33 9 33 24
## A3GALT2 14 3 6 10 10 12 8 14 1 47 5
## A4GALT 299 94 292 564 345 694 452 521 454 322 656
## X37T X38N X38T X39N X39T X3N X3T X4N X4T X5N X5T X6N
## A1BG 57 57 133 105 125 67 94 99 291 87 59 68
## A1CF 9 1 15 2 5 2 18 1 6 3 10 0
## A2M 65047 82116 123542 127464 91470 99241 9283 117914 5648 89583 2462 97346
## A2ML1 58 12 61 13 26 13 3833 57 499 37 1108 5
## A3GALT2 1 14 30 2 10 12 21 18 11 8 3 2
## A4GALT 949 588 934 209 473 359 1134 257 711 235 627 126
## X6T X7N X7T X8N X8T X9N X9T
## A1BG 27 108 86 194 84 77 8
## A1CF 0 2 6 35 9 1 0
## A2M 9188 98328 29242 15384 26414 121946 4447
## A2ML1 632 8 329 158 1072 13 2879
## A3GALT2 1 16 9 57 9 11 3
## A4GALT 1176 225 2035 498 491 160 1470
MDS plot
patient <- head(as.character(unlist(lapply(1:ncol(x),function(i) {c(i,i)}))),ncol(x))
tumor <- rep(c(0,1),ncol(x)/2)
ss <- data.frame(patient,tumor)
rownames(ss) <- colnames(x)
mds <- cmdscale(dist(t(scale(x))))
cols <- gsub("1","pink",gsub("0","lightblue",tumor))
plot(mds, xlab="Coordinate 1", ylab="Coordinate 2", pch=16,col=cols,cex=2)
text(mds, labels=colnames(x),cex=0.7)
colcols <- as.character(as.numeric(grepl("T",colnames(x))))
colcols <- gsub("1","red",gsub("0","blue",colcols))
gspear <- cor(x,method="spearman")
heatmap.2( gspear, scale="none", ColSideColors=colcols ,
trace="none",margins = c(6,10), cexRow=.6, cexCol=.5, main="Spearman - genes")
mtext("Blue=normal,Red=tumour")
xn <- x[,grep("N",colnames(x))]
nspear <- cor(xn,method="spearman")
summary(as.vector(nspear))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.8960 0.9619 0.9722 0.9675 0.9797 1.0000
xt <- x[,grep("T",colnames(x))]
tspear <- cor(xt,method="spearman")
summary(as.vector(tspear))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.7485 0.8733 0.9028 0.8997 0.9291 1.0000
# rank
xr <- apply(x,2,rank)
res <- apply(xr,2, function(z) {
lapply(gs,function(set) {
median(z[which(names(z) %in% set)])
} )
})
res <- do.call(cbind,res)
dim(res)
## [1] 1692 74
res2 <- apply(res,2,as.numeric)
rownames(res2) <- rownames(res)
res <- res2
res_spear <- cor(res,method="spearman")
heatmap.2( res_spear, scale="none", ColSideColors=colcols ,
trace="none",margins = c(6,10), cexRow=.6, cexCol=.5, main="Spearman - pathways")
mtext("Blue=normal,Red=tumour")
resn <- res[,grep("N",colnames(res))]
resnspear <- cor(resn,method="spearman")
summary(as.vector(resnspear))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.9255 0.9584 0.9706 0.9683 0.9787 1.0000
rest <- res[,grep("T",colnames(x))]
restspear <- cor(rest,method="spearman")
summary(as.vector(restspear))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.6643 0.8576 0.9068 0.8931 0.9352 1.0000
xl <- list("gene N"=as.vector(nspear), "gene T"=as.vector(tspear),
"set N"=as.vector(resnspear),"set T"=as.vector(restspear))
vioplot(xl,main="Within group sample correlation",
col=c("red","red","blue","blue"))
mtext("Spearman rho")
abline(h=median(as.vector(nspear)),lty=2,col="red")
abline(h=median(as.vector(tspear)),lty=2,col="red")
abline(h=median(as.vector(resnspear)),lty=2,col="blue")
abline(h=median(as.vector(restspear)),lty=2,col="blue")
par(mfrow=c(2,2))
gmds <- cmdscale(dist(t(xr)),k=10)/nrow(xr)
barplot(colSums(abs(gmds)),names.arg=1:10,ylab="Gene PCA magnitude")
SUM=round(sum(colSums(abs(gmds))),digits=1)
mtext(paste("Sum =",SUM))
plot(gmds,col=colcols,xlab="MDS1",ylab="MDS2",main="Gene rank MDS")
mtext("Blue=Normal,Red=Tumour")
gsmds <- cmdscale(dist(t(res)),k=10)/nrow(res)
barplot(colSums(abs(gsmds)),names.arg=1:10,ylab="Gene set PCA magnitude")
SUM=round(sum(colSums(abs(gsmds))),digits=1)
mtext(paste("Sum =",SUM))
plot(gsmds,col=colcols,xlab="MDS1",ylab="MDS2",main="Gene set rank MDS")
mtext("Blue=Normal,Red=Tumour")
par(mfrow=c(2,2))
gmdsn <- cmdscale(dist(t(xn)),k=10)/nrow(xn)
barplot(colSums(abs(gmdsn)),names.arg=1:10,main="Gene PCA magnitude - Normal")
SUM=round(sum(colSums(abs(gmdsn))),digits=1)
mtext(paste("Sum =",SUM))
gmdst <- cmdscale(dist(t(xt)),k=10)/nrow(xt)
barplot(colSums(abs(gmdst)),names.arg=1:10,main="Gene PCA magnitude - Tumor")
SUM=round(sum(colSums(abs(gmdst))),digits=1)
mtext(paste("Sum =",SUM))
gsmdsn <- cmdscale(dist(t(resn)),k=10)/nrow(resn)
barplot(colSums(abs(gsmdsn)),names.arg=1:10,main="Gene set PCA magnitude - Normal")
SUM=round(sum(colSums(abs(gsmdsn))),digits=1)
mtext(paste("Sum =",SUM))
gsmdst <- cmdscale(dist(t(rest)),k=10)/nrow(rest)
barplot(colSums(abs(gsmdst)),names.arg=1:10,main="Gene set PCA magnitude - Tumour")
SUM=round(sum(colSums(abs(gsmdst))),digits=1)
mtext(paste("Sum =",SUM))
par(mfrow=c(1,1))
Now do DESeq2
RhpcBLASctl::blas_set_num_threads(4) #set automatic cores to 1
dds <- DESeqDataSetFromMatrix(countData = x , colData=ss, design = ~ patient + tumor)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## the design formula contains one or more numeric variables with integer values,
## specifying a model with increasing fold change for higher values.
## did you mean for this to be a factor? if so, first convert
## this variable to a factor using the factor() function
res <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
z <- results(res)
vsd <- vst(dds, blind=FALSE)
zz <-cbind(as.data.frame(z),assay(vsd))
dge <- as.data.frame(zz[order(zz$pvalue),])
dge_up <- rownames(subset(dge,log2FoldChange>0 & padj<0.05))
dge_dn <- rownames(subset(dge,log2FoldChange<0 & padj<0.05))
genes <- c(paste(dge_up,"up"),paste(dge_dn,"dn"))
sig <- subset(dge,padj<0.05)
SIG = nrow(sig)
DN = nrow(subset(sig,log2FoldChange<0))
UP = nrow(subset(sig,log2FoldChange>0))
HEADER = paste("mRNA", SIG , "DEGs,", UP ,"upregulated,", DN, "downregulated")
# smear
plot(log2(dge$baseMean),dge$log2FoldChange,cex=0.6,cex.axis=1.2,cex.lab=1.3,
xlab="log2 base mean",
ylab="log2 fold change" ,pch=19,col="#838383")
points(log2(sig$baseMean),sig$log2FoldChange,cex=0.6,pch=19,col="red")
mtext((HEADER),cex=1.2)
top <- head(sig,20)
# volcano
plot(dge$log2FoldChange, -log2(dge$pvalue) ,cex=0.6, cex.lab=1.3,cex.axis=1.2,
,xlab="log2 fold change", ylab="-log2 p-value" ,pch=19,col="#838383")
points(sig$log2FoldChange, -log2(sig$pvalue),cex=0.6,pch=19,col="red")
mtext((HEADER),cex=1.2)
# top N gene heatmap
colCols <- rep(c(0,1),ncol(x)/2)
colCols <- gsub("0","gray",colCols)
colCols <- gsub("1","black",colCols)
colfunc <- colorRampPalette(c("blue", "white", "red"))
heatmap.2( as.matrix(dge[1:50,c(7:ncol(dge))]), col=colfunc(25),scale="row",
ColSideColors=colCols ,
trace="none",margins = c(6,10), cexRow=.6, cexCol=.5, main="Top 50 genes")
RNA expression pathway enrichment with mitch
rna <- mitch_import(dge, DEtype="deseq2")
## The input is a single dataframe; one contrast only. Converting
## it to a list for you.
## Note: Mean no. genes in input = 18704
## Note: no. genes in output = 18704
## Note: estimated proportion of input genes in output = 1
res <- mitch_calc(rna, genesets=gs, priority="effect",cores=16, minsetsize=5)
## Note: Enrichments with large effect sizes may not be
## statistically significant.
mres <- res$enrichment_result
msig <- subset(mres,p.adjustANOVA<0.05)
nrow(msig)
## [1] 534
head(subset(mres,p.adjustANOVA<0.05 & s.dist>0),20) %>%
kbl(caption="mRNA upregulated pathways") %>%
kable_styling("hover",full_width=FALSE)
set | setSize | pANOVA | s.dist | p.adjustANOVA | |
---|---|---|---|---|---|
1644 | UNWINDING OF DNA | 12 | 0.0000000 | 0.9476460 | 0.0000002 |
1522 | TFAP2A ACTS AS A TRANSCRIPTIONAL REPRESSOR DURING RETINOIC ACID INDUCED CELL DIFFERENTIATION | 5 | 0.0002657 | 0.9415663 | 0.0015308 |
522 | G2 M DNA REPLICATION CHECKPOINT | 5 | 0.0013580 | 0.8271730 | 0.0064164 |
1008 | PHOSPHORYLATION OF EMI1 | 6 | 0.0005897 | 0.8099977 | 0.0031193 |
250 | CONDENSATION OF PROMETAPHASE CHROMOSOMES | 11 | 0.0000077 | 0.7785821 | 0.0000635 |
1144 | REGULATION OF GENE EXPRESSION IN ENDOCRINE COMMITTED NEUROG3 PROGENITOR CELLS | 5 | 0.0030517 | 0.7649742 | 0.0131351 |
371 | DNA METHYLATION | 63 | 0.0000000 | 0.7451171 | 0.0000000 |
374 | DNA REPLICATION INITIATION | 8 | 0.0002768 | 0.7422938 | 0.0015782 |
376 | DNA STRAND ELONGATION | 32 | 0.0000000 | 0.7418428 | 0.0000000 |
48 | ACTIVATION OF THE PRE REPLICATIVE COMPLEX | 33 | 0.0000000 | 0.7372399 | 0.0000000 |
118 | ASSEMBLY OF THE ORC COMPLEX AT THE ORIGIN OF REPLICATION | 67 | 0.0000000 | 0.7220965 | 0.0000000 |
335 | DEPOSITION OF NEW CENPA CONTAINING NUCLEOSOMES AT THE CENTROMERE | 67 | 0.0000000 | 0.7202713 | 0.0000000 |
1053 | PRC2 METHYLATES HISTONES AND DNA | 71 | 0.0000000 | 0.7181092 | 0.0000000 |
395 | E2F ENABLED INHIBITION OF PRE REPLICATION COMPLEX FORMATION | 9 | 0.0001998 | 0.7158501 | 0.0012005 |
251 | CONDENSATION OF PROPHASE CHROMOSOMES | 71 | 0.0000000 | 0.7155805 | 0.0000000 |
27 | ACTIVATION OF ATR IN RESPONSE TO REPLICATION STRESS | 37 | 0.0000000 | 0.7122379 | 0.0000000 |
1039 | POLO LIKE KINASE MEDIATED EVENTS | 16 | 0.0000019 | 0.6881559 | 0.0000169 |
192 | CDC6 ASSOCIATION WITH THE ORC ORIGIN COMPLEX | 8 | 0.0007932 | 0.6849615 | 0.0040183 |
425 | ERCC6 CSB AND EHMT2 G9A POSITIVELY REGULATE RRNA EXPRESSION | 74 | 0.0000000 | 0.6830573 | 0.0000000 |
101 | APOPTOSIS INDUCED DNA FRAGMENTATION | 13 | 0.0000259 | 0.6737971 | 0.0001960 |
head(subset(mres,p.adjustANOVA<0.05 & s.dist<0),20) %>%
kbl(caption="mRNA downregulated pathways") %>%
kable_styling("hover",full_width=FALSE)
set | setSize | pANOVA | s.dist | p.adjustANOVA | |
---|---|---|---|---|---|
350 | DISEASES ASSOCIATED WITH SURFACTANT METABOLISM | 10 | 0.0000043 | -0.8389466 | 0.0000374 |
313 | DEFECTIVE CSF2RB CAUSES SMDP5 | 8 | 0.0000618 | -0.8177618 | 0.0004276 |
153 | BIOSYNTHESIS OF EPA DERIVED SPMS | 6 | 0.0006262 | -0.8061607 | 0.0032914 |
843 | MUCOPOLYSACCHARIDOSES | 11 | 0.0000194 | -0.7437856 | 0.0001500 |
429 | ERYTHROCYTES TAKE UP OXYGEN AND RELEASE CARBON DIOXIDE | 9 | 0.0001294 | -0.7366999 | 0.0008276 |
1660 | VITAMIN B1 THIAMIN METABOLISM | 5 | 0.0073591 | -0.6920826 | 0.0266767 |
107 | ARACHIDONATE PRODUCTION FROM DAG | 5 | 0.0092239 | -0.6723322 | 0.0319887 |
909 | NOSTRIN MEDIATED ENOS TRAFFICKING | 5 | 0.0154679 | -0.6252367 | 0.0492747 |
1267 | RSK ACTIVATION | 7 | 0.0044561 | -0.6206870 | 0.0180173 |
428 | ERYTHROCYTES TAKE UP CARBON DIOXIDE AND RELEASE OXYGEN | 13 | 0.0001161 | -0.6172969 | 0.0007483 |
30 | ACTIVATION OF C3 AND C5 | 7 | 0.0077581 | -0.5810805 | 0.0280025 |
505 | FORMYL PEPTIDE RECEPTORS BIND FORMYL PEPTIDES AND MANY OTHER LIGANDS | 8 | 0.0054713 | -0.5671365 | 0.0211070 |
742 | LOSS OF FUNCTION OF SMAD2 3 IN CANCER | 7 | 0.0108923 | -0.5557058 | 0.0368630 |
923 | NR1H2 NR1H3 REGULATE GENE EXPRESSION TO CONTROL BILE ACID HOMEOSTASIS | 9 | 0.0040866 | -0.5527102 | 0.0167243 |
1137 | REGULATION OF CYTOSKELETAL REMODELING AND CELL SPREADING BY IPP COMPLEX COMPONENTS | 8 | 0.0069501 | -0.5510786 | 0.0255242 |
1440 | STAT5 ACTIVATION | 7 | 0.0139703 | -0.5364789 | 0.0452743 |
983 | PD 1 SIGNALING | 21 | 0.0000223 | -0.5344633 | 0.0001705 |
627 | HYALURONAN UPTAKE AND DEGRADATION | 12 | 0.0015299 | -0.5282934 | 0.0071085 |
1404 | SIGNALING BY TGF BETA RECEPTOR COMPLEX IN CANCER | 8 | 0.0112943 | -0.5172414 | 0.0377675 |
774 | METABOLISM OF ANGIOTENSINOGEN TO ANGIOTENSINS | 18 | 0.0001492 | -0.5162639 | 0.0009399 |
dge$stat[which(is.na(dge$stat))] <- 1
stat <- dge$stat
names(stat) <- rownames(dge)
fres <- fgsea(pathways=gs,stats=stat,minSize=5, nproc=8)
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0.57% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
## | | | 0% | |= | 1% | |= | 2% | |== | 2% | |== | 3% | |=== | 4% | |=== | 5% | |==== | 5% | |==== | 6% | |===== | 7% | |===== | 8% | |====== | 8% | |====== | 9% | |======= | 10% | |======== | 11% | |======== | 12% | |========= | 12% | |========= | 13% | |========== | 14% | |========== | 15% | |=========== | 15% | |=========== | 16% | |============ | 17% | |============ | 18% | |============= | 18% | |============= | 19% | |============== | 20% | |=============== | 21% | |=============== | 22% | |================ | 22% | |================ | 23% | |================= | 24% | |================= | 25% | |================== | 25% | |================== | 26% | |=================== | 27% | |=================== | 28% | |==================== | 28% | |==================== | 29% | |===================== | 30% | |====================== | 31% | |====================== | 32% | |======================= | 32% | |======================= | 33% | |======================== | 34% | |======================== | 35% | |========================= | 35% | |========================= | 36% | |========================== | 37% | |========================== | 38% | |=========================== | 38% | |=========================== | 39% | |============================ | 40% | |============================= | 41% | |============================= | 42% | |============================== | 42% | |============================== | 43% | |=============================== | 44% | |=============================== | 45% | |================================ | 45% | |================================ | 46% | |================================= | 47% | |================================= | 48% | |================================== | 48% | |================================== | 49% | |=================================== | 50% | |==================================== | 51% | |==================================== | 52% | |===================================== | 52% | |===================================== | 53% | |====================================== | 54% | |====================================== | 55% | |======================================= | 55% | |======================================= | 56% | |======================================== | 57% | |======================================== | 58% | |========================================= | 58% | |========================================= | 59% | |========================================== | 60% | |=========================================== | 61% | |=========================================== | 62% | |============================================ | 62% | |============================================ | 63% | |============================================= | 64% | |============================================= | 65% | |============================================== | 65% | |============================================== | 66% | |=============================================== | 67% | |=============================================== | 68% | |================================================ | 68% | |================================================ | 69% | |================================================= | 70% | |================================================== | 71% | |================================================== | 72% | |=================================================== | 72% | |=================================================== | 73% | |==================================================== | 74% | |==================================================== | 75% | |===================================================== | 75% | |===================================================== | 76% | |====================================================== | 77% | |====================================================== | 78% | |======================================================= | 78% | |======================================================= | 79% | |======================================================== | 80% | |========================================================= | 81% | |========================================================= | 82% | |========================================================== | 82% | |========================================================== | 83% | |=========================================================== | 84% | |=========================================================== | 85% | |============================================================ | 85% | |============================================================ | 86% | |============================================================= | 87% | |============================================================= | 88% | |============================================================== | 88% | |============================================================== | 89% | |=============================================================== | 90% | |================================================================ | 91% | |================================================================ | 92% | |================================================================= | 92% | |================================================================= | 93% | |================================================================== | 94% | |================================================================== | 95% | |=================================================================== | 95% | |=================================================================== | 96% | |==================================================================== | 97% | |==================================================================== | 98% | |===================================================================== | 98% | |===================================================================== | 99% | |======================================================================| 100%
## Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
## minSize, : For some pathways, in reality P-values are less than 1e-50. You can
## set the `eps` argument to zero for better estimation.
fsig <- subset(fres,padj<0.05)
nrow(fsig)
## [1] 457
fres <- fres[order(-abs(fres$ES)),]
head(subset(fres,padj<0.05 & ES>0),20) %>%
kbl(caption="mRNA upregulated pathways") %>%
kable_styling("hover",full_width=FALSE)
pathway | pval | padj | log2err | ES | NES | size | leadingEdge |
---|---|---|---|---|---|---|---|
PHOSPHORYLATION OF EMI1 | 0.0000001 | 0.0000010 | 0.7049757 | 0.9745672 | 2.083196 | 6 | CDC20, C…. |
TFAP2A ACTS AS A TRANSCRIPTIONAL REPRESSOR DURING RETINOIC ACID INDUCED CELL DIFFERENTIATION | 0.0000900 | 0.0006615 | 0.5384341 | 0.9292476 | 1.891911 | 5 | MYBL2, T…. |
G2 M DNA REPLICATION CHECKPOINT | 0.0003958 | 0.0024152 | 0.4984931 | 0.8971791 | 1.826621 | 5 | CCNB2, C…. |
UNWINDING OF DNA | 0.0000000 | 0.0000003 | 0.7337620 | 0.8956446 | 2.407643 | 12 | CDC45, G…. |
CONDENSATION OF PROMETAPHASE CHROMOSOMES | 0.0000013 | 0.0000135 | 0.6435518 | 0.8637411 | 2.255534 | 11 | NCAPH, C…. |
P75NTR NEGATIVELY REGULATES CELL CYCLE VIA SC1 | 0.0006720 | 0.0037848 | 0.4772708 | 0.8621706 | 1.842942 | 6 | HDAC2, N…. |
TFAP2 AP 2 FAMILY REGULATES TRANSCRIPTION OF CELL CYCLE FACTORS | 0.0021591 | 0.0108184 | 0.4317077 | 0.8599702 | 1.750865 | 5 | TFAP2A, …. |
VASOPRESSIN LIKE RECEPTORS | 0.0021553 | 0.0108184 | 0.4317077 | 0.8321301 | 1.778728 | 6 | OXT, AVP…. |
REGULATION OF GENE EXPRESSION IN ENDOCRINE COMMITTED NEUROG3 PROGENITOR CELLS | 0.0080562 | 0.0326906 | 0.3807304 | 0.8211669 | 1.671863 | 5 | NKX2-2, …. |
POLO LIKE KINASE MEDIATED EVENTS | 0.0000006 | 0.0000068 | 0.6594444 | 0.8057940 | 2.412862 | 16 | MYBL2, C…. |
DNA REPLICATION INITIATION | 0.0008626 | 0.0047162 | 0.4772708 | 0.8028437 | 1.881614 | 8 | POLE2, P…. |
PROTON COUPLED MONOCARBOXYLATE TRANSPORT | 0.0055222 | 0.0238446 | 0.4070179 | 0.8003236 | 1.710740 | 6 | SLC16A1,…. |
E2F ENABLED INHIBITION OF PRE REPLICATION COMPLEX FORMATION | 0.0004037 | 0.0024542 | 0.4984931 | 0.7923224 | 1.941573 | 9 | ORC6, CC…. |
REPLACEMENT OF PROTAMINES BY NUCLEOSOMES IN THE MALE PRONUCLEUS | 0.0000000 | 0.0000000 | 1.0376962 | 0.7849609 | 2.917701 | 42 | H2BC17, …. |
CDC6 ASSOCIATION WITH THE ORC ORIGIN COMPLEX | 0.0021300 | 0.0107391 | 0.4317077 | 0.7759047 | 1.818477 | 8 | CDC6, OR…. |
DNA STRAND ELONGATION | 0.0000000 | 0.0000000 | 0.8634154 | 0.7716566 | 2.709950 | 32 | CDC45, G…. |
ASSEMBLY OF THE ORC COMPLEX AT THE ORIGIN OF REPLICATION | 0.0000000 | 0.0000000 | 1.2125446 | 0.7630369 | 3.154493 | 68 | H3C13, O…. |
DEPOSITION OF NEW CENPA CONTAINING NUCLEOSOMES AT THE CENTROMERE | 0.0000000 | 0.0000000 | 1.2039752 | 0.7612403 | 3.147066 | 68 | HJURP, H…. |
ACTIVATION OF ATR IN RESPONSE TO REPLICATION STRESS | 0.0000000 | 0.0000000 | 0.8986712 | 0.7591156 | 2.777764 | 37 | CDC6, CD…. |
DNA METHYLATION | 0.0000000 | 0.0000000 | 1.1778933 | 0.7587791 | 3.073608 | 64 | H3C13, H…. |
head(subset(fres,padj<0.05 & ES<0),20) %>%
kbl(caption="mRNA downregulated pathways") %>%
kable_styling("hover",full_width=FALSE)
pathway | pval | padj | log2err | ES | NES | size | leadingEdge |
---|---|---|---|---|---|---|---|
DISEASES ASSOCIATED WITH SURFACTANT METABOLISM | 0.0000005 | 0.0000055 | 0.6594444 | -0.8906262 | -2.390803 | 10 | SFTPC, S…. |
DEFECTIVE CSF2RB CAUSES SMDP5 | 0.0000136 | 0.0001202 | 0.5933255 | -0.8905215 | -2.273767 | 8 | SFTPC, S…. |
NOSTRIN MEDIATED ENOS TRAFFICKING | 0.0008186 | 0.0045197 | 0.4772708 | -0.8868488 | -1.881345 | 5 | CAV1, NO…. |
ERYTHROCYTES TAKE UP OXYGEN AND RELEASE CARBON DIOXIDE | 0.0000919 | 0.0006730 | 0.5384341 | -0.8262034 | -2.171420 | 9 | CA4, HBA…. |
SYNTHESIS OF LIPOXINS LX | 0.0044828 | 0.0202387 | 0.4070179 | -0.8189498 | -1.852820 | 6 | ALOX5AP,…. |
BIOSYNTHESIS OF EPA DERIVED SPMS | 0.0105255 | 0.0398314 | 0.3807304 | -0.7827040 | -1.770817 | 6 | LTA4H, A…. |
RSK ACTIVATION | 0.0057469 | 0.0247512 | 0.4070179 | -0.7739407 | -1.853260 | 7 | RPS6KA1,…. |
MUCOPOLYSACCHARIDOSES | 0.0003149 | 0.0019937 | 0.4984931 | -0.7728430 | -2.164065 | 11 | HYAL1, I…. |
PD 1 SIGNALING | 0.0000000 | 0.0000005 | 0.7195128 | -0.7712443 | -2.608149 | 21 | HLA-DRB5…. |
ERYTHROCYTES TAKE UP CARBON DIOXIDE AND RELEASE OXYGEN | 0.0001307 | 0.0009209 | 0.5188481 | -0.7568012 | -2.255598 | 13 | CA4, HBA…. |
ACYL CHAIN REMODELING OF DAG AND TAG | 0.0103527 | 0.0394432 | 0.3807304 | -0.7478565 | -1.790800 | 7 | PNPLA2, MGLL |
LOSS OF FUNCTION OF SMAD2 3 IN CANCER | 0.0115505 | 0.0432244 | 0.3807304 | -0.7430239 | -1.779228 | 7 | TGFBR2, …. |
SIGNALING BY TGF BETA RECEPTOR COMPLEX IN CANCER | 0.0080493 | 0.0326906 | 0.3807304 | -0.7295896 | -1.862860 | 8 | TGFBR2, …. |
REGULATION OF CYTOSKELETAL REMODELING AND CELL SPREADING BY IPP COMPLEX COMPONENTS | 0.0086340 | 0.0342916 | 0.3807304 | -0.7277154 | -1.858074 | 8 | ARHGEF6,…. |
MAPK1 ERK2 ACTIVATION | 0.0034787 | 0.0161381 | 0.4317077 | -0.7165852 | -1.883323 | 9 | IL6R, IL…. |
NR1H2 NR1H3 REGULATE GENE EXPRESSION TO CONTROL BILE ACID HOMEOSTASIS | 0.0034787 | 0.0161381 | 0.4317077 | -0.7164348 | -1.882927 | 9 | NR1H2, N…. |
REGULATION OF FOXO TRANSCRIPTIONAL ACTIVITY BY ACETYLATION | 0.0048663 | 0.0215653 | 0.4070179 | -0.6975844 | -1.872600 | 10 | TXNIP, F…. |
AKT PHOSPHORYLATES TARGETS IN THE NUCLEUS | 0.0063138 | 0.0266477 | 0.4070179 | -0.6898235 | -1.812988 | 9 | FOXO4, N…. |
FCGR ACTIVATION | 0.0026294 | 0.0127607 | 0.4317077 | -0.6818157 | -1.989282 | 12 | FGR, FCG…. |
BETA OXIDATION OF PRISTANOYL COA | 0.0084325 | 0.0336500 | 0.3807304 | -0.6810130 | -1.789832 | 9 | ACOXL, A…. |
fsets <- paste(fsig$pathway,as.character(fsig$ES > 0))
str(fsets)
## chr [1:457] "ABC TRANSPORTER DISORDERS TRUE" ...
NRES=nrow(fres)
NSIG=nrow(fsig)
NUP=nrow(subset(fsig,ES>0))
NDN=nrow(subset(fsig,ES<0))
HEADER=paste(NRES,"pathways,",NSIG,"with FDR<0.05,",NUP,"up and",NDN,"down")
plot(fres$ES,-log10(fres$pval),pch=19,cex=0.6,col="gray",
xlab="ES",ylab="-log10 p-value",main="FGSEA")
points(fsig$ES,-log10(fsig$pval),cex=0.6,col="red",pch=19)
mtext(HEADER)
sigup <- rownames(subset(dge,padj<0.05 & log2FoldChange>0))
sigdn <- rownames(subset(dge,padj<0.05 & log2FoldChange<0))
bg <- rownames(dge)
oresup <- as.data.frame(fora(pathways=gs,genes=sigup,universe=bg,minSize=5))
oresup$es <- (oresup$overlap / length(sigup) ) / (oresup$size / length(bg))
head(oresup,10)
## pathway pval padj overlap size
## 1 METABOLISM OF RNA 5.768129e-54 9.713530e-51 481 709
## 2 CELL CYCLE 1.556582e-40 1.310642e-37 436 675
## 3 CELL CYCLE MITOTIC 3.816447e-35 2.142299e-32 358 547
## 4 CELL CYCLE CHECKPOINTS 1.188430e-31 5.003290e-29 208 282
## 5 DNA REPLICATION 5.502834e-27 1.850513e-24 146 187
## 6 TRANSLATION 6.593276e-27 1.850513e-24 203 287
## 7 M PHASE 8.492242e-26 2.042991e-23 263 403
## 8 G2 M CHECKPOINTS 1.294813e-25 2.725582e-23 131 165
## 9 DNA REPLICATION PRE INITIATION 1.503811e-24 2.813797e-22 126 159
## 10 RRNA PROCESSING 1.113509e-21 1.875150e-19 144 197
## overlapGenes es
## 1 A1CF, AD.... 1.711284
## 2 AHCTF1, .... 1.629319
## 3 AHCTF1, .... 1.650893
## 4 AHCTF1, .... 1.860534
## 5 ANAPC1, .... 1.969403
## 6 AARS1, A.... 1.784175
## 7 AHCTF1, .... 1.646168
## 8 ATR, ATR.... 2.002676
## 9 ANAPC1, .... 1.998926
## 10 BMS1, BO.... 1.843825
oresdn <- as.data.frame(fora(pathways=gs,genes=sigdn,universe=bg,minSize=5))
oresdn$es <- (oresdn$overlap / length(sigdn) ) / (oresdn$size / length(bg))
head(oresup,10)
## pathway pval padj overlap size
## 1 METABOLISM OF RNA 5.768129e-54 9.713530e-51 481 709
## 2 CELL CYCLE 1.556582e-40 1.310642e-37 436 675
## 3 CELL CYCLE MITOTIC 3.816447e-35 2.142299e-32 358 547
## 4 CELL CYCLE CHECKPOINTS 1.188430e-31 5.003290e-29 208 282
## 5 DNA REPLICATION 5.502834e-27 1.850513e-24 146 187
## 6 TRANSLATION 6.593276e-27 1.850513e-24 203 287
## 7 M PHASE 8.492242e-26 2.042991e-23 263 403
## 8 G2 M CHECKPOINTS 1.294813e-25 2.725582e-23 131 165
## 9 DNA REPLICATION PRE INITIATION 1.503811e-24 2.813797e-22 126 159
## 10 RRNA PROCESSING 1.113509e-21 1.875150e-19 144 197
## overlapGenes es
## 1 A1CF, AD.... 1.711284
## 2 AHCTF1, .... 1.629319
## 3 AHCTF1, .... 1.650893
## 4 AHCTF1, .... 1.860534
## 5 ANAPC1, .... 1.969403
## 6 AARS1, A.... 1.784175
## 7 AHCTF1, .... 1.646168
## 8 ATR, ATR.... 2.002676
## 9 ANAPC1, .... 1.998926
## 10 BMS1, BO.... 1.843825
oresup <- subset(oresup,padj<0.05)
oresup <- paste(oresup$pathway,"TRUE")
oresdn <- subset(oresdn,padj<0.05)
oresdn <- paste(oresdn$pathway,"FALSE")
osets <- c(oresup,oresdn)
str(osets)
## chr [1:375] "METABOLISM OF RNA TRUE" "CELL CYCLE TRUE" ...
Assess the effect of gene number on pathway results.
sizes <- c(100,200,500,1000,2000,3000,4000,5000,6000,7000,8000)
ores <- lapply(sizes, function(size) {
sigup <- head(rownames(subset(dge,padj<0.05 & log2FoldChange>0)),size)
sigdn <- head(rownames(subset(dge,padj<0.05 & log2FoldChange<0)),size)
bg <- rownames(dge)
oresup <- as.data.frame(fora(pathways=gs,genes=sigup,universe=bg,minSize=5))
oresup$es <- (oresup$overlap / length(sigup) ) / (oresup$size / length(bg))
nup <- nrow(subset(oresup,padj<0.05))
oresdn <- as.data.frame(fora(pathways=gs,genes=sigdn,universe=bg,minSize=5))
oresdn$es <- (oresdn$overlap / length(sigdn) ) / (oresdn$size / length(bg))
ndn <- nrow(subset(oresdn,padj<0.05))
return(c(nup,ndn))
})
ores <- do.call(rbind,ores)
rownames(ores) <- sizes
colnames(ores) <- c("up","dn")
ores <- as.data.frame(ores)
ores$tot <- ores$up + ores$dn
tot <- ores$tot
names(tot) <- sizes
par(mar=c(mar = c(5.1, 5.1, 4.1, 2.1)))
barplot(tot,horiz=TRUE,las=1,xlab="no. pathways FDR<0.05",ylab="no. genes selected")
SAMPLESIZES=c(2,3,5,10,15,20,25,30)
#SAMPLESIZES=c(3,6,10,22)
dres <- lapply(SAMPLESIZES,function(n) {
SEEDS <- seq(100,5000,100) # 50 repetitions
dsres <- mclapply(SEEDS, function(SEED) {
set.seed(SEED)
mysample <- sample(1:37,n,replace=FALSE)*2
mysample <- c(mysample,(mysample-1))
mysample <- mysample[order(mysample)]
ss2 <- ss[mysample,]
x2 <- x[,mysample]
x2 <- x2[rowMeans(x2)>=10,]
dds <- DESeqDataSetFromMatrix(countData = x2 , colData=ss2, design = ~ patient + tumor)
res <- DESeq(dds)
z <- results(res)
vsd <- vst(dds, blind=FALSE)
zz <-cbind(as.data.frame(z),assay(vsd))
dge <- as.data.frame(zz[order(zz$pvalue),])
dge_up <- rownames(subset(dge,log2FoldChange>0 & padj<0.05))
dge_dn <- rownames(subset(dge,log2FoldChange<0 & padj<0.05))
genes2 <- c(paste(dge_up,"up"),paste(dge_dn,"dn"))
TOT=length(genes2)
TP=length(which(genes2 %in% genes))
FP=length(which(!genes2 %in% genes))
FN=length(which(!genes %in% genes2))
RES=c("TOT"=TOT,"TP"=TP,"FP"=FP,"FN"=FN)
RES
},mc.cores=16)
do.call(rbind,dsres)
})
ssres <- dres
par(mfrow=c(3,1))
dots <- lapply(ssres,function(x) { x[,"TP"] })
lapply(dots,summary)
## [[1]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 87.0 537.5 1312.0 1387.3 1895.0 3622.0
##
## [[2]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 36 1198 1831 2071 2884 4540
##
## [[3]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 622 2064 3620 3455 4678 6113
##
## [[4]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4087 5680 6583 6439 7250 8084
##
## [[5]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 6774 7564 8078 8027 8437 9254
##
## [[6]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 7854 8880 9178 9156 9449 9987
##
## [[7]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 9396 9725 9888 9904 10047 10548
##
## [[8]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 10086 10344 10524 10493 10628 10898
boxplot(dots,names=SAMPLESIZES,col="white",ylab="no. consistent",xlab="sample size",cex=0,main="DESeq2",ylim=c(0,nrow(sig))) ; grid()
beeswarm(dots,add=TRUE,pch=1,cex=0.6)
abline(h=nrow(sig))
dots <- lapply(ssres,function(x) { x[,"FP"] })
lapply(dots,summary)
## [[1]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.00 22.25 64.50 72.32 108.00 271.00
##
## [[2]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.00 39.00 53.00 87.52 125.50 357.00
##
## [[3]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 9.0 49.0 96.0 116.3 147.5 500.0
##
## [[4]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 59.0 142.5 194.5 201.9 256.2 471.0
##
## [[5]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 90.0 159.2 218.5 231.2 305.8 502.0
##
## [[6]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 105.0 201.2 231.5 257.6 311.5 675.0
##
## [[7]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 124.0 196.2 244.5 261.8 329.2 460.0
##
## [[8]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 118.0 222.0 245.0 270.5 299.2 609.0
boxplot(dots,names=SAMPLESIZES,col="white",ylab="no. inconsistent",xlab="sample size",cex=0) ; grid()
beeswarm(dots,add=TRUE,pch=1,cex=0.6)
dots <- lapply(ssres,function(x) { x[,"TP"] })
TPdots <- do.call(rbind,dots)
dots <- lapply(ssres,function(x) { x[,"FP"] })
FPdots <- do.call(rbind,dots)
ratio <- t(FPdots / (TPdots + FPdots))
ratio <- lapply(1:ncol(ratio),function(i) {ratio[,i]})
lapply(ratio,summary)
## [[1]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.01250 0.03029 0.04288 0.05598 0.06400 0.20000
##
## [[2]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.009335 0.023119 0.035135 0.043507 0.052306 0.199739
##
## [[3]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.007779 0.019386 0.028803 0.031149 0.039594 0.076748
##
## [[4]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.01190 0.02288 0.02780 0.02964 0.03493 0.06199
##
## [[5]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.01184 0.02024 0.02613 0.02763 0.03479 0.05731
##
## [[6]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.01099 0.02097 0.02473 0.02719 0.03251 0.07138
##
## [[7]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.01266 0.01952 0.02410 0.02563 0.03185 0.04561
##
## [[8]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.01127 0.02041 0.02302 0.02507 0.02730 0.05605
boxplot(ratio,names=SAMPLESIZES,col="white",ylab="proportion inconsistent",xlab="sample size",cex=0,ylim=c(0,0.2)) ; grid()
beeswarm(ratio,add=TRUE,pch=1,cex=0.6)
This will run the FGSEA analysis at group sizes of 2, 3, 6, 10, 16, 22 and 30, 50 times each, with pseudorandom seeds from 100 to 5000 jumping up by 100.
RhpcBLASctl::blas_set_num_threads(1) #set automatic cores to 1
SAMPLESIZES=c(2,3,5,10,15,20,25,30)
#SAMPLESIZES=c(3,6,10,22)
fgres <- lapply(SAMPLESIZES,function(n) {
SEEDS <- seq(100,5000,100) # 50 repetitions
dsres <- mclapply(SEEDS, function(SEED) {
set.seed(SEED)
mysample <- sample(1:37,n,replace=FALSE)*2
mysample <- c(mysample,(mysample-1))
mysample <- mysample[order(mysample)]
ss2 <- ss[mysample,]
x2 <- x[,mysample]
x2 <- x2[rowMeans(x2)>=10,]
dds <- DESeqDataSetFromMatrix(countData = x2 , colData=ss2, design = ~ patient + tumor)
res <- DESeq(dds)
z <- results(res)
vsd <- vst(dds, blind=FALSE)
zz <-cbind(as.data.frame(z),assay(vsd))
dge <- as.data.frame(zz[order(zz$pvalue),])
dge$stat[which(is.na(dge$stat))] <- 1
stat <- dge$stat
names(stat) <- rownames(dge)
fres <- fgsea(pathways=gs,stats=stat,minSize=5,nproc=3)
fres <- fres[order(-abs(fres$ES)),]
fsig <- subset(fres,padj<0.05)
fsets2 <- paste(fsig$pathway,as.character(fsig$ES > 0))
TOT=length(fsets2)
TP=length(which(fsets2 %in% fsets))
FP=length(which(!fsets2 %in% fsets))
FN=length(which(!fsets %in% fsets2))
RES=c("TOT"=TOT,"TP"=TP,"FP"=FP,"FN"=FN)
RES
},mc.cores=16)
do.call(rbind,dsres)
})
ssres <- fgres
par(mfrow=c(3,1))
dots <- lapply(ssres,function(x) { x[,"TP"] })
lapply(dots,summary)
## [[1]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 31.0 214.2 278.0 258.9 306.8 360.0
##
## [[2]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 129.0 256.8 306.5 289.0 335.8 361.0
##
## [[3]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 206.0 309.0 346.0 327.1 356.0 390.0
##
## [[4]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 249.0 336.5 364.0 357.1 381.0 400.0
##
## [[5]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 276.0 361.0 376.5 372.8 389.0 427.0
##
## [[6]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 331.0 373.2 388.5 385.3 400.8 424.0
##
## [[7]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 352.0 386.2 397.0 395.8 409.8 423.0
##
## [[8]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 377.0 402.2 407.0 406.6 413.8 429.0
boxplot(dots,names=SAMPLESIZES,col="white",ylab="no. consistent",xlab="sample size",cex=0,main="fgsea",ylim=c(0,nrow(fsig))) ; grid()
beeswarm(dots,add=TRUE,pch=1,cex=0.6)
abline(h=nrow(fsig))
dots <- lapply(ssres,function(x) { x[,"FP"] })
lapply(dots,summary)
## [[1]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 24.00 46.00 68.00 74.14 90.00 243.00
##
## [[2]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 24.00 53.00 74.00 76.58 92.50 195.00
##
## [[3]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 27.00 50.50 68.50 75.88 94.50 195.00
##
## [[4]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 19.00 38.50 59.00 62.34 84.75 124.00
##
## [[5]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 17.00 35.00 48.50 50.14 62.50 112.00
##
## [[6]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 14.00 27.00 41.50 40.92 50.25 103.00
##
## [[7]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 12.00 25.25 32.00 34.08 41.75 71.00
##
## [[8]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 10.00 21.25 28.00 29.52 36.75 57.00
boxplot(dots,names=SAMPLESIZES,col="white",ylab="no. inconsistent",xlab="sample size",cex=0) ; grid()
beeswarm(dots,add=TRUE,pch=1,cex=0.6)
dots <- lapply(ssres,function(x) { x[,"TP"] })
TPdots <- do.call(rbind,dots)
dots <- lapply(ssres,function(x) { x[,"FP"] })
FPdots <- do.call(rbind,dots)
ratio <- t(FPdots / (TPdots + FPdots))
ratio <- lapply(1:ncol(ratio),function(i) {ratio[,i]})
lapply(ratio,summary)
## [[1]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1016 0.1487 0.2124 0.2201 0.2668 0.5271
##
## [[2]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.07895 0.15193 0.18893 0.20300 0.23072 0.44722
##
## [[3]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.07349 0.14169 0.16749 0.18383 0.21206 0.45775
##
## [[4]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.06090 0.09645 0.13721 0.14412 0.18900 0.25373
##
## [[5]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.05000 0.08652 0.11060 0.11552 0.14127 0.22400
##
## [[6]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.04058 0.06821 0.09416 0.09339 0.11322 0.19961
##
## [[7]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.03291 0.05882 0.07413 0.07780 0.09517 0.15536
##
## [[8]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.02571 0.05053 0.06492 0.06693 0.08059 0.12232
boxplot(ratio,names=SAMPLESIZES,col="white",ylab="proportion inconsistent",xlab="sample size",cex=0,ylim=c(0,0.5)) ; grid()
beeswarm(ratio,add=TRUE,pch=1,cex=0.6)
SAMPLESIZES=c(2,3,5,10,15,20,25,30)
fores <- lapply(SAMPLESIZES,function(n) {
SEEDS <- seq(100,5000,100) # 50 repetitions
dsres <- mclapply(SEEDS, function(SEED) {
set.seed(SEED)
mysample <- sample(1:37,n,replace=FALSE)*2
mysample <- c(mysample,(mysample-1))
mysample <- mysample[order(mysample)]
ss2 <- ss[mysample,]
x2 <- x[,mysample]
x2 <- x2[rowMeans(x2)>=10,]
dds <- DESeqDataSetFromMatrix(countData = x2 , colData=ss2, design = ~ patient + tumor)
res <- DESeq(dds)
z <- results(res)
vsd <- vst(dds, blind=FALSE)
zz <-cbind(as.data.frame(z),assay(vsd))
dge <- as.data.frame(zz[order(zz$pvalue),])
sigup <- rownames(subset(dge,padj<0.05 & log2FoldChange>0))
if (length(sigup) < 250) { sigup <- head(rownames(subset(dge, log2FoldChange>0)),250) }
sigdn <- rownames(subset(dge,padj<0.05 & log2FoldChange<0))
if (length(sigdn) < 250) { sigdn <- head(rownames(subset(dge, log2FoldChange<0)),250) }
bg <- rownames(dge)
oresup <- as.data.frame(fora(pathways=gs,genes=sigup,universe=bg,minSize=5))
oresup <- subset(oresup,padj<0.05)
oresup <- paste(oresup$pathway,"TRUE")
oresdn <- as.data.frame(fora(pathways=gs,genes=sigdn,universe=bg,minSize=5))
oresdn <- subset(oresdn,padj<0.05)
oresdn <- paste(oresdn$pathway,"FALSE")
osets2 <- c(oresup,oresdn)
TOT=length(osets2)
TP=length(which(osets2 %in% osets))
FP=length(which(!osets2 %in% osets))
FN=length(which(!osets %in% osets2))
RES=c("TOT"=TOT,"TP"=TP,"FP"=FP,"FN"=FN)
RES
},mc.cores=16)
do.call(rbind,dsres)
})
ssres <- fores
par(mfrow=c(3,1))
dots <- lapply(ssres,function(x) { x[,"TP"] })
lapply(dots,summary)
## [[1]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.00 80.75 122.00 105.56 142.25 242.00
##
## [[2]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.0 102.8 139.5 125.2 155.8 270.0
##
## [[3]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.0 129.5 155.5 156.8 188.5 274.0
##
## [[4]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 147.0 211.0 264.0 251.7 291.0 314.0
##
## [[5]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 162.0 249.8 287.0 274.2 304.8 336.0
##
## [[6]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 236.0 285.0 300.5 297.4 314.8 330.0
##
## [[7]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 275.0 299.0 304.5 308.7 322.2 342.0
##
## [[8]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 300.0 311.0 317.0 317.7 323.0 344.0
boxplot(dots,names=SAMPLESIZES,col="white",ylab="no. consistent",xlab="sample size",cex=0,main="fora",ylim=c(0,length(osets))) ; grid()
beeswarm(dots,add=TRUE,pch=1,cex=0.6)
abline(h=length(osets))
dots <- lapply(ssres,function(x) { x[,"FP"] })
lapply(dots,summary)
## [[1]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5 28 45 41 53 78
##
## [[2]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 9.00 30.25 44.50 45.12 58.75 115.00
##
## [[3]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.00 31.00 40.50 46.20 57.75 128.00
##
## [[4]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 21.00 38.25 50.00 51.50 66.75 97.00
##
## [[5]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 17.00 29.00 40.50 43.34 54.00 84.00
##
## [[6]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 14.00 27.25 38.00 38.92 47.00 79.00
##
## [[7]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 18.00 23.50 29.50 32.96 37.75 75.00
##
## [[8]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 16.00 22.00 26.00 29.32 36.00 71.00
boxplot(dots,names=SAMPLESIZES,col="white",ylab="no. inconsistent",xlab="sample size",cex=0) ; grid()
beeswarm(dots,add=TRUE,pch=1,cex=0.6)
dots <- lapply(ssres,function(x) { x[,"TP"] })
TPdots <- do.call(rbind,dots)
dots <- lapply(ssres,function(x) { x[,"FP"] })
FPdots <- do.call(rbind,dots)
ratio <- t(FPdots / (TPdots + FPdots))
ratio <- lapply(1:ncol(ratio),function(i) {ratio[,i]})
lapply(ratio,summary)
## [[1]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1278 0.2446 0.2790 0.3328 0.3230 0.8667
##
## [[2]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1528 0.2143 0.2556 0.2971 0.3158 0.8000
##
## [[3]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1200 0.1893 0.2110 0.2331 0.2418 0.5000
##
## [[4]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.09053 0.14317 0.16345 0.16776 0.19836 0.28340
##
## [[5]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.06742 0.10865 0.13464 0.13281 0.15445 0.22105
##
## [[6]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.04281 0.08769 0.10953 0.11309 0.13847 0.19603
##
## [[7]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.05678 0.07221 0.08749 0.09505 0.11061 0.18519
##
## [[8]]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.04678 0.06498 0.07625 0.08360 0.10175 0.17574
boxplot(ratio,names=SAMPLESIZES,col="white",ylab="proportion inconsistent",xlab="sample size",cex=0,ylim=c(0,0.5)) ; grid()
beeswarm(ratio,add=TRUE,pch=1,cex=0.6)
par(mfrow=c(1,3))
dots <- lapply(fgres,function(x) { x[,"TP"] })
fgres2 <- do.call(rbind,lapply(dots,summary))
dots <- lapply(fores,function(x) { x[,"TP"] })
fores2 <- do.call(rbind,lapply(dots,summary))
df <- data.frame(SAMPLESIZES,fores2[,"Median"],fgres2[,"Median"])
colnames(df) <- c("Sample size", "fora", "fgsea")
df
## Sample size fora fgsea
## 1 2 122.0 278.0
## 2 3 139.5 306.5
## 3 5 155.5 346.0
## 4 10 264.0 364.0
## 5 15 287.0 376.5
## 6 20 300.5 388.5
## 7 25 304.5 397.0
## 8 30 317.0 407.0
MAX=length(fsets)
plot(df$`Sample size`,df$fora,type="b",col="red",pch=19,xlab="Sample size",ylab="no. pathways", ylim=c(0,MAX))
points(df$`Sample size`,df$fgsea,type="b",col="blue",pch=19 )
abline(h=length(osets),col="red")
abline(h=length(fsets),col="blue")
mtext("no. consistent pathways FDR<0.05")
grid()
legend("bottomright", legend=c("fgsea","fora"),
col=c("blue","red"), lty=1, pch=19, cex=1)
dots <- lapply(fgres,function(x) { x[,"FP"] })
fgres2 <- do.call(rbind,lapply(dots,summary))
dots <- lapply(fores,function(x) { x[,"FP"] })
fores2 <- do.call(rbind,lapply(dots,summary))
df <- data.frame(SAMPLESIZES,fores2[,"Median"],fgres2[,"Median"])
colnames(df) <- c("Sample size", "fora", "fgsea")
df
## Sample size fora fgsea
## 1 2 45.0 68.0
## 2 3 44.5 74.0
## 3 5 40.5 68.5
## 4 10 50.0 59.0
## 5 15 40.5 48.5
## 6 20 38.0 41.5
## 7 25 29.5 32.0
## 8 30 26.0 28.0
MAX=max(c(fores2[,"Median"],fgres2[,"Median"]))
plot(df$`Sample size`,df$fora,type="b",col="red",pch=19,xlab="Sample size",ylab="no. pathways",ylim=c(0,MAX))
points(df$`Sample size`,df$fgsea,type="b",col="blue",pch=19 )
mtext("no. inconsistent pathways FDR<0.05")
grid()
dots <- lapply(fores,function(x) { x[,"TP"] })
TPdots <- do.call(rbind,dots)
dots <- lapply(fores,function(x) { x[,"FP"] })
FPdots <- do.call(rbind,dots)
ratio <- t(FPdots / (TPdots + FPdots))
ratio <- lapply(1:ncol(ratio),function(i) {ratio[,i]})
fores2 <- do.call(rbind,lapply(ratio,summary))
dots <- lapply(fgres,function(x) { x[,"TP"] })
TPdots <- do.call(rbind,dots)
dots <- lapply(fgres,function(x) { x[,"FP"] })
FPdots <- do.call(rbind,dots)
ratio <- t(FPdots / (TPdots + FPdots))
ratio <- lapply(1:ncol(ratio),function(i) {ratio[,i]})
fgres2 <- do.call(rbind,lapply(ratio,summary))
df <- data.frame(SAMPLESIZES,fores2[,"Median"],fgres2[,"Median"])
colnames(df) <- c("Sample size", "fora", "fgsea")
df
## Sample size fora fgsea
## 1 2 0.27895182 0.21243141
## 2 3 0.25564868 0.18893189
## 3 5 0.21098129 0.16749410
## 4 10 0.16344750 0.13721007
## 5 15 0.13463905 0.11060143
## 6 20 0.10953272 0.09415841
## 7 25 0.08749498 0.07412734
## 8 30 0.07625315 0.06492421
MAX=max(c(fores2[,"Median"],fgres2[,"Median"]))
plot(df$`Sample size`,df$fora,type="b",col="red",pch=19,xlab="Sample size",ylab="proportion",ylim=c(0,MAX))
points(df$`Sample size`,df$fgsea,type="b",col="blue",pch=19 )
mtext("Proportion of inconsistent pathways")
grid()
save.image("GSE158422_sensitivity.Rdata")
sessionInfo()
## R version 4.4.1 (2024-06-14)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 22.04.4 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so; LAPACK version 3.10.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Etc/UTC
## tzcode source: system (glibc)
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] vioplot_0.5.0 zoo_1.8-12
## [3] sm_2.2-6.0 edgeR_4.2.1
## [5] limma_3.60.3 RhpcBLASctl_0.23-42
## [7] fgsea_1.30.0 png_0.1-8
## [9] gridExtra_2.3 gplots_3.1.3.1
## [11] beeswarm_0.4.0 kableExtra_1.4.0
## [13] mitch_1.16.0 HGNChelper_0.8.14
## [15] eulerr_7.0.2 DESeq2_1.44.0
## [17] SummarizedExperiment_1.34.0 Biobase_2.64.0
## [19] MatrixGenerics_1.16.0 matrixStats_1.3.0
## [21] GenomicRanges_1.56.1 GenomeInfoDb_1.40.1
## [23] IRanges_2.38.1 S4Vectors_0.42.1
## [25] BiocGenerics_0.50.0
##
## loaded via a namespace (and not attached):
## [1] bitops_1.0-7 echarts4r_0.4.5 rlang_1.1.4
## [4] magrittr_2.0.3 compiler_4.4.1 systemfonts_1.1.0
## [7] vctrs_0.6.5 reshape2_1.4.4 stringr_1.5.1
## [10] pkgconfig_2.0.3 crayon_1.5.3 fastmap_1.2.0
## [13] XVector_0.44.0 caTools_1.18.2 splitstackshape_1.4.8
## [16] utf8_1.2.4 promises_1.3.0 rmarkdown_2.27
## [19] UCSC.utils_1.0.0 purrr_1.0.2 xfun_0.45
## [22] zlibbioc_1.50.0 cachem_1.1.0 jsonlite_1.8.8
## [25] highr_0.11 later_1.3.2 DelayedArray_0.30.1
## [28] BiocParallel_1.38.0 R6_2.5.1 bslib_0.7.0
## [31] stringi_1.8.4 RColorBrewer_1.1-3 GGally_2.2.1
## [34] jquerylib_0.1.4 Rcpp_1.0.12 knitr_1.47
## [37] httpuv_1.6.15 Matrix_1.7-0 tidyselect_1.2.1
## [40] rstudioapi_0.16.0 abind_1.4-5 yaml_2.3.8
## [43] codetools_0.2-20 lattice_0.22-6 tibble_3.2.1
## [46] plyr_1.8.9 shiny_1.8.1.1 evaluate_0.24.0
## [49] ggstats_0.6.0 xml2_1.3.6 pillar_1.9.0
## [52] KernSmooth_2.23-24 generics_0.1.3 ggplot2_3.5.1
## [55] munsell_0.5.1 scales_1.3.0 gtools_3.9.5
## [58] xtable_1.8-4 glue_1.7.0 tools_4.4.1
## [61] data.table_1.15.4 locfit_1.5-9.10 fastmatch_1.1-4
## [64] cowplot_1.1.3 grid_4.4.1 tidyr_1.3.1
## [67] colorspace_2.1-0 GenomeInfoDbData_1.2.12 cli_3.6.3
## [70] fansi_1.0.6 S4Arrays_1.4.1 viridisLite_0.4.2
## [73] svglite_2.1.3 dplyr_1.1.4 gtable_0.3.5
## [76] sass_0.4.9 digest_0.6.36 SparseArray_1.4.8
## [79] htmlwidgets_1.6.4 htmltools_0.5.8.1 lifecycle_1.0.4
## [82] httr_1.4.7 statmod_1.5.0 mime_0.12
## [85] MASS_7.3-61