Source: https://github.com/markziemann/shoulder-instability-osteroarthritis
Here we are comparing the gene expression patterns of shoulder instability to shoulder osteroarthritis. We are doing this to identify the gene expression patterns which are specific to osteoarthritis. We are using the instability group as a control for this contrast as it is difficult to obtain specimens of completely normal shoulder tissue.
We will be looking only at capsular tissues for the differential expression analysis.
suppressPackageStartupMessages({
library("zoo")
library("tidyverse")
library("reshape2")
library("DESeq2")
library("gplots")
library("fgsea")
library("MASS")
library("mitch")
library("eulerr")
library("limma")
library("topconfects")
library("kableExtra")
})
Importing osteoarthritis (OA) and shoulder instability (SI) data separately.
tmp <- read.table("3col_oa.tsv.gz",header=F)
x <- as.matrix(acast(tmp, V2~V1, value.var="V3", fun.aggregate = sum))
x <- as.data.frame(x)
accession <- sapply((strsplit(rownames(x),"\\|")),"[[",2)
symbol<-sapply((strsplit(rownames(x),"\\|")),"[[",6)
x$geneid <- paste(accession,symbol)
xx <- aggregate(. ~ geneid,x,sum)
rownames(xx) <- xx$geneid
colnames <- gsub("T0R","T0",colnames(xx))
xx$geneid = NULL
oa <- round(xx)
head(oa)
## 4001-41B 4001-41C 4001-41F 4001-41M 4001-41S
## ENSG00000000003.15 TSPAN6 327 231 613 27 346
## ENSG00000000005.6 TNMD 9 45 477 1 30
## ENSG00000000419.13 DPM1 304 497 485 132 502
## ENSG00000000457.14 SCYL3 245 245 280 83 231
## ENSG00000000460.17 C1orf112 101 91 79 15 82
## ENSG00000000938.13 FGR 2962 828 661 91 736
## 4002-42B 4002-42C 4002-42F 4002-42M 4002-42S
## ENSG00000000003.15 TSPAN6 432 301 579 46 336
## ENSG00000000005.6 TNMD 52 2 275 1 308
## ENSG00000000419.13 DPM1 349 333 422 126 339
## ENSG00000000457.14 SCYL3 202 185 183 93 168
## ENSG00000000460.17 C1orf112 86 71 53 9 49
## ENSG00000000938.13 FGR 562 1012 713 95 479
## 4003-43B 4003-43C 4003-43F 4003-43M 4003-43S
## ENSG00000000003.15 TSPAN6 279 683 885 69 370
## ENSG00000000005.6 TNMD 17 356 772 12 387
## ENSG00000000419.13 DPM1 851 815 486 328 617
## ENSG00000000457.14 SCYL3 287 281 326 138 278
## ENSG00000000460.17 C1orf112 229 105 68 21 89
## ENSG00000000938.13 FGR 1294 1404 421 69 605
## 4004-44B 4004-44C 4004-44F 4004-44M 4004-44S
## ENSG00000000003.15 TSPAN6 526 386 428 57 301
## ENSG00000000005.6 TNMD 3 22 480 2 10
## ENSG00000000419.13 DPM1 366 529 218 281 595
## ENSG00000000457.14 SCYL3 214 199 177 141 262
## ENSG00000000460.17 C1orf112 149 58 32 38 103
## ENSG00000000938.13 FGR 4440 775 406 74 803
## 4005-45B 4005-45C 4005-45F 4005-45M 4005-45S
## ENSG00000000003.15 TSPAN6 85 371 368 78 231
## ENSG00000000005.6 TNMD 60 157 246 4 139
## ENSG00000000419.13 DPM1 395 300 168 493 414
## ENSG00000000457.14 SCYL3 261 149 140 157 184
## ENSG00000000460.17 C1orf112 179 60 58 38 70
## ENSG00000000938.13 FGR 8028 533 450 72 1392
## 4006-46B 4006-46C 4006-46F 4006-46M 4006-46S
## ENSG00000000003.15 TSPAN6 722 342 385 55 363
## ENSG00000000005.6 TNMD 40 4 171 13 11
## ENSG00000000419.13 DPM1 525 434 273 238 516
## ENSG00000000457.14 SCYL3 300 242 147 113 237
## ENSG00000000460.17 C1orf112 134 60 47 19 116
## ENSG00000000938.13 FGR 620 968 362 364 1021
tmp <- read.table("3col_instability.tsv.gz",header=F)
x <- as.matrix(acast(tmp, V2~V1, value.var="V3", fun.aggregate = sum))
x <- as.data.frame(x)
accession <- sapply((strsplit(rownames(x),"\\|")),"[[",2)
symbol<-sapply((strsplit(rownames(x),"\\|")),"[[",6)
x$geneid <- paste(accession,symbol)
xx <- aggregate(. ~ geneid,x,sum)
rownames(xx) <- xx$geneid
colnames(xx) <- sapply(strsplit(colnames(xx),"_"),"[[",1)
xx$geneid = NULL
si <- round(xx)
head(si)
## SM-027 SM-028 SM-029 SM-030 SM-031 SM-032 SM-033
## ENSG00000000003.15 TSPAN6 187 191 204 185 112 116 128
## ENSG00000000005.6 TNMD 11 4 18 3 37 1 5
## ENSG00000000419.13 DPM1 273 308 401 351 180 375 454
## ENSG00000000457.14 SCYL3 270 318 364 230 124 218 256
## ENSG00000000460.17 C1orf112 62 101 138 124 45 88 65
## ENSG00000000938.13 FGR 806 704 1269 775 677 501 501
## SM-034 SM-035 SM-036 SM-037 SM-038 SM-039 SM-040
## ENSG00000000003.15 TSPAN6 75 165 283 104 127 107 135
## ENSG00000000005.6 TNMD 0 71 16 11 1 8 23
## ENSG00000000419.13 DPM1 231 289 345 227 280 173 325
## ENSG00000000457.14 SCYL3 141 225 361 197 232 142 251
## ENSG00000000460.17 C1orf112 65 60 91 101 79 59 81
## ENSG00000000938.13 FGR 281 354 280 371 581 302 1233
## SM-041 SM-042 SM-043 SM-044 SM-045 SM-046 SM-047
## ENSG00000000003.15 TSPAN6 224 139 197 156 183 229 248
## ENSG00000000005.6 TNMD 12 1 276 158 3 150 125
## ENSG00000000419.13 DPM1 421 319 335 331 432 317 399
## ENSG00000000457.14 SCYL3 310 262 348 269 272 341 335
## ENSG00000000460.17 C1orf112 121 103 72 85 105 136 142
## ENSG00000000938.13 FGR 1097 634 1299 767 1766 231 661
## SM-048 SM-049 SM-050 SM-051 SM-052
## ENSG00000000003.15 TSPAN6 252 114 226 363 234
## ENSG00000000005.6 TNMD 106 15 33 6 7
## ENSG00000000419.13 DPM1 326 296 432 452 398
## ENSG00000000457.14 SCYL3 220 238 266 381 350
## ENSG00000000460.17 C1orf112 90 76 115 59 142
## ENSG00000000938.13 FGR 636 352 598 446 1790
dim(oa)
## [1] 60651 30
dim(si)
## [1] 60651 26
Here I’ll look at a few different quality control measures.
par(mar=c(5,8,3,1))
barplot(colSums(oa),horiz=TRUE,las=1,xlab="OA num reads")
sums <- colSums(oa)
sums <- sums[order(sums)]
barplot(sums,horiz=TRUE,las=1,xlab="OA num reads")
abline(v=15000000,col="red")
barplot(colSums(si),horiz=TRUE,las=1,xlab="SI num reads")
sums <- colSums(si)
sums <- sums[order(sums)]
barplot(sums,horiz=TRUE,las=1,xlab="SI num reads")
abline(v=15000000,col="red")
Multidimensional scaling plot to show the variation between all samples, very similar to PCA.
xx <- cbind(oa,si)
mds <- cmdscale(dist(t(xx)))
plot(mds, xlab="Coordinate 1", ylab="Coordinate 2",
type = "p",bty="n",pch=19, cex=4 ,col="gray")
text(mds, labels=rownames(mds) )
Now remove fat, bone and muscle, then plot capsular and synovium samples.
dim(oa)
## [1] 60651 30
oa <- oa[,grep("M",colnames(oa),invert=TRUE)]
dim(oa)
## [1] 60651 24
oa <- oa[,grep("B",colnames(oa),invert=TRUE)]
dim(oa)
## [1] 60651 18
oa <- oa[,grep("F",colnames(oa),invert=TRUE)]
dim(oa)
## [1] 60651 12
xx <- cbind(oa,si)
mds <- cmdscale(dist(t(xx)))
caps <- as.numeric(grepl("C",colnames(xx)))
syno <- as.numeric(grepl("S$",colnames(xx))) * 2
cols <- caps + syno
cols <- cols +2
plot(mds, xlab="Coordinate 1", ylab="Coordinate 2",
type = "p",bty="n",pch=19, cex=4 ,col=cols)
text(mds, labels=rownames(mds) )
Now remove synovium and regenerate the MDS plot for only the capsular samples.
oa <- oa[,grep("S",colnames(oa),invert=TRUE)]
dim(oa)
## [1] 60651 6
xx <- cbind(oa,si)
isoa <- as.numeric(grepl("C",colnames(xx)))
mds <- cmdscale(dist(t(xx)))
plot(mds, xlab="Coordinate 1", ylab="Coordinate 2",
type = "p",bty="n",pch=19, cex=4 ,col=isoa+2)
text(mds, labels=rownames(mds) )
legend("topleft", legend=c("insta", "OA"),
col=c(2,3), cex=1, pch=19)
This looks promising that there is some clustering of samples, with OA sample in green on the right and instability samples on the left. We need to be cautious as these were sequences in separate runs, but it would appear that there is some overlap between the groups which indicates that the batch effects might not be too severe.
heatmap.2(cor(xx),trace="n",main="Pearson correlation heatmap")
Also need to fix the patient identifiers to make it consistent between runs.
URL="https://raw.githubusercontent.com/markziemann/shoulder-instability-osteroarthritis/main/OA_patients.tsv"
pat <- read.table(URL,header=TRUE,sep="\t")
oa_pats <- colnames(xx)[grep("C",colnames(xx))]
oa_pats <- sapply(strsplit(oa_pats,"-"),"[[",1)
oa_pats <- paste("AD-CAB",oa_pats)
oa_pats <- pat[which(pat$Record_number %in% oa_pats),"Unique_participant_ID"]
insta_pats <- colnames(xx)[grep("SM",colnames(xx))]
insta_pats <- sapply(strsplit(insta_pats,"-"),"[[",2)
insta_pats <- as.integer(gsub("^0","",insta_pats))
pats <- c(oa_pats,insta_pats)
ss <- pat[which(pat$Unique_participant_ID %in% pats),]
rownames(ss) <- ss$Unique_participant_ID
ss$OA <- factor(ss$Unique_participant_ID>70)
colnames(xx) <- pats
Don’t forget to remove poorly detected genes from the matrix with a threshold of 10 reads per sample on average.
I’m also reordering the count matrix columns (xx) so it is in the same order as the samplesheet.
dim(xx)
## [1] 60651 32
xx <- xx[which(rowMeans(xx)>=10),]
dim(xx)
## [1] 18774 32
xx <- xx[,match( rownames(ss) , colnames(xx) )]
Not taking into consideration the effect of sex (will deal with that later).
dds <- DESeqDataSetFromMatrix(countData = xx , colData = ss, design = ~ OA )
## converting counts to integer mode
res <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 61 genes
## -- DESeq argument 'minReplicatesForReplace' = 7
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
z<- results(res)
vsd <- vst(dds, blind=FALSE)
zz <- cbind(as.data.frame(z),assay(vsd))
dge <- as.data.frame(zz[order(zz$pvalue),])
dge[1:20,1:6] %>% kbl(caption = "Top gene expression differences between instability (ctrl) and OA (case) without consideration of sex") %>% kable_paper("hover", full_width = F)
baseMean | log2FoldChange | lfcSE | stat | pvalue | padj | |
---|---|---|---|---|---|---|
ENSG00000159658.15 EFCAB14 | 3332.6211 | -1.2520243 | 0.0519688 | -24.09186 | 0 | 0 |
ENSG00000174282.12 ZBTB4 | 4700.0483 | -1.4555560 | 0.0687237 | -21.17982 | 0 | 0 |
ENSG00000178502.6 KLHL11 | 192.8755 | -2.7752160 | 0.1312278 | -21.14808 | 0 | 0 |
ENSG00000185129.8 PURA | 1385.1347 | -1.4980158 | 0.0709312 | -21.11928 | 0 | 0 |
ENSG00000196914.9 ARHGEF12 | 4653.3909 | -1.4837353 | 0.0742316 | -19.98793 | 0 | 0 |
ENSG00000083168.11 KAT6A | 1542.5913 | -1.6332998 | 0.0818170 | -19.96284 | 0 | 0 |
ENSG00000140396.13 NCOA2 | 967.7400 | -1.6354692 | 0.0822493 | -19.88428 | 0 | 0 |
ENSG00000114439.19 BBX | 1534.9614 | -1.2030571 | 0.0615213 | -19.55513 | 0 | 0 |
ENSG00000130749.10 ZC3H4 | 1165.2906 | -1.6253576 | 0.0842774 | -19.28582 | 0 | 0 |
ENSG00000134644.16 PUM1 | 1626.2984 | -0.9261724 | 0.0491877 | -18.82936 | 0 | 0 |
ENSG00000125686.12 MED1 | 1067.1058 | -1.3420280 | 0.0713158 | -18.81810 | 0 | 0 |
ENSG00000132466.19 ANKRD17 | 1736.2567 | -1.0809885 | 0.0578369 | -18.69028 | 0 | 0 |
ENSG00000228253.1 MT-ATP8 | 14517.5823 | -3.6433673 | 0.1954961 | -18.63652 | 0 | 0 |
ENSG00000178951.9 ZBTB7A | 3589.8387 | -1.8000341 | 0.0979346 | -18.37996 | 0 | 0 |
ENSG00000160007.19 ARHGAP35 | 2424.1989 | -1.1767383 | 0.0648693 | -18.14014 | 0 | 0 |
ENSG00000183741.12 CBX6 | 3816.0534 | -1.6688803 | 0.0925367 | -18.03479 | 0 | 0 |
ENSG00000185591.10 SP1 | 2337.4013 | -1.1069720 | 0.0617523 | -17.92600 | 0 | 0 |
ENSG00000078687.17 TNRC6C | 1012.4221 | -1.8929297 | 0.1067483 | -17.73264 | 0 | 0 |
ENSG00000113580.15 NR3C1 | 2895.2502 | -1.2823855 | 0.0723789 | -17.71767 | 0 | 0 |
ENSG00000210144.1 MT-TY | 287.6235 | -6.2176214 | 0.3513097 | -17.69840 | 0 | 0 |
dgeb <- dge
Now repeat the analysis including the effect of sex.
ss$Sex <- factor(ss$Sex)
dds <- DESeqDataSetFromMatrix(countData = xx , colData = ss, design = ~ Sex + OA )
## converting counts to integer mode
res <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 89 genes
## -- DESeq argument 'minReplicatesForReplace' = 7
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
z<- results(res)
vsd <- vst(dds, blind=FALSE)
zz <- cbind(as.data.frame(z),assay(vsd))
dge <- as.data.frame(zz[order(zz$pvalue),])
dge[1:20,1:6] %>% kbl(caption = "Top gene expression differences between instability (ctrl) and OA (case) correcting for sex") %>% kable_paper("hover", full_width = F)
baseMean | log2FoldChange | lfcSE | stat | pvalue | padj | |
---|---|---|---|---|---|---|
ENSG00000159658.15 EFCAB14 | 3332.6211 | -1.2654394 | 0.0624634 | -20.25889 | 0 | 0 |
ENSG00000178502.6 KLHL11 | 192.8755 | -2.7198587 | 0.1460426 | -18.62374 | 0 | 0 |
ENSG00000174282.12 ZBTB4 | 4700.0483 | -1.4656438 | 0.0828188 | -17.69699 | 0 | 0 |
ENSG00000185129.8 PURA | 1385.1347 | -1.5001968 | 0.0850439 | -17.64027 | 0 | 0 |
ENSG00000140396.13 NCOA2 | 967.7400 | -1.6636029 | 0.0978811 | -16.99616 | 0 | 0 |
ENSG00000130749.10 ZC3H4 | 1165.2906 | -1.6665605 | 0.1003310 | -16.61062 | 0 | 0 |
ENSG00000083168.11 KAT6A | 1542.5913 | -1.6306942 | 0.0981851 | -16.60837 | 0 | 0 |
ENSG00000196914.9 ARHGEF12 | 4653.3909 | -1.4716189 | 0.0894538 | -16.45115 | 0 | 0 |
ENSG00000114439.19 BBX | 1534.9614 | -1.2082963 | 0.0737470 | -16.38434 | 0 | 0 |
ENSG00000210144.1 MT-TY | 287.6235 | -6.2542130 | 0.3872795 | -16.14909 | 0 | 0 |
ENSG00000173915.16 ATP5MD | 382.0800 | 1.6301546 | 0.1020193 | 15.97888 | 0 | 0 |
ENSG00000132466.19 ANKRD17 | 1736.2567 | -1.1001706 | 0.0691250 | -15.91566 | 0 | 0 |
ENSG00000125686.12 MED1 | 1067.1058 | -1.3450560 | 0.0853260 | -15.76373 | 0 | 0 |
ENSG00000210194.1 MT-TE | 353.1876 | -6.7614006 | 0.4319016 | -15.65496 | 0 | 0 |
ENSG00000005339.15 CREBBP | 2887.6825 | -1.5126863 | 0.0974960 | -15.51537 | 0 | 0 |
ENSG00000134644.16 PUM1 | 1626.2984 | -0.9111724 | 0.0587738 | -15.50305 | 0 | 0 |
ENSG00000228253.1 MT-ATP8 | 14517.5823 | -3.6537841 | 0.2356924 | -15.50234 | 0 | 0 |
ENSG00000185591.10 SP1 | 2337.4013 | -1.1329817 | 0.0737138 | -15.37002 | 0 | 0 |
ENSG00000210151.2 MT-TS1 | 345.7805 | -6.2523932 | 0.4070974 | -15.35847 | 0 | 0 |
ENSG00000143190.23 POU2F1 | 678.7954 | -1.4347898 | 0.0937583 | -15.30308 | 0 | 0 |
dgeb <- dge
Here let’s look at some plots.
MA plot shows the average level and fold change of all detected genes. Volcano plot shows the fold change and the significance, as measured by -log(p-value). Significant genes are shown as red points.
There are heatmaps of the top ranked genes by p-value. Above the gene expression values there is a bar in yellow/orange colours. Yellow shows relatively low QuickDASH score and orange shows high score.
maplot <- function(de,contrast_name) {
sig <-subset(de, padj < 0.05 )
up <-rownames(subset(de, padj < 0.05 & log2FoldChange > 0))
dn <-rownames(subset(de, padj < 0.05 & log2FoldChange < 0))
GENESUP <- length(up)
GENESDN <- length(dn)
DET=nrow(de)
SUBHEADER = paste(GENESUP, "up, ", GENESDN, "down", DET, "detected")
ns <-subset(de, padj > 0.05 )
plot(log2(de$baseMean),de$log2FoldChange,
xlab="log2 basemean", ylab="log2 foldchange",
pch=19, cex=0.5, col="dark gray",
main=contrast_name, cex.main=1)
points(log2(sig$baseMean),sig$log2FoldChange,
pch=19, cex=0.5, col="red")
mtext(SUBHEADER,cex = 1)
}
make_volcano <- function(de,name) {
sig <- subset(de,padj<0.05)
N_SIG=nrow(sig)
N_UP=nrow(subset(sig,log2FoldChange>0))
N_DN=nrow(subset(sig,log2FoldChange<0))
DET=nrow(de)
HEADER=paste(N_SIG,"@5%FDR,", N_UP, "up", N_DN, "dn", DET, "detected")
plot(de$log2FoldChange,-log10(de$pval),cex=0.5,pch=19,col="darkgray",
main=name, xlab="log2 FC", ylab="-log10 pval")
mtext(HEADER)
grid()
points(sig$log2FoldChange,-log10(sig$pval),cex=0.5,pch=19,col="red")
}
make_heatmap <- function(de,name,myss,mx,n=30){
colfunc <- colorRampPalette(c("blue", "white", "red"))
values <- as.numeric(myss$OA)
f <- colorRamp(c("yellow", "orange"))
rr <- range(values)
svals <- (values-rr[1])/diff(rr)
colcols <- rgb(f(svals)/255)
mxn <- mx/rowSums(mx)*1000000
x <- mxn[which(rownames(mxn) %in% rownames(head(de,n))),]
heatmap.2(as.matrix(x),trace="none",col=colfunc(25),scale="row", margins = c(10,15), cexRow=0.5,
main=paste("Top ranked",n,"genes in",name) , ColSideColors = colcols )
}
maplot(dge,"insta vs OA")
make_volcano(dge,"insta vs OA")
make_heatmap(dge,"insta vs OA",ss,xx,n=50)
#download.file("https://reactome.org/download/current/ReactomePathways.gmt.zip", destfile="ReactomePathways.gmt.zip")
#unzip("ReactomePathways.gmt.zip")
genesets <- gmt_import("ReactomePathways.gmt")
# gene table
gt <- as.data.frame(rownames(xx))
gt$gene <- sapply(strsplit(gt[,1]," "),"[[",2)
m <- mitch_import(dge, DEtype="deseq2",geneTable=gt)
## The input is a single dataframe; one contrast only. Converting
## it to a list for you.
## Note: Mean no. genes in input = 18774
## Note: no. genes in output = 18743
## Note: estimated proportion of input genes in output = 0.998
m1 <- mitch_calc(m, genesets, priority="effect")
## Note: Enrichments with large effect sizes may not be
## statistically significant.
head(m1$enrichment_result,20) %>% kbl(caption = "Top gene pathway differences using the parametric DE data") %>% kable_paper("hover", full_width = F)
set | setSize | pANOVA | s.dist | p.adjustANOVA | |
---|---|---|---|---|---|
203 | Cohesin Loading onto Chromatin | 10 | 0.0000058 | -0.8274916 | 0.0000431 |
973 | RORA activates gene expression | 18 | 0.0000001 | -0.7376354 | 0.0000006 |
834 | Peptide chain elongation | 87 | 0.0000000 | 0.7193655 | 0.0000000 |
1396 | Viral mRNA Translation | 87 | 0.0000000 | 0.7144563 | 0.0000000 |
368 | Eukaryotic Translation Elongation | 92 | 0.0000000 | 0.7134260 | 0.0000000 |
1118 | Selenocysteine synthesis | 91 | 0.0000000 | 0.7042995 | 0.0000000 |
48 | Activation of RAC1 | 11 | 0.0000566 | -0.7010658 | 0.0003553 |
365 | Establishment of Sister Chromatid Cohesion | 11 | 0.0000609 | -0.6979986 | 0.0003769 |
370 | Eukaryotic Translation Termination | 91 | 0.0000000 | 0.6972943 | 0.0000000 |
1047 | Regulation of signaling by CBL | 18 | 0.0000004 | -0.6885269 | 0.0000036 |
1096 | SRP-dependent cotranslational protein targeting to membrane | 110 | 0.0000000 | 0.6765026 | 0.0000000 |
727 | NOTCH2 intracellular domain regulates transcription | 10 | 0.0002294 | -0.6727486 | 0.0011713 |
1443 | tRNA processing in the mitochondrion | 30 | 0.0000000 | -0.6639270 | 0.0000000 |
411 | Formation of a pool of free 40S subunits | 99 | 0.0000000 | 0.6589267 | 0.0000000 |
1062 | Response of EIF2AK4 (GCN2) to amino acid deficiency | 99 | 0.0000000 | 0.6461743 | 0.0000000 |
767 | Nonsense Mediated Decay (NMD) independent of the Exon Junction Complex (EJC) | 93 | 0.0000000 | 0.6455533 | 0.0000000 |
1117 | Selenoamino acid metabolism | 113 | 0.0000000 | 0.6429453 | 0.0000000 |
142 | CREB1 phosphorylation through the activation of Adenylate Cyclase | 11 | 0.0002401 | -0.6394308 | 0.0012217 |
813 | PI3K events in ERBB2 signaling | 13 | 0.0001156 | -0.6174545 | 0.0006506 |
16 | APC/C:Cdc20 mediated degradation of mitotic proteins | 73 | 0.0000000 | 0.6166820 | 0.0000000 |
m1top <- subset(m1$enrichment_result,p.adjustANOVA<0.05)
m1up <- subset(m1top,s.dist>0)$set
m1dn <- subset(m1top,s.dist<0)$set
Looking at gene ranks might be a better option for comparisons across batches.
The rank() function give the rank from lowest to highest and I’ll use this for wilcox test.
r <- apply(xx,2,rank)
ctrl <- which(colnames(r) <70)
case <- which(colnames(r) >70)
suppressWarnings(
wtres <- t(apply(r,1,function(x) {
wt <- wilcox.test(x[case] , x[ctrl] )
meanrankdiff <- mean(x[case]) - mean(x[ctrl])
unname(c(meanrankdiff,wt$p.value))
})))
colnames(wtres) <- c("meanrankdiff","pvalue")
wtres <- as.data.frame(wtres)
wtres <- wtres[order(wtres$pvalue),]
wtres$fdr <- p.adjust(wtres$pvalue)
top <- head(wtres,50)
top <- top[order(top$meanrankdiff),]
plot(top$meanrankdiff,1:nrow(top),bty="none",xlab="mean rank difference",ylab="gene",pch=19,cex=2,col="gray",main="Top differentially regulated genes")
labels <- sapply(strsplit(rownames(top)," "),"[[",2)
text(top$meanrankdiff,1:nrow(top),labels=labels,cex=0.7)
top
## meanrankdiff pvalue fdr
## ENSG00000029534.21 ANK1 -4705.0897 2.207038e-06 0.04143493
## ENSG00000047648.23 ARHGAP6 -4131.7756 2.207038e-06 0.04143493
## ENSG00000048052.23 HDAC9 -3601.2244 2.207038e-06 0.04143493
## ENSG00000011523.14 CEP68 -3571.4359 2.207038e-06 0.04143493
## ENSG00000023516.9 AKAP11 -3449.0000 2.207038e-06 0.04143493
## ENSG00000011114.15 BTBD7 -3094.7949 2.207038e-06 0.04143493
## ENSG00000007944.15 MYLIP -3065.1923 2.207038e-06 0.04143493
## ENSG00000008086.13 CDKL5 -2621.5256 2.207038e-06 0.04143493
## ENSG00000048405.10 ZNF800 -2611.4679 2.207038e-06 0.04143493
## ENSG00000031081.11 ARHGAP31 -2575.7564 2.207038e-06 0.04143493
## ENSG00000033327.13 GAB2 -2449.7372 2.207038e-06 0.04143493
## ENSG00000053254.16 FOXN3 -2312.0513 2.207038e-06 0.04143493
## ENSG00000043143.21 JADE2 -2263.0192 2.207038e-06 0.04143493
## ENSG00000008853.17 RHOBTB2 -2108.1026 2.207038e-06 0.04143493
## ENSG00000033800.14 PIAS1 -2003.0641 2.207038e-06 0.04143493
## ENSG00000049618.24 ARID1B -1971.8205 2.207038e-06 0.04143493
## ENSG00000048707.15 VPS13D -1915.5962 2.207038e-06 0.04143493
## ENSG00000025293.17 PHF20 -1594.8654 2.207038e-06 0.04143493
## ENSG00000012232.9 EXTL3 -1590.0513 2.207038e-06 0.04143493
## ENSG00000010292.13 NCAPD2 -1476.1218 2.207038e-06 0.04143493
## ENSG00000019995.6 ZRANB1 -1468.2179 2.207038e-06 0.04143493
## ENSG00000017797.13 RALBP1 -761.2244 2.207038e-06 0.04143493
## ENSG00000002834.18 LASP1 -307.4295 2.207038e-06 0.04143493
## ENSG00000047315.17 POLR2B 683.4679 2.207038e-06 0.04143493
## ENSG00000010244.19 ZNF207 707.3205 2.207038e-06 0.04143493
## ENSG00000003756.17 RBM5 1037.8077 2.207038e-06 0.04143493
## ENSG00000010295.20 IFFO1 1113.7949 2.207038e-06 0.04143493
## ENSG00000004534.15 RBM6 1269.6731 2.207038e-06 0.04143493
## ENSG00000008018.9 PSMB1 1298.3013 2.207038e-06 0.04143493
## ENSG00000006530.18 AGK 1373.7115 2.207038e-06 0.04143493
## ENSG00000037474.15 NSUN2 1417.9423 2.207038e-06 0.04143493
## ENSG00000003509.16 NDUFAF7 1635.5128 2.207038e-06 0.04143493
## ENSG00000041357.16 PSMA4 1731.2564 2.207038e-06 0.04143493
## ENSG00000011009.11 LYPLA2 1778.0385 2.207038e-06 0.04143493
## ENSG00000011260.14 UTP18 1856.5385 2.207038e-06 0.04143493
## ENSG00000023041.11 ZDHHC6 1924.8141 2.207038e-06 0.04143493
## ENSG00000006015.18 REX1BD 1933.6667 2.207038e-06 0.04143493
## ENSG00000016864.19 GLT8D1 1945.6474 2.207038e-06 0.04143493
## ENSG00000051128.19 HOMER3 1951.7821 2.207038e-06 0.04143493
## ENSG00000050426.16 LETMD1 1967.0449 2.207038e-06 0.04143493
## ENSG00000013375.16 PGM3 2021.3846 2.207038e-06 0.04143493
## ENSG00000011243.19 AKAP8L 2032.3269 2.207038e-06 0.04143493
## ENSG00000050130.18 JKAMP 2109.2756 2.207038e-06 0.04143493
## ENSG00000008324.12 SS18L2 2155.8846 2.207038e-06 0.04143493
## ENSG00000037241.7 RPL26L1 2289.0256 2.207038e-06 0.04143493
## ENSG00000004779.10 NDUFAB1 2339.1346 2.207038e-06 0.04143493
## ENSG00000014641.21 MDH1 2552.0000 2.207038e-06 0.04143493
## ENSG00000008128.23 CDK11A 2835.3974 2.207038e-06 0.04143493
## ENSG00000007392.17 LUC7L 3942.3654 2.207038e-06 0.04143493
## ENSG00000012223.13 LTF 7282.1282 2.207038e-06 0.04143493
Here I’m using the mitch package and gene pathways from Reactome to understand the affected pathways separately for each tissue type.
Reactome pathways downloaded 9th Dec 2021.
wtres$stat <- -log10(wtres$pvalue) * sign(wtres$meanrankdiff)
wtres$gene <- sapply(strsplit(rownames(wtres)," "),"[[",2)
wtres2<-wtres[,c("stat","gene")]
wtres2 <- aggregate(. ~ gene, wtres2, max)
rownames(wtres2) <- wtres2$gene
wtres2$gene=NULL
m2 <- mitch_calc(wtres2, genesets, priority="effect")
## Warning in mitch_rank(input_profile): Warning: >60% of genes have the same score. This isn't
## optimal for rank based enrichment analysis.
## Note: Enrichments with large effect sizes may not be
## statistically significant.
head(m2$enrichment_result,20) %>% kbl(caption = "Top gene pathway differences using the nonparametric DE data") %>% kable_paper("hover", full_width = F)
set | setSize | pANOVA | s.dist | p.adjustANOVA | |
---|---|---|---|---|---|
203 | Cohesin Loading onto Chromatin | 10 | 0.0000982 | -0.7111728 | 0.0005789 |
1396 | Viral mRNA Translation | 87 | 0.0000000 | 0.7098533 | 0.0000000 |
834 | Peptide chain elongation | 87 | 0.0000000 | 0.7088817 | 0.0000000 |
368 | Eukaryotic Translation Elongation | 92 | 0.0000000 | 0.7033858 | 0.0000000 |
1118 | Selenocysteine synthesis | 91 | 0.0000000 | 0.6966604 | 0.0000000 |
973 | RORA activates gene expression | 18 | 0.0000003 | -0.6963952 | 0.0000028 |
1096 | SRP-dependent cotranslational protein targeting to membrane | 110 | 0.0000000 | 0.6943214 | 0.0000000 |
370 | Eukaryotic Translation Termination | 91 | 0.0000000 | 0.6926406 | 0.0000000 |
365 | Establishment of Sister Chromatid Cohesion | 11 | 0.0000825 | -0.6854144 | 0.0004944 |
411 | Formation of a pool of free 40S subunits | 99 | 0.0000000 | 0.6778735 | 0.0000000 |
240 | Cytosolic tRNA aminoacylation | 24 | 0.0000000 | 0.6774200 | 0.0000001 |
1047 | Regulation of signaling by CBL | 18 | 0.0000008 | -0.6704376 | 0.0000070 |
402 | Folding of actin by CCT/TriC | 10 | 0.0002447 | 0.6696685 | 0.0013147 |
142 | CREB1 phosphorylation through the activation of Adenylate Cyclase | 11 | 0.0001267 | -0.6672442 | 0.0007266 |
1381 | Ubiquitin Mediated Degradation of Phosphorylated Cdc25A | 50 | 0.0000000 | 0.6505483 | 0.0000000 |
1430 | p53-Independent DNA Damage Response | 50 | 0.0000000 | 0.6505483 | 0.0000000 |
1431 | p53-Independent G1/S DNA damage checkpoint | 50 | 0.0000000 | 0.6505483 | 0.0000000 |
1117 | Selenoamino acid metabolism | 113 | 0.0000000 | 0.6441685 | 0.0000000 |
767 | Nonsense Mediated Decay (NMD) independent of the Exon Junction Complex (EJC) | 93 | 0.0000000 | 0.6440751 | 0.0000000 |
48 | Activation of RAC1 | 11 | 0.0002301 | -0.6412702 | 0.0012451 |
m2top <- subset(m2$enrichment_result,p.adjustANOVA<0.05)
m2up <- subset(m2top,s.dist>0)$set
m2dn <- subset(m2top,s.dist<0)$set
These analyses gave similar results, so it suggests that there might not be such a huge batch effect impacting the results.
v0 <- list("para up"=m1up,"para dn"=m1dn, "nonp up"=m2up, "nonp dn"=m2dn)
str(v0)
## List of 4
## $ para up: chr [1:308] "Peptide chain elongation" "Viral mRNA Translation" "Eukaryotic Translation Elongation" "Selenocysteine synthesis" ...
## $ para dn: chr [1:297] "Cohesin Loading onto Chromatin" "RORA activates gene expression" "Activation of RAC1" "Establishment of Sister Chromatid Cohesion" ...
## $ nonp up: chr [1:314] "Viral mRNA Translation" "Peptide chain elongation" "Eukaryotic Translation Elongation" "Selenocysteine synthesis" ...
## $ nonp dn: chr [1:235] "Cohesin Loading onto Chromatin" "RORA activates gene expression" "Establishment of Sister Chromatid Cohesion" "Regulation of signaling by CBL" ...
plot(euler(v0),quantities = TRUE, edges = "gray", main="Parametric and nonparametric DE analysis")
Based on the gene expression data, my guess is that there is no major technical batch effect between the two sample groups. That said, we can’t be 100% sure that the differences observed are due to biological differences and not batch effects. To be certain, it would be a good idea to perform some validation analysis, either by testing the expression of some of these genes with another method like RTqPCR, or testing pathway activation biochemically.
As OA is thought to associate with inflammation, I was expecting to see these pathways activated. However OA is different to rheumatoid arthritis which definitely has an autoimmune component to it.
What we did observe was a downregulation of many signaling pathways such as RORA, RAC1, NOTCH2, CREB1, PI3K and others, while translation, amino acid recycling, nonsense mediated decay, degradation of mitotic proteins, and p53 pathways were upregulated.
For reproducibility.
sessionInfo()
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.3 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] kableExtra_1.3.4 topconfects_1.8.0
## [3] limma_3.48.1 eulerr_6.1.0
## [5] mitch_1.4.0 MASS_7.3-54
## [7] fgsea_1.18.0 gplots_3.1.1
## [9] DESeq2_1.32.0 SummarizedExperiment_1.22.0
## [11] Biobase_2.52.0 MatrixGenerics_1.4.0
## [13] matrixStats_0.59.0 GenomicRanges_1.44.0
## [15] GenomeInfoDb_1.28.1 IRanges_2.26.0
## [17] S4Vectors_0.30.0 BiocGenerics_0.38.0
## [19] reshape2_1.4.4 forcats_0.5.1
## [21] stringr_1.4.0 dplyr_1.0.7
## [23] purrr_0.3.4 readr_2.0.0
## [25] tidyr_1.1.3 tibble_3.1.3
## [27] ggplot2_3.3.5 tidyverse_1.3.1
## [29] zoo_1.8-9
##
## loaded via a namespace (and not attached):
## [1] colorspace_2.0-2 ellipsis_0.3.2 XVector_0.32.0
## [4] fs_1.5.0 rstudioapi_0.13 bit64_4.0.5
## [7] AnnotationDbi_1.54.1 fansi_0.5.0 lubridate_1.7.10
## [10] xml2_1.3.2 splines_4.1.2 cachem_1.0.5
## [13] knitr_1.33 geneplotter_1.70.0 polyclip_1.10-0
## [16] jsonlite_1.7.2 broom_0.7.8 annotate_1.70.0
## [19] dbplyr_2.1.1 png_0.1-7 shiny_1.6.0
## [22] compiler_4.1.2 httr_1.4.2 backports_1.2.1
## [25] assertthat_0.2.1 Matrix_1.3-4 fastmap_1.1.0
## [28] cli_3.0.1 later_1.2.0 htmltools_0.5.1.1
## [31] tools_4.1.2 gtable_0.3.0 glue_1.4.2
## [34] GenomeInfoDbData_1.2.6 fastmatch_1.1-3 Rcpp_1.0.7
## [37] jquerylib_0.1.4 cellranger_1.1.0 vctrs_0.3.8
## [40] Biostrings_2.60.1 svglite_2.0.0 polylabelr_0.2.0
## [43] xfun_0.24 rvest_1.0.0 mime_0.11
## [46] lifecycle_1.0.0 gtools_3.9.2 XML_3.99-0.6
## [49] zlibbioc_1.38.0 scales_1.1.1 hms_1.1.0
## [52] promises_1.2.0.1 RColorBrewer_1.1-2 yaml_2.2.1
## [55] memoise_2.0.0 gridExtra_2.3 sass_0.4.0
## [58] reshape_0.8.8 stringi_1.7.3 RSQLite_2.2.7
## [61] highr_0.9 genefilter_1.74.0 caTools_1.18.2
## [64] BiocParallel_1.26.1 echarts4r_0.4.1 systemfonts_1.0.2
## [67] rlang_0.4.11 pkgconfig_2.0.3 bitops_1.0-7
## [70] evaluate_0.14 lattice_0.20-45 htmlwidgets_1.5.3
## [73] bit_4.0.4 tidyselect_1.1.1 GGally_2.1.2
## [76] plyr_1.8.6 magrittr_2.0.1 R6_2.5.0
## [79] generics_0.1.0 DelayedArray_0.18.0 DBI_1.1.1
## [82] pillar_1.6.1 haven_2.4.1 withr_2.4.2
## [85] survival_3.2-13 KEGGREST_1.32.0 RCurl_1.98-1.3
## [88] modelr_0.1.8 crayon_1.4.1 KernSmooth_2.23-20
## [91] utf8_1.2.2 rmarkdown_2.9 tzdb_0.1.2
## [94] locfit_1.5-9.4 grid_4.1.2 readxl_1.3.1
## [97] data.table_1.14.0 blob_1.2.2 webshot_0.5.2
## [100] reprex_2.0.0 digest_0.6.27 xtable_1.8-4
## [103] httpuv_1.6.1 munsell_0.5.0 viridisLite_0.4.0
## [106] beeswarm_0.4.0 bslib_0.2.5.1