Source: https://github.com/markziemann/shoulder-instability-osteroarthritis

Introduction

Here we are comparing the gene expression patterns of shoulder instability to shoulder osteroarthritis. We are doing this to identify the gene expression patterns which are specific to osteoarthritis. We are using the instability group as a control for this contrast as it is difficult to obtain specimens of completely normal shoulder tissue.

We will be looking only at capsular tissues for the differential expression analysis.

suppressPackageStartupMessages({
    library("zoo")
    library("tidyverse")
    library("reshape2")
    library("DESeq2")
    library("gplots")
    library("fgsea")
    library("MASS")
    library("mitch")
    library("eulerr")
    library("limma")
    library("topconfects")
    library("kableExtra")
})

Import read counts

Importing osteoarthritis (OA) and shoulder instability (SI) data separately.

tmp <- read.table("3col_oa.tsv.gz",header=F)
x <- as.matrix(acast(tmp, V2~V1, value.var="V3", fun.aggregate = sum))
x <- as.data.frame(x)
accession <- sapply((strsplit(rownames(x),"\\|")),"[[",2)
symbol<-sapply((strsplit(rownames(x),"\\|")),"[[",6)
x$geneid <- paste(accession,symbol)
xx <- aggregate(. ~ geneid,x,sum)
rownames(xx) <- xx$geneid
colnames <- gsub("T0R","T0",colnames(xx))
xx$geneid = NULL
oa <- round(xx)
head(oa)
##                             4001-41B 4001-41C 4001-41F 4001-41M 4001-41S
## ENSG00000000003.15 TSPAN6        327      231      613       27      346
## ENSG00000000005.6 TNMD             9       45      477        1       30
## ENSG00000000419.13 DPM1          304      497      485      132      502
## ENSG00000000457.14 SCYL3         245      245      280       83      231
## ENSG00000000460.17 C1orf112      101       91       79       15       82
## ENSG00000000938.13 FGR          2962      828      661       91      736
##                             4002-42B 4002-42C 4002-42F 4002-42M 4002-42S
## ENSG00000000003.15 TSPAN6        432      301      579       46      336
## ENSG00000000005.6 TNMD            52        2      275        1      308
## ENSG00000000419.13 DPM1          349      333      422      126      339
## ENSG00000000457.14 SCYL3         202      185      183       93      168
## ENSG00000000460.17 C1orf112       86       71       53        9       49
## ENSG00000000938.13 FGR           562     1012      713       95      479
##                             4003-43B 4003-43C 4003-43F 4003-43M 4003-43S
## ENSG00000000003.15 TSPAN6        279      683      885       69      370
## ENSG00000000005.6 TNMD            17      356      772       12      387
## ENSG00000000419.13 DPM1          851      815      486      328      617
## ENSG00000000457.14 SCYL3         287      281      326      138      278
## ENSG00000000460.17 C1orf112      229      105       68       21       89
## ENSG00000000938.13 FGR          1294     1404      421       69      605
##                             4004-44B 4004-44C 4004-44F 4004-44M 4004-44S
## ENSG00000000003.15 TSPAN6        526      386      428       57      301
## ENSG00000000005.6 TNMD             3       22      480        2       10
## ENSG00000000419.13 DPM1          366      529      218      281      595
## ENSG00000000457.14 SCYL3         214      199      177      141      262
## ENSG00000000460.17 C1orf112      149       58       32       38      103
## ENSG00000000938.13 FGR          4440      775      406       74      803
##                             4005-45B 4005-45C 4005-45F 4005-45M 4005-45S
## ENSG00000000003.15 TSPAN6         85      371      368       78      231
## ENSG00000000005.6 TNMD            60      157      246        4      139
## ENSG00000000419.13 DPM1          395      300      168      493      414
## ENSG00000000457.14 SCYL3         261      149      140      157      184
## ENSG00000000460.17 C1orf112      179       60       58       38       70
## ENSG00000000938.13 FGR          8028      533      450       72     1392
##                             4006-46B 4006-46C 4006-46F 4006-46M 4006-46S
## ENSG00000000003.15 TSPAN6        722      342      385       55      363
## ENSG00000000005.6 TNMD            40        4      171       13       11
## ENSG00000000419.13 DPM1          525      434      273      238      516
## ENSG00000000457.14 SCYL3         300      242      147      113      237
## ENSG00000000460.17 C1orf112      134       60       47       19      116
## ENSG00000000938.13 FGR           620      968      362      364     1021
tmp <- read.table("3col_instability.tsv.gz",header=F)
x <- as.matrix(acast(tmp, V2~V1, value.var="V3", fun.aggregate = sum))
x <- as.data.frame(x)
accession <- sapply((strsplit(rownames(x),"\\|")),"[[",2)
symbol<-sapply((strsplit(rownames(x),"\\|")),"[[",6)
x$geneid <- paste(accession,symbol)
xx <- aggregate(. ~ geneid,x,sum)
rownames(xx) <- xx$geneid
colnames(xx) <- sapply(strsplit(colnames(xx),"_"),"[[",1)
xx$geneid = NULL
si <- round(xx)
head(si)
##                             SM-027 SM-028 SM-029 SM-030 SM-031 SM-032 SM-033
## ENSG00000000003.15 TSPAN6      187    191    204    185    112    116    128
## ENSG00000000005.6 TNMD          11      4     18      3     37      1      5
## ENSG00000000419.13 DPM1        273    308    401    351    180    375    454
## ENSG00000000457.14 SCYL3       270    318    364    230    124    218    256
## ENSG00000000460.17 C1orf112     62    101    138    124     45     88     65
## ENSG00000000938.13 FGR         806    704   1269    775    677    501    501
##                             SM-034 SM-035 SM-036 SM-037 SM-038 SM-039 SM-040
## ENSG00000000003.15 TSPAN6       75    165    283    104    127    107    135
## ENSG00000000005.6 TNMD           0     71     16     11      1      8     23
## ENSG00000000419.13 DPM1        231    289    345    227    280    173    325
## ENSG00000000457.14 SCYL3       141    225    361    197    232    142    251
## ENSG00000000460.17 C1orf112     65     60     91    101     79     59     81
## ENSG00000000938.13 FGR         281    354    280    371    581    302   1233
##                             SM-041 SM-042 SM-043 SM-044 SM-045 SM-046 SM-047
## ENSG00000000003.15 TSPAN6      224    139    197    156    183    229    248
## ENSG00000000005.6 TNMD          12      1    276    158      3    150    125
## ENSG00000000419.13 DPM1        421    319    335    331    432    317    399
## ENSG00000000457.14 SCYL3       310    262    348    269    272    341    335
## ENSG00000000460.17 C1orf112    121    103     72     85    105    136    142
## ENSG00000000938.13 FGR        1097    634   1299    767   1766    231    661
##                             SM-048 SM-049 SM-050 SM-051 SM-052
## ENSG00000000003.15 TSPAN6      252    114    226    363    234
## ENSG00000000005.6 TNMD         106     15     33      6      7
## ENSG00000000419.13 DPM1        326    296    432    452    398
## ENSG00000000457.14 SCYL3       220    238    266    381    350
## ENSG00000000460.17 C1orf112     90     76    115     59    142
## ENSG00000000938.13 FGR         636    352    598    446   1790
dim(oa)
## [1] 60651    30
dim(si)
## [1] 60651    26

QC analysis

Here I’ll look at a few different quality control measures.

par(mar=c(5,8,3,1))
barplot(colSums(oa),horiz=TRUE,las=1,xlab="OA num reads")

sums <- colSums(oa)
sums <- sums[order(sums)]
barplot(sums,horiz=TRUE,las=1,xlab="OA num reads")
abline(v=15000000,col="red")

barplot(colSums(si),horiz=TRUE,las=1,xlab="SI num reads")

sums <- colSums(si)
sums <- sums[order(sums)]
barplot(sums,horiz=TRUE,las=1,xlab="SI num reads")
abline(v=15000000,col="red")

MDS plot

Multidimensional scaling plot to show the variation between all samples, very similar to PCA.

xx <- cbind(oa,si)

mds <- cmdscale(dist(t(xx)))

plot(mds, xlab="Coordinate 1", ylab="Coordinate 2", 
  type = "p",bty="n",pch=19, cex=4 ,col="gray")
text(mds, labels=rownames(mds) )

Now remove fat, bone and muscle, then plot capsular and synovium samples.

dim(oa)
## [1] 60651    30
oa <- oa[,grep("M",colnames(oa),invert=TRUE)]
dim(oa)
## [1] 60651    24
oa <- oa[,grep("B",colnames(oa),invert=TRUE)]
dim(oa)
## [1] 60651    18
oa <- oa[,grep("F",colnames(oa),invert=TRUE)]
dim(oa)
## [1] 60651    12
xx <- cbind(oa,si)

mds <- cmdscale(dist(t(xx)))

caps <- as.numeric(grepl("C",colnames(xx)))
syno <- as.numeric(grepl("S$",colnames(xx))) * 2
cols <- caps + syno
cols <- cols +2

plot(mds, xlab="Coordinate 1", ylab="Coordinate 2",
  type = "p",bty="n",pch=19, cex=4 ,col=cols)
text(mds, labels=rownames(mds) )

Now remove synovium and regenerate the MDS plot for only the capsular samples.

oa <- oa[,grep("S",colnames(oa),invert=TRUE)]
dim(oa)
## [1] 60651     6
xx <- cbind(oa,si)
isoa <- as.numeric(grepl("C",colnames(xx)))

mds <- cmdscale(dist(t(xx)))

plot(mds, xlab="Coordinate 1", ylab="Coordinate 2",
  type = "p",bty="n",pch=19, cex=4 ,col=isoa+2)
text(mds, labels=rownames(mds) )

legend("topleft", legend=c("insta", "OA"),
       col=c(2,3), cex=1, pch=19)

This looks promising that there is some clustering of samples, with OA sample in green on the right and instability samples on the left. We need to be cautious as these were sequences in separate runs, but it would appear that there is some overlap between the groups which indicates that the batch effects might not be too severe.

Correlation heatmap

heatmap.2(cor(xx),trace="n",main="Pearson correlation heatmap")

Sample sheet

Also need to fix the patient identifiers to make it consistent between runs.

URL="https://raw.githubusercontent.com/markziemann/shoulder-instability-osteroarthritis/main/OA_patients.tsv"
pat <- read.table(URL,header=TRUE,sep="\t")

oa_pats <- colnames(xx)[grep("C",colnames(xx))]
oa_pats <- sapply(strsplit(oa_pats,"-"),"[[",1)
oa_pats <- paste("AD-CAB",oa_pats)
oa_pats <- pat[which(pat$Record_number %in% oa_pats),"Unique_participant_ID"]

insta_pats <- colnames(xx)[grep("SM",colnames(xx))]
insta_pats <- sapply(strsplit(insta_pats,"-"),"[[",2)
insta_pats <- as.integer(gsub("^0","",insta_pats))

pats <- c(oa_pats,insta_pats)

ss <- pat[which(pat$Unique_participant_ID %in% pats),]
rownames(ss) <- ss$Unique_participant_ID
ss$OA <- factor(ss$Unique_participant_ID>70)

colnames(xx) <- pats

Analysis of gene expression differences between instability and OA groups

Don’t forget to remove poorly detected genes from the matrix with a threshold of 10 reads per sample on average.

I’m also reordering the count matrix columns (xx) so it is in the same order as the samplesheet.

dim(xx)
## [1] 60651    32
xx <- xx[which(rowMeans(xx)>=10),]
dim(xx)
## [1] 18774    32
xx <- xx[,match( rownames(ss) , colnames(xx) )]

Not taking into consideration the effect of sex (will deal with that later).

dds <- DESeqDataSetFromMatrix(countData = xx , colData = ss, design = ~ OA )
## converting counts to integer mode
res <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 61 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
z<- results(res)
vsd <- vst(dds, blind=FALSE)
zz <- cbind(as.data.frame(z),assay(vsd))
dge <- as.data.frame(zz[order(zz$pvalue),])
dge[1:20,1:6] %>% kbl(caption = "Top gene expression differences between instability (ctrl) and OA (case) without consideration of sex") %>% kable_paper("hover", full_width = F)
Top gene expression differences between instability (ctrl) and OA (case) without consideration of sex
baseMean log2FoldChange lfcSE stat pvalue padj
ENSG00000159658.15 EFCAB14 3332.6211 -1.2520243 0.0519688 -24.09186 0 0
ENSG00000174282.12 ZBTB4 4700.0483 -1.4555560 0.0687237 -21.17982 0 0
ENSG00000178502.6 KLHL11 192.8755 -2.7752160 0.1312278 -21.14808 0 0
ENSG00000185129.8 PURA 1385.1347 -1.4980158 0.0709312 -21.11928 0 0
ENSG00000196914.9 ARHGEF12 4653.3909 -1.4837353 0.0742316 -19.98793 0 0
ENSG00000083168.11 KAT6A 1542.5913 -1.6332998 0.0818170 -19.96284 0 0
ENSG00000140396.13 NCOA2 967.7400 -1.6354692 0.0822493 -19.88428 0 0
ENSG00000114439.19 BBX 1534.9614 -1.2030571 0.0615213 -19.55513 0 0
ENSG00000130749.10 ZC3H4 1165.2906 -1.6253576 0.0842774 -19.28582 0 0
ENSG00000134644.16 PUM1 1626.2984 -0.9261724 0.0491877 -18.82936 0 0
ENSG00000125686.12 MED1 1067.1058 -1.3420280 0.0713158 -18.81810 0 0
ENSG00000132466.19 ANKRD17 1736.2567 -1.0809885 0.0578369 -18.69028 0 0
ENSG00000228253.1 MT-ATP8 14517.5823 -3.6433673 0.1954961 -18.63652 0 0
ENSG00000178951.9 ZBTB7A 3589.8387 -1.8000341 0.0979346 -18.37996 0 0
ENSG00000160007.19 ARHGAP35 2424.1989 -1.1767383 0.0648693 -18.14014 0 0
ENSG00000183741.12 CBX6 3816.0534 -1.6688803 0.0925367 -18.03479 0 0
ENSG00000185591.10 SP1 2337.4013 -1.1069720 0.0617523 -17.92600 0 0
ENSG00000078687.17 TNRC6C 1012.4221 -1.8929297 0.1067483 -17.73264 0 0
ENSG00000113580.15 NR3C1 2895.2502 -1.2823855 0.0723789 -17.71767 0 0
ENSG00000210144.1 MT-TY 287.6235 -6.2176214 0.3513097 -17.69840 0 0
dgeb <- dge

Now repeat the analysis including the effect of sex.

ss$Sex <- factor(ss$Sex)

dds <- DESeqDataSetFromMatrix(countData = xx , colData = ss, design = ~ Sex + OA )
## converting counts to integer mode
res <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 89 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
z<- results(res)
vsd <- vst(dds, blind=FALSE)
zz <- cbind(as.data.frame(z),assay(vsd))
dge <- as.data.frame(zz[order(zz$pvalue),])
dge[1:20,1:6] %>% kbl(caption = "Top gene expression differences between instability (ctrl) and OA (case) correcting for sex") %>% kable_paper("hover", full_width = F)
Top gene expression differences between instability (ctrl) and OA (case) correcting for sex
baseMean log2FoldChange lfcSE stat pvalue padj
ENSG00000159658.15 EFCAB14 3332.6211 -1.2654394 0.0624634 -20.25889 0 0
ENSG00000178502.6 KLHL11 192.8755 -2.7198587 0.1460426 -18.62374 0 0
ENSG00000174282.12 ZBTB4 4700.0483 -1.4656438 0.0828188 -17.69699 0 0
ENSG00000185129.8 PURA 1385.1347 -1.5001968 0.0850439 -17.64027 0 0
ENSG00000140396.13 NCOA2 967.7400 -1.6636029 0.0978811 -16.99616 0 0
ENSG00000130749.10 ZC3H4 1165.2906 -1.6665605 0.1003310 -16.61062 0 0
ENSG00000083168.11 KAT6A 1542.5913 -1.6306942 0.0981851 -16.60837 0 0
ENSG00000196914.9 ARHGEF12 4653.3909 -1.4716189 0.0894538 -16.45115 0 0
ENSG00000114439.19 BBX 1534.9614 -1.2082963 0.0737470 -16.38434 0 0
ENSG00000210144.1 MT-TY 287.6235 -6.2542130 0.3872795 -16.14909 0 0
ENSG00000173915.16 ATP5MD 382.0800 1.6301546 0.1020193 15.97888 0 0
ENSG00000132466.19 ANKRD17 1736.2567 -1.1001706 0.0691250 -15.91566 0 0
ENSG00000125686.12 MED1 1067.1058 -1.3450560 0.0853260 -15.76373 0 0
ENSG00000210194.1 MT-TE 353.1876 -6.7614006 0.4319016 -15.65496 0 0
ENSG00000005339.15 CREBBP 2887.6825 -1.5126863 0.0974960 -15.51537 0 0
ENSG00000134644.16 PUM1 1626.2984 -0.9111724 0.0587738 -15.50305 0 0
ENSG00000228253.1 MT-ATP8 14517.5823 -3.6537841 0.2356924 -15.50234 0 0
ENSG00000185591.10 SP1 2337.4013 -1.1329817 0.0737138 -15.37002 0 0
ENSG00000210151.2 MT-TS1 345.7805 -6.2523932 0.4070974 -15.35847 0 0
ENSG00000143190.23 POU2F1 678.7954 -1.4347898 0.0937583 -15.30308 0 0
dgeb <- dge

Here let’s look at some plots.

MA plot shows the average level and fold change of all detected genes. Volcano plot shows the fold change and the significance, as measured by -log(p-value). Significant genes are shown as red points.

There are heatmaps of the top ranked genes by p-value. Above the gene expression values there is a bar in yellow/orange colours. Yellow shows relatively low QuickDASH score and orange shows high score.

maplot <- function(de,contrast_name) {
  sig <-subset(de, padj < 0.05 )
  up <-rownames(subset(de, padj < 0.05 & log2FoldChange > 0))
  dn <-rownames(subset(de, padj < 0.05 & log2FoldChange < 0))
  GENESUP <- length(up)
  GENESDN <- length(dn)
  DET=nrow(de)
  SUBHEADER = paste(GENESUP, "up, ", GENESDN, "down", DET, "detected")
  ns <-subset(de, padj > 0.05 )
  plot(log2(de$baseMean),de$log2FoldChange, 
       xlab="log2 basemean", ylab="log2 foldchange",
       pch=19, cex=0.5, col="dark gray",
       main=contrast_name, cex.main=1)
  points(log2(sig$baseMean),sig$log2FoldChange,
         pch=19, cex=0.5, col="red")
  mtext(SUBHEADER,cex = 1)
}

make_volcano <- function(de,name) {
    sig <- subset(de,padj<0.05)
    N_SIG=nrow(sig)
    N_UP=nrow(subset(sig,log2FoldChange>0))
    N_DN=nrow(subset(sig,log2FoldChange<0))
    DET=nrow(de)
    HEADER=paste(N_SIG,"@5%FDR,", N_UP, "up", N_DN, "dn", DET, "detected")
    plot(de$log2FoldChange,-log10(de$pval),cex=0.5,pch=19,col="darkgray",
        main=name, xlab="log2 FC", ylab="-log10 pval")
    mtext(HEADER)
    grid()
    points(sig$log2FoldChange,-log10(sig$pval),cex=0.5,pch=19,col="red")
}

make_heatmap <- function(de,name,myss,mx,n=30){

  colfunc <- colorRampPalette(c("blue", "white", "red"))
  values <- as.numeric(myss$OA)
  f <- colorRamp(c("yellow", "orange"))
  rr <- range(values)
  svals <- (values-rr[1])/diff(rr)
  colcols <- rgb(f(svals)/255)
  mxn <- mx/rowSums(mx)*1000000
  x <- mxn[which(rownames(mxn) %in% rownames(head(de,n))),]
  heatmap.2(as.matrix(x),trace="none",col=colfunc(25),scale="row", margins = c(10,15), cexRow=0.5, 
    main=paste("Top ranked",n,"genes in",name) , ColSideColors = colcols  )

}

maplot(dge,"insta vs OA")

make_volcano(dge,"insta vs OA")

make_heatmap(dge,"insta vs OA",ss,xx,n=50)

Pathway analysis with mitch

#download.file("https://reactome.org/download/current/ReactomePathways.gmt.zip", destfile="ReactomePathways.gmt.zip")
#unzip("ReactomePathways.gmt.zip")
genesets <- gmt_import("ReactomePathways.gmt")

# gene table
gt <- as.data.frame(rownames(xx))
gt$gene <- sapply(strsplit(gt[,1]," "),"[[",2)

m <- mitch_import(dge, DEtype="deseq2",geneTable=gt)
## The input is a single dataframe; one contrast only. Converting
##         it to a list for you.
## Note: Mean no. genes in input = 18774
## Note: no. genes in output = 18743
## Note: estimated proportion of input genes in output = 0.998
m1 <- mitch_calc(m, genesets, priority="effect")
## Note: Enrichments with large effect sizes may not be
##             statistically significant.
head(m1$enrichment_result,20) %>% kbl(caption = "Top gene pathway differences using the parametric DE data") %>% kable_paper("hover", full_width = F)
Top gene pathway differences using the parametric DE data
set setSize pANOVA s.dist p.adjustANOVA
203 Cohesin Loading onto Chromatin 10 0.0000058 -0.8274916 0.0000431
973 RORA activates gene expression 18 0.0000001 -0.7376354 0.0000006
834 Peptide chain elongation 87 0.0000000 0.7193655 0.0000000
1396 Viral mRNA Translation 87 0.0000000 0.7144563 0.0000000
368 Eukaryotic Translation Elongation 92 0.0000000 0.7134260 0.0000000
1118 Selenocysteine synthesis 91 0.0000000 0.7042995 0.0000000
48 Activation of RAC1 11 0.0000566 -0.7010658 0.0003553
365 Establishment of Sister Chromatid Cohesion 11 0.0000609 -0.6979986 0.0003769
370 Eukaryotic Translation Termination 91 0.0000000 0.6972943 0.0000000
1047 Regulation of signaling by CBL 18 0.0000004 -0.6885269 0.0000036
1096 SRP-dependent cotranslational protein targeting to membrane 110 0.0000000 0.6765026 0.0000000
727 NOTCH2 intracellular domain regulates transcription 10 0.0002294 -0.6727486 0.0011713
1443 tRNA processing in the mitochondrion 30 0.0000000 -0.6639270 0.0000000
411 Formation of a pool of free 40S subunits 99 0.0000000 0.6589267 0.0000000
1062 Response of EIF2AK4 (GCN2) to amino acid deficiency 99 0.0000000 0.6461743 0.0000000
767 Nonsense Mediated Decay (NMD) independent of the Exon Junction Complex (EJC) 93 0.0000000 0.6455533 0.0000000
1117 Selenoamino acid metabolism 113 0.0000000 0.6429453 0.0000000
142 CREB1 phosphorylation through the activation of Adenylate Cyclase 11 0.0002401 -0.6394308 0.0012217
813 PI3K events in ERBB2 signaling 13 0.0001156 -0.6174545 0.0006506
16 APC/C:Cdc20 mediated degradation of mitotic proteins 73 0.0000000 0.6166820 0.0000000
m1top <- subset(m1$enrichment_result,p.adjustANOVA<0.05)
m1up <- subset(m1top,s.dist>0)$set
m1dn <- subset(m1top,s.dist<0)$set

Nonparametric analysis

Looking at gene ranks might be a better option for comparisons across batches.

The rank() function give the rank from lowest to highest and I’ll use this for wilcox test.

r <- apply(xx,2,rank)

ctrl <- which(colnames(r) <70)
case <- which(colnames(r) >70)

suppressWarnings(
wtres <- t(apply(r,1,function(x) {
  wt <- wilcox.test(x[case] ,  x[ctrl] )
  meanrankdiff <- mean(x[case]) - mean(x[ctrl])
  unname(c(meanrankdiff,wt$p.value))
})))

colnames(wtres) <- c("meanrankdiff","pvalue")
wtres <- as.data.frame(wtres)
wtres <- wtres[order(wtres$pvalue),]
wtres$fdr <- p.adjust(wtres$pvalue)

top <- head(wtres,50)
top <- top[order(top$meanrankdiff),]

plot(top$meanrankdiff,1:nrow(top),bty="none",xlab="mean rank difference",ylab="gene",pch=19,cex=2,col="gray",main="Top differentially regulated genes")
labels <- sapply(strsplit(rownames(top)," "),"[[",2)
text(top$meanrankdiff,1:nrow(top),labels=labels,cex=0.7)

top
##                             meanrankdiff       pvalue        fdr
## ENSG00000029534.21 ANK1       -4705.0897 2.207038e-06 0.04143493
## ENSG00000047648.23 ARHGAP6    -4131.7756 2.207038e-06 0.04143493
## ENSG00000048052.23 HDAC9      -3601.2244 2.207038e-06 0.04143493
## ENSG00000011523.14 CEP68      -3571.4359 2.207038e-06 0.04143493
## ENSG00000023516.9 AKAP11      -3449.0000 2.207038e-06 0.04143493
## ENSG00000011114.15 BTBD7      -3094.7949 2.207038e-06 0.04143493
## ENSG00000007944.15 MYLIP      -3065.1923 2.207038e-06 0.04143493
## ENSG00000008086.13 CDKL5      -2621.5256 2.207038e-06 0.04143493
## ENSG00000048405.10 ZNF800     -2611.4679 2.207038e-06 0.04143493
## ENSG00000031081.11 ARHGAP31   -2575.7564 2.207038e-06 0.04143493
## ENSG00000033327.13 GAB2       -2449.7372 2.207038e-06 0.04143493
## ENSG00000053254.16 FOXN3      -2312.0513 2.207038e-06 0.04143493
## ENSG00000043143.21 JADE2      -2263.0192 2.207038e-06 0.04143493
## ENSG00000008853.17 RHOBTB2    -2108.1026 2.207038e-06 0.04143493
## ENSG00000033800.14 PIAS1      -2003.0641 2.207038e-06 0.04143493
## ENSG00000049618.24 ARID1B     -1971.8205 2.207038e-06 0.04143493
## ENSG00000048707.15 VPS13D     -1915.5962 2.207038e-06 0.04143493
## ENSG00000025293.17 PHF20      -1594.8654 2.207038e-06 0.04143493
## ENSG00000012232.9 EXTL3       -1590.0513 2.207038e-06 0.04143493
## ENSG00000010292.13 NCAPD2     -1476.1218 2.207038e-06 0.04143493
## ENSG00000019995.6 ZRANB1      -1468.2179 2.207038e-06 0.04143493
## ENSG00000017797.13 RALBP1      -761.2244 2.207038e-06 0.04143493
## ENSG00000002834.18 LASP1       -307.4295 2.207038e-06 0.04143493
## ENSG00000047315.17 POLR2B       683.4679 2.207038e-06 0.04143493
## ENSG00000010244.19 ZNF207       707.3205 2.207038e-06 0.04143493
## ENSG00000003756.17 RBM5        1037.8077 2.207038e-06 0.04143493
## ENSG00000010295.20 IFFO1       1113.7949 2.207038e-06 0.04143493
## ENSG00000004534.15 RBM6        1269.6731 2.207038e-06 0.04143493
## ENSG00000008018.9 PSMB1        1298.3013 2.207038e-06 0.04143493
## ENSG00000006530.18 AGK         1373.7115 2.207038e-06 0.04143493
## ENSG00000037474.15 NSUN2       1417.9423 2.207038e-06 0.04143493
## ENSG00000003509.16 NDUFAF7     1635.5128 2.207038e-06 0.04143493
## ENSG00000041357.16 PSMA4       1731.2564 2.207038e-06 0.04143493
## ENSG00000011009.11 LYPLA2      1778.0385 2.207038e-06 0.04143493
## ENSG00000011260.14 UTP18       1856.5385 2.207038e-06 0.04143493
## ENSG00000023041.11 ZDHHC6      1924.8141 2.207038e-06 0.04143493
## ENSG00000006015.18 REX1BD      1933.6667 2.207038e-06 0.04143493
## ENSG00000016864.19 GLT8D1      1945.6474 2.207038e-06 0.04143493
## ENSG00000051128.19 HOMER3      1951.7821 2.207038e-06 0.04143493
## ENSG00000050426.16 LETMD1      1967.0449 2.207038e-06 0.04143493
## ENSG00000013375.16 PGM3        2021.3846 2.207038e-06 0.04143493
## ENSG00000011243.19 AKAP8L      2032.3269 2.207038e-06 0.04143493
## ENSG00000050130.18 JKAMP       2109.2756 2.207038e-06 0.04143493
## ENSG00000008324.12 SS18L2      2155.8846 2.207038e-06 0.04143493
## ENSG00000037241.7 RPL26L1      2289.0256 2.207038e-06 0.04143493
## ENSG00000004779.10 NDUFAB1     2339.1346 2.207038e-06 0.04143493
## ENSG00000014641.21 MDH1        2552.0000 2.207038e-06 0.04143493
## ENSG00000008128.23 CDK11A      2835.3974 2.207038e-06 0.04143493
## ENSG00000007392.17 LUC7L       3942.3654 2.207038e-06 0.04143493
## ENSG00000012223.13 LTF         7282.1282 2.207038e-06 0.04143493

Pathway enrichment

Here I’m using the mitch package and gene pathways from Reactome to understand the affected pathways separately for each tissue type.

Reactome pathways downloaded 9th Dec 2021.

wtres$stat <- -log10(wtres$pvalue) * sign(wtres$meanrankdiff)
wtres$gene <- sapply(strsplit(rownames(wtres)," "),"[[",2)

wtres2<-wtres[,c("stat","gene")]
wtres2 <- aggregate(. ~ gene, wtres2, max)
rownames(wtres2) <- wtres2$gene
wtres2$gene=NULL

m2 <- mitch_calc(wtres2, genesets, priority="effect")
## Warning in mitch_rank(input_profile): Warning: >60% of genes have the same score. This isn't
##             optimal for rank based enrichment analysis.
## Note: Enrichments with large effect sizes may not be
##             statistically significant.
head(m2$enrichment_result,20) %>% kbl(caption = "Top gene pathway differences using the nonparametric DE data") %>% kable_paper("hover", full_width = F)
Top gene pathway differences using the nonparametric DE data
set setSize pANOVA s.dist p.adjustANOVA
203 Cohesin Loading onto Chromatin 10 0.0000982 -0.7111728 0.0005789
1396 Viral mRNA Translation 87 0.0000000 0.7098533 0.0000000
834 Peptide chain elongation 87 0.0000000 0.7088817 0.0000000
368 Eukaryotic Translation Elongation 92 0.0000000 0.7033858 0.0000000
1118 Selenocysteine synthesis 91 0.0000000 0.6966604 0.0000000
973 RORA activates gene expression 18 0.0000003 -0.6963952 0.0000028
1096 SRP-dependent cotranslational protein targeting to membrane 110 0.0000000 0.6943214 0.0000000
370 Eukaryotic Translation Termination 91 0.0000000 0.6926406 0.0000000
365 Establishment of Sister Chromatid Cohesion 11 0.0000825 -0.6854144 0.0004944
411 Formation of a pool of free 40S subunits 99 0.0000000 0.6778735 0.0000000
240 Cytosolic tRNA aminoacylation 24 0.0000000 0.6774200 0.0000001
1047 Regulation of signaling by CBL 18 0.0000008 -0.6704376 0.0000070
402 Folding of actin by CCT/TriC 10 0.0002447 0.6696685 0.0013147
142 CREB1 phosphorylation through the activation of Adenylate Cyclase 11 0.0001267 -0.6672442 0.0007266
1381 Ubiquitin Mediated Degradation of Phosphorylated Cdc25A 50 0.0000000 0.6505483 0.0000000
1430 p53-Independent DNA Damage Response 50 0.0000000 0.6505483 0.0000000
1431 p53-Independent G1/S DNA damage checkpoint 50 0.0000000 0.6505483 0.0000000
1117 Selenoamino acid metabolism 113 0.0000000 0.6441685 0.0000000
767 Nonsense Mediated Decay (NMD) independent of the Exon Junction Complex (EJC) 93 0.0000000 0.6440751 0.0000000
48 Activation of RAC1 11 0.0002301 -0.6412702 0.0012451
m2top <- subset(m2$enrichment_result,p.adjustANOVA<0.05)    
m2up <- subset(m2top,s.dist>0)$set
m2dn <- subset(m2top,s.dist<0)$set

Compare parametric and nonparametric analyses

These analyses gave similar results, so it suggests that there might not be such a huge batch effect impacting the results.

v0 <- list("para up"=m1up,"para dn"=m1dn, "nonp up"=m2up, "nonp dn"=m2dn)

str(v0)
## List of 4
##  $ para up: chr [1:308] "Peptide chain elongation" "Viral mRNA Translation" "Eukaryotic Translation Elongation" "Selenocysteine synthesis" ...
##  $ para dn: chr [1:297] "Cohesin Loading onto Chromatin" "RORA activates gene expression" "Activation of RAC1" "Establishment of Sister Chromatid Cohesion" ...
##  $ nonp up: chr [1:314] "Viral mRNA Translation" "Peptide chain elongation" "Eukaryotic Translation Elongation" "Selenocysteine synthesis" ...
##  $ nonp dn: chr [1:235] "Cohesin Loading onto Chromatin" "RORA activates gene expression" "Establishment of Sister Chromatid Cohesion" "Regulation of signaling by CBL" ...
plot(euler(v0),quantities = TRUE, edges = "gray", main="Parametric and nonparametric DE analysis")

Conclusion

Based on the gene expression data, my guess is that there is no major technical batch effect between the two sample groups. That said, we can’t be 100% sure that the differences observed are due to biological differences and not batch effects. To be certain, it would be a good idea to perform some validation analysis, either by testing the expression of some of these genes with another method like RTqPCR, or testing pathway activation biochemically.

As OA is thought to associate with inflammation, I was expecting to see these pathways activated. However OA is different to rheumatoid arthritis which definitely has an autoimmune component to it.

What we did observe was a downregulation of many signaling pathways such as RORA, RAC1, NOTCH2, CREB1, PI3K and others, while translation, amino acid recycling, nonsense mediated decay, degradation of mitotic proteins, and p53 pathways were upregulated.

Session information

For reproducibility.

sessionInfo()
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.3 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] kableExtra_1.3.4            topconfects_1.8.0          
##  [3] limma_3.48.1                eulerr_6.1.0               
##  [5] mitch_1.4.0                 MASS_7.3-54                
##  [7] fgsea_1.18.0                gplots_3.1.1               
##  [9] DESeq2_1.32.0               SummarizedExperiment_1.22.0
## [11] Biobase_2.52.0              MatrixGenerics_1.4.0       
## [13] matrixStats_0.59.0          GenomicRanges_1.44.0       
## [15] GenomeInfoDb_1.28.1         IRanges_2.26.0             
## [17] S4Vectors_0.30.0            BiocGenerics_0.38.0        
## [19] reshape2_1.4.4              forcats_0.5.1              
## [21] stringr_1.4.0               dplyr_1.0.7                
## [23] purrr_0.3.4                 readr_2.0.0                
## [25] tidyr_1.1.3                 tibble_3.1.3               
## [27] ggplot2_3.3.5               tidyverse_1.3.1            
## [29] zoo_1.8-9                  
## 
## loaded via a namespace (and not attached):
##   [1] colorspace_2.0-2       ellipsis_0.3.2         XVector_0.32.0        
##   [4] fs_1.5.0               rstudioapi_0.13        bit64_4.0.5           
##   [7] AnnotationDbi_1.54.1   fansi_0.5.0            lubridate_1.7.10      
##  [10] xml2_1.3.2             splines_4.1.2          cachem_1.0.5          
##  [13] knitr_1.33             geneplotter_1.70.0     polyclip_1.10-0       
##  [16] jsonlite_1.7.2         broom_0.7.8            annotate_1.70.0       
##  [19] dbplyr_2.1.1           png_0.1-7              shiny_1.6.0           
##  [22] compiler_4.1.2         httr_1.4.2             backports_1.2.1       
##  [25] assertthat_0.2.1       Matrix_1.3-4           fastmap_1.1.0         
##  [28] cli_3.0.1              later_1.2.0            htmltools_0.5.1.1     
##  [31] tools_4.1.2            gtable_0.3.0           glue_1.4.2            
##  [34] GenomeInfoDbData_1.2.6 fastmatch_1.1-3        Rcpp_1.0.7            
##  [37] jquerylib_0.1.4        cellranger_1.1.0       vctrs_0.3.8           
##  [40] Biostrings_2.60.1      svglite_2.0.0          polylabelr_0.2.0      
##  [43] xfun_0.24              rvest_1.0.0            mime_0.11             
##  [46] lifecycle_1.0.0        gtools_3.9.2           XML_3.99-0.6          
##  [49] zlibbioc_1.38.0        scales_1.1.1           hms_1.1.0             
##  [52] promises_1.2.0.1       RColorBrewer_1.1-2     yaml_2.2.1            
##  [55] memoise_2.0.0          gridExtra_2.3          sass_0.4.0            
##  [58] reshape_0.8.8          stringi_1.7.3          RSQLite_2.2.7         
##  [61] highr_0.9              genefilter_1.74.0      caTools_1.18.2        
##  [64] BiocParallel_1.26.1    echarts4r_0.4.1        systemfonts_1.0.2     
##  [67] rlang_0.4.11           pkgconfig_2.0.3        bitops_1.0-7          
##  [70] evaluate_0.14          lattice_0.20-45        htmlwidgets_1.5.3     
##  [73] bit_4.0.4              tidyselect_1.1.1       GGally_2.1.2          
##  [76] plyr_1.8.6             magrittr_2.0.1         R6_2.5.0              
##  [79] generics_0.1.0         DelayedArray_0.18.0    DBI_1.1.1             
##  [82] pillar_1.6.1           haven_2.4.1            withr_2.4.2           
##  [85] survival_3.2-13        KEGGREST_1.32.0        RCurl_1.98-1.3        
##  [88] modelr_0.1.8           crayon_1.4.1           KernSmooth_2.23-20    
##  [91] utf8_1.2.2             rmarkdown_2.9          tzdb_0.1.2            
##  [94] locfit_1.5-9.4         grid_4.1.2             readxl_1.3.1          
##  [97] data.table_1.14.0      blob_1.2.2             webshot_0.5.2         
## [100] reprex_2.0.0           digest_0.6.27          xtable_1.8-4          
## [103] httpuv_1.6.1           munsell_0.5.0          viridisLite_0.4.0     
## [106] beeswarm_0.4.0         bslib_0.2.5.1