Source: https://github.com/markziemann/shoulder-instability-osteroarthritis

Introduction

Here we are comparing the gene expression patterns of shoulder instability to shoulder osteroarthritis. We are doing this to identify the gene expression patterns which are specific to osteoarthritis. We are using the instability group as a control for this contrast as it is difficult to obtain specimens of completely normal shoulder tissue.

We will be looking only at capsular tissues for the differential expression analysis.

suppressPackageStartupMessages({
    library("zoo")
    library("tidyverse")
    library("reshape2")
    library("DESeq2")
    library("gplots")
    library("fgsea")
    library("MASS")
    library("mitch")
    library("eulerr")
    library("limma")
    library("topconfects")
    library("kableExtra")
})

Import read counts

Importing osteoarthritis (OA) and shoulder instability (SI) data separately.

tmp <- read.table("3col_oa.tsv.gz",header=F)
x <- as.matrix(acast(tmp, V2~V1, value.var="V3", fun.aggregate = sum))
x <- as.data.frame(x)
accession <- sapply((strsplit(rownames(x),"\\|")),"[[",2)
symbol<-sapply((strsplit(rownames(x),"\\|")),"[[",6)
x$geneid <- paste(accession,symbol)
xx <- aggregate(. ~ geneid,x,sum)
rownames(xx) <- xx$geneid
colnames <- gsub("T0R","T0",colnames(xx))
xx$geneid = NULL
oa <- round(xx)
head(oa)

##                             4001-41B 4001-41C 4001-41F 4001-41M 4001-41S
## ENSG00000000003.15 TSPAN6        327      231      613       27      346
## ENSG00000000005.6 TNMD             9       45      477        1       30
## ENSG00000000419.13 DPM1          304      497      485      132      502
## ENSG00000000457.14 SCYL3         245      245      280       83      231
## ENSG00000000460.17 C1orf112      101       91       79       15       82
## ENSG00000000938.13 FGR          2962      828      661       91      736
##                             4002-42B 4002-42C 4002-42F 4002-42M 4002-42S
## ENSG00000000003.15 TSPAN6        432      301      579       46      336
## ENSG00000000005.6 TNMD            52        2      275        1      308
## ENSG00000000419.13 DPM1          349      333      422      126      339
## ENSG00000000457.14 SCYL3         202      185      183       93      168
## ENSG00000000460.17 C1orf112       86       71       53        9       49
## ENSG00000000938.13 FGR           562     1012      713       95      479
##                             4003-43B 4003-43C 4003-43F 4003-43M 4003-43S
## ENSG00000000003.15 TSPAN6        279      683      885       69      370
## ENSG00000000005.6 TNMD            17      356      772       12      387
## ENSG00000000419.13 DPM1          851      815      486      328      617
## ENSG00000000457.14 SCYL3         287      281      326      138      278
## ENSG00000000460.17 C1orf112      229      105       68       21       89
## ENSG00000000938.13 FGR          1294     1404      421       69      605
##                             4004-44B 4004-44C 4004-44F 4004-44M 4004-44S
## ENSG00000000003.15 TSPAN6        526      386      428       57      301
## ENSG00000000005.6 TNMD             3       22      480        2       10
## ENSG00000000419.13 DPM1          366      529      218      281      595
## ENSG00000000457.14 SCYL3         214      199      177      141      262
## ENSG00000000460.17 C1orf112      149       58       32       38      103
## ENSG00000000938.13 FGR          4440      775      406       74      803
##                             4005-45B 4005-45C 4005-45F 4005-45M 4005-45S
## ENSG00000000003.15 TSPAN6         85      371      368       78      231
## ENSG00000000005.6 TNMD            60      157      246        4      139
## ENSG00000000419.13 DPM1          395      300      168      493      414
## ENSG00000000457.14 SCYL3         261      149      140      157      184
## ENSG00000000460.17 C1orf112      179       60       58       38       70
## ENSG00000000938.13 FGR          8028      533      450       72     1392
##                             4006-46B 4006-46C 4006-46F 4006-46M 4006-46S
## ENSG00000000003.15 TSPAN6        722      342      385       55      363
## ENSG00000000005.6 TNMD            40        4      171       13       11
## ENSG00000000419.13 DPM1          525      434      273      238      516
## ENSG00000000457.14 SCYL3         300      242      147      113      237
## ENSG00000000460.17 C1orf112      134       60       47       19      116
## ENSG00000000938.13 FGR           620      968      362      364     1021

tmp <- read.table("3col_instability.tsv.gz",header=F)
x <- as.matrix(acast(tmp, V2~V1, value.var="V3", fun.aggregate = sum))
x <- as.data.frame(x)
accession <- sapply((strsplit(rownames(x),"\\|")),"[[",2)
symbol<-sapply((strsplit(rownames(x),"\\|")),"[[",6)
x$geneid <- paste(accession,symbol)
xx <- aggregate(. ~ geneid,x,sum)
rownames(xx) <- xx$geneid
colnames(xx) <- sapply(strsplit(colnames(xx),"_"),"[[",1)
xx$geneid = NULL
si <- round(xx)
head(si)

##                             SM-027 SM-028 SM-029 SM-030 SM-031 SM-032 SM-033
## ENSG00000000003.15 TSPAN6      187    191    204    185    112    116    128
## ENSG00000000005.6 TNMD          11      4     18      3     37      1      5
## ENSG00000000419.13 DPM1        273    308    401    351    180    375    454
## ENSG00000000457.14 SCYL3       270    318    364    230    124    218    256
## ENSG00000000460.17 C1orf112     62    101    138    124     45     88     65
## ENSG00000000938.13 FGR         806    704   1269    775    677    501    501
##                             SM-034 SM-035 SM-036 SM-037 SM-038 SM-039 SM-040
## ENSG00000000003.15 TSPAN6       75    165    283    104    127    107    135
## ENSG00000000005.6 TNMD           0     71     16     11      1      8     23
## ENSG00000000419.13 DPM1        231    289    345    227    280    173    325
## ENSG00000000457.14 SCYL3       141    225    361    197    232    142    251
## ENSG00000000460.17 C1orf112     65     60     91    101     79     59     81
## ENSG00000000938.13 FGR         281    354    280    371    581    302   1233
##                             SM-041 SM-042 SM-043 SM-044 SM-045 SM-046 SM-047
## ENSG00000000003.15 TSPAN6      224    139    197    156    183    229    248
## ENSG00000000005.6 TNMD          12      1    276    158      3    150    125
## ENSG00000000419.13 DPM1        421    319    335    331    432    317    399
## ENSG00000000457.14 SCYL3       310    262    348    269    272    341    335
## ENSG00000000460.17 C1orf112    121    103     72     85    105    136    142
## ENSG00000000938.13 FGR        1097    634   1299    767   1766    231    661
##                             SM-048 SM-049 SM-050 SM-051 SM-052
## ENSG00000000003.15 TSPAN6      252    114    226    363    234
## ENSG00000000005.6 TNMD         106     15     33      6      7
## ENSG00000000419.13 DPM1        326    296    432    452    398
## ENSG00000000457.14 SCYL3       220    238    266    381    350
## ENSG00000000460.17 C1orf112     90     76    115     59    142
## ENSG00000000938.13 FGR         636    352    598    446   1790

dim(oa)

## [1] 60651    30

dim(si)

## [1] 60651    26

QC analysis

Here I’ll look at a few different quality control measures.

par(mar=c(5,8,3,1))
barplot(colSums(oa),horiz=TRUE,las=1,xlab="OA num reads")

sums <- colSums(oa)
sums <- sums[order(sums)]
barplot(sums,horiz=TRUE,las=1,xlab="OA num reads")
abline(v=15000000,col="red")

barplot(colSums(si),horiz=TRUE,las=1,xlab="SI num reads")

sums <- colSums(si)
sums <- sums[order(sums)]
barplot(sums,horiz=TRUE,las=1,xlab="SI num reads")
abline(v=15000000,col="red")

MDS plot

Multidimensional scaling plot to show the variation between all samples, very similar to PCA.

xx <- cbind(oa,si)

mds <- cmdscale(dist(t(xx)))

plot(mds, xlab="Coordinate 1", ylab="Coordinate 2", 
  type = "p",bty="n",pch=19, cex=4 ,col="gray")
text(mds, labels=rownames(mds) )

Now remove fat, bone and muscle, then plot capsular and synovium samples.

dim(oa)

## [1] 60651    30

oa <- oa[,grep("M",colnames(oa),invert=TRUE)]
dim(oa)

## [1] 60651    24

oa <- oa[,grep("B",colnames(oa),invert=TRUE)]
dim(oa)

## [1] 60651    18

oa <- oa[,grep("F",colnames(oa),invert=TRUE)]
dim(oa)

## [1] 60651    12

xx <- cbind(oa,si)

mds <- cmdscale(dist(t(xx)))

caps <- as.numeric(grepl("C",colnames(xx)))
syno <- as.numeric(grepl("S$",colnames(xx))) * 2
cols <- caps + syno
cols <- cols +2

plot(mds, xlab="Coordinate 1", ylab="Coordinate 2",
  type = "p",bty="n",pch=19, cex=4 ,col=cols)
text(mds, labels=rownames(mds) )

Now remove synovium and regenerate the MDS plot for only the capsular samples.

oa <- oa[,grep("S",colnames(oa),invert=TRUE)]
dim(oa)

## [1] 60651     6

xx <- cbind(oa,si)
isoa <- as.numeric(grepl("C",colnames(xx)))

mds <- cmdscale(dist(t(xx)))

plot(mds, xlab="Coordinate 1", ylab="Coordinate 2",
  type = "p",bty="n",pch=19, cex=4 ,col=isoa+2)
text(mds, labels=rownames(mds) )

legend("topleft", legend=c("insta", "OA"),
       col=c(2,3), cex=1, pch=19)

This looks promising that there is some clustering of samples, with OA sample in green on the right and instability samples on the left. We need to be cautious as these were sequences in separate runs, but it would appear that there is some overlap between the groups which indicates that the batch effects might not be too severe.

Correlation heatmap

heatmap.2(cor(xx),trace="n",main="Pearson correlation heatmap")

Sample sheet

Also need to fix the patient identifiers to make it consistent between runs.

URL="https://raw.githubusercontent.com/markziemann/shoulder-instability-osteroarthritis/main/OA_patients.tsv"
pat <- read.table(URL,header=TRUE,sep="\t")

oa_pats <- colnames(xx)[grep("C",colnames(xx))]
oa_pats <- sapply(strsplit(oa_pats,"-"),"[[",1)
oa_pats <- paste("AD-CAB",oa_pats)
oa_pats <- pat[which(pat$Record_number %in% oa_pats),"Unique_participant_ID"]

insta_pats <- colnames(xx)[grep("SM",colnames(xx))]
insta_pats <- sapply(strsplit(insta_pats,"-"),"[[",2)
insta_pats <- as.integer(gsub("^0","",insta_pats))

pats <- c(oa_pats,insta_pats)

ss <- pat[which(pat$Unique_participant_ID %in% pats),]
rownames(ss) <- ss$Unique_participant_ID
ss$OA <- factor(ss$Unique_participant_ID>70)

colnames(xx) <- pats

Analysis of gene expression differences between instability and OA groups

Don’t forget to remove poorly detected genes from the matrix with a threshold of 10 reads per sample on average.

I’m also reordering the count matrix columns (xx) so it is in the same order as the samplesheet.

dim(xx)

## [1] 60651    32

xx <- xx[which(rowMeans(xx)>=10),]
dim(xx)

## [1] 18774    32

xx <- xx[,match( rownames(ss) , colnames(xx) )]

Not taking into consideration the effect of sex (will deal with that later).

dds <- DESeqDataSetFromMatrix(countData = xx , colData = ss, design = ~ OA )

## converting counts to integer mode

res <- DESeq(dds)

## estimating size factors

## estimating dispersions

## gene-wise dispersion estimates

## mean-dispersion relationship

## final dispersion estimates

## fitting model and testing

## -- replacing outliers and refitting for 61 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)

## estimating dispersions

## fitting model and testing

z<- results(res)
vsd <- vst(dds, blind=FALSE)
zz <- cbind(as.data.frame(z),assay(vsd))
dge <- as.data.frame(zz[order(zz$pvalue),])
dge[1:20,1:6] %>% kbl(caption = "Top gene expression differences between instability (ctrl) and OA (case) without consideration of sex") %>% kable_paper("hover", full_width = F)

Top gene expression differences between instability (ctrl) and OA (case) without consideration of sex
	baseMean	log2FoldChange	lfcSE	stat
ENSG00000159658.15 EFCAB14	3332.6211	-1.2520243	0.0519688	-24.09186
ENSG00000174282.12 ZBTB4	4700.0483	-1.4555560	0.0687237	-21.17982
ENSG00000178502.6 KLHL11	192.8755	-2.7752160	0.1312278	-21.14808
ENSG00000185129.8 PURA	1385.1347	-1.4980158	0.0709312	-21.11928
ENSG00000196914.9 ARHGEF12	4653.3909	-1.4837353	0.0742316	-19.98793
ENSG00000083168.11 KAT6A	1542.5913	-1.6332998	0.0818170	-19.96284
ENSG00000140396.13 NCOA2	967.7400	-1.6354692	0.0822493	-19.88428
ENSG00000114439.19 BBX	1534.9614	-1.2030571	0.0615213	-19.55513
ENSG00000130749.10 ZC3H4	1165.2906	-1.6253576	0.0842774	-19.28582
ENSG00000134644.16 PUM1	1626.2984	-0.9261724	0.0491877	-18.82936
ENSG00000125686.12 MED1	1067.1058	-1.3420280	0.0713158	-18.81810
ENSG00000132466.19 ANKRD17	1736.2567	-1.0809885	0.0578369	-18.69028
ENSG00000228253.1 MT-ATP8	14517.5823	-3.6433673	0.1954961	-18.63652
ENSG00000178951.9 ZBTB7A	3589.8387	-1.8000341	0.0979346	-18.37996
ENSG00000160007.19 ARHGAP35	2424.1989	-1.1767383	0.0648693	-18.14014
ENSG00000183741.12 CBX6	3816.0534	-1.6688803	0.0925367	-18.03479
ENSG00000185591.10 SP1	2337.4013	-1.1069720	0.0617523	-17.92600
ENSG00000078687.17 TNRC6C	1012.4221	-1.8929297	0.1067483	-17.73264
ENSG00000113580.15 NR3C1	2895.2502	-1.2823855	0.0723789	-17.71767
ENSG00000210144.1 MT-TY	287.6235	-6.2176214	0.3513097	-17.69840

dgeb <- dge

Now repeat the analysis including the effect of sex.

ss$Sex <- factor(ss$Sex)

dds <- DESeqDataSetFromMatrix(countData = xx , colData = ss, design = ~ Sex + OA )

## converting counts to integer mode

res <- DESeq(dds)

## estimating size factors

## estimating dispersions

## gene-wise dispersion estimates

## mean-dispersion relationship

## final dispersion estimates

## fitting model and testing

## -- replacing outliers and refitting for 89 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)

## estimating dispersions

## fitting model and testing

z<- results(res)
vsd <- vst(dds, blind=FALSE)
zz <- cbind(as.data.frame(z),assay(vsd))
dge <- as.data.frame(zz[order(zz$pvalue),])
dge[1:20,1:6] %>% kbl(caption = "Top gene expression differences between instability (ctrl) and OA (case) correcting for sex") %>% kable_paper("hover", full_width = F)

Top gene expression differences between instability (ctrl) and OA (case) correcting for sex
	baseMean	log2FoldChange	lfcSE	stat
ENSG00000159658.15 EFCAB14	3332.6211	-1.2654394	0.0624634	-20.25889
ENSG00000178502.6 KLHL11	192.8755	-2.7198587	0.1460426	-18.62374
ENSG00000174282.12 ZBTB4	4700.0483	-1.4656438	0.0828188	-17.69699
ENSG00000185129.8 PURA	1385.1347	-1.5001968	0.0850439	-17.64027
ENSG00000140396.13 NCOA2	967.7400	-1.6636029	0.0978811	-16.99616
ENSG00000130749.10 ZC3H4	1165.2906	-1.6665605	0.1003310	-16.61062
ENSG00000083168.11 KAT6A	1542.5913	-1.6306942	0.0981851	-16.60837
ENSG00000196914.9 ARHGEF12	4653.3909	-1.4716189	0.0894538	-16.45115
ENSG00000114439.19 BBX	1534.9614	-1.2082963	0.0737470	-16.38434
ENSG00000210144.1 MT-TY	287.6235	-6.2542130	0.3872795	-16.14909
ENSG00000173915.16 ATP5MD	382.0800	1.6301546	0.1020193	15.97888
ENSG00000132466.19 ANKRD17	1736.2567	-1.1001706	0.0691250	-15.91566
ENSG00000125686.12 MED1	1067.1058	-1.3450560	0.0853260	-15.76373
ENSG00000210194.1 MT-TE	353.1876	-6.7614006	0.4319016	-15.65496
ENSG00000005339.15 CREBBP	2887.6825	-1.5126863	0.0974960	-15.51537
ENSG00000134644.16 PUM1	1626.2984	-0.9111724	0.0587738	-15.50305
ENSG00000228253.1 MT-ATP8	14517.5823	-3.6537841	0.2356924	-15.50234
ENSG00000185591.10 SP1	2337.4013	-1.1329817	0.0737138	-15.37002
ENSG00000210151.2 MT-TS1	345.7805	-6.2523932	0.4070974	-15.35847
ENSG00000143190.23 POU2F1	678.7954	-1.4347898	0.0937583	-15.30308

dgeb <- dge

Here let’s look at some plots.

MA plot shows the average level and fold change of all detected genes. Volcano plot shows the fold change and the significance, as measured by -log(p-value). Significant genes are shown as red points.

There are heatmaps of the top ranked genes by p-value. Above the gene expression values there is a bar in yellow/orange colours. Yellow shows relatively low QuickDASH score and orange shows high score.

maplot <- function(de,contrast_name) {
  sig <-subset(de, padj < 0.05 )
  up <-rownames(subset(de, padj < 0.05 & log2FoldChange > 0))
  dn <-rownames(subset(de, padj < 0.05 & log2FoldChange < 0))
  GENESUP <- length(up)
  GENESDN <- length(dn)
  DET=nrow(de)
  SUBHEADER = paste(GENESUP, "up, ", GENESDN, "down", DET, "detected")
  ns <-subset(de, padj > 0.05 )
  plot(log2(de$baseMean),de$log2FoldChange, 
       xlab="log2 basemean", ylab="log2 foldchange",
       pch=19, cex=0.5, col="dark gray",
       main=contrast_name, cex.main=1)
  points(log2(sig$baseMean),sig$log2FoldChange,
         pch=19, cex=0.5, col="red")
  mtext(SUBHEADER,cex = 1)
}

make_volcano <- function(de,name) {
    sig <- subset(de,padj<0.05)
    N_SIG=nrow(sig)
    N_UP=nrow(subset(sig,log2FoldChange>0))
    N_DN=nrow(subset(sig,log2FoldChange<0))
    DET=nrow(de)
    HEADER=paste(N_SIG,"@5%FDR,", N_UP, "up", N_DN, "dn", DET, "detected")
    plot(de$log2FoldChange,-log10(de$pval),cex=0.5,pch=19,col="darkgray",
        main=name, xlab="log2 FC", ylab="-log10 pval")
    mtext(HEADER)
    grid()
    points(sig$log2FoldChange,-log10(sig$pval),cex=0.5,pch=19,col="red")
}

make_heatmap <- function(de,name,myss,mx,n=30){

  colfunc <- colorRampPalette(c("blue", "white", "red"))
  values <- as.numeric(myss$OA)
  f <- colorRamp(c("yellow", "orange"))
  rr <- range(values)
  svals <- (values-rr[1])/diff(rr)
  colcols <- rgb(f(svals)/255)
  mxn <- mx/rowSums(mx)*1000000
  x <- mxn[which(rownames(mxn) %in% rownames(head(de,n))),]
  heatmap.2(as.matrix(x),trace="none",col=colfunc(25),scale="row", margins = c(10,15), cexRow=0.5, 
    main=paste("Top ranked",n,"genes in",name) , ColSideColors = colcols  )

}

maplot(dge,"insta vs OA")

make_volcano(dge,"insta vs OA")

make_heatmap(dge,"insta vs OA",ss,xx,n=50)

Pathway analysis with mitch

#download.file("https://reactome.org/download/current/ReactomePathways.gmt.zip", destfile="ReactomePathways.gmt.zip")
#unzip("ReactomePathways.gmt.zip")
genesets <- gmt_import("ReactomePathways.gmt")

# gene table
gt <- as.data.frame(rownames(xx))
gt$gene <- sapply(strsplit(gt[,1]," "),"[[",2)

m <- mitch_import(dge, DEtype="deseq2",geneTable=gt)

## The input is a single dataframe; one contrast only. Converting
##         it to a list for you.

## Note: Mean no. genes in input = 18774

## Note: no. genes in output = 18743

## Note: estimated proportion of input genes in output = 0.998

m1 <- mitch_calc(m, genesets, priority="effect")

## Note: Enrichments with large effect sizes may not be
##             statistically significant.

head(m1$enrichment_result,20) %>% kbl(caption = "Top gene pathway differences using the parametric DE data") %>% kable_paper("hover", full_width = F)

Top gene pathway differences using the parametric DE data
	set	setSize	pANOVA	s.dist	p.adjustANOVA
203	Cohesin Loading onto Chromatin	10	0.0000058	-0.8274916	0.0000431
973	RORA activates gene expression	18	0.0000001	-0.7376354	0.0000006
834	Peptide chain elongation	87	0.0000000	0.7193655	0.0000000
1396	Viral mRNA Translation	87	0.0000000	0.7144563	0.0000000
368	Eukaryotic Translation Elongation	92	0.0000000	0.7134260	0.0000000
1118	Selenocysteine synthesis	91	0.0000000	0.7042995	0.0000000
48	Activation of RAC1	11	0.0000566	-0.7010658	0.0003553
365	Establishment of Sister Chromatid Cohesion	11	0.0000609	-0.6979986	0.0003769
370	Eukaryotic Translation Termination	91	0.0000000	0.6972943	0.0000000
1047	Regulation of signaling by CBL	18	0.0000004	-0.6885269	0.0000036
1096	SRP-dependent cotranslational protein targeting to membrane	110	0.0000000	0.6765026	0.0000000
727	NOTCH2 intracellular domain regulates transcription	10	0.0002294	-0.6727486	0.0011713
1443	tRNA processing in the mitochondrion	30	0.0000000	-0.6639270	0.0000000
411	Formation of a pool of free 40S subunits	99	0.0000000	0.6589267	0.0000000
1062	Response of EIF2AK4 (GCN2) to amino acid deficiency	99	0.0000000	0.6461743	0.0000000
767	Nonsense Mediated Decay (NMD) independent of the Exon Junction Complex (EJC)	93	0.0000000	0.6455533	0.0000000
1117	Selenoamino acid metabolism	113	0.0000000	0.6429453	0.0000000
142	CREB1 phosphorylation through the activation of Adenylate Cyclase	11	0.0002401	-0.6394308	0.0012217
813	PI3K events in ERBB2 signaling	13	0.0001156	-0.6174545	0.0006506
16	APC/C:Cdc20 mediated degradation of mitotic proteins	73	0.0000000	0.6166820	0.0000000

m1top <- subset(m1$enrichment_result,p.adjustANOVA<0.05)
m1up <- subset(m1top,s.dist>0)$set
m1dn <- subset(m1top,s.dist<0)$set

Nonparametric analysis

Looking at gene ranks might be a better option for comparisons across batches.

The rank() function give the rank from lowest to highest and I’ll use this for wilcox test.

r <- apply(xx,2,rank)

ctrl <- which(colnames(r) <70)
case <- which(colnames(r) >70)

suppressWarnings(
wtres <- t(apply(r,1,function(x) {
  wt <- wilcox.test(x[case] ,  x[ctrl] )
  meanrankdiff <- mean(x[case]) - mean(x[ctrl])
  unname(c(meanrankdiff,wt$p.value))
})))

colnames(wtres) <- c("meanrankdiff","pvalue")
wtres <- as.data.frame(wtres)
wtres <- wtres[order(wtres$pvalue),]
wtres$fdr <- p.adjust(wtres$pvalue)

top <- head(wtres,50)
top <- top[order(top$meanrankdiff),]

plot(top$meanrankdiff,1:nrow(top),bty="none",xlab="mean rank difference",ylab="gene",pch=19,cex=2,col="gray",main="Top differentially regulated genes")
labels <- sapply(strsplit(rownames(top)," "),"[[",2)
text(top$meanrankdiff,1:nrow(top),labels=labels,cex=0.7)

top

##                             meanrankdiff       pvalue        fdr
## ENSG00000029534.21 ANK1       -4705.0897 2.207038e-06 0.04143493
## ENSG00000047648.23 ARHGAP6    -4131.7756 2.207038e-06 0.04143493
## ENSG00000048052.23 HDAC9      -3601.2244 2.207038e-06 0.04143493
## ENSG00000011523.14 CEP68      -3571.4359 2.207038e-06 0.04143493
## ENSG00000023516.9 AKAP11      -3449.0000 2.207038e-06 0.04143493
## ENSG00000011114.15 BTBD7      -3094.7949 2.207038e-06 0.04143493
## ENSG00000007944.15 MYLIP      -3065.1923 2.207038e-06 0.04143493
## ENSG00000008086.13 CDKL5      -2621.5256 2.207038e-06 0.04143493
## ENSG00000048405.10 ZNF800     -2611.4679 2.207038e-06 0.04143493
## ENSG00000031081.11 ARHGAP31   -2575.7564 2.207038e-06 0.04143493
## ENSG00000033327.13 GAB2       -2449.7372 2.207038e-06 0.04143493
## ENSG00000053254.16 FOXN3      -2312.0513 2.207038e-06 0.04143493
## ENSG00000043143.21 JADE2      -2263.0192 2.207038e-06 0.04143493
## ENSG00000008853.17 RHOBTB2    -2108.1026 2.207038e-06 0.04143493
## ENSG00000033800.14 PIAS1      -2003.0641 2.207038e-06 0.04143493
## ENSG00000049618.24 ARID1B     -1971.8205 2.207038e-06 0.04143493
## ENSG00000048707.15 VPS13D     -1915.5962 2.207038e-06 0.04143493
## ENSG00000025293.17 PHF20      -1594.8654 2.207038e-06 0.04143493
## ENSG00000012232.9 EXTL3       -1590.0513 2.207038e-06 0.04143493
## ENSG00000010292.13 NCAPD2     -1476.1218 2.207038e-06 0.04143493
## ENSG00000019995.6 ZRANB1      -1468.2179 2.207038e-06 0.04143493
## ENSG00000017797.13 RALBP1      -761.2244 2.207038e-06 0.04143493
## ENSG00000002834.18 LASP1       -307.4295 2.207038e-06 0.04143493
## ENSG00000047315.17 POLR2B       683.4679 2.207038e-06 0.04143493
## ENSG00000010244.19 ZNF207       707.3205 2.207038e-06 0.04143493
## ENSG00000003756.17 RBM5        1037.8077 2.207038e-06 0.04143493
## ENSG00000010295.20 IFFO1       1113.7949 2.207038e-06 0.04143493
## ENSG00000004534.15 RBM6        1269.6731 2.207038e-06 0.04143493
## ENSG00000008018.9 PSMB1        1298.3013 2.207038e-06 0.04143493
## ENSG00000006530.18 AGK         1373.7115 2.207038e-06 0.04143493
## ENSG00000037474.15 NSUN2       1417.9423 2.207038e-06 0.04143493
## ENSG00000003509.16 NDUFAF7     1635.5128 2.207038e-06 0.04143493
## ENSG00000041357.16 PSMA4       1731.2564 2.207038e-06 0.04143493
## ENSG00000011009.11 LYPLA2      1778.0385 2.207038e-06 0.04143493
## ENSG00000011260.14 UTP18       1856.5385 2.207038e-06 0.04143493
## ENSG00000023041.11 ZDHHC6      1924.8141 2.207038e-06 0.04143493
## ENSG00000006015.18 REX1BD      1933.6667 2.207038e-06 0.04143493
## ENSG00000016864.19 GLT8D1      1945.6474 2.207038e-06 0.04143493
## ENSG00000051128.19 HOMER3      1951.7821 2.207038e-06 0.04143493
## ENSG00000050426.16 LETMD1      1967.0449 2.207038e-06 0.04143493
## ENSG00000013375.16 PGM3        2021.3846 2.207038e-06 0.04143493
## ENSG00000011243.19 AKAP8L      2032.3269 2.207038e-06 0.04143493
## ENSG00000050130.18 JKAMP       2109.2756 2.207038e-06 0.04143493
## ENSG00000008324.12 SS18L2      2155.8846 2.207038e-06 0.04143493
## ENSG00000037241.7 RPL26L1      2289.0256 2.207038e-06 0.04143493
## ENSG00000004779.10 NDUFAB1     2339.1346 2.207038e-06 0.04143493
## ENSG00000014641.21 MDH1        2552.0000 2.207038e-06 0.04143493
## ENSG00000008128.23 CDK11A      2835.3974 2.207038e-06 0.04143493
## ENSG00000007392.17 LUC7L       3942.3654 2.207038e-06 0.04143493
## ENSG00000012223.13 LTF         7282.1282 2.207038e-06 0.04143493

Pathway enrichment

Here I’m using the mitch package and gene pathways from Reactome to understand the affected pathways separately for each tissue type.

Reactome pathways downloaded 9th Dec 2021.

wtres$stat <- -log10(wtres$pvalue) * sign(wtres$meanrankdiff)
wtres$gene <- sapply(strsplit(rownames(wtres)," "),"[[",2)

wtres2<-wtres[,c("stat","gene")]
wtres2 <- aggregate(. ~ gene, wtres2, max)
rownames(wtres2) <- wtres2$gene
wtres2$gene=NULL

m2 <- mitch_calc(wtres2, genesets, priority="effect")

## Warning in mitch_rank(input_profile): Warning: >60% of genes have the same score. This isn't
##             optimal for rank based enrichment analysis.

## Note: Enrichments with large effect sizes may not be
##             statistically significant.

head(m2$enrichment_result,20) %>% kbl(caption = "Top gene pathway differences using the nonparametric DE data") %>% kable_paper("hover", full_width = F)

Top gene pathway differences using the nonparametric DE data
	set	setSize	pANOVA	s.dist	p.adjustANOVA
203	Cohesin Loading onto Chromatin	10	0.0000982	-0.7111728	0.0005789
1396	Viral mRNA Translation	87	0.0000000	0.7098533	0.0000000
834	Peptide chain elongation	87	0.0000000	0.7088817	0.0000000
368	Eukaryotic Translation Elongation	92	0.0000000	0.7033858	0.0000000
1118	Selenocysteine synthesis	91	0.0000000	0.6966604	0.0000000
973	RORA activates gene expression	18	0.0000003	-0.6963952	0.0000028
1096	SRP-dependent cotranslational protein targeting to membrane	110	0.0000000	0.6943214	0.0000000
370	Eukaryotic Translation Termination	91	0.0000000	0.6926406	0.0000000
365	Establishment of Sister Chromatid Cohesion	11	0.0000825	-0.6854144	0.0004944
411	Formation of a pool of free 40S subunits	99	0.0000000	0.6778735	0.0000000
240	Cytosolic tRNA aminoacylation	24	0.0000000	0.6774200	0.0000001
1047	Regulation of signaling by CBL	18	0.0000008	-0.6704376	0.0000070
402	Folding of actin by CCT/TriC	10	0.0002447	0.6696685	0.0013147
142	CREB1 phosphorylation through the activation of Adenylate Cyclase	11	0.0001267	-0.6672442	0.0007266
1381	Ubiquitin Mediated Degradation of Phosphorylated Cdc25A	50	0.0000000	0.6505483	0.0000000
1430	p53-Independent DNA Damage Response	50	0.0000000	0.6505483	0.0000000
1431	p53-Independent G1/S DNA damage checkpoint	50	0.0000000	0.6505483	0.0000000
1117	Selenoamino acid metabolism	113	0.0000000	0.6441685	0.0000000
767	Nonsense Mediated Decay (NMD) independent of the Exon Junction Complex (EJC)	93	0.0000000	0.6440751	0.0000000
48	Activation of RAC1	11	0.0002301	-0.6412702	0.0012451

m2top <- subset(m2$enrichment_result,p.adjustANOVA<0.05)    
m2up <- subset(m2top,s.dist>0)$set
m2dn <- subset(m2top,s.dist<0)$set

Compare parametric and nonparametric analyses

These analyses gave similar results, so it suggests that there might not be such a huge batch effect impacting the results.

v0 <- list("para up"=m1up,"para dn"=m1dn, "nonp up"=m2up, "nonp dn"=m2dn)

str(v0)

## List of 4
##  $ para up: chr [1:308] "Peptide chain elongation" "Viral mRNA Translation" "Eukaryotic Translation Elongation" "Selenocysteine synthesis" ...
##  $ para dn: chr [1:297] "Cohesin Loading onto Chromatin" "RORA activates gene expression" "Activation of RAC1" "Establishment of Sister Chromatid Cohesion" ...
##  $ nonp up: chr [1:314] "Viral mRNA Translation" "Peptide chain elongation" "Eukaryotic Translation Elongation" "Selenocysteine synthesis" ...
##  $ nonp dn: chr [1:235] "Cohesin Loading onto Chromatin" "RORA activates gene expression" "Establishment of Sister Chromatid Cohesion" "Regulation of signaling by CBL" ...

plot(euler(v0),quantities = TRUE, edges = "gray", main="Parametric and nonparametric DE analysis")

Conclusion

Based on the gene expression data, my guess is that there is no major technical batch effect between the two sample groups. That said, we can’t be 100% sure that the differences observed are due to biological differences and not batch effects. To be certain, it would be a good idea to perform some validation analysis, either by testing the expression of some of these genes with another method like RTqPCR, or testing pathway activation biochemically.

As OA is thought to associate with inflammation, I was expecting to see these pathways activated. However OA is different to rheumatoid arthritis which definitely has an autoimmune component to it.

What we did observe was a downregulation of many signaling pathways such as RORA, RAC1, NOTCH2, CREB1, PI3K and others, while translation, amino acid recycling, nonsense mediated decay, degradation of mitotic proteins, and p53 pathways were upregulated.

Session information

For reproducibility.

sessionInfo()

## R version 4.1.2 (2021-11-01)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.3 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] kableExtra_1.3.4            topconfects_1.8.0          
##  [3] limma_3.48.1                eulerr_6.1.0               
##  [5] mitch_1.4.0                 MASS_7.3-54                
##  [7] fgsea_1.18.0                gplots_3.1.1               
##  [9] DESeq2_1.32.0               SummarizedExperiment_1.22.0
## [11] Biobase_2.52.0              MatrixGenerics_1.4.0       
## [13] matrixStats_0.59.0          GenomicRanges_1.44.0       
## [15] GenomeInfoDb_1.28.1         IRanges_2.26.0             
## [17] S4Vectors_0.30.0            BiocGenerics_0.38.0        
## [19] reshape2_1.4.4              forcats_0.5.1              
## [21] stringr_1.4.0               dplyr_1.0.7                
## [23] purrr_0.3.4                 readr_2.0.0                
## [25] tidyr_1.1.3                 tibble_3.1.3               
## [27] ggplot2_3.3.5               tidyverse_1.3.1            
## [29] zoo_1.8-9                  
## 
## loaded via a namespace (and not attached):
##   [1] colorspace_2.0-2       ellipsis_0.3.2         XVector_0.32.0        
##   [4] fs_1.5.0               rstudioapi_0.13        bit64_4.0.5           
##   [7] AnnotationDbi_1.54.1   fansi_0.5.0            lubridate_1.7.10      
##  [10] xml2_1.3.2             splines_4.1.2          cachem_1.0.5          
##  [13] knitr_1.33             geneplotter_1.70.0     polyclip_1.10-0       
##  [16] jsonlite_1.7.2         broom_0.7.8            annotate_1.70.0       
##  [19] dbplyr_2.1.1           png_0.1-7              shiny_1.6.0           
##  [22] compiler_4.1.2         httr_1.4.2             backports_1.2.1       
##  [25] assertthat_0.2.1       Matrix_1.3-4           fastmap_1.1.0         
##  [28] cli_3.0.1              later_1.2.0            htmltools_0.5.1.1     
##  [31] tools_4.1.2            gtable_0.3.0           glue_1.4.2            
##  [34] GenomeInfoDbData_1.2.6 fastmatch_1.1-3        Rcpp_1.0.7            
##  [37] jquerylib_0.1.4        cellranger_1.1.0       vctrs_0.3.8           
##  [40] Biostrings_2.60.1      svglite_2.0.0          polylabelr_0.2.0      
##  [43] xfun_0.24              rvest_1.0.0            mime_0.11             
##  [46] lifecycle_1.0.0        gtools_3.9.2           XML_3.99-0.6          
##  [49] zlibbioc_1.38.0        scales_1.1.1           hms_1.1.0             
##  [52] promises_1.2.0.1       RColorBrewer_1.1-2     yaml_2.2.1            
##  [55] memoise_2.0.0          gridExtra_2.3          sass_0.4.0            
##  [58] reshape_0.8.8          stringi_1.7.3          RSQLite_2.2.7         
##  [61] highr_0.9              genefilter_1.74.0      caTools_1.18.2        
##  [64] BiocParallel_1.26.1    echarts4r_0.4.1        systemfonts_1.0.2     
##  [67] rlang_0.4.11           pkgconfig_2.0.3        bitops_1.0-7          
##  [70] evaluate_0.14          lattice_0.20-45        htmlwidgets_1.5.3     
##  [73] bit_4.0.4              tidyselect_1.1.1       GGally_2.1.2          
##  [76] plyr_1.8.6             magrittr_2.0.1         R6_2.5.0              
##  [79] generics_0.1.0         DelayedArray_0.18.0    DBI_1.1.1             
##  [82] pillar_1.6.1           haven_2.4.1            withr_2.4.2           
##  [85] survival_3.2-13        KEGGREST_1.32.0        RCurl_1.98-1.3        
##  [88] modelr_0.1.8           crayon_1.4.1           KernSmooth_2.23-20    
##  [91] utf8_1.2.2             rmarkdown_2.9          tzdb_0.1.2            
##  [94] locfit_1.5-9.4         grid_4.1.2             readxl_1.3.1          
##  [97] data.table_1.14.0      blob_1.2.2             webshot_0.5.2         
## [100] reprex_2.0.0           digest_0.6.27          xtable_1.8-4          
## [103] httpuv_1.6.1           munsell_0.5.0          viridisLite_0.4.0     
## [106] beeswarm_0.4.0         bslib_0.2.5.1

Shoulder transcriptome analysis: instability vs osteoarthritis

Mark Ziemann

2021-12-09