Re-analysing Patchett 2020 data

Background

Goal: re-analyse previously published DFTD tumour biopsy transcriptomes and compare to DFTD cell line transcriptomes, focusing on genes involved in metabolism. Below is a list of the samples for this project.

ss <- read.table("../ss_patchett.txt",sep="\t",fill=TRUE,header=TRUE)
ss$DFT <- as.factor(ss$DFT)

ss %>%
  kbl(caption="Sample sheet for all samples") %>%
  kable_paper("hover", full_width = F)

Sample sheet for all samples
run_accession	sample_id	DFT	sample	sample_type
ERR2804403	sha_DFT1_1-1	DFT1	DFT1_1	biopsy
ERR2804404	sha_DFT1_1-2	DFT1	DFT1_1	biopsy
ERR2804405	sha_DFT1_2-1	DFT1	DFT1_2	biopsy
ERR2804406	sha_DFT1_2-2	DFT1	DFT1_2	biopsy
ERR2804407	sha_DFT2_1-1	DFT2	DFT2_1	biopsy
ERR2804408	sha_DFT2_1-2	DFT2	DFT2_1	biopsy
ERR2804409	sha_DFT2_2-1	DFT2	DFT2_2	biopsy
ERR2804410	sha_DFT2_2-2	DFT2	DFT2_2	biopsy
ERR2852729	sha_DFT2_RV_2-2	DFT2	RV_2	cell_line
ERR2852728	sha_DFT2_RV_2-1	DFT2	RV_2	cell_line
ERR2852727	sha_DFT2_RV_1-2	DFT2	RV_1	cell_line
ERR2852726	sha_DFT2_RV_1-1	DFT2	RV_1	cell_line
SRR6266557	C5065_UT_01	DFT1	C5065	cell_line
SRR6266556	C5065_UT_02	DFT1	C5065	cell_line

Functions

maplot <- function(de,contrast_name) {
  sig <-subset(de, padj < 0.05 )
  up <-rownames(subset(de, padj < 0.05 & log2FoldChange > 0))
  dn <-rownames(subset(de, padj < 0.05 & log2FoldChange < 0))
  GENESUP <- length(up)
  GENESDN <- length(dn)
  DET=nrow(de)
  SUBHEADER = paste(GENESUP, "up, ", GENESDN, "down", DET, "detected")
  ns <-subset(de, padj > 0.05 )
  plot(log2(de$baseMean),de$log2FoldChange,
       xlab="log2 basemean", ylab="log2 foldchange",
       pch=19, cex=1, col="dark gray",
       main=contrast_name, cex.main=0.7)
  points(log2(sig$baseMean),sig$log2FoldChange,
         pch=19, cex=1, col="red")
  mtext(SUBHEADER,cex = 0.7)
  grid()
}

make_volcano <- function(de,name) {
    sig <- subset(de,padj<0.05)
    N_SIG=nrow(sig)
    N_UP=nrow(subset(sig,log2FoldChange>0))
    N_DN=nrow(subset(sig,log2FoldChange<0))
    DET=nrow(de)
    HEADER=paste(N_SIG,"@5%FDR,", N_UP, "up", N_DN, "dn", DET, "detected")
    plot(de$log2FoldChange,-log10(de$padj),cex=1,pch=19,col="darkgray",
        main=name, xlab="log2 FC", ylab="-log10 pval")
    mtext(HEADER)
    grid()
    points(sig$log2FoldChange,-log10(sig$padj),cex=1,pch=19,col="red")
}

Load data

Import data from the aligner.

tmp <- read.table("../fastq/3col_patchett.tsv.gz")
tmp$V1 <- gsub('.fastq-trimmed.fastq','',tmp$V1)
x <- as.data.frame(acast(tmp, V2~V1, value.var="V3",fun.aggregate=sum))

tmp_old <- read.table("../fastq/3col_old.tsv.gz")
tmp_old$V1 <- gsub('.fastq-trimmed.fastq','',tmp_old$V1)
x_old <- as.data.frame(acast(tmp_old, V2~V1, value.var="V3",fun.aggregate=sum))

Load gene names.

gn <- read.table("../ref/Sarcophilus_harrisii.mSarHar1.11.cdna+ncrna.genenames.tsv",fill=TRUE)
gn <- gn[order(gn$V1),]

gn_old <- read.table("../ref/Sarcophilus_harrisii.DEVIL7.0.cdna+ncrna.genenames.tsv",fill=TRUE)
gn_old <- gn_old[order(gn_old$V1),]

hm <- read.table("../ref/mart_export_ensembl109_2023-07-14.txt",sep="\t",header=TRUE)

hm_old <- read.table("../ref/mart_export_ensembl101_2023-09-06.txt",sep="\t",header=TRUE)

Now need to collapse transcript data to genes.

x$gene <- paste(gn$V2,gn$V3)

y <- aggregate(. ~ gene,x,sum)

rownames(y) <- y$gene
y$gene = NULL


x_old$gene <- paste(gn_old$V2,gn_old$V3)

y_old <- aggregate(. ~ gene,x_old,sum)

rownames(y_old) <- y_old$gene
y_old$gene = NULL

Quality control

Samples with <1M reads should be omitted. Will also round values to integers.

cs <- colSums(y)
cs <- cs[order(cs)]

par(mar=c(5,10,5,2))
barplot(cs,main="All samples",horiz=TRUE,las=1)
abline(v=1e7,col="red",lty=2)

y <- round(y)


cs_old <- colSums(y_old)
cs_old <- cs_old[order(cs_old)]

par(mar=c(5,10,5,2))
barplot(cs_old,main="All samples - old genome ",horiz=TRUE,las=1, xli=c(0,4e+07))
abline(v=1e7,col="red",lty=2)

y_old <- round(y_old)

MDS

This will help us to visualise the sources of variability in the overall dataset.

Plot MDS. Also fix the sample names.

Plot MDS by DFT.

par(mar = c(5.1, 4.1, 4.1, 2.1))

colnames(y) <- ss$sample_id

cs <- colSums(y)
cs <- cs[order(cs)]

par(mar=c(5,10,5,2))
barplot(cs,main="All samples",horiz=TRUE,las=1)

par(mar = c(5.1, 4.1, 4.1, 2.1))

colnames(y_old) <- ss$sample_id[1:8]

cs_old <- colSums(y_old)
cs_old <- cs_old[order(cs_old)]

par(mar=c(5,10,5,2))
barplot(cs_old,main="All samples - old genome",horiz=TRUE,las=1, xlim=c(0,4e+07))

par(mar = c(5.1, 4.1, 4.1, 2.1) )

cols <- ss$DFT
cols <- gsub("DFT1","lightblue",cols)
cols <- gsub("DFT2","pink",cols)
mymds <- plotMDS(y,plot=FALSE)

mymds_old <- plotMDS(y_old,plot=FALSE)

# fix the xlims
XMIN=min(mymds$x)
XMAX=max(mymds$x)
XMID=(XMAX+XMIN)/2
XMIN <- XMID + (XMIN-XMID)*1.1
XMAX <- XMID+(XMAX-XMID)*1.1
plotMDS(mymds,pch=19,cex=3,col=cols,main="MDS plot",xlim=c(XMIN,XMAX))
text(mymds,labels=colnames(y))
mtext("blue=DFT1, pink=DFT2")

plotMDS(mymds_old,pch=19,cex=3,col=cols,main="MDS plot - old genome",xlim=c(XMIN,XMAX))
text(mymds_old,labels=colnames(y_old))
mtext("blue=DFT1, pink=DFT2")

y_DFT1 <- y[,colnames(y) %in% c("sha_DFT1_1-1","sha_DFT1_1-2","sha_DFT1_2-1","sha_DFT1_2-2",
                                "C5065_UT_01", "C5065_UT_02")]
y_DFT2 <- y[,colnames(y) %in% c("sha_DFT2_1-1","sha_DFT2_1-2","sha_DFT2_2-1","sha_DFT2_2-2",
                                "sha_DFT2_RV_1-1","sha_DFT2_RV_1-2","sha_DFT2_RV_2-1","sha_DFT2_RV_2-2")] 

y_biopsy <- y[,colnames(y) %in% c("sha_DFT1_1-1","sha_DFT1_1-2","sha_DFT1_2-1","sha_DFT1_2-2",
                                  "sha_DFT2_1-1","sha_DFT2_1-2","sha_DFT2_2-1","sha_DFT2_2-2")]
y_cline <- y[,colnames(y) %in% c( "C5065_UT_01", "C5065_UT_02",
                                "sha_DFT2_RV_1-1","sha_DFT2_RV_1-2","sha_DFT2_RV_2-1","sha_DFT2_RV_2-2")] 

mdsDFT1 <- plotMDS(y_DFT1,plot=FALSE)
plotMDS(mdsDFT1,pch=19,cex=3,col="pink",main="MDS plot",xlim=c(XMIN,XMAX))
text(mdsDFT1,labels=colnames(y_DFT1))
mtext("DFT1")

mdsDFT2 <- plotMDS(y_DFT2,plot=FALSE)
plotMDS(mdsDFT2,pch=19,cex=3,col="lightblue",main="MDS plot",xlim=c(XMIN,XMAX))
text(mdsDFT2, labels=colnames(y_DFT2))
mtext("DFT2")

mds_biopsy <- plotMDS(y_biopsy,plot=FALSE)
plotMDS(mds_biopsy,pch=19,cex=3,col=cols,main="MDS plot",xlim=c(XMIN,XMAX))
text(mds_biopsy,labels=colnames(y_biopsy))
mtext("Tumour biopsies: blue=DFT1, pink=DFT2")

mds_cline <- plotMDS(y_cline,plot=FALSE)
plotMDS(mds_cline,pch=19,cex=3,col=cols,main="MDS plot",xlim=c(XMIN,XMAX))
text(mds_cline,labels=colnames(y_cline))
mtext("Cell lines: blue=DFT1, pink=DFT2")

DESeq2

Sum replicates then run differential expression analysis. For now, ignore alignment on old genome and cell line data.

y_cell <- y[,9:14]
y <- y[,1:8]

# combine replicates for each sample
DFT1_1 <- rowSums(y[,ss$sample=="DFT1_1"])
DFT1_2 <- rowSums(y[,ss$sample=="DFT1_2"])
DFT2_1 <- rowSums(y[,ss$sample=="DFT2_1"])
DFT2_2 <- rowSums(y[,ss$sample=="DFT2_2"])

y2 <- data.frame(DFT1_1,DFT1_2,DFT2_1,DFT2_2)
ss2 <- as.data.frame(colnames(y2))
colnames(ss2) <- "sample"
ss2$DFT <- factor(c("DFT1","DFT1","DFT2","DFT2"))

y2 <- y2[which(rowMeans(y2)>10),] # remove genes with average count < 10
dim(y2)

## [1] 15931     4

dds <- DESeqDataSetFromMatrix(countData = y2 , colData = ss2, design = ~ DFT )

## converting counts to integer mode

dds2 <- dds
res <- DESeq(dds)

## estimating size factors

## estimating dispersions

## gene-wise dispersion estimates

## mean-dispersion relationship

## final dispersion estimates

## fitting model and testing

z <- results(res)
vsd <- vst(dds)
zz<-cbind(as.data.frame(z),assay(vsd))
dge<-as.data.frame(zz[order(zz$pvalue),])
dge2 <- dge
sig <- subset(dge,padj<0.05)
sig2_up <- rownames(subset(sig,log2FoldChange>0))
sig2_dn <- rownames(subset(sig,log2FoldChange<0))
length(sig2_up)

## [1] 2536

length(sig2_dn)

## [1] 2033

DE visualisation.

maplot(dge2,"DFT1 vs DFT2 biopsies")

make_volcano(dge2,"DFT1 vs DFT2 biopsies")

sig[1:50,1:6] %>%
  kbl(caption="DFT1 vs DFT2 biopsies") %>%
  kable_paper("hover", full_width = F)

DFT1 vs DFT2 biopsies
	baseMean	log2FoldChange	lfcSE	stat
ENSSHAG00000031627.1 PRX	19373.4283	-8.968977	0.2894364	-30.98773
ENSSHAG00000001230.2 NA	10348.7183	-6.799811	0.2540229	-26.76849
ENSSHAG00000024692.1 NA	4999.3676	-6.180271	0.2714882	-22.76442
ENSSHAG00000028855.1 NA	7373.2103	8.218378	0.3645190	22.54582
ENSSHAG00000012247.2 COL9A1	4015.0597	8.997323	0.4019274	22.38544
ENSSHAG00000022172.1 HAPLN2	6695.1868	-10.289516	0.4742283	-21.69739
ENSSHAG00000008158.2 NA	13177.6995	9.309236	0.4323846	21.52999
ENSSHAG00000021199.1 UGT8	12350.1972	-10.371276	0.4882968	-21.23970
ENSSHAG00000025070.1 NA	1894.6094	-7.072079	0.3410819	-20.73425
ENSSHAG00000011362.2 NA	2280.0991	7.982211	0.3989326	20.00892
ENSSHAG00000030052.1 CFTR	3494.4469	-9.939796	0.5041830	-19.71466
ENSSHAG00000028043.1 NA	9433.1352	-7.017523	0.3623763	-19.36530
ENSSHAG00000000720.2 ANO1	4017.4307	-4.669392	0.2435018	-19.17601
ENSSHAG00000010203.2 NA	3321.1501	-5.790986	0.3031099	-19.10523
ENSSHAG00000013813.2 NA	1817.3236	8.295411	0.4353132	19.05619
ENSSHAG00000028708.1 NA	3398.8519	6.491294	0.3425299	18.95103
ENSSHAG00000024629.1 NA	1343.7559	6.383444	0.3399438	18.77794
ENSSHAG00000016586.2 NA	1835.1747	-6.920324	0.3884226	-17.81648
ENSSHAG00000006184.2 ATCAY	1670.3603	9.361726	0.5279151	17.73339
ENSSHAG00000024135.1 STK32B	2034.3346	9.519390	0.5470800	17.40036
ENSSHAG00000028637.1 NA	4152.7889	-5.630734	0.3245807	-17.34772
ENSSHAG00000014038.2 NA	12218.7615	5.503863	0.3213211	17.12886
ENSSHAG00000014560.2 PLLP	2929.6765	-7.645683	0.4475932	-17.08177
ENSSHAG00000028146.1 NSG1	2831.2843	-6.946736	0.4078914	-17.03085
ENSSHAG00000021888.1 PRSS35	1185.1224	5.763210	0.3446391	16.72245
ENSSHAG00000004800.2 ALKAL2	1461.3473	9.459052	0.5799044	16.31140
ENSSHAG00000028187.1 NA	1148.8575	-5.776665	0.3563146	-16.21226
ENSSHAG00000012895.2 NA	901.0142	-6.245494	0.3879777	-16.09756
ENSSHAG00000006714.2 P3H2	2310.5733	6.110694	0.3858657	15.83632
ENSSHAG00000011465.2 NA	2880.8002	-6.766093	0.4335532	-15.60614
ENSSHAG00000027357.1 DHH	1255.9489	-5.030988	0.3229230	-15.57953
ENSSHAG00000013967.2 ALDOC	2233.7205	-5.021646	0.3241752	-15.49053
ENSSHAG00000016224.2 NECTIN4	882.6687	-4.862932	0.3156142	-15.40783
ENSSHAG00000029081.1 FGF10	632.5331	7.164789	0.4652379	15.40027
ENSSHAG00000031559.1 FHDC1	1379.8599	5.027249	0.3269925	15.37420
ENSSHAG00000014878.2 EFS	987.1933	4.920944	0.3247012	15.15530
ENSSHAG00000020854.1 FMN2	765.5943	-7.665817	0.5113521	-14.99127
ENSSHAG00000008456.2 NOTCH3	3770.3176	4.007920	0.2717511	14.74850
ENSSHAG00000012189.2 NA	1655.7477	4.669329	0.3205060	14.56862
ENSSHAG00000023680.1 EGR3	5429.7319	-4.718157	0.3271562	-14.42173
ENSSHAG00000006005.2 NA	940.6342	7.989749	0.5545270	14.40822
ENSSHAG00000017676.2 ARHGAP22	727.6463	5.957871	0.4175935	14.26715
ENSSHAG00000010663.2 MAP1B	5140.0041	4.022572	0.2836538	14.18127
ENSSHAG00000007917.2 EDNRB	13288.1593	4.328769	0.3052459	14.18125
ENSSHAG00000021589.1 EBF2	996.6855	-6.138741	0.4339999	-14.14457
ENSSHAG00000024394.1 NA	1939.7779	4.717647	0.3350385	14.08091
ENSSHAG00000016955.2 CCDC68	3306.2354	-7.239744	0.5152135	-14.05193
ENSSHAG00000030008.1 NA	533.2343	5.281241	0.3765384	14.02577
ENSSHAG00000010841.2 PCDH7	506.1298	5.613175	0.4019355	13.96536
ENSSHAG00000009869.2 PXDC1	1400.8423	3.930310	0.2815769	13.95821

write.table(dge2,file="dge2.tsv",sep="\t")

# heatmap
mx <- sig[,7:ncol(sig)]
mx <- head(mx,30)
colfunc <- colorRampPalette(c("blue", "white", "red"))
heatmap.2(as.matrix(mx),trace="none",scale="row",
  col=colfunc(25),ColSideColors=cols[3:6],mar=c(8,14),
  cexRow=0.7,cexCol=1)

Enrichment analysis

Need to get human homologs of these genes. These were obtained from Ensembl v109 biomart (website).

rownames(dge2) <- sapply(strsplit(rownames(dge2),"\\."),"[[",1)

hm2 <- hm[hm$Tasmanian.devil.gene.stable.ID != "",]

gt <- hm2[,2:3]
length(unique(gt$Tasmanian.devil.gene.stable.ID))

## [1] 16979

length(unique(gt$Gene.name))

## [1] 16646

for(i in 1:length(gt$Gene.name)){
  if(gt$Gene.name[i]==""){gt$Gene.name[i] <- gt$Tasmanian.devil.gene.stable.ID[i]}
}

Now run mitch for DGE2. The pathways are sourced from Reactome 7th July 2023.

genesets <- gmt_import("../ref/ReactomePathways_2023-07-14.gmt")

m2 <- mitch_import(dge2, DEtype="deseq2", geneTable=gt)

## The input is a single dataframe; one contrast only. Converting
##         it to a list for you.

## Note: Mean no. genes in input = 15931

## Note: no. genes in output = 12859

## Note: estimated proportion of input genes in output = 0.807

head(m2)

##                 x
## A2M    -0.3375456
## A4GALT  4.2591767
## AAAS    1.7285141
## AACS    0.7228111
## AADAC   1.5043061
## AADAT  -0.3392250

res2 <- mitch_calc(m2, genesets, priority="effect",cores=16)

## Note: Enrichments with large effect sizes may not be
##             statistically significant.

head(res2$enrichment_result,20) %>%
  kbl(caption="metabolic pathways") %>%
  kable_paper("hover", full_width = F)

metabolic pathways
	set	setSize	pANOVA	s.dist	p.adjustANOVA
674	Metallothioneins bind metals	10	3.00e-07	0.9370379	0.0000060
195	Classical antibody-mediated complement activation	35	0.00e+00	0.8419660	0.0000000
222	Creation of C4 and C2 activators	38	0.00e+00	0.8380987	0.0000000
121	CD22 mediated BCR regulation	25	0.00e+00	0.8306093	0.0000000
1052	Response to metal ions	13	2.00e-07	0.8292075	0.0000049
549	Initial triggering of complement	44	0.00e+00	0.8284078	0.0000000
1100	Scavenging of heme from plasma	35	0.00e+00	0.8250913	0.0000000
1335	Translocation of ZAP-70 to Immunological synapse	11	4.10e-06	0.8018156	0.0000711
842	Phosphorylation of CD3 and TCR zeta chains	14	3.00e-07	0.7923984	0.0000060
371	FCGR activation	41	0.00e+00	0.7907896	0.0000000
533	Immunoregulatory interactions between a Lymphoid and a non-Lymphoid cell	97	0.00e+00	0.7731094	0.0000000
206	Complement cascade	60	0.00e+00	0.7653827	0.0000000
802	PD-1 signaling	15	3.00e-07	0.7627063	0.0000064
988	Regulation of Complement cascade	56	0.00e+00	0.7608513	0.0000000
180	Chemokine receptors bind chemokines	37	0.00e+00	0.7327229	0.0000000
577	Interleukin-2 signaling	10	7.58e-05	0.7226555	0.0010743
110	Binding and Uptake of Ligands by Scavenger Receptors	63	0.00e+00	0.6951552	0.0000000
1099	Scavenging by Class A Receptors	17	2.80e-06	0.6566505	0.0000496
459	Generation of second messenger molecules	24	0.00e+00	0.6497955	0.0000008
368	FCERI mediated Ca+2 mobilization	57	0.00e+00	0.6414869	0.0000000

top <- subset(res2$enrichment_result,p.adjustANOVA<0.05)

if ( !file.exists("mitch2.html") & nrow(top)>1 ) {
  mitch_report(res2, "mitch2.html")
}

topup <- head(subset(top,s.dist>0),20)

topup %>%
  kbl(caption="Top 20 upregulated pathways") %>%
  kable_paper("hover", full_width = F)

Top 20 upregulated pathways
	set	setSize	pANOVA	s.dist	p.adjustANOVA
674	Metallothioneins bind metals	10	3.00e-07	0.9370379	0.0000060
195	Classical antibody-mediated complement activation	35	0.00e+00	0.8419660	0.0000000
222	Creation of C4 and C2 activators	38	0.00e+00	0.8380987	0.0000000
121	CD22 mediated BCR regulation	25	0.00e+00	0.8306093	0.0000000
1052	Response to metal ions	13	2.00e-07	0.8292075	0.0000049
549	Initial triggering of complement	44	0.00e+00	0.8284078	0.0000000
1100	Scavenging of heme from plasma	35	0.00e+00	0.8250913	0.0000000
1335	Translocation of ZAP-70 to Immunological synapse	11	4.10e-06	0.8018156	0.0000711
842	Phosphorylation of CD3 and TCR zeta chains	14	3.00e-07	0.7923984	0.0000060
371	FCGR activation	41	0.00e+00	0.7907896	0.0000000
533	Immunoregulatory interactions between a Lymphoid and a non-Lymphoid cell	97	0.00e+00	0.7731094	0.0000000
206	Complement cascade	60	0.00e+00	0.7653827	0.0000000
802	PD-1 signaling	15	3.00e-07	0.7627063	0.0000064
988	Regulation of Complement cascade	56	0.00e+00	0.7608513	0.0000000
180	Chemokine receptors bind chemokines	37	0.00e+00	0.7327229	0.0000000
577	Interleukin-2 signaling	10	7.58e-05	0.7226555	0.0010743
110	Binding and Uptake of Ligands by Scavenger Receptors	63	0.00e+00	0.6951552	0.0000000
1099	Scavenging by Class A Receptors	17	2.80e-06	0.6566505	0.0000496
459	Generation of second messenger molecules	24	0.00e+00	0.6497955	0.0000008
368	FCERI mediated Ca+2 mobilization	57	0.00e+00	0.6414869	0.0000000

topdn <- head(subset(top,s.dist<0),20)

topdn %>%
  kbl(caption="Top 20 downregulated pathways") %>%
  kable_paper("hover", full_width = F)

Top 20 downregulated pathways
	set	setSize	pANOVA	s.dist	p.adjustANOVA
281	Degradation of cysteine and homocysteine	11	0.0056613	-0.4818154	0.0456124
330	EGR2 and SOX10-mediated initiation of Schwann cell myelination	28	0.0001144	-0.4213122	0.0015897
194	Class I peroxisomal membrane protein import	17	0.0037161	-0.4065062	0.0323276
178	Chaperone Mediated Autophagy	18	0.0061237	-0.3732575	0.0482409
555	Insulin processing	21	0.0033346	-0.3700621	0.0295527
590	Intraflagellar transport	45	0.0001932	-0.3213395	0.0025135
152	Cargo trafficking to the periciliary membrane	43	0.0027614	-0.2639506	0.0252628
189	Circadian Clock	60	0.0017280	-0.2340743	0.0170157
837	Peroxisomal protein import	53	0.0037920	-0.2300455	0.0327866
1055	Retrograde transport at the Trans-Golgi-Network	48	0.0060099	-0.2293537	0.0476094
188	Cilium Assembly	175	0.0000147	-0.1903699	0.0002340
643	Macroautophagy	115	0.0040649	-0.1553679	0.0349336
96	Autophagy	127	0.0025907	-0.1550925	0.0240109
792	Organelle biogenesis and maintenance	259	0.0002103	-0.1342986	0.0027114

top2 <- rbind(head(topup,10),head(topdn,10))
top2 <- top2[order(-top2$s.dist),]
par(mar=c(5,15,5,2))

if ( nrow(top2)>1 ) {
  barplot(rev(abs(top2$s.dist)),horiz=TRUE,las=1,names.arg=rev(top2$set),
    main="Top REACTOME Pathways",cex.names=0.6,xlab="Enrichment Score",
    col=c(rep("blue",nrow(head(topdn,10))),rep("red",nrow(head(topup,10)))),
    xlim=c(0,0.8))
}

Now to extract metabolism sets and analyse with another mitch run.

hierarchy <- read.table("../ref/gsea/ReactomePathwaysRelation.txt", sep ="\t", header = F, dec =".")
pathways <- read.delim("../ref/gsea/ReactomePathways.txt", header=FALSE)

hierarchy <- hierarchy[grep("HSA", hierarchy$V1), ] # keep only homo sapiens data
pathways <- pathways[grep("HSA", pathways$V1), 1:2] 

metabo_pathways <- "R-HSA-1430728" # reactome code for metabolism 

# this is very bad coding - but it works
# why 325? that is when while loop crashes because all branches of the hierarchy have been looped over
# will have to change this if Reactome data is updated
while(length(metabo_pathways) < 325){
  for(i in 1:length(metabo_pathways)){
    metabo_pathways <- unique(c(metabo_pathways,filter(hierarchy, V1 == metabo_pathways[i])$V2))
    }
  }

mpathways <- c()
for(i in 1:length(metabo_pathways)){mpathways <- c(mpathways,filter(pathways, V1 == metabo_pathways[i])$V2)}

library(magrittr)
genesets3 <- extract(genesets, mpathways)
# remove empty sets (ie hierarchy exists but no gene set is associated to it)
genesets3 <- genesets3[lapply(genesets3,length)>0]

Now run mitch on metabolic pathways only.

# TODO: for some reason this doesn't give any results - except when reducing number of sets
# I am confused by this, ask Mark
# also when no sets enriched, get error when produce mitch report

m3 <- mitch_import(dge2, DEtype="deseq2", geneTable=gt)

## The input is a single dataframe; one contrast only. Converting
##         it to a list for you.

## Note: Mean no. genes in input = 15931

## Note: no. genes in output = 12859

## Note: estimated proportion of input genes in output = 0.807

head(m3)

##                 x
## A2M    -0.3375456
## A4GALT  4.2591767
## AAAS    1.7285141
## AACS    0.7228111
## AADAC   1.5043061
## AADAT  -0.3392250

# maybe something off with 1st row of m3 - fixed it, doesn't change anything

res3 <- mitch_calc(m3, genesets3, priority="effect", cores=16)

## Note: Enrichments with large effect sizes may not be
##             statistically significant.

head(res3$enrichment_result,20) %>%
  kbl(caption="metabolic pathways") %>%
  kable_paper("hover", full_width = F)

metabolic pathways
	set	setSize	pANOVA	s.dist	p.adjustANOVA
39	Biosynthesis of specialized proresolving mediators (SPMs)	11	0.0030352	0.5161751	0.0827414
88	Degradation of cysteine and homocysteine	11	0.0056613	-0.4818154	0.0842911
26	Heme degradation	12	0.0056385	0.4615410	0.0842911
107	Acyl chain remodelling of PI	11	0.0105634	0.4452621	0.1177028
120	Synthesis of Prostaglandins (PG) and Thromboxanes (TX)	13	0.0085667	0.4211547	0.1043579
119	Synthesis of Leukotrienes (LT) and Eoxins (EX)	12	0.0155762	0.4032978	0.1357629
47	Glutamate and glutamine metabolism	12	0.0230345	-0.3789860	0.1543310
82	alpha-linolenic (omega3) and linoleic (omega6) acid metabolism	13	0.0225596	-0.3654056	0.1543310
118	alpha-linolenic acid (ALA) metabolism	13	0.0225596	-0.3654056	0.1543310
87	Triglyceride catabolism	16	0.0114189	0.3653352	0.1177028
129	Dermatan sulfate biosynthesis	10	0.0472731	0.3623473	0.2346148
113	Synthesis of PIPs at the early endosome membrane	15	0.0152200	-0.3620160	0.1357629
20	Nucleotide catabolism	27	0.0012633	0.3585827	0.0616802
108	Acyl chain remodelling of PG	12	0.0330634	0.3553878	0.2043665
40	Sulfur amino acid metabolism	22	0.0073914	-0.3299907	0.0990445
104	Acyl chain remodelling of PC	19	0.0162105	0.3186670	0.1357629
105	Acyl chain remodelling of PS	15	0.0335527	0.3170248	0.2043665
83	Arachidonic acid metabolism	35	0.0030874	0.2891320	0.0827414
81	Carnitine metabolism	14	0.0767701	0.2732358	0.3318450
102	Glutathione synthesis and recycling	10	0.1400903	0.2694996	0.3910854

top <- subset(res3$enrichment_result,p.adjustANOVA<0.05)

if ( !file.exists("mitch3_biopsy.html") & nrow(top)>1 ) {
  mitch_report(res3, "mitch3_biopsy.html")
}

topup <- head(subset(top,s.dist>0),20)

topup %>%
  kbl(caption="Top 20 upregulated metabolic pathways") %>%
  kable_paper("hover", full_width = F)

Top 20 upregulated metabolic pathways
set	setSize	pANOVA	s.dist	p.adjustANOVA

topdn <- head(subset(top,s.dist<0),20)

topdn %>%
  kbl(caption="Top 20 downregulated metabolic pathways") %>%
  kable_paper("hover", full_width = F)

Top 20 downregulated metabolic pathways
set	setSize	pANOVA	s.dist	p.adjustANOVA

top2 <- rbind(head(topup,10),head(topdn,10))
top2 <- top2[order(-top2$s.dist),]
par(mar=c(5,15,5,2))

if ( nrow(top2)>1 ) {
  barplot(rev(abs(top2$s.dist)),horiz=TRUE,las=1,names.arg=rev(top2$set),
    main="Top REACTOME Metabolic Pathways",cex.names=0.6,xlab="Enrichment Score",
    col=c(rep("blue",nrow(head(topdn,10))),rep("red",nrow(head(topup,10)))),
    xlim=c(0,0.8))
}

Session info

sessionInfo()

## R version 4.3.1 (2023-06-16)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.3 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0 
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
## 
## locale:
##  [1] LC_CTYPE=en_AU.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_AU.UTF-8        LC_COLLATE=en_AU.UTF-8    
##  [5] LC_MONETARY=en_AU.UTF-8    LC_MESSAGES=en_AU.UTF-8   
##  [7] LC_PAPER=en_AU.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Australia/Melbourne
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] grid      stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] pkgload_1.3.2.1             GGally_2.1.2               
##  [3] ggplot2_3.4.3               beeswarm_0.4.0             
##  [5] gtools_3.9.4                tibble_3.2.1               
##  [7] echarts4r_0.4.5             magrittr_2.0.3             
##  [9] stringr_1.5.0               pathview_1.40.0            
## [11] circlize_0.4.15             RColorBrewer_1.1-3         
## [13] ComplexHeatmap_2.16.0       tidyr_1.3.0                
## [15] dplyr_1.1.3                 kableExtra_1.3.4           
## [17] limma_3.56.2                mitch_1.12.0               
## [19] DESeq2_1.40.2               SummarizedExperiment_1.30.2
## [21] Biobase_2.60.0              MatrixGenerics_1.12.3      
## [23] matrixStats_1.0.0           GenomicRanges_1.52.0       
## [25] GenomeInfoDb_1.36.3         IRanges_2.34.1             
## [27] S4Vectors_0.38.1            BiocGenerics_0.46.0        
## [29] gplots_3.1.3                reshape2_1.4.4             
## 
## loaded via a namespace (and not attached):
##  [1] rstudioapi_0.15.0       jsonlite_1.8.7          shape_1.4.6            
##  [4] magick_2.7.5            rmarkdown_2.24          GlobalOptions_0.1.2    
##  [7] zlibbioc_1.46.0         vctrs_0.6.3             memoise_2.0.1          
## [10] Cairo_1.6-1             RCurl_1.98-1.12         webshot_0.5.5          
## [13] htmltools_0.5.6         S4Arrays_1.0.6          sass_0.4.7             
## [16] KernSmooth_2.23-22      bslib_0.5.1             htmlwidgets_1.6.2      
## [19] plyr_1.8.8              cachem_1.0.8            mime_0.12              
## [22] lifecycle_1.0.3         iterators_1.0.14        pkgconfig_2.0.3        
## [25] Matrix_1.6-1            R6_2.5.1                fastmap_1.1.1          
## [28] GenomeInfoDbData_1.2.10 shiny_1.7.5             clue_0.3-64            
## [31] digest_0.6.33           colorspace_2.1-0        reshape_0.8.9          
## [34] AnnotationDbi_1.62.2    RSQLite_2.3.1           org.Hs.eg.db_3.17.0    
## [37] fansi_1.0.4             httr_1.4.7              abind_1.4-5            
## [40] compiler_4.3.1          bit64_4.0.5             withr_2.5.0            
## [43] doParallel_1.0.17       BiocParallel_1.34.2     DBI_1.1.3              
## [46] highr_0.10              MASS_7.3-60             DelayedArray_0.26.7    
## [49] rjson_0.2.21            caTools_1.18.2          tools_4.3.1            
## [52] httpuv_1.6.11           glue_1.6.2              promises_1.2.1         
## [55] cluster_2.1.4           generics_0.1.3          gtable_0.3.4           
## [58] xml2_1.3.5              utf8_1.2.3              XVector_0.40.0         
## [61] foreach_1.5.2           pillar_1.9.0            later_1.3.1            
## [64] lattice_0.21-8          bit_4.0.5               tidyselect_1.2.0       
## [67] locfit_1.5-9.8          Biostrings_2.68.1       knitr_1.44             
## [70] gridExtra_2.3           svglite_2.1.1           xfun_0.40              
## [73] KEGGgraph_1.60.0        stringi_1.7.12          yaml_2.3.7             
## [76] evaluate_0.21           codetools_0.2-19        Rgraphviz_2.44.0       
## [79] graph_1.78.0            cli_3.6.1               xtable_1.8-4           
## [82] systemfonts_1.0.4       munsell_0.5.0           jquerylib_0.1.4        
## [85] Rcpp_1.0.11             png_0.1-8               XML_3.99-0.14          
## [88] parallel_4.3.1          ellipsis_0.3.2          blob_1.2.4             
## [91] bitops_1.0-7            viridisLite_0.4.2       scales_1.2.1           
## [94] purrr_1.0.2             crayon_1.5.2            GetoptLong_1.0.5       
## [97] rlang_1.1.1             KEGGREST_1.40.0         rvest_1.0.3