Source codes: https://github.com/Anne-LiseGerard/dftd_rnaseq Data: https://doi.org/10.1007/s00018-019-03259-2

suppressPackageStartupMessages({
    library("reshape2")
    library("gplots")
    library("DESeq2")
    library("mitch")
    library("limma")
    library("kableExtra")
    library("dplyr")
    library("tidyr")
    library("ComplexHeatmap")
    library("RColorBrewer")
    library("circlize")
    library("pathview")
    library("stringr")
    library("grid")
})

Background

Goal: re-analyse previously published DFTD tumour biopsy transcriptomes and compare to DFTD cell line transcriptomes, focusing on genes involved in metabolism. Below is a list of the samples for this project.

ss <- read.table("../ss_patchett.txt",sep="\t",fill=TRUE,header=TRUE)
ss$DFT <- as.factor(ss$DFT)

ss %>%
  kbl(caption="Sample sheet for all samples") %>%
  kable_paper("hover", full_width = F)
Sample sheet for all samples
run_accession sample_id DFT sample sample_type
ERR2804403 sha_DFT1_1-1 DFT1 DFT1_1 biopsy
ERR2804404 sha_DFT1_1-2 DFT1 DFT1_1 biopsy
ERR2804405 sha_DFT1_2-1 DFT1 DFT1_2 biopsy
ERR2804406 sha_DFT1_2-2 DFT1 DFT1_2 biopsy
ERR2804407 sha_DFT2_1-1 DFT2 DFT2_1 biopsy
ERR2804408 sha_DFT2_1-2 DFT2 DFT2_1 biopsy
ERR2804409 sha_DFT2_2-1 DFT2 DFT2_2 biopsy
ERR2804410 sha_DFT2_2-2 DFT2 DFT2_2 biopsy
ERR2852729 sha_DFT2_RV_2-2 DFT2 RV_2 cell_line
ERR2852728 sha_DFT2_RV_2-1 DFT2 RV_2 cell_line
ERR2852727 sha_DFT2_RV_1-2 DFT2 RV_1 cell_line
ERR2852726 sha_DFT2_RV_1-1 DFT2 RV_1 cell_line
SRR6266557 C5065_UT_01 DFT1 C5065 cell_line
SRR6266556 C5065_UT_02 DFT1 C5065 cell_line

Functions

maplot <- function(de,contrast_name) {
  sig <-subset(de, padj < 0.05 )
  up <-rownames(subset(de, padj < 0.05 & log2FoldChange > 0))
  dn <-rownames(subset(de, padj < 0.05 & log2FoldChange < 0))
  GENESUP <- length(up)
  GENESDN <- length(dn)
  DET=nrow(de)
  SUBHEADER = paste(GENESUP, "up, ", GENESDN, "down", DET, "detected")
  ns <-subset(de, padj > 0.05 )
  plot(log2(de$baseMean),de$log2FoldChange,
       xlab="log2 basemean", ylab="log2 foldchange",
       pch=19, cex=1, col="dark gray",
       main=contrast_name, cex.main=0.7)
  points(log2(sig$baseMean),sig$log2FoldChange,
         pch=19, cex=1, col="red")
  mtext(SUBHEADER,cex = 0.7)
  grid()
}

make_volcano <- function(de,name) {
    sig <- subset(de,padj<0.05)
    N_SIG=nrow(sig)
    N_UP=nrow(subset(sig,log2FoldChange>0))
    N_DN=nrow(subset(sig,log2FoldChange<0))
    DET=nrow(de)
    HEADER=paste(N_SIG,"@5%FDR,", N_UP, "up", N_DN, "dn", DET, "detected")
    plot(de$log2FoldChange,-log10(de$padj),cex=1,pch=19,col="darkgray",
        main=name, xlab="log2 FC", ylab="-log10 pval")
    mtext(HEADER)
    grid()
    points(sig$log2FoldChange,-log10(sig$padj),cex=1,pch=19,col="red")
}

Load data

Import data from the aligner.

tmp <- read.table("../fastq/3col_patchett.tsv.gz")
tmp$V1 <- gsub('.fastq-trimmed.fastq','',tmp$V1)
x <- as.data.frame(acast(tmp, V2~V1, value.var="V3",fun.aggregate=sum))

tmp_old <- read.table("../fastq/3col_old.tsv.gz")
tmp_old$V1 <- gsub('.fastq-trimmed.fastq','',tmp_old$V1)
x_old <- as.data.frame(acast(tmp_old, V2~V1, value.var="V3",fun.aggregate=sum))

Load gene names.

gn <- read.table("../ref/Sarcophilus_harrisii.mSarHar1.11.cdna+ncrna.genenames.tsv",fill=TRUE)
gn <- gn[order(gn$V1),]

gn_old <- read.table("../ref/Sarcophilus_harrisii.DEVIL7.0.cdna+ncrna.genenames.tsv",fill=TRUE)
gn_old <- gn_old[order(gn_old$V1),]
hm <- read.table("../ref/mart_export_ensembl109_2023-07-14.txt",sep="\t",header=TRUE)

hm_old <- read.table("../ref/mart_export_ensembl101_2023-09-06.txt",sep="\t",header=TRUE)

Now need to collapse transcript data to genes.

x$gene <- paste(gn$V2,gn$V3)

y <- aggregate(. ~ gene,x,sum)

rownames(y) <- y$gene
y$gene = NULL


x_old$gene <- paste(gn_old$V2,gn_old$V3)

y_old <- aggregate(. ~ gene,x_old,sum)

rownames(y_old) <- y_old$gene
y_old$gene = NULL

Quality control

Samples with <1M reads should be omitted. Will also round values to integers.

cs <- colSums(y)
cs <- cs[order(cs)]

par(mar=c(5,10,5,2))
barplot(cs,main="All samples",horiz=TRUE,las=1)
abline(v=1e7,col="red",lty=2)

y <- round(y)


cs_old <- colSums(y_old)
cs_old <- cs_old[order(cs_old)]

par(mar=c(5,10,5,2))
barplot(cs_old,main="All samples - old genome ",horiz=TRUE,las=1, xli=c(0,4e+07))
abline(v=1e7,col="red",lty=2)

y_old <- round(y_old)

MDS

This will help us to visualise the sources of variability in the overall dataset.

Plot MDS. Also fix the sample names.

Plot MDS by DFT.

par(mar = c(5.1, 4.1, 4.1, 2.1))

colnames(y) <- ss$sample_id

cs <- colSums(y)
cs <- cs[order(cs)]

par(mar=c(5,10,5,2))
barplot(cs,main="All samples",horiz=TRUE,las=1)

par(mar = c(5.1, 4.1, 4.1, 2.1))

colnames(y_old) <- ss$sample_id[1:8]

cs_old <- colSums(y_old)
cs_old <- cs_old[order(cs_old)]

par(mar=c(5,10,5,2))
barplot(cs_old,main="All samples - old genome",horiz=TRUE,las=1, xlim=c(0,4e+07))

par(mar = c(5.1, 4.1, 4.1, 2.1) )

cols <- ss$DFT
cols <- gsub("DFT1","lightblue",cols)
cols <- gsub("DFT2","pink",cols)
mymds <- plotMDS(y,plot=FALSE)

mymds_old <- plotMDS(y_old,plot=FALSE)

# fix the xlims
XMIN=min(mymds$x)
XMAX=max(mymds$x)
XMID=(XMAX+XMIN)/2
XMIN <- XMID + (XMIN-XMID)*1.1
XMAX <- XMID+(XMAX-XMID)*1.1
plotMDS(mymds,pch=19,cex=3,col=cols,main="MDS plot",xlim=c(XMIN,XMAX))
text(mymds,labels=colnames(y))
mtext("blue=DFT1, pink=DFT2")

plotMDS(mymds_old,pch=19,cex=3,col=cols,main="MDS plot - old genome",xlim=c(XMIN,XMAX))
text(mymds_old,labels=colnames(y_old))
mtext("blue=DFT1, pink=DFT2")

y_DFT1 <- y[,colnames(y) %in% c("sha_DFT1_1-1","sha_DFT1_1-2","sha_DFT1_2-1","sha_DFT1_2-2",
                                "C5065_UT_01", "C5065_UT_02")]
y_DFT2 <- y[,colnames(y) %in% c("sha_DFT2_1-1","sha_DFT2_1-2","sha_DFT2_2-1","sha_DFT2_2-2",
                                "sha_DFT2_RV_1-1","sha_DFT2_RV_1-2","sha_DFT2_RV_2-1","sha_DFT2_RV_2-2")] 

y_biopsy <- y[,colnames(y) %in% c("sha_DFT1_1-1","sha_DFT1_1-2","sha_DFT1_2-1","sha_DFT1_2-2",
                                  "sha_DFT2_1-1","sha_DFT2_1-2","sha_DFT2_2-1","sha_DFT2_2-2")]
y_cline <- y[,colnames(y) %in% c( "C5065_UT_01", "C5065_UT_02",
                                "sha_DFT2_RV_1-1","sha_DFT2_RV_1-2","sha_DFT2_RV_2-1","sha_DFT2_RV_2-2")] 

mdsDFT1 <- plotMDS(y_DFT1,plot=FALSE)
plotMDS(mdsDFT1,pch=19,cex=3,col="pink",main="MDS plot",xlim=c(XMIN,XMAX))
text(mdsDFT1,labels=colnames(y_DFT1))
mtext("DFT1")

mdsDFT2 <- plotMDS(y_DFT2,plot=FALSE)
plotMDS(mdsDFT2,pch=19,cex=3,col="lightblue",main="MDS plot",xlim=c(XMIN,XMAX))
text(mdsDFT2, labels=colnames(y_DFT2))
mtext("DFT2")

mds_biopsy <- plotMDS(y_biopsy,plot=FALSE)
plotMDS(mds_biopsy,pch=19,cex=3,col=cols,main="MDS plot",xlim=c(XMIN,XMAX))
text(mds_biopsy,labels=colnames(y_biopsy))
mtext("Tumour biopsies: blue=DFT1, pink=DFT2")

mds_cline <- plotMDS(y_cline,plot=FALSE)
plotMDS(mds_cline,pch=19,cex=3,col=cols,main="MDS plot",xlim=c(XMIN,XMAX))
text(mds_cline,labels=colnames(y_cline))
mtext("Cell lines: blue=DFT1, pink=DFT2")

DESeq2

Sum replicates then run differential expression analysis. For now, ignore alignment on old genome and cell line data.

y_cell <- y[,9:14]
y <- y[,1:8]

# combine replicates for each sample
DFT1_1 <- rowSums(y[,ss$sample=="DFT1_1"])
DFT1_2 <- rowSums(y[,ss$sample=="DFT1_2"])
DFT2_1 <- rowSums(y[,ss$sample=="DFT2_1"])
DFT2_2 <- rowSums(y[,ss$sample=="DFT2_2"])

y2 <- data.frame(DFT1_1,DFT1_2,DFT2_1,DFT2_2)
ss2 <- as.data.frame(colnames(y2))
colnames(ss2) <- "sample"
ss2$DFT <- factor(c("DFT1","DFT1","DFT2","DFT2"))

y2 <- y2[which(rowMeans(y2)>10),] # remove genes with average count < 10
dim(y2)
## [1] 15931     4
dds <- DESeqDataSetFromMatrix(countData = y2 , colData = ss2, design = ~ DFT )
## converting counts to integer mode
dds2 <- dds
res <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
z <- results(res)
vsd <- vst(dds)
zz<-cbind(as.data.frame(z),assay(vsd))
dge<-as.data.frame(zz[order(zz$pvalue),])
dge2 <- dge
sig <- subset(dge,padj<0.05)
sig2_up <- rownames(subset(sig,log2FoldChange>0))
sig2_dn <- rownames(subset(sig,log2FoldChange<0))
length(sig2_up)
## [1] 2536
length(sig2_dn)
## [1] 2033

DE visualisation.

maplot(dge2,"DFT1 vs DFT2 biopsies")

make_volcano(dge2,"DFT1 vs DFT2 biopsies")

sig[1:50,1:6] %>%
  kbl(caption="DFT1 vs DFT2 biopsies") %>%
  kable_paper("hover", full_width = F)
DFT1 vs DFT2 biopsies
baseMean log2FoldChange lfcSE stat pvalue padj
ENSSHAG00000031627.1 PRX 19373.4283 -8.968977 0.2894364 -30.98773 0 0
ENSSHAG00000001230.2 NA 10348.7183 -6.799811 0.2540229 -26.76849 0 0
ENSSHAG00000024692.1 NA 4999.3676 -6.180271 0.2714882 -22.76442 0 0
ENSSHAG00000028855.1 NA 7373.2103 8.218378 0.3645190 22.54582 0 0
ENSSHAG00000012247.2 COL9A1 4015.0597 8.997323 0.4019274 22.38544 0 0
ENSSHAG00000022172.1 HAPLN2 6695.1868 -10.289516 0.4742283 -21.69739 0 0
ENSSHAG00000008158.2 NA 13177.6995 9.309236 0.4323846 21.52999 0 0
ENSSHAG00000021199.1 UGT8 12350.1972 -10.371276 0.4882968 -21.23970 0 0
ENSSHAG00000025070.1 NA 1894.6094 -7.072079 0.3410819 -20.73425 0 0
ENSSHAG00000011362.2 NA 2280.0991 7.982211 0.3989326 20.00892 0 0
ENSSHAG00000030052.1 CFTR 3494.4469 -9.939796 0.5041830 -19.71466 0 0
ENSSHAG00000028043.1 NA 9433.1352 -7.017523 0.3623763 -19.36530 0 0
ENSSHAG00000000720.2 ANO1 4017.4307 -4.669392 0.2435018 -19.17601 0 0
ENSSHAG00000010203.2 NA 3321.1501 -5.790986 0.3031099 -19.10523 0 0
ENSSHAG00000013813.2 NA 1817.3236 8.295411 0.4353132 19.05619 0 0
ENSSHAG00000028708.1 NA 3398.8519 6.491294 0.3425299 18.95103 0 0
ENSSHAG00000024629.1 NA 1343.7559 6.383444 0.3399438 18.77794 0 0
ENSSHAG00000016586.2 NA 1835.1747 -6.920324 0.3884226 -17.81648 0 0
ENSSHAG00000006184.2 ATCAY 1670.3603 9.361726 0.5279151 17.73339 0 0
ENSSHAG00000024135.1 STK32B 2034.3346 9.519390 0.5470800 17.40036 0 0
ENSSHAG00000028637.1 NA 4152.7889 -5.630734 0.3245807 -17.34772 0 0
ENSSHAG00000014038.2 NA 12218.7615 5.503863 0.3213211 17.12886 0 0
ENSSHAG00000014560.2 PLLP 2929.6765 -7.645683 0.4475932 -17.08177 0 0
ENSSHAG00000028146.1 NSG1 2831.2843 -6.946736 0.4078914 -17.03085 0 0
ENSSHAG00000021888.1 PRSS35 1185.1224 5.763210 0.3446391 16.72245 0 0
ENSSHAG00000004800.2 ALKAL2 1461.3473 9.459052 0.5799044 16.31140 0 0
ENSSHAG00000028187.1 NA 1148.8575 -5.776665 0.3563146 -16.21226 0 0
ENSSHAG00000012895.2 NA 901.0142 -6.245494 0.3879777 -16.09756 0 0
ENSSHAG00000006714.2 P3H2 2310.5733 6.110694 0.3858657 15.83632 0 0
ENSSHAG00000011465.2 NA 2880.8002 -6.766093 0.4335532 -15.60614 0 0
ENSSHAG00000027357.1 DHH 1255.9489 -5.030988 0.3229230 -15.57953 0 0
ENSSHAG00000013967.2 ALDOC 2233.7205 -5.021646 0.3241752 -15.49053 0 0
ENSSHAG00000016224.2 NECTIN4 882.6687 -4.862932 0.3156142 -15.40783 0 0
ENSSHAG00000029081.1 FGF10 632.5331 7.164789 0.4652379 15.40027 0 0
ENSSHAG00000031559.1 FHDC1 1379.8599 5.027249 0.3269925 15.37420 0 0
ENSSHAG00000014878.2 EFS 987.1933 4.920944 0.3247012 15.15530 0 0
ENSSHAG00000020854.1 FMN2 765.5943 -7.665817 0.5113521 -14.99127 0 0
ENSSHAG00000008456.2 NOTCH3 3770.3176 4.007920 0.2717511 14.74850 0 0
ENSSHAG00000012189.2 NA 1655.7477 4.669329 0.3205060 14.56862 0 0
ENSSHAG00000023680.1 EGR3 5429.7319 -4.718157 0.3271562 -14.42173 0 0
ENSSHAG00000006005.2 NA 940.6342 7.989749 0.5545270 14.40822 0 0
ENSSHAG00000017676.2 ARHGAP22 727.6463 5.957871 0.4175935 14.26715 0 0
ENSSHAG00000010663.2 MAP1B 5140.0041 4.022572 0.2836538 14.18127 0 0
ENSSHAG00000007917.2 EDNRB 13288.1593 4.328769 0.3052459 14.18125 0 0
ENSSHAG00000021589.1 EBF2 996.6855 -6.138741 0.4339999 -14.14457 0 0
ENSSHAG00000024394.1 NA 1939.7779 4.717647 0.3350385 14.08091 0 0
ENSSHAG00000016955.2 CCDC68 3306.2354 -7.239744 0.5152135 -14.05193 0 0
ENSSHAG00000030008.1 NA 533.2343 5.281241 0.3765384 14.02577 0 0
ENSSHAG00000010841.2 PCDH7 506.1298 5.613175 0.4019355 13.96536 0 0
ENSSHAG00000009869.2 PXDC1 1400.8423 3.930310 0.2815769 13.95821 0 0
write.table(dge2,file="dge2.tsv",sep="\t")

# heatmap
mx <- sig[,7:ncol(sig)]
mx <- head(mx,30)
colfunc <- colorRampPalette(c("blue", "white", "red"))
heatmap.2(as.matrix(mx),trace="none",scale="row",
  col=colfunc(25),ColSideColors=cols[3:6],mar=c(8,14),
  cexRow=0.7,cexCol=1)

Enrichment analysis

Need to get human homologs of these genes. These were obtained from Ensembl v109 biomart (website).

rownames(dge2) <- sapply(strsplit(rownames(dge2),"\\."),"[[",1)

hm2 <- hm[hm$Tasmanian.devil.gene.stable.ID != "",]

gt <- hm2[,2:3]
length(unique(gt$Tasmanian.devil.gene.stable.ID))
## [1] 16979
length(unique(gt$Gene.name))
## [1] 16646
for(i in 1:length(gt$Gene.name)){
  if(gt$Gene.name[i]==""){gt$Gene.name[i] <- gt$Tasmanian.devil.gene.stable.ID[i]}
}

Now run mitch for DGE2. The pathways are sourced from Reactome 7th July 2023.

genesets <- gmt_import("../ref/ReactomePathways_2023-07-14.gmt")

m2 <- mitch_import(dge2, DEtype="deseq2", geneTable=gt)
## The input is a single dataframe; one contrast only. Converting
##         it to a list for you.
## Note: Mean no. genes in input = 15931
## Note: no. genes in output = 12859
## Note: estimated proportion of input genes in output = 0.807
head(m2)
##                 x
## A2M    -0.3375456
## A4GALT  4.2591767
## AAAS    1.7285141
## AACS    0.7228111
## AADAC   1.5043061
## AADAT  -0.3392250
res2 <- mitch_calc(m2, genesets, priority="effect",cores=16)
## Note: Enrichments with large effect sizes may not be
##             statistically significant.
head(res2$enrichment_result,20) %>%
  kbl(caption="metabolic pathways") %>%
  kable_paper("hover", full_width = F)
metabolic pathways
set setSize pANOVA s.dist p.adjustANOVA
674 Metallothioneins bind metals 10 3.00e-07 0.9370379 0.0000060
195 Classical antibody-mediated complement activation 35 0.00e+00 0.8419660 0.0000000
222 Creation of C4 and C2 activators 38 0.00e+00 0.8380987 0.0000000
121 CD22 mediated BCR regulation 25 0.00e+00 0.8306093 0.0000000
1052 Response to metal ions 13 2.00e-07 0.8292075 0.0000049
549 Initial triggering of complement 44 0.00e+00 0.8284078 0.0000000
1100 Scavenging of heme from plasma 35 0.00e+00 0.8250913 0.0000000
1335 Translocation of ZAP-70 to Immunological synapse 11 4.10e-06 0.8018156 0.0000711
842 Phosphorylation of CD3 and TCR zeta chains 14 3.00e-07 0.7923984 0.0000060
371 FCGR activation 41 0.00e+00 0.7907896 0.0000000
533 Immunoregulatory interactions between a Lymphoid and a non-Lymphoid cell 97 0.00e+00 0.7731094 0.0000000
206 Complement cascade 60 0.00e+00 0.7653827 0.0000000
802 PD-1 signaling 15 3.00e-07 0.7627063 0.0000064
988 Regulation of Complement cascade 56 0.00e+00 0.7608513 0.0000000
180 Chemokine receptors bind chemokines 37 0.00e+00 0.7327229 0.0000000
577 Interleukin-2 signaling 10 7.58e-05 0.7226555 0.0010743
110 Binding and Uptake of Ligands by Scavenger Receptors 63 0.00e+00 0.6951552 0.0000000
1099 Scavenging by Class A Receptors 17 2.80e-06 0.6566505 0.0000496
459 Generation of second messenger molecules 24 0.00e+00 0.6497955 0.0000008
368 FCERI mediated Ca+2 mobilization 57 0.00e+00 0.6414869 0.0000000
top <- subset(res2$enrichment_result,p.adjustANOVA<0.05)

if ( !file.exists("mitch2.html") & nrow(top)>1 ) {
  mitch_report(res2, "mitch2.html")
}

topup <- head(subset(top,s.dist>0),20)

topup %>%
  kbl(caption="Top 20 upregulated pathways") %>%
  kable_paper("hover", full_width = F)
Top 20 upregulated pathways
set setSize pANOVA s.dist p.adjustANOVA
674 Metallothioneins bind metals 10 3.00e-07 0.9370379 0.0000060
195 Classical antibody-mediated complement activation 35 0.00e+00 0.8419660 0.0000000
222 Creation of C4 and C2 activators 38 0.00e+00 0.8380987 0.0000000
121 CD22 mediated BCR regulation 25 0.00e+00 0.8306093 0.0000000
1052 Response to metal ions 13 2.00e-07 0.8292075 0.0000049
549 Initial triggering of complement 44 0.00e+00 0.8284078 0.0000000
1100 Scavenging of heme from plasma 35 0.00e+00 0.8250913 0.0000000
1335 Translocation of ZAP-70 to Immunological synapse 11 4.10e-06 0.8018156 0.0000711
842 Phosphorylation of CD3 and TCR zeta chains 14 3.00e-07 0.7923984 0.0000060
371 FCGR activation 41 0.00e+00 0.7907896 0.0000000
533 Immunoregulatory interactions between a Lymphoid and a non-Lymphoid cell 97 0.00e+00 0.7731094 0.0000000
206 Complement cascade 60 0.00e+00 0.7653827 0.0000000
802 PD-1 signaling 15 3.00e-07 0.7627063 0.0000064
988 Regulation of Complement cascade 56 0.00e+00 0.7608513 0.0000000
180 Chemokine receptors bind chemokines 37 0.00e+00 0.7327229 0.0000000
577 Interleukin-2 signaling 10 7.58e-05 0.7226555 0.0010743
110 Binding and Uptake of Ligands by Scavenger Receptors 63 0.00e+00 0.6951552 0.0000000
1099 Scavenging by Class A Receptors 17 2.80e-06 0.6566505 0.0000496
459 Generation of second messenger molecules 24 0.00e+00 0.6497955 0.0000008
368 FCERI mediated Ca+2 mobilization 57 0.00e+00 0.6414869 0.0000000
topdn <- head(subset(top,s.dist<0),20)

topdn %>%
  kbl(caption="Top 20 downregulated pathways") %>%
  kable_paper("hover", full_width = F)
Top 20 downregulated pathways
set setSize pANOVA s.dist p.adjustANOVA
281 Degradation of cysteine and homocysteine 11 0.0056613 -0.4818154 0.0456124
330 EGR2 and SOX10-mediated initiation of Schwann cell myelination 28 0.0001144 -0.4213122 0.0015897
194 Class I peroxisomal membrane protein import 17 0.0037161 -0.4065062 0.0323276
178 Chaperone Mediated Autophagy 18 0.0061237 -0.3732575 0.0482409
555 Insulin processing 21 0.0033346 -0.3700621 0.0295527
590 Intraflagellar transport 45 0.0001932 -0.3213395 0.0025135
152 Cargo trafficking to the periciliary membrane 43 0.0027614 -0.2639506 0.0252628
189 Circadian Clock 60 0.0017280 -0.2340743 0.0170157
837 Peroxisomal protein import 53 0.0037920 -0.2300455 0.0327866
1055 Retrograde transport at the Trans-Golgi-Network 48 0.0060099 -0.2293537 0.0476094
188 Cilium Assembly 175 0.0000147 -0.1903699 0.0002340
643 Macroautophagy 115 0.0040649 -0.1553679 0.0349336
96 Autophagy 127 0.0025907 -0.1550925 0.0240109
792 Organelle biogenesis and maintenance 259 0.0002103 -0.1342986 0.0027114
top2 <- rbind(head(topup,10),head(topdn,10))
top2 <- top2[order(-top2$s.dist),]
par(mar=c(5,15,5,2))

if ( nrow(top2)>1 ) {
  barplot(rev(abs(top2$s.dist)),horiz=TRUE,las=1,names.arg=rev(top2$set),
    main="Top REACTOME Pathways",cex.names=0.6,xlab="Enrichment Score",
    col=c(rep("blue",nrow(head(topdn,10))),rep("red",nrow(head(topup,10)))),
    xlim=c(0,0.8))
}

Now to extract metabolism sets and analyse with another mitch run.

hierarchy <- read.table("../ref/gsea/ReactomePathwaysRelation.txt", sep ="\t", header = F, dec =".")
pathways <- read.delim("../ref/gsea/ReactomePathways.txt", header=FALSE)

hierarchy <- hierarchy[grep("HSA", hierarchy$V1), ] # keep only homo sapiens data
pathways <- pathways[grep("HSA", pathways$V1), 1:2] 

metabo_pathways <- "R-HSA-1430728" # reactome code for metabolism 

# this is very bad coding - but it works
# why 325? that is when while loop crashes because all branches of the hierarchy have been looped over
# will have to change this if Reactome data is updated
while(length(metabo_pathways) < 325){
  for(i in 1:length(metabo_pathways)){
    metabo_pathways <- unique(c(metabo_pathways,filter(hierarchy, V1 == metabo_pathways[i])$V2))
    }
  }

mpathways <- c()
for(i in 1:length(metabo_pathways)){mpathways <- c(mpathways,filter(pathways, V1 == metabo_pathways[i])$V2)}

library(magrittr)
genesets3 <- extract(genesets, mpathways)
# remove empty sets (ie hierarchy exists but no gene set is associated to it)
genesets3 <- genesets3[lapply(genesets3,length)>0]

Now run mitch on metabolic pathways only.

# TODO: for some reason this doesn't give any results - except when reducing number of sets
# I am confused by this, ask Mark
# also when no sets enriched, get error when produce mitch report

m3 <- mitch_import(dge2, DEtype="deseq2", geneTable=gt)
## The input is a single dataframe; one contrast only. Converting
##         it to a list for you.
## Note: Mean no. genes in input = 15931
## Note: no. genes in output = 12859
## Note: estimated proportion of input genes in output = 0.807
head(m3)
##                 x
## A2M    -0.3375456
## A4GALT  4.2591767
## AAAS    1.7285141
## AACS    0.7228111
## AADAC   1.5043061
## AADAT  -0.3392250
# maybe something off with 1st row of m3 - fixed it, doesn't change anything

res3 <- mitch_calc(m3, genesets3, priority="effect", cores=16)
## Note: Enrichments with large effect sizes may not be
##             statistically significant.
head(res3$enrichment_result,20) %>%
  kbl(caption="metabolic pathways") %>%
  kable_paper("hover", full_width = F)
metabolic pathways
set setSize pANOVA s.dist p.adjustANOVA
39 Biosynthesis of specialized proresolving mediators (SPMs) 11 0.0030352 0.5161751 0.0827414
88 Degradation of cysteine and homocysteine 11 0.0056613 -0.4818154 0.0842911
26 Heme degradation 12 0.0056385 0.4615410 0.0842911
107 Acyl chain remodelling of PI 11 0.0105634 0.4452621 0.1177028
120 Synthesis of Prostaglandins (PG) and Thromboxanes (TX) 13 0.0085667 0.4211547 0.1043579
119 Synthesis of Leukotrienes (LT) and Eoxins (EX) 12 0.0155762 0.4032978 0.1357629
47 Glutamate and glutamine metabolism 12 0.0230345 -0.3789860 0.1543310
82 alpha-linolenic (omega3) and linoleic (omega6) acid metabolism 13 0.0225596 -0.3654056 0.1543310
118 alpha-linolenic acid (ALA) metabolism 13 0.0225596 -0.3654056 0.1543310
87 Triglyceride catabolism 16 0.0114189 0.3653352 0.1177028
129 Dermatan sulfate biosynthesis 10 0.0472731 0.3623473 0.2346148
113 Synthesis of PIPs at the early endosome membrane 15 0.0152200 -0.3620160 0.1357629
20 Nucleotide catabolism 27 0.0012633 0.3585827 0.0616802
108 Acyl chain remodelling of PG 12 0.0330634 0.3553878 0.2043665
40 Sulfur amino acid metabolism 22 0.0073914 -0.3299907 0.0990445
104 Acyl chain remodelling of PC 19 0.0162105 0.3186670 0.1357629
105 Acyl chain remodelling of PS 15 0.0335527 0.3170248 0.2043665
83 Arachidonic acid metabolism 35 0.0030874 0.2891320 0.0827414
81 Carnitine metabolism 14 0.0767701 0.2732358 0.3318450
102 Glutathione synthesis and recycling 10 0.1400903 0.2694996 0.3910854
top <- subset(res3$enrichment_result,p.adjustANOVA<0.05)

if ( !file.exists("mitch3_biopsy.html") & nrow(top)>1 ) {
  mitch_report(res3, "mitch3_biopsy.html")
}

topup <- head(subset(top,s.dist>0),20)

topup %>%
  kbl(caption="Top 20 upregulated metabolic pathways") %>%
  kable_paper("hover", full_width = F)
Top 20 upregulated metabolic pathways
set setSize pANOVA s.dist p.adjustANOVA
topdn <- head(subset(top,s.dist<0),20)

topdn %>%
  kbl(caption="Top 20 downregulated metabolic pathways") %>%
  kable_paper("hover", full_width = F)
Top 20 downregulated metabolic pathways
set setSize pANOVA s.dist p.adjustANOVA
top2 <- rbind(head(topup,10),head(topdn,10))
top2 <- top2[order(-top2$s.dist),]
par(mar=c(5,15,5,2))

if ( nrow(top2)>1 ) {
  barplot(rev(abs(top2$s.dist)),horiz=TRUE,las=1,names.arg=rev(top2$set),
    main="Top REACTOME Metabolic Pathways",cex.names=0.6,xlab="Enrichment Score",
    col=c(rep("blue",nrow(head(topdn,10))),rep("red",nrow(head(topup,10)))),
    xlim=c(0,0.8))
}

Session info

sessionInfo()
## R version 4.3.1 (2023-06-16)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.3 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0 
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
## 
## locale:
##  [1] LC_CTYPE=en_AU.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_AU.UTF-8        LC_COLLATE=en_AU.UTF-8    
##  [5] LC_MONETARY=en_AU.UTF-8    LC_MESSAGES=en_AU.UTF-8   
##  [7] LC_PAPER=en_AU.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Australia/Melbourne
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] grid      stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] pkgload_1.3.2.1             GGally_2.1.2               
##  [3] ggplot2_3.4.3               beeswarm_0.4.0             
##  [5] gtools_3.9.4                tibble_3.2.1               
##  [7] echarts4r_0.4.5             magrittr_2.0.3             
##  [9] stringr_1.5.0               pathview_1.40.0            
## [11] circlize_0.4.15             RColorBrewer_1.1-3         
## [13] ComplexHeatmap_2.16.0       tidyr_1.3.0                
## [15] dplyr_1.1.3                 kableExtra_1.3.4           
## [17] limma_3.56.2                mitch_1.12.0               
## [19] DESeq2_1.40.2               SummarizedExperiment_1.30.2
## [21] Biobase_2.60.0              MatrixGenerics_1.12.3      
## [23] matrixStats_1.0.0           GenomicRanges_1.52.0       
## [25] GenomeInfoDb_1.36.3         IRanges_2.34.1             
## [27] S4Vectors_0.38.1            BiocGenerics_0.46.0        
## [29] gplots_3.1.3                reshape2_1.4.4             
## 
## loaded via a namespace (and not attached):
##  [1] rstudioapi_0.15.0       jsonlite_1.8.7          shape_1.4.6            
##  [4] magick_2.7.5            rmarkdown_2.24          GlobalOptions_0.1.2    
##  [7] zlibbioc_1.46.0         vctrs_0.6.3             memoise_2.0.1          
## [10] Cairo_1.6-1             RCurl_1.98-1.12         webshot_0.5.5          
## [13] htmltools_0.5.6         S4Arrays_1.0.6          sass_0.4.7             
## [16] KernSmooth_2.23-22      bslib_0.5.1             htmlwidgets_1.6.2      
## [19] plyr_1.8.8              cachem_1.0.8            mime_0.12              
## [22] lifecycle_1.0.3         iterators_1.0.14        pkgconfig_2.0.3        
## [25] Matrix_1.6-1            R6_2.5.1                fastmap_1.1.1          
## [28] GenomeInfoDbData_1.2.10 shiny_1.7.5             clue_0.3-64            
## [31] digest_0.6.33           colorspace_2.1-0        reshape_0.8.9          
## [34] AnnotationDbi_1.62.2    RSQLite_2.3.1           org.Hs.eg.db_3.17.0    
## [37] fansi_1.0.4             httr_1.4.7              abind_1.4-5            
## [40] compiler_4.3.1          bit64_4.0.5             withr_2.5.0            
## [43] doParallel_1.0.17       BiocParallel_1.34.2     DBI_1.1.3              
## [46] highr_0.10              MASS_7.3-60             DelayedArray_0.26.7    
## [49] rjson_0.2.21            caTools_1.18.2          tools_4.3.1            
## [52] httpuv_1.6.11           glue_1.6.2              promises_1.2.1         
## [55] cluster_2.1.4           generics_0.1.3          gtable_0.3.4           
## [58] xml2_1.3.5              utf8_1.2.3              XVector_0.40.0         
## [61] foreach_1.5.2           pillar_1.9.0            later_1.3.1            
## [64] lattice_0.21-8          bit_4.0.5               tidyselect_1.2.0       
## [67] locfit_1.5-9.8          Biostrings_2.68.1       knitr_1.44             
## [70] gridExtra_2.3           svglite_2.1.1           xfun_0.40              
## [73] KEGGgraph_1.60.0        stringi_1.7.12          yaml_2.3.7             
## [76] evaluate_0.21           codetools_0.2-19        Rgraphviz_2.44.0       
## [79] graph_1.78.0            cli_3.6.1               xtable_1.8-4           
## [82] systemfonts_1.0.4       munsell_0.5.0           jquerylib_0.1.4        
## [85] Rcpp_1.0.11             png_0.1-8               XML_3.99-0.14          
## [88] parallel_4.3.1          ellipsis_0.3.2          blob_1.2.4             
## [91] bitops_1.0-7            viridisLite_0.4.2       scales_1.2.1           
## [94] purrr_1.0.2             crayon_1.5.2            GetoptLong_1.0.5       
## [97] rlang_1.1.1             KEGGREST_1.40.0         rvest_1.0.3