Examining global gene expression in DFT1 and DFT2 cells

Source codes: https://github.com/Anne-LiseGerard/dftd_rnaseq

suppressPackageStartupMessages({
    library("reshape2")
    library("gplots")
    library("DESeq2")
    library("mitch")
    library("limma")
    library("kableExtra")
    library("dplyr")
    library("tidyr")
    library("ComplexHeatmap")
    library("RColorBrewer")
    library("circlize")
    library("pathview")
    library("stringr")
    library("grid")
    library("VennDiagram")
})

Background

The goal of our study is to compare the expression of genes related to metabolism of DFT1 cell lines to DFT2 cell lines. Our main goal is comparing genes expressed in the following pathways: Glycolysis, Pentose Phosphate Pathway, Fatty Acid Metabolism, Glutaminolysis and Oxidative Phosphorylation. A secondary goal would be to compare the expression of genes related to the production of Reactive Oxygen Species (ROS) and protection against ROS (genes involved in DNA repair, production of antioxydants etc). Finally, we might also be interested in looking into genes linked to cholesterol metabolism as this has been shown to be important for DFT1 but I need to do some additional reading to know what exactly we’re after. See sample sheet below (edited from Yin Peng), there are 3 DFT1 and 3 DFT2 cell lines, and 3 replicates of each cell line:

ss <- read.table("../ss.tsv",sep="\t",fill=TRUE,header=TRUE)
ss$DFT <- as.factor(ss$DFT)
ss$clone <- sapply(strsplit(ss$ClientID,"_"),"[[",1)

ss %>%
  kbl(caption="Sample sheet for all samples") %>%
  kable_paper("hover", full_width = F)

Sample sheet for all samples
C	ClientID	DFT	Replicate	Index1sequence	Index2sequence	X.ReadsIdentified.PF.	TotalReads	TotalBases.Gbases.	clone
6180766	4906_1-1	DFT1	1	TTGTTGCA	GACGTCGT	2.4465	103481165	15.5	4906
6180767	C5065_1-1	DFT1	1	CCACACTT	CGTCATAC	2.8116	118924031	17.8	C5065
6180768	1426_1-1	DFT1	1	CCCGTTTG	TTGCTGTA	2.3264	98401219	14.8	1426
6180769	RV_1-1	DFT2	1	ATGCTCCC	CCCTGCTG	2.2811	96485136	14.5	RV
6180770	SN_1-1	DFT2	1	GCTCAATA	CATTTATC	2.3947	101290147	15.2	SN
6180771	TD549_1-1	DFT2	1	GTAGTTCG	CTTGATGC	2.1732	91921221	13.8	TD549
6180772	4906_2-1	DFT1	2	CGAGAACC	ATGTATCG	2.2322	94416781	14.2	4906
6180773	C5065_2-1	DFT1	2	GCCATGTA	ATAGGGTT	2.4505	103650355	15.5	C5065
6180774	1426_2-1	DFT1	2	TTTCTCTA	CTCGACGT	2.4778	104805081	15.7	1426
6180775	RV_2-1	DFT2	2	CCAGCGAT	TCTAGTCA	2.9882	126393794	19.0	RV
6180776	SN_2-1	DFT2	2	TGGGAGTG	CCCGTCTA	2.4459	103455786	15.5	SN
6180777	TD549_2-1	DFT2	2	CCCTCGTA	TAGAAGAG	2.5437	107592495	16.1	TD549
6180778	4906_3-1	DFT1	3	CGATATGG	GGGACGTA	2.2711	96062159	14.4	4906
6180779	C5065_3-1	DFT1	3	TTGTGCCC	TTTCCATC	2.2763	96282107	14.4	C5065
6180780	1426_3-1	DFT1	3	TGTCCTCT	CGACGAAC	2.4952	105541059	15.8	1426
6180781	RV_3-1	DFT2	3	GTATAGTC	TTGGTCTC	2.3325	98659234	14.8	RV
6180782	SN_3-1	DFT2	3	TTTGGGAT	GTTCACGT	2.6750	113146174	17.0	SN
6180783	TD549_3-1	DFT2	3	CACCAAGC	AAAGGGAA	2.1610	91405190	13.7	TD549
DEA5_4NEG	DEA4_6NEG		NA	CGGAGAGG	CGAGCGTC	0.0002	8460	0.0	DEA4

We are mainly interested in comparing all DFT1 samples against all DFT2 samples. However, we did notice that some cell lines within a same DFT produced different results in the metabolism experiments so we might need to compare all 6 cell lines against one another and not be able to simply do a DFT1 vs DFT2 contrast, if that makes sense.

Regarding the reference transcriptome, we will use Ensembl v109.

Functions

maplot <- function(de,contrast_name) {
  sig <-subset(de, padj < 0.05 )
  up <-rownames(subset(de, padj < 0.05 & log2FoldChange > 0))
  dn <-rownames(subset(de, padj < 0.05 & log2FoldChange < 0))
  GENESUP <- length(up)
  GENESDN <- length(dn)
  DET=nrow(de)
  SUBHEADER = paste(GENESUP, "up, ", GENESDN, "down", DET, "detected")
  ns <-subset(de, padj > 0.05 )
  plot(log2(de$baseMean),de$log2FoldChange,
       xlab="log2 basemean", ylab="log2 foldchange",
       pch=19, cex=1, col="dark gray",
       main=contrast_name, cex.main=0.7)
  points(log2(sig$baseMean),sig$log2FoldChange,
         pch=19, cex=1, col="red")
  mtext(SUBHEADER,cex = 0.7)
  grid()
}

make_volcano <- function(de,name) {
    sig <- subset(de,padj<0.05)
    N_SIG=nrow(sig)
    N_UP=nrow(subset(sig,log2FoldChange>0))
    N_DN=nrow(subset(sig,log2FoldChange<0))
    DET=nrow(de)
    HEADER=paste(N_SIG,"@5%FDR,", N_UP, "up", N_DN, "dn", DET, "detected")
    plot(de$log2FoldChange,-log10(de$padj),cex=1,pch=19,col="darkgray",
        main=name, xlab="log2 FC", ylab="-log10 pval")
    mtext(HEADER)
    grid()
    points(sig$log2FoldChange,-log10(sig$padj),cex=1,pch=19,col="red")
}

Load data

Here we load the data in from the aligner.

tmp <- read.table("../fastq/3col.tsv.gz")
x <- as.data.frame(acast(tmp, V2~V1, value.var="V3",fun.aggregate=sum))
dim(x)

## [1] 43512    19

Load gene names.

gn <- read.table("../ref/Sarcophilus_harrisii.mSarHar1.11.cdna+ncrna.genenames.tsv",fill=TRUE)

gn <- gn[order(gn$V1),]

dim(gn)

## [1] 43512     3

Load homology map

hm <- read.table("../ref/mart_export_ensembl109_2023-07-14.txt",sep="\t",header=TRUE)

Now need to collapse transcript data to genes.

x$gene <- paste(gn$V2,gn$V3)

y <- aggregate(. ~ gene,x,sum)

rownames(y) <- y$gene
y$gene = NULL

dim(y)

## [1] 23829    19

Quality control

Samples with <1M reads should be omitted. Will also round values to integers.

cs <- colSums(y)
cs <- cs[order(cs)]

par(mar=c(5,10,5,2))
barplot(cs,main="All samples",horiz=TRUE,las=1)
abline(v=1e7,col="red",lty=2)

y <- round(y)

MDS

This will help us to visualise the sources of variability in the overall dataset.

Plot MDS and then remove the negative control and run Plot MDS again.

Also fix the sample names.

Plot MDS by DFT.

par(mar = c(5.1, 4.1, 4.1, 2.1) )

plotMDS(y)

y <- y[,colnames(y) != "DEA5-4NEG"]

plotMDS(y)

ss <- ss[ss$ClientID != "DEA4_6NEG",]

colnames(y) <- sapply(strsplit(ss$ClientID,"-"),"[[",1)

cs <- colSums(y)
cs <- cs[order(cs)]

par(mar=c(5,10,5,2))
barplot(cs,main="All samples",horiz=TRUE,las=1)

par(mar = c(5.1, 4.1, 4.1, 2.1) )

plotMDS(y)

cols <- ss$DFT
cols <- gsub("DFT1","pink",cols)
cols <- gsub("DFT2","lightblue",cols)
mymds <- plotMDS(y,plot=FALSE)

# fix the xlims
XMIN=min(mymds$x)
XMAX=max(mymds$x)
XMID=(XMAX+XMIN)/2
XMIN <- XMID + (XMIN-XMID)*1.1
XMAX <- XMID+(XMAX-XMID)*1.1
plotMDS(mymds,pch=19,cex=3,col=cols,main="MDS plot",xlim=c(XMIN,XMAX))
text(mymds,labels=colnames(y))
mtext("pink=DFT1,blue=DFT2")

y_DFT1 <- y[,colnames(y) %in% c("4906_1","4906_2","4906_3",
                                "1426_1","1426_2","1426_3",
                                "C5065_1","C5065_2","C5065_3")]
y_DFT2 <- y[,colnames(y) %in% c("RV_1","RV_2","RV_3",
                                "SN_1","SN_2","SN_3",
                                "TD549_1","TD549_2","TD549_3")]

mdsDFT1 <- plotMDS(y_DFT1,plot=FALSE)
plotMDS(mdsDFT1,pch=19,cex=3,col="pink",main="MDS plot",xlim=c(XMIN,XMAX))
text(mdsDFT1,labels=colnames(y_DFT1))
mtext("DFT1")

mdsDFT2 <- plotMDS(y_DFT2,plot=FALSE)
plotMDS(mdsDFT2,pch=19,cex=3,col="lightblue",main="MDS plot",xlim=c(XMIN,XMAX))
text(mdsDFT2, labels=colnames(y_DFT2))
mtext("DFT2")

DESeq2

Run a simple differential analysis comparing DFT1 vs DFT2 ignoring the clone type as a source of variation.

# split data will be necessary for the smaller comparisons (but not this one)
y1 <- y
ss1 <- ss

y1 <- y1[which(rowMeans(y1)>10),]
dim(y1)

## [1] 16009    18

dds <- DESeqDataSetFromMatrix(countData = y1 , colData = ss1, design = ~ DFT )

## converting counts to integer mode

## factor levels were dropped which had no samples

res <- DESeq(dds)

## estimating size factors

## estimating dispersions

## gene-wise dispersion estimates

## mean-dispersion relationship

## final dispersion estimates

## fitting model and testing

## -- replacing outliers and refitting for 4 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)

## estimating dispersions

## fitting model and testing

z<- results(res)
vsd <- vst(dds)
zz<-cbind(as.data.frame(z),assay(vsd))
dge<-as.data.frame(zz[order(zz$pvalue),])
dge1 <- dge
sig <- subset(dge,padj<0.05)
sig1_up <- rownames(subset(sig,log2FoldChange>0))
sig1_dn <- rownames(subset(sig,log2FoldChange<0))
length(sig1_up)

## [1] 5871

length(sig1_dn)

## [1] 6006

maplot(dge1,"DFT1 vs DFT2")

make_volcano(dge1,"DFT1 vs DFT2")

sig[1:50,1:6] %>%
  kbl(caption="Comparison of DFT1 vs DFT2") %>%
  kable_paper("hover", full_width = F)

Comparison of DFT1 vs DFT2
	baseMean	log2FoldChange	lfcSE	stat
ENSSHAG00000000893.2 NA	761.6994	-4.745935	0.1042514	-45.52393
ENSSHAG00000001230.2 NA	9964.7623	-7.268224	0.1727670	-42.06952
ENSSHAG00000001308.2 NA	1591.3721	-4.481411	0.1093947	-40.96554
ENSSHAG00000001552.2 TDRD5	1386.5142	-5.550630	0.0945061	-58.73301
ENSSHAG00000001728.2 ABRAXAS2	2447.7801	-2.611974	0.0580287	-45.01174
ENSSHAG00000001754.2 NA	573.3750	-4.129922	0.1018094	-40.56522
ENSSHAG00000002134.2 NA	31866.6922	2.222360	0.0386947	57.43325
ENSSHAG00000002808.2 MICAL1	9230.0290	-2.032272	0.0528958	-38.42032
ENSSHAG00000003434.2 NA	5640.3455	-9.630769	0.2085777	-46.17354
ENSSHAG00000003458.2 JMJD1C	2343.7511	-2.896812	0.0747449	-38.75599
ENSSHAG00000003737.2 NA	9677.3586	11.783418	0.2132465	55.25727
ENSSHAG00000004947.2 THBS2	24172.5467	13.496286	0.3231374	41.76640
ENSSHAG00000005182.2 SERPINE1	7064.0510	11.869506	0.2994322	39.64005
ENSSHAG00000005550.2 MEIOC	1485.7532	-7.289256	0.1694244	-43.02364
ENSSHAG00000007070.2 SMC1B	3980.3698	-10.063501	0.1851324	-54.35841
ENSSHAG00000007582.2 NA	2902.0019	-7.095020	0.0846261	-83.83964
ENSSHAG00000007784.2 NA	5517.6882	-7.314052	0.1015625	-72.01526
ENSSHAG00000007965.2 NA	5215.8477	-1.747971	0.0370471	-47.18243
ENSSHAG00000008051.2 TEX14	1537.4858	-3.531985	0.0617160	-57.22965
ENSSHAG00000009584.2 PLEKHG1	1861.7962	9.379637	0.2481401	37.79977
ENSSHAG00000010367.2 ECM1	12497.0681	9.411970	0.1164250	80.84150
ENSSHAG00000010645.2 COL16A1	6094.7776	6.056254	0.0950462	63.71906
ENSSHAG00000010824.2 MPDZ	4232.6012	-5.438458	0.0941097	-57.78850
ENSSHAG00000010902.2 SMARCD3	2489.7010	-2.649718	0.0440929	-60.09400
ENSSHAG00000011135.2 NA	5620.2000	4.064532	0.0895867	45.36980
ENSSHAG00000011480.2 ADGRB2	1933.7632	-7.544502	0.1814507	-41.57880
ENSSHAG00000011750.2 NA	3080.7121	3.088431	0.0735615	41.98432
ENSSHAG00000011809.2 NA	5508.1945	3.140454	0.0593806	52.88691
ENSSHAG00000012182.2 NA	1038.5611	7.742252	0.2036373	38.01982
ENSSHAG00000013722.2 NA	524.0949	-4.688777	0.0900165	-52.08798
ENSSHAG00000014390.2 IFRD1	2454.6507	-1.386811	0.0345922	-40.09028
ENSSHAG00000014430.2 NA	1157.2248	-1.742533	0.0399895	-43.57472
ENSSHAG00000015236.2 FKBP6	598.2945	-5.562106	0.1042482	-53.35446
ENSSHAG00000016212.2 SLC25A14	887.2862	-6.791421	0.1667459	-40.72915
ENSSHAG00000016252.2 LOXL2	8715.4741	8.867010	0.1933921	45.84991
ENSSHAG00000016335.2 NA	21006.6606	2.668634	0.0567139	47.05436
ENSSHAG00000017128.2 F10	515.2000	6.578090	0.1519688	43.28578
ENSSHAG00000017996.2 TNFRSF21	1876.2975	-7.424893	0.1564041	-47.47248
ENSSHAG00000018060.2 HR	3216.8474	-7.036189	0.1791811	-39.26859
ENSSHAG00000018469.2 PDE10A	1279.8709	-5.655401	0.1229592	-45.99414
ENSSHAG00000020521.1 CDK3	2068.9726	4.241310	0.0605836	70.00762
ENSSHAG00000021318.1 NA	2418.0580	-2.504689	0.0455484	-54.98958
ENSSHAG00000021464.1 NA	3962.2831	-8.392331	0.1785238	-47.00960
ENSSHAG00000022172.1 HAPLN2	3697.2723	-7.391660	0.1427380	-51.78480
ENSSHAG00000022666.1 NA	550.9103	6.404774	0.1677749	38.17480
ENSSHAG00000022966.1 SOX8	1304.3613	-4.733014	0.1196103	-39.57027
ENSSHAG00000023142.1 NA	20998.8159	-2.483109	0.0319815	-77.64204
ENSSHAG00000023249.1 NA	974.9689	5.489140	0.1204114	45.58656
ENSSHAG00000023680.1 EGR3	7293.3439	-4.014795	0.1044284	-38.44541
ENSSHAG00000024021.1 NA	1284.6234	5.713744	0.1294418	44.14140

write.table(dge,file="dge1.tsv",sep="\t")

# heatmap
mx <- sig[,7:ncol(sig)]
mx <- head(mx,30)
colfunc <- colorRampPalette(c("blue", "white", "red"))
heatmap.2(as.matrix(mx),trace="none",scale="row",
  col=colfunc(25),ColSideColors=cols,mar=c(5,14),
  cexRow=0.8,cexCol=0.7)

Combine the three replicates

Sum replicates.

x4906 <- rowSums(y[,ss$clone=="4906"])
xC5065 <- rowSums(y[,ss$clone=="C5065"])
x1426 <- rowSums(y[,ss$clone=="1426"])
xRV <- rowSums(y[,ss$clone=="RV"])
xSN <- rowSums(y[,ss$clone=="SN"])
xTD549 <- rowSums(y[,ss$clone=="TD549"])

y2 <- data.frame(x4906,xC5065,x1426,xRV,xSN,xTD549)
ss2 <- as.data.frame(colnames(y2))
colnames(ss2) <- "clone"
ss2$DFT <- factor(c("DFT1","DFT1","DFT1","DFT2","DFT2","DFT2"))

y2 <- y2[which(rowMeans(y2)>10),]
dim(y2)

## [1] 17468     6

dds <- DESeqDataSetFromMatrix(countData = y2 , colData = ss2, design = ~ DFT )

## converting counts to integer mode

dds2 <- dds
res <- DESeq(dds)

## estimating size factors

## estimating dispersions

## gene-wise dispersion estimates

## mean-dispersion relationship

## final dispersion estimates

## fitting model and testing

z<- results(res)
vsd <- vst(dds)
zz<-cbind(as.data.frame(z),assay(vsd))
dge<-as.data.frame(zz[order(zz$pvalue),])
dge2 <- dge
sig <- subset(dge,padj<0.05)
sig2_up <- rownames(subset(sig,log2FoldChange>0))
sig2_dn <- rownames(subset(sig,log2FoldChange<0))
length(sig2_up)

## [1] 4200

length(sig2_dn)

## [1] 4412

maplot(dge2,"DFT1 vs DFT2 aggregated")

make_volcano(dge2,"DFT1 vs DFT2 aggregated")

sig[1:50,1:6] %>%
  kbl(caption="Comparison of DFT1 vs DFT2") %>%
  kable_paper("hover", full_width = F)

Comparison of DFT1 vs DFT2
	baseMean	log2FoldChange	lfcSE	stat
ENSSHAG00000007582.2 NA	8720.2072	-7.090091	0.1734257	-40.88258
ENSSHAG00000024692.1 NA	11176.1973	-8.580770	0.2275391	-37.71118
ENSSHAG00000007784.2 NA	16519.5514	-7.300242	0.1954949	-37.34236
ENSSHAG00000010367.2 ECM1	37884.8561	9.429385	0.2567216	36.73000
ENSSHAG00000026705.1 LOXL3	35605.2404	3.028419	0.0834311	36.29846
ENSSHAG00000003737.2 NA	29315.0122	11.793401	0.3388989	34.79917
ENSSHAG00000032086.1 NA	4270.9538	-9.087633	0.2656490	-34.20918
ENSSHAG00000020521.1 CDK3	6254.9578	4.247335	0.1329351	31.95045
ENSSHAG00000010645.2 COL16A1	18470.4887	6.069424	0.1909073	31.79252
ENSSHAG00000015236.2 FKBP6	1799.3194	-5.552135	0.1749135	-31.74217
ENSSHAG00000031815.1 RAB33A	4153.4227	-8.057672	0.2543513	-31.67930
ENSSHAG00000031607.1 NA	2388.4986	6.299003	0.2049613	30.73264
ENSSHAG00000003434.2 NA	16944.8869	-9.624643	0.3183782	-30.23022
ENSSHAG00000013199.2 PDGFRA	101648.6230	15.673386	0.5200441	30.13857
ENSSHAG00000024117.1 NA	6007.3116	7.935832	0.2646438	29.98684
ENSSHAG00000031627.1 PRX	228602.7417	-10.824218	0.3631920	-29.80302
ENSSHAG00000007782.2 CA8	2062.4938	8.726159	0.2990458	29.18000
ENSSHAG00000018310.2 SLIT2	44695.3018	-14.414172	0.4983908	-28.92142
ENSSHAG00000021464.1 NA	11908.1052	-8.384005	0.2916414	-28.74765
ENSSHAG00000010824.2 MPDZ	12727.1471	-5.429957	0.1920191	-28.27821
ENSSHAG00000000643.2 STK31	4182.6920	-9.537041	0.3373093	-28.27387
ENSSHAG00000023142.1 NA	63163.2147	-2.474963	0.0903962	-27.37905
ENSSHAG00000025566.1 ABLIM1	7772.7034	5.814237	0.2129981	27.29713
ENSSHAG00000021789.1 NA	1860.1298	8.577176	0.3143469	27.28571
ENSSHAG00000024021.1 NA	3895.6967	5.733097	0.2107928	27.19778
ENSSHAG00000030292.1 NA	1933.0281	-5.410220	0.1994706	-27.12289
ENSSHAG00000031074.1 NEXMIF	4117.0924	-8.404272	0.3123135	-26.90973
ENSSHAG00000008051.2 TEX14	4625.9527	-3.525105	0.1326009	-26.58431
ENSSHAG00000004947.2 THBS2	73193.5959	13.509919	0.5132476	26.32242
ENSSHAG00000022172.1 HAPLN2	11123.1806	-7.382997	0.2822149	-26.16090
ENSSHAG00000026637.1 NA	2213.7312	-5.071669	0.1942619	-26.10738
ENSSHAG00000005380.2 NA	3108.7669	10.228790	0.3940820	25.95599
ENSSHAG00000018469.2 PDE10A	3854.3231	-5.647349	0.2179279	-25.91384
ENSSHAG00000011809.2 NA	16636.2524	3.149445	0.1232734	25.54845
ENSSHAG00000029638.1 NA	18764.0415	-6.179495	0.2425972	-25.47224
ENSSHAG00000016252.2 LOXL2	26417.8025	8.878268	0.3521803	25.20944
ENSSHAG00000001552.2 TDRD5	4168.0657	-5.549233	0.2251434	-24.64755
ENSSHAG00000006728.2 NEFM	7555.0072	-10.855938	0.4479166	-24.23652
ENSSHAG00000013722.2 NA	1575.4562	-4.678030	0.1936782	-24.15363
ENSSHAG00000014886.2 CYP2R1	2488.5667	-7.465866	0.3091695	-24.14813
ENSSHAG00000031108.1 NA	964.2578	-7.703956	0.3207488	-24.01866
ENSSHAG00000005492.2 NA	1659.5069	-8.999424	0.3748897	-24.00553
ENSSHAG00000027513.1 NA	1802.2901	-5.271118	0.2200576	-23.95335
ENSSHAG00000000893.2 NA	2287.3973	-4.736996	0.1987138	-23.83828
ENSSHAG00000005182.2 SERPINE1	21367.7733	11.866757	0.5041184	23.53962
ENSSHAG00000010102.2 UNC5A	1472.0133	-8.985720	0.3860205	-23.27783
ENSSHAG00000031142.1 TPPP	2664.1676	-5.954920	0.2565656	-23.21012
ENSSHAG00000022666.1 NA	1669.6189	6.414065	0.2781309	23.06132
ENSSHAG00000010515.2 NA	2972.5658	9.167129	0.3999684	22.91964
ENSSHAG00000010902.2 SMARCD3	7489.5510	-2.642803	0.1163671	-22.71092

write.table(dge2,file="dge2.tsv",sep="\t")

# heatmap
mx <- sig[,7:ncol(sig)]
mx <- head(mx,30)
colfunc <- colorRampPalette(c("blue", "white", "red"))
heatmap.2(as.matrix(mx),trace="none",scale="row",
  col=colfunc(25),ColSideColors=cols[1:6],mar=c(8,14),
  cexRow=0.7,cexCol=1)

Compare cell lines against one another

First 4906 vs RV, 4906 vs SN and 4906 vs TD549. Then 1426 vs RV, 1426 vs SN and 1426 vs TD549. Finally C5065 vs RV, C5065 vs SN and C5065 vs TD549.

yx <- vector(mode = "list", length = 9)
yx[[1]] <- select(y1, "4906_1", "4906_2", "4906_3", "RV_1", "RV_2", "RV_3")
yx[[2]] <- select(y1, "4906_1", "4906_2", "4906_3", "SN_1", "SN_2", "SN_3")
yx[[3]] <- select(y1, "4906_1", "4906_2", "4906_3", "TD549_1", "TD549_2", "TD549_3")
yx[[4]] <- select(y1, "1426_1", "1426_2", "1426_3", "RV_1", "RV_2", "RV_3")
yx[[5]] <- select(y1, "1426_1", "1426_2", "1426_3", "SN_1", "SN_2", "SN_3")
yx[[6]] <- select(y1, "1426_1", "1426_2", "1426_3", "TD549_1", "TD549_2", "TD549_3")
yx[[7]] <- select(y1, "C5065_1", "C5065_2", "C5065_3", "RV_1", "RV_2", "RV_3")
yx[[8]] <- select(y1, "C5065_1", "C5065_2", "C5065_3", "SN_1", "SN_2", "SN_3")
yx[[9]] <- select(y1, "C5065_1", "C5065_2", "C5065_3", "TD549_1", "TD549_2", "TD549_3")

ssx <- vector(mode = "list", length = 9)
dds <- vector(mode = "list", length = 9)
res <- vector(mode = "list", length = 9)
z <- vector(mode = "list", length = 9)
vsd <- vector(mode = "list", length = 9)
zz <- vector(mode = "list", length = 9)
dgex <- vector(mode = "list", length = 9)
sigx <- vector(mode = "list", length = 9)

for(i in 1:9){
  print(paste("Processing comparison: ", i, " on 9"))
  ssx[[i]] <- as.data.frame(colnames(yx[[i]]))
  colnames(ssx[[i]]) <- "clone"
  ssx[[i]]$DFT <- factor(c("DFT1","DFT1","DFT1","DFT2","DFT2","DFT2"))
  dds[[i]] <- DESeqDataSetFromMatrix(countData = yx[[i]], colData = ssx[[i]],
                                     design = ~ DFT)
  res[[i]] <- DESeq(dds[[i]], quiet=TRUE)
  z[[i]] <- results(res[[i]])
  vsd[[i]] <- vst(dds[[i]])
  zz[[i]] <- cbind(as.data.frame(z[[i]]), assay(vsd[[i]]))
  dgex[[i]] <- as.data.frame(zz[[i]][order(zz[[i]]$pvalue),])
  sigx[[i]] <- subset(dgex[[i]], padj < 0.05)
  }

## [1] "Processing comparison:  1  on 9"

## converting counts to integer mode

## [1] "Processing comparison:  2  on 9"

## converting counts to integer mode

## [1] "Processing comparison:  3  on 9"

## converting counts to integer mode

## [1] "Processing comparison:  4  on 9"

## converting counts to integer mode

## [1] "Processing comparison:  5  on 9"

## converting counts to integer mode

## [1] "Processing comparison:  6  on 9"

## converting counts to integer mode

## [1] "Processing comparison:  7  on 9"

## converting counts to integer mode

## [1] "Processing comparison:  8  on 9"

## converting counts to integer mode

## [1] "Processing comparison:  9  on 9"

## converting counts to integer mode

plotnames <- c("4906 vs RV", "4906 vs SN", "4906 vs TD549",
               "1426 vs RV", "1426 vs SN", "1426 vs TD549",
               "C5065 vs RV", "C5065 vs SN", "C5065 vs TD549")

for(i in 1:9){
  maplot(dgex[[i]],plotnames[i])
  make_volcano(dgex[[i]],plotnames[i])
  
  sigx[[i]][1:50,1:6] %>%
  kbl(caption=plotnames[i]) %>%
  kable_paper("hover", full_width = F)
  
  write.table(dgex[[i]],file=paste(plotnames[i],".tsv"),sep="\t")
  
  mx <- sigx[[i]][,7:ncol(sigx[[i]])]
  mx <- head(mx,30)
  colfunc <- colorRampPalette(c("blue", "white", "red"))
  heatmap.2(as.matrix(mx),trace="none",scale="row",
  col=colfunc(25),ColSideColors=cols[1:6],mar=c(8,14),
  cexRow=0.7,cexCol=1)
}

Enrichment analysis

Need to get human homologs of these genes. These were obtained from Ensembl v109 biomart (website).

rownames(dge1) <- sapply(strsplit(rownames(dge1),"\\."),"[[",1)
rownames(dge2) <- sapply(strsplit(rownames(dge2),"\\."),"[[",1)

for(i in 1:9){
  rownames(dgex[[i]]) <- sapply(strsplit(rownames(dgex[[i]]),"\\."),"[[",1)
}


hm2 <- hm[hm$Tasmanian.devil.gene.stable.ID != "",]

gt <- hm2[,2:3]
length(unique(gt$Tasmanian.devil.gene.stable.ID))

## [1] 16979

length(unique(gt$Gene.name))

## [1] 16646

for(i in 1:length(gt$Gene.name)){
  if(gt$Gene.name[i]==""){gt$Gene.name[i] <- gt$Tasmanian.devil.gene.stable.ID[i]}
}

Now run mitch for DGE2. There will be a report generated that has more details on the enrichment analysis. The pathways are sourced from Reactome 7th July 2023.

genesets <- gmt_import("../ref/ReactomePathways_2023-07-14.gmt")

m2 <- mitch_import(dge2, DEtype="deseq2", geneTable=gt)

## The input is a single dataframe; one contrast only. Converting
##         it to a list for you.

## Note: Mean no. genes in input = 17468

## Note: no. genes in output = 13563

## Note: estimated proportion of input genes in output = 0.776

head(m2)

##                    x
## A2M     -13.13626596
## A4GALT   -0.07278225
## AAAS     -4.33508400
## AACS      2.13961007
## AADAC    -2.90578338
## AADACL3   1.15402966

res2 <- mitch_calc(m2, genesets, priority="effect",cores=16)

## Note: Enrichments with large effect sizes may not be
##             statistically significant.

if ( !file.exists("mitch2.html") ) {
  mitch_report(res2, "mitch2.html")
}

top <- subset(res2$enrichment_result,p.adjustANOVA<0.05)

topup <- head(subset(top,s.dist>0),20)

topup %>%
  kbl(caption="Top 20 upregulated pathways") %>%
  kable_paper("hover", full_width = F)

Top 20 upregulated pathways
	set	setSize	pANOVA	s.dist	p.adjustANOVA
687	Metallothioneins bind metals	10	0.0000122	0.7987899	0.0013275
1075	Response to metal ions	13	0.0000082	0.7143457	0.0009848
229	Crosslinking of collagen fibrils	16	0.0001180	0.5559533	0.0070965
42	Activation of Matrix Metalloproteinases	22	0.0000129	0.5372404	0.0013275
207	Collagen degradation	47	0.0000000	0.5181408	0.0000004
157	Caspase activation via Death Receptors in the presence of ligand	12	0.0019500	0.5164748	0.0460725
456	GRB2:SOS provides linkage to MAPK signaling for Integrins	12	0.0019840	0.5156200	0.0460725
206	Collagen chain trimerization	30	0.0000013	0.5100175	0.0001749
185	Cholesterol biosynthesis	24	0.0000356	0.4875237	0.0027071
205	Collagen biosynthesis and modifying enzymes	50	0.0000000	0.4811012	0.0000010
171	Cell-extracellular matrix interactions	18	0.0004405	0.4785202	0.0205059
572	Integrin cell surface interactions	63	0.0000000	0.4337449	0.0000008
337	ECM proteoglycans	61	0.0000000	0.4178543	0.0000030
1147	Signal transduction by L1	21	0.0016068	0.3977221	0.0435412
208	Collagen formation	68	0.0000001	0.3790227	0.0000095
573	Integrin signaling	23	0.0016662	0.3787875	0.0435412
1247	Syndecan interactions	24	0.0014009	0.3767696	0.0435412
93	Assembly of collagen fibrils and other multimeric structures	47	0.0000244	0.3559658	0.0021982
183	Chemokine receptors bind chemokines	27	0.0015493	0.3520160	0.0435412
287	Degradation of the extracellular matrix	107	0.0000000	0.3432544	0.0000004

topdn <- head(subset(top,s.dist<0),20)

topdn %>%
  kbl(caption="Top 20 downregulated pathways") %>%
  kable_paper("hover", full_width = F)

Top 20 downregulated pathways
	set	setSize	pANOVA	s.dist	p.adjustANOVA
286	Degradation of cysteine and homocysteine	11	0.0003114	-0.6278174	0.0160496
1243	Sulfur amino acid metabolism	23	0.0016897	-0.3782930	0.0435412
1359	Transmission across Chemical Synapses	195	0.0009484	-0.1376402	0.0333775

top2 <- rbind(head(topup,10),head(topdn,10))
top2 <- top2[order(-top2$s.dist),]
par(mar=c(5,15,5,2))

barplot(rev(abs(top2$s.dist)),horiz=TRUE,las=1,names.arg=rev(top2$set),
  main="Top REACTOME Pathways",cex.names=0.6,xlab="Enrichment Score",
  col=c(rep("blue",nrow(head(topdn,10))),rep("red",nrow(head(topup,10)))),
  xlim=c(0,0.8))

Run mitch with curated metabolic pathways.

KEGG gene sets under the “Metabolism” hierarchy were sourced from MSigDb in August 2023.

genesets2 <- gmt_import("../ref/gsea/gsea_KEGG.gmt")

res2 <- mitch_calc(m2, genesets2, priority="effect",cores=16)

## Note: Enrichments with large effect sizes may not be
##             statistically significant.

if ( !file.exists("mitch2.html") ) {
  mitch_report(res2, "mitch2.html")
}

top <- subset(res2$enrichment_result,p.adjustANOVA<0.05)

topup <- head(subset(top,s.dist>0),20)

topup %>%
  kbl(caption="Top 20 upregulated pathways") %>%
  kable_paper("hover", full_width = F)

Top 20 upregulated pathways
	set	setSize	pANOVA	s.dist	p.adjustANOVA
51	KEGG_STEROID_BIOSYNTHESIS	14	0.0004408	0.542486	0.0251253

topdn <- head(subset(top,s.dist<0),20)

topdn %>%
  kbl(caption="Top 20 downregulated pathways") %>%
  kable_paper("hover", full_width = F)

Top 20 downregulated pathways
set	setSize	pANOVA	s.dist	p.adjustANOVA

top2 <- rbind(head(topup,10),head(topdn,10))
top2 <- top2[order(-top2$s.dist),]
par(mar=c(5,15,5,2))

barplot(rev(abs(top2$s.dist)),horiz=TRUE,las=1,names.arg=rev(top2$set),
  main="Top KEGG Pathways",cex.names=0.6,xlab="Enrichment Score",
  col=c(rep("blue",nrow(head(topdn,10))),rep("red",nrow(head(topup,10)))),
  xlim=c(0,0.8))

Same with Reactome this time. Reactome gene sets of the metabolism hierarchy were sourced from the Reactome website in August 2023.

hierarchy <- read.table("../ref/gsea/ReactomePathwaysRelation.txt", sep ="\t", header = F, dec =".")
pathways <- read.delim("../ref/gsea/ReactomePathways.txt", header=FALSE)

hierarchy <- hierarchy[grep("HSA", hierarchy$V1), ] # keep only homo sapiens data
pathways <- pathways[grep("HSA", pathways$V1), 1:2] 

metabo_pathways <- "R-HSA-1430728" # reactome code for metabolism 

# this is very bad coding - but it works
# why 325? that is when while loop crashes because all branches of the hierarchy have been looped over
# will have to change this if Reactome data is updated
while(length(metabo_pathways) < 325){
  for(i in 1:length(metabo_pathways)){
    metabo_pathways <- unique(c(metabo_pathways,filter(hierarchy, V1 == metabo_pathways[i])$V2))
    }
  }

mpathways <- c()
for(i in 1:length(metabo_pathways)){mpathways <- c(mpathways,filter(pathways, V1 == metabo_pathways[i])$V2)}

library(magrittr)

## 
## Attaching package: 'magrittr'

## The following object is masked from 'package:tidyr':
## 
##     extract

## The following object is masked from 'package:GenomicRanges':
## 
##     subtract

genesets3 <- extract(genesets, mpathways)
# remove empty sets (ie hierarchy exists but no gene set is associated to it)
genesets3 <- genesets3[lapply(genesets3,length)>0]

m3 <- mitch_import(dge2, DEtype="deseq2", geneTable=gt)

## The input is a single dataframe; one contrast only. Converting
##         it to a list for you.

## Note: Mean no. genes in input = 17468

## Note: no. genes in output = 13563

## Note: estimated proportion of input genes in output = 0.776

res3 <- mitch_calc(m3, genesets3, priority="effect", cores=16)

## Note: Enrichments with large effect sizes may not be
##             statistically significant.

if ( !file.exists("mitch3.html") ) {
  mitch_report(res3, "mitch3.html")
}

top <- subset(res3$enrichment_result,p.adjustANOVA<0.05)

topup <- head(subset(top,s.dist>0),20)

topup %>%
  kbl(caption="Top 20 upregulated metabolic pathways") %>%
  kable_paper("hover", full_width = F)

Top 20 upregulated metabolic pathways
	set	setSize	pANOVA	s.dist	p.adjustANOVA
77	Cholesterol biosynthesis	24	3.56e-05	0.4875237	0.0048833

topdn <- head(subset(top,s.dist<0),20)

topdn %>%
  kbl(caption="Top 20 downregulated metabolic pathways") %>%
  kable_paper("hover", full_width = F)

Top 20 downregulated metabolic pathways
	set	setSize	pANOVA	s.dist	p.adjustANOVA
88	Degradation of cysteine and homocysteine	11	0.0003114	-0.6278174	0.0213327

top2 <- rbind(head(topup,10),head(topdn,10))
top2 <- top2[order(-top2$s.dist),]
par(mar=c(5,15,5,2))

barplot(rev(abs(top2$s.dist)),horiz=TRUE,las=1,names.arg=rev(top2$set),
  main="Top REACTOME Metabolic Pathways",cex.names=0.6,xlab="Enrichment Score",
  col=c(rep("blue",nrow(head(topdn,10))),rep("red",nrow(head(topup,10)))),
  xlim=c(0,0.8))

# see https://www.youtube.com/watch?v=ht1r34-ifVI for reference

# find corresponding human genes to devil ENS IDs
dge3 <- dge2 
dge3$ENSSHAG <- rownames(dge3)
hm3 <- data.frame(ENSSHAG=hm2[,2],geneID=hm2[,3])
dge3 <- left_join(dge3, hm3, by="ENSSHAG")
dge3 <- unite(dge3,"ENSSHAG_geneID", ENSSHAG:geneID, sep=" ", remove=F)
dge3 <- unique(dge3)
rownames(dge3) <- dge3$ENSSHAG_geneID

# only plot significant DEGs
dge3sig <- filter(dge3, padj < 0.05)
dge3sig <- dge3sig[order(dge3sig$log2FoldChange, decreasing=TRUE),]

mx <- dge3sig[,7:ncol(dge3sig)] # get normalised counts
 
mx.scaled <- t(apply(mx[,1:6], 1, scale)) # center and scale each column (Z-score)
colnames(mx.scaled) <- colnames(mx[,1:6])
log2FC <- as.matrix(dge3sig$log2FoldChange)
rownames(log2FC) <- rownames(dge3sig)
colnames(log2FC) <- "logFC"
mean <- as.matrix(dge3sig$baseMean)
rownames(mean) <- rownames(dge3sig)
colnames(mean) <- "AveExpr"

col_logFC <- colorRamp2(c(min(log2FC),0,max(log2FC)), c("blue","white","red"))
col_AveExpr <- colorRamp2(c(quantile(mean)[1], quantile(mean)[4]), c("white","red"))

ha <- HeatmapAnnotation(summary=anno_summary(gp=gpar(fill=2),height=unit(2, "cm")))

# ----- Top DEGs in all KEGG metabolic pathways ------

metabo <- c()
for(i in 1:length(genesets2)){metabo <- c(metabo,genesets2[[i]])}
metabo <- unique(metabo)

temp <- c()
for(i in 1:length(metabo)){temp <- rbind(temp,filter(hm3, geneID==metabo[i]))}
metabolism <- c(t(unite(temp,"ENSSHAG_geneID", ENSSHAG:geneID, sep=" ")))

met <- subset(mx.scaled, rownames(mx.scaled) %in% metabolism)
metFC <- subset(log2FC, rownames(log2FC) %in% metabolism)
metAE <- subset(mean, rownames(mean) %in% metabolism)

# top 20 up & down
met40 <- rbind(met[1:20,],met[542:561,])
met40FC <- as.matrix(c(metFC[1:20],metFC[542:561,]))
rownames(met40FC) <- rownames(met40)
colnames(met40FC) <- "logFC"
met40AE <- as.matrix(c(metAE[1:20],metAE[542:561,]))
rownames(met40AE) <- rownames(met40)
colnames(met40AE) <- "AveExpr"

h1 <- Heatmap(met40, cluster_rows = F,
              column_labels = colnames(met40), name="Z-score",
              cluster_columns = T)
h2 <- Heatmap(met40FC, row_labels = rownames(met40FC), row_names_gp = gpar(fontsize = 6),
              cluster_rows = F, name="logFC", top_annotation = ha, col = col_logFC,
              cell_fun = function(j, i, x, y, w, h, col) { # add text to each grid
                grid.text(round(as.numeric(met40FC[i, j],2)), x, y,gp=gpar(fontsize=6))
              })
h3 <- Heatmap(met40AE, row_labels = rownames(met40AE), row_names_gp = gpar(fontsize = 6),
              cluster_rows = F, name = "AveExpr", col=col_AveExpr,
              cell_fun = function(j, i, x, y, w, h, col) { # add text to each grid
                grid.text(round(as.numeric(met40AE[i, j],2)), x, y,gp=gpar(fontsize=6))},
              column_title = "TOP 40 UP & DOWN - KEGG METABOLISM")

h <- h1 + h2 + h3
h

# ----- Top DEGs in all Reactome metabolic pathways ------

metabo <- c()
for(i in 1:length(genesets3)){metabo <- c(metabo,genesets3[[i]])}
metabo <- unique(metabo)

temp <- c()
for(i in 1:length(metabo)){temp <- rbind(temp,filter(hm3, geneID==metabo[i]))}
metabolism <- c(t(unite(temp,"ENSSHAG_geneID", ENSSHAG:geneID, sep=" ")))

met <- subset(mx.scaled, rownames(mx.scaled) %in% metabolism)
metFC <- subset(log2FC, rownames(log2FC) %in% metabolism)
metAE <- subset(mean, rownames(mean) %in% metabolism)

# top 20 up & down
met40 <- rbind(met[1:20,],met[784:803,])
met40FC <- as.matrix(c(metFC[1:20],metFC[784:803,]))
rownames(met40FC) <- rownames(met40)
colnames(met40FC) <- "logFC"
met40AE <- as.matrix(c(metAE[1:20],metAE[784:803,]))
rownames(met40AE) <- rownames(met40)
colnames(met40AE) <- "AveExpr"

h1 <- Heatmap(met40, cluster_rows = F,
              column_labels = colnames(met40), name="Z-score",
              cluster_columns = T)
h2 <- Heatmap(met40FC, row_labels = rownames(met40FC), row_names_gp = gpar(fontsize = 6),
              cluster_rows = F, name="logFC", top_annotation = ha, col = col_logFC,
              cell_fun = function(j, i, x, y, w, h, col) { # add text to each grid
                grid.text(round(as.numeric(met40FC[i, j],2)), x, y,gp=gpar(fontsize=6))
              })
h3 <- Heatmap(met40AE, row_labels = rownames(met40AE), row_names_gp = gpar(fontsize = 6),
              cluster_rows = F, name = "AveExpr", col=col_AveExpr,
              cell_fun = function(j, i, x, y, w, h, col) { # add text to each grid
                grid.text(round(as.numeric(met40AE[i, j],2)), x, y,gp=gpar(fontsize=6))},
              column_title = "TOP 40 UP & DOWN - REACTOME METABOLISM")

h <- h1 + h2 + h3
h

# ----- KEGG steroid biosynthesis -----

kegg <- genesets2[["KEGG_STEROID_BIOSYNTHESIS"]] 
temp <- c()
for(i in 1:length(kegg)){temp <- rbind(temp,filter(hm3, geneID==kegg[i]))}
glycolysis <- c(t(unite(temp,"ENSSHAG_geneID", ENSSHAG:geneID, sep=" ")))

glyc <- subset(mx.scaled, rownames(mx.scaled) %in% glycolysis)
glycFC <- subset(log2FC, rownames(log2FC) %in% glycolysis)
glycAE <- subset(mean, rownames(mean) %in% glycolysis)

h1 <- Heatmap(glyc, cluster_rows = F,
              column_labels = colnames(glyc), name="Z-score",
              cluster_columns = T)
h2 <- Heatmap(glycFC, row_labels = rownames(glycFC), row_names_gp = gpar(fontsize = 6),
            cluster_rows = F, name="logFC", top_annotation = ha, col = col_logFC,
            cell_fun = function(j, i, x, y, w, h, col) { # add text to each grid
              grid.text(round(as.numeric(glycFC[i, j],2)), x, y,gp=gpar(fontsize=6))
            })
h3 <- Heatmap(glycAE, row_labels = rownames(glycAE), row_names_gp = gpar(fontsize = 6),
            cluster_rows = F, name = "AveExpr", col=col_AveExpr,
            cell_fun = function(j, i, x, y, w, h, col) { # add text to each grid
              grid.text(round(as.numeric(glycAE[i, j],2)), x, y,gp=gpar(fontsize=6))},
            column_title = "KEGG Steroid biosynthesis")

h <- h1 + h2 + h3
h

# ----- Reactome Cholesterol biosynthesis -----
cholesterol <- genesets3[["Cholesterol biosynthesis"]]
temp <- c()
for(i in 1:length(cholesterol)){temp <- rbind(temp,filter(hm3, geneID==cholesterol[i]))}
cholesterol <- c(t(unite(temp,"ENSSHAG_geneID", ENSSHAG:geneID, sep=" ")))

chol <- subset(mx.scaled, rownames(mx.scaled) %in% cholesterol)
cholFC <- subset(log2FC, rownames(log2FC) %in% cholesterol)
cholAE <- subset(mean, rownames(mean) %in% cholesterol)

h1 <- Heatmap(chol, cluster_rows = F,
              column_labels = colnames(chol), name="Z-score",
              cluster_columns = T)
h2 <- Heatmap(cholFC, row_labels = rownames(cholFC), row_names_gp = gpar(fontsize = 6),
            cluster_rows = F, name="logFC", top_annotation = ha, col = col_logFC,
            cell_fun = function(j, i, x, y, w, h, col) { # add text to each grid
              grid.text(round(as.numeric(cholFC[i, j],2)), x, y,gp=gpar(fontsize=6))
            })
h3 <- Heatmap(cholAE, row_labels = rownames(cholAE), row_names_gp = gpar(fontsize = 6),
            cluster_rows = F, name = "AveExpr", col=col_AveExpr,
            cell_fun = function(j, i, x, y, w, h, col) { # add text to each grid
              grid.text(round(as.numeric(cholAE[i, j],2)), x, y,gp=gpar(fontsize=6))},
            column_title = "Reactome Cholesterol biosynthesis")

h <- h1 + h2 + h3
h

# ----- Reactome Degradation of cysteine and homocysteine -----
cholesterol <- genesets3[["Degradation of cysteine and homocysteine"]]
temp <- c()
for(i in 1:length(cholesterol)){temp <- rbind(temp,filter(hm3, geneID==cholesterol[i]))}
cholesterol <- c(t(unite(temp,"ENSSHAG_geneID", ENSSHAG:geneID, sep=" ")))

chol <- subset(mx.scaled, rownames(mx.scaled) %in% cholesterol)
cholFC <- subset(log2FC, rownames(log2FC) %in% cholesterol)
cholAE <- subset(mean, rownames(mean) %in% cholesterol)

h1 <- Heatmap(chol, cluster_rows = F,
              column_labels = colnames(chol), name="Z-score",
              cluster_columns = T)
h2 <- Heatmap(cholFC, row_labels = rownames(cholFC), row_names_gp = gpar(fontsize = 6),
            cluster_rows = F, name="logFC", top_annotation = ha, col = col_logFC,
            cell_fun = function(j, i, x, y, w, h, col) { # add text to each grid
              grid.text(round(as.numeric(cholFC[i, j],2)), x, y,gp=gpar(fontsize=6))
            })
h3 <- Heatmap(cholAE, row_labels = rownames(cholAE), row_names_gp = gpar(fontsize = 6),
            cluster_rows = F, name = "AveExpr", col=col_AveExpr,
            cell_fun = function(j, i, x, y, w, h, col) { # add text to each grid
              grid.text(round(as.numeric(cholAE[i, j],2)), x, y,gp=gpar(fontsize=6))},
            column_title = "Reactome Degradation of cysteine and homocysteine")

h <- h1 + h2 + h3
h

Venn diagrams data of all DEGs, and DEGs linked to metabolism.

alldegs <- rownames(dge3sig)
updegs <- rownames(filter(dge3sig, log2FoldChange > 0))
downdegs <- rownames(filter(dge3sig, log2FoldChange < 0))
metdegs <- rownames(met)

venn_clines <- list(alldegs, updegs, downdegs, metdegs)
saveRDS(venn_clines, file = "venn_clines.rds")

# venn.diagram(
#   x = list(alldegs, updegs, downdegs),
#   category.names = c("All DEGs" , "Up", "Down"),
#   filename = 'updown_venn_diagramm.png',
#   output=TRUE
# )

Pathview to visualise metabolic pathways.

pathFC <- log2FC
rownames(pathFC) <- str_split(rownames(pathFC), " ", simplify=T)[,2]

# cholesterol
pathview(gene.data=pathFC, pathway.id="hsa04979", species="hsa",gene.idtype="SYMBOL", node.sum="sum",
         limit=list(gene=c(-10,10), cpd=1), bins=list(gene=20, cpd=1),
         low="blue",mid="white",high="red",
         same.layer=FALSE)

## 'select()' returned 1:many mapping between keys and columns

## [1] "Note: 15 of 6882 unique input IDs unmapped."

## 'select()' returned 1:1 mapping between keys and columns

## Info: Working in directory /mnt/data/mdz/projects/dftd_RNAseq_annelise/dge

## Info: Writing image file hsa04979.pathview.png

# cysteine and methionine
pathview(gene.data=pathFC, pathway.id="hsa00270", species="hsa",gene.idtype="SYMBOL", node.sum="sum",
         limit=list(gene=c(-10,10), cpd=1), bins=list(gene=20, cpd=1),
         low="blue",mid="white",high="red",
         same.layer=FALSE)

## 'select()' returned 1:many mapping between keys and columns

## [1] "Note: 15 of 6882 unique input IDs unmapped."

## 'select()' returned 1:1 mapping between keys and columns

## Info: Working in directory /mnt/data/mdz/projects/dftd_RNAseq_annelise/dge

## Info: Writing image file hsa00270.pathview.png

# steroid biosynthesis
pathview(gene.data=pathFC, pathway.id="hsa00100", species="hsa",gene.idtype="SYMBOL", node.sum="sum",
         limit=list(gene=c(-10,10), cpd=1), bins=list(gene=20, cpd=1),
         low="blue",mid="white",high="red",
         same.layer=FALSE)

## 'select()' returned 1:many mapping between keys and columns

## [1] "Note: 15 of 6882 unique input IDs unmapped."

## 'select()' returned 1:1 mapping between keys and columns

## Info: Working in directory /mnt/data/mdz/projects/dftd_RNAseq_annelise/dge

## Info: Writing image file hsa00100.pathview.png

# library(SBGNview)
# input.pathways <- findPathways("Cholesterol biosynthesis")
# SBGNview.obj <- SBGNview(
#           gene.data = pathFC, 
#           gene.id.type = "SYMBOL",
#           input.sbgn = input.pathways[7,]$pathway.id,
#           output.file = "quick.start", 
#           output.formats =  c("png")
#           ) 
# print(SBGNview.obj)

Run mitch on cell line pairwise comparisons.

mx <- vector(mode = "list", length = 9)
resx <- vector(mode = "list", length = 9)

for(i in 1:9){
  
  mx[[i]] <- mitch_import(dgex[[i]], DEtype="deseq2",geneTable=gt)
  resx[[i]] <- mitch_calc(mx[[i]], genesets3, priority="effect",cores=16)

  if ( !file.exists(paste(plotnames[i],"_mitch.html")) ) {
    mitch_report(resx[[i]], paste(plotnames[i],"_mitch.html"))
  }
  
  top <- subset(resx[[i]]$enrichment_result, p.adjustANOVA<0.05)
  
  topup <- head(subset(top,s.dist>0),20)
  
  topup %>%
  kbl(caption=paste("Top 20 upregulated metabolic pathways: ", plotnames[i])) %>%
  kable_paper("hover", full_width = F)
  
  topdn <- head(subset(top,s.dist<0),20)
  
  topdn %>%
    kbl(caption=paste("Top 20 downregulated metabolic pathways: ", plotnames[i])) %>%
    kable_paper("hover", full_width = F)
  
  top2 <- rbind(head(topup,10),head(topdn,10))
  top2 <- top2[order(-top2$s.dist),]
  par(mar=c(5,15,5,2))
  
  barplot(rev(abs(top2$s.dist)),horiz=TRUE,las=1,names.arg=rev(top2$set),
          main=paste(plotnames[i]),
          cex.names=0.6,xlab="Enrichment Score",
          col=c(rep("blue",nrow(head(topdn,10))),rep("red",nrow(head(topup,10)))),
          xlim=c(0,0.8))
  
}

## The input is a single dataframe; one contrast only. Converting
##         it to a list for you.

## Note: Mean no. genes in input = 16009

## Note: no. genes in output = 12692

## Note: estimated proportion of input genes in output = 0.793

## Note: Enrichments with large effect sizes may not be
##             statistically significant.

## The input is a single dataframe; one contrast only. Converting
##         it to a list for you.

## Note: Mean no. genes in input = 16009

## Note: no. genes in output = 12691

## Note: estimated proportion of input genes in output = 0.793

## Note: Enrichments with large effect sizes may not be
##             statistically significant.

## The input is a single dataframe; one contrast only. Converting
##         it to a list for you.

## Note: Mean no. genes in input = 16009

## Note: no. genes in output = 12691

## Note: estimated proportion of input genes in output = 0.793

## Note: Enrichments with large effect sizes may not be
##             statistically significant.

## The input is a single dataframe; one contrast only. Converting
##         it to a list for you.

## Note: Mean no. genes in input = 16009

## Note: no. genes in output = 12691

## Note: estimated proportion of input genes in output = 0.793

## Note: Enrichments with large effect sizes may not be
##             statistically significant.

## The input is a single dataframe; one contrast only. Converting
##         it to a list for you.

## Note: Mean no. genes in input = 16009

## Note: no. genes in output = 12691

## Note: estimated proportion of input genes in output = 0.793

## Note: Enrichments with large effect sizes may not be
##             statistically significant.

## The input is a single dataframe; one contrast only. Converting
##         it to a list for you.

## Note: Mean no. genes in input = 16009

## Note: no. genes in output = 12690

## Note: estimated proportion of input genes in output = 0.793

## Note: Enrichments with large effect sizes may not be
##             statistically significant.

## The input is a single dataframe; one contrast only. Converting
##         it to a list for you.

## Note: Mean no. genes in input = 16009

## Note: no. genes in output = 12689

## Note: estimated proportion of input genes in output = 0.793

## Note: Enrichments with large effect sizes may not be
##             statistically significant.

## The input is a single dataframe; one contrast only. Converting
##         it to a list for you.

## Note: Mean no. genes in input = 16009

## Note: no. genes in output = 12691

## Note: estimated proportion of input genes in output = 0.793

## Note: Enrichments with large effect sizes may not be
##             statistically significant.

## The input is a single dataframe; one contrast only. Converting
##         it to a list for you.

## Note: Mean no. genes in input = 16009

## Note: no. genes in output = 12689

## Note: estimated proportion of input genes in output = 0.793

## Note: Enrichments with large effect sizes may not be
##             statistically significant.

Make heatmaps for pairwise cell line comparisons.

# ----- Fatty acid metabolism heatmaps -----
# fatty1 <- genesets2[["REACTOME_FATTY_ACID_METABOLISM"]]
# fatty2 <- genesets2[["HALLMARK_FATTY_ACID_METABOLISM"]]
# fatty3 <- genesets2[["GOBP_FATTY_ACID_BIOSYNTHETIC_PROCESS"]]
# fatty4 <-genesets2[["KEGG_FATTY_ACID_METABOLISM"]]
# fatty <- list(fatty1=fatty1,fatty2=fatty2,fatty3=fatty3,fatty4=fatty4)
# title <- c("REACTOME_FATTY_ACID_METABOLISM","HALLMARK_FATTY_ACID_METABOLISM","GOBP_FATTY_ACID_BIOSYNTHETIC_PROCESS","KEGG_FATTY_ACID_METABOLISM")
# 
# for(i in 1:length(fatty)){
#   temp <- c()
#   for(j in 1:length(fatty[[i]])){temp <- rbind(temp,filter(hm3, geneID==fatty[[i]][j]))}
#   fattyx <- c(t(unite(temp,"ENSSHAG_geneID", ENSSHAG:geneID, sep=" ")))
#   fat <- subset(mx.scaled, rownames(mx.scaled) %in% fattyx)
#   fatFC <- subset(log2FC, rownames(log2FC) %in% fattyx)
#   fatAE <- subset(mean, rownames(mean) %in% fattyx)
#   
#   h1 <- Heatmap(fat, cluster_rows = F,
#                 column_labels = colnames(fat), name="Z-score",
#                 cluster_columns = T)
#   h2 <- Heatmap(fatFC, row_labels = rownames(fatFC), row_names_gp = gpar(fontsize = 6),
#               cluster_rows = F, name="logFC", top_annotation = ha, col = col_logFC,
#               cell_fun = function(j, i, x, y, w, h, col) { # add text to each grid
#                 grid.text(round(as.numeric(fatFC[i, j],2)), x, y, gp=gpar(fontsize=6))
#               })
#   h3 <- Heatmap(fatAE, row_labels = rownames(fatAE), row_names_gp = gpar(fontsize = 6),
#               cluster_rows = F, name = "AveExpr", col=col_AveExpr,
#               cell_fun = function(j, i, x, y, w, h, col) { # add text to each grid
#                 grid.text(round(as.numeric(fatAE[i, j],2)), x, y, gp=gpar(fontsize=6))},
#               column_title=title[i])
#   
#   name <- paste("q",i,sep="")
#   assign(name, h1 + h2 + h3)
# }
# 
# q1
# q2
# q3
# q4