Source codes: https://github.com/Anne-LiseGerard/dftd_rnaseq

suppressPackageStartupMessages({
    library("reshape2")
    library("gplots")
    library("DESeq2")
    library("mitch")
    library("limma")
    library("kableExtra")
    library("dplyr")
    library("tidyr")
    library("ComplexHeatmap")
    library("RColorBrewer")
    library("circlize")
    library("pathview")
    library("stringr")
    library("grid")
    library("VennDiagram")
})

Background

The goal of our study is to compare the expression of genes related to metabolism of DFT1 cell lines to DFT2 cell lines. Our main goal is comparing genes expressed in the following pathways: Glycolysis, Pentose Phosphate Pathway, Fatty Acid Metabolism, Glutaminolysis and Oxidative Phosphorylation. A secondary goal would be to compare the expression of genes related to the production of Reactive Oxygen Species (ROS) and protection against ROS (genes involved in DNA repair, production of antioxydants etc). Finally, we might also be interested in looking into genes linked to cholesterol metabolism as this has been shown to be important for DFT1 but I need to do some additional reading to know what exactly we’re after. See sample sheet below (edited from Yin Peng), there are 3 DFT1 and 3 DFT2 cell lines, and 3 replicates of each cell line:

ss <- read.table("../ss.tsv",sep="\t",fill=TRUE,header=TRUE)
ss$DFT <- as.factor(ss$DFT)
ss$clone <- sapply(strsplit(ss$ClientID,"_"),"[[",1)

ss %>%
  kbl(caption="Sample sheet for all samples") %>%
  kable_paper("hover", full_width = F)
Sample sheet for all samples
C ClientID DFT Replicate Index1sequence Index2sequence X.ReadsIdentified.PF. TotalReads TotalBases.Gbases. clone
6180766 4906_1-1 DFT1 1 TTGTTGCA GACGTCGT 2.4465 103481165 15.5 4906
6180767 C5065_1-1 DFT1 1 CCACACTT CGTCATAC 2.8116 118924031 17.8 C5065
6180768 1426_1-1 DFT1 1 CCCGTTTG TTGCTGTA 2.3264 98401219 14.8 1426
6180769 RV_1-1 DFT2 1 ATGCTCCC CCCTGCTG 2.2811 96485136 14.5 RV
6180770 SN_1-1 DFT2 1 GCTCAATA CATTTATC 2.3947 101290147 15.2 SN
6180771 TD549_1-1 DFT2 1 GTAGTTCG CTTGATGC 2.1732 91921221 13.8 TD549
6180772 4906_2-1 DFT1 2 CGAGAACC ATGTATCG 2.2322 94416781 14.2 4906
6180773 C5065_2-1 DFT1 2 GCCATGTA ATAGGGTT 2.4505 103650355 15.5 C5065
6180774 1426_2-1 DFT1 2 TTTCTCTA CTCGACGT 2.4778 104805081 15.7 1426
6180775 RV_2-1 DFT2 2 CCAGCGAT TCTAGTCA 2.9882 126393794 19.0 RV
6180776 SN_2-1 DFT2 2 TGGGAGTG CCCGTCTA 2.4459 103455786 15.5 SN
6180777 TD549_2-1 DFT2 2 CCCTCGTA TAGAAGAG 2.5437 107592495 16.1 TD549
6180778 4906_3-1 DFT1 3 CGATATGG GGGACGTA 2.2711 96062159 14.4 4906
6180779 C5065_3-1 DFT1 3 TTGTGCCC TTTCCATC 2.2763 96282107 14.4 C5065
6180780 1426_3-1 DFT1 3 TGTCCTCT CGACGAAC 2.4952 105541059 15.8 1426
6180781 RV_3-1 DFT2 3 GTATAGTC TTGGTCTC 2.3325 98659234 14.8 RV
6180782 SN_3-1 DFT2 3 TTTGGGAT GTTCACGT 2.6750 113146174 17.0 SN
6180783 TD549_3-1 DFT2 3 CACCAAGC AAAGGGAA 2.1610 91405190 13.7 TD549
DEA5_4NEG DEA4_6NEG NA CGGAGAGG CGAGCGTC 0.0002 8460 0.0 DEA4

We are mainly interested in comparing all DFT1 samples against all DFT2 samples. However, we did notice that some cell lines within a same DFT produced different results in the metabolism experiments so we might need to compare all 6 cell lines against one another and not be able to simply do a DFT1 vs DFT2 contrast, if that makes sense.

Regarding the reference transcriptome, we will use Ensembl v109.

Functions

maplot <- function(de,contrast_name) {
  sig <-subset(de, padj < 0.05 )
  up <-rownames(subset(de, padj < 0.05 & log2FoldChange > 0))
  dn <-rownames(subset(de, padj < 0.05 & log2FoldChange < 0))
  GENESUP <- length(up)
  GENESDN <- length(dn)
  DET=nrow(de)
  SUBHEADER = paste(GENESUP, "up, ", GENESDN, "down", DET, "detected")
  ns <-subset(de, padj > 0.05 )
  plot(log2(de$baseMean),de$log2FoldChange,
       xlab="log2 basemean", ylab="log2 foldchange",
       pch=19, cex=1, col="dark gray",
       main=contrast_name, cex.main=0.7)
  points(log2(sig$baseMean),sig$log2FoldChange,
         pch=19, cex=1, col="red")
  mtext(SUBHEADER,cex = 0.7)
  grid()
}

make_volcano <- function(de,name) {
    sig <- subset(de,padj<0.05)
    N_SIG=nrow(sig)
    N_UP=nrow(subset(sig,log2FoldChange>0))
    N_DN=nrow(subset(sig,log2FoldChange<0))
    DET=nrow(de)
    HEADER=paste(N_SIG,"@5%FDR,", N_UP, "up", N_DN, "dn", DET, "detected")
    plot(de$log2FoldChange,-log10(de$padj),cex=1,pch=19,col="darkgray",
        main=name, xlab="log2 FC", ylab="-log10 pval")
    mtext(HEADER)
    grid()
    points(sig$log2FoldChange,-log10(sig$padj),cex=1,pch=19,col="red")
}

Load data

Here we load the data in from the aligner.

tmp <- read.table("../fastq/3col.tsv.gz")
x <- as.data.frame(acast(tmp, V2~V1, value.var="V3",fun.aggregate=sum))
dim(x)
## [1] 43512    19

Load gene names.

gn <- read.table("../ref/Sarcophilus_harrisii.mSarHar1.11.cdna+ncrna.genenames.tsv",fill=TRUE)

gn <- gn[order(gn$V1),]

dim(gn)
## [1] 43512     3

Load homology map

hm <- read.table("../ref/mart_export_ensembl109_2023-07-14.txt",sep="\t",header=TRUE)

Now need to collapse transcript data to genes.

x$gene <- paste(gn$V2,gn$V3)

y <- aggregate(. ~ gene,x,sum)

rownames(y) <- y$gene
y$gene = NULL

dim(y)
## [1] 23829    19

Quality control

Samples with <1M reads should be omitted. Will also round values to integers.

cs <- colSums(y)
cs <- cs[order(cs)]

par(mar=c(5,10,5,2))
barplot(cs,main="All samples",horiz=TRUE,las=1)
abline(v=1e7,col="red",lty=2)

y <- round(y)

MDS

This will help us to visualise the sources of variability in the overall dataset.

Plot MDS and then remove the negative control and run Plot MDS again.

Also fix the sample names.

Plot MDS by DFT.

par(mar = c(5.1, 4.1, 4.1, 2.1) )

plotMDS(y)

y <- y[,colnames(y) != "DEA5-4NEG"]

plotMDS(y)

ss <- ss[ss$ClientID != "DEA4_6NEG",]

colnames(y) <- sapply(strsplit(ss$ClientID,"-"),"[[",1)

cs <- colSums(y)
cs <- cs[order(cs)]

par(mar=c(5,10,5,2))
barplot(cs,main="All samples",horiz=TRUE,las=1)

par(mar = c(5.1, 4.1, 4.1, 2.1) )

plotMDS(y)

cols <- ss$DFT
cols <- gsub("DFT1","pink",cols)
cols <- gsub("DFT2","lightblue",cols)
mymds <- plotMDS(y,plot=FALSE)

# fix the xlims
XMIN=min(mymds$x)
XMAX=max(mymds$x)
XMID=(XMAX+XMIN)/2
XMIN <- XMID + (XMIN-XMID)*1.1
XMAX <- XMID+(XMAX-XMID)*1.1
plotMDS(mymds,pch=19,cex=3,col=cols,main="MDS plot",xlim=c(XMIN,XMAX))
text(mymds,labels=colnames(y))
mtext("pink=DFT1,blue=DFT2")

y_DFT1 <- y[,colnames(y) %in% c("4906_1","4906_2","4906_3",
                                "1426_1","1426_2","1426_3",
                                "C5065_1","C5065_2","C5065_3")]
y_DFT2 <- y[,colnames(y) %in% c("RV_1","RV_2","RV_3",
                                "SN_1","SN_2","SN_3",
                                "TD549_1","TD549_2","TD549_3")]

mdsDFT1 <- plotMDS(y_DFT1,plot=FALSE)
plotMDS(mdsDFT1,pch=19,cex=3,col="pink",main="MDS plot",xlim=c(XMIN,XMAX))
text(mdsDFT1,labels=colnames(y_DFT1))
mtext("DFT1")

mdsDFT2 <- plotMDS(y_DFT2,plot=FALSE)
plotMDS(mdsDFT2,pch=19,cex=3,col="lightblue",main="MDS plot",xlim=c(XMIN,XMAX))
text(mdsDFT2, labels=colnames(y_DFT2))
mtext("DFT2")

DESeq2

Run a simple differential analysis comparing DFT1 vs DFT2 ignoring the clone type as a source of variation.

# split data will be necessary for the smaller comparisons (but not this one)
y1 <- y
ss1 <- ss

y1 <- y1[which(rowMeans(y1)>10),]
dim(y1)
## [1] 16009    18
dds <- DESeqDataSetFromMatrix(countData = y1 , colData = ss1, design = ~ DFT )
## converting counts to integer mode
## factor levels were dropped which had no samples
res <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 4 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
z<- results(res)
vsd <- vst(dds)
zz<-cbind(as.data.frame(z),assay(vsd))
dge<-as.data.frame(zz[order(zz$pvalue),])
dge1 <- dge
sig <- subset(dge,padj<0.05)
sig1_up <- rownames(subset(sig,log2FoldChange>0))
sig1_dn <- rownames(subset(sig,log2FoldChange<0))
length(sig1_up)
## [1] 5871
length(sig1_dn)
## [1] 6006
maplot(dge1,"DFT1 vs DFT2")

make_volcano(dge1,"DFT1 vs DFT2")

sig[1:50,1:6] %>%
  kbl(caption="Comparison of DFT1 vs DFT2") %>%
  kable_paper("hover", full_width = F)
Comparison of DFT1 vs DFT2
baseMean log2FoldChange lfcSE stat pvalue padj
ENSSHAG00000000893.2 NA 761.6994 -4.745935 0.1042514 -45.52393 0 0
ENSSHAG00000001230.2 NA 9964.7623 -7.268224 0.1727670 -42.06952 0 0
ENSSHAG00000001308.2 NA 1591.3721 -4.481411 0.1093947 -40.96554 0 0
ENSSHAG00000001552.2 TDRD5 1386.5142 -5.550630 0.0945061 -58.73301 0 0
ENSSHAG00000001728.2 ABRAXAS2 2447.7801 -2.611974 0.0580287 -45.01174 0 0
ENSSHAG00000001754.2 NA 573.3750 -4.129922 0.1018094 -40.56522 0 0
ENSSHAG00000002134.2 NA 31866.6922 2.222360 0.0386947 57.43325 0 0
ENSSHAG00000002808.2 MICAL1 9230.0290 -2.032272 0.0528958 -38.42032 0 0
ENSSHAG00000003434.2 NA 5640.3455 -9.630769 0.2085777 -46.17354 0 0
ENSSHAG00000003458.2 JMJD1C 2343.7511 -2.896812 0.0747449 -38.75599 0 0
ENSSHAG00000003737.2 NA 9677.3586 11.783418 0.2132465 55.25727 0 0
ENSSHAG00000004947.2 THBS2 24172.5467 13.496286 0.3231374 41.76640 0 0
ENSSHAG00000005182.2 SERPINE1 7064.0510 11.869506 0.2994322 39.64005 0 0
ENSSHAG00000005550.2 MEIOC 1485.7532 -7.289256 0.1694244 -43.02364 0 0
ENSSHAG00000007070.2 SMC1B 3980.3698 -10.063501 0.1851324 -54.35841 0 0
ENSSHAG00000007582.2 NA 2902.0019 -7.095020 0.0846261 -83.83964 0 0
ENSSHAG00000007784.2 NA 5517.6882 -7.314052 0.1015625 -72.01526 0 0
ENSSHAG00000007965.2 NA 5215.8477 -1.747971 0.0370471 -47.18243 0 0
ENSSHAG00000008051.2 TEX14 1537.4858 -3.531985 0.0617160 -57.22965 0 0
ENSSHAG00000009584.2 PLEKHG1 1861.7962 9.379637 0.2481401 37.79977 0 0
ENSSHAG00000010367.2 ECM1 12497.0681 9.411970 0.1164250 80.84150 0 0
ENSSHAG00000010645.2 COL16A1 6094.7776 6.056254 0.0950462 63.71906 0 0
ENSSHAG00000010824.2 MPDZ 4232.6012 -5.438458 0.0941097 -57.78850 0 0
ENSSHAG00000010902.2 SMARCD3 2489.7010 -2.649718 0.0440929 -60.09400 0 0
ENSSHAG00000011135.2 NA 5620.2000 4.064532 0.0895867 45.36980 0 0
ENSSHAG00000011480.2 ADGRB2 1933.7632 -7.544502 0.1814507 -41.57880 0 0
ENSSHAG00000011750.2 NA 3080.7121 3.088431 0.0735615 41.98432 0 0
ENSSHAG00000011809.2 NA 5508.1945 3.140454 0.0593806 52.88691 0 0
ENSSHAG00000012182.2 NA 1038.5611 7.742252 0.2036373 38.01982 0 0
ENSSHAG00000013722.2 NA 524.0949 -4.688777 0.0900165 -52.08798 0 0
ENSSHAG00000014390.2 IFRD1 2454.6507 -1.386811 0.0345922 -40.09028 0 0
ENSSHAG00000014430.2 NA 1157.2248 -1.742533 0.0399895 -43.57472 0 0
ENSSHAG00000015236.2 FKBP6 598.2945 -5.562106 0.1042482 -53.35446 0 0
ENSSHAG00000016212.2 SLC25A14 887.2862 -6.791421 0.1667459 -40.72915 0 0
ENSSHAG00000016252.2 LOXL2 8715.4741 8.867010 0.1933921 45.84991 0 0
ENSSHAG00000016335.2 NA 21006.6606 2.668634 0.0567139 47.05436 0 0
ENSSHAG00000017128.2 F10 515.2000 6.578090 0.1519688 43.28578 0 0
ENSSHAG00000017996.2 TNFRSF21 1876.2975 -7.424893 0.1564041 -47.47248 0 0
ENSSHAG00000018060.2 HR 3216.8474 -7.036189 0.1791811 -39.26859 0 0
ENSSHAG00000018469.2 PDE10A 1279.8709 -5.655401 0.1229592 -45.99414 0 0
ENSSHAG00000020521.1 CDK3 2068.9726 4.241310 0.0605836 70.00762 0 0
ENSSHAG00000021318.1 NA 2418.0580 -2.504689 0.0455484 -54.98958 0 0
ENSSHAG00000021464.1 NA 3962.2831 -8.392331 0.1785238 -47.00960 0 0
ENSSHAG00000022172.1 HAPLN2 3697.2723 -7.391660 0.1427380 -51.78480 0 0
ENSSHAG00000022666.1 NA 550.9103 6.404774 0.1677749 38.17480 0 0
ENSSHAG00000022966.1 SOX8 1304.3613 -4.733014 0.1196103 -39.57027 0 0
ENSSHAG00000023142.1 NA 20998.8159 -2.483109 0.0319815 -77.64204 0 0
ENSSHAG00000023249.1 NA 974.9689 5.489140 0.1204114 45.58656 0 0
ENSSHAG00000023680.1 EGR3 7293.3439 -4.014795 0.1044284 -38.44541 0 0
ENSSHAG00000024021.1 NA 1284.6234 5.713744 0.1294418 44.14140 0 0
write.table(dge,file="dge1.tsv",sep="\t")

# heatmap
mx <- sig[,7:ncol(sig)]
mx <- head(mx,30)
colfunc <- colorRampPalette(c("blue", "white", "red"))
heatmap.2(as.matrix(mx),trace="none",scale="row",
  col=colfunc(25),ColSideColors=cols,mar=c(5,14),
  cexRow=0.8,cexCol=0.7)

Combine the three replicates

Sum replicates.

x4906 <- rowSums(y[,ss$clone=="4906"])
xC5065 <- rowSums(y[,ss$clone=="C5065"])
x1426 <- rowSums(y[,ss$clone=="1426"])
xRV <- rowSums(y[,ss$clone=="RV"])
xSN <- rowSums(y[,ss$clone=="SN"])
xTD549 <- rowSums(y[,ss$clone=="TD549"])

y2 <- data.frame(x4906,xC5065,x1426,xRV,xSN,xTD549)
ss2 <- as.data.frame(colnames(y2))
colnames(ss2) <- "clone"
ss2$DFT <- factor(c("DFT1","DFT1","DFT1","DFT2","DFT2","DFT2"))

y2 <- y2[which(rowMeans(y2)>10),]
dim(y2)
## [1] 17468     6
dds <- DESeqDataSetFromMatrix(countData = y2 , colData = ss2, design = ~ DFT )
## converting counts to integer mode
dds2 <- dds
res <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
z<- results(res)
vsd <- vst(dds)
zz<-cbind(as.data.frame(z),assay(vsd))
dge<-as.data.frame(zz[order(zz$pvalue),])
dge2 <- dge
sig <- subset(dge,padj<0.05)
sig2_up <- rownames(subset(sig,log2FoldChange>0))
sig2_dn <- rownames(subset(sig,log2FoldChange<0))
length(sig2_up)
## [1] 4200
length(sig2_dn)
## [1] 4412
maplot(dge2,"DFT1 vs DFT2 aggregated")

make_volcano(dge2,"DFT1 vs DFT2 aggregated")

sig[1:50,1:6] %>%
  kbl(caption="Comparison of DFT1 vs DFT2") %>%
  kable_paper("hover", full_width = F)
Comparison of DFT1 vs DFT2
baseMean log2FoldChange lfcSE stat pvalue padj
ENSSHAG00000007582.2 NA 8720.2072 -7.090091 0.1734257 -40.88258 0 0
ENSSHAG00000024692.1 NA 11176.1973 -8.580770 0.2275391 -37.71118 0 0
ENSSHAG00000007784.2 NA 16519.5514 -7.300242 0.1954949 -37.34236 0 0
ENSSHAG00000010367.2 ECM1 37884.8561 9.429385 0.2567216 36.73000 0 0
ENSSHAG00000026705.1 LOXL3 35605.2404 3.028419 0.0834311 36.29846 0 0
ENSSHAG00000003737.2 NA 29315.0122 11.793401 0.3388989 34.79917 0 0
ENSSHAG00000032086.1 NA 4270.9538 -9.087633 0.2656490 -34.20918 0 0
ENSSHAG00000020521.1 CDK3 6254.9578 4.247335 0.1329351 31.95045 0 0
ENSSHAG00000010645.2 COL16A1 18470.4887 6.069424 0.1909073 31.79252 0 0
ENSSHAG00000015236.2 FKBP6 1799.3194 -5.552135 0.1749135 -31.74217 0 0
ENSSHAG00000031815.1 RAB33A 4153.4227 -8.057672 0.2543513 -31.67930 0 0
ENSSHAG00000031607.1 NA 2388.4986 6.299003 0.2049613 30.73264 0 0
ENSSHAG00000003434.2 NA 16944.8869 -9.624643 0.3183782 -30.23022 0 0
ENSSHAG00000013199.2 PDGFRA 101648.6230 15.673386 0.5200441 30.13857 0 0
ENSSHAG00000024117.1 NA 6007.3116 7.935832 0.2646438 29.98684 0 0
ENSSHAG00000031627.1 PRX 228602.7417 -10.824218 0.3631920 -29.80302 0 0
ENSSHAG00000007782.2 CA8 2062.4938 8.726159 0.2990458 29.18000 0 0
ENSSHAG00000018310.2 SLIT2 44695.3018 -14.414172 0.4983908 -28.92142 0 0
ENSSHAG00000021464.1 NA 11908.1052 -8.384005 0.2916414 -28.74765 0 0
ENSSHAG00000010824.2 MPDZ 12727.1471 -5.429957 0.1920191 -28.27821 0 0
ENSSHAG00000000643.2 STK31 4182.6920 -9.537041 0.3373093 -28.27387 0 0
ENSSHAG00000023142.1 NA 63163.2147 -2.474963 0.0903962 -27.37905 0 0
ENSSHAG00000025566.1 ABLIM1 7772.7034 5.814237 0.2129981 27.29713 0 0
ENSSHAG00000021789.1 NA 1860.1298 8.577176 0.3143469 27.28571 0 0
ENSSHAG00000024021.1 NA 3895.6967 5.733097 0.2107928 27.19778 0 0
ENSSHAG00000030292.1 NA 1933.0281 -5.410220 0.1994706 -27.12289 0 0
ENSSHAG00000031074.1 NEXMIF 4117.0924 -8.404272 0.3123135 -26.90973 0 0
ENSSHAG00000008051.2 TEX14 4625.9527 -3.525105 0.1326009 -26.58431 0 0
ENSSHAG00000004947.2 THBS2 73193.5959 13.509919 0.5132476 26.32242 0 0
ENSSHAG00000022172.1 HAPLN2 11123.1806 -7.382997 0.2822149 -26.16090 0 0
ENSSHAG00000026637.1 NA 2213.7312 -5.071669 0.1942619 -26.10738 0 0
ENSSHAG00000005380.2 NA 3108.7669 10.228790 0.3940820 25.95599 0 0
ENSSHAG00000018469.2 PDE10A 3854.3231 -5.647349 0.2179279 -25.91384 0 0
ENSSHAG00000011809.2 NA 16636.2524 3.149445 0.1232734 25.54845 0 0
ENSSHAG00000029638.1 NA 18764.0415 -6.179495 0.2425972 -25.47224 0 0
ENSSHAG00000016252.2 LOXL2 26417.8025 8.878268 0.3521803 25.20944 0 0
ENSSHAG00000001552.2 TDRD5 4168.0657 -5.549233 0.2251434 -24.64755 0 0
ENSSHAG00000006728.2 NEFM 7555.0072 -10.855938 0.4479166 -24.23652 0 0
ENSSHAG00000013722.2 NA 1575.4562 -4.678030 0.1936782 -24.15363 0 0
ENSSHAG00000014886.2 CYP2R1 2488.5667 -7.465866 0.3091695 -24.14813 0 0
ENSSHAG00000031108.1 NA 964.2578 -7.703956 0.3207488 -24.01866 0 0
ENSSHAG00000005492.2 NA 1659.5069 -8.999424 0.3748897 -24.00553 0 0
ENSSHAG00000027513.1 NA 1802.2901 -5.271118 0.2200576 -23.95335 0 0
ENSSHAG00000000893.2 NA 2287.3973 -4.736996 0.1987138 -23.83828 0 0
ENSSHAG00000005182.2 SERPINE1 21367.7733 11.866757 0.5041184 23.53962 0 0
ENSSHAG00000010102.2 UNC5A 1472.0133 -8.985720 0.3860205 -23.27783 0 0
ENSSHAG00000031142.1 TPPP 2664.1676 -5.954920 0.2565656 -23.21012 0 0
ENSSHAG00000022666.1 NA 1669.6189 6.414065 0.2781309 23.06132 0 0
ENSSHAG00000010515.2 NA 2972.5658 9.167129 0.3999684 22.91964 0 0
ENSSHAG00000010902.2 SMARCD3 7489.5510 -2.642803 0.1163671 -22.71092 0 0
write.table(dge2,file="dge2.tsv",sep="\t")

# heatmap
mx <- sig[,7:ncol(sig)]
mx <- head(mx,30)
colfunc <- colorRampPalette(c("blue", "white", "red"))
heatmap.2(as.matrix(mx),trace="none",scale="row",
  col=colfunc(25),ColSideColors=cols[1:6],mar=c(8,14),
  cexRow=0.7,cexCol=1)

Compare cell lines against one another

First 4906 vs RV, 4906 vs SN and 4906 vs TD549. Then 1426 vs RV, 1426 vs SN and 1426 vs TD549. Finally C5065 vs RV, C5065 vs SN and C5065 vs TD549.

yx <- vector(mode = "list", length = 9)
yx[[1]] <- select(y1, "4906_1", "4906_2", "4906_3", "RV_1", "RV_2", "RV_3")
yx[[2]] <- select(y1, "4906_1", "4906_2", "4906_3", "SN_1", "SN_2", "SN_3")
yx[[3]] <- select(y1, "4906_1", "4906_2", "4906_3", "TD549_1", "TD549_2", "TD549_3")
yx[[4]] <- select(y1, "1426_1", "1426_2", "1426_3", "RV_1", "RV_2", "RV_3")
yx[[5]] <- select(y1, "1426_1", "1426_2", "1426_3", "SN_1", "SN_2", "SN_3")
yx[[6]] <- select(y1, "1426_1", "1426_2", "1426_3", "TD549_1", "TD549_2", "TD549_3")
yx[[7]] <- select(y1, "C5065_1", "C5065_2", "C5065_3", "RV_1", "RV_2", "RV_3")
yx[[8]] <- select(y1, "C5065_1", "C5065_2", "C5065_3", "SN_1", "SN_2", "SN_3")
yx[[9]] <- select(y1, "C5065_1", "C5065_2", "C5065_3", "TD549_1", "TD549_2", "TD549_3")

ssx <- vector(mode = "list", length = 9)
dds <- vector(mode = "list", length = 9)
res <- vector(mode = "list", length = 9)
z <- vector(mode = "list", length = 9)
vsd <- vector(mode = "list", length = 9)
zz <- vector(mode = "list", length = 9)
dgex <- vector(mode = "list", length = 9)
sigx <- vector(mode = "list", length = 9)

for(i in 1:9){
  print(paste("Processing comparison: ", i, " on 9"))
  ssx[[i]] <- as.data.frame(colnames(yx[[i]]))
  colnames(ssx[[i]]) <- "clone"
  ssx[[i]]$DFT <- factor(c("DFT1","DFT1","DFT1","DFT2","DFT2","DFT2"))
  dds[[i]] <- DESeqDataSetFromMatrix(countData = yx[[i]], colData = ssx[[i]],
                                     design = ~ DFT)
  res[[i]] <- DESeq(dds[[i]], quiet=TRUE)
  z[[i]] <- results(res[[i]])
  vsd[[i]] <- vst(dds[[i]])
  zz[[i]] <- cbind(as.data.frame(z[[i]]), assay(vsd[[i]]))
  dgex[[i]] <- as.data.frame(zz[[i]][order(zz[[i]]$pvalue),])
  sigx[[i]] <- subset(dgex[[i]], padj < 0.05)
  }
## [1] "Processing comparison:  1  on 9"
## converting counts to integer mode
## [1] "Processing comparison:  2  on 9"
## converting counts to integer mode
## [1] "Processing comparison:  3  on 9"
## converting counts to integer mode
## [1] "Processing comparison:  4  on 9"
## converting counts to integer mode
## [1] "Processing comparison:  5  on 9"
## converting counts to integer mode
## [1] "Processing comparison:  6  on 9"
## converting counts to integer mode
## [1] "Processing comparison:  7  on 9"
## converting counts to integer mode
## [1] "Processing comparison:  8  on 9"
## converting counts to integer mode
## [1] "Processing comparison:  9  on 9"
## converting counts to integer mode
plotnames <- c("4906 vs RV", "4906 vs SN", "4906 vs TD549",
               "1426 vs RV", "1426 vs SN", "1426 vs TD549",
               "C5065 vs RV", "C5065 vs SN", "C5065 vs TD549")

for(i in 1:9){
  maplot(dgex[[i]],plotnames[i])
  make_volcano(dgex[[i]],plotnames[i])
  
  sigx[[i]][1:50,1:6] %>%
  kbl(caption=plotnames[i]) %>%
  kable_paper("hover", full_width = F)
  
  write.table(dgex[[i]],file=paste(plotnames[i],".tsv"),sep="\t")
  
  mx <- sigx[[i]][,7:ncol(sigx[[i]])]
  mx <- head(mx,30)
  colfunc <- colorRampPalette(c("blue", "white", "red"))
  heatmap.2(as.matrix(mx),trace="none",scale="row",
  col=colfunc(25),ColSideColors=cols[1:6],mar=c(8,14),
  cexRow=0.7,cexCol=1)
}

Enrichment analysis

Need to get human homologs of these genes. These were obtained from Ensembl v109 biomart (website).

rownames(dge1) <- sapply(strsplit(rownames(dge1),"\\."),"[[",1)
rownames(dge2) <- sapply(strsplit(rownames(dge2),"\\."),"[[",1)

for(i in 1:9){
  rownames(dgex[[i]]) <- sapply(strsplit(rownames(dgex[[i]]),"\\."),"[[",1)
}


hm2 <- hm[hm$Tasmanian.devil.gene.stable.ID != "",]

gt <- hm2[,2:3]
length(unique(gt$Tasmanian.devil.gene.stable.ID))
## [1] 16979
length(unique(gt$Gene.name))
## [1] 16646
for(i in 1:length(gt$Gene.name)){
  if(gt$Gene.name[i]==""){gt$Gene.name[i] <- gt$Tasmanian.devil.gene.stable.ID[i]}
}

Now run mitch for DGE2. There will be a report generated that has more details on the enrichment analysis. The pathways are sourced from Reactome 7th July 2023.

genesets <- gmt_import("../ref/ReactomePathways_2023-07-14.gmt")

m2 <- mitch_import(dge2, DEtype="deseq2", geneTable=gt)
## The input is a single dataframe; one contrast only. Converting
##         it to a list for you.
## Note: Mean no. genes in input = 17468
## Note: no. genes in output = 13563
## Note: estimated proportion of input genes in output = 0.776
head(m2)
##                    x
## A2M     -13.13626596
## A4GALT   -0.07278225
## AAAS     -4.33508400
## AACS      2.13961007
## AADAC    -2.90578338
## AADACL3   1.15402966
res2 <- mitch_calc(m2, genesets, priority="effect",cores=16)
## Note: Enrichments with large effect sizes may not be
##             statistically significant.
if ( !file.exists("mitch2.html") ) {
  mitch_report(res2, "mitch2.html")
}

top <- subset(res2$enrichment_result,p.adjustANOVA<0.05)

topup <- head(subset(top,s.dist>0),20)

topup %>%
  kbl(caption="Top 20 upregulated pathways") %>%
  kable_paper("hover", full_width = F)
Top 20 upregulated pathways
set setSize pANOVA s.dist p.adjustANOVA
687 Metallothioneins bind metals 10 0.0000122 0.7987899 0.0013275
1075 Response to metal ions 13 0.0000082 0.7143457 0.0009848
229 Crosslinking of collagen fibrils 16 0.0001180 0.5559533 0.0070965
42 Activation of Matrix Metalloproteinases 22 0.0000129 0.5372404 0.0013275
207 Collagen degradation 47 0.0000000 0.5181408 0.0000004
157 Caspase activation via Death Receptors in the presence of ligand 12 0.0019500 0.5164748 0.0460725
456 GRB2:SOS provides linkage to MAPK signaling for Integrins 12 0.0019840 0.5156200 0.0460725
206 Collagen chain trimerization 30 0.0000013 0.5100175 0.0001749
185 Cholesterol biosynthesis 24 0.0000356 0.4875237 0.0027071
205 Collagen biosynthesis and modifying enzymes 50 0.0000000 0.4811012 0.0000010
171 Cell-extracellular matrix interactions 18 0.0004405 0.4785202 0.0205059
572 Integrin cell surface interactions 63 0.0000000 0.4337449 0.0000008
337 ECM proteoglycans 61 0.0000000 0.4178543 0.0000030
1147 Signal transduction by L1 21 0.0016068 0.3977221 0.0435412
208 Collagen formation 68 0.0000001 0.3790227 0.0000095
573 Integrin signaling 23 0.0016662 0.3787875 0.0435412
1247 Syndecan interactions 24 0.0014009 0.3767696 0.0435412
93 Assembly of collagen fibrils and other multimeric structures 47 0.0000244 0.3559658 0.0021982
183 Chemokine receptors bind chemokines 27 0.0015493 0.3520160 0.0435412
287 Degradation of the extracellular matrix 107 0.0000000 0.3432544 0.0000004
topdn <- head(subset(top,s.dist<0),20)

topdn %>%
  kbl(caption="Top 20 downregulated pathways") %>%
  kable_paper("hover", full_width = F)
Top 20 downregulated pathways
set setSize pANOVA s.dist p.adjustANOVA
286 Degradation of cysteine and homocysteine 11 0.0003114 -0.6278174 0.0160496
1243 Sulfur amino acid metabolism 23 0.0016897 -0.3782930 0.0435412
1359 Transmission across Chemical Synapses 195 0.0009484 -0.1376402 0.0333775
top2 <- rbind(head(topup,10),head(topdn,10))
top2 <- top2[order(-top2$s.dist),]
par(mar=c(5,15,5,2))

barplot(rev(abs(top2$s.dist)),horiz=TRUE,las=1,names.arg=rev(top2$set),
  main="Top REACTOME Pathways",cex.names=0.6,xlab="Enrichment Score",
  col=c(rep("blue",nrow(head(topdn,10))),rep("red",nrow(head(topup,10)))),
  xlim=c(0,0.8))

Run mitch with curated metabolic pathways.

KEGG gene sets under the “Metabolism” hierarchy were sourced from MSigDb in August 2023.

genesets2 <- gmt_import("../ref/gsea/gsea_KEGG.gmt")

res2 <- mitch_calc(m2, genesets2, priority="effect",cores=16)
## Note: Enrichments with large effect sizes may not be
##             statistically significant.
if ( !file.exists("mitch2.html") ) {
  mitch_report(res2, "mitch2.html")
}

top <- subset(res2$enrichment_result,p.adjustANOVA<0.05)

topup <- head(subset(top,s.dist>0),20)

topup %>%
  kbl(caption="Top 20 upregulated pathways") %>%
  kable_paper("hover", full_width = F)
Top 20 upregulated pathways
set setSize pANOVA s.dist p.adjustANOVA
51 KEGG_STEROID_BIOSYNTHESIS 14 0.0004408 0.542486 0.0251253
topdn <- head(subset(top,s.dist<0),20)

topdn %>%
  kbl(caption="Top 20 downregulated pathways") %>%
  kable_paper("hover", full_width = F)
Top 20 downregulated pathways
set setSize pANOVA s.dist p.adjustANOVA
top2 <- rbind(head(topup,10),head(topdn,10))
top2 <- top2[order(-top2$s.dist),]
par(mar=c(5,15,5,2))

barplot(rev(abs(top2$s.dist)),horiz=TRUE,las=1,names.arg=rev(top2$set),
  main="Top KEGG Pathways",cex.names=0.6,xlab="Enrichment Score",
  col=c(rep("blue",nrow(head(topdn,10))),rep("red",nrow(head(topup,10)))),
  xlim=c(0,0.8))

Same with Reactome this time. Reactome gene sets of the metabolism hierarchy were sourced from the Reactome website in August 2023.

hierarchy <- read.table("../ref/gsea/ReactomePathwaysRelation.txt", sep ="\t", header = F, dec =".")
pathways <- read.delim("../ref/gsea/ReactomePathways.txt", header=FALSE)

hierarchy <- hierarchy[grep("HSA", hierarchy$V1), ] # keep only homo sapiens data
pathways <- pathways[grep("HSA", pathways$V1), 1:2] 

metabo_pathways <- "R-HSA-1430728" # reactome code for metabolism 

# this is very bad coding - but it works
# why 325? that is when while loop crashes because all branches of the hierarchy have been looped over
# will have to change this if Reactome data is updated
while(length(metabo_pathways) < 325){
  for(i in 1:length(metabo_pathways)){
    metabo_pathways <- unique(c(metabo_pathways,filter(hierarchy, V1 == metabo_pathways[i])$V2))
    }
  }

mpathways <- c()
for(i in 1:length(metabo_pathways)){mpathways <- c(mpathways,filter(pathways, V1 == metabo_pathways[i])$V2)}

library(magrittr)
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:tidyr':
## 
##     extract
## The following object is masked from 'package:GenomicRanges':
## 
##     subtract
genesets3 <- extract(genesets, mpathways)
# remove empty sets (ie hierarchy exists but no gene set is associated to it)
genesets3 <- genesets3[lapply(genesets3,length)>0]
m3 <- mitch_import(dge2, DEtype="deseq2", geneTable=gt)
## The input is a single dataframe; one contrast only. Converting
##         it to a list for you.
## Note: Mean no. genes in input = 17468
## Note: no. genes in output = 13563
## Note: estimated proportion of input genes in output = 0.776
res3 <- mitch_calc(m3, genesets3, priority="effect", cores=16)
## Note: Enrichments with large effect sizes may not be
##             statistically significant.
if ( !file.exists("mitch3.html") ) {
  mitch_report(res3, "mitch3.html")
}

top <- subset(res3$enrichment_result,p.adjustANOVA<0.05)

topup <- head(subset(top,s.dist>0),20)

topup %>%
  kbl(caption="Top 20 upregulated metabolic pathways") %>%
  kable_paper("hover", full_width = F)
Top 20 upregulated metabolic pathways
set setSize pANOVA s.dist p.adjustANOVA
77 Cholesterol biosynthesis 24 3.56e-05 0.4875237 0.0048833
topdn <- head(subset(top,s.dist<0),20)

topdn %>%
  kbl(caption="Top 20 downregulated metabolic pathways") %>%
  kable_paper("hover", full_width = F)
Top 20 downregulated metabolic pathways
set setSize pANOVA s.dist p.adjustANOVA
88 Degradation of cysteine and homocysteine 11 0.0003114 -0.6278174 0.0213327
top2 <- rbind(head(topup,10),head(topdn,10))
top2 <- top2[order(-top2$s.dist),]
par(mar=c(5,15,5,2))

barplot(rev(abs(top2$s.dist)),horiz=TRUE,las=1,names.arg=rev(top2$set),
  main="Top REACTOME Metabolic Pathways",cex.names=0.6,xlab="Enrichment Score",
  col=c(rep("blue",nrow(head(topdn,10))),rep("red",nrow(head(topup,10)))),
  xlim=c(0,0.8))

# see https://www.youtube.com/watch?v=ht1r34-ifVI for reference

# find corresponding human genes to devil ENS IDs
dge3 <- dge2 
dge3$ENSSHAG <- rownames(dge3)
hm3 <- data.frame(ENSSHAG=hm2[,2],geneID=hm2[,3])
dge3 <- left_join(dge3, hm3, by="ENSSHAG")
dge3 <- unite(dge3,"ENSSHAG_geneID", ENSSHAG:geneID, sep=" ", remove=F)
dge3 <- unique(dge3)
rownames(dge3) <- dge3$ENSSHAG_geneID

# only plot significant DEGs
dge3sig <- filter(dge3, padj < 0.05)
dge3sig <- dge3sig[order(dge3sig$log2FoldChange, decreasing=TRUE),]

mx <- dge3sig[,7:ncol(dge3sig)] # get normalised counts
 
mx.scaled <- t(apply(mx[,1:6], 1, scale)) # center and scale each column (Z-score)
colnames(mx.scaled) <- colnames(mx[,1:6])
log2FC <- as.matrix(dge3sig$log2FoldChange)
rownames(log2FC) <- rownames(dge3sig)
colnames(log2FC) <- "logFC"
mean <- as.matrix(dge3sig$baseMean)
rownames(mean) <- rownames(dge3sig)
colnames(mean) <- "AveExpr"

col_logFC <- colorRamp2(c(min(log2FC),0,max(log2FC)), c("blue","white","red"))
col_AveExpr <- colorRamp2(c(quantile(mean)[1], quantile(mean)[4]), c("white","red"))

ha <- HeatmapAnnotation(summary=anno_summary(gp=gpar(fill=2),height=unit(2, "cm")))

# ----- Top DEGs in all KEGG metabolic pathways ------

metabo <- c()
for(i in 1:length(genesets2)){metabo <- c(metabo,genesets2[[i]])}
metabo <- unique(metabo)

temp <- c()
for(i in 1:length(metabo)){temp <- rbind(temp,filter(hm3, geneID==metabo[i]))}
metabolism <- c(t(unite(temp,"ENSSHAG_geneID", ENSSHAG:geneID, sep=" ")))

met <- subset(mx.scaled, rownames(mx.scaled) %in% metabolism)
metFC <- subset(log2FC, rownames(log2FC) %in% metabolism)
metAE <- subset(mean, rownames(mean) %in% metabolism)

# top 20 up & down
met40 <- rbind(met[1:20,],met[542:561,])
met40FC <- as.matrix(c(metFC[1:20],metFC[542:561,]))
rownames(met40FC) <- rownames(met40)
colnames(met40FC) <- "logFC"
met40AE <- as.matrix(c(metAE[1:20],metAE[542:561,]))
rownames(met40AE) <- rownames(met40)
colnames(met40AE) <- "AveExpr"

h1 <- Heatmap(met40, cluster_rows = F,
              column_labels = colnames(met40), name="Z-score",
              cluster_columns = T)
h2 <- Heatmap(met40FC, row_labels = rownames(met40FC), row_names_gp = gpar(fontsize = 6),
              cluster_rows = F, name="logFC", top_annotation = ha, col = col_logFC,
              cell_fun = function(j, i, x, y, w, h, col) { # add text to each grid
                grid.text(round(as.numeric(met40FC[i, j],2)), x, y,gp=gpar(fontsize=6))
              })
h3 <- Heatmap(met40AE, row_labels = rownames(met40AE), row_names_gp = gpar(fontsize = 6),
              cluster_rows = F, name = "AveExpr", col=col_AveExpr,
              cell_fun = function(j, i, x, y, w, h, col) { # add text to each grid
                grid.text(round(as.numeric(met40AE[i, j],2)), x, y,gp=gpar(fontsize=6))},
              column_title = "TOP 40 UP & DOWN - KEGG METABOLISM")

h <- h1 + h2 + h3
h

# ----- Top DEGs in all Reactome metabolic pathways ------

metabo <- c()
for(i in 1:length(genesets3)){metabo <- c(metabo,genesets3[[i]])}
metabo <- unique(metabo)

temp <- c()
for(i in 1:length(metabo)){temp <- rbind(temp,filter(hm3, geneID==metabo[i]))}
metabolism <- c(t(unite(temp,"ENSSHAG_geneID", ENSSHAG:geneID, sep=" ")))

met <- subset(mx.scaled, rownames(mx.scaled) %in% metabolism)
metFC <- subset(log2FC, rownames(log2FC) %in% metabolism)
metAE <- subset(mean, rownames(mean) %in% metabolism)

# top 20 up & down
met40 <- rbind(met[1:20,],met[784:803,])
met40FC <- as.matrix(c(metFC[1:20],metFC[784:803,]))
rownames(met40FC) <- rownames(met40)
colnames(met40FC) <- "logFC"
met40AE <- as.matrix(c(metAE[1:20],metAE[784:803,]))
rownames(met40AE) <- rownames(met40)
colnames(met40AE) <- "AveExpr"

h1 <- Heatmap(met40, cluster_rows = F,
              column_labels = colnames(met40), name="Z-score",
              cluster_columns = T)
h2 <- Heatmap(met40FC, row_labels = rownames(met40FC), row_names_gp = gpar(fontsize = 6),
              cluster_rows = F, name="logFC", top_annotation = ha, col = col_logFC,
              cell_fun = function(j, i, x, y, w, h, col) { # add text to each grid
                grid.text(round(as.numeric(met40FC[i, j],2)), x, y,gp=gpar(fontsize=6))
              })
h3 <- Heatmap(met40AE, row_labels = rownames(met40AE), row_names_gp = gpar(fontsize = 6),
              cluster_rows = F, name = "AveExpr", col=col_AveExpr,
              cell_fun = function(j, i, x, y, w, h, col) { # add text to each grid
                grid.text(round(as.numeric(met40AE[i, j],2)), x, y,gp=gpar(fontsize=6))},
              column_title = "TOP 40 UP & DOWN - REACTOME METABOLISM")

h <- h1 + h2 + h3
h

# ----- KEGG steroid biosynthesis -----

kegg <- genesets2[["KEGG_STEROID_BIOSYNTHESIS"]] 
temp <- c()
for(i in 1:length(kegg)){temp <- rbind(temp,filter(hm3, geneID==kegg[i]))}
glycolysis <- c(t(unite(temp,"ENSSHAG_geneID", ENSSHAG:geneID, sep=" ")))

glyc <- subset(mx.scaled, rownames(mx.scaled) %in% glycolysis)
glycFC <- subset(log2FC, rownames(log2FC) %in% glycolysis)
glycAE <- subset(mean, rownames(mean) %in% glycolysis)

h1 <- Heatmap(glyc, cluster_rows = F,
              column_labels = colnames(glyc), name="Z-score",
              cluster_columns = T)
h2 <- Heatmap(glycFC, row_labels = rownames(glycFC), row_names_gp = gpar(fontsize = 6),
            cluster_rows = F, name="logFC", top_annotation = ha, col = col_logFC,
            cell_fun = function(j, i, x, y, w, h, col) { # add text to each grid
              grid.text(round(as.numeric(glycFC[i, j],2)), x, y,gp=gpar(fontsize=6))
            })
h3 <- Heatmap(glycAE, row_labels = rownames(glycAE), row_names_gp = gpar(fontsize = 6),
            cluster_rows = F, name = "AveExpr", col=col_AveExpr,
            cell_fun = function(j, i, x, y, w, h, col) { # add text to each grid
              grid.text(round(as.numeric(glycAE[i, j],2)), x, y,gp=gpar(fontsize=6))},
            column_title = "KEGG Steroid biosynthesis")

h <- h1 + h2 + h3
h

# ----- Reactome Cholesterol biosynthesis -----
cholesterol <- genesets3[["Cholesterol biosynthesis"]]
temp <- c()
for(i in 1:length(cholesterol)){temp <- rbind(temp,filter(hm3, geneID==cholesterol[i]))}
cholesterol <- c(t(unite(temp,"ENSSHAG_geneID", ENSSHAG:geneID, sep=" ")))

chol <- subset(mx.scaled, rownames(mx.scaled) %in% cholesterol)
cholFC <- subset(log2FC, rownames(log2FC) %in% cholesterol)
cholAE <- subset(mean, rownames(mean) %in% cholesterol)

h1 <- Heatmap(chol, cluster_rows = F,
              column_labels = colnames(chol), name="Z-score",
              cluster_columns = T)
h2 <- Heatmap(cholFC, row_labels = rownames(cholFC), row_names_gp = gpar(fontsize = 6),
            cluster_rows = F, name="logFC", top_annotation = ha, col = col_logFC,
            cell_fun = function(j, i, x, y, w, h, col) { # add text to each grid
              grid.text(round(as.numeric(cholFC[i, j],2)), x, y,gp=gpar(fontsize=6))
            })
h3 <- Heatmap(cholAE, row_labels = rownames(cholAE), row_names_gp = gpar(fontsize = 6),
            cluster_rows = F, name = "AveExpr", col=col_AveExpr,
            cell_fun = function(j, i, x, y, w, h, col) { # add text to each grid
              grid.text(round(as.numeric(cholAE[i, j],2)), x, y,gp=gpar(fontsize=6))},
            column_title = "Reactome Cholesterol biosynthesis")

h <- h1 + h2 + h3
h

# ----- Reactome Degradation of cysteine and homocysteine -----
cholesterol <- genesets3[["Degradation of cysteine and homocysteine"]]
temp <- c()
for(i in 1:length(cholesterol)){temp <- rbind(temp,filter(hm3, geneID==cholesterol[i]))}
cholesterol <- c(t(unite(temp,"ENSSHAG_geneID", ENSSHAG:geneID, sep=" ")))

chol <- subset(mx.scaled, rownames(mx.scaled) %in% cholesterol)
cholFC <- subset(log2FC, rownames(log2FC) %in% cholesterol)
cholAE <- subset(mean, rownames(mean) %in% cholesterol)

h1 <- Heatmap(chol, cluster_rows = F,
              column_labels = colnames(chol), name="Z-score",
              cluster_columns = T)
h2 <- Heatmap(cholFC, row_labels = rownames(cholFC), row_names_gp = gpar(fontsize = 6),
            cluster_rows = F, name="logFC", top_annotation = ha, col = col_logFC,
            cell_fun = function(j, i, x, y, w, h, col) { # add text to each grid
              grid.text(round(as.numeric(cholFC[i, j],2)), x, y,gp=gpar(fontsize=6))
            })
h3 <- Heatmap(cholAE, row_labels = rownames(cholAE), row_names_gp = gpar(fontsize = 6),
            cluster_rows = F, name = "AveExpr", col=col_AveExpr,
            cell_fun = function(j, i, x, y, w, h, col) { # add text to each grid
              grid.text(round(as.numeric(cholAE[i, j],2)), x, y,gp=gpar(fontsize=6))},
            column_title = "Reactome Degradation of cysteine and homocysteine")

h <- h1 + h2 + h3
h

Venn diagrams data of all DEGs, and DEGs linked to metabolism.

alldegs <- rownames(dge3sig)
updegs <- rownames(filter(dge3sig, log2FoldChange > 0))
downdegs <- rownames(filter(dge3sig, log2FoldChange < 0))
metdegs <- rownames(met)

venn_clines <- list(alldegs, updegs, downdegs, metdegs)
saveRDS(venn_clines, file = "venn_clines.rds")

# venn.diagram(
#   x = list(alldegs, updegs, downdegs),
#   category.names = c("All DEGs" , "Up", "Down"),
#   filename = 'updown_venn_diagramm.png',
#   output=TRUE
# )

Pathview to visualise metabolic pathways.

pathFC <- log2FC
rownames(pathFC) <- str_split(rownames(pathFC), " ", simplify=T)[,2]

# cholesterol
pathview(gene.data=pathFC, pathway.id="hsa04979", species="hsa",gene.idtype="SYMBOL", node.sum="sum",
         limit=list(gene=c(-10,10), cpd=1), bins=list(gene=20, cpd=1),
         low="blue",mid="white",high="red",
         same.layer=FALSE)
## 'select()' returned 1:many mapping between keys and columns
## [1] "Note: 15 of 6882 unique input IDs unmapped."
## 'select()' returned 1:1 mapping between keys and columns
## Info: Working in directory /mnt/data/mdz/projects/dftd_RNAseq_annelise/dge
## Info: Writing image file hsa04979.pathview.png
# cysteine and methionine
pathview(gene.data=pathFC, pathway.id="hsa00270", species="hsa",gene.idtype="SYMBOL", node.sum="sum",
         limit=list(gene=c(-10,10), cpd=1), bins=list(gene=20, cpd=1),
         low="blue",mid="white",high="red",
         same.layer=FALSE)
## 'select()' returned 1:many mapping between keys and columns
## [1] "Note: 15 of 6882 unique input IDs unmapped."
## 'select()' returned 1:1 mapping between keys and columns
## Info: Working in directory /mnt/data/mdz/projects/dftd_RNAseq_annelise/dge
## Info: Writing image file hsa00270.pathview.png
# steroid biosynthesis
pathview(gene.data=pathFC, pathway.id="hsa00100", species="hsa",gene.idtype="SYMBOL", node.sum="sum",
         limit=list(gene=c(-10,10), cpd=1), bins=list(gene=20, cpd=1),
         low="blue",mid="white",high="red",
         same.layer=FALSE)
## 'select()' returned 1:many mapping between keys and columns
## [1] "Note: 15 of 6882 unique input IDs unmapped."
## 'select()' returned 1:1 mapping between keys and columns
## Info: Working in directory /mnt/data/mdz/projects/dftd_RNAseq_annelise/dge
## Info: Writing image file hsa00100.pathview.png
# library(SBGNview)
# input.pathways <- findPathways("Cholesterol biosynthesis")
# SBGNview.obj <- SBGNview(
#           gene.data = pathFC, 
#           gene.id.type = "SYMBOL",
#           input.sbgn = input.pathways[7,]$pathway.id,
#           output.file = "quick.start", 
#           output.formats =  c("png")
#           ) 
# print(SBGNview.obj)

Run mitch on cell line pairwise comparisons.

mx <- vector(mode = "list", length = 9)
resx <- vector(mode = "list", length = 9)

for(i in 1:9){
  
  mx[[i]] <- mitch_import(dgex[[i]], DEtype="deseq2",geneTable=gt)
  resx[[i]] <- mitch_calc(mx[[i]], genesets3, priority="effect",cores=16)

  if ( !file.exists(paste(plotnames[i],"_mitch.html")) ) {
    mitch_report(resx[[i]], paste(plotnames[i],"_mitch.html"))
  }
  
  top <- subset(resx[[i]]$enrichment_result, p.adjustANOVA<0.05)
  
  topup <- head(subset(top,s.dist>0),20)
  
  topup %>%
  kbl(caption=paste("Top 20 upregulated metabolic pathways: ", plotnames[i])) %>%
  kable_paper("hover", full_width = F)
  
  topdn <- head(subset(top,s.dist<0),20)
  
  topdn %>%
    kbl(caption=paste("Top 20 downregulated metabolic pathways: ", plotnames[i])) %>%
    kable_paper("hover", full_width = F)
  
  top2 <- rbind(head(topup,10),head(topdn,10))
  top2 <- top2[order(-top2$s.dist),]
  par(mar=c(5,15,5,2))
  
  barplot(rev(abs(top2$s.dist)),horiz=TRUE,las=1,names.arg=rev(top2$set),
          main=paste(plotnames[i]),
          cex.names=0.6,xlab="Enrichment Score",
          col=c(rep("blue",nrow(head(topdn,10))),rep("red",nrow(head(topup,10)))),
          xlim=c(0,0.8))
  
}
## The input is a single dataframe; one contrast only. Converting
##         it to a list for you.
## Note: Mean no. genes in input = 16009
## Note: no. genes in output = 12692
## Note: estimated proportion of input genes in output = 0.793
## Note: Enrichments with large effect sizes may not be
##             statistically significant.
## The input is a single dataframe; one contrast only. Converting
##         it to a list for you.
## Note: Mean no. genes in input = 16009
## Note: no. genes in output = 12691
## Note: estimated proportion of input genes in output = 0.793
## Note: Enrichments with large effect sizes may not be
##             statistically significant.

## The input is a single dataframe; one contrast only. Converting
##         it to a list for you.
## Note: Mean no. genes in input = 16009
## Note: no. genes in output = 12691
## Note: estimated proportion of input genes in output = 0.793
## Note: Enrichments with large effect sizes may not be
##             statistically significant.

## The input is a single dataframe; one contrast only. Converting
##         it to a list for you.
## Note: Mean no. genes in input = 16009
## Note: no. genes in output = 12691
## Note: estimated proportion of input genes in output = 0.793
## Note: Enrichments with large effect sizes may not be
##             statistically significant.

## The input is a single dataframe; one contrast only. Converting
##         it to a list for you.
## Note: Mean no. genes in input = 16009
## Note: no. genes in output = 12691
## Note: estimated proportion of input genes in output = 0.793
## Note: Enrichments with large effect sizes may not be
##             statistically significant.

## The input is a single dataframe; one contrast only. Converting
##         it to a list for you.
## Note: Mean no. genes in input = 16009
## Note: no. genes in output = 12690
## Note: estimated proportion of input genes in output = 0.793
## Note: Enrichments with large effect sizes may not be
##             statistically significant.

## The input is a single dataframe; one contrast only. Converting
##         it to a list for you.
## Note: Mean no. genes in input = 16009
## Note: no. genes in output = 12689
## Note: estimated proportion of input genes in output = 0.793
## Note: Enrichments with large effect sizes may not be
##             statistically significant.

## The input is a single dataframe; one contrast only. Converting
##         it to a list for you.
## Note: Mean no. genes in input = 16009
## Note: no. genes in output = 12691
## Note: estimated proportion of input genes in output = 0.793
## Note: Enrichments with large effect sizes may not be
##             statistically significant.

## The input is a single dataframe; one contrast only. Converting
##         it to a list for you.
## Note: Mean no. genes in input = 16009
## Note: no. genes in output = 12689
## Note: estimated proportion of input genes in output = 0.793
## Note: Enrichments with large effect sizes may not be
##             statistically significant.

Make heatmaps for pairwise cell line comparisons.

# ----- Fatty acid metabolism heatmaps -----
# fatty1 <- genesets2[["REACTOME_FATTY_ACID_METABOLISM"]]
# fatty2 <- genesets2[["HALLMARK_FATTY_ACID_METABOLISM"]]
# fatty3 <- genesets2[["GOBP_FATTY_ACID_BIOSYNTHETIC_PROCESS"]]
# fatty4 <-genesets2[["KEGG_FATTY_ACID_METABOLISM"]]
# fatty <- list(fatty1=fatty1,fatty2=fatty2,fatty3=fatty3,fatty4=fatty4)
# title <- c("REACTOME_FATTY_ACID_METABOLISM","HALLMARK_FATTY_ACID_METABOLISM","GOBP_FATTY_ACID_BIOSYNTHETIC_PROCESS","KEGG_FATTY_ACID_METABOLISM")
# 
# for(i in 1:length(fatty)){
#   temp <- c()
#   for(j in 1:length(fatty[[i]])){temp <- rbind(temp,filter(hm3, geneID==fatty[[i]][j]))}
#   fattyx <- c(t(unite(temp,"ENSSHAG_geneID", ENSSHAG:geneID, sep=" ")))
#   fat <- subset(mx.scaled, rownames(mx.scaled) %in% fattyx)
#   fatFC <- subset(log2FC, rownames(log2FC) %in% fattyx)
#   fatAE <- subset(mean, rownames(mean) %in% fattyx)
#   
#   h1 <- Heatmap(fat, cluster_rows = F,
#                 column_labels = colnames(fat), name="Z-score",
#                 cluster_columns = T)
#   h2 <- Heatmap(fatFC, row_labels = rownames(fatFC), row_names_gp = gpar(fontsize = 6),
#               cluster_rows = F, name="logFC", top_annotation = ha, col = col_logFC,
#               cell_fun = function(j, i, x, y, w, h, col) { # add text to each grid
#                 grid.text(round(as.numeric(fatFC[i, j],2)), x, y, gp=gpar(fontsize=6))
#               })
#   h3 <- Heatmap(fatAE, row_labels = rownames(fatAE), row_names_gp = gpar(fontsize = 6),
#               cluster_rows = F, name = "AveExpr", col=col_AveExpr,
#               cell_fun = function(j, i, x, y, w, h, col) { # add text to each grid
#                 grid.text(round(as.numeric(fatAE[i, j],2)), x, y, gp=gpar(fontsize=6))},
#               column_title=title[i])
#   
#   name <- paste("q",i,sep="")
#   assign(name, h1 + h2 + h3)
# }
# 
# q1
# q2
# q3
# q4

Pathview to visualise metabolic pathways.

Session information

For reproducibility.

sessionInfo()
## R version 4.3.1 (2023-06-16)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.3 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0 
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
## 
## locale:
##  [1] LC_CTYPE=en_AU.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_AU.UTF-8        LC_COLLATE=en_AU.UTF-8    
##  [5] LC_MONETARY=en_AU.UTF-8    LC_MESSAGES=en_AU.UTF-8   
##  [7] LC_PAPER=en_AU.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Australia/Melbourne
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] grid      stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] magrittr_2.0.3              VennDiagram_1.7.3          
##  [3] futile.logger_1.4.3         stringr_1.5.0              
##  [5] pathview_1.40.0             circlize_0.4.15            
##  [7] RColorBrewer_1.1-3          ComplexHeatmap_2.16.0      
##  [9] tidyr_1.3.0                 dplyr_1.1.3                
## [11] kableExtra_1.3.4            limma_3.56.2               
## [13] mitch_1.12.0                DESeq2_1.40.2              
## [15] SummarizedExperiment_1.30.2 Biobase_2.60.0             
## [17] MatrixGenerics_1.12.3       matrixStats_1.0.0          
## [19] GenomicRanges_1.52.0        GenomeInfoDb_1.36.3        
## [21] IRanges_2.34.1              S4Vectors_0.38.1           
## [23] BiocGenerics_0.46.0         gplots_3.1.3               
## [25] reshape2_1.4.4             
## 
## loaded via a namespace (and not attached):
##   [1] rstudioapi_0.15.0       jsonlite_1.8.7          shape_1.4.6            
##   [4] magick_2.7.5            rmarkdown_2.24          GlobalOptions_0.1.2    
##   [7] zlibbioc_1.46.0         vctrs_0.6.3             Cairo_1.6-1            
##  [10] memoise_2.0.1           RCurl_1.98-1.12         webshot_0.5.5          
##  [13] htmltools_0.5.6         S4Arrays_1.0.6          lambda.r_1.2.4         
##  [16] sass_0.4.7              KernSmooth_2.23-22      bslib_0.5.1            
##  [19] htmlwidgets_1.6.2       plyr_1.8.8              echarts4r_0.4.5        
##  [22] futile.options_1.0.1    cachem_1.0.8            mime_0.12              
##  [25] lifecycle_1.0.3         iterators_1.0.14        pkgconfig_2.0.3        
##  [28] Matrix_1.6-1            R6_2.5.1                fastmap_1.1.1          
##  [31] GenomeInfoDbData_1.2.10 shiny_1.7.5             clue_0.3-64            
##  [34] digest_0.6.33           colorspace_2.1-0        GGally_2.1.2           
##  [37] reshape_0.8.9           AnnotationDbi_1.62.2    RSQLite_2.3.1          
##  [40] org.Hs.eg.db_3.17.0     fansi_1.0.4             httr_1.4.7             
##  [43] abind_1.4-5             compiler_4.3.1          withr_2.5.0            
##  [46] bit64_4.0.5             doParallel_1.0.17       BiocParallel_1.34.2    
##  [49] DBI_1.1.3               highr_0.10              MASS_7.3-60            
##  [52] DelayedArray_0.26.7     rjson_0.2.21            gtools_3.9.4           
##  [55] caTools_1.18.2          tools_4.3.1             beeswarm_0.4.0         
##  [58] httpuv_1.6.11           glue_1.6.2              promises_1.2.1         
##  [61] cluster_2.1.4           generics_0.1.3          gtable_0.3.4           
##  [64] xml2_1.3.5              utf8_1.2.3              XVector_0.40.0         
##  [67] foreach_1.5.2           pillar_1.9.0            later_1.3.1            
##  [70] lattice_0.21-8          bit_4.0.5               tidyselect_1.2.0       
##  [73] locfit_1.5-9.8          Biostrings_2.68.1       knitr_1.44             
##  [76] gridExtra_2.3           svglite_2.1.1           xfun_0.40              
##  [79] KEGGgraph_1.60.0        stringi_1.7.12          yaml_2.3.7             
##  [82] evaluate_0.21           codetools_0.2-19        tibble_3.2.1           
##  [85] Rgraphviz_2.44.0        graph_1.78.0            cli_3.6.1              
##  [88] xtable_1.8-4            systemfonts_1.0.4       munsell_0.5.0          
##  [91] jquerylib_0.1.4         Rcpp_1.0.11             png_0.1-8              
##  [94] XML_3.99-0.14           parallel_4.3.1          ellipsis_0.3.2         
##  [97] ggplot2_3.4.3           blob_1.2.4              bitops_1.0-7           
## [100] viridisLite_0.4.2       scales_1.2.1            purrr_1.0.2            
## [103] crayon_1.5.2            GetoptLong_1.0.5        rlang_1.1.1            
## [106] KEGGREST_1.40.0         rvest_1.0.3             formatR_1.14