Source codes: https://github.com/Anne-LiseGerard/dftd_rnaseq
suppressPackageStartupMessages({
library("reshape2")
library("gplots")
library("DESeq2")
library("mitch")
library("limma")
library("kableExtra")
library("dplyr")
library("tidyr")
library("ComplexHeatmap")
library("RColorBrewer")
library("circlize")
library("pathview")
library("stringr")
library("grid")
library("VennDiagram")
})
The goal of our study is to compare the expression of genes related to metabolism of DFT1 cell lines to DFT2 cell lines. Our main goal is comparing genes expressed in the following pathways: Glycolysis, Pentose Phosphate Pathway, Fatty Acid Metabolism, Glutaminolysis and Oxidative Phosphorylation. A secondary goal would be to compare the expression of genes related to the production of Reactive Oxygen Species (ROS) and protection against ROS (genes involved in DNA repair, production of antioxydants etc). Finally, we might also be interested in looking into genes linked to cholesterol metabolism as this has been shown to be important for DFT1 but I need to do some additional reading to know what exactly we’re after. See sample sheet below (edited from Yin Peng), there are 3 DFT1 and 3 DFT2 cell lines, and 3 replicates of each cell line:
ss <- read.table("../ss.tsv",sep="\t",fill=TRUE,header=TRUE)
ss$DFT <- as.factor(ss$DFT)
ss$clone <- sapply(strsplit(ss$ClientID,"_"),"[[",1)
ss %>%
kbl(caption="Sample sheet for all samples") %>%
kable_paper("hover", full_width = F)
C | ClientID | DFT | Replicate | Index1sequence | Index2sequence | X.ReadsIdentified.PF. | TotalReads | TotalBases.Gbases. | clone |
---|---|---|---|---|---|---|---|---|---|
6180766 | 4906_1-1 | DFT1 | 1 | TTGTTGCA | GACGTCGT | 2.4465 | 103481165 | 15.5 | 4906 |
6180767 | C5065_1-1 | DFT1 | 1 | CCACACTT | CGTCATAC | 2.8116 | 118924031 | 17.8 | C5065 |
6180768 | 1426_1-1 | DFT1 | 1 | CCCGTTTG | TTGCTGTA | 2.3264 | 98401219 | 14.8 | 1426 |
6180769 | RV_1-1 | DFT2 | 1 | ATGCTCCC | CCCTGCTG | 2.2811 | 96485136 | 14.5 | RV |
6180770 | SN_1-1 | DFT2 | 1 | GCTCAATA | CATTTATC | 2.3947 | 101290147 | 15.2 | SN |
6180771 | TD549_1-1 | DFT2 | 1 | GTAGTTCG | CTTGATGC | 2.1732 | 91921221 | 13.8 | TD549 |
6180772 | 4906_2-1 | DFT1 | 2 | CGAGAACC | ATGTATCG | 2.2322 | 94416781 | 14.2 | 4906 |
6180773 | C5065_2-1 | DFT1 | 2 | GCCATGTA | ATAGGGTT | 2.4505 | 103650355 | 15.5 | C5065 |
6180774 | 1426_2-1 | DFT1 | 2 | TTTCTCTA | CTCGACGT | 2.4778 | 104805081 | 15.7 | 1426 |
6180775 | RV_2-1 | DFT2 | 2 | CCAGCGAT | TCTAGTCA | 2.9882 | 126393794 | 19.0 | RV |
6180776 | SN_2-1 | DFT2 | 2 | TGGGAGTG | CCCGTCTA | 2.4459 | 103455786 | 15.5 | SN |
6180777 | TD549_2-1 | DFT2 | 2 | CCCTCGTA | TAGAAGAG | 2.5437 | 107592495 | 16.1 | TD549 |
6180778 | 4906_3-1 | DFT1 | 3 | CGATATGG | GGGACGTA | 2.2711 | 96062159 | 14.4 | 4906 |
6180779 | C5065_3-1 | DFT1 | 3 | TTGTGCCC | TTTCCATC | 2.2763 | 96282107 | 14.4 | C5065 |
6180780 | 1426_3-1 | DFT1 | 3 | TGTCCTCT | CGACGAAC | 2.4952 | 105541059 | 15.8 | 1426 |
6180781 | RV_3-1 | DFT2 | 3 | GTATAGTC | TTGGTCTC | 2.3325 | 98659234 | 14.8 | RV |
6180782 | SN_3-1 | DFT2 | 3 | TTTGGGAT | GTTCACGT | 2.6750 | 113146174 | 17.0 | SN |
6180783 | TD549_3-1 | DFT2 | 3 | CACCAAGC | AAAGGGAA | 2.1610 | 91405190 | 13.7 | TD549 |
DEA5_4NEG | DEA4_6NEG | NA | CGGAGAGG | CGAGCGTC | 0.0002 | 8460 | 0.0 | DEA4 |
We are mainly interested in comparing all DFT1 samples against all DFT2 samples. However, we did notice that some cell lines within a same DFT produced different results in the metabolism experiments so we might need to compare all 6 cell lines against one another and not be able to simply do a DFT1 vs DFT2 contrast, if that makes sense.
Regarding the reference transcriptome, we will use Ensembl v109.
maplot <- function(de,contrast_name) {
sig <-subset(de, padj < 0.05 )
up <-rownames(subset(de, padj < 0.05 & log2FoldChange > 0))
dn <-rownames(subset(de, padj < 0.05 & log2FoldChange < 0))
GENESUP <- length(up)
GENESDN <- length(dn)
DET=nrow(de)
SUBHEADER = paste(GENESUP, "up, ", GENESDN, "down", DET, "detected")
ns <-subset(de, padj > 0.05 )
plot(log2(de$baseMean),de$log2FoldChange,
xlab="log2 basemean", ylab="log2 foldchange",
pch=19, cex=1, col="dark gray",
main=contrast_name, cex.main=0.7)
points(log2(sig$baseMean),sig$log2FoldChange,
pch=19, cex=1, col="red")
mtext(SUBHEADER,cex = 0.7)
grid()
}
make_volcano <- function(de,name) {
sig <- subset(de,padj<0.05)
N_SIG=nrow(sig)
N_UP=nrow(subset(sig,log2FoldChange>0))
N_DN=nrow(subset(sig,log2FoldChange<0))
DET=nrow(de)
HEADER=paste(N_SIG,"@5%FDR,", N_UP, "up", N_DN, "dn", DET, "detected")
plot(de$log2FoldChange,-log10(de$padj),cex=1,pch=19,col="darkgray",
main=name, xlab="log2 FC", ylab="-log10 pval")
mtext(HEADER)
grid()
points(sig$log2FoldChange,-log10(sig$padj),cex=1,pch=19,col="red")
}
Here we load the data in from the aligner.
tmp <- read.table("../fastq/3col.tsv.gz")
x <- as.data.frame(acast(tmp, V2~V1, value.var="V3",fun.aggregate=sum))
dim(x)
## [1] 43512 19
Load gene names.
gn <- read.table("../ref/Sarcophilus_harrisii.mSarHar1.11.cdna+ncrna.genenames.tsv",fill=TRUE)
gn <- gn[order(gn$V1),]
dim(gn)
## [1] 43512 3
Load homology map
hm <- read.table("../ref/mart_export_ensembl109_2023-07-14.txt",sep="\t",header=TRUE)
Now need to collapse transcript data to genes.
x$gene <- paste(gn$V2,gn$V3)
y <- aggregate(. ~ gene,x,sum)
rownames(y) <- y$gene
y$gene = NULL
dim(y)
## [1] 23829 19
Samples with <1M reads should be omitted. Will also round values to integers.
cs <- colSums(y)
cs <- cs[order(cs)]
par(mar=c(5,10,5,2))
barplot(cs,main="All samples",horiz=TRUE,las=1)
abline(v=1e7,col="red",lty=2)
y <- round(y)
This will help us to visualise the sources of variability in the overall dataset.
Plot MDS and then remove the negative control and run Plot MDS again.
Also fix the sample names.
Plot MDS by DFT.
par(mar = c(5.1, 4.1, 4.1, 2.1) )
plotMDS(y)
y <- y[,colnames(y) != "DEA5-4NEG"]
plotMDS(y)
ss <- ss[ss$ClientID != "DEA4_6NEG",]
colnames(y) <- sapply(strsplit(ss$ClientID,"-"),"[[",1)
cs <- colSums(y)
cs <- cs[order(cs)]
par(mar=c(5,10,5,2))
barplot(cs,main="All samples",horiz=TRUE,las=1)
par(mar = c(5.1, 4.1, 4.1, 2.1) )
plotMDS(y)
cols <- ss$DFT
cols <- gsub("DFT1","pink",cols)
cols <- gsub("DFT2","lightblue",cols)
mymds <- plotMDS(y,plot=FALSE)
# fix the xlims
XMIN=min(mymds$x)
XMAX=max(mymds$x)
XMID=(XMAX+XMIN)/2
XMIN <- XMID + (XMIN-XMID)*1.1
XMAX <- XMID+(XMAX-XMID)*1.1
plotMDS(mymds,pch=19,cex=3,col=cols,main="MDS plot",xlim=c(XMIN,XMAX))
text(mymds,labels=colnames(y))
mtext("pink=DFT1,blue=DFT2")
y_DFT1 <- y[,colnames(y) %in% c("4906_1","4906_2","4906_3",
"1426_1","1426_2","1426_3",
"C5065_1","C5065_2","C5065_3")]
y_DFT2 <- y[,colnames(y) %in% c("RV_1","RV_2","RV_3",
"SN_1","SN_2","SN_3",
"TD549_1","TD549_2","TD549_3")]
mdsDFT1 <- plotMDS(y_DFT1,plot=FALSE)
plotMDS(mdsDFT1,pch=19,cex=3,col="pink",main="MDS plot",xlim=c(XMIN,XMAX))
text(mdsDFT1,labels=colnames(y_DFT1))
mtext("DFT1")
mdsDFT2 <- plotMDS(y_DFT2,plot=FALSE)
plotMDS(mdsDFT2,pch=19,cex=3,col="lightblue",main="MDS plot",xlim=c(XMIN,XMAX))
text(mdsDFT2, labels=colnames(y_DFT2))
mtext("DFT2")
Run a simple differential analysis comparing DFT1 vs DFT2 ignoring the clone type as a source of variation.
# split data will be necessary for the smaller comparisons (but not this one)
y1 <- y
ss1 <- ss
y1 <- y1[which(rowMeans(y1)>10),]
dim(y1)
## [1] 16009 18
dds <- DESeqDataSetFromMatrix(countData = y1 , colData = ss1, design = ~ DFT )
## converting counts to integer mode
## factor levels were dropped which had no samples
res <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 4 genes
## -- DESeq argument 'minReplicatesForReplace' = 7
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
z<- results(res)
vsd <- vst(dds)
zz<-cbind(as.data.frame(z),assay(vsd))
dge<-as.data.frame(zz[order(zz$pvalue),])
dge1 <- dge
sig <- subset(dge,padj<0.05)
sig1_up <- rownames(subset(sig,log2FoldChange>0))
sig1_dn <- rownames(subset(sig,log2FoldChange<0))
length(sig1_up)
## [1] 5871
length(sig1_dn)
## [1] 6006
maplot(dge1,"DFT1 vs DFT2")
make_volcano(dge1,"DFT1 vs DFT2")
sig[1:50,1:6] %>%
kbl(caption="Comparison of DFT1 vs DFT2") %>%
kable_paper("hover", full_width = F)
baseMean | log2FoldChange | lfcSE | stat | pvalue | padj | |
---|---|---|---|---|---|---|
ENSSHAG00000000893.2 NA | 761.6994 | -4.745935 | 0.1042514 | -45.52393 | 0 | 0 |
ENSSHAG00000001230.2 NA | 9964.7623 | -7.268224 | 0.1727670 | -42.06952 | 0 | 0 |
ENSSHAG00000001308.2 NA | 1591.3721 | -4.481411 | 0.1093947 | -40.96554 | 0 | 0 |
ENSSHAG00000001552.2 TDRD5 | 1386.5142 | -5.550630 | 0.0945061 | -58.73301 | 0 | 0 |
ENSSHAG00000001728.2 ABRAXAS2 | 2447.7801 | -2.611974 | 0.0580287 | -45.01174 | 0 | 0 |
ENSSHAG00000001754.2 NA | 573.3750 | -4.129922 | 0.1018094 | -40.56522 | 0 | 0 |
ENSSHAG00000002134.2 NA | 31866.6922 | 2.222360 | 0.0386947 | 57.43325 | 0 | 0 |
ENSSHAG00000002808.2 MICAL1 | 9230.0290 | -2.032272 | 0.0528958 | -38.42032 | 0 | 0 |
ENSSHAG00000003434.2 NA | 5640.3455 | -9.630769 | 0.2085777 | -46.17354 | 0 | 0 |
ENSSHAG00000003458.2 JMJD1C | 2343.7511 | -2.896812 | 0.0747449 | -38.75599 | 0 | 0 |
ENSSHAG00000003737.2 NA | 9677.3586 | 11.783418 | 0.2132465 | 55.25727 | 0 | 0 |
ENSSHAG00000004947.2 THBS2 | 24172.5467 | 13.496286 | 0.3231374 | 41.76640 | 0 | 0 |
ENSSHAG00000005182.2 SERPINE1 | 7064.0510 | 11.869506 | 0.2994322 | 39.64005 | 0 | 0 |
ENSSHAG00000005550.2 MEIOC | 1485.7532 | -7.289256 | 0.1694244 | -43.02364 | 0 | 0 |
ENSSHAG00000007070.2 SMC1B | 3980.3698 | -10.063501 | 0.1851324 | -54.35841 | 0 | 0 |
ENSSHAG00000007582.2 NA | 2902.0019 | -7.095020 | 0.0846261 | -83.83964 | 0 | 0 |
ENSSHAG00000007784.2 NA | 5517.6882 | -7.314052 | 0.1015625 | -72.01526 | 0 | 0 |
ENSSHAG00000007965.2 NA | 5215.8477 | -1.747971 | 0.0370471 | -47.18243 | 0 | 0 |
ENSSHAG00000008051.2 TEX14 | 1537.4858 | -3.531985 | 0.0617160 | -57.22965 | 0 | 0 |
ENSSHAG00000009584.2 PLEKHG1 | 1861.7962 | 9.379637 | 0.2481401 | 37.79977 | 0 | 0 |
ENSSHAG00000010367.2 ECM1 | 12497.0681 | 9.411970 | 0.1164250 | 80.84150 | 0 | 0 |
ENSSHAG00000010645.2 COL16A1 | 6094.7776 | 6.056254 | 0.0950462 | 63.71906 | 0 | 0 |
ENSSHAG00000010824.2 MPDZ | 4232.6012 | -5.438458 | 0.0941097 | -57.78850 | 0 | 0 |
ENSSHAG00000010902.2 SMARCD3 | 2489.7010 | -2.649718 | 0.0440929 | -60.09400 | 0 | 0 |
ENSSHAG00000011135.2 NA | 5620.2000 | 4.064532 | 0.0895867 | 45.36980 | 0 | 0 |
ENSSHAG00000011480.2 ADGRB2 | 1933.7632 | -7.544502 | 0.1814507 | -41.57880 | 0 | 0 |
ENSSHAG00000011750.2 NA | 3080.7121 | 3.088431 | 0.0735615 | 41.98432 | 0 | 0 |
ENSSHAG00000011809.2 NA | 5508.1945 | 3.140454 | 0.0593806 | 52.88691 | 0 | 0 |
ENSSHAG00000012182.2 NA | 1038.5611 | 7.742252 | 0.2036373 | 38.01982 | 0 | 0 |
ENSSHAG00000013722.2 NA | 524.0949 | -4.688777 | 0.0900165 | -52.08798 | 0 | 0 |
ENSSHAG00000014390.2 IFRD1 | 2454.6507 | -1.386811 | 0.0345922 | -40.09028 | 0 | 0 |
ENSSHAG00000014430.2 NA | 1157.2248 | -1.742533 | 0.0399895 | -43.57472 | 0 | 0 |
ENSSHAG00000015236.2 FKBP6 | 598.2945 | -5.562106 | 0.1042482 | -53.35446 | 0 | 0 |
ENSSHAG00000016212.2 SLC25A14 | 887.2862 | -6.791421 | 0.1667459 | -40.72915 | 0 | 0 |
ENSSHAG00000016252.2 LOXL2 | 8715.4741 | 8.867010 | 0.1933921 | 45.84991 | 0 | 0 |
ENSSHAG00000016335.2 NA | 21006.6606 | 2.668634 | 0.0567139 | 47.05436 | 0 | 0 |
ENSSHAG00000017128.2 F10 | 515.2000 | 6.578090 | 0.1519688 | 43.28578 | 0 | 0 |
ENSSHAG00000017996.2 TNFRSF21 | 1876.2975 | -7.424893 | 0.1564041 | -47.47248 | 0 | 0 |
ENSSHAG00000018060.2 HR | 3216.8474 | -7.036189 | 0.1791811 | -39.26859 | 0 | 0 |
ENSSHAG00000018469.2 PDE10A | 1279.8709 | -5.655401 | 0.1229592 | -45.99414 | 0 | 0 |
ENSSHAG00000020521.1 CDK3 | 2068.9726 | 4.241310 | 0.0605836 | 70.00762 | 0 | 0 |
ENSSHAG00000021318.1 NA | 2418.0580 | -2.504689 | 0.0455484 | -54.98958 | 0 | 0 |
ENSSHAG00000021464.1 NA | 3962.2831 | -8.392331 | 0.1785238 | -47.00960 | 0 | 0 |
ENSSHAG00000022172.1 HAPLN2 | 3697.2723 | -7.391660 | 0.1427380 | -51.78480 | 0 | 0 |
ENSSHAG00000022666.1 NA | 550.9103 | 6.404774 | 0.1677749 | 38.17480 | 0 | 0 |
ENSSHAG00000022966.1 SOX8 | 1304.3613 | -4.733014 | 0.1196103 | -39.57027 | 0 | 0 |
ENSSHAG00000023142.1 NA | 20998.8159 | -2.483109 | 0.0319815 | -77.64204 | 0 | 0 |
ENSSHAG00000023249.1 NA | 974.9689 | 5.489140 | 0.1204114 | 45.58656 | 0 | 0 |
ENSSHAG00000023680.1 EGR3 | 7293.3439 | -4.014795 | 0.1044284 | -38.44541 | 0 | 0 |
ENSSHAG00000024021.1 NA | 1284.6234 | 5.713744 | 0.1294418 | 44.14140 | 0 | 0 |
write.table(dge,file="dge1.tsv",sep="\t")
# heatmap
mx <- sig[,7:ncol(sig)]
mx <- head(mx,30)
colfunc <- colorRampPalette(c("blue", "white", "red"))
heatmap.2(as.matrix(mx),trace="none",scale="row",
col=colfunc(25),ColSideColors=cols,mar=c(5,14),
cexRow=0.8,cexCol=0.7)
Sum replicates.
x4906 <- rowSums(y[,ss$clone=="4906"])
xC5065 <- rowSums(y[,ss$clone=="C5065"])
x1426 <- rowSums(y[,ss$clone=="1426"])
xRV <- rowSums(y[,ss$clone=="RV"])
xSN <- rowSums(y[,ss$clone=="SN"])
xTD549 <- rowSums(y[,ss$clone=="TD549"])
y2 <- data.frame(x4906,xC5065,x1426,xRV,xSN,xTD549)
ss2 <- as.data.frame(colnames(y2))
colnames(ss2) <- "clone"
ss2$DFT <- factor(c("DFT1","DFT1","DFT1","DFT2","DFT2","DFT2"))
y2 <- y2[which(rowMeans(y2)>10),]
dim(y2)
## [1] 17468 6
dds <- DESeqDataSetFromMatrix(countData = y2 , colData = ss2, design = ~ DFT )
## converting counts to integer mode
dds2 <- dds
res <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
z<- results(res)
vsd <- vst(dds)
zz<-cbind(as.data.frame(z),assay(vsd))
dge<-as.data.frame(zz[order(zz$pvalue),])
dge2 <- dge
sig <- subset(dge,padj<0.05)
sig2_up <- rownames(subset(sig,log2FoldChange>0))
sig2_dn <- rownames(subset(sig,log2FoldChange<0))
length(sig2_up)
## [1] 4200
length(sig2_dn)
## [1] 4412
maplot(dge2,"DFT1 vs DFT2 aggregated")
make_volcano(dge2,"DFT1 vs DFT2 aggregated")
sig[1:50,1:6] %>%
kbl(caption="Comparison of DFT1 vs DFT2") %>%
kable_paper("hover", full_width = F)
baseMean | log2FoldChange | lfcSE | stat | pvalue | padj | |
---|---|---|---|---|---|---|
ENSSHAG00000007582.2 NA | 8720.2072 | -7.090091 | 0.1734257 | -40.88258 | 0 | 0 |
ENSSHAG00000024692.1 NA | 11176.1973 | -8.580770 | 0.2275391 | -37.71118 | 0 | 0 |
ENSSHAG00000007784.2 NA | 16519.5514 | -7.300242 | 0.1954949 | -37.34236 | 0 | 0 |
ENSSHAG00000010367.2 ECM1 | 37884.8561 | 9.429385 | 0.2567216 | 36.73000 | 0 | 0 |
ENSSHAG00000026705.1 LOXL3 | 35605.2404 | 3.028419 | 0.0834311 | 36.29846 | 0 | 0 |
ENSSHAG00000003737.2 NA | 29315.0122 | 11.793401 | 0.3388989 | 34.79917 | 0 | 0 |
ENSSHAG00000032086.1 NA | 4270.9538 | -9.087633 | 0.2656490 | -34.20918 | 0 | 0 |
ENSSHAG00000020521.1 CDK3 | 6254.9578 | 4.247335 | 0.1329351 | 31.95045 | 0 | 0 |
ENSSHAG00000010645.2 COL16A1 | 18470.4887 | 6.069424 | 0.1909073 | 31.79252 | 0 | 0 |
ENSSHAG00000015236.2 FKBP6 | 1799.3194 | -5.552135 | 0.1749135 | -31.74217 | 0 | 0 |
ENSSHAG00000031815.1 RAB33A | 4153.4227 | -8.057672 | 0.2543513 | -31.67930 | 0 | 0 |
ENSSHAG00000031607.1 NA | 2388.4986 | 6.299003 | 0.2049613 | 30.73264 | 0 | 0 |
ENSSHAG00000003434.2 NA | 16944.8869 | -9.624643 | 0.3183782 | -30.23022 | 0 | 0 |
ENSSHAG00000013199.2 PDGFRA | 101648.6230 | 15.673386 | 0.5200441 | 30.13857 | 0 | 0 |
ENSSHAG00000024117.1 NA | 6007.3116 | 7.935832 | 0.2646438 | 29.98684 | 0 | 0 |
ENSSHAG00000031627.1 PRX | 228602.7417 | -10.824218 | 0.3631920 | -29.80302 | 0 | 0 |
ENSSHAG00000007782.2 CA8 | 2062.4938 | 8.726159 | 0.2990458 | 29.18000 | 0 | 0 |
ENSSHAG00000018310.2 SLIT2 | 44695.3018 | -14.414172 | 0.4983908 | -28.92142 | 0 | 0 |
ENSSHAG00000021464.1 NA | 11908.1052 | -8.384005 | 0.2916414 | -28.74765 | 0 | 0 |
ENSSHAG00000010824.2 MPDZ | 12727.1471 | -5.429957 | 0.1920191 | -28.27821 | 0 | 0 |
ENSSHAG00000000643.2 STK31 | 4182.6920 | -9.537041 | 0.3373093 | -28.27387 | 0 | 0 |
ENSSHAG00000023142.1 NA | 63163.2147 | -2.474963 | 0.0903962 | -27.37905 | 0 | 0 |
ENSSHAG00000025566.1 ABLIM1 | 7772.7034 | 5.814237 | 0.2129981 | 27.29713 | 0 | 0 |
ENSSHAG00000021789.1 NA | 1860.1298 | 8.577176 | 0.3143469 | 27.28571 | 0 | 0 |
ENSSHAG00000024021.1 NA | 3895.6967 | 5.733097 | 0.2107928 | 27.19778 | 0 | 0 |
ENSSHAG00000030292.1 NA | 1933.0281 | -5.410220 | 0.1994706 | -27.12289 | 0 | 0 |
ENSSHAG00000031074.1 NEXMIF | 4117.0924 | -8.404272 | 0.3123135 | -26.90973 | 0 | 0 |
ENSSHAG00000008051.2 TEX14 | 4625.9527 | -3.525105 | 0.1326009 | -26.58431 | 0 | 0 |
ENSSHAG00000004947.2 THBS2 | 73193.5959 | 13.509919 | 0.5132476 | 26.32242 | 0 | 0 |
ENSSHAG00000022172.1 HAPLN2 | 11123.1806 | -7.382997 | 0.2822149 | -26.16090 | 0 | 0 |
ENSSHAG00000026637.1 NA | 2213.7312 | -5.071669 | 0.1942619 | -26.10738 | 0 | 0 |
ENSSHAG00000005380.2 NA | 3108.7669 | 10.228790 | 0.3940820 | 25.95599 | 0 | 0 |
ENSSHAG00000018469.2 PDE10A | 3854.3231 | -5.647349 | 0.2179279 | -25.91384 | 0 | 0 |
ENSSHAG00000011809.2 NA | 16636.2524 | 3.149445 | 0.1232734 | 25.54845 | 0 | 0 |
ENSSHAG00000029638.1 NA | 18764.0415 | -6.179495 | 0.2425972 | -25.47224 | 0 | 0 |
ENSSHAG00000016252.2 LOXL2 | 26417.8025 | 8.878268 | 0.3521803 | 25.20944 | 0 | 0 |
ENSSHAG00000001552.2 TDRD5 | 4168.0657 | -5.549233 | 0.2251434 | -24.64755 | 0 | 0 |
ENSSHAG00000006728.2 NEFM | 7555.0072 | -10.855938 | 0.4479166 | -24.23652 | 0 | 0 |
ENSSHAG00000013722.2 NA | 1575.4562 | -4.678030 | 0.1936782 | -24.15363 | 0 | 0 |
ENSSHAG00000014886.2 CYP2R1 | 2488.5667 | -7.465866 | 0.3091695 | -24.14813 | 0 | 0 |
ENSSHAG00000031108.1 NA | 964.2578 | -7.703956 | 0.3207488 | -24.01866 | 0 | 0 |
ENSSHAG00000005492.2 NA | 1659.5069 | -8.999424 | 0.3748897 | -24.00553 | 0 | 0 |
ENSSHAG00000027513.1 NA | 1802.2901 | -5.271118 | 0.2200576 | -23.95335 | 0 | 0 |
ENSSHAG00000000893.2 NA | 2287.3973 | -4.736996 | 0.1987138 | -23.83828 | 0 | 0 |
ENSSHAG00000005182.2 SERPINE1 | 21367.7733 | 11.866757 | 0.5041184 | 23.53962 | 0 | 0 |
ENSSHAG00000010102.2 UNC5A | 1472.0133 | -8.985720 | 0.3860205 | -23.27783 | 0 | 0 |
ENSSHAG00000031142.1 TPPP | 2664.1676 | -5.954920 | 0.2565656 | -23.21012 | 0 | 0 |
ENSSHAG00000022666.1 NA | 1669.6189 | 6.414065 | 0.2781309 | 23.06132 | 0 | 0 |
ENSSHAG00000010515.2 NA | 2972.5658 | 9.167129 | 0.3999684 | 22.91964 | 0 | 0 |
ENSSHAG00000010902.2 SMARCD3 | 7489.5510 | -2.642803 | 0.1163671 | -22.71092 | 0 | 0 |
write.table(dge2,file="dge2.tsv",sep="\t")
# heatmap
mx <- sig[,7:ncol(sig)]
mx <- head(mx,30)
colfunc <- colorRampPalette(c("blue", "white", "red"))
heatmap.2(as.matrix(mx),trace="none",scale="row",
col=colfunc(25),ColSideColors=cols[1:6],mar=c(8,14),
cexRow=0.7,cexCol=1)
First 4906 vs RV, 4906 vs SN and 4906 vs TD549. Then 1426 vs RV, 1426 vs SN and 1426 vs TD549. Finally C5065 vs RV, C5065 vs SN and C5065 vs TD549.
yx <- vector(mode = "list", length = 9)
yx[[1]] <- select(y1, "4906_1", "4906_2", "4906_3", "RV_1", "RV_2", "RV_3")
yx[[2]] <- select(y1, "4906_1", "4906_2", "4906_3", "SN_1", "SN_2", "SN_3")
yx[[3]] <- select(y1, "4906_1", "4906_2", "4906_3", "TD549_1", "TD549_2", "TD549_3")
yx[[4]] <- select(y1, "1426_1", "1426_2", "1426_3", "RV_1", "RV_2", "RV_3")
yx[[5]] <- select(y1, "1426_1", "1426_2", "1426_3", "SN_1", "SN_2", "SN_3")
yx[[6]] <- select(y1, "1426_1", "1426_2", "1426_3", "TD549_1", "TD549_2", "TD549_3")
yx[[7]] <- select(y1, "C5065_1", "C5065_2", "C5065_3", "RV_1", "RV_2", "RV_3")
yx[[8]] <- select(y1, "C5065_1", "C5065_2", "C5065_3", "SN_1", "SN_2", "SN_3")
yx[[9]] <- select(y1, "C5065_1", "C5065_2", "C5065_3", "TD549_1", "TD549_2", "TD549_3")
ssx <- vector(mode = "list", length = 9)
dds <- vector(mode = "list", length = 9)
res <- vector(mode = "list", length = 9)
z <- vector(mode = "list", length = 9)
vsd <- vector(mode = "list", length = 9)
zz <- vector(mode = "list", length = 9)
dgex <- vector(mode = "list", length = 9)
sigx <- vector(mode = "list", length = 9)
for(i in 1:9){
print(paste("Processing comparison: ", i, " on 9"))
ssx[[i]] <- as.data.frame(colnames(yx[[i]]))
colnames(ssx[[i]]) <- "clone"
ssx[[i]]$DFT <- factor(c("DFT1","DFT1","DFT1","DFT2","DFT2","DFT2"))
dds[[i]] <- DESeqDataSetFromMatrix(countData = yx[[i]], colData = ssx[[i]],
design = ~ DFT)
res[[i]] <- DESeq(dds[[i]], quiet=TRUE)
z[[i]] <- results(res[[i]])
vsd[[i]] <- vst(dds[[i]])
zz[[i]] <- cbind(as.data.frame(z[[i]]), assay(vsd[[i]]))
dgex[[i]] <- as.data.frame(zz[[i]][order(zz[[i]]$pvalue),])
sigx[[i]] <- subset(dgex[[i]], padj < 0.05)
}
## [1] "Processing comparison: 1 on 9"
## converting counts to integer mode
## [1] "Processing comparison: 2 on 9"
## converting counts to integer mode
## [1] "Processing comparison: 3 on 9"
## converting counts to integer mode
## [1] "Processing comparison: 4 on 9"
## converting counts to integer mode
## [1] "Processing comparison: 5 on 9"
## converting counts to integer mode
## [1] "Processing comparison: 6 on 9"
## converting counts to integer mode
## [1] "Processing comparison: 7 on 9"
## converting counts to integer mode
## [1] "Processing comparison: 8 on 9"
## converting counts to integer mode
## [1] "Processing comparison: 9 on 9"
## converting counts to integer mode
plotnames <- c("4906 vs RV", "4906 vs SN", "4906 vs TD549",
"1426 vs RV", "1426 vs SN", "1426 vs TD549",
"C5065 vs RV", "C5065 vs SN", "C5065 vs TD549")
for(i in 1:9){
maplot(dgex[[i]],plotnames[i])
make_volcano(dgex[[i]],plotnames[i])
sigx[[i]][1:50,1:6] %>%
kbl(caption=plotnames[i]) %>%
kable_paper("hover", full_width = F)
write.table(dgex[[i]],file=paste(plotnames[i],".tsv"),sep="\t")
mx <- sigx[[i]][,7:ncol(sigx[[i]])]
mx <- head(mx,30)
colfunc <- colorRampPalette(c("blue", "white", "red"))
heatmap.2(as.matrix(mx),trace="none",scale="row",
col=colfunc(25),ColSideColors=cols[1:6],mar=c(8,14),
cexRow=0.7,cexCol=1)
}
Need to get human homologs of these genes. These were obtained from Ensembl v109 biomart (website).
rownames(dge1) <- sapply(strsplit(rownames(dge1),"\\."),"[[",1)
rownames(dge2) <- sapply(strsplit(rownames(dge2),"\\."),"[[",1)
for(i in 1:9){
rownames(dgex[[i]]) <- sapply(strsplit(rownames(dgex[[i]]),"\\."),"[[",1)
}
hm2 <- hm[hm$Tasmanian.devil.gene.stable.ID != "",]
gt <- hm2[,2:3]
length(unique(gt$Tasmanian.devil.gene.stable.ID))
## [1] 16979
length(unique(gt$Gene.name))
## [1] 16646
for(i in 1:length(gt$Gene.name)){
if(gt$Gene.name[i]==""){gt$Gene.name[i] <- gt$Tasmanian.devil.gene.stable.ID[i]}
}
Now run mitch for DGE2. There will be a report generated that has more details on the enrichment analysis. The pathways are sourced from Reactome 7th July 2023.
genesets <- gmt_import("../ref/ReactomePathways_2023-07-14.gmt")
m2 <- mitch_import(dge2, DEtype="deseq2", geneTable=gt)
## The input is a single dataframe; one contrast only. Converting
## it to a list for you.
## Note: Mean no. genes in input = 17468
## Note: no. genes in output = 13563
## Note: estimated proportion of input genes in output = 0.776
head(m2)
## x
## A2M -13.13626596
## A4GALT -0.07278225
## AAAS -4.33508400
## AACS 2.13961007
## AADAC -2.90578338
## AADACL3 1.15402966
res2 <- mitch_calc(m2, genesets, priority="effect",cores=16)
## Note: Enrichments with large effect sizes may not be
## statistically significant.
if ( !file.exists("mitch2.html") ) {
mitch_report(res2, "mitch2.html")
}
top <- subset(res2$enrichment_result,p.adjustANOVA<0.05)
topup <- head(subset(top,s.dist>0),20)
topup %>%
kbl(caption="Top 20 upregulated pathways") %>%
kable_paper("hover", full_width = F)
set | setSize | pANOVA | s.dist | p.adjustANOVA | |
---|---|---|---|---|---|
687 | Metallothioneins bind metals | 10 | 0.0000122 | 0.7987899 | 0.0013275 |
1075 | Response to metal ions | 13 | 0.0000082 | 0.7143457 | 0.0009848 |
229 | Crosslinking of collagen fibrils | 16 | 0.0001180 | 0.5559533 | 0.0070965 |
42 | Activation of Matrix Metalloproteinases | 22 | 0.0000129 | 0.5372404 | 0.0013275 |
207 | Collagen degradation | 47 | 0.0000000 | 0.5181408 | 0.0000004 |
157 | Caspase activation via Death Receptors in the presence of ligand | 12 | 0.0019500 | 0.5164748 | 0.0460725 |
456 | GRB2:SOS provides linkage to MAPK signaling for Integrins | 12 | 0.0019840 | 0.5156200 | 0.0460725 |
206 | Collagen chain trimerization | 30 | 0.0000013 | 0.5100175 | 0.0001749 |
185 | Cholesterol biosynthesis | 24 | 0.0000356 | 0.4875237 | 0.0027071 |
205 | Collagen biosynthesis and modifying enzymes | 50 | 0.0000000 | 0.4811012 | 0.0000010 |
171 | Cell-extracellular matrix interactions | 18 | 0.0004405 | 0.4785202 | 0.0205059 |
572 | Integrin cell surface interactions | 63 | 0.0000000 | 0.4337449 | 0.0000008 |
337 | ECM proteoglycans | 61 | 0.0000000 | 0.4178543 | 0.0000030 |
1147 | Signal transduction by L1 | 21 | 0.0016068 | 0.3977221 | 0.0435412 |
208 | Collagen formation | 68 | 0.0000001 | 0.3790227 | 0.0000095 |
573 | Integrin signaling | 23 | 0.0016662 | 0.3787875 | 0.0435412 |
1247 | Syndecan interactions | 24 | 0.0014009 | 0.3767696 | 0.0435412 |
93 | Assembly of collagen fibrils and other multimeric structures | 47 | 0.0000244 | 0.3559658 | 0.0021982 |
183 | Chemokine receptors bind chemokines | 27 | 0.0015493 | 0.3520160 | 0.0435412 |
287 | Degradation of the extracellular matrix | 107 | 0.0000000 | 0.3432544 | 0.0000004 |
topdn <- head(subset(top,s.dist<0),20)
topdn %>%
kbl(caption="Top 20 downregulated pathways") %>%
kable_paper("hover", full_width = F)
set | setSize | pANOVA | s.dist | p.adjustANOVA | |
---|---|---|---|---|---|
286 | Degradation of cysteine and homocysteine | 11 | 0.0003114 | -0.6278174 | 0.0160496 |
1243 | Sulfur amino acid metabolism | 23 | 0.0016897 | -0.3782930 | 0.0435412 |
1359 | Transmission across Chemical Synapses | 195 | 0.0009484 | -0.1376402 | 0.0333775 |
top2 <- rbind(head(topup,10),head(topdn,10))
top2 <- top2[order(-top2$s.dist),]
par(mar=c(5,15,5,2))
barplot(rev(abs(top2$s.dist)),horiz=TRUE,las=1,names.arg=rev(top2$set),
main="Top REACTOME Pathways",cex.names=0.6,xlab="Enrichment Score",
col=c(rep("blue",nrow(head(topdn,10))),rep("red",nrow(head(topup,10)))),
xlim=c(0,0.8))
Run mitch with curated metabolic pathways.
KEGG gene sets under the “Metabolism” hierarchy were sourced from MSigDb in August 2023.
genesets2 <- gmt_import("../ref/gsea/gsea_KEGG.gmt")
res2 <- mitch_calc(m2, genesets2, priority="effect",cores=16)
## Note: Enrichments with large effect sizes may not be
## statistically significant.
if ( !file.exists("mitch2.html") ) {
mitch_report(res2, "mitch2.html")
}
top <- subset(res2$enrichment_result,p.adjustANOVA<0.05)
topup <- head(subset(top,s.dist>0),20)
topup %>%
kbl(caption="Top 20 upregulated pathways") %>%
kable_paper("hover", full_width = F)
set | setSize | pANOVA | s.dist | p.adjustANOVA | |
---|---|---|---|---|---|
51 | KEGG_STEROID_BIOSYNTHESIS | 14 | 0.0004408 | 0.542486 | 0.0251253 |
topdn <- head(subset(top,s.dist<0),20)
topdn %>%
kbl(caption="Top 20 downregulated pathways") %>%
kable_paper("hover", full_width = F)
set | setSize | pANOVA | s.dist | p.adjustANOVA |
---|---|---|---|---|
top2 <- rbind(head(topup,10),head(topdn,10))
top2 <- top2[order(-top2$s.dist),]
par(mar=c(5,15,5,2))
barplot(rev(abs(top2$s.dist)),horiz=TRUE,las=1,names.arg=rev(top2$set),
main="Top KEGG Pathways",cex.names=0.6,xlab="Enrichment Score",
col=c(rep("blue",nrow(head(topdn,10))),rep("red",nrow(head(topup,10)))),
xlim=c(0,0.8))
Same with Reactome this time. Reactome gene sets of the metabolism hierarchy were sourced from the Reactome website in August 2023.
hierarchy <- read.table("../ref/gsea/ReactomePathwaysRelation.txt", sep ="\t", header = F, dec =".")
pathways <- read.delim("../ref/gsea/ReactomePathways.txt", header=FALSE)
hierarchy <- hierarchy[grep("HSA", hierarchy$V1), ] # keep only homo sapiens data
pathways <- pathways[grep("HSA", pathways$V1), 1:2]
metabo_pathways <- "R-HSA-1430728" # reactome code for metabolism
# this is very bad coding - but it works
# why 325? that is when while loop crashes because all branches of the hierarchy have been looped over
# will have to change this if Reactome data is updated
while(length(metabo_pathways) < 325){
for(i in 1:length(metabo_pathways)){
metabo_pathways <- unique(c(metabo_pathways,filter(hierarchy, V1 == metabo_pathways[i])$V2))
}
}
mpathways <- c()
for(i in 1:length(metabo_pathways)){mpathways <- c(mpathways,filter(pathways, V1 == metabo_pathways[i])$V2)}
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:tidyr':
##
## extract
## The following object is masked from 'package:GenomicRanges':
##
## subtract
genesets3 <- extract(genesets, mpathways)
# remove empty sets (ie hierarchy exists but no gene set is associated to it)
genesets3 <- genesets3[lapply(genesets3,length)>0]
m3 <- mitch_import(dge2, DEtype="deseq2", geneTable=gt)
## The input is a single dataframe; one contrast only. Converting
## it to a list for you.
## Note: Mean no. genes in input = 17468
## Note: no. genes in output = 13563
## Note: estimated proportion of input genes in output = 0.776
res3 <- mitch_calc(m3, genesets3, priority="effect", cores=16)
## Note: Enrichments with large effect sizes may not be
## statistically significant.
if ( !file.exists("mitch3.html") ) {
mitch_report(res3, "mitch3.html")
}
top <- subset(res3$enrichment_result,p.adjustANOVA<0.05)
topup <- head(subset(top,s.dist>0),20)
topup %>%
kbl(caption="Top 20 upregulated metabolic pathways") %>%
kable_paper("hover", full_width = F)
set | setSize | pANOVA | s.dist | p.adjustANOVA | |
---|---|---|---|---|---|
77 | Cholesterol biosynthesis | 24 | 3.56e-05 | 0.4875237 | 0.0048833 |
topdn <- head(subset(top,s.dist<0),20)
topdn %>%
kbl(caption="Top 20 downregulated metabolic pathways") %>%
kable_paper("hover", full_width = F)
set | setSize | pANOVA | s.dist | p.adjustANOVA | |
---|---|---|---|---|---|
88 | Degradation of cysteine and homocysteine | 11 | 0.0003114 | -0.6278174 | 0.0213327 |
top2 <- rbind(head(topup,10),head(topdn,10))
top2 <- top2[order(-top2$s.dist),]
par(mar=c(5,15,5,2))
barplot(rev(abs(top2$s.dist)),horiz=TRUE,las=1,names.arg=rev(top2$set),
main="Top REACTOME Metabolic Pathways",cex.names=0.6,xlab="Enrichment Score",
col=c(rep("blue",nrow(head(topdn,10))),rep("red",nrow(head(topup,10)))),
xlim=c(0,0.8))
# see https://www.youtube.com/watch?v=ht1r34-ifVI for reference
# find corresponding human genes to devil ENS IDs
dge3 <- dge2
dge3$ENSSHAG <- rownames(dge3)
hm3 <- data.frame(ENSSHAG=hm2[,2],geneID=hm2[,3])
dge3 <- left_join(dge3, hm3, by="ENSSHAG")
dge3 <- unite(dge3,"ENSSHAG_geneID", ENSSHAG:geneID, sep=" ", remove=F)
dge3 <- unique(dge3)
rownames(dge3) <- dge3$ENSSHAG_geneID
# only plot significant DEGs
dge3sig <- filter(dge3, padj < 0.05)
dge3sig <- dge3sig[order(dge3sig$log2FoldChange, decreasing=TRUE),]
mx <- dge3sig[,7:ncol(dge3sig)] # get normalised counts
mx.scaled <- t(apply(mx[,1:6], 1, scale)) # center and scale each column (Z-score)
colnames(mx.scaled) <- colnames(mx[,1:6])
log2FC <- as.matrix(dge3sig$log2FoldChange)
rownames(log2FC) <- rownames(dge3sig)
colnames(log2FC) <- "logFC"
mean <- as.matrix(dge3sig$baseMean)
rownames(mean) <- rownames(dge3sig)
colnames(mean) <- "AveExpr"
col_logFC <- colorRamp2(c(min(log2FC),0,max(log2FC)), c("blue","white","red"))
col_AveExpr <- colorRamp2(c(quantile(mean)[1], quantile(mean)[4]), c("white","red"))
ha <- HeatmapAnnotation(summary=anno_summary(gp=gpar(fill=2),height=unit(2, "cm")))
# ----- Top DEGs in all KEGG metabolic pathways ------
metabo <- c()
for(i in 1:length(genesets2)){metabo <- c(metabo,genesets2[[i]])}
metabo <- unique(metabo)
temp <- c()
for(i in 1:length(metabo)){temp <- rbind(temp,filter(hm3, geneID==metabo[i]))}
metabolism <- c(t(unite(temp,"ENSSHAG_geneID", ENSSHAG:geneID, sep=" ")))
met <- subset(mx.scaled, rownames(mx.scaled) %in% metabolism)
metFC <- subset(log2FC, rownames(log2FC) %in% metabolism)
metAE <- subset(mean, rownames(mean) %in% metabolism)
# top 20 up & down
met40 <- rbind(met[1:20,],met[542:561,])
met40FC <- as.matrix(c(metFC[1:20],metFC[542:561,]))
rownames(met40FC) <- rownames(met40)
colnames(met40FC) <- "logFC"
met40AE <- as.matrix(c(metAE[1:20],metAE[542:561,]))
rownames(met40AE) <- rownames(met40)
colnames(met40AE) <- "AveExpr"
h1 <- Heatmap(met40, cluster_rows = F,
column_labels = colnames(met40), name="Z-score",
cluster_columns = T)
h2 <- Heatmap(met40FC, row_labels = rownames(met40FC), row_names_gp = gpar(fontsize = 6),
cluster_rows = F, name="logFC", top_annotation = ha, col = col_logFC,
cell_fun = function(j, i, x, y, w, h, col) { # add text to each grid
grid.text(round(as.numeric(met40FC[i, j],2)), x, y,gp=gpar(fontsize=6))
})
h3 <- Heatmap(met40AE, row_labels = rownames(met40AE), row_names_gp = gpar(fontsize = 6),
cluster_rows = F, name = "AveExpr", col=col_AveExpr,
cell_fun = function(j, i, x, y, w, h, col) { # add text to each grid
grid.text(round(as.numeric(met40AE[i, j],2)), x, y,gp=gpar(fontsize=6))},
column_title = "TOP 40 UP & DOWN - KEGG METABOLISM")
h <- h1 + h2 + h3
h
# ----- Top DEGs in all Reactome metabolic pathways ------
metabo <- c()
for(i in 1:length(genesets3)){metabo <- c(metabo,genesets3[[i]])}
metabo <- unique(metabo)
temp <- c()
for(i in 1:length(metabo)){temp <- rbind(temp,filter(hm3, geneID==metabo[i]))}
metabolism <- c(t(unite(temp,"ENSSHAG_geneID", ENSSHAG:geneID, sep=" ")))
met <- subset(mx.scaled, rownames(mx.scaled) %in% metabolism)
metFC <- subset(log2FC, rownames(log2FC) %in% metabolism)
metAE <- subset(mean, rownames(mean) %in% metabolism)
# top 20 up & down
met40 <- rbind(met[1:20,],met[784:803,])
met40FC <- as.matrix(c(metFC[1:20],metFC[784:803,]))
rownames(met40FC) <- rownames(met40)
colnames(met40FC) <- "logFC"
met40AE <- as.matrix(c(metAE[1:20],metAE[784:803,]))
rownames(met40AE) <- rownames(met40)
colnames(met40AE) <- "AveExpr"
h1 <- Heatmap(met40, cluster_rows = F,
column_labels = colnames(met40), name="Z-score",
cluster_columns = T)
h2 <- Heatmap(met40FC, row_labels = rownames(met40FC), row_names_gp = gpar(fontsize = 6),
cluster_rows = F, name="logFC", top_annotation = ha, col = col_logFC,
cell_fun = function(j, i, x, y, w, h, col) { # add text to each grid
grid.text(round(as.numeric(met40FC[i, j],2)), x, y,gp=gpar(fontsize=6))
})
h3 <- Heatmap(met40AE, row_labels = rownames(met40AE), row_names_gp = gpar(fontsize = 6),
cluster_rows = F, name = "AveExpr", col=col_AveExpr,
cell_fun = function(j, i, x, y, w, h, col) { # add text to each grid
grid.text(round(as.numeric(met40AE[i, j],2)), x, y,gp=gpar(fontsize=6))},
column_title = "TOP 40 UP & DOWN - REACTOME METABOLISM")
h <- h1 + h2 + h3
h
# ----- KEGG steroid biosynthesis -----
kegg <- genesets2[["KEGG_STEROID_BIOSYNTHESIS"]]
temp <- c()
for(i in 1:length(kegg)){temp <- rbind(temp,filter(hm3, geneID==kegg[i]))}
glycolysis <- c(t(unite(temp,"ENSSHAG_geneID", ENSSHAG:geneID, sep=" ")))
glyc <- subset(mx.scaled, rownames(mx.scaled) %in% glycolysis)
glycFC <- subset(log2FC, rownames(log2FC) %in% glycolysis)
glycAE <- subset(mean, rownames(mean) %in% glycolysis)
h1 <- Heatmap(glyc, cluster_rows = F,
column_labels = colnames(glyc), name="Z-score",
cluster_columns = T)
h2 <- Heatmap(glycFC, row_labels = rownames(glycFC), row_names_gp = gpar(fontsize = 6),
cluster_rows = F, name="logFC", top_annotation = ha, col = col_logFC,
cell_fun = function(j, i, x, y, w, h, col) { # add text to each grid
grid.text(round(as.numeric(glycFC[i, j],2)), x, y,gp=gpar(fontsize=6))
})
h3 <- Heatmap(glycAE, row_labels = rownames(glycAE), row_names_gp = gpar(fontsize = 6),
cluster_rows = F, name = "AveExpr", col=col_AveExpr,
cell_fun = function(j, i, x, y, w, h, col) { # add text to each grid
grid.text(round(as.numeric(glycAE[i, j],2)), x, y,gp=gpar(fontsize=6))},
column_title = "KEGG Steroid biosynthesis")
h <- h1 + h2 + h3
h
# ----- Reactome Cholesterol biosynthesis -----
cholesterol <- genesets3[["Cholesterol biosynthesis"]]
temp <- c()
for(i in 1:length(cholesterol)){temp <- rbind(temp,filter(hm3, geneID==cholesterol[i]))}
cholesterol <- c(t(unite(temp,"ENSSHAG_geneID", ENSSHAG:geneID, sep=" ")))
chol <- subset(mx.scaled, rownames(mx.scaled) %in% cholesterol)
cholFC <- subset(log2FC, rownames(log2FC) %in% cholesterol)
cholAE <- subset(mean, rownames(mean) %in% cholesterol)
h1 <- Heatmap(chol, cluster_rows = F,
column_labels = colnames(chol), name="Z-score",
cluster_columns = T)
h2 <- Heatmap(cholFC, row_labels = rownames(cholFC), row_names_gp = gpar(fontsize = 6),
cluster_rows = F, name="logFC", top_annotation = ha, col = col_logFC,
cell_fun = function(j, i, x, y, w, h, col) { # add text to each grid
grid.text(round(as.numeric(cholFC[i, j],2)), x, y,gp=gpar(fontsize=6))
})
h3 <- Heatmap(cholAE, row_labels = rownames(cholAE), row_names_gp = gpar(fontsize = 6),
cluster_rows = F, name = "AveExpr", col=col_AveExpr,
cell_fun = function(j, i, x, y, w, h, col) { # add text to each grid
grid.text(round(as.numeric(cholAE[i, j],2)), x, y,gp=gpar(fontsize=6))},
column_title = "Reactome Cholesterol biosynthesis")
h <- h1 + h2 + h3
h
# ----- Reactome Degradation of cysteine and homocysteine -----
cholesterol <- genesets3[["Degradation of cysteine and homocysteine"]]
temp <- c()
for(i in 1:length(cholesterol)){temp <- rbind(temp,filter(hm3, geneID==cholesterol[i]))}
cholesterol <- c(t(unite(temp,"ENSSHAG_geneID", ENSSHAG:geneID, sep=" ")))
chol <- subset(mx.scaled, rownames(mx.scaled) %in% cholesterol)
cholFC <- subset(log2FC, rownames(log2FC) %in% cholesterol)
cholAE <- subset(mean, rownames(mean) %in% cholesterol)
h1 <- Heatmap(chol, cluster_rows = F,
column_labels = colnames(chol), name="Z-score",
cluster_columns = T)
h2 <- Heatmap(cholFC, row_labels = rownames(cholFC), row_names_gp = gpar(fontsize = 6),
cluster_rows = F, name="logFC", top_annotation = ha, col = col_logFC,
cell_fun = function(j, i, x, y, w, h, col) { # add text to each grid
grid.text(round(as.numeric(cholFC[i, j],2)), x, y,gp=gpar(fontsize=6))
})
h3 <- Heatmap(cholAE, row_labels = rownames(cholAE), row_names_gp = gpar(fontsize = 6),
cluster_rows = F, name = "AveExpr", col=col_AveExpr,
cell_fun = function(j, i, x, y, w, h, col) { # add text to each grid
grid.text(round(as.numeric(cholAE[i, j],2)), x, y,gp=gpar(fontsize=6))},
column_title = "Reactome Degradation of cysteine and homocysteine")
h <- h1 + h2 + h3
h
Venn diagrams data of all DEGs, and DEGs linked to metabolism.
alldegs <- rownames(dge3sig)
updegs <- rownames(filter(dge3sig, log2FoldChange > 0))
downdegs <- rownames(filter(dge3sig, log2FoldChange < 0))
metdegs <- rownames(met)
venn_clines <- list(alldegs, updegs, downdegs, metdegs)
saveRDS(venn_clines, file = "venn_clines.rds")
# venn.diagram(
# x = list(alldegs, updegs, downdegs),
# category.names = c("All DEGs" , "Up", "Down"),
# filename = 'updown_venn_diagramm.png',
# output=TRUE
# )
Pathview to visualise metabolic pathways.
pathFC <- log2FC
rownames(pathFC) <- str_split(rownames(pathFC), " ", simplify=T)[,2]
# cholesterol
pathview(gene.data=pathFC, pathway.id="hsa04979", species="hsa",gene.idtype="SYMBOL", node.sum="sum",
limit=list(gene=c(-10,10), cpd=1), bins=list(gene=20, cpd=1),
low="blue",mid="white",high="red",
same.layer=FALSE)
## 'select()' returned 1:many mapping between keys and columns
## [1] "Note: 15 of 6882 unique input IDs unmapped."
## 'select()' returned 1:1 mapping between keys and columns
## Info: Working in directory /mnt/data/mdz/projects/dftd_RNAseq_annelise/dge
## Info: Writing image file hsa04979.pathview.png
# cysteine and methionine
pathview(gene.data=pathFC, pathway.id="hsa00270", species="hsa",gene.idtype="SYMBOL", node.sum="sum",
limit=list(gene=c(-10,10), cpd=1), bins=list(gene=20, cpd=1),
low="blue",mid="white",high="red",
same.layer=FALSE)
## 'select()' returned 1:many mapping between keys and columns
## [1] "Note: 15 of 6882 unique input IDs unmapped."
## 'select()' returned 1:1 mapping between keys and columns
## Info: Working in directory /mnt/data/mdz/projects/dftd_RNAseq_annelise/dge
## Info: Writing image file hsa00270.pathview.png
# steroid biosynthesis
pathview(gene.data=pathFC, pathway.id="hsa00100", species="hsa",gene.idtype="SYMBOL", node.sum="sum",
limit=list(gene=c(-10,10), cpd=1), bins=list(gene=20, cpd=1),
low="blue",mid="white",high="red",
same.layer=FALSE)
## 'select()' returned 1:many mapping between keys and columns
## [1] "Note: 15 of 6882 unique input IDs unmapped."
## 'select()' returned 1:1 mapping between keys and columns
## Info: Working in directory /mnt/data/mdz/projects/dftd_RNAseq_annelise/dge
## Info: Writing image file hsa00100.pathview.png
# library(SBGNview)
# input.pathways <- findPathways("Cholesterol biosynthesis")
# SBGNview.obj <- SBGNview(
# gene.data = pathFC,
# gene.id.type = "SYMBOL",
# input.sbgn = input.pathways[7,]$pathway.id,
# output.file = "quick.start",
# output.formats = c("png")
# )
# print(SBGNview.obj)
Run mitch on cell line pairwise comparisons.
mx <- vector(mode = "list", length = 9)
resx <- vector(mode = "list", length = 9)
for(i in 1:9){
mx[[i]] <- mitch_import(dgex[[i]], DEtype="deseq2",geneTable=gt)
resx[[i]] <- mitch_calc(mx[[i]], genesets3, priority="effect",cores=16)
if ( !file.exists(paste(plotnames[i],"_mitch.html")) ) {
mitch_report(resx[[i]], paste(plotnames[i],"_mitch.html"))
}
top <- subset(resx[[i]]$enrichment_result, p.adjustANOVA<0.05)
topup <- head(subset(top,s.dist>0),20)
topup %>%
kbl(caption=paste("Top 20 upregulated metabolic pathways: ", plotnames[i])) %>%
kable_paper("hover", full_width = F)
topdn <- head(subset(top,s.dist<0),20)
topdn %>%
kbl(caption=paste("Top 20 downregulated metabolic pathways: ", plotnames[i])) %>%
kable_paper("hover", full_width = F)
top2 <- rbind(head(topup,10),head(topdn,10))
top2 <- top2[order(-top2$s.dist),]
par(mar=c(5,15,5,2))
barplot(rev(abs(top2$s.dist)),horiz=TRUE,las=1,names.arg=rev(top2$set),
main=paste(plotnames[i]),
cex.names=0.6,xlab="Enrichment Score",
col=c(rep("blue",nrow(head(topdn,10))),rep("red",nrow(head(topup,10)))),
xlim=c(0,0.8))
}
## The input is a single dataframe; one contrast only. Converting
## it to a list for you.
## Note: Mean no. genes in input = 16009
## Note: no. genes in output = 12692
## Note: estimated proportion of input genes in output = 0.793
## Note: Enrichments with large effect sizes may not be
## statistically significant.
## The input is a single dataframe; one contrast only. Converting
## it to a list for you.
## Note: Mean no. genes in input = 16009
## Note: no. genes in output = 12691
## Note: estimated proportion of input genes in output = 0.793
## Note: Enrichments with large effect sizes may not be
## statistically significant.
## The input is a single dataframe; one contrast only. Converting
## it to a list for you.
## Note: Mean no. genes in input = 16009
## Note: no. genes in output = 12691
## Note: estimated proportion of input genes in output = 0.793
## Note: Enrichments with large effect sizes may not be
## statistically significant.
## The input is a single dataframe; one contrast only. Converting
## it to a list for you.
## Note: Mean no. genes in input = 16009
## Note: no. genes in output = 12691
## Note: estimated proportion of input genes in output = 0.793
## Note: Enrichments with large effect sizes may not be
## statistically significant.
## The input is a single dataframe; one contrast only. Converting
## it to a list for you.
## Note: Mean no. genes in input = 16009
## Note: no. genes in output = 12691
## Note: estimated proportion of input genes in output = 0.793
## Note: Enrichments with large effect sizes may not be
## statistically significant.
## The input is a single dataframe; one contrast only. Converting
## it to a list for you.
## Note: Mean no. genes in input = 16009
## Note: no. genes in output = 12690
## Note: estimated proportion of input genes in output = 0.793
## Note: Enrichments with large effect sizes may not be
## statistically significant.
## The input is a single dataframe; one contrast only. Converting
## it to a list for you.
## Note: Mean no. genes in input = 16009
## Note: no. genes in output = 12689
## Note: estimated proportion of input genes in output = 0.793
## Note: Enrichments with large effect sizes may not be
## statistically significant.
## The input is a single dataframe; one contrast only. Converting
## it to a list for you.
## Note: Mean no. genes in input = 16009
## Note: no. genes in output = 12691
## Note: estimated proportion of input genes in output = 0.793
## Note: Enrichments with large effect sizes may not be
## statistically significant.
## The input is a single dataframe; one contrast only. Converting
## it to a list for you.
## Note: Mean no. genes in input = 16009
## Note: no. genes in output = 12689
## Note: estimated proportion of input genes in output = 0.793
## Note: Enrichments with large effect sizes may not be
## statistically significant.
Make heatmaps for pairwise cell line comparisons.
# ----- Fatty acid metabolism heatmaps -----
# fatty1 <- genesets2[["REACTOME_FATTY_ACID_METABOLISM"]]
# fatty2 <- genesets2[["HALLMARK_FATTY_ACID_METABOLISM"]]
# fatty3 <- genesets2[["GOBP_FATTY_ACID_BIOSYNTHETIC_PROCESS"]]
# fatty4 <-genesets2[["KEGG_FATTY_ACID_METABOLISM"]]
# fatty <- list(fatty1=fatty1,fatty2=fatty2,fatty3=fatty3,fatty4=fatty4)
# title <- c("REACTOME_FATTY_ACID_METABOLISM","HALLMARK_FATTY_ACID_METABOLISM","GOBP_FATTY_ACID_BIOSYNTHETIC_PROCESS","KEGG_FATTY_ACID_METABOLISM")
#
# for(i in 1:length(fatty)){
# temp <- c()
# for(j in 1:length(fatty[[i]])){temp <- rbind(temp,filter(hm3, geneID==fatty[[i]][j]))}
# fattyx <- c(t(unite(temp,"ENSSHAG_geneID", ENSSHAG:geneID, sep=" ")))
# fat <- subset(mx.scaled, rownames(mx.scaled) %in% fattyx)
# fatFC <- subset(log2FC, rownames(log2FC) %in% fattyx)
# fatAE <- subset(mean, rownames(mean) %in% fattyx)
#
# h1 <- Heatmap(fat, cluster_rows = F,
# column_labels = colnames(fat), name="Z-score",
# cluster_columns = T)
# h2 <- Heatmap(fatFC, row_labels = rownames(fatFC), row_names_gp = gpar(fontsize = 6),
# cluster_rows = F, name="logFC", top_annotation = ha, col = col_logFC,
# cell_fun = function(j, i, x, y, w, h, col) { # add text to each grid
# grid.text(round(as.numeric(fatFC[i, j],2)), x, y, gp=gpar(fontsize=6))
# })
# h3 <- Heatmap(fatAE, row_labels = rownames(fatAE), row_names_gp = gpar(fontsize = 6),
# cluster_rows = F, name = "AveExpr", col=col_AveExpr,
# cell_fun = function(j, i, x, y, w, h, col) { # add text to each grid
# grid.text(round(as.numeric(fatAE[i, j],2)), x, y, gp=gpar(fontsize=6))},
# column_title=title[i])
#
# name <- paste("q",i,sep="")
# assign(name, h1 + h2 + h3)
# }
#
# q1
# q2
# q3
# q4
Pathview to visualise metabolic pathways.
For reproducibility.
sessionInfo()
## R version 4.3.1 (2023-06-16)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.3 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
##
## locale:
## [1] LC_CTYPE=en_AU.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_AU.UTF-8 LC_COLLATE=en_AU.UTF-8
## [5] LC_MONETARY=en_AU.UTF-8 LC_MESSAGES=en_AU.UTF-8
## [7] LC_PAPER=en_AU.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Australia/Melbourne
## tzcode source: system (glibc)
##
## attached base packages:
## [1] grid stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] magrittr_2.0.3 VennDiagram_1.7.3
## [3] futile.logger_1.4.3 stringr_1.5.0
## [5] pathview_1.40.0 circlize_0.4.15
## [7] RColorBrewer_1.1-3 ComplexHeatmap_2.16.0
## [9] tidyr_1.3.0 dplyr_1.1.3
## [11] kableExtra_1.3.4 limma_3.56.2
## [13] mitch_1.12.0 DESeq2_1.40.2
## [15] SummarizedExperiment_1.30.2 Biobase_2.60.0
## [17] MatrixGenerics_1.12.3 matrixStats_1.0.0
## [19] GenomicRanges_1.52.0 GenomeInfoDb_1.36.3
## [21] IRanges_2.34.1 S4Vectors_0.38.1
## [23] BiocGenerics_0.46.0 gplots_3.1.3
## [25] reshape2_1.4.4
##
## loaded via a namespace (and not attached):
## [1] rstudioapi_0.15.0 jsonlite_1.8.7 shape_1.4.6
## [4] magick_2.7.5 rmarkdown_2.24 GlobalOptions_0.1.2
## [7] zlibbioc_1.46.0 vctrs_0.6.3 Cairo_1.6-1
## [10] memoise_2.0.1 RCurl_1.98-1.12 webshot_0.5.5
## [13] htmltools_0.5.6 S4Arrays_1.0.6 lambda.r_1.2.4
## [16] sass_0.4.7 KernSmooth_2.23-22 bslib_0.5.1
## [19] htmlwidgets_1.6.2 plyr_1.8.8 echarts4r_0.4.5
## [22] futile.options_1.0.1 cachem_1.0.8 mime_0.12
## [25] lifecycle_1.0.3 iterators_1.0.14 pkgconfig_2.0.3
## [28] Matrix_1.6-1 R6_2.5.1 fastmap_1.1.1
## [31] GenomeInfoDbData_1.2.10 shiny_1.7.5 clue_0.3-64
## [34] digest_0.6.33 colorspace_2.1-0 GGally_2.1.2
## [37] reshape_0.8.9 AnnotationDbi_1.62.2 RSQLite_2.3.1
## [40] org.Hs.eg.db_3.17.0 fansi_1.0.4 httr_1.4.7
## [43] abind_1.4-5 compiler_4.3.1 withr_2.5.0
## [46] bit64_4.0.5 doParallel_1.0.17 BiocParallel_1.34.2
## [49] DBI_1.1.3 highr_0.10 MASS_7.3-60
## [52] DelayedArray_0.26.7 rjson_0.2.21 gtools_3.9.4
## [55] caTools_1.18.2 tools_4.3.1 beeswarm_0.4.0
## [58] httpuv_1.6.11 glue_1.6.2 promises_1.2.1
## [61] cluster_2.1.4 generics_0.1.3 gtable_0.3.4
## [64] xml2_1.3.5 utf8_1.2.3 XVector_0.40.0
## [67] foreach_1.5.2 pillar_1.9.0 later_1.3.1
## [70] lattice_0.21-8 bit_4.0.5 tidyselect_1.2.0
## [73] locfit_1.5-9.8 Biostrings_2.68.1 knitr_1.44
## [76] gridExtra_2.3 svglite_2.1.1 xfun_0.40
## [79] KEGGgraph_1.60.0 stringi_1.7.12 yaml_2.3.7
## [82] evaluate_0.21 codetools_0.2-19 tibble_3.2.1
## [85] Rgraphviz_2.44.0 graph_1.78.0 cli_3.6.1
## [88] xtable_1.8-4 systemfonts_1.0.4 munsell_0.5.0
## [91] jquerylib_0.1.4 Rcpp_1.0.11 png_0.1-8
## [94] XML_3.99-0.14 parallel_4.3.1 ellipsis_0.3.2
## [97] ggplot2_3.4.3 blob_1.2.4 bitops_1.0-7
## [100] viridisLite_0.4.2 scales_1.2.1 purrr_1.0.2
## [103] crayon_1.5.2 GetoptLong_1.0.5 rlang_1.1.1
## [106] KEGGREST_1.40.0 rvest_1.0.3 formatR_1.14