Source: https://github.com/markziemann/echs1_rnaseq

Introduction

The samples are labelled as CON (control) and KO for knockout, ut are the untreated samples and dNs are the treated samples (mito biogenesis stimulator).

suppressPackageStartupMessages({
    library("zoo")
    library("tidyverse")
    library("reshape2")
    library("DESeq2")
    library("gplots")
    library("fgsea")
    library("MASS")
    library("mitch")
    library("eulerr")
    library("limma")
    library("topconfects")
    library("kableExtra")
})

Import read counts

Importing RNA-seq data

tmp <- read.table("3col.tsv.gz",header=F)
x <- as.matrix(acast(tmp, V2~V1, value.var="V3", fun.aggregate = sum))
x <- as.data.frame(x)
accession <- sapply((strsplit(rownames(x),"\\|")),"[[",2)
symbol<-sapply((strsplit(rownames(x),"\\|")),"[[",6)
x$geneid <- paste(accession,symbol)
xx <- aggregate(. ~ geneid,x,sum)
rownames(xx) <- xx$geneid
xx$geneid = NULL
xx <- round(xx)
xx <- xx[,which(colnames(xx)!="test")]
xx[1:6,1:6]
##                             1_CON_ut_A_RNA_HB_HLF57DRXY_CTGTTGGTCC-AACGGTCTAT_L001
## ENSG00000000003.15 TSPAN6                                                      704
## ENSG00000000005.6 TNMD                                                           0
## ENSG00000000419.13 DPM1                                                        624
## ENSG00000000457.14 SCYL3                                                        97
## ENSG00000000460.17 C1orf112                                                    110
## ENSG00000000938.13 FGR                                                           0
##                             1_CON_ut_A_RNA_HB_HLF57DRXY_CTGTTGGTCC-AACGGTCTAT_L002
## ENSG00000000003.15 TSPAN6                                                      676
## ENSG00000000005.6 TNMD                                                           0
## ENSG00000000419.13 DPM1                                                        643
## ENSG00000000457.14 SCYL3                                                        99
## ENSG00000000460.17 C1orf112                                                    122
## ENSG00000000938.13 FGR                                                           0
##                             10_KO_ut_B_RNA_HB_HLF57DRXY_CACGGAACAA-GTGCTAGGTT_L001
## ENSG00000000003.15 TSPAN6                                                      752
## ENSG00000000005.6 TNMD                                                           0
## ENSG00000000419.13 DPM1                                                        360
## ENSG00000000457.14 SCYL3                                                       102
## ENSG00000000460.17 C1orf112                                                    110
## ENSG00000000938.13 FGR                                                           0
##                             10_KO_ut_B_RNA_HB_HLF57DRXY_CACGGAACAA-GTGCTAGGTT_L002
## ENSG00000000003.15 TSPAN6                                                      803
## ENSG00000000005.6 TNMD                                                           0
## ENSG00000000419.13 DPM1                                                        334
## ENSG00000000457.14 SCYL3                                                        75
## ENSG00000000460.17 C1orf112                                                    100
## ENSG00000000938.13 FGR                                                           0
##                             11_KO_ut_C_RNA_HB_HLF57DRXY_TGGAGTACTT-TCCACACAGA_L001
## ENSG00000000003.15 TSPAN6                                                     1502
## ENSG00000000005.6 TNMD                                                           0
## ENSG00000000419.13 DPM1                                                        703
## ENSG00000000457.14 SCYL3                                                       203
## ENSG00000000460.17 C1orf112                                                    175
## ENSG00000000938.13 FGR                                                           0
##                             11_KO_ut_C_RNA_HB_HLF57DRXY_TGGAGTACTT-TCCACACAGA_L002
## ENSG00000000003.15 TSPAN6                                                     1509
## ENSG00000000005.6 TNMD                                                           0
## ENSG00000000419.13 DPM1                                                        659
## ENSG00000000457.14 SCYL3                                                       184
## ENSG00000000460.17 C1orf112                                                    191
## ENSG00000000938.13 FGR                                                           0
dim(xx)
## [1] 60651    32

Fix the sample names.

They are duplicated for lane 1 and 2, which I will aggregate.

colnames(xx) <- sapply(strsplit(colnames(xx),"_RNA_"),"[[",1)
colnames(xx) <- sapply(strsplit(sub("_","@",colnames(xx)),"@"),"[[",2)
labels <- unique(sapply(strsplit(colnames(xx),"\\."),"[[",1))
l <- lapply(labels,function(x) { rowSums(xx[,grep(x,colnames(xx))]) } )
ll <- do.call(cbind,l)
colnames(ll) <- labels
ll <- as.data.frame(ll[,order(colnames(ll))])

Make a sample sheet. This will be easy because the groups are specified in the name.

ss <- as.data.frame(colnames(ll))
ss$ko <- factor(grepl("KO", colnames(ll))) 
ss$trt <- factor(grepl("dNs",colnames(ll)))
rownames(ss) <- ss[,1]
ss[,1]=NULL
# can also set some colours for later
ss$cols <- c(rep("gray",4),rep("lightblue",4),rep("pink",4),rep("orange",4))

ss %>% kbl(caption = "Sample sheet") %>% kable_paper("hover", full_width = F)
Sample sheet
ko trt cols
CON_dNs_A FALSE TRUE gray
CON_dNs_B FALSE TRUE gray
CON_dNs_C FALSE TRUE gray
CON_dNs_D FALSE TRUE gray
CON_ut_A FALSE FALSE lightblue
CON_ut_B FALSE FALSE lightblue
CON_ut_C FALSE FALSE lightblue
CON_ut_D FALSE FALSE lightblue
KO_dNs_A TRUE TRUE pink
KO_dNs_B TRUE TRUE pink
KO_dNs_C TRUE TRUE pink
KO_dNs_D TRUE TRUE pink
KO_ut_A TRUE FALSE orange
KO_ut_B TRUE FALSE orange
KO_ut_C TRUE FALSE orange
KO_ut_D TRUE FALSE orange

QC analysis

Here I’ll look at a few different quality control measures.

par(mar=c(5,8,3,1))
barplot(colSums(ll),horiz=TRUE,las=1,xlab="num reads",col=ss$cols)

sums <- colSums(ll)
sums <- sums[order(sums)]
barplot(sums,horiz=TRUE,las=1,xlab="num reads")
abline(v=15000000,col="red")