Intro

Here we are running an analysis of DIA based proteomics of control and VLCAD knock-out cells. Samples 1-4 are KO and 5-8 are control.

I have run fragpipe with LFQ pipeline.

Here I will do an exploratory data analysis.

This will include QC analysis, PCA, quantile normalisation and differential expression with limma.

library("pcaMethods") #BioC
## Loading required package: Biobase
## Loading required package: BiocGenerics
## Loading required package: generics
## 
## Attaching package: 'generics'
## The following objects are masked from 'package:base':
## 
##     as.difftime, as.factor, as.ordered, intersect, is.element, setdiff,
##     setequal, union
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, saveRDS, table, tapply, unique,
##     unsplit, which.max, which.min
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Attaching package: 'pcaMethods'
## The following object is masked from 'package:stats':
## 
##     loadings
library("limma")
## 
## Attaching package: 'limma'
## The following object is masked from 'package:BiocGenerics':
## 
##     plotMA
library("kableExtra")
library("gplots")
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
library("mitch")

Now I’ll read in the data.

# Load the data
x <- read.table("report.pg_matrix.tsv", header=TRUE, sep="\t")

# Clean column names
colnames(x) <- gsub("X.matt_proteomics.250711_MM_11262_","", colnames(x))
colnames(x) <- gsub("_uncalibrated.mzML", "", colnames(x))

# tidy cols
y <- x[,c("Protein.Group","Protein.Ids","Genes","1","2","3","4","5","6","7","8")]

# Set row names
rownames(y) <- paste(y$Protein.Ids, y$Genes, sep = "_")

# Extract intensity data
y <- y[,4:11]

# fix colnames
colnames(y) <- c("KO1","KO2","KO3","KO4","CTL1","CTL2","CTL3","CTL4")


# Display data dimensions and first few rows
cat("Data dimensions:", nrow(y), "proteins x", ncol(y), "samples\n")
## Data dimensions: 7899 proteins x 8 samples
cat("FIRST 6 ROWS OF INTENSITY DATA\n")
## FIRST 6 ROWS OF INTENSITY DATA
head(y) %>%
  kbl(caption="Top few rows of the dataset") %>%
  kable_paper("hover", full_width = F)
Top few rows of the dataset
KO1 KO2 KO3 KO4 CTL1 CTL2 CTL3 CTL4
A0A075B6K5_IGLV3-9 694323 2083250 NA NA NA NA NA NA
A0A075B6P5_IGKV2-28 3447730 11593300 NA NA NA NA NA NA
A0A075B6R9_IGKV2D-24 996053 1531320 NA NA NA NA NA NA
A0A075B6S5_IGKV1-27 622822 2574890 NA NA NA NA NA NA
A0A096LNW5_NOTCH2NLR 396606 310606 455893 606542 627783 374798 554069 355685
A0A096LP01_SMIM26 6361420 2306680 1930910 2496140 6221610 1555130 1960550 1772430

Quality control

Now we can run an analysis of missing values.

total_values <- nrow(y) * ncol(y)
total_missing <- sum(is.na(y))
missing_percentage <- (total_missing / total_values) * 100

cat("\nOVERALL MISSING DATA SUMMARY\n")
## 
## OVERALL MISSING DATA SUMMARY
cat("Total values:", total_values, "\n")
## Total values: 63192
cat("Total missing:", total_missing, "\n")
## Total missing: 1716
cat("Overall missing percentage:", round(missing_percentage, 2), "%\n")
## Overall missing percentage: 2.72 %
# Missing data per sample
missing_per_sample <- colSums(is.na(y))
missing_per_sample_pct <- (missing_per_sample / nrow(y)) * 100

missing_summary <- data.frame(
  Sample = names(missing_per_sample),
  Missing_Count = missing_per_sample,
  Missing_Percent = round(missing_per_sample_pct, 2)
)

cat("\nMISSING DATA PER SAMPLE\n")
## 
## MISSING DATA PER SAMPLE
print(missing_summary)
##      Sample Missing_Count Missing_Percent
## KO1     KO1           224            2.84
## KO2     KO2           139            1.76
## KO3     KO3           209            2.65
## KO4     KO4           253            3.20
## CTL1   CTL1           224            2.84
## CTL2   CTL2           190            2.41
## CTL3   CTL3           201            2.54
## CTL4   CTL4           276            3.49
# by protein
nas <- apply(X = y, MARGIN = 1, function(x){ 
  length(which(is.na(x))) 
})
missing_proteins_table <- table(nas)

cat("\nDATA DISTRIBUTION ACROSS PROTEINS\n")
## 
## DATA DISTRIBUTION ACROSS PROTEINS
cat("Number of proteins by missing value count:\n")
## Number of proteins by missing value count:
print(missing_proteins_table)
## nas
##    0    1    2    3    4    5    6    7 
## 7349  189   94   59   55   39   51   63
cat("\nSummary of missing values per protein:\n")
## 
## Summary of missing values per protein:
print(summary(nas))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.2172  0.0000  7.0000

Overall 2.7% missing values which is very good.

Now remove proteins with more than 4 missing values.

table(nas >4)
## 
## FALSE  TRUE 
##  7746   153
nrow(y[nas <5,])
## [1] 7746
y2 <- y[nas <5,]

dim(y2)
## [1] 7746    8
dim(y)
## [1] 7899    8
summary(y2)
##       KO1                 KO2                 KO3           
##  Min.   :1.715e+04   Min.   :1.657e+04   Min.   :1.170e+04  
##  1st Qu.:2.533e+06   1st Qu.:2.657e+06   1st Qu.:2.639e+06  
##  Median :7.094e+06   Median :7.440e+06   Median :7.442e+06  
##  Mean   :3.505e+07   Mean   :3.504e+07   Mean   :3.494e+07  
##  3rd Qu.:2.108e+07   3rd Qu.:2.070e+07   3rd Qu.:2.180e+07  
##  Max.   :5.253e+09   Max.   :1.298e+10   Max.   :4.627e+09  
##  NA's   :117         NA's   :79          NA's   :71         
##       KO4                 CTL1                CTL2          
##  Min.   :1.426e+04   Min.   :1.784e+04   Min.   :1.841e+04  
##  1st Qu.:2.497e+06   1st Qu.:2.517e+06   1st Qu.:2.626e+06  
##  Median :7.007e+06   Median :7.125e+06   Median :7.238e+06  
##  Mean   :3.382e+07   Mean   :3.523e+07   Mean   :3.173e+07  
##  3rd Qu.:2.068e+07   3rd Qu.:2.059e+07   3rd Qu.:1.986e+07  
##  Max.   :3.798e+09   Max.   :5.020e+09   Max.   :5.979e+09  
##  NA's   :125         NA's   :88          NA's   :70         
##       CTL3                CTL4          
##  Min.   :1.576e+04   Min.   :4.673e+03  
##  1st Qu.:2.532e+06   1st Qu.:2.304e+06  
##  Median :7.153e+06   Median :6.602e+06  
##  Mean   :3.223e+07   Mean   :3.193e+07  
##  3rd Qu.:2.083e+07   3rd Qu.:1.954e+07  
##  Max.   :3.392e+09   Max.   :3.772e+09  
##  NA's   :74          NA's   :150
apply(y2,2,median,na.rm=TRUE)
##     KO1     KO2     KO3     KO4    CTL1    CTL2    CTL3    CTL4 
## 7093900 7440070 7442300 7007090 7124725 7238305 7152880 6601615
apply(y2,2,mean,na.rm=TRUE)
##      KO1      KO2      KO3      KO4     CTL1     CTL2     CTL3     CTL4 
## 35051253 35037385 34939285 33822843 35230167 31734946 32229620 31926156

Intensities are fairly even.

Data Visualization - Raw Distribution

Boxplot of raw and log-transformed abundances

# Boxplot of raw abundances
par(mfrow = c(1, 1), mar = c(5, 4, 3, 1))

# Raw scale boxplot
boxplot(y2, las = 2, main = "Raw Abundance Distribution",
        ylab = "Abundance", cex.axis = 0.6, col = rainbow(ncol(y2)))

# Log-transformed boxplot for better visualization
y_log <- log10(y2 + 1)  # Add 1 to handle zeros if any
boxplot(y_log, las = 2, main = "Log10(Abundance + 1) Distribution",
        ylab = "Log10(Abundance + 1)", cex.axis = 0.6, col = rainbow(ncol(y2)))

Protein Abundance Analysis

# Protein Abundance Analysis
# Calculate protein statistics
protein_stats <- data.frame(
  Protein = rownames(y),
  Mean_Abundance = rowMeans(y, na.rm = TRUE),
  Median_Abundance = apply(y, 1, median, na.rm = TRUE),
  SD_Abundance = apply(y, 1, sd, na.rm = TRUE),
  CV = apply(y, 1, sd, na.rm = TRUE) / rowMeans(y, na.rm = TRUE),
  Missing_Count = nas
)

cat("\nPROTEIN ABUNDANCE SUMMARY\n")
## 
## PROTEIN ABUNDANCE SUMMARY
cat("Mean abundance across all proteins:\n")
## Mean abundance across all proteins:
print(summary(protein_stats$Mean_Abundance))
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 1.907e+04 2.490e+06 7.040e+06 3.283e+07 2.059e+07 3.710e+09
cat("\nCoefficient of Variation (CV) summary:\n")
## 
## Coefficient of Variation (CV) summary:
print(summary(protein_stats$CV))
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
## 0.007651 0.170949 0.236206 0.275446 0.333498 2.761049       63
# Top 10 most abundant proteins
high_abundance <- head(protein_stats[order(protein_stats$Mean_Abundance, decreasing = TRUE), ], 10)

cat("\nTOP 10 MOST ABUNDANT PROTEINS\n")
## 
## TOP 10 MOST ABUNDANT PROTEINS
print(high_abundance[, c("Protein", "Mean_Abundance", "Missing_Count")])
##                       Protein Mean_Abundance Missing_Count
## P05141_SLC25A5 P05141_SLC25A5     3710495000             0
## P06576_ATP5F1B P06576_ATP5F1B     3405280000             0
## P62805_H4C1       P62805_H4C1     3382420000             0
## P60709_ACTB       P60709_ACTB     3335985000             0
## P25705_ATP5F1A P25705_ATP5F1A     3031916250             0
## P04406_GAPDH     P04406_GAPDH     2599235000             0
## Q00325_SLC25A3 Q00325_SLC25A3     2422585000             0
## P06899_H2BC11   P06899_H2BC11     2301392125             0
## P10809_HSPD1     P10809_HSPD1     2273151375             0
## P62979_RPS27A   P62979_RPS27A     2201783750             0

Top 10 most variable proteins by CV

high_variable <- head(protein_stats[order(protein_stats$CV, decreasing = TRUE), ], 10)

cat("\nTOP 10 MOST VARIABLE PROTEINS (by Coefficient of Variation)\n")
## 
## TOP 10 MOST VARIABLE PROTEINS (by Coefficient of Variation)
#Round only the numeric columns, keep Protein column as-is
high_variable_display <- high_variable[, c("Protein", "Mean_Abundance", "CV", "Missing_Count")]
high_variable_display[, c("Mean_Abundance", "CV")] <- round(high_variable_display[, c("Mean_Abundance", "CV")], 3)
high_variable_display[, "Missing_Count"] <- round(high_variable_display[, "Missing_Count"], 0)  # Missing count should be integer
print(high_variable_display)
##                         Protein Mean_Abundance    CV Missing_Count
## P02538_KRT6A       P02538_KRT6A      211717589 2.761             0
## P08779_KRT16       P08779_KRT16      121250752 2.710             0
## P04259_KRT6B       P04259_KRT6B     1685411300 2.709             0
## P06702_S100A9     P06702_S100A9       83771732 2.696             0
## P31151_S100A7     P31151_S100A7       44475176 2.626             0
## P02533_KRT14       P02533_KRT14      338393075 2.588             0
## P48594_SERPINB4 P48594_SERPINB4       32009080 2.579             0
## Q04695_KRT17       Q04695_KRT17      206012900 2.520             0
## Q13835_PKP1         Q13835_PKP1        4272266 2.423             0
## Q08188_TGM3         Q08188_TGM3       22738735 2.415             0

Looks like keratin is highly variable could be some contamination going on.

Visualization of CV distribution

hist(protein_stats$CV[!is.infinite(protein_stats$CV)], 
     breaks = 50, main = "Distribution of Coefficient of Variation", 
     xlab = "CV", ylab = "Frequency", 
     col = "lightblue", border = "darkblue")

Now look at the sample correlation matrix.

cor_matrix <- cor(y2, use = "complete.obs")

cat("\nSAMPLE CORRELATION MATRIX\n")
## 
## SAMPLE CORRELATION MATRIX
print(round(cor_matrix, 3))
##        KO1   KO2   KO3   KO4  CTL1  CTL2  CTL3  CTL4
## KO1  1.000 0.616 0.904 0.909 0.985 0.789 0.947 0.942
## KO2  0.616 1.000 0.676 0.626 0.631 0.678 0.646 0.590
## KO3  0.904 0.676 1.000 0.973 0.914 0.900 0.971 0.929
## KO4  0.909 0.626 0.973 1.000 0.907 0.807 0.967 0.962
## CTL1 0.985 0.631 0.914 0.907 1.000 0.845 0.961 0.937
## CTL2 0.789 0.678 0.900 0.807 0.845 1.000 0.863 0.754
## CTL3 0.947 0.646 0.971 0.967 0.961 0.863 1.000 0.978
## CTL4 0.942 0.590 0.929 0.962 0.937 0.754 0.978 1.000
# Identify sample groups
chrom_samples <- grepl("Chrom", colnames(y))
sol_samples <- grepl("Sol", colnames(y))

# Calculate group-wise correlations
chrom_cor <- cor_matrix[chrom_samples, chrom_samples]
sol_cor <- cor_matrix[sol_samples, sol_samples]
between_cor <- cor_matrix[chrom_samples, sol_samples]

cat("\nCORRELATION SUMMARY\n")
## 
## CORRELATION SUMMARY
cat("Average within-Chromatin correlation:", round(mean(chrom_cor[upper.tri(chrom_cor)]), 3), "\n")
## Average within-Chromatin correlation: NaN
cat("Average within-Soluble correlation:", round(mean(sol_cor[upper.tri(sol_cor)]), 3), "\n")
## Average within-Soluble correlation: NaN
cat("Average between-condition correlation:", round(mean(between_cor), 3), "\n")
## Average between-condition correlation: NaN
cor_matrix
##            KO1       KO2       KO3       KO4      CTL1      CTL2      CTL3
## KO1  1.0000000 0.6156414 0.9042915 0.9090907 0.9854008 0.7893350 0.9466495
## KO2  0.6156414 1.0000000 0.6761849 0.6262485 0.6308259 0.6779291 0.6457640
## KO3  0.9042915 0.6761849 1.0000000 0.9729884 0.9143130 0.9001092 0.9714041
## KO4  0.9090907 0.6262485 0.9729884 1.0000000 0.9072548 0.8071333 0.9670573
## CTL1 0.9854008 0.6308259 0.9143130 0.9072548 1.0000000 0.8448395 0.9605245
## CTL2 0.7893350 0.6779291 0.9001092 0.8071333 0.8448395 1.0000000 0.8628217
## CTL3 0.9466495 0.6457640 0.9714041 0.9670573 0.9605245 0.8628217 1.0000000
## CTL4 0.9415118 0.5895684 0.9285356 0.9615625 0.9374458 0.7537676 0.9778751
##           CTL4
## KO1  0.9415118
## KO2  0.5895684
## KO3  0.9285356
## KO4  0.9615625
## CTL1 0.9374458
## CTL2 0.7537676
## CTL3 0.9778751
## CTL4 1.0000000
heatmap.2(cor_matrix,trace="none")

Looks like KO2 could be an outlier.

heatmap.2(as.matrix(y2[grep("KRT",rownames(y2)),]),trace="none",scale="row",margin=c(5,10))

The heatmap shows KO2 is contaminated with keratin. Suggest excluding it from downstream analysis.

Repeat analysis with y_log

cor_matrix <- cor(y_log, use = "complete.obs")

cat("\nSAMPLE CORRELATION MATRIX\n")
## 
## SAMPLE CORRELATION MATRIX
print(round(cor_matrix, 3))
##        KO1   KO2   KO3   KO4  CTL1  CTL2  CTL3  CTL4
## KO1  1.000 0.955 0.973 0.964 0.987 0.948 0.969 0.961
## KO2  0.955 1.000 0.970 0.946 0.944 0.968 0.963 0.933
## KO3  0.973 0.970 1.000 0.983 0.966 0.964 0.989 0.975
## KO4  0.964 0.946 0.983 1.000 0.958 0.932 0.976 0.984
## CTL1 0.987 0.944 0.966 0.958 1.000 0.960 0.976 0.964
## CTL2 0.948 0.968 0.964 0.932 0.960 1.000 0.973 0.931
## CTL3 0.969 0.963 0.989 0.976 0.976 0.973 1.000 0.983
## CTL4 0.961 0.933 0.975 0.984 0.964 0.931 0.983 1.000
# Identify sample groups
chrom_samples <- grepl("Chrom", colnames(y))
sol_samples <- grepl("Sol", colnames(y))

# Calculate group-wise correlations
chrom_cor <- cor_matrix[chrom_samples, chrom_samples]
sol_cor <- cor_matrix[sol_samples, sol_samples]
between_cor <- cor_matrix[chrom_samples, sol_samples]

cat("\nCORRELATION SUMMARY\n")
## 
## CORRELATION SUMMARY
cat("Average within-Chromatin correlation:", round(mean(chrom_cor[upper.tri(chrom_cor)]), 3), "\n")
## Average within-Chromatin correlation: NaN
cat("Average within-Soluble correlation:", round(mean(sol_cor[upper.tri(sol_cor)]), 3), "\n")
## Average within-Soluble correlation: NaN
cat("Average between-condition correlation:", round(mean(between_cor), 3), "\n")
## Average between-condition correlation: NaN
cor_matrix
##            KO1       KO2       KO3       KO4      CTL1      CTL2      CTL3
## KO1  1.0000000 0.9550537 0.9728069 0.9644101 0.9867622 0.9482181 0.9693253
## KO2  0.9550537 1.0000000 0.9703504 0.9456029 0.9441793 0.9676906 0.9625101
## KO3  0.9728069 0.9703504 1.0000000 0.9830744 0.9659350 0.9640514 0.9893335
## KO4  0.9644101 0.9456029 0.9830744 1.0000000 0.9584659 0.9319293 0.9762458
## CTL1 0.9867622 0.9441793 0.9659350 0.9584659 1.0000000 0.9598686 0.9759373
## CTL2 0.9482181 0.9676906 0.9640514 0.9319293 0.9598686 1.0000000 0.9729682
## CTL3 0.9693253 0.9625101 0.9893335 0.9762458 0.9759373 0.9729682 1.0000000
## CTL4 0.9611229 0.9334854 0.9747769 0.9840200 0.9639182 0.9311667 0.9827680
##           CTL4
## KO1  0.9611229
## KO2  0.9334854
## KO3  0.9747769
## KO4  0.9840200
## CTL1 0.9639182
## CTL2 0.9311667
## CTL3 0.9827680
## CTL4 1.0000000
heatmap.2(cor_matrix,trace="none")

After log transformation, the contamination is not so obvious.

Data normalisation

Handle remaining missing values in filtered data

for(i in 1:ncol(y2)) {
  if(any(is.na(y2[,i]))) {
    min_val <- min(y2[,i], na.rm = TRUE)
    y2[is.na(y2[,i]), i] <- min_val * 0.1
  }
}

cat("Missing values after imputation:", sum(is.na(y2)), "\n")
## Missing values after imputation: 0
#Log10 transformation
yl <- log10(y2 + 1)
cat("Log10 transformation completed\n")
## Log10 transformation completed
#Between-array normalization
yn <- normalizeBetweenArrays(yl, method = "quantile", targets = NULL, cyclic.method = "fast")
cat("Between-array normalization completed\n")
## Between-array normalization completed
cat("Final normalized data dimensions:", nrow(yn), "x", ncol(yn), "\n")
## Final normalized data dimensions: 7746 x 8

NORMALIZATION VALIDATION

Before and after comparison

cat("Raw data range:", round(range(y2), 0), "\n")
## Raw data range: 467 12982700000
cat("Normalized data range:", round(range(yn), 2), "\n")
## Normalized data range: 3.13 9.71
# Sample median variability
raw_medians <- apply(y2, 2, median)
norm_medians <- apply(yn, 2, median)

cat("CV of sample medians (raw):", round(sd(raw_medians)/mean(raw_medians), 3), "\n")
## CV of sample medians (raw): 0.045
cat("CV of sample medians (normalized):", round(sd(norm_medians)/mean(norm_medians), 3), "\n")
## CV of sample medians (normalized): 0
# Correlation preservation
cor_raw <- cor(y2)
cor_norm <- cor(yn)

ko_idx <- grepl("KO", colnames(yn))
ctl_idx <- grepl("CTL", colnames(yn))

cat("Within-Chromatin correlation (normalized):", 
    round(mean(cor_norm[ko_idx, ko_idx][upper.tri(cor_norm[ko_idx, ko_idx])]), 3), "\n")
## Within-Chromatin correlation (normalized): 0.858
cat("Within-Soluble correlation (normalized):", 
    round(mean(cor_norm[ctl_idx, ctl_idx][upper.tri(cor_norm[ctl_idx, ctl_idx])]), 3), "\n")
## Within-Soluble correlation (normalized): 0.874

VISUALIZATION COMPARISON

par(mfrow = c(2, 2), mar = c(6, 4, 3, 1))

# Raw data
boxplot(log10(y2 + 1), las = 2, main = "Raw Data (Log10 scale)",
        cex.axis = 0.6, col = rainbow(ncol(y2)))

# Log10 transformed
boxplot(yl, las = 2, main = "Log10 Transformed",
        cex.axis = 0.6, col = rainbow(ncol(yl)))

# Final normalized(quantile)
boxplot(yn, las = 2, main = "Final Normalized",
        cex.axis = 0.6, col = rainbow(ncol(yn)))

par(mfrow = c(1, 1))

Looks OK.

Sample completeness

sample_completeness <- (1 - missing_per_sample_pct/100) * 100
cat("\nSample completeness (%):\n")
## 
## Sample completeness (%):
for(i in 1:length(sample_completeness)) {
  cat(names(sample_completeness)[i], ":", round(sample_completeness[i], 1), "%\n")
}
## KO1 : 97.2 %
## KO2 : 98.2 %
## KO3 : 97.4 %
## KO4 : 96.8 %
## CTL1 : 97.2 %
## CTL2 : 97.6 %
## CTL3 : 97.5 %
## CTL4 : 96.5 %

PCA Analysis

Use filtered data (z) with fewer missing values Transpose data so samples are rows (pcaMethods expects this format)

Perform PPCA (handles missing values)

pc <- pca(t(y2), nPcs = 3, method = "ppca")

print(pc)
## ppca calculated PCA
## Importance of component(s):
##                  PC1    PC2    PC3
## R2            0.6516 0.1821 0.1125
## Cumulative R2 0.6516 0.8337 0.9463
## 7746     Variables
## 8    Samples
## 0    NAs ( 0 %)
## 3    Calculated component(s)
## Data was mean centered before running PCA 
## Data was NOT scaled before running PCA 
## Scores structure:
## [1] 8 3
## Loadings structure:
## [1] 7746    3
cat("\nPCA Summary:\n")
## 
## PCA Summary:
cat("Number of components:", pc@nPcs, "\n")
## Number of components: 3
cat("Method used:", pc@method, "\n")
## Method used: ppca
cat("Variance explained by each PC:\n")
## Variance explained by each PC:
print(round(pc@R2, 3))
## [1] 0.652 0.182 0.113
cat("\nCumulative variance explained:\n")
## 
## Cumulative variance explained:
print(round(pc@R2cum, 3))
## [1] 0.652 0.834 0.946
# Scores plot (samples)
par(mar = c(5,4,3,1))
slplot(pc, main = "PCA Scores Plot", cex = 0.8, cex.axis = 0.8)

# Manual scores plot with sample group colors
scores <- pc@scores
chrom_color <- ifelse(grepl("KO", rownames(scores)), "red", "blue")

plot(scores[,1], scores[,2], col = chrom_color, pch = 19, cex = 1.5,
     main = "PCA Scores - PC1 vs PC2",
     xlab = paste0("PC1 (", round(pc@R2[1]*100, 1), "% variance)"),
     ylab = paste0("PC2 (", round(pc@R2[2]*100, 1), "% variance)"))

# Add sample labels
text(scores[,1], scores[,2], labels = rownames(scores), pos = 3, cex = 0.7)

# Add legend
legend("topright", legend = c("KO", "CTL"),
       col = c("red", "blue"), pch = 19, cex = 0.8)

Now exclude KO2

y3 <- y2[,grep("KO2",colnames(y2),invert=TRUE)]

pc <- pca(t(y3), nPcs = 3, method = "ppca")

print(pc)
## ppca calculated PCA
## Importance of component(s):
##                  PC1    PC2    PC3
## R2            0.5476 0.3058 0.1038
## Cumulative R2 0.5476 0.8534 0.9572
## 7746     Variables
## 7    Samples
## 0    NAs ( 0 %)
## 3    Calculated component(s)
## Data was mean centered before running PCA 
## Data was NOT scaled before running PCA 
## Scores structure:
## [1] 7 3
## Loadings structure:
## [1] 7746    3
cat("\nPCA Summary:\n")
## 
## PCA Summary:
cat("Number of components:", pc@nPcs, "\n")
## Number of components: 3
cat("Method used:", pc@method, "\n")
## Method used: ppca
cat("Variance explained by each PC:\n")
## Variance explained by each PC:
print(round(pc@R2, 3))
## [1] 0.548 0.306 0.104
cat("\nCumulative variance explained:\n")
## 
## Cumulative variance explained:
print(round(pc@R2cum, 3))
## [1] 0.548 0.853 0.957
# Scores plot (samples)
par(mar = c(5,4,3,1))
slplot(pc, main = "PCA Scores Plot", cex = 0.8, cex.axis = 0.8)

# Manual scores plot with sample group colors
scores <- pc@scores
chrom_color <- ifelse(grepl("KO", rownames(scores)), "red", "blue")

plot(scores[,1], scores[,2], col = chrom_color, pch = 19, cex = 1.5,
     main = "PCA Scores - PC1 vs PC2",
     xlab = paste0("PC1 (", round(pc@R2[1]*100, 1), "% variance)"),
     ylab = paste0("PC2 (", round(pc@R2[2]*100, 1), "% variance)"))

# Add sample labels
text(scores[,1], scores[,2], labels = rownames(scores), pos = 3, cex = 0.7)

# Add legend
legend("topleft", legend = c("KO", "CTL"),
       col = c("red", "blue"), pch = 19, cex = 0.8)

Normalisation part 2

Now do normalisation on the data after removing KO2.

#Log10 transformation
yl <- log10(y3 + 1)
cat("Log10 transformation completed\n")
## Log10 transformation completed
#Between-array normalization
yn <- normalizeBetweenArrays(yl, method = "quantile", targets = NULL, cyclic.method = "fast")
cat("Between-array normalization completed\n")
## Between-array normalization completed
cat("Final normalized data dimensions:", nrow(yn), "x", ncol(yn), "\n")
## Final normalized data dimensions: 7746 x 7

Differential analysis

Make a sample sheet.

ss <- as.data.frame(colnames(yn))
ss$ko <- grepl("KO",ss[,1])
rownames(ss) <- ss[,1]
ss[,1]=NULL

ss %>%
  kbl(caption="Top few rows of the dataset") %>%
  kable_paper("hover", full_width = F)
Top few rows of the dataset
ko
KO1 TRUE
KO3 TRUE
KO4 TRUE
CTL1 FALSE
CTL2 FALSE
CTL3 FALSE
CTL4 FALSE

Design matrix.

design <- model.matrix(~ ko, data = ss)

print(design)
##      (Intercept) koTRUE
## KO1            1      1
## KO3            1      1
## KO4            1      1
## CTL1           1      0
## CTL2           1      0
## CTL3           1      0
## CTL4           1      0
## attr(,"assign")
## [1] 0 1
## attr(,"contrasts")
## attr(,"contrasts")$ko
## [1] "contr.treatment"

Fit linear models and apply empirical Bayes moderation

fit <- lmFit(yn, design)
fit <- eBayes(fit)

# Summary of differential expression
de_summary <- summary(decideTests(fit))

de_summary
##        (Intercept) koTRUE
## Down             0      4
## NotSig           0   7740
## Up            7746      2

Looking at the DE results

results <- topTable(fit, coef = 2, number = Inf)

# Calculate key statistics
total_proteins <- nrow(results)
significant_proteins <- sum(results$adj.P.Val < 0.05)
sig_rate <- round((significant_proteins / total_proteins) * 100, 1)

# Display most significant results
head(results, 20) %>%
  kbl(caption="Top 20 DE proteins") %>%
  kable_paper("hover", full_width = F)
Top 20 DE proteins
logFC AveExpr t P.Value adj.P.Val B
Q16720_ATP2B3 -3.9979017 5.404929 -79.215082 0.0000000 0.0000002 7.2817783
Q6UWL6_KIRREL2 -2.4877394 4.541980 -28.165804 0.0000000 0.0000659 6.4597870
P54803_GALC -2.9248379 4.791750 -27.912631 0.0000000 0.0000659 6.4444630
Q7Z6P3_RAB44 -3.1937750 4.945428 -23.722454 0.0000001 0.0001493 6.1299787
P19827_ITIH1 3.2713868 4.661376 22.070456 0.0000001 0.0001949 5.9653324
Q15022_SUZ12 1.9898705 4.147861 14.184665 0.0000025 0.0031751 4.5748017
P09471_GNAO1 -0.3278428 7.347893 -8.977974 0.0000490 0.0541705 2.4665274
P49748_ACADVL -0.9316407 8.094585 -8.495590 0.0000695 0.0673340 2.1772248
Q96QV1_HHIP 1.7861241 4.461318 7.294129 0.0001804 0.1552317 1.3553864
Q8N111_CEND1 1.0056142 5.827519 7.161533 0.0002019 0.1559058 1.2548389
Q5UCC4_EMC10 -0.5261417 6.917398 -7.054691 0.0002214 0.1559058 1.1722738
Q9H019_MTFR1L 0.2671044 7.184493 6.539372 0.0003508 0.2264695 0.7538082
Q53GS7_GLE1 0.2618606 6.697547 6.277462 0.0004481 0.2639324 0.5276234
P20138_CD33 0.5269739 5.968220 6.211754 0.0004770 0.2639324 0.4693845
Q9NZU0_FLRT3 0.3401387 6.879011 6.127510 0.0005172 0.2671036 0.3938203
P28676_GCA 0.3257526 6.680174 5.712750 0.0007799 0.3491965 0.0066896
Q9Y624_F11R -0.2782516 7.384211 -5.639829 0.0008401 0.3491965 -0.0640430
Q8NFH5_NUP35 0.2915181 7.271447 5.608404 0.0008676 0.3491965 -0.0947765
Q14435_GALNT3 -0.3441410 6.568961 -5.575920 0.0008971 0.3491965 -0.1267058
Q49AM1_MTERF2 1.8027949 4.354474 5.545081 0.0009262 0.3491965 -0.1571711
upregulated <- results[results$logFC > 0 & results$adj.P.Val < 0.05, ]    # KO-enriched
downregulated <- results[results$logFC < 0 & results$adj.P.Val < 0.05, ]  # CTL-enriched

upregulated %>%
  kbl(caption="Upregulated proteins") %>%
  kable_paper("hover", full_width = F)
Upregulated proteins
logFC AveExpr t P.Value adj.P.Val B
P19827_ITIH1 3.271387 4.661376 22.07046 1.0e-07 0.0001949 5.965332
Q15022_SUZ12 1.989871 4.147861 14.18467 2.5e-06 0.0031751 4.574802
downregulated %>%
  kbl(caption="Downregulated proteins") %>%
  kable_paper("hover", full_width = F)
Downregulated proteins
logFC AveExpr t P.Value adj.P.Val B
Q16720_ATP2B3 -3.997902 5.404929 -79.21508 0e+00 0.0000002 7.281778
Q6UWL6_KIRREL2 -2.487739 4.541980 -28.16580 0e+00 0.0000659 6.459787
P54803_GALC -2.924838 4.791750 -27.91263 0e+00 0.0000659 6.444463
Q7Z6P3_RAB44 -3.193775 4.945428 -23.72245 1e-07 0.0001493 6.129979
sig_up <- nrow(upregulated)
sig_down <- nrow(downregulated)
not_sig <- total_proteins - sig_up - sig_down

# Create classification summary
classification_summary <- data.frame(
  Category = c("Chromatin-enriched", "Soluble-enriched", "Not Significant"),
  Count = c(sig_up, sig_down, not_sig),
  Percentage = round(c(sig_up, sig_down, not_sig) / total_proteins * 100, 1)
)

classification_summary %>%
  kbl(caption="DE summary") %>%
  kable_paper("hover", full_width = F)
DE summary
Category Count Percentage
Chromatin-enriched 2 0.0
Soluble-enriched 4 0.1
Not Significant 7740 99.9

Volcano plot.

sig <- subset(results,adj.P.Val < 0.05)

plot(results$logFC,-log10(results$P.Value),
  pch=19,cex=0.6,col="gray",bty="n",
  xlab="log2 fold change",
  ylab="-log10(p-value)")

points(sig$logFC,-log10(sig$P.Value),pch=19,cex=0.65,col="red")

protnames <- sapply(strsplit(rownames(sig),"_"),"[[",2)
text(sig$logFC,-log10(sig$P.Value),labels=protnames,srt = -30)

Heatmap

topn=10
top <- as.matrix(yn[which(rownames(yn) %in% rownames(head(results,topn))),])

colfunc <- colorRampPalette(c("blue", "white", "red"))

heatmap.2(cor_matrix,trace="none",scale="row")

heatmap.2( top, col=colfunc(25),scale="row",
 trace="none",margins = c(5,15), cexRow=0.9, main="Top 10 genes")

Output results as tsv.

ym <- merge(results,yn,by=0)

colnames(ym)[1] <- "ProteinID"
write.table(x=ym,file="vlcad_ko_de.tsv",quote=F,sep="\t",row.names=FALSE)

ym <- ym[order(ym$P.Value),]

Pathway Enrichment

First get the pathways from Reactome.

Then run mitch.

pw <- gmt_import("ReactomePathways_2025-10-31.gmt")

gt <- data.frame(rownames(results))
gt$name <- sapply(lapply(strsplit(gt[,1],"_"),function(x) { c(x,"x") } ),"[[",2) #substitute x if there's no gene name
table(which(gt$name =="x"))
## 
##   74 3115 
##    1    1
m <- mitch_import(x=results,DEtype="limma",geneTable=gt)
## The input is a single dataframe; one contrast only. Converting
##         it to a list for you.
## Note: Mean no. genes in input = 7746
## Note: no. genes in output = 7745
## Note: estimated proportion of input genes in output = 1
# sanity checks
head(m[order(m$x),,drop=FALSE])
##                  x
## ATP2B3  -79.215082
## KIRREL2 -28.165804
## GALC    -27.912631
## RAB44   -23.722454
## GNAO1    -8.977974
## ACADVL   -8.495590
tail(m[order(m$x),,drop=FALSE])
##                x
## GLE1    6.277462
## MTFR1L  6.539372
## CEND1   7.161533
## HHIP    7.294129
## SUZ12  14.184665
## ITIH1  22.070456
mres <- mitch_calc(x=m,genesets=pw,minsetsize=5,cores=8,priority="effect")
## Note: Enrichments with large effect sizes may not be
##             statistically significant.
NTOT <- nrow(mres$enrichment_result)

mresig <- subset(mres$enrichment_result,p.adjustANOVA<0.05)

NSIG <- nrow((mresig))

NUP <- nrow(subset(mresig,s.dist>0))
NDN <- nrow(subset(mresig,s.dist<0))

head(subset(mresig,s.dist>0),20) %>%
  kbl(caption="Top upregulated pathways (effect size)") %>%
  kable_paper("hover", full_width = F)
Top upregulated pathways (effect size)
set setSize pANOVA s.dist p.adjustANOVA
1745 rRNA modification in the mitochondrion 13 0.0000001 0.8586295 0.0000017
845 Mitochondrial ribosome-associated quality control 89 0.0000000 0.8264738 0.0000000
243 Classical antibody-mediated complement activation 6 0.0006917 0.7998449 0.0043403
279 Creation of C4 and C2 activators 6 0.0006917 0.7998449 0.0043403
674 Initial triggering of complement 9 0.0000695 0.7657417 0.0006565
334 Defective TPR may confer susceptibility towards thyroid papillary carcinoma (TPC) 29 0.0000000 0.7631701 0.0000000
640 IP3 and IP4 transport between cytosol and nucleus 29 0.0000000 0.7631701 0.0000000
641 IP6 and IP7 transport between cytosol and nucleus 29 0.0000000 0.7631701 0.0000000
642 IPs transport between nucleus and cytosol 29 0.0000000 0.7631701 0.0000000
1218 Regulation of Glucokinase by Glucokinase Regulatory Protein 29 0.0000000 0.7631701 0.0000000
977 PDH complex synthesizes acetyl-CoA from PYR 5 0.0042314 0.7387080 0.0207092
1283 Regulation of pyruvate dehydrogenase (PDH) complex 12 0.0000119 0.7302470 0.0001544
1372 SUMOylation of SUMOylation proteins 31 0.0000000 0.7206503 0.0000000
734 Ketone body metabolism 7 0.0010709 0.7140272 0.0063140
1075 Propionyl-CoA catabolism 5 0.0058501 0.7117829 0.0274832
1525 Synthesis of Ketone Bodies 6 0.0025577 0.7111599 0.0137457
375 Diseases of branched-chain amino acid catabolism 12 0.0000244 0.7036295 0.0002858
841 Mitochondrial iron-sulfur cluster biogenesis 13 0.0000116 0.7024951 0.0001529
877 NEP/NS2 Interacts with the Cellular Export Machinery 31 0.0000000 0.6990558 0.0000000
1080 Protein lipoylation 9 0.0003029 0.6955073 0.0022175
head(subset(mresig,s.dist<0),20) %>%
  kbl(caption="Top downregulated pathways (effect size)") %>%
  kable_paper("hover", full_width = F)
Top downregulated pathways (effect size)
set setSize pANOVA s.dist p.adjustANOVA
1178 RUNX2 regulates genes involved in cell migration 5 0.0017631 -0.8076486 0.0099928
1171 RSK activation 5 0.0054010 -0.7185013 0.0255782
905 Nef mediated downregulation of MHC class I complex cell surface expression 9 0.0004455 -0.6760313 0.0030339
909 Negative feedback regulation of MAPK pathway 5 0.0093903 -0.6708527 0.0411439
306 DNA Damage Reversal 6 0.0087700 -0.6179524 0.0389113
1312 Reversal of alkylation damage by DNA dioxygenases 6 0.0087700 -0.6179524 0.0389113
1063 Prevention of phagosomal-lysosomal fusion 6 0.0108181 -0.6008959 0.0462159
906 Nef-mediates down modulation of cell surface receptors by recruiting them to clathrin adapters 16 0.0000330 -0.5996571 0.0003598
1515 Suppression of phagosomal maturation 9 0.0018810 -0.5984718 0.0106265
58 Activation of caspases through apoptosome-mediated cleavage 6 0.0116380 -0.5948658 0.0488238
511 Frs2-mediated activation 8 0.0063474 -0.5573543 0.0295820
1074 Prolonged ERK activation events 10 0.0025341 -0.5515191 0.0137381
509 Formation of tubulin folding intermediates by CCT/TriC 20 0.0000329 -0.5365696 0.0003598
1603 The role of Nef in HIV-1 replication and disease pathogenesis 20 0.0000354 -0.5344207 0.0003818
201 Caspase-mediated cleavage of cytoskeletal proteins 10 0.0035746 -0.5322043 0.0184181
1532 Synthesis of PIPs at the Golgi membrane 14 0.0006035 -0.5296302 0.0039258
519 G beta:gamma signalling through PI3Kgamma 11 0.0033178 -0.5115076 0.0172981
1533 Synthesis of PIPs at the early endosome membrane 14 0.0010124 -0.5075669 0.0059893
570 Glucagon-type ligand receptors 9 0.0100870 -0.4954039 0.0439773
1534 Synthesis of PIPs at the late endosome membrane 9 0.0107493 -0.4911525 0.0460648

Bar plot of top results.

ups <- head(subset(mresig,s.dist>0),20)[,c("s.dist")]
names(ups) <- head(subset(mresig,s.dist>0),20)[,c("set")]

dns <- head(subset(mresig,s.dist<0),20)[,c("s.dist")]
names(dns) <- head(subset(mresig,s.dist<0),20)[,c("set")]

tops <- c(ups,dns)

tops <- sort(tops)

par(mar=c(5,25,5,1))

cols <- c(rep("blue",length(dns)),rep("red",length(ups)))

barplot(abs(tops),
  horiz=TRUE,las=1,cex.names=0.65,col=cols,
  main="top DE Reactomes",
  xlab="ES",
  cex.main=0.9)

Make detailed pathway report.

mitch_report(res=mres,outfile="vlcad_ko_reactome.html",overwrite=TRUE)
## Note: overwriting existing report
## Dataset saved as " /tmp/Rtmp4xOreu/vlcad_ko_reactome.rds ".
## 
## 
## processing file: mitch.Rmd
## 1/36                             
## 2/36 [checklibraries]            
## 3/36                             
## 4/36 [peek]                      
## 5/36                             
## 6/36 [metrics]                   
## 7/36                             
## 8/36 [scatterplot]
## 9/36                             
## 10/36 [contourplot]               
## 11/36                             
## 12/36 [input_geneset_metrics1]    
## 13/36                             
## 14/36 [input_geneset_metrics2]
## 15/36                             
## 16/36 [input_geneset_metrics3]
## 17/36                             
## 18/36 [echart1d]                  
## 19/36 [echart2d]                  
## 20/36                             
## 21/36 [heatmap]                   
## 22/36                             
## 23/36 [effectsize]                
## 24/36                             
## 25/36 [results_table]             
## 26/36                             
## 27/36 [results_table_complete]    
## 28/36                             
## 29/36 [detailed_geneset_reports1d]
## 30/36                             
## 31/36 [detailed_geneset_reports2d]
## 32/36                             
## 33/36 [network]
## 34/36                             
## 35/36 [session_info]              
## 36/36
## output file: /mnt/data/mdz/projects/matt/VLCAD_KO_proteomics/mitch.knit.md
## /home/mdz/anaconda3/bin/pandoc +RTS -K512m -RTS /mnt/data/mdz/projects/matt/VLCAD_KO_proteomics/mitch.knit.md --to html4 --from markdown+autolink_bare_uris+tex_math_single_backslash --output /tmp/Rtmp4xOreu/mitch_report.html --lua-filter /usr/local/lib/R/site-library/rmarkdown/rmarkdown/lua/pagebreak.lua --lua-filter /usr/local/lib/R/site-library/rmarkdown/rmarkdown/lua/latex-div.lua --self-contained --variable bs3=TRUE --section-divs --template /usr/local/lib/R/site-library/rmarkdown/rmd/h/default.html --no-highlight --variable highlightjs=1 --variable theme=bootstrap --mathjax --variable 'mathjax-url=https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML' --include-in-header /tmp/Rtmp4xOreu/rmarkdown-str20c6ac4321280.html
## 
## Output created: /tmp/Rtmp4xOreu/mitch_report.html
## [1] TRUE

Session information

For reproducibility.

sessionInfo()
## R version 4.5.1 (2025-06-13)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 24.04.3 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so;  LAPACK version 3.12.0
## 
## locale:
##  [1] LC_CTYPE=en_AU.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_AU.UTF-8        LC_COLLATE=en_AU.UTF-8    
##  [5] LC_MONETARY=en_AU.UTF-8    LC_MESSAGES=en_AU.UTF-8   
##  [7] LC_PAPER=en_AU.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Australia/Melbourne
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] gtools_3.9.5        mitch_1.20.0        gplots_3.2.0       
## [4] kableExtra_1.4.0    limma_3.64.3        pcaMethods_2.0.0   
## [7] Biobase_2.68.0      BiocGenerics_0.54.0 generics_0.1.4     
## 
## loaded via a namespace (and not attached):
##  [1] beeswarm_0.4.0        gtable_0.3.6          xfun_0.52            
##  [4] bslib_0.9.0           ggplot2_3.5.2         htmlwidgets_1.6.4    
##  [7] caTools_1.18.3        lattice_0.22-7        GGally_2.3.0         
## [10] vctrs_0.6.5           tools_4.5.1           bitops_1.0-9         
## [13] parallel_4.5.1        tibble_3.3.0          pkgconfig_2.0.3      
## [16] KernSmooth_2.23-26    RColorBrewer_1.1-3    S7_0.2.0             
## [19] lifecycle_1.0.4       compiler_4.5.1        farver_2.1.2         
## [22] stringr_1.5.1         textshaping_1.0.1     statmod_1.5.0        
## [25] httpuv_1.6.16         htmltools_0.5.8.1     sass_0.4.10          
## [28] yaml_2.3.10           pillar_1.11.0         later_1.4.2          
## [31] jquerylib_0.1.4       tidyr_1.3.1           MASS_7.3-65          
## [34] cachem_1.1.0          mime_0.13             network_1.19.0       
## [37] ggstats_0.10.0        tidyselect_1.2.1      digest_0.6.37        
## [40] stringi_1.8.7         reshape2_1.4.4        dplyr_1.1.4          
## [43] purrr_1.1.0           fastmap_1.2.0         grid_4.5.1           
## [46] cli_3.6.5             magrittr_2.0.3        dichromat_2.0-0.1    
## [49] withr_3.0.2           scales_1.4.0          promises_1.3.3       
## [52] rmarkdown_2.29        gridExtra_2.3         coda_0.19-4.1        
## [55] shiny_1.11.1          evaluate_1.0.4        knitr_1.50           
## [58] viridisLite_0.4.2     rlang_1.1.6           Rcpp_1.1.0           
## [61] xtable_1.8-4          glue_1.8.0            echarts4r_0.4.5      
## [64] xml2_1.3.8            svglite_2.2.1         rstudioapi_0.17.1    
## [67] jsonlite_2.0.0        plyr_1.8.9            statnet.common_4.12.0
## [70] R6_2.6.1              systemfonts_1.2.3