Proteomics EDA

# Load required libraries
library("pcaMethods")

## Loading required package: Biobase

## Loading required package: BiocGenerics

## Loading required package: generics

## 
## Attaching package: 'generics'

## The following objects are masked from 'package:base':
## 
##     as.difftime, as.factor, as.ordered, intersect, is.element, setdiff,
##     setequal, union

## 
## Attaching package: 'BiocGenerics'

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, saveRDS, table, tapply, unique,
##     unsplit, which.max, which.min

## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.

## 
## Attaching package: 'pcaMethods'

## The following object is masked from 'package:stats':
## 
##     loadings

library("limma")

## 
## Attaching package: 'limma'

## The following object is masked from 'package:BiocGenerics':
## 
##     plotMA

# Load the data
# PATH projects/proteomics/workingdir02/dia-quant-output/report.pg_matrix.tsv
x <- read.table("report.pg_matrix.tsv", header=TRUE, sep="\t")

# Clean column names
colnames(x) <- gsub("X.home.projects.proteomics.PXD058340.20240911_AST0_NEO4_IAH_collab_SAG_U2OS_Frac_Control_","", colnames(x))
colnames(x) <- gsub("_uncalibrated.mzML", "", colnames(x))

# Set row names
rownames(x) <- paste(x$Protein.Ids, x$Genes, sep = "_")

# Extract intensity data
y <- x[, 6:13]

# Order columns alphabetically
y <- y[, order(colnames(y))]

# Display data dimensions and first few rows
cat("Data dimensions:", nrow(y), "proteins x", ncol(y), "samples\n")

## Data dimensions: 9387 proteins x 8 samples

cat("FIRST 6 ROWS OF INTENSITY DATA\n")

## FIRST 6 ROWS OF INTENSITY DATA

print(head(y))

##                    Chrom_01 Chrom_02 Chrom_03 Chrom_04   Sol_01   Sol_02
## A0A024RBG1_NUDT4B        NA 646141.0 133422.0 290905.0 103357.0 102421.0
## A0A096LP01_SMIM26   20260.8  13860.2  11513.8  13364.7       NA       NA
## A0A0B4J2F0_PIGBOS1  20346.8  36566.5  33835.2  49822.1  25264.6  33506.5
## A0A0U1RRI6_CENPVL3 116623.0 128056.0  65694.5 181058.0 148163.0  37960.6
## A0A2Z4LIS9_FOXO3B        NA       NA       NA       NA  55652.2  19294.6
## A0A3B3IRV3_MCTS2         NA       NA  78454.1       NA       NA       NA
##                      Sol_03   Sol_04
## A0A024RBG1_NUDT4B  72773.70 129309.0
## A0A096LP01_SMIM26        NA       NA
## A0A0B4J2F0_PIGBOS1 62707.70       NA
## A0A0U1RRI6_CENPVL3 36157.40  39845.5
## A0A2Z4LIS9_FOXO3B   6700.05  55440.3
## A0A3B3IRV3_MCTS2   94093.10 151075.0

# Missing Data Analysis
# Overall missing data statistics
total_values <- nrow(y) * ncol(y)
total_missing <- sum(is.na(y))
missing_percentage <- (total_missing / total_values) * 100

cat("\nOVERALL MISSING DATA SUMMARY\n")

## 
## OVERALL MISSING DATA SUMMARY

cat("Total values:", total_values, "\n")

## Total values: 75096

cat("Total missing:", total_missing, "\n")

## Total missing: 5667

cat("Overall missing percentage:", round(missing_percentage, 2), "%\n")

## Overall missing percentage: 7.55 %

# Missing data per sample
missing_per_sample <- colSums(is.na(y))
missing_per_sample_pct <- (missing_per_sample / nrow(y)) * 100

missing_summary <- data.frame(
  Sample = names(missing_per_sample),
  Missing_Count = missing_per_sample,
  Missing_Percent = round(missing_per_sample_pct, 2)
)

cat("\nMISSING DATA PER SAMPLE\n")

## 
## MISSING DATA PER SAMPLE

print(missing_summary)

##            Sample Missing_Count Missing_Percent
## Chrom_01 Chrom_01           841            8.96
## Chrom_02 Chrom_02           756            8.05
## Chrom_03 Chrom_03           631            6.72
## Chrom_04 Chrom_04           667            7.11
## Sol_01     Sol_01           623            6.64
## Sol_02     Sol_02           722            7.69
## Sol_03     Sol_03           715            7.62
## Sol_04     Sol_04           712            7.58

# Missing data per protein
nas <- apply(X = y, MARGIN = 1, function(x){ 
  length(which(is.na(x))) 
})
missing_proteins_table <- table(nas)

cat("\nDATA DISTRIBUTION ACROSS PROTEINS\n")

## 
## DATA DISTRIBUTION ACROSS PROTEINS

cat("Number of proteins by missing value count:\n")

## Number of proteins by missing value count:

print(missing_proteins_table)

## nas
##    0    1    2    3    4    5    6    7 
## 7448  585  345  314  327  154  126   88

cat("\nSummary of missing values per protein:\n")

## 
## Summary of missing values per protein:

print(summary(nas))

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.6037  0.0000  7.0000

Analysis 7.55% overall missing data, sample-wise missing data ranges from 6.64%(sol_01) to 8.96% (Chrom_01). The pattern suggests data could be missing completely at random (MCAR)

# Raw Data Distribution Analysis
# Basic statistics for each sample
raw_stats <- data.frame(
  Sample = colnames(y),
  Min = apply(y, 2, min, na.rm = TRUE),
  Q1 = apply(y, 2, quantile, 0.25, na.rm = TRUE),
  Median = apply(y, 2, median, na.rm = TRUE),
  Mean = apply(y, 2, mean, na.rm = TRUE),
  Q3 = apply(y, 2, quantile, 0.75, na.rm = TRUE),
  Max = apply(y, 2, max, na.rm = TRUE),
  SD = apply(y, 2, sd, na.rm = TRUE)
)

# Round only numeric columns
raw_stats[, 2:8] <- round(raw_stats[, 2:8], 2)

cat("\nRAW DATA STATISTICS PER SAMPLE\n")

## 
## RAW DATA STATISTICS PER SAMPLE

print(raw_stats)

##            Sample     Min        Q1   Median    Mean        Q3        Max
## Chrom_01 Chrom_01  846.85  66616.33 199742.0 2779777  686373.5 2093610000
## Chrom_02 Chrom_02  465.30  72265.90 222034.0 2606580  748327.5 1745770000
## Chrom_03 Chrom_03  654.75  70661.12 221557.5 2425816  753637.5 1424870000
## Chrom_04 Chrom_04  757.16  73106.57 228133.0 2497867  778770.0 1533690000
## Sol_01     Sol_01 1218.35 107208.00 336680.0 1477733  972017.2  427266000
## Sol_02     Sol_02  383.39 109490.00 344214.0 1519168 1006990.0  418187000
## Sol_03     Sol_03  632.25 109828.00 336741.5 1502361  976827.8  461732000
## Sol_04     Sol_04  440.77 109120.00 342712.0 1515689  993096.0  437961000
##                SD
## Chrom_01 40994633
## Chrom_02 35158822
## Chrom_03 30908103
## Chrom_04 32631793
## Sol_01    7764360
## Sol_02    7778442
## Sol_03    7996860
## Sol_04    7941235

# Summary of all samples combined
cat("\nDATA SUMMARY\n")

## 
## DATA SUMMARY

print(summary(y))

##     Chrom_01            Chrom_02            Chrom_03        
##  Min.   :8.470e+02   Min.   :4.650e+02   Min.   :6.550e+02  
##  1st Qu.:6.662e+04   1st Qu.:7.227e+04   1st Qu.:7.066e+04  
##  Median :1.997e+05   Median :2.220e+05   Median :2.216e+05  
##  Mean   :2.780e+06   Mean   :2.607e+06   Mean   :2.426e+06  
##  3rd Qu.:6.864e+05   3rd Qu.:7.483e+05   3rd Qu.:7.536e+05  
##  Max.   :2.094e+09   Max.   :1.746e+09   Max.   :1.425e+09  
##  NA's   :841         NA's   :756         NA's   :631        
##     Chrom_04             Sol_01              Sol_02         
##  Min.   :7.570e+02   Min.   :     1218   Min.   :      383  
##  1st Qu.:7.311e+04   1st Qu.:   107208   1st Qu.:   109490  
##  Median :2.281e+05   Median :   336680   Median :   344214  
##  Mean   :2.498e+06   Mean   :  1477733   Mean   :  1519168  
##  3rd Qu.:7.788e+05   3rd Qu.:   972017   3rd Qu.:  1006990  
##  Max.   :1.534e+09   Max.   :427266000   Max.   :418187000  
##  NA's   :667         NA's   :623         NA's   :722        
##      Sol_03              Sol_04         
##  Min.   :      632   Min.   :      441  
##  1st Qu.:   109828   1st Qu.:   109120  
##  Median :   336742   Median :   342712  
##  Mean   :  1502361   Mean   :  1515689  
##  3rd Qu.:   976828   3rd Qu.:   993096  
##  Max.   :461732000   Max.   :437961000  
##  NA's   :715         NA's   :712

Analysis Chromatin samples show higher maximum values and variance

# Data Visualization - Raw Distribution
# Boxplot of raw abundances
par(mfrow = c(1, 1), mar = c(5, 4, 3, 1))

# Raw scale boxplot
boxplot(y, las = 2, main = "Raw Abundance Distribution", 
        ylab = "Abundance", cex.axis = 0.6, col = rainbow(ncol(y)))

# Log-transformed boxplot for better visualization
y_log <- log10(y + 1)  # Add 1 to handle zeros if any
boxplot(y_log, las = 2, main = "Log10(Abundance + 1) Distribution", 
        ylab = "Log10(Abundance + 1)", cex.axis = 0.6, col = rainbow(ncol(y)))

par(mfrow = c(1, 1))

# Protein Abundance Analysis
# Calculate protein statistics
protein_stats <- data.frame(
  Protein = rownames(y),
  Mean_Abundance = rowMeans(y, na.rm = TRUE),
  Median_Abundance = apply(y, 1, median, na.rm = TRUE),
  SD_Abundance = apply(y, 1, sd, na.rm = TRUE),
  CV = apply(y, 1, sd, na.rm = TRUE) / rowMeans(y, na.rm = TRUE),
  Missing_Count = nas
)

cat("\nPROTEIN ABUNDANCE SUMMARY\n")

## 
## PROTEIN ABUNDANCE SUMMARY

cat("Mean abundance across all proteins:\n")

## Mean abundance across all proteins:

print(summary(protein_stats$Mean_Abundance))

##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##      1390     91480    284973   1894765    864300 806360750

cat("\nCoefficient of Variation (CV) summary:\n")

## 
## Coefficient of Variation (CV) summary:

print(summary(protein_stats$CV))

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
## 0.004909 0.380411 0.544771 0.545880 0.694548 2.066170       88

# Top 10 most abundant proteins
high_abundance <- head(protein_stats[order(protein_stats$Mean_Abundance, decreasing = TRUE), ], 10)

cat("\nTOP 10 MOST ABUNDANT PROTEINS\n")

## 
## TOP 10 MOST ABUNDANT PROTEINS

print(high_abundance[, c("Protein", "Mean_Abundance", "Missing_Count")])

##                     Protein Mean_Abundance Missing_Count
## P63261_ACTG1   P63261_ACTG1      806360750             0
## P0C0S5_H2AZ1   P0C0S5_H2AZ1      793833962             0
## Q96A08_H2BC1   Q96A08_H2BC1      728385412             0
## P62805_H4C1     P62805_H4C1      716529104             0
## P68431_H3C1     P68431_H3C1      566167549             0
## P16403_H1-2     P16403_H1-2      270591260             0
## P08670_VIM       P08670_VIM      257967080             0
## Q5TEC6_H3-7     Q5TEC6_H3-7      189024492             1
## P68104_EEF1A1 P68104_EEF1A1      170472625             0
## P04406_GAPDH   P04406_GAPDH      144327562             0

# Top 10 most variable proteins by CV
high_variable <- head(protein_stats[order(protein_stats$CV, decreasing = TRUE), ], 10)

cat("\nTOP 10 MOST VARIABLE PROTEINS (by Coefficient of Variation)\n")

## 
## TOP 10 MOST VARIABLE PROTEINS (by Coefficient of Variation)

#Round only the numeric columns, keep Protein column as-is
high_variable_display <- high_variable[, c("Protein", "Mean_Abundance", "CV", "Missing_Count")]
high_variable_display[, c("Mean_Abundance", "CV")] <- round(high_variable_display[, c("Mean_Abundance", "CV")], 3)
high_variable_display[, "Missing_Count"] <- round(high_variable_display[, "Missing_Count"], 0)  # Missing count should be integer
print(high_variable_display)

##                       Protein Mean_Abundance    CV Missing_Count
## Q92786_PROX1     Q92786_PROX1    1556116.300 2.066             3
## Q9H0Q0_CYRIA     Q9H0Q0_CYRIA      58128.832 1.979             3
## Q9Y2I6_NINL       Q9Y2I6_NINL      91156.117 1.919             4
## Q5BJH7_YIF1B     Q5BJH7_YIF1B     521314.714 1.774             1
## P02649_APOE       P02649_APOE     333324.343 1.686             1
## Q86Y13_DZIP3     Q86Y13_DZIP3      23110.778 1.686             1
## Q8IUZ5_PHYKPL   Q8IUZ5_PHYKPL      82381.637 1.671             0
## Q9BTV6_DPH7       Q9BTV6_DPH7     494644.600 1.650             3
## Q53H80_AKIRIN2 Q53H80_AKIRIN2      78096.320 1.576             3
## O43529_CHST10   O43529_CHST10       6505.818 1.556             2

# Visualization of CV distribution
hist(protein_stats$CV[!is.infinite(protein_stats$CV)], 
     breaks = 50, main = "Distribution of Coefficient of Variation", 
     xlab = "CV", ylab = "Frequency", 
     col = "lightblue", border = "darkblue")

# Sample Correlation Matrix
# Calculate correlation matrix
cor_matrix <- cor(y, use = "complete.obs")

cat("\nSAMPLE CORRELATION MATRIX\n")

## 
## SAMPLE CORRELATION MATRIX

print(round(cor_matrix, 3))

##          Chrom_01 Chrom_02 Chrom_03 Chrom_04 Sol_01 Sol_02 Sol_03 Sol_04
## Chrom_01    1.000    0.966    0.982    0.983  0.322  0.317  0.330  0.319
## Chrom_02    0.966    1.000    0.994    0.989  0.327  0.324  0.336  0.326
## Chrom_03    0.982    0.994    1.000    0.997  0.327  0.323  0.334  0.324
## Chrom_04    0.983    0.989    0.997    1.000  0.314  0.311  0.321  0.312
## Sol_01      0.322    0.327    0.327    0.314  1.000  0.990  0.993  0.990
## Sol_02      0.317    0.324    0.323    0.311  0.990  1.000  0.991  0.991
## Sol_03      0.330    0.336    0.334    0.321  0.993  0.991  1.000  0.993
## Sol_04      0.319    0.326    0.324    0.312  0.990  0.991  0.993  1.000

# Identify sample groups
chrom_samples <- grepl("Chrom", colnames(y))
sol_samples <- grepl("Sol", colnames(y))

# Calculate group-wise correlations
chrom_cor <- cor_matrix[chrom_samples, chrom_samples]
sol_cor <- cor_matrix[sol_samples, sol_samples]
between_cor <- cor_matrix[chrom_samples, sol_samples]

cat("\nCORRELATION SUMMARY\n")

## 
## CORRELATION SUMMARY

cat("Average within-Chromatin correlation:", round(mean(chrom_cor[upper.tri(chrom_cor)]), 3), "\n")

## Average within-Chromatin correlation: 0.985

cat("Average within-Soluble correlation:", round(mean(sol_cor[upper.tri(sol_cor)]), 3), "\n")

## Average within-Soluble correlation: 0.991

cat("Average between-condition correlation:", round(mean(between_cor), 3), "\n")

## Average between-condition correlation: 0.323

# Simple correlation heatmap using base R
heatmap(cor_matrix, main = "Sample Correlation Heatmap", 
        col = heat.colors(100), margins = c(8, 8))

# Data Quality Assessment - Define z before PCA
# Filter proteins with fewer missing values (e.g., less than 3 missing)
z <- y[which(nas < 3), ]
cat("\nDATA QUALITY ASSESSMENT\n")

## 
## DATA QUALITY ASSESSMENT

cat("Original data:", nrow(y), "proteins\n")

## Original data: 9387 proteins

cat("After filtering (< 3 missing values):", nrow(z), "proteins\n")

## After filtering (< 3 missing values): 8378 proteins

cat("Proteins retained:", round(nrow(z)/nrow(y) * 100, 1), "%\n")

## Proteins retained: 89.3 %

#NORMALIZATION PIPELINE
cat("\nNORMALIZATION PIPELINE\n")

## 
## NORMALIZATION PIPELINE

#Load required libraries
library(limma)

#Handle remaining missing values in filtered data
z_complete <- z
for(i in 1:ncol(z_complete)) {
  if(any(is.na(z_complete[,i]))) {
    min_val <- min(z_complete[,i], na.rm = TRUE)
    z_complete[is.na(z_complete[,i]), i] <- min_val * 0.1
  }
}

cat("Missing values after imputation:", sum(is.na(z_complete)), "\n")

## Missing values after imputation: 0

#Log10 transformation
z_log10 <- log10(z_complete + 1)
cat("Log10 transformation completed\n")

## Log10 transformation completed

#Between-array normalization
z_normalized <- normalizeBetweenArrays(z_log10, method = "quantile", targets = NULL, cyclic.method = "fast")
cat("Between-array normalization completed\n")

## Between-array normalization completed

cat("Final normalized data dimensions:", nrow(z_normalized), "x", ncol(z_normalized), "\n")

## Final normalized data dimensions: 8378 x 8

# NORMALIZATION VALIDATION
cat("\nNORMALIZATION VALIDATION\n")

## 
## NORMALIZATION VALIDATION

# Before and after comparison
cat("Raw data range:", round(range(z_complete), 0), "\n")

## Raw data range: 44 2093610000

cat("Normalized data range:", round(range(z_normalized), 2), "\n")

## Normalized data range: 1.86 8.93

# Sample median variability
raw_medians <- apply(z_complete, 2, median)
norm_medians <- apply(z_normalized, 2, median)

cat("CV of sample medians (raw):", round(sd(raw_medians)/mean(raw_medians), 3), "\n")

## CV of sample medians (raw): 0.248

cat("CV of sample medians (normalized):", round(sd(norm_medians)/mean(norm_medians), 3), "\n")

## CV of sample medians (normalized): 0

# Correlation preservation
cor_raw <- cor(z_complete)
cor_norm <- cor(z_normalized)

chrom_idx <- grepl("Chrom", colnames(z_normalized))
sol_idx <- grepl("Sol", colnames(z_normalized))

cat("Within-Chromatin correlation (normalized):", 
    round(mean(cor_norm[chrom_idx, chrom_idx][upper.tri(cor_norm[chrom_idx, chrom_idx])]), 3), "\n")

## Within-Chromatin correlation (normalized): 0.822

cat("Within-Soluble correlation (normalized):", 
    round(mean(cor_norm[sol_idx, sol_idx][upper.tri(cor_norm[sol_idx, sol_idx])]), 3), "\n")

## Within-Soluble correlation (normalized): 0.853

# VISUALIZATION COMPARISON
par(mfrow = c(2, 2), mar = c(6, 4, 3, 1))

# Raw data
boxplot(log10(z_complete + 1), las = 2, main = "Raw Data (Log10 scale)", 
        cex.axis = 0.6, col = rainbow(ncol(z_complete)))

# Log10 transformed
boxplot(z_log10, las = 2, main = "Log10 Transformed", 
        cex.axis = 0.6, col = rainbow(ncol(z_log10)))

# Final normalized(quantile)
boxplot(z_normalized, las = 2, main = "Final Normalized", 
        cex.axis = 0.6, col = rainbow(ncol(z_normalized)))

par(mfrow = c(1, 1))

# Sample completeness
sample_completeness <- (1 - missing_per_sample_pct/100) * 100
cat("\nSample completeness (%):\n")

## 
## Sample completeness (%):

for(i in 1:length(sample_completeness)) {
  cat(names(sample_completeness)[i], ":", round(sample_completeness[i], 1), "%\n")
}

## Chrom_01 : 91 %
## Chrom_02 : 91.9 %
## Chrom_03 : 93.3 %
## Chrom_04 : 92.9 %
## Sol_01 : 93.4 %
## Sol_02 : 92.3 %
## Sol_03 : 92.4 %
## Sol_04 : 92.4 %

# PCA Analysis using pcaMethods
library(pcaMethods)
cat("\nPCA ANALYSIS using pcaMethods\n")

## 
## PCA ANALYSIS using pcaMethods

# Use filtered data (z) with fewer missing values
# Transpose data so samples are rows (pcaMethods expects this format)
z_t <- t(z)

# Perform PPCA (handles missing values)
pc <- pca(z_t, nPcs = 3, method = "ppca")

# Display PCA results
print(pc)

## ppca calculated PCA
## Importance of component(s):
##                  PC1     PC2      PC3
## R2            0.9704 0.02338 0.004566
## Cumulative R2 0.9704 0.99376 0.998329
## 8378     Variables
## 8    Samples
## 1275     NAs ( 1.902 %)
## 3    Calculated component(s)
## Data was mean centered before running PCA 
## Data was NOT scaled before running PCA 
## Scores structure:
## [1] 8 3
## Loadings structure:
## [1] 8378    3

cat("\nPCA Summary:\n")

## 
## PCA Summary:

cat("Number of components:", pc@nPcs, "\n")

## Number of components: 3

cat("Method used:", pc@method, "\n")

## Method used: ppca

cat("Variance explained by each PC:\n")

## Variance explained by each PC:

print(round(pc@R2, 3))

## [1] 0.970 0.023 0.005

cat("\nCumulative variance explained:\n")

## 
## Cumulative variance explained:

print(round(pc@R2cum, 3))

## [1] 0.970 0.994 0.998

# Scores plot (samples)
par(mar = c(5,4,3,1))
slplot(pc, main = "PCA Scores Plot", cex = 0.8, cex.axis = 0.8)

# Manual scores plot with sample group colors
scores <- pc@scores
chrom_color <- ifelse(grepl("Chrom", rownames(scores)), "red", "blue")

plot(scores[,1], scores[,2], col = chrom_color, pch = 19, cex = 1.5,
     main = "PCA Scores - PC1 vs PC2",
     xlab = paste0("PC1 (", round(pc@R2[1]*100, 1), "% variance)"),
     ylab = paste0("PC2 (", round(pc@R2[2]*100, 1), "% variance)"))

# Add sample labels
text(scores[,1], scores[,2], labels = rownames(scores), pos = 3, cex = 0.7)

# Add legend
legend("topright", legend = c("Chromatin", "Soluble"), 
       col = c("red", "blue"), pch = 19, cex = 0.8)

Proteomics EDA

Swanitha

2025-08-24