# Load required libraries
library("pcaMethods")
## Loading required package: Biobase
## Loading required package: BiocGenerics
## Loading required package: generics
##
## Attaching package: 'generics'
## The following objects are masked from 'package:base':
##
## as.difftime, as.factor, as.ordered, intersect, is.element, setdiff,
## setequal, union
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, saveRDS, table, tapply, unique,
## unsplit, which.max, which.min
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Attaching package: 'pcaMethods'
## The following object is masked from 'package:stats':
##
## loadings
library("limma")
##
## Attaching package: 'limma'
## The following object is masked from 'package:BiocGenerics':
##
## plotMA
# Load the data
# PATH projects/proteomics/workingdir02/dia-quant-output/report.pg_matrix.tsv
x <- read.table("report.pg_matrix.tsv", header=TRUE, sep="\t")
# Clean column names
colnames(x) <- gsub("X.home.projects.proteomics.PXD058340.20240911_AST0_NEO4_IAH_collab_SAG_U2OS_Frac_Control_","", colnames(x))
colnames(x) <- gsub("_uncalibrated.mzML", "", colnames(x))
# Set row names
rownames(x) <- paste(x$Protein.Ids, x$Genes, sep = "_")
# Extract intensity data
y <- x[, 6:13]
# Order columns alphabetically
y <- y[, order(colnames(y))]
# Display data dimensions and first few rows
cat("Data dimensions:", nrow(y), "proteins x", ncol(y), "samples\n")
## Data dimensions: 9387 proteins x 8 samples
cat("FIRST 6 ROWS OF INTENSITY DATA\n")
## FIRST 6 ROWS OF INTENSITY DATA
print(head(y))
## Chrom_01 Chrom_02 Chrom_03 Chrom_04 Sol_01 Sol_02
## A0A024RBG1_NUDT4B NA 646141.0 133422.0 290905.0 103357.0 102421.0
## A0A096LP01_SMIM26 20260.8 13860.2 11513.8 13364.7 NA NA
## A0A0B4J2F0_PIGBOS1 20346.8 36566.5 33835.2 49822.1 25264.6 33506.5
## A0A0U1RRI6_CENPVL3 116623.0 128056.0 65694.5 181058.0 148163.0 37960.6
## A0A2Z4LIS9_FOXO3B NA NA NA NA 55652.2 19294.6
## A0A3B3IRV3_MCTS2 NA NA 78454.1 NA NA NA
## Sol_03 Sol_04
## A0A024RBG1_NUDT4B 72773.70 129309.0
## A0A096LP01_SMIM26 NA NA
## A0A0B4J2F0_PIGBOS1 62707.70 NA
## A0A0U1RRI6_CENPVL3 36157.40 39845.5
## A0A2Z4LIS9_FOXO3B 6700.05 55440.3
## A0A3B3IRV3_MCTS2 94093.10 151075.0
# Missing Data Analysis
# Overall missing data statistics
total_values <- nrow(y) * ncol(y)
total_missing <- sum(is.na(y))
missing_percentage <- (total_missing / total_values) * 100
cat("\nOVERALL MISSING DATA SUMMARY\n")
##
## OVERALL MISSING DATA SUMMARY
cat("Total values:", total_values, "\n")
## Total values: 75096
cat("Total missing:", total_missing, "\n")
## Total missing: 5667
cat("Overall missing percentage:", round(missing_percentage, 2), "%\n")
## Overall missing percentage: 7.55 %
# Missing data per sample
missing_per_sample <- colSums(is.na(y))
missing_per_sample_pct <- (missing_per_sample / nrow(y)) * 100
missing_summary <- data.frame(
Sample = names(missing_per_sample),
Missing_Count = missing_per_sample,
Missing_Percent = round(missing_per_sample_pct, 2)
)
cat("\nMISSING DATA PER SAMPLE\n")
##
## MISSING DATA PER SAMPLE
print(missing_summary)
## Sample Missing_Count Missing_Percent
## Chrom_01 Chrom_01 841 8.96
## Chrom_02 Chrom_02 756 8.05
## Chrom_03 Chrom_03 631 6.72
## Chrom_04 Chrom_04 667 7.11
## Sol_01 Sol_01 623 6.64
## Sol_02 Sol_02 722 7.69
## Sol_03 Sol_03 715 7.62
## Sol_04 Sol_04 712 7.58
# Missing data per protein
nas <- apply(X = y, MARGIN = 1, function(x){
length(which(is.na(x)))
})
missing_proteins_table <- table(nas)
cat("\nDATA DISTRIBUTION ACROSS PROTEINS\n")
##
## DATA DISTRIBUTION ACROSS PROTEINS
cat("Number of proteins by missing value count:\n")
## Number of proteins by missing value count:
print(missing_proteins_table)
## nas
## 0 1 2 3 4 5 6 7
## 7448 585 345 314 327 154 126 88
cat("\nSummary of missing values per protein:\n")
##
## Summary of missing values per protein:
print(summary(nas))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.6037 0.0000 7.0000
Analysis 7.55% overall missing data, sample-wise missing data ranges from 6.64%(sol_01) to 8.96% (Chrom_01). The pattern suggests data could be missing completely at random (MCAR)
# Raw Data Distribution Analysis
# Basic statistics for each sample
raw_stats <- data.frame(
Sample = colnames(y),
Min = apply(y, 2, min, na.rm = TRUE),
Q1 = apply(y, 2, quantile, 0.25, na.rm = TRUE),
Median = apply(y, 2, median, na.rm = TRUE),
Mean = apply(y, 2, mean, na.rm = TRUE),
Q3 = apply(y, 2, quantile, 0.75, na.rm = TRUE),
Max = apply(y, 2, max, na.rm = TRUE),
SD = apply(y, 2, sd, na.rm = TRUE)
)
# Round only numeric columns
raw_stats[, 2:8] <- round(raw_stats[, 2:8], 2)
cat("\nRAW DATA STATISTICS PER SAMPLE\n")
##
## RAW DATA STATISTICS PER SAMPLE
print(raw_stats)
## Sample Min Q1 Median Mean Q3 Max
## Chrom_01 Chrom_01 846.85 66616.33 199742.0 2779777 686373.5 2093610000
## Chrom_02 Chrom_02 465.30 72265.90 222034.0 2606580 748327.5 1745770000
## Chrom_03 Chrom_03 654.75 70661.12 221557.5 2425816 753637.5 1424870000
## Chrom_04 Chrom_04 757.16 73106.57 228133.0 2497867 778770.0 1533690000
## Sol_01 Sol_01 1218.35 107208.00 336680.0 1477733 972017.2 427266000
## Sol_02 Sol_02 383.39 109490.00 344214.0 1519168 1006990.0 418187000
## Sol_03 Sol_03 632.25 109828.00 336741.5 1502361 976827.8 461732000
## Sol_04 Sol_04 440.77 109120.00 342712.0 1515689 993096.0 437961000
## SD
## Chrom_01 40994633
## Chrom_02 35158822
## Chrom_03 30908103
## Chrom_04 32631793
## Sol_01 7764360
## Sol_02 7778442
## Sol_03 7996860
## Sol_04 7941235
# Summary of all samples combined
cat("\nDATA SUMMARY\n")
##
## DATA SUMMARY
print(summary(y))
## Chrom_01 Chrom_02 Chrom_03
## Min. :8.470e+02 Min. :4.650e+02 Min. :6.550e+02
## 1st Qu.:6.662e+04 1st Qu.:7.227e+04 1st Qu.:7.066e+04
## Median :1.997e+05 Median :2.220e+05 Median :2.216e+05
## Mean :2.780e+06 Mean :2.607e+06 Mean :2.426e+06
## 3rd Qu.:6.864e+05 3rd Qu.:7.483e+05 3rd Qu.:7.536e+05
## Max. :2.094e+09 Max. :1.746e+09 Max. :1.425e+09
## NA's :841 NA's :756 NA's :631
## Chrom_04 Sol_01 Sol_02
## Min. :7.570e+02 Min. : 1218 Min. : 383
## 1st Qu.:7.311e+04 1st Qu.: 107208 1st Qu.: 109490
## Median :2.281e+05 Median : 336680 Median : 344214
## Mean :2.498e+06 Mean : 1477733 Mean : 1519168
## 3rd Qu.:7.788e+05 3rd Qu.: 972017 3rd Qu.: 1006990
## Max. :1.534e+09 Max. :427266000 Max. :418187000
## NA's :667 NA's :623 NA's :722
## Sol_03 Sol_04
## Min. : 632 Min. : 441
## 1st Qu.: 109828 1st Qu.: 109120
## Median : 336742 Median : 342712
## Mean : 1502361 Mean : 1515689
## 3rd Qu.: 976828 3rd Qu.: 993096
## Max. :461732000 Max. :437961000
## NA's :715 NA's :712
Analysis Chromatin samples show higher maximum values and variance
# Data Visualization - Raw Distribution
# Boxplot of raw abundances
par(mfrow = c(1, 1), mar = c(5, 4, 3, 1))
# Raw scale boxplot
boxplot(y, las = 2, main = "Raw Abundance Distribution",
ylab = "Abundance", cex.axis = 0.6, col = rainbow(ncol(y)))
# Log-transformed boxplot for better visualization
y_log <- log10(y + 1) # Add 1 to handle zeros if any
boxplot(y_log, las = 2, main = "Log10(Abundance + 1) Distribution",
ylab = "Log10(Abundance + 1)", cex.axis = 0.6, col = rainbow(ncol(y)))
par(mfrow = c(1, 1))
# Protein Abundance Analysis
# Calculate protein statistics
protein_stats <- data.frame(
Protein = rownames(y),
Mean_Abundance = rowMeans(y, na.rm = TRUE),
Median_Abundance = apply(y, 1, median, na.rm = TRUE),
SD_Abundance = apply(y, 1, sd, na.rm = TRUE),
CV = apply(y, 1, sd, na.rm = TRUE) / rowMeans(y, na.rm = TRUE),
Missing_Count = nas
)
cat("\nPROTEIN ABUNDANCE SUMMARY\n")
##
## PROTEIN ABUNDANCE SUMMARY
cat("Mean abundance across all proteins:\n")
## Mean abundance across all proteins:
print(summary(protein_stats$Mean_Abundance))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1390 91480 284973 1894765 864300 806360750
cat("\nCoefficient of Variation (CV) summary:\n")
##
## Coefficient of Variation (CV) summary:
print(summary(protein_stats$CV))
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.004909 0.380411 0.544771 0.545880 0.694548 2.066170 88
# Top 10 most abundant proteins
high_abundance <- head(protein_stats[order(protein_stats$Mean_Abundance, decreasing = TRUE), ], 10)
cat("\nTOP 10 MOST ABUNDANT PROTEINS\n")
##
## TOP 10 MOST ABUNDANT PROTEINS
print(high_abundance[, c("Protein", "Mean_Abundance", "Missing_Count")])
## Protein Mean_Abundance Missing_Count
## P63261_ACTG1 P63261_ACTG1 806360750 0
## P0C0S5_H2AZ1 P0C0S5_H2AZ1 793833962 0
## Q96A08_H2BC1 Q96A08_H2BC1 728385412 0
## P62805_H4C1 P62805_H4C1 716529104 0
## P68431_H3C1 P68431_H3C1 566167549 0
## P16403_H1-2 P16403_H1-2 270591260 0
## P08670_VIM P08670_VIM 257967080 0
## Q5TEC6_H3-7 Q5TEC6_H3-7 189024492 1
## P68104_EEF1A1 P68104_EEF1A1 170472625 0
## P04406_GAPDH P04406_GAPDH 144327562 0
# Top 10 most variable proteins by CV
high_variable <- head(protein_stats[order(protein_stats$CV, decreasing = TRUE), ], 10)
cat("\nTOP 10 MOST VARIABLE PROTEINS (by Coefficient of Variation)\n")
##
## TOP 10 MOST VARIABLE PROTEINS (by Coefficient of Variation)
#Round only the numeric columns, keep Protein column as-is
high_variable_display <- high_variable[, c("Protein", "Mean_Abundance", "CV", "Missing_Count")]
high_variable_display[, c("Mean_Abundance", "CV")] <- round(high_variable_display[, c("Mean_Abundance", "CV")], 3)
high_variable_display[, "Missing_Count"] <- round(high_variable_display[, "Missing_Count"], 0) # Missing count should be integer
print(high_variable_display)
## Protein Mean_Abundance CV Missing_Count
## Q92786_PROX1 Q92786_PROX1 1556116.300 2.066 3
## Q9H0Q0_CYRIA Q9H0Q0_CYRIA 58128.832 1.979 3
## Q9Y2I6_NINL Q9Y2I6_NINL 91156.117 1.919 4
## Q5BJH7_YIF1B Q5BJH7_YIF1B 521314.714 1.774 1
## P02649_APOE P02649_APOE 333324.343 1.686 1
## Q86Y13_DZIP3 Q86Y13_DZIP3 23110.778 1.686 1
## Q8IUZ5_PHYKPL Q8IUZ5_PHYKPL 82381.637 1.671 0
## Q9BTV6_DPH7 Q9BTV6_DPH7 494644.600 1.650 3
## Q53H80_AKIRIN2 Q53H80_AKIRIN2 78096.320 1.576 3
## O43529_CHST10 O43529_CHST10 6505.818 1.556 2
# Visualization of CV distribution
hist(protein_stats$CV[!is.infinite(protein_stats$CV)],
breaks = 50, main = "Distribution of Coefficient of Variation",
xlab = "CV", ylab = "Frequency",
col = "lightblue", border = "darkblue")
# Sample Correlation Matrix
# Calculate correlation matrix
cor_matrix <- cor(y, use = "complete.obs")
cat("\nSAMPLE CORRELATION MATRIX\n")
##
## SAMPLE CORRELATION MATRIX
print(round(cor_matrix, 3))
## Chrom_01 Chrom_02 Chrom_03 Chrom_04 Sol_01 Sol_02 Sol_03 Sol_04
## Chrom_01 1.000 0.966 0.982 0.983 0.322 0.317 0.330 0.319
## Chrom_02 0.966 1.000 0.994 0.989 0.327 0.324 0.336 0.326
## Chrom_03 0.982 0.994 1.000 0.997 0.327 0.323 0.334 0.324
## Chrom_04 0.983 0.989 0.997 1.000 0.314 0.311 0.321 0.312
## Sol_01 0.322 0.327 0.327 0.314 1.000 0.990 0.993 0.990
## Sol_02 0.317 0.324 0.323 0.311 0.990 1.000 0.991 0.991
## Sol_03 0.330 0.336 0.334 0.321 0.993 0.991 1.000 0.993
## Sol_04 0.319 0.326 0.324 0.312 0.990 0.991 0.993 1.000
# Identify sample groups
chrom_samples <- grepl("Chrom", colnames(y))
sol_samples <- grepl("Sol", colnames(y))
# Calculate group-wise correlations
chrom_cor <- cor_matrix[chrom_samples, chrom_samples]
sol_cor <- cor_matrix[sol_samples, sol_samples]
between_cor <- cor_matrix[chrom_samples, sol_samples]
cat("\nCORRELATION SUMMARY\n")
##
## CORRELATION SUMMARY
cat("Average within-Chromatin correlation:", round(mean(chrom_cor[upper.tri(chrom_cor)]), 3), "\n")
## Average within-Chromatin correlation: 0.985
cat("Average within-Soluble correlation:", round(mean(sol_cor[upper.tri(sol_cor)]), 3), "\n")
## Average within-Soluble correlation: 0.991
cat("Average between-condition correlation:", round(mean(between_cor), 3), "\n")
## Average between-condition correlation: 0.323
# Simple correlation heatmap using base R
heatmap(cor_matrix, main = "Sample Correlation Heatmap",
col = heat.colors(100), margins = c(8, 8))
# Data Quality Assessment - Define z before PCA
# Filter proteins with fewer missing values (e.g., less than 3 missing)
z <- y[which(nas < 3), ]
cat("\nDATA QUALITY ASSESSMENT\n")
##
## DATA QUALITY ASSESSMENT
cat("Original data:", nrow(y), "proteins\n")
## Original data: 9387 proteins
cat("After filtering (< 3 missing values):", nrow(z), "proteins\n")
## After filtering (< 3 missing values): 8378 proteins
cat("Proteins retained:", round(nrow(z)/nrow(y) * 100, 1), "%\n")
## Proteins retained: 89.3 %
#NORMALIZATION PIPELINE
cat("\nNORMALIZATION PIPELINE\n")
##
## NORMALIZATION PIPELINE
#Load required libraries
library(limma)
#Handle remaining missing values in filtered data
z_complete <- z
for(i in 1:ncol(z_complete)) {
if(any(is.na(z_complete[,i]))) {
min_val <- min(z_complete[,i], na.rm = TRUE)
z_complete[is.na(z_complete[,i]), i] <- min_val * 0.1
}
}
cat("Missing values after imputation:", sum(is.na(z_complete)), "\n")
## Missing values after imputation: 0
#Log10 transformation
z_log10 <- log10(z_complete + 1)
cat("Log10 transformation completed\n")
## Log10 transformation completed
#Between-array normalization
z_normalized <- normalizeBetweenArrays(z_log10, method = "quantile", targets = NULL, cyclic.method = "fast")
cat("Between-array normalization completed\n")
## Between-array normalization completed
cat("Final normalized data dimensions:", nrow(z_normalized), "x", ncol(z_normalized), "\n")
## Final normalized data dimensions: 8378 x 8
# NORMALIZATION VALIDATION
cat("\nNORMALIZATION VALIDATION\n")
##
## NORMALIZATION VALIDATION
# Before and after comparison
cat("Raw data range:", round(range(z_complete), 0), "\n")
## Raw data range: 44 2093610000
cat("Normalized data range:", round(range(z_normalized), 2), "\n")
## Normalized data range: 1.86 8.93
# Sample median variability
raw_medians <- apply(z_complete, 2, median)
norm_medians <- apply(z_normalized, 2, median)
cat("CV of sample medians (raw):", round(sd(raw_medians)/mean(raw_medians), 3), "\n")
## CV of sample medians (raw): 0.248
cat("CV of sample medians (normalized):", round(sd(norm_medians)/mean(norm_medians), 3), "\n")
## CV of sample medians (normalized): 0
# Correlation preservation
cor_raw <- cor(z_complete)
cor_norm <- cor(z_normalized)
chrom_idx <- grepl("Chrom", colnames(z_normalized))
sol_idx <- grepl("Sol", colnames(z_normalized))
cat("Within-Chromatin correlation (normalized):",
round(mean(cor_norm[chrom_idx, chrom_idx][upper.tri(cor_norm[chrom_idx, chrom_idx])]), 3), "\n")
## Within-Chromatin correlation (normalized): 0.822
cat("Within-Soluble correlation (normalized):",
round(mean(cor_norm[sol_idx, sol_idx][upper.tri(cor_norm[sol_idx, sol_idx])]), 3), "\n")
## Within-Soluble correlation (normalized): 0.853
# VISUALIZATION COMPARISON
par(mfrow = c(2, 2), mar = c(6, 4, 3, 1))
# Raw data
boxplot(log10(z_complete + 1), las = 2, main = "Raw Data (Log10 scale)",
cex.axis = 0.6, col = rainbow(ncol(z_complete)))
# Log10 transformed
boxplot(z_log10, las = 2, main = "Log10 Transformed",
cex.axis = 0.6, col = rainbow(ncol(z_log10)))
# Final normalized(quantile)
boxplot(z_normalized, las = 2, main = "Final Normalized",
cex.axis = 0.6, col = rainbow(ncol(z_normalized)))
par(mfrow = c(1, 1))
# Sample completeness
sample_completeness <- (1 - missing_per_sample_pct/100) * 100
cat("\nSample completeness (%):\n")
##
## Sample completeness (%):
for(i in 1:length(sample_completeness)) {
cat(names(sample_completeness)[i], ":", round(sample_completeness[i], 1), "%\n")
}
## Chrom_01 : 91 %
## Chrom_02 : 91.9 %
## Chrom_03 : 93.3 %
## Chrom_04 : 92.9 %
## Sol_01 : 93.4 %
## Sol_02 : 92.3 %
## Sol_03 : 92.4 %
## Sol_04 : 92.4 %
# PCA Analysis using pcaMethods
library(pcaMethods)
cat("\nPCA ANALYSIS using pcaMethods\n")
##
## PCA ANALYSIS using pcaMethods
# Use filtered data (z) with fewer missing values
# Transpose data so samples are rows (pcaMethods expects this format)
z_t <- t(z)
# Perform PPCA (handles missing values)
pc <- pca(z_t, nPcs = 3, method = "ppca")
# Display PCA results
print(pc)
## ppca calculated PCA
## Importance of component(s):
## PC1 PC2 PC3
## R2 0.9704 0.02338 0.004566
## Cumulative R2 0.9704 0.99376 0.998329
## 8378 Variables
## 8 Samples
## 1275 NAs ( 1.902 %)
## 3 Calculated component(s)
## Data was mean centered before running PCA
## Data was NOT scaled before running PCA
## Scores structure:
## [1] 8 3
## Loadings structure:
## [1] 8378 3
cat("\nPCA Summary:\n")
##
## PCA Summary:
cat("Number of components:", pc@nPcs, "\n")
## Number of components: 3
cat("Method used:", pc@method, "\n")
## Method used: ppca
cat("Variance explained by each PC:\n")
## Variance explained by each PC:
print(round(pc@R2, 3))
## [1] 0.970 0.023 0.005
cat("\nCumulative variance explained:\n")
##
## Cumulative variance explained:
print(round(pc@R2cum, 3))
## [1] 0.970 0.994 0.998
# Scores plot (samples)
par(mar = c(5,4,3,1))
slplot(pc, main = "PCA Scores Plot", cex = 0.8, cex.axis = 0.8)
# Manual scores plot with sample group colors
scores <- pc@scores
chrom_color <- ifelse(grepl("Chrom", rownames(scores)), "red", "blue")
plot(scores[,1], scores[,2], col = chrom_color, pch = 19, cex = 1.5,
main = "PCA Scores - PC1 vs PC2",
xlab = paste0("PC1 (", round(pc@R2[1]*100, 1), "% variance)"),
ylab = paste0("PC2 (", round(pc@R2[2]*100, 1), "% variance)"))
# Add sample labels
text(scores[,1], scores[,2], labels = rownames(scores), pos = 3, cex = 0.7)
# Add legend
legend("topright", legend = c("Chromatin", "Soluble"),
col = c("red", "blue"), pch = 19, cex = 0.8)