Protein Signatures as Biomarkers for Cardiovascular Disease in HIV Patients

Intro

Here we are analysing samples obtained before patients had a CVD event. We have case samples from patients who later had a CVD event and a control groups which in the follow up period did not have a CVD event.

Load libraries

library("limma")
library("mitch")

library("PLSDAbatch")

library("tibble")
library("tidyr")
library("readr")
library("dplyr")
library("ggplot2")
library("pheatmap")
library("janitor")
library("kableExtra")
library("eulerr")
library("beeswarm")

Load files and clean the data tables

These files were generated by FragPipe.

plexA <- read_tsv("abundance_protein_MD_plexA.tsv")

## Rows: 963 Columns: 27
## ── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (8): Index, Gene, Protein, Protein ID, Entry Name, Protein Description,...
## dbl (19): NumberPSM, MaxPepProb, ReferenceIntensity, _126, _127N, _127C, _12...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

plexB <- read_tsv("abundance_protein_MD_plexB.tsv")

## Rows: 962 Columns: 27
## ── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (8): Index, Gene, Protein, Protein ID, Entry Name, Protein Description,...
## dbl (19): NumberPSM, MaxPepProb, ReferenceIntensity, _126, _127N, _127C, _12...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

plexA <- janitor::clean_names(plexA)
plexB <- janitor::clean_names(plexB)

Extract out the protein quantifications.

plexA <- dplyr::select(plexA, protein, gene,
                       x126, x127n, x127c, x128n, x128c,
                       x129n, x129c, x130n, x130c,
                       x131n, x131c, x132n)

plexB <- dplyr::select(plexB, protein, gene,
                       x126, x127n, x127c, x128n, x128c,
                       x129n, x129c, x130n, x130c,
                       x131n, x131c, x132n)

Rename TMT channels to sample IDs.

annotationA <- c(
  x126="19191", x127n="34629", x127c="34862",
  x128n="40431", x128c="40475", x129n="35038",
  x129c="34866", x130n="41436", x130c="41383",
  x131n="40316", x131c="3746", x132n="102953"
)

annotationB <- c(
  x126="40797", x127n="4177", x127c="103376",
  x128n="47542", x128c="47442", x129n="42977",
  x129c="43081", x130n="113835", x130c="113856",
  x131n="19584", x131c="118117", x132n="118008"
)

colnames(plexA)[match(names(annotationA), colnames(plexA))] <- annotationA
colnames(plexB)[match(names(annotationB), colnames(plexB))] <- annotationB

Merge plexes.

merged <- full_join(plexA, plexB, by=c("protein","gene"))
head(merged) |>
    kbl(caption = "Example of merged data") |>
    kable_paper("hover", full_width = F)

Example of merged data
protein	gene	19191	34629	34862	40431	40475	35038	34866	41436	41383	40316	3746	102953	40797	4177	103376	47542	47442	42977	43081	113835	113856	19584	118117	118008
sp\|A0A075B6H7\|KV37_HUMAN	IGKV3-7	25.20197	25.39573	25.43256	26.05771	25.05397	25.57650	25.65469	25.37295	25.65501	25.64185	25.43883	25.33301	30.46372	30.34873	30.21146	30.78558	30.36076	30.19026	30.28124	30.56661	29.76611	30.72821	30.20665	30.00076
sp\|A0A075B6H9\|LV469_HUMAN	IGLV4-69	28.95035	29.82285	29.61003	29.53702	29.44966	28.84860	29.91425	29.41320	29.00432	28.83877	28.59919	29.14621	22.40007	21.18373	21.26245	21.90925	22.59531	21.24953	21.98843	21.22654	21.40015	22.00258	21.02142	21.46372
sp\|A0A075B6I0\|LV861_HUMAN	IGLV8-61	27.53609	28.54878	26.58017	27.43782	27.43139	26.83964	27.26845	26.57449	28.21526	26.57432	26.75554	NA	25.58149	26.12823	26.52763	25.72071	26.59545	25.47399	26.13838	25.35584	25.89193	25.69072	25.72252	26.67365
sp\|A0A075B6I1\|LV460_HUMAN	IGLV4-60	21.13980	NA	20.95866	21.90418	19.86279	21.18880	21.20105	20.12885	20.95064	21.09826	20.35881	NA	27.79615	26.93927	25.90896	26.48209	28.84106	26.96525	28.56674	27.15404	27.20759	27.49989	27.18150	27.81810
sp\|A0A075B6J9\|LV218_HUMAN	IGLV2-18	22.38769	25.30212	25.12898	24.02298	23.89291	24.49498	24.70158	24.32961	24.30966	24.58185	23.43995	23.82350	27.34292	26.35223	24.59685	26.37291	25.30866	25.64357	26.10869	25.59368	27.73543	25.64763	26.16884	26.54800
sp\|A0A075B6K0\|LV316_HUMAN	IGLV3-16	21.37483	22.29617	21.39598	21.69973	21.51293	23.78554	22.11370	22.61187	20.93010	22.14908	20.87886	20.41397	20.66449	22.03552	22.04962	21.69365	21.55941	21.37521	22.05910	22.66605	21.11006	20.78442	21.76661	21.22903

dim(merged)

## [1] 1186   26

Expression matrix for Merged plexes including Plex A and Plex B.

expr_combined <- as.data.frame(merged)
protein_info_combined <- expr_combined[, c("protein", "gene")]
rownames(expr_combined) <- expr_combined$protein
expr_combined <- expr_combined[, !(colnames(expr_combined) %in% c("protein","gene"))]
expr_combined <- as.matrix(expr_combined)
storage.mode(expr_combined) <- "numeric"

expr_A <- as.data.frame(plexA)
protein_info_A <- expr_A[, c("protein", "gene")]
rownames(expr_A) <- expr_A$protein
expr_A <- expr_A[, !(colnames(expr_A) %in% c("protein","gene"))]
expr_A <- as.matrix(expr_A)
storage.mode(expr_A) <- "numeric"
head(expr_A) |>
    kbl(caption = "Example of Plex A data") |>
    kable_paper("hover", full_width = F)

Example of Plex A data
	19191	34629	34862	40431	40475	35038	34866	41436	41383	40316	3746	102953
sp\|A0A075B6H7\|KV37_HUMAN	25.20197	25.39573	25.43256	26.05771	25.05397	25.57650	25.65469	25.37295	25.65501	25.64185	25.43883	25.33301
sp\|A0A075B6H9\|LV469_HUMAN	28.95035	29.82285	29.61003	29.53702	29.44966	28.84860	29.91425	29.41320	29.00432	28.83877	28.59919	29.14621
sp\|A0A075B6I0\|LV861_HUMAN	27.53609	28.54878	26.58017	27.43782	27.43139	26.83964	27.26845	26.57449	28.21526	26.57432	26.75554	NA
sp\|A0A075B6I1\|LV460_HUMAN	21.13980	NA	20.95866	21.90418	19.86279	21.18880	21.20105	20.12885	20.95064	21.09826	20.35881	NA
sp\|A0A075B6J9\|LV218_HUMAN	22.38769	25.30212	25.12898	24.02298	23.89291	24.49498	24.70158	24.32961	24.30966	24.58185	23.43995	23.82350
sp\|A0A075B6K0\|LV316_HUMAN	21.37483	22.29617	21.39598	21.69973	21.51293	23.78554	22.11370	22.61187	20.93010	22.14908	20.87886	20.41397

dim(expr_A)

## [1] 963  12

expr_B <- as.data.frame(plexB)
protein_info_B <- expr_B[, c("protein", "gene")]
rownames(expr_B) <- expr_B$protein
expr_B <- expr_B[, !(colnames(expr_B) %in% c("protein","gene"))]
expr_B <- as.matrix(expr_B)
storage.mode(expr_B) <- "numeric"
head(expr_B)|>
    kbl(caption = "Example of Plex B data") |>
    kable_paper("hover", full_width = F)

Example of Plex B data
	40797	4177	103376	47542	47442	42977	43081	113835	113856	19584	118117	118008
sp\|A0A075B6H7\|KV37_HUMAN	30.46372	30.34873	30.21146	30.78558	30.36076	30.19026	30.28124	30.56661	29.76611	30.72821	30.20665	30.00076
sp\|A0A075B6H9\|LV469_HUMAN	22.40007	21.18373	21.26245	21.90925	22.59531	21.24953	21.98843	21.22654	21.40015	22.00258	21.02142	21.46372
sp\|A0A075B6I0\|LV861_HUMAN	25.58149	26.12823	26.52763	25.72071	26.59545	25.47399	26.13838	25.35584	25.89193	25.69072	25.72252	26.67365
sp\|A0A075B6I1\|LV460_HUMAN	27.79615	26.93927	25.90896	26.48209	28.84106	26.96525	28.56674	27.15404	27.20759	27.49989	27.18150	27.81810
sp\|A0A075B6I4\|LVX54_HUMAN	14.60397	13.68839	NA	13.41395	17.83912	14.71371	15.67450	14.14982	13.30315	NA	13.41029	14.51386
sp\|A0A075B6J9\|LV218_HUMAN	27.34292	26.35223	24.59685	26.37291	25.30866	25.64357	26.10869	25.59368	27.73543	25.64763	26.16884	26.54800

dim(expr_B)

## [1] 962  12

Filter proteins

Keep proteins which are present in >=50% of samples.

keep_combined <- rowSums(!is.na(expr_combined)) >= ncol(expr_combined)*0.5
expr_combined_filt <- expr_combined[keep_combined, ]
dim(expr_combined_filt)

## [1] 904  24

keep_A <- rowSums(!is.na(expr_A)) >= ncol(expr_A)*0.5
expr_A_filt <- expr_A[keep_A, ]
dim(expr_A_filt)

## [1] 873  12

keep_B <- rowSums(!is.na(expr_B)) >= ncol(expr_B)*0.5
expr_B_filt <- expr_B[keep_B, ]
dim(expr_B_filt)

## [1] 854  12

QC before normalization

qc_sample_combined <- data.frame(
  Sample = colnames(expr_combined_filt),
  Detected_Proteins = colSums(!is.na(expr_combined_filt)),
  Missing_Percent = colMeans(is.na(expr_combined_filt)) * 100,
  Median_Intensity = apply(expr_combined_filt, 2, median, na.rm = TRUE),
  SD_Intensity = apply(expr_combined_filt, 2, sd, na.rm = TRUE)
)

# Assign plex
plexA_samples <- c(
  "19191","34629","34862","40431","40475","35038",
  "34866","41436","41383","40316","3746","102953"
)

qc_sample_combined$Plex <- ifelse(qc_sample_combined$Sample %in% plexA_samples, "A", "B")

# QC summary by plex
qc_plex_combined <- qc_sample_combined %>%
  group_by(Plex) %>%
  summarise(
    N_Samples = n(),
    Mean_Detected_Proteins = mean(Detected_Proteins),
    Mean_Missing_Percent = mean(Missing_Percent),
    Mean_Median_Intensity = mean(Median_Intensity),
    .groups = "drop"
  )

qc_sample_combined |>
    kbl(caption = "QC on combined data") |>
    kable_paper("hover", full_width = F)

QC on combined data
	Sample	Detected_Proteins	Missing_Percent	Median_Intensity	SD_Intensity	Plex
19191	19191	811	10.28761	22.61437	3.608051	A
34629	34629	768	15.04425	22.52596	3.824270	A
34862	34862	792	12.38938	22.47182	3.687522	A
40431	40431	767	15.15487	22.17860	3.914548	A
40475	40475	768	15.04425	22.27699	3.916427	A
35038	35038	806	10.84071	22.40563	3.668941	A
34866	34866	805	10.95133	22.46457	3.701970	A
41436	41436	779	13.82743	22.34549	3.813249	A
41383	41383	791	12.50000	22.41082	3.736537	A
40316	40316	762	15.70796	22.20311	3.877041	A
3746	3746	797	11.83628	22.94357	3.486882	A
102953	102953	765	15.37611	22.48856	3.758113	A
40797	40797	766	15.26549	22.30942	3.775893	B
4177	4177	782	13.49558	22.93066	3.365816	B
103376	103376	714	21.01770	22.29280	3.759772	B
47542	47542	780	13.71681	22.25514	3.563151	B
47442	47442	731	19.13717	22.16957	3.792444	B
42977	42977	772	14.60177	22.38404	3.503025	B
43081	43081	776	14.15929	22.13263	3.707914	B
113835	113835	726	19.69027	22.26161	3.831329	B
113856	113856	768	15.04425	22.04823	3.841864	B
19584	19584	764	15.48673	22.35815	3.493251	B
118117	118117	727	19.57965	22.31801	3.876589	B
118008	118008	675	25.33186	22.33120	3.854068	B

qc_plex_combined |>
    kbl(caption = "Summary of Plex data QC") |>
    kable_paper("hover", full_width = F)

Summary of Plex data QC
Plex	N_Samples	Mean_Detected_Proteins	Mean_Missing_Percent	Mean_Median_Intensity
A	12	784.2500	13.24668	22.44412
B	12	748.4167	17.21055	22.31595

Now we have some QC plots. The first one is the missingness plot.

qc_long_combined <- as.data.frame(expr_combined_filt) %>%
  rownames_to_column("Protein") %>%
  pivot_longer(-Protein, names_to = "Sample", values_to = "Intensity") %>%
  filter(is.finite(Intensity))

ggplot(qc_sample_combined,
       aes(x = reorder(Sample, Missing_Percent),
           y = Missing_Percent,
           fill = Plex)) +
  geom_col() +
  coord_flip() +
  theme_bw() +
  labs(title = "Missingness per Sample",
       x = "Sample", y = "% Missing")

Detected proteins plot.

ggplot(qc_sample_combined,
       aes(x = reorder(Sample, Detected_Proteins),
           y = Detected_Proteins,
           fill = Plex)) +
  geom_col() +
  coord_flip() +
  theme_bw() +
  labs(title = "Detected Proteins per Sample",
       x = "Sample", y = "Detected Proteins")

Density plot.

ggplot(qc_long_combined,
       aes(x = Intensity, color = Sample)) +
  geom_density(linewidth = 0.8) +
  theme_bw() +
  labs(title = "Intensity Distribution per Sample",
       x = "Intensity", y = "Density")

Plex-level comparison.

ggplot(qc_plex_combined,
       aes(x = Plex, y = Mean_Detected_Proteins, fill = Plex)) +
  geom_col() +
  theme_bw() +
  labs(title = "Mean Detected Proteins by Plex",
       x = "Plex", y = "Mean Detected Proteins")

Correlation heatmap

A correlation heatmap shows the similarity between samples.

It shows that the samples are clustered into clusters based on plex A/B.

cor_pre_combined <- cor(expr_combined_filt, use = "pairwise.complete.obs")

pheatmap(cor_pre_combined,
         main = "Sample Correlation Heatmap (Pre-normalization)")

Quantile normalization and log2 transform

This needs to be run for combined dataset and separate plexes as well.

expr_combined_norm <- normalizeBetweenArrays(expr_combined_filt, method = "quantile")
expr_combined_log <- log2(expr_combined_norm + 1)
boxplot(expr_combined_log, main = "Post-normalization for both plexes", las = 2)

# Plex A
expr_A_norm <- normalizeBetweenArrays(expr_A_filt, method = "quantile")
expr_A_log <- log2(expr_A_norm + 1)
boxplot(expr_A_log, main = "Post-normalization for Plex A", las = 2)

# plex B
expr_B_norm <- normalizeBetweenArrays(expr_B_filt, method = "quantile")
expr_B_log <- log2(expr_B_norm + 1)
boxplot(expr_B_log, main = "Post-normalization for Plex B", las = 2)

QC after normalization

After normalisation, the signal distributions are uniform, but there’s a shoulder.

The correlation structure is retained as expected.

qc_long_norm_combined <- as.data.frame(expr_combined_log) %>%
  rownames_to_column("Protein") %>%
  pivot_longer(-Protein, names_to = "Sample", values_to = "Intensity") %>%
  filter(is.finite(Intensity))

ggplot(qc_long_norm_combined,
       aes(x = Intensity, color = Sample)) +
  geom_density(linewidth = 0.8) +
  theme_bw() +
  labs(title = "Post-normalization Intensity Distribution (Combined)",
       x = "Log2 Intensity",
       y = "Density")

cor_post_combined <- cor(expr_combined_log, use = "pairwise.complete.obs")

pheatmap(cor_post_combined,
         main = "Sample Correlation Heatmap (Post-normalization)")

Impute missing values

Here we are using the median method for imputation of missing values.

# Combine Plexes
expr_combined_imp <- expr_combined_log
for(i in 1:ncol(expr_combined_imp)){
  expr_combined_imp[is.na(expr_combined_imp[,i]), i] <-
    median(expr_combined_imp[,i], na.rm = TRUE)
}

# Plex A
expr_A_imp <- expr_A_log

for(i in 1:ncol(expr_A_imp)){
  expr_A_imp[is.na(expr_A_imp[,i]), i] <-
    median(expr_A_imp[,i], na.rm = TRUE)
}

# Plex B
expr_B_imp <- expr_B_log

for(i in 1:ncol(expr_B_imp)){
  expr_B_imp[is.na(expr_B_imp[,i]), i] <-
    median(expr_B_imp[,i], na.rm = TRUE)
}
sum(is.na(expr_combined_imp))

## [1] 0

sum(is.na(expr_A_imp))

## [1] 0

sum(is.na(expr_B_imp))

## [1] 0

Next, we will removing Zero Variance Proteins because they cannot be statistically significant and may cause other problems later on.

expr_combined_imp <- expr_combined_imp[apply(expr_combined_imp, 1, var) != 0, ]
expr_A_imp <- expr_A_imp[apply(expr_A_imp, 1, var) != 0, ]
expr_B_imp <- expr_B_imp[apply(expr_B_imp, 1, var) != 0, ]

Metadata Creation

# Create combined metadata from expression matrix
meta_combined <- data.frame(
  Sample = colnames(expr_combined_imp)
)

# Define Plex (based on known Plex A samples)

plexA_samples <- c(
  "19191","34629","34862","40431","40475","35038",
  "34866","41436","41383","40316","3746","102953"
)

meta_combined$Plex <- ifelse(meta_combined$Sample %in% plexA_samples,
                             "A", "B")

meta_combined$Plex <- factor(meta_combined$Plex)

# Define biological group (Case vs Control)

case_samples <- c(
  "19191","40797","34862","103376","40475","47442",
  "34866","43081","41383","113856","3746","118117"
)

meta_combined$Group <- ifelse(meta_combined$Sample %in% case_samples,
                              "Case", "Control")

meta_combined$Group <- factor(meta_combined$Group,
                              levels = c("Control","Case"))

# Final formatting

rownames(meta_combined) <- meta_combined$Sample

# Create Plex-specific metadata (CORRECT WAY)
meta_A <- meta_combined[meta_combined$Plex == "A", ]
meta_B <- meta_combined[meta_combined$Plex == "B", ]

# Align with expression matrices (CRITICAL)
meta_A <- meta_A[colnames(expr_A_imp), ]
meta_B <- meta_B[colnames(expr_B_imp), ]

# Keep only needed columns
meta_A <- meta_A[, c("Sample","Group")]
meta_B <- meta_B[, c("Sample","Group")]

# SANITY CHECKS (DO NOT SKIP)
# Alignment
all(rownames(meta_A) == colnames(expr_A_imp))

## [1] TRUE

all(rownames(meta_B) == colnames(expr_B_imp))

## [1] TRUE

all(rownames(meta_combined) == colnames(expr_combined_imp))

## [1] TRUE

# Distribution checks
table(meta_combined$Plex, meta_combined$Group)

##    
##     Control Case
##   A       6    6
##   B       6    6

table(meta_A$Group)

## 
## Control    Case 
##       6       6

table(meta_B$Group)

## 
## Control    Case 
##       6       6

PCA Before Batch Correction for combined plexes

pca_pre <- prcomp(t(expr_combined_imp), center = TRUE, scale. = TRUE)

pca_pre_df <- data.frame(
  PC1 = pca_pre$x[,1],
  PC2 = pca_pre$x[,2],
  Sample = colnames(expr_combined_imp),
  Plex = meta_combined$Plex,
  Group = meta_combined$Group
)

ggplot(pca_pre_df, aes(x = PC1, y = PC2, color = Plex, shape = Group)) +
  geom_point(size = 4) +
  geom_text(aes(label = Sample), vjust = -0.5, size = 3) +
  theme_bw() +
  labs(title = "PCA Before Batch Correction",
       x = "PC1",
       y = "PC2")

Batch correction

expr_combined_corrected <- removeBatchEffect(
  expr_combined_imp,
  batch = meta_combined$Plex,
  design = model.matrix(~ Group, data = meta_combined)
)

# Quick check
head(expr_combined_corrected[,1:5])

##                              19191    34629    34862    40431    40475
## sp|A0A075B6H7|KV37_HUMAN  4.833346 4.830965 4.839526 4.871850 4.827196
## sp|A0A075B6H9|LV469_HUMAN 4.702996 4.718612 4.721695 4.712016 4.715457
## sp|A0A075B6I0|LV861_HUMAN 4.825501 4.858319 4.764883 4.807596 4.800510
## sp|A0A075B6I1|LV460_HUMAN 4.638888 4.726509 4.627839 4.711256 4.579935
## sp|A0A075B6J9|LV218_HUMAN 4.595879 4.759264 4.755084 4.708453 4.696748
## sp|A0A075B6K0|LV316_HUMAN 4.471802 4.529905 4.479845 4.517084 4.502633

# Dimensions unchanged
dim(expr_combined_corrected)

## [1] 904  24

PCA On Combined Plexes after removing Batch effect

pca_post <- prcomp(t(expr_combined_corrected), center = TRUE, scale. = TRUE)

pca_post_df <- data.frame(
  PC1 = pca_post$x[,1],
  PC2 = pca_post$x[,2],
  Sample = colnames(expr_combined_corrected),
  Plex = meta_combined$Plex,
  Group = meta_combined$Group
)

ggplot(pca_post_df, aes(x = PC1, y = PC2, color = Group, shape = Plex)) +
  geom_point(size = 4) +
  geom_text(aes(label = Sample), vjust = -0.5, size = 3) +
  theme_bw() +
  labs(title = "PCA After Batch Correction",
       x = "PC1",
       y = "PC2")

Differential Expression Analysis

Combined plexes

First analysis includes batch correction at the limma step

design_combined <- model.matrix(~ Plex + Group, data = meta_combined)

fit_combined <- lmFit(expr_combined_imp, design_combined)
fit_combined <- eBayes(fit_combined)
results_combined <- topTable(
  fit_combined,
  coef = "GroupCase",
  number = Inf,
  sort.by = "P"
)
results_combined$Protein <- rownames(results_combined)
# Add gene annotation
results_combined <- merge(
  results_combined,
  protein_info_combined,
  by.x = "Protein",
  by.y = "protein",
  all.x = TRUE
)

results_combined <- results_combined[order(results_combined$P.Value),]

head(results_combined) |>
    kbl(caption = "Top ranked proteins by p-value") |>
    kable_paper("hover", full_width = F)

Top ranked proteins by p-value
	Protein	logFC	AveExpr	t	P.Value	adj.P.Val	B	gene
751	sp\|Q27J81\|INF2_HUMAN	-0.1169760	4.464969	-4.867502	0.0000637	0.0576017	1.6695010	INF2
507	sp\|P28482\|MK01_HUMAN	0.1460155	4.370219	4.197978	0.0003398	0.1535753	0.0466474	MAPK1
96	sp\|O75356\|ENTP5_HUMAN	0.0411850	4.456341	3.137454	0.0045940	0.5645256	-2.4416598	ENTPD5
571	sp\|P49407\|ARRB1_HUMAN	0.0752710	4.480139	3.119861	0.0047905	0.5645256	-2.4810489	ARRB1
891	sp\|Q9Y287\|ITM2B_HUMAN	-0.0322520	4.425711	-3.009591	0.0062189	0.5645256	-2.7258946	ITM2B
609	sp\|P59998\|ARPC4_HUMAN	-0.0810547	4.524232	-2.956053	0.0070519	0.5645256	-2.8434299	ARPC4

Batch correction was done separately before limma.

design_combined <- model.matrix(~ Group, data = meta_combined)

fit_combined <- lmFit(expr_combined_corrected, design_combined)
fit_combined <- eBayes(fit_combined)
results_combined <- topTable(
  fit_combined,
  coef = "GroupCase",
  number = Inf,
  sort.by = "P"
)
results_combined$Protein <- rownames(results_combined)
# Add gene annotation
results_combined <- merge(
  results_combined,
  protein_info_combined,
  by.x = "Protein",
  by.y = "protein",
  all.x = TRUE
)

results_combined <- results_combined[order(results_combined$P.Value),]

head(results_combined) |>
    kbl(caption = "Top ranked proteins by p-value") |>
    kable_paper("hover", full_width = F)

Top ranked proteins by p-value
	Protein	logFC	AveExpr	t	P.Value	adj.P.Val	B	gene
751	sp\|Q27J81\|INF2_HUMAN	-0.1169760	4.464969	-4.974535	0.0000436	0.0394328	2.0060272	INF2
507	sp\|P28482\|MK01_HUMAN	0.1460155	4.370219	4.288700	0.0002506	0.1132865	0.3085421	MAPK1
96	sp\|O75356\|ENTP5_HUMAN	0.0411850	4.456341	3.211806	0.0037135	0.5064708	-2.2717706	ENTPD5
571	sp\|P49407\|ARRB1_HUMAN	0.0752710	4.480139	3.188448	0.0039299	0.5064708	-2.3251860	ARRB1
891	sp\|Q9Y287\|ITM2B_HUMAN	-0.0322520	4.425711	-3.084598	0.0050485	0.5064708	-2.5607553	ITM2B
757	sp\|Q5JSH3\|WDR44_HUMAN	0.0271306	4.503781	3.023974	0.0058367	0.5064708	-2.6967557	WDR44

Make sure to do volcano plot, box/beeswarm plot and heatmap.

Volcano plot

sig <- subset(results_combined,adj.P.Val<0.05)

plot(results_combined$logFC, -log10(results_combined$P.Value),
  cex=0.6,pch=19,col="darkgray",
  xlab="log2 fold change",ylab="-log10(p-value)")

abline(v=0,lty=2,lwd=1)
points(sig$logFC, -log10(sig$P.Value) , col="red",pch=19,cex=0.7)
text(sig$logFC+0.015, -log10(sig$P.Value), labels=sig$gene)

sig <- subset(results_combined,adj.P.Val<0.05)

plot(log10(results_combined$AveExpr),results_combined$logFC,
  cex=0.6,pch=19,col="darkgray",
  xlab="log10 basemean",ylab="log2 fold change")

abline(h=0,lty=2,lwd=1)
points(log10(sig$AveExpr),sig$logFC,col="red",pch=19,cex=0.7)
text(log10(sig$AveExpr)+0.004,sig$logFC,labels=sig$gene)

Beeswarm plot using batch corrected data.

par(mfrow=c(3,3))

null <- lapply(1:9,function(i) {
  mygene <- results_combined$gene[i]
  myprot <- results_combined$Protein[i]
  mydat <- expr_combined_corrected[which(rownames(expr_combined_corrected) == myprot),]
  mygroups <- meta_combined[match(names(mydat),meta_combined$Sample),3]
  ctrl <- mydat[which(mygroups=="Control")]
  case <- mydat[which(mygroups=="Case")]
  xl <- list("Ctrl"=ctrl,"Case"=case)
  boxplot(xl,ylab="Relative protein expression",main=mygene,col="white",cex=0)
  beeswarm(xl,add=TRUE,cex=1.5,pch=19,col="darkgray")
} )

Now look at the ratio between MAPK1 and INF2.

prota="sp|P28482|MK01_HUMAN"
protb="sp|Q27J81|INF2_HUMAN"
dat_a <- expr_combined_corrected[which(rownames(expr_combined_corrected)==prota),]
dat_b <- expr_combined_corrected[which(rownames(expr_combined_corrected)==protb),]
abrat <- dat_a / dat_b
case_samples <- meta_combined[which(meta_combined$Group=="Case"),"Sample"]
ctrl_samples <- meta_combined[which(meta_combined$Group=="Control"),"Sample"]

case_dat <- abrat[which(names(abrat) %in% case_samples)]
ctrl_dat <- abrat[which(names(abrat) %in% ctrl_samples)]

rl <- list("Ctrl"=ctrl_dat, "Case"=case_dat)
boxplot(rl,ylab="Ratio",main="Ratio MAPK1/INF2",col="white",cex=0)
beeswarm(rl,add=TRUE,cex=1.5,pch=19,col="darkgray")

t.test(case_dat,ctrl_dat)

## 
##  Welch Two Sample t-test
## 
## data:  case_dat and ctrl_dat
## t = 6.4566, df = 22, p-value = 1.701e-06
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.03962877 0.07713260
## sample estimates:
## mean of x mean of y 
## 1.0084825 0.9501018

Separate plexes

Analyse plex A and plex B.

design_A <- model.matrix(~ Group, data = meta_A)
fit_A <- lmFit(expr_A_imp, design_A)
fit_A <- eBayes(fit_A)
results_A <- topTable(
  fit_A,
  coef = "GroupCase",
  number = Inf,
  sort.by = "P"
)
results_A$Protein <- rownames(results_A)
# Add gene annotation
results_A <- merge(
  results_A,
  protein_info_A,
  by.x = "Protein",
  by.y = "protein",
  all.x = TRUE
)

# View
results_A <- results_A[order(results_A$P.Value),]

head(results_A,10) |>
    kbl(caption = "Top ranked proteins by p-value in plex A") |>
    kable_paper("hover", full_width = F)

Top ranked proteins by p-value in plex A
	Protein	logFC	AveExpr	t	P.Value	adj.P.Val	B	gene
740	sp\|Q86UF1\|TSN33_HUMAN	-0.2007621	4.396326	-4.595562	0.0006315	0.3477544	-0.0924426	TSPAN33
270	sp\|P06702\|S10A9_HUMAN	-0.1394058	4.711956	-4.203125	0.0012514	0.3477544	-0.7472006	S100A9
715	sp\|Q27J81\|INF2_HUMAN	-0.1226512	4.448334	-4.067512	0.0015918	0.3477544	-0.9777981	INF2
546	sp\|P49593\|PPM1F_HUMAN	0.1064798	4.172522	3.995491	0.0018102	0.3477544	-1.1010580	PPM1F
863	sp\|Q9Y287\|ITM2B_HUMAN	-0.0563506	4.300458	-3.616202	0.0035936	0.3477544	-1.7577260	ITM2B
253	sp\|P05109\|S10A8_HUMAN	-0.1256702	4.779495	-3.570575	0.0039059	0.3477544	-1.8373967	S100A8
63	sp\|O14974\|MYPT1_HUMAN	-0.2520287	4.360400	-3.529357	0.0042118	0.3477544	-1.9094565	PPP1R12A
335	sp\|P0DP01\|HV108_HUMAN	-0.1724802	4.563498	-3.239505	0.0071775	0.3477544	-2.4176917	IGHV1-8
587	sp\|P61106\|RAB14_HUMAN	0.0912982	4.462361	3.077161	0.0096891	0.3477544	-2.7024736	RAB14
12	sp\|A0A075B6S2\|KVD29_HUMAN	0.0737667	4.787900	3.072875	0.0097662	0.3477544	-2.7099842	IGKV2D-29

Plex B.

design_B <- model.matrix(~ Group, data = meta_B)
fit_B <- lmFit(expr_B_imp, design_B)
fit_B <- eBayes(fit_B)
results_B <- topTable(
  fit_B,
  coef = "GroupCase",
  number = Inf,
  sort.by = "P"
)
results_B$Protein <- rownames(results_B)
# Add gene annotation
results_B <- merge(
  results_B,
  protein_info_B,
  by.x = "Protein",
  by.y = "protein",
  all.x = TRUE
)

# View
results_B <- results_B[order(results_B$P.Value),]

head(results_B,10) |>
    kbl(caption = "Top ranked proteins by p-value in plex B") |>
    kable_paper("hover", full_width = F)

Top ranked proteins by p-value in plex B
	Protein	logFC	AveExpr	t	P.Value	adj.P.Val	B	gene
112	sp\|P00387\|NB5R3_HUMAN	-0.1768389	4.440961	-6.330268	0.0000379	0.0323369	2.5541250	CYB5R3
528	sp\|P40227\|TCPZ_HUMAN	-0.1501753	4.378868	-5.615484	0.0001136	0.0371781	1.5320233	CCT6A
590	sp\|P61106\|RAB14_HUMAN	-0.1788209	4.385867	-5.528161	0.0001306	0.0371781	1.4014708	RAB14
575	sp\|P55084\|ECHB_HUMAN	-0.2729338	4.251690	-5.063074	0.0002790	0.0595748	0.6855750	HADHB
282	sp\|P07203\|GPX1_HUMAN	-0.1583329	4.361424	-4.767197	0.0004594	0.0784686	0.2126430	GPX1
89	sp\|O75356\|ENTP5_HUMAN	0.0867614	4.370177	4.514696	0.0007096	0.1010049	-0.2010684	ENTPD5
827	sp\|Q9UBR2\|CATZ_HUMAN	0.0824984	4.454822	4.367655	0.0009176	0.1116510	-0.4459936	CTSZ
458	sp\|P24539\|AT5F1_HUMAN	-0.1709199	4.334835	-4.195393	0.0012440	0.1116510	-0.7363886	ATP5PB
741	sp\|Q8IYS5\|OSCAR_HUMAN	0.1121910	4.272896	4.132539	0.0013914	0.1116510	-0.8432130	OSCAR
244	sp\|P04899\|GNAI2_HUMAN	-0.1958671	4.296231	-4.072116	0.0015501	0.1116510	-0.9463140	GNAI2

Comparison of Plex A and B results

The top 50 up and down regulated proteins are selected from Plex A and B and these lists were compared.

plexA_up <- head(subset(results_A,logFC>0),50)[,1]
plexA_dn <- head(subset(results_A,logFC<0),50)[,1]
plexB_up <- head(subset(results_B,logFC>0),50)[,1]
plexB_dn <- head(subset(results_B,logFC<0),50)[,1]

v1 <- list("A up"=plexA_up, "A dn"=plexA_dn, "B up"=plexB_up, "B dn"=plexB_dn)

plot(euler(v1), quantities = TRUE)

intersect(plexA_up,plexB_up)

## character(0)

intersect(plexA_dn,plexB_dn)

## [1] "sp|Q27J81|INF2_HUMAN"

intersect(plexA_up,plexB_dn)

## [1] "sp|P61106|RAB14_HUMAN" "sp|Q9UBW5|BIN2_HUMAN"  "sp|Q3ZCW2|LEGL_HUMAN" 
## [4] "sp|P14770|GPIX_HUMAN"  "sp|Q15555|MARE2_HUMAN" "sp|Q86UX7|URP2_HUMAN" 
## [7] "sp|Q14247|SRC8_HUMAN"

intersect(plexB_up,plexA_dn)

## [1] "sp|P18428|LBP_HUMAN"   "sp|Q5D862|FILA2_HUMAN" "sp|P01766|HV313_HUMAN"

Ranking of proteins by logFC

results_A$score <- sign(results_A$logFC) * -log10(results_A$P.Value)
results_B$score <- sign(results_B$logFC) * -log10(results_B$P.Value)

results_m <- merge(results_A,results_B,by="Protein")

ranks_m <- results_m[,c("Protein","score.x","score.y")]
rownames(ranks_m) <- ranks_m$Protein ; ranks_m$Protein=NULL
colnames(ranks_m) <- c("Plex A","Plex B")

plot(ranks_m)

mylm <- lm(ranks_m$`Plex B` ~ ranks_m$`Plex A`)
abline(mylm,lwd=2,lty=2,col="red")
COR=signif(cor(ranks_m),3)[2,1]
COR

## [1] -0.551

mtext(paste("Pearson correlation=",COR))

Cross batch analysis

Look at the proteins that are differentially expressed across batches.

IGHV and IHKV are over-represented.

Would be interesting to look at the composition of these proteins to understand the batch effect better.

design_batch <- model.matrix(~ Plex , data = meta_combined)

fit_batch <- lmFit(expr_combined_imp, design_batch)

fit_batch <- eBayes(fit_batch)

summary(decideTests(fit_batch))

##        (Intercept) PlexB
## Down             0   299
## NotSig           0   331
## Up             904   274

results_batch <- topTable(
  fit_batch,
  coef = ncol(summary(decideTests(fit_batch))),
  number = Inf,
  sort.by = "P"
)

results_batch$Protein <- rownames(results_batch)

# Add gene annotation
results_batch <- merge(
  results_batch,
  protein_info_combined,
  by.x = "Protein",
  by.y = "protein",
  all.x = TRUE
)

results_batch <- results_batch[order(results_batch$P.Value),]

head(results_batch,10) |>
    kbl(caption = "Proteins related to Plex A/B differences.") |>
    kable_paper("hover", full_width = F)

Proteins related to Plex A/B differences.
	Protein	logFC	AveExpr	t	B	gene
713	sp\|Q14204\|DYHC1_HUMAN	-0.4230034	4.335732	-39.76442	43.95389	DYNC1H1
861	sp\|Q9NY33\|DPP3_HUMAN	-0.3047488	4.394859	-34.77114	40.80698	DPP3
2	sp\|A0A075B6H9\|LV469_HUMAN	-0.4129798	4.707875	-32.80910	39.43752	IGLV4-69
1	sp\|A0A075B6H7\|KV37_HUMAN	0.2517374	4.847043	32.61892	39.30028	IGKV3-7
37	sp\|A0A0C4DH31\|HV118_HUMAN	-0.2425323	4.820578	-32.13371	38.94635	IGHV1-18
174	sp\|P01814\|HV270_HUMAN	0.2878827	4.691175	31.74363	38.65779	IGHV2-70
215	sp\|P02765\|FETUA_HUMAN	-0.2652081	4.856746	-30.92674	38.04153	AHSG
538	sp\|P35542\|SAA4_HUMAN	0.2109984	4.810501	29.13942	36.63325	SAA4
100	sp\|O75822\|EIF3J_HUMAN	0.1900294	4.642248	28.85470	36.40087	EIF3J
23	sp\|A0A0B4J1U7\|HV601_HUMAN	0.3305690	4.712518	28.55146	36.15082	IGHV6-1

head(subset(results_batch,logFC>0),20) |>
    kbl(caption = "Proteins with higher expression in Plex B.") |>
    kable_paper("hover", full_width = F)

Proteins with higher expression in Plex B.
	Protein	logFC	AveExpr	t	B	gene
1	sp\|A0A075B6H7\|KV37_HUMAN	0.2517374	4.847043	32.61892	39.30028	IGKV3-7
174	sp\|P01814\|HV270_HUMAN	0.2878827	4.691175	31.74363	38.65779	IGHV2-70
538	sp\|P35542\|SAA4_HUMAN	0.2109984	4.810501	29.13942	36.63325	SAA4
100	sp\|O75822\|EIF3J_HUMAN	0.1900294	4.642248	28.85470	36.40087	EIF3J
23	sp\|A0A0B4J1U7\|HV601_HUMAN	0.3305690	4.712518	28.55146	36.15082	IGHV6-1
21	sp\|A0A0A0MT89\|KJ01_HUMAN	0.5697265	4.262369	28.24926	35.89898	IGKJ1
155	sp\|P01619\|KV320_HUMAN	0.2411374	4.845132	27.88502	35.59180	IGKV3-20
846	sp\|Q9HBR0\|S38AA_HUMAN	0.2405421	4.426961	27.06020	34.88120	SLC38A10
374	sp\|P11047\|LAMC1_HUMAN	0.2874736	4.403495	27.03158	34.85616	LAMC1
12	sp\|A0A075B6R9\|KVD24_HUMAN	0.2853694	4.626380	24.22532	32.26496	IGKV2D-24
294	sp\|P07225\|PROS_HUMAN	0.1403271	4.861195	24.21825	32.25807	PROS1
780	sp\|Q8IWV2\|CNTN4_HUMAN	0.3482763	4.373094	23.43664	31.48412	CNTN4
77	sp\|O15394\|NCAM2_HUMAN	0.2564100	4.419027	23.04011	31.08198	NCAM2
556	sp\|P41222\|PTGDS_HUMAN	0.3360664	4.526600	22.39538	30.41383	PTGDS
789	sp\|Q8NBS9\|TXND5_HUMAN	0.2134845	4.440490	21.43401	29.38300	TXNDC5
720	sp\|Q14623\|IHH_HUMAN	0.2363269	4.429068	21.14677	29.06660	IHH
806	sp\|Q93070\|NAR4_HUMAN	0.2486540	4.422905	21.13923	29.05824	ART4
748	sp\|Q16775\|GLO2_HUMAN	0.3274800	4.383492	21.12337	29.04065	HAGH
828	sp\|Q9BRK5\|CAB45_HUMAN	0.2748577	4.409803	20.49463	28.33314	SDF4
81	sp\|O43242\|PSMD3_HUMAN	0.2209955	4.436734	19.83966	27.57449	PSMD3

head(subset(results_batch,logFC<0),20) |>
    kbl(caption = "Proteins with higher expression in Plex A.") |>
    kable_paper("hover", full_width = F)

Proteins with higher expression in Plex A.
	Protein	logFC	AveExpr	t	B	gene
713	sp\|Q14204\|DYHC1_HUMAN	-0.4230034	4.335732	-39.76442	43.95389	DYNC1H1
861	sp\|Q9NY33\|DPP3_HUMAN	-0.3047488	4.394859	-34.77114	40.80698	DPP3
2	sp\|A0A075B6H9\|LV469_HUMAN	-0.4129798	4.707875	-32.80910	39.43752	IGLV4-69
37	sp\|A0A0C4DH31\|HV118_HUMAN	-0.2425323	4.820578	-32.13371	38.94635	IGHV1-18
215	sp\|P02765\|FETUA_HUMAN	-0.2652081	4.856746	-30.92674	38.04153	AHSG
206	sp\|P02746\|C1QB_HUMAN	-0.1673451	4.860047	-28.29022	35.93327	C1QB
53	sp\|A6NMY6\|AXA2L_HUMAN	-0.3200210	4.387223	-27.54504	35.30147	ANXA2P2
209	sp\|P02749\|APOH_HUMAN	-0.2119714	4.846936	-26.55510	34.43536	APOH
321	sp\|P08237\|PFKAM_HUMAN	-0.3363273	4.379070	-24.70965	32.73241	PFKM
140	sp\|P01019\|ANGT_HUMAN	-0.1383361	4.841181	-23.77543	31.82260	AGT
331	sp\|P08575\|PTPRC_HUMAN	-0.2611470	4.416660	-23.66786	31.71563	PTPRC
800	sp\|Q92530\|PSMF1_HUMAN	-0.4651059	4.314681	-22.98949	31.03016	PSMF1
48	sp\|A0A0J9YX35\|HV64D_HUMAN	-0.2875261	4.728430	-22.93122	30.97039	IGHV3-64D
579	sp\|P50148\|GNAQ_HUMAN	-0.3877185	4.481589	-22.67411	30.70489	GNAQ
746	sp\|Q16698\|DECR_HUMAN	-0.3726718	4.360898	-20.61462	28.46970	DECR1
167	sp\|P01743\|HV146_HUMAN	-0.1826418	4.636493	-19.22869	26.84588	IGHV1-46
129	sp\|P00740\|FA9_HUMAN	-0.1775446	4.690761	-19.03372	26.60894	F9
856	sp\|Q9NSC7\|SIA7A_HUMAN	-0.3343780	4.380045	-18.76737	26.28170	ST6GALNAC1
670	sp\|Q05315\|LEG10_HUMAN	-0.5500179	4.272225	-18.38986	25.81067	CLC
728	sp\|Q15084\|PDIA6_HUMAN	-0.5164362	4.244026	-18.06663	25.40046	PDIA6

PLSDA-batch

expr_combined_pl_corr <- PLSDA_batch(
    X = t(expr_combined_imp),
    Y.trt = meta_combined$Group, Y.bat = meta_combined$Plex,
    ncomp.trt = 1, ncomp.bat = 1,
    mode = "regression"
)

expr_combined_pl_corr$X.nobatch[1:5,1:7]

##       sp|A0A075B6H7|KV37_HUMAN sp|A0A075B6H9|LV469_HUMAN
## 19191                 4.837655                  4.695762
## 34629                 4.829669                  4.720578
## 34862                 4.841301                  4.718620
## 40431                 4.866308                  4.720954
## 40475                 4.823830                  4.720825
##       sp|A0A075B6I0|LV861_HUMAN sp|A0A075B6I1|LV460_HUMAN
## 19191                  4.823905                  4.645763
## 34629                  4.857660                  4.725371
## 34862                  4.763711                  4.631092
## 40431                  4.807648                  4.704044
## 40475                  4.800197                  4.575834
##       sp|A0A075B6J9|LV218_HUMAN sp|A0A075B6K0|LV316_HUMAN
## 19191                  4.598065                  4.471714
## 34629                  4.758746                  4.530039
## 34862                  4.756047                  4.479858
## 40431                  4.705885                  4.517386
## 40475                  4.695230                  4.502849
##       sp|A0A075B6K4|LV310_HUMAN
## 19191                  4.795010
## 34629                  4.877746
## 34862                  4.875459
## 40431                  4.895919
## 40475                  4.855380

expr_combined_pl_corrmx <- t(expr_combined_pl_corr$X.nobatch)
expr_combined_pl_corrmx[1:6,1:4]

##                              19191    34629    34862    40431
## sp|A0A075B6H7|KV37_HUMAN  4.837655 4.829669 4.841301 4.866308
## sp|A0A075B6H9|LV469_HUMAN 4.695762 4.720578 4.718620 4.720954
## sp|A0A075B6I0|LV861_HUMAN 4.823905 4.857660 4.763711 4.807648
## sp|A0A075B6I1|LV460_HUMAN 4.645763 4.725371 4.631092 4.704044
## sp|A0A075B6J9|LV218_HUMAN 4.598065 4.758746 4.756047 4.705885
## sp|A0A075B6K0|LV316_HUMAN 4.471714 4.530039 4.479858 4.517386

Limma on PLSDA-batch corrected data

# stash the old analysis
results_combined0 <- results_combined

design_combined <- model.matrix(~ Group, data = meta_combined)

fit_combined <- lmFit(expr_combined_pl_corrmx, design_combined)
fit_combined <- eBayes(fit_combined)
results_combined <- topTable(
  fit_combined,
  coef = "GroupCase",
  number = Inf,
  sort.by = "P"
)
results_combined$Protein <- rownames(results_combined)
# Add gene annotation
results_combined <- merge(
  results_combined,
  protein_info_combined,
  by.x = "Protein",
  by.y = "protein",
  all.x = TRUE
)

results_combined <- results_combined[order(results_combined$P.Value),]

head(results_combined) |>
    kbl(caption = "Top ranked proteins by p-value (limma done on PLSDA-batch-corrected data)") |>
    kable_paper("hover", full_width = F)

Top ranked proteins by p-value (limma done on PLSDA-batch-corrected data)
	Protein	logFC	AveExpr	t	P.Value	adj.P.Val	B	gene
751	sp\|Q27J81\|INF2_HUMAN	-0.1169760	4.464969	-4.987563	0.0000422	0.0381130	2.0408304	INF2
507	sp\|P28482\|MK01_HUMAN	0.1460155	4.370219	4.340409	0.0002195	0.0992365	0.4386394	MAPK1
891	sp\|Q9Y287\|ITM2B_HUMAN	-0.0322520	4.425711	-3.288005	0.0030836	0.5117635	-2.0937802	ITM2B
871	sp\|Q9UBQ0\|VPS29_HUMAN	-0.0557655	4.404436	-3.186019	0.0039520	0.5117635	-2.3280071	VPS29
571	sp\|P49407\|ARRB1_HUMAN	0.0752710	4.480139	3.172493	0.0040836	0.5117635	-2.3588576	ARRB1
844	sp\|Q9HBB8\|CDHR5_HUMAN	0.0384885	4.294720	3.055350	0.0054138	0.5117635	-2.6237749	CDHR5