Introduction

Reproducibility is a critical challenge in proteomics computational workflows where different software platforms can produce varying results from identical raw mass spectrometry data. In my chromatin-associated proteomics research, I analyze data using MaxQuant [1] to identify and quantify proteins in chromatin versus soluble fractions from U2OS cell lines. Each platform has unique dependency requirements that complicate reproducibility on shared high-performance computing systems.

This project demonstrates containerised proteomics workflows using Apptainer, directly applying concepts from Workshop Session 7 [2]. By standardizing the computational workflow, I can ensure results are robust and reproducible across different computing environments. Using Apptainer ensures that R package versions and system dependencies remain isolated and reproducible, addressing a critical challenge in proteomics data analysis.

How to Run

Step 1: Build the Apptainer Container

Save the definition file below as proteomics_apptainer.def:

BootStrap: docker
From: rocker/r-ver:4.5.2

%post
  # Update apt-get 
  apt-get update \
  && apt-get upgrade -y \
  
  # Install system dependencies
  apt-get install -y \
    libcurl4-openssl-dev \
    libssl-dev \
    libxml2-dev \
    git \
    nano \
    pandoc
  
  # Install R packages
  R -e "install.packages(c('knitr', 'rmarkdown', 'ggplot2', 'tidyverse'))"
  R -e "install.packages('BiocManager')"
  R -e "BiocManager::install('limma')"
  
  # Clean up
  apt-get clean
  rm -rf /var/lib/apt/lists/*

%environment
  export LC_ALL=C.UTF-8

%labels
  Author Swanitha
  Description Proteomics analysis container with R and limma

%runscript
  exec /bin/bash "$@"

Build the container:

apptainer build proteomics.sif proteomics_apptainer.def

Step 2: Request HPC Resources

srun -c 4 --mem 16G --time 4:00:00 --pty bash -i

Step 3: Run Analysis Inside Container

# Start the container with writable filesystem
apptainer shell --writable-tmpfs proteomics.sif

# Inside the container, run the analysis
R -e "rmarkdown::render('proteomics_analysis.Rmd')"

# Exit the container
exit

Analysis

Load Libraries

library(limma)
library(ggplot2)
library(knitr)
library(tidyverse)
library(DT)

MaxQuant Data Processing

Load Data

# Read MaxQuant data
x_mq <- read.table("proteinGroups.txt",
                   header = TRUE, sep = "\t", quote = "", 
                   stringsAsFactors = FALSE)

# Remove contaminants and reverse hits
x_mq <- x_mq[is.na(x_mq$Potential.contaminant) | 
             x_mq$Potential.contaminant != "+", ]
x_mq <- x_mq[is.na(x_mq$Reverse) | x_mq$Reverse != "+", ]
x_mq <- x_mq[is.na(x_mq$Only.identified.by.site) | 
             x_mq$Only.identified.by.site != "+", ]

Process Intensities

# Get intensity columns for chromatin and soluble fractions
chrom_cols <- grep("Intensity\\.chr[1-4]$", colnames(x_mq), value = TRUE)
sol_cols <- grep("Intensity\\.sol[1-4]$", colnames(x_mq), value = TRUE)
all_intensity_mq <- cbind(x_mq[, chrom_cols], x_mq[, sol_cols])

# Filter for proteins detected in both fractions
chrom_raw <- rowMeans(x_mq[, chrom_cols], na.rm = TRUE)
sol_raw <- rowMeans(x_mq[, sol_cols], na.rm = TRUE)
chrom_raw[chrom_raw == 0 | is.nan(chrom_raw)] <- NA
sol_raw[sol_raw == 0 | is.nan(sol_raw)] <- NA
both_idx <- which(!is.na(chrom_raw) & !is.na(sol_raw))

# Extract proteins present in both fractions
z_mq <- all_intensity_mq[both_idx, ]

Impute Missing Values

# Impute missing values with 10% of minimum intensity
for(i in 1:ncol(z_mq)) {
  if(any(is.na(z_mq[,i]))) {
    z_mq[is.na(z_mq[,i]), i] <- min(z_mq[,i], na.rm = TRUE) * 0.1
  }
}

Normalize Data

# Log transform and quantile normalize
z_mq_norm <- normalizeBetweenArrays(log10(z_mq + 1), method = "quantile")

Differential Expression Analysis

# Perform differential expression analysis
# Design: chromatin (1,1,1,1) vs soluble (0,0,0,0)
design_mq <- model.matrix(~ factor(c(rep(1, 4), rep(0, 4))))
fit_mq <- eBayes(lmFit(z_mq_norm, design_mq))
res_mq <- topTable(fit_mq, coef = 2, number = Inf)

# Count significant proteins
sig_mq <- sum(res_mq$adj.P.Val < 0.05)
sig_mq_up <- sum(res_mq$adj.P.Val < 0.05 & res_mq$logFC > 0)
sig_mq_down <- sum(res_mq$adj.P.Val < 0.05 & res_mq$logFC < 0)

Results

Table 1: Platform Performance

summary_df <- data.frame(
  Metric = c("Total Proteins Identified", 
             "Proteins Analyzed", 
             "Significant (adj.p < 0.05)",
             "Chromatin-Enriched", 
             "Soluble-Enriched"),
  Count = c(nrow(x_mq), 
            length(both_idx), 
            sig_mq,
            sig_mq_up, 
            sig_mq_down)
)

kable(summary_df, 
      caption = "Table 1: MaxQuant proteomics analysis summary")
Table 1: MaxQuant proteomics analysis summary
Metric Count
Total Proteins Identified 6693
Proteins Analyzed 5047
Significant (adj.p < 0.05) 3293
Chromatin-Enriched 1438
Soluble-Enriched 1855

MaxQuant identified 6693 proteins in total. After quality filtering and requiring detection in both chromatin and soluble fractions, 5047 proteins were analyzed. Of these, 3293 proteins showed significant differential expression (adjusted p < 0.05), with 1438 enriched in chromatin and 1855 enriched in soluble fractions.

Figure 1: Differential Expression

# Create volcano plot
plot(res_mq$logFC, -log10(res_mq$P.Value), 
     pch = 19, cex = 0.5,
     col = ifelse(res_mq$adj.P.Val < 0.05, "red", "grey"),
     main = "MaxQuant: Chromatin vs Soluble Fractions", 
     xlab = "Log2 Fold Change (Chromatin/Soluble)", 
     ylab = "-Log10 P-value")
abline(v = 0, lty = 2, col = "blue")
abline(h = -log10(0.05), lty = 2, col = "darkgreen")
legend("topright", 
       legend = c("Significant", "Not Significant"), 
       col = c("red", "grey"), 
       pch = 19, 
       cex = 0.8)

Figure 1: Volcano plot showing differential protein expression between chromatin and soluble fractions analyzed with MaxQuant [1]. Red points indicate proteins with adjusted p-value < 0.05. Positive log fold changes represent chromatin-enriched proteins, while negative values indicate soluble-enriched proteins.

Table 2: Top Differential Proteins

# Get gene names from original data for proteins in analysis
gene_names <- x_mq$Gene.names[both_idx]

# If gene name is empty, use Majority protein ID as fallback
protein_ids <- x_mq$Majority.protein.IDs[both_idx]
gene_names[gene_names == ""] <- protein_ids[gene_names == ""]

# Add protein identifiers to results
res_mq_with_ids <- res_mq
res_mq_with_ids$Protein <- gene_names

# Prepare top differential proteins for display
top_proteins <- res_mq_with_ids %>%
  select(Protein, logFC, AveExpr, t, P.Value, adj.P.Val) %>%
  arrange(adj.P.Val) %>%
  head(100)  # Show top 100 proteins

# Create interactive searchable table
datatable(top_proteins,
          caption = "Table 2: Top 100 differential proteins ranked by adjusted p-value",
          options = list(
            pageLength = 10,
            scrollX = TRUE,
            order = list(list(5, 'asc'))  # Sort by adj.P.Val
          ),
          rownames = FALSE) %>%
  formatSignif(columns = c('logFC', 'AveExpr', 't', 'P.Value', 'adj.P.Val'), 
               digits = 4)

Table 2 shows the top 100 most significantly differential proteins between chromatin and soluble fractions, ranked by adjusted p-value. The table is searchable and sortable, allowing exploration of specific proteins of interest. Positive log fold changes indicate chromatin enrichment, while negative values indicate soluble enrichment.

Integration with Research

This containerised workflow enables me to validate chromatin-associated protein findings with reproducible, platform-independent analysis. The MaxQuant platform [1] provides robust peptide identification and quantification for my U2OS cell line fractionation experiments. By containerising this workflow, I ensure that my analysis can be reproduced exactly by collaborators and reviewers, regardless of their local R installation or system configuration.


Reflection

The bioinformatics workshop series fundamentally changed my approach to computational reproducibility. Workshop Session 7 on containerisation [2] was particularly valuable because it provided practical solutions to dependency management challenges I had encountered when trying to reproduce analyses across different HPC systems.

Learning to document workflows systematically and treat computational environments as research artifacts has been transformative. I now build an Apptainer container for each major project, ensuring that my work remains reproducible regardless of future software updates on our shared HPC.

From a career perspective, the workshops taught skills increasingly required for computational biology positions. Understanding containerisation is now essential for bioinformatics roles, and I can confidently discuss these concepts. Beyond technical skills, the workshops taught systematic thinking about treating analysis code as permanent research artifacts requiring proper documentation and version control.


Bibliography

[1] Cox J, Mann M. MaxQuant enables high peptide identification rates, individualized p.p.b.-range mass accuracies and proteome-wide protein quantification. Nature Biotechnology. 2008;26:1367-1372.

[2] Ziemann M. Bioinformatics data skills workshop - Session 7: Containerising workflows on shared computers. Burnet Bioinformatics Group. 2025. Available from: https://github.com/markziemann/bioinformatics_intro_workshop


Session Information

sessionInfo()
## R version 4.5.2 (2025-10-31)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 24.04.3 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so;  LAPACK version 3.12.0
## 
## locale:
##  [1] LC_CTYPE=en_AU.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_AU.UTF-8        LC_COLLATE=en_AU.UTF-8    
##  [5] LC_MONETARY=en_AU.UTF-8    LC_MESSAGES=en_AU.UTF-8   
##  [7] LC_PAPER=en_AU.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Australia/Melbourne
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] DT_0.34.0       lubridate_1.9.4 forcats_1.0.1   stringr_1.6.0  
##  [5] dplyr_1.1.4     purrr_1.2.1     readr_2.1.6     tidyr_1.3.2    
##  [9] tibble_3.3.1    tidyverse_2.0.0 knitr_1.51      ggplot2_4.0.1  
## [13] limma_3.64.3   
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.6       jsonlite_2.0.0     compiler_4.5.2     tidyselect_1.2.1  
##  [5] dichromat_2.0-0.1  jquerylib_0.1.4    scales_1.4.0       yaml_2.3.12       
##  [9] fastmap_1.2.0      statmod_1.5.1      R6_2.6.1           generics_0.1.4    
## [13] htmlwidgets_1.6.4  tzdb_0.5.0         bslib_0.9.0        pillar_1.11.1     
## [17] RColorBrewer_1.1-3 rlang_1.1.7        stringi_1.8.7      cachem_1.1.0      
## [21] xfun_0.56          sass_0.4.10        S7_0.2.1           otel_0.2.0        
## [25] timechange_0.3.0   cli_3.6.5          withr_3.0.2        magrittr_2.0.4    
## [29] crosstalk_1.2.2    digest_0.6.39      grid_4.5.2         hms_1.1.4         
## [33] lifecycle_1.0.5    vctrs_0.7.0        evaluate_1.0.5     glue_1.8.0        
## [37] farver_2.1.2       rmarkdown_2.30     tools_4.5.2        pkgconfig_2.0.3   
## [41] htmltools_0.5.9