Intro

Need to cluster the proteomics data.

to run this script, load it into Rstudio and hit “knit”.

library(readxl)
library(limma)
library(gplots)
library(stringr)
library(RColorBrewer)
library(kableExtra)

Read data

There were some cells with “Missing Value” and this is causing those coluns to be recognised as characters.

I deleted those cells’ contents and then loaded into R.

excel_sheets("1b.2_MS25-036_Dimple_Reruns._Perseus.xlsx")
## [1] "Samples Table of MS25-036_Dimpl"
xl <- read_excel("1b.2_MS25-036_Dimple_Reruns._Perseus.xlsx")

head(xl)
## # A tibble: 6 × 29
##   S.No  `Protein Name`                   `Control-Intact -1` `Control-Intact -2`
##   <chr> <chr>                                          <dbl>               <dbl>
## 1 1     FAD-binding PCMH-type domain-co…             3650000             4920000
## 2 2     Group of Major pollen allergen …             4760000             6300000
## 3 3     Group of Pollen allergen Phl p …             4740000             6290000
## 4 4     FAD-binding PCMH-type domain-co…             3440000             4860000
## 5 5     Group of Pollen allergen Phl p …             3620000             4980000
## 6 6     FAD-binding PCMH-type domain-co…             3100000             4500000
## # ℹ 25 more variables: `Control-Intact -3` <dbl>, `Intact-N02 - 1` <dbl>,
## #   `Intact-N02 - 2` <dbl>, `Intact-N02 - 3` <dbl>, `Intact-N02 +O3 - 1` <dbl>,
## #   `Intact-N02+O3 - 2` <dbl>, `Inatct-N02 + O3 - 3` <dbl>,
## #   `Control-Ruptured -1` <dbl>, `Control-Ruptured -2` <dbl>,
## #   `Control-Ruptured -3` <dbl>, `Ruptured-NO2-1` <dbl>,
## #   `Ruptured-NO2-2` <dbl>, `Ruptured-NO2-3` <dbl>,
## #   `Ruptured-NO2+O3 - 1` <dbl>, `Ruptured-NO2 + O3-2` <dbl>, …
xl <- as.data.frame(read_excel("1b.2_MS25-036_Dimple_Reruns._Perseus.xlsx"))

str(xl)
## 'data.frame':    606 obs. of  29 variables:
##  $ S.No               : chr  "1" "2" "3" "4" ...
##  $ Protein Name       : chr  "FAD-binding PCMH-type domain-containing protein OS=Lolium multiflorum OX=4521 GN=QYE76_034260 PE=3 SV=1" "Group of Major pollen allergen Phl p 4 OS=Phleum pratense OX=15957 PE=1 SV=1+2" "Group of Pollen allergen Phl p 4.0203 (Fragment) OS=Phleum pratense OX=15957 GN=phlp4 PE=3 SV=1+1" "FAD-binding PCMH-type domain-containing protein OS=Lolium multiflorum OX=4521 GN=QYE76_034340 PE=3 SV=1" ...
##  $ Control-Intact -1  : num  3650000 4760000 4740000 3440000 3620000 3100000 NA NA NA 15600000 ...
##  $ Control-Intact -2  : num  4920000 6300000 6290000 4860000 4980000 4500000 NA NA NA 22700000 ...
##  $ Control-Intact -3  : num  4200000 5490000 5480000 3930000 4290000 3740000 NA NA NA 18500000 ...
##  $ Intact-N02 - 1     : num  NA NA NA NA NA NA 14500000 NA NA NA ...
##  $ Intact-N02 - 2     : num  NA NA NA NA NA NA 13800000 NA NA NA ...
##  $ Intact-N02 - 3     : num  NA NA NA NA NA NA 11400000 NA NA NA ...
##  $ Intact-N02 +O3 - 1 : num  NA NA NA NA NA NA 12900000 30700000 30500000 31000000 ...
##  $ Intact-N02+O3 - 2  : num  NA NA NA NA NA NA 12200000 26900000 26700000 27200000 ...
##  $ Inatct-N02 + O3 - 3: num  NA NA NA NA NA NA 12100000 22900000 22600000 22700000 ...
##  $ Control-Ruptured -1: num  1580000 408000 384000 1400000 1440000 1300000 1680000 NA NA NA ...
##  $ Control-Ruptured -2: num  1390000 575000 548000 1230000 1300000 1160000 1820000 NA NA NA ...
##  $ Control-Ruptured -3: num  1360000 545000 525000 1360000 1160000 1130000 1510000 NA NA NA ...
##  $ Ruptured-NO2-1     : num  NA NA NA NA 24700000 18700000 7250000 NA NA NA ...
##  $ Ruptured-NO2-2     : num  NA NA NA NA 31900000 26600000 8820000 NA NA NA ...
##  $ Ruptured-NO2-3     : num  NA NA NA NA 35700000 26900000 6240000 NA NA NA ...
##  $ Ruptured-NO2+O3 - 1: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ Ruptured-NO2 + O3-2: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ Ruptured-NO2+O3-3  : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ AN6-1              : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ AN6-2              : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ AN6-3              : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ AN7-1              : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ AN7-2              : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ AN7-3              : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ AN8-1              : num  15800000 14700000 11500000 13500000 13500000 12800000 12100000 58800000 56200000 47900000 ...
##  $ AN8-2              : num  17900000 16200000 12600000 15400000 15400000 14100000 14400000 67400000 63500000 52400000 ...
##  $ AN8-3              : num  16900000 14900000 10600000 14400000 13400000 12900000 12800000 74200000 70500000 55600000 ...

Cleaning data by fixing the rownames.

rownames(xl) <- paste(xl[,1],xl[,2])
xl[,1:2]=NULL

myrownames <- rownames(xl)

Remove the last 2 rows.

xl <- xl[1:(nrow(xl) -2),]

Log transformation and normalisation

lxl <- log10(xl + 1)

nxl <- normalizeBetweenArrays(lxl, method = "quantile",
  targets = NULL, cyclic.method = "fast")

colSums(nxl,na.rm=TRUE)
##   Control-Intact -1   Control-Intact -2   Control-Intact -3      Intact-N02 - 1 
##          1252.93364          1252.91422          1252.92699           609.17510 
##      Intact-N02 - 2      Intact-N02 - 3  Intact-N02 +O3 - 1   Intact-N02+O3 - 2 
##           609.19131           609.19201          1333.39568          1333.37674 
## Inatct-N02 + O3 - 3 Control-Ruptured -1 Control-Ruptured -2 Control-Ruptured -3 
##          1333.40046           499.97905           499.97527           499.99009 
##      Ruptured-NO2-1      Ruptured-NO2-2      Ruptured-NO2-3 Ruptured-NO2+O3 - 1 
##           528.73810           528.73798           528.73653           758.67551 
## Ruptured-NO2 + O3-2   Ruptured-NO2+O3-3               AN6-1               AN6-2 
##           752.93288           758.68654            22.91007            22.69277 
##               AN6-3               AN7-1               AN7-2               AN7-3 
##            22.69277            34.43037            34.43037            34.43037 
##               AN8-1               AN8-2               AN8-3 
##          2120.80305          2120.80304          2109.30771

There’s a wide range of total intensity values, as expected for pollen.

Basic visualisation

colfunc <- colorRampPalette(c("blue", "white", "red"))

rownames(nxl) <- str_trunc(rownames(nxl), 20)

nxl2 <- nxl
nxl2[is.na(nxl2)] <- 0

heatmap.2(as.matrix(nxl2),trace="none",col=colfunc(25),scale="row",
  margins = c(10,6), cexRow=0.6, cexCol=0.8 )

heatmap.2(cor(as.matrix(nxl2),method="spearman"),trace="none",scale="none",
  main="Sample spearman cor", margins = c(10,10))

heatmap.2(cor(t(nxl2),method="spearman"),trace="none",scale="none",
  main="Protein spearman cor", margins = c(6,6))

Hierarchical clustering

This will perform clustering with different tree cut thresholds.

The protein clusters will be saved to different files with names like “ProteinClusters_h”.

heights <- c(1.2,1.4,1.6,1.8,2,2.2)
dump <- lapply(heights,function(height) {
  cl<-as.dist(1-cor(t(nxl2), method="spearman"))
  hr <- hclust(cl , method="complete")
  mycl <- cutree(hr, h=max(hr$height/height))
  clusterCols <- colorRampPalette(c("red", "orange", "yellow","green",
    "blue","violet","black"))
  clusterCols <-  clusterCols(length(unique(mycl)))
  myClusterSideBar <- clusterCols[mycl]
  colfunc <- colorRampPalette(c("blue", "white", "red"))
  myfilename <- paste("ProteinClusters_h",height,".txt",sep="")
  write.table(mycl,file=myfilename,quote=F,sep="\t")

  heatmap.2(nxl2, main=paste("Protein Clustering h=",height),  Rowv=as.dendrogram(hr),
    dendrogram="both", scale="none", col = colfunc(25), trace="none",
    RowSideColors= myClusterSideBar, margins = c(9,7))
})

Session information

For reproducibility.

sessionInfo()
## R version 4.5.2 (2025-10-31)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 24.04.3 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so;  LAPACK version 3.12.0
## 
## locale:
##  [1] LC_CTYPE=en_AU.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_AU.UTF-8        LC_COLLATE=en_AU.UTF-8    
##  [5] LC_MONETARY=en_AU.UTF-8    LC_MESSAGES=en_AU.UTF-8   
##  [7] LC_PAPER=en_AU.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Australia/Melbourne
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] kableExtra_1.4.0   RColorBrewer_1.1-3 stringr_1.5.1      gplots_3.2.0      
## [5] limma_3.64.3       readxl_1.4.5      
## 
## loaded via a namespace (and not attached):
##  [1] jsonlite_2.0.0     compiler_4.5.2     gtools_3.9.5       xml2_1.3.8        
##  [5] bitops_1.0-9       dichromat_2.0-0.1  jquerylib_0.1.4    systemfonts_1.2.3 
##  [9] scales_1.4.0       textshaping_1.0.1  yaml_2.3.10        fastmap_1.2.0     
## [13] statmod_1.5.0      R6_2.6.1           knitr_1.50         tibble_3.3.0      
## [17] svglite_2.2.1      pillar_1.11.0      bslib_0.9.0        rlang_1.1.6       
## [21] utf8_1.2.6         cachem_1.1.0       stringi_1.8.7      xfun_0.52         
## [25] caTools_1.18.3     sass_0.4.10        viridisLite_0.4.2  cli_3.6.5         
## [29] magrittr_2.0.3     digest_0.6.37      rstudioapi_0.17.1  lifecycle_1.0.4   
## [33] vctrs_0.6.5        KernSmooth_2.23-26 evaluate_1.0.4     glue_1.8.0        
## [37] farver_2.1.2       cellranger_1.1.0   rmarkdown_2.29     pkgconfig_2.0.3   
## [41] tools_4.5.2        htmltools_0.5.8.1