Need to cluster the proteomics data.
to run this script, load it into Rstudio and hit “knit”.
library(readxl)
library(limma)
library(gplots)
library(stringr)
library(RColorBrewer)
library(kableExtra)
There were some cells with “Missing Value” and this is causing those coluns to be recognised as characters.
I deleted those cells’ contents and then loaded into R.
excel_sheets("1b.2_MS25-036_Dimple_Reruns._Perseus.xlsx")
## [1] "Samples Table of MS25-036_Dimpl"
xl <- read_excel("1b.2_MS25-036_Dimple_Reruns._Perseus.xlsx")
head(xl)
## # A tibble: 6 × 29
## S.No `Protein Name` `Control-Intact -1` `Control-Intact -2`
## <chr> <chr> <dbl> <dbl>
## 1 1 FAD-binding PCMH-type domain-co… 3650000 4920000
## 2 2 Group of Major pollen allergen … 4760000 6300000
## 3 3 Group of Pollen allergen Phl p … 4740000 6290000
## 4 4 FAD-binding PCMH-type domain-co… 3440000 4860000
## 5 5 Group of Pollen allergen Phl p … 3620000 4980000
## 6 6 FAD-binding PCMH-type domain-co… 3100000 4500000
## # ℹ 25 more variables: `Control-Intact -3` <dbl>, `Intact-N02 - 1` <dbl>,
## # `Intact-N02 - 2` <dbl>, `Intact-N02 - 3` <dbl>, `Intact-N02 +O3 - 1` <dbl>,
## # `Intact-N02+O3 - 2` <dbl>, `Inatct-N02 + O3 - 3` <dbl>,
## # `Control-Ruptured -1` <dbl>, `Control-Ruptured -2` <dbl>,
## # `Control-Ruptured -3` <dbl>, `Ruptured-NO2-1` <dbl>,
## # `Ruptured-NO2-2` <dbl>, `Ruptured-NO2-3` <dbl>,
## # `Ruptured-NO2+O3 - 1` <dbl>, `Ruptured-NO2 + O3-2` <dbl>, …
xl <- as.data.frame(read_excel("1b.2_MS25-036_Dimple_Reruns._Perseus.xlsx"))
str(xl)
## 'data.frame': 606 obs. of 29 variables:
## $ S.No : chr "1" "2" "3" "4" ...
## $ Protein Name : chr "FAD-binding PCMH-type domain-containing protein OS=Lolium multiflorum OX=4521 GN=QYE76_034260 PE=3 SV=1" "Group of Major pollen allergen Phl p 4 OS=Phleum pratense OX=15957 PE=1 SV=1+2" "Group of Pollen allergen Phl p 4.0203 (Fragment) OS=Phleum pratense OX=15957 GN=phlp4 PE=3 SV=1+1" "FAD-binding PCMH-type domain-containing protein OS=Lolium multiflorum OX=4521 GN=QYE76_034340 PE=3 SV=1" ...
## $ Control-Intact -1 : num 3650000 4760000 4740000 3440000 3620000 3100000 NA NA NA 15600000 ...
## $ Control-Intact -2 : num 4920000 6300000 6290000 4860000 4980000 4500000 NA NA NA 22700000 ...
## $ Control-Intact -3 : num 4200000 5490000 5480000 3930000 4290000 3740000 NA NA NA 18500000 ...
## $ Intact-N02 - 1 : num NA NA NA NA NA NA 14500000 NA NA NA ...
## $ Intact-N02 - 2 : num NA NA NA NA NA NA 13800000 NA NA NA ...
## $ Intact-N02 - 3 : num NA NA NA NA NA NA 11400000 NA NA NA ...
## $ Intact-N02 +O3 - 1 : num NA NA NA NA NA NA 12900000 30700000 30500000 31000000 ...
## $ Intact-N02+O3 - 2 : num NA NA NA NA NA NA 12200000 26900000 26700000 27200000 ...
## $ Inatct-N02 + O3 - 3: num NA NA NA NA NA NA 12100000 22900000 22600000 22700000 ...
## $ Control-Ruptured -1: num 1580000 408000 384000 1400000 1440000 1300000 1680000 NA NA NA ...
## $ Control-Ruptured -2: num 1390000 575000 548000 1230000 1300000 1160000 1820000 NA NA NA ...
## $ Control-Ruptured -3: num 1360000 545000 525000 1360000 1160000 1130000 1510000 NA NA NA ...
## $ Ruptured-NO2-1 : num NA NA NA NA 24700000 18700000 7250000 NA NA NA ...
## $ Ruptured-NO2-2 : num NA NA NA NA 31900000 26600000 8820000 NA NA NA ...
## $ Ruptured-NO2-3 : num NA NA NA NA 35700000 26900000 6240000 NA NA NA ...
## $ Ruptured-NO2+O3 - 1: num NA NA NA NA NA NA NA NA NA NA ...
## $ Ruptured-NO2 + O3-2: num NA NA NA NA NA NA NA NA NA NA ...
## $ Ruptured-NO2+O3-3 : num NA NA NA NA NA NA NA NA NA NA ...
## $ AN6-1 : num NA NA NA NA NA NA NA NA NA NA ...
## $ AN6-2 : num NA NA NA NA NA NA NA NA NA NA ...
## $ AN6-3 : num NA NA NA NA NA NA NA NA NA NA ...
## $ AN7-1 : num NA NA NA NA NA NA NA NA NA NA ...
## $ AN7-2 : num NA NA NA NA NA NA NA NA NA NA ...
## $ AN7-3 : num NA NA NA NA NA NA NA NA NA NA ...
## $ AN8-1 : num 15800000 14700000 11500000 13500000 13500000 12800000 12100000 58800000 56200000 47900000 ...
## $ AN8-2 : num 17900000 16200000 12600000 15400000 15400000 14100000 14400000 67400000 63500000 52400000 ...
## $ AN8-3 : num 16900000 14900000 10600000 14400000 13400000 12900000 12800000 74200000 70500000 55600000 ...
Cleaning data by fixing the rownames.
rownames(xl) <- paste(xl[,1],xl[,2])
xl[,1:2]=NULL
myrownames <- rownames(xl)
Remove the last 2 rows.
xl <- xl[1:(nrow(xl) -2),]
lxl <- log10(xl + 1)
nxl <- normalizeBetweenArrays(lxl, method = "quantile",
targets = NULL, cyclic.method = "fast")
colSums(nxl,na.rm=TRUE)
## Control-Intact -1 Control-Intact -2 Control-Intact -3 Intact-N02 - 1
## 1252.93364 1252.91422 1252.92699 609.17510
## Intact-N02 - 2 Intact-N02 - 3 Intact-N02 +O3 - 1 Intact-N02+O3 - 2
## 609.19131 609.19201 1333.39568 1333.37674
## Inatct-N02 + O3 - 3 Control-Ruptured -1 Control-Ruptured -2 Control-Ruptured -3
## 1333.40046 499.97905 499.97527 499.99009
## Ruptured-NO2-1 Ruptured-NO2-2 Ruptured-NO2-3 Ruptured-NO2+O3 - 1
## 528.73810 528.73798 528.73653 758.67551
## Ruptured-NO2 + O3-2 Ruptured-NO2+O3-3 AN6-1 AN6-2
## 752.93288 758.68654 22.91007 22.69277
## AN6-3 AN7-1 AN7-2 AN7-3
## 22.69277 34.43037 34.43037 34.43037
## AN8-1 AN8-2 AN8-3
## 2120.80305 2120.80304 2109.30771
There’s a wide range of total intensity values, as expected for pollen.
colfunc <- colorRampPalette(c("blue", "white", "red"))
rownames(nxl) <- str_trunc(rownames(nxl), 20)
nxl2 <- nxl
nxl2[is.na(nxl2)] <- 0
heatmap.2(as.matrix(nxl2),trace="none",col=colfunc(25),scale="row",
margins = c(10,6), cexRow=0.6, cexCol=0.8 )
heatmap.2(cor(as.matrix(nxl2),method="spearman"),trace="none",scale="none",
main="Sample spearman cor", margins = c(10,10))
heatmap.2(cor(t(nxl2),method="spearman"),trace="none",scale="none",
main="Protein spearman cor", margins = c(6,6))
This will perform clustering with different tree cut thresholds.
The protein clusters will be saved to different files with names like “ProteinClusters_h”.
heights <- c(1.2,1.4,1.6,1.8,2,2.2)
dump <- lapply(heights,function(height) {
cl<-as.dist(1-cor(t(nxl2), method="spearman"))
hr <- hclust(cl , method="complete")
mycl <- cutree(hr, h=max(hr$height/height))
clusterCols <- colorRampPalette(c("red", "orange", "yellow","green",
"blue","violet","black"))
clusterCols <- clusterCols(length(unique(mycl)))
myClusterSideBar <- clusterCols[mycl]
colfunc <- colorRampPalette(c("blue", "white", "red"))
myfilename <- paste("ProteinClusters_h",height,".txt",sep="")
write.table(mycl,file=myfilename,quote=F,sep="\t")
heatmap.2(nxl2, main=paste("Protein Clustering h=",height), Rowv=as.dendrogram(hr),
dendrogram="both", scale="none", col = colfunc(25), trace="none",
RowSideColors= myClusterSideBar, margins = c(9,7))
})
For reproducibility.
sessionInfo()
## R version 4.5.2 (2025-10-31)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 24.04.3 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so; LAPACK version 3.12.0
##
## locale:
## [1] LC_CTYPE=en_AU.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_AU.UTF-8 LC_COLLATE=en_AU.UTF-8
## [5] LC_MONETARY=en_AU.UTF-8 LC_MESSAGES=en_AU.UTF-8
## [7] LC_PAPER=en_AU.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Australia/Melbourne
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] kableExtra_1.4.0 RColorBrewer_1.1-3 stringr_1.5.1 gplots_3.2.0
## [5] limma_3.64.3 readxl_1.4.5
##
## loaded via a namespace (and not attached):
## [1] jsonlite_2.0.0 compiler_4.5.2 gtools_3.9.5 xml2_1.3.8
## [5] bitops_1.0-9 dichromat_2.0-0.1 jquerylib_0.1.4 systemfonts_1.2.3
## [9] scales_1.4.0 textshaping_1.0.1 yaml_2.3.10 fastmap_1.2.0
## [13] statmod_1.5.0 R6_2.6.1 knitr_1.50 tibble_3.3.0
## [17] svglite_2.2.1 pillar_1.11.0 bslib_0.9.0 rlang_1.1.6
## [21] utf8_1.2.6 cachem_1.1.0 stringi_1.8.7 xfun_0.52
## [25] caTools_1.18.3 sass_0.4.10 viridisLite_0.4.2 cli_3.6.5
## [29] magrittr_2.0.3 digest_0.6.37 rstudioapi_0.17.1 lifecycle_1.0.4
## [33] vctrs_0.6.5 KernSmooth_2.23-26 evaluate_1.0.4 glue_1.8.0
## [37] farver_2.1.2 cellranger_1.1.0 rmarkdown_2.29 pkgconfig_2.0.3
## [41] tools_4.5.2 htmltools_0.5.8.1