Twenty PMC articles were selected randomly from the supplementary file of our 2022 article (pkg1), and a further 20 were hand-selected to offer more diverse methods, including tools, background use and FDR (pkg2).
library("kableExtra")
library("beeswarm")
knitr::opts_chunk$set(dev = "svg") # set output device to svg

# Load the per-article score tables for the two article sets.
pkg1 <- read.table("pkg1.tsv", sep = "\t", header = TRUE, quote = "\"")
pkg2 <- read.table("pkg2.tsv", sep = "\t", header = TRUE, quote = "\"")

# Drop the rows whose Model is "human"; only the remaining models are
# summarised below.
pkg1 <- subset(pkg1, Model != "human")
pkg2 <- subset(pkg2, Model != "human")

# Keep column 1 (Model) plus the six score columns at positions 8 to 13.
score_cols <- c(1, 8:13)
pkg1 <- pkg1[, score_cols]
pkg2 <- pkg2[, score_cols]

# Mean of every score column per model, computed separately for each set.
a1 <- aggregate(. ~ Model, pkg1, mean)
a2 <- aggregate(. ~ Model, pkg2, mean)

a1 |>
  kbl(caption = "pkg1") |>
  kable_paper("hover", full_width = FALSE)
Model | Tool.Used.1 | Tool.version.1 | Gene.Set.library.1 | Background.List.1 | Stat.test.used.1 | FDR.used.1 |
---|---|---|---|---|---|---|
deepseek-r1:70b | 0.4550 | 0.850 | 0.6750 | 0.80 | 0.800 | 0.90 |
gemma3:27b | 0.5400 | 0.850 | 0.6000 | 0.85 | 0.325 | 0.40 |
GPT-4.1 | 0.9500 | 1.000 | 0.9585 | 0.95 | 0.950 | 0.90 |
GPT-4.1-mini | 0.9900 | 1.000 | 0.8650 | 0.90 | 0.900 | 0.90 |
GPT-4.1-nano | 0.8750 | 1.000 | 0.7500 | 1.00 | 0.925 | 0.90 |
gpt-oss:120b | 0.6650 | 0.800 | 0.7585 | 0.90 | 1.000 | 1.00 |
gpt-oss:20b | 0.5985 | 0.775 | 0.6250 | 0.85 | 0.900 | 0.90 |
Haiku3.5 | 0.9650 | 1.000 | 0.8900 | 0.90 | 0.900 | 0.90 |
llama3.3:70b | 0.5235 | 0.850 | 0.7000 | 0.85 | 0.875 | 0.90 |
mistral-small3.1:24b | 0.4485 | 0.800 | 0.3250 | 0.80 | 0.850 | 0.85 |
phi4:14b | 0.3800 | 0.850 | 0.4820 | 0.80 | 0.825 | 0.85 |
qwen3:32b | 0.4485 | 0.750 | 0.6085 | 0.80 | 0.875 | 0.90 |
qwq:32b | 0.5050 | 0.750 | 0.6585 | 0.85 | 0.900 | 0.80 |
Sonnet4 | 1.0000 | 1.000 | 0.8835 | 0.90 | 1.000 | 0.90 |
# a2 holds the pkg2 means, so the caption names pkg2.
a2 |>
  kbl(caption = "pkg2") |>
  kable_paper("hover", full_width = FALSE)
Model | Tool.Used.1 | Tool.version.1 | Gene.Set.library.1 | Background.List.1 | Stat.test.used.1 | FDR.used.1 |
---|---|---|---|---|---|---|
deepseek-r1:70b | 0.7375 | 0.850 | 0.6725 | 0.55 | 0.600 | 0.50 |
gemma3:27b | 0.6335 | 0.825 | 0.6875 | 0.40 | 0.300 | 0.75 |
GPT-4.1 | 0.8165 | 0.950 | 0.8395 | 0.85 | 0.875 | 0.95 |
GPT-4.1-mini | 0.8080 | 0.925 | 0.7745 | 0.90 | 0.875 | 0.95 |
GPT-4.1-nano | 0.9250 | 0.900 | 0.6620 | 0.75 | 0.925 | 0.95 |
gpt-oss:120b | 0.7000 | 0.850 | 0.8295 | 0.70 | 0.650 | 0.70 |
gpt-oss:20b | 0.8250 | 0.800 | 0.8040 | 0.60 | 0.700 | 0.60 |
Haiku3.5 | 0.8250 | 0.950 | 0.6725 | 0.60 | 0.925 | 0.95 |
llama3.3:70b | 0.6670 | 0.850 | 0.7460 | 0.60 | 0.750 | 0.65 |
mistral-small3.1:24b | 0.4750 | 0.750 | 0.4455 | 0.45 | 0.700 | 0.40 |
phi4:14b | 0.5250 | 0.850 | 0.6585 | 0.50 | 0.650 | 0.70 |
qwen3:32b | 0.4750 | 0.850 | 0.5625 | 0.50 | 0.700 | 0.55 |
qwq:32b | 0.4665 | 0.850 | 0.6795 | 0.55 | 0.600 | 0.75 |
Sonnet4 | 0.8500 | 0.950 | 0.7645 | 1.00 | 0.975 | 0.95 |
# Pool both article sets and recompute the per-model means on the union.
p12 <- rbind(pkg1, pkg2)
a12 <- aggregate(. ~ Model, p12, mean)

# The caption names the pooled data rather than pkg1 alone.
a12 |>
  kbl(caption = "pkg1 and pkg2 combined") |>
  kable_paper("hover", full_width = FALSE)
Model | Tool.Used.1 | Tool.version.1 | Gene.Set.library.1 | Background.List.1 | Stat.test.used.1 | FDR.used.1 |
---|---|---|---|---|---|---|
deepseek-r1:70b | 0.59625 | 0.8500 | 0.67375 | 0.675 | 0.7000 | 0.700 |
gemma3:27b | 0.58675 | 0.8375 | 0.64375 | 0.625 | 0.3125 | 0.575 |
GPT-4.1 | 0.88325 | 0.9750 | 0.89900 | 0.900 | 0.9125 | 0.925 |
GPT-4.1-mini | 0.89900 | 0.9625 | 0.81975 | 0.900 | 0.8875 | 0.925 |
GPT-4.1-nano | 0.90000 | 0.9500 | 0.70600 | 0.875 | 0.9250 | 0.925 |
gpt-oss:120b | 0.68250 | 0.8250 | 0.79400 | 0.800 | 0.8250 | 0.850 |
gpt-oss:20b | 0.71175 | 0.7875 | 0.71450 | 0.725 | 0.8000 | 0.750 |
Haiku3.5 | 0.89500 | 0.9750 | 0.78125 | 0.750 | 0.9125 | 0.925 |
llama3.3:70b | 0.59525 | 0.8500 | 0.72300 | 0.725 | 0.8125 | 0.775 |
mistral-small3.1:24b | 0.46175 | 0.7750 | 0.38525 | 0.625 | 0.7750 | 0.625 |
phi4:14b | 0.45250 | 0.8500 | 0.57025 | 0.650 | 0.7375 | 0.775 |
qwen3:32b | 0.46175 | 0.8000 | 0.58550 | 0.650 | 0.7875 | 0.725 |
qwq:32b | 0.48575 | 0.8000 | 0.66900 | 0.700 | 0.7500 | 0.775 |
Sonnet4 | 0.92500 | 0.9750 | 0.82400 | 0.950 | 0.9875 | 0.925 |
Get some summary stats.
# Use the Model column as row names and drop it, leaving only the numeric
# score columns so that colMeans()/rowMeans() can be applied.
index_by_model <- function(scores) {
  rownames(scores) <- scores$Model
  scores[-1]
}

a1 <- index_by_model(a1)
a2 <- index_by_model(a2)
a12 <- index_by_model(a12)

# Mean over models of each score column, for each article set and for both.
al <- list(
  "Set1" = colMeans(a1),
  "Set2" = colMeans(a2),
  "Both" = colMeans(a12)
)
al
## $Set1
## Tool.Used.1 Tool.version.1 Gene.Set.library.1 Background.List.1
## 0.6674286 0.8767857 0.6985357 0.8678571
## Stat.test.used.1 FDR.used.1
## 0.8589286 0.8571429
##
## $Set2
## Tool.Used.1 Tool.version.1 Gene.Set.library.1 Background.List.1
## 0.6949286 0.8678571 0.6998929 0.6392857
## Stat.test.used.1 FDR.used.1
## 0.7303571 0.7392857
##
## $Both
## Tool.Used.1 Tool.version.1 Gene.Set.library.1 Background.List.1
## 0.6811786 0.8723214 0.6992143 0.7535714
## Stat.test.used.1 FDR.used.1
## 0.7946429 0.7982143
# One box per article set, each summarising the six per-question means.
# `beside` is a barplot() argument, not a boxplot() one, so it is not passed.
boxplot(al, ylim = c(0.6, 0.9), col = "white", frame = FALSE,
        main = "Accuracy by set", ylab = "Accuracy relative to human")
# Overlay the individual per-question means on top of the boxes.
beeswarm(al, add = TRUE, cex = 2, col = "gray", pch = 19)

# The same means as a matrix: one row per set, one column per question.
amx <- do.call(rbind, al)
amx
## Tool.Used.1 Tool.version.1 Gene.Set.library.1 Background.List.1
## Set1 0.6674286 0.8767857 0.6985357 0.8678571
## Set2 0.6949286 0.8678571 0.6998929 0.6392857
## Both 0.6811786 0.8723214 0.6992143 0.7535714
## Stat.test.used.1 FDR.used.1
## Set1 0.8589286 0.8571429
## Set2 0.7303571 0.7392857
## Both 0.7946429 0.7982143
# Regroup the means by question: one numeric vector per question holding the
# Set1, Set2 and Both values. vapply() carries over the names of `al`, so the
# elements are already named by set and no renaming loop is needed.
la <- lapply(seq_along(al[[1]]), function(i) {
  vapply(al, function(set_means) set_means[[i]], numeric(1))
})
# Short question labels, in the same order as the score columns.
names(la) <- c("Tool", "Vers", "GeneSet", "Background", "StatTest", "FDR")

par(cex.axis = 0.8) # shrink axis labels so all six question names fit
# `beside` is a barplot() argument, not a boxplot() one, so it is not passed.
boxplot(la, ylim = c(0.6, 0.9), col = "white", frame = FALSE,
        main = "Accuracy by question", ylab = "Accuracy relative to human")
beeswarm(la, add = TRUE, cex = 2, col = "gray", pch = 19)
Now drill down to the model accuracy.
# Mean accuracy per model (averaged over the score columns) for each article
# set and for both sets pooled.
aa <- list(
  "Set1" = rowMeans(a1),
  "Set2" = rowMeans(a2),
  "Both" = rowMeans(a12)
)
# One row per set, one column per model.
aamx <- do.call(rbind, aa)
# Order the models from lowest to highest mean across the three rows.
aamx2 <- aamx[, order(colMeans(aamx))]

par(mar = c(8.9, 4.1, 4.1, 2.1)) # larger bottom margin for the model names
mycols <- c("gray20", "gray50", "gray90")
# Bars start at 0.5 instead of 0: heights are reduced by 0.5 and `offset`
# shifts them back up, so the axis still shows the actual accuracy values.
barplot(aamx2 - 0.5, beside = TRUE, ylim = c(0.5, 1), offset = 0.5,
        col = mycols, las = 3, ylab = "Accuracy relative to human")
# abline() accepts a vector of heights, so a single call draws every guide
# line without printing a list of NULL return values into the report.
abline(h = seq(0.5, 1, 0.1), lty = 2, lwd = 0.5, col = "gray")
legend("topleft", inset = .02, c("Set1", "Set2", "Both"),
       fill = mycols, horiz = TRUE, cex = 0.8)
For reproducibility.
# Print the R version, platform, locale and attached/loaded package versions.
sessionInfo()
## R version 4.5.1 (2025-06-13)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 24.04.3 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so; LAPACK version 3.12.0
##
## locale:
## [1] LC_CTYPE=en_AU.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_AU.UTF-8 LC_COLLATE=en_AU.UTF-8
## [5] LC_MONETARY=en_AU.UTF-8 LC_MESSAGES=en_AU.UTF-8
## [7] LC_PAPER=en_AU.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Australia/Melbourne
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] beeswarm_0.4.0 kableExtra_1.4.0
##
## loaded via a namespace (and not attached):
## [1] vctrs_0.6.5 svglite_2.2.1 cli_3.6.5 knitr_1.50
## [5] rlang_1.1.6 xfun_0.52 stringi_1.8.7 textshaping_1.0.1
## [9] jsonlite_2.0.0 glue_1.8.0 htmltools_0.5.8.1 sass_0.4.10
## [13] scales_1.4.0 rmarkdown_2.29 evaluate_1.0.4 jquerylib_0.1.4
## [17] fastmap_1.2.0 yaml_2.3.10 lifecycle_1.0.4 stringr_1.5.1
## [21] compiler_4.5.1 RColorBrewer_1.1-3 rstudioapi_0.17.1 systemfonts_1.2.3
## [25] farver_2.1.2 digest_0.6.37 viridisLite_0.4.2 R6_2.6.1
## [29] dichromat_2.0-0.1 magrittr_2.0.3 bslib_0.9.0 tools_4.5.1
## [33] xml2_1.3.8 cachem_1.1.0