Twenty PMC articles were selected randomly from the supplementary file of our 2022 article (pkg1), and a further 20 were hand-selected to offer more diverse methods, including tools, background use and FDR (pkg2).

library("kableExtra")
library("beeswarm")
# Render every figure produced by the chunks of this document as SVG
knitr::opts_chunk$set(dev = "svg")

# Load the two tab-separated input tables (header row, double-quote quoting)
pkg1 <- read.table(file = "pkg1.tsv", header = TRUE, sep = "\t", quote = '"')
pkg2 <- read.table(file = "pkg2.tsv", header = TRUE, sep = "\t", quote = '"')

# Discard the rows labelled "human" so that only model rows remain
# (presumably the human reference annotations -- confirm against the input).
# which() drops NA comparisons, matching the behaviour of subset().
pkg1 <- pkg1[which(pkg1$Model != "human"), ]
pkg2 <- pkg2[which(pkg2$Model != "human"), ]

# Keep the first column (Model) together with columns 8 to 13 (the six scores)
pkg1 <- pkg1[c(1, 8:13)]
pkg2 <- pkg2[c(1, 8:13)]

# Mean of every score column per model.
# NOTE(review): the formula interface of aggregate() omits rows that contain
# an NA in any column before averaging.
a1 <- aggregate(. ~ Model, data = pkg1, FUN = mean)
a2 <- aggregate(. ~ Model, data = pkg2, FUN = mean)

# Per-model mean score for each question, first article set (pkg1)
a1 |>
  kbl(caption = "pkg1") |>
  kable_paper("hover", full_width = FALSE)
pkg1
Model Tool.Used.1 Tool.version.1 Gene.Set.library.1 Background.List.1 Stat.test.used.1 FDR.used.1
deepseek-r1:70b 0.4550 0.850 0.6750 0.80 0.800 0.90
gemma3:27b 0.5400 0.850 0.6000 0.85 0.325 0.40
GPT-4.1 0.9500 1.000 0.9585 0.95 0.950 0.90
GPT-4.1-mini 0.9900 1.000 0.8650 0.90 0.900 0.90
GPT-4.1-nano 0.8750 1.000 0.7500 1.00 0.925 0.90
gpt-oss:120b 0.6650 0.800 0.7585 0.90 1.000 1.00
gpt-oss:20b 0.5985 0.775 0.6250 0.85 0.900 0.90
Haiku3.5 0.9650 1.000 0.8900 0.90 0.900 0.90
llama3.3:70b 0.5235 0.850 0.7000 0.85 0.875 0.90
mistral-small3.1:24b 0.4485 0.800 0.3250 0.80 0.850 0.85
phi4:14b 0.3800 0.850 0.4820 0.80 0.825 0.85
qwen3:32b 0.4485 0.750 0.6085 0.80 0.875 0.90
qwq:32b 0.5050 0.750 0.6585 0.85 0.900 0.80
Sonnet4 1.0000 1.000 0.8835 0.90 1.000 0.90
# Per-model mean score for each question, second article set (pkg2)
a2 |>
  kbl(caption = "pkg2") |>
  kable_paper("hover", full_width = FALSE)
pkg2
Model Tool.Used.1 Tool.version.1 Gene.Set.library.1 Background.List.1 Stat.test.used.1 FDR.used.1
deepseek-r1:70b 0.7375 0.850 0.6725 0.55 0.600 0.50
gemma3:27b 0.6335 0.825 0.6875 0.40 0.300 0.75
GPT-4.1 0.8165 0.950 0.8395 0.85 0.875 0.95
GPT-4.1-mini 0.8080 0.925 0.7745 0.90 0.875 0.95
GPT-4.1-nano 0.9250 0.900 0.6620 0.75 0.925 0.95
gpt-oss:120b 0.7000 0.850 0.8295 0.70 0.650 0.70
gpt-oss:20b 0.8250 0.800 0.8040 0.60 0.700 0.60
Haiku3.5 0.8250 0.950 0.6725 0.60 0.925 0.95
llama3.3:70b 0.6670 0.850 0.7460 0.60 0.750 0.65
mistral-small3.1:24b 0.4750 0.750 0.4455 0.45 0.700 0.40
phi4:14b 0.5250 0.850 0.6585 0.50 0.650 0.70
qwen3:32b 0.4750 0.850 0.5625 0.50 0.700 0.55
qwq:32b 0.4665 0.850 0.6795 0.55 0.600 0.75
Sonnet4 0.8500 0.950 0.7645 1.00 0.975 0.95
# Stack both article sets and recompute the per-model mean of every score column
p12 <- rbind(pkg1, pkg2)
a12 <- aggregate(. ~ Model, data = p12, FUN = mean)

# Per-model mean score for each question across both article sets
a12 |>
  kbl(caption = "pkg1 and pkg2 combined") |>
  kable_paper("hover", full_width = FALSE)
pkg1 and pkg2 combined
Model Tool.Used.1 Tool.version.1 Gene.Set.library.1 Background.List.1 Stat.test.used.1 FDR.used.1
deepseek-r1:70b 0.59625 0.8500 0.67375 0.675 0.7000 0.700
gemma3:27b 0.58675 0.8375 0.64375 0.625 0.3125 0.575
GPT-4.1 0.88325 0.9750 0.89900 0.900 0.9125 0.925
GPT-4.1-mini 0.89900 0.9625 0.81975 0.900 0.8875 0.925
GPT-4.1-nano 0.90000 0.9500 0.70600 0.875 0.9250 0.925
gpt-oss:120b 0.68250 0.8250 0.79400 0.800 0.8250 0.850
gpt-oss:20b 0.71175 0.7875 0.71450 0.725 0.8000 0.750
Haiku3.5 0.89500 0.9750 0.78125 0.750 0.9125 0.925
llama3.3:70b 0.59525 0.8500 0.72300 0.725 0.8125 0.775
mistral-small3.1:24b 0.46175 0.7750 0.38525 0.625 0.7750 0.625
phi4:14b 0.45250 0.8500 0.57025 0.650 0.7375 0.775
qwen3:32b 0.46175 0.8000 0.58550 0.650 0.7875 0.725
qwq:32b 0.48575 0.8000 0.66900 0.700 0.7500 0.775
Sonnet4 0.92500 0.9750 0.82400 0.950 0.9875 0.925

Get some summary stats.

# Use the model name as the row label, then drop the first column (Model)
# so that only numeric score columns are left in each table.
rownames(a1) <- a1[["Model"]]
a1[[1]] <- NULL

rownames(a2) <- a2[["Model"]]
a2[[1]] <- NULL

rownames(a12) <- a12[["Model"]]
a12[[1]] <- NULL

# Mean over all models of each question's score, per article set
al <- list(
  Set1 = colMeans(a1),
  Set2 = colMeans(a2),
  Both = colMeans(a12)
)

al
## $Set1
##        Tool.Used.1     Tool.version.1 Gene.Set.library.1  Background.List.1 
##          0.6674286          0.8767857          0.6985357          0.8678571 
##   Stat.test.used.1         FDR.used.1 
##          0.8589286          0.8571429 
## 
## $Set2
##        Tool.Used.1     Tool.version.1 Gene.Set.library.1  Background.List.1 
##          0.6949286          0.8678571          0.6998929          0.6392857 
##   Stat.test.used.1         FDR.used.1 
##          0.7303571          0.7392857 
## 
## $Both
##        Tool.Used.1     Tool.version.1 Gene.Set.library.1  Background.List.1 
##          0.6811786          0.8723214          0.6992143          0.7535714 
##   Stat.test.used.1         FDR.used.1 
##          0.7946429          0.7982143
# One box per article set, summarising the per-question mean accuracies in `al`
boxplot(al, ylim = c(0.6, 0.9), col = "white", frame = FALSE,
  main = "Accuracy by set", ylab = "Accuracy relative to human")
# Overlay the individual per-question values on top of the boxes
beeswarm(al, add = TRUE, cex = 2, col = "gray", pch = 19)

# Stack the per-set mean vectors row-wise: one row per set, one column per question
amx <- do.call(what = rbind, args = al)
amx
##      Tool.Used.1 Tool.version.1 Gene.Set.library.1 Background.List.1
## Set1   0.6674286      0.8767857          0.6985357         0.8678571
## Set2   0.6949286      0.8678571          0.6998929         0.6392857
## Both   0.6811786      0.8723214          0.6992143         0.7535714
##      Stat.test.used.1 FDR.used.1
## Set1        0.8589286  0.8571429
## Set2        0.7303571  0.7392857
## Both        0.7946429  0.7982143
# Regroup the per-set means by question: one list element per question, each
# holding that question's mean for every set. vapply() carries the set names
# over from names(al), so they do not need to be assigned separately.
la <- lapply(seq_along(al[[1]]), function(i) {
  vapply(al, function(x) x[[i]], numeric(1))
})
names(la) <- c("Tool", "Vers", "GeneSet", "Background", "StatTest", "FDR")

# Shrink the axis annotation text for the plots that follow
par(cex.axis = 0.8)
# One box per question, summarising that question's mean accuracy across the sets
boxplot(la, ylim = c(0.6, 0.9), col = "white", frame = FALSE,
  main = "Accuracy by question", ylab = "Accuracy relative to human")
# Overlay the individual per-set values on top of the boxes
beeswarm(la, add = TRUE, cex = 2, col = "gray", pch = 19)

Now drill down to the model accuracy.

# Mean accuracy per model (averaged over the score columns) for each article set
aa <- list(Set1 = rowMeans(a1), Set2 = rowMeans(a2), Both = rowMeans(a12))

# Rows = sets, columns = models. rbind() aligns the vectors by position, so this
# relies on a1, a2 and a12 listing the same models in the same order.
aamx <- do.call(rbind, aa)
# Order the models (columns) by their mean over the three rows, ascending
aamx2 <- aamx[, order(colMeans(aamx))]

# Larger bottom margin for the vertical (las = 3) model labels
par(mar = c(8.9, 4.1, 4.1, 2.1))

mycols <- c("gray20", "gray50", "gray90")

# The y axis starts at 0.5: heights are shifted down by 0.5 and `offset` moves
# the bars back up, so each bar spans from 0.5 to the model's accuracy.
barplot(aamx2 - 0.5, beside = TRUE, ylim = c(0.5, 1), offset = 0.5,
  col = mycols, las = 3, ylab = "Accuracy relative to human")

# Horizontal guide lines every 0.1; abline() accepts a vector of heights
abline(h = seq(0.5, 1, 0.1), lty = 2, lwd = 0.5, col = "gray")

legend("topleft", inset = 0.02, legend = c("Set1", "Set2", "Both"),
  fill = mycols, horiz = TRUE, cex = 0.8)

Session information

For reproducibility.

# Print the R version, platform and attached/loaded package versions
utils::sessionInfo()
## R version 4.5.1 (2025-06-13)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 24.04.3 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so;  LAPACK version 3.12.0
## 
## locale:
##  [1] LC_CTYPE=en_AU.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_AU.UTF-8        LC_COLLATE=en_AU.UTF-8    
##  [5] LC_MONETARY=en_AU.UTF-8    LC_MESSAGES=en_AU.UTF-8   
##  [7] LC_PAPER=en_AU.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Australia/Melbourne
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] beeswarm_0.4.0   kableExtra_1.4.0
## 
## loaded via a namespace (and not attached):
##  [1] vctrs_0.6.5        svglite_2.2.1      cli_3.6.5          knitr_1.50        
##  [5] rlang_1.1.6        xfun_0.52          stringi_1.8.7      textshaping_1.0.1 
##  [9] jsonlite_2.0.0     glue_1.8.0         htmltools_0.5.8.1  sass_0.4.10       
## [13] scales_1.4.0       rmarkdown_2.29     evaluate_1.0.4     jquerylib_0.1.4   
## [17] fastmap_1.2.0      yaml_2.3.10        lifecycle_1.0.4    stringr_1.5.1     
## [21] compiler_4.5.1     RColorBrewer_1.1-3 rstudioapi_0.17.1  systemfonts_1.2.3 
## [25] farver_2.1.2       digest_0.6.37      viridisLite_0.4.2  R6_2.6.1          
## [29] dichromat_2.0-0.1  magrittr_2.0.3     bslib_0.9.0        tools_4.5.1       
## [33] xml2_1.3.8         cachem_1.1.0