Accuracy of open and proprietary LLMs in analysing the methodology of journal articles

Twenty PMC articles were selected randomly from the supplementary file of our 2022 article (pkg1), and a further 20 were hand-selected to offer more diverse methods, including tools, background use and FDR (pkg2).

# kableExtra supplies kbl() and kable_paper(), used for the summary tables.
library("kableExtra")
# beeswarm supplies beeswarm(), used to overlay points on the boxplots.
library("beeswarm")
knitr::opts_chunk$set(dev = 'svg') # set output device to svg

# Columns kept for the analysis: the model identifier plus the six
# per-question score columns. They are selected by name rather than by
# position so that a reordered or extended TSV fails loudly instead of
# silently picking up the wrong columns.
score_cols <- c(
  "Model", "Tool.Used.1", "Tool.version.1", "Gene.Set.library.1",
  "Background.List.1", "Stat.test.used.1", "FDR.used.1"
)

# Read one tab-separated scoring sheet, drop the rows whose Model is "human",
# and keep only the model name and score columns.
read_scores <- function(path) {
  scores <- read.table(path, sep = "\t", header = TRUE, quote = '"')
  scores <- subset(scores, Model != "human")
  scores[, score_cols]
}

pkg1 <- read_scores("pkg1.tsv")
pkg2 <- read_scores("pkg2.tsv")

# Mean of every score column per model, computed separately for each set.
a1 <- aggregate(. ~ Model, pkg1, mean)
a2 <- aggregate(. ~ Model, pkg2, mean)

a1 |>
  kbl(caption = "pkg1") |>
  kable_paper("hover", full_width = FALSE)

pkg1
Model	Tool.Used.1	Tool.version.1	Gene.Set.library.1	Background.List.1	Stat.test.used.1	FDR.used.1
deepseek-r1:70b	0.4550	0.850	0.6750	0.80	0.800	0.90
gemma3:27b	0.5400	0.850	0.6000	0.85	0.325	0.40
GPT-4.1	0.9500	1.000	0.9585	0.95	0.950	0.90
GPT-4.1-mini	0.9900	1.000	0.8650	0.90	0.900	0.90
GPT-4.1-nano	0.8750	1.000	0.7500	1.00	0.925	0.90
gpt-oss:120b	0.6650	0.800	0.7585	0.90	1.000	1.00
gpt-oss:20b	0.5985	0.775	0.6250	0.85	0.900	0.90
Haiku3.5	0.9650	1.000	0.8900	0.90	0.900	0.90
llama3.3:70b	0.5235	0.850	0.7000	0.85	0.875	0.90
mistral-small3.1:24b	0.4485	0.800	0.3250	0.80	0.850	0.85
phi4:14b	0.3800	0.850	0.4820	0.80	0.825	0.85
qwen3:32b	0.4485	0.750	0.6085	0.80	0.875	0.90
qwq:32b	0.5050	0.750	0.6585	0.85	0.900	0.80
Sonnet4	1.0000	1.000	0.8835	0.90	1.000	0.90

# Per-model mean scores for the second article set. The caption names pkg2,
# the data set that a2 was aggregated from.
a2 |>
  kbl(caption = "pkg2") |>
  kable_paper("hover", full_width = FALSE)

pkg2
Model	Tool.Used.1	Tool.version.1	Gene.Set.library.1	Background.List.1	Stat.test.used.1	FDR.used.1
deepseek-r1:70b	0.7375	0.850	0.6725	0.55	0.600	0.50
gemma3:27b	0.6335	0.825	0.6875	0.40	0.300	0.75
GPT-4.1	0.8165	0.950	0.8395	0.85	0.875	0.95
GPT-4.1-mini	0.8080	0.925	0.7745	0.90	0.875	0.95
GPT-4.1-nano	0.9250	0.900	0.6620	0.75	0.925	0.95
gpt-oss:120b	0.7000	0.850	0.8295	0.70	0.650	0.70
gpt-oss:20b	0.8250	0.800	0.8040	0.60	0.700	0.60
Haiku3.5	0.8250	0.950	0.6725	0.60	0.925	0.95
llama3.3:70b	0.6670	0.850	0.7460	0.60	0.750	0.65
mistral-small3.1:24b	0.4750	0.750	0.4455	0.45	0.700	0.40
phi4:14b	0.5250	0.850	0.6585	0.50	0.650	0.70
qwen3:32b	0.4750	0.850	0.5625	0.50	0.700	0.55
qwq:32b	0.4665	0.850	0.6795	0.55	0.600	0.75
Sonnet4	0.8500	0.950	0.7645	1.00	0.975	0.95

# Pool both article sets and recompute the per-model means over all rows.
p12 <- rbind(pkg1, pkg2)
a12 <- aggregate(. ~ Model, p12, mean)

# The caption names both sets because a12 is computed from the pooled data.
a12 |>
  kbl(caption = "pkg1 and pkg2 combined") |>
  kable_paper("hover", full_width = FALSE)

pkg1 and pkg2 combined
Model	Tool.Used.1	Tool.version.1	Gene.Set.library.1	Background.List.1	Stat.test.used.1	FDR.used.1
deepseek-r1:70b	0.59625	0.8500	0.67375	0.675	0.7000	0.700
gemma3:27b	0.58675	0.8375	0.64375	0.625	0.3125	0.575
GPT-4.1	0.88325	0.9750	0.89900	0.900	0.9125	0.925
GPT-4.1-mini	0.89900	0.9625	0.81975	0.900	0.8875	0.925
GPT-4.1-nano	0.90000	0.9500	0.70600	0.875	0.9250	0.925
gpt-oss:120b	0.68250	0.8250	0.79400	0.800	0.8250	0.850
gpt-oss:20b	0.71175	0.7875	0.71450	0.725	0.8000	0.750
Haiku3.5	0.89500	0.9750	0.78125	0.750	0.9125	0.925
llama3.3:70b	0.59525	0.8500	0.72300	0.725	0.8125	0.775
mistral-small3.1:24b	0.46175	0.7750	0.38525	0.625	0.7750	0.625
phi4:14b	0.45250	0.8500	0.57025	0.650	0.7375	0.775
qwen3:32b	0.46175	0.8000	0.58550	0.650	0.7875	0.725
qwq:32b	0.48575	0.8000	0.66900	0.700	0.7500	0.775
Sonnet4	0.92500	0.9750	0.82400	0.950	0.9875	0.925

Get some summary stats.

# Use the model name as the row label of each table and then drop the Model
# column, so that only the numeric score columns remain.
rownames(a1) <- a1$Model
a1$Model <- NULL

rownames(a2) <- a2$Model
a2$Model <- NULL

rownames(a12) <- a12$Model
a12$Model <- NULL

# Average each score column over all models, once per article set.
al <- lapply(list("Set1" = a1, "Set2" = a2, "Both" = a12), colMeans)

al

## $Set1
##        Tool.Used.1     Tool.version.1 Gene.Set.library.1  Background.List.1 
##          0.6674286          0.8767857          0.6985357          0.8678571 
##   Stat.test.used.1         FDR.used.1 
##          0.8589286          0.8571429 
## 
## $Set2
##        Tool.Used.1     Tool.version.1 Gene.Set.library.1  Background.List.1 
##          0.6949286          0.8678571          0.6998929          0.6392857 
##   Stat.test.used.1         FDR.used.1 
##          0.7303571          0.7392857 
## 
## $Both
##        Tool.Used.1     Tool.version.1 Gene.Set.library.1  Background.List.1 
##          0.6811786          0.8723214          0.6992143          0.7535714 
##   Stat.test.used.1         FDR.used.1 
##          0.7946429          0.7982143

# One box per article set, summarising that set's six per-question means.
# `beside` is a barplot() argument with no effect on boxplot(), so it is
# not passed here.
boxplot(al, ylim = c(0.6, 0.9), col = "white", frame = FALSE,
  main = "Accuracy by set", ylab = "Accuracy relative to human")
# Overlay the individual per-question values on the boxes.
beeswarm(al, add = TRUE, cex = 2, col = "gray", pch = 19)

# Stack the per-set vectors into a sets-by-questions matrix and print it.
amx <- do.call(rbind, al)
amx

##      Tool.Used.1 Tool.version.1 Gene.Set.library.1 Background.List.1
## Set1   0.6674286      0.8767857          0.6985357         0.8678571
## Set2   0.6949286      0.8678571          0.6998929         0.6392857
## Both   0.6811786      0.8723214          0.6992143         0.7535714
##      Stat.test.used.1 FDR.used.1
## Set1        0.8589286  0.8571429
## Set2        0.7303571  0.7392857
## Both        0.7946429  0.7982143

# Regroup the summary by question: one list entry per question, each holding
# that question's mean for Set1, Set2 and Both.
question_labels <- c(
  "Tool", "Vers", "GeneSet", "Background", "StatTest", "FDR"
)
# Guard against the short labels drifting out of step with the score columns.
stopifnot(length(question_labels) == length(al[[1]]))

la <- lapply(seq_along(al[[1]]), function(i) {
  # vapply() carries over the names of `al` (Set1, Set2, Both), so the
  # values need no manual relabelling afterwards.
  vapply(al, function(x) x[[i]], numeric(1))
})
names(la) <- question_labels

# Reduce the axis text size for this six-group plot.
par(cex.axis = 0.8)
# `beside` is a barplot() argument with no effect on boxplot(), so it is
# not passed here.
boxplot(la, ylim = c(0.6, 0.9), col = "white", frame = FALSE,
  main = "Accuracy by question", ylab = "Accuracy relative to human")
# Overlay the three per-set values on each box.
beeswarm(la, add = TRUE, cex = 2, col = "gray", pch = 19)

Now drill down to the model accuracy.

# Mean over all score columns for every model, once per article set.
aa <- list(
  "Set1" = rowMeans(a1),
  "Set2" = rowMeans(a2),
  "Both" = rowMeans(a12)
)

# Sets-by-models matrix, with the model columns ordered from the lowest to
# the highest mean across the three rows.
aamx <- do.call(rbind, aa)
aamx2 <- aamx[, order(colMeans(aamx))]

# Bottom margin is larger than the others; the model names are drawn
# perpendicular to the axis (las = 3) below.
par(mar = c(8.9, 4.1, 4.1, 2.1))

mycols <- c("gray20", "gray50", "gray90")

# Bars start at a baseline of 0.5: the heights are reduced by 0.5 and
# `offset` shifts them back up, so the y axis still reads the real values.
barplot(aamx2 - 0.5, beside = TRUE, ylim = c(0.5, 1), offset = 0.5,
  col = mycols, las = 3, ylab = "Accuracy relative to human")

# abline() accepts a vector for `h`, so a single call draws every guide line
# and no list of NULL results is printed into the report.
abline(h = seq(0.5, 1, 0.1), lty = 2, lwd = 0.5, col = "gray")

## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL
## 
## [[5]]
## NULL
## 
## [[6]]
## NULL

# Key mapping the three bar fills to the article sets.
set_labels <- c("Set1", "Set2", "Both")
legend(
  "topleft",
  legend = set_labels,
  fill = mycols,
  horiz = TRUE,
  inset = 0.02,
  cex = 0.8
)

Session information

For reproducibility.

# Print the R version, platform and attached/loaded package versions.
sessionInfo()

## R version 4.5.1 (2025-06-13)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 24.04.3 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so;  LAPACK version 3.12.0
## 
## locale:
##  [1] LC_CTYPE=en_AU.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_AU.UTF-8        LC_COLLATE=en_AU.UTF-8    
##  [5] LC_MONETARY=en_AU.UTF-8    LC_MESSAGES=en_AU.UTF-8   
##  [7] LC_PAPER=en_AU.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Australia/Melbourne
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] beeswarm_0.4.0   kableExtra_1.4.0
## 
## loaded via a namespace (and not attached):
##  [1] vctrs_0.6.5        svglite_2.2.1      cli_3.6.5          knitr_1.50        
##  [5] rlang_1.1.6        xfun_0.52          stringi_1.8.7      textshaping_1.0.1 
##  [9] jsonlite_2.0.0     glue_1.8.0         htmltools_0.5.8.1  sass_0.4.10       
## [13] scales_1.4.0       rmarkdown_2.29     evaluate_1.0.4     jquerylib_0.1.4   
## [17] fastmap_1.2.0      yaml_2.3.10        lifecycle_1.0.4    stringr_1.5.1     
## [21] compiler_4.5.1     RColorBrewer_1.1-3 rstudioapi_0.17.1  systemfonts_1.2.3 
## [25] farver_2.1.2       digest_0.6.37      viridisLite_0.4.2  R6_2.6.1          
## [29] dichromat_2.0-0.1  magrittr_2.0.3     bslib_0.9.0        tools_4.5.1       
## [33] xml2_1.3.8         cachem_1.1.0

Accuracy of open and proprietary LLMs in analysing the methodology of journal articles

Burnet Bioinformatics group

2025-09-02

Session information