Source: http://118.138.234.73/public/gene_name_errors/manuscript_files/results_summary.html

library("dplyr")
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library("kableExtra")
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library("eulerr")
library("Cairo")
options(bitmapType="cairo")

Introduction

The bash script “gene_names.sh” was used to screen a list of PMC articles. Here we process and analyse the output of that script to generate the figures and tables needed for the paper.

Number of publications

The number of publications available in PubMed Central for each year from 2014 to 2020. These publications were selected by searching PubMed Central with the keyword “genom”.

# Number of PMC articles returned by the "genom" search for one year.
# The yearly listing pmc/genom/pmc<year>.txt is read as a table and its
# row count is used as the publication count.
count_pmc_articles <- function(year) {
  listing <- file.path("pmc/genom", paste0("pmc", year, ".txt"))
  nrow(read.table(listing))
}

pmc2014 <- count_pmc_articles(2014)
pmc2015 <- count_pmc_articles(2015)
pmc2016 <- count_pmc_articles(2016)
pmc2017 <- count_pmc_articles(2017)
pmc2018 <- count_pmc_articles(2018)
pmc2019 <- count_pmc_articles(2019)
pmc2020 <- count_pmc_articles(2020)

# `pmc` holds the seven yearly counts followed by the grand total.
total_pmc <- sum(pmc2014, pmc2015, pmc2016, pmc2017, pmc2018, pmc2019, pmc2020)
pmc <- c(pmc2014, pmc2015, pmc2016, pmc2017, pmc2018, pmc2019, pmc2020,
  total_pmc)

Number of supplementary files

The publications returned by the “genom” keyword search for 2014 to 2020 were shortlisted to those with supplementary files, and these supplementary files were downloaded.

Supplementary files whose names end in “.xls” or “.xlsx” were assumed to be Excel files.

# Number of distinct values found in the first space-separated field of the
# lines of one yearly results file.
# vapply() is used instead of sapply() so the extracted field is always a
# character vector, even when the file has no lines.
count_first_field <- function(result_lines) {
  first_field <- vapply(
    strsplit(result_lines, " "),
    function(fields) fields[[1]],
    character(1)
  )
  length(unique(first_field))
}

# The raw lines (res<year>) are kept because later chunks re-parse them.
res2014 <- readLines("results/genom/results2014.txt")
res2015 <- readLines("results/genom/results2015.txt")
res2016 <- readLines("results/genom/results2016.txt")
res2017 <- readLines("results/genom/results2017.txt")
res2018 <- readLines("results/genom/results2018.txt")
res2019 <- readLines("results/genom/results2019.txt")
res2020 <- readLines("results/genom/results2020.txt")

res2014_length <- count_first_field(res2014)
res2015_length <- count_first_field(res2015)
res2016_length <- count_first_field(res2016)
res2017_length <- count_first_field(res2017)
res2018_length <- count_first_field(res2018)
res2019_length <- count_first_field(res2019)
res2020_length <- count_first_field(res2020)

# `res` holds the seven yearly counts followed by the grand total.
total_res <- sum(res2014_length, res2015_length, res2016_length,
  res2017_length, res2018_length, res2019_length, res2020_length)
res <- c(res2014_length, res2015_length, res2016_length, res2017_length,
  res2018_length, res2019_length, res2020_length, total_res)

Excel files with gene lists

All the supplementary files downloaded from 2014 to 2020 were searched for gene lists by examining the length of the result recorded for each file.

Each result line is split on spaces; if a line has more than 2 fields, the corresponding supplementary file is counted as containing a gene list.

# Number of distinct Excel files with a gene list in one year's results.
# `fields` is the strsplit() output: one character vector per result line.
# Only lines with more than two fields are kept, and the second field of
# those lines is de-duplicated and counted.
count_gene_list_files <- function(fields) {
  with_gene_list <- fields[lengths(fields) > 2]
  second_field <- vapply(with_gene_list, function(f) f[[2]], character(1))
  length(unique(second_field))
}

# The split lines (res<year>_l) are kept because later chunks reuse them.
res2014_l <- strsplit(res2014, " ")
res2015_l <- strsplit(res2015, " ")
res2016_l <- strsplit(res2016, " ")
res2017_l <- strsplit(res2017, " ")
res2018_l <- strsplit(res2018, " ")
res2019_l <- strsplit(res2019, " ")
res2020_l <- strsplit(res2020, " ")

res2014_xlg <- count_gene_list_files(res2014_l)
res2015_xlg <- count_gene_list_files(res2015_l)
res2016_xlg <- count_gene_list_files(res2016_l)
res2017_xlg <- count_gene_list_files(res2017_l)
res2018_xlg <- count_gene_list_files(res2018_l)
res2019_xlg <- count_gene_list_files(res2019_l)
res2020_xlg <- count_gene_list_files(res2020_l)

total_pub_xlg <- sum(res2014_xlg, res2015_xlg, res2016_xlg, res2017_xlg,
  res2018_xlg, res2019_xlg, res2020_xlg)

# Seven yearly counts followed by the grand total.
pub_xlg <- c(res2014_xlg, res2015_xlg, res2016_xlg, res2017_xlg,
  res2018_xlg, res2019_xlg, res2020_xlg, total_pub_xlg)

Publications with Excel gene lists

The Excel files with gene lists were then traced back to their publications, so that each publication with at least one Excel gene list is counted once.

# Number of distinct publications with at least one Excel gene list.
# `fields` is the strsplit() output: one character vector per result line.
# Only lines with more than two fields are kept, and the first field of
# those lines is de-duplicated and counted.
count_gene_list_pubs <- function(fields) {
  with_gene_list <- fields[lengths(fields) > 2]
  first_field <- vapply(with_gene_list, function(f) f[[1]], character(1))
  length(unique(first_field))
}

# The split lines (res<year>_l) are kept because later chunks reuse them.
res2014_l <- strsplit(res2014, " ")
res2015_l <- strsplit(res2015, " ")
res2016_l <- strsplit(res2016, " ")
res2017_l <- strsplit(res2017, " ")
res2018_l <- strsplit(res2018, " ")
res2019_l <- strsplit(res2019, " ")
res2020_l <- strsplit(res2020, " ")

res2014_lx <- count_gene_list_pubs(res2014_l)
res2015_lx <- count_gene_list_pubs(res2015_l)
res2016_lx <- count_gene_list_pubs(res2016_l)
res2017_lx <- count_gene_list_pubs(res2017_l)
res2018_lx <- count_gene_list_pubs(res2018_l)
res2019_lx <- count_gene_list_pubs(res2019_l)
res2020_lx <- count_gene_list_pubs(res2020_l)

# Seven yearly counts followed by the grand total.
total_pub_xls <- sum(res2014_lx, res2015_lx, res2016_lx, res2017_lx,
  res2018_lx, res2019_lx, res2020_lx)
pub_xls <- c(res2014_lx, res2015_lx, res2016_lx, res2017_lx,
  res2018_lx, res2019_lx, res2020_lx, total_pub_xls)

Publications with suspected gene name errors

The resulted publications with excel gene list were run in a shell scripted software where each selected publications with excel gene list were screened for their gene symbols.

Erroneous conversions of gene symbols, such as dates, numbers in scientific notation and five-digit numbers, were flagged in the gene lists as gene name errors, and the results were recorded and saved as “aggregated_res”.

Each line of the aggregated_res file consists of the PMCID, the journal name, the name of the yearly results file (for example results2014.txt) and the path to the affected file.

Total suspected publications with gene name errors were detected from aggregated_res file and aggregated_res file was analysed in a manner where publications with gene name errors for each year were also extracted.

aggregated_res <- readLines("results/genom/aggregated_res.txt")
head(aggregated_res)
## [1] "PMC3879165 PLoS_Genet results2014.txt /pmc/articles/PMC3879165/bin/pgen.1004006.s005.xlsx"
## [2] "PMC3886906 PLoS_Genet results2014.txt /pmc/articles/PMC3886906/bin/pgen.1004079.s012.xlsx"
## [3] "PMC3894565 Sci_Rep results2014.txt /pmc/articles/PMC3894565/bin/srep03692-s1.xls"
## [4] "PMC3894565 Sci_Rep results2014.txt /pmc/articles/PMC3894565/bin/srep03692-s2.xls"
## [5] "PMC3894996 PLoS_One results2014.txt /pmc/articles/PMC3894996/bin/pone.0085599.s003.xlsx"
## [6] "PMC3897657 PLoS_One results2014.txt /pmc/articles/PMC3897657/bin/pone.0086220.s002.xlsx"

# Split every line once and pick out the fields that are needed:
# field 1 = PMC ID, field 3 = yearly results file name, field 4 = file path.
aggregated_fields <- strsplit(aggregated_res, " ")
sus_pub <- vapply(aggregated_fields, function(f) f[[1]], character(1))
sus_pub_year <- vapply(aggregated_fields, function(f) f[[3]], character(1))
sus_pub_file <- vapply(aggregated_fields, function(f) f[[4]], character(1))

# Distinct publications flagged across all years.
total_sus_pub <- length(unique(sus_pub))
total_sus_pub
## [1] 3470
suspected_df <- data.frame(sus_pub, sus_pub_year, sus_pub_file)

# Rows are assigned to a year through the name of the results file they
# came from; each publication is counted once per year.
sus_pub_2014_df <- subset(suspected_df, sus_pub_year == "results2014.txt")
sus_pub_2014 <- length(unique(sus_pub_2014_df$sus_pub))
sus_pub_2015_df <- subset(suspected_df, sus_pub_year == "results2015.txt")
sus_pub_2015 <- length(unique(sus_pub_2015_df$sus_pub))
sus_pub_2016_df <- subset(suspected_df, sus_pub_year == "results2016.txt")
sus_pub_2016 <- length(unique(sus_pub_2016_df$sus_pub))
sus_pub_2017_df <- subset(suspected_df, sus_pub_year == "results2017.txt")
sus_pub_2017 <- length(unique(sus_pub_2017_df$sus_pub))
sus_pub_2018_df <- subset(suspected_df, sus_pub_year == "results2018.txt")
sus_pub_2018 <- length(unique(sus_pub_2018_df$sus_pub))
sus_pub_2019_df <- subset(suspected_df, sus_pub_year == "results2019.txt")
sus_pub_2019 <- length(unique(sus_pub_2019_df$sus_pub))
sus_pub_2020_df <- subset(suspected_df, sus_pub_year == "results2020.txt")
sus_pub_2020 <- length(unique(sus_pub_2020_df$sus_pub))

# From here on `sus_pub` no longer holds PMC IDs: it is re-used for the
# seven yearly counts followed by the overall total.
sus_pub <- c(sus_pub_2014, sus_pub_2015, sus_pub_2016, sus_pub_2017,
  sus_pub_2018, sus_pub_2019, sus_pub_2020, total_sus_pub)

Affected XLS files

The aggregated_res file was analysed for the total number of spreadsheet files flagged with gene name errors, and the result was saved as “affected_xl”.

# One entry of aggregated_res corresponds to one flagged spreadsheet, so the
# number of extracted first fields is the number of flagged files.
flagged_ids <- sapply(strsplit(aggregated_res, " "), function(f) f[[1]])
affected_xl <- length(flagged_ids)
affected_xl
## [1] 5136

Affected files

In order to reconfirm the results obtained each spreadsheet in aggregated_res file was manually opened to validate accuracy of the data and final result was saved in “true_positive.txt” file.

The “true_positive.txt” file was read into “aff_xls”. The total number of true positive Excel files with gene name errors was obtained from it and saved as “total_aff_xls”.

“aff_xls” was then subset by year (2014 to 2020) to obtain the number of true positive Excel files for each year.

# Manually confirmed (true positive) spreadsheets, one row per file.
aff_xls <- read.table("results/genom/true_positive.txt")
total_aff_xls <- nrow(aff_xls)
total_aff_xls
## [1] 5086

# Files per year. Column V4 is compared with the year, and subset() is given
# the condition directly so that rows with a missing V4 are dropped rather
# than being counted as all-NA rows.
aff_xls_2014 <- nrow(subset(aff_xls, V4 == 2014))
aff_xls_2015 <- nrow(subset(aff_xls, V4 == 2015))
aff_xls_2016 <- nrow(subset(aff_xls, V4 == 2016))
aff_xls_2017 <- nrow(subset(aff_xls, V4 == 2017))
aff_xls_2018 <- nrow(subset(aff_xls, V4 == 2018))
aff_xls_2019 <- nrow(subset(aff_xls, V4 == 2019))
aff_xls_2020 <- nrow(subset(aff_xls, V4 == 2020))

# Seven yearly counts followed by the overall total.
aff_xls_files <- c(aff_xls_2014, aff_xls_2015, aff_xls_2016,
  aff_xls_2017, aff_xls_2018, aff_xls_2019,
  aff_xls_2020, total_aff_xls)

Affected publications

“aff_xls” was reanalysed to obtain the total number of true positive publications, and the result was saved as “total_aff_pub”. “aff_xls” was also subset by year (2014 to 2020) to obtain the number of true positive publications for each year.

# Re-read the confirmed spreadsheets; publications are identified by the
# first column (V1), so distinct V1 values are counted.
aff_xls <- read.table("results/genom/true_positive.txt")
total_aff_pub <- length(unique(aff_xls$V1))
total_aff_pub
## [1] 3436

# Yearly subsets (pub_<year>) are kept as data frames because later chunks
# use them again. subset() receives the condition directly so rows with a
# missing V4 are dropped instead of turning into all-NA rows.
pub_2014 <- subset(aff_xls, V4 == 2014)
pub_2015 <- subset(aff_xls, V4 == 2015)
pub_2016 <- subset(aff_xls, V4 == 2016)
pub_2017 <- subset(aff_xls, V4 == 2017)
pub_2018 <- subset(aff_xls, V4 == 2018)
pub_2019 <- subset(aff_xls, V4 == 2019)
pub_2020 <- subset(aff_xls, V4 == 2020)

aff_pub_2014 <- length(unique(pub_2014$V1))
aff_pub_2015 <- length(unique(pub_2015$V1))
aff_pub_2016 <- length(unique(pub_2016$V1))
aff_pub_2017 <- length(unique(pub_2017$V1))
aff_pub_2018 <- length(unique(pub_2018$V1))
aff_pub_2019 <- length(unique(pub_2019$V1))
aff_pub_2020 <- length(unique(pub_2020$V1))

# Seven yearly counts followed by the overall total.
aff_pub <- c(aff_pub_2014, aff_pub_2015, aff_pub_2016, aff_pub_2017,
  aff_pub_2018, aff_pub_2019, aff_pub_2020, total_aff_pub)

False positive XLS files

To confirm the results, each spreadsheet listed in the aggregated_res file was opened manually to validate the data. During this manual check several false positive Excel files were found, and these were saved in a text file named “false_positive.txt”.

The “false_positive.txt” file was read into “fp”. The total number of false positive files was obtained from “fp” and saved as “total_fp_xls”. “fp” was then subset by year (2014 to 2020) to obtain the number of false positive Excel files for each year.

# Spreadsheets that were flagged by the screen but rejected on manual review,
# one row per file.
fp <- read.table("results/genom/false_positive.txt")
total_fp_xls <- nrow(fp)
total_fp_xls
## [1] 50

# Yearly subsets and their row counts. subset() receives the condition
# directly so rows with a missing V4 are dropped instead of being counted
# as all-NA rows.
fp_2014 <- subset(fp, V4 == 2014)
fp_2015 <- subset(fp, V4 == 2015)
fp_2016 <- subset(fp, V4 == 2016)
fp_2017 <- subset(fp, V4 == 2017)
fp_2018 <- subset(fp, V4 == 2018)
fp_2019 <- subset(fp, V4 == 2019)
fp_2020 <- subset(fp, V4 == 2020)

fp_2014_xls <- nrow(fp_2014)
fp_2015_xls <- nrow(fp_2015)
fp_2016_xls <- nrow(fp_2016)
fp_2017_xls <- nrow(fp_2017)
fp_2018_xls <- nrow(fp_2018)
fp_2019_xls <- nrow(fp_2019)
fp_2020_xls <- nrow(fp_2020)

# Seven yearly counts followed by the overall total.
fp_xls <- c(fp_2014_xls, fp_2015_xls, fp_2016_xls,
  fp_2017_xls, fp_2018_xls, fp_2019_xls,
  fp_2020_xls, total_fp_xls)

False positive publications

False positive publications = suspected pubs - affected pubs

# False positive publications per year = suspected minus confirmed, computed
# for all seven years at once and then unpacked into the per-year names.
suspected_by_year <- c(sus_pub_2014, sus_pub_2015, sus_pub_2016, sus_pub_2017,
  sus_pub_2018, sus_pub_2019, sus_pub_2020)
confirmed_by_year <- c(aff_pub_2014, aff_pub_2015, aff_pub_2016, aff_pub_2017,
  aff_pub_2018, aff_pub_2019, aff_pub_2020)
fp_pub_by_year <- suspected_by_year - confirmed_by_year

fp_2014_pub <- fp_pub_by_year[1]
fp_2015_pub <- fp_pub_by_year[2]
fp_2016_pub <- fp_pub_by_year[3]
fp_2017_pub <- fp_pub_by_year[4]
fp_2018_pub <- fp_pub_by_year[5]
fp_2019_pub <- fp_pub_by_year[6]
fp_2020_pub <- fp_pub_by_year[7]

total_fp_pub <- sum(fp_pub_by_year)

# Seven yearly counts followed by their sum.
fp_pub <- c(fp_pub_by_year, total_fp_pub)

Proportion of Excel gene lists articles containing errors

“total_aff_pub” was divided by “total_pub_xls” and multiplied by 100 to find the proportion (%) of articles with Excel gene lists that contain gene name errors. The result was rounded to 3 significant figures and saved as “Total_percent_of_errors”. The same calculation was repeated for each year. The figures below are percentages.

# Share of publications with Excel gene lists that are affected, expressed
# as a percentage rounded to three significant figures.
percent_affected <- function(affected, with_gene_list) {
  signif(affected / with_gene_list * 100, 3)
}

Total_percent_of_errors <- percent_affected(total_aff_pub, total_pub_xls)
Total_percent_of_errors
## [1] 30.9
percentage_2014 <- percent_affected(aff_pub_2014, res2014_lx)
percentage_2015 <- percent_affected(aff_pub_2015, res2015_lx)
percentage_2016 <- percent_affected(aff_pub_2016, res2016_lx)
percentage_2017 <- percent_affected(aff_pub_2017, res2017_lx)
percentage_2018 <- percent_affected(aff_pub_2018, res2018_lx)
percentage_2019 <- percent_affected(aff_pub_2019, res2019_lx)
percentage_2020 <- percent_affected(aff_pub_2020, res2020_lx)

# Seven yearly percentages followed by the overall percentage.
percentages <- c(percentage_2014, percentage_2015, percentage_2016,
  percentage_2017, percentage_2018, percentage_2019, percentage_2020,
  Total_percent_of_errors)

Table 3 of the manuscript

Table 3. Results of a screen for gene name errors in PubMed Central.

# Row labels for Table 3: one row per screened year plus a final "Total" row.
years <- c("2014", "2015", "2016", "2017", "2018", "2019", "2020", "Total")

# Combine the summary vectors built above (seven yearly values followed by
# the total) into one table, with the row label as the first column.
df_gene_name_errors_years <- data.frame(
  years, pmc, res, pub_xlg, pub_xls, sus_pub,
  fp_xls, fp_pub, aff_xls_files, aff_pub, percentages
)

# Human-readable headers; the label column gets a blank header.
colnames(df_gene_name_errors_years) <- c(
  " ",
  "PMC ID",
  "Excel files screened",
  "Excel files with gene lists",
  "Publications with Excel gene lists",
  "Publications with suspected gene name errors",
  "False positive XLS files",
  "False positive publications",
  "Affected XLS files",
  "Affected publications",
  "Proportion of publications affected"
)

# Render the table. FALSE is spelled out because F is an ordinary variable
# that can be reassigned.
df_gene_name_errors_years %>%
  kbl() %>%
  kable_paper("hover", full_width = FALSE)
PMC ID Excel files screened Excel files with gene lists Publications with Excel gene lists Publications with suspected gene name errors False positive XLS files False positive publications Affected XLS files Affected publications Proportion of publications affected
2014 19976 2948 2286 936 284 8 2 429 282 30.1
2015 21204 4318 3037 1491 490 0 0 701 490 32.9
2016 22261 4472 3331 1579 477 7 6 653 471 29.8
2017 23976 4355 3012 1412 443 5 3 648 440 31.2
2018 24986 4824 3566 1653 475 15 11 703 464 28.1
2019 26046 5481 3942 1823 594 4 3 914 591 32.4
2020 27690 6443 4496 2223 707 11 9 1038 698 31.4
Total 166139 32841 23670 11117 3470 50 34 5086 3436 30.9

Plots for figure 1 of the manuscript

Plots are created for the figure 1 in the manuscript which composed of results for prevalence of gene name errors in the period 2014-2020.

Accordingly: A. barplot of publications with Excel gene lists; B. barplot of affected publications; C. line diagram of the proportion of publications affected.

# Yearly rows of Table 3 only: the trailing "Total" row is dropped.
df <- df_gene_name_errors_years
df <- df[seq_len(nrow(df) - 1), ]

# Draw the three stacked panels of figure 1 on the current graphics device.
# `yearly` is the Table 3 data frame without its total row; columns are
# addressed by position: 1 = year label, 5 = publications with Excel gene
# lists, 10 = affected publications, 11 = proportion affected (%).
draw_fig1_panels <- function(yearly) {
  par(mfrow = c(3, 1))

  # A: barplot for Publications with Excel gene lists
  barplot(yearly[, 5], names.arg = yearly[, 1],
    ylab = "Publications with Excel gene lists", ylim = c(0, 2500))
  grid(nx = 0, ny = 5, col = "gray")

  # B: barplot for Affected publications
  barplot(yearly[, 10], names.arg = yearly[, 1],
    ylab = "Affected publications", ylim = c(0, 800))
  grid(nx = 0, ny = 8, col = "gray")

  # C: line diagram for Proportion of publications affected.
  # The year labels are converted to numbers explicitly so the x axis is
  # numeric whether the label column is stored as character or as factor.
  plot(x = as.numeric(as.character(yearly[, 1])), y = yearly[, 11],
    ylab = "Proportion of publications affected (%)", xlab = "",
    ylim = c(0, 35), type = "b", pch = 19, bty = "n")
  grid()
}

# Write the figure to fig1.pdf ...
pdf("fig1.pdf", width = 4, height = 7)
draw_fig1_panels(df)
dev.off()
## png 
##   2

# ... and draw it again on the active device so it appears in the report.
draw_fig1_panels(df)

Gene name errors for different organisms

From the results obtained on the publications and excel spread sheets with gene name errors, a reanalysation was performed to identify which organism composed of most number of errors in their name.

Each publication and Excel sheet was screened for gene name errors. Accordingly, the species A. thaliana, C. elegans, D. melanogaster, D. rerio, G. gallus, H. sapiens, M. musculus, O. sativa, R. norvegicus and S. cerevisiae were selected, and the publications and Excel sheets with gene name errors from 2014 to 2020 were counted for each of them. The results were saved under a separate name for each species.

Proportion of article affected for each individual species was calculated and resulted percentage was saved to 3 significant figures.

# Pool the result lines of all years and keep those with more than two
# fields (the gene list entries); field 1 is stored as `pmid` and field 3
# as `org`, the species code.
res_all_years <- c(res2014, res2015, res2016, res2017, res2018, res2019,
  res2020)
res_all_years_l <- strsplit(res_all_years, " ")
res_all_years_lx <- res_all_years_l[lengths(res_all_years_l) > 2]
pmid <- vapply(res_all_years_lx, function(f) f[[1]], character(1))
org <- vapply(res_all_years_lx, function(f) f[[3]], character(1))
org_df <- data.frame(pmid, org)

# Distinct publications with an Excel gene list for one species code.
count_species_pubs <- function(species) {
  length(unique(org_df$pmid[org_df$org == species]))
}

# Distinct confirmed-affected publications for one species code (column V2
# of aff_xls). which() drops rows where V2 is missing, so they cannot add a
# spurious NA publication to the count.
count_species_affected <- function(species) {
  length(unique(aff_xls$V1[which(aff_xls$V2 == species)]))
}

# Percentage affected, rounded to three significant figures.
species_percent <- function(affected, with_gene_list) {
  signif(affected / with_gene_list * 100, 3)
}

ath_pub <- count_species_pubs("Athaliana")
ath_aff <- count_species_affected("Athaliana")
ath_pc <- species_percent(ath_aff, ath_pub)

cel_pub <- count_species_pubs("Celegans")
cel_aff <- count_species_affected("Celegans")
cel_pc <- species_percent(cel_aff, cel_pub)

dme_pub <- count_species_pubs("Dmelanogaster")
dme_aff <- count_species_affected("Dmelanogaster")
dme_pc <- species_percent(dme_aff, dme_pub)

dre_pub <- count_species_pubs("Drerio")
dre_aff <- count_species_affected("Drerio")
dre_pc <- species_percent(dre_aff, dre_pub)

gga_pub <- count_species_pubs("Ggallus")
gga_aff <- count_species_affected("Ggallus")
gga_pc <- species_percent(gga_aff, gga_pub)

hsa_pub <- count_species_pubs("Hsapiens")
hsa_aff <- count_species_affected("Hsapiens")
hsa_pc <- species_percent(hsa_aff, hsa_pub)

mmu_pub <- count_species_pubs("Mmusculus")
mmu_aff <- count_species_affected("Mmusculus")
mmu_pc <- species_percent(mmu_aff, mmu_pub)

osa_pub <- count_species_pubs("Osativa")
osa_aff <- count_species_affected("Osativa")
osa_pc <- species_percent(osa_aff, osa_pub)

rno_pub <- count_species_pubs("Rnorvegicus")
rno_aff <- count_species_affected("Rnorvegicus")
rno_pc <- species_percent(rno_aff, rno_pub)

sce_pub <- count_species_pubs("Scerevisiae")
sce_aff <- count_species_affected("Scerevisiae")
sce_pc <- species_percent(sce_aff, sce_pub)

Table 4 of the manuscript

A data frame was created consisting of the species, the number of publications with Excel gene lists, the number of affected publications and the proportion of articles affected. Table 4 of the manuscript was generated from it.

# Display names, in the same order as the per-species values below.
Species <- c("A.thaliana", "C.elegans", "D.melanogaster", "D.rerio",
  "G.gallus", "H.sapiens", "M.musculus", "O.sativa", "R.norvegicus",
  "S.cerevisiae")

# Publications with Excel gene lists, affected publications and percentage
# affected for each species.
pub <- c(ath_pub, cel_pub, dme_pub, dre_pub, gga_pub,
  hsa_pub, mmu_pub, osa_pub, rno_pub, sce_pub)
aff <- c(ath_aff, cel_aff, dme_aff, dre_aff, gga_aff,
  hsa_aff, mmu_aff, osa_aff, rno_aff, sce_aff)
prop <- c(ath_pc, cel_pc, dme_pc, dre_pc, gga_pc,
  hsa_pc, mmu_pc, osa_pc, rno_pc, sce_pc)

gene_name_errors_species <- data.frame(Species, pub, aff, prop)

# NOTE(review): "Propotion" in the last header is misspelled; it is left
# unchanged here in case the column is selected by name elsewhere.
colnames(gene_name_errors_species) <- c(
  "Species",
  "Publications with excel gene lists",
  "Affected publications",
  "Propotion of articles affected"
)

# Render Table 4. FALSE is spelled out because F is an ordinary variable
# that can be reassigned.
gene_name_errors_species %>%
  kbl() %>%
  kable_paper("hover", full_width = FALSE)
Species Publications with excel gene lists Affected publications Propotion of articles affected
A.thaliana 511 76 14.90
C.elegans 124 31 25.00
D.melanogaster 607 142 23.40
D.rerio 251 48 19.10
G.gallus 1827 172 9.41
H.sapiens 7936 2419 30.50
M.musculus 1577 609 38.60
O.sativa 10 0 0.00
R.norvegicus 327 68 20.80
S.cerevisiae 443 93 21.00

Number of affected publications in different journals

The journal of every article indexed in PubMed Central was extracted for each year from 2014 to 2020, and the combined result was saved as “jdf”. The PMC IDs of all articles with one or more supplementary Excel gene lists were collected in “res_p”. “jdf_subset” keeps only the rows of “jdf” whose PMC ID appears in “res_p”.

“journal_supplementary” counts the number of journals with gene name errors in their supplementary files. “jdf_t_df” holds the number of articles with Excel gene lists per journal, derived from “jdf_subset”. The number of affected articles per journal was calculated from “aff_xls”.

Table 5 in the manuscript was created from above gathered results.

# Journals indexed in PMC: one table per year, stacked into `jdf`.
# Column 1 is matched against PMC IDs below and column 2 is tabulated as
# the journal name.
j2014 <- read.table("pmc/genom/pmc_journal2014genom.out.txt")
j2015 <- read.table("pmc/genom/pmc_journal2015genom.out.txt")
j2016 <- read.table("pmc/genom/pmc_journal2016genom.out.txt")
j2017 <- read.table("pmc/genom/pmc_journal2017genom.out.txt")
j2018 <- read.table("pmc/genom/pmc_journal2018genom.out.txt")
j2019 <- read.table("pmc/genom/pmc_journal2019genom.out.txt")
j2020 <- read.table("pmc/genom/pmc_journal2020genom.out.txt")
jdf <- rbind(j2014, j2015, j2016, j2017, j2018, j2019, j2020)
length(unique(jdf[, 2]))
## [1] 4581

# PMC IDs of the articles with one or more supplementary Excel gene lists:
# result lines with more than two fields, first field, de-duplicated per year.
res2014_xlg <- res2014_l[lengths(res2014_l) > 2]
res2015_xlg <- res2015_l[lengths(res2015_l) > 2]
res2016_xlg <- res2016_l[lengths(res2016_l) > 2]
res2017_xlg <- res2017_l[lengths(res2017_l) > 2]
res2018_xlg <- res2018_l[lengths(res2018_l) > 2]
res2019_xlg <- res2019_l[lengths(res2019_l) > 2]
res2020_xlg <- res2020_l[lengths(res2020_l) > 2]

res2014_p <- unique(vapply(res2014_xlg, function(f) f[[1]], character(1)))
res2015_p <- unique(vapply(res2015_xlg, function(f) f[[1]], character(1)))
res2016_p <- unique(vapply(res2016_xlg, function(f) f[[1]], character(1)))
res2017_p <- unique(vapply(res2017_xlg, function(f) f[[1]], character(1)))
res2018_p <- unique(vapply(res2018_xlg, function(f) f[[1]], character(1)))
res2019_p <- unique(vapply(res2019_xlg, function(f) f[[1]], character(1)))
res2020_p <- unique(vapply(res2020_xlg, function(f) f[[1]], character(1)))

res_p <- unique(c(res2014_p, res2015_p, res2016_p, res2017_p,
  res2018_p, res2019_p, res2020_p))

# Journal rows restricted to articles that have an Excel gene list.
# %in% never returns NA, so the logical vector can index the rows directly.
jdf_subset <- jdf[jdf[, 1] %in% res_p, ]

# Number of journals with gene name errors in the supplementary files
journal_supplementary <- length(unique(aff_xls$V3))

# Number of articles with Excel gene lists per journal.
# `jdf` is deliberately overwritten with the de-duplicated subset because
# later chunks merge against this reduced table.
jdf <- unique(jdf_subset)
jdf_t <- table(jdf[, 2])
jdf_t_df <- as.data.frame(jdf_t)

# Number of affected articles per journal: one row per distinct
# (V1, V3) pair of the confirmed spreadsheets, tabulated by V3.
jres <- unique(aff_xls[, c(1, 3)])
j_table <- table(jres[, 2])
j_table_df <- as.data.frame(j_table)

# Table content for gene name errors by journal. The merge keeps only the
# journals that occur in both tables.
jj <- merge(jdf_t_df, j_table_df, by = "Var1")
colnames(jj) <- c(
  "journal",
  "Number of articles with Excel gene lists",
  "Number of affected articles"
)

jj$`Proportion of articles affected` <- signif(jj[, 3] / jj[, 2] * 100, 3)

# Keep journals with at least 50 articles with Excel gene lists and list
# them by decreasing number of affected articles.
jjf <- jj[which(jj[, 2] > 49), ]
jjf <- jjf[order(-jjf[, 3]), ]

# FALSE is spelled out because F is an ordinary variable that can be
# reassigned.
jjf %>%
  kbl() %>%
  kable_paper("hover", full_width = FALSE)
journal Number of articles with Excel gene lists Number of affected articles Proportion of articles affected
319 Nat_Commun 920 345 37.50
376 PLoS_One 946 244 25.80
394 Sci_Rep 767 227 29.60
49 BMC_Genomics 660 166 25.20
373 PLoS_Genet 448 134 29.90
352 Oncotarget 326 107 32.80
171 Front_Genet 313 94 30.00
140 eLife 243 89 36.60
380 Proc_Natl_Acad_Sci_U_S_A 155 73 47.10
95 Cell_Rep 158 71 44.90
200 Genome_Biol 193 66 34.20
330 Nature 118 52 44.10
321 Nat_Genet 140 48 34.30
202 Genome_Med 137 44 32.10
83 Cell 74 39 52.70
358 PeerJ 137 39 28.50
109 Clin_Epigenetics 109 38 34.90
345 Nucleic_Acids_Res 120 36 30.00
54 BMC_Med_Genomics 117 31 26.50
179 Front_Oncol 85 31 36.50
406 Transl_Psychiatry 73 29 39.70
42 BMC_Cancer 105 28 26.70
118 Commun_Biol 74 27 36.50
377 PLoS_Pathog 80 27 33.80
8 Aging_ 56 26 46.40
137 EBioMedicine 51 26 51.00
371 PLoS_Biol 66 26 39.40
149 Epigenetics_Chromatin 64 25 39.10
372 PLoS_Comput_Biol 97 24 24.70
348 Oncogene 53 22 41.50
233 iScience 58 20 34.50
392 Sci_Adv 56 20 35.70
40 BMC_Bioinformatics 77 19 24.70
187 G3_ 74 15 20.30
216 Hum_Mol_Genet 53 15 28.30
62 BMC_Plant_Biol 52 6 11.50
183 Front_Plant_Sci 75 5 6.67

Here we need to create an object containing PMCID, journal name and year for the 11117 pubs with Excel gene lists, so that we can calculate yearly proportions later.

# Turn a vector of PMC IDs into a two-column data frame: the IDs in a column
# named "pmc" and the given year label repeated in a column named "year".
tag_with_year <- function(pmc_ids, year_label) {
  tagged <- as.data.frame(pmc_ids)
  colnames(tagged) <- "pmc"
  tagged$year <- year_label
  tagged
}

res2014_p_df <- tag_with_year(res2014_p, "2014")
res2015_p_df <- tag_with_year(res2015_p, "2015")
res2016_p_df <- tag_with_year(res2016_p, "2016")
res2017_p_df <- tag_with_year(res2017_p, "2017")
res2018_p_df <- tag_with_year(res2018_p, "2018")
res2019_p_df <- tag_with_year(res2019_p, "2019")
res2020_p_df <- tag_with_year(res2020_p, "2020")

# Stack the yearly tables into one publication/year lookup.
pub_yr <- rbind(
  res2014_p_df, res2015_p_df, res2016_p_df, res2017_p_df,
  res2018_p_df, res2019_p_df, res2020_p_df
)

# Attach the year to every journal row by matching the first column of jdf
# (V1) to the PMC ID.
jdf2 <- merge(jdf, pub_yr, by.x = "V1", by.y = "pmc")

Top journals with more than 100 gene name errors articles from 2014 to 2020

Top six journals with most gene name errors were selected based on “Number of affected articles” in the Table 5 of the manuscript.

Accordingly, Nat Commun,PLoS One,Sci Rep,BMC Genomics,PLoS Genet,Oncotarget were considered as top 6 journals with most gene name errors. The number of publications published by each of these 6 journals in each year with gene name errors were counted and result was recorded.

# Year labels for the per-journal tables (no "Total" row here).
years <- c("2014", "2015", "2016", "2017", "2018", "2019", "2020")

# Distinct (V1, V3) pairs of the confirmed spreadsheets for each year,
# i.e. one row per affected publication and journal.
aj2014 <- unique(pub_2014[, c(1, 3)])
aj2015 <- unique(pub_2015[, c(1, 3)])
aj2016 <- unique(pub_2016[, c(1, 3)])
aj2017 <- unique(pub_2017[, c(1, 3)])
aj2018 <- unique(pub_2018[, c(1, 3)])
aj2019 <- unique(pub_2019[, c(1, 3)])
aj2020 <- unique(pub_2020[, c(1, 3)])

# Nature communication
# Affected articles per year. which() ignores rows with a missing journal.
nc_aff <- vapply(
  list(aj2014, aj2015, aj2016, aj2017, aj2018, aj2019, aj2020),
  function(aj) length(which(aj$V3 == "Nat_Commun")),
  integer(1)
)

# Articles with Excel gene lists per year (distinct rows of jdf2).
# USE.NAMES = FALSE keeps the vector unnamed so the table below gets the
# default row names.
nc_gl <- vapply(
  years,
  function(yr) {
    nrow(unique(jdf2[which(jdf2$V2 == "Nat_Commun" & jdf2$year == yr), ]))
  },
  integer(1),
  USE.NAMES = FALSE
)

nc_df <- data.frame(years, nc_aff, nc_gl, signif(nc_aff / nc_gl * 100, 3))
colnames(nc_df) <- c("year", "affected articles",
  "articles with excel gene lists", "proportion of articles affected (%)")

# FALSE is spelled out because F is an ordinary variable that can be
# reassigned.
nc_df %>%
  kbl() %>%
  kable_paper("hover", full_width = FALSE)
year affected articles articles with excel gene lists proportion of articles affected (%)
2014 11 33 33.3
2015 19 56 33.9
2016 29 85 34.1
2017 50 131 38.2
2018 67 175 38.3
2019 84 225 37.3
2020 85 215 39.5
# PLoS_One
# Affected articles per year, counted from the yearly tables of distinct
# publication/journal pairs. which() ignores rows with a missing journal.
po_aff <- vapply(
  list(aj2014, aj2015, aj2016, aj2017, aj2018, aj2019, aj2020),
  function(aj) length(which(aj$V3 == "PLoS_One")),
  integer(1)
)

# Articles with Excel gene lists per year (distinct rows of jdf2).
# USE.NAMES = FALSE keeps the vector unnamed so the table below gets the
# default row names.
po_gl <- vapply(
  years,
  function(yr) {
    nrow(unique(jdf2[which(jdf2$V2 == "PLoS_One" & jdf2$year == yr), ]))
  },
  integer(1),
  USE.NAMES = FALSE
)

po_df <- data.frame(years, po_aff, po_gl, signif(po_aff / po_gl * 100, 3))
colnames(po_df) <- c("year", "affected articles",
  "articles with excel gene lists", "proportion of articles affected (%)")

# FALSE is spelled out because F is an ordinary variable that can be
# reassigned.
po_df %>%
  kbl() %>%
  kable_paper("hover", full_width = FALSE)
year affected articles articles with excel gene lists proportion of articles affected (%)
2014 58 222 26.1
2015 57 195 29.2
2016 34 150 22.7
2017 37 135 27.4
2018 21 106 19.8
2019 20 69 29.0
2020 17 69 24.6
# Sci_Rep
# Affected articles per year, counted from the yearly tables of distinct
# publication/journal pairs. which() ignores rows with a missing journal.
sr_aff <- vapply(
  list(aj2014, aj2015, aj2016, aj2017, aj2018, aj2019, aj2020),
  function(aj) length(which(aj$V3 == "Sci_Rep")),
  integer(1)
)

# Articles with Excel gene lists per year (distinct rows of jdf2).
# USE.NAMES = FALSE keeps the vector unnamed so the table below gets the
# default row names.
sr_gl <- vapply(
  years,
  function(yr) {
    nrow(unique(jdf2[which(jdf2$V2 == "Sci_Rep" & jdf2$year == yr), ]))
  },
  integer(1),
  USE.NAMES = FALSE
)

sr_df <- data.frame(years, sr_aff, sr_gl, signif(sr_aff / sr_gl * 100, 3))
colnames(sr_df) <- c("year", "affected articles",
  "articles with excel gene lists", "proportion of articles affected (%)")

# FALSE is spelled out because F is an ordinary variable that can be
# reassigned.
sr_df %>%
  kbl() %>%
  kable_paper("hover", full_width = FALSE)
year affected articles articles with excel gene lists proportion of articles affected (%)
2014 3 9 33.3
2015 7 41 17.1
2016 37 144 25.7
2017 57 157 36.3
2018 43 141 30.5
2019 47 133 35.3
2020 33 142 23.2
# BMC_Genomics
# Affected articles per year, counted from the yearly tables of distinct
# publication/journal pairs. which() ignores rows with a missing journal.
bg_aff <- vapply(
  list(aj2014, aj2015, aj2016, aj2017, aj2018, aj2019, aj2020),
  function(aj) length(which(aj$V3 == "BMC_Genomics")),
  integer(1)
)

# Articles with Excel gene lists per year (distinct rows of jdf2).
# USE.NAMES = FALSE keeps the vector unnamed so the table below gets the
# default row names.
bg_gl <- vapply(
  years,
  function(yr) {
    nrow(unique(jdf2[which(jdf2$V2 == "BMC_Genomics" & jdf2$year == yr), ]))
  },
  integer(1),
  USE.NAMES = FALSE
)

bg_df <- data.frame(years, bg_aff, bg_gl, signif(bg_aff / bg_gl * 100, 3))
colnames(bg_df) <- c("year", "affected articles",
  "articles with excel gene lists", "proportion of articles affected (%)")

# FALSE is spelled out because F is an ordinary variable that can be
# reassigned.
bg_df %>%
  kbl() %>%
  kable_paper("hover", full_width = FALSE)
year affected articles articles with excel gene lists proportion of articles affected (%)
2014 25 98 25.5
2015 28 116 24.1
2016 26 97 26.8
2017 26 86 30.2
2018 20 102 19.6
2019 21 92 22.8
2020 20 69 29.0
# PLoS_Genet
# Affected articles per year: number of rows in each yearly table whose V3 is
# "PLoS_Genet" (tables are visited in order 2014 -> 2020).
pg_aff <- vapply(
  list(aj2014, aj2015, aj2016, aj2017, aj2018, aj2019, aj2020),
  function(aj) nrow(aj[aj$V3 == "PLoS_Genet", ]),
  integer(1)
)

# Articles with Excel gene lists per year: distinct rows of jdf2 for this
# journal and the given year.
pg_gl <- vapply(
  as.character(2014:2020),
  function(target_year) {
    nrow(unique(subset(jdf2, V2 == "PLoS_Genet" & year == target_year)))
  },
  integer(1),
  USE.NAMES = FALSE
)

# Combine both counts with the percentage affected (3 significant figures).
pg_df <- setNames(
  data.frame(years, pg_aff, pg_gl, signif(pg_aff / pg_gl * 100, 3)),
  c("year", "affected articles", "articles with excel gene lists",
    "proportion of articles affected (%)")
)
kable_paper(kbl(pg_df), "hover", full_width = FALSE)
year affected articles articles with excel gene lists proportion of articles affected (%)
2014 23 72 31.9
2015 19 68 27.9
2016 19 67 28.4
2017 18 64 28.1
2018 16 56 28.6
2019 16 60 26.7
2020 23 61 37.7
# Oncotarget
# Affected articles per year: number of rows in each yearly table whose V3 is
# "Oncotarget" (tables are visited in order 2014 -> 2020).
ot_aff <- vapply(
  list(aj2014, aj2015, aj2016, aj2017, aj2018, aj2019, aj2020),
  function(aj) nrow(aj[aj$V3 == "Oncotarget", ]),
  integer(1)
)

# Articles with Excel gene lists per year: distinct rows of jdf2 for this
# journal and the given year.
ot_gl <- vapply(
  as.character(2014:2020),
  function(target_year) {
    nrow(unique(subset(jdf2, V2 == "Oncotarget" & year == target_year)))
  },
  integer(1),
  USE.NAMES = FALSE
)

# Combine both counts with the percentage affected (3 significant figures).
ot_df <- setNames(
  data.frame(years, ot_aff, ot_gl, signif(ot_aff / ot_gl * 100, 3)),
  c("year", "affected articles", "articles with excel gene lists",
    "proportion of articles affected (%)")
)
kable_paper(kbl(ot_df), "hover", full_width = FALSE)
year affected articles articles with excel gene lists proportion of articles affected (%)
2014 5 18 27.8
2015 10 47 21.3
2016 41 106 38.7
2017 35 107 32.7
2018 11 35 31.4
2019 3 8 37.5
2020 2 5 40.0

Plots for figure 2 of the manuscript

Temporal trends for the six journals with the most gene name errors from 2014 to 2020 were examined, and Figure 2 of the manuscript was reproduced.

years <- as.character(2014:2020)

# Stack three panels per page.
par(mfrow = c(3, 1))

# Draw the three panels for one journal summary table:
#   1. bar chart of column 3 (publications with Excel gene lists), titled
#      with the journal name,
#   2. bar chart of column 2 (affected publications),
#   3. point-and-line plot of column 2 / column 3 as a percentage against
#      column 1.
# Bars are labelled with the global `years` vector.
draw_journal_panels <- function(journal_df, title) {
  barplot(journal_df[, 3], ylab = "Publications with Excel gene lists",
    xlab = "", names.arg = years, main = title)
  grid()
  barplot(journal_df[, 2], ylab = "Affected publications", xlab = "",
    names.arg = years)
  grid()
  plot(x = journal_df[, 1], y = journal_df[, 2] / journal_df[, 3] * 100,
    ylab = "Proportion of publications affected (%)", xlab = "",
    type = "b", pch = 19, bty = "n")
  grid()
  invisible(NULL)
}

## Nature communication
draw_journal_panels(nc_df, "Nature Communications")

## PLOS ONE
draw_journal_panels(po_df, "PLOS ONE")

## Scientific Reports
draw_journal_panels(sr_df, "Scientific Reports")

## BMC_Genomics
draw_journal_panels(bg_df, "BMC Genomics")

## PLoS Genetics
draw_journal_panels(pg_df, "PLoS Genetics")

## Oncotarget
draw_journal_panels(ot_df, "Oncotarget")

## now make the pdf
pdf("fig3.pdf", width = 4, height = 7)
par(mfrow = c(3, 1))

# Journal summary tables keyed by the title printed above each page's first
# panel; the list order is the page order in the PDF.
pdf_panels <- list(
  "Nature Communications" = nc_df,
  "PLOS ONE" = po_df,
  "Scientific Reports" = sr_df,
  "BMC Genomics" = bg_df,
  "PLoS Genetics" = pg_df,
  "Oncotarget" = ot_df
)

for (journal_title in names(pdf_panels)) {
  journal_df <- pdf_panels[[journal_title]]

  # Panel 1: publications with Excel gene lists (column 3).
  barplot(journal_df[, 3], ylab = "Publications with Excel gene lists",
    xlab = "", names.arg = years, main = journal_title)
  grid()

  # Panel 2: affected publications (column 2).
  barplot(journal_df[, 2], ylab = "Affected publications", xlab = "",
    names.arg = years)
  grid()

  # Panel 3: percentage affected (column 2 / column 3) against column 1.
  plot(x = journal_df[, 1], y = journal_df[, 2] / journal_df[, 3] * 100,
    ylab = "Proportion of publications affected (%)", xlab = "",
    type = "b", pch = 19, bty = "n")
  grid()
}

# Close the PDF device so the file is flushed to disk.
dev.off()
## png 
##   2

Journals with gene name error articles, by year

For each year from 2014 to 2020, the journals with more than 10 gene name error publications are collected.

The top 10 journals by number of gene name error articles are then shown for each year from 2014 to 2020 using head(aj20**_j_dsc,10).

# Frequency of each value in the second column of every yearly table (the
# printed rankings below show these values are journal names).
aj2014_tab <- table(aj2014[[2]])
aj2015_tab <- table(aj2015[[2]])
aj2016_tab <- table(aj2016[[2]])
aj2017_tab <- table(aj2017[[2]])
aj2018_tab <- table(aj2018[[2]])
aj2019_tab <- table(aj2019[[2]])
aj2020_tab <- table(aj2020[[2]])

# 2020: convert the counts to a data frame (columns Var1, Freq).
aj2020_df <- as.data.frame(aj2020_tab)
# Journals with more than 10 affected articles. Note the ranking below is
# built from the full table, not from this filtered subset.
aj2020_j <- subset(aj2020_df, Freq > 10)
# Full table sorted by descending count; show the top 10.
aj2020_j_dsc <- aj2020_df[order(-aj2020_df$Freq), ]
head(aj2020_j_dsc, n = 10)
##                         Var1 Freq
## 123               Nat_Commun   85
## 60               Front_Genet   43
## 162                  Sci_Rep   33
## 66               Front_Oncol   23
## 152               PLoS_Genet   23
## 50                     eLife   22
## 15              BMC_Genomics   20
## 154                 PLoS_One   17
## 157 Proc_Natl_Acad_Sci_U_S_A   17
## 42               Commun_Biol   16
# 2019: convert the counts to a data frame (columns Var1, Freq).
aj2019_df <- as.data.frame(aj2019_tab)
# Journals with more than 10 affected articles. Note the ranking below is
# built from the full table, not from this filtered subset.
aj2019_j <- subset(aj2019_df, Freq > 10)
# Full table sorted by descending count; show the top 10.
aj2019_j_dsc <- aj2019_df[order(-aj2019_df$Freq), ]
head(aj2019_j_dsc, n = 10)
##                 Var1 Freq
## 129       Nat_Commun   84
## 157          Sci_Rep   47
## 69       Front_Genet   29
## 24      BMC_Genomics   21
## 152         PLoS_One   20
## 151       PLoS_Genet   16
## 41          Cell_Rep   15
## 54             eLife   13
## 145            PeerJ   13
## 44  Clin_Epigenetics   12
# 2018: convert the counts to a data frame (columns Var1, Freq).
aj2018_df <- as.data.frame(aj2018_tab)
# Journals with more than 10 affected articles. Note the ranking below is
# built from the full table, not from this filtered subset.
aj2018_j <- subset(aj2018_df, Freq > 10)
# Full table sorted by descending count; show the top 10.
aj2018_j_dsc <- aj2018_df[order(-aj2018_df$Freq), ]
head(aj2018_j_dsc, n = 10)
##             Var1 Freq
## 103   Nat_Commun   67
## 131      Sci_Rep   43
## 124     PLoS_One   21
## 20  BMC_Genomics   20
## 35      Cell_Rep   16
## 122   PLoS_Genet   16
## 46         eLife   14
## 55   Front_Genet   13
## 66   Genome_Biol   12
## 118   Oncotarget   11
# 2017: convert the counts to a data frame (columns Var1, Freq).
aj2017_df <- as.data.frame(aj2017_tab)
# Journals with more than 10 affected articles. Note the ranking below is
# built from the full table, not from this filtered subset.
aj2017_j <- subset(aj2017_df, Freq > 10)
# Full table sorted by descending count; show the top 10.
aj2017_j_dsc <- aj2017_df[order(-aj2017_df$Freq), ]
head(aj2017_j_dsc, n = 10)
##             Var1 Freq
## 102      Sci_Rep   57
## 80    Nat_Commun   50
## 98      PLoS_One   37
## 91    Oncotarget   35
## 12  BMC_Genomics   26
## 97    PLoS_Genet   18
## 86        Nature   13
## 35         eLife   11
## 55   Genome_Biol   11
## 24      Cell_Rep    7
# 2016: convert the counts to a data frame (columns Var1, Freq).
aj2016_df <- as.data.frame(aj2016_tab)
# Journals with more than 10 affected articles. Note the ranking below is
# built from the full table, not from this filtered subset.
aj2016_j <- subset(aj2016_df, Freq > 10)
# Full table sorted by descending count; show the top 10.
aj2016_j_dsc <- aj2016_df[order(-aj2016_df$Freq), ]
head(aj2016_j_dsc, n = 10)
##                         Var1 Freq
## 119               Oncotarget   41
## 137                  Sci_Rep   37
## 131                 PLoS_One   34
## 103               Nat_Commun   29
## 14              BMC_Genomics   26
## 128               PLoS_Genet   19
## 32                  Cell_Rep   15
## 48                     eLife   10
## 133 Proc_Natl_Acad_Sci_U_S_A   10
## 26                      Cell    9
# 2015: convert the counts to a data frame (columns Var1, Freq).
aj2015_df <- as.data.frame(aj2015_tab)
# Journals with more than 10 affected articles. Note the ranking below is
# built from the full table, not from this filtered subset.
aj2015_j <- subset(aj2015_df, Freq > 10)
# Full table sorted by descending count; show the top 10.
aj2015_j_dsc <- aj2015_df[order(-aj2015_df$Freq), ]
head(aj2015_j_dsc, n = 10)
##                         Var1 Freq
## 133                 PLoS_One   57
## 18              BMC_Genomics   28
## 136 Proc_Natl_Acad_Sci_U_S_A   23
## 115               Nat_Commun   19
## 132               PLoS_Genet   19
## 123        Nucleic_Acids_Res   18
## 31                      Cell   14
## 48                     eLife   11
## 76                Genome_Res   11
## 125               Oncotarget   10
# 2014: convert the counts to a data frame (columns Var1, Freq).
aj2014_df <- as.data.frame(aj2014_tab)
# Journals with more than 10 affected articles. Note the ranking below is
# built from the full table, not from this filtered subset.
aj2014_j <- subset(aj2014_df, Freq > 10)
# Full table sorted by descending count; show the top 10.
aj2014_j_dsc <- aj2014_df[order(-aj2014_df$Freq), ]
head(aj2014_j_dsc, n = 10)
##                  Var1 Freq
## 76           PLoS_One   58
## 8        BMC_Genomics   25
## 75         PLoS_Genet   23
## 39        Genome_Biol   12
## 62         Nat_Commun   11
## 68  Nucleic_Acids_Res    9
## 25              eLife    8
## 67             Nature    6
## 6  BMC_Bioinformatics    5
## 70         Oncotarget    5
# 2020 again: this recomputes the same objects as the 2020 ranking at the
# start of this section and prints the same top 10.
aj2020_df <- as.data.frame(aj2020_tab)
# Journals with more than 10 affected articles. Note the ranking below is
# built from the full table, not from this filtered subset.
aj2020_j <- subset(aj2020_df, Freq > 10)
# Full table sorted by descending count; show the top 10.
aj2020_j_dsc <- aj2020_df[order(-aj2020_df$Freq), ]
head(aj2020_j_dsc, n = 10)
##                         Var1 Freq
## 123               Nat_Commun   85
## 60               Front_Genet   43
## 162                  Sci_Rep   33
## 66               Front_Oncol   23
## 152               PLoS_Genet   23
## 50                     eLife   22
## 15              BMC_Genomics   20
## 154                 PLoS_One   17
## 157 Proc_Natl_Acad_Sci_U_S_A   17
## 42               Commun_Biol   16

Consistency between this and our previous analysis

Here we focus on PLoS One articles published in 2014 and 2015. Previously we found 60 affected studies; here we find 115. Of these, 56 are common to both analyses.

# Load the tab-separated table from the previous analysis and keep only the
# rows whose Confirmed column equals "Confirmed"; original row names are kept.
prev <- read.table("plosone_20142015.tsv", header = TRUE, sep = "\t")
prev <- prev[which(prev$Confirmed == "Confirmed"), ]
# Show the last rows as a sanity check.
tail(prev)
##    Journal YearPublished Confirmed     PMID      PMCID
## 73 PLosOne          2015 Confirmed 26444573 PMC4596691
## 74 PLosOne          2015 Confirmed 26510177 PMC4624949
## 75 PLosOne          2015 Confirmed 26529237 PMC4631338
## 76 PLosOne          2015 Confirmed 26636579 PMC4670106
## 77 PLosOne          2015 Confirmed 26695660 PMC4687867
## 78 PLosOne          2015 Confirmed 26684451 PMC4684321
# Distinct PMC identifiers from the previous analysis, and how many there are.
prev_pmc <- unique(prev[["PMCID"]])
length(prev_pmc)
## [1] 60
# Rows of aff_xls with V3 "PLoS_One" and V4 of 2014 or 2015, reduced to
# columns 1, 3 and 4.
new <- subset(
  aff_xls,
  V3 == "PLoS_One" & (V4 == "2014" | V4 == "2015"),
  select = c(1, 3, 4)
)
# Distinct values of V1 in that selection, and how many there are.
new_pmc <- unique(new$V1)
length(new_pmc)
## [1] 115
# Named list of the two identifier sets, drawn as an Euler diagram with the
# count shown in each region.
v1 <- setNames(list(prev_pmc, new_pmc), c("Prev", "New"))
plot(euler(v1), quantities = TRUE)

# PMCs specific to 2016 study
setdiff(x = prev_pmc, y = new_pmc)
## [1] "PMC4118979" "PMC4441472" "PMC4500563" "PMC4508115"
# PMCs specific to 2021 study (present in new_pmc but absent from prev_pmc)
setdiff(x = new_pmc, y = prev_pmc)
##  [1] "PMC3894996" "PMC3901708" "PMC3968145" "PMC3989189" "PMC3990548"
##  [6] "PMC3990644" "PMC3990668" "PMC3999108" "PMC4002427" "PMC4002480"
## [11] "PMC4011728" "PMC4012993" "PMC4029955" "PMC4039489" "PMC4041891"
## [16] "PMC4079602" "PMC4084626" "PMC4099127" "PMC4103770" "PMC4105622"
## [21] "PMC4108364" "PMC4109958" "PMC4128672" "PMC4141782" "PMC4156408"
## [26] "PMC4157799" "PMC4167545" "PMC4207689" "PMC4208810" "PMC4210245"
## [31] "PMC4230926" "PMC4252097" "PMC4262388" "PMC4338293" "PMC4364623"
## [36] "PMC4370594" "PMC4370704" "PMC4372331" "PMC4373911" "PMC4388690"
## [41] "PMC4416816" "PMC4427337" "PMC4441380" "PMC4456163" "PMC4472808"
## [46] "PMC4478024" "PMC4514889" "PMC4549312" "PMC4551741" "PMC4552880"
## [51] "PMC4556673" "PMC4580636" "PMC4583255" "PMC4618846" "PMC4641597"
## [56] "PMC4641603" "PMC4671692" "PMC4681367" "PMC4699828"

Journal impact factor correlation

Journal Citation Reports 2020 (Impact factor & Ranking of 2019). InCites Journal Citation Reports (Clarivate Analytics).

# Read the journal impact factor table; column names come from its first line.
jifs <- read.table("jifs.tsv",header=TRUE)
# Keep columns 1 and 4 of jjf -- presumably the journal name and the
# proportion of articles affected, which are the two columns used below
# (TODO confirm against where jjf is built).
proportions <- jjf[,c(1,4)]
# Join on journal name (`journal` in proportions, `Journal` in jifs). With
# merge()'s defaults only journals present in both tables are kept.
myjifs <- merge(proportions, jifs, by.x="journal",by.y="Journal")
# Pearson correlation between impact factor and proportion affected. The
# argument expressions are echoed verbatim in the "data:" line of the result.
cor.test(myjifs$JIF,myjifs$`Proportion of articles affected`)
## 
##  Pearson's product-moment correlation
## 
## data:  myjifs$JIF and myjifs$`Proportion of articles affected`
## t = 2.9932, df = 33, p-value = 0.005197
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1522930 0.6892045
## sample estimates:
##       cor 
## 0.4620881
# Same pair of variables with Spearman's rank correlation; method="s" is
# matched to "spearman" (see the title of the printed result).
cor.test(myjifs$JIF,myjifs$`Proportion of articles affected`,method="s")
## Warning in cor.test.default(myjifs$JIF, myjifs$`Proportion of articles
## affected`, : Cannot compute exact p-value with ties
## 
##  Spearman's rank correlation rho
## 
## data:  myjifs$JIF and myjifs$`Proportion of articles affected`
## S = 2932.1, p-value = 0.0001953
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.5893395
# Linear regression of proportion affected on impact factor. The fitted model
# is reused by abline() to draw the trend line on the scatter plots below.
mylm <- lm(myjifs$`Proportion of articles affected` ~ myjifs$JIF)
# Print coefficients, residual summary and fit statistics.
summary(mylm)
## 
## Call:
## lm(formula = myjifs$`Proportion of articles affected` ~ myjifs$JIF)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -24.2790  -4.5087  -0.6097   4.7067  19.4141 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  28.8471     1.9874  14.515 6.95e-16 ***
## myjifs$JIF    0.4775     0.1595   2.993   0.0052 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.702 on 33 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.2135, Adjusted R-squared:  0.1897 
## F-statistic: 8.959 on 1 and 33 DF,  p-value: 0.005197
# Scatter plot of proportion affected against impact factor, with the fitted
# regression line from `mylm` drawn on top. Draws on the current device.
draw_jif_scatter <- function() {
  plot(
    x = myjifs$JIF,
    y = myjifs$`Proportion of articles affected`,
    main = "Articles with supplementary Excel gene lists",
    xlab = "JIF",
    ylab = "Proportion of articles affected (%)"
  )
  abline(mylm)
  invisible(NULL)
}

# On-screen version.
draw_jif_scatter()

# Same figure written to fig2.pdf; dev.off() closes the file.
pdf("fig2.pdf")
draw_jif_scatter()
dev.off()
## png 
##   2

Session information

For reproducibility.

# Print the R version, platform, locale and attached/loaded package versions
# used for this run.
sessionInfo() 
## R version 4.1.0 (2021-05-18)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.2 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
## 
## locale:
##  [1] LC_CTYPE=en_AU.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_AU.UTF-8        LC_COLLATE=en_AU.UTF-8    
##  [5] LC_MONETARY=en_AU.UTF-8    LC_MESSAGES=en_AU.UTF-8   
##  [7] LC_PAPER=en_AU.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] Cairo_1.5-12.2   eulerr_6.1.0     kableExtra_1.3.4 dplyr_1.0.6     
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.6        highr_0.9         pillar_1.6.1      bslib_0.2.5.1    
##  [5] compiler_4.1.0    jquerylib_0.1.4   tools_4.1.0       digest_0.6.27    
##  [9] viridisLite_0.4.0 jsonlite_1.7.2    evaluate_0.14     lifecycle_1.0.0  
## [13] tibble_3.1.2      pkgconfig_2.0.3   rlang_0.4.11      DBI_1.1.1        
## [17] rstudioapi_0.13   yaml_2.2.1        xfun_0.23         stringr_1.4.0    
## [21] httr_1.4.2        knitr_1.33        xml2_1.3.2        systemfonts_1.0.2
## [25] generics_0.1.0    vctrs_0.3.8       sass_0.4.0        grid_4.1.0       
## [29] webshot_0.5.2     tidyselect_1.1.1  svglite_2.0.0     glue_1.4.2       
## [33] R6_2.5.0          fansi_0.5.0       rmarkdown_2.8     polyclip_1.10-0  
## [37] polylabelr_0.2.0  purrr_0.3.4       magrittr_2.0.1    scales_1.1.1     
## [41] ellipsis_0.3.2    htmltools_0.5.1.1 assertthat_0.2.1  rvest_1.0.0      
## [45] colorspace_2.0-1  utf8_1.2.1        stringi_1.6.2     munsell_0.5.0    
## [49] crayon_1.4.1