Gene Name Errors Screen

Source: http://118.138.234.73/public/gene_name_errors/manuscript_files/results_summary.html

library("dplyr")

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library("kableExtra")

## 
## Attaching package: 'kableExtra'

## The following object is masked from 'package:dplyr':
## 
##     group_rows

library("eulerr")
library("Cairo")
options(bitmapType="cairo")

Introduction

The bash script “gene_names.sh” was used to screen a list of PMC articles. Here we process and analyse the output of that script to generate the tables and figures needed for the paper.

Number of publications

The number of publications available in PubMed Central (PMC) for each year from 2014 to 2020. These publications were selected by searching PubMed Central with the keyword “genom”.

# Number of rows in one yearly PMC listing (the listing is read as a table).
count_pmc_rows <- function(path) {
  listing <- read.table(path)
  nrow(listing)
}

pmc2014 <- count_pmc_rows("pmc/genom/pmc2014.txt")
pmc2015 <- count_pmc_rows("pmc/genom/pmc2015.txt")
pmc2016 <- count_pmc_rows("pmc/genom/pmc2016.txt")
pmc2017 <- count_pmc_rows("pmc/genom/pmc2017.txt")
pmc2018 <- count_pmc_rows("pmc/genom/pmc2018.txt")
pmc2019 <- count_pmc_rows("pmc/genom/pmc2019.txt")
pmc2020 <- count_pmc_rows("pmc/genom/pmc2020.txt")

# Yearly counts in chronological order, with the grand total appended last.
pmc_by_year <- c(pmc2014, pmc2015, pmc2016, pmc2017, pmc2018, pmc2019, pmc2020)
total_pmc <- sum(pmc_by_year)
pmc <- c(pmc_by_year, total_pmc)

Number of supplementary files

The publications returned by the “genom” keyword search from 2014 to 2020 were shortlisted to those with supplementary files, and these supplementary files were downloaded.

Supplementary files with a “.xls” or “.xlsx” suffix were assumed to be supplementary Excel files.

# Number of distinct values found in the first space-separated field of the
# lines of one results file.
count_first_field <- function(lines) {
  first_field <- vapply(
    strsplit(lines, " "),
    function(tokens) tokens[[1]],
    character(1)
  )
  length(unique(first_field))
}

# The raw lines of every yearly results file are kept (res20XX) because later
# sections split them again.
res2014 <- readLines("results/genom/results2014.txt")
res2015 <- readLines("results/genom/results2015.txt")
res2016 <- readLines("results/genom/results2016.txt")
res2017 <- readLines("results/genom/results2017.txt")
res2018 <- readLines("results/genom/results2018.txt")
res2019 <- readLines("results/genom/results2019.txt")
res2020 <- readLines("results/genom/results2020.txt")

res2014_length <- count_first_field(res2014)
res2015_length <- count_first_field(res2015)
res2016_length <- count_first_field(res2016)
res2017_length <- count_first_field(res2017)
res2018_length <- count_first_field(res2018)
res2019_length <- count_first_field(res2019)
res2020_length <- count_first_field(res2020)

# Yearly counts in chronological order, with the grand total appended last.
res_by_year <- c(res2014_length, res2015_length, res2016_length,
                 res2017_length, res2018_length, res2019_length,
                 res2020_length)
total_res <- sum(res_by_year)
res <- c(res_by_year, total_res)

Excel files with gene lists

All the supplementary files downloaded from 2014 to 2020 were searched for gene lists by checking the number of space-separated fields in each file's line of the results.

If the line for a supplementary file has more than 2 fields, the file is taken to contain a gene list.

# Among the split result lines, keep those with more than two fields and count
# the distinct values of their second field (presumably the file name --
# confirm against the output format of gene_names.sh).
count_gene_list_files <- function(fields) {
  with_gene_list <- fields[lengths(fields) > 2]
  second_field <- vapply(with_gene_list, function(f) f[[2]], character(1))
  length(unique(second_field))
}

res2014_l <- strsplit(res2014, " ")
res2015_l <- strsplit(res2015, " ")
res2016_l <- strsplit(res2016, " ")
res2017_l <- strsplit(res2017, " ")
res2018_l <- strsplit(res2018, " ")
res2019_l <- strsplit(res2019, " ")
res2020_l <- strsplit(res2020, " ")

res2014_xlg <- count_gene_list_files(res2014_l)
res2015_xlg <- count_gene_list_files(res2015_l)
res2016_xlg <- count_gene_list_files(res2016_l)
res2017_xlg <- count_gene_list_files(res2017_l)
res2018_xlg <- count_gene_list_files(res2018_l)
res2019_xlg <- count_gene_list_files(res2019_l)
res2020_xlg <- count_gene_list_files(res2020_l)

# Yearly counts in chronological order, with the grand total appended last.
xlg_by_year <- c(res2014_xlg, res2015_xlg, res2016_xlg, res2017_xlg,
                 res2018_xlg, res2019_xlg, res2020_xlg)
total_pub_xlg <- sum(xlg_by_year)

pub_xlg <- c(xlg_by_year, total_pub_xlg)

Publications with Excel gene lists

The results from the excel files with gene lists were shortlisted in a manner where the related publication for each supplementary file with excel gene list was selected.

# Among the split result lines, keep those with more than two fields and count
# the distinct values of their first field (used as the publication ID in the
# later sections of this script).
count_gene_list_pubs <- function(fields) {
  with_gene_list <- fields[lengths(fields) > 2]
  first_field <- vapply(with_gene_list, function(f) f[[1]], character(1))
  length(unique(first_field))
}

res2014_l <- strsplit(res2014, " ")
res2015_l <- strsplit(res2015, " ")
res2016_l <- strsplit(res2016, " ")
res2017_l <- strsplit(res2017, " ")
res2018_l <- strsplit(res2018, " ")
res2019_l <- strsplit(res2019, " ")
res2020_l <- strsplit(res2020, " ")

res2014_lx <- count_gene_list_pubs(res2014_l)
res2015_lx <- count_gene_list_pubs(res2015_l)
res2016_lx <- count_gene_list_pubs(res2016_l)
res2017_lx <- count_gene_list_pubs(res2017_l)
res2018_lx <- count_gene_list_pubs(res2018_l)
res2019_lx <- count_gene_list_pubs(res2019_l)
res2020_lx <- count_gene_list_pubs(res2020_l)

# Yearly counts in chronological order, with the grand total appended last.
lx_by_year <- c(res2014_lx, res2015_lx, res2016_lx, res2017_lx,
                res2018_lx, res2019_lx, res2020_lx)
total_pub_xls <- sum(lx_by_year)
pub_xls <- c(lx_by_year, total_pub_xls)

Publications with suspected gene name errors

The resulted publications with excel gene list were run in a shell scripted software where each selected publications with excel gene list were screened for their gene symbols.

Erroneous conversions, such as date formats, scientific numbers and five-digit numbers were chosen from gene lists as gene name errors and the results were recorded and saved as “aggregated_res”.

This aggregated_res file composed of PMCID, Species, Journal name, year and the link to the affected file.

Total suspected publications with gene name errors were detected from aggregated_res file and aggregated_res file was analysed in a manner where publications with gene name errors for each year were also extracted.

# Read the aggregated screen output as raw text, one vector element per line.
aggregated_res <- readLines("results/genom/aggregated_res.txt")
# Print the first few lines as a quick check of the record layout.
head(aggregated_res)

## [1] "PMC3879165 PLoS_Genet results2014.txt /pmc/articles/PMC3879165/bin/pgen.1004006.s005.xlsx"
## [2] "PMC3886906 PLoS_Genet results2014.txt /pmc/articles/PMC3886906/bin/pgen.1004079.s012.xlsx"
## [3] "PMC3894565 Sci_Rep results2014.txt /pmc/articles/PMC3894565/bin/srep03692-s1.xls"         
## [4] "PMC3894565 Sci_Rep results2014.txt /pmc/articles/PMC3894565/bin/srep03692-s2.xls"         
## [5] "PMC3894996 PLoS_One results2014.txt /pmc/articles/PMC3894996/bin/pone.0085599.s003.xlsx"  
## [6] "PMC3897657 PLoS_One results2014.txt /pmc/articles/PMC3897657/bin/pone.0086220.s002.xlsx"

# Count the distinct values of the first space-separated field (the PMC ID in
# the lines previewed above) over all aggregated result lines.
agg_first_field <- vapply(
  strsplit(aggregated_res, " "),
  function(f) f[[1]],
  character(1)
)
total_sus_pub <- length(unique(agg_first_field))
total_sus_pub

## [1] 3470

# Split every aggregated line once, then pull out fields 1, 3 and 4. In the
# lines previewed above these are the PMC ID, the yearly results file name and
# the path of the flagged spreadsheet.
agg_fields <- strsplit(aggregated_res, " ")
sus_pub <- sapply(agg_fields, "[[", 1)
sus_pub_year <- sapply(agg_fields, "[[", 3)
sus_pub_file <- sapply(agg_fields, "[[", 4)

# One row per aggregated line; the column names come from the variable names.
suspected_df <- data.frame(sus_pub, sus_pub_year, sus_pub_file)

# Rows of suspected_df that originate from one yearly results file.
suspects_from <- function(results_file) {
  suspected_df[suspected_df$sus_pub_year == results_file, ]
}

# Number of distinct publications among a set of suspected rows.
count_suspected_pubs <- function(rows) {
  length(unique(rows$sus_pub))
}

sus_pub_2014_df <- suspects_from("results2014.txt")
sus_pub_2015_df <- suspects_from("results2015.txt")
sus_pub_2016_df <- suspects_from("results2016.txt")
sus_pub_2017_df <- suspects_from("results2017.txt")
sus_pub_2018_df <- suspects_from("results2018.txt")
sus_pub_2019_df <- suspects_from("results2019.txt")
sus_pub_2020_df <- suspects_from("results2020.txt")

sus_pub_2014 <- count_suspected_pubs(sus_pub_2014_df)
sus_pub_2015 <- count_suspected_pubs(sus_pub_2015_df)
sus_pub_2016 <- count_suspected_pubs(sus_pub_2016_df)
sus_pub_2017 <- count_suspected_pubs(sus_pub_2017_df)
sus_pub_2018 <- count_suspected_pubs(sus_pub_2018_df)
sus_pub_2019 <- count_suspected_pubs(sus_pub_2019_df)
sus_pub_2020 <- count_suspected_pubs(sus_pub_2020_df)

# From here on sus_pub holds the yearly counts plus the overall total (it no
# longer holds the per-line IDs used to build suspected_df).
sus_pub <- c(sus_pub_2014, sus_pub_2015, sus_pub_2016, sus_pub_2017,
             sus_pub_2018, sus_pub_2019, sus_pub_2020, total_sus_pub)

Affected XLS files

The aggregated_res file was analysed for the total number of spreadsheet files that contained gene name errors, and the final result was saved as “affected_xl”.

# Every aggregated line contributes one first-field entry, so the length of
# that vector is the number of flagged spreadsheet records.
flagged_first_field <- vapply(
  strsplit(aggregated_res, " "),
  function(f) f[[1]],
  character(1)
)
affected_xl <- length(flagged_first_field)
affected_xl

## [1] 5136

Affected files

In order to reconfirm the results obtained each spreadsheet in aggregated_res file was manually opened to validate accuracy of the data and final result was saved in “true_positive.txt” file.

The “true_positive.txt” file was read as “aff_xls”. The total number of true positive Excel files with gene name errors was obtained from “true_positive.txt” and the result was saved as “total_aff_xls”.

“aff_xls” was then reanalysed by year (2014 to 2020) to obtain the number of true positive Excel files for each year.

# Load the manually validated (true positive) records as a table.
aff_xls <- read.table("results/genom/true_positive.txt")
# Each row is counted as one affected file.
total_aff_xls <- nrow(aff_xls)
total_aff_xls

## [1] 5086

# Number of true-positive rows whose V4 column (compared against the year)
# equals the given year.
count_aff_files <- function(yr) {
  rows_for_year <- aff_xls[aff_xls$V4 == yr, ]
  length(rows_for_year$V1)
}

aff_xls_2014 <- count_aff_files(2014)
aff_xls_2015 <- count_aff_files(2015)
aff_xls_2016 <- count_aff_files(2016)
aff_xls_2017 <- count_aff_files(2017)
aff_xls_2018 <- count_aff_files(2018)
aff_xls_2019 <- count_aff_files(2019)
aff_xls_2020 <- count_aff_files(2020)

# Yearly counts in chronological order, followed by the overall total.
aff_xls_files <- c(aff_xls_2014, aff_xls_2015, aff_xls_2016, aff_xls_2017,
                   aff_xls_2018, aff_xls_2019, aff_xls_2020, total_aff_xls)

Affected publications

“aff_xls” was reanalysed for the total number of true positive publications and the result was saved as “total_aff_pub”. “aff_xls” was also reanalysed by year (2014 to 2020) to obtain the number of true positive publications for each year.

# Re-read the true positive records (same file as in the previous section).
aff_xls <- read.table("results/genom/true_positive.txt")
# Distinct values of column V1 are counted as affected publications.
total_aff_pub <- length(unique(aff_xls$V1))
total_aff_pub

## [1] 3436

# True-positive rows for one year (V4 is compared against the year).
aff_rows_in <- function(yr) {
  aff_xls[aff_xls$V4 == yr, ]
}

# The per-year row sets are kept (pub_20XX) because the journal analysis
# further down reuses them.
pub_2014 <- aff_rows_in(2014)
pub_2015 <- aff_rows_in(2015)
pub_2016 <- aff_rows_in(2016)
pub_2017 <- aff_rows_in(2017)
pub_2018 <- aff_rows_in(2018)
pub_2019 <- aff_rows_in(2019)
pub_2020 <- aff_rows_in(2020)

# Distinct V1 values per year are counted as affected publications.
aff_pub_2014 <- length(unique(pub_2014$V1))
aff_pub_2015 <- length(unique(pub_2015$V1))
aff_pub_2016 <- length(unique(pub_2016$V1))
aff_pub_2017 <- length(unique(pub_2017$V1))
aff_pub_2018 <- length(unique(pub_2018$V1))
aff_pub_2019 <- length(unique(pub_2019$V1))
aff_pub_2020 <- length(unique(pub_2020$V1))

# Yearly counts in chronological order, followed by the overall total.
aff_pub <- c(aff_pub_2014, aff_pub_2015, aff_pub_2016, aff_pub_2017,
             aff_pub_2018, aff_pub_2019, aff_pub_2020, total_aff_pub)

False positive XLS files

In order to reconfirm the results obtained each spreadsheet in aggregated_res file was manually opened to validate accuracy of the data and final result was updated in the “aggregated_res” file. In the manual search of aggregated_res for validating accuracy of data in the file, several false positive excel files were recorded and these false positive results were saved in a text file named “fp.txt”

The “false_positive.txt” file was read as “fp”. The total number of false positive results was obtained from it and saved as “total_fp_xls”. “fp” was then reanalysed by year (2014 to 2020) to obtain the number of false positive Excel files for each year.

# Load the records that manual checking classified as false positives.
fp <- read.table("results/genom/false_positive.txt")
# One entry of column V1 per row, so this is the number of false positive files.
total_fp_xls <- length(fp$V1)
total_fp_xls

## [1] 50

# False positive rows for one year (V4 is compared against the year).
fp_rows_in <- function(yr) {
  fp[fp$V4 == yr, ]
}

fp_2014 <- fp_rows_in(2014)
fp_2015 <- fp_rows_in(2015)
fp_2016 <- fp_rows_in(2016)
fp_2017 <- fp_rows_in(2017)
fp_2018 <- fp_rows_in(2018)
fp_2019 <- fp_rows_in(2019)
fp_2020 <- fp_rows_in(2020)

# Number of false positive files per year: one V1 entry per row.
fp_2014_xls <- length(fp_2014$V1)
fp_2015_xls <- length(fp_2015$V1)
fp_2016_xls <- length(fp_2016$V1)
fp_2017_xls <- length(fp_2017$V1)
fp_2018_xls <- length(fp_2018$V1)
fp_2019_xls <- length(fp_2019$V1)
fp_2020_xls <- length(fp_2020$V1)

# Yearly counts in chronological order, followed by the overall total.
fp_xls <- c(fp_2014_xls, fp_2015_xls, fp_2016_xls, fp_2017_xls,
            fp_2018_xls, fp_2019_xls, fp_2020_xls, total_fp_xls)

False positive publications

False positive publications = suspected pubs - affected pubs

# False positive publications per year = suspected minus affected, computed
# for all seven years at once.
suspected_by_year <- c(sus_pub_2014, sus_pub_2015, sus_pub_2016, sus_pub_2017,
                       sus_pub_2018, sus_pub_2019, sus_pub_2020)
affected_by_year <- c(aff_pub_2014, aff_pub_2015, aff_pub_2016, aff_pub_2017,
                      aff_pub_2018, aff_pub_2019, aff_pub_2020)
fp_pub_by_year <- suspected_by_year - affected_by_year

# Individual yearly values, kept under their original names.
fp_2014_pub <- fp_pub_by_year[1]
fp_2015_pub <- fp_pub_by_year[2]
fp_2016_pub <- fp_pub_by_year[3]
fp_2017_pub <- fp_pub_by_year[4]
fp_2018_pub <- fp_pub_by_year[5]
fp_2019_pub <- fp_pub_by_year[6]
fp_2020_pub <- fp_pub_by_year[7]

total_fp_pub <- sum(fp_pub_by_year)

# Yearly values in chronological order, followed by the overall total.
fp_pub <- c(fp_pub_by_year, total_fp_pub)

Proportion of Excel gene lists articles containing errors

“total_aff_pub” was divided by “total_pub_xls” and multiplied by 100 to find the proportion (%) of articles with Excel gene lists that contain gene name errors. The result was rounded to 3 significant figures and saved as “Total_percent_of_errors”. The figures below are percentages.

# Overall share (%) of publications with Excel gene lists that are affected,
# rounded to 3 significant figures.
Total_percent_of_errors <- signif(total_aff_pub/total_pub_xls*100,3)
Total_percent_of_errors

## [1] 30.9

# Share (%) of publications with Excel gene lists that are affected, rounded
# to 3 significant figures.
percent_affected <- function(affected, with_gene_list) {
  signif(affected / with_gene_list * 100, 3)
}

percentage_2014 <- percent_affected(aff_pub_2014, res2014_lx)
percentage_2015 <- percent_affected(aff_pub_2015, res2015_lx)
percentage_2016 <- percent_affected(aff_pub_2016, res2016_lx)
percentage_2017 <- percent_affected(aff_pub_2017, res2017_lx)
percentage_2018 <- percent_affected(aff_pub_2018, res2018_lx)
percentage_2019 <- percent_affected(aff_pub_2019, res2019_lx)
percentage_2020 <- percent_affected(aff_pub_2020, res2020_lx)

# Yearly percentages in chronological order, followed by the overall figure.
percentages <- c(percentage_2014, percentage_2015, percentage_2016,
                 percentage_2017, percentage_2018, percentage_2019,
                 percentage_2020, Total_percent_of_errors)

Table 3 of the manuscript

Table 3. Results of a screen for gene name errors in PubMed Central.

# Row labels for Table 3: the seven years plus a final "Total" row.
years <- c("2014","2015","2016","2017","2018","2019","2020","Total")

# Assemble the summary vectors (each has 8 entries: 7 years + total) and put
# the row labels in front as the first column.
table3_counts <- data.frame(pmc, res, pub_xlg, pub_xls, sus_pub,
                            fp_xls, fp_pub, aff_xls_files, aff_pub,
                            percentages)
df_gene_name_errors_years <- cbind(years, table3_counts)

# Human-readable headers; the label column gets a blank header.
table3_headers <- c(" ",
  "PMC ID",
  "Excel files screened",
  "Excel files with gene lists",
  "Publications with Excel gene lists",
  "Publications with suspected gene name errors",
  "False positive XLS files",
  "False positive publications",
  "Affected XLS files",
  "Affected publications",
  "Proportion of publications affected")
colnames(df_gene_name_errors_years) <- table3_headers

# Render the table.
kable_paper(kbl(df_gene_name_errors_years), "hover", full_width = FALSE)

	PMC ID	Excel files screened	Excel files with gene lists	Publications with Excel gene lists	Publications with suspected gene name errors	False positive XLS files	False positive publications	Affected XLS files	Affected publications	Proportion of publications affected
2014	19976	2948	2286	936	284	8	2	429	282	30.1
2015	21204	4318	3037	1491	490	0	0	701	490	32.9
2016	22261	4472	3331	1579	477	7	6	653	471	29.8
2017	23976	4355	3012	1412	443	5	3	648	440	31.2
2018	24986	4824	3566	1653	475	15	11	703	464	28.1
2019	26046	5481	3942	1823	594	4	3	914	591	32.4
2020	27690	6443	4496	2223	707	11	9	1038	698	31.4
Total	166139	32841	23670	11117	3470	50	34	5086	3436	30.9

Plots for figure 1 of the manuscript

Plots are created for the figure 1 in the manuscript which composed of results for prevalence of gene name errors in the period 2014-2020.

Accordingly: A. bar plot of publications with Excel gene lists; B. bar plot of affected publications; and C. line diagram of the proportion of publications affected.

# Figure 1 is written to a PDF with three panels stacked vertically.
pdf("fig1.pdf", width = 4, height = 7)
par(mfrow = c(3, 1))

# Work on the yearly rows only: the last row of the table is the "Total" row.
# seq_len() keeps this correct even if the table has a single row.
df <- df_gene_name_errors_years
df <- df[seq_len(nrow(df) - 1), ]

# Column 1 holds the year labels as text (or as a factor, depending on how the
# table was built). Convert explicitly so the line panel always gets a numeric
# x axis instead of relying on implicit coercion or factor dispatch in plot().
year_labels <- as.character(df[, 1])
year_values <- as.numeric(year_labels)

# A: bar plot of publications with Excel gene lists
barplot(df[, 5], names.arg = year_labels,
        ylab = "Publications with Excel gene lists", ylim = c(0, 2500))
grid(nx = 0, ny = 5, col = "gray")

# B: bar plot of affected publications
barplot(df[, 10], names.arg = year_labels,
        ylab = "Affected publications", ylim = c(0, 800))
grid(nx = 0, ny = 8, col = "gray")

# C: line diagram of the proportion of publications affected
plot(x = year_values, y = df[, 11],
     ylab = "Proportion of publications affected (%)", xlab = "",
     ylim = c(0, 35), type = "b", pch = 19, bty = "n")
grid()

# Close the PDF device so the file is flushed to disk.
dev.off()

## png 
##   2

# Same three panels as fig1.pdf, drawn on the current device.
par(mfrow = c(3, 1))

# Work on the yearly rows only: the last row of the table is the "Total" row.
# seq_len() keeps this correct even if the table has a single row.
df <- df_gene_name_errors_years
df <- df[seq_len(nrow(df) - 1), ]

# Column 1 holds the year labels as text (or as a factor, depending on how the
# table was built). Convert explicitly so the line panel always gets a numeric
# x axis instead of relying on implicit coercion or factor dispatch in plot().
year_labels <- as.character(df[, 1])
year_values <- as.numeric(year_labels)

# A: bar plot of publications with Excel gene lists
barplot(df[, 5], names.arg = year_labels,
        ylab = "Publications with Excel gene lists", ylim = c(0, 2500))
grid(nx = 0, ny = 5, col = "gray")

# B: bar plot of affected publications
barplot(df[, 10], names.arg = year_labels,
        ylab = "Affected publications", ylim = c(0, 800))
grid(nx = 0, ny = 8, col = "gray")

# C: line diagram of the proportion of publications affected
plot(x = year_values, y = df[, 11],
     ylab = "Proportion of publications affected (%)", xlab = "",
     ylim = c(0, 35), type = "b", pch = 19, bty = "n")
grid()

Gene name errors for different organisms

From the results obtained on the publications and Excel spreadsheets with gene name errors, a further analysis was performed to identify which organisms had the most gene name errors.

Each publication and Excel sheet was screened for gene name errors. Accordingly, the species A. thaliana, C. elegans, D. melanogaster, D. rerio, G. gallus, H. sapiens, M. musculus, O. sativa, R. norvegicus and S. cerevisiae were selected, and the publications and Excel sheets containing gene name errors from 2014 to 2020 were counted for each of them. The results were saved under the name of each species.

Proportion of article affected for each individual species was calculated and resulted percentage was saved to 3 significant figures.

# Pool the result lines of all years and keep the records with more than two
# fields (the ones treated as gene lists throughout this script).
res_all_years <- c(res2014, res2015, res2016, res2017, res2018, res2019,
                   res2020)
res_all_years_l <- strsplit(res_all_years, " ")
res_all_years_lx <- res_all_years_l[lengths(res_all_years_l) > 2]

# Field 1 is used as the publication ID and field 3 as the organism label.
pmid <- sapply(res_all_years_lx, "[[", 1)
org <- sapply(res_all_years_lx, "[[", 3)
org_df <- data.frame(pmid, org)

# Distinct publications with a gene list for one organism label.
count_species_pubs <- function(label) {
  length(unique(org_df$pmid[which(org_df$org == label)]))
}

# Distinct affected publications (V1) whose V2 column equals the organism label.
count_species_aff <- function(label) {
  length(unique(aff_xls$V1[aff_xls$V2 == label]))
}

# Share (%) of affected publications, to 3 significant figures.
species_percent <- function(affected, published) {
  signif(affected / published * 100, 3)
}

ath_pub <- count_species_pubs("Athaliana")
ath_aff <- count_species_aff("Athaliana")
ath_pc <- species_percent(ath_aff, ath_pub)

cel_pub <- count_species_pubs("Celegans")
cel_aff <- count_species_aff("Celegans")
cel_pc <- species_percent(cel_aff, cel_pub)

dme_pub <- count_species_pubs("Dmelanogaster")
dme_aff <- count_species_aff("Dmelanogaster")
dme_pc <- species_percent(dme_aff, dme_pub)

dre_pub <- count_species_pubs("Drerio")
dre_aff <- count_species_aff("Drerio")
dre_pc <- species_percent(dre_aff, dre_pub)

gga_pub <- count_species_pubs("Ggallus")
gga_aff <- count_species_aff("Ggallus")
gga_pc <- species_percent(gga_aff, gga_pub)

hsa_pub <- count_species_pubs("Hsapiens")
hsa_aff <- count_species_aff("Hsapiens")
hsa_pc <- species_percent(hsa_aff, hsa_pub)

mmu_pub <- count_species_pubs("Mmusculus")
mmu_aff <- count_species_aff("Mmusculus")
mmu_pc <- species_percent(mmu_aff, mmu_pub)

osa_pub <- count_species_pubs("Osativa")
osa_aff <- count_species_aff("Osativa")
osa_pc <- species_percent(osa_aff, osa_pub)

rno_pub <- count_species_pubs("Rnorvegicus")
rno_aff <- count_species_aff("Rnorvegicus")
rno_pc <- species_percent(rno_aff, rno_pub)

sce_pub <- count_species_pubs("Scerevisiae")
sce_aff <- count_species_aff("Scerevisiae")
sce_pc <- species_percent(sce_aff, sce_pub)

Table 4 of the manuscript

A data frame was created containing the species, the publications with Excel gene lists, the affected publications and the proportion of articles affected, and Table 4 of the manuscript was generated from it.

# Display names of the ten organisms, in the same order as the count vectors.
Species <- c("A.thaliana","C.elegans","D.melanogaster","D.rerio","G.gallus",
  "H.sapiens","M.musculus","O.sativa","R.norvegicus","S.cerevisiae")

# Publications with gene lists, affected publications and percentage affected.
pub <- c(ath_pub, cel_pub, dme_pub, dre_pub, gga_pub,
         hsa_pub, mmu_pub, osa_pub, rno_pub, sce_pub)
aff <- c(ath_aff, cel_aff, dme_aff, dre_aff, gga_aff,
         hsa_aff, mmu_aff, osa_aff, rno_aff, sce_aff)
prop <- c(ath_pc, cel_pc, dme_pc, dre_pc, gga_pc,
          hsa_pc, mmu_pc, osa_pc, rno_pc, sce_pc)

# Build the Table 4 data frame and give it its display headers in one step.
gene_name_errors_species <- setNames(
  data.frame(Species, pub, aff, prop),
  c("Species",
    "Publications with excel gene lists",
    "Affected publications",
    "Propotion of articles affected")
)

# Render the table.
kable_paper(kbl(gene_name_errors_species), "hover", full_width = FALSE)

Species	Publications with excel gene lists	Affected publications	Propotion of articles affected
A.thaliana	511	76	14.90
C.elegans	124	31	25.00
D.melanogaster	607	142	23.40
D.rerio	251	48	19.10
G.gallus	1827	172	9.41
H.sapiens	7936	2419	30.50
M.musculus	1577	609	38.60
O.sativa	10	0	0.00
R.norvegicus	327	68	20.80
S.cerevisiae	443	93	21.00

Number of affected publications in different journals

The journals indexed in PubMed Central were extracted for every year from 2014 to 2020 and the result was saved as “jdf”. The publications with one or more supplementary Excel gene lists were collected and saved as “res_p”. “jdf_subset” was created by restricting “jdf” to the publications listed in “res_p”.

“journal_supplementary” was created to count the number of journals with gene name errors in the supplementary files. “jdf_t_df” was created from “jdf_subset” to hold the number of articles with Excel gene lists for each journal. The number of affected articles per journal was calculated using “aff_xls”.

Table 5 in the manuscript was created from above gathered results.

# Journals indexed in PMC: one listing per year, stacked into a single table.
j2014 <- read.table("pmc/genom/pmc_journal2014genom.out.txt")
j2015 <- read.table("pmc/genom/pmc_journal2015genom.out.txt")
j2016 <- read.table("pmc/genom/pmc_journal2016genom.out.txt")
j2017 <- read.table("pmc/genom/pmc_journal2017genom.out.txt")
j2018 <- read.table("pmc/genom/pmc_journal2018genom.out.txt")
j2019 <- read.table("pmc/genom/pmc_journal2019genom.out.txt")
j2020 <- read.table("pmc/genom/pmc_journal2020genom.out.txt")
yearly_journal_tables <- list(j2014, j2015, j2016, j2017, j2018, j2019, j2020)
jdf <- do.call(rbind, yearly_journal_tables)

# Number of distinct values in the second column (the journal name).
journal_names <- jdf[[2]]
length(unique(journal_names))

## [1] 4581

# Publications with one or more supplementary Excel gene lists.
# Records with more than two fields are the ones treated as gene lists.
keep_gene_list_records <- function(fields) {
  fields[lengths(fields) > 2]
}
# Distinct first-field values (publication IDs) of a set of records.
distinct_pub_ids <- function(records) {
  unique(sapply(records, "[[", 1))
}

res2014_xlg <- keep_gene_list_records(res2014_l)
res2015_xlg <- keep_gene_list_records(res2015_l)
res2016_xlg <- keep_gene_list_records(res2016_l)
res2017_xlg <- keep_gene_list_records(res2017_l)
res2018_xlg <- keep_gene_list_records(res2018_l)
res2019_xlg <- keep_gene_list_records(res2019_l)
res2020_xlg <- keep_gene_list_records(res2020_l)

res2014_p <- distinct_pub_ids(res2014_xlg)
res2015_p <- distinct_pub_ids(res2015_xlg)
res2016_p <- distinct_pub_ids(res2016_xlg)
res2017_p <- distinct_pub_ids(res2017_xlg)
res2018_p <- distinct_pub_ids(res2018_xlg)
res2019_p <- distinct_pub_ids(res2019_xlg)
res2020_p <- distinct_pub_ids(res2020_xlg)

# All such publications over the seven years, without repeats.
res_p <- unique(c(res2014_p, res2015_p, res2016_p, res2017_p,
                  res2018_p, res2019_p, res2020_p))

# Journal-index rows whose first column matches one of those publications.
in_res_p <- jdf[, 1] %in% res_p
jdf_subset <- jdf[which(in_res_p), ]

# Number of journals with gene name errors in the supplementary files
# (distinct values of column V3 of the true positive records).
journal_supplementary <- length(unique(aff_xls$V3))

# Number of articles with Excel gene lists per journal. Note that jdf is
# overwritten here with the de-duplicated subset.
jdf <- jdf_subset[!duplicated(jdf_subset), , drop = FALSE]
jdf_t <- table(jdf[, 2])
jdf_t_df <- as.data.frame(jdf_t)

# Number of affected articles per journal: distinct (V1, V3) pairs, tabulated
# by journal.
jres <- unique(aff_xls[, c(1, 3)])
j_table <- table(jres[, 2])
j_table_df <- as.data.frame(j_table)

# Join the two tallies on the journal name. Journals present in only one of
# them are dropped by the default (inner) merge.
jj <- merge(jdf_t_df, j_table_df, by = "Var1")
colnames(jj) <- c("journal",
  "Number of articles with Excel gene lists",
  "Number of affected articles")

# Percentage affected, to 3 significant figures.
jj[["Proportion of articles affected"]] <- signif(jj[, 3] / jj[, 2] * 100, 3)

# Keep journals with at least 50 gene-list articles, most affected first.
at_least_50 <- which(jj[, 2] > 49)
jjf <- jj[at_least_50, ]
jjf <- jjf[order(-jjf[, 3]), ]

# Render the table.
kable_paper(kbl(jjf), "hover", full_width = FALSE)

	journal	Number of articles with Excel gene lists	Number of affected articles	Proportion of articles affected
319	Nat_Commun	920	345	37.50
376	PLoS_One	946	244	25.80
394	Sci_Rep	767	227	29.60
49	BMC_Genomics	660	166	25.20
373	PLoS_Genet	448	134	29.90
352	Oncotarget	326	107	32.80
171	Front_Genet	313	94	30.00
140	eLife	243	89	36.60
380	Proc_Natl_Acad_Sci_U_S_A	155	73	47.10
95	Cell_Rep	158	71	44.90
200	Genome_Biol	193	66	34.20
330	Nature	118	52	44.10
321	Nat_Genet	140	48	34.30
202	Genome_Med	137	44	32.10
83	Cell	74	39	52.70
358	PeerJ	137	39	28.50
109	Clin_Epigenetics	109	38	34.90
345	Nucleic_Acids_Res	120	36	30.00
54	BMC_Med_Genomics	117	31	26.50
179	Front_Oncol	85	31	36.50
406	Transl_Psychiatry	73	29	39.70
42	BMC_Cancer	105	28	26.70
118	Commun_Biol	74	27	36.50
377	PLoS_Pathog	80	27	33.80
8	Aging_	56	26	46.40
137	EBioMedicine	51	26	51.00
371	PLoS_Biol	66	26	39.40
149	Epigenetics_Chromatin	64	25	39.10
372	PLoS_Comput_Biol	97	24	24.70
348	Oncogene	53	22	41.50
233	iScience	58	20	34.50
392	Sci_Adv	56	20	35.70
40	BMC_Bioinformatics	77	19	24.70
187	G3_	74	15	20.30
216	Hum_Mol_Genet	53	15	28.30
62	BMC_Plant_Biol	52	6	11.50
183	Front_Plant_Sci	75	5	6.67

Here we need to create an object containing PMCID, journal name and year for the 11117 pubs with Excel gene lists, so that we can calculate yearly proportions later.

# Turn a vector of publication IDs into a two-column table: the IDs under
# "pmc" and a constant "year" label (stored as text).
label_with_year <- function(ids, year_label) {
  labelled <- as.data.frame(ids)
  colnames(labelled) <- "pmc"
  labelled$year <- year_label
  labelled
}

res2014_p_df <- label_with_year(res2014_p, "2014")
res2015_p_df <- label_with_year(res2015_p, "2015")
res2016_p_df <- label_with_year(res2016_p, "2016")
res2017_p_df <- label_with_year(res2017_p, "2017")
res2018_p_df <- label_with_year(res2018_p, "2018")
res2019_p_df <- label_with_year(res2019_p, "2019")
res2020_p_df <- label_with_year(res2020_p, "2020")

# Stack the yearly tables into one publication-to-year lookup.
pub_yr <- rbind(res2014_p_df, res2015_p_df, res2016_p_df, res2017_p_df,
                res2018_p_df, res2019_p_df, res2020_p_df)

# Attach the year to each journal-index row by matching V1 against the ID.
jdf2 <- merge(jdf, pub_yr, by.x = "V1", by.y = "pmc")

Top journals with more than 100 gene name errors articles from 2014 to 2020

Top six journals with most gene name errors were selected based on “Number of affected articles” in the Table 5 of the manuscript.

Accordingly, Nat Commun,PLoS One,Sci Rep,BMC Genomics,PLoS Genet,Oncotarget were considered as top 6 journals with most gene name errors. The number of publications published by each of these 6 journals in each year with gene name errors were counted and result was recorded.

# Year labels for the per-journal tables (no "Total" row here).
years <- as.character(2014:2020)

# Distinct pairs of columns 1 and 3 (publication ID and journal name) among
# the affected records of one year.
distinct_pub_journal <- function(year_rows) {
  unique(year_rows[, c(1, 3)])
}

aj2014 <- distinct_pub_journal(pub_2014)
aj2015 <- distinct_pub_journal(pub_2015)
aj2016 <- distinct_pub_journal(pub_2016)
aj2017 <- distinct_pub_journal(pub_2017)
aj2018 <- distinct_pub_journal(pub_2018)
aj2019 <- distinct_pub_journal(pub_2019)
aj2020 <- distinct_pub_journal(pub_2020)

# Nature Communications ("Nat_Commun")
# Affected articles per year: rows of each yearly pair table for this journal.
nc_aff <- vapply(
  list(aj2014, aj2015, aj2016, aj2017, aj2018, aj2019, aj2020),
  function(aj) nrow(aj[aj$V3 == "Nat_Commun", ]),
  integer(1)
)

# Articles with Excel gene lists per year: distinct jdf2 rows for this journal
# and year.
nc_gl <- vapply(
  years,
  function(yr) {
    hits <- jdf2[which(jdf2$V2 == "Nat_Commun" & jdf2$year == yr), ]
    nrow(unique(hits))
  },
  integer(1),
  USE.NAMES = FALSE
)

# Percentage affected, to 3 significant figures.
nc_pct <- signif(nc_aff / nc_gl * 100, 3)

nc_df <- data.frame(years, nc_aff, nc_gl, nc_pct)
colnames(nc_df) <- c("year","affected articles","articles with excel gene lists", "proportion of articles affected (%)")
kable_paper(kbl(nc_df), "hover", full_width = FALSE)

year	affected articles	articles with excel gene lists	proportion of articles affected (%)
2014	11	33	33.3
2015	19	56	33.9
2016	29	85	34.1
2017	50	131	38.2
2018	67	175	38.3
2019	84	225	37.3
2020	85	215	39.5

# PLoS One ("PLoS_One")
# Affected articles per year: rows of each yearly pair table for this journal.
po_aff <- vapply(
  list(aj2014, aj2015, aj2016, aj2017, aj2018, aj2019, aj2020),
  function(aj) nrow(aj[aj$V3 == "PLoS_One", ]),
  integer(1)
)

# Articles with Excel gene lists per year: distinct jdf2 rows for this journal
# and year.
po_gl <- vapply(
  years,
  function(yr) {
    hits <- jdf2[which(jdf2$V2 == "PLoS_One" & jdf2$year == yr), ]
    nrow(unique(hits))
  },
  integer(1),
  USE.NAMES = FALSE
)

# Percentage affected, to 3 significant figures.
po_pct <- signif(po_aff / po_gl * 100, 3)

po_df <- data.frame(years, po_aff, po_gl, po_pct)
colnames(po_df) <- c("year","affected articles","articles with excel gene lists", "proportion of articles affected (%)")
kable_paper(kbl(po_df), "hover", full_width = FALSE)

year	affected articles	articles with excel gene lists	proportion of articles affected (%)
2014	58	222	26.1
2015	57	195	29.2
2016	34	150	22.7
2017	37	135	27.4
2018	21	106	19.8
2019	20	69	29.0
2020	17	69	24.6

# Scientific Reports ("Sci_Rep")
# Affected articles per year: rows of each yearly pair table for this journal.
sr_aff <- vapply(
  list(aj2014, aj2015, aj2016, aj2017, aj2018, aj2019, aj2020),
  function(aj) nrow(aj[aj$V3 == "Sci_Rep", ]),
  integer(1)
)

# Articles with Excel gene lists per year: distinct jdf2 rows for this journal
# and year.
sr_gl <- vapply(
  years,
  function(yr) {
    hits <- jdf2[which(jdf2$V2 == "Sci_Rep" & jdf2$year == yr), ]
    nrow(unique(hits))
  },
  integer(1),
  USE.NAMES = FALSE
)

# Percentage affected, to 3 significant figures.
sr_pct <- signif(sr_aff / sr_gl * 100, 3)

sr_df <- data.frame(years, sr_aff, sr_gl, sr_pct)
colnames(sr_df) <- c("year","affected articles","articles with excel gene lists", "proportion of articles affected (%)")
kable_paper(kbl(sr_df), "hover", full_width = FALSE)

year	affected articles	articles with excel gene lists	proportion of articles affected (%)
2014	3	9	33.3
2015	7	41	17.1
2016	37	144	25.7
2017	57	157	36.3
2018	43	141	30.5
2019	47	133	35.3
2020	33	142	23.2

# BMC Genomics ("BMC_Genomics")
# Affected articles per year: rows of each yearly pair table for this journal.
bg_aff <- vapply(
  list(aj2014, aj2015, aj2016, aj2017, aj2018, aj2019, aj2020),
  function(aj) nrow(aj[aj$V3 == "BMC_Genomics", ]),
  integer(1)
)

# Articles with Excel gene lists per year: distinct jdf2 rows for this journal
# and year.
bg_gl <- vapply(
  years,
  function(yr) {
    hits <- jdf2[which(jdf2$V2 == "BMC_Genomics" & jdf2$year == yr), ]
    nrow(unique(hits))
  },
  integer(1),
  USE.NAMES = FALSE
)

# Percentage affected, to 3 significant figures.
bg_pct <- signif(bg_aff / bg_gl * 100, 3)

bg_df <- data.frame(years, bg_aff, bg_gl, bg_pct)
colnames(bg_df) <- c("year","affected articles","articles with excel gene lists", "proportion of articles affected (%)")
kable_paper(kbl(bg_df), "hover", full_width = FALSE)

year	affected articles	articles with excel gene lists	proportion of articles affected (%)
2014	25	98	25.5
2015	28	116	24.1
2016	26	97	26.8
2017	26	86	30.2
2018	20	102	19.6
2019	21	92	22.8
2020	20	69	29.0

# PLoS_Genet
# Affected articles per year: number of rows in each yearly table whose
# journal column (V3) equals "PLoS_Genet". Counting with
# sum(..., na.rm = TRUE) means a missing journal name is never counted as a
# match (indexing a data frame with an NA logical would otherwise add an
# all-NA row to the count).
pg_aff <- vapply(
  list(aj2014, aj2015, aj2016, aj2017, aj2018, aj2019, aj2020),
  function(aj) sum(aj$V3 == "PLoS_Genet", na.rm = TRUE),
  integer(1)
)

# Articles with Excel gene lists per year: distinct jdf2 rows for this journal
# and year. which() drops NA comparisons, as subset() does.
# USE.NAMES = FALSE keeps the vector unnamed so data.frame() below does not
# pick up the years as row names.
pg_gl <- vapply(
  as.character(2014:2020),
  function(yr) {
    hits <- which(jdf2$V2 == "PLoS_Genet" & jdf2$year == yr)
    nrow(unique(jdf2[hits, , drop = FALSE]))
  },
  integer(1),
  USE.NAMES = FALSE
)

# Yearly summary table with the percentage affected to 3 significant figures.
pg_df <- data.frame(years, pg_aff, pg_gl, signif(pg_aff / pg_gl * 100, 3))
colnames(pg_df) <- c(
  "year",
  "affected articles",
  "articles with excel gene lists",
  "proportion of articles affected (%)"
)
pg_df %>%
  kbl() %>%
  kable_paper("hover", full_width = FALSE)

year	affected articles	articles with excel gene lists	proportion of articles affected (%)
2014	23	72	31.9
2015	19	68	27.9
2016	19	67	28.4
2017	18	64	28.1
2018	16	56	28.6
2019	16	60	26.7
2020	23	61	37.7

# Oncotarget
# Affected articles per year: number of rows in each yearly table whose
# journal column (V3) equals "Oncotarget". Counting with
# sum(..., na.rm = TRUE) means a missing journal name is never counted as a
# match (indexing a data frame with an NA logical would otherwise add an
# all-NA row to the count).
ot_aff <- vapply(
  list(aj2014, aj2015, aj2016, aj2017, aj2018, aj2019, aj2020),
  function(aj) sum(aj$V3 == "Oncotarget", na.rm = TRUE),
  integer(1)
)

# Articles with Excel gene lists per year: distinct jdf2 rows for this journal
# and year. which() drops NA comparisons, as subset() does.
# USE.NAMES = FALSE keeps the vector unnamed so data.frame() below does not
# pick up the years as row names.
ot_gl <- vapply(
  as.character(2014:2020),
  function(yr) {
    hits <- which(jdf2$V2 == "Oncotarget" & jdf2$year == yr)
    nrow(unique(jdf2[hits, , drop = FALSE]))
  },
  integer(1),
  USE.NAMES = FALSE
)

# Yearly summary table with the percentage affected to 3 significant figures.
ot_df <- data.frame(years, ot_aff, ot_gl, signif(ot_aff / ot_gl * 100, 3))
colnames(ot_df) <- c(
  "year",
  "affected articles",
  "articles with excel gene lists",
  "proportion of articles affected (%)"
)
ot_df %>%
  kbl() %>%
  kable_paper("hover", full_width = FALSE)

year	affected articles	articles with excel gene lists	proportion of articles affected (%)
2014	5	18	27.8
2015	10	47	21.3
2016	41	106	38.7
2017	35	107	32.7
2018	11	35	31.4
2019	3	8	37.5
2020	2	5	40.0

Plots for figure 2 of the manuscript

Temporal trends for the six journals with the most gene name errors from 2014 to 2020 were examined, and Figure 2 in the manuscript was reproduced.

years <- c("2014", "2015", "2016", "2017", "2018", "2019", "2020")

# Journals to plot, in plotting order. The list names are used as panel
# titles; the values are the per-journal yearly summary tables, which are
# read positionally (column 1 = x axis of the line plot, column 2 = affected
# publications, column 3 = publications with Excel gene lists).
journal_panels <- list(
  "Nature Communications" = nc_df,
  "PLOS ONE" = po_df,
  "Scientific Reports" = sr_df,
  "BMC Genomics" = bg_df,
  "PLoS Genetics" = pg_df,
  "Oncotarget" = ot_df
)

# Draw the three panels for one journal on the current graphics device:
#   1. bar plot of publications with Excel gene lists (carries the title),
#   2. bar plot of affected publications,
#   3. line plot of the percentage of publications affected.
# df:    yearly summary table for the journal (read positionally, see above)
# title: main title shown on the first panel
# years: labels placed under the bars
plot_journal_trend <- function(df, title, years) {
  barplot(df[, 3], ylab = "Publications with Excel gene lists", xlab = "",
    names.arg = years, main = title)
  grid()
  barplot(df[, 2], ylab = "Affected publications", xlab = "",
    names.arg = years)
  grid()
  plot(x = df[, 1], y = df[, 2] / df[, 3] * 100,
    ylab = "Proportion of publications affected (%)", xlab = "",
    type = "b", pch = 19, bty = "n")
  grid()
}

# Draw every journal in `panels` in order, three panels each.
plot_all_journals <- function(panels, years) {
  for (i in seq_along(panels)) {
    plot_journal_trend(panels[[i]], names(panels)[i], years)
  }
}

# Bar and line diagrams per journal and year, three stacked panels per page
par(mfrow = c(3, 1))
plot_all_journals(journal_panels, years)

## now make the pdf
pdf("fig3.pdf", width = 4, height = 7)
par(mfrow = c(3, 1))
plot_all_journals(journal_panels, years)
dev.off()

## png 
##   2

Journals with gene name error articles, by year

For each year from 2014 to 2020, the journals which contain more than 10 gene name error publications are selected.

We then output the top 10 journals with gene name errors for each year from 2014 to 2020 with "head(aj20**_j_dsc,10)".

# Frequency table of the second column of each yearly table of affected
# articles (one count per journal name).
aj2014_tab <- table(aj2014[, 2])
aj2015_tab <- table(aj2015[, 2])
aj2016_tab <- table(aj2016[, 2])
aj2017_tab <- table(aj2017[, 2])
aj2018_tab <- table(aj2018[, 2])
aj2019_tab <- table(aj2019[, 2])
aj2020_tab <- table(aj2020[, 2])

# 2020: counts as a data frame (Var1 = journal, Freq = number of articles)
aj2020_df <- as.data.frame(aj2020_tab)
# journals with more than 10 affected articles
aj2020_j <- subset(aj2020_df, Freq > 10)
# all journals, most affected first; show the top 10
aj2020_j_dsc <- aj2020_df[order(-aj2020_df$Freq), ]
head(aj2020_j_dsc, n = 10)

##                         Var1 Freq
## 123               Nat_Commun   85
## 60               Front_Genet   43
## 162                  Sci_Rep   33
## 66               Front_Oncol   23
## 152               PLoS_Genet   23
## 50                     eLife   22
## 15              BMC_Genomics   20
## 154                 PLoS_One   17
## 157 Proc_Natl_Acad_Sci_U_S_A   17
## 42               Commun_Biol   16

# 2019: counts as a data frame (Var1 = journal, Freq = number of articles)
aj2019_df <- as.data.frame(aj2019_tab)
# journals with more than 10 affected articles
aj2019_j <- subset(aj2019_df, Freq > 10)
# all journals, most affected first; show the top 10
aj2019_j_dsc <- aj2019_df[order(-aj2019_df$Freq), ]
head(aj2019_j_dsc, n = 10)

##                 Var1 Freq
## 129       Nat_Commun   84
## 157          Sci_Rep   47
## 69       Front_Genet   29
## 24      BMC_Genomics   21
## 152         PLoS_One   20
## 151       PLoS_Genet   16
## 41          Cell_Rep   15
## 54             eLife   13
## 145            PeerJ   13
## 44  Clin_Epigenetics   12

# 2018: counts as a data frame (Var1 = journal, Freq = number of articles)
aj2018_df <- as.data.frame(aj2018_tab)
# journals with more than 10 affected articles
aj2018_j <- subset(aj2018_df, Freq > 10)
# all journals, most affected first; show the top 10
aj2018_j_dsc <- aj2018_df[order(-aj2018_df$Freq), ]
head(aj2018_j_dsc, n = 10)

##             Var1 Freq
## 103   Nat_Commun   67
## 131      Sci_Rep   43
## 124     PLoS_One   21
## 20  BMC_Genomics   20
## 35      Cell_Rep   16
## 122   PLoS_Genet   16
## 46         eLife   14
## 55   Front_Genet   13
## 66   Genome_Biol   12
## 118   Oncotarget   11

# 2017: counts as a data frame (Var1 = journal, Freq = number of articles)
aj2017_df <- as.data.frame(aj2017_tab)
# journals with more than 10 affected articles
aj2017_j <- subset(aj2017_df, Freq > 10)
# all journals, most affected first; show the top 10
aj2017_j_dsc <- aj2017_df[order(-aj2017_df$Freq), ]
head(aj2017_j_dsc, n = 10)

##             Var1 Freq
## 102      Sci_Rep   57
## 80    Nat_Commun   50
## 98      PLoS_One   37
## 91    Oncotarget   35
## 12  BMC_Genomics   26
## 97    PLoS_Genet   18
## 86        Nature   13
## 35         eLife   11
## 55   Genome_Biol   11
## 24      Cell_Rep    7

# 2016: counts as a data frame (Var1 = journal, Freq = number of articles)
aj2016_df <- as.data.frame(aj2016_tab)
# journals with more than 10 affected articles
aj2016_j <- subset(aj2016_df, Freq > 10)
# all journals, most affected first; show the top 10
aj2016_j_dsc <- aj2016_df[order(-aj2016_df$Freq), ]
head(aj2016_j_dsc, n = 10)

##                         Var1 Freq
## 119               Oncotarget   41
## 137                  Sci_Rep   37
## 131                 PLoS_One   34
## 103               Nat_Commun   29
## 14              BMC_Genomics   26
## 128               PLoS_Genet   19
## 32                  Cell_Rep   15
## 48                     eLife   10
## 133 Proc_Natl_Acad_Sci_U_S_A   10
## 26                      Cell    9

# 2015: counts as a data frame (Var1 = journal, Freq = number of articles)
aj2015_df <- as.data.frame(aj2015_tab)
# journals with more than 10 affected articles
aj2015_j <- subset(aj2015_df, Freq > 10)
# all journals, most affected first; show the top 10
aj2015_j_dsc <- aj2015_df[order(-aj2015_df$Freq), ]
head(aj2015_j_dsc, n = 10)

##                         Var1 Freq
## 133                 PLoS_One   57
## 18              BMC_Genomics   28
## 136 Proc_Natl_Acad_Sci_U_S_A   23
## 115               Nat_Commun   19
## 132               PLoS_Genet   19
## 123        Nucleic_Acids_Res   18
## 31                      Cell   14
## 48                     eLife   11
## 76                Genome_Res   11
## 125               Oncotarget   10

# 2014: counts as a data frame (Var1 = journal, Freq = number of articles)
aj2014_df <- as.data.frame(aj2014_tab)
# journals with more than 10 affected articles
aj2014_j <- subset(aj2014_df, Freq > 10)
# all journals, most affected first; show the top 10
aj2014_j_dsc <- aj2014_df[order(-aj2014_df$Freq), ]
head(aj2014_j_dsc, n = 10)

##                  Var1 Freq
## 76           PLoS_One   58
## 8        BMC_Genomics   25
## 75         PLoS_Genet   23
## 39        Genome_Biol   12
## 62         Nat_Commun   11
## 68  Nucleic_Acids_Res    9
## 25              eLife    8
## 67             Nature    6
## 6  BMC_Bioinformatics    5
## 70         Oncotarget    5

# 2020 again: this recomputes the same 2020 objects from aj2020_tab that are
# built at the start of this section, and prints the same top 10.
aj2020_df <- as.data.frame(aj2020_tab)
# journals with more than 10 affected articles
aj2020_j <- subset(aj2020_df, Freq > 10)
# all journals, most affected first; show the top 10
aj2020_j_dsc <- aj2020_df[order(-aj2020_df$Freq), ]
head(aj2020_j_dsc, n = 10)

##                         Var1 Freq
## 123               Nat_Commun   85
## 60               Front_Genet   43
## 162                  Sci_Rep   33
## 66               Front_Oncol   23
## 152               PLoS_Genet   23
## 50                     eLife   22
## 15              BMC_Genomics   20
## 154                 PLoS_One   17
## 157 Proc_Natl_Acad_Sci_U_S_A   17
## 42               Commun_Biol   16

Consistency between this and our previous analysis

Focus on PLOS ONE in the period 2014-2015. Previously we found 60 studies. Here we find 115 studies. Of these, 56 are common to both.

# Load the earlier tab-separated result table (first line is the header) and
# keep only the rows whose Confirmed column reads "Confirmed".
prev <- read.table("plosone_20142015.tsv", header = TRUE, sep = "\t")
# which() drops NA comparisons, matching what subset() does
prev <- prev[which(prev$Confirmed == "Confirmed"), , drop = FALSE]
tail(prev)

##    Journal YearPublished Confirmed     PMID      PMCID
## 73 PLosOne          2015 Confirmed 26444573 PMC4596691
## 74 PLosOne          2015 Confirmed 26510177 PMC4624949
## 75 PLosOne          2015 Confirmed 26529237 PMC4631338
## 76 PLosOne          2015 Confirmed 26636579 PMC4670106
## 77 PLosOne          2015 Confirmed 26695660 PMC4687867
## 78 PLosOne          2015 Confirmed 26684451 PMC4684321

# Distinct PMC identifiers among the confirmed rows of the earlier table,
# and how many there are
prev_pmc <- unique(prev[["PMCID"]])
length(prev_pmc)

## [1] 60

# Rows of aff_xls with journal (V3) "PLoS_One" and year (V4) 2014 or 2015,
# keeping columns 1, 3 and 4 only.
new <- subset(
  aff_xls,
  V3 == "PLoS_One" & V4 %in% c("2014", "2015"),
  select = c(1, 3, 4)
)
# Distinct identifiers (V1) found in this screen, and how many there are
new_pmc <- unique(new[["V1"]])
length(new_pmc)

## [1] 115

# Euler diagram of the overlap between the two identifier sets, with the
# size of each region printed on the diagram
v1 <- list(Prev = prev_pmc, New = new_pmc)
v1 %>%
  euler() %>%
  plot(quantities = TRUE)

# PMCs found only in the previous (2016) study
setdiff(x = prev_pmc, y = new_pmc)

## [1] "PMC4118979" "PMC4441472" "PMC4500563" "PMC4508115"

# PMCs found only in the current (2021) study
setdiff(x = new_pmc, y = prev_pmc)

##  [1] "PMC3894996" "PMC3901708" "PMC3968145" "PMC3989189" "PMC3990548"
##  [6] "PMC3990644" "PMC3990668" "PMC3999108" "PMC4002427" "PMC4002480"
## [11] "PMC4011728" "PMC4012993" "PMC4029955" "PMC4039489" "PMC4041891"
## [16] "PMC4079602" "PMC4084626" "PMC4099127" "PMC4103770" "PMC4105622"
## [21] "PMC4108364" "PMC4109958" "PMC4128672" "PMC4141782" "PMC4156408"
## [26] "PMC4157799" "PMC4167545" "PMC4207689" "PMC4208810" "PMC4210245"
## [31] "PMC4230926" "PMC4252097" "PMC4262388" "PMC4338293" "PMC4364623"
## [36] "PMC4370594" "PMC4370704" "PMC4372331" "PMC4373911" "PMC4388690"
## [41] "PMC4416816" "PMC4427337" "PMC4441380" "PMC4456163" "PMC4472808"
## [46] "PMC4478024" "PMC4514889" "PMC4549312" "PMC4551741" "PMC4552880"
## [51] "PMC4556673" "PMC4580636" "PMC4583255" "PMC4618846" "PMC4641597"
## [56] "PMC4641603" "PMC4671692" "PMC4681367" "PMC4699828"

Journal impact factor correlation

Journal Citation Reports 2020 (Impact factor & Ranking of 2019). InCites Journal Citation Reports (Clarivate Analytics).

# Read the journal impact factor table (whitespace-separated, with a header).
jifs <- read.table("jifs.tsv",header=TRUE)
# Keep columns 1 and 4 of jjf; the code below uses them as `journal` and
# `Proportion of articles affected`.
proportions <- jjf[,c(1,4)]
# Join the two tables on journal name (`journal` in proportions, `Journal` in
# jifs). merge() defaults to keeping only journals present in both.
myjifs <- merge(proportions, jifs, by.x="journal",by.y="Journal")
# Pearson correlation (the cor.test default) between impact factor and the
# proportion of articles affected. The argument expressions are echoed in the
# printed "data:" line, so rewriting them changes the report output.
cor.test(myjifs$JIF,myjifs$`Proportion of articles affected`)

## 
##  Pearson's product-moment correlation
## 
## data:  myjifs$JIF and myjifs$`Proportion of articles affected`
## t = 2.9932, df = 33, p-value = 0.005197
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1522930 0.6892045
## sample estimates:
##       cor 
## 0.4620881

# Spearman rank correlation between impact factor and the proportion of
# articles affected. The method name is written in full rather than relying
# on the abbreviation "s" being matched to "spearman".
cor.test(myjifs$JIF,myjifs$`Proportion of articles affected`,method="spearman")

## Warning in cor.test.default(myjifs$JIF, myjifs$`Proportion of articles
## affected`, : Cannot compute exact p-value with ties

## 
##  Spearman's rank correlation rho
## 
## data:  myjifs$JIF and myjifs$`Proportion of articles affected`
## S = 2932.1, p-value = 0.0001953
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.5893395

# Simple linear regression of the proportion of articles affected on journal
# impact factor. The formula text is echoed in the printed "Call:" line and in
# the coefficient names, so rewriting it changes the report output.
# NOTE(review): the conventional form is
# lm(`Proportion of articles affected` ~ JIF, data = myjifs).
mylm <- lm(myjifs$`Proportion of articles affected` ~ myjifs$JIF)
summary(mylm)

## 
## Call:
## lm(formula = myjifs$`Proportion of articles affected` ~ myjifs$JIF)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -24.2790  -4.5087  -0.6097   4.7067  19.4141 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  28.8471     1.9874  14.515 6.95e-16 ***
## myjifs$JIF    0.4775     0.1595   2.993   0.0052 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.702 on 33 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.2135, Adjusted R-squared:  0.1897 
## F-statistic: 8.959 on 1 and 33 DF,  p-value: 0.005197

# Scatter plot of the proportion of articles affected against journal impact
# factor, with the fitted regression line. Defined once so the on-screen and
# PDF versions cannot drift apart.
# jif_data: data frame with columns JIF and `Proportion of articles affected`
# fit:      fitted model passed to abline() to draw the line
plot_jif_scatter <- function(jif_data, fit) {
  plot(jif_data$JIF, jif_data$`Proportion of articles affected`,
    xlab = "JIF", ylab = "Proportion of articles affected (%)",
    main = "Articles with supplementary Excel gene lists")
  abline(fit)
}

# On-screen / report version
plot_jif_scatter(myjifs, mylm)

# Same figure written to fig2.pdf
pdf("fig2.pdf")
plot_jif_scatter(myjifs, mylm)
dev.off()

## png 
##   2

Session information

For reproducibility.

# Print the R version, platform, locale, and attached/loaded package versions
# used for this run.
sessionInfo()

## R version 4.1.0 (2021-05-18)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.2 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
## 
## locale:
##  [1] LC_CTYPE=en_AU.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_AU.UTF-8        LC_COLLATE=en_AU.UTF-8    
##  [5] LC_MONETARY=en_AU.UTF-8    LC_MESSAGES=en_AU.UTF-8   
##  [7] LC_PAPER=en_AU.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] Cairo_1.5-12.2   eulerr_6.1.0     kableExtra_1.3.4 dplyr_1.0.6     
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.6        highr_0.9         pillar_1.6.1      bslib_0.2.5.1    
##  [5] compiler_4.1.0    jquerylib_0.1.4   tools_4.1.0       digest_0.6.27    
##  [9] viridisLite_0.4.0 jsonlite_1.7.2    evaluate_0.14     lifecycle_1.0.0  
## [13] tibble_3.1.2      pkgconfig_2.0.3   rlang_0.4.11      DBI_1.1.1        
## [17] rstudioapi_0.13   yaml_2.2.1        xfun_0.23         stringr_1.4.0    
## [21] httr_1.4.2        knitr_1.33        xml2_1.3.2        systemfonts_1.0.2
## [25] generics_0.1.0    vctrs_0.3.8       sass_0.4.0        grid_4.1.0       
## [29] webshot_0.5.2     tidyselect_1.1.1  svglite_2.0.0     glue_1.4.2       
## [33] R6_2.5.0          fansi_0.5.0       rmarkdown_2.8     polyclip_1.10-0  
## [37] polylabelr_0.2.0  purrr_0.3.4       magrittr_2.0.1    scales_1.1.1     
## [41] ellipsis_0.3.2    htmltools_0.5.1.1 assertthat_0.2.1  rvest_1.0.0      
## [45] colorspace_2.0-1  utf8_1.2.1        stringi_1.6.2     munsell_0.5.0    
## [49] crayon_1.4.1