Source: http://118.138.234.73/public/gene_name_errors/manuscript_files/results_summary.html
library("dplyr")
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library("kableExtra")
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
library("eulerr")
library("Cairo")
options(bitmapType="cairo")
The bash script “gene_names.sh” was used to screen a list of PMC articles. Here we process and analyse the output of that script to generate the figures and tables needed for the paper.
First, we count the number of publications available in PubMed Central for each year from 2014 to 2020. These publications were selected through a search in PubMed Central using the keyword “genom”.
# Yearly PubMed Central article counts for the "genom" search, 2014-2020.
# Each pmc<year>.txt file is parsed with read.table(); the number of rows of
# the resulting table is used as that year's publication count.
count_table_rows <- function(path) {
  nrow(read.table(path))
}
pmc2014 <- count_table_rows("pmc/genom/pmc2014.txt")
pmc2015 <- count_table_rows("pmc/genom/pmc2015.txt")
pmc2016 <- count_table_rows("pmc/genom/pmc2016.txt")
pmc2017 <- count_table_rows("pmc/genom/pmc2017.txt")
pmc2018 <- count_table_rows("pmc/genom/pmc2018.txt")
pmc2019 <- count_table_rows("pmc/genom/pmc2019.txt")
pmc2020 <- count_table_rows("pmc/genom/pmc2020.txt")
# Per-year counts first; the grand total is appended as the last element.
pmc <- c(pmc2014, pmc2015, pmc2016, pmc2017, pmc2018, pmc2019, pmc2020)
total_pmc <- sum(pmc)
pmc <- c(pmc, total_pmc)
The publications returned by the “genom” keyword search for 2014 to 2020 were shortlisted to those with supplementary files, and these supplementary files were downloaded.
Supplementary files with an “.xls” or “.xlsx” suffix were assumed to be supplementary Excel files.
# Read the per-year screening results and, for each year, count the distinct
# values found in the first space-separated field of the result lines.
# NOTE(review): later code stores this first field as `pmid`, while the summary
# table labels these counts "Excel files screened" -- confirm which quantity
# is intended.
count_unique_first_field <- function(result_lines) {
  first_field <- sapply(strsplit(result_lines, " "), function(parts) parts[[1]])
  length(unique(first_field))
}
res2014 <- readLines("results/genom/results2014.txt")
res2015 <- readLines("results/genom/results2015.txt")
res2016 <- readLines("results/genom/results2016.txt")
res2017 <- readLines("results/genom/results2017.txt")
res2018 <- readLines("results/genom/results2018.txt")
res2019 <- readLines("results/genom/results2019.txt")
res2020 <- readLines("results/genom/results2020.txt")
res2014_length <- count_unique_first_field(res2014)
res2015_length <- count_unique_first_field(res2015)
res2016_length <- count_unique_first_field(res2016)
res2017_length <- count_unique_first_field(res2017)
res2018_length <- count_unique_first_field(res2018)
res2019_length <- count_unique_first_field(res2019)
res2020_length <- count_unique_first_field(res2020)
# Per-year counts followed by their sum.
res <- c(res2014_length, res2015_length, res2016_length, res2017_length,
         res2018_length, res2019_length, res2020_length)
total_res <- sum(res)
res <- c(res, total_res)
All the supplementary files downloaded for 2014 to 2020 were then searched for gene lists by checking the number of space-separated fields in each line of the results.
If a result line has more than 2 fields, the corresponding supplementary file is taken to be a supplementary file containing a gene list.
# Count, per year, the distinct Excel files that contain a gene list.
# A result line is kept when it splits into more than two space-separated
# fields; the distinct values of its second field are then counted.
count_gene_list_files <- function(fields) {
  with_gene_list <- fields[lengths(fields) > 2]
  second_field <- sapply(with_gene_list, function(parts) parts[[2]])
  length(unique(second_field))
}
res2014_l <- strsplit(res2014, " ")
res2015_l <- strsplit(res2015, " ")
res2016_l <- strsplit(res2016, " ")
res2017_l <- strsplit(res2017, " ")
res2018_l <- strsplit(res2018, " ")
res2019_l <- strsplit(res2019, " ")
res2020_l <- strsplit(res2020, " ")
res2014_xlg <- count_gene_list_files(res2014_l)
res2015_xlg <- count_gene_list_files(res2015_l)
res2016_xlg <- count_gene_list_files(res2016_l)
res2017_xlg <- count_gene_list_files(res2017_l)
res2018_xlg <- count_gene_list_files(res2018_l)
res2019_xlg <- count_gene_list_files(res2019_l)
res2020_xlg <- count_gene_list_files(res2020_l)
# Per-year counts followed by their sum.
pub_xlg <- c(res2014_xlg, res2015_xlg, res2016_xlg, res2017_xlg,
             res2018_xlg, res2019_xlg, res2020_xlg)
total_pub_xlg <- sum(pub_xlg)
pub_xlg <- c(pub_xlg, total_pub_xlg)
For the Excel files with gene lists, the related publications were then identified, so that each publication with at least one supplementary Excel gene list is counted once.
# Count, per year, the distinct publications that have an Excel gene list.
# Same filter as for the file counts (more than two space-separated fields),
# but the distinct values of the first field are counted instead.
count_pubs_with_gene_list <- function(fields) {
  with_gene_list <- fields[lengths(fields) > 2]
  first_field <- sapply(with_gene_list, function(parts) parts[[1]])
  length(unique(first_field))
}
res2014_l <- strsplit(res2014, " ")
res2015_l <- strsplit(res2015, " ")
res2016_l <- strsplit(res2016, " ")
res2017_l <- strsplit(res2017, " ")
res2018_l <- strsplit(res2018, " ")
res2019_l <- strsplit(res2019, " ")
res2020_l <- strsplit(res2020, " ")
res2014_lx <- count_pubs_with_gene_list(res2014_l)
res2015_lx <- count_pubs_with_gene_list(res2015_l)
res2016_lx <- count_pubs_with_gene_list(res2016_l)
res2017_lx <- count_pubs_with_gene_list(res2017_l)
res2018_lx <- count_pubs_with_gene_list(res2018_l)
res2019_lx <- count_pubs_with_gene_list(res2019_l)
res2020_lx <- count_pubs_with_gene_list(res2020_l)
# Per-year counts followed by their sum.
pub_xls <- c(res2014_lx, res2015_lx, res2016_lx, res2017_lx,
             res2018_lx, res2019_lx, res2020_lx)
total_pub_xls <- sum(pub_xls)
pub_xls <- c(pub_xls, total_pub_xls)
The publications with Excel gene lists were then processed with a shell script in which the gene lists of each selected publication were screened for their gene symbols.
Erroneous conversions, such as date formats, scientific numbers and five-digit numbers, were flagged in the gene lists as gene name errors, and the results were recorded and saved as “aggregated_res”.
Each line of the aggregated_res file consists of the PMCID, the journal name, the name of the yearly results file and the path to the affected file.
The total number of suspected publications with gene name errors was determined from the aggregated_res file, and the file was further analysed to extract the number of publications with gene name errors for each year.
# Load the aggregated screening hits; each line is one suspected spreadsheet
# with space-separated fields (layout shown by the head() output below).
aggregated_res <- readLines("results/genom/aggregated_res.txt")
head(aggregated_res)
## [1] "PMC3879165 PLoS_Genet results2014.txt /pmc/articles/PMC3879165/bin/pgen.1004006.s005.xlsx"
## [2] "PMC3886906 PLoS_Genet results2014.txt /pmc/articles/PMC3886906/bin/pgen.1004079.s012.xlsx"
## [3] "PMC3894565 Sci_Rep results2014.txt /pmc/articles/PMC3894565/bin/srep03692-s1.xls"
## [4] "PMC3894565 Sci_Rep results2014.txt /pmc/articles/PMC3894565/bin/srep03692-s2.xls"
## [5] "PMC3894996 PLoS_One results2014.txt /pmc/articles/PMC3894996/bin/pone.0085599.s003.xlsx"
## [6] "PMC3897657 PLoS_One results2014.txt /pmc/articles/PMC3897657/bin/pone.0086220.s002.xlsx"
# Split every record once and reuse the pieces for each extracted field.
agg_fields <- strsplit(aggregated_res, " ")
sus_pub <- sapply(agg_fields, function(parts) parts[[1]])
# Distinct first-field values (PMCIDs) across all years.
total_sus_pub <- length(unique(sus_pub))
total_sus_pub
## [1] 3470
sus_pub_year <- sapply(agg_fields, function(parts) parts[[3]])
sus_pub_file <- sapply(agg_fields, function(parts) parts[[4]])
suspected_df <- data.frame(sus_pub, sus_pub_year, sus_pub_file)
# Rows of suspected_df whose third field names the given yearly results file.
suspected_rows_for <- function(results_file) {
  suspected_df[suspected_df$sus_pub_year == results_file, ]
}
# Number of distinct publications in a subset of suspected_df.
count_suspected_pubs <- function(rows) {
  length(unique(rows$sus_pub))
}
sus_pub_2014_df <- suspected_rows_for("results2014.txt")
sus_pub_2015_df <- suspected_rows_for("results2015.txt")
sus_pub_2016_df <- suspected_rows_for("results2016.txt")
sus_pub_2017_df <- suspected_rows_for("results2017.txt")
sus_pub_2018_df <- suspected_rows_for("results2018.txt")
sus_pub_2019_df <- suspected_rows_for("results2019.txt")
sus_pub_2020_df <- suspected_rows_for("results2020.txt")
sus_pub_2014 <- count_suspected_pubs(sus_pub_2014_df)
sus_pub_2015 <- count_suspected_pubs(sus_pub_2015_df)
sus_pub_2016 <- count_suspected_pubs(sus_pub_2016_df)
sus_pub_2017 <- count_suspected_pubs(sus_pub_2017_df)
sus_pub_2018 <- count_suspected_pubs(sus_pub_2018_df)
sus_pub_2019 <- count_suspected_pubs(sus_pub_2019_df)
sus_pub_2020 <- count_suspected_pubs(sus_pub_2020_df)
# From here on sus_pub holds the per-year counts plus the overall total,
# replacing the per-record vector used to build suspected_df above.
sus_pub <- c(sus_pub_2014, sus_pub_2015, sus_pub_2016, sus_pub_2017,
             sus_pub_2018, sus_pub_2019, sus_pub_2020, total_sus_pub)
The aggregated_res file was analysed for the total number of spreadsheet files flagged as containing gene name errors, and the result was saved as “affected_xl”.
# Number of records in aggregated_res, obtained by pulling the first
# space-separated field out of every line and counting the extracted values.
agg_first_field <- sapply(strsplit(aggregated_res, " "), function(parts) parts[[1]])
affected_xl <- length(agg_first_field)
affected_xl
## [1] 5136
In order to reconfirm the results obtained, each spreadsheet listed in the aggregated_res file was manually opened to validate the accuracy of the data, and the final result was saved in the “true_positive.txt” file.
The “true_positive.txt” file was read as “aff_xls”. The total number of true positive Excel files with gene name errors was obtained from “true_positive.txt” and the result was saved as “total_aff_xls”.
“aff_xls” was then analysed by year (2014 to 2020) to obtain the number of true positive Excel files for each year.
# Manually confirmed (true positive) spreadsheets: one row per file.
aff_xls <- read.table("results/genom/true_positive.txt")
total_aff_xls <- nrow(aff_xls)
total_aff_xls
## [1] 5086
# Number of rows of aff_xls whose fourth column equals the given year.
count_tp_files <- function(yr) {
  nrow(aff_xls[aff_xls$V4 == yr, ])
}
aff_xls_2014 <- count_tp_files(2014)
aff_xls_2015 <- count_tp_files(2015)
aff_xls_2016 <- count_tp_files(2016)
aff_xls_2017 <- count_tp_files(2017)
aff_xls_2018 <- count_tp_files(2018)
aff_xls_2019 <- count_tp_files(2019)
aff_xls_2020 <- count_tp_files(2020)
# Per-year file counts followed by the overall row count.
aff_xls_files <- c(aff_xls_2014, aff_xls_2015, aff_xls_2016, aff_xls_2017,
                   aff_xls_2018, aff_xls_2019, aff_xls_2020, total_aff_xls)
“aff_xls” was reanalysed for the total number of true positive publications and the result was saved as “total_aff_pub”. “aff_xls” was also analysed by year (2014 to 2020) to obtain the number of true positive publications for each year.
# Re-read the true positive table and count distinct first-column values
# (publications), overall and per year.
aff_xls <- read.table("results/genom/true_positive.txt")
# Number of distinct values in column V1 of a true-positive subset.
count_distinct_v1 <- function(rows) {
  length(unique(rows$V1))
}
total_aff_pub <- count_distinct_v1(aff_xls)
total_aff_pub
## [1] 3436
# The per-year subsets are kept as data frames because the per-journal
# analysis further down reuses pub_2014 ... pub_2020.
pub_2014 <- aff_xls[aff_xls$V4 == 2014, ]
pub_2015 <- aff_xls[aff_xls$V4 == 2015, ]
pub_2016 <- aff_xls[aff_xls$V4 == 2016, ]
pub_2017 <- aff_xls[aff_xls$V4 == 2017, ]
pub_2018 <- aff_xls[aff_xls$V4 == 2018, ]
pub_2019 <- aff_xls[aff_xls$V4 == 2019, ]
pub_2020 <- aff_xls[aff_xls$V4 == 2020, ]
aff_pub_2014 <- count_distinct_v1(pub_2014)
aff_pub_2015 <- count_distinct_v1(pub_2015)
aff_pub_2016 <- count_distinct_v1(pub_2016)
aff_pub_2017 <- count_distinct_v1(pub_2017)
aff_pub_2018 <- count_distinct_v1(pub_2018)
aff_pub_2019 <- count_distinct_v1(pub_2019)
aff_pub_2020 <- count_distinct_v1(pub_2020)
aff_pub <- c(aff_pub_2014, aff_pub_2015, aff_pub_2016, aff_pub_2017,
             aff_pub_2018, aff_pub_2019, aff_pub_2020, total_aff_pub)
In order to reconfirm the results obtained, each spreadsheet listed in the aggregated_res file was manually opened to validate the accuracy of the data. During this manual validation several false positive Excel files were identified, and these false positive results were saved in a text file named “false_positive.txt”.
The “false_positive.txt” file was read as “fp”. The total number of false positive Excel files was obtained from it and saved as “total_fp_xls”. “fp” was then analysed by year (2014 to 2020) to obtain the number of false positive Excel files for each year.
# False positive spreadsheets identified during manual validation.
fp <- read.table("results/genom/false_positive.txt")
total_fp_xls <- nrow(fp)
total_fp_xls
## [1] 50
# Rows of fp whose fourth column equals the given year.
fp_rows_for <- function(yr) {
  fp[fp$V4 == yr, ]
}
fp_2014 <- fp_rows_for(2014)
fp_2015 <- fp_rows_for(2015)
fp_2016 <- fp_rows_for(2016)
fp_2017 <- fp_rows_for(2017)
fp_2018 <- fp_rows_for(2018)
fp_2019 <- fp_rows_for(2019)
fp_2020 <- fp_rows_for(2020)
fp_2014_xls <- nrow(fp_2014)
fp_2015_xls <- nrow(fp_2015)
fp_2016_xls <- nrow(fp_2016)
fp_2017_xls <- nrow(fp_2017)
fp_2018_xls <- nrow(fp_2018)
fp_2019_xls <- nrow(fp_2019)
fp_2020_xls <- nrow(fp_2020)
# Per-year false positive file counts followed by the overall count.
fp_xls <- c(fp_2014_xls, fp_2015_xls, fp_2016_xls, fp_2017_xls,
            fp_2018_xls, fp_2019_xls, fp_2020_xls, total_fp_xls)
False positive publications = suspected pubs - affected pubs
# False positive publications per year = suspected minus affected, computed
# in one vectorised subtraction and then unpacked into the per-year scalars.
fp_pub <- c(sus_pub_2014, sus_pub_2015, sus_pub_2016, sus_pub_2017,
            sus_pub_2018, sus_pub_2019, sus_pub_2020) -
  c(aff_pub_2014, aff_pub_2015, aff_pub_2016, aff_pub_2017,
    aff_pub_2018, aff_pub_2019, aff_pub_2020)
fp_2014_pub <- fp_pub[1]
fp_2015_pub <- fp_pub[2]
fp_2016_pub <- fp_pub[3]
fp_2017_pub <- fp_pub[4]
fp_2018_pub <- fp_pub[5]
fp_2019_pub <- fp_pub[6]
fp_2020_pub <- fp_pub[7]
# Append the total so the vector lines up with the other table columns.
total_fp_pub <- sum(fp_pub)
fp_pub <- c(fp_pub, total_fp_pub)
“total_aff_pub” was divided by “total_pub_xls” and multiplied by 100 to find the proportion (%) of articles with Excel gene lists which contain gene name errors. The result was rounded to 3 significant figures and saved as “Total_percent_of_errors”. The figures below are percentages.
# Share of publications with Excel gene lists that are affected, as a
# percentage rounded to 3 significant figures.
percent_affected_3sf <- function(affected, with_gene_list) {
  signif(affected / with_gene_list * 100, 3)
}
Total_percent_of_errors <- percent_affected_3sf(total_aff_pub, total_pub_xls)
Total_percent_of_errors
## [1] 30.9
# Per-year percentages, computed in one vectorised call.
percentages <- percent_affected_3sf(
  c(aff_pub_2014, aff_pub_2015, aff_pub_2016, aff_pub_2017,
    aff_pub_2018, aff_pub_2019, aff_pub_2020),
  c(res2014_lx, res2015_lx, res2016_lx, res2017_lx,
    res2018_lx, res2019_lx, res2020_lx)
)
percentage_2014 <- percentages[1]
percentage_2015 <- percentages[2]
percentage_2016 <- percentages[3]
percentage_2017 <- percentages[4]
percentage_2018 <- percentages[5]
percentage_2019 <- percentages[6]
percentage_2020 <- percentages[7]
# Overall percentage goes last, matching the "Total" row of the table.
percentages <- c(percentages, Total_percent_of_errors)
Table 3. Results of a screen for gene name errors in PubMed Central.
# Assemble the yearly summary table: one row per year plus a "Total" row.
# Every input vector holds seven yearly values followed by the overall value.
years <- c(as.character(2014:2020), "Total")
df_gene_name_errors_years <- data.frame(
  years, pmc, res, pub_xlg, pub_xls, sus_pub,
  fp_xls, fp_pub, aff_xls_files, aff_pub, percentages
)
# Human-readable headers; the first (year) column is deliberately blank.
names(df_gene_name_errors_years) <- c(
  " ",
  "PMC ID",
  "Excel files screened",
  "Excel files with gene lists",
  "Publications with Excel gene lists",
  "Publications with suspected gene name errors",
  "False positive XLS files",
  "False positive publications",
  "Affected XLS files",
  "Affected publications",
  "Proportion of publications affected"
)
# Render as an HTML table with row highlighting on hover.
kable_paper(kbl(df_gene_name_errors_years), "hover", full_width = FALSE)
PMC ID | Excel files screened | Excel files with gene lists | Publications with Excel gene lists | Publications with suspected gene name errors | False positive XLS files | False positive publications | Affected XLS files | Affected publications | Proportion of publications affected | |
---|---|---|---|---|---|---|---|---|---|---|
2014 | 19976 | 2948 | 2286 | 936 | 284 | 8 | 2 | 429 | 282 | 30.1 |
2015 | 21204 | 4318 | 3037 | 1491 | 490 | 0 | 0 | 701 | 490 | 32.9 |
2016 | 22261 | 4472 | 3331 | 1579 | 477 | 7 | 6 | 653 | 471 | 29.8 |
2017 | 23976 | 4355 | 3012 | 1412 | 443 | 5 | 3 | 648 | 440 | 31.2 |
2018 | 24986 | 4824 | 3566 | 1653 | 475 | 15 | 11 | 703 | 464 | 28.1 |
2019 | 26046 | 5481 | 3942 | 1823 | 594 | 4 | 3 | 914 | 591 | 32.4 |
2020 | 27690 | 6443 | 4496 | 2223 | 707 | 11 | 9 | 1038 | 698 | 31.4 |
Total | 166139 | 32841 | 23670 | 11117 | 3470 | 50 | 34 | 5086 | 3436 | 30.9 |
Plots are created for Figure 1 of the manuscript, which shows the prevalence of gene name errors in the period 2014-2020.
Accordingly: A. bar plot of publications with Excel gene lists; B. bar plot of affected publications; and C. line diagram of the proportion of publications affected.
# Figure 1: prevalence of gene name errors, 2014-2020.
# The same three stacked panels are drawn twice: once into fig1.pdf and once
# on the current device for the report. The drawing code therefore lives in a
# single helper instead of being duplicated.

# Draw the three panels for a yearly summary table.
#   yearly: data frame whose column 1 holds the year labels, column 5 the
#           publications with Excel gene lists, column 10 the affected
#           publications and column 11 the percentage affected.
# Sets par(mfrow = c(3, 1)) on the active device, as the original code did,
# and returns NULL invisibly.
plot_fig1_panels <- function(yearly) {
  year_labels <- as.character(yearly[, 1])
  par(mfrow = c(3, 1))
  # A: bar plot of publications with Excel gene lists
  barplot(yearly[, 5], names.arg = year_labels,
          ylab = "Publications with Excel gene lists", ylim = c(0, 2500))
  grid(nx = 0, ny = 5, col = "gray")
  # B: bar plot of affected publications
  barplot(yearly[, 10], names.arg = year_labels,
          ylab = "Affected publications", ylim = c(0, 800))
  grid(nx = 0, ny = 8, col = "gray")
  # C: line diagram of the proportion of publications affected.
  # The years are converted to numbers explicitly so that the x axis does not
  # depend on implicit coercion of a character (or factor) column.
  plot(x = as.numeric(year_labels), y = yearly[, 11],
       ylab = "Proportion of publications affected (%)", xlab = "",
       ylim = c(0, 35), type = "b", pch = 19, bty = "n")
  grid()
  invisible(NULL)
}

# Drop the trailing "Total" row; seq_len() stays correct even when the table
# has a single row, where 1:(nrow - 1) would count backwards.
df <- df_gene_name_errors_years
df <- df[seq_len(nrow(df) - 1), ]

pdf("fig1.pdf", width = 4, height = 7)
plot_fig1_panels(df)
dev.off()
## png
## 2
plot_fig1_panels(df)
Based on the publications and Excel spreadsheets found to contain gene name errors, a further analysis was performed to identify which organisms had the largest number of gene name errors.
Accordingly, the species A. thaliana, C. elegans, D. melanogaster, D. rerio, G. gallus, H. sapiens, M. musculus, O. sativa, R. norvegicus and S. cerevisiae were selected, and for each species the publications with Excel gene lists and the publications with gene name errors from 2014 to 2020 were counted. The results were saved in variables named after each species.
The proportion of articles affected was calculated for each species and the resulting percentage was rounded to 3 significant figures.
# Per-species counts across all years.
# Pool the yearly result lines, keep those with more than two space-separated
# fields, and pair the first field (stored as pmid) with the third (org).
res_all_years <- c(res2014, res2015, res2016, res2017, res2018, res2019, res2020)
res_all_years_l <- strsplit(res_all_years, " ")
res_all_years_lx <- res_all_years_l[lengths(res_all_years_l) > 2]
pmid <- sapply(res_all_years_lx, function(parts) parts[[1]])
org <- sapply(res_all_years_lx, function(parts) parts[[3]])
org_df <- data.frame(pmid, org)
# Distinct pmid values in org_df recorded for the given organism label.
species_pub_count <- function(label) {
  length(unique(org_df$pmid[which(org_df$org == label)]))
}
# Distinct V1 values in aff_xls whose second column equals the label.
species_aff_count <- function(label) {
  length(unique(aff_xls[aff_xls$V2 == label, ]$V1))
}
# Affected share as a percentage, rounded to 3 significant figures.
species_percent <- function(n_aff, n_pub) {
  signif(n_aff / n_pub * 100, 3)
}
ath_pub <- species_pub_count("Athaliana")
ath_aff <- species_aff_count("Athaliana")
ath_pc <- species_percent(ath_aff, ath_pub)
cel_pub <- species_pub_count("Celegans")
cel_aff <- species_aff_count("Celegans")
cel_pc <- species_percent(cel_aff, cel_pub)
dme_pub <- species_pub_count("Dmelanogaster")
dme_aff <- species_aff_count("Dmelanogaster")
dme_pc <- species_percent(dme_aff, dme_pub)
dre_pub <- species_pub_count("Drerio")
dre_aff <- species_aff_count("Drerio")
dre_pc <- species_percent(dre_aff, dre_pub)
gga_pub <- species_pub_count("Ggallus")
gga_aff <- species_aff_count("Ggallus")
gga_pc <- species_percent(gga_aff, gga_pub)
hsa_pub <- species_pub_count("Hsapiens")
hsa_aff <- species_aff_count("Hsapiens")
hsa_pc <- species_percent(hsa_aff, hsa_pub)
mmu_pub <- species_pub_count("Mmusculus")
mmu_aff <- species_aff_count("Mmusculus")
mmu_pc <- species_percent(mmu_aff, mmu_pub)
osa_pub <- species_pub_count("Osativa")
osa_aff <- species_aff_count("Osativa")
osa_pc <- species_percent(osa_aff, osa_pub)
rno_pub <- species_pub_count("Rnorvegicus")
rno_aff <- species_aff_count("Rnorvegicus")
rno_pc <- species_percent(rno_aff, rno_pub)
sce_pub <- species_pub_count("Scerevisiae")
sce_aff <- species_aff_count("Scerevisiae")
sce_pc <- species_percent(sce_aff, sce_pub)
A data frame was created consisting of Species, Publications with excel gene lists, Affected publications and Proportion of articles affected, and it was used to generate Table 4 in the manuscript.
# Per-species summary table: display names, publications with Excel gene
# lists, affected publications and the percentage affected, in the same
# species order throughout.
Species <- c(
  "A.thaliana", "C.elegans", "D.melanogaster", "D.rerio", "G.gallus",
  "H.sapiens", "M.musculus", "O.sativa", "R.norvegicus", "S.cerevisiae"
)
pub <- c(ath_pub, cel_pub, dme_pub, dre_pub, gga_pub,
         hsa_pub, mmu_pub, osa_pub, rno_pub, sce_pub)
aff <- c(ath_aff, cel_aff, dme_aff, dre_aff, gga_aff,
         hsa_aff, mmu_aff, osa_aff, rno_aff, sce_aff)
prop <- c(ath_pc, cel_pc, dme_pc, dre_pc, gga_pc,
          hsa_pc, mmu_pc, osa_pc, rno_pc, sce_pc)
# Build the data frame and attach the display headers in one step.
gene_name_errors_species <- setNames(
  data.frame(Species, pub, aff, prop),
  c("Species",
    "Publications with excel gene lists",
    "Affected publications",
    "Propotion of articles affected")
)
# Render as an HTML table with row highlighting on hover.
kable_paper(kbl(gene_name_errors_species), "hover", full_width = FALSE)
Species | Publications with excel gene lists | Affected publications | Propotion of articles affected |
---|---|---|---|
A.thaliana | 511 | 76 | 14.90 |
C.elegans | 124 | 31 | 25.00 |
D.melanogaster | 607 | 142 | 23.40 |
D.rerio | 251 | 48 | 19.10 |
G.gallus | 1827 | 172 | 9.41 |
H.sapiens | 7936 | 2419 | 30.50 |
M.musculus | 1577 | 609 | 38.60 |
O.sativa | 10 | 0 | 0.00 |
R.norvegicus | 327 | 68 | 20.80 |
S.cerevisiae | 443 | 93 | 21.00 |
The journal information for the articles indexed in PubMed Central was read for every year from 2014 to 2020 and the combined result was saved as “jdf”. The articles with one or more supplementary Excel gene lists were collected and saved as “res_p”. “jdf_subset” was created by restricting “jdf” to the articles listed in “res_p”.
“journal_supplementary” was created to count the number of journals with gene name errors in the supplementary files. “jdf_t_df” was created to hold the number of articles with Excel gene lists per journal, derived from “jdf_subset”. The number of affected articles was calculated using “aff_xls”.
Table 5 in the manuscript was created from the results gathered above.
# Journals indexed in PMC: load the per-year journal tables for the "genom"
# search and stack them into a single data frame.
j2014 <- read.table("pmc/genom/pmc_journal2014genom.out.txt")
j2015 <- read.table("pmc/genom/pmc_journal2015genom.out.txt")
j2016 <- read.table("pmc/genom/pmc_journal2016genom.out.txt")
j2017 <- read.table("pmc/genom/pmc_journal2017genom.out.txt")
j2018 <- read.table("pmc/genom/pmc_journal2018genom.out.txt")
j2019 <- read.table("pmc/genom/pmc_journal2019genom.out.txt")
j2020 <- read.table("pmc/genom/pmc_journal2020genom.out.txt")
jdf <- do.call(rbind, list(j2014, j2015, j2016, j2017, j2018, j2019, j2020))
# Number of distinct values in the second column of the stacked table.
length(unique(jdf[[2]]))
## [1] 4581
# Journals that published one or more supplementary Excel gene lists.
# For each year keep the result lines with more than two fields and collect
# the distinct values of their first field.
keep_gene_list_rows <- function(fields) {
  fields[lengths(fields) > 2]
}
unique_first_field <- function(rows) {
  unique(sapply(rows, function(parts) parts[[1]]))
}
res2014_xlg <- keep_gene_list_rows(res2014_l)
res2014_p <- unique_first_field(res2014_xlg)
res2015_xlg <- keep_gene_list_rows(res2015_l)
res2015_p <- unique_first_field(res2015_xlg)
res2016_xlg <- keep_gene_list_rows(res2016_l)
res2016_p <- unique_first_field(res2016_xlg)
res2017_xlg <- keep_gene_list_rows(res2017_l)
res2017_p <- unique_first_field(res2017_xlg)
res2018_xlg <- keep_gene_list_rows(res2018_l)
res2018_p <- unique_first_field(res2018_xlg)
res2019_xlg <- keep_gene_list_rows(res2019_l)
res2019_p <- unique_first_field(res2019_xlg)
res2020_xlg <- keep_gene_list_rows(res2020_l)
res2020_p <- unique_first_field(res2020_xlg)
res_p <- unique(c(res2014_p, res2015_p, res2016_p, res2017_p,
                  res2018_p, res2019_p, res2020_p))
# Restrict the journal index to rows whose first column appears in res_p.
jdf_subset <- jdf[jdf[, 1] %in% res_p, ]

# Number of journals with gene name errors in the supplementary files
journal_supplementary <- length(unique(aff_xls$V3))

# Number of articles with Excel gene lists, tabulated by the second column.
# Note that jdf is overwritten here with the de-duplicated subset.
jdf <- unique(jdf_subset)
jdf_t <- table(jdf[, 2])
jdf_t_df <- as.data.frame(jdf_t)

# Number of affected articles: distinct (V1, V3) pairs tabulated by V3.
jres <- unique(aff_xls[, c(1, 3)])
j_table <- table(jres[, 2])
j_table_df <- as.data.frame(j_table)

# Table content for gene name errors by journal.
# NOTE(review): merge() performs an inner join by default, so a journal with
# gene-list articles but no affected articles is absent from jj -- confirm
# that leaving such journals out of the table is intended.
jj <- merge(jdf_t_df, j_table_df, by = "Var1")
names(jj) <- c("journal",
               "Number of articles with Excel gene lists",
               "Number of affected articles")
jj[["Proportion of articles affected"]] <- signif(jj[, 3] / jj[, 2] * 100, 3)
# Keep journals with more than 49 gene-list articles, most affected first.
jjf <- jj[which(jj[, 2] > 49), ]
jjf <- jjf[order(-jjf[, 3]), ]
kable_paper(kbl(jjf), "hover", full_width = FALSE)
journal | Number of articles with Excel gene lists | Number of affected articles | Proportion of articles affected | |
---|---|---|---|---|
319 | Nat_Commun | 920 | 345 | 37.50 |
376 | PLoS_One | 946 | 244 | 25.80 |
394 | Sci_Rep | 767 | 227 | 29.60 |
49 | BMC_Genomics | 660 | 166 | 25.20 |
373 | PLoS_Genet | 448 | 134 | 29.90 |
352 | Oncotarget | 326 | 107 | 32.80 |
171 | Front_Genet | 313 | 94 | 30.00 |
140 | eLife | 243 | 89 | 36.60 |
380 | Proc_Natl_Acad_Sci_U_S_A | 155 | 73 | 47.10 |
95 | Cell_Rep | 158 | 71 | 44.90 |
200 | Genome_Biol | 193 | 66 | 34.20 |
330 | Nature | 118 | 52 | 44.10 |
321 | Nat_Genet | 140 | 48 | 34.30 |
202 | Genome_Med | 137 | 44 | 32.10 |
83 | Cell | 74 | 39 | 52.70 |
358 | PeerJ | 137 | 39 | 28.50 |
109 | Clin_Epigenetics | 109 | 38 | 34.90 |
345 | Nucleic_Acids_Res | 120 | 36 | 30.00 |
54 | BMC_Med_Genomics | 117 | 31 | 26.50 |
179 | Front_Oncol | 85 | 31 | 36.50 |
406 | Transl_Psychiatry | 73 | 29 | 39.70 |
42 | BMC_Cancer | 105 | 28 | 26.70 |
118 | Commun_Biol | 74 | 27 | 36.50 |
377 | PLoS_Pathog | 80 | 27 | 33.80 |
8 | Aging_ | 56 | 26 | 46.40 |
137 | EBioMedicine | 51 | 26 | 51.00 |
371 | PLoS_Biol | 66 | 26 | 39.40 |
149 | Epigenetics_Chromatin | 64 | 25 | 39.10 |
372 | PLoS_Comput_Biol | 97 | 24 | 24.70 |
348 | Oncogene | 53 | 22 | 41.50 |
233 | iScience | 58 | 20 | 34.50 |
392 | Sci_Adv | 56 | 20 | 35.70 |
40 | BMC_Bioinformatics | 77 | 19 | 24.70 |
187 | G3_ | 74 | 15 | 20.30 |
216 | Hum_Mol_Genet | 53 | 15 | 28.30 |
62 | BMC_Plant_Biol | 52 | 6 | 11.50 |
183 | Front_Plant_Sci | 75 | 5 | 6.67 |
Here we need to create an object containing PMCID, journal name and year for the 11117 pubs with Excel gene lists, so that we can calculate yearly proportions later.
# Tag each year's vector of publication identifiers with its year, stack the
# yearly tables, and join the result to the journal table on the identifier.
# Returns a data frame with columns "pmc" and "year" for one year.
tag_with_year <- function(ids, yr) {
  tagged <- as.data.frame(ids)
  colnames(tagged) <- "pmc"
  tagged$year <- yr
  tagged
}
res2014_p_df <- tag_with_year(res2014_p, "2014")
res2015_p_df <- tag_with_year(res2015_p, "2015")
res2016_p_df <- tag_with_year(res2016_p, "2016")
res2017_p_df <- tag_with_year(res2017_p, "2017")
res2018_p_df <- tag_with_year(res2018_p, "2018")
res2019_p_df <- tag_with_year(res2019_p, "2019")
res2020_p_df <- tag_with_year(res2020_p, "2020")
pub_yr <- rbind(res2014_p_df, res2015_p_df, res2016_p_df, res2017_p_df,
                res2018_p_df, res2019_p_df, res2020_p_df)
# Column V1 of jdf is matched against column pmc of pub_yr.
jdf2 <- merge(jdf, pub_yr, by.x = "V1", by.y = "pmc")
The top six journals with the most gene name errors were selected based on “Number of affected articles” in Table 5 of the manuscript.
Accordingly, Nat Commun, PLoS One, Sci Rep, BMC Genomics, PLoS Genet and Oncotarget were considered the top 6 journals with the most gene name errors. For each of these 6 journals, the number of publications with gene name errors in each year was counted and the result was recorded.
# Year labels for the per-journal tables (no "Total" entry here).
years <- as.character(2014:2020)
# Distinct (V1, V3) pairs among the affected files of each year.
aj2014 <- unique(pub_2014[, c(1, 3)])
aj2015 <- unique(pub_2015[, c(1, 3)])
aj2016 <- unique(pub_2016[, c(1, 3)])
aj2017 <- unique(pub_2017[, c(1, 3)])
aj2018 <- unique(pub_2018[, c(1, 3)])
aj2019 <- unique(pub_2019[, c(1, 3)])
aj2020 <- unique(pub_2020[, c(1, 3)])
# Nature Communications
nc_journal <- "Nat_Commun"
# Affected articles per year: rows of each yearly table for this journal.
nc_aff <- vapply(
  list(aj2014, aj2015, aj2016, aj2017, aj2018, aj2019, aj2020),
  function(aj) nrow(aj[aj$V3 == nc_journal, ]),
  integer(1)
)
# Articles with Excel gene lists per year: distinct jdf2 rows for this
# journal and year. USE.NAMES = FALSE keeps the result unnamed.
nc_gl <- vapply(
  years,
  function(yr) {
    hit <- which(jdf2$V2 == nc_journal & jdf2$year == yr)
    nrow(unique(jdf2[hit, , drop = FALSE]))
  },
  integer(1),
  USE.NAMES = FALSE
)
nc_df <- data.frame(years, nc_aff, nc_gl, signif(nc_aff / nc_gl * 100, 3))
names(nc_df) <- c("year", "affected articles", "articles with excel gene lists", "proportion of articles affected (%)")
kable_paper(kbl(nc_df), "hover", full_width = FALSE)
year | affected articles | articles with excel gene lists | proportion of articles affected (%) |
---|---|---|---|
2014 | 11 | 33 | 33.3 |
2015 | 19 | 56 | 33.9 |
2016 | 29 | 85 | 34.1 |
2017 | 50 | 131 | 38.2 |
2018 | 67 | 175 | 38.3 |
2019 | 84 | 225 | 37.3 |
2020 | 85 | 215 | 39.5 |
# PLoS_One
po_journal <- "PLoS_One"
# Affected articles per year: rows of each yearly table for this journal.
po_aff <- vapply(
  list(aj2014, aj2015, aj2016, aj2017, aj2018, aj2019, aj2020),
  function(aj) nrow(aj[aj$V3 == po_journal, ]),
  integer(1)
)
# Articles with Excel gene lists per year: distinct jdf2 rows for this
# journal and year. USE.NAMES = FALSE keeps the result unnamed.
po_gl <- vapply(
  years,
  function(yr) {
    hit <- which(jdf2$V2 == po_journal & jdf2$year == yr)
    nrow(unique(jdf2[hit, , drop = FALSE]))
  },
  integer(1),
  USE.NAMES = FALSE
)
po_df <- data.frame(years, po_aff, po_gl, signif(po_aff / po_gl * 100, 3))
names(po_df) <- c("year", "affected articles", "articles with excel gene lists", "proportion of articles affected (%)")
kable_paper(kbl(po_df), "hover", full_width = FALSE)
year | affected articles | articles with excel gene lists | proportion of articles affected (%) |
---|---|---|---|
2014 | 58 | 222 | 26.1 |
2015 | 57 | 195 | 29.2 |
2016 | 34 | 150 | 22.7 |
2017 | 37 | 135 | 27.4 |
2018 | 21 | 106 | 19.8 |
2019 | 20 | 69 | 29.0 |
2020 | 17 | 69 | 24.6 |
# Sci_Rep
sr_journal <- "Sci_Rep"
# Affected articles per year: rows of each yearly table for this journal.
sr_aff <- vapply(
  list(aj2014, aj2015, aj2016, aj2017, aj2018, aj2019, aj2020),
  function(aj) nrow(aj[aj$V3 == sr_journal, ]),
  integer(1)
)
# Articles with Excel gene lists per year: distinct jdf2 rows for this
# journal and year. USE.NAMES = FALSE keeps the result unnamed.
sr_gl <- vapply(
  years,
  function(yr) {
    hit <- which(jdf2$V2 == sr_journal & jdf2$year == yr)
    nrow(unique(jdf2[hit, , drop = FALSE]))
  },
  integer(1),
  USE.NAMES = FALSE
)
sr_df <- data.frame(years, sr_aff, sr_gl, signif(sr_aff / sr_gl * 100, 3))
names(sr_df) <- c("year", "affected articles", "articles with excel gene lists", "proportion of articles affected (%)")
kable_paper(kbl(sr_df), "hover", full_width = FALSE)
year | affected articles | articles with excel gene lists | proportion of articles affected (%) |
---|---|---|---|
2014 | 3 | 9 | 33.3 |
2015 | 7 | 41 | 17.1 |
2016 | 37 | 144 | 25.7 |
2017 | 57 | 157 | 36.3 |
2018 | 43 | 141 | 30.5 |
2019 | 47 | 133 | 35.3 |
2020 | 33 | 142 | 23.2 |
# BMC_Genomics
bg_journal <- "BMC_Genomics"
# Affected articles per year: rows of each yearly table for this journal.
bg_aff <- vapply(
  list(aj2014, aj2015, aj2016, aj2017, aj2018, aj2019, aj2020),
  function(aj) nrow(aj[aj$V3 == bg_journal, ]),
  integer(1)
)
# Articles with Excel gene lists per year: distinct jdf2 rows for this
# journal and year. USE.NAMES = FALSE keeps the result unnamed.
bg_gl <- vapply(
  years,
  function(yr) {
    hit <- which(jdf2$V2 == bg_journal & jdf2$year == yr)
    nrow(unique(jdf2[hit, , drop = FALSE]))
  },
  integer(1),
  USE.NAMES = FALSE
)
bg_df <- data.frame(years, bg_aff, bg_gl, signif(bg_aff / bg_gl * 100, 3))
names(bg_df) <- c("year", "affected articles", "articles with excel gene lists", "proportion of articles affected (%)")
kable_paper(kbl(bg_df), "hover", full_width = FALSE)
year | affected articles | articles with excel gene lists | proportion of articles affected (%) |
---|---|---|---|
2014 | 25 | 98 | 25.5 |
2015 | 28 | 116 | 24.1 |
2016 | 26 | 97 | 26.8 |
2017 | 26 | 86 | 30.2 |
2018 | 20 | 102 | 19.6 |
2019 | 21 | 92 | 22.8 |
2020 | 20 | 69 | 29.0 |
# PLoS_Genet
# Yearly summary for the journal coded "PLoS_Genet": affected articles (from
# the aj<year> tables), distinct matching rows of jdf2, and the affected
# percentage to three significant figures.
# Rows whose journal is NA are not counted (na.rm = TRUE); a logical row index
# containing NA would instead yield an all-NA row and inflate the count.
pg_aff <- vapply(
  list(aj2014, aj2015, aj2016, aj2017, aj2018, aj2019, aj2020),
  function(aj) sum(aj$V3 == "PLoS_Genet", na.rm = TRUE),
  integer(1)
)
# which() drops NA comparisons, matching what subset() does.
# USE.NAMES = FALSE keeps the vector unnamed so data.frame() row names stay 1..7.
pg_gl <- vapply(
  c("2014", "2015", "2016", "2017", "2018", "2019", "2020"),
  function(yr) {
    keep <- which(jdf2$V2 == "PLoS_Genet" & jdf2$year == yr)
    nrow(unique(jdf2[keep, , drop = FALSE]))
  },
  integer(1),
  USE.NAMES = FALSE
)
pg_df <- data.frame(years, pg_aff, pg_gl, signif(pg_aff / pg_gl * 100, 3))
colnames(pg_df) <- c("year", "affected articles",
                     "articles with excel gene lists",
                     "proportion of articles affected (%)")
pg_df %>%
  kbl() %>%
  kable_paper("hover", full_width = FALSE)
year | affected articles | articles with excel gene lists | proportion of articles affected (%) |
---|---|---|---|
2014 | 23 | 72 | 31.9 |
2015 | 19 | 68 | 27.9 |
2016 | 19 | 67 | 28.4 |
2017 | 18 | 64 | 28.1 |
2018 | 16 | 56 | 28.6 |
2019 | 16 | 60 | 26.7 |
2020 | 23 | 61 | 37.7 |
# Oncotarget
# Yearly summary for the journal coded "Oncotarget": affected articles (from
# the aj<year> tables), distinct matching rows of jdf2, and the affected
# percentage to three significant figures.
# Rows whose journal is NA are not counted (na.rm = TRUE); a logical row index
# containing NA would instead yield an all-NA row and inflate the count.
ot_aff <- vapply(
  list(aj2014, aj2015, aj2016, aj2017, aj2018, aj2019, aj2020),
  function(aj) sum(aj$V3 == "Oncotarget", na.rm = TRUE),
  integer(1)
)
# which() drops NA comparisons, matching what subset() does.
# USE.NAMES = FALSE keeps the vector unnamed so data.frame() row names stay 1..7.
ot_gl <- vapply(
  c("2014", "2015", "2016", "2017", "2018", "2019", "2020"),
  function(yr) {
    keep <- which(jdf2$V2 == "Oncotarget" & jdf2$year == yr)
    nrow(unique(jdf2[keep, , drop = FALSE]))
  },
  integer(1),
  USE.NAMES = FALSE
)
ot_df <- data.frame(years, ot_aff, ot_gl, signif(ot_aff / ot_gl * 100, 3))
colnames(ot_df) <- c("year", "affected articles",
                     "articles with excel gene lists",
                     "proportion of articles affected (%)")
ot_df %>%
  kbl() %>%
  kable_paper("hover", full_width = FALSE)
year | affected articles | articles with excel gene lists | proportion of articles affected (%) |
---|---|---|---|
2014 | 5 | 18 | 27.8 |
2015 | 10 | 47 | 21.3 |
2016 | 41 | 106 | 38.7 |
2017 | 35 | 107 | 32.7 |
2018 | 11 | 35 | 31.4 |
2019 | 3 | 8 | 37.5 |
2020 | 2 | 5 | 40.0 |
Temporal trends for the six journals with the most gene name errors from 2014 to 2020 were examined, and Figure 3 of the manuscript was reproduced.
years <- c("2014", "2015", "2016", "2017", "2018", "2019", "2020")
par(mfrow = c(3, 1))
# For each journal draw three stacked panels per figure:
#   1. bar chart of publications with Excel gene lists (column 3),
#   2. bar chart of affected publications (column 2),
#   3. point-and-line plot of the affected percentage (column 2 / column 3).
# Journals are drawn in the order listed; invisible() suppresses the list that
# Map() would otherwise print.
invisible(Map(
  function(trend, journal) {
    barplot(trend[, 3],
            ylab = "Publications with Excel gene lists", xlab = "",
            names.arg = years, main = journal)
    grid()
    barplot(trend[, 2],
            ylab = "Affected publications", xlab = "", names.arg = years)
    grid()
    plot(x = trend[, 1], y = trend[, 2] / trend[, 3] * 100,
         ylab = "Proportion of publications affected (%)", xlab = "",
         type = "b", pch = 19, bty = "n")
    grid()
  },
  list(nc_df, po_df, sr_df, bg_df, pg_df, ot_df),
  c("Nature Communications", "PLOS ONE", "Scientific Reports",
    "BMC Genomics", "PLoS Genetics", "Oncotarget")
))
## now make the pdf
# Same three-panel layout as the on-screen figures, written to fig3.pdf with
# one page per journal (3 panels per page, 6 journals).
pdf("fig3.pdf", width = 4, height = 7)
par(mfrow = c(3, 1))
invisible(lapply(
  list(
    list(counts = nc_df, title = "Nature Communications"),
    list(counts = po_df, title = "PLOS ONE"),
    list(counts = sr_df, title = "Scientific Reports"),
    list(counts = bg_df, title = "BMC Genomics"),
    list(counts = pg_df, title = "PLoS Genetics"),
    list(counts = ot_df, title = "Oncotarget")
  ),
  function(spec) {
    counts <- spec$counts
    # Panel 1: publications with Excel gene lists.
    barplot(counts[, 3],
            ylab = "Publications with Excel gene lists", xlab = "",
            names.arg = years, main = spec$title)
    grid()
    # Panel 2: affected publications.
    barplot(counts[, 2],
            ylab = "Affected publications", xlab = "", names.arg = years)
    grid()
    # Panel 3: affected share in percent, unrounded.
    plot(x = counts[, 1], y = counts[, 2] / counts[, 3] * 100,
         ylab = "Proportion of publications affected (%)", xlab = "",
         type = "b", pch = 19, bty = "n")
    grid()
  }
))
dev.off()
## png
## 2
For each year from 2014 to 2020, we identify the journals that contain more than 10 gene name error publications.
We then show the top 10 journals by number of gene name error publications for each year from 2014 to 2020, using `head(aj20**_j_dsc, 10)`.
# Count rows per value of the second column of each yearly aj<year> table
# (the printed rankings show these values are journal names).
aj2014_tab <- table(aj2014[[2]])
aj2015_tab <- table(aj2015[[2]])
aj2016_tab <- table(aj2016[[2]])
aj2017_tab <- table(aj2017[[2]])
aj2018_tab <- table(aj2018[[2]])
aj2019_tab <- table(aj2019[[2]])
aj2020_tab <- table(aj2020[[2]])
# 2020: journals with more than 10 entries (stored, not printed) and the full
# list sorted by descending frequency; show the ten most frequent.
aj2020_df <- as.data.frame(aj2020_tab)
aj2020_j <- subset(aj2020_df, Freq > 10)
aj2020_j_dsc <- aj2020_df[order(-aj2020_df$Freq), ]
head(aj2020_j_dsc, n = 10)
## Var1 Freq
## 123 Nat_Commun 85
## 60 Front_Genet 43
## 162 Sci_Rep 33
## 66 Front_Oncol 23
## 152 PLoS_Genet 23
## 50 eLife 22
## 15 BMC_Genomics 20
## 154 PLoS_One 17
## 157 Proc_Natl_Acad_Sci_U_S_A 17
## 42 Commun_Biol 16
# 2019: journals with more than 10 entries (stored, not printed) and the full
# list sorted by descending frequency; show the ten most frequent.
aj2019_df <- as.data.frame(aj2019_tab)
aj2019_j <- subset(aj2019_df, Freq > 10)
aj2019_j_dsc <- aj2019_df[order(-aj2019_df$Freq), ]
head(aj2019_j_dsc, n = 10)
## Var1 Freq
## 129 Nat_Commun 84
## 157 Sci_Rep 47
## 69 Front_Genet 29
## 24 BMC_Genomics 21
## 152 PLoS_One 20
## 151 PLoS_Genet 16
## 41 Cell_Rep 15
## 54 eLife 13
## 145 PeerJ 13
## 44 Clin_Epigenetics 12
# 2018: journals with more than 10 entries (stored, not printed) and the full
# list sorted by descending frequency; show the ten most frequent.
aj2018_df <- as.data.frame(aj2018_tab)
aj2018_j <- subset(aj2018_df, Freq > 10)
aj2018_j_dsc <- aj2018_df[order(-aj2018_df$Freq), ]
head(aj2018_j_dsc, n = 10)
## Var1 Freq
## 103 Nat_Commun 67
## 131 Sci_Rep 43
## 124 PLoS_One 21
## 20 BMC_Genomics 20
## 35 Cell_Rep 16
## 122 PLoS_Genet 16
## 46 eLife 14
## 55 Front_Genet 13
## 66 Genome_Biol 12
## 118 Oncotarget 11
# 2017: journals with more than 10 entries (stored, not printed) and the full
# list sorted by descending frequency; show the ten most frequent.
aj2017_df <- as.data.frame(aj2017_tab)
aj2017_j <- subset(aj2017_df, Freq > 10)
aj2017_j_dsc <- aj2017_df[order(-aj2017_df$Freq), ]
head(aj2017_j_dsc, n = 10)
## Var1 Freq
## 102 Sci_Rep 57
## 80 Nat_Commun 50
## 98 PLoS_One 37
## 91 Oncotarget 35
## 12 BMC_Genomics 26
## 97 PLoS_Genet 18
## 86 Nature 13
## 35 eLife 11
## 55 Genome_Biol 11
## 24 Cell_Rep 7
# 2016: journals with more than 10 entries (stored, not printed) and the full
# list sorted by descending frequency; show the ten most frequent.
aj2016_df <- as.data.frame(aj2016_tab)
aj2016_j <- subset(aj2016_df, Freq > 10)
aj2016_j_dsc <- aj2016_df[order(-aj2016_df$Freq), ]
head(aj2016_j_dsc, n = 10)
## Var1 Freq
## 119 Oncotarget 41
## 137 Sci_Rep 37
## 131 PLoS_One 34
## 103 Nat_Commun 29
## 14 BMC_Genomics 26
## 128 PLoS_Genet 19
## 32 Cell_Rep 15
## 48 eLife 10
## 133 Proc_Natl_Acad_Sci_U_S_A 10
## 26 Cell 9
# 2015: journals with more than 10 entries (stored, not printed) and the full
# list sorted by descending frequency; show the ten most frequent.
aj2015_df <- as.data.frame(aj2015_tab)
aj2015_j <- subset(aj2015_df, Freq > 10)
aj2015_j_dsc <- aj2015_df[order(-aj2015_df$Freq), ]
head(aj2015_j_dsc, n = 10)
## Var1 Freq
## 133 PLoS_One 57
## 18 BMC_Genomics 28
## 136 Proc_Natl_Acad_Sci_U_S_A 23
## 115 Nat_Commun 19
## 132 PLoS_Genet 19
## 123 Nucleic_Acids_Res 18
## 31 Cell 14
## 48 eLife 11
## 76 Genome_Res 11
## 125 Oncotarget 10
# 2014: journals with more than 10 entries (stored, not printed) and the full
# list sorted by descending frequency; show the ten most frequent.
aj2014_df <- as.data.frame(aj2014_tab)
aj2014_j <- subset(aj2014_df, Freq > 10)
aj2014_j_dsc <- aj2014_df[order(-aj2014_df$Freq), ]
head(aj2014_j_dsc, n = 10)
## Var1 Freq
## 76 PLoS_One 58
## 8 BMC_Genomics 25
## 75 PLoS_Genet 23
## 39 Genome_Biol 12
## 62 Nat_Commun 11
## 68 Nucleic_Acids_Res 9
## 25 eLife 8
## 67 Nature 6
## 6 BMC_Bioinformatics 5
## 70 Oncotarget 5
# Repeats the 2020 ranking already computed earlier in this section; the same
# objects are recreated with the same values.
aj2020_df <- as.data.frame(aj2020_tab)
aj2020_j <- subset(aj2020_df, Freq > 10)
aj2020_j_dsc <- aj2020_df[order(-aj2020_df$Freq), ]
head(aj2020_j_dsc, n = 10)
## Var1 Freq
## 123 Nat_Commun 85
## 60 Front_Genet 43
## 162 Sci_Rep 33
## 66 Front_Oncol 23
## 152 PLoS_Genet 23
## 50 eLife 22
## 15 BMC_Genomics 20
## 154 PLoS_One 17
## 157 Proc_Natl_Acad_Sci_U_S_A 17
## 42 Commun_Biol 16
Focus on PLOS ONE in the period 2014-2015. Previously we found 60 studies. Here we find 115 studies. Of these, 56 are common to both.
# Load the earlier PLOS ONE 2014/2015 list (tab-separated, with a header row)
# and keep only the rows whose Confirmed column reads "Confirmed".
prev <- subset(
  read.table("plosone_20142015.tsv", sep = "\t", header = TRUE),
  Confirmed == "Confirmed"
)
tail(prev)
## Journal YearPublished Confirmed PMID PMCID
## 73 PLosOne 2015 Confirmed 26444573 PMC4596691
## 74 PLosOne 2015 Confirmed 26510177 PMC4624949
## 75 PLosOne 2015 Confirmed 26529237 PMC4631338
## 76 PLosOne 2015 Confirmed 26636579 PMC4670106
## 77 PLosOne 2015 Confirmed 26695660 PMC4687867
## 78 PLosOne 2015 Confirmed 26684451 PMC4684321
# Distinct PMC identifiers among the confirmed earlier articles.
prev_pmc <- unique(prev[["PMCID"]])
length(prev_pmc)
## [1] 60
# Rows of aff_xls for PLoS_One where V4 is 2014 or 2015, reduced to columns
# 1, 3 and 4; then the distinct values of V1.
new <- subset(
  aff_xls,
  V3 == "PLoS_One" & (V4 == "2014" | V4 == "2015")
)[, c(1, 3, 4)]
new_pmc <- unique(new[["V1"]])
length(new_pmc)
## [1] 115
# Euler diagram of the overlap between the earlier and the current PMC ID
# sets, annotated with the count in each region.
v1 <- list(Prev = prev_pmc, New = new_pmc)
plot(euler(v1), quantities = TRUE)
# PMC IDs found only in the earlier (2016) study
prev_pmc[!(prev_pmc %in% new_pmc)]
## [1] "PMC4118979" "PMC4441472" "PMC4500563" "PMC4508115"
# PMC IDs found only in the current (2021) study
new_pmc[!(new_pmc %in% prev_pmc)]
## [1] "PMC3894996" "PMC3901708" "PMC3968145" "PMC3989189" "PMC3990548"
## [6] "PMC3990644" "PMC3990668" "PMC3999108" "PMC4002427" "PMC4002480"
## [11] "PMC4011728" "PMC4012993" "PMC4029955" "PMC4039489" "PMC4041891"
## [16] "PMC4079602" "PMC4084626" "PMC4099127" "PMC4103770" "PMC4105622"
## [21] "PMC4108364" "PMC4109958" "PMC4128672" "PMC4141782" "PMC4156408"
## [26] "PMC4157799" "PMC4167545" "PMC4207689" "PMC4208810" "PMC4210245"
## [31] "PMC4230926" "PMC4252097" "PMC4262388" "PMC4338293" "PMC4364623"
## [36] "PMC4370594" "PMC4370704" "PMC4372331" "PMC4373911" "PMC4388690"
## [41] "PMC4416816" "PMC4427337" "PMC4441380" "PMC4456163" "PMC4472808"
## [46] "PMC4478024" "PMC4514889" "PMC4549312" "PMC4551741" "PMC4552880"
## [51] "PMC4556673" "PMC4580636" "PMC4583255" "PMC4618846" "PMC4641597"
## [56] "PMC4641603" "PMC4671692" "PMC4681367" "PMC4699828"
Journal Citation Reports 2020 (Impact factor & Ranking of 2019). InCites Journal Citation Reports (Clarivate Analytics).
# Columns 1 and 4 of jjf supply the `journal` key and the
# `Proportion of articles affected` values that are merged and tested here.
proportions <- jjf[, c(1, 4)]
# Impact factors; read with read.table's default whitespace separator.
jifs <- read.table("jifs.tsv", header = TRUE)
# Inner join on journal name (key is `journal` on the left, `Journal` on the
# right).
myjifs <- merge(
  x = proportions, y = jifs,
  by.x = "journal", by.y = "Journal"
)
# Pearson correlation between impact factor and affected proportion.
cor.test(myjifs$JIF, myjifs$`Proportion of articles affected`,
         method = "pearson")
##
## Pearson's product-moment correlation
##
## data: myjifs$JIF and myjifs$`Proportion of articles affected`
## t = 2.9932, df = 33, p-value = 0.005197
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1522930 0.6892045
## sample estimates:
## cor
## 0.4620881
# Rank-based (Spearman) version of the same correlation test.
cor.test(myjifs$JIF, myjifs$`Proportion of articles affected`,
         method = "spearman")
## Warning in cor.test.default(myjifs$JIF, myjifs$`Proportion of articles
## affected`, : Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: myjifs$JIF and myjifs$`Proportion of articles affected`
## S = 2932.1, p-value = 0.0001953
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.5893395
# Linear model of the affected proportion as a function of JIF; the fitted
# object is later passed to abline() to draw the regression line.
# NOTE(review): the formula refers to columns through `myjifs$` rather than a
# `data =` argument, so the printed Call and coefficient names carry that
# prefix.
mylm <- lm(myjifs$`Proportion of articles affected` ~ myjifs$JIF)
summary(mylm)
##
## Call:
## lm(formula = myjifs$`Proportion of articles affected` ~ myjifs$JIF)
##
## Residuals:
## Min 1Q Median 3Q Max
## -24.2790 -4.5087 -0.6097 4.7067 19.4141
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 28.8471 1.9874 14.515 6.95e-16 ***
## myjifs$JIF 0.4775 0.1595 2.993 0.0052 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.702 on 33 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.2135, Adjusted R-squared: 0.1897
## F-statistic: 8.959 on 1 and 33 DF, p-value: 0.005197
# Scatter plot of affected proportion against JIF, with the fitted regression
# line from mylm overlaid.
plot(
  x = myjifs[["JIF"]],
  y = myjifs[["Proportion of articles affected"]],
  main = "Articles with supplementary Excel gene lists",
  xlab = "JIF",
  ylab = "Proportion of articles affected (%)"
)
abline(mylm)
# Write the same scatter plot and regression line to fig2.pdf.
pdf("fig2.pdf")
plot(
  x = myjifs[["JIF"]],
  y = myjifs[["Proportion of articles affected"]],
  main = "Articles with supplementary Excel gene lists",
  xlab = "JIF",
  ylab = "Proportion of articles affected (%)"
)
abline(mylm)
dev.off()
## png
## 2
For reproducibility.
# Print the R version, platform, locale and package versions for this run.
sessionInfo()
## R version 4.1.0 (2021-05-18)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.2 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
##
## locale:
## [1] LC_CTYPE=en_AU.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_AU.UTF-8 LC_COLLATE=en_AU.UTF-8
## [5] LC_MONETARY=en_AU.UTF-8 LC_MESSAGES=en_AU.UTF-8
## [7] LC_PAPER=en_AU.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] Cairo_1.5-12.2 eulerr_6.1.0 kableExtra_1.3.4 dplyr_1.0.6
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.6 highr_0.9 pillar_1.6.1 bslib_0.2.5.1
## [5] compiler_4.1.0 jquerylib_0.1.4 tools_4.1.0 digest_0.6.27
## [9] viridisLite_0.4.0 jsonlite_1.7.2 evaluate_0.14 lifecycle_1.0.0
## [13] tibble_3.1.2 pkgconfig_2.0.3 rlang_0.4.11 DBI_1.1.1
## [17] rstudioapi_0.13 yaml_2.2.1 xfun_0.23 stringr_1.4.0
## [21] httr_1.4.2 knitr_1.33 xml2_1.3.2 systemfonts_1.0.2
## [25] generics_0.1.0 vctrs_0.3.8 sass_0.4.0 grid_4.1.0
## [29] webshot_0.5.2 tidyselect_1.1.1 svglite_2.0.0 glue_1.4.2
## [33] R6_2.5.0 fansi_0.5.0 rmarkdown_2.8 polyclip_1.10-0
## [37] polylabelr_0.2.0 purrr_0.3.4 magrittr_2.0.1 scales_1.1.1
## [41] ellipsis_0.3.2 htmltools_0.5.1.1 assertthat_0.2.1 rvest_1.0.0
## [45] colorspace_2.0-1 utf8_1.2.1 stringi_1.6.2 munsell_0.5.0
## [49] crayon_1.4.1