Source: https://github.com/markziemann/SurveyEnrichmentMethods

Intro

Here we are performing the analysis that is described in Figure 3 of the manuscript. This involves calculating an analysis score for a 1500 articles from 2019. We then identify the best/worst journals, and correlate analysis scores with journal and article level metrics.

knitr::opts_chunk$set(fig.width=7, fig.height=5) 

library("wordcloud")
library("vioplot")

Overview of included and excluded analyses

x <- read.table("../data/PMC_2019-analysis.tsv",header=TRUE,fill=TRUE,sep="\t")
y <- read.table("../data/QC-analysis2.tsv",header=TRUE,fill=TRUE,sep="\t")
y <- y[,1:16]
x <- x[which(! x$Pubmed.Central.ID %in% y$Pubmed.Central.ID),]
x <- rbind(x,y)

head(x)
##   Pubmed.Central.ID Article.number Allocated          Journal
## 2        PMC6317219              2      Mark     J Transl Med
## 4        PMC6318897              4      Mark     BMC Genomics
## 5        PMC6322516              5      Mark Cancer Manag Res
## 6        PMC6323737              6      Mark   BMC Plant Biol
## 8        PMC6325784              8      Mark       Respir Res
## 9        PMC6325795              9      Mark       BMC Cancer
##              Omics.type                Organism         Gene.set.library
## 2 Gene expression array            Homo sapiens                 GO, KEGG
## 4               RNA-seq Gymnocypris przewalskii                     KEGG
## 5 Gene expression array            Homo sapiens Ingenuity Knowledge Base
## 6 Gene expression array             Arabidopsis                       GO
## 8 Gene expression array            Homo sapiens                     KEGG
## 9             miRNA-seq            Homo sapiens                     KEGG
##   GS.version Statistical.test.used FDR.Correction                   App.used
## 2         No            Not stated             No                      DAVID
## 4         No            Not stated            Yes                      KOBAS
## 5         No            Not stated             No Ingenuity Pathway Analysis
## 6        Yes            Not stated             No                      topGO
## 8         No        Hypergeometric            Yes                GeneAnswers
## 9         No                Fisher             No              DIANA-miRPath
##   App.Version Code.availability Background.gene.set Assumptions.violated
## 2          No              <NA>          Not stated      Background, FDR
## 4          No              <NA>          Not stated           Background
## 5          No              <NA>          Not stated      Background, FDR
## 6         Yes                No          Not stated      Background, FDR
## 8          No                No          Not stated           Background
## 9         Yes              <NA>          Not stated           Background
##   Gene.lists.provided
## 2                  No
## 4                 Yes
## 5                  No
## 6                 Yes
## 8                 Yes
## 9                 Yes
colnames(x)
##  [1] "Pubmed.Central.ID"     "Article.number"        "Allocated"            
##  [4] "Journal"               "Omics.type"            "Organism"             
##  [7] "Gene.set.library"      "GS.version"            "Statistical.test.used"
## [10] "FDR.Correction"        "App.used"              "App.Version"          
## [13] "Code.availability"     "Background.gene.set"   "Assumptions.violated" 
## [16] "Gene.lists.provided"
dim(x)
## [1] 1766   16
exclude <- subset(x,x$GS.version=="EXCLUDE")
nrow(exclude)
## [1] 132
length(unique(exclude$Pubmed.Central.ID))
## [1] 132
x <- subset(x,x$GS.version!="EXCLUDE")
nrow(x)
## [1] 1630
length(unique(x$Pubmed.Central.ID))
## [1] 1365
write.table(x,file="TableS2.tsv",quote=FALSE,sep="\t",col.names = TRUE,row.names = FALSE)

Score

Here I am proposing a scoring scheme. If there is information missing from the article, or basic mistakes are made, then a point is deducted. If the article goes over and above the basic reproducibility, then they are awarded a point.

Gene set library origin not stated = -1

Gene set library version not stated = -1

Stat test not stated = -1

No stat test conducted = -1

No FDR correction conducted = -1

App used not stated = -1

App version not stated = -1

Background list not defined = -1

Inappropriate background list used = -1

Code availability = +1

Gene lists provided = +1

score <- function(r){
  r[is.na(r)] <- 0
  SCORE=0
  # gene set lib
  if(r["Gene.set.library"]=="Not stated"){
    SCORE=SCORE-1
  }
  # GS version
  if(r["GS.version"]=="No"){
    SCORE=SCORE-1
  }
  # stat test
  if(r["Statistical.test.used"]=="No test"){
    SCORE=SCORE-1
  }
  if(r["Statistical.test.used"]=="Not stated"){
    SCORE=SCORE-1
  }
  # FDR
  if(r["FDR.Correction"]!="Yes"){
    SCORE=SCORE-1
  }
  # app used
  if(r["App.used"]=="Not stated"){
    SCORE=SCORE-1
  }
  # App version
  if(r["App.Version"]=="No"){
    SCORE=SCORE-1
  }
  # Code availability
  if(r["Code.availability"]=="Yes"){
    SCORE=SCORE+1
  }
  # Background
  if(r["Background.gene.set"]=="Not stated"){
    SCORE=SCORE-1
  }
  if(r["Background.gene.set"]=="No"){
    SCORE=SCORE-1
  }
  if(r["Background.gene.set"]=="Stated, but incorrect"){
    SCORE=SCORE-1
  }
  # gene list
  if(r["Gene.lists.provided"]=="Yes"){
    SCORE=SCORE+1
  }
  return(SCORE)
}

scores <- apply(X = x,MARGIN = 1, FUN = score)
barplot(table(scores), xlab="analysis score",ylab="frequency")

x$scores <- scores
length(which(scores>0))
## [1] 2
mean(x$scores)
## [1] -3.529448
sd(x$scores)
## [1] 1.361599
median(x$scores)
## [1] -4
length(scores)
## [1] 1630
which(scores>0)
##  349 5910 
##  278 1450
table(scores)
## scores
##  -7  -6  -5  -4  -3  -2  -1   0   1 
##   1  67 351 492 360 216 111  30   2
dir.create("images")
## Warning in dir.create("images"): 'images' already exists
png("images/hist1.png",width=300,height=200)
par(mar=c(5,5,2,1))
barplot(table(scores), xlab="analysis score",ylab="frequency",ylim=c(0,600))
text(x= 1:length(table(scores))* 1.2 -0.5, y = table(scores), label = table(scores), pos = 3, cex = 1, col = "black")
dev.off()
## png 
##   2
pdf("images/hist1.pdf",width=5,height=4)
par(mar=c(5,5,2,1))
barplot(table(scores), xlab="analysis score",ylab="frequency",ylim=c(0,600))
text(x= 1:length(table(scores))* 1.2 -0.5, y = table(scores), label = table(scores), pos = 3, cex = 1, col = "black")
dev.off()
## png 
##   2
scores2 <- round(table(scores)/sum(table(scores))*100,1)
scores2
## scores
##   -7   -6   -5   -4   -3   -2   -1    0    1 
##  0.1  4.1 21.5 30.2 22.1 13.3  6.8  1.8  0.1
png("images/hist2.png",width=300,height=200)
par(mar=c(5,5,2,1))
barplot(scores2, xlab="analysis score",ylab="frequency",ylim=c(0,50))
text(x= 1:length(scores2)* 1.2 -0.5, y = scores2, label = scores2, pos = 3, cex = 0.9, col = "black")
dev.off()
## png 
##   2
pdf("images/hist2.pdf",width=5,height=4)
par(mar=c(5,5,2,1))
barplot(scores2, xlab="analysis score",ylab="frequency",ylim=c(0,40))
text(x= 1:length(scores2)* 1.2 -0.5, y = scores2, label = scores2, pos = 3, cex = 0.9, col = "black")
dev.off()
## png 
##   2

Journal rank

jmetrics <- read.table("../data/scimagojr_2020.csv",sep=";",header=TRUE,dec = ",")
jmetrics$Title <- toupper(jmetrics$Title)
head(jmetrics,3)
##   Rank Sourceid                                 Title    Type
## 1    1    28773    CA-A CANCER JOURNAL FOR CLINICIANS journal
## 2    2    19434      MMWR RECOMMENDATIONS AND REPORTS journal
## 3    3    20315 NATURE REVIEWS MOLECULAR CELL BIOLOGY journal
##                 Issn    SJR SJR.Best.Quartile H.index Total.Docs...2020.
## 1 15424863, 00079235 62.937                Q1     168                 47
## 2 10575987, 15458601 40.949                Q1     143                 10
## 3 14710072, 14710080 37.461                Q1     431                115
##   Total.Docs...3years. Total.Refs. Total.Cites..3years. Citable.Docs...3years.
## 1                  119        3452                15499                     80
## 2                    9        1292                  492                      9
## 3                  338        8439                10844                    167
##   Cites...Doc...2years. Ref....Doc.        Country           Region
## 1                126.34       73.45  United States Northern America
## 2                 50.00      129.20  United States Northern America
## 3                 32.83       73.38 United Kingdom   Western Europe
##                                          Publisher  Coverage
## 1                                  Wiley-Blackwell 1950-2020
## 2 Centers for Disease Control and Prevention (CDC) 1990-2020
## 3                          Nature Publishing Group 2000-2020
##                                                                                                                                                    Categories
## 1                                                                                                                              Hematology (Q1); Oncology (Q1)
## 2 Epidemiology (Q1); Health Information Management (Q1); Health (social science) (Q1); Health, Toxicology and Mutagenesis (Q1); Medicine (miscellaneous) (Q1)
## 3                                                                                                                   Cell Biology (Q1); Molecular Biology (Q1)
dim(jmetrics)
## [1] 32952    20
nlmcat <- readLines("../data/nlmcatalog_result.txt")
journaltitle <- nlmcat[grep("Title\\(s\\):",nlmcat)]
journaltitle <- gsub("Title\\(s\\): ","",journaltitle)
journaltitle <- gsub("\\.$","",journaltitle)
journaltitle <- toupper(journaltitle) 
journalabbrev <- nlmcat[grep("Title Abbreviation:",nlmcat)]
journalabbrev <- sapply(strsplit(journalabbrev,":"),"[[",2)
journalabbrev <- gsub(" $","",journalabbrev)
journalabbrev <- gsub("^ ","",journalabbrev)
jdf <- data.frame(journalabbrev,journaltitle)

mjdf <- merge(jdf,jmetrics,by.x="journaltitle",by.y="Title")

xm <- merge(x,mjdf,by.x="Journal",by.y="journalabbrev",all.x = TRUE)

Analysis scores vs Journal rank

tail(xm$Journal)
## [1] "World J Gastroenterol"      "World J Gastrointest Oncol"
## [3] "World J Gastrointest Oncol" "World J Surg Oncol"        
## [5] "World J Surg Oncol"         "World J Surg Oncol"
hist(xm$SJR,xlab="SJR",main="SJR distribution")

mylm1 <- lm (xm$scores ~ xm$SJR)
plot(xm$SJR,xm$scores, col = rgb(red=0,blue=0,green=0,alpha=0.2), pch=19, bty="n",
  xlab="SJR",ylab="score",main="Analysis scores vs. journal metrics")
abline(mylm1,col="red")

cor.test(xm$scores,xm$SJR,method="pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  xm$scores and xm$SJR
## t = 1.335, df = 1196, p-value = 0.1821
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.01810308  0.09500263
## sample estimates:
##        cor 
## 0.03857333
cor.test(xm$scores,xm$SJR,method="spearman")
## Warning in cor.test.default(xm$scores, xm$SJR, method = "spearman"): Cannot
## compute exact p-value with ties
## 
##  Spearman's rank correlation rho
## 
## data:  xm$scores and xm$SJR
## S = 278554962, p-value = 0.3339
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.0279424
png("images/score_sjr1.png",width=350,height=300)
par(mar=c(5,5,3,1))
plot(xm$SJR,xm$scores, col = rgb(red=0,blue=0,green=0,alpha=0.2), pch=19,bty="n",
  xlab="SJR",ylab="score",main="Analysis scores vs. journal metrics")
abline(mylm1,col="red")
grid()
dev.off()
## png 
##   2
pdf("images/score_sjr1.pdf",width=4,height=3)
par(mar=c(4,4,3,2))
plot(xm$SJR,xm$scores, col = rgb(red=0,blue=0,green=0,alpha=0.2), pch=19,bty="n",
  xlab="SJR",ylab="score",main="Analysis scores vs. journal metrics")
abline(mylm1,col="red")
grid()
dev.off()
## png 
##   2

Best and worst journals

Minimum of 5 analyses.

xmm <-xm[,c("Journal","scores")]
tab <- table(xmm$Journal)
tab <- tab[which(tab>=5)]
xmm <- xmm[which(xmm$Journal %in% names(tab)),]
dim(xmm)
## [1] 1236    2
jres <- aggregate(. ~ Journal, xmm ,mean)
jsd <- aggregate(. ~ Journal, xmm ,sd)
jres$sd <- jsd$scores
jres <- jres[order(jres$scores),]
dim(jres)
## [1] 70  3
head(jres)
##                  Journal    scores        sd
## 65              RNA Biol -5.000000 1.0954451
## 53           Mol Med Rep -4.275000 0.9867715
## 15            Cancer Med -4.272727 0.9045340
## 70 World J Gastroenterol -4.250000 1.3887301
## 18     Cell Death Discov -4.200000 1.3038405
## 39         Int J Med Sci -4.200000 1.3038405
par(mar=c(5,12,3,1))
barplot(tail(jres$scores,20),names.arg = tail(jres$Journal,20),horiz=TRUE,las=1,cex.names = 0.7, xlab="mean score",
        main = "Highest scoring",xlim=c(-4,0))
grid()