Source: https://github.com/markziemann/SurveyEnrichment2/pubmed

Introduction

Enrichment analysis commonly suffers from statistical problems and poor reporting, making the findings irreproducible. Here we investigate the methodology of some high-impact articles that have conducted enrichment analysis. To highlight problematic articles, we first need to collect a corpus of them to screen. A list of PubMed articles citing DAVID was collected. The SJR (Scimago Journal Rank) for prominent journals was also collected. These will be merged, and high-impact enrichment articles will be curated.

library("kableExtra")

Load data

SJR data were downloaded from the Scimago Journal & Country Rank website and saved as scimagojr_2022.csv.

PubMed data were downloaded from the “cited by” pages for PMID 19131956 and 35325185. The data were downloaded in “PubMed” format.

Obtained 2023-05-29.

# Load the semicolon-delimited Scimago ranking table and list its columns.
sjr <- read.csv(file = "scimagojr_2022.csv", sep = ";", header = TRUE)
names(sjr)
##  [1] "Rank"                   "Sourceid"               "Title"                 
##  [4] "Type"                   "Issn"                   "SJR"                   
##  [7] "SJR.Best.Quartile"      "H.index"                "Total.Docs...2022."    
## [10] "Total.Docs...3years."   "Total.Refs."            "Total.Cites..3years."  
## [13] "Citable.Docs...3years." "Cites...Doc...2years."  "Ref....Doc."           
## [16] "Country"                "Region"                 "Publisher"             
## [19] "Coverage"               "Categories"             "Areas"

Need to clean some data. Removing all data except the ISSN and the PMID.


# Write "PMID<TAB>ISSN" pairs from a PubMed-format export.
#   $1  input file in PubMed tagged format (CR line endings are converted)
#   $2  output TSV file
# The PMID of each record is remembered and emitted together with the first
# ISSN line of that record that is not flagged "Print" or "Link". Pairing
# inside awk (instead of `paste - -`) keeps the two columns aligned when a
# record has no such ISSN line: that record is skipped rather than shifting
# every following row. Matching the exact "IS  -" tag also keeps ISBN lines
# out of the ISSN column.
extract_pmid_issn() {
  tr '\r' '\n' < "$1" \
  | awk '
      /^PMID[ \t]*-/ {
        pmid = $0
        sub(/^[^-]*-/, "", pmid)        # drop the "PMID-" tag
        gsub(/[ \t]/, "", pmid)
        next
      }
      /^IS[ \t]*-/ && !/Print/ && !/Link/ && pmid != "" {
        issn = $0
        sub(/^[^-]*-/, "", issn)        # drop the "IS  -" tag
        sub(/\(.*/, "", issn)           # drop the "(Electronic)" qualifier
        gsub(/[ \t-]/, "", issn)        # 1097-4172 -> 10974172
        if (issn != "") {
          print pmid "\t" issn
          pmid = ""                     # at most one row per record
        }
      }
    ' > "$2"
}

extract_pmid_issn pubmed-19131956-set.txt tmp1.tsv
extract_pmid_issn pubmed-35325185-set.txt tmp2.tsv

Now it should be possible to load the data into R.

# Load the PMID/ISSN pairs from tmp1.tsv and tmp2.tsv.
# The ISSN column is forced to character: type guessing would turn an
# all-digit column into integers and strip leading zeros
# (e.g. "00928674" -> 928674), so those ISSNs could no longer match the
# character ISSNs taken from the SJR table. The PMID column keeps the default
# type conversion (colClasses NA).
pm1 <- read.table("tmp1.tsv", header = FALSE,
  col.names = c("PMID", "ISSN"), colClasses = c(NA, "character"))

pm2 <- read.table("tmp2.tsv", header = FALSE,
  col.names = c("PMID", "ISSN"), colClasses = c(NA, "character"))

Merge

First need to get the electronic ISSN, then cut down the dataset to include just the eISSN and the SJR.

# sjr$Issn may hold several space-separated entries; keep only the last one,
# i.e. everything after the final space.
# NOTE(review): this assumes the last listed ISSN is the electronic one -
# confirm against the Scimago export.
# One vectorised sub() replaces the reverse/split/reverse round trip and also
# copes with an empty Issn string, which previously stopped the script with a
# "subscript out of bounds" error.
issn <- sub("^.* ", "", sjr$Issn)
sjr$issn <- issn

# Cut the table down to the two columns needed for the join.
sjr2 <- sjr[, c("SJR", "issn")]

# Attach the SJR to each article by ISSN (inner join: unmatched rows drop out).
m1 <- merge(sjr2, pm1, by.x = "issn", by.y = "ISSN")
m2 <- merge(sjr2, pm2, by.x = "issn", by.y = "ISSN")


# SJR values use a decimal comma; swap it for a point and convert to numeric.
m1[["SJR"]] <- as.numeric(gsub(",", ".", m1[["SJR"]], fixed = TRUE))
m2[["SJR"]] <- as.numeric(gsub(",", ".", m2[["SJR"]], fixed = TRUE))

Histogram and filter.

We will remove any papers published before 2020 (approximated by keeping only PMIDs above 32000000).

We will remove papers with an SJR of 5 or less.

# Pool both citation sets, drop duplicate rows and rank by SJR, highest first.
m <- unique(rbind(m1, m2))
m <- m[order(-m$SJR), ]

# Remove papers pre 2020: keep only PMIDs above 32000000.
m <- subset(m, as.numeric(PMID) > 32000000)

hist(m$SJR)

# How many articles clear each SJR threshold?
sum(m$SJR > 5, na.rm = TRUE)
## [1] 151
sum(m$SJR > 10, na.rm = TRUE)
## [1] 16
sum(m$SJR > 15, na.rm = TRUE)
## [1] 8
m <- m[which(m$SJR > 5), ]

# Render the filtered corpus as a hover-highlighted, non-full-width table.
corpus_kbl <- kbl(
  m,
  caption = "top high impact articles describing enrichment analysis"
)
kable_paper(corpus_kbl, "hover", full_width = FALSE)
top high impact articles describing enrichment analysis
issn SJR PMID
43 10974172 26.494 32991841
45 10974172 26.494 37001506
50 10974172 26.494 35120663
51 10974172 26.494 33743211
730 15461718 16.732 35710981
731 15461718 16.732 37012455
732 15461718 16.732 32989324
733 15461718 16.732 32989326
742 15487105 14.358 36344834
1632 18783686 12.578 35623341
3415 24709468 11.191 35658015
3416 24709468 11.191 36490328
3417 24709468 11.191 35213211
595 15292916 10.921 32807943
597 15292916 10.921 35624211
598 15292916 10.921 34017121
3523 26621347 9.817 36550235
511 14764598 8.703 32843065
512 14764598 8.703 35279152
3580 14764598 8.703 36670412
147 13624962 8.234 33035346
149 13624962 8.234 36631985
153 13624962 8.234 35323972
155 13624962 8.234 35234924
157 13624962 8.234 36124662
158 13624962 8.234 35100428
159 13624962 8.234 36029115
160 13624962 8.234 36440760
163 13624962 8.234 34037798
164 13624962 8.234 35390159
166 13624962 8.234 36373634
169 13624962 8.234 34125917
172 13624962 8.234 32997144
173 13624962 8.234 36629268
174 13624962 8.234 34850140
179 13624962 8.234 36454018
180 13624962 8.234 36477312
181 13624962 8.234 34019640
184 13624962 8.234 34023904
3556 13624962 8.234 36864760
3557 13624962 8.234 35670661
3559 13624962 8.234 35325185
1408 17554349 8.140 36864143
575 15244539 7.800 37013819
576 15244539 7.800 32885664
2816 21598290 7.268 36259947
2817 21598290 7.268 35262173
3440 25225812 7.045 35999469
3441 25225812 7.045 37037945
3443 25225812 7.045 35228746
3444 25225812 7.045 34417591
1846 19466242 6.361 36197962
1848 19466242 6.361 33762433
1851 19466242 6.361 35613280
701 15409538 6.237 34854884
702 15409538 6.237 33857288
704 15409538 6.237 37115584
706 15409538 6.237 35029648
707 15409538 6.237 35238865
708 15409538 6.237 36749798
709 15409538 6.237 35420633
1364 17444292 6.220 32744794
1454 17568722 6.046 35659036
3788 26663791 5.841 37075704
561 15213773 5.573 36524454
269 14602075 5.484 35466425
272 14602075 5.484 32954504
275 14602075 5.484 36398858
276 14602075 5.484 32946121
277 14602075 5.484 32985705
283 14602075 5.484 32954517
284 14602075 5.484 33034061
1623 18781551 5.385 33735618
1624 18781551 5.385 33891899
1625 18781551 5.385 36905926
1626 18781551 5.385 32857951
1627 18781551 5.385 36868234
1630 18781551 5.385 34004152
1384 17501326 5.158 36056435
801 15588238 5.117 33822765
802 15588238 5.117 33905372
804 15588238 5.117 33938445
805 15588238 5.117 36074606
807 15588238 5.117 35472067
808 15588238 5.117 35579943
809 15588238 5.117 33792563
810 15588238 5.117 33724957
811 15588238 5.117 32663196
813 15588238 5.117 32673291
814 15588238 5.117 33998599
1934 20411723 5.116 36316343
1936 20411723 5.116 36418319
1937 20411723 5.116 36658155
1939 20411723 5.116 36513677
1946 20411723 5.116 36253375
1947 20411723 5.116 36808153
1950 20411723 5.116 34862370
1954 20411723 5.116 36650144
1955 20411723 5.116 32703943
1956 20411723 5.116 33976125
1958 20411723 5.116 36316347
1959 20411723 5.116 34075040
1962 20411723 5.116 34001900
1963 20411723 5.116 36639706
1965 20411723 5.116 33037222
1967 20411723 5.116 36198703
1968 20411723 5.116 36418293
1969 20411723 5.116 33963192
1972 20411723 5.116 36229455
1975 20411723 5.116 34140506
1981 20411723 5.116 35082301
1984 20411723 5.116 34145295
1989 20411723 5.116 32934238
1990 20411723 5.116 35058437
1991 20411723 5.116 36433953
1992 20411723 5.116 36245034
1998 20411723 5.116 36376321
2000 20411723 5.116 35624100
2002 20411723 5.116 35523793
2004 20411723 5.116 36309495
2005 20411723 5.116 32814767
2008 20411723 5.116 32908143
2009 20411723 5.116 35273186
2010 20411723 5.116 33931647
2011 20411723 5.116 33772001
2013 20411723 5.116 36522318
2015 20411723 5.116 33004791
2016 20411723 5.116 33850120
2017 20411723 5.116 33953205
2020 20411723 5.116 32732898
2021 20411723 5.116 33947848
2024 20411723 5.116 34404785
2025 20411723 5.116 32826896
2026 20411723 5.116 35440542
2027 20411723 5.116 34031380
2029 20411723 5.116 35140242
2033 20411723 5.116 32732875
2034 20411723 5.116 35618699
2035 20411723 5.116 35440545
2036 20411723 5.116 33712610
2037 20411723 5.116 35650206
2041 20411723 5.116 35413957
2042 20411723 5.116 35145080
2044 20411723 5.116 35987910
2046 20411723 5.116 32895370
2047 20411723 5.116 33782403
2048 20411723 5.116 36071032
2052 20411723 5.116 34404786
3649 20411723 5.116 35817779
3650 20411723 5.116 35948564
3653 20411723 5.116 36828809
# Save the curated corpus. The file carries a .tsv extension, so write it
# tab-separated and unquoted; the write.table() defaults would produce a
# space-delimited file with quoted fields.
write.table(m, file = "corpus.tsv", sep = "\t", quote = FALSE,
  row.names = FALSE)

Session Information

# Print the R version, platform, locale and package versions used for this run.
sessionInfo()
## R version 4.3.0 (2023-04-21)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.2 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0 
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
## 
## locale:
##  [1] LC_CTYPE=en_AU.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_AU.UTF-8        LC_COLLATE=en_AU.UTF-8    
##  [5] LC_MONETARY=en_AU.UTF-8    LC_MESSAGES=en_AU.UTF-8   
##  [7] LC_PAPER=en_AU.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Australia/Melbourne
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] kableExtra_1.3.4
## 
## loaded via a namespace (and not attached):
##  [1] vctrs_0.6.2       svglite_2.1.1     httr_1.4.5        cli_3.6.1        
##  [5] knitr_1.42        rlang_1.1.1       xfun_0.39         highr_0.10       
##  [9] stringi_1.7.12    jsonlite_1.8.4    glue_1.6.2        colorspace_2.1-0 
## [13] htmltools_0.5.5   sass_0.4.5        scales_1.2.1      rmarkdown_2.21   
## [17] evaluate_0.20     munsell_0.5.0     jquerylib_0.1.4   fastmap_1.1.1    
## [21] yaml_2.3.7        lifecycle_1.0.3   stringr_1.5.0     compiler_4.3.0   
## [25] rvest_1.0.3       rstudioapi_0.14   systemfonts_1.0.4 digest_0.6.31    
## [29] viridisLite_0.4.1 R6_2.5.1          magrittr_2.0.3    webshot_0.5.4    
## [33] bslib_0.4.2       tools_4.3.0       xml2_1.3.4        cachem_1.0.7