Source: https://github.com/markziemann/SurveyEnrichment2/pubmed
Enrichment analysis commonly suffers from statistical problems and poor reporting, making the findings irreproducible. Here we investigate the methodology of some high-impact articles that have conducted enrichment analysis. In order to highlight these problematic articles, we need to collect a corpus of them to screen. A list of PubMed articles citing DAVID was collected. The SJR (Scimago Journal Rank) for prominent journals was also collected. These will be merged and high-impact enrichment articles will be curated.
library("kableExtra")
SJR data were downloaded from here.
Pubmed data were downloaded from the “cited by” pages for PMID 19131956 and 35325185. The data were downloaded in “Pubmed Format”.
Obtained 2023-05-29.
# Read the semicolon-delimited file scimagojr_2022.csv; the first line holds the column names.
sjr <- read.csv(file = "scimagojr_2022.csv", sep = ";", header = TRUE)
# Show which columns are available.
names(sjr)
## [1] "Rank" "Sourceid" "Title"
## [4] "Type" "Issn" "SJR"
## [7] "SJR.Best.Quartile" "H.index" "Total.Docs...2022."
## [10] "Total.Docs...3years." "Total.Refs." "Total.Cites..3years."
## [13] "Citable.Docs...3years." "Cites...Doc...2years." "Ref....Doc."
## [16] "Country" "Region" "Publisher"
## [19] "Coverage" "Categories" "Areas"
Need to clean some data. Removing all data except the ISSN and the PMID.
# Reduce pubmed-19131956-set.txt to two tab-separated values per output row,
# written to tmp1.tsv. Stages, in order:
#   tr '\r' '\n'        turn carriage returns into newlines
#   egrep               keep only lines that start with PMID or IS
#   grep -v (x2)        drop lines containing "Print" or "Link"
#   tr -d ' '           delete all spaces
#   sed 's/-//2'        delete the second hyphen on each line
#   cut -d '-' -f2      keep the text after the first remaining hyphen
#   cut -d '(' -f1      keep the text before the first "("
#   sed + grep -v ^$    append a newline to every line, then drop empty lines
#   paste - -           join each consecutive pair of lines with a tab
# NOTE(review): paste - - presumes every PMID line is followed by exactly one
# surviving IS line; a record with zero or several would shift the pairing - TODO confirm.
tr '\r' '\n' < pubmed-19131956-set.txt \
| egrep '(^PMID|^IS)' \
| grep -v "Print" \
| grep -v "Link" \
| tr -d ' ' \
| sed 's/-//2' \
| cut -d '-' -f2 \
| cut -d '(' -f1 \
| sed 's/$/\n/' \
| grep -v ^$ \
| paste - - > tmp1.tsv
# Reduce pubmed-35325185-set.txt to two tab-separated values per output row,
# written to tmp2.tsv. Stages, in order:
#   tr '\r' '\n'        turn carriage returns into newlines
#   egrep               keep only lines that start with PMID or IS
#   grep -v (x2)        drop lines containing "Print" or "Link"
#   tr -d ' '           delete all spaces
#   sed 's/-//2'        delete the second hyphen on each line
#   cut -d '-' -f2      keep the text after the first remaining hyphen
#   cut -d '(' -f1      keep the text before the first "("
#   sed + grep -v ^$    append a newline to every line, then drop empty lines
#   paste - -           join each consecutive pair of lines with a tab
# NOTE(review): paste - - presumes every PMID line is followed by exactly one
# surviving IS line; a record with zero or several would shift the pairing - TODO confirm.
tr '\r' '\n' < pubmed-35325185-set.txt \
| egrep '(^PMID|^IS)' \
| grep -v "Print" \
| grep -v "Link" \
| tr -d ' ' \
| sed 's/-//2' \
| cut -d '-' -f2 \
| cut -d '(' -f1 \
| sed 's/$/\n/' \
| grep -v ^$ \
| paste - - > tmp2.tsv
Now it should be possible to load the cleaned files into R.
# Load the two-column (PMID, ISSN) tables from tmp1.tsv and tmp2.tsv.
# ISSN is forced to character: left to read.table()'s type detection, a column
# of all-digit ISSNs would be converted to numbers, dropping leading zeros, and
# those values would no longer match the text ISSNs used in the later merge().
# The PMID column keeps the default type detection (NA in colClasses).
pm_col_names <- c("PMID", "ISSN")
pm_col_classes <- c(NA, "character")
pm1 <- read.table("tmp1.tsv", header = FALSE,
                  col.names = pm_col_names, colClasses = pm_col_classes)
pm2 <- read.table("tmp2.tsv", header = FALSE,
                  col.names = pm_col_names, colClasses = pm_col_classes)
First we need to get the electronic ISSN (taken here as the last ISSN listed in the `Issn` field), then cut down the dataset to include just that ISSN and the SJR.
# Keep one ISSN per journal: the last space-separated token of the `Issn`
# field. sub() strips everything up to and including the final space, is
# vectorised over the whole column, and yields "" for an empty field rather
# than failing on it.
issn <- sub("^.* ", "", sjr$Issn)
sjr$issn <- issn
# Only the rank and the join key are needed from here on.
sjr2 <- sjr[, c("SJR", "issn")]
# Join each citing-article table to the journal ranks on ISSN; merge() keeps
# only rows whose ISSN is present in both tables.
m1 <- merge(sjr2, pm1, by.x = "issn", by.y = "ISSN")
m2 <- merge(sjr2, pm2, by.x = "issn", by.y = "ISSN")
# Replace commas with points in SJR before converting it to numeric
# (the values appear to use a decimal comma).
m1$SJR <- as.numeric(gsub(",", ".", m1$SJR))
m2$SJR <- as.numeric(gsub(",", ".", m2$SJR))
Histogram and filter.
We will remove any papers published before 2020 (approximated by keeping only PMIDs above 32000000).
We will remove papers with an SJR of 5 or less.
# Pool both citation sets and discard duplicated rows.
m <- unique(rbind(m1, m2))
# Sort from highest to lowest SJR.
m <- m[order(-m$SJR), ]
# remove papers pre 2020 (only PMIDs above 32000000 are kept)
m <- m[which(as.numeric(m[["PMID"]]) > 32000000), ]
hist(m$SJR)
# Count the articles above each candidate SJR cutoff.
nrow(m[which(m$SJR > 5), ])
## [1] 151
nrow(m[which(m$SJR > 10), ])
## [1] 16
nrow(m[which(m$SJR > 15), ])
## [1] 8
# Keep the articles with SJR above 5 and render them as a table.
m <- m[which(m$SJR > 5), ]
kable_paper(
  kbl(m, caption = "top high impact articles describing enrichment analysis"),
  "hover",
  full_width = FALSE
)
row | issn | SJR | PMID |
---|---|---|---|
43 | 10974172 | 26.494 | 32991841 |
45 | 10974172 | 26.494 | 37001506 |
50 | 10974172 | 26.494 | 35120663 |
51 | 10974172 | 26.494 | 33743211 |
730 | 15461718 | 16.732 | 35710981 |
731 | 15461718 | 16.732 | 37012455 |
732 | 15461718 | 16.732 | 32989324 |
733 | 15461718 | 16.732 | 32989326 |
742 | 15487105 | 14.358 | 36344834 |
1632 | 18783686 | 12.578 | 35623341 |
3415 | 24709468 | 11.191 | 35658015 |
3416 | 24709468 | 11.191 | 36490328 |
3417 | 24709468 | 11.191 | 35213211 |
595 | 15292916 | 10.921 | 32807943 |
597 | 15292916 | 10.921 | 35624211 |
598 | 15292916 | 10.921 | 34017121 |
3523 | 26621347 | 9.817 | 36550235 |
511 | 14764598 | 8.703 | 32843065 |
512 | 14764598 | 8.703 | 35279152 |
3580 | 14764598 | 8.703 | 36670412 |
147 | 13624962 | 8.234 | 33035346 |
149 | 13624962 | 8.234 | 36631985 |
153 | 13624962 | 8.234 | 35323972 |
155 | 13624962 | 8.234 | 35234924 |
157 | 13624962 | 8.234 | 36124662 |
158 | 13624962 | 8.234 | 35100428 |
159 | 13624962 | 8.234 | 36029115 |
160 | 13624962 | 8.234 | 36440760 |
163 | 13624962 | 8.234 | 34037798 |
164 | 13624962 | 8.234 | 35390159 |
166 | 13624962 | 8.234 | 36373634 |
169 | 13624962 | 8.234 | 34125917 |
172 | 13624962 | 8.234 | 32997144 |
173 | 13624962 | 8.234 | 36629268 |
174 | 13624962 | 8.234 | 34850140 |
179 | 13624962 | 8.234 | 36454018 |
180 | 13624962 | 8.234 | 36477312 |
181 | 13624962 | 8.234 | 34019640 |
184 | 13624962 | 8.234 | 34023904 |
3556 | 13624962 | 8.234 | 36864760 |
3557 | 13624962 | 8.234 | 35670661 |
3559 | 13624962 | 8.234 | 35325185 |
1408 | 17554349 | 8.140 | 36864143 |
575 | 15244539 | 7.800 | 37013819 |
576 | 15244539 | 7.800 | 32885664 |
2816 | 21598290 | 7.268 | 36259947 |
2817 | 21598290 | 7.268 | 35262173 |
3440 | 25225812 | 7.045 | 35999469 |
3441 | 25225812 | 7.045 | 37037945 |
3443 | 25225812 | 7.045 | 35228746 |
3444 | 25225812 | 7.045 | 34417591 |
1846 | 19466242 | 6.361 | 36197962 |
1848 | 19466242 | 6.361 | 33762433 |
1851 | 19466242 | 6.361 | 35613280 |
701 | 15409538 | 6.237 | 34854884 |
702 | 15409538 | 6.237 | 33857288 |
704 | 15409538 | 6.237 | 37115584 |
706 | 15409538 | 6.237 | 35029648 |
707 | 15409538 | 6.237 | 35238865 |
708 | 15409538 | 6.237 | 36749798 |
709 | 15409538 | 6.237 | 35420633 |
1364 | 17444292 | 6.220 | 32744794 |
1454 | 17568722 | 6.046 | 35659036 |
3788 | 26663791 | 5.841 | 37075704 |
561 | 15213773 | 5.573 | 36524454 |
269 | 14602075 | 5.484 | 35466425 |
272 | 14602075 | 5.484 | 32954504 |
275 | 14602075 | 5.484 | 36398858 |
276 | 14602075 | 5.484 | 32946121 |
277 | 14602075 | 5.484 | 32985705 |
283 | 14602075 | 5.484 | 32954517 |
284 | 14602075 | 5.484 | 33034061 |
1623 | 18781551 | 5.385 | 33735618 |
1624 | 18781551 | 5.385 | 33891899 |
1625 | 18781551 | 5.385 | 36905926 |
1626 | 18781551 | 5.385 | 32857951 |
1627 | 18781551 | 5.385 | 36868234 |
1630 | 18781551 | 5.385 | 34004152 |
1384 | 17501326 | 5.158 | 36056435 |
801 | 15588238 | 5.117 | 33822765 |
802 | 15588238 | 5.117 | 33905372 |
804 | 15588238 | 5.117 | 33938445 |
805 | 15588238 | 5.117 | 36074606 |
807 | 15588238 | 5.117 | 35472067 |
808 | 15588238 | 5.117 | 35579943 |
809 | 15588238 | 5.117 | 33792563 |
810 | 15588238 | 5.117 | 33724957 |
811 | 15588238 | 5.117 | 32663196 |
813 | 15588238 | 5.117 | 32673291 |
814 | 15588238 | 5.117 | 33998599 |
1934 | 20411723 | 5.116 | 36316343 |
1936 | 20411723 | 5.116 | 36418319 |
1937 | 20411723 | 5.116 | 36658155 |
1939 | 20411723 | 5.116 | 36513677 |
1946 | 20411723 | 5.116 | 36253375 |
1947 | 20411723 | 5.116 | 36808153 |
1950 | 20411723 | 5.116 | 34862370 |
1954 | 20411723 | 5.116 | 36650144 |
1955 | 20411723 | 5.116 | 32703943 |
1956 | 20411723 | 5.116 | 33976125 |
1958 | 20411723 | 5.116 | 36316347 |
1959 | 20411723 | 5.116 | 34075040 |
1962 | 20411723 | 5.116 | 34001900 |
1963 | 20411723 | 5.116 | 36639706 |
1965 | 20411723 | 5.116 | 33037222 |
1967 | 20411723 | 5.116 | 36198703 |
1968 | 20411723 | 5.116 | 36418293 |
1969 | 20411723 | 5.116 | 33963192 |
1972 | 20411723 | 5.116 | 36229455 |
1975 | 20411723 | 5.116 | 34140506 |
1981 | 20411723 | 5.116 | 35082301 |
1984 | 20411723 | 5.116 | 34145295 |
1989 | 20411723 | 5.116 | 32934238 |
1990 | 20411723 | 5.116 | 35058437 |
1991 | 20411723 | 5.116 | 36433953 |
1992 | 20411723 | 5.116 | 36245034 |
1998 | 20411723 | 5.116 | 36376321 |
2000 | 20411723 | 5.116 | 35624100 |
2002 | 20411723 | 5.116 | 35523793 |
2004 | 20411723 | 5.116 | 36309495 |
2005 | 20411723 | 5.116 | 32814767 |
2008 | 20411723 | 5.116 | 32908143 |
2009 | 20411723 | 5.116 | 35273186 |
2010 | 20411723 | 5.116 | 33931647 |
2011 | 20411723 | 5.116 | 33772001 |
2013 | 20411723 | 5.116 | 36522318 |
2015 | 20411723 | 5.116 | 33004791 |
2016 | 20411723 | 5.116 | 33850120 |
2017 | 20411723 | 5.116 | 33953205 |
2020 | 20411723 | 5.116 | 32732898 |
2021 | 20411723 | 5.116 | 33947848 |
2024 | 20411723 | 5.116 | 34404785 |
2025 | 20411723 | 5.116 | 32826896 |
2026 | 20411723 | 5.116 | 35440542 |
2027 | 20411723 | 5.116 | 34031380 |
2029 | 20411723 | 5.116 | 35140242 |
2033 | 20411723 | 5.116 | 32732875 |
2034 | 20411723 | 5.116 | 35618699 |
2035 | 20411723 | 5.116 | 35440545 |
2036 | 20411723 | 5.116 | 33712610 |
2037 | 20411723 | 5.116 | 35650206 |
2041 | 20411723 | 5.116 | 35413957 |
2042 | 20411723 | 5.116 | 35145080 |
2044 | 20411723 | 5.116 | 35987910 |
2046 | 20411723 | 5.116 | 32895370 |
2047 | 20411723 | 5.116 | 33782403 |
2048 | 20411723 | 5.116 | 36071032 |
2052 | 20411723 | 5.116 | 34404786 |
3649 | 20411723 | 5.116 | 35817779 |
3650 | 20411723 | 5.116 | 35948564 |
3653 | 20411723 | 5.116 | 36828809 |
# Save the curated corpus. The file is named .tsv, so write it tab-delimited;
# without sep, write.table() separates fields with a single space.
write.table(m, file = "corpus.tsv", sep = "\t", row.names = FALSE)
# Record the R version and loaded packages for reproducibility.
sessionInfo()
## R version 4.3.0 (2023-04-21)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.2 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
##
## locale:
## [1] LC_CTYPE=en_AU.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_AU.UTF-8 LC_COLLATE=en_AU.UTF-8
## [5] LC_MONETARY=en_AU.UTF-8 LC_MESSAGES=en_AU.UTF-8
## [7] LC_PAPER=en_AU.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Australia/Melbourne
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] kableExtra_1.3.4
##
## loaded via a namespace (and not attached):
## [1] vctrs_0.6.2 svglite_2.1.1 httr_1.4.5 cli_3.6.1
## [5] knitr_1.42 rlang_1.1.1 xfun_0.39 highr_0.10
## [9] stringi_1.7.12 jsonlite_1.8.4 glue_1.6.2 colorspace_2.1-0
## [13] htmltools_0.5.5 sass_0.4.5 scales_1.2.1 rmarkdown_2.21
## [17] evaluate_0.20 munsell_0.5.0 jquerylib_0.1.4 fastmap_1.1.1
## [21] yaml_2.3.7 lifecycle_1.0.3 stringr_1.5.0 compiler_4.3.0
## [25] rvest_1.0.3 rstudioapi_0.14 systemfonts_1.0.4 digest_0.6.31
## [29] viridisLite_0.4.1 R6_2.5.1 magrittr_2.0.3 webshot_0.5.4
## [33] bslib_0.4.2 tools_4.3.0 xml2_1.3.4 cachem_1.0.7