library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.0 ✓ purrr 0.3.4
## ✓ tibble 3.0.1 ✓ dplyr 0.8.5
## ✓ tidyr 1.0.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
Updated 13 June 2020: Graph axis labels were changed to make it clearer that the plotted values are cumulative article counts.
Load in data from the NIH COVID-19 portfolio (https://icite.od.nih.gov/covid19/search/). This dataset was downloaded 9 June 2020, including the DOI, PMCID, PMID, Publication Date, Publication Types, Source, and Journal Name fields, and includes all articles through 2020-06-09.
# Path to the NIH COVID-19 Portfolio export (downloaded 9 June 2020)
covid_csv <- "data/covidlit-COVID-19_Portfolio-export_2020-06-09-13-52-30.csv"
# Read the export into a tibble; readr guesses the column types
covidlit <- read_csv(file = covid_csv)
## Parsed with column specification:
## cols(
## DOI = col_character(),
## PMCID = col_character(),
## PMID = col_double(),
## `Publication Date` = col_date(format = ""),
## `Publication Types` = col_character(),
## Source = col_character(),
## `Journal Name` = col_character()
## )
How many total articles?
# Number of rows (one row per article) in the export
dim(covidlit)[1]
## [1] 29054
We want to count published articles and preprints. What kinds of sources do we have?
# Distinct values of the Source column, in order of first appearance
unique(covidlit[["Source"]])
## [1] "bioRxiv" "ChemRxiv" "Peer reviewed (PubMed)"
## [4] "Research Square" "medRxiv" "arXiv"
## [7] "SSRN"
Let’s count articles by type…
# Every distinct source; everything except PubMed is treated as a preprint server
all_types <- unique(covidlit$Source)
preprint_types <- all_types[all_types != "Peer reviewed (PubMed)"]
# Print "<source> <number of articles>" for each source
for (src in all_types) {
  from_src <- covidlit[covidlit$Source == src, ]
  n_articles <- nrow(from_src)
  print(paste(src, n_articles))
}
## [1] "bioRxiv 914"
## [1] "ChemRxiv 246"
## [1] "Peer reviewed (PubMed) 19669"
## [1] "Research Square 2235"
## [1] "medRxiv 3877"
## [1] "arXiv 942"
## [1] "SSRN 1171"
For each date in the Publication Date column, count the total number of articles, the number of peer-reviewed journal articles, and the number of preprints…
# Make sure 'Publication Date' is a Date column (read_csv normally parses it
# already, so this is only a safeguard)
covidlit$`Publication Date` <- as.Date(covidlit$`Publication Date`, format = "%Y-%m-%d")
# Sorted distinct publication dates (sort() drops any NA dates)
dates <- sort(unique(covidlit$`Publication Date`))
# One row per publication date with the number of articles published that day:
#   all  - every article
#   jrnl - peer-reviewed journal articles (PubMed)
#   pp   - preprints (any source listed in preprint_types)
# A single grouped summary replaces growing a data.frame with rbind() in a
# loop: it is linear instead of quadratic, keeps `date` as a Date (no sentinel
# NA row or origin= repair needed), and rows with a missing publication date
# are dropped explicitly rather than leaking into every day's count via
# NA-valued `==` subsetting.
df <- covidlit %>%
  filter(!is.na(`Publication Date`)) %>%
  group_by(date = `Publication Date`) %>%
  summarise(
    all = n(),
    jrnl = sum(Source == "Peer reviewed (PubMed)", na.rm = TRUE),
    pp = sum(Source %in% preprint_types)
  ) %>%
  ungroup() %>%
  arrange(date) %>%
  as.data.frame()
# get cumulative # of all articles, journal articles, and preprints
# (rows are in ascending date order, so these are running totals over time)
df$cumsum_all <- cumsum(df$all)
df$cumsum_jrnl <- cumsum(df$jrnl)
df$cumsum_pp <- cumsum(df$pp)
df
Make the data ‘tidy’…
# Restrict to publications from 1 Jan 2020 onward, excluding the final day
# (9 June 2020), whose data had not been fully updated at download time.
first_day <- as.Date("2020-01-01")
last_day <- as.Date("2020-06-08")
# Keep only the date plus the cumulative totals, giving the totals their
# display names in the same step (the daily counts are no longer needed).
df <- df %>%
  filter(date >= first_day, date <= last_day) %>%
  select(date,
         `All Articles` = cumsum_all,
         Published = cumsum_jrnl,
         Preprints = cumsum_pp)
# Reshape from 'wide' to 'long' with {tidyr}: one row per (date, article type)
# pair, which is the layout ggplot2 expects (see https://tidyr.tidyverse.org/)
mytable <- pivot_longer(df,
                        cols = -date,
                        names_to = "Article type",
                        values_to = "count")
mytable
Graph the data…
# Line chart of cumulative article counts over time, one line per article type.
# The aesthetics are declared once on the plot and inherited by geom_line().
p <- ggplot(mytable, aes(x = date, y = count, color = `Article type`)) +
  geom_line() +
  labs(x = "Publication Date",
       y = "Cumulative Number of Articles",
       title = "Growth of the COVID-19 literature",
       subtitle = "Source: https://icite.od.nih.gov/covid19/search/") +
  # theme_minimal() must come first: it replaces the whole theme, so the
  # legend tweaks below would be lost if applied before it
  theme_minimal() +
  theme(legend.title = element_blank(), legend.position = "right")
p
Document session for computational reproducibility
# Record the R version, platform, and package versions used for this analysis
utils::sessionInfo()
## R version 4.0.0 (2020-04-24)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Mojave 10.14.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] forcats_0.5.0 stringr_1.4.0 dplyr_0.8.5 purrr_0.3.4
## [5] readr_1.3.1 tidyr_1.0.2 tibble_3.0.1 ggplot2_3.3.0
## [9] tidyverse_1.3.0
##
## loaded via a namespace (and not attached):
## [1] tidyselect_1.0.0 xfun_0.13 haven_2.2.0 lattice_0.20-41
## [5] colorspace_1.4-1 vctrs_0.3.0 generics_0.0.2 htmltools_0.4.0
## [9] yaml_2.2.1 rlang_0.4.6 pillar_1.4.4 glue_1.4.1
## [13] withr_2.2.0 DBI_1.1.0 dbplyr_1.4.3 modelr_0.1.6
## [17] readxl_1.3.1 lifecycle_0.2.0 munsell_0.5.0 gtable_0.3.0
## [21] cellranger_1.1.0 rvest_0.3.5 evaluate_0.14 labeling_0.3
## [25] knitr_1.28 fansi_0.4.1 broom_0.5.6 Rcpp_1.0.4.6
## [29] scales_1.1.0 backports_1.1.7 jsonlite_1.6.1 farver_2.0.3
## [33] fs_1.4.1 hms_0.5.3 digest_0.6.25 stringi_1.4.6
## [37] grid_4.0.0 cli_2.0.2 tools_4.0.0 magrittr_1.5
## [41] crayon_1.3.4 pkgconfig_2.0.3 ellipsis_0.3.1 xml2_1.3.2
## [45] reprex_0.3.0 lubridate_1.7.8 assertthat_0.2.1 rmarkdown_2.1
## [49] httr_1.4.1 rstudioapi_0.11 R6_2.4.1 nlme_3.1-147
## [53] compiler_4.0.0