Growth of the COVID-19 literature

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──

## ✓ ggplot2 3.3.0     ✓ purrr   0.3.4
## ✓ tibble  3.0.1     ✓ dplyr   0.8.5
## ✓ tidyr   1.0.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0

## ── Conflicts ────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

Updated 13 June 2020: Graph axes changed to more clearly reflect that these are cumulative article counts

Load in data from the NIH COVID-19 portfolio (https://icite.od.nih.gov/covid19/search/). This dataset was downloaded 9 June 2020, including the DOI, PMCID, PMID, Publication Date, Publication Types, Source, and Journal Name fields, and includes all articles through 2020-06-09.

# Read the NIH COVID-19 Portfolio export (one row per record) with readr.
covidlit <- read_csv(
  file = "data/covidlit-COVID-19_Portfolio-export_2020-06-09-13-52-30.csv"
)

## Parsed with column specification:
## cols(
##   DOI = col_character(),
##   PMCID = col_character(),
##   PMID = col_double(),
##   `Publication Date` = col_date(format = ""),
##   `Publication Types` = col_character(),
##   Source = col_character(),
##   `Journal Name` = col_character()
## )

How many total articles?

# Row count of the export; this analysis treats each row as one article.
nrow(covidlit)

## [1] 29054

We want to count published articles and preprints. What kind of sources do we have?

# List the distinct values of the Source column, in order of first appearance.
unique(covidlit$Source)

## [1] "bioRxiv"                "ChemRxiv"               "Peer reviewed (PubMed)"
## [4] "Research Square"        "medRxiv"                "arXiv"                 
## [7] "SSRN"

Let’s count articles by type…

# Distinct values of the Source column, in order of first appearance.
all_types <- unique(covidlit$Source)
# Every source other than the PubMed feed is treated as a preprint server.
# NA is excluded explicitly so that records with a missing Source are never
# classified as preprints further down.
preprint_types <- setdiff(all_types, c("Peer reviewed (PubMed)", NA))

# Print "<source> <count>" for each source.
# `%in%` is used instead of `==` because it never returns NA: subsetting the
# table with `Source == src` would add one all-NA row per record whose Source
# is missing and silently inflate every count.
for (src in all_types) {
  print(paste(src, sum(covidlit$Source %in% src)))
}

## [1] "bioRxiv 914"
## [1] "ChemRxiv 246"
## [1] "Peer reviewed (PubMed) 19669"
## [1] "Research Square 2235"
## [1] "medRxiv 3877"
## [1] "arXiv 942"
## [1] "SSRN 1171"

For each date in the Publication Date column, count the total number of articles, the number of peer-reviewed (PubMed) articles, and the number of preprints, then compute running totals…

# Build a per-day table of article counts with running (cumulative) totals.
#
# Resulting columns of `df`:
#   date         publication date (Date)
#   all          number of records published on that date
#   jrnl         of those, records whose Source is "Peer reviewed (PubMed)"
#   pp           of those, records whose Source is one of `preprint_types`
#   cumsum_all, cumsum_jrnl, cumsum_pp   running totals of the three counts

# make sure 'Publication Date' is a Date (no-op if it already is one)
covidlit$`Publication Date` <- as.Date(covidlit$`Publication Date`, format = "%Y-%m-%d")

# One grouped pass over the table instead of growing a data.frame with rbind()
# inside a loop: this is linear in the number of rows, keeps `date` a proper
# Date throughout, and needs no placeholder row.
df <- covidlit %>%
  # Records without a publication date cannot be placed on the timeline.
  # (Comparing them with `==` and subsetting would add an all-NA row to every
  # single day and inflate the counts.)
  filter(!is.na(`Publication Date`)) %>%
  group_by(date = `Publication Date`) %>%
  summarise(
    all = n(),
    # `%in%` never yields NA, so a missing Source is simply not counted
    jrnl = sum(Source %in% "Peer reviewed (PubMed)"),
    pp = sum(Source %in% preprint_types)
  ) %>%
  ungroup() %>%
  # cumulative sums are only meaningful in chronological order
  arrange(date) %>%
  mutate(
    cumsum_all = cumsum(all),
    cumsum_jrnl = cumsum(jrnl),
    cumsum_pp = cumsum(pp)
  ) %>%
  as.data.frame()

# sorted vector of the distinct publication dates present in the data
dates <- df$date

df

Make the data ‘tidy’…

# Keep publications dated 1 Jan 2020 through 8 Jun 2020. The day after that is
# left out because its data had not been fully updated at download time.
# Only the running totals are carried forward; they are given display names
# here because those names end up in the plot legend.
df <- df %>%
  filter(date >= as.Date("2020-01-01"), date <= as.Date("2020-06-08")) %>%
  select(
    date,
    `All Articles` = cumsum_all,
    Published = cumsum_jrnl,
    Preprints = cumsum_pp
  )

# Reshape from wide (one column per article type) to long (one row per
# date/article-type pair), which is the layout ggplot2 expects
# (see https://tidyr.tidyverse.org/).
mytable <- pivot_longer(
  df,
  cols = -date,
  names_to = "Article type",
  values_to = "count"
)
mytable

Graph the data…

# Line chart of the cumulative counts, one coloured line per article type.
p <- ggplot(
  mytable,
  aes(x = date, y = count, colour = `Article type`)
) +
  geom_line() +
  labs(
    title = "Growth of the COVID-19 literature",
    subtitle = "Source: https://icite.od.nih.gov/covid19/search/"
  ) +
  labs(x = "Publication Date", y = "Cumulative Number of Articles") +
  theme_minimal() +
  # legend on the right, without a title above the keys
  theme(legend.title = element_blank(), legend.position = "right")

p

Document session for computational reproducibility

# Print the R version, platform, and attached/loaded package versions.
sessionInfo()

## R version 4.0.0 (2020-04-24)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Mojave 10.14.6
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] forcats_0.5.0   stringr_1.4.0   dplyr_0.8.5     purrr_0.3.4    
## [5] readr_1.3.1     tidyr_1.0.2     tibble_3.0.1    ggplot2_3.3.0  
## [9] tidyverse_1.3.0
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.0.0 xfun_0.13        haven_2.2.0      lattice_0.20-41 
##  [5] colorspace_1.4-1 vctrs_0.3.0      generics_0.0.2   htmltools_0.4.0 
##  [9] yaml_2.2.1       rlang_0.4.6      pillar_1.4.4     glue_1.4.1      
## [13] withr_2.2.0      DBI_1.1.0        dbplyr_1.4.3     modelr_0.1.6    
## [17] readxl_1.3.1     lifecycle_0.2.0  munsell_0.5.0    gtable_0.3.0    
## [21] cellranger_1.1.0 rvest_0.3.5      evaluate_0.14    labeling_0.3    
## [25] knitr_1.28       fansi_0.4.1      broom_0.5.6      Rcpp_1.0.4.6    
## [29] scales_1.1.0     backports_1.1.7  jsonlite_1.6.1   farver_2.0.3    
## [33] fs_1.4.1         hms_0.5.3        digest_0.6.25    stringi_1.4.6   
## [37] grid_4.0.0       cli_2.0.2        tools_4.0.0      magrittr_1.5    
## [41] crayon_1.3.4     pkgconfig_2.0.3  ellipsis_0.3.1   xml2_1.3.2      
## [45] reprex_0.3.0     lubridate_1.7.8  assertthat_0.2.1 rmarkdown_2.1   
## [49] httr_1.4.1       rstudioapi_0.11  R6_2.4.1         nlme_3.1-147    
## [53] compiler_4.0.0