# 1 Commercial use subset (includes PMC content) [9000 papers] ----

### Data Source: https://pages.semanticscholar.org/coronavirus-research
library(purrr)
library(tidyverse)
library(jsonlite)
# Load every JSON paper file under `path` into a single tibble, `data`,
# with one row per file and one column per top-level JSON field.
path <- "D:/COVID19/comm_use_subset.tar/comm_use_subset"

# list.files() interprets `pattern` as a regular expression, not a shell
# glob, so match the literal ".json" extension anchored at the end of the name.
temp <- list.files(path, pattern = "\\.json$", full.names = TRUE)

# Fail early with a clear message instead of producing an empty table that
# breaks the column selections further down.
if (length(temp) == 0L) {
  stop("No .json files found in: ", path, call. = FALSE)
}

data <- purrr::map_df(temp, function(x) {
  # NULL fields become NA so every file contributes the same set of columns.
  # NOTE(review): ifelse() with a length-1 test returns a length-1 result, so
  # only the first element of each non-NULL field is kept (e.g. the first
  # entry of `metadata`). The unnest() steps below rely on that shape, so it
  # is intentionally left unchanged here -- confirm this is the intent.
  purrr::map(jsonlite::fromJSON(x), function(y) ifelse(is.null(y), NA, y))
})
head(data)
## # A tibble: 6 x 7
##   paper_id      metadata  abstract body_text bib_entries ref_entries back_matter
##   <chr>         <list>    <list>   <list>    <list>      <list>      <list>     
## 1 000b7d1517ce… <chr [1]> <NULL>   <chr [1]> <named lis… <named lis… <NULL>     
## 2 00142f93c18b… <chr [1]> <chr [1… <chr [21… <named lis… <named lis… <NULL>     
## 3 0022796bb211… <chr [1]> <chr [2… <chr [62… <named lis… <named lis… <NULL>     
## 4 00326efcca08… <chr [1]> <chr [1… <chr [59… <named lis… <named lis… <NULL>     
## 5 00352a58c876… <chr [1]> <chr [1… <chr [21… <named lis… <named lis… <chr [2]>  
## 6 0043d044273b… <chr [1]> <chr [3… <chr [24… <named lis… <named lis… <chr [1]>

# 1.1 Abstracts ----

# Build a wide table of abstracts: one row per paper, with the pieces of its
# `abstract` entry spread across columns named "1", "2", ... in their
# original order.
# Papers whose `abstract` entry is NULL/empty produce no rows in unnest()
# (keep_empty defaults to FALSE) and therefore do not appear in the result.
abs <- data %>%
  # Select by name rather than position so the result does not depend on the
  # key order of the JSON files.
  dplyr::select(paper_id, abstract) %>%
  unnest(abstract) %>%
  group_by(paper_id) %>%
  # Running index of each abstract piece within its paper.
  mutate(ind = row_number()) %>%
  # Drop the grouping once the index exists so it does not carry over into
  # the reshaped table.
  ungroup() %>%
  pivot_wider(names_from = ind, values_from = abstract)


library(DT)

# Render the abstracts as an interactive DT table with CSV/Excel export
# buttons. Row selection is delegated to the DataTables Select extension, so
# DT's own selection mechanism is switched off via selection = "none".
datatable(
  abs,
  selection = "none",
  extensions = c("Select", "Buttons"),
  options = list(
    # OS-style (click / ctrl / shift) selection of whole rows.
    select = list(style = "os", items = "row"),
    # Layout string; the leading "B" places the Buttons toolbar.
    dom = "Blfrtip",
    # NOTE(review): presumably uses the first data column as the row id,
    # following the DT Select extension example -- confirm.
    rowId = 0,
    buttons = c("csv", "excel")
  )
)

# 1.2 Metadata ----

# Build a wide table of metadata: one row per paper, with the elements of its
# `metadata` entry spread across columns named "1", "2", ... in their
# original order.
# Papers whose `metadata` entry is NULL/empty produce no rows in unnest()
# (keep_empty defaults to FALSE) and therefore do not appear in the result.
title <- data %>%
  # Select by name rather than position so the result does not depend on the
  # key order of the JSON files.
  dplyr::select(paper_id, metadata) %>%
  unnest(metadata) %>%
  group_by(paper_id) %>%
  # Running index of each metadata element within its paper.
  mutate(ind = row_number()) %>%
  # Drop the grouping once the index exists so it does not carry over into
  # the reshaped table.
  ungroup() %>%
  pivot_wider(names_from = ind, values_from = metadata)

# Render the metadata table as an interactive DT table with CSV/Excel export
# buttons. Row selection is delegated to the DataTables Select extension, so
# DT's own selection mechanism is switched off via selection = "none".
datatable(
  title,
  selection = "none",
  extensions = c("Select", "Buttons"),
  options = list(
    # OS-style (click / ctrl / shift) selection of whole rows.
    select = list(style = "os", items = "row"),
    # Layout string; the leading "B" places the Buttons toolbar.
    dom = "Blfrtip",
    # NOTE(review): presumably uses the first data column as the row id,
    # following the DT Select extension example -- confirm.
    rowId = 0,
    buttons = c("csv", "excel")
  )
)