Commercial use subset (includes PMC content) [9000 papers]
### Data Source: https://pages.semanticscholar.org/coronavirus-research
library(purrr)
library(tidyverse)
library(jsonlite)
# Directory containing the extracted comm_use_subset JSON files.
path <- "D:/COVID19/comm_use_subset.tar/comm_use_subset"

# list.files() treats `pattern` as a regular expression, not a shell glob,
# so match the literal ".json" extension anchored at the end of the name.
temp <- list.files(path, pattern = "\\.json$", full.names = TRUE)

# Parse every JSON file and row-bind the per-file results into one tibble.
data <- purrr::map_df(temp, function(x) {
# NOTE(review): ifelse() with a length-1 test returns a length-1 result, so
# every top-level field is reduced to its first element here (NULL fields
# become NA). The printed tibble and the later unnest(abstract) step appear
# to rely on that shape, so the call is intentionally left as-is -- confirm
# before replacing it with a plain if/else.
purrr::map(jsonlite::fromJSON(x), function(y) ifelse(is.null(y), NA, y))
})

# Preview the first rows of the combined table.
head(data)
## # A tibble: 6 x 7
## paper_id metadata abstract body_text bib_entries ref_entries back_matter
## <chr> <list> <list> <list> <list> <list> <list>
## 1 000b7d1517ce… <chr [1]> <NULL> <chr [1]> <named lis… <named lis… <NULL>
## 2 00142f93c18b… <chr [1]> <chr [1… <chr [21… <named lis… <named lis… <NULL>
## 3 0022796bb211… <chr [1]> <chr [2… <chr [62… <named lis… <named lis… <NULL>
## 4 00326efcca08… <chr [1]> <chr [1… <chr [59… <named lis… <named lis… <NULL>
## 5 00352a58c876… <chr [1]> <chr [1… <chr [21… <named lis… <named lis… <chr [2]>
## 6 0043d044273b… <chr [1]> <chr [3… <chr [24… <named lis… <named lis… <chr [1]>
Abstracts
# Build a wide table of abstracts: one row per paper_id and one column per
# abstract entry, with columns named by the entry's position within the paper
# ("1", "2", ...).
# With its default settings, unnest() drops rows whose `abstract` element is
# NULL or empty, so such papers do not appear in the result.
# The name `abs` is kept because the table display further down refers to it.
abs <- data %>%
  select(paper_id, abstract) %>% # pick columns by name, not by position
  unnest(abstract) %>%
  group_by(paper_id) %>%
  mutate(ind = row_number()) %>% # running index of each entry within a paper
  ungroup() %>% # do not leak the grouping into the result
  pivot_wider(names_from = ind, values_from = abstract)
library(DT)
# Render the abstracts table as an interactive widget with the Select and
# Buttons extensions enabled.
datatable(
  data = abs,
  extensions = c("Select", "Buttons"),
  # Turn off DT's own row selection; the Select extension is configured
  # through `options$select` below.
  selection = "none",
  options = list(
    select = list(style = "os", items = "row"),
    dom = "Blfrtip",
    rowId = 0,
    buttons = c("csv", "excel")
  )
)