library(tidyverse)
library(rvest)
library(xml2)
library(jsonlite)

HTML

Read in the file

# Download and parse the HTML copy of the book table from GitHub.
html <- 'https://raw.githubusercontent.com/kac624/cuny/main/D607/data/week7_books.html' %>%
  read_html()

Explore

# Show the <title> node(s) found in the parsed page.
html_elements(html, 'title')
## {xml_nodeset (1)}
## [1] <title>This page has a table for D607.</title>
# List every table cell (<td>) found in the parsed page.
html_elements(html, 'td')
## {xml_nodeset (18)}
##  [1] <td>1</td>
##  [2] <td>Iran Between Two Revolutions</td>
##  [3] <td>Ervand Abrahamian</td>
##  [4] <td>Princeton University Press</td>
##  [5] <td>1982</td>
##  [6] <td>978-0691101347</td>
##  [7] <td>2</td>
##  [8] <td>Orientalsm</td>
##  [9] <td>Edward Said</td>
## [10] <td>Pantheon Books</td>
## [11] <td>1978</td>
## [12] <td>978-0394428147</td>
## [13] <td>3</td>
## [14] <td>The Arab Spring: Pathways of Repression and Reform</td>
## [15] <td>Jason Brownlee; Tarek Masoud; Andrew Reynolds</td>
## [16] <td>Oxford University Press</td>
## [17] <td>2015</td>
## [18] <td>978-0199660063</td>

Convert to dataframe

# Select the page's <table> element and parse it into a data frame.
books_df_html <- html_table(html_element(html, 'table'))

# Render the data frame as a formatted table.
books_df_html %>% knitr::kable()
| Index | Title | Author | Publisher | Year Published | ISBN |
|---|---|---|---|---|---|
| 1 | Iran Between Two Revolutions | Ervand Abrahamian | Princeton University Press | 1982 | 978-0691101347 |
| 2 | Orientalsm | Edward Said | Pantheon Books | 1978 | 978-0394428147 |
| 3 | The Arab Spring: Pathways of Repression and Reform | Jason Brownlee; Tarek Masoud; Andrew Reynolds | Oxford University Press | 2015 | 978-0199660063 |

XML

Read in the file

# Download and parse the XML copy of the book table from GitHub.
xml <- 'https://raw.githubusercontent.com/kac624/cuny/main/D607/data/week7_books.xml' %>%
  read_xml()

Explore

# Print the nesting of elements in the parsed document.
xml %>% xml_structure()
## <root>
##   <row>
##     <index>
##       {text}
##     <title>
##       {text}
##     <author>
##       {text}
##     <publisher>
##       {text}
##     <publish_year>
##       {text}
##     <isbn>
##       {text}
##   <row>
##     <index>
##       {text}
##     <title>
##       {text}
##     <author>
##       {text}
##     <publisher>
##       {text}
##     <publish_year>
##       {text}
##     <isbn>
##       {text}
##   <row>
##     <index>
##       {text}
##     <title>
##       {text}
##     <author>
##       {text}
##     <publisher>
##       {text}
##     <publish_year>
##       {text}
##     <isbn>
##       {text}
# Find every <title> element anywhere in the document.
xml %>% xml_find_all(xpath = '//title')
## {xml_nodeset (3)}
## [1] <title>Iran Between Two Revolutions</title>
## [2] <title>Orientalsm</title>
## [3] <title>The Arab Spring: Pathways of Repression and Reform</title>

Convert to dataframe

# Names of the child elements to pull out of every <row>; these also become
# the column names of the resulting data frame.
elements <- c('index', 'title', 'author', 'publisher', 'publish_year', 'isbn')

# Select the <row> nodes once, then look each field up relative to its own
# row. xml_find_first() yields exactly one result per row (a missing node,
# whose text is NA, when the field is absent), so the columns stay aligned
# row by row even if some row lacks a field. Building the columns as a named
# list also avoids creating one global variable per field.
rows <- xml_find_all(xml, xpath = '//row')

books_df_xml <- elements %>%
  set_names() %>%
  map(function(element) {
    xml_text(xml_find_first(rows, xpath = str_c('./', element)))
  }) %>%
  as_tibble()

# Render the data frame as a formatted table.
knitr::kable(books_df_xml)
| index | title | author | publisher | publish_year | isbn |
|---|---|---|---|---|---|
| 1 | Iran Between Two Revolutions | Ervand Abrahamian | Princeton University Press | 1982 | 978-0691101347 |
| 2 | Orientalsm | Edward Said | Pantheon Books | 1978 | 978-0394428147 |
| 3 | The Arab Spring: Pathways of Repression and Reform | Jason Brownlee; Tarek Masoud; Andrew Reynolds | Oxford University Press | 2015 | 978-0199660063 |

JSON

Read in the file

# Download and parse the JSON copy of the book table from GitHub.
json <- 'https://raw.githubusercontent.com/kac624/cuny/main/D607/data/week7_books.json' %>%
  read_json()

Explore

# Inspect the second record of the parsed JSON.
json[[2L]]
## $index
## [1] "2"
## 
## $title
## [1] "Orientalsm"
## 
## $author
## [1] "Edward Said"
## 
## $publisher
## [1] "Pantheon Books"
## 
## $publish_year
## [1] "1978"
## 
## $isbn
## [1] "978-0394428147"
# Extract the title field of the third record.
json[[3]][['title']]
## [1] "The Arab Spring: Pathways of Repression and Reform"

Convert to dataframe

# Wrap the parsed records in a one-column tibble, then spread each record's
# named fields out into their own columns.
books_df_json <- unnest_wider(tibble(json = json), json)

# Render the data frame as a formatted table.
books_df_json %>% knitr::kable()
| index | title | author | publisher | publish_year | isbn |
|---|---|---|---|---|---|
| 1 | Iran Between Two Revolutions | Ervand Abrahamian | Princeton University Press | 1982 | 978-0691101347 |
| 2 | Orientalsm | Edward Said | Pantheon Books | 1978 | 978-0394428147 |
| 3 | The Arab Spring: Pathways of Repression and Reform | Jason Brownlee; Tarek Masoud; Andrew Reynolds | Oxford University Press | 2015 | 978-0199660063 |