library(tidyverse)
library(rvest)
library(xml2)
library(jsonlite)
Read in the HTML file
# Download and parse the HTML version of the book list.
html <- read_html(
  "https://raw.githubusercontent.com/kac624/cuny/main/D607/data/week7_books.html"
)
Explore
# Show every <title> node in the parsed page.
html_elements(html, "title")
## {xml_nodeset (1)}
## [1] <title>This page has a table for D607.</title>
# Show every table cell (<td>) in the parsed page.
html_elements(html, "td")
## {xml_nodeset (18)}
## [1] <td>1</td>
## [2] <td>Iran Between Two Revolutions</td>
## [3] <td>Ervand Abrahamian</td>
## [4] <td>Princeton University Press</td>
## [5] <td>1982</td>
## [6] <td>978-0691101347</td>
## [7] <td>2</td>
## [8] <td>Orientalsm</td>
## [9] <td>Edward Said</td>
## [10] <td>Pantheon Books</td>
## [11] <td>1978</td>
## [12] <td>978-0394428147</td>
## [13] <td>3</td>
## [14] <td>The Arab Spring: Pathways of Repression and Reform</td>
## [15] <td>Jason Brownlee; Tarek Masoud; Andrew Reynolds</td>
## [16] <td>Oxford University Press</td>
## [17] <td>2015</td>
## [18] <td>978-0199660063</td>
Convert to a data frame
# Select the page's <table> element, convert it to a tibble, and render the
# result as a markdown table.
books_df_html <- html_table(html_element(html, "table"))
knitr::kable(books_df_html)
| Index | Title | Author | Publisher | Year Published | ISBN |
|---|---|---|---|---|---|
| 1 | Iran Between Two Revolutions | Ervand Abrahamian | Princeton University Press | 1982 | 978-0691101347 |
| 2 | Orientalsm | Edward Said | Pantheon Books | 1978 | 978-0394428147 |
| 3 | The Arab Spring: Pathways of Repression and Reform | Jason Brownlee; Tarek Masoud; Andrew Reynolds | Oxford University Press | 2015 | 978-0199660063 |
Read in the XML file
# Download and parse the XML version of the book list.
xml <- read_xml(
  "https://raw.githubusercontent.com/kac624/cuny/main/D607/data/week7_books.xml"
)
Explore
# Print the element tree of the parsed XML document.
xml %>% xml_structure()
## <root>
## <row>
## <index>
## {text}
## <title>
## {text}
## <author>
## {text}
## <publisher>
## {text}
## <publish_year>
## {text}
## <isbn>
## {text}
## <row>
## <index>
## {text}
## <title>
## {text}
## <author>
## {text}
## <publisher>
## {text}
## <publish_year>
## {text}
## <isbn>
## {text}
## <row>
## <index>
## {text}
## <title>
## {text}
## <author>
## {text}
## <publisher>
## {text}
## <publish_year>
## {text}
## <isbn>
## {text}
# Show every <title> node anywhere in the XML document.
xml %>% xml_find_all(xpath = "//title")
## {xml_nodeset (3)}
## [1] <title>Iran Between Two Revolutions</title>
## [2] <title>Orientalsm</title>
## [3] <title>The Arab Spring: Pathways of Repression and Reform</title>
Convert to a data frame
# Build one tibble column per field by looking the field up *within* each
# <row>, so every value stays aligned with its own record: a row that lacks a
# field yields NA instead of shifting later values up (or producing columns of
# unequal length). This also avoids assign(), which created global variables
# named `index`, `title`, ... purely as a side effect of the loop.
elements <- c("index", "title", "author", "publisher", "publish_year", "isbn")
book_rows <- xml_find_all(xml, xpath = "//row")
books_df_xml <- elements %>%
  set_names() %>%
  map(function(element) {
    # xml_find_first() on a nodeset returns exactly one (possibly missing)
    # node per input row; xml_text() turns a missing node into NA.
    xml_text(xml_find_first(book_rows, str_c("./", element)))
  }) %>%
  as_tibble()
knitr::kable(books_df_xml)
| index | title | author | publisher | publish_year | isbn |
|---|---|---|---|---|---|
| 1 | Iran Between Two Revolutions | Ervand Abrahamian | Princeton University Press | 1982 | 978-0691101347 |
| 2 | Orientalsm | Edward Said | Pantheon Books | 1978 | 978-0394428147 |
| 3 | The Arab Spring: Pathways of Repression and Reform | Jason Brownlee; Tarek Masoud; Andrew Reynolds | Oxford University Press | 2015 | 978-0199660063 |
Read in the JSON file
# Download and parse the JSON version of the book list.
json <- read_json(
  "https://raw.githubusercontent.com/kac624/cuny/main/D607/data/week7_books.json"
)
Explore
# Inspect the second element of the parsed JSON list.
json[[2]]
## $index
## [1] "2"
##
## $title
## [1] "Orientalsm"
##
## $author
## [1] "Edward Said"
##
## $publisher
## [1] "Pantheon Books"
##
## $publish_year
## [1] "1978"
##
## $isbn
## [1] "978-0394428147"
# Extract the title field of the third record.
json[[3]][["title"]]
## [1] "The Arab Spring: Pathways of Repression and Reform"
Convert to a data frame
# Wrap the list of records in a one-column tibble, then spread each record's
# named fields into their own columns and render the result as a markdown
# table.
books_df_json <- unnest_wider(tibble(json = json), json)
knitr::kable(books_df_json)
| index | title | author | publisher | publish_year | isbn |
|---|---|---|---|---|---|
| 1 | Iran Between Two Revolutions | Ervand Abrahamian | Princeton University Press | 1982 | 978-0691101347 |
| 2 | Orientalsm | Edward Said | Pantheon Books | 1978 | 978-0394428147 |
| 3 | The Arab Spring: Pathways of Repression and Reform | Jason Brownlee; Tarek Masoud; Andrew Reynolds | Oxford University Press | 2015 | 978-0199660063 |