Overview
The manually formatted HTML, XML, and JSON files each contain information
about three books. Each book entry contains the title of the book, the
author(s), the page count, and the year the book was first published.
Using various packages, each file is retrieved from GitHub, imported,
and converted into a data frame, which is then displayed both by
printing it and with the kable function.
# Set the knitr chunk option `echo = TRUE` so chunk source code is shown in the output.
knitr::opts_chunk$set(echo = TRUE)
#Import libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library('RCurl')
##
## Attaching package: 'RCurl'
##
## The following object is masked from 'package:tidyr':
##
## complete
library(knitr)
library(htmltools)
library(XML)
library(rjson)
library(rvest)
##
## Attaching package: 'rvest'
##
## The following object is masked from 'package:readr':
##
## guess_encoding
library(xml2)
library(jsonlite)
##
## Attaching package: 'jsonlite'
##
## The following objects are masked from 'package:rjson':
##
## fromJSON, toJSON
##
## The following object is masked from 'package:purrr':
##
## flatten
# Read the books table from the HTML file on GitHub and convert it to a
# data frame.
# html_table() applied to a whole document returns a list with one tibble
# per <table> element, so the single books table is extracted with [[1]];
# otherwise df_html would be a list rather than a data frame.
htmldata <- getURL("https://raw.githubusercontent.com/kr0710/Data607/refs/heads/main/books.html")
df_html <- html_table(read_html(htmldata))[[1]]
df_html
## # A tibble: 3 × 4
## Book_title Authors Pages Published_year
## <chr> <chr> <int> <int>
## 1 One hundred years of solitude Gabriel Garcia Marquez 417 1967
## 2 The signal and the noise Nate Silver 534 2012
## 3 Nudge Richard Thaler, Cass Sunst… 312 2008
kable(df_html)
| Book_title | Authors | Pages | Published_year |
| --- | --- | --- | --- |
| One hundred years of solitude | Gabriel Garcia Marquez | 417 | 1967 |
| The signal and the noise | Nate Silver | 534 | 2012 |
| Nudge | Richard Thaler, Cass Sunstein | 312 | 2008 |
# Read the XML file from GitHub and convert it to a data frame with one
# row per <book> node.
getxml <- getURL("https://raw.githubusercontent.com/kr0710/Data607/refs/heads/main/books.xml")
getxml2 <- read_xml(getxml)
getxml3 <- xml_find_all(getxml2, "//book")
# xml_find_first() returns exactly one result per <book> (a missing node,
# read as NA, when a field is absent), so the columns stay aligned row by
# row even if a book lacks a field. Collecting each field with a separate
# xml_find_all() would silently shift values or fail on unequal lengths.
# Pages and Published_year are converted to integer so that their types
# match the data frame built from the HTML table.
xml_df <- data.frame(
  Book_title = xml_text(xml_find_first(getxml3, ".//Book_title")),
  Authors = xml_text(xml_find_first(getxml3, ".//Authors")),
  Pages = as.integer(xml_text(xml_find_first(getxml3, ".//Pages"))),
  Published_year = as.integer(
    xml_text(xml_find_first(getxml3, ".//Published_year"))
  )
)
xml_df
## Book_title Authors Pages
## 1 One hundred years of solitude Gabriel Garcia Marquez 417
## 2 The signal and the noise Nate Silver 534
## 3 Nudge Richard Thaler, Cass Sunstein 312
## Published_year
## 1 1967
## 2 2012
## 3 2008
kable(xml_df)
| Book_title | Authors | Pages | Published_year |
| --- | --- | --- | --- |
| One hundred years of solitude | Gabriel Garcia Marquez | 417 | 1967 |
| The signal and the noise | Nate Silver | 534 | 2012 |
| Nudge | Richard Thaler, Cass Sunstein | 312 | 2008 |
# Read the JSON file from GitHub and convert it to a data frame.
json_data <- getURL("https://raw.githubusercontent.com/kr0710/Data607/refs/heads/main/books.json")
# Both rjson and jsonlite export fromJSON(); the namespace is spelled out
# so the result does not depend on the order in which the packages were
# attached. The parsed document is a list whose "Books" element holds the
# records, so that element is extracted to get the data frame itself.
json_df <- jsonlite::fromJSON(json_data)[["Books"]]
json_df
## Book_title Authors Pages
## 1 One hundred years of solitude Gabriel Garcia Marquez 417
## 2 The signal and the noise Nate Silver 534
## 3 Nudge Richard Thaler, Cass Sunstein 312
## Published_year
## 1 1967
## 2 2012
## 3 2008
kable(json_df)
| Book_title | Authors | Pages | Published_year |
| --- | --- | --- | --- |
| One hundred years of solitude | Gabriel Garcia Marquez | 417 | 1967 |
| The signal and the noise | Nate Silver | 534 | 2012 |
| Nudge | Richard Thaler, Cass Sunstein | 312 | 2008 |