Overview

The manually formatted HTML, XML, and JSON files each contain information about the same three books. Each book entry records the title, the author(s), the page count, and the year the book was first published. Using various packages, each file is retrieved from GitHub, parsed, and converted into a data frame, which is then displayed both by printing it and with the kable function.

# Set the knitr chunk option `echo = TRUE` as the default for every chunk.
knitr::opts_chunk$set(echo = TRUE)

#Import libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library('RCurl')
## 
## Attaching package: 'RCurl'
## 
## The following object is masked from 'package:tidyr':
## 
##     complete
library(knitr)
library(htmltools)
library(XML)
library(rjson)
library(rvest)
## 
## Attaching package: 'rvest'
## 
## The following object is masked from 'package:readr':
## 
##     guess_encoding
library(xml2)
library(jsonlite)
## 
## Attaching package: 'jsonlite'
## 
## The following objects are masked from 'package:rjson':
## 
##     fromJSON, toJSON
## 
## The following object is masked from 'package:purrr':
## 
##     flatten
# Read the HTML file and convert its table to a data frame

htmldata <- getURL("https://raw.githubusercontent.com/kr0710/Data607/refs/heads/main/books.html")

# html_table() applied to a whole document returns a list with one tibble per
# <table>. Selecting the table node first makes df_html the tibble itself
# rather than a one-element list, so columns such as df_html$Pages can be
# accessed directly.
df_html <- html_table(html_element(read_html(htmldata), "table"))

df_html
## # A tibble: 3 × 4
##   Book_title                    Authors                     Pages Published_year
##   <chr>                         <chr>                       <int>          <int>
## 1 One hundred years of solitude Gabriel Garcia Marquez        417           1967
## 2 The signal and the noise      Nate Silver                   534           2012
## 3 Nudge                         Richard Thaler, Cass Sunst…   312           2008
kable(df_html)
Book_title Authors Pages Published_year
One hundred years of solitude Gabriel Garcia Marquez 417 1967
The signal and the noise Nate Silver 534 2012
Nudge Richard Thaler, Cass Sunstein 312 2008
# Read the XML file and convert it to a data frame

getxml <- getURL("https://raw.githubusercontent.com/kr0710/Data607/refs/heads/main/books.xml")

getxml2 <- read_xml(getxml)

# One node per <book> element; every column below is derived from this set.
getxml3 <- xml_find_all(getxml2, "//book")

# xml_find_first() yields exactly one result per book (NA when the child
# element is absent), so the columns always have equal length and stay aligned
# row by row. The numeric fields are parsed with xml_integer() so they are
# integers rather than character strings.
xml_df <- data.frame(
  Book_title = xml_text(xml_find_first(getxml3, ".//Book_title")),
  Authors = xml_text(xml_find_first(getxml3, ".//Authors")),
  Pages = xml_integer(xml_find_first(getxml3, ".//Pages")),
  Published_year = xml_integer(xml_find_first(getxml3, ".//Published_year"))
)

xml_df
##                      Book_title                       Authors Pages
## 1 One hundred years of solitude        Gabriel Garcia Marquez   417
## 2      The signal and the noise                   Nate Silver   534
## 3                         Nudge Richard Thaler, Cass Sunstein   312
##   Published_year
## 1           1967
## 2           2012
## 3           2008
kable(xml_df)
Book_title Authors Pages Published_year
One hundred years of solitude Gabriel Garcia Marquez 417 1967
The signal and the noise Nate Silver 534 2012
Nudge Richard Thaler, Cass Sunstein 312 2008
# Read the JSON file and convert it to a data frame

json_data <- getURL("https://raw.githubusercontent.com/kr0710/Data607/refs/heads/main/books.json")

# Both rjson and jsonlite are attached and both export fromJSON(); naming the
# package makes the call independent of the order of the library() calls.
# The parsed document is a list whose "Books" element holds the table, so that
# element is extracted to make json_df the data frame itself.
json_df <- jsonlite::fromJSON(json_data)[["Books"]]

json_df
##                      Book_title                       Authors Pages
## 1 One hundred years of solitude        Gabriel Garcia Marquez   417
## 2      The signal and the noise                   Nate Silver   534
## 3                         Nudge Richard Thaler, Cass Sunstein   312
##   Published_year
## 1           1967
## 2           2012
## 3           2008
kable(json_df)
Book_title Authors Pages Published_year
One hundred years of solitude Gabriel Garcia Marquez 417 1967
The signal and the noise Nate Silver 534 2012
Nudge Richard Thaler, Cass Sunstein 312 2008