library(rvest)
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Fetch the HTML page, pull out its table(s), and collapse the result into a
# single data frame; then preview the columns and their types.
html_file <-
  "https://raw.githubusercontent.com/j-song-npc/607-Week-7-Assignment/refs/heads/main/books.html" %>%
  read_html() %>%
  html_table() %>%
  as.data.frame()
glimpse(html_file)
## Rows: 3
## Columns: 5
## $ Title <chr> "Feeling Good: The New Mood Therapy", "The Archer", "…
## $ Author <chr> "David D. Burns, M.D.", "Paulo Coelho", "Chip Heath a…
## $ Publication.year <int> 1980, 2020, 2017
## $ Pages <int> 736, 160, 320
## $ Publisher <chr> "Harper Collins", "Knopf", "Simon & Schuster"
library(RCurl)
## Warning: package 'RCurl' was built under R version 4.4.2
library(XML)
## Warning: package 'XML' was built under R version 4.4.2
# Location of the XML version of the book list.
xml_file <- "https://raw.githubusercontent.com/j-song-npc/607-Week-7-Assignment/refs/heads/main/books.xml"
# Download the document once; RCurladdress holds the raw XML text.
RCurladdress <- getURL(xml_file)
# Parse the text that was already downloaded rather than requesting the URL a
# second time, then flatten the parsed document into a data frame.
# Note: from here on `xml_file` is the data frame, not the URL string.
xml_file <- xmlToDataFrame(xmlParse(RCurladdress))
glimpse(xml_file)
## Rows: 3
## Columns: 5
## $ Title <chr> "Feeling Good: The New Mood Therapy", "The Archer", "…
## $ Author <chr> "David D. Burns, M.D.", "Paulo Coelho", "Chip Heath a…
## $ Publication_year <chr> "1980", "2020", "2017"
## $ Pages <chr> "736", "160", "320"
## $ Publisher <chr> "Harper Collins", "Knopf", "Simon & Schuster"
library(jsonlite)
# Read the JSON version of the book list straight from its URL, then preview
# the columns and their types.
json_file <- fromJSON(
  txt = "https://raw.githubusercontent.com/j-song-npc/607-Week-7-Assignment/refs/heads/main/books.json"
)
glimpse(json_file)
## Rows: 3
## Columns: 5
## $ Title <chr> "Feeling Good: The New Mood Therapy", "The Archer", "…
## $ Author <chr> "David D. Burns, M.D.", "Paulo Coelho", "Chip Heath a…
## $ Publication_year <int> 1980, 2020, 2017
## $ Pages <int> 736, 160, 320
## $ Publisher <chr> "Harper Collins", "Knopf", "Simon & Schuster"
Using the methods above to load my files into RStudio, the three data frames appear to contain identical data (the only visible naming difference is that the HTML version labels the year column `Publication.year` rather than `Publication_year`). However, the XML data frame stores every column as character, while the HTML and JSON data frames store the numeric columns (publication year and pages) as integers. I also found the XML file the trickiest to load, as I had to try many different methods before finding one that worked. For these two reasons, I would prefer to work with JSON or HTML unless there is some other rationale for why XML is more efficient.