Brian_Singh_DATA607

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(RCurl)

1. JSON

library(jsonlite)

# Read the JSON book list from GitHub; simplifyVector = TRUE lets jsonlite
# collapse the array of records into a data frame nested inside the
# returned list.
# Local copy used during development:
#   /Users/briansingh/Desktop/CUNY/Data607/Week7/sportbooks.json
sport_books_json <- read_json(
  path = "https://raw.githubusercontent.com/brsingh7/DATA607/main/Week7/sportbooks.json",
  simplifyVector = TRUE
)

# Flatten the parsed list into one data frame; the top-level JSON key ends up
# as a prefix on every column name (see the printed output).
sport_books_json2 <- sport_books_json %>%
  as.data.frame()
sport_books_json2

##                     Favorite.Sport.Books.Title Favorite.Sport.Books.Author.s.
## 1                          Friday Night Lights H.G. Bizzinger, Buzz Bizzinger
## 2 Moneyball: The Art of Winning an Unfair Game                  Michael Lewis
## 3              The Mamba Mentality: How I Play                    Kobe Bryant
##   Favorite.Sport.Books.Released.Date Favorite.Sport.Books.Rating..Amazon.
## 1                            8/11/15                                  4.5
## 2                            3/17/04                                  4.7
## 3                           10/23/18                                  4.9

2. XML

I had trouble with the XML version. I validated the XML file I created and the validator reports that it is valid, but I’m not sure how to read it in and convert it to a data frame.

library(XML)

# Download the raw XML text and parse it into an internal (C-level) document
# so that XPath queries can be run against it.
url <- "https://raw.githubusercontent.com/brsingh7/DATA607/main/Week7/sportbooks.xml"
data <- getURL(url)
sport_books_xml <- xmlParse(data, asText = TRUE)

# The previous query looked for <Frame> elements and read their attributes,
# which matches nothing in this document and produced an empty tibble.
# Instead, treat as a "record" every element whose children include plain
# leaf elements (elements with no element children of their own).
# NOTE(review): assumes each book is stored as one element with one child
# element per field (title, author, ...) -- TODO confirm against
# sportbooks.xml; attribute-based or deeper-nested layouts need another query.
book_nodes <- getNodeSet(sport_books_xml, "//*[*[not(*)]]")

# Fail loudly instead of silently returning a 0 x 0 result.
if (length(book_nodes) == 0) {
  stop("No record elements found in sportbooks.xml; check the document layout.",
       call. = FALSE)
}

# One row per record node, one column per child element.
sport_books_xml2 <- xmlToDataFrame(nodes = book_nodes, stringsAsFactors = FALSE)
sport_books_xml2

## # A tibble: 0 × 0

3. HTML

library(XML)

# Download the raw HTML text and parse it into an internal document.
# asText = TRUE states explicitly that data2 holds markup, not a file path.
url2 <- "https://raw.githubusercontent.com/brsingh7/DATA607/main/Week7/sportbooks.html"
data2 <- getURL(url2)
sport_books_html <- htmlTreeParse(data2, useInternalNodes = TRUE, asText = TRUE)

# Extract every <table> in the page as a list of data frames.
# The argument was previously misspelled "stringAsFactors", so it was never
# applied; the correct name is stringsAsFactors.
sport_books_html2 <- readHTMLTable(sport_books_html, stringsAsFactors = FALSE)
sport_books_html2

## $`NULL`
##                                          Title                      Author(s)
## 1                          Friday Night Lights H.G. Bizzinger, Buzz Bizzinger
## 2 Moneyball: The Art of Winning an Unfair Game                  Michael Lewis
## 3              The Mamba Mentality: How I Play                    Kobe Bryant
##   Release Date Rating
## 1    8/11/2015    4.5
## 2    3/17/2004    4.7
## 3   10/23/2018    4.9

# Tidy the first table parsed from the HTML page: strip commas from every
# column except the title, rename the columns, and put Title first.
books_raw <- sport_books_html2[[1]]

# Remove literal commas (fixed = TRUE: no regex interpretation).
strip_commas <- function(x) gsub(",", "", as.character(x), fixed = TRUE)

# lapply() + drop = FALSE always yields one list element per column, so the
# result stays a multi-column data frame even when the table has a single
# row (sapply() would collapse that case to a plain vector).
sport_books_html3 <- as.data.frame(
  lapply(books_raw[, -1, drop = FALSE], strip_commas),
  stringsAsFactors = FALSE
)

names(sport_books_html3) <- c("Author(s)", "Release_Date", "Rating")
sport_books_html3$Title <- books_raw[, 1]

# Reorder by name rather than by position so the intent is explicit.
sport_books_html3 <- sport_books_html3[, c("Title", "Author(s)", "Release_Date", "Rating")]
sport_books_html3

##                                          Title                     Author(s)
## 1                          Friday Night Lights H.G. Bizzinger Buzz Bizzinger
## 2 Moneyball: The Art of Winning an Unfair Game                 Michael Lewis
## 3              The Mamba Mentality: How I Play                   Kobe Bryant
##   Release_Date Rating
## 1    8/11/2015    4.5
## 2    3/17/2004    4.7
## 3   10/23/2018    4.9

Conclusion

My data frames (the ones I was able to create successfully) are pretty much identical. With a little more understanding of the file formats and how to work with them in R, I think they would be identical regardless of the source type (HTML, XML, JSON).

Brian_Singh_DATA607_Week7

Brian Singh

3/19/2022

1. JSON

2. XML

3. HTML

Conclusion