data607 wk 7

# Location of the HTML version of the book table.
html <- "https://raw.githubusercontent.com/miasiracusa/Data607/master/assignment7/books.html"
# Read the page from the URL, then parse it so readHTMLTable() can use it.
html_books <- htmlParse(read_html(html))
# Extract the table(s) from the parsed page and convert to a data frame.
html_books_df <- as.data.frame(readHTMLTable(html_books))
# Replace the generated column names with readable ones.
names(html_books_df) <- c(
  "title", "author", "year published", "goodreads rating"
)
# Show the resulting data frame.
html_books_df

##                 title                       author year published
## 1        Bad Feminist                   Roxane Gay           2014
## 2   Art and Feminisim Helena Reckitt, Peggy Phelan           2001
## 3 Women, Race & Class                 Angela Davis           1981
##   goodreads rating
## 1              3.9
## 2              4.4
## 3              4.4

# Location of the XML version of the book list.
xml <- "https://raw.githubusercontent.com/miasiracusa/Data607/master/assignment7/books.xml"
# Read the XML document from the URL.
xmldata <- read_xml(xml)
# Parse the document and convert it to a data frame in one step.
xml_books <- xmlToDataFrame(xmlParse(file = xmldata))
# Show the resulting data frame.
xml_books

##                   title                       author year goodreadsrating
## 1          Bad Feminist                   Roxane Gay 2014             3.9
## 2      Art and Feminism Helena Reckitt, Peggy Phelan 2001             4.4
## 3 Women, Race and Class                 Angela Davis 1981             4.4

# Location of the JSON version of the book list.
json <- "https://raw.githubusercontent.com/miasiracusa/Data607/master/assignment7/books.json"
# Check that the file is valid JSON before trying to parse it.
isValidJSON(json)

## [1] TRUE

# Load the JSON file into an R object.
json_books <- fromJSON(json)
# Convert to a data frame, keeping strings as character vectors. With factor
# columns, assigning the combined author name below is rejected as an
# "invalid factor level" and stored as NA with a warning.
json_books <- as.data.frame(json_books, stringsAsFactors = FALSE)
# Drop the second row of the flattened data frame.
# NOTE(review): the extra row presumably comes from the two-author book being
# expanded to one row per author -- confirm against the JSON file.
json_books <- json_books[-2, ]

# Columns 1-4, 5-8 and 9-12 each hold one book. Give every slice the same
# column names so the slices can be stacked into one row per book.
book_columns <- c("title", "author", "year published", "goodreads rating")
book_rows <- lapply(list(1:4, 5:8, 9:12), function(cols) {
  book <- json_books[, cols]
  colnames(book) <- book_columns
  book
})
json_books_df <- do.call(rbind, book_rows)

# The second book has two authors; record both of them in a single cell.
json_books_df$author[2] <- "Helena Reckitt and Peggy Phelan"
# Show the resulting data frame.
json_books_df

##                 title                          author year published
## 1        Bad Feminist                      Roxane Gay           2014
## 2    Art and Feminism Helena Reckitt and Peggy Phelan           2001
## 3 Women, Race & Class                    Angela Davis           1981
##   goodreads rating
## 1              3.9
## 2              4.4
## 3              4.4

The three data frames are very similar, but I found the JSON version to be the most different from the HTML and the XML versions. I had to edit that data frame the most to make it look like the other two.