R Markdown

library(XML)
## Warning: package 'XML' was built under R version 3.5.3
library(plyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
#library(xml2)
library(RCurl)
## Loading required package: bitops
library(httr)
library(rvest)
## Warning: package 'rvest' was built under R version 3.5.3
## Loading required package: xml2
## Warning: package 'xml2' was built under R version 3.5.3
## 
## Attaching package: 'rvest'
## The following object is masked from 'package:XML':
## 
##     xml
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 3.5.3
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
#library(jsonlite)
require(RJSONIO)
## Loading required package: RJSONIO
library(DT)
## Warning: package 'DT' was built under R version 3.5.3
library(jsonlite)
## 
## Attaching package: 'jsonlite'
## The following objects are masked from 'package:RJSONIO':
## 
##     fromJSON, toJSON
library(purrr)
## 
## Attaching package: 'purrr'
## The following object is masked from 'package:jsonlite':
## 
##     flatten
## The following object is masked from 'package:rvest':
## 
##     pluck
## The following object is masked from 'package:plyr':
## 
##     compact
library(data.table)
## Warning: package 'data.table' was built under R version 3.5.3
## 
## Attaching package: 'data.table'
## The following object is masked from 'package:purrr':
## 
##     transpose
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last

Reference: https://www.datacamp.com/community/tutorials/r-data-import-tutorial#xml

       https://www.tutorialspoint.com/r/r_xml_files.htm
       
       https://subscription.packtpub.com/book/big_data_and_business_intelligence/9781783989065/1/ch01lvl1sec11/reading-xml-data
       
       https://www.kaggle.com/robertoruiz/how-convert-json-files-into-data-frames-in-r
##Read html document
# Download the HTML version of the book table and parse it with the XML package.
url1 <- "https://raw.githubusercontent.com/AjayArora35/Data-607-Assignment-Week-7/master/Source Week Assignment 7.html"
# The file name contains spaces, which are not valid in a URL. Percent-encode
# them so the request does not depend on how the active download method
# happens to treat raw spaces.
download.file(URLencode(url1), destfile = "scrapedpage1.html", quiet = TRUE)
doc1 <- htmlParse("scrapedpage1.html", encoding = "ISO-8859-1")
# For debugging doc1

# readHTMLTable() is called without `which`, so the tables come back wrapped
# in a list; as.data.frame() flattens that list into a single data frame.
z1 <- as.data.frame(
  readHTMLTable(doc1, stringsAsFactors = FALSE, header = TRUE)
)

# Rename the columns because the flattened names contain "NULL."
names(z1)[1:5] <- c("title", "author", "year", "price", "_category")

z1 %>%
  kable() %>%
  kable_styling()
title author year price _category
Everyday Italian Giada De Laurentiis 2005 30.00 cooking
Harry Potter J K. Rowling 2005 29.99 children
XQuery Kick Start James McGovern,Per Bothner,Kurt Cagle,James Linn,Vaidyanathan Nagarajan 2003 49.99 web
Learning XML Erik T. Ray 2003 39.95 web
# Download the XML version of the book list and parse it with the XML package.
url2 <- "https://raw.githubusercontent.com/AjayArora35/Data-607-Assignment-Week-7/master/Source Week Assignment 7.xml"

# The file name contains spaces, which are not valid in a URL. Percent-encode
# them so the request does not depend on how the active download method
# happens to treat raw spaces.
download.file(URLencode(url2), destfile = "scrapedpage1.xml", quiet = TRUE)
doc2 <- xmlParse("scrapedpage1.xml", encoding = "ISO-8859-1")
# For debugging doc2

rootNode <- xmlRoot(doc2)
# rootNode[1]

# Turn every child of the root node into a data frame and stack the results;
# ldply() adds an `.id` column naming the list element each row came from.
xml <- ldply(xmlToList(rootNode), data.frame)
## Warning in (function (..., row.names = NULL, check.rows = FALSE,
## check.names = TRUE, : row names were found from a short variable and have
## been discarded
# For debugging xml

# Drop the bookkeeping column and the title attribute column.
z2 <- xml %>%
  select(-c(.id, title..attrs))
# NOTE(review): the names are assigned by position, so this relies on ldply()
# returning exactly these nine columns in this order -- confirm if the source
# XML changes.
colnames(z2) <- c(
  "title", "author", "year", "price", "_category",
  "author.1", "author.2", "author.3", "author.4"
)

# Only the first four rows are displayed.
z2[1:4, ] %>%
  kable() %>%
  kable_styling()
title author year price _category author.1 author.2 author.3 author.4
Everyday Italian Giada De Laurentiis 2005 30.00 cooking NA NA NA NA
Harry Potter J K. Rowling 2005 29.99 children NA NA NA NA
XQuery Kick Start James McGovern 2003 49.99 web Per Bothner Kurt Cagle James Linn Vaidyanathan Nagarajan
Learning XML Erik T. Ray 2003 39.95 web NA NA NA NA
url3 <- "https://raw.githubusercontent.com/AjayArora35/Data-607-Assignment-Week-7/master/Source Week Assignment 7.json"

# Parse with RJSONIO explicitly: jsonlite is attached after RJSONIO and masks
# fromJSON(). Despite the "_df" suffix, the result is a nested list, not a
# data frame. The URL is percent-encoded because the file name contains spaces.
pop_proj_data_df <- RJSONIO::fromJSON(URLencode(url3))

# The book records are stored under `book` in the first top-level element.
arr <- pop_proj_data_df[[1]]$book

# Number of book records in the list
iterations <- length(arr)

# Return the first element of a field as a string, or NA when the field is
# missing (NULL), empty, or an empty string. Guarding against NULL matters
# because `if (NULL != "")` is an error.
field_or_na <- function(value) {
  if (is.null(value) || length(value) == 0L || !nzchar(value[1])) {
    return(NA_character_)
  }
  as.character(value[1])
}

# One row per book; columns are title, author, year, price, category.
m <- matrix(NA_character_, nrow = iterations, ncol = 5)

# seq_len() keeps the loop from running at all when there are no books
# (1:0 would iterate over c(1, 0)).
for (i in seq_len(iterations)) {
  book <- arr[[i]]

  # The original code reads the second element of `title` as the title text
  # (presumably the first element is the language attribute -- TODO confirm).
  m[i, 1] <- field_or_na(book[["title"]][2])
  m[i, 3] <- field_or_na(book[["year"]])
  m[i, 4] <- field_or_na(book[["price"]])
  m[i, 5] <- field_or_na(book[["-category"]])

  # Join multiple authors into one cell. The previous code misspelled
  # `collapse`, which pasted a stray space onto every author name.
  authors <- book[["author"]]
  if (!is.na(field_or_na(authors))) {
    m[i, 2] <- paste(authors, collapse = ", ")
  }
}
# For debugging m

# Keep the columns as character, matching the HTML import above.
z3 <- data.frame(m, stringsAsFactors = FALSE)
x <- c("title", "author", "year", "price", "_category")
colnames(z3) <- x
z3 %>%
  kable() %>%
  kable_styling()
title author year price _category
Everyday Italian Giada De Laurentiis 2005 30.00 cooking
Harry Potter J K. Rowling 2005 29.99 children
XQuery Kick Start James McGovern , Per Bothner , Kurt Cagle , James Linn , Vaidyanathan Nagarajan 2003 49.99 web
Learning XML Erik T. Ray 2003 39.95 web

Data frames are different for XML compared with HTML and JSON.

The HTML file produced a table in which multiple authors were listed in a single column. The XML file produced a table in which multiple authors were split into separate columns. Lastly, the JSON file produced a list in which each book's authors were grouped together, so I chose to build a table that stores all the authors in a single column, similar to the HTML result.