R Markdown
library(XML)
## Warning: package 'XML' was built under R version 3.5.3
library(plyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#library(xml2)
library(RCurl)
## Loading required package: bitops
library(httr)
library(rvest)
## Warning: package 'rvest' was built under R version 3.5.3
## Loading required package: xml2
## Warning: package 'xml2' was built under R version 3.5.3
##
## Attaching package: 'rvest'
## The following object is masked from 'package:XML':
##
## xml
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 3.5.3
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
#library(jsonlite)
require(RJSONIO)
## Loading required package: RJSONIO
library(DT)
## Warning: package 'DT' was built under R version 3.5.3
library(jsonlite)
##
## Attaching package: 'jsonlite'
## The following objects are masked from 'package:RJSONIO':
##
## fromJSON, toJSON
library(purrr)
##
## Attaching package: 'purrr'
## The following object is masked from 'package:jsonlite':
##
## flatten
## The following object is masked from 'package:rvest':
##
## pluck
## The following object is masked from 'package:plyr':
##
## compact
library(data.table)
## Warning: package 'data.table' was built under R version 3.5.3
##
## Attaching package: 'data.table'
## The following object is masked from 'package:purrr':
##
## transpose
## The following objects are masked from 'package:dplyr':
##
## between, first, last
Reference: https://www.datacamp.com/community/tutorials/r-data-import-tutorial#xml
https://www.tutorialspoint.com/r/r_xml_files.htm
https://subscription.packtpub.com/book/big_data_and_business_intelligence/9781783989065/1/ch01lvl1sec11/reading-xml-data
https://www.kaggle.com/robertoruiz/how-convert-json-files-into-data-frames-in-r
## Read HTML document
# Download the HTML version of the book table and parse it into a data frame.
url1 <- "https://raw.githubusercontent.com/AjayArora35/Data-607-Assignment-Week-7/master/Source Week Assignment 7.html"
# The file name contains spaces, so percent-encode the URL before fetching;
# a raw space is not a valid URL character and not every download method
# tolerates it. url1 itself is left unchanged.
download.file(URLencode(url1), destfile = "scrapedpage1.html", quiet = TRUE)
doc1 <- htmlParse("scrapedpage1.html", encoding = "ISO-8859-1")
# For debugging: doc1
z1 <- as.data.frame(readHTMLTable(doc1, stringsAsFactors = FALSE, header = TRUE))
# Rename the first five columns because their names contain "NULL."
names(z1)[1:5] <- c("title", "author", "year", "price", "_category")
z1 %>% kable() %>% kable_styling()
|
title
|
author
|
year
|
price
|
_category
|
|
Everyday Italian
|
Giada De Laurentiis
|
2005
|
30.00
|
cooking
|
|
Harry Potter
|
J K. Rowling
|
2005
|
29.99
|
children
|
|
XQuery Kick Start
|
James McGovern,Per Bothner,Kurt Cagle,James Linn,Vaidyanathan Nagarajan
|
2003
|
49.99
|
web
|
|
Learning XML
|
Erik T. Ray
|
2003
|
39.95
|
web
|
|
|
|
|
|
|
# Download the XML version of the book table and flatten it into a data frame.
url2 <- "https://raw.githubusercontent.com/AjayArora35/Data-607-Assignment-Week-7/master/Source Week Assignment 7.xml"
# The file name contains spaces, so percent-encode the URL before fetching;
# a raw space is not a valid URL character and not every download method
# tolerates it. url2 itself is left unchanged.
download.file(URLencode(url2), destfile = "scrapedpage1.xml", quiet = TRUE)
doc2 <- xmlParse("scrapedpage1.xml", encoding = "ISO-8859-1")
# For debugging: doc2
rootNode <- xmlRoot(doc2)
# rootNode[1]
# Convert each child of the root node to a one-row data frame and row-bind
# them; ldply() adds an ".id" column holding the list element names.
xml <- ldply(xmlToList(rootNode), data.frame)
## Warning in (function (..., row.names = NULL, check.rows = FALSE,
## check.names = TRUE, : row names were found from a short variable and have
## been discarded
# For debugging: xml
# Drop the ".id" and "title..attrs" columns, then give the nine remaining
# columns readable names in a single step.
z2 <- setNames(
  select(xml, -c(.id, title..attrs)),
  c('title', 'author', 'year', 'price', '_category',
    'author.1', 'author.2', 'author.3', 'author.4')
)
# Render only the first four rows as a styled table.
kable_styling(kable(z2[1:4, ]))
|
title
|
author
|
year
|
price
|
_category
|
author.1
|
author.2
|
author.3
|
author.4
|
|
Everyday Italian
|
Giada De Laurentiis
|
2005
|
30.00
|
cooking
|
NA
|
NA
|
NA
|
NA
|
|
Harry Potter
|
J K. Rowling
|
2005
|
29.99
|
children
|
NA
|
NA
|
NA
|
NA
|
|
XQuery Kick Start
|
James McGovern
|
2003
|
49.99
|
web
|
Per Bothner
|
Kurt Cagle
|
James Linn
|
Vaidyanathan Nagarajan
|
|
Learning XML
|
Erik T. Ray
|
2003
|
39.95
|
web
|
NA
|
NA
|
NA
|
NA
|
# Read the JSON version of the book table and flatten it into a data frame.
url3 <- "https://raw.githubusercontent.com/AjayArora35/Data-607-Assignment-Week-7/master/Source Week Assignment 7.json"
# The file name contains spaces, so percent-encode the URL before reading it.
# NOTE: despite its name, pop_proj_data_df holds the parsed JSON as a nested
# list, not a data frame; the name is kept in case later code refers to it.
pop_proj_data_df <- RJSONIO::fromJSON(URLencode(url3))
# The books are the elements of the "book" entry of the first top-level item.
books <- pop_proj_data_df[[1]]$book
book_cols <- c("title", "author", "year", "price", "_category")

# Return the first element of `value` as a string, or NA when the field is
# missing or empty (the original code errored on a missing field).
field_or_na <- function(value) {
  if (length(value) == 0L) {
    return(NA_character_)
  }
  first <- as.character(value[[1]])
  if (length(first) != 1L || is.na(first) || !nzchar(first)) {
    NA_character_
  } else {
    first
  }
}

# Turn one parsed book into a character vector ordered like `book_cols`.
book_to_row <- function(book) {
  # The title text is taken from the second element of `title`, exactly as
  # before; presumably the first element holds an attribute -- TODO confirm
  # against the JSON source.
  title <- book[["title"]]
  title_text <- if (length(title) >= 2L) field_or_na(title[2]) else NA_character_

  # Join multiple authors with ", ". The original call misspelled `collapse`
  # as `collaspe`, which appended a stray space to every author name instead
  # of collapsing the vector.
  authors <- book[["author"]]
  author_text <- if (is.na(field_or_na(authors))) {
    NA_character_
  } else {
    paste(authors, collapse = ", ")
  }

  c(
    title_text,
    author_text,
    field_or_na(book[["year"]]),
    field_or_na(book[["price"]]),
    field_or_na(book[["-category"]])
  )
}

# One matrix row per book; seq_along() keeps the loop safe when there are no
# books at all.
m <- matrix(NA_character_, nrow = length(books), ncol = length(book_cols))
for (i in seq_along(books)) {
  m[i, ] <- book_to_row(books[[i]])
}
# For debugging: m
# Keep the columns as character (not factor) on R < 4.0, matching how the
# HTML table is read above.
z3 <- data.frame(m, stringsAsFactors = FALSE)
colnames(z3) <- book_cols
z3 %>% kable() %>% kable_styling()
|
title
|
author
|
year
|
price
|
_category
|
|
Everyday Italian
|
Giada De Laurentiis
|
2005
|
30.00
|
cooking
|
|
Harry Potter
|
J K. Rowling
|
2005
|
29.99
|
children
|
|
XQuery Kick Start
|
James McGovern , Per Bothner , Kurt Cagle , James Linn , Vaidyanathan Nagarajan
|
2003
|
49.99
|
web
|
|
Learning XML
|
Erik T. Ray
|
2003
|
39.95
|
web
|