##The three input files in htm, json and xml formats are uploaded to Github https://github.com/tponnada/DATA607/blob/master/convertcsv.htm, https://github.com/tponnada/DATA607/blob/master/convertcsv.json and https://github.com/tponnada/DATA607/blob/master/Rfile-2.xml
along with the original CSV file - https://raw.githubusercontent.com/tponnada/DATA607/master/Rfile.csv
##The original table is loaded
library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.3 ✓ dplyr 1.0.2
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(RCurl)
##
## Attaching package: 'RCurl'
## The following object is masked from 'package:tidyr':
##
## complete
library(XML)
library(rjson)
library(jsonlite)
##
## Attaching package: 'jsonlite'
## The following objects are masked from 'package:rjson':
##
## fromJSON, toJSON
## The following object is masked from 'package:purrr':
##
## flatten
library(DT)
library(plyr)
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following object is masked from 'package:purrr':
##
## compact
library(methods)
# Load the original CSV straight from GitHub, keeping text columns as character.
original <- read.csv(
  file = "https://raw.githubusercontent.com/tponnada/DATA607/master/Rfile.csv",
  stringsAsFactors = FALSE
)
## Warning in read.table(file = file, header = header, sep = sep, quote
## = quote, : incomplete final line found by readTableHeader on 'https://
## raw.githubusercontent.com/tponnada/DATA607/master/Rfile.csv'
# Render the original table as a DT widget.
datatable(data = original)
##Read the data in first from an HTML file and create a corresponding data frame. I created the HTML file, uploaded the files to GitHub, and subsequently downloaded them to my local computer, from where they are read.
# Parse the locally saved HTML copy of the book table.
htmlparsed <- htmlParse(file = "/Users/tponnada/Desktop/DATA607/Rfile.htm")

# Extract the tables from the parsed document and keep only the first one.
html_table <- readHTMLTable(htmlparsed, stringsAsFactors = FALSE)[[1]]

# Coerce to a data frame, then print it.
html_books <- as.data.frame(x = html_table)
html_books
## Book Name Book Author Topic Pages
## 1 The Romanov\r\n Sisters Helen Rappaport Russian History 500
## 2 The Russian Revolution Oliver Figue, Sebastian Polet Russian History 1000
## 3 Just Send Me\r\n Word Oliver Figue Russian History 500
## 4 Midnight at Chernobyl Adam Fast Russian History 500
## 5
##Read data in using an xml file and create a corresponding data frame.
##Read data in using a JSON file and create a corresponding data frame.
# Path to the locally downloaded JSON copy of the book table.
json_url <- "/Users/tponnada/Downloads/convertcsv.json"

# Both rjson and jsonlite are attached and both export fromJSON(). Qualify the
# call so the result does not depend on which package was attached last.
json <- jsonlite::fromJSON(json_url)
json
## Book Name Book Author Topic Pages
## 1 The Romanov Sisters Helen Rappaport Russian History 500
## 2 The Russian Revolution Oliver Figue, Sebastian Polet Russian History 1000
## 3 Just Send Me Word Oliver Figue Russian History 500
## 4 Midnight at Chernobyl Adam Fast Russian History 500
# Coerce the parsed JSON to a data frame, then print it.
json_books <- as.data.frame(x = json)
json_books
## Book Name Book Author Topic Pages
## 1 The Romanov Sisters Helen Rappaport Russian History 500
## 2 The Russian Revolution Oliver Figue, Sebastian Polet Russian History 1000
## 3 Just Send Me Word Oliver Figue Russian History 500
## 4 Midnight at Chernobyl Adam Fast Russian History 500
View and compare the three separate data frames. The three data frames are not identical: the JSON result matches the original CSV file apart from the column names, the HTML result has an extra blank row and stray whitespace in some values, and the XML parse returns the content as one lump rather than as a table.
# View() opens an interactive data viewer, so only call it when a user is
# present; in a non-interactive run (e.g. Rscript or knitting) it is skipped.
if (interactive()) {
  View(html_books)
  View(xml_books)
  View(json_books)
}
# Show the column types and first values of the HTML-derived data frame.
str(object = html_books)
## 'data.frame': 5 obs. of 4 variables:
## $ Book Name : chr "The Romanov\r\n Sisters" "The Russian Revolution" "Just Send Me\r\n Word" "Midnight at Chernobyl" ...
## $ Book Author: chr "Helen Rappaport" "Oliver Figue, Sebastian Polet" "Oliver Figue" "Adam Fast" ...
## $ Topic : chr "Russian History" "Russian History" "Russian History" "Russian History" ...
## $ Pages : chr "500" "1000" "500" "500" ...
# Show the column types and first values of the XML-derived data frame.
str(object = xml_books)
## 'data.frame': 4 obs. of 8 variables:
## $ ActiveSheet : chr "0" NA NA NA
## $ Author : chr NA "TEJASWI PARKER" NA NA
## $ Created : chr NA "2020-09-07T01:12:06Z" NA NA
## $ LastSaved : chr NA "2020-09-07T02:52:41Z" NA NA
## $ Style : chr NA NA "" NA
## $ Names : chr NA NA NA ""
## $ Table : chr NA NA NA "Book NameBook AuthorTopicPagesThe Romanov SistersHelen RappaportRussian History500The Russian RevolutionOliver "| __truncated__
## $ WorksheetOptions: chr NA NA NA "96006003R1C1:R5C4"
# Show the column types and first values of the JSON-derived data frame.
str(object = json_books)
## 'data.frame': 4 obs. of 4 variables:
## $ Book Name : chr "The Romanov Sisters" "The Russian Revolution" "Just Send Me Word" "Midnight at Chernobyl"
## $ Book Author: chr "Helen Rappaport" "Oliver Figue, Sebastian Polet" "Oliver Figue" "Adam Fast"
## $ Topic : chr "Russian History" "Russian History" "Russian History" "Russian History"
## $ Pages : int 500 1000 500 500
# List the differences between the original CSV table and the HTML result.
all.equal(target = original, current = html_books)
## [1] "Names: 2 string mismatches"
## [2] "Attributes: < Component \"row.names\": Numeric: lengths (4, 5) differ >"
## [3] "Component 1: Lengths (4, 5) differ (string compare on first 4)"
## [4] "Component 1: 2 string mismatches"
## [5] "Component 2: Lengths (4, 5) differ (string compare on first 4)"
## [6] "Component \"Topic\": Lengths (4, 5) differ (string compare on first 4)"
## [7] "Component \"Pages\": Modes: numeric, character"
## [8] "Component \"Pages\": Lengths: 4, 5"
## [9] "Component \"Pages\": target is numeric, current is character"
# List the differences between the original CSV table and the JSON result.
all.equal(target = original, current = json_books)
## [1] "Names: 2 string mismatches"
# List the differences between the original CSV table and the XML result.
all.equal(target = original, current = xml_books)
## [1] "Names: 4 string mismatches"
## [2] "Length mismatch: comparison on first 4 components"
## [3] "Component 1: 'is.NA' value mismatch: 3 in current 0 in target"
## [4] "Component 2: 'is.NA' value mismatch: 3 in current 0 in target"
## [5] "Component 3: 'is.NA' value mismatch: 3 in current 0 in target"
## [6] "Component 4: Modes: numeric, character"
## [7] "Component 4: target is numeric, current is character"