Assignment 7

Introduction

##The three input files in htm, json and xml formats are uploaded to GitHub: https://github.com/tponnada/DATA607/blob/master/convertcsv.htm, https://github.com/tponnada/DATA607/blob/master/convertcsv.json and https://github.com/tponnada/DATA607/blob/master/Rfile-2.xml

along with the original csv file - https://raw.githubusercontent.com/tponnada/DATA607/master/Rfile.csv

##The original table is loaded

library(tidyverse)

## ── Attaching packages ───────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──

## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.3     ✓ dplyr   1.0.2
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0

## ── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(RCurl)

## 
## Attaching package: 'RCurl'

## The following object is masked from 'package:tidyr':
## 
##     complete

library(XML)
library(rjson)
library(jsonlite)

## 
## Attaching package: 'jsonlite'

## The following objects are masked from 'package:rjson':
## 
##     fromJSON, toJSON

## The following object is masked from 'package:purrr':
## 
##     flatten

library(DT)
library(plyr)

## ------------------------------------------------------------------------------

## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)

## ------------------------------------------------------------------------------

## 
## Attaching package: 'plyr'

## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

## The following object is masked from 'package:purrr':
## 
##     compact

library(methods)
# Read the original CSV from GitHub; keep text columns as character vectors.
original <- read.csv(
  file = "https://raw.githubusercontent.com/tponnada/DATA607/master/Rfile.csv",
  stringsAsFactors = FALSE
)

## Warning in read.table(file = file, header = header, sep = sep, quote
## = quote, : incomplete final line found by readTableHeader on 'https://
## raw.githubusercontent.com/tponnada/DATA607/master/Rfile.csv'

# Display the original data frame using DT's datatable().
datatable(original)

##Read data in first using an HTML file and create a corresponding data frame. I created the HTML file, uploaded the files to GitHub and subsequently downloaded them to my local computer, from where they are read.

# Parse the locally saved HTML page, then keep the first table it contains.
# NOTE(review): absolute local path - the file must exist on the machine
# running this document.
htmlparsed <- htmlParse(file = "/Users/tponnada/Desktop/DATA607/Rfile.htm")
html_table <- readHTMLTable(htmlparsed, stringsAsFactors = FALSE)[[1]]
# Store the table as a data frame and print it.
html_books <- as.data.frame(html_table)
html_books

##                  Book Name                   Book Author           Topic Pages
## 1 The Romanov\r\n  Sisters               Helen Rappaport Russian History   500
## 2   The Russian Revolution Oliver Figue, Sebastian Polet Russian History  1000
## 3   Just Send Me\r\n  Word                  Oliver Figue Russian History   500
## 4    Midnight at Chernobyl                     Adam Fast Russian History   500
## 5

##Read data in using an xml file and create a corresponding data frame.

##Read data in using a JSON file and create a corresponding data frame.

# Local path of the JSON file to read.
json_url <- "/Users/tponnada/Downloads/convertcsv.json"
# Call jsonlite's reader explicitly: rjson is attached as well and also
# exports fromJSON(), so the bare name resolves to whichever package was
# attached last.
json <- jsonlite::fromJSON(json_url)
json

##                Book Name                   Book Author           Topic Pages
## 1    The Romanov Sisters               Helen Rappaport Russian History   500
## 2 The Russian Revolution Oliver Figue, Sebastian Polet Russian History  1000
## 3      Just Send Me Word                  Oliver Figue Russian History   500
## 4  Midnight at Chernobyl                     Adam Fast Russian History   500

# Coerce the parsed JSON result to a data frame, then print it.
json_books <- as.data.frame(json)
json_books

##                Book Name                   Book Author           Topic Pages
## 1    The Romanov Sisters               Helen Rappaport Russian History   500
## 2 The Russian Revolution Oliver Figue, Sebastian Polet Russian History  1000
## 3      Just Send Me Word                  Oliver Figue Russian History   500
## 4  Midnight at Chernobyl                     Adam Fast Russian History   500

Conclusion

View and compare the three separate data frames. The three data frames are not identical. The JSON data frame matches the original csv file apart from two column-name mismatches, the HTML data frame has an extra blank row and some additional whitespace, and the XML parse returns a single lump of content.

# Open each parsed data frame in the data viewer.
# NOTE(review): xml_books is not created anywhere in the visible code -
# confirm the XML parsing step that defines it is present in the full document.
View(html_books)
View(xml_books)
View(json_books)
# Show the structure (columns and their types) of the HTML-derived data frame.
str(html_books)

## 'data.frame':    5 obs. of  4 variables:
##  $ Book Name  : chr  "The Romanov\r\n  Sisters" "The Russian Revolution" "Just Send Me\r\n  Word" "Midnight at Chernobyl" ...
##  $ Book Author: chr  "Helen Rappaport" "Oliver Figue, Sebastian Polet" "Oliver Figue" "Adam Fast" ...
##  $ Topic      : chr  "Russian History" "Russian History" "Russian History" "Russian History" ...
##  $ Pages      : chr  "500" "1000" "500" "500" ...

# Show the structure of the XML-derived data frame.
str(xml_books)

## 'data.frame':    4 obs. of  8 variables:
##  $ ActiveSheet     : chr  "0" NA NA NA
##  $ Author          : chr  NA "TEJASWI PARKER" NA NA
##  $ Created         : chr  NA "2020-09-07T01:12:06Z" NA NA
##  $ LastSaved       : chr  NA "2020-09-07T02:52:41Z" NA NA
##  $ Style           : chr  NA NA "" NA
##  $ Names           : chr  NA NA NA ""
##  $ Table           : chr  NA NA NA "Book NameBook AuthorTopicPagesThe Romanov SistersHelen RappaportRussian History500The Russian RevolutionOliver "| __truncated__
##  $ WorksheetOptions: chr  NA NA NA "96006003R1C1:R5C4"

# Show the structure of the JSON-derived data frame.
str(json_books)

## 'data.frame':    4 obs. of  4 variables:
##  $ Book Name  : chr  "The Romanov Sisters" "The Russian Revolution" "Just Send Me Word" "Midnight at Chernobyl"
##  $ Book Author: chr  "Helen Rappaport" "Oliver Figue, Sebastian Polet" "Oliver Figue" "Adam Fast"
##  $ Topic      : chr  "Russian History" "Russian History" "Russian History" "Russian History"
##  $ Pages      : int  500 1000 500 500

# Compare the original CSV data with the HTML-derived data frame.
all.equal(original, html_books)

## [1] "Names: 2 string mismatches"                                             
## [2] "Attributes: < Component \"row.names\": Numeric: lengths (4, 5) differ >"
## [3] "Component 1: Lengths (4, 5) differ (string compare on first 4)"         
## [4] "Component 1: 2 string mismatches"                                       
## [5] "Component 2: Lengths (4, 5) differ (string compare on first 4)"         
## [6] "Component \"Topic\": Lengths (4, 5) differ (string compare on first 4)" 
## [7] "Component \"Pages\": Modes: numeric, character"                         
## [8] "Component \"Pages\": Lengths: 4, 5"                                     
## [9] "Component \"Pages\": target is numeric, current is character"

# Compare the original CSV data with the JSON-derived data frame.
all.equal(original, json_books)

## [1] "Names: 2 string mismatches"

# Compare the original CSV data with the XML-derived data frame.
all.equal(original, xml_books)

## [1] "Names: 4 string mismatches"                                   
## [2] "Length mismatch: comparison on first 4 components"            
## [3] "Component 1: 'is.NA' value mismatch: 3 in current 0 in target"
## [4] "Component 2: 'is.NA' value mismatch: 3 in current 0 in target"
## [5] "Component 3: 'is.NA' value mismatch: 3 in current 0 in target"
## [6] "Component 4: Modes: numeric, character"                       
## [7] "Component 4: target is numeric, current is character"