Load libraries

library(data.table)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::between()     masks data.table::between()
## ✖ dplyr::filter()      masks stats::filter()
## ✖ dplyr::first()       masks data.table::first()
## ✖ lubridate::hour()    masks data.table::hour()
## ✖ lubridate::isoweek() masks data.table::isoweek()
## ✖ dplyr::lag()         masks stats::lag()
## ✖ dplyr::last()        masks data.table::last()
## ✖ lubridate::mday()    masks data.table::mday()
## ✖ lubridate::minute()  masks data.table::minute()
## ✖ lubridate::month()   masks data.table::month()
## ✖ lubridate::quarter() masks data.table::quarter()
## ✖ lubridate::second()  masks data.table::second()
## ✖ purrr::transpose()   masks data.table::transpose()
## ✖ lubridate::wday()    masks data.table::wday()
## ✖ lubridate::week()    masks data.table::week()
## ✖ lubridate::yday()    masks data.table::yday()
## ✖ lubridate::year()    masks data.table::year()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(rvest) #for reading HTML file into R
## 
## Attaching package: 'rvest'
## 
## The following object is masked from 'package:readr':
## 
##     guess_encoding
library(jsonlite) #for reading JSON file into R
## 
## Attaching package: 'jsonlite'
## 
## The following object is masked from 'package:purrr':
## 
##     flatten
library(xml2)

HTML Format

Let's read an HTML table into R

Save HTML URL into R

# Location of the raw HTML file that holds the books table.
url <- "https://raw.githubusercontent.com/folushoa/Data-Science/Data-607/Assignment%207/books.html"

# Fetch the page and parse it into a document that rvest can query.
html <- url %>% read_html()

Convert to data frame

# Select the first <table> element of the parsed page and convert it to a
# tibble. The call is namespace-qualified because the result is stored in a
# variable that has the same name as the rvest function.
html_table <- rvest::html_table(html_element(html, "table"))

# Print the resulting table.
html_table
## # A tibble: 3 × 3
##   Name        Genre     Author       
##   <chr>       <chr>     <chr>        
## 1 Famous Five Mystery   Enid Blyton  
## 2 Matilda     Magic     Roadl Dahl   
## 3 Animal Farm Dystopian George Orwell

JSON Format

Let's read a JSON table into R

Save JSON URL into R

# Location of the raw JSON file that holds the books table.
url <- "https://raw.githubusercontent.com/folushoa/Data-Science/Data-607/Assignment%207/books.json"

# Read the JSON table and convert it to a data frame; flatten = TRUE asks
# jsonlite to flatten any nested data frames into ordinary columns.
json_table <- url %>% fromJSON(flatten = TRUE)

# Print the resulting table.
json_table
##          Name     Genre        Author
## 1 Famous Five   Mystery   Enid Blyton
## 2     Matilda     Magic    Roald Dahl
## 3 Animal Farm Dystopian George Orwell

XML Format

Let's read an XML table into R

Save XML URL into R

# Location of the raw XML file that holds the books table.
url <- "https://raw.githubusercontent.com/folushoa/Data-Science/Data-607/Assignment%207/books.xml"

# Fetch the file and parse it into an XML document.
xml <- url %>% read_xml()

Convert to data frame

# Locate every <book> element once; each book becomes one row of the table.
book_nodes <- xml_find_all(xml, ".//book")

# Look up each field per book with xml_find_first(), which returns exactly one
# result for every input node (a missing-node placeholder when the field is
# absent). This keeps the three node sets the same length and aligned by book.
# Collecting all Name/Genre/Author nodes independently would shift or misalign
# rows as soon as one book lacked a field.
name_nodes <- xml_find_first(book_nodes, ".//Name")     # name node per book
genre_nodes <- xml_find_first(book_nodes, ".//Genre")   # genre node per book
author_nodes <- xml_find_first(book_nodes, ".//Author") # author node per book

# Extract text from the nodes (a missing field yields NA)
name <- xml_text(name_nodes)
genre <- xml_text(genre_nodes)
author <- xml_text(author_nodes)

# Put into a data frame, one row per book
xml_table <- tibble(Name = name, Genre = genre, Author = author)

# Print the resulting table.
xml_table
## # A tibble: 3 × 3
##   Name        Genre     Author       
##   <chr>       <chr>     <chr>        
## 1 Famous Five Mystery   Enid Blyton  
## 2 Matilda     Magic     Roald Dahl   
## 3 Animal Farm Dystopian George Orwell

The HTML and XML conversions both produce the same kind of object, a tibble. The JSON conversion, however, produces a base R data frame. Since a tibble is itself a data frame, all three results are data frames.