library(data.table)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::between() masks data.table::between()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::first() masks data.table::first()
## ✖ lubridate::hour() masks data.table::hour()
## ✖ lubridate::isoweek() masks data.table::isoweek()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::last() masks data.table::last()
## ✖ lubridate::mday() masks data.table::mday()
## ✖ lubridate::minute() masks data.table::minute()
## ✖ lubridate::month() masks data.table::month()
## ✖ lubridate::quarter() masks data.table::quarter()
## ✖ lubridate::second() masks data.table::second()
## ✖ purrr::transpose() masks data.table::transpose()
## ✖ lubridate::wday() masks data.table::wday()
## ✖ lubridate::week() masks data.table::week()
## ✖ lubridate::yday() masks data.table::yday()
## ✖ lubridate::year() masks data.table::year()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(rvest) #for reading HTML file into R
##
## Attaching package: 'rvest'
##
## The following object is masked from 'package:readr':
##
## guess_encoding
library(jsonlite) #for reading JSON file into R
##
## Attaching package: 'jsonlite'
##
## The following object is masked from 'package:purrr':
##
## flatten
library(xml2)
Let's read an HTML table into R
# Download the HTML page and parse it into an xml_document.
url <- "https://raw.githubusercontent.com/folushoa/Data-Science/Data-607/Assignment%207/books.html"
html <- read_html(url)

# Take the first <table> element on the page and convert it to a tibble.
# The functions are namespace-qualified because the result is stored in a
# variable that shares its name with rvest::html_table().
first_table_node <- rvest::html_element(html, "table")
html_table <- rvest::html_table(first_table_node)

html_table
## # A tibble: 3 × 3
## Name Genre Author
## <chr> <chr> <chr>
## 1 Famous Five Mystery Enid Blyton
## 2 Matilda Magic Roadl Dahl
## 3 Animal Farm Dystopian George Orwell
Let's read a JSON table into R
# Read the JSON document straight from its URL. jsonlite simplifies the
# parsed records into a data frame; flatten = TRUE additionally flattens any
# nested data frames into a single non-nested one.
url <- "https://raw.githubusercontent.com/folushoa/Data-Science/Data-607/Assignment%207/books.json"
json_table <- jsonlite::fromJSON(txt = url, flatten = TRUE)

json_table
## Name Genre Author
## 1 Famous Five Mystery Enid Blyton
## 2 Matilda Magic Roald Dahl
## 3 Animal Farm Dystopian George Orwell
Let's read an XML table into R
# Parse the XML document and build a tibble with one row per <book> element.
url <- "https://raw.githubusercontent.com/folushoa/Data-Science/Data-607/Assignment%207/books.xml"
xml <- read_xml(url)

# Locate the <book> elements once instead of re-querying the whole document
# for every field.
book_nodes <- xml_find_all(xml, ".//book")

# xml_find_first() returns exactly one result per book (a missing node when
# the field is absent), so the three node sets always have the same length and
# stay aligned row-by-row. Collecting each field with an independent
# xml_find_all() would drop absent fields and shift the remaining values onto
# the wrong rows (or make tibble() fail on mismatched lengths).
name_nodes <- xml_find_first(book_nodes, ".//Name")
genre_nodes <- xml_find_first(book_nodes, ".//Genre")
author_nodes <- xml_find_first(book_nodes, ".//Author")

# Extract text from the nodes; a missing node becomes NA_character_.
name <- xml_text(name_nodes)
genre <- xml_text(genre_nodes)
author <- xml_text(author_nodes)

# Put into a data frame (one row per book).
xml_table <- tibble(Name = name, Genre = genre, Author = author)
xml_table
## # A tibble: 3 × 3
## Name Genre Author
## <chr> <chr> <chr>
## 1 Famous Five Mystery Enid Blyton
## 2 Matilda Magic Roald Dahl
## 3 Animal Farm Dystopian George Orwell
The HTML and XML conversions both produce a tibble, whereas the JSON conversion produces a plain data frame. Since a tibble is itself a data frame, all three results are data frames.