library(RCurl)
library(XML)
library(stringr)
library(htmlTable)
library(magrittr)
library(rvest)
library(dplyr)
library(knitr)
library(kableExtra)
library(jsonlite)
Snapshot of part of the raw XML file that I created.
# Download the raw XML document; getURL() hands it back as one string.
books_xml_raw <- getURL('https://raw.githubusercontent.com/zachalexander/data607_cunysps/master/Homework5/books.xml')

# Placeholder frame: 5 rows x 7 columns, every cell NA.
books_xml_df <- data.frame(matrix(NA, nrow = 5, ncol = 7))

# Label the seven columns directly (replaces the default X1..X7 names).
names(books_xml_df) <- c(
  "title",
  "date_pub",
  "famous_quote",
  "amazon_rating",
  "author_first_name",
  "author_last_name",
  "author_birthplace"
)
# Pull the text of every <tag>...</tag> element out of a raw XML string.
#
# raw: character string holding the XML document.
# tag: element name without angle brackets, e.g. "title".
#
# Returns a character vector with one entry per matched element, with all
# simple <name> / </name> markup stripped from each entry.
#
# The match is lazy and anchored on the element's own closing tag, so two
# elements that share a line come back as separate values instead of being
# merged by a greedy match that runs to the last '>' on the line. `.` does
# not cross newlines here, so an element whose text spans several lines is
# not matched.
extract_xml_tag <- function(raw, tag) {
  element_pattern <- paste0("<", tag, ">.*?</", tag, ">")
  elements <- unlist(str_extract_all(raw, element_pattern))
  # remove the opening/closing markup, leaving only the element text
  str_replace_all(elements, "</?\\w+>", "")
}

# using regex to find the data needed: one vector per XML element name
book_xml_titles <- extract_xml_tag(books_xml_raw, "title")
book_xml_dt_pub <- extract_xml_tag(books_xml_raw, "date_published")
book_xml_famous_quote <- extract_xml_tag(books_xml_raw, "famous_quote_from_book")
book_xml_amazon <- extract_xml_tag(books_xml_raw, "amazon_review_stars")
book_xml_author_fn <- extract_xml_tag(books_xml_raw, "first_name")
book_xml_author_ln <- extract_xml_tag(books_xml_raw, "last_name")
book_xml_author_bp <- extract_xml_tag(books_xml_raw, "birthplace")
# Labels identifying which book each record belongs to.
book_xml_num <- c('Book 1', 'Book 2', 'Book 3')

# Render the frame as a striped table with every column centred. The
# alignment vector is sized from the frame itself, so it always holds exactly
# one entry per column rather than a hard-coded count.
kable(books_xml_df, align = rep("c", ncol(books_xml_df))) %>%
  kable_styling(bootstrap_options = c("striped"))
| Book.Number | Title | Date.Published | Famous.Quote | Amazon.Rating | Author.First.Name | Author.Last.Name | Author.Birthplace |
|---|---|---|---|---|---|---|---|
| Book 1 | The Origin of Species | 1859 | “We will now discuss in a little more detail the Struggle for Existence.” | 4 out of 5 | Charles | Darwin | Shrewsbury, United Kingdom |
| Book 2 | Sapiens: A Brief History of Humankind | 2011 | “Each year the US population spends more money on diets than the amount needed to feed all the hungry people in the rest of the world.” | 4.5 out of 5 | Yuval | Noah Harari | Kiryat Ata, Israel |
| Book 3 | The Feynman Lectures on Physics | 1963 | “…every object is a mixture of lots of things, so we can deal with it only as a series of approximations and idealizations.” | 4.5 out of 5 | Richard | Feynman | Queens, New York City |
| Book 3 | The Feynman Lectures on Physics | 1963 | “…every object is a mixture of lots of things, so we can deal with it only as a series of approximations and idealizations.” | 4.5 out of 5 | Robert | Leighton | Detroit, Michigan |
| Book 3 | The Feynman Lectures on Physics | 1963 | “…every object is a mixture of lots of things, so we can deal with it only as a series of approximations and idealizations.” | 4.5 out of 5 | Matthew | Sands | Oxford, Massachusetts |
Snapshot of part of the raw HTML file that I created.
# Parse the HTML version of the book table directly from GitHub.
books_html_raw <- read_html('https://raw.githubusercontent.com/zachalexander/data607_cunysps/master/Homework5/books.html')

# Text of every node carrying class "book_num".
book_html_num <- html_text(
  html_nodes(books_html_raw, xpath = '//*[@class="book_num"]')
)

# Text of every <td> cell; the remaining fields are mined from this vector.
book_html_text <- html_text(html_nodes(books_html_raw, "td"))

# Text of every node carrying class "auth_first_name".
book_html_author_fn <- html_text(
  html_nodes(books_html_raw, xpath = '//*[@class="auth_first_name"]')
)

# Text of every node carrying class "auth_last_name".
book_html_author_ln <- html_text(
  html_nodes(books_html_raw, xpath = '//*[@class="auth_last_name"]')
)

# Text of every node carrying class "book_title".
book_html_titles <- html_text(
  html_nodes(books_html_raw, xpath = '//*[@class="book_title"]')
)

# Text of every node carrying class "book_rating".
book_html_amazon <- html_text(
  html_nodes(books_html_raw, xpath = '//*[@class="book_rating"]')
)
# Drop repeated cell values, keeping the first occurrence of each.
book_html_text <- unique(book_html_text)

# The first seven distinct cells are treated as the column headers.
n_header_cells <- 7
book_html_column_headers <- head(book_html_text, n_header_cells)

# Everything after the header cells is table data. Negative indexing returns
# an empty vector when there are no data cells, whereas a range such as
# 8:length(x) would count backwards and pick up NA and a header cell.
book_html_text <- book_html_text[-seq_len(n_header_cells)]

# Strip newline characters and collapse runs of 2-6 whitespace characters to
# a single space so the later regex work sees clean, single-line cell text.
book_html_text <- str_replace_all(book_html_text, '\n', '')
book_html_text <- str_replace_all(book_html_text, '\\s{2,6}', ' ')
# Publication years: every run of four digits found in the cell text.
book_html_dt_pub <- book_html_text %>%
  str_extract_all('\\d{4}') %>%
  unlist()

# Birthplaces: cells shaped like "<text>, <text>". Regex is used here rather
# than html_nodes purely as regex practice.
book_html_author_bp <- book_html_text %>%
  str_extract_all('\\w.+,\\s\\w.+') %>%
  unlist()
# The third match is discarded. NOTE(review): presumably a non-birthplace
# cell that also contains a comma -- confirm against the source table.
book_html_author_bp <- book_html_author_bp[-3]

# Famous quotes: text starting at an opening smart quote, with the smart
# quote characters themselves removed afterwards.
book_html_famous_quote <- book_html_text %>%
  str_extract_all('“.+?\\w+.+') %>%
  unlist() %>%
  str_replace_all('[“”]', '')

# Render the assembled frame as a striped table with eight centred columns.
# books_html_df is not built in this stretch of code and is expected to
# exist by this point.
books_html_df %>%
  kable(align = rep('c', 8)) %>%
  kable_styling(bootstrap_options = "striped")
| Book.Number | Title | Date.Published | Famous.Quote | Amazon.Rating | Author.First.Name | Author.Last.Name | Author.Birthplace |
|---|---|---|---|---|---|---|---|
| Book #1 | The Origin of Species | 1859 | We will now discuss in a little more detail the Struggle for Existence. | 4 out of 5 | Charles | Darwin | Shrewsbury, United Kingdom |
| Book #2 | Sapiens: A Brief History of Humankind | 2011 | Each year the US population spends more money on diets than the amount needed to feed all the hungry people in the rest of the world. | 4.5 out of 5 | Yuval | Noah Harari | Kiryat Ata, Israel |
| Book #3 | The Feynman Lectures on Physics | 1963 | …every object is a mixture of lots of things, so we can deal with it only as a series of approximations and idealizations. | 4.5 out of 5 | Richard | Feynman | Queens, New York City |
| Book #3 | The Feynman Lectures on Physics | 1963 | …every object is a mixture of lots of things, so we can deal with it only as a series of approximations and idealizations. | 4.5 out of 5 | Robert | Leighton | Detroit, Michigan |
| Book #3 | The Feynman Lectures on Physics | 1963 | …every object is a mixture of lots of things, so we can deal with it only as a series of approximations and idealizations. | 4.5 out of 5 | Matthew | Sands | Oxford, Massachusetts |
Snapshot of part of the raw JSON file that I created.
# Parse the JSON document straight from GitHub.
books_json_raw <- fromJSON('https://raw.githubusercontent.com/zachalexander/data607_cunysps/master/Homework5/books.json')

# Short handles on the first entries of the books and authors collections.
json_favorites <- books_json_raw$myFavoriteBooks
json_books <- json_favorites$books[[1]]
json_authors <- json_favorites$authors[[1]]

# Book-level fields.
book_json_titles <- json_books$title
book_json_dt_pub <- json_books$date_published
book_json_famous_quote <- json_books$famous_quote
book_json_amazon <- json_books$amazon_review_stars

# Author-level fields.
book_json_author_fn <- json_authors$first_name
book_json_author_ln <- json_authors$last_name
book_json_author_bp <- json_authors$birthplace
book_json_num <- paste0('Book ', json_authors$book_id)

# Copy the third entry of each vector into positions 4 and 5, so rows 4-5 of
# the final table repeat the third book's details.
book_json_amazon[4:5] <- book_json_amazon[3]
book_json_titles[4:5] <- book_json_titles[3]
book_json_dt_pub[4:5] <- book_json_dt_pub[3]
book_json_famous_quote[4:5] <- book_json_famous_quote[3]
book_json_num[4:5] <- book_json_num[3]
# Assemble the presentation frame, one column per extracted vector.
# data.frame() converts the spaces in these labels to dots, which is why the
# rendered headers read Book.Number, Date.Published, and so on.
books_json_df <- data.frame(
  "Book Number" = book_json_num,
  "Title" = book_json_titles,
  "Date Published" = book_json_dt_pub,
  "Famous Quote" = book_json_famous_quote,
  "Amazon Rating" = book_json_amazon,
  "Author First Name" = book_json_author_fn,
  "Author Last Name" = book_json_author_ln,
  "Author Birthplace" = book_json_author_bp
)

# Render as a striped table with all eight columns centred.
books_json_df %>%
  kable(align = rep('c', 8)) %>%
  kable_styling(bootstrap_options = "striped")
| Book.Number | Title | Date.Published | Famous.Quote | Amazon.Rating | Author.First.Name | Author.Last.Name | Author.Birthplace |
|---|---|---|---|---|---|---|---|
| Book 1 | The Origin of Species | 1859 | We will now discuss in a little more detail the Struggle for Existence. | 4 out of 5 | Charles | Darwin | Shrewsbury, United Kingdom |
| Book 2 | Sapiens: A Brief History of Humankind | 2011 | Each year the US population spends more money on diets than the amount needed to feed all the hungry people in the rest of the world. | 4.5 out of 5 | Yuval | Noah Harari | Kiryat Ata, Israel |
| Book 3 | The Feynman Lectures on Physics | 1963 | …every object is a mixture of lots of things, so we can deal with it only as a series of approximations and idealizations. | 4.5 out of 5 | Richard | Feynman | Queens, New York City |
| Book 3 | The Feynman Lectures on Physics | 1963 | …every object is a mixture of lots of things, so we can deal with it only as a series of approximations and idealizations. | 4.5 out of 5 | Robert | Leighton | Detroit, Michigan |
| Book 3 | The Feynman Lectures on Physics | 1963 | …every object is a mixture of lots of things, so we can deal with it only as a series of approximations and idealizations. | 4.5 out of 5 | Matthew | Sands | Oxford, Massachusetts |