suppressWarnings(suppressMessages({
library(tidycensus)
library(sf)
library(tmap)
library(jsonlite)
library(tidyverse)
library(httr)
library(reshape2)
library(here)
library(knitr)
library(ggplot2)
}))
### Read the rds data of hotels and restaurants
hotels <- readRDS('hotels.rds')
restaurants <- readRDS('restaurants.rds')
### Deleting duplicated rows with distinct()
hotels_unique <- hotels %>% distinct(id, .keep_all = T)
print(paste("Number of duplicated rows in hotels:", nrow(hotels) - nrow(hotels_unique)))
## [1] "Number of duplicated rows in hotels: 0"
### Since the difference between the number of rows in hotels and hotels_unique is 0, there are no duplicated rows in hotels
restaurants_unique <- restaurants %>% distinct(id, .keep_all = T)
print(paste("Number of duplicated rows in restaurants:", nrow(restaurants) - nrow(restaurants_unique)))
## [1] "Number of duplicated rows in restaurants: 0"
### The same goes for restaurants and restaurants_unique. There are no duplicated rows in duplicated. In other words, we can use hotels and restaurants without worrying about duplicated rows.
### Flatten nested columns
### First, we find the columns that have to be flattened (in other words, look for the nested columns).
sapply(hotels, class)
## id alias name image_url is_closed
## "character" "character" "character" "character" "logical"
## url review_count categories rating coordinates
## "character" "integer" "list" "numeric" "data.frame"
## transactions price location phone display_phone
## "list" "character" "data.frame" "character" "character"
## distance business_hours attributes
## "numeric" "list" "data.frame"
sapply(restaurants, class)
## id alias name image_url is_closed
## "character" "character" "character" "character" "logical"
## url review_count categories rating coordinates
## "character" "integer" "list" "numeric" "data.frame"
## transactions price location phone display_phone
## "list" "character" "data.frame" "character" "character"
## distance business_hours attributes
## "numeric" "list" "data.frame"
### In both cases, there are three lists (categories, business_hours, and transactions) and three data frames (coordinates, location, and attributes).
### We first flatten the data frames with jsonlite::flatten()
hotels_flat <- hotels %>% jsonlite::flatten()
restaurants_flat <- restaurants %>% jsonlite::flatten()
### Next, we look for the lists that have to be flattened
sapply(hotels_flat, class)
## id alias
## "character" "character"
## name image_url
## "character" "character"
## is_closed url
## "logical" "character"
## review_count categories
## "integer" "list"
## rating transactions
## "numeric" "list"
## price phone
## "character" "character"
## display_phone distance
## "character" "numeric"
## business_hours coordinates.latitude
## "list" "numeric"
## coordinates.longitude location.address1
## "numeric" "character"
## location.address2 location.address3
## "character" "character"
## location.city location.zip_code
## "character" "character"
## location.country location.state
## "character" "character"
## location.display_address attributes.business_temp_closed
## "list" "logical"
## attributes.menu_url attributes.open24_hours
## "logical" "logical"
## attributes.waitlist_reservation
## "logical"
sapply(restaurants_flat, class)
## id alias
## "character" "character"
## name image_url
## "character" "character"
## is_closed url
## "logical" "character"
## review_count categories
## "integer" "list"
## rating transactions
## "numeric" "list"
## price phone
## "character" "character"
## display_phone distance
## "character" "numeric"
## business_hours coordinates.latitude
## "list" "numeric"
## coordinates.longitude location.address1
## "numeric" "character"
## location.address2 location.address3
## "character" "character"
## location.city location.zip_code
## "character" "character"
## location.country location.state
## "character" "character"
## location.display_address attributes.business_temp_closed
## "list" "logical"
## attributes.menu_url attributes.open24_hours
## "character" "logical"
## attributes.waitlist_reservation
## "logical"
### The four lists are categories, transactions, business_hours, and location.display_address. We will have to handle each list differently.
### Transactions seems to be empty for hotels. Let's test that.
hotels_empty_t <- sapply(hotels_flat$transactions, function(x) length(x) == 0)
sum(hotels_empty_t) == nrow(hotels_flat)
## [1] TRUE
### All rows are empty for hotels, so we can ignore transactions.
restaurants_empty_t <- sapply(restaurants_flat$transactions, function(x) length(x) == 0)
sum(restaurants_empty_t) == nrow(restaurants_flat)
## [1] FALSE
### Not all rows are empty in restaurants, so we will have to open them.
restaurants_flat <- restaurants_flat %>% mutate(transactions = transactions %>% map_chr(., function(x) str_c(x, collapse = ", ")))
### location.display_address is similar to transactions, a list consisting of characters separated by commas. We can open them in a way similar to transactions.
hotels_flat <- hotels_flat %>% mutate(location.display_address = location.display_address %>% map_chr(., function(x) str_c(x, collapse = ", ")))
restaurants_flat <- restaurants_flat %>% mutate(location.display_address = location.display_address %>% map_chr(., function(x) str_c(x, collapse = ", ")))
### categories and business_hours are data frames, so we will run a special function to flatten them.
concate_list <- function(x){
titles <- x[["title"]] %>% str_c(collapse = ", ")
return(titles)
}
hotels_flat <- hotels_flat %>% mutate(categories = categories %>% map_chr(concate_list), business_hours = business_hours %>% map_chr(concate_list))
restaurants_flat <- restaurants_flat %>% mutate(categories = categories %>% map_chr(concate_list), business_hours = business_hours %>% map_chr(concate_list))
### To delete rows that have missing coordinates, we change the flattened data to an sf file, then delete rows that have missing geometry values.
hotels_sf <- hotels_flat %>% st_as_sf(coords = c("coordinates.longitude", "coordinates.latitude"), crs = 4326)
restaurants_sf <- restaurants_flat %>% st_as_sf(coords = c("coordinates.longitude", "coordinates.latitude"), crs = 4326)
### Check whether any businesses has NA for its coordinates:
print(sprintf("There are %d hotels with NA values as coordinates.", sum(is.na(hotels_sf$geometry))))
## [1] "There are 0 hotels with NA values as coordinates."
print(sprintf("There are %d restaurants with NA values as coordinates.", sum(is.na(restaurants_sf$geometry))))
## [1] "There are 0 restaurants with NA values as coordinates."
### We would use filter and !is.na() to drop the missing values, but since there are no missing values, we do not bother with it.
### Finally, we select the hotels within Brunswick
Brunswick <- suppressMessages(tigris::places(13, progress_bar = FALSE) %>% filter(NAME == 'Brunswick') %>% st_transform(4326))
hotels_criteria <- hotels_sf[Brunswick,]
hotels_Brunswick <- st_join(hotels_criteria, Brunswick, join = st_intersects)
restaurants_criteria <- restaurants_sf[Brunswick,]
restaurants_Brunswick <- st_join(restaurants_criteria, Brunswick, join = st_intersects)
tmap_mode('view')
## tmap mode set to interactive viewing
### Since we have the location of all businesses, we can plot them on a map.
tm_shape(hotels_Brunswick) + tm_dots(col = 'orange') + # Orange shows the hotels
tm_shape(restaurants_Brunswick) + tm_dots(col = 'lightblue') + # Lightblue shows the restaurants
tm_shape(Brunswick) + tm_borders()
# Some questions to consider:
# 1. Are there any problems in the data?
# As mentioned in Mini 1, two hotels have duplicated locations and should be removed.
hotels_Brunswick <- hotels_Brunswick[-c(6, 9), ]
# 2. What’s the most frequent rating score? Does that seem to be related with review_count?
# Since there are only seven hotels, let's look at the ratings of restaurants in Brunswick:
hist(restaurants_Brunswick$rating,
breaks = seq(-0.1, 5.1, by = 0.1),
main = "Restaurant Rating",
xlab = "Score",
ylab = "Frequency",
col = "lightblue",
border = "black")

# In general, the restaurants have pretty decent ratings. We can use a scatter plot to see whether the ratings are correlated with the review counts.
plot(restaurants_Brunswick$rating, restaurants_Brunswick$review_count, main = "Ratings and Reviews", xlab = "Ratings", ylab = "Review Count", col = "lightblue", cex = 1, pch = 19)
c <- lm(restaurants_Brunswick$review_count ~ restaurants_Brunswick$rating)
abline(c, col = 'black')

summary(c)
##
## Call:
## lm(formula = restaurants_Brunswick$review_count ~ restaurants_Brunswick$rating)
##
## Residuals:
## Min 1Q Median 3Q Max
## -87.32 -51.56 -20.59 9.02 503.26
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.303 47.138 -0.028 0.978
## restaurants_Brunswick$rating 17.925 12.102 1.481 0.144
##
## Residual standard error: 97.09 on 54 degrees of freedom
## Multiple R-squared: 0.03904, Adjusted R-squared: 0.02125
## F-statistic: 2.194 on 1 and 54 DF, p-value: 0.1444
# Based on the summary and chart, we don't really see a statistical correlation between the ratings and review counts of restaurants.
# 3. Are expensive restaurants clustered together?
# Let's plot the location of the restaurants based on their price.
# First, remove restaurants without price data.
restaurants_Brunswick_withprice <- restaurants_Brunswick %>% filter(!is.na(price))
tm_shape(restaurants_Brunswick_withprice) + tm_dots(col = "price", palette = "Set1") + tm_shape(Brunswick) + tm_borders()
# It appears that some expensive restaurants are clustered at central Brunswick. Could they be near a certain hotel? Let's add the location of the hotels.
tm_shape(restaurants_Brunswick_withprice) + tm_dots(col = "price", palette = "Set1") + tm_shape(Brunswick) + tm_borders() +
tm_shape(hotels_Brunswick) + tm_dots(col = 'black')
# There is a hotel within this small cluster, Kress Brunswick. Kress Brunswick also happens to be the only hotel with a five point rating (albeit one review). Perhaps the number of fancy restaurants next to it improved its comfort.
# 4. Are restaurant rating and price associated?
# We will use a jitter plot to see if rating and price are associated for restaurants.
ggplot(restaurants_Brunswick_withprice, aes(x = price, y = rating, color = price)) +
geom_jitter(width = 0.01, size = 2, alpha = 0.7) +
labs(title = "Price and Rating of Restaurants",
x = "Price",
y = "Rating") +
theme_minimal() +
scale_color_manual(values = c("$" = "red", "$$" = "blue"))

# Based on the chart, we observe that in general, restaurants that are more expensive have a higher rating.