# Mini 2: Analysis of Yelp data

suppressWarnings(suppressMessages({
  library(tidycensus)
  library(sf)
  library(tmap)
  library(jsonlite)
  library(tidyverse)
  library(httr)
  library(reshape2)
  library(here)
  library(knitr)
  library(ggplot2)
}))


### Read the rds data of hotels and restaurants
# Load the saved hotel and restaurant tables; the relative paths are resolved
# against the current working directory.
restaurants <- readRDS(file = "restaurants.rds")
hotels <- readRDS(file = "hotels.rds")

### Deleting duplicated rows with distinct()
# Keep the first row for each `id`; `.keep_all = TRUE` retains every other
# column. `TRUE` is spelled out because `T` is an ordinary, reassignable variable.
hotels_unique <- hotels %>% distinct(id, .keep_all = TRUE)

# A difference of zero means no `id` occurs more than once.
print(paste("Number of duplicated rows in hotels:", nrow(hotels) - nrow(hotels_unique)))

## [1] "Number of duplicated rows in hotels: 0"

### Since the difference between the number of rows in hotels and hotels_unique is 0, there are no duplicated rows in hotels

# Keep the first row for each `id`; `.keep_all = TRUE` retains every other
# column. `TRUE` is spelled out because `T` is an ordinary, reassignable variable.
restaurants_unique <- restaurants %>% distinct(id, .keep_all = TRUE)
# A difference of zero means no `id` occurs more than once.
print(paste("Number of duplicated rows in restaurants:", nrow(restaurants) - nrow(restaurants_unique)))

## [1] "Number of duplicated rows in restaurants: 0"

### The same goes for restaurants and restaurants_unique: there are no duplicated rows in restaurants. In other words, we can use hotels and restaurants without worrying about duplicated rows.

### Flatten nested columns
### First, we find the columns that have to be flattened (in other words, look for the nested columns).
# Print the class of every column; "list" and "data.frame" mark nested columns.
sapply(X = hotels, FUN = class)

##             id          alias           name      image_url      is_closed 
##    "character"    "character"    "character"    "character"      "logical" 
##            url   review_count     categories         rating    coordinates 
##    "character"      "integer"         "list"      "numeric"   "data.frame" 
##   transactions          price       location          phone  display_phone 
##         "list"    "character"   "data.frame"    "character"    "character" 
##       distance business_hours     attributes 
##      "numeric"         "list"   "data.frame"

# Print the class of every column; "list" and "data.frame" mark nested columns.
sapply(X = restaurants, FUN = class)

##             id          alias           name      image_url      is_closed 
##    "character"    "character"    "character"    "character"      "logical" 
##            url   review_count     categories         rating    coordinates 
##    "character"      "integer"         "list"      "numeric"   "data.frame" 
##   transactions          price       location          phone  display_phone 
##         "list"    "character"   "data.frame"    "character"    "character" 
##       distance business_hours     attributes 
##      "numeric"         "list"   "data.frame"

### In both cases, there are three lists (categories, business_hours, and transactions) and three data frames (coordinates, location, and attributes). 

### We first flatten the data frames with jsonlite::flatten()
# jsonlite::flatten() turns each data-frame column (coordinates, location,
# attributes) into ordinary top-level columns named `parent.child`.
hotels_flat <- jsonlite::flatten(hotels)
restaurants_flat <- jsonlite::flatten(restaurants)

### Next, we look for the lists that have to be flattened
# Print the class of every flattened column; the remaining "list" entries
# still need to be collapsed.
sapply(X = hotels_flat, FUN = class)

##                              id                           alias 
##                     "character"                     "character" 
##                            name                       image_url 
##                     "character"                     "character" 
##                       is_closed                             url 
##                       "logical"                     "character" 
##                    review_count                      categories 
##                       "integer"                          "list" 
##                          rating                    transactions 
##                       "numeric"                          "list" 
##                           price                           phone 
##                     "character"                     "character" 
##                   display_phone                        distance 
##                     "character"                       "numeric" 
##                  business_hours            coordinates.latitude 
##                          "list"                       "numeric" 
##           coordinates.longitude               location.address1 
##                       "numeric"                     "character" 
##               location.address2               location.address3 
##                     "character"                     "character" 
##                   location.city               location.zip_code 
##                     "character"                     "character" 
##                location.country                  location.state 
##                     "character"                     "character" 
##        location.display_address attributes.business_temp_closed 
##                          "list"                       "logical" 
##             attributes.menu_url         attributes.open24_hours 
##                       "logical"                       "logical" 
## attributes.waitlist_reservation 
##                       "logical"

# Print the class of every flattened column; the remaining "list" entries
# still need to be collapsed.
sapply(X = restaurants_flat, FUN = class)

##                              id                           alias 
##                     "character"                     "character" 
##                            name                       image_url 
##                     "character"                     "character" 
##                       is_closed                             url 
##                       "logical"                     "character" 
##                    review_count                      categories 
##                       "integer"                          "list" 
##                          rating                    transactions 
##                       "numeric"                          "list" 
##                           price                           phone 
##                     "character"                     "character" 
##                   display_phone                        distance 
##                     "character"                       "numeric" 
##                  business_hours            coordinates.latitude 
##                          "list"                       "numeric" 
##           coordinates.longitude               location.address1 
##                       "numeric"                     "character" 
##               location.address2               location.address3 
##                     "character"                     "character" 
##                   location.city               location.zip_code 
##                     "character"                     "character" 
##                location.country                  location.state 
##                     "character"                     "character" 
##        location.display_address attributes.business_temp_closed 
##                          "list"                       "logical" 
##             attributes.menu_url         attributes.open24_hours 
##                     "character"                       "logical" 
## attributes.waitlist_reservation 
##                       "logical"

### The four lists are categories, transactions, business_hours, and location.display_address. We will have to handle each list differently.
### Transactions seems to be empty for hotels. Let's test that.
# lengths() gives the length of every list element and always returns an
# integer vector, so the comparison is a plain logical vector even when the
# table has no rows (sapply() would return an empty list there).
hotels_empty_t <- lengths(hotels_flat$transactions) == 0L
# TRUE only when every hotel has an empty transactions entry.
sum(hotels_empty_t) == nrow(hotels_flat)

## [1] TRUE

### All rows are empty for hotels, so we can ignore transactions. 

# lengths() gives the length of every list element and always returns an
# integer vector, so the comparison is a plain logical vector even when the
# table has no rows (sapply() would return an empty list there).
restaurants_empty_t <- lengths(restaurants_flat$transactions) == 0L
# TRUE only when every restaurant has an empty transactions entry.
sum(restaurants_empty_t) == nrow(restaurants_flat)

## [1] FALSE

### Not all rows are empty in restaurants, so we will have to open them. 
# Collapse each restaurant's transactions entry into one comma-separated string.
restaurants_flat <- mutate(
  restaurants_flat,
  transactions = map_chr(transactions, str_c, collapse = ", ")
)

### location.display_address is similar to transactions, a list consisting of characters separated by commas. We can open them in a way similar to transactions.
# Collapse each display-address entry into one comma-separated string,
# for hotels and restaurants alike.
hotels_flat <- mutate(
  hotels_flat,
  location.display_address = map_chr(location.display_address, str_c, collapse = ", ")
)

restaurants_flat <- mutate(
  restaurants_flat,
  location.display_address = map_chr(location.display_address, str_c, collapse = ", ")
)

### Each entry of categories and business_hours is a data frame, so we use a helper function to flatten them.
# Join the `title` values of one list-column entry into a single string.
#
# x: an object indexable with [["title"]] (e.g. a data frame with a `title`
#    column).
# Returns the `title` values pasted together, separated by ", ".
concate_list <- function(x) {
  str_c(x[["title"]], collapse = ", ")
}

# Replace both list columns with comma-separated strings of their `title` values.
# NOTE(review): concate_list() reads only a `title` field. Confirm that the
# business_hours entries actually carry one.
hotels_flat <- mutate(
  hotels_flat,
  categories = map_chr(categories, concate_list),
  business_hours = map_chr(business_hours, concate_list)
)

restaurants_flat <- mutate(
  restaurants_flat,
  categories = map_chr(categories, concate_list),
  business_hours = map_chr(business_hours, concate_list)
)

### To delete rows that have missing coordinates, we convert the flattened data to an sf object, then delete rows that have missing geometry values.
# st_as_sf() stops with an error on missing coordinates by default
# (na.fail = TRUE), so rows without a longitude or latitude are dropped
# before the conversion rather than after it.
hotels_sf <- hotels_flat %>%
  filter(!is.na(coordinates.longitude), !is.na(coordinates.latitude)) %>%
  st_as_sf(coords = c("coordinates.longitude", "coordinates.latitude"), crs = 4326)

restaurants_sf <- restaurants_flat %>%
  filter(!is.na(coordinates.longitude), !is.na(coordinates.latitude)) %>%
  st_as_sf(coords = c("coordinates.longitude", "coordinates.latitude"), crs = 4326)

### Check whether any business has NA for its coordinates:
# Count missing values in the source coordinate columns: st_as_sf() rejects NA
# coordinates by default, so they can never be observed in the geometry column.
print(sprintf("There are %d hotels with NA values as coordinates.", sum(is.na(hotels_flat$coordinates.longitude) | is.na(hotels_flat$coordinates.latitude))))

## [1] "There are 0 hotels with NA values as coordinates."

# Count missing values in the source coordinate columns: st_as_sf() rejects NA
# coordinates by default, so they can never be observed in the geometry column.
print(sprintf("There are %d restaurants with NA values as coordinates.", sum(is.na(restaurants_flat$coordinates.longitude) | is.na(restaurants_flat$coordinates.latitude))))

## [1] "There are 0 restaurants with NA values as coordinates."

### We would use filter and !is.na() to drop the missing values, but since there are no missing values, we do not bother with it. 

### Finally, we select the hotels within Brunswick
# Boundary polygon: the entry named "Brunswick" among the tigris::places()
# results for state code 13, reprojected to EPSG:4326 to match the points.
Brunswick <- suppressMessages(
  tigris::places(13, progress_bar = FALSE) %>%
    filter(NAME == "Brunswick") %>%
    st_transform(4326)
)

# Spatial subset first (points that intersect the boundary), then attach the
# boundary's attributes to every remaining point.
hotels_criteria <- hotels_sf[Brunswick, ]
restaurants_criteria <- restaurants_sf[Brunswick, ]

hotels_Brunswick <- st_join(
  hotels_criteria,
  Brunswick,
  join = st_intersects
)
restaurants_Brunswick <- st_join(
  restaurants_criteria,
  Brunswick,
  join = st_intersects
)

# Switch tmap to its interactive viewing mode for the maps that follow.
tmap_mode("view")

## tmap mode set to interactive viewing

### Since we have the location of all businesses, we can plot them on a map. 
# Hotels in orange, restaurants in light blue, boundary drawn as an outline.
tm_shape(hotels_Brunswick) +
  tm_dots(col = "orange") +
  tm_shape(restaurants_Brunswick) +
  tm_dots(col = "lightblue") +
  tm_shape(Brunswick) +
  tm_borders()

# Some questions to consider:
# 1. Are there any problems in the data? 
# As mentioned in Mini 1, two hotels have duplicated locations and should be removed.
# Drop the rows at positions 6 and 9 of hotels_Brunswick (negative indices
# exclude rows).
# NOTE(review): these positions depend on the current row order of
# hotels_Brunswick. Confirm they still point at the intended duplicates
# whenever the input data changes.
hotels_Brunswick <- hotels_Brunswick[-c(6, 9), ]

# 2. What’s the most frequent rating score? Does that seem to be related with review_count?
# Since there are only seven hotels, let's look at the ratings of restaurants in Brunswick:
# Histogram of restaurant ratings in 0.1-wide bins spanning -0.1 to 5.1.
hist(
  x = restaurants_Brunswick$rating,
  breaks = seq(from = -0.1, to = 5.1, by = 0.1),
  col = "lightblue", border = "black",
  main = "Restaurant Rating", xlab = "Score", ylab = "Frequency"
)

# In general, the restaurants have pretty decent ratings. We can use a scatter plot to see whether the ratings are correlated with the review counts. 

# Scatter plot of review count (y) against rating (x), one point per restaurant.
plot(restaurants_Brunswick$rating, restaurants_Brunswick$review_count, main = "Ratings and Reviews", xlab = "Ratings", ylab = "Review Count", col = "lightblue", cex = 1, pch = 19)

# Linear fit of review count on rating. The model is stored as `rating_fit`
# rather than `c`, so base::c is not shadowed by a non-function object (which
# would break value-position uses such as do.call(c, ...)).
rating_fit <- lm(restaurants_Brunswick$review_count ~ restaurants_Brunswick$rating)
# Overlay the fitted line on the scatter plot.
abline(rating_fit, col = "black")

summary(rating_fit)

## 
## Call:
## lm(formula = restaurants_Brunswick$review_count ~ restaurants_Brunswick$rating)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -87.32 -51.56 -20.59   9.02 503.26 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)
## (Intercept)                    -1.303     47.138  -0.028    0.978
## restaurants_Brunswick$rating   17.925     12.102   1.481    0.144
## 
## Residual standard error: 97.09 on 54 degrees of freedom
## Multiple R-squared:  0.03904,    Adjusted R-squared:  0.02125 
## F-statistic: 2.194 on 1 and 54 DF,  p-value: 0.1444

# Based on the summary and chart, there is no statistically significant relationship between the ratings and review counts of restaurants (p = 0.144).

# 3. Are expensive restaurants clustered together?
# Let's plot the location of the restaurants based on their price.
# First, remove restaurants without price data.

# Keep only the restaurants whose price is recorded.
restaurants_Brunswick_withprice <- filter(restaurants_Brunswick, !is.na(price))

# Colour each restaurant by its price value and outline the boundary.
tm_shape(restaurants_Brunswick_withprice) +
  tm_dots(col = "price", palette = "Set1") +
  tm_shape(Brunswick) +
  tm_borders()

# It appears that some expensive restaurants are clustered at central Brunswick. Could they be near a certain hotel? Let's add the location of the hotels.
# Same price map, with the hotels added on top as black dots.
tm_shape(restaurants_Brunswick_withprice) +
  tm_dots(col = "price", palette = "Set1") +
  tm_shape(Brunswick) +
  tm_borders() +
  tm_shape(hotels_Brunswick) +
  tm_dots(col = "black")

# There is a hotel within this small cluster, Kress Brunswick. Kress Brunswick also happens to be the only hotel with a five-point rating (albeit from a single review). Perhaps the number of fancy restaurants next to it improved its comfort.

# 4. Are restaurant rating and price associated?
# We will use a jitter plot to see if rating and price are associated for restaurants. 
# Jittered points of rating by price; the manual colour scale assigns colours
# to the "$" and "$$" price values only.
restaurants_Brunswick_withprice %>%
  ggplot(mapping = aes(x = price, y = rating, color = price)) +
  geom_jitter(width = 0.01, size = 2, alpha = 0.7) +
  scale_color_manual(values = c("$" = "red", "$$" = "blue")) +
  labs(
    title = "Price and Rating of Restaurants",
    x = "Price",
    y = "Rating"
  ) +
  theme_minimal()

# Based on the chart, we observe that in general, restaurants that are more expensive have a higher rating.

# Mini 2: Analysis of Yelp data
#
# Ranon Chen
#
# 2024-09-22