Read in Data

# Load Data 
# Read the previously saved Yelp business data from an .rds file.
# NOTE(review): this is an absolute path on one specific machine; the script
# will not run elsewhere until it points at a local copy of yelp_data.rds.
# NOTE(review): no library() calls are visible in this part of the file;
# presumably dplyr, purrr, stringr, tidyr, sf, tmap and ggplot2 are attached
# elsewhere (e.g. a hidden setup chunk) — confirm.
yelp_data <- readRDS("/Users/morganlane/Library/Mobile Documents/com~apple~CloudDocs/Documents/Tech/Fall 2024/Urban Analytics/Homeworks/yelp_data.rds") 

Data Cleaning

# Delete duplicated rows
# Keep one row per business id while retaining all other columns.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned, whereas `TRUE` is a reserved constant.
yelp_unique <- yelp_data %>%
  distinct(id, .keep_all = TRUE)

# Report how many rows the de-duplication removed.
glue::glue("Before dropping duplicated rows, there were {nrow(yelp_data)} rows. After dropping them, there are {nrow(yelp_unique)} rows") %>%
  print()
## Before dropping duplicated rows, there were 475 rows. After dropping them, there are 243 rows
# Flatten nested columns that have multiple variables in one column
## Inspect the class of every column to spot the nested ones
print(sapply(yelp_unique, class))
##             id          alias           name      image_url      is_closed 
##    "character"    "character"    "character"    "character"      "logical" 
##            url   review_count     categories         rating    coordinates 
##    "character"      "integer"         "list"      "numeric"   "data.frame" 
##   transactions       location          phone  display_phone       distance 
##         "list"   "data.frame"    "character"    "character"      "numeric" 
## business_hours     attributes          price 
##         "list"   "data.frame"    "character"
## Un-nest the data-frame columns (coordinates, location, attributes) so their
## fields become ordinary top-level columns
yelp_flat <- jsonlite::flatten(yelp_unique)

## Re-check the column classes after flattening
print(sapply(yelp_flat, class))
##                              id                           alias 
##                     "character"                     "character" 
##                            name                       image_url 
##                     "character"                     "character" 
##                       is_closed                             url 
##                       "logical"                     "character" 
##                    review_count                      categories 
##                       "integer"                          "list" 
##                          rating                    transactions 
##                       "numeric"                          "list" 
##                           phone                   display_phone 
##                     "character"                     "character" 
##                        distance                  business_hours 
##                       "numeric"                          "list" 
##                           price            coordinates.latitude 
##                     "character"                       "numeric" 
##           coordinates.longitude               location.address1 
##                       "numeric"                     "character" 
##               location.address2               location.address3 
##                     "character"                     "character" 
##                   location.city               location.zip_code 
##                     "character"                     "character" 
##                location.country                  location.state 
##                     "character"                     "character" 
##        location.display_address attributes.business_temp_closed 
##                          "list"                       "logical" 
## attributes.waitlist_reservation         attributes.open24_hours 
##                       "logical"                       "logical" 
##             attributes.menu_url 
##                     "character"
## Collapse each element of the remaining list-columns into one
## comma-separated string
yelp_concat <- mutate(
  yelp_flat,
  transactions = map_chr(
    transactions,
    function(parts) str_c(parts, collapse = ", ")
  ),
  location.display_address = map_chr(
    location.display_address,
    function(parts) str_c(parts, collapse = ", ")
  )
)

## fix the categories column
# Collapse the category titles of a single business into one string.
#
# x: one element of the Yelp "categories" column, i.e. a data frame with
#    columns "alias" and "title".
# Returns the values of the "title" column joined together with ", ".
concate_list <- function(x){
  str_c(x[["title"]], collapse = ", ")
}

## Replace the nested categories column with one string of titles per business
yelp_flat2 <- mutate(
  yelp_concat,
  categories = map_chr(categories, concate_list)
)

## Preview the first rows of the fully flattened data
head(yelp_flat2)
##                       id                                                alias
## 1 8sI2EtjEu6_wgcxBIiw_pA                             planned-pethood-duluth-2
## 2 VqPyg5Vh-ukzpXE55-1F2w                             carpet-savers-lakemont-3
## 3 ZMXmsMMPHaD5JO3zyUu2uw aquarium-and-shark-lab-by-team-ecco-hendersonville-2
## 4 NQuHP24Zrf3UYltD5jt2gg                           on-the-move-flowery-branch
## 5 _7tLry_qnZwTIKHtvJ66dQ                chris-motes-pumping-service-cleveland
## 6 FKT3ucAzns9-p-k_jp1a4g                 gilstrap-exterminating-dawsonville-2
##                                name
## 1                   Planned PEThood
## 2                     Carpet Savers
## 3 Aquarium & Shark Lab by Team ECCO
## 4                       On The Move
## 5      Chris Mote's Pumping Service
## 6            Gilstrap Exterminating
##                                                              image_url
## 1 https://s3-media3.fl.yelpcdn.com/bphoto/vjz6KCBwXNup59nMFuw4-w/o.jpg
## 2 https://s3-media1.fl.yelpcdn.com/bphoto/-zZe8Ig9adbcqzUesY47VA/o.jpg
## 3 https://s3-media4.fl.yelpcdn.com/bphoto/jnks6ZOpW8Vdc4svLHQDRQ/o.jpg
## 4 https://s3-media1.fl.yelpcdn.com/bphoto/6G8zpQ-d-KDZJe_MtLbSag/o.jpg
## 5 https://s3-media1.fl.yelpcdn.com/bphoto/QZqEHFE2N2-Xu9vHAhfPKQ/o.jpg
## 6 https://s3-media4.fl.yelpcdn.com/bphoto/ckr9n7KByAxCbzQGw1qRdg/o.jpg
##   is_closed
## 1     FALSE
## 2     FALSE
## 3     FALSE
## 4     FALSE
## 5     FALSE
## 6     FALSE
##                                                                                                                                                                                                                 url
## 1                             https://www.yelp.com/biz/planned-pethood-duluth-2?adjust_creative=P_ibNlpy71Wj-TC2vsUAew&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=P_ibNlpy71Wj-TC2vsUAew
## 2                             https://www.yelp.com/biz/carpet-savers-lakemont-3?adjust_creative=P_ibNlpy71Wj-TC2vsUAew&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=P_ibNlpy71Wj-TC2vsUAew
## 3 https://www.yelp.com/biz/aquarium-and-shark-lab-by-team-ecco-hendersonville-2?adjust_creative=P_ibNlpy71Wj-TC2vsUAew&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=P_ibNlpy71Wj-TC2vsUAew
## 4                           https://www.yelp.com/biz/on-the-move-flowery-branch?adjust_creative=P_ibNlpy71Wj-TC2vsUAew&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=P_ibNlpy71Wj-TC2vsUAew
## 5                https://www.yelp.com/biz/chris-motes-pumping-service-cleveland?adjust_creative=P_ibNlpy71Wj-TC2vsUAew&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=P_ibNlpy71Wj-TC2vsUAew
## 6                 https://www.yelp.com/biz/gilstrap-exterminating-dawsonville-2?adjust_creative=P_ibNlpy71Wj-TC2vsUAew&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=P_ibNlpy71Wj-TC2vsUAew
##   review_count                                  categories rating transactions
## 1           62 Veterinarians, Community Service/Non-Profit    4.5             
## 2           10                             Carpet Cleaning    5.0             
## 3           25     Community Service/Non-Profit, Aquariums    4.3             
## 4           24      Movers, Self Storage, Packing Services    4.0             
## 5            5                             Septic Services    4.8             
## 6           12              Pest Control, Wildlife Control    4.1             
##          phone  display_phone  distance
## 1 +16785613491 (678) 561-3491  86015.30
## 2 +17067820279 (706) 782-0279  28045.38
## 3 +18286928386 (828) 692-8386 132793.63
## 4 +17069730175 (706) 973-0175  55857.18
## 5 +17068655526 (706) 865-5526  14551.81
## 6 +16786974430 (678) 697-4430  48247.29
##                                                                                                                                                     business_hours
## 1                                                           FALSE, FALSE, FALSE, FALSE, 0700, 0700, 0700, 0700, 1630, 1630, 1630, 1630, 0, 1, 2, 3, REGULAR, FALSE
## 2 TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0, 1, 2, 3, 4, 5, 6, REGULAR, TRUE
## 3                                                                                 FALSE, FALSE, FALSE, 1300, 1300, 1300, 1600, 1600, 1600, 3, 4, 5, REGULAR, FALSE
## 4                                     FALSE, FALSE, FALSE, FALSE, FALSE, 0800, 0800, 0800, 0800, 0800, 1700, 1700, 1700, 1700, 1700, 0, 1, 2, 3, 4, REGULAR, FALSE
## 5                                     FALSE, FALSE, FALSE, FALSE, FALSE, 0800, 0800, 0800, 0800, 0800, 1700, 1700, 1700, 1700, 1700, 0, 1, 2, 3, 4, REGULAR, FALSE
## 6                                     FALSE, FALSE, FALSE, FALSE, FALSE, 0800, 0800, 0800, 0800, 0800, 1700, 1700, 1700, 1700, 1700, 0, 1, 2, 3, 4, REGULAR, FALSE
##   price coordinates.latitude coordinates.longitude      location.address1
## 1  <NA>             34.00646             -84.13470        2860 Buford Hwy
## 2  <NA>             34.77966             -83.39705                   <NA>
## 3  <NA>             35.31794             -82.46090          511 N Main St
## 4  <NA>             34.21471             -83.88878    4050 Enterprise Way
## 5  <NA>             34.63543             -83.84629 669 Paradise Valley Rd
## 6  <NA>             34.36337             -84.04568  30 Industrial Park Rd
##   location.address2 location.address3  location.city location.zip_code
## 1                                             Duluth             30096
## 2              <NA>              <NA>       Lakemont             30552
## 3              <NA>              <NA> Hendersonville             28792
## 4           Ste 180              <NA> Flowery Branch             30542
## 5              <NA>              <NA>      Cleveland             30528
## 6           Ste 103                      Dawsonville             30534
##   location.country location.state
## 1               US             GA
## 2               US             GA
## 3               US             NC
## 4               US             GA
## 5               US             GA
## 6               US             GA
##                                 location.display_address
## 1                      2860 Buford Hwy, Duluth, GA 30096
## 2                                     Lakemont, GA 30552
## 3                511 N Main St, Hendersonville, NC 28792
## 4 4050 Enterprise Way, Ste 180, Flowery Branch, GA 30542
## 5            669 Paradise Valley Rd, Cleveland, GA 30528
## 6  30 Industrial Park Rd, Ste 103, Dawsonville, GA 30534
##   attributes.business_temp_closed attributes.waitlist_reservation
## 1                              NA                              NA
## 2                              NA                              NA
## 3                              NA                              NA
## 4                              NA                              NA
## 5                              NA                              NA
## 6                              NA                              NA
##   attributes.open24_hours attributes.menu_url
## 1                      NA                <NA>
## 2                      NA                <NA>
## 3                      NA                <NA>
## 4                      NA                <NA>
## 5                      NA                <NA>
## 6                      NA                <NA>
# Delete rows that have missing data in coordinates variable
## Count the missing values in every column
map_dbl(yelp_flat2, function(column) sum(is.na(column)))
##                              id                           alias 
##                               0                               0 
##                            name                       image_url 
##                               0                               0 
##                       is_closed                             url 
##                               0                               0 
##                    review_count                      categories 
##                               0                               0 
##                          rating                    transactions 
##                               0                               0 
##                           phone                   display_phone 
##                               0                               0 
##                        distance                  business_hours 
##                               0                               0 
##                           price            coordinates.latitude 
##                             207                               0 
##           coordinates.longitude               location.address1 
##                               0                              59 
##               location.address2               location.address3 
##                             155                              89 
##                   location.city               location.zip_code 
##                               0                               0 
##                location.country                  location.state 
##                               0                               0 
##        location.display_address attributes.business_temp_closed 
##                               0                             243 
## attributes.waitlist_reservation         attributes.open24_hours 
##                             239                             242 
##             attributes.menu_url 
##                             234
## there are no NAs in either coordinates column 


# Delete rows that fall outside of the boundary of the city you chose
## City boundary: take the Georgia places from tigris, keep the one named
## Helen, and transform it to CRS 4326 (the CRS assigned to the Yelp points)
helen <- st_transform(
  filter(tigris::places("GA", progress_bar = FALSE), NAME == 'Helen'),
  4326
)
## Retrieving data for the year 2021
# Turn yelp_flat2 into an sf object, building the geometry from the
# longitude/latitude columns with CRS 4326
yelp_sf <- st_as_sf(
  yelp_flat2,
  coords = c("coordinates.longitude", "coordinates.latitude"),
  crs = 4326
)
  
# Spatial subset: indexing an sf object by another sf object keeps only the
# rows whose geometry intersects it, i.e. the businesses inside Helen.
yelp_in <- yelp_sf[helen, ]

# Report the row counts on either side of the boundary filter. The second
# label reads "After" because it describes the filtered data.
print(paste0("Before: ", nrow(yelp_sf)))
## [1] "Before: 243"
print(paste0("After: ", nrow(yelp_in)))
## [1] "After: 46"
# Compare the dimensions of the raw data with the final in-boundary sf object
print(glue::glue("nrow before: {nrow(yelp_data)} -> nrow after: {nrow(yelp_in)} \n
            ncol before: {ncol(yelp_data)} -> ncol after: {ncol(yelp_in)} \n"))
## nrow before: 475 -> nrow after: 46 
## 
## ncol before: 18 -> ncol after: 28

View the Data

# Interactive map of the remaining businesses together with the Helen boundary
tmap_mode("view")
## tmap mode set to interactive viewing
tm_shape(yelp_in) +
  tm_dots() +
  tm_shape(helen) +
  tm_borders()
## the businesses are all within the borders now

# Same map, with the dots styled by the rating column
tm_shape(yelp_in) +
  tm_dots("rating") +
  tm_shape(helen) +
  tm_borders()
# frequency of each rating
# An explicit binwidth replaces the 30-bin default that ggplot2 warns about;
# 0.5-wide bins group the ratings by half-point steps.
ggplot(yelp_in, aes(x = rating)) +
  geom_histogram(binwidth = 0.5)

# Scatter plot of each business's rating against its number of reviews
ggplot(data = yelp_in, mapping = aes(x = rating, y = review_count)) +
  geom_point()

# Price levels: drop the businesses with a missing price, then map the rest
yelp_dropna_price <- drop_na(yelp_in, price)

tm_shape(yelp_dropna_price) +
  tm_dots("price") +
  tm_shape(helen) +
  tm_borders()

Write-up: The data are much easier to interpret now that the food and local services businesses included actually fall within the borders of Helen. It is even clearer that they cluster around the main road that tourists frequent. I am surprised by how few businesses show up, especially local services. It makes me wonder whether local services are more commonly located outside the city limits, while food proprietors stay closer to the tourist area. I was interested in the pricing of the restaurants, but many of the price values are missing (NA). I chose to keep those rows for the main analysis and removed them only to investigate price. There is only one relatively expensive restaurant in Helen. Less expensive businesses overlap in the center of town, and prices get cheaper as you move away from downtown. Looking at business ratings, most businesses are rated highly, apart from those with a rating of 0. It does look like the businesses with higher ratings have been reviewed more frequently, though only one business, Hofer’s Bakery & Café, has been reviewed four times more often than the next most reviewed restaurant. I have been there, and it is great!