Read in Data
# Load the Yelp business table that was previously saved as an RDS file.
# NOTE(review): this is an absolute, machine-specific path; it will only
# resolve on the original author's computer.
yelp_data <- readRDS(
  file = "/Users/morganlane/Library/Mobile Documents/com~apple~CloudDocs/Documents/Tech/Fall 2024/Urban Analytics/Homeworks/yelp_data.rds"
)
Data Cleaning
# Delete duplicated rows
# Keep one row per business `id`; `.keep_all = TRUE` retains all other columns.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned, whereas `TRUE` is a reserved word.
yelp_unique <- yelp_data %>%
  distinct(id, .keep_all = TRUE)

# Report how many rows were removed by de-duplication.
glue::glue("Before dropping duplicated rows, there were {nrow(yelp_data)} rows. After dropping them, there are {nrow(yelp_unique)} rows") %>%
  print()
## Before dropping duplicated rows, there were 475 rows. After dropping them, there are 243 rows
# Flatten nested columns that have multiple variables in one column
## Start by listing the class of every column to spot the nested ones
## (columns reported as "data.frame" or "list").
print(sapply(yelp_unique, class))
## id alias name image_url is_closed
## "character" "character" "character" "character" "logical"
## url review_count categories rating coordinates
## "character" "integer" "list" "numeric" "data.frame"
## transactions location phone display_phone distance
## "list" "data.frame" "character" "character" "numeric"
## business_hours attributes price
## "list" "data.frame" "character"
## Expand the data.frame-typed columns into ordinary top-level columns
## (e.g. `coordinates` becomes `coordinates.latitude` / `coordinates.longitude`).
yelp_flat <- jsonlite::flatten(yelp_unique)
## Re-check the column classes now that the table has been flattened
print(sapply(yelp_flat, class))
## id alias
## "character" "character"
## name image_url
## "character" "character"
## is_closed url
## "logical" "character"
## review_count categories
## "integer" "list"
## rating transactions
## "numeric" "list"
## phone display_phone
## "character" "character"
## distance business_hours
## "numeric" "list"
## price coordinates.latitude
## "character" "numeric"
## coordinates.longitude location.address1
## "numeric" "character"
## location.address2 location.address3
## "character" "character"
## location.city location.zip_code
## "character" "character"
## location.country location.state
## "character" "character"
## location.display_address attributes.business_temp_closed
## "list" "logical"
## attributes.waitlist_reservation attributes.open24_hours
## "logical" "logical"
## attributes.menu_url
## "character"
## Collapse each list-column element into one comma-separated string.
## `collapse = ", "` is forwarded to `str_c()` through `map_chr()`'s `...`.
yelp_concat <- mutate(
  yelp_flat,
  transactions = map_chr(transactions, str_c, collapse = ", "),
  location.display_address = map_chr(
    location.display_address,
    str_c,
    collapse = ", "
  )
)
## fix the categories column
# Collapse the category titles of a single business into one string.
#
# x: one element of the Yelp `categories` list column, i.e. a data frame with
#    columns "alias" and "title".
# Returns: a length-one character vector holding the titles joined by ", ";
#    an empty string when `x` carries no titles (NULL, an empty list, or a
#    zero-row data frame).
concate_list <- function(x) {
  titles <- x[["title"]]

  # Guard the no-category case explicitly. `map_chr()` needs exactly one
  # string per element, and the result of `str_c()` on zero-length input is
  # not something to rely on here (presumably version-dependent -- verify
  # against the installed stringr/stringi).
  if (length(titles) == 0) {
    return("")
  }

  str_c(titles, collapse = ", ")
}
# Replace the nested `categories` column with its collapsed title string
yelp_flat2 <- mutate(
  yelp_concat,
  categories = map_chr(categories, concate_list)
)

# Preview the first rows of the cleaned table
head(yelp_flat2)
## id alias
## 1 8sI2EtjEu6_wgcxBIiw_pA planned-pethood-duluth-2
## 2 VqPyg5Vh-ukzpXE55-1F2w carpet-savers-lakemont-3
## 3 ZMXmsMMPHaD5JO3zyUu2uw aquarium-and-shark-lab-by-team-ecco-hendersonville-2
## 4 NQuHP24Zrf3UYltD5jt2gg on-the-move-flowery-branch
## 5 _7tLry_qnZwTIKHtvJ66dQ chris-motes-pumping-service-cleveland
## 6 FKT3ucAzns9-p-k_jp1a4g gilstrap-exterminating-dawsonville-2
## name
## 1 Planned PEThood
## 2 Carpet Savers
## 3 Aquarium & Shark Lab by Team ECCO
## 4 On The Move
## 5 Chris Mote's Pumping Service
## 6 Gilstrap Exterminating
## image_url
## 1 https://s3-media3.fl.yelpcdn.com/bphoto/vjz6KCBwXNup59nMFuw4-w/o.jpg
## 2 https://s3-media1.fl.yelpcdn.com/bphoto/-zZe8Ig9adbcqzUesY47VA/o.jpg
## 3 https://s3-media4.fl.yelpcdn.com/bphoto/jnks6ZOpW8Vdc4svLHQDRQ/o.jpg
## 4 https://s3-media1.fl.yelpcdn.com/bphoto/6G8zpQ-d-KDZJe_MtLbSag/o.jpg
## 5 https://s3-media1.fl.yelpcdn.com/bphoto/QZqEHFE2N2-Xu9vHAhfPKQ/o.jpg
## 6 https://s3-media4.fl.yelpcdn.com/bphoto/ckr9n7KByAxCbzQGw1qRdg/o.jpg
## is_closed
## 1 FALSE
## 2 FALSE
## 3 FALSE
## 4 FALSE
## 5 FALSE
## 6 FALSE
## url
## 1 https://www.yelp.com/biz/planned-pethood-duluth-2?adjust_creative=P_ibNlpy71Wj-TC2vsUAew&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=P_ibNlpy71Wj-TC2vsUAew
## 2 https://www.yelp.com/biz/carpet-savers-lakemont-3?adjust_creative=P_ibNlpy71Wj-TC2vsUAew&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=P_ibNlpy71Wj-TC2vsUAew
## 3 https://www.yelp.com/biz/aquarium-and-shark-lab-by-team-ecco-hendersonville-2?adjust_creative=P_ibNlpy71Wj-TC2vsUAew&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=P_ibNlpy71Wj-TC2vsUAew
## 4 https://www.yelp.com/biz/on-the-move-flowery-branch?adjust_creative=P_ibNlpy71Wj-TC2vsUAew&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=P_ibNlpy71Wj-TC2vsUAew
## 5 https://www.yelp.com/biz/chris-motes-pumping-service-cleveland?adjust_creative=P_ibNlpy71Wj-TC2vsUAew&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=P_ibNlpy71Wj-TC2vsUAew
## 6 https://www.yelp.com/biz/gilstrap-exterminating-dawsonville-2?adjust_creative=P_ibNlpy71Wj-TC2vsUAew&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=P_ibNlpy71Wj-TC2vsUAew
## review_count categories rating transactions
## 1 62 Veterinarians, Community Service/Non-Profit 4.5
## 2 10 Carpet Cleaning 5.0
## 3 25 Community Service/Non-Profit, Aquariums 4.3
## 4 24 Movers, Self Storage, Packing Services 4.0
## 5 5 Septic Services 4.8
## 6 12 Pest Control, Wildlife Control 4.1
## phone display_phone distance
## 1 +16785613491 (678) 561-3491 86015.30
## 2 +17067820279 (706) 782-0279 28045.38
## 3 +18286928386 (828) 692-8386 132793.63
## 4 +17069730175 (706) 973-0175 55857.18
## 5 +17068655526 (706) 865-5526 14551.81
## 6 +16786974430 (678) 697-4430 48247.29
## business_hours
## 1 FALSE, FALSE, FALSE, FALSE, 0700, 0700, 0700, 0700, 1630, 1630, 1630, 1630, 0, 1, 2, 3, REGULAR, FALSE
## 2 TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0, 1, 2, 3, 4, 5, 6, REGULAR, TRUE
## 3 FALSE, FALSE, FALSE, 1300, 1300, 1300, 1600, 1600, 1600, 3, 4, 5, REGULAR, FALSE
## 4 FALSE, FALSE, FALSE, FALSE, FALSE, 0800, 0800, 0800, 0800, 0800, 1700, 1700, 1700, 1700, 1700, 0, 1, 2, 3, 4, REGULAR, FALSE
## 5 FALSE, FALSE, FALSE, FALSE, FALSE, 0800, 0800, 0800, 0800, 0800, 1700, 1700, 1700, 1700, 1700, 0, 1, 2, 3, 4, REGULAR, FALSE
## 6 FALSE, FALSE, FALSE, FALSE, FALSE, 0800, 0800, 0800, 0800, 0800, 1700, 1700, 1700, 1700, 1700, 0, 1, 2, 3, 4, REGULAR, FALSE
## price coordinates.latitude coordinates.longitude location.address1
## 1 <NA> 34.00646 -84.13470 2860 Buford Hwy
## 2 <NA> 34.77966 -83.39705 <NA>
## 3 <NA> 35.31794 -82.46090 511 N Main St
## 4 <NA> 34.21471 -83.88878 4050 Enterprise Way
## 5 <NA> 34.63543 -83.84629 669 Paradise Valley Rd
## 6 <NA> 34.36337 -84.04568 30 Industrial Park Rd
## location.address2 location.address3 location.city location.zip_code
## 1 Duluth 30096
## 2 <NA> <NA> Lakemont 30552
## 3 <NA> <NA> Hendersonville 28792
## 4 Ste 180 <NA> Flowery Branch 30542
## 5 <NA> <NA> Cleveland 30528
## 6 Ste 103 Dawsonville 30534
## location.country location.state
## 1 US GA
## 2 US GA
## 3 US NC
## 4 US GA
## 5 US GA
## 6 US GA
## location.display_address
## 1 2860 Buford Hwy, Duluth, GA 30096
## 2 Lakemont, GA 30552
## 3 511 N Main St, Hendersonville, NC 28792
## 4 4050 Enterprise Way, Ste 180, Flowery Branch, GA 30542
## 5 669 Paradise Valley Rd, Cleveland, GA 30528
## 6 30 Industrial Park Rd, Ste 103, Dawsonville, GA 30534
## attributes.business_temp_closed attributes.waitlist_reservation
## 1 NA NA
## 2 NA NA
## 3 NA NA
## 4 NA NA
## 5 NA NA
## 6 NA NA
## attributes.open24_hours attributes.menu_url
## 1 NA <NA>
## 2 NA <NA>
## 3 NA <NA>
## 4 NA <NA>
## 5 NA <NA>
## 6 NA <NA>
# Delete rows that have missing data in coordinates variable
## Count the missing values in every column
map_dbl(yelp_flat2, ~ sum(is.na(.x)))
## id alias
## 0 0
## name image_url
## 0 0
## is_closed url
## 0 0
## review_count categories
## 0 0
## rating transactions
## 0 0
## phone display_phone
## 0 0
## distance business_hours
## 0 0
## price coordinates.latitude
## 207 0
## coordinates.longitude location.address1
## 0 59
## location.address2 location.address3
## 155 89
## location.city location.zip_code
## 0 0
## location.country location.state
## 0 0
## location.display_address attributes.business_temp_closed
## 0 243
## attributes.waitlist_reservation attributes.open24_hours
## 239 242
## attributes.menu_url
## 234
## there are no NAs in either coordinates column
# Delete rows that fall outside of the boundary of the city you chose
## Boundary polygon: take Georgia's places from tigris, keep the one named
## "Helen", and reproject it to EPSG:4326.
helen <- st_transform(
  filter(
    tigris::places("GA", progress_bar = FALSE),
    NAME == "Helen"
  ),
  crs = 4326
)
## Retrieving data for the year 2021
# Build an sf point layer from the longitude/latitude columns, using the same
# CRS (EPSG:4326) as the city boundary.
yelp_sf <- st_as_sf(
  yelp_flat2,
  coords = c("coordinates.longitude", "coordinates.latitude"),
  crs = 4326
)
# sf subsets
# Indexing one sf object by another keeps only the features that fall on the
# Helen boundary polygon.
yelp_in <- yelp_sf[helen, ]

# Row count before the spatial filter.
print(paste0("Before: ", nrow(yelp_sf)))
## [1] "Before: 243"
# Row count after the spatial filter (previously mislabelled "Before").
print(paste0("After: ", nrow(yelp_in)))
## [1] "After: 46"

# Compare the raw input with the final cleaned, clipped layer.
glue::glue("nrow before: {nrow(yelp_data)} -> nrow after: {nrow(yelp_in)} \n
ncol before: {ncol(yelp_data)} -> ncol after: {ncol(yelp_in)} \n") %>%
  print()
## nrow before: 475 -> nrow after: 46
##
## ncol before: 18 -> ncol after: 28
View the Data
# Visualize
# Put tmap into its "view" mode for the maps below.
tmap_mode("view")
## tmap mode set to interactive viewing

# The city outline layer is identical in both maps, so build it once.
helen_outline <- tm_shape(helen) + tm_borders()

# Map 1: business locations drawn over the city outline.
tm_shape(yelp_in) + tm_dots() + helen_outline
## the businesses are all within the borders now

# Map 2: the same points, with the dots styled by the `rating` column.
tm_shape(yelp_in) + tm_dots("rating") + helen_outline
# Distribution of ratings (default binning, hence the message below)
ggplot(data = yelp_in, mapping = aes(x = rating)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Rating plotted against the number of reviews
ggplot(data = yelp_in, mapping = aes(x = rating, y = review_count)) +
  geom_point()
# Price levels: drop the businesses with no price, then map the remainder
# over the city outline.
yelp_dropna_price <- drop_na(yelp_in, price)

tm_shape(yelp_dropna_price) +
  tm_dots("price") +
  tm_shape(helen) +
  tm_borders()
Write-up: The data are much easier to interpret now that the food and local-services businesses included actually fall within the borders of Helen. It’s even clearer that they cluster around the main road that tourists frequent. I am surprised by how few businesses show up, especially local services. It makes me wonder whether local services are more commonly located outside the city limits, while food proprietors sit closer to the tourist area. I was interested in restaurant pricing, but there are a lot of NAs in that column. I chose to keep those rows for the main analysis and removed them only to investigate price. There is only one relatively expensive restaurant in Helen. Less expensive businesses overlap in the center of town, and prices get cheaper as you move away from downtown. Looking at business ratings, most businesses are rated highly, apart from those with a rating of 0. It does look like the businesses with higher ratings have been reviewed more frequently, though one business stands out: Hofer’s Bakery & Café has been reviewed four times more often than the next most-reviewed restaurant. I have been there, and it is great!