Library packages
Read Yelp data, rds file
# Load the Yelp business records from the RDS file in the working directory.
yelp_all <- readRDS(file = "yelp_data.rds")
Tidying data
Delete duplicated rows
# Keep one row per Yelp business `id`. `.keep_all = TRUE` retains the other
# columns of the first row seen for each id (spelled out as TRUE because the
# shorthand `T` is an ordinary variable that can be reassigned).
yelp_unique <- yelp_all %>%
  distinct(id, .keep_all = TRUE)
# Report how many rows the de-duplication removed.
glue::glue("Before dropping duplicated rows, there were {nrow(yelp_all)} rows. After dropping them, there are {nrow(yelp_unique)} rows") %>%
  print()
## Before dropping duplicated rows, there were 110 rows. After dropping them, there are 24 rows
Flatten nested columns that have multiple variables in one column
# Collapse the category titles of one business into a single string.
#
# x: one element of the Yelp `categories` list-column; it is indexed by
#    "title" (per the original note, a data frame with "alias" and "title").
# Returns: the titles joined with ", ", e.g. "Hotels, Venues & Event Spaces".
concate_list <- function(x) {
  str_c(x[["title"]], collapse = ", ")
}
# Turn the nested Yelp response into a flat table with one value per cell.
yelp_flat <- yelp_unique %>%
  # Step 1: spread data-frame columns into prefixed columns such as
  # `coordinates.latitude` and `location.city`.
  jsonlite::flatten() %>%
  # Step 2: collapse each list-column entry into one comma-separated string.
  mutate(
    transactions = map_chr(transactions, str_c, collapse = ", "),
    location.display_address = map_chr(
      location.display_address, str_c, collapse = ", "
    ),
    categories = map_chr(categories, concate_list)
  )
Missing values
# Count the missing values in every column of the flattened table.
vapply(yelp_flat, function(column) sum(is.na(column)), numeric(1))
## id alias
## 0 0
## name image_url
## 0 0
## is_closed url
## 0 0
## review_count categories
## 0 0
## rating transactions
## 0 0
## phone display_phone
## 0 0
## distance business_hours
## 0 0
## price coordinates.latitude
## 21 0
## coordinates.longitude location.address1
## 0 3
## location.address2 location.address3
## 14 13
## location.city location.zip_code
## 0 0
## location.country location.state
## 0 0
## location.display_address attributes.business_temp_closed
## 0 24
## attributes.waitlist_reservation
## 24
Drop missing
# Confirm that latitude and longitude are missing on exactly the same rows.
identical(
  is.na(yelp_flat[["coordinates.latitude"]]),
  is.na(yelp_flat[["coordinates.longitude"]])
)
## [1] TRUE
# Remove businesses that lack `coordinates.longitude` or
# `coordinates.latitude`; both columns are used as point coordinates later.
yelp_dropna <- drop_na(yelp_flat, coordinates.longitude, coordinates.latitude)
nrow(yelp_flat) %>% paste0("Before: ", .) %>% print()
## [1] "Before: 24"
# Row count once businesses without coordinates have been removed.
nrow(yelp_dropna) %>% paste0("After: ", .) %>% print()
## [1] "After: 24"
Remove points outside the city boundary
# Fetch the Georgia place boundaries with tigris, keep the place named
# "Canton", and reproject it to EPSG:4326 to match the Yelp points.
canton <- tigris::places(state = "GA", progress_bar = FALSE) %>%
  filter(NAME == "Canton") %>%
  st_transform(crs = 4326)
## Retrieving data for the year 2022
# Build point geometries from the longitude/latitude columns (EPSG:4326).
yelp_sf <- st_as_sf(
  yelp_dropna,
  coords = c("coordinates.longitude", "coordinates.latitude"),
  crs = 4326
)
# Spatial subset: indexing an sf object by another sf object keeps the rows
# whose geometry intersects it, i.e. the businesses inside the Canton boundary.
yelp_in <- yelp_sf[canton, ]
nrow(yelp_sf) %>% paste0("Before: ", .) %>% print()
## [1] "Before: 24"
# Number of businesses that fall inside the Canton boundary.
nrow(yelp_in) %>% paste0("After: ", .) %>% print()
## [1] "After: 15"
# Label each business by type from its category string. Conditions are tried
# in order, so a business listing both categories is labelled "Car Rental";
# one matching neither gets NA because there is no fallback branch.
yelp_in <- mutate(
  yelp_in,
  category_type = case_when(
    str_detect(categories, "Car Rental") ~ "Car Rental",
    str_detect(categories, "Hotels") ~ "Hotels"
  )
)
Save yelp_in
# Persist the Canton-only Yelp points at the path resolved by here().
saveRDS(object = yelp_in, file = here("yelp_data_in_canton.rds"))
Read canton city rds
# Load the stored Canton boundary and tract-level data from the working
# directory. This reassigns `canton`, replacing the object built from
# tigris::places() earlier in the script.
canton <- readRDS(file = "canton.rds")
tract_canton <- readRDS(file = "tract_canton.rds")
Spatial join data
# Put all three layers in the same CRS (EPSG:4326) before joining them.
canton <- st_transform(canton, 4326)
tract_canton <- st_transform(tract_canton, 4326)
yelp_in <- st_transform(yelp_in, 4326)
# Tract-centred join: tract geometries are kept and Yelp attributes are
# attached where a business intersects the tract; tracts without any
# business keep NA in the Yelp columns (visible in the preview below).
census_yelp <- tract_canton %>%
  st_join(yelp_in, join = st_intersects)
head(census_yelp)
## Simple feature collection with 6 features and 30 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -84.59237 ymin: 34.13413 xmax: -84.39161 ymax: 34.38819
## Geodetic CRS: WGS 84
## GEOID NAME hhincomeE
## 2 13057090707 Census Tract 907.07, Cherokee County, Georgia 85684
## 10 13057090703 Census Tract 907.03, Cherokee County, Georgia 43647
## 16 13057090301 Census Tract 903.01, Cherokee County, Georgia 126250
## 17 13057090603 Census Tract 906.03, Cherokee County, Georgia 86563
## 20 13057090102 Census Tract 901.02, Cherokee County, Georgia 75840
## 24 13057090710 Census Tract 907.10, Cherokee County, Georgia 100893
## hhincomeM id alias name image_url is_closed url review_count categories
## 2 15839 <NA> <NA> <NA> <NA> NA <NA> NA <NA>
## 10 13022 <NA> <NA> <NA> <NA> NA <NA> NA <NA>
## 16 32291 <NA> <NA> <NA> <NA> NA <NA> NA <NA>
## 17 23368 <NA> <NA> <NA> <NA> NA <NA> NA <NA>
## 20 9309 <NA> <NA> <NA> <NA> NA <NA> NA <NA>
## 24 5307 <NA> <NA> <NA> <NA> NA <NA> NA <NA>
## rating transactions phone display_phone distance business_hours price
## 2 NA <NA> <NA> <NA> NA NULL <NA>
## 10 NA <NA> <NA> <NA> NA NULL <NA>
## 16 NA <NA> <NA> <NA> NA NULL <NA>
## 17 NA <NA> <NA> <NA> NA NULL <NA>
## 20 NA <NA> <NA> <NA> NA NULL <NA>
## 24 NA <NA> <NA> <NA> NA NULL <NA>
## location.address1 location.address2 location.address3 location.city
## 2 <NA> <NA> <NA> <NA>
## 10 <NA> <NA> <NA> <NA>
## 16 <NA> <NA> <NA> <NA>
## 17 <NA> <NA> <NA> <NA>
## 20 <NA> <NA> <NA> <NA>
## 24 <NA> <NA> <NA> <NA>
## location.zip_code location.country location.state location.display_address
## 2 <NA> <NA> <NA> <NA>
## 10 <NA> <NA> <NA> <NA>
## 16 <NA> <NA> <NA> <NA>
## 17 <NA> <NA> <NA> <NA>
## 20 <NA> <NA> <NA> <NA>
## 24 <NA> <NA> <NA> <NA>
## attributes.business_temp_closed attributes.waitlist_reservation
## 2 NA NA
## 10 NA NA
## 16 NA NA
## 17 NA NA
## 20 NA NA
## 24 NA NA
## category_type geometry
## 2 <NA> MULTIPOLYGON (((-84.52836 3...
## 10 <NA> MULTIPOLYGON (((-84.51658 3...
## 16 <NA> MULTIPOLYGON (((-84.59071 3...
## 17 <NA> MULTIPOLYGON (((-84.48169 3...
## 20 <NA> MULTIPOLYGON (((-84.49538 3...
## 24 <NA> MULTIPOLYGON (((-84.55758 3...
# Business-centred join: attach the attributes of the intersecting census
# tract (GEOID, NAME, hhincomeE, hhincomeM) to each Yelp point.
yelp_census <- yelp_in %>%
  st_join(tract_canton, join = st_intersects)
yelp_census
## Simple feature collection with 15 features and 30 fields
## Geometry type: POINT
## Dimension: XY
## Bounding box: xmin: -84.50495 ymin: 34.22241 xmax: -84.4608 ymax: 34.25922
## Geodetic CRS: WGS 84
## First 10 features:
## id
## 10 Zo6haDZ-Rqp7ndDcqggZ0A
## 11 x_OWVk-E9TaZb0kaM4tkjg
## 12 A0VbxD6eVzuF0gId7Twd1g
## 13 SIlkALXHW60N32RqsEwhiQ
## 14 Gg60QjMkxds2C_IneLB3Tg
## 15 9aOx5PrQI4PloEqPTalCRA
## 16 aNJHteN5VjN6yJqasZlvDQ
## 17 bhl2MHAx3y4BiCUoQjQEcQ
## 18 4Qxv1uHOxiLQuj3ojAeeyw
## 19 P-nmWiDeLnd-fOHkFG2f5g
## alias
## 10 hampton-inn-atlanta-canton-canton
## 11 fairfield-inn-and-suites-canton-riverstone-parkway-canton
## 12 motel-6-canton-3
## 13 holiday-inn-express-and-suites-canton-canton-11
## 14 days-inn-by-wyndham-canton-canton
## 15 country-inn-and-suites-by-radisson-canton-ga-canton
## 16 comfort-inn-and-suites-canton
## 17 quality-inn-and-suites-canton-6
## 18 homestead-inn-canton
## 19 microtel-inn-and-suites-canton
## name
## 10 Hampton Inn Atlanta-Canton
## 11 Fairfield Inn & Suites Canton Riverstone Parkway
## 12 Motel 6
## 13 Holiday Inn Express & Suites Canton
## 14 Days Inn by Wyndham Canton
## 15 Country Inn & Suites by Radisson, Canton, GA
## 16 Comfort Inn & Suites
## 17 Quality Inn & Suites
## 18 Homestead Inn
## 19 Microtel Inn and Suites
## image_url
## 10 https://s3-media2.fl.yelpcdn.com/bphoto/IW-NmR7Y1cPEMakTf1fX0Q/o.jpg
## 11 https://s3-media1.fl.yelpcdn.com/bphoto/xA1UJo7SRah0LkC6-uB9uA/o.jpg
## 12 https://s3-media2.fl.yelpcdn.com/bphoto/P0TlJXC1z8_b2ErtG5l5-g/o.jpg
## 13 https://s3-media3.fl.yelpcdn.com/bphoto/tcoCEgRWmBRAuYV7_5oaqw/o.jpg
## 14 https://s3-media4.fl.yelpcdn.com/bphoto/v8bzjqYT-qcm__MkrB-BIA/o.jpg
## 15
## 16 https://s3-media1.fl.yelpcdn.com/bphoto/Moqniq7zuV-QzjgutLOjow/o.jpg
## 17 https://s3-media2.fl.yelpcdn.com/bphoto/qR4zvIRY_S-vmxWQMZcv8g/o.jpg
## 18
## 19
## is_closed
## 10 FALSE
## 11 FALSE
## 12 FALSE
## 13 FALSE
## 14 FALSE
## 15 FALSE
## 16 FALSE
## 17 FALSE
## 18 FALSE
## 19 FALSE
## url
## 10 https://www.yelp.com/biz/hampton-inn-atlanta-canton-canton?adjust_creative=yhocquUnsSVW3lwquDGWlQ&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=yhocquUnsSVW3lwquDGWlQ
## 11 https://www.yelp.com/biz/fairfield-inn-and-suites-canton-riverstone-parkway-canton?adjust_creative=yhocquUnsSVW3lwquDGWlQ&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=yhocquUnsSVW3lwquDGWlQ
## 12 https://www.yelp.com/biz/motel-6-canton-3?adjust_creative=yhocquUnsSVW3lwquDGWlQ&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=yhocquUnsSVW3lwquDGWlQ
## 13 https://www.yelp.com/biz/holiday-inn-express-and-suites-canton-canton-11?adjust_creative=yhocquUnsSVW3lwquDGWlQ&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=yhocquUnsSVW3lwquDGWlQ
## 14 https://www.yelp.com/biz/days-inn-by-wyndham-canton-canton?adjust_creative=yhocquUnsSVW3lwquDGWlQ&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=yhocquUnsSVW3lwquDGWlQ
## 15 https://www.yelp.com/biz/country-inn-and-suites-by-radisson-canton-ga-canton?adjust_creative=yhocquUnsSVW3lwquDGWlQ&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=yhocquUnsSVW3lwquDGWlQ
## 16 https://www.yelp.com/biz/comfort-inn-and-suites-canton?adjust_creative=yhocquUnsSVW3lwquDGWlQ&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=yhocquUnsSVW3lwquDGWlQ
## 17 https://www.yelp.com/biz/quality-inn-and-suites-canton-6?adjust_creative=yhocquUnsSVW3lwquDGWlQ&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=yhocquUnsSVW3lwquDGWlQ
## 18 https://www.yelp.com/biz/homestead-inn-canton?adjust_creative=yhocquUnsSVW3lwquDGWlQ&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=yhocquUnsSVW3lwquDGWlQ
## 19 https://www.yelp.com/biz/microtel-inn-and-suites-canton?adjust_creative=yhocquUnsSVW3lwquDGWlQ&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=yhocquUnsSVW3lwquDGWlQ
## review_count categories rating transactions phone
## 10 27 Hotels 3.4 +17703457400
## 11 1 Hotels, Venues & Event Spaces 5.0 +18558166193
## 12 17 Hotels 1.8 +17703458700
## 13 3 Venues & Event Spaces, Hotels 4.3 +16783291140
## 14 10 Hotels 2.6 +18003291073
## 15 8 Hotels 2.4 +14707612019
## 16 2 Hotels 1.0 +17704797300
## 17 2 Hotels 1.0 +17703451994
## 18 1 Hotels, Apartments 1.0 +17707200888
## 19 2 Hotels 1.0 +17703458700
## display_phone distance
## 10 (770) 345-7400 4937.944
## 11 (855) 816-6193 5368.347
## 12 (770) 345-8700 4972.966
## 13 (678) 329-1140 5273.145
## 14 (800) 329-1073 5297.867
## 15 (470) 761-2019 4979.686
## 16 (770) 479-7300 4930.712
## 17 (770) 345-1994 4712.304
## 18 (770) 720-0888 5146.432
## 19 (770) 345-8700 4972.966
## business_hours
## 10 TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0, 1, 2, 3, 4, 5, 6, REGULAR, TRUE
## 11 TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0, 1, 2, 3, 4, 5, 6, REGULAR, TRUE
## 12 TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0, 1, 2, 3, 4, 5, 6, REGULAR, TRUE
## 13 NULL
## 14 FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, 0900, 0900, 0900, 0900, 0900, 0900, 0900, 1700, 1700, 1700, 1700, 1700, 1700, 1700, 0, 1, 2, 3, 4, 5, 6, REGULAR, TRUE
## 15 TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0, 1, 2, 3, 4, 5, 6, REGULAR, TRUE
## 16 TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0, 1, 2, 3, 4, 5, 6, REGULAR, TRUE
## 17 TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0000, 0, 1, 2, 3, 4, 5, 6, REGULAR, TRUE
## 18 NULL
## 19 NULL
## price location.address1 location.address2 location.address3
## 10 $$ 710 Transit Ave <NA> <NA>
## 11 <NA> 120 Reinhardt College Pkwy <NA> <NA>
## 12 $$ 114 Riverpoint Pkwy <NA>
## 13 <NA> 145 Park Center Dr <NA> <NA>
## 14 $$ 101 Juniper Street
## 15 <NA> 705 Transit Ave <NA>
## 16 <NA> 713 Transit Avenue <NA> <NA>
## 17 <NA> 138 Keith Drive <NA> <NA>
## 18 <NA> 1615 Ball Ground Hwy
## 19 <NA> 114 River Pointe Pkwy <NA> <NA>
## location.city location.zip_code location.country location.state
## 10 Canton 30114 US GA
## 11 Canton 30114 US GA
## 12 Canton 30114 US GA
## 13 Canton 30114 US GA
## 14 Canton 30114 US GA
## 15 Canton 30114 US GA
## 16 Canton 30114 US GA
## 17 Canton 30114 US GA
## 18 Canton 30114 US GA
## 19 Canton 30114 US GA
## location.display_address attributes.business_temp_closed
## 10 710 Transit Ave, Canton, GA 30114 NA
## 11 120 Reinhardt College Pkwy, Canton, GA 30114 NA
## 12 114 Riverpoint Pkwy, Canton, GA 30114 NA
## 13 145 Park Center Dr, Canton, GA 30114 NA
## 14 101 Juniper Street, Canton, GA 30114 NA
## 15 705 Transit Ave, Canton, GA 30114 NA
## 16 713 Transit Avenue, Canton, GA 30114 NA
## 17 138 Keith Drive, Canton, GA 30114 NA
## 18 1615 Ball Ground Hwy, Canton, GA 30114 NA
## 19 114 River Pointe Pkwy, Canton, GA 30114 NA
## attributes.waitlist_reservation category_type GEOID
## 10 NA Hotels 13057090402
## 11 NA Hotels 13057090401
## 12 NA Hotels 13057090402
## 13 NA Hotels 13057090401
## 14 NA Hotels 13057090401
## 15 NA Hotels 13057090402
## 16 NA Hotels 13057090402
## 17 NA Hotels 13057090402
## 18 NA Hotels 13057090402
## 19 NA Hotels 13057090402
## NAME hhincomeE hhincomeM
## 10 Census Tract 904.02, Cherokee County, Georgia 75507 16083
## 11 Census Tract 904.01, Cherokee County, Georgia 42898 11295
## 12 Census Tract 904.02, Cherokee County, Georgia 75507 16083
## 13 Census Tract 904.01, Cherokee County, Georgia 42898 11295
## 14 Census Tract 904.01, Cherokee County, Georgia 42898 11295
## 15 Census Tract 904.02, Cherokee County, Georgia 75507 16083
## 16 Census Tract 904.02, Cherokee County, Georgia 75507 16083
## 17 Census Tract 904.02, Cherokee County, Georgia 75507 16083
## 18 Census Tract 904.02, Cherokee County, Georgia 75507 16083
## 19 Census Tract 904.02, Cherokee County, Georgia 75507 16083
## geometry
## 10 POINT (-84.47032 34.25411)
## 11 POINT (-84.48037 34.25348)
## 12 POINT (-84.4608 34.25818)
## 13 POINT (-84.47911 34.25257)
## 14 POINT (-84.48716 34.24516)
## 15 POINT (-84.47073 34.25462)
## 16 POINT (-84.46961 34.25473)
## 17 POINT (-84.46423 34.25514)
## 18 POINT (-84.46497 34.25922)
## 19 POINT (-84.46228 34.2585)
# Choropleth of the mean Yelp rating per census tract. Rows are grouped by
# GEOID so that each tract is drawn once, coloured by its average rating.
census_yelp %>%
  group_by(GEOID) %>%
  summarise(rating = mean(rating)) %>%
  tm_shape() +
  tm_polygons(col = "rating")
Findings:
Rating score and Review Count
The most frequent rating score is 1 (6 of the 15 businesses). The correlation between rating score and review count is weak (0.32), which indicates that rating score is not a strong predictor of review count.
# Frequency of each distinct rating value among the businesses inside Canton
# (despite its name, this table counts ratings, not reviews).
review_counts <- as.data.frame(table(yelp_in[["rating"]]))
review_counts
## Var1 Freq
## 1 0 1
## 2 1 6
## 3 1.7 1
## 4 1.8 1
## 5 2.4 1
## 6 2.6 1
## 7 2.9 1
## 8 3.4 1
## 9 4.3 1
## 10 5 1
# Scatter plot of rating (x) vs. review_count (y) for businesses in Canton
plot(yelp_in$rating, yelp_in$review_count,
     main = "Scatter Plot of Rating vs. Review Count",
     xlab = "Rating Score",
     ylab = "Review Count",
     pch = 16, # Type of point (solid circle)
     col = "blue") # Point color
# Trend line: least-squares fit of review_count on rating
abline(lm(review_count ~ rating, data = yelp_in), col = "red", lwd = 2)
# Correlation computed only over rows where both values are present
correlation <- cor(yelp_in$rating, yelp_in$review_count, use = "complete.obs")
# Anchor the label at the top-left of the data range. na.rm = TRUE mirrors the
# "complete.obs" handling above: without it a single NA would make the anchor
# NA and the label would silently not be drawn.
text(min(yelp_in$rating, na.rm = TRUE),
     max(yelp_in$review_count, na.rm = TRUE),
     paste("Correlation: ", round(correlation, 2)), pos = 4, col = "darkgreen")
Price
Only 3 businesses have a price value, and all of them are labeled as $$
# Tabulate the price labels; table() ignores NA by default, so businesses
# without a price do not appear in the counts.
price_counts <- as.data.frame(table(yelp_in[["price"]]))
price_counts
## Var1 Freq
## 1 $$ 3
Household income and rating
There is a weak negative relationship between household income and Yelp rating. This suggests that the household income of residents living in the area does not strongly influence the ratings given to hotels or car rental businesses. A likely explanation is that hotels and car rentals are predominantly used by non-residents such as tourists, business travelers, or other visitors. As a result, the ratings for these businesses are more reflective of the experiences and perceptions of these visitors than of the local population.
# Uses `yelp_census` (Yelp points joined to census tracts): `rating` is the
# Yelp rating and `hhincomeE` is the household income value of the tract each
# business falls in.
# Scatter plot to see the pattern
plot(yelp_census$rating, yelp_census$hhincomeE,
     main = "Scatter Plot of Household Income vs. Yelp Rating",
     xlab = "Yelp Rating",
     ylab = "Household Income",
     pch = 16, col = "blue")
# Add a linear regression line (least-squares fit of income on rating)
abline(lm(hhincomeE ~ rating, data = yelp_census), col = "red", lwd = 2)
# Correlation computed only over rows where both values are present
income_rating_correlation <- cor(yelp_census$rating, yelp_census$hhincomeE, use = "complete.obs")
# Place the label 1.2 rating units left of the highest rating, at the top of
# the income range. na.rm = TRUE mirrors the "complete.obs" handling above:
# without it a single NA would make the anchor NA and hide the label.
text(max(yelp_census$rating, na.rm = TRUE) - 1.2,
     max(yelp_census$hhincomeE, na.rm = TRUE),
     paste("Correlation: ", round(income_rating_correlation, 2)), pos = 4, col = "darkgreen")