# Load the saved Yelp API results; the file is expected to provide the
# `combined_yelp` data frame used on the next line.
# NOTE(review): this is an absolute, machine-specific path; a project-relative
# path would let the script run on other machines.
load("/Users/helenalindsay/Documents/Fall_23/CP8883/Yelpdata.RData")

# Remove repeated rows for the same business. The Yelp business `id` is the
# key here rather than `name`: separate locations of a chain share a name and
# would otherwise be collapsed into a single row.
no_duplicates <- combined_yelp[!duplicated(combined_yelp$id), ]
#' Join the category titles of one business into a single string
#'
#' @param x A data frame with columns "alias" and "title", i.e. one element
#'   of Yelp$categories.
#' @return A character vector holding the titles joined by ", ".
concate_list <- function(x) {
  str_c(x[["title"]], collapse = ", ")
}
# Build a flat, one-row-per-business table from the de-duplicated Yelp data.
yelp_flat <- no_duplicates %>%
  # Step 1: spread nested data-frame columns into ordinary columns.
  jsonlite::flatten() %>%
  # Step 2: turn each list-column into a plain character column by joining
  # the elements of every entry with ", ".
  mutate(
    transactions = map_chr(transactions, str_c, collapse = ", "),
    location.display_address = map_chr(
      location.display_address, str_c,
      collapse = ", "
    ),
    categories = map_chr(categories, concate_list)
  )
# Keep only rows where both the longitude and the latitude are present.
yelp_dropna <- drop_na(
  yelp_flat,
  coordinates.longitude,
  coordinates.latitude
)
# EPSG code (4326 = WGS84 longitude/latitude) shared by every layer below.
epsg_id <- 4326

# Load tract boundary data; the file is expected to provide `tract`.
# NOTE(review): this is an absolute, machine-specific path; a project-relative
# path would let the script run on other machines.
load("/Users/helenalindsay/Documents/Fall_23/CP8883/tract.RData")

# Split NAME on ", " into its tract, county, and state parts.
tract_split <- separate(
  tract, NAME,
  into = c("Tract", "County", "State"),
  sep = ", "
)

# Convert yelp_dropna into an sf point layer. The CRS is declared here, so the
# points need no further CRS assignment later on.
yelp_sf <- yelp_dropna %>%
  st_as_sf(
    coords = c("coordinates.longitude", "coordinates.latitude"),
    crs = epsg_id
  )

# Keep only the Boulder County tracts, as an sf object.
tract_sf <- tract_split %>%
  filter(County %in% c("Boulder County")) %>%
  st_sf()

# Bring the tracts into the same CRS as the Yelp points. Assigning with
# `st_crs<-` would only relabel the coordinates without reprojecting them, so
# transform instead; a CRS is assigned only when the tracts have none at all.
tract_sf <- if (is.na(st_crs(tract_sf))) {
  st_set_crs(tract_sf, epsg_id)
} else {
  st_transform(tract_sf, crs = epsg_id)
}

# Keep the Yelp points that intersect the union of the selected tracts.
# tract_sf is already limited to Boulder County, so it is not filtered again.
yelp_in <- yelp_sf[st_union(tract_sf), , op = st_intersects]
# Show the first five rows of yelp_in as a formatted table.
yelp_in %>%
  head(n = 5) %>%
  kable()
| id | alias | name | image_url | is_closed | url | review_count | categories | rating | transactions | price | phone | display_phone | distance | business_category | location.address1 | location.address2 | location.address3 | location.city | location.zip_code | location.country | location.state | location.display_address | geometry |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| OCzajJdQ6p5m82diXAGnRw | carellis-of-boulder-boulder-3 | Carelli’s Of Boulder | https://s3-media2.fl.yelpcdn.com/bphoto/F11F8KuIZmfe01-iebuqFw/o.jpg | FALSE | https://www.yelp.com/biz/carellis-of-boulder-boulder-3?adjust_creative=2fwKSTvgAQf6j-1QQXYBxw&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=2fwKSTvgAQf6j-1QQXYBxw | 296 | Italian, Pizza, Sandwiches | 4.0 | delivery | $$ | +13039389300 | (303) 938-9300 | 578.8628 | Restaurants | 645 30th St | | | Boulder | 80303 | US | CO | 645 30th St, Boulder, CO 80303 | POINT (-105.2534 39.9989) |
| miKI4GfcsdjgPfAW3mOaxg | dark-horse-boulder | Dark Horse | https://s3-media4.fl.yelpcdn.com/bphoto/sVSI5qZ2FVMpRio7wiK-fQ/o.jpg | FALSE | https://www.yelp.com/biz/dark-horse-boulder?adjust_creative=2fwKSTvgAQf6j-1QQXYBxw&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=2fwKSTvgAQf6j-1QQXYBxw | 444 | American (Traditional), Sports Bars | 3.5 | delivery | $$ | +13034428162 | (303) 442-8162 | 575.8988 | Restaurants | 2922 Baseline Rd | | | Boulder | 80303 | US | CO | 2922 Baseline Rd, Boulder, CO 80303 | POINT (-105.2552 39.99985) |
| OB67BseTZ6Vm1LkTdPkUlg | taj-indian-cuisine-boulder | Taj Indian Cuisine | https://s3-media4.fl.yelpcdn.com/bphoto/9pr1ROXgbyLyOAFoKtOD-g/o.jpg | FALSE | https://www.yelp.com/biz/taj-indian-cuisine-boulder?adjust_creative=2fwKSTvgAQf6j-1QQXYBxw&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=2fwKSTvgAQf6j-1QQXYBxw | 184 | Indian, Buffets | 4.0 | pickup, delivery | $$ | +13034945216 | (303) 494-5216 | 799.7593 | Restaurants | 2630 Baseline Rd | | | Boulder | 80305 | US | CO | 2630 Baseline Rd, Boulder, CO 80305 | POINT (-105.2615 40.00013) |
| 8_xaAQRUUczH71qZcTLfXQ | pho-kitchen-bar-and-grill-boulder | Pho Kitchen Bar and Grill | https://s3-media1.fl.yelpcdn.com/bphoto/lm__0AcV1LtxN8UdqBjIMA/o.jpg | FALSE | https://www.yelp.com/biz/pho-kitchen-bar-and-grill-boulder?adjust_creative=2fwKSTvgAQf6j-1QQXYBxw&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=2fwKSTvgAQf6j-1QQXYBxw | 121 | Vietnamese | 4.0 | pickup, delivery | $$ | +13038723903 | (303) 872-3903 | 513.7881 | Restaurants | 2900 Baseline Rd | Unit 3 | | Boulder | 80303 | US | CO | 2900 Baseline Rd, Unit 3, Boulder, CO 80303 | POINT (-105.2557 39.99974) |
| IJegH5ORVHInQUrYAr0SAA | may-wah-restaurant-boulder-3 | May Wah Restaurant | https://s3-media2.fl.yelpcdn.com/bphoto/gkkZzfQeG4bKnO59onyY3A/o.jpg | FALSE | https://www.yelp.com/biz/may-wah-restaurant-boulder-3?adjust_creative=2fwKSTvgAQf6j-1QQXYBxw&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=2fwKSTvgAQf6j-1QQXYBxw | 109 | Chinese, Vietnamese | 4.0 | delivery | $$ | +13034998225 | (303) 499-8225 | 909.1617 | Restaurants | 2500 Baseline Rd | | | Boulder | 80305 | US | CO | 2500 Baseline Rd, Boulder, CO 80305 | POINT (-105.2615 39.99874) |
# Switch tmap to its interactive "view" mode for the maps that follow.
tmap_mode("view")

# Base layer: outlines of the tracts in tract_sf.
base_map <- tm_shape(tract_sf) + tm_borders()

# Point layer: one dot per business in yelp_in, coloured by its rating, with
# the legend switched on.
yelp_map <- tm_shape(yelp_in) +
  tm_dots(col = "rating", palette = "RdYlBu") +
  tm_legend(show = TRUE)

# Stack the point layer on top of the tract outlines and display the result.
final_map <- base_map + yelp_map
final_map
library(ggplot2)
# Scatter plot: review count on the x-axis against rating on the y-axis,
# one point per business in yelp_in.
yelp_in %>%
  ggplot(mapping = aes(x = review_count, y = rating)) +
  geom_point() +
  labs(
    title = "Review Count vs. Rating",
    x = "Review Count",
    y = "Rating"
  )
# Keep only the businesses whose price is not missing.
price_filtered <- yelp_in[!is.na(yelp_in[["price"]]), ]

# Point layer coloured by price, with the legend switched on.
yelp_map_price <- tm_shape(price_filtered) +
  tm_dots(col = "price", palette = "RdYlBu") +
  tm_legend(show = TRUE)

# Draw the price dots over the tract outlines and display the map.
price_map <- base_map + yelp_map_price
price_map
# Scatter plot: price on the x-axis against review count on the y-axis,
# restricted to businesses with a known price.
price_filtered %>%
  ggplot(mapping = aes(x = price, y = review_count)) +
  geom_point() +
  labs(
    title = "Price vs. Review count",
    x = "Price",
    y = "Review count"
  )
After some data exploration and analysis, I found that higher ratings seemed to go along with larger review counts. This suggests that businesses that provide exceptional service, or food in this case, tend to attract more customer reviews, possibly because satisfied customers are more likely to share their positive experiences.
However, when it came to the price map, high-priced businesses were not significantly concentrated in specific areas within Boulder County. Instead, pricier businesses appeared to be scattered throughout the county, suggesting that higher-priced businesses may not be limited to particular regions but are rather evenly distributed across Boulder County.
Perhaps the most surprising finding was related to the number of reviews in relation to business prices. According to the scatter plot, there appeared to be a higher volume of reviews for businesses in the lower price ranges. This was intriguing because, when combined with my first finding (higher ratings go with more reviews), this last finding seems to suggest the opposite. After discovering this trend, I attempted to visualize price vs. rating, but because both variables are categorical, it was difficult to determine a pattern. In the future, I believe that it would be valuable to gather numerical data, such as the average amount spent at a business, and analyze how it relates to the number of reviews. Uncovering the nuances of the relationships between price, review counts, and ratings could offer valuable insights for both businesses and consumers in Boulder County.
# Scatter plot: price on the x-axis against rating on the y-axis,
# restricted to businesses with a known price.
price_filtered %>%
  ggplot(mapping = aes(x = price, y = rating)) +
  geom_point() +
  labs(
    title = "Price vs. Rating",
    x = "Price",
    y = "Rating"
  )