Prepare the yelp data downloaded for Mini-Assignment 1.

# Restore the objects saved in Yelpdata.RData into the current environment.
# NOTE(review): the code below presumably gets `combined_yelp` from this file
# -- confirm. The absolute path is machine-specific.
load("/Users/helenalindsay/Documents/Fall_23/CP8883/Yelpdata.RData")

Delete duplicated rows.

# Deduplicate on the business id rather than on the name: distinct businesses
# can share a name (e.g. several branches of one chain), and matching on name
# would silently keep only the first of them.
no_duplicates <- combined_yelp[!duplicated(combined_yelp$id), ]

Flatten nested columns that have multiple variables in one column.

concate_list <- function(x) {
  # Joins the category titles of one business into a single string.
  # x: a data frame with columns "alias" and "title" (one element of
  #    Yelp$categories).
  # Value: the "title" entries of x pasted together, separated by ", ".
  str_c(x[["title"]], collapse = ", ")
}

# Turn the nested Yelp result into a plain table:
#   1. jsonlite::flatten() unpacks the data-frame columns.
#   2. Every list-column is collapsed to one comma-separated string per row.
yelp_flat <- no_duplicates %>%
  jsonlite::flatten() %>%
  mutate(
    transactions = map_chr(transactions, str_c, collapse = ", "),
    location.display_address = map_chr(
      location.display_address, str_c, collapse = ", "
    ),
    categories = map_chr(categories, concate_list)
  )

Delete rows that have missing data in the coordinates variables. It’s okay to have NAs in other variables.

# Keep only rows where both coordinates are present; NAs in any other column
# are left alone.
yelp_dropna <- drop_na(
  yelp_flat,
  coordinates.longitude,
  coordinates.latitude
)

Delete rows that fall outside of the boundary of your choice

# EPSG code used for the point layer (4326 = WGS84 longitude/latitude)
epsg_id <- 4326

# Load tract boundary data
load("/Users/helenalindsay/Documents/Fall_23/CP8883/tract.RData")

# Split NAME on ", " into its tract, county and state parts
tract_split <- tract %>%
  separate(NAME, into = c("Tract", "County", "State"), sep = ", ")

# Converting yelp_dropna into a sf object
yelp_sf <- st_as_sf(
  yelp_dropna,
  coords = c("coordinates.longitude", "coordinates.latitude"),
  crs = epsg_id
)

# Converting census tract data to an sf object (Boulder County tracts only)
tract_sf <- st_sf(filter(tract_split, County %in% "Boulder County"))

# Matching the CRS for the yelp and census tract data.
# Assigning with `st_crs(x) <- value` only relabels the CRS; it does not move
# any coordinates. Tract boundaries recorded in a different CRS must therefore
# be reprojected with st_transform(). Relabeling is only appropriate when the
# object carries no CRS at all.
# yelp_sf needs no change: it was created with crs = epsg_id.
if (is.na(st_crs(tract_sf))) {
  tract_sf <- st_set_crs(tract_sf, epsg_id)
} else {
  tract_sf <- st_transform(tract_sf, epsg_id)
}

# Keep only the Yelp points that intersect the union of the Boulder County
# tract polygons.
yelp_in <- yelp_sf[
  st_union(filter(tract_sf, County %in% "Boulder County")), ,
  op = st_intersects
]

# Show the first five retained businesses as a table
yelp_in %>%
  head(5) %>%
  kable()
id alias name image_url is_closed url review_count categories rating transactions price phone display_phone distance business_category location.address1 location.address2 location.address3 location.city location.zip_code location.country location.state location.display_address geometry
OCzajJdQ6p5m82diXAGnRw carellis-of-boulder-boulder-3 Carelli’s Of Boulder https://s3-media2.fl.yelpcdn.com/bphoto/F11F8KuIZmfe01-iebuqFw/o.jpg FALSE https://www.yelp.com/biz/carellis-of-boulder-boulder-3?adjust_creative=2fwKSTvgAQf6j-1QQXYBxw&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=2fwKSTvgAQf6j-1QQXYBxw 296 Italian, Pizza, Sandwiches 4.0 delivery \[ |+13039389300 |(303) 938-9300 | 578.8628|Restaurants |645 30th St | | |Boulder |80303 |US |CO |645 30th St, Boulder, CO 80303 |POINT (-105.2534 39.9989) | |miKI4GfcsdjgPfAW3mOaxg |dark-horse-boulder |Dark Horse |https://s3-media4.fl.yelpcdn.com/bphoto/sVSI5qZ2FVMpRio7wiK-fQ/o.jpg |FALSE |https://www.yelp.com/biz/dark-horse-boulder?adjust_creative=2fwKSTvgAQf6j-1QQXYBxw&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=2fwKSTvgAQf6j-1QQXYBxw | 444|American (Traditional), Sports Bars | 3.5|delivery |\] +13034428162 (303) 442-8162 575.8988 Restaurants 2922 Baseline Rd Boulder 80303 US CO 2922 Baseline Rd, Boulder, CO 80303 POINT (-105.2552 39.99985)
OB67BseTZ6Vm1LkTdPkUlg taj-indian-cuisine-boulder Taj Indian Cuisine https://s3-media4.fl.yelpcdn.com/bphoto/9pr1ROXgbyLyOAFoKtOD-g/o.jpg FALSE https://www.yelp.com/biz/taj-indian-cuisine-boulder?adjust_creative=2fwKSTvgAQf6j-1QQXYBxw&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=2fwKSTvgAQf6j-1QQXYBxw 184 Indian, Buffets 4.0 pickup, delivery \[ |+13034945216 |(303) 494-5216 | 799.7593|Restaurants |2630 Baseline Rd | | |Boulder |80305 |US |CO |2630 Baseline Rd, Boulder, CO 80305 |POINT (-105.2615 40.00013) | |8_xaAQRUUczH71qZcTLfXQ |pho-kitchen-bar-and-grill-boulder |Pho Kitchen Bar and Grill |https://s3-media1.fl.yelpcdn.com/bphoto/lm__0AcV1LtxN8UdqBjIMA/o.jpg |FALSE |https://www.yelp.com/biz/pho-kitchen-bar-and-grill-boulder?adjust_creative=2fwKSTvgAQf6j-1QQXYBxw&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=2fwKSTvgAQf6j-1QQXYBxw | 121|Vietnamese | 4.0|pickup, delivery |\] +13038723903 (303) 872-3903 513.7881 Restaurants 2900 Baseline Rd Unit 3 Boulder 80303 US CO 2900 Baseline Rd, Unit 3, Boulder, CO 80303 POINT (-105.2557 39.99974)
IJegH5ORVHInQUrYAr0SAA may-wah-restaurant-boulder-3 May Wah Restaurant https://s3-media2.fl.yelpcdn.com/bphoto/gkkZzfQeG4bKnO59onyY3A/o.jpg FALSE https://www.yelp.com/biz/may-wah-restaurant-boulder-3?adjust_creative=2fwKSTvgAQf6j-1QQXYBxw&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=2fwKSTvgAQf6j-1QQXYBxw 109 Chinese, Vietnamese 4.0 delivery $$ +13034998225 (303) 499-8225 909.1617 Restaurants 2500 Baseline Rd Boulder 80305 US CO 2500 Baseline Rd, Boulder, CO 80305 POINT (-105.2615 39.99874)

Visualize the output

# Switch tmap to its "view" mode
tmap_mode("view")

# Tract outlines, reused as the background layer
base_map <- tm_shape(tract_sf) + tm_borders()

# Yelp points (yelp_in) colored by rating
yelp_map <- tm_shape(yelp_in) +
  tm_dots(col = "rating", palette = "RdYlBu") +
  tm_legend(show = TRUE)

# Stack the point layer on the tract outlines and display the result
final_map <- base_map + yelp_map
final_map

Extra: Data exploration

Review Count vs Rating

library(ggplot2)

# Scatter plot: one point per business, review count (x) against rating (y)
ggplot(yelp_in, aes(x = review_count, y = rating)) +
  geom_point() +
  labs(
    title = "Review Count vs. Rating",
    x = "Review Count",
    y = "Rating"
  )

Price map

# Keep only the businesses that report a price level
price_filtered <- yelp_in[which(!is.na(yelp_in$price)), ]

# Points colored by price level
yelp_map_price <- tm_shape(price_filtered) +
  tm_dots(col = "price", palette = "RdYlBu") +
  tm_legend(show = TRUE)

# Draw the price layer over the tract outlines and display it
price_map <- base_map + yelp_map_price
price_map

Review Count vs Price

# Scatter plot: price level (x) against review count (y), priced businesses only
ggplot(price_filtered, aes(x = price, y = review_count)) +
  geom_point() +
  labs(
    title = "Price vs. Review count",
    x = "Price",
    y = "Review count"
  )

Findings

After some data exploration and analysis, I found that higher ratings seemed to go along with higher review counts. This suggests that businesses that provide exceptional service, or food in this case, tend to attract more customer reviews, possibly due to customer satisfaction and a greater likelihood of customers sharing their positive experiences.

However, when it came to the price map, there wasn’t a significant concentration of high-priced businesses clustered in specific areas within Boulder County. Instead, pricier businesses appeared to be scattered throughout the county, suggesting that higher-priced businesses may not be limited to particular regions but are rather evenly distributed in Boulder.

Perhaps the most surprising finding was related to the number of reviews in relation to business prices. According to the scatter plot, there appeared to be a higher volume of reviews for businesses in the lower price ranges. This was intriguing because, combined with my first finding that higher ratings go along with more reviews, this last finding seems to point in the opposite direction. After discovering this trend, I attempted to visualize price vs. rating (see the plot below), but because both variables take only a few discrete values, it was difficult to determine a pattern. In the future, I believe that it would be valuable to gather numerical data, such as the average amount spent at a business, and analyze how it relates to the number of reviews. Uncovering the nuances of the relationships between price, review counts, and ratings could offer valuable insights for both businesses and consumers in Boulder County.

# Scatter plot: price level (x) against rating (y), priced businesses only
ggplot(price_filtered, aes(x = price, y = rating)) +
  geom_point() +
  labs(
    title = "Price vs. Rating",
    x = "Price",
    y = "Rating"
  )