Load the Yelp dataset used in the previous assignment (Mini 1)

# Restore the objects saved at the end of Mini 1 into the current session
# (expected to provide `yelp_all_pois`, which the following steps use).
load(file = "/Users/seungjaelieu/GaTech Dropbox/Seung Jae Lieu/GT 2-1/3_CP8883 Intro to UA/yelp_suwanee_mini1_0920.RData")

Tidy the data

Delete duplicated rows by unique POI id (column “id”)

# Keep one row per POI: rows whose "id" repeats are dropped, and
# .keep_all = TRUE retains every other column of the kept row.
# `TRUE` is spelled out because `T` is an ordinary variable that can be reassigned.
yelp_all_pois_filtered <- distinct(yelp_all_pois, id, .keep_all = TRUE)

Flatten nested columns that have multiple variables in one column

  1. Transactions and location address
  2. Categories
# Expand nested data-frame columns into regular top-level columns
yelp_all_pois_flat <- yelp_all_pois_filtered %>%
  jsonlite::flatten()

# Collapse the 'transactions' and 'location.display_address' columns into
# one comma-separated string per POI.
yelp_all_pois_concat <- mutate(
  yelp_all_pois_flat,
  transactions = map_chr(
    transactions,
    function(parts) str_c(parts, collapse = ", ")
  ),
  location.display_address = map_chr(
    location.display_address,
    function(parts) str_c(parts, collapse = ", ")
  )
)

# Join the "title" entries of a single POI's categories into one
# comma-separated string.
#   x: one element of the 'categories' column, accessed via x[["title"]]
concate_list <- function(x) {
  str_c(x[["title"]], collapse = ", ")
}

# Replace the nested 'categories' column with one string of category titles
# per POI, built by concate_list().
yelp_all_pois_flat2 <- mutate(
  yelp_all_pois_concat,
  categories = map_chr(categories, concate_list)
)

Delete rows that have missing data in coordinates variable

# Keep only POIs that have BOTH coordinates. Latitude and longitude are both
# passed to st_as_sf() afterwards, so a row missing either one must be removed
# here rather than only rows with a missing latitude.
yelp_all_pois_flat2 <- yelp_all_pois_flat2 %>%
  filter(
    !is.na(coordinates.latitude),
    !is.na(coordinates.longitude)
  )

Delete rows that fall outside of the boundary of the city of Suwanee

# Build a point sf object from the cleaned POI table.
# Longitude/latitude are first copied into x/y, and those copies are the
# columns handed to st_as_sf() as coordinates (CRS: EPSG 4326).
yelp_sf <- yelp_all_pois_flat2 %>%
  mutate(
    x = coordinates.longitude,
    y = coordinates.latitude
  ) %>%
  st_as_sf(coords = c("x", "y"), crs = 4326)

# Fetch Georgia's place boundaries, keep the one named "Suwanee", and
# reproject it to the CRS of the POI layer so the two can be overlaid.
city <- tigris::places(state = "GA") %>%
  filter(NAME == "Suwanee") %>%
  st_transform(crs = st_crs(yelp_sf))

# POIs selected by spatially subsetting the point layer with the city polygon
yelp_within_city <- yelp_sf[city, ]

# Every remaining POI, i.e. those whose id was not selected above
yelp_without_city <- yelp_sf %>%
  filter(!(id %in% yelp_within_city$id))

Descriptive Analysis

# Number of rows in the raw dataset, before any cleaning
print(paste0("Before data wrangling: ", dim(yelp_all_pois)[1]))
## [1] "Before data wrangling: 173"
# Number of POIs that fall inside the Suwanee boundary
print(paste0("POIs located within the city: ", dim(yelp_within_city)[1]))
## [1] "POIs located within the city: 20"
# Interactive map: in-city POIs as dots coloured by `type`, with the city
# boundary drawn on top as a thick red outline.
# NOTE(review): `style = "quantile"` is a numeric class-interval setting;
# confirm it has any effect if `type` is a categorical column.
tmap_mode(mode = "view")
tm_shape(shp = yelp_within_city) +
  tm_dots(col = "type", style = "quantile") +
  tm_shape(shp = city) +
  tm_borders(col = "red", lwd = 3)
# Rating distribution: inside vs. outside the city boundary
summary(yelp_within_city[["rating"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.200   3.600   3.950   4.035   4.525   5.000
summary(yelp_without_city[["rating"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   2.800   3.400   3.259   4.200   5.000
# Price-tier counts: inside vs. outside
table(yelp_within_city[["price"]])
## 
##   $  $$ $$$ 
##   4   9   1
table(yelp_without_city[["price"]])
## 
##    $   $$  $$$ $$$$ 
##    2   22    2    2
# Review-count distribution: inside vs. outside
summary(yelp_within_city[["review_count"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00   40.75  135.50  138.50  209.25  393.00
summary(yelp_without_city[["review_count"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0    14.0    62.0   112.7   162.0   661.0

A Short Story

The raw dataset includes 173 POIs, comprising cafes and bars, but after data wrangling, only 69 remain. Of these, 20 POIs are located within the boundary of Suwanee city, including 8 cafes and the rest being bars. The average rating of POIs within the city is 4.04, while POIs outside the city have a lower average rating of 3.26. Additionally, POIs within the city boundary tend to receive more reviews, with a median review count of 135.5 compared to 62 for those outside the city. This suggests that POIs within the city are not only rated higher but also tend to attract more attention from customers. It indicates that businesses within the city limits may benefit from greater visibility or higher foot traffic compared to those outside. Despite this advantage, there is little difference in the distribution of price levels: among POIs with a listed price, the "$$" tier is by far the most common both inside the city (9 of 14) and outside it (22 of 28).