The required R packages and RDS files should be loaded to make the data available and ready for cleaning.
# Load Required Packages
library(tidycensus)
library(tidyverse)
library(tmap)
library(sf)
library(yelpr)
library(jsonlite)
library(dplyr)
# Load saved Yelp data
yelp_skincare_marietta <- readRDS("E:/Georgia_Tech_MCRP/2024_FALL/CP8883_Urban_Analytics/Assignment_2/yelp_data_skincare_marietta.rds")
yelp_spas_marietta <- readRDS("E:/Georgia_Tech_MCRP/2024_FALL/CP8883_Urban_Analytics/Assignment_2/yelp_data_spas_marietta.rds")
Before starting data cleaning process, we can use the sapply function to review the data class of each column in the two data sets. This task allows us to identify that there are three columns where the data is nested in each data set.
sapply(yelp_skincare_marietta, class) %>% print() # Data class review: Skincare, Marietta
## id alias name image_url is_closed
## "character" "character" "character" "character" "logical"
## url review_count categories rating coordinates
## "character" "integer" "list" "numeric" "data.frame"
## transactions location phone display_phone distance
## "list" "data.frame" "character" "character" "numeric"
## business_hours attributes price
## "list" "data.frame" "character"
sapply(yelp_spas_marietta, class) %>% print() # Data class review: Day Spa, Marietta
## id alias name image_url is_closed
## "character" "character" "character" "character" "logical"
## url review_count categories rating coordinates
## "character" "integer" "list" "numeric" "data.frame"
## transactions price location phone display_phone
## "list" "character" "data.frame" "character" "character"
## distance business_hours attributes
## "numeric" "list" "data.frame"
Accordingly, we may use the flatten function from jsonlite package in R to flatten nested columns. Due to this task, the structure of two data sets would be simplified while they respectively have 29 columns.
yelp_skincare_marietta_flat <- yelp_skincare_marietta %>% # Flatten nested columns: Skincare, Marietta
jsonlite::flatten()
sapply(yelp_skincare_marietta_flat, class) %>% print() # Data class review: Skincare, Marietta
## id alias
## "character" "character"
## name image_url
## "character" "character"
## is_closed url
## "logical" "character"
## review_count categories
## "integer" "list"
## rating transactions
## "numeric" "list"
## phone display_phone
## "character" "character"
## distance business_hours
## "numeric" "list"
## price coordinates.latitude
## "character" "numeric"
## coordinates.longitude location.address1
## "numeric" "character"
## location.address2 location.address3
## "character" "character"
## location.city location.zip_code
## "character" "character"
## location.country location.state
## "character" "character"
## location.display_address attributes.business_temp_closed
## "list" "logical"
## attributes.menu_url attributes.waitlist_reservation
## "character" "logical"
## attributes.open24_hours
## "logical"
yelp_spas_marietta_flat <- yelp_spas_marietta %>% # Flatten nested columns: Day Spa, Marietta
jsonlite::flatten()
sapply(yelp_spas_marietta_flat, class) %>% print() # Data class review: Day Spa, Marietta
## id alias
## "character" "character"
## name image_url
## "character" "character"
## is_closed url
## "logical" "character"
## review_count categories
## "integer" "list"
## rating transactions
## "numeric" "list"
## price phone
## "character" "character"
## display_phone distance
## "character" "numeric"
## business_hours coordinates.latitude
## "list" "numeric"
## coordinates.longitude location.address1
## "numeric" "character"
## location.address2 location.address3
## "character" "character"
## location.city location.zip_code
## "character" "character"
## location.country location.state
## "character" "character"
## location.display_address attributes.business_temp_closed
## "list" "logical"
## attributes.menu_url attributes.waitlist_reservation
## "character" "logical"
## attributes.open24_hours
## "logical"
Next, we will move on to removing duplicated rows from two data sets by using distinct function. In this case, neither of the two data sets has duplicated rows, therefore, there are no modifications made to the data sets.
yelp_skincare_marietta_flat_duplicate <- yelp_skincare_marietta_flat %>% # Drop duplicated rows
distinct()
yelp_spas_marietta_flat_duplicate <- yelp_spas_marietta_flat %>% # Drop duplicated rows
distinct()
Moving forward, we will delete out rows that does not have coordinate information from the data sets. In this case, all rows have coordinate information, therefore, there are no modifications made to the data sets.
yelp_spas_marietta_flat_duplicate %>% # Drop rows with no coordinates values
map_dbl(., function(x) sum(is.na(x)))
yelp_skincare_marietta_flat_duplicate %>% # Drop rows with no coordinates values
map_dbl(., function(x) sum(is.na(x)))
Lastly, we will drop rows that are located outside of the Marietta city limits. This step will result in a significant reduction in the number of businesses included in the data sets (Skincare: 538 -> 158, Day Spa: 180 -> 50). After completing this task, we can conduct a visual inspection to confirm that all businesses are located within the city limits.
marietta <- tigris::places('GA') %>%# load geometry of Marietta
filter(NAME == 'Marietta') %>%
st_transform(4326)
yelp_skincare_flat_duplicate_sf <- yelp_skincare_marietta_flat_duplicate %>% # convert dataset to spatial object
st_as_sf(coords=c("coordinates.longitude", "coordinates.latitude"),
crs = 4326)
yelp_skincare_clean <- yelp_skincare_flat_duplicate_sf[marietta, ] # sf subsets
print(paste0("Before: ", nrow(yelp_skincare_flat_duplicate_sf))) # print results
## [1] "Before: 538"
print(paste0("After: ", nrow(yelp_skincare_clean)))
## [1] "After: 158"
tmap_mode("view") # activate visualization
## tmap mode set to interactive viewing
tm_shape(yelp_skincare_clean) + tm_dots(col = "review_count") + # visualize the result
tm_shape(marietta) + tm_borders()
yelp_spas_flat_duplicate_sf <- yelp_spas_marietta_flat_duplicate %>% # convert dataset to spatial object
st_as_sf(coords=c("coordinates.longitude", "coordinates.latitude"),
crs = 4326)
yelp_spas_clean <- yelp_spas_flat_duplicate_sf[marietta, ] # sf subsets
print(paste0("Before: ", nrow(yelp_spas_flat_duplicate_sf))) # print results
## [1] "Before: 180"
print(paste0("After: ", nrow(yelp_spas_clean)))
## [1] "After: 50"
tm_shape(yelp_spas_clean) + tm_dots(col = "review_count") + # visualize the result
tm_shape(marietta) + tm_borders()
Before data cleaning, the analysis shows that there are 538 skincare businesses and 180 day spa businesses in Marietta. However, after data cleaning, the number of skincare businesses (158) and day spa businesses (50) have substantially reduced. Although no duplicated rows or rows lacking coordinate information were found, a significant number of skin and day spa businesses outside of the city boundary have been successfully identified and dropped throughout the data cleaning process.
By using skimr package, we can easily identify that what is the most frequent rating score along with its relationship with the number of review count.
library(skimr)
## Warning: package 'skimr' was built under R version 4.3.3
For the skincare sector, most businesses have fewer than 12 reviews, while their ratings tend to be concentrated at both ends of the scale (1 - 5).
skim(yelp_skincare_clean) # summary statistics
## Warning: Couldn't find skimmers for class: sfc_POINT, sfc; No user-defined
## `sfl` provided. Falling back to `character`.
| Name | yelp_skincare_clean |
| Number of rows | 158 |
| Number of columns | 28 |
| _______________________ | |
| Column type frequency: | |
| character | 17 |
| list | 4 |
| logical | 4 |
| numeric | 3 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| id | 0 | 1.00 | 22 | 22 | 0 | 31 | 0 |
| alias | 0 | 1.00 | 19 | 62 | 0 | 31 | 0 |
| name | 0 | 1.00 | 10 | 51 | 0 | 31 | 0 |
| image_url | 0 | 1.00 | 0 | 68 | 37 | 25 | 0 |
| url | 0 | 1.00 | 176 | 219 | 0 | 31 | 0 |
| phone | 0 | 1.00 | 0 | 12 | 12 | 30 | 0 |
| display_phone | 0 | 1.00 | 0 | 14 | 12 | 30 | 0 |
| price | 150 | 0.05 | 2 | 2 | 0 | 1 | 0 |
| location.address1 | 13 | 0.92 | 13 | 30 | 0 | 24 | 0 |
| location.address2 | 54 | 0.66 | 0 | 11 | 29 | 15 | 0 |
| location.address3 | 19 | 0.88 | 0 | 0 | 139 | 1 | 0 |
| location.city | 0 | 1.00 | 8 | 8 | 0 | 1 | 0 |
| location.zip_code | 0 | 1.00 | 5 | 5 | 0 | 6 | 0 |
| location.country | 0 | 1.00 | 2 | 2 | 0 | 1 | 0 |
| location.state | 0 | 1.00 | 2 | 2 | 0 | 1 | 0 |
| attributes.menu_url | 119 | 0.25 | 0 | 58 | 4 | 7 | 0 |
| geometry | 0 | 1.00 | 20 | 38 | 0 | 31 | 0 |
Variable type: list
| skim_variable | n_missing | complete_rate | n_unique | min_length | max_length |
|---|---|---|---|---|---|
| categories | 0 | 1 | 28 | 2 | 2 |
| transactions | 0 | 1 | 1 | 0 | 0 |
| business_hours | 0 | 1 | 24 | 0 | 3 |
| location.display_address | 0 | 1 | 29 | 1 | 3 |
Variable type: logical
| skim_variable | n_missing | complete_rate | mean | count |
|---|---|---|---|---|
| is_closed | 0 | 1.00 | 0 | FAL: 158 |
| attributes.business_temp_closed | 158 | 0.00 | NaN | : |
| attributes.waitlist_reservation | 152 | 0.04 | 1 | TRU: 6 |
| attributes.open24_hours | 158 | 0.00 | NaN | : |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| review_count | 0 | 1 | 1.33 | 2.61 | 0.00 | 0.00 | 0.00 | 1.00 | 12.00 | ▇▁▁▁▁ |
| rating | 0 | 1 | 1.85 | 2.33 | 0.00 | 0.00 | 0.00 | 5.00 | 5.00 | ▇▁▁▁▅ |
| distance | 0 | 1 | 2118.85 | 930.87 | 224.05 | 1500.37 | 1903.36 | 2742.43 | 4232.26 | ▂▇▅▃▂ |
Similarly, for the Day Spa sector, most businesses have fewer than 10 reviews, while the distribution of ratings is positively skewed.
skim(yelp_spas_clean) # summary statistics
## Warning: Couldn't find skimmers for class: sfc_POINT, sfc; No user-defined
## `sfl` provided. Falling back to `character`.
| Name | yelp_spas_clean |
| Number of rows | 50 |
| Number of columns | 28 |
| _______________________ | |
| Column type frequency: | |
| character | 17 |
| list | 4 |
| logical | 4 |
| numeric | 3 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| id | 0 | 1.00 | 22 | 22 | 0 | 11 | 0 |
| alias | 0 | 1.00 | 17 | 34 | 0 | 11 | 0 |
| name | 0 | 1.00 | 9 | 23 | 0 | 11 | 0 |
| image_url | 0 | 1.00 | 68 | 68 | 0 | 11 | 0 |
| url | 0 | 1.00 | 174 | 191 | 0 | 11 | 0 |
| price | 33 | 0.34 | 2 | 3 | 0 | 2 | 0 |
| phone | 0 | 1.00 | 0 | 12 | 5 | 11 | 0 |
| display_phone | 0 | 1.00 | 0 | 14 | 5 | 11 | 0 |
| location.address1 | 5 | 0.90 | 13 | 20 | 0 | 10 | 0 |
| location.address2 | 10 | 0.80 | 0 | 9 | 8 | 8 | 0 |
| location.address3 | 8 | 0.84 | 0 | 0 | 42 | 1 | 0 |
| location.city | 0 | 1.00 | 8 | 8 | 0 | 1 | 0 |
| location.zip_code | 0 | 1.00 | 5 | 5 | 0 | 4 | 0 |
| location.country | 0 | 1.00 | 2 | 2 | 0 | 1 | 0 |
| location.state | 0 | 1.00 | 2 | 2 | 0 | 1 | 0 |
| attributes.menu_url | 47 | 0.06 | 41 | 41 | 0 | 1 | 0 |
| geometry | 0 | 1.00 | 22 | 38 | 0 | 11 | 0 |
Variable type: list
| skim_variable | n_missing | complete_rate | n_unique | min_length | max_length |
|---|---|---|---|---|---|
| categories | 0 | 1 | 11 | 2 | 2 |
| transactions | 0 | 1 | 1 | 0 | 0 |
| business_hours | 0 | 1 | 11 | 0 | 3 |
| location.display_address | 0 | 1 | 11 | 1 | 3 |
Variable type: logical
| skim_variable | n_missing | complete_rate | mean | count |
|---|---|---|---|---|
| is_closed | 0 | 1 | 0 | FAL: 50 |
| attributes.business_temp_closed | 50 | 0 | NaN | : |
| attributes.waitlist_reservation | 50 | 0 | NaN | : |
| attributes.open24_hours | 50 | 0 | NaN | : |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| review_count | 0 | 1 | 11.62 | 20.52 | 0.00 | 0.00 | 2.00 | 7.00 | 56.00 | ▇▁▁▁▂ |
| rating | 0 | 1 | 2.66 | 2.16 | 0.00 | 0.00 | 2.80 | 5.00 | 5.00 | ▇▂▂▂▇ |
| distance | 0 | 1 | 2204.16 | 737.28 | 647.05 | 1614.07 | 2015.64 | 2686.83 | 4232.26 | ▁▇▅▃▁ |
Lastly, we can examine whether any patterns have emerged between household income and Yelp ratings. To do this task, we will load household income data by census tract from ACS 5-year estimate 2021 database.
# Load the geometry of Census Tracts within Cobb County from ACS
tract <- get_acs(geography = "tract",
state = "GA",
county = c("Cobb"), # Marietta is within Cobb County
variables = c(hhincome = 'B19019_001'),
year = 2021,
survey = "acs5",
geometry = TRUE,
output = "wide") %>%
rename(hhincome = hhincomeE) %>%
st_transform(4326)
tract_marietta <- tract[marietta, ] # load geometry of census tracts within marietta
After loading the census data, we can left join yelp data to census data.
skincare_tract <- st_join(tract_marietta, yelp_skincare_clean, join = st_intersects) # join yelp data into census tract data: skincare
spas_tract <- st_join(tract_marietta, yelp_spas_clean, join = st_intersects) # join yelp data into census tract data: day spa
skincare_tract <- skincare_tract %>% # transform dataset to a spatial object
st_as_sf(coords=c("coordinates.longitude", "coordinates.latitude"),
crs = 4326)
spas_tract <- spas_tract %>% # transform dataset to a spatial object
st_as_sf(coords=c("coordinates.longitude", "coordinates.latitude"),
crs = 4326)
Next, we will visualize the map of Marietta by census tracts, using color coding to represent the average rating scores of businesses within each tract. From this task, we can conclude that for skincare sector, average rating of between 0-1 has the most census tracts (6).
tm_shape(skincare_tract %>% group_by(GEOID) %>% summarise(rating=mean(rating))) + # visualize the result
tm_polygons(col = "rating")
For Day Spa sector, average rating of between 0-1, 3-4, and 4-5 respectively has two census tracts.
tm_shape(spas_tract %>% group_by(GEOID) %>% summarise(rating=mean(rating))) + # visualize the result
tm_polygons(col = "rating")
Lastly, we can visualize the locations of two businesses in Marietta by color coding each points by household income. To do so, we need to join census tract data into yelp data.
tract_skincare <- st_join(yelp_skincare_clean, tract_marietta, join = st_intersects) # join census tract data into yelp data: skincare
tract_spas <- st_join(yelp_spas_clean, tract_marietta, join = st_intersects) # join census tract data into yelp data: day spa
From this task, we can conclude that skincare businesses in Marietta located in census tracts with lower household incomes tend to receive higher reviews.
tm_shape(tract_skincare) + tm_dots("hhincome") + # visualize the result
tm_shape(marietta) + tm_borders()
On the other hand, for day spa businesses in Marietta, those located in census tracts with higher household incomes tend to receive higher reviews.
tm_shape(tract_spas) + tm_dots("hhincome") + # visualize the result
tm_shape(marietta) + tm_borders()