## Load packages
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.7 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(tidycensus)
library(tmap)
library(here)
## here() starts at C:/Users/ikim302/Dropbox (GaTech)/2022-2023/2022 Fall/CP 8883 Intro to Urban Analytics/UA_module1
library(sf)
## Linking to GEOS 3.9.1, GDAL 3.3.2, PROJ 7.2.1; sf_use_s2() is TRUE
## Session info
sessionInfo()
## R version 4.2.1 (2022-06-23 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19042)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=English_United States.utf8
## [2] LC_CTYPE=English_United States.utf8
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.utf8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] sf_1.0-7 here_1.0.1 tmap_3.3-3 tidycensus_1.2.2
## [5] forcats_0.5.1 stringr_1.4.0 dplyr_1.0.9 purrr_0.3.4
## [9] readr_2.1.2 tidyr_1.2.0 tibble_3.1.7 ggplot2_3.3.6
## [13] tidyverse_1.3.1
##
## loaded via a namespace (and not attached):
## [1] fs_1.5.2 lubridate_1.8.0 RColorBrewer_1.1-3 httr_1.4.3
## [5] rprojroot_2.0.3 tools_4.2.1 backports_1.4.1 bslib_0.3.1
## [9] utf8_1.2.2 rgdal_1.5-32 R6_2.5.1 KernSmooth_2.23-20
## [13] DBI_1.1.3 colorspace_2.0-3 raster_3.5-21 withr_2.5.0
## [17] sp_1.5-0 tidyselect_1.1.2 leaflet_2.1.1 compiler_4.2.1
## [21] leafem_0.2.0 cli_3.3.0 rvest_1.0.2 xml2_1.3.3
## [25] sass_0.4.1 scales_1.2.0 classInt_0.4-7 proxy_0.4-27
## [29] rappdirs_0.3.3 digest_0.6.29 foreign_0.8-82 rmarkdown_2.14
## [33] base64enc_0.1-3 dichromat_2.0-0.1 pkgconfig_2.0.3 htmltools_0.5.2
## [37] dbplyr_2.2.1 fastmap_1.1.0 htmlwidgets_1.5.4 rlang_1.0.3
## [41] readxl_1.4.0 rstudioapi_0.13 jquerylib_0.1.4 generics_0.1.2
## [45] jsonlite_1.8.0 crosstalk_1.2.0 magrittr_2.0.3 Rcpp_1.0.8.3
## [49] munsell_0.5.0 fansi_1.0.3 abind_1.4-5 terra_1.5-34
## [53] lifecycle_1.0.1 stringi_1.7.6 leafsync_0.1.0 yaml_2.3.5
## [57] tmaptools_3.1-1 grid_4.2.1 maptools_1.1-4 parallel_4.2.1
## [61] crayon_1.5.1 lattice_0.20-45 haven_2.5.0 stars_0.5-5
## [65] hms_1.1.1 knitr_1.39 pillar_1.7.0 uuid_1.1-0
## [69] codetools_0.2-18 reprex_2.0.1 XML_3.99-0.10 glue_1.6.2
## [73] evaluate_0.15 modelr_0.1.8 png_0.1-7 vctrs_0.4.1
## [77] tzdb_0.3.0 cellranger_1.1.0 gtable_0.3.0 assertthat_0.2.1
## [81] xfun_0.31 lwgeom_0.2-8 broom_0.8.0 e1071_1.7-11
## [85] viridisLite_0.4.0 class_7.3-20 tigris_1.6.1 units_0.8-0
## [89] ellipsis_0.3.2
## Load Yelp API data (hair salons and banks in Gwinnett County)
yelp_all <- read_rds(here('data', 'yelp_all_mini_assignment1.rds'))
## Print
yelp_all
## # A tibble: 5,059 × 17
## id alias name image_url is_closed url review_count categories rating
## <chr> <chr> <chr> <chr> <lgl> <chr> <int> <list> <dbl>
## 1 nq4BI-L… salo… Salo… "https:/… FALSE http… 7 <df> 4
## 2 tycA5BG… marr… Marr… "https:/… FALSE http… 5 <df> 5
## 3 sqw62Y2… sl-b… SL B… "https:/… FALSE http… 20 <df> 3
## 4 Qg5l_0e… k-an… K & … "" FALSE http… 3 <df> 4
## 5 vtfHPJW… form… Form… "https:/… FALSE http… 54 <df> 5
## 6 fiGDS1b… hair… Hair… "https:/… FALSE http… 6 <df> 3.5
## 7 3roxcGB… zapi… Zapi… "https:/… FALSE http… 34 <df> 4.5
## 8 fS93bC5… sali… Sali… "https:/… FALSE http… 2 <df> 5
## 9 qH7a-aw… salo… Salo… "https:/… FALSE http… 3 <df> 5
## 10 ACkJmVq… new-… New … "https:/… FALSE http… 4 <df> 4
## # … with 5,049 more rows, and 8 more variables: coordinates <df[,2]>,
## # transactions <list>, price <chr>, location <df[,8]>, phone <chr>,
## # display_phone <chr>, distance <dbl>, Category <chr>
## Check duplicated values in the "id" column and delete duplicated rows
yelp_all <- yelp_all %>%
distinct(id, .keep_all = T) # from 5059 rows to 870 rows
## Print
yelp_all
## # A tibble: 870 × 17
## id alias name image_url is_closed url review_count categories rating
## <chr> <chr> <chr> <chr> <lgl> <chr> <int> <list> <dbl>
## 1 nq4BI-L… salo… Salo… "https:/… FALSE http… 7 <df> 4
## 2 tycA5BG… marr… Marr… "https:/… FALSE http… 5 <df> 5
## 3 sqw62Y2… sl-b… SL B… "https:/… FALSE http… 20 <df> 3
## 4 Qg5l_0e… k-an… K & … "" FALSE http… 3 <df> 4
## 5 vtfHPJW… form… Form… "https:/… FALSE http… 54 <df> 5
## 6 fiGDS1b… hair… Hair… "https:/… FALSE http… 6 <df> 3.5
## 7 3roxcGB… zapi… Zapi… "https:/… FALSE http… 34 <df> 4.5
## 8 fS93bC5… sali… Sali… "https:/… FALSE http… 2 <df> 5
## 9 qH7a-aw… salo… Salo… "https:/… FALSE http… 3 <df> 5
## 10 ACkJmVq… new-… New … "https:/… FALSE http… 4 <df> 4
## # … with 860 more rows, and 8 more variables: coordinates <df[,2]>,
## # transactions <list>, price <chr>, location <df[,8]>, phone <chr>,
## # display_phone <chr>, distance <dbl>, Category <chr>
## Define a custom function that takes a data frame in "categories" column in Yelp data and returns a character vector
concate_list <- function(x) {
# "x" is a data frame with columns "alias" and "title" from Yelp$categories
# The function returns a character vector
titles <- x$title %>% str_c(collapse = ", ")
return(titles)
}
## Flatten columns
yelp_all <- yelp_all %>%
# Flatten data-frame-columns ("coordinates" and "location")
jsonlite::flatten() %>%
relocate(coordinates.latitude, coordinates.longitude, .after = rating) %>%
relocate(location.address1, location.address2, location.address3, location.city,
location.zip_code, location.country, location.state, location.display_address, .after = price) %>%
as_tibble() %>%
# Flatten list-columns ("transactions" (empty), "location.display_address", and "categories")
mutate(transactions = transactions %>% map_chr(function(x) str_c(x, collapse = ", ")),
location.display_address = location.display_address %>% map_chr(function(x) str_c(x, collapse = ", ")),
categories = categories %>% map_chr(concate_list)
)
## Print
yelp_all
## # A tibble: 870 × 25
## id alias name image_url is_closed url review_count categories rating
## <chr> <chr> <chr> <chr> <lgl> <chr> <int> <chr> <dbl>
## 1 nq4BI-L… salo… Salo… "https:/… FALSE http… 7 Hair Salo… 4
## 2 tycA5BG… marr… Marr… "https:/… FALSE http… 5 Threading… 5
## 3 sqw62Y2… sl-b… SL B… "https:/… FALSE http… 20 Makeup Ar… 3
## 4 Qg5l_0e… k-an… K & … "" FALSE http… 3 Hair Salo… 4
## 5 vtfHPJW… form… Form… "https:/… FALSE http… 54 Makeup Ar… 5
## 6 fiGDS1b… hair… Hair… "https:/… FALSE http… 6 Hair Salo… 3.5
## 7 3roxcGB… zapi… Zapi… "https:/… FALSE http… 34 Hair Salo… 4.5
## 8 fS93bC5… sali… Sali… "https:/… FALSE http… 2 Hair Salo… 5
## 9 qH7a-aw… salo… Salo… "https:/… FALSE http… 3 Hair Salo… 5
## 10 ACkJmVq… new-… New … "https:/… FALSE http… 4 Hair Salo… 4
## # … with 860 more rows, and 16 more variables: coordinates.latitude <dbl>,
## # coordinates.longitude <dbl>, transactions <chr>, price <chr>,
## # location.address1 <chr>, location.address2 <chr>, location.address3 <chr>,
## # location.city <chr>, location.zip_code <chr>, location.country <chr>,
## # location.state <chr>, location.display_address <chr>, phone <chr>,
## # display_phone <chr>, distance <dbl>, Category <chr>
## Check missing values in "coordinates.latitude" and "coordinates.longitude" variables
yelp_all %>% select(coordinates.latitude, coordinates.longitude) %>% summary()
## coordinates.latitude coordinates.longitude
## Min. :28.66 Min. :-87.21
## 1st Qu.:33.90 1st Qu.:-84.17
## Median :33.96 Median :-84.09
## Mean :33.95 Mean :-84.09
## 3rd Qu.:34.02 3rd Qu.:-84.00
## Max. :35.29 Max. :-81.02
## No missing values in "coordinates.latitude" and "coordinates.longitude" -> Keep all rows
## Create an sf object (crs = 4326)
yelp_sf <- yelp_all %>%
mutate(x = .$coordinates.longitude,
y = .$coordinates.latitude) %>%
filter(!is.na(x) & !is.na(y)) %>% # No row is removed
st_as_sf(coords = c("x", "y"), crs = 4326)
## Download Census tracts in Gwinnett County
# Load my Census API key
load(here('data', 'census_api_key.RData')) # Load the API key save in my local machine
census_api_key(census.api.key) %>% suppressMessages() # Activate the key in this R session
rm(census.api.key) # Remove from the environment
# Download tract polygons in Gwinnett and change the coordinates to 4326
Gwinnett <- get_acs(geography = "tract",
state = "GA",
county = c("Gwinnett"),
variables = c(pop = "B01001_001",
hhincome = "B19019_001"
),
year = 2019,
survey = "acs5",
geometry = T,
output = "wide"
) %>%
suppressMessages() %>%
st_transform(crs = 4326)
##
|
| | 0%
|
|= | 1%
|
|= | 2%
|
|== | 3%
|
|=== | 4%
|
|==== | 5%
|
|==== | 6%
|
|===== | 7%
|
|====== | 8%
|
|====== | 9%
|
|======= | 10%
|
|======= | 11%
|
|======== | 12%
|
|========= | 13%
|
|========== | 14%
|
|=========== | 16%
|
|============ | 16%
|
|============ | 18%
|
|============== | 20%
|
|=============== | 21%
|
|================= | 24%
|
|================= | 25%
|
|=================== | 27%
|
|==================== | 29%
|
|====================== | 31%
|
|======================= | 32%
|
|======================== | 34%
|
|========================= | 36%
|
|=========================== | 38%
|
|============================ | 39%
|
|============================= | 42%
|
|============================== | 43%
|
|================================ | 45%
|
|================================= | 47%
|
|================================== | 49%
|
|=================================== | 50%
|
|===================================== | 52%
|
|====================================== | 54%
|
|======================================= | 56%
|
|======================================== | 57%
|
|========================================== | 60%
|
|=========================================== | 61%
|
|============================================ | 63%
|
|============================================= | 64%
|
|=============================================== | 67%
|
|================================================ | 68%
|
|================================================= | 70%
|
|================================================== | 72%
|
|==================================================== | 74%
|
|===================================================== | 75%
|
|====================================================== | 77%
|
|======================================================= | 79%
|
|========================================================= | 81%
|
|========================================================== | 82%
|
|=========================================================== | 85%
|
|============================================================ | 86%
|
|============================================================== | 88%
|
|=============================================================== | 90%
|
|================================================================ | 92%
|
|================================================================= | 93%
|
|=================================================================== | 95%
|
|==================================================================== | 97%
|
|===================================================================== | 99%
|
|======================================================================| 100%
## Check current data
tmap_mode('view') %>% suppressMessages()
tm_shape(yelp_sf) + tm_dots(col = 'Category') +
tm_shape(Gwinnett %>% st_union()) + tm_borders()
## Keep only cases fall inside Gwinnett County
yelp_sf <- yelp_sf[Gwinnett %>% st_union(), , op = st_intersects] # from 870 rows to 713 rows
## Check the output
tmap_mode('view') %>% suppressMessages()
tm_shape(yelp_sf) + tm_dots(col = 'Category') +
tm_shape(Gwinnett %>% st_union()) + tm_borders()
Have the findings changed before and after the tidying the
data?
Deleting the duplicated rows reduced the number of businesses
substantially; the number of hair salons decreased from 4230 to 712 and
that of banks from 829 to 158. Filtering out businesses falling outside
of Gwinnett county further reduced the numbers to 579 (hair salons) and
158 (banks). Considering the size and population (slightly over 0.9
million) of Gwinnett county, the number of businesses of each category
makes more sense now.
What’s the most frequent rating score? Does that seem to be
related with review_count?
The most frequent rating in the dataset (both categories
combined) is 5 (27.1%). Looking at the distribution by each category,
the most frequent rating of banks is 1 (23.1%) while the proportion of
banks with rating of 5 is just 9.0%. On the other hand, the trend is
reversed for hair salons. The most frequent rating of hair salons is 5
(31.3%) while the proportion of hair salons with rating of 1 is just
7.4%. As shown in the result of the code chunk below, hair salons are
more likely to be rated high but banks are more likely to be rated low.
I cannot be sure about whether the observed difference in the
distributions of ratings is directly related to the number of reviews.
However, the number of reviews are higher in general for hair salons
than for banks. The mean and median of review counts is 4 and 9.88 for
hair salons but those of banks are 3 and 4.17.
## Check the distribution of "rating" of businesses
cat("Distribution of rating of both categories combined (unit: %): \n", sep = "")
## Distribution of rating of both categories combined (unit: %):
round(yelp_sf %>%
pull(rating) %>%
table(useNA = 'ifany')/nrow(yelp_sf)*100,
1)
## .
## 1 1.5 2 2.5 3 3.5 4 4.5 5
## 10.4 2.7 6.5 9.8 8.1 10.4 15.6 9.5 27.1
cat("\n", "Distribution of rating of hair salons (unit: %): \n", sep = "")
##
## Distribution of rating of hair salons (unit: %):
round(yelp_sf %>%
filter(Category == "Hair Salons") %>%
pull(rating) %>%
table(useNA = 'ifany')/nrow(yelp_sf %>% filter(Category == "Hair Salons"))*100,
1)
## .
## 1 1.5 2 2.5 3 3.5 4 4.5 5
## 7.4 0.9 5.4 7.4 7.8 10.7 17.6 11.6 31.3
cat("\n", "Distribution of rating of banks (unit: %): \n", sep = "")
##
## Distribution of rating of banks (unit: %):
round(yelp_sf %>%
filter(Category == "Banks & Credit Unions") %>%
pull(rating) %>%
table(useNA = 'ifany')/nrow(yelp_sf %>% filter(Category == "Banks & Credit Unions"))*100,
1)
## .
## 1 1.5 2 2.5 3 3.5 4 4.5 5
## 23.1 10.4 11.2 20.1 9.7 9.0 6.7 0.7 9.0
## Check the summary of "review_count"
cat("Summary of review counts of banks: \n", sep = "")
## Summary of review counts of banks:
yelp_sf %>% pull(review_count) %>% summary()
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 4.000 8.809 9.000 228.000
cat("\n", "Summary of review counts of hair salons: \n", sep = "")
##
## Summary of review counts of hair salons:
yelp_sf %>% filter(Category == "Hair Salons") %>% pull(review_count) %>% summary()
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 4.000 9.883 9.500 228.000
cat("\n", "Summary of review counts of banks: \n", sep = "")
##
## Summary of review counts of banks:
yelp_sf %>% filter(Category == "Banks & Credit Unions") %>% pull(review_count) %>% summary()
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 3.000 4.172 5.750 16.000
## Check the spatial distribution of hair salons by price
tmap_mode('view') %>% suppressMessages()
tm_shape(yelp_sf %>% filter(Category == "Hair Salons")) + tm_dots(col = 'price') +
tm_shape(Gwinnett %>% st_union()) + tm_borders()