Load packages and print the session info

## Load packages
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.7     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(tidycensus)
library(tmap)
library(here)
## here() starts at C:/Users/ikim302/Dropbox (GaTech)/2022-2023/2022 Fall/CP 8883 Intro to Urban Analytics/UA_module1
library(sf)
## Linking to GEOS 3.9.1, GDAL 3.3.2, PROJ 7.2.1; sf_use_s2() is TRUE
## Session info
sessionInfo()
## R version 4.2.1 (2022-06-23 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19042)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] sf_1.0-7         here_1.0.1       tmap_3.3-3       tidycensus_1.2.2
##  [5] forcats_0.5.1    stringr_1.4.0    dplyr_1.0.9      purrr_0.3.4     
##  [9] readr_2.1.2      tidyr_1.2.0      tibble_3.1.7     ggplot2_3.3.6   
## [13] tidyverse_1.3.1 
## 
## loaded via a namespace (and not attached):
##  [1] fs_1.5.2           lubridate_1.8.0    RColorBrewer_1.1-3 httr_1.4.3        
##  [5] rprojroot_2.0.3    tools_4.2.1        backports_1.4.1    bslib_0.3.1       
##  [9] utf8_1.2.2         rgdal_1.5-32       R6_2.5.1           KernSmooth_2.23-20
## [13] DBI_1.1.3          colorspace_2.0-3   raster_3.5-21      withr_2.5.0       
## [17] sp_1.5-0           tidyselect_1.1.2   leaflet_2.1.1      compiler_4.2.1    
## [21] leafem_0.2.0       cli_3.3.0          rvest_1.0.2        xml2_1.3.3        
## [25] sass_0.4.1         scales_1.2.0       classInt_0.4-7     proxy_0.4-27      
## [29] rappdirs_0.3.3     digest_0.6.29      foreign_0.8-82     rmarkdown_2.14    
## [33] base64enc_0.1-3    dichromat_2.0-0.1  pkgconfig_2.0.3    htmltools_0.5.2   
## [37] dbplyr_2.2.1       fastmap_1.1.0      htmlwidgets_1.5.4  rlang_1.0.3       
## [41] readxl_1.4.0       rstudioapi_0.13    jquerylib_0.1.4    generics_0.1.2    
## [45] jsonlite_1.8.0     crosstalk_1.2.0    magrittr_2.0.3     Rcpp_1.0.8.3      
## [49] munsell_0.5.0      fansi_1.0.3        abind_1.4-5        terra_1.5-34      
## [53] lifecycle_1.0.1    stringi_1.7.6      leafsync_0.1.0     yaml_2.3.5        
## [57] tmaptools_3.1-1    grid_4.2.1         maptools_1.1-4     parallel_4.2.1    
## [61] crayon_1.5.1       lattice_0.20-45    haven_2.5.0        stars_0.5-5       
## [65] hms_1.1.1          knitr_1.39         pillar_1.7.0       uuid_1.1-0        
## [69] codetools_0.2-18   reprex_2.0.1       XML_3.99-0.10      glue_1.6.2        
## [73] evaluate_0.15      modelr_0.1.8       png_0.1-7          vctrs_0.4.1       
## [77] tzdb_0.3.0         cellranger_1.1.0   gtable_0.3.0       assertthat_0.2.1  
## [81] xfun_0.31          lwgeom_0.2-8       broom_0.8.0        e1071_1.7-11      
## [85] viridisLite_0.4.0  class_7.3-20       tigris_1.6.1       units_0.8-0       
## [89] ellipsis_0.3.2

Load downloaded files frome mini assignment 1

## Load Yelp API data (hair salons and banks in Gwinnett County)
yelp_all <- read_rds(here('data', 'yelp_all_mini_assignment1.rds'))
## Print
yelp_all
## # A tibble: 5,059 × 17
##    id       alias name  image_url is_closed url   review_count categories rating
##    <chr>    <chr> <chr> <chr>     <lgl>     <chr>        <int> <list>      <dbl>
##  1 nq4BI-L… salo… Salo… "https:/… FALSE     http…            7 <df>          4  
##  2 tycA5BG… marr… Marr… "https:/… FALSE     http…            5 <df>          5  
##  3 sqw62Y2… sl-b… SL B… "https:/… FALSE     http…           20 <df>          3  
##  4 Qg5l_0e… k-an… K & … ""        FALSE     http…            3 <df>          4  
##  5 vtfHPJW… form… Form… "https:/… FALSE     http…           54 <df>          5  
##  6 fiGDS1b… hair… Hair… "https:/… FALSE     http…            6 <df>          3.5
##  7 3roxcGB… zapi… Zapi… "https:/… FALSE     http…           34 <df>          4.5
##  8 fS93bC5… sali… Sali… "https:/… FALSE     http…            2 <df>          5  
##  9 qH7a-aw… salo… Salo… "https:/… FALSE     http…            3 <df>          5  
## 10 ACkJmVq… new-… New … "https:/… FALSE     http…            4 <df>          4  
## # … with 5,049 more rows, and 8 more variables: coordinates <df[,2]>,
## #   transactions <list>, price <chr>, location <df[,8]>, phone <chr>,
## #   display_phone <chr>, distance <dbl>, Category <chr>

Tidy data

Delete duplicated rows (from 5059 rows to 870 rows)
## Check duplicated values in the "id" column and delete duplicated rows
yelp_all <- yelp_all %>% 
  distinct(id, .keep_all = T)  # from 5059 rows to 870 rows
## Print
yelp_all
## # A tibble: 870 × 17
##    id       alias name  image_url is_closed url   review_count categories rating
##    <chr>    <chr> <chr> <chr>     <lgl>     <chr>        <int> <list>      <dbl>
##  1 nq4BI-L… salo… Salo… "https:/… FALSE     http…            7 <df>          4  
##  2 tycA5BG… marr… Marr… "https:/… FALSE     http…            5 <df>          5  
##  3 sqw62Y2… sl-b… SL B… "https:/… FALSE     http…           20 <df>          3  
##  4 Qg5l_0e… k-an… K & … ""        FALSE     http…            3 <df>          4  
##  5 vtfHPJW… form… Form… "https:/… FALSE     http…           54 <df>          5  
##  6 fiGDS1b… hair… Hair… "https:/… FALSE     http…            6 <df>          3.5
##  7 3roxcGB… zapi… Zapi… "https:/… FALSE     http…           34 <df>          4.5
##  8 fS93bC5… sali… Sali… "https:/… FALSE     http…            2 <df>          5  
##  9 qH7a-aw… salo… Salo… "https:/… FALSE     http…            3 <df>          5  
## 10 ACkJmVq… new-… New … "https:/… FALSE     http…            4 <df>          4  
## # … with 860 more rows, and 8 more variables: coordinates <df[,2]>,
## #   transactions <list>, price <chr>, location <df[,8]>, phone <chr>,
## #   display_phone <chr>, distance <dbl>, Category <chr>
Flatten nested columns
## Define a custom function that takes a data frame in "categories" column in Yelp data and returns a character vector
concate_list <- function(x) {
  # "x" is a data frame with columns "alias" and "title" from  Yelp$categories
  # The function returns a character vector
  titles <- x$title %>% str_c(collapse = ", ") 
  return(titles)
}

## Flatten columns
yelp_all <- yelp_all %>% 
  # Flatten data-frame-columns ("coordinates" and "location")
  jsonlite::flatten() %>% 
  relocate(coordinates.latitude, coordinates.longitude, .after = rating) %>% 
  relocate(location.address1, location.address2, location.address3, location.city, 
           location.zip_code, location.country, location.state, location.display_address, .after = price) %>% 
  as_tibble() %>%
  # Flatten list-columns ("transactions" (empty), "location.display_address", and "categories")
  mutate(transactions = transactions %>% map_chr(function(x) str_c(x, collapse = ", ")),
         location.display_address = location.display_address %>% map_chr(function(x) str_c(x, collapse = ", ")),
         categories = categories %>% map_chr(concate_list)
         )
## Print
yelp_all
## # A tibble: 870 × 25
##    id       alias name  image_url is_closed url   review_count categories rating
##    <chr>    <chr> <chr> <chr>     <lgl>     <chr>        <int> <chr>       <dbl>
##  1 nq4BI-L… salo… Salo… "https:/… FALSE     http…            7 Hair Salo…    4  
##  2 tycA5BG… marr… Marr… "https:/… FALSE     http…            5 Threading…    5  
##  3 sqw62Y2… sl-b… SL B… "https:/… FALSE     http…           20 Makeup Ar…    3  
##  4 Qg5l_0e… k-an… K & … ""        FALSE     http…            3 Hair Salo…    4  
##  5 vtfHPJW… form… Form… "https:/… FALSE     http…           54 Makeup Ar…    5  
##  6 fiGDS1b… hair… Hair… "https:/… FALSE     http…            6 Hair Salo…    3.5
##  7 3roxcGB… zapi… Zapi… "https:/… FALSE     http…           34 Hair Salo…    4.5
##  8 fS93bC5… sali… Sali… "https:/… FALSE     http…            2 Hair Salo…    5  
##  9 qH7a-aw… salo… Salo… "https:/… FALSE     http…            3 Hair Salo…    5  
## 10 ACkJmVq… new-… New … "https:/… FALSE     http…            4 Hair Salo…    4  
## # … with 860 more rows, and 16 more variables: coordinates.latitude <dbl>,
## #   coordinates.longitude <dbl>, transactions <chr>, price <chr>,
## #   location.address1 <chr>, location.address2 <chr>, location.address3 <chr>,
## #   location.city <chr>, location.zip_code <chr>, location.country <chr>,
## #   location.state <chr>, location.display_address <chr>, phone <chr>,
## #   display_phone <chr>, distance <dbl>, Category <chr>
Delete rows with missing data in “coordinates” variables (No missing values -> all rows are kept)
## Check missing values in "coordinates.latitude" and "coordinates.longitude" variables
yelp_all %>% select(coordinates.latitude, coordinates.longitude) %>% summary()
##  coordinates.latitude coordinates.longitude
##  Min.   :28.66        Min.   :-87.21       
##  1st Qu.:33.90        1st Qu.:-84.17       
##  Median :33.96        Median :-84.09       
##  Mean   :33.95        Mean   :-84.09       
##  3rd Qu.:34.02        3rd Qu.:-84.00       
##  Max.   :35.29        Max.   :-81.02
## No missing values in  "coordinates.latitude" and "coordinates.longitude" -> Keep all rows
Delete rows that fall outside of Gwinnett County (from 870 rows to 713 rows)
## Create an sf object (crs = 4326)
yelp_sf <- yelp_all %>%
  mutate(x = .$coordinates.longitude,
         y = .$coordinates.latitude) %>%
  filter(!is.na(x) & !is.na(y)) %>%      # No row is removed
  st_as_sf(coords = c("x", "y"), crs = 4326)
## Download Census tracts in Gwinnett County 
# Load my Census API key
load(here('data', 'census_api_key.RData'))              # Load the API key save in my local machine
census_api_key(census.api.key) %>% suppressMessages()   # Activate the key in this R session
rm(census.api.key)                                      # Remove from the environment

# Download tract polygons in Gwinnett and change the coordinates to 4326
Gwinnett <- get_acs(geography = "tract", 
                    state = "GA",
                    county = c("Gwinnett"),
                    variables = c(pop = "B01001_001", 
                                  hhincome = "B19019_001"
                                  ),
                    year = 2019,
                    survey = "acs5", 
                    geometry = T,    
                    output = "wide"  
                    ) %>% 
  suppressMessages() %>% 
  st_transform(crs = 4326)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |=                                                                     |   1%
  |                                                                            
  |=                                                                     |   2%
  |                                                                            
  |==                                                                    |   3%
  |                                                                            
  |===                                                                   |   4%
  |                                                                            
  |====                                                                  |   5%
  |                                                                            
  |====                                                                  |   6%
  |                                                                            
  |=====                                                                 |   7%
  |                                                                            
  |======                                                                |   8%
  |                                                                            
  |======                                                                |   9%
  |                                                                            
  |=======                                                               |  10%
  |                                                                            
  |=======                                                               |  11%
  |                                                                            
  |========                                                              |  12%
  |                                                                            
  |=========                                                             |  13%
  |                                                                            
  |==========                                                            |  14%
  |                                                                            
  |===========                                                           |  16%
  |                                                                            
  |============                                                          |  16%
  |                                                                            
  |============                                                          |  18%
  |                                                                            
  |==============                                                        |  20%
  |                                                                            
  |===============                                                       |  21%
  |                                                                            
  |=================                                                     |  24%
  |                                                                            
  |=================                                                     |  25%
  |                                                                            
  |===================                                                   |  27%
  |                                                                            
  |====================                                                  |  29%
  |                                                                            
  |======================                                                |  31%
  |                                                                            
  |=======================                                               |  32%
  |                                                                            
  |========================                                              |  34%
  |                                                                            
  |=========================                                             |  36%
  |                                                                            
  |===========================                                           |  38%
  |                                                                            
  |============================                                          |  39%
  |                                                                            
  |=============================                                         |  42%
  |                                                                            
  |==============================                                        |  43%
  |                                                                            
  |================================                                      |  45%
  |                                                                            
  |=================================                                     |  47%
  |                                                                            
  |==================================                                    |  49%
  |                                                                            
  |===================================                                   |  50%
  |                                                                            
  |=====================================                                 |  52%
  |                                                                            
  |======================================                                |  54%
  |                                                                            
  |=======================================                               |  56%
  |                                                                            
  |========================================                              |  57%
  |                                                                            
  |==========================================                            |  60%
  |                                                                            
  |===========================================                           |  61%
  |                                                                            
  |============================================                          |  63%
  |                                                                            
  |=============================================                         |  64%
  |                                                                            
  |===============================================                       |  67%
  |                                                                            
  |================================================                      |  68%
  |                                                                            
  |=================================================                     |  70%
  |                                                                            
  |==================================================                    |  72%
  |                                                                            
  |====================================================                  |  74%
  |                                                                            
  |=====================================================                 |  75%
  |                                                                            
  |======================================================                |  77%
  |                                                                            
  |=======================================================               |  79%
  |                                                                            
  |=========================================================             |  81%
  |                                                                            
  |==========================================================            |  82%
  |                                                                            
  |===========================================================           |  85%
  |                                                                            
  |============================================================          |  86%
  |                                                                            
  |==============================================================        |  88%
  |                                                                            
  |===============================================================       |  90%
  |                                                                            
  |================================================================      |  92%
  |                                                                            
  |=================================================================     |  93%
  |                                                                            
  |===================================================================   |  95%
  |                                                                            
  |====================================================================  |  97%
  |                                                                            
  |===================================================================== |  99%
  |                                                                            
  |======================================================================| 100%
## Check current data
tmap_mode('view') %>% suppressMessages()
tm_shape(yelp_sf) + tm_dots(col = 'Category') +
  tm_shape(Gwinnett %>% st_union()) + tm_borders()
## Keep only cases fall inside Gwinnett County
yelp_sf <- yelp_sf[Gwinnett %>% st_union(), , op = st_intersects] # from 870 rows to 713 rows

## Check the output
tmap_mode('view') %>% suppressMessages()
tm_shape(yelp_sf) + tm_dots(col = 'Category') +
  tm_shape(Gwinnett %>% st_union()) + tm_borders()

Answers to questions

## Check the distribution of "rating" of businesses
cat("Distribution of rating of both categories combined (unit: %): \n", sep = "")
## Distribution of rating of both categories combined (unit: %):
round(yelp_sf %>% 
        pull(rating) %>% 
        table(useNA = 'ifany')/nrow(yelp_sf)*100, 
      1)
## .
##    1  1.5    2  2.5    3  3.5    4  4.5    5 
## 10.4  2.7  6.5  9.8  8.1 10.4 15.6  9.5 27.1
cat("\n", "Distribution of rating of hair salons (unit: %): \n", sep = "")
## 
## Distribution of rating of hair salons (unit: %):
round(yelp_sf %>% 
        filter(Category == "Hair Salons") %>% 
        pull(rating) %>% 
        table(useNA = 'ifany')/nrow(yelp_sf %>% filter(Category == "Hair Salons"))*100, 
      1)
## .
##    1  1.5    2  2.5    3  3.5    4  4.5    5 
##  7.4  0.9  5.4  7.4  7.8 10.7 17.6 11.6 31.3
cat("\n", "Distribution of rating of banks (unit: %): \n", sep = "")
## 
## Distribution of rating of banks (unit: %):
round(yelp_sf %>% 
        filter(Category == "Banks & Credit Unions") %>% 
        pull(rating) %>% 
        table(useNA = 'ifany')/nrow(yelp_sf %>% filter(Category == "Banks & Credit Unions"))*100, 
      1)
## .
##    1  1.5    2  2.5    3  3.5    4  4.5    5 
## 23.1 10.4 11.2 20.1  9.7  9.0  6.7  0.7  9.0
## Check the summary of "review_count"
cat("Summary of review counts of banks: \n", sep = "")
## Summary of review counts of banks:
yelp_sf %>% pull(review_count) %>% summary()
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   2.000   4.000   8.809   9.000 228.000
cat("\n", "Summary of review counts of hair salons: \n", sep = "")
## 
## Summary of review counts of hair salons:
yelp_sf %>% filter(Category == "Hair Salons") %>% pull(review_count) %>% summary()
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   2.000   4.000   9.883   9.500 228.000
cat("\n", "Summary of review counts of banks: \n", sep = "")
## 
## Summary of review counts of banks:
yelp_sf %>% filter(Category == "Banks & Credit Unions") %>%  pull(review_count) %>% summary()
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   1.000   3.000   4.172   5.750  16.000
## Check the spatial distribution of hair salons by price
tmap_mode('view') %>% suppressMessages()
tm_shape(yelp_sf %>% filter(Category == "Hair Salons")) + tm_dots(col = 'price') +
  tm_shape(Gwinnett %>% st_union()) + tm_borders()