Load packages and print the session info

## Load packages
library(sf)
## Linking to GEOS 3.9.1, GDAL 3.4.3, PROJ 7.2.1; sf_use_s2() is TRUE
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6      ✔ purrr   0.3.4 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.2      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(here)
## here() starts at C:/Users/iskim/Dropbox (GaTech)/2022-2023/2022 Fall/CP 8883 Intro to Urban Analytics/UA_module1
library(knitr)
library(tmap)
library(leaflet)

## Session info: print the R version, platform, locale, and package versions
## in use, so the results below can be reproduced
sessionInfo()
## R version 4.2.1 (2022-06-23 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 22621)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] leaflet_2.1.1   tmap_3.3-3      knitr_1.40      here_1.0.1     
##  [5] forcats_0.5.2   stringr_1.4.1   dplyr_1.0.10    purrr_0.3.4    
##  [9] readr_2.1.2     tidyr_1.2.1     tibble_3.1.8    ggplot2_3.3.6  
## [13] tidyverse_1.3.2 sf_1.0-8       
## 
## loaded via a namespace (and not attached):
##  [1] fs_1.5.2                lubridate_1.8.0         RColorBrewer_1.1-3     
##  [4] httr_1.4.4              rprojroot_2.0.3         tools_4.2.1            
##  [7] backports_1.4.1         bslib_0.4.0             utf8_1.2.2             
## [10] R6_2.5.1                KernSmooth_2.23-20      DBI_1.1.3              
## [13] colorspace_2.0-3        raster_3.5-29           sp_1.5-0               
## [16] withr_2.5.0             tidyselect_1.1.2        compiler_4.2.1         
## [19] leafem_0.2.0            cli_3.4.0               rvest_1.0.3            
## [22] xml2_1.3.3              sass_0.4.2              scales_1.2.1           
## [25] classInt_0.4-7          proxy_0.4-27            digest_0.6.29          
## [28] rmarkdown_2.16          base64enc_0.1-3         dichromat_2.0-0.1      
## [31] pkgconfig_2.0.3         htmltools_0.5.3         dbplyr_2.2.1           
## [34] fastmap_1.1.0           htmlwidgets_1.5.4       rlang_1.0.5            
## [37] readxl_1.4.1            rstudioapi_0.14         jquerylib_0.1.4        
## [40] generics_0.1.3          jsonlite_1.8.0          crosstalk_1.2.0        
## [43] googlesheets4_1.0.1     magrittr_2.0.3          Rcpp_1.0.9             
## [46] munsell_0.5.0           fansi_1.0.3             abind_1.4-5            
## [49] terra_1.6-17            lifecycle_1.0.2         stringi_1.7.8          
## [52] leafsync_0.1.0          yaml_2.3.5              tmaptools_3.1-1        
## [55] grid_4.2.1              parallel_4.2.1          crayon_1.5.1           
## [58] lattice_0.20-45         haven_2.5.1             stars_0.5-6            
## [61] hms_1.1.2               pillar_1.8.1            codetools_0.2-18       
## [64] reprex_2.0.2            XML_3.99-0.10           glue_1.6.2             
## [67] evaluate_0.16           leaflet.providers_1.9.0 modelr_0.1.9           
## [70] png_0.1-7               vctrs_0.4.1             tzdb_0.3.0             
## [73] cellranger_1.1.0        gtable_0.3.1            assertthat_0.2.1       
## [76] cachem_1.0.6            xfun_0.32               lwgeom_0.2-8           
## [79] broom_1.0.1             e1071_1.7-11            class_7.3-20           
## [82] googledrive_2.0.0       viridisLite_0.4.1       gargle_1.2.1           
## [85] units_0.8-0             ellipsis_0.3.2

Load Data

# Read data/coffee.csv (resolved from the project root by here()) into `df`.
df <- here("data", "coffee.csv") %>%
  read_csv()
## New names:
## Rows: 363 Columns: 14
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (1): county dbl (13): ...1, GEOID, hhincome, pct_pov, review_count, avg_rating,
## race.tot...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
# Preview the first rows to check the imported column names and types.
head(df)
## # A tibble: 6 × 14
##    ...1     GEOID county hhinc…¹ pct_pov revie…² avg_r…³ race.…⁴ avg_p…⁵ pct_w…⁶
##   <dbl>     <dbl> <chr>    <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
## 1     1   1.31e10 Clayt…   33276  0.201     57         2    2850       1  0.0751
## 2     2   1.31e10 Clayt…   28422  0.211     13         3    4262       1  0.261 
## 3     3   1.31e10 Clayt…   49271  0.108     29.3       2    4046       1  0.205 
## 4     4   1.31e10 Clayt…   44551  0.181     20         4    8489       1  0.169 
## 5     5   1.31e10 Clayt…   49719  0.115     41         1    7166       1  0.194 
## 6     6   1.31e10 Clayt…   57924  0.0907    18         2   13311       1  0.165 
## # … with 4 more variables: hhincome_log <dbl>, review_count_log <dbl>,
## #   pct_pov_log <dbl>, yelp_n <dbl>, and abbreviated variable names ¹​hhincome,
## #   ²​review_count, ³​avg_rating, ⁴​race.tot, ⁵​avg_price, ⁶​pct_white

Plot 1

Findings: In general, the higher the average rating of a neighborhood, the higher the median household income of the neighborhood. One noteworthy exception is that neighborhoods with an average rating of 5 (the maximum rating) have substantially lower median household incomes than those with an average rating of 4, which runs against the general trend.
# Plot 1: distribution of household income at each average-rating level.
# The rating is converted to a factor so every distinct value gets its own box.
df %>%
  mutate(avg_rating = as.factor(avg_rating)) %>%
  ggplot(mapping = aes(x = avg_rating, y = hhincome)) +
  geom_boxplot()

Plot 2

Plot 3

Findings: Clayton County has fewer tracts (i.e., fewer dots in Plot 3) than the other counties, partially explaining why the boxplot of Clayton County in Plot 2 has small boxes with very limited dispersion. For the other counties, the higher the proportion of white residents, the higher the median household income. From these scatterplots by county, it is hard to be sure whether the average review count (logged) of a neighborhood is correlated with the median household income of the neighborhood.
# Plot 3: logged review count against household income, one panel per county.
# Points are shaded from blue (low) to red (high) by pct_white.
df %>%
  ggplot(
    mapping = aes(x = review_count_log, y = hhincome, color = pct_white)
  ) +
  geom_point(size = 3, alpha = 0.6) +
  scale_color_gradient(low = "blue", high = "red") +
  facet_wrap(~county) +
  labs(
    title = "Scatterplot: Review Count vs. Household Income",
    x = "Review Count (log)",
    y = "Median Annual Household Income",
    color = "Proportion of residents \nwho self-identified as white"
  ) +
  theme_bw()

Plot 4

Findings: The most prominent trend shown in Plot 4 is that the proportion of white residents of a neighborhood is positively correlated with the average number of reviews for POIs. Its correlation coefficient is the largest, and its p-value the smallest, among the four variables plotted on the y-axis. The slopes of its smooth lines (based on linear regressions) are also steeper than those in the other panels of Plot 4. Compared with the proportion of white residents, median household income seems to have a weaker positive relationship (in terms of the correlation coefficient and the slopes of the regression lines) with the average number of reviews, while the proportion of residents under poverty has a slightly negative relationship.
# Facet titles for the neighborhood characteristics compared against the
# logged review count. Names are the `df` columns to reshape; values are the
# strip labels shown above each panel.
# - The poverty column used here is the logged one, so its label says so.
# - The share columns hold 0-1 values, so they are labelled "Proportion",
#   matching the legend wording used for pct_white in Plot 3.
var_name <- c(
  "hhincome" = "Median Annual Household Income ($)",
  "pct_pov_log" = "Proportion of Residents Under Poverty (log)",
  "pct_white" = "Proportion of White Residents",
  "race.tot" = "Total Population"
)

var_name_labeller <- as_labeller(var_name)

# Plot 4: reshape to long form (one row per original row and variable) so each
# characteristic gets its own facet with an independent y-axis.
df %>%
  pivot_longer(
    # Derive the columns from the label vector so the pivoted variables and
    # the facet labels cannot drift apart.
    cols = all_of(names(var_name)),
    names_to = "variable",
    values_to = "value"
  ) %>%
  ggplot(mapping = aes(x = review_count_log, y = value)) +
  geom_point(mapping = aes(color = county), alpha = 0.9) +
  # One linear fit per county within each facet.
  geom_smooth(
    mapping = aes(group = county, color = county),
    method = "lm",
    se = FALSE
  ) +
  labs(
    x = "Review Count Logged",
    y = "Values",
    color = "County",
    title = "Scatterplot between logged review count & neighborhood characteristics",
    subtitle = "Using Yelp data in Five Counties Around Atlanta, GA"
  ) +
  facet_wrap(~variable, scales = "free_y", labeller = var_name_labeller) +
  theme_bw() +
  # Color is mapped per layer, not globally, so this layer sees only x and y
  # and reports a single Pearson coefficient per facet across all counties.
  ggpubr::stat_cor(method = "pearson")
## `geom_smooth()` using formula 'y ~ x'