Load packages and print the session info
## Load packages
library(sf)
## Linking to GEOS 3.9.1, GDAL 3.4.3, PROJ 7.2.1; sf_use_s2() is TRUE
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(here)
## here() starts at C:/Users/iskim/Dropbox (GaTech)/2022-2023/2022 Fall/CP 8883 Intro to Urban Analytics/UA_module1
library(knitr)
library(tmap)
library(leaflet)
## Session info: R version, platform, locale, and attached/loaded packages
utils::sessionInfo()
## R version 4.2.1 (2022-06-23 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 22621)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=English_United States.utf8
## [2] LC_CTYPE=English_United States.utf8
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.utf8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] leaflet_2.1.1 tmap_3.3-3 knitr_1.40 here_1.0.1
## [5] forcats_0.5.2 stringr_1.4.1 dplyr_1.0.10 purrr_0.3.4
## [9] readr_2.1.2 tidyr_1.2.1 tibble_3.1.8 ggplot2_3.3.6
## [13] tidyverse_1.3.2 sf_1.0-8
##
## loaded via a namespace (and not attached):
## [1] fs_1.5.2 lubridate_1.8.0 RColorBrewer_1.1-3
## [4] httr_1.4.4 rprojroot_2.0.3 tools_4.2.1
## [7] backports_1.4.1 bslib_0.4.0 utf8_1.2.2
## [10] R6_2.5.1 KernSmooth_2.23-20 DBI_1.1.3
## [13] colorspace_2.0-3 raster_3.5-29 sp_1.5-0
## [16] withr_2.5.0 tidyselect_1.1.2 compiler_4.2.1
## [19] leafem_0.2.0 cli_3.4.0 rvest_1.0.3
## [22] xml2_1.3.3 sass_0.4.2 scales_1.2.1
## [25] classInt_0.4-7 proxy_0.4-27 digest_0.6.29
## [28] rmarkdown_2.16 base64enc_0.1-3 dichromat_2.0-0.1
## [31] pkgconfig_2.0.3 htmltools_0.5.3 dbplyr_2.2.1
## [34] fastmap_1.1.0 htmlwidgets_1.5.4 rlang_1.0.5
## [37] readxl_1.4.1 rstudioapi_0.14 jquerylib_0.1.4
## [40] generics_0.1.3 jsonlite_1.8.0 crosstalk_1.2.0
## [43] googlesheets4_1.0.1 magrittr_2.0.3 Rcpp_1.0.9
## [46] munsell_0.5.0 fansi_1.0.3 abind_1.4-5
## [49] terra_1.6-17 lifecycle_1.0.2 stringi_1.7.8
## [52] leafsync_0.1.0 yaml_2.3.5 tmaptools_3.1-1
## [55] grid_4.2.1 parallel_4.2.1 crayon_1.5.1
## [58] lattice_0.20-45 haven_2.5.1 stars_0.5-6
## [61] hms_1.1.2 pillar_1.8.1 codetools_0.2-18
## [64] reprex_2.0.2 XML_3.99-0.10 glue_1.6.2
## [67] evaluate_0.16 leaflet.providers_1.9.0 modelr_0.1.9
## [70] png_0.1-7 vctrs_0.4.1 tzdb_0.3.0
## [73] cellranger_1.1.0 gtable_0.3.1 assertthat_0.2.1
## [76] cachem_1.0.6 xfun_0.32 lwgeom_0.2-8
## [79] broom_1.0.1 e1071_1.7-11 class_7.3-20
## [82] googledrive_2.0.0 viridisLite_0.4.1 gargle_1.2.1
## [85] units_0.8-0 ellipsis_0.3.2
Load Data
# Read data/coffee.csv; here() resolves the path against the project root.
df <- read_csv(file = here("data", "coffee.csv"))
## New names:
## Rows: 363 Columns: 14
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (1): county dbl (13): ...1, GEOID, hhincome, pct_pov, review_count, avg_rating,
## race.tot...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
# Preview the first six rows to check column names and types.
head(df, n = 6L)
## # A tibble: 6 × 14
## ...1 GEOID county hhinc…¹ pct_pov revie…² avg_r…³ race.…⁴ avg_p…⁵ pct_w…⁶
## <dbl> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 1.31e10 Clayt… 33276 0.201 57 2 2850 1 0.0751
## 2 2 1.31e10 Clayt… 28422 0.211 13 3 4262 1 0.261
## 3 3 1.31e10 Clayt… 49271 0.108 29.3 2 4046 1 0.205
## 4 4 1.31e10 Clayt… 44551 0.181 20 4 8489 1 0.169
## 5 5 1.31e10 Clayt… 49719 0.115 41 1 7166 1 0.194
## 6 6 1.31e10 Clayt… 57924 0.0907 18 2 13311 1 0.165
## # … with 4 more variables: hhincome_log <dbl>, review_count_log <dbl>,
## # pct_pov_log <dbl>, yelp_n <dbl>, and abbreviated variable names ¹hhincome,
## # ²review_count, ³avg_rating, ⁴race.tot, ⁵avg_price, ⁶pct_white
Plot 2
Findings: Looking at the boxplots by county, the observations from
Plot 1 clearly hold for Fulton County. However, the trends from Plot 1 are
less prominent for DeKalb and Gwinnett counties, while Clayton County
does not show the trends at all and has no tracts with an average
rating of 5. This indicates disparities among counties that did not show
up when the counties were merged.
# Plot 2: household income by average rating, one panel per county.
# avg_rating is converted to a factor so each rating value gets its own box.
df %>%
  mutate(avg_rating = as.factor(avg_rating)) %>%
  ggplot(mapping = aes(x = avg_rating, y = hhincome)) +
  geom_boxplot() +
  facet_wrap(vars(county))

Plot 4
Findings: The most prominent trend shown in Plot 4 is that the
proportion of white residents in a neighborhood is positively
correlated with the average number of reviews for POIs. Its correlation
coefficient is the largest and its p-value is the smallest among
the four variables used on the y-axis. The slopes of the smooth lines (based on
linear regressions) are also steeper than in the other panels of Plot
4. Compared with the proportion of white residents, median household
income seems to have a weaker (in terms of the correlation
coefficient and the slopes of the regression lines) positive relationship
with the average review count, while the proportion of residents
under poverty has a slightly negative relationship.
# Facet titles keyed by column name. This vector is the single source of truth
# for both the columns that get pivoted and the labels shown on each panel.
var_name <- c(
  "hhincome" = "Median Annual Household Income ($)",
  # The plotted column is pct_pov_log, so the label says it is logged
  # (matching the "Review Count Logged" x-axis label).
  "pct_pov_log" = "Percent Residents Under Poverty (Logged)",
  "pct_white" = "Percent White Resident",
  "race.tot" = "Total Population"
)
var_name_labeller <- as_labeller(var_name)

# Plot 4: logged review count (x) against each neighborhood variable (y),
# one panel per variable with its own y scale.
df %>%
  # Long form: one row per (original row, variable) pair. Columns are taken
  # from names(var_name) so every pivoted variable has a facet label.
  pivot_longer(
    cols = all_of(names(var_name)),
    names_to = "variable",
    values_to = "value"
  ) %>%
  ggplot(mapping = aes(x = review_count_log, y = value)) +
  # Color is mapped per layer, not in ggplot(), so stat_cor() below does not
  # inherit a county grouping from the plot-level aesthetics.
  geom_point(mapping = aes(color = county), alpha = 0.9) +
  # One linear fit per county.
  geom_smooth(
    mapping = aes(group = county, color = county),
    method = "lm",
    se = FALSE
  ) +
  labs(
    x = "Review Count Logged",
    y = "Values",
    color = "County",
    title = "Scatterplot between logged review count & neighborhood characteristics",
    subtitle = "Using Yelp data in Five Counties Around Atlanta, GA"
  ) +
  facet_wrap(~variable, scales = "free_y", labeller = var_name_labeller) +
  theme_bw() +
  # Annotate each panel with the Pearson correlation coefficient and p-value.
  ggpubr::stat_cor(method = "pearson")
## `geom_smooth()` using formula 'y ~ x'
