# Load packages:
library(tidycensus)
library(tidyr)
library(sf)
## Linking to GEOS 3.12.1, GDAL 3.8.4, PROJ 9.3.1; sf_use_s2() is TRUE
library(tmap)
## Breaking News: tmap 3.x is retiring. Please test v4, e.g. with
## remotes::install_github('r-tmap/tmap')
library(jsonlite)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ purrr 1.0.2
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ ggplot2 3.5.1 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ purrr::flatten() masks jsonlite::flatten()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(httr)
library(jsonlite)
library(reshape2)
##
## Attaching package: 'reshape2'
##
## The following object is masked from 'package:tidyr':
##
## smiths
library(here)
## here() starts at C:/Users/wpgeorgia/Documents/GT MSUA/CP 8883/Intro to UA R Projects
library(knitr)
library(skimr)
library(units)
## udunits database from C:/Users/wpgeorgia/AppData/Local/R/win-library/4.4/units/share/udunits/udunits2.xml
library(scales)
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
library(ggalt)
## Registered S3 methods overwritten by 'ggalt':
## method from
## grid.draw.absoluteGrob ggplot2
## grobHeight.absoluteGrob ggplot2
## grobWidth.absoluteGrob ggplot2
## grobX.absoluteGrob ggplot2
## grobY.absoluteGrob ggplot2
library(ggplot2)
library(ggtext)
library(ggpubr)
library(readr)
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:httr':
##
## config
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
# Import data file:
atl_coffee <- read_csv("coffee.csv")
## New names:
## Rows: 363 Columns: 14
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (1): county dbl (13): ...1, GEOID, hhincome, pct_pov, review_count, avg_rating,
## pop, avg...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
Plot 1
ggplot(data = atl_coffee, aes(x=avg_rating, y=hhincome, group=avg_rating)) +
geom_boxplot(size=0.4, outlier.size=1)
Plot 1 displays a box-and-whisker graph of median household income
by census tract compared to average ratings of coffee shops in those
same census tracts. A box-and-whisker plot shows the middle 25 to 75
percentiles of the data (within each box) as well as data points beyond
that mid-range (which in may be considered outliers). For Atlanta area
coffees shops, this plot appears to suggest that as the median income of
a tract increases, the quality of the shops (as measured by very
non-scientific Yelp ratings) generally increases. Although the top
5-star rated shops do not appear to be concentrated in the higher income
tracts.
Plot 2
ggplot(data = atl_coffee, aes(x=avg_rating, y=hhincome, group=avg_rating)) +
geom_boxplot(size=0.4, outlier.size=1) +
facet_wrap(~county)
Plot 2 is similar to Plot 1 with box-and-whisker plots of median
household income compared to average ratings of coffee shops. Plot 2
adds another dimension to the visual by splitting the 5 counties in the
study area into separate graphs. The additional layer of detail provides
new insights. Clayton County’s coffee shops are all located in
relatively lower income tracts compared to the other counties, and
Clayton has 0 5-star shops. With bigger boxes in its plot, Fulton
appears to have shops of all ratings spread across a range of different
income areas. Dekalb County appears to have the most data points outside
its boxes (and higher than its boxes) meaning general opinions of shops
in its highest-income areas don’t align with areas closer to the median
income.
Plot 3
ggplot(data = atl_coffee) +
geom_point(mapping = aes(x=review_count_log, y=hhincome, color = pct_white), size=2.5, alpha=0.5) +
labs(x = "Review Count (log)",
y = "Median Annual Household Income",
title = "Scatterplot: Review Count vs. Household Income") +
facet_wrap(~county) +
scale_color_gradient(low = "blue", high = "red",
name = "Proportion of residents
who self-identified as white") +
theme_bw() +
theme(legend.key.size = unit(0.5, 'cm'),legend.title = element_text(size=8))
Plot 3 displays scatterplots by county of median income by census
tract to review counts of coffee shops (scaled using logarithimic
calculations). The plot also adds another component - residents who
identify as white (as shown by the color of each plot point). To start,
across all of the plots, it’s immediately clear that income and race are
correlated. Also, Fulton county - which has the most dots in its plot -
has the most of reviews of any county (and therefore, likely the most
shops). Clayton County with the fewest points has the fewest ratings
(and likely the fewest shops). Meanwhile Dekalb has more purple and pink
dots indicating more racial diversity along with a relatively high
number of plot points.
Plot 4
# Make data set long combining four variables into one column:
atl_coffee_long <- pivot_longer(atl_coffee, cols=c("hhincome","pct_white","pop","pct_pov_log"))
ggplot(data=atl_coffee_long, mapping = aes(review_count_log, value)) +
geom_point(mapping=aes(color=county)) +
geom_smooth(mapping = aes(color=county), method = lm, se=FALSE) +
stat_cor(label.y.npc="top", size=3) +
facet_wrap(~name, scales='free_y',
labeller = as_labeller(c(hhincome = "Median Annual Household Income ($)",
pct_white = "Percent White Resident",
pct_pov_log = "Percent Residents Under Poverty",
pop = "Total Population"))) +
theme_bw() +
labs(title = "Scatterplot between logged review count & neighborhood characteristics",
subtitle = "Using Yelp data in five counties around Atlanta, GA",
y = "Values",
x = "Review Count Logged",
color = "County") +
theme(plot.title = element_text(size=10),
plot.subtitle = element_text(size=8),
axis.title.x = element_text(size=8),
axis.title.y = element_text(size=8),
legend.title = element_text(size=8),
legend.text = element_text(size=7))
## `geom_smooth()` using formula = 'y ~ x'
Plot 4 compares 4 different variables against coffee shop review
counts. The plots’ points are colored by county, and regression lines
for each county’s points are included in each plot. First, 2 of the 4
variables have p-values from their regressions below 0.05 indicating a
high likelihood that the variables are good indicators. Those variables
are Percent of Residents Under the Poverty Line and Percent of White
Residents, and not surprisingly, the number of reviews (and also likely
the number of shops) appears to be highly correlated with these
variables. Also not surprisingly, total population (which doesn’t seem
to be a descriptive enough metric) is the variable with the lowest
correlation to review counts.