library(tidyverse)
library(sf)
library(tmap)
library(leaflet)
library(ggpubr)
library(tidycensus)
# Load the coffee-shop table from the course folder and preview the first rows.
# NOTE(review): the absolute setwd() path is machine-specific; consider a
# project-relative path so the script runs elsewhere.
setwd("C:/Users/benso/Documents/Georgia Tech/CP8883")
df <- read.csv(file = "coffee.csv")
head(df)
## X GEOID county hhincome pct_pov review_count avg_rating
## 1 1 13063040202 Clayton County 33276 0.20134228 57.00000 2
## 2 2 13063040308 Clayton County 28422 0.21071800 13.00000 3
## 3 3 13063040407 Clayton County 49271 0.10825507 29.33333 2
## 4 4 13063040408 Clayton County 44551 0.18095661 20.00000 4
## 5 5 13063040410 Clayton County 49719 0.11468019 41.00000 1
## 6 6 13063040411 Clayton County 57924 0.09068942 18.00000 2
## pop avg_price pct_white hhincome_log review_count_log pct_pov_log yelp_n
## 1 2850 1 0.07508772 10.41289 4.060443 -1.554276 1
## 2 4262 1 0.26067574 10.25527 1.975622 -1.510869 2
## 3 4046 1 0.20514088 10.80529 3.320837 -2.134911 3
## 4 8489 1 0.16868889 10.70461 3.044522 -1.655709 1
## 5 7166 1 0.19369244 10.81434 3.737670 -2.082003 1
## 6 13311 1 0.16512659 10.96706 2.944439 -2.295715 1
# Box plot of household income for each average-rating value, all counties
# pooled. factor() turns the rating into a discrete x axis so that every
# rating value gets its own box.
ggplot(df) +
  geom_boxplot(
    mapping = aes(x = factor(avg_rating), y = hhincome),
    fill = "white",
    color = "black"
  ) +
  labs(x = "avg_rating")

# Box plot of household income for each average-rating value, drawn as one
# panel per county. factor() turns the rating into a discrete x axis.
ggplot(data = df) +
  geom_boxplot(
    aes(x = factor(avg_rating), y = hhincome),
    # No trailing comma after `color`: an empty positional argument here
    # would be matched to the `data` formal of geom_boxplot().
    color = "black"
  ) +
  facet_wrap(~county) +
  xlab("Average Yelp Rating") +
  ylab("Median Annual Household Income($)")

# Scatter plot of logged review count against household income, one panel per
# county. Point colour encodes pct_white on a blue-to-red gradient; points are
# semi-transparent so overlapping observations remain visible.
ggplot(df) +
  geom_point(
    mapping = aes(x = review_count_log, y = hhincome, color = pct_white),
    size = 3,
    alpha = 0.5
  ) +
  facet_wrap(~county) +
  labs(
    title = "Scatterplot: Review Count vs. Household Income",
    x = "Review Count (log)",
    y = "Median Annual Household Income($)",
    color = "Proportion of residents \nwho self-identified as white"
  ) +
  scale_color_gradient(low = "blue", high = "red") +
  theme_bw()

# Reshape to long format: the four neighbourhood measures are stacked into a
# name column (`y_variables`) and a value column (`value`) so they can be
# faceted in a single plot. All other columns are repeated on each row.
df_long <- pivot_longer(
  data = df,
  cols = all_of(c("pct_pov_log", "hhincome", "pct_white", "pop")),
  names_to = "y_variables",
  values_to = "value"
)
head(df_long)
## # A tibble: 6 × 12
## X GEOID county pct_pov review_count avg_rating avg_price hhincome_log
## <int> <dbl> <chr> <dbl> <dbl> <int> <dbl> <dbl>
## 1 1 1.31e10 Clayt… 0.201 57 2 1 10.4
## 2 1 1.31e10 Clayt… 0.201 57 2 1 10.4
## 3 1 1.31e10 Clayt… 0.201 57 2 1 10.4
## 4 1 1.31e10 Clayt… 0.201 57 2 1 10.4
## 5 2 1.31e10 Clayt… 0.211 13 3 1 10.3
## 6 2 1.31e10 Clayt… 0.211 13 3 1 10.3
## # ℹ 4 more variables: review_count_log <dbl>, yelp_n <int>, y_variables <chr>,
## # value <dbl>
# Lookup from the `y_variables` codes to the human-readable panel titles
# (names are the column codes, values are the display strings).
facet_labels <- setNames(
  c(
    "Percent Residents Under Poverty Logged",
    "Median Annual Household Income ($)",
    "Percent White Resident",
    "Total Population"
  ),
  c("pct_pov_log", "hhincome", "pct_white", "pop")
)
# Logged review count against each neighbourhood measure, one facet per
# measure with independent axis scales.
ggplot(df_long, mapping = aes(x = review_count_log, y = value)) +
  # Colour is mapped per layer rather than globally, so the points and the
  # linear fits are split by county while stat_cor() below is not.
  geom_point(mapping = aes(color = county)) +
  geom_smooth(
    mapping = aes(color = county),
    method = "lm",
    formula = y ~ x,
    se = FALSE
  ) +
  # Pearson r and p-value label, placed at the top-left of each facet.
  stat_cor(
    mapping = aes(
      label = paste(after_stat(r.label), after_stat(p.label), sep = "~`,`~")
    ),
    method = "pearson",
    size = 3,
    label.x.npc = "left",
    label.y.npc = "top"
  ) +
  facet_wrap(
    ~ y_variables,
    labeller = as_labeller(facet_labels),
    scales = "free"
  ) +
  labs(
    title = "Scatterplots between logged review count & neighborhood characteristics",
    subtitle = "Using Yelp Data in Five Counties Around Atlanta, GA",
    x = "Review Count Logged",
    y = "Values",
    color = "County"
  ) +
  theme_bw()
