Assignment4_Plots

Benson Bai

2024-10-11

library(tidyverse)
library(sf)
library(tmap)
library(leaflet)
library(ggpubr)
library(tidycensus)
# Switch to the course folder and read coffee.csv into `df`.
# NOTE(review): this absolute path only resolves on the author's machine;
# a project-relative path would make the script portable.
setwd("C:/Users/benso/Documents/Georgia Tech/CP8883")
df <- read.csv("coffee.csv")

# Print the first rows to check that the import worked.
head(df)
##   X       GEOID         county hhincome    pct_pov review_count avg_rating
## 1 1 13063040202 Clayton County    33276 0.20134228     57.00000          2
## 2 2 13063040308 Clayton County    28422 0.21071800     13.00000          3
## 3 3 13063040407 Clayton County    49271 0.10825507     29.33333          2
## 4 4 13063040408 Clayton County    44551 0.18095661     20.00000          4
## 5 5 13063040410 Clayton County    49719 0.11468019     41.00000          1
## 6 6 13063040411 Clayton County    57924 0.09068942     18.00000          2
##     pop avg_price  pct_white hhincome_log review_count_log pct_pov_log yelp_n
## 1  2850         1 0.07508772     10.41289         4.060443   -1.554276      1
## 2  4262         1 0.26067574     10.25527         1.975622   -1.510869      2
## 3  4046         1 0.20514088     10.80529         3.320837   -2.134911      3
## 4  8489         1 0.16868889     10.70461         3.044522   -1.655709      1
## 5  7166         1 0.19369244     10.81434         3.737670   -2.082003      1
## 6 13311         1 0.16512659     10.96706         2.944439   -2.295715      1
# Boxplots of hhincome, one box per avg_rating value (rating is converted
# to a factor so that each value gets its own box).
df %>%
  ggplot(aes(x = factor(avg_rating), y = hhincome)) +
  geom_boxplot(fill = "white", color = "black") +
  labs(x = "avg_rating")

# Boxplots of hhincome for each avg_rating value, drawn in a separate
# panel per county, with descriptive axis titles.
ggplot(data = df) +
  geom_boxplot(
    aes(x = factor(avg_rating), y = hhincome),
    # Only the outline colour is set; no trailing empty argument.
    color = "black"
  ) +
  facet_wrap(~county) +
  xlab("Average Yelp Rating") +
  ylab("Median Annual Household Income($)")

# Scatterplot of review_count_log against hhincome, one panel per county.
# Point colour follows pct_white on a blue-to-red gradient.
df %>%
  ggplot(aes(x = review_count_log, y = hhincome, color = pct_white)) +
  geom_point(size = 3, alpha = 0.5) +
  facet_wrap(~county) +
  scale_color_gradient(low = "blue", high = "red") +
  labs(
    title = "Scatterplot: Review Count vs. Household Income",
    x = "Review Count (log)",
    y = "Median Annual Household Income($)",
    color = "Proportion of residents \nwho self-identified as white"
  ) +
  theme_bw()

# Reshape to long format: the four listed columns are stacked into a
# `y_variables` (column name) / `value` (cell value) pair per row.
df_long <- pivot_longer(
  df,
  cols = all_of(c("pct_pov_log", "hhincome", "pct_white", "pop")),
  names_to = "y_variables",
  values_to = "value"
)

# Print the first rows of the reshaped table.
head(df_long)
## # A tibble: 6 × 12
##       X      GEOID county pct_pov review_count avg_rating avg_price hhincome_log
##   <int>      <dbl> <chr>    <dbl>        <dbl>      <int>     <dbl>        <dbl>
## 1     1    1.31e10 Clayt…   0.201           57          2         1         10.4
## 2     1    1.31e10 Clayt…   0.201           57          2         1         10.4
## 3     1    1.31e10 Clayt…   0.201           57          2         1         10.4
## 4     1    1.31e10 Clayt…   0.201           57          2         1         10.4
## 5     2    1.31e10 Clayt…   0.211           13          3         1         10.3
## 6     2    1.31e10 Clayt…   0.211           13          3         1         10.3
## # ℹ 4 more variables: review_count_log <dbl>, yelp_n <int>, y_variables <chr>,
## #   value <dbl>
# Panel titles for the faceted plot: a named character vector whose names
# are the y_variables values and whose elements are the display labels.
facet_labels <- setNames(
  c(
    "Percent Residents Under Poverty Logged",
    "Median Annual Household Income ($)",
    "Percent White Resident",
    "Total Population"
  ),
  c("pct_pov_log", "hhincome", "pct_white", "pop")
)
# One panel per y_variables value (free axis scales), plotting
# review_count_log against `value`.
df_long %>%
  ggplot(aes(x = review_count_log, y = value)) +
  # Points and linear fits are both coloured by county.
  geom_point(aes(color = county)) +
  geom_smooth(
    aes(color = county),
    method = "lm",
    formula = y ~ x,
    se = FALSE
  ) +
  # Pearson r and p-value label, placed at the top-left of the panel.
  stat_cor(
    aes(label = paste(after_stat(r.label), after_stat(p.label), sep = "~`,`~")),
    method = "pearson",
    size = 3,
    label.x.npc = "left",
    label.y.npc = "top"
  ) +
  facet_wrap(
    ~y_variables,
    scales = "free",
    labeller = as_labeller(facet_labels)
  ) +
  labs(
    title = "Scatterplots between logged review count & neighborhood characteristics",
    subtitle = "Using Yelp Data in Five Counties Around Atlanta, GA",
    x = "Review Count Logged",
    y = "Values",
    color = "County"
  ) +
  theme_bw()