Mini4

Kaiyu Zhou

2024-10-11

Library packages

Read data file

# Load the coffee dataset from the working directory; the first line of the
# file holds the column names.
coffee <- read.csv(file = "coffee.csv", header = TRUE)

# Preview the first six rows to check that the columns were parsed as expected.
head(x = coffee, n = 6L)
##   X       GEOID         county hhincome    pct_pov review_count avg_rating
## 1 1 13063040202 Clayton County    33276 0.20134228     57.00000          2
## 2 2 13063040308 Clayton County    28422 0.21071800     13.00000          3
## 3 3 13063040407 Clayton County    49271 0.10825507     29.33333          2
## 4 4 13063040408 Clayton County    44551 0.18095661     20.00000          4
## 5 5 13063040410 Clayton County    49719 0.11468019     41.00000          1
## 6 6 13063040411 Clayton County    57924 0.09068942     18.00000          2
##     pop avg_price  pct_white hhincome_log review_count_log pct_pov_log yelp_n
## 1  2850         1 0.07508772     10.41289         4.060443   -1.554276      1
## 2  4262         1 0.26067574     10.25527         1.975622   -1.510869      2
## 3  4046         1 0.20514088     10.80529         3.320837   -2.134911      3
## 4  8489         1 0.16868889     10.70461         3.044522   -1.655709      1
## 5  7166         1 0.19369244     10.81434         3.737670   -2.082003      1
## 6 13311         1 0.16512659     10.96706         2.944439   -2.295715      1

Plots

Plot 1. Variables used - avg_rating, hhincome

Finding: Neighborhoods with higher median household income generally have higher-rated points of interest (POIs); the pattern is most visible for Yelp ratings of 3 and 4. However, median income drops again for neighborhoods with a Yelp rating of 5.

# Plot 1: distribution of median household income for each average Yelp rating.
#
# avg_rating is read from the CSV as a number, so it is converted to a factor
# to get one box per rating instead of a single pooled box. The figure is
# defined once and reused, so the static and interactive renderings always
# show the same thing.
#
# NOTE(review): the caption is kept entirely in `title` because ggplotly()
# reportedly does not render ggplot subtitles -- confirm.
plot1 <- ggplot(data = coffee) +
  geom_boxplot(aes(x = as.factor(avg_rating), y = hhincome),
               color = "black", fill = "white") +
  labs(title = "Plot 1. Variables used - avg_rating, hhincome",
       x = "Average Yelp Rating",
       y = "Median Annual Household Income ($)")

# Static rendering.
plot1

# Interactive rendering of the same figure.
plotly::ggplotly(plot1)

Plot 2. Variables used - avg_rating, hhincome, county

Finding: Higher Yelp ratings generally correlate with higher median household incomes, though the strength of this relationship varies by county. In Fulton, Cobb, DeKalb, and Gwinnett, higher ratings are linked to wealthier areas, but median income drops in the 5-star category, while Clayton has consistently lower incomes across all ratings.

# Plot 2: median household income by average Yelp rating, one panel per county.
#
# The figure is defined once and reused for both renderings so that the static
# and interactive versions cannot drift apart (labels, facets, aesthetics).
# avg_rating is converted to a factor to get one box per rating.
#
# NOTE(review): the caption is kept entirely in `title` because ggplotly()
# reportedly does not render ggplot subtitles -- confirm.
plot2 <- ggplot(data = coffee) +
  geom_boxplot(aes(x = as.factor(avg_rating), y = hhincome),
               color = "black", fill = "white") +
  facet_wrap(~ county) +
  labs(title = "Plot 2. Variables used - avg_rating, hhincome, county",
       x = "Average Yelp Rating",
       y = "Median Annual Household Income ($)")

# Static rendering.
plot2

# Interactive rendering of the same figure.
plotly::ggplotly(plot2)

Plot 3. Variables used - review_count_log, hhincome, county, pct_white

Finding: Higher-income areas, especially in Fulton and Cobb counties, have a higher proportion of white residents, while lower-income areas like Clayton County show a lower proportion.

# Plot 3: log review count against median household income, one panel per
# county, with each point shaded by the proportion of white residents
# (blue = low, red = high).
ggplot(coffee, aes(x = review_count_log, y = hhincome, color = pct_white)) +
  geom_point(size = 1.5, alpha = 0.7) +
  facet_wrap(~ county) +
  scale_color_gradient(low = "blue", high = "red") +
  labs(
    title = "Scatterplot: Review Count vs. Household Income",
    x = "Review Count (log)",
    y = "Median Annual Household Income",
    color = "Proportion of residents\nwho self-identified as white"
  ) +
  # theme_bw() must come before theme() so the tweaks below are not reset.
  theme_bw() +
  theme(
    # Title
    plot.title = element_text(size = 12, color = "black"),
    # Axis titles and tick labels
    axis.title.x = element_text(size = 8),
    axis.title.y = element_text(size = 8),
    axis.text.x = element_text(size = 6),
    axis.text.y = element_text(size = 6),
    # Facet (county) labels
    strip.text = element_text(size = 6),
    # Compact legend
    legend.title = element_text(size = 6),
    legend.text = element_text(size = 5),
    legend.key.height = unit(0.4, "cm"),
    legend.key.width = unit(0.4, "cm")
  )

Plot 4. Variables used - pct_pov_log, hhincome, pct_white, pop, review_count_log, county