library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.3
# Load the coffee-shop data from the working directory. The plots below read
# columns such as avg_rating, hhincome and county from this data frame.
coffee <- read.csv(file = "coffee.csv")

# Plot 1
# Boxplots of hhincome for each avg_rating value. avg_rating is wrapped in
# factor() so that every distinct rating gets its own box on a discrete axis.
# The constant outline/fill colours are set on geom_boxplot(): extra arguments
# given to ggplot() itself are not applied to any layer.
ggplot(data = coffee, mapping = aes(x = factor(avg_rating), y = hhincome)) +
  geom_boxplot(color = "black", fill = "white") +
  labs(x = "avg_rating")

# We see that coffee shops with an average rating of 1 are mostly located in census tracts with a low median household income. We also see that most coffee shops are located in census tracts with a median household income of $100,000 or less.

# Plot 2
# Same boxplots as Plot 1 (hhincome by avg_rating), drawn in a separate panel
# for each county. The constant outline/fill colours are set on
# geom_boxplot(): extra arguments given to ggplot() itself are not applied to
# any layer.
ggplot(data = coffee, mapping = aes(x = factor(avg_rating), y = hhincome)) +
  geom_boxplot(color = "black", fill = "white") +
  facet_wrap(~county) +
  labs(
    x = "Average Yelp Rating",
    y = "Median Annual Household Income ($)"
  )

# We see that all of the coffee shops in Clayton County are located in census tracts with a low median household income, suggesting that incomes in Clayton County are low overall. Other than that, we do not see a clear trend of higher-income census tracts having coffee shops with higher ratings, particularly since, in most counties, most of the coffee shops rated 5 are located in lower-income census tracts.

# Plot 3
# Scatterplot of hhincome against review_count_log, one panel per county.
# Points are shaded on a blue-to-red gradient by pct_white.
ggplot(coffee, aes(x = review_count_log, y = hhincome, color = pct_white)) +
  geom_point() +
  facet_wrap(~county) +
  scale_color_gradient(
    name = "Proportion of residents who\nself-identified as white",
    low = "blue",
    high = "red"
  ) +
  labs(
    title = "Scatterplot: Review Count vs. Household Income",
    x = "Review Count (log)",
    y = "Median Annual Household Income ($)"
  ) +
  theme_bw()

# We observe from the chart above that as the proportion of residents who self-identify as white increases, the median household income tends to increase as well. However, we do not see a clear relationship between income and the (logged) review count.

# Plot 4
library(tidyr)

# Reshape to long format for faceting: the four neighborhood-characteristic
# columns are stacked into `Values`, and the name of the column each value
# came from is stored in `review_count_logged` (despite its name, that column
# holds the characteristic's column name, not a review count).
# `cols` is spelled out in full instead of relying on partial matching of
# `col`.
coffee_longer <- coffee %>%
  pivot_longer(
    cols = c(hhincome, pct_pov_log, pct_white, pop),
    names_to = "review_count_logged",
    values_to = "Values"
  )

# Facet strip labels for Plot 4: names are the pivoted column names, values
# are the human-readable titles shown above each panel.
subplot_title <- c(
  "hhincome" = "Median Annual Household Income ($)",
  "pct_pov_log" = "Percent Residents Under Poverty",
  "pct_white" = "Percent White Resident",
  "pop" = "Total Population"
)

# Scatterplots of each neighborhood characteristic (`Values`) against
# review_count_log, coloured by county, with a per-county linear fit and no
# confidence band. One panel per characteristic; each panel gets its own
# y scale, and panel titles come from subplot_title.
ggplot(coffee_longer, aes(review_count_log, Values, color = county)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  facet_wrap(
    vars(review_count_logged),
    scales = "free_y",
    labeller = as_labeller(subplot_title)
  ) +
  labs(
    title = "Scatterplot between logged review count & neighborhood characteristics",
    subtitle = "Using Yelp data in Five Counties Around Atlanta, GA",
    x = "Review Count Logged",
    y = "Values",
    color = "County"
  ) +
  theme_bw()
## `geom_smooth()` using formula = 'y ~ x'

# For most of the subplots, we do not see a clear pattern between the neighborhood characteristic and the log of review counts. We do observe that as the percentage of white residents increases, the log of review counts tends to increase as well. In addition, we again observe that household income in Clayton County is generally lower.