library(tidyverse)
library(ggplot2)
library(plotly)
# Read the coffee data set from the current working directory.
coffee <- read_csv("coffee.csv")

# Plot 1 ----
# Convert the average rating to a factor so that every rating value is drawn
# as its own box instead of being treated as a continuous axis.
coffee[["avg_rating"]] <- as.factor(coffee[["avg_rating"]])

# Boxplot of household income for each average-rating level.
bxplot <- ggplot(data = coffee) +
  geom_boxplot(
    mapping = aes(x = avg_rating, y = hhincome),
    fill = "white",
    colour = "black"
  ) +
  labs(x = "yelp average rating", y = "household income")

# Convert the static ggplot object into an interactive plotly widget.
plotly::ggplotly(bxplot)
Figure 1. As median household income increases, the average Yelp rating of coffee shops appears to increase as well, until the 5-star rating. Household incomes are more spread out, with more outliers, at the 2- and 4-star ratings, whereas at the 1- and 5-star ratings household incomes are more concentrated.
# Plot 2 ----
# Re-use the Plot 1 boxplot, split it into one panel per county, and replace
# the axis titles with more descriptive ones.
# No fill scale is added here: the boxplot layer in `bxplot` sets `fill` to a
# constant instead of mapping it to a variable, so a fill scale such as
# scale_fill_brewer() would have no effect on the figure.
bxplot +
  facet_wrap(~county) +
  labs(x = "average yelp rating", y = "median household income ($)")
Figure 2. Median household incomes differ between counties. Clayton County has the lowest incomes overall; its highest incomes are roughly equivalent to the lowest incomes in Cobb County. The spread of incomes across Yelp ratings also differs greatly between counties.
# Plot 3 ----
# Scatterplot of logged review count (x) against household income (y), one
# panel per county, with points coloured by `pct_white`.
# The legend title describes the mapped variable (`pct_white`); it previously
# read "non-white", which contradicted the colour mapping.
bxplot2 <- ggplot(data = coffee) +
  geom_point(
    aes(x = review_count_log, y = hhincome, color = pct_white),
    alpha = 0.5  # semi-transparent points so overlapping observations stay visible
  ) +
  labs(
    x = "Review Count (log)",
    y = "Median Household Income ($)",
    color = "Proportion of residents \n who self-identify \n as white"
  ) +
  ggtitle("Scatterplot: Review Count vs. Household Income") +
  facet_wrap(~county)

# Render the scatterplot as an interactive plotly widget.
plotly::ggplotly(bxplot2)
Figure 3. Looking only at census tracts, income, and race, the census tracts in DeKalb and Fulton counties appear to be segregated (tracts have either a high proportion of white residents or a high proportion of non-white residents), although there are also many census tracts that are close to 50% white. Cobb and Gwinnett counties seem to have a larger white population, while Clayton County has a larger non-white population. In terms of household income, Cobb, DeKalb, and Fulton counties have a large spread of incomes, and also include the census tracts with the highest household incomes. The number of coffee shops is lowest in Clayton County. In general, there appear to be more coffee shops in census tracts with a higher proportion of white residents.
#' Build an "R = ..., p = ..." annotation label for two columns
#'
#' Computes the Pearson correlation between two columns of `data` and the
#' p-value of the slope in the simple linear regression of `y_var` on `x_var`.
#' Both statistics are computed on the same rows: any row where either column
#' is missing is dropped first, so the correlation is no longer `NA` when the
#' data contain missing values and always matches the sample used by `lm()`.
#'
#' @param data A data frame (or tibble) containing both columns.
#' @param x_var Name of the predictor column, as a string.
#' @param y_var Name of the response column, as a string.
#' @return A single string of the form `"R = <r> \np = <p>"`, with the
#'   correlation rounded to 2 decimals and the p-value rounded to 5 decimals.
get_stats <- function(data, x_var, y_var) {
  stopifnot(all(c(x_var, y_var) %in% names(data)))

  # Copy the two columns under fixed names so the model formula never has to
  # be assembled from strings (which fails for non-syntactic column names).
  pair <- data.frame(x = data[[x_var]], y = data[[y_var]])
  pair <- pair[complete.cases(pair), , drop = FALSE]

  # Calculate correlation coefficient
  r <- round(cor(pair$x, pair$y), 2)

  # Calculate p-value of the slope using a linear model
  model <- lm(y ~ x, data = pair)
  p_val <- round(summary(model)$coefficients[2, 4], 5)

  paste("R =", r, "\np =", p_val)
}
# Overall (all counties pooled) R / p labels, one per panel of Figure 4.
# Each label relates the logged review count to one neighbourhood variable.
stats_pov <- get_stats(
  data = coffee, x_var = "review_count_log", y_var = "pct_pov_log"
)
stats_inc <- get_stats(
  data = coffee, x_var = "review_count_log", y_var = "hhincome"
)
stats_white <- get_stats(
  data = coffee, x_var = "review_count_log", y_var = "pct_white"
)
stats_pop <- get_stats(
  data = coffee, x_var = "review_count_log", y_var = "pop"
)
# Panel 1: logged review count (x) against pct_pov_log (y), coloured by county.
# Axis titles name the variable actually drawn on each axis (the x axis is the
# logged review count, matching the labelling used in Plot 3).
plot1 <- ggplot(
  data = coffee,
  aes(x = review_count_log, y = pct_pov_log, color = county)
) +
  # Explicit formula: same fit as the default, without the console message.
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
  geom_point(size = 0.5) +
  theme(legend.position = "right") +
  labs(x = "Review Count (log)", y = "Percent Residents Under Poverty") +
  # Overall R / p label anchored at the top-left corner of the data range;
  # na.rm keeps the anchor defined when a column contains missing values.
  annotate(
    "text",
    x = min(coffee$review_count_log, na.rm = TRUE),
    y = max(coffee$pct_pov_log, na.rm = TRUE),
    label = stats_pov, hjust = 0, vjust = 1, size = 3
  )
# Panel 2: logged review count (x) against hhincome (y), coloured by county.
# Axis titles name the variable actually drawn on each axis (the x axis is the
# logged review count, matching the labelling used in Plot 3).
plot2 <- ggplot(
  data = coffee,
  aes(x = review_count_log, y = hhincome, color = county)
) +
  # Explicit formula: same fit as the default, without the console message.
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
  geom_point(size = 0.5) +
  theme(legend.position = "none") +
  labs(x = "Review Count (log)", y = "Median Household Income ($)") +
  # Overall R / p label anchored at the top-left corner of the data range;
  # na.rm keeps the anchor defined when a column contains missing values.
  annotate(
    "text",
    x = min(coffee$review_count_log, na.rm = TRUE),
    y = max(coffee$hhincome, na.rm = TRUE),
    label = stats_inc, hjust = 0, vjust = 1, size = 3
  )
# Panel 3: logged review count (x) against pct_white (y), coloured by county.
# Axis titles name the variable actually drawn on each axis (the x axis is the
# logged review count, matching the labelling used in Plot 3).
plot3 <- ggplot(
  data = coffee,
  aes(x = review_count_log, y = pct_white, color = county)
) +
  # Explicit formula: same fit as the default, without the console message.
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
  geom_point(size = 0.5) +
  theme(legend.position = "none") +
  labs(x = "Review Count (log)", y = "Percent White Residents") +
  # Overall R / p label anchored at the top-left corner of the data range;
  # na.rm keeps the anchor defined when a column contains missing values.
  annotate(
    "text",
    x = min(coffee$review_count_log, na.rm = TRUE),
    y = max(coffee$pct_white, na.rm = TRUE),
    label = stats_white, hjust = 0, vjust = 1, size = 3
  )
# Panel 4: logged review count (x) against pop (y), coloured by county.
# Axis titles name the variable actually drawn on each axis (the x axis is the
# logged review count, matching the labelling used in Plot 3).
plot4 <- ggplot(
  data = coffee,
  aes(x = review_count_log, y = pop, color = county)
) +
  # Explicit formula: same fit as the default, without the console message.
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
  geom_point(size = 0.5) +
  theme(legend.position = "none") +
  labs(x = "Review Count (log)", y = "Total Population") +
  # Overall R / p label anchored at the top-left corner of the data range;
  # na.rm keeps the anchor defined when a column contains missing values.
  annotate(
    "text",
    x = min(coffee$review_count_log, na.rm = TRUE),
    y = max(coffee$pop, na.rm = TRUE),
    label = stats_pop, hjust = 0, vjust = 1, size = 3
  )
# Assemble the four panels into a 2 x 2 grid with patchwork:
# `+` places plots side by side, `/` stacks the two rows.
library(patchwork)
combined_plot <- ((plot1 + plot2) / (plot3 + plot4)) +
  # Gather the panels' legends into a single shared legend.
  plot_layout(guides = "collect") +
  # Figure-level title and subtitle; the title is centred.
  plot_annotation(
    theme = theme(plot.title = element_text(hjust = 0.5)),
    title = "Scatterplot between logged review count & neighborhood characteristics",
    subtitle = "Using Yelp data in Five Counties Around Atlanta, GA"
  )

# Print the assembled figure.
combined_plot
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
Figure 4. There is a significant but weak correlation between the percentage of white residents in a census tract and the (logged) Yelp review counts of coffee shops, indicating a higher number of coffee shops in census tracts with a higher proportion of white residents. There is a very slight but significant negative correlation between median household income and review counts, meaning there are actually fewer coffee shops in census tracts with higher median household incomes. There is a significant, weak negative correlation between the percentage of residents in poverty and review counts: as the percentage of residents in poverty increases, the number of coffee shops decreases. There is no correlation between total population and coffee shops.