library(tidyverse)
library(ggplot2)
library(plotly)
library(ggrepel)
# Read coffee.csv from the working directory; the first row of the file
# supplies the column names.
coffee <- read.csv(file = "coffee.csv", header = TRUE)
# Plot 1: Simple Boxplot - Areas with mid-range ratings tend to have higher
# and more variable household incomes, while those with very low or very high
# ratings show lower median incomes.
# Boxplot of hhincome for each distinct avg_rating value. avg_rating is
# wrapped in factor() so it is treated as a discrete x variable and every
# rating gets its own box.
bxplot1 <- ggplot(coffee) +
  geom_boxplot(
    mapping = aes(x = factor(avg_rating), y = hhincome),
    colour = "black",
    fill = "white"
  ) +
  # Axis titles are the raw column names
  xlab("avg_rating") +
  ylab("hhincome")

# Render the plot
bxplot1
# Plot 2: Faceted Boxplot by County - Median household income generally
# increases with higher average ratings in most counties, such as Fulton,
# DeKalb, Cobb, and Gwinnett. Clayton shows lower and less variable incomes
# overall.
# Same income-by-rating boxplot as above, split into one panel per county
# and given descriptive axis titles.
bxplot2 <- ggplot(coffee) +
  geom_boxplot(
    mapping = aes(x = as.factor(avg_rating), y = hhincome),
    colour = "black",
    fill = "white"
  ) +
  # One facet per value of the county column
  facet_wrap(vars(county)) +
  xlab("Average Rating") +
  ylab("Median Annual Household Income ($)") +
  theme_grey()

# Render the plot
bxplot2
# Plot 3: Faceted Scatter Plot by County - Higher-income areas generally show
# higher review counts, with this trend more pronounced in counties like
# Fulton and Cobb. Lighter-colored dots, representing areas with a higher
# proportion of white residents, tend to cluster in neighborhoods with both
# higher income and more reviews. In contrast, Clayton County stands out with
# the lowest levels of both household income and review activity.
# Scatter plot of hhincome against review_count_log, one panel per county,
# with points coloured by pct_white on a dark-red (low) to grey (high)
# gradient.
scplot1 <- ggplot(data = coffee) +
  geom_point(
    mapping = aes(
      x = review_count_log,
      y = hhincome,
      color = pct_white
    ),
    # Semi-transparent points so overlapping observations stay visible
    size = 2,
    alpha = 0.5
  ) +
  facet_wrap(~county) +
  scale_color_gradient(
    low = "darkred",
    high = "grey",
    name = "Proportion of residents\nwho self-identified as white"
  ) +
  # Add axis labels and title
  labs(
    title = "Scatterplot: Review Count vs. Household Income",
    x = "Review Count (log)",
    y = "Median Annual Household Income"
  ) +
  theme_bw() +
  # Centre the title and shade the facet strips. No trailing comma after the
  # last argument: an empty argument to theme() is only tolerated by some
  # ggplot2 versions.
  theme(
    plot.title = element_text(hjust = 0.5),
    strip.background = element_rect(fill = "gray90")
  )

# Render the plot
scplot1
# Plot 4: Scatterplot between logged review count & neighborhood
# characteristics - Higher review counts are modestly associated with
# neighborhoods that have higher median household incomes (R = 0.16) and a
# greater proportion of residents who identify as white (R = 0.24).
# Conversely, areas with higher poverty levels tend to have slightly lower
# review counts (R = -0.12). There is no significant relationship between
# review count and total population size (R = -0.03), suggesting that
# population alone does not drive review activity.
# Reshape the four neighbourhood variables from wide to long: each input row
# becomes four rows, with the source column name in `metric` and its value
# in `value`.
coffee_long <- pivot_longer(
  coffee,
  cols = c(hhincome, pct_pov_log, pct_white, pop),
  names_to = "metric",
  values_to = "value"
)

# Swap the raw column names in `metric` for descriptive labels
coffee_long <- mutate(
  coffee_long,
  metric = recode(
    metric,
    hhincome = "Median Annual Household Income ($)",
    pct_pov_log = "Residents Under Poverty Level (%; log)",
    pct_white = "Residents who self-identify as White",
    pop = "Total Population"
  )
)
# For every metric, compute the correlation coefficient (r) between
# review_count_log and the metric's value, plus the p-value reported by
# cor.test() for the same pair. The result has one row per metric and is
# returned ungrouped.
cor_data <- summarise(
  group_by(coffee_long, metric),
  # use = "complete.obs" drops rows where either variable is missing
  r = cor(x = review_count_log, y = value, use = "complete.obs"),
  p = cor.test(x = review_count_log, y = value)[["p.value"]],
  .groups = "drop"
)
# Scatter plots of each neighbourhood metric against review_count_log, one
# panel per metric (independent y scales), coloured by county with a linear
# fit per county, and annotated with the per-metric correlation from cor_data.
scplot2 <- ggplot(
  data = coffee_long,
  aes(x = review_count_log, y = value, color = county)
) +
  geom_point(alpha = 0.7) +
  # Explicit formula keeps geom_smooth() from printing its
  # "using formula" message on every render
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
  facet_wrap(~metric, scales = "free_y") +
  geom_text_repel(
    data = cor_data,
    aes(
      # Anchor the label just right of the smallest observed x value.
      # na.rm = TRUE keeps a single missing review_count_log from turning
      # the anchor into NA and dropping every annotation.
      x = min(coffee_long$review_count_log, na.rm = TRUE) + 0.2,
      # Inf pins the label to the top edge of each panel
      y = Inf,
      label = paste0("R = ", round(r, 2), ", p = ", signif(p, 2))
    ),
    # cor_data has no county column, so do not inherit the global
    # color = county mapping; x, y and label are all supplied above
    inherit.aes = FALSE,
    color = "black",
    size = 3,
    hjust = 0,
    vjust = 1
  ) +
  # Add axis labels, title, and legend
  labs(
    x = "Review Count (log)",
    y = "Values",
    color = "County",
    title = "Scatterplot between logged review count & neighborhood characteristics"
  ) +
  theme_bw()

# Render the plot
scplot2