Library packages
Read data file
# Load the coffee dataset from the CSV file in the working directory.
coffee <- read.csv(file = "coffee.csv")

# Preview the first rows to check that the columns were parsed as expected.
head(x = coffee)
## X GEOID county hhincome pct_pov review_count avg_rating
## 1 1 13063040202 Clayton County 33276 0.20134228 57.00000 2
## 2 2 13063040308 Clayton County 28422 0.21071800 13.00000 3
## 3 3 13063040407 Clayton County 49271 0.10825507 29.33333 2
## 4 4 13063040408 Clayton County 44551 0.18095661 20.00000 4
## 5 5 13063040410 Clayton County 49719 0.11468019 41.00000 1
## 6 6 13063040411 Clayton County 57924 0.09068942 18.00000 2
## pop avg_price pct_white hhincome_log review_count_log pct_pov_log yelp_n
## 1 2850 1 0.07508772 10.41289 4.060443 -1.554276 1
## 2 4262 1 0.26067574 10.25527 1.975622 -1.510869 2
## 3 4046 1 0.20514088 10.80529 3.320837 -2.134911 3
## 4 8489 1 0.16868889 10.70461 3.044522 -1.655709 1
## 5 7166 1 0.19369244 10.81434 3.737670 -2.082003 1
## 6 13311 1 0.16512659 10.96706 2.944439 -2.295715 1
Plots
Plot 1. Variables used - avg_rating, hhincome
Finding: Wealthier neighborhoods with higher median household income generally have higher-rated POIs, particularly for Yelp ratings of 3 and 4. However, there is a decrease in median income for neighborhoods with a Yelp rating of 5.
# Plot 1 (stored as `plot1` so it can be passed to plotly::ggplotly() later):
# box plots of median household income, one box per average Yelp rating.
#
# avg_rating is read from the CSV as a number, so it is converted to a factor
# here. Without the conversion geom_boxplot() has no per-rating groups to draw
# and the discrete x scale below is applied to continuous data.
plot1 <- ggplot(data = coffee) +
  geom_boxplot(
    aes(x = as.factor(avg_rating), y = hhincome),
    color = "black", fill = "white"
  ) +
  # Always show the five rating categories, in order, even if one is empty.
  # NOTE(review): assumes avg_rating only takes the values 1-5; any other
  # value would be dropped by these limits -- confirm against the data.
  scale_x_discrete(limits = c("1", "2", "3", "4", "5")) +
  labs(
    title = "Plot 1.",
    subtitle = "Variables used - avg_rating, hhincome",
    x = "Average Yelp Rating",
    y = "Median Annual Household Income ($)"
  )
# Static version of Plot 1: income distribution per average Yelp rating,
# with the rating treated as a categorical variable.
ggplot(coffee, aes(x = as.factor(avg_rating), y = hhincome)) +
  geom_boxplot(fill = "white", color = "black") +
  labs(
    title = "Plot 1. Variables used - avg_rating, hhincome",
    y = "Median Annual Household Income ($)",
    x = "Average Yelp Rating"
  )

# Interactive version, built from the stored `plot1` object.
plotly::ggplotly(plot1)
Plot 2. Variables used - avg_rating, hhincome, county
Finding: Higher Yelp ratings generally correlate with higher median household incomes, though the strength of this relationship varies by county. Fulton, Cobb, DeKalb, and Gwinnett show higher ratings linked to wealthier areas but experience drops in the 5-star category, while Clayton has consistently lower incomes across all ratings.
# Static version of Plot 2: the Plot 1 box plots, split into one panel
# per county.
ggplot(coffee, aes(x = as.factor(avg_rating), y = hhincome)) +
  geom_boxplot(fill = "white", color = "black") +
  facet_wrap(vars(county)) +
  labs(
    title = "Plot 2.",
    subtitle = "Variables used - avg_rating, hhincome, county",
    y = "Median Annual Household Income ($)",
    x = "Average Yelp Rating"
  )
# Store the per-county box plot so it can be handed to plotly below.
plot2 <- ggplot(data = coffee) +
  geom_boxplot(
    aes(x = as.factor(avg_rating), y = hhincome),
    fill = "white",
    color = "black"
  ) +
  facet_wrap(~ county) +
  labs(
    title = "Plot 2. Variables used - avg_rating, hhincome, county",
    y = "Median Annual Household Income ($)",
    x = "Average Yelp Rating"
  )

# Convert to interactive Plotly plot
plotly::ggplotly(plot2)
Plot 3. Variables used - review_count_log, hhincome, county, pct_white
Finding: Higher-income areas, especially in Fulton and Cobb counties, have a higher proportion of white residents, while lower-income areas like Clayton County show a lower proportion.
# Plot 3: logged review count against median household income, one panel per
# county, with points coloured by the proportion of white residents.
ggplot(coffee) +
  geom_point(
    aes(x = review_count_log, y = hhincome, color = pct_white),
    size = 1.5,
    alpha = 0.7
  ) +
  facet_wrap(~ county) +
  scale_color_gradient(low = "blue", high = "red") +
  labs(
    title = "Scatterplot: Review Count vs. Household Income",
    x = "Review Count (log)",
    y = "Median Annual Household Income",
    color = "Proportion of residents\nwho self-identified as white"
  ) +
  # Apply the complete theme first so the tweaks below are layered on top of it.
  theme_bw() +
  theme(
    plot.title = element_text(size = 12, color = "black"),
    # Axis titles and tick labels
    axis.title.x = element_text(size = 8),
    axis.title.y = element_text(size = 8),
    axis.text.x = element_text(size = 6),
    axis.text.y = element_text(size = 6),
    # Facet label size
    strip.text = element_text(size = 6),
    # Compact legend
    legend.title = element_text(size = 6),
    legend.text = element_text(size = 5),
    legend.key.height = unit(0.4, "cm"),
    legend.key.width = unit(0.4, "cm")
  )
Plot 4. Variables used - pct_pov_log, hhincome, pct_white, pop, review_count_log, county
Finding: The strongest correlation is with the percentage of white residents, showing that neighborhoods with more white residents tend to have more popular POIs. There is a weaker positive link with median income, while higher poverty rates are negatively associated with POI reviews. Total population has no significant impact on POI attractiveness.
# ggpubr provides stat_cor(), used to print the correlation on each panel.
library(ggpubr)

# Reshape to long format: the four neighbourhood characteristics are stacked
# into a `variable` / `value` pair so each one can be drawn in its own facet.
coffee_long <- coffee %>%
  pivot_longer(
    cols = c(pct_pov_log, hhincome, pct_white, pop),
    names_to = "variable",
    values_to = "value"
  )

# Facet strip labels, keyed by the original column names.
# NOTE(review): pct_pov_log is a logged column, but its label does not say
# so -- confirm whether the label should mention the log scale.
labels <- c(
  hhincome = "Median Annual Household Income ($)",
  pct_pov_log = "Percent Residents Under Poverty",
  pct_white = "Percent White Resident",
  pop = "Total Population"
)

# Plot 4: logged review count against each neighbourhood characteristic,
# with one linear fit per county and a Pearson correlation label per panel.
ggplot(data = coffee_long, aes(x = review_count_log, y = value)) +
  geom_point(aes(color = county), size = 1) +
  # `linewidth` replaces the line `size` argument deprecated in ggplot2 3.4.0.
  geom_smooth(aes(color = county), method = "lm", se = FALSE, linewidth = 0.5) +
  facet_wrap(~ variable, scales = "free_y", labeller = as_labeller(labels)) +
  labs(
    title = "Scatterplot between logged review count & neighborhood characteristics",
    subtitle = "Using Yelp data in Five Counties Around Atlanta, GA",
    x = "Review Count Logged",
    y = "Values",
    color = "County"
  ) +
  stat_cor(aes(x = review_count_log, y = value), method = "pearson", size = 2) +
  # theme_bw() is a complete theme: it must come BEFORE theme(), otherwise it
  # replaces every customisation made below.
  theme_bw() +
  theme(
    plot.title = element_text(size = 12, color = "black"),
    plot.subtitle = element_text(size = 10, hjust = 0.5),
    axis.title.x = element_text(size = 8),
    axis.title.y = element_text(size = 8),
    axis.text.x = element_text(size = 6),
    axis.text.y = element_text(size = 6),
    legend.title = element_text(size = 6),
    legend.text = element_text(size = 4),
    legend.position = "right"
  )
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using formula = 'y ~ x'
# ggpubr provides stat_cor(), used to print the correlation on each panel.
library(ggpubr)

# Facet strip labels, keyed by the original column names.
# NOTE(review): pct_pov_log is a logged column, but its label does not say
# so -- confirm whether the label should mention the log scale.
labels <- c(
  hhincome = "Median Annual Household Income ($)",
  pct_pov_log = "Percent Residents Under Poverty",
  pct_white = "Percent White Resident",
  pop = "Total Population"
)

# Plot 4 (piped version): reshape the four neighbourhood characteristics to
# long format and plot each against the logged review count in its own facet,
# with one linear fit per county and a Pearson correlation label per panel.
coffee %>%
  pivot_longer(
    cols = c(pct_pov_log, hhincome, pct_white, pop),
    names_to = "variable",
    values_to = "value"
  ) %>%
  ggplot(aes(x = review_count_log, y = value)) +
  geom_point(aes(color = county), size = 1) +
  # `linewidth` replaces the line `size` argument deprecated in ggplot2 3.4.0.
  geom_smooth(aes(color = county), method = "lm", se = FALSE, linewidth = 0.5) +
  facet_wrap(~ variable, scales = "free_y", labeller = as_labeller(labels)) +
  labs(
    title = "Scatterplot between logged review count & neighborhood characteristics",
    subtitle = "Using Yelp data in Five Counties Around Atlanta, GA",
    x = "Review Count Logged",
    y = "Values",
    color = "County"
  ) +
  stat_cor(aes(x = review_count_log, y = value), method = "pearson", size = 2) +
  # theme_bw() is a complete theme: it must come BEFORE theme(), otherwise it
  # replaces every customisation made below.
  theme_bw() +
  theme(
    plot.title = element_text(size = 12, color = "black"),
    plot.subtitle = element_text(size = 10, hjust = 0.5),
    axis.title.x = element_text(size = 8),
    axis.title.y = element_text(size = 8),
    axis.text.x = element_text(size = 6),
    axis.text.y = element_text(size = 6),
    legend.title = element_text(size = 6),
    legend.text = element_text(size = 5),
    legend.position = "right"
  )
## `geom_smooth()` using formula = 'y ~ x'