# Load required packages
library(tidyverse)
library(sf)
library(tmap)
library(leaflet)
library(tidycensus)
library(readr)
library(ggplot2)
library(plotly)
library(gridExtra)
# Preparing data: read coffee.csv from the remote URL into a tibble
coffee <- readr::read_csv(
  file = "https://ujhwang.github.io/urban-analytics-2024/Assignment/mini_4/coffee.csv"
)
## New names:
## Rows: 363 Columns: 14
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (1): county dbl (13): ...1, GEOID, hhincome, pct_pov, review_count, avg_rating,
## pop, avg...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
# Boxplot of household income (hhincome) for each value of avg_rating;
# avg_rating is converted to a factor so every rating gets its own box
bxplot1 <- ggplot(coffee, aes(x = as.factor(avg_rating), y = hhincome)) +
  geom_boxplot(fill = "white", color = "black") +
  labs(x = "avg_rating", y = "hhincome")

# Convert the static ggplot object into an interactive plotly figure
plotly::ggplotly(bxplot1)
This boxplot shows the relationship between average coffee ratings (avg_rating) and household income (hhincome). Higher-income areas are more represented at ratings 2 and 4, while lower-income areas are more prevalent at both the lowest and highest ratings (1 and 5). Higher ratings therefore don’t necessarily correspond to higher income levels, since the income distribution at rating 5 is comparatively low.
# Boxplots of hhincome by avg_rating, drawn as one panel per county
b <- ggplot(coffee, aes(x = as.factor(avg_rating), y = hhincome)) +
  geom_boxplot(color = "black", fill = "white") +
  facet_wrap(~ county) +
  labs(
    x = "Average Yelp Rating",
    y = "Median Annual Household Income ($)"
  ) +
  theme(legend.position = "none")

# Render the faceted figure
b
The boxplots show that higher average Yelp ratings are associated with higher median household incomes in Cobb, DeKalb, Fulton, and Gwinnett counties. Clayton County, however, shows relatively low income levels across all average ratings, which indicates that higher Yelp ratings don’t significantly correlate with higher household incomes in that county.
# Scatter plot of logged review count (review_count_log) against household
# income (hhincome), one panel per county, with points colored by pct_white.
scatter_plot1 <- ggplot(data = coffee) +
  geom_point(
    aes(x = review_count_log, y = hhincome, color = pct_white),
    size = 2
  ) +
  labs(
    # The x aesthetic is the logged review count, so the label must say so;
    # a bare "Review Count" would misstate the scale of the axis.
    x = "Review Count (log)",
    y = "Median Annual Household Income",
    color = "Proportion of residents who\nself-identified as white"
  ) +
  facet_wrap(~ county) + # One scatter plot per county
  scale_color_gradient(low = "blue", high = "red") +
  theme_minimal() +
  theme(
    legend.position = "right",
    legend.text = element_text(size = 8),
    legend.title = element_text(size = 9),
    legend.key.size = unit(0.5, "cm"),
    # Black outlines around each panel and each facet label
    panel.border = element_rect(color = "black", fill = NA),
    strip.background = element_rect(color = "black", fill = NA)
  )

# Display the plot
scatter_plot1
In this plot, which explores the relationship between race, income, and user engagement, Clayton County once again shows most of its reviews at lower income levels, with a relatively low proportion of white residents. Cobb, DeKalb, Fulton, and Gwinnett counties exhibit higher median household incomes with a higher proportion of residents identifying as white. Higher review counts generally coincide with higher-income areas, as in Cobb and Fulton, which show a wide spread of review counts and income levels; this may suggest that higher-income areas are more likely to contribute a higher volume of reviews.
# Filter the data for the specified counties around Atlanta, GA
selected_counties <- coffee %>%
  filter(
    county %in% c(
      "Fulton County", "DeKalb County", "Cobb County",
      "Gwinnett County", "Clayton County"
    )
  )

if (nrow(selected_counties) == 0) {
  # Nothing to plot: report it instead of building an empty figure
  message("No data available for the selected counties.")
} else {
  # Reshape to long format: one row per (observation, characteristic) pair so
  # that each characteristic can be drawn in its own facet
  long_data <- selected_counties %>%
    pivot_longer(
      cols = c(pct_pov_log, hhincome, pct_white, pop),
      names_to = "neighborhood_characteristic",
      values_to = "value"
    )

  # Check the structure of long_data
  print("Structure of long_data:")
  str(long_data)

  # Scatter plots of logged review count against each characteristic, with a
  # per-county linear fit (color is mapped to county in the global aesthetics)
  scatter_plot2 <- ggplot(
    long_data,
    aes(x = review_count_log, y = value, color = county)
  ) +
    geom_point(size = 1) +
    geom_smooth(method = "lm", se = FALSE) + # Linear fit, no confidence band
    labs(x = "Review Count Logged", y = "Values", color = "County") +
    ggtitle(
      "Scatterplot between logged review count & neighborhood characteristics",
      subtitle = "Using Yelp data in Five Counties Around Atlanta, GA"
    ) +
    # Free y scales because the characteristics are on very different scales
    facet_wrap(~ neighborhood_characteristic, scales = "free_y") +
    theme_minimal() +
    theme(
      legend.position = "right",
      # Black outlines around each panel and each facet label
      panel.border = element_rect(color = "black", fill = NA),
      strip.background = element_rect(color = "black", fill = NA)
    )

  # Print explicitly: inside a braced branch the plot is only rendered if it
  # happens to be the visible value of the whole if/else expression, which
  # breaks as soon as this code is moved into a function or extended
  print(scatter_plot2)
}
## [1] "Structure of long_data:"
## tibble [1,452 × 12] (S3: tbl_df/tbl/data.frame)
## $ ...1 : num [1:1452] 1 1 1 1 2 2 2 2 3 3 ...
## $ GEOID : num [1:1452] 1.31e+10 1.31e+10 1.31e+10 1.31e+10 1.31e+10 ...
## $ county : chr [1:1452] "Clayton County" "Clayton County" "Clayton County" "Clayton County" ...
## $ pct_pov : num [1:1452] 0.201 0.201 0.201 0.201 0.211 ...
## $ review_count : num [1:1452] 57 57 57 57 13 ...
## $ avg_rating : num [1:1452] 2 2 2 2 3 3 3 3 2 2 ...
## $ avg_price : num [1:1452] 1 1 1 1 1 1 1 1 1 1 ...
## $ hhincome_log : num [1:1452] 10.4 10.4 10.4 10.4 10.3 ...
## $ review_count_log : num [1:1452] 4.06 4.06 4.06 4.06 1.98 ...
## $ yelp_n : num [1:1452] 1 1 1 1 2 2 2 2 3 3 ...
## $ neighborhood_characteristic: chr [1:1452] "pct_pov_log" "hhincome" "pct_white" "pop" ...
## $ value : num [1:1452] -1.55 3.33e+04 7.51e-02 2.85e+03 -1.51 ...
## `geom_smooth()` using formula = 'y ~ x'
According to the data, neighborhoods with higher household incomes and a higher proportion of white residents tend to generate a larger volume of reviews. This indicates a potential link between wealthier, mostly white areas and greater engagement as measured by review count. According to the pct_white plot, higher review counts are associated with neighborhoods with a greater proportion of white residents. The poverty plot shows a negative correlation between poverty and review count, which suggests that businesses in poorer areas receive fewer reviews. The population plot shows little relationship with review counts, which suggests that the size of a neighborhood isn’t necessarily correlated with more reviews.