# Load required packages
library(tidyverse)
library(sf)
library(tmap)
library(leaflet)
library(tidycensus)
library(readr)
library(ggplot2)
library(plotly)
library(gridExtra)

# Read the coffee CSV from the course site; every plot below uses `coffee`
coffee <- readr::read_csv(
  file = "https://ujhwang.github.io/urban-analytics-2024/Assignment/mini_4/coffee.csv"
)

## New names:
## Rows: 363 Columns: 14
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (1): county dbl (13): ...1, GEOID, hhincome, pct_pov, review_count, avg_rating,
## pop, avg...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`

Plot 1: Average Ratings and Household Income

Variables used - avg_rating, hhincome

# Household income distribution for each distinct average-rating value;
# the rating is converted to a factor so each value gets its own box
bxplot1 <- coffee %>%
  ggplot(aes(x = as.factor(avg_rating), y = hhincome)) +
  geom_boxplot(fill = "white", color = "black") +
  labs(x = "avg_rating", y = "hhincome")

# Turn the static ggplot into an interactive plotly widget
plotly::ggplotly(bxplot1)

This boxplot shows the relationship between average coffee ratings (avg_rating) and household income (hhincome). Higher income groups are more represented in the middle ratings 2 and 4, while lower income groups appear more prevalent in both the lowest and highest ratings (1 and 5). Higher ratings don’t necessarily correlate with higher income levels, because the income distribution at rating 5 sits lower than at the middle ratings.

Plot 2: Median Household Income Across Yelp Rating Categories Grouped by County

# Household income by average Yelp rating, one boxplot panel per county
b <- coffee %>%
  ggplot(aes(x = as.factor(avg_rating), y = hhincome)) +
  geom_boxplot(color = "black", fill = "white") +
  facet_wrap(vars(county)) +  # one panel for each county value
  labs(
    x = "Average Yelp Rating",
    y = "Median Annual Household Income ($)"
  ) +
  theme(legend.position = "none")  # suppress any legend

# Render the faceted figure
b

The boxplots demonstrate that higher average Yelp ratings are associated with higher median household incomes in Cobb, DeKalb, Fulton, and Gwinnett counties. Clayton County, however, shows relatively low income levels across all average ratings, which indicates that higher Yelp ratings don’t significantly correlate with higher household incomes in that county.

Plot 3: Review Count and Median Household Income

# Scatter plot of logged review count against median household income,
# coloured by the share of white residents, with one panel per county.
scatter_plot1 <- ggplot(data = coffee) +
  geom_point(
    aes(x = review_count_log, y = hhincome, color = pct_white),
    size = 2
  ) +
  labs(
    # The x aesthetic is review_count_log, so the axis is labelled as logged
    # (same wording as Plot 4) rather than as a raw review count.
    x = "Review Count Logged",
    y = "Median Annual Household Income",
    color = "Proportion of residents who\nself-identified as white"
  ) +
  facet_wrap(~ county) +   # Facet by county to create one scatter plot per county
  scale_color_gradient(low = "blue", high = "red") +
  theme_minimal() +        # Clean theme
  theme(
    legend.position = "right",                    # Move legend to the right
    legend.text = element_text(size = 8),
    legend.title = element_text(size = 9),
    legend.key.size = unit(0.5, "cm"),
    panel.border = element_rect(color = "black", fill = NA),  # Add black borders around scatterplots
    strip.background = element_rect(color = "black", fill = NA)  # Add black borders around facet labels
  )

# Display the plot
scatter_plot1

In this plot delving into the relationship between race, income, and user engagement, Clayton County once again demonstrates a majority of reviews at lower income levels with a relatively low proportion of white residents. Cobb, DeKalb, Fulton, and Gwinnett counties exhibit higher median household incomes with a higher proportion of residents identifying as white. Higher review counts generally coincide with higher income areas, as in Cobb and Fulton, which show a wide spread of review counts and income levels. This may suggest that areas with higher income are more likely to contribute a higher volume of reviews.

Plot 4: Logged Review Count and Neighborhood Characteristics

# Filter the data for the specified counties around Atlanta, GA
selected_counties <- coffee %>%
  filter(
    county %in% c(
      "Fulton County", "DeKalb County", "Cobb County",
      "Gwinnett County", "Clayton County"
    )
  )

if (nrow(selected_counties) == 0) {
  # Nothing matched the county filter: report it instead of plotting
  message("No data available for the selected counties.")
} else {

  # Reshape the data into long format: the four characteristic columns are
  # stacked into a name/value pair so each one can become its own facet
  long_data <- selected_counties %>%
    pivot_longer(
      cols = c(pct_pov_log, hhincome, pct_white, pop),
      names_to = "neighborhood_characteristic",
      values_to = "value"
    )

  # Check the structure of long_data
  print("Structure of long_data:")
  str(long_data)

  # Create scatter plots of logged review count against each characteristic
  scatter_plot2 <- ggplot(
    long_data,
    aes(x = review_count_log, y = value, color = county)
  ) +
    geom_point(size = 1) +
    # Linear fit per county without a confidence band; the formula is spelled
    # out so geom_smooth() does not emit its "using formula" message
    geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
    labs(x = "Review Count Logged", y = "Values", color = "County") +
    ggtitle(
      "Scatterplot between logged review count & neighborhood characteristics",
      subtitle = "Using Yelp data in Five Counties Around Atlanta, GA"
    ) +
    # Free y scales because the characteristics are on very different ranges
    facet_wrap(~ neighborhood_characteristic, scales = "free_y") +
    theme_minimal() +  # Clean minimal theme
    theme(
      legend.position = "right",  # Legend on the right
      panel.border = element_rect(color = "black", fill = NA),  # Add black borders around scatterplots
      strip.background = element_rect(color = "black", fill = NA)  # Add black borders around facet labels
    )

  # Print explicitly: a bare object inside braces is only shown through
  # top-level auto-printing, which does not happen under source() or when
  # this code runs inside a function or loop
  print(scatter_plot2)
}

## [1] "Structure of long_data:"
## tibble [1,452 × 12] (S3: tbl_df/tbl/data.frame)
##  $ ...1                       : num [1:1452] 1 1 1 1 2 2 2 2 3 3 ...
##  $ GEOID                      : num [1:1452] 1.31e+10 1.31e+10 1.31e+10 1.31e+10 1.31e+10 ...
##  $ county                     : chr [1:1452] "Clayton County" "Clayton County" "Clayton County" "Clayton County" ...
##  $ pct_pov                    : num [1:1452] 0.201 0.201 0.201 0.201 0.211 ...
##  $ review_count               : num [1:1452] 57 57 57 57 13 ...
##  $ avg_rating                 : num [1:1452] 2 2 2 2 3 3 3 3 2 2 ...
##  $ avg_price                  : num [1:1452] 1 1 1 1 1 1 1 1 1 1 ...
##  $ hhincome_log               : num [1:1452] 10.4 10.4 10.4 10.4 10.3 ...
##  $ review_count_log           : num [1:1452] 4.06 4.06 4.06 4.06 1.98 ...
##  $ yelp_n                     : num [1:1452] 1 1 1 1 2 2 2 2 3 3 ...
##  $ neighborhood_characteristic: chr [1:1452] "pct_pov_log" "hhincome" "pct_white" "pop" ...
##  $ value                      : num [1:1452] -1.55 3.33e+04 7.51e-02 2.85e+03 -1.51 ...

## `geom_smooth()` using formula = 'y ~ x'

According to the data, neighborhoods with higher household incomes and a higher proportion of white residents tend to contribute a larger volume of reviews. This indicates a potential link between wealthier, mostly white areas and more engagement based on review count. According to the pct_white plot, higher review counts are associated with neighborhoods with a greater proportion of white residents. The poverty plot demonstrates a negative correlation between poverty and review count, which suggests that poorer areas are less likely to leave reviews. The population plot shows little impact on review counts, which suggests that the size of a neighborhood isn’t necessarily correlated with more reviews.

Mini Assignment 4 - Points of Interest

Alex Kozela

2024-10-11

Plot 1: Average Ratings and Household Income

Variables used - avg_rating, hhincome

Plot 2: Median Household Income Across Yelp Rating Categories Grouped by County

Plot 3: Review Count and Median Household Income

Plot 4: Logged Review Count and Neighborhood Characteristics