Visualization for Data Exploration & Communication

R Code Steps

  1. Loading all the necessary R packages that will be used throughout the workflow
library(tidyverse)
library(ggplot2)
library(plotly)
library(ggrepel)
  2. Getting data based on Census 2023 ACS 5-year estimates and Yelp POI data - It includes Census Tracts in Fulton, DeKalb, Cobb, Gwinnett, and Clayton Counties that have at least one coffee shop.
# Read the coffee-shop dataset from the working directory; the first row of
# the file supplies the column names.
coffee <- read.csv(file = "coffee.csv", header = TRUE)

Replicating the Plots

Plot 1: Simple Boxplot - Areas with mid-range ratings tend to have higher and more variable household incomes, while those with very low or very high ratings show lower median incomes.

# Plot 1: one box per distinct avg_rating value (treated as a category),
# showing the distribution of hhincome within that rating.
bxplot1 <- ggplot(coffee, aes(x = factor(avg_rating), y = hhincome)) +
  geom_boxplot(colour = "black", fill = "white") +
  # Axis titles intentionally show the raw variable names
  xlab("avg_rating") +
  ylab("hhincome")

# Render the plot
bxplot1

Plot 2: Faceted Boxplot by County - Median household income generally increases with higher average ratings in most counties, like Fulton, DeKalb, Cobb and Gwinnett. Clayton shows lower and less variable incomes overall.

# Plot 2: same boxplot as Plot 1 (hhincome by avg_rating category), drawn in
# a separate panel for every value of `county`.
bxplot2 <- ggplot(coffee, aes(x = as.factor(avg_rating), y = hhincome)) +
  geom_boxplot(colour = "black", fill = "white") +
  # One facet per county
  facet_wrap(vars(county)) +
  xlab("Average Rating") +
  ylab("Median Annual Household Income ($)") +
  theme_grey()

# Render the plot
bxplot2

Plot 3: Faceted Scatter Plot by County - Higher-income areas generally show higher review counts, with this trend more pronounced in counties like Fulton and Cobb. Lighter-colored dots, which represent areas with a higher proportion of white residents, tend to cluster where both income and review counts are higher. In contrast, Clayton County stands out with the lowest levels of both household income and review activity.

# Plot 3: scatter of review_count_log (x) against hhincome (y), one panel per
# county, with each point colored by pct_white on a dark-red-to-grey gradient.
scplot1 <- ggplot(data = coffee) +
  geom_point(
    mapping = aes(
      x = review_count_log,
      y = hhincome,
      color = pct_white
    ),
    # Semi-transparent points so overlapping observations remain visible
    size = 2, alpha = 0.5
  ) +
  facet_wrap(~county) +
  scale_color_gradient(
    low = "darkred",
    high = "grey",
    name = "Proportion of residents\nwho self-identified as white"
  ) +

  # Add axis labels and title
  labs(
    title = "Scatterplot: Review Count vs. Household Income",
    x = "Review Count (log)",
    y = "Median Annual Household Income"
  ) +
  theme_bw() +
  # Tweaks applied on top of theme_bw(): centered title, light-grey facet
  # strips. No trailing comma after the last argument: an empty argument is
  # not accepted by every ggplot2 version.
  theme(
    plot.title = element_text(hjust = 0.5),
    strip.background = element_rect(fill = "gray90")
  )

# Render the plot
scplot1

Plot 4: Scatterplot between logged review count & neighborhood characteristics - Higher review counts are modestly associated with neighborhoods that have higher median household incomes (R = 0.16) and a greater proportion of residents who identify as white (R = 0.24). Conversely, areas with higher poverty levels tend to have slightly lower review counts (R = -0.12). There is no significant relationship between review count and total population size (R = -0.03), suggesting that population alone does not drive review activity.

# Reshape the four neighborhood variables from wide to long format: their
# names go into `metric` and their values into `value`. The variable names
# stored in `metric` are then replaced with the descriptive labels that are
# used as facet titles in Plot 4.
coffee_long <- coffee %>%
  pivot_longer(
    cols = all_of(c("hhincome", "pct_pov_log", "pct_white", "pop")),
    names_to = "metric",
    values_to = "value"
  ) %>%
  mutate(
    metric = recode(
      metric,
      !!!c(
        hhincome = "Median Annual Household Income ($)",
        pct_pov_log = "Residents Under Poverty Level (%; log)",
        pct_white = "Residents who self-identify as White",
        pop = "Total Population"
      )
    )
  )

# For every metric, compute the correlation coefficient (r) between
# review_count_log and the metric's values, plus the p-value reported by
# cor.test(). The result has one row per metric and is returned ungrouped.
cor_data <- summarise(
  group_by(coffee_long, metric),
  # use = "complete.obs" drops rows where either variable is missing
  r = cor(x = review_count_log, y = value, use = "complete.obs"),
  p = cor.test(x = review_count_log, y = value)[["p.value"]],
  .groups = "drop"
)

# Plot 4: scatter of review_count_log against each neighborhood metric (one
# facet per metric, independent y scales), colored by county, with a linear
# fit per county and the overall correlation (R, p) printed in every panel.
scplot2 <- ggplot(
  data = coffee_long,
  aes(x = review_count_log, y = value, color = county)
) +
  geom_point(alpha = 0.7) +
  # Straight-line fit without confidence band; the color mapping yields one
  # line per county
  geom_smooth(method = "lm", se = FALSE) +
  facet_wrap(~metric, scales = "free_y") +
  geom_text_repel(
    data = cor_data,
    aes(
      # Anchor the label just right of the smallest x value. na.rm = TRUE
      # keeps the position defined when review_count_log contains NAs.
      x = min(coffee_long$review_count_log, na.rm = TRUE) + 0.2,
      # Inf pins the label to the top edge of each panel
      y = Inf,
      label = paste0("R = ", round(r, 2), ", p = ", signif(p, 2))
    ),
    # cor_data has no `county` column, so do not inherit the plot-level
    # color = county mapping for this annotation layer.
    inherit.aes = FALSE,
    color = "black", size = 3, hjust = 0, vjust = 1
  ) +

  # Add axis labels, title, and legend
  labs(
    x = "Review Count (log)",
    y = "Values",
    color = "County",
    title = "Scatterplot between logged review count & neighborhood characteristics"
  ) +
  theme_bw()

# Render the plot
scplot2