# Set CRAN mirror non-interactively
options(repos = c(CRAN = "https://cloud.r-project.org"))

install.packages("ggpubr", dependencies = TRUE)
## Installing package into 'C:/Users/SHAMBHAVI SINHA/AppData/Local/R/win-library/4.5'
## (as 'lib' is unspecified)
## package 'ggpubr' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\SHAMBHAVI SINHA\AppData\Local\Temp\Rtmp2nMtdB\downloaded_packages
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(scales)
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
library(ggplot2)

library(ggpubr)

# Load data
coffee_data <- read_csv("C:/Users/SHAMBHAVI SINHA/OneDrive/semester1_mcrp/urban_analytics_R/urban_analytics/coffee.csv")
## New names:
## Rows: 434 Columns: 14
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (1): county dbl (13): ...1, GEOID, hhincome, hhincome_log, pct_pov,
## pct_pov_log, pct_whi...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
# Define income bins (every $20k)
income_breaks <- seq(20000, max(coffee_data$hhincome, na.rm = TRUE) + 1, by = 20000)

# Midpoints of income bins
income_midpoints <- (income_breaks[-1] + income_breaks[-length(income_breaks)]) / 2

# Assign bin labels & numeric midpoint
coffee_data <- coffee_data %>%
  mutate(
    income_bin = cut(hhincome, breaks = income_breaks, include.lowest = TRUE),
    income_mid = as.numeric(income_bin) 
  )

# Boxplot  style
ggplot(coffee_data, aes(x = factor(avg_rating), y = hhincome)) +
  geom_boxplot(fill = "white", color = "black", outlier.shape = 16, outlier.size = 2) +
  scale_y_continuous(labels = comma) +
  labs(
    x = "avg_rating",
    y = "hhincome"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    panel.grid.major = element_line(color = "grey90"),
    panel.grid.minor = element_blank(),
    panel.border = element_rect(color = "grey", fill = NA, size = 0.6)
  )
## Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# --- BOX PLOT BY COUNTY --
ggplot(coffee_data, aes(x = factor(avg_rating), y = hhincome)) +
  geom_boxplot(fill = "white", color = "black", outlier.shape = 16, outlier.size = 2) +
  facet_wrap(~county) +
  scale_y_continuous(labels = comma) +
  labs(
    title = NULL,
    x = "Average Rating",
    y = "Median Annual Household Income ($)"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    panel.grid.major = element_line(color = "grey90"),
    panel.grid.minor = element_blank(),
    strip.text = element_text(face = "bold"),
    plot.title = element_text(face = "bold", hjust = 0.5),
 
    panel.border = element_rect(color = "grey", fill = NA, size = 0.6)
  )

# ---------- PLOT 3: Review Count (log) vs Income by County, Color = % White ----------
ggplot(coffee_data, aes(y = hhincome, x = review_count_log, color = pct_white)) +
  geom_point(alpha = 0.7, size = 2) +
  scale_color_gradient(low = "lightyellow", high = "darkblue", name = "Proportion of Whites") +
  facet_wrap(~county) +
  labs(title = "Scatterplot Review Count vs Household Income by County",
       y = "Median Annual Household Income ($)",
       x = "Review Count (log)") +
  theme_minimal()+theme(
    strip.text = element_text(face = "bold"),
    plot.title = element_text(face = "bold", hjust = 0.5),
    panel.grid.major = element_line(color = "grey90"),
    panel.grid.minor = element_blank(),
    # Add box around each panel
    panel.border = element_rect(color = "grey", fill = NA, size = 0.6)
  )

# Prepare data for Plot 4 using select and pivot_longer
plot4_data <- coffee_data %>%
  select(GEOID, county, review_count_log, hhincome, pct_pov_log, pct_white, pop) %>%
  pivot_longer(cols = c(hhincome, pct_pov_log, pct_white, pop),
               names_to = "variable",
               values_to = "value")

# ---- Prepare long-form data ----
plot4_data <- coffee_data %>%
  select(GEOID, county, review_count_log, hhincome, pct_pov_log, pct_white, pop) %>%
  pivot_longer(
    cols = c(hhincome, pct_pov_log, pct_white, pop),
    names_to = "variable",
    values_to = "value"
  )

# ---- Compute overall correlation ----
cor_stats <- plot4_data %>%
  group_by(variable) %>%
  summarise(
    cor = cor.test(review_count_log, value)$estimate,
    pval = cor.test(review_count_log, value)$p.value
  ) %>%
  mutate(
    label = paste0("R = ", round(cor, 2), ", p = ", signif(pval, 2))
  )

# ---- Custom facet labels ----
var_labels <- c(
  hhincome = "Median Annual Household Income ($)",
  pct_pov_log = "Residents Under Poverty Level (%; log)",
  pct_white = "Residents who self-identify as White (%)",
  pop = "Total Population"
)

# ---- Plot ----
plot4 <- ggplot(plot4_data, aes(x = review_count_log, y = value)) +
  geom_point(aes(color = county), alpha = 0.7, size = 2) +
  geom_smooth(aes(color = county), method = "lm", se = FALSE, linewidth = 1) +
  facet_wrap(~variable, scales = "free_y", labeller = as_labeller(var_labels)) +
  geom_text(
    data = cor_stats,
    aes(x = -Inf, y = Inf, label = label),
    hjust = -0.1, vjust = 1.5, size = 4.2, fontface = "italic", inherit.aes = FALSE
  ) +
  scale_color_brewer(palette = "Set1") +
  labs(
    title = "Scatterplot between logged review count & neighborhood characteristics",
    x = "Logged Review Count",
    y = "Values",
    color = "County"
  ) +
  theme_minimal(base_size = 10) +
  theme(
    strip.text = element_text(face = "bold"),
    plot.title = element_text(face = "bold", hjust = 0.5),
    panel.grid.major = element_line(color = "grey90"),
    panel.grid.minor = element_blank(),
    
    panel.border = element_rect(color = "grey", fill = NA, size = 0.6)
  )

print(plot4)
## `geom_smooth()` using formula = 'y ~ x'

plot 1 coffee shop ratings generally appear to be higher in moderately to high-income neighborhoods, with a peak around 4 stars.

Extremely low ratings (1 star) are more common in lower-income tracts.

Very high-income areas can also have highly rated coffee shops, but these are fewer (hence the outliers).So, better quality coffee shops are coming in the paces where people can afford to pay fot them or well off are rating their coffee shop experiance better.

plot 2 there seems to be a general trend where higher-rated coffee shops tend to be located in wealthier areas, particularly in counties like Cobb, Gwinnett, and Fulton. However, some counties, such as DeKalb, show more economic diversity, with high-rated coffee shops spread across both high- and low-income areas. Clayton County stands out for its relatively low median income across all ratings, indicating that coffee shops in this county tend to cater to a more economically modest population.

plot 3 In general, areas with higher median household incomes tend to have higher review counts. This is most noticeable in Fulton and Gwinnett Counties, where coffee shops in wealthier areas tend to get more reviews.

There appears to be some relationship between the proportion of White residents and higher median incomes, particularly in Fulton and Gwinnett Counties, are more likely to correspond with higher incomes and higher review counts.

Clayton County stands out with its lower household incomes and fewer reviews, paired with a higher concentration of Yellow (low proportion of White residents), pointing to a more diverse population with relatively low incomes and fewer coffee shop reviews.It demonstrates that in these counties, wealthier areas with higher proportions of White residents tend to have coffee shops with higher review counts. Conversely, areas with lower incomes and a more racially diverse population (such as Clayton County) typically have fewer reviews and lower household incomes.

plot 4 The scatterplots reveal that wealthier neighborhoods as indicated by higher median household income, tend to have more reviews for coffee shops. Similarly, areas with a higher proportion of White residents also see more reviews. In contrast, neighborhoods with higher poverty levels have fewer reviews. However, the total population size does not significantly affect the number of reviews. Overall, economic and racial factors seem to influence review counts more than population size.