DATA 608 Story 3

# Load required libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
library(viridis)
## Loading required package: viridisLite
library(scales)
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:viridis':
## 
##     viridis_pal
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
# Step 1: Load the datasets
obesity <- read_excel("/Users/zigcah/Downloads/adult obesity.xlsx")
healthcare <- read_csv("/Users/zigcah/Downloads/Health Care Expenditures by State of Residence.csv", skip = 2)
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## Rows: 62 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Location, Total Health Spending
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Step 2: Clean and prepare healthcare data
colnames(healthcare) <- c("State", "Total_Health_Spending")

healthcare <- healthcare %>%
  filter(State != "United States") %>%
  mutate(Total_Health_Spending = as.numeric(gsub("[$,]", "", Total_Health_Spending)))

# Step 3: Clean and prepare obesity data
obesity_clean <- obesity %>%
  select(State, `Obesity %`) %>%
  mutate(`Obesity %` = as.numeric(`Obesity %`))

# Step 4: Merge datasets
merged_data <- obesity_clean %>%
  inner_join(healthcare, by = "State")

# Step 5: Assign spending quintiles
merged_data <- merged_data %>%
  mutate(
    Spending_Quintile = ntile(Total_Health_Spending, 5),
    Spending_Quintile = factor(Spending_Quintile,
                                labels = c("Lowest 20%", "Second Lowest 20%",
                                           "Middle 20%", "Second Highest 20%", "Highest 20%"))
  )

# Step 6: Create a boxplot comparing obesity by spending quintile
ggplot(merged_data, aes(x = Spending_Quintile, y = `Obesity %`, fill = Spending_Quintile)) +
  geom_boxplot() +
  scale_fill_viridis_d(option = "D") +
  labs(
    title = "Adult Obesity Rates by Healthcare Spending Quintile",
    x = "Per Capita Healthcare Spending Quintile",
    y = "Obesity Prevalence (%)",
    fill = "Spending Quintile"
  ) +
  scale_y_continuous(labels = percent_format(accuracy = 1)) +
  theme_minimal(base_size = 13) +
  theme(legend.position = "none")