#	Research Question
Q1	How is funding distributed geographically across the states?
Q2	Are the top 20 most populated states also the top 20 most funded?
Q3	Did the bottom 20 populated states also receive the lowest funding?
Q4	Does mean per-capita funding differ between red and blue states?
Q5	Is there a statistically significant bias in per-capita funding by party?

Loading and Inspecting the Raw Datasets

# ---- Load Required Libraries ----

library(readxl)
library(dplyr)
library(ggplot2)
library(tidyr)
library(scales)
library(maps)
library(stringr)
library(knitr)
library(gridExtra)
library(maps)
library(mapproj)   # <-- add this line right after library(maps)

# ---- Load Raw Datasets ----
funding_raw  <- read_excel("Dataset_Funding.xlsx")
pop_raw      <- read_excel("Dataset_Population.xlsx")
votes_raw    <- read_excel("Votes_2020.xlsx")

cat("Funding dataset:", dim(funding_raw), "\n")

## Funding dataset: 57 2

cat("Population dataset:", dim(pop_raw), "\n")

## Population dataset: 66 43

cat("Votes dataset:", dim(votes_raw), "\n")

## Votes dataset: 58 7

Standardizing State Names for Seamless Merging

# ---- Clean Funding Dataset ----
# Rename Jurisdiction to 'State' and standardize to title case
funding <- funding_raw %>%
  rename(State = Jurisdiction) %>%
  mutate(State = str_to_title(State)) %>%
  # Fix the misspelling in the original data
  mutate(State = ifelse(State == "Deleware", "Delaware", State)) %>%
  # Filter to 50 states + DC (remove territories and tribal)
  filter(!State %in% c("American Samoa", "Guam",
                        "Northern Mariana Islands",
                        "Puerto Rico", "Us Virgin Islands",
                        "Tribal Communities"))

cat("Funding: Cleaned to", nrow(funding), "jurisdictions\n")

## Funding: Cleaned to 51 jurisdictions

head(funding, 5)

## # A tibble: 5 × 2
##   State      Total_funding
##   <chr>              <dbl>
## 1 Alabama              3  
## 2 Alaska               3.7
## 3 Arizona              3.5
## 4 Arkansas             2.8
## 5 California          18.4

Cleaning Population Data — Extracting State-Level Records

# ---- Clean Population Dataset ----
# Filter to state-level summaries only (SUMLEV == 40)
# Rename NAME to 'State' and select relevant columns
population <- pop_raw %>%
  filter(SUMLEV == 40) %>%
  rename(State = NAME) %>%
  select(State, Population = POPESTIMATE2020) %>%
  # Remove Puerto Rico to align with funding (50 states + DC)
  filter(State != "Puerto Rico")

cat("Population: Cleaned to", nrow(population), "jurisdictions\n")

## Population: Cleaned to 51 jurisdictions

head(population, 5)

## # A tibble: 5 × 2
##   State      Population
##   <chr>           <dbl>
## 1 Alabama       5032962
## 2 Alaska         732906
## 3 Arizona       7186647
## 4 Arkansas      3014399
## 5 California   39527808

Cleaning Votes Data — Mapping Abbreviations to Full State Names

# ---- Clean Votes Dataset ----
# The votes dataset has a multi-row header; extract and restructure
votes_clean <- votes_raw %>%
  slice(3:53) %>%  # Rows 3-53 contain the 50 states + DC
  setNames(c("State_Abbr", "Biden_EV", "Trump_EV",
             "Biden_Popular", "Trump_Popular",
             "Others_Popular", "Total_Vote")) %>%
  mutate(
    # Remove asterisks from ME* and NE*
    State_Abbr = str_remove(State_Abbr, "\\*"),
    Biden_EV = as.numeric(Biden_EV),
    Trump_EV = as.numeric(Trump_EV),
    Biden_Popular = as.numeric(Biden_Popular),
    Trump_Popular = as.numeric(Trump_Popular),
    Total_Vote = as.numeric(Total_Vote)
  )

Mapping State Abbreviations to Full Names

# ---- Create State Abbreviation to Full Name Mapping ----
state_lookup <- data.frame(
  State_Abbr = c("AL","AK","AZ","AR","CA","CO","CT","DE","DC","FL",
                 "GA","HI","ID","IL","IN","IA","KS","KY","LA","ME",
                 "MD","MA","MI","MN","MS","MO","MT","NE","NV","NH",
                 "NJ","NM","NY","NC","ND","OH","OK","OR","PA","RI",
                 "SC","SD","TN","TX","UT","VT","VA","WA","WV","WI","WY"),
  State = c("Alabama","Alaska","Arizona","Arkansas","California",
            "Colorado","Connecticut","Delaware",
            "District Of Columbia","Florida",
            "Georgia","Hawaii","Idaho","Illinois","Indiana",
            "Iowa","Kansas","Kentucky","Louisiana","Maine",
            "Maryland","Massachusetts","Michigan","Minnesota",
            "Mississippi","Missouri","Montana","Nebraska","Nevada",
            "New Hampshire","New Jersey","New Mexico","New York",
            "North Carolina","North Dakota","Ohio","Oklahoma",
            "Oregon","Pennsylvania","Rhode Island",
            "South Carolina","South Dakota","Tennessee","Texas",
            "Utah","Vermont","Virginia","Washington",
            "West Virginia","Wisconsin","Wyoming"),
  stringsAsFactors = FALSE
)

# Join to get full state names and determine winner
votes <- votes_clean %>%
  left_join(state_lookup, by = "State_Abbr") %>%
  mutate(
    Winner = case_when(
      !is.na(Biden_EV) & (is.na(Trump_EV) | Biden_EV > 0) ~ "Biden",
      !is.na(Trump_EV) & (is.na(Biden_EV) | Trump_EV > 0) ~ "Trump",
      TRUE ~ NA_character_
    ),
    Party = ifelse(Winner == "Biden", "Blue", "Red")
  ) %>%
  select(State, State_Abbr, Biden_EV, Trump_EV, Winner, Party,
         Biden_Popular, Trump_Popular, Total_Vote)

head(votes, 5)

## # A tibble: 5 × 9
##   State    State_Abbr Biden_EV Trump_EV Winner Party Biden_Popular Trump_Popular
##   <chr>    <chr>         <dbl>    <dbl> <chr>  <chr>         <dbl>         <dbl>
## 1 <NA>     <NA>             NA       NA <NA>   <NA>             NA            NA
## 2 Alabama  AL               NA        9 Trump  Red          849624       1441170
## 3 Alaska   AK               NA        3 Trump  Red          153778        189951
## 4 Arizona  AZ               11       NA Biden  Blue        1672143       1661686
## 5 Arkansas AR               NA        6 Trump  Red          423932        760647
## # ℹ 1 more variable: Total_Vote <dbl>

Merging Into a Master Analytical Dataset

# ---- Merge All Three Datasets ----
# Fix DC name mismatch between funding ("District Of Columbia")
# and population ("District of Columbia")
funding <- funding %>%
  mutate(State = ifelse(State == "District Of Columbia",
                        "District of Columbia", State))

votes <- votes %>%
  mutate(State = ifelse(State == "District Of Columbia",
                        "District of Columbia", State))

# Merge funding + population + votes
master <- funding %>%
  inner_join(population, by = "State") %>%
  inner_join(votes, by = "State") %>%
  mutate(
    # Convert funding from billions to dollars
    Funding_Dollars = Total_funding * 1e9,
    # Calculate per-capita funding
    Funding_Per_Capita = Funding_Dollars / Population
  )

cat("Master dataset:", nrow(master), "rows x", ncol(master), "cols\n")

## Master dataset: 50 rows x 13 cols

Master Dataset Preview — The Foundation of Our Analysis

master %>%
  select(State, Total_funding, Population, Party,
         Funding_Per_Capita) %>%
  arrange(desc(Total_funding)) %>%
  head(10) %>%
  kable(caption = "Top 10 States by Total IIJA Funding (Billions $)",
        digits = 2, format.args = list(big.mark = ","))

Top 10 States by Total IIJA Funding (Billions $)
State	Total_funding	Population	Party	Funding_Per_Capita
California	18.4	39,527,808	Blue	465.50
Texas	14.2	29,237,895	Red	485.67
New York	10.1	20,122,262	Blue	501.93
Illinois	8.4	12,795,348	Blue	656.49
Florida	8.2	21,591,325	Red	379.78
Pennsylvania	8.1	12,995,576	Blue	623.29
Ohio	6.6	11,798,534	Red	559.39
Michigan	5.2	10,072,294	Blue	516.27
New Jersey	5.1	9,270,476	Blue	550.13
Georgia	5.0	10,732,595	Blue	465.87

Q1: The Geographic Landscape | How Is IIJA Funding Distributed Across the States?

Q1: A Choropleth Reveals the Geographic Story of IIJA Dollars

# ---- Q1: Choropleth Map of Funding by State ----
# Prepare map data
us_map <- map_data("state")

# Create lowercase state names for joining with map data
map_funding <- master %>%
  mutate(region = tolower(State)) %>%
  select(region, Total_funding)

# Merge with map coordinates
map_merged <- us_map %>%
  left_join(map_funding, by = "region")

ggplot(map_merged, aes(x = long, y = lat, group = group,
                       fill = Total_funding)) +
  geom_polygon(color = "white", linewidth = 0.3) +
  scale_fill_gradient(low = "#FFF3E0", high = "#BF360C",
                      name = "Funding\n($ Billions)",
                      labels = dollar_format(suffix = "B"),
                      na.value = "grey85") +
  coord_map("albers", lat0 = 39, lat1 = 45) +
  labs(
    title = "IIJA Funding Distribution Across the Continental U.S.",
    subtitle = "Darker intensity = higher funding allocation",
    caption = "Source: White House IIJA Fact Sheets"
  ) +
  theme_void() +
  theme(
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    plot.subtitle = element_text(size = 11, hjust = 0.5, color = "gray40"),
    legend.position = "right"
  )

Q1 Insight: Funding Concentrates in a Few Large States

# Summary statistics
cat("Summary of Total Funding ($ Billions):\n")

## Summary of Total Funding ($ Billions):

summary(master$Total_funding)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.7518  1.8000  2.8500  3.7859  4.2250 18.4000

cat("\nStandard Deviation:", round(sd(master$Total_funding), 2), "B\n")

## 
## Standard Deviation: 3.35 B

cat("Coefficient of Variation:",
    round(sd(master$Total_funding) / mean(master$Total_funding) * 100, 1),
    "%\n")

## Coefficient of Variation: 88.5 %

# Top 5 vs Bottom 5
cat("\nTop 5 states receive $",
    sum(sort(master$Total_funding, decreasing = TRUE)[1:5]), "B\n")

## 
## Top 5 states receive $ 59.3 B

cat("Bottom 5 states receive $",
    sum(sort(master$Total_funding)[1:5]), "B\n")

## Bottom 5 states receive $ 4.4959 B

Q2: Population vs. Funding | Are the Top 20 Most Populated States Also the Top 20 Most Funded?

Q2: Mirrored Bar Chart — Population Rank vs. Funding Rank

# ---- Q2: Top 20 by Population vs Top 20 by Funding ----
top20_pop <- master %>%
  arrange(desc(Population)) %>%
  slice(1:20) %>%
  mutate(Pop_Rank = row_number())

top20_fund <- master %>%
  arrange(desc(Total_funding)) %>%
  slice(1:20) %>%
  mutate(Fund_Rank = row_number())

# Combine into one comparison dataset
top20_comparison <- top20_pop %>%
  select(State, Population, Pop_Rank) %>%
  full_join(
    top20_fund %>% select(State, Total_funding, Fund_Rank),
    by = "State"
  ) %>%
  mutate(
    In_Both = ifelse(!is.na(Pop_Rank) & !is.na(Fund_Rank),
                     "In Both Top 20", "In One List Only"),
    State_Label = State
  )

# Create ordered factor for display
top20_comparison <- top20_comparison %>%
  arrange(desc(Population)) %>%
  mutate(State_Label = factor(State_Label,
                              levels = rev(State_Label)))

# Side-by-side: Population (left) | Funding (right)
p1 <- ggplot(top20_comparison %>% filter(!is.na(Pop_Rank)),
             aes(x = Population / 1e6, y = State_Label,
                 fill = In_Both)) +
  geom_col(width = 0.7) +
  scale_fill_manual(values = c("In Both Top 20" = "#1565C0",
                               "In One List Only" = "#FF8F00")) +
  scale_x_continuous(labels = function(x) paste0(x, "M")) +
  labs(x = "Population (Millions)", y = NULL,
       title = "Top 20 by Population") +
  theme_minimal() +
  theme(legend.position = "bottom",
        plot.title = element_text(face = "bold", size = 11))

p2 <- ggplot(top20_comparison %>% filter(!is.na(Fund_Rank)) %>%
               mutate(State_Label = factor(State,
                        levels = levels(top20_comparison$State_Label))),
             aes(x = Total_funding, y = State_Label,
                 fill = In_Both)) +
  geom_col(width = 0.7) +
  scale_fill_manual(values = c("In Both Top 20" = "#1565C0",
                               "In One List Only" = "#FF8F00")) +
  scale_x_continuous(labels = dollar_format(suffix = "B")) +
  labs(x = "Funding ($ Billions)", y = NULL,
       title = "Top 20 by Funding") +
  theme_minimal() +
  theme(legend.position = "bottom",
        plot.title = element_text(face = "bold", size = 11))

grid.arrange(p1, p2, ncol = 2,
             top = grid::textGrob(
               "Q2: Do Population Giants Also Dominate Funding?",
               gp = grid::gpar(fontsize = 15, fontface = "bold")))

Q2: Overlap Analysis — Quantifying the Match

overlap_states <- intersect(top20_pop$State, top20_fund$State)
only_pop   <- setdiff(top20_pop$State, top20_fund$State)
only_fund  <- setdiff(top20_fund$State, top20_pop$State)

cat("States in BOTH Top 20 lists:", length(overlap_states), "out of 20\n")

## States in BOTH Top 20 lists: 17 out of 20

cat("Overlap:", paste(overlap_states, collapse = ", "), "\n\n")

## Overlap: California, Texas, Florida, New York, Pennsylvania, Illinois, Ohio, Georgia, North Carolina, Michigan, New Jersey, Virginia, Washington, Arizona, Massachusetts, Tennessee, Missouri

cat("In Top 20 Population ONLY:", paste(only_pop, collapse = ", "), "\n")

## In Top 20 Population ONLY: Indiana, Maryland, Wisconsin

cat("In Top 20 Funding ONLY:", paste(only_fund, collapse = ", "), "\n")

## In Top 20 Funding ONLY: Louisiana, Kentucky, Alaska

cat("\nOverlap Rate:", round(length(overlap_states)/20*100, 1), "%\n")

## 
## Overlap Rate: 85 %

Q3: The Bottom of the Distribution | Do the Least Populated States Receive the Least Funding?

Q3: Bottom 20 Population vs. Bottom 20 Funding — Who Falls Behind?

# ---- Q3: Bottom 20 by Population vs Bottom 20 by Funding ----
bottom20_pop <- master %>%
  arrange(Population) %>%
  slice(1:20) %>%
  mutate(Pop_Rank = row_number())

bottom20_fund <- master %>%
  arrange(Total_funding) %>%
  slice(1:20) %>%
  mutate(Fund_Rank = row_number())

# Combined comparison
bottom_combined <- bottom20_pop %>%
  select(State, Population, Pop_Rank) %>%
  full_join(
    bottom20_fund %>% select(State, Total_funding, Fund_Rank),
    by = "State"
  ) %>%
  mutate(
    Category = case_when(
      !is.na(Pop_Rank) & !is.na(Fund_Rank) ~ "Bottom 20 in Both",
      !is.na(Pop_Rank) & is.na(Fund_Rank)  ~ "Bottom 20 Pop Only",
      is.na(Pop_Rank) & !is.na(Fund_Rank)  ~ "Bottom 20 Funding Only"
    )
  )

# Dot plot for clearer comparison
bottom_long <- bottom_combined %>%
  arrange(Population) %>%
  mutate(State = factor(State, levels = rev(State)))

ggplot(bottom_long, aes(y = State, color = Category)) +
  geom_point(aes(x = "Population\nRank", size = Population / 1e6),
             alpha = 0.8) +
  geom_point(aes(x = "Funding\nRank", size = Total_funding),
             alpha = 0.8) +
  geom_segment(
    data = bottom_long %>% filter(Category == "Bottom 20 in Both"),
    aes(x = "Population\nRank", xend = "Funding\nRank",
        y = State, yend = State),
    linetype = "dashed", color = "grey50", linewidth = 0.3
  ) +
  scale_color_manual(values = c(
    "Bottom 20 in Both" = "#2E7D32",
    "Bottom 20 Pop Only" = "#E65100",
    "Bottom 20 Funding Only" = "#6A1B9A"
  )) +
  scale_size_continuous(range = c(2, 8), guide = "none") +
  labs(
    title = "Q3: Do the Smallest States Also Receive the Least Funding?",
    subtitle = "Comparing Bottom 20 states by population vs. Bottom 20 by funding",
    x = NULL, y = NULL
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 14),
    plot.subtitle = element_text(color = "gray40"),
    legend.position = "bottom"
  )

Q3: Dumbbell Plot — Direct Rank Comparison

# Create rank-based comparison for ALL states in either bottom 20
bottom_ranked <- bottom_combined %>%
  left_join(
    master %>%
      mutate(Pop_Overall_Rank = rank(Population),
             Fund_Overall_Rank = rank(Total_funding)) %>%
      select(State, Pop_Overall_Rank, Fund_Overall_Rank),
    by = "State"
  ) %>%
  arrange(Pop_Overall_Rank) %>%
  mutate(State = factor(State, levels = rev(State)))

ggplot(bottom_ranked, aes(y = State)) +
  geom_segment(aes(x = Pop_Overall_Rank, xend = Fund_Overall_Rank,
                   yend = State),
               color = "grey60", linewidth = 1) +
  geom_point(aes(x = Pop_Overall_Rank), color = "#1565C0",
             size = 3.5) +
  geom_point(aes(x = Fund_Overall_Rank), color = "#C62828",
             size = 3.5) +
  geom_vline(xintercept = 20.5, linetype = "dashed",
             color = "grey40", linewidth = 0.5) +
  annotate("text", x = 10, y = 0.5, label = "Bottom 20 Zone",
           color = "grey40", size = 3.5, fontface = "italic") +
  scale_x_continuous(breaks = seq(0, 51, 5)) +
  labs(
    title = "Q3: Population Rank vs. Funding Rank for Small States",
    subtitle = "Blue dot = Population rank | Red dot = Funding rank | Line = gap between ranks",
    x = "Overall Rank (1 = Smallest/Lowest)",
    y = NULL,
    caption = "Dashed line separates bottom 20 from the rest"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 14),
    plot.subtitle = element_text(color = "gray40", size = 10)
  )

Q3: Mismatch Summary

overlap_bottom <- intersect(bottom20_pop$State, bottom20_fund$State)
only_pop_b     <- setdiff(bottom20_pop$State, bottom20_fund$State)
only_fund_b    <- setdiff(bottom20_fund$State, bottom20_pop$State)

cat("States in BOTH Bottom 20 lists:", length(overlap_bottom), "\n")

## States in BOTH Bottom 20 lists: 16

cat("Overlap:", paste(overlap_bottom, collapse = ", "), "\n\n")

## Overlap: Vermont, District of Columbia, North Dakota, South Dakota, Delaware, Rhode Island, Maine, New Hampshire, Hawaii, West Virginia, Idaho, Nebraska, Kansas, Mississippi, Nevada, Iowa

cat("Bottom 20 Pop but NOT Bottom 20 Funding:\n  ",
    paste(only_pop_b, collapse = ", "), "\n")

## Bottom 20 Pop but NOT Bottom 20 Funding:
##    Alaska, Montana, New Mexico, Arkansas

cat("Bottom 20 Funding but NOT Bottom 20 Pop:\n  ",
    paste(only_fund_b, collapse = ", "), "\n")

## Bottom 20 Funding but NOT Bottom 20 Pop:
##    Utah, Oregon, South Carolina, Connecticut

Q4: The Political Lens | Does Mean Per-Capita Funding Differ Between Red and Blue States?

Q4: Box Plot — Per-Capita Funding by Political Affiliation

# ---- Q4: Per-Capita Funding by Party ----
ggplot(master, aes(x = Party, y = Funding_Per_Capita,
                   fill = Party)) +
  geom_boxplot(alpha = 0.7, width = 0.5, outlier.shape = 21,
               outlier.size = 3) +
  geom_jitter(width = 0.15, alpha = 0.5, size = 2, color = "grey30") +
  scale_fill_manual(values = c("Blue" = "#1565C0",
                               "Red" = "#C62828")) +
  scale_y_continuous(labels = dollar_format()) +
  labs(
    title = "Q4: Per-Capita IIJA Funding — Blue States vs. Red States",
    subtitle = "Each point represents one state; box shows median & IQR",
    x = "2020 Election Result",
    y = "Funding Per Capita ($)",
    caption = "Per capita = Total State Funding / 2020 Census Population"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 15),
    plot.subtitle = element_text(color = "gray40"),
    legend.position = "none"
  )

Q4: Statistical Comparison — Are the Means Different?

# Descriptive statistics by party
party_stats <- master %>%
  group_by(Party) %>%
  summarise(
    N = n(),
    Mean_Per_Capita = mean(Funding_Per_Capita),
    Median_Per_Capita = median(Funding_Per_Capita),
    SD_Per_Capita = sd(Funding_Per_Capita),
    Min = min(Funding_Per_Capita),
    Max = max(Funding_Per_Capita),
    .groups = "drop"
  )

kable(party_stats,
      caption = "Per-Capita Funding Summary by Party",
      digits = 2,
      format.args = list(big.mark = ","))

Per-Capita Funding Summary by Party
Party	N	Mean_Per_Capita	Median_Per_Capita	SD_Per_Capita	Min	Max
Blue	27	675.57	545.42	295.71	437.01	1,639.45
Red	23	1,052.69	648.88	1,075.30	379.78	5,048.40

# Welch's t-test
blue <- master %>% filter(Party == "Blue") %>%
  pull(Funding_Per_Capita)
red  <- master %>% filter(Party == "Red") %>%
  pull(Funding_Per_Capita)

t_result <- t.test(blue, red)
cat("\nWelch's Two-Sample t-test:\n")

## 
## Welch's Two-Sample t-test:

cat("  t =", round(t_result$statistic, 4), "\n")

##   t = -1.6303

cat("  p-value =", round(t_result$p.value, 4), "\n")

##   p-value = 0.1157

cat("  95% CI: [", round(t_result$conf.int[1], 2), ",",
    round(t_result$conf.int[2], 2), "]\n")

##   95% CI: [ -853.7 , 99.46 ]

cat("\nInterpretation at alpha = 0.05:",
    ifelse(t_result$p.value < 0.05,
           "SIGNIFICANT difference in per-capita funding.",
           "NO significant difference in per-capita funding."), "\n")

## 
## Interpretation at alpha = 0.05: NO significant difference in per-capita funding.

Q5: The Bias Question | Is Per-Capita Funding Biased Toward Blue States?

Q5: Side-by-Side Per-Capita Funding — Every State Labeled

# ---- Q5: Per-Capita Funding Comparison ----
blue_states <- master %>%
  filter(Party == "Blue") %>%
  arrange(desc(Funding_Per_Capita)) %>%
  mutate(Rank = row_number())

red_states <- master %>%
  filter(Party == "Red") %>%
  arrange(desc(Funding_Per_Capita)) %>%
  mutate(Rank = row_number())

p_blue <- ggplot(blue_states,
                 aes(x = reorder(State_Abbr, Funding_Per_Capita),
                     y = Funding_Per_Capita)) +
  geom_col(fill = "#1565C0", alpha = 0.85, width = 0.7) +
  geom_hline(yintercept = mean(blue_states$Funding_Per_Capita),
             linetype = "dashed", color = "#0D47A1", linewidth = 0.7) +
  annotate("text",
           x = 1, y = mean(blue_states$Funding_Per_Capita) + 200,
           label = paste0("Mean: $",
                          round(mean(blue_states$Funding_Per_Capita))),
           color = "#0D47A1", size = 3, hjust = 0) +
 scale_y_continuous(labels = dollar_format(), limits = c(0, 5000)) +
  coord_flip() +
  labs(title = "Blue States (Biden 2020)",
       x = NULL, y = "Funding Per Capita ($)") +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold", size = 12,
                                  color = "#1565C0"))

p_red <- ggplot(red_states,
                aes(x = reorder(State_Abbr, Funding_Per_Capita),
                    y = Funding_Per_Capita)) +
  geom_col(fill = "#C62828", alpha = 0.85, width = 0.7) +
  geom_hline(yintercept = mean(red_states$Funding_Per_Capita),
             linetype = "dashed", color = "#B71C1C", linewidth = 0.7) +
  annotate("text",
           x = 1, y = mean(red_states$Funding_Per_Capita) + 200,
           label = paste0("Mean: $",
                          round(mean(red_states$Funding_Per_Capita))),
           color = "#B71C1C", size = 3, hjust = 0) +
  scale_y_continuous(labels = dollar_format(), limits = c(0, 5000)) +
  coord_flip() +
  labs(title = "Red States (Trump 2020)",
       x = NULL, y = "Funding Per Capita ($)") +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold", size = 12,
                                  color = "#C62828"))

grid.arrange(p_blue, p_red, ncol = 2,
             top = grid::textGrob(
               "Q5: Per-Capita Funding — Blue States vs. Red States",
               gp = grid::gpar(fontsize = 15, fontface = "bold")))

Q5: Density Overlay — Distribution Shapes Tell the Story

ggplot(master, aes(x = Funding_Per_Capita, fill = Party,
                   color = Party)) +
  geom_density(alpha = 0.35, linewidth = 1) +
  geom_vline(data = master %>% group_by(Party) %>%
               summarise(Mean = mean(Funding_Per_Capita)),
             aes(xintercept = Mean, color = Party),
             linetype = "dashed", linewidth = 1) +
  scale_fill_manual(values = c("Blue" = "#1565C0",
                               "Red" = "#C62828")) +
  scale_color_manual(values = c("Blue" = "#1565C0",
                                "Red" = "#C62828")) +
  scale_x_continuous(labels = dollar_format()) +
  labs(
    title = "Q5: Per-Capita Funding Density — Overlapping Distributions",
    subtitle = "Dashed lines show group means; substantial overlap suggests limited bias",
    x = "Funding Per Capita ($)", y = "Density"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 14),
    plot.subtitle = element_text(color = "gray40"),
    legend.position = "top"
  )

Q5: Wilcoxon Rank-Sum Test — A Nonparametric Check

# Non-parametric alternative (robust to outliers/skewness)
w_result <- wilcox.test(blue, red)

cat("Wilcoxon Rank-Sum (Mann-Whitney U) Test:\n")

## Wilcoxon Rank-Sum (Mann-Whitney U) Test:

cat("  W =", w_result$statistic, "\n")

##   W = 249

cat("  p-value =", round(w_result$p.value, 4), "\n")

##   p-value = 0.2374

cat("\nInterpretation at α = 0.05:",
    ifelse(w_result$p.value < 0.05,
           "SIGNIFICANT difference — evidence of distributional shift.",
           "NO significant difference — insufficient evidence of bias."),
    "\n")

## 
## Interpretation at α = 0.05: NO significant difference — insufficient evidence of bias.

# Effect size (Cohen's d)
cohens_d <- (mean(blue) - mean(red)) /
  sqrt((sd(blue)^2 + sd(red)^2) / 2)
cat("\nCohen's d (effect size):", round(cohens_d, 3), "\n")

## 
## Cohen's d (effect size): -0.478

cat("Interpretation:",
    case_when(
      abs(cohens_d) < 0.2 ~ "Negligible effect",
      abs(cohens_d) < 0.5 ~ "Small effect",
      abs(cohens_d) < 0.8 ~ "Medium effect",
      TRUE                ~ "Large effect"
    ), "\n")

## Interpretation: Small effect

Conclusions | What the Data Tells Us

Key Findings — The Equity and Bias Story

Goal 1 — Is funding equitable?

Raw funding is heavily concentrated in large states (CA, TX, NY, FL)
This mirrors population: larger states legitimately need more infrastructure dollars
Per-capita analysis normalizes this, revealing a more nuanced picture
Small states like Alaska and Wyoming receive disproportionately high per-capita allocations (likely due to formula-based highway funding minimums)

Goal 2 — Is there political bias?

Visual comparison of blue vs. red per-capita funding shows overlapping distributions
Statistical tests (t-test & Wilcoxon) provide the evidence-based verdict
The formula-driven nature of most IIJA highway funding limits discretionary political targeting
Outlier states (e.g., DC, Alaska) may skew group means but don’t indicate systematic bias

Limitations & Future Directions

Limitations:

Funding data represents announced allocations, not disbursed amounts
Analysis uses 2020 Census population; population shifts since 2020 may affect per-capita calculations
“Red” vs. “Blue” is a binary simplification of a political spectrum
DC is classified as “Blue” but has unique federal characteristics

Future Research:

Analyze competitive vs. formula-based funding separately
Investigate funding per road-mile or per structurally deficient bridge
Compare swing-state funding patterns to safe-state patterns
Longitudinal analysis as more funding is disbursed through 2026

Is IIJA Funding Equitable? A Data-Driven Investigation

Analyzing the Infrastructure Investment and Jobs Act Across 50 States

Agenda & Research Questions

Data Preparation | Loading, Cleaning & Merging Three Datasets

Loading and Inspecting the Raw Datasets

Standardizing State Names for Seamless Merging

Cleaning Population Data — Extracting State-Level Records

Cleaning Votes Data — Mapping Abbreviations to Full State Names

Mapping State Abbreviations to Full Names

Merging Into a Master Analytical Dataset

Master Dataset Preview — The Foundation of Our Analysis

Q1: The Geographic Landscape | How Is IIJA Funding Distributed Across the States?

Q1: A Choropleth Reveals the Geographic Story of IIJA Dollars

Q1 Insight: Funding Concentrates in a Few Large States

Q2: Population vs. Funding | Are the Top 20 Most Populated States Also the Top 20 Most Funded?

Q2: Mirrored Bar Chart — Population Rank vs. Funding Rank

Q2: Overlap Analysis — Quantifying the Match

Q3: The Bottom of the Distribution | Do the Least Populated States Receive the Least Funding?

Q3: Bottom 20 Population vs. Bottom 20 Funding — Who Falls Behind?

Q3: Dumbbell Plot — Direct Rank Comparison

Q3: Mismatch Summary

Q4: The Political Lens | Does Mean Per-Capita Funding Differ Between Red and Blue States?

Q4: Box Plot — Per-Capita Funding by Political Affiliation

Q4: Statistical Comparison — Are the Means Different?

Q5: The Bias Question | Is Per-Capita Funding Biased Toward Blue States?

Q5: Side-by-Side Per-Capita Funding — Every State Labeled

Q5: Density Overlay — Distribution Shapes Tell the Story

Q5: Wilcoxon Rank-Sum Test — A Nonparametric Check

Conclusions | What the Data Tells Us

Key Findings — The Equity and Bias Story

Limitations & Future Directions