Complete Statistical Project

Author

Eric Mulaa

# Load necessary packages 
library(tidyverse)
library(dplyr)
library(skimr)      
library(naniar)      
library(lubridate)  
library(flextable)  
library(gt)         
library(janitor)    
library(sf)          
library(leaflet)     
library(scales)
library(maps) 
library(viridis)
theme_set(theme_minimal())

1 Data dictionary & Exploratory Data Analysis

1.1 Loading the gun violence dataset and the us census state level

# Load data (use readr::read_csv for better parsing)
library(readr)
gun_violence_data <- read_csv("~/Final Project STA 518/gun_violence_geo.csv", show_col_types = FALSE) |>
  clean_names()
census_data <- read_csv("https://raw.githubusercontent.com/dilernia/STA418-518/main/Data/census_data_state_2008-2023.csv", show_col_types = FALSE) |>
  clean_names()

# Quick check of column names and types
glimpse(gun_violence_data)
Rows: 389,850
Columns: 16
$ geoid                    <chr> "26049", "22105", "06065", "48439", "22079", …
$ incident_id              <dbl> 2803597, 2799267, 2797925, 2795377, 2795134, …
$ date                     <dttm> 2023-12-31, 2023-12-31, 2023-12-31, 2023-12-…
$ state                    <chr> "Michigan", "Louisiana", "California", "Texas…
$ city                     <chr> "Flint", "Amite", "Perris", "Fort Worth", "Al…
$ county                   <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ business_or_school       <chr> NA, NA, NA, NA, NA, NA, "Point Woronzof Overl…
$ address                  <chr> "Balsam Lane", "300 block of S First St", "17…
$ latitude                 <dbl> 43.0647, 30.7448, 33.8029, 32.7058, 31.3183, …
$ longitude                <dbl> -83.7073, -90.4106, -117.2150, -97.2322, -92.…
$ number_victims_killed    <dbl> 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, …
$ number_victims_injured   <dbl> 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, …
$ number_suspects_killed   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ number_suspects_injured  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ number_suspects_arrested <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ incident_characteristics <chr> "Shot - Dead (murder, accidental, or suicide)…
glimpse(census_data)
Rows: 780
Columns: 26
$ geoid                    <chr> "01", "02", "04", "05", "06", "08", "09", "10…
$ county_state             <chr> "Alabama", "Alaska", "Arizona", "Arkansas", "…
$ year                     <dbl> 2008, 2008, 2008, 2008, 2008, 2008, 2008, 200…
$ population               <dbl> 4661900, 686293, 6500180, 2855390, 36756666, …
$ median_income            <dbl> 42666, 68460, 50958, 38815, 61021, 56993, 685…
$ median_monthly_rent_cost <dbl> 631, 949, 866, 606, 1135, 848, 970, 917, 1011…
$ median_monthly_home_cost <dbl> 742, 1393, 1177, 660, 1919, 1364, 1722, 1209,…
$ prop_female              <dbl> 0.5154499, 0.4789995, 0.4986763, 0.5102035, 0…
$ prop_male                <dbl> 0.4845501, 0.5210005, 0.5013237, 0.4897965, 0…
$ prop_white               <dbl> 0.7028360, 0.6911290, 0.8005912, 0.7871510, 0…
$ prop_black               <dbl> 0.261833802, 0.036257109, 0.036269457, 0.1550…
$ prop_native              <dbl> 0.005140822, 0.127259057, 0.044114009, 0.0052…
$ prop_asian               <dbl> 0.009768335, 0.046373779, 0.023755650, 0.0103…
$ prop_hawaiin_islander    <dbl> 2.417469e-04, 5.484538e-03, 1.436883e-03, 6.7…
$ prop_other_race          <dbl> 0.007177117, 0.012462607, 0.068086884, 0.0218…
$ prop_multi_racial        <dbl> 0.013002209, 0.081033902, 0.025745902, 0.0197…
$ prop_highschool          <dbl> 0.2604227, 0.2183097, 0.2106347, 0.2958009, 0…
$ prop_ged                 <dbl> 0.05599385, 0.04658968, 0.03935913, 0.0582695…
$ prop_some_college        <dbl> 0.05957806, 0.08116479, 0.07719126, 0.0687879…
$ prop_college_no_degree   <dbl> 0.1546491, 0.2168623, 0.1816943, 0.1534883, 0…
$ prop_associates          <dbl> 0.06831511, 0.08030577, 0.07837909, 0.0558112…
$ prop_bachelors           <dbl> 0.1427926, 0.1758716, 0.1590928, 0.1251352, 0…
$ prop_masters             <dbl> 0.05542474, 0.06465035, 0.06474227, 0.0433444…
$ prop_professional        <dbl> 0.01338380, 0.01743700, 0.01623786, 0.0121880…
$ prop_doctoral            <dbl> 0.008333932, 0.015125876, 0.010927963, 0.0072…
$ prop_poverty             <dbl> 0.15695141, 0.08420041, 0.14718370, 0.1732244…

We will be using the census data and later merge with gun violence data to create a summary statistics table

1.2 Data dictionary for our main data set, Census Data

dictionary <- tibble(
  variable = names(census_data),
  type = map_chr(census_data, ~class(.x)[1]),
  description = c(
    "geographic region ID with the first 2 digits being the state code and the last 3 digits the county FIPS code",
    "geographic region",
    "year",
    "population",
    "median income in dollars",
    "median monthly rent costs for renters in dollars",
    "median monthly housing costs for homeowners in dollars",
    "proportion of people who are female",
    "proportion of people who are male",
    "proportion of people who are white alone",
    "proportion of people who are black or African American alone",
    "proportion of people who are American Indian and Alaska Native alone",
    "proportion of people who are Asian alone",
    "proportion of people who are Native Hawaiian and Other Pacific Islander alone",
    "proportion of people who are some other race alone",
    "proportion of people who are two or more races",
    "proportion of people 25 and older whose highest education-level is high school",
    "proportion of people 25 and older whose highest education-level is a GED",
    "proportion of people 25 and older whose highest education-level is some, but less than 1 year of college",
    "proportion of people 25 and older whose highest education-level is greater than 1 year of college but no degree",
    "proportion of people 25 and older whose highest education-level is an Associates degree",
    "proportion of people 25 and older whose highest education-level is a Bachelors degree",
    "proportion of people 25 and older whose highest education-level is a Masters degree",
    "proportion of people 25 and older whose highest education-level is a Professional degree",
    "proportion of people 25 and older whose highest education-level is a Doctoral degree",
    "proportion of people 25 and older living in poverty, defined by the Census Bureau as having an income below the poverty threshold for their family size."
  )
)

if(nrow(dictionary) != ncol(census_data)) {
  stop("Data dictionary length does not match number of columns in census_data")
}

dictionary |>
  gt() |>
  tab_header(title = "Data dictionary: census_data")
Data dictionary: census_data
variable type description
geoid character geographic region ID with the first 2 digits being the state code and the last 3 digits the county FIPS code
county_state character geographic region
year numeric year
population numeric population
median_income numeric median income in dollars
median_monthly_rent_cost numeric median monthly rent costs for renters in dollars
median_monthly_home_cost numeric median monthly housing costs for homeowners in dollars
prop_female numeric proportion of people who are female
prop_male numeric proportion of people who are male
prop_white numeric proportion of people who are white alone
prop_black numeric proportion of people who are black or African American alone
prop_native numeric proportion of people who are American Indian and Alaska Native alone
prop_asian numeric proportion of people who are Asian alone
prop_hawaiin_islander numeric proportion of people who are Native Hawaiian and Other Pacific Islander alone
prop_other_race numeric proportion of people who are some other race alone
prop_multi_racial numeric proportion of people who are two or more races
prop_highschool numeric proportion of people 25 and older whose highest education-level is high school
prop_ged numeric proportion of people 25 and older whose highest education-level is a GED
prop_some_college numeric proportion of people 25 and older whose highest education-level is some, but less than 1 year of college
prop_college_no_degree numeric proportion of people 25 and older whose highest education-level is greater than 1 year of college but no degree
prop_associates numeric proportion of people 25 and older whose highest education-level is an Associates degree
prop_bachelors numeric proportion of people 25 and older whose highest education-level is a Bachelors degree
prop_masters numeric proportion of people 25 and older whose highest education-level is a Masters degree
prop_professional numeric proportion of people 25 and older whose highest education-level is a Professional degree
prop_doctoral numeric proportion of people 25 and older whose highest education-level is a Doctoral degree
prop_poverty numeric proportion of people 25 and older living in poverty, defined by the Census Bureau as having an income below the poverty threshold for their family size.

1.3 Describing Patterns of missingness for census data

#using naniar package to check visualize missingness
n_miss(census_data)
[1] 0
#Visualizing missingness
gg_miss_var(census_data)

From the visualization above, there is no missing values

1.4 Quickly visualizing our datasets

Census Data

skim(census_data)
Data summary
Name census_data
Number of rows 780
Number of columns 26
_______________________
Column type frequency:
character 2
numeric 24
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
geoid 0 1 2 2 0 52 0
county_state 0 1 4 20 0 52 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
year 0 1 2015.20 4.61 2008.00 2011.00 2015.00 2019.00 2023.00 ▇▆▆▃▆
population 0 1 6229692.05 7025083.48 532668.00 1786156.75 4257700.00 7053886.75 39557045.00 ▇▂▁▁▁
median_income 0 1 58358.78 14075.73 18314.00 48362.75 56484.50 67147.25 108210.00 ▁▇▇▂▁
median_monthly_rent_cost 0 1 945.44 269.03 419.00 749.50 884.00 1092.25 1992.00 ▃▇▃▁▁
median_monthly_home_cost 0 1 1109.16 371.59 241.00 850.00 1020.00 1353.00 2561.00 ▁▇▃▂▁
prop_female 0 1 0.51 0.01 0.47 0.50 0.51 0.51 0.53 ▁▂▇▇▁
prop_male 0 1 0.49 0.01 0.47 0.49 0.49 0.50 0.53 ▁▇▇▂▁
prop_white 0 1 0.75 0.15 0.22 0.67 0.77 0.85 0.96 ▁▁▃▇▇
prop_black 0 1 0.11 0.11 0.00 0.03 0.07 0.15 0.53 ▇▃▂▁▁
prop_native 0 1 0.02 0.03 0.00 0.00 0.00 0.01 0.16 ▇▁▁▁▁
prop_asian 0 1 0.04 0.05 0.00 0.01 0.03 0.04 0.39 ▇▁▁▁▁
prop_hawaiin_islander 0 1 0.00 0.01 0.00 0.00 0.00 0.00 0.11 ▇▁▁▁▁
prop_other_race 0 1 0.04 0.04 0.00 0.01 0.02 0.05 0.32 ▇▁▁▁▁
prop_multi_racial 0 1 0.05 0.05 0.01 0.02 0.03 0.05 0.39 ▇▁▁▁▁
prop_highschool 0 1 0.24 0.04 0.10 0.22 0.24 0.26 0.35 ▁▂▇▆▁
prop_ged 0 1 0.04 0.01 0.02 0.04 0.04 0.05 0.07 ▃▇▆▃▁
prop_some_college 0 1 0.06 0.01 0.01 0.06 0.06 0.07 0.09 ▁▁▅▇▂
prop_college_no_degree 0 1 0.15 0.02 0.08 0.13 0.14 0.16 0.22 ▁▅▇▃▁
prop_associates 0 1 0.09 0.02 0.03 0.08 0.08 0.10 0.15 ▁▃▇▂▁
prop_bachelors 0 1 0.19 0.03 0.10 0.17 0.19 0.21 0.29 ▁▅▇▃▁
prop_masters 0 1 0.08 0.03 0.04 0.06 0.08 0.09 0.24 ▇▆▁▁▁
prop_professional 0 1 0.02 0.01 0.01 0.02 0.02 0.02 0.10 ▇▁▁▁▁
prop_doctoral 0 1 0.01 0.01 0.01 0.01 0.01 0.02 0.05 ▇▃▁▁▁
prop_poverty 0 1 0.14 0.05 0.07 0.11 0.13 0.16 0.46 ▇▃▁▁▁

Gun Violence Data

skim(gun_violence_data)
Data summary
Name gun_violence_data
Number of rows 389850
Number of columns 16
_______________________
Column type frequency:
character 7
numeric 8
POSIXct 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
geoid 46 1.00 5 5 0 2956 0
state 0 1.00 4 20 0 51 0
city 761 1.00 3 27 0 11548 0
county 389065 0.00 1 22 0 459 0
business_or_school 316795 0.19 1 85 0 47476 0
address 9665 0.98 3 61 0 325895 0
incident_characteristics 102 1.00 8 828 0 10939 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
incident_id 0 1 1468977.53 796698.22 92114.00 756778.50 1529846.00 2165292.50 2976478.00 ▇▆▇▇▅
latitude 2 1 37.33 4.82 19.03 33.76 38.31 41.07 71.34 ▁▇▅▁▁
longitude 2 1 -89.25 13.45 -171.74 -93.77 -86.77 -80.20 -67.04 ▁▁▂▃▇
number_victims_killed 0 1 0.37 0.56 0.00 0.00 0.00 1.00 60.00 ▇▁▁▁▁
number_victims_injured 0 1 0.78 1.06 0.00 0.00 1.00 1.00 439.00 ▇▁▁▁▁
number_suspects_killed 0 1 0.06 0.24 0.00 0.00 0.00 0.00 4.00 ▇▁▁▁▁
number_suspects_injured 0 1 0.05 0.23 0.00 0.00 0.00 0.00 5.00 ▇▁▁▁▁
number_suspects_arrested 0 1 0.00 0.00 0.00 0.00 0.00 0.00 0.00 ▁▁▇▁▁

Variable type: POSIXct

skim_variable n_missing complete_rate min max median n_unique
date 0 1 2014-01-01 2023-12-31 2019-09-26 3652

We have missing values especially onbusiness_or_schoolandcountycolumns but we will not be using those columns in our summary tables.

We convert the date column to a datetime object and extracting year and month

gun_violence_data <- gun_violence_data |>
  mutate(new_date = parse_date_time(date, orders = c("ymd HMS", "ymd", "mdy HMS", "mdy")),
         year = year(new_date),
         month = month(new_date, label = TRUE, abbr = TRUE))

2 Merging Datasets

We will merge gun violence data set to the census data set and create a summary statistics table that will help us understand the correlation between poverty rate and gun incidences see

2.1 Yearly gun incidents summarized data

yearly_gun_incidence <- gun_violence_data |>
  group_by(year) |>
  summarise(gun_incidence = n())
yearly_gun_incidence |>
flextable()

year

gun_incidence

2,014

27,091

2,015

32,468

2,016

36,636

2,017

37,508

2,018

34,564

2,019

36,290

2,020

46,690

2,021

48,474

2,022

46,506

2,023

43,623

2.2 Census yearly proportion of poverty summarized table

# Population average proportion in poverty 
yearly_prop_poverty <- census_data |>
  group_by(year) |>
  summarise(avg_prop_poverty = sum(prop_poverty * population, na.rm = TRUE) / sum(population, na.rm = TRUE))

# Lets calculate the national total population
national_pop <- census_data |>
  group_by(year) |>
  summarise(total_pop = sum(population, na.rm=TRUE))
yearly_prop_poverty |>
flextable()
Error: bkm [prop poverty table] should only contain alphanumeric characters, ':', '-' and '_'.
national_pop |>
  flextable()
Error: bkm [national_pop table] should only contain alphanumeric characters, ':', '-' and '_'.

2.3 Combining the two summary statistics table

# joining the gun data to both and show both results
combined_yearly_summary <- yearly_gun_incidence |>
  inner_join(national_pop, by = "year") |>
  mutate(inc_per_100k = gun_incidence / total_pop * 100000) |> # aggregating incidents per 100k
  inner_join(yearly_prop_poverty, by = "year")

We use inner_join on year so that only years present in both datasets are included — this avoids comparing years with no gun-incident records or missing socio-demographic information.

combined_yearly_summary |>
  flextable()

year

gun_incidence

total_pop

inc_per_100k

avg_prop_poverty

2,014

27,091

322,405,453

8.402774

0.1584288

2,015

32,468

324,893,003

9.993444

0.1505759

2,016

36,636

326,538,822

11.219493

0.1435402

2,017

37,508

329,056,355

11.398655

0.1371734

2,018

34,564

330,362,592

10.462444

0.1340370

2,019

36,290

331,433,217

10.949415

0.1263793

2,021

48,474

335,157,329

14.463058

0.1303997

2,022

46,506

336,509,351

13.820121

0.1286181

2,023

43,623

338,120,587

12.901610

0.1272208

2.4 Correlation on per-100k vs average proportion poverty

# correlation on per-100k vs average prop poverty
cor_val <- cor(combined_yearly_summary$inc_per_100k, combined_yearly_summary$avg_prop_poverty, use = "complete.obs")
round(cor_val, 2)
[1] -0.77

2.5 Correlation Visualization using a scatter plot

ggplot(combined_yearly_summary, aes(x = avg_prop_poverty, y = inc_per_100k)) +
  geom_point() + geom_smooth(method = "lm", se = FALSE) +
  labs(title = paste0("Correlation (r = ", round(cor_val,2), ")"),
       x = "Average poverty proportion", y = "Incidents per 100k")

We observe a strong, negative correlation (r = -0.77) between the national proportion of poverty and the national rate of gun incidents per 100k population. This indicates that on years where the average proportion of poverty is higher, the total national gun incident counts per capita are lower.

3 Data Visualization

state-level summary from census_data for year 2023

# Lets begin by preparing  state-level summary from census_data for year 2023
census_data_2023 <- census_data |>
  mutate(state = str_to_lower(county_state)) |>
  filter(year == 2023) |>
  select(-county_state)

checking for states with top population

# Get vector of top 5 states by population
top_states <- census_data |>
  group_by(county_state) |>
  summarise(total_pop = sum(population), .groups = "drop") |>
  arrange(desc(total_pop)) |>
  slice_head(n = 5) |>
  pull(county_state)

top_states
[1] "California" "Texas"      "Florida"    "New York"   "Illinois"  

3.1 Chloropleth: Visualization for median income in the year 2023 across states

# Lets get state polygons and standardize names
us_states_sf <- st_as_sf(map("state", plot = FALSE, fill = TRUE)) |>
  rename(state = ID) |>
  mutate(state = str_to_lower(state))

# Left join on standardized state names
map_data_sf <- us_states_sf |>
  left_join(census_data_2023, by = "state")

pal <- colorNumeric(palette = "YlGnBu", domain = map_data_sf$median_income, na.color = "transparent")

leaflet(map_data_sf) |>
  addProviderTiles("CartoDB.Positron") |>
  addPolygons(fillColor = ~pal(median_income), weight = 1, color = "white", fillOpacity = 0.8,
              label = ~paste0(str_to_title(state), ": ", scales::dollar(median_income))) |>
  addLegend("bottomright", pal = pal, values = ~median_income, title = "Median income (2023)")

From the leaflet map, we can see states like Missisipi and West Virginia have the lowest median income, while states like Carlifonia, Massachusets and New Hampshire record high median incomes.`

3.2 Histogram: Distribution of median income in 2023 across states

ggplot(census_data_2023, aes(x = median_income)) +
  geom_histogram(fill = viridis(1), color = "white", alpha = 0.8) +
  labs(title = "Distribution of State Median Incomes",
       x = "Median Income ($)", y = "Count of States") +
  scale_x_continuous(labels = scales::dollar) +
  theme_minimal()

The histogram reveals that most U.S. states have median household incomes clustered between $60,000 and $80,000, with a peak near $75,000. Distribution appears to be right skewed.

3.3 Scatter plot: Visualization of poverty rate vs bachelors degree

ggplot(census_data_2023, aes(x = prop_bachelors, y = prop_poverty)) +
  geom_point(aes(color = median_income), alpha = 0.7) +
  geom_smooth(method = "lm", se = TRUE, color = "red") +
  labs(title = "Poverty Rate vs. Proportion with Bachelor's Degrees",
       x = "Proportion with Bachelor's Degree",
       y = "Proportion in Poverty",
       color = "Median Income (USD)",
       caption = "Data Source: census_data (years 2008-2023)") +
  scale_x_continuous(labels = scales::percent) +
  scale_y_continuous(labels = scales::percent) +
  scale_color_viridis_c(labels = scales::dollar) +
  theme_minimal()

3.4 Stacked Bar Chart: Racial composition stacked bar chart for Top 5 populated states

# Lets reshape race data
race_data_long <- census_data |>
  select(county_state, prop_white, prop_asian, prop_multi_racial, prop_black, prop_hawaiin_islander) |>
  pivot_longer(cols = starts_with("prop_"), names_to = "race", values_to = "proportion") |>
  mutate(race = gsub("prop_", "", race))

# Pick top states
top_states <- c("California","Texas","Florida","New York" ,"Illinois")

race_data_long <- race_data_long |>
  mutate(county_state = factor(county_state, levels = top_states))


ggplot(filter(race_data_long, county_state %in% top_states),
       aes(x = county_state, y = proportion, fill = race)) +
  geom_bar(stat = "identity", position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  labs(title = "Racial Composition of Top 5 States",
       x = "State", y = "Proportion of Population", fill = "Race/Ethnicity") +
  scale_fill_brewer(palette = "Set2") +
  theme_minimal()

3.5 Time series

national_avg_income <- census_data |>
  group_by(year) |>
  summarise(avg_median_income = mean(median_income, na.rm = TRUE),
            total_population = sum(population, na.rm = TRUE))

ggplot(national_avg_income, aes(x = year, y = avg_median_income)) +
  geom_line(size = 1) + geom_point(size = 2) +
  scale_y_continuous(labels = scales::dollar) +
  labs(title = "Average Median Income Over Time (US)", x = "Year", y = "Average Median Income (USD)",
       caption = "Data source: census_data aggregated to national level") +
  theme_minimal()

4 Monte Carlo / Permutation Test

4.1 Hypothesis Question

Do U.S. states with higher proportions of residents holding a bachelor’s degree have different median household incomes than states with lower proportions of bachelor’s degrees?

Test chosen: Two‑sample permutation test comparing the mean median_income between two groups of states defined by prop_bachelors (above vs below the median proportion).

Hypotheses * Null hypothesis (H0): The mean median household income is the same for states with high and low proportions of bachelor’s degrees; any observed difference is due to random assignment of the state labels. * Alternative hypothesis (Ha): The mean median household income differs between the two groups (two‑sided).

4.2 Code implementation

# Lets start by creating a binary grouping variable of HighEdu = prop_bachelors > median(prop_bachelors)
set.seed(2025)  # Lets set the seed for reproducibility
census_high_edu <- census_data_2023 |>
  mutate(
    high_bachelors = ifelse(prop_bachelors > median(prop_bachelors, na.rm = TRUE), "High", "Low")
  ) |>
  filter(!is.na(median_income), !is.na(high_bachelors))

# Lets calculate the observed difference in group means (High - Low)
obs_diff <- census_high_edu |>
  group_by(high_bachelors) |>
  summarize(mean_income = mean(median_income)) |>
  arrange(high_bachelors) |>
  summarize(diff = mean_income[high_bachelors == "High"] - mean_income[high_bachelors == "Low"]) %>%
  pull(diff)

# Lets do start our permutation
n_perm <- 100000
perm_diffs <- numeric(n_perm)

for (i in seq_len(n_perm)) {
  perm_labels <- sample(census_high_edu$high_bachelors)            
  perm_diffs[i] <- with(census_high_edu, 
                        mean(median_income[perm_labels == "High"]) -
                        mean(median_income[perm_labels == "Low"]))
}


p_value <- mean(abs(perm_diffs) >= abs(obs_diff))

lower_cut <- quantile(perm_diffs, 0.025)
upper_cut <- quantile(perm_diffs, 0.975)


cat("Observed difference (High - Low):", round(obs_diff, 2), "\n")
Observed difference (High - Low): 18651.81 
cat("Permutation p-value (two-sided):", signif(p_value, 3), "\n")
Permutation p-value (two-sided): 0 
cat("95% null cutoff interval:", round(lower_cut,2), "to", round(upper_cut,2), "\n")
95% null cutoff interval: -7879.62 to 7934.9 

Since our p_value < 0.05 we reject H0 and conclude that there is evidence that mean median household income differs between states with high vs low bachelor proportions.

4.3 ggplot visualization for the null distribution

# Plotting null distribution 
perm_df <- data.frame(perm_diff = perm_diffs)

ggplot(perm_df, aes(x = perm_diff)) +
  geom_histogram(binwidth = diff(range(perm_diffs))/60, color = "black", fill = "lightblue") +
  geom_vline(xintercept = obs_diff, color = "red", size = 1, linetype = "solid") +
  geom_vline(xintercept = lower_cut, color = "darkgreen", size = 1, linetype = "dashed") +
  geom_vline(xintercept = upper_cut, color = "darkgreen", size = 1, linetype = "dashed") +
  labs(
    title = "Permutation Null Distribution of Difference in Mean Median Income",
    subtitle = "Group labels: High vs Low proportion of bachelor degrees",
    x = "Difference in mean median_income (High - Low)",
    y = "Count"
  ) +
  theme_minimal(base_size = 13) +
  theme(
    plot.title = element_text(face = "bold"),
    plot.subtitle = element_text(size = 10),
    axis.title = element_text(face = "bold"),
    panel.grid.minor = element_blank()
  ) +
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0))