Complete Statistical Project

Author

Eric Mulaa

# Load necessary packages 
library(tidyverse)
library(dplyr)
library(skimr)      
library(naniar)      
library(lubridate)  
library(flextable)  
library(gt)         
library(janitor)    
library(sf)          
library(leaflet)     
library(scales)
library(maps) 
library(viridis)
theme_set(theme_minimal())

1 Data dictionary & Exploratory Data Analysis

1.1 Loading the gun violence dataset and the us census state level

# Load data (use readr::read_csv for better parsing)
library(readr)
gun_violence_data <- read_csv("~/Final Project STA 518/gun_violence_geo.csv", show_col_types = FALSE) |>
  clean_names()
census_data <- read_csv("https://raw.githubusercontent.com/dilernia/STA418-518/main/Data/census_data_state_2008-2023.csv", show_col_types = FALSE) |>
  clean_names()

# Quick check of column names and types
glimpse(gun_violence_data)

Rows: 389,850
Columns: 16
$ geoid                    <chr> "26049", "22105", "06065", "48439", "22079", …
$ incident_id              <dbl> 2803597, 2799267, 2797925, 2795377, 2795134, …
$ date                     <dttm> 2023-12-31, 2023-12-31, 2023-12-31, 2023-12-…
$ state                    <chr> "Michigan", "Louisiana", "California", "Texas…
$ city                     <chr> "Flint", "Amite", "Perris", "Fort Worth", "Al…
$ county                   <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ business_or_school       <chr> NA, NA, NA, NA, NA, NA, "Point Woronzof Overl…
$ address                  <chr> "Balsam Lane", "300 block of S First St", "17…
$ latitude                 <dbl> 43.0647, 30.7448, 33.8029, 32.7058, 31.3183, …
$ longitude                <dbl> -83.7073, -90.4106, -117.2150, -97.2322, -92.…
$ number_victims_killed    <dbl> 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, …
$ number_victims_injured   <dbl> 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, …
$ number_suspects_killed   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ number_suspects_injured  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ number_suspects_arrested <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ incident_characteristics <chr> "Shot - Dead (murder, accidental, or suicide)…

glimpse(census_data)

Rows: 780
Columns: 26
$ geoid                    <chr> "01", "02", "04", "05", "06", "08", "09", "10…
$ county_state             <chr> "Alabama", "Alaska", "Arizona", "Arkansas", "…
$ year                     <dbl> 2008, 2008, 2008, 2008, 2008, 2008, 2008, 200…
$ population               <dbl> 4661900, 686293, 6500180, 2855390, 36756666, …
$ median_income            <dbl> 42666, 68460, 50958, 38815, 61021, 56993, 685…
$ median_monthly_rent_cost <dbl> 631, 949, 866, 606, 1135, 848, 970, 917, 1011…
$ median_monthly_home_cost <dbl> 742, 1393, 1177, 660, 1919, 1364, 1722, 1209,…
$ prop_female              <dbl> 0.5154499, 0.4789995, 0.4986763, 0.5102035, 0…
$ prop_male                <dbl> 0.4845501, 0.5210005, 0.5013237, 0.4897965, 0…
$ prop_white               <dbl> 0.7028360, 0.6911290, 0.8005912, 0.7871510, 0…
$ prop_black               <dbl> 0.261833802, 0.036257109, 0.036269457, 0.1550…
$ prop_native              <dbl> 0.005140822, 0.127259057, 0.044114009, 0.0052…
$ prop_asian               <dbl> 0.009768335, 0.046373779, 0.023755650, 0.0103…
$ prop_hawaiin_islander    <dbl> 2.417469e-04, 5.484538e-03, 1.436883e-03, 6.7…
$ prop_other_race          <dbl> 0.007177117, 0.012462607, 0.068086884, 0.0218…
$ prop_multi_racial        <dbl> 0.013002209, 0.081033902, 0.025745902, 0.0197…
$ prop_highschool          <dbl> 0.2604227, 0.2183097, 0.2106347, 0.2958009, 0…
$ prop_ged                 <dbl> 0.05599385, 0.04658968, 0.03935913, 0.0582695…
$ prop_some_college        <dbl> 0.05957806, 0.08116479, 0.07719126, 0.0687879…
$ prop_college_no_degree   <dbl> 0.1546491, 0.2168623, 0.1816943, 0.1534883, 0…
$ prop_associates          <dbl> 0.06831511, 0.08030577, 0.07837909, 0.0558112…
$ prop_bachelors           <dbl> 0.1427926, 0.1758716, 0.1590928, 0.1251352, 0…
$ prop_masters             <dbl> 0.05542474, 0.06465035, 0.06474227, 0.0433444…
$ prop_professional        <dbl> 0.01338380, 0.01743700, 0.01623786, 0.0121880…
$ prop_doctoral            <dbl> 0.008333932, 0.015125876, 0.010927963, 0.0072…
$ prop_poverty             <dbl> 0.15695141, 0.08420041, 0.14718370, 0.1732244…

We will be using the census data and later merge with gun violence data to create a summary statistics table

1.2 Data dictionary for our main data set, Census Data

dictionary <- tibble(
  variable = names(census_data),
  type = map_chr(census_data, ~class(.x)[1]),
  description = c(
    "geographic region ID with the first 2 digits being the state code and the last 3 digits the county FIPS code",
    "geographic region",
    "year",
    "population",
    "median income in dollars",
    "median monthly rent costs for renters in dollars",
    "median monthly housing costs for homeowners in dollars",
    "proportion of people who are female",
    "proportion of people who are male",
    "proportion of people who are white alone",
    "proportion of people who are black or African American alone",
    "proportion of people who are American Indian and Alaska Native alone",
    "proportion of people who are Asian alone",
    "proportion of people who are Native Hawaiian and Other Pacific Islander alone",
    "proportion of people who are some other race alone",
    "proportion of people who are two or more races",
    "proportion of people 25 and older whose highest education-level is high school",
    "proportion of people 25 and older whose highest education-level is a GED",
    "proportion of people 25 and older whose highest education-level is some, but less than 1 year of college",
    "proportion of people 25 and older whose highest education-level is greater than 1 year of college but no degree",
    "proportion of people 25 and older whose highest education-level is an Associates degree",
    "proportion of people 25 and older whose highest education-level is a Bachelors degree",
    "proportion of people 25 and older whose highest education-level is a Masters degree",
    "proportion of people 25 and older whose highest education-level is a Professional degree",
    "proportion of people 25 and older whose highest education-level is a Doctoral degree",
    "proportion of people 25 and older living in poverty, defined by the Census Bureau as having an income below the poverty threshold for their family size."
  )
)

if(nrow(dictionary) != ncol(census_data)) {
  stop("Data dictionary length does not match number of columns in census_data")
}

dictionary |>
  gt() |>
  tab_header(title = "Data dictionary: census_data")

Data dictionary: census_data
variable	type	description
geoid	character	geographic region ID with the first 2 digits being the state code and the last 3 digits the county FIPS code
county_state	character	geographic region
year	numeric	year
population	numeric	population
median_income	numeric	median income in dollars
median_monthly_rent_cost	numeric	median monthly rent costs for renters in dollars
median_monthly_home_cost	numeric	median monthly housing costs for homeowners in dollars
prop_female	numeric	proportion of people who are female
prop_male	numeric	proportion of people who are male
prop_white	numeric	proportion of people who are white alone
prop_black	numeric	proportion of people who are black or African American alone
prop_native	numeric	proportion of people who are American Indian and Alaska Native alone
prop_asian	numeric	proportion of people who are Asian alone
prop_hawaiin_islander	numeric	proportion of people who are Native Hawaiian and Other Pacific Islander alone
prop_other_race	numeric	proportion of people who are some other race alone
prop_multi_racial	numeric	proportion of people who are two or more races
prop_highschool	numeric	proportion of people 25 and older whose highest education-level is high school
prop_ged	numeric	proportion of people 25 and older whose highest education-level is a GED
prop_some_college	numeric	proportion of people 25 and older whose highest education-level is some, but less than 1 year of college
prop_college_no_degree	numeric	proportion of people 25 and older whose highest education-level is greater than 1 year of college but no degree
prop_associates	numeric	proportion of people 25 and older whose highest education-level is an Associates degree
prop_bachelors	numeric	proportion of people 25 and older whose highest education-level is a Bachelors degree
prop_masters	numeric	proportion of people 25 and older whose highest education-level is a Masters degree
prop_professional	numeric	proportion of people 25 and older whose highest education-level is a Professional degree
prop_doctoral	numeric	proportion of people 25 and older whose highest education-level is a Doctoral degree
prop_poverty	numeric	proportion of people 25 and older living in poverty, defined by the Census Bureau as having an income below the poverty threshold for their family size.

1.3 Describing Patterns of missingness for census data

#using naniar package to check visualize missingness
n_miss(census_data)

[1] 0

#Visualizing missingness
gg_miss_var(census_data)

From the visualization above, there is no missing values

1.4 Quickly visualizing our datasets

Census Data

skim(census_data)

Data summary
Name	census_data
Number of rows	780
Number of columns	26
_______________________
Column type frequency:
character	2
numeric	24
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	empty	n_unique	whitespace
geoid	0	1	2	2	0	52	0
county_state	0	1	4	20	0	52	0

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
year	1	2015.20	4.61	2008.00	2011.00	2015.00	2019.00	2023.00	▇▆▆▃▆
population	1	6229692.05	7025083.48	532668.00	1786156.75	4257700.00	7053886.75	39557045.00	▇▂▁▁▁
median_income	1	58358.78	14075.73	18314.00	48362.75	56484.50	67147.25	108210.00	▁▇▇▂▁
median_monthly_rent_cost	1	945.44	269.03	419.00	749.50	884.00	1092.25	1992.00	▃▇▃▁▁
median_monthly_home_cost	1	1109.16	371.59	241.00	850.00	1020.00	1353.00	2561.00	▁▇▃▂▁
prop_female	1	0.51	0.01	0.47	0.50	0.51	0.51	0.53	▁▂▇▇▁
prop_male	1	0.49	0.01	0.47	0.49	0.49	0.50	0.53	▁▇▇▂▁
prop_white	1	0.75	0.15	0.22	0.67	0.77	0.85	0.96	▁▁▃▇▇
prop_black	1	0.11	0.11	0.00	0.03	0.07	0.15	0.53	▇▃▂▁▁
prop_native	1	0.02	0.03	0.00	0.00	0.00	0.01	0.16	▇▁▁▁▁
prop_asian	1	0.04	0.05	0.00	0.01	0.03	0.04	0.39	▇▁▁▁▁
prop_hawaiin_islander	1	0.00	0.01	0.00	0.00	0.00	0.00	0.11	▇▁▁▁▁
prop_other_race	1	0.04	0.04	0.00	0.01	0.02	0.05	0.32	▇▁▁▁▁
prop_multi_racial	1	0.05	0.05	0.01	0.02	0.03	0.05	0.39	▇▁▁▁▁
prop_highschool	1	0.24	0.04	0.10	0.22	0.24	0.26	0.35	▁▂▇▆▁
prop_ged	1	0.04	0.01	0.02	0.04	0.04	0.05	0.07	▃▇▆▃▁
prop_some_college	1	0.06	0.01	0.01	0.06	0.06	0.07	0.09	▁▁▅▇▂
prop_college_no_degree	1	0.15	0.02	0.08	0.13	0.14	0.16	0.22	▁▅▇▃▁
prop_associates	1	0.09	0.02	0.03	0.08	0.08	0.10	0.15	▁▃▇▂▁
prop_bachelors	1	0.19	0.03	0.10	0.17	0.19	0.21	0.29	▁▅▇▃▁
prop_masters	1	0.08	0.03	0.04	0.06	0.08	0.09	0.24	▇▆▁▁▁
prop_professional	1	0.02	0.01	0.01	0.02	0.02	0.02	0.10	▇▁▁▁▁
prop_doctoral	1	0.01	0.01	0.01	0.01	0.01	0.02	0.05	▇▃▁▁▁
prop_poverty	1	0.14	0.05	0.07	0.11	0.13	0.16	0.46	▇▃▁▁▁

Gun Violence Data

skim(gun_violence_data)

Data summary
Name	gun_violence_data
Number of rows	389850
Number of columns	16
_______________________
Column type frequency:
character	7
numeric	8
POSIXct	1
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
geoid	46	1.00	5	5	2956
state	0	1.00	4	20	51
city	761	1.00	3	27	11548
county	389065	0.00	1	22	459
business_or_school	316795	0.19	1	85	47476
address	9665	0.98	3	61	325895
incident_characteristics	102	1.00	8	828	10939

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
incident_id	0	1	1468977.53	796698.22	92114.00	756778.50	1529846.00	2165292.50	2976478.00	▇▆▇▇▅
latitude	2	1	37.33	4.82	19.03	33.76	38.31	41.07	71.34	▁▇▅▁▁
longitude	2	1	-89.25	13.45	-171.74	-93.77	-86.77	-80.20	-67.04	▁▁▂▃▇
number_victims_killed	0	1	0.37	0.56	0.00	0.00	0.00	1.00	60.00	▇▁▁▁▁
number_victims_injured	0	1	0.78	1.06	0.00	0.00	1.00	1.00	439.00	▇▁▁▁▁
number_suspects_killed	0	1	0.06	0.24	0.00	0.00	0.00	0.00	4.00	▇▁▁▁▁
number_suspects_injured	0	1	0.05	0.23	0.00	0.00	0.00	0.00	5.00	▇▁▁▁▁
number_suspects_arrested	0	1	0.00	0.00	0.00	0.00	0.00	0.00	0.00	▁▁▇▁▁

Variable type: POSIXct

skim_variable	n_missing	complete_rate	min	max	median	n_unique
date	0	1	2014-01-01	2023-12-31	2019-09-26	3652

We have missing values especially onbusiness_or_schoolandcountycolumns but we will not be using those columns in our summary tables.

We convert the date column to a datetime object and extracting year and month

gun_violence_data <- gun_violence_data |>
  mutate(new_date = parse_date_time(date, orders = c("ymd HMS", "ymd", "mdy HMS", "mdy")),
         year = year(new_date),
         month = month(new_date, label = TRUE, abbr = TRUE))

2 Merging Datasets

We will merge gun violence data set to the census data set and create a summary statistics table that will help us understand the correlation between poverty rate and gun incidences see

2.1 Yearly gun incidents summarized data

yearly_gun_incidence <- gun_violence_data |>
  group_by(year) |>
  summarise(gun_incidence = n())

yearly_gun_incidence |>
flextable()

year	gun_incidence
2,014	27,091
2,015	32,468
2,016	36,636
2,017	37,508
2,018	34,564
2,019	36,290
2,020	46,690
2,021	48,474
2,022	46,506
2,023	43,623

2.2 Census yearly proportion of poverty summarized table

# Population average proportion in poverty 
yearly_prop_poverty <- census_data |>
  group_by(year) |>
  summarise(avg_prop_poverty = sum(prop_poverty * population, na.rm = TRUE) / sum(population, na.rm = TRUE))

# Lets calculate the national total population
national_pop <- census_data |>
  group_by(year) |>
  summarise(total_pop = sum(population, na.rm=TRUE))

yearly_prop_poverty |>
flextable()

Error: bkm [prop poverty table] should only contain alphanumeric characters, ':', '-' and '_'.

national_pop |>
  flextable()

Error: bkm [national_pop table] should only contain alphanumeric characters, ':', '-' and '_'.

2.3 Combining the two summary statistics table

# joining the gun data to both and show both results
combined_yearly_summary <- yearly_gun_incidence |>
  inner_join(national_pop, by = "year") |>
  mutate(inc_per_100k = gun_incidence / total_pop * 100000) |> # aggregating incidents per 100k
  inner_join(yearly_prop_poverty, by = "year")

We use inner_join on year so that only years present in both datasets are included — this avoids comparing years with no gun-incident records or missing socio-demographic information.

combined_yearly_summary |>
  flextable()

year	gun_incidence	total_pop	inc_per_100k	avg_prop_poverty
2,014	27,091	322,405,453	8.402774	0.1584288
2,015	32,468	324,893,003	9.993444	0.1505759
2,016	36,636	326,538,822	11.219493	0.1435402
2,017	37,508	329,056,355	11.398655	0.1371734
2,018	34,564	330,362,592	10.462444	0.1340370
2,019	36,290	331,433,217	10.949415	0.1263793
2,021	48,474	335,157,329	14.463058	0.1303997
2,022	46,506	336,509,351	13.820121	0.1286181
2,023	43,623	338,120,587	12.901610	0.1272208

2.4 Correlation on per-100k vs average proportion poverty

# correlation on per-100k vs average prop poverty
cor_val <- cor(combined_yearly_summary$inc_per_100k, combined_yearly_summary$avg_prop_poverty, use = "complete.obs")
round(cor_val, 2)

[1] -0.77

2.5 Correlation Visualization using a scatter plot

ggplot(combined_yearly_summary, aes(x = avg_prop_poverty, y = inc_per_100k)) +
  geom_point() + geom_smooth(method = "lm", se = FALSE) +
  labs(title = paste0("Correlation (r = ", round(cor_val,2), ")"),
       x = "Average poverty proportion", y = "Incidents per 100k")

We observe a strong, negative correlation (r = -0.77) between the national proportion of poverty and the national rate of gun incidents per 100k population. This indicates that on years where the average proportion of poverty is higher, the total national gun incident counts per capita are lower.

3 Data Visualization

state-level summary from census_data for year 2023

# Lets begin by preparing  state-level summary from census_data for year 2023
census_data_2023 <- census_data |>
  mutate(state = str_to_lower(county_state)) |>
  filter(year == 2023) |>
  select(-county_state)

checking for states with top population

# Get vector of top 5 states by population
top_states <- census_data |>
  group_by(county_state) |>
  summarise(total_pop = sum(population), .groups = "drop") |>
  arrange(desc(total_pop)) |>
  slice_head(n = 5) |>
  pull(county_state)

top_states

[1] "California" "Texas"      "Florida"    "New York"   "Illinois"

3.1 Chloropleth: Visualization for median income in the year 2023 across states

# Lets get state polygons and standardize names
us_states_sf <- st_as_sf(map("state", plot = FALSE, fill = TRUE)) |>
  rename(state = ID) |>
  mutate(state = str_to_lower(state))

# Left join on standardized state names
map_data_sf <- us_states_sf |>
  left_join(census_data_2023, by = "state")

pal <- colorNumeric(palette = "YlGnBu", domain = map_data_sf$median_income, na.color = "transparent")

leaflet(map_data_sf) |>
  addProviderTiles("CartoDB.Positron") |>
  addPolygons(fillColor = ~pal(median_income), weight = 1, color = "white", fillOpacity = 0.8,
              label = ~paste0(str_to_title(state), ": ", scales::dollar(median_income))) |>
  addLegend("bottomright", pal = pal, values = ~median_income, title = "Median income (2023)")

From the leaflet map, we can see states like Missisipi and West Virginia have the lowest median income, while states like Carlifonia, Massachusets and New Hampshire record high median incomes.`

3.2 Histogram: Distribution of median income in 2023 across states

ggplot(census_data_2023, aes(x = median_income)) +
  geom_histogram(fill = viridis(1), color = "white", alpha = 0.8) +
  labs(title = "Distribution of State Median Incomes",
       x = "Median Income ($)", y = "Count of States") +
  scale_x_continuous(labels = scales::dollar) +
  theme_minimal()

The histogram reveals that most U.S. states have median household incomes clustered between $60,000 and $80,000, with a peak near $75,000. Distribution appears to be right skewed.

3.3 Scatter plot: Visualization of poverty rate vs bachelors degree

ggplot(census_data_2023, aes(x = prop_bachelors, y = prop_poverty)) +
  geom_point(aes(color = median_income), alpha = 0.7) +
  geom_smooth(method = "lm", se = TRUE, color = "red") +
  labs(title = "Poverty Rate vs. Proportion with Bachelor's Degrees",
       x = "Proportion with Bachelor's Degree",
       y = "Proportion in Poverty",
       color = "Median Income (USD)",
       caption = "Data Source: census_data (years 2008-2023)") +
  scale_x_continuous(labels = scales::percent) +
  scale_y_continuous(labels = scales::percent) +
  scale_color_viridis_c(labels = scales::dollar) +
  theme_minimal()

3.4 Stacked Bar Chart: Racial composition stacked bar chart for Top 5 populated states

# Lets reshape race data
race_data_long <- census_data |>
  select(county_state, prop_white, prop_asian, prop_multi_racial, prop_black, prop_hawaiin_islander) |>
  pivot_longer(cols = starts_with("prop_"), names_to = "race", values_to = "proportion") |>
  mutate(race = gsub("prop_", "", race))

# Pick top states
top_states <- c("California","Texas","Florida","New York" ,"Illinois")

race_data_long <- race_data_long |>
  mutate(county_state = factor(county_state, levels = top_states))


ggplot(filter(race_data_long, county_state %in% top_states),
       aes(x = county_state, y = proportion, fill = race)) +
  geom_bar(stat = "identity", position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  labs(title = "Racial Composition of Top 5 States",
       x = "State", y = "Proportion of Population", fill = "Race/Ethnicity") +
  scale_fill_brewer(palette = "Set2") +
  theme_minimal()

3.5 Time series

national_avg_income <- census_data |>
  group_by(year) |>
  summarise(avg_median_income = mean(median_income, na.rm = TRUE),
            total_population = sum(population, na.rm = TRUE))

ggplot(national_avg_income, aes(x = year, y = avg_median_income)) +
  geom_line(size = 1) + geom_point(size = 2) +
  scale_y_continuous(labels = scales::dollar) +
  labs(title = "Average Median Income Over Time (US)", x = "Year", y = "Average Median Income (USD)",
       caption = "Data source: census_data aggregated to national level") +
  theme_minimal()

4 Monte Carlo / Permutation Test

4.1 Hypothesis Question

Do U.S. states with higher proportions of residents holding a bachelor’s degree have different median household incomes than states with lower proportions of bachelor’s degrees?

Test chosen: Two‑sample permutation test comparing the mean median_income between two groups of states defined by prop_bachelors (above vs below the median proportion).

Hypotheses * Null hypothesis (H0): The mean median household income is the same for states with high and low proportions of bachelor’s degrees; any observed difference is due to random assignment of the state labels. * Alternative hypothesis (Ha): The mean median household income differs between the two groups (two‑sided).

4.2 Code implementation

# Lets start by creating a binary grouping variable of HighEdu = prop_bachelors > median(prop_bachelors)
set.seed(2025)  # Lets set the seed for reproducibility
census_high_edu <- census_data_2023 |>
  mutate(
    high_bachelors = ifelse(prop_bachelors > median(prop_bachelors, na.rm = TRUE), "High", "Low")
  ) |>
  filter(!is.na(median_income), !is.na(high_bachelors))

# Lets calculate the observed difference in group means (High - Low)
obs_diff <- census_high_edu |>
  group_by(high_bachelors) |>
  summarize(mean_income = mean(median_income)) |>
  arrange(high_bachelors) |>
  summarize(diff = mean_income[high_bachelors == "High"] - mean_income[high_bachelors == "Low"]) %>%
  pull(diff)

# Lets do start our permutation
n_perm <- 100000
perm_diffs <- numeric(n_perm)

for (i in seq_len(n_perm)) {
  perm_labels <- sample(census_high_edu$high_bachelors)            
  perm_diffs[i] <- with(census_high_edu, 
                        mean(median_income[perm_labels == "High"]) -
                        mean(median_income[perm_labels == "Low"]))
}


p_value <- mean(abs(perm_diffs) >= abs(obs_diff))

lower_cut <- quantile(perm_diffs, 0.025)
upper_cut <- quantile(perm_diffs, 0.975)


cat("Observed difference (High - Low):", round(obs_diff, 2), "\n")

Observed difference (High - Low): 18651.81

cat("Permutation p-value (two-sided):", signif(p_value, 3), "\n")

Permutation p-value (two-sided): 0

cat("95% null cutoff interval:", round(lower_cut,2), "to", round(upper_cut,2), "\n")

95% null cutoff interval: -7879.62 to 7934.9

Since our p_value < 0.05 we reject H0 and conclude that there is evidence that mean median household income differs between states with high vs low bachelor proportions.

4.3 ggplot visualization for the null distribution

# Plotting null distribution 
perm_df <- data.frame(perm_diff = perm_diffs)

ggplot(perm_df, aes(x = perm_diff)) +
  geom_histogram(binwidth = diff(range(perm_diffs))/60, color = "black", fill = "lightblue") +
  geom_vline(xintercept = obs_diff, color = "red", size = 1, linetype = "solid") +
  geom_vline(xintercept = lower_cut, color = "darkgreen", size = 1, linetype = "dashed") +
  geom_vline(xintercept = upper_cut, color = "darkgreen", size = 1, linetype = "dashed") +
  labs(
    title = "Permutation Null Distribution of Difference in Mean Median Income",
    subtitle = "Group labels: High vs Low proportion of bachelor degrees",
    x = "Difference in mean median_income (High - Low)",
    y = "Count"
  ) +
  theme_minimal(base_size = 13) +
  theme(
    plot.title = element_text(face = "bold"),
    plot.subtitle = element_text(size = 10),
    axis.title = element_text(face = "bold"),
    panel.grid.minor = element_blank()
  ) +
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0))