# Load necessary packages
library(tidyverse)
library(dplyr)
library(skimr)
library(naniar)
library(lubridate)
library(flextable)
library(gt)
library(janitor)
library(sf)
library(leaflet)
library(scales)
library(maps)
library(viridis)
theme_set(theme_minimal())Complete Statistical Project
1 Data dictionary & Exploratory Data Analysis
1.1 Loading the gun violence dataset and the us census state level
# Load data (use readr::read_csv for better parsing)
library(readr)
gun_violence_data <- read_csv("~/Final Project STA 518/gun_violence_geo.csv", show_col_types = FALSE) |>
clean_names()
census_data <- read_csv("https://raw.githubusercontent.com/dilernia/STA418-518/main/Data/census_data_state_2008-2023.csv", show_col_types = FALSE) |>
clean_names()
# Quick check of column names and types
glimpse(gun_violence_data)Rows: 389,850
Columns: 16
$ geoid <chr> "26049", "22105", "06065", "48439", "22079", …
$ incident_id <dbl> 2803597, 2799267, 2797925, 2795377, 2795134, …
$ date <dttm> 2023-12-31, 2023-12-31, 2023-12-31, 2023-12-…
$ state <chr> "Michigan", "Louisiana", "California", "Texas…
$ city <chr> "Flint", "Amite", "Perris", "Fort Worth", "Al…
$ county <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ business_or_school <chr> NA, NA, NA, NA, NA, NA, "Point Woronzof Overl…
$ address <chr> "Balsam Lane", "300 block of S First St", "17…
$ latitude <dbl> 43.0647, 30.7448, 33.8029, 32.7058, 31.3183, …
$ longitude <dbl> -83.7073, -90.4106, -117.2150, -97.2322, -92.…
$ number_victims_killed <dbl> 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, …
$ number_victims_injured <dbl> 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, …
$ number_suspects_killed <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ number_suspects_injured <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ number_suspects_arrested <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ incident_characteristics <chr> "Shot - Dead (murder, accidental, or suicide)…
glimpse(census_data)Rows: 780
Columns: 26
$ geoid <chr> "01", "02", "04", "05", "06", "08", "09", "10…
$ county_state <chr> "Alabama", "Alaska", "Arizona", "Arkansas", "…
$ year <dbl> 2008, 2008, 2008, 2008, 2008, 2008, 2008, 200…
$ population <dbl> 4661900, 686293, 6500180, 2855390, 36756666, …
$ median_income <dbl> 42666, 68460, 50958, 38815, 61021, 56993, 685…
$ median_monthly_rent_cost <dbl> 631, 949, 866, 606, 1135, 848, 970, 917, 1011…
$ median_monthly_home_cost <dbl> 742, 1393, 1177, 660, 1919, 1364, 1722, 1209,…
$ prop_female <dbl> 0.5154499, 0.4789995, 0.4986763, 0.5102035, 0…
$ prop_male <dbl> 0.4845501, 0.5210005, 0.5013237, 0.4897965, 0…
$ prop_white <dbl> 0.7028360, 0.6911290, 0.8005912, 0.7871510, 0…
$ prop_black <dbl> 0.261833802, 0.036257109, 0.036269457, 0.1550…
$ prop_native <dbl> 0.005140822, 0.127259057, 0.044114009, 0.0052…
$ prop_asian <dbl> 0.009768335, 0.046373779, 0.023755650, 0.0103…
$ prop_hawaiin_islander <dbl> 2.417469e-04, 5.484538e-03, 1.436883e-03, 6.7…
$ prop_other_race <dbl> 0.007177117, 0.012462607, 0.068086884, 0.0218…
$ prop_multi_racial <dbl> 0.013002209, 0.081033902, 0.025745902, 0.0197…
$ prop_highschool <dbl> 0.2604227, 0.2183097, 0.2106347, 0.2958009, 0…
$ prop_ged <dbl> 0.05599385, 0.04658968, 0.03935913, 0.0582695…
$ prop_some_college <dbl> 0.05957806, 0.08116479, 0.07719126, 0.0687879…
$ prop_college_no_degree <dbl> 0.1546491, 0.2168623, 0.1816943, 0.1534883, 0…
$ prop_associates <dbl> 0.06831511, 0.08030577, 0.07837909, 0.0558112…
$ prop_bachelors <dbl> 0.1427926, 0.1758716, 0.1590928, 0.1251352, 0…
$ prop_masters <dbl> 0.05542474, 0.06465035, 0.06474227, 0.0433444…
$ prop_professional <dbl> 0.01338380, 0.01743700, 0.01623786, 0.0121880…
$ prop_doctoral <dbl> 0.008333932, 0.015125876, 0.010927963, 0.0072…
$ prop_poverty <dbl> 0.15695141, 0.08420041, 0.14718370, 0.1732244…
We will be using the census data and later merge with gun violence data to create a summary statistics table
1.2 Data dictionary for our main data set, Census Data
dictionary <- tibble(
variable = names(census_data),
type = map_chr(census_data, ~class(.x)[1]),
description = c(
"geographic region ID with the first 2 digits being the state code and the last 3 digits the county FIPS code",
"geographic region",
"year",
"population",
"median income in dollars",
"median monthly rent costs for renters in dollars",
"median monthly housing costs for homeowners in dollars",
"proportion of people who are female",
"proportion of people who are male",
"proportion of people who are white alone",
"proportion of people who are black or African American alone",
"proportion of people who are American Indian and Alaska Native alone",
"proportion of people who are Asian alone",
"proportion of people who are Native Hawaiian and Other Pacific Islander alone",
"proportion of people who are some other race alone",
"proportion of people who are two or more races",
"proportion of people 25 and older whose highest education-level is high school",
"proportion of people 25 and older whose highest education-level is a GED",
"proportion of people 25 and older whose highest education-level is some, but less than 1 year of college",
"proportion of people 25 and older whose highest education-level is greater than 1 year of college but no degree",
"proportion of people 25 and older whose highest education-level is an Associates degree",
"proportion of people 25 and older whose highest education-level is a Bachelors degree",
"proportion of people 25 and older whose highest education-level is a Masters degree",
"proportion of people 25 and older whose highest education-level is a Professional degree",
"proportion of people 25 and older whose highest education-level is a Doctoral degree",
"proportion of people 25 and older living in poverty, defined by the Census Bureau as having an income below the poverty threshold for their family size."
)
)
if(nrow(dictionary) != ncol(census_data)) {
stop("Data dictionary length does not match number of columns in census_data")
}
dictionary |>
gt() |>
tab_header(title = "Data dictionary: census_data")| Data dictionary: census_data | ||
|---|---|---|
| variable | type | description |
| geoid | character | geographic region ID with the first 2 digits being the state code and the last 3 digits the county FIPS code |
| county_state | character | geographic region |
| year | numeric | year |
| population | numeric | population |
| median_income | numeric | median income in dollars |
| median_monthly_rent_cost | numeric | median monthly rent costs for renters in dollars |
| median_monthly_home_cost | numeric | median monthly housing costs for homeowners in dollars |
| prop_female | numeric | proportion of people who are female |
| prop_male | numeric | proportion of people who are male |
| prop_white | numeric | proportion of people who are white alone |
| prop_black | numeric | proportion of people who are black or African American alone |
| prop_native | numeric | proportion of people who are American Indian and Alaska Native alone |
| prop_asian | numeric | proportion of people who are Asian alone |
| prop_hawaiin_islander | numeric | proportion of people who are Native Hawaiian and Other Pacific Islander alone |
| prop_other_race | numeric | proportion of people who are some other race alone |
| prop_multi_racial | numeric | proportion of people who are two or more races |
| prop_highschool | numeric | proportion of people 25 and older whose highest education-level is high school |
| prop_ged | numeric | proportion of people 25 and older whose highest education-level is a GED |
| prop_some_college | numeric | proportion of people 25 and older whose highest education-level is some, but less than 1 year of college |
| prop_college_no_degree | numeric | proportion of people 25 and older whose highest education-level is greater than 1 year of college but no degree |
| prop_associates | numeric | proportion of people 25 and older whose highest education-level is an Associates degree |
| prop_bachelors | numeric | proportion of people 25 and older whose highest education-level is a Bachelors degree |
| prop_masters | numeric | proportion of people 25 and older whose highest education-level is a Masters degree |
| prop_professional | numeric | proportion of people 25 and older whose highest education-level is a Professional degree |
| prop_doctoral | numeric | proportion of people 25 and older whose highest education-level is a Doctoral degree |
| prop_poverty | numeric | proportion of people 25 and older living in poverty, defined by the Census Bureau as having an income below the poverty threshold for their family size. |
1.3 Describing Patterns of missingness for census data
#using naniar package to check visualize missingness
n_miss(census_data)[1] 0
#Visualizing missingness
gg_miss_var(census_data)From the visualization above, there is no missing values
1.4 Quickly visualizing our datasets
Census Data
skim(census_data)| Name | census_data |
| Number of rows | 780 |
| Number of columns | 26 |
| _______________________ | |
| Column type frequency: | |
| character | 2 |
| numeric | 24 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| geoid | 0 | 1 | 2 | 2 | 0 | 52 | 0 |
| county_state | 0 | 1 | 4 | 20 | 0 | 52 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| year | 0 | 1 | 2015.20 | 4.61 | 2008.00 | 2011.00 | 2015.00 | 2019.00 | 2023.00 | ▇▆▆▃▆ |
| population | 0 | 1 | 6229692.05 | 7025083.48 | 532668.00 | 1786156.75 | 4257700.00 | 7053886.75 | 39557045.00 | ▇▂▁▁▁ |
| median_income | 0 | 1 | 58358.78 | 14075.73 | 18314.00 | 48362.75 | 56484.50 | 67147.25 | 108210.00 | ▁▇▇▂▁ |
| median_monthly_rent_cost | 0 | 1 | 945.44 | 269.03 | 419.00 | 749.50 | 884.00 | 1092.25 | 1992.00 | ▃▇▃▁▁ |
| median_monthly_home_cost | 0 | 1 | 1109.16 | 371.59 | 241.00 | 850.00 | 1020.00 | 1353.00 | 2561.00 | ▁▇▃▂▁ |
| prop_female | 0 | 1 | 0.51 | 0.01 | 0.47 | 0.50 | 0.51 | 0.51 | 0.53 | ▁▂▇▇▁ |
| prop_male | 0 | 1 | 0.49 | 0.01 | 0.47 | 0.49 | 0.49 | 0.50 | 0.53 | ▁▇▇▂▁ |
| prop_white | 0 | 1 | 0.75 | 0.15 | 0.22 | 0.67 | 0.77 | 0.85 | 0.96 | ▁▁▃▇▇ |
| prop_black | 0 | 1 | 0.11 | 0.11 | 0.00 | 0.03 | 0.07 | 0.15 | 0.53 | ▇▃▂▁▁ |
| prop_native | 0 | 1 | 0.02 | 0.03 | 0.00 | 0.00 | 0.00 | 0.01 | 0.16 | ▇▁▁▁▁ |
| prop_asian | 0 | 1 | 0.04 | 0.05 | 0.00 | 0.01 | 0.03 | 0.04 | 0.39 | ▇▁▁▁▁ |
| prop_hawaiin_islander | 0 | 1 | 0.00 | 0.01 | 0.00 | 0.00 | 0.00 | 0.00 | 0.11 | ▇▁▁▁▁ |
| prop_other_race | 0 | 1 | 0.04 | 0.04 | 0.00 | 0.01 | 0.02 | 0.05 | 0.32 | ▇▁▁▁▁ |
| prop_multi_racial | 0 | 1 | 0.05 | 0.05 | 0.01 | 0.02 | 0.03 | 0.05 | 0.39 | ▇▁▁▁▁ |
| prop_highschool | 0 | 1 | 0.24 | 0.04 | 0.10 | 0.22 | 0.24 | 0.26 | 0.35 | ▁▂▇▆▁ |
| prop_ged | 0 | 1 | 0.04 | 0.01 | 0.02 | 0.04 | 0.04 | 0.05 | 0.07 | ▃▇▆▃▁ |
| prop_some_college | 0 | 1 | 0.06 | 0.01 | 0.01 | 0.06 | 0.06 | 0.07 | 0.09 | ▁▁▅▇▂ |
| prop_college_no_degree | 0 | 1 | 0.15 | 0.02 | 0.08 | 0.13 | 0.14 | 0.16 | 0.22 | ▁▅▇▃▁ |
| prop_associates | 0 | 1 | 0.09 | 0.02 | 0.03 | 0.08 | 0.08 | 0.10 | 0.15 | ▁▃▇▂▁ |
| prop_bachelors | 0 | 1 | 0.19 | 0.03 | 0.10 | 0.17 | 0.19 | 0.21 | 0.29 | ▁▅▇▃▁ |
| prop_masters | 0 | 1 | 0.08 | 0.03 | 0.04 | 0.06 | 0.08 | 0.09 | 0.24 | ▇▆▁▁▁ |
| prop_professional | 0 | 1 | 0.02 | 0.01 | 0.01 | 0.02 | 0.02 | 0.02 | 0.10 | ▇▁▁▁▁ |
| prop_doctoral | 0 | 1 | 0.01 | 0.01 | 0.01 | 0.01 | 0.01 | 0.02 | 0.05 | ▇▃▁▁▁ |
| prop_poverty | 0 | 1 | 0.14 | 0.05 | 0.07 | 0.11 | 0.13 | 0.16 | 0.46 | ▇▃▁▁▁ |
Gun Violence Data
skim(gun_violence_data)| Name | gun_violence_data |
| Number of rows | 389850 |
| Number of columns | 16 |
| _______________________ | |
| Column type frequency: | |
| character | 7 |
| numeric | 8 |
| POSIXct | 1 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| geoid | 46 | 1.00 | 5 | 5 | 0 | 2956 | 0 |
| state | 0 | 1.00 | 4 | 20 | 0 | 51 | 0 |
| city | 761 | 1.00 | 3 | 27 | 0 | 11548 | 0 |
| county | 389065 | 0.00 | 1 | 22 | 0 | 459 | 0 |
| business_or_school | 316795 | 0.19 | 1 | 85 | 0 | 47476 | 0 |
| address | 9665 | 0.98 | 3 | 61 | 0 | 325895 | 0 |
| incident_characteristics | 102 | 1.00 | 8 | 828 | 0 | 10939 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| incident_id | 0 | 1 | 1468977.53 | 796698.22 | 92114.00 | 756778.50 | 1529846.00 | 2165292.50 | 2976478.00 | ▇▆▇▇▅ |
| latitude | 2 | 1 | 37.33 | 4.82 | 19.03 | 33.76 | 38.31 | 41.07 | 71.34 | ▁▇▅▁▁ |
| longitude | 2 | 1 | -89.25 | 13.45 | -171.74 | -93.77 | -86.77 | -80.20 | -67.04 | ▁▁▂▃▇ |
| number_victims_killed | 0 | 1 | 0.37 | 0.56 | 0.00 | 0.00 | 0.00 | 1.00 | 60.00 | ▇▁▁▁▁ |
| number_victims_injured | 0 | 1 | 0.78 | 1.06 | 0.00 | 0.00 | 1.00 | 1.00 | 439.00 | ▇▁▁▁▁ |
| number_suspects_killed | 0 | 1 | 0.06 | 0.24 | 0.00 | 0.00 | 0.00 | 0.00 | 4.00 | ▇▁▁▁▁ |
| number_suspects_injured | 0 | 1 | 0.05 | 0.23 | 0.00 | 0.00 | 0.00 | 0.00 | 5.00 | ▇▁▁▁▁ |
| number_suspects_arrested | 0 | 1 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | ▁▁▇▁▁ |
Variable type: POSIXct
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| date | 0 | 1 | 2014-01-01 | 2023-12-31 | 2019-09-26 | 3652 |
We have missing values especially onbusiness_or_schoolandcountycolumns but we will not be using those columns in our summary tables.
We convert the date column to a datetime object and extracting year and month
gun_violence_data <- gun_violence_data |>
mutate(new_date = parse_date_time(date, orders = c("ymd HMS", "ymd", "mdy HMS", "mdy")),
year = year(new_date),
month = month(new_date, label = TRUE, abbr = TRUE))2 Merging Datasets
We will merge gun violence data set to the census data set and create a summary statistics table that will help us understand the correlation between poverty rate and gun incidences see
2.1 Yearly gun incidents summarized data
yearly_gun_incidence <- gun_violence_data |>
group_by(year) |>
summarise(gun_incidence = n())yearly_gun_incidence |>
flextable()year | gun_incidence |
|---|---|
2,014 | 27,091 |
2,015 | 32,468 |
2,016 | 36,636 |
2,017 | 37,508 |
2,018 | 34,564 |
2,019 | 36,290 |
2,020 | 46,690 |
2,021 | 48,474 |
2,022 | 46,506 |
2,023 | 43,623 |
2.2 Census yearly proportion of poverty summarized table
# Population average proportion in poverty
yearly_prop_poverty <- census_data |>
group_by(year) |>
summarise(avg_prop_poverty = sum(prop_poverty * population, na.rm = TRUE) / sum(population, na.rm = TRUE))
# Lets calculate the national total population
national_pop <- census_data |>
group_by(year) |>
summarise(total_pop = sum(population, na.rm=TRUE))yearly_prop_poverty |>
flextable()Error: bkm [prop poverty table] should only contain alphanumeric characters, ':', '-' and '_'.
national_pop |>
flextable()Error: bkm [national_pop table] should only contain alphanumeric characters, ':', '-' and '_'.
2.3 Combining the two summary statistics table
# joining the gun data to both and show both results
combined_yearly_summary <- yearly_gun_incidence |>
inner_join(national_pop, by = "year") |>
mutate(inc_per_100k = gun_incidence / total_pop * 100000) |> # aggregating incidents per 100k
inner_join(yearly_prop_poverty, by = "year")We use
inner_joinonyearso that only years present in both datasets are included — this avoids comparing years with no gun-incident records or missing socio-demographic information.
combined_yearly_summary |>
flextable()year | gun_incidence | total_pop | inc_per_100k | avg_prop_poverty |
|---|---|---|---|---|
2,014 | 27,091 | 322,405,453 | 8.402774 | 0.1584288 |
2,015 | 32,468 | 324,893,003 | 9.993444 | 0.1505759 |
2,016 | 36,636 | 326,538,822 | 11.219493 | 0.1435402 |
2,017 | 37,508 | 329,056,355 | 11.398655 | 0.1371734 |
2,018 | 34,564 | 330,362,592 | 10.462444 | 0.1340370 |
2,019 | 36,290 | 331,433,217 | 10.949415 | 0.1263793 |
2,021 | 48,474 | 335,157,329 | 14.463058 | 0.1303997 |
2,022 | 46,506 | 336,509,351 | 13.820121 | 0.1286181 |
2,023 | 43,623 | 338,120,587 | 12.901610 | 0.1272208 |
2.4 Correlation on per-100k vs average proportion poverty
# correlation on per-100k vs average prop poverty
cor_val <- cor(combined_yearly_summary$inc_per_100k, combined_yearly_summary$avg_prop_poverty, use = "complete.obs")
round(cor_val, 2)[1] -0.77
2.5 Correlation Visualization using a scatter plot
ggplot(combined_yearly_summary, aes(x = avg_prop_poverty, y = inc_per_100k)) +
geom_point() + geom_smooth(method = "lm", se = FALSE) +
labs(title = paste0("Correlation (r = ", round(cor_val,2), ")"),
x = "Average poverty proportion", y = "Incidents per 100k")We observe a strong, negative correlation (r = -0.77) between the national proportion of poverty and the national rate of gun incidents per 100k population. This indicates that on years where the average proportion of poverty is higher, the total national gun incident counts per capita are lower.
3 Data Visualization
state-level summary from census_data for year 2023
# Lets begin by preparing state-level summary from census_data for year 2023
census_data_2023 <- census_data |>
mutate(state = str_to_lower(county_state)) |>
filter(year == 2023) |>
select(-county_state)checking for states with top population
# Get vector of top 5 states by population
top_states <- census_data |>
group_by(county_state) |>
summarise(total_pop = sum(population), .groups = "drop") |>
arrange(desc(total_pop)) |>
slice_head(n = 5) |>
pull(county_state)
top_states[1] "California" "Texas" "Florida" "New York" "Illinois"
3.1 Chloropleth: Visualization for median income in the year 2023 across states
# Lets get state polygons and standardize names
us_states_sf <- st_as_sf(map("state", plot = FALSE, fill = TRUE)) |>
rename(state = ID) |>
mutate(state = str_to_lower(state))
# Left join on standardized state names
map_data_sf <- us_states_sf |>
left_join(census_data_2023, by = "state")
pal <- colorNumeric(palette = "YlGnBu", domain = map_data_sf$median_income, na.color = "transparent")
leaflet(map_data_sf) |>
addProviderTiles("CartoDB.Positron") |>
addPolygons(fillColor = ~pal(median_income), weight = 1, color = "white", fillOpacity = 0.8,
label = ~paste0(str_to_title(state), ": ", scales::dollar(median_income))) |>
addLegend("bottomright", pal = pal, values = ~median_income, title = "Median income (2023)")From the leaflet map, we can see states like Missisipi and West Virginia have the lowest median income, while states like Carlifonia, Massachusets and New Hampshire record high median incomes.`
3.2 Histogram: Distribution of median income in 2023 across states
ggplot(census_data_2023, aes(x = median_income)) +
geom_histogram(fill = viridis(1), color = "white", alpha = 0.8) +
labs(title = "Distribution of State Median Incomes",
x = "Median Income ($)", y = "Count of States") +
scale_x_continuous(labels = scales::dollar) +
theme_minimal()The histogram reveals that most U.S. states have median household incomes clustered between $60,000 and $80,000, with a peak near $75,000. Distribution appears to be right skewed.
3.3 Scatter plot: Visualization of poverty rate vs bachelors degree
ggplot(census_data_2023, aes(x = prop_bachelors, y = prop_poverty)) +
geom_point(aes(color = median_income), alpha = 0.7) +
geom_smooth(method = "lm", se = TRUE, color = "red") +
labs(title = "Poverty Rate vs. Proportion with Bachelor's Degrees",
x = "Proportion with Bachelor's Degree",
y = "Proportion in Poverty",
color = "Median Income (USD)",
caption = "Data Source: census_data (years 2008-2023)") +
scale_x_continuous(labels = scales::percent) +
scale_y_continuous(labels = scales::percent) +
scale_color_viridis_c(labels = scales::dollar) +
theme_minimal()3.4 Stacked Bar Chart: Racial composition stacked bar chart for Top 5 populated states
# Lets reshape race data
race_data_long <- census_data |>
select(county_state, prop_white, prop_asian, prop_multi_racial, prop_black, prop_hawaiin_islander) |>
pivot_longer(cols = starts_with("prop_"), names_to = "race", values_to = "proportion") |>
mutate(race = gsub("prop_", "", race))
# Pick top states
top_states <- c("California","Texas","Florida","New York" ,"Illinois")
race_data_long <- race_data_long |>
mutate(county_state = factor(county_state, levels = top_states))
ggplot(filter(race_data_long, county_state %in% top_states),
aes(x = county_state, y = proportion, fill = race)) +
geom_bar(stat = "identity", position = "fill") +
scale_y_continuous(labels = scales::percent) +
labs(title = "Racial Composition of Top 5 States",
x = "State", y = "Proportion of Population", fill = "Race/Ethnicity") +
scale_fill_brewer(palette = "Set2") +
theme_minimal()3.5 Time series
national_avg_income <- census_data |>
group_by(year) |>
summarise(avg_median_income = mean(median_income, na.rm = TRUE),
total_population = sum(population, na.rm = TRUE))
ggplot(national_avg_income, aes(x = year, y = avg_median_income)) +
geom_line(size = 1) + geom_point(size = 2) +
scale_y_continuous(labels = scales::dollar) +
labs(title = "Average Median Income Over Time (US)", x = "Year", y = "Average Median Income (USD)",
caption = "Data source: census_data aggregated to national level") +
theme_minimal()4 Monte Carlo / Permutation Test
4.1 Hypothesis Question
Do U.S. states with higher proportions of residents holding a bachelor’s degree have different median household incomes than states with lower proportions of bachelor’s degrees?
Test chosen: Two‑sample permutation test comparing the mean median_income between two groups of states defined by prop_bachelors (above vs below the median proportion).
Hypotheses * Null hypothesis (H0): The mean median household income is the same for states with high and low proportions of bachelor’s degrees; any observed difference is due to random assignment of the state labels. * Alternative hypothesis (Ha): The mean median household income differs between the two groups (two‑sided).
4.2 Code implementation
# Lets start by creating a binary grouping variable of HighEdu = prop_bachelors > median(prop_bachelors)
set.seed(2025) # Lets set the seed for reproducibility
census_high_edu <- census_data_2023 |>
mutate(
high_bachelors = ifelse(prop_bachelors > median(prop_bachelors, na.rm = TRUE), "High", "Low")
) |>
filter(!is.na(median_income), !is.na(high_bachelors))
# Lets calculate the observed difference in group means (High - Low)
obs_diff <- census_high_edu |>
group_by(high_bachelors) |>
summarize(mean_income = mean(median_income)) |>
arrange(high_bachelors) |>
summarize(diff = mean_income[high_bachelors == "High"] - mean_income[high_bachelors == "Low"]) %>%
pull(diff)
# Lets do start our permutation
n_perm <- 100000
perm_diffs <- numeric(n_perm)
for (i in seq_len(n_perm)) {
perm_labels <- sample(census_high_edu$high_bachelors)
perm_diffs[i] <- with(census_high_edu,
mean(median_income[perm_labels == "High"]) -
mean(median_income[perm_labels == "Low"]))
}
p_value <- mean(abs(perm_diffs) >= abs(obs_diff))
lower_cut <- quantile(perm_diffs, 0.025)
upper_cut <- quantile(perm_diffs, 0.975)
cat("Observed difference (High - Low):", round(obs_diff, 2), "\n")Observed difference (High - Low): 18651.81
cat("Permutation p-value (two-sided):", signif(p_value, 3), "\n")Permutation p-value (two-sided): 0
cat("95% null cutoff interval:", round(lower_cut,2), "to", round(upper_cut,2), "\n")95% null cutoff interval: -7879.62 to 7934.9
Since our
p_value < 0.05we reject H0 and conclude that there is evidence that mean median household income differs between states with high vs low bachelor proportions.
4.3 ggplot visualization for the null distribution
# Plotting null distribution
perm_df <- data.frame(perm_diff = perm_diffs)
ggplot(perm_df, aes(x = perm_diff)) +
geom_histogram(binwidth = diff(range(perm_diffs))/60, color = "black", fill = "lightblue") +
geom_vline(xintercept = obs_diff, color = "red", size = 1, linetype = "solid") +
geom_vline(xintercept = lower_cut, color = "darkgreen", size = 1, linetype = "dashed") +
geom_vline(xintercept = upper_cut, color = "darkgreen", size = 1, linetype = "dashed") +
labs(
title = "Permutation Null Distribution of Difference in Mean Median Income",
subtitle = "Group labels: High vs Low proportion of bachelor degrees",
x = "Difference in mean median_income (High - Low)",
y = "Count"
) +
theme_minimal(base_size = 13) +
theme(
plot.title = element_text(face = "bold"),
plot.subtitle = element_text(size = 10),
axis.title = element_text(face = "bold"),
panel.grid.minor = element_blank()
) +
scale_x_continuous(expand = c(0, 0)) +
scale_y_continuous(expand = c(0, 0))