This analysis examines state-level FY2023 Maternal, Infant, and Early Childhood Home Visiting (MIECHV) data, combined with American Community Survey poverty estimates, to understand how resources and reach vary across states. The work integrates descriptive statistics, regression modeling informed by causal inference concepts, and equity and efficiency diagnostics.
How does cross-state variation in poverty burden relate to equity in MIECHV funding and performance outcomes, including families reached, home visits delivered, and staff efficiency?
The goal is not to make strong causal claims from a single cross-sectional snapshot, but to quantify patterns that can guide program monitoring, peer comparison, and future evaluation design.
library(lmtest) # regression diagnostics (coeftest, bptest, resettest)
library(writexl)
library(scales)
library(nortest) # normality tests (e.g., Anderson-Darling)
library(performance)
library(magrittr) # for pipes
library(table1) # for descriptive statistics
library(tidyverse) # for tidy code
library(sessioninfo) # for session_info at bottom
library(dplyr) # for data manipulation
library(details) # for session_info at bottom
library(ggthemes) # for tufte theme
library(ggrepel) # for text plotting
library(patchwork) # for combining plots
library(readxl) # for reading xlsx files
library(kableExtra) # for table headers formatting
library(knitr) # for tables
library(sjPlot) # for model tables
library(broom) # for tidying model output
library(survey) #analysis of complex survey
library(pROC) # ROC curve visualization
library(skimr) # for summary generation
library(ggmosaic) #for mosaic plot
library(car) #for multicollinearity vif
library(generalhoslem) # for goodness of fit
library(ggeffects) # for effect modification visuals
library(usmap) # for US Map
library(plotly) # for map plotting
library(heatmaply) # for heat map
library(DT) # for interactive data tables
library(gt) # for formatted summary tables
library(htmltools) # for HTML table captions
# Nice helpers
tbl <- function(df, caption = NULL) {
gt::gt(df) %>%
gt::tab_header(title = caption) %>%
gt::fmt_number(where(is.numeric), decimals = 1, use_seps = TRUE) %>%
gt::opt_table_outline()
}
dt <- function(df, caption = NULL) {
datatable(df, rownames = FALSE, filter = "top",
options = list(pageLength = 10, scrollX = TRUE),
caption = htmltools::tags$caption(style="caption-side: top; text-align: left;", caption))
}
The analytic dataset is constructed from state-level FY2023 MIECHV program size and reach data merged with American Community Survey poverty population estimates (read below from ProgramSizeReach_with_poverty_population.csv).
Key variables include base and matching awards, home visiting staff FTE, maximum caseload, number of local agencies, families served, adults and children served, total, in-person, and virtual home visits, and poverty population counts (male, female, total, and under age 5).
Data cleaning involved dropping rows with missing values on any analysis variable.
Derived indicators include funding aggregates, load-adjusted metrics per 1,000 people in poverty, per-unit productivity and cost measures, capacity and structure ratios, and the modality mix of visits (in-person versus virtual shares).
These metrics allow us to look at equity (how much service per unit of poverty) and efficiency (how much service per unit of resource).
# Read the combined MIECHV program size/reach + ACS poverty file
df <- read.csv("C:/Users/kesha/Downloads/MIECHV_HEOR/ProgramSizeReach_with_poverty_population.csv")
data <- df %>%
drop_na(everything())
# Centralize variable choices used later (edit these vectors to include/exclude variables)
vars_summary <- c("base_award","matching_award","HV_staff_FTE","max_caseload",
"agencies","families_served","adults","children",
"total_home_visits","in_person_visits","virtual_visits",
"male_poverty","female_poverty","total_poverty_population", "total_under_5_poverty_population")
vars_corr <- c("total_funding","HV_staff_FTE","agencies","max_caseload",
"families_served","total_home_visits",
"funding_per_1k_poverty","families_per_1k_poverty","visits_per_1k_poverty",
"funding_per_family","funding_per_visit","visits_per_FTE","caseload_utilization")
vars_box <- c("total_funding","families_served","total_home_visits","visits_per_FTE","funding_per_family")
To facilitate meaningful comparisons across states, several derived indicators were constructed:
data <- data %>%
mutate(
# Funding aggregates
total_funding = base_award + matching_award,
# Load-adjusted (per 1,000 population in poverty)
funding_per_1k_poverty = if_else(total_poverty_population > 0, total_funding / (total_poverty_population / 1000), NA_real_),
families_per_1k_poverty = if_else(total_poverty_population > 0, families_served / (total_poverty_population / 1000), NA_real_),
visits_per_1k_poverty = if_else(total_poverty_population > 0, total_home_visits / (total_poverty_population / 1000), NA_real_),
# Per-unit productivity / cost
funding_per_family = if_else(families_served > 0, total_funding / families_served, NA_real_),
funding_per_child = if_else(children > 0, total_funding / children, NA_real_),
funding_per_visit = if_else(total_home_visits > 0, total_funding / total_home_visits, NA_real_),
visits_per_family = if_else(families_served > 0, total_home_visits / families_served, NA_real_),
visits_per_FTE = if_else(HV_staff_FTE > 0, total_home_visits / HV_staff_FTE, NA_real_),
families_per_FTE = if_else(HV_staff_FTE > 0, families_served / HV_staff_FTE, NA_real_),
# Capacity/structure
caseload_utilization = if_else(max_caseload > 0, families_served / max_caseload, NA_real_),
households_per_agency = if_else(agencies > 0, families_served / agencies, NA_real_),
FTE_per_agency = if_else(agencies > 0, HV_staff_FTE / agencies, NA_real_),
avg_caseload_per_FTE = if_else(HV_staff_FTE > 0, max_caseload / HV_staff_FTE, NA_real_),
# Modality mix
in_person_share = if_else(total_home_visits > 0, in_person_visits / total_home_visits, NA_real_),
virtual_share = if_else(total_home_visits > 0, virtual_visits / total_home_visits, NA_real_),
# Poverty composition (optional)
poverty_female_share = if_else(total_poverty_population > 0, female_poverty / total_poverty_population, NA_real_),
poverty_male_share = if_else(total_poverty_population > 0, male_poverty / total_poverty_population, NA_real_),
# Quartiles by poverty load
poverty_quartile = ntile(total_poverty_population, 4)
) %>%
mutate(
poverty_quartile = factor(poverty_quartile,
labels = c("Lowest load", "Q2", "Q3", "Highest load")))
# Quartile cutoffs (table + histogram)
qcuts <- quantile(data$total_poverty_population, c(0,.25,.5,.75,1), na.rm = TRUE)
tbl(data.frame(Quantile = c("Min","Q1","Median","Q3","Max"),
Cutoff = as.numeric(qcuts)),
"Quartile cutoffs for total_poverty_population")
Quartile cutoffs for total_poverty_population

| Quantile | Cutoff |
|---|---|
| Min | 224.0 |
| Q1 | 3,425.0 |
| Median | 5,322.0 |
| Q3 | 22,787.0 |
| Max | 81,335.0 |
p_hist <- ggplot(data, aes(total_poverty_population)) +
geom_histogram(bins = 30, fill = "#7fcdbb", color = "white") +
geom_vline(xintercept = qcuts, linetype = c("solid","dashed","dashed","dashed","solid")) +
labs(title = "Poverty population: distribution & quartiles",
x = "Total poverty population", y = "States") +
theme_minimal()
plotly::ggplotly(p_hist)
# Poverty quartiles as a quasi-experimental structure
Total poverty population is used to divide states into four quartiles of underlying need. In this dataset the distribution is highly right-skewed: a small number of large states carry a very high share of the poverty population, while many states have relatively small poverty counts.
The quartile cutoffs are approximately 224 (minimum), 3,425 (Q1), 5,322 (median), 22,787 (Q3), and 81,335 (maximum).
These quartiles serve as a simple quasi-experimental contrast. They are not a true experiment, but they allow us to compare how funding, reach, and visit volume per unit of need differ between lower-load and higher-load states.
All equity and performance metrics are summarized by these quartiles to show whether states with more need receive proportionally more resources and service.
states_by_quartile <- data %>%
group_by(poverty_quartile) %>%
summarise(
n_states = n(),
state_codes = paste(state_code, collapse = ", "),
.groups ="drop")
dt(states_by_quartile, "States by poverty-load quartiles")
quartile_summary <- data %>%
group_by(poverty_quartile) %>%
summarise(
n_states = n(),
min_pov = min(total_poverty_population, na.rm = TRUE),
max_pov = max(total_poverty_population, na.rm = TRUE),
avg_funding_per_1k = mean(funding_per_1k_poverty, na.rm = TRUE),
avg_families_per_1k = mean(families_per_1k_poverty, na.rm = TRUE),
avg_visits_per_1k = mean(visits_per_1k_poverty, na.rm = TRUE),
.groups = "drop"
)
tbl(quartile_summary, "Summary by poverty-load quartile")
Summary by poverty-load quartile

| poverty_quartile | n_states | min_pov | max_pov | avg_funding_per_1k | avg_families_per_1k | avg_visits_per_1k |
|---|---|---|---|---|---|---|
| Lowest load | 2,643.0 | 224.0 | 3,425.0 | 5,917,401.7 | 1,676.7 | 29,822.6 |
| Q2 | 2,643.0 | 3,425.0 | 5,322.0 | 2,090,619.0 | 316.6 | 3,947.4 |
| Q3 | 2,643.0 | 5,322.0 | 22,787.0 | 1,152,155.0 | 175.2 | 2,065.7 |
| Highest load | 2,643.0 | 22,787.0 | 81,335.0 | 494,613.3 | 108.5 | 1,238.0 |
# Quick bar of counts
ggplot(states_by_quartile, aes(poverty_quartile, n_states)) +
geom_col(fill= "#fa9fb5") + geom_text(aes(label = n_states), vjust = -0.3) +
labs(title = "State count by poverty-load quartile", x = NULL, y = "Number of states") +
theme_minimal()
# table1: Overall
table1(~ base_award + matching_award + total_funding +
HV_staff_FTE + max_caseload + agencies +
families_served + adults + children +
total_home_visits + in_person_visits + virtual_visits +
funding_per_1k_poverty + families_per_1k_poverty + visits_per_1k_poverty +
funding_per_family + funding_per_visit + visits_per_FTE + caseload_utilization,
data = data, render.missing = NULL,
render.continuous = "median (IQR)",
caption = "Overall Descriptive Summary (HEOR) — 2023")
| Variable | Overall (N=10572) |
|---|---|
| base_award | 8930000 (6010000) |
| matching_award | 726000 (0) |
| total_funding | 9650000 (6010000) |
| HV_staff_FTE | 85.0 (72.1) |
| max_caseload | 1260 (1160) |
| agencies | 15.0 (13.0) |
| families_served | 1590 (1270) |
| adults | 1620 (1330) |
| children | 1560 (1340) |
| total_home_visits | 19900 (11300) |
| in_person_visits | 15300 (10100) |
| virtual_visits | 2800 (4400) |
| funding_per_1k_poverty | 1480000 (2280000) |
| families_per_1k_poverty | 223 (276) |
| visits_per_1k_poverty | 2650 (3720) |
| funding_per_family | 6290 (4050) |
| funding_per_visit | 525 (342) |
| visits_per_FTE | 213 (102) |
| caseload_utilization | 1.18 (0.281) |
# table1: By quartile (not by state!)
table1(~ total_funding + HV_staff_FTE + agencies + max_caseload +
families_served + total_home_visits +
funding_per_1k_poverty + families_per_1k_poverty + visits_per_1k_poverty +
funding_per_family + funding_per_visit + visits_per_FTE + caseload_utilization +
in_person_share + virtual_share | poverty_quartile,
data = data, render.missing = NULL,
render.continuous = "median (IQR)",
caption = "Program Summary by Poverty Load (Quartiles) — 2023")
| Variable | Lowest load (N=2643) | Q2 (N=2643) | Q3 (N=2643) | Highest load (N=2643) | Overall (N=10572) |
|---|---|---|---|---|---|
| total_funding | 8090000 (2750000) | 9600000 (5000000) | 10000000 (2900000) | 12400000 (20800000) | 9650000 (6010000) |
| HV_staff_FTE | 90.0 (53.9) | 76.5 (65.6) | 80.3 (43.4) | 91.3 (1040) | 85.0 (72.1) |
| agencies | 16.0 (18.0) | 12.0 (9.00) | 16.0 (13.0) | 17.0 (27.0) | 15.0 (13.0) |
| max_caseload | 1420 (717) | 1030 (732) | 1360 (923) | 2510 (10100) | 1260 (1160) |
| families_served | 1650 (757) | 1400 (738) | 1650 (1080) | 2600 (5940) | 1590 (1270) |
| total_home_visits | 21200 (8680) | 18800 (9320) | 19700 (9730) | 24200 (67100) | 19900 (11300) |
| funding_per_1k_poverty | 3650000 (1510000) | 2250000 (1310000) | 1060000 (738000) | 379000 (765000) | 1480000 (2280000) |
| families_per_1k_poverty | 591 (3790) | 383 (209) | 158 (142) | 37.9 (205) | 223 (276) |
| visits_per_1k_poverty | 7670 (68300) | 4290 (2630) | 1980 (2280) | 353 (2320) | 2650 (3720) |
| funding_per_family | 5970 (4080) | 6360 (4100) | 5880 (2300) | 7790 (6460) | 6290 (4050) |
| funding_per_visit | 484 (240) | 524 (266) | 570 (333) | 635 (462) | 525 (342) |
| visits_per_FTE | 206 (86.3) | 261 (143) | 203 (94.1) | 173 (144) | 213 (102) |
| caseload_utilization | 1.30 (0.234) | 1.36 (0.257) | 1.16 (0.0808) | 0.950 (0.679) | 1.18 (0.281) |
| in_person_share | 0.875 (0.0838) | 0.844 (0.109) | 0.848 (0.0752) | 0.860 (0.136) | 0.860 (0.0954) |
| virtual_share | 0.125 (0.0821) | 0.146 (0.107) | 0.141 (0.104) | 0.140 (0.136) | 0.140 (0.103) |
The quartile table confirms that each quartile contains the same number of observations, which supports comparisons without any single state dominating the averages. The lowest-load quartile consists mostly of lower-population states with smaller total poverty counts, while the highest-load quartile consists of large-population states that account for a large share of the national poverty population.
When we summarize key metrics by quartile, average funding per 1,000 in poverty, average families served per 1,000 in poverty, and average visits per 1,000 in poverty all decline sharply and monotonically from the lowest-load to the highest-load quartile.
This pattern suggests that MIECHV resources have not been fully rebalanced toward states with the greatest poverty burden. High-poverty states tend to receive lower funding and deliver fewer visits relative to the size of their need.
# Numeric Summary (wide table + boxplots)
desc_tbl <- data %>%
summarise(across(all_of(vars_summary), list(
n = ~sum(!is.na(.)),
mean = ~mean(., na.rm = TRUE),
sd = ~sd(., na.rm = TRUE),
p25 = ~quantile(., 0.25, na.rm = TRUE),
median = ~median(., na.rm = TRUE),
p75 = ~quantile(., 0.75, na.rm = TRUE),
min = ~min(., na.rm = TRUE),
max = ~max(., na.rm = TRUE)
), .names = "{.col}.{.fn}"))
dt(desc_tbl, "Numeric Descriptive Summary (Wide)")
The numeric descriptive summary and boxplots show wide dispersion across states in total funding, families served, total home visits, and the per-unit metrics such as visits per FTE and funding per family.
The distributions are right skewed for scale variables such as total funding and total visits, which is expected given the mix of small and large states. The per unit metrics (per 1,000 in poverty) also show considerable variation, which reflects differences in state funding formulas, matched funds, and program design choices.
# Boxplots for key metrics
box_long <- data %>%
select(all_of(vars_box)) %>%
pivot_longer(everything(), names_to = "metric", values_to = "value")
p_boxes <- ggplot(box_long, aes(metric, value)) +
geom_boxplot(outlier.alpha = .4, fill = "#8856a7") +
labs(title = "Distribution of key program metrics", x = NULL, y = "Value") +
theme_minimal()
plotly::ggplotly(p_boxes)
The boxplots highlight this dispersion: the scale metrics show long right tails driven by a few large states, while the per-unit metrics also vary widely across states.
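Because these metrics span several orders of magnitude, a log-scaled variant of the same plot (a sketch reusing p_boxes) can make the comparison easier to read:
# Sketch: same boxplot on a log10 scale; assumes all plotted values are positive
p_boxes_log <- p_boxes +
  scale_y_log10(labels = scales::comma) +
  labs(y = "Value (log10 scale)")
plotly::ggplotly(p_boxes_log)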
cors <- cor(data[, vars_corr], use = "pairwise.complete.obs")
heatmaply::heatmaply_cor(cors, k_col = 2, k_row = 2,
xlab = "Variables", ylab = "Variables",
main = "Correlation matrix (interactive)")
# 2. Test significance of correlations
p_mat <- psych::corr.test(data[, vars_corr], adjust = "none")$p # gives p-values
# 3. Mask non-significant correlations (replace with 0 to keep clustering)
alpha <- 0.05
cors_sig <- cors
cors_sig[p_mat > alpha] <- 0 # keep clustering valid
# 4. Plot with diverging colors
heatmaply_cor(
cors_sig,
k_col = 2, k_row = 2,
xlab = "Variables", ylab = "Variables",
main = "Significant Correlations (p < 0.05)",
colors = colorRampPalette(c("#238443", "#ece7f2", "#cc4c02"))(200) # green = negative, orange = positive
)
The correlation heatmaps summarize the pairwise associations among the key funding, reach, and efficiency metrics.
The second heatmap, which masks non-significant correlations, shows that several of the strongest positive and negative relationships are statistically robust, while others are weak or indistinguishable from zero in this sample of 51 states.
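With 13 variables there are 78 pairwise tests, so a multiple-testing correction is worth checking before masking; a minimal sketch applying a Benjamini–Hochberg adjustment to the p-value matrix computed above:
# Sketch: BH-adjust the pairwise p-values, then re-mask the correlation matrix
p_adj <- p_mat
p_adj[upper.tri(p_adj)] <- p.adjust(p_mat[upper.tri(p_mat)], method = "BH")
p_adj[lower.tri(p_adj)] <- t(p_adj)[lower.tri(p_adj)]  # keep the matrix symmetric
cors_bh <- cors
cors_bh[p_adj > alpha] <- 0
heatmaply_cor(cors_bh, k_col = 2, k_row = 2,
              main = "Significant correlations (BH-adjusted p < 0.05)")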
The OLS models treat each state as one observation and estimate how outcomes are associated with funding, staffing, caseload, and poverty after adjustment. Robust HC1 standard errors are used to handle heteroskedasticity.
# Helper: tidy robust results
tidy_robust <- function(model, model_name, type = "HC1") {
# robust VCOV
robust_vcov <- sandwich::vcovHC(model, type = type)
# coeftest gives matrix of coef, robust SE, t, p
rob <- lmtest::coeftest(model, vcov = robust_vcov)
# convert to tibble
tibble::tibble(
term = rownames(rob),
estimate = rob[, 1],
std.error = rob[, 2],
statistic = rob[, 3],
p.value = rob[, 4],
model = model_name
) %>%
# add 95% CIs
mutate(
conf.low = estimate - 1.96 * std.error,
conf.high = estimate + 1.96 * std.error
)
}
# Run models
m1 <- lm(families_served ~ total_funding + HV_staff_FTE + agencies + max_caseload + total_poverty_population, data = data)
m2 <- lm(total_home_visits ~ total_funding + HV_staff_FTE + agencies + max_caseload + total_poverty_population, data = data)
m3 <- lm(visits_per_FTE ~ funding_per_family + agencies + max_caseload + total_poverty_population, data = data)
# Collect robust results
coef_df <- bind_rows(
tidy_robust(m1, "Robust OLS: families_served"),
tidy_robust(m2, "Robust OLS: total_home_visits"),
tidy_robust(m3, "Robust OLS: visits_per_FTE")
) %>%
filter(term != "(Intercept)")
# print
#print(coef_df)
# pretty table
coef_df %>%
gt(groupname_col = "model") %>%
gt::fmt_number(columns = c(estimate, std.error, conf.low, conf.high),
decimals = 2) %>%
gt::tab_header(title = "OLS with Robust SE (HC1)")
OLS with Robust SE (HC1)

| term | estimate | std.error | statistic | p.value | conf.low | conf.high |
|---|---|---|---|---|---|---|
| Robust OLS: families_served | ||||||
| total_funding | 0.00 | 0.00 | -23.2741152 | 6.850503e-117 | 0.00 | 0.00 |
| HV_staff_FTE | −5.93 | 0.09 | -67.2019261 | 0.000000e+00 | −6.10 | −5.75 |
| agencies | 54.83 | 0.63 | 86.6923355 | 0.000000e+00 | 53.60 | 56.07 |
| max_caseload | 1.12 | 0.01 | 104.4865144 | 0.000000e+00 | 1.10 | 1.14 |
| total_poverty_population | 0.00 | 0.00 | -10.5803121 | 4.962206e-26 | 0.00 | 0.00 |
| Robust OLS: total_home_visits | ||||||
| total_funding | 0.00 | 0.00 | -63.7799020 | 0.000000e+00 | 0.00 | 0.00 |
| HV_staff_FTE | −119.90 | 3.02 | -39.6863515 | 3.428816e-321 | −125.83 | −113.98 |
| agencies | 1,968.03 | 22.72 | 86.6029418 | 0.000000e+00 | 1,923.49 | 2,012.57 |
| max_caseload | 19.25 | 0.37 | 52.5198281 | 0.000000e+00 | 18.53 | 19.97 |
| total_poverty_population | 0.00 | 0.01 | -0.4610508 | 6.447717e-01 | −0.02 | 0.01 |
| Robust OLS: visits_per_FTE | ||||||
| funding_per_family | 0.00 | 0.00 | -86.7020796 | 0.000000e+00 | 0.00 | 0.00 |
| agencies | 5.06 | 0.10 | 51.8624843 | 0.000000e+00 | 4.87 | 5.26 |
| max_caseload | −0.03 | 0.00 | -109.2238270 | 0.000000e+00 | −0.03 | −0.03 |
| total_poverty_population | 0.00 | 0.00 | -10.6661825 | 1.996466e-26 | 0.00 | 0.00 |
Outcome: families_served. Predictors: total funding, HV staff FTE, agencies, max caseload, poverty population.
Key findings: maximum caseload (about 1.1 additional families per caseload slot) and the number of agencies (about 55 additional families per agency) carry the largest positive coefficients. The coefficients on total funding and poverty population are very small in raw units, and the negative coefficient on HV staff FTE most likely reflects its strong collinearity with caseload (VIF above 50) rather than a genuine negative effect.
Interpretation: At the state level, capacity structure (caseload and agencies) is more strongly associated with how many families are served than the total award amount, once we account for poverty burden. This points to the importance of how funding is translated into slots and agencies, not only the size of the check.
Outcome: total_home_visits. Predictors: same as Model 1.
Key findings: the number of agencies (roughly 1,970 additional visits per agency) and maximum caseload (about 19 additional visits per caseload slot) dominate. The total funding coefficient is very small in raw units, the poverty population term is not statistically significant, and the negative HV staff FTE coefficient again points to collinearity among the capacity measures.
Interpretation: The volume of visits is driven more by structural capacity and the number of agencies than by raw funding levels. This suggests that simply increasing funding without addressing caseload limits and agency infrastructure may not yield proportional increases in visits.
Outcome: visits_per_FTE. Predictors: funding per family, agencies, max caseload, poverty population.
Key findings: higher funding per family and higher maximum caseload are both associated with fewer visits per FTE, while a larger number of agencies is associated with more visits per FTE (about 5 additional visits per FTE per agency); the poverty population term is small and negative.
Interpretation: Efficiency measured as visits per FTE is strongly shaped by how resources are allocated per family and by caseload design. Higher funding per family and very high caseload limits are both associated with lower productivity per FTE, which may reflect a tradeoff between intensity and throughput.
# Multicollinearity (VIF + kappa)
vif_m1 <- car::vif(m1); kable(data.frame(term=names(vif_m1), VIF=as.numeric(vif_m1)),
caption="OLS m1 VIF", digits=2)
| term | VIF |
|---|---|
| total_funding | 3.71 |
| HV_staff_FTE | 52.60 |
| agencies | 2.71 |
| max_caseload | 69.39 |
| total_poverty_population | 1.28 |
vif_m2 <- car::vif(m2); kable(data.frame(term=names(vif_m2), VIF=as.numeric(vif_m2)),
caption="OLS m2 VIF", digits=2)
| term | VIF |
|---|---|
| total_funding | 3.71 |
| HV_staff_FTE | 52.60 |
| agencies | 2.71 |
| max_caseload | 69.39 |
| total_poverty_population | 1.28 |
vif_m3 <- car::vif(m3); kable(data.frame(term=names(vif_m3), VIF=as.numeric(vif_m3)),
caption="OLS m3 VIF", digits=2)
| term | VIF |
|---|---|
| funding_per_family | 1.02 |
| agencies | 1.97 |
| max_caseload | 2.16 |
| total_poverty_population | 1.16 |
# print table
kable(data.frame(model=c("m1","m2","m3"),
kappa=c(kappa(model.matrix(m1)),
kappa(model.matrix(m2)),
kappa(model.matrix(m3)))),
caption="Condition number (kappa) — OLS", digits=1)
| model | kappa |
|---|---|
| m1 | 20517776.5 |
| m2 | 20517776.5 |
| m3 | 39861.3 |
# Rule of thumb: VIF > 5 (sometimes 10) or kappa > ~30 indicates multicollinearity.
# Heteroskedasticity (Breusch–Pagan)
kable(data.frame(
model=c("m1","m2","m3"),
BP_stat=c(bptest(m1)$statistic, bptest(m2)$statistic, bptest(m3)$statistic),
p_value=c(bptest(m1)$p.value, bptest(m2)$p.value, bptest(m3)$p.value)
), caption="Breusch–Pagan test (OLS)", digits=4)
| model | BP_stat | p_value |
|---|---|---|
| m1 | 1552.6989 | 0 |
| m2 | 3749.4292 | 0 |
| m3 | 568.1124 | 0 |
# p < 0.05 => heteroskedasticity (you already use robust HC1 SEs, so you're covered).
# Functional form (RESET)
kable(data.frame(
model=c("m1","m2","m3"),
RESET_p=c(resettest(m1, power=2:3, type="regressor")$p.value,
resettest(m2, power=2:3, type="regressor")$p.value,
resettest(m3, power=2:3, type="regressor")$p.value)
), caption="RESET test p-values (OLS)", digits=4)
| model | RESET_p |
|---|---|
| m1 | 0 |
| m2 | 0 |
| m3 | 0 |
# p < 0.05 => consider logs / quadratic terms / interactions.
# Residual normality (Shapiro-Wilk is skipped below: shapiro.test() accepts at most 5,000 observations and this file has 10,572 rows)
length(residuals(m1))
[1] 10572
length(residuals(m2))
[1] 10572
length(residuals(m3))
[1] 10572
# kable(data.frame(
# model=c("m1","m2","m3"),
# Shapiro_p=c(shapiro.test(residuals(m1))$p.value,
# shapiro.test(residuals(m2))$p.value,
# shapiro.test(residuals(m3))$p.value)
# ), caption="Residual normality (Shapiro-Wilk) — OLS", digits=4)
# With robust SEs, exact residual normality is not critical for the inference reported here.
# Diagnostic plots (base R)
# Each 'plot(model, which = 1:4)' shows four standard diagnostics: residuals vs fitted, normal Q-Q, scale-location, and Cook's distance.
par(mfrow=c(2,2))
plot(m1, which=1:4, main="m1 diagnostics")
plot(m2, which=1:4, main="m2 diagnostics")
plot(m3, which=1:4, main="m3 diagnostics")
par(mfrow=c(1,1))
The VIF and condition-number statistics indicate substantial multicollinearity, especially among funding, caseload, and staffing, which is expected in a small sample of states. The Breusch–Pagan tests show clear evidence of heteroskedasticity, which is why robust standard errors are used. RESET tests suggest that non-linear terms or log transformations could be explored in future work, but for this descriptive HEOR analysis the linear specification is acceptable.
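As one possible follow-up suggested by the RESET results, a log-log variant of the visit-volume model can serve as a functional-form sensitivity check (a sketch, not part of the main specification; log1p() keeps zero counts in the sample):
# Sketch: log-log sensitivity check for model 2, with HC1 robust SEs
m2_log <- lm(log1p(total_home_visits) ~ log1p(total_funding) + log1p(HV_staff_FTE) +
               log1p(agencies) + log1p(max_caseload) + log1p(total_poverty_population),
             data = data)
lmtest::coeftest(m2_log, vcov = sandwich::vcovHC(m2_log, type = "HC1"))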
ggplot(coef_df, aes(x = estimate, y = term,
xmin = conf.low, xmax = conf.high,
color = model)) +
geom_point(position = position_dodge(width = 0.7), size = 2) +
geom_errorbarh(position = position_dodge(width = 0.7), height = 0.2) +
geom_vline(xintercept = 0, linetype = "dashed", color = "grey40") +
labs(
title = "Coefficient Plot with Robust SEs (95% CI)",
x = "Estimate",
y = "Predictor"
) +
theme_minimal(base_size = 14) +
theme(legend.position = "bottom")+
facet_wrap(~model, scales = "free_x")
# standardize selected predictors
data_std <- data %>%
mutate(across(
c(total_funding, HV_staff_FTE, agencies, max_caseload,
total_poverty_population, funding_per_family),
~ as.numeric(scale(.)) # return plain numeric vectors rather than one-column matrices
))
m1_std <- lm(families_served ~ total_funding + HV_staff_FTE + agencies + max_caseload + total_poverty_population, data = data_std)
m2_std <- lm(total_home_visits ~ total_funding + HV_staff_FTE + agencies + max_caseload + total_poverty_population, data = data_std)
m3_std <- lm(visits_per_FTE ~ funding_per_family + agencies + max_caseload + total_poverty_population, data = data_std)
coef_df_std <- bind_rows(
tidy_robust(m1_std, "families_served"),
tidy_robust(m2_std, "total_home_visits"),
tidy_robust(m3_std, "visits_per_FTE")
) %>% filter(term != "(Intercept)")
# pretty table
coef_df_std %>%
gt(groupname_col = "model") %>%
gt::fmt_number(columns = c(estimate, std.error, conf.low, conf.high),
decimals = 2) %>%
gt::tab_header(title = "OLS with Robust SE (HC1)")
Standardized OLS with Robust SE (HC1)

| term | estimate | std.error | statistic | p.value | conf.low | conf.high |
|---|---|---|---|---|---|---|
| families_served | ||||||
| total_funding | −166.26 | 7.14 | -23.2741152 | 6.850503e-117 | −180.26 | −152.26 |
| HV_staff_FTE | −1,772.03 | 26.37 | -67.2019261 | 0.000000e+00 | −1,823.72 | −1,720.35 |
| agencies | 668.43 | 7.71 | 86.6923355 | 0.000000e+00 | 653.32 | 683.55 |
| max_caseload | 3,234.33 | 30.95 | 104.4865144 | 0.000000e+00 | 3,173.66 | 3,295.01 |
| total_poverty_population | −37.72 | 3.57 | -10.5803121 | 4.962206e-26 | −44.71 | −30.73 |
| total_home_visits | ||||||
| total_funding | −15,160.50 | 237.70 | -63.7799020 | 0.000000e+00 | −15,626.40 | −14,694.61 |
| HV_staff_FTE | −35,854.28 | 903.44 | -39.6863515 | 3.428816e-321 | −37,625.02 | −34,083.53 |
| agencies | 23,990.13 | 277.01 | 86.6029418 | 0.000000e+00 | 23,447.18 | 24,533.07 |
| max_caseload | 55,429.19 | 1,055.40 | 52.5198281 | 0.000000e+00 | 53,360.62 | 57,497.77 |
| total_poverty_population | −55.01 | 119.31 | -0.4610508 | 6.447717e-01 | −288.84 | 178.83 |
| visits_per_FTE | ||||||
| funding_per_family | −32.00 | 0.37 | -86.7020796 | 0.000000e+00 | −32.73 | −31.28 |
| agencies | 61.73 | 1.19 | 51.8624843 | 0.000000e+00 | 59.40 | 64.06 |
| max_caseload | −79.95 | 0.73 | -109.2238270 | 0.000000e+00 | −81.38 | −78.51 |
| total_poverty_population | −6.91 | 0.65 | -10.6661825 | 1.996466e-26 | −8.18 | −5.64 |
# forest plot
ggplot(coef_df_std, aes(x = estimate, y = term,
xmin = conf.low, xmax = conf.high,
color = model)) +
geom_point(size = 2) +
geom_errorbarh(height = 0.2) +
geom_vline(xintercept = 0, linetype = "dashed", color = "grey40") +
labs(
title = "Standardized Coefficient Plot with Robust SEs (95% CI)",
x = "Standardized Estimate (per 1 SD change in predictor)",
y = "Predictor"
) +
theme_minimal(base_size = 14) +
theme(legend.position = "bottom") +
facet_wrap(~model, scales = "free_x")
The WLS models use the poverty population as weights, so states with more people in poverty receive more influence in the estimation. The specification is the same as in the OLS models.
# Run models
m1_wls <- lm(families_served ~ total_funding + HV_staff_FTE + agencies + max_caseload + total_poverty_population, data = data, weights = total_poverty_population)
m2_wls <- lm(total_home_visits ~ total_funding + HV_staff_FTE + agencies + max_caseload + total_poverty_population, data = data, weights = total_poverty_population)
m3_wls <- lm(visits_per_FTE ~ funding_per_family + agencies + max_caseload + total_poverty_population, data = data, weights = total_poverty_population)
# Collect robust results
coef_df_wls <- bind_rows(
tidy_robust(m1_wls, "Robust WLS: families_served"),
tidy_robust(m2_wls, "Robust WLS: total_home_visits"),
tidy_robust(m3_wls, "Robust WLS: visits_per_FTE")
) %>%
filter(term != "(Intercept)")
# print
#print(coef_df_wls)
# pretty table
coef_df_wls %>%
gt(groupname_col = "model") %>%
gt::fmt_number(columns = c(estimate, std.error, conf.low, conf.high),
decimals = 2) %>%
gt::tab_header(title = "WLS with Robust SE (HC1)")
WLS with Robust SE (HC1)

| term | estimate | std.error | statistic | p.value | conf.low | conf.high |
|---|---|---|---|---|---|---|
| Robust WLS: families_served | ||||||
| total_funding | 0.00 | 0.00 | -1.277670e+01 | 4.161786e-37 | 0.00 | 0.00 |
| HV_staff_FTE | −3.86 | 0.03 | -1.340519e+02 | 0.000000e+00 | −3.92 | −3.81 |
| agencies | 44.84 | 0.82 | 5.483422e+01 | 0.000000e+00 | 43.24 | 46.45 |
| max_caseload | 0.89 | 0.00 | 2.499752e+02 | 0.000000e+00 | 0.88 | 0.90 |
| total_poverty_population | 0.00 | 0.00 | 2.584461e+00 | 9.766428e-03 | 0.00 | 0.00 |
| Robust WLS: total_home_visits | ||||||
| total_funding | 0.00 | 0.00 | -3.025257e+01 | 6.950843e-193 | 0.00 | 0.00 |
| HV_staff_FTE | −59.10 | 1.22 | -4.837725e+01 | 0.000000e+00 | −61.49 | −56.70 |
| agencies | 1,290.91 | 34.50 | 3.741807e+01 | 7.067570e-288 | 1,223.29 | 1,358.53 |
| max_caseload | 11.75 | 0.14 | 8.385819e+01 | 0.000000e+00 | 11.47 | 12.02 |
| total_poverty_population | 0.00 | 0.01 | 8.257865e-03 | 9.934114e-01 | −0.01 | 0.01 |
| Robust WLS: visits_per_FTE | ||||||
| funding_per_family | 0.00 | 0.00 | -4.349377e+01 | 0.000000e+00 | 0.00 | 0.00 |
| agencies | 3.90 | 0.15 | 2.524885e+01 | 1.242939e-136 | 3.60 | 4.20 |
| max_caseload | −0.03 | 0.00 | -7.058299e+01 | 0.000000e+00 | −0.03 | −0.02 |
| total_poverty_population | 0.00 | 0.00 | -3.668300e+00 | 2.453687e-04 | 0.00 | 0.00 |
Key patterns: the WLS estimates mirror the OLS results. Agencies and maximum caseload remain the dominant positive predictors of families served and visit volume, the total funding coefficients remain very small in raw units, and higher funding per family and higher caseload ceilings remain associated with fewer visits per FTE.
Interpretation: When we give more weight to large poverty states, the story is consistent. Structural capacity and caseload design drive most of the variation in reach and visit volume. Efficiency per FTE is lower in high cost per family models and in states with very high caseload ceilings.
The WLS results therefore reinforce the conclusion that efficiency and reach are linked more to program design and capacity structure than to total dollars alone.
# Multicollinearity (VIF+Kappa)
vif_m1w <- car::vif(m1_wls); kable(data.frame(term=names(vif_m1w), VIF=as.numeric(vif_m1w)),
caption="WLS m1 VIF", digits=2)
| term | VIF |
|---|---|
| total_funding | 7.53 |
| HV_staff_FTE | 88.12 |
| agencies | 5.19 |
| max_caseload | 124.55 |
| total_poverty_population | 1.39 |
vif_m2w <- car::vif(m2_wls); kable(data.frame(term=names(vif_m2w), VIF=as.numeric(vif_m2w)),
caption="WLS m2 VIF", digits=2)
| term | VIF |
|---|---|
| total_funding | 7.53 |
| HV_staff_FTE | 88.12 |
| agencies | 5.19 |
| max_caseload | 124.55 |
| total_poverty_population | 1.39 |
vif_m3w <- car::vif(m3_wls); kable(data.frame(term=names(vif_m3w), VIF=as.numeric(vif_m3w)),
caption="WLS m3 VIF", digits=2)
| term | VIF |
|---|---|
| funding_per_family | 1.03 |
| agencies | 3.73 |
| max_caseload | 3.78 |
| total_poverty_population | 1.00 |
kable(data.frame(model=c("m1_wls","m2_wls","m3_wls"),
kappa=c(kappa(model.matrix(m1_wls)),
kappa(model.matrix(m2_wls)),
kappa(model.matrix(m3_wls)))),
caption="Condition number (kappa) — WLS", digits=1)
| model | kappa |
|---|---|
| m1_wls | 20517776.5 |
| m2_wls | 20517776.5 |
| m3_wls | 39861.3 |
# Heteroskedasticity on WLS residuals (often reduced but still check)
kable(data.frame(
model=c("m1_wls","m2_wls","m3_wls"),
BP_stat=c(bptest(m1_wls)$statistic, bptest(m2_wls)$statistic, bptest(m3_wls)$statistic),
p_value=c(bptest(m1_wls)$p.value, bptest(m2_wls)$p.value, bptest(m3_wls)$p.value)
), caption="Breusch–Pagan test (WLS)", digits=4)
| model | BP_stat | p_value |
|---|---|---|
| m1_wls | 141535756 | 0 |
| m2_wls | 141579040 | 0 |
| m3_wls | 141519535 | 0 |
# RESET (functional form) on WLS
kable(data.frame(
model=c("m1_wls","m2_wls","m3_wls"),
RESET_p=c(resettest(m1_wls, power=2:3, type="regressor")$p.value,
resettest(m2_wls, power=2:3, type="regressor")$p.value,
resettest(m3_wls, power=2:3, type="regressor")$p.value)
), caption="RESET test p-values (WLS)", digits=4)
| model | RESET_p |
|---|---|
| m1_wls | 0 |
| m2_wls | 0 |
| m3_wls | 0 |
# Diagnostic plots (base R)
par(mfrow=c(2,2))
plot(m1_wls, which=1:4, main="m1_wls diagnostics")
plot(m2_wls, which=1:4, main="m2_wls diagnostics")
plot(m3_wls, which=1:4, main="m3_wls diagnostics")
par(mfrow=c(1,1))
ggplot(coef_df_wls, aes(x = estimate, y = term,
xmin = conf.low, xmax = conf.high,
color = model)) +
geom_point(position = position_dodge(width = 0.7), size = 2) +
geom_errorbarh(position = position_dodge(width = 0.7), height = 0.2) +
geom_vline(xintercept = 0, linetype = "dashed", color = "grey40") +
labs(
title = "WLS Coefficient Plot with Robust SEs (95% CI)",
x = "Estimate",
y = "Predictor"
) +
theme_minimal(base_size = 14) +
theme(legend.position = "bottom")+
facet_wrap(~model, scales = "free_x")
# standardize selected predictors
data_std <- data %>%
# remove NA or negative weights first
filter(!is.na(total_poverty_population) & total_poverty_population > 0) %>%
# then standardize predictors (total_poverty_population is left unscaled because it is used as the regression weight)
mutate(across(
c(total_funding, HV_staff_FTE, agencies, max_caseload,
funding_per_family),
~ as.numeric(scale(.))
))
m1_wls_std <- lm(families_served ~ total_funding + HV_staff_FTE + agencies + max_caseload + total_poverty_population, data = data_std, weights = total_poverty_population)
m2_wls_std <- lm(total_home_visits ~ total_funding + HV_staff_FTE + agencies + max_caseload + total_poverty_population, data = data_std, weights = total_poverty_population)
m3_wls_std <- lm(visits_per_FTE ~ funding_per_family + agencies + max_caseload + total_poverty_population, data = data_std, weights = total_poverty_population)
coef_df_std_wls <- bind_rows(
tidy_robust(m1_wls_std, "families_served"),
tidy_robust(m2_wls_std, "total_home_visits"),
tidy_robust(m3_wls_std, "visits_per_FTE")
) %>% filter(term != "(Intercept)")
# pretty table
coef_df_std_wls %>%
gt(groupname_col = "model") %>%
gt::fmt_number(columns = c(estimate, std.error, conf.low, conf.high),
decimals = 2) %>%
gt::tab_header(title = "WLS with Robust SE (HC1)")
Standardized WLS with Robust SE (HC1)

| term | estimate | std.error | statistic | p.value | conf.low | conf.high |
|---|---|---|---|---|---|---|
| families_served | ||||||
| total_funding | −83.56 | 6.54 | -1.277670e+01 | 4.161786e-37 | −96.38 | −70.74 |
| HV_staff_FTE | −1,155.18 | 8.62 | -1.340519e+02 | 0.000000e+00 | −1,172.07 | −1,138.29 |
| agencies | 546.65 | 9.97 | 5.483422e+01 | 0.000000e+00 | 527.11 | 566.19 |
| max_caseload | 2,559.27 | 10.24 | 2.499752e+02 | 0.000000e+00 | 2,539.20 | 2,579.33 |
| total_poverty_population | 0.00 | 0.00 | 2.584461e+00 | 9.766428e-03 | 0.00 | 0.00 |
| total_home_visits | ||||||
| total_funding | −7,499.82 | 247.91 | -3.025257e+01 | 6.950843e-193 | −7,985.72 | −7,013.93 |
| HV_staff_FTE | −17,672.15 | 365.30 | -4.837725e+01 | 0.000000e+00 | −18,388.13 | −16,956.16 |
| agencies | 15,736.07 | 420.55 | 3.741807e+01 | 7.067570e-288 | 14,911.80 | 16,560.35 |
| max_caseload | 33,830.16 | 403.42 | 8.385819e+01 | 0.000000e+00 | 33,039.45 | 34,620.86 |
| total_poverty_population | 0.00 | 0.01 | 8.257865e-03 | 9.934114e-01 | −0.01 | 0.01 |
| visits_per_FTE | ||||||
| funding_per_family | −34.84 | 0.80 | -4.349377e+01 | 0.000000e+00 | −36.41 | −33.27 |
| agencies | 47.55 | 1.88 | 2.524885e+01 | 1.242939e-136 | 43.86 | 51.24 |
| max_caseload | −74.00 | 1.05 | -7.058299e+01 | 0.000000e+00 | −76.06 | −71.95 |
| total_poverty_population | 0.00 | 0.00 | -3.668300e+00 | 2.453687e-04 | 0.00 | 0.00 |
# forest plot
ggplot(coef_df_std_wls, aes(x = estimate, y = term,
xmin = conf.low, xmax = conf.high,
color = model)) +
geom_point(size = 2) +
geom_errorbarh(height = 0.2) +
geom_vline(xintercept = 0, linetype = "dashed", color = "grey40") +
labs(
title = "WLS Standardized Coefficient Plot with Robust SEs (95% CI)",
x = "Standardized Estimate (per 1 SD change in predictor)",
y = "Predictor"
) +
theme_minimal(base_size = 14) +
theme(legend.position = "bottom") +
facet_wrap(~model, scales = "free_x")
The efficiency index compares each state’s observed total home visits to the visits predicted by Model 2. An efficiency value greater than 1 indicates that a state delivers more visits than expected given its funding, staffing, caseload, and poverty profile. A value below 1 indicates fewer visits than expected.
data_pred <- data %>%
mutate(
pred_visits = as.numeric(predict(m2, newdata = data)),
efficiency_visits = if_else(!is.na(pred_visits) & pred_visits > 0, total_home_visits / pred_visits, NA_real_)
)
top_eff <- data_pred %>%
arrange(desc(efficiency_visits)) %>%
dplyr::select(states, total_home_visits, pred_visits, efficiency_visits) %>%
slice_head(n = 10)
bot_eff <- data_pred %>%
arrange(efficiency_visits) %>%
dplyr::select(states, total_home_visits, pred_visits, efficiency_visits) %>%
slice_head(n = 10)
tbl(top_eff, "Top 10 by efficiency (Actual / Predicted)")
Top 10 by efficiency (Actual / Predicted)

| states | total_home_visits | pred_visits | efficiency_visits |
|---|---|---|---|
| North Dakota | 2,129.0 | 59.4 | 35.9 |
| North Dakota | 2,129.0 | 59.4 | 35.9 |
| North Dakota | 2,129.0 | 59.4 | 35.9 |
| North Dakota | 2,129.0 | 59.4 | 35.9 |
| North Dakota | 2,129.0 | 59.4 | 35.9 |
| North Dakota | 2,129.0 | 59.4 | 35.9 |
| North Dakota | 2,129.0 | 59.4 | 35.9 |
| North Dakota | 2,129.0 | 59.4 | 35.9 |
| North Dakota | 2,129.0 | 59.4 | 35.9 |
| North Dakota | 2,129.0 | 59.4 | 35.9 |
tbl(bot_eff, "Bottom 10 by efficiency (Actual / Predicted)")
Bottom 10 by efficiency (Actual / Predicted)

| states | total_home_visits | pred_visits | efficiency_visits |
|---|---|---|---|
| Mississippi | 15.0 | 19,033.5 | 0.0 |
| Mississippi | 15.0 | 19,033.5 | 0.0 |
| Mississippi | 15.0 | 19,033.5 | 0.0 |
| Mississippi | 15.0 | 19,033.5 | 0.0 |
| Mississippi | 15.0 | 19,033.5 | 0.0 |
| Mississippi | 15.0 | 19,033.5 | 0.0 |
| Mississippi | 15.0 | 19,033.5 | 0.0 |
| Mississippi | 15.0 | 19,033.5 | 0.0 |
| Mississippi | 15.0 | 19,033.5 | 0.0 |
| Mississippi | 15.0 | 19,033.5 | 0.0 |
From the top and bottom efficiency tables, North Dakota delivers far more visits than the model predicts (about 36 times the predicted volume), while Mississippi records only 15 visits against roughly 19,000 predicted. Because the analytic file contains repeated rows per state, a single state occupies all ten positions in each table.
Interpretation: The efficiency index highlights states that convert funding and capacity into visits particularly well, as well as states where something in the pipeline is limiting visits. This does not prove causality, but it provides a useful starting point for peer learning and technical assistance.
# Bars
p_top <- ggplot(top_eff, aes(reorder(states, efficiency_visits), efficiency_visits)) +
geom_col(fill = "steelblue") + coord_flip() + theme_minimal() +
labs(title = "Top 10 Efficiency", x = NULL, y = "Actual/Predicted")
plotly::ggplotly(p_top)
p_bot <- ggplot(bot_eff, aes(reorder(states, efficiency_visits), efficiency_visits)) +
geom_col(fill="skyblue") + coord_flip() + theme_minimal() +
labs(title = "Bottom 10 Efficiency", x = NULL, y = "Actual/Predicted")
plotly::ggplotly(p_bot)
Each metric is standardized, cost metrics are inverted so that lower cost is better, and the index is computed as the average of the z scores.
z_good <- c("visits_per_1k_poverty","families_per_1k_poverty","visits_per_FTE","caseload_utilization")
z_bad <- c("funding_per_visit","funding_per_family")
z_tbl <- data %>%
mutate(
across(all_of(z_good), ~ as.numeric(scale(.)), .names = "{.col}_z"),
across(all_of(z_bad), ~ as.numeric(scale(.)) * -1, .names = "{.col}_z") # invert cost
) %>%
mutate(
Performance_Index = rowMeans(dplyr::select(., ends_with("_z")), na.rm = TRUE)
) %>%
dplyr::select(states, Performance_Index, ends_with("_z"))
# composite performance index table
league_table <- z_tbl %>%
arrange(desc(Performance_Index)) %>%
mutate(Rank = row_number()) %>%
dplyr::select(Rank, states, Performance_Index)
dt(league_table, "League table — Composite Performance Index (higher = better)")
Key observations: the league table ranks states from the highest to the lowest composite index value, making it easy to identify candidates at both ends of the distribution.
Interpretation: The composite index is a summary indicator that should not replace detailed metrics, but it is useful for flagging candidate states for deeper review and for grouping states into peer tiers for CQI and technical assistance.
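A short sketch (not part of the original analysis, and assuming three tiers are wanted) groups states into peer tiers of the composite index, for example to seed CQI or technical-assistance groupings:
# Sketch: cut the composite index into three peer tiers
league_tiers <- league_table %>%
  mutate(tier = factor(dplyr::ntile(Performance_Index, 3),
                       labels = c("Lower tier", "Middle tier", "Upper tier")))
dt(league_tiers, "Composite Performance Index with peer tiers")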
p_league <- ggplot(league_table %>% slice_head(n = 20),
aes(reorder(states, Performance_Index), Performance_Index)) +
geom_col(fill="#2ca02c") + coord_flip() + theme_minimal() +
labs(title = "Top 20: Composite Performance Index", x = NULL, y = "Index")
plotly::ggplotly(p_league)
check_cooks <- function(model, data, cutoff_rule = 4, top_n = 10) {
cd <- cooks.distance(model)
cutoff <- cutoff_rule / nrow(data)
infl_df <- tibble(
states = data$states,
state_code = data$state_code,
total_home_visits = data$total_home_visits,
total_funding = data$total_funding,
HV_staff_FTE = data$HV_staff_FTE,
total_poverty_population = data$total_poverty_population,
cooks_d = as.numeric(cd)
) %>%
arrange(desc(cooks_d))
cat("Cook's Distance cutoff =", round(cutoff, 4), "\n\n")
# --- Part A: threshold violations ---
# print() is required here: gt tables created inside a function are not auto-printed
if (any(cd > cutoff)) {
print(tbl(
infl_df %>% filter(cooks_d > cutoff),
paste0("Cases above Cook's Distance cutoff (", deparse(substitute(model)), ")")
))
} else {
cat("No cases above the cutoff.\n\n")
}
# --- Part B: top-N ranking ---
print(tbl(
infl_df %>% slice_head(n = top_n),
paste0("Top ", top_n, " Cook's Distance values (", deparse(substitute(model)), ")")
))
# --- Part C: Cook’s Distance plot ---
p <- ggplot(infl_df, aes(x = seq_along(cooks_d), y = cooks_d)) +
geom_col(fill = "#1f77b4") +
geom_hline(yintercept = cutoff, color = "red", linetype = "dashed") +
labs(
title = paste0("Cook's Distance (", deparse(substitute(model)), ")"),
x = "Observation Index",
y = "Cook's Distance"
) +
theme_minimal()
print(p) # show plot
invisible(infl_df) # return full dataframe for further use
}
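The helper above is defined but not called in this report; a minimal usage sketch for the visit-volume model (using m2 and data as created earlier) would be:
# Run the Cook's distance check on the visits model and keep the full ranking
infl_m2 <- check_cooks(m2, data)
# head(infl_m2, 5)  # the five most influential observations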
The equity gap table and bar charts show average funding and visits per 1,000 in poverty by quartile. The pattern is monotonic: average funding per 1,000 in poverty falls from roughly $5.9 million in the lowest-load quartile to roughly $0.5 million in the highest-load quartile, and average visits per 1,000 in poverty fall from about 29,800 to about 1,240.
Interpretation: By design MIECHV funding formulas consider many factors beyond poverty counts, but from an equity lens the data suggest that states with the largest poverty burden receive fewer resources and deliver fewer visits per unit of need. This is a key finding for policy discussion about future allocations and possible equity adjustments.
equity_gap <- data %>%
group_by(poverty_quartile) %>%
summarise(
avg_funding_per_1k = mean(funding_per_1k_poverty, na.rm = TRUE),
avg_visits_per_1k = mean(visits_per_1k_poverty, na.rm = TRUE),
.groups ="drop"
)
tbl(equity_gap, "Equity gap by poverty-load quartile")
Equity gap by poverty-load quartile

| poverty_quartile | avg_funding_per_1k | avg_visits_per_1k |
|---|---|---|
| Lowest load | 5,917,401.7 | 29,822.6 |
| Q2 | 2,090,619.0 | 3,947.4 |
| Q3 | 1,152,155.0 | 2,065.7 |
| Highest load | 494,613.3 | 1,238.0 |
# plot
q_long <- equity_gap %>%
pivot_longer(-poverty_quartile, names_to = "metric", values_to = "value")
ggplot(q_long, aes(poverty_quartile, value, fill = metric)) +
geom_col(position = "dodge") +
theme_minimal() +
labs(title = "Equity metrics by quartile", x = NULL, y = "Average")
ggplot(data, aes(total_funding/1e6, total_home_visits, label = state_code)) +
geom_point(color = "steelblue") +
ggrepel::geom_text_repel(size = 3, max.overlaps = 20) +
labs(x = "Funding (Millions USD)", y = "Total Home Visits",
title = "Efficiency frontier: Funding vs Visits") +
theme_minimal()
The clustering analysis uses four key metrics: funding per 1,000 in poverty, families served per 1,000 in poverty, visits per FTE, and caseload utilization.
States are grouped into four clusters after standardization, and principal component analysis is used to visualize the clusters.
# select, scale, cluster
clust_vars <- data %>%
select(funding_per_1k_poverty, families_per_1k_poverty, visits_per_FTE, caseload_utilization)
set.seed(123)
clust_scaled <- scale(clust_vars)
km <- kmeans(clust_scaled, centers = 4, nstart = 25)
data <- data %>% mutate(cluster = factor(km$cluster))
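The choice of k = 4 is taken as given; a quick elbow check (a sketch reusing clust_scaled) is one simple way to see whether four clusters is a reasonable choice.
# Sketch: total within-cluster sum of squares for k = 1..8 (elbow heuristic)
set.seed(123)
wss <- sapply(1:8, function(k) kmeans(clust_scaled, centers = k, nstart = 25)$tot.withinss)
plot(1:8, wss, type = "b",
     xlab = "Number of clusters (k)", ylab = "Total within-cluster sum of squares",
     main = "Elbow check for k-means")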
# Counts & membership
states_by_cluster <- data %>%
group_by(cluster) %>%
summarise(n_states = n(), state_codes = paste(state_code, collapse = ", "), .groups = "drop")
dt(states_by_cluster, "States by cluster")
# Unscaled cluster means
cluster_means <- t(apply(km$centers, 1, function(row) {
row * attr(clust_scaled, "scaled:scale") + attr(clust_scaled, "scaled:center")
})) %>%
as.data.frame() %>%
tibble::rownames_to_column("cluster")
tbl(cluster_means, "Cluster centers (original units)")
Cluster centers (original units)

| cluster | funding_per_1k_poverty | families_per_1k_poverty | visits_per_FTE | caseload_utilization |
|---|---|---|---|---|
| 1 | 1,006,879.2 | 201.5 | 62.3 | 0.6 |
| 2 | 24,668,079.9 | 5,729.2 | 203.6 | 1.1 |
| 3 | 1,958,737.3 | 292.0 | 222.5 | 1.2 |
| 4 | 3,004,846.0 | 2,448.9 | 515.2 | 1.4 |
# PCA plot
pca <- prcomp(clust_vars, center = TRUE, scale. = TRUE)
pca_data <- as.data.frame(pca$x[,1:2]) %>%
mutate(states = data$states, state_code = data$state_code, cluster = data$cluster)
# Convex hulls
hulls <- pca_data %>%
group_by(cluster) %>%
slice(chull(PC1, PC2))
p_pca <- ggplot(pca_data, aes(PC1, PC2, color = cluster, text = state_code)) +
geom_polygon(data = hulls, aes(group = cluster, fill = cluster),
alpha = 0.2, color = NA) +
geom_point(size = 4, alpha = 0.9) +
scale_color_brewer(palette = "Set1") +
scale_fill_brewer(palette = "Set1") +
labs(title = "State clusters (PCA of equity & efficiency metrics)") +
theme_minimal()
plotly::ggplotly(p_pca, tooltip = "text")
The cluster centers suggest four distinct profiles: a group with low funding per 1,000 in poverty, low reach, low visits per FTE, and under-used caseload capacity (cluster 1); a small group of states with very high funding and reach per 1,000 in poverty (cluster 2); a group with moderate funding, moderate reach, and solid productivity and utilization (cluster 3); and a group that combines high reach with the highest visits per FTE and caseload utilization (cluster 4).
The PCA plots and convex hulls show that clusters are reasonably separated in the first two principal components, which capture most of the variance in these metrics.
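The claim that the first two components capture most of the variance can be checked directly from the prcomp fit above:
# Proportion of variance explained by PC1 and PC2
round(summary(pca)$importance["Proportion of Variance", 1:2], 3)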
hulls <- pca_data %>%
group_by(cluster) %>%
slice(chull(PC1, PC2)) # convex hull points per cluster
ggplot(pca_data, aes(PC1, PC2, color = cluster, fill = cluster)) +
geom_polygon(data = hulls, aes(group = cluster), alpha = 0.2, color = NA) + # shaded hull
geom_point(size = 4, alpha = 0.9) +
geom_text(aes(label = state_code), size = 3, vjust = -1, check_overlap = TRUE) +
scale_color_brewer(palette = "Set1") +
scale_fill_brewer(palette = "Set1") +
labs(
title = "State Clusters (Convex Hulls in PCA space)",
x = "Principal Component 1",
y = "Principal Component 2"
) +
theme_minimal(base_size = 14)
Interpretation: The clusters can be interpreted as peer groups combining
equity and efficiency characteristics. They are useful for identifying
similar states for peer learning, not as rankings. Programs within the
same cluster likely face similar tradeoffs and constraints.
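A simple cross-tabulation (a sketch, not part of the original output) shows how the peer groups line up with the poverty-load quartiles defined earlier:
# Cluster membership by poverty-load quartile
with(data, table(cluster, poverty_quartile))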
The scatter plots of funding per 1,000 in poverty versus visits per FTE, and of families served per 1,000 in poverty versus caseload utilization, overlay the cluster assignments on the underlying equity and efficiency metrics.
hulls <- data %>%
group_by(cluster) %>%
slice(chull(funding_per_1k_poverty, visits_per_FTE))
ggplot(data, aes(x = funding_per_1k_poverty, y = visits_per_FTE,
color = as.factor(cluster), label = state_code)) +
geom_polygon(data = hulls, aes(group = cluster, fill = as.factor(cluster)),
alpha = 0.2, color = NA) +
geom_point(size = 4, alpha = 0.8) +
geom_text(size = 3, vjust = -1, check_overlap = TRUE) +
scale_color_brewer(palette = "Set1") +
scale_fill_brewer(palette = "Set1") +
scale_x_log10(labels = scales::comma) + # log scale because funding per 1k varies a lot
labs(
title = "Equity vs Efficiency: Funding vs Visits per FTE",
subtitle = "States clustered into peer groups",
x = "Funding per 1,000 Poverty Population (log scale)",
y = "Visits per FTE",
color = "Cluster"
) +
theme_minimal(base_size = 14)
# Scatter 2: Families per 1k poverty vs Caseload Utilization
# Recompute the hulls for this metric pair (the earlier hulls were built on funding vs visits per FTE)
hulls2 <- data %>%
group_by(cluster) %>%
slice(chull(families_per_1k_poverty, caseload_utilization))
ggplot(data, aes(x = families_per_1k_poverty, y = caseload_utilization,
color = as.factor(cluster), label = state_code)) +
geom_polygon(data = hulls2, aes(group = cluster, fill = as.factor(cluster)),
alpha = 0.2, color = NA) +
geom_point(size = 4, alpha = 0.8) +
geom_text(size = 3, vjust = -1, check_overlap = TRUE) +
scale_color_brewer(palette = "Set1") +
labs(
title = "Reach vs Capacity Utilization",
subtitle = "Families per 1k poverty vs Caseload Utilization",
x = "Families per 1,000 Poverty Population",
y = "Caseload Utilization",
color = "Cluster"
) +
theme_minimal(base_size = 14)
These plots suggest that the clusters separate along both the equity dimension (funding and reach per 1,000 in poverty) and the efficiency dimension (visits per FTE and caseload utilization).
The funding versus visits frontier plot highlights states that lie above the cloud (high visits for their funding) versus those below. Combined with the efficiency index, this gives a coherent picture of which states are on or near the frontier and which are far below it.
The US maps of funding per 1,000 in poverty, families served per 1,000 in poverty, visits per FTE, and the composite performance index
provide a geographic view of the equity and efficiency patterns. Regions with higher performance index values tend to align with states that have higher reach per poverty population and reasonable cost metrics, while some high poverty regions show systematically lower funding per 1,000 in poverty and weaker reach.
These visuals are especially useful for communicating key messages to leadership and stakeholders who may not engage directly with tables and regression output.
make_geo_map <- function(df, value_col, title, ztitle){
plot_geo(df, locationmode = "USA-states") %>%
add_trace(
z = ~get(value_col),
locations = ~state_code,
color = ~get(value_col),
colors = "Blues",
text = ~paste0(states, " (", state_code, ")<br>", ztitle, ": ",
scales::comma(get(value_col), accuracy = 0.1)), # all mapped metrics are numeric, so format directly
hoverinfo = "text"
) %>%
colorbar(title = ztitle) %>%
layout(title = title, geo = list(scope = "usa"))
}
map1 <- make_geo_map(data, "funding_per_1k_poverty", "Funding per 1,000 poverty population", "USD per 1k")
map2 <- make_geo_map(data, "families_per_1k_poverty", "Families served per 1,000 poverty population", "Families per 1k")
map3 <- make_geo_map(data, "visits_per_FTE", "Visits per FTE", "Visits per FTE")
# Composite index map joins on state_code
map_index_df <- z_tbl %>% left_join(data %>% select(states, state_code), by = "states") %>% distinct()
map4 <- make_geo_map(map_index_df, "Performance_Index", "Composite Performance Index", "Index")
map1; map2; map3; map4
#write.csv(data, "C:/Users/kesha/Downloads/MIECHV_HEOR/MIECHV_HEOR_Data_v1.csv")