Analysis Coverage
✓Descriptive Table
✓Numeric Summary
✓Correlations
✓Econometric Models:OLS
✓Econometric Models:WLS
✓Efficiency Index/div>
✓Composite Performance Index
✓Influence Check
✓Equity Gap
✓Efficiency Benchmark
✓Clustering + PCA
✓Raw Metric Scatter
✓US Maps
Setup
Libraries
library(lmtest) # for testing linear regression
library(writexl)
library(scales)
library(magrittr) # for pipes
library(table1) # for descriptive statistics
library(tidyverse) # for tidy code
library(sessioninfo) # for session_info at bottom
library(dplyr) # for data manipulation
library(details) # for session_info at bottom
library(ggthemes) # for tufte theme
library(ggrepel) # for text plotting
library(patchwork) # for combining plots
library(readxl) # for reading xlxs files
library(kableExtra) # for table headers formatting
library(knitr) # for tables
library(sjPlot) # for model tables
library(broom) # for logistic regression
library(survey) #analysis of complex survey
library(pROC) # ROC curve visualization
library(skimr) # for summary generation
library(ggmosaic) #for mosaic plot
library(car) #for multicollinearity vif
library(generalhoslem) # for goodness of fit
library(ggeffects) # for effect modification visuals
library(usmap) # for US Map
library(plotly) # for map plotting
library (heatmaply) # for heat map
library(DT)
library(gt)
library(htmltools)
# Nice helpers
tbl <- function(df, caption = NULL) {
gt::gt(df) %>%
gt::tab_header(title = caption) %>%
gt::fmt_number(where(is.numeric), decimals = 1, use_seps = TRUE) %>%
gt::opt_table_outline()
}
dt <- function(df, caption = NULL) {
datatable(df, rownames = FALSE, filter = "top",
options = list(pageLength = 10, scrollX = TRUE),
caption = htmltools::tags$caption(style="caption-side: top; text-align: left;", caption))
}
Read & Clean data
df<- read.csv("C:/Users/kesha/Downloads/MIECHV_HEOR/ProgramSizeReach_with_poverty_population.csv")
data <- df %>%
drop_na(everything())
# Centralize variable choices used later (edit these vectors to include/exclude variables)
vars_summary <- c("base_award","matching_award","HV_staff_FTE","max_caseload",
"agencies","families_served","adults","children",
"total_home_visits","in_person_visits","virtual_visits",
"male_poverty","female_poverty","total_poverty_population")
vars_corr <- c("total_funding","HV_staff_FTE","agencies","max_caseload",
"families_served","total_home_visits",
"funding_per_1k_poverty","families_per_1k_poverty","visits_per_1k_poverty",
"funding_per_family","funding_per_visit","visits_per_FTE","caseload_utilization")
vars_box <- c("total_funding","families_served","total_home_visits","visits_per_FTE","funding_per_family")
Derived Metrics (HEOR)
data <- data %>%
mutate(
# Funding aggregates
total_funding = base_award + matching_award,
# Load-adjusted (per 1,000 population in poverty)
funding_per_1k_poverty = if_else(total_poverty_population > 0, total_funding / (total_poverty_population / 1000), NA_real_),
families_per_1k_poverty = if_else(total_poverty_population > 0, families_served / (total_poverty_population / 1000), NA_real_),
visits_per_1k_poverty = if_else(total_poverty_population > 0, total_home_visits / (total_poverty_population / 1000), NA_real_),
# Per-unit productivity / cost
funding_per_family = if_else(families_served > 0, total_funding / families_served, NA_real_),
funding_per_child = if_else(children > 0, total_funding / children, NA_real_),
funding_per_visit = if_else(total_home_visits > 0, total_funding / total_home_visits, NA_real_),
visits_per_family = if_else(families_served > 0, total_home_visits / families_served, NA_real_),
visits_per_FTE = if_else(HV_staff_FTE > 0, total_home_visits / HV_staff_FTE, NA_real_),
families_per_FTE = if_else(HV_staff_FTE > 0, families_served / HV_staff_FTE, NA_real_),
# Capacity/structure
caseload_utilization = if_else(max_caseload > 0, families_served / max_caseload, NA_real_),
households_per_agency = if_else(agencies > 0, families_served / agencies, NA_real_),
FTE_per_agency = if_else(agencies > 0, HV_staff_FTE / agencies, NA_real_),
avg_caseload_per_FTE = if_else(HV_staff_FTE > 0, max_caseload / HV_staff_FTE, NA_real_),
# Modality mix
in_person_share = if_else(total_home_visits > 0, in_person_visits / total_home_visits, NA_real_),
virtual_share = if_else(total_home_visits > 0, virtual_visits / total_home_visits, NA_real_),
# Poverty composition (optional)
poverty_female_share = if_else(total_poverty_population > 0, female_poverty / total_poverty_population, NA_real_),
poverty_male_share = if_else(total_poverty_population > 0, male_poverty / total_poverty_population, NA_real_),
# Quartiles by poverty load
poverty_quartile = ntile(total_poverty_population, 4)
) %>%
mutate(
poverty_quartile = factor(poverty_quartile,
labels = c("Lowest load", "Q2", "Q3", "Highest load")))
Quartile Cut-off
# Quartile cutoffs (table + histogram)
qcuts <- quantile(data$total_poverty_population, c(0,.25,.5,.75,1), na.rm = TRUE)
tbl(data.frame(Quantile = c("Min","Q1","Median","Q3","Max"),
Cutoff = as.numeric(qcuts)),
"Quartile cutoffs for total_poverty_population")
Quartile cutoffs for total_poverty_population |
Quantile |
Cutoff |
Min |
224.0 |
Q1 |
2,368.0 |
Median |
4,404.0 |
Q3 |
10,560.0 |
Max |
81,335.0 |
p_hist <- ggplot(data, aes(total_poverty_population)) +
geom_histogram(bins = 30, fill = "#7fcdbb", color = "white") +
geom_vline(xintercept = qcuts, linetype = c("solid","dashed","dashed","dashed","solid")) +
labs(title = "Poverty population: distribution & quartiles",
x = "Total poverty population", y = "States") +
theme_minimal()
plotly::ggplotly(p_hist)
States by Quartile
states_by_quartile <- data %>%
group_by(poverty_quartile) %>%
summarise(
n_states = n(),
state_codes = paste(state_code, collapse = ", "),
.groups ="drop")
dt(states_by_quartile, "States by poverty-load quartiles")
quartile_summary <- data %>%
group_by(poverty_quartile) %>%
summarise(
n_states = n(),
min_pov = min(total_poverty_population, na.rm = TRUE),
max_pov = max(total_poverty_population, na.rm = TRUE),
avg_funding_per_1k = mean(funding_per_1k_poverty, na.rm = TRUE),
avg_families_per_1k = mean(families_per_1k_poverty, na.rm = TRUE),
avg_visits_per_1k = mean(visits_per_1k_poverty, na.rm = TRUE),
.groups = "drop"
)
tbl(quartile_summary, "Summary by poverty-load quartile")
Summary by poverty-load quartile |
poverty_quartile |
n_states |
min_pov |
max_pov |
avg_funding_per_1k |
avg_families_per_1k |
avg_visits_per_1k |
Lowest load |
13.0 |
224.0 |
2,319.0 |
8,059,051.9 |
1,562.1 |
23,355.5 |
Q2 |
13.0 |
2,417.0 |
4,404.0 |
2,592,599.4 |
407.0 |
5,166.8 |
Q3 |
13.0 |
5,002.0 |
10,629.0 |
1,303,415.7 |
193.9 |
2,312.0 |
Highest load |
12.0 |
12,115.0 |
81,335.0 |
376,162.8 |
67.1 |
709.4 |
# Quick bar of counts
ggplot(states_by_quartile, aes(poverty_quartile, n_states)) +
geom_col(fill= "#fa9fb5") + geom_text(aes(label = n_states), vjust = -0.3) +
labs(title = "State count by poverty-load quartile", x = NULL, y = "Number of states") +
theme_minimal()

Descriptive Table
# table1: Overall
table1(~ base_award + matching_award + total_funding +
HV_staff_FTE + max_caseload + agencies +
families_served + adults + children +
total_home_visits + in_person_visits + virtual_visits +
funding_per_1k_poverty + families_per_1k_poverty + visits_per_1k_poverty +
funding_per_family + funding_per_visit + visits_per_FTE + caseload_utilization,
data = data, render.missing = NULL,
render.continuous = "median (IQR)",
caption = "Overall Descriptive Summary (HEOR) — 2023")
Overall Descriptive Summary (HEOR) — 2023
|
Overall (N=51) |
base_award |
7960000 (6200000) |
matching_award |
726000 (0) |
total_funding |
8560000 (6200000) |
HV_staff_FTE |
74.7 (60.4) |
max_caseload |
1030 (1100) |
agencies |
12.0 (12.0) |
families_served |
1400 (1240) |
adults |
1400 (1290) |
children |
1320 (1130) |
total_home_visits |
16100 (14300) |
in_person_visits |
12000 (12900) |
virtual_visits |
2590 (3540) |
funding_per_1k_poverty |
1860000 (2780000) |
families_per_1k_poverty |
270 (421) |
visits_per_1k_poverty |
3270 (4800) |
funding_per_family |
7260 (3680) |
funding_per_visit |
570 (283) |
visits_per_FTE |
213 (84.7) |
caseload_utilization |
1.18 (0.224) |
# table1: By quartile (not by state!)
table1(~ total_funding + HV_staff_FTE + agencies + max_caseload +
families_served + total_home_visits +
funding_per_1k_poverty + families_per_1k_poverty + visits_per_1k_poverty +
funding_per_family + funding_per_visit + visits_per_FTE + caseload_utilization +
in_person_share + virtual_share | poverty_quartile,
data = data, render.missing = NULL,
render.continuous = "median (IQR)",
caption = "Program Summary by Poverty Load (Quartiles) — 2023")
Program Summary by Poverty Load (Quartiles) — 2023
|
Lowest load (N=13) |
Q2 (N=13) |
Q3 (N=13) |
Highest load (N=12) |
Overall (N=51) |
total_funding |
5750000 (3340000) |
9650000 (2480000) |
10000000 (7060000) |
9100000 (7220000) |
8560000 (6200000) |
HV_staff_FTE |
57.0 (53.1) |
82.2 (25.1) |
80.3 (50.1) |
61.2 (71.5) |
74.7 (60.4) |
agencies |
8.00 (9.00) |
14.0 (9.00) |
15.0 (11.0) |
13.0 (13.3) |
12.0 (12.0) |
max_caseload |
519 (769) |
1200 (392) |
1410 (1180) |
919 (1800) |
1030 (1100) |
families_served |
655 (1120) |
1490 (597) |
1650 (1390) |
1290 (2020) |
1400 (1240) |
total_home_visits |
9380 (16200) |
19900 (8290) |
19700 (12700) |
12300 (17300) |
16100 (14300) |
funding_per_1k_poverty |
6440000 (4900000) |
2950000 (1220000) |
1220000 (1050000) |
244000 (441000) |
1860000 (2780000) |
families_per_1k_poverty |
713 (1500) |
407 (196) |
158 (150) |
34.8 (91.6) |
270 (421) |
visits_per_1k_poverty |
12200 (13500) |
5280 (2550) |
2110 (1710) |
327 (865) |
3270 (4800) |
funding_per_family |
7780 (4700) |
6290 (2430) |
7580 (2170) |
7660 (4900) |
7260 (3680) |
funding_per_visit |
575 (265) |
484 (116) |
608 (154) |
644 (419) |
570 (283) |
visits_per_FTE |
206 (65.4) |
261 (112) |
195 (51.2) |
219 (93.2) |
213 (84.7) |
caseload_utilization |
1.26 (0.220) |
1.30 (0.220) |
1.16 (0.126) |
1.15 (0.283) |
1.18 (0.224) |
in_person_share |
0.852 (0.154) |
0.861 (0.157) |
0.845 (0.0864) |
0.850 (0.103) |
0.851 (0.136) |
virtual_share |
0.146 (0.179) |
0.139 (0.157) |
0.141 (0.116) |
0.150 (0.103) |
0.146 (0.146) |
Numeric Summary (wide table + boxplots)
desc_tbl <- data %>%
summarise(across(all_of(vars_summary), list(
n = ~sum(!is.na(.)),
mean = ~mean(., na.rm = TRUE),
sd = ~sd(., na.rm = TRUE),
p25 = ~quantile(., 0.25, na.rm = TRUE),
median = ~median(., na.rm = TRUE),
p75 = ~quantile(., 0.75, na.rm = TRUE),
min = ~min(., na.rm = TRUE),
max = ~max(., na.rm = TRUE)
), .names = "{.col}.{.fn}"))
dt(desc_tbl, "Numeric Descriptive Summary (Wide)")
# (1) Overall summary (no groups)
table1(
~ base_award + matching_award + total_funding +
HV_staff_FTE + max_caseload + agencies +
families_served + adults + children +
total_home_visits + in_person_visits + virtual_visits +
funding_per_1k_poverty + families_per_1k_poverty + visits_per_1k_poverty +
funding_per_family + funding_per_visit + visits_per_FTE + caseload_utilization,
data = data, render.missing = NULL,
render.continuous = "median (IQR)",
caption = "Overall Descriptive Summary (HEOR) — 2023"
)
Overall Descriptive Summary (HEOR) — 2023
|
Overall (N=51) |
base_award |
7960000 (6200000) |
matching_award |
726000 (0) |
total_funding |
8560000 (6200000) |
HV_staff_FTE |
74.7 (60.4) |
max_caseload |
1030 (1100) |
agencies |
12.0 (12.0) |
families_served |
1400 (1240) |
adults |
1400 (1290) |
children |
1320 (1130) |
total_home_visits |
16100 (14300) |
in_person_visits |
12000 (12900) |
virtual_visits |
2590 (3540) |
funding_per_1k_poverty |
1860000 (2780000) |
families_per_1k_poverty |
270 (421) |
visits_per_1k_poverty |
3270 (4800) |
funding_per_family |
7260 (3680) |
funding_per_visit |
570 (283) |
visits_per_FTE |
213 (84.7) |
caseload_utilization |
1.18 (0.224) |
# (2) By poverty-load quartile
table1(
~ total_funding + HV_staff_FTE + agencies + max_caseload +
families_served + total_home_visits +
funding_per_1k_poverty + families_per_1k_poverty + visits_per_1k_poverty +
funding_per_family + funding_per_visit + visits_per_FTE + caseload_utilization +
in_person_share + virtual_share | poverty_quartile,
data = data, render.missing = NULL,
render.continuous = "median (IQR)",
caption = "Program Summary by Poverty Load (Quartiles) — 2023"
)
Program Summary by Poverty Load (Quartiles) — 2023
|
Lowest load (N=13) |
Q2 (N=13) |
Q3 (N=13) |
Highest load (N=12) |
Overall (N=51) |
total_funding |
5750000 (3340000) |
9650000 (2480000) |
10000000 (7060000) |
9100000 (7220000) |
8560000 (6200000) |
HV_staff_FTE |
57.0 (53.1) |
82.2 (25.1) |
80.3 (50.1) |
61.2 (71.5) |
74.7 (60.4) |
agencies |
8.00 (9.00) |
14.0 (9.00) |
15.0 (11.0) |
13.0 (13.3) |
12.0 (12.0) |
max_caseload |
519 (769) |
1200 (392) |
1410 (1180) |
919 (1800) |
1030 (1100) |
families_served |
655 (1120) |
1490 (597) |
1650 (1390) |
1290 (2020) |
1400 (1240) |
total_home_visits |
9380 (16200) |
19900 (8290) |
19700 (12700) |
12300 (17300) |
16100 (14300) |
funding_per_1k_poverty |
6440000 (4900000) |
2950000 (1220000) |
1220000 (1050000) |
244000 (441000) |
1860000 (2780000) |
families_per_1k_poverty |
713 (1500) |
407 (196) |
158 (150) |
34.8 (91.6) |
270 (421) |
visits_per_1k_poverty |
12200 (13500) |
5280 (2550) |
2110 (1710) |
327 (865) |
3270 (4800) |
funding_per_family |
7780 (4700) |
6290 (2430) |
7580 (2170) |
7660 (4900) |
7260 (3680) |
funding_per_visit |
575 (265) |
484 (116) |
608 (154) |
644 (419) |
570 (283) |
visits_per_FTE |
206 (65.4) |
261 (112) |
195 (51.2) |
219 (93.2) |
213 (84.7) |
caseload_utilization |
1.26 (0.220) |
1.30 (0.220) |
1.16 (0.126) |
1.15 (0.283) |
1.18 (0.224) |
in_person_share |
0.852 (0.154) |
0.861 (0.157) |
0.845 (0.0864) |
0.850 (0.103) |
0.851 (0.136) |
virtual_share |
0.146 (0.179) |
0.139 (0.157) |
0.141 (0.116) |
0.150 (0.103) |
0.146 (0.146) |
# Boxplots for key metrics
box_long <- data %>%
select(all_of(vars_box)) %>%
pivot_longer(everything(), names_to = "metric", values_to = "value")
p_boxes <- ggplot(box_long, aes(metric, value)) +
geom_boxplot(outlier.alpha = .4, fill = "#8856a7") +
scale_fill_brewer(palette = "Set2") +
labs(title = "Distribution of key program metrics", x = NULL, y = "Value") +
theme_minimal()
plotly::ggplotly(p_boxes)
Correlations (key variables)
cors <- cor(data[, vars_corr], use = "pairwise.complete.obs")
heatmaply::heatmaply_cor(cors, k_col = 2, k_row = 2,
xlab = "Variables", ylab = "Variables",
main = "Correlation matrix (interactive)")
# 2. Test significance of correlations
p_mat <- psych::corr.test(data[, vars_corr], adjust = "none")$p # gives p-values
# 3. Mask non-significant correlations (replace with 0 to keep clustering)
alpha <- 0.05
cors_sig <- cors
cors_sig[p_mat > alpha] <- 0 # keep clustering valid
# 4. Plot with diverging colors
heatmaply_cor(
cors_sig,
k_col = 2, k_row = 2,
xlab = "Variables", ylab = "Variables",
main = "Significant Correlations (p < 0.05)",
colors = colorRampPalette(c("#238443", "#ece7f2", "#cc4c02"))(200) # blue = negative, red = positive
)
Econometric Models:Ordinary Least Squares (OLS), State Level
Inference
# Helper: tidy robust results
tidy_robust <- function(model, model_name, type = "HC1") {
# robust VCOV
robust_vcov <- sandwich::vcovHC(model, type = type)
# coeftest gives matrix of coef, robust SE, t, p
rob <- lmtest::coeftest(model, vcov = robust_vcov)
# convert to tibble
tibble::tibble(
term = rownames(rob),
estimate = rob[, 1],
std.error = rob[, 2],
statistic = rob[, 3],
p.value = rob[, 4],
model = model_name
) %>%
# add 95% CIs
mutate(
conf.low = estimate - 1.96 * std.error,
conf.high = estimate + 1.96 * std.error
)
}
# Run models
m1 <- lm(families_served ~ total_funding + HV_staff_FTE + agencies + max_caseload + total_poverty_population, data = data)
m2 <- lm(total_home_visits ~ total_funding + HV_staff_FTE + agencies + max_caseload + total_poverty_population, data = data)
m3 <- lm(visits_per_FTE ~ funding_per_family + agencies + max_caseload + total_poverty_population, data = data)
# Collect robust results
coef_df <- bind_rows(
tidy_robust(m1, "Robust OLS: families_served"),
tidy_robust(m2, "Robust OLS: total_home_visits"),
tidy_robust(m3, "Robust OLS: visits_per_FTE")
) %>%
filter(term != "(Intercept)")
# print
#print(coef_df)
# pretty table
coef_df %>%
gt(groupname_col = "model") %>%
gt::fmt_number(columns = c(estimate, std.error, conf.low, conf.high),
decimals = 2) %>%
gt::tab_header(title = "OLS with Robust SE (HC1)")
OLS with Robust SE (HC1) |
term |
estimate |
std.error |
statistic |
p.value |
conf.low |
conf.high |
Robust OLS: families_served |
total_funding |
0.00 |
0.00 |
-0.3912873 |
6.974316e-01 |
0.00 |
0.00 |
HV_staff_FTE |
−5.09 |
1.51 |
-3.3817080 |
1.499262e-03 |
−8.04 |
−2.14 |
agencies |
43.13 |
14.47 |
2.9809569 |
4.623269e-03 |
14.77 |
71.50 |
max_caseload |
1.07 |
0.18 |
6.0348855 |
2.771351e-07 |
0.72 |
1.42 |
total_poverty_population |
0.00 |
0.00 |
-0.6300713 |
5.318345e-01 |
−0.01 |
0.00 |
Robust OLS: total_home_visits |
total_funding |
0.00 |
0.00 |
-1.7773238 |
8.227399e-02 |
0.00 |
0.00 |
HV_staff_FTE |
−94.68 |
50.90 |
-1.8602151 |
6.939824e-02 |
−194.43 |
5.08 |
agencies |
1,451.98 |
541.08 |
2.6834748 |
1.015838e-02 |
391.46 |
2,512.50 |
max_caseload |
17.76 |
6.22 |
2.8547144 |
6.492045e-03 |
5.57 |
29.95 |
total_poverty_population |
−0.08 |
0.08 |
-0.9839336 |
3.304090e-01 |
−0.24 |
0.08 |
Robust OLS: visits_per_FTE |
funding_per_family |
0.00 |
0.00 |
-9.9436652 |
4.869858e-13 |
0.00 |
0.00 |
agencies |
4.05 |
1.76 |
2.2929646 |
2.647102e-02 |
0.59 |
7.51 |
max_caseload |
−0.02 |
0.01 |
-2.7968311 |
7.508768e-03 |
−0.04 |
−0.01 |
total_poverty_population |
0.00 |
0.00 |
-0.5975456 |
5.530735e-01 |
0.00 |
0.00 |
Forest Plot for coefficeint
ggplot(coef_df, aes(x = estimate, y = term,
xmin = conf.low, xmax = conf.high,
color = model)) +
geom_point(position = position_dodge(width = 0.7), size = 2) +
geom_errorbarh(position = position_dodge(width = 0.7), height = 0.2) +
geom_vline(xintercept = 0, linetype = "dashed", color = "grey40") +
labs(
title = "Coefficient Plot with Robust SEs (95% CI)",
x = "Estimate",
y = "Predictor"
) +
theme_minimal(base_size = 14) +
theme(legend.position = "bottom")+
facet_wrap(~model, scales = "free_x")

Standardize predictors before fitting models
# standardize selected predictors
data_std <- data %>%
mutate(across(
c(total_funding, HV_staff_FTE, agencies, max_caseload,
total_poverty_population, funding_per_family),
scale
))
m1_std <- lm(families_served ~ total_funding + HV_staff_FTE + agencies + max_caseload + total_poverty_population, data = data_std)
m2_std <- lm(total_home_visits ~ total_funding + HV_staff_FTE + agencies + max_caseload + total_poverty_population, data = data_std)
m3_std <- lm(visits_per_FTE ~ funding_per_family + agencies + max_caseload + total_poverty_population, data = data_std)
coef_df_std <- bind_rows(
tidy_robust(m1_std, "families_served"),
tidy_robust(m2_std, "total_home_visits"),
tidy_robust(m3_std, "visits_per_FTE")
) %>% filter(term != "(Intercept)")
# pretty table
coef_df_std %>%
gt(groupname_col = "model") %>%
gt::fmt_number(columns = c(estimate, std.error, conf.low, conf.high),
decimals = 2) %>%
gt::tab_header(title = "OLS with Robust SE (HC1)")
OLS with Robust SE (HC1) |
term |
estimate |
std.error |
statistic |
p.value |
conf.low |
conf.high |
families_served |
total_funding |
−53.30 |
136.21 |
-0.3912873 |
6.974316e-01 |
−320.27 |
213.67 |
HV_staff_FTE |
−772.26 |
228.36 |
-3.3817080 |
1.499262e-03 |
−1,219.85 |
−324.67 |
agencies |
428.27 |
143.67 |
2.9809569 |
4.623269e-03 |
146.68 |
709.86 |
max_caseload |
1,680.32 |
278.43 |
6.0348855 |
2.771351e-07 |
1,134.59 |
2,226.05 |
total_poverty_population |
−26.33 |
41.79 |
-0.6300713 |
5.318345e-01 |
−108.25 |
55.58 |
total_home_visits |
total_funding |
−8,802.57 |
4,952.71 |
-1.7773238 |
8.227399e-02 |
−18,509.88 |
904.74 |
HV_staff_FTE |
−14,361.85 |
7,720.53 |
-1.8602151 |
6.939824e-02 |
−29,494.10 |
770.39 |
agencies |
14,416.28 |
5,372.24 |
2.6834748 |
1.015838e-02 |
3,886.68 |
24,945.88 |
max_caseload |
27,922.53 |
9,781.20 |
2.8547144 |
6.492045e-03 |
8,751.38 |
47,093.67 |
total_poverty_population |
−1,407.15 |
1,430.13 |
-0.9839336 |
3.304090e-01 |
−4,210.19 |
1,395.90 |
visits_per_FTE |
funding_per_family |
−39.00 |
3.92 |
-9.9436652 |
4.869858e-13 |
−46.69 |
−31.31 |
agencies |
40.18 |
17.52 |
2.2929646 |
2.647102e-02 |
5.83 |
74.52 |
max_caseload |
−32.99 |
11.79 |
-2.7968311 |
7.508768e-03 |
−56.11 |
−9.87 |
total_poverty_population |
−5.81 |
9.72 |
-0.5975456 |
5.530735e-01 |
−24.85 |
13.24 |
# forest plot
ggplot(coef_df_std, aes(x = estimate, y = term,
xmin = conf.low, xmax = conf.high,
color = model)) +
geom_point(size = 2) +
geom_errorbarh(height = 0.2) +
geom_vline(xintercept = 0, linetype = "dashed", color = "grey40") +
labs(
title = "Standardized Coefficient Plot with Robust SEs (95% CI)",
x = "Standardized Estimate (per 1 SD change in predictor)",
y = "Predictor"
) +
theme_minimal(base_size = 14) +
theme(legend.position = "bottom") +
facet_wrap(~model, scales = "free_x")

Econometric Models:Weighted Least Squares(WLS), Population Level
Inference
# Run models
m1_wls <- lm(families_served ~ total_funding + HV_staff_FTE + agencies + max_caseload + total_poverty_population, data = data, weights = total_poverty_population)
m2_wls <- lm(total_home_visits ~ total_funding + HV_staff_FTE + agencies + max_caseload + total_poverty_population, data = data, weights = total_poverty_population)
m3_wls <- lm(visits_per_FTE ~ funding_per_family + agencies + max_caseload + total_poverty_population, data = data, weights = total_poverty_population)
# Collect robust results
coef_df_wls <- bind_rows(
tidy_robust(m1_wls, "Robust WLS: families_served"),
tidy_robust(m2_wls, "Robust WLS: total_home_visits"),
tidy_robust(m3_wls, "Robust WLS: visits_per_FTE")
) %>%
filter(term != "(Intercept)")
# print
#print(coef_df_wls)
# pretty table
coef_df_wls %>%
gt(groupname_col = "model") %>%
gt::fmt_number(columns = c(estimate, std.error, conf.low, conf.high),
decimals = 2) %>%
gt::tab_header(title = "WLS with Robust SE (HC1)")
WLS with Robust SE (HC1) |
term |
estimate |
std.error |
statistic |
p.value |
conf.low |
conf.high |
Robust WLS: families_served |
total_funding |
0.00 |
0.00 |
-0.6694296 |
5.066419e-01 |
0.00 |
0.00 |
HV_staff_FTE |
−3.64 |
0.47 |
-7.7312319 |
8.508534e-10 |
−4.56 |
−2.71 |
agencies |
40.00 |
9.56 |
4.1825367 |
1.315608e-04 |
21.26 |
58.75 |
max_caseload |
0.88 |
0.06 |
15.6885357 |
7.295050e-20 |
0.77 |
0.99 |
total_poverty_population |
0.00 |
0.00 |
1.0119571 |
3.169690e-01 |
0.00 |
0.01 |
Robust WLS: total_home_visits |
total_funding |
0.00 |
0.00 |
-1.4797539 |
1.459062e-01 |
0.00 |
0.00 |
HV_staff_FTE |
−46.53 |
18.55 |
-2.5079307 |
1.581869e-02 |
−82.90 |
−10.17 |
agencies |
740.59 |
336.14 |
2.2031886 |
3.274287e-02 |
81.75 |
1,399.43 |
max_caseload |
10.81 |
2.05 |
5.2669365 |
3.763857e-06 |
6.79 |
14.84 |
total_poverty_population |
−0.05 |
0.04 |
-1.3168081 |
1.945699e-01 |
−0.13 |
0.02 |
Robust WLS: visits_per_FTE |
funding_per_family |
0.00 |
0.00 |
-6.2316068 |
1.300715e-07 |
0.00 |
0.00 |
agencies |
1.42 |
2.15 |
0.6614850 |
5.116013e-01 |
−2.79 |
5.63 |
max_caseload |
−0.02 |
0.01 |
-2.8082815 |
7.285133e-03 |
−0.03 |
−0.01 |
total_poverty_population |
0.00 |
0.00 |
-0.5624614 |
5.765326e-01 |
0.00 |
0.00 |
Forest Plot for coefficeint
ggplot(coef_df_wls, aes(x = estimate, y = term,
xmin = conf.low, xmax = conf.high,
color = model)) +
geom_point(position = position_dodge(width = 0.7), size = 2) +
geom_errorbarh(position = position_dodge(width = 0.7), height = 0.2) +
geom_vline(xintercept = 0, linetype = "dashed", color = "grey40") +
labs(
title = "WLS Coefficient Plot with Robust SEs (95% CI)",
x = "Estimate",
y = "Predictor"
) +
theme_minimal(base_size = 14) +
theme(legend.position = "bottom")+
facet_wrap(~model, scales = "free_x")

Standardize predictors before fitting models
# standardize selected predictors
data_std <- data %>%
# remove NA or negative weights first
filter(!is.na(total_poverty_population) & total_poverty_population > 0) %>%
# then standardize predictors
mutate(across(
c(total_funding, HV_staff_FTE, agencies, max_caseload,
funding_per_family),
~ as.numeric(scale(.))
))
m1_wls_std <- lm(families_served ~ total_funding + HV_staff_FTE + agencies + max_caseload + total_poverty_population, data = data_std, weights = total_poverty_population)
m2_wls_std <- lm(total_home_visits ~ total_funding + HV_staff_FTE + agencies + max_caseload + total_poverty_population, data = data_std, weights = total_poverty_population)
m3_wls_std <- lm(visits_per_FTE ~ funding_per_family + agencies + max_caseload + total_poverty_population, data = data_std, weights = total_poverty_population)
coef_df_std_wls <- bind_rows(
tidy_robust(m1_wls_std, "families_served"),
tidy_robust(m2_wls_std, "total_home_visits"),
tidy_robust(m3_wls_std, "visits_per_FTE")
) %>% filter(term != "(Intercept)")
# pretty table
coef_df_std_wls %>%
gt(groupname_col = "model") %>%
gt::fmt_number(columns = c(estimate, std.error, conf.low, conf.high),
decimals = 2) %>%
gt::tab_header(title = "WLS with Robust SE (HC1)")
WLS with Robust SE (HC1) |
term |
estimate |
std.error |
statistic |
p.value |
conf.low |
conf.high |
families_served |
total_funding |
−49.74 |
74.31 |
-0.6694296 |
5.066419e-01 |
−195.39 |
95.90 |
HV_staff_FTE |
−551.53 |
71.34 |
-7.7312319 |
8.508534e-10 |
−691.35 |
−411.71 |
agencies |
397.18 |
94.96 |
4.1825367 |
1.315608e-04 |
211.05 |
583.30 |
max_caseload |
1,379.87 |
87.95 |
15.6885357 |
7.295050e-20 |
1,207.48 |
1,552.26 |
total_poverty_population |
0.00 |
0.00 |
1.0119571 |
3.169690e-01 |
0.00 |
0.01 |
total_home_visits |
total_funding |
−3,023.69 |
2,043.37 |
-1.4797539 |
1.459062e-01 |
−7,028.69 |
981.32 |
HV_staff_FTE |
−7,058.44 |
2,814.45 |
-2.5079307 |
1.581869e-02 |
−12,574.75 |
−1,542.12 |
agencies |
7,353.07 |
3,337.47 |
2.2031886 |
3.274287e-02 |
811.63 |
13,894.51 |
max_caseload |
17,002.64 |
3,228.18 |
5.2669365 |
3.763857e-06 |
10,675.40 |
23,329.88 |
total_poverty_population |
−0.05 |
0.04 |
-1.3168081 |
1.945699e-01 |
−0.13 |
0.02 |
visits_per_FTE |
funding_per_family |
−38.40 |
6.16 |
-6.2316068 |
1.300715e-07 |
−50.48 |
−26.33 |
agencies |
14.11 |
21.33 |
0.6614850 |
5.116013e-01 |
−27.69 |
55.91 |
max_caseload |
−27.33 |
9.73 |
-2.8082815 |
7.285133e-03 |
−46.40 |
−8.26 |
total_poverty_population |
0.00 |
0.00 |
-0.5624614 |
5.765326e-01 |
0.00 |
0.00 |
# forest plot
ggplot(coef_df_std_wls, aes(x = estimate, y = term,
xmin = conf.low, xmax = conf.high,
color = model)) +
geom_point(size = 2) +
geom_errorbarh(height = 0.2) +
geom_vline(xintercept = 0, linetype = "dashed", color = "grey40") +
labs(
title = "WLS Standardized Coefficient Plot with Robust SEs (95% CI)",
x = "Standardized Estimate (per 1 SD change in predictor)",
y = "Predictor"
) +
theme_minimal(base_size = 14) +
theme(legend.position = "bottom") +
facet_wrap(~model, scales = "free_x")

Efficiency Index (actual vs predicted visits)
data_pred <- data %>%
mutate(
pred_visits = as.numeric(predict(m2, newdata = data)),
efficiency_visits = if_else(!is.na(pred_visits) & pred_visits > 0, total_home_visits / pred_visits, NA_real_)
)
top_eff <- data_pred %>%
arrange(desc(efficiency_visits)) %>%
dplyr::select(states, total_home_visits, pred_visits, efficiency_visits) %>%
slice_head(n = 10)
bot_eff <- data_pred %>%
arrange(efficiency_visits) %>%
dplyr::select(states, total_home_visits, pred_visits, efficiency_visits) %>%
slice_head(n = 10)
tbl(top_eff, "Top 10 by efficiency (Actual / Predicted)")
Top 10 by efficiency (Actual / Predicted) |
states |
total_home_visits |
pred_visits |
efficiency_visits |
New Jersey |
21,002.0 |
2,422.9 |
8.7 |
Wyoming |
4,724.0 |
815.4 |
5.8 |
Tennessee |
20,110.0 |
4,669.0 |
4.3 |
Oklahoma |
12,119.0 |
2,860.8 |
4.2 |
Hawaii |
8,105.0 |
2,116.9 |
3.8 |
Indiana |
22,089.0 |
7,589.2 |
2.9 |
Connecticut |
25,146.0 |
9,936.0 |
2.5 |
Missouri |
9,381.0 |
3,868.9 |
2.4 |
Iowa |
12,172.0 |
6,298.0 |
1.9 |
North Carolina |
6,813.0 |
3,531.0 |
1.9 |
tbl(bot_eff, "Bottom 10 by efficiency (Actual / Predicted)")
Bottom 10 by efficiency (Actual / Predicted) |
states |
total_home_visits |
pred_visits |
efficiency_visits |
Mississippi |
15.0 |
16,445.2 |
0.0 |
Vermont |
3,551.0 |
12,483.1 |
0.3 |
South Dakota |
1,589.0 |
4,567.6 |
0.3 |
Montana |
12,425.0 |
26,265.1 |
0.5 |
Nebraska |
11,259.0 |
22,687.8 |
0.5 |
Maryland |
13,955.0 |
26,206.6 |
0.5 |
Arkansas |
21,697.0 |
39,273.9 |
0.6 |
West Virginia |
21,181.0 |
38,324.2 |
0.6 |
Virginia |
13,411.0 |
23,233.5 |
0.6 |
Massachusetts |
23,246.0 |
35,128.8 |
0.7 |
Plot for Efficiency Index
# Bars
p_top <- ggplot(top_eff, aes(reorder(states, efficiency_visits), efficiency_visits)) +
geom_col(fill = "steelblue") + coord_flip() + theme_minimal() +
labs(title = "Top 10 Efficiency", x = NULL, y = "Actual/Predicted")
plotly::ggplotly(p_top)
p_bot <- ggplot(bot_eff, aes(reorder(states, efficiency_visits), efficiency_visits)) +
geom_col(fill="skyblue") + coord_flip() + theme_minimal() +
labs(title = "Bottom 10 Efficiency", x = NULL, y = "Actual/Predicted")
plotly::ggplotly(p_bot)
Influence Check (outliers for M2, Cooks distance)
check_cooks <- function(model, data, cutoff_rule = 4, top_n = 10) {
cd <- cooks.distance(model)
cutoff <- cutoff_rule / nrow(data)
infl_df <- tibble(
states = data$states,
state_code = data$state_code,
total_home_visits = data$total_home_visits,
total_funding = data$total_funding,
HV_staff_FTE = data$HV_staff_FTE,
total_poverty_population = data$total_poverty_population,
cooks_d = as.numeric(cd)
) %>%
arrange(desc(cooks_d))
cat("Cook's Distance cutoff =", round(cutoff, 4), "\n\n")
# --- Part A: threshold violations ---
if (any(cd > cutoff)) {
tbl(
infl_df %>% filter(cooks_d > cutoff),
paste0("Cases above Cook's Distance cutoff (", deparse(substitute(model)), ")")
)
} else {
cat("No cases above the cutoff.\n\n")
}
# --- Part B: top-N ranking ---
tbl(
infl_df %>% slice_head(n = top_n),
paste0("Top ", top_n, " Cook's Distance values (", deparse(substitute(model)), ")")
)
# --- Part C: Cook’s Distance plot ---
p <- ggplot(infl_df, aes(x = seq_along(cooks_d), y = cooks_d)) +
geom_col(fill = "#1f77b4") +
geom_hline(yintercept = cutoff, color = "red", linetype = "dashed") +
labs(
title = paste0("Cook's Distance (", deparse(substitute(model)), ")"),
x = "Observation Index",
y = "Cook's Distance"
) +
theme_minimal()
print(p) # show plot
invisible(infl_df) # return full dataframe for further use
}
Equity Gap Analysis (Per-1k vs Quartile)
equity_gap <- data %>%
group_by(poverty_quartile) %>%
summarise(
avg_funding_per_1k = mean(funding_per_1k_poverty, na.rm = TRUE),
avg_visits_per_1k = mean(visits_per_1k_poverty, na.rm = TRUE),
.groups ="drop"
)
tbl(equity_gap, "Equity gap by poverty-load quartile")
Equity gap by poverty-load quartile |
poverty_quartile |
avg_funding_per_1k |
avg_visits_per_1k |
Lowest load |
8,059,051.9 |
23,355.5 |
Q2 |
2,592,599.4 |
5,166.8 |
Q3 |
1,303,415.7 |
2,312.0 |
Highest load |
376,162.8 |
709.4 |
# plot
q_long <- equity_gap %>%
pivot_longer(-poverty_quartile, names_to = "metric", values_to = "value")
ggplot(q_long, aes(poverty_quartile, value, fill = metric)) +
geom_col(position = "dodge") +
theme_minimal() +
labs(title = "Equity metrics by quartile", x = NULL, y = "Average")

Efficiency Benchmarking (Frontier view)
ggplot(data, aes(total_funding/1e6, total_home_visits, label = state_code)) +
geom_point(color = "steelblue") +
ggrepel::geom_text_repel(size = 3, max.overlaps = 20) +
labs(x = "Funding (Millions USD)", y = "Total Home Visits",
title = "Efficiency frontier: Funding vs Visits") +
theme_minimal()

Cluster + PCA
# select, scale, cluster
clust_vars <- data %>%
select(funding_per_1k_poverty, families_per_1k_poverty, visits_per_FTE, caseload_utilization)
set.seed(123)
clust_scaled <- scale(clust_vars)
km <- kmeans(clust_scaled, centers = 4, nstart = 25)
data <- data %>% mutate(cluster = factor(km$cluster))
# Counts & membership
states_by_cluster <- data %>%
group_by(cluster) %>%
summarise(n_states = n(), state_codes = paste(state_code, collapse = ", "), .groups = "drop")
dt(states_by_cluster, "States by cluster")
# Unscaled cluster means
cluster_means <- t(apply(km$centers, 1, function(row) {
row * attr(clust_scaled, "scaled:scale") + attr(clust_scaled, "scaled:center")
})) %>%
as.data.frame() %>%
tibble::rownames_to_column("cluster")
tbl(cluster_means, "Cluster centers (original units)")
Cluster centers (original units) |
cluster |
funding_per_1k_poverty |
families_per_1k_poverty |
visits_per_FTE |
caseload_utilization |
1 |
1,529,447.1 |
229.2 |
311.7 |
1.6 |
2 |
13,116,572.3 |
3,088.7 |
260.8 |
1.2 |
3 |
2,668,941.7 |
166.4 |
67.5 |
0.5 |
4 |
2,134,562.2 |
321.2 |
211.8 |
1.2 |
# PCA plot
pca <- prcomp(clust_vars, center = TRUE, scale. = TRUE)
pca_data <- as.data.frame(pca$x[,1:2]) %>%
mutate(states = data$states, state_code = data$state_code, cluster = data$cluster)
# Convex hulls
hulls <- pca_data %>%
group_by(cluster) %>%
slice(chull(PC1, PC2))
p_pca <- ggplot(pca_data, aes(PC1, PC2, color = cluster, text = state_code)) +
geom_polygon(data = hulls, aes(group = cluster, fill = cluster),
alpha = 0.2, color = NA) +
geom_point(size = 4, alpha = 0.9) +
scale_color_brewer(palette = "Set1") +
scale_fill_brewer(palette = "Set1") +
labs(title = "State clusters (PCA of equity & efficiency metrics)") +
theme_minimal()
plotly::ggplotly(p_pca, tooltip = "text")
PCA for visualization
hulls <- pca_data %>%
group_by(cluster) %>%
slice(chull(PC1, PC2)) # convex hull points per cluster
ggplot(pca_data, aes(PC1, PC2, color = cluster, fill = cluster)) +
geom_polygon(data = hulls, aes(group = cluster), alpha = 0.2, color = NA) + # shaded hull
geom_point(size = 4, alpha = 0.9) +
geom_text(aes(label = state_code), size = 3, vjust = -1, check_overlap = TRUE) +
scale_color_brewer(palette = "Set1") +
scale_fill_brewer(palette = "Set1") +
labs(
title = "State Clusters (Convex Hulls in PCA space)",
x = "Principal Component 1",
y = "Principal Component 2"
) +
theme_minimal(base_size = 14)

Raw metric scatter plot
Scatter 1: Funding per 1k poverty vs Visits per FTE
hulls <- data %>%
group_by(cluster) %>%
slice(chull(funding_per_1k_poverty, visits_per_FTE))
ggplot(data, aes(x = funding_per_1k_poverty, y = visits_per_FTE,
color = as.factor(cluster), label = state_code)) +
geom_polygon(data = hulls, aes(group = cluster, fill = as.factor(cluster)),
alpha = 0.2, color = NA) +
geom_point(size = 4, alpha = 0.8) +
geom_text(size = 3, vjust = -1, check_overlap = TRUE) +
scale_color_brewer(palette = "Set1") +
scale_fill_brewer(palette = "Set1") +
scale_x_log10(labels = scales::comma) + # log scale because funding per 1k varies a lot
labs(
title = "Equity vs Efficiency: Funding vs Visits per FTE",
subtitle = "States clustered into peer groups",
x = "Funding per 1,000 Poverty Population (log scale)",
y = "Visits per FTE",
color = "Cluster"
) +
theme_minimal(base_size = 14)

# Scatter 2: Families per 1k poverty vs Caseload Utilization
ggplot(data, aes(x = families_per_1k_poverty, y = caseload_utilization,
color = as.factor(cluster), label = state_code)) +
geom_polygon(data = hulls, aes(group = cluster, fill = as.factor(cluster)),
alpha = 0.2, color = NA) +
geom_point(size = 4, alpha = 0.8) +
geom_text(size = 3, vjust = -1, check_overlap = TRUE) +
scale_color_brewer(palette = "Set1") +
labs(
title = "Reach vs Capacity Utilization",
subtitle = "Families per 1k poverty vs Caseload Utilization",
x = "Families per 1,000 Poverty Population",
y = "Caseload Utilization",
color = "Cluster"
) +
theme_minimal(base_size = 14)

Prep data for mapping
make_geo_map <- function(df, value_col, title, ztitle){
plot_geo(df, locationmode = "USA-states") %>%
add_trace(
z = ~get(value_col),
locations = ~state_code,
color = ~get(value_col),
colors = "Blues",
text = ~paste0(states, " (", state_code, ")<br>", ztitle, ": ",
ifelse(is.numeric(get(value_col)),
scales::comma(get(value_col), accuracy = 0.1),
get(value_col))),
hoverinfo = "text"
) %>%
colorbar(title = ztitle) %>%
layout(title = title, geo = list(scope = "usa"))
}
map1 <- make_geo_map(data, "funding_per_1k_poverty", "Funding per 1,000 poverty population", "USD per 1k")
map2 <- make_geo_map(data, "families_per_1k_poverty", "Families served per 1,000 poverty population", "Families per 1k")
map3 <- make_geo_map(data, "visits_per_FTE", "Visits per FTE", "Visits per FTE")
# Composite index map joins on state_code
map_index_df <- z_tbl %>% left_join(data %>% select(states, state_code), by = "states") %>% distinct()
map4 <- make_geo_map(map_index_df, "Performance_Index", "Composite Performance Index", "Index")
map1; map2; map3; map4