Analysis Coverage
Descriptive Table
Numeric Summary
Correlations
Econometric Models:OLS
Econometric Models:WLS
Efficiency Index/div>
Composite Performance Index
Influence Check
Equity Gap
Efficiency Benchmark
Clustering + PCA
Raw Metric Scatter
US Maps

Setup

Libraries

library(lmtest) # for testing linear regression
library(writexl)
library(scales)
library(magrittr) # for pipes
library(table1) # for descriptive statistics 
library(tidyverse) # for tidy code
library(sessioninfo) # for session_info at bottom
library(dplyr) # for data manipulation
library(details) # for session_info at bottom
library(ggthemes) # for tufte theme
library(ggrepel) # for text plotting
library(patchwork) # for combining plots
library(readxl) # for reading xlxs files
library(kableExtra) # for table headers formatting
library(knitr) # for tables
library(sjPlot) # for model tables 
library(broom) # for logistic regression
library(survey) #analysis of complex survey
library(pROC) # ROC curve visualization
library(skimr) # for summary generation
library(ggmosaic) #for mosaic plot
library(car) #for multicollinearity vif
library(generalhoslem) # for goodness of fit
library(ggeffects) # for effect modification visuals
library(usmap) # for US Map 
library(plotly) # for map plotting
library (heatmaply) # for heat map
library(DT)
library(gt)
library(htmltools)


# Nice helpers
tbl <- function(df, caption = NULL) {
  gt::gt(df) %>%
    gt::tab_header(title = caption) %>%
    gt::fmt_number(where(is.numeric), decimals = 1, use_seps = TRUE) %>%
    gt::opt_table_outline()
}
dt <- function(df, caption = NULL) {
  datatable(df, rownames = FALSE, filter = "top",
            options = list(pageLength = 10, scrollX = TRUE),
            caption = htmltools::tags$caption(style="caption-side: top; text-align: left;", caption))
}

Read & Clean data

df<- read.csv("C:/Users/kesha/Downloads/MIECHV_HEOR/ProgramSizeReach_with_poverty_population.csv")


data <- df %>% 
  drop_na(everything())

# Centralize variable choices used later (edit these vectors to include/exclude variables)
vars_summary <- c("base_award","matching_award","HV_staff_FTE","max_caseload",
                  "agencies","families_served","adults","children",
                  "total_home_visits","in_person_visits","virtual_visits",
                  "male_poverty","female_poverty","total_poverty_population")

vars_corr <- c("total_funding","HV_staff_FTE","agencies","max_caseload",
               "families_served","total_home_visits",
               "funding_per_1k_poverty","families_per_1k_poverty","visits_per_1k_poverty",
               "funding_per_family","funding_per_visit","visits_per_FTE","caseload_utilization")

vars_box  <- c("total_funding","families_served","total_home_visits","visits_per_FTE","funding_per_family")

Derived Metrics (HEOR)

data <- data %>%
  mutate(
    # Funding aggregates
    total_funding = base_award + matching_award,
    
    # Load-adjusted (per 1,000 population in poverty)
    funding_per_1k_poverty   = if_else(total_poverty_population > 0, total_funding / (total_poverty_population / 1000), NA_real_),
    families_per_1k_poverty  = if_else(total_poverty_population > 0, families_served / (total_poverty_population / 1000), NA_real_),
    visits_per_1k_poverty    = if_else(total_poverty_population > 0, total_home_visits / (total_poverty_population / 1000), NA_real_),
    
    # Per-unit productivity / cost
    funding_per_family = if_else(families_served > 0, total_funding / families_served, NA_real_),
    funding_per_child  = if_else(children > 0,       total_funding / children, NA_real_),
    funding_per_visit  = if_else(total_home_visits > 0, total_funding / total_home_visits, NA_real_),
    
    visits_per_family  = if_else(families_served > 0, total_home_visits / families_served, NA_real_),
    visits_per_FTE     = if_else(HV_staff_FTE > 0, total_home_visits / HV_staff_FTE, NA_real_),
    families_per_FTE   = if_else(HV_staff_FTE > 0, families_served / HV_staff_FTE, NA_real_),
    
    # Capacity/structure
    caseload_utilization  = if_else(max_caseload > 0, families_served / max_caseload, NA_real_),
    households_per_agency = if_else(agencies > 0, families_served / agencies, NA_real_),
    FTE_per_agency        = if_else(agencies > 0, HV_staff_FTE / agencies, NA_real_),
    avg_caseload_per_FTE  = if_else(HV_staff_FTE > 0, max_caseload / HV_staff_FTE, NA_real_),
    
    # Modality mix
    in_person_share = if_else(total_home_visits > 0, in_person_visits / total_home_visits, NA_real_),
    virtual_share   = if_else(total_home_visits > 0, virtual_visits / total_home_visits, NA_real_),
    
    # Poverty composition (optional)
    poverty_female_share = if_else(total_poverty_population > 0, female_poverty / total_poverty_population, NA_real_),
      poverty_male_share   = if_else(total_poverty_population > 0, male_poverty / total_poverty_population, NA_real_),    
    
    # Quartiles by poverty load
      poverty_quartile = ntile(total_poverty_population, 4)
    ) %>%
    mutate(
      poverty_quartile = factor(poverty_quartile,
                                labels = c("Lowest load", "Q2", "Q3", "Highest load")))

Quartile Cut-off

# Quartile cutoffs (table + histogram)
qcuts <- quantile(data$total_poverty_population, c(0,.25,.5,.75,1), na.rm = TRUE)
tbl(data.frame(Quantile = c("Min","Q1","Median","Q3","Max"),
               Cutoff = as.numeric(qcuts)),
    "Quartile cutoffs for total_poverty_population")
Quartile cutoffs for total_poverty_population
Quantile Cutoff
Min 224.0
Q1 2,368.0
Median 4,404.0
Q3 10,560.0
Max 81,335.0
p_hist <- ggplot(data, aes(total_poverty_population)) +
  geom_histogram(bins = 30, fill = "#7fcdbb", color = "white") +
  geom_vline(xintercept = qcuts, linetype = c("solid","dashed","dashed","dashed","solid")) +
  labs(title = "Poverty population: distribution & quartiles",
       x = "Total poverty population", y = "States") +
  theme_minimal()
plotly::ggplotly(p_hist)

States by Quartile

states_by_quartile <- data %>%
  group_by(poverty_quartile) %>%
  summarise(
    n_states = n(),
    state_codes = paste(state_code, collapse = ", "),
    .groups ="drop")
dt(states_by_quartile, "States by poverty-load quartiles")
quartile_summary <- data %>%
  group_by(poverty_quartile) %>%
  summarise(
    n_states = n(),
    min_pov  = min(total_poverty_population, na.rm = TRUE),
    max_pov  = max(total_poverty_population, na.rm = TRUE),
    avg_funding_per_1k  = mean(funding_per_1k_poverty, na.rm = TRUE),
    avg_families_per_1k = mean(families_per_1k_poverty, na.rm = TRUE),
    avg_visits_per_1k   = mean(visits_per_1k_poverty, na.rm = TRUE),
  .groups = "drop"
  )
tbl(quartile_summary, "Summary by poverty-load quartile")
Summary by poverty-load quartile
poverty_quartile n_states min_pov max_pov avg_funding_per_1k avg_families_per_1k avg_visits_per_1k
Lowest load 13.0 224.0 2,319.0 8,059,051.9 1,562.1 23,355.5
Q2 13.0 2,417.0 4,404.0 2,592,599.4 407.0 5,166.8
Q3 13.0 5,002.0 10,629.0 1,303,415.7 193.9 2,312.0
Highest load 12.0 12,115.0 81,335.0 376,162.8 67.1 709.4
# Quick bar of counts
ggplot(states_by_quartile, aes(poverty_quartile, n_states)) +
  geom_col(fill= "#fa9fb5") + geom_text(aes(label = n_states), vjust = -0.3) +
  labs(title = "State count by poverty-load quartile", x = NULL, y = "Number of states") +
  theme_minimal()

Descriptive Table

# table1: Overall
table1(~ base_award + matching_award + total_funding +
         HV_staff_FTE + max_caseload + agencies +
         families_served + adults + children +
         total_home_visits + in_person_visits + virtual_visits +
         funding_per_1k_poverty + families_per_1k_poverty + visits_per_1k_poverty +
         funding_per_family + funding_per_visit + visits_per_FTE + caseload_utilization,
       data = data, render.missing = NULL,
       render.continuous = "median (IQR)",
       caption = "Overall Descriptive Summary (HEOR) — 2023")
Overall Descriptive Summary (HEOR) — 2023
Overall
(N=51)
base_award 7960000 (6200000)
matching_award 726000 (0)
total_funding 8560000 (6200000)
HV_staff_FTE 74.7 (60.4)
max_caseload 1030 (1100)
agencies 12.0 (12.0)
families_served 1400 (1240)
adults 1400 (1290)
children 1320 (1130)
total_home_visits 16100 (14300)
in_person_visits 12000 (12900)
virtual_visits 2590 (3540)
funding_per_1k_poverty 1860000 (2780000)
families_per_1k_poverty 270 (421)
visits_per_1k_poverty 3270 (4800)
funding_per_family 7260 (3680)
funding_per_visit 570 (283)
visits_per_FTE 213 (84.7)
caseload_utilization 1.18 (0.224)
# table1: By quartile (not by state!)
table1(~ total_funding + HV_staff_FTE + agencies + max_caseload +
         families_served + total_home_visits +
         funding_per_1k_poverty + families_per_1k_poverty + visits_per_1k_poverty +
         funding_per_family + funding_per_visit + visits_per_FTE + caseload_utilization +
         in_person_share + virtual_share | poverty_quartile,
       data = data, render.missing = NULL,
       render.continuous = "median (IQR)",
       caption = "Program Summary by Poverty Load (Quartiles) — 2023")
Program Summary by Poverty Load (Quartiles) — 2023
Lowest load
(N=13)
Q2
(N=13)
Q3
(N=13)
Highest load
(N=12)
Overall
(N=51)
total_funding 5750000 (3340000) 9650000 (2480000) 10000000 (7060000) 9100000 (7220000) 8560000 (6200000)
HV_staff_FTE 57.0 (53.1) 82.2 (25.1) 80.3 (50.1) 61.2 (71.5) 74.7 (60.4)
agencies 8.00 (9.00) 14.0 (9.00) 15.0 (11.0) 13.0 (13.3) 12.0 (12.0)
max_caseload 519 (769) 1200 (392) 1410 (1180) 919 (1800) 1030 (1100)
families_served 655 (1120) 1490 (597) 1650 (1390) 1290 (2020) 1400 (1240)
total_home_visits 9380 (16200) 19900 (8290) 19700 (12700) 12300 (17300) 16100 (14300)
funding_per_1k_poverty 6440000 (4900000) 2950000 (1220000) 1220000 (1050000) 244000 (441000) 1860000 (2780000)
families_per_1k_poverty 713 (1500) 407 (196) 158 (150) 34.8 (91.6) 270 (421)
visits_per_1k_poverty 12200 (13500) 5280 (2550) 2110 (1710) 327 (865) 3270 (4800)
funding_per_family 7780 (4700) 6290 (2430) 7580 (2170) 7660 (4900) 7260 (3680)
funding_per_visit 575 (265) 484 (116) 608 (154) 644 (419) 570 (283)
visits_per_FTE 206 (65.4) 261 (112) 195 (51.2) 219 (93.2) 213 (84.7)
caseload_utilization 1.26 (0.220) 1.30 (0.220) 1.16 (0.126) 1.15 (0.283) 1.18 (0.224)
in_person_share 0.852 (0.154) 0.861 (0.157) 0.845 (0.0864) 0.850 (0.103) 0.851 (0.136)
virtual_share 0.146 (0.179) 0.139 (0.157) 0.141 (0.116) 0.150 (0.103) 0.146 (0.146)

Numeric Summary (wide table + boxplots)

desc_tbl <- data %>%
  summarise(across(all_of(vars_summary), list(
    n = ~sum(!is.na(.)),
    mean = ~mean(., na.rm = TRUE),
    sd = ~sd(., na.rm = TRUE),
    p25 = ~quantile(., 0.25, na.rm = TRUE),
    median = ~median(., na.rm = TRUE),
    p75 = ~quantile(., 0.75, na.rm = TRUE),
    min = ~min(., na.rm = TRUE),
    max = ~max(., na.rm = TRUE)
  ), .names = "{.col}.{.fn}"))
dt(desc_tbl, "Numeric Descriptive Summary (Wide)")
# (1) Overall summary (no groups)
table1(
  ~ base_award + matching_award + total_funding +
    HV_staff_FTE + max_caseload + agencies +
    families_served + adults + children +
    total_home_visits + in_person_visits + virtual_visits +
    funding_per_1k_poverty + families_per_1k_poverty + visits_per_1k_poverty +
    funding_per_family + funding_per_visit + visits_per_FTE + caseload_utilization,
  data = data, render.missing = NULL,
  render.continuous = "median (IQR)",
  caption = "Overall Descriptive Summary (HEOR) — 2023"
)
Overall Descriptive Summary (HEOR) — 2023
Overall
(N=51)
base_award 7960000 (6200000)
matching_award 726000 (0)
total_funding 8560000 (6200000)
HV_staff_FTE 74.7 (60.4)
max_caseload 1030 (1100)
agencies 12.0 (12.0)
families_served 1400 (1240)
adults 1400 (1290)
children 1320 (1130)
total_home_visits 16100 (14300)
in_person_visits 12000 (12900)
virtual_visits 2590 (3540)
funding_per_1k_poverty 1860000 (2780000)
families_per_1k_poverty 270 (421)
visits_per_1k_poverty 3270 (4800)
funding_per_family 7260 (3680)
funding_per_visit 570 (283)
visits_per_FTE 213 (84.7)
caseload_utilization 1.18 (0.224)
# (2) By poverty-load quartile
table1(
  ~ total_funding + HV_staff_FTE + agencies + max_caseload +
    families_served + total_home_visits +
    funding_per_1k_poverty + families_per_1k_poverty + visits_per_1k_poverty +
    funding_per_family + funding_per_visit + visits_per_FTE + caseload_utilization +
    in_person_share + virtual_share | poverty_quartile,
  data = data, render.missing = NULL,
  render.continuous = "median (IQR)",
  caption = "Program Summary by Poverty Load (Quartiles) — 2023"
)
Program Summary by Poverty Load (Quartiles) — 2023
Lowest load
(N=13)
Q2
(N=13)
Q3
(N=13)
Highest load
(N=12)
Overall
(N=51)
total_funding 5750000 (3340000) 9650000 (2480000) 10000000 (7060000) 9100000 (7220000) 8560000 (6200000)
HV_staff_FTE 57.0 (53.1) 82.2 (25.1) 80.3 (50.1) 61.2 (71.5) 74.7 (60.4)
agencies 8.00 (9.00) 14.0 (9.00) 15.0 (11.0) 13.0 (13.3) 12.0 (12.0)
max_caseload 519 (769) 1200 (392) 1410 (1180) 919 (1800) 1030 (1100)
families_served 655 (1120) 1490 (597) 1650 (1390) 1290 (2020) 1400 (1240)
total_home_visits 9380 (16200) 19900 (8290) 19700 (12700) 12300 (17300) 16100 (14300)
funding_per_1k_poverty 6440000 (4900000) 2950000 (1220000) 1220000 (1050000) 244000 (441000) 1860000 (2780000)
families_per_1k_poverty 713 (1500) 407 (196) 158 (150) 34.8 (91.6) 270 (421)
visits_per_1k_poverty 12200 (13500) 5280 (2550) 2110 (1710) 327 (865) 3270 (4800)
funding_per_family 7780 (4700) 6290 (2430) 7580 (2170) 7660 (4900) 7260 (3680)
funding_per_visit 575 (265) 484 (116) 608 (154) 644 (419) 570 (283)
visits_per_FTE 206 (65.4) 261 (112) 195 (51.2) 219 (93.2) 213 (84.7)
caseload_utilization 1.26 (0.220) 1.30 (0.220) 1.16 (0.126) 1.15 (0.283) 1.18 (0.224)
in_person_share 0.852 (0.154) 0.861 (0.157) 0.845 (0.0864) 0.850 (0.103) 0.851 (0.136)
virtual_share 0.146 (0.179) 0.139 (0.157) 0.141 (0.116) 0.150 (0.103) 0.146 (0.146)
# Boxplots for key metrics
box_long <- data %>%
  select(all_of(vars_box)) %>%
  pivot_longer(everything(), names_to = "metric", values_to = "value")

p_boxes <- ggplot(box_long, aes(metric, value)) +
  geom_boxplot(outlier.alpha = .4, fill = "#8856a7") + 
  scale_fill_brewer(palette = "Set2") +
  labs(title = "Distribution of key program metrics", x = NULL, y = "Value") +
  theme_minimal()
plotly::ggplotly(p_boxes)

Correlations (key variables)

cors <- cor(data[, vars_corr], use = "pairwise.complete.obs")

heatmaply::heatmaply_cor(cors, k_col = 2, k_row = 2,
                         xlab = "Variables", ylab = "Variables",
                         main = "Correlation matrix (interactive)")
# 2. Test significance of correlations
p_mat <- psych::corr.test(data[, vars_corr], adjust = "none")$p  # gives p-values

# 3. Mask non-significant correlations (replace with 0 to keep clustering)
alpha <- 0.05
cors_sig <- cors
cors_sig[p_mat > alpha] <- 0   # keep clustering valid

# 4. Plot with diverging colors
heatmaply_cor(
  cors_sig,
  k_col = 2, k_row = 2,
  xlab = "Variables", ylab = "Variables",
  main = "Significant Correlations (p < 0.05)",
  colors = colorRampPalette(c("#238443", "#ece7f2", "#cc4c02"))(200) # blue = negative, red = positive
)

Econometric Models:Ordinary Least Squares (OLS), State Level Inference

# Helper: tidy robust results
tidy_robust <- function(model, model_name, type = "HC1") {
  # robust VCOV
  robust_vcov <- sandwich::vcovHC(model, type = type)
  
  # coeftest gives matrix of coef, robust SE, t, p
  rob <- lmtest::coeftest(model, vcov = robust_vcov)
  
  # convert to tibble
  tibble::tibble(
    term = rownames(rob),
    estimate = rob[, 1],
    std.error = rob[, 2],
    statistic = rob[, 3],
    p.value = rob[, 4],
    model = model_name
  ) %>%
    # add 95% CIs
    mutate(
      conf.low  = estimate - 1.96 * std.error,
      conf.high = estimate + 1.96 * std.error
    )
}


# Run models

m1 <- lm(families_served ~ total_funding + HV_staff_FTE + agencies + max_caseload + total_poverty_population, data = data)
m2 <- lm(total_home_visits ~ total_funding + HV_staff_FTE + agencies + max_caseload + total_poverty_population, data = data)
m3 <- lm(visits_per_FTE    ~ funding_per_family + agencies + max_caseload + total_poverty_population, data = data)


# Collect robust results

coef_df <- bind_rows(
  tidy_robust(m1, "Robust OLS: families_served"),
  tidy_robust(m2, "Robust OLS: total_home_visits"),
  tidy_robust(m3, "Robust OLS: visits_per_FTE")
) %>%
  filter(term != "(Intercept)")


# print

#print(coef_df)

#  pretty table

coef_df %>%
  gt(groupname_col = "model") %>%
  gt::fmt_number(columns = c(estimate, std.error, conf.low, conf.high),
                 decimals = 2) %>%
  gt::tab_header(title = "OLS with Robust SE (HC1)")
OLS with Robust SE (HC1)
term estimate std.error statistic p.value conf.low conf.high
Robust OLS: families_served
total_funding 0.00 0.00 -0.3912873 6.974316e-01 0.00 0.00
HV_staff_FTE −5.09 1.51 -3.3817080 1.499262e-03 −8.04 −2.14
agencies 43.13 14.47 2.9809569 4.623269e-03 14.77 71.50
max_caseload 1.07 0.18 6.0348855 2.771351e-07 0.72 1.42
total_poverty_population 0.00 0.00 -0.6300713 5.318345e-01 −0.01 0.00
Robust OLS: total_home_visits
total_funding 0.00 0.00 -1.7773238 8.227399e-02 0.00 0.00
HV_staff_FTE −94.68 50.90 -1.8602151 6.939824e-02 −194.43 5.08
agencies 1,451.98 541.08 2.6834748 1.015838e-02 391.46 2,512.50
max_caseload 17.76 6.22 2.8547144 6.492045e-03 5.57 29.95
total_poverty_population −0.08 0.08 -0.9839336 3.304090e-01 −0.24 0.08
Robust OLS: visits_per_FTE
funding_per_family 0.00 0.00 -9.9436652 4.869858e-13 0.00 0.00
agencies 4.05 1.76 2.2929646 2.647102e-02 0.59 7.51
max_caseload −0.02 0.01 -2.7968311 7.508768e-03 −0.04 −0.01
total_poverty_population 0.00 0.00 -0.5975456 5.530735e-01 0.00 0.00

Forest Plot for coefficeint

ggplot(coef_df, aes(x = estimate, y = term, 
                    xmin = conf.low, xmax = conf.high,
                    color = model)) +
  geom_point(position = position_dodge(width = 0.7), size = 2) +
  geom_errorbarh(position = position_dodge(width = 0.7), height = 0.2) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "grey40") +
  labs(
    title = "Coefficient Plot with Robust SEs (95% CI)",
    x = "Estimate",
    y = "Predictor"
  ) +
  theme_minimal(base_size = 14) +
  theme(legend.position = "bottom")+
  facet_wrap(~model, scales = "free_x")

Standardize predictors before fitting models

# standardize selected predictors
data_std <- data %>%
  mutate(across(
    c(total_funding, HV_staff_FTE, agencies, max_caseload, 
      total_poverty_population, funding_per_family),
    scale
  ))

m1_std <- lm(families_served ~ total_funding + HV_staff_FTE + agencies + max_caseload + total_poverty_population, data = data_std)
m2_std <- lm(total_home_visits ~ total_funding + HV_staff_FTE + agencies + max_caseload + total_poverty_population, data = data_std)
m3_std <- lm(visits_per_FTE    ~ funding_per_family + agencies + max_caseload + total_poverty_population, data = data_std)

coef_df_std <- bind_rows(
  tidy_robust(m1_std, "families_served"),
  tidy_robust(m2_std, "total_home_visits"),
  tidy_robust(m3_std, "visits_per_FTE")
) %>% filter(term != "(Intercept)")


# pretty table

coef_df_std %>%
  gt(groupname_col = "model") %>%
  gt::fmt_number(columns = c(estimate, std.error, conf.low, conf.high),
                 decimals = 2) %>%
  gt::tab_header(title = "OLS with Robust SE (HC1)")
OLS with Robust SE (HC1)
term estimate std.error statistic p.value conf.low conf.high
families_served
total_funding −53.30 136.21 -0.3912873 6.974316e-01 −320.27 213.67
HV_staff_FTE −772.26 228.36 -3.3817080 1.499262e-03 −1,219.85 −324.67
agencies 428.27 143.67 2.9809569 4.623269e-03 146.68 709.86
max_caseload 1,680.32 278.43 6.0348855 2.771351e-07 1,134.59 2,226.05
total_poverty_population −26.33 41.79 -0.6300713 5.318345e-01 −108.25 55.58
total_home_visits
total_funding −8,802.57 4,952.71 -1.7773238 8.227399e-02 −18,509.88 904.74
HV_staff_FTE −14,361.85 7,720.53 -1.8602151 6.939824e-02 −29,494.10 770.39
agencies 14,416.28 5,372.24 2.6834748 1.015838e-02 3,886.68 24,945.88
max_caseload 27,922.53 9,781.20 2.8547144 6.492045e-03 8,751.38 47,093.67
total_poverty_population −1,407.15 1,430.13 -0.9839336 3.304090e-01 −4,210.19 1,395.90
visits_per_FTE
funding_per_family −39.00 3.92 -9.9436652 4.869858e-13 −46.69 −31.31
agencies 40.18 17.52 2.2929646 2.647102e-02 5.83 74.52
max_caseload −32.99 11.79 -2.7968311 7.508768e-03 −56.11 −9.87
total_poverty_population −5.81 9.72 -0.5975456 5.530735e-01 −24.85 13.24
# forest plot
ggplot(coef_df_std, aes(x = estimate, y = term,
                        xmin = conf.low, xmax = conf.high,
                        color = model)) +
  geom_point(size = 2) +
  geom_errorbarh(height = 0.2) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "grey40") +
  labs(
    title = "Standardized Coefficient Plot with Robust SEs (95% CI)",
    x = "Standardized Estimate (per 1 SD change in predictor)",
    y = "Predictor"
  ) +
  theme_minimal(base_size = 14) +
  theme(legend.position = "bottom") +
  facet_wrap(~model, scales = "free_x")

Econometric Models:Weighted Least Squares(WLS), Population Level Inference

# Run models

m1_wls <- lm(families_served ~ total_funding + HV_staff_FTE + agencies + max_caseload + total_poverty_population, data = data, weights = total_poverty_population)
m2_wls <- lm(total_home_visits ~ total_funding + HV_staff_FTE + agencies + max_caseload + total_poverty_population, data = data,  weights = total_poverty_population)
m3_wls <- lm(visits_per_FTE    ~ funding_per_family + agencies + max_caseload + total_poverty_population, data = data,  weights = total_poverty_population)


# Collect robust results

coef_df_wls <- bind_rows(
  tidy_robust(m1_wls, "Robust WLS: families_served"),
  tidy_robust(m2_wls, "Robust WLS: total_home_visits"),
  tidy_robust(m3_wls, "Robust WLS: visits_per_FTE")
) %>%
  filter(term != "(Intercept)")


# print

#print(coef_df_wls)

#  pretty table

coef_df_wls %>%
  gt(groupname_col = "model") %>%
  gt::fmt_number(columns = c(estimate, std.error, conf.low, conf.high),
                 decimals = 2) %>%
  gt::tab_header(title = "WLS with Robust SE (HC1)")
WLS with Robust SE (HC1)
term estimate std.error statistic p.value conf.low conf.high
Robust WLS: families_served
total_funding 0.00 0.00 -0.6694296 5.066419e-01 0.00 0.00
HV_staff_FTE −3.64 0.47 -7.7312319 8.508534e-10 −4.56 −2.71
agencies 40.00 9.56 4.1825367 1.315608e-04 21.26 58.75
max_caseload 0.88 0.06 15.6885357 7.295050e-20 0.77 0.99
total_poverty_population 0.00 0.00 1.0119571 3.169690e-01 0.00 0.01
Robust WLS: total_home_visits
total_funding 0.00 0.00 -1.4797539 1.459062e-01 0.00 0.00
HV_staff_FTE −46.53 18.55 -2.5079307 1.581869e-02 −82.90 −10.17
agencies 740.59 336.14 2.2031886 3.274287e-02 81.75 1,399.43
max_caseload 10.81 2.05 5.2669365 3.763857e-06 6.79 14.84
total_poverty_population −0.05 0.04 -1.3168081 1.945699e-01 −0.13 0.02
Robust WLS: visits_per_FTE
funding_per_family 0.00 0.00 -6.2316068 1.300715e-07 0.00 0.00
agencies 1.42 2.15 0.6614850 5.116013e-01 −2.79 5.63
max_caseload −0.02 0.01 -2.8082815 7.285133e-03 −0.03 −0.01
total_poverty_population 0.00 0.00 -0.5624614 5.765326e-01 0.00 0.00

Forest Plot for coefficeint

ggplot(coef_df_wls, aes(x = estimate, y = term, 
                    xmin = conf.low, xmax = conf.high,
                    color = model)) +
  geom_point(position = position_dodge(width = 0.7), size = 2) +
  geom_errorbarh(position = position_dodge(width = 0.7), height = 0.2) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "grey40") +
  labs(
    title = "WLS Coefficient Plot with Robust SEs (95% CI)",
    x = "Estimate",
    y = "Predictor"
  ) +
  theme_minimal(base_size = 14) +
  theme(legend.position = "bottom")+
  facet_wrap(~model, scales = "free_x")

Standardize predictors before fitting models

# standardize selected predictors
data_std <- data %>%
  # remove NA or negative weights first
  filter(!is.na(total_poverty_population) & total_poverty_population > 0) %>%
  # then standardize predictors
  mutate(across(
    c(total_funding, HV_staff_FTE, agencies, max_caseload, 
       funding_per_family),
    ~ as.numeric(scale(.))
  ))

m1_wls_std <- lm(families_served ~ total_funding + HV_staff_FTE + agencies + max_caseload + total_poverty_population, data = data_std, weights = total_poverty_population)
m2_wls_std <- lm(total_home_visits ~ total_funding + HV_staff_FTE + agencies + max_caseload + total_poverty_population, data = data_std, weights = total_poverty_population)
m3_wls_std <- lm(visits_per_FTE    ~ funding_per_family + agencies + max_caseload + total_poverty_population, data = data_std, weights = total_poverty_population)

coef_df_std_wls <- bind_rows(
  tidy_robust(m1_wls_std, "families_served"),
  tidy_robust(m2_wls_std, "total_home_visits"),
  tidy_robust(m3_wls_std, "visits_per_FTE")
) %>% filter(term != "(Intercept)")


# pretty table

coef_df_std_wls %>%
  gt(groupname_col = "model") %>%
  gt::fmt_number(columns = c(estimate, std.error, conf.low, conf.high),
                 decimals = 2) %>%
  gt::tab_header(title = "WLS with Robust SE (HC1)")
WLS with Robust SE (HC1)
term estimate std.error statistic p.value conf.low conf.high
families_served
total_funding −49.74 74.31 -0.6694296 5.066419e-01 −195.39 95.90
HV_staff_FTE −551.53 71.34 -7.7312319 8.508534e-10 −691.35 −411.71
agencies 397.18 94.96 4.1825367 1.315608e-04 211.05 583.30
max_caseload 1,379.87 87.95 15.6885357 7.295050e-20 1,207.48 1,552.26
total_poverty_population 0.00 0.00 1.0119571 3.169690e-01 0.00 0.01
total_home_visits
total_funding −3,023.69 2,043.37 -1.4797539 1.459062e-01 −7,028.69 981.32
HV_staff_FTE −7,058.44 2,814.45 -2.5079307 1.581869e-02 −12,574.75 −1,542.12
agencies 7,353.07 3,337.47 2.2031886 3.274287e-02 811.63 13,894.51
max_caseload 17,002.64 3,228.18 5.2669365 3.763857e-06 10,675.40 23,329.88
total_poverty_population −0.05 0.04 -1.3168081 1.945699e-01 −0.13 0.02
visits_per_FTE
funding_per_family −38.40 6.16 -6.2316068 1.300715e-07 −50.48 −26.33
agencies 14.11 21.33 0.6614850 5.116013e-01 −27.69 55.91
max_caseload −27.33 9.73 -2.8082815 7.285133e-03 −46.40 −8.26
total_poverty_population 0.00 0.00 -0.5624614 5.765326e-01 0.00 0.00
# forest plot
ggplot(coef_df_std_wls, aes(x = estimate, y = term,
                        xmin = conf.low, xmax = conf.high,
                        color = model)) +
  geom_point(size = 2) +
  geom_errorbarh(height = 0.2) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "grey40") +
  labs(
    title = "WLS Standardized Coefficient Plot with Robust SEs (95% CI)",
    x = "Standardized Estimate (per 1 SD change in predictor)",
    y = "Predictor"
  ) +
  theme_minimal(base_size = 14) +
  theme(legend.position = "bottom") +
  facet_wrap(~model, scales = "free_x")

Efficiency Index (actual vs predicted visits)

data_pred <- data %>%
  mutate(
    pred_visits = as.numeric(predict(m2, newdata = data)),
    efficiency_visits = if_else(!is.na(pred_visits) & pred_visits > 0, total_home_visits / pred_visits, NA_real_)
  )


top_eff <- data_pred %>%
  arrange(desc(efficiency_visits)) %>%
  dplyr::select(states, total_home_visits, pred_visits, efficiency_visits) %>%
  slice_head(n = 10)


bot_eff <- data_pred %>%
  arrange(efficiency_visits) %>%
  dplyr::select(states, total_home_visits, pred_visits, efficiency_visits) %>%
  slice_head(n = 10)


tbl(top_eff, "Top 10 by efficiency (Actual / Predicted)")
Top 10 by efficiency (Actual / Predicted)
states total_home_visits pred_visits efficiency_visits
New Jersey 21,002.0 2,422.9 8.7
Wyoming 4,724.0 815.4 5.8
Tennessee 20,110.0 4,669.0 4.3
Oklahoma 12,119.0 2,860.8 4.2
Hawaii 8,105.0 2,116.9 3.8
Indiana 22,089.0 7,589.2 2.9
Connecticut 25,146.0 9,936.0 2.5
Missouri 9,381.0 3,868.9 2.4
Iowa 12,172.0 6,298.0 1.9
North Carolina 6,813.0 3,531.0 1.9
tbl(bot_eff, "Bottom 10 by efficiency (Actual / Predicted)")
Bottom 10 by efficiency (Actual / Predicted)
states total_home_visits pred_visits efficiency_visits
Mississippi 15.0 16,445.2 0.0
Vermont 3,551.0 12,483.1 0.3
South Dakota 1,589.0 4,567.6 0.3
Montana 12,425.0 26,265.1 0.5
Nebraska 11,259.0 22,687.8 0.5
Maryland 13,955.0 26,206.6 0.5
Arkansas 21,697.0 39,273.9 0.6
West Virginia 21,181.0 38,324.2 0.6
Virginia 13,411.0 23,233.5 0.6
Massachusetts 23,246.0 35,128.8 0.7

Plot for Efficiency Index

# Bars
p_top <- ggplot(top_eff, aes(reorder(states, efficiency_visits), efficiency_visits)) +
  geom_col(fill = "steelblue") + coord_flip() + theme_minimal() +
  labs(title = "Top 10 Efficiency", x = NULL, y = "Actual/Predicted")
plotly::ggplotly(p_top)
p_bot <- ggplot(bot_eff, aes(reorder(states, efficiency_visits), efficiency_visits)) +
  geom_col(fill="skyblue") + coord_flip() + theme_minimal() +
  labs(title = "Bottom 10 Efficiency", x = NULL, y = "Actual/Predicted")
plotly::ggplotly(p_bot)

Composite Performance Index (z-score blend)

Higher is better for: visits_per_1k_poverty, families_per_1k_poverty, visits_per_FTE, caseload_utilization

Lower is better for: funding_per_visit, funding_per_family (so we invert their z)

z_good <- c("visits_per_1k_poverty","families_per_1k_poverty","visits_per_FTE","caseload_utilization")
z_bad  <- c("funding_per_visit","funding_per_family")

z_tbl <- data %>%
  mutate(
    across(all_of(z_good), ~ as.numeric(scale(.)), .names = "{.col}_z"),
    across(all_of(z_bad),  ~ as.numeric(scale(.)) * -1, .names = "{.col}_z") # invert cost
  ) %>%
  mutate(
    Performance_Index = rowMeans(dplyr::select(., ends_with("_z")), na.rm = TRUE)
  ) %>%
  dplyr::select(states, Performance_Index, ends_with("_z"))

# composite performance index table

league_table <- z_tbl %>%
  arrange(desc(Performance_Index)) %>%
  mutate(Rank = row_number()) %>%
  dplyr::select(Rank, states, Performance_Index)

dt(league_table, "League table — Composite Performance Index (higher = better)")

plot performance table

p_league <- ggplot(league_table %>% slice_head(n = 20),
                   aes(reorder(states, Performance_Index), Performance_Index)) +
  geom_col(fill="#2ca02c") + coord_flip() + theme_minimal() +
  labs(title = "Top 20: Composite Performance Index", x = NULL, y = "Index")
plotly::ggplotly(p_league)

Influence Check (outliers for M2, Cooks distance)

check_cooks <- function(model, data, cutoff_rule = 4, top_n = 10) {
  cd <- cooks.distance(model)
  cutoff <- cutoff_rule / nrow(data)
  
  infl_df <- tibble(
    states   = data$states,
    state_code = data$state_code,
    total_home_visits = data$total_home_visits,
    total_funding = data$total_funding,
    HV_staff_FTE = data$HV_staff_FTE,
    total_poverty_population = data$total_poverty_population,
    cooks_d = as.numeric(cd)
  ) %>%
    arrange(desc(cooks_d))
  
  cat("Cook's Distance cutoff =", round(cutoff, 4), "\n\n")
  
  # --- Part A: threshold violations ---
  if (any(cd > cutoff)) {
    tbl(
      infl_df %>% filter(cooks_d > cutoff),
      paste0("Cases above Cook's Distance cutoff (", deparse(substitute(model)), ")")
    )
  } else {
    cat("No cases above the cutoff.\n\n")
  }
  
  # --- Part B: top-N ranking ---
  tbl(
    infl_df %>%  slice_head(n = top_n),
    paste0("Top ", top_n, " Cook's Distance values (", deparse(substitute(model)), ")")
  )
  
  # --- Part C: Cook’s Distance plot ---
  p <- ggplot(infl_df, aes(x = seq_along(cooks_d), y = cooks_d)) +
    geom_col(fill = "#1f77b4") +
    geom_hline(yintercept = cutoff, color = "red", linetype = "dashed") +
    labs(
      title = paste0("Cook's Distance (", deparse(substitute(model)), ")"),
      x = "Observation Index",
      y = "Cook's Distance"
    ) +
    theme_minimal()
  
  print(p)  # show plot
  
  invisible(infl_df)  # return full dataframe for further use
}

Equity Gap Analysis (Per-1k vs Quartile)

equity_gap <- data %>%
  group_by(poverty_quartile) %>%
  summarise(
    avg_funding_per_1k = mean(funding_per_1k_poverty, na.rm = TRUE),
    avg_visits_per_1k  = mean(visits_per_1k_poverty, na.rm = TRUE),
    .groups ="drop"
  )
tbl(equity_gap, "Equity gap by poverty-load quartile")
Equity gap by poverty-load quartile
poverty_quartile avg_funding_per_1k avg_visits_per_1k
Lowest load 8,059,051.9 23,355.5
Q2 2,592,599.4 5,166.8
Q3 1,303,415.7 2,312.0
Highest load 376,162.8 709.4
# plot

q_long <- equity_gap %>% 
  pivot_longer(-poverty_quartile, names_to = "metric", values_to = "value")
ggplot(q_long, aes(poverty_quartile, value, fill = metric)) +
  geom_col(position = "dodge") +
  theme_minimal() +
  labs(title = "Equity metrics by quartile", x = NULL, y = "Average")

Efficiency Benchmarking (Frontier view)

ggplot(data, aes(total_funding/1e6, total_home_visits, label = state_code)) +
  geom_point(color = "steelblue") +
  ggrepel::geom_text_repel(size = 3, max.overlaps = 20) +
  labs(x = "Funding (Millions USD)", y = "Total Home Visits",
       title = "Efficiency frontier: Funding vs Visits") +
  theme_minimal()

Cluster + PCA

# select, scale, cluster
clust_vars <- data %>%
  select(funding_per_1k_poverty, families_per_1k_poverty, visits_per_FTE, caseload_utilization)
set.seed(123)
clust_scaled <- scale(clust_vars)
km <- kmeans(clust_scaled, centers = 4, nstart = 25)
data <- data %>% mutate(cluster = factor(km$cluster))

# Counts & membership
states_by_cluster <- data %>%
  group_by(cluster) %>%
  summarise(n_states = n(), state_codes = paste(state_code, collapse = ", "), .groups = "drop")
dt(states_by_cluster, "States by cluster")
# Unscaled cluster means
cluster_means <- t(apply(km$centers, 1, function(row) {
  row * attr(clust_scaled, "scaled:scale") + attr(clust_scaled, "scaled:center")
})) %>%
  as.data.frame() %>%
  tibble::rownames_to_column("cluster")
tbl(cluster_means, "Cluster centers (original units)")
Cluster centers (original units)
cluster funding_per_1k_poverty families_per_1k_poverty visits_per_FTE caseload_utilization
1 1,529,447.1 229.2 311.7 1.6
2 13,116,572.3 3,088.7 260.8 1.2
3 2,668,941.7 166.4 67.5 0.5
4 2,134,562.2 321.2 211.8 1.2
# PCA plot
pca <- prcomp(clust_vars, center = TRUE, scale. = TRUE)
pca_data <- as.data.frame(pca$x[,1:2]) %>%
  mutate(states = data$states, state_code = data$state_code, cluster = data$cluster)

# Convex hulls
hulls <- pca_data %>%
  group_by(cluster) %>%
  slice(chull(PC1, PC2))

p_pca <- ggplot(pca_data, aes(PC1, PC2, color = cluster, text = state_code)) +
  geom_polygon(data = hulls, aes(group = cluster, fill = cluster), 
               alpha = 0.2, color = NA) +
  geom_point(size = 4, alpha = 0.9) +
  scale_color_brewer(palette = "Set1") +
  scale_fill_brewer(palette = "Set1") +
  labs(title = "State clusters (PCA of equity & efficiency metrics)") +
  theme_minimal()

plotly::ggplotly(p_pca, tooltip = "text")

PCA for visualization

hulls <- pca_data %>%
  group_by(cluster) %>%
  slice(chull(PC1, PC2))   # convex hull points per cluster

ggplot(pca_data, aes(PC1, PC2, color = cluster, fill = cluster)) +
  geom_polygon(data = hulls, aes(group = cluster), alpha = 0.2, color = NA) + # shaded hull
  geom_point(size = 4, alpha = 0.9) +
  geom_text(aes(label = state_code), size = 3, vjust = -1, check_overlap = TRUE) +
  scale_color_brewer(palette = "Set1") +
  scale_fill_brewer(palette = "Set1") +
  labs(
    title = "State Clusters (Convex Hulls in PCA space)",
    x = "Principal Component 1",
    y = "Principal Component 2"
  ) +
  theme_minimal(base_size = 14)

Raw metric scatter plot

Scatter 1: Funding per 1k poverty vs Visits per FTE

hulls <- data %>%
  group_by(cluster) %>%
  slice(chull(funding_per_1k_poverty, visits_per_FTE))

ggplot(data, aes(x = funding_per_1k_poverty, y = visits_per_FTE, 
                 color = as.factor(cluster), label = state_code)) +
  geom_polygon(data = hulls, aes(group = cluster, fill = as.factor(cluster)), 
               alpha = 0.2, color = NA) +
  geom_point(size = 4, alpha = 0.8) +
  geom_text(size = 3, vjust = -1, check_overlap = TRUE) +
  scale_color_brewer(palette = "Set1") +
  scale_fill_brewer(palette = "Set1") +
  scale_x_log10(labels = scales::comma) +  # log scale because funding per 1k varies a lot
  labs(
    title = "Equity vs Efficiency: Funding vs Visits per FTE",
    subtitle = "States clustered into peer groups",
    x = "Funding per 1,000 Poverty Population (log scale)",
    y = "Visits per FTE",
    color = "Cluster"
  ) +
  theme_minimal(base_size = 14)

# Scatter 2: Families per 1k poverty vs Caseload Utilization
ggplot(data, aes(x = families_per_1k_poverty, y = caseload_utilization, 
                 color = as.factor(cluster), label = state_code)) +
  geom_polygon(data = hulls, aes(group = cluster, fill = as.factor(cluster)), 
               alpha = 0.2, color = NA) +
  geom_point(size = 4, alpha = 0.8) +
  geom_text(size = 3, vjust = -1, check_overlap = TRUE) +
  scale_color_brewer(palette = "Set1") +
  labs(
    title = "Reach vs Capacity Utilization",
    subtitle = "Families per 1k poverty vs Caseload Utilization",
    x = "Families per 1,000 Poverty Population",
    y = "Caseload Utilization",
    color = "Cluster"
  ) +
  theme_minimal(base_size = 14)

Prep data for mapping

make_geo_map <- function(df, value_col, title, ztitle){
  plot_geo(df, locationmode = "USA-states") %>%
    add_trace(
      z = ~get(value_col),
      locations = ~state_code,
      color = ~get(value_col),
      colors = "Blues",
      text = ~paste0(states, " (", state_code, ")<br>", ztitle, ": ",
                     ifelse(is.numeric(get(value_col)),
                            scales::comma(get(value_col), accuracy = 0.1),
                            get(value_col))),
      hoverinfo = "text"
    ) %>%
    colorbar(title = ztitle) %>%
    layout(title = title, geo = list(scope = "usa"))
}

map1 <- make_geo_map(data, "funding_per_1k_poverty",  "Funding per 1,000 poverty population", "USD per 1k")
map2 <- make_geo_map(data, "families_per_1k_poverty", "Families served per 1,000 poverty population", "Families per 1k")
map3 <- make_geo_map(data, "visits_per_FTE",          "Visits per FTE", "Visits per FTE")

# Composite index map joins on state_code
map_index_df <- z_tbl %>% left_join(data %>% select(states, state_code), by = "states") %>% distinct()
map4 <- make_geo_map(map_index_df, "Performance_Index", "Composite Performance Index", "Index")

map1; map2; map3; map4