Analysis Coverage

✓Descriptive Table

✓Numeric Summary

✓Correlations

✓Econometric Models:OLS

✓Econometric Models:WLS

✓Efficiency Index/div>

✓Composite Performance Index

✓Influence Check

✓Equity Gap

✓Efficiency Benchmark

✓Clustering + PCA

✓Raw Metric Scatter

✓US Maps

Setup

Libraries

library(lmtest) # for testing linear regression
library(writexl)
library(scales)
library(magrittr) # for pipes
library(table1) # for descriptive statistics 
library(tidyverse) # for tidy code
library(sessioninfo) # for session_info at bottom
library(dplyr) # for data manipulation
library(details) # for session_info at bottom
library(ggthemes) # for tufte theme
library(ggrepel) # for text plotting
library(patchwork) # for combining plots
library(readxl) # for reading xlxs files
library(kableExtra) # for table headers formatting
library(knitr) # for tables
library(sjPlot) # for model tables 
library(broom) # for logistic regression
library(survey) #analysis of complex survey
library(pROC) # ROC curve visualization
library(skimr) # for summary generation
library(ggmosaic) #for mosaic plot
library(car) #for multicollinearity vif
library(generalhoslem) # for goodness of fit
library(ggeffects) # for effect modification visuals
library(usmap) # for US Map 
library(plotly) # for map plotting
library (heatmaply) # for heat map
library(DT)
library(gt)
library(htmltools)


# Nice helpers
tbl <- function(df, caption = NULL) {
  gt::gt(df) %>%
    gt::tab_header(title = caption) %>%
    gt::fmt_number(where(is.numeric), decimals = 1, use_seps = TRUE) %>%
    gt::opt_table_outline()
}
dt <- function(df, caption = NULL) {
  datatable(df, rownames = FALSE, filter = "top",
            options = list(pageLength = 10, scrollX = TRUE),
            caption = htmltools::tags$caption(style="caption-side: top; text-align: left;", caption))
}

Read & Clean data

df<- read.csv("C:/Users/kesha/Downloads/MIECHV_HEOR/ProgramSizeReach_with_poverty_population.csv")


data <- df %>% 
  drop_na(everything())

# Centralize variable choices used later (edit these vectors to include/exclude variables)
vars_summary <- c("base_award","matching_award","HV_staff_FTE","max_caseload",
                  "agencies","families_served","adults","children",
                  "total_home_visits","in_person_visits","virtual_visits",
                  "male_poverty","female_poverty","total_poverty_population")

vars_corr <- c("total_funding","HV_staff_FTE","agencies","max_caseload",
               "families_served","total_home_visits",
               "funding_per_1k_poverty","families_per_1k_poverty","visits_per_1k_poverty",
               "funding_per_family","funding_per_visit","visits_per_FTE","caseload_utilization")

vars_box  <- c("total_funding","families_served","total_home_visits","visits_per_FTE","funding_per_family")

Derived Metrics (HEOR)

data <- data %>%
  mutate(
    # Funding aggregates
    total_funding = base_award + matching_award,
    
    # Load-adjusted (per 1,000 population in poverty)
    funding_per_1k_poverty   = if_else(total_poverty_population > 0, total_funding / (total_poverty_population / 1000), NA_real_),
    families_per_1k_poverty  = if_else(total_poverty_population > 0, families_served / (total_poverty_population / 1000), NA_real_),
    visits_per_1k_poverty    = if_else(total_poverty_population > 0, total_home_visits / (total_poverty_population / 1000), NA_real_),
    
    # Per-unit productivity / cost
    funding_per_family = if_else(families_served > 0, total_funding / families_served, NA_real_),
    funding_per_child  = if_else(children > 0,       total_funding / children, NA_real_),
    funding_per_visit  = if_else(total_home_visits > 0, total_funding / total_home_visits, NA_real_),
    
    visits_per_family  = if_else(families_served > 0, total_home_visits / families_served, NA_real_),
    visits_per_FTE     = if_else(HV_staff_FTE > 0, total_home_visits / HV_staff_FTE, NA_real_),
    families_per_FTE   = if_else(HV_staff_FTE > 0, families_served / HV_staff_FTE, NA_real_),
    
    # Capacity/structure
    caseload_utilization  = if_else(max_caseload > 0, families_served / max_caseload, NA_real_),
    households_per_agency = if_else(agencies > 0, families_served / agencies, NA_real_),
    FTE_per_agency        = if_else(agencies > 0, HV_staff_FTE / agencies, NA_real_),
    avg_caseload_per_FTE  = if_else(HV_staff_FTE > 0, max_caseload / HV_staff_FTE, NA_real_),
    
    # Modality mix
    in_person_share = if_else(total_home_visits > 0, in_person_visits / total_home_visits, NA_real_),
    virtual_share   = if_else(total_home_visits > 0, virtual_visits / total_home_visits, NA_real_),
    
    # Poverty composition (optional)
    poverty_female_share = if_else(total_poverty_population > 0, female_poverty / total_poverty_population, NA_real_),
      poverty_male_share   = if_else(total_poverty_population > 0, male_poverty / total_poverty_population, NA_real_),    
    
    # Quartiles by poverty load
      poverty_quartile = ntile(total_poverty_population, 4)
    ) %>%
    mutate(
      poverty_quartile = factor(poverty_quartile,
                                labels = c("Lowest load", "Q2", "Q3", "Highest load")))

Quartile Cut-off

# Quartile cutoffs (table + histogram)
qcuts <- quantile(data$total_poverty_population, c(0,.25,.5,.75,1), na.rm = TRUE)
tbl(data.frame(Quantile = c("Min","Q1","Median","Q3","Max"),
               Cutoff = as.numeric(qcuts)),
    "Quartile cutoffs for total_poverty_population")

Quantile	Cutoff
Quartile cutoffs for total_poverty_population
Min	224.0
Q1	2,368.0
Median	4,404.0
Q3	10,560.0
Max	81,335.0

p_hist <- ggplot(data, aes(total_poverty_population)) +
  geom_histogram(bins = 30, fill = "#7fcdbb", color = "white") +
  geom_vline(xintercept = qcuts, linetype = c("solid","dashed","dashed","dashed","solid")) +
  labs(title = "Poverty population: distribution & quartiles",
       x = "Total poverty population", y = "States") +
  theme_minimal()
plotly::ggplotly(p_hist)

States by Quartile

states_by_quartile <- data %>%
  group_by(poverty_quartile) %>%
  summarise(
    n_states = n(),
    state_codes = paste(state_code, collapse = ", "),
    .groups ="drop")
dt(states_by_quartile, "States by poverty-load quartiles")

quartile_summary <- data %>%
  group_by(poverty_quartile) %>%
  summarise(
    n_states = n(),
    min_pov  = min(total_poverty_population, na.rm = TRUE),
    max_pov  = max(total_poverty_population, na.rm = TRUE),
    avg_funding_per_1k  = mean(funding_per_1k_poverty, na.rm = TRUE),
    avg_families_per_1k = mean(families_per_1k_poverty, na.rm = TRUE),
    avg_visits_per_1k   = mean(visits_per_1k_poverty, na.rm = TRUE),
  .groups = "drop"
  )
tbl(quartile_summary, "Summary by poverty-load quartile")

poverty_quartile	n_states	min_pov	max_pov	avg_funding_per_1k	avg_families_per_1k	avg_visits_per_1k
Summary by poverty-load quartile
Lowest load	13.0	224.0	2,319.0	8,059,051.9	1,562.1	23,355.5
Q2	13.0	2,417.0	4,404.0	2,592,599.4	407.0	5,166.8
Q3	13.0	5,002.0	10,629.0	1,303,415.7	193.9	2,312.0
Highest load	12.0	12,115.0	81,335.0	376,162.8	67.1	709.4

# Quick bar of counts
ggplot(states_by_quartile, aes(poverty_quartile, n_states)) +
  geom_col(fill= "#fa9fb5") + geom_text(aes(label = n_states), vjust = -0.3) +
  labs(title = "State count by poverty-load quartile", x = NULL, y = "Number of states") +
  theme_minimal()

Descriptive Table

# table1: Overall
table1(~ base_award + matching_award + total_funding +
         HV_staff_FTE + max_caseload + agencies +
         families_served + adults + children +
         total_home_visits + in_person_visits + virtual_visits +
         funding_per_1k_poverty + families_per_1k_poverty + visits_per_1k_poverty +
         funding_per_family + funding_per_visit + visits_per_FTE + caseload_utilization,
       data = data, render.missing = NULL,
       render.continuous = "median (IQR)",
       caption = "Overall Descriptive Summary (HEOR) — 2023")

Overall Descriptive Summary (HEOR) — 2023
	Overall (N=51)
base_award	7960000 (6200000)
matching_award	726000 (0)
total_funding	8560000 (6200000)
HV_staff_FTE	74.7 (60.4)
max_caseload	1030 (1100)
agencies	12.0 (12.0)
families_served	1400 (1240)
adults	1400 (1290)
children	1320 (1130)
total_home_visits	16100 (14300)
in_person_visits	12000 (12900)
virtual_visits	2590 (3540)
funding_per_1k_poverty	1860000 (2780000)
families_per_1k_poverty	270 (421)
visits_per_1k_poverty	3270 (4800)
funding_per_family	7260 (3680)
funding_per_visit	570 (283)
visits_per_FTE	213 (84.7)
caseload_utilization	1.18 (0.224)

# table1: By quartile (not by state!)
table1(~ total_funding + HV_staff_FTE + agencies + max_caseload +
         families_served + total_home_visits +
         funding_per_1k_poverty + families_per_1k_poverty + visits_per_1k_poverty +
         funding_per_family + funding_per_visit + visits_per_FTE + caseload_utilization +
         in_person_share + virtual_share | poverty_quartile,
       data = data, render.missing = NULL,
       render.continuous = "median (IQR)",
       caption = "Program Summary by Poverty Load (Quartiles) — 2023")

Program Summary by Poverty Load (Quartiles) — 2023
	Lowest load (N=13)	Q2 (N=13)	Q3 (N=13)	Highest load (N=12)	Overall (N=51)
total_funding	5750000 (3340000)	9650000 (2480000)	10000000 (7060000)	9100000 (7220000)	8560000 (6200000)
HV_staff_FTE	57.0 (53.1)	82.2 (25.1)	80.3 (50.1)	61.2 (71.5)	74.7 (60.4)
agencies	8.00 (9.00)	14.0 (9.00)	15.0 (11.0)	13.0 (13.3)	12.0 (12.0)
max_caseload	519 (769)	1200 (392)	1410 (1180)	919 (1800)	1030 (1100)
families_served	655 (1120)	1490 (597)	1650 (1390)	1290 (2020)	1400 (1240)
total_home_visits	9380 (16200)	19900 (8290)	19700 (12700)	12300 (17300)	16100 (14300)
funding_per_1k_poverty	6440000 (4900000)	2950000 (1220000)	1220000 (1050000)	244000 (441000)	1860000 (2780000)
families_per_1k_poverty	713 (1500)	407 (196)	158 (150)	34.8 (91.6)	270 (421)
visits_per_1k_poverty	12200 (13500)	5280 (2550)	2110 (1710)	327 (865)	3270 (4800)
funding_per_family	7780 (4700)	6290 (2430)	7580 (2170)	7660 (4900)	7260 (3680)
funding_per_visit	575 (265)	484 (116)	608 (154)	644 (419)	570 (283)
visits_per_FTE	206 (65.4)	261 (112)	195 (51.2)	219 (93.2)	213 (84.7)
caseload_utilization	1.26 (0.220)	1.30 (0.220)	1.16 (0.126)	1.15 (0.283)	1.18 (0.224)
in_person_share	0.852 (0.154)	0.861 (0.157)	0.845 (0.0864)	0.850 (0.103)	0.851 (0.136)
virtual_share	0.146 (0.179)	0.139 (0.157)	0.141 (0.116)	0.150 (0.103)	0.146 (0.146)

Numeric Summary (wide table + boxplots)

desc_tbl <- data %>%
  summarise(across(all_of(vars_summary), list(
    n = ~sum(!is.na(.)),
    mean = ~mean(., na.rm = TRUE),
    sd = ~sd(., na.rm = TRUE),
    p25 = ~quantile(., 0.25, na.rm = TRUE),
    median = ~median(., na.rm = TRUE),
    p75 = ~quantile(., 0.75, na.rm = TRUE),
    min = ~min(., na.rm = TRUE),
    max = ~max(., na.rm = TRUE)
  ), .names = "{.col}.{.fn}"))
dt(desc_tbl, "Numeric Descriptive Summary (Wide)")

# (1) Overall summary (no groups)
table1(
  ~ base_award + matching_award + total_funding +
    HV_staff_FTE + max_caseload + agencies +
    families_served + adults + children +
    total_home_visits + in_person_visits + virtual_visits +
    funding_per_1k_poverty + families_per_1k_poverty + visits_per_1k_poverty +
    funding_per_family + funding_per_visit + visits_per_FTE + caseload_utilization,
  data = data, render.missing = NULL,
  render.continuous = "median (IQR)",
  caption = "Overall Descriptive Summary (HEOR) — 2023"
)

Overall Descriptive Summary (HEOR) — 2023
	Overall (N=51)
base_award	7960000 (6200000)
matching_award	726000 (0)
total_funding	8560000 (6200000)
HV_staff_FTE	74.7 (60.4)
max_caseload	1030 (1100)
agencies	12.0 (12.0)
families_served	1400 (1240)
adults	1400 (1290)
children	1320 (1130)
total_home_visits	16100 (14300)
in_person_visits	12000 (12900)
virtual_visits	2590 (3540)
funding_per_1k_poverty	1860000 (2780000)
families_per_1k_poverty	270 (421)
visits_per_1k_poverty	3270 (4800)
funding_per_family	7260 (3680)
funding_per_visit	570 (283)
visits_per_FTE	213 (84.7)
caseload_utilization	1.18 (0.224)

# (2) By poverty-load quartile
table1(
  ~ total_funding + HV_staff_FTE + agencies + max_caseload +
    families_served + total_home_visits +
    funding_per_1k_poverty + families_per_1k_poverty + visits_per_1k_poverty +
    funding_per_family + funding_per_visit + visits_per_FTE + caseload_utilization +
    in_person_share + virtual_share | poverty_quartile,
  data = data, render.missing = NULL,
  render.continuous = "median (IQR)",
  caption = "Program Summary by Poverty Load (Quartiles) — 2023"
)

Program Summary by Poverty Load (Quartiles) — 2023
	Lowest load (N=13)	Q2 (N=13)	Q3 (N=13)	Highest load (N=12)	Overall (N=51)
total_funding	5750000 (3340000)	9650000 (2480000)	10000000 (7060000)	9100000 (7220000)	8560000 (6200000)
HV_staff_FTE	57.0 (53.1)	82.2 (25.1)	80.3 (50.1)	61.2 (71.5)	74.7 (60.4)
agencies	8.00 (9.00)	14.0 (9.00)	15.0 (11.0)	13.0 (13.3)	12.0 (12.0)
max_caseload	519 (769)	1200 (392)	1410 (1180)	919 (1800)	1030 (1100)
families_served	655 (1120)	1490 (597)	1650 (1390)	1290 (2020)	1400 (1240)
total_home_visits	9380 (16200)	19900 (8290)	19700 (12700)	12300 (17300)	16100 (14300)
funding_per_1k_poverty	6440000 (4900000)	2950000 (1220000)	1220000 (1050000)	244000 (441000)	1860000 (2780000)
families_per_1k_poverty	713 (1500)	407 (196)	158 (150)	34.8 (91.6)	270 (421)
visits_per_1k_poverty	12200 (13500)	5280 (2550)	2110 (1710)	327 (865)	3270 (4800)
funding_per_family	7780 (4700)	6290 (2430)	7580 (2170)	7660 (4900)	7260 (3680)
funding_per_visit	575 (265)	484 (116)	608 (154)	644 (419)	570 (283)
visits_per_FTE	206 (65.4)	261 (112)	195 (51.2)	219 (93.2)	213 (84.7)
caseload_utilization	1.26 (0.220)	1.30 (0.220)	1.16 (0.126)	1.15 (0.283)	1.18 (0.224)
in_person_share	0.852 (0.154)	0.861 (0.157)	0.845 (0.0864)	0.850 (0.103)	0.851 (0.136)
virtual_share	0.146 (0.179)	0.139 (0.157)	0.141 (0.116)	0.150 (0.103)	0.146 (0.146)

# Boxplots for key metrics
box_long <- data %>%
  select(all_of(vars_box)) %>%
  pivot_longer(everything(), names_to = "metric", values_to = "value")

p_boxes <- ggplot(box_long, aes(metric, value)) +
  geom_boxplot(outlier.alpha = .4, fill = "#8856a7") + 
  scale_fill_brewer(palette = "Set2") +
  labs(title = "Distribution of key program metrics", x = NULL, y = "Value") +
  theme_minimal()
plotly::ggplotly(p_boxes)

Correlations (key variables)

cors <- cor(data[, vars_corr], use = "pairwise.complete.obs")

heatmaply::heatmaply_cor(cors, k_col = 2, k_row = 2,
                         xlab = "Variables", ylab = "Variables",
                         main = "Correlation matrix (interactive)")

# 2. Test significance of correlations
p_mat <- psych::corr.test(data[, vars_corr], adjust = "none")$p  # gives p-values

# 3. Mask non-significant correlations (replace with 0 to keep clustering)
alpha <- 0.05
cors_sig <- cors
cors_sig[p_mat > alpha] <- 0   # keep clustering valid

# 4. Plot with diverging colors
heatmaply_cor(
  cors_sig,
  k_col = 2, k_row = 2,
  xlab = "Variables", ylab = "Variables",
  main = "Significant Correlations (p < 0.05)",
  colors = colorRampPalette(c("#238443", "#ece7f2", "#cc4c02"))(200) # blue = negative, red = positive
)

Econometric Models:Ordinary Least Squares (OLS), State Level Inference

# Helper: tidy robust results
tidy_robust <- function(model, model_name, type = "HC1") {
  # robust VCOV
  robust_vcov <- sandwich::vcovHC(model, type = type)
  
  # coeftest gives matrix of coef, robust SE, t, p
  rob <- lmtest::coeftest(model, vcov = robust_vcov)
  
  # convert to tibble
  tibble::tibble(
    term = rownames(rob),
    estimate = rob[, 1],
    std.error = rob[, 2],
    statistic = rob[, 3],
    p.value = rob[, 4],
    model = model_name
  ) %>%
    # add 95% CIs
    mutate(
      conf.low  = estimate - 1.96 * std.error,
      conf.high = estimate + 1.96 * std.error
    )
}


# Run models

m1 <- lm(families_served ~ total_funding + HV_staff_FTE + agencies + max_caseload + total_poverty_population, data = data)
m2 <- lm(total_home_visits ~ total_funding + HV_staff_FTE + agencies + max_caseload + total_poverty_population, data = data)
m3 <- lm(visits_per_FTE    ~ funding_per_family + agencies + max_caseload + total_poverty_population, data = data)


# Collect robust results

coef_df <- bind_rows(
  tidy_robust(m1, "Robust OLS: families_served"),
  tidy_robust(m2, "Robust OLS: total_home_visits"),
  tidy_robust(m3, "Robust OLS: visits_per_FTE")
) %>%
  filter(term != "(Intercept)")


# print

#print(coef_df)

#  pretty table

coef_df %>%
  gt(groupname_col = "model") %>%
  gt::fmt_number(columns = c(estimate, std.error, conf.low, conf.high),
                 decimals = 2) %>%
  gt::tab_header(title = "OLS with Robust SE (HC1)")

term	estimate	std.error	statistic	p.value	conf.low	conf.high
OLS with Robust SE (HC1)
Robust OLS: families_served
total_funding	0.00	0.00	-0.3912873	6.974316e-01	0.00	0.00
HV_staff_FTE	−5.09	1.51	-3.3817080	1.499262e-03	−8.04	−2.14
agencies	43.13	14.47	2.9809569	4.623269e-03	14.77	71.50
max_caseload	1.07	0.18	6.0348855	2.771351e-07	0.72	1.42
total_poverty_population	0.00	0.00	-0.6300713	5.318345e-01	−0.01	0.00
Robust OLS: total_home_visits
total_funding	0.00	0.00	-1.7773238	8.227399e-02	0.00	0.00
HV_staff_FTE	−94.68	50.90	-1.8602151	6.939824e-02	−194.43	5.08
agencies	1,451.98	541.08	2.6834748	1.015838e-02	391.46	2,512.50
max_caseload	17.76	6.22	2.8547144	6.492045e-03	5.57	29.95
total_poverty_population	−0.08	0.08	-0.9839336	3.304090e-01	−0.24	0.08
Robust OLS: visits_per_FTE
funding_per_family	0.00	0.00	-9.9436652	4.869858e-13	0.00	0.00
agencies	4.05	1.76	2.2929646	2.647102e-02	0.59	7.51
max_caseload	−0.02	0.01	-2.7968311	7.508768e-03	−0.04	−0.01
total_poverty_population	0.00	0.00	-0.5975456	5.530735e-01	0.00	0.00

Forest Plot for coefficeint

ggplot(coef_df, aes(x = estimate, y = term, 
                    xmin = conf.low, xmax = conf.high,
                    color = model)) +
  geom_point(position = position_dodge(width = 0.7), size = 2) +
  geom_errorbarh(position = position_dodge(width = 0.7), height = 0.2) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "grey40") +
  labs(
    title = "Coefficient Plot with Robust SEs (95% CI)",
    x = "Estimate",
    y = "Predictor"
  ) +
  theme_minimal(base_size = 14) +
  theme(legend.position = "bottom")+
  facet_wrap(~model, scales = "free_x")

Standardize predictors before fitting models

# standardize selected predictors
data_std <- data %>%
  mutate(across(
    c(total_funding, HV_staff_FTE, agencies, max_caseload, 
      total_poverty_population, funding_per_family),
    scale
  ))

m1_std <- lm(families_served ~ total_funding + HV_staff_FTE + agencies + max_caseload + total_poverty_population, data = data_std)
m2_std <- lm(total_home_visits ~ total_funding + HV_staff_FTE + agencies + max_caseload + total_poverty_population, data = data_std)
m3_std <- lm(visits_per_FTE    ~ funding_per_family + agencies + max_caseload + total_poverty_population, data = data_std)

coef_df_std <- bind_rows(
  tidy_robust(m1_std, "families_served"),
  tidy_robust(m2_std, "total_home_visits"),
  tidy_robust(m3_std, "visits_per_FTE")
) %>% filter(term != "(Intercept)")


# pretty table

coef_df_std %>%
  gt(groupname_col = "model") %>%
  gt::fmt_number(columns = c(estimate, std.error, conf.low, conf.high),
                 decimals = 2) %>%
  gt::tab_header(title = "OLS with Robust SE (HC1)")

term	estimate	std.error	statistic	p.value	conf.low	conf.high
OLS with Robust SE (HC1)
families_served
total_funding	−53.30	136.21	-0.3912873	6.974316e-01	−320.27	213.67
HV_staff_FTE	−772.26	228.36	-3.3817080	1.499262e-03	−1,219.85	−324.67
agencies	428.27	143.67	2.9809569	4.623269e-03	146.68	709.86
max_caseload	1,680.32	278.43	6.0348855	2.771351e-07	1,134.59	2,226.05
total_poverty_population	−26.33	41.79	-0.6300713	5.318345e-01	−108.25	55.58
total_home_visits
total_funding	−8,802.57	4,952.71	-1.7773238	8.227399e-02	−18,509.88	904.74
HV_staff_FTE	−14,361.85	7,720.53	-1.8602151	6.939824e-02	−29,494.10	770.39
agencies	14,416.28	5,372.24	2.6834748	1.015838e-02	3,886.68	24,945.88
max_caseload	27,922.53	9,781.20	2.8547144	6.492045e-03	8,751.38	47,093.67
total_poverty_population	−1,407.15	1,430.13	-0.9839336	3.304090e-01	−4,210.19	1,395.90
visits_per_FTE
funding_per_family	−39.00	3.92	-9.9436652	4.869858e-13	−46.69	−31.31
agencies	40.18	17.52	2.2929646	2.647102e-02	5.83	74.52
max_caseload	−32.99	11.79	-2.7968311	7.508768e-03	−56.11	−9.87
total_poverty_population	−5.81	9.72	-0.5975456	5.530735e-01	−24.85	13.24

# forest plot
ggplot(coef_df_std, aes(x = estimate, y = term,
                        xmin = conf.low, xmax = conf.high,
                        color = model)) +
  geom_point(size = 2) +
  geom_errorbarh(height = 0.2) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "grey40") +
  labs(
    title = "Standardized Coefficient Plot with Robust SEs (95% CI)",
    x = "Standardized Estimate (per 1 SD change in predictor)",
    y = "Predictor"
  ) +
  theme_minimal(base_size = 14) +
  theme(legend.position = "bottom") +
  facet_wrap(~model, scales = "free_x")

Econometric Models:Weighted Least Squares(WLS), Population Level Inference

# Run models

m1_wls <- lm(families_served ~ total_funding + HV_staff_FTE + agencies + max_caseload + total_poverty_population, data = data, weights = total_poverty_population)
m2_wls <- lm(total_home_visits ~ total_funding + HV_staff_FTE + agencies + max_caseload + total_poverty_population, data = data,  weights = total_poverty_population)
m3_wls <- lm(visits_per_FTE    ~ funding_per_family + agencies + max_caseload + total_poverty_population, data = data,  weights = total_poverty_population)


# Collect robust results

coef_df_wls <- bind_rows(
  tidy_robust(m1_wls, "Robust WLS: families_served"),
  tidy_robust(m2_wls, "Robust WLS: total_home_visits"),
  tidy_robust(m3_wls, "Robust WLS: visits_per_FTE")
) %>%
  filter(term != "(Intercept)")


# print

#print(coef_df_wls)

#  pretty table

coef_df_wls %>%
  gt(groupname_col = "model") %>%
  gt::fmt_number(columns = c(estimate, std.error, conf.low, conf.high),
                 decimals = 2) %>%
  gt::tab_header(title = "WLS with Robust SE (HC1)")

term	estimate	std.error	statistic	p.value	conf.low	conf.high
WLS with Robust SE (HC1)
Robust WLS: families_served
total_funding	0.00	0.00	-0.6694296	5.066419e-01	0.00	0.00
HV_staff_FTE	−3.64	0.47	-7.7312319	8.508534e-10	−4.56	−2.71
agencies	40.00	9.56	4.1825367	1.315608e-04	21.26	58.75
max_caseload	0.88	0.06	15.6885357	7.295050e-20	0.77	0.99
total_poverty_population	0.00	0.00	1.0119571	3.169690e-01	0.00	0.01
Robust WLS: total_home_visits
total_funding	0.00	0.00	-1.4797539	1.459062e-01	0.00	0.00
HV_staff_FTE	−46.53	18.55	-2.5079307	1.581869e-02	−82.90	−10.17
agencies	740.59	336.14	2.2031886	3.274287e-02	81.75	1,399.43
max_caseload	10.81	2.05	5.2669365	3.763857e-06	6.79	14.84
total_poverty_population	−0.05	0.04	-1.3168081	1.945699e-01	−0.13	0.02
Robust WLS: visits_per_FTE
funding_per_family	0.00	0.00	-6.2316068	1.300715e-07	0.00	0.00
agencies	1.42	2.15	0.6614850	5.116013e-01	−2.79	5.63
max_caseload	−0.02	0.01	-2.8082815	7.285133e-03	−0.03	−0.01
total_poverty_population	0.00	0.00	-0.5624614	5.765326e-01	0.00	0.00

Forest Plot for coefficeint

ggplot(coef_df_wls, aes(x = estimate, y = term, 
                    xmin = conf.low, xmax = conf.high,
                    color = model)) +
  geom_point(position = position_dodge(width = 0.7), size = 2) +
  geom_errorbarh(position = position_dodge(width = 0.7), height = 0.2) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "grey40") +
  labs(
    title = "WLS Coefficient Plot with Robust SEs (95% CI)",
    x = "Estimate",
    y = "Predictor"
  ) +
  theme_minimal(base_size = 14) +
  theme(legend.position = "bottom")+
  facet_wrap(~model, scales = "free_x")

Standardize predictors before fitting models

# standardize selected predictors
data_std <- data %>%
  # remove NA or negative weights first
  filter(!is.na(total_poverty_population) & total_poverty_population > 0) %>%
  # then standardize predictors
  mutate(across(
    c(total_funding, HV_staff_FTE, agencies, max_caseload, 
       funding_per_family),
    ~ as.numeric(scale(.))
  ))

m1_wls_std <- lm(families_served ~ total_funding + HV_staff_FTE + agencies + max_caseload + total_poverty_population, data = data_std, weights = total_poverty_population)
m2_wls_std <- lm(total_home_visits ~ total_funding + HV_staff_FTE + agencies + max_caseload + total_poverty_population, data = data_std, weights = total_poverty_population)
m3_wls_std <- lm(visits_per_FTE    ~ funding_per_family + agencies + max_caseload + total_poverty_population, data = data_std, weights = total_poverty_population)

coef_df_std_wls <- bind_rows(
  tidy_robust(m1_wls_std, "families_served"),
  tidy_robust(m2_wls_std, "total_home_visits"),
  tidy_robust(m3_wls_std, "visits_per_FTE")
) %>% filter(term != "(Intercept)")


# pretty table

coef_df_std_wls %>%
  gt(groupname_col = "model") %>%
  gt::fmt_number(columns = c(estimate, std.error, conf.low, conf.high),
                 decimals = 2) %>%
  gt::tab_header(title = "WLS with Robust SE (HC1)")

term	estimate	std.error	statistic	p.value	conf.low	conf.high
WLS with Robust SE (HC1)
families_served
total_funding	−49.74	74.31	-0.6694296	5.066419e-01	−195.39	95.90
HV_staff_FTE	−551.53	71.34	-7.7312319	8.508534e-10	−691.35	−411.71
agencies	397.18	94.96	4.1825367	1.315608e-04	211.05	583.30
max_caseload	1,379.87	87.95	15.6885357	7.295050e-20	1,207.48	1,552.26
total_poverty_population	0.00	0.00	1.0119571	3.169690e-01	0.00	0.01
total_home_visits
total_funding	−3,023.69	2,043.37	-1.4797539	1.459062e-01	−7,028.69	981.32
HV_staff_FTE	−7,058.44	2,814.45	-2.5079307	1.581869e-02	−12,574.75	−1,542.12
agencies	7,353.07	3,337.47	2.2031886	3.274287e-02	811.63	13,894.51
max_caseload	17,002.64	3,228.18	5.2669365	3.763857e-06	10,675.40	23,329.88
total_poverty_population	−0.05	0.04	-1.3168081	1.945699e-01	−0.13	0.02
visits_per_FTE
funding_per_family	−38.40	6.16	-6.2316068	1.300715e-07	−50.48	−26.33
agencies	14.11	21.33	0.6614850	5.116013e-01	−27.69	55.91
max_caseload	−27.33	9.73	-2.8082815	7.285133e-03	−46.40	−8.26
total_poverty_population	0.00	0.00	-0.5624614	5.765326e-01	0.00	0.00

# forest plot
ggplot(coef_df_std_wls, aes(x = estimate, y = term,
                        xmin = conf.low, xmax = conf.high,
                        color = model)) +
  geom_point(size = 2) +
  geom_errorbarh(height = 0.2) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "grey40") +
  labs(
    title = "WLS Standardized Coefficient Plot with Robust SEs (95% CI)",
    x = "Standardized Estimate (per 1 SD change in predictor)",
    y = "Predictor"
  ) +
  theme_minimal(base_size = 14) +
  theme(legend.position = "bottom") +
  facet_wrap(~model, scales = "free_x")

Efficiency Index (actual vs predicted visits)

data_pred <- data %>%
  mutate(
    pred_visits = as.numeric(predict(m2, newdata = data)),
    efficiency_visits = if_else(!is.na(pred_visits) & pred_visits > 0, total_home_visits / pred_visits, NA_real_)
  )


top_eff <- data_pred %>%
  arrange(desc(efficiency_visits)) %>%
  dplyr::select(states, total_home_visits, pred_visits, efficiency_visits) %>%
  slice_head(n = 10)


bot_eff <- data_pred %>%
  arrange(efficiency_visits) %>%
  dplyr::select(states, total_home_visits, pred_visits, efficiency_visits) %>%
  slice_head(n = 10)


tbl(top_eff, "Top 10 by efficiency (Actual / Predicted)")

states	total_home_visits	pred_visits	efficiency_visits
Top 10 by efficiency (Actual / Predicted)
New Jersey	21,002.0	2,422.9	8.7
Wyoming	4,724.0	815.4	5.8
Tennessee	20,110.0	4,669.0	4.3
Oklahoma	12,119.0	2,860.8	4.2
Hawaii	8,105.0	2,116.9	3.8
Indiana	22,089.0	7,589.2	2.9
Connecticut	25,146.0	9,936.0	2.5
Missouri	9,381.0	3,868.9	2.4
Iowa	12,172.0	6,298.0	1.9
North Carolina	6,813.0	3,531.0	1.9

tbl(bot_eff, "Bottom 10 by efficiency (Actual / Predicted)")

states	total_home_visits	pred_visits	efficiency_visits
Bottom 10 by efficiency (Actual / Predicted)
Mississippi	15.0	16,445.2	0.0
Vermont	3,551.0	12,483.1	0.3
South Dakota	1,589.0	4,567.6	0.3
Montana	12,425.0	26,265.1	0.5
Nebraska	11,259.0	22,687.8	0.5
Maryland	13,955.0	26,206.6	0.5
Arkansas	21,697.0	39,273.9	0.6
West Virginia	21,181.0	38,324.2	0.6
Virginia	13,411.0	23,233.5	0.6
Massachusetts	23,246.0	35,128.8	0.7

Plot for Efficiency Index

# Bars
p_top <- ggplot(top_eff, aes(reorder(states, efficiency_visits), efficiency_visits)) +
  geom_col(fill = "steelblue") + coord_flip() + theme_minimal() +
  labs(title = "Top 10 Efficiency", x = NULL, y = "Actual/Predicted")
plotly::ggplotly(p_top)

p_bot <- ggplot(bot_eff, aes(reorder(states, efficiency_visits), efficiency_visits)) +
  geom_col(fill="skyblue") + coord_flip() + theme_minimal() +
  labs(title = "Bottom 10 Efficiency", x = NULL, y = "Actual/Predicted")
plotly::ggplotly(p_bot)

Composite Performance Index (z-score blend)

Higher is better for: visits_per_1k_poverty, families_per_1k_poverty, visits_per_FTE, caseload_utilization

Lower is better for: funding_per_visit, funding_per_family (so we invert their z)

z_good <- c("visits_per_1k_poverty","families_per_1k_poverty","visits_per_FTE","caseload_utilization")
z_bad  <- c("funding_per_visit","funding_per_family")

z_tbl <- data %>%
  mutate(
    across(all_of(z_good), ~ as.numeric(scale(.)), .names = "{.col}_z"),
    across(all_of(z_bad),  ~ as.numeric(scale(.)) * -1, .names = "{.col}_z") # invert cost
  ) %>%
  mutate(
    Performance_Index = rowMeans(dplyr::select(., ends_with("_z")), na.rm = TRUE)
  ) %>%
  dplyr::select(states, Performance_Index, ends_with("_z"))

# composite performance index table

league_table <- z_tbl %>%
  arrange(desc(Performance_Index)) %>%
  mutate(Rank = row_number()) %>%
  dplyr::select(Rank, states, Performance_Index)

dt(league_table, "League table — Composite Performance Index (higher = better)")

plot performance table

p_league <- ggplot(league_table %>% slice_head(n = 20),
                   aes(reorder(states, Performance_Index), Performance_Index)) +
  geom_col(fill="#2ca02c") + coord_flip() + theme_minimal() +
  labs(title = "Top 20: Composite Performance Index", x = NULL, y = "Index")
plotly::ggplotly(p_league)

Influence Check (outliers for M2, Cooks distance)

check_cooks <- function(model, data, cutoff_rule = 4, top_n = 10) {
  cd <- cooks.distance(model)
  cutoff <- cutoff_rule / nrow(data)
  
  infl_df <- tibble(
    states   = data$states,
    state_code = data$state_code,
    total_home_visits = data$total_home_visits,
    total_funding = data$total_funding,
    HV_staff_FTE = data$HV_staff_FTE,
    total_poverty_population = data$total_poverty_population,
    cooks_d = as.numeric(cd)
  ) %>%
    arrange(desc(cooks_d))
  
  cat("Cook's Distance cutoff =", round(cutoff, 4), "\n\n")
  
  # --- Part A: threshold violations ---
  if (any(cd > cutoff)) {
    tbl(
      infl_df %>% filter(cooks_d > cutoff),
      paste0("Cases above Cook's Distance cutoff (", deparse(substitute(model)), ")")
    )
  } else {
    cat("No cases above the cutoff.\n\n")
  }
  
  # --- Part B: top-N ranking ---
  tbl(
    infl_df %>%  slice_head(n = top_n),
    paste0("Top ", top_n, " Cook's Distance values (", deparse(substitute(model)), ")")
  )
  
  # --- Part C: Cook’s Distance plot ---
  p <- ggplot(infl_df, aes(x = seq_along(cooks_d), y = cooks_d)) +
    geom_col(fill = "#1f77b4") +
    geom_hline(yintercept = cutoff, color = "red", linetype = "dashed") +
    labs(
      title = paste0("Cook's Distance (", deparse(substitute(model)), ")"),
      x = "Observation Index",
      y = "Cook's Distance"
    ) +
    theme_minimal()
  
  print(p)  # show plot
  
  invisible(infl_df)  # return full dataframe for further use
}

Equity Gap Analysis (Per-1k vs Quartile)

equity_gap <- data %>%
  group_by(poverty_quartile) %>%
  summarise(
    avg_funding_per_1k = mean(funding_per_1k_poverty, na.rm = TRUE),
    avg_visits_per_1k  = mean(visits_per_1k_poverty, na.rm = TRUE),
    .groups ="drop"
  )
tbl(equity_gap, "Equity gap by poverty-load quartile")

poverty_quartile	avg_funding_per_1k	avg_visits_per_1k
Equity gap by poverty-load quartile
Lowest load	8,059,051.9	23,355.5
Q2	2,592,599.4	5,166.8
Q3	1,303,415.7	2,312.0
Highest load	376,162.8	709.4

# plot

q_long <- equity_gap %>% 
  pivot_longer(-poverty_quartile, names_to = "metric", values_to = "value")
ggplot(q_long, aes(poverty_quartile, value, fill = metric)) +
  geom_col(position = "dodge") +
  theme_minimal() +
  labs(title = "Equity metrics by quartile", x = NULL, y = "Average")

Efficiency Benchmarking (Frontier view)

ggplot(data, aes(total_funding/1e6, total_home_visits, label = state_code)) +
  geom_point(color = "steelblue") +
  ggrepel::geom_text_repel(size = 3, max.overlaps = 20) +
  labs(x = "Funding (Millions USD)", y = "Total Home Visits",
       title = "Efficiency frontier: Funding vs Visits") +
  theme_minimal()

Cluster + PCA

# select, scale, cluster
clust_vars <- data %>%
  select(funding_per_1k_poverty, families_per_1k_poverty, visits_per_FTE, caseload_utilization)
set.seed(123)
clust_scaled <- scale(clust_vars)
km <- kmeans(clust_scaled, centers = 4, nstart = 25)
data <- data %>% mutate(cluster = factor(km$cluster))

# Counts & membership
states_by_cluster <- data %>%
  group_by(cluster) %>%
  summarise(n_states = n(), state_codes = paste(state_code, collapse = ", "), .groups = "drop")
dt(states_by_cluster, "States by cluster")

# Unscaled cluster means
cluster_means <- t(apply(km$centers, 1, function(row) {
  row * attr(clust_scaled, "scaled:scale") + attr(clust_scaled, "scaled:center")
})) %>%
  as.data.frame() %>%
  tibble::rownames_to_column("cluster")
tbl(cluster_means, "Cluster centers (original units)")

cluster	funding_per_1k_poverty	families_per_1k_poverty	visits_per_FTE	caseload_utilization
Cluster centers (original units)
1	1,529,447.1	229.2	311.7	1.6
2	13,116,572.3	3,088.7	260.8	1.2
3	2,668,941.7	166.4	67.5	0.5
4	2,134,562.2	321.2	211.8	1.2

# PCA plot
pca <- prcomp(clust_vars, center = TRUE, scale. = TRUE)
pca_data <- as.data.frame(pca$x[,1:2]) %>%
  mutate(states = data$states, state_code = data$state_code, cluster = data$cluster)

# Convex hulls
hulls <- pca_data %>%
  group_by(cluster) %>%
  slice(chull(PC1, PC2))

p_pca <- ggplot(pca_data, aes(PC1, PC2, color = cluster, text = state_code)) +
  geom_polygon(data = hulls, aes(group = cluster, fill = cluster), 
               alpha = 0.2, color = NA) +
  geom_point(size = 4, alpha = 0.9) +
  scale_color_brewer(palette = "Set1") +
  scale_fill_brewer(palette = "Set1") +
  labs(title = "State clusters (PCA of equity & efficiency metrics)") +
  theme_minimal()

plotly::ggplotly(p_pca, tooltip = "text")

PCA for visualization

hulls <- pca_data %>%
  group_by(cluster) %>%
  slice(chull(PC1, PC2))   # convex hull points per cluster

ggplot(pca_data, aes(PC1, PC2, color = cluster, fill = cluster)) +
  geom_polygon(data = hulls, aes(group = cluster), alpha = 0.2, color = NA) + # shaded hull
  geom_point(size = 4, alpha = 0.9) +
  geom_text(aes(label = state_code), size = 3, vjust = -1, check_overlap = TRUE) +
  scale_color_brewer(palette = "Set1") +
  scale_fill_brewer(palette = "Set1") +
  labs(
    title = "State Clusters (Convex Hulls in PCA space)",
    x = "Principal Component 1",
    y = "Principal Component 2"
  ) +
  theme_minimal(base_size = 14)

Raw metric scatter plot

Scatter 1: Funding per 1k poverty vs Visits per FTE

hulls <- data %>%
  group_by(cluster) %>%
  slice(chull(funding_per_1k_poverty, visits_per_FTE))

ggplot(data, aes(x = funding_per_1k_poverty, y = visits_per_FTE, 
                 color = as.factor(cluster), label = state_code)) +
  geom_polygon(data = hulls, aes(group = cluster, fill = as.factor(cluster)), 
               alpha = 0.2, color = NA) +
  geom_point(size = 4, alpha = 0.8) +
  geom_text(size = 3, vjust = -1, check_overlap = TRUE) +
  scale_color_brewer(palette = "Set1") +
  scale_fill_brewer(palette = "Set1") +
  scale_x_log10(labels = scales::comma) +  # log scale because funding per 1k varies a lot
  labs(
    title = "Equity vs Efficiency: Funding vs Visits per FTE",
    subtitle = "States clustered into peer groups",
    x = "Funding per 1,000 Poverty Population (log scale)",
    y = "Visits per FTE",
    color = "Cluster"
  ) +
  theme_minimal(base_size = 14)

# Scatter 2: Families per 1k poverty vs Caseload Utilization
ggplot(data, aes(x = families_per_1k_poverty, y = caseload_utilization, 
                 color = as.factor(cluster), label = state_code)) +
  geom_polygon(data = hulls, aes(group = cluster, fill = as.factor(cluster)), 
               alpha = 0.2, color = NA) +
  geom_point(size = 4, alpha = 0.8) +
  geom_text(size = 3, vjust = -1, check_overlap = TRUE) +
  scale_color_brewer(palette = "Set1") +
  labs(
    title = "Reach vs Capacity Utilization",
    subtitle = "Families per 1k poverty vs Caseload Utilization",
    x = "Families per 1,000 Poverty Population",
    y = "Caseload Utilization",
    color = "Cluster"
  ) +
  theme_minimal(base_size = 14)

Prep data for mapping

make_geo_map <- function(df, value_col, title, ztitle){
  plot_geo(df, locationmode = "USA-states") %>%
    add_trace(
      z = ~get(value_col),
      locations = ~state_code,
      color = ~get(value_col),
      colors = "Blues",
      text = ~paste0(states, " (", state_code, ")<br>", ztitle, ": ",
                     ifelse(is.numeric(get(value_col)),
                            scales::comma(get(value_col), accuracy = 0.1),
                            get(value_col))),
      hoverinfo = "text"
    ) %>%
    colorbar(title = ztitle) %>%
    layout(title = title, geo = list(scope = "usa"))
}

map1 <- make_geo_map(data, "funding_per_1k_poverty",  "Funding per 1,000 poverty population", "USD per 1k")
map2 <- make_geo_map(data, "families_per_1k_poverty", "Families served per 1,000 poverty population", "Families per 1k")
map3 <- make_geo_map(data, "visits_per_FTE",          "Visits per FTE", "Visits per FTE")

# Composite index map joins on state_code
map_index_df <- z_tbl %>% left_join(data %>% select(states, state_code), by = "states") %>% distinct()
map4 <- make_geo_map(map_index_df, "Performance_Index", "Composite Performance Index", "Index")

map1; map2; map3; map4

MIECHV HEOR Analysis - 2023

Dr. Keshav Kumar

11 September, 2025

Setup

Libraries

Read & Clean data

Derived Metrics (HEOR)

Quartile Cut-off

States by Quartile

Descriptive Table

Numeric Summary (wide table + boxplots)

Correlations (key variables)

Econometric Models:Ordinary Least Squares (OLS), State Level Inference

Forest Plot for coefficeint

Standardize predictors before fitting models

Econometric Models:Weighted Least Squares(WLS), Population Level Inference

Forest Plot for coefficeint

Standardize predictors before fitting models

Efficiency Index (actual vs predicted visits)

Plot for Efficiency Index

Composite Performance Index (z-score blend)

Higher is better for: visits_per_1k_poverty, families_per_1k_poverty, visits_per_FTE, caseload_utilization

Lower is better for: funding_per_visit, funding_per_family (so we invert their z)

plot performance table

Influence Check (outliers for M2, Cooks distance)

Equity Gap Analysis (Per-1k vs Quartile)

Efficiency Benchmarking (Frontier view)

Cluster + PCA

PCA for visualization

Raw metric scatter plot

Scatter 1: Funding per 1k poverty vs Visits per FTE

Prep data for mapping