School Bus Commuting Analysis: Data Preparation, Indicators, and Descriptive Modelling

Libraries, data import, harmonisation, and core recoding

## =========================
## 0) Packages (minimal, no duplicates)
## =========================
suppressPackageStartupMessages({
library(tidyverse)   # readr, dplyr, tidyr, ggplot2, stringr, purrr, etc.
library(janitor)     # remove_empty()
library(lubridate)   # mdy_hms()
library(stringi)     # stri_trans_general()
library(rlang)

library(leaflet)
library(broom)
library(MASS)
library(emmeans)
library(multcomp)
library(splines)
library(ordinal)
library(DescTools)
library(pscl)
library(summarytools)
library(RColorBrewer)
})

Warning: package 'leaflet' was built under R version 4.5.2

Warning: package 'emmeans' was built under R version 4.5.2

Warning: package 'multcomp' was built under R version 4.5.2

Warning: package 'ordinal' was built under R version 4.5.2

Warning: package 'DescTools' was built under R version 4.5.2

Warning: package 'pscl' was built under R version 4.5.2

## =========================
## 1) Read data + remove empties
## =========================
base <- readr::read_csv(
  "C:/Users/benjamin.mottebaumvo/Documents/Donnees/School_Bus_2025/Parent Survey on commute to private school.csv",
  show_col_types = FALSE,
  name_repair = "minimal"
) %>%
  janitor::remove_empty(c("rows", "cols")) %>%
  janitor::clean_names() %>%
  dplyr::slice(-1)

## =========================
## 2) Helpers (vectorised, safe)
## =========================
to_chr <- function(x) as.character(x)

to_int <- function(x) {
  # faster + safer than parse_number() for typical SurveyMonkey integer codes
  suppressWarnings(as.integer(readr::parse_number(to_chr(x))))
}

ord_fac <- function(x, k) {
  v <- to_int(x)
  factor(v, levels = seq_len(k), ordered = TRUE)
}

# map arbitrary codes to 1..K (e.g., 1,2,4,5,6 -> 1..5)
recode_map <- function(x, map) {
  v <- unname(map[match(to_chr(x), names(map))])
  factor(v, levels = sort(unique(unname(map))), ordered = TRUE)
}

## =========================
## 3) Standardised column names
## =========================
base <- base %>%
  rlang::set_names(c(
    "id","collector","start","end","flag","lang","role",
    "n_child_priv","same_school","old_young","q16","child_gender",
    "child_age","school_city","school_name1","school_name2","school_name3",
    "dist_home_school","to_bus","to_car","to_walk","to_mom","to_dad","to_oth","to_nan",
    "fr_bus","fr_car","fr_walk","fr_mom","fr_dad","fr_oth","fr_nan",
    "bus_stop","bus_cost","bus_time","bus_sched","bus_rely","bus_super",
    "car_cost","car_time","car_park","car_rout","bus_none",
    "bus_short","bus_lower","bus_improv","bus_oneway","bus_cust","bus_close","bus_more",
    "bus_oth","res","res_oth1","neighbor","hous_type","res_neighbor","res_spec",
    "res_work","res_school","res_cost","res_fami","res_empl","res_oth2","n_cars",
    "elec_car","income","hh_none","hh_partn","hh_oth1","hh_dom","hh_oth2",
    "years_uae","cont","age","job","edu","license","partn",
    "partn_years_uae","partn_cont","partn_age","partn_job","partn_edu","partn_license"
  ))

## =========================
## 4) Variable groups (single source of truth)
## =========================
ord15_map <- c("to_bus","to_car","to_walk","fr_bus","fr_car","fr_walk")
ord15_plain <- c("to_mom","to_dad","to_oth","to_nan","fr_mom","fr_dad","fr_oth","fr_nan")
ord16 <- c("bus_stop","bus_cost","bus_time","bus_sched","bus_rely","bus_super",
           "car_cost","car_time","car_park","car_rout")
bin_int_misc <- c("res","res_neighbor","res_spec","res_work","res_school","res_cost",
                  "hh_none","hh_partn","hh_oth1","hh_dom","hh_oth2",
                  "bus_none","bus_short","bus_lower","bus_improv",
                  "bus_oneway","bus_cust","bus_close","bus_more")

map_1to5 <- c("1"=1, "2"=2, "4"=3, "5"=4, "6"=5)

## =========================
## 5) Main harmonisation (one mutate + across)
## =========================
base <- base %>%
  mutate(
    # IDs / dates
    id        = to_chr(id),
    collector = to_int(collector),
    start     = lubridate::mdy_hms(start, tz = "Asia/Dubai"),
    end       = lubridate::mdy_hms(end,   tz = "Asia/Dubai"),

    # respondent metadata
    lang     = factor(lang),
    role     = to_int(role),
    role_fct = factor(role, levels = c(1, 2), labels = c("father", "mother")),

    # child / school
    n_child_priv = to_int(n_child_priv),
    same_school  = factor(to_int(same_school), levels = 1:2, labels = c("yes","no")),
    child_gender = factor(to_int(child_gender), levels = 1:2),
    child_age    = to_int(child_age),
    school_city  = to_int(school_city),

    # coerce school name codes once
    across(c(school_name1, school_name2, school_name3), to_int),

    school_number = coalesce(school_name1, school_name2, school_name3),
    school_id     = sprintf("%01d%03d", school_city, school_number),

    # ordered outcomes / covariates
    dist_home_school = ord_fac(dist_home_school, 7),
    n_cars    = to_int(n_cars),
    elec_car  = factor(to_int(elec_car), levels = 1:3, ordered = TRUE),
    income    = ord_fac(income, 7),
    hous_type = factor(to_int(hous_type), levels = 1:4, ordered = TRUE),

    years_uae = ord_fac(years_uae, 4),
    cont      = factor(to_int(cont), levels = 1:7),
    age       = to_int(age),
    job       = ord_fac(job, 5),
    edu       = ord_fac(edu, 6),
    license   = ord_fac(license, 3),

    partn_years_uae = ord_fac(partn_years_uae, 4),
    partn_cont      = factor(to_int(partn_cont), levels = 1:7),
    partn_age       = to_int(partn_age),
    partn_job       = ord_fac(partn_job, 5),
    partn_edu       = ord_fac(partn_edu, 6),
    partn_license   = ord_fac(partn_license, 3)
  ) %>%
  mutate(
    across(any_of(ord15_map),   ~ recode_map(.x, map_1to5)), # 1,2,4,5,6 -> 1..5
    across(any_of(ord15_plain), ~ ord_fac(.x, 5)),           # already 1..5
    across(any_of(ord16),       ~ ord_fac(.x, 6)),
    across(any_of(bin_int_misc), to_int)
  )

School name normalisation and linkage with ADEK reference data

## =========================
## Normalise school names + build ADEK linkage (optimised)
## =========================

# Fast, consistent normalisation (lowercase, drop punctuation, squish spaces, Latin → ASCII)
norm_latin <- function(x) {
  x %>%
    stringr::str_to_lower() %>%
    stringr::str_replace_all("[[:punct:]]+", " ") %>%  # collapse runs of punct
    stringr::str_squish() %>%
    stringi::stri_trans_general("Latin-ASCII")
}

# 1) ADEK school table: import + deduplicate on normalised title
donnees_school_uni <- readr::read_csv(
  "C:/Users/benjamin.mottebaumvo/Documents/Donnees/School_Bus_2025/donnees_school.csv",
  show_col_types = FALSE
) %>%
  dplyr::select(-...3) %>%
  dplyr::mutate(title_norm = norm_latin(title)) %>%
  dplyr::distinct(title_norm, .keep_all = TRUE)  # keeps first row per school

New names:
• `` -> `...3`

# 2) Code–name table: import + build keys once
school_code_name <- readr::read_csv(
  "C:/Users/benjamin.mottebaumvo/Documents/Donnees/School_Bus_2025/school_code_name.csv",
  show_col_types = FALSE
) %>%
  dplyr::mutate(
    school_name_unique = norm_latin(NAME),
    school_id          = sprintf("%01d%03d", CITY, NUMBER)
  )

# 3) Link table: many reported names → one ADEK school record (by normalised name)
school_link <- school_code_name %>%
  dplyr::left_join(donnees_school_uni, by = c("school_name_unique" = "title_norm"))

Merging survey data with ADEK school information

## 5) Final merge: survey base + ADEK school attributes
base <- base %>%
  dplyr::left_join(school_link, by = "school_id")

Overlay map: all schools vs respondent schools

## 1. Nombre de répondants par école (avec coordonnées)
## 0. Table de toutes les écoles
schools_all <- donnees_school_uni %>%
  mutate(
    latitude  = as.numeric(latitude),
    longitude = as.numeric(longitude)
  ) %>%
  filter(!is.na(latitude), !is.na(longitude)) %>%
  distinct(title, latitude, longitude, .keep_all = TRUE)

## 1. Nombre de répondants par école
schools_map <- base %>%
  filter(
    !is.na(school_id),
    !is.na(latitude),
    !is.na(longitude)
  ) %>%
  group_by(school_id, NAME, latitude, longitude) %>%
  summarise(
    n_resp = n(),
    .groups = "drop"
  )

## Use plain vectors (not named) to avoid jsonlite warnings
pal_cols  <- c("black", "blue")
pal_labs  <- c("All schools", "Respondents")

leaflet::leaflet() %>%
  leaflet::addProviderTiles(leaflet::providers$CartoDB.Positron) %>%

  leaflet::addCircleMarkers(
    data = schools_all,
    lng  = ~longitude, lat = ~latitude,
    radius = 3,
    color = pal_cols[1], fillColor = pal_cols[1],
    fillOpacity = 0.8, stroke = FALSE,
    group = pal_labs[1],
    popup = ~sprintf("<b>%s</b>", title)
  ) %>%

  leaflet::addCircleMarkers(
    data = schools_map,
    lng  = ~longitude, lat = ~latitude,
    radius = ~pmax(3, sqrt(n_resp)),
    color = pal_cols[2], fillColor = pal_cols[2],
    fillOpacity = 0.7, stroke = FALSE,
    group = pal_labs[2],
    popup = ~sprintf(
      "<b>%s</b><br>School ID: %s<br>Respondents: %s",
      NAME, school_id, n_resp
    )
  ) %>%

  leaflet::addLayersControl(
    overlayGroups = pal_labs,
    options = leaflet::layersControlOptions(collapsed = FALSE)
  ) %>%

  leaflet::addLegend(
    position = "bottomright",
    colors = pal_cols,
    labels = pal_labs,
    opacity = 0.9,
    title = "Legend"
  )

Reconstruct father/mother characteristics independent of respondent

# Convert SurveyMonkey-style fields to integer safely
to_int <- function(x) suppressWarnings(as.integer(readr::parse_number(as.character(x))))

base <- base %>%
  mutate(
    # Numeric role indicator
    # 1 = father respondent, 2 = mother respondent
    role_i = to_int(role),

    # Respondent characteristics (as reported by the respondent)
    years_uae_i = to_int(years_uae),
    cont_i      = to_int(cont),
    age_i       = to_int(age),
    job_i       = to_int(job),
    edu_i       = to_int(edu),
    license_i   = to_int(license),

    # Partner characteristics (as reported in the partner module)
    partn_years_uae_i = to_int(partn_years_uae),
    partn_cont_i      = to_int(partn_cont),
    partn_age_i       = to_int(partn_age),
    partn_job_i       = to_int(partn_job),
    partn_edu_i       = to_int(partn_edu),
    partn_license_i   = to_int(partn_license),

    # Father characteristics
    # If the respondent is the father (role_i == 1), take respondent values
    # Otherwise (respondent is the mother), take partner values
    father_years_uae = if_else(role_i == 1L, years_uae_i, partn_years_uae_i),
    father_cont      = if_else(role_i == 1L, cont_i,      partn_cont_i),
    father_age       = if_else(role_i == 1L, age_i,       partn_age_i),
    father_job       = if_else(role_i == 1L, job_i,       partn_job_i),
    father_edu       = if_else(role_i == 1L, edu_i,       partn_edu_i),
    father_license   = if_else(role_i == 1L, license_i,   partn_license_i),

    # Mother characteristics
    # If the respondent is the mother (role_i == 2), take respondent values
    # Otherwise (respondent is the father), take partner values
    mother_years_uae = if_else(role_i == 2L, years_uae_i, partn_years_uae_i),
    mother_cont      = if_else(role_i == 2L, cont_i,      partn_cont_i),
    mother_age       = if_else(role_i == 2L, age_i,       partn_age_i),
    mother_job       = if_else(role_i == 2L, job_i,       partn_job_i),
    mother_edu       = if_else(role_i == 2L, edu_i,       partn_edu_i),
    mother_license   = if_else(role_i == 2L, license_i,   partn_license_i)
  ) %>%
  mutate(
    # If the respondent role is invalid or missing, set parent variables to NA
    across(starts_with("father_"), ~ if_else(role_i %in% 1:2, .x, NA_integer_)),
    across(starts_with("mother_"), ~ if_else(role_i %in% 1:2, .x, NA_integer_))
  ) %>%
  # Remove intermediate variables used only for construction
  dplyr::select(-role_i, -ends_with("_i"))

Plot styling and label dictionaries

## =========================
## Plot theme + shared labels (optimised)
## =========================

theme_modern <- function(base_size = 12, base_family = "") {
  ggplot2::theme_minimal(base_size = base_size, base_family = base_family) +
    ggplot2::theme(
      plot.title    = ggplot2::element_text(face = "bold", size = base_size + 4, hjust = 0),
      plot.subtitle = ggplot2::element_text(size = base_size + 1, colour = "grey30", hjust = 0),

      axis.title.y = ggplot2::element_text(size = base_size + 1, margin = ggplot2::margin(r = 8)),
      axis.text.x  = ggplot2::element_text(size = base_size, margin = ggplot2::margin(t = 5)),
      axis.text.y  = ggplot2::element_text(size = base_size),

      legend.position   = "top",
      legend.box.margin = ggplot2::margin(t = -5, b = -10),
      legend.margin     = ggplot2::margin(b = -10),

      plot.margin = ggplot2::margin(t = 5, r = 10, b = 40, l = 5),

      panel.grid.minor.y = ggplot2::element_line(colour = "grey92", linewidth = 0.25),
      panel.grid.major.y = ggplot2::element_line(colour = "grey80", linewidth = 0.4),
      panel.grid.major.x = ggplot2::element_blank()
    )
}

freq_labels <- c(
  never = "Never",
  sometimes = "Sometimes (<1/week)",
  one = "1 day/week",
  two_three = "2–3 days/week",
  four_plus = "4+ days/week"
) %>%
  unname()

dist_labels <- c(
  "<5 min",
  "5–10 min",
  "10–15 min",
  "15–20 min",
  "20–25 min",
  "25–30 min",
  ">30 min"
)

Descriptive plot: home–school distance distribution

# --- Préparation des données ---
df_dist <- base |>
  filter(!is.na(dist_home_school)) |>
  count(dist_home_school) |>
  mutate(
    percent = 100 * n / sum(n),
    # inversion de l’ordre ici :
    dist_home_school = factor(dist_labels[as.numeric(dist_home_school)],
                              levels = rev(dist_labels))   # <-- ordre inversé
  )

# --- Palette qualitative ---
palette_dist <- brewer.pal(n = length(dist_labels), name = "Set2")

# --- Graphique ---
ggplot(df_dist, aes(x = percent, y = dist_home_school, fill = dist_home_school)) +
  geom_col(width = 0.7) +
  scale_fill_manual(values = palette_dist) +
  scale_x_continuous(
    expand = c(0,0),
    limits = c(0, 25),
    labels = scales::percent_format(scale = 1)
  ) +
  labs(
    x = NULL,
    y = NULL,
    title = "Driving distance between home and school"
  ) +
  theme_modern() +
  theme(legend.position = "none")

Descriptive plot: home–school transport mode

## =========================
## Mode frequency stacked bars (optimised)
## Keeps "Never" for denominator, hides it in the plot
## =========================

mode_map <- c(
  to_bus  = "School bus",
  to_car  = "Car",
  to_walk = "Walk/scooter/bike"
)

freq_order_plot <- c("4+ days/week", "2–3 days/week", "1 day/week", "Sometimes (<1/week)")

df_plot <- base %>%
  transmute(
    to_bus  = to_bus,
    to_car  = to_car,
    to_walk = to_walk
  ) %>%
  pivot_longer(
    cols = everything(),
    names_to = "mode_raw",
    values_to = "freq_code"
  ) %>%
  mutate(
    mode = dplyr::recode(mode_raw, !!!mode_map),
    frequency = factor(freq_code, levels = levels(base$to_bus), labels = freq_labels)
  ) %>%
  filter(!is.na(frequency)) %>%
  count(mode, frequency, name = "n") %>%
  group_by(mode) %>%
  mutate(percent = 100 * n / sum(n)) %>%
  ungroup() %>%
  filter(frequency != "Never") %>%
  mutate(frequency = forcats::fct_relevel(frequency, freq_order_plot))

my_colors <- rev(RColorBrewer::brewer.pal(4, "Blues"))

ggplot(df_plot, aes(x = mode, y = percent, fill = frequency)) +
  geom_col(position = position_stack(reverse = TRUE)) +
  scale_fill_manual(values = my_colors, guide = guide_legend(reverse = TRUE)) +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 80)) +
  labs(
    x = NULL,
    y = "Share of pupils (%)",
    title = "How children usually travel from home to school"
  ) +
  theme_modern()

Descriptive plot: escort to school (stacked frequencies)

# --- Données en format long : les NA deviennent "Never"
df_escort <- base |>
  dplyr::select(to_mom, to_dad, to_oth, to_nan) |>
  pivot_longer(
    cols      = everything(),
    names_to  = "escort_raw",
    values_to = "freq_code"
  ) |>
  mutate(
    escort = recode(
      escort_raw,
      to_mom = "Mother",
      to_dad = "Father",
      to_oth = "Relative/friend/neighbour",
      to_nan = "Domestic worker"
    ),
    # freq_code peut être NA → on force à 1 ("Never")
    freq_num = ifelse(is.na(freq_code), 1L, as.integer(freq_code)),
    frequency = factor(
      freq_labels[freq_num],
      levels = freq_labels,
      ordered = TRUE
    )
  ) |>
  group_by(escort, frequency) |>
  summarise(n = n(), .groups = "drop") |>
  group_by(escort) |>
  mutate(percent = 100 * n / sum(n)) |>
  ungroup()


# pour le tracé : on ne montre pas "Never", comme pour le graphique des modes
# Pour le tracé : exclure "Never"
df_escort_plot <- df_escort |>
  filter(frequency != "Never") |>
  mutate(
    frequency = forcats::fct_relevel(
      frequency,
      "4+ days/week",
      "2–3 days/week",
      "1 day/week",
      "Sometimes (<1/week)"
    ),
    # --- ORDRE IMPOSE DES MODALITÉS ---
    escort = factor(
      escort,
      levels = c(
        "Mother",
        "Father",
        "Domestic worker",
        "Relative/friend/neighbour"
      )
    )
  )


# palette Blues inversée (4+ en bleu foncé, Sometimes en bleu clair)
my_colors <- rev(brewer.pal(4, "Blues"))

# --- Graphique ---
ggplot(df_escort_plot,
       aes(x = escort, y = percent, fill = frequency)) +
  geom_col(position = position_stack(reverse = TRUE)) +
  scale_fill_manual(values = my_colors,
                    guide = guide_legend(reverse = TRUE)) +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 70)) +
  labs(
    x = NULL,
    y = "Share of pupils (%)",
    title = "How often is the child accompanied to school?",
    #subtitle = "By type of accompanying person (excluding 'Never')",
    fill = "Frequency"
  ) +
  theme_modern()

Build the modelling dataset (cleaning, recoding, exclusivity, income imputation)

## =========================
## Pruned prep for your current models (bus-only logit + distance models)
## =========================

to_int <- function(x) suppressWarnings(as.integer(readr::parse_number(as.character(x))))

collapse_job <- function(x) {
  v <- to_int(x)
  dplyr::case_when(
    v == 1L ~ 1L,
    v %in% 2:5 ~ 0L,
    TRUE ~ NA_integer_
  )
}

collapse_license <- function(x) {
  v <- to_int(x)
  dplyr::case_when(
    v == 1L ~ 1L,
    v %in% 2:3 ~ 0L,
    TRUE ~ NA_integer_
  )
}

## =========================
## 1) Exclusivity + keep only bus-only vs car-only
## =========================

base_excl <- base %>%
  mutate(
    bus_num = to_int(to_bus),
    car_num = to_int(to_car),
    mode_exclusive = dplyr::case_when(
      bus_num == 5L ~ "bus_only",
      car_num == 5L & bus_num < 5L ~ "car_only",
      TRUE ~ NA_character_
    ) %>%
      factor(levels = c("bus_only", "car_only"))
  ) %>%
  tidyr::drop_na(mode_exclusive) %>%
  mutate(
    bus_only_bin = if_else(mode_exclusive == "bus_only", 1L, 0L)
  )

## =========================
## 2) Income imputation (kept, but simplified to produce only income_imp_num + income_imp_ord)
## =========================

impute_income <- function(df) {
  df <- df %>%
    mutate(
      income_num  = to_int(income),
      income_imp0 = dplyr::na_if(income_num, 7L),
      income_ord  = factor(income_imp0, levels = 1:6, ordered = TRUE)
    )

  preds <- c(
    "father_edu", "mother_edu",
    "father_job", "mother_job",
    "n_cars", "hous_type"
  )
  preds <- preds[preds %in% names(df)]

  df_fit <- df %>%
    dplyr::select(income_ord, dplyr::all_of(preds)) %>%
    tidyr::drop_na()

  if (nrow(df_fit) <= 100) stop("Too few complete cases to fit polr")

  df_fit[preds] <- lapply(df_fit[preds], factor)

  fit_polr <- MASS::polr(
    stats::as.formula(paste("income_ord ~", paste(preds, collapse = " + "))),
    data = df_fit,
    Hess = TRUE,
    method = "logistic"
  )

  idx_imp <- which(is.na(df$income_imp0))
  if (!length(idx_imp)) {
    return(df %>%
      mutate(
        income_imp_num = income_num,
        income_imp_ord = factor(income_num, levels = 1:6, ordered = TRUE)
      ) %>%
      dplyr::select(-income_num, -income_imp0, -income_ord)
    )
  }

  df_pred <- df[idx_imp, preds, drop = FALSE]
  df_pred[preds] <- lapply(df_pred[preds], factor)

  ok_rows <- stats::complete.cases(df_pred)
  pred_num <- rep(NA_integer_, nrow(df_pred))

  if (any(ok_rows)) {
    p_mat <- predict(fit_polr, newdata = df_pred[ok_rows, , drop = FALSE], type = "probs")
    pred_num[ok_rows] <- apply(p_mat, 1, function(p) as.integer(colnames(p_mat)[which.max(p)]))
  }

  global_mode <- as.integer(names(which.max(table(df_fit$income_ord))))
  pred_num[is.na(pred_num)] <- global_mode

  income_imputed <- df$income_num
  income_imputed[idx_imp] <- pred_num

  df %>%
    mutate(
      income_imp_num = income_imputed,
      income_imp_ord = factor(income_imp_num, levels = 1:6, ordered = TRUE)
    ) %>%
    dplyr::select(-income_num, -income_imp0, -income_ord)
}

base_excl <- impute_income(base_excl)

## =========================
## 3) Model covariates only (single pipeline)
## =========================

base_excl <- base_excl %>%
  mutate(
    ## Fees
    fee_mid = (minFee + maxFee) / 2,
    log_fee = log1p(fee_mid),

    ## Curriculum grouping
    curriculum_grp = dplyr::case_when(
      curriculumTypeCode == 1L               ~ "American",
      curriculumTypeCode == 9L               ~ "British",
      curriculumTypeCode %in% c(10L, 12L, 29L) ~ "Western_other",
      curriculumTypeCode %in% c(14L, 31L)      ~ "Indian_CBSE",
      curriculumTypeCode == 18L              ~ "MoE_UAE",
      curriculumTypeCode %in% c(15L, 22L, 23L) ~ "Other",
      TRUE                                   ~ NA_character_
    ) %>%
      factor(levels = c("American", "British", "Western_other", "Indian_CBSE", "MoE_UAE", "Other")),

    ## Child age (centred + quadratic)
    child_age_num = readr::parse_number(as.character(child_age)),
    age_c  = child_age_num - mean(child_age_num, na.rm = TRUE),
    age_c2 = age_c^2,

    ## Same-school dummy
    same_school_no = if_else(as.character(same_school) == "no", 1L, 0L),

    ## Children count recode
    n_child_priv_num = to_int(n_child_priv),
    n_child_priv_rec = dplyr::case_when(
      n_child_priv_num == 2L ~ "1 child",
      n_child_priv_num == 3L ~ "2 children",
      n_child_priv_num >= 4L ~ "3+ children",
      TRUE                   ~ NA_character_
    ) %>%
      factor(levels = c("1 child", "2 children", "3+ children")),

    ## Car ownership
    onecar_bin = if_else(to_int(n_cars) == 1L, 1L, 0L, missing = 0L),

    ## Household domestic worker
    hh_dom_bin = if_else(to_int(hh_dom) == 3L, 1L, 0L, missing = 0L),

    ## Housing type: villa indicator
    villa_bin = dplyr::case_when(
      is.na(hous_type) ~ NA_integer_,
      to_int(hous_type) == 3L ~ 1L,
      TRUE ~ 0L
    ),

    ## Residential cost reason
    res_cost_bin = if_else(to_int(res_cost) == 5L, 1L, 0L, missing = 0L),

    ## Mother licence (binary as in your models)
    mother_license_rec = collapse_license(mother_license),

    ## Mother years in UAE (5+ years category)
    mother_years_uae_5plus = if_else(to_int(mother_years_uae) == 4L, 1L, 0L, missing = 0L),

    ## Mother Europe vs non-Europe
    mother_europe = dplyr::case_when(
      to_int(mother_cont) == 2L ~ "Europe",
      to_int(mother_cont) %in% c(1L,3L,4L,5L,6L,7L) ~ "Non_Europe",
      TRUE ~ NA_character_
    ) %>%
      factor(levels = c("Non_Europe", "Europe")),

    ## Father job + education
    father_job_rec = collapse_job(father_job),
    father_edu_6   = if_else(to_int(father_edu) == 6L, 1L, 0L, missing = 0L),

    ## Distance codings used downstream
    dist_home_school_num = as.numeric(dist_home_school),
    dist_home_school_ord6 = factor(dist_home_school_num, ordered = TRUE),
    dist_grp_adj2 = cut(
      dist_home_school_num,
      breaks = c(0, 1, 2, 3, 4, 6, 7),
      labels = c("1", "2", "3", "4", "5-6", "7"),
      right = TRUE,
      ordered_result = TRUE
    )
  )



base_excl <- base_excl %>%
  dplyr::mutate(
    res_neighbor_bin = dplyr::if_else(to_int(res_neighbor) == 1L, 1L, 0L, missing = 0L),
    res_spec_bin     = dplyr::if_else(to_int(res_spec)     == 2L, 1L, 0L, missing = 0L),
    res_work_bin     = dplyr::if_else(to_int(res_work)     == 3L, 1L, 0L, missing = 0L),
    res_school_bin   = dplyr::if_else(to_int(res_school)   == 4L, 1L, 0L, missing = 0L),
    res_cost_bin     = dplyr::if_else(to_int(res_cost)     == 5L, 1L, 0L, missing = 0L)
  )

Build school neighbourhood groups from free-text locations

norm_loc <- function(x) {
  x %>%
    stringr::str_to_lower() %>%
    stringr::str_replace_all("'", "") %>%
    stringr::str_squish()
}

neigh_levels <- c(
  "Main island west",
  "Al Wathbah",
  "Eastern islands",
  "Khalifa + MBZ",
  "Mushrif + Sa'adah",
  "Al Ain",
  "Al Dhafra",
  "Other"
)

## Build neigh_lookup ONCE (put this chunk before you use it)
main_island_west_areas <- c("al bateen","al danah","al khalidiyah","al zahiyah","al nahyan")
khalifa_mbz_areas      <- c("khalifa city","mohamed bin zayed city","mohammed bin zayed city","shakhbout city")
mushrif_saadah_areas   <- c("al mushrif","rabdan","al saadah","hadbat al zafaranah")
islands_east_areas     <- c("al reem island","al saadiyat island","yas island")
al_ain_areas           <- c("al muwaiji","al jimi","hili","al qattarah","al maqam","falaj hazza","al tiwayya")
al_dhafra_areas        <- c("madinat zayed","zayed city")

neigh_lookup <- tibble::tibble(loc_norm = unique(norm_loc(base$locationName))) %>%
  dplyr::mutate(
    neigh_group = dplyr::case_when(
      loc_norm == "al wathbah"             ~ "Al Wathbah",
      loc_norm %in% islands_east_areas     ~ "Eastern islands",
      loc_norm %in% main_island_west_areas ~ "Main island west",
      loc_norm %in% khalifa_mbz_areas      ~ "Khalifa + MBZ",
      loc_norm %in% mushrif_saadah_areas   ~ "Mushrif + Sa'adah",
      loc_norm %in% al_ain_areas           ~ "Al Ain",
      loc_norm %in% al_dhafra_areas        ~ "Al Dhafra",
      TRUE                                 ~ "Other"
    ),
    neigh_group = factor(neigh_group, levels = neigh_levels)
  )

## Apply it to base_excl
base_excl <- base_excl %>%
  dplyr::mutate(loc_norm = norm_loc(locationName)) %>%
  dplyr::left_join(neigh_lookup, by = "loc_norm") %>%
  dplyr::mutate(
    neigh_group = tidyr::replace_na(neigh_group, "Other"),
    neigh_group = factor(neigh_group, levels = neigh_levels),

    ng_main_island_west = as.integer(neigh_group == "Main island west"),
    ng_al_wathbah       = as.integer(neigh_group == "Al Wathbah"),
    ng_eastern_islands  = as.integer(neigh_group == "Eastern islands"),
    ng_khalifa_mbz      = as.integer(neigh_group == "Khalifa + MBZ"),
    ng_mushrif_saadah   = as.integer(neigh_group == "Mushrif + Sa'adah"),
    ng_al_ain           = as.integer(neigh_group == "Al Ain"),
    ng_al_dhafra        = as.integer(neigh_group == "Al Dhafra")
  ) %>%
  dplyr::select(-loc_norm)

Prepare predictors and optimise category groupings (AIC-based collapsing)

## =========================
## Pruned feature engineering for your current models only
## Uses: bus_only_bin, dist_grp_adj2 / dist_home_school_ord6, curriculum_grp,
##       income_imp_num, log_fee, mother_license_rec, neigh_group, villa_bin,
##       onecar_bin, n_child_priv_rec, father_job_rec, age_c + age_c2,
##       same_school_no, mother_years_uae_5plus, mother_europe, res_cost_bin,
##       father_edu_6, hh_dom_bin
## =========================

to_int <- function(x) suppressWarnings(as.integer(readr::parse_number(as.character(x))))

collapse_job <- function(x) {
  v <- to_int(x)
  dplyr::case_when(
    v == 1L ~ 1L,
    v %in% 2:5 ~ 0L,
    TRUE ~ NA_integer_
  )
}

base_excl <- base_excl %>%
  mutate(
    ## Target
    mode_exclusive = factor(mode_exclusive, levels = c("bus_only", "car_only")) %>% forcats::fct_drop(),
    bus_only_bin = if_else(mode_exclusive == "bus_only", 1L,
                  if_else(mode_exclusive == "car_only", 0L, NA_integer_)),

    ## Distance predictors used in your models
    dist_home_school_num = as.numeric(dist_home_school),
    dist_grp_adj2 = cut(
      dist_home_school_num,
      breaks = c(0, 1, 2, 3, 4, 6, 7),
      labels = c("1", "2", "3", "4", "5-6", "7"),
      right = TRUE,
      ordered_result = TRUE
    ),
    dist_home_school_ord6 = factor(dist_home_school_num, ordered = TRUE),

    ## Fees (spline uses log_fee)
    fee_mid = (minFee + maxFee) / 2,
    log_fee = log1p(fee_mid),

    ## Age (centred + quadratic)
    child_age_num = readr::parse_number(as.character(child_age)),
    age_c  = child_age_num - mean(child_age_num, na.rm = TRUE),
    age_c2 = age_c^2,

    ## Income (models use factor(income_imp_num) or income_imp_ord already computed elsewhere)
    income_imp_num = as.integer(income_imp_num),

    ## Household car ownership
    onecar_bin = if_else(to_int(n_cars) == 1L, 1L, 0L, missing = 0L),

    ## Private-school children recode
    n_child_priv_num = to_int(n_child_priv),
    n_child_priv_rec = case_when(
      n_child_priv_num == 2L ~ "1 child",
      n_child_priv_num == 3L ~ "2 children",
      n_child_priv_num >= 4L ~ "3+ children",
      TRUE ~ NA_character_
    ) %>% factor(levels = c("1 child", "2 children", "3+ children")),

    ## Same school dummy (as used in your logit)
    same_school_no = if_else(as.character(same_school) == "no", 1L, 0L),

    ## Household domestic worker dummy (as used)
    hh_dom_bin = if_else(to_int(hh_dom) == 3L, 1L, 0L, missing = 0L),

    ## Curriculum grouping (as used)
    curriculum_grp = case_when(
      curriculumTypeCode == 1L               ~ "American",
      curriculumTypeCode == 9L               ~ "British",
      curriculumTypeCode %in% c(10L,12L,29L) ~ "Western_other",
      curriculumTypeCode %in% c(14L,31L)     ~ "Indian_CBSE",
      curriculumTypeCode == 18L              ~ "MoE_UAE",
      curriculumTypeCode %in% c(15L,22L,23L) ~ "Other",
      TRUE                                   ~ NA_character_
    ) %>% factor(levels = c("American", "British", "Western_other", "Indian_CBSE", "MoE_UAE", "Other")),

    ## Mother Europe indicator used in models (reference = Non_Europe)
    mother_europe = case_when(
      to_int(mother_cont) == 2L ~ "Europe",
      to_int(mother_cont) %in% c(1L,3L,4L,5L,6L,7L) ~ "Non_Europe",
      TRUE ~ NA_character_
    ) %>% factor(levels = c("Non_Europe", "Europe")),

    ## Mother years in UAE (5+ years dummy, category 4)
    mother_years_uae_5plus = if_else(to_int(mother_years_uae) == 4L, 1L, 0L, missing = 0L),

    ## Father job recode used in models (if you already have father_job_rec, this safely overwrites)
    father_job_rec = collapse_job(father_job),

    ## Father education dummy used in models
    father_edu_6 = if_else(to_int(father_edu) == 6L, 1L, 0L, missing = 0L)
  ) %>%
  dplyr::select(
    -dist_home_school_num,
    -fee_mid,
    -child_age_num,
    -n_child_priv_num
  )

stopifnot("income_imp_ord" %in% names(base_excl), is.ordered(base_excl$income_imp_ord))

Estimate final binary logit model (bus-only vs car-only)

## =========================
## Final estimation sample + final logit model (optimised)
## =========================

m_final <- glm(bus_only_bin ~ #dist_home_school_ord_c + income_imp_num
                 
                dist_grp_adj2 
               
               + curriculum_grp
               + factor(income_imp_num) 
               + ns(log_fee, df = 3)
               + mother_license_rec
               + neigh_group
               + villa_bin
               + onecar_bin
               
               + n_child_priv_rec
               + father_job_rec 
               + age_c + age_c2 
               + same_school_no 
               + mother_years_uae_5plus
               + mother_europe
               + res_cost_bin
               + father_edu_6
               + hh_dom_bin
              
               
               ,
               data = base_excl, family = binomial("logit"),
               contrasts = list(dist_grp_adj2 = contr.treatment
                            ))

summary(m_final)


Call:
glm(formula = bus_only_bin ~ dist_grp_adj2 + curriculum_grp + 
    factor(income_imp_num) + ns(log_fee, df = 3) + mother_license_rec + 
    neigh_group + villa_bin + onecar_bin + n_child_priv_rec + 
    father_job_rec + age_c + age_c2 + same_school_no + mother_years_uae_5plus + 
    mother_europe + res_cost_bin + father_edu_6 + hh_dom_bin, 
    family = binomial("logit"), data = base_excl, contrasts = list(dist_grp_adj2 = contr.treatment))

Coefficients:
                              Estimate Std. Error z value Pr(>|z|)    
(Intercept)                  -4.688307   0.974412  -4.811 1.50e-06 ***
dist_grp_adj22                0.883341   0.340512   2.594 0.009482 ** 
dist_grp_adj23                1.449231   0.336017   4.313 1.61e-05 ***
dist_grp_adj24                1.812905   0.340145   5.330 9.83e-08 ***
dist_grp_adj25-6              1.822449   0.336583   5.415 6.14e-08 ***
dist_grp_adj27                2.800137   0.366536   7.639 2.18e-14 ***
curriculum_grpBritish         0.266903   0.203156   1.314 0.188918    
curriculum_grpWestern_other   0.735337   0.281484   2.612 0.008992 ** 
curriculum_grpIndian_CBSE     1.775801   0.306904   5.786 7.20e-09 ***
curriculum_grpMoE_UAE         0.887541   0.341647   2.598 0.009382 ** 
curriculum_grpOther           0.601004   0.424335   1.416 0.156676    
factor(income_imp_num)2       0.360085   0.190135   1.894 0.058247 .  
factor(income_imp_num)3       0.545054   0.232063   2.349 0.018837 *  
factor(income_imp_num)4       0.673160   0.271024   2.484 0.013000 *  
factor(income_imp_num)5       0.662262   0.308574   2.146 0.031857 *  
factor(income_imp_num)6       0.116762   0.248429   0.470 0.638353    
ns(log_fee, df = 3)1          2.563761   0.525789   4.876 1.08e-06 ***
ns(log_fee, df = 3)2          3.454927   0.929708   3.716 0.000202 ***
ns(log_fee, df = 3)3         -1.368719   0.540505  -2.532 0.011332 *  
mother_license_rec           -0.534980   0.140043  -3.820 0.000133 ***
neigh_groupAl Wathbah         1.480293   0.381905   3.876 0.000106 ***
neigh_groupEastern islands    1.545722   0.514953   3.002 0.002685 ** 
neigh_groupKhalifa + MBZ      0.780876   0.244969   3.188 0.001434 ** 
neigh_groupMushrif + Sa'adah  0.602423   0.220060   2.738 0.006190 ** 
neigh_groupAl Ain             0.322144   0.217677   1.480 0.138895    
neigh_groupAl Dhafra          1.716949   0.356075   4.822 1.42e-06 ***
neigh_groupOther              0.932463   0.258006   3.614 0.000301 ***
villa_bin                     0.813541   0.131012   6.210 5.31e-10 ***
onecar_bin                    1.074433   0.341451   3.147 0.001651 ** 
n_child_priv_rec2 children   -0.619341   0.696722  -0.889 0.374037    
n_child_priv_rec3+ children  -0.856812   0.701049  -1.222 0.221638    
father_job_rec                0.612785   0.191988   3.192 0.001414 ** 
age_c                         0.014811   0.017501   0.846 0.397403    
age_c2                       -0.013185   0.004329  -3.046 0.002319 ** 
same_school_no                0.652262   0.144804   4.504 6.65e-06 ***
mother_years_uae_5plus       -0.666501   0.149856  -4.448 8.68e-06 ***
mother_europeEurope          -0.318458   0.207387  -1.536 0.124643    
res_cost_bin                 -0.346137   0.119659  -2.893 0.003819 ** 
father_edu_6                  0.070866   0.127832   0.554 0.579324    
hh_dom_bin                    0.318645   0.167253   1.905 0.056759 .  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2533.8  on 1871  degrees of freedom
Residual deviance: 1818.9  on 1832  degrees of freedom
  (1965 observations deleted due to missingness)
AIC: 1898.9

Number of Fisher Scoring iterations: 6

anova(m_final)

Analysis of Deviance Table

Model: binomial, link: logit

Response: bus_only_bin

Terms added sequentially (first to last)

                       Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
NULL                                    1871     2533.8              
dist_grp_adj2           5  226.316      1866     2307.5 < 2.2e-16 ***
curriculum_grp          5  198.550      1861     2108.9 < 2.2e-16 ***
factor(income_imp_num)  5   51.232      1856     2057.7 7.753e-10 ***
ns(log_fee, df = 3)     3   50.707      1853     2007.0 5.649e-11 ***
mother_license_rec      1   35.576      1852     1971.4 2.452e-09 ***
neigh_group             7   31.348      1845     1940.0 5.362e-05 ***
villa_bin               1   35.027      1844     1905.0 3.252e-09 ***
onecar_bin              1   15.122      1843     1889.9 0.0001008 ***
n_child_priv_rec        2    5.225      1841     1884.7 0.0733546 .  
father_job_rec          1    8.374      1840     1876.3 0.0038058 ** 
age_c                   1    0.438      1839     1875.9 0.5080184    
age_c2                  1    5.043      1838     1870.8 0.0247259 *  
same_school_no          1   18.618      1837     1852.2 1.597e-05 ***
mother_years_uae_5plus  1   17.367      1836     1834.8 3.081e-05 ***
mother_europe           1    2.587      1835     1832.2 0.1077767    
res_cost_bin            1    9.560      1834     1822.7 0.0019890 ** 
father_edu_6            1    0.154      1833     1822.5 0.6951497    
hh_dom_bin              1    3.647      1832     1818.9 0.0561709 .  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# --- R² de McFadden ---
mcfadden <- function(fit) 1 - (fit$deviance / fit$null.deviance)
R2_McF <- mcfadden(m_final)
cat(sprintf("R² de McFadden = %.4f ; AIC = %.3f ; k = %d\n",
            R2_McF, AIC(m_final), length(coef(m_final))))

R² de McFadden = 0.2821 ; AIC = 1898.889 ; k = 40

Evaluate the binary model (ROC/AUC and classification metrics)

# --- Prediction + confusion matrices + metrics ---
if (!requireNamespace("pROC", quietly = TRUE)) install.packages("pROC")
library(pROC)

Type 'citation("pROC")' for a citation.


Attaching package: 'pROC'

The following objects are masked from 'package:stats':

    cov, smooth, var

# vars used by the model -> evaluation frame
vars_used <- all.vars(formula(m_final))
df_eval <- base_excl[, vars_used, drop = FALSE] |> tidyr::drop_na()

# predicted probs (in-sample) and observed
p_hat <- predict(m_final, newdata = df_eval, type = "response")
y_obs <- df_eval$bus_only_bin

# helpers
cm_at <- function(p, y, thr = 0.5) table(Pred = as.integer(p >= thr), Obs = y)
acc   <- function(cm) sum(diag(cm)) / sum(cm)
prec  <- function(cm) if ((cm[2,1] + cm[2,2])==0) NA else cm[2,2] / (cm[2,1] + cm[2,2])
rec   <- function(cm) if ((cm[1,2] + cm[2,2])==0) NA else cm[2,2] / (cm[1,2] + cm[2,2])
spec  <- function(cm) if ((cm[1,1] + cm[2,1])==0) NA else cm[1,1] / (cm[1,1] + cm[2,1])
f1    <- function(cm) { p <- prec(cm); r <- rec(cm); if (is.na(p)||is.na(r)||p+r==0) NA else 2*p*r/(p+r) }
bacc  <- function(cm) mean(c(spec(cm), rec(cm)), na.rm = TRUE)

# AUC + optimal threshold (Youden)
roc_obj <- pROC::roc(y_obs, p_hat, quiet = TRUE)
auc_val <- as.numeric(pROC::auc(roc_obj))
thr_opt <- as.numeric(pROC::coords(roc_obj, "best", best.method = "youden", ret = "threshold"))

# Confusion matrices + metrics
cm_05  <- cm_at(p_hat, y_obs, thr = 0.5)
cm_opt <- cm_at(p_hat, y_obs, thr = thr_opt)

metrics <- function(cm) c(
  Accuracy    = acc(cm),
  Precision   = prec(cm),
  Recall      = rec(cm),
  Specificity = spec(cm),
  F1          = f1(cm),
  BalAccuracy = bacc(cm)
)

cat("\nAUC =", round(auc_val, 3), "\n")


AUC = 0.833

cat("\nCutoff = 0.50\n"); print(cm_05); print(round(metrics(cm_05), 3))


Cutoff = 0.50

    Obs
Pred   0   1
   0 521 222
   1 246 883

   Accuracy   Precision      Recall Specificity          F1 BalAccuracy 
      0.750       0.782       0.799       0.679       0.791       0.739

cat(sprintf("\nROC-optimal cutoff (Youden) = %.3f\n", thr_opt))


ROC-optimal cutoff (Youden) = 0.643

print(cm_opt); print(round(metrics(cm_opt), 3))

    Obs
Pred   0   1
   0 641 363
   1 126 742

   Accuracy   Precision      Recall Specificity          F1 BalAccuracy 
      0.739       0.855       0.671       0.836       0.752       0.754

Ordinal model for distance outcome + pseudo-R² calculations (first implementation)

## =========================
## Ordinal distance model + pseudo-R² (optimised, consistent)
## =========================

## 1) Outcome as ordered factor (no need to hand-build levels)
base_excl <- base_excl %>%
  dplyr::mutate(dist_home_school_ord6 = factor(as.integer(dist_home_school), ordered = TRUE))


## 2) Model formula (single source of truth)
fml_dist <- dist_home_school_ord6 ~
  curriculum_grp +
  splines::ns(log_fee, df = 3) +
  mother_license_rec +
  neigh_group +
  villa_bin +
  onecar_bin +
  age_c + age_c2 +
  mother_europe +
  res_cost_bin +
  res_work_bin +
  res_school_bin +
  hh_dom_bin

## 3) Complete-case dataset derived from the formula
dat_dist <- base_excl %>%
  dplyr::select(all_of(all.vars(fml_dist))) %>%
  drop_na()

## 4) Fit ordinal model once, then reuse for everything
mod_dist <- MASS::polr(fml_dist, data = dat_dist, Hess = TRUE, method = "logistic")
sum_mod <- summary(mod_dist)

## 5) Coefficient table with p-values (no manual duplication of objects)
ctab <- coef(sum_mod)
pvals <- 2 * pnorm(abs(ctab[, "t value"]), lower.tail = FALSE)

coefs <- cbind(
  Estimate   = ctab[, "Value"],
  Std.Error  = ctab[, "Std. Error"],
  z.value    = ctab[, "t value"],
  `Pr(>|z|)` = pvals
)

printCoefmat(coefs, digits = 3, signif.stars = TRUE, signif.legend = TRUE)

                              Estimate Std.Error z.value Pr(>|z|)    
curriculum_grpBritish          0.33422   0.11808    2.83  0.00465 ** 
curriculum_grpWestern_other    0.62970   0.15488    4.07  4.8e-05 ***
curriculum_grpIndian_CBSE      1.27785   0.15661    8.16  3.4e-16 ***
curriculum_grpMoE_UAE          1.04310   0.19332    5.40  6.8e-08 ***
curriculum_grpOther           -0.27502   0.27630   -1.00  0.31957    
splines::ns(log_fee, df = 3)1  0.88976   0.26857    3.31  0.00092 ***
splines::ns(log_fee, df = 3)2  1.82108   0.45997    3.96  7.5e-05 ***
splines::ns(log_fee, df = 3)3  2.08750   0.30347    6.88  6.0e-12 ***
mother_license_rec            -0.20958   0.07080   -2.96  0.00307 ** 
neigh_groupAl Wathbah          2.67476   0.16217   16.49  < 2e-16 ***
neigh_groupEastern islands    -0.42619   0.29250   -1.46  0.14509    
neigh_groupKhalifa + MBZ       1.77368   0.13420   13.22  < 2e-16 ***
neigh_groupMushrif + Sa'adah   0.71571   0.12623    5.67  1.4e-08 ***
neigh_groupAl Ain              0.87062   0.12576    6.92  4.4e-12 ***
neigh_groupAl Dhafra           1.34810   0.20184    6.68  2.4e-11 ***
neigh_groupOther               1.28305   0.15440    8.31  < 2e-16 ***
villa_bin                      0.38950   0.06921    5.63  1.8e-08 ***
onecar_bin                     0.20811   0.12223    1.70  0.08864 .  
age_c                          0.03184   0.00878    3.63  0.00029 ***
age_c2                        -0.00249   0.00224   -1.11  0.26630    
mother_europeEurope            0.14473   0.11191    1.29  0.19591    
res_cost_bin                   0.30943   0.06357    4.87  1.1e-06 ***
res_work_bin                   0.32349   0.06568    4.93  8.4e-07 ***
res_school_bin                -1.89939   0.07209  -26.35  < 2e-16 ***
hh_dom_bin                     0.23937   0.09463    2.53  0.01142 *  
1|2                           -1.79189   0.29578   -6.06  1.4e-09 ***
2|3                            0.25793   0.28689    0.90  0.36862    
3|4                            1.54683   0.28718    5.39  7.2e-08 ***
4|5                            2.53287   0.28895    8.77  < 2e-16 ***
5|6                            3.30251   0.29101   11.35  < 2e-16 ***
6|7                            4.15291   0.29409   14.12  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

## 6) Standard pseudo-R² from pscl
pscl::pR2(mod_dist)

fitting null model for pseudo-r2

          llh       llhNull            G2      McFadden          r2ML 
-5583.4005485 -6486.7952505  1806.7894039     0.1392667     0.4045557 
         r2CU 
    0.4145755

## 7) McKelvey–Zavoina R² (aligned design matrix, no fragile name filters)
beta <- stats::coef(mod_dist)

X_full <- stats::model.matrix(fml_dist, data = dat_dist)
X <- X_full[, colnames(X_full) %in% names(beta), drop = FALSE]

beta <- beta[colnames(X)]
stopifnot(identical(colnames(X), names(beta)))

eta <- as.vector(X %*% beta)
var_eta <- stats::var(eta, na.rm = TRUE)

sigma2_logit <- (pi^2) / 3
R2_MZ <- var_eta / (var_eta + sigma2_logit)

## 8) Likelihood-based pseudo-R² (use matching model class for null)
ll_full <- as.numeric(logLik(mod_dist))
ll_null <- as.numeric(logLik(update(mod_dist, . ~ 1)))
n <- nrow(dat_dist)

R2_McF <- 1 - ll_full / ll_null
R2_CS  <- 1 - exp((ll_null - ll_full) * 2 / n)
R2_NK  <- R2_CS / (1 - exp(2 * ll_null / n))

R2_tab <- tibble::tibble(
  McFadden = R2_McF,
  CoxSnell = R2_CS,
  Nagelkerke = R2_NK,
  McKelveyZavoina = R2_MZ
)

print(R2_tab, digits = 3)

# A tibble: 1 × 4
  McFadden CoxSnell Nagelkerke McKelveyZavoina
     <dbl>    <dbl>      <dbl>           <dbl>
1    0.139    0.405      0.415           0.406

Ordinal distance model (finalised implementation with correct packages)

## 1) Ordinal outcome (6 levels) + 2) complete-case dataset + 3) model
fml_dist <- dist_home_school_ord6 ~
  curriculum_grp +
  ns(log_fee, df = 3) +
  mother_license_rec +
  neigh_group +
  villa_bin +
  onecar_bin +
  age_c + age_c2 +
  res_cost_bin +
  res_work_bin +
  res_school_bin +
  hh_dom_bin

dat_dist <- base_excl %>%
  dplyr::select(all_of(all.vars(fml_dist))) %>%
  drop_na()

mod_dist <- ordinal::clm(fml_dist, data = dat_dist, link = "logit")
summary(mod_dist)

formula: 
dist_home_school_ord6 ~ curriculum_grp + ns(log_fee, df = 3) + mother_license_rec + neigh_group + villa_bin + onecar_bin + age_c + age_c2 + res_cost_bin + res_work_bin + res_school_bin + hh_dom_bin
data:    dat_dist

 link  threshold nobs logLik   AIC      niter max.grad cond.H 
 logit flexible  3606 -5784.66 11629.32 6(0)  5.46e-12 3.1e+05

Coefficients:
                              Estimate Std. Error z value Pr(>|z|)    
curriculum_grpBritish         0.407786   0.114398   3.565 0.000364 ***
curriculum_grpWestern_other   0.716301   0.148552   4.822 1.42e-06 ***
curriculum_grpIndian_CBSE     1.335970   0.154316   8.657  < 2e-16 ***
curriculum_grpMoE_UAE         1.110269   0.190032   5.843 5.14e-09 ***
curriculum_grpOther          -0.153262   0.272104  -0.563 0.573265    
ns(log_fee, df = 3)1          0.932310   0.264800   3.521 0.000430 ***
ns(log_fee, df = 3)2          1.838724   0.449389   4.092 4.28e-05 ***
ns(log_fee, df = 3)3          2.021319   0.282817   7.147 8.86e-13 ***
mother_license_rec           -0.195745   0.069598  -2.813 0.004916 ** 
neigh_groupAl Wathbah         2.674074   0.159294  16.787  < 2e-16 ***
neigh_groupEastern islands   -0.250203   0.283748  -0.882 0.377895    
neigh_groupKhalifa + MBZ      1.755249   0.130765  13.423  < 2e-16 ***
neigh_groupMushrif + Sa'adah  0.737196   0.122800   6.003 1.93e-09 ***
neigh_groupAl Ain             0.882751   0.122956   7.179 7.00e-13 ***
neigh_groupAl Dhafra          1.406627   0.197481   7.123 1.06e-12 ***
neigh_groupOther              1.235739   0.149444   8.269  < 2e-16 ***
villa_bin                     0.398677   0.067929   5.869 4.38e-09 ***
onecar_bin                    0.214709   0.120894   1.776 0.075733 .  
age_c                         0.033990   0.008609   3.948 7.88e-05 ***
age_c2                       -0.002099   0.002196  -0.956 0.339055    
res_cost_bin                  0.310167   0.062613   4.954 7.28e-07 ***
res_work_bin                  0.305832   0.064866   4.715 2.42e-06 ***
res_school_bin               -1.897036   0.070912 -26.752  < 2e-16 ***
hh_dom_bin                    0.226982   0.091052   2.493 0.012671 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Threshold coefficients:
    Estimate Std. Error z value
1|2  -1.7009     0.2907  -5.852
2|3   0.3693     0.2820   1.310
3|4   1.6495     0.2824   5.840
4|5   2.6278     0.2842   9.245
5|6   3.3940     0.2863  11.854
6|7   4.2501     0.2894  14.685