School name normalisation and linkage with ADEK reference data
## =========================## Normalise school names + build ADEK linkage (optimised)## =========================# Fast, consistent normalisation (lowercase, drop punctuation, squish spaces, Latin → ASCII)norm_latin <-function(x) { x %>% stringr::str_to_lower() %>% stringr::str_replace_all("[[:punct:]]+", " ") %>%# collapse runs of punct stringr::str_squish() %>% stringi::stri_trans_general("Latin-ASCII")}# 1) ADEK school table: import + deduplicate on normalised titledonnees_school_uni <- readr::read_csv("C:/Users/benjamin.mottebaumvo/Documents/Donnees/School_Bus_2025/donnees_school.csv",show_col_types =FALSE) %>% dplyr::select(-...3) %>% dplyr::mutate(title_norm =norm_latin(title)) %>% dplyr::distinct(title_norm, .keep_all =TRUE) # keeps first row per school
New names:
• `` -> `...3`
# 2) Code–name table: import + build keys onceschool_code_name <- readr::read_csv("C:/Users/benjamin.mottebaumvo/Documents/Donnees/School_Bus_2025/school_code_name.csv",show_col_types =FALSE) %>% dplyr::mutate(school_name_unique =norm_latin(NAME),school_id =sprintf("%01d%03d", CITY, NUMBER) )# 3) Link table: many reported names → one ADEK school record (by normalised name)school_link <- school_code_name %>% dplyr::left_join(donnees_school_uni, by =c("school_name_unique"="title_norm"))
Merging survey data with ADEK school information
## 5) Final merge: survey base + ADEK school attributesbase <- base %>% dplyr::left_join(school_link, by ="school_id")
Overlay map: all schools vs respondent schools
## 1. Nombre de répondants par école (avec coordonnées)## 0. Table de toutes les écolesschools_all <- donnees_school_uni %>%mutate(latitude =as.numeric(latitude),longitude =as.numeric(longitude) ) %>%filter(!is.na(latitude), !is.na(longitude)) %>%distinct(title, latitude, longitude, .keep_all =TRUE)## 1. Nombre de répondants par écoleschools_map <- base %>%filter(!is.na(school_id),!is.na(latitude),!is.na(longitude) ) %>%group_by(school_id, NAME, latitude, longitude) %>%summarise(n_resp =n(),.groups ="drop" )## Use plain vectors (not named) to avoid jsonlite warningspal_cols <-c("black", "blue")pal_labs <-c("All schools", "Respondents")leaflet::leaflet() %>% leaflet::addProviderTiles(leaflet::providers$CartoDB.Positron) %>% leaflet::addCircleMarkers(data = schools_all,lng =~longitude, lat =~latitude,radius =3,color = pal_cols[1], fillColor = pal_cols[1],fillOpacity =0.8, stroke =FALSE,group = pal_labs[1],popup =~sprintf("<b>%s</b>", title) ) %>% leaflet::addCircleMarkers(data = schools_map,lng =~longitude, lat =~latitude,radius =~pmax(3, sqrt(n_resp)),color = pal_cols[2], fillColor = pal_cols[2],fillOpacity =0.7, stroke =FALSE,group = pal_labs[2],popup =~sprintf("<b>%s</b><br>School ID: %s<br>Respondents: %s", NAME, school_id, n_resp ) ) %>% leaflet::addLayersControl(overlayGroups = pal_labs,options = leaflet::layersControlOptions(collapsed =FALSE) ) %>% leaflet::addLegend(position ="bottomright",colors = pal_cols,labels = pal_labs,opacity =0.9,title ="Legend" )
Reconstruct father/mother characteristics independent of respondent
# Convert SurveyMonkey-style fields to integer safelyto_int <-function(x) suppressWarnings(as.integer(readr::parse_number(as.character(x))))base <- base %>%mutate(# Numeric role indicator# 1 = father respondent, 2 = mother respondentrole_i =to_int(role),# Respondent characteristics (as reported by the respondent)years_uae_i =to_int(years_uae),cont_i =to_int(cont),age_i =to_int(age),job_i =to_int(job),edu_i =to_int(edu),license_i =to_int(license),# Partner characteristics (as reported in the partner module)partn_years_uae_i =to_int(partn_years_uae),partn_cont_i =to_int(partn_cont),partn_age_i =to_int(partn_age),partn_job_i =to_int(partn_job),partn_edu_i =to_int(partn_edu),partn_license_i =to_int(partn_license),# Father characteristics# If the respondent is the father (role_i == 1), take respondent values# Otherwise (respondent is the mother), take partner valuesfather_years_uae =if_else(role_i ==1L, years_uae_i, partn_years_uae_i),father_cont =if_else(role_i ==1L, cont_i, partn_cont_i),father_age =if_else(role_i ==1L, age_i, partn_age_i),father_job =if_else(role_i ==1L, job_i, partn_job_i),father_edu =if_else(role_i ==1L, edu_i, partn_edu_i),father_license =if_else(role_i ==1L, license_i, partn_license_i),# Mother characteristics# If the respondent is the mother (role_i == 2), take respondent values# Otherwise (respondent is the father), take partner valuesmother_years_uae =if_else(role_i ==2L, years_uae_i, partn_years_uae_i),mother_cont =if_else(role_i ==2L, cont_i, partn_cont_i),mother_age =if_else(role_i ==2L, age_i, partn_age_i),mother_job =if_else(role_i ==2L, job_i, partn_job_i),mother_edu =if_else(role_i ==2L, edu_i, partn_edu_i),mother_license =if_else(role_i ==2L, license_i, partn_license_i) ) %>%mutate(# If the respondent role is invalid or missing, set parent variables to NAacross(starts_with("father_"), ~if_else(role_i %in%1:2, .x, NA_integer_)),across(starts_with("mother_"), ~if_else(role_i %in%1:2, .x, NA_integer_)) ) %>%# Remove intermediate variables used only for construction dplyr::select(-role_i, -ends_with("_i"))
Descriptive plot: home–school distance distribution
# --- Préparation des données ---df_dist <- base |>filter(!is.na(dist_home_school)) |>count(dist_home_school) |>mutate(percent =100* n /sum(n),# inversion de l’ordre ici :dist_home_school =factor(dist_labels[as.numeric(dist_home_school)],levels =rev(dist_labels)) # <-- ordre inversé )# --- Palette qualitative ---palette_dist <-brewer.pal(n =length(dist_labels), name ="Set2")# --- Graphique ---ggplot(df_dist, aes(x = percent, y = dist_home_school, fill = dist_home_school)) +geom_col(width =0.7) +scale_fill_manual(values = palette_dist) +scale_x_continuous(expand =c(0,0),limits =c(0, 25),labels = scales::percent_format(scale =1) ) +labs(x =NULL,y =NULL,title ="Driving distance between home and school" ) +theme_modern() +theme(legend.position ="none")
Descriptive plot: home–school transport mode
## =========================## Mode frequency stacked bars (optimised)## Keeps "Never" for denominator, hides it in the plot## =========================mode_map <-c(to_bus ="School bus",to_car ="Car",to_walk ="Walk/scooter/bike")freq_order_plot <-c("4+ days/week", "2–3 days/week", "1 day/week", "Sometimes (<1/week)")df_plot <- base %>%transmute(to_bus = to_bus,to_car = to_car,to_walk = to_walk ) %>%pivot_longer(cols =everything(),names_to ="mode_raw",values_to ="freq_code" ) %>%mutate(mode = dplyr::recode(mode_raw, !!!mode_map),frequency =factor(freq_code, levels =levels(base$to_bus), labels = freq_labels) ) %>%filter(!is.na(frequency)) %>%count(mode, frequency, name ="n") %>%group_by(mode) %>%mutate(percent =100* n /sum(n)) %>%ungroup() %>%filter(frequency !="Never") %>%mutate(frequency = forcats::fct_relevel(frequency, freq_order_plot))my_colors <-rev(RColorBrewer::brewer.pal(4, "Blues"))ggplot(df_plot, aes(x = mode, y = percent, fill = frequency)) +geom_col(position =position_stack(reverse =TRUE)) +scale_fill_manual(values = my_colors, guide =guide_legend(reverse =TRUE)) +scale_y_continuous(expand =c(0, 0), limits =c(0, 80)) +labs(x =NULL,y ="Share of pupils (%)",title ="How children usually travel from home to school" ) +theme_modern()
Descriptive plot: escort to school (stacked frequencies)
# --- Données en format long : les NA deviennent "Never"df_escort <- base |> dplyr::select(to_mom, to_dad, to_oth, to_nan) |>pivot_longer(cols =everything(),names_to ="escort_raw",values_to ="freq_code" ) |>mutate(escort =recode( escort_raw,to_mom ="Mother",to_dad ="Father",to_oth ="Relative/friend/neighbour",to_nan ="Domestic worker" ),# freq_code peut être NA → on force à 1 ("Never")freq_num =ifelse(is.na(freq_code), 1L, as.integer(freq_code)),frequency =factor( freq_labels[freq_num],levels = freq_labels,ordered =TRUE ) ) |>group_by(escort, frequency) |>summarise(n =n(), .groups ="drop") |>group_by(escort) |>mutate(percent =100* n /sum(n)) |>ungroup()# pour le tracé : on ne montre pas "Never", comme pour le graphique des modes# Pour le tracé : exclure "Never"df_escort_plot <- df_escort |>filter(frequency !="Never") |>mutate(frequency = forcats::fct_relevel( frequency,"4+ days/week","2–3 days/week","1 day/week","Sometimes (<1/week)" ),# --- ORDRE IMPOSE DES MODALITÉS ---escort =factor( escort,levels =c("Mother","Father","Domestic worker","Relative/friend/neighbour" ) ) )# palette Blues inversée (4+ en bleu foncé, Sometimes en bleu clair)my_colors <-rev(brewer.pal(4, "Blues"))# --- Graphique ---ggplot(df_escort_plot,aes(x = escort, y = percent, fill = frequency)) +geom_col(position =position_stack(reverse =TRUE)) +scale_fill_manual(values = my_colors,guide =guide_legend(reverse =TRUE)) +scale_y_continuous(expand =c(0, 0), limits =c(0, 70)) +labs(x =NULL,y ="Share of pupils (%)",title ="How often is the child accompanied to school?",#subtitle = "By type of accompanying person (excluding 'Never')",fill ="Frequency" ) +theme_modern()
Build the modelling dataset (cleaning, recoding, exclusivity, income imputation)