Read required packages

library(ggplot2)
library(maps)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(osrm)
## Data: (c) OpenStreetMap contributors, ODbL 1.0 - http://www.openstreetmap.org/copyright
## Routing: OSRM - http://project-osrm.org/
library(sf)
## Linking to GEOS 3.13.0, GDAL 3.8.5, PROJ 9.5.1; sf_use_s2() is TRUE
library(stringr)
library(sf)
library(osrm)
library(geosphere)
library(leaflet)
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
library(caret)
## Loading required package: lattice
library(tidyr)
library(gt)
library(brant)
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(caret)
library(xgboost)
## 
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
## 
##     slice
library(nnet)
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(ggspatial)

Read data

data = read.csv("DATA clean.csv")

Data preperation

Add LSOA’s

LSOA_lookup = read.csv("LSOA_lookup.csv")

# Clean survey dataset

data = data %>%
  mutate(Postcode = str_replace_all(str_to_upper(Postcode), " ", ""))

# Clean lookup table
LSOA_lookup = LSOA_lookup %>%
  mutate(pcd7 = str_replace_all(str_to_upper(pcd7), " ", "")) 

# Join

data = data %>%
  left_join(LSOA_lookup, by = c("Postcode" = "pcd7"))

Clean up data - remove rows with no postcodes and unecessary columns

data = data %>%
  filter(!is.na(lsoa21cd)) 

data = data %>%
  dplyr::select(-X, -pcd8, -pcds, -dointr, -doterm, -usertype, -oa21cd, -msoa21nm, -ladnm, -ladnmw)

Add IMD

imd_lookup = read.csv("standardised_IMD.csv") 

data = data %>%
  left_join(imd_lookup, by = c("lsoa21cd" = "LSOA"))

Add rural/urban id

rural_urban = read.csv("Rural_urban.csv")

data = data %>%
  left_join(rural_urban, by = c("ladcd" = "LAD21CD"))

Remove postcodes with no match

data = data %>%
  filter(!is.na(Urban_rural_flag)) 

data = data %>%
  filter(!is.na(SOA_decile))

Adding travel time to nearest GP’s

# GP surgeries 

gp = read.csv("GPS_England_Wales.csv")

# Filter for only active institutions 

gp_active = gp %>%
  filter(gp[[13]] == "A")

# Filter for only GP surgeries and hospitals 

gp_filtered = gp_active %>%
  filter(gp_active[[26]] == 4 | gp_active[[26]] == 10)

# Get rid of unnecessary columns for ease 

gp_filtered = gp_filtered %>%
  dplyr::select(-c(3, 4, 9, 11, 12, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27))

# Rename columns 

colnames(gp_filtered) = c("Org code", "Name", "Adress1", "Adress2", "Adress3", "Adress4", "Postcode", "Status", "Setting")

# Change postcodes to long/lat 

postcode_convert = read.csv("ukpostcodes.csv")

# Clean postcodes

postcode_convert$postcode = toupper(gsub("\\s+", "", postcode_convert$postcode))
gp_filtered$clean_postcode = toupper(gsub("\\s+", "", gp_filtered$Postcode))

# Merge

data = left_join(data, postcode_convert, by = c("Postcode" = "postcode"))

gp_postcodes = left_join(gp_filtered, postcode_convert, by = c("clean_postcode" = "postcode"))

# Convert to sf objects 

gps_clean = gp_postcodes %>%
  filter(!is.na(longitude), !is.na(latitude))

lsoa_clean = data %>%
  filter(!is.na(longitude), !is.na(latitude))

lsoa_sf = st_as_sf(lsoa_clean, coords = c("longitude", "latitude"), crs = 4326)
gp_sf = st_as_sf(gps_clean, coords = c("longitude", "latitude"), crs = 4326) 


results = list()

# Get coordinate matrix of all GPs
gp_coords = st_coordinates(gp_sf)

for (i in seq_len(nrow(lsoa_sf))) {
  lsoa_point = lsoa_sf[i, ]
  cat("Processing LSOA", i, "/", nrow(lsoa_sf), "\n")

  tryCatch({
    # Get coordinates of current LSOA
    lsoa_coord = st_coordinates(lsoa_point)

    # Compute Haversine distances to all GPs
    dists = distHaversine(lsoa_coord, gp_coords)

    # Get indices of 10 closest GPs
    nearest_idxs = order(dists)[1:10]

    # Subset those GPs
    gps_nearby = gp_sf[nearest_idxs, ]

    # route to just 10 GPs
    travel_times = osrmTable(src = lsoa_point, dst = gps_nearby)

    if (!is.null(travel_times$durations)) {
      min_time = min(travel_times$durations[1, ], na.rm = TRUE)

      results[[i]] = data.frame(
        lsoa_code = lsoa_point$lsoa21cd,
        min_time_to_gp = min_time
      )
    }

  }, error = function(e) {
    message("Failed on row ", i, ": ", e$message)
  })
}

# Combine results
travel_df = bind_rows(results)

data = data %>%
  left_join(travel_df, by = c("lsoa21cd" = "lsoa_code"))

Remove any repeated rows or rows containing NA’s

data = data %>%
  distinct()

data = data[data$ID != 10, ]

data = data %>%
  filter(!is.na(Risk))

Create Risk Index

# Define a vector of variables for the index

index_vars = c(
  "ID", "Spend", "Cost_home", "Avoided.", "Limited", "Prescription", "Prescription_difference", "Cost_out", "Avoided_out.", "Spend_other.", "Availabilty", "Travel.", "Extra_time", "Extra_time_shopping.", "Time_off", "Stress", "Follow_GF.", "QOL"
)

# Create a new data frame with only these columns

index_df = data[ , index_vars]

# Create scores for each variable

index_df = index_df %>%
  mutate(
    Spend_score = case_when(
      Spend == "£0-£10" ~ 0,
      Spend == "£11-£20" ~ 1,
      Spend == "£21-£30" ~ 2,
      Spend == "£31-£40" ~ 3,
      Spend == "Over £40" ~ 4,
      TRUE ~ 0
    ),
    
    Cost_home_score = case_when(
      Cost_home == "About the same" ~ 0,
      Cost_home == "Slightly more expensive" ~ 1,
      Cost_home == "Much more expensive" ~ 2,
      TRUE ~ 0
    ),
    
    Avoided_score = case_when(
      Avoided. == "No, never" ~ 0,
      Avoided. == "Yes, occasionally" ~ 1,
      Avoided. == "Yes, often" ~ 2,
      TRUE ~ 0
    ),
    
    Limited_score = case_when(
      Limited == "No" ~ 0,
      Limited == "Yes, somewhat" ~ 1,
      Limited == "Yes, significantly" ~ 2,
      TRUE ~ 0
    ),
    
    Prescription_score = case_when(
      Prescription == "Yes, regularly" ~ 0,
      Prescription == "Yes, occasionally" ~ 1,
      Prescription %in% c("No, not eligible in my area", "No, eligible, but do not use it", "No, unaware this was an option") ~ 2,
      TRUE ~ 0
    ),
    
    Prescription_difference_score = case_when(
      Prescription_difference == "A significant difference" ~ 0,
      Prescription_difference %in% c("A moderate difference", "A small difference") ~ 1,
      Prescription_difference == "No difference" ~ 2,
      TRUE ~ 0
    ),
    
    Cost_out_score = case_when(
      Cost_out == "Very cheap" ~ 0,
      Cost_out == "Affordable" ~ 1,
      Cost_out == "Expensive" ~ 2,
      Cost_out == "Very expensive" ~ 3,
      TRUE ~ 0
    ),
    
    Avoided_out_score = case_when(
      Avoided_out. == "No, never" ~ 0,
      Avoided_out. == "Yes, occasionally" ~ 1,
      Avoided_out. == "Yes, often" ~ 2,
      TRUE ~ 0
    ),
    
    Spend_other_score = case_when(
      Spend_other. == "Never" ~ 0,
      Spend_other. == "Occasionally" ~ 1,
      Spend_other. == "Frequently" ~ 2,
      TRUE ~ 0
    ),
    
    Availabilty_score = case_when(
      Availabilty == "Very Good" ~ 0,
      Availabilty == "Good" ~ 1,
      Availabilty == "Adequate" ~ 2,
      Availabilty == "Poor" ~ 3,
      Availabilty == "Very poor" ~ 4,
      TRUE ~ 0
    ),
    
    Travel_score = case_when(
      Travel. == "No" ~ 0,
      Travel. == "Yes" ~ 2,
      TRUE ~ 0
    ),
    
    Extra_time_score = case_when(
      Extra_time == "Less than 30 minutes" ~ 0,
      Extra_time == "30 minutes - 1 hour" ~ 1,
      Extra_time == "1–2 hours" ~ 2,
      Extra_time == "3–4 hours" ~ 3,
      Extra_time == "More than 4 hours" ~ 4,
      TRUE ~ 0
    ),
    
    Extra_time_shopping_score = case_when(
      Extra_time_shopping. == "Less than 30 minutes" ~ 0,
      Extra_time_shopping. == "30 minutes - 1 hour" ~ 1,
      Extra_time_shopping. == "1–2 hours" ~ 2,
      Extra_time_shopping. == "3–4 hours" ~ 3,
      Extra_time_shopping. == "More than 4 hours" ~ 4,
      TRUE ~ 0
    ),
    
    Time_off_score = case_when(
      Time_off == "Yes" ~ 2,
      Time_off == "No" ~ 0,
      TRUE ~ 0
    ),
    
    Stress_score = case_when(
      Stress == "None at all" ~ 0,
      Stress == "A small amount" ~ 1,
      Stress == "A moderate amount" ~ 2,
      Stress == "A great deal" ~ 3,
      TRUE ~ 0
    ),
    
    Limited_diet_score = case_when(
      Follow_GF. == "No" ~ 0,
      Follow_GF. == "Yes - occasionally" ~ 1,
      Follow_GF. == "Yes - frequently" ~ 2,
      TRUE ~ 0
    ),
    
    QOL_score = case_when(
      QOL == "No impact" ~ 0,
      QOL == "Yes - a slight impact" ~ 1,
      QOL == "Yes - a significant impact" ~ 2,
      TRUE ~ 0
    ),
    
  ) %>%
  #  Sum all score columns to create the final index
  mutate(
    economic_risk_index = rowSums(across(ends_with("_score")), na.rm = TRUE)
  )

Visualise distribution of index

ggplot(index_df, aes(x = economic_risk_index)) +
  geom_histogram(binwidth = 1, fill = "#69b3a2", color = "white") +
  labs(title = "Distribution of Economic Risk Index",
       x = "Economic Risk Index Score",
       y = "Number of Respondents") +
  theme_minimal()

Categorise into risk groups using z scores

index_df = index_df %>%
  mutate(z_score = scale(economic_risk_index)[,1])

index_df = index_df %>%
  mutate(risk_category = case_when(
    z_score <= -0.5 ~ "Low Risk",
    z_score >=  0.5 ~ "High Risk",
    TRUE            ~ "Moderate Risk"
  ))

table(index_df$risk_category)
## 
##     High Risk      Low Risk Moderate Risk 
##           146           157           181
ggplot(index_df, aes(x = economic_risk_index, fill = risk_category)) +
  geom_histogram(binwidth = 1, color = "white", boundary = 0, closed = "left") +
  scale_fill_manual(values = c("Low Risk" = "green", 
                               "Moderate Risk" = "gold", 
                               "High Risk" = "red")) +
  labs(
    title = "Economic Risk Index Distribution by Category",
    x = "Economic Risk Index Score",
    y = "Number of Respondents",
    fill = "Risk Category"
  ) +
  theme_minimal() +
  theme(
    legend.position = "top"
  )

Add back to original data and remove nas

data = data %>%
  mutate(
    economic_risk_index = index_df$economic_risk_index,
    risk_category = index_df$risk_category
  )

data = data %>%
  mutate(
    Risk = str_to_title(str_trim(Risk)),  
    Risk = ifelse(Risk == "", NA, Risk)   
  )

Compare to original risk scores

data %>%
  filter(!is.na(Risk), !is.na(risk_category)) %>%
  count(Risk, risk_category) %>%
  group_by(Risk) %>%
  mutate(prop = n / sum(n)) %>%
  ggplot(aes(x = Risk, y = prop, fill = risk_category)) +
  geom_col(position = "stack") +
  scale_fill_manual(values = c("Low Risk" = "green", "Moderate Risk" = "gold", "High Risk" = "red")) +
  labs(
    title = "Distribution of Calculated Risk Within Each Self-Reported Category",
    x = "Self-Reported Risk",
    y = "Proportion",
    fill = "Calculated Risk"
  ) +
  theme_minimal()

Demographic analysis and cleaning

# Gender

data = data %>%
  filter(`Gender.` != "My sex is female")

data = data %>%
  filter(`Gender.` != "Prefer not to say")

data = data %>%
  filter(`Gender.` != "Non-binary")

gender_proportions = prop.table(table(data$Gender))
gender_percentages = gender_proportions * 100
print(gender_percentages)
## 
##       Man     Woman 
##  5.405405 94.594595
# Age 


age_proportions = prop.table(table(data$Age))
age_percentages = age_proportions * 100
print(age_percentages)
## 
##       18-24       25-34       35-44       45-54       55-64 65 or older 
##    6.652807   13.305613   22.037422   25.987526   20.374220   11.642412
# Race

data = data %>%
  filter(`Ethnicity` != "British")

race_proportions = prop.table(table(data$Ethnicity))
race_percentages = race_proportions * 100
print(race_percentages)
## 
##           Asian/Asian British  Mixed/Multiple ethnic origins  
##                      0.2083333                      0.4166667 
##                          White 
##                     99.3750000
# Education 

# Combine undergrad and postgrad into one to match census category 

data = data %>%
  mutate(Education = str_trim(Education),   
         Education = case_when(
           str_to_lower(Education) %in% c("college or university", "post-graduate degree") ~ "Higher education",
           TRUE ~ Education
         ))

data = data %>%
  filter(`Education` != "Prefer not to say")

education_proportions = prop.table(table(data$Education))
education_percentages = education_proportions * 100
print(education_percentages)
## 
##                                                Higher education 
##                                                       70.675105 
## Higher or secondary or further education (A-Levels, BTEC, etc.) 
##                                                       21.097046 
##                                 Secondary school up to 16 years 
##                                                        8.227848
# Income 

data = data %>%
  filter(`Income.` != "Prefer not to say")

income_proportions = prop.table(table(data$Income.))
income_percentages = income_proportions * 100
print(income_percentages)
## 
##      £0 to £15,000 per year  £15,000 to £30,000 per year  
##                     6.038647                    16.666667 
## £30,000 to £50,000 per year  £50,000 to £100,000 per year 
##                    28.019324                    38.164251 
##     Above £100,000 per year  
##                    11.111111
# Car

car_proportions = prop.table(table(data$Car))
car_percentages = car_proportions * 100
print(car_percentages)
## 
##        No       Yes 
##  5.555556 94.444444
# CD

CD_proportions = prop.table(table(data$CD))
CD_percentages = CD_proportions * 100
print(CD_percentages)
## 
##       No      Yes 
## 20.28986 79.71014
# Household 

household_proportions = prop.table(table(data$Household.))
household_percentages = household_proportions * 100
print(household_percentages)
## 
##                                       Child 
##                                  12.3188406 
##                         Child being tested  
##                                   0.2415459 
##                            Friend/housemate 
##                                   0.4830918 
##                                  Grandchild 
##                                   0.2415459 
##             Granddaughter. Daughter in law  
##                                   0.2415459 
##                        Husband and daughter 
##                                   0.2415459 
## I am coeliac and my 2 daughters are coeliac 
##                                   0.2415459 
##                                         Me  
##                                   0.2415459 
##                                      Myself 
##                                   0.2415459 
##                         Myself and my child 
##                                   0.2415459 
##                                         N/A 
##                                  63.7681159 
##                                 No just me  
##                                   0.4830918 
##                                     Parent  
##                                  13.0434783 
##                           parent and spouse 
##                                   0.2415459 
##                           Partner and child 
##                                   0.2415459 
##                              Partner/spouse 
##                                   5.7971014 
##                                     Sibling 
##                                   1.6908213
# Location

leaflet(data) %>%
  addProviderTiles(providers$CartoDB.Positron) %>%
  addCircleMarkers(~longitude, ~latitude,
                   radius = 4,
                   popup = ~Postcode,
                   fillColor = "red",
                   fillOpacity = 0.6,
                   stroke = FALSE) %>%
  addScaleBar()

Desciptive statistics

Summary table direct impacts

direct_cols = c(
  "Spend",
  "Cost_home",
  "Avoided.",
  "Limited",
  "Prescription",
  "Prescription_difference",
  "Cost_out",
  "Avoided_out.",
  "Spend_other.")

q_names = c(
  Spend = "Household Spend per Week on GF Diet",
  Cost_home = "Cost of GF Alternatives in Local Area",
  Avoided. = "Avoided Buying GF Food Due To Cost",
  Limited = "Cost of GF Food Limited Household Diet or Nutrition", 
  Prescription = "Recieve GF Food on Prescription",
  Prescription_difference = "Financial Impact of Prescriptions",
  Cost_out = "Cost of GF Alternatives When Eating Out",
  Avoided_out. = "Avoided Eating Out Due to GF Alternative Cost",
  Spend_other. = "Cost of GF Food Caused To Limit Spending Elsewhere")

out = do.call(rbind, lapply(direct_cols, function(col) {
  v = data[[col]]
  v = v[!is.na(v) & v != ""]  
  
  if (length(v) == 0) return(NULL) 
  
  tab = table(v, useNA = "no")
 
  tab = sort(tab, decreasing = TRUE)
  
  pct = round(100 * prop.table(tab), 1)
  summary_str = paste(paste0(names(pct), ": ", pct, "%"), collapse = ", ")
  
  data.frame(
    Question = ifelse(col %in% names(q_names), q_names[col], col),
    Summary  = summary_str,
    stringsAsFactors = FALSE
  )
}))

out
##                                              Question
## 1                 Household Spend per Week on GF Diet
## 2               Cost of GF Alternatives in Local Area
## 3                  Avoided Buying GF Food Due To Cost
## 4 Cost of GF Food Limited Household Diet or Nutrition
## 5                     Recieve GF Food on Prescription
## 6                   Financial Impact of Prescriptions
## 7             Cost of GF Alternatives When Eating Out
## 8       Avoided Eating Out Due to GF Alternative Cost
## 9  Cost of GF Food Caused To Limit Spending Elsewhere
##                                                                                                                                                           Summary
## 1                                                                                     £11-£20: 32.6%, £21-£30: 28.2%, Over £40: 16.9%, £31-£40: 13%, £0-£10: 9.3%
## 2                                                                                 Much more expensive: 91.4%, Slightly more expensive: 8.1%, About the same: 0.5%
## 3                                                                                   Yes, often: 44.6%, Yes, occasionally: 35.5%, No, never: 19.4%, Not sure: 0.5%
## 4                                                                                      No: 40.8%, Yes, somewhat: 39.1%, Yes, significantly: 13.4%, Not sure: 6.6%
## 5 No, not eligible in my area: 61.6%, No, eligible, but do not use it: 12.7%, Yes, regularly: 9.3%, No, unaware this was an option: 8.6%, Yes, occasionally: 7.8%
## 6                                                   A significant difference: 37.6%, A moderate difference: 33.3%, A small difference: 19.4%, No difference: 9.7%
## 7                                                                                    Expensive: 56.5%, Affordable: 30.1%, Very expensive: 13.2%, Very cheap: 0.2%
## 8                                                                                   No, never: 38.7%, Yes, occasionally: 37.2%, Yes, often: 21.4%, Not sure: 2.7%
## 9                                                                                            Occasionally: 41.4%, Never: 32.6%, Frequently: 21.2%, Not sure: 4.9%
out |>
  gt() |>
  cols_label(Question = "Question", Summary = "Summary") |>
  opt_row_striping() |>
  tab_options(table.font.size = px(12)) |>
  tab_style(
    style = cell_text(weight = "bold"),
    locations = cells_body(columns = Question)
  ) |>
  cols_width(
    Question ~ px(220),
    Summary  ~ px(520)
  )|>
  tab_options(table.font.names = "Times New Roman")
Question Summary
Household Spend per Week on GF Diet £11-£20: 32.6%, £21-£30: 28.2%, Over £40: 16.9%, £31-£40: 13%, £0-£10: 9.3%
Cost of GF Alternatives in Local Area Much more expensive: 91.4%, Slightly more expensive: 8.1%, About the same: 0.5%
Avoided Buying GF Food Due To Cost Yes, often: 44.6%, Yes, occasionally: 35.5%, No, never: 19.4%, Not sure: 0.5%
Cost of GF Food Limited Household Diet or Nutrition No: 40.8%, Yes, somewhat: 39.1%, Yes, significantly: 13.4%, Not sure: 6.6%
Recieve GF Food on Prescription No, not eligible in my area: 61.6%, No, eligible, but do not use it: 12.7%, Yes, regularly: 9.3%, No, unaware this was an option: 8.6%, Yes, occasionally: 7.8%
Financial Impact of Prescriptions A significant difference: 37.6%, A moderate difference: 33.3%, A small difference: 19.4%, No difference: 9.7%
Cost of GF Alternatives When Eating Out Expensive: 56.5%, Affordable: 30.1%, Very expensive: 13.2%, Very cheap: 0.2%
Avoided Eating Out Due to GF Alternative Cost No, never: 38.7%, Yes, occasionally: 37.2%, Yes, often: 21.4%, Not sure: 2.7%
Cost of GF Food Caused To Limit Spending Elsewhere Occasionally: 41.4%, Never: 32.6%, Frequently: 21.2%, Not sure: 4.9%
out
##                                              Question
## 1                 Household Spend per Week on GF Diet
## 2               Cost of GF Alternatives in Local Area
## 3                  Avoided Buying GF Food Due To Cost
## 4 Cost of GF Food Limited Household Diet or Nutrition
## 5                     Recieve GF Food on Prescription
## 6                   Financial Impact of Prescriptions
## 7             Cost of GF Alternatives When Eating Out
## 8       Avoided Eating Out Due to GF Alternative Cost
## 9  Cost of GF Food Caused To Limit Spending Elsewhere
##                                                                                                                                                           Summary
## 1                                                                                     £11-£20: 32.6%, £21-£30: 28.2%, Over £40: 16.9%, £31-£40: 13%, £0-£10: 9.3%
## 2                                                                                 Much more expensive: 91.4%, Slightly more expensive: 8.1%, About the same: 0.5%
## 3                                                                                   Yes, often: 44.6%, Yes, occasionally: 35.5%, No, never: 19.4%, Not sure: 0.5%
## 4                                                                                      No: 40.8%, Yes, somewhat: 39.1%, Yes, significantly: 13.4%, Not sure: 6.6%
## 5 No, not eligible in my area: 61.6%, No, eligible, but do not use it: 12.7%, Yes, regularly: 9.3%, No, unaware this was an option: 8.6%, Yes, occasionally: 7.8%
## 6                                                   A significant difference: 37.6%, A moderate difference: 33.3%, A small difference: 19.4%, No difference: 9.7%
## 7                                                                                    Expensive: 56.5%, Affordable: 30.1%, Very expensive: 13.2%, Very cheap: 0.2%
## 8                                                                                   No, never: 38.7%, Yes, occasionally: 37.2%, Yes, often: 21.4%, Not sure: 2.7%
## 9                                                                                            Occasionally: 41.4%, Never: 32.6%, Frequently: 21.2%, Not sure: 4.9%

Indirect impacts summary table

indirect_cols = c(
  "Availability",
  "Travel.",
  "Extra_time",
  "Extra_time_shopping.",
  "Time_off",
  "Stress",
  "Follow_GF.",
  "QOL",
  "Support.")

q_names = c(
  Availability = "Availabilty of GF Food in Local Area",
  Travel. = "Have to Travel Outside of Area for GF Food or Medical Care",
  Extra_time = "Travel Time for GF Products",
  Extra_time_shopping. = "Extra Weekly Time Spent on GF Shopping/Prep",
  Time_off = "Time Off Work/School/Responsibilities Due to CD",
  Stress = "Extent of Stress Due to CD Financial Impact",
  Follow_GF. = "Financial Impacts Made Strict GFD Harder",
  QOL = "Impact on household quality of life",
  Support. = "Emotional/Mental Health Support Required")
  
  
 out = do.call(rbind, lapply(indirect_cols, function(col) {
  v = data[[col]]
  v = v[!is.na(v) & v != ""]   
  
  if (length(v) == 0) return(NULL)  
  
  tab = table(v, useNA = "no")
  
  tab = sort(tab, decreasing = TRUE)
  
  pct = round(100 * prop.table(tab), 1)
  summary_str = paste(paste0(names(pct), ": ", pct, "%"), collapse = ", ")
  
  data.frame(
    Question = ifelse(col %in% names(q_names), q_names[col], col),
    Summary  = summary_str,
    stringsAsFactors = FALSE
  )
}))

out
##                                                     Question
## 1 Have to Travel Outside of Area for GF Food or Medical Care
## 2                                Travel Time for GF Products
## 3                Extra Weekly Time Spent on GF Shopping/Prep
## 4            Time Off Work/School/Responsibilities Due to CD
## 5                Extent of Stress Due to CD Financial Impact
## 6                   Financial Impacts Made Strict GFD Harder
## 7                        Impact on household quality of life
## 8                   Emotional/Mental Health Support Required
##                                                                                                                                                                                                                                       Summary
## 1                                                                                                                                                                                                                       No: 56.8%, Yes: 43.2%
## 2                                                                                                         5–10 minutes: 39.9%, 11–20 minutes: 35%, 21–30 minutes: 12.5%, Less than 5 minutes: 8.6%, Over 30 minutes: 3.7%, I’m not sure: 0.5%
## 3                                                                                                       30 minutes - 1 hour: 30.8%, 1–2 hours : 26.9%, Less than 30 minutes: 18.4%, 3–4 hours: 12.4%, More than 4 hours: 6.8%, Not sure: 4.6%
## 4                                                                                                                                                                                                                       Yes: 60.1%, No: 39.9%
## 5                                                                                                                                        A small amount: 42.3%, A moderate amount: 33%, None at all: 12.5%, A great deal: 12%, Not sure: 0.2%
## 6                                                                                                                                           No - I’ve always been able to manage it: 68.2%, Yes - occasionally: 24.9%, Yes - frequently: 6.8%
## 7                                                                                                                                           Yes - a slight impact: 52.3%, No impact: 27.1%, Yes - a significant impact: 16.9%, Not sure: 3.7%
## 8 No - not needed\u2028: 55.7%,  No - but it would have been helpful\u2028: 23%, Yes - informal support (e.g. friends, support groups)\u2028: 15.6%, Yes - formal support (e.g. counselling, GP support)\u2028: 5.4%, Prefer not to say: 0.2%
out |>
  gt() |>
  cols_label(Question = "Question", Summary = "Summary") |>
  opt_row_striping() |>
  tab_options(table.font.size = px(12)) |>
  tab_style(
    style = cell_text(weight = "bold"),
    locations = cells_body(columns = Question)
  ) |>
  cols_width(
    Question ~ px(220),
    Summary  ~ px(520)
  )|>
  tab_options(table.font.names = "Times New Roman")
Question Summary
Have to Travel Outside of Area for GF Food or Medical Care No: 56.8%, Yes: 43.2%
Travel Time for GF Products 5–10 minutes: 39.9%, 11–20 minutes: 35%, 21–30 minutes: 12.5%, Less than 5 minutes: 8.6%, Over 30 minutes: 3.7%, I’m not sure: 0.5%
Extra Weekly Time Spent on GF Shopping/Prep 30 minutes - 1 hour: 30.8%, 1–2 hours : 26.9%, Less than 30 minutes: 18.4%, 3–4 hours: 12.4%, More than 4 hours: 6.8%, Not sure: 4.6%
Time Off Work/School/Responsibilities Due to CD Yes: 60.1%, No: 39.9%
Extent of Stress Due to CD Financial Impact A small amount: 42.3%, A moderate amount: 33%, None at all: 12.5%, A great deal: 12%, Not sure: 0.2%
Financial Impacts Made Strict GFD Harder No - I’ve always been able to manage it: 68.2%, Yes - occasionally: 24.9%, Yes - frequently: 6.8%
Impact on household quality of life Yes - a slight impact: 52.3%, No impact: 27.1%, Yes - a significant impact: 16.9%, Not sure: 3.7%
Emotional/Mental Health Support Required No - not needed
: 55.7%, No - but it would have been helpful
: 23%, Yes - informal support (e.g. friends, support groups)
: 15.6%, Yes - formal support (e.g. counselling, GP support)
: 5.4%, Prefer not to say: 0.2%
out 
##                                                     Question
## 1 Have to Travel Outside of Area for GF Food or Medical Care
## 2                                Travel Time for GF Products
## 3                Extra Weekly Time Spent on GF Shopping/Prep
## 4            Time Off Work/School/Responsibilities Due to CD
## 5                Extent of Stress Due to CD Financial Impact
## 6                   Financial Impacts Made Strict GFD Harder
## 7                        Impact on household quality of life
## 8                   Emotional/Mental Health Support Required
##                                                                                                                                                                                                                                       Summary
## 1                                                                                                                                                                                                                       No: 56.8%, Yes: 43.2%
## 2                                                                                                         5–10 minutes: 39.9%, 11–20 minutes: 35%, 21–30 minutes: 12.5%, Less than 5 minutes: 8.6%, Over 30 minutes: 3.7%, I’m not sure: 0.5%
## 3                                                                                                       30 minutes - 1 hour: 30.8%, 1–2 hours : 26.9%, Less than 30 minutes: 18.4%, 3–4 hours: 12.4%, More than 4 hours: 6.8%, Not sure: 4.6%
## 4                                                                                                                                                                                                                       Yes: 60.1%, No: 39.9%
## 5                                                                                                                                        A small amount: 42.3%, A moderate amount: 33%, None at all: 12.5%, A great deal: 12%, Not sure: 0.2%
## 6                                                                                                                                           No - I’ve always been able to manage it: 68.2%, Yes - occasionally: 24.9%, Yes - frequently: 6.8%
## 7                                                                                                                                           Yes - a slight impact: 52.3%, No impact: 27.1%, Yes - a significant impact: 16.9%, Not sure: 3.7%
## 8 No - not needed\u2028: 55.7%,  No - but it would have been helpful\u2028: 23%, Yes - informal support (e.g. friends, support groups)\u2028: 15.6%, Yes - formal support (e.g. counselling, GP support)\u2028: 5.4%, Prefer not to say: 0.2%

Model one - dependent variable self-identified risk score

Prep data

# Turn off scientific notation

options(scipen = 999)

# Convert dependent variable to ordinal 

data$Risk = factor(data$Risk, 
                             levels = c("Low Risk", "Moderate Risk", "High Risk"), 
                             ordered = TRUE)

# Convert rest of categorical variables to factor 

data$Gender. = as.factor(data$Gender.)
data$Age = as.factor(data$Age)
data$Education = as.factor(data$Education)
data$Income. = as.factor(data$Income.)
data$Car. = as.factor(data$Car.)
data$Urban_rural_flag  = as.factor(data$Urban_rural_flag)

# Set test train - using caret to keeep proportional classes

set.seed(123)

# Create partition — stratified by outcome variable

train_indices = createDataPartition(data$Risk, p = 0.7, list = FALSE)

train_data = data[train_indices, ]
test_data  = data[-train_indices, ]

# clean data 

train_data_clean = na.omit(train_data)
test_data_clean = na.omit(test_data)

Chi-squared and anova tests

# results table
results = data.frame(
  Variable = character(),
  Chi_Squared_p = numeric(),
  Fisher_p = numeric(),
  ANOVA_p = numeric(),
  Test_Type = character(),
  stringsAsFactors = FALSE
)

# Categorical variables

# Gender

chi_gender = chisq.test(train_data_clean$Gender., train_data_clean$Risk)
fisher_gender = fisher.test(train_data_clean$Gender., train_data_clean$Risk, simulate.p.value=TRUE)
results = rbind(results, data.frame(
  Variable = "Gender",
  Chi_Squared_p = chi_gender$p.value,
  Fisher_p = fisher_gender$p.value,
  ANOVA_p = NA,
  Test_Type = "Categorical"
))

# Age

chi_age = chisq.test(train_data_clean$Age, train_data_clean$Risk)
fisher_age = fisher.test(train_data_clean$Age, train_data_clean$Risk, simulate.p.value=TRUE)
results = rbind(results, data.frame(
  Variable = "Age",
  Chi_Squared_p = chi_age$p.value,
  Fisher_p = fisher_age$p.value,
  ANOVA_p = NA,
  Test_Type = "Categorical"
))

# Education

chi_edu = chisq.test(train_data_clean$Education, train_data_clean$Risk)
fisher_edu = fisher.test(train_data_clean$Education, train_data_clean$Risk, simulate.p.value=TRUE)
results = rbind(results, data.frame(
  Variable = "Education",
  Chi_Squared_p = chi_edu$p.value,
  Fisher_p = fisher_edu$p.value,
  ANOVA_p = NA,
  Test_Type = "Categorical"
))

# Income
chi_income = chisq.test(train_data_clean$Income., train_data_clean$Risk)
fisher_income = fisher.test(train_data_clean$Income., train_data_clean$Risk, simulate.p.value=TRUE)
results = rbind(results, data.frame(
  Variable = "Income",
  Chi_Squared_p = chi_income$p.value,
  Fisher_p = fisher_income$p.value,
  ANOVA_p = NA,
  Test_Type = "Categorical"
))

# Car

chi_car = chisq.test(train_data_clean$Car., train_data_clean$Risk)
fisher_car = fisher.test(train_data_clean$Car., train_data_clean$Risk, simulate.p.value=TRUE)
results = rbind(results, data.frame(
  Variable = "Car",
  Chi_Squared_p = chi_car$p.value,
  Fisher_p = fisher_car$p.value,
  ANOVA_p = NA,
  Test_Type = "Categorical"
))

# Urban/Rural
chi_urban = chisq.test(train_data_clean$Urban_rural_flag, train_data_clean$Risk)
fisher_urban = fisher.test(train_data_clean$Urban_rural_flag, train_data_clean$Risk, simulate.p.value=TRUE)
results = rbind(results, data.frame(
  Variable = "Urban_Rural",
  Chi_Squared_p = chi_urban$p.value,
  Fisher_p = fisher_urban$p.value,
  ANOVA_p = NA,
  Test_Type = "Categorical"
))

# Continuous variables

# SOA deciles

aov_soa = aov(SOA_decile ~ Risk, data = train_data_clean)
soa_p = summary(aov_soa)[[1]][["Pr(>F)"]][1]
results = rbind(results, data.frame(
  Variable = "SOA_decile",
  Chi_Squared_p = NA,
  Fisher_p = NA,
  ANOVA_p = soa_p,
  Test_Type = "Continuous"
))

# Time to GP

aov_gp <- aov(min_time_to_gp ~ Risk, data = train_data_clean)
gp_p <- summary(aov_gp)[[1]][["Pr(>F)"]][1]
results <- rbind(results, data.frame(
  Variable = "Time_to_GP",
  Chi_Squared_p = NA,
  Fisher_p = NA,
  ANOVA_p = gp_p,
  Test_Type = "Continuous"
))

Make Table look nice

results = results %>%
  mutate(
    Variable = recode(
      Variable,
      "Urban_Rural" = "Urban–Rural Classification",
      "SOA_decile"  = "Deprivation decile (LSOA)",
      "Time_to_GP"  = "Time to GP (mins)",
      "Car"         = "Car access",
      "Income"      = "Household income",
      "Education"   = "Education level"
    )
  )

nice_tbl = results %>%
  mutate(
    Chi_Squared_p = round(Chi_Squared_p, 4),
    Fisher_p      = round(Fisher_p, 4),
    ANOVA_p       = round(ANOVA_p, 4),
    Conservative = if_else(
      (Test_Type == "Categorical" & Chi_Squared_p < 0.10) |
      (Test_Type == "Continuous"  & ANOVA_p       < 0.10), "Yes","No"),
    Moderate = if_else(
      (Test_Type == "Categorical" & Chi_Squared_p < 0.50) |
      (Test_Type == "Continuous"  & ANOVA_p       < 0.50), "Yes","No")
  ) %>%
  gt(groupname_col = "Test_Type") %>%
 
  cols_label(
    Chi_Squared_p = html("Chi-Squared<br>p"),
    Fisher_p      = html("Fisher’s Exact<br>p"),
    ANOVA_p       = html("ANOVA<br>p"),
    Conservative  = html("Significant<br>(α = 0.10)"),
    Moderate      = html("Significant<br>(α = 0.50)")
  ) %>%
  tab_style(
    style = list(cell_fill(color = "#E8F5E9"), cell_text(weight = "bold")),
    locations = cells_body(columns = Conservative, rows = Conservative == "Yes")
  ) %>%
  tab_style(
    style = list(cell_fill(color = "#E8F5E9"), cell_text(weight = "bold")),
    locations = cells_body(columns = Moderate, rows = Moderate == "Yes")
  ) %>%
  tab_spanner(label = "p-values", columns = c(Chi_Squared_p, Fisher_p, ANOVA_p)) %>%
  fmt_number(columns = c(Chi_Squared_p, Fisher_p, ANOVA_p), decimals = 4) %>%
  sub_missing(columns = everything(), missing_text = "—") %>%
  opt_row_striping()

nice_tbl
Variable
p-values
Significant
(α = 0.10)
Significant
(α = 0.50)
Chi-Squared
p
Fisher’s Exact
p
ANOVA
p
Categorical
Gender 0.7439 1.0000 No No
Age 0.5923 0.6087 No No
Education level 0.1190 0.1109 No Yes
Household income 0.0001 0.0005 Yes Yes
Car access 0.1918 0.2484 No Yes
Urban–Rural Classification 0.4602 0.4633 No Yes
Continuous
Deprivation decile (LSOA) 0.0098 Yes Yes
Time to GP (mins) 0.2221 No Yes
# Change size and shape of table

nice_tbl = nice_tbl %>%
  tab_options(table.width = pct(80)) %>%
  cols_width(
    everything() ~ px(120)
  ) %>%
  tab_options(
    table.font.size       = px(11),  
    data_row.padding      = px(2),   
    column_labels.padding = px(2),
    row_group.padding     = px(2),
    heading.padding       = px(0)    
  ) %>%
  cols_label(
    Chi_Squared_p = "Chi-Squared p",
    Fisher_p      = "Fisher’s Exact p",
    ANOVA_p       = "ANOVA p",
    Conservative = "Significant (p < 0.10)",
    Moderate     = "Significant (p < 0.50)",
  )

nice_tbl = nice_tbl %>%
  gt::tab_options(table.font.names = c("Times New Roman", "Times", "serif"))


nice_tbl
Variable
p-values
Significant (p < 0.10) Significant (p < 0.50)
Chi-Squared p Fisher’s Exact p ANOVA p
Categorical
Gender 0.7439 1.0000 No No
Age 0.5923 0.6087 No No
Education level 0.1190 0.1109 No Yes
Household income 0.0001 0.0005 Yes Yes
Car access 0.1918 0.2484 No Yes
Urban–Rural Classification 0.4602 0.4633 No Yes
Continuous
Deprivation decile (LSOA) 0.0098 Yes Yes
Time to GP (mins) 0.2221 No Yes

Ordinal Logisitic Regression - Conservative

# Run model 

model = polr(Risk ~ Income. + SOA_decile, data = train_data_clean, Hess=TRUE)

confusionMatrix(predict(model, test_data_clean),
                test_data_clean$Risk)
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Low Risk Moderate Risk High Risk
##   Low Risk            71            33         2
##   Moderate Risk        5             8         1
##   High Risk            0             0         0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.6583          
##                  95% CI : (0.5662, 0.7424)
##     No Information Rate : 0.6333          
##     P-Value [Acc > NIR] : 0.3203          
##                                           
##                   Kappa : 0.1473          
##                                           
##  Mcnemar's Test P-Value : 0.00002982      
## 
## Statistics by Class:
## 
##                      Class: Low Risk Class: Moderate Risk Class: High Risk
## Sensitivity                   0.9342              0.19512            0.000
## Specificity                   0.2045              0.92405            1.000
## Pos Pred Value                0.6698              0.57143              NaN
## Neg Pred Value                0.6429              0.68868            0.975
## Prevalence                    0.6333              0.34167            0.025
## Detection Rate                0.5917              0.06667            0.000
## Detection Prevalence          0.8833              0.11667            0.000
## Balanced Accuracy             0.5694              0.55959            0.500
# Check if parallel regression assumption holds 

brant(model) 
## -------------------------------------------------------------------- 
## Test for             X2  df  probability 
## -------------------------------------------------------------------- 
## Omnibus                  10.54   5   0.06
## Income.£15,000 to £30,000 per year   0   1   0.95
## Income.£30,000 to £50,000 per year   0.03    1   0.87
## Income.£50,000 to £100,000 per year  0.55    1   0.46
## Income.Above £100,000 per year   3.35    1   0.07
## SOA_decile               1.16    1   0.28
## -------------------------------------------------------------------- 
## 
## H0: Parallel Regression Assumption holds

Logistic Regression - moderate

# Run model 

model = polr(Risk ~ Income. + SOA_decile + Education + Car. + Urban_rural_flag + min_time_to_gp, data = train_data_clean, Hess=TRUE)

confusionMatrix(predict(model, test_data_clean),
                test_data_clean$Risk)
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Low Risk Moderate Risk High Risk
##   Low Risk            69            32         2
##   Moderate Risk        7             9         1
##   High Risk            0             0         0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.65            
##                  95% CI : (0.5576, 0.7348)
##     No Information Rate : 0.6333          
##     P-Value [Acc > NIR] : 0.3912068       
##                                           
##                   Kappa : 0.1421          
##                                           
##  Mcnemar's Test P-Value : 0.0002701       
## 
## Statistics by Class:
## 
##                      Class: Low Risk Class: Moderate Risk Class: High Risk
## Sensitivity                   0.9079               0.2195            0.000
## Specificity                   0.2273               0.8987            1.000
## Pos Pred Value                0.6699               0.5294              NaN
## Neg Pred Value                0.5882               0.6893            0.975
## Prevalence                    0.6333               0.3417            0.025
## Detection Rate                0.5750               0.0750            0.000
## Detection Prevalence          0.8583               0.1417            0.000
## Balanced Accuracy             0.5676               0.5591            0.500
brant(model)
## -------------------------------------------------------------------------------------------------------------------- 
## Test for                                     X2  df  probability 
## -------------------------------------------------------------------------------------------------------------------- 
## Omnibus                                          10.23   10  0.42
## Income.£15,000 to £30,000 per year                           0   1   0.98
## Income.£30,000 to £50,000 per year                           0   1   0.97
## Income.£50,000 to £100,000 per year                          0.72    1   0.4
## Income.Above £100,000 per year                           2.62    1   0.11
## SOA_decile                                       0.61    1   0.44
## EducationHigher or secondary or further education (A-Levels, BTEC, etc.) 0.33    1   0.57
## EducationSecondary school up to 16 years                     0   1   0.99
## Car.Yes                                          0   1   0.99
## Urban_rural_flagUrban                                    1.14    1   0.29
## min_time_to_gp                                       0.04    1   0.84
## -------------------------------------------------------------------------------------------------------------------- 
## 
## H0: Parallel Regression Assumption holds

Random forest - conservative

set.seed(111)

# Run model 

rf_model = randomForest(
  Risk ~ Income. + SOA_decile,
  data = train_data_clean,
  importance = TRUE,
  ntree = 500
)

# Predict 

confusionMatrix(predict(rf_model, test_data_clean),
                test_data_clean$Risk)
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Low Risk Moderate Risk High Risk
##   Low Risk            69            31         2
##   Moderate Risk        7            10         1
##   High Risk            0             0         0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.6583          
##                  95% CI : (0.5662, 0.7424)
##     No Information Rate : 0.6333          
##     P-Value [Acc > NIR] : 0.3203424       
##                                           
##                   Kappa : 0.1675          
##                                           
##  Mcnemar's Test P-Value : 0.0004081       
## 
## Statistics by Class:
## 
##                      Class: Low Risk Class: Moderate Risk Class: High Risk
## Sensitivity                   0.9079              0.24390            0.000
## Specificity                   0.2500              0.89873            1.000
## Pos Pred Value                0.6765              0.55556              NaN
## Neg Pred Value                0.6111              0.69608            0.975
## Prevalence                    0.6333              0.34167            0.025
## Detection Rate                0.5750              0.08333            0.000
## Detection Prevalence          0.8500              0.15000            0.000
## Balanced Accuracy             0.5789              0.57132            0.500

Random forest - moderate

set.seed(111)

rf_model_2 = randomForest(
  Risk ~ Education + Income. + Car. + SOA_decile + min_time_to_gp + Urban_rural_flag,
  data = train_data_clean,
  importance = TRUE,
  ntree = 500
)

confusionMatrix(predict(rf_model_2, test_data_clean),
                test_data_clean$Risk)
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Low Risk Moderate Risk High Risk
##   Low Risk            63            29         1
##   Moderate Risk       13            12         2
##   High Risk            0             0         0
## 
## Overall Statistics
##                                          
##                Accuracy : 0.625          
##                  95% CI : (0.532, 0.7117)
##     No Information Rate : 0.6333         
##     P-Value [Acc > NIR] : 0.61475        
##                                          
##                   Kappa : 0.1325         
##                                          
##  Mcnemar's Test P-Value : 0.02805        
## 
## Statistics by Class:
## 
##                      Class: Low Risk Class: Moderate Risk Class: High Risk
## Sensitivity                   0.8289               0.2927            0.000
## Specificity                   0.3182               0.8101            1.000
## Pos Pred Value                0.6774               0.4444              NaN
## Neg Pred Value                0.5185               0.6882            0.975
## Prevalence                    0.6333               0.3417            0.025
## Detection Rate                0.5250               0.1000            0.000
## Detection Prevalence          0.7750               0.2250            0.000
## Balanced Accuracy             0.5736               0.5514            0.500

XGboost - conservative

set.seed(111)

#  Build model matrices for test and train 

mm = function(df) model.matrix(~ 0 + Income. + SOA_decile, data = df)

X_train = mm(train_data_clean)
X_test  = mm(test_data_clean)

# align test columns to train columns

train_cols = colnames(X_train)
missing = setdiff(train_cols, colnames(X_test))
if (length(missing)) {
  X_test = cbind(X_test, matrix(0, nrow(X_test), length(missing),
                                 dimnames = list(NULL, missing)))
}
X_test <- X_test[, train_cols, drop = FALSE]

# Outcome factor - create levels 

risk_levels = make.names(c("Low Risk","Moderate Risk","High Risk"))
y_train = factor(make.names(as.character(train_data_clean$Risk)), levels = risk_levels)
y_test  = factor(make.names(as.character(test_data_clean$Risk)),  levels = risk_levels)

# Create tuning grid 

xgb_grid = expand.grid(
  nrounds = c(100, 200),
  max_depth = c(3, 5, 7),
  eta = c(0.05, 0.1),
  gamma = c(0, 1),
  colsample_bytree = c(0.6, 0.8),
  min_child_weight = c(1, 5),
  subsample = 0.8
)

control = trainControl(
  method = "cv",
  number = 5,
  verboseIter = TRUE,
  allowParallel = TRUE,
  classProbs = TRUE,
  summaryFunction = multiClassSummary
)

# Train model 

xgb_tuned = train(
  x = X_train,
  y = y_train,
  method = "xgbTree",
  trControl = control,
  tuneGrid = xgb_grid,
  metric = "Accuracy"
)
# predict on test data 

prob_df = predict(xgb_tuned, newdata = X_test, type = "prob") 
pred_lab = factor(colnames(prob_df)[max.col(as.matrix(prob_df))], levels = risk_levels)

# Confusion matrix 

caret::confusionMatrix(pred_lab, y_test)
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Low.Risk Moderate.Risk High.Risk
##   Low.Risk            73            31         3
##   Moderate.Risk        3            10         0
##   High.Risk            0             0         0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.6917          
##                  95% CI : (0.6009, 0.7727)
##     No Information Rate : 0.6333          
##     P-Value [Acc > NIR] : 0.1082          
##                                           
##                   Kappa : 0.2258          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Low.Risk Class: Moderate.Risk Class: High.Risk
## Sensitivity                   0.9605              0.24390            0.000
## Specificity                   0.2273              0.96203            1.000
## Pos Pred Value                0.6822              0.76923              NaN
## Neg Pred Value                0.7692              0.71028            0.975
## Prevalence                    0.6333              0.34167            0.025
## Detection Rate                0.6083              0.08333            0.000
## Detection Prevalence          0.8917              0.10833            0.000
## Balanced Accuracy             0.5939              0.60296            0.500

XGBoost - moderate

set.seed(111)

# Create model matrices for test and train 

mm = function(df) model.matrix(~ 0 + Education + Car. + Urban_rural_flag + Income. + SOA_decile + min_time_to_gp, data = df)

X_train = mm(train_data_clean)
X_test  = mm(test_data_clean)

# align test columns to train columns 

train_cols = colnames(X_train)
missing = setdiff(train_cols, colnames(X_test))
if (length(missing)) {
  X_test <- cbind(X_test, matrix(0, nrow(X_test), length(missing),
                                 dimnames = list(NULL, missing)))
}
X_test = X_test[, train_cols, drop = FALSE]

# Outcome factor

risk_levels = make.names(c("Low Risk","Moderate Risk","High Risk"))
y_train = factor(make.names(as.character(train_data_clean$Risk)), levels = risk_levels)
y_test = factor(make.names(as.character(test_data_clean$Risk)),  levels = risk_levels)

# Create tuning grid 

xgb_grid = expand.grid(
  nrounds = c(100, 200),
  max_depth = c(3, 5, 7),
  eta = c(0.05, 0.1),
  gamma = c(0, 1),
  colsample_bytree = c(0.6, 0.8),
  min_child_weight = c(1, 5),
  subsample = 0.8
)

control = trainControl(
  method = "cv",
  number = 5,
  verboseIter = TRUE,
  allowParallel = TRUE,
  classProbs = TRUE,
  summaryFunction = multiClassSummary
)

# Train model 

xgb_tuned = train(
  x = X_train,
  y = y_train,
  method = "xgbTree",
  trControl = control,
  tuneGrid = xgb_grid,
  metric = "Accuracy"
)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
# Predict on test data 

prob_df = predict(xgb_tuned, newdata = X_test, type = "prob")  # columns == risk_levels
pred_lab = factor(colnames(prob_df)[max.col(as.matrix(prob_df))], levels = risk_levels)

# Confusion matrix

caret::confusionMatrix(pred_lab, y_test)
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Low.Risk Moderate.Risk High.Risk
##   Low.Risk            66            28         1
##   Moderate.Risk       10            13         2
##   High.Risk            0             0         0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.6583          
##                  95% CI : (0.5662, 0.7424)
##     No Information Rate : 0.6333          
##     P-Value [Acc > NIR] : 0.320342        
##                                           
##                   Kappa : 0.2006          
##                                           
##  Mcnemar's Test P-Value : 0.009195        
## 
## Statistics by Class:
## 
##                      Class: Low.Risk Class: Moderate.Risk Class: High.Risk
## Sensitivity                   0.8684               0.3171            0.000
## Specificity                   0.3409               0.8481            1.000
## Pos Pred Value                0.6947               0.5200              NaN
## Neg Pred Value                0.6000               0.7053            0.975
## Prevalence                    0.6333               0.3417            0.025
## Detection Rate                0.5500               0.1083            0.000
## Detection Prevalence          0.7917               0.2083            0.000
## Balanced Accuracy             0.6047               0.5826            0.500

Neural Network - conservative

Prep data

# Create train matrix 

x_train = model.matrix(~ 0 + Income. + SOA_decile,
                        data = train_data_clean)

# Outcome labels

risk_levels = make.names(c("Low Risk","Moderate Risk","High Risk"))
y_train = factor(make.names(as.character(train_data_clean$Risk)), levels = risk_levels)

# Test matrix
x_test = model.matrix(~ 0 + Income. + SOA_decile,
                       data = test_data_clean)

# Align test and train 
train_cols = colnames(x_train)
missing = setdiff(train_cols, colnames(x_test))
if (length(missing)) {
  x_test = cbind(x_test, matrix(0, nrow(x_test), length(missing),
                                 dimnames = list(NULL, missing)))
}
x_test = x_test[, train_cols, drop = FALSE]

Tuning and run

set.seed(111)

# Create tuning grid 

nnet_grid = expand.grid(size = c(3,5,7), decay = c(0.01, 0.1, 0.5))

control = trainControl(
  method = "cv",
  number = 5,
  classProbs = TRUE,
  summaryFunction = multiClassSummary,
  verboseIter = TRUE
)

# Run model 

nnet_model = caret::train(
  x = x_train,
  y = y_train,
  method = "nnet",
  trControl = control,
  tuneGrid = nnet_grid,
  metric = "Accuracy",
  preProcess = c("center","scale"), 
  maxit = 200,
  trace = FALSE
)
# Predict 

pred_labels = predict(nnet_model, newdata = x_test)
actual_labels = factor(make.names(as.character(test_data_clean$Risk)),
                        levels = risk_levels)
# Confusion matrix 
caret::confusionMatrix(pred_labels, actual_labels)
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Low.Risk Moderate.Risk High.Risk
##   Low.Risk            70            29         2
##   Moderate.Risk        6            12         1
##   High.Risk            0             0         0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.6833          
##                  95% CI : (0.5922, 0.7652)
##     No Information Rate : 0.6333          
##     P-Value [Acc > NIR] : 0.1485258       
##                                           
##                   Kappa : 0.233           
##                                           
##  Mcnemar's Test P-Value : 0.0004166       
## 
## Statistics by Class:
## 
##                      Class: Low.Risk Class: Moderate.Risk Class: High.Risk
## Sensitivity                   0.9211               0.2927            0.000
## Specificity                   0.2955               0.9114            1.000
## Pos Pred Value                0.6931               0.6316              NaN
## Neg Pred Value                0.6842               0.7129            0.975
## Prevalence                    0.6333               0.3417            0.025
## Detection Rate                0.5833               0.1000            0.000
## Detection Prevalence          0.8417               0.1583            0.000
## Balanced Accuracy             0.6083               0.6020            0.500

Neural network - moderate

Prep data

# Train matrix

x_train = model.matrix(~ 0 + Income. + Education + SOA_decile + Car. + Urban_rural_flag + min_time_to_gp,
                        data = train_data_clean)

# Outcome labels 

risk_levels = make.names(c("Low Risk","Moderate Risk","High Risk"))
y_train = factor(make.names(as.character(train_data_clean$Risk)), levels = risk_levels)

# Test matrix

x_test = model.matrix(~ 0 + Income. + SOA_decile + Education + Car. + Urban_rural_flag + min_time_to_gp,
                       data = test_data_clean)

# Align test and train 

train_cols = colnames(x_train)
missing = setdiff(train_cols, colnames(x_test))
if (length(missing)) {
  x_test = cbind(x_test, matrix(0, nrow(x_test), length(missing),
                                 dimnames = list(NULL, missing)))
}
x_test = x_test[, train_cols, drop = FALSE]

Tune and run

set.seed(111)

nnet_grid = expand.grid(size = c(3,5,7), decay = c(0.01, 0.1, 0.5))

control = trainControl(
  method = "cv",
  number = 5,
  classProbs = TRUE,
  summaryFunction = multiClassSummary,
  verboseIter = TRUE
)

nnet_model = caret::train(
  x = x_train,
  y = y_train,
  method = "nnet",
  trControl = control,
  tuneGrid = nnet_grid,
  metric = "Accuracy",
  preProcess = c("center","scale"),  
  maxit = 200,
  trace = FALSE
)

Predict

pred_labels = predict(nnet_model, newdata = x_test)
actual_labels = factor(make.names(as.character(test_data_clean$Risk)),
                        levels = risk_levels)
caret::confusionMatrix(pred_labels, actual_labels)
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Low.Risk Moderate.Risk High.Risk
##   Low.Risk            59            29         1
##   Moderate.Risk       17            12         2
##   High.Risk            0             0         0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.5917          
##                  95% CI : (0.4982, 0.6805)
##     No Information Rate : 0.6333          
##     P-Value [Acc > NIR] : 0.8511          
##                                           
##                   Kappa : 0.0762          
##                                           
##  Mcnemar's Test P-Value : 0.1054          
## 
## Statistics by Class:
## 
##                      Class: Low.Risk Class: Moderate.Risk Class: High.Risk
## Sensitivity                   0.7763               0.2927            0.000
## Specificity                   0.3182               0.7595            1.000
## Pos Pred Value                0.6629               0.3871              NaN
## Neg Pred Value                0.4516               0.6742            0.975
## Prevalence                    0.6333               0.3417            0.025
## Detection Rate                0.4917               0.1000            0.000
## Detection Prevalence          0.7417               0.2583            0.000
## Balanced Accuracy             0.5472               0.5261            0.500

Model 2 - Binary model

# Creating binary dependent variable 

data$BinaryRisk = ifelse(data$Risk %in% c("High Risk", "Moderate Risk"),
                          "At Risk",
                          "Low Risk")

# Make it a factor with consistent level order 

data$BinaryRisk = factor(data$BinaryRisk, levels = c("Low Risk", "At Risk"))

# Create same test train split 

# Set test train - using caret to keep proprtional classes

set.seed(123)

train_indices = createDataPartition(data$Risk, p = 0.7, list = FALSE)

train_data = data[train_indices, ]
test_data  = data[-train_indices, ]

# clean data 

train_data_clean = na.omit(train_data)
test_data_clean = na.omit(test_data)

Stats tests

# results table

results = data.frame(
  Variable = character(),
  Chi_Squared_p = numeric(),
  Fisher_p = numeric(),
  T_Test_p = numeric(),
  Test_Type = character(),
  stringsAsFactors = FALSE
)

# Categorical variables

# Gender

chi_gender = chisq.test(train_data_clean$Gender., train_data_clean$BinaryRisk)
fisher_gender = fisher.test(train_data_clean$Gender., train_data_clean$BinaryRisk, simulate.p.value=TRUE)
results = rbind(results, data.frame(
  Variable = "Gender",
  Chi_Squared_p = chi_gender$p.value,
  Fisher_p = fisher_gender$p.value,
  T_Test_p = NA,
  Test_Type = "Categorical"
))

# Age

chi_age = chisq.test(train_data_clean$Age, train_data_clean$BinaryRisk)
fisher_age = fisher.test(train_data_clean$Age, train_data_clean$BinaryRisk, simulate.p.value=TRUE)
results = rbind(results, data.frame(
  Variable = "Age",
  Chi_Squared_p = chi_age$p.value,
  Fisher_p = fisher_age$p.value,
  T_Test_p = NA,
  Test_Type = "Categorical"
))

# Education

chi_edu = chisq.test(train_data_clean$Education, train_data_clean$BinaryRisk)
fisher_edu = fisher.test(train_data_clean$Education, train_data_clean$BinaryRisk, simulate.p.value=TRUE)
results = rbind(results, data.frame(
  Variable = "Education",
  Chi_Squared_p = chi_edu$p.value,
  Fisher_p = fisher_edu$p.value,
  T_Test_p = NA,
  Test_Type = "Categorical"
))

# Income

chi_income = chisq.test(train_data_clean$Income., train_data_clean$BinaryRisk)
fisher_income = fisher.test(train_data_clean$Income., train_data_clean$BinaryRisk, simulate.p.value=TRUE)
results = rbind(results, data.frame(
  Variable = "Income",
  Chi_Squared_p = chi_income$p.value,
  Fisher_p = fisher_income$p.value,
  T_Test_p = NA,
  Test_Type = "Categorical"
))

# Car

chi_car = chisq.test(train_data_clean$Car., train_data_clean$BinaryRisk)
fisher_car = fisher.test(train_data_clean$Car., train_data_clean$BinaryRisk, simulate.p.value=TRUE)
results = rbind(results, data.frame(
  Variable = "Car",
  Chi_Squared_p = chi_car$p.value,
  Fisher_p = fisher_car$p.value,
  T_Test_p = NA,
  Test_Type = "Categorical"
))

# Urban/Rural

chi_urban = chisq.test(train_data_clean$Urban_rural_flag, train_data_clean$BinaryRisk)
fisher_urban = fisher.test(train_data_clean$Urban_rural_flag, train_data_clean$BinaryRisk, simulate.p.value=TRUE)
results = rbind(results, data.frame(
  Variable = "Urban_Rural",
  Chi_Squared_p = chi_urban$p.value,
  Fisher_p = fisher_urban$p.value,
  T_Test_p = NA,
  Test_Type = "Categorical"
))

# Continuous variables - t test 

# SOA decile 

ttest_soa = t.test(SOA_decile ~ BinaryRisk, data = train_data_clean)
soa_p = ttest_soa$p.value
results = rbind(results, data.frame(
  Variable = "SOA_decile",
  Chi_Squared_p = NA,
  Fisher_p = NA,
  T_Test_p = soa_p,
  Test_Type = "Continuous"
))

# Time to GP 

ttest_gp = t.test(min_time_to_gp ~ BinaryRisk, data = train_data_clean)
gp_p = ttest_gp$p.value
results = rbind(results, data.frame(
  Variable = "Time_to_GP",
  Chi_Squared_p = NA,
  Fisher_p = NA,
  T_Test_p = gp_p,
  Test_Type = "Continuous"
))

Make a nice table

results = results %>%
  mutate(
    Variable = recode(
      Variable,
      "Urban_Rural" = "Urban–Rural Classification",
      "SOA_decile"  = "Deprivation decile (LSOA)",
      "Time_to_GP"  = "Time to GP (mins)",
      "Car"         = "Car access",
      "Income"      = "Household income",
      "Education"   = "Education level"
    )
  )

nice_tbl = results %>%
  mutate(
    Chi_Squared_p = round(Chi_Squared_p, 4),
    Fisher_p      = round(Fisher_p, 4),
    T_Test_p       = round(T_Test_p, 4),
    Conservative = if_else(
      (Test_Type == "Categorical" & Chi_Squared_p < 0.10) |
      (Test_Type == "Continuous"  & T_Test_p       < 0.10), "Yes","No"),
    Moderate = if_else(
      (Test_Type == "Categorical" & Chi_Squared_p < 0.50) |
      (Test_Type == "Continuous"  & T_Test_p       < 0.50), "Yes","No")
  ) %>%
  gt(groupname_col = "Test_Type") %>%
  # <- no tab_header() here
  cols_label(
    Chi_Squared_p = html("Chi-Squared<br>p"),
    Fisher_p      = html("Fisher’s Exact<br>p"),
    T_Test_p       = html("T_Test_p<br>p"),
    Conservative  = html("Significant<br>(α = 0.10)"),
    Moderate      = html("Significant<br>(α = 0.50)")
  ) %>%
  tab_style(
    style = list(cell_fill(color = "#E8F5E9"), cell_text(weight = "bold")),
    locations = cells_body(columns = Conservative, rows = Conservative == "Yes")
  ) %>%
  tab_style(
    style = list(cell_fill(color = "#E8F5E9"), cell_text(weight = "bold")),
    locations = cells_body(columns = Moderate, rows = Moderate == "Yes")
  ) %>%
  tab_spanner(label = "p-values", columns = c(Chi_Squared_p, Fisher_p, T_Test_p)) %>%
  fmt_number(columns = c(Chi_Squared_p, Fisher_p, T_Test_p), decimals = 4) %>%
  sub_missing(columns = everything(), missing_text = "—") %>%
  opt_row_striping()

nice_tbl
Variable
p-values
Significant
(α = 0.10)
Significant
(α = 0.50)
Chi-Squared
p
Fisher’s Exact
p
T_Test_p
p
Categorical
Gender 1.0000 1.0000 No No
Age 0.4873 0.4813 No Yes
Education level 0.0661 0.0670 Yes Yes
Household income 0.0000 0.0005 Yes Yes
Car access 0.2461 0.1956 No Yes
Urban–Rural Classification 0.3722 0.3429 No Yes
Continuous
Deprivation decile (LSOA) 0.0040 Yes Yes
Time to GP (mins) 0.1478 No Yes
# Change shape and size of table

nice_tbl = nice_tbl %>%
  tab_options(table.width = pct(80)) %>%
  cols_width(
    everything() ~ px(120)
  ) %>%
  tab_options(
    table.font.size       = px(11),  
    data_row.padding      = px(2),   
    column_labels.padding = px(2),
    row_group.padding     = px(2),
    heading.padding       = px(0)   
  ) %>%
  cols_label(
    Chi_Squared_p = "Chi-Squared p",
    Fisher_p      = "Fisher’s Exact p",
    T_Test_p      = "T Test p",
    Conservative = "Significant (p < 0.10)",
    Moderate     = "Significant (p < 0.50)",
  )

nice_tbl = nice_tbl %>%
  gt::tab_options(table.font.names = c("Times New Roman", "Times", "serif"))


nice_tbl
Variable
p-values
Significant (p < 0.10) Significant (p < 0.50)
Chi-Squared p Fisher’s Exact p T Test p
Categorical
Gender 1.0000 1.0000 No No
Age 0.4873 0.4813 No Yes
Education level 0.0661 0.0670 Yes Yes
Household income 0.0000 0.0005 Yes Yes
Car access 0.2461 0.1956 No Yes
Urban–Rural Classification 0.3722 0.3429 No Yes
Continuous
Deprivation decile (LSOA) 0.0040 Yes Yes
Time to GP (mins) 0.1478 No Yes

Binary logistic regression - conservative

# Run model 

binary = glm(BinaryRisk ~ Income. + SOA_decile + Education,
             data = train_data_clean,
             family = binomial(link = "logit"))


# Predict probabilities

pred_probs = predict(binary, newdata = test_data_clean, type = "response")

# Convert to class labels based on standard threshold

pred_classes = ifelse(pred_probs > 0.5, "At Risk", "Low Risk")

# Make both factors with consistent levels

pred_factor = factor(pred_classes, levels = c("Low Risk", "At Risk"))
actual_factor = factor(test_data_clean$BinaryRisk, levels = c("Low Risk", "At Risk"))

# Confusion matrix

confusionMatrix(pred_factor, actual_factor)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Low Risk At Risk
##   Low Risk       70      32
##   At Risk         6      12
##                                           
##                Accuracy : 0.6833          
##                  95% CI : (0.5922, 0.7652)
##     No Information Rate : 0.6333          
##     P-Value [Acc > NIR] : 0.1485          
##                                           
##                   Kappa : 0.2213          
##                                           
##  Mcnemar's Test P-Value : 0.00005002      
##                                           
##             Sensitivity : 0.9211          
##             Specificity : 0.2727          
##          Pos Pred Value : 0.6863          
##          Neg Pred Value : 0.6667          
##              Prevalence : 0.6333          
##          Detection Rate : 0.5833          
##    Detection Prevalence : 0.8500          
##       Balanced Accuracy : 0.5969          
##                                           
##        'Positive' Class : Low Risk        
## 

Logistic regression - moderate

# Run model 

binary = glm(BinaryRisk ~ Age + Education + Income. + Car. + 
               SOA_decile + Urban_rural_flag + min_time_to_gp,
             data = train_data_clean,
             family = binomial(link = "logit"))


# Predict probabilities
pred_probs = predict(binary, newdata = test_data_clean, type = "response")

# Convert to class labels based on threshold

pred_classes = ifelse(pred_probs > 0.5, "At Risk", "Low Risk")

# Make both factors with consistent levels

pred_factor = factor(pred_classes, levels = c("Low Risk", "At Risk"))
actual_factor = factor(test_data_clean$BinaryRisk, levels = c("Low Risk", "At Risk"))

# Compute confusion matrix

confusionMatrix(pred_factor, actual_factor)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Low Risk At Risk
##   Low Risk       64      29
##   At Risk        12      15
##                                           
##                Accuracy : 0.6583          
##                  95% CI : (0.5662, 0.7424)
##     No Information Rate : 0.6333          
##     P-Value [Acc > NIR] : 0.32034         
##                                           
##                   Kappa : 0.1992          
##                                           
##  Mcnemar's Test P-Value : 0.01246         
##                                           
##             Sensitivity : 0.8421          
##             Specificity : 0.3409          
##          Pos Pred Value : 0.6882          
##          Neg Pred Value : 0.5556          
##              Prevalence : 0.6333          
##          Detection Rate : 0.5333          
##    Detection Prevalence : 0.7750          
##       Balanced Accuracy : 0.5915          
##                                           
##        'Positive' Class : Low Risk        
## 

Random forest - conservative

set.seed(111)

rf_model_B = randomForest(
  BinaryRisk ~ Income. + SOA_decile + Education,
  data = train_data_clean,
  importance = TRUE,
  ntree = 500
)

confusionMatrix(predict(rf_model_B, test_data_clean),
                test_data_clean$BinaryRisk)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Low Risk At Risk
##   Low Risk       72      33
##   At Risk         4      11
##                                           
##                Accuracy : 0.6917          
##                  95% CI : (0.6009, 0.7727)
##     No Information Rate : 0.6333          
##     P-Value [Acc > NIR] : 0.1082          
##                                           
##                   Kappa : 0.2292          
##                                           
##  Mcnemar's Test P-Value : 0.000004161     
##                                           
##             Sensitivity : 0.9474          
##             Specificity : 0.2500          
##          Pos Pred Value : 0.6857          
##          Neg Pred Value : 0.7333          
##              Prevalence : 0.6333          
##          Detection Rate : 0.6000          
##    Detection Prevalence : 0.8750          
##       Balanced Accuracy : 0.5987          
##                                           
##        'Positive' Class : Low Risk        
## 

Random forest - moderate

set.seed(111)

rf_model_B = randomForest(
  BinaryRisk ~ Age + Education + Income. + Car. + SOA_decile + Urban_rural_flag + min_time_to_gp,
  data = train_data_clean,
  importance = TRUE,
  ntree = 500
)

confusionMatrix(predict(rf_model_B, test_data_clean),
                test_data_clean$BinaryRisk)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Low Risk At Risk
##   Low Risk       62      27
##   At Risk        14      17
##                                           
##                Accuracy : 0.6583          
##                  95% CI : (0.5662, 0.7424)
##     No Information Rate : 0.6333          
##     P-Value [Acc > NIR] : 0.32034         
##                                           
##                   Kappa : 0.2156          
##                                           
##  Mcnemar's Test P-Value : 0.06092         
##                                           
##             Sensitivity : 0.8158          
##             Specificity : 0.3864          
##          Pos Pred Value : 0.6966          
##          Neg Pred Value : 0.5484          
##              Prevalence : 0.6333          
##          Detection Rate : 0.5167          
##    Detection Prevalence : 0.7417          
##       Balanced Accuracy : 0.6011          
##                                           
##        'Positive' Class : Low Risk        
## 

XGBoost - conservative

set.seed(111)

# Create matrices

mm = function(df) {
  model.matrix(~ 0 + Education + Income. + SOA_decile, data = df)
}

X_train = mm(train_data_clean)
X_test  = mm(test_data_clean)

colnames(X_train)

train_cols = colnames(X_train)
missing = setdiff(train_cols, colnames(X_test))
if (length(missing)) {
  X_test = cbind(X_test, matrix(0, nrow(X_test), length(missing),
                                 dimnames = list(NULL, missing)))
}
X_test = X_test[, train_cols, drop = FALSE]

# Outcome 

levels_bin = c("At Risk", "Low Risk")
y_train = factor(train_data_clean$BinaryRisk, levels = levels_bin)
y_test  = factor(test_data_clean$BinaryRisk,  levels = levels_bin)

# Tuning grid

xgb_grid = expand.grid(
  nrounds = c(100, 200),
  max_depth = c(3, 5, 7),
  eta = c(0.05, 0.1),
  gamma = c(0, 1),
  colsample_bytree = c(0.6, 0.8),
  min_child_weight = c(1, 5),
  subsample = 0.8
)

control = trainControl(
  method = "cv",
  number = 5,
  classProbs = TRUE,
  summaryFunction = twoClassSummary, 
  verboseIter = TRUE,
  allowParallel = TRUE
)
# Set names 

levels_bin_train = c("At_Risk", "Low_Risk")

y_train = factor(gsub(" ", "_", train_data_clean$BinaryRisk), levels = levels_bin_train)

# Train model
xgb_bin = caret::train(
  x = X_train,
  y = y_train,
  method = "xgbTree",
  trControl = control,
  tuneGrid = xgb_grid,
  metric = "ROC"
)

# For predictions, map back

pred_probs = predict(xgb_bin, newdata = X_test, type = "prob")[, "At_Risk"]
pred_labels = ifelse(pred_probs >= 0.5, "At Risk", "Low Risk")
# Predict probabilities 

pred_probs = predict(xgb_bin, newdata = X_test, type = "prob")[, "At_Risk"]

# Classify using threshold 

pred_labels_clean = ifelse(pred_probs >= 0.5, "At Risk", "Low Risk")

# Original labels 

actual_labels_clean = gsub("_", " ", as.character(y_test))

# Make factors with same levels 

pred_labels_clean = factor(pred_labels_clean, levels = c("At Risk", "Low Risk"))
actual_labels_clean = factor(actual_labels_clean, levels = c("At Risk", "Low Risk"))

# Confusion matrix 

confusionMatrix(pred_labels_clean, actual_labels_clean, positive = "At Risk")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction At Risk Low Risk
##   At Risk       19        5
##   Low Risk      25       71
##                                           
##                Accuracy : 0.75            
##                  95% CI : (0.6627, 0.8245)
##     No Information Rate : 0.6333          
##     P-Value [Acc > NIR] : 0.0044136       
##                                           
##                   Kappa : 0.4048          
##                                           
##  Mcnemar's Test P-Value : 0.0005226       
##                                           
##             Sensitivity : 0.4318          
##             Specificity : 0.9342          
##          Pos Pred Value : 0.7917          
##          Neg Pred Value : 0.7396          
##              Prevalence : 0.3667          
##          Detection Rate : 0.1583          
##    Detection Prevalence : 0.2000          
##       Balanced Accuracy : 0.6830          
##                                           
##        'Positive' Class : At Risk         
## 

XGBoost - moderate

set.seed(111)

# Create matrices 

mm = function(df) {
  model.matrix(~ 0 + Education + Age + Income. + Car. + SOA_decile + Urban_rural_flag + min_time_to_gp, data = df)
}

X_train = mm(train_data_clean)
X_test  = mm(test_data_clean)

train_cols = colnames(X_train)
missing = setdiff(train_cols, colnames(X_test))
if (length(missing)) {
  X_test = cbind(X_test, matrix(0, nrow(X_test), length(missing),
                                 dimnames = list(NULL, missing)))
}
X_test = X_test[, train_cols, drop = FALSE]

# Outcome 

levels_bin = c("At Risk", "Low Risk")
y_train = factor(train_data_clean$BinaryRisk, levels = levels_bin)
y_test  = factor(test_data_clean$BinaryRisk,  levels = levels_bin)

# Create tuning grid 

xgb_grid = expand.grid(
  nrounds = c(100, 200),
  max_depth = c(3, 5, 7),
  eta = c(0.05, 0.1),
  gamma = c(0, 1),
  colsample_bytree = c(0.6, 0.8),
  min_child_weight = c(1, 5),
  subsample = 0.8
)

control = trainControl(
  method = "cv",
  number = 5,
  classProbs = TRUE,
  summaryFunction = twoClassSummary, 
  verboseIter = TRUE,
  allowParallel = TRUE
)

# Rename

levels_bin_train= c("At_Risk", "Low_Risk")

y_train = factor(gsub(" ", "_", train_data_clean$BinaryRisk), levels = levels_bin_train)

# Train model

xgb_bin = caret::train(
  x = X_train,
  y = y_train,
  method = "xgbTree",
  trControl = control,
  tuneGrid = xgb_grid,
  metric = "ROC"
)

# Predicitions 

pred_probs = predict(xgb_bin, newdata = X_test, type = "prob")[, "At_Risk"]
pred_labels = ifelse(pred_probs >= 0.5, "At Risk", "Low Risk")
# Predict probabilities 

pred_probs = predict(xgb_bin, newdata = X_test, type = "prob")[, "At_Risk"]

# Classify using threshold = 0.5 

pred_labels_clean = ifelse(pred_probs >= 0.5, "At Risk", "Low Risk")

# Original labels

actual_labels_clean = gsub("_", " ", as.character(y_test))

# Factors with same levels

pred_labels_clean = factor(pred_labels_clean, levels = c("At Risk", "Low Risk"))
actual_labels_clean = factor(actual_labels_clean, levels = c("At Risk", "Low Risk"))

# Confusion matrix 

confusionMatrix(pred_labels_clean, actual_labels_clean, positive = "At Risk")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction At Risk Low Risk
##   At Risk       15       18
##   Low Risk      29       58
##                                          
##                Accuracy : 0.6083         
##                  95% CI : (0.515, 0.6961)
##     No Information Rate : 0.6333         
##     P-Value [Acc > NIR] : 0.7478         
##                                          
##                   Kappa : 0.1098         
##                                          
##  Mcnemar's Test P-Value : 0.1447         
##                                          
##             Sensitivity : 0.3409         
##             Specificity : 0.7632         
##          Pos Pred Value : 0.4545         
##          Neg Pred Value : 0.6667         
##              Prevalence : 0.3667         
##          Detection Rate : 0.1250         
##    Detection Prevalence : 0.2750         
##       Balanced Accuracy : 0.5520         
##                                          
##        'Positive' Class : At Risk        
## 

Neural Network - conservative

# Train matrix
x_train =  model.matrix(~ 0 + Education + Income. + 
                          SOA_decile,
                        data = train_data_clean)

# class names for binary outcome

risk_levels = make.names(c("Low Risk","At Risk"))
y_train = factor(make.names(as.character(train_data_clean$BinaryRisk)),
                  levels = risk_levels)

# Test matrix
x_test = model.matrix(~ Education + Income. +
                         SOA_decile - 1,
                       data = test_data_clean)

# Align test and train 

train_cols = colnames(x_train)
missing = setdiff(train_cols, colnames(x_test))
if (length(missing)) {
  x_test = cbind(x_test, matrix(0, nrow(x_test), length(missing),
                                 dimnames = list(NULL, missing)))
}
x_test = x_test[, train_cols, drop = FALSE]

# create same levels for test

actual_labels = factor(make.names(as.character(test_data_clean$BinaryRisk)),
                        levels = risk_levels)
# Tuning grid 

set.seed(111)

nnet_grid = expand.grid(size = c(3,5,7), decay = c(0.01, 0.1, 0.5))

control = trainControl(
  method = "cv",
  number = 5,
  classProbs = TRUE,
  summaryFunction = twoClassSummary,  # binary case
  verboseIter = TRUE
)

nnet_model = caret::train(
  x = x_train,
  y = y_train,
  method = "nnet",
  trControl = control,
  tuneGrid = nnet_grid,
  metric = "Accuracy",
  preProcess = c("center","scale"),
  maxit = 200,
  trace = FALSE
)

Predict

pred_labels = predict(nnet_model, newdata = x_test)
caret::confusionMatrix(pred_labels, actual_labels)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Low.Risk At.Risk
##   Low.Risk       70      31
##   At.Risk         6      13
##                                           
##                Accuracy : 0.6917          
##                  95% CI : (0.6009, 0.7727)
##     No Information Rate : 0.6333          
##     P-Value [Acc > NIR] : 0.1082          
##                                           
##                   Kappa : 0.2459          
##                                           
##  Mcnemar's Test P-Value : 0.00007961      
##                                           
##             Sensitivity : 0.9211          
##             Specificity : 0.2955          
##          Pos Pred Value : 0.6931          
##          Neg Pred Value : 0.6842          
##              Prevalence : 0.6333          
##          Detection Rate : 0.5833          
##    Detection Prevalence : 0.8417          
##       Balanced Accuracy : 0.6083          
##                                           
##        'Positive' Class : Low.Risk        
## 

Neural Network - moderate

# Train matrix 

x_train = model.matrix(~ 0 + Age + Education + Income. + Car. + SOA_decile + Urban_rural_flag + min_time_to_gp,
                        data = train_data_clean)

# class names for binary outcome

risk_levels = make.names(c("Low Risk","At Risk"))
y_train = factor(make.names(as.character(train_data_clean$BinaryRisk)),
                  levels = risk_levels)

# Test matrix

x_test = model.matrix(~ 0 + Age + Education + Income. + Car. + SOA_decile + Urban_rural_flag + min_time_to_gp,
                       data = test_data_clean)

# Align test and train columns 

train_cols = colnames(x_train)
missing = setdiff(train_cols, colnames(x_test))
if (length(missing)) {
  x_test = cbind(x_test, matrix(0, nrow(x_test), length(missing),
                                 dimnames = list(NULL, missing)))
}
x_test = x_test[, train_cols, drop = FALSE]

# same labels for test

actual_labels = factor(make.names(as.character(test_data_clean$BinaryRisk)),
                        levels = risk_levels)
# Tuning grid and run

set.seed(111)

nnet_grid = expand.grid(size = c(3,5,7), decay = c(0.01, 0.1, 0.5))

control = trainControl(
  method = "cv",
  number = 5,
  classProbs = TRUE,
  summaryFunction = twoClassSummary,  # binary case
  verboseIter = TRUE
)

nnet_model = caret::train(
  x = x_train,
  y = y_train,
  method = "nnet",
  trControl = control,
  tuneGrid = nnet_grid,
  metric = "Accuracy",
  preProcess = c("center","scale"),
  maxit = 200,
  trace = FALSE
)

Predict

pred_labels = predict(nnet_model, newdata = x_test)
caret::confusionMatrix(pred_labels, actual_labels)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Low.Risk At.Risk
##   Low.Risk       60      29
##   At.Risk        16      15
##                                          
##                Accuracy : 0.625          
##                  95% CI : (0.532, 0.7117)
##     No Information Rate : 0.6333         
##     P-Value [Acc > NIR] : 0.61475        
##                                          
##                   Kappa : 0.139          
##                                          
##  Mcnemar's Test P-Value : 0.07364        
##                                          
##             Sensitivity : 0.7895         
##             Specificity : 0.3409         
##          Pos Pred Value : 0.6742         
##          Neg Pred Value : 0.4839         
##              Prevalence : 0.6333         
##          Detection Rate : 0.5000         
##    Detection Prevalence : 0.7417         
##       Balanced Accuracy : 0.5652         
##                                          
##        'Positive' Class : Low.Risk       
## 

Model 3 - risk index

Data prep

# Convert dependent variable to ordinal 

data$risk_category = factor(data$risk_category, 
                             levels = c("Low Risk", "Moderate Risk", "High Risk"), 
                             ordered = TRUE)

# Set test train - using caret to keeep proprtional classes

set.seed(123)

# Create partition — stratified by same variable

train_indices = createDataPartition(data$Risk, p = 0.7, list = FALSE)

train_data = data[train_indices, ]
test_data  = data[-train_indices, ]

# clean data 

train_data_clean = na.omit(train_data)
test_data_clean = na.omit(test_data)

Stats

# results table
results = data.frame(
  Variable = character(),
  Chi_Squared_p = numeric(),
  Fisher_p = numeric(),
  ANOVA_p = numeric(),
  Test_Type = character(),
  stringsAsFactors = FALSE
)

# Categorical variables

# Gender

chi_gender = chisq.test(train_data_clean$Gender., train_data_clean$risk_category)
fisher_gender = fisher.test(train_data_clean$Gender., train_data_clean$risk_category, simulate.p.value=TRUE)
results = rbind(results, data.frame(
  Variable = "Gender",
  Chi_Squared_p = chi_gender$p.value,
  Fisher_p = fisher_gender$p.value,
  ANOVA_p = NA,
  Test_Type = "Categorical"
))

# Age

chi_age = chisq.test(train_data_clean$Age, train_data_clean$risk_category)
fisher_age = fisher.test(train_data_clean$Age, train_data_clean$risk_category, simulate.p.value=TRUE)
results = rbind(results, data.frame(
  Variable = "Age",
  Chi_Squared_p = chi_age$p.value,
  Fisher_p = fisher_age$p.value,
  ANOVA_p = NA,
  Test_Type = "Categorical"
))

# Education

chi_edu = chisq.test(train_data_clean$Education, train_data_clean$risk_category)
fisher_edu = fisher.test(train_data_clean$Education, train_data_clean$risk_category, simulate.p.value=TRUE)
results = rbind(results, data.frame(
  Variable = "Education",
  Chi_Squared_p = chi_edu$p.value,
  Fisher_p = fisher_edu$p.value,
  ANOVA_p = NA,
  Test_Type = "Categorical"
))

# Income

chi_income = chisq.test(train_data_clean$Income., train_data_clean$risk_category)
fisher_income = fisher.test(train_data_clean$Income., train_data_clean$risk_category, simulate.p.value=TRUE)
results = rbind(results, data.frame(
  Variable = "Income",
  Chi_Squared_p = chi_income$p.value,
  Fisher_p = fisher_income$p.value,
  ANOVA_p = NA,
  Test_Type = "Categorical"
))

# Car

chi_car = chisq.test(train_data_clean$Car., train_data_clean$risk_category)
fisher_car = fisher.test(train_data_clean$Car., train_data_clean$risk_category, simulate.p.value=TRUE)
results = rbind(results, data.frame(
  Variable = "Car",
  Chi_Squared_p = chi_car$p.value,
  Fisher_p = fisher_car$p.value,
  ANOVA_p = NA,
  Test_Type = "Categorical"
))

# Urban/Rural

chi_urban = chisq.test(train_data_clean$Urban_rural_flag, train_data_clean$risk_category)
fisher_urban = fisher.test(train_data_clean$Urban_rural_flag, train_data_clean$risk_category, simulate.p.value=TRUE)
results = rbind(results, data.frame(
  Variable = "Urban_Rural",
  Chi_Squared_p = chi_urban$p.value,
  Fisher_p = fisher_urban$p.value,
  ANOVA_p = NA,
  Test_Type = "Categorical"
))

# Continuous variables

# SOA decile

aov_soa = aov(SOA_decile ~ risk_category, data = train_data_clean)
soa_p = summary(aov_soa)[[1]][["Pr(>F)"]][1]
results = rbind(results, data.frame(
  Variable = "SOA_decile",
  Chi_Squared_p = NA,
  Fisher_p = NA,
  ANOVA_p = soa_p,
  Test_Type = "Continuous"
))

# Time to GP

aov_gp = aov(min_time_to_gp ~ risk_category, data = train_data_clean)
gp_p = summary(aov_gp)[[1]][["Pr(>F)"]][1]
results = rbind(results, data.frame(
  Variable = "Time_to_GP",
  Chi_Squared_p = NA,
  Fisher_p = NA,
  ANOVA_p = gp_p,
  Test_Type = "Continuous"
))

Make table look nice

results = results %>%
  mutate(
    Variable = recode(
      Variable,
      "Urban_Rural" = "Urban–Rural Classification",
      "SOA_decile"  = "Deprivation decile (LSOA)",
      "Time_to_GP"  = "Time to GP (mins)",
      "Car"         = "Car access",
      "Income"      = "Household income",
      "Education"   = "Education level"
    )
  )

nice_tbl = results %>%
  mutate(
    Chi_Squared_p = round(Chi_Squared_p, 4),
    Fisher_p      = round(Fisher_p, 4),
    ANOVA_p       = round(ANOVA_p, 4),
    Conservative = if_else(
      (Test_Type == "Categorical" & Chi_Squared_p < 0.10) |
      (Test_Type == "Continuous"  & ANOVA_p       < 0.10), "Yes","No"),
    Moderate = if_else(
      (Test_Type == "Categorical" & Chi_Squared_p < 0.50) |
      (Test_Type == "Continuous"  & ANOVA_p       < 0.50), "Yes","No")
  ) %>%
  gt(groupname_col = "Test_Type") %>%
  cols_label(
    Chi_Squared_p = html("Chi-Squared<br>p"),
    Fisher_p      = html("Fisher’s Exact<br>p"),
    ANOVA_p       = html("ANOVA<br>p"),
    Conservative  = html("Significant<br>(α = 0.10)"),
    Moderate      = html("Significant<br>(α = 0.50)")
  ) %>%
  tab_style(
    style = list(cell_fill(color = "#E8F5E9"), cell_text(weight = "bold")),
    locations = cells_body(columns = Conservative, rows = Conservative == "Yes")
  ) %>%
  tab_style(
    style = list(cell_fill(color = "#E8F5E9"), cell_text(weight = "bold")),
    locations = cells_body(columns = Moderate, rows = Moderate == "Yes")
  ) %>%
  tab_spanner(label = "p-values", columns = c(Chi_Squared_p, Fisher_p, ANOVA_p)) %>%
  fmt_number(columns = c(Chi_Squared_p, Fisher_p, ANOVA_p), decimals = 4) %>%
  sub_missing(columns = everything(), missing_text = "—") %>%
  opt_row_striping()

# Change shape and size

nice_tbl = nice_tbl %>%
  tab_options(table.width = pct(80)) %>%
  cols_width(
    everything() ~ px(120)
  ) %>%
  tab_options(
    table.font.size       = px(11),  
    data_row.padding      = px(2),   
    column_labels.padding = px(2),
    row_group.padding     = px(2),
    heading.padding       = px(0)    
  ) %>%
  cols_label(
    Chi_Squared_p = "Chi-Squared p",
    Fisher_p      = "Fisher’s Exact p",
    ANOVA_p       = "ANOVA p",
    Conservative = "Significant (p < 0.10)",
    Moderate     = "Significant (p < 0.50)",
  )

nice_tbl = nice_tbl %>%
  gt::tab_options(table.font.names = c("Times New Roman", "Times", "serif"))


nice_tbl
Variable
p-values
Significant (p < 0.10) Significant (p < 0.50)
Chi-Squared p Fisher’s Exact p ANOVA p
Categorical
Gender 0.2937 0.3348 No Yes
Age 0.2509 0.2404 No Yes
Education level 0.1094 0.0945 No Yes
Household income 0.0794 0.1124 Yes Yes
Car access 0.5307 0.6252 No No
Urban–Rural Classification 0.1078 0.1199 No Yes
Continuous
Deprivation decile (LSOA) 0.0018 Yes Yes
Time to GP (mins) 0.8439 No No

Logistic regression - conservative

# Run model 

model = polr(risk_category ~ Income. + SOA_decile, data = train_data_clean, Hess=TRUE)

summary_model = summary(model)

confusionMatrix(predict(model, test_data_clean),
                test_data_clean$risk_category)
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Low Risk Moderate Risk High Risk
##   Low Risk             6             3         3
##   Moderate Risk       33            31        19
##   High Risk            4             5        16
## 
## Overall Statistics
##                                           
##                Accuracy : 0.4417          
##                  95% CI : (0.3511, 0.5352)
##     No Information Rate : 0.3583          
##     P-Value [Acc > NIR] : 0.03659         
##                                           
##                   Kappa : 0.1709          
##                                           
##  Mcnemar's Test P-Value : 0.0000002771    
## 
## Statistics by Class:
## 
##                      Class: Low Risk Class: Moderate Risk Class: High Risk
## Sensitivity                   0.1395               0.7949           0.4211
## Specificity                   0.9221               0.3580           0.8902
## Pos Pred Value                0.5000               0.3735           0.6400
## Neg Pred Value                0.6574               0.7838           0.7684
## Prevalence                    0.3583               0.3250           0.3167
## Detection Rate                0.0500               0.2583           0.1333
## Detection Prevalence          0.1000               0.6917           0.2083
## Balanced Accuracy             0.5308               0.5764           0.6556
brant(model)
## -------------------------------------------------------------------- 
## Test for             X2  df  probability 
## -------------------------------------------------------------------- 
## Omnibus                  5.93    5   0.31
## Income.£15,000 to £30,000 per year   0.08    1   0.78
## Income.£30,000 to £50,000 per year   1.44    1   0.23
## Income.£50,000 to £100,000 per year  1.19    1   0.27
## Income.Above £100,000 per year   0.06    1   0.81
## SOA_decile               0.5 1   0.48
## -------------------------------------------------------------------- 
## 
## H0: Parallel Regression Assumption holds

Logistic regression - mod

# Run model 

model = polr(risk_category ~ Income. + SOA_decile + Gender. + Age + Education + Urban_rural_flag, data = train_data_clean, Hess=TRUE)

summary_model = summary(model)

confusionMatrix(predict(model, test_data_clean),
                test_data_clean$risk_category)
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Low Risk Moderate Risk High Risk
##   Low Risk            12             8         2
##   Moderate Risk       27            22        23
##   High Risk            4             9        13
## 
## Overall Statistics
##                                          
##                Accuracy : 0.3917         
##                  95% CI : (0.3039, 0.485)
##     No Information Rate : 0.3583         
##     P-Value [Acc > NIR] : 0.2510811      
##                                          
##                   Kappa : 0.093          
##                                          
##  Mcnemar's Test P-Value : 0.0006722      
## 
## Statistics by Class:
## 
##                      Class: Low Risk Class: Moderate Risk Class: High Risk
## Sensitivity                   0.2791               0.5641           0.3421
## Specificity                   0.8701               0.3827           0.8415
## Pos Pred Value                0.5455               0.3056           0.5000
## Neg Pred Value                0.6837               0.6458           0.7340
## Prevalence                    0.3583               0.3250           0.3167
## Detection Rate                0.1000               0.1833           0.1083
## Detection Prevalence          0.1833               0.6000           0.2167
## Balanced Accuracy             0.5746               0.4734           0.5918
brant(model)
## -------------------------------------------------------------------------------------------------------------------- 
## Test for                                     X2  df  probability 
## -------------------------------------------------------------------------------------------------------------------- 
## Omnibus                                          23.57   14  0.05
## Income.£15,000 to £30,000 per year                           0   1   0.97
## Income.£30,000 to £50,000 per year                           0.72    1   0.4
## Income.£50,000 to £100,000 per year                          0.55    1   0.46
## Income.Above £100,000 per year                           0.14    1   0.71
## SOA_decile                                       0.67    1   0.41
## Gender.Woman                                     0.01    1   0.9
## Age25-34                                     0.02    1   0.89
## Age35-44                                     0.01    1   0.91
## Age45-54                                     0.01    1   0.94
## Age55-64                                     0.91    1   0.34
## Age65 or older                                       0.02    1   0.89
## EducationHigher or secondary or further education (A-Levels, BTEC, etc.) 0.64    1   0.42
## EducationSecondary school up to 16 years                     6.41    1   0.01
## Urban_rural_flagUrban                                    2.64    1   0.1
## -------------------------------------------------------------------------------------------------------------------- 
## 
## H0: Parallel Regression Assumption holds
## Warning in brant(model): 920 combinations in table(dv,ivs) do not occur.
## Because of that, the test results might be invalid.

Random forst - conservative

set.seed(111)

rf_model = randomForest(
  risk_category ~ Income. + SOA_decile,
  data = train_data_clean,
  importance = TRUE,
  ntree = 500
)

confusionMatrix(predict(rf_model, test_data_clean),
                test_data_clean$risk_category)
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Low Risk Moderate Risk High Risk
##   Low Risk            13             6         9
##   Moderate Risk       26            27        21
##   High Risk            4             6         8
## 
## Overall Statistics
##                                           
##                Accuracy : 0.4             
##                  95% CI : (0.3117, 0.4934)
##     No Information Rate : 0.3583          
##     P-Value [Acc > NIR] : 0.1952          
##                                           
##                   Kappa : 0.1024          
##                                           
##  Mcnemar's Test P-Value : 0.00004539      
## 
## Statistics by Class:
## 
##                      Class: Low Risk Class: Moderate Risk Class: High Risk
## Sensitivity                   0.3023               0.6923          0.21053
## Specificity                   0.8052               0.4198          0.87805
## Pos Pred Value                0.4643               0.3649          0.44444
## Neg Pred Value                0.6739               0.7391          0.70588
## Prevalence                    0.3583               0.3250          0.31667
## Detection Rate                0.1083               0.2250          0.06667
## Detection Prevalence          0.2333               0.6167          0.15000
## Balanced Accuracy             0.5538               0.5560          0.54429

Random forest - mod

set.seed(111)

rf_model = randomForest(
  risk_category ~ Income. + SOA_decile + Gender. + Age + Education + Urban_rural_flag,
  data = train_data_clean,
  importance = TRUE,
  ntree = 500
)

confusionMatrix(predict(rf_model, test_data_clean),
                test_data_clean$risk_category)
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Low Risk Moderate Risk High Risk
##   Low Risk            17             7         8
##   Moderate Risk       19            22        16
##   High Risk            7            10        14
## 
## Overall Statistics
##                                           
##                Accuracy : 0.4417          
##                  95% CI : (0.3511, 0.5352)
##     No Information Rate : 0.3583          
##     P-Value [Acc > NIR] : 0.03659         
##                                           
##                   Kappa : 0.1645          
##                                           
##  Mcnemar's Test P-Value : 0.07223         
## 
## Statistics by Class:
## 
##                      Class: Low Risk Class: Moderate Risk Class: High Risk
## Sensitivity                   0.3953               0.5641           0.3684
## Specificity                   0.8052               0.5679           0.7927
## Pos Pred Value                0.5313               0.3860           0.4516
## Neg Pred Value                0.7045               0.7302           0.7303
## Prevalence                    0.3583               0.3250           0.3167
## Detection Rate                0.1417               0.1833           0.1167
## Detection Prevalence          0.2667               0.4750           0.2583
## Balanced Accuracy             0.6003               0.5660           0.5806

XGboost - conservative

set.seed(111)

# Create model matrices 

mm = function(df) {
  model.matrix(~ 0 + Income. + SOA_decile, data = df)
}

X_train = mm(train_data_clean)
X_test = mm(test_data_clean)

# align test and train columns

train_cols = colnames(X_train)
missing = setdiff(train_cols, colnames(X_test))
if (length(missing)) {
  X_test = cbind(X_test, matrix(0, nrow(X_test), length(missing),
                                 dimnames = list(NULL, missing)))
}

X_test = X_test[, train_cols, drop = FALSE]

# Outcome factor

risk_levels = make.names(c("Low Risk","Moderate Risk","High Risk"))
y_train = factor(make.names(as.character(train_data_clean$risk_category)), levels = risk_levels)
y_test  = factor(make.names(as.character(test_data_clean$risk_category)),  levels = risk_levels)

# Create tuning grid 

xgb_grid = expand.grid(
  nrounds = c(100, 200),
  max_depth = c(3, 5, 7),
  eta = c(0.05, 0.1),
  gamma = c(0, 1),
  colsample_bytree = c(0.6, 0.8),
  min_child_weight = c(1, 5),
  subsample = 0.8
)

control = trainControl(
  method = "cv",
  number = 5,
  verboseIter = TRUE,
  allowParallel = TRUE,
  classProbs = TRUE,
  summaryFunction = multiClassSummary
)

# Train model

xgb_tuned = train(
  x = X_train,
  y = y_train,
  method = "xgbTree",
  trControl = control,
  tuneGrid = xgb_grid,
  metric = "Accuracy"
)
# Predict 

prob_df = predict(xgb_tuned, newdata = X_test, type = "prob")  # columns == risk_levels
pred_lab = factor(colnames(prob_df)[max.col(as.matrix(prob_df))], levels = risk_levels)

# Confusion matrix

caret::confusionMatrix(pred_lab, y_test)
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Low.Risk Moderate.Risk High.Risk
##   Low.Risk            15             6        11
##   Moderate.Risk       21            21        15
##   High.Risk            7            12        12
## 
## Overall Statistics
##                                           
##                Accuracy : 0.4             
##                  95% CI : (0.3117, 0.4934)
##     No Information Rate : 0.3583          
##     P-Value [Acc > NIR] : 0.19517         
##                                           
##                   Kappa : 0.1022          
##                                           
##  Mcnemar's Test P-Value : 0.02275         
## 
## Statistics by Class:
## 
##                      Class: Low.Risk Class: Moderate.Risk Class: High.Risk
## Sensitivity                   0.3488               0.5385           0.3158
## Specificity                   0.7792               0.5556           0.7683
## Pos Pred Value                0.4688               0.3684           0.3871
## Neg Pred Value                0.6818               0.7143           0.7079
## Prevalence                    0.3583               0.3250           0.3167
## Detection Rate                0.1250               0.1750           0.1000
## Detection Prevalence          0.2667               0.4750           0.2583
## Balanced Accuracy             0.5640               0.5470           0.5420

XGboost - moderate

set.seed(111)

# Create model matrices 

mm = function(df) {
  model.matrix(~ 0 + Income. + SOA_decile + Gender. + Age + Education + Urban_rural_flag, data = df)
}

X_train = mm(train_data_clean)
X_test  = mm(test_data_clean)

# align test and train

train_cols = colnames(X_train)
missing = setdiff(train_cols, colnames(X_test))
if (length(missing)) {
  X_test = cbind(X_test, matrix(0, nrow(X_test), length(missing),
                                 dimnames = list(NULL, missing)))
}

X_test = X_test[, train_cols, drop = FALSE]

# Outcome 

risk_levels = make.names(c("Low Risk","Moderate Risk","High Risk"))
y_train = factor(make.names(as.character(train_data_clean$risk_category)), levels = risk_levels)
y_test = factor(make.names(as.character(test_data_clean$risk_category)),  levels = risk_levels)

# Create tuning grid 

xgb_grid = expand.grid(
  nrounds = c(100, 200),
  max_depth = c(3, 5, 7),
  eta = c(0.05, 0.1),
  gamma = c(0, 1),
  colsample_bytree = c(0.6, 0.8),
  min_child_weight = c(1, 5),
  subsample = 0.8
)

control = trainControl(
  method = "cv",
  number = 5,
  verboseIter = TRUE,
  allowParallel = TRUE,
  classProbs = TRUE,
  summaryFunction = multiClassSummary
)

# Train model 

xgb_tuned = train(
  x = X_train,
  y = y_train,
  method = "xgbTree",
  trControl = control,
  tuneGrid = xgb_grid,
  metric = "Accuracy"
)
# Predict 

prob_df = predict(xgb_tuned, newdata = X_test, type = "prob")  
pred_lab = factor(colnames(prob_df)[max.col(as.matrix(prob_df))], levels = risk_levels)

# Cofusion matrix 

caret::confusionMatrix(pred_lab, y_test)
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Low.Risk Moderate.Risk High.Risk
##   Low.Risk            16             9         5
##   Moderate.Risk       20            17        18
##   High.Risk            7            13        15
## 
## Overall Statistics
##                                           
##                Accuracy : 0.4             
##                  95% CI : (0.3117, 0.4934)
##     No Information Rate : 0.3583          
##     P-Value [Acc > NIR] : 0.1952          
##                                           
##                   Kappa : 0.1033          
##                                           
##  Mcnemar's Test P-Value : 0.1503          
## 
## Statistics by Class:
## 
##                      Class: Low.Risk Class: Moderate.Risk Class: High.Risk
## Sensitivity                   0.3721               0.4359           0.3947
## Specificity                   0.8182               0.5309           0.7561
## Pos Pred Value                0.5333               0.3091           0.4286
## Neg Pred Value                0.7000               0.6615           0.7294
## Prevalence                    0.3583               0.3250           0.3167
## Detection Rate                0.1333               0.1417           0.1250
## Detection Prevalence          0.2500               0.4583           0.2917
## Balanced Accuracy             0.5951               0.4834           0.5754

Neural network - Conservative

# Train matrix

x_train = model.matrix(~ 0 + Income. + SOA_decile,
                        data = train_data_clean)

# Outcome labels 

risk_levels = make.names(c("Low Risk","Moderate Risk","High Risk"))
y_train = factor(make.names(as.character(train_data_clean$risk_category)), levels = risk_levels)

# Test matrix

x_test = model.matrix(~ 0 + Income. + SOA_decile,
                       data = test_data_clean)

# Align test and train 

train_cols = colnames(x_train)
missing = setdiff(train_cols, colnames(x_test))
if (length(missing)) {
  x_test = cbind(x_test, matrix(0, nrow(x_test), length(missing),
                                 dimnames = list(NULL, missing)))
}
x_test = x_test[, train_cols, drop = FALSE]
# Tuning and run 

set.seed(111)

nnet_grid = expand.grid(size = c(3,5,7), decay = c(0.01, 0.1, 0.5))

control = trainControl(
  method = "cv",
  number = 5,
  classProbs = TRUE,
  summaryFunction = multiClassSummary,
  verboseIter = TRUE
)

nnet_model = caret::train(
  x = x_train,
  y = y_train,
  method = "nnet",
  trControl = control,
  tuneGrid = nnet_grid,
  metric = "Accuracy",
  preProcess = c("center","scale"),  # helpful for nnet
  maxit = 200,
  trace = FALSE
)
# Predicitions 

pred_labels = predict(nnet_model, newdata = x_test)
actual_labels = factor(make.names(as.character(test_data_clean$risk_category)),
                        levels = risk_levels)
caret::confusionMatrix(pred_labels, actual_labels)
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Low.Risk Moderate.Risk High.Risk
##   Low.Risk             8             4         2
##   Moderate.Risk       27            30        20
##   High.Risk            8             5        16
## 
## Overall Statistics
##                                           
##                Accuracy : 0.45            
##                  95% CI : (0.3591, 0.5435)
##     No Information Rate : 0.3583          
##     P-Value [Acc > NIR] : 0.02401         
##                                           
##                   Kappa : 0.1829          
##                                           
##  Mcnemar's Test P-Value : 0.000001624     
## 
## Statistics by Class:
## 
##                      Class: Low.Risk Class: Moderate.Risk Class: High.Risk
## Sensitivity                  0.18605               0.7692           0.4211
## Specificity                  0.92208               0.4198           0.8415
## Pos Pred Value               0.57143               0.3896           0.5517
## Neg Pred Value               0.66981               0.7907           0.7582
## Prevalence                   0.35833               0.3250           0.3167
## Detection Rate               0.06667               0.2500           0.1333
## Detection Prevalence         0.11667               0.6417           0.2417
## Balanced Accuracy            0.55406               0.5945           0.6313

Neural network - Moderate

# Train matrix

x_train = model.matrix(~ 0 + Income. + SOA_decile + Gender. + Age + Education + Urban_rural_flag,
                        data = train_data_clean)

# Outcome labels 

risk_levels = make.names(c("Low Risk","Moderate Risk","High Risk"))
y_train = factor(make.names(as.character(train_data_clean$risk_category)), levels = risk_levels)

# Test matrix

x_test = model.matrix(~ 0 + Income. + SOA_decile + Gender. + Age + Education + Urban_rural_flag,
                       data = test_data_clean)

# Align test and train 

train_cols = colnames(x_train)
missing = setdiff(train_cols, colnames(x_test))
if (length(missing)) {
  x_test = cbind(x_test, matrix(0, nrow(x_test), length(missing),
                                 dimnames = list(NULL, missing)))
}
x_test = x_test[, train_cols, drop = FALSE]

Tune and run

set.seed(111)

nnet_grid = expand.grid(size = c(3,5,7), decay = c(0.01, 0.1, 0.5))

control = trainControl(
  method = "cv",
  number = 5,
  classProbs = TRUE,
  summaryFunction = multiClassSummary,
  verboseIter = TRUE
)

nnet_model = caret::train(
  x = x_train,
  y = y_train,
  method = "nnet",
  trControl = control,
  tuneGrid = nnet_grid,
  metric = "Accuracy",
  preProcess = c("center","scale"),  # helpful for nnet
  maxit = 200,
  trace = FALSE
)
# Predict 

pred_labels = predict(nnet_model, newdata = x_test)
actual_labels = factor(make.names(as.character(test_data_clean$risk_category)),
                        levels = risk_levels)
caret::confusionMatrix(pred_labels, actual_labels)
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Low.Risk Moderate.Risk High.Risk
##   Low.Risk            19             9         8
##   Moderate.Risk       14            18        16
##   High.Risk           10            12        14
## 
## Overall Statistics
##                                           
##                Accuracy : 0.425           
##                  95% CI : (0.3353, 0.5185)
##     No Information Rate : 0.3583          
##     P-Value [Acc > NIR] : 0.07782         
##                                           
##                   Kappa : 0.1386          
##                                           
##  Mcnemar's Test P-Value : 0.59755         
## 
## Statistics by Class:
## 
##                      Class: Low.Risk Class: Moderate.Risk Class: High.Risk
## Sensitivity                   0.4419               0.4615           0.3684
## Specificity                   0.7792               0.6296           0.7317
## Pos Pred Value                0.5278               0.3750           0.3889
## Neg Pred Value                0.7143               0.7083           0.7143
## Prevalence                    0.3583               0.3250           0.3167
## Detection Rate                0.1583               0.1500           0.1167
## Detection Prevalence          0.3000               0.4000           0.3000
## Balanced Accuracy             0.6105               0.5456           0.5501

Model comparison tables

Model 1

model1 = data.frame(
  Model = c(
    "Ordinal Logistic - Conservative",
    "Ordinal Logistic - Moderate",
    "Random Forest - Conservative",
    "Random Forest - Moderate",
    "XGBoost - Conservative",
    "XGBoost - Moderate",
    "Neural Network - Conservative",
    "Neural Network - Moderate"
  ),
  Accuracy = c(0.6583, 0.65, 0.6583, 0.625, 0.6917, 0.6583, 0.6833, 0.5917),
  P_Value  = c(0.3203, 0.3912068, 0.3203424, 0.61475, 0.1082, 0.320342, 0.1485258, 0.8511),
  Kappa    = c(0.1473, 0.1421, 0.1675, 0.1422, 0.2258, 0.2006, 0.233, 0.0762),
  stringsAsFactors = FALSE
)
model2 = data.frame(
  Model = c(
    "Binary Logistic - Conservative",
    "Binary Logistic - Moderate",
    "Random Forest - Conservative",
    "Random Forest - Moderate",
    "XGBoost - Conservative",
    "XGBoost - Moderate",
    "Neural Network - Conservative",
    "Neural Network - Moderate"
  ),
  Accuracy = c(0.6833, 0.6583, 0.6917, 0.6417, 0.75, 0.6083, 0.6917, 0.6167),
  P_Value  = c(0.1485, 0.32034, 0.1082, 0.4656, 0.0044136, 0.7478, 0.1082, 0.68434),
  Kappa    = c(0.2213, 0.1992, 0.2292, 0.1938, 0.4048, 0.1098, 0.2459, 0.1154),
  stringsAsFactors = FALSE
)
model3 = data.frame(
  Model = c(
    "Ordinal Logistic - Conservative",
    "Ordinal Logistic - Moderate",
    "Random Forest - Conservative",
    "Random Forest - Moderate",
    "XGBoost - Conservative",
    "XGBoost - Moderate",
    "Neural Network - Conservative",
    "Neural Network - Moderate"
  ),
  Accuracy = c(0.4417, 0.3917, 0.4, 0.4417, 0.4, 0.4, 0.45, 0.425),
  P_Value  = c(0.03659, 0.2510811, 0.1952, 0.03659, 0.19517, 0.1952, 0.02401, 0.07782),
  Kappa    = c(0.1709, 0.093, 0.1024, 0.1119, 0.1022, 0.1533, 0.1829, 0.1386),
  stringsAsFactors = FALSE
)

Tables

build_tbl = function(df) {
  df2 = df
  df2$Accuracy = round(df2$Accuracy, 4)
  df2$P_Value  = round(df2$P_Value,  4)
  df2$Kappa    = round(df2$Kappa,    4)

  # highlight winnder
  winner = df2$Model[order(-df2$Accuracy, -df2$Kappa, df2$P_Value)][1]

  gt(df2) %>%
    cols_label(
      Model    = "Model",
      Accuracy = "Accuracy",
      P_Value  = "P-Value [Acc > NIR]",  # <-- left side is the column name; right side is the label
      Kappa    = "Kappa"
    ) %>%
    fmt_number(columns = c(Accuracy, P_Value, Kappa), decimals = 4) %>%
    opt_row_striping() %>%
    tab_options(
      table.width           = pct(80),
      table.font.size       = px(11),
      data_row.padding      = px(2),
      column_labels.padding = px(2),
      heading.padding       = px(0),
      table.font.names      = c("Times New Roman", "Times", "serif")
    ) %>%
    tab_style(
      style = list(cell_fill(color = "#E8F5E9"), cell_text(weight = "bold")),
      locations = cells_body(rows = Model == winner)
    )
}


tbl_model1 = build_tbl(model1)
tbl_model2 = build_tbl(model2)
tbl_model3 = build_tbl(model3)


tbl_model1
Model Accuracy P-Value [Acc > NIR] Kappa
Ordinal Logistic - Conservative 0.6583 0.3203 0.1473
Ordinal Logistic - Moderate 0.6500 0.3912 0.1421
Random Forest - Conservative 0.6583 0.3203 0.1675
Random Forest - Moderate 0.6250 0.6148 0.1422
XGBoost - Conservative 0.6917 0.1082 0.2258
XGBoost - Moderate 0.6583 0.3203 0.2006
Neural Network - Conservative 0.6833 0.1485 0.2330
Neural Network - Moderate 0.5917 0.8511 0.0762
tbl_model2
Model Accuracy P-Value [Acc > NIR] Kappa
Binary Logistic - Conservative 0.6833 0.1485 0.2213
Binary Logistic - Moderate 0.6583 0.3203 0.1992
Random Forest - Conservative 0.6917 0.1082 0.2292
Random Forest - Moderate 0.6417 0.4656 0.1938
XGBoost - Conservative 0.7500 0.0044 0.4048
XGBoost - Moderate 0.6083 0.7478 0.1098
Neural Network - Conservative 0.6917 0.1082 0.2459
Neural Network - Moderate 0.6167 0.6843 0.1154
tbl_model3
Model Accuracy P-Value [Acc > NIR] Kappa
Ordinal Logistic - Conservative 0.4417 0.0366 0.1709
Ordinal Logistic - Moderate 0.3917 0.2511 0.0930
Random Forest - Conservative 0.4000 0.1952 0.1024
Random Forest - Moderate 0.4417 0.0366 0.1119
XGBoost - Conservative 0.4000 0.1952 0.1022
XGBoost - Moderate 0.4000 0.1952 0.1533
Neural Network - Conservative 0.4500 0.0240 0.1829
Neural Network - Moderate 0.4250 0.0778 0.1386

Re run best model

set.seed(111)

# Create matrices

mm = function(df) {
  model.matrix(~ 0 + Education + Income. + SOA_decile, data = df)
}

X_train = mm(train_data_clean)
X_test  = mm(test_data_clean)

colnames(X_train)

train_cols = colnames(X_train)
missing = setdiff(train_cols, colnames(X_test))
if (length(missing)) {
  X_test = cbind(X_test, matrix(0, nrow(X_test), length(missing),
                                 dimnames = list(NULL, missing)))
}
X_test = X_test[, train_cols, drop = FALSE]

# Outcome 

levels_bin = c("At Risk", "Low Risk")
y_train = factor(train_data_clean$BinaryRisk, levels = levels_bin)
y_test  = factor(test_data_clean$BinaryRisk,  levels = levels_bin)

# Tuning grid

xgb_grid = expand.grid(
  nrounds = c(100, 200),
  max_depth = c(3, 5, 7),
  eta = c(0.05, 0.1),
  gamma = c(0, 1),
  colsample_bytree = c(0.6, 0.8),
  min_child_weight = c(1, 5),
  subsample = 0.8
)

control = trainControl(
  method = "cv",
  number = 5,
  classProbs = TRUE,
  summaryFunction = twoClassSummary, 
  verboseIter = TRUE,
  allowParallel = TRUE
)
# Set names 

levels_bin_train = c("At_Risk", "Low_Risk")

y_train = factor(gsub(" ", "_", train_data_clean$BinaryRisk), levels = levels_bin_train)

# Train model
xgb_best = caret::train(
  x = X_train,
  y = y_train,
  method = "xgbTree",
  trControl = control,
  tuneGrid = xgb_grid,
  metric = "ROC"
)

# For predictions, map back

pred_probs = predict(xgb_best, newdata = X_test, type = "prob")[, "At_Risk"]
pred_labels = ifelse(pred_probs >= 0.5, "At Risk", "Low Risk")
# Predict probabilities 

pred_probs = predict(xgb_best, newdata = X_test, type = "prob")[, "At_Risk"]

# Classify using threshold 

pred_labels_clean = ifelse(pred_probs >= 0.5, "At Risk", "Low Risk")

# Original labels 

actual_labels_clean = gsub("_", " ", as.character(y_test))

# Make factors with same levels 

pred_labels_clean = factor(pred_labels_clean, levels = c("At Risk", "Low Risk"))
actual_labels_clean = factor(actual_labels_clean, levels = c("At Risk", "Low Risk"))

# Confusion matrix 

confusionMatrix(pred_labels_clean, actual_labels_clean, positive = "At Risk")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction At Risk Low Risk
##   At Risk       19        5
##   Low Risk      25       71
##                                           
##                Accuracy : 0.75            
##                  95% CI : (0.6627, 0.8245)
##     No Information Rate : 0.6333          
##     P-Value [Acc > NIR] : 0.0044136       
##                                           
##                   Kappa : 0.4048          
##                                           
##  Mcnemar's Test P-Value : 0.0005226       
##                                           
##             Sensitivity : 0.4318          
##             Specificity : 0.9342          
##          Pos Pred Value : 0.7917          
##          Neg Pred Value : 0.7396          
##              Prevalence : 0.3667          
##          Detection Rate : 0.1583          
##    Detection Prevalence : 0.2000          
##       Balanced Accuracy : 0.6830          
##                                           
##        'Positive' Class : At Risk         
## 
xgb_best$bestTune
##    nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 61     100         3 0.1     1              0.8                1       0.8

ROC curve

# Probabilities for the positive class

probs = predict(xgb_best, newdata = X_test, type = "prob")[, "At_Risk"]

# Make a 0/1 response

y_true01 = as.integer(gsub(" ", "_", test_data_clean$BinaryRisk) == "At_Risk")

# sanity check
table(y_true01)  
## y_true01
##  0  1 
## 76 44
# ROC + plot
roc_obj = roc(response = y_true01, predictor = probs, direction = "<", quiet = TRUE)
plot(roc_obj, lwd = 2, main = "ROC Curve")
abline(a = 0, b = 1, lty = 2)

auc(roc_obj)
## Area under the curve: 0.7956

Best threshold

# Probabilities + truth
probs = predict(xgb_best, newdata = X_test, type = "prob")[, "At_Risk"]
lvls  = c("Low_Risk","At_Risk")
y_true = factor(gsub(" ", "_", test_data_clean$BinaryRisk), levels = lvls)

# ROC
roc_obj = roc(y_true, probs, levels = lvls, direction = "<", quiet = TRUE)

# Best threshold - Youden's J

best = coords(roc_obj, "best", best.method = "youden",
               ret = c("threshold","sensitivity","specificity"))

best_thresh = as.numeric(if (is.list(best)) best[["threshold"]] else best["threshold"])
stopifnot(is.finite(best_thresh))

# Reclassify at cutoff and compute CM
pred_opt = factor(ifelse(probs >= best_thresh, "At_Risk", "Low_Risk"), levels = lvls)
cm_opt = confusionMatrix(pred_opt, y_true, positive = "At_Risk")
cm_opt
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Low_Risk At_Risk
##   Low_Risk       65      15
##   At_Risk        11      29
##                                           
##                Accuracy : 0.7833          
##                  95% CI : (0.6989, 0.8533)
##     No Information Rate : 0.6333          
##     P-Value [Acc > NIR] : 0.0002961       
##                                           
##                   Kappa : 0.5244          
##                                           
##  Mcnemar's Test P-Value : 0.5562985       
##                                           
##             Sensitivity : 0.6591          
##             Specificity : 0.8553          
##          Pos Pred Value : 0.7250          
##          Neg Pred Value : 0.8125          
##              Prevalence : 0.3667          
##          Detection Rate : 0.2417          
##    Detection Prevalence : 0.3333          
##       Balanced Accuracy : 0.7572          
##                                           
##        'Positive' Class : At_Risk         
## 
best
##   threshold sensitivity specificity
## 1 0.3971832   0.6590909   0.8552632

Calculate shap values

# SHAP values for each feature

shap = predict(xgb_best$finalModel, newdata = X_train, predcontrib = TRUE)
shap = as.data.frame(shap)

# Drop bias columns

shap = shap[, !grepl("(?i)^(bias|\\(intercept\\))$", colnames(shap)), drop = FALSE]

# Mean SHAP per feature = global importance
mean_abs = data.frame(
  feature = colnames(shap),
  mean_abs_shap = colMeans(abs(shap))
)

n_bars= 10L 

p = mean_abs %>%
  arrange(desc(mean_abs_shap)) %>%
  slice_head(n = n_bars) %>%             
  ggplot(aes(x = reorder(feature, mean_abs_shap), y = mean_abs_shap)) +
  geom_col(width = 0.7) +
  coord_flip() +
  labs(x = "Feature", y = "Mean |SHAP|",
       title = "XGBoost — Feature Impact") +
  theme_minimal(base_family = "Times") +
  theme(
    plot.title = element_text(face = "bold"),
    axis.text = element_text(size = 10),
    panel.grid.minor = element_blank()
  )

p

Aggregate variables

map = tibble(feature = colnames(X_train)) %>%
  mutate(group = case_when(
    grepl("^Income\\.",   feature) ~ "Income",
    feature == "SOA_decile"        ~ "SOA_decile",
    grepl("^Education",   feature) ~ "Education",
    TRUE                           ~ feature
  ))

# join and aggregate 

mean_abs = tibble(feature = colnames(shap),
                   mean_abs_shap = colMeans(abs(shap))) %>%
  left_join(map, by = "feature")

by_var = mean_abs %>%
  group_by(group) %>%
  summarise(mean_abs_shap = sum(mean_abs_shap), .groups = "drop") %>%
  arrange(desc(mean_abs_shap))

by_var_no_intercept = by_var %>%
  filter(!grepl("(?i)intercept|bias", group)) 

# Plot 

p = by_var_no_intercept %>%
  arrange(desc(mean_abs_shap)) %>%
  slice_head(n = nrow(by_var_no_intercept)) %>%  # Use all rows, or set n_bars if you want to limit
  ggplot(aes(x = reorder(group, mean_abs_shap), y = mean_abs_shap)) +
  geom_col(width = 0.7) +
  coord_flip() +
  labs(x = "Feature", y = "Mean |SHAP|",
       title = "XGBoost — Aggregated Feature Impact") +
  theme_minimal(base_family = "Times") +
  theme(
    plot.title = element_text(face = "bold"),
    axis.text = element_text(size = 10),
    panel.grid.minor = element_blank()
  )
p

Plot interactions

# SHAP interaction values

shap_int = predict(xgb_best$finalModel, newdata = X_train, predinteraction = TRUE)

feat_names = colnames(X_train)
p_all = dim(shap_int)[2]
has_bias = (p_all == length(feat_names) + 1)

# indices of real features apart from bias

feat_idx = if (has_bias) seq_len(length(feat_names)) else seq_len(p_all)

# mean interaction across rows 

int_mat = apply(abs(shap_int[, feat_idx, feat_idx, drop = FALSE]), c(2, 3), mean)
dimnames(int_mat) = list(feat_names, feat_names)

# long data frame

ut = which(upper.tri(int_mat), arr.ind = TRUE)
pairs = data.frame(
  f1 = rownames(int_mat)[ut[, 1]],
  f2 = colnames(int_mat)[ut[, 2]],
  mean_abs_int = int_mat[ut]
)
pairs = pairs[order(-pairs$mean_abs_int), ]
top_k = head(pairs, 10)
top_k$pair = paste(top_k$f1, "×", top_k$f2)

# plot 

ggplot(top_k, aes(x = mean_abs_int, y = reorder(pair, mean_abs_int))) +
  geom_segment(aes(x = 0, xend = mean_abs_int, yend = pair)) +
  geom_point() +
  labs(x = "Mean |SHAP interaction|", y = "Feature pair",
       title = "Top 10 Feature Interactions") +
  theme_minimal(base_family = "Times")

Predicitions

Creating a fake population data set

Read necessary data and change to fit existing data

education_new = read.csv("Education_new.csv")

education_new = education_new %>%
  filter(!is.na(.[[1]]) & .[[1]] != "") %>%        
  rename(LSOA = 1) %>%                            
  mutate(LSOA = str_trim(str_extract(LSOA, "^[^:]+")))

colnames(education_new)
## [1] "LSOA"                                                            
## [2] "All.categories..Highest.level.of.qualification"                  
## [3] "Highest.level.of.qualification..Level.1.qualifications"          
## [4] "Highest.level.of.qualification..Level.2.qualifications"          
## [5] "Highest.level.of.qualification..Apprenticeship"                  
## [6] "Highest.level.of.qualification..Level.3.qualifications"          
## [7] "Highest.level.of.qualification..Level.4.qualifications.and.above"
education_new = education_new %>%
  mutate(
    Secondary_school_up_to_16 = Highest.level.of.qualification..Level.1.qualifications +  Highest.level.of.qualification..Level.2.qualifications,
    Higher_secondary_or_further =  Highest.level.of.qualification..Level.3.qualifications +  Highest.level.of.qualification..Apprenticeship,
    Higher_education =  Highest.level.of.qualification..Level.4.qualifications.and.above,
  ) %>%
  dplyr::select(LSOA, Secondary_school_up_to_16, Higher_secondary_or_further, Higher_education)

education_new = education_new %>%
  rowwise() %>%
  mutate(across(-LSOA, ~ . / sum(c_across(-LSOA)))) %>%
  ungroup()

Add imd

IMD = read.csv("IMD_new.csv")

population_data = left_join(
  education_new,
  IMD,
  by = c("LSOA" = "LSOA")
)
# add msoas to population data - 35672

LSOA_MSOA = read.csv("LSOA_MSOA_2011_new.csv")

population_data = left_join(
  education_new,
  LSOA_MSOA,
  by = c("LSOA" = "LSOA11CD")
)

population_data = population_data %>%
  distinct(LSOA, .keep_all = TRUE)

sum(is.na(population_data))
## [1] 0
income = read.csv("household_income_msoa.csv")

population_data = left_join(
  population_data,
  income,
  by = c("MSOA11CD" = "MSOA.code")
)

population_data = population_data %>% dplyr:: select(-c("MSOA.name", "Local.authority.code", "Local.authority.name", "Region.code", "Region.name", "Upper.confidence.limit....", "Lower.confidence.limit....", "Confidence.interval...."))

# Convert to income bands

income_labels = c(
  "£0 to £15,000 per year",
  "£15,000 to £30,000 per year",
  "£30,000 to £50,000 per year",
  "£50,000 to £100,000 per year",
  "Above £100,000 per year"
)
breaks = c(-Inf, 15000, 30000, 50000, 100000, Inf)

population_data = population_data %>%
  mutate(
    mean_income_num = as.numeric(gsub(",", "", Total.annual.income....)),
    IncomeBand = cut(mean_income_num, breaks = breaks, labels = income_labels, right = TRUE)
  )

# Get rid of unecessary columns 

population_data = population_data %>% dplyr:: select(-c("Total.annual.income....", "MSOA11CD", "mean_income_num"))

Add population size of lsoas

population_size = read.csv("Population_size.csv")

population_size = population_size %>%
  filter(!is.na(.[[1]]) & .[[1]] != "") %>%       
  rename(LSOA = 1) %>%                            
  mutate(LSOA = str_trim(str_extract(LSOA, "^[^:]+")))

population_data = left_join(
  population_data,
  population_size,
  by = c("LSOA" = "LSOA")
)


population_data$All.usual.residents = as.numeric(population_data$All.usual.residents)

Create synthetic population

synthetic micro

# Define variables

levels_Edu = c("Secondary school up to 16 years",
                "Higher or secondary or further education (A-Levels, BTEC, etc.)",
                "Higher education / degree")
levels_Inc = c("£0 to £15,000 per year",
                "£15,000 to £30,000 per year",
                "£30,000 to £50,000 per year",
                "£50,000 to £100,000 per year",
                "Above £100,000 per year")


# Creates every possible combination

micro = expand.grid(
  Education = levels_Edu,
  Income    = levels_Inc
)

micro$w_init = 1  # initial uniform weights

Define functions

# Takes numbers and converts to proportions

safe_prop = function(x) { x[is.na(x)] <- 0; s <- sum(x); if (s <= 0) rep(1/length(x), length(x)) else x/s }

# Convert proportions to actual counts that add up to N

probs_to_counts = function(p, N) {
  p[is.na(p)] <- 0; p <- p / sum(p)
  tgt <- round(p * N)
  diff <- N - sum(tgt)
  if (diff != 0) {
    ord <- order(p, decreasing = (diff > 0))
    for (i in seq_len(abs(diff))) tgt[ ord[((i-1) %% length(tgt)) + 1] ] <- tgt[ ord[((i-1) %% length(tgt)) + 1] ] + sign(diff)
  }
  tgt
}

# average income band with weighting

meanband_to_probs = function(mean_band, K = 5, labels = levels_Inc) {
  mu = if (is.character(mean_band)) match(mean_band, labels) else as.numeric(mean_band)
  mu = min(max(mu, 1), K)
  if (abs(mu - 1) < 1e-9) { p <- rep(0, K); p[1] <- 1; return(p) }
  if (abs(mu - K) < 1e-9) { p <- rep(0, K); p[K] <- 1; return(p) }
  f = function(lambda) { k <- 1:K; w <- exp(lambda * k); sum(k*w)/sum(w) - mu }
  lambda = uniroot(f, c(-20, 20))$root
  w = exp(lambda * (1:K)); as.numeric(w / sum(w))
}

# target counts for each category

targets_for = function(row) {
  N = as.integer(row[["All.usual.residents"]])
  out = list()

  # Education
  p_edu = safe_prop(as.numeric(row[ c("Secondary_school_up_to_16", "Higher_secondary_or_further", "Higher_education") ]))
  out$Education = probs_to_counts(p_edu, N)

  # Income from average band
  
  income_band = row[["IncomeBand"]]
  if(is.na(income_band)) {
    income_band = "£30,000 to £50,000 per year"  # Default to middle band
  }
  p_inc = meanband_to_probs(income_band, K = 5, labels = levels_Inc)
  out$Income = probs_to_counts(p_inc, N)

  out
}
# Converts into mathematical matrices

one_hot = function(x, lev) {
  f = factor(x, levels = lev)
  mm = model.matrix(~ f - 1L)
  colnames(mm) = lev
  storage.mode(mm) = "double"
  mm
}

X = list(
  Education = one_hot(micro$Education, levels_Edu),
  Income    = one_hot(micro$Income,    levels_Inc)
)

# IPF Algorithm

ipf_weights = function(X, targets, max_iter = 200, tol = 1e-6) {
  n = nrow(X[[1L]])
  w = rep(1, n)
  for (it in seq_len(max_iter)) {
    w_old = w
    for (nm in names(X)) {
      Xi = X[[nm]]
      tgt = targets[[nm]]
      est = as.numeric(colSums(Xi * w))
      adj = rep(1, length(tgt))
      nz = est > 0
      adj[nz] <- tgt[nz] / est[nz]
      w = w * as.numeric(Xi %*% adj)
    }
    if (max(abs(w - w_old)) < tol) break
  }
  w
}
# Sampling 

seed_for = function(x) (sum(utf8ToInt(as.character(x))) * 131) %% .Machine$integer.max

sample_N = function(micro, w, N, lsoa_id) {
  if (sum(w) <= 0) w <- rep(1, length(w))
  p <- w / sum(w)
  set.seed(seed_for(lsoa_id))
  idx <- sample.int(nrow(micro), size = N, replace = TRUE, prob = p)
  micro[idx, , drop = FALSE]
}

# Generate population 

synthetic_pop = population_data %>%
  group_split(row_number()) %>%
  lapply(function(rr) {
    row <- rr[1, ]
    lsoa <- row[["LSOA"]]
    N <- as.integer(row[["All.usual.residents"]])
    tgt <- targets_for(row)
    w <- ipf_weights(X, tgt)
    syn <- sample_N(micro, w, N, lsoa)
    syn$LSOA <- lsoa
    syn
  }) %>%
  bind_rows()

Add additional variables back in

IMD = read.csv("IMD_new.csv")

synthetic_pop = left_join(
  synthetic_pop,
  IMD,
  by = c("LSOA" = "LSOA")
)
synthetic_df = synthetic_pop


MAPPING = c(
  "Income."    = "Income",
  "SOA_decile" = "SOA_decile",
  "Education"  = "Education"
)

EDU_MAPPING = c(
  "Secondary school up to 16 years" = "Secondary school up to 16 years",
  "Higher or secondary or further education (A-Levels, BTEC, etc.)"  = "Higher or secondary or further education (A-Levels, BTEC, etc.)",
  "Higher education / degree"  = "Higher education"
)


synthetic_df$Education = unname(EDU_MAPPING[synthetic_df$Education])


names(synthetic_df)[names(synthetic_df) == "Income"] = "Income."

Predict

# match training 

synthetic_df_fixed = synthetic_df %>%
  filter(!is.na(SOA_decile)) %>%
  mutate(
    Income. = case_when(
      Income. == "£15,000 to £30,000 per year" ~ "£15,000 to £30,000 per year ",
      Income. == "£30,000 to £50,000 per year" ~ "£30,000 to £50,000 per year ",
      Income. == "Above £100,000 per year" ~ "Above £100,000 per year ",
      TRUE ~ Income.
    ),
    Education = case_when(
      Education == "Higher education / degree" ~ "Higher education",  # ← REVERSE THIS
      TRUE ~ Education
    )
  )


# Create model matrix

X_pred = model.matrix(~ 0 + Education + Income. + SOA_decile, data = synthetic_df_fixed)

train_cols = colnames(X_train) 


# Add predictions

pred_probs = predict(xgb_best, newdata = X_pred, type = "prob")[, "At_Risk"]

pred_classes = ifelse(pred_probs >= 0.397, "At_Risk", "Low_Risk")

synthetic_df_fixed$Prob_At_Risk = pred_probs

synthetic_df_fixed$Predicted_Risk = pred_classes

# Aggregate to LSOA level

lsoa_results = synthetic_df_fixed %>%
  group_by(LSOA) %>%
  summarise(
    Population = n(),
    Percent_At_Risk = round(mean(Predicted_Risk == "At_Risk") * 100),
    Avg_Risk_Prob = round(mean(Prob_At_Risk), 3),
    Modal_Income = names(sort(table(Income.), decreasing = TRUE))[1],
    Modal_Education = names(sort(table(Education), decreasing = TRUE))[1],
    SOA_decile = first(SOA_decile),
    .groups = "drop"
  ) %>%
  arrange(desc(Percent_At_Risk))

# Results

print(paste("Predicted for", nrow(synthetic_df_fixed), "individuals in", nrow(lsoa_results), "LSOAs"))
## [1] "Predicted for 56075912 individuals in 34753 LSOAs"
print("Top 10 highest risk LSOAs:")
## [1] "Top 10 highest risk LSOAs:"
print(head(lsoa_results, 10))
## # A tibble: 10 × 7
##    LSOA    Population Percent_At_Risk Avg_Risk_Prob Modal_Income Modal_Education
##    <chr>        <int>           <dbl>         <dbl> <chr>        <chr>          
##  1 E01006…       1727             100         0.825 £0 to £15,0… Secondary scho…
##  2 E01006…       1616             100         0.835 £0 to £15,0… Secondary scho…
##  3 E01006…       1272             100         0.834 £0 to £15,0… Secondary scho…
##  4 E01007…       1368             100         0.833 £0 to £15,0… Secondary scho…
##  5 E01007…       1105             100         0.832 £0 to £15,0… Secondary scho…
##  6 E01007…       1664             100         0.831 £0 to £15,0… Secondary scho…
##  7 E01007…       1752             100         0.831 £0 to £15,0… Secondary scho…
##  8 E01007…       1606             100         0.829 £0 to £15,0… Secondary scho…
##  9 E01007…       1583             100         0.828 £0 to £15,0… Secondary scho…
## 10 E01007…       1784             100         0.831 £0 to £15,0… Secondary scho…
## # ℹ 1 more variable: SOA_decile <int>
# Save
write.csv(lsoa_results, "lsoa_risk_simple.csv", row.names = FALSE)

Map

lsoa_shp = read_sf("LSOA_2011_EW_BFC_V3.shp")

lsoa_shp_map = left_join(
  lsoa_results,
  lsoa_shp,
  by = c("LSOA" = "LSOA11CD")
)

lsoa_shp_map = st_sf(lsoa_shp_map)

lsoa_shp_map = lsoa_shp_map %>%
  mutate(
    Risk_Category = case_when(
      is.na(Percent_At_Risk) ~ "No Data",
      Percent_At_Risk >= 80 ~ "Very High (80%+)",
      Percent_At_Risk >= 60 ~ "High (60-80%)",
      Percent_At_Risk >= 40 ~ "Medium (40-60%)",
      Percent_At_Risk >= 20 ~ "Low (20-40%)",
      TRUE ~ "Very Low (<20%)"
    ),
    Risk_Category = factor(Risk_Category, 
                          levels = c("Very Low (<20%)", "Low (20-40%)", 
                                   "Medium (40-60%)", "High (60-80%)", 
                                   "Very High (80%+)", "No Data"))
  )


risk_colors = c(
  "Very Low (<20%)" = "#2E8B57",   
  "Low (20-40%)" = "#FFD700",         
  "Medium (40-60%)" = "#FF8C00",   
  "High (60-80%)" = "#FF4500",      
  "Very High (80%+)" = "#8B0000",   
  "No Data" = "#D3D3D3"             
)
summary_stats = lsoa_shp_map %>%
  st_drop_geometry() %>%
  filter(!is.na(Percent_At_Risk)) %>%
  summarise(
    n_areas = n(),
    mean_risk = round(mean(Percent_At_Risk), 1),
    median_risk = round(median(Percent_At_Risk), 1),
    high_risk_areas = sum(Percent_At_Risk >= 60),
    pct_high_risk = round(100 * high_risk_areas / n_areas, 1)
  )


p1_pub = ggplot(lsoa_shp_map) +
  geom_sf(aes(fill = Risk_Category), 
          color = "NA", 
          linewidth = 0.01) +  # Very thin white borders
  
  scale_fill_manual(
    values = risk_colors, 
    name = "Risk Level (%)",
    guide = guide_legend(
      title.position = "top",
      title.hjust = 0.5,
      nrow = 1,
      override.aes = list(color = "black", linewidth = 0.5)  # Clean legend
    )
  ) +
  
  # Scale bar and north arrow
  annotation_scale(
    location = "bl",
    width_hint = 0.25,
    style = "ticks",
    line_col = "black",
    text_col = "black",
    text_cex = 0.9
  ) +
  
  annotation_north_arrow(
    location = "tr",
    which_north = "true",
    style = north_arrow_minimal(text_size = 8),
    height = unit(1.2, "cm"),
    width = unit(1.2, "cm")
  ) +
  
  theme_void() +
  theme(
    text = element_text(family = "Times", color = "black"),
    plot.title = element_text(
      size = 12, 
      face = "bold", 
      hjust = 0.5,
      margin = ggplot2::margin(b = 5)
    ),
    plot.subtitle = element_text(
      size = 10, 
      hjust = 0.5,
      margin = ggplot2::margin(b = 10),
      color = "gray30"
    ),
    plot.caption = element_text(
      size = 7,
      hjust = 0,
      margin = ggplot2::margin(t = 10),
      color = "gray50"
    ),
    
    legend.position = "bottom",
    legend.title = element_text(size = 11, face = "bold"),
    legend.text = element_text(size = 10),
    legend.margin = ggplot2::margin(t = 15),
    legend.box.spacing = unit(0.5, "cm"),
    
    plot.margin = ggplot2::margin(20, 20, 20, 20),
    
    panel.background = element_rect(fill = "white", color = NA),
    plot.background = element_rect(fill = "white", color = NA)
  ) +
  
  labs(
    title = "Coeliac Disease Related Economic Risk Predictions Across English Lower Super Output Areas",
    subtitle = sprintf(
      "Mean risk: %s%% | %s LSOAs (%s%%) classified as high risk (≥60%%)", 
      summary_stats$mean_risk,
      scales::comma(summary_stats$high_risk_areas),
      summary_stats$pct_high_risk
    ),
    caption = paste0(
      "Data: Synthetic population predictions (n = ", 
      scales::comma(summary_stats$n_areas), 
      " LSOAs) | ", "Model: XGBoost classification "
    )
  )

p1_pub

Stats

labels = c("Very Low (<20%)","Low (20–40%)","Medium (40–60%)",
            "High (60–80%)","Very High (80%+)")

risk_num = lsoa_results$Percent_At_Risk / 100
breaks= c(-Inf, 0.20, 0.40, 0.60, 0.80, Inf)

df = lsoa_results %>%
  mutate(
    Risk_Band = cut(risk_num, breaks = breaks, labels = labels, right = FALSE)
  )

band_counts = df %>%
  count(Risk_Band, name = "n") %>%
  mutate(perc = round(100 * n / sum(n), 1)) %>%
  arrange(match(Risk_Band, labels))

band_counts
## # A tibble: 5 × 3
##   Risk_Band            n  perc
##   <fct>            <int> <dbl>
## 1 Very Low (<20%)   5255  15.1
## 2 Low (20–40%)      8242  23.7
## 3 Medium (40–60%)  15126  43.5
## 4 High (60–80%)     1109   3.2
## 5 Very High (80%+)  5021  14.4