library(ggplot2)
library(maps)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(osrm)
## Data: (c) OpenStreetMap contributors, ODbL 1.0 - http://www.openstreetmap.org/copyright
## Routing: OSRM - http://project-osrm.org/
library(sf)
## Linking to GEOS 3.13.0, GDAL 3.8.5, PROJ 9.5.1; sf_use_s2() is TRUE
library(stringr)
library(sf)
library(osrm)
library(geosphere)
library(leaflet)
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(caret)
## Loading required package: lattice
library(tidyr)
library(gt)
library(brant)
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(caret)
library(xgboost)
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
##
## slice
library(nnet)
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(ggspatial)
data = read.csv("DATA clean.csv")
LSOA_lookup = read.csv("LSOA_lookup.csv")
# Clean survey dataset
data = data %>%
mutate(Postcode = str_replace_all(str_to_upper(Postcode), " ", ""))
# Clean lookup table
LSOA_lookup = LSOA_lookup %>%
mutate(pcd7 = str_replace_all(str_to_upper(pcd7), " ", ""))
# Join
data = data %>%
left_join(LSOA_lookup, by = c("Postcode" = "pcd7"))
data = data %>%
filter(!is.na(lsoa21cd))
data = data %>%
dplyr::select(-X, -pcd8, -pcds, -dointr, -doterm, -usertype, -oa21cd, -msoa21nm, -ladnm, -ladnmw)
imd_lookup = read.csv("standardised_IMD.csv")
data = data %>%
left_join(imd_lookup, by = c("lsoa21cd" = "LSOA"))
rural_urban = read.csv("Rural_urban.csv")
data = data %>%
left_join(rural_urban, by = c("ladcd" = "LAD21CD"))
data = data %>%
filter(!is.na(Urban_rural_flag))
data = data %>%
filter(!is.na(SOA_decile))
# GP surgeries
gp = read.csv("GPS_England_Wales.csv")
# Filter for only active institutions
gp_active = gp %>%
filter(gp[[13]] == "A")
# Filter for only GP surgeries and hospitals
gp_filtered = gp_active %>%
filter(gp_active[[26]] == 4 | gp_active[[26]] == 10)
# Get rid of unnecessary columns for ease
gp_filtered = gp_filtered %>%
dplyr::select(-c(3, 4, 9, 11, 12, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27))
# Rename columns
colnames(gp_filtered) = c("Org code", "Name", "Adress1", "Adress2", "Adress3", "Adress4", "Postcode", "Status", "Setting")
# Change postcodes to long/lat
postcode_convert = read.csv("ukpostcodes.csv")
# Clean postcodes
postcode_convert$postcode = toupper(gsub("\\s+", "", postcode_convert$postcode))
gp_filtered$clean_postcode = toupper(gsub("\\s+", "", gp_filtered$Postcode))
# Merge
data = left_join(data, postcode_convert, by = c("Postcode" = "postcode"))
gp_postcodes = left_join(gp_filtered, postcode_convert, by = c("clean_postcode" = "postcode"))
# Convert to sf objects
gps_clean = gp_postcodes %>%
filter(!is.na(longitude), !is.na(latitude))
lsoa_clean = data %>%
filter(!is.na(longitude), !is.na(latitude))
lsoa_sf = st_as_sf(lsoa_clean, coords = c("longitude", "latitude"), crs = 4326)
gp_sf = st_as_sf(gps_clean, coords = c("longitude", "latitude"), crs = 4326)
results = list()
# Get coordinate matrix of all GPs
gp_coords = st_coordinates(gp_sf)
for (i in seq_len(nrow(lsoa_sf))) {
lsoa_point = lsoa_sf[i, ]
cat("Processing LSOA", i, "/", nrow(lsoa_sf), "\n")
tryCatch({
# Get coordinates of current LSOA
lsoa_coord = st_coordinates(lsoa_point)
# Compute Haversine distances to all GPs
dists = distHaversine(lsoa_coord, gp_coords)
# Get indices of 10 closest GPs
nearest_idxs = order(dists)[1:10]
# Subset those GPs
gps_nearby = gp_sf[nearest_idxs, ]
# route to just 10 GPs
travel_times = osrmTable(src = lsoa_point, dst = gps_nearby)
if (!is.null(travel_times$durations)) {
min_time = min(travel_times$durations[1, ], na.rm = TRUE)
results[[i]] = data.frame(
lsoa_code = lsoa_point$lsoa21cd,
min_time_to_gp = min_time
)
}
}, error = function(e) {
message("Failed on row ", i, ": ", e$message)
})
}
# Combine results
travel_df = bind_rows(results)
data = data %>%
left_join(travel_df, by = c("lsoa21cd" = "lsoa_code"))
data = data %>%
distinct()
data = data[data$ID != 10, ]
data = data %>%
filter(!is.na(Risk))
# Define a vector of variables for the index
index_vars = c(
"ID", "Spend", "Cost_home", "Avoided.", "Limited", "Prescription", "Prescription_difference", "Cost_out", "Avoided_out.", "Spend_other.", "Availabilty", "Travel.", "Extra_time", "Extra_time_shopping.", "Time_off", "Stress", "Follow_GF.", "QOL"
)
# Create a new data frame with only these columns
index_df = data[ , index_vars]
# Create scores for each variable
index_df = index_df %>%
mutate(
Spend_score = case_when(
Spend == "£0-£10" ~ 0,
Spend == "£11-£20" ~ 1,
Spend == "£21-£30" ~ 2,
Spend == "£31-£40" ~ 3,
Spend == "Over £40" ~ 4,
TRUE ~ 0
),
Cost_home_score = case_when(
Cost_home == "About the same" ~ 0,
Cost_home == "Slightly more expensive" ~ 1,
Cost_home == "Much more expensive" ~ 2,
TRUE ~ 0
),
Avoided_score = case_when(
Avoided. == "No, never" ~ 0,
Avoided. == "Yes, occasionally" ~ 1,
Avoided. == "Yes, often" ~ 2,
TRUE ~ 0
),
Limited_score = case_when(
Limited == "No" ~ 0,
Limited == "Yes, somewhat" ~ 1,
Limited == "Yes, significantly" ~ 2,
TRUE ~ 0
),
Prescription_score = case_when(
Prescription == "Yes, regularly" ~ 0,
Prescription == "Yes, occasionally" ~ 1,
Prescription %in% c("No, not eligible in my area", "No, eligible, but do not use it", "No, unaware this was an option") ~ 2,
TRUE ~ 0
),
Prescription_difference_score = case_when(
Prescription_difference == "A significant difference" ~ 0,
Prescription_difference %in% c("A moderate difference", "A small difference") ~ 1,
Prescription_difference == "No difference" ~ 2,
TRUE ~ 0
),
Cost_out_score = case_when(
Cost_out == "Very cheap" ~ 0,
Cost_out == "Affordable" ~ 1,
Cost_out == "Expensive" ~ 2,
Cost_out == "Very expensive" ~ 3,
TRUE ~ 0
),
Avoided_out_score = case_when(
Avoided_out. == "No, never" ~ 0,
Avoided_out. == "Yes, occasionally" ~ 1,
Avoided_out. == "Yes, often" ~ 2,
TRUE ~ 0
),
Spend_other_score = case_when(
Spend_other. == "Never" ~ 0,
Spend_other. == "Occasionally" ~ 1,
Spend_other. == "Frequently" ~ 2,
TRUE ~ 0
),
Availabilty_score = case_when(
Availabilty == "Very Good" ~ 0,
Availabilty == "Good" ~ 1,
Availabilty == "Adequate" ~ 2,
Availabilty == "Poor" ~ 3,
Availabilty == "Very poor" ~ 4,
TRUE ~ 0
),
Travel_score = case_when(
Travel. == "No" ~ 0,
Travel. == "Yes" ~ 2,
TRUE ~ 0
),
Extra_time_score = case_when(
Extra_time == "Less than 30 minutes" ~ 0,
Extra_time == "30 minutes - 1 hour" ~ 1,
Extra_time == "1–2 hours" ~ 2,
Extra_time == "3–4 hours" ~ 3,
Extra_time == "More than 4 hours" ~ 4,
TRUE ~ 0
),
Extra_time_shopping_score = case_when(
Extra_time_shopping. == "Less than 30 minutes" ~ 0,
Extra_time_shopping. == "30 minutes - 1 hour" ~ 1,
Extra_time_shopping. == "1–2 hours" ~ 2,
Extra_time_shopping. == "3–4 hours" ~ 3,
Extra_time_shopping. == "More than 4 hours" ~ 4,
TRUE ~ 0
),
Time_off_score = case_when(
Time_off == "Yes" ~ 2,
Time_off == "No" ~ 0,
TRUE ~ 0
),
Stress_score = case_when(
Stress == "None at all" ~ 0,
Stress == "A small amount" ~ 1,
Stress == "A moderate amount" ~ 2,
Stress == "A great deal" ~ 3,
TRUE ~ 0
),
Limited_diet_score = case_when(
Follow_GF. == "No" ~ 0,
Follow_GF. == "Yes - occasionally" ~ 1,
Follow_GF. == "Yes - frequently" ~ 2,
TRUE ~ 0
),
QOL_score = case_when(
QOL == "No impact" ~ 0,
QOL == "Yes - a slight impact" ~ 1,
QOL == "Yes - a significant impact" ~ 2,
TRUE ~ 0
),
) %>%
# Sum all score columns to create the final index
mutate(
economic_risk_index = rowSums(across(ends_with("_score")), na.rm = TRUE)
)
ggplot(index_df, aes(x = economic_risk_index)) +
geom_histogram(binwidth = 1, fill = "#69b3a2", color = "white") +
labs(title = "Distribution of Economic Risk Index",
x = "Economic Risk Index Score",
y = "Number of Respondents") +
theme_minimal()
index_df = index_df %>%
mutate(z_score = scale(economic_risk_index)[,1])
index_df = index_df %>%
mutate(risk_category = case_when(
z_score <= -0.5 ~ "Low Risk",
z_score >= 0.5 ~ "High Risk",
TRUE ~ "Moderate Risk"
))
table(index_df$risk_category)
##
## High Risk Low Risk Moderate Risk
## 146 157 181
ggplot(index_df, aes(x = economic_risk_index, fill = risk_category)) +
geom_histogram(binwidth = 1, color = "white", boundary = 0, closed = "left") +
scale_fill_manual(values = c("Low Risk" = "green",
"Moderate Risk" = "gold",
"High Risk" = "red")) +
labs(
title = "Economic Risk Index Distribution by Category",
x = "Economic Risk Index Score",
y = "Number of Respondents",
fill = "Risk Category"
) +
theme_minimal() +
theme(
legend.position = "top"
)
data = data %>%
mutate(
economic_risk_index = index_df$economic_risk_index,
risk_category = index_df$risk_category
)
data = data %>%
mutate(
Risk = str_to_title(str_trim(Risk)),
Risk = ifelse(Risk == "", NA, Risk)
)
data %>%
filter(!is.na(Risk), !is.na(risk_category)) %>%
count(Risk, risk_category) %>%
group_by(Risk) %>%
mutate(prop = n / sum(n)) %>%
ggplot(aes(x = Risk, y = prop, fill = risk_category)) +
geom_col(position = "stack") +
scale_fill_manual(values = c("Low Risk" = "green", "Moderate Risk" = "gold", "High Risk" = "red")) +
labs(
title = "Distribution of Calculated Risk Within Each Self-Reported Category",
x = "Self-Reported Risk",
y = "Proportion",
fill = "Calculated Risk"
) +
theme_minimal()
# Gender
data = data %>%
filter(`Gender.` != "My sex is female")
data = data %>%
filter(`Gender.` != "Prefer not to say")
data = data %>%
filter(`Gender.` != "Non-binary")
gender_proportions = prop.table(table(data$Gender))
gender_percentages = gender_proportions * 100
print(gender_percentages)
##
## Man Woman
## 5.405405 94.594595
# Age
age_proportions = prop.table(table(data$Age))
age_percentages = age_proportions * 100
print(age_percentages)
##
## 18-24 25-34 35-44 45-54 55-64 65 or older
## 6.652807 13.305613 22.037422 25.987526 20.374220 11.642412
# Race
data = data %>%
filter(`Ethnicity` != "British")
race_proportions = prop.table(table(data$Ethnicity))
race_percentages = race_proportions * 100
print(race_percentages)
##
## Asian/Asian British Mixed/Multiple ethnic origins
## 0.2083333 0.4166667
## White
## 99.3750000
# Education
# Combine undergrad and postgrad into one to match census category
data = data %>%
mutate(Education = str_trim(Education),
Education = case_when(
str_to_lower(Education) %in% c("college or university", "post-graduate degree") ~ "Higher education",
TRUE ~ Education
))
data = data %>%
filter(`Education` != "Prefer not to say")
education_proportions = prop.table(table(data$Education))
education_percentages = education_proportions * 100
print(education_percentages)
##
## Higher education
## 70.675105
## Higher or secondary or further education (A-Levels, BTEC, etc.)
## 21.097046
## Secondary school up to 16 years
## 8.227848
# Income
data = data %>%
filter(`Income.` != "Prefer not to say")
income_proportions = prop.table(table(data$Income.))
income_percentages = income_proportions * 100
print(income_percentages)
##
## £0 to £15,000 per year £15,000 to £30,000 per year
## 6.038647 16.666667
## £30,000 to £50,000 per year £50,000 to £100,000 per year
## 28.019324 38.164251
## Above £100,000 per year
## 11.111111
# Car
car_proportions = prop.table(table(data$Car))
car_percentages = car_proportions * 100
print(car_percentages)
##
## No Yes
## 5.555556 94.444444
# CD
CD_proportions = prop.table(table(data$CD))
CD_percentages = CD_proportions * 100
print(CD_percentages)
##
## No Yes
## 20.28986 79.71014
# Household
household_proportions = prop.table(table(data$Household.))
household_percentages = household_proportions * 100
print(household_percentages)
##
## Child
## 12.3188406
## Child being tested
## 0.2415459
## Friend/housemate
## 0.4830918
## Grandchild
## 0.2415459
## Granddaughter. Daughter in law
## 0.2415459
## Husband and daughter
## 0.2415459
## I am coeliac and my 2 daughters are coeliac
## 0.2415459
## Me
## 0.2415459
## Myself
## 0.2415459
## Myself and my child
## 0.2415459
## N/A
## 63.7681159
## No just me
## 0.4830918
## Parent
## 13.0434783
## parent and spouse
## 0.2415459
## Partner and child
## 0.2415459
## Partner/spouse
## 5.7971014
## Sibling
## 1.6908213
# Location
leaflet(data) %>%
addProviderTiles(providers$CartoDB.Positron) %>%
addCircleMarkers(~longitude, ~latitude,
radius = 4,
popup = ~Postcode,
fillColor = "red",
fillOpacity = 0.6,
stroke = FALSE) %>%
addScaleBar()
direct_cols = c(
"Spend",
"Cost_home",
"Avoided.",
"Limited",
"Prescription",
"Prescription_difference",
"Cost_out",
"Avoided_out.",
"Spend_other.")
q_names = c(
Spend = "Household Spend per Week on GF Diet",
Cost_home = "Cost of GF Alternatives in Local Area",
Avoided. = "Avoided Buying GF Food Due To Cost",
Limited = "Cost of GF Food Limited Household Diet or Nutrition",
Prescription = "Recieve GF Food on Prescription",
Prescription_difference = "Financial Impact of Prescriptions",
Cost_out = "Cost of GF Alternatives When Eating Out",
Avoided_out. = "Avoided Eating Out Due to GF Alternative Cost",
Spend_other. = "Cost of GF Food Caused To Limit Spending Elsewhere")
out = do.call(rbind, lapply(direct_cols, function(col) {
v = data[[col]]
v = v[!is.na(v) & v != ""]
if (length(v) == 0) return(NULL)
tab = table(v, useNA = "no")
tab = sort(tab, decreasing = TRUE)
pct = round(100 * prop.table(tab), 1)
summary_str = paste(paste0(names(pct), ": ", pct, "%"), collapse = ", ")
data.frame(
Question = ifelse(col %in% names(q_names), q_names[col], col),
Summary = summary_str,
stringsAsFactors = FALSE
)
}))
out
## Question
## 1 Household Spend per Week on GF Diet
## 2 Cost of GF Alternatives in Local Area
## 3 Avoided Buying GF Food Due To Cost
## 4 Cost of GF Food Limited Household Diet or Nutrition
## 5 Recieve GF Food on Prescription
## 6 Financial Impact of Prescriptions
## 7 Cost of GF Alternatives When Eating Out
## 8 Avoided Eating Out Due to GF Alternative Cost
## 9 Cost of GF Food Caused To Limit Spending Elsewhere
## Summary
## 1 £11-£20: 32.6%, £21-£30: 28.2%, Over £40: 16.9%, £31-£40: 13%, £0-£10: 9.3%
## 2 Much more expensive: 91.4%, Slightly more expensive: 8.1%, About the same: 0.5%
## 3 Yes, often: 44.6%, Yes, occasionally: 35.5%, No, never: 19.4%, Not sure: 0.5%
## 4 No: 40.8%, Yes, somewhat: 39.1%, Yes, significantly: 13.4%, Not sure: 6.6%
## 5 No, not eligible in my area: 61.6%, No, eligible, but do not use it: 12.7%, Yes, regularly: 9.3%, No, unaware this was an option: 8.6%, Yes, occasionally: 7.8%
## 6 A significant difference: 37.6%, A moderate difference: 33.3%, A small difference: 19.4%, No difference: 9.7%
## 7 Expensive: 56.5%, Affordable: 30.1%, Very expensive: 13.2%, Very cheap: 0.2%
## 8 No, never: 38.7%, Yes, occasionally: 37.2%, Yes, often: 21.4%, Not sure: 2.7%
## 9 Occasionally: 41.4%, Never: 32.6%, Frequently: 21.2%, Not sure: 4.9%
out |>
gt() |>
cols_label(Question = "Question", Summary = "Summary") |>
opt_row_striping() |>
tab_options(table.font.size = px(12)) |>
tab_style(
style = cell_text(weight = "bold"),
locations = cells_body(columns = Question)
) |>
cols_width(
Question ~ px(220),
Summary ~ px(520)
)|>
tab_options(table.font.names = "Times New Roman")
| Question | Summary |
|---|---|
| Household Spend per Week on GF Diet | £11-£20: 32.6%, £21-£30: 28.2%, Over £40: 16.9%, £31-£40: 13%, £0-£10: 9.3% |
| Cost of GF Alternatives in Local Area | Much more expensive: 91.4%, Slightly more expensive: 8.1%, About the same: 0.5% |
| Avoided Buying GF Food Due To Cost | Yes, often: 44.6%, Yes, occasionally: 35.5%, No, never: 19.4%, Not sure: 0.5% |
| Cost of GF Food Limited Household Diet or Nutrition | No: 40.8%, Yes, somewhat: 39.1%, Yes, significantly: 13.4%, Not sure: 6.6% |
| Recieve GF Food on Prescription | No, not eligible in my area: 61.6%, No, eligible, but do not use it: 12.7%, Yes, regularly: 9.3%, No, unaware this was an option: 8.6%, Yes, occasionally: 7.8% |
| Financial Impact of Prescriptions | A significant difference: 37.6%, A moderate difference: 33.3%, A small difference: 19.4%, No difference: 9.7% |
| Cost of GF Alternatives When Eating Out | Expensive: 56.5%, Affordable: 30.1%, Very expensive: 13.2%, Very cheap: 0.2% |
| Avoided Eating Out Due to GF Alternative Cost | No, never: 38.7%, Yes, occasionally: 37.2%, Yes, often: 21.4%, Not sure: 2.7% |
| Cost of GF Food Caused To Limit Spending Elsewhere | Occasionally: 41.4%, Never: 32.6%, Frequently: 21.2%, Not sure: 4.9% |
out
## Question
## 1 Household Spend per Week on GF Diet
## 2 Cost of GF Alternatives in Local Area
## 3 Avoided Buying GF Food Due To Cost
## 4 Cost of GF Food Limited Household Diet or Nutrition
## 5 Recieve GF Food on Prescription
## 6 Financial Impact of Prescriptions
## 7 Cost of GF Alternatives When Eating Out
## 8 Avoided Eating Out Due to GF Alternative Cost
## 9 Cost of GF Food Caused To Limit Spending Elsewhere
## Summary
## 1 £11-£20: 32.6%, £21-£30: 28.2%, Over £40: 16.9%, £31-£40: 13%, £0-£10: 9.3%
## 2 Much more expensive: 91.4%, Slightly more expensive: 8.1%, About the same: 0.5%
## 3 Yes, often: 44.6%, Yes, occasionally: 35.5%, No, never: 19.4%, Not sure: 0.5%
## 4 No: 40.8%, Yes, somewhat: 39.1%, Yes, significantly: 13.4%, Not sure: 6.6%
## 5 No, not eligible in my area: 61.6%, No, eligible, but do not use it: 12.7%, Yes, regularly: 9.3%, No, unaware this was an option: 8.6%, Yes, occasionally: 7.8%
## 6 A significant difference: 37.6%, A moderate difference: 33.3%, A small difference: 19.4%, No difference: 9.7%
## 7 Expensive: 56.5%, Affordable: 30.1%, Very expensive: 13.2%, Very cheap: 0.2%
## 8 No, never: 38.7%, Yes, occasionally: 37.2%, Yes, often: 21.4%, Not sure: 2.7%
## 9 Occasionally: 41.4%, Never: 32.6%, Frequently: 21.2%, Not sure: 4.9%
indirect_cols = c(
"Availability",
"Travel.",
"Extra_time",
"Extra_time_shopping.",
"Time_off",
"Stress",
"Follow_GF.",
"QOL",
"Support.")
q_names = c(
Availability = "Availabilty of GF Food in Local Area",
Travel. = "Have to Travel Outside of Area for GF Food or Medical Care",
Extra_time = "Travel Time for GF Products",
Extra_time_shopping. = "Extra Weekly Time Spent on GF Shopping/Prep",
Time_off = "Time Off Work/School/Responsibilities Due to CD",
Stress = "Extent of Stress Due to CD Financial Impact",
Follow_GF. = "Financial Impacts Made Strict GFD Harder",
QOL = "Impact on household quality of life",
Support. = "Emotional/Mental Health Support Required")
out = do.call(rbind, lapply(indirect_cols, function(col) {
v = data[[col]]
v = v[!is.na(v) & v != ""]
if (length(v) == 0) return(NULL)
tab = table(v, useNA = "no")
tab = sort(tab, decreasing = TRUE)
pct = round(100 * prop.table(tab), 1)
summary_str = paste(paste0(names(pct), ": ", pct, "%"), collapse = ", ")
data.frame(
Question = ifelse(col %in% names(q_names), q_names[col], col),
Summary = summary_str,
stringsAsFactors = FALSE
)
}))
out
## Question
## 1 Have to Travel Outside of Area for GF Food or Medical Care
## 2 Travel Time for GF Products
## 3 Extra Weekly Time Spent on GF Shopping/Prep
## 4 Time Off Work/School/Responsibilities Due to CD
## 5 Extent of Stress Due to CD Financial Impact
## 6 Financial Impacts Made Strict GFD Harder
## 7 Impact on household quality of life
## 8 Emotional/Mental Health Support Required
## Summary
## 1 No: 56.8%, Yes: 43.2%
## 2 5–10 minutes: 39.9%, 11–20 minutes: 35%, 21–30 minutes: 12.5%, Less than 5 minutes: 8.6%, Over 30 minutes: 3.7%, I’m not sure: 0.5%
## 3 30 minutes - 1 hour: 30.8%, 1–2 hours : 26.9%, Less than 30 minutes: 18.4%, 3–4 hours: 12.4%, More than 4 hours: 6.8%, Not sure: 4.6%
## 4 Yes: 60.1%, No: 39.9%
## 5 A small amount: 42.3%, A moderate amount: 33%, None at all: 12.5%, A great deal: 12%, Not sure: 0.2%
## 6 No - I’ve always been able to manage it: 68.2%, Yes - occasionally: 24.9%, Yes - frequently: 6.8%
## 7 Yes - a slight impact: 52.3%, No impact: 27.1%, Yes - a significant impact: 16.9%, Not sure: 3.7%
## 8 No - not needed\u2028: 55.7%, No - but it would have been helpful\u2028: 23%, Yes - informal support (e.g. friends, support groups)\u2028: 15.6%, Yes - formal support (e.g. counselling, GP support)\u2028: 5.4%, Prefer not to say: 0.2%
out |>
gt() |>
cols_label(Question = "Question", Summary = "Summary") |>
opt_row_striping() |>
tab_options(table.font.size = px(12)) |>
tab_style(
style = cell_text(weight = "bold"),
locations = cells_body(columns = Question)
) |>
cols_width(
Question ~ px(220),
Summary ~ px(520)
)|>
tab_options(table.font.names = "Times New Roman")
| Question | Summary |
|---|---|
| Have to Travel Outside of Area for GF Food or Medical Care | No: 56.8%, Yes: 43.2% |
| Travel Time for GF Products | 5–10 minutes: 39.9%, 11–20 minutes: 35%, 21–30 minutes: 12.5%, Less than 5 minutes: 8.6%, Over 30 minutes: 3.7%, I’m not sure: 0.5% |
| Extra Weekly Time Spent on GF Shopping/Prep | 30 minutes - 1 hour: 30.8%, 1–2 hours : 26.9%, Less than 30 minutes: 18.4%, 3–4 hours: 12.4%, More than 4 hours: 6.8%, Not sure: 4.6% |
| Time Off Work/School/Responsibilities Due to CD | Yes: 60.1%, No: 39.9% |
| Extent of Stress Due to CD Financial Impact | A small amount: 42.3%, A moderate amount: 33%, None at all: 12.5%, A great deal: 12%, Not sure: 0.2% |
| Financial Impacts Made Strict GFD Harder | No - I’ve always been able to manage it: 68.2%, Yes - occasionally: 24.9%, Yes - frequently: 6.8% |
| Impact on household quality of life | Yes - a slight impact: 52.3%, No impact: 27.1%, Yes - a significant impact: 16.9%, Not sure: 3.7% |
| Emotional/Mental Health Support Required | No - not needed : 55.7%, No - but it would have been helpful : 23%, Yes - informal support (e.g. friends, support groups) : 15.6%, Yes - formal support (e.g. counselling, GP support) : 5.4%, Prefer not to say: 0.2% |
out
## Question
## 1 Have to Travel Outside of Area for GF Food or Medical Care
## 2 Travel Time for GF Products
## 3 Extra Weekly Time Spent on GF Shopping/Prep
## 4 Time Off Work/School/Responsibilities Due to CD
## 5 Extent of Stress Due to CD Financial Impact
## 6 Financial Impacts Made Strict GFD Harder
## 7 Impact on household quality of life
## 8 Emotional/Mental Health Support Required
## Summary
## 1 No: 56.8%, Yes: 43.2%
## 2 5–10 minutes: 39.9%, 11–20 minutes: 35%, 21–30 minutes: 12.5%, Less than 5 minutes: 8.6%, Over 30 minutes: 3.7%, I’m not sure: 0.5%
## 3 30 minutes - 1 hour: 30.8%, 1–2 hours : 26.9%, Less than 30 minutes: 18.4%, 3–4 hours: 12.4%, More than 4 hours: 6.8%, Not sure: 4.6%
## 4 Yes: 60.1%, No: 39.9%
## 5 A small amount: 42.3%, A moderate amount: 33%, None at all: 12.5%, A great deal: 12%, Not sure: 0.2%
## 6 No - I’ve always been able to manage it: 68.2%, Yes - occasionally: 24.9%, Yes - frequently: 6.8%
## 7 Yes - a slight impact: 52.3%, No impact: 27.1%, Yes - a significant impact: 16.9%, Not sure: 3.7%
## 8 No - not needed\u2028: 55.7%, No - but it would have been helpful\u2028: 23%, Yes - informal support (e.g. friends, support groups)\u2028: 15.6%, Yes - formal support (e.g. counselling, GP support)\u2028: 5.4%, Prefer not to say: 0.2%
# Turn off scientific notation
options(scipen = 999)
# Convert dependent variable to ordinal
data$Risk = factor(data$Risk,
levels = c("Low Risk", "Moderate Risk", "High Risk"),
ordered = TRUE)
# Convert rest of categorical variables to factor
data$Gender. = as.factor(data$Gender.)
data$Age = as.factor(data$Age)
data$Education = as.factor(data$Education)
data$Income. = as.factor(data$Income.)
data$Car. = as.factor(data$Car.)
data$Urban_rural_flag = as.factor(data$Urban_rural_flag)
# Set test train - using caret to keeep proportional classes
set.seed(123)
# Create partition — stratified by outcome variable
train_indices = createDataPartition(data$Risk, p = 0.7, list = FALSE)
train_data = data[train_indices, ]
test_data = data[-train_indices, ]
# clean data
train_data_clean = na.omit(train_data)
test_data_clean = na.omit(test_data)
# results table
results = data.frame(
Variable = character(),
Chi_Squared_p = numeric(),
Fisher_p = numeric(),
ANOVA_p = numeric(),
Test_Type = character(),
stringsAsFactors = FALSE
)
# Categorical variables
# Gender
chi_gender = chisq.test(train_data_clean$Gender., train_data_clean$Risk)
fisher_gender = fisher.test(train_data_clean$Gender., train_data_clean$Risk, simulate.p.value=TRUE)
results = rbind(results, data.frame(
Variable = "Gender",
Chi_Squared_p = chi_gender$p.value,
Fisher_p = fisher_gender$p.value,
ANOVA_p = NA,
Test_Type = "Categorical"
))
# Age
chi_age = chisq.test(train_data_clean$Age, train_data_clean$Risk)
fisher_age = fisher.test(train_data_clean$Age, train_data_clean$Risk, simulate.p.value=TRUE)
results = rbind(results, data.frame(
Variable = "Age",
Chi_Squared_p = chi_age$p.value,
Fisher_p = fisher_age$p.value,
ANOVA_p = NA,
Test_Type = "Categorical"
))
# Education
chi_edu = chisq.test(train_data_clean$Education, train_data_clean$Risk)
fisher_edu = fisher.test(train_data_clean$Education, train_data_clean$Risk, simulate.p.value=TRUE)
results = rbind(results, data.frame(
Variable = "Education",
Chi_Squared_p = chi_edu$p.value,
Fisher_p = fisher_edu$p.value,
ANOVA_p = NA,
Test_Type = "Categorical"
))
# Income
chi_income = chisq.test(train_data_clean$Income., train_data_clean$Risk)
fisher_income = fisher.test(train_data_clean$Income., train_data_clean$Risk, simulate.p.value=TRUE)
results = rbind(results, data.frame(
Variable = "Income",
Chi_Squared_p = chi_income$p.value,
Fisher_p = fisher_income$p.value,
ANOVA_p = NA,
Test_Type = "Categorical"
))
# Car
chi_car = chisq.test(train_data_clean$Car., train_data_clean$Risk)
fisher_car = fisher.test(train_data_clean$Car., train_data_clean$Risk, simulate.p.value=TRUE)
results = rbind(results, data.frame(
Variable = "Car",
Chi_Squared_p = chi_car$p.value,
Fisher_p = fisher_car$p.value,
ANOVA_p = NA,
Test_Type = "Categorical"
))
# Urban/Rural
chi_urban = chisq.test(train_data_clean$Urban_rural_flag, train_data_clean$Risk)
fisher_urban = fisher.test(train_data_clean$Urban_rural_flag, train_data_clean$Risk, simulate.p.value=TRUE)
results = rbind(results, data.frame(
Variable = "Urban_Rural",
Chi_Squared_p = chi_urban$p.value,
Fisher_p = fisher_urban$p.value,
ANOVA_p = NA,
Test_Type = "Categorical"
))
# Continuous variables
# SOA deciles
aov_soa = aov(SOA_decile ~ Risk, data = train_data_clean)
soa_p = summary(aov_soa)[[1]][["Pr(>F)"]][1]
results = rbind(results, data.frame(
Variable = "SOA_decile",
Chi_Squared_p = NA,
Fisher_p = NA,
ANOVA_p = soa_p,
Test_Type = "Continuous"
))
# Time to GP
aov_gp <- aov(min_time_to_gp ~ Risk, data = train_data_clean)
gp_p <- summary(aov_gp)[[1]][["Pr(>F)"]][1]
results <- rbind(results, data.frame(
Variable = "Time_to_GP",
Chi_Squared_p = NA,
Fisher_p = NA,
ANOVA_p = gp_p,
Test_Type = "Continuous"
))
results = results %>%
mutate(
Variable = recode(
Variable,
"Urban_Rural" = "Urban–Rural Classification",
"SOA_decile" = "Deprivation decile (LSOA)",
"Time_to_GP" = "Time to GP (mins)",
"Car" = "Car access",
"Income" = "Household income",
"Education" = "Education level"
)
)
nice_tbl = results %>%
mutate(
Chi_Squared_p = round(Chi_Squared_p, 4),
Fisher_p = round(Fisher_p, 4),
ANOVA_p = round(ANOVA_p, 4),
Conservative = if_else(
(Test_Type == "Categorical" & Chi_Squared_p < 0.10) |
(Test_Type == "Continuous" & ANOVA_p < 0.10), "Yes","No"),
Moderate = if_else(
(Test_Type == "Categorical" & Chi_Squared_p < 0.50) |
(Test_Type == "Continuous" & ANOVA_p < 0.50), "Yes","No")
) %>%
gt(groupname_col = "Test_Type") %>%
cols_label(
Chi_Squared_p = html("Chi-Squared<br>p"),
Fisher_p = html("Fisher’s Exact<br>p"),
ANOVA_p = html("ANOVA<br>p"),
Conservative = html("Significant<br>(α = 0.10)"),
Moderate = html("Significant<br>(α = 0.50)")
) %>%
tab_style(
style = list(cell_fill(color = "#E8F5E9"), cell_text(weight = "bold")),
locations = cells_body(columns = Conservative, rows = Conservative == "Yes")
) %>%
tab_style(
style = list(cell_fill(color = "#E8F5E9"), cell_text(weight = "bold")),
locations = cells_body(columns = Moderate, rows = Moderate == "Yes")
) %>%
tab_spanner(label = "p-values", columns = c(Chi_Squared_p, Fisher_p, ANOVA_p)) %>%
fmt_number(columns = c(Chi_Squared_p, Fisher_p, ANOVA_p), decimals = 4) %>%
sub_missing(columns = everything(), missing_text = "—") %>%
opt_row_striping()
nice_tbl
| Variable |
p-values
|
Significant (α = 0.10) |
Significant (α = 0.50) |
||
|---|---|---|---|---|---|
| Chi-Squared p |
Fisher’s Exact p |
ANOVA p |
|||
| Categorical | |||||
| Gender | 0.7439 | 1.0000 | — | No | No |
| Age | 0.5923 | 0.6087 | — | No | No |
| Education level | 0.1190 | 0.1109 | — | No | Yes |
| Household income | 0.0001 | 0.0005 | — | Yes | Yes |
| Car access | 0.1918 | 0.2484 | — | No | Yes |
| Urban–Rural Classification | 0.4602 | 0.4633 | — | No | Yes |
| Continuous | |||||
| Deprivation decile (LSOA) | — | — | 0.0098 | Yes | Yes |
| Time to GP (mins) | — | — | 0.2221 | No | Yes |
# Change size and shape of table
nice_tbl = nice_tbl %>%
tab_options(table.width = pct(80)) %>%
cols_width(
everything() ~ px(120)
) %>%
tab_options(
table.font.size = px(11),
data_row.padding = px(2),
column_labels.padding = px(2),
row_group.padding = px(2),
heading.padding = px(0)
) %>%
cols_label(
Chi_Squared_p = "Chi-Squared p",
Fisher_p = "Fisher’s Exact p",
ANOVA_p = "ANOVA p",
Conservative = "Significant (p < 0.10)",
Moderate = "Significant (p < 0.50)",
)
nice_tbl = nice_tbl %>%
gt::tab_options(table.font.names = c("Times New Roman", "Times", "serif"))
nice_tbl
| Variable |
p-values
|
Significant (p < 0.10) | Significant (p < 0.50) | ||
|---|---|---|---|---|---|
| Chi-Squared p | Fisher’s Exact p | ANOVA p | |||
| Categorical | |||||
| Gender | 0.7439 | 1.0000 | — | No | No |
| Age | 0.5923 | 0.6087 | — | No | No |
| Education level | 0.1190 | 0.1109 | — | No | Yes |
| Household income | 0.0001 | 0.0005 | — | Yes | Yes |
| Car access | 0.1918 | 0.2484 | — | No | Yes |
| Urban–Rural Classification | 0.4602 | 0.4633 | — | No | Yes |
| Continuous | |||||
| Deprivation decile (LSOA) | — | — | 0.0098 | Yes | Yes |
| Time to GP (mins) | — | — | 0.2221 | No | Yes |
# Run model
model = polr(Risk ~ Income. + SOA_decile, data = train_data_clean, Hess=TRUE)
confusionMatrix(predict(model, test_data_clean),
test_data_clean$Risk)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Low Risk Moderate Risk High Risk
## Low Risk 71 33 2
## Moderate Risk 5 8 1
## High Risk 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.6583
## 95% CI : (0.5662, 0.7424)
## No Information Rate : 0.6333
## P-Value [Acc > NIR] : 0.3203
##
## Kappa : 0.1473
##
## Mcnemar's Test P-Value : 0.00002982
##
## Statistics by Class:
##
## Class: Low Risk Class: Moderate Risk Class: High Risk
## Sensitivity 0.9342 0.19512 0.000
## Specificity 0.2045 0.92405 1.000
## Pos Pred Value 0.6698 0.57143 NaN
## Neg Pred Value 0.6429 0.68868 0.975
## Prevalence 0.6333 0.34167 0.025
## Detection Rate 0.5917 0.06667 0.000
## Detection Prevalence 0.8833 0.11667 0.000
## Balanced Accuracy 0.5694 0.55959 0.500
# Check if parallel regression assumption holds
brant(model)
## --------------------------------------------------------------------
## Test for X2 df probability
## --------------------------------------------------------------------
## Omnibus 10.54 5 0.06
## Income.£15,000 to £30,000 per year 0 1 0.95
## Income.£30,000 to £50,000 per year 0.03 1 0.87
## Income.£50,000 to £100,000 per year 0.55 1 0.46
## Income.Above £100,000 per year 3.35 1 0.07
## SOA_decile 1.16 1 0.28
## --------------------------------------------------------------------
##
## H0: Parallel Regression Assumption holds
# Run model
model = polr(Risk ~ Income. + SOA_decile + Education + Car. + Urban_rural_flag + min_time_to_gp, data = train_data_clean, Hess=TRUE)
confusionMatrix(predict(model, test_data_clean),
test_data_clean$Risk)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Low Risk Moderate Risk High Risk
## Low Risk 69 32 2
## Moderate Risk 7 9 1
## High Risk 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.65
## 95% CI : (0.5576, 0.7348)
## No Information Rate : 0.6333
## P-Value [Acc > NIR] : 0.3912068
##
## Kappa : 0.1421
##
## Mcnemar's Test P-Value : 0.0002701
##
## Statistics by Class:
##
## Class: Low Risk Class: Moderate Risk Class: High Risk
## Sensitivity 0.9079 0.2195 0.000
## Specificity 0.2273 0.8987 1.000
## Pos Pred Value 0.6699 0.5294 NaN
## Neg Pred Value 0.5882 0.6893 0.975
## Prevalence 0.6333 0.3417 0.025
## Detection Rate 0.5750 0.0750 0.000
## Detection Prevalence 0.8583 0.1417 0.000
## Balanced Accuracy 0.5676 0.5591 0.500
brant(model)
## --------------------------------------------------------------------------------------------------------------------
## Test for X2 df probability
## --------------------------------------------------------------------------------------------------------------------
## Omnibus 10.23 10 0.42
## Income.£15,000 to £30,000 per year 0 1 0.98
## Income.£30,000 to £50,000 per year 0 1 0.97
## Income.£50,000 to £100,000 per year 0.72 1 0.4
## Income.Above £100,000 per year 2.62 1 0.11
## SOA_decile 0.61 1 0.44
## EducationHigher or secondary or further education (A-Levels, BTEC, etc.) 0.33 1 0.57
## EducationSecondary school up to 16 years 0 1 0.99
## Car.Yes 0 1 0.99
## Urban_rural_flagUrban 1.14 1 0.29
## min_time_to_gp 0.04 1 0.84
## --------------------------------------------------------------------------------------------------------------------
##
## H0: Parallel Regression Assumption holds
set.seed(111)
# Run model
rf_model = randomForest(
Risk ~ Income. + SOA_decile,
data = train_data_clean,
importance = TRUE,
ntree = 500
)
# Predict
confusionMatrix(predict(rf_model, test_data_clean),
test_data_clean$Risk)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Low Risk Moderate Risk High Risk
## Low Risk 69 31 2
## Moderate Risk 7 10 1
## High Risk 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.6583
## 95% CI : (0.5662, 0.7424)
## No Information Rate : 0.6333
## P-Value [Acc > NIR] : 0.3203424
##
## Kappa : 0.1675
##
## Mcnemar's Test P-Value : 0.0004081
##
## Statistics by Class:
##
## Class: Low Risk Class: Moderate Risk Class: High Risk
## Sensitivity 0.9079 0.24390 0.000
## Specificity 0.2500 0.89873 1.000
## Pos Pred Value 0.6765 0.55556 NaN
## Neg Pred Value 0.6111 0.69608 0.975
## Prevalence 0.6333 0.34167 0.025
## Detection Rate 0.5750 0.08333 0.000
## Detection Prevalence 0.8500 0.15000 0.000
## Balanced Accuracy 0.5789 0.57132 0.500
set.seed(111)
rf_model_2 = randomForest(
Risk ~ Education + Income. + Car. + SOA_decile + min_time_to_gp + Urban_rural_flag,
data = train_data_clean,
importance = TRUE,
ntree = 500
)
confusionMatrix(predict(rf_model_2, test_data_clean),
test_data_clean$Risk)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Low Risk Moderate Risk High Risk
## Low Risk 63 29 1
## Moderate Risk 13 12 2
## High Risk 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.625
## 95% CI : (0.532, 0.7117)
## No Information Rate : 0.6333
## P-Value [Acc > NIR] : 0.61475
##
## Kappa : 0.1325
##
## Mcnemar's Test P-Value : 0.02805
##
## Statistics by Class:
##
## Class: Low Risk Class: Moderate Risk Class: High Risk
## Sensitivity 0.8289 0.2927 0.000
## Specificity 0.3182 0.8101 1.000
## Pos Pred Value 0.6774 0.4444 NaN
## Neg Pred Value 0.5185 0.6882 0.975
## Prevalence 0.6333 0.3417 0.025
## Detection Rate 0.5250 0.1000 0.000
## Detection Prevalence 0.7750 0.2250 0.000
## Balanced Accuracy 0.5736 0.5514 0.500
set.seed(111)
# Build model matrices for test and train
mm = function(df) model.matrix(~ 0 + Income. + SOA_decile, data = df)
X_train = mm(train_data_clean)
X_test = mm(test_data_clean)
# align test columns to train columns
train_cols = colnames(X_train)
missing = setdiff(train_cols, colnames(X_test))
if (length(missing)) {
X_test = cbind(X_test, matrix(0, nrow(X_test), length(missing),
dimnames = list(NULL, missing)))
}
X_test <- X_test[, train_cols, drop = FALSE]
# Outcome factor - create levels
risk_levels = make.names(c("Low Risk","Moderate Risk","High Risk"))
y_train = factor(make.names(as.character(train_data_clean$Risk)), levels = risk_levels)
y_test = factor(make.names(as.character(test_data_clean$Risk)), levels = risk_levels)
# Create tuning grid
xgb_grid = expand.grid(
nrounds = c(100, 200),
max_depth = c(3, 5, 7),
eta = c(0.05, 0.1),
gamma = c(0, 1),
colsample_bytree = c(0.6, 0.8),
min_child_weight = c(1, 5),
subsample = 0.8
)
control = trainControl(
method = "cv",
number = 5,
verboseIter = TRUE,
allowParallel = TRUE,
classProbs = TRUE,
summaryFunction = multiClassSummary
)
# Train model
xgb_tuned = train(
x = X_train,
y = y_train,
method = "xgbTree",
trControl = control,
tuneGrid = xgb_grid,
metric = "Accuracy"
)
# predict on test data
prob_df = predict(xgb_tuned, newdata = X_test, type = "prob")
pred_lab = factor(colnames(prob_df)[max.col(as.matrix(prob_df))], levels = risk_levels)
# Confusion matrix
caret::confusionMatrix(pred_lab, y_test)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Low.Risk Moderate.Risk High.Risk
## Low.Risk 73 31 3
## Moderate.Risk 3 10 0
## High.Risk 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.6917
## 95% CI : (0.6009, 0.7727)
## No Information Rate : 0.6333
## P-Value [Acc > NIR] : 0.1082
##
## Kappa : 0.2258
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Low.Risk Class: Moderate.Risk Class: High.Risk
## Sensitivity 0.9605 0.24390 0.000
## Specificity 0.2273 0.96203 1.000
## Pos Pred Value 0.6822 0.76923 NaN
## Neg Pred Value 0.7692 0.71028 0.975
## Prevalence 0.6333 0.34167 0.025
## Detection Rate 0.6083 0.08333 0.000
## Detection Prevalence 0.8917 0.10833 0.000
## Balanced Accuracy 0.5939 0.60296 0.500
set.seed(111)
# Create model matrices for test and train
mm = function(df) model.matrix(~ 0 + Education + Car. + Urban_rural_flag + Income. + SOA_decile + min_time_to_gp, data = df)
X_train = mm(train_data_clean)
X_test = mm(test_data_clean)
# align test columns to train columns
train_cols = colnames(X_train)
missing = setdiff(train_cols, colnames(X_test))
if (length(missing)) {
X_test <- cbind(X_test, matrix(0, nrow(X_test), length(missing),
dimnames = list(NULL, missing)))
}
X_test = X_test[, train_cols, drop = FALSE]
# Outcome factor
risk_levels = make.names(c("Low Risk","Moderate Risk","High Risk"))
y_train = factor(make.names(as.character(train_data_clean$Risk)), levels = risk_levels)
y_test = factor(make.names(as.character(test_data_clean$Risk)), levels = risk_levels)
# Create tuning grid
xgb_grid = expand.grid(
nrounds = c(100, 200),
max_depth = c(3, 5, 7),
eta = c(0.05, 0.1),
gamma = c(0, 1),
colsample_bytree = c(0.6, 0.8),
min_child_weight = c(1, 5),
subsample = 0.8
)
control = trainControl(
method = "cv",
number = 5,
verboseIter = TRUE,
allowParallel = TRUE,
classProbs = TRUE,
summaryFunction = multiClassSummary
)
# Train model
xgb_tuned = train(
x = X_train,
y = y_train,
method = "xgbTree",
trControl = control,
tuneGrid = xgb_grid,
metric = "Accuracy"
)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
# Predict on test data
prob_df = predict(xgb_tuned, newdata = X_test, type = "prob") # columns == risk_levels
pred_lab = factor(colnames(prob_df)[max.col(as.matrix(prob_df))], levels = risk_levels)
# Confusion matrix
caret::confusionMatrix(pred_lab, y_test)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Low.Risk Moderate.Risk High.Risk
## Low.Risk 66 28 1
## Moderate.Risk 10 13 2
## High.Risk 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.6583
## 95% CI : (0.5662, 0.7424)
## No Information Rate : 0.6333
## P-Value [Acc > NIR] : 0.320342
##
## Kappa : 0.2006
##
## Mcnemar's Test P-Value : 0.009195
##
## Statistics by Class:
##
## Class: Low.Risk Class: Moderate.Risk Class: High.Risk
## Sensitivity 0.8684 0.3171 0.000
## Specificity 0.3409 0.8481 1.000
## Pos Pred Value 0.6947 0.5200 NaN
## Neg Pred Value 0.6000 0.7053 0.975
## Prevalence 0.6333 0.3417 0.025
## Detection Rate 0.5500 0.1083 0.000
## Detection Prevalence 0.7917 0.2083 0.000
## Balanced Accuracy 0.6047 0.5826 0.500
# Create train matrix
x_train = model.matrix(~ 0 + Income. + SOA_decile,
data = train_data_clean)
# Outcome labels
risk_levels = make.names(c("Low Risk","Moderate Risk","High Risk"))
y_train = factor(make.names(as.character(train_data_clean$Risk)), levels = risk_levels)
# Test matrix
x_test = model.matrix(~ 0 + Income. + SOA_decile,
data = test_data_clean)
# Align test and train
train_cols = colnames(x_train)
missing = setdiff(train_cols, colnames(x_test))
if (length(missing)) {
x_test = cbind(x_test, matrix(0, nrow(x_test), length(missing),
dimnames = list(NULL, missing)))
}
x_test = x_test[, train_cols, drop = FALSE]
set.seed(111)
# Create tuning grid
nnet_grid = expand.grid(size = c(3,5,7), decay = c(0.01, 0.1, 0.5))
control = trainControl(
method = "cv",
number = 5,
classProbs = TRUE,
summaryFunction = multiClassSummary,
verboseIter = TRUE
)
# Run model
nnet_model = caret::train(
x = x_train,
y = y_train,
method = "nnet",
trControl = control,
tuneGrid = nnet_grid,
metric = "Accuracy",
preProcess = c("center","scale"),
maxit = 200,
trace = FALSE
)
# Predict
pred_labels = predict(nnet_model, newdata = x_test)
actual_labels = factor(make.names(as.character(test_data_clean$Risk)),
levels = risk_levels)
# Confusion matrix
caret::confusionMatrix(pred_labels, actual_labels)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Low.Risk Moderate.Risk High.Risk
## Low.Risk 70 29 2
## Moderate.Risk 6 12 1
## High.Risk 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.6833
## 95% CI : (0.5922, 0.7652)
## No Information Rate : 0.6333
## P-Value [Acc > NIR] : 0.1485258
##
## Kappa : 0.233
##
## Mcnemar's Test P-Value : 0.0004166
##
## Statistics by Class:
##
## Class: Low.Risk Class: Moderate.Risk Class: High.Risk
## Sensitivity 0.9211 0.2927 0.000
## Specificity 0.2955 0.9114 1.000
## Pos Pred Value 0.6931 0.6316 NaN
## Neg Pred Value 0.6842 0.7129 0.975
## Prevalence 0.6333 0.3417 0.025
## Detection Rate 0.5833 0.1000 0.000
## Detection Prevalence 0.8417 0.1583 0.000
## Balanced Accuracy 0.6083 0.6020 0.500
# Train matrix
x_train = model.matrix(~ 0 + Income. + Education + SOA_decile + Car. + Urban_rural_flag + min_time_to_gp,
data = train_data_clean)
# Outcome labels
risk_levels = make.names(c("Low Risk","Moderate Risk","High Risk"))
y_train = factor(make.names(as.character(train_data_clean$Risk)), levels = risk_levels)
# Test matrix
x_test = model.matrix(~ 0 + Income. + SOA_decile + Education + Car. + Urban_rural_flag + min_time_to_gp,
data = test_data_clean)
# Align test and train
train_cols = colnames(x_train)
missing = setdiff(train_cols, colnames(x_test))
if (length(missing)) {
x_test = cbind(x_test, matrix(0, nrow(x_test), length(missing),
dimnames = list(NULL, missing)))
}
x_test = x_test[, train_cols, drop = FALSE]
set.seed(111)
nnet_grid = expand.grid(size = c(3,5,7), decay = c(0.01, 0.1, 0.5))
control = trainControl(
method = "cv",
number = 5,
classProbs = TRUE,
summaryFunction = multiClassSummary,
verboseIter = TRUE
)
nnet_model = caret::train(
x = x_train,
y = y_train,
method = "nnet",
trControl = control,
tuneGrid = nnet_grid,
metric = "Accuracy",
preProcess = c("center","scale"),
maxit = 200,
trace = FALSE
)
pred_labels = predict(nnet_model, newdata = x_test)
actual_labels = factor(make.names(as.character(test_data_clean$Risk)),
levels = risk_levels)
caret::confusionMatrix(pred_labels, actual_labels)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Low.Risk Moderate.Risk High.Risk
## Low.Risk 59 29 1
## Moderate.Risk 17 12 2
## High.Risk 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.5917
## 95% CI : (0.4982, 0.6805)
## No Information Rate : 0.6333
## P-Value [Acc > NIR] : 0.8511
##
## Kappa : 0.0762
##
## Mcnemar's Test P-Value : 0.1054
##
## Statistics by Class:
##
## Class: Low.Risk Class: Moderate.Risk Class: High.Risk
## Sensitivity 0.7763 0.2927 0.000
## Specificity 0.3182 0.7595 1.000
## Pos Pred Value 0.6629 0.3871 NaN
## Neg Pred Value 0.4516 0.6742 0.975
## Prevalence 0.6333 0.3417 0.025
## Detection Rate 0.4917 0.1000 0.000
## Detection Prevalence 0.7417 0.2583 0.000
## Balanced Accuracy 0.5472 0.5261 0.500
# Creating binary dependent variable
data$BinaryRisk = ifelse(data$Risk %in% c("High Risk", "Moderate Risk"),
"At Risk",
"Low Risk")
# Make it a factor with consistent level order
data$BinaryRisk = factor(data$BinaryRisk, levels = c("Low Risk", "At Risk"))
# Create same test train split
# Set test train - using caret to keep proprtional classes
set.seed(123)
train_indices = createDataPartition(data$Risk, p = 0.7, list = FALSE)
train_data = data[train_indices, ]
test_data = data[-train_indices, ]
# clean data
train_data_clean = na.omit(train_data)
test_data_clean = na.omit(test_data)
# results table
results = data.frame(
Variable = character(),
Chi_Squared_p = numeric(),
Fisher_p = numeric(),
T_Test_p = numeric(),
Test_Type = character(),
stringsAsFactors = FALSE
)
# Categorical variables
# Gender
chi_gender = chisq.test(train_data_clean$Gender., train_data_clean$BinaryRisk)
fisher_gender = fisher.test(train_data_clean$Gender., train_data_clean$BinaryRisk, simulate.p.value=TRUE)
results = rbind(results, data.frame(
Variable = "Gender",
Chi_Squared_p = chi_gender$p.value,
Fisher_p = fisher_gender$p.value,
T_Test_p = NA,
Test_Type = "Categorical"
))
# Age
chi_age = chisq.test(train_data_clean$Age, train_data_clean$BinaryRisk)
fisher_age = fisher.test(train_data_clean$Age, train_data_clean$BinaryRisk, simulate.p.value=TRUE)
results = rbind(results, data.frame(
Variable = "Age",
Chi_Squared_p = chi_age$p.value,
Fisher_p = fisher_age$p.value,
T_Test_p = NA,
Test_Type = "Categorical"
))
# Education
chi_edu = chisq.test(train_data_clean$Education, train_data_clean$BinaryRisk)
fisher_edu = fisher.test(train_data_clean$Education, train_data_clean$BinaryRisk, simulate.p.value=TRUE)
results = rbind(results, data.frame(
Variable = "Education",
Chi_Squared_p = chi_edu$p.value,
Fisher_p = fisher_edu$p.value,
T_Test_p = NA,
Test_Type = "Categorical"
))
# Income
chi_income = chisq.test(train_data_clean$Income., train_data_clean$BinaryRisk)
fisher_income = fisher.test(train_data_clean$Income., train_data_clean$BinaryRisk, simulate.p.value=TRUE)
results = rbind(results, data.frame(
Variable = "Income",
Chi_Squared_p = chi_income$p.value,
Fisher_p = fisher_income$p.value,
T_Test_p = NA,
Test_Type = "Categorical"
))
# Car
chi_car = chisq.test(train_data_clean$Car., train_data_clean$BinaryRisk)
fisher_car = fisher.test(train_data_clean$Car., train_data_clean$BinaryRisk, simulate.p.value=TRUE)
results = rbind(results, data.frame(
Variable = "Car",
Chi_Squared_p = chi_car$p.value,
Fisher_p = fisher_car$p.value,
T_Test_p = NA,
Test_Type = "Categorical"
))
# Urban/Rural
chi_urban = chisq.test(train_data_clean$Urban_rural_flag, train_data_clean$BinaryRisk)
fisher_urban = fisher.test(train_data_clean$Urban_rural_flag, train_data_clean$BinaryRisk, simulate.p.value=TRUE)
results = rbind(results, data.frame(
Variable = "Urban_Rural",
Chi_Squared_p = chi_urban$p.value,
Fisher_p = fisher_urban$p.value,
T_Test_p = NA,
Test_Type = "Categorical"
))
# Continuous variables - t test
# SOA decile
ttest_soa = t.test(SOA_decile ~ BinaryRisk, data = train_data_clean)
soa_p = ttest_soa$p.value
results = rbind(results, data.frame(
Variable = "SOA_decile",
Chi_Squared_p = NA,
Fisher_p = NA,
T_Test_p = soa_p,
Test_Type = "Continuous"
))
# Time to GP
ttest_gp = t.test(min_time_to_gp ~ BinaryRisk, data = train_data_clean)
gp_p = ttest_gp$p.value
results = rbind(results, data.frame(
Variable = "Time_to_GP",
Chi_Squared_p = NA,
Fisher_p = NA,
T_Test_p = gp_p,
Test_Type = "Continuous"
))
results = results %>%
mutate(
Variable = recode(
Variable,
"Urban_Rural" = "Urban–Rural Classification",
"SOA_decile" = "Deprivation decile (LSOA)",
"Time_to_GP" = "Time to GP (mins)",
"Car" = "Car access",
"Income" = "Household income",
"Education" = "Education level"
)
)
nice_tbl = results %>%
mutate(
Chi_Squared_p = round(Chi_Squared_p, 4),
Fisher_p = round(Fisher_p, 4),
T_Test_p = round(T_Test_p, 4),
Conservative = if_else(
(Test_Type == "Categorical" & Chi_Squared_p < 0.10) |
(Test_Type == "Continuous" & T_Test_p < 0.10), "Yes","No"),
Moderate = if_else(
(Test_Type == "Categorical" & Chi_Squared_p < 0.50) |
(Test_Type == "Continuous" & T_Test_p < 0.50), "Yes","No")
) %>%
gt(groupname_col = "Test_Type") %>%
# <- no tab_header() here
cols_label(
Chi_Squared_p = html("Chi-Squared<br>p"),
Fisher_p = html("Fisher’s Exact<br>p"),
T_Test_p = html("T_Test_p<br>p"),
Conservative = html("Significant<br>(α = 0.10)"),
Moderate = html("Significant<br>(α = 0.50)")
) %>%
tab_style(
style = list(cell_fill(color = "#E8F5E9"), cell_text(weight = "bold")),
locations = cells_body(columns = Conservative, rows = Conservative == "Yes")
) %>%
tab_style(
style = list(cell_fill(color = "#E8F5E9"), cell_text(weight = "bold")),
locations = cells_body(columns = Moderate, rows = Moderate == "Yes")
) %>%
tab_spanner(label = "p-values", columns = c(Chi_Squared_p, Fisher_p, T_Test_p)) %>%
fmt_number(columns = c(Chi_Squared_p, Fisher_p, T_Test_p), decimals = 4) %>%
sub_missing(columns = everything(), missing_text = "—") %>%
opt_row_striping()
nice_tbl
| Variable |
p-values
|
Significant (α = 0.10) |
Significant (α = 0.50) |
||
|---|---|---|---|---|---|
| Chi-Squared p |
Fisher’s Exact p |
T_Test_p p |
|||
| Categorical | |||||
| Gender | 1.0000 | 1.0000 | — | No | No |
| Age | 0.4873 | 0.4813 | — | No | Yes |
| Education level | 0.0661 | 0.0670 | — | Yes | Yes |
| Household income | 0.0000 | 0.0005 | — | Yes | Yes |
| Car access | 0.2461 | 0.1956 | — | No | Yes |
| Urban–Rural Classification | 0.3722 | 0.3429 | — | No | Yes |
| Continuous | |||||
| Deprivation decile (LSOA) | — | — | 0.0040 | Yes | Yes |
| Time to GP (mins) | — | — | 0.1478 | No | Yes |
# Change shape and size of table
nice_tbl = nice_tbl %>%
tab_options(table.width = pct(80)) %>%
cols_width(
everything() ~ px(120)
) %>%
tab_options(
table.font.size = px(11),
data_row.padding = px(2),
column_labels.padding = px(2),
row_group.padding = px(2),
heading.padding = px(0)
) %>%
cols_label(
Chi_Squared_p = "Chi-Squared p",
Fisher_p = "Fisher’s Exact p",
T_Test_p = "T Test p",
Conservative = "Significant (p < 0.10)",
Moderate = "Significant (p < 0.50)",
)
nice_tbl = nice_tbl %>%
gt::tab_options(table.font.names = c("Times New Roman", "Times", "serif"))
nice_tbl
| Variable |
p-values
|
Significant (p < 0.10) | Significant (p < 0.50) | ||
|---|---|---|---|---|---|
| Chi-Squared p | Fisher’s Exact p | T Test p | |||
| Categorical | |||||
| Gender | 1.0000 | 1.0000 | — | No | No |
| Age | 0.4873 | 0.4813 | — | No | Yes |
| Education level | 0.0661 | 0.0670 | — | Yes | Yes |
| Household income | 0.0000 | 0.0005 | — | Yes | Yes |
| Car access | 0.2461 | 0.1956 | — | No | Yes |
| Urban–Rural Classification | 0.3722 | 0.3429 | — | No | Yes |
| Continuous | |||||
| Deprivation decile (LSOA) | — | — | 0.0040 | Yes | Yes |
| Time to GP (mins) | — | — | 0.1478 | No | Yes |
# Run model
binary = glm(BinaryRisk ~ Income. + SOA_decile + Education,
data = train_data_clean,
family = binomial(link = "logit"))
# Predict probabilities
pred_probs = predict(binary, newdata = test_data_clean, type = "response")
# Convert to class labels based on standard threshold
pred_classes = ifelse(pred_probs > 0.5, "At Risk", "Low Risk")
# Make both factors with consistent levels
pred_factor = factor(pred_classes, levels = c("Low Risk", "At Risk"))
actual_factor = factor(test_data_clean$BinaryRisk, levels = c("Low Risk", "At Risk"))
# Confusion matrix
confusionMatrix(pred_factor, actual_factor)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Low Risk At Risk
## Low Risk 70 32
## At Risk 6 12
##
## Accuracy : 0.6833
## 95% CI : (0.5922, 0.7652)
## No Information Rate : 0.6333
## P-Value [Acc > NIR] : 0.1485
##
## Kappa : 0.2213
##
## Mcnemar's Test P-Value : 0.00005002
##
## Sensitivity : 0.9211
## Specificity : 0.2727
## Pos Pred Value : 0.6863
## Neg Pred Value : 0.6667
## Prevalence : 0.6333
## Detection Rate : 0.5833
## Detection Prevalence : 0.8500
## Balanced Accuracy : 0.5969
##
## 'Positive' Class : Low Risk
##
# Run model
binary = glm(BinaryRisk ~ Age + Education + Income. + Car. +
SOA_decile + Urban_rural_flag + min_time_to_gp,
data = train_data_clean,
family = binomial(link = "logit"))
# Predict probabilities
pred_probs = predict(binary, newdata = test_data_clean, type = "response")
# Convert to class labels based on threshold
pred_classes = ifelse(pred_probs > 0.5, "At Risk", "Low Risk")
# Make both factors with consistent levels
pred_factor = factor(pred_classes, levels = c("Low Risk", "At Risk"))
actual_factor = factor(test_data_clean$BinaryRisk, levels = c("Low Risk", "At Risk"))
# Compute confusion matrix
confusionMatrix(pred_factor, actual_factor)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Low Risk At Risk
## Low Risk 64 29
## At Risk 12 15
##
## Accuracy : 0.6583
## 95% CI : (0.5662, 0.7424)
## No Information Rate : 0.6333
## P-Value [Acc > NIR] : 0.32034
##
## Kappa : 0.1992
##
## Mcnemar's Test P-Value : 0.01246
##
## Sensitivity : 0.8421
## Specificity : 0.3409
## Pos Pred Value : 0.6882
## Neg Pred Value : 0.5556
## Prevalence : 0.6333
## Detection Rate : 0.5333
## Detection Prevalence : 0.7750
## Balanced Accuracy : 0.5915
##
## 'Positive' Class : Low Risk
##
set.seed(111)
rf_model_B = randomForest(
BinaryRisk ~ Income. + SOA_decile + Education,
data = train_data_clean,
importance = TRUE,
ntree = 500
)
confusionMatrix(predict(rf_model_B, test_data_clean),
test_data_clean$BinaryRisk)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Low Risk At Risk
## Low Risk 72 33
## At Risk 4 11
##
## Accuracy : 0.6917
## 95% CI : (0.6009, 0.7727)
## No Information Rate : 0.6333
## P-Value [Acc > NIR] : 0.1082
##
## Kappa : 0.2292
##
## Mcnemar's Test P-Value : 0.000004161
##
## Sensitivity : 0.9474
## Specificity : 0.2500
## Pos Pred Value : 0.6857
## Neg Pred Value : 0.7333
## Prevalence : 0.6333
## Detection Rate : 0.6000
## Detection Prevalence : 0.8750
## Balanced Accuracy : 0.5987
##
## 'Positive' Class : Low Risk
##
set.seed(111)
rf_model_B = randomForest(
BinaryRisk ~ Age + Education + Income. + Car. + SOA_decile + Urban_rural_flag + min_time_to_gp,
data = train_data_clean,
importance = TRUE,
ntree = 500
)
confusionMatrix(predict(rf_model_B, test_data_clean),
test_data_clean$BinaryRisk)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Low Risk At Risk
## Low Risk 62 27
## At Risk 14 17
##
## Accuracy : 0.6583
## 95% CI : (0.5662, 0.7424)
## No Information Rate : 0.6333
## P-Value [Acc > NIR] : 0.32034
##
## Kappa : 0.2156
##
## Mcnemar's Test P-Value : 0.06092
##
## Sensitivity : 0.8158
## Specificity : 0.3864
## Pos Pred Value : 0.6966
## Neg Pred Value : 0.5484
## Prevalence : 0.6333
## Detection Rate : 0.5167
## Detection Prevalence : 0.7417
## Balanced Accuracy : 0.6011
##
## 'Positive' Class : Low Risk
##
set.seed(111)
# Create matrices
mm = function(df) {
model.matrix(~ 0 + Education + Income. + SOA_decile, data = df)
}
X_train = mm(train_data_clean)
X_test = mm(test_data_clean)
colnames(X_train)
train_cols = colnames(X_train)
missing = setdiff(train_cols, colnames(X_test))
if (length(missing)) {
X_test = cbind(X_test, matrix(0, nrow(X_test), length(missing),
dimnames = list(NULL, missing)))
}
X_test = X_test[, train_cols, drop = FALSE]
# Outcome
levels_bin = c("At Risk", "Low Risk")
y_train = factor(train_data_clean$BinaryRisk, levels = levels_bin)
y_test = factor(test_data_clean$BinaryRisk, levels = levels_bin)
# Tuning grid
xgb_grid = expand.grid(
nrounds = c(100, 200),
max_depth = c(3, 5, 7),
eta = c(0.05, 0.1),
gamma = c(0, 1),
colsample_bytree = c(0.6, 0.8),
min_child_weight = c(1, 5),
subsample = 0.8
)
control = trainControl(
method = "cv",
number = 5,
classProbs = TRUE,
summaryFunction = twoClassSummary,
verboseIter = TRUE,
allowParallel = TRUE
)
# Set names
levels_bin_train = c("At_Risk", "Low_Risk")
y_train = factor(gsub(" ", "_", train_data_clean$BinaryRisk), levels = levels_bin_train)
# Train model
xgb_bin = caret::train(
x = X_train,
y = y_train,
method = "xgbTree",
trControl = control,
tuneGrid = xgb_grid,
metric = "ROC"
)
# For predictions, map back
pred_probs = predict(xgb_bin, newdata = X_test, type = "prob")[, "At_Risk"]
pred_labels = ifelse(pred_probs >= 0.5, "At Risk", "Low Risk")
# Predict probabilities
pred_probs = predict(xgb_bin, newdata = X_test, type = "prob")[, "At_Risk"]
# Classify using threshold
pred_labels_clean = ifelse(pred_probs >= 0.5, "At Risk", "Low Risk")
# Original labels
actual_labels_clean = gsub("_", " ", as.character(y_test))
# Make factors with same levels
pred_labels_clean = factor(pred_labels_clean, levels = c("At Risk", "Low Risk"))
actual_labels_clean = factor(actual_labels_clean, levels = c("At Risk", "Low Risk"))
# Confusion matrix
confusionMatrix(pred_labels_clean, actual_labels_clean, positive = "At Risk")
## Confusion Matrix and Statistics
##
## Reference
## Prediction At Risk Low Risk
## At Risk 19 5
## Low Risk 25 71
##
## Accuracy : 0.75
## 95% CI : (0.6627, 0.8245)
## No Information Rate : 0.6333
## P-Value [Acc > NIR] : 0.0044136
##
## Kappa : 0.4048
##
## Mcnemar's Test P-Value : 0.0005226
##
## Sensitivity : 0.4318
## Specificity : 0.9342
## Pos Pred Value : 0.7917
## Neg Pred Value : 0.7396
## Prevalence : 0.3667
## Detection Rate : 0.1583
## Detection Prevalence : 0.2000
## Balanced Accuracy : 0.6830
##
## 'Positive' Class : At Risk
##
set.seed(111)
# Create matrices
mm = function(df) {
model.matrix(~ 0 + Education + Age + Income. + Car. + SOA_decile + Urban_rural_flag + min_time_to_gp, data = df)
}
X_train = mm(train_data_clean)
X_test = mm(test_data_clean)
train_cols = colnames(X_train)
missing = setdiff(train_cols, colnames(X_test))
if (length(missing)) {
X_test = cbind(X_test, matrix(0, nrow(X_test), length(missing),
dimnames = list(NULL, missing)))
}
X_test = X_test[, train_cols, drop = FALSE]
# Outcome
levels_bin = c("At Risk", "Low Risk")
y_train = factor(train_data_clean$BinaryRisk, levels = levels_bin)
y_test = factor(test_data_clean$BinaryRisk, levels = levels_bin)
# Create tuning grid
xgb_grid = expand.grid(
nrounds = c(100, 200),
max_depth = c(3, 5, 7),
eta = c(0.05, 0.1),
gamma = c(0, 1),
colsample_bytree = c(0.6, 0.8),
min_child_weight = c(1, 5),
subsample = 0.8
)
control = trainControl(
method = "cv",
number = 5,
classProbs = TRUE,
summaryFunction = twoClassSummary,
verboseIter = TRUE,
allowParallel = TRUE
)
# Rename
levels_bin_train= c("At_Risk", "Low_Risk")
y_train = factor(gsub(" ", "_", train_data_clean$BinaryRisk), levels = levels_bin_train)
# Train model
xgb_bin = caret::train(
x = X_train,
y = y_train,
method = "xgbTree",
trControl = control,
tuneGrid = xgb_grid,
metric = "ROC"
)
# Predicitions
pred_probs = predict(xgb_bin, newdata = X_test, type = "prob")[, "At_Risk"]
pred_labels = ifelse(pred_probs >= 0.5, "At Risk", "Low Risk")
# Predict probabilities
pred_probs = predict(xgb_bin, newdata = X_test, type = "prob")[, "At_Risk"]
# Classify using threshold = 0.5
pred_labels_clean = ifelse(pred_probs >= 0.5, "At Risk", "Low Risk")
# Original labels
actual_labels_clean = gsub("_", " ", as.character(y_test))
# Factors with same levels
pred_labels_clean = factor(pred_labels_clean, levels = c("At Risk", "Low Risk"))
actual_labels_clean = factor(actual_labels_clean, levels = c("At Risk", "Low Risk"))
# Confusion matrix
confusionMatrix(pred_labels_clean, actual_labels_clean, positive = "At Risk")
## Confusion Matrix and Statistics
##
## Reference
## Prediction At Risk Low Risk
## At Risk 15 18
## Low Risk 29 58
##
## Accuracy : 0.6083
## 95% CI : (0.515, 0.6961)
## No Information Rate : 0.6333
## P-Value [Acc > NIR] : 0.7478
##
## Kappa : 0.1098
##
## Mcnemar's Test P-Value : 0.1447
##
## Sensitivity : 0.3409
## Specificity : 0.7632
## Pos Pred Value : 0.4545
## Neg Pred Value : 0.6667
## Prevalence : 0.3667
## Detection Rate : 0.1250
## Detection Prevalence : 0.2750
## Balanced Accuracy : 0.5520
##
## 'Positive' Class : At Risk
##
# Train matrix
x_train = model.matrix(~ 0 + Education + Income. +
SOA_decile,
data = train_data_clean)
# class names for binary outcome
risk_levels = make.names(c("Low Risk","At Risk"))
y_train = factor(make.names(as.character(train_data_clean$BinaryRisk)),
levels = risk_levels)
# Test matrix
x_test = model.matrix(~ Education + Income. +
SOA_decile - 1,
data = test_data_clean)
# Align test and train
train_cols = colnames(x_train)
missing = setdiff(train_cols, colnames(x_test))
if (length(missing)) {
x_test = cbind(x_test, matrix(0, nrow(x_test), length(missing),
dimnames = list(NULL, missing)))
}
x_test = x_test[, train_cols, drop = FALSE]
# create same levels for test
actual_labels = factor(make.names(as.character(test_data_clean$BinaryRisk)),
levels = risk_levels)
# Tuning grid
set.seed(111)
nnet_grid = expand.grid(size = c(3,5,7), decay = c(0.01, 0.1, 0.5))
control = trainControl(
method = "cv",
number = 5,
classProbs = TRUE,
summaryFunction = twoClassSummary, # binary case
verboseIter = TRUE
)
nnet_model = caret::train(
x = x_train,
y = y_train,
method = "nnet",
trControl = control,
tuneGrid = nnet_grid,
metric = "Accuracy",
preProcess = c("center","scale"),
maxit = 200,
trace = FALSE
)
pred_labels = predict(nnet_model, newdata = x_test)
caret::confusionMatrix(pred_labels, actual_labels)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Low.Risk At.Risk
## Low.Risk 70 31
## At.Risk 6 13
##
## Accuracy : 0.6917
## 95% CI : (0.6009, 0.7727)
## No Information Rate : 0.6333
## P-Value [Acc > NIR] : 0.1082
##
## Kappa : 0.2459
##
## Mcnemar's Test P-Value : 0.00007961
##
## Sensitivity : 0.9211
## Specificity : 0.2955
## Pos Pred Value : 0.6931
## Neg Pred Value : 0.6842
## Prevalence : 0.6333
## Detection Rate : 0.5833
## Detection Prevalence : 0.8417
## Balanced Accuracy : 0.6083
##
## 'Positive' Class : Low.Risk
##
# Train matrix
x_train = model.matrix(~ 0 + Age + Education + Income. + Car. + SOA_decile + Urban_rural_flag + min_time_to_gp,
data = train_data_clean)
# class names for binary outcome
risk_levels = make.names(c("Low Risk","At Risk"))
y_train = factor(make.names(as.character(train_data_clean$BinaryRisk)),
levels = risk_levels)
# Test matrix
x_test = model.matrix(~ 0 + Age + Education + Income. + Car. + SOA_decile + Urban_rural_flag + min_time_to_gp,
data = test_data_clean)
# Align test and train columns
train_cols = colnames(x_train)
missing = setdiff(train_cols, colnames(x_test))
if (length(missing)) {
x_test = cbind(x_test, matrix(0, nrow(x_test), length(missing),
dimnames = list(NULL, missing)))
}
x_test = x_test[, train_cols, drop = FALSE]
# same labels for test
actual_labels = factor(make.names(as.character(test_data_clean$BinaryRisk)),
levels = risk_levels)
# Tuning grid and run
set.seed(111)
nnet_grid = expand.grid(size = c(3,5,7), decay = c(0.01, 0.1, 0.5))
control = trainControl(
method = "cv",
number = 5,
classProbs = TRUE,
summaryFunction = twoClassSummary, # binary case
verboseIter = TRUE
)
nnet_model = caret::train(
x = x_train,
y = y_train,
method = "nnet",
trControl = control,
tuneGrid = nnet_grid,
metric = "Accuracy",
preProcess = c("center","scale"),
maxit = 200,
trace = FALSE
)
pred_labels = predict(nnet_model, newdata = x_test)
caret::confusionMatrix(pred_labels, actual_labels)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Low.Risk At.Risk
## Low.Risk 60 29
## At.Risk 16 15
##
## Accuracy : 0.625
## 95% CI : (0.532, 0.7117)
## No Information Rate : 0.6333
## P-Value [Acc > NIR] : 0.61475
##
## Kappa : 0.139
##
## Mcnemar's Test P-Value : 0.07364
##
## Sensitivity : 0.7895
## Specificity : 0.3409
## Pos Pred Value : 0.6742
## Neg Pred Value : 0.4839
## Prevalence : 0.6333
## Detection Rate : 0.5000
## Detection Prevalence : 0.7417
## Balanced Accuracy : 0.5652
##
## 'Positive' Class : Low.Risk
##
# Convert dependent variable to ordinal
data$risk_category = factor(data$risk_category,
levels = c("Low Risk", "Moderate Risk", "High Risk"),
ordered = TRUE)
# Set test train - using caret to keeep proprtional classes
set.seed(123)
# Create partition — stratified by same variable
train_indices = createDataPartition(data$Risk, p = 0.7, list = FALSE)
train_data = data[train_indices, ]
test_data = data[-train_indices, ]
# clean data
train_data_clean = na.omit(train_data)
test_data_clean = na.omit(test_data)
# results table
results = data.frame(
Variable = character(),
Chi_Squared_p = numeric(),
Fisher_p = numeric(),
ANOVA_p = numeric(),
Test_Type = character(),
stringsAsFactors = FALSE
)
# Categorical variables
# Gender
chi_gender = chisq.test(train_data_clean$Gender., train_data_clean$risk_category)
fisher_gender = fisher.test(train_data_clean$Gender., train_data_clean$risk_category, simulate.p.value=TRUE)
results = rbind(results, data.frame(
Variable = "Gender",
Chi_Squared_p = chi_gender$p.value,
Fisher_p = fisher_gender$p.value,
ANOVA_p = NA,
Test_Type = "Categorical"
))
# Age
chi_age = chisq.test(train_data_clean$Age, train_data_clean$risk_category)
fisher_age = fisher.test(train_data_clean$Age, train_data_clean$risk_category, simulate.p.value=TRUE)
results = rbind(results, data.frame(
Variable = "Age",
Chi_Squared_p = chi_age$p.value,
Fisher_p = fisher_age$p.value,
ANOVA_p = NA,
Test_Type = "Categorical"
))
# Education
chi_edu = chisq.test(train_data_clean$Education, train_data_clean$risk_category)
fisher_edu = fisher.test(train_data_clean$Education, train_data_clean$risk_category, simulate.p.value=TRUE)
results = rbind(results, data.frame(
Variable = "Education",
Chi_Squared_p = chi_edu$p.value,
Fisher_p = fisher_edu$p.value,
ANOVA_p = NA,
Test_Type = "Categorical"
))
# Income
chi_income = chisq.test(train_data_clean$Income., train_data_clean$risk_category)
fisher_income = fisher.test(train_data_clean$Income., train_data_clean$risk_category, simulate.p.value=TRUE)
results = rbind(results, data.frame(
Variable = "Income",
Chi_Squared_p = chi_income$p.value,
Fisher_p = fisher_income$p.value,
ANOVA_p = NA,
Test_Type = "Categorical"
))
# Car
chi_car = chisq.test(train_data_clean$Car., train_data_clean$risk_category)
fisher_car = fisher.test(train_data_clean$Car., train_data_clean$risk_category, simulate.p.value=TRUE)
results = rbind(results, data.frame(
Variable = "Car",
Chi_Squared_p = chi_car$p.value,
Fisher_p = fisher_car$p.value,
ANOVA_p = NA,
Test_Type = "Categorical"
))
# Urban/Rural
chi_urban = chisq.test(train_data_clean$Urban_rural_flag, train_data_clean$risk_category)
fisher_urban = fisher.test(train_data_clean$Urban_rural_flag, train_data_clean$risk_category, simulate.p.value=TRUE)
results = rbind(results, data.frame(
Variable = "Urban_Rural",
Chi_Squared_p = chi_urban$p.value,
Fisher_p = fisher_urban$p.value,
ANOVA_p = NA,
Test_Type = "Categorical"
))
# Continuous variables
# SOA decile
aov_soa = aov(SOA_decile ~ risk_category, data = train_data_clean)
soa_p = summary(aov_soa)[[1]][["Pr(>F)"]][1]
results = rbind(results, data.frame(
Variable = "SOA_decile",
Chi_Squared_p = NA,
Fisher_p = NA,
ANOVA_p = soa_p,
Test_Type = "Continuous"
))
# Time to GP
aov_gp = aov(min_time_to_gp ~ risk_category, data = train_data_clean)
gp_p = summary(aov_gp)[[1]][["Pr(>F)"]][1]
results = rbind(results, data.frame(
Variable = "Time_to_GP",
Chi_Squared_p = NA,
Fisher_p = NA,
ANOVA_p = gp_p,
Test_Type = "Continuous"
))
results = results %>%
mutate(
Variable = recode(
Variable,
"Urban_Rural" = "Urban–Rural Classification",
"SOA_decile" = "Deprivation decile (LSOA)",
"Time_to_GP" = "Time to GP (mins)",
"Car" = "Car access",
"Income" = "Household income",
"Education" = "Education level"
)
)
nice_tbl = results %>%
mutate(
Chi_Squared_p = round(Chi_Squared_p, 4),
Fisher_p = round(Fisher_p, 4),
ANOVA_p = round(ANOVA_p, 4),
Conservative = if_else(
(Test_Type == "Categorical" & Chi_Squared_p < 0.10) |
(Test_Type == "Continuous" & ANOVA_p < 0.10), "Yes","No"),
Moderate = if_else(
(Test_Type == "Categorical" & Chi_Squared_p < 0.50) |
(Test_Type == "Continuous" & ANOVA_p < 0.50), "Yes","No")
) %>%
gt(groupname_col = "Test_Type") %>%
cols_label(
Chi_Squared_p = html("Chi-Squared<br>p"),
Fisher_p = html("Fisher’s Exact<br>p"),
ANOVA_p = html("ANOVA<br>p"),
Conservative = html("Significant<br>(α = 0.10)"),
Moderate = html("Significant<br>(α = 0.50)")
) %>%
tab_style(
style = list(cell_fill(color = "#E8F5E9"), cell_text(weight = "bold")),
locations = cells_body(columns = Conservative, rows = Conservative == "Yes")
) %>%
tab_style(
style = list(cell_fill(color = "#E8F5E9"), cell_text(weight = "bold")),
locations = cells_body(columns = Moderate, rows = Moderate == "Yes")
) %>%
tab_spanner(label = "p-values", columns = c(Chi_Squared_p, Fisher_p, ANOVA_p)) %>%
fmt_number(columns = c(Chi_Squared_p, Fisher_p, ANOVA_p), decimals = 4) %>%
sub_missing(columns = everything(), missing_text = "—") %>%
opt_row_striping()
# Change shape and size
nice_tbl = nice_tbl %>%
tab_options(table.width = pct(80)) %>%
cols_width(
everything() ~ px(120)
) %>%
tab_options(
table.font.size = px(11),
data_row.padding = px(2),
column_labels.padding = px(2),
row_group.padding = px(2),
heading.padding = px(0)
) %>%
cols_label(
Chi_Squared_p = "Chi-Squared p",
Fisher_p = "Fisher’s Exact p",
ANOVA_p = "ANOVA p",
Conservative = "Significant (p < 0.10)",
Moderate = "Significant (p < 0.50)",
)
nice_tbl = nice_tbl %>%
gt::tab_options(table.font.names = c("Times New Roman", "Times", "serif"))
nice_tbl
| Variable |
p-values
|
Significant (p < 0.10) | Significant (p < 0.50) | ||
|---|---|---|---|---|---|
| Chi-Squared p | Fisher’s Exact p | ANOVA p | |||
| Categorical | |||||
| Gender | 0.2937 | 0.3348 | — | No | Yes |
| Age | 0.2509 | 0.2404 | — | No | Yes |
| Education level | 0.1094 | 0.0945 | — | No | Yes |
| Household income | 0.0794 | 0.1124 | — | Yes | Yes |
| Car access | 0.5307 | 0.6252 | — | No | No |
| Urban–Rural Classification | 0.1078 | 0.1199 | — | No | Yes |
| Continuous | |||||
| Deprivation decile (LSOA) | — | — | 0.0018 | Yes | Yes |
| Time to GP (mins) | — | — | 0.8439 | No | No |
# Run model
model = polr(risk_category ~ Income. + SOA_decile, data = train_data_clean, Hess=TRUE)
summary_model = summary(model)
confusionMatrix(predict(model, test_data_clean),
test_data_clean$risk_category)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Low Risk Moderate Risk High Risk
## Low Risk 6 3 3
## Moderate Risk 33 31 19
## High Risk 4 5 16
##
## Overall Statistics
##
## Accuracy : 0.4417
## 95% CI : (0.3511, 0.5352)
## No Information Rate : 0.3583
## P-Value [Acc > NIR] : 0.03659
##
## Kappa : 0.1709
##
## Mcnemar's Test P-Value : 0.0000002771
##
## Statistics by Class:
##
## Class: Low Risk Class: Moderate Risk Class: High Risk
## Sensitivity 0.1395 0.7949 0.4211
## Specificity 0.9221 0.3580 0.8902
## Pos Pred Value 0.5000 0.3735 0.6400
## Neg Pred Value 0.6574 0.7838 0.7684
## Prevalence 0.3583 0.3250 0.3167
## Detection Rate 0.0500 0.2583 0.1333
## Detection Prevalence 0.1000 0.6917 0.2083
## Balanced Accuracy 0.5308 0.5764 0.6556
brant(model)
## --------------------------------------------------------------------
## Test for X2 df probability
## --------------------------------------------------------------------
## Omnibus 5.93 5 0.31
## Income.£15,000 to £30,000 per year 0.08 1 0.78
## Income.£30,000 to £50,000 per year 1.44 1 0.23
## Income.£50,000 to £100,000 per year 1.19 1 0.27
## Income.Above £100,000 per year 0.06 1 0.81
## SOA_decile 0.5 1 0.48
## --------------------------------------------------------------------
##
## H0: Parallel Regression Assumption holds
# Run model
model = polr(risk_category ~ Income. + SOA_decile + Gender. + Age + Education + Urban_rural_flag, data = train_data_clean, Hess=TRUE)
summary_model = summary(model)
confusionMatrix(predict(model, test_data_clean),
test_data_clean$risk_category)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Low Risk Moderate Risk High Risk
## Low Risk 12 8 2
## Moderate Risk 27 22 23
## High Risk 4 9 13
##
## Overall Statistics
##
## Accuracy : 0.3917
## 95% CI : (0.3039, 0.485)
## No Information Rate : 0.3583
## P-Value [Acc > NIR] : 0.2510811
##
## Kappa : 0.093
##
## Mcnemar's Test P-Value : 0.0006722
##
## Statistics by Class:
##
## Class: Low Risk Class: Moderate Risk Class: High Risk
## Sensitivity 0.2791 0.5641 0.3421
## Specificity 0.8701 0.3827 0.8415
## Pos Pred Value 0.5455 0.3056 0.5000
## Neg Pred Value 0.6837 0.6458 0.7340
## Prevalence 0.3583 0.3250 0.3167
## Detection Rate 0.1000 0.1833 0.1083
## Detection Prevalence 0.1833 0.6000 0.2167
## Balanced Accuracy 0.5746 0.4734 0.5918
brant(model)
## --------------------------------------------------------------------------------------------------------------------
## Test for X2 df probability
## --------------------------------------------------------------------------------------------------------------------
## Omnibus 23.57 14 0.05
## Income.£15,000 to £30,000 per year 0 1 0.97
## Income.£30,000 to £50,000 per year 0.72 1 0.4
## Income.£50,000 to £100,000 per year 0.55 1 0.46
## Income.Above £100,000 per year 0.14 1 0.71
## SOA_decile 0.67 1 0.41
## Gender.Woman 0.01 1 0.9
## Age25-34 0.02 1 0.89
## Age35-44 0.01 1 0.91
## Age45-54 0.01 1 0.94
## Age55-64 0.91 1 0.34
## Age65 or older 0.02 1 0.89
## EducationHigher or secondary or further education (A-Levels, BTEC, etc.) 0.64 1 0.42
## EducationSecondary school up to 16 years 6.41 1 0.01
## Urban_rural_flagUrban 2.64 1 0.1
## --------------------------------------------------------------------------------------------------------------------
##
## H0: Parallel Regression Assumption holds
## Warning in brant(model): 920 combinations in table(dv,ivs) do not occur.
## Because of that, the test results might be invalid.
set.seed(111)
rf_model = randomForest(
risk_category ~ Income. + SOA_decile,
data = train_data_clean,
importance = TRUE,
ntree = 500
)
confusionMatrix(predict(rf_model, test_data_clean),
test_data_clean$risk_category)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Low Risk Moderate Risk High Risk
## Low Risk 13 6 9
## Moderate Risk 26 27 21
## High Risk 4 6 8
##
## Overall Statistics
##
## Accuracy : 0.4
## 95% CI : (0.3117, 0.4934)
## No Information Rate : 0.3583
## P-Value [Acc > NIR] : 0.1952
##
## Kappa : 0.1024
##
## Mcnemar's Test P-Value : 0.00004539
##
## Statistics by Class:
##
## Class: Low Risk Class: Moderate Risk Class: High Risk
## Sensitivity 0.3023 0.6923 0.21053
## Specificity 0.8052 0.4198 0.87805
## Pos Pred Value 0.4643 0.3649 0.44444
## Neg Pred Value 0.6739 0.7391 0.70588
## Prevalence 0.3583 0.3250 0.31667
## Detection Rate 0.1083 0.2250 0.06667
## Detection Prevalence 0.2333 0.6167 0.15000
## Balanced Accuracy 0.5538 0.5560 0.54429
set.seed(111)
rf_model = randomForest(
risk_category ~ Income. + SOA_decile + Gender. + Age + Education + Urban_rural_flag,
data = train_data_clean,
importance = TRUE,
ntree = 500
)
confusionMatrix(predict(rf_model, test_data_clean),
test_data_clean$risk_category)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Low Risk Moderate Risk High Risk
## Low Risk 17 7 8
## Moderate Risk 19 22 16
## High Risk 7 10 14
##
## Overall Statistics
##
## Accuracy : 0.4417
## 95% CI : (0.3511, 0.5352)
## No Information Rate : 0.3583
## P-Value [Acc > NIR] : 0.03659
##
## Kappa : 0.1645
##
## Mcnemar's Test P-Value : 0.07223
##
## Statistics by Class:
##
## Class: Low Risk Class: Moderate Risk Class: High Risk
## Sensitivity 0.3953 0.5641 0.3684
## Specificity 0.8052 0.5679 0.7927
## Pos Pred Value 0.5313 0.3860 0.4516
## Neg Pred Value 0.7045 0.7302 0.7303
## Prevalence 0.3583 0.3250 0.3167
## Detection Rate 0.1417 0.1833 0.1167
## Detection Prevalence 0.2667 0.4750 0.2583
## Balanced Accuracy 0.6003 0.5660 0.5806
set.seed(111)
# Create model matrices
mm = function(df) {
model.matrix(~ 0 + Income. + SOA_decile, data = df)
}
X_train = mm(train_data_clean)
X_test = mm(test_data_clean)
# align test and train columns
train_cols = colnames(X_train)
missing = setdiff(train_cols, colnames(X_test))
if (length(missing)) {
X_test = cbind(X_test, matrix(0, nrow(X_test), length(missing),
dimnames = list(NULL, missing)))
}
X_test = X_test[, train_cols, drop = FALSE]
# Outcome factor
risk_levels = make.names(c("Low Risk","Moderate Risk","High Risk"))
y_train = factor(make.names(as.character(train_data_clean$risk_category)), levels = risk_levels)
y_test = factor(make.names(as.character(test_data_clean$risk_category)), levels = risk_levels)
# Create tuning grid
xgb_grid = expand.grid(
nrounds = c(100, 200),
max_depth = c(3, 5, 7),
eta = c(0.05, 0.1),
gamma = c(0, 1),
colsample_bytree = c(0.6, 0.8),
min_child_weight = c(1, 5),
subsample = 0.8
)
control = trainControl(
method = "cv",
number = 5,
verboseIter = TRUE,
allowParallel = TRUE,
classProbs = TRUE,
summaryFunction = multiClassSummary
)
# Train model
xgb_tuned = train(
x = X_train,
y = y_train,
method = "xgbTree",
trControl = control,
tuneGrid = xgb_grid,
metric = "Accuracy"
)
# Predict
prob_df = predict(xgb_tuned, newdata = X_test, type = "prob") # columns == risk_levels
pred_lab = factor(colnames(prob_df)[max.col(as.matrix(prob_df))], levels = risk_levels)
# Confusion matrix
caret::confusionMatrix(pred_lab, y_test)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Low.Risk Moderate.Risk High.Risk
## Low.Risk 15 6 11
## Moderate.Risk 21 21 15
## High.Risk 7 12 12
##
## Overall Statistics
##
## Accuracy : 0.4
## 95% CI : (0.3117, 0.4934)
## No Information Rate : 0.3583
## P-Value [Acc > NIR] : 0.19517
##
## Kappa : 0.1022
##
## Mcnemar's Test P-Value : 0.02275
##
## Statistics by Class:
##
## Class: Low.Risk Class: Moderate.Risk Class: High.Risk
## Sensitivity 0.3488 0.5385 0.3158
## Specificity 0.7792 0.5556 0.7683
## Pos Pred Value 0.4688 0.3684 0.3871
## Neg Pred Value 0.6818 0.7143 0.7079
## Prevalence 0.3583 0.3250 0.3167
## Detection Rate 0.1250 0.1750 0.1000
## Detection Prevalence 0.2667 0.4750 0.2583
## Balanced Accuracy 0.5640 0.5470 0.5420
set.seed(111)
# Create model matrices
mm = function(df) {
model.matrix(~ 0 + Income. + SOA_decile + Gender. + Age + Education + Urban_rural_flag, data = df)
}
X_train = mm(train_data_clean)
X_test = mm(test_data_clean)
# align test and train
train_cols = colnames(X_train)
missing = setdiff(train_cols, colnames(X_test))
if (length(missing)) {
X_test = cbind(X_test, matrix(0, nrow(X_test), length(missing),
dimnames = list(NULL, missing)))
}
X_test = X_test[, train_cols, drop = FALSE]
# Outcome
risk_levels = make.names(c("Low Risk","Moderate Risk","High Risk"))
y_train = factor(make.names(as.character(train_data_clean$risk_category)), levels = risk_levels)
y_test = factor(make.names(as.character(test_data_clean$risk_category)), levels = risk_levels)
# Create tuning grid
xgb_grid = expand.grid(
nrounds = c(100, 200),
max_depth = c(3, 5, 7),
eta = c(0.05, 0.1),
gamma = c(0, 1),
colsample_bytree = c(0.6, 0.8),
min_child_weight = c(1, 5),
subsample = 0.8
)
control = trainControl(
method = "cv",
number = 5,
verboseIter = TRUE,
allowParallel = TRUE,
classProbs = TRUE,
summaryFunction = multiClassSummary
)
# Train model
xgb_tuned = train(
x = X_train,
y = y_train,
method = "xgbTree",
trControl = control,
tuneGrid = xgb_grid,
metric = "Accuracy"
)
# Predict
prob_df = predict(xgb_tuned, newdata = X_test, type = "prob")
pred_lab = factor(colnames(prob_df)[max.col(as.matrix(prob_df))], levels = risk_levels)
# Cofusion matrix
caret::confusionMatrix(pred_lab, y_test)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Low.Risk Moderate.Risk High.Risk
## Low.Risk 16 9 5
## Moderate.Risk 20 17 18
## High.Risk 7 13 15
##
## Overall Statistics
##
## Accuracy : 0.4
## 95% CI : (0.3117, 0.4934)
## No Information Rate : 0.3583
## P-Value [Acc > NIR] : 0.1952
##
## Kappa : 0.1033
##
## Mcnemar's Test P-Value : 0.1503
##
## Statistics by Class:
##
## Class: Low.Risk Class: Moderate.Risk Class: High.Risk
## Sensitivity 0.3721 0.4359 0.3947
## Specificity 0.8182 0.5309 0.7561
## Pos Pred Value 0.5333 0.3091 0.4286
## Neg Pred Value 0.7000 0.6615 0.7294
## Prevalence 0.3583 0.3250 0.3167
## Detection Rate 0.1333 0.1417 0.1250
## Detection Prevalence 0.2500 0.4583 0.2917
## Balanced Accuracy 0.5951 0.4834 0.5754
# Train matrix
x_train = model.matrix(~ 0 + Income. + SOA_decile,
data = train_data_clean)
# Outcome labels
risk_levels = make.names(c("Low Risk","Moderate Risk","High Risk"))
y_train = factor(make.names(as.character(train_data_clean$risk_category)), levels = risk_levels)
# Test matrix
x_test = model.matrix(~ 0 + Income. + SOA_decile,
data = test_data_clean)
# Align test and train
train_cols = colnames(x_train)
missing = setdiff(train_cols, colnames(x_test))
if (length(missing)) {
x_test = cbind(x_test, matrix(0, nrow(x_test), length(missing),
dimnames = list(NULL, missing)))
}
x_test = x_test[, train_cols, drop = FALSE]
# Tuning and run
set.seed(111)
nnet_grid = expand.grid(size = c(3,5,7), decay = c(0.01, 0.1, 0.5))
control = trainControl(
method = "cv",
number = 5,
classProbs = TRUE,
summaryFunction = multiClassSummary,
verboseIter = TRUE
)
nnet_model = caret::train(
x = x_train,
y = y_train,
method = "nnet",
trControl = control,
tuneGrid = nnet_grid,
metric = "Accuracy",
preProcess = c("center","scale"), # helpful for nnet
maxit = 200,
trace = FALSE
)
# Predicitions
pred_labels = predict(nnet_model, newdata = x_test)
actual_labels = factor(make.names(as.character(test_data_clean$risk_category)),
levels = risk_levels)
caret::confusionMatrix(pred_labels, actual_labels)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Low.Risk Moderate.Risk High.Risk
## Low.Risk 8 4 2
## Moderate.Risk 27 30 20
## High.Risk 8 5 16
##
## Overall Statistics
##
## Accuracy : 0.45
## 95% CI : (0.3591, 0.5435)
## No Information Rate : 0.3583
## P-Value [Acc > NIR] : 0.02401
##
## Kappa : 0.1829
##
## Mcnemar's Test P-Value : 0.000001624
##
## Statistics by Class:
##
## Class: Low.Risk Class: Moderate.Risk Class: High.Risk
## Sensitivity 0.18605 0.7692 0.4211
## Specificity 0.92208 0.4198 0.8415
## Pos Pred Value 0.57143 0.3896 0.5517
## Neg Pred Value 0.66981 0.7907 0.7582
## Prevalence 0.35833 0.3250 0.3167
## Detection Rate 0.06667 0.2500 0.1333
## Detection Prevalence 0.11667 0.6417 0.2417
## Balanced Accuracy 0.55406 0.5945 0.6313
# Train matrix
x_train = model.matrix(~ 0 + Income. + SOA_decile + Gender. + Age + Education + Urban_rural_flag,
data = train_data_clean)
# Outcome labels
risk_levels = make.names(c("Low Risk","Moderate Risk","High Risk"))
y_train = factor(make.names(as.character(train_data_clean$risk_category)), levels = risk_levels)
# Test matrix
x_test = model.matrix(~ 0 + Income. + SOA_decile + Gender. + Age + Education + Urban_rural_flag,
data = test_data_clean)
# Align test and train
train_cols = colnames(x_train)
missing = setdiff(train_cols, colnames(x_test))
if (length(missing)) {
x_test = cbind(x_test, matrix(0, nrow(x_test), length(missing),
dimnames = list(NULL, missing)))
}
x_test = x_test[, train_cols, drop = FALSE]
set.seed(111)
nnet_grid = expand.grid(size = c(3,5,7), decay = c(0.01, 0.1, 0.5))
control = trainControl(
method = "cv",
number = 5,
classProbs = TRUE,
summaryFunction = multiClassSummary,
verboseIter = TRUE
)
nnet_model = caret::train(
x = x_train,
y = y_train,
method = "nnet",
trControl = control,
tuneGrid = nnet_grid,
metric = "Accuracy",
preProcess = c("center","scale"), # helpful for nnet
maxit = 200,
trace = FALSE
)
# Predict
pred_labels = predict(nnet_model, newdata = x_test)
actual_labels = factor(make.names(as.character(test_data_clean$risk_category)),
levels = risk_levels)
caret::confusionMatrix(pred_labels, actual_labels)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Low.Risk Moderate.Risk High.Risk
## Low.Risk 19 9 8
## Moderate.Risk 14 18 16
## High.Risk 10 12 14
##
## Overall Statistics
##
## Accuracy : 0.425
## 95% CI : (0.3353, 0.5185)
## No Information Rate : 0.3583
## P-Value [Acc > NIR] : 0.07782
##
## Kappa : 0.1386
##
## Mcnemar's Test P-Value : 0.59755
##
## Statistics by Class:
##
## Class: Low.Risk Class: Moderate.Risk Class: High.Risk
## Sensitivity 0.4419 0.4615 0.3684
## Specificity 0.7792 0.6296 0.7317
## Pos Pred Value 0.5278 0.3750 0.3889
## Neg Pred Value 0.7143 0.7083 0.7143
## Prevalence 0.3583 0.3250 0.3167
## Detection Rate 0.1583 0.1500 0.1167
## Detection Prevalence 0.3000 0.4000 0.3000
## Balanced Accuracy 0.6105 0.5456 0.5501
model1 = data.frame(
Model = c(
"Ordinal Logistic - Conservative",
"Ordinal Logistic - Moderate",
"Random Forest - Conservative",
"Random Forest - Moderate",
"XGBoost - Conservative",
"XGBoost - Moderate",
"Neural Network - Conservative",
"Neural Network - Moderate"
),
Accuracy = c(0.6583, 0.65, 0.6583, 0.625, 0.6917, 0.6583, 0.6833, 0.5917),
P_Value = c(0.3203, 0.3912068, 0.3203424, 0.61475, 0.1082, 0.320342, 0.1485258, 0.8511),
Kappa = c(0.1473, 0.1421, 0.1675, 0.1422, 0.2258, 0.2006, 0.233, 0.0762),
stringsAsFactors = FALSE
)
model2 = data.frame(
Model = c(
"Binary Logistic - Conservative",
"Binary Logistic - Moderate",
"Random Forest - Conservative",
"Random Forest - Moderate",
"XGBoost - Conservative",
"XGBoost - Moderate",
"Neural Network - Conservative",
"Neural Network - Moderate"
),
Accuracy = c(0.6833, 0.6583, 0.6917, 0.6417, 0.75, 0.6083, 0.6917, 0.6167),
P_Value = c(0.1485, 0.32034, 0.1082, 0.4656, 0.0044136, 0.7478, 0.1082, 0.68434),
Kappa = c(0.2213, 0.1992, 0.2292, 0.1938, 0.4048, 0.1098, 0.2459, 0.1154),
stringsAsFactors = FALSE
)
model3 = data.frame(
Model = c(
"Ordinal Logistic - Conservative",
"Ordinal Logistic - Moderate",
"Random Forest - Conservative",
"Random Forest - Moderate",
"XGBoost - Conservative",
"XGBoost - Moderate",
"Neural Network - Conservative",
"Neural Network - Moderate"
),
Accuracy = c(0.4417, 0.3917, 0.4, 0.4417, 0.4, 0.4, 0.45, 0.425),
P_Value = c(0.03659, 0.2510811, 0.1952, 0.03659, 0.19517, 0.1952, 0.02401, 0.07782),
Kappa = c(0.1709, 0.093, 0.1024, 0.1119, 0.1022, 0.1533, 0.1829, 0.1386),
stringsAsFactors = FALSE
)
build_tbl = function(df) {
df2 = df
df2$Accuracy = round(df2$Accuracy, 4)
df2$P_Value = round(df2$P_Value, 4)
df2$Kappa = round(df2$Kappa, 4)
# highlight winnder
winner = df2$Model[order(-df2$Accuracy, -df2$Kappa, df2$P_Value)][1]
gt(df2) %>%
cols_label(
Model = "Model",
Accuracy = "Accuracy",
P_Value = "P-Value [Acc > NIR]", # <-- left side is the column name; right side is the label
Kappa = "Kappa"
) %>%
fmt_number(columns = c(Accuracy, P_Value, Kappa), decimals = 4) %>%
opt_row_striping() %>%
tab_options(
table.width = pct(80),
table.font.size = px(11),
data_row.padding = px(2),
column_labels.padding = px(2),
heading.padding = px(0),
table.font.names = c("Times New Roman", "Times", "serif")
) %>%
tab_style(
style = list(cell_fill(color = "#E8F5E9"), cell_text(weight = "bold")),
locations = cells_body(rows = Model == winner)
)
}
tbl_model1 = build_tbl(model1)
tbl_model2 = build_tbl(model2)
tbl_model3 = build_tbl(model3)
tbl_model1
| Model | Accuracy | P-Value [Acc > NIR] | Kappa |
|---|---|---|---|
| Ordinal Logistic - Conservative | 0.6583 | 0.3203 | 0.1473 |
| Ordinal Logistic - Moderate | 0.6500 | 0.3912 | 0.1421 |
| Random Forest - Conservative | 0.6583 | 0.3203 | 0.1675 |
| Random Forest - Moderate | 0.6250 | 0.6148 | 0.1422 |
| XGBoost - Conservative | 0.6917 | 0.1082 | 0.2258 |
| XGBoost - Moderate | 0.6583 | 0.3203 | 0.2006 |
| Neural Network - Conservative | 0.6833 | 0.1485 | 0.2330 |
| Neural Network - Moderate | 0.5917 | 0.8511 | 0.0762 |
tbl_model2
| Model | Accuracy | P-Value [Acc > NIR] | Kappa |
|---|---|---|---|
| Binary Logistic - Conservative | 0.6833 | 0.1485 | 0.2213 |
| Binary Logistic - Moderate | 0.6583 | 0.3203 | 0.1992 |
| Random Forest - Conservative | 0.6917 | 0.1082 | 0.2292 |
| Random Forest - Moderate | 0.6417 | 0.4656 | 0.1938 |
| XGBoost - Conservative | 0.7500 | 0.0044 | 0.4048 |
| XGBoost - Moderate | 0.6083 | 0.7478 | 0.1098 |
| Neural Network - Conservative | 0.6917 | 0.1082 | 0.2459 |
| Neural Network - Moderate | 0.6167 | 0.6843 | 0.1154 |
tbl_model3
| Model | Accuracy | P-Value [Acc > NIR] | Kappa |
|---|---|---|---|
| Ordinal Logistic - Conservative | 0.4417 | 0.0366 | 0.1709 |
| Ordinal Logistic - Moderate | 0.3917 | 0.2511 | 0.0930 |
| Random Forest - Conservative | 0.4000 | 0.1952 | 0.1024 |
| Random Forest - Moderate | 0.4417 | 0.0366 | 0.1119 |
| XGBoost - Conservative | 0.4000 | 0.1952 | 0.1022 |
| XGBoost - Moderate | 0.4000 | 0.1952 | 0.1533 |
| Neural Network - Conservative | 0.4500 | 0.0240 | 0.1829 |
| Neural Network - Moderate | 0.4250 | 0.0778 | 0.1386 |
set.seed(111)
# Create matrices
mm = function(df) {
model.matrix(~ 0 + Education + Income. + SOA_decile, data = df)
}
X_train = mm(train_data_clean)
X_test = mm(test_data_clean)
colnames(X_train)
train_cols = colnames(X_train)
missing = setdiff(train_cols, colnames(X_test))
if (length(missing)) {
X_test = cbind(X_test, matrix(0, nrow(X_test), length(missing),
dimnames = list(NULL, missing)))
}
X_test = X_test[, train_cols, drop = FALSE]
# Outcome
levels_bin = c("At Risk", "Low Risk")
y_train = factor(train_data_clean$BinaryRisk, levels = levels_bin)
y_test = factor(test_data_clean$BinaryRisk, levels = levels_bin)
# Tuning grid
xgb_grid = expand.grid(
nrounds = c(100, 200),
max_depth = c(3, 5, 7),
eta = c(0.05, 0.1),
gamma = c(0, 1),
colsample_bytree = c(0.6, 0.8),
min_child_weight = c(1, 5),
subsample = 0.8
)
control = trainControl(
method = "cv",
number = 5,
classProbs = TRUE,
summaryFunction = twoClassSummary,
verboseIter = TRUE,
allowParallel = TRUE
)
# Set names
levels_bin_train = c("At_Risk", "Low_Risk")
y_train = factor(gsub(" ", "_", train_data_clean$BinaryRisk), levels = levels_bin_train)
# Train model
xgb_best = caret::train(
x = X_train,
y = y_train,
method = "xgbTree",
trControl = control,
tuneGrid = xgb_grid,
metric = "ROC"
)
# For predictions, map back
pred_probs = predict(xgb_best, newdata = X_test, type = "prob")[, "At_Risk"]
pred_labels = ifelse(pred_probs >= 0.5, "At Risk", "Low Risk")
# Predict probabilities
pred_probs = predict(xgb_best, newdata = X_test, type = "prob")[, "At_Risk"]
# Classify using threshold
pred_labels_clean = ifelse(pred_probs >= 0.5, "At Risk", "Low Risk")
# Original labels
actual_labels_clean = gsub("_", " ", as.character(y_test))
# Make factors with same levels
pred_labels_clean = factor(pred_labels_clean, levels = c("At Risk", "Low Risk"))
actual_labels_clean = factor(actual_labels_clean, levels = c("At Risk", "Low Risk"))
# Confusion matrix
confusionMatrix(pred_labels_clean, actual_labels_clean, positive = "At Risk")
## Confusion Matrix and Statistics
##
## Reference
## Prediction At Risk Low Risk
## At Risk 19 5
## Low Risk 25 71
##
## Accuracy : 0.75
## 95% CI : (0.6627, 0.8245)
## No Information Rate : 0.6333
## P-Value [Acc > NIR] : 0.0044136
##
## Kappa : 0.4048
##
## Mcnemar's Test P-Value : 0.0005226
##
## Sensitivity : 0.4318
## Specificity : 0.9342
## Pos Pred Value : 0.7917
## Neg Pred Value : 0.7396
## Prevalence : 0.3667
## Detection Rate : 0.1583
## Detection Prevalence : 0.2000
## Balanced Accuracy : 0.6830
##
## 'Positive' Class : At Risk
##
xgb_best$bestTune
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 61 100 3 0.1 1 0.8 1 0.8
# Probabilities for the positive class
probs = predict(xgb_best, newdata = X_test, type = "prob")[, "At_Risk"]
# Make a 0/1 response
y_true01 = as.integer(gsub(" ", "_", test_data_clean$BinaryRisk) == "At_Risk")
# sanity check
table(y_true01)
## y_true01
## 0 1
## 76 44
# ROC + plot
roc_obj = roc(response = y_true01, predictor = probs, direction = "<", quiet = TRUE)
plot(roc_obj, lwd = 2, main = "ROC Curve")
abline(a = 0, b = 1, lty = 2)
auc(roc_obj)
## Area under the curve: 0.7956
# Probabilities + truth
probs = predict(xgb_best, newdata = X_test, type = "prob")[, "At_Risk"]
lvls = c("Low_Risk","At_Risk")
y_true = factor(gsub(" ", "_", test_data_clean$BinaryRisk), levels = lvls)
# ROC
roc_obj = roc(y_true, probs, levels = lvls, direction = "<", quiet = TRUE)
# Best threshold - Youden's J
best = coords(roc_obj, "best", best.method = "youden",
ret = c("threshold","sensitivity","specificity"))
best_thresh = as.numeric(if (is.list(best)) best[["threshold"]] else best["threshold"])
stopifnot(is.finite(best_thresh))
# Reclassify at cutoff and compute CM
pred_opt = factor(ifelse(probs >= best_thresh, "At_Risk", "Low_Risk"), levels = lvls)
cm_opt = confusionMatrix(pred_opt, y_true, positive = "At_Risk")
cm_opt
## Confusion Matrix and Statistics
##
## Reference
## Prediction Low_Risk At_Risk
## Low_Risk 65 15
## At_Risk 11 29
##
## Accuracy : 0.7833
## 95% CI : (0.6989, 0.8533)
## No Information Rate : 0.6333
## P-Value [Acc > NIR] : 0.0002961
##
## Kappa : 0.5244
##
## Mcnemar's Test P-Value : 0.5562985
##
## Sensitivity : 0.6591
## Specificity : 0.8553
## Pos Pred Value : 0.7250
## Neg Pred Value : 0.8125
## Prevalence : 0.3667
## Detection Rate : 0.2417
## Detection Prevalence : 0.3333
## Balanced Accuracy : 0.7572
##
## 'Positive' Class : At_Risk
##
best
## threshold sensitivity specificity
## 1 0.3971832 0.6590909 0.8552632
# SHAP values for each feature
shap = predict(xgb_best$finalModel, newdata = X_train, predcontrib = TRUE)
shap = as.data.frame(shap)
# Drop bias columns
shap = shap[, !grepl("(?i)^(bias|\\(intercept\\))$", colnames(shap)), drop = FALSE]
# Mean SHAP per feature = global importance
mean_abs = data.frame(
feature = colnames(shap),
mean_abs_shap = colMeans(abs(shap))
)
n_bars= 10L
p = mean_abs %>%
arrange(desc(mean_abs_shap)) %>%
slice_head(n = n_bars) %>%
ggplot(aes(x = reorder(feature, mean_abs_shap), y = mean_abs_shap)) +
geom_col(width = 0.7) +
coord_flip() +
labs(x = "Feature", y = "Mean |SHAP|",
title = "XGBoost — Feature Impact") +
theme_minimal(base_family = "Times") +
theme(
plot.title = element_text(face = "bold"),
axis.text = element_text(size = 10),
panel.grid.minor = element_blank()
)
p
map = tibble(feature = colnames(X_train)) %>%
mutate(group = case_when(
grepl("^Income\\.", feature) ~ "Income",
feature == "SOA_decile" ~ "SOA_decile",
grepl("^Education", feature) ~ "Education",
TRUE ~ feature
))
# join and aggregate
mean_abs = tibble(feature = colnames(shap),
mean_abs_shap = colMeans(abs(shap))) %>%
left_join(map, by = "feature")
by_var = mean_abs %>%
group_by(group) %>%
summarise(mean_abs_shap = sum(mean_abs_shap), .groups = "drop") %>%
arrange(desc(mean_abs_shap))
by_var_no_intercept = by_var %>%
filter(!grepl("(?i)intercept|bias", group))
# Plot
p = by_var_no_intercept %>%
arrange(desc(mean_abs_shap)) %>%
slice_head(n = nrow(by_var_no_intercept)) %>% # Use all rows, or set n_bars if you want to limit
ggplot(aes(x = reorder(group, mean_abs_shap), y = mean_abs_shap)) +
geom_col(width = 0.7) +
coord_flip() +
labs(x = "Feature", y = "Mean |SHAP|",
title = "XGBoost — Aggregated Feature Impact") +
theme_minimal(base_family = "Times") +
theme(
plot.title = element_text(face = "bold"),
axis.text = element_text(size = 10),
panel.grid.minor = element_blank()
)
p
# SHAP interaction values
shap_int = predict(xgb_best$finalModel, newdata = X_train, predinteraction = TRUE)
feat_names = colnames(X_train)
p_all = dim(shap_int)[2]
has_bias = (p_all == length(feat_names) + 1)
# indices of real features apart from bias
feat_idx = if (has_bias) seq_len(length(feat_names)) else seq_len(p_all)
# mean interaction across rows
int_mat = apply(abs(shap_int[, feat_idx, feat_idx, drop = FALSE]), c(2, 3), mean)
dimnames(int_mat) = list(feat_names, feat_names)
# long data frame
ut = which(upper.tri(int_mat), arr.ind = TRUE)
pairs = data.frame(
f1 = rownames(int_mat)[ut[, 1]],
f2 = colnames(int_mat)[ut[, 2]],
mean_abs_int = int_mat[ut]
)
pairs = pairs[order(-pairs$mean_abs_int), ]
top_k = head(pairs, 10)
top_k$pair = paste(top_k$f1, "×", top_k$f2)
# plot
ggplot(top_k, aes(x = mean_abs_int, y = reorder(pair, mean_abs_int))) +
geom_segment(aes(x = 0, xend = mean_abs_int, yend = pair)) +
geom_point() +
labs(x = "Mean |SHAP interaction|", y = "Feature pair",
title = "Top 10 Feature Interactions") +
theme_minimal(base_family = "Times")
education_new = read.csv("Education_new.csv")
education_new = education_new %>%
filter(!is.na(.[[1]]) & .[[1]] != "") %>%
rename(LSOA = 1) %>%
mutate(LSOA = str_trim(str_extract(LSOA, "^[^:]+")))
colnames(education_new)
## [1] "LSOA"
## [2] "All.categories..Highest.level.of.qualification"
## [3] "Highest.level.of.qualification..Level.1.qualifications"
## [4] "Highest.level.of.qualification..Level.2.qualifications"
## [5] "Highest.level.of.qualification..Apprenticeship"
## [6] "Highest.level.of.qualification..Level.3.qualifications"
## [7] "Highest.level.of.qualification..Level.4.qualifications.and.above"
education_new = education_new %>%
mutate(
Secondary_school_up_to_16 = Highest.level.of.qualification..Level.1.qualifications + Highest.level.of.qualification..Level.2.qualifications,
Higher_secondary_or_further = Highest.level.of.qualification..Level.3.qualifications + Highest.level.of.qualification..Apprenticeship,
Higher_education = Highest.level.of.qualification..Level.4.qualifications.and.above,
) %>%
dplyr::select(LSOA, Secondary_school_up_to_16, Higher_secondary_or_further, Higher_education)
education_new = education_new %>%
rowwise() %>%
mutate(across(-LSOA, ~ . / sum(c_across(-LSOA)))) %>%
ungroup()
IMD = read.csv("IMD_new.csv")
population_data = left_join(
education_new,
IMD,
by = c("LSOA" = "LSOA")
)
# add msoas to population data - 35672
LSOA_MSOA = read.csv("LSOA_MSOA_2011_new.csv")
population_data = left_join(
education_new,
LSOA_MSOA,
by = c("LSOA" = "LSOA11CD")
)
population_data = population_data %>%
distinct(LSOA, .keep_all = TRUE)
sum(is.na(population_data))
## [1] 0
income = read.csv("household_income_msoa.csv")
population_data = left_join(
population_data,
income,
by = c("MSOA11CD" = "MSOA.code")
)
population_data = population_data %>% dplyr:: select(-c("MSOA.name", "Local.authority.code", "Local.authority.name", "Region.code", "Region.name", "Upper.confidence.limit....", "Lower.confidence.limit....", "Confidence.interval...."))
# Convert to income bands
income_labels = c(
"£0 to £15,000 per year",
"£15,000 to £30,000 per year",
"£30,000 to £50,000 per year",
"£50,000 to £100,000 per year",
"Above £100,000 per year"
)
breaks = c(-Inf, 15000, 30000, 50000, 100000, Inf)
population_data = population_data %>%
mutate(
mean_income_num = as.numeric(gsub(",", "", Total.annual.income....)),
IncomeBand = cut(mean_income_num, breaks = breaks, labels = income_labels, right = TRUE)
)
# Get rid of unecessary columns
population_data = population_data %>% dplyr:: select(-c("Total.annual.income....", "MSOA11CD", "mean_income_num"))
population_size = read.csv("Population_size.csv")
population_size = population_size %>%
filter(!is.na(.[[1]]) & .[[1]] != "") %>%
rename(LSOA = 1) %>%
mutate(LSOA = str_trim(str_extract(LSOA, "^[^:]+")))
population_data = left_join(
population_data,
population_size,
by = c("LSOA" = "LSOA")
)
population_data$All.usual.residents = as.numeric(population_data$All.usual.residents)
# Define variables
levels_Edu = c("Secondary school up to 16 years",
"Higher or secondary or further education (A-Levels, BTEC, etc.)",
"Higher education / degree")
levels_Inc = c("£0 to £15,000 per year",
"£15,000 to £30,000 per year",
"£30,000 to £50,000 per year",
"£50,000 to £100,000 per year",
"Above £100,000 per year")
# Creates every possible combination
micro = expand.grid(
Education = levels_Edu,
Income = levels_Inc
)
micro$w_init = 1 # initial uniform weights
# Takes numbers and converts to proportions
safe_prop = function(x) { x[is.na(x)] <- 0; s <- sum(x); if (s <= 0) rep(1/length(x), length(x)) else x/s }
# Convert proportions to actual counts that add up to N
probs_to_counts = function(p, N) {
p[is.na(p)] <- 0; p <- p / sum(p)
tgt <- round(p * N)
diff <- N - sum(tgt)
if (diff != 0) {
ord <- order(p, decreasing = (diff > 0))
for (i in seq_len(abs(diff))) tgt[ ord[((i-1) %% length(tgt)) + 1] ] <- tgt[ ord[((i-1) %% length(tgt)) + 1] ] + sign(diff)
}
tgt
}
# average income band with weighting
meanband_to_probs = function(mean_band, K = 5, labels = levels_Inc) {
mu = if (is.character(mean_band)) match(mean_band, labels) else as.numeric(mean_band)
mu = min(max(mu, 1), K)
if (abs(mu - 1) < 1e-9) { p <- rep(0, K); p[1] <- 1; return(p) }
if (abs(mu - K) < 1e-9) { p <- rep(0, K); p[K] <- 1; return(p) }
f = function(lambda) { k <- 1:K; w <- exp(lambda * k); sum(k*w)/sum(w) - mu }
lambda = uniroot(f, c(-20, 20))$root
w = exp(lambda * (1:K)); as.numeric(w / sum(w))
}
# target counts for each category
targets_for = function(row) {
N = as.integer(row[["All.usual.residents"]])
out = list()
# Education
p_edu = safe_prop(as.numeric(row[ c("Secondary_school_up_to_16", "Higher_secondary_or_further", "Higher_education") ]))
out$Education = probs_to_counts(p_edu, N)
# Income from average band
income_band = row[["IncomeBand"]]
if(is.na(income_band)) {
income_band = "£30,000 to £50,000 per year" # Default to middle band
}
p_inc = meanband_to_probs(income_band, K = 5, labels = levels_Inc)
out$Income = probs_to_counts(p_inc, N)
out
}
# Converts into mathematical matrices
one_hot = function(x, lev) {
f = factor(x, levels = lev)
mm = model.matrix(~ f - 1L)
colnames(mm) = lev
storage.mode(mm) = "double"
mm
}
X = list(
Education = one_hot(micro$Education, levels_Edu),
Income = one_hot(micro$Income, levels_Inc)
)
# IPF Algorithm
ipf_weights = function(X, targets, max_iter = 200, tol = 1e-6) {
n = nrow(X[[1L]])
w = rep(1, n)
for (it in seq_len(max_iter)) {
w_old = w
for (nm in names(X)) {
Xi = X[[nm]]
tgt = targets[[nm]]
est = as.numeric(colSums(Xi * w))
adj = rep(1, length(tgt))
nz = est > 0
adj[nz] <- tgt[nz] / est[nz]
w = w * as.numeric(Xi %*% adj)
}
if (max(abs(w - w_old)) < tol) break
}
w
}
# Sampling
seed_for = function(x) (sum(utf8ToInt(as.character(x))) * 131) %% .Machine$integer.max
sample_N = function(micro, w, N, lsoa_id) {
if (sum(w) <= 0) w <- rep(1, length(w))
p <- w / sum(w)
set.seed(seed_for(lsoa_id))
idx <- sample.int(nrow(micro), size = N, replace = TRUE, prob = p)
micro[idx, , drop = FALSE]
}
# Generate population
synthetic_pop = population_data %>%
group_split(row_number()) %>%
lapply(function(rr) {
row <- rr[1, ]
lsoa <- row[["LSOA"]]
N <- as.integer(row[["All.usual.residents"]])
tgt <- targets_for(row)
w <- ipf_weights(X, tgt)
syn <- sample_N(micro, w, N, lsoa)
syn$LSOA <- lsoa
syn
}) %>%
bind_rows()
IMD = read.csv("IMD_new.csv")
synthetic_pop = left_join(
synthetic_pop,
IMD,
by = c("LSOA" = "LSOA")
)
synthetic_df = synthetic_pop
MAPPING = c(
"Income." = "Income",
"SOA_decile" = "SOA_decile",
"Education" = "Education"
)
EDU_MAPPING = c(
"Secondary school up to 16 years" = "Secondary school up to 16 years",
"Higher or secondary or further education (A-Levels, BTEC, etc.)" = "Higher or secondary or further education (A-Levels, BTEC, etc.)",
"Higher education / degree" = "Higher education"
)
synthetic_df$Education = unname(EDU_MAPPING[synthetic_df$Education])
names(synthetic_df)[names(synthetic_df) == "Income"] = "Income."
# match training
synthetic_df_fixed = synthetic_df %>%
filter(!is.na(SOA_decile)) %>%
mutate(
Income. = case_when(
Income. == "£15,000 to £30,000 per year" ~ "£15,000 to £30,000 per year ",
Income. == "£30,000 to £50,000 per year" ~ "£30,000 to £50,000 per year ",
Income. == "Above £100,000 per year" ~ "Above £100,000 per year ",
TRUE ~ Income.
),
Education = case_when(
Education == "Higher education / degree" ~ "Higher education", # ← REVERSE THIS
TRUE ~ Education
)
)
# Create model matrix
X_pred = model.matrix(~ 0 + Education + Income. + SOA_decile, data = synthetic_df_fixed)
train_cols = colnames(X_train)
# Add predictions
pred_probs = predict(xgb_best, newdata = X_pred, type = "prob")[, "At_Risk"]
pred_classes = ifelse(pred_probs >= 0.397, "At_Risk", "Low_Risk")
synthetic_df_fixed$Prob_At_Risk = pred_probs
synthetic_df_fixed$Predicted_Risk = pred_classes
# Aggregate to LSOA level
lsoa_results = synthetic_df_fixed %>%
group_by(LSOA) %>%
summarise(
Population = n(),
Percent_At_Risk = round(mean(Predicted_Risk == "At_Risk") * 100),
Avg_Risk_Prob = round(mean(Prob_At_Risk), 3),
Modal_Income = names(sort(table(Income.), decreasing = TRUE))[1],
Modal_Education = names(sort(table(Education), decreasing = TRUE))[1],
SOA_decile = first(SOA_decile),
.groups = "drop"
) %>%
arrange(desc(Percent_At_Risk))
# Results
print(paste("Predicted for", nrow(synthetic_df_fixed), "individuals in", nrow(lsoa_results), "LSOAs"))
## [1] "Predicted for 56075912 individuals in 34753 LSOAs"
print("Top 10 highest risk LSOAs:")
## [1] "Top 10 highest risk LSOAs:"
print(head(lsoa_results, 10))
## # A tibble: 10 × 7
## LSOA Population Percent_At_Risk Avg_Risk_Prob Modal_Income Modal_Education
## <chr> <int> <dbl> <dbl> <chr> <chr>
## 1 E01006… 1727 100 0.825 £0 to £15,0… Secondary scho…
## 2 E01006… 1616 100 0.835 £0 to £15,0… Secondary scho…
## 3 E01006… 1272 100 0.834 £0 to £15,0… Secondary scho…
## 4 E01007… 1368 100 0.833 £0 to £15,0… Secondary scho…
## 5 E01007… 1105 100 0.832 £0 to £15,0… Secondary scho…
## 6 E01007… 1664 100 0.831 £0 to £15,0… Secondary scho…
## 7 E01007… 1752 100 0.831 £0 to £15,0… Secondary scho…
## 8 E01007… 1606 100 0.829 £0 to £15,0… Secondary scho…
## 9 E01007… 1583 100 0.828 £0 to £15,0… Secondary scho…
## 10 E01007… 1784 100 0.831 £0 to £15,0… Secondary scho…
## # ℹ 1 more variable: SOA_decile <int>
# Save
write.csv(lsoa_results, "lsoa_risk_simple.csv", row.names = FALSE)
lsoa_shp = read_sf("LSOA_2011_EW_BFC_V3.shp")
lsoa_shp_map = left_join(
lsoa_results,
lsoa_shp,
by = c("LSOA" = "LSOA11CD")
)
lsoa_shp_map = st_sf(lsoa_shp_map)
lsoa_shp_map = lsoa_shp_map %>%
mutate(
Risk_Category = case_when(
is.na(Percent_At_Risk) ~ "No Data",
Percent_At_Risk >= 80 ~ "Very High (80%+)",
Percent_At_Risk >= 60 ~ "High (60-80%)",
Percent_At_Risk >= 40 ~ "Medium (40-60%)",
Percent_At_Risk >= 20 ~ "Low (20-40%)",
TRUE ~ "Very Low (<20%)"
),
Risk_Category = factor(Risk_Category,
levels = c("Very Low (<20%)", "Low (20-40%)",
"Medium (40-60%)", "High (60-80%)",
"Very High (80%+)", "No Data"))
)
risk_colors = c(
"Very Low (<20%)" = "#2E8B57",
"Low (20-40%)" = "#FFD700",
"Medium (40-60%)" = "#FF8C00",
"High (60-80%)" = "#FF4500",
"Very High (80%+)" = "#8B0000",
"No Data" = "#D3D3D3"
)
summary_stats = lsoa_shp_map %>%
st_drop_geometry() %>%
filter(!is.na(Percent_At_Risk)) %>%
summarise(
n_areas = n(),
mean_risk = round(mean(Percent_At_Risk), 1),
median_risk = round(median(Percent_At_Risk), 1),
high_risk_areas = sum(Percent_At_Risk >= 60),
pct_high_risk = round(100 * high_risk_areas / n_areas, 1)
)
p1_pub = ggplot(lsoa_shp_map) +
geom_sf(aes(fill = Risk_Category),
color = "NA",
linewidth = 0.01) + # Very thin white borders
scale_fill_manual(
values = risk_colors,
name = "Risk Level (%)",
guide = guide_legend(
title.position = "top",
title.hjust = 0.5,
nrow = 1,
override.aes = list(color = "black", linewidth = 0.5) # Clean legend
)
) +
# Scale bar and north arrow
annotation_scale(
location = "bl",
width_hint = 0.25,
style = "ticks",
line_col = "black",
text_col = "black",
text_cex = 0.9
) +
annotation_north_arrow(
location = "tr",
which_north = "true",
style = north_arrow_minimal(text_size = 8),
height = unit(1.2, "cm"),
width = unit(1.2, "cm")
) +
theme_void() +
theme(
text = element_text(family = "Times", color = "black"),
plot.title = element_text(
size = 12,
face = "bold",
hjust = 0.5,
margin = ggplot2::margin(b = 5)
),
plot.subtitle = element_text(
size = 10,
hjust = 0.5,
margin = ggplot2::margin(b = 10),
color = "gray30"
),
plot.caption = element_text(
size = 7,
hjust = 0,
margin = ggplot2::margin(t = 10),
color = "gray50"
),
legend.position = "bottom",
legend.title = element_text(size = 11, face = "bold"),
legend.text = element_text(size = 10),
legend.margin = ggplot2::margin(t = 15),
legend.box.spacing = unit(0.5, "cm"),
plot.margin = ggplot2::margin(20, 20, 20, 20),
panel.background = element_rect(fill = "white", color = NA),
plot.background = element_rect(fill = "white", color = NA)
) +
labs(
title = "Coeliac Disease Related Economic Risk Predictions Across English Lower Super Output Areas",
subtitle = sprintf(
"Mean risk: %s%% | %s LSOAs (%s%%) classified as high risk (≥60%%)",
summary_stats$mean_risk,
scales::comma(summary_stats$high_risk_areas),
summary_stats$pct_high_risk
),
caption = paste0(
"Data: Synthetic population predictions (n = ",
scales::comma(summary_stats$n_areas),
" LSOAs) | ", "Model: XGBoost classification "
)
)
p1_pub
labels = c("Very Low (<20%)","Low (20–40%)","Medium (40–60%)",
"High (60–80%)","Very High (80%+)")
risk_num = lsoa_results$Percent_At_Risk / 100
breaks= c(-Inf, 0.20, 0.40, 0.60, 0.80, Inf)
df = lsoa_results %>%
mutate(
Risk_Band = cut(risk_num, breaks = breaks, labels = labels, right = FALSE)
)
band_counts = df %>%
count(Risk_Band, name = "n") %>%
mutate(perc = round(100 * n / sum(n), 1)) %>%
arrange(match(Risk_Band, labels))
band_counts
## # A tibble: 5 × 3
## Risk_Band n perc
## <fct> <int> <dbl>
## 1 Very Low (<20%) 5255 15.1
## 2 Low (20–40%) 8242 23.7
## 3 Medium (40–60%) 15126 43.5
## 4 High (60–80%) 1109 3.2
## 5 Very High (80%+) 5021 14.4