1 Data Loading

library(readr)
data <- read_csv("C:/Users/Fakudze/Downloads/archive (31).zip")
View(data)

2 Data Examination

library(skimr)
skim(data)
Data summary
Name data
Number of rows 1205
Number of columns 12
_______________________
Column type frequency:
character 1
numeric 11
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
Risk Level 18 0.99 3 4 0 2 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Age 0 1.00 27.48 9.20 10 21.00 25.0 31.0 65 ▅▇▂▁▁
Systolic BP 5 1.00 116.82 18.72 70 100.00 120.0 130.0 200 ▂▇▃▁▁
Diastolic 4 1.00 77.17 14.31 40 65.00 80.0 90.0 140 ▃▇▅▁▁
BS 2 1.00 7.50 3.05 3 6.00 6.9 7.9 19 ▅▇▁▁▁
Body Temp 0 1.00 98.40 1.09 97 98.00 98.0 98.0 103 ▇▁▁▁▁
BMI 18 0.99 23.32 3.88 0 20.45 23.0 25.0 37 ▁▁▇▇▁
Previous Complications 2 1.00 0.18 0.38 0 0.00 0.0 0.0 1 ▇▁▁▁▂
Preexisting Diabetes 2 1.00 0.29 0.45 0 0.00 0.0 1.0 1 ▇▁▁▁▃
Gestational Diabetes 0 1.00 0.12 0.32 0 0.00 0.0 0.0 1 ▇▁▁▁▁
Mental Health 0 1.00 0.33 0.47 0 0.00 0.0 1.0 1 ▇▁▁▁▅
Heart Rate 2 1.00 75.82 7.23 58 70.00 76.0 80.0 92 ▁▃▇▂▂
str(data)
## spc_tbl_ [1,205 × 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Age                   : num [1:1205] 22 22 27 20 20 22 20 23 22 26 ...
##  $ Systolic BP           : num [1:1205] 90 110 110 100 90 120 110 110 90 110 ...
##  $ Diastolic             : num [1:1205] 60 70 70 70 60 70 70 80 60 70 ...
##  $ BS                    : num [1:1205] 9 7.1 7.5 7.2 7.5 7.01 9 7 6.4 12 ...
##  $ Body Temp             : num [1:1205] 100 98 98 98 98 98 102 98 98 100 ...
##  $ BMI                   : num [1:1205] 18 20.4 23 21.2 19.7 24 17.6 21.3 22 30.2 ...
##  $ Previous Complications: num [1:1205] 1 0 1 0 0 0 0 0 0 1 ...
##  $ Preexisting Diabetes  : num [1:1205] 1 0 0 0 0 0 1 0 0 1 ...
##  $ Gestational Diabetes  : num [1:1205] 0 0 0 0 0 0 0 0 0 1 ...
##  $ Mental Health         : num [1:1205] 1 0 0 0 0 0 0 0 0 1 ...
##  $ Heart Rate            : num [1:1205] 80 74 72 74 74 76 78 74 72 80 ...
##  $ Risk Level            : chr [1:1205] "High" "Low" "Low" "Low" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Age = col_double(),
##   ..   `Systolic BP` = col_double(),
##   ..   Diastolic = col_double(),
##   ..   BS = col_double(),
##   ..   `Body Temp` = col_double(),
##   ..   BMI = col_double(),
##   ..   `Previous Complications` = col_double(),
##   ..   `Preexisting Diabetes` = col_double(),
##   ..   `Gestational Diabetes` = col_double(),
##   ..   `Mental Health` = col_double(),
##   ..   `Heart Rate` = col_double(),
##   ..   `Risk Level` = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>
colSums(is.na(data))
##                    Age            Systolic BP              Diastolic 
##                      0                      5                      4 
##                     BS              Body Temp                    BMI 
##                      2                      0                     18 
## Previous Complications   Preexisting Diabetes   Gestational Diabetes 
##                      2                      2                      0 
##          Mental Health             Heart Rate             Risk Level 
##                      0                      2                     18

3 Data Cleaning

3.1 Renaming long format variables

library(dplyr)
data <- data %>% 
  rename(SystolicBP = `Systolic BP`,
         BodyTemp = `Body Temp`,
         PreviousComplications = `Previous Complications`,
         PreexistingDiabetes = `Preexisting Diabetes`,
         GestationalDiabetes = `Gestational Diabetes`,
         MentalHealth = `Mental Health`,
         HeartRate = `Heart Rate`,
         RiskLevel = `Risk Level`) %>% 
  mutate(PreviousComplications = factor(PreviousComplications),
         PreexistingDiabetes = factor(PreexistingDiabetes),
         GestationalDiabetes = factor(GestationalDiabetes),
         MentalHealth = factor(MentalHealth),
         RiskLevel = factor(RiskLevel))

str(data)
## tibble [1,205 × 12] (S3: tbl_df/tbl/data.frame)
##  $ Age                  : num [1:1205] 22 22 27 20 20 22 20 23 22 26 ...
##  $ SystolicBP           : num [1:1205] 90 110 110 100 90 120 110 110 90 110 ...
##  $ Diastolic            : num [1:1205] 60 70 70 70 60 70 70 80 60 70 ...
##  $ BS                   : num [1:1205] 9 7.1 7.5 7.2 7.5 7.01 9 7 6.4 12 ...
##  $ BodyTemp             : num [1:1205] 100 98 98 98 98 98 102 98 98 100 ...
##  $ BMI                  : num [1:1205] 18 20.4 23 21.2 19.7 24 17.6 21.3 22 30.2 ...
##  $ PreviousComplications: Factor w/ 2 levels "0","1": 2 1 2 1 1 1 1 1 1 2 ...
##  $ PreexistingDiabetes  : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 2 1 1 2 ...
##  $ GestationalDiabetes  : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 2 ...
##  $ MentalHealth         : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 1 1 1 2 ...
##  $ HeartRate            : num [1:1205] 80 74 72 74 74 76 78 74 72 80 ...
##  $ RiskLevel            : Factor w/ 2 levels "High","Low": 1 2 2 2 2 2 1 2 2 1 ...

3.2 Cleaning Missing Values in factor variables

library(tidyverse)
factor_cols <- sapply(data, is.factor)

for(col in names(data)[factor_cols]){
  data[[col]] <- addNA(data[[col]])
  levels(data[[col]])[is.na(levels(data[[col]]))] <- "Unknown"
  data[[col]][is.na(data[[col]])] <- "Unknown"
}

colSums(is.na(data))
##                   Age            SystolicBP             Diastolic 
##                     0                     5                     4 
##                    BS              BodyTemp                   BMI 
##                     2                     0                    18 
## PreviousComplications   PreexistingDiabetes   GestationalDiabetes 
##                     0                     0                     0 
##          MentalHealth             HeartRate             RiskLevel 
##                     0                     2                     0

3.3 Cleaning Missing Values in numeric variables

boxplot(data$SystolicBP)

boxplot(data$Diastolic)

boxplot(data$BS)

boxplot(data$BMI, main = "BMI Distribution")

3.4 Median Imputation Method

data <- data %>% 
  mutate(across(where(is.numeric), 
                ~ifelse(is.na(.), median(., na.rm = TRUE),.)))

colSums(is.na(data))
##                   Age            SystolicBP             Diastolic 
##                     0                     0                     0 
##                    BS              BodyTemp                   BMI 
##                     0                     0                     0 
## PreviousComplications   PreexistingDiabetes   GestationalDiabetes 
##                     0                     0                     0 
##          MentalHealth             HeartRate             RiskLevel 
##                     0                     0                     0

4 Exploratory Data Analysis

4.1 Numeric Variables EDA

4.1.1 Creat a dataset for numeric variables

library(tidyverse)
num_data <- data %>%
  select(where(is.numeric)) %>%
  mutate(RiskLevel = data$RiskLevel) %>%          
  pivot_longer(cols = -RiskLevel,
               names_to = "Variable",
               values_to = "Value")

4.1.2 Distribution of numeric variables by Risk Level

ggplot(num_data, aes(x = RiskLevel, y = Value, fill = RiskLevel)) +
  geom_boxplot(alpha = 0.85, outlier.size = 1.5) +
  facet_wrap(~ Variable, scales = "free_y", ncol = 3) +
  scale_fill_manual(values = c("High" = "#E74C3C",
                               "Low" = "#2ECC71",
                               "Unknown" = "#95A5A6")) +
  labs(title = "Numeric Variables by Risk Level",
       subtitle = "Clear separation visible in BS, Blood Pressure, and Age",
       x = "Risk Level",
       y = "Value") +
  theme_minimal(base_size = 14) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        strip.text = element_text(face = "bold"),
        legend.position = "bottom")

4.2 Factor Variables EDA

4.2.1 Create a dataset for factor variables

fact_data <- data %>%
  select(where(is.factor)) %>%
  select(-RiskLevel) %>%                          
  mutate(RiskLevel = data$RiskLevel) %>%
  pivot_longer(cols = -RiskLevel,
               names_to = "Variable",
               values_to = "Value")

4.2.2 Distribution of Factor variables by Risk Level

ggplot(fact_data, aes(x = Value, fill = RiskLevel)) +
  geom_bar(position = "fill", alpha = 0.85) +
  facet_wrap(~ Variable, scales = "free_x", ncol = 2) +
  scale_fill_manual(values = c("Low" = "#2ECC71",      
                               "High" = "#E74C3C",  
                               "Unknown" = "#95A5A6")) + 
  labs(title = "Categorical Variables vs Risk Level",
       subtitle = "Proportion of Risk Level within each category (Strong Predictive Power)",
       x = "Category Value",
       y = "Proportion",
       fill = "Risk Level") +
  theme_minimal(base_size = 14) +
  theme(axis.text.x = element_text(angle = 0, size = 12),
        strip.text = element_text(face = "bold", size = 13),
        legend.position = "bottom",
        plot.subtitle = element_text(size = 12, color = "darkblue")) +
  geom_hline(yintercept = 0.5, linetype = "dashed", color = "black", alpha = 0.3)

5 Selecting Predictive variables

Pred_data <- data %>% 
  select(RiskLevel, Age, BMI, BS, Diastolic, HeartRate, GestationalDiabetes, MentalHealth,
         PreexistingDiabetes, PreviousComplications)

6 Data Splitting

library(tidymodels)
library(themis)
library(doParallel)

set.seed(100)
data_split <- initial_split(Pred_data, prop = 0.8, strata = RiskLevel)
train_data <- training(data_split)
test_data <- testing(data_split)

7 Cross-validation folds

folds <- vfold_cv(
  train_data,
  v = 5,
  strata = RiskLevel
)

8 Preprocessing Recipe

recipe_spec <- recipe(RiskLevel ~., data = train_data) %>% 
  step_dummy(all_nominal_predictors(), one_hot = T) %>% 
  step_normalize(all_numeric_predictors()) %>% 
  step_zv(all_predictors()) %>% 
  step_corr(all_numeric_predictors(), threshold = 0.9)

9 Prep and Bake

recipe_prep <- prep(recipe_spec)
bake(recipe_prep, new_data = NULL) %>%head()

10 Defining different models (For comparison)

10.1 Logistic Regression (Baseline)

log_reg_spec <- logistic_reg() %>% 
  set_engine("glm") %>% 
  set_mode("classification")

10.2 Random Forest

rf_spec <- rand_forest(trees = tune(), mtry = tune(), min_n = tune()) %>% 
  set_engine("ranger", importance = "permutation") %>% 
  set_mode("classification")

10.3 XGBoost

xgb_spec <- boost_tree(
  trees = tune(),
  mtry = tune(),
  min_n = tune(),
  learn_rate = tune(),
  tree_depth = tune()
) %>% 
  set_engine("xgboost") %>% 
  set_mode("classification")

10.4 Support Vector Machine (SVM)

svm_spec <- svm_rbf(cost = tune(), rbf_sigma = tune()) %>% 
  set_engine("kernlab") %>% 
  set_mode("classification")

11 Defining workflows

log_wf <- workflow() %>% 
  add_recipe(recipe_spec) %>% 
  add_model(log_reg_spec)

rf_wf <- workflow() %>% 
  add_recipe(recipe_spec) %>% 
  add_model(rf_spec)

xgb_wf <- workflow() %>% 
  add_recipe(recipe_spec) %>% 
  add_model(xgb_spec)

svm_wf <- workflow() %>% 
  add_recipe(recipe_spec) %>% 
  add_model(svm_spec)

12 Hyperparameter tuning Random Forest and XGB (using parallel processing)

library(dials)
library(parsnip)
library(tune)

cl <- makePSOCKcluster(parallel::detectCores()- 1)
registerDoParallel(cl)

rf_grid <- grid_latin_hypercube(
  trees(range = c(100, 800)),
  mtry(range = c(2, 8)),
  min_n(range = c(2, 10)),
  size = 15
)

xgb_grid <- grid_latin_hypercube(
  trees(range = c(100, 1000)),
  mtry(range = c(2, 8)),
  min_n(range = c(2, 10)),
  learn_rate(range = c(-2, -0.1)),
  tree_depth(range = c(3, 10)),
  size = 20
)

13 Tuning models (Random Forest & XGB)

library(xgboost)

rf_tune <- tune_grid(
  rf_wf,
  resamples = folds,
  grid = rf_grid,
  metrics = metric_set(accuracy, roc_auc, f_meas),
  control = control_grid(save_pred = TRUE)
)

xgb_tune <- tune_grid(
  xgb_wf,
  resamples = folds,
  grid = xgb_grid,
  metrics = metric_set(accuracy, roc_auc, f_meas),
  control = control_grid(save_pred = TRUE)
)

14 Selecting best parameters

best_rf <- select_best(rf_tune, metric = "roc_auc")
best_xbg <- select_best(xgb_tune, metric = "roc_auc")

15 Finalizing the workflows

rf_final_wf <- finalize_workflow(rf_wf, best_rf)
xgb_final_wf <- finalize_workflow(xgb_wf, best_xbg)

16 Final model fitting on training data

rf_final_fit <- fit(rf_final_wf, data = train_data)
xgb_final_fit <- fit(xgb_final_wf, data = train_data)

17 Model evaluation on test set

test_results <- bind_rows(
  augment(rf_final_fit, new_data = test_data) %>% 
    mutate(model = "Random Forest"),
  augment(xgb_final_fit, new_data = test_data) %>% 
    mutate(model = "XGBoost")
)

test_metrics <- test_results %>% 
  group_by(model) %>% 
  metrics(truth = RiskLevel, estimate = .pred_class)

test_roc <- test_results %>% 
  group_by(model) %>% 
  roc_auc(truth = RiskLevel, .pred_Low, .pred_High, .pred_Unknown)

print(test_metrics)
## # A tibble: 4 × 4
##   model         .metric  .estimator .estimate
##   <chr>         <chr>    <chr>          <dbl>
## 1 Random Forest accuracy multiclass     0.946
## 2 XGBoost       accuracy multiclass     0.959
## 3 Random Forest kap      multiclass     0.891
## 4 XGBoost       kap      multiclass     0.916
print(test_roc)
## # A tibble: 2 × 4
##   model         .metric .estimator .estimate
##   <chr>         <chr>   <chr>          <dbl>
## 1 Random Forest roc_auc hand_till      0.270
## 2 XGBoost       roc_auc hand_till      0.304

18 Confusion Matrix for best model

conf_mat <- test_results %>% 
  filter(model == "XGBoost") %>% 
  conf_mat(truth = RiskLevel, estimate = .pred_class)

conf_mat
##           Truth
## Prediction High Low Unknown
##    High      92   1       2
##    Low        1 139       6
##    Unknown    0   0       0

19 Variable importance (for tree-based models)

library(vip)
vip(xgb_final_fit)

20 Choose the best model based on perfomance

best_model <- xgb_final_fit

21 Saving the final model

saveRDS(best_model, "maternal_risk_xgboost_model.rds")