1 Data Loading

library(readr)
data <- read_csv("C:/Users/Fakudze/Downloads/archive (31).zip")
View(data)

2 Data Examination

library(skimr)
skim(data)

Data summary
Name	data
Number of rows	1205
Number of columns	12
_______________________
Column type frequency:
character	1
numeric	11
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	empty	n_unique	whitespace
Risk Level	18	0.99	3	4	0	2	0

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
Age	0	1.00	27.48	9.20	10	21.00	25.0	31.0	65	▅▇▂▁▁
Systolic BP	5	1.00	116.82	18.72	70	100.00	120.0	130.0	200	▂▇▃▁▁
Diastolic	4	1.00	77.17	14.31	40	65.00	80.0	90.0	140	▃▇▅▁▁
BS	2	1.00	7.50	3.05	3	6.00	6.9	7.9	19	▅▇▁▁▁
Body Temp	0	1.00	98.40	1.09	97	98.00	98.0	98.0	103	▇▁▁▁▁
BMI	18	0.99	23.32	3.88	0	20.45	23.0	25.0	37	▁▁▇▇▁
Previous Complications	2	1.00	0.18	0.38	0	0.00	0.0	0.0	1	▇▁▁▁▂
Preexisting Diabetes	2	1.00	0.29	0.45	0	0.00	0.0	1.0	1	▇▁▁▁▃
Gestational Diabetes	0	1.00	0.12	0.32	0	0.00	0.0	0.0	1	▇▁▁▁▁
Mental Health	0	1.00	0.33	0.47	0	0.00	0.0	1.0	1	▇▁▁▁▅
Heart Rate	2	1.00	75.82	7.23	58	70.00	76.0	80.0	92	▁▃▇▂▂

str(data)

## spc_tbl_ [1,205 × 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Age                   : num [1:1205] 22 22 27 20 20 22 20 23 22 26 ...
##  $ Systolic BP           : num [1:1205] 90 110 110 100 90 120 110 110 90 110 ...
##  $ Diastolic             : num [1:1205] 60 70 70 70 60 70 70 80 60 70 ...
##  $ BS                    : num [1:1205] 9 7.1 7.5 7.2 7.5 7.01 9 7 6.4 12 ...
##  $ Body Temp             : num [1:1205] 100 98 98 98 98 98 102 98 98 100 ...
##  $ BMI                   : num [1:1205] 18 20.4 23 21.2 19.7 24 17.6 21.3 22 30.2 ...
##  $ Previous Complications: num [1:1205] 1 0 1 0 0 0 0 0 0 1 ...
##  $ Preexisting Diabetes  : num [1:1205] 1 0 0 0 0 0 1 0 0 1 ...
##  $ Gestational Diabetes  : num [1:1205] 0 0 0 0 0 0 0 0 0 1 ...
##  $ Mental Health         : num [1:1205] 1 0 0 0 0 0 0 0 0 1 ...
##  $ Heart Rate            : num [1:1205] 80 74 72 74 74 76 78 74 72 80 ...
##  $ Risk Level            : chr [1:1205] "High" "Low" "Low" "Low" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Age = col_double(),
##   ..   `Systolic BP` = col_double(),
##   ..   Diastolic = col_double(),
##   ..   BS = col_double(),
##   ..   `Body Temp` = col_double(),
##   ..   BMI = col_double(),
##   ..   `Previous Complications` = col_double(),
##   ..   `Preexisting Diabetes` = col_double(),
##   ..   `Gestational Diabetes` = col_double(),
##   ..   `Mental Health` = col_double(),
##   ..   `Heart Rate` = col_double(),
##   ..   `Risk Level` = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

colSums(is.na(data))

##                    Age            Systolic BP              Diastolic 
##                      0                      5                      4 
##                     BS              Body Temp                    BMI 
##                      2                      0                     18 
## Previous Complications   Preexisting Diabetes   Gestational Diabetes 
##                      2                      2                      0 
##          Mental Health             Heart Rate             Risk Level 
##                      0                      2                     18

3 Data Cleaning

3.1 Renaming long format variables

library(dplyr)
data <- data %>% 
  rename(SystolicBP = `Systolic BP`,
         BodyTemp = `Body Temp`,
         PreviousComplications = `Previous Complications`,
         PreexistingDiabetes = `Preexisting Diabetes`,
         GestationalDiabetes = `Gestational Diabetes`,
         MentalHealth = `Mental Health`,
         HeartRate = `Heart Rate`,
         RiskLevel = `Risk Level`) %>% 
  mutate(PreviousComplications = factor(PreviousComplications),
         PreexistingDiabetes = factor(PreexistingDiabetes),
         GestationalDiabetes = factor(GestationalDiabetes),
         MentalHealth = factor(MentalHealth),
         RiskLevel = factor(RiskLevel))

str(data)

## tibble [1,205 × 12] (S3: tbl_df/tbl/data.frame)
##  $ Age                  : num [1:1205] 22 22 27 20 20 22 20 23 22 26 ...
##  $ SystolicBP           : num [1:1205] 90 110 110 100 90 120 110 110 90 110 ...
##  $ Diastolic            : num [1:1205] 60 70 70 70 60 70 70 80 60 70 ...
##  $ BS                   : num [1:1205] 9 7.1 7.5 7.2 7.5 7.01 9 7 6.4 12 ...
##  $ BodyTemp             : num [1:1205] 100 98 98 98 98 98 102 98 98 100 ...
##  $ BMI                  : num [1:1205] 18 20.4 23 21.2 19.7 24 17.6 21.3 22 30.2 ...
##  $ PreviousComplications: Factor w/ 2 levels "0","1": 2 1 2 1 1 1 1 1 1 2 ...
##  $ PreexistingDiabetes  : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 2 1 1 2 ...
##  $ GestationalDiabetes  : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 2 ...
##  $ MentalHealth         : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 1 1 1 2 ...
##  $ HeartRate            : num [1:1205] 80 74 72 74 74 76 78 74 72 80 ...
##  $ RiskLevel            : Factor w/ 2 levels "High","Low": 1 2 2 2 2 2 1 2 2 1 ...

3.2 Cleaning Missing Values in factor variables

library(tidyverse)
factor_cols <- sapply(data, is.factor)

for(col in names(data)[factor_cols]){
  data[[col]] <- addNA(data[[col]])
  levels(data[[col]])[is.na(levels(data[[col]]))] <- "Unknown"
  data[[col]][is.na(data[[col]])] <- "Unknown"
}

colSums(is.na(data))

##                   Age            SystolicBP             Diastolic 
##                     0                     5                     4 
##                    BS              BodyTemp                   BMI 
##                     2                     0                    18 
## PreviousComplications   PreexistingDiabetes   GestationalDiabetes 
##                     0                     0                     0 
##          MentalHealth             HeartRate             RiskLevel 
##                     0                     2                     0

3.3 Cleaning Missing Values in numeric variables

boxplot(data$SystolicBP)

boxplot(data$Diastolic)

boxplot(data$BS)

boxplot(data$BMI, main = "BMI Distribution")

3.4 Median Imputation Method

data <- data %>% 
  mutate(across(where(is.numeric), 
                ~ifelse(is.na(.), median(., na.rm = TRUE),.)))

colSums(is.na(data))

##                   Age            SystolicBP             Diastolic 
##                     0                     0                     0 
##                    BS              BodyTemp                   BMI 
##                     0                     0                     0 
## PreviousComplications   PreexistingDiabetes   GestationalDiabetes 
##                     0                     0                     0 
##          MentalHealth             HeartRate             RiskLevel 
##                     0                     0                     0

4 Exploratory Data Analysis

4.1 Numeric Variables EDA

4.1.1 Creat a dataset for numeric variables

library(tidyverse)
num_data <- data %>%
  select(where(is.numeric)) %>%
  mutate(RiskLevel = data$RiskLevel) %>%          
  pivot_longer(cols = -RiskLevel,
               names_to = "Variable",
               values_to = "Value")

4.1.2 Distribution of numeric variables by Risk Level

ggplot(num_data, aes(x = RiskLevel, y = Value, fill = RiskLevel)) +
  geom_boxplot(alpha = 0.85, outlier.size = 1.5) +
  facet_wrap(~ Variable, scales = "free_y", ncol = 3) +
  scale_fill_manual(values = c("High" = "#E74C3C",
                               "Low" = "#2ECC71",
                               "Unknown" = "#95A5A6")) +
  labs(title = "Numeric Variables by Risk Level",
       subtitle = "Clear separation visible in BS, Blood Pressure, and Age",
       x = "Risk Level",
       y = "Value") +
  theme_minimal(base_size = 14) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        strip.text = element_text(face = "bold"),
        legend.position = "bottom")

4.2 Factor Variables EDA

4.2.1 Create a dataset for factor variables

fact_data <- data %>%
  select(where(is.factor)) %>%
  select(-RiskLevel) %>%                          
  mutate(RiskLevel = data$RiskLevel) %>%
  pivot_longer(cols = -RiskLevel,
               names_to = "Variable",
               values_to = "Value")

4.2.2 Distribution of Factor variables by Risk Level

ggplot(fact_data, aes(x = Value, fill = RiskLevel)) +
  geom_bar(position = "fill", alpha = 0.85) +
  facet_wrap(~ Variable, scales = "free_x", ncol = 2) +
  scale_fill_manual(values = c("Low" = "#2ECC71",      
                               "High" = "#E74C3C",  
                               "Unknown" = "#95A5A6")) + 
  labs(title = "Categorical Variables vs Risk Level",
       subtitle = "Proportion of Risk Level within each category (Strong Predictive Power)",
       x = "Category Value",
       y = "Proportion",
       fill = "Risk Level") +
  theme_minimal(base_size = 14) +
  theme(axis.text.x = element_text(angle = 0, size = 12),
        strip.text = element_text(face = "bold", size = 13),
        legend.position = "bottom",
        plot.subtitle = element_text(size = 12, color = "darkblue")) +
  geom_hline(yintercept = 0.5, linetype = "dashed", color = "black", alpha = 0.3)

5 Selecting Predictive variables

Pred_data <- data %>% 
  select(RiskLevel, Age, BMI, BS, Diastolic, HeartRate, GestationalDiabetes, MentalHealth,
         PreexistingDiabetes, PreviousComplications)

6 Data Splitting

library(tidymodels)
library(themis)
library(doParallel)

set.seed(100)
data_split <- initial_split(Pred_data, prop = 0.8, strata = RiskLevel)
train_data <- training(data_split)
test_data <- testing(data_split)

7 Cross-validation folds

folds <- vfold_cv(
  train_data,
  v = 5,
  strata = RiskLevel
)

8 Preprocessing Recipe

recipe_spec <- recipe(RiskLevel ~., data = train_data) %>% 
  step_dummy(all_nominal_predictors(), one_hot = T) %>% 
  step_normalize(all_numeric_predictors()) %>% 
  step_zv(all_predictors()) %>% 
  step_corr(all_numeric_predictors(), threshold = 0.9)

9 Prep and Bake

recipe_prep <- prep(recipe_spec)
bake(recipe_prep, new_data = NULL) %>%head()

10 Defining different models (For comparison)

10.1 Logistic Regression (Baseline)

log_reg_spec <- logistic_reg() %>% 
  set_engine("glm") %>% 
  set_mode("classification")

10.2 Random Forest

rf_spec <- rand_forest(trees = tune(), mtry = tune(), min_n = tune()) %>% 
  set_engine("ranger", importance = "permutation") %>% 
  set_mode("classification")

10.3 XGBoost

xgb_spec <- boost_tree(
  trees = tune(),
  mtry = tune(),
  min_n = tune(),
  learn_rate = tune(),
  tree_depth = tune()
) %>% 
  set_engine("xgboost") %>% 
  set_mode("classification")

10.4 Support Vector Machine (SVM)

svm_spec <- svm_rbf(cost = tune(), rbf_sigma = tune()) %>% 
  set_engine("kernlab") %>% 
  set_mode("classification")

11 Defining workflows

log_wf <- workflow() %>% 
  add_recipe(recipe_spec) %>% 
  add_model(log_reg_spec)

rf_wf <- workflow() %>% 
  add_recipe(recipe_spec) %>% 
  add_model(rf_spec)

xgb_wf <- workflow() %>% 
  add_recipe(recipe_spec) %>% 
  add_model(xgb_spec)

svm_wf <- workflow() %>% 
  add_recipe(recipe_spec) %>% 
  add_model(svm_spec)

12 Hyperparameter tuning Random Forest and XGB (using parallel processing)

library(dials)
library(parsnip)
library(tune)

cl <- makePSOCKcluster(parallel::detectCores()- 1)
registerDoParallel(cl)

rf_grid <- grid_latin_hypercube(
  trees(range = c(100, 800)),
  mtry(range = c(2, 8)),
  min_n(range = c(2, 10)),
  size = 15
)

xgb_grid <- grid_latin_hypercube(
  trees(range = c(100, 1000)),
  mtry(range = c(2, 8)),
  min_n(range = c(2, 10)),
  learn_rate(range = c(-2, -0.1)),
  tree_depth(range = c(3, 10)),
  size = 20
)

13 Tuning models (Random Forest & XGB)

library(xgboost)

rf_tune <- tune_grid(
  rf_wf,
  resamples = folds,
  grid = rf_grid,
  metrics = metric_set(accuracy, roc_auc, f_meas),
  control = control_grid(save_pred = TRUE)
)

xgb_tune <- tune_grid(
  xgb_wf,
  resamples = folds,
  grid = xgb_grid,
  metrics = metric_set(accuracy, roc_auc, f_meas),
  control = control_grid(save_pred = TRUE)
)

14 Selecting best parameters

best_rf <- select_best(rf_tune, metric = "roc_auc")
best_xbg <- select_best(xgb_tune, metric = "roc_auc")

15 Finalizing the workflows

rf_final_wf <- finalize_workflow(rf_wf, best_rf)
xgb_final_wf <- finalize_workflow(xgb_wf, best_xbg)

16 Final model fitting on training data

rf_final_fit <- fit(rf_final_wf, data = train_data)
xgb_final_fit <- fit(xgb_final_wf, data = train_data)

17 Model evaluation on test set

test_results <- bind_rows(
  augment(rf_final_fit, new_data = test_data) %>% 
    mutate(model = "Random Forest"),
  augment(xgb_final_fit, new_data = test_data) %>% 
    mutate(model = "XGBoost")
)

test_metrics <- test_results %>% 
  group_by(model) %>% 
  metrics(truth = RiskLevel, estimate = .pred_class)

test_roc <- test_results %>% 
  group_by(model) %>% 
  roc_auc(truth = RiskLevel, .pred_Low, .pred_High, .pred_Unknown)

print(test_metrics)

## # A tibble: 4 × 4
##   model         .metric  .estimator .estimate
##   <chr>         <chr>    <chr>          <dbl>
## 1 Random Forest accuracy multiclass     0.946
## 2 XGBoost       accuracy multiclass     0.959
## 3 Random Forest kap      multiclass     0.891
## 4 XGBoost       kap      multiclass     0.916

print(test_roc)

## # A tibble: 2 × 4
##   model         .metric .estimator .estimate
##   <chr>         <chr>   <chr>          <dbl>
## 1 Random Forest roc_auc hand_till      0.270
## 2 XGBoost       roc_auc hand_till      0.304

18 Confusion Matrix for best model

conf_mat <- test_results %>% 
  filter(model == "XGBoost") %>% 
  conf_mat(truth = RiskLevel, estimate = .pred_class)

conf_mat

##           Truth
## Prediction High Low Unknown
##    High      92   1       2
##    Low        1 139       6
##    Unknown    0   0       0

19 Variable importance (for tree-based models)

library(vip)
vip(xgb_final_fit)

20 Choose the best model based on perfomance

best_model <- xgb_final_fit

21 Saving the final model

saveRDS(best_model, "maternal_risk_xgboost_model.rds")

Maternal Health Risk Prediction using tidymodels

Celumusa Fakudze Fakudze

2026-05-19

1 Data Loading

2 Data Examination

3 Data Cleaning

3.1 Renaming long format variables

3.2 Cleaning Missing Values in factor variables

3.3 Cleaning Missing Values in numeric variables

3.4 Median Imputation Method

4 Exploratory Data Analysis

4.1 Numeric Variables EDA

4.1.1 Creat a dataset for numeric variables

4.1.2 Distribution of numeric variables by Risk Level

4.2 Factor Variables EDA

4.2.1 Create a dataset for factor variables

4.2.2 Distribution of Factor variables by Risk Level

5 Selecting Predictive variables

6 Data Splitting

7 Cross-validation folds

8 Preprocessing Recipe

9 Prep and Bake

10 Defining different models (For comparison)

10.1 Logistic Regression (Baseline)

10.2 Random Forest

10.3 XGBoost

10.4 Support Vector Machine (SVM)

11 Defining workflows

12 Hyperparameter tuning Random Forest and XGB (using parallel processing)

13 Tuning models (Random Forest & XGB)

14 Selecting best parameters

15 Finalizing the workflows

16 Final model fitting on training data

17 Model evaluation on test set

18 Confusion Matrix for best model

19 Variable importance (for tree-based models)

20 Choose the best model based on perfomance

21 Saving the final model