Data Loading
library(readr)
data <- read_csv("C:/Users/Fakudze/Downloads/archive (31).zip")
View(data)
Data Examination
library(skimr)
skim(data)
Data summary
| Name |
data |
| Number of rows |
1205 |
| Number of columns |
12 |
| _______________________ |
|
| Column type frequency: |
|
| character |
1 |
| numeric |
11 |
| ________________________ |
|
| Group variables |
None |
Variable type: character
| Risk Level |
18 |
0.99 |
3 |
4 |
0 |
2 |
0 |
Variable type: numeric
| Age |
0 |
1.00 |
27.48 |
9.20 |
10 |
21.00 |
25.0 |
31.0 |
65 |
▅▇▂▁▁ |
| Systolic BP |
5 |
1.00 |
116.82 |
18.72 |
70 |
100.00 |
120.0 |
130.0 |
200 |
▂▇▃▁▁ |
| Diastolic |
4 |
1.00 |
77.17 |
14.31 |
40 |
65.00 |
80.0 |
90.0 |
140 |
▃▇▅▁▁ |
| BS |
2 |
1.00 |
7.50 |
3.05 |
3 |
6.00 |
6.9 |
7.9 |
19 |
▅▇▁▁▁ |
| Body Temp |
0 |
1.00 |
98.40 |
1.09 |
97 |
98.00 |
98.0 |
98.0 |
103 |
▇▁▁▁▁ |
| BMI |
18 |
0.99 |
23.32 |
3.88 |
0 |
20.45 |
23.0 |
25.0 |
37 |
▁▁▇▇▁ |
| Previous Complications |
2 |
1.00 |
0.18 |
0.38 |
0 |
0.00 |
0.0 |
0.0 |
1 |
▇▁▁▁▂ |
| Preexisting Diabetes |
2 |
1.00 |
0.29 |
0.45 |
0 |
0.00 |
0.0 |
1.0 |
1 |
▇▁▁▁▃ |
| Gestational Diabetes |
0 |
1.00 |
0.12 |
0.32 |
0 |
0.00 |
0.0 |
0.0 |
1 |
▇▁▁▁▁ |
| Mental Health |
0 |
1.00 |
0.33 |
0.47 |
0 |
0.00 |
0.0 |
1.0 |
1 |
▇▁▁▁▅ |
| Heart Rate |
2 |
1.00 |
75.82 |
7.23 |
58 |
70.00 |
76.0 |
80.0 |
92 |
▁▃▇▂▂ |
## spc_tbl_ [1,205 × 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Age : num [1:1205] 22 22 27 20 20 22 20 23 22 26 ...
## $ Systolic BP : num [1:1205] 90 110 110 100 90 120 110 110 90 110 ...
## $ Diastolic : num [1:1205] 60 70 70 70 60 70 70 80 60 70 ...
## $ BS : num [1:1205] 9 7.1 7.5 7.2 7.5 7.01 9 7 6.4 12 ...
## $ Body Temp : num [1:1205] 100 98 98 98 98 98 102 98 98 100 ...
## $ BMI : num [1:1205] 18 20.4 23 21.2 19.7 24 17.6 21.3 22 30.2 ...
## $ Previous Complications: num [1:1205] 1 0 1 0 0 0 0 0 0 1 ...
## $ Preexisting Diabetes : num [1:1205] 1 0 0 0 0 0 1 0 0 1 ...
## $ Gestational Diabetes : num [1:1205] 0 0 0 0 0 0 0 0 0 1 ...
## $ Mental Health : num [1:1205] 1 0 0 0 0 0 0 0 0 1 ...
## $ Heart Rate : num [1:1205] 80 74 72 74 74 76 78 74 72 80 ...
## $ Risk Level : chr [1:1205] "High" "Low" "Low" "Low" ...
## - attr(*, "spec")=
## .. cols(
## .. Age = col_double(),
## .. `Systolic BP` = col_double(),
## .. Diastolic = col_double(),
## .. BS = col_double(),
## .. `Body Temp` = col_double(),
## .. BMI = col_double(),
## .. `Previous Complications` = col_double(),
## .. `Preexisting Diabetes` = col_double(),
## .. `Gestational Diabetes` = col_double(),
## .. `Mental Health` = col_double(),
## .. `Heart Rate` = col_double(),
## .. `Risk Level` = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
## Age Systolic BP Diastolic
## 0 5 4
## BS Body Temp BMI
## 2 0 18
## Previous Complications Preexisting Diabetes Gestational Diabetes
## 2 2 0
## Mental Health Heart Rate Risk Level
## 0 2 18
Data Cleaning
Cleaning Missing
Values in factor variables
library(tidyverse)
factor_cols <- sapply(data, is.factor)
for(col in names(data)[factor_cols]){
data[[col]] <- addNA(data[[col]])
levels(data[[col]])[is.na(levels(data[[col]]))] <- "Unknown"
data[[col]][is.na(data[[col]])] <- "Unknown"
}
colSums(is.na(data))
## Age SystolicBP Diastolic
## 0 5 4
## BS BodyTemp BMI
## 2 0 18
## PreviousComplications PreexistingDiabetes GestationalDiabetes
## 0 0 0
## MentalHealth HeartRate RiskLevel
## 0 2 0
Cleaning Missing
Values in numeric variables



boxplot(data$BMI, main = "BMI Distribution")

Exploratory Data
Analysis
Numeric Variables
EDA
Creat a dataset for
numeric variables
library(tidyverse)
num_data <- data %>%
select(where(is.numeric)) %>%
mutate(RiskLevel = data$RiskLevel) %>%
pivot_longer(cols = -RiskLevel,
names_to = "Variable",
values_to = "Value")
Distribution of
numeric variables by Risk Level
ggplot(num_data, aes(x = RiskLevel, y = Value, fill = RiskLevel)) +
geom_boxplot(alpha = 0.85, outlier.size = 1.5) +
facet_wrap(~ Variable, scales = "free_y", ncol = 3) +
scale_fill_manual(values = c("High" = "#E74C3C",
"Low" = "#2ECC71",
"Unknown" = "#95A5A6")) +
labs(title = "Numeric Variables by Risk Level",
subtitle = "Clear separation visible in BS, Blood Pressure, and Age",
x = "Risk Level",
y = "Value") +
theme_minimal(base_size = 14) +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
strip.text = element_text(face = "bold"),
legend.position = "bottom")

Factor Variables
EDA
Create a dataset
for factor variables
fact_data <- data %>%
select(where(is.factor)) %>%
select(-RiskLevel) %>%
mutate(RiskLevel = data$RiskLevel) %>%
pivot_longer(cols = -RiskLevel,
names_to = "Variable",
values_to = "Value")
Distribution of
Factor variables by Risk Level
ggplot(fact_data, aes(x = Value, fill = RiskLevel)) +
geom_bar(position = "fill", alpha = 0.85) +
facet_wrap(~ Variable, scales = "free_x", ncol = 2) +
scale_fill_manual(values = c("Low" = "#2ECC71",
"High" = "#E74C3C",
"Unknown" = "#95A5A6")) +
labs(title = "Categorical Variables vs Risk Level",
subtitle = "Proportion of Risk Level within each category (Strong Predictive Power)",
x = "Category Value",
y = "Proportion",
fill = "Risk Level") +
theme_minimal(base_size = 14) +
theme(axis.text.x = element_text(angle = 0, size = 12),
strip.text = element_text(face = "bold", size = 13),
legend.position = "bottom",
plot.subtitle = element_text(size = 12, color = "darkblue")) +
geom_hline(yintercept = 0.5, linetype = "dashed", color = "black", alpha = 0.3)

Selecting Predictive
variables
Pred_data <- data %>%
select(RiskLevel, Age, BMI, BS, Diastolic, HeartRate, GestationalDiabetes, MentalHealth,
PreexistingDiabetes, PreviousComplications)
Data Splitting
library(tidymodels)
library(themis)
library(doParallel)
set.seed(100)
data_split <- initial_split(Pred_data, prop = 0.8, strata = RiskLevel)
train_data <- training(data_split)
test_data <- testing(data_split)
Cross-validation
folds
folds <- vfold_cv(
train_data,
v = 5,
strata = RiskLevel
)
Preprocessing
Recipe
recipe_spec <- recipe(RiskLevel ~., data = train_data) %>%
step_dummy(all_nominal_predictors(), one_hot = T) %>%
step_normalize(all_numeric_predictors()) %>%
step_zv(all_predictors()) %>%
step_corr(all_numeric_predictors(), threshold = 0.9)
Prep and Bake
recipe_prep <- prep(recipe_spec)
bake(recipe_prep, new_data = NULL) %>%head()
Defining different
models (For comparison)
Logistic Regression
(Baseline)
log_reg_spec <- logistic_reg() %>%
set_engine("glm") %>%
set_mode("classification")
Random Forest
rf_spec <- rand_forest(trees = tune(), mtry = tune(), min_n = tune()) %>%
set_engine("ranger", importance = "permutation") %>%
set_mode("classification")
XGBoost
xgb_spec <- boost_tree(
trees = tune(),
mtry = tune(),
min_n = tune(),
learn_rate = tune(),
tree_depth = tune()
) %>%
set_engine("xgboost") %>%
set_mode("classification")
Support Vector
Machine (SVM)
svm_spec <- svm_rbf(cost = tune(), rbf_sigma = tune()) %>%
set_engine("kernlab") %>%
set_mode("classification")
Defining
workflows
log_wf <- workflow() %>%
add_recipe(recipe_spec) %>%
add_model(log_reg_spec)
rf_wf <- workflow() %>%
add_recipe(recipe_spec) %>%
add_model(rf_spec)
xgb_wf <- workflow() %>%
add_recipe(recipe_spec) %>%
add_model(xgb_spec)
svm_wf <- workflow() %>%
add_recipe(recipe_spec) %>%
add_model(svm_spec)
Hyperparameter tuning
Random Forest and XGB (using parallel processing)
library(dials)
library(parsnip)
library(tune)
cl <- makePSOCKcluster(parallel::detectCores()- 1)
registerDoParallel(cl)
rf_grid <- grid_latin_hypercube(
trees(range = c(100, 800)),
mtry(range = c(2, 8)),
min_n(range = c(2, 10)),
size = 15
)
xgb_grid <- grid_latin_hypercube(
trees(range = c(100, 1000)),
mtry(range = c(2, 8)),
min_n(range = c(2, 10)),
learn_rate(range = c(-2, -0.1)),
tree_depth(range = c(3, 10)),
size = 20
)
Tuning models (Random
Forest & XGB)
library(xgboost)
rf_tune <- tune_grid(
rf_wf,
resamples = folds,
grid = rf_grid,
metrics = metric_set(accuracy, roc_auc, f_meas),
control = control_grid(save_pred = TRUE)
)
xgb_tune <- tune_grid(
xgb_wf,
resamples = folds,
grid = xgb_grid,
metrics = metric_set(accuracy, roc_auc, f_meas),
control = control_grid(save_pred = TRUE)
)
Selecting best
parameters
best_rf <- select_best(rf_tune, metric = "roc_auc")
best_xbg <- select_best(xgb_tune, metric = "roc_auc")
Finalizing the
workflows
rf_final_wf <- finalize_workflow(rf_wf, best_rf)
xgb_final_wf <- finalize_workflow(xgb_wf, best_xbg)
Final model fitting on
training data
rf_final_fit <- fit(rf_final_wf, data = train_data)
xgb_final_fit <- fit(xgb_final_wf, data = train_data)
Model evaluation on
test set
test_results <- bind_rows(
augment(rf_final_fit, new_data = test_data) %>%
mutate(model = "Random Forest"),
augment(xgb_final_fit, new_data = test_data) %>%
mutate(model = "XGBoost")
)
test_metrics <- test_results %>%
group_by(model) %>%
metrics(truth = RiskLevel, estimate = .pred_class)
test_roc <- test_results %>%
group_by(model) %>%
roc_auc(truth = RiskLevel, .pred_Low, .pred_High, .pred_Unknown)
print(test_metrics)
## # A tibble: 4 × 4
## model .metric .estimator .estimate
## <chr> <chr> <chr> <dbl>
## 1 Random Forest accuracy multiclass 0.946
## 2 XGBoost accuracy multiclass 0.959
## 3 Random Forest kap multiclass 0.891
## 4 XGBoost kap multiclass 0.916
## # A tibble: 2 × 4
## model .metric .estimator .estimate
## <chr> <chr> <chr> <dbl>
## 1 Random Forest roc_auc hand_till 0.270
## 2 XGBoost roc_auc hand_till 0.304
Confusion Matrix for
best model
conf_mat <- test_results %>%
filter(model == "XGBoost") %>%
conf_mat(truth = RiskLevel, estimate = .pred_class)
conf_mat
## Truth
## Prediction High Low Unknown
## High 92 1 2
## Low 1 139 6
## Unknown 0 0 0
Variable importance
(for tree-based models)
library(vip)
vip(xgb_final_fit)

Choose the best model
based on perfomance
best_model <- xgb_final_fit
Saving the final
model
saveRDS(best_model, "maternal_risk_xgboost_model.rds")