Data Wizard Group

Universiti Malaya

This is an R Markdown document containing the machine learning and data modelling script for the Stroke Predictor.

## File name: stroke_prediction_v3[1].R
## This R script builds the stroke prediction model using machine learning

# Load necessary libraries
library(readxl)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(ggplot2)
library(DMwR2)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
library(ranger)
## 
## Attaching package: 'ranger'
## The following object is masked from 'package:randomForest':
## 
##     importance
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(PRROC)
library(smotefamily)

# Load the dataset
file_path <- 'stroke prediction_cleaned data.xlsx'
df <- read_excel(file_path)

# Display the first few rows of the dataset
head(df)
## # A tibble: 6 × 12
##      ID Gender   Age Hypertension Heart_Disease Ever_Married Work_Type    
##   <dbl> <chr>  <dbl>        <dbl>         <dbl> <chr>        <chr>        
## 1  9046 Male      67            0             1 Yes          Private_Job  
## 2 51676 Female    61            0             0 Yes          Self_Employed
## 3 31112 Male      80            0             1 Yes          Private_Job  
## 4 60182 Female    49            0             0 Yes          Private_Job  
## 5  1665 Female    79            1             0 Yes          Self_Employed
## 6 56669 Male      81            0             0 Yes          Private_Job  
## # ℹ 5 more variables: Residence_Type <chr>, Avg_Glucose_Level <dbl>, BMI <dbl>,
## #   Smoking_Status <chr>, Stroke <dbl>
# Encode categorical variables as numeric codes
df <- df %>%
  mutate(Gender = recode(Gender, 'Male' = 0, 'Female' = 1, 'Other' = -1),
         Residence_Type = recode(Residence_Type, 'Rural' = 0, 'Urban' = 1),
         Work_Type = recode(Work_Type, 'Private_Job' = 0, 'Self_Employed' = 1, 'Government_Job' = 2, 'Dependant_Never_Worked' = -1),
         Ever_Married = recode(Ever_Married, 'No' = 0, 'Yes' = 1),
         Smoking_Status = recode(Smoking_Status, 'Never_Smoked' = 0, 'Formerly_Smoked' = 1, 'Smokes' = 2, 'Unknown' = -1, 'Unlikely_Smoked' = -2))
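
# Optional sanity check (a minimal sketch): recode() returns NA for any label
# it does not match, so any NA here would signal a category missing from the
# mapping above
stopifnot(!anyNA(df$Gender), !anyNA(df$Work_Type),
          !anyNA(df$Ever_Married), !anyNA(df$Smoking_Status))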

# Feature engineering: create interaction terms that pair Age with the main
# clinical risk factors
df <- df %>%
  mutate(Age_BMI = Age * BMI,
         Hypertension_Age = Hypertension * Age,
         Heart_Disease_Age = Heart_Disease * Age,
         Age_Avg_Glucose_Level = Age * Avg_Glucose_Level)

# Define features and target variable, including the engineered features
X <- df %>%
  select(Gender, Age, Hypertension, Heart_Disease, Work_Type, Avg_Glucose_Level,
         BMI, Age_BMI, Hypertension_Age, Heart_Disease_Age, Age_Avg_Glucose_Level)
y <- df$Stroke
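
# Optional (sketch): a quick look at the engineered columns; summary() flags
# unexpected NAs or extreme products. Run interactively if needed:
# summary(df %>% select(Age_BMI, Hypertension_Age, Heart_Disease_Age, Age_Avg_Glucose_Level))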

# Plot class distribution before resampling (Stroke is still numeric 0/1 at
# this point, so convert it to a factor for a categorical axis)
ggplot(data = df, aes(x = factor(Stroke, levels = c(0, 1), labels = c("NoStroke", "Stroke")))) + 
  geom_bar() +
  ggtitle("Class Distribution Before Resampling") +
  xlab("Stroke") +
  ylab("Count")

# Combine the feature matrix X and target variable y into one data frame for SMOTE
data <- data.frame(X, y)

# Apply SMOTE: K is the number of nearest neighbours used to synthesise new
# minority samples; dup_size = 0 lets smotefamily generate enough synthetic
# cases to approximately balance the classes
set.seed(3)
smote_data <- SMOTE(X = data[, -ncol(data)], target = data$y, K = 5, dup_size = 0)

# Extracting the resampled data
X_smote <- smote_data$data[, -ncol(smote_data$data)]
y_smote <- smote_data$data$class
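
# For reference: smotefamily::SMOTE() returns a list whose $data element holds
# the balanced set (original plus synthetic rows, with a 'class' column), while
# $syn_data holds only the synthetic minority rows. Inspect interactively with:
# str(smote_data$syn_data)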

# Convert y_smote to a factor; caret's classProbs = TRUE (used later) requires
# levels that are valid R names, hence "NoStroke"/"Stroke" rather than 0/1
y_smote <- factor(y_smote, levels = c(0, 1), labels = c("NoStroke", "Stroke"))

# Plotting the class distribution after SMOTE
ggplot(data.frame(y_smote), aes(x = y_smote)) + 
  geom_bar() +
  ggtitle("Class Distribution After SMOTE")

# Print the number of instances in each class; compare against the factor
# labels (y_smote is no longer coded 0/1, so sum(y_smote == 1) would always be 0)
cat('Stroke:', sum(y_smote == "Stroke"), '\n')
cat('No stroke:', sum(y_smote == "NoStroke"), '\n')
# Combine the resampled features and target variable into one data frame
data_smote <- data.frame(X_smote, y_smote)

# Splitting the dataset into training and testing sets
set.seed(42)
trainIndex <- createDataPartition(data_smote$y_smote, p = .8, 
                                  list = FALSE, 
                                  times = 1)
dataTrain <- data_smote[ trainIndex,]
dataTest  <- data_smote[-trainIndex,]

X_train <- dataTrain[, -ncol(dataTrain)]
X_test <- dataTest[, -ncol(dataTest)]
y_train <- dataTrain$y_smote
y_test <- dataTest$y_smote
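
# createDataPartition() samples within each outcome class, so the train and
# test sets should keep (approximately) the same class balance; optionally
# verify with:
# prop.table(table(y_train))
# prop.table(table(y_test))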

# Print the dimensions of the split data
cat("Training Set Dimensions: ", dim(X_train), "\n")
## Training Set Dimensions:  7674 11
cat("Testing Set Dimensions: ", dim(X_test), "\n")
## Testing Set Dimensions:  1918 11
cat("Training Labels Dimensions: ", length(y_train), "\n")
## Training Labels Dimensions:  7674
cat("Testing Labels Dimensions: ", length(y_test), "\n")
## Testing Labels Dimensions:  1918
# Plotting the class distribution in the training set
ggplot(data.frame(y_train), aes(x = y_train)) + 
  geom_bar() +
  ggtitle("Class Distribution in Training Set")

# Print the number of instances in each class in the training set
cat('Stroke (train):', sum(y_train == "Stroke"), '\n')
cat('No stroke (train):', sum(y_train == "NoStroke"), '\n')
# Plotting the class distribution in the testing set
ggplot(data.frame(y_test), aes(x = y_test)) + 
  geom_bar() +
  ggtitle("Class Distribution in Test Set")

# Print the number of instances in each class in the testing set
cat('Stroke (test):', sum(y_test == "Stroke"), '\n')
cat('No stroke (test):', sum(y_test == "NoStroke"), '\n')
# Print column names of X_train and X_test
cat("Column names in X_train:", colnames(X_train), "\n")
## Column names in X_train: Gender Age Hypertension Heart_Disease Work_Type Avg_Glucose_Level BMI Age_BMI Hypertension_Age Heart_Disease_Age Age_Avg_Glucose_Level
cat("Column names in X_test:", colnames(X_test), "\n")
## Column names in X_test: Gender Age Hypertension Heart_Disease Work_Type Avg_Glucose_Level BMI Age_BMI Hypertension_Age Heart_Disease_Age Age_Avg_Glucose_Level
# RANDOM FOREST

# Define the grid of hyperparameters for Random Forest
rf_tuning_grid <- expand.grid(
  mtry = c(2, 3, 5, 6),
  splitrule = "gini",
  min.node.size = 5
)

# Cross-validation setup
rf_cv_control <- trainControl(
  method = "cv",
  number = 5,
  verboseIter = TRUE,
  classProbs = TRUE,
  summaryFunction = twoClassSummary
)
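
# An alternative control (sketch, not used below): score folds on the
# precision-recall curve instead of ROC via caret's prSummary, which reports
# the AUC of the PR curve, Precision, Recall and F (requires the MLmetrics
# package to be installed)
pr_cv_control <- trainControl(
  method = "cv",
  number = 5,
  classProbs = TRUE,
  summaryFunction = prSummary
)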

# Train Random Forest model
rf_model <- train(
  y_smote ~ .,
  data = dataTrain,
  method = "ranger",
  tuneGrid = rf_tuning_grid,
  trControl = rf_cv_control,
  metric = "ROC"
)
## + Fold1: mtry=2, splitrule=gini, min.node.size=5 
## - Fold1: mtry=2, splitrule=gini, min.node.size=5 
## + Fold1: mtry=3, splitrule=gini, min.node.size=5 
## - Fold1: mtry=3, splitrule=gini, min.node.size=5 
## + Fold1: mtry=5, splitrule=gini, min.node.size=5 
## - Fold1: mtry=5, splitrule=gini, min.node.size=5 
## + Fold1: mtry=6, splitrule=gini, min.node.size=5 
## - Fold1: mtry=6, splitrule=gini, min.node.size=5 
## + Fold2: mtry=2, splitrule=gini, min.node.size=5 
## - Fold2: mtry=2, splitrule=gini, min.node.size=5 
## + Fold2: mtry=3, splitrule=gini, min.node.size=5 
## - Fold2: mtry=3, splitrule=gini, min.node.size=5 
## + Fold2: mtry=5, splitrule=gini, min.node.size=5 
## - Fold2: mtry=5, splitrule=gini, min.node.size=5 
## + Fold2: mtry=6, splitrule=gini, min.node.size=5 
## - Fold2: mtry=6, splitrule=gini, min.node.size=5 
## + Fold3: mtry=2, splitrule=gini, min.node.size=5 
## - Fold3: mtry=2, splitrule=gini, min.node.size=5 
## + Fold3: mtry=3, splitrule=gini, min.node.size=5 
## - Fold3: mtry=3, splitrule=gini, min.node.size=5 
## + Fold3: mtry=5, splitrule=gini, min.node.size=5 
## - Fold3: mtry=5, splitrule=gini, min.node.size=5 
## + Fold3: mtry=6, splitrule=gini, min.node.size=5 
## - Fold3: mtry=6, splitrule=gini, min.node.size=5 
## + Fold4: mtry=2, splitrule=gini, min.node.size=5 
## - Fold4: mtry=2, splitrule=gini, min.node.size=5 
## + Fold4: mtry=3, splitrule=gini, min.node.size=5 
## - Fold4: mtry=3, splitrule=gini, min.node.size=5 
## + Fold4: mtry=5, splitrule=gini, min.node.size=5 
## - Fold4: mtry=5, splitrule=gini, min.node.size=5 
## + Fold4: mtry=6, splitrule=gini, min.node.size=5 
## - Fold4: mtry=6, splitrule=gini, min.node.size=5 
## + Fold5: mtry=2, splitrule=gini, min.node.size=5 
## - Fold5: mtry=2, splitrule=gini, min.node.size=5 
## + Fold5: mtry=3, splitrule=gini, min.node.size=5 
## - Fold5: mtry=3, splitrule=gini, min.node.size=5 
## + Fold5: mtry=5, splitrule=gini, min.node.size=5 
## - Fold5: mtry=5, splitrule=gini, min.node.size=5 
## + Fold5: mtry=6, splitrule=gini, min.node.size=5 
## - Fold5: mtry=6, splitrule=gini, min.node.size=5 
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 3, splitrule = gini, min.node.size = 5 on full training set
# Display the model
rf_model
## Random Forest 
## 
## 7674 samples
##   11 predictor
##    2 classes: 'NoStroke', 'Stroke' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 6140, 6139, 6139, 6139, 6139 
## Resampling results across tuning parameters:
## 
##   mtry  ROC        Sens       Spec     
##   2     0.9894043  0.9650290  0.9360634
##   3     0.9916990  0.9711993  0.9421400
##   5     0.9913373  0.9724846  0.9424042
##   6     0.9910812  0.9729988  0.9413474
## 
## Tuning parameter 'splitrule' was held constant at a value of gini
## 
## Tuning parameter 'min.node.size' was held constant at a value of 5
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were mtry = 3, splitrule = gini
##  and min.node.size = 5.
# Predict probabilities on the test set
rf_pred_prob <- predict(rf_model, newdata = X_test, type = "prob")[, "Stroke"]
# Predict classes on the test set
rf_pred <- predict(rf_model, newdata = X_test)

# Ensure the predicted values are factors with correct levels
rf_pred <- factor(rf_pred, levels = c("NoStroke", "Stroke"))

# Evaluate the Random Forest model using confusion matrix
cat("Confusion Matrix for Random Forest:\n")
## Confusion Matrix for Random Forest:
rf_conf_matrix <- confusionMatrix(rf_pred, y_test, positive = "Stroke")
print(rf_conf_matrix)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction NoStroke Stroke
##   NoStroke      945     66
##   Stroke         27    880
##                                           
##                Accuracy : 0.9515          
##                  95% CI : (0.9409, 0.9607)
##     No Information Rate : 0.5068          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.903           
##                                           
##  Mcnemar's Test P-Value : 8.134e-05       
##                                           
##             Sensitivity : 0.9302          
##             Specificity : 0.9722          
##          Pos Pred Value : 0.9702          
##          Neg Pred Value : 0.9347          
##              Prevalence : 0.4932          
##          Detection Rate : 0.4588          
##    Detection Prevalence : 0.4729          
##       Balanced Accuracy : 0.9512          
##                                           
##        'Positive' Class : Stroke          
## 
# Calculate precision, recall, and F1-score for Random Forest
rf_precision <- rf_conf_matrix$byClass["Pos Pred Value"]
rf_recall <- rf_conf_matrix$byClass["Sensitivity"]
rf_f1 <- (2 * rf_precision * rf_recall) / (rf_precision + rf_recall)
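
# The same metrics are also available directly from the confusion matrix in
# recent caret versions; the values should match the manual formula above:
# rf_conf_matrix$byClass[c("Precision", "Recall", "F1")]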

cat("Random Forest Precision: ", rf_precision, "\n")
## Random Forest Precision:  0.9702315
cat("Random Forest Recall: ", rf_recall, "\n")
## Random Forest Recall:  0.9302326
cat("Random Forest F1-score: ", rf_f1, "\n")
## Random Forest F1-score:  0.9498111
# XGBOOST

# Define the grid of hyperparameters for XGBoost
xgb_tuning_grid <- expand.grid(
  nrounds = 3500,
  max_depth = 7,
  eta = 0.01,
  gamma = 0.01,
  colsample_bytree = 0.75,
  min_child_weight = 0,
  subsample = 0.5
)

# Cross-validation setup
xgb_cv_control <- trainControl(
  method = "cv",
  number = 5,
  classProbs = TRUE,
  summaryFunction = twoClassSummary
)

# Train XGBoost model
xgb_model <- train(
  y_smote ~ .,
  data = dataTrain,
  method = "xgbTree",
  tuneGrid = xgb_tuning_grid,
  trControl = xgb_cv_control,
  metric = "ROC"
)


# Display the model
xgb_model
## eXtreme Gradient Boosting 
## 
## 7674 samples
##   11 predictor
##    2 classes: 'NoStroke', 'Stroke' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 6139, 6140, 6139, 6139, 6139 
## Resampling results:
## 
##   ROC        Sens       Spec     
##   0.9927121  0.9740287  0.9503303
## 
## Tuning parameter 'nrounds' was held constant at a value of 3500
## 
## Tuning parameter 'min_child_weight' was held constant at a value of 0
## 
## Tuning parameter 'subsample' was held constant at a value of 0.5
# Predict probabilities on the test set
xgb_pred_prob <- predict(xgb_model, newdata = X_test, type = "prob")[, "Stroke"]
# Predict classes on the test set
xgb_pred <- predict(xgb_model, newdata = X_test)

# Ensure the predicted values are factors with correct levels
xgb_pred <- factor(xgb_pred, levels = c("NoStroke", "Stroke"))

# Evaluate the XGBoost model using confusion matrix
cat("Confusion Matrix for XGBoost:\n")
## Confusion Matrix for XGBoost:
xgb_conf_matrix <- confusionMatrix(xgb_pred, y_test, positive = "Stroke")
print(xgb_conf_matrix)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction NoStroke Stroke
##   NoStroke      945     55
##   Stroke         27    891
##                                           
##                Accuracy : 0.9572          
##                  95% CI : (0.9472, 0.9659)
##     No Information Rate : 0.5068          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9144          
##                                           
##  Mcnemar's Test P-Value : 0.002867        
##                                           
##             Sensitivity : 0.9419          
##             Specificity : 0.9722          
##          Pos Pred Value : 0.9706          
##          Neg Pred Value : 0.9450          
##              Prevalence : 0.4932          
##          Detection Rate : 0.4645          
##    Detection Prevalence : 0.4786          
##       Balanced Accuracy : 0.9570          
##                                           
##        'Positive' Class : Stroke          
## 
# Calculate precision, recall, and F1-score for XGBoost
xgb_precision <- xgb_conf_matrix$byClass["Pos Pred Value"]
xgb_recall <- xgb_conf_matrix$byClass["Sensitivity"]
xgb_f1 <- (2 * xgb_precision * xgb_recall) / (xgb_precision + xgb_recall)

cat("XGBoost Precision: ", xgb_precision, "\n")
## XGBoost Precision:  0.9705882
cat("XGBoost Recall: ", xgb_recall, "\n")
## XGBoost Recall:  0.9418605
cat("XGBoost F1-score: ", xgb_f1, "\n")
## XGBoost F1-score:  0.9560086
# Calculate weights based on AUC for each model
rf_auc <- roc(response = y_test, predictor = rf_pred_prob)$auc
## Setting levels: control = NoStroke, case = Stroke
## Setting direction: controls < cases
xgb_auc <- roc(response = y_test, predictor = xgb_pred_prob)$auc
## Setting levels: control = NoStroke, case = Stroke
## Setting direction: controls < cases
# Normalize the AUC scores to sum to 1 (for weights)
total_auc <- rf_auc + xgb_auc
rf_weight <- rf_auc / total_auc
xgb_weight <- xgb_auc / total_auc

cat("Random Forest Weight: ", rf_weight, "\n")
## Random Forest Weight:  0.4998316
cat("XGBoost Weight: ", xgb_weight, "\n")
## XGBoost Weight:  0.5001684
# Voting Classifier

# Combine predictions using weighted average
combined_pred_prob <- (rf_weight * rf_pred_prob) + (xgb_weight * xgb_pred_prob)
final_pred <- ifelse(combined_pred_prob >= 0.5, "Stroke", "NoStroke")

# Ensure that the levels of the factors are consistent
final_pred <- factor(final_pred, levels = c("NoStroke", "Stroke"))
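
# A small helper that generalises the weighted soft vote to any number of
# models (a sketch; soft_vote() is not from any package). Each element of
# probs is a vector of positive-class probabilities; weights should sum to 1.
soft_vote <- function(probs, weights, threshold = 0.5) {
  combined <- Reduce(`+`, Map(`*`, weights, probs))
  factor(ifelse(combined >= threshold, "Stroke", "NoStroke"),
         levels = c("NoStroke", "Stroke"))
}
# soft_vote(list(rf_pred_prob, xgb_pred_prob), list(rf_weight, xgb_weight))
# reproduces final_pred above.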

# Evaluate the voting classifier using confusion matrix on the resampled test set
cat("Confusion Matrix for Weighted Voting Classifier:\n")
## Confusion Matrix for Weighted Voting Classifier:
print(confusionMatrix(final_pred, y_test, positive = "Stroke"))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction NoStroke Stroke
##   NoStroke      950     57
##   Stroke         22    889
##                                           
##                Accuracy : 0.9588          
##                  95% CI : (0.9489, 0.9673)
##     No Information Rate : 0.5068          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9176          
##                                           
##  Mcnemar's Test P-Value : 0.0001306       
##                                           
##             Sensitivity : 0.9397          
##             Specificity : 0.9774          
##          Pos Pred Value : 0.9759          
##          Neg Pred Value : 0.9434          
##              Prevalence : 0.4932          
##          Detection Rate : 0.4635          
##    Detection Prevalence : 0.4750          
##       Balanced Accuracy : 0.9586          
##                                           
##        'Positive' Class : Stroke          
## 
# Calculate Precision, Recall, and F1-score
voting_precision <- posPredValue(final_pred, y_test, positive = "Stroke")
voting_recall <- sensitivity(final_pred, y_test, positive = "Stroke")
voting_f1 <- (2 * voting_precision * voting_recall) / (voting_precision + voting_recall)

cat("Precision: ", voting_precision, "\n")
## Precision:  0.9758507
cat("Recall: ", voting_recall, "\n")
## Recall:  0.9397463
cat("F1-score: ", voting_f1, "\n")
## F1-score:  0.9574583
# Plot Precision-Recall Curve
pr <- pr.curve(scores.class0 = combined_pred_prob, weights.class0 = as.numeric(y_test == "Stroke"), curve = TRUE)
plot(pr)

# Plot ROC Curve (named roc_obj so the object does not shadow pROC's roc())
roc_obj <- roc(as.numeric(y_test == "Stroke"), combined_pred_prob)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_obj, print.auc = TRUE, col = "blue", main = "ROC Curve for Weighted Voting Classifier")
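
# Optional (sketch): overlay the individual models' ROC curves on the
# ensemble's for a direct comparison; quiet = TRUE suppresses pROC's
# level/direction messages
rf_roc  <- roc(response = y_test, predictor = rf_pred_prob,  quiet = TRUE)
xgb_roc <- roc(response = y_test, predictor = xgb_pred_prob, quiet = TRUE)
plot(rf_roc, col = "darkgreen")
plot(xgb_roc, col = "red", add = TRUE)
plot(roc_obj, col = "blue", add = TRUE)
legend("bottomright", legend = c("Random Forest", "XGBoost", "Weighted Voting"),
       col = c("darkgreen", "red", "blue"), lwd = 2)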

# Create a summary table with 2 decimal places
results <- data.frame(
  Model = c("Random Forest", "XGBoost", "Weighted Voting"),
  Precision = round(c(rf_precision, xgb_precision, voting_precision), 2),
  Recall = round(c(rf_recall, xgb_recall, voting_recall), 2),
  F1_Score = round(c(rf_f1, xgb_f1, voting_f1), 2)
)

print(results)
##             Model Precision Recall F1_Score
## 1   Random Forest      0.97   0.93     0.95
## 2         XGBoost      0.97   0.94     0.96
## 3 Weighted Voting      0.98   0.94     0.96
# Save the models and combined predictions
save(rf_model, xgb_model, rf_weight, xgb_weight, file = "voting_classifier_model.RData")
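
# How the saved ensemble might be reused, e.g. in the prediction app (a
# sketch; new_patient is a hypothetical one-row data frame with the same
# columns as X_train, including the engineered interaction terms):
# load("voting_classifier_model.RData")
# rf_p  <- predict(rf_model,  newdata = new_patient, type = "prob")[, "Stroke"]
# xgb_p <- predict(xgb_model, newdata = new_patient, type = "prob")[, "Stroke"]
# ensemble_p <- rf_weight * rf_p + xgb_weight * xgb_p
# ifelse(ensemble_p >= 0.5, "Stroke", "NoStroke")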

–The End–