This is an R Markdown document containing the machine learning and data modelling script for the Stroke Predictor.
## File name: stroke_prediction_v3[1].R
## This script builds and evaluates machine learning models for stroke prediction
# Load necessary libraries
library(readxl)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(ggplot2)
library(DMwR2)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
library(ranger)
##
## Attaching package: 'ranger'
## The following object is masked from 'package:randomForest':
##
## importance
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(PRROC)
library(smotefamily)
# Load the dataset
file_path <- 'stroke prediction_cleaned data.xlsx'
df <- read_excel(file_path)
# Display the first few rows of the dataset
head(df)
## # A tibble: 6 × 12
## ID Gender Age Hypertension Heart_Disease Ever_Married Work_Type
## <dbl> <chr> <dbl> <dbl> <dbl> <chr> <chr>
## 1 9046 Male 67 0 1 Yes Private_Job
## 2 51676 Female 61 0 0 Yes Self_Employed
## 3 31112 Male 80 0 1 Yes Private_Job
## 4 60182 Female 49 0 0 Yes Private_Job
## 5 1665 Female 79 1 0 Yes Self_Employed
## 6 56669 Male 81 0 0 Yes Private_Job
## # ℹ 5 more variables: Residence_Type <chr>, Avg_Glucose_Level <dbl>, BMI <dbl>,
## # Smoking_Status <chr>, Stroke <dbl>
# Encode categorical values
df <- df %>%
mutate(Gender = recode(Gender, 'Male' = 0, 'Female' = 1, 'Other' = -1),
Residence_Type = recode(Residence_Type, 'Rural' = 0, 'Urban' = 1),
Work_Type = recode(Work_Type, 'Private_Job' = 0, 'Self_Employed' = 1, 'Government_Job' = 2, 'Dependant_Never_Worked' = -1),
Ever_Married = recode(Ever_Married, 'No' = 0, 'Yes' = 1),
Smoking_Status = recode(Smoking_Status, 'Never_Smoked' = 0, 'Formerly_Smoked' = 1, 'Smokes' = 2, 'Unknown' = -1, 'Unlikely_Smoked' = -2))
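# Optional sanity check (not part of the original script): recode() returns NA
# for any value not listed above, so confirm no NAs were introduced
sapply(df[c("Gender", "Residence_Type", "Work_Type", "Ever_Married", "Smoking_Status")],
       function(x) sum(is.na(x)))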
# Define features and target variable
X <- df %>%
select(Gender, Age, Hypertension, Heart_Disease, Work_Type, Avg_Glucose_Level, BMI)
y <- df$Stroke
# Convert target variable to factor with appropriate levels
y <- factor(y, levels = c(0, 1), labels = c("NoStroke", "Stroke"))
# Feature engineering: Create interaction terms or new features
df <- df %>%
mutate(Age_BMI = Age * BMI,
Hypertension_Age = Hypertension * Age,
Heart_Disease_Age = Heart_Disease * Age,
Age_Avg_Glucose_Level = Age * Avg_Glucose_Level)
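# A quick look (added for illustration) at the new interaction features
head(select(df, Age_BMI, Hypertension_Age, Heart_Disease_Age, Age_Avg_Glucose_Level))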
# Define features and target variable again to include new features
X <- df %>%
select(Gender, Age, Hypertension, Heart_Disease, Work_Type, Avg_Glucose_Level, BMI, Age_BMI, Hypertension_Age, Heart_Disease_Age, Age_Avg_Glucose_Level)
y <- df$Stroke
# Plot class distribution before resampling
ggplot(data = df, aes(x = Stroke)) +
geom_bar() +
ggtitle("Class Distribution Before Resampling") +
xlab("Stroke") +
ylab("Count")
# Print the number of instances in each class before resampling
# (kept commented out; the train/test split happens after SMOTE below)
#cat('Stroke (before resampling):', sum(y == 1), '\n')
#cat('No stroke (before resampling):', sum(y == 0), '\n')
# Verify the dimensions before resampling
#cat("X shape:", dim(X), "\n")
#cat("y length:", length(y), "\n")
# Combine the feature matrix X and the target variable y into one data frame
data <- data.frame(X, y)
# Applying SMOTE
set.seed(3)
smote_data <- SMOTE(X = data[, -ncol(data)], target = data$y, K = 5, dup_size = 0)
# Extracting the resampled data
X_smote <- smote_data$data[, -ncol(smote_data$data)]
y_smote <- smote_data$data$class
# Convert y_smote to factor with appropriate levels
y_smote <- factor(y_smote, levels = c(0, 1), labels = c("NoStroke", "Stroke"))
# Plot the class distribution after SMOTE (y_smote is already a factor, so no
# further conversion is needed)
ggplot(data.frame(y_smote), aes(x = y_smote)) +
geom_bar() +
ggtitle("Class Distribution After SMOTE")
# Print the number of instances in each class after SMOTE
# (compare against the factor labels, since y_smote is coded as NoStroke/Stroke)
cat('Stroke:', sum(y_smote == "Stroke"), '\n')
cat('No stroke:', sum(y_smote == "NoStroke"), '\n')
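# A compact alternative (sketch): table() reports both class counts at once and
# does not depend on how the factor levels are coded
table(y_smote)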
# Assuming X_smote and y_smote are your resampled data
# Combine the features and target variable into one data frame
data_smote <- data.frame(X_smote, y_smote)
# Splitting the dataset into training and testing sets
set.seed(42)
trainIndex <- createDataPartition(data_smote$y_smote, p = .8,
list = FALSE,
times = 1)
dataTrain <- data_smote[ trainIndex,]
dataTest <- data_smote[-trainIndex,]
X_train <- dataTrain[, -ncol(dataTrain)]
X_test <- dataTest[, -ncol(dataTest)]
y_train <- dataTrain$y_smote
y_test <- dataTest$y_smote
# Print the dimensions of the split data
cat("Training Set Dimensions: ", dim(X_train), "\n")
## Training Set Dimensions: 7674 11
cat("Testing Set Dimensions: ", dim(X_test), "\n")
## Testing Set Dimensions: 1918 11
cat("Training Labels Dimensions: ", length(y_train), "\n")
## Training Labels Dimensions: 7674
cat("Testing Labels Dimensions: ", length(y_test), "\n")
## Testing Labels Dimensions: 1918
# Plotting the class distribution in the training set
ggplot(data.frame(y_train), aes(x = y_train)) +
geom_bar() +
ggtitle("Class Distribution in Training Set")
# Print the number of instances in each class in the training set
cat('Stroke (train):', sum(y_train == "Stroke"), '\n')
cat('No stroke (train):', sum(y_train == "NoStroke"), '\n')
# Plotting the class distribution in the testing set
ggplot(data.frame(y_test), aes(x = y_test)) +
geom_bar() +
ggtitle("Class Distribution in Test Set")
# Print the number of instances in each class in the testing set
cat('Stroke (test):', sum(y_test == "Stroke"), '\n')
cat('No stroke (test):', sum(y_test == "NoStroke"), '\n')
# Print column names of X_train and X_test
cat("Column names in X_train:", colnames(X_train), "\n")
## Column names in X_train: Gender Age Hypertension Heart_Disease Work_Type Avg_Glucose_Level BMI Age_BMI Hypertension_Age Heart_Disease_Age Age_Avg_Glucose_Level
cat("Column names in X_test:", colnames(X_test), "\n")
## Column names in X_test: Gender Age Hypertension Heart_Disease Work_Type Avg_Glucose_Level BMI Age_BMI Hypertension_Age Heart_Disease_Age Age_Avg_Glucose_Level
# RANDOM FOREST
# Define the grid of hyperparameters for Random Forest
rf_tuning_grid <- expand.grid(
mtry = c(2, 3, 5, 6),
splitrule = "gini",
min.node.size = 5
)
# Cross-validation setup
rf_cv_control <- trainControl(
method = "cv",
number = 5,
verboseIter = TRUE,
classProbs = TRUE,
summaryFunction = twoClassSummary
)
# Train Random Forest model
rf_model <- train(
y_smote ~ .,
data = dataTrain,
method = "ranger",
tuneGrid = rf_tuning_grid,
trControl = rf_cv_control,
metric = "ROC"
)
## + Fold1: mtry=2, splitrule=gini, min.node.size=5
## - Fold1: mtry=2, splitrule=gini, min.node.size=5
## + Fold1: mtry=3, splitrule=gini, min.node.size=5
## - Fold1: mtry=3, splitrule=gini, min.node.size=5
## + Fold1: mtry=5, splitrule=gini, min.node.size=5
## - Fold1: mtry=5, splitrule=gini, min.node.size=5
## + Fold1: mtry=6, splitrule=gini, min.node.size=5
## - Fold1: mtry=6, splitrule=gini, min.node.size=5
## + Fold2: mtry=2, splitrule=gini, min.node.size=5
## - Fold2: mtry=2, splitrule=gini, min.node.size=5
## + Fold2: mtry=3, splitrule=gini, min.node.size=5
## - Fold2: mtry=3, splitrule=gini, min.node.size=5
## + Fold2: mtry=5, splitrule=gini, min.node.size=5
## - Fold2: mtry=5, splitrule=gini, min.node.size=5
## + Fold2: mtry=6, splitrule=gini, min.node.size=5
## - Fold2: mtry=6, splitrule=gini, min.node.size=5
## + Fold3: mtry=2, splitrule=gini, min.node.size=5
## - Fold3: mtry=2, splitrule=gini, min.node.size=5
## + Fold3: mtry=3, splitrule=gini, min.node.size=5
## - Fold3: mtry=3, splitrule=gini, min.node.size=5
## + Fold3: mtry=5, splitrule=gini, min.node.size=5
## - Fold3: mtry=5, splitrule=gini, min.node.size=5
## + Fold3: mtry=6, splitrule=gini, min.node.size=5
## - Fold3: mtry=6, splitrule=gini, min.node.size=5
## + Fold4: mtry=2, splitrule=gini, min.node.size=5
## - Fold4: mtry=2, splitrule=gini, min.node.size=5
## + Fold4: mtry=3, splitrule=gini, min.node.size=5
## - Fold4: mtry=3, splitrule=gini, min.node.size=5
## + Fold4: mtry=5, splitrule=gini, min.node.size=5
## - Fold4: mtry=5, splitrule=gini, min.node.size=5
## + Fold4: mtry=6, splitrule=gini, min.node.size=5
## - Fold4: mtry=6, splitrule=gini, min.node.size=5
## + Fold5: mtry=2, splitrule=gini, min.node.size=5
## - Fold5: mtry=2, splitrule=gini, min.node.size=5
## + Fold5: mtry=3, splitrule=gini, min.node.size=5
## - Fold5: mtry=3, splitrule=gini, min.node.size=5
## + Fold5: mtry=5, splitrule=gini, min.node.size=5
## - Fold5: mtry=5, splitrule=gini, min.node.size=5
## + Fold5: mtry=6, splitrule=gini, min.node.size=5
## - Fold5: mtry=6, splitrule=gini, min.node.size=5
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 3, splitrule = gini, min.node.size = 5 on full training set
# Display the model
rf_model
## Random Forest
##
## 7674 samples
## 11 predictor
## 2 classes: 'NoStroke', 'Stroke'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 6140, 6139, 6139, 6139, 6139
## Resampling results across tuning parameters:
##
## mtry ROC Sens Spec
## 2 0.9894043 0.9650290 0.9360634
## 3 0.9916990 0.9711993 0.9421400
## 5 0.9913373 0.9724846 0.9424042
## 6 0.9910812 0.9729988 0.9413474
##
## Tuning parameter 'splitrule' was held constant at a value of gini
##
## Tuning parameter 'min.node.size' was held constant at a value of 5
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were mtry = 3, splitrule = gini
## and min.node.size = 5.
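# Optional inspection (not in the original script): the caret train object keeps
# the winning hyperparameters and the cross-validated results for every grid point
rf_model$bestTune
rf_model$results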
# Predict probabilities on the test set
rf_pred_prob <- predict(rf_model, newdata = X_test, type = "prob")[, "Stroke"]
# Predict classes on the test set
rf_pred <- predict(rf_model, newdata = X_test)
# Ensure the predicted values are factors with correct levels
rf_pred <- factor(rf_pred, levels = c("NoStroke", "Stroke"))
# Evaluate the Random Forest model using confusion matrix
cat("Confusion Matrix for Random Forest:\n")
## Confusion Matrix for Random Forest:
rf_conf_matrix <- confusionMatrix(rf_pred, y_test, positive = "Stroke")
print(rf_conf_matrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction NoStroke Stroke
## NoStroke 945 66
## Stroke 27 880
##
## Accuracy : 0.9515
## 95% CI : (0.9409, 0.9607)
## No Information Rate : 0.5068
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.903
##
## Mcnemar's Test P-Value : 8.134e-05
##
## Sensitivity : 0.9302
## Specificity : 0.9722
## Pos Pred Value : 0.9702
## Neg Pred Value : 0.9347
## Prevalence : 0.4932
## Detection Rate : 0.4588
## Detection Prevalence : 0.4729
## Balanced Accuracy : 0.9512
##
## 'Positive' Class : Stroke
##
# Calculate precision, recall, and F1-score for Random Forest
rf_precision <- rf_conf_matrix$byClass["Pos Pred Value"]
rf_recall <- rf_conf_matrix$byClass["Sensitivity"]
rf_f1 <- (2 * rf_precision * rf_recall) / (rf_precision + rf_recall)
cat("Random Forest Precision: ", rf_precision, "\n")
## Random Forest Precision: 0.9702315
cat("Random Forest Recall: ", rf_recall, "\n")
## Random Forest Recall: 0.9302326
cat("Random Forest F1-score: ", rf_f1, "\n")
## Random Forest F1-score: 0.9498111
# XGBOOST
# Define the grid of hyperparameters for XGBoost
xgb_tuning_grid <- expand.grid(
nrounds = 3500,
max_depth = 7,
eta = 0.01,
gamma = 0.01,
colsample_bytree = 0.75,
min_child_weight = 0,
subsample = 0.5
)
# Cross-validation setup
xgb_cv_control <- trainControl(
method = "cv",
number = 5,
classProbs = TRUE,
summaryFunction = twoClassSummary
)
# Train XGBoost model
xgb_model <- train(
y_smote ~ .,
data = dataTrain,
method = "xgbTree",
tuneGrid = xgb_tuning_grid,
trControl = xgb_cv_control,
metric = "ROC"
)
# Display the model
xgb_model
## eXtreme Gradient Boosting
##
## 7674 samples
## 11 predictor
## 2 classes: 'NoStroke', 'Stroke'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 6139, 6140, 6139, 6139, 6139
## Resampling results:
##
## ROC Sens Spec
## 0.9927121 0.9740287 0.9503303
##
## Tuning parameter 'nrounds' was held constant at a value of 3500
##
## Tuning parameter 'min_child_weight' was held constant at a value of 0
##
## Tuning parameter 'subsample' was held constant at a value of 0.5
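# Optional (a sketch using caret's generic importance method): feature importance
# for the fitted xgbTree model, useful to check whether the engineered
# interaction terms carry signal
xgb_importance <- varImp(xgb_model)
print(xgb_importance)
plot(xgb_importance, top = 11)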
# Predict probabilities on the test set
xgb_pred_prob <- predict(xgb_model, newdata = X_test, type = "prob")[, "Stroke"]
# Predict classes on the test set
xgb_pred <- predict(xgb_model, newdata = X_test)
# Ensure the predicted values are factors with correct levels
xgb_pred <- factor(xgb_pred, levels = c("NoStroke", "Stroke"))
# Evaluate the XGBoost model using confusion matrix
cat("Confusion Matrix for XGBoost:\n")
## Confusion Matrix for XGBoost:
xgb_conf_matrix <- confusionMatrix(xgb_pred, y_test, positive = "Stroke")
print(xgb_conf_matrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction NoStroke Stroke
## NoStroke 945 55
## Stroke 27 891
##
## Accuracy : 0.9572
## 95% CI : (0.9472, 0.9659)
## No Information Rate : 0.5068
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9144
##
## Mcnemar's Test P-Value : 0.002867
##
## Sensitivity : 0.9419
## Specificity : 0.9722
## Pos Pred Value : 0.9706
## Neg Pred Value : 0.9450
## Prevalence : 0.4932
## Detection Rate : 0.4645
## Detection Prevalence : 0.4786
## Balanced Accuracy : 0.9570
##
## 'Positive' Class : Stroke
##
# Calculate precision, recall, and F1-score for XGBoost
xgb_precision <- xgb_conf_matrix$byClass["Pos Pred Value"]
xgb_recall <- xgb_conf_matrix$byClass["Sensitivity"]
xgb_f1 <- (2 * xgb_precision * xgb_recall) / (xgb_precision + xgb_recall)
cat("XGBoost Precision: ", xgb_precision, "\n")
## XGBoost Precision: 0.9705882
cat("XGBoost Recall: ", xgb_recall, "\n")
## XGBoost Recall: 0.9418605
cat("XGBoost F1-score: ", xgb_f1, "\n")
## XGBoost F1-score: 0.9560086
# Calculate weights based on AUC for each model
rf_auc <- roc(response = y_test, predictor = rf_pred_prob)$auc
## Setting levels: control = NoStroke, case = Stroke
## Setting direction: controls < cases
xgb_auc <- roc(response = y_test, predictor = xgb_pred_prob)$auc
## Setting levels: control = NoStroke, case = Stroke
## Setting direction: controls < cases
# Normalize the AUC scores to sum to 1 (for weights)
total_auc <- rf_auc + xgb_auc
rf_weight <- rf_auc / total_auc
xgb_weight <- xgb_auc / total_auc
cat("Random Forest Weight: ", rf_weight, "\n")
## Random Forest Weight: 0.4998316
cat("XGBoost Weight: ", xgb_weight, "\n")
## XGBoost Weight: 0.5001684
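# Sanity check (added for illustration): the normalised weights should sum to 1;
# with near-identical AUCs the weighted vote is close to a simple average
stopifnot(abs(rf_weight + xgb_weight - 1) < 1e-8)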
# Voting Classifier
# Combine predictions using weighted average
combined_pred_prob <- (rf_weight * rf_pred_prob) + (xgb_weight * xgb_pred_prob)
final_pred <- ifelse(combined_pred_prob >= 0.5, "Stroke", "NoStroke")
# Ensure that the levels of the factors are consistent
final_pred <- factor(final_pred, levels = c("NoStroke", "Stroke"))
# Evaluate the voting classifier using confusion matrix on the resampled test set
cat("Confusion Matrix for Weighted Voting Classifier:\n")
## Confusion Matrix for Weighted Voting Classifier:
print(confusionMatrix(final_pred, y_test, positive = "Stroke"))
## Confusion Matrix and Statistics
##
## Reference
## Prediction NoStroke Stroke
## NoStroke 950 57
## Stroke 22 889
##
## Accuracy : 0.9588
## 95% CI : (0.9489, 0.9673)
## No Information Rate : 0.5068
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9176
##
## Mcnemar's Test P-Value : 0.0001306
##
## Sensitivity : 0.9397
## Specificity : 0.9774
## Pos Pred Value : 0.9759
## Neg Pred Value : 0.9434
## Prevalence : 0.4932
## Detection Rate : 0.4635
## Detection Prevalence : 0.4750
## Balanced Accuracy : 0.9586
##
## 'Positive' Class : Stroke
##
# Calculate Precision, Recall, and F1-score
voting_precision <- posPredValue(final_pred, y_test, positive = "Stroke")
voting_recall <- sensitivity(final_pred, y_test, positive = "Stroke")
voting_f1 <- (2 * voting_precision * voting_recall) / (voting_precision + voting_recall)
cat("Precision: ", voting_precision, "\n")
## Precision: 0.9758507
cat("Recall: ", voting_recall, "\n")
## Recall: 0.9397463
cat("F1-score: ", voting_f1, "\n")
## F1-score: 0.9574583
# Plot Precision-Recall Curve
pr <- pr.curve(scores.class0 = combined_pred_prob, weights.class0 = as.numeric(y_test == "Stroke"), curve = TRUE)
plot(pr)
# Plot ROC Curve (assign to roc_obj so the pROC function roc() is not shadowed)
roc_obj <- roc(as.numeric(y_test == "Stroke"), combined_pred_prob)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_obj, print.auc = TRUE, col = "blue", main = "ROC Curve for Weighted Voting Classifier")
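# Optional (sketch): pROC's coords() reports the probability threshold that best
# balances sensitivity and specificity, which may differ from the fixed 0.5
# cut-off used for the weighted vote above
coords(roc_obj, "best", ret = c("threshold", "sensitivity", "specificity"))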
# Create a summary table with 2 decimal places
results <- data.frame(
Model = c("Random Forest", "XGBoost", "Weighted Voting"),
Precision = round(c(rf_precision, xgb_precision, voting_precision), 2),
Recall = round(c(rf_recall, xgb_recall, voting_recall), 2),
F1_Score = round(c(rf_f1, xgb_f1, voting_f1), 2)
)
print(results)
## Model Precision Recall F1_Score
## 1 Random Forest 0.97 0.93 0.95
## 2 XGBoost 0.97 0.94 0.96
## 3 Weighted Voting 0.98 0.94 0.96
# Save the models and combined predictions
save(rf_model, xgb_model, rf_weight, xgb_weight, file = "voting_classifier_model.RData")
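# Example usage (a sketch, assuming the saved file is loaded in a session where
# caret, ranger, and xgboost are available): reload the objects and reproduce
# the weighted vote on the held-out test features
load("voting_classifier_model.RData")
reloaded_rf_prob <- predict(rf_model, newdata = X_test, type = "prob")[, "Stroke"]
reloaded_xgb_prob <- predict(xgb_model, newdata = X_test, type = "prob")[, "Stroke"]
reloaded_pred <- ifelse((rf_weight * reloaded_rf_prob) + (xgb_weight * reloaded_xgb_prob) >= 0.5, "Stroke", "NoStroke")
head(reloaded_pred)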
The End