# ── Load all libraries ────────────────────────────────────────────────────────
library(ggplot2)
library(dplyr)
library(tidyr)
library(moments)
library(corrplot)
library(caret)
library(randomForest)
library(pROC)
library(scales)
library(forcats)
library(gridExtra)
library(knitr)
library(kableExtra)# ── Load dataset directly from GitHub (no local path needed) ─────────────────
url <- paste0(
"https://raw.githubusercontent.com/shahriyarcse-arch/",
"DataScience_FinalTerm_Project/main/",
"healthcare-dataset-stroke-data.csv"
)
stroke_raw <- read.csv(url, na.strings = c("NA", "N/A", "", " "),
stringsAsFactors = FALSE)
cat("Dataset loaded successfully!\n")## Dataset loaded successfully!
## Rows: 5110
## Columns: 12
## id gender age hypertension heart_disease ever_married work_type
## 1 9046 Male 67 0 1 Yes Private
## 2 51676 Female 61 0 0 Yes Self-employed
## 3 31112 Male 80 0 1 Yes Private
## 4 60182 Female 49 0 0 Yes Private
## 5 1665 Female 79 1 0 Yes Self-employed
## 6 56669 Male 81 0 0 Yes Private
## Residence_type avg_glucose_level bmi smoking_status stroke
## 1 Urban 228.69 36.6 formerly smoked 1
## 2 Rural 202.21 NA never smoked 1
## 3 Rural 105.92 32.5 never smoked 1
## 4 Urban 171.23 34.4 smokes 1
## 5 Rural 174.12 24.0 never smoked 1
## 6 Urban 186.21 29.0 formerly smoked 1
## [1] 5110 12
## [1] "id" "gender" "age"
## [4] "hypertension" "heart_disease" "ever_married"
## [7] "work_type" "Residence_type" "avg_glucose_level"
## [10] "bmi" "smoking_status" "stroke"
## 'data.frame': 5110 obs. of 12 variables:
## $ id : int 9046 51676 31112 60182 1665 56669 53882 10434 27419 60491 ...
## $ gender : chr "Male" "Female" "Male" "Female" ...
## $ age : num 67 61 80 49 79 81 74 69 59 78 ...
## $ hypertension : int 0 0 0 0 1 0 1 0 0 0 ...
## $ heart_disease : int 1 0 1 0 0 0 1 0 0 0 ...
## $ ever_married : chr "Yes" "Yes" "Yes" "Yes" ...
## $ work_type : chr "Private" "Self-employed" "Private" "Private" ...
## $ Residence_type : chr "Urban" "Rural" "Rural" "Urban" ...
## $ avg_glucose_level: num 229 202 106 171 174 ...
## $ bmi : num 36.6 NA 32.5 34.4 24 29 27.4 22.8 NA 24.2 ...
## $ smoking_status : chr "formerly smoked" "never smoked" "never smoked" "smokes" ...
## $ stroke : int 1 1 1 1 1 1 1 1 1 1 ...
## id gender age hypertension
## Min. : 67 Length:5110 Min. : 0.08 Min. :0.00000
## 1st Qu.:17741 Class :character 1st Qu.:25.00 1st Qu.:0.00000
## Median :36932 Mode :character Median :45.00 Median :0.00000
## Mean :36518 Mean :43.23 Mean :0.09746
## 3rd Qu.:54682 3rd Qu.:61.00 3rd Qu.:0.00000
## Max. :72940 Max. :82.00 Max. :1.00000
##
## heart_disease ever_married work_type Residence_type
## Min. :0.00000 Length:5110 Length:5110 Length:5110
## 1st Qu.:0.00000 Class :character Class :character Class :character
## Median :0.00000 Mode :character Mode :character Mode :character
## Mean :0.05401
## 3rd Qu.:0.00000
## Max. :1.00000
##
## avg_glucose_level bmi smoking_status stroke
## Min. : 55.12 Min. :10.30 Length:5110 Min. :0.00000
## 1st Qu.: 77.25 1st Qu.:23.50 Class :character 1st Qu.:0.00000
## Median : 91.89 Median :28.10 Mode :character Median :0.00000
## Mean :106.15 Mean :28.89 Mean :0.04873
## 3rd Qu.:114.09 3rd Qu.:33.10 3rd Qu.:0.00000
## Max. :271.74 Max. :97.60 Max. :1.00000
## NA's :201
# Convert bmi to numeric first (it loads as character due to "N/A" values)
stroke_raw$bmi <- as.numeric(stroke_raw$bmi)
# Select numeric columns
numeric_cols <- stroke_raw %>% select(age, avg_glucose_level, bmi)
# Mean for all numeric columns
cat("── Mean ──\n")## ── Mean ──
## age avg_glucose_level bmi
## 43.22661 106.14768 28.89324
## ── Median ──
## age avg_glucose_level bmi
## 45.000 91.885 28.100
## ── Standard Deviation ──
## age avg_glucose_level bmi
## 22.612647 45.283560 7.854067
## ── Skewness ──
## age avg_glucose_level bmi
## -0.1370191 1.5718223 1.0550177
##
## Female Male Other
## 2994 2115 1
##
## children Govt_job Never_worked Private Self-employed
## 687 657 22 2925 819
##
## formerly smoked never smoked smokes Unknown
## 885 1892 789 1544
##
## No Yes
## 1757 3353
##
## Rural Urban
## 2514 2596
# Count stroke vs no-stroke patients
stroke_counts <- stroke_raw %>%
count(stroke) %>%
mutate(
label = ifelse(stroke == 1, "Stroke", "No Stroke"),
pct = paste0(round(n / sum(n) * 100, 1), "%")
)
ggplot(stroke_counts, aes(x = label, y = n, fill = label)) +
geom_bar(stat = "identity", width = 0.5, show.legend = FALSE) +
geom_text(aes(label = paste0(n, "\n(", pct, ")")),
vjust = -0.3, size = 4.5, fontface = "bold") +
scale_fill_manual(values = c("No Stroke" = "#2ecc71", "Stroke" = "#e74c3c")) +
labs(
title = "Class Distribution: Stroke vs No Stroke",
subtitle = "Dataset is highly imbalanced — only ~4.9% stroke cases",
x = NULL, y = "Number of Patients"
) +
theme_minimal(base_size = 13) +
theme(plot.title = element_text(face = "bold"))Note: The dataset is highly imbalanced. Only ~4.9% of patients had a stroke. We will handle this during modeling using SMOTE.
# Add a label column for plotting
stroke_raw$stroke_label <- ifelse(stroke_raw$stroke == 1, "Stroke", "No Stroke")
ggplot(stroke_raw, aes(x = age, fill = stroke_label)) +
geom_histogram(bins = 35, alpha = 0.7, position = "identity") +
scale_fill_manual(values = c("No Stroke" = "#3498db", "Stroke" = "#e74c3c")) +
labs(
title = "Age Distribution by Stroke Status",
subtitle = "Stroke patients are generally older",
x = "Age (years)", y = "Count", fill = NULL
) +
theme_minimal(base_size = 13) +
theme(plot.title = element_text(face = "bold"))# Glucose histogram
p1 <- ggplot(stroke_raw, aes(x = avg_glucose_level, fill = stroke_label)) +
geom_histogram(bins = 35, alpha = 0.7, position = "identity") +
scale_fill_manual(values = c("No Stroke" = "#3498db", "Stroke" = "#e74c3c")) +
labs(title = "Average Glucose Level", x = "Glucose (mg/dL)",
y = "Count", fill = NULL) +
theme_minimal(base_size = 11)
# BMI histogram
p2 <- ggplot(stroke_raw, aes(x = bmi, fill = stroke_label)) +
geom_histogram(bins = 35, alpha = 0.7, position = "identity") +
scale_fill_manual(values = c("No Stroke" = "#3498db", "Stroke" = "#e74c3c")) +
labs(title = "BMI Distribution", x = "BMI",
y = "Count", fill = NULL) +
theme_minimal(base_size = 11)
grid.arrange(p1, p2, ncol = 2,
top = "Glucose and BMI Distributions by Stroke Status")# Reshape to long format for easy plotting
stroke_long <- stroke_raw %>%
select(age, avg_glucose_level, bmi) %>%
pivot_longer(everything(), names_to = "Feature", values_to = "Value")
ggplot(stroke_long, aes(x = Feature, y = Value, fill = Feature)) +
geom_boxplot(alpha = 0.7, outlier.colour = "#e74c3c",
outlier.shape = 16, outlier.size = 1.5) +
scale_fill_brewer(palette = "Set2") +
labs(
title = "Boxplot — Detecting Outliers in Numeric Features",
x = NULL, y = "Value"
) +
theme_minimal(base_size = 13) +
theme(legend.position = "none",
plot.title = element_text(face = "bold"))# Stroke rate by smoking status
smoke_summary <- stroke_raw %>%
group_by(smoking_status) %>%
summarise(stroke_rate = mean(stroke, na.rm = TRUE), .groups = "drop")
ggplot(smoke_summary,
aes(x = fct_reorder(smoking_status, stroke_rate),
y = stroke_rate, fill = stroke_rate)) +
geom_bar(stat = "identity", width = 0.6, show.legend = FALSE) +
geom_text(aes(label = paste0(round(stroke_rate * 100, 1), "%")),
hjust = -0.2, size = 4) +
scale_fill_gradient(low = "#f9ca24", high = "#e74c3c") +
scale_y_continuous(labels = percent_format(),
limits = c(0, max(smoke_summary$stroke_rate) * 1.35)) +
coord_flip() +
labs(
title = "Stroke Rate by Smoking Status",
x = "Smoking Status", y = "Stroke Rate (%)"
) +
theme_minimal(base_size = 13) +
theme(plot.title = element_text(face = "bold"))# Stroke rate by work type
work_summary <- stroke_raw %>%
group_by(work_type) %>%
summarise(stroke_rate = mean(stroke, na.rm = TRUE), .groups = "drop")
ggplot(work_summary,
aes(x = fct_reorder(work_type, stroke_rate),
y = stroke_rate, fill = stroke_rate)) +
geom_bar(stat = "identity", width = 0.6, show.legend = FALSE) +
geom_text(aes(label = paste0(round(stroke_rate * 100, 1), "%")),
hjust = -0.2, size = 4) +
scale_fill_gradient(low = "#f9ca24", high = "#e74c3c") +
scale_y_continuous(labels = percent_format(),
limits = c(0, max(work_summary$stroke_rate) * 1.35)) +
coord_flip() +
labs(
title = "Stroke Rate by Work Type",
x = "Work Type", y = "Stroke Rate (%)"
) +
theme_minimal(base_size = 13) +
theme(plot.title = element_text(face = "bold"))ggplot(stroke_raw, aes(x = age, y = avg_glucose_level,
colour = stroke_label, alpha = stroke_label)) +
geom_point(size = 1.2) +
scale_colour_manual(values = c("No Stroke" = "#95a5a6", "Stroke" = "#e74c3c")) +
scale_alpha_manual(values = c("No Stroke" = 0.3, "Stroke" = 0.9)) +
labs(
title = "Age vs Average Glucose Level",
subtitle = "Stroke patients tend to be older with higher glucose",
x = "Age", y = "Average Glucose Level (mg/dL)",
colour = NULL, alpha = NULL
) +
theme_minimal(base_size = 13) +
theme(plot.title = element_text(face = "bold"))# Select numeric columns for correlation
cor_data <- stroke_raw %>%
select(age, avg_glucose_level, bmi, hypertension, heart_disease, stroke) %>%
na.omit()
# Calculate correlation matrix
cor_matrix <- cor(cor_data)
# Plot heatmap
corrplot(cor_matrix,
method = "color",
type = "upper",
addCoef.col = "black",
number.cex = 0.75,
tl.col = "black",
tl.srt = 45,
title = "Correlation Matrix of Numeric Features",
mar = c(0, 0, 2, 0))## Missing values per column:
## id gender age hypertension
## 0 0 0 0
## heart_disease ever_married work_type Residence_type
## 0 0 0 0
## avg_glucose_level bmi smoking_status stroke
## 0 201 0 0
## stroke_label
## 0
# Remove patient ID — it has no predictive value
# Also remove the label column we created for plotting
stroke_df <- stroke_raw %>% select(-id, -stroke_label)
cat("Columns after removing id:\n")## Columns after removing id:
## [1] "gender" "age" "hypertension"
## [4] "heart_disease" "ever_married" "work_type"
## [7] "Residence_type" "avg_glucose_level" "bmi"
## [10] "smoking_status" "stroke"
## Missing values BEFORE imputation:
## gender age hypertension heart_disease
## 0 0 0 0
## ever_married work_type Residence_type avg_glucose_level
## 0 0 0 0
## bmi smoking_status stroke
## 201 0 0
# Fill missing BMI values with the median (median is better than mean for skewed data)
bmi_median <- median(stroke_df$bmi, na.rm = TRUE)
stroke_df$bmi[is.na(stroke_df$bmi)] <- bmi_median
cat("\nMissing values AFTER imputation:\n")##
## Missing values AFTER imputation:
## gender age hypertension heart_disease
## 0 0 0 0
## ever_married work_type Residence_type avg_glucose_level
## 0 0 0 0
## bmi smoking_status stroke
## 0 0 0
# Only 1 patient has gender = "Other" — remove to avoid problems in encoding
stroke_df <- stroke_df %>% filter(gender != "Other")
cat("Rows after removing gender = 'Other':", nrow(stroke_df), "\n")## Rows after removing gender = 'Other': 5109
# Show outliers using boxplot before treatment
boxplot(stroke_df$avg_glucose_level,
main = "Glucose Level — Before Outlier Treatment",
col = "lightblue")# IQR method — cap values instead of removing them
cap_outliers <- function(x) {
Q1 <- quantile(x, 0.25, na.rm = TRUE)
Q3 <- quantile(x, 0.75, na.rm = TRUE)
IQR_v <- IQR(x, na.rm = TRUE)
lower <- Q1 - 1.5 * IQR_v
upper <- Q3 + 1.5 * IQR_v
x[x > upper] <- upper
x[x < lower] <- lower
return(x)
}
stroke_df$avg_glucose_level <- cap_outliers(stroke_df$avg_glucose_level)
stroke_df$bmi <- cap_outliers(stroke_df$bmi)
# Show after treatment
boxplot(stroke_df$avg_glucose_level,
main = "Glucose Level — After Outlier Treatment",
col = "lightgreen")## Outlier capping applied to avg_glucose_level and bmi.
# Create new meaningful features from existing ones
# 1. BMI Category (based on WHO standard thresholds)
stroke_df$bmi_category <- case_when(
stroke_df$bmi < 18.5 ~ "Underweight",
stroke_df$bmi < 25 ~ "Normal",
stroke_df$bmi < 30 ~ "Overweight",
TRUE ~ "Obese"
)
# 2. Glucose Category (based on clinical diabetes thresholds)
stroke_df$glucose_category <- case_when(
stroke_df$avg_glucose_level < 100 ~ "Normal",
stroke_df$avg_glucose_level < 126 ~ "Pre-diabetic",
TRUE ~ "Diabetic"
)
# 3. Age Group
stroke_df$age_group <- case_when(
stroke_df$age < 18 ~ "Child",
stroke_df$age < 40 ~ "Young Adult",
stroke_df$age < 60 ~ "Middle Aged",
TRUE ~ "Senior"
)
# 4. Combined cardiovascular risk score (sum of risk factors: 0 to 4)
stroke_df$cardio_risk <- stroke_df$hypertension +
stroke_df$heart_disease +
as.integer(stroke_df$avg_glucose_level >= 126) +
as.integer(stroke_df$bmi >= 30)
cat("New features created: bmi_category, glucose_category, age_group, cardio_risk\n")## New features created: bmi_category, glucose_category, age_group, cardio_risk
## bmi_category glucose_category age_group cardio_risk
## 1 Obese Diabetic Senior 3
## 2 Overweight Diabetic Senior 1
## 3 Obese Pre-diabetic Senior 2
## 4 Obese Diabetic Middle Aged 2
## 5 Normal Diabetic Senior 2
## Skewness BEFORE log transform:
## avg_glucose_level: 0.936
## bmi : 0.449
# log1p is safe even if values are 0
stroke_df$avg_glucose_level <- log1p(stroke_df$avg_glucose_level)
stroke_df$bmi <- log1p(stroke_df$bmi)
cat("\nSkewness AFTER log transform:\n")##
## Skewness AFTER log transform:
## avg_glucose_level: 0.45
## bmi : -0.188
# List of categorical columns to convert to factors
cat_cols <- c("gender", "ever_married", "work_type", "Residence_type",
"smoking_status", "bmi_category", "glucose_category", "age_group")
# Convert each to factor
stroke_df[cat_cols] <- lapply(stroke_df[cat_cols], as.factor)
# Convert target (stroke) to factor with meaningful labels
stroke_df$stroke <- factor(stroke_df$stroke,
levels = c(0, 1),
labels = c("No", "Yes"))
cat("Categorical encoding done.\n")## Categorical encoding done.
## 'data.frame': 5109 obs. of 15 variables:
## $ gender : Factor w/ 2 levels "Female","Male": 2 1 2 1 1 2 2 1 1 1 ...
## $ age : num 67 61 80 49 79 81 74 69 59 78 ...
## $ hypertension : int 0 0 0 0 1 0 1 0 0 0 ...
## $ heart_disease : int 1 0 1 0 0 0 1 0 0 0 ...
## $ ever_married : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 1 2 2 ...
## $ work_type : Factor w/ 5 levels "children","Govt_job",..: 4 5 4 4 5 4 4 4 4 4 ...
## $ Residence_type : Factor w/ 2 levels "Rural","Urban": 2 1 1 2 1 2 1 2 1 2 ...
## $ avg_glucose_level: num 5.14 5.14 4.67 5.14 5.14 ...
## $ bmi : num 3.63 3.37 3.51 3.57 3.22 ...
## $ smoking_status : Factor w/ 4 levels "formerly smoked",..: 1 2 2 3 2 1 2 2 4 4 ...
## $ stroke : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
## $ bmi_category : Factor w/ 4 levels "Normal","Obese",..: 2 3 2 2 1 3 3 1 3 1 ...
## $ glucose_category : Factor w/ 3 levels "Diabetic","Normal",..: 1 1 3 1 1 1 2 2 2 2 ...
## $ age_group : Factor w/ 4 levels "Child","Middle Aged",..: 3 3 3 2 3 3 3 3 2 3 ...
## $ cardio_risk : int 3 1 2 2 2 1 2 0 0 0 ...
# Min-Max normalization — scales all values between 0 and 1
normalize <- function(x) {
(x - min(x, na.rm = TRUE)) / (max(x, na.rm = TRUE) - min(x, na.rm = TRUE))
}
stroke_df$age <- normalize(stroke_df$age)
stroke_df$avg_glucose_level <- normalize(stroke_df$avg_glucose_level)
stroke_df$bmi <- normalize(stroke_df$bmi)
cat("Normalization applied. New ranges:\n")## Normalization applied. New ranges:
## age avg_glucose_level bmi
## Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.3042 1st Qu.:0.2992 1st Qu.:0.5490
## Median :0.5483 Median :0.4537 Median :0.6607
## Mean :0.5267 Mean :0.4945 Mean :0.6545
## 3rd Qu.:0.7437 3rd Qu.:0.6468 3rd Qu.:0.7653
## Max. :1.0000 Max. :1.0000 Max. :1.0000
## 'data.frame': 5109 obs. of 15 variables:
## $ gender : Factor w/ 2 levels "Female","Male": 2 1 2 1 1 2 2 1 1 1 ...
## $ age : num 0.817 0.744 0.976 0.597 0.963 ...
## $ hypertension : int 0 0 0 0 1 0 1 0 0 0 ...
## $ heart_disease : int 1 0 1 0 0 0 1 0 0 0 ...
## $ ever_married : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 1 2 2 ...
## $ work_type : Factor w/ 5 levels "children","Govt_job",..: 4 5 4 4 5 4 4 4 4 4 ...
## $ Residence_type : Factor w/ 2 levels "Rural","Urban": 2 1 1 2 1 2 1 2 1 2 ...
## $ avg_glucose_level: num 1 1 0.58 1 1 ...
## $ bmi : num 0.84 0.661 0.759 0.798 0.555 ...
## $ smoking_status : Factor w/ 4 levels "formerly smoked",..: 1 2 2 3 2 1 2 2 4 4 ...
## $ stroke : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
## $ bmi_category : Factor w/ 4 levels "Normal","Obese",..: 2 3 2 2 1 3 3 1 3 1 ...
## $ glucose_category : Factor w/ 3 levels "Diabetic","Normal",..: 1 1 3 1 1 1 2 2 2 2 ...
## $ age_group : Factor w/ 4 levels "Child","Middle Aged",..: 3 3 3 2 3 3 3 3 2 3 ...
## $ cardio_risk : int 3 1 2 2 2 1 2 0 0 0 ...
## gender age hypertension heart_disease
## Female:2994 Min. :0.0000 Min. :0.00000 Min. :0.00000
## Male :2115 1st Qu.:0.3042 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.5483 Median :0.00000 Median :0.00000
## Mean :0.5267 Mean :0.09748 Mean :0.05402
## 3rd Qu.:0.7437 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1.0000 Max. :1.00000 Max. :1.00000
## ever_married work_type Residence_type avg_glucose_level
## No :1756 children : 687 Rural:2513 Min. :0.0000
## Yes:3353 Govt_job : 657 Urban:2596 1st Qu.:0.2992
## Never_worked : 22 Median :0.4537
## Private :2924 Mean :0.4945
## Self-employed: 819 3rd Qu.:0.6468
## Max. :1.0000
## bmi smoking_status stroke bmi_category
## Min. :0.0000 formerly smoked: 884 No :4860 Normal :1242
## 1st Qu.:0.5490 never smoked :1892 Yes: 249 Obese :1920
## Median :0.6607 smokes : 789 Overweight :1610
## Mean :0.6545 Unknown :1544 Underweight: 337
## 3rd Qu.:0.7653
## Max. :1.0000
## glucose_category age_group cardio_risk
## Diabetic : 980 Child : 856 Min. :0.0000
## Normal :3131 Middle Aged:1564 1st Qu.:0.0000
## Pre-diabetic: 998 Senior :1376 Median :1.0000
## Young Adult:1313 Mean :0.7191
## 3rd Qu.:1.0000
## Max. :4.0000
##
## Missing values in final dataset:
## gender age hypertension heart_disease
## 0 0 0 0
## ever_married work_type Residence_type avg_glucose_level
## 0 0 0 0
## bmi smoking_status stroke bmi_category
## 0 0 0 0
## glucose_category age_group cardio_risk
## 0 0 0
set.seed(42)
# createDataPartition keeps the same stroke ratio in both sets (stratified split)
train_idx <- createDataPartition(stroke_df$stroke, p = 0.80, list = FALSE)
train_data <- stroke_df[ train_idx, ]
test_data <- stroke_df[-train_idx, ]
cat("Training set size:", nrow(train_data), "\n")## Training set size: 4088
## Test set size : 1021
##
## Stroke proportion in training set:
##
## No Yes
## 0.95107632 0.04892368
##
## Stroke proportion in test set:
##
## No Yes
## 0.95200784 0.04799216
set.seed(42)
# 5-fold cross-validation with SMOTE to handle class imbalance
ctrl <- trainControl(
method = "cv", # cross-validation
number = 5, # 5 folds
classProbs = TRUE, # needed for ROC
summaryFunction = twoClassSummary,
sampling = "smote" # oversample minority class
)
# Train logistic regression model
logit_model <- train(
stroke ~ .,
data = train_data,
method = "glm",
family = "binomial",
trControl = ctrl,
metric = "ROC"
)
cat("Logistic Regression — Cross-Validation ROC-AUC:",
round(max(logit_model$results$ROC), 4), "\n")## Logistic Regression — Cross-Validation ROC-AUC: 0.8316
set.seed(42)
# Try different values of mtry (number of variables tried at each split)
tune_grid <- expand.grid(mtry = c(2, 3, 4, 5, 6))
# 5-fold CV with SMOTE
ctrl_rf <- trainControl(
method = "cv",
number = 5,
classProbs = TRUE,
summaryFunction = twoClassSummary,
sampling = "smote",
savePredictions = "final"
)
# Train Random Forest
rf_model <- train(
stroke ~ .,
data = train_data,
method = "rf",
trControl = ctrl_rf,
tuneGrid = tune_grid,
metric = "ROC",
ntree = 300,
importance = TRUE
)
cat("Best mtry value :", rf_model$bestTune$mtry, "\n")## Best mtry value : 2
## Best CV ROC-AUC : 0.8114
# This shows how ROC-AUC changes with different mtry values
ggplot(rf_model) +
labs(
title = "Random Forest — Hyperparameter Tuning",
subtitle = "5-Fold CV ROC-AUC for each mtry value",
x = "mtry (Variables per Split)",
y = "ROC-AUC (Cross-Validation)"
) +
theme_minimal(base_size = 13) +
theme(plot.title = element_text(face = "bold"))# Predicted class labels
rf_pred_class <- predict(rf_model, newdata = test_data)
logit_pred_class <- predict(logit_model, newdata = test_data)
# Predicted probabilities (needed for ROC curve)
rf_pred_prob <- predict(rf_model, newdata = test_data, type = "prob")[, "Yes"]
logit_pred_prob <- predict(logit_model, newdata = test_data, type = "prob")[, "Yes"]## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 873 34
## Yes 99 15
##
## Accuracy : 0.8697
## 95% CI : (0.8475, 0.8898)
## No Information Rate : 0.952
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1253
##
## Mcnemar's Test P-Value : 2.865e-08
##
## Sensitivity : 0.30612
## Specificity : 0.89815
## Pos Pred Value : 0.13158
## Neg Pred Value : 0.96251
## Prevalence : 0.04799
## Detection Rate : 0.01469
## Detection Prevalence : 0.11166
## Balanced Accuracy : 0.60214
##
## 'Positive' Class : Yes
##
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 691 11
## Yes 281 38
##
## Accuracy : 0.714
## 95% CI : (0.6852, 0.7416)
## No Information Rate : 0.952
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1345
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.77551
## Specificity : 0.71091
## Pos Pred Value : 0.11912
## Neg Pred Value : 0.98433
## Prevalence : 0.04799
## Detection Rate : 0.03722
## Detection Prevalence : 0.31244
## Balanced Accuracy : 0.74321
##
## 'Positive' Class : Yes
##
# Build ROC objects
roc_rf <- roc(test_data$stroke, rf_pred_prob,
levels = c("No", "Yes"), quiet = TRUE)
roc_logit <- roc(test_data$stroke, logit_pred_prob,
levels = c("No", "Yes"), quiet = TRUE)
# Build comparison table
metrics_df <- data.frame(
Model = c("Random Forest", "Logistic Regression"),
Accuracy = round(c(cm_rf$overall["Accuracy"],
cm_logit$overall["Accuracy"]), 4),
Precision = round(c(cm_rf$byClass["Precision"],
cm_logit$byClass["Precision"]), 4),
Recall = round(c(cm_rf$byClass["Recall"],
cm_logit$byClass["Recall"]), 4),
F1_Score = round(c(cm_rf$byClass["F1"],
cm_logit$byClass["F1"]), 4),
ROC_AUC = round(c(auc(roc_rf), auc(roc_logit)), 4)
)
kable(metrics_df, row.names = FALSE,
caption = "Model Performance Comparison on Test Set") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE) %>%
row_spec(1, bold = TRUE, background = "#d5f5e3")| Model | Accuracy | Precision | Recall | F1_Score | ROC_AUC |
|---|---|---|---|---|---|
| Random Forest | 0.8697 | 0.1316 | 0.3061 | 0.1840 | 0.8130 |
| Logistic Regression | 0.7140 | 0.1191 | 0.7755 | 0.2065 | 0.8302 |
Although Random Forest achieved higher overall accuracy, its recall for stroke cases was relatively low (~30.6%), meaning many true stroke patients were missed. Logistic Regression achieved much higher recall (~77.6%), making it more appropriate for medical screening where detecting stroke cases is more important than maximizing overall accuracy.
Model Comparison: Random Forest achieved higher Accuracy and slightly higher Precision, while Logistic Regression achieved substantially better Recall, F1-Score, and ROC-AUC.
Since stroke prediction is a medical classification problem where detecting actual stroke patients is critical, Logistic Regression is considered the more suitable model despite its lower overall accuracy.
# Build data frames for plotting
roc_rf_df <- data.frame(
FPR = 1 - roc_rf$specificities,
TPR = roc_rf$sensitivities,
Model = paste0("Random Forest (AUC = ", round(auc(roc_rf), 3), ")")
)
roc_logit_df <- data.frame(
FPR = 1 - roc_logit$specificities,
TPR = roc_logit$sensitivities,
Model = paste0("Logistic Reg. (AUC = ", round(auc(roc_logit), 3), ")")
)
roc_all <- rbind(roc_rf_df, roc_logit_df)
ggplot(roc_all, aes(x = FPR, y = TPR, colour = Model)) +
geom_line(size = 1.2) +
geom_abline(linetype = "dashed", colour = "grey60") +
scale_colour_manual(values = c("#e74c3c", "#2980b9")) +
labs(
title = "ROC Curves — Random Forest vs Logistic Regression",
subtitle = "Higher and more leftward curve = better model",
x = "False Positive Rate (1 - Specificity)",
y = "True Positive Rate (Sensitivity)",
colour = NULL
) +
theme_minimal(base_size = 13) +
theme(
plot.title = element_text(face = "bold"),
legend.position = c(0.65, 0.2),
legend.background = element_rect(fill = "white", colour = "grey80")
)Which lifestyle factor — smoking, BMI, or blood glucose — increases stroke risk the most?
# Extract variable importance from the trained Random Forest model
imp_raw <- varImp(rf_model, scale = TRUE)$importance
# varImp() may return columns named "Yes"/"No" or "Overall" depending on caret version.
# We safely average all numeric columns to get one importance score per feature.
imp_raw$Score <- rowMeans(imp_raw[, sapply(imp_raw, is.numeric), drop = FALSE])
imp_raw$Feature <- rownames(imp_raw)
# Keep top 15 features sorted by score
imp_top <- imp_raw[order(imp_raw$Score, decreasing = TRUE), ][1:15, ]
ggplot(imp_top, aes(x = reorder(Feature, Score), y = Score, fill = Score)) +
geom_bar(stat = "identity", show.legend = FALSE) +
geom_text(aes(label = round(Score, 1)), hjust = -0.15, size = 3.5) +
scale_fill_gradient(low = "#f9ca24", high = "#e74c3c") +
coord_flip() +
labs(
title = "Variable Importance — Random Forest (Top 15)",
subtitle = "Higher score = stronger predictive importance in the model", x = NULL, y = "Importance Score (0–100)"
) +
theme_minimal(base_size = 12) +
theme(plot.title = element_text(face = "bold"))# Use the original raw data (before transformation) for clearer group comparisons
# Recompute from stroke_raw (before preprocessing changed the values)
raw_plot <- stroke_raw
raw_plot$bmi <- as.numeric(raw_plot$bmi)
# Stroke rate per smoking group
smoke_df <- raw_plot %>%
filter(gender != "Other") %>%
group_by(smoking_status) %>%
summarise(
stroke_rate = mean(stroke, na.rm = TRUE),
n = n(),
.groups = "drop"
)
ggplot(smoke_df, aes(x = fct_reorder(smoking_status, stroke_rate),
y = stroke_rate, fill = stroke_rate)) +
geom_bar(stat = "identity", width = 0.6, show.legend = FALSE) +
geom_text(aes(label = paste0(round(stroke_rate * 100, 1), "% (n=", n, ")")),
hjust = -0.05, size = 3.8) +
scale_fill_gradient(low = "#f9ca24", high = "#c0392b") +
scale_y_continuous(labels = percent_format(),
limits = c(0, max(smoke_df$stroke_rate) * 1.45)) +
coord_flip() +
labs(
title = "Stroke Rate by Smoking Status",
subtitle = "Former smokers show the highest observed stroke rate (possibly influenced by age differences)", x = "Smoking Status", y = "Stroke Rate (%)"
) +
theme_minimal(base_size = 13) +
theme(plot.title = element_text(face = "bold"))bmi_df <- raw_plot %>%
filter(gender != "Other", !is.na(bmi)) %>%
mutate(bmi_category = case_when(
bmi < 18.5 ~ "Underweight",
bmi < 25 ~ "Normal",
bmi < 30 ~ "Overweight",
TRUE ~ "Obese"
)) %>%
group_by(bmi_category) %>%
summarise(
stroke_rate = mean(stroke, na.rm = TRUE),
n = n(),
.groups = "drop"
) %>%
mutate(bmi_category = factor(bmi_category,
levels = c("Underweight", "Normal", "Overweight", "Obese")))
ggplot(bmi_df, aes(x = bmi_category, y = stroke_rate, fill = stroke_rate)) +
geom_bar(stat = "identity", width = 0.6, show.legend = FALSE) +
geom_text(aes(label = paste0(round(stroke_rate * 100, 1), "% (n=", n, ")")),
vjust = -0.4, size = 3.8) +
scale_fill_gradient(low = "#f9ca24", high = "#c0392b") +
scale_y_continuous(labels = percent_format(),
limits = c(0, max(bmi_df$stroke_rate) * 1.35)) +
labs(
title = "Stroke Rate by BMI Category",
subtitle = "Overweight and Obese groups show higher stroke rates",
x = "BMI Category", y = "Stroke Rate (%)"
) +
theme_minimal(base_size = 13) +
theme(plot.title = element_text(face = "bold"))glucose_df <- raw_plot %>%
filter(gender != "Other") %>%
mutate(glucose_cat = case_when(
avg_glucose_level < 100 ~ "Normal",
avg_glucose_level < 126 ~ "Pre-diabetic",
TRUE ~ "Diabetic"
)) %>%
group_by(glucose_cat) %>%
summarise(
stroke_rate = mean(stroke, na.rm = TRUE),
n = n(),
.groups = "drop"
) %>%
mutate(glucose_cat = factor(glucose_cat,
levels = c("Normal", "Pre-diabetic", "Diabetic")))
ggplot(glucose_df, aes(x = glucose_cat, y = stroke_rate, fill = stroke_rate)) +
geom_bar(stat = "identity", width = 0.6, show.legend = FALSE) +
geom_text(aes(label = paste0(round(stroke_rate * 100, 1), "% (n=", n, ")")),
vjust = -0.4, size = 3.8) +
scale_fill_gradient(low = "#f9ca24", high = "#c0392b") +
scale_y_continuous(labels = percent_format(),
limits = c(0, max(glucose_df$stroke_rate) * 1.35)) +
labs(
title = "Stroke Rate by Glucose Category",
subtitle = "Diabetic patients have ~3x higher stroke rate than normal glucose patients",
x = "Glucose Category", y = "Stroke Rate (%)"
) +
theme_minimal(base_size = 13) +
theme(plot.title = element_text(face = "bold"))# Calculate how many times MORE likely each high-risk group is vs their baseline
rr_glucose <- glucose_df$stroke_rate[glucose_df$glucose_cat == "Diabetic"] /
glucose_df$stroke_rate[glucose_df$glucose_cat == "Normal"]
rr_bmi <- bmi_df$stroke_rate[bmi_df$bmi_category == "Obese"] /
bmi_df$stroke_rate[bmi_df$bmi_category == "Normal"]
rr_smoking <- smoke_df$stroke_rate[smoke_df$smoking_status == "formerly smoked"] /
smoke_df$stroke_rate[smoke_df$smoking_status == "never smoked"]
# Combine into one data frame
risk_df <- data.frame(
Factor = c("Blood Glucose\n(Diabetic vs Normal)",
"BMI\n(Obese vs Normal)",
"Smoking\n(Former vs Never)"),
Rel_Risk = c(rr_glucose, rr_bmi, rr_smoking)
)
ggplot(risk_df, aes(x = reorder(Factor, Rel_Risk), y = Rel_Risk, fill = Rel_Risk)) +
geom_bar(stat = "identity", width = 0.55, show.legend = FALSE) +
geom_text(aes(label = paste0(round(Rel_Risk, 2), "x more likely")),
hjust = -0.1, size = 4.2, fontface = "bold") +
scale_fill_gradient(low = "#f39c12", high = "#c0392b") +
coord_flip() +
ylim(0, max(risk_df$Rel_Risk) * 1.35) +
labs(
title = "Relative Stroke Risk by Lifestyle Factor",
subtitle = "Compared to the low-risk group in each category",
x = NULL,
y = "Relative Risk (times more likely)"
) +
theme_minimal(base_size = 13) +
theme(plot.title = element_text(face = "bold"))findings <- data.frame(
No = 1:7,
Finding = c(
"Age is the strongest single predictor of stroke. Risk increases sharply after age 60.",
"Blood glucose (Diabetic range >= 126 mg/dL) increases stroke risk ~2.85x vs normal glucose. It is the #1 modifiable risk factor.",
"Obese BMI patients show substantially higher stroke risk compared to patients with normal BMI.",
"Former smokers have ~1.66x higher stroke rate than never-smokers. Current smokers are slightly lower, likely due to age differences.",
"The combined cardiovascular risk score (hypertension + heart disease + glucose + BMI) is a strong composite predictor.",
"Gender and Residence type (Urban/Rural) have minimal independent impact on stroke.",
"Logistic Regression (AUC ~0.83) slightly outperformed Random Forest (AUC ~0.81). Logistic Regression also achieved much higher recall, making it more suitable for identifying stroke cases in medical screening scenarios."
)
)
kable(findings, col.names = c("#", "Finding"),
caption = "Key Findings from the Analysis") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE) %>%
column_spec(1, bold = TRUE, width = "2em")| # | Finding |
|---|---|
| 1 | Age is the strongest single predictor of stroke. Risk increases sharply after age 60. |
| 2 | Blood glucose (Diabetic range >= 126 mg/dL) increases stroke risk ~2.85x vs normal glucose. It is the #1 modifiable risk factor. |
| 3 | Obese BMI patients show substantially higher stroke risk compared to patients with normal BMI. |
| 4 | Former smokers have ~1.66x higher stroke rate than never-smokers. Current smokers are slightly lower, likely due to age differences. |
| 5 | The combined cardiovascular risk score (hypertension + heart disease + glucose + BMI) is a strong composite predictor. |
| 6 | Gender and Residence type (Urban/Rural) have minimal independent impact on stroke. |
| 7 | Logistic Regression (AUC ~0.83) slightly outperformed Random Forest (AUC ~0.81). Logistic Regression also achieved much higher recall, making it more suitable for identifying stroke cases in medical screening scenarios. |
## R version 4.5.2 (2025-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26200)
##
## Matrix products: default
## LAPACK version 3.12.1
##
## locale:
## [1] LC_COLLATE=English_United States.utf8
## [2] LC_CTYPE=English_United States.utf8
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.utf8
##
## time zone: Asia/Dhaka
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] themis_1.0.3 recipes_1.3.2 kableExtra_1.4.0
## [4] knitr_1.51 gridExtra_2.3 forcats_1.0.1
## [7] scales_1.4.0 pROC_1.19.0.1 randomForest_4.7-1.2
## [10] caret_7.0-1 lattice_0.22-7 corrplot_0.95
## [13] moments_0.14.1 tidyr_1.3.2 dplyr_1.2.1
## [16] ggplot2_4.0.3
##
## loaded via a namespace (and not attached):
## [1] tidyselect_1.2.1 viridisLite_0.4.3 timeDate_4052.112
## [4] farver_2.1.2 S7_0.2.1 fastmap_1.2.0
## [7] RANN_2.6.2 digest_0.6.39 rpart_4.1.24
## [10] timechange_0.4.0 lifecycle_1.0.5 survival_3.8-3
## [13] magrittr_2.0.4 compiler_4.5.2 rlang_1.1.7
## [16] sass_0.4.10 tools_4.5.2 yaml_2.3.12
## [19] data.table_1.18.2.1 labeling_0.4.3 plyr_1.8.9
## [22] xml2_1.5.2 RColorBrewer_1.1-3 withr_3.0.2
## [25] purrr_1.2.2 nnet_7.3-20 grid_4.5.2
## [28] stats4_4.5.2 e1071_1.7-17 future_1.70.0
## [31] globals_0.19.1 iterators_1.0.14 MASS_7.3-65
## [34] cli_3.6.5 rmarkdown_2.30 generics_0.1.4
## [37] otel_0.2.0 rstudioapi_0.18.0 future.apply_1.20.2
## [40] reshape2_1.4.5 proxy_0.4-29 cachem_1.1.0
## [43] stringr_1.6.0 splines_4.5.2 parallel_4.5.2
## [46] vctrs_0.7.1 hardhat_1.4.3 Matrix_1.7-4
## [49] jsonlite_2.0.0 listenv_0.10.0 systemfonts_1.3.2
## [52] foreach_1.5.2 gower_1.0.2 jquerylib_0.1.4
## [55] glue_1.8.0 parallelly_1.46.1 codetools_0.2-20
## [58] lubridate_1.9.5 stringi_1.8.7 gtable_0.3.6
## [61] ROSE_0.0-4 tibble_3.3.1 pillar_1.11.1
## [64] htmltools_0.5.9 ipred_0.9-15 lava_1.8.2
## [67] R6_2.6.1 textshaping_1.0.5 evaluate_1.0.5
## [70] bslib_0.10.0 class_7.3-23 Rcpp_1.1.1
## [73] svglite_2.2.2 nlme_3.1-168 prodlim_2025.04.28
## [76] xfun_0.56 ModelMetrics_1.2.2.2 pkgconfig_2.0.3
End of Report — Introduction to Data Science Final Project