library(dplyr)
library(caret)
library(broom)
library(pROC)
library(tidyr)
library(e1071)
library(ggplot2)
df <- read.csv("healthcare-dataset-stroke-data.csv")
str(df)
## 'data.frame': 5110 obs. of 12 variables:
## $ id : int 9046 51676 31112 60182 1665 56669 53882 10434 27419 60491 ...
## $ gender : chr "Male" "Female" "Male" "Female" ...
## $ age : num 67 61 80 49 79 81 74 69 59 78 ...
## $ hypertension : int 0 0 0 0 1 0 1 0 0 0 ...
## $ heart_disease : int 1 0 1 0 0 0 1 0 0 0 ...
## $ ever_married : chr "Yes" "Yes" "Yes" "Yes" ...
## $ work_type : chr "Private" "Self-employed" "Private" "Private" ...
## $ Residence_type : chr "Urban" "Rural" "Rural" "Urban" ...
## $ avg_glucose_level: num 229 202 106 171 174 ...
## $ bmi : chr "36.6" "N/A" "32.5" "34.4" ...
## $ smoking_status : chr "formerly smoked" "never smoked" "never smoked" "smokes" ...
## $ stroke : int 1 1 1 1 1 1 1 1 1 1 ...
The dataset contains the following variables:
"Male", "Female", or
"Other".0 if
the patient does not have hypertension, 1 if they do.0 if
the patient does not have heart disease, 1 if they do."No" or "Yes"."children", "Govt_job",
"Never_worked", "Private", or
"Self-employed"."Rural" or "Urban"."formerly smoked", "never smoked",
"smokes", or "Unknown".
"Unknown" indicates that smoking
information is not available for the patient.1 if the
patient had a stroke, 0 otherwise.chr_var <- c("gender","ever_married","work_type","Residence_type","smoking_status")
df$bmi <- as.numeric(df$bmi)
df1 <- df %>%
filter(gender != "Other",
!is.na(bmi))%>%
mutate(across(all_of(chr_var), as.factor))%>%
mutate(gender = factor(gender, levels = c("Male", "Female")),
heart_disease = factor(heart_disease),
hypertension = factor(hypertension),
ever_married = factor(ever_married, levels = c("No","Yes"))
)
summary(df1)
## id gender age hypertension heart_disease
## Min. : 77 Male :2011 Min. : 0.08 0:4457 0:4665
## 1st Qu.:18603 Female:2897 1st Qu.:25.00 1: 451 1: 243
## Median :37581 Median :44.00
## Mean :37060 Mean :42.87
## 3rd Qu.:55182 3rd Qu.:60.00
## Max. :72940 Max. :82.00
## ever_married work_type Residence_type avg_glucose_level
## No :1704 children : 671 Rural:2418 Min. : 55.12
## Yes:3204 Govt_job : 630 Urban:2490 1st Qu.: 77.07
## Never_worked : 22 Median : 91.68
## Private :2810 Mean :105.30
## Self-employed: 775 3rd Qu.:113.50
## Max. :271.74
## bmi smoking_status stroke
## Min. :10.30 formerly smoked: 836 Min. :0.00000
## 1st Qu.:23.50 never smoked :1852 1st Qu.:0.00000
## Median :28.10 smokes : 737 Median :0.00000
## Mean :28.89 Unknown :1483 Mean :0.04258
## 3rd Qu.:33.10 3rd Qu.:0.00000
## Max. :97.60 Max. :1.00000
### ---- BMI ----
ggplot(df1, aes(x = factor(stroke, levels = c(0, 1), labels = c("No Stroke", "Stroke")),
y = bmi, fill = factor(stroke))) +
geom_boxplot(alpha = 0.7, outlier.shape = 21, outlier.fill = "white", outlier.color = "black") +
scale_fill_manual(values = c("No Stroke" = "#3498DB", "Stroke" = "#E74C3C")) +
labs(
title = "BMI Distribution by Stroke Status",
subtitle = "Comparing BMI between individuals with and without stroke",
x = "Stroke Status",
y = "Body Mass Index (BMI)",
fill = "Stroke"
) +
theme_minimal(base_size = 14) +
theme(
plot.title = element_text(face = "bold", size = 18, hjust = 0.5),
plot.subtitle = element_text(size = 13, hjust = 0.5),
axis.title = element_text(face = "bold"),
legend.position = "top",
panel.grid.major.y = element_line(color = "gray80"),
panel.grid.minor = element_blank()
)
The graph box-plot above shows that, people who has stroke has average BMI above 25 which categorize as overweight. While, non-stroke person has wider range of BMI from just under 25 up to around 30, this fall into health to overweight. Based on that comparison above, we cannot say that there a significant different of BMI of group of people with Stoke and Withou Stroke.
### ---- AGE ----
ggplot(df1, aes(x = factor(stroke, levels = c(0, 1), labels = c("No Stroke", "Stroke")),
y = age, fill = factor(stroke))) +
geom_boxplot(alpha = 0.7, outlier.shape = 21, outlier.fill = "white", outlier.color = "black") +
scale_fill_manual(values = c("No Stroke" = "#3498DB", "Stroke" = "#E74C3C")) +
labs(
title = "Age Distribution by Stroke Status",
subtitle = "Comparing age between individuals with and without stroke",
x = "Stroke Status",
y = "Age (years)",
fill = "Stroke"
) +
theme_minimal(base_size = 14) +
theme(
plot.title = element_text(face = "bold", size = 18, hjust = 0.5),
plot.subtitle = element_text(size = 13, hjust = 0.5),
axis.title = element_text(face = "bold"),
legend.position = "top",
panel.grid.major.y = element_line(color = "gray80"),
panel.grid.minor = element_blank()
)
Graph above shows that, There is significant evidence that majority of people with stoke is in their 60th, while people below that age mostly has no stoke.
### ---- Glucode Level ----
ggplot(df1, aes(x = factor(stroke, levels = c(0, 1), labels = c("No Stroke", "Stroke")),
y = avg_glucose_level, fill = factor(stroke))) +
geom_boxplot(alpha = 0.7, outlier.shape = 21, outlier.fill = "white", outlier.color = "black") +
scale_fill_manual(values = c("No Stroke" = "#3498DB", "Stroke" = "#E74C3C")) +
labs(
title = "Average Glucose Level by Stroke Status",
subtitle = "Comparing glucose levels between individuals with and without stroke",
x = "Stroke Status",
y = "Average Glucose Level (mg/dL)",
fill = "Stroke"
) +
theme_minimal(base_size = 14) +
theme(
plot.title = element_text(face = "bold", size = 18, hjust = 0.5),
plot.subtitle = element_text(size = 13, hjust = 0.5),
axis.title = element_text(face = "bold"),
legend.position = "top",
panel.grid.major.y = element_line(color = "gray80"),
panel.grid.minor = element_blank()
)
mg/dL.
df1s <- df1 %>%
mutate(
stroke = factor(stroke, levels = c(0, 1), labels = c("No Stroke", "Stroke")),
hypertension = factor(hypertension, levels = c(0, 1), labels = c("No Hypertension", "Hypertension")),
heart_disease = factor(heart_disease, levels = c(0, 1), labels = c("No Heart Disease", "Heart Disease"))
)
facet_vars <- c("ever_married", "work_type", "smoking_status")
df_long <- df1s %>%
select(stroke, all_of(facet_vars)) %>%
pivot_longer(cols = all_of(facet_vars), names_to = "variable", values_to = "category")
df_prop <- df_long %>%
group_by(variable, category, stroke) %>%
summarise(n = n(), .groups = "drop") %>%
group_by(variable, category) %>%
mutate(prop = round(n / sum(n) * 100, 1))
plot_stroke_distribution <- function(data, varname) {
data %>%
count(!!sym(varname), stroke) %>%
group_by(!!sym(varname)) %>%
mutate(prop = round(n / sum(n) * 100, 1)) %>%
ggplot(aes(x = !!sym(varname), y = n, fill = stroke)) +
geom_bar(stat = "identity", position = "stack", color = "black", alpha = 0.9) +
geom_text(aes(label = paste0(prop, "%")),
position = position_stack(vjust = 0.5), size = 3.5, color = "white") +
scale_fill_manual(values = c("No Stroke" = "#3498DB", "Stroke" = "#E74C3C")) +
labs(
title = paste("Stroke Distribution by", gsub("_", " ", varname)),
x = gsub("_", " ", varname),
y = "Count",
fill = "Stroke Status"
) +
theme_minimal(base_size = 14) +
theme(
plot.title = element_text(face = "bold", size = 18, hjust = 0.5),
axis.text.x = element_text(angle = 30, hjust = 1),
legend.position = "top"
)
}
plot_stroke_distribution(df1s, "ever_married")
plot_stroke_distribution(df1s, "work_type")
plot_stroke_distribution(df1s, "smoking_status")
set.seed(1028)
split_idx <- createDataPartition(df1s$stroke, p = 0.8, list = FALSE)
train_data <- df1[split_idx,-1] # Exclude ID
test_data <- df1[-split_idx,-1 ] # Exclude ID
mo1 <- glm( formula = stroke ~ .,
data = train_data,family = "binomial"
)
summary(mo1)
##
## Call:
## glm(formula = stroke ~ ., family = "binomial", data = train_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.192694 1.076998 -6.678 2.41e-11 ***
## genderFemale -0.005719 0.172364 -0.033 0.97353
## age 0.074458 0.007202 10.338 < 2e-16 ***
## hypertension1 0.502230 0.196700 2.553 0.01067 *
## heart_disease1 0.430320 0.228863 1.880 0.06007 .
## ever_marriedYes -0.053360 0.287717 -0.185 0.85287
## work_typeGovt_job -0.815798 1.142701 -0.714 0.47528
## work_typeNever_worked -9.943494 351.454800 -0.028 0.97743
## work_typePrivate -0.759412 1.126865 -0.674 0.50037
## work_typeSelf-employed -1.176107 1.149679 -1.023 0.30631
## Residence_typeUrban -0.067803 0.167262 -0.405 0.68521
## avg_glucose_level 0.004677 0.001446 3.235 0.00122 **
## bmi 0.005733 0.013318 0.431 0.66682
## smoking_statusnever smoked -0.153420 0.206006 -0.745 0.45643
## smoking_statussmokes -0.005535 0.267156 -0.021 0.98347
## smoking_statusUnknown -0.239800 0.264640 -0.906 0.36486
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1387.8 on 3927 degrees of freedom
## Residual deviance: 1092.7 on 3912 degrees of freedom
## AIC: 1124.7
##
## Number of Fisher Scoring iterations: 14
prob_mo1 <- predict(mo1,newdata = test_data,type = "response")
pred_mo1 <- ifelse(prob_mo1 > 0.04, 1, 0)
confusionMatrix(factor(pred_mo1),
factor(test_data$stroke), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 689 8
## 1 250 33
##
## Accuracy : 0.7367
## 95% CI : (0.708, 0.7641)
## No Information Rate : 0.9582
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1409
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.80488
## Specificity : 0.73376
## Pos Pred Value : 0.11661
## Neg Pred Value : 0.98852
## Prevalence : 0.04184
## Detection Rate : 0.03367
## Detection Prevalence : 0.28878
## Balanced Accuracy : 0.76932
##
## 'Positive' Class : 1
##
roc_mo1 <- roc(test_data$stroke, prob_mo1)
plot(roc_mo1)
thresh_mo1 <- coords(roc_mo1, "best", ret = "threshold", best.method = "closest.topleft")
pred_mo1_opt <- ifelse(prob_mo1 > thresh_mo1$threshold, 1, 0)
confusionMatrix(factor(pred_mo1_opt),
factor(test_data$stroke), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 702 8
## 1 237 33
##
## Accuracy : 0.75
## 95% CI : (0.7217, 0.7768)
## No Information Rate : 0.9582
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1505
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.80488
## Specificity : 0.74760
## Pos Pred Value : 0.12222
## Neg Pred Value : 0.98873
## Prevalence : 0.04184
## Detection Rate : 0.03367
## Detection Prevalence : 0.27551
## Balanced Accuracy : 0.77624
##
## 'Positive' Class : 1
##
library(randomForest)
tuneRF(train_data[, -which(names(train_data) == "stroke")],
train_data$stroke,
stepFactor = 1.5,
improve = 0.01,
ntreeTry = 500)
## mtry = 3 OOB error = 0.03954649
## Searching left ...
## mtry = 2 OOB error = 0.0388864
## 0.01669144 0.01
## Searching right ...
## mtry = 4 OOB error = 0.03996808
## -0.02781641 0.01
## mtry OOBError
## 2 2 0.03888640
## 3 3 0.03954649
## 4 4 0.03996808
mo2 <- randomForest(
stroke ~.,
data = train_data,
ntree = 500,
mtry = 2,
importance = TRUE
)
prob_mo2 <- predict(mo2, newdata = test_data, type = "response")
pred_mo2 <- ifelse(prob_mo2 > 0.04,1,0)
confusionMatrix(factor(pred_mo2),
factor(test_data$stroke),positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 675 10
## 1 264 31
##
## Accuracy : 0.7204
## 95% CI : (0.6912, 0.7483)
## No Information Rate : 0.9582
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1199
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.75610
## Specificity : 0.71885
## Pos Pred Value : 0.10508
## Neg Pred Value : 0.98540
## Prevalence : 0.04184
## Detection Rate : 0.03163
## Detection Prevalence : 0.30102
## Balanced Accuracy : 0.73747
##
## 'Positive' Class : 1
##
roc_mo2 <- roc(test_data$stroke, prob_mo2)
plot(roc_mo2)
thresh_mo2 <- coords(roc_mo2, "best", ret = "threshold", best.method = "closest.topleft")
pred_mo2_opt <- ifelse(prob_mo2 > thresh_mo2$threshold, 1, 0)
confusionMatrix(factor(pred_mo2_opt),
factor(test_data$stroke),positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 688 10
## 1 251 31
##
## Accuracy : 0.7337
## 95% CI : (0.7048, 0.7611)
## No Information Rate : 0.9582
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1283
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.75610
## Specificity : 0.73269
## Pos Pred Value : 0.10993
## Neg Pred Value : 0.98567
## Prevalence : 0.04184
## Detection Rate : 0.03163
## Detection Prevalence : 0.28776
## Balanced Accuracy : 0.74440
##
## 'Positive' Class : 1
##
importance(mo2)
## %IncMSE IncNodePurity
## gender 1.0061629 2.499924
## age 13.2062128 23.206689
## hypertension 3.2246503 3.119429
## heart_disease 3.6083795 3.330760
## ever_married 10.2882264 1.972012
## work_type 0.8099144 4.849053
## Residence_type -1.9388501 2.506115
## avg_glucose_level -4.9774013 23.818775
## bmi 8.5079719 19.095851
## smoking_status -1.8192926 6.035214
varImpPlot(mo2,
main = "Feature Importance - Random Forest",
type = 2,
col = "#2E86C1",
pch = 19,
cex = 1.2)
mo3 <- naiveBayes(
stroke ~ age + hypertension + heart_disease + avg_glucose_level + bmi +
gender + ever_married + work_type + Residence_type + smoking_status,
data = train_data
)
mo3
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## 0 1
## 0.95723014 0.04276986
##
## Conditional probabilities:
## age
## Y [,1] [,2]
## 0 41.85461 22.22224
## 1 67.82738 12.21710
##
## hypertension
## Y 0 1
## 0 0.91808511 0.08191489
## 1 0.72023810 0.27976190
##
## heart_disease
## Y 0 1
## 0 0.95664894 0.04335106
## 1 0.80357143 0.19642857
##
## avg_glucose_level
## Y [,1] [,2]
## 0 104.1366 43.19514
## 1 134.6539 62.27698
##
## bmi
## Y [,1] [,2]
## 0 28.81915 8.017786
## 1 30.45119 6.102370
##
## gender
## Y Male Female
## 0 0.4050532 0.5949468
## 1 0.4285714 0.5714286
##
## ever_married
## Y No Yes
## 0 0.3558511 0.6441489
## 1 0.1011905 0.8988095
##
## work_type
## Y children Govt_job Never_worked Private Self-employed
## 0 0.145212766 0.127127660 0.004521277 0.565691489 0.157446809
## 1 0.005952381 0.148809524 0.000000000 0.589285714 0.255952381
##
## Residence_type
## Y Rural Urban
## 0 0.4925532 0.5074468
## 1 0.4940476 0.5059524
##
## smoking_status
## Y formerly smoked never smoked smokes Unknown
## 0 0.1672872 0.3781915 0.1476064 0.3069149
## 1 0.2976190 0.3988095 0.1488095 0.1547619
prob_mo3 <- predict(mo3, newdata = test_data, type = "raw")[, "1"]
pred_mo3 <- predict(mo3, newdata = test_data, type = "class")
confusionMatrix(pred_mo3,
factor(test_data$stroke), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 870 29
## 1 69 12
##
## Accuracy : 0.9
## 95% CI : (0.8795, 0.9181)
## No Information Rate : 0.9582
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1495
##
## Mcnemar's Test P-Value : 8.162e-05
##
## Sensitivity : 0.29268
## Specificity : 0.92652
## Pos Pred Value : 0.14815
## Neg Pred Value : 0.96774
## Prevalence : 0.04184
## Detection Rate : 0.01224
## Detection Prevalence : 0.08265
## Balanced Accuracy : 0.60960
##
## 'Positive' Class : 1
##
roc_mo3 <- roc(test_data$stroke, prob_mo3)
plot(roc_mo3)
thresh_mo3 <- coords(roc_mo3, "best", ret = "threshold", best.method = "youden")
pred_mo3_opt <- ifelse(prob_mo3 > thresh_mo3$threshold, 1, 0)
confusionMatrix(factor(pred_mo3_opt),
factor(test_data$stroke), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 631 6
## 1 308 35
##
## Accuracy : 0.6796
## 95% CI : (0.6494, 0.7087)
## No Information Rate : 0.9582
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1162
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.85366
## Specificity : 0.67199
## Pos Pred Value : 0.10204
## Neg Pred Value : 0.99058
## Prevalence : 0.04184
## Detection Rate : 0.03571
## Detection Prevalence : 0.35000
## Balanced Accuracy : 0.76283
##
## 'Positive' Class : 1
##
According to study above.