📌 1. Setup Library
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.3
## Warning: package 'readr' was built under R version 4.5.3
## Warning: package 'dplyr' was built under R version 4.5.3
## Warning: package 'forcats' was built under R version 4.5.3
## Warning: package 'lubridate' was built under R version 4.5.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.1 ✔ readr 2.2.0
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.2 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.5.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:dplyr':
##
## combine
##
## The following object is masked from 'package:ggplot2':
##
## margin
library(caret)
## Warning: package 'caret' was built under R version 4.5.3
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(smotefamily)
## Warning: package 'smotefamily' was built under R version 4.5.3
library(readxl)
## Warning: package 'readxl' was built under R version 4.5.3
library(ggplot2)
📌 2. Load Data
df <- read_excel("C:/Users/ACER/Downloads/data stroke.xls")
str(df)
## tibble [5,110 × 12] (S3: tbl_df/tbl/data.frame)
## $ id : num [1:5110] 9046 51676 31112 60182 1665 ...
## $ gender : chr [1:5110] "Male" "Female" "Male" "Female" ...
## $ age : num [1:5110] 67 61 80 49 79 81 74 69 59 78 ...
## $ hypertension : num [1:5110] 0 0 0 0 1 0 1 0 0 0 ...
## $ heart_disease : num [1:5110] 1 0 1 0 0 0 1 0 0 0 ...
## $ ever_married : chr [1:5110] "Yes" "Yes" "Yes" "Yes" ...
## $ work_type : chr [1:5110] "Private" "Self-employed" "Private" "Private" ...
## $ Residence_type : chr [1:5110] "Urban" "Rural" "Rural" "Urban" ...
## $ avg_glucose_level: num [1:5110] 229 202 106 171 174 ...
## $ bmi : chr [1:5110] "36.600000000000001" "N/A" "32.5" "34.399999999999999" ...
## $ smoking_status : chr [1:5110] "formerly smoked" "never smoked" "never smoked" "smokes" ...
## $ stroke : num [1:5110] 1 1 1 1 1 1 1 1 1 1 ...
summary(df)
## id gender age hypertension
## Min. : 67 Length:5110 Min. : 0.08 Min. :0.00000
## 1st Qu.:17741 Class :character 1st Qu.:25.00 1st Qu.:0.00000
## Median :36932 Mode :character Median :45.00 Median :0.00000
## Mean :36518 Mean :43.23 Mean :0.09746
## 3rd Qu.:54682 3rd Qu.:61.00 3rd Qu.:0.00000
## Max. :72940 Max. :82.00 Max. :1.00000
## heart_disease ever_married work_type Residence_type
## Min. :0.00000 Length:5110 Length:5110 Length:5110
## 1st Qu.:0.00000 Class :character Class :character Class :character
## Median :0.00000 Mode :character Mode :character Mode :character
## Mean :0.05401
## 3rd Qu.:0.00000
## Max. :1.00000
## avg_glucose_level bmi smoking_status stroke
## Min. : 55.12 Length:5110 Length:5110 Min. :0.00000
## 1st Qu.: 77.25 Class :character Class :character 1st Qu.:0.00000
## Median : 91.89 Mode :character Mode :character Median :0.00000
## Mean :106.15 Mean :0.04873
## 3rd Qu.:114.09 3rd Qu.:0.00000
## Max. :271.74 Max. :1.00000
📌 3. Preprocessing
# hapus id jika ada
if("id" %in% names(df)){
df$id <- NULL
}
# handle missing BMI
if("bmi" %in% names(df)){
df$bmi[is.na(df$bmi)] <- median(df$bmi, na.rm = TRUE)
}
## Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
## argument is not numeric or logical: returning NA
# ubah target
df$stroke <- as.factor(df$stroke)
📌 5. Encoding (Dummy Variables)
dummies <- dummyVars(stroke ~ ., data = df)
df_dummy <- data.frame(predict(dummies, newdata = df))
## Warning in model.frame.default(Terms, newdata, na.action = na.action, xlev =
## object$lvls): variable 'stroke' is not a factor
df_dummy$stroke <- df$stroke
📌 6. SMOTE
set.seed(123)
X <- df_dummy[, names(df_dummy) != "stroke"]
y <- df_dummy$stroke
# FIX ERROR SMOTE (WAJIB)
y_num <- as.numeric(y) - 1
smote_result <- SMOTE(X, y_num, K = 5)
df_smote <- smote_result$data
# rename target
df_smote$class <- as.factor(df_smote$class)
names(df_smote)[ncol(df_smote)] <- "stroke"
table(df_smote$stroke)
##
## 0 1
## 4861 4731
📌 7. Split Data
set.seed(123)
trainIndex <- createDataPartition(df_smote$stroke, p = 0.8, list = FALSE)
train <- df_smote[trainIndex, ]
test <- df_smote[-trainIndex, ]
table(train$stroke)
##
## 0 1
## 3889 3785
table(test$stroke)
##
## 0 1
## 972 946
📌 8. Model Random Forest
model_rf <- randomForest(
stroke ~ .,
data = train,
ntree = 100,
importance = TRUE
)
print(model_rf)
##
## Call:
## randomForest(formula = stroke ~ ., data = train, ntree = 100, importance = TRUE)
## Type of random forest: classification
## Number of trees: 100
## No. of variables tried at each split: 20
##
## OOB estimate of error rate: 2.65%
## Confusion matrix:
## 0 1 class.error
## 0 3847 42 0.01079969
## 1 161 3624 0.04253633
📌 9. Prediksi
pred <- predict(model_rf, test)
head(pred)
## 19 20 34 37 38 40
## 0 1 0 0 0 0
## Levels: 0 1
📌 10. Evaluasi Model
# FIX ERROR level
test$stroke <- factor(test$stroke, levels = levels(pred))
hasil <- confusionMatrix(pred, test$stroke)
hasil
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 965 40
## 1 7 906
##
## Accuracy : 0.9755
## 95% CI : (0.9675, 0.9819)
## No Information Rate : 0.5068
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.951
##
## Mcnemar's Test P-Value : 3.046e-06
##
## Sensitivity : 0.9928
## Specificity : 0.9577
## Pos Pred Value : 0.9602
## Neg Pred Value : 0.9923
## Prevalence : 0.5068
## Detection Rate : 0.5031
## Detection Prevalence : 0.5240
## Balanced Accuracy : 0.9753
##
## 'Positive' Class : 0
##
📌 11. Feature Importance
varImpPlot(model_rf)

📌 12. Visualisasi Confusion Matrix
cm <- confusionMatrix(pred, test$stroke)
cm_table <- as.data.frame(cm$table)
ggplot(cm_table, aes(x = Reference, y = Prediction, fill = Freq)) +
geom_tile() +
geom_text(aes(label = Freq), color = "white", size = 5) +
scale_fill_gradient(low = "black", high = "darkblue") +
labs(title = "Confusion Matrix Heatmap") +
theme_minimal()
