📌 1. Setup Library

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.3
## Warning: package 'readr' was built under R version 4.5.3
## Warning: package 'dplyr' was built under R version 4.5.3
## Warning: package 'forcats' was built under R version 4.5.3
## Warning: package 'lubridate' was built under R version 4.5.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.1     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.5.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
## 
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(caret)
## Warning: package 'caret' was built under R version 4.5.3
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(smotefamily)
## Warning: package 'smotefamily' was built under R version 4.5.3
library(readxl)
## Warning: package 'readxl' was built under R version 4.5.3
library(ggplot2)

📌 2. Load Data

df <- read_excel("C:/Users/ACER/Downloads/data stroke.xls")

str(df)
## tibble [5,110 × 12] (S3: tbl_df/tbl/data.frame)
##  $ id               : num [1:5110] 9046 51676 31112 60182 1665 ...
##  $ gender           : chr [1:5110] "Male" "Female" "Male" "Female" ...
##  $ age              : num [1:5110] 67 61 80 49 79 81 74 69 59 78 ...
##  $ hypertension     : num [1:5110] 0 0 0 0 1 0 1 0 0 0 ...
##  $ heart_disease    : num [1:5110] 1 0 1 0 0 0 1 0 0 0 ...
##  $ ever_married     : chr [1:5110] "Yes" "Yes" "Yes" "Yes" ...
##  $ work_type        : chr [1:5110] "Private" "Self-employed" "Private" "Private" ...
##  $ Residence_type   : chr [1:5110] "Urban" "Rural" "Rural" "Urban" ...
##  $ avg_glucose_level: num [1:5110] 229 202 106 171 174 ...
##  $ bmi              : chr [1:5110] "36.600000000000001" "N/A" "32.5" "34.399999999999999" ...
##  $ smoking_status   : chr [1:5110] "formerly smoked" "never smoked" "never smoked" "smokes" ...
##  $ stroke           : num [1:5110] 1 1 1 1 1 1 1 1 1 1 ...
summary(df)
##        id           gender               age         hypertension    
##  Min.   :   67   Length:5110        Min.   : 0.08   Min.   :0.00000  
##  1st Qu.:17741   Class :character   1st Qu.:25.00   1st Qu.:0.00000  
##  Median :36932   Mode  :character   Median :45.00   Median :0.00000  
##  Mean   :36518                      Mean   :43.23   Mean   :0.09746  
##  3rd Qu.:54682                      3rd Qu.:61.00   3rd Qu.:0.00000  
##  Max.   :72940                      Max.   :82.00   Max.   :1.00000  
##  heart_disease     ever_married        work_type         Residence_type    
##  Min.   :0.00000   Length:5110        Length:5110        Length:5110       
##  1st Qu.:0.00000   Class :character   Class :character   Class :character  
##  Median :0.00000   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :0.05401                                                           
##  3rd Qu.:0.00000                                                           
##  Max.   :1.00000                                                           
##  avg_glucose_level     bmi            smoking_status         stroke       
##  Min.   : 55.12    Length:5110        Length:5110        Min.   :0.00000  
##  1st Qu.: 77.25    Class :character   Class :character   1st Qu.:0.00000  
##  Median : 91.89    Mode  :character   Mode  :character   Median :0.00000  
##  Mean   :106.15                                          Mean   :0.04873  
##  3rd Qu.:114.09                                          3rd Qu.:0.00000  
##  Max.   :271.74                                          Max.   :1.00000

📌 3. Preprocessing

# hapus id jika ada
if("id" %in% names(df)){
  df$id <- NULL
}

# handle missing BMI
if("bmi" %in% names(df)){
  df$bmi[is.na(df$bmi)] <- median(df$bmi, na.rm = TRUE)
}
## Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
## argument is not numeric or logical: returning NA
# ubah target
df$stroke <- as.factor(df$stroke)

📌 4. Transformasi Data

df$gender <- as.factor(df$gender)
df$ever_married <- as.factor(df$ever_married)
df$work_type <- as.factor(df$work_type)
df$Residence_type <- as.factor(df$Residence_type)
df$smoking_status <- as.factor(df$smoking_status)

📌 5. Encoding (Dummy Variables)

dummies <- dummyVars(stroke ~ ., data = df)

df_dummy <- data.frame(predict(dummies, newdata = df))
## Warning in model.frame.default(Terms, newdata, na.action = na.action, xlev =
## object$lvls): variable 'stroke' is not a factor
df_dummy$stroke <- df$stroke

📌 6. SMOTE

set.seed(123)

X <- df_dummy[, names(df_dummy) != "stroke"]
y <- df_dummy$stroke

# FIX ERROR SMOTE (WAJIB)
y_num <- as.numeric(y) - 1

smote_result <- SMOTE(X, y_num, K = 5)

df_smote <- smote_result$data

# rename target
df_smote$class <- as.factor(df_smote$class)
names(df_smote)[ncol(df_smote)] <- "stroke"

table(df_smote$stroke)
## 
##    0    1 
## 4861 4731

📌 7. Split Data

set.seed(123)

trainIndex <- createDataPartition(df_smote$stroke, p = 0.8, list = FALSE)

train <- df_smote[trainIndex, ]
test  <- df_smote[-trainIndex, ]

table(train$stroke)
## 
##    0    1 
## 3889 3785
table(test$stroke)
## 
##   0   1 
## 972 946

📌 8. Model Random Forest

model_rf <- randomForest(
  stroke ~ .,
  data = train,
  ntree = 100,
  importance = TRUE
)

print(model_rf)
## 
## Call:
##  randomForest(formula = stroke ~ ., data = train, ntree = 100,      importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 100
## No. of variables tried at each split: 20
## 
##         OOB estimate of  error rate: 2.65%
## Confusion matrix:
##      0    1 class.error
## 0 3847   42  0.01079969
## 1  161 3624  0.04253633

📌 9. Prediksi

pred <- predict(model_rf, test)

head(pred)
## 19 20 34 37 38 40 
##  0  1  0  0  0  0 
## Levels: 0 1

📌 10. Evaluasi Model

# FIX ERROR level
test$stroke <- factor(test$stroke, levels = levels(pred))

hasil <- confusionMatrix(pred, test$stroke)

hasil
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 965  40
##          1   7 906
##                                           
##                Accuracy : 0.9755          
##                  95% CI : (0.9675, 0.9819)
##     No Information Rate : 0.5068          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.951           
##                                           
##  Mcnemar's Test P-Value : 3.046e-06       
##                                           
##             Sensitivity : 0.9928          
##             Specificity : 0.9577          
##          Pos Pred Value : 0.9602          
##          Neg Pred Value : 0.9923          
##              Prevalence : 0.5068          
##          Detection Rate : 0.5031          
##    Detection Prevalence : 0.5240          
##       Balanced Accuracy : 0.9753          
##                                           
##        'Positive' Class : 0               
## 

📌 11. Feature Importance

varImpPlot(model_rf)


📌 12. Visualisasi Confusion Matrix

cm <- confusionMatrix(pred, test$stroke)

cm_table <- as.data.frame(cm$table)

ggplot(cm_table, aes(x = Reference, y = Prediction, fill = Freq)) +
  geom_tile() +
  geom_text(aes(label = Freq), color = "white", size = 5) +
  scale_fill_gradient(low = "black", high = "darkblue") +
  labs(title = "Confusion Matrix Heatmap") +
  theme_minimal()