Imbalanced Data Handling

Library

library(themis)
library(tidyverse)
library(tidymodels)
library(DataExplorer)
library(readxl)
library(caret)
library(ggplot2)
library(dplyr)
library(DescTools)
library(e1071)
library(ROCR)

Data

Data yang digunakan dalam materi ini adalah data Pasien yang terdiri dari Gender, Riwayat Penyakit Jantung, Riwayat Hipertensi, Status Pernikahan, berdasarkan keempat variabel tersebut akan diperiksa apakah pasien menderita penyakit Stroke/Tidak.

pasien<- read_xlsx("D:/Data Stroke.xlsx")
pasien<- na.omit(pasien)
head(pasien)
## # A tibble: 6 × 5
##   gender hypertension heart_disease ever_married stroke
##   <chr>  <chr>        <chr>         <chr>        <chr> 
## 1 Male   No           No            No           No    
## 2 Male   Yes          No            Yes          No    
## 3 Female No           No            No           No    
## 4 Female No           No            Yes          No    
## 5 Male   No           No            No           No    
## 6 Female No           No            Yes          No
pasien$gender <- as.factor(pasien$gender)
pasien$hypertension <- as.factor(pasien$hypertension)
pasien$heart_disease <- as.factor(pasien$heart_disease)
pasien$ever_married <- as.factor(pasien$ever_married)

pasien$stroke <- as.factor(pasien$stroke)

Eksplorasi Data

plot_intro(data = pasien)

pasien%>%count(stroke)
## # A tibble: 2 × 2
##   stroke     n
##   <fct>  <int>
## 1 No     42617
## 2 Yes      783
plotdata <- pasien %>%
  count(stroke) %>%
  arrange(desc(stroke)) %>%
  mutate(prop = round(n*100/sum(n), 1),
         lab.ypos = cumsum(prop) - 0.5*prop)

# Pie Chart
ggplot(plotdata, aes(x = "", y = prop, fill = stroke)) +
  geom_bar(width = 1, stat = "identity", color = "white") +
  coord_polar("y", start = 0)+
  geom_text(aes(y = lab.ypos, label = prop), color = "black")+
  theme_void()+
  labs(title = "Persentase Pasien Stroke")

Berdasarkan hasil eksplorasi di atas terlihat bahwa data Pasien Stroke merupakan data yang tidak seimbang karena jumlah pasien stroke stroke hanya 1,8 % dari keseluruhan data.

Partisi Data

Data Pasien dipartisi dengan proporsi data training sebesar 80% dan testing 20%.

set.seed(1001)
train_pasien_y <- createDataPartition(pasien$stroke, p = 0.80, list=FALSE)
trainData <- pasien[train_pasien_y,]
testData <- pasien[-train_pasien_y,]
n<-nrow(trainData)
n
## [1] 34721
trainData%>%count(stroke)
## # A tibble: 2 × 2
##   stroke     n
##   <fct>  <int>
## 1 No     34094
## 2 Yes      627
testData%>%count(stroke)
## # A tibble: 2 × 2
##   stroke     n
##   <fct>  <int>
## 1 No      8523
## 2 Yes      156

Naive Bayes dengan Imbalanced Data

Proses Training

NBClassifier <- naiveBayes(stroke ~., data = trainData)
NBClassifier
## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##         No        Yes 
## 0.98194176 0.01805824 
## 
## Conditional probabilities:
##      gender
## Y           Female         Male        Other
##   No  0.5910424122 0.4086349504 0.0003226374
##   Yes 0.5438596491 0.4561403509 0.0000000000
## 
##      hypertension
## Y             No        Yes
##   No  0.90901625 0.09098375
##   Yes 0.74800638 0.25199362
## 
##      heart_disease
## Y             No        Yes
##   No  0.95673726 0.04326274
##   Yes 0.77671451 0.22328549
## 
##      ever_married
## Y            No       Yes
##   No  0.3595647 0.6404353
##   Yes 0.1052632 0.8947368

Proses Testing

# Predict using Naive Bayes
testData$predicted <- predict(NBClassifier,testData)
testData$actual <- testData$stroke
#confusion matrix
confusionMatrix(factor(testData$predicted),
                factor(testData$actual))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  8523  156
##        Yes    0    0
##                                          
##                Accuracy : 0.982          
##                  95% CI : (0.979, 0.9847)
##     No Information Rate : 0.982          
##     P-Value [Acc > NIR] : 0.5213         
##                                          
##                   Kappa : 0              
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 1.000          
##             Specificity : 0.000          
##          Pos Pred Value : 0.982          
##          Neg Pred Value :   NaN          
##              Prevalence : 0.982          
##          Detection Rate : 0.982          
##    Detection Prevalence : 1.000          
##       Balanced Accuracy : 0.500          
##                                          
##        'Positive' Class : No             
## 

Naive Bayes dengan Balanced data (Undersampling)

Metode Random Undersampling digunakan untuk menyeimbangkan sebaran setiap kelas dengan menghapus sampel kelas mayoritas secara acak.

Imbalanced Data Handling

set.seed(1001)
down_train <- downSample(x = trainData[, !colnames(trainData) %in% "stroke"],
                         y = trainData$stroke)

# we have to remane the Class variable
names(down_train)[5]<-"stroke"

table(down_train$stroke)
## 
##  No Yes 
## 627 627
down_train %>%
  count(stroke)
##   stroke   n
## 1     No 627
## 2    Yes 627

Proses Training

NBClassifier3 <- naiveBayes(stroke ~., data = down_train)
NBClassifier3
## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##  No Yes 
## 0.5 0.5 
## 
## Conditional probabilities:
##      gender
## Y        Female      Male     Other
##   No  0.5964912 0.4035088 0.0000000
##   Yes 0.5438596 0.4561404 0.0000000
## 
##      hypertension
## Y             No        Yes
##   No  0.93301435 0.06698565
##   Yes 0.74800638 0.25199362
## 
##      heart_disease
## Y            No       Yes
##   No  0.9569378 0.0430622
##   Yes 0.7767145 0.2232855
## 
##      ever_married
## Y            No       Yes
##   No  0.3333333 0.6666667
##   Yes 0.1052632 0.8947368

Proses Testing

# Predict using Naive Bayes
testData$predicted <- predict(NBClassifier3,testData)
testData$actual <- testData$stroke
#confusion matrix
confusionMatrix(factor(testData$predicted),
                factor(testData$actual))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  7494   90
##        Yes 1029   66
##                                           
##                Accuracy : 0.8711          
##                  95% CI : (0.8638, 0.8781)
##     No Information Rate : 0.982           
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0765          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.87927         
##             Specificity : 0.42308         
##          Pos Pred Value : 0.98813         
##          Neg Pred Value : 0.06027         
##              Prevalence : 0.98203         
##          Detection Rate : 0.86346         
##    Detection Prevalence : 0.87383         
##       Balanced Accuracy : 0.65117         
##                                           
##        'Positive' Class : No              
## 

Naive Bayes dengan Balanced data (Oversampling)

Metode Random Oversampling melakukan replikasi acak pada sampel minoritas untuk menyeimbangkan sebaran kelas.

Imbalanced Data Handling

set.seed(1001)
up_train <- upSample(x = trainData[, !colnames(trainData) %in% "Churn"],
                         y = trainData$stroke)

# we have to remane the Class variable
names(up_train)[5]<-"stroke"

table(up_train$stroke)
## 
##    No   Yes 
## 34094 34094
up_train %>%
  count(stroke)
##   stroke     n
## 1     No 34094
## 2    Yes 34094

Proses Training

NBClassifier1 <- naiveBayes(stroke ~., data = up_train)
NBClassifier1
## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##  No Yes 
## 0.5 0.5 
## 
## Conditional probabilities:
##      gender
## Y           Female         Male        Other
##   No  0.5910424122 0.4086349504 0.0003226374
##   Yes 0.5473103772 0.4526896228 0.0000000000
## 
##      hypertension
## Y             No        Yes
##   No  0.90901625 0.09098375
##   Yes 0.74678829 0.25321171
## 
##      heart_disease
## Y             No        Yes
##   No  0.95673726 0.04326274
##   Yes 0.77685223 0.22314777
## 
##      ever_married
## Y            No       Yes
##   No  0.3595647 0.6404353
##   Yes 0.1062064 0.8937936
## 
##      Class
## Y     No Yes
##   No   1   0
##   Yes  0   1

Proses Testing

# Predict using Naive Bayes
testData$predicted <- predict(NBClassifier1,testData)
testData$actual <- testData$stroke
#confusion matrix
confusionMatrix(factor(testData$predicted),
                factor(testData$actual))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  5882   63
##        Yes 2641   93
##                                           
##                Accuracy : 0.6884          
##                  95% CI : (0.6786, 0.6982)
##     No Information Rate : 0.982           
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0314          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.69013         
##             Specificity : 0.59615         
##          Pos Pred Value : 0.98940         
##          Neg Pred Value : 0.03402         
##              Prevalence : 0.98203         
##          Detection Rate : 0.67773         
##    Detection Prevalence : 0.68499         
##       Balanced Accuracy : 0.64314         
##                                           
##        'Positive' Class : No              
## 

Naive Bayes dengan Balanced data (SMOTE)

Synthetic Minority Over-sampling Technique (SMOTE) merupakan teknik oversampling yang menggunakan karakteristik K-nearest neighbor pada peubah penjelas untuk menghasilkan data sintetik pada kelas minoritas.

Imbalanced Data Handling

set.seed(1001)
train_smnc<-smotenc(trainData, var = "stroke", over_ratio = 0.8)

train_smnc%>%count(stroke)
## # A tibble: 2 × 2
##   stroke     n
##   <fct>  <int>
## 1 No     34094
## 2 Yes    27275

Proses Training

NBClassifier2 <- naiveBayes(stroke ~., data = train_smnc)
NBClassifier2
## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##        No       Yes 
## 0.5555574 0.4444426 
## 
## Conditional probabilities:
##      gender
## Y           Female         Male        Other
##   No  0.5910424122 0.4086349504 0.0003226374
##   Yes 0.5441613199 0.4558386801 0.0000000000
## 
##      hypertension
## Y             No        Yes
##   No  0.90901625 0.09098375
##   Yes 0.74830431 0.25169569
## 
##      heart_disease
## Y             No        Yes
##   No  0.95673726 0.04326274
##   Yes 0.77686526 0.22313474
## 
##      ever_married
## Y             No        Yes
##   No  0.35956473 0.64043527
##   Yes 0.09605866 0.90394134

Proses Testing

# Predict using Naive Bayes
testData$predicted <- predict(NBClassifier2,testData)
testData$actual <- testData$stroke
#confusion matrix
confusionMatrix(factor(testData$predicted),
                factor(testData$actual))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  7527   91
##        Yes  996   65
##                                           
##                Accuracy : 0.8748          
##                  95% CI : (0.8676, 0.8816)
##     No Information Rate : 0.982           
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0779          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.88314         
##             Specificity : 0.41667         
##          Pos Pred Value : 0.98805         
##          Neg Pred Value : 0.06126         
##              Prevalence : 0.98203         
##          Detection Rate : 0.86727         
##    Detection Prevalence : 0.87775         
##       Balanced Accuracy : 0.64990         
##                                           
##        'Positive' Class : No              
##