Kuisbag2

# Package
library(AER)
## Warning: package 'AER' was built under R version 4.1.2
## Loading required package: car
## Warning: package 'car' was built under R version 4.1.2
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.1.2
## Loading required package: lmtest
## Warning: package 'lmtest' was built under R version 4.1.2
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.1.2
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich
## Warning: package 'sandwich' was built under R version 4.1.2
## Loading required package: survival
## Warning: package 'survival' was built under R version 4.1.2
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.1.2
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(caret)
## Warning: package 'caret' was built under R version 4.1.2
## Loading required package: ggplot2
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 4.1.2
## 
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
## 
##     cluster
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
## The following object is masked from 'package:car':
## 
##     logit

Data A

data(Affairs, package="AER")
head(Affairs)
##    affairs gender age yearsmarried children religiousness education occupation
## 4        0   male  37        10.00       no             3        18          7
## 5        0 female  27         4.00       no             4        14          6
## 11       0 female  32        15.00      yes             1        12          1
## 16       0   male  57        15.00      yes             5        18          6
## 23       0   male  22         0.75       no             2        17          6
## 29       0 female  32         1.50       no             2        17          5
##    rating
## 4       4
## 5       4
## 11      4
## 16      5
## 23      3
## 29      5
describe(Affairs)
##               vars   n  mean   sd median trimmed  mad   min max range  skew
## affairs          1 601  1.46 3.30      0    0.55 0.00  0.00  12 12.00  2.34
## gender*          2 601  1.48 0.50      1    1.47 0.00  1.00   2  1.00  0.10
## age              3 601 32.49 9.29     32   31.37 7.41 17.50  57 39.50  0.88
## yearsmarried     4 601  8.18 5.57      7    8.26 8.15  0.12  15 14.88  0.08
## children*        5 601  1.72 0.45      2    1.77 0.00  1.00   2  1.00 -0.95
## religiousness    6 601  3.12 1.17      3    3.12 1.48  1.00   5  4.00 -0.09
## education        7 601 16.17 2.40     16   16.21 2.97  9.00  20 11.00 -0.25
## occupation       8 601  4.19 1.82      5    4.34 1.48  1.00   7  6.00 -0.74
## rating           9 601  3.93 1.10      4    4.07 1.48  1.00   5  4.00 -0.83
##               kurtosis   se
## affairs           4.19 0.13
## gender*          -1.99 0.02
## age               0.21 0.38
## yearsmarried     -1.57 0.23
## children*        -1.09 0.02
## religiousness    -1.02 0.05
## education        -0.32 0.10
## occupation       -0.79 0.07
## rating           -0.22 0.04

Memilih contoh acak

nim_terakhir <- 31001  
set.seed(nim_terakhir)

sampled_data <- Affairs[sample(nrow(Affairs), 200, replace = FALSE), ]

Menampilkan 10 pengamatan pertama

head(sampled_data, 10)
##      affairs gender age yearsmarried children religiousness education
## 1599       1 female  22        4.000      yes             3        16
## 176       12   male  37       10.000      yes             2        20
## 186        7   male  37       10.000      yes             2        18
## 277        0 female  37       15.000      yes             4        14
## 1086       0   male  27        4.000      yes             3        18
## 1850       0   male  22        4.000      yes             4        16
## 1177       0 female  27        4.000      yes             3        18
## 933        2 female  27        7.000      yes             3        17
## 1056      12 female  42       15.000      yes             3        14
## 67         1   male  22        0.125       no             4        16
##      occupation rating
## 1599          1      3
## 176           6      2
## 186           7      3
## 277           3      1
## 1086          5      5
## 1850          5      5
## 1177          4      5
## 933           5      3
## 1056          4      3
## 67            5      5

Menampilkan informasi ada berapa pengamatan untuk setiap kategori dari respon Y

table(sampled_data$affairs)  
## 
##   0   1   2   3   7  12 
## 145  12   8   6  15  14

Mentransformasi peubah respon Y menjadi dua kategori saja

sampled_data$affairs <- ifelse(sampled_data$affairs > 0, 1, 0)
table(sampled_data$affairs)
## 
##   0   1 
## 145  55

Membagi data secara acak menjadi data latih (80%) dan data uji (20%)

set.seed(nim_terakhir)  
split_index <- createDataPartition(sampled_data$affairs, p = 0.8, list = FALSE)
train_data <- sampled_data[split_index, ]
test_data <- sampled_data[-split_index, ]
train_data$affairs = as.factor(train_data$affairs)
test_data$affairs = as.factor(test_data$affairs)
table(train_data$affairs)
## 
##   0   1 
## 116  44

Regresi Logistrik

CV untuk Regresi Logistik

# Atur kontrol untuk validasi silang (misalnya, 10-fold cross-validation)
ctrl <- trainControl(method = "cv", number = 10)

# Latih model regresi logistik menggunakan validasi silang
modelLog <- train(affairs ~ ., data = train_data, method = "glm", family = binomial, trControl = ctrl)

# Tampilkan hasil
print(modelLog)
## Generalized Linear Model 
## 
## 160 samples
##   8 predictor
##   2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 144, 144, 143, 144, 144, 144, ... 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.7613725  0.3163236
summary(modelLog)
## 
## Call:
## NULL
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.7982  -0.7808  -0.4826   0.8043   2.5538  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    1.65819    1.87357   0.885 0.376134    
## gendermale     0.28310    0.50351   0.562 0.573936    
## age           -0.01832    0.03270  -0.560 0.575200    
## yearsmarried   0.10097    0.06264   1.612 0.106991    
## childrenyes    0.51807    0.61810   0.838 0.401939    
## religiousness -0.58401    0.17657  -3.308 0.000941 ***
## education      0.04428    0.10577   0.419 0.675460    
## occupation    -0.07204    0.16475  -0.437 0.661917    
## rating        -0.55696    0.19414  -2.869 0.004120 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 188.21  on 159  degrees of freedom
## Residual deviance: 157.39  on 151  degrees of freedom
## AIC: 175.39
## 
## Number of Fisher Scoring iterations: 5

Evaluasi Model

predlog = predict(modelLog, test_data) 
p_predlog = predict(modelLog,test_data,type="prob")
actuallog = factor(test_data$affairs) 

cm_log <- confusionMatrix(actuallog, predlog, positive = "1")
cm_log
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 24  5
##          1 10  1
##                                          
##                Accuracy : 0.625          
##                  95% CI : (0.458, 0.7727)
##     No Information Rate : 0.85           
##     P-Value [Acc > NIR] : 0.9999         
##                                          
##                   Kappa : -0.0949        
##                                          
##  Mcnemar's Test P-Value : 0.3017         
##                                          
##             Sensitivity : 0.16667        
##             Specificity : 0.70588        
##          Pos Pred Value : 0.09091        
##          Neg Pred Value : 0.82759        
##              Prevalence : 0.15000        
##          Detection Rate : 0.02500        
##    Detection Prevalence : 0.27500        
##       Balanced Accuracy : 0.43627        
##                                          
##        'Positive' Class : 1              
## 

Kurva ROC Regresi Logistik

roclog = roc(test_data$affairs, as.numeric(predlog))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roclog, main = "Kurva ROC untuk Regresi Logistik", col = "blue")

KNN

CV untuk KNN

# Atur kontrol untuk validasi silang (misalnya, 10-fold cross-validation)
ctrl <- trainControl(method = "cv", number = 10)

# Latih model regresi logistik menggunakan validasi silang
modelknn <- train(affairs~., data = train_data, method = "knn", trControl = ctrl)

# Tampilkan hasil
print(modelknn)
## k-Nearest Neighbors 
## 
## 160 samples
##   8 predictor
##   2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 144, 144, 145, 145, 144, 143, ... 
## Resampling results across tuning parameters:
## 
##   k  Accuracy   Kappa    
##   5  0.7612255  0.2410670
##   7  0.7549265  0.1914154
##   9  0.7369118  0.1588204
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 5.

Evaluasi Model

predknn = predict(modelknn, test_data)  
p_predknn = predict(modelknn,test_data,type="prob")
actualknn = factor(test_data$affairs)

cm_knn <- confusionMatrix(actualknn, predknn, positive = "1")
cm_knn
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 25  4
##          1  8  3
##                                           
##                Accuracy : 0.7             
##                  95% CI : (0.5347, 0.8344)
##     No Information Rate : 0.825           
##     P-Value [Acc > NIR] : 0.9843          
##                                           
##                   Kappa : 0.1519          
##                                           
##  Mcnemar's Test P-Value : 0.3865          
##                                           
##             Sensitivity : 0.4286          
##             Specificity : 0.7576          
##          Pos Pred Value : 0.2727          
##          Neg Pred Value : 0.8621          
##              Prevalence : 0.1750          
##          Detection Rate : 0.0750          
##    Detection Prevalence : 0.2750          
##       Balanced Accuracy : 0.5931          
##                                           
##        'Positive' Class : 1               
## 

Kurva ROC KKN

rocknn = roc(test_data$affairs, as.numeric(predknn))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(rocknn, main = "Kurva ROC untuk KNN", col = "red")

Perbandingan Kurva ROC Regresi Logistik dan KNN

# Plot kurva ROC untuk Regresi Logistik (roclog)
plot(roclog, main = "Kurva ROC untuk Regresi Logistik", col = "blue")

# Tambahkan kurva ROC untuk KNN (rocknn) pada grafik yang sudah ada
lines(rocknn, col = "red")

Odds Ratios Regresi Logistika

log_coef <- coef(modelLog$finalModel)
odds_ratios_log <- exp(log_coef)
odds_ratios_log
##   (Intercept)    gendermale           age  yearsmarried   childrenyes 
##     5.2497745     1.3272432     0.9818428     1.1062415     1.6787850 
## religiousness     education    occupation        rating 
##     0.5576576     1.0452777     0.9304956     0.5729486

Odds Ratios KNN

predicted_probs_knn <- p_predknn[, "1"]
odds_ratios_knn <- predicted_probs_knn / (1 - predicted_probs_knn)
odds_ratios_knn
##  [1] 0.2500000 1.0000000 0.0000000 1.0000000 0.6666667 0.0000000 0.0000000
##  [8] 0.6666667 0.2500000 0.2500000       Inf 0.2500000 6.0000000 0.2500000
## [15] 1.5000000 0.2500000 0.0000000 0.2500000 0.2000000 0.6666667 0.2500000
## [22] 1.0000000 0.0000000 0.2500000 0.6666667 0.0000000       Inf 0.5000000
## [29] 1.5000000 0.0000000 0.2500000 1.5000000 0.2500000 0.6666667 0.2000000
## [36] 0.2500000 1.5000000 0.5000000 0.2500000 0.0000000
# Membuat dataframe untuk model logistik
df_log <- data.frame(
  Model = "Logistic Regression",
  Accuracy = cm_log$overall["Accuracy"],
  Sensitivity = cm_log$byClass["Sensitivity"],
  Specificity = cm_log$byClass["Specificity"],
  Precision = cm_log$byClass["Pos Pred Value"],
  F1_Score = cm_log$byClass["F1"]
)

# Membuat dataframe untuk model KNN
df_knn <- data.frame(
  Model = "K-Nearest Neighbors",
  Accuracy = cm_knn$overall["Accuracy"],
  Sensitivity = cm_knn$byClass["Sensitivity"],
  Specificity = cm_knn$byClass["Specificity"],
  Precision = cm_knn$byClass["Pos Pred Value"],
  F1_Score = cm_knn$byClass["F1"]
)

# Menggabungkan kedua dataframe
df_result <- rbind(df_log, df_knn)

# Menampilkan hasil
print(df_result)
##                         Model Accuracy Sensitivity Specificity  Precision
## Accuracy  Logistic Regression    0.625   0.1666667   0.7058824 0.09090909
## Accuracy1 K-Nearest Neighbors    0.700   0.4285714   0.7575758 0.27272727
##            F1_Score
## Accuracy  0.1176471
## Accuracy1 0.3333333

Interpretasi

  • Berdasarkan kurva ROC, regresi logistik (garis biru) berada di atas kurva ROC KNN (garis merah). Hal ini menunjukkan bahwa regresi logistik lebih baik dalam memprediksi kasus daripada KNN.

  • Berdasarkan nilai Accuracy, KNN memiliki akurasi yang lebih tinggi daripada Regresi Logistik. Hal ini berarti bahwa regresi logistik lebih baik dalam memprediksi data secara keseluruhan.