# Package
library(AER)
## Warning: package 'AER' was built under R version 4.1.2
## Loading required package: car
## Warning: package 'car' was built under R version 4.1.2
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.1.2
## Loading required package: lmtest
## Warning: package 'lmtest' was built under R version 4.1.2
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.1.2
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
## Warning: package 'sandwich' was built under R version 4.1.2
## Loading required package: survival
## Warning: package 'survival' was built under R version 4.1.2
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.1.2
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(caret)
## Warning: package 'caret' was built under R version 4.1.2
## Loading required package: ggplot2
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 4.1.2
##
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
##
## cluster
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
## The following object is masked from 'package:car':
##
## logit
data(Affairs, package="AER")
head(Affairs)
## affairs gender age yearsmarried children religiousness education occupation
## 4 0 male 37 10.00 no 3 18 7
## 5 0 female 27 4.00 no 4 14 6
## 11 0 female 32 15.00 yes 1 12 1
## 16 0 male 57 15.00 yes 5 18 6
## 23 0 male 22 0.75 no 2 17 6
## 29 0 female 32 1.50 no 2 17 5
## rating
## 4 4
## 5 4
## 11 4
## 16 5
## 23 3
## 29 5
describe(Affairs)
## vars n mean sd median trimmed mad min max range skew
## affairs 1 601 1.46 3.30 0 0.55 0.00 0.00 12 12.00 2.34
## gender* 2 601 1.48 0.50 1 1.47 0.00 1.00 2 1.00 0.10
## age 3 601 32.49 9.29 32 31.37 7.41 17.50 57 39.50 0.88
## yearsmarried 4 601 8.18 5.57 7 8.26 8.15 0.12 15 14.88 0.08
## children* 5 601 1.72 0.45 2 1.77 0.00 1.00 2 1.00 -0.95
## religiousness 6 601 3.12 1.17 3 3.12 1.48 1.00 5 4.00 -0.09
## education 7 601 16.17 2.40 16 16.21 2.97 9.00 20 11.00 -0.25
## occupation 8 601 4.19 1.82 5 4.34 1.48 1.00 7 6.00 -0.74
## rating 9 601 3.93 1.10 4 4.07 1.48 1.00 5 4.00 -0.83
## kurtosis se
## affairs 4.19 0.13
## gender* -1.99 0.02
## age 0.21 0.38
## yearsmarried -1.57 0.23
## children* -1.09 0.02
## religiousness -1.02 0.05
## education -0.32 0.10
## occupation -0.79 0.07
## rating -0.22 0.04
nim_terakhir <- 31001
set.seed(nim_terakhir)
sampled_data <- Affairs[sample(nrow(Affairs), 200, replace = FALSE), ]
head(sampled_data, 10)
## affairs gender age yearsmarried children religiousness education
## 1599 1 female 22 4.000 yes 3 16
## 176 12 male 37 10.000 yes 2 20
## 186 7 male 37 10.000 yes 2 18
## 277 0 female 37 15.000 yes 4 14
## 1086 0 male 27 4.000 yes 3 18
## 1850 0 male 22 4.000 yes 4 16
## 1177 0 female 27 4.000 yes 3 18
## 933 2 female 27 7.000 yes 3 17
## 1056 12 female 42 15.000 yes 3 14
## 67 1 male 22 0.125 no 4 16
## occupation rating
## 1599 1 3
## 176 6 2
## 186 7 3
## 277 3 1
## 1086 5 5
## 1850 5 5
## 1177 4 5
## 933 5 3
## 1056 4 3
## 67 5 5
table(sampled_data$affairs)
##
## 0 1 2 3 7 12
## 145 12 8 6 15 14
sampled_data$affairs <- ifelse(sampled_data$affairs > 0, 1, 0)
table(sampled_data$affairs)
##
## 0 1
## 145 55
set.seed(nim_terakhir)
split_index <- createDataPartition(sampled_data$affairs, p = 0.8, list = FALSE)
train_data <- sampled_data[split_index, ]
test_data <- sampled_data[-split_index, ]
train_data$affairs = as.factor(train_data$affairs)
test_data$affairs = as.factor(test_data$affairs)
table(train_data$affairs)
##
## 0 1
## 116 44
# Atur kontrol untuk validasi silang (misalnya, 10-fold cross-validation)
ctrl <- trainControl(method = "cv", number = 10)
# Latih model regresi logistik menggunakan validasi silang
modelLog <- train(affairs ~ ., data = train_data, method = "glm", family = binomial, trControl = ctrl)
# Tampilkan hasil
print(modelLog)
## Generalized Linear Model
##
## 160 samples
## 8 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 144, 144, 143, 144, 144, 144, ...
## Resampling results:
##
## Accuracy Kappa
## 0.7613725 0.3163236
summary(modelLog)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7982 -0.7808 -0.4826 0.8043 2.5538
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.65819 1.87357 0.885 0.376134
## gendermale 0.28310 0.50351 0.562 0.573936
## age -0.01832 0.03270 -0.560 0.575200
## yearsmarried 0.10097 0.06264 1.612 0.106991
## childrenyes 0.51807 0.61810 0.838 0.401939
## religiousness -0.58401 0.17657 -3.308 0.000941 ***
## education 0.04428 0.10577 0.419 0.675460
## occupation -0.07204 0.16475 -0.437 0.661917
## rating -0.55696 0.19414 -2.869 0.004120 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 188.21 on 159 degrees of freedom
## Residual deviance: 157.39 on 151 degrees of freedom
## AIC: 175.39
##
## Number of Fisher Scoring iterations: 5
predlog = predict(modelLog, test_data)
p_predlog = predict(modelLog,test_data,type="prob")
actuallog = factor(test_data$affairs)
cm_log <- confusionMatrix(actuallog, predlog, positive = "1")
cm_log
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 24 5
## 1 10 1
##
## Accuracy : 0.625
## 95% CI : (0.458, 0.7727)
## No Information Rate : 0.85
## P-Value [Acc > NIR] : 0.9999
##
## Kappa : -0.0949
##
## Mcnemar's Test P-Value : 0.3017
##
## Sensitivity : 0.16667
## Specificity : 0.70588
## Pos Pred Value : 0.09091
## Neg Pred Value : 0.82759
## Prevalence : 0.15000
## Detection Rate : 0.02500
## Detection Prevalence : 0.27500
## Balanced Accuracy : 0.43627
##
## 'Positive' Class : 1
##
roclog = roc(test_data$affairs, as.numeric(predlog))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roclog, main = "Kurva ROC untuk Regresi Logistik", col = "blue")
# Atur kontrol untuk validasi silang (misalnya, 10-fold cross-validation)
ctrl <- trainControl(method = "cv", number = 10)
# Latih model regresi logistik menggunakan validasi silang
modelknn <- train(affairs~., data = train_data, method = "knn", trControl = ctrl)
# Tampilkan hasil
print(modelknn)
## k-Nearest Neighbors
##
## 160 samples
## 8 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 144, 144, 145, 145, 144, 143, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.7612255 0.2410670
## 7 0.7549265 0.1914154
## 9 0.7369118 0.1588204
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 5.
predknn = predict(modelknn, test_data)
p_predknn = predict(modelknn,test_data,type="prob")
actualknn = factor(test_data$affairs)
cm_knn <- confusionMatrix(actualknn, predknn, positive = "1")
cm_knn
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 25 4
## 1 8 3
##
## Accuracy : 0.7
## 95% CI : (0.5347, 0.8344)
## No Information Rate : 0.825
## P-Value [Acc > NIR] : 0.9843
##
## Kappa : 0.1519
##
## Mcnemar's Test P-Value : 0.3865
##
## Sensitivity : 0.4286
## Specificity : 0.7576
## Pos Pred Value : 0.2727
## Neg Pred Value : 0.8621
## Prevalence : 0.1750
## Detection Rate : 0.0750
## Detection Prevalence : 0.2750
## Balanced Accuracy : 0.5931
##
## 'Positive' Class : 1
##
rocknn = roc(test_data$affairs, as.numeric(predknn))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(rocknn, main = "Kurva ROC untuk KNN", col = "red")
# Plot kurva ROC untuk Regresi Logistik (roclog)
plot(roclog, main = "Kurva ROC untuk Regresi Logistik", col = "blue")
# Tambahkan kurva ROC untuk KNN (rocknn) pada grafik yang sudah ada
lines(rocknn, col = "red")
log_coef <- coef(modelLog$finalModel)
odds_ratios_log <- exp(log_coef)
odds_ratios_log
## (Intercept) gendermale age yearsmarried childrenyes
## 5.2497745 1.3272432 0.9818428 1.1062415 1.6787850
## religiousness education occupation rating
## 0.5576576 1.0452777 0.9304956 0.5729486
predicted_probs_knn <- p_predknn[, "1"]
odds_ratios_knn <- predicted_probs_knn / (1 - predicted_probs_knn)
odds_ratios_knn
## [1] 0.2500000 1.0000000 0.0000000 1.0000000 0.6666667 0.0000000 0.0000000
## [8] 0.6666667 0.2500000 0.2500000 Inf 0.2500000 6.0000000 0.2500000
## [15] 1.5000000 0.2500000 0.0000000 0.2500000 0.2000000 0.6666667 0.2500000
## [22] 1.0000000 0.0000000 0.2500000 0.6666667 0.0000000 Inf 0.5000000
## [29] 1.5000000 0.0000000 0.2500000 1.5000000 0.2500000 0.6666667 0.2000000
## [36] 0.2500000 1.5000000 0.5000000 0.2500000 0.0000000
# Membuat dataframe untuk model logistik
df_log <- data.frame(
Model = "Logistic Regression",
Accuracy = cm_log$overall["Accuracy"],
Sensitivity = cm_log$byClass["Sensitivity"],
Specificity = cm_log$byClass["Specificity"],
Precision = cm_log$byClass["Pos Pred Value"],
F1_Score = cm_log$byClass["F1"]
)
# Membuat dataframe untuk model KNN
df_knn <- data.frame(
Model = "K-Nearest Neighbors",
Accuracy = cm_knn$overall["Accuracy"],
Sensitivity = cm_knn$byClass["Sensitivity"],
Specificity = cm_knn$byClass["Specificity"],
Precision = cm_knn$byClass["Pos Pred Value"],
F1_Score = cm_knn$byClass["F1"]
)
# Menggabungkan kedua dataframe
df_result <- rbind(df_log, df_knn)
# Menampilkan hasil
print(df_result)
## Model Accuracy Sensitivity Specificity Precision
## Accuracy Logistic Regression 0.625 0.1666667 0.7058824 0.09090909
## Accuracy1 K-Nearest Neighbors 0.700 0.4285714 0.7575758 0.27272727
## F1_Score
## Accuracy 0.1176471
## Accuracy1 0.3333333
Berdasarkan kurva ROC, regresi logistik (garis biru) berada di
atas kurva ROC KNN (garis merah). Hal ini menunjukkan bahwa regresi
logistik lebih baik dalam memprediksi kasus daripada KNN.
Berdasarkan nilai Accuracy, KNN memiliki akurasi yang lebih
tinggi daripada Regresi Logistik. Hal ini berarti bahwa regresi logistik
lebih baik dalam memprediksi data secara keseluruhan.