3. Asumsi dan Pemodelan
3.1 LDA Versi 1 (Setelah Seleksi Fitur)
library(MASS)
## Warning: package 'MASS' was built under R version 4.4.3
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
data <- read.csv("C:/Users/USER/Downloads/Tahapan Pemaparan Progres Kemajuan Project_Kelompok 4/project_anmul/data_proses/data_scaled_reduce.csv")
target_col <- "dlq_2yrs"
predictor_cols <- setdiff(names(data), target_col)
# Uji Normalitas
for (col in predictor_cols) {
x <- data[[col]]
if (length(x) > 5000) {
result <- shapiro.test(sample(x, 5000))
} else {
result <- shapiro.test(x)
}
cat(sprintf("Fitur: %-25s | p-value: %.5f %s\n", col, result$p.value,
ifelse(result$p.value < 0.05, "Tidak normal", "Normal")))
}
## Fitur: age | p-value: 0.00000 Tidak normal
## Fitur: late_30_59 | p-value: 0.00000 Tidak normal
## Fitur: monthly_inc | p-value: 0.00000 Tidak normal
## Fitur: open_credit | p-value: 0.00000 Tidak normal
## Fitur: late_90 | p-value: 0.00000 Tidak normal
## Fitur: late_60_89 | p-value: 0.00000 Tidak normal
## Fitur: dependents | p-value: 0.00000 Tidak normal
# Homogenitas
group0 <- data[data[[target_col]] == 0, predictor_cols]
group1 <- data[data[[target_col]] == 1, predictor_cols]
det0 <- det(cov(group0))
det1 <- det(cov(group1))
cat("Determinant Kovarian Kelas 0:", format(det0, scientific = TRUE), "\n")
## Determinant Kovarian Kelas 0: 2.259597e-04
cat("Determinant Kovarian Kelas 1:", format(det1, scientific = TRUE), "\n")
## Determinant Kovarian Kelas 1: 2.601147e-01
# VIF
vif_manual <- function(data, features) {
vifs <- c()
for (f in features) {
model <- lm(as.formula(paste(f, "~", paste(setdiff(features, f), collapse="+"))), data=data)
r2 <- summary(model)$r.squared
vifs[f] <- ifelse(r2 == 1, Inf, 1 / (1 - r2))
}
return(vifs)
}
print(round(vif_manual(data, predictor_cols), 3))
## age late_30_59 monthly_inc open_credit late_90 late_60_89
## 1.120 3.430 1.143 1.202 4.058 5.231
## dependents
## 1.068
# Model LDA
lda_model <- lda(as.formula(paste(target_col, "~", paste(predictor_cols, collapse = "+"))), data = data)
pred <- predict(lda_model, data)$class
conf_mat <- table(Predicted = pred, Actual = data[[target_col]])
print(conf_mat)
## Actual
## Predicted 0 1
## 0 5719 3180
## 1 2638 5177
cat(sprintf("Akurasi: %.2f%%\n", 100 * sum(diag(conf_mat)) / sum(conf_mat)))
## Akurasi: 65.19%
print(lda_model$scaling)
## LD1
## age -0.50999435
## late_30_59 0.98669238
## monthly_inc -0.28561065
## open_credit 0.08805892
## late_90 0.56389098
## late_60_89 -0.86787275
## dependents 0.18523582
3.2 Regresi Logistik Versi 1
data <- read.csv("C:/Users/USER/Downloads/Tahapan Pemaparan Progres Kemajuan Project_Kelompok 4/project_anmul/data_proses/data_scaled_reduce.csv")
target_col <- "dlq_2yrs"
predictor_cols <- setdiff(names(data), target_col)
formula <- as.formula(paste(target_col, "~", paste(predictor_cols, collapse = "+")))
model <- glm(formula, data = data, family = binomial)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary_model <- summary(model)
print(summary_model)
##
## Call:
## glm(formula = formula, family = binomial, data = data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.56908 0.02685 21.193 < 2e-16 ***
## age -0.36787 0.01929 -19.074 < 2e-16 ***
## late_30_59 1.35100 0.04883 27.667 < 2e-16 ***
## monthly_inc -0.17882 0.01938 -9.228 < 2e-16 ***
## open_credit 0.14399 0.01939 7.425 1.13e-13 ***
## late_90 2.02401 0.08603 23.528 < 2e-16 ***
## late_60_89 1.43502 0.09316 15.405 < 2e-16 ***
## dependents 0.08613 0.01843 4.675 2.95e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 23171 on 16713 degrees of freedom
## Residual deviance: 18120 on 16706 degrees of freedom
## AIC: 18136
##
## Number of Fisher Scoring iterations: 8
# Uji Signifikansi
print(anova(model, test = "Chisq"))
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: dlq_2yrs
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 16713 23171
## age 1 758.22 16712 22412 < 2.2e-16 ***
## late_30_59 1 2348.81 16711 20064 < 2.2e-16 ***
## monthly_inc 1 119.59 16710 19944 < 2.2e-16 ***
## open_credit 1 0.10 16709 19944 0.7479
## late_90 1 1359.41 16708 18584 < 2.2e-16 ***
## late_60_89 1 445.83 16707 18139 < 2.2e-16 ***
## dependents 1 18.51 16706 18120 1.692e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
coeff <- summary_model$coefficients
print(data.frame(
Variable = rownames(coeff),
Estimate = coeff[, "Estimate"],
Std_Error = coeff[, "Std. Error"],
z_value = coeff[, "z value"],
p_value = coeff[, "Pr(>|z|)"],
Significance = ifelse(coeff[, "Pr(>|z|)"] < 0.05, "Signifikan", "Tidak Signifikan")
))
## Variable Estimate Std_Error z_value p_value
## (Intercept) (Intercept) 0.56908027 0.02685218 21.193079 1.106228e-99
## age age -0.36786971 0.01928601 -19.074430 4.118873e-81
## late_30_59 late_30_59 1.35100262 0.04883063 27.667115 1.737301e-168
## monthly_inc monthly_inc -0.17881915 0.01937757 -9.228150 2.753472e-20
## open_credit open_credit 0.14399411 0.01939380 7.424750 1.129927e-13
## late_90 late_90 2.02401039 0.08602506 23.528149 2.101572e-122
## late_60_89 late_60_89 1.43501862 0.09315531 15.404583 1.524698e-53
## dependents dependents 0.08613073 0.01842539 4.674568 2.945731e-06
## Significance
## (Intercept) Signifikan
## age Signifikan
## late_30_59 Signifikan
## monthly_inc Signifikan
## open_credit Signifikan
## late_90 Signifikan
## late_60_89 Signifikan
## dependents Signifikan
# Evaluasi
prob <- predict(model, type = "response")
pred <- ifelse(prob >= 0.5, 1, 0)
conf_matrix <- table(Predicted = pred, Actual = data[[target_col]])
print(conf_matrix)
## Actual
## Predicted 0 1
## 0 7225 3247
## 1 1132 5110
cat(sprintf("Akurasi: %.2f%%\n", 100 * sum(diag(conf_matrix)) / sum(conf_matrix)))
## Akurasi: 73.80%
print(round(exp(coeff[, "Estimate"]), 3))
## (Intercept) age late_30_59 monthly_inc open_credit late_90
## 1.767 0.692 3.861 0.836 1.155 7.569
## late_60_89 dependents
## 4.200 1.090
3.3 LDA Versi 2 (Tanpa Seleksi Fitur)
data <- read.csv("C:/Users/USER/Downloads/Tahapan Pemaparan Progres Kemajuan Project_Kelompok 4/project_anmul/data_proses/data_normalize.csv")
target_col <- "dlq_2yrs"
predictor_cols <- setdiff(names(data), target_col)
# Uji Asumsi Normalitas, Homogenitas, VIF
for (col in predictor_cols) {
x <- data[[col]]
result <- shapiro.test(if (length(x) > 5000) sample(x, 5000) else x)
cat(sprintf("Fitur: %-25s | p-value: %.5f\n", col, result$p.value))
}
## Fitur: rev_util | p-value: 0.00000
## Fitur: age | p-value: 0.00000
## Fitur: late_30_59 | p-value: 0.00000
## Fitur: debt_ratio | p-value: 0.00000
## Fitur: monthly_inc | p-value: 0.00000
## Fitur: open_credit | p-value: 0.00000
## Fitur: late_90 | p-value: 0.00000
## Fitur: real_estate | p-value: 0.00000
## Fitur: late_60_89 | p-value: 0.00000
## Fitur: dependents | p-value: 0.00000
group0 <- data[data[[target_col]] == 0, predictor_cols]
group1 <- data[data[[target_col]] == 1, predictor_cols]
cat("Det Kelas 0:", format(det(cov(group0)), scientific = TRUE), "\n")
## Det Kelas 0: 1.493817e-04
cat("Det Kelas 1:", format(det(cov(group1)), scientific = TRUE), "\n")
## Det Kelas 1: 1.725732e-01
print(round(vif_manual(data, predictor_cols), 3))
## rev_util age late_30_59 debt_ratio monthly_inc open_credit
## 1.001 1.120 3.431 1.029 1.266 1.433
## late_90 real_estate late_60_89 dependents
## 4.060 1.433 5.231 1.074
lda_model <- lda(as.formula(paste(target_col, "~", paste(predictor_cols, collapse = "+"))), data = data)
pred <- predict(lda_model, data)$class
conf_mat <- table(Predicted = pred, Actual = data[[target_col]])
print(conf_mat)
## Actual
## Predicted 0 1
## 0 5672 3127
## 1 2685 5230
cat(sprintf("Akurasi: %.2f%%\n", 100 * sum(diag(conf_mat)) / sum(conf_mat)))
## Akurasi: 65.23%
print(lda_model$scaling)
## LD1
## rev_util 0.02495683
## age -0.50661321
## late_30_59 0.97634006
## debt_ratio -0.10994825
## monthly_inc -0.32841326
## open_credit 0.05154739
## late_90 0.56354558
## real_estate 0.10755077
## late_60_89 -0.86326511
## dependents 0.18256577
3.4 Regresi Logistik Versi 2
data <- read.csv("C:/Users/USER/Downloads/Tahapan Pemaparan Progres Kemajuan Project_Kelompok 4/project_anmul/data_proses/data_normalize.csv")
target_col <- "dlq_2yrs"
predictor_cols <- setdiff(names(data), target_col)
formula <- as.formula(paste(target_col, "~", paste(predictor_cols, collapse = "+")))
model <- glm(formula, data = data, family = binomial)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary_model <- summary(model)
print(summary_model)
##
## Call:
## glm(formula = formula, family = binomial, data = data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.57057 0.02692 21.193 < 2e-16 ***
## rev_util 0.01878 0.01715 1.095 0.274
## age -0.37000 0.01935 -19.118 < 2e-16 ***
## late_30_59 1.35095 0.04893 27.607 < 2e-16 ***
## debt_ratio -0.07932 0.01857 -4.271 1.94e-05 ***
## monthly_inc -0.22436 0.02068 -10.847 < 2e-16 ***
## open_credit 0.10296 0.02116 4.866 1.14e-06 ***
## late_90 2.03855 0.08630 23.622 < 2e-16 ***
## real_estate 0.11820 0.02133 5.542 3.00e-08 ***
## late_60_89 1.44092 0.09339 15.430 < 2e-16 ***
## dependents 0.08364 0.01851 4.519 6.20e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 23171 on 16713 degrees of freedom
## Residual deviance: 18069 on 16703 degrees of freedom
## AIC: 18091
##
## Number of Fisher Scoring iterations: 8
print(anova(model, test = "Chisq"))
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: dlq_2yrs
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 16713 23171
## rev_util 1 2.35 16712 23168 0.125262
## age 1 757.37 16711 22411 < 2.2e-16 ***
## late_30_59 1 2348.51 16710 20062 < 2.2e-16 ***
## debt_ratio 1 8.14 16709 20054 0.004331 **
## monthly_inc 1 131.25 16708 19923 < 2.2e-16 ***
## open_credit 1 0.00 16707 19923 0.967931
## late_90 1 1355.13 16706 18568 < 2.2e-16 ***
## real_estate 1 34.18 16705 18534 5.030e-09 ***
## late_60_89 1 447.81 16704 18086 < 2.2e-16 ***
## dependents 1 16.98 16703 18069 3.786e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
coeff <- summary_model$coefficients
print(data.frame(
Variable = rownames(coeff),
Estimate = coeff[, "Estimate"],
Std_Error = coeff[, "Std. Error"],
z_value = coeff[, "z value"],
p_value = coeff[, "Pr(>|z|)"],
Significance = ifelse(coeff[, "Pr(>|z|)"] < 0.05, "Signifikan", "Tidak Signifikan")
))
## Variable Estimate Std_Error z_value p_value
## (Intercept) (Intercept) 0.57056668 0.02692221 21.193157 1.104385e-99
## rev_util rev_util 0.01877627 0.01714992 1.094831 2.735906e-01
## age age -0.36999778 0.01935293 -19.118435 1.773479e-81
## late_30_59 late_30_59 1.35094953 0.04893425 27.607443 9.058082e-168
## debt_ratio debt_ratio -0.07932053 0.01857050 -4.271319 1.943201e-05
## monthly_inc monthly_inc -0.22436297 0.02068468 -10.846821 2.064833e-27
## open_credit open_credit 0.10296084 0.02115979 4.865873 1.139527e-06
## late_90 late_90 2.03855432 0.08630029 23.621638 2.310186e-123
## real_estate real_estate 0.11819660 0.02132895 5.541605 2.997126e-08
## late_60_89 late_60_89 1.44091672 0.09338559 15.429754 1.032635e-53
## dependents dependents 0.08363916 0.01850708 4.519306 6.204265e-06
## Significance
## (Intercept) Signifikan
## rev_util Tidak Signifikan
## age Signifikan
## late_30_59 Signifikan
## debt_ratio Signifikan
## monthly_inc Signifikan
## open_credit Signifikan
## late_90 Signifikan
## real_estate Signifikan
## late_60_89 Signifikan
## dependents Signifikan
prob <- predict(model, type = "response")
pred <- ifelse(prob >= 0.5, 1, 0)
conf_matrix <- table(Predicted = pred, Actual = data[[target_col]])
print(conf_matrix)
## Actual
## Predicted 0 1
## 0 7219 3224
## 1 1138 5133
cat(sprintf("Akurasi: %.2f%%\n", 100 * sum(diag(conf_matrix)) / sum(conf_matrix)))
## Akurasi: 73.90%
print(round(exp(coeff[, "Estimate"]), 3))
## (Intercept) rev_util age late_30_59 debt_ratio monthly_inc
## 1.769 1.019 0.691 3.861 0.924 0.799
## open_credit late_90 real_estate late_60_89 dependents
## 1.108 7.679 1.125 4.225 1.087