Logistic Regression in R

Load Packages

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.8     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.1
## ✔ readr   2.1.2     ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

Load Data

Data diperoleh melalui link https://online.stat.psu.edu/stat857/sites/onlinecourses.science.psu.edu.stat857/files/german_credit/index.csv dan untuk peubah selain duration, previous.credit, credit.amount, dan age, diganti menjadi faktor.

credit <- read.csv("https://online.stat.psu.edu/stat857/sites/onlinecourses.science.psu.edu.stat857/files/german_credit/index.csv")
credit <- credit %>% mutate(across(-c(Duration.of.Credit..month.,
                            Credit.Amount,
                            Age..years.),as.factor))
str(credit)

## 'data.frame':    1000 obs. of  21 variables:
##  $ Creditability                    : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Account.Balance                  : Factor w/ 4 levels "1","2","3","4": 1 1 2 1 1 1 1 1 4 2 ...
##  $ Duration.of.Credit..month.       : int  18 9 12 12 12 10 8 6 18 24 ...
##  $ Payment.Status.of.Previous.Credit: Factor w/ 5 levels "0","1","2","3",..: 5 5 3 5 5 5 5 5 5 3 ...
##  $ Purpose                          : Factor w/ 10 levels "0","1","2","3",..: 3 1 9 1 1 1 1 1 4 4 ...
##  $ Credit.Amount                    : int  1049 2799 841 2122 2171 2241 3398 1361 1098 3758 ...
##  $ Value.Savings.Stocks             : Factor w/ 5 levels "1","2","3","4",..: 1 1 2 1 1 1 1 1 1 3 ...
##  $ Length.of.current.employment     : Factor w/ 5 levels "1","2","3","4",..: 2 3 4 3 3 2 4 2 1 1 ...
##  $ Instalment.per.cent              : Factor w/ 4 levels "1","2","3","4": 4 2 2 3 4 1 1 2 4 1 ...
##  $ Sex...Marital.Status             : Factor w/ 4 levels "1","2","3","4": 2 3 2 3 3 3 3 3 2 2 ...
##  $ Guarantors                       : Factor w/ 3 levels "1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Duration.in.Current.address      : Factor w/ 4 levels "1","2","3","4": 4 2 4 2 4 3 4 4 4 4 ...
##  $ Most.valuable.available.asset    : Factor w/ 4 levels "1","2","3","4": 2 1 1 1 2 1 1 1 3 4 ...
##  $ Age..years.                      : int  21 36 23 39 38 48 39 40 65 23 ...
##  $ Concurrent.Credits               : Factor w/ 3 levels "1","2","3": 3 3 3 3 1 3 3 3 3 3 ...
##  $ Type.of.apartment                : Factor w/ 3 levels "1","2","3": 1 1 1 1 2 1 2 2 2 1 ...
##  $ No.of.Credits.at.this.Bank       : Factor w/ 4 levels "1","2","3","4": 1 2 1 2 2 2 2 1 2 1 ...
##  $ Occupation                       : Factor w/ 4 levels "1","2","3","4": 3 3 2 2 2 2 2 2 1 1 ...
##  $ No.of.dependents                 : Factor w/ 2 levels "1","2": 1 2 1 2 1 2 1 2 1 1 ...
##  $ Telephone                        : Factor w/ 2 levels "1","2": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Foreign.Worker                   : Factor w/ 2 levels "1","2": 1 1 1 2 2 2 2 2 1 1 ...

Modify Data

names(credit) <- c("creditability", "account.balance", "duration", "previous.credit", "purpose", "credit.amount", "saving.balance", "employment.year", "installment.rate", "marital.status", "guarantors", "duration.address", "valuable.asset", "age", "concurrent.credit", "apartment.type", "number.of.credit", "occupation", "dependents", "telephone", "foreign.worker")
credit2 <- credit[c("creditability", "account.balance", "duration", "credit.amount", "saving.balance", "employment.year", "installment.rate", "marital.status", "duration.address", "age","dependents", "number.of.credit", "occupation", "previous.credit")]

head(credit2, 10)

##    creditability account.balance duration credit.amount saving.balance
## 1              1               1       18          1049              1
## 2              1               1        9          2799              1
## 3              1               2       12           841              2
## 4              1               1       12          2122              1
## 5              1               1       12          2171              1
## 6              1               1       10          2241              1
## 7              1               1        8          3398              1
## 8              1               1        6          1361              1
## 9              1               4       18          1098              1
## 10             1               2       24          3758              3
##    employment.year installment.rate marital.status duration.address age
## 1                2                4              2                4  21
## 2                3                2              3                2  36
## 3                4                2              2                4  23
## 4                3                3              3                2  39
## 5                3                4              3                4  38
## 6                2                1              3                3  48
## 7                4                1              3                4  39
## 8                2                2              3                4  40
## 9                1                4              2                4  65
## 10               1                1              2                4  23
##    dependents number.of.credit occupation previous.credit
## 1           1                1          3               4
## 2           2                2          3               4
## 3           1                1          2               2
## 4           2                2          2               4
## 5           1                2          2               4
## 6           2                2          2               4
## 7           1                2          2               4
## 8           2                1          2               4
## 9           1                2          1               4
## 10          1                1          1               2

#glimpse(credit)
glimpse(credit2)

## Rows: 1,000
## Columns: 14
## $ creditability    <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ account.balance  <fct> 1, 1, 2, 1, 1, 1, 1, 1, 4, 2, 1, 1, 1, 2, 1, 1, 1, 2,…
## $ duration         <int> 18, 9, 12, 12, 12, 10, 8, 6, 18, 24, 11, 30, 6, 48, 1…
## $ credit.amount    <int> 1049, 2799, 841, 2122, 2171, 2241, 3398, 1361, 1098, …
## $ saving.balance   <fct> 1, 1, 2, 1, 1, 1, 1, 1, 1, 3, 1, 2, 1, 2, 5, 3, 1, 3,…
## $ employment.year  <fct> 2, 3, 4, 3, 3, 2, 4, 2, 1, 1, 3, 4, 4, 1, 4, 3, 3, 2,…
## $ installment.rate <fct> 4, 2, 2, 3, 4, 1, 1, 2, 4, 1, 2, 1, 1, 2, 2, 2, 1, 1,…
## $ marital.status   <fct> 2, 3, 2, 3, 3, 3, 3, 3, 2, 2, 3, 4, 2, 3, 4, 3, 3, 4,…
## $ duration.address <fct> 4, 2, 4, 2, 4, 3, 4, 4, 4, 4, 2, 4, 4, 4, 4, 3, 2, 3,…
## $ age              <int> 21, 36, 23, 39, 38, 48, 39, 40, 65, 23, 36, 24, 31, 3…
## $ dependents       <fct> 1, 2, 1, 2, 1, 2, 1, 2, 1, 1, 2, 1, 1, 1, 1, 2, 2, 1,…
## $ number.of.credit <fct> 1, 2, 1, 2, 2, 2, 2, 1, 2, 1, 2, 2, 1, 1, 2, 1, 2, 1,…
## $ occupation       <fct> 3, 3, 2, 2, 2, 2, 2, 2, 1, 1, 3, 3, 3, 4, 2, 3, 2, 3,…
## $ previous.credit  <fct> 4, 4, 2, 4, 4, 4, 4, 4, 4, 2, 4, 4, 4, 3, 2, 2, 4, 2,…

Identify Data

sum(is.na(credit2))

## [1] 0

sum(is.null(credit2))

## [1] 0

Didapatkan dari data, bahwa data tidak mengandung nilai Null atau NA.

Split Data

Dilakukan pemecahan data menjadi data training (untuk membentuk model) dan data testing (untuk menguji kebaikan model), dengan proporsi 0.8:0.2.

#index_split <- rsample::initial_split(data = credit,
#                                      prop = 0.8,
#                                      strata = "Creditability"
#                                      )
#train_credit <- rsample::training(index_split)
#test_credit <- rsample::testing(index_split)
set.seed(1); index_split <- rsample::initial_split(data = credit2,
                                      prop = 0.8,
                                      strata = "creditability"
                                      )
set.seed(1); train_credit <- rsample::training(index_split)
set.seed(1); test_credit <- rsample::testing(index_split)

Make Model

Model dibentuk dengan menggunakan data training dan diperoleh hasil sebagai berikut:

#logreg <- glm(Creditability~.,data=train_credit,family = "binomial")
logreg <- glm(creditability~.,data=train_credit,family = "binomial")
summary(logreg)

## 
## Call:
## glm(formula = creditability ~ ., family = "binomial", data = train_credit)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7625  -0.7261   0.3844   0.7330   2.0848  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        1.019e+00  1.085e+00   0.939 0.347589    
## account.balance2   4.588e-01  2.301e-01   1.994 0.046140 *  
## account.balance3   1.106e+00  4.205e-01   2.631 0.008503 ** 
## account.balance4   1.885e+00  2.547e-01   7.398 1.38e-13 ***
## duration          -3.725e-02  1.012e-02  -3.682 0.000231 ***
## credit.amount     -8.668e-05  4.666e-05  -1.858 0.063203 .  
## saving.balance2    1.131e-01  3.061e-01   0.369 0.711887    
## saving.balance3    2.984e-01  4.493e-01   0.664 0.506644    
## saving.balance4    1.151e+00  5.688e-01   2.023 0.043080 *  
## saving.balance5    9.259e-01  2.823e-01   3.279 0.001041 ** 
## employment.year2  -2.533e-01  4.706e-01  -0.538 0.590444    
## employment.year3   2.443e-01  4.506e-01   0.542 0.587666    
## employment.year4   9.819e-01  4.941e-01   1.987 0.046878 *  
## employment.year5   7.232e-02  4.529e-01   0.160 0.873141    
## installment.rate2 -2.003e-01  3.351e-01  -0.598 0.549961    
## installment.rate3 -5.243e-01  3.612e-01  -1.451 0.146670    
## installment.rate4 -9.081e-01  3.181e-01  -2.854 0.004311 ** 
## marital.status2    2.640e-01  4.233e-01   0.624 0.532941    
## marital.status3    1.006e+00  4.154e-01   2.421 0.015485 *  
## marital.status4    6.581e-01  4.889e-01   1.346 0.178255    
## duration.address2 -5.329e-01  3.223e-01  -1.653 0.098269 .  
## duration.address3 -4.214e-01  3.583e-01  -1.176 0.239644    
## duration.address4 -4.091e-01  3.161e-01  -1.294 0.195600    
## age                1.163e-02  9.676e-03   1.202 0.229257    
## dependents2       -4.099e-01  2.784e-01  -1.472 0.140942    
## number.of.credit2 -4.338e-01  2.596e-01  -1.671 0.094707 .  
## number.of.credit3 -1.276e-01  7.148e-01  -0.179 0.858275    
## number.of.credit4 -1.435e+00  1.314e+00  -1.092 0.274922    
## occupation2       -7.171e-01  7.757e-01  -0.924 0.355266    
## occupation3       -7.767e-01  7.498e-01  -1.036 0.300235    
## occupation4       -6.433e-01  7.535e-01  -0.854 0.393219    
## previous.credit1  -6.318e-01  6.329e-01  -0.998 0.318198    
## previous.credit2   4.083e-01  5.100e-01   0.801 0.423372    
## previous.credit3   6.230e-01  5.461e-01   1.141 0.253908    
## previous.credit4   1.396e+00  5.182e-01   2.693 0.007073 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 977.38  on 799  degrees of freedom
## Residual deviance: 738.25  on 765  degrees of freedom
## AIC: 808.25
## 
## Number of Fisher Scoring iterations: 5

Dari pemodelan di atas, diperoleh terdapat beberapa peubah penjelas yang tidak berpengaruh signifikan pada taraf nyata 5% dalam menjelaskan peubah respon (creditability), antara lain: credit.amount, duration.address, age, dependents, number.of.credits, dan occupation. Peubah-peubah ini bisa dibuang dan menyisakan peubah-peubah yang berpengaruh signifikan saja, atau ditetapkan dalam model. Untuk memudahkan dalam interpretasi, maka diputuskan membuang peubah-peubah penjelas dan dilakukan pembuatan model baru dengan peubah sisanya.

logreg2 <- glm(creditability~account.balance+duration+saving.balance+employment.year+
                 installment.rate+marital.status+previous.credit,data=train_credit,family = "binomial")
summary(logreg2)

## 
## Call:
## glm(formula = creditability ~ account.balance + duration + saving.balance + 
##     employment.year + installment.rate + marital.status + previous.credit, 
##     family = "binomial", data = train_credit)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7539  -0.7580   0.4054   0.7308   1.9463  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        0.123000   0.729883   0.169  0.86617    
## account.balance2   0.473731   0.225339   2.102  0.03553 *  
## account.balance3   1.307193   0.415105   3.149  0.00164 ** 
## account.balance4   1.826200   0.248846   7.339 2.16e-13 ***
## duration          -0.047689   0.007721  -6.177 6.55e-10 ***
## saving.balance2    0.013504   0.297222   0.045  0.96376    
## saving.balance3    0.427325   0.436213   0.980  0.32727    
## saving.balance4    1.006886   0.546101   1.844  0.06522 .  
## saving.balance5    0.900029   0.279078   3.225  0.00126 ** 
## employment.year2  -0.461522   0.393504  -1.173  0.24086    
## employment.year3  -0.091738   0.370940  -0.247  0.80467    
## employment.year4   0.652855   0.417226   1.565  0.11764    
## employment.year5  -0.177679   0.391261  -0.454  0.64974    
## installment.rate2 -0.157938   0.325849  -0.485  0.62789    
## installment.rate3 -0.354829   0.345974  -1.026  0.30508    
## installment.rate4 -0.647864   0.288858  -2.243  0.02491 *  
## marital.status2    0.194593   0.409111   0.476  0.63432    
## marital.status3    0.827485   0.402300   2.057  0.03970 *  
## marital.status4    0.621675   0.476010   1.306  0.19155    
## previous.credit1  -0.371396   0.596987  -0.622  0.53386    
## previous.credit2   0.654365   0.468821   1.396  0.16279    
## previous.credit3   0.576545   0.523052   1.102  0.27034    
## previous.credit4   1.356065   0.494272   2.744  0.00608 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 977.38  on 799  degrees of freedom
## Residual deviance: 753.75  on 777  degrees of freedom
## AIC: 799.75
## 
## Number of Fisher Scoring iterations: 5

Terdapat peubah installment.rate yang tidak signifikan dalam taraf nyata 5% dalam menjelaskan peubah respon (creditability), sehingga dengan langkah yang sama seperti sebelumnya diperoleh:

logreg3 <- glm(creditability~account.balance+duration+saving.balance+employment.year+
                 marital.status+previous.credit,data=train_credit,family = "binomial")
summary(logreg3)

## 
## Call:
## glm(formula = creditability ~ account.balance + duration + saving.balance + 
##     employment.year + marital.status + previous.credit, family = "binomial", 
##     data = train_credit)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6850  -0.8206   0.4175   0.7299   2.1200  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -0.026911   0.705283  -0.038 0.969563    
## account.balance2  0.503164   0.223632   2.250 0.024451 *  
## account.balance3  1.369116   0.410339   3.337 0.000848 ***
## account.balance4  1.807905   0.247061   7.318 2.52e-13 ***
## duration         -0.047628   0.007655  -6.221 4.93e-10 ***
## saving.balance2   0.028521   0.293679   0.097 0.922634    
## saving.balance3   0.449697   0.433146   1.038 0.299172    
## saving.balance4   0.959736   0.540175   1.777 0.075615 .  
## saving.balance5   0.865278   0.277577   3.117 0.001825 ** 
## employment.year2 -0.520819   0.391335  -1.331 0.183229    
## employment.year3 -0.110580   0.368976  -0.300 0.764411    
## employment.year4  0.617826   0.415049   1.489 0.136603    
## employment.year5 -0.279266   0.388180  -0.719 0.471879    
## marital.status2   0.113485   0.405668   0.280 0.779670    
## marital.status3   0.687437   0.396251   1.735 0.082767 .  
## marital.status4   0.488384   0.472553   1.034 0.301369    
## previous.credit1 -0.554916   0.591288  -0.938 0.347994    
## previous.credit2  0.537196   0.466636   1.151 0.249645    
## previous.credit3  0.502870   0.521507   0.964 0.334914    
## previous.credit4  1.250488   0.491779   2.543 0.010997 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 977.38  on 799  degrees of freedom
## Residual deviance: 761.21  on 780  degrees of freedom
## AIC: 801.21
## 
## Number of Fisher Scoring iterations: 5

logreg4 <- glm(creditability~account.balance+duration+saving.balance+previous.credit,
               data=train_credit,family = "binomial")
summary(logreg4)

## 
## Call:
## glm(formula = creditability ~ account.balance + duration + saving.balance + 
##     previous.credit, family = "binomial", data = train_credit)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7106  -0.8592   0.4480   0.7712   1.9555  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       0.110848   0.490485   0.226 0.821204    
## account.balance2  0.448306   0.216979   2.066 0.038817 *  
## account.balance3  1.185640   0.394718   3.004 0.002667 ** 
## account.balance4  1.804899   0.242043   7.457 8.86e-14 ***
## duration         -0.039967   0.007205  -5.547 2.90e-08 ***
## saving.balance2   0.063589   0.288907   0.220 0.825792    
## saving.balance3   0.478523   0.424153   1.128 0.259242    
## saving.balance4   0.850819   0.526442   1.616 0.106057    
## saving.balance5   0.899850   0.270871   3.322 0.000894 ***
## previous.credit1 -0.456384   0.576008  -0.792 0.428173    
## previous.credit2  0.552842   0.450511   1.227 0.219768    
## previous.credit3  0.593091   0.507433   1.169 0.242481    
## previous.credit4  1.312042   0.476098   2.756 0.005854 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 977.38  on 799  degrees of freedom
## Residual deviance: 788.29  on 787  degrees of freedom
## AIC: 814.29
## 
## Number of Fisher Scoring iterations: 5

Diperoleh peubah yang signifikan semuanya dalam menjelaskan creditability, yaitu account.balance, duration, saving.balance, dan previous.credit.

Uji Asumsi

Asumsi independensi terpenuhi jika peubah prediktor tidak saling berkorelasi (tidak terjadi multikolinearitas).

Plot Collinearity

plot(performance::check_collinearity(logreg4))

Diperoleh semua peubah memiliki nilai VIF < 5 (rendah), artinya tidak ada peubah yang saling berkorelasi. Asumsi independensi terpenuhi.

Check Outliers

Adanya pencilan akan mengakibatkan model tidak akurat. Sehingga keberadaan pencilan harus diuji.

performance::check_outliers(logreg4)

## OK: No outliers detected.

plot(performance::check_outliers(logreg4))

Diperoleh bahwa data tidak mengandung pencilan, atau tidak terdapat data yang letaknya di luar selang.

Identify Models

Display Parameters

performance::performance(logreg4) %>% 
  parameters::display()

AIC	BIC	Tjur’s R2	RMSE	Sigma	Log_loss	Score_log	Score_spherical	PCP
814.29	875.19	0.23	0.40	1.00	0.49	-Inf	1.25e-03	0.68

Diperoleh nilai RMSE yang cukup kecil, semakin kecil nilai RMSE (mendekati nol) diharapkan model yang dihasilkan semakin akurat.

Goodness of Fit

performance::performance_hosmer(logreg4)

## # Hosmer-Lemeshow Goodness-of-Fit Test
## 
##   Chi-squared: 9.337
##            df: 8    
##       p-value: 0.315

## Summary: model seems to fit well.

Uji GoF Hosmer Lemeshow digunakan untuk menguji kebaikan model. Hipotesis dalam pengujian ini adalah:

H0: Tidak ada perbedaan antara nilai pengamatan dan prediksi, model sudah baik. H1: Ada perbedaan antara nilai pengamatan dan prediksi.

Diperoleh nilai peluang lebih dari taraf nyata sebesar 5%, artinya model sudah baik dan layak digunakan untuk prediksi.

summary(logreg4)

## 
## Call:
## glm(formula = creditability ~ account.balance + duration + saving.balance + 
##     previous.credit, family = "binomial", data = train_credit)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7106  -0.8592   0.4480   0.7712   1.9555  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       0.110848   0.490485   0.226 0.821204    
## account.balance2  0.448306   0.216979   2.066 0.038817 *  
## account.balance3  1.185640   0.394718   3.004 0.002667 ** 
## account.balance4  1.804899   0.242043   7.457 8.86e-14 ***
## duration         -0.039967   0.007205  -5.547 2.90e-08 ***
## saving.balance2   0.063589   0.288907   0.220 0.825792    
## saving.balance3   0.478523   0.424153   1.128 0.259242    
## saving.balance4   0.850819   0.526442   1.616 0.106057    
## saving.balance5   0.899850   0.270871   3.322 0.000894 ***
## previous.credit1 -0.456384   0.576008  -0.792 0.428173    
## previous.credit2  0.552842   0.450511   1.227 0.219768    
## previous.credit3  0.593091   0.507433   1.169 0.242481    
## previous.credit4  1.312042   0.476098   2.756 0.005854 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 977.38  on 799  degrees of freedom
## Residual deviance: 788.29  on 787  degrees of freedom
## AIC: 814.29
## 
## Number of Fisher Scoring iterations: 5

Diperoleh model sebagai berikut:

\[P(Y_i≤1|X_i)=\frac{e^{-0.110848+0.448306(AB=2)+1.185640(AB=3)+1.804899(AB=4)-0.039967D+0.899850(SB=5)+1.312042(PC=4)}}{1+e^{-0.110848+0.448306(AB=2)+1.185640(AB=3)+1.804899(AB=4)-0.039967D+0.899850(SB=5)+1.312042(PC=4)}}\]

Make Model using Train Data

#null_model <- glm(Creditability~1,data=train_credit,family = "binomial")
null_model <- glm(creditability~1,data=train_credit,family = "binomial")
performance::test_lrt(null_model,logreg4)

## # Likelihood-Ratio-Test (LRT) for Model Comparison (ML-estimator)
## 
## Name       | Model | df | df_diff |   Chi2 |      p
## ---------------------------------------------------
## null_model |   glm |  1 |         |        |       
## logreg4    |   glm | 13 |      12 | 189.09 | < .001

Diperoleh nilai peluang kurang dari taraf nyata 5% pada pengujian Likelihood-Ratio Test, dimana:

H0: Tidak ada peningkatan signifikan antara model yang dibentuk dengan model keseluruhan H1: Ada peningkatan signifikan antara model yang dibentuk dengan model keseluruhan

dan diperoleh nilai peluang lebih kecil dari taraf nyata 5%, maka H0 ditolak.

Odds Ratio

mp_logreg4 <- parameters::parameters(logreg4,exponentiate = TRUE)
parameters::display(mp_logreg4)

Parameter	Odds Ratio	SE	95% CI	z	p
(Intercept)	1.12	0.55	(0.42, 2.91)	0.23	0.821
account balance (2)	1.57	0.34	(1.02, 2.40)	2.07	0.039
account balance (3)	3.27	1.29	(1.56, 7.43)	3.00	0.003
account balance (4)	6.08	1.47	(3.82, 9.87)	7.46	< .001
duration	0.96	6.92e-03	(0.95, 0.97)	-5.55	< .001
saving balance (2)	1.07	0.31	(0.61, 1.90)	0.22	0.826
saving balance (3)	1.61	0.68	(0.73, 3.92)	1.13	0.259
saving balance (4)	2.34	1.23	(0.90, 7.35)	1.62	0.106
saving balance (5)	2.46	0.67	(1.47, 4.25)	3.32	< .001
previous credit (1)	0.63	0.36	(0.20, 1.98)	-0.79	0.428
previous credit (2)	1.74	0.78	(0.73, 4.31)	1.23	0.220
previous credit (3)	1.81	0.92	(0.68, 5.00)	1.17	0.242
previous credit (4)	3.71	1.77	(1.48, 9.67)	2.76	0.006

# report::report(logreg4)

s <- summary(logreg4)
coeff.sig <- s$coefficients[,1][s$coefficients[,4] < 0.05]
odds <- round(exp(coeff.sig),2)
odds

## account.balance2 account.balance3 account.balance4         duration 
##             1.57             3.27             6.08             0.96 
##  saving.balance5 previous.credit4 
##             2.46             3.71

Diperoleh odds ratio atau rasio kecenderungan untuk masing-masing peubah penjelas yang signifikan sebagai berikut:

Untuk peubah account.balance (saldo akun), didapatkan nilai OR sebesar 1.57 untuk kategori 2 (Account Balance = 0 Deutsch Mark). Artinya, peminjam/nasabah memiliki kecenderungan untuk masuk dalam klasifikasi kredit buruk (1) sebesar 1.57 kali lebih besar jika memiliki saldo akun sebesar 0 DM dibandingkan dengan yang tidak memiliki akun. Kemudian, didapatkan nilai OR sebesar 3.27 untuk kategori 3 (Account Balance = antara 0 - 200 DM) yang artinya, nasabah memiliki kecenderungan untuk masuk dalam klasifikasi kredit buruk (1) sebesar 3.27 kali lebih besar jika memiliki saldo akun antara 0 hingga 200 DM. Dan terakhir, untuk peubah yang sama pada kategori 4 (Account Balance > 200 DM) memiliki nilai OR sebesar 6.08, artinya nasabah memiliki kecenderungan untuk masuk dalam klasifikasi kredit buruk (1) sebesar 6.08 kali lebih besar jika memiliki saldo akun di atas 200 DM.
Untuk peubah duration (durasi kredit), diperoleh nilai OR sebesar 0.96 yang artinya jika nasaban menambah durasi kredit sebanyak 1 tahun, maka ia cenderung untuk tergolong dalam klasifikasi kredit buruk sebesar 0.96 kali dibandingkan durasi sebelumnya. Hal ini mengindikasikan bahwa semakin lama durasi nasabah mengajukan kredit, kecenderungan ia tergolong dalam kredit buruk akan semakin menurun.
Untuk peubah saving.balance (saldo tabungan), didapatkan nilai OR sebesar 2.46 untuk kategori 5 (Saving Balance > 1000 DM). Artinya, peminjam/nasabah memiliki kecenderungan untuk masuk dalam klasifikasi kredit buruk (1) sebesar 2.46 kali lebih besar jika memiliki saldo tabungan > 1000 DM dibandingkan dengan yang saldonya tidak diketahui. Hal ini mengindikasikan bahwa tabungan yang besar justru menarik nasabah untuk meminjam.
Diperoleh nilai OR untuk peubah penjelas previous.credit dengan kategori 4 (Previous Credit = Critical) sebesar 3.71, artinya nasabah dengan status kredit sebelumnya kritis (tidak membayar) akan berpotensi 3.71 kali lebih besar daripada nasabah yang status kredit sebelumnya membayar tepat waktu untuk tergolong ke dalam status kredit buruk.
Anova Test

anova(logreg4,test = "Chisq") %>% 
  parameters::parameters() %>% 
  parameters::display()

Parameter	df	Deviance	df (error)	Deviance (error)	p
NULL			799	977.38
account.balance	3	109.93	796	867.46	< .001
duration	1	39.72	795	827.74	< .001
saving.balance	4	14.46	791	813.28	0.006
previous.credit	4	24.98	787	788.29	< .001

Anova Table (Type 1 tests)

Diperoleh nilai peluang yang signifikan untuk semua peubah penjelas dalam model, artinya peubah-peubah ini secara signifikan mampu menjelaskan status kredit nasabah.

Predict

qplot(predict(logreg4,type = "link"))

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#qplot(train_credit$Credit.Amount,predict(logreg,type = "link"))
#qplot(train_credit$Credit.Amount,predict(logreg,type = "response"))

qplot(train_credit$credit.amount,predict(logreg4,type = "link"))

qplot(train_credit$credit.amount,predict(logreg4,type = "response"))

Test Data

prob_test <- predict(logreg4,type = "response",newdata = test_credit)
pred_class1_test <- as.factor(ifelse(prob_test<=0.5,0,1))
pred_class2_test <- as.factor(ifelse(prob_test<=0.55,0,1))
pred_class3_test <- as.factor(ifelse(prob_test<=0.4,0,1))

Confusion Matrix

#caret::confusionMatrix(pred_class1_test,test_credit$Creditability)
#caret::confusionMatrix(pred_class2_test,test_credit$Creditability)
#caret::confusionMatrix(pred_class3_test,test_credit$Creditability)

caret::confusionMatrix(pred_class1_test,test_credit$creditability)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0  19  12
##          1  41 128
##                                           
##                Accuracy : 0.735           
##                  95% CI : (0.6681, 0.7948)
##     No Information Rate : 0.7             
##     P-Value [Acc > NIR] : 0.15789         
##                                           
##                   Kappa : 0.268           
##                                           
##  Mcnemar's Test P-Value : 0.00012         
##                                           
##             Sensitivity : 0.3167          
##             Specificity : 0.9143          
##          Pos Pred Value : 0.6129          
##          Neg Pred Value : 0.7574          
##              Prevalence : 0.3000          
##          Detection Rate : 0.0950          
##    Detection Prevalence : 0.1550          
##       Balanced Accuracy : 0.6155          
##                                           
##        'Positive' Class : 0               
##

caret::confusionMatrix(pred_class2_test,test_credit$creditability)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0  25  22
##          1  35 118
##                                           
##                Accuracy : 0.715           
##                  95% CI : (0.6471, 0.7764)
##     No Information Rate : 0.7             
##     P-Value [Acc > NIR] : 0.3532          
##                                           
##                   Kappa : 0.2766          
##                                           
##  Mcnemar's Test P-Value : 0.1120          
##                                           
##             Sensitivity : 0.4167          
##             Specificity : 0.8429          
##          Pos Pred Value : 0.5319          
##          Neg Pred Value : 0.7712          
##              Prevalence : 0.3000          
##          Detection Rate : 0.1250          
##    Detection Prevalence : 0.2350          
##       Balanced Accuracy : 0.6298          
##                                           
##        'Positive' Class : 0               
##

caret::confusionMatrix(pred_class3_test,test_credit$creditability)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0   8   7
##          1  52 133
##                                           
##                Accuracy : 0.705           
##                  95% CI : (0.6366, 0.7672)
##     No Information Rate : 0.7             
##     P-Value [Acc > NIR] : 0.4733          
##                                           
##                   Kappa : 0.1061          
##                                           
##  Mcnemar's Test P-Value : 1.014e-08       
##                                           
##             Sensitivity : 0.1333          
##             Specificity : 0.9500          
##          Pos Pred Value : 0.5333          
##          Neg Pred Value : 0.7189          
##              Prevalence : 0.3000          
##          Detection Rate : 0.0400          
##    Detection Prevalence : 0.0750          
##       Balanced Accuracy : 0.5417          
##                                           
##        'Positive' Class : 0               
##

#roc_empirical_test <- ROCit::rocit(prob_test,test_credit$Creditability,negref = 0)
roc_empirical_test <- ROCit::rocit(prob_test,test_credit$creditability,negref = 0)
get_output_test <- plot(roc_empirical_test)

Prediction

get_output_test$`optimal Youden Index point`

##     value       FPR       TPR    cutoff 
## 0.4261905 0.2166667 0.6428571 0.7371052

pred_class4_test <- as.factor(ifelse(prob_test<=get_output_test$`optimal Youden Index point`["cutoff"],0,1))
#caret::confusionMatrix(pred_class1_test,test_credit$Creditability)
#caret::confusionMatrix(pred_class4_test,test_credit$Creditability)

caret::confusionMatrix(pred_class1_test,test_credit$creditability)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0  19  12
##          1  41 128
##                                           
##                Accuracy : 0.735           
##                  95% CI : (0.6681, 0.7948)
##     No Information Rate : 0.7             
##     P-Value [Acc > NIR] : 0.15789         
##                                           
##                   Kappa : 0.268           
##                                           
##  Mcnemar's Test P-Value : 0.00012         
##                                           
##             Sensitivity : 0.3167          
##             Specificity : 0.9143          
##          Pos Pred Value : 0.6129          
##          Neg Pred Value : 0.7574          
##              Prevalence : 0.3000          
##          Detection Rate : 0.0950          
##    Detection Prevalence : 0.1550          
##       Balanced Accuracy : 0.6155          
##                                           
##        'Positive' Class : 0               
##

caret::confusionMatrix(pred_class4_test,test_credit$creditability)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 47 51
##          1 13 89
##                                          
##                Accuracy : 0.68           
##                  95% CI : (0.6105, 0.744)
##     No Information Rate : 0.7            
##     P-Value [Acc > NIR] : 0.7579         
##                                          
##                   Kappa : 0.3548         
##                                          
##  Mcnemar's Test P-Value : 3.746e-06      
##                                          
##             Sensitivity : 0.7833         
##             Specificity : 0.6357         
##          Pos Pred Value : 0.4796         
##          Neg Pred Value : 0.8725         
##              Prevalence : 0.3000         
##          Detection Rate : 0.2350         
##    Detection Prevalence : 0.4900         
##       Balanced Accuracy : 0.7095         
##                                          
##        'Positive' Class : 0              
##

prob <- predict(logreg4,type = "response")
pred_class1 <- as.factor(ifelse(prob<=0.5,0,1))
pred_class2 <- as.factor(ifelse(prob<=0.55,0,1))
pred_class3 <- as.factor(ifelse(prob<=0.4,0,1))

#caret::confusionMatrix(pred_class1,train_credit$Creditability)
#caret::confusionMatrix(pred_class2,train_credit$Creditability)
#caret::confusionMatrix(pred_class3,train_credit$Creditability)

caret::confusionMatrix(pred_class1,train_credit$creditability)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 113  55
##          1 127 505
##                                           
##                Accuracy : 0.7725          
##                  95% CI : (0.7418, 0.8011)
##     No Information Rate : 0.7             
##     P-Value [Acc > NIR] : 2.700e-06       
##                                           
##                   Kappa : 0.4076          
##                                           
##  Mcnemar's Test P-Value : 1.418e-07       
##                                           
##             Sensitivity : 0.4708          
##             Specificity : 0.9018          
##          Pos Pred Value : 0.6726          
##          Neg Pred Value : 0.7991          
##              Prevalence : 0.3000          
##          Detection Rate : 0.1412          
##    Detection Prevalence : 0.2100          
##       Balanced Accuracy : 0.6863          
##                                           
##        'Positive' Class : 0               
##

caret::confusionMatrix(pred_class2,train_credit$creditability)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 137  87
##          1 103 473
##                                           
##                Accuracy : 0.7625          
##                  95% CI : (0.7314, 0.7916)
##     No Information Rate : 0.7             
##     P-Value [Acc > NIR] : 4.831e-05       
##                                           
##                   Kappa : 0.4235          
##                                           
##  Mcnemar's Test P-Value : 0.2765          
##                                           
##             Sensitivity : 0.5708          
##             Specificity : 0.8446          
##          Pos Pred Value : 0.6116          
##          Neg Pred Value : 0.8212          
##              Prevalence : 0.3000          
##          Detection Rate : 0.1713          
##    Detection Prevalence : 0.2800          
##       Balanced Accuracy : 0.7077          
##                                           
##        'Positive' Class : 0               
##

caret::confusionMatrix(pred_class3,train_credit$creditability)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0  64  22
##          1 176 538
##                                           
##                Accuracy : 0.7525          
##                  95% CI : (0.7211, 0.7821)
##     No Information Rate : 0.7             
##     P-Value [Acc > NIR] : 0.0005669       
##                                           
##                   Kappa : 0.2784          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.2667          
##             Specificity : 0.9607          
##          Pos Pred Value : 0.7442          
##          Neg Pred Value : 0.7535          
##              Prevalence : 0.3000          
##          Detection Rate : 0.0800          
##    Detection Prevalence : 0.1075          
##       Balanced Accuracy : 0.6137          
##                                           
##        'Positive' Class : 0               
##

#roc_empirical <- ROCit::rocit(prob,train_credit$Creditability,negref = 0)
roc_empirical <- ROCit::rocit(prob,train_credit$creditability,negref = 0)
get_output <- plot(roc_empirical)

get_output$`optimal Youden Index point`

##     value       FPR       TPR    cutoff 
## 0.4613095 0.2833333 0.7446429 0.6735371

pred_class4 <- as.factor(ifelse(prob<=get_output$`optimal Youden Index point`["cutoff"],0,1))
#caret::confusionMatrix(pred_class1,train_credit$Creditability)
#caret::confusionMatrix(pred_class4,train_credit$Creditability)

caret::confusionMatrix(pred_class1,train_credit$creditability)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 113  55
##          1 127 505
##                                           
##                Accuracy : 0.7725          
##                  95% CI : (0.7418, 0.8011)
##     No Information Rate : 0.7             
##     P-Value [Acc > NIR] : 2.700e-06       
##                                           
##                   Kappa : 0.4076          
##                                           
##  Mcnemar's Test P-Value : 1.418e-07       
##                                           
##             Sensitivity : 0.4708          
##             Specificity : 0.9018          
##          Pos Pred Value : 0.6726          
##          Neg Pred Value : 0.7991          
##              Prevalence : 0.3000          
##          Detection Rate : 0.1412          
##    Detection Prevalence : 0.2100          
##       Balanced Accuracy : 0.6863          
##                                           
##        'Positive' Class : 0               
##

caret::confusionMatrix(pred_class4,train_credit$creditability)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 172 144
##          1  68 416
##                                          
##                Accuracy : 0.735          
##                  95% CI : (0.703, 0.7653)
##     No Information Rate : 0.7            
##     P-Value [Acc > NIR] : 0.01613        
##                                          
##                   Kappa : 0.4214         
##                                          
##  Mcnemar's Test P-Value : 2.591e-07      
##                                          
##             Sensitivity : 0.7167         
##             Specificity : 0.7429         
##          Pos Pred Value : 0.5443         
##          Neg Pred Value : 0.8595         
##              Prevalence : 0.3000         
##          Detection Rate : 0.2150         
##    Detection Prevalence : 0.3950         
##       Balanced Accuracy : 0.7298         
##                                          
##        'Positive' Class : 0              
##

Diperoleh nilai akurasi tertinggi sebesar 77%.