Regresi Logistik

Author

Alfa Nugraha

Persiapan data

germancredit <- read.csv(file="https://github.com/alfanugraha/tsa-handson/raw/master/german_credit.csv", 
                header = T, 
                sep = ",")
str(germancredit)
'data.frame':   1000 obs. of  21 variables:
 $ creditability          : int  1 1 1 1 1 1 1 1 1 1 ...
 $ account_balance        : int  1 1 2 1 1 1 1 1 4 2 ...
 $ duration               : int  18 9 12 12 12 10 8 6 18 24 ...
 $ payment_status         : int  4 4 2 4 4 4 4 4 4 2 ...
 $ purpose                : int  2 0 9 0 0 0 0 0 3 3 ...
 $ credit_amount          : int  1049 2799 841 2122 2171 2241 3398 1361 1098 3758 ...
 $ stocks                 : int  1 1 2 1 1 1 1 1 1 3 ...
 $ length_employment      : int  2 3 4 3 3 2 4 2 1 1 ...
 $ instalment_percent     : int  4 2 2 3 4 1 1 2 4 1 ...
 $ sex_marital_status     : int  2 3 2 3 3 3 3 3 2 2 ...
 $ guarantors             : int  1 1 1 1 1 1 1 1 1 1 ...
 $ duration_currentaddress: int  4 2 4 2 4 3 4 4 4 4 ...
 $ most_available_asset   : int  2 1 1 1 2 1 1 1 3 4 ...
 $ age                    : int  21 36 23 39 38 48 39 40 65 23 ...
 $ concurrent_credits     : int  3 3 3 3 1 3 3 3 3 3 ...
 $ type_apartment         : int  1 1 1 1 2 1 2 2 2 1 ...
 $ no_credits             : int  1 2 1 2 2 2 2 1 2 1 ...
 $ cccupation             : int  3 3 2 2 2 2 2 2 1 1 ...
 $ no_dependents          : int  1 2 1 2 1 2 1 2 1 1 ...
 $ telephone              : int  1 1 1 1 1 1 1 1 1 1 ...
 $ foreign_worker         : int  1 1 1 2 2 2 2 2 1 1 ...

Tentang gugus data German Credit.

  • Account Balance: No account (1), None (No balance) (2), Some Balance (3)
  • Payment Status: Some Problems (1), Paid Up (2), No Problems (in this bank) (3)
  • Savings/Stock Value: None, Below 100 DM, [100, 1000] DM, Above 1000 DM
  • Employment Length: Below 1 year (including unemployed), [1, 4], [4, 7], Above 7
  • Sex/Marital Status: Male Divorced/Single, Male Married/Widowed, Female
  • No of Credits at this bank: 1, More than 1
  • Guarantor: None, Yes
  • Concurrent Credits: Other Banks or Dept Stores, None
  • ForeignWorker variable may be dropped from the study
  • Purpose of Credit: New car, Used car, Home Related, Other
kable(head(germancredit))
creditability account_balance duration payment_status purpose credit_amount stocks length_employment instalment_percent sex_marital_status guarantors duration_currentaddress most_available_asset age concurrent_credits type_apartment no_credits cccupation no_dependents telephone foreign_worker
1 1 18 4 2 1049 1 2 4 2 1 4 2 21 3 1 1 3 1 1 1
1 1 9 4 0 2799 1 3 2 3 1 2 1 36 3 1 2 3 2 1 1
1 2 12 2 9 841 2 4 2 2 1 4 1 23 3 1 1 2 1 1 1
1 1 12 4 0 2122 1 3 3 3 1 2 1 39 3 1 2 2 2 1 2
1 1 12 4 0 2171 1 3 4 3 1 4 2 38 1 2 2 2 1 1 2
1 1 10 4 0 2241 1 2 1 3 1 3 1 48 3 1 2 2 2 1 2
skimr::skim(germancredit)
Data summary
Name germancredit
Number of rows 1000
Number of columns 21
_______________________
Column type frequency:
numeric 21
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
creditability 0 1 0.70 0.46 0 0.0 1.0 1.00 1 ▃▁▁▁▇
account_balance 0 1 2.58 1.26 1 1.0 2.0 4.00 4 ▆▆▁▁▇
duration 0 1 20.90 12.06 4 12.0 18.0 24.00 72 ▇▇▂▁▁
payment_status 0 1 2.54 1.08 0 2.0 2.0 4.00 4 ▁▁▇▁▅
purpose 0 1 2.83 2.74 0 1.0 2.0 3.00 10 ▇▅▁▁▂
credit_amount 0 1 3271.25 2822.75 250 1365.5 2319.5 3972.25 18424 ▇▂▁▁▁
stocks 0 1 2.10 1.58 1 1.0 1.0 3.00 5 ▇▂▁▁▂
length_employment 0 1 3.38 1.21 1 3.0 3.0 5.00 5 ▂▅▇▅▆
instalment_percent 0 1 2.97 1.12 1 2.0 3.0 4.00 4 ▂▃▁▂▇
sex_marital_status 0 1 2.68 0.71 1 2.0 3.0 3.00 4 ▁▅▁▇▂
guarantors 0 1 1.15 0.48 1 1.0 1.0 1.00 3 ▇▁▁▁▁
duration_currentaddress 0 1 2.85 1.10 1 2.0 3.0 4.00 4 ▂▆▁▃▇
most_available_asset 0 1 2.36 1.05 1 1.0 2.0 3.00 4 ▇▆▁▇▃
age 0 1 35.54 11.35 19 27.0 33.0 42.00 75 ▇▆▃▁▁
concurrent_credits 0 1 2.67 0.71 1 3.0 3.0 3.00 3 ▂▁▁▁▇
type_apartment 0 1 1.93 0.53 1 2.0 2.0 2.00 3 ▂▁▇▁▁
no_credits 0 1 1.41 0.58 1 1.0 1.0 2.00 4 ▇▅▁▁▁
cccupation 0 1 2.90 0.65 1 3.0 3.0 3.00 4 ▁▂▁▇▂
no_dependents 0 1 1.16 0.36 1 1.0 1.0 1.00 2 ▇▁▁▁▂
telephone 0 1 1.40 0.49 1 1.0 1.0 2.00 2 ▇▁▁▁▆
foreign_worker 0 1 1.04 0.19 1 1.0 1.0 1.00 2 ▇▁▁▁▁

Konversi beberapa data numerik sebagai data kategorik

germancredit$creditability <- as.factor(germancredit$creditability)
germancredit$account_balance <- as.factor(germancredit$account_balance)
germancredit$payment_status <- as.factor(germancredit$payment_status)
germancredit$stocks <- as.factor(germancredit$stocks)
germancredit$length_employment <- as.factor(germancredit$length_employment)
germancredit$sex_marital_status <- as.factor(germancredit$sex_marital_status)
germancredit$no_credits <- as.factor(germancredit$no_credits)
germancredit$foreign_worker <- as.factor(germancredit$foreign_worker)
germancredit$purpose <- as.factor(germancredit$purpose)

Statistik deskriptif

summary(germancredit)
 creditability account_balance    duration    payment_status    purpose   
 0:300         1:274           Min.   : 4.0   0: 40          3      :280  
 1:700         2:269           1st Qu.:12.0   1: 49          0      :234  
               3: 63           Median :18.0   2:530          2      :181  
               4:394           Mean   :20.9   3: 88          1      :103  
                               3rd Qu.:24.0   4:293          9      : 97  
                               Max.   :72.0                  6      : 50  
                                                             (Other): 55  
 credit_amount   stocks  length_employment instalment_percent
 Min.   :  250   1:603   1: 62             Min.   :1.000     
 1st Qu.: 1366   2:103   2:172             1st Qu.:2.000     
 Median : 2320   3: 63   3:339             Median :3.000     
 Mean   : 3271   4: 48   4:174             Mean   :2.973     
 3rd Qu.: 3972   5:183   5:253             3rd Qu.:4.000     
 Max.   :18424                             Max.   :4.000     
                                                             
 sex_marital_status   guarantors    duration_currentaddress
 1: 50              Min.   :1.000   Min.   :1.000          
 2:310              1st Qu.:1.000   1st Qu.:2.000          
 3:548              Median :1.000   Median :3.000          
 4: 92              Mean   :1.145   Mean   :2.845          
                    3rd Qu.:1.000   3rd Qu.:4.000          
                    Max.   :3.000   Max.   :4.000          
                                                           
 most_available_asset      age        concurrent_credits type_apartment 
 Min.   :1.000        Min.   :19.00   Min.   :1.000      Min.   :1.000  
 1st Qu.:1.000        1st Qu.:27.00   1st Qu.:3.000      1st Qu.:2.000  
 Median :2.000        Median :33.00   Median :3.000      Median :2.000  
 Mean   :2.358        Mean   :35.54   Mean   :2.675      Mean   :1.928  
 3rd Qu.:3.000        3rd Qu.:42.00   3rd Qu.:3.000      3rd Qu.:2.000  
 Max.   :4.000        Max.   :75.00   Max.   :3.000      Max.   :3.000  
                                                                        
 no_credits   cccupation    no_dependents     telephone     foreign_worker
 1:633      Min.   :1.000   Min.   :1.000   Min.   :1.000   1:963         
 2:333      1st Qu.:3.000   1st Qu.:1.000   1st Qu.:1.000   2: 37         
 3: 28      Median :3.000   Median :1.000   Median :1.000                 
 4:  6      Mean   :2.904   Mean   :1.155   Mean   :1.404                 
            3rd Qu.:3.000   3rd Qu.:1.000   3rd Qu.:2.000                 
            Max.   :4.000   Max.   :2.000   Max.   :2.000                 
                                                                          
prop.table(table(germancredit$creditability))

  0   1 
0.3 0.7 

Data tersebut memiliki 30% pemohon kredit dengan kategori buruk, 70% sisanya dengan kategori baik

library(DataExplorer)
plot_density(germancredit)

plot_bar(germancredit)

Boxplot peubah numerik

ggplot(germancredit, aes(x=creditability, y=credit_amount, fill=creditability)) +
  geom_boxplot()

Grafik batang peubah kategorik

ggplot(germancredit, aes(account_balance, after_stat(count))) +
  geom_bar(aes(fill=creditability), position = "dodge")

Partisi/splitting data

# generate indeks data
indeks <- sample(1:nrow(germancredit), size = 0.7 * nrow(germancredit))

# partisi 70%
train70 <- germancredit[indeks, ] # berisi 70%
test30 <- germancredit[-indeks, ] # berisi 30%

Regresi Logistik Biner

Sebagai pengantar pada regresi klasik, sisaan \epsilon diasumsikan menyebar normal, yang berarti Y juga menyebar normal. Namun pada kasus tertentu Y dapat berupa kategorik, misalnya kasus biner.

Makna garis lurus jika digunakan pada kasus biner bentuk pola prediksi Y menjadi jauh lebih besar dari 1 (positif), atau bisa juga menjadi sangat negatif sehingga dianggap tidak sesuai dengan hasil sebenarnya.

Odds adalah rasio peluang sukses dibagi peluang gagal, dimana peluang sukses > peluang gagal. Nilai odd disebut juga nilai logit

Logit adalah nilai log dari odds yang sama dengan \alpha + \beta X.

Pada regresi linear jika nilai X naik satu satuan maka \beta meningkat satu satuan. Pada logit jika nilai X naik satu satuan maka odds meningkat e^{\beta} satuan. Merasiokan odd atau odds ratio sama saja dengan nilai e^{\beta}.

Jika odd ratio bernilai 2, maknanya bahwa odds pada saat X=x+1 sama dengan 2 kali dari odds pada saat X=x. Kesimpulannya adalah peluang sukses terjadi 2 kali dari kejadian peluang gagal.

Pemodelan

reglog <- glm(creditability ~ ., family = binomial, data=train70)
summary(reglog)

Call:
glm(formula = creditability ~ ., family = binomial, data = train70)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.9954  -0.6297   0.3459   0.7204   1.9740  

Coefficients:
                          Estimate Std. Error z value Pr(>|z|)    
(Intercept)             -1.451e+00  1.315e+00  -1.103 0.269977    
account_balance2         2.675e-01  2.693e-01   0.993 0.320696    
account_balance3         1.760e+00  5.228e-01   3.367 0.000760 ***
account_balance4         1.449e+00  2.697e-01   5.373 7.74e-08 ***
duration                -3.090e-02  1.147e-02  -2.695 0.007044 ** 
payment_status1          4.615e-01  6.794e-01   0.679 0.496941    
payment_status2          1.048e+00  5.450e-01   1.922 0.054575 .  
payment_status3          1.327e+00  5.804e-01   2.286 0.022251 *  
payment_status4          2.149e+00  5.529e-01   3.886 0.000102 ***
purpose1                 2.421e+00  5.296e-01   4.571 4.85e-06 ***
purpose2                 5.938e-01  3.119e-01   1.903 0.056976 .  
purpose3                 7.439e-01  2.925e-01   2.543 0.010977 *  
purpose4                 6.324e-01  1.073e+00   0.590 0.555511    
purpose5                -4.368e-01  7.192e-01  -0.607 0.543641    
purpose6                -7.236e-02  4.886e-01  -0.148 0.882257    
purpose8                 1.756e+00  1.253e+00   1.401 0.161209    
purpose9                 1.084e+00  4.258e-01   2.547 0.010881 *  
purpose10                2.590e+00  1.075e+00   2.410 0.015941 *  
credit_amount           -1.649e-04  5.567e-05  -2.962 0.003055 ** 
stocks2                  4.410e-01  3.617e-01   1.220 0.222652    
stocks3                  4.599e-01  4.531e-01   1.015 0.310134    
stocks4                  1.495e+00  6.270e-01   2.385 0.017075 *  
stocks5                  1.323e+00  3.364e-01   3.934 8.35e-05 ***
length_employment2      -6.268e-02  5.027e-01  -0.125 0.900772    
length_employment3      -6.413e-02  4.724e-01  -0.136 0.892011    
length_employment4       1.033e+00  5.248e-01   1.969 0.049009 *  
length_employment5       1.659e-01  4.951e-01   0.335 0.737598    
instalment_percent      -3.308e-01  1.093e-01  -3.026 0.002478 ** 
sex_marital_status2      1.300e-01  4.733e-01   0.275 0.783578    
sex_marital_status3      6.563e-01  4.588e-01   1.430 0.152641    
sex_marital_status4      3.579e-01  5.790e-01   0.618 0.536546    
guarantors               4.080e-01  2.325e-01   1.755 0.079273 .  
duration_currentaddress -3.643e-02  1.049e-01  -0.347 0.728338    
most_available_asset    -1.124e-01  1.244e-01  -0.904 0.366163    
age                      5.118e-03  1.090e-02   0.469 0.638806    
concurrent_credits       1.985e-01  1.550e-01   1.281 0.200247    
type_apartment           1.357e-01  2.257e-01   0.601 0.547639    
no_credits2             -3.524e-01  2.868e-01  -1.229 0.219034    
no_credits3              1.361e-01  8.118e-01   0.168 0.866889    
no_credits4             -2.236e-01  1.144e+00  -0.195 0.845070    
cccupation              -6.756e-03  1.920e-01  -0.035 0.971931    
no_dependents           -2.852e-01  3.058e-01  -0.933 0.351030    
telephone                3.329e-01  2.472e-01   1.347 0.178139    
foreign_worker2          1.348e+00  7.357e-01   1.833 0.066851 .  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 850.06  on 699  degrees of freedom
Residual deviance: 608.53  on 656  degrees of freedom
AIC: 696.53

Number of Fisher Scoring iterations: 5
reglog2 <- glm(creditability ~ account_balance + credit_amount + instalment_percent + stocks + purpose, 
              family = binomial, 
              data=train70)
summary(reglog2)

Call:
glm(formula = creditability ~ account_balance + credit_amount + 
    instalment_percent + stocks + purpose, family = binomial, 
    data = train70)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.5504  -0.9081   0.4699   0.7474   2.0260  

Coefficients:
                     Estimate Std. Error z value Pr(>|z|)    
(Intercept)         1.209e+00  3.872e-01   3.123 0.001793 ** 
account_balance2    3.134e-01  2.405e-01   1.303 0.192586    
account_balance3    1.555e+00  4.789e-01   3.247 0.001168 ** 
account_balance4    1.607e+00  2.447e-01   6.565 5.20e-11 ***
credit_amount      -2.141e-04  3.902e-05  -5.487 4.09e-08 ***
instalment_percent -3.328e-01  9.248e-02  -3.598 0.000320 ***
stocks2             1.451e-01  3.184e-01   0.456 0.648523    
stocks3             4.429e-01  4.334e-01   1.022 0.306882    
stocks4             1.367e+00  5.727e-01   2.386 0.017010 *  
stocks5             1.184e+00  3.057e-01   3.874 0.000107 ***
purpose1            2.087e+00  4.803e-01   4.345 1.39e-05 ***
purpose2            3.035e-01  2.803e-01   1.083 0.278888    
purpose3            5.349e-01  2.581e-01   2.072 0.038235 *  
purpose4            1.453e-01  9.702e-01   0.150 0.880930    
purpose5           -7.224e-01  6.509e-01  -1.110 0.267088    
purpose6           -1.922e-01  4.483e-01  -0.429 0.668138    
purpose8            9.535e-01  1.178e+00   0.809 0.418228    
purpose9            4.203e-01  3.527e-01   1.192 0.233343    
purpose10           2.067e+00  8.968e-01   2.305 0.021149 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 850.06  on 699  degrees of freedom
Residual deviance: 689.88  on 681  degrees of freedom
AIC: 727.88

Number of Fisher Scoring iterations: 5

Seleksi fitur dengan stepwise

reglog_step <- step(reglog)
Start:  AIC=696.53
creditability ~ account_balance + duration + payment_status + 
    purpose + credit_amount + stocks + length_employment + instalment_percent + 
    sex_marital_status + guarantors + duration_currentaddress + 
    most_available_asset + age + concurrent_credits + type_apartment + 
    no_credits + cccupation + no_dependents + telephone + foreign_worker

                          Df Deviance    AIC
- no_credits               3   610.28 692.28
- cccupation               1   608.53 694.53
- duration_currentaddress  1   608.65 694.65
- age                      1   608.75 694.75
- type_apartment           1   608.89 694.89
- most_available_asset     1   609.35 695.35
- no_dependents            1   609.39 695.39
- sex_marital_status       3   613.76 695.76
- concurrent_credits       1   610.15 696.15
- telephone                1   610.36 696.36
<none>                         608.53 696.53
- guarantors               1   611.79 697.79
- foreign_worker           1   612.72 698.72
- length_employment        4   621.11 701.11
- duration                 1   615.88 701.88
- credit_amount            1   617.47 703.47
- instalment_percent       1   617.99 703.99
- stocks                   4   630.76 710.76
- payment_status           4   632.21 712.21
- purpose                  9   646.68 716.68
- account_balance          3   649.61 731.61

Step:  AIC=692.28
creditability ~ account_balance + duration + payment_status + 
    purpose + credit_amount + stocks + length_employment + instalment_percent + 
    sex_marital_status + guarantors + duration_currentaddress + 
    most_available_asset + age + concurrent_credits + type_apartment + 
    cccupation + no_dependents + telephone + foreign_worker

                          Df Deviance    AIC
- cccupation               1   610.28 690.28
- duration_currentaddress  1   610.36 690.36
- age                      1   610.43 690.43
- type_apartment           1   610.79 690.79
- most_available_asset     1   611.00 691.00
- no_dependents            1   611.11 691.11
- sex_marital_status       3   615.38 691.38
- telephone                1   612.11 692.11
- concurrent_credits       1   612.22 692.22
<none>                         610.28 692.28
- guarantors               1   613.40 693.40
- foreign_worker           1   615.09 695.09
- length_employment        4   622.80 696.80
- duration                 1   617.69 697.69
- credit_amount            1   619.00 699.00
- instalment_percent       1   619.77 699.77
- stocks                   4   632.52 706.52
- payment_status           4   632.99 706.99
- purpose                  9   648.57 712.57
- account_balance          3   650.66 726.66

Step:  AIC=690.28
creditability ~ account_balance + duration + payment_status + 
    purpose + credit_amount + stocks + length_employment + instalment_percent + 
    sex_marital_status + guarantors + duration_currentaddress + 
    most_available_asset + age + concurrent_credits + type_apartment + 
    no_dependents + telephone + foreign_worker

                          Df Deviance    AIC
- duration_currentaddress  1   610.36 688.36
- age                      1   610.43 688.43
- type_apartment           1   610.79 688.79
- most_available_asset     1   611.03 689.03
- no_dependents            1   611.11 689.11
- sex_marital_status       3   615.41 689.41
- concurrent_credits       1   612.22 690.22
- telephone                1   612.28 690.28
<none>                         610.28 690.28
- guarantors               1   613.40 691.40
- foreign_worker           1   615.09 693.09
- length_employment        4   622.86 694.86
- duration                 1   617.72 695.72
- credit_amount            1   619.12 697.12
- instalment_percent       1   620.02 698.02
- stocks                   4   632.53 704.53
- payment_status           4   633.01 705.01
- purpose                  9   649.18 711.18
- account_balance          3   650.73 724.73

Step:  AIC=688.36
creditability ~ account_balance + duration + payment_status + 
    purpose + credit_amount + stocks + length_employment + instalment_percent + 
    sex_marital_status + guarantors + most_available_asset + 
    age + concurrent_credits + type_apartment + no_dependents + 
    telephone + foreign_worker

                       Df Deviance    AIC
- age                   1   610.48 686.48
- type_apartment        1   610.93 686.93
- most_available_asset  1   611.19 687.19
- no_dependents         1   611.19 687.19
- sex_marital_status    3   615.52 687.52
- concurrent_credits    1   612.29 688.29
- telephone             1   612.31 688.31
<none>                      610.36 688.36
- guarantors            1   613.48 689.48
- foreign_worker        1   615.19 691.19
- length_employment     4   622.87 692.87
- duration              1   617.85 693.85
- credit_amount         1   619.12 695.12
- instalment_percent    1   620.06 696.06
- stocks                4   632.53 702.53
- payment_status        4   633.03 703.03
- purpose               9   649.26 709.26
- account_balance       3   651.62 723.62

Step:  AIC=686.48
creditability ~ account_balance + duration + payment_status + 
    purpose + credit_amount + stocks + length_employment + instalment_percent + 
    sex_marital_status + guarantors + most_available_asset + 
    concurrent_credits + type_apartment + no_dependents + telephone + 
    foreign_worker

                       Df Deviance    AIC
- type_apartment        1   611.16 685.16
- no_dependents         1   611.29 685.29
- most_available_asset  1   611.29 685.29
- sex_marital_status    3   615.70 685.70
- concurrent_credits    1   612.40 686.40
<none>                      610.48 686.48
- telephone             1   612.56 686.56
- guarantors            1   613.79 687.79
- foreign_worker        1   615.34 689.34
- length_employment     4   623.06 691.06
- duration              1   618.10 692.10
- credit_amount         1   619.32 693.32
- instalment_percent    1   620.10 694.10
- stocks                4   632.99 700.99
- payment_status        4   633.28 701.28
- purpose               9   649.29 707.29
- account_balance       3   651.88 721.88

Step:  AIC=685.16
creditability ~ account_balance + duration + payment_status + 
    purpose + credit_amount + stocks + length_employment + instalment_percent + 
    sex_marital_status + guarantors + most_available_asset + 
    concurrent_credits + no_dependents + telephone + foreign_worker

                       Df Deviance    AIC
- most_available_asset  1   611.62 683.62
- no_dependents         1   611.93 683.93
- concurrent_credits    1   613.00 685.00
- sex_marital_status    3   617.00 685.00
<none>                      611.16 685.16
- telephone             1   613.23 685.23
- guarantors            1   614.50 686.50
- foreign_worker        1   615.84 687.84
- length_employment     4   623.32 689.32
- duration              1   618.62 690.62
- credit_amount         1   619.82 691.82
- instalment_percent    1   620.55 692.55
- stocks                4   633.24 699.24
- payment_status        4   634.74 700.74
- purpose               9   650.09 706.09
- account_balance       3   653.59 721.59

Step:  AIC=683.62
creditability ~ account_balance + duration + payment_status + 
    purpose + credit_amount + stocks + length_employment + instalment_percent + 
    sex_marital_status + guarantors + concurrent_credits + no_dependents + 
    telephone + foreign_worker

                     Df Deviance    AIC
- no_dependents       1   612.35 682.35
- sex_marital_status  3   617.31 683.31
- telephone           1   613.44 683.44
- concurrent_credits  1   613.60 683.60
<none>                    611.62 683.62
- guarantors          1   615.20 685.20
- foreign_worker      1   616.56 686.56
- length_employment   4   624.18 688.18
- duration            1   619.90 689.90
- credit_amount       1   620.65 690.65
- instalment_percent  1   621.42 691.42
- stocks              4   633.99 697.99
- payment_status      4   635.81 699.81
- purpose             9   650.87 704.87
- account_balance     3   654.41 720.41

Step:  AIC=682.35
creditability ~ account_balance + duration + payment_status + 
    purpose + credit_amount + stocks + length_employment + instalment_percent + 
    sex_marital_status + guarantors + concurrent_credits + telephone + 
    foreign_worker

                     Df Deviance    AIC
- sex_marital_status  3   617.38 681.38
- telephone           1   614.28 682.28
<none>                    612.35 682.35
- concurrent_credits  1   614.54 682.54
- guarantors          1   615.69 683.69
- foreign_worker      1   617.31 685.31
- length_employment   4   625.03 687.03
- duration            1   620.66 688.66
- credit_amount       1   621.06 689.06
- instalment_percent  1   621.72 689.72
- stocks              4   634.38 696.38
- payment_status      4   636.68 698.68
- purpose             9   651.55 703.55
- account_balance     3   655.46 719.46

Step:  AIC=681.38
creditability ~ account_balance + duration + payment_status + 
    purpose + credit_amount + stocks + length_employment + instalment_percent + 
    guarantors + concurrent_credits + telephone + foreign_worker

                     Df Deviance    AIC
- concurrent_credits  1   618.80 680.80
- telephone           1   618.90 680.90
<none>                    617.38 681.38
- guarantors          1   620.54 682.54
- foreign_worker      1   622.92 684.92
- credit_amount       1   625.04 687.04
- instalment_percent  1   625.11 687.11
- duration            1   625.61 687.61
- length_employment   4   632.04 688.04
- stocks              4   638.42 694.42
- payment_status      4   642.69 698.69
- purpose             9   656.57 702.57
- account_balance     3   661.21 719.21

Step:  AIC=680.8
creditability ~ account_balance + duration + payment_status + 
    purpose + credit_amount + stocks + length_employment + instalment_percent + 
    guarantors + telephone + foreign_worker

                     Df Deviance    AIC
- telephone           1   620.23 680.23
<none>                    618.80 680.80
- guarantors          1   622.08 682.08
- foreign_worker      1   624.14 684.14
- credit_amount       1   626.37 686.37
- instalment_percent  1   626.49 686.49
- duration            1   627.13 687.13
- length_employment   4   633.85 687.85
- stocks              4   639.31 693.31
- purpose             9   657.43 701.43
- payment_status      4   648.07 702.07
- account_balance     3   662.13 718.13

Step:  AIC=680.23
creditability ~ account_balance + duration + payment_status + 
    purpose + credit_amount + stocks + length_employment + instalment_percent + 
    guarantors + foreign_worker

                     Df Deviance    AIC
<none>                    620.23 680.23
- guarantors          1   623.40 681.40
- foreign_worker      1   625.51 683.51
- credit_amount       1   626.81 684.81
- instalment_percent  1   627.41 685.41
- duration            1   628.76 686.76
- length_employment   4   635.86 687.86
- stocks              4   641.33 693.33
- payment_status      4   649.73 701.73
- purpose             9   660.21 702.21
- account_balance     3   665.35 719.35
summary(reglog_step)

Call:
glm(formula = creditability ~ account_balance + duration + payment_status + 
    purpose + credit_amount + stocks + length_employment + instalment_percent + 
    guarantors + foreign_worker, family = binomial, data = train70)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.8803  -0.6574   0.3820   0.7053   2.1417  

Coefficients:
                     Estimate Std. Error z value Pr(>|z|)    
(Intercept)        -7.678e-01  7.848e-01  -0.978 0.327880    
account_balance2    3.033e-01  2.628e-01   1.154 0.248538    
account_balance3    1.826e+00  5.108e-01   3.575 0.000351 ***
account_balance4    1.475e+00  2.619e-01   5.632 1.79e-08 ***
duration           -3.230e-02  1.114e-02  -2.899 0.003739 ** 
payment_status1     5.051e-01  6.353e-01   0.795 0.426633    
payment_status2     1.285e+00  5.033e-01   2.554 0.010664 *  
payment_status3     1.408e+00  5.622e-01   2.505 0.012242 *  
payment_status4     2.243e+00  5.320e-01   4.215 2.50e-05 ***
purpose1            2.393e+00  5.132e-01   4.662 3.13e-06 ***
purpose2            5.455e-01  3.003e-01   1.817 0.069282 .  
purpose3            7.808e-01  2.840e-01   2.749 0.005983 ** 
purpose4            7.277e-01  1.055e+00   0.690 0.490408    
purpose5           -4.945e-01  6.915e-01  -0.715 0.474548    
purpose6           -1.250e-01  4.800e-01  -0.260 0.794491    
purpose8            1.509e+00  1.221e+00   1.235 0.216787    
purpose9            1.092e+00  4.119e-01   2.652 0.007994 ** 
purpose10           2.266e+00  9.755e-01   2.323 0.020191 *  
credit_amount      -1.323e-04  5.189e-05  -2.551 0.010755 *  
stocks2             3.121e-01  3.477e-01   0.897 0.369490    
stocks3             4.796e-01  4.498e-01   1.066 0.286361    
stocks4             1.391e+00  6.114e-01   2.275 0.022934 *  
stocks5             1.257e+00  3.269e-01   3.846 0.000120 ***
length_employment2 -2.231e-01  4.877e-01  -0.457 0.647330    
length_employment3 -1.184e-01  4.603e-01  -0.257 0.796991    
length_employment4  1.033e+00  5.117e-01   2.018 0.043559 *  
length_employment5  1.281e-01  4.748e-01   0.270 0.787342    
instalment_percent -2.738e-01  1.035e-01  -2.646 0.008144 ** 
guarantors          3.855e-01  2.230e-01   1.729 0.083892 .  
foreign_worker2     1.436e+00  7.088e-01   2.026 0.042778 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 850.06  on 699  degrees of freedom
Residual deviance: 620.23  on 670  degrees of freedom
AIC: 680.23

Number of Fisher Scoring iterations: 5

Final model

reglog_final <- glm(formula = creditability ~ account_balance + payment_status + 
    purpose + credit_amount + stocks + length_employment + instalment_percent + 
    most_available_asset + concurrent_credits + type_apartment + telephone + foreign_worker, 
    family = binomial, data = train70)
summary(reglog_final)

Call:
glm(formula = creditability ~ account_balance + payment_status + 
    purpose + credit_amount + stocks + length_employment + instalment_percent + 
    most_available_asset + concurrent_credits + type_apartment + 
    telephone + foreign_worker, family = binomial, data = train70)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.9233  -0.6723   0.3786   0.7012   2.1482  

Coefficients:
                       Estimate Std. Error z value Pr(>|z|)    
(Intercept)          -1.319e+00  9.617e-01  -1.372 0.170193    
account_balance2      3.174e-01  2.621e-01   1.211 0.225872    
account_balance3      1.782e+00  5.056e-01   3.524 0.000425 ***
account_balance4      1.460e+00  2.616e-01   5.579 2.42e-08 ***
payment_status1       6.073e-01  6.366e-01   0.954 0.340072    
payment_status2       1.230e+00  4.992e-01   2.463 0.013787 *  
payment_status3       1.208e+00  5.522e-01   2.187 0.028769 *  
payment_status4       2.144e+00  5.301e-01   4.043 5.27e-05 ***
purpose1              2.313e+00  5.092e-01   4.543 5.54e-06 ***
purpose2              5.606e-01  3.031e-01   1.850 0.064365 .  
purpose3              6.914e-01  2.783e-01   2.484 0.012975 *  
purpose4              4.722e-01  1.029e+00   0.459 0.646270    
purpose5             -4.670e-01  6.979e-01  -0.669 0.503431    
purpose6             -1.446e-01  4.794e-01  -0.302 0.762878    
purpose8              1.652e+00  1.253e+00   1.318 0.187603    
purpose9              8.628e-01  4.019e-01   2.147 0.031808 *  
purpose10             2.582e+00  9.931e-01   2.600 0.009313 ** 
credit_amount        -2.299e-04  4.459e-05  -5.155 2.53e-07 ***
stocks2               3.666e-01  3.531e-01   1.038 0.299255    
stocks3               4.751e-01  4.431e-01   1.072 0.283611    
stocks4               1.369e+00  6.114e-01   2.240 0.025122 *  
stocks5               1.238e+00  3.294e-01   3.758 0.000171 ***
length_employment2   -1.559e-01  4.867e-01  -0.320 0.748751    
length_employment3   -1.024e-01  4.570e-01  -0.224 0.822613    
length_employment4    1.005e+00  5.078e-01   1.979 0.047854 *  
length_employment5    1.703e-01  4.728e-01   0.360 0.718754    
instalment_percent   -3.670e-01  1.017e-01  -3.607 0.000309 ***
most_available_asset -1.639e-01  1.166e-01  -1.406 0.159725    
concurrent_credits    1.873e-01  1.495e-01   1.253 0.210140    
type_apartment        2.199e-01  2.118e-01   1.038 0.299230    
telephone             3.263e-01  2.259e-01   1.445 0.148538    
foreign_worker2       1.725e+00  7.073e-01   2.439 0.014746 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 850.06  on 699  degrees of freedom
Residual deviance: 626.33  on 668  degrees of freedom
AIC: 690.33

Number of Fisher Scoring iterations: 5

Kinerja Model

Validasi model pada regresi logistik untuk peubah Y sudah dipastikan bahwa:

  • tidak menyebar normal
  • varian atau ragam peubah Y berubah dan tidak konstan
  • masalah multikolinearitas terkait pendugaan parameter, jika menduga dengan Least Square maka multikolinearitas tetap diperhatikan. Regresi gulud (ridge) dapat digunakan untuk kasus regresi klasik terhadap asumsi ini, tetapi tidak ada penanganan hal tersebut di regresi logistik.

Regresi klasik menggunakan analisis ragam (ANOVA) untuk melihat kinerja model. Secara prinsip, Y pada regresi logistik hanya bersifat biner, ukuran yang dapat digunakan untuk menilai kebaikan model adalah dengan menggunakan uji likelihood ratio test atau chi-square. Cara lain dalam menilai kebaikan regresi logistik bisa dilihat dari kemampuan prediksinya dimana kemampuan memprediksi untuk kasus klasifikasi yang cocok dapat menggunakan tabel klasifikasi atau confusion matrix

Confusion matrix

# fitted.values
fit70 <- fitted.values(reglog_final)
pred_fit70 <- ifelse(fit70 >= 0.5, 1, 0)
conftab <- table(train70$creditability, pred_fit70, dnn = c("Aktual", "Prediksi"))
conftab
      Prediksi
Aktual   0   1
     0 109  98
     1  48 445
cat("Akurasi\t=", sum(diag(conftab)/sum(conftab)), "\n")
Akurasi = 0.7914286 

Cara lain dengan perhitungan manual untuk melihat performa model menggunakan data latih

# predict
pred_reglog70 <- predict(reglog_final, train70, type="response")
y_pred <- ifelse(pred_reglog70 >= 0.5, 1, 0)
y_act <- train70$creditability

tbl_klas_reglog <- table(y_act, y_pred)
tbl_klas_reglog
     y_pred
y_act   0   1
    0 109  98
    1  48 445
akurasi <- (tbl_klas_reglog[1,1] + tbl_klas_reglog[2,2]) / sum(tbl_klas_reglog) * 100
sensitivitas <- tbl_klas_reglog[2,2] / sum(tbl_klas_reglog[2,]) * 100
spesifisitas <- tbl_klas_reglog[1,1] / sum(tbl_klas_reglog[1,]) * 100 
fprate <- tbl_klas_reglog[2,1] / (tbl_klas_reglog[2,1] + tbl_klas_reglog[1,1]) * 100
AUC <- (100 + sensitivitas - fprate) / 2 

performa <- data.frame(akurasi, sensitivitas, spesifisitas, AUC)
performa
   akurasi sensitivitas spesifisitas      AUC
1 79.14286     90.26369       52.657 79.84522

Berdasarkan output confusion matrix diatas dapat dilihat bahwa kemampuan prediksi dari model yang didapatkan untuk memprediksi kelas dengan pembayaran pinjaman kredit yang baik dapat diprediksi tepat sebanyak 435 dari total 490 sedangkan untuk kelas pembayaran kredit yang macet atau kurang baik diprediksi tepat sebanyak 109 dari total 210.

Prediksi Model

Kurva ROC

Menguji model dengan data uji

# predict.glm
pred_reglog_fin <- predict.glm(reglog_final, test30)
pred_rocr <- prediction(pred_reglog_fin, test30$creditability)
perf_rocr <- performance(pred_rocr, "tpr", "fpr")
plot(perf_rocr)

Regresi Logistik Multinomial

Data yang digunakan pada model ini melihat bagaimana perilaku nasabah perbankan dalam memilih produk baru yang dimiliki suatu bank.

Pilihan produk:

  • direct debit
  • e-account
  • none
  • pension

Terdapat (4-1) = 3 model logit.

multinom_train <- read.csv(file="https://github.com/alfanugraha/tsa-handson/raw/master/training_multinom.csv", 
                header = T, 
                sep = ",")
multinom_test <- read.csv(file="https://github.com/alfanugraha/tsa-handson/raw/master/test_multinom.csv", 
                header = T, 
                sep = ",")
kable(head(multinom_train))
age new.cust cust.relation income payroll product
35 0 A 75931.98 0 direct debit
39 0 I 241814.67 0 e-account
59 0 A 319132.80 0 direct debit
25 0 A 52795.47 0 direct debit
39 0 A 266896.86 1 e-account
44 0 A 42304.23 0 direct debit

Proporsi peubah respon

prop.table(table(multinom_train$product))

direct debit    e-account         none      pension 
   0.2776003    0.1884984    0.3972311    0.1366702 

Pemodelan

reglog_multinom <- vglm(product ~ ., family = multinomial(refLevel = "none"), data = multinom_train)
summary(reglog_multinom)

Call:
vglm(formula = product ~ ., family = multinomial(refLevel = "none"), 
    data = multinom_train)

Coefficients: 
                   Estimate Std. Error z value Pr(>|z|)    
(Intercept):1     3.274e-01  2.047e-01   1.600   0.1097    
(Intercept):2    -9.475e-01  2.046e-01  -4.632 3.62e-06 ***
(Intercept):3    -2.440e+00  3.389e-01  -7.199 6.07e-13 ***
age:1            -3.922e-03  4.260e-03  -0.921   0.3573    
age:2             1.695e-02  4.013e-03   4.223 2.41e-05 ***
age:3            -5.089e-03  6.189e-03  -0.822   0.4110    
new.cust:1       -1.067e+00  4.353e-01  -2.452   0.0142 *  
new.cust:2       -8.027e-01  4.513e-01  -1.779   0.0753 .  
new.cust:3       -7.338e-01  8.972e-01  -0.818   0.4134    
cust.relationI:1 -4.429e+00  3.155e-01 -14.039  < 2e-16 ***
cust.relationI:2 -2.658e+00  1.769e-01 -15.024  < 2e-16 ***
cust.relationI:3 -3.222e+00  5.380e-01  -5.990 2.10e-09 ***
cust.relationP:1 -1.298e+01  6.940e+02      NA       NA    
cust.relationP:2 -1.377e+01  6.940e+02      NA       NA    
cust.relationP:3 -1.048e+01  6.940e+02      NA       NA    
income:1         -3.347e-07  3.851e-07  -0.869   0.3847    
income:2          2.156e-07  2.641e-07   0.816   0.4144    
income:3         -6.003e-08  4.194e-07  -0.143   0.8862    
payroll:1         4.238e+00  4.258e-01   9.952  < 2e-16 ***
payroll:2         3.676e+00  4.291e-01   8.566  < 2e-16 ***
payroll:3         6.966e+00  4.624e-01  15.065  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Names of linear predictors: log(mu[,1]/mu[,3]), log(mu[,2]/mu[,3]), 
log(mu[,4]/mu[,3])

Residual deviance: 4983.426 on 8430 degrees of freedom

Log-likelihood: -2491.713 on 8430 degrees of freedom

Number of Fisher scoring iterations: 11 

Warning: Hauck-Donner effect detected in the following estimate(s):
'cust.relationI:1', 'cust.relationP:1', 'cust.relationP:2', 'cust.relationP:3', 'payroll:3'


Reference group is level  3  of the response

Selang kepercayaan berdasarkan pendekatan uji Wald

zCrit <- qnorm(c(0.05/2, 1 - 0.05/2))
ciCoef <- as.data.frame(t(apply(coef(summary(reglog_multinom)), 1, function(x) x["Estimate"] - zCrit*x["Std. Error"] )))
names(ciCoef)<-c("Lower","Upper")
head(ciCoef)
                     Lower        Upper
(Intercept):1  0.728631308 -0.073782419
(Intercept):2 -0.546570339 -1.348398715
(Intercept):3 -1.775431639 -3.103872242
age:1          0.004427342 -0.012270389
age:2          0.024813364  0.009081686
age:3          0.007042049 -0.017219239

Odds Ratios

round(exp(coef(reglog_multinom)),4)
   (Intercept):1    (Intercept):2    (Intercept):3            age:1 
          1.3874           0.3877           0.0872           0.9961 
           age:2            age:3       new.cust:1       new.cust:2 
          1.0171           0.9949           0.3439           0.4481 
      new.cust:3 cust.relationI:1 cust.relationI:2 cust.relationI:3 
          0.4801           0.0119           0.0701           0.0399 
cust.relationP:1 cust.relationP:2 cust.relationP:3         income:1 
          0.0000           0.0000           0.0000           1.0000 
        income:2         income:3        payroll:1        payroll:2 
          1.0000           1.0000          69.2474          39.4699 
       payroll:3 
       1060.0324 

Kinerja model

Model dievaluasi dengan data latih untuk melihat kinerja model regresi logistik multinomial

reglog_multinom_pred <- predict(reglog_multinom, type="response")

# menambahkan kolom kelas aktual
tbl_multinom <- data.frame(reglog_multinom_pred, multinom_train$product)
kable(head(tbl_multinom))
direct.debit e.account none pension multinom_train.product
0.3976784 0.2405524 0.3372716 0.0244977 direct debit
0.0122196 0.0517709 0.9333877 0.0026218 e-account
0.3109494 0.3548228 0.3143151 0.0199127 direct debit
0.4244712 0.2057611 0.3434802 0.0262875 direct debit
0.4134388 0.1721252 0.0054831 0.4089529 e-account
0.3779828 0.2708211 0.3283668 0.0228292 direct debit

Perbandingan antara kelas aktual dan prediksi

# menambah kolom kelas prediksi
tbl_multinom$prediksi <- levels(as.factor(multinom_train$product))[max.col(reglog_multinom_pred)]
kable(head(tbl_multinom))
direct.debit e.account none pension multinom_train.product prediksi
0.3976784 0.2405524 0.3372716 0.0244977 direct debit direct debit
0.0122196 0.0517709 0.9333877 0.0026218 e-account none
0.3109494 0.3548228 0.3143151 0.0199127 direct debit e-account
0.4244712 0.2057611 0.3434802 0.0262875 direct debit direct debit
0.4134388 0.1721252 0.0054831 0.4089529 e-account direct debit
0.3779828 0.2708211 0.3283668 0.0228292 direct debit direct debit

Ukuran kebaikan model dapat dilihat dengan confusion matrix, Deviance, Log-likelihood, AIC, McFadden, Cox & Snell and Nagelkerke Pseudo R-squared

Confusion matrix

Confusion matrix atau tabel klasifikasi dapat digunakan untuk melihat kebaikan model

multinom_prop <- table(tbl_multinom$prediksi, tbl_multinom$multinom_train.product)
multinom_prop
              
               direct debit e-account none pension
  direct debit          694       371  269     355
  e-account              52       101   83       9
  none                   20        51  765       0
  pension                16         8    2      21
round(prop.table(multinom_prop, 1), 3)
              
               direct debit e-account  none pension
  direct debit        0.411     0.220 0.159   0.210
  e-account           0.212     0.412 0.339   0.037
  none                0.024     0.061 0.915   0.000
  pension             0.340     0.170 0.043   0.447
mean(tbl_multinom$prediksi == multinom_train$product)
[1] 0.5612354

Deviance, Log-likelihood, AIC

cat("deviance\t= ", deviance(reglog_multinom), "\n")
deviance    =  4983.426 
cat("logLik\t\t= ", logLik(reglog_multinom), "\n")
logLik      =  -2491.713 
cat("AIC\t\t= ", AIC(reglog_multinom), "\n")
AIC     =  5025.426 

McFadden, Cox & Snell and Nagelkerke Pseudo R-squared

Koefisien determinasi sering digunakan pada model logit untuk mengetahui seberapa besar pengaruh peubah penjelas terhadap peubah respon. Koefisien determinasi merupakan kuadrat dari koefisien korelasi sebagai ukuran untuk mengetahui kemampuan dari masing-masing peubah yang digunakan dalam penelitian. Nilai koefisien determinasi yang kecil berarti kemampuan peubah penjelas dalam menjelaskan peubah respon amat terbatas.

McFadden Pseudo R-squared intinya mengukur seberapa jauh kemampuan model dalam menerangkan variasi peubah respon. Nilai koefisien determinasi adalah antara 0 dan 1:

  1. McFadden R-squared semakin mendekati nilai 1 maka model telah dianggap semakin baik, atau semakin besar kemampuan model dalam menjelaskan perubahan perubahan dari peubah penjelas terhadap peubah respon.

  2. Jika McFadden R-squared semakin mendekati nilai 0 maka berarti semakin kecil kemampuan model dalam menjelaskan perubahan dari nilai variabel independen terhadap peubah respon dan model dianggap kurang baik (Ghozali, 2017).

reglog_multinom0 <- vglm(product ~ 1, family=multinomial(refLevel="none"), data=multinom_train)
LLf   <- logLik(reglog_multinom)
LL0   <- logLik(reglog_multinom0)

# McFadden pseudo-R2
cat("McFadden\t= ", as.vector(1 - (LLf / LL0)), "\n")
McFadden    =  0.3242949 
# Cox & Snell pseudo-R2
N <- nrow(multinom_train)
cat("Cox & Snell\t= ", as.vector(1 - exp((2/N) * (LL0 - LLf))), "\n")
Cox & Snell =  0.5721712 
# Nagelkerke pseudo-R2
cat("Nagelkerke\t= ", as.vector((1 - exp((2/N) * (LL0 - LLf))) / (1 - exp(LL0)^(2/N))), "\n")
Nagelkerke  =  0.5721712 

Prediksi Model

Hasil prediksi menggunakan data latih

multinom_pred_test <- predict(reglog_multinom, multinom_test, type="response")

# menambah kolom kelas prediksi
tbl_multinom_pred_test <- data.frame(multinom_pred_test)
tbl_multinom_pred_test$newclass <- levels(as.factor(multinom_train$product))[max.col(multinom_pred_test)]
kable(head(tbl_multinom_pred_test))
direct.debit e.account none pension newclass
0.3058285 0.3728430 0.3026629 0.0186656 e-account
0.3178217 0.3542616 0.3084877 0.0194290 e-account
0.0128270 0.0476523 0.9368211 0.0026996 none
0.2790161 0.4195045 0.2848335 0.0166459 e-account
0.3865516 0.2573039 0.3325727 0.0235717 direct debit
0.3822719 0.2640208 0.3305069 0.0232003 direct debit

Regresi Logistik Ordinal

Pada gugus data ini disediakan peubah yang memiliki atribut calon nasabah asuransi. Studi kasus yang akan dilakukan adalah memprediksi peubah Response pada masing-masing amatan di data latih. Peubah/kolom Response adalah ukuran ordinal (ordo) yang menunjukkan 8 resiko tingkat kesehatan seseorang.

Penjelasan peubah:

  • Product_Info_1-4:
    A set of normalized variables relating to the product applied for

  • BMI: Normalized BMI of applicant history of the applicant.

  • InsuredInfo_1-5: A set of normalized variables providing information about the applicant.

  • Insurance_History_1-3: A set of normalized variables relating to the insurance history of the applicant.

  • Family_Hist: Variables relating to the family history of the applicant.

  • Medical_History_1-6: A set of normalized variables relating to the medical history of the applicant.

  • Employment_Info: Variables relating to the employment

ordinal_train <- read.csv(file="https://github.com/alfanugraha/tsa-handson/raw/master/training_ordinal.csv", 
                header = T, 
                sep = ",")
ordinal_test <- read.csv(file="https://github.com/alfanugraha/tsa-handson/raw/master/test_ordinal.csv", 
                header = T, 
                sep = ",")
kable(head(ordinal_train))
BMI Product_Info_1 Product_Info_2 Product_Info_3 Product_Info_4 InsuredInfo_1 InsuredInfo_2 InsuredInfo_3 InsuredInfo_4 InsuredInfo_5 Insurance_History_1 Insurance_History_2 Insurance_History_3 Family_Hist Medical_History_1 Medical_History_2 Medical_History_3 Medical_History_4 Medical_History_5 Medical_History_6 Employment_Info Response
0.3419113 1 26 2 1 1 6 1 1 6 2 2 3 2 2 1 3 3 3 240 9 8
0.4066125 1 26 2 1 1 6 1 3 6 2 2 3 3 2 1 2 1 3 55 1 7
0.4241890 1 26 2 1 1 3 1 1 3 1 1 1 3 2 1 2 3 3 18 15 1
0.6340781 1 29 2 1 2 8 1 1 8 1 1 1 3 2 1 2 3 3 77 1 2
0.8153073 1 10 2 3 1 8 1 1 8 2 2 3 2 2 1 3 3 1 2 1 3
0.4149147 1 26 2 1 2 6 1 1 6 2 2 3 1 2 1 2 3 3 176 14 6

Pemodelan

reglog_ordinal <- vglm(Response ~ BMI + Product_Info_2 + InsuredInfo_3 + Medical_History_1 + Insurance_History_3 + Employment_Info + Family_Hist, family = propodds, data = ordinal_train)
sum_ord <- summary(reglog_ordinal)
sum_ord

Call:
vglm(formula = Response ~ BMI + Product_Info_2 + InsuredInfo_3 + 
    Medical_History_1 + Insurance_History_3 + Employment_Info + 
    Family_Hist, family = propodds, data = ordinal_train)

Coefficients: 
                     Estimate Std. Error z value Pr(>|z|)    
(Intercept):1        3.763450   0.431215   8.728  < 2e-16 ***
(Intercept):2        2.883124   0.427995   6.736 1.62e-11 ***
(Intercept):3        2.612465   0.427250   6.115 9.68e-10 ***
(Intercept):4        2.220794   0.426346   5.209 1.90e-07 ***
(Intercept):5        1.886840   0.425746   4.432 9.34e-06 ***
(Intercept):6        0.908360   0.424920   2.138 0.032539 *  
(Intercept):7        0.190009   0.425500   0.447 0.655197    
BMI                 -4.075292   0.300920 -13.543  < 2e-16 ***
Product_Info_2      -0.028867   0.007802  -3.700 0.000216 ***
InsuredInfo_3       -0.598242   0.124292  -4.813 1.49e-06 ***
Medical_History_1    0.328194   0.118412   2.772 0.005578 ** 
Insurance_History_3 -0.074967   0.039177  -1.914 0.055678 .  
Employment_Info      0.041459   0.008029   5.164 2.42e-07 ***
Family_Hist          0.183656   0.071449   2.570 0.010157 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Number of linear predictors:  7 

Names of linear predictors: logitlink(P[Y>=2]), logitlink(P[Y>=3]), 
logitlink(P[Y>=4]), logitlink(P[Y>=5]), logitlink(P[Y>=6]), logitlink(P[Y>=7]), 
logitlink(P[Y>=8])

Residual deviance: 9338.263 on 16786 degrees of freedom

Log-likelihood: -4669.132 on 16786 degrees of freedom

Number of Fisher scoring iterations: 5 

No Hauck-Donner effect found in any of the estimates


Exponentiated coefficients:
                BMI      Product_Info_2       InsuredInfo_3   Medical_History_1 
         0.01698726          0.97154540          0.54977724          1.38845790 
Insurance_History_3     Employment_Info         Family_Hist 
         0.92777407          1.04233068          1.20160223 

Uji koefisien dan uji model keseluruhan

coef(sum_ord)
                       Estimate  Std. Error     z value     Pr(>|z|)
(Intercept):1        3.76344963 0.431215211   8.7275438 2.602631e-18
(Intercept):2        2.88312431 0.427994856   6.7363527 1.624118e-11
(Intercept):3        2.61246539 0.427250350   6.1146009 9.679889e-10
(Intercept):4        2.22079405 0.426346444   5.2088954 1.899681e-07
(Intercept):5        1.88683969 0.425746243   4.4318411 9.343187e-06
(Intercept):6        0.90835974 0.424919717   2.1377209 3.253941e-02
(Intercept):7        0.19000861 0.425500049   0.4465537 6.551974e-01
BMI                 -4.07529155 0.300919536 -13.5427949 8.739926e-42
Product_Info_2      -0.02886728 0.007801746  -3.7001053 2.155100e-04
InsuredInfo_3       -0.59824209 0.124292119  -4.8131941 1.485369e-06
Medical_History_1    0.32819370 0.118412285   2.7716187 5.577833e-03
Insurance_History_3 -0.07496704 0.039176913  -1.9135514 5.567751e-02
Employment_Info      0.04145925 0.008028688   5.1638886 2.418717e-07
Family_Hist          0.18365585 0.071449440   2.5704310 1.015721e-02

Kinerja Model

reglog_ordinal_pred <- predict(reglog_ordinal, type="response")
kable(head(reglog_ordinal_pred))
1 2 3 4 5 6 7 8
0.1003627 0.1116433 0.0487165 0.0821441 0.0786401 0.2381778 0.1393450 0.2009705
0.1441172 0.1446906 0.0585843 0.0931774 0.0831884 0.2215199 0.1118865 0.1428358
0.0801551 0.0935036 0.0423199 0.0735683 0.0731608 0.2395423 0.1541790 0.2435710
0.2853961 0.2052237 0.0674011 0.0932897 0.0715628 0.1511752 0.0603062 0.0656453
0.4027122 0.2164865 0.0614615 0.0785728 0.0557069 0.1064161 0.0386906 0.0399536
0.1279361 0.1334057 0.0554949 0.0900921 0.0823924 0.2289209 0.1212036 0.1605543
# menambahkan kolom kelas aktual
tbl_ord <- data.frame(reglog_ordinal_pred, ordinal_train$Response)
kable(head(tbl_ord))
X1 X2 X3 X4 X5 X6 X7 X8 ordinal_train.Response
0.1003627 0.1116433 0.0487165 0.0821441 0.0786401 0.2381778 0.1393450 0.2009705 8
0.1441172 0.1446906 0.0585843 0.0931774 0.0831884 0.2215199 0.1118865 0.1428358 7
0.0801551 0.0935036 0.0423199 0.0735683 0.0731608 0.2395423 0.1541790 0.2435710 1
0.2853961 0.2052237 0.0674011 0.0932897 0.0715628 0.1511752 0.0603062 0.0656453 2
0.4027122 0.2164865 0.0614615 0.0785728 0.0557069 0.1064161 0.0386906 0.0399536 3
0.1279361 0.1334057 0.0554949 0.0900921 0.0823924 0.2289209 0.1212036 0.1605543 6
# menambah kolom kelas prediksi
tbl_ord$prediksi <- as.numeric(levels(as.factor(ordinal_train$Response)))[max.col(reglog_ordinal_pred)]
kable(head(tbl_ord))
X1 X2 X3 X4 X5 X6 X7 X8 ordinal_train.Response prediksi
0.1003627 0.1116433 0.0487165 0.0821441 0.0786401 0.2381778 0.1393450 0.2009705 8 6
0.1441172 0.1446906 0.0585843 0.0931774 0.0831884 0.2215199 0.1118865 0.1428358 7 6
0.0801551 0.0935036 0.0423199 0.0735683 0.0731608 0.2395423 0.1541790 0.2435710 1 8
0.2853961 0.2052237 0.0674011 0.0932897 0.0715628 0.1511752 0.0603062 0.0656453 2 1
0.4027122 0.2164865 0.0614615 0.0785728 0.0557069 0.1064161 0.0386906 0.0399536 3 1
0.1279361 0.1334057 0.0554949 0.0900921 0.0823924 0.2289209 0.1212036 0.1605543 6 6

Confusion matrix

ordinal_prop <- table(tbl_ord$prediksi, tbl_ord$ordinal_train.Response)
ordinal_prop
   
      1   2   3   4   5   6   7   8
  1 129 132  41   7  99  79  28   7
  6 214 197  77 154  76 368 204 258
  8  42  13  12  50  14  52  33 114
round(prop.table(ordinal_prop, 1), 3)
   
        1     2     3     4     5     6     7     8
  1 0.247 0.253 0.079 0.013 0.190 0.151 0.054 0.013
  6 0.138 0.127 0.050 0.099 0.049 0.238 0.132 0.167
  8 0.127 0.039 0.036 0.152 0.042 0.158 0.100 0.345
mean(tbl_ord$prediksi == ordinal_train$Response)
[1] 0.2545833

Deviance, Log-likelihood, AIC

cat("deviance\t= ", deviance(reglog_ordinal), "\n")
deviance    =  9338.263 
cat("logLik\t\t= ", logLik(reglog_ordinal), "\n")
logLik      =  -4669.131 
cat("AIC\t\t= ", AIC(reglog_ordinal), "\n")
AIC     =  9366.263 

McFadden, Cox & Snell and Nagelkerke Pseudo R-squared

reglog_ordinal0 <- vglm(Response ~ 1, family=propodds, data=ordinal_train)
LLf_ord   <- logLik(reglog_ordinal)
LL0_ord   <- logLik(reglog_ordinal0)

# McFadden pseudo-R2
cat("McFadden\t= ", as.vector(1 - (LLf_ord / LL0_ord)), "\n")
McFadden    =  0.02938282 
# Cox & Snell pseudo-R2
N_ord <- nrow(ordinal_train)
cat("Cox & Snell\t= ", as.vector(1 - exp((2/N_ord) * (LL0_ord - LLf_ord))), "\n")
Cox & Snell =  0.1111154 
# Nagelkerke pseudo-R2
cat("Nagelkerke\t= ", as.vector((1 - exp((2/N_ord) * (LL0_ord - LLf_ord))) / (1 - exp(LL0_ord)^(2/N_ord))), "\n")
Nagelkerke  =  0.1111154 

Prediksi Model

ordinal_test_filter <- ordinal_test[c("BMI", "Product_Info_2", "InsuredInfo_3", "Medical_History_1", "Insurance_History_3", "Employment_Info", "Family_Hist")]
kable(head(ordinal_test_filter))
BMI Product_Info_2 InsuredInfo_3 Medical_History_1 Insurance_History_3 Employment_Info Family_Hist
0.2634011 26 1 2 3 9 3
0.7508862 26 1 2 1 12 2
0.3736277 26 1 2 3 9 3
0.4592303 26 1 2 3 14 2
0.4867665 26 1 2 1 9 3
0.4472439 26 1 2 3 12 2
pred_ordinal_test <- predict(reglog_ordinal, ordinal_test, type="response")
# menambah kolom kelas prediksi
tbl_ord_test <- data.frame(pred_ordinal_test)
tbl_ord_test$newclass <- as.numeric(levels(as.factor(ordinal_train$Response)))[max.col(pred_ordinal_test)]
kable(head(tbl_ord_test))
X1 X2 X3 X4 X5 X6 X7 X8 newclass
0.0631619 0.0766944 0.0358338 0.0640411 0.0659901 0.2337664 0.1666360 0.2938762 8
0.3098499 0.2100157 0.0667915 0.0907397 0.0682994 0.1406820 0.0548000 0.0588218 1
0.0955564 0.1075036 0.0473139 0.0803422 0.0775848 0.2390662 0.1427841 0.2098487 6
0.1275955 0.1331567 0.0554233 0.0900159 0.0823660 0.2290656 0.1214102 0.1609667 6
0.1260368 0.1320112 0.0550920 0.0896608 0.0822401 0.2297211 0.1223611 0.1628768 6
0.1314364 0.1359365 0.0562146 0.0908472 0.0826383 0.2274026 0.1191061 0.1564183 6