Regresi Logistik

Author

Alfa Nugraha

Persiapan data

germancredit <- read.csv(file="https://github.com/alfanugraha/tsa-handson/raw/master/german_credit.csv", 
                header = T, 
                sep = ",")
str(germancredit)

'data.frame':   1000 obs. of  21 variables:
 $ creditability          : int  1 1 1 1 1 1 1 1 1 1 ...
 $ account_balance        : int  1 1 2 1 1 1 1 1 4 2 ...
 $ duration               : int  18 9 12 12 12 10 8 6 18 24 ...
 $ payment_status         : int  4 4 2 4 4 4 4 4 4 2 ...
 $ purpose                : int  2 0 9 0 0 0 0 0 3 3 ...
 $ credit_amount          : int  1049 2799 841 2122 2171 2241 3398 1361 1098 3758 ...
 $ stocks                 : int  1 1 2 1 1 1 1 1 1 3 ...
 $ length_employment      : int  2 3 4 3 3 2 4 2 1 1 ...
 $ instalment_percent     : int  4 2 2 3 4 1 1 2 4 1 ...
 $ sex_marital_status     : int  2 3 2 3 3 3 3 3 2 2 ...
 $ guarantors             : int  1 1 1 1 1 1 1 1 1 1 ...
 $ duration_currentaddress: int  4 2 4 2 4 3 4 4 4 4 ...
 $ most_available_asset   : int  2 1 1 1 2 1 1 1 3 4 ...
 $ age                    : int  21 36 23 39 38 48 39 40 65 23 ...
 $ concurrent_credits     : int  3 3 3 3 1 3 3 3 3 3 ...
 $ type_apartment         : int  1 1 1 1 2 1 2 2 2 1 ...
 $ no_credits             : int  1 2 1 2 2 2 2 1 2 1 ...
 $ cccupation             : int  3 3 2 2 2 2 2 2 1 1 ...
 $ no_dependents          : int  1 2 1 2 1 2 1 2 1 1 ...
 $ telephone              : int  1 1 1 1 1 1 1 1 1 1 ...
 $ foreign_worker         : int  1 1 1 2 2 2 2 2 1 1 ...

Tentang gugus data German Credit.

Account Balance: No account (1), None (No balance) (2), Some Balance (3)
Payment Status: Some Problems (1), Paid Up (2), No Problems (in this bank) (3)
Savings/Stock Value: None, Below 100 DM, [100, 1000] DM, Above 1000 DM
Employment Length: Below 1 year (including unemployed), [1, 4], [4, 7], Above 7
Sex/Marital Status: Male Divorced/Single, Male Married/Widowed, Female
No of Credits at this bank: 1, More than 1
Guarantor: None, Yes
Concurrent Credits: Other Banks or Dept Stores, None
ForeignWorker variable may be dropped from the study
Purpose of Credit: New car, Used car, Home Related, Other

kable(head(germancredit))

creditability	account_balance	duration	payment_status	purpose	credit_amount	stocks	length_employment	instalment_percent	sex_marital_status	guarantors	duration_currentaddress	most_available_asset	age	concurrent_credits	type_apartment	no_credits	cccupation	no_dependents	telephone	foreign_worker
1	1	18	4	2	1049	1	2	4	2	1	4	2	21	3	1	1	3	1	1	1
1	1	9	4	0	2799	1	3	2	3	1	2	1	36	3	1	2	3	2	1	1
1	2	12	2	9	841	2	4	2	2	1	4	1	23	3	1	1	2	1	1	1
1	1	12	4	0	2122	1	3	3	3	1	2	1	39	3	1	2	2	2	1	2
1	1	12	4	0	2171	1	3	4	3	1	4	2	38	1	2	2	2	1	1	2
1	1	10	4	0	2241	1	2	1	3	1	3	1	48	3	1	2	2	2	1	2

skimr::skim(germancredit)

Data summary
Name	germancredit
Number of rows	1000
Number of columns	21
_______________________
Column type frequency:
numeric	21
________________________
Group variables	None

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
creditability	1	0.70	0.46	0	0.0	1.0	1.00	1	▃▁▁▁▇
account_balance	1	2.58	1.26	1	1.0	2.0	4.00	4	▆▆▁▁▇
duration	1	20.90	12.06	4	12.0	18.0	24.00	72	▇▇▂▁▁
payment_status	1	2.54	1.08	0	2.0	2.0	4.00	4	▁▁▇▁▅
purpose	1	2.83	2.74	0	1.0	2.0	3.00	10	▇▅▁▁▂
credit_amount	1	3271.25	2822.75	250	1365.5	2319.5	3972.25	18424	▇▂▁▁▁
stocks	1	2.10	1.58	1	1.0	1.0	3.00	5	▇▂▁▁▂
length_employment	1	3.38	1.21	1	3.0	3.0	5.00	5	▂▅▇▅▆
instalment_percent	1	2.97	1.12	1	2.0	3.0	4.00	4	▂▃▁▂▇
sex_marital_status	1	2.68	0.71	1	2.0	3.0	3.00	4	▁▅▁▇▂
guarantors	1	1.15	0.48	1	1.0	1.0	1.00	3	▇▁▁▁▁
duration_currentaddress	1	2.85	1.10	1	2.0	3.0	4.00	4	▂▆▁▃▇
most_available_asset	1	2.36	1.05	1	1.0	2.0	3.00	4	▇▆▁▇▃
age	1	35.54	11.35	19	27.0	33.0	42.00	75	▇▆▃▁▁
concurrent_credits	1	2.67	0.71	1	3.0	3.0	3.00	3	▂▁▁▁▇
type_apartment	1	1.93	0.53	1	2.0	2.0	2.00	3	▂▁▇▁▁
no_credits	1	1.41	0.58	1	1.0	1.0	2.00	4	▇▅▁▁▁
cccupation	1	2.90	0.65	1	3.0	3.0	3.00	4	▁▂▁▇▂
no_dependents	1	1.16	0.36	1	1.0	1.0	1.00	2	▇▁▁▁▂
telephone	1	1.40	0.49	1	1.0	1.0	2.00	2	▇▁▁▁▆
foreign_worker	1	1.04	0.19	1	1.0	1.0	1.00	2	▇▁▁▁▁

Konversi beberapa data numerik sebagai data kategorik

germancredit$creditability <- as.factor(germancredit$creditability)
germancredit$account_balance <- as.factor(germancredit$account_balance)
germancredit$payment_status <- as.factor(germancredit$payment_status)
germancredit$stocks <- as.factor(germancredit$stocks)
germancredit$length_employment <- as.factor(germancredit$length_employment)
germancredit$sex_marital_status <- as.factor(germancredit$sex_marital_status)
germancredit$no_credits <- as.factor(germancredit$no_credits)
germancredit$foreign_worker <- as.factor(germancredit$foreign_worker)
germancredit$purpose <- as.factor(germancredit$purpose)

Statistik deskriptif

summary(germancredit)

 creditability account_balance    duration    payment_status    purpose   
 0:300         1:274           Min.   : 4.0   0: 40          3      :280  
 1:700         2:269           1st Qu.:12.0   1: 49          0      :234  
               3: 63           Median :18.0   2:530          2      :181  
               4:394           Mean   :20.9   3: 88          1      :103  
                               3rd Qu.:24.0   4:293          9      : 97  
                               Max.   :72.0                  6      : 50  
                                                             (Other): 55  
 credit_amount   stocks  length_employment instalment_percent
 Min.   :  250   1:603   1: 62             Min.   :1.000     
 1st Qu.: 1366   2:103   2:172             1st Qu.:2.000     
 Median : 2320   3: 63   3:339             Median :3.000     
 Mean   : 3271   4: 48   4:174             Mean   :2.973     
 3rd Qu.: 3972   5:183   5:253             3rd Qu.:4.000     
 Max.   :18424                             Max.   :4.000     
                                                             
 sex_marital_status   guarantors    duration_currentaddress
 1: 50              Min.   :1.000   Min.   :1.000          
 2:310              1st Qu.:1.000   1st Qu.:2.000          
 3:548              Median :1.000   Median :3.000          
 4: 92              Mean   :1.145   Mean   :2.845          
                    3rd Qu.:1.000   3rd Qu.:4.000          
                    Max.   :3.000   Max.   :4.000          
                                                           
 most_available_asset      age        concurrent_credits type_apartment 
 Min.   :1.000        Min.   :19.00   Min.   :1.000      Min.   :1.000  
 1st Qu.:1.000        1st Qu.:27.00   1st Qu.:3.000      1st Qu.:2.000  
 Median :2.000        Median :33.00   Median :3.000      Median :2.000  
 Mean   :2.358        Mean   :35.54   Mean   :2.675      Mean   :1.928  
 3rd Qu.:3.000        3rd Qu.:42.00   3rd Qu.:3.000      3rd Qu.:2.000  
 Max.   :4.000        Max.   :75.00   Max.   :3.000      Max.   :3.000  
                                                                        
 no_credits   cccupation    no_dependents     telephone     foreign_worker
 1:633      Min.   :1.000   Min.   :1.000   Min.   :1.000   1:963         
 2:333      1st Qu.:3.000   1st Qu.:1.000   1st Qu.:1.000   2: 37         
 3: 28      Median :3.000   Median :1.000   Median :1.000                 
 4:  6      Mean   :2.904   Mean   :1.155   Mean   :1.404                 
            3rd Qu.:3.000   3rd Qu.:1.000   3rd Qu.:2.000                 
            Max.   :4.000   Max.   :2.000   Max.   :2.000

prop.table(table(germancredit$creditability))


  0   1 
0.3 0.7

Data tersebut memiliki 30% pemohon kredit dengan kategori buruk, 70% sisanya dengan kategori baik

library(DataExplorer)
plot_density(germancredit)

plot_bar(germancredit)

Boxplot peubah numerik

ggplot(germancredit, aes(x=creditability, y=credit_amount, fill=creditability)) +
  geom_boxplot()

Grafik batang peubah kategorik

ggplot(germancredit, aes(account_balance, after_stat(count))) +
  geom_bar(aes(fill=creditability), position = "dodge")

Partisi/splitting data

# generate indeks data
indeks <- sample(1:nrow(germancredit), size = 0.7 * nrow(germancredit))

# partisi 70%
train70 <- germancredit[indeks, ] # berisi 70%
test30 <- germancredit[-indeks, ] # berisi 30%

Regresi Logistik Biner

Sebagai pengantar pada regresi klasik, sisaan \epsilon diasumsikan menyebar normal, yang berarti Y juga menyebar normal. Namun pada kasus tertentu Y dapat berupa kategorik, misalnya kasus biner.

Makna garis lurus jika digunakan pada kasus biner bentuk pola prediksi Y menjadi jauh lebih besar dari 1 (positif), atau bisa juga menjadi sangat negatif sehingga dianggap tidak sesuai dengan hasil sebenarnya.

Odds adalah rasio peluang sukses dibagi peluang gagal, dimana peluang sukses > peluang gagal. Nilai odd disebut juga nilai logit

Logit adalah nilai log dari odds yang sama dengan \alpha + \beta X.

Pada regresi linear jika nilai X naik satu satuan maka \beta meningkat satu satuan. Pada logit jika nilai X naik satu satuan maka odds meningkat e^{\beta} satuan. Merasiokan odd atau odds ratio sama saja dengan nilai e^{\beta}.

Jika odd ratio bernilai 2, maknanya bahwa odds pada saat X=x+1 sama dengan 2 kali dari odds pada saat X=x. Kesimpulannya adalah peluang sukses terjadi 2 kali dari kejadian peluang gagal.

Pemodelan

reglog <- glm(creditability ~ ., family = binomial, data=train70)
summary(reglog)


Call:
glm(formula = creditability ~ ., family = binomial, data = train70)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.9954  -0.6297   0.3459   0.7204   1.9740  

Coefficients:
                          Estimate Std. Error z value Pr(>|z|)    
(Intercept)             -1.451e+00  1.315e+00  -1.103 0.269977    
account_balance2         2.675e-01  2.693e-01   0.993 0.320696    
account_balance3         1.760e+00  5.228e-01   3.367 0.000760 ***
account_balance4         1.449e+00  2.697e-01   5.373 7.74e-08 ***
duration                -3.090e-02  1.147e-02  -2.695 0.007044 ** 
payment_status1          4.615e-01  6.794e-01   0.679 0.496941    
payment_status2          1.048e+00  5.450e-01   1.922 0.054575 .  
payment_status3          1.327e+00  5.804e-01   2.286 0.022251 *  
payment_status4          2.149e+00  5.529e-01   3.886 0.000102 ***
purpose1                 2.421e+00  5.296e-01   4.571 4.85e-06 ***
purpose2                 5.938e-01  3.119e-01   1.903 0.056976 .  
purpose3                 7.439e-01  2.925e-01   2.543 0.010977 *  
purpose4                 6.324e-01  1.073e+00   0.590 0.555511    
purpose5                -4.368e-01  7.192e-01  -0.607 0.543641    
purpose6                -7.236e-02  4.886e-01  -0.148 0.882257    
purpose8                 1.756e+00  1.253e+00   1.401 0.161209    
purpose9                 1.084e+00  4.258e-01   2.547 0.010881 *  
purpose10                2.590e+00  1.075e+00   2.410 0.015941 *  
credit_amount           -1.649e-04  5.567e-05  -2.962 0.003055 ** 
stocks2                  4.410e-01  3.617e-01   1.220 0.222652    
stocks3                  4.599e-01  4.531e-01   1.015 0.310134    
stocks4                  1.495e+00  6.270e-01   2.385 0.017075 *  
stocks5                  1.323e+00  3.364e-01   3.934 8.35e-05 ***
length_employment2      -6.268e-02  5.027e-01  -0.125 0.900772    
length_employment3      -6.413e-02  4.724e-01  -0.136 0.892011    
length_employment4       1.033e+00  5.248e-01   1.969 0.049009 *  
length_employment5       1.659e-01  4.951e-01   0.335 0.737598    
instalment_percent      -3.308e-01  1.093e-01  -3.026 0.002478 ** 
sex_marital_status2      1.300e-01  4.733e-01   0.275 0.783578    
sex_marital_status3      6.563e-01  4.588e-01   1.430 0.152641    
sex_marital_status4      3.579e-01  5.790e-01   0.618 0.536546    
guarantors               4.080e-01  2.325e-01   1.755 0.079273 .  
duration_currentaddress -3.643e-02  1.049e-01  -0.347 0.728338    
most_available_asset    -1.124e-01  1.244e-01  -0.904 0.366163    
age                      5.118e-03  1.090e-02   0.469 0.638806    
concurrent_credits       1.985e-01  1.550e-01   1.281 0.200247    
type_apartment           1.357e-01  2.257e-01   0.601 0.547639    
no_credits2             -3.524e-01  2.868e-01  -1.229 0.219034    
no_credits3              1.361e-01  8.118e-01   0.168 0.866889    
no_credits4             -2.236e-01  1.144e+00  -0.195 0.845070    
cccupation              -6.756e-03  1.920e-01  -0.035 0.971931    
no_dependents           -2.852e-01  3.058e-01  -0.933 0.351030    
telephone                3.329e-01  2.472e-01   1.347 0.178139    
foreign_worker2          1.348e+00  7.357e-01   1.833 0.066851 .  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 850.06  on 699  degrees of freedom
Residual deviance: 608.53  on 656  degrees of freedom
AIC: 696.53

Number of Fisher Scoring iterations: 5

reglog2 <- glm(creditability ~ account_balance + credit_amount + instalment_percent + stocks + purpose, 
              family = binomial, 
              data=train70)
summary(reglog2)


Call:
glm(formula = creditability ~ account_balance + credit_amount + 
    instalment_percent + stocks + purpose, family = binomial, 
    data = train70)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.5504  -0.9081   0.4699   0.7474   2.0260  

Coefficients:
                     Estimate Std. Error z value Pr(>|z|)    
(Intercept)         1.209e+00  3.872e-01   3.123 0.001793 ** 
account_balance2    3.134e-01  2.405e-01   1.303 0.192586    
account_balance3    1.555e+00  4.789e-01   3.247 0.001168 ** 
account_balance4    1.607e+00  2.447e-01   6.565 5.20e-11 ***
credit_amount      -2.141e-04  3.902e-05  -5.487 4.09e-08 ***
instalment_percent -3.328e-01  9.248e-02  -3.598 0.000320 ***
stocks2             1.451e-01  3.184e-01   0.456 0.648523    
stocks3             4.429e-01  4.334e-01   1.022 0.306882    
stocks4             1.367e+00  5.727e-01   2.386 0.017010 *  
stocks5             1.184e+00  3.057e-01   3.874 0.000107 ***
purpose1            2.087e+00  4.803e-01   4.345 1.39e-05 ***
purpose2            3.035e-01  2.803e-01   1.083 0.278888    
purpose3            5.349e-01  2.581e-01   2.072 0.038235 *  
purpose4            1.453e-01  9.702e-01   0.150 0.880930    
purpose5           -7.224e-01  6.509e-01  -1.110 0.267088    
purpose6           -1.922e-01  4.483e-01  -0.429 0.668138    
purpose8            9.535e-01  1.178e+00   0.809 0.418228    
purpose9            4.203e-01  3.527e-01   1.192 0.233343    
purpose10           2.067e+00  8.968e-01   2.305 0.021149 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 850.06  on 699  degrees of freedom
Residual deviance: 689.88  on 681  degrees of freedom
AIC: 727.88

Number of Fisher Scoring iterations: 5

Seleksi fitur dengan stepwise

reglog_step <- step(reglog)

Start:  AIC=696.53
creditability ~ account_balance + duration + payment_status + 
    purpose + credit_amount + stocks + length_employment + instalment_percent + 
    sex_marital_status + guarantors + duration_currentaddress + 
    most_available_asset + age + concurrent_credits + type_apartment + 
    no_credits + cccupation + no_dependents + telephone + foreign_worker

                          Df Deviance    AIC
- no_credits               3   610.28 692.28
- cccupation               1   608.53 694.53
- duration_currentaddress  1   608.65 694.65
- age                      1   608.75 694.75
- type_apartment           1   608.89 694.89
- most_available_asset     1   609.35 695.35
- no_dependents            1   609.39 695.39
- sex_marital_status       3   613.76 695.76
- concurrent_credits       1   610.15 696.15
- telephone                1   610.36 696.36
<none>                         608.53 696.53
- guarantors               1   611.79 697.79
- foreign_worker           1   612.72 698.72
- length_employment        4   621.11 701.11
- duration                 1   615.88 701.88
- credit_amount            1   617.47 703.47
- instalment_percent       1   617.99 703.99
- stocks                   4   630.76 710.76
- payment_status           4   632.21 712.21
- purpose                  9   646.68 716.68
- account_balance          3   649.61 731.61

Step:  AIC=692.28
creditability ~ account_balance + duration + payment_status + 
    purpose + credit_amount + stocks + length_employment + instalment_percent + 
    sex_marital_status + guarantors + duration_currentaddress + 
    most_available_asset + age + concurrent_credits + type_apartment + 
    cccupation + no_dependents + telephone + foreign_worker

                          Df Deviance    AIC
- cccupation               1   610.28 690.28
- duration_currentaddress  1   610.36 690.36
- age                      1   610.43 690.43
- type_apartment           1   610.79 690.79
- most_available_asset     1   611.00 691.00
- no_dependents            1   611.11 691.11
- sex_marital_status       3   615.38 691.38
- telephone                1   612.11 692.11
- concurrent_credits       1   612.22 692.22
<none>                         610.28 692.28
- guarantors               1   613.40 693.40
- foreign_worker           1   615.09 695.09
- length_employment        4   622.80 696.80
- duration                 1   617.69 697.69
- credit_amount            1   619.00 699.00
- instalment_percent       1   619.77 699.77
- stocks                   4   632.52 706.52
- payment_status           4   632.99 706.99
- purpose                  9   648.57 712.57
- account_balance          3   650.66 726.66

Step:  AIC=690.28
creditability ~ account_balance + duration + payment_status + 
    purpose + credit_amount + stocks + length_employment + instalment_percent + 
    sex_marital_status + guarantors + duration_currentaddress + 
    most_available_asset + age + concurrent_credits + type_apartment + 
    no_dependents + telephone + foreign_worker

                          Df Deviance    AIC
- duration_currentaddress  1   610.36 688.36
- age                      1   610.43 688.43
- type_apartment           1   610.79 688.79
- most_available_asset     1   611.03 689.03
- no_dependents            1   611.11 689.11
- sex_marital_status       3   615.41 689.41
- concurrent_credits       1   612.22 690.22
- telephone                1   612.28 690.28
<none>                         610.28 690.28
- guarantors               1   613.40 691.40
- foreign_worker           1   615.09 693.09
- length_employment        4   622.86 694.86
- duration                 1   617.72 695.72
- credit_amount            1   619.12 697.12
- instalment_percent       1   620.02 698.02
- stocks                   4   632.53 704.53
- payment_status           4   633.01 705.01
- purpose                  9   649.18 711.18
- account_balance          3   650.73 724.73

Step:  AIC=688.36
creditability ~ account_balance + duration + payment_status + 
    purpose + credit_amount + stocks + length_employment + instalment_percent + 
    sex_marital_status + guarantors + most_available_asset + 
    age + concurrent_credits + type_apartment + no_dependents + 
    telephone + foreign_worker

                       Df Deviance    AIC
- age                   1   610.48 686.48
- type_apartment        1   610.93 686.93
- most_available_asset  1   611.19 687.19
- no_dependents         1   611.19 687.19
- sex_marital_status    3   615.52 687.52
- concurrent_credits    1   612.29 688.29
- telephone             1   612.31 688.31
<none>                      610.36 688.36
- guarantors            1   613.48 689.48
- foreign_worker        1   615.19 691.19
- length_employment     4   622.87 692.87
- duration              1   617.85 693.85
- credit_amount         1   619.12 695.12
- instalment_percent    1   620.06 696.06
- stocks                4   632.53 702.53
- payment_status        4   633.03 703.03
- purpose               9   649.26 709.26
- account_balance       3   651.62 723.62

Step:  AIC=686.48
creditability ~ account_balance + duration + payment_status + 
    purpose + credit_amount + stocks + length_employment + instalment_percent + 
    sex_marital_status + guarantors + most_available_asset + 
    concurrent_credits + type_apartment + no_dependents + telephone + 
    foreign_worker

                       Df Deviance    AIC
- type_apartment        1   611.16 685.16
- no_dependents         1   611.29 685.29
- most_available_asset  1   611.29 685.29
- sex_marital_status    3   615.70 685.70
- concurrent_credits    1   612.40 686.40
<none>                      610.48 686.48
- telephone             1   612.56 686.56
- guarantors            1   613.79 687.79
- foreign_worker        1   615.34 689.34
- length_employment     4   623.06 691.06
- duration              1   618.10 692.10
- credit_amount         1   619.32 693.32
- instalment_percent    1   620.10 694.10
- stocks                4   632.99 700.99
- payment_status        4   633.28 701.28
- purpose               9   649.29 707.29
- account_balance       3   651.88 721.88

Step:  AIC=685.16
creditability ~ account_balance + duration + payment_status + 
    purpose + credit_amount + stocks + length_employment + instalment_percent + 
    sex_marital_status + guarantors + most_available_asset + 
    concurrent_credits + no_dependents + telephone + foreign_worker

                       Df Deviance    AIC
- most_available_asset  1   611.62 683.62
- no_dependents         1   611.93 683.93
- concurrent_credits    1   613.00 685.00
- sex_marital_status    3   617.00 685.00
<none>                      611.16 685.16
- telephone             1   613.23 685.23
- guarantors            1   614.50 686.50
- foreign_worker        1   615.84 687.84
- length_employment     4   623.32 689.32
- duration              1   618.62 690.62
- credit_amount         1   619.82 691.82
- instalment_percent    1   620.55 692.55
- stocks                4   633.24 699.24
- payment_status        4   634.74 700.74
- purpose               9   650.09 706.09
- account_balance       3   653.59 721.59

Step:  AIC=683.62
creditability ~ account_balance + duration + payment_status + 
    purpose + credit_amount + stocks + length_employment + instalment_percent + 
    sex_marital_status + guarantors + concurrent_credits + no_dependents + 
    telephone + foreign_worker

                     Df Deviance    AIC
- no_dependents       1   612.35 682.35
- sex_marital_status  3   617.31 683.31
- telephone           1   613.44 683.44
- concurrent_credits  1   613.60 683.60
<none>                    611.62 683.62
- guarantors          1   615.20 685.20
- foreign_worker      1   616.56 686.56
- length_employment   4   624.18 688.18
- duration            1   619.90 689.90
- credit_amount       1   620.65 690.65
- instalment_percent  1   621.42 691.42
- stocks              4   633.99 697.99
- payment_status      4   635.81 699.81
- purpose             9   650.87 704.87
- account_balance     3   654.41 720.41

Step:  AIC=682.35
creditability ~ account_balance + duration + payment_status + 
    purpose + credit_amount + stocks + length_employment + instalment_percent + 
    sex_marital_status + guarantors + concurrent_credits + telephone + 
    foreign_worker

                     Df Deviance    AIC
- sex_marital_status  3   617.38 681.38
- telephone           1   614.28 682.28
<none>                    612.35 682.35
- concurrent_credits  1   614.54 682.54
- guarantors          1   615.69 683.69
- foreign_worker      1   617.31 685.31
- length_employment   4   625.03 687.03
- duration            1   620.66 688.66
- credit_amount       1   621.06 689.06
- instalment_percent  1   621.72 689.72
- stocks              4   634.38 696.38
- payment_status      4   636.68 698.68
- purpose             9   651.55 703.55
- account_balance     3   655.46 719.46

Step:  AIC=681.38
creditability ~ account_balance + duration + payment_status + 
    purpose + credit_amount + stocks + length_employment + instalment_percent + 
    guarantors + concurrent_credits + telephone + foreign_worker

                     Df Deviance    AIC
- concurrent_credits  1   618.80 680.80
- telephone           1   618.90 680.90
<none>                    617.38 681.38
- guarantors          1   620.54 682.54
- foreign_worker      1   622.92 684.92
- credit_amount       1   625.04 687.04
- instalment_percent  1   625.11 687.11
- duration            1   625.61 687.61
- length_employment   4   632.04 688.04
- stocks              4   638.42 694.42
- payment_status      4   642.69 698.69
- purpose             9   656.57 702.57
- account_balance     3   661.21 719.21

Step:  AIC=680.8
creditability ~ account_balance + duration + payment_status + 
    purpose + credit_amount + stocks + length_employment + instalment_percent + 
    guarantors + telephone + foreign_worker

                     Df Deviance    AIC
- telephone           1   620.23 680.23
<none>                    618.80 680.80
- guarantors          1   622.08 682.08
- foreign_worker      1   624.14 684.14
- credit_amount       1   626.37 686.37
- instalment_percent  1   626.49 686.49
- duration            1   627.13 687.13
- length_employment   4   633.85 687.85
- stocks              4   639.31 693.31
- purpose             9   657.43 701.43
- payment_status      4   648.07 702.07
- account_balance     3   662.13 718.13

Step:  AIC=680.23
creditability ~ account_balance + duration + payment_status + 
    purpose + credit_amount + stocks + length_employment + instalment_percent + 
    guarantors + foreign_worker

                     Df Deviance    AIC
<none>                    620.23 680.23
- guarantors          1   623.40 681.40
- foreign_worker      1   625.51 683.51
- credit_amount       1   626.81 684.81
- instalment_percent  1   627.41 685.41
- duration            1   628.76 686.76
- length_employment   4   635.86 687.86
- stocks              4   641.33 693.33
- payment_status      4   649.73 701.73
- purpose             9   660.21 702.21
- account_balance     3   665.35 719.35

summary(reglog_step)


Call:
glm(formula = creditability ~ account_balance + duration + payment_status + 
    purpose + credit_amount + stocks + length_employment + instalment_percent + 
    guarantors + foreign_worker, family = binomial, data = train70)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.8803  -0.6574   0.3820   0.7053   2.1417  

Coefficients:
                     Estimate Std. Error z value Pr(>|z|)    
(Intercept)        -7.678e-01  7.848e-01  -0.978 0.327880    
account_balance2    3.033e-01  2.628e-01   1.154 0.248538    
account_balance3    1.826e+00  5.108e-01   3.575 0.000351 ***
account_balance4    1.475e+00  2.619e-01   5.632 1.79e-08 ***
duration           -3.230e-02  1.114e-02  -2.899 0.003739 ** 
payment_status1     5.051e-01  6.353e-01   0.795 0.426633    
payment_status2     1.285e+00  5.033e-01   2.554 0.010664 *  
payment_status3     1.408e+00  5.622e-01   2.505 0.012242 *  
payment_status4     2.243e+00  5.320e-01   4.215 2.50e-05 ***
purpose1            2.393e+00  5.132e-01   4.662 3.13e-06 ***
purpose2            5.455e-01  3.003e-01   1.817 0.069282 .  
purpose3            7.808e-01  2.840e-01   2.749 0.005983 ** 
purpose4            7.277e-01  1.055e+00   0.690 0.490408    
purpose5           -4.945e-01  6.915e-01  -0.715 0.474548    
purpose6           -1.250e-01  4.800e-01  -0.260 0.794491    
purpose8            1.509e+00  1.221e+00   1.235 0.216787    
purpose9            1.092e+00  4.119e-01   2.652 0.007994 ** 
purpose10           2.266e+00  9.755e-01   2.323 0.020191 *  
credit_amount      -1.323e-04  5.189e-05  -2.551 0.010755 *  
stocks2             3.121e-01  3.477e-01   0.897 0.369490    
stocks3             4.796e-01  4.498e-01   1.066 0.286361    
stocks4             1.391e+00  6.114e-01   2.275 0.022934 *  
stocks5             1.257e+00  3.269e-01   3.846 0.000120 ***
length_employment2 -2.231e-01  4.877e-01  -0.457 0.647330    
length_employment3 -1.184e-01  4.603e-01  -0.257 0.796991    
length_employment4  1.033e+00  5.117e-01   2.018 0.043559 *  
length_employment5  1.281e-01  4.748e-01   0.270 0.787342    
instalment_percent -2.738e-01  1.035e-01  -2.646 0.008144 ** 
guarantors          3.855e-01  2.230e-01   1.729 0.083892 .  
foreign_worker2     1.436e+00  7.088e-01   2.026 0.042778 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 850.06  on 699  degrees of freedom
Residual deviance: 620.23  on 670  degrees of freedom
AIC: 680.23

Number of Fisher Scoring iterations: 5

Final model

reglog_final <- glm(formula = creditability ~ account_balance + payment_status + 
    purpose + credit_amount + stocks + length_employment + instalment_percent + 
    most_available_asset + concurrent_credits + type_apartment + telephone + foreign_worker, 
    family = binomial, data = train70)
summary(reglog_final)


Call:
glm(formula = creditability ~ account_balance + payment_status + 
    purpose + credit_amount + stocks + length_employment + instalment_percent + 
    most_available_asset + concurrent_credits + type_apartment + 
    telephone + foreign_worker, family = binomial, data = train70)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.9233  -0.6723   0.3786   0.7012   2.1482  

Coefficients:
                       Estimate Std. Error z value Pr(>|z|)    
(Intercept)          -1.319e+00  9.617e-01  -1.372 0.170193    
account_balance2      3.174e-01  2.621e-01   1.211 0.225872    
account_balance3      1.782e+00  5.056e-01   3.524 0.000425 ***
account_balance4      1.460e+00  2.616e-01   5.579 2.42e-08 ***
payment_status1       6.073e-01  6.366e-01   0.954 0.340072    
payment_status2       1.230e+00  4.992e-01   2.463 0.013787 *  
payment_status3       1.208e+00  5.522e-01   2.187 0.028769 *  
payment_status4       2.144e+00  5.301e-01   4.043 5.27e-05 ***
purpose1              2.313e+00  5.092e-01   4.543 5.54e-06 ***
purpose2              5.606e-01  3.031e-01   1.850 0.064365 .  
purpose3              6.914e-01  2.783e-01   2.484 0.012975 *  
purpose4              4.722e-01  1.029e+00   0.459 0.646270    
purpose5             -4.670e-01  6.979e-01  -0.669 0.503431    
purpose6             -1.446e-01  4.794e-01  -0.302 0.762878    
purpose8              1.652e+00  1.253e+00   1.318 0.187603    
purpose9              8.628e-01  4.019e-01   2.147 0.031808 *  
purpose10             2.582e+00  9.931e-01   2.600 0.009313 ** 
credit_amount        -2.299e-04  4.459e-05  -5.155 2.53e-07 ***
stocks2               3.666e-01  3.531e-01   1.038 0.299255    
stocks3               4.751e-01  4.431e-01   1.072 0.283611    
stocks4               1.369e+00  6.114e-01   2.240 0.025122 *  
stocks5               1.238e+00  3.294e-01   3.758 0.000171 ***
length_employment2   -1.559e-01  4.867e-01  -0.320 0.748751    
length_employment3   -1.024e-01  4.570e-01  -0.224 0.822613    
length_employment4    1.005e+00  5.078e-01   1.979 0.047854 *  
length_employment5    1.703e-01  4.728e-01   0.360 0.718754    
instalment_percent   -3.670e-01  1.017e-01  -3.607 0.000309 ***
most_available_asset -1.639e-01  1.166e-01  -1.406 0.159725    
concurrent_credits    1.873e-01  1.495e-01   1.253 0.210140    
type_apartment        2.199e-01  2.118e-01   1.038 0.299230    
telephone             3.263e-01  2.259e-01   1.445 0.148538    
foreign_worker2       1.725e+00  7.073e-01   2.439 0.014746 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 850.06  on 699  degrees of freedom
Residual deviance: 626.33  on 668  degrees of freedom
AIC: 690.33

Number of Fisher Scoring iterations: 5

Kinerja Model

Validasi model pada regresi logistik untuk peubah Y sudah dipastikan bahwa:

tidak menyebar normal
varian atau ragam peubah Y berubah dan tidak konstan
masalah multikolinearitas terkait pendugaan parameter, jika menduga dengan Least Square maka multikolinearitas tetap diperhatikan. Regresi gulud (ridge) dapat digunakan untuk kasus regresi klasik terhadap asumsi ini, tetapi tidak ada penanganan hal tersebut di regresi logistik.

Regresi klasik menggunakan analisis ragam (ANOVA) untuk melihat kinerja model. Secara prinsip, Y pada regresi logistik hanya bersifat biner, ukuran yang dapat digunakan untuk menilai kebaikan model adalah dengan menggunakan uji likelihood ratio test atau chi-square. Cara lain dalam menilai kebaikan regresi logistik bisa dilihat dari kemampuan prediksinya dimana kemampuan memprediksi untuk kasus klasifikasi yang cocok dapat menggunakan tabel klasifikasi atau confusion matrix

Confusion matrix

# fitted.values
fit70 <- fitted.values(reglog_final)
pred_fit70 <- ifelse(fit70 >= 0.5, 1, 0)
conftab <- table(train70$creditability, pred_fit70, dnn = c("Aktual", "Prediksi"))
conftab

      Prediksi
Aktual   0   1
     0 109  98
     1  48 445

cat("Akurasi\t=", sum(diag(conftab)/sum(conftab)), "\n")

Akurasi = 0.7914286

Cara lain dengan perhitungan manual untuk melihat performa model menggunakan data latih

# predict
pred_reglog70 <- predict(reglog_final, train70, type="response")
y_pred <- ifelse(pred_reglog70 >= 0.5, 1, 0)
y_act <- train70$creditability

tbl_klas_reglog <- table(y_act, y_pred)
tbl_klas_reglog

     y_pred
y_act   0   1
    0 109  98
    1  48 445

akurasi <- (tbl_klas_reglog[1,1] + tbl_klas_reglog[2,2]) / sum(tbl_klas_reglog) * 100
sensitivitas <- tbl_klas_reglog[2,2] / sum(tbl_klas_reglog[2,]) * 100
spesifisitas <- tbl_klas_reglog[1,1] / sum(tbl_klas_reglog[1,]) * 100 
fprate <- tbl_klas_reglog[2,1] / (tbl_klas_reglog[2,1] + tbl_klas_reglog[1,1]) * 100
AUC <- (100 + sensitivitas - fprate) / 2 

performa <- data.frame(akurasi, sensitivitas, spesifisitas, AUC)
performa

   akurasi sensitivitas spesifisitas      AUC
1 79.14286     90.26369       52.657 79.84522

Berdasarkan output confusion matrix diatas dapat dilihat bahwa kemampuan prediksi dari model yang didapatkan untuk memprediksi kelas dengan pembayaran pinjaman kredit yang baik dapat diprediksi tepat sebanyak 435 dari total 490 sedangkan untuk kelas pembayaran kredit yang macet atau kurang baik diprediksi tepat sebanyak 109 dari total 210.

Prediksi Model

Kurva ROC

Menguji model dengan data uji

# predict.glm
pred_reglog_fin <- predict.glm(reglog_final, test30)
pred_rocr <- prediction(pred_reglog_fin, test30$creditability)
perf_rocr <- performance(pred_rocr, "tpr", "fpr")
plot(perf_rocr)

Regresi Logistik Multinomial

Data yang digunakan pada model ini melihat bagaimana perilaku nasabah perbankan dalam memilih produk baru yang dimiliki suatu bank.

Pilihan produk:

direct debit
e-account
none
pension

Terdapat (4-1) = 3 model logit.

multinom_train <- read.csv(file="https://github.com/alfanugraha/tsa-handson/raw/master/training_multinom.csv", 
                header = T, 
                sep = ",")
multinom_test <- read.csv(file="https://github.com/alfanugraha/tsa-handson/raw/master/test_multinom.csv", 
                header = T, 
                sep = ",")
kable(head(multinom_train))

age	cust.relation	income	payroll	product
35	A	75931.98	0	direct debit
39	I	241814.67	0	e-account
59	A	319132.80	0	direct debit
25	A	52795.47	0	direct debit
39	A	266896.86	1	e-account
44	A	42304.23	0	direct debit

Proporsi peubah respon

prop.table(table(multinom_train$product))


direct debit    e-account         none      pension 
   0.2776003    0.1884984    0.3972311    0.1366702

Pemodelan

reglog_multinom <- vglm(product ~ ., family = multinomial(refLevel = "none"), data = multinom_train)
summary(reglog_multinom)


Call:
vglm(formula = product ~ ., family = multinomial(refLevel = "none"), 
    data = multinom_train)

Coefficients: 
                   Estimate Std. Error z value Pr(>|z|)    
(Intercept):1     3.274e-01  2.047e-01   1.600   0.1097    
(Intercept):2    -9.475e-01  2.046e-01  -4.632 3.62e-06 ***
(Intercept):3    -2.440e+00  3.389e-01  -7.199 6.07e-13 ***
age:1            -3.922e-03  4.260e-03  -0.921   0.3573    
age:2             1.695e-02  4.013e-03   4.223 2.41e-05 ***
age:3            -5.089e-03  6.189e-03  -0.822   0.4110    
new.cust:1       -1.067e+00  4.353e-01  -2.452   0.0142 *  
new.cust:2       -8.027e-01  4.513e-01  -1.779   0.0753 .  
new.cust:3       -7.338e-01  8.972e-01  -0.818   0.4134    
cust.relationI:1 -4.429e+00  3.155e-01 -14.039  < 2e-16 ***
cust.relationI:2 -2.658e+00  1.769e-01 -15.024  < 2e-16 ***
cust.relationI:3 -3.222e+00  5.380e-01  -5.990 2.10e-09 ***
cust.relationP:1 -1.298e+01  6.940e+02      NA       NA    
cust.relationP:2 -1.377e+01  6.940e+02      NA       NA    
cust.relationP:3 -1.048e+01  6.940e+02      NA       NA    
income:1         -3.347e-07  3.851e-07  -0.869   0.3847    
income:2          2.156e-07  2.641e-07   0.816   0.4144    
income:3         -6.003e-08  4.194e-07  -0.143   0.8862    
payroll:1         4.238e+00  4.258e-01   9.952  < 2e-16 ***
payroll:2         3.676e+00  4.291e-01   8.566  < 2e-16 ***
payroll:3         6.966e+00  4.624e-01  15.065  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Names of linear predictors: log(mu[,1]/mu[,3]), log(mu[,2]/mu[,3]), 
log(mu[,4]/mu[,3])

Residual deviance: 4983.426 on 8430 degrees of freedom

Log-likelihood: -2491.713 on 8430 degrees of freedom

Number of Fisher scoring iterations: 11 

Warning: Hauck-Donner effect detected in the following estimate(s):
'cust.relationI:1', 'cust.relationP:1', 'cust.relationP:2', 'cust.relationP:3', 'payroll:3'


Reference group is level  3  of the response

Selang kepercayaan berdasarkan pendekatan uji Wald

zCrit <- qnorm(c(0.05/2, 1 - 0.05/2))
ciCoef <- as.data.frame(t(apply(coef(summary(reglog_multinom)), 1, function(x) x["Estimate"] - zCrit*x["Std. Error"] )))
names(ciCoef)<-c("Lower","Upper")
head(ciCoef)

                     Lower        Upper
(Intercept):1  0.728631308 -0.073782419
(Intercept):2 -0.546570339 -1.348398715
(Intercept):3 -1.775431639 -3.103872242
age:1          0.004427342 -0.012270389
age:2          0.024813364  0.009081686
age:3          0.007042049 -0.017219239

Odds Ratios

round(exp(coef(reglog_multinom)),4)

   (Intercept):1    (Intercept):2    (Intercept):3            age:1 
          1.3874           0.3877           0.0872           0.9961 
           age:2            age:3       new.cust:1       new.cust:2 
          1.0171           0.9949           0.3439           0.4481 
      new.cust:3 cust.relationI:1 cust.relationI:2 cust.relationI:3 
          0.4801           0.0119           0.0701           0.0399 
cust.relationP:1 cust.relationP:2 cust.relationP:3         income:1 
          0.0000           0.0000           0.0000           1.0000 
        income:2         income:3        payroll:1        payroll:2 
          1.0000           1.0000          69.2474          39.4699 
       payroll:3 
       1060.0324

Kinerja model

Model dievaluasi dengan data latih untuk melihat kinerja model regresi logistik multinomial

reglog_multinom_pred <- predict(reglog_multinom, type="response")

# menambahkan kolom kelas aktual
tbl_multinom <- data.frame(reglog_multinom_pred, multinom_train$product)
kable(head(tbl_multinom))

direct.debit	e.account	none	pension	multinom_train.product
0.3976784	0.2405524	0.3372716	0.0244977	direct debit
0.0122196	0.0517709	0.9333877	0.0026218	e-account
0.3109494	0.3548228	0.3143151	0.0199127	direct debit
0.4244712	0.2057611	0.3434802	0.0262875	direct debit
0.4134388	0.1721252	0.0054831	0.4089529	e-account
0.3779828	0.2708211	0.3283668	0.0228292	direct debit

Perbandingan antara kelas aktual dan prediksi

# menambah kolom kelas prediksi
tbl_multinom$prediksi <- levels(as.factor(multinom_train$product))[max.col(reglog_multinom_pred)]
kable(head(tbl_multinom))

direct.debit	e.account	none	pension	multinom_train.product	prediksi
0.3976784	0.2405524	0.3372716	0.0244977	direct debit	direct debit
0.0122196	0.0517709	0.9333877	0.0026218	e-account	none
0.3109494	0.3548228	0.3143151	0.0199127	direct debit	e-account
0.4244712	0.2057611	0.3434802	0.0262875	direct debit	direct debit
0.4134388	0.1721252	0.0054831	0.4089529	e-account	direct debit
0.3779828	0.2708211	0.3283668	0.0228292	direct debit	direct debit

Ukuran kebaikan model dapat dilihat dengan confusion matrix, Deviance, Log-likelihood, AIC, McFadden, Cox & Snell and Nagelkerke Pseudo R-squared

Confusion matrix

Confusion matrix atau tabel klasifikasi dapat digunakan untuk melihat kebaikan model

multinom_prop <- table(tbl_multinom$prediksi, tbl_multinom$multinom_train.product)
multinom_prop

              
               direct debit e-account none pension
  direct debit          694       371  269     355
  e-account              52       101   83       9
  none                   20        51  765       0
  pension                16         8    2      21

round(prop.table(multinom_prop, 1), 3)

              
               direct debit e-account  none pension
  direct debit        0.411     0.220 0.159   0.210
  e-account           0.212     0.412 0.339   0.037
  none                0.024     0.061 0.915   0.000
  pension             0.340     0.170 0.043   0.447

mean(tbl_multinom$prediksi == multinom_train$product)

[1] 0.5612354

Deviance, Log-likelihood, AIC

cat("deviance\t= ", deviance(reglog_multinom), "\n")

deviance    =  4983.426

cat("logLik\t\t= ", logLik(reglog_multinom), "\n")

logLik      =  -2491.713

cat("AIC\t\t= ", AIC(reglog_multinom), "\n")

AIC     =  5025.426

McFadden, Cox & Snell and Nagelkerke Pseudo R-squared

Koefisien determinasi sering digunakan pada model logit untuk mengetahui seberapa besar pengaruh peubah penjelas terhadap peubah respon. Koefisien determinasi merupakan kuadrat dari koefisien korelasi sebagai ukuran untuk mengetahui kemampuan dari masing-masing peubah yang digunakan dalam penelitian. Nilai koefisien determinasi yang kecil berarti kemampuan peubah penjelas dalam menjelaskan peubah respon amat terbatas.

McFadden Pseudo R-squared intinya mengukur seberapa jauh kemampuan model dalam menerangkan variasi peubah respon. Nilai koefisien determinasi adalah antara 0 dan 1:

McFadden R-squared semakin mendekati nilai 1 maka model telah dianggap semakin baik, atau semakin besar kemampuan model dalam menjelaskan perubahan perubahan dari peubah penjelas terhadap peubah respon.
Jika McFadden R-squared semakin mendekati nilai 0 maka berarti semakin kecil kemampuan model dalam menjelaskan perubahan dari nilai variabel independen terhadap peubah respon dan model dianggap kurang baik (Ghozali, 2017).

reglog_multinom0 <- vglm(product ~ 1, family=multinomial(refLevel="none"), data=multinom_train)
LLf   <- logLik(reglog_multinom)
LL0   <- logLik(reglog_multinom0)

# McFadden pseudo-R2
cat("McFadden\t= ", as.vector(1 - (LLf / LL0)), "\n")

McFadden    =  0.3242949

# Cox & Snell pseudo-R2
N <- nrow(multinom_train)
cat("Cox & Snell\t= ", as.vector(1 - exp((2/N) * (LL0 - LLf))), "\n")

Cox & Snell =  0.5721712

# Nagelkerke pseudo-R2
cat("Nagelkerke\t= ", as.vector((1 - exp((2/N) * (LL0 - LLf))) / (1 - exp(LL0)^(2/N))), "\n")

Nagelkerke  =  0.5721712

Prediksi Model

Hasil prediksi menggunakan data latih

multinom_pred_test <- predict(reglog_multinom, multinom_test, type="response")

# menambah kolom kelas prediksi
tbl_multinom_pred_test <- data.frame(multinom_pred_test)
tbl_multinom_pred_test$newclass <- levels(as.factor(multinom_train$product))[max.col(multinom_pred_test)]
kable(head(tbl_multinom_pred_test))

direct.debit	e.account	none	pension	newclass
0.3058285	0.3728430	0.3026629	0.0186656	e-account
0.3178217	0.3542616	0.3084877	0.0194290	e-account
0.0128270	0.0476523	0.9368211	0.0026996	none
0.2790161	0.4195045	0.2848335	0.0166459	e-account
0.3865516	0.2573039	0.3325727	0.0235717	direct debit
0.3822719	0.2640208	0.3305069	0.0232003	direct debit

Regresi Logistik Ordinal

Pada gugus data ini disediakan peubah yang memiliki atribut calon nasabah asuransi. Studi kasus yang akan dilakukan adalah memprediksi peubah Response pada masing-masing amatan di data latih. Peubah/kolom Response adalah ukuran ordinal (ordo) yang menunjukkan 8 resiko tingkat kesehatan seseorang.

Penjelasan peubah:

Product_Info_1-4:
A set of normalized variables relating to the product applied for
BMI: Normalized BMI of applicant history of the applicant.
InsuredInfo_1-5: A set of normalized variables providing information about the applicant.
Insurance_History_1-3: A set of normalized variables relating to the insurance history of the applicant.
Family_Hist: Variables relating to the family history of the applicant.
Medical_History_1-6: A set of normalized variables relating to the medical history of the applicant.
Employment_Info: Variables relating to the employment

ordinal_train <- read.csv(file="https://github.com/alfanugraha/tsa-handson/raw/master/training_ordinal.csv", 
                header = T, 
                sep = ",")
ordinal_test <- read.csv(file="https://github.com/alfanugraha/tsa-handson/raw/master/test_ordinal.csv", 
                header = T, 
                sep = ",")
kable(head(ordinal_train))

BMI	Product_Info_1	Product_Info_2	Product_Info_3	Product_Info_4	InsuredInfo_1	InsuredInfo_2	InsuredInfo_3	InsuredInfo_4	InsuredInfo_5	Insurance_History_1	Insurance_History_2	Insurance_History_3	Family_Hist	Medical_History_1	Medical_History_2	Medical_History_3	Medical_History_4	Medical_History_5	Medical_History_6	Employment_Info	Response
0.3419113	1	26	2	1	1	6	1	1	6	2	2	3	2	2	1	3	3	3	240	9	8
0.4066125	1	26	2	1	1	6	1	3	6	2	2	3	3	2	1	2	1	3	55	1	7
0.4241890	1	26	2	1	1	3	1	1	3	1	1	1	3	2	1	2	3	3	18	15	1
0.6340781	1	29	2	1	2	8	1	1	8	1	1	1	3	2	1	2	3	3	77	1	2
0.8153073	1	10	2	3	1	8	1	1	8	2	2	3	2	2	1	3	3	1	2	1	3
0.4149147	1	26	2	1	2	6	1	1	6	2	2	3	1	2	1	2	3	3	176	14	6

Pemodelan

reglog_ordinal <- vglm(Response ~ BMI + Product_Info_2 + InsuredInfo_3 + Medical_History_1 + Insurance_History_3 + Employment_Info + Family_Hist, family = propodds, data = ordinal_train)
sum_ord <- summary(reglog_ordinal)
sum_ord


Call:
vglm(formula = Response ~ BMI + Product_Info_2 + InsuredInfo_3 + 
    Medical_History_1 + Insurance_History_3 + Employment_Info + 
    Family_Hist, family = propodds, data = ordinal_train)

Coefficients: 
                     Estimate Std. Error z value Pr(>|z|)    
(Intercept):1        3.763450   0.431215   8.728  < 2e-16 ***
(Intercept):2        2.883124   0.427995   6.736 1.62e-11 ***
(Intercept):3        2.612465   0.427250   6.115 9.68e-10 ***
(Intercept):4        2.220794   0.426346   5.209 1.90e-07 ***
(Intercept):5        1.886840   0.425746   4.432 9.34e-06 ***
(Intercept):6        0.908360   0.424920   2.138 0.032539 *  
(Intercept):7        0.190009   0.425500   0.447 0.655197    
BMI                 -4.075292   0.300920 -13.543  < 2e-16 ***
Product_Info_2      -0.028867   0.007802  -3.700 0.000216 ***
InsuredInfo_3       -0.598242   0.124292  -4.813 1.49e-06 ***
Medical_History_1    0.328194   0.118412   2.772 0.005578 ** 
Insurance_History_3 -0.074967   0.039177  -1.914 0.055678 .  
Employment_Info      0.041459   0.008029   5.164 2.42e-07 ***
Family_Hist          0.183656   0.071449   2.570 0.010157 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Number of linear predictors:  7 

Names of linear predictors: logitlink(P[Y>=2]), logitlink(P[Y>=3]), 
logitlink(P[Y>=4]), logitlink(P[Y>=5]), logitlink(P[Y>=6]), logitlink(P[Y>=7]), 
logitlink(P[Y>=8])

Residual deviance: 9338.263 on 16786 degrees of freedom

Log-likelihood: -4669.132 on 16786 degrees of freedom

Number of Fisher scoring iterations: 5 

No Hauck-Donner effect found in any of the estimates


Exponentiated coefficients:
                BMI      Product_Info_2       InsuredInfo_3   Medical_History_1 
         0.01698726          0.97154540          0.54977724          1.38845790 
Insurance_History_3     Employment_Info         Family_Hist 
         0.92777407          1.04233068          1.20160223

Uji koefisien dan uji model keseluruhan

coef(sum_ord)

                       Estimate  Std. Error     z value     Pr(>|z|)
(Intercept):1        3.76344963 0.431215211   8.7275438 2.602631e-18
(Intercept):2        2.88312431 0.427994856   6.7363527 1.624118e-11
(Intercept):3        2.61246539 0.427250350   6.1146009 9.679889e-10
(Intercept):4        2.22079405 0.426346444   5.2088954 1.899681e-07
(Intercept):5        1.88683969 0.425746243   4.4318411 9.343187e-06
(Intercept):6        0.90835974 0.424919717   2.1377209 3.253941e-02
(Intercept):7        0.19000861 0.425500049   0.4465537 6.551974e-01
BMI                 -4.07529155 0.300919536 -13.5427949 8.739926e-42
Product_Info_2      -0.02886728 0.007801746  -3.7001053 2.155100e-04
InsuredInfo_3       -0.59824209 0.124292119  -4.8131941 1.485369e-06
Medical_History_1    0.32819370 0.118412285   2.7716187 5.577833e-03
Insurance_History_3 -0.07496704 0.039176913  -1.9135514 5.567751e-02
Employment_Info      0.04145925 0.008028688   5.1638886 2.418717e-07
Family_Hist          0.18365585 0.071449440   2.5704310 1.015721e-02

Kinerja Model

reglog_ordinal_pred <- predict(reglog_ordinal, type="response")
kable(head(reglog_ordinal_pred))

1	2	3	4	5	6	7	8
0.1003627	0.1116433	0.0487165	0.0821441	0.0786401	0.2381778	0.1393450	0.2009705
0.1441172	0.1446906	0.0585843	0.0931774	0.0831884	0.2215199	0.1118865	0.1428358
0.0801551	0.0935036	0.0423199	0.0735683	0.0731608	0.2395423	0.1541790	0.2435710
0.2853961	0.2052237	0.0674011	0.0932897	0.0715628	0.1511752	0.0603062	0.0656453
0.4027122	0.2164865	0.0614615	0.0785728	0.0557069	0.1064161	0.0386906	0.0399536
0.1279361	0.1334057	0.0554949	0.0900921	0.0823924	0.2289209	0.1212036	0.1605543

# menambahkan kolom kelas aktual
tbl_ord <- data.frame(reglog_ordinal_pred, ordinal_train$Response)
kable(head(tbl_ord))

X1	X2	X3	X4	X5	X6	X7	X8	ordinal_train.Response
0.1003627	0.1116433	0.0487165	0.0821441	0.0786401	0.2381778	0.1393450	0.2009705	8
0.1441172	0.1446906	0.0585843	0.0931774	0.0831884	0.2215199	0.1118865	0.1428358	7
0.0801551	0.0935036	0.0423199	0.0735683	0.0731608	0.2395423	0.1541790	0.2435710	1
0.2853961	0.2052237	0.0674011	0.0932897	0.0715628	0.1511752	0.0603062	0.0656453	2
0.4027122	0.2164865	0.0614615	0.0785728	0.0557069	0.1064161	0.0386906	0.0399536	3
0.1279361	0.1334057	0.0554949	0.0900921	0.0823924	0.2289209	0.1212036	0.1605543	6

# menambah kolom kelas prediksi
tbl_ord$prediksi <- as.numeric(levels(as.factor(ordinal_train$Response)))[max.col(reglog_ordinal_pred)]
kable(head(tbl_ord))

X1	X2	X3	X4	X5	X6	X7	X8	ordinal_train.Response	prediksi
0.1003627	0.1116433	0.0487165	0.0821441	0.0786401	0.2381778	0.1393450	0.2009705	8	6
0.1441172	0.1446906	0.0585843	0.0931774	0.0831884	0.2215199	0.1118865	0.1428358	7	6
0.0801551	0.0935036	0.0423199	0.0735683	0.0731608	0.2395423	0.1541790	0.2435710	1	8
0.2853961	0.2052237	0.0674011	0.0932897	0.0715628	0.1511752	0.0603062	0.0656453	2	1
0.4027122	0.2164865	0.0614615	0.0785728	0.0557069	0.1064161	0.0386906	0.0399536	3	1
0.1279361	0.1334057	0.0554949	0.0900921	0.0823924	0.2289209	0.1212036	0.1605543	6	6

Confusion matrix

ordinal_prop <- table(tbl_ord$prediksi, tbl_ord$ordinal_train.Response)
ordinal_prop

   
      1   2   3   4   5   6   7   8
  1 129 132  41   7  99  79  28   7
  6 214 197  77 154  76 368 204 258
  8  42  13  12  50  14  52  33 114

round(prop.table(ordinal_prop, 1), 3)

   
        1     2     3     4     5     6     7     8
  1 0.247 0.253 0.079 0.013 0.190 0.151 0.054 0.013
  6 0.138 0.127 0.050 0.099 0.049 0.238 0.132 0.167
  8 0.127 0.039 0.036 0.152 0.042 0.158 0.100 0.345

mean(tbl_ord$prediksi == ordinal_train$Response)

[1] 0.2545833

Deviance, Log-likelihood, AIC

cat("deviance\t= ", deviance(reglog_ordinal), "\n")

deviance    =  9338.263

cat("logLik\t\t= ", logLik(reglog_ordinal), "\n")

logLik      =  -4669.131

cat("AIC\t\t= ", AIC(reglog_ordinal), "\n")

AIC     =  9366.263

McFadden, Cox & Snell and Nagelkerke Pseudo R-squared

reglog_ordinal0 <- vglm(Response ~ 1, family=propodds, data=ordinal_train)
LLf_ord   <- logLik(reglog_ordinal)
LL0_ord   <- logLik(reglog_ordinal0)

# McFadden pseudo-R2
cat("McFadden\t= ", as.vector(1 - (LLf_ord / LL0_ord)), "\n")

McFadden    =  0.02938282

# Cox & Snell pseudo-R2
N_ord <- nrow(ordinal_train)
cat("Cox & Snell\t= ", as.vector(1 - exp((2/N_ord) * (LL0_ord - LLf_ord))), "\n")

Cox & Snell =  0.1111154

# Nagelkerke pseudo-R2
cat("Nagelkerke\t= ", as.vector((1 - exp((2/N_ord) * (LL0_ord - LLf_ord))) / (1 - exp(LL0_ord)^(2/N_ord))), "\n")

Nagelkerke  =  0.1111154

Prediksi Model

ordinal_test_filter <- ordinal_test[c("BMI", "Product_Info_2", "InsuredInfo_3", "Medical_History_1", "Insurance_History_3", "Employment_Info", "Family_Hist")]
kable(head(ordinal_test_filter))

BMI	Product_Info_2	InsuredInfo_3	Medical_History_1	Insurance_History_3	Employment_Info	Family_Hist
0.2634011	26	1	2	3	9	3
0.7508862	26	1	2	1	12	2
0.3736277	26	1	2	3	9	3
0.4592303	26	1	2	3	14	2
0.4867665	26	1	2	1	9	3
0.4472439	26	1	2	3	12	2

pred_ordinal_test <- predict(reglog_ordinal, ordinal_test, type="response")

# menambah kolom kelas prediksi
tbl_ord_test <- data.frame(pred_ordinal_test)
tbl_ord_test$newclass <- as.numeric(levels(as.factor(ordinal_train$Response)))[max.col(pred_ordinal_test)]
kable(head(tbl_ord_test))

X1	X2	X3	X4	X5	X6	X7	X8	newclass
0.0631619	0.0766944	0.0358338	0.0640411	0.0659901	0.2337664	0.1666360	0.2938762	8
0.3098499	0.2100157	0.0667915	0.0907397	0.0682994	0.1406820	0.0548000	0.0588218	1
0.0955564	0.1075036	0.0473139	0.0803422	0.0775848	0.2390662	0.1427841	0.2098487	6
0.1275955	0.1331567	0.0554233	0.0900159	0.0823660	0.2290656	0.1214102	0.1609667	6
0.1260368	0.1320112	0.0550920	0.0896608	0.0822401	0.2297211	0.1223611	0.1628768	6
0.1314364	0.1359365	0.0562146	0.0908472	0.0826383	0.2274026	0.1191061	0.1564183	6