library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(epiR) # phân tích odds và OR
## Loading required package: survival
## Package epiR 2.0.62 is loaded
## Type help(epi.about) for summary information
## Type browseVignettes(package = 'epiR') to learn how to use epiR for applied epidemiological analyses
## 
library(epiDisplay) # hiển thị kết quả mh logistic
## Loading required package: foreign
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
## Loading required package: nnet

1. Odds và Odds Ratio

DATA_RAW <- read.csv("data/application_logistic.csv")
col_description <- read.csv("data/columns_description.csv")
# View(DATA_RAW)
# View(col_description)
# chuyển đổi kiểu dữ liệu các biến sang kiểu factor
application_logistic <- mutate(
  DATA_RAW,
  TARGET = factor(TARGET, 
                  levels = c(0, 1), 
                  labels = c("không", "có"),
                  ordered = TRUE),
  NAME_CONTRACT_TYPE = factor(NAME_CONTRACT_TYPE),
  CODE_GENDER = factor(CODE_GENDER),
  FLAG_OWN_CAR = factor(FLAG_OWN_CAR),
  FLAG_OWN_REALTY = factor(FLAG_OWN_REALTY),
  # đổi đơn vị cột thu nhập thành triệu đô
  AMT_INCOME_TOTAL = AMT_INCOME_TOTAL / 1000000
)
head(application_logistic)
summary(application_logistic)
##    TARGET            NAME_CONTRACT_TYPE CODE_GENDER FLAG_OWN_CAR
##  không:18452   Cash loans     :18091    F:13194     N:13068     
##  có   : 1548   Revolving loans: 1909    M: 6806     Y: 6932     
##                                                                 
##                                                                 
##                                                                 
##                                                                 
##  FLAG_OWN_REALTY  CNT_CHILDREN     AMT_INCOME_TOTAL    AMT_CREDIT     
##  N: 6104         Min.   : 0.0000   Min.   : 0.0270   Min.   :  45000  
##  Y:13896         1st Qu.: 0.0000   1st Qu.: 0.1125   1st Qu.: 270000  
##                  Median : 0.0000   Median : 0.1485   Median : 512064  
##                  Mean   : 0.4153   Mean   : 0.1705   Mean   : 599440  
##                  3rd Qu.: 1.0000   3rd Qu.: 0.2025   3rd Qu.: 808650  
##                  Max.   :12.0000   Max.   :18.0001   Max.   :3375000
table(application_logistic$CODE_GENDER, application_logistic$TARGET) %>% 
  epi.2by2(method = "case.control")
##              Outcome +    Outcome -      Total                       Odds
## Exposed +        12283          911      13194     13.48 (12.63 to 14.45)
## Exposed -         6169          637       6806       9.68 (8.95 to 10.54)
## Total            18452         1548      20000     11.92 (11.33 to 12.57)
## 
## Point estimates and 95% CIs:
## -------------------------------------------------------------------
## Exposure odds ratio                            1.39 (1.25, 1.55)
## Attrib fraction (est) in the exposed (%)      28.17 (20.03, 35.46)
## Attrib fraction (est) in the population (%)   18.75 (13.47, 23.71)
## -------------------------------------------------------------------
## Uncorrected chi2 test that OR = 1: chi2(1) = 37.887 Pr>chi2 = <0.001
## Fisher exact test that OR = 1: Pr>chi2 = <0.001
##  Wald confidence limits
##  CI: confidence interval
# không có giá trị 1 trong khoảng tin cậy 95%
# OR có ý nghĩa thông kê


2. Hồi quy logistic

m1 <- glm(TARGET ~ CODE_GENDER, 
    data = application_logistic,
    family = binomial) # lưu phải có, khai báo mh logistic
summary(m1)
## 
## Call:
## glm(formula = TARGET ~ CODE_GENDER, family = binomial, data = application_logistic)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.4433  -0.4433  -0.3783  -0.3783   2.3121  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -2.60143    0.03434 -75.759  < 2e-16 ***
## CODE_GENDERM  0.33091    0.05395   6.133 8.62e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 10895  on 19999  degrees of freedom
## Residual deviance: 10858  on 19998  degrees of freedom
## AIC: 10862
## 
## Number of Fisher Scoring iterations: 5
#tính OR của nam và nữ  = e^0.33
exp(0.33091)
## [1] 1.392234
logistic.display(m1)
## 
## Logistic regression predicting TARGET 
##  
##                     OR(95%CI)         P(Wald's test) P(LR-test)
## CODE_GENDER: M vs F 1.39 (1.25,1.55)  < 0.001        < 0.001   
##                                                                
## Log-likelihood = -5429.0128
## No. of observations = 20000
## AIC value = 10862.0255


3. Diễn giải kết quả mô hình

m1 <- glm(TARGET ~ CODE_GENDER, 
          data = application_logistic,
          family = binomial)
# logit(P) = β1 + β2 × AMT_INCOME_TOTAL
m2 <- glm(TARGET ~ AMT_INCOME_TOTAL,
          data = application_logistic,
          family = binomial)
# logit(P) = β1 + β2 × CODE_GENDER + β3 × AMT_INCOME_TOTAL
m3 <- glm(TARGET ~ CODE_GENDER + AMT_INCOME_TOTAL,
          data = application_logistic,
          family = binomial)
summary(m2)
## 
## Call:
## glm(formula = TARGET ~ AMT_INCOME_TOTAL, family = binomial, data = application_logistic)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.4245  -0.4101  -0.4027  -0.3919   2.6479  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -2.33868    0.05588 -41.853  < 2e-16 ***
## AMT_INCOME_TOTAL -0.84197    0.30272  -2.781  0.00541 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 10895  on 19999  degrees of freedom
## Residual deviance: 10886  on 19998  degrees of freedom
## AIC: 10890
## 
## Number of Fisher Scoring iterations: 5
logistic.display(m2)
## 
## Logistic regression predicting TARGET 
##  
##                               OR(95%CI)         P(Wald's test) P(LR-test)
## AMT_INCOME_TOTAL (cont. var.) 0.43 (0.24,0.78)  0.005          0.003     
##                                                                          
## Log-likelihood = -5443.0909
## No. of observations = 20000
## AIC value = 10890.1818
logistic.display(m3)
## 
## Logistic regression predicting TARGET 
##  
##                               crude OR(95%CI)   adj. OR(95%CI)   
## CODE_GENDER: M vs F           1.39 (1.25,1.55)  1.46 (1.31,1.63) 
##                                                                  
## AMT_INCOME_TOTAL (cont. var.) 0.43 (0.24,0.78)  0.28 (0.15,0.52) 
##                                                                  
##                               P(Wald's test) P(LR-test)
## CODE_GENDER: M vs F           < 0.001        < 0.001   
##                                                        
## AMT_INCOME_TOTAL (cont. var.) < 0.001        < 0.001   
##                                                        
## Log-likelihood = -5419.8227
## No. of observations = 20000
## AIC value = 10845.6455
deviance(m1)
## [1] 10858.03
deviance(m2)
## [1] 10886.18
deviance(m3)
## [1] 10839.65
AIC(m1)
## [1] 10862.03
AIC(m2)
## [1] 10890.18
AIC(m2)
## [1] 10890.18
# kiểm tra mô hình nào tốt hơn
# logit(P) = β1 + β2 × AMT_INCOME_TOTAL
# logit(P) = β1 + β2 × CODE_GENDER + β3 × AMT_INCOME_TOTAL
lrtest(m1, m3)
## Likelihood ratio test for MLE method 
## Chi-squared 1 d.f. =  18.38007 , P value =  1.809406e-05
lrtest(m2, m3)
## Likelihood ratio test for MLE method 
## Chi-squared 1 d.f. =  46.5363 , P value =  8.993698e-12
# khi p-value < alpha, mô hình m3 tốt hơn
# lưu ý, mô hình m1 hoặc m2 phải nằm trong m3


4. Dự đoán kết quả

# khi thu nhập = 500.000USD thì xác suất người đó trả muộn khoản vay là bao nhiêu
predict(m2, 
        data.frame(AMT_INCOME_TOTAL = 0.5),
        type = "response") # chuyển sang dạng xác suất
##          1 
## 0.05954306
# khi thu nhập 800k USD và là Nam thì xác suất trả muộn khoản vay là bao nhiêu?
predict(m3, 
        data.frame(
          AMT_INCOME_TOTAL = 0.8,
          CODE_GENDER = "M"),
        type = "response")
##          1 
## 0.04504246