library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(epiR) # phân tích odds và OR
## Loading required package: survival
## Package epiR 2.0.62 is loaded
## Type help(epi.about) for summary information
## Type browseVignettes(package = 'epiR') to learn how to use epiR for applied epidemiological analyses
##
library(epiDisplay) # hiển thị kết quả mh logistic
## Loading required package: foreign
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## Loading required package: nnet
1. Odds và Odds Ratio
DATA_RAW <- read.csv("data/application_logistic.csv")
col_description <- read.csv("data/columns_description.csv")
# View(DATA_RAW)
# View(col_description)
# chuyển đổi kiểu dữ liệu các biến sang kiểu factor
application_logistic <- mutate(
DATA_RAW,
TARGET = factor(TARGET,
levels = c(0, 1),
labels = c("không", "có"),
ordered = TRUE),
NAME_CONTRACT_TYPE = factor(NAME_CONTRACT_TYPE),
CODE_GENDER = factor(CODE_GENDER),
FLAG_OWN_CAR = factor(FLAG_OWN_CAR),
FLAG_OWN_REALTY = factor(FLAG_OWN_REALTY),
# đổi đơn vị cột thu nhập thành triệu đô
AMT_INCOME_TOTAL = AMT_INCOME_TOTAL / 1000000
)
head(application_logistic)
summary(application_logistic)
## TARGET NAME_CONTRACT_TYPE CODE_GENDER FLAG_OWN_CAR
## không:18452 Cash loans :18091 F:13194 N:13068
## có : 1548 Revolving loans: 1909 M: 6806 Y: 6932
##
##
##
##
## FLAG_OWN_REALTY CNT_CHILDREN AMT_INCOME_TOTAL AMT_CREDIT
## N: 6104 Min. : 0.0000 Min. : 0.0270 Min. : 45000
## Y:13896 1st Qu.: 0.0000 1st Qu.: 0.1125 1st Qu.: 270000
## Median : 0.0000 Median : 0.1485 Median : 512064
## Mean : 0.4153 Mean : 0.1705 Mean : 599440
## 3rd Qu.: 1.0000 3rd Qu.: 0.2025 3rd Qu.: 808650
## Max. :12.0000 Max. :18.0001 Max. :3375000
table(application_logistic$CODE_GENDER, application_logistic$TARGET) %>%
epi.2by2(method = "case.control")
## Outcome + Outcome - Total Odds
## Exposed + 12283 911 13194 13.48 (12.63 to 14.45)
## Exposed - 6169 637 6806 9.68 (8.95 to 10.54)
## Total 18452 1548 20000 11.92 (11.33 to 12.57)
##
## Point estimates and 95% CIs:
## -------------------------------------------------------------------
## Exposure odds ratio 1.39 (1.25, 1.55)
## Attrib fraction (est) in the exposed (%) 28.17 (20.03, 35.46)
## Attrib fraction (est) in the population (%) 18.75 (13.47, 23.71)
## -------------------------------------------------------------------
## Uncorrected chi2 test that OR = 1: chi2(1) = 37.887 Pr>chi2 = <0.001
## Fisher exact test that OR = 1: Pr>chi2 = <0.001
## Wald confidence limits
## CI: confidence interval
# không có giá trị 1 trong khoảng tin cậy 95%
# OR có ý nghĩa thông kê
2. Hồi quy logistic
m1 <- glm(TARGET ~ CODE_GENDER,
data = application_logistic,
family = binomial) # lưu phải có, khai báo mh logistic
summary(m1)
##
## Call:
## glm(formula = TARGET ~ CODE_GENDER, family = binomial, data = application_logistic)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.4433 -0.4433 -0.3783 -0.3783 2.3121
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.60143 0.03434 -75.759 < 2e-16 ***
## CODE_GENDERM 0.33091 0.05395 6.133 8.62e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 10895 on 19999 degrees of freedom
## Residual deviance: 10858 on 19998 degrees of freedom
## AIC: 10862
##
## Number of Fisher Scoring iterations: 5
#tính OR của nam và nữ = e^0.33
exp(0.33091)
## [1] 1.392234
logistic.display(m1)
##
## Logistic regression predicting TARGET
##
## OR(95%CI) P(Wald's test) P(LR-test)
## CODE_GENDER: M vs F 1.39 (1.25,1.55) < 0.001 < 0.001
##
## Log-likelihood = -5429.0128
## No. of observations = 20000
## AIC value = 10862.0255
3. Diễn giải kết quả mô hình
m1 <- glm(TARGET ~ CODE_GENDER,
data = application_logistic,
family = binomial)
# logit(P) = β1 + β2 × AMT_INCOME_TOTAL
m2 <- glm(TARGET ~ AMT_INCOME_TOTAL,
data = application_logistic,
family = binomial)
# logit(P) = β1 + β2 × CODE_GENDER + β3 × AMT_INCOME_TOTAL
m3 <- glm(TARGET ~ CODE_GENDER + AMT_INCOME_TOTAL,
data = application_logistic,
family = binomial)
summary(m2)
##
## Call:
## glm(formula = TARGET ~ AMT_INCOME_TOTAL, family = binomial, data = application_logistic)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.4245 -0.4101 -0.4027 -0.3919 2.6479
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.33868 0.05588 -41.853 < 2e-16 ***
## AMT_INCOME_TOTAL -0.84197 0.30272 -2.781 0.00541 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 10895 on 19999 degrees of freedom
## Residual deviance: 10886 on 19998 degrees of freedom
## AIC: 10890
##
## Number of Fisher Scoring iterations: 5
logistic.display(m2)
##
## Logistic regression predicting TARGET
##
## OR(95%CI) P(Wald's test) P(LR-test)
## AMT_INCOME_TOTAL (cont. var.) 0.43 (0.24,0.78) 0.005 0.003
##
## Log-likelihood = -5443.0909
## No. of observations = 20000
## AIC value = 10890.1818
logistic.display(m3)
##
## Logistic regression predicting TARGET
##
## crude OR(95%CI) adj. OR(95%CI)
## CODE_GENDER: M vs F 1.39 (1.25,1.55) 1.46 (1.31,1.63)
##
## AMT_INCOME_TOTAL (cont. var.) 0.43 (0.24,0.78) 0.28 (0.15,0.52)
##
## P(Wald's test) P(LR-test)
## CODE_GENDER: M vs F < 0.001 < 0.001
##
## AMT_INCOME_TOTAL (cont. var.) < 0.001 < 0.001
##
## Log-likelihood = -5419.8227
## No. of observations = 20000
## AIC value = 10845.6455
deviance(m1)
## [1] 10858.03
deviance(m2)
## [1] 10886.18
deviance(m3)
## [1] 10839.65
AIC(m1)
## [1] 10862.03
AIC(m2)
## [1] 10890.18
AIC(m2)
## [1] 10890.18
# kiểm tra mô hình nào tốt hơn
# logit(P) = β1 + β2 × AMT_INCOME_TOTAL
# logit(P) = β1 + β2 × CODE_GENDER + β3 × AMT_INCOME_TOTAL
lrtest(m1, m3)
## Likelihood ratio test for MLE method
## Chi-squared 1 d.f. = 18.38007 , P value = 1.809406e-05
lrtest(m2, m3)
## Likelihood ratio test for MLE method
## Chi-squared 1 d.f. = 46.5363 , P value = 8.993698e-12
# khi p-value < alpha, mô hình m3 tốt hơn
# lưu ý, mô hình m1 hoặc m2 phải nằm trong m3
4. Dự đoán kết quả
# khi thu nhập = 500.000USD thì xác suất người đó trả muộn khoản vay là bao nhiêu
predict(m2,
data.frame(AMT_INCOME_TOTAL = 0.5),
type = "response") # chuyển sang dạng xác suất
## 1
## 0.05954306
# khi thu nhập 800k USD và là Nam thì xác suất trả muộn khoản vay là bao nhiêu?
predict(m3,
data.frame(
AMT_INCOME_TOTAL = 0.8,
CODE_GENDER = "M"),
type = "response")
## 1
## 0.04504246