1 Giới thiệu bộ dữ liệu

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.1
library(AER)
## Loading required package: car
## Loading required package: carData
## Loading required package: lmtest
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich
## Loading required package: survival
data("HMDA")
d<-HMDA

2 Xác định biến phụ thuộc

Các biến phân tích bao gồm:

Biến định tính: deny,condomin,afam

Biến định lượng: mhist,chist,ivrat,unemp

3 Thống kê mô tả cho các biến

3.1 Biến deny

table(d$deny)
## 
##   no  yes 
## 2095  285
table(d$deny)/sum(table(d$deny))
## 
##        no       yes 
## 0.8802521 0.1197479
d |> ggplot(aes( x = deny, y = after_stat(count))) +
  geom_bar(fill = 'blue') +
  geom_text(aes(label = scales::percent( after_stat(count/sum(count)))), stat = 'count', color = 'black', vjust = 1.5) +
  theme_classic() + 
  labs(x = 'deny', y = 'Số người')

## Biến condomin

table(d$condomin)
## 
##   no  yes 
## 1694  686
table(d$condomin)/sum(table(d$condomin))
## 
##        no       yes 
## 0.7117647 0.2882353
d |> ggplot(aes( x = condomin, y = after_stat(count))) +
  geom_bar(fill = 'blue') +
  geom_text(aes(label = scales::percent( after_stat(count/sum(count)))), stat = 'count', color = 'black', vjust = 1.5) +
  theme_classic() + 
  labs(x = 'condomin', y = 'Số người')

4 Biến afam

table(d$afam)
## 
##   no  yes 
## 2041  339
table(d$afam)/sum(table(d$afam))
## 
##       no      yes 
## 0.857563 0.142437
d |> ggplot(aes( x = afam, y = after_stat(count))) +
  geom_bar(fill = 'blue') +
  geom_text(aes(label = scales::percent( after_stat(count/sum(count)))), stat = 'count', color = 'black', vjust = 1.5) +
  theme_classic() + 
  labs(x = 'afam', y = 'Số người')

5 Biến lvrat, pirat, hirat, unemp

table(cut(d$lvrat,4))
## 
## (0.0181,0.502]  (0.502,0.985]   (0.985,1.47]    (1.47,1.95] 
##            270           2062             44              4
table(cut(d$lvrat,4))/sum(table(cut(d$lvrat,4)))
## 
## (0.0181,0.502]  (0.502,0.985]   (0.985,1.47]    (1.47,1.95] 
##    0.113445378    0.866386555    0.018487395    0.001680672
d |> ggplot( aes( x =cut(d$lvrat,4) , y= after_stat(count))) + geom_bar(fill='blue') + geom_text(aes(label= scales :: percent(after_stat(count/sum(count)),accuracy=.01)), stat = 'count', color= 'black', vjust= -.5) + theme_classic() + xlab('tỷ lệ vay vốn') + ylab('số người')

table(d$chist)
## 
##    1    2    3    4    5    6 
## 1353  441  126   77  182  201
table(d$chist)/sum(table(d$chist))
## 
##          1          2          3          4          5          6 
## 0.56848739 0.18529412 0.05294118 0.03235294 0.07647059 0.08445378
d |> ggplot( aes( x =chist , y= after_stat(count))) + geom_bar(fill='blue') + geom_text(aes(label= scales :: percent(after_stat(count/sum(count)),accuracy=.01)), stat = 'count', color= 'black', vjust= -.5) + theme_classic() + xlab('số lần thanh toán tiêu dùng') + ylab('số người')

table(d$mhist)
## 
##    1    2    3    4 
##  747 1571   41   21
table(d$mhist)/sum(table(d$mhist))
## 
##           1           2           3           4 
## 0.313865546 0.660084034 0.017226891 0.008823529
d |> ggplot( aes( x =mhist , y= after_stat(count))) + geom_bar(fill='blue') + geom_text(aes(label= scales :: percent(after_stat(count/sum(count)),accuracy=.01)), stat = 'count', color= 'black', vjust= -.5) + theme_classic() + xlab('số lần thanh toán thế chấp') + ylab('số người')

table(cut(d$unemp,6))
## 
## (1.79,3.27] (3.27,4.73]  (4.73,6.2]  (6.2,7.67] (7.67,9.13] (9.13,10.6] 
##        1517         623          65           0          16         159
table(cut(d$unemp,6))/sum(table(cut(d$unemp,6)))
## 
## (1.79,3.27] (3.27,4.73]  (4.73,6.2]  (6.2,7.67] (7.67,9.13] (9.13,10.6] 
## 0.637394958 0.261764706 0.027310924 0.000000000 0.006722689 0.066806723
d |> ggplot( aes( x =cut(d$unemp,6) , y= after_stat(count))) + geom_bar(fill='blue') + geom_text(aes(label= scales :: percent(after_stat(count/sum(count)),accuracy=.01)), stat = 'count', color= 'black', vjust= -.5) + theme_classic() + xlab('tỷ lệ thất nghiệp') + ylab('số người')

6 Thống kê mô tả cho các cặp biến

6.1

chisq.test(table(d$deny,d$insurance))
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(d$deny, d$insurance)
## X-squared = 287.48, df = 1, p-value < 2.2e-16
chisq.test(table(d$deny,d$afam))
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(d$deny, d$afam)
## X-squared = 98.376, df = 1, p-value < 2.2e-16
chisq.test(table(d$deny,d$single))
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(d$deny, d$single)
## X-squared = 13.489, df = 1, p-value = 0.0002399
chisq.test(table(d$deny,d$hschool))
## Warning in chisq.test(table(d$deny, d$hschool)): Chi-squared approximation may
## be incorrect
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(d$deny, d$hschool)
## X-squared = 8.4052, df = 1, p-value = 0.003741
mh1 <- glm(deny~afam + insurance + single + hschool , family = binomial(link = 'logit'), data = d)
summary(mh1)
## 
## Call:
## glm(formula = deny ~ afam + insurance + single + hschool, family = binomial(link = "logit"), 
##     data = d)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -1.4399     0.3767  -3.823 0.000132 ***
## afamyes        1.2466     0.1543   8.078 6.58e-16 ***
## insuranceyes   4.5517     0.5330   8.540  < 2e-16 ***
## singleyes      0.4113     0.1396   2.946 0.003217 ** 
## hschoolyes    -1.1932     0.3801  -3.139 0.001694 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1744.2  on 2379  degrees of freedom
## Residual deviance: 1494.5  on 2375  degrees of freedom
## AIC: 1504.5
## 
## Number of Fisher Scoring iterations: 5
mh2 <- glm(deny~afam +  insurance + single + hschool  , family = binomial(link = 'probit'), data = d)
summary(mh2)
## 
## Call:
## glm(formula = deny ~ afam + insurance + single + hschool, family = binomial(link = "probit"), 
##     data = d)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -0.83197    0.22012  -3.780 0.000157 ***
## afamyes       0.67833    0.08660   7.832 4.78e-15 ***
## insuranceyes  2.62368    0.27016   9.712  < 2e-16 ***
## singleyes     0.22048    0.07242   3.045 0.002330 ** 
## hschoolyes   -0.67503    0.22188  -3.042 0.002347 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1744.2  on 2379  degrees of freedom
## Residual deviance: 1493.8  on 2375  degrees of freedom
## AIC: 1503.8
## 
## Number of Fisher Scoring iterations: 5
mh3 <- glm(deny~afam +  insurance + single + hschool  , family = binomial(link = 'cloglog'), data = d)
summary(mh3)
## 
## Call:
## glm(formula = deny ~ afam + insurance + single + hschool, family = binomial(link = "cloglog"), 
##     data = d)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -1.6205     0.3180  -5.096 3.47e-07 ***
## afamyes        1.0593     0.1378   7.687 1.50e-14 ***
## insuranceyes   3.1197     0.2284  13.657  < 2e-16 ***
## singleyes      0.4143     0.1253   3.307 0.000944 ***
## hschoolyes    -1.0302     0.3194  -3.226 0.001257 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1744.2  on 2379  degrees of freedom
## Residual deviance: 1497.6  on 2375  degrees of freedom
## AIC: 1507.6
## 
## Number of Fisher Scoring iterations: 6
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(DescTools)
## Warning: package 'DescTools' was built under R version 4.3.1
## 
## Attaching package: 'DescTools'
## The following object is masked from 'package:car':
## 
##     Recode
library(caret)
## Warning: package 'caret' was built under R version 4.3.1
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following objects are masked from 'package:DescTools':
## 
##     MAE, RMSE
## The following object is masked from 'package:survival':
## 
##     cluster
BrierScore(mh1)
## [1] 0.08850611
BrierScore(mh2)
## [1] 0.08845049
BrierScore(mh3)
## [1] 0.08859589
confusionMatrix(table(predict(mh1, type = "response")>=0.5, mh1$data$deny == 'yes'))
## Confusion Matrix and Statistics
## 
##        
##         FALSE TRUE
##   FALSE  2089  240
##   TRUE      6   45
##                                           
##                Accuracy : 0.8966          
##                  95% CI : (0.8837, 0.9086)
##     No Information Rate : 0.8803          
##     P-Value [Acc > NIR] : 0.006704        
##                                           
##                   Kappa : 0.2402          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9971          
##             Specificity : 0.1579          
##          Pos Pred Value : 0.8970          
##          Neg Pred Value : 0.8824          
##              Prevalence : 0.8803          
##          Detection Rate : 0.8777          
##    Detection Prevalence : 0.9786          
##       Balanced Accuracy : 0.5775          
##                                           
##        'Positive' Class : FALSE           
## 
confusionMatrix(table(predict(mh2, type = "response")>=0.5, mh2$data$deny == 'yes'))
## Confusion Matrix and Statistics
## 
##        
##         FALSE TRUE
##   FALSE  2089  240
##   TRUE      6   45
##                                           
##                Accuracy : 0.8966          
##                  95% CI : (0.8837, 0.9086)
##     No Information Rate : 0.8803          
##     P-Value [Acc > NIR] : 0.006704        
##                                           
##                   Kappa : 0.2402          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9971          
##             Specificity : 0.1579          
##          Pos Pred Value : 0.8970          
##          Neg Pred Value : 0.8824          
##              Prevalence : 0.8803          
##          Detection Rate : 0.8777          
##    Detection Prevalence : 0.9786          
##       Balanced Accuracy : 0.5775          
##                                           
##        'Positive' Class : FALSE           
## 
confusionMatrix(table(predict(mh3, type = "response")>=0.5, mh3$data$deny == 'yes'))
## Confusion Matrix and Statistics
## 
##        
##         FALSE TRUE
##   FALSE  2089  240
##   TRUE      6   45
##                                           
##                Accuracy : 0.8966          
##                  95% CI : (0.8837, 0.9086)
##     No Information Rate : 0.8803          
##     P-Value [Acc > NIR] : 0.006704        
##                                           
##                   Kappa : 0.2402          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9971          
##             Specificity : 0.1579          
##          Pos Pred Value : 0.8970          
##          Neg Pred Value : 0.8824          
##              Prevalence : 0.8803          
##          Detection Rate : 0.8777          
##    Detection Prevalence : 0.9786          
##       Balanced Accuracy : 0.5775          
##                                           
##        'Positive' Class : FALSE           
## 
da<-table(d$deny,d$afam)
addmargins(da)
##      
##         no  yes  Sum
##   no  1852  243 2095
##   yes  189   96  285
##   Sum 2041  339 2380
ggplot(d, aes(deny, fill = afam)) + geom_bar(position = 'dodge')

RelRisk(da)
## [1] 1.33303
library("epitools")
## 
## Attaching package: 'epitools'
## The following object is masked from 'package:survival':
## 
##     ratetable
riskratio(da)
## $data
##        
##           no yes Total
##   no    1852 243  2095
##   yes    189  96   285
##   Total 2041 339  2380
## 
## $measure
##      risk ratio with 95% C.I.
##       estimate    lower    upper
##   no   1.00000       NA       NA
##   yes  2.90405 2.374608 3.551537
## 
## $p.value
##      two-sided
##       midp.exact fisher.exact   chi.square
##   no          NA           NA           NA
##   yes          0 2.302896e-19 1.394266e-23
## 
## $correction
## [1] FALSE
## 
## attr(,"method")
## [1] "Unconditional MLE & normal approximation (Wald) CI"
oddsratio(da)
## $data
##        
##           no yes Total
##   no    1852 243  2095
##   yes    189  96   285
##   Total 2041 339  2380
## 
## $measure
##      odds ratio with 95% C.I.
##       estimate    lower    upper
##   no  1.000000       NA       NA
##   yes 3.870131 2.918351 5.111518
## 
## $p.value
##      two-sided
##       midp.exact fisher.exact   chi.square
##   no          NA           NA           NA
##   yes          0 2.302896e-19 1.394266e-23
## 
## $correction
## [1] FALSE
## 
## attr(,"method")
## [1] "median-unbiased estimate & mid-p exact CI"
di<-table(d$deny,d$insurance)
addmargins(di)
##      
##         no  yes  Sum
##   no  2091    4 2095
##   yes  241   44  285
##   Sum 2332   48 2380
ggplot(d, aes(deny, fill = insurance)) + geom_bar(position = 'dodge')

RelRisk(di)
## [1] 1.180315
riskratio(di)
## $data
##        
##           no yes Total
##   no    2091   4  2095
##   yes    241  44   285
##   Total 2332  48  2380
## 
## $measure
##      risk ratio with 95% C.I.
##       estimate    lower    upper
##   no   1.00000       NA       NA
##   yes 80.85965 29.27297 223.3556
## 
## $p.value
##      two-sided
##       midp.exact fisher.exact   chi.square
##   no          NA           NA           NA
##   yes          0 1.587383e-37 3.753633e-66
## 
## $correction
## [1] FALSE
## 
## attr(,"method")
## [1] "Unconditional MLE & normal approximation (Wald) CI"
oddsratio(di)
## $data
##        
##           no yes Total
##   no    2091   4  2095
##   yes    241  44   285
##   Total 2332  48  2380
## 
## $measure
##      odds ratio with 95% C.I.
##       estimate    lower    upper
##   no   1.00000       NA       NA
##   yes 91.80233 36.72332 314.0005
## 
## $p.value
##      two-sided
##       midp.exact fisher.exact   chi.square
##   no          NA           NA           NA
##   yes          0 1.587383e-37 3.753633e-66
## 
## $correction
## [1] FALSE
## 
## attr(,"method")
## [1] "median-unbiased estimate & mid-p exact CI"
ds<-table(d$deny,d$single)
addmargins(ds)
##      
##         no  yes  Sum
##   no  1300  795 2095
##   yes  144  141  285
##   Sum 1444  936 2380
ggplot(d, aes(deny, fill = single)) + geom_bar(position = 'dodge')

RelRisk(ds)
## [1] 1.228123
riskratio(ds)
## $data
##        
##           no yes Total
##   no    1300 795  2095
##   yes    144 141   285
##   Total 1444 936  2380
## 
## $measure
##      risk ratio with 95% C.I.
##       estimate    lower    upper
##   no   1.00000       NA       NA
##   yes  1.30374 1.145409 1.483959
## 
## $p.value
##      two-sided
##         midp.exact fisher.exact   chi.square
##   no            NA           NA           NA
##   yes 0.0002185362 0.0002236294 0.0001859483
## 
## $correction
## [1] FALSE
## 
## attr(,"method")
## [1] "Unconditional MLE & normal approximation (Wald) CI"
oddsratio(ds)
## $data
##        
##           no yes Total
##   no    1300 795  2095
##   yes    144 141   285
##   Total 1444 936  2380
## 
## $measure
##      odds ratio with 95% C.I.
##       estimate    lower    upper
##   no  1.000000       NA       NA
##   yes 1.600842 1.248113 2.053054
## 
## $p.value
##      two-sided
##         midp.exact fisher.exact   chi.square
##   no            NA           NA           NA
##   yes 0.0002185362 0.0002236294 0.0001859483
## 
## $correction
## [1] FALSE
## 
## attr(,"method")
## [1] "median-unbiased estimate & mid-p exact CI"
dh<-table(d$deny,d$hschool)
addmargins(dh)
##      
##         no  yes  Sum
##   no    28 2067 2095
##   yes   11  274  285
##   Sum   39 2341 2380
ggplot(d, aes(deny, fill = hschool)) + geom_bar(position = 'dodge')

RelRisk(dh)
## [1] 0.346279
riskratio(dh)
## Warning in chisq.test(xx, correct = correction): Chi-squared approximation may
## be incorrect
## $data
##        
##         no  yes Total
##   no    28 2067  2095
##   yes   11  274   285
##   Total 39 2341  2380
## 
## $measure
##      risk ratio with 95% C.I.
##        estimate     lower     upper
##   no  1.0000000        NA        NA
##   yes 0.9744269 0.9515189 0.9978863
## 
## $p.value
##      two-sided
##        midp.exact fisher.exact  chi.square
##   no           NA           NA          NA
##   yes 0.005908764  0.004559804 0.001644918
## 
## $correction
## [1] FALSE
## 
## attr(,"method")
## [1] "Unconditional MLE & normal approximation (Wald) CI"
oddsratio(dh)
## Warning in chisq.test(xx, correct = correction): Chi-squared approximation may
## be incorrect
## $data
##        
##         no  yes Total
##   no    28 2067  2095
##   yes   11  274   285
##   Total 39 2341  2380
## 
## $measure
##      odds ratio with 95% C.I.
##        estimate     lower     upper
##   no  1.0000000        NA        NA
##   yes 0.3347397 0.1686599 0.7134814
## 
## $p.value
##      two-sided
##        midp.exact fisher.exact  chi.square
##   no           NA           NA          NA
##   yes 0.005908764  0.004559804 0.001644918
## 
## $correction
## [1] FALSE
## 
## attr(,"method")
## [1] "median-unbiased estimate & mid-p exact CI"
y <- d[d$deny == 'yes',]
prop.test(length(y$deny), length(d$deny))
## 
##  1-sample proportions test with continuity correction
## 
## data:  length(y$deny) out of length(d$deny), null probability 0.5
## X-squared = 1375, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
##  0.1071133 0.1336277
## sample estimates:
##         p 
## 0.1197479
y1 <- d[d$deny == 'no',]
prop.test(length(y1$deny), length(d$deny))
## 
##  1-sample proportions test with continuity correction
## 
## data:  length(y1$deny) out of length(d$deny), null probability 0.5
## X-squared = 1375, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
##  0.8663723 0.8928867
## sample estimates:
##         p 
## 0.8802521
i <- d[d$insurance == 'yes',]
prop.test(length(i$insurance), length(d$insurance))
## 
##  1-sample proportions test with continuity correction
## 
## data:  length(i$insurance) out of length(d$insurance), null probability 0.5
## X-squared = 2190, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
##  0.01506428 0.02687526
## sample estimates:
##          p 
## 0.02016807
i1 <- d[d$insurance == 'no',]
prop.test(length(i1$insurance), length(d$insurance))
## 
##  1-sample proportions test with continuity correction
## 
## data:  length(i1$insurance) out of length(d$insurance), null probability 0.5
## X-squared = 2190, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
##  0.9731247 0.9849357
## sample estimates:
##         p 
## 0.9798319
s <- d[d$single == 'yes',]
prop.test(length(s$single), length(d$single))
## 
##  1-sample proportions test with continuity correction
## 
## data:  length(s$single) out of length(d$single), null probability 0.5
## X-squared = 108, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
##  0.3736317 0.4132706
## sample estimates:
##         p 
## 0.3932773
s1 <- d[d$single == 'no',]
prop.test(length(s1$single), length(d$single))
## 
##  1-sample proportions test with continuity correction
## 
## data:  length(s1$single) out of length(d$single), null probability 0.5
## X-squared = 108, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
##  0.5867294 0.6263683
## sample estimates:
##         p 
## 0.6067227
a <- d[d$afam == 'yes',]
prop.test(length(a$afam), length(d$afam))
## 
##  1-sample proportions test with continuity correction
## 
## data:  length(a$afam) out of length(d$afam), null probability 0.5
## X-squared = 1215.7, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
##  0.1287703 0.1572732
## sample estimates:
##        p 
## 0.142437
a1 <- d[d$afam == 'no',]
prop.test(length(a1$afam), length(d$afam))
## 
##  1-sample proportions test with continuity correction
## 
## data:  length(a1$afam) out of length(d$afam), null probability 0.5
## X-squared = 1215.7, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
##  0.8427268 0.8712297
## sample estimates:
##        p 
## 0.857563
h <- d[d$hschool == 'yes',]
prop.test(length(h$hschool), length(d$hschool))
## 
##  1-sample proportions test with continuity correction
## 
## data:  length(h$hschool) out of length(d$hschool), null probability 0.5
## X-squared = 2224.6, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
##  0.9774371 0.9881677
## sample estimates:
##         p 
## 0.9836134
h1 <- d[d$hschool == 'no',]
prop.test(length(h1$hschool), length(d$hschool))
## 
##  1-sample proportions test with continuity correction
## 
## data:  length(h1$hschool) out of length(d$hschool), null probability 0.5
## X-squared = 2224.6, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
##  0.01183228 0.02256289
## sample estimates:
##          p 
## 0.01638655