library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.1
library(AER)
## Loading required package: car
## Loading required package: carData
## Loading required package: lmtest
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
## Loading required package: survival
data("HMDA")
d<-HMDA
Các biến phân tích bao gồm:
Biến định tính: deny,condomin,afam
Biến định lượng: mhist,chist,ivrat,unemp
table(d$deny)
##
## no yes
## 2095 285
table(d$deny)/sum(table(d$deny))
##
## no yes
## 0.8802521 0.1197479
d |> ggplot(aes( x = deny, y = after_stat(count))) +
geom_bar(fill = 'blue') +
geom_text(aes(label = scales::percent( after_stat(count/sum(count)))), stat = 'count', color = 'black', vjust = 1.5) +
theme_classic() +
labs(x = 'deny', y = 'Số người')
## Biến condomin
table(d$condomin)
##
## no yes
## 1694 686
table(d$condomin)/sum(table(d$condomin))
##
## no yes
## 0.7117647 0.2882353
d |> ggplot(aes( x = condomin, y = after_stat(count))) +
geom_bar(fill = 'blue') +
geom_text(aes(label = scales::percent( after_stat(count/sum(count)))), stat = 'count', color = 'black', vjust = 1.5) +
theme_classic() +
labs(x = 'condomin', y = 'Số người')
table(d$afam)
##
## no yes
## 2041 339
table(d$afam)/sum(table(d$afam))
##
## no yes
## 0.857563 0.142437
d |> ggplot(aes( x = afam, y = after_stat(count))) +
geom_bar(fill = 'blue') +
geom_text(aes(label = scales::percent( after_stat(count/sum(count)))), stat = 'count', color = 'black', vjust = 1.5) +
theme_classic() +
labs(x = 'afam', y = 'Số người')
table(cut(d$lvrat,4))
##
## (0.0181,0.502] (0.502,0.985] (0.985,1.47] (1.47,1.95]
## 270 2062 44 4
table(cut(d$lvrat,4))/sum(table(cut(d$lvrat,4)))
##
## (0.0181,0.502] (0.502,0.985] (0.985,1.47] (1.47,1.95]
## 0.113445378 0.866386555 0.018487395 0.001680672
d |> ggplot( aes( x =cut(d$lvrat,4) , y= after_stat(count))) + geom_bar(fill='blue') + geom_text(aes(label= scales :: percent(after_stat(count/sum(count)),accuracy=.01)), stat = 'count', color= 'black', vjust= -.5) + theme_classic() + xlab('tỷ lệ vay vốn') + ylab('số người')
table(d$chist)
##
## 1 2 3 4 5 6
## 1353 441 126 77 182 201
table(d$chist)/sum(table(d$chist))
##
## 1 2 3 4 5 6
## 0.56848739 0.18529412 0.05294118 0.03235294 0.07647059 0.08445378
d |> ggplot( aes( x =chist , y= after_stat(count))) + geom_bar(fill='blue') + geom_text(aes(label= scales :: percent(after_stat(count/sum(count)),accuracy=.01)), stat = 'count', color= 'black', vjust= -.5) + theme_classic() + xlab('số lần thanh toán tiêu dùng') + ylab('số người')
table(d$mhist)
##
## 1 2 3 4
## 747 1571 41 21
table(d$mhist)/sum(table(d$mhist))
##
## 1 2 3 4
## 0.313865546 0.660084034 0.017226891 0.008823529
d |> ggplot( aes( x =mhist , y= after_stat(count))) + geom_bar(fill='blue') + geom_text(aes(label= scales :: percent(after_stat(count/sum(count)),accuracy=.01)), stat = 'count', color= 'black', vjust= -.5) + theme_classic() + xlab('số lần thanh toán thế chấp') + ylab('số người')
table(cut(d$unemp,6))
##
## (1.79,3.27] (3.27,4.73] (4.73,6.2] (6.2,7.67] (7.67,9.13] (9.13,10.6]
## 1517 623 65 0 16 159
table(cut(d$unemp,6))/sum(table(cut(d$unemp,6)))
##
## (1.79,3.27] (3.27,4.73] (4.73,6.2] (6.2,7.67] (7.67,9.13] (9.13,10.6]
## 0.637394958 0.261764706 0.027310924 0.000000000 0.006722689 0.066806723
d |> ggplot( aes( x =cut(d$unemp,6) , y= after_stat(count))) + geom_bar(fill='blue') + geom_text(aes(label= scales :: percent(after_stat(count/sum(count)),accuracy=.01)), stat = 'count', color= 'black', vjust= -.5) + theme_classic() + xlab('tỷ lệ thất nghiệp') + ylab('số người')
chisq.test(table(d$deny,d$insurance))
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table(d$deny, d$insurance)
## X-squared = 287.48, df = 1, p-value < 2.2e-16
chisq.test(table(d$deny,d$afam))
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table(d$deny, d$afam)
## X-squared = 98.376, df = 1, p-value < 2.2e-16
chisq.test(table(d$deny,d$single))
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table(d$deny, d$single)
## X-squared = 13.489, df = 1, p-value = 0.0002399
chisq.test(table(d$deny,d$hschool))
## Warning in chisq.test(table(d$deny, d$hschool)): Chi-squared approximation may
## be incorrect
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table(d$deny, d$hschool)
## X-squared = 8.4052, df = 1, p-value = 0.003741
mh1 <- glm(deny~afam + insurance + single + hschool , family = binomial(link = 'logit'), data = d)
summary(mh1)
##
## Call:
## glm(formula = deny ~ afam + insurance + single + hschool, family = binomial(link = "logit"),
## data = d)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.4399 0.3767 -3.823 0.000132 ***
## afamyes 1.2466 0.1543 8.078 6.58e-16 ***
## insuranceyes 4.5517 0.5330 8.540 < 2e-16 ***
## singleyes 0.4113 0.1396 2.946 0.003217 **
## hschoolyes -1.1932 0.3801 -3.139 0.001694 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1744.2 on 2379 degrees of freedom
## Residual deviance: 1494.5 on 2375 degrees of freedom
## AIC: 1504.5
##
## Number of Fisher Scoring iterations: 5
mh2 <- glm(deny~afam + insurance + single + hschool , family = binomial(link = 'probit'), data = d)
summary(mh2)
##
## Call:
## glm(formula = deny ~ afam + insurance + single + hschool, family = binomial(link = "probit"),
## data = d)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.83197 0.22012 -3.780 0.000157 ***
## afamyes 0.67833 0.08660 7.832 4.78e-15 ***
## insuranceyes 2.62368 0.27016 9.712 < 2e-16 ***
## singleyes 0.22048 0.07242 3.045 0.002330 **
## hschoolyes -0.67503 0.22188 -3.042 0.002347 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1744.2 on 2379 degrees of freedom
## Residual deviance: 1493.8 on 2375 degrees of freedom
## AIC: 1503.8
##
## Number of Fisher Scoring iterations: 5
mh3 <- glm(deny~afam + insurance + single + hschool , family = binomial(link = 'cloglog'), data = d)
summary(mh3)
##
## Call:
## glm(formula = deny ~ afam + insurance + single + hschool, family = binomial(link = "cloglog"),
## data = d)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.6205 0.3180 -5.096 3.47e-07 ***
## afamyes 1.0593 0.1378 7.687 1.50e-14 ***
## insuranceyes 3.1197 0.2284 13.657 < 2e-16 ***
## singleyes 0.4143 0.1253 3.307 0.000944 ***
## hschoolyes -1.0302 0.3194 -3.226 0.001257 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1744.2 on 2379 degrees of freedom
## Residual deviance: 1497.6 on 2375 degrees of freedom
## AIC: 1507.6
##
## Number of Fisher Scoring iterations: 6
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(DescTools)
## Warning: package 'DescTools' was built under R version 4.3.1
##
## Attaching package: 'DescTools'
## The following object is masked from 'package:car':
##
## Recode
library(caret)
## Warning: package 'caret' was built under R version 4.3.1
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following objects are masked from 'package:DescTools':
##
## MAE, RMSE
## The following object is masked from 'package:survival':
##
## cluster
BrierScore(mh1)
## [1] 0.08850611
BrierScore(mh2)
## [1] 0.08845049
BrierScore(mh3)
## [1] 0.08859589
confusionMatrix(table(predict(mh1, type = "response")>=0.5, mh1$data$deny == 'yes'))
## Confusion Matrix and Statistics
##
##
## FALSE TRUE
## FALSE 2089 240
## TRUE 6 45
##
## Accuracy : 0.8966
## 95% CI : (0.8837, 0.9086)
## No Information Rate : 0.8803
## P-Value [Acc > NIR] : 0.006704
##
## Kappa : 0.2402
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9971
## Specificity : 0.1579
## Pos Pred Value : 0.8970
## Neg Pred Value : 0.8824
## Prevalence : 0.8803
## Detection Rate : 0.8777
## Detection Prevalence : 0.9786
## Balanced Accuracy : 0.5775
##
## 'Positive' Class : FALSE
##
confusionMatrix(table(predict(mh2, type = "response")>=0.5, mh2$data$deny == 'yes'))
## Confusion Matrix and Statistics
##
##
## FALSE TRUE
## FALSE 2089 240
## TRUE 6 45
##
## Accuracy : 0.8966
## 95% CI : (0.8837, 0.9086)
## No Information Rate : 0.8803
## P-Value [Acc > NIR] : 0.006704
##
## Kappa : 0.2402
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9971
## Specificity : 0.1579
## Pos Pred Value : 0.8970
## Neg Pred Value : 0.8824
## Prevalence : 0.8803
## Detection Rate : 0.8777
## Detection Prevalence : 0.9786
## Balanced Accuracy : 0.5775
##
## 'Positive' Class : FALSE
##
confusionMatrix(table(predict(mh3, type = "response")>=0.5, mh3$data$deny == 'yes'))
## Confusion Matrix and Statistics
##
##
## FALSE TRUE
## FALSE 2089 240
## TRUE 6 45
##
## Accuracy : 0.8966
## 95% CI : (0.8837, 0.9086)
## No Information Rate : 0.8803
## P-Value [Acc > NIR] : 0.006704
##
## Kappa : 0.2402
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9971
## Specificity : 0.1579
## Pos Pred Value : 0.8970
## Neg Pred Value : 0.8824
## Prevalence : 0.8803
## Detection Rate : 0.8777
## Detection Prevalence : 0.9786
## Balanced Accuracy : 0.5775
##
## 'Positive' Class : FALSE
##
da<-table(d$deny,d$afam)
addmargins(da)
##
## no yes Sum
## no 1852 243 2095
## yes 189 96 285
## Sum 2041 339 2380
ggplot(d, aes(deny, fill = afam)) + geom_bar(position = 'dodge')
RelRisk(da)
## [1] 1.33303
library("epitools")
##
## Attaching package: 'epitools'
## The following object is masked from 'package:survival':
##
## ratetable
riskratio(da)
## $data
##
## no yes Total
## no 1852 243 2095
## yes 189 96 285
## Total 2041 339 2380
##
## $measure
## risk ratio with 95% C.I.
## estimate lower upper
## no 1.00000 NA NA
## yes 2.90405 2.374608 3.551537
##
## $p.value
## two-sided
## midp.exact fisher.exact chi.square
## no NA NA NA
## yes 0 2.302896e-19 1.394266e-23
##
## $correction
## [1] FALSE
##
## attr(,"method")
## [1] "Unconditional MLE & normal approximation (Wald) CI"
oddsratio(da)
## $data
##
## no yes Total
## no 1852 243 2095
## yes 189 96 285
## Total 2041 339 2380
##
## $measure
## odds ratio with 95% C.I.
## estimate lower upper
## no 1.000000 NA NA
## yes 3.870131 2.918351 5.111518
##
## $p.value
## two-sided
## midp.exact fisher.exact chi.square
## no NA NA NA
## yes 0 2.302896e-19 1.394266e-23
##
## $correction
## [1] FALSE
##
## attr(,"method")
## [1] "median-unbiased estimate & mid-p exact CI"
di<-table(d$deny,d$insurance)
addmargins(di)
##
## no yes Sum
## no 2091 4 2095
## yes 241 44 285
## Sum 2332 48 2380
ggplot(d, aes(deny, fill = insurance)) + geom_bar(position = 'dodge')
RelRisk(di)
## [1] 1.180315
riskratio(di)
## $data
##
## no yes Total
## no 2091 4 2095
## yes 241 44 285
## Total 2332 48 2380
##
## $measure
## risk ratio with 95% C.I.
## estimate lower upper
## no 1.00000 NA NA
## yes 80.85965 29.27297 223.3556
##
## $p.value
## two-sided
## midp.exact fisher.exact chi.square
## no NA NA NA
## yes 0 1.587383e-37 3.753633e-66
##
## $correction
## [1] FALSE
##
## attr(,"method")
## [1] "Unconditional MLE & normal approximation (Wald) CI"
oddsratio(di)
## $data
##
## no yes Total
## no 2091 4 2095
## yes 241 44 285
## Total 2332 48 2380
##
## $measure
## odds ratio with 95% C.I.
## estimate lower upper
## no 1.00000 NA NA
## yes 91.80233 36.72332 314.0005
##
## $p.value
## two-sided
## midp.exact fisher.exact chi.square
## no NA NA NA
## yes 0 1.587383e-37 3.753633e-66
##
## $correction
## [1] FALSE
##
## attr(,"method")
## [1] "median-unbiased estimate & mid-p exact CI"
ds<-table(d$deny,d$single)
addmargins(ds)
##
## no yes Sum
## no 1300 795 2095
## yes 144 141 285
## Sum 1444 936 2380
ggplot(d, aes(deny, fill = single)) + geom_bar(position = 'dodge')
RelRisk(ds)
## [1] 1.228123
riskratio(ds)
## $data
##
## no yes Total
## no 1300 795 2095
## yes 144 141 285
## Total 1444 936 2380
##
## $measure
## risk ratio with 95% C.I.
## estimate lower upper
## no 1.00000 NA NA
## yes 1.30374 1.145409 1.483959
##
## $p.value
## two-sided
## midp.exact fisher.exact chi.square
## no NA NA NA
## yes 0.0002185362 0.0002236294 0.0001859483
##
## $correction
## [1] FALSE
##
## attr(,"method")
## [1] "Unconditional MLE & normal approximation (Wald) CI"
oddsratio(ds)
## $data
##
## no yes Total
## no 1300 795 2095
## yes 144 141 285
## Total 1444 936 2380
##
## $measure
## odds ratio with 95% C.I.
## estimate lower upper
## no 1.000000 NA NA
## yes 1.600842 1.248113 2.053054
##
## $p.value
## two-sided
## midp.exact fisher.exact chi.square
## no NA NA NA
## yes 0.0002185362 0.0002236294 0.0001859483
##
## $correction
## [1] FALSE
##
## attr(,"method")
## [1] "median-unbiased estimate & mid-p exact CI"
dh<-table(d$deny,d$hschool)
addmargins(dh)
##
## no yes Sum
## no 28 2067 2095
## yes 11 274 285
## Sum 39 2341 2380
ggplot(d, aes(deny, fill = hschool)) + geom_bar(position = 'dodge')
RelRisk(dh)
## [1] 0.346279
riskratio(dh)
## Warning in chisq.test(xx, correct = correction): Chi-squared approximation may
## be incorrect
## $data
##
## no yes Total
## no 28 2067 2095
## yes 11 274 285
## Total 39 2341 2380
##
## $measure
## risk ratio with 95% C.I.
## estimate lower upper
## no 1.0000000 NA NA
## yes 0.9744269 0.9515189 0.9978863
##
## $p.value
## two-sided
## midp.exact fisher.exact chi.square
## no NA NA NA
## yes 0.005908764 0.004559804 0.001644918
##
## $correction
## [1] FALSE
##
## attr(,"method")
## [1] "Unconditional MLE & normal approximation (Wald) CI"
oddsratio(dh)
## Warning in chisq.test(xx, correct = correction): Chi-squared approximation may
## be incorrect
## $data
##
## no yes Total
## no 28 2067 2095
## yes 11 274 285
## Total 39 2341 2380
##
## $measure
## odds ratio with 95% C.I.
## estimate lower upper
## no 1.0000000 NA NA
## yes 0.3347397 0.1686599 0.7134814
##
## $p.value
## two-sided
## midp.exact fisher.exact chi.square
## no NA NA NA
## yes 0.005908764 0.004559804 0.001644918
##
## $correction
## [1] FALSE
##
## attr(,"method")
## [1] "median-unbiased estimate & mid-p exact CI"
y <- d[d$deny == 'yes',]
prop.test(length(y$deny), length(d$deny))
##
## 1-sample proportions test with continuity correction
##
## data: length(y$deny) out of length(d$deny), null probability 0.5
## X-squared = 1375, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
## 0.1071133 0.1336277
## sample estimates:
## p
## 0.1197479
y1 <- d[d$deny == 'no',]
prop.test(length(y1$deny), length(d$deny))
##
## 1-sample proportions test with continuity correction
##
## data: length(y1$deny) out of length(d$deny), null probability 0.5
## X-squared = 1375, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
## 0.8663723 0.8928867
## sample estimates:
## p
## 0.8802521
i <- d[d$insurance == 'yes',]
prop.test(length(i$insurance), length(d$insurance))
##
## 1-sample proportions test with continuity correction
##
## data: length(i$insurance) out of length(d$insurance), null probability 0.5
## X-squared = 2190, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
## 0.01506428 0.02687526
## sample estimates:
## p
## 0.02016807
i1 <- d[d$insurance == 'no',]
prop.test(length(i1$insurance), length(d$insurance))
##
## 1-sample proportions test with continuity correction
##
## data: length(i1$insurance) out of length(d$insurance), null probability 0.5
## X-squared = 2190, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
## 0.9731247 0.9849357
## sample estimates:
## p
## 0.9798319
s <- d[d$single == 'yes',]
prop.test(length(s$single), length(d$single))
##
## 1-sample proportions test with continuity correction
##
## data: length(s$single) out of length(d$single), null probability 0.5
## X-squared = 108, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
## 0.3736317 0.4132706
## sample estimates:
## p
## 0.3932773
s1 <- d[d$single == 'no',]
prop.test(length(s1$single), length(d$single))
##
## 1-sample proportions test with continuity correction
##
## data: length(s1$single) out of length(d$single), null probability 0.5
## X-squared = 108, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
## 0.5867294 0.6263683
## sample estimates:
## p
## 0.6067227
a <- d[d$afam == 'yes',]
prop.test(length(a$afam), length(d$afam))
##
## 1-sample proportions test with continuity correction
##
## data: length(a$afam) out of length(d$afam), null probability 0.5
## X-squared = 1215.7, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
## 0.1287703 0.1572732
## sample estimates:
## p
## 0.142437
a1 <- d[d$afam == 'no',]
prop.test(length(a1$afam), length(d$afam))
##
## 1-sample proportions test with continuity correction
##
## data: length(a1$afam) out of length(d$afam), null probability 0.5
## X-squared = 1215.7, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
## 0.8427268 0.8712297
## sample estimates:
## p
## 0.857563
h <- d[d$hschool == 'yes',]
prop.test(length(h$hschool), length(d$hschool))
##
## 1-sample proportions test with continuity correction
##
## data: length(h$hschool) out of length(d$hschool), null probability 0.5
## X-squared = 2224.6, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
## 0.9774371 0.9881677
## sample estimates:
## p
## 0.9836134
h1 <- d[d$hschool == 'no',]
prop.test(length(h1$hschool), length(d$hschool))
##
## 1-sample proportions test with continuity correction
##
## data: length(h1$hschool) out of length(d$hschool), null probability 0.5
## X-squared = 2224.6, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
## 0.01183228 0.02256289
## sample estimates:
## p
## 0.01638655