library("webshot2")

## Warning: package 'webshot2' was built under R version 4.3.1

library("epitools")
library("DescTools")

## Warning: package 'DescTools' was built under R version 4.3.1

library("ggplot2")

## Warning: package 'ggplot2' was built under R version 4.3.1

library("caret")

## Warning: package 'caret' was built under R version 4.3.1

## Loading required package: lattice

## 
## Attaching package: 'caret'

## The following objects are masked from 'package:DescTools':
## 
##     MAE, RMSE

library(AER)

## Loading required package: car

## Loading required package: carData

## 
## Attaching package: 'car'

## The following object is masked from 'package:DescTools':
## 
##     Recode

## Loading required package: lmtest

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

## Loading required package: sandwich

## Loading required package: survival

## 
## Attaching package: 'survival'

## The following object is masked from 'package:caret':
## 
##     cluster

## The following object is masked from 'package:epitools':
## 
##     ratetable

library(DT)

## Warning: package 'DT' was built under R version 4.3.1

data("NMES1988")
c<-NMES1988

1 Câu 5

Ước lượng mô hình hồi quy

1.1 Hồi quy với hàm logit

MH1 <- glm(insurance~afam + gender + married + employed + medicaid , family = binomial(link = 'logit'), data = c)
summary(MH1)

## 
## Call:
## glm(formula = insurance ~ afam + gender + married + employed + 
##     medicaid, family = binomial(link = "logit"), data = c)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  1.67111    0.07026  23.785  < 2e-16 ***
## afamyes     -1.58561    0.11278 -14.060  < 2e-16 ***
## gendermale  -0.37716    0.09555  -3.947 7.91e-05 ***
## marriedyes   0.62911    0.09411   6.685 2.31e-11 ***
## employedyes  0.31328    0.15180   2.064    0.039 *  
## medicaidyes -3.10146    0.15195 -20.411  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4682.5  on 4405  degrees of freedom
## Residual deviance: 3610.0  on 4400  degrees of freedom
## AIC: 3622
## 
## Number of Fisher Scoring iterations: 4

# Giá trị BrierScore
BrierScore(MH1)

## [1] 0.1240728

# Ma trận nhầm lẫn
confusionMatrix(table(predict(MH1, type = "response")>=0.5, MH1$data$insurance == 'yes'))

## Confusion Matrix and Statistics
## 
##        
##         FALSE TRUE
##   FALSE   374   82
##   TRUE    611 3339
##                                           
##                Accuracy : 0.8427          
##                  95% CI : (0.8316, 0.8533)
##     No Information Rate : 0.7764          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.4398          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.37970         
##             Specificity : 0.97603         
##          Pos Pred Value : 0.82018         
##          Neg Pred Value : 0.84532         
##              Prevalence : 0.22356         
##          Detection Rate : 0.08488         
##    Detection Prevalence : 0.10350         
##       Balanced Accuracy : 0.67786         
##                                           
##        'Positive' Class : FALSE           
##

1.2 Hồi quy với hàm probit

MH2 <- glm(insurance~afam +  gender + married + employed + medicaid  , family = binomial(link = 'probit'), data = c)
summary(MH2)

## 
## Call:
## glm(formula = insurance ~ afam + gender + married + employed + 
##     medicaid, family = binomial(link = "probit"), data = c)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.99574    0.03889  25.603  < 2e-16 ***
## afamyes     -0.90863    0.06723 -13.516  < 2e-16 ***
## gendermale  -0.20788    0.05247  -3.962 7.44e-05 ***
## marriedyes   0.34546    0.05176   6.674 2.50e-11 ***
## employedyes  0.16218    0.08120   1.997   0.0458 *  
## medicaidyes -1.81734    0.08407 -21.617  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4682.5  on 4405  degrees of freedom
## Residual deviance: 3614.0  on 4400  degrees of freedom
## AIC: 3626
## 
## Number of Fisher Scoring iterations: 4

# Giá trị BrierScore
BrierScore(MH2)

## [1] 0.1242269

# Ma trận nhầm lẫn
confusionMatrix(table(predict(MH2, type = "response")>=0.5, MH2$data$insurance == 'yes'))

## Confusion Matrix and Statistics
## 
##        
##         FALSE TRUE
##   FALSE   374   82
##   TRUE    611 3339
##                                           
##                Accuracy : 0.8427          
##                  95% CI : (0.8316, 0.8533)
##     No Information Rate : 0.7764          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.4398          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.37970         
##             Specificity : 0.97603         
##          Pos Pred Value : 0.82018         
##          Neg Pred Value : 0.84532         
##              Prevalence : 0.22356         
##          Detection Rate : 0.08488         
##    Detection Prevalence : 0.10350         
##       Balanced Accuracy : 0.67786         
##                                           
##        'Positive' Class : FALSE           
##

1.3 Hồi quy với hàm cloglog

MH3 <- glm(insurance~afam +  gender + married + employed + medicaid  , family = binomial(link = 'cloglog'), data = c)
summary(MH3)

## 
## Call:
## glm(formula = insurance ~ afam + gender + married + employed + 
##     medicaid, family = binomial(link = "cloglog"), data = c)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.61192    0.03329  18.380  < 2e-16 ***
## afamyes     -0.91623    0.07381 -12.413  < 2e-16 ***
## gendermale  -0.17797    0.04456  -3.994 6.49e-05 ***
## marriedyes   0.28824    0.04431   6.506 7.74e-11 ***
## employedyes  0.11991    0.06576   1.824   0.0682 .  
## medicaidyes -2.20318    0.13108 -16.808  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4682.5  on 4405  degrees of freedom
## Residual deviance: 3605.8  on 4400  degrees of freedom
## AIC: 3617.8
## 
## Number of Fisher Scoring iterations: 5

# Giá trị BrierScore
BrierScore(MH3)

## [1] 0.1240329

# Ma trận nhầm lẫn
confusionMatrix(table(predict(MH3, type = "response")>=0.5, MH3$data$insurance == 'yes'))

## Confusion Matrix and Statistics
## 
##        
##         FALSE TRUE
##   FALSE   374   82
##   TRUE    611 3339
##                                           
##                Accuracy : 0.8427          
##                  95% CI : (0.8316, 0.8533)
##     No Information Rate : 0.7764          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.4398          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.37970         
##             Specificity : 0.97603         
##          Pos Pred Value : 0.82018         
##          Neg Pred Value : 0.84532         
##              Prevalence : 0.22356         
##          Detection Rate : 0.08488         
##    Detection Prevalence : 0.10350         
##       Balanced Accuracy : 0.67786         
##                                           
##        'Positive' Class : FALSE           
##

1.4 Lựa chọn mô hình

Mô hình	AIC	Deviance	Brier Score	Độ chính xác	Độ nhạy	Độ đặc hiệu
Logit	3622	3610.0	0.1240728	0.8427	0.37970	0.97603
Probit	3626	3614.0	0.1242269	0.8427	0.37970	0.97603
Cloglog	3617.8	3605.8	0.1240329	0.8427	0.37970	0.97603
Lựa chọn	MH3	MH3	MH3	MH3	MH3	MH3

Dựa vào các tiêu chí đánh giá một mô hình bao gồm AIC, Deviance, Brier Score và Confusion Matrix (Độ chính xác, độ nhạy, độ đặc hiệu) đều đưa ra kết quả cho thấy mô hình 3 - Mô hình sử dụng hàm cloglog là tốt nhất trong 3 mô hình được đề xuất.

Do đó mô hình hồi quy logistic với hàm cloglog là mô hình tốt nhất.

Mô hình hồi quy với hàm clolog:

\(cloglog(π)=log(−log(1−π)) = 0.61192 - 0.91623afmayes - 0.17797gendermale + 0.28824marriedyes + 0.11991employedyes - 2.20318medicaidyes\)

2 Câu 3,4

Thống kê mô tả

2.1 Thống kê mô tả cho 1 biến

Biến afam

#Tần số
table(c$afam)

## 
##   no  yes 
## 3890  516

#Tần suất
table(c$afam)/sum(table(c$afam))

## 
##       no      yes 
## 0.882887 0.117113

#Đồ thị cột
c |> ggplot(aes( x = afam, y = after_stat(count))) +
  geom_bar(fill = 'blue') +
  geom_text(aes(label = scales::percent( after_stat(count/sum(count)))), stat = 'count', color = 'black', vjust = 1.5) +
  theme_classic() + 
  labs(x = 'afam', y = 'Số người')

Có 516 cá nhân là người Mỹ gốc Phi chiếm 12% và 3890 cá nhân không phải người Mỹ gốc phi chiếm 88%.

Biến gender

#Tần số
table(c$gender)

## 
## female   male 
##   2628   1778

#Tần suất
table(c$gender)/sum(table(c$gender))

## 
##    female      male 
## 0.5964594 0.4035406

#Đồ thị cột
c |> ggplot(aes( x = gender, y = after_stat(count))) +
  geom_bar(fill = 'blue') +
  geom_text(aes(label = scales::percent( after_stat(count/sum(count)))), stat = 'count', color = 'black', vjust = 1.5) +
  theme_classic() + 
  labs(x = 'gender', y = 'Số người')

Có 2628 cá nhân là giới tính nữ chiếm 60% tổng thể và 1778 cá nhân thuộc giới tính nam chiếm 40% tổng thể.

Biến maried

#Tần số
table(c$married)

## 
##   no  yes 
## 2000 2406

#Tần suất
table(c$married)/sum(table(c$married))

## 
##        no       yes 
## 0.4539265 0.5460735

#Đồ thị cột
c |> ggplot(aes( x = married, y = after_stat(count))) +
  geom_bar(fill = 'blue') +
  geom_text(aes(label = scales::percent( after_stat(count/sum(count)))), stat = 'count', color = 'black', vjust = 1.5) +
  theme_classic() + 
  labs(x = 'married', y = 'Số người')

Có 2000 cá nhân là độc thân chiếm 45.4% tổng thể và 2406 cá nhân đã kết hôn chiếm 54.6% tổng thể.

Biến employed

#Tần số
table(c$employed)

## 
##   no  yes 
## 3951  455

#Tần suất
table(c$employed)/sum(table(c$employed))

## 
##        no       yes 
## 0.8967317 0.1032683

#Đồ thị cột
c |> ggplot(aes( x = employed, y = after_stat(count))) +
  geom_bar(fill = 'blue') +
  geom_text(aes(label = scales::percent( after_stat(count/sum(count)))), stat = 'count', color = 'black', vjust = 1.5) +
  theme_classic() + 
  labs(x = 'employed', y = 'Số người')

Có 3951 cá nhân không được tuyển dụng chiếm 90% tổng thể và 455 cá nhân được tuyển dụng chiếm 10% tổng thể.

Biến insurance

#Tần số
table(c$insurance)

## 
##   no  yes 
##  985 3421

#Tần suất
table(c$insurance)/sum(table(c$insurance))

## 
##        no       yes 
## 0.2235588 0.7764412

#Đồ thị cột
c |> ggplot(aes( x = insurance, y = after_stat(count))) +
  geom_bar(fill = 'blue') +
  geom_text(aes(label = scales::percent( after_stat(count/sum(count)))), stat = 'count', color = 'black', vjust = 1.5) +
  theme_classic() + 
  labs(x = 'insurance', y = 'Số người')

Có 985 cá nhân không được bảo hiểm tư nhân chiếm 22% tổng thể và 3421 cá nhân được bảo hiểm tư nhân chiếm 78% tổng thể.

Biến medicaid

#Tần số
table(c$medicaid)

## 
##   no  yes 
## 4004  402

#Tần suất
table(c$medicaid)/sum(table(c$medicaid))

## 
##         no        yes 
## 0.90876078 0.09123922

#Đồ thị cột
c |> ggplot(aes( x = medicaid, y = after_stat(count))) +
  geom_bar(fill = 'blue') +
  geom_text(aes(label = scales::percent( after_stat(count/sum(count)))), stat = 'count', color = 'black', vjust = 1.5) +
  theme_classic() + 
  labs(x = 'medicaid', y = 'Số người')

Có 4004 cá nhân không được Medicaid chi trả chiếm 91% tổng thể và 402 cá nhân được Medicaid chi trả chiếm 9% tổng thể.

Biến income

summary(c$income)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -1.0125  0.9122  1.6982  2.5271  3.1728 54.8351

#Đồ thị cột
hist(c$income, main = "Biểu đồ thể hiện thu nhập ",xlab = "thu nhập", ylab = "số người", col = "pink")

Dựa vào kết quả thống kê mô tả, ta thấy thu nhập dao động từ -1.0125 đến 54.8351 , trung bình (mean) là 2.5271. 1st Qu.(first quartile) = 0.9122 có nghĩa là 25% đối tượng nghiên cứu có tỷ lệ thanh toán trên thu nhập bằng hoặc nhỏ hơn 0.9122. Tương tự, 3rd Qu.(Third quartile) = 3.1728 có nghĩa là 75% đối tượng có tỷ lệ thanh toán trên thu nhập bằng hoặc thấp hơn 3.1728. Số trung vị (median) 1.6982 cũng có nghĩa là 50% đối tượng có tỷ lệ thanh toán trên thu nhập là 1.6982 trở xuống.

Biến age

summary(c$age)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   6.600   6.900   7.300   7.402   7.800  10.900

hist(c$age, main = "Biểu đồ thể hiện độ tuổi ",xlab = "độ tuổi", ylab = "số người", col = "pink")

Dựa vào kết quả thống kê mô tả, ta thấy độ tuổi dao động từ 6.6 đến 10.9 , trung bình (mean) là 7.402 1st Qu.(first quartile) = 6.900 có nghĩa là 25% đối tượng nghiên cứu có tỷ lệ thanh toán trên thu nhập bằng hoặc nhỏ hơn 6.900. Tương tự, 3rd Qu.(Third quartile) = 7.800 có nghĩa là 75% đối tượng có tỷ lệ thanh toán trên thu nhập bằng hoặc thấp hơn 7.800 Số trung vị (median) 7.300 cũng có nghĩa là 50% đối tượng có tỷ lệ thanh toán trên thu nhập là 7.300 trở xuống.

2.2 Thống kê mô tả cho 2 biến định tính

Biến insurance và afam

#Tần số
ia<-table(c$insurance,c$afam)
ia

##      
##         no  yes
##   no   681  304
##   yes 3209  212

ggplot(c, aes(insurance, fill = afam)) + geom_bar(position = 'dodge')

Có 681 cá nhân không là người Mỹ gốc Phi không được bảo hiểm tư nhân và có 3209 cá nhân không là người Mỹ gốc Phi được bảo hiểm tư nhân. Có 304 cá nhân là người Mỹ gốc Phi không được bảo hiểm tư nhân và có 212 cá nhân là người Mỹ gốc Phi được bảo hiểm tư nhân.

# Rủi ro tương đối
riskratio(ia)

## $data
##        
##           no yes Total
##   no     681 304   985
##   yes   3209 212  3421
##   Total 3890 516  4406
## 
## $measure
##      risk ratio with 95% C.I.
##        estimate     lower     upper
##   no  1.0000000        NA        NA
##   yes 0.2007916 0.1710319 0.2357293
## 
## $p.value
##      two-sided
##       midp.exact fisher.exact    chi.square
##   no          NA           NA            NA
##   yes          0 1.744536e-83 7.150272e-100
## 
## $correction
## [1] FALSE
## 
## attr(,"method")
## [1] "Unconditional MLE & normal approximation (Wald) CI"

#Tỷ lệ chênh
epitab(ia, method = "oddsratio")

## $tab
##      
##         no        p0 yes        p1 oddsratio     lower     upper      p.value
##   no   681 0.1750643 304 0.5891473 1.0000000        NA        NA           NA
##   yes 3209 0.8249357 212 0.4108527 0.1479925 0.1219082 0.1796579 1.744536e-83
## 
## $measure
## [1] "wald"
## 
## $conf.level
## [1] 0.95
## 
## $pvalue
## [1] "fisher.exact"

Biến insurance và gender

#Tần số
ig<-table(c$insurance,c$gender)
ig

##      
##       female male
##   no     627  358
##   yes   2001 1420

ggplot(c, aes(insurance, fill = gender)) + geom_bar(position = 'dodge')

Có 627 cá nhân là nữ không được bảo hiểm tư nhân và có 2001 cá nhân là nữ được bảo hiểm tư nhân. Có 358 cá nhân là nam không được bảo hiểm tư nhân và có 1420 cá nhân là nam được bảo hiểm tư nhân.

# Rủi ro tương đối
riskratio(ig)

## $data
##        
##         female male Total
##   no       627  358   985
##   yes     2001 1420  3421
##   Total   2628 1778  4406
## 
## $measure
##      risk ratio with 95% C.I.
##       estimate    lower    upper
##   no  1.000000       NA       NA
##   yes 1.142059 1.041969 1.251764
## 
## $p.value
##      two-sided
##        midp.exact fisher.exact  chi.square
##   no           NA           NA          NA
##   yes 0.003505651  0.003600429 0.003609613
## 
## $correction
## [1] FALSE
## 
## attr(,"method")
## [1] "Unconditional MLE & normal approximation (Wald) CI"

#Tỷ lệ chênh
epitab(ig, method = "oddsratio")

## $tab
##      
##       female        p0 male        p1 oddsratio    lower    upper     p.value
##   no     627 0.2385845  358 0.2013498   1.00000       NA       NA          NA
##   yes   2001 0.7614155 1420 0.7986502   1.24287 1.073426 1.439061 0.003600429
## 
## $measure
## [1] "wald"
## 
## $conf.level
## [1] 0.95
## 
## $pvalue
## [1] "fisher.exact"

Biến insurance và married

im<-table(c$insurance,c$married)
im

##      
##         no  yes
##   no   622  363
##   yes 1378 2043

ggplot(c, aes(insurance, fill = married)) + geom_bar(position = 'dodge')

Có 622 cá nhân không kết hôn không được bảo hiểm tư nhân và có 1378 cá nhân không kết hôn được bảo hiểm tư nhân. Có 363 cá nhân đã kết hôn không được bảo hiểm tư nhân và có 2043 cá nhân đã kết hôn được bảo hiểm tư nhân.

# Rủi ro tương đối
riskratio(im)

## $data
##        
##           no  yes Total
##   no     622  363   985
##   yes   1378 2043  3421
##   Total 2000 2406  4406
## 
## $measure
##      risk ratio with 95% C.I.
##       estimate    lower    upper
##   no  1.000000       NA       NA
##   yes 1.620485 1.486568 1.766465
## 
## $p.value
##      two-sided
##       midp.exact fisher.exact   chi.square
##   no          NA           NA           NA
##   yes          0 8.187682e-37 5.799732e-37
## 
## $correction
## [1] FALSE
## 
## attr(,"method")
## [1] "Unconditional MLE & normal approximation (Wald) CI"

#Tỷ lệ chênh
epitab(im, method = "oddsratio")

## $tab
##      
##         no    p0  yes        p1 oddsratio    lower    upper      p.value
##   no   622 0.311  363 0.1508728  1.000000       NA       NA           NA
##   yes 1378 0.689 2043 0.8491272  2.540405 2.194481 2.940858 8.187682e-37
## 
## $measure
## [1] "wald"
## 
## $conf.level
## [1] 0.95
## 
## $pvalue
## [1] "fisher.exact"

Biến insurance và employed

#Tần số
ie<-table(c$insurance,c$employed)
ie

##      
##         no  yes
##   no   921   64
##   yes 3030  391

ggplot(c, aes(insurance, fill = employed)) + geom_bar(position = 'dodge')

Có 921 cá nhân không được tuyển dụng không được bảo hiểm tư nhân và có 3030 cá nhân không được tuyển dụng được bảo hiểm tư nhân. Có 64 cá nhân được tuyển dụng không được bảo hiểm tư nhân và có 391 cá nhân được tuyển dụng được bảo hiểm tư nhân.

# Rủi ro tương đối
riskratio(ie)

## $data
##        
##           no yes Total
##   no     921  64   985
##   yes   3030 391  3421
##   Total 3951 455  4406
## 
## $measure
##      risk ratio with 95% C.I.
##       estimate    lower    upper
##   no  1.000000       NA       NA
##   yes 1.759057 1.363658 2.269103
## 
## $p.value
##      two-sided
##         midp.exact fisher.exact   chi.square
##   no            NA           NA           NA
##   yes 2.888423e-06  3.29812e-06 7.393233e-06
## 
## $correction
## [1] FALSE
## 
## attr(,"method")
## [1] "Unconditional MLE & normal approximation (Wald) CI"

#Tỷ lệ chênh
epitab(ie, method = "oddsratio")

## $tab
##      
##         no        p0 yes        p1 oddsratio    lower    upper     p.value
##   no   921 0.2331055  64 0.1406593  1.000000       NA       NA          NA
##   yes 3030 0.7668945 391 0.8593407  1.857008 1.411401 2.443301 3.29812e-06
## 
## $measure
## [1] "wald"
## 
## $conf.level
## [1] 0.95
## 
## $pvalue
## [1] "fisher.exact"

Biến insurance và medicaid

#Tần số
id<-table(c$insurance,c$medicaid)
id

##      
##         no  yes
##   no   644  341
##   yes 3360   61

ggplot(c, aes(insurance, fill = medicaid)) + geom_bar(position = 'dodge')

Có 644 cá nhân không được Medicaid chi trả không được bảo hiểm tư nhân và có 3360 cá nhân không được Medicaid chi trả được bảo hiểm tư nhân. Có 341 cá nhân được Medicaid chi trả không được bảo hiểm tư nhân và có 61 cá nhân được Medicaid chi trả được bảo hiểm tư nhân.

# Rủi ro tương đối
riskratio(id)

## $data
##        
##           no yes Total
##   no     644 341   985
##   yes   3360  61  3421
##   Total 4004 402  4406
## 
## $measure
##      risk ratio with 95% C.I.
##         estimate      lower      upper
##   no  1.00000000         NA         NA
##   yes 0.05150609 0.03959127 0.06700663
## 
## $p.value
##      two-sided
##       midp.exact fisher.exact    chi.square
##   no          NA           NA            NA
##   yes          0 2.16003e-177 2.771183e-218
## 
## $correction
## [1] FALSE
## 
## attr(,"method")
## [1] "Unconditional MLE & normal approximation (Wald) CI"

#Tỷ lệ chênh
epitab(id, method = "oddsratio")

## $tab
##      
##         no        p0 yes        p1  oddsratio     lower      upper      p.value
##   no   644 0.1608392 341 0.8482587 1.00000000        NA         NA           NA
##   yes 3360 0.8391608  61 0.1517413 0.03428641 0.0257783 0.04560263 2.16003e-177
## 
## $measure
## [1] "wald"
## 
## $conf.level
## [1] 0.95
## 
## $pvalue
## [1] "fisher.exact"

2.3 Thống kê suy diễn

2.3.1 Kiểm định tính độc lập cho 2 biến định tính

Biến deny với biến afam

Giả thuyết \(H_0\) : insurance, afam độc lập

chisq.test(table(c$insurance,c$afam))

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(c$insurance, c$afam)
## X-squared = 447.64, df = 1, p-value < 2.2e-16

Qua kết quả kiểm định cho ta p−value<2.2e−16<0.05, nên bác bỏ H_0, nghĩa là biến insurance và afam là có liên quan với nhau.

Biến insurance với biến gender

Giả thuyết \(H_0\) : insurance, gender độc lập

chisq.test(table(c$insurance,c$gender))

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(c$insurance, c$gender)
## X-squared = 8.2573, df = 1, p-value = 0.004059

Qua kết quả kiểm định cho ta p−value=0.004059<0.05, nên bác bỏ H_0, nghĩa là biến insurance và gender là có liên quan với nhau.

Biến insurance với biến married

Giả thuyết \(H_0\) : insurance, married độc lập

chisq.test(table(c$insurance,c$married))

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(c$insurance, c$married)
## X-squared = 160.41, df = 1, p-value < 2.2e-16

Qua kết quả kiểm định cho ta p−value<2.2e−16<0.05, nên bác bỏ H_0, nghĩa là biến insurance và married là có liên quan với nhau.

Biến insurance với biến employed

Giả thuyết \(H_0\) : insurance, employed độc lập

chisq.test(table(c$insurance,c$employed))

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(c$insurance, c$employed)
## X-squared = 19.56, df = 1, p-value = 9.751e-06

Qua kết quả kiểm định cho ta p−value= 9.751e-06<0.05, nên bác bỏ H_0, nghĩa là biến insurance và employed là có liên quan với nhau.

Biến insurance với medicaid

Giả thuyết \(H_0\) : insurance, employed độc lập

chisq.test(table(c$insurance,c$medicaid))

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(c$insurance, c$medicaid)
## X-squared = 990.58, df = 1, p-value < 2.2e-16

Qua kết quả kiểm định cho ta p−value< 2.2e-16<0.05, nên bác bỏ H_0, nghĩa là biến insurance và medicaid là có liên quan với nhau.

2.3.2 Khoảng ước lượng cho tỷ lệ

i <- c[c$insurance == 'yes',]
prop.test(length(i$insurance), length(c$insurance))

## 
##  1-sample proportions test with continuity correction
## 
## data:  length(i$insurance) out of length(c$insurance), null probability 0.5
## X-squared = 1345.7, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
##  0.7637858 0.7886106
## sample estimates:
##         p 
## 0.7764412

Với độ tin cậy 95%, ta có tỷ lệ người được bảo hiểm tư nhân so với tổng thể nằm trong khoảng 76.38% đến 78.86%.

a <- c[c$afam == 'yes',]
prop.test(length(a$afam), length(c$afam))

## 
##  1-sample proportions test with continuity correction
## 
## data:  length(a$afam) out of length(c$afam), null probability 0.5
## X-squared = 2582.2, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
##  0.1078407 0.1270603
## sample estimates:
##        p 
## 0.117113

Với độ tin cậy 95%, ta có tỷ lệ người Mỹ gốc Phi so với tổng thể nằm trong khoảng 10.78% đến 12.71%.

g <- c[c$gender == 'female',]
prop.test(length(g$gender), length(c$gender))

## 
##  1-sample proportions test with continuity correction
## 
## data:  length(g$gender) out of length(c$gender), null probability 0.5
## X-squared = 163.6, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
##  0.5817810 0.6109684
## sample estimates:
##         p 
## 0.5964594

Với độ tin cậy 95%, ta có tỷ lệ người có giới tính nữ so với tổng thể nằm trong khoảng 58.18% đến 61.1%.

m <- c[c$married == 'yes',]
prop.test(length(m$married), length(c$married))

## 
##  1-sample proportions test with continuity correction
## 
## data:  length(m$married) out of length(c$married), null probability 0.5
## X-squared = 37.228, df = 1, p-value = 1.051e-09
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
##  0.5312252 0.5608410
## sample estimates:
##         p 
## 0.5460735

Với độ tin cậy 95%, ta có tỷ lệ người đã kết hôn so với tổng thể nằm trong khoảng 53.12% đến 56.08%.

e <- c[c$employed == 'yes',]
prop.test(length(e$employed), length(c$employed))

## 
##  1-sample proportions test with continuity correction
## 
## data:  length(e$employed) out of length(c$employed), null probability 0.5
## X-squared = 2772.4, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
##  0.09451665 0.11271981
## sample estimates:
##         p 
## 0.1032683

Với độ tin cậy 95%, ta có tỷ lệ người được tuyển dụng so với tổng thể nằm trong khoảng 9.45% đến 11.27%.

d <- c[c$medicaid == 'yes',]
prop.test(length(d$medicaid), length(c$medicaid))

## 
##  1-sample proportions test with continuity correction
## 
## data:  length(d$medicaid) out of length(c$medicaid), null probability 0.5
## X-squared = 2943.1, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
##  0.08298051 0.10021957
## sample estimates:
##          p 
## 0.09123922

Với độ tin cậy 95%, ta có tỷ lệ người được Medicaid chi trả so với tổng thể nằm trong khoảng 8.3% đến 10.02%

3 Câu 2

Chọn 1 hoặc 2 biến định tính và 1 biến định lượng để làm biến phụ thuộc, giải thích lý do.

3.1 Chọn biến định tính làm biến phụ thuộc

Chọn biến insurance làm biến phụ thuộc. Đây là biến cho biết cá nhân có được bảo hiểm tư nhân hay không, biếu hiện là yes/no. Với lý do muốn xác định yếu tố bảo hiểm tư nhân bị ảnh hưởng như thế nào bởi các yếu tố như tuyển dụng, medicaid, giới tính hay là người Mỹ gốc Phi hay không.

3.2 Chọn biến định lượng là biến phụ thuộc

Chọn biến income làm biến phụ thuộc, đây là biến thu nhập gia đình tính theo USD. Với lý do muốn xác định thu nhập chịu ảnh hưởng của có yếu tố khác như thế nào.

4 Câu 1

Giải thích bộ dữ liệu

datatable(c)

Bộ dữ liệu 4406 quan sát và 19 biến:

visits: số lần khám tại văn phòng bác sĩ
nvisits: số lần khám tại văn phòng không phải bác sĩ
ovisits: số lần khám ngoại trú tại bệnh viện của bác sĩ
novisits: số lần khám ngoại trú tại bệnh viện không phải bác sĩ
emergency: thăm phòng cấp cứu
hospital: số lần nằm viện
health: yếu tố biểu thị tình trạng sức khỏe của bản thân, các mức độ là “kém”, “trung bình”,“xuất sắc”.
chronic: số bệnh mãn tính
adl: yếu tố cho biến cá nhân có tình trạng hạn chế các hoạt động sinh hoạt hàng ngày hay không. “hạn chế”/“bình thường”
region: hệ số chỉ khu vực, cấp độ là đông bắc. trung tây, tây, khác.
age: tuổi tính bằng năm
afam: là cá nhân người Mỹ gốc Phi hay không?
gender: yếu tố chỉ giới tính
married: cá nhân đã kết hôn chưa?
school: số năm học
income: thu nhập gia đình tính theo USD
employed: cá nhân có được tuyển dụng không?
insurance: cá nhân có được bảo hiểm tư nhân không?
medicaid: cá nhân có được Medicaid chi trả không?

PTĐLĐT

Lữ Thị Bảo Uyên

2023-06-12