Ngày thứ ba: giá trị P, so sánh 2 nhóm

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.8
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(ggplot2)
library(readxl)
library(table1)
## 
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
## 
##     units, units<-
library(compareGroups)

(3.1) Việc 1: Đọc dữ liệu

library(readxl)
arr = read.csv("C:\\UTS\\Arrest dataset.csv", header = TRUE)

ins = read_excel("C:\\UTS\\Insurance dataset.xlsx")

summary(ins)
##       age            sex                 bmi           children    
##  Min.   :18.00   Length:1338        Min.   :15.96   Min.   :0.000  
##  1st Qu.:27.00   Class :character   1st Qu.:26.30   1st Qu.:0.000  
##  Median :39.00   Mode  :character   Median :30.40   Median :1.000  
##  Mean   :39.21                      Mean   :30.66   Mean   :1.095  
##  3rd Qu.:51.00                      3rd Qu.:34.69   3rd Qu.:2.000  
##  Max.   :64.00                      Max.   :53.13   Max.   :5.000  
##     smoker             region              charge     
##  Length:1338        Length:1338        Min.   : 1122  
##  Class :character   Class :character   1st Qu.: 4740  
##  Mode  :character   Mode  :character   Median : 9382  
##                                        Mean   :13270  
##                                        3rd Qu.:16640  
##                                        Max.   :63770

(3.2) Việc 2: Dùng table1 tóm tắt dữ liệu ins

# Dùng table1 tóm tắt dữ liệu ins 

library(table1)

table1(~ age + sex + bmi + children + smoker + region + charge, data = ins)
Overall
(N=1338)
age
Mean (SD) 39.2 (14.0)
Median [Min, Max] 39.0 [18.0, 64.0]
sex
female 662 (49.5%)
male 676 (50.5%)
bmi
Mean (SD) 30.7 (6.10)
Median [Min, Max] 30.4 [16.0, 53.1]
children
Mean (SD) 1.09 (1.21)
Median [Min, Max] 1.00 [0, 5.00]
smoker
no 1064 (79.5%)
yes 274 (20.5%)
region
northeast 324 (24.2%)
northwest 325 (24.3%)
southeast 364 (27.2%)
southwest 325 (24.3%)
charge
Mean (SD) 13300 (12100)
Median [Min, Max] 9380 [1120, 63800]
# Dùng table1 tóm tắt dữ liệu ins chia theo vùng miền (region)

table1(~ age + sex + bmi + children + smoker + charge | region, data = ins)
northeast
(N=324)
northwest
(N=325)
southeast
(N=364)
southwest
(N=325)
Overall
(N=1338)
age
Mean (SD) 39.3 (14.1) 39.2 (14.1) 38.9 (14.2) 39.5 (14.0) 39.2 (14.0)
Median [Min, Max] 39.5 [18.0, 64.0] 39.0 [19.0, 64.0] 39.0 [18.0, 64.0] 39.0 [19.0, 64.0] 39.0 [18.0, 64.0]
sex
female 161 (49.7%) 164 (50.5%) 175 (48.1%) 162 (49.8%) 662 (49.5%)
male 163 (50.3%) 161 (49.5%) 189 (51.9%) 163 (50.2%) 676 (50.5%)
bmi
Mean (SD) 29.2 (5.94) 29.2 (5.14) 33.4 (6.48) 30.6 (5.69) 30.7 (6.10)
Median [Min, Max] 28.9 [16.0, 48.1] 28.9 [17.4, 42.9] 33.3 [19.8, 53.1] 30.3 [17.4, 47.6] 30.4 [16.0, 53.1]
children
Mean (SD) 1.05 (1.20) 1.15 (1.17) 1.05 (1.18) 1.14 (1.28) 1.09 (1.21)
Median [Min, Max] 1.00 [0, 5.00] 1.00 [0, 5.00] 1.00 [0, 5.00] 1.00 [0, 5.00] 1.00 [0, 5.00]
smoker
no 257 (79.3%) 267 (82.2%) 273 (75.0%) 267 (82.2%) 1064 (79.5%)
yes 67 (20.7%) 58 (17.8%) 91 (25.0%) 58 (17.8%) 274 (20.5%)
charge
Mean (SD) 13400 (11300) 12400 (11100) 14700 (14000) 12300 (11600) 13300 (12100)
Median [Min, Max] 10100 [1690, 58600] 8970 [1620, 60000] 9290 [1120, 63800] 8800 [1240, 52600] 9380 [1120, 63800]

(3.3) Việc 3: Dùng compareGroups tóm tắt dữ liệu arr

library(compareGroups)

# Dùng package compareGroups tóm tắt dữ liệu arr chia theo biến finance 

createTable(compareGroups(finance ~ age + race + prior + parole, data=arr))
## 
## --------Summary descriptives table by 'finance'---------
## 
## ___________________________________________ 
##               no          yes     p.overall 
##              N=216       N=216              
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯ 
## age       24.2 (5.73) 25.0 (6.47)   0.203   
## race:                               0.241   
##     black 185 (85.6%) 194 (89.8%)           
##     other 31 (14.4%)  22 (10.2%)            
## prior     2.99 (2.92) 2.98 (2.88)   0.987   
## parole:                             0.843   
##     no    81 (37.5%)  84 (38.9%)            
##     yes   135 (62.5%) 132 (61.1%)           
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯

(3.4) Việc 4: Kiểm định sự khác biệt về tỉ lệ tái phạm giữa người da đen và người không phải da đen

createTable(compareGroups(arrest ~ race, data=arr))
## 
## --------Summary descriptives table by 'arrest'---------
## 
## ___________________________________________ 
##                0           1      p.overall 
##              N=318       N=114              
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯ 
## race:                               0.621   
##     black 277 (87.1%) 102 (89.5%)           
##     other 41 (12.9%)  12 (10.5%)            
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
chisq.test(arr$arrest, arr$race)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  arr$arrest and arr$race
## X-squared = 0.24452, df = 1, p-value = 0.621
# Bạn hãy thể hiện sự khác biệt bằng biểu đồ thanh (bar chart)

library(tidyverse)

arr %>% count(arrest, race) %>% group_by(arrest) %>%  mutate(percent = n / sum(n) * 100) %>% ggplot(aes(x=arrest, y=percent, fill=race)) + geom_bar(stat="identity") + geom_text(aes(label=paste0(sprintf("%1.1f", percent),"%")), position=position_stack(vjust=0.5)) + theme(legend.position="none") + labs(x="Arrest", y="Percent (%)")

(3.5) Việc 5: Kiểm định giả thuyết vô hiệu rằng không có sự khác biệt về tỉ lệ tái phạm giữa nhóm được hỗ trợ tài chánh và nhóm không được hỗ trợ tài chánh.

createTable(compareGroups(arrest ~ finance, data = arr))
## 
## --------Summary descriptives table by 'arrest'---------
## 
## _________________________________________ 
##               0          1      p.overall 
##             N=318      N=114              
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯ 
## finance:                          0.063   
##     no   150 (47.2%) 66 (57.9%)           
##     yes  168 (52.8%) 48 (42.1%)           
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
chisq.test(arr$finance, arr$arrest)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  arr$finance and arr$arrest
## X-squared = 3.4439, df = 1, p-value = 0.06349

(3.6) Việc 6: Trình độ học vấn có liên quan đến tỉ lệ tái phạm?

# (6.1) Kiểm định mối liên quan giữa trình độ học vấn và nguy cơ tái phạm

arr$edu = as.factor(arr$educ) 

createTable(compareGroups(arrest ~ edu, data = arr))
## Warning in chisq.test(xx, correct = FALSE): Chi-squared approximation may be
## incorrect

## Warning in chisq.test(xx, correct = FALSE): Chi-squared approximation may be
## incorrect
## 
## --------Summary descriptives table by 'arrest'---------
## 
## ______________________________________ 
##            0          1      p.overall 
##          N=318      N=114              
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯ 
## edu:                           0.023   
##     2 20 (6.29%)  4 (3.51%)            
##     3 162 (50.9%) 77 (67.5%)           
##     4 92 (28.9%)  27 (23.7%)           
##     5 34 (10.7%)  5 (4.39%)            
##     6 10 (3.14%)  1 (0.88%)            
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
chisq.test(arr$arrest, arr$edu)
## Warning in chisq.test(arr$arrest, arr$edu): Chi-squared approximation may be
## incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  arr$arrest and arr$edu
## X-squared = 11.577, df = 4, p-value = 0.02079
# (6.2) Các bạn hãy thể hiện dữ liệu trên bằng một biểu đồ

arr %>% count(arrest, edu) %>% group_by(arrest) %>%  mutate(percent = n / sum(n) * 100) %>% ggplot(aes(x=arrest, y=percent, fill=edu)) + geom_bar(stat="identity") + geom_text(aes(label=paste0(sprintf("%1.1f", percent),"%")), position=position_stack(vjust=0.5)) + theme(legend.position="none") + labs(x="Arrest", y="Percent (%)")

(3.7) Việc 7: Kiểm định sự khác biệt về độ tuổi và tiền sử tội phạm giữa người tái phạm và người không tái phạm?

# (7.1) Có phải người tái phạm tuổi trẻ hơn người không tái phạm? Có phải người tái phạm có tiền sử phạm tội nhiều hơn nhóm không tái phạm?

createTable(compareGroups(arrest ~ age, data = arr))
## 
## --------Summary descriptives table by 'arrest'---------
## 
## _____________________________________ 
##          0           1      p.overall 
##        N=318       N=114              
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯ 
## age 25.3 (6.31) 22.8 (5.12)  <0.001   
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
t.test(arr$age ~ arr$arrest)
## 
##  Welch Two Sample t-test
## 
## data:  arr$age by arr$arrest
## t = 4.1789, df = 243.6, p-value = 4.086e-05
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  1.317143 3.665975
## sample estimates:
## mean in group 0 mean in group 1 
##        25.25472        22.76316
createTable(compareGroups(arrest ~ prior, data = arr))
## 
## --------Summary descriptives table by 'arrest'---------
## 
## _______________________________________ 
##            0           1      p.overall 
##          N=318       N=114              
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯ 
## prior 2.70 (2.55) 3.77 (3.59)   0.004   
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
t.test(arr$prior ~ arr$arrest)
## 
##  Welch Two Sample t-test
## 
## data:  arr$prior by arr$arrest
## t = -2.9319, df = 155.9, p-value = 0.003877
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -1.7920133 -0.3493306
## sample estimates:
## mean in group 0 mean in group 1 
##        2.701258        3.771930
# (7.2) Hãy thể hiện sự khác biệt trên bằng một biểu đồ

ggplot(arr, aes(group=arrest, x=arrest, y=age, fill=arrest, color=arrest)) + geom_boxplot(color="black") + geom_jitter(aes(color=arrest), alpha=0.5) + theme(legend.position="none") + labs(x="Arrest", y="Age (year)")

(3.8) Việc 8: kiểm định sự khác biệt về tiền bảo hiểm trong nghiên cứu bảo hiểm

# (8.1) Hãy kiểm định sự khác biệt về tiền bảo hiểm (charge) giữa nam và nữ (sex)

ins$gender[ins$sex == "female"] = 2
## Warning: Unknown or uninitialised column: `gender`.
ins$gender[ins$sex == "male"] = 1
t.test(ins$charge ~ ins$gender)
## 
##  Welch Two Sample t-test
## 
## data:  ins$charge by ins$gender
## t = 2.1009, df = 1313.4, p-value = 0.03584
## alternative hypothesis: true difference in means between group 1 and group 2 is not equal to 0
## 95 percent confidence interval:
##    91.85535 2682.48932
## sample estimates:
## mean in group 1 mean in group 2 
##        13956.75        12569.58
## Hãy thể hiện bằng biểu đồ hộp

ggplot(ins, aes(group=gender, x=gender, y=charge, fill=gender, color=gender)) + geom_boxplot(colour= "black") + geom_jitter(aes(color=gender), alpha=0.5) + theme(legend.position="none") + labs(x="Gender", y="Charge")

# (8.2) Hãy kiểm tra xem tiền bảo hiểm (charge) có tuân theo luật phân bố chuẩn không?

ggplot(ins, aes(x=charge)) + geom_histogram(aes(y=..density..), fill="blue", col="white") + geom_density(col="purple") + labs(x="Charge", y="Charge amount", title="Distribution of charge")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Kiểm định khác

men = ins$charge[ins$sex=="male"]
women = ins$charge[ins$sex=="female"]
n = length(men)
m <- length(women)
B = 1000
difference <- numeric(B)
for (i in 1:B) {
bs.men = sample(men, n, replace=T)
bs.women = sample(women, m, replace=T)
difference[i] = mean(bs.men, na.rm=T) - mean(bs.women, na.rm=T) 
}
hist(difference, breaks=20)

quantile(difference, probs=c(0.025, 0.50, 0.975))
##       2.5%        50%      97.5% 
##   47.28669 1378.03271 2624.32926
### Sử dụng package 'simpleboot'

library(simpleboot)
## Simple Bootstrap Routines (1.1-7)
men.only = ins %>% filter(sex == "male")
women.only = ins %>% filter(sex == "female")
set.seed(1234)
b.means = two.boot(women.only$charge, men.only$charge, mean, R = 1000)
hist (b.means$t, breaks = 20)

quantile(b.means$t, probs=c(0.025, 0.50, 0.975))
##       2.5%        50%      97.5% 
## -2786.3075 -1390.4840  -107.7096

(3.9) Việc 9: kiểm định sự khác biệt về tiền bảo hiểm giữa các vùng miền trong nghiên cứu bảo hiểm

# (9.1) Hãy kiểm định sự khác biệt về tiền bảo hiểm (charge) giữa 4 vùng miền (region) 

summary(aov(charge ~ region, data = ins))
##               Df    Sum Sq   Mean Sq F value Pr(>F)  
## region         3 1.301e+09 433586560    2.97 0.0309 *
## Residuals   1334 1.948e+11 146007093                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
TukeyHSD(aov(charge ~ region, data = ins))
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = charge ~ region, data = ins)
## 
## $region
##                           diff         lwr        upr     p adj
## northwest-northeast  -988.8091 -3428.93434 1451.31605 0.7245243
## southeast-northeast  1329.0269 -1044.94167 3702.99551 0.4745046
## southwest-northeast -1059.4471 -3499.57234 1380.67806 0.6792086
## southeast-northwest  2317.8361   -54.19944 4689.87157 0.0582938
## southwest-northwest   -70.6380 -2508.88256 2367.60656 0.9998516
## southwest-southeast -2388.4741 -4760.50957  -16.43855 0.0476896
## Hãy thể hiện bằng biểu đồ hộp

ggplot(ins, aes(group=region, x= region, y=charge, fill= region, color= region)) + geom_boxplot(colour= "black") + geom_jitter(aes(color=region))+theme(legend.position="none")+ labs(x="Region", y="Charge")

# (9.2) Hãy kiểm tra xem tiền bảo hiểm (charge) có tuân theo luật phân bố chuẩn không?
ggplot(ins, aes(x=charge)) + geom_histogram(aes(y=..density..), fill="blue", col="white") + geom_density(col="purple") + labs(x="Charge", y="Charge amount", title="Distribution of charge")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Kiểm định khác

#install.packages("lmboot", dependencies = T)
library(lmboot)
anova <- ANOVA.boot(charge ~ region, seed = 1234, B = 3000, data = ins)
anova$`p-values` 
## [1] 0.027
## Bootstrap cho so sánh các nhóm
lm.b <- wild.boot(charge ~ region, B = 100, seed = 1234, data = ins)
## Warning in wild.boot(charge ~ region, B = 100, seed = 1234, data = ins): Number of bootstrap samples is recommended to be more than the number of observations.
quantile(lm.b$bootEstParam[,1], probs=c(.025, .5, .975)) # intercept (NorthEast)
##     2.5%      50%    97.5% 
## 12239.05 13438.29 14592.53
quantile(lm.b$bootEstParam[,2], probs=c(.025, .5, .975)) # NorthWest - NorthEast
##       2.5%        50%      97.5% 
## -2580.6522 -1011.0974   794.8059
quantile(lm.b$bootEstParam[,3], probs=c(.025, .5, .975)) # SouthEast - NorthEast
##     2.5%      50%    97.5% 
## -165.305 1369.632 3200.967
quantile(lm.b$bootEstParam[,4], probs=c(.025, .5, .975)) # SouthWest - NorthEast
##       2.5%        50%      97.5% 
## -2852.8361 -1187.2188   386.2501