library(readxl)
arr = read.csv("D:/Downloads/tailieu/R course/Seminar TDT 2022/Tai lieu/Data set/Arrest dataset.csv")
ins = read_excel("D:/Downloads/tailieu/R course/Seminar TDT 2022/Tai lieu/Data set/Insurance dataset.xlsx")
library(table1)
## Warning: package 'table1' was built under R version 4.0.3
##
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
##
## units, units<-
table1(~., data=ins)
| Overall (N=1338) |
|
|---|---|
| age | |
| Mean (SD) | 39.2 (14.0) |
| Median [Min, Max] | 39.0 [18.0, 64.0] |
| sex | |
| female | 662 (49.5%) |
| male | 676 (50.5%) |
| bmi | |
| Mean (SD) | 30.7 (6.10) |
| Median [Min, Max] | 30.4 [16.0, 53.1] |
| children | |
| Mean (SD) | 1.09 (1.21) |
| Median [Min, Max] | 1.00 [0, 5.00] |
| smoker | |
| no | 1064 (79.5%) |
| yes | 274 (20.5%) |
| region | |
| northeast | 324 (24.2%) |
| northwest | 325 (24.3%) |
| southeast | 364 (27.2%) |
| southwest | 325 (24.3%) |
| charge | |
| Mean (SD) | 13300 (12100) |
| Median [Min, Max] | 9380 [1120, 63800] |
table1(~ age + sex + bmi + children + smoker + region + charge | region, data=ins)
| northeast (N=324) |
northwest (N=325) |
southeast (N=364) |
southwest (N=325) |
Overall (N=1338) |
|
|---|---|---|---|---|---|
| age | |||||
| Mean (SD) | 39.3 (14.1) | 39.2 (14.1) | 38.9 (14.2) | 39.5 (14.0) | 39.2 (14.0) |
| Median [Min, Max] | 39.5 [18.0, 64.0] | 39.0 [19.0, 64.0] | 39.0 [18.0, 64.0] | 39.0 [19.0, 64.0] | 39.0 [18.0, 64.0] |
| sex | |||||
| female | 161 (49.7%) | 164 (50.5%) | 175 (48.1%) | 162 (49.8%) | 662 (49.5%) |
| male | 163 (50.3%) | 161 (49.5%) | 189 (51.9%) | 163 (50.2%) | 676 (50.5%) |
| bmi | |||||
| Mean (SD) | 29.2 (5.94) | 29.2 (5.14) | 33.4 (6.48) | 30.6 (5.69) | 30.7 (6.10) |
| Median [Min, Max] | 28.9 [16.0, 48.1] | 28.9 [17.4, 42.9] | 33.3 [19.8, 53.1] | 30.3 [17.4, 47.6] | 30.4 [16.0, 53.1] |
| children | |||||
| Mean (SD) | 1.05 (1.20) | 1.15 (1.17) | 1.05 (1.18) | 1.14 (1.28) | 1.09 (1.21) |
| Median [Min, Max] | 1.00 [0, 5.00] | 1.00 [0, 5.00] | 1.00 [0, 5.00] | 1.00 [0, 5.00] | 1.00 [0, 5.00] |
| smoker | |||||
| no | 257 (79.3%) | 267 (82.2%) | 273 (75.0%) | 267 (82.2%) | 1064 (79.5%) |
| yes | 67 (20.7%) | 58 (17.8%) | 91 (25.0%) | 58 (17.8%) | 274 (20.5%) |
| region | |||||
| northeast | 324 (100%) | 0 (0%) | 0 (0%) | 0 (0%) | 324 (24.2%) |
| northwest | 0 (0%) | 325 (100%) | 0 (0%) | 0 (0%) | 325 (24.3%) |
| southeast | 0 (0%) | 0 (0%) | 364 (100%) | 0 (0%) | 364 (27.2%) |
| southwest | 0 (0%) | 0 (0%) | 0 (0%) | 325 (100%) | 325 (24.3%) |
| charge | |||||
| Mean (SD) | 13400 (11300) | 12400 (11100) | 14700 (14000) | 12300 (11600) | 13300 (12100) |
| Median [Min, Max] | 10100 [1690, 58600] | 8970 [1620, 60000] | 9290 [1120, 63800] | 8800 [1240, 52600] | 9380 [1120, 63800] |
library(compareGroups); library(tidyverse)
## Loading required package: SNPassoc
## Loading required package: haplo.stats
## Loading required package: arsenal
## Loading required package: survival
## Warning: package 'survival' was built under R version 4.0.5
## Loading required package: mvtnorm
## Registered S3 method overwritten by 'SNPassoc':
## method from
## summary.haplo.glm haplo.stats
## Warning: package 'tidyverse' was built under R version 4.0.3
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.1.2 v dplyr 1.0.6
## v tidyr 1.1.1 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## Warning: package 'ggplot2' was built under R version 4.0.5
## Warning: package 'tibble' was built under R version 4.0.5
## Warning: package 'dplyr' was built under R version 4.0.5
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
arr = arr %>% mutate(arrest1 = recode(arrest, "1"="Yes", "0"="No"),
fin = recode(finance, "yes" = 1, "no" = 0))
createTable(compareGroups(finance ~ age + race + work.exp + married + parole + prior + educ + employ1 + arrest1, data = arr))
##
## --------Summary descriptives table by 'finance'---------
##
## _________________________________________________
## no yes p.overall
## N=216 N=216
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
## age 24.2 (5.73) 25.0 (6.47) 0.203
## race: 0.241
## black 185 (85.6%) 194 (89.8%)
## other 31 (14.4%) 22 (10.2%)
## work.exp: 1.000
## no 93 (43.1%) 92 (42.6%)
## yes 123 (56.9%) 124 (57.4%)
## married: 0.557
## married 29 (13.4%) 24 (11.1%)
## not married 187 (86.6%) 192 (88.9%)
## parole: 0.843
## no 81 (37.5%) 84 (38.9%)
## yes 135 (62.5%) 132 (61.1%)
## prior 2.99 (2.92) 2.98 (2.88) 0.987
## educ 3.44 (0.84) 3.52 (0.82) 0.300
## employ1: 0.330
## no 182 (84.3%) 190 (88.0%)
## yes 34 (15.7%) 26 (12.0%)
## arrest1: 0.063
## No 150 (69.4%) 168 (77.8%)
## Yes 66 (30.6%) 48 (22.2%)
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
library(gmodels); library(ggplot2)
chisq.test(arr$race, arr$arrest1)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: arr$race and arr$arrest1
## X-squared = 0.24452, df = 1, p-value = 0.621
createTable(compareGroups(race ~ arrest1, data = arr))
##
## --------Summary descriptives table by 'race'---------
##
## _________________________________________
## black other p.overall
## N=379 N=53
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
## arrest1: 0.621
## No 277 (73.1%) 41 (77.4%)
## Yes 102 (26.9%) 12 (22.6%)
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
CrossTable(arr$race, arr$arrest1, digits=3, prop.t=F, prop.r=T, prop.c=F, chisq=T, expected=T)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Expected N |
## | Chi-square contribution |
## | N / Row Total |
## |-------------------------|
##
##
## Total Observations in Table: 432
##
##
## | arr$arrest1
## arr$race | No | Yes | Row Total |
## -------------|-----------|-----------|-----------|
## black | 277 | 102 | 379 |
## | 278.986 | 100.014 | |
## | 0.014 | 0.039 | |
## | 0.731 | 0.269 | 0.877 |
## -------------|-----------|-----------|-----------|
## other | 41 | 12 | 53 |
## | 39.014 | 13.986 | |
## | 0.101 | 0.282 | |
## | 0.774 | 0.226 | 0.123 |
## -------------|-----------|-----------|-----------|
## Column Total | 318 | 114 | 432 |
## -------------|-----------|-----------|-----------|
##
##
## Statistics for All Table Factors
##
##
## Pearson's Chi-squared test
## ------------------------------------------------------------
## Chi^2 = 0.4367282 d.f. = 1 p = 0.5087058
##
## Pearson's Chi-squared test with Yates' continuity correction
## ------------------------------------------------------------
## Chi^2 = 0.2445157 d.f. = 1 p = 0.6209635
##
##
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
## The following object is masked from 'package:arsenal':
##
## ordinal
show_col(hue_pal()(2))
arr %>% count(race, arrest1) %>% group_by(race) %>% mutate(pct = n/sum(n)*100) %>%
ggplot(aes(race, pct, fill=arrest1)) + geom_bar(stat="identity") + scale_fill_manual(values=c("#00BFC4", "#F8766D")) + labs(fill="Tái phạm") + ylab("Phần trăm (%)") + xlab("Chủng tộc") + geom_text(aes(label=paste0(sprintf("%1.1f", pct),"%")), position=position_stack(vjust=0.5))
chisq.test(arr$finance, arr$arrest1)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: arr$finance and arr$arrest1
## X-squared = 3.4439, df = 1, p-value = 0.06349
createTable(compareGroups(finance ~ arrest1, data = arr))
##
## --------Summary descriptives table by 'finance'---------
##
## __________________________________________
## no yes p.overall
## N=216 N=216
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
## arrest1: 0.063
## No 150 (69.4%) 168 (77.8%)
## Yes 66 (30.6%) 48 (22.2%)
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
chisq.test(arr$educ, arr$arrest1)
## Warning in chisq.test(arr$educ, arr$arrest1): Chi-squared approximation may be
## incorrect
##
## Pearson's Chi-squared test
##
## data: arr$educ and arr$arrest1
## X-squared = 11.577, df = 4, p-value = 0.02079
fisher.test(arr$educ, arr$arrest1)
##
## Fisher's Exact Test for Count Data
##
## data: arr$educ and arr$arrest1
## p-value = 0.02271
## alternative hypothesis: two.sided
CrossTable(arr$educ, arr$arrest1, digits=3, prop.t=F, prop.r=T, prop.c=F, chisq=T, fisher=T,expected=T)
## Warning in chisq.test(t, correct = FALSE, ...): Chi-squared approximation may be
## incorrect
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Expected N |
## | Chi-square contribution |
## | N / Row Total |
## |-------------------------|
##
##
## Total Observations in Table: 432
##
##
## | arr$arrest1
## arr$educ | No | Yes | Row Total |
## -------------|-----------|-----------|-----------|
## 2 | 20 | 4 | 24 |
## | 17.667 | 6.333 | |
## | 0.308 | 0.860 | |
## | 0.833 | 0.167 | 0.056 |
## -------------|-----------|-----------|-----------|
## 3 | 162 | 77 | 239 |
## | 175.931 | 63.069 | |
## | 1.103 | 3.077 | |
## | 0.678 | 0.322 | 0.553 |
## -------------|-----------|-----------|-----------|
## 4 | 92 | 27 | 119 |
## | 87.597 | 31.403 | |
## | 0.221 | 0.617 | |
## | 0.773 | 0.227 | 0.275 |
## -------------|-----------|-----------|-----------|
## 5 | 34 | 5 | 39 |
## | 28.708 | 10.292 | |
## | 0.975 | 2.721 | |
## | 0.872 | 0.128 | 0.090 |
## -------------|-----------|-----------|-----------|
## 6 | 10 | 1 | 11 |
## | 8.097 | 2.903 | |
## | 0.447 | 1.247 | |
## | 0.909 | 0.091 | 0.025 |
## -------------|-----------|-----------|-----------|
## Column Total | 318 | 114 | 432 |
## -------------|-----------|-----------|-----------|
##
##
## Statistics for All Table Factors
##
##
## Pearson's Chi-squared test
## ------------------------------------------------------------
## Chi^2 = 11.577 d.f. = 4 p = 0.02079029
##
##
##
## Fisher's Exact Test for Count Data
## ------------------------------------------------------------
## Alternative hypothesis: two.sided
## p = 0.02271227
##
##
createTable(compareGroups(educ ~ arrest1, data = arr))
##
## --------Summary descriptives table by 'educ'---------
##
## __________________________________________________________________________
## 2 3 4 5 6 p.overall
## N=24 N=239 N=119 N=39 N=11
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
## arrest1: 0.023
## No 20 (83.3%) 162 (67.8%) 92 (77.3%) 34 (87.2%) 10 (90.9%)
## Yes 4 (16.7%) 77 (32.2%) 27 (22.7%) 5 (12.8%) 1 (9.09%)
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
arr %>% count(educ, arrest1) %>% group_by(educ) %>% mutate(pct = n/sum(n)*100) %>%
ggplot(aes(educ, pct, fill=arrest1)) + geom_bar(stat="identity") + scale_fill_manual(values=c("#00BFC4", "#F8766D")) + labs(fill="Tái phạm") + ylab("Phần trăm (%)") + xlab("Trình độ học vấn") + geom_text(aes(label=paste0(sprintf("%1.1f", pct),"%")), position=position_stack(vjust=0.5))
createTable(compareGroups(arrest1 ~ age, data = arr))
##
## --------Summary descriptives table by 'arrest1'---------
##
## _____________________________________
## No Yes p.overall
## N=318 N=114
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
## age 25.3 (6.31) 22.8 (5.12) <0.001
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
t.test(arr$age ~ arr$arrest1)
##
## Welch Two Sample t-test
##
## data: arr$age by arr$arrest1
## t = 4.1789, df = 243.6, p-value = 4.086e-05
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 1.317143 3.665975
## sample estimates:
## mean in group No mean in group Yes
## 25.25472 22.76316
createTable(compareGroups(arrest1 ~ prior, data = arr))
##
## --------Summary descriptives table by 'arrest1'---------
##
## _______________________________________
## No Yes p.overall
## N=318 N=114
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
## prior 2.70 (2.55) 3.77 (3.59) 0.004
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
t.test(arr$prior ~ arr$arrest1)
##
## Welch Two Sample t-test
##
## data: arr$prior by arr$arrest1
## t = -2.9319, df = 155.9, p-value = 0.003877
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.7920133 -0.3493306
## sample estimates:
## mean in group No mean in group Yes
## 2.701258 3.771930
arr %>% ggplot((aes(x=arrest1, y=age, fill = arrest1, col = arrest1))) + geom_boxplot(col="black") + geom_jitter(alpha=0.5) + xlab("Tái phạm") + ylab("Tuổi (năm)") + theme(legend.position = "none") + scale_fill_manual(values=c("#00BFC4", "#F8766D")) + scale_colour_manual(values=c("#00BFC4", "#F8766D"))
arr %>% ggplot((aes(x=arrest1, y=age, fill = arrest1, col = arrest1))) + xlab("Tái phạm") + ylab("Tuổi (năm)") + theme(legend.position = "none") + geom_violin(trim=FALSE) + geom_boxplot(col="black", width=0.1) + scale_fill_manual(values=c("#00BFC4", "#F8766D")) + scale_colour_manual(values=c("#00BFC4", "#F8766D"))
t.test(ins$charge ~ ins$sex)
##
## Welch Two Sample t-test
##
## data: ins$charge by ins$sex
## t = -2.1009, df = 1313.4, p-value = 0.03584
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -2682.48932 -91.85535
## sample estimates:
## mean in group female mean in group male
## 12569.58 13956.75
ins %>% ggplot((aes(x=sex, y=charge, fill = sex, col = sex))) + geom_boxplot(col="black") + geom_jitter(alpha=0.5) + xlab("Giới tính") + ylab("Tiền bảo hiểm") + theme(legend.position = "none") + scale_fill_manual(values=c("#00BFC4", "#F8766D")) + scale_colour_manual(values=c("#00BFC4", "#F8766D"))
ins %>% ggplot((aes(x=sex, y=charge, fill = sex, col = sex))) + xlab("Giới tính") + ylab("Tiền bảo hiểm") + theme(legend.position = "none") + geom_violin(trim=FALSE) + geom_boxplot(col="black", width=0.1) + scale_fill_manual(values=c("#00BFC4", "#F8766D")) + scale_colour_manual(values=c("#00BFC4", "#F8766D"))
ins %>% ggplot(aes(x=charge)) + geom_histogram(aes(y=..density..), col="white", fill="blue") + geom_density(col="red") + xlab("Tiền bảo hiểm") + ylab("Xác suất") + ggtitle("Phân bố của tiền bảo hiểm") + theme(plot.title = element_text(lineheight = 0.8, face = "bold", hjust = 0.5))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
men = ins$charge[ins$sex == "male"]
women = ins$charge[ins$sex == "female"]
set.seed(123)
n1 = length(men); n2 = length(women)
B = 10000
diff = numeric(B)
for (i in 1:B) {
bs1 = sample(men, n1, replace=T)
bs2 = sample(women, n2, replace=T)
diff[i] = mean(bs1, na.rm=T) - mean(bs2, na.rm=T)
}
hist(diff, breaks=20, col="blue", border="white")
quantile(diff, probs=c(0.025, 0.50, 0.975))
## 2.5% 50% 97.5%
## 105.1178 1379.4375 2690.7921
summary(aov(charge ~ region, data = ins))
## Df Sum Sq Mean Sq F value Pr(>F)
## region 3 1.301e+09 433586560 2.97 0.0309 *
## Residuals 1334 1.948e+11 146007093
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
TukeyHSD(aov(charge ~ region, data = ins))
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = charge ~ region, data = ins)
##
## $region
## diff lwr upr p adj
## northwest-northeast -988.8091 -3428.93434 1451.31605 0.7245243
## southeast-northeast 1329.0269 -1044.94167 3702.99551 0.4745046
## southwest-northeast -1059.4471 -3499.57234 1380.67806 0.6792086
## southeast-northwest 2317.8361 -54.19944 4689.87157 0.0582938
## southwest-northwest -70.6380 -2508.88256 2367.60656 0.9998516
## southwest-southeast -2388.4741 -4760.50957 -16.43855 0.0476896
ggplot(ins, aes(group=region, x= region, y=charge, fill= region, color= region)) + xlab("Giới tính") + ylab("Tiền bảo hiểm") + theme(legend.position = "none") + geom_violin(trim=FALSE) + geom_boxplot(col="black", width=0.1) +theme(legend.position="none")+ labs(x="Khu vực", y="Tiền bảo hiểm")
library(lmboot)
## Warning: package 'lmboot' was built under R version 4.0.5
anova <- ANOVA.boot(charge ~ region, seed = 1234, B = 1000, data = ins)
## Warning in ANOVA.boot(charge ~ region, seed = 1234, B = 1000, data = ins): Number of bootstrap samples is recommended to be more than the number of observations.
anova$`p-values`
## [1] 0.032
lm.b <- wild.boot(charge ~ region, B = 100, seed = 1234, data = ins)
## Warning in wild.boot(charge ~ region, B = 100, seed = 1234, data = ins): Number of bootstrap samples is recommended to be more than the number of observations.
quantile(lm.b$bootEstParam[,1], probs=c(.025, .5, .975)) # intercept (NorthEast)
## 2.5% 50% 97.5%
## 12239.05 13438.29 14592.53
quantile(lm.b$bootEstParam[,2], probs=c(.025, .5, .975)) # NorthWest - NorthEast
## 2.5% 50% 97.5%
## -2580.6522 -1011.0974 794.8059
quantile(lm.b$bootEstParam[,3], probs=c(.025, .5, .975)) # SouthEast - NorthEast
## 2.5% 50% 97.5%
## -165.305 1369.632 3200.967
quantile(lm.b$bootEstParam[,4], probs=c(.025, .5, .975)) # SouthWest - NorthEast
## 2.5% 50% 97.5%
## -2852.8361 -1187.2188 386.2501