library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.8
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggplot2)
library(readxl)
library(table1)
##
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
##
## units, units<-
library(compareGroups)
library(readxl)
arr = read.csv("C:\\UTS\\Arrest dataset.csv", header = TRUE)
ins = read_excel("C:\\UTS\\Insurance dataset.xlsx")
summary(ins)
## age sex bmi children
## Min. :18.00 Length:1338 Min. :15.96 Min. :0.000
## 1st Qu.:27.00 Class :character 1st Qu.:26.30 1st Qu.:0.000
## Median :39.00 Mode :character Median :30.40 Median :1.000
## Mean :39.21 Mean :30.66 Mean :1.095
## 3rd Qu.:51.00 3rd Qu.:34.69 3rd Qu.:2.000
## Max. :64.00 Max. :53.13 Max. :5.000
## smoker region charge
## Length:1338 Length:1338 Min. : 1122
## Class :character Class :character 1st Qu.: 4740
## Mode :character Mode :character Median : 9382
## Mean :13270
## 3rd Qu.:16640
## Max. :63770
# Dùng table1 tóm tắt dữ liệu ins
library(table1)
table1(~ age + sex + bmi + children + smoker + region + charge, data = ins)
Overall (N=1338) |
|
---|---|
age | |
Mean (SD) | 39.2 (14.0) |
Median [Min, Max] | 39.0 [18.0, 64.0] |
sex | |
female | 662 (49.5%) |
male | 676 (50.5%) |
bmi | |
Mean (SD) | 30.7 (6.10) |
Median [Min, Max] | 30.4 [16.0, 53.1] |
children | |
Mean (SD) | 1.09 (1.21) |
Median [Min, Max] | 1.00 [0, 5.00] |
smoker | |
no | 1064 (79.5%) |
yes | 274 (20.5%) |
region | |
northeast | 324 (24.2%) |
northwest | 325 (24.3%) |
southeast | 364 (27.2%) |
southwest | 325 (24.3%) |
charge | |
Mean (SD) | 13300 (12100) |
Median [Min, Max] | 9380 [1120, 63800] |
# Dùng table1 tóm tắt dữ liệu ins chia theo vùng miền (region)
table1(~ age + sex + bmi + children + smoker + charge | region, data = ins)
northeast (N=324) |
northwest (N=325) |
southeast (N=364) |
southwest (N=325) |
Overall (N=1338) |
|
---|---|---|---|---|---|
age | |||||
Mean (SD) | 39.3 (14.1) | 39.2 (14.1) | 38.9 (14.2) | 39.5 (14.0) | 39.2 (14.0) |
Median [Min, Max] | 39.5 [18.0, 64.0] | 39.0 [19.0, 64.0] | 39.0 [18.0, 64.0] | 39.0 [19.0, 64.0] | 39.0 [18.0, 64.0] |
sex | |||||
female | 161 (49.7%) | 164 (50.5%) | 175 (48.1%) | 162 (49.8%) | 662 (49.5%) |
male | 163 (50.3%) | 161 (49.5%) | 189 (51.9%) | 163 (50.2%) | 676 (50.5%) |
bmi | |||||
Mean (SD) | 29.2 (5.94) | 29.2 (5.14) | 33.4 (6.48) | 30.6 (5.69) | 30.7 (6.10) |
Median [Min, Max] | 28.9 [16.0, 48.1] | 28.9 [17.4, 42.9] | 33.3 [19.8, 53.1] | 30.3 [17.4, 47.6] | 30.4 [16.0, 53.1] |
children | |||||
Mean (SD) | 1.05 (1.20) | 1.15 (1.17) | 1.05 (1.18) | 1.14 (1.28) | 1.09 (1.21) |
Median [Min, Max] | 1.00 [0, 5.00] | 1.00 [0, 5.00] | 1.00 [0, 5.00] | 1.00 [0, 5.00] | 1.00 [0, 5.00] |
smoker | |||||
no | 257 (79.3%) | 267 (82.2%) | 273 (75.0%) | 267 (82.2%) | 1064 (79.5%) |
yes | 67 (20.7%) | 58 (17.8%) | 91 (25.0%) | 58 (17.8%) | 274 (20.5%) |
charge | |||||
Mean (SD) | 13400 (11300) | 12400 (11100) | 14700 (14000) | 12300 (11600) | 13300 (12100) |
Median [Min, Max] | 10100 [1690, 58600] | 8970 [1620, 60000] | 9290 [1120, 63800] | 8800 [1240, 52600] | 9380 [1120, 63800] |
library(compareGroups)
# Dùng package compareGroups tóm tắt dữ liệu arr chia theo biến finance
createTable(compareGroups(finance ~ age + race + prior + parole, data=arr))
##
## --------Summary descriptives table by 'finance'---------
##
## ___________________________________________
## no yes p.overall
## N=216 N=216
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
## age 24.2 (5.73) 25.0 (6.47) 0.203
## race: 0.241
## black 185 (85.6%) 194 (89.8%)
## other 31 (14.4%) 22 (10.2%)
## prior 2.99 (2.92) 2.98 (2.88) 0.987
## parole: 0.843
## no 81 (37.5%) 84 (38.9%)
## yes 135 (62.5%) 132 (61.1%)
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
createTable(compareGroups(arrest ~ race, data=arr))
##
## --------Summary descriptives table by 'arrest'---------
##
## ___________________________________________
## 0 1 p.overall
## N=318 N=114
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
## race: 0.621
## black 277 (87.1%) 102 (89.5%)
## other 41 (12.9%) 12 (10.5%)
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
chisq.test(arr$arrest, arr$race)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: arr$arrest and arr$race
## X-squared = 0.24452, df = 1, p-value = 0.621
# Bạn hãy thể hiện sự khác biệt bằng biểu đồ thanh (bar chart)
library(tidyverse)
arr %>% count(arrest, race) %>% group_by(arrest) %>% mutate(percent = n / sum(n) * 100) %>% ggplot(aes(x=arrest, y=percent, fill=race)) + geom_bar(stat="identity") + geom_text(aes(label=paste0(sprintf("%1.1f", percent),"%")), position=position_stack(vjust=0.5)) + theme(legend.position="none") + labs(x="Arrest", y="Percent (%)")
createTable(compareGroups(arrest ~ finance, data = arr))
##
## --------Summary descriptives table by 'arrest'---------
##
## _________________________________________
## 0 1 p.overall
## N=318 N=114
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
## finance: 0.063
## no 150 (47.2%) 66 (57.9%)
## yes 168 (52.8%) 48 (42.1%)
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
chisq.test(arr$finance, arr$arrest)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: arr$finance and arr$arrest
## X-squared = 3.4439, df = 1, p-value = 0.06349
# (6.1) Kiểm định mối liên quan giữa trình độ học vấn và nguy cơ tái phạm
arr$edu = as.factor(arr$educ)
createTable(compareGroups(arrest ~ edu, data = arr))
## Warning in chisq.test(xx, correct = FALSE): Chi-squared approximation may be
## incorrect
## Warning in chisq.test(xx, correct = FALSE): Chi-squared approximation may be
## incorrect
##
## --------Summary descriptives table by 'arrest'---------
##
## ______________________________________
## 0 1 p.overall
## N=318 N=114
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
## edu: 0.023
## 2 20 (6.29%) 4 (3.51%)
## 3 162 (50.9%) 77 (67.5%)
## 4 92 (28.9%) 27 (23.7%)
## 5 34 (10.7%) 5 (4.39%)
## 6 10 (3.14%) 1 (0.88%)
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
chisq.test(arr$arrest, arr$edu)
## Warning in chisq.test(arr$arrest, arr$edu): Chi-squared approximation may be
## incorrect
##
## Pearson's Chi-squared test
##
## data: arr$arrest and arr$edu
## X-squared = 11.577, df = 4, p-value = 0.02079
# (6.2) Các bạn hãy thể hiện dữ liệu trên bằng một biểu đồ
arr %>% count(arrest, edu) %>% group_by(arrest) %>% mutate(percent = n / sum(n) * 100) %>% ggplot(aes(x=arrest, y=percent, fill=edu)) + geom_bar(stat="identity") + geom_text(aes(label=paste0(sprintf("%1.1f", percent),"%")), position=position_stack(vjust=0.5)) + theme(legend.position="none") + labs(x="Arrest", y="Percent (%)")
# (7.1) Có phải người tái phạm tuổi trẻ hơn người không tái phạm? Có phải người tái phạm có tiền sử phạm tội nhiều hơn nhóm không tái phạm?
createTable(compareGroups(arrest ~ age, data = arr))
##
## --------Summary descriptives table by 'arrest'---------
##
## _____________________________________
## 0 1 p.overall
## N=318 N=114
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
## age 25.3 (6.31) 22.8 (5.12) <0.001
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
t.test(arr$age ~ arr$arrest)
##
## Welch Two Sample t-test
##
## data: arr$age by arr$arrest
## t = 4.1789, df = 243.6, p-value = 4.086e-05
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## 1.317143 3.665975
## sample estimates:
## mean in group 0 mean in group 1
## 25.25472 22.76316
createTable(compareGroups(arrest ~ prior, data = arr))
##
## --------Summary descriptives table by 'arrest'---------
##
## _______________________________________
## 0 1 p.overall
## N=318 N=114
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
## prior 2.70 (2.55) 3.77 (3.59) 0.004
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
t.test(arr$prior ~ arr$arrest)
##
## Welch Two Sample t-test
##
## data: arr$prior by arr$arrest
## t = -2.9319, df = 155.9, p-value = 0.003877
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -1.7920133 -0.3493306
## sample estimates:
## mean in group 0 mean in group 1
## 2.701258 3.771930
# (7.2) Hãy thể hiện sự khác biệt trên bằng một biểu đồ
ggplot(arr, aes(group=arrest, x=arrest, y=age, fill=arrest, color=arrest)) + geom_boxplot(color="black") + geom_jitter(aes(color=arrest), alpha=0.5) + theme(legend.position="none") + labs(x="Arrest", y="Age (year)")
# (8.1) Hãy kiểm định sự khác biệt về tiền bảo hiểm (charge) giữa nam và nữ (sex)
ins$gender[ins$sex == "female"] = 2
## Warning: Unknown or uninitialised column: `gender`.
ins$gender[ins$sex == "male"] = 1
t.test(ins$charge ~ ins$gender)
##
## Welch Two Sample t-test
##
## data: ins$charge by ins$gender
## t = 2.1009, df = 1313.4, p-value = 0.03584
## alternative hypothesis: true difference in means between group 1 and group 2 is not equal to 0
## 95 percent confidence interval:
## 91.85535 2682.48932
## sample estimates:
## mean in group 1 mean in group 2
## 13956.75 12569.58
## Hãy thể hiện bằng biểu đồ hộp
ggplot(ins, aes(group=gender, x=gender, y=charge, fill=gender, color=gender)) + geom_boxplot(colour= "black") + geom_jitter(aes(color=gender), alpha=0.5) + theme(legend.position="none") + labs(x="Gender", y="Charge")
# (8.2) Hãy kiểm tra xem tiền bảo hiểm (charge) có tuân theo luật phân bố chuẩn không?
ggplot(ins, aes(x=charge)) + geom_histogram(aes(y=..density..), fill="blue", col="white") + geom_density(col="purple") + labs(x="Charge", y="Charge amount", title="Distribution of charge")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Kiểm định khác
men = ins$charge[ins$sex=="male"]
women = ins$charge[ins$sex=="female"]
n = length(men)
m <- length(women)
B = 1000
difference <- numeric(B)
for (i in 1:B) {
bs.men = sample(men, n, replace=T)
bs.women = sample(women, m, replace=T)
difference[i] = mean(bs.men, na.rm=T) - mean(bs.women, na.rm=T)
}
hist(difference, breaks=20)
quantile(difference, probs=c(0.025, 0.50, 0.975))
## 2.5% 50% 97.5%
## 47.28669 1378.03271 2624.32926
### Sử dụng package 'simpleboot'
library(simpleboot)
## Simple Bootstrap Routines (1.1-7)
men.only = ins %>% filter(sex == "male")
women.only = ins %>% filter(sex == "female")
set.seed(1234)
b.means = two.boot(women.only$charge, men.only$charge, mean, R = 1000)
hist (b.means$t, breaks = 20)
quantile(b.means$t, probs=c(0.025, 0.50, 0.975))
## 2.5% 50% 97.5%
## -2786.3075 -1390.4840 -107.7096
# (9.1) Hãy kiểm định sự khác biệt về tiền bảo hiểm (charge) giữa 4 vùng miền (region)
summary(aov(charge ~ region, data = ins))
## Df Sum Sq Mean Sq F value Pr(>F)
## region 3 1.301e+09 433586560 2.97 0.0309 *
## Residuals 1334 1.948e+11 146007093
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
TukeyHSD(aov(charge ~ region, data = ins))
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = charge ~ region, data = ins)
##
## $region
## diff lwr upr p adj
## northwest-northeast -988.8091 -3428.93434 1451.31605 0.7245243
## southeast-northeast 1329.0269 -1044.94167 3702.99551 0.4745046
## southwest-northeast -1059.4471 -3499.57234 1380.67806 0.6792086
## southeast-northwest 2317.8361 -54.19944 4689.87157 0.0582938
## southwest-northwest -70.6380 -2508.88256 2367.60656 0.9998516
## southwest-southeast -2388.4741 -4760.50957 -16.43855 0.0476896
## Hãy thể hiện bằng biểu đồ hộp
ggplot(ins, aes(group=region, x= region, y=charge, fill= region, color= region)) + geom_boxplot(colour= "black") + geom_jitter(aes(color=region))+theme(legend.position="none")+ labs(x="Region", y="Charge")
# (9.2) Hãy kiểm tra xem tiền bảo hiểm (charge) có tuân theo luật phân bố chuẩn không?
ggplot(ins, aes(x=charge)) + geom_histogram(aes(y=..density..), fill="blue", col="white") + geom_density(col="purple") + labs(x="Charge", y="Charge amount", title="Distribution of charge")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Kiểm định khác
#install.packages("lmboot", dependencies = T)
library(lmboot)
anova <- ANOVA.boot(charge ~ region, seed = 1234, B = 3000, data = ins)
anova$`p-values`
## [1] 0.027
## Bootstrap cho so sánh các nhóm
lm.b <- wild.boot(charge ~ region, B = 100, seed = 1234, data = ins)
## Warning in wild.boot(charge ~ region, B = 100, seed = 1234, data = ins): Number of bootstrap samples is recommended to be more than the number of observations.
quantile(lm.b$bootEstParam[,1], probs=c(.025, .5, .975)) # intercept (NorthEast)
## 2.5% 50% 97.5%
## 12239.05 13438.29 14592.53
quantile(lm.b$bootEstParam[,2], probs=c(.025, .5, .975)) # NorthWest - NorthEast
## 2.5% 50% 97.5%
## -2580.6522 -1011.0974 794.8059
quantile(lm.b$bootEstParam[,3], probs=c(.025, .5, .975)) # SouthEast - NorthEast
## 2.5% 50% 97.5%
## -165.305 1369.632 3200.967
quantile(lm.b$bootEstParam[,4], probs=c(.025, .5, .975)) # SouthWest - NorthEast
## 2.5% 50% 97.5%
## -2852.8361 -1187.2188 386.2501