# Load the two Excel workbooks used by the rest of the script.
# `hh` comes from "Hoa hau the gioi 1952-2020.xlsx".
hh <- read_excel(
  path = "~/Dropbox/Bai giang online/Datasets/Hoa hau the gioi 1952-2020.xlsx"
)
# `vn` comes from "Hoa hau Viet Nam.xlsx".
vn <- read_excel(
  path = "~/Dropbox/Bai giang online/Datasets/Hoa hau Viet Nam.xlsx"
)
# Derive a BMI category column `ob` from `vn$bmi`.
# The column is created up front: subset-assigning into a column that does not
# exist yet is what raised "Unknown or uninitialised column: `ob`".
vn$ob <- NA_character_
# Missing bmi values are masked explicitly, so those rows keep NA in `ob`.
vn$ob[!is.na(vn$bmi) & vn$bmi < 18.5] <- "Thiếu cân"
vn$ob[!is.na(vn$bmi) & vn$bmi >= 18.5 & vn$bmi < 25.0] <- "Bình thường"
# NOTE(review): rows with bmi >= 25 also stay NA; add a category if needed.
# Keep only the rows of `vn` whose `group` equals "Hoa hau".
vnhh <- subset(vn, subset = group == "Hoa hau")

# Descriptive summary of the body measurements in `vn`, stratified by `group`.
table1(
  ~ height + weight + bmi + bust + waist + hip + whr | group,
  data = vn
)
| | Hoa hau (N=17) | Ung vien (N=17) | Overall (N=34) |
|---|---|---|---|
| height | |||
| Mean (SD) | 171 (5.76) | 173 (4.36) | 172 (5.15) |
| Median [Min, Max] | 172 [157, 179] | 173 [167, 184] | 172 [157, 184] |
| weight | |||
| Mean (SD) | 53.0 (4.32) | 50.8 (4.63) | 52.0 (4.48) |
| Median [Min, Max] | 52.0 [49.0, 61.5] | 49.0 [47.0, 60.0] | 50.0 [47.0, 61.5] |
| Missing | 5 (29.4%) | 8 (47.1%) | 13 (38.2%) |
| bmi | |||
| Mean (SD) | 18.0 (1.17) | 17.1 (0.745) | 17.6 (1.08) |
| Median [Min, Max] | 18.0 [16.4, 20.3] | 16.8 [16.3, 18.4] | 17.6 [16.3, 20.3] |
| Missing | 5 (29.4%) | 8 (47.1%) | 13 (38.2%) |
| bust | |||
| Mean (SD) | 83.7 (2.76) | 81.5 (3.81) | 82.6 (3.47) |
| Median [Min, Max] | 84.0 [78.0, 87.0] | 82.0 [76.0, 87.0] | 83.5 [76.0, 87.0] |
| waist | |||
| Mean (SD) | 61.4 (1.90) | 61.5 (2.55) | 61.4 (2.22) |
| Median [Min, Max] | 61.0 [58.0, 65.0] | 61.0 [58.0, 67.0] | 61.0 [58.0, 67.0] |
| hip | |||
| Mean (SD) | 90.1 (2.78) | 91.8 (2.61) | 90.9 (2.78) |
| Median [Min, Max] | 90.0 [84.0, 95.0] | 92.0 [88.0, 96.0] | 90.0 [84.0, 96.0] |
| whr | |||
| Mean (SD) | 0.681 (0.0193) | 0.670 (0.0181) | 0.675 (0.0193) |
| Median [Min, Max] | 0.682 [0.649, 0.726] | 0.667 [0.642, 0.698] | 0.676 [0.642, 0.726] |
# Height against year for the `hh` data, with a quadratic linear-model fit.
p1 <- ggplot(data = hh, mapping = aes(x = year, y = height, colour = year)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x + I(x^2)) +
  labs(title = "Hoa hậu Thế giới", x = "Năm", y = "Chiều cao") +
  scale_y_continuous(limits = c(150, 180)) +
  scale_x_continuous(limits = c(1952, 2020)) +
  theme(legend.position = "none")

# Same layout for the `vnhh` subset; each point is labelled with `sname`.
# The y-axis limits match p1 so the two panels share a scale.
p2 <- ggplot(
  data = vnhh,
  mapping = aes(x = year, y = height, colour = sname, label = sname)
) +
  geom_point(mapping = aes(colour = sname)) +
  geom_smooth(method = "lm", formula = y ~ x + I(x^2)) +
  geom_text() +
  theme(legend.position = "none") +
  labs(title = "Hoa hậu Việt Nam", x = "Năm", y = "Chiều cao") +
  scale_y_continuous(limits = c(150, 180)) +
  scale_x_continuous(limits = c(1988, 2020))

# Draw both panels side by side.
grid.arrange(p1, p2, ncol = 2)
## Warning: Removed 3 rows containing non-finite values (stat_smooth).
## Warning: Removed 3 rows containing missing values (geom_point).
# Dot plot of BMI in `vn`, with dots filled by `group`.
ggplot(data = vn, mapping = aes(x = bmi, fill = group)) +
  geom_dotplot() +
  labs(x = "Body mass index", y = "Number of individuals")
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 13 rows containing non-finite values (stat_bindot).
# Two-sample t-test of height: every row of `vn` (not only `vnhh`) against `hh`.
t.test(x = vn$height, y = hh$height)
##
## Welch Two Sample t-test
##
## data: vn$height and hh$height
## t = -0.95039, df = 64.412, p-value = 0.3455
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -3.159268 1.122184
## sample estimates:
## mean of x mean of y
## 171.7206 172.7391
# Select the measurement columns plus `group` for the pairwise plot matrix.
dat <- vn[, c("bust", "waist", "hip", "whr", "height", "group")]

# Scatterplot matrix of the selected columns, coloured by `group`.
ggpairs(data = dat, mapping = aes(colour = group))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.