Reading data

# Load the two Excel workbooks used throughout this analysis:
#   hh - Miss World data (file name covers 1952-2020)
#   vn - Miss Vietnam data
hh <- read_excel(
  path = "~/Dropbox/Bai giang online/Datasets/Hoa hau the gioi 1952-2020.xlsx"
)

vn <- read_excel(
  path = "~/Dropbox/Bai giang online/Datasets/Hoa hau Viet Nam.xlsx"
)

# Classify BMI into a character column `ob`:
#   bmi < 18.5          -> "Thiếu cân"   (underweight)
#   18.5 <= bmi < 25.0  -> "Bình thường" (normal)
# Rows with a missing BMI, or a BMI of 25 or more, are left as NA.
# The column is created in a single assignment, so no sub-assignment is made
# into a column that does not exist yet (which is what triggered the
# "Unknown or uninitialised column: `ob`" warning).
vn$ob <- as.character(
  cut(
    vn$bmi,
    breaks = c(-Inf, 18.5, 25.0),
    labels = c("Thiếu cân", "Bình thường"),
    right = FALSE  # intervals are closed on the left: [18.5, 25)
  )
)

# Keep only the rows of `vn` whose group is "Hoa hau".
vnhh <- subset(vn, subset = group == "Hoa hau")

Descriptive data

# Descriptive summary of the body measurements, stratified by `group`
# (an overall column is included in the rendered table).
table1(
  ~ height + weight + bmi + bust + waist + hip + whr | group,
  data = vn
)
Hoa hau
(N=17)
Ung vien
(N=17)
Overall
(N=34)
height
Mean (SD) 171 (5.76) 173 (4.36) 172 (5.15)
Median [Min, Max] 172 [157, 179] 173 [167, 184] 172 [157, 184]
weight
Mean (SD) 53.0 (4.32) 50.8 (4.63) 52.0 (4.48)
Median [Min, Max] 52.0 [49.0, 61.5] 49.0 [47.0, 60.0] 50.0 [47.0, 61.5]
Missing 5 (29.4%) 8 (47.1%) 13 (38.2%)
bmi
Mean (SD) 18.0 (1.17) 17.1 (0.745) 17.6 (1.08)
Median [Min, Max] 18.0 [16.4, 20.3] 16.8 [16.3, 18.4] 17.6 [16.3, 20.3]
Missing 5 (29.4%) 8 (47.1%) 13 (38.2%)
bust
Mean (SD) 83.7 (2.76) 81.5 (3.81) 82.6 (3.47)
Median [Min, Max] 84.0 [78.0, 87.0] 82.0 [76.0, 87.0] 83.5 [76.0, 87.0]
waist
Mean (SD) 61.4 (1.90) 61.5 (2.55) 61.4 (2.22)
Median [Min, Max] 61.0 [58.0, 65.0] 61.0 [58.0, 67.0] 61.0 [58.0, 67.0]
hip
Mean (SD) 90.1 (2.78) 91.8 (2.61) 90.9 (2.78)
Median [Min, Max] 90.0 [84.0, 95.0] 92.0 [88.0, 96.0] 90.0 [84.0, 96.0]
whr
Mean (SD) 0.681 (0.0193) 0.670 (0.0181) 0.675 (0.0193)
Median [Min, Max] 0.682 [0.649, 0.726] 0.667 [0.642, 0.698] 0.676 [0.642, 0.726]

Change in height with time

# Height by year in the `hh` data: points coloured by year plus a quadratic
# least-squares trend. Both axes have fixed limits and the legend is hidden.
p1 <- ggplot(data = hh, mapping = aes(x = year, y = height, colour = year)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x + I(x^2)) +
  labs(title = "Hoa hậu Thế giới", x = "Năm", y = "Chiều cao") +
  scale_y_continuous(limits = c(150, 180)) +
  scale_x_continuous(limits = c(1952, 2020)) +
  theme(legend.position = "none")

# Height by year in the `vnhh` subset, with each point labelled and coloured
# by `sname`, plus a quadratic least-squares trend.
#
# Colour and label are mapped on the point/text layers only. When they sit in
# the plot-level aes(), geom_smooth() inherits the colour mapping, and a
# discrete `sname` then splits the data into one group per name, leaving too
# few points per group for a fit. With x and y alone at the top level, a
# single curve is fitted across all rows.
p2 <- ggplot(data = vnhh, mapping = aes(x = year, y = height)) +
  geom_point(aes(colour = sname)) +
  geom_smooth(method = "lm", formula = y ~ x + I(x^2)) +
  geom_text(aes(label = sname, colour = sname)) +
  theme(legend.position = "none") +
  labs(title = "Hoa hậu Việt Nam", x = "Năm", y = "Chiều cao") +
  scale_y_continuous(limits = c(150, 180)) +
  scale_x_continuous(limits = c(1988, 2020))

# Draw the two height-trend plots side by side (two columns).
grid.arrange(grobs = list(p1, p2), ncol = 2)
## Warning: Removed 3 rows containing non-finite values (stat_smooth).
## Warning: Removed 3 rows containing missing values (geom_point).

Distribution of BMI

# Dot plot of BMI in `vn`, with dots filled by group.
# The default binning is used, as in the rendered message below.
ggplot(data = vn, mapping = aes(x = bmi, fill = group)) +
  geom_dotplot() +
  labs(x = "Body mass index", y = "Number of individuals")
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 13 rows containing non-finite values (stat_bindot).

Difference in height between Miss Vietnam and Miss World

# Two-sample t-test (Welch, the t.test default) comparing heights in `vn`
# against heights in `hh`.
# NOTE(review): `vn` holds both the "Hoa hau" and "Ung vien" groups; if the
# comparison is meant to cover winners only, `vnhh$height` may be the intended
# first sample - confirm with the author.
t.test(x = vn$height, y = hh$height)
## 
##  Welch Two Sample t-test
## 
## data:  vn$height and hh$height
## t = -0.95039, df = 64.412, p-value = 0.3455
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -3.159268  1.122184
## sample estimates:
## mean of x mean of y 
##  171.7206  172.7391

Correlation between measures

# Pairwise plot matrix of the body measurements, coloured by group.
# Only the columns needed for the matrix are kept in `dat`.
dat <- vn[
  ,
  c("bust", "waist", "hip", "whr", "height", "group")
]
ggpairs(data = dat, mapping = aes(colour = group))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.