# Thực hành ngày 2
# Phương pháp hiển thị dữ liệu (data visualization)
# Học viên Đặng Bảo Đăng - Mã số R1F019
library(ggplot2); library(tidyverse); library(gridExtra); library(readxl)
# Việc 1: Đọc dữ liệu vào R
# Load the obesity data set from CSV into the data frame `ob`.
# NOTE(review): the path is absolute and specific to one Windows machine;
# adjust it (or make it relative to the project) before running elsewhere.
ob <- read.csv(
  "D:\\Downloads\\tailieu\\R course\\Seminar TDT 2022\\Tai lieu\\Data set\\obesity data.csv"
)
# Việc 2: Biên tập dữ liệu bằng tidyverse
# Derive analysis variables on `ob`:
#   sex  - numeric coding of `gender` ("F" -> 1, "M" -> 0)
#   bmig - BMI category; with right = FALSE the intervals are closed on the
#          left: < 18.5 Underweight, [18.5, 25) Normal, [25, 30) Overweight,
#          >= 30 Obese
#   lean, fat - rescaled by 1/1000 (presumably grams to kilograms; confirm
#          against the data dictionary)
ob <- ob %>%
  mutate(
    sex = recode(gender, "F" = 1, "M" = 0),
    bmig = cut(
      bmi,
      breaks = c(-Inf, 18.5, 25, 30, Inf),
      labels = c("Underweight", "Normal", "Overweight", "Obese"),
      right = FALSE
    ),
    lean = lean / 1000,
    fat = fat / 1000
  )
# Việc 3: Soạn biểu đồ phân bố biến số dùng ggplot
# Distribution of percent body fat (`pcfat`).
#
# `bins = 30` is passed explicitly in every histogram below: it is the value
# stat_bin() falls back to by default, so the plots are unchanged, but the
# "Pick better value with `binwidth`" message is no longer emitted.
# Computed statistics are referenced with after_stat() instead of the
# deprecated `..count..` / `..density..` notation.
p1 <- ggplot(ob, aes(x = pcfat))

# Histogram on the count scale.
p1 +
  geom_histogram(
    aes(y = after_stat(count)),
    bins = 30, col = "white", fill = "blue"
  ) +
  xlab("Percent body fat") +
  ylab("Number of people") +
  ggtitle("Distribution of percent body fat")

# Histogram on the density scale with a kernel density curve overlaid.
# The y axis is a density here, so it is labelled "Density" rather than as a
# head count. The title is bold and centred.
p1 +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 30, col = "white", fill = "blue"
  ) +
  geom_density(col = "red") +
  xlab("Percent body fat") +
  ylab("Density") +
  ggtitle("Distribution of percent body fat") +
  theme(
    plot.title = element_text(lineheight = 0.8, face = "bold", hjust = 0.5)
  )

# Same variable split by gender: side-by-side histogram bars, then
# semi-transparent overlapping density curves.
p2 <- ggplot(ob, aes(x = pcfat, fill = gender, colour = gender))
p2 + geom_histogram(aes(y = after_stat(count)), bins = 30, position = "dodge")

p2 + geom_density(alpha = 0.4)

# Việc 4: Soạn biểu đồ thanh
# Bar chart of the number of people in each BMI category.
p3 <- ggplot(data = ob, mapping = aes(x = bmig))
p3 + geom_bar()

# Same chart with one fill colour per category; the legend is hidden because
# it would only repeat the x-axis labels.
p4 <- ggplot(data = ob, mapping = aes(x = bmig, fill = bmig))
p4 +
  geom_bar() +
  theme(legend.position = "none")

# Stacked bar chart: share of each gender within every BMI category.
# `pct` is computed per `bmig` group, so each bar sums to 100; the value is
# printed in the middle of its segment with one decimal place.
ob %>%
  count(bmig, gender) %>%
  group_by(bmig) %>%
  mutate(pct = n / sum(n) * 100) %>%
  ggplot(aes(x = bmig, y = pct, fill = gender)) +
  geom_col() +
  ylab("percent") +
  xlab("obese") +
  geom_text(
    aes(label = paste0(sprintf("%1.1f", pct), "%")),
    position = position_stack(vjust = 0.5)
  )

# Bar chart of the mean percent body fat in each BMI category.
ob %>%
  group_by(bmig) %>%
  summarize(mean.pcfat = mean(pcfat)) %>%
  ggplot(aes(x = bmig, y = mean.pcfat, fill = bmig)) +
  geom_col() +
  ylab("mean pcfat") +
  xlab("obese") +
  labs(fill = "obese")

# Việc 5: Soạn biểu đồ hộp
# Box plots of percent body fat by BMI category, drawn separately for women
# and men. The boxes are outlined in black; the raw observations are
# overlaid as faint jittered points coloured by category. The legend is
# dropped because the x axis already names the categories.
female <- filter(ob, gender == "F")
ggplot(female, aes(x = bmig, y = pcfat, fill = bmig, col = bmig)) +
  geom_boxplot(col = "black") +
  geom_jitter(alpha = 0.15) +
  xlab("Obesity group") +
  ylab("Percent body fat (%)") +
  theme(legend.position = "none")

male <- filter(ob, gender == "M")
ggplot(male, aes(x = bmig, y = pcfat, fill = bmig, col = bmig)) +
  geom_boxplot(col = "black") +
  geom_jitter(alpha = 0.15) +
  xlab("Obesity group") +
  ylab("Percent body fat (%)") +
  theme(legend.position = "none")

# Violin plots of percent body fat by BMI category for women, each with a
# narrow black-outlined box plot drawn inside the violin.
# First version: violin tails trimmed to the range of the data.
ggplot(female, aes(x = bmig, y = pcfat, fill = bmig, col = bmig)) +
  geom_violin(trim = TRUE) +
  geom_boxplot(col = "black", width = 0.1)

# Second version: tails left untrimmed.
ggplot(female, aes(x = bmig, y = pcfat, fill = bmig, col = bmig)) +
  geom_violin(trim = FALSE) +
  geom_boxplot(col = "black", width = 0.1)

# Việc 6: Soạn biểu đồ tương quan
# Relationship between BMI (x) and percent body fat (y).

# Plain scatter plot.
ggplot(ob, aes(x = bmi, y = pcfat)) +
  geom_point()

# Scatter plot with the default smoother.
# Recorded console message:
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
ggplot(ob, aes(x = bmi, y = pcfat)) +
  geom_point() +
  geom_smooth()

# Scatter plot with a straight-line (linear model) fit.
# Recorded console message:
## `geom_smooth()` using formula 'y ~ x'
ggplot(ob, aes(x = bmi, y = pcfat)) +
  geom_point() +
  geom_smooth(method = "lm")

# Points and default smoother drawn separately for each gender.
# Recorded console message:
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
ggplot(ob, aes(x = bmi, y = pcfat, group = gender, col = gender)) +
  geom_point() +
  geom_smooth()

# Per-gender quadratic fit: linear model with x and x^2 terms.
ggplot(ob, aes(x = bmi, y = pcfat, group = gender, col = gender)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x + I(x^2))
