Thực hành ngày 2

Phương pháp hiển thị dữ liệu (data visualization)

Học viên Đặng Bảo Đăng - Mã số R1F019

library(ggplot2); library(tidyverse); library(gridExtra); library(readxl)

Việc 1: Đọc dữ liệu vào R

# Load the obesity data set from its local CSV file into the data frame `ob`.
ob <- read.csv(
  file = "D:\\Downloads\\tailieu\\R course\\Seminar TDT 2022\\Tai lieu\\Data set\\obesity data.csv"
)

Việc 2: Biên tập dữ liệu bằng tidyverse

# Derive the analysis variables used by the plots below:
#   sex  - numeric recoding of `gender` ("F" -> 1, "M" -> 0)
#   bmig - BMI category built from left-closed intervals
#          [-Inf, 18.5), [18.5, 25), [25, 30), [30, Inf)
#   lean - original `lean` divided by 1000
#   fat  - original `fat` divided by 1000
ob <- mutate(
  ob,
  sex = recode(gender, "F" = 1, "M" = 0),
  bmig = cut(
    bmi,
    breaks = c(-Inf, 18.5, 25, 30, Inf),
    labels = c("Underweight", "Normal", "Overweight", "Obese"),
    right = FALSE
  ),
  lean = lean / 1000,
  fat = fat / 1000
)

Việc 3: Soạn biểu đồ phân bố biến số dùng ggplot

# Distribution plots for percent body fat (`pcfat`).
#
# Computed statistics are referenced with after_stat(); the older
# `..count..` / `..density..` notation is deprecated in ggplot2 >= 3.4.0.
# `bins = 30` is passed explicitly: it is the value stat_bin() was already
# using by default, so the binning is unchanged, but the "Pick better value
# with `binwidth`" message is no longer emitted on every call.

# Base plot: pcfat on the x axis, no grouping.
p1 <- ggplot(ob, aes(x = pcfat))

# Histogram of counts.
p1 +
  geom_histogram(
    aes(y = after_stat(count)),
    bins = 30, col = "white", fill = "blue"
  ) +
  xlab("Percent body fat") +
  ylab("Number of people") +
  ggtitle("Distribution of percent body fat")

# Histogram scaled to density with a kernel density curve overlaid.
# The y axis shows density here, not a head count, so it is labelled as such.
p1 +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 30, col = "white", fill = "blue"
  ) +
  geom_density(col = "red") +
  xlab("Percent body fat") +
  ylab("Density") +
  ggtitle("Distribution of percent body fat") +
  theme(plot.title = element_text(lineheight = 0.8, face = "bold", hjust = 0.5))

# Base plot split by gender (both fill and outline colour follow gender).
p2 <- ggplot(ob, aes(x = pcfat, fill = gender, colour = gender))

# Side-by-side (dodged) count histograms for each gender.
p2 + geom_histogram(aes(y = after_stat(count)), bins = 30, position = "dodge")

# Overlapping semi-transparent density curves for each gender.
p2 + geom_density(alpha = 0.4)

Việc 4: Soạn biểu đồ thanh

# Bar chart: number of rows in each BMI group.
p3 <- ggplot(data = ob, mapping = aes(x = bmig))
p3 + geom_bar()

# Same bar chart with one fill colour per BMI group; the legend is hidden.
p4 <- ggplot(data = ob, mapping = aes(x = bmig, fill = bmig))
p4 +
  geom_bar() +
  theme(legend.position = "none")

# Stacked bars: share (in percent) of each gender within every BMI group,
# with the percentage printed in the middle of each segment.
ob %>%
  count(bmig, gender) %>%
  group_by(bmig) %>%
  mutate(pct = n / sum(n) * 100) %>%
  ggplot(aes(x = bmig, y = pct, fill = gender)) +
  geom_bar(stat = "identity") +
  xlab("obese") +
  ylab("percent") +
  geom_text(
    aes(label = paste0(sprintf("%1.1f", pct), "%")),
    position = position_stack(vjust = 0.5)
  )

# Bars showing the mean of pcfat in each BMI group.
ob %>%
  group_by(bmig) %>%
  summarize(mean.pcfat = mean(pcfat)) %>%
  ggplot(aes(x = bmig, y = mean.pcfat, fill = bmig)) +
  geom_bar(stat = "identity") +
  xlab("obese") +
  ylab("mean pcfat") +
  labs(fill = "obese")

Việc 5: Soạn biểu đồ hộp

# Rows with gender "F".
female <- filter(ob, gender == "F")

# Box plot of pcfat by BMI group for `female`, with the individual
# observations jittered on top; boxes are outlined in black, no legend.
ggplot(female, aes(x = bmig, y = pcfat, fill = bmig, col = bmig)) +
  geom_boxplot(col = "black") +
  geom_jitter(alpha = 0.15) +
  xlab("Obesity group") +
  ylab("Percent body fat (%)") +
  theme(legend.position = "none")

# Rows with gender "M".
male <- filter(ob, gender == "M")

# The same box plot for `male`.
ggplot(male, aes(x = bmig, y = pcfat, fill = bmig, col = bmig)) +
  geom_boxplot(col = "black") +
  geom_jitter(alpha = 0.15) +
  xlab("Obesity group") +
  ylab("Percent body fat (%)") +
  theme(legend.position = "none")

# Violin plot for `female` with a narrow box plot inside each violin;
# violin tails are trimmed (trim = TRUE).
ggplot(female, aes(x = bmig, y = pcfat, fill = bmig, col = bmig)) +
  geom_violin(trim = TRUE) +
  geom_boxplot(col = "black", width = 0.1)

# Same violin plot with untrimmed tails (trim = FALSE).
ggplot(female, aes(x = bmig, y = pcfat, fill = bmig, col = bmig)) +
  geom_violin(trim = FALSE) +
  geom_boxplot(col = "black", width = 0.1)

Việc 6: Soạn biểu đồ tương quan

# Scatter plot of pcfat against bmi.
ggplot(ob, aes(x = bmi, y = pcfat)) +
  geom_point()

# Scatter plot with the default smoother.
# Message reported when this was run:
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
ggplot(ob, aes(x = bmi, y = pcfat)) +
  geom_point() +
  geom_smooth()

# Scatter plot with a straight-line (lm) fit.
# Message reported when this was run:
## `geom_smooth()` using formula 'y ~ x'
ggplot(ob, aes(x = bmi, y = pcfat)) +
  geom_point() +
  geom_smooth(method = "lm")

# Scatter plot grouped and coloured by gender, default smoother per group.
# Message reported when this was run:
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
ggplot(ob, aes(x = bmi, y = pcfat, group = gender, col = gender)) +
  geom_point() +
  geom_smooth()

# Scatter plot by gender with a quadratic lm fit (y ~ x + x^2) per group.
ggplot(ob, aes(x = bmi, y = pcfat, group = gender, col = gender)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x + I(x^2))