# Load the obesity data set and recode the "gender" column:
# rows equal to "F" receive the code 1, rows equal to "M" the code 0.
# NOTE(review): if read.csv() returns gender as a character column, the codes
# end up stored as the strings "1" and "0" -- confirm before using them as
# numbers.
ob <- read.csv(
  "/Users/osx/Desktop/Dataset for TDTU workshop 4-2022/obesity data.csv"
)
ob$gender <- replace(ob$gender, ob$gender == "F", 1)
ob$gender <- replace(ob$gender, ob$gender == "M", 0)

# Classify "bmi" into four categories, stored as the factor ob$obese.
#
# right = FALSE makes every interval closed on the left, [lower, upper), so a
# value lying exactly on a cut-point is placed in the upper category:
#   [0, 18.5)   underweight
#   [18.5, 25)  normal
#   [25, 30)    overweight
#   [30, Inf)   obese
# With cut()'s default (right = TRUE) a bmi of exactly 18.5, 25 or 30 was
# assigned to the lower category instead (e.g. bmi == 30 became "overweight").
ob$obese <- cut(
  ob$bmi,
  breaks = c(0, 18.5, 25.0, 30.0, Inf),
  labels = c("underweight", "normal", "overweight", "obese"),
  right = FALSE
)

# Transform the variables "lean" and "fat" (divide both by 1000)

# Divide the "lean" and "fat" columns by 1000 in a single vectorised step.
ob[c("lean", "fat")] <- ob[c("lean", "fat")] / 1000

# Load the libraries ggplot2, gridExtra and tidyverse

library(ggplot2)
library(gridExtra) 
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ tibble  3.1.4     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   2.0.1     ✓ forcats 0.5.1
## ✓ purrr   0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::combine() masks gridExtra::combine()
## x dplyr::filter()  masks stats::filter()
## x dplyr::lag()     masks stats::lag()

# Exercise 1: draw distribution plots (histograms)

## Histogram of pcfat: one plot on the count scale (q) and one on the
## density scale (q1).

# q: count histogram, built from two layers.
#   layer 1 - ggplot(data, aes(pcfat)) sets up the frame and maps pcfat to x;
#   layer 2 - geom_histogram() bins pcfat and puts the count per bin on y.
# fill = "blue" colours the bars, color = "white" colours their outline.
q <- ggplot(data = ob, aes(pcfat)) +
  geom_histogram(fill = "blue", color = "white")

# q1: the same histogram with y on the density scale, overlaid with a red
# density curve (a smoothed version of the histogram).
# after_stat(density) replaces the `..density..` notation, which ggplot2 has
# deprecated; both refer to the density computed by the binning stat.
q1 <- ggplot(data = ob, aes(pcfat)) +
  geom_histogram(aes(y = after_stat(density)), fill = "blue", color = "white") +
  geom_density(color = "red")
q1 # bars: binned histogram; red curve: smoothed density estimate
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# q2: density-scale histogram of pcfat with a red density curve, plus axis
# labels and a title added through labs().
# The y axis is mapped to after_stat(density), so it is labelled "Density";
# the previous label "Number of people" described a count scale that this
# plot does not use. after_stat(density) also replaces the deprecated
# `..density..` notation.
q2 <- ggplot(data = ob, aes(pcfat)) +
  geom_histogram(aes(y = after_stat(density)), fill = "blue", color = "white") +
  geom_density(color = "red") +
  labs(
    x = "Percent body fat",
    y = "Density",
    title = "Distribution of percent body fat"
  )
q2
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Draw q and q2 next to each other (two columns, one row).
grid.arrange(grobs = list(q, q2), ncol = 2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Distribution of percent body fat (pcfat) by gender.

# p: shared base plot; fill = gender gives each gender its own fill colour.
p <- ggplot(ob, aes(x = pcfat, fill = gender))

# p1: histograms of the two genders, bars dodged side by side within each bin.
p1 <- p + geom_histogram(position = "dodge")

# p2: semi-transparent density curves, outline and fill both mapped to gender.
p2 <- ggplot(ob, aes(x = pcfat, fill = gender, colour = gender)) +
  geom_density(alpha = 0.1)

# Show both plots next to each other (two columns).
grid.arrange(grobs = list(p1, p2), ncol = 2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Exercise 2: box plots.
## Compare the distribution of percent body fat (pcfat) across the obesity
## groups (obese) for women.

# 1: keep only the rows of ob whose gender code equals 1 (recoded from "F").
ob.femal <- filter(ob, gender == 1)

# 2: box plot of pcfat per obesity group; fill and outline follow the group.
r <- ggplot(ob.femal, aes(obese, pcfat, fill = obese, colour = obese)) +
  geom_boxplot()
r

# 3: box plot with jittered points. Jitter adds a small amount of random
# variation to the location of each point, which helps with overplotting
# caused by discreteness in smaller data sets. The box outline is forced to
# black so it stays visible on top of the group-coloured fill.
r1 <- ggplot(ob.femal, aes(obese, pcfat, fill = obese, colour = obese)) +
  geom_boxplot(colour = "black") +
  geom_jitter(alpha = 0.3)
r1

## Compare the distribution of percent body fat (pcfat) across the obesity
## groups (obese) for men.

# 1: keep only the rows of ob whose gender code equals 0 (recoded from "M").
ob.male <- filter(ob, gender == 0)

# 2: box plot of pcfat per obesity group; fill and outline follow the group.
s <- ggplot(ob.male, aes(obese, pcfat, fill = obese, colour = obese)) +
  geom_boxplot()
s

# 3: box plot with black outlines, overlaid with jittered, semi-transparent
# points.
s1 <- ggplot(ob.male, aes(obese, pcfat, fill = obese, colour = obese)) +
  geom_boxplot(colour = "black") +
  geom_jitter(alpha = 0.5)
s1

# 4: show s and s1 next to each other (two columns).
grid.arrange(grobs = list(s, s1), ncol = 2)

# Exercise 3: scatter plots.
## 1: show the relationship between body mass index (bmi) and percent body
## fat (pcfat) using ggplot() from the ggplot2 package.

# Scatter plot of pcfat against bmi with a fitted straight line
# (geom_smooth(method = "lm")).
# The earlier version also passed fill = bmi and color = bmi to ggplot()
# itself, outside aes(). Arguments in that position are not aesthetic
# mappings and had no effect on the plot, so they are dropped here; the
# rendered figure is unchanged.
p <- ggplot(data = ob, aes(x = bmi, y = pcfat)) +
  geom_point() +
  geom_smooth(method = "lm")
p
## `geom_smooth()` using formula 'y ~ x'

## 2: show the relationship between body mass index (bmi) and percent body
## fat (pcfat) by gender.

# Points and smoothers are coloured by gender; geom_smooth() is left on its
# default smoothing method.
p <- ggplot(ob, aes(bmi, pcfat, fill = gender, colour = gender)) +
  geom_point() +
  geom_smooth()
p
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Same scatter plot by gender, with a straight-line fit (method = "lm") per
# gender instead of the default smoother.
p1 <- ggplot(ob, aes(bmi, pcfat, fill = gender, colour = gender)) +
  geom_point() +
  geom_smooth(method = "lm")
p1
## `geom_smooth()` using formula 'y ~ x'