1. Load the necessary packages (they must already be installed):

library(ggplot2)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v tibble  3.1.8     v dplyr   1.0.8
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1
## v purrr   0.3.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(readxl)

2. Task 1: Import the data into R

# Source file: "obesity data.csv"

  # The imported data frame is named "ob"
  # WBBMC (g): whole-body bone mineral content
  # wbbmd (g/cm2): whole-body bone mineral density
  # fat (g), lean (g)

# NOTE: the path below is absolute and specific to one machine; adjust it
# before running this script elsewhere.
ob <- read.csv(
  file = "C:/Users/thien/OneDrive/Desktop/R thuc hanh/R learning/SUMS - R class data/obesity data.csv"
)

# Number of rows and columns that were read
dim(ob)
## [1] 1217   11

3. Task 2: Editing data with tidyverse

# Create the numeric indicator `sex` (1/0) from `gender` ("F"/"M"):
#   "F" -> 1, "M" -> 0

# Create the categorical variable `bmigr` (BMI group) from `bmi`:
    # If bmi < 18.5 then bmigr = "Underweight"
    # If bmi >= 18.5 and bmi < 25.0 then bmigr = "Normal"
    # If bmi >= 25.0 and bmi < 30.0 then bmigr = "Overweight"
    # If bmi >= 30.0 then bmigr = "Obese"
# cut() builds right-closed intervals (a, b] by default, which would put a bmi
# of exactly 18.5, 25 or 30 into the LOWER group. `right = FALSE` gives the
# left-closed intervals [a, b) that the rules above describe.
# NOTE(review): group counts/means computed downstream should be re-run after
# this change, since boundary values move up one group.

# Convert the amount of muscle (lean) and fat (fat) from grams to kilograms

ob <- ob %>%
  mutate(
    sex = recode(gender, "F" = 1, "M" = 0),
    bmigr = cut(
      bmi,
      breaks = c(0, 18.5, 25, 30, Inf),
      labels = c("Underweight", "Normal", "Overweight", "Obese"),
      right = FALSE
    ),
    leankg = lean / 1000,
    fatkg = fat / 1000
  )

4. Task 3: Plot the distribution of a variable using ggplot2

# Histogram of the distribution of percent body fat (pcfat).

# Base graphics version. The column is referenced through `ob` rather than via
# attach(ob): attach() puts a copy of the columns on the search path, and that
# copy silently goes stale whenever `ob` is modified afterwards. The title and
# axis label are set explicitly so the plot reads the same as hist(pcfat).
hist(ob$pcfat, main = "Histogram of pcfat", xlab = "pcfat")

# ggplot2 version with default styling
ggplot(ob, aes(x = pcfat)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Blue bars with white outlines
ggplot(ob, aes(x = pcfat)) +
  geom_histogram(fill = "blue", col = "white")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Density-scaled histogram with a density curve on top.
# after_stat(density) replaces the `..density..` notation, which was
# deprecated in ggplot2 3.4.0.
ggplot(ob, aes(x = pcfat)) +
  geom_histogram(aes(y = after_stat(density)), fill = "blue", col = "white") +
  geom_density(col = "red")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Same plot with axis labels and a title. The y axis shows density, not a
# head count, so it is labelled accordingly.
ggplot(ob, aes(x = pcfat)) +
  geom_histogram(aes(y = after_stat(density)), fill = "blue", col = "white") +
  geom_density(col = "purple") +
  labs(
    x = "Percent body fat",
    y = "Density",
    title = "Distribution of percent body fat"
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Histogram of (pcfat) for the whole sample, two panels side by side:
# raw counts (p1) and the density-scaled histogram with a density curve (p2)
library(gridExtra)
p <- ggplot(data = ob, aes(x = pcfat))
p1 <- p + geom_histogram(color = "white", fill = "blue")
# after_stat(density) replaces the `..density..` notation, which was
# deprecated in ggplot2 3.4.0
p <- p +
  geom_histogram(aes(y = after_stat(density)), color = "white", fill = "blue")
p2 <- p + geom_density(col = "red")
grid.arrange(p1, p2, ncol = 2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Distribution of (pcfat) by gender: dodged histogram (top panel) above
# semi-transparent overlaid density curves (bottom panel)
p <- ggplot(ob, aes(pcfat, fill = gender))
p1 <- p + geom_histogram(position = "dodge")
p2 <- ggplot(ob, aes(pcfat, fill = gender, colour = gender)) +
  geom_density(alpha = 0.1)
grid.arrange(p1, p2, nrow = 2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

5. Task 4: Bar chart

# Bar graph of the number of people in each BMI group (bmigr)
p <- ggplot(ob, aes(bmigr, fill = bmigr, colour = bmigr))
p + geom_bar(position = "dodge")

# Same counts split by gender, with the two genders drawn side by side
p <- ggplot(ob, aes(bmigr, fill = gender, colour = gender))
p + geom_bar(position = "dodge")

# Two steps: (1) build a frequency table, (2) draw the bar graph from it.
# `percent` is the share of each gender WITHIN a BMI group, so the values sum
# to 100 for every bmigr. The result is ungrouped so that later operations on
# `temp` are not silently performed per BMI group.
temp <- ob %>%
  count(bmigr, gender) %>%
  group_by(bmigr) %>%
  mutate(percent = n / sum(n) * 100) %>%
  ungroup()

# view(temp)
ggplot(temp, aes(x = bmigr, y = percent, fill = gender)) +
  geom_bar(stat = "identity")

# The table can also be piped straight into ggplot() without storing it:
# ob %>% count(bmigr, gender) %>% group_by(bmigr) %>%  mutate(percent = n / sum(n) * 100) %>% ggplot(aes(x=bmigr, y=percent, fill=gender)) + geom_bar(stat="identity")

# Add the percentage of each gender within a BMI group as a text label.
# The labels reuse `temp` instead of recomputing the same table, and
# position_stack(vjust = 0.5) centres each label inside its bar segment.
ggplot(temp, aes(x = bmigr, y = percent, fill = gender)) +
  geom_bar(stat = "identity") +
  geom_text(
    aes(label = paste0(sprintf("%1.1f", percent), "%")),
    position = position_stack(vjust = 0.5)
  )

# Bar graph of group means: mean percent body fat (pcfat) per BMI group
ob %>%
  group_by(bmigr) %>%
  summarize(mean = mean(pcfat))
## # A tibble: 4 x 2
##   bmigr        mean
##   <fct>       <dbl>
## 1 Underweight  24.6
## 2 Normal       31.4
## 3 Overweight   35.5
## 4 Obese        38.1

# Variant 1: store the summary table, then plot it
temp1 <- ob %>%
  group_by(bmigr) %>%
  summarize(mean = mean(pcfat))
ggplot(temp1, aes(bmigr, mean, fill = bmigr)) +
  geom_bar(stat = "identity")

# Variant 2: pipe the summary table straight into ggplot()
ob %>%
  group_by(bmigr) %>%
  summarize(mean = mean(pcfat)) %>%
  ggplot(aes(bmigr, mean, fill = bmigr)) +
  geom_bar(stat = "identity")

# Variant 3: as above, with each mean printed in the middle of its bar
ob %>%
  group_by(bmigr) %>%
  summarize(mean = mean(pcfat)) %>%
  ggplot(aes(bmigr, mean, fill = bmigr)) +
  geom_bar(stat = "identity") +
  geom_text(
    aes(label = paste0(sprintf("%1.1f", mean), "%")),
    position = position_stack(vjust = 0.5)
  )

6. Task 5: Box-plot

# Box plots are often used to compare a variable between two or more groups.
# Here they compare the distribution of (pcfat) between the BMI groups
# for women only.

# Dataset containing only women
women <- ob %>% filter(gender == "F")

# Step 1: plain box plot
ggplot(women, aes(x = bmigr, y = pcfat)) +
  geom_boxplot()

# Step 2: colour each box by BMI group
ggplot(women, aes(x = bmigr, y = pcfat, fill = bmigr)) +
  geom_boxplot()

# Step 3: overlay the individual observations as jittered points
ggplot(women, aes(x = bmigr, y = pcfat, fill = bmigr)) +
  geom_boxplot() +
  geom_jitter(aes(color = bmigr), alpha = 0.5)

# Step 4: drop the legend (it repeats the x axis) and add axis labels
ggplot(women, aes(x = bmigr, y = pcfat, fill = bmigr)) +
  geom_boxplot() +
  geom_jitter(aes(color = bmigr), alpha = 0.5) +
  theme(legend.position = "none") +
  labs(x = "BMI group", y = "Percent body fat (%)")

  # Dataset containing only men, plotted with the final box-plot layout:
  # filled boxes, jittered points, no legend, labelled axes
men <- ob %>% filter(gender == "M")
ggplot(men, aes(bmigr, pcfat, fill = bmigr)) +
  geom_boxplot() +
  geom_jitter(aes(colour = bmigr), alpha = 0.5) +
  theme(legend.position = "none") +
  labs(x = "BMI group", y = "Percent body fat (%)")

  # Present both box plots on one page using gridExtra.
  # Both panels use the same layout (no legend, same axis labels) so they can
  # be compared directly, and each carries a title saying which subset it shows.
p1 <- ggplot(women, aes(x = bmigr, y = pcfat, fill = bmigr)) +
  geom_boxplot() +
  geom_jitter(aes(color = bmigr), alpha = 0.5) +
  theme(legend.position = "none") +
  labs(x = "BMI group", y = "Percent body fat (%)", title = "Women")

p2 <- ggplot(men, aes(x = bmigr, y = pcfat, fill = bmigr)) +
  geom_boxplot() +
  geom_jitter(aes(color = bmigr), alpha = 0.5) +
  theme(legend.position = "none") +
  labs(x = "BMI group", y = "Percent body fat (%)", title = "Men")

grid.arrange(p1, p2, ncol = 2)

7. Task 6: Create a correlation graph

# Relationship between (bmi) and (pcfat), drawn with ggplot: scatter plot only
ggplot(ob, aes(bmi, pcfat)) +
  geom_point()

# Scatter plot with the default smoother
ggplot(ob, aes(bmi, pcfat)) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

# Scatter plot with a straight-line (linear model) fit
ggplot(ob, aes(bmi, pcfat)) +
  geom_point() +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

# Relationship between (bmi) and (pcfat) by gender: one colour and one
# default smoother per gender
ggplot(ob, aes(bmi, pcfat, colour = gender)) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# By gender, with a quadratic linear-model fit (x plus x squared)
ggplot(ob, aes(bmi, pcfat, colour = gender)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x + I(x^2))