1. Load the necessary packages (they must already be installed):
library(ggplot2)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v tibble 3.1.8 v dplyr 1.0.8
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## v purrr 0.3.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(readxl)
2. Task 1: Import data into R
# Read "obesity data.csv" into a data frame called `ob`.
# Columns referenced later in this script include:
#   WBBMC (g)     - whole-body bone mineral content
#   wbbmd (g/cm2) - whole-body bone mineral density
#   fat (g), lean (g)
ob <- read.csv(
  file = "C:/Users/thien/OneDrive/Desktop/R thuc hanh/R learning/SUMS - R class data/obesity data.csv"
)
# Number of rows and columns that were read.
dim(ob)
## [1] 1217 11
3. Task 2: Editing data with tidyverse
# Derive the analysis variables with dplyr::mutate():
#   sex    - numeric indicator recoded from gender: "F" -> 1, "M" -> 0
#   bmigr  - BMI group (factor) derived from bmi:
#              bmi <  18.5         -> "Underweight"
#              18.5 <= bmi < 25.0  -> "Normal"
#              25.0 <= bmi < 30.0  -> "Overweight"
#              bmi >= 30.0         -> "Obese"
#   leankg - lean mass in kilograms (lean is recorded in grams)
#   fatkg  - fat mass in kilograms (fat is recorded in grams)
ob <- ob %>%
  mutate(
    sex = recode(gender, "F" = 1, "M" = 0),
    # right = FALSE makes every interval closed on the left, [lower, upper),
    # so a bmi of exactly 18.5, 25 or 30 falls into the higher group as the
    # rules above require. cut()'s default (right = TRUE) would put those
    # boundary values into the lower group instead.
    # NOTE(review): summaries recorded further down were produced with the
    # default intervals and may shift slightly if any bmi sits on a boundary.
    bmigr = cut(
      bmi,
      breaks = c(0, 18.5, 25, 30, Inf),
      labels = c("Underweight", "Normal", "Overweight", "Obese"),
      right = FALSE
    ),
    leankg = lean / 1000,
    fatkg = fat / 1000
  )
4. Task 3: Plot the distribution of a variable using ggplot2
# Histogram of percent body fat (pcfat).

# Base graphics. with() evaluates pcfat inside ob, so the data frame does not
# have to be attach()-ed to the search path; the default title is still
# "Histogram of pcfat".
with(ob, hist(pcfat))

# ggplot2 with default settings.
ggplot(ob, aes(x = pcfat)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Blue bars with white outlines.
ggplot(ob, aes(x = pcfat)) +
  geom_histogram(fill = "blue", col = "white")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Density-scaled histogram with a kernel density curve drawn on top.
# after_stat(density) replaces the `..density..` notation that was deprecated
# in ggplot2 3.4.0.
ggplot(ob, aes(x = pcfat)) +
  geom_histogram(aes(y = after_stat(density)), fill = "blue", col = "white") +
  geom_density(col = "red")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Same plot with axis labels and a title. The y axis is a density, not a
# count, so it is labelled accordingly.
ggplot(ob, aes(x = pcfat)) +
  geom_histogram(aes(y = after_stat(density)), fill = "blue", col = "white") +
  geom_density(col = "purple") +
  labs(
    x = "Percent body fat",
    y = "Density",
    title = "Distribution of percent body fat"
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Arrange several pcfat histograms on one page with gridExtra::grid.arrange().
library(gridExtra)

# Whole sample: count histogram (left) next to a density-scaled histogram with
# a kernel density curve (right).
p <- ggplot(data = ob, aes(x = pcfat))
p1 <- p + geom_histogram(color = "white", fill = "blue")
# after_stat(density) replaces the `..density..` notation that was deprecated
# in ggplot2 3.4.0.
p2 <- p +
  geom_histogram(aes(y = after_stat(density)), color = "white", fill = "blue") +
  geom_density(col = "red")
grid.arrange(p1, p2, ncol = 2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Distribution of pcfat by gender: dodged histogram (top) above overlaid
# density curves (bottom).
p <- ggplot(data = ob, aes(x = pcfat, fill = gender))
p1 <- p + geom_histogram(position = "dodge")
p2 <- ggplot(data = ob, aes(x = pcfat, fill = gender, color = gender)) +
  geom_density(alpha = 0.1)
grid.arrange(p1, p2, nrow = 2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

5. Task 4: Bar chart
# Bar chart of the number of people in each BMI group.
p <- ggplot(data = ob, aes(x = bmigr, fill = bmigr, col = bmigr))
p + geom_bar(position = "dodge")

# Bar chart of the number of people in each BMI group, split by gender.
p <- ggplot(data = ob, aes(x = bmigr, fill = gender, col = gender))
p + geom_bar(position = "dodge")

# Two-step approach: first build a frequency table holding the share of each
# gender within every BMI group, then plot that table.
temp <- ob %>%
  count(bmigr, gender) %>%
  group_by(bmigr) %>%
  mutate(percent = n / sum(n) * 100) %>%
  ungroup() # drop the grouping so later work on temp is not silently grouped
# view(temp)
ggplot(temp, aes(x = bmigr, y = percent, fill = gender)) +
  geom_bar(stat = "identity")

# One-step approach: the same frequency table piped straight into ggplot(),
# with the percentage printed in the middle of every stacked segment.
ob %>%
  count(bmigr, gender) %>%
  group_by(bmigr) %>%
  mutate(percent = n / sum(n) * 100) %>%
  ungroup() %>%
  ggplot(aes(x = bmigr, y = percent, fill = gender)) +
  geom_bar(stat = "identity") +
  geom_text(
    aes(label = paste0(sprintf("%1.1f", percent), "%")),
    position = position_stack(vjust = 0.5)
  )

# Bar chart of the mean percent body fat in each BMI group.

# Print the table of group means.
ob %>%
  group_by(bmigr) %>%
  summarise(mean = mean(pcfat))
## # A tibble: 4 x 2
## bmigr mean
## <fct> <dbl>
## 1 Underweight 24.6
## 2 Normal 31.4
## 3 Overweight 35.5
## 4 Obese 38.1

# Two-step version: keep the table of means, then plot it.
temp1 <- ob %>%
  group_by(bmigr) %>%
  summarise(mean = mean(pcfat))
ggplot(temp1, aes(bmigr, mean, fill = bmigr)) +
  geom_col()

# One-step version: pipe the summary straight into ggplot().
ob %>%
  group_by(bmigr) %>%
  summarise(mean = mean(pcfat)) %>%
  ggplot(aes(bmigr, mean, fill = bmigr)) +
  geom_col()

# One-step version with the mean printed in the middle of every bar.
ob %>%
  group_by(bmigr) %>%
  summarise(mean = mean(pcfat)) %>%
  ggplot(aes(bmigr, mean, fill = bmigr)) +
  geom_col() +
  geom_text(
    aes(label = paste0(sprintf("%1.1f", mean),"%")),
    position = position_stack(vjust = 0.5)
  )

6. Task 5: Box-plot
# Box plots are often used to compare a variable between two or more groups.
# Here: the distribution of pcfat across the BMI groups, for women only.

# Data set containing only the female participants.
women <- ob %>% filter(gender == "F")

# Plain box plot.
ggplot(women, aes(x = bmigr, y = pcfat)) +
  geom_boxplot()

# Boxes filled by BMI group.
ggplot(women, aes(x = bmigr, y = pcfat, fill = bmigr)) +
  geom_boxplot()

# Individual observations overlaid as jittered, semi-transparent points.
ggplot(women, aes(x = bmigr, y = pcfat, fill = bmigr)) +
  geom_boxplot() +
  geom_jitter(aes(color = bmigr), alpha = 0.5)

# Final version: the legend is dropped because the x axis already names the
# groups, and the axes get readable labels.
ggplot(women, aes(x = bmigr, y = pcfat, fill = bmigr)) +
  geom_boxplot() +
  geom_jitter(aes(color = bmigr), alpha = 0.5) +
  theme(legend.position = "none") +
  labs(x = "BMI group", y = "Percent body fat (%)")

# Data set containing only the male participants, followed by the box plot of
# pcfat across the BMI groups for men (jittered points overlaid, no legend).
men <- filter(ob, gender == "M")
ggplot(men, aes(bmigr, pcfat, fill = bmigr)) +
  geom_boxplot() +
  geom_jitter(aes(colour = bmigr), alpha = 0.5) +
  labs(x = "BMI group", y = "Percent body fat (%)") +
  theme(legend.position = "none")

# Present the women's and men's box plots on one page with gridExtra.

# Build the pcfat-by-BMI-group box plot for one data set. Both panels go
# through this helper so that they share the same layers, labels and theme.
#   data  - data frame with the columns bmigr and pcfat
#   title - panel title identifying the subgroup shown
pcfat_boxplot <- function(data, title) {
  ggplot(data, aes(x = bmigr, y = pcfat, fill = bmigr)) +
    geom_boxplot() +
    geom_jitter(aes(color = bmigr), alpha = 0.5) +
    theme(legend.position = "none") +
    labs(x = "BMI group", y = "Percent body fat (%)", title = title)
}

p1 <- pcfat_boxplot(women, "Women")
p2 <- pcfat_boxplot(men, "Men")
grid.arrange(p1, p2, ncol = 2)

7. Task 6: Create a correlation graph
# Relationship between bmi (x axis) and pcfat (y axis).

# Scatter plot only.
ggplot(ob, aes(bmi, pcfat)) +
  geom_point()

# Scatter plot with ggplot2's default smoother.
ggplot(ob, aes(bmi, pcfat)) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

# Scatter plot with a straight-line (linear model) fit.
ggplot(ob, aes(bmi, pcfat)) +
  geom_point() +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

# Same relationship by gender: points and smoothers coloured per gender.
ggplot(ob, aes(bmi, pcfat, colour = gender)) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# By gender with a quadratic linear-model fit (pcfat on bmi and bmi squared).
ggplot(ob, aes(bmi, pcfat, colour = gender)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x + I(x^2))
