# Load packages.
# library() stops with an error when tidyverse is not installed, whereas
# require() only returns FALSE and lets the script fail later on.
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.1.1 v dplyr 1.0.5
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Load the obesity dataset.
ob <- read.csv(
  "https://raw.githubusercontent.com/tuanvnguyen/R-book/master/obesity%20data.csv",
  header = TRUE
)
# ob is intentionally not attach()ed: every plot below receives it through the
# `data` argument, so its columns are found without touching the search path.
head(ob)
## id gender height weight bmi age WBBMC wbbmd fat lean pcfat
## 1 1 F 150 49 21.8 53 1312 0.88 17802 28600 37.3
## 2 2 M 165 52 19.1 65 1309 0.84 8381 40229 16.8
## 3 3 F 157 57 23.1 64 1230 0.84 19221 36057 34.0
## 4 4 F 156 53 21.8 56 1171 0.80 17472 33094 33.8
## 5 5 M 160 51 19.9 54 1681 0.98 7336 40621 14.8
## 6 6 F 153 47 20.1 52 1358 0.91 14904 30068 32.2
# Scatter plot of percent body fat against body mass index.
# `fill = gender` sits in the base mapping, so every layer inherits it and the
# smoother is drawn once per gender.
p <- ggplot(data = ob, mapping = aes(x = bmi, y = pcfat, fill = gender))
p +
  geom_point(aes(col = gender)) +
  geom_smooth() +
  theme_bw() +
  labs(
    x = "Body mass index",
    y = "Percent body fat",
    title = "Distribution of percentage body fat and body mass index"
  ) +
  theme(legend.position = "bottom")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Same scatter plot, with a quadratic regression curve as the smoother.
p +
  geom_point(aes(col = gender)) +
  geom_smooth(method = "lm", formula = y ~ x + I(x^2)) +
  theme_bw() +
  labs(
    x = "Body mass index",
    y = "Percent body fat",
    title = "Distribution of percentage body fat and body mass index"
  ) +
  theme(legend.position = "bottom")
# Scatter plot of wbbmd against age, separated by further variables:
# gender sets the colour/fill and bmi sets the point size.
p <- ggplot(ob, aes(x = age, y = wbbmd, fill = gender))
p +
  geom_point(aes(col = gender, size = bmi)) +
  geom_smooth(method = "lm") +
  xlab("Age") +
  ylab("WbBMD") +
  # A complete theme resets every theme element, so it has to come BEFORE the
  # theme() tweak; placed after it, the legend position was silently discarded.
  # theme_bw() is dropped because theme_classic() replaced it entirely.
  theme_classic() +
  theme(legend.position = "top")
## `geom_smooth()` using formula 'y ~ x'
# Histogram of percent body fat, with each bar shaded by its count.
p <- ggplot(ob, aes(x = pcfat))
p +
  # after_stat(count) is the supported replacement for the deprecated
  # `..count..` notation. bins = 30 spells out the value stat_bin() was already
  # defaulting to, which keeps the plot unchanged and silences its message.
  geom_histogram(aes(fill = after_stat(count)), bins = 30, col = "white") +
  xlab("Percent body fat") +
  ylab("No of people") +
  ggtitle("Distribution of PBF") +
  theme_bw() +
  scale_fill_gradient("Count", low = "blue", high = "red") +
  theme(legend.position = "bottom")
# Notched, horizontal boxplot of percent body fat by gender; outliers in red.
b <- ggplot(ob, aes(x = gender, y = pcfat))
b +
  # The stray trailing comma after outlier.colour has been removed and
  # TRUE is spelled out (T is an ordinary variable that can be reassigned).
  geom_boxplot(aes(fill = gender), notch = TRUE, outlier.colour = "red") +
  coord_flip() +
  # theme_classic() is a complete theme; the theme_bw() that used to precede it
  # was fully overwritten and has been dropped.
  theme_classic()
# Boxplot with a derived categorical variable.
# Use labs(fill = "Title name") to set the legend title.
# BMI groups: A = below 20, B = 20 up to (but not including) 25, C = 25 and over.
# right = FALSE closes each interval on the left so the bounds match exactly;
# as.character() keeps `group` a character column, as before.
ob$group <- as.character(cut(
  ob$bmi,
  breaks = c(-Inf, 20, 25, Inf),
  labels = c("A", "B", "C"),
  right = FALSE
))
# Inside aes(), columns are referenced by bare name: writing ob$group bypasses
# the `data` argument and makes ggplot2 warn about it.
p <- ggplot(ob, aes(x = group, y = pcfat))
p +
  geom_boxplot(aes(fill = group)) +
  theme_bw() +
  theme(legend.position = "top") +
  xlab("BMI Groups") +
  ylab("Percent body fat") +
  ggtitle("Distribution of percent body fat by BMI group") +
  labs(fill = "BMI groups")
## Practice session with R programming 101
# library() fails loudly if the package is missing; require() would only
# return FALSE.
library(tidyverse)
# demand against Time from the built-in BOD data set: large points joined by
# a red line.
ggplot(data = BOD, mapping = aes(x = Time, y = demand)) +
  geom_point(size = 3) +
  geom_line(colour = "red")
# Use a third variable to separate the data while staying in a single graph.
head(CO2)
## Grouped Data: uptake ~ conc | Plant
## Plant Type Treatment conc uptake
## 1 Qn1 Quebec nonchilled 95 16.0
## 2 Qn1 Quebec nonchilled 175 30.4
## 3 Qn1 Quebec nonchilled 250 34.8
## 4 Qn1 Quebec nonchilled 350 37.2
## 5 Qn1 Quebec nonchilled 500 35.3
## 6 Qn1 Quebec nonchilled 675 39.2
# Treatment is mapped to colour in the base mapping, so both the points and
# the smoother are split by it.
ggplot(CO2, aes(x = conc, y = uptake, colour = Treatment)) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Use a third variable to separate the data into different graphs (facets).
# One panel per Type; points are overlaid on the boxplots, sized by conc and
# coloured by Plant.
ggplot(CO2, aes(x = Treatment, y = uptake)) +
  geom_boxplot() +
  geom_point(aes(size = conc, colour = Plant), alpha = 0.5) +
  coord_flip() +
  theme_bw() +
  facet_wrap(~Type) +
  ggtitle("Chilled vs Non-chilled")
# Practice with the mpg dataset.
## Add a filter to keep only the cars below a chosen city miles-per-gallon value.
### Colour mapped inside one geom applies to that geom only.
head(mpg)
## # A tibble: 6 x 11
## manufacturer model displ year cyl trans drv cty hwy fl class
## <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
## 1 audi a4 1.8 1999 4 auto(l5) f 18 29 p compa~
## 2 audi a4 1.8 1999 4 manual(m5) f 21 29 p compa~
## 3 audi a4 2 2008 4 manual(m6) f 20 31 p compa~
## 4 audi a4 2 2008 4 auto(av) f 21 30 p compa~
## 5 audi a4 2.8 1999 6 auto(l5) f 16 26 p compa~
## 6 audi a4 2.8 1999 6 manual(m5) f 18 26 p compa~
mpg %>%
  filter(cty < 35) %>%
  ggplot(aes(displ, cty)) +
  # drv is mapped to colour here only, so the smoother below stays a single
  # line fitted to all points in the panel.
  geom_point(aes(colour = drv), alpha = 0.5) +
  # FALSE is spelled out because F is an ordinary variable that can be
  # reassigned; the method is named as a string rather than passing the lm
  # function object.
  geom_smooth(method = "lm", se = FALSE) +
  facet_wrap(~year, nrow = 1) +
  labs(
    x = "Engine size",
    y = "MPG in the city",
    title = "Fuel efficiency"
  ) +
  theme_bw()
## `geom_smooth()` using formula 'y ~ x'
### Colour mapped in the main ggplot() call is inherited by every layer, so
### both the points and the regression lines are split and coloured by drv.
mpg %>%
  filter(cty < 35) %>%
  ggplot(aes(displ, cty, col = drv)) +
  geom_point(alpha = 0.5) +
  # FALSE is spelled out because F is an ordinary variable that can be
  # reassigned; the method is named as a string rather than passing the lm
  # function object.
  geom_smooth(method = "lm", se = FALSE) +
  facet_wrap(~year, nrow = 1) +
  labs(
    x = "Engine size",
    y = "MPG in the city",
    title = "Fuel efficiency"
  ) +
  theme_bw()
## `geom_smooth()` using formula 'y ~ x'
# Scatter plot of two numeric variables (body and brain weight), with two more
# numeric variables mapped to colour and point size.
head(msleep)
## # A tibble: 6 x 11
## name genus vore order conservation sleep_total sleep_rem sleep_cycle awake
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Cheetah Acin~ carni Carn~ lc 12.1 NA NA 11.9
## 2 Owl mo~ Aotus omni Prim~ <NA> 17 1.8 NA 7
## 3 Mounta~ Aplo~ herbi Rode~ nt 14.4 2.4 NA 9.6
## 4 Greate~ Blar~ omni Sori~ lc 14.9 2.3 0.133 9.1
## 5 Cow Bos herbi Arti~ domesticated 4 0.7 0.667 20
## 6 Three-~ Brad~ herbi Pilo~ <NA> 14.4 2.2 0.767 9.6
## # ... with 2 more variables: brainwt <dbl>, bodywt <dbl>
msleep %>%
  filter(bodywt < 2) %>%
  # Rows with a missing brain weight cannot be drawn. Dropping them here makes
  # the exclusion explicit instead of leaving ggplot2 to discard them with
  # "Removed ... rows" warnings in both layers.
  drop_na(brainwt) %>%
  ggplot(aes(bodywt, brainwt)) +
  geom_point(aes(colour = sleep_total, size = awake)) +
  geom_smooth() +
  labs(
    x = "Body Weight",
    y = "Brain Weight",
    title = "Brain and body weight"
  ) +
  theme_minimal()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# One numeric and one categorical variable.
## Method 1: boxplot of total sleep per vore category, drawn horizontally.
## Animals with a missing vore are removed before plotting.
ggplot(drop_na(msleep, vore), aes(x = vore, y = sleep_total)) +
  geom_boxplot() +
  coord_flip() +
  theme_bw()
## Method 2: density plot (note the difference between fill and colour).
### Colour: for more than 2 groups (one outlined curve per vore category).
ggplot(drop_na(msleep, vore), aes(x = sleep_total, colour = vore)) +
  geom_density(alpha = 0.2) +
  theme_bw()
### Fill: for 3 or fewer groups (one shaded, semi-transparent area per
### vore category).
ggplot(drop_na(msleep, vore), aes(x = sleep_total, fill = vore)) +
  geom_density(alpha = 0.2) +
  theme_bw()