#Loading Package

# Attach the tidyverse packages (ggplot2, dplyr, tidyr, ...) used by this script.
# library() stops with an error when the package is not installed, whereas
# require() only warns and returns FALSE, so the script would fail later on.
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.1.1     v dplyr   1.0.5
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

#Loading dataset

# Read the obesity data set from GitHub; the first row holds the column names.
ob <- read.csv(
  "https://raw.githubusercontent.com/tuanvnguyen/R-book/master/obesity%20data.csv",
  header = TRUE
)
# No attach(ob): the plots in this script pass `ob` to ggplot() explicitly, and
# an attached copy would not see columns that are added to `ob` afterwards.
head(ob)
##   id gender height weight  bmi age WBBMC wbbmd   fat  lean pcfat
## 1  1      F    150     49 21.8  53  1312  0.88 17802 28600  37.3
## 2  2      M    165     52 19.1  65  1309  0.84  8381 40229  16.8
## 3  3      F    157     57 23.1  64  1230  0.84 19221 36057  34.0
## 4  4      F    156     53 21.8  56  1171  0.80 17472 33094  33.8
## 5  5      M    160     51 19.9  54  1681  0.98  7336 40621  14.8
## 6  6      F    153     47 20.1  52  1358  0.91 14904 30068  32.2

#Scatter plot

# Base plot: percent body fat against BMI, with gender mapped to fill for
# every layer added on top of it.
p <- ggplot(data = ob, mapping = aes(x = bmi, y = pcfat, fill = gender))

# Points coloured by gender plus the default smoother, legend at the bottom.
p +
  geom_point(aes(colour = gender)) +
  geom_smooth() +
  theme_bw() +
  labs(
    x = "Body mass index",
    y = "Percent body fat",
    title = "Distribution of percentage body fat and body mass index"
  ) +
  theme(legend.position = "bottom")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

#Add a regression line to the scatter plot

# Same scatter plot, but the smoother is a linear-model fit that is quadratic
# in x (y ~ x + x^2).
p +
  geom_point(aes(colour = gender)) +
  geom_smooth(method = "lm", formula = y ~ x + I(x^2)) +
  theme_bw() +
  labs(
    x = "Body mass index",
    y = "Percent body fat",
    title = "Distribution of percentage body fat and body mass index"
  ) +
  theme(legend.position = "bottom")

#Scatter plot: separate values by third variable

# Whole-body BMD against age; gender drives colour/fill and BMI drives point
# size, with a linear fit per gender.
p <- ggplot(ob, aes(x = age, y = wbbmd, fill = gender))
p +
  geom_point(aes(colour = gender, size = bmi)) +
  geom_smooth(method = "lm") +
  labs(x = "Age", y = "WbBMD") +
  # A complete theme replaces every theme setting made before it, so the
  # legend tweak must come after theme_classic(). theme_bw() is dropped
  # because theme_classic() overrode it entirely.
  theme_classic() +
  theme(legend.position = "top")
## `geom_smooth()` using formula 'y ~ x'

#Histogram

# Histogram of percent body fat; each bar is shaded by its own count on a
# blue-to-red gradient.
p <- ggplot(ob, aes(x = pcfat))
p +
  # after_stat(count) is the supported replacement for the deprecated
  # ..count.. notation for referring to a computed statistic.
  geom_histogram(aes(fill = after_stat(count)), colour = "white") +
  labs(
    x = "Percent body fat",
    y = "No of people",
    title = "Distribution of PBF"
  ) +
  theme_bw() +
  scale_fill_gradient("Count", low = "blue", high = "red") +
  theme(legend.position = "bottom")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#Boxplot with ggplot

# Notched, horizontal boxplot of percent body fat by gender; outliers in red.
b <- ggplot(ob, aes(x = gender, y = pcfat))
b +
  # The stray trailing comma (an empty argument) is removed and TRUE is
  # spelled out, since T can be reassigned.
  geom_boxplot(aes(fill = gender), notch = TRUE, outlier.colour = "red") +
  coord_flip() +
  # theme_classic() is a complete theme, so a preceding theme_bw() had no
  # effect and is omitted.
  theme_classic()

#Boxplot with a categorical variable #Use + labs(fill = "Title name") to set the legend title

# Bin BMI into three groups: A = below 20, B = 20 up to (but not including) 25,
# C = 25 and above. Rows with a missing BMI get NA.
ob$group <- dplyr::case_when(
  ob$bmi < 20 ~ "A",
  ob$bmi < 25 ~ "B",
  ob$bmi >= 25 ~ "C"
)

# Refer to the column by its bare name inside aes(): ggplot2 looks it up in
# `ob`, and writing ob$group there raises a "Use of `ob$group` is discouraged"
# warning.
p <- ggplot(ob, aes(x = group, y = pcfat))
p +
  geom_boxplot(aes(fill = group)) +
  theme_bw() +
  theme(legend.position = "top") +
  labs(
    x = "BMI Groups",
    y = "Percent body fat",
    title = "Distribution of percent body fat by BMI group",
    fill = "BMI groups"
  )

##Practice session with R programming 101

# library() fails loudly if tidyverse is unavailable; require() would only
# warn and return FALSE.
library(tidyverse)

# Demand over time from the built-in BOD data set: large points joined by a
# red line.
ggplot(data = BOD, mapping = aes(x = Time, y = demand)) +
  geom_point(size = 3) +
  geom_line(colour = "red")

#Use third variable to separate data- but still in the same graph

# Show the first rows of the built-in CO2 data set.
head(CO2)
## Grouped Data: uptake ~ conc | Plant
##   Plant   Type  Treatment conc uptake
## 1   Qn1 Quebec nonchilled   95   16.0
## 2   Qn1 Quebec nonchilled  175   30.4
## 3   Qn1 Quebec nonchilled  250   34.8
## 4   Qn1 Quebec nonchilled  350   37.2
## 5   Qn1 Quebec nonchilled  500   35.3
## 6   Qn1 Quebec nonchilled  675   39.2
# Uptake against concentration; Treatment is mapped to colour, so both the
# points and the smoother are split by treatment within a single panel.
ggplot(CO2, aes(x = conc, y = uptake, colour = Treatment)) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

#Use third variable to separate data- in different graphs

# Horizontal boxplots of uptake by treatment, overlaid with semi-transparent
# points (sized by conc, coloured by Plant), one panel per Type.
ggplot(CO2, aes(x = Treatment, y = uptake)) +
  geom_boxplot() +
  geom_point(aes(size = conc, colour = Plant), alpha = 0.5) +
  coord_flip() +
  theme_bw() +
  facet_wrap(~Type) +
  ggtitle("Chilled vs Non-chilled")

#Practice with mpg dataset ##Add filter to select those with less than xxx city miles per gallon ###choose colour in geometric object–> only that object possesses colour

# Show the first rows of ggplot2's mpg data set.
head(mpg)
## # A tibble: 6 x 11
##   manufacturer model displ  year   cyl trans      drv     cty   hwy fl    class 
##   <chr>        <chr> <dbl> <int> <int> <chr>      <chr> <int> <int> <chr> <chr> 
## 1 audi         a4      1.8  1999     4 auto(l5)   f        18    29 p     compa~
## 2 audi         a4      1.8  1999     4 manual(m5) f        21    29 p     compa~
## 3 audi         a4      2    2008     4 manual(m6) f        20    31 p     compa~
## 4 audi         a4      2    2008     4 auto(av)   f        21    30 p     compa~
## 5 audi         a4      2.8  1999     6 auto(l5)   f        16    26 p     compa~
## 6 audi         a4      2.8  1999     6 manual(m5) f        18    26 p     compa~
# Keep cars with city mileage below 35, then plot city mileage against engine
# displacement, one panel per model year. Colour is mapped only inside
# geom_point(), so the linear fit is a single line per panel.
ggplot(dplyr::filter(mpg, cty < 35), aes(x = displ, y = cty)) +
  geom_point(aes(colour = drv), alpha = 0.5) +
  geom_smooth(method = lm, se = FALSE) +
  facet_wrap(~year, nrow = 1) +
  labs(
    x = "Engine size",
    y = "MPG in the city",
    title = "Fuel efficiency"
  ) +
  theme_bw()
## `geom_smooth()` using formula 'y ~ x'

###Choosing colour in ggplot main code–> different lines and colors

# Same plot, but colour is mapped in the top-level aes(), so every layer
# inherits it and the linear fit is drawn once per drive type.
ggplot(dplyr::filter(mpg, cty < 35), aes(x = displ, y = cty, colour = drv)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = lm, se = FALSE) +
  facet_wrap(~year, nrow = 1) +
  labs(
    x = "Engine size",
    y = "MPG in the city",
    title = "Fuel efficiency"
  ) +
  theme_bw()
## `geom_smooth()` using formula 'y ~ x'

#Scatter plot - two or more variables (multiple numeric variables)

# Show the first rows of ggplot2's msleep data set.
head(msleep)
## # A tibble: 6 x 11
##   name    genus vore  order conservation sleep_total sleep_rem sleep_cycle awake
##   <chr>   <chr> <chr> <chr> <chr>              <dbl>     <dbl>       <dbl> <dbl>
## 1 Cheetah Acin~ carni Carn~ lc                  12.1      NA        NA      11.9
## 2 Owl mo~ Aotus omni  Prim~ <NA>                17         1.8      NA       7  
## 3 Mounta~ Aplo~ herbi Rode~ nt                  14.4       2.4      NA       9.6
## 4 Greate~ Blar~ omni  Sori~ lc                  14.9       2.3       0.133   9.1
## 5 Cow     Bos   herbi Arti~ domesticated         4         0.7       0.667  20  
## 6 Three-~ Brad~ herbi Pilo~ <NA>                14.4       2.2       0.767   9.6
## # ... with 2 more variables: brainwt <dbl>, bodywt <dbl>
# Brain weight against body weight for animals with bodywt below 2; point
# colour encodes total sleep and point size encodes time awake.
ggplot(dplyr::filter(msleep, bodywt < 2), aes(x = bodywt, y = brainwt)) +
  geom_point(aes(colour = sleep_total, size = awake)) +
  geom_smooth() +
  labs(
    x = "Body Weight",
    y = "Brain Weight",
    title = "Brain and body weight"
  ) +
  theme_minimal()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 16 rows containing non-finite values (stat_smooth).
## Warning: Removed 16 rows containing missing values (geom_point).

#A numeric and a categorical ##Method 1: Use boxplot

# Horizontal boxplots of total sleep per diet type, after dropping rows whose
# `vore` is missing.
ggplot(tidyr::drop_na(msleep, vore), aes(x = vore, y = sleep_total)) +
  geom_boxplot() +
  theme_bw() +
  coord_flip()

##Method 2: Use the density plot (Notice differences between fill and colour) ###Colour- for more than 2 variables

# Density of total sleep with one outline per diet type (vore mapped to
# colour); rows with a missing `vore` are dropped first.
ggplot(tidyr::drop_na(msleep, vore), aes(x = sleep_total, colour = vore)) +
  geom_density(alpha = 0.2) +
  theme_bw()

###Fill- when there are 3 or fewer variables

# Density of total sleep with one translucent filled area per diet type (vore
# mapped to fill); rows with a missing `vore` are dropped first.
ggplot(tidyr::drop_na(msleep, vore), aes(x = sleep_total, fill = vore)) +
  geom_density(alpha = 0.2) +
  theme_bw()