#Cài đặt thư viện dplyr ##Hàm select,filter,mutate,sample,count,arrange,contains,starts_with,everything.

library(ggplot2)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

data("airquality")
head(airquality)

##   Ozone Solar.R Wind Temp Month Day
## 1    41     190  7.4   67     5   1
## 2    36     118  8.0   72     5   2
## 3    12     149 12.6   74     5   3
## 4    18     313 11.5   62     5   4
## 5    NA      NA 14.3   56     5   5
## 6    28      NA 14.9   66     5   6

data("mpg")
mpg_df <- mpg
#####Select
select(airquality, Ozone, Temp) %>% head(10)

##    Ozone Temp
## 1     41   67
## 2     36   72
## 3     12   74
## 4     18   62
## 5     NA   56
## 6     28   66
## 7     23   65
## 8     19   59
## 9      8   61
## 10    NA   69

airquality %>% select(-Ozone, -Temp) %>% head(10)

##    Solar.R Wind Month Day
## 1      190  7.4     5   1
## 2      118  8.0     5   2
## 3      149 12.6     5   3
## 4      313 11.5     5   4
## 5       NA 14.3     5   5
## 6       NA 14.9     5   6
## 7      299  8.6     5   7
## 8       99 13.8     5   8
## 9       19 20.1     5   9
## 10     194  8.6     5  10

#####Filter
filter(airquality, Ozone > 106) %>% head(10)

##   Ozone Solar.R Wind Temp Month Day
## 1   115     223  5.7   79     5  30
## 2   135     269  4.1   84     7   1
## 3   108     223  8.0   85     7  25
## 4   122     255  4.0   89     8   7
## 5   110     207  8.0   90     8   9
## 6   168     238  3.4   81     8  25
## 7   118     225  2.3   94     8  29

filter(airquality, Ozone > 106 & Month == 8) %>% head(10)

##   Ozone Solar.R Wind Temp Month Day
## 1   122     255  4.0   89     8   7
## 2   110     207  8.0   90     8   9
## 3   168     238  3.4   81     8  25
## 4   118     225  2.3   94     8  29

filter(airquality, Ozone > 106, Month == 8, Wind > 7) %>% head(10)

##   Ozone Solar.R Wind Temp Month Day
## 1   110     207    8   90     8   9

#####Mutate
mutate(airquality, temp.celsius = Temp - 31) %>% head(3)

##   Ozone Solar.R Wind Temp Month Day temp.celsius
## 1    41     190  7.4   67     5   1           36
## 2    36     118  8.0   72     5   2           41
## 3    12     149 12.6   74     5   3           43

summarize(group_by(airquality, Month), round(mean(Temp, na.rm = T)))

## # A tibble: 5 × 2
##   Month `round(mean(Temp, na.rm = T))`
##   <int>                          <dbl>
## 1     5                             66
## 2     6                             79
## 3     7                             84
## 4     8                             84
## 5     9                             77

#####Sample
sample_n(airquality, 2) # Lấy ngẫu nhiên 2 giá trị

##   Ozone Solar.R Wind Temp Month Day
## 1    85     175  7.4   89     7  10
## 2    NA     153  5.7   88     8  27

sample_frac(airquality, size = 0.1) # Lấy ngẫu nhiên 10%

##    Ozone Solar.R Wind Temp Month Day
## 1      9      24 13.8   81     8   2
## 2     44     236 14.9   81     9  11
## 3     21     230 10.9   75     9   9
## 4      6      78 18.4   57     5  18
## 5     85     188  6.3   94     8  31
## 6    118     225  2.3   94     8  29
## 7     35      NA  7.4   85     8   5
## 8     91     189  4.6   93     9   4
## 9     18     131  8.0   76     9  29
## 10    23     115  7.4   76     8  18
## 11    NA     150  6.3   77     6  21
## 12    27     175 14.9   81     7  13
## 13    18     313 11.5   62     5   4
## 14    11     290  9.2   66     5  13
## 15    20      37  9.2   65     6  18

#####Count
count(airquality, Month > 5) # Đếm xem có bao nhiêu qs có tháng lớn hơn 5 và nhỏ hơn

##   Month > 5   n
## 1     FALSE  31
## 2      TRUE 122

count(airquality, Month) # Thống kê theo tháng

##   Month  n
## 1     5 31
## 2     6 30
## 3     7 31
## 4     8 31
## 5     9 30

#####Arrange
arrange(airquality, Month, Day) %>% head(5) # mặc định là ascending- tăng dần

##   Ozone Solar.R Wind Temp Month Day
## 1    41     190  7.4   67     5   1
## 2    36     118  8.0   72     5   2
## 3    12     149 12.6   74     5   3
## 4    18     313 11.5   62     5   4
## 5    NA      NA 14.3   56     5   5

arrange(airquality, desc(Month), Day) %>% head(5) # để sắp giảm dần, dùng desc (tức descending)

##   Ozone Solar.R Wind Temp Month Day
## 1    96     167  6.9   91     9   1
## 2    78     197  5.1   92     9   2
## 3    73     183  2.8   93     9   3
## 4    91     189  4.6   93     9   4
## 5    47      95  7.4   87     9   5

#####Contains
mpg_df %>% select(contains('a')) %>% head(4)

## # A tibble: 4 × 4
##   manufacturer  year trans      class  
##   <chr>        <int> <chr>      <chr>  
## 1 audi          1999 auto(l5)   compact
## 2 audi          1999 manual(m5) compact
## 3 audi          2008 manual(m6) compact
## 4 audi          2008 auto(av)   compact

#####Starts_with
mpg_df %>% select(starts_with('p')) %>% head(5)

## # A tibble: 5 × 0

mpg_df %>% select( 2, 1, class, contains('y')) %>% head(5)

## # A tibble: 5 × 7
##   model manufacturer class    year   cyl   cty   hwy
##   <chr> <chr>        <chr>   <int> <int> <int> <int>
## 1 a4    audi         compact  1999     4    18    29
## 2 a4    audi         compact  1999     4    21    29
## 3 a4    audi         compact  2008     4    20    31
## 4 a4    audi         compact  2008     4    21    30
## 5 a4    audi         compact  1999     6    16    26

#####Everything
mpg_df %>% select(class,displ,year,everything()) %>% head(4)

## # A tibble: 4 × 11
##   class   displ  year manufacturer model   cyl trans     drv     cty   hwy fl   
##   <chr>   <dbl> <int> <chr>        <chr> <int> <chr>     <chr> <int> <int> <chr>
## 1 compact   1.8  1999 audi         a4        4 auto(l5)  f        18    29 p    
## 2 compact   1.8  1999 audi         a4        4 manual(m… f        21    29 p    
## 3 compact   2    2008 audi         a4        4 manual(m… f        20    31 p    
## 4 compact   2    2008 audi         a4        4 auto(av)  f        21    30 p

#Toán tử pipe

#Chuyển tiếp ống ( Forward pipe )
rnorm(100,20,5)%>%
matrix(ncol=2)%>%
data.frame(x=.[,1],y=.[,2])%>%
plot(col="pink")

##Toán tử T pipe

rnorm(100,10,1)%T>%
ts.plot(col="pink")%>%
density()%>%
plot(col="red","densityplot")

##Toán tử Assigning pipe

library(magrittr)
data.frame(x=rnorm(100,10,5),
y=rnorm(100,20,5)) %$% ts.plot(x,col="red")

data.frame(x=rnorm(100,10,5),
y=rnorm(100,20,5)) %$% ts.plot(y,col="blue")

##Toán tử Backwaed pipe

library(magrittr)
x=abs(rnorm(100))
y=x
cbind(x,y)%>%head()

##               x          y
## [1,] 0.16079413 0.16079413
## [2,] 0.64129290 0.64129290
## [3,] 0.39291205 0.39291205
## [4,] 0.04778004 0.04778004
## [5,] 0.57212651 0.57212651
## [6,] 3.48549299 3.48549299

x %<>% .^2 %>% sqrt()
x==y

##   [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [46] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [61] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [76] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [91] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE

Với dữ liệu MPG

#library(tidyverse)
library(dplyr)

# lọc dữ liệu mpg_df cho ô tô sản xuất năm 1999
filter(mpg_df, year == 1999) %>% head(5)

## # A tibble: 5 × 11
##   manufacturer model      displ  year   cyl trans  drv     cty   hwy fl    class
##   <chr>        <chr>      <dbl> <int> <int> <chr>  <chr> <int> <int> <chr> <chr>
## 1 audi         a4           1.8  1999     4 auto(… f        18    29 p     comp…
## 2 audi         a4           1.8  1999     4 manua… f        21    29 p     comp…
## 3 audi         a4           2.8  1999     6 auto(… f        16    26 p     comp…
## 4 audi         a4           2.8  1999     6 manua… f        18    26 p     comp…
## 5 audi         a4 quattro   1.8  1999     4 manua… 4        18    26 p     comp…

# Hiển thị kích thước, bộ lọc
filter(mpg_df, year == 1999) %>% dim()

## [1] 117  11

#chúng ta hãy lấy tất cả các phương tiện trong lớp 'hạng trung'
filter(mpg_df, class == "midsize") %>% head(5)

## # A tibble: 5 × 11
##   manufacturer model      displ  year   cyl trans  drv     cty   hwy fl    class
##   <chr>        <chr>      <dbl> <int> <int> <chr>  <chr> <int> <int> <chr> <chr>
## 1 audi         a6 quattro   2.8  1999     6 auto(… 4        15    24 p     mids…
## 2 audi         a6 quattro   3.1  2008     6 auto(… 4        17    25 p     mids…
## 3 audi         a6 quattro   4.2  2008     8 auto(… 4        16    23 p     mids…
## 4 chevrolet    malibu       2.4  1999     4 auto(… f        19    27 r     mids…
## 5 chevrolet    malibu       2.4  2008     4 auto(… f        22    30 r     mids…

#bộ lọc cho các phương tiện được sản xuất vào năm 1999 và có số dặm trong thành phố (cty) lớn hơn 18
mpg_df %>% filter(year==1999 & cty > 18) %>% head(4)

## # A tibble: 4 × 11
##   manufacturer model  displ  year   cyl trans      drv     cty   hwy fl    class
##   <chr>        <chr>  <dbl> <int> <int> <chr>      <chr> <int> <int> <chr> <chr>
## 1 audi         a4       1.8  1999     4 manual(m5) f        21    29 p     comp…
## 2 chevrolet    malibu   2.4  1999     4 auto(l4)   f        19    27 r     mids…
## 3 honda        civic    1.6  1999     4 manual(m5) f        28    33 r     subc…
## 4 honda        civic    1.6  1999     4 auto(l4)   f        24    32 r     subc…

# bộ lọc dành cho xe (tức là hàng) có nhà sản xuất là Chevrolet hoặc hạng là 'suv'.
mpg_df %>% filter(manufacturer=='chevrolet' | class=='suv') %>% head(4)

## # A tibble: 4 × 11
##   manufacturer model       displ  year   cyl trans drv     cty   hwy fl    class
##   <chr>        <chr>       <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
## 1 chevrolet    c1500 subu…   5.3  2008     8 auto… r        14    20 r     suv  
## 2 chevrolet    c1500 subu…   5.3  2008     8 auto… r        11    15 e     suv  
## 3 chevrolet    c1500 subu…   5.3  2008     8 auto… r        14    20 r     suv  
## 4 chevrolet    c1500 subu…   5.7  1999     8 auto… r        13    17 r     suv

mpg_df %>% filter( (manufacturer=='chevrolet' | class=='suv') & hwy < 20) %>% head(4)

## # A tibble: 4 × 11
##   manufacturer model       displ  year   cyl trans drv     cty   hwy fl    class
##   <chr>        <chr>       <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
## 1 chevrolet    c1500 subu…   5.3  2008     8 auto… r        11    15 e     suv  
## 2 chevrolet    c1500 subu…   5.7  1999     8 auto… r        13    17 r     suv  
## 3 chevrolet    c1500 subu…   6    2008     8 auto… r        12    17 r     suv  
## 4 chevrolet    k1500 taho…   5.3  2008     8 auto… 4        14    19 r     suv

##Sử dụng str_detect() và %in

#library(tidyverse)
#mpg_df %>% filter(str_detect(model,'4wd')) %>% head(5)
mpg_df %>% filter(cyl %in% c(4,5,6)) %>% head(5)

## # A tibble: 5 × 11
##   manufacturer model displ  year   cyl trans      drv     cty   hwy fl    class 
##   <chr>        <chr> <dbl> <int> <int> <chr>      <chr> <int> <int> <chr> <chr> 
## 1 audi         a4      1.8  1999     4 auto(l5)   f        18    29 p     compa…
## 2 audi         a4      1.8  1999     4 manual(m5) f        21    29 p     compa…
## 3 audi         a4      2    2008     4 manual(m6) f        20    31 p     compa…
## 4 audi         a4      2    2008     4 auto(av)   f        21    30 p     compa…
## 5 audi         a4      2.8  1999     6 auto(l5)   f        16    26 p     compa…

#Phát hiện dl trống
mpg %>% filter(is.na(year))

## # A tibble: 0 × 11
## # … with 11 variables: manufacturer <chr>, model <chr>, displ <dbl>,
## #   year <int>, cyl <int>, trans <chr>, drv <chr>, cty <int>, hwy <int>,
## #   fl <chr>, class <chr>

#Loại bỏ tất cả giá trị còn thiếu 
mpg %>% filter(!is.na(year)) %>% head(4)

## # A tibble: 4 × 11
##   manufacturer model displ  year   cyl trans      drv     cty   hwy fl    class 
##   <chr>        <chr> <dbl> <int> <int> <chr>      <chr> <int> <int> <chr> <chr> 
## 1 audi         a4      1.8  1999     4 auto(l5)   f        18    29 p     compa…
## 2 audi         a4      1.8  1999     4 manual(m5) f        21    29 p     compa…
## 3 audi         a4      2    2008     4 manual(m6) f        20    31 p     compa…
## 4 audi         a4      2    2008     4 auto(av)   f        21    30 p     compa…

#Kiểm tra giá trị bằng complete.cases Và để lọc tất cả các hàng có giá trị bị thiếu trong ít nhất một cột:
mpg %>% filter(complete.cases(.)) %>% head(4)

## # A tibble: 4 × 11
##   manufacturer model displ  year   cyl trans      drv     cty   hwy fl    class 
##   <chr>        <chr> <dbl> <int> <int> <chr>      <chr> <int> <int> <chr> <chr> 
## 1 audi         a4      1.8  1999     4 auto(l5)   f        18    29 p     compa…
## 2 audi         a4      1.8  1999     4 manual(m5) f        21    29 p     compa…
## 3 audi         a4      2    2008     4 manual(m6) f        20    31 p     compa…
## 4 audi         a4      2    2008     4 auto(av)   f        21    30 p     compa…

mpg %>% filter( !complete.cases(.) )

## # A tibble: 0 × 11
## # … with 11 variables: manufacturer <chr>, model <chr>, displ <dbl>,
## #   year <int>, cyl <int>, trans <chr>, drv <chr>, cty <int>, hwy <int>,
## #   fl <chr>, class <chr>

##Xâu chuổi dplyr và ggplot

mpg_df %>%
filter(class=='midsize') %>%
select(class,manufacturer,displ,year) %>%
arrange(displ) %>%
ggplot(aes(x=class,y=displ)) + geom_boxplot()

#Dữ liệu với Kim cương

library(ggplot2)
data("diamonds")
diamonds_df <- diamonds
diamonds_df %>% select(-x, -y, -z) %>% mutate(AUD = price*1.25) %>% head(4)

## # A tibble: 4 × 8
##   carat cut     color clarity depth table price   AUD
##   <dbl> <ord>   <ord> <ord>   <dbl> <dbl> <int> <dbl>
## 1  0.23 Ideal   E     SI2      61.5    55   326  408.
## 2  0.21 Premium E     SI1      59.8    61   326  408.
## 3  0.23 Good    E     VS1      56.9    65   327  409.
## 4  0.29 Premium I     VS2      62.4    58   334  418.

diamonds_df %>% select(-x, -y , -z) %>% mutate(ppc = price/carat) %>% head(4)

## # A tibble: 4 × 8
##   carat cut     color clarity depth table price   ppc
##   <dbl> <ord>   <ord> <ord>   <dbl> <dbl> <int> <dbl>
## 1  0.23 Ideal   E     SI2      61.5    55   326 1417.
## 2  0.21 Premium E     SI1      59.8    61   326 1552.
## 3  0.23 Good    E     VS1      56.9    65   327 1422.
## 4  0.29 Premium I     VS2      62.4    58   334 1152.

diamonds_df %>% select(-x, -y, -z) %>% mutate(price_label = ifelse(price > 5000, "expensive", "cheap"))

## # A tibble: 53,940 × 8
##    carat cut       color clarity depth table price price_label
##    <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <chr>      
##  1  0.23 Ideal     E     SI2      61.5    55   326 cheap      
##  2  0.21 Premium   E     SI1      59.8    61   326 cheap      
##  3  0.23 Good      E     VS1      56.9    65   327 cheap      
##  4  0.29 Premium   I     VS2      62.4    58   334 cheap      
##  5  0.31 Good      J     SI2      63.3    58   335 cheap      
##  6  0.24 Very Good J     VVS2     62.8    57   336 cheap      
##  7  0.24 Very Good I     VVS1     62.3    57   336 cheap      
##  8  0.26 Very Good H     SI1      61.9    55   337 cheap      
##  9  0.22 Fair      E     VS2      65.1    61   337 cheap      
## 10  0.23 Very Good H     VS1      59.4    61   338 cheap      
## # … with 53,930 more rows

#Lập biểu đồ
diamonds_df %>% select(-x,-y,-z) %>%
mutate(price_label = ifelse(price > 5000,'expensive','cheap')) %>%
ggplot(aes(x=price, fill = price_label)) +
geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

diamonds_df %>%
select(clarity) %>%
mutate(clarity_group = case_when(clarity == 'IF' ~ 'flawless',
#str_detect(clarity, 'VVS') ~ 'VV_slight',
#str_detect(clarity, 'VS') ~ 'V_slight',
#str_detect(clarity, 'SI') ~ 'slight',
clarity == 'I1' ~ 'impurity',
TRUE ~ 'other')) %>% head(10)

## # A tibble: 10 × 2
##    clarity clarity_group
##    <ord>   <chr>        
##  1 SI2     other        
##  2 SI1     other        
##  3 VS1     other        
##  4 VS2     other        
##  5 SI2     other        
##  6 VVS2    other        
##  7 VVS1    other        
##  8 SI1     other        
##  9 VS2     other        
## 10 VS1     other

#Toán tử n() và summarize()
diamonds_df %>% summarize(mean_price = mean(price),
sd_price = sd(price),
min_price = min(price),
max_price = max(price),
n_rows = n())

## # A tibble: 1 × 5
##   mean_price sd_price min_price max_price n_rows
##        <dbl>    <dbl>     <int>     <int>  <int>
## 1      3933.    3989.       326     18823  53940

diamonds_df %>% group_by(clarity) %>%
summarize(mean_price = mean(price),
sd_price = sd(price),
min_price = min(price),
max_price = max(price),
n_rows = n()) %>% head(10)

## # A tibble: 8 × 6
##   clarity mean_price sd_price min_price max_price n_rows
##   <ord>        <dbl>    <dbl>     <int>     <int>  <int>
## 1 I1           3924.    2807.       345     18531    741
## 2 SI2          5063.    4260.       326     18804   9194
## 3 SI1          3996.    3799.       326     18818  13065
## 4 VS2          3925.    4042.       334     18823  12258
## 5 VS1          3839.    4012.       327     18795   8171
## 6 VVS2         3284.    3822.       336     18768   5066
## 7 VVS1         2523.    3335.       336     18777   3655
## 8 IF           2865.    3920.       369     18806   1790

diamonds_df %>%
group_by(clarity, cut) %>%
summarize(mean_price = mean(price),
sd_price = sd(price),
min_price = min(price),
max_price = max(price),
n_rows = n()) %>% head(10)

## `summarise()` has grouped output by 'clarity'. You can override using the
## `.groups` argument.

## # A tibble: 10 × 7
## # Groups:   clarity [2]
##    clarity cut       mean_price sd_price min_price max_price n_rows
##    <ord>   <ord>          <dbl>    <dbl>     <int>     <int>  <int>
##  1 I1      Fair           3704.    3099.       584     18531    210
##  2 I1      Good           3597.    2285.       361     11548     96
##  3 I1      Very Good      4078.    2720.       511     15984     84
##  4 I1      Premium        3947.    2827.       345     16193    205
##  5 I1      Ideal          4336.    2671.       413     16538    146
##  6 SI2     Fair           5174.    3928.       536     18308    466
##  7 SI2     Good           4580.    3901.       335     18788   1081
##  8 SI2     Very Good      4989.    4126.       383     18692   2100
##  9 SI2     Premium        5546.    4488.       345     18784   2949
## 10 SI2     Ideal          4756.    4252.       326     18804   2598

Homework

Nguyễn Đức Phương

2023-02-04

Với dữ liệu MPG