CDL

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0     ✔ purrr   1.0.1
## ✔ tibble  3.1.8     ✔ dplyr   1.1.0
## ✔ tidyr   1.3.0     ✔ stringr 1.5.0
## ✔ readr   2.1.3     ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

library(dplyr)
data("airquality")
head(airquality)

##   Ozone Solar.R Wind Temp Month Day
## 1    41     190  7.4   67     5   1
## 2    36     118  8.0   72     5   2
## 3    12     149 12.6   74     5   3
## 4    18     313 11.5   62     5   4
## 5    NA      NA 14.3   56     5   5
## 6    28      NA 14.9   66     5   6

#Hàm select

select(airquality, Ozone, Temp) %>% head(4)

##   Ozone Temp
## 1    41   67
## 2    36   72
## 3    12   74
## 4    18   62

airquality %>% select(-Ozone, -Temp) %>% head(4)

##   Solar.R Wind Month Day
## 1     190  7.4     5   1
## 2     118  8.0     5   2
## 3     149 12.6     5   3
## 4     313 11.5     5   4

#hàm filter

filter(airquality, Ozone > 106) %>% head(5)

##   Ozone Solar.R Wind Temp Month Day
## 1   115     223  5.7   79     5  30
## 2   135     269  4.1   84     7   1
## 3   108     223  8.0   85     7  25
## 4   122     255  4.0   89     8   7
## 5   110     207  8.0   90     8   9

filter(airquality, Ozone > 106 & Month == 8) %>% head(5)

##   Ozone Solar.R Wind Temp Month Day
## 1   122     255  4.0   89     8   7
## 2   110     207  8.0   90     8   9
## 3   168     238  3.4   81     8  25
## 4   118     225  2.3   94     8  29

filter(airquality, Ozone > 106, Month == 8, Wind > 7) %>% head(5)

##   Ozone Solar.R Wind Temp Month Day
## 1   110     207    8   90     8   9

#hàm mutate

mutate(airquality, temp.celsius = Temp - 31) %>% head(5)

##   Ozone Solar.R Wind Temp Month Day temp.celsius
## 1    41     190  7.4   67     5   1           36
## 2    36     118  8.0   72     5   2           41
## 3    12     149 12.6   74     5   3           43
## 4    18     313 11.5   62     5   4           31
## 5    NA      NA 14.3   56     5   5           25

#Hàm summarize và group_by

summarize(group_by(airquality, Month), round(mean(Temp, na.rm = T)))

## # A tibble: 5 × 2
##   Month `round(mean(Temp, na.rm = T))`
##   <int>                          <dbl>
## 1     5                             66
## 2     6                             79
## 3     7                             84
## 4     8                             84
## 5     9                             77

#Hàm sample

sample_n(airquality, 4) # Lấy ngẫu nhiên 4

##   Ozone Solar.R Wind Temp Month Day
## 1    37     279  7.4   76     5  31
## 2     9      24 10.9   71     9  14
## 3    34     307 12.0   66     5  17
## 4    23     148  8.0   82     6  13

sample_frac(airquality, size = 0.02) # Lấy ngẫu nhiên 2%

##   Ozone Solar.R Wind Temp Month Day
## 1    39      83  6.9   81     8   1
## 2    13     137 10.3   76     6  20
## 3    NA     138  8.0   83     6  30

#Hàm count

count(airquality, Month > 5) # Đếm xem có bao nhiêu quan sát có tháng lớn hơn 5 và nhỏ hơn

##   Month > 5   n
## 1     FALSE  31
## 2      TRUE 122

count(airquality, Month) # Thống kê theo tháng

##   Month  n
## 1     5 31
## 2     6 30
## 3     7 31
## 4     8 31
## 5     9 30

#Hàm arrange

arrange(airquality, Month, Day) %>% head(5) # mặc định là ascending- tăng dần

##   Ozone Solar.R Wind Temp Month Day
## 1    41     190  7.4   67     5   1
## 2    36     118  8.0   72     5   2
## 3    12     149 12.6   74     5   3
## 4    18     313 11.5   62     5   4
## 5    NA      NA 14.3   56     5   5

arrange(airquality, desc(Month), Day) %>% head(5) # để sắp giảm dần, dùng desc (tức descending)

##   Ozone Solar.R Wind Temp Month Day
## 1    96     167  6.9   91     9   1
## 2    78     197  5.1   92     9   2
## 3    73     183  2.8   93     9   3
## 4    91     189  4.6   93     9   4
## 5    47      95  7.4   87     9   5

#Hàm contains()

mpg_df<-mpg
mpg_df %>% select(contains('y')) %>% head(4)

## # A tibble: 4 × 4
##    year   cyl   cty   hwy
##   <int> <int> <int> <int>
## 1  1999     4    18    29
## 2  1999     4    21    29
## 3  2008     4    20    31
## 4  2008     4    21    30

#Hàm starts_with()

mpg_df %>% select(starts_with('c')) %>% head(4)

## # A tibble: 4 × 3
##     cyl   cty class  
##   <int> <int> <chr>  
## 1     4    18 compact
## 2     4    21 compact
## 3     4    20 compact
## 4     4    21 compact

mpg_df %>% select( 2, 1, class, contains('y')) %>% head(4)

## # A tibble: 4 × 7
##   model manufacturer class    year   cyl   cty   hwy
##   <chr> <chr>        <chr>   <int> <int> <int> <int>
## 1 a4    audi         compact  1999     4    18    29
## 2 a4    audi         compact  1999     4    21    29
## 3 a4    audi         compact  2008     4    20    31
## 4 a4    audi         compact  2008     4    21    30

#Hàm everything()

mpg_df %>% select(class,displ,year,everything()) %>% head(4)

## # A tibble: 4 × 11
##   class   displ  year manufacturer model   cyl trans     drv     cty   hwy fl   
##   <chr>   <dbl> <int> <chr>        <chr> <int> <chr>     <chr> <int> <int> <chr>
## 1 compact   1.8  1999 audi         a4        4 auto(l5)  f        18    29 p    
## 2 compact   1.8  1999 audi         a4        4 manual(m… f        21    29 p    
## 3 compact   2    2008 audi         a4        4 manual(m… f        20    31 p    
## 4 compact   2    2008 audi         a4        4 auto(av)  f        21    30 p

#Toán tử pipe

# Forward pipe
rnorm(100,20,5)%>%
matrix(ncol=2)%>%
data.frame(x=.[,1],y=.[,2])%>%
plot(col="red")

#Toán tử T pipe %T>%

# T pipe
rnorm(100,10,1)%T>%
ts.plot(col="red")%>%
density()%>%                                                                                                                                                                                                                                                                                                                                                                 
plot(col="red","densityplot")

#Toán tử Assigning pipe

library(magrittr)

## 
## Attaching package: 'magrittr'

## The following object is masked from 'package:purrr':
## 
##     set_names

## The following object is masked from 'package:tidyr':
## 
##     extract

data.frame(x=rnorm(100,10,5),
y=rnorm(100,20,5)) %$% ts.plot(x,col="red")

data.frame(x=rnorm(100,10,5),
y=rnorm(100,20,5)) %$% ts.plot(y,col="blue")

#Toán tử Backward pipe

library(magrittr)
x=abs(rnorm(100))
y=x
cbind(x,y)%>%head()

##               x          y
## [1,] 0.72393021 0.72393021
## [2,] 0.06708265 0.06708265
## [3,] 1.54033222 1.54033222
## [4,] 0.08714650 0.08714650
## [5,] 0.37668094 0.37668094
## [6,] 1.11563590 1.11563590

x %<>% .^2 %>% sqrt()
x==y

##   [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [46] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [61] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [76] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [91] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE

#thực hành với dự liệu mpg

library(tidyverse)
library(dplyr)
mpg_df <- mpg
# filter the mpg_df data for cars manufactured in the year 1999
filter(mpg_df, year == 1999) %>% head(5)

## # A tibble: 5 × 11
##   manufacturer model      displ  year   cyl trans  drv     cty   hwy fl    class
##   <chr>        <chr>      <dbl> <int> <int> <chr>  <chr> <int> <int> <chr> <chr>
## 1 audi         a4           1.8  1999     4 auto(… f        18    29 p     comp…
## 2 audi         a4           1.8  1999     4 manua… f        21    29 p     comp…
## 3 audi         a4           2.8  1999     6 auto(… f        16    26 p     comp…
## 4 audi         a4           2.8  1999     6 manua… f        18    26 p     comp…
## 5 audi         a4 quattro   1.8  1999     4 manua… 4        18    26 p     comp…

# Show the dimension
filter(mpg_df, year == 1999) %>% dim()

## [1] 117  11

# let’s take all vehicles in the ‘midsize’ class:
filter(mpg_df, class == "midsize") %>% head(5)

## # A tibble: 5 × 11
##   manufacturer model      displ  year   cyl trans  drv     cty   hwy fl    class
##   <chr>        <chr>      <dbl> <int> <int> <chr>  <chr> <int> <int> <chr> <chr>
## 1 audi         a6 quattro   2.8  1999     6 auto(… 4        15    24 p     mids…
## 2 audi         a6 quattro   3.1  2008     6 auto(… 4        17    25 p     mids…
## 3 audi         a6 quattro   4.2  2008     8 auto(… 4        16    23 p     mids…
## 4 chevrolet    malibu       2.4  1999     4 auto(… f        19    27 r     mids…
## 5 chevrolet    malibu       2.4  2008     4 auto(… f        22    30 r     mids…

# filter for vehicles built in 1999 and with mileage in the city (cty) greater than 18.
mpg_df %>% filter(year==1999 & cty > 18) %>% head(4)

## # A tibble: 4 × 11
##   manufacturer model  displ  year   cyl trans      drv     cty   hwy fl    class
##   <chr>        <chr>  <dbl> <int> <int> <chr>      <chr> <int> <int> <chr> <chr>
## 1 audi         a4       1.8  1999     4 manual(m5) f        21    29 p     comp…
## 2 chevrolet    malibu   2.4  1999     4 auto(l4)   f        19    27 r     mids…
## 3 honda        civic    1.6  1999     4 manual(m5) f        28    33 r     subc…
## 4 honda        civic    1.6  1999     4 auto(l4)   f        24    32 r     subc…

# filter for vehicles (i.e., rows) where the manufacturer is Chevrolet or the class is ‘suv’.
mpg_df %>% filter(manufacturer=='chevrolet' | class=='suv') %>% head(4)

## # A tibble: 4 × 11
##   manufacturer model       displ  year   cyl trans drv     cty   hwy fl    class
##   <chr>        <chr>       <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
## 1 chevrolet    c1500 subu…   5.3  2008     8 auto… r        14    20 r     suv  
## 2 chevrolet    c1500 subu…   5.3  2008     8 auto… r        11    15 e     suv  
## 3 chevrolet    c1500 subu…   5.3  2008     8 auto… r        14    20 r     suv  
## 4 chevrolet    c1500 subu…   5.7  1999     8 auto… r        13    17 r     suv

mpg_df %>% filter( (manufacturer=='chevrolet' | class=='suv') & hwy < 20) %>% head(4)

## # A tibble: 4 × 11
##   manufacturer model       displ  year   cyl trans drv     cty   hwy fl    class
##   <chr>        <chr>       <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
## 1 chevrolet    c1500 subu…   5.3  2008     8 auto… r        11    15 e     suv  
## 2 chevrolet    c1500 subu…   5.7  1999     8 auto… r        13    17 r     suv  
## 3 chevrolet    c1500 subu…   6    2008     8 auto… r        12    17 r     suv  
## 4 chevrolet    k1500 taho…   5.3  2008     8 auto… 4        14    19 r     suv

#Sử dụng str_detect() và %in

library(tidyverse)
mpg_df %>% filter(str_detect(model,'4wd')) %>% head(5)

## # A tibble: 5 × 11
##   manufacturer model       displ  year   cyl trans drv     cty   hwy fl    class
##   <chr>        <chr>       <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
## 1 chevrolet    k1500 taho…   5.3  2008     8 auto… 4        14    19 r     suv  
## 2 chevrolet    k1500 taho…   5.3  2008     8 auto… 4        11    14 e     suv  
## 3 chevrolet    k1500 taho…   5.7  1999     8 auto… 4        11    15 r     suv  
## 4 chevrolet    k1500 taho…   6.5  1999     8 auto… 4        14    17 d     suv  
## 5 dodge        dakota pic…   3.7  2008     6 manu… 4        15    19 r     pick…

mpg_df %>% filter(cyl %in% c(4,5,6)) %>% head(5)

## # A tibble: 5 × 11
##   manufacturer model displ  year   cyl trans      drv     cty   hwy fl    class 
##   <chr>        <chr> <dbl> <int> <int> <chr>      <chr> <int> <int> <chr> <chr> 
## 1 audi         a4      1.8  1999     4 auto(l5)   f        18    29 p     compa…
## 2 audi         a4      1.8  1999     4 manual(m5) f        21    29 p     compa…
## 3 audi         a4      2    2008     4 manual(m6) f        20    31 p     compa…
## 4 audi         a4      2    2008     4 auto(av)   f        21    30 p     compa…
## 5 audi         a4      2.8  1999     6 auto(l5)   f        16    26 p     compa…

#Phát hiện dữ liệu trống

mpg %>% filter(is.na(year))

## # A tibble: 0 × 11
## # … with 11 variables: manufacturer <chr>, model <chr>, displ <dbl>,
## #   year <int>, cyl <int>, trans <chr>, drv <chr>, cty <int>, hwy <int>,
## #   fl <chr>, class <chr>

# To drop out all the missing value
mpg %>% filter(!is.na(year)) %>% head(4)

## # A tibble: 4 × 11
##   manufacturer model displ  year   cyl trans      drv     cty   hwy fl    class 
##   <chr>        <chr> <dbl> <int> <int> <chr>      <chr> <int> <int> <chr> <chr> 
## 1 audi         a4      1.8  1999     4 auto(l5)   f        18    29 p     compa…
## 2 audi         a4      1.8  1999     4 manual(m5) f        21    29 p     compa…
## 3 audi         a4      2    2008     4 manual(m6) f        20    31 p     compa…
## 4 audi         a4      2    2008     4 auto(av)   f        21    30 p     compa…

#Kiểm tra giá trị bằng complete.cases(.)

mpg %>% filter(complete.cases(.)) %>% head(4)

## # A tibble: 4 × 11
##   manufacturer model displ  year   cyl trans      drv     cty   hwy fl    class 
##   <chr>        <chr> <dbl> <int> <int> <chr>      <chr> <int> <int> <chr> <chr> 
## 1 audi         a4      1.8  1999     4 auto(l5)   f        18    29 p     compa…
## 2 audi         a4      1.8  1999     4 manual(m5) f        21    29 p     compa…
## 3 audi         a4      2    2008     4 manual(m6) f        20    31 p     compa…
## 4 audi         a4      2    2008     4 auto(av)   f        21    30 p     compa…

# And to filter for all rows with a missing value in at least one column:
mpg %>% filter( !complete.cases(.) )

## # A tibble: 0 × 11
## # … with 11 variables: manufacturer <chr>, model <chr>, displ <dbl>,
## #   year <int>, cyl <int>, trans <chr>, drv <chr>, cty <int>, hwy <int>,
## #   fl <chr>, class <chr>

#Chaining dplyr and ggplot

mpg_df %>%
filter(class=='midsize') %>%
select(class,manufacturer,displ,year) %>%
arrange(displ) %>%
ggplot(aes(x=class,y=displ)) + geom_boxplot()

#DỮ LIỆU DIAMOND

library(ggplot2)
diamonds_df <- diamonds
diamonds_df %>% select(-x, -y, -z) %>% mutate(AUD = price*1.25) %>% head(4)

## # A tibble: 4 × 8
##   carat cut     color clarity depth table price   AUD
##   <dbl> <ord>   <ord> <ord>   <dbl> <dbl> <int> <dbl>
## 1  0.23 Ideal   E     SI2      61.5    55   326  408.
## 2  0.21 Premium E     SI1      59.8    61   326  408.
## 3  0.23 Good    E     VS1      56.9    65   327  409.
## 4  0.29 Premium I     VS2      62.4    58   334  418.

diamonds_df %>% select(-x, -y , -z) %>% mutate(ppc = price/carat) %>% head(4)

## # A tibble: 4 × 8
##   carat cut     color clarity depth table price   ppc
##   <dbl> <ord>   <ord> <ord>   <dbl> <dbl> <int> <dbl>
## 1  0.23 Ideal   E     SI2      61.5    55   326 1417.
## 2  0.21 Premium E     SI1      59.8    61   326 1552.
## 3  0.23 Good    E     VS1      56.9    65   327 1422.
## 4  0.29 Premium I     VS2      62.4    58   334 1152.

diamonds_df %>% select(-x, -y, -z) %>% mutate(price_label = ifelse(price > 5000, "expensive", "cheap"))

## # A tibble: 53,940 × 8
##    carat cut       color clarity depth table price price_label
##    <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <chr>      
##  1  0.23 Ideal     E     SI2      61.5    55   326 cheap      
##  2  0.21 Premium   E     SI1      59.8    61   326 cheap      
##  3  0.23 Good      E     VS1      56.9    65   327 cheap      
##  4  0.29 Premium   I     VS2      62.4    58   334 cheap      
##  5  0.31 Good      J     SI2      63.3    58   335 cheap      
##  6  0.24 Very Good J     VVS2     62.8    57   336 cheap      
##  7  0.24 Very Good I     VVS1     62.3    57   336 cheap      
##  8  0.26 Very Good H     SI1      61.9    55   337 cheap      
##  9  0.22 Fair      E     VS2      65.1    61   337 cheap      
## 10  0.23 Very Good H     VS1      59.4    61   338 cheap      
## # … with 53,930 more rows

# Make the plot
diamonds_df %>% select(-x,-y,-z) %>%
mutate(price_label = ifelse(price > 5000,'expensive','cheap')) %>%
ggplot(aes(x=price, fill = price_label)) +
geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#Lệnh case_when()

diamonds_df %>%
select(clarity) %>%
mutate(clarity_group = case_when(clarity == 'IF' ~ 'flawless',
str_detect(clarity, 'VVS') ~ 'VV_slight',
str_detect(clarity, 'VS') ~ 'V_slight',
str_detect(clarity, 'SI') ~ 'slight',
clarity == 'I1' ~ 'impurity',
TRUE ~ 'other')) %>% head(10)

## # A tibble: 10 × 2
##    clarity clarity_group
##    <ord>   <chr>        
##  1 SI2     slight       
##  2 SI1     slight       
##  3 VS1     V_slight     
##  4 VS2     V_slight     
##  5 SI2     slight       
##  6 VVS2    VV_slight    
##  7 VVS1    VV_slight    
##  8 SI1     slight       
##  9 VS2     V_slight     
## 10 VS1     V_slight

#Toán tử n() và summarize()

diamonds_df %>% summarize(mean_price = mean(price),
sd_price = sd(price),
min_price = min(price),
max_price = max(price),
n_rows = n())

## # A tibble: 1 × 5
##   mean_price sd_price min_price max_price n_rows
##        <dbl>    <dbl>     <int>     <int>  <int>
## 1      3933.    3989.       326     18823  53940

diamonds_df %>% group_by(clarity) %>%
summarize(mean_price = mean(price),
sd_price = sd(price),
min_price = min(price),
max_price = max(price),
n_rows = n()) %>% head(10)

## # A tibble: 8 × 6
##   clarity mean_price sd_price min_price max_price n_rows
##   <ord>        <dbl>    <dbl>     <int>     <int>  <int>
## 1 I1           3924.    2807.       345     18531    741
## 2 SI2          5063.    4260.       326     18804   9194
## 3 SI1          3996.    3799.       326     18818  13065
## 4 VS2          3925.    4042.       334     18823  12258
## 5 VS1          3839.    4012.       327     18795   8171
## 6 VVS2         3284.    3822.       336     18768   5066
## 7 VVS1         2523.    3335.       336     18777   3655
## 8 IF           2865.    3920.       369     18806   1790

diamonds_df %>%
group_by(clarity, cut) %>%
summarize(mean_price = mean(price),
sd_price = sd(price),
min_price = min(price),
max_price = max(price),
n_rows = n()) %>% head(10)

## `summarise()` has grouped output by 'clarity'. You can override using the
## `.groups` argument.

## # A tibble: 10 × 7
## # Groups:   clarity [2]
##    clarity cut       mean_price sd_price min_price max_price n_rows
##    <ord>   <ord>          <dbl>    <dbl>     <int>     <int>  <int>
##  1 I1      Fair           3704.    3099.       584     18531    210
##  2 I1      Good           3597.    2285.       361     11548     96
##  3 I1      Very Good      4078.    2720.       511     15984     84
##  4 I1      Premium        3947.    2827.       345     16193    205
##  5 I1      Ideal          4336.    2671.       413     16538    146
##  6 SI2     Fair           5174.    3928.       536     18308    466
##  7 SI2     Good           4580.    3901.       335     18788   1081
##  8 SI2     Very Good      4989.    4126.       383     18692   2100
##  9 SI2     Premium        5546.    4488.       345     18784   2949
## 10 SI2     Ideal          4756.    4252.       326     18804   2598

CDL

hoàng văn thảo

2023-02-07