Untitled

library(dplyr)

## Warning: package 'dplyr' was built under R version 4.2.2

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

data("airquality")
head(airquality)

##   Ozone Solar.R Wind Temp Month Day
## 1    41     190  7.4   67     5   1
## 2    36     118  8.0   72     5   2
## 3    12     149 12.6   74     5   3
## 4    18     313 11.5   62     5   4
## 5    NA      NA 14.3   56     5   5
## 6    28      NA 14.9   66     5   6

#Hàm lấy tên cột
select(airquality, Ozone, Temp) %>% head(4)

##   Ozone Temp
## 1    41   67
## 2    36   72
## 3    12   74
## 4    18   62

airquality %>% select(-Ozone, -Temp) %>% head(4)

##   Solar.R Wind Month Day
## 1     190  7.4     5   1
## 2     118  8.0     5   2
## 3     149 12.6     5   3
## 4     313 11.5     5   4

#Hàm lọc dữ liệu
filter(airquality, Ozone > 106) %>% head(5)

##   Ozone Solar.R Wind Temp Month Day
## 1   115     223  5.7   79     5  30
## 2   135     269  4.1   84     7   1
## 3   108     223  8.0   85     7  25
## 4   122     255  4.0   89     8   7
## 5   110     207  8.0   90     8   9

filter(airquality, Ozone > 106 & Month == 8) %>% head(5)

##   Ozone Solar.R Wind Temp Month Day
## 1   122     255  4.0   89     8   7
## 2   110     207  8.0   90     8   9
## 3   168     238  3.4   81     8  25
## 4   118     225  2.3   94     8  29

filter(airquality, Ozone > 106, Month == 8, Wind > 7) %>% head(5)

##   Ozone Solar.R Wind Temp Month Day
## 1   110     207    8   90     8   9

mutate(airquality, temp.celsius = Temp - 31) %>% head(5)

##   Ozone Solar.R Wind Temp Month Day temp.celsius
## 1    41     190  7.4   67     5   1           36
## 2    36     118  8.0   72     5   2           41
## 3    12     149 12.6   74     5   3           43
## 4    18     313 11.5   62     5   4           31
## 5    NA      NA 14.3   56     5   5           25

#Hàm khái quát thống kê và hàm xếp theo nhóm
library(dplyr)
library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.2.2

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0     ✔ purrr   1.0.1
## ✔ tibble  3.1.8     ✔ stringr 1.5.0
## ✔ tidyr   1.3.0     ✔ forcats 1.0.0
## ✔ readr   2.1.3

## Warning: package 'ggplot2' was built under R version 4.2.2

## Warning: package 'tibble' was built under R version 4.2.2

## Warning: package 'tidyr' was built under R version 4.2.2

## Warning: package 'readr' was built under R version 4.2.2

## Warning: package 'purrr' was built under R version 4.2.2

## Warning: package 'stringr' was built under R version 4.2.2

## Warning: package 'forcats' was built under R version 4.2.2

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

library(magrittr)

## Warning: package 'magrittr' was built under R version 4.2.2

## 
## Attaching package: 'magrittr'
## 
## The following object is masked from 'package:purrr':
## 
##     set_names
## 
## The following object is masked from 'package:tidyr':
## 
##     extract

summarize(group_by(airquality, Month), round(mean(Temp, na.rm = T)))

## # A tibble: 5 × 2
##   Month `round(mean(Temp, na.rm = T))`
##   <int>                          <dbl>
## 1     5                             66
## 2     6                             79
## 3     7                             84
## 4     8                             84
## 5     9                             77

sample_n(airquality, 4) # Lấy ngẫu nhiên 4

##   Ozone Solar.R Wind Temp Month Day
## 1    79     187  5.1   87     7  19
## 2    NA     138  8.0   83     6  30
## 3    30     322 11.5   68     5  19
## 4    13     238 12.6   64     9  21

sample_frac(airquality, size = 0.02) # Lấy ngẫu nhiên 2%

##   Ozone Solar.R Wind Temp Month Day
## 1    20     252 10.9   80     9   7
## 2    36     139 10.3   81     9  23
## 3    NA     322 11.5   79     6  15

count(airquality, Month > 5) # Đếm xem có bao nhiêu quan sát có tháng lớn hơn 5 và nhỏ hơn

##   Month > 5   n
## 1     FALSE  31
## 2      TRUE 122

count(airquality, Month) # Thống kê theo tháng

##   Month  n
## 1     5 31
## 2     6 30
## 3     7 31
## 4     8 31
## 5     9 30

arrange(airquality, Month, Day) %>% head(5) # mặc định là ascending- tăng dần

##   Ozone Solar.R Wind Temp Month Day
## 1    41     190  7.4   67     5   1
## 2    36     118  8.0   72     5   2
## 3    12     149 12.6   74     5   3
## 4    18     313 11.5   62     5   4
## 5    NA      NA 14.3   56     5   5

arrange(airquality, desc(Month), Day) %>% head(5) # để sắp giảm dần, dùng desc (tức descending)

##   Ozone Solar.R Wind Temp Month Day
## 1    96     167  6.9   91     9   1
## 2    78     197  5.1   92     9   2
## 3    73     183  2.8   93     9   3
## 4    91     189  4.6   93     9   4
## 5    47      95  7.4   87     9   5

#Hàm Contains()
mpg_df <- mpg
mpg_df %>% select(contains('y')) %>% head(4)

## # A tibble: 4 × 4
##    year   cyl   cty   hwy
##   <int> <int> <int> <int>
## 1  1999     4    18    29
## 2  1999     4    21    29
## 3  2008     4    20    31
## 4  2008     4    21    30

#Hàm start_with()
mpg_df %>% select(starts_with('c')) %>% head(4)

## # A tibble: 4 × 3
##     cyl   cty class  
##   <int> <int> <chr>  
## 1     4    18 compact
## 2     4    21 compact
## 3     4    20 compact
## 4     4    21 compact

mpg_df %>% select( 2, 1, class, contains('y')) %>% head(4)

## # A tibble: 4 × 7
##   model manufacturer class    year   cyl   cty   hwy
##   <chr> <chr>        <chr>   <int> <int> <int> <int>
## 1 a4    audi         compact  1999     4    18    29
## 2 a4    audi         compact  1999     4    21    29
## 3 a4    audi         compact  2008     4    20    31
## 4 a4    audi         compact  2008     4    21    30

#Hàm everything()
mpg_df %>% select(class,displ,year,everything()) %>% head(4)

## # A tibble: 4 × 11
##   class   displ  year manufacturer model   cyl trans     drv     cty   hwy fl   
##   <chr>   <dbl> <int> <chr>        <chr> <int> <chr>     <chr> <int> <int> <chr>
## 1 compact   1.8  1999 audi         a4        4 auto(l5)  f        18    29 p    
## 2 compact   1.8  1999 audi         a4        4 manual(m… f        21    29 p    
## 3 compact   2    2008 audi         a4        4 manual(m… f        20    31 p    
## 4 compact   2    2008 audi         a4        4 auto(av)  f        21    30 p

#Toán tử pipe
#1.Forward pipe
rnorm(100,20,5)%>%
  matrix(ncol=2)%>%
  data.frame(x=.[,1],y=.[,2])%>%
  plot(col="red")

#2.Toán tử T pipe %T>%
rnorm(100,10,1)%T>%
  ts.plot(col="red")%>%
  density()%>%
  plot(col="blue","densityplot")

#3.Toán tử Assigning pipe
library(magrittr)
data.frame(x=rnorm(100,10,5),
           y=rnorm(100,20,5)) %$% ts.plot(x,col="red")

data.frame(x=rnorm(100,10,5),
           y=rnorm(100,20,5)) %$% ts.plot(y,col="blue")

#4.Toán tử Backward pipe
library(magrittr)
x=abs(rnorm(100))
y=x
cbind(x,y)%>%head()

##              x         y
## [1,] 1.3264016 1.3264016
## [2,] 0.3857248 0.3857248
## [3,] 0.3728777 0.3728777
## [4,] 0.3124767 0.3124767
## [5,] 0.3979740 0.3979740
## [6,] 0.2715807 0.2715807

x %<>% .^2 %>% sqrt()
x==y

##   [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [46] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [61] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [76] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [91] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE

#Thực hành với mpg
library(tidyverse) 
library(dplyr)
mpg_df <- mpg
# filter the mpg_df data for cars manufactured in the year 1999
filter(mpg_df, year == 1999) %>% head(5)

## # A tibble: 5 × 11
##   manufacturer model      displ  year   cyl trans  drv     cty   hwy fl    class
##   <chr>        <chr>      <dbl> <int> <int> <chr>  <chr> <int> <int> <chr> <chr>
## 1 audi         a4           1.8  1999     4 auto(… f        18    29 p     comp…
## 2 audi         a4           1.8  1999     4 manua… f        21    29 p     comp…
## 3 audi         a4           2.8  1999     6 auto(… f        16    26 p     comp…
## 4 audi         a4           2.8  1999     6 manua… f        18    26 p     comp…
## 5 audi         a4 quattro   1.8  1999     4 manua… 4        18    26 p     comp…

# Show the dimension
filter(mpg_df, year == 1999) %>% dim()

## [1] 117  11

# let’s take all vehicles in the ‘midsize’ class:
filter(mpg_df, class == "midsize") %>% head(5)

## # A tibble: 5 × 11
##   manufacturer model      displ  year   cyl trans  drv     cty   hwy fl    class
##   <chr>        <chr>      <dbl> <int> <int> <chr>  <chr> <int> <int> <chr> <chr>
## 1 audi         a6 quattro   2.8  1999     6 auto(… 4        15    24 p     mids…
## 2 audi         a6 quattro   3.1  2008     6 auto(… 4        17    25 p     mids…
## 3 audi         a6 quattro   4.2  2008     8 auto(… 4        16    23 p     mids…
## 4 chevrolet    malibu       2.4  1999     4 auto(… f        19    27 r     mids…
## 5 chevrolet    malibu       2.4  2008     4 auto(… f        22    30 r     mids…

# filter for vehicles built in 1999 and with mileage in the city (cty) greater than 18.
mpg_df %>% filter(year==1999 & cty > 18) %>% head(4)

## # A tibble: 4 × 11
##   manufacturer model  displ  year   cyl trans      drv     cty   hwy fl    class
##   <chr>        <chr>  <dbl> <int> <int> <chr>      <chr> <int> <int> <chr> <chr>
## 1 audi         a4       1.8  1999     4 manual(m5) f        21    29 p     comp…
## 2 chevrolet    malibu   2.4  1999     4 auto(l4)   f        19    27 r     mids…
## 3 honda        civic    1.6  1999     4 manual(m5) f        28    33 r     subc…
## 4 honda        civic    1.6  1999     4 auto(l4)   f        24    32 r     subc…

# filter for vehicles (i.e., rows) where the manufacturer is Chevrolet or the class is ‘suv’.
mpg_df %>% filter(manufacturer=='chevrolet' | class=='suv') %>% head(4)

## # A tibble: 4 × 11
##   manufacturer model       displ  year   cyl trans drv     cty   hwy fl    class
##   <chr>        <chr>       <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
## 1 chevrolet    c1500 subu…   5.3  2008     8 auto… r        14    20 r     suv  
## 2 chevrolet    c1500 subu…   5.3  2008     8 auto… r        11    15 e     suv  
## 3 chevrolet    c1500 subu…   5.3  2008     8 auto… r        14    20 r     suv  
## 4 chevrolet    c1500 subu…   5.7  1999     8 auto… r        13    17 r     suv

mpg_df %>% filter( (manufacturer=='chevrolet' | class=='suv') & hwy < 20) %>% head(4)

## # A tibble: 4 × 11
##   manufacturer model       displ  year   cyl trans drv     cty   hwy fl    class
##   <chr>        <chr>       <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
## 1 chevrolet    c1500 subu…   5.3  2008     8 auto… r        11    15 e     suv  
## 2 chevrolet    c1500 subu…   5.7  1999     8 auto… r        13    17 r     suv  
## 3 chevrolet    c1500 subu…   6    2008     8 auto… r        12    17 r     suv  
## 4 chevrolet    k1500 taho…   5.3  2008     8 auto… 4        14    19 r     suv

library(tidyverse)
mpg_df %>% filter(str_detect(model,'4wd')) %>% head(5)

## # A tibble: 5 × 11
##   manufacturer model       displ  year   cyl trans drv     cty   hwy fl    class
##   <chr>        <chr>       <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
## 1 chevrolet    k1500 taho…   5.3  2008     8 auto… 4        14    19 r     suv  
## 2 chevrolet    k1500 taho…   5.3  2008     8 auto… 4        11    14 e     suv  
## 3 chevrolet    k1500 taho…   5.7  1999     8 auto… 4        11    15 r     suv  
## 4 chevrolet    k1500 taho…   6.5  1999     8 auto… 4        14    17 d     suv  
## 5 dodge        dakota pic…   3.7  2008     6 manu… 4        15    19 r     pick…

mpg_df %>% filter(cyl %in% c(4,5,6)) %>% head(5)

## # A tibble: 5 × 11
##   manufacturer model displ  year   cyl trans      drv     cty   hwy fl    class 
##   <chr>        <chr> <dbl> <int> <int> <chr>      <chr> <int> <int> <chr> <chr> 
## 1 audi         a4      1.8  1999     4 auto(l5)   f        18    29 p     compa…
## 2 audi         a4      1.8  1999     4 manual(m5) f        21    29 p     compa…
## 3 audi         a4      2    2008     4 manual(m6) f        20    31 p     compa…
## 4 audi         a4      2    2008     4 auto(av)   f        21    30 p     compa…
## 5 audi         a4      2.8  1999     6 auto(l5)   f        16    26 p     compa…

#Phát hiện dữ liệu trống
mpg %>% filter(is.na(year))

## # A tibble: 0 × 11
## # … with 11 variables: manufacturer <chr>, model <chr>, displ <dbl>,
## #   year <int>, cyl <int>, trans <chr>, drv <chr>, cty <int>, hwy <int>,
## #   fl <chr>, class <chr>

# To drop out all the missing value
mpg %>% filter(!is.na(year)) %>% head(4)

## # A tibble: 4 × 11
##   manufacturer model displ  year   cyl trans      drv     cty   hwy fl    class 
##   <chr>        <chr> <dbl> <int> <int> <chr>      <chr> <int> <int> <chr> <chr> 
## 1 audi         a4      1.8  1999     4 auto(l5)   f        18    29 p     compa…
## 2 audi         a4      1.8  1999     4 manual(m5) f        21    29 p     compa…
## 3 audi         a4      2    2008     4 manual(m6) f        20    31 p     compa…
## 4 audi         a4      2    2008     4 auto(av)   f        21    30 p     compa…

mpg %>% filter(complete.cases(.)) %>% head(4)

## # A tibble: 4 × 11
##   manufacturer model displ  year   cyl trans      drv     cty   hwy fl    class 
##   <chr>        <chr> <dbl> <int> <int> <chr>      <chr> <int> <int> <chr> <chr> 
## 1 audi         a4      1.8  1999     4 auto(l5)   f        18    29 p     compa…
## 2 audi         a4      1.8  1999     4 manual(m5) f        21    29 p     compa…
## 3 audi         a4      2    2008     4 manual(m6) f        20    31 p     compa…
## 4 audi         a4      2    2008     4 auto(av)   f        21    30 p     compa…

# And to filter for all rows with a missing value in at least one column:
mpg %>% filter( !complete.cases(.) )

## # A tibble: 0 × 11
## # … with 11 variables: manufacturer <chr>, model <chr>, displ <dbl>,
## #   year <int>, cyl <int>, trans <chr>, drv <chr>, cty <int>, hwy <int>,
## #   fl <chr>, class <chr>

#Chaining dplyr và ggplot
mpg_df %>%
  filter(class=='midsize') %>%
  select(class,manufacturer,displ,year) %>%
  arrange(displ) %>%
  ggplot(aes(x=class,y=displ)) + geom_boxplot()

#Dữ liệu diamond
library(ggplot2)
diamonds_df <- diamonds
diamonds_df %>% select(-x, -y, -z) %>% mutate(AUD = price*1.25) %>% head(4)

## # A tibble: 4 × 8
##   carat cut     color clarity depth table price   AUD
##   <dbl> <ord>   <ord> <ord>   <dbl> <dbl> <int> <dbl>
## 1  0.23 Ideal   E     SI2      61.5    55   326  408.
## 2  0.21 Premium E     SI1      59.8    61   326  408.
## 3  0.23 Good    E     VS1      56.9    65   327  409.
## 4  0.29 Premium I     VS2      62.4    58   334  418.

#Thêm 1 biến giá 1 carat bằng cách lấy price/carat
diamonds_df %>% select(-x, -y , -z) %>% mutate(ppc = price/carat) %>% head(4)

## # A tibble: 4 × 8
##   carat cut     color clarity depth table price   ppc
##   <dbl> <ord>   <ord> <ord>   <dbl> <dbl> <int> <dbl>
## 1  0.23 Ideal   E     SI2      61.5    55   326 1417.
## 2  0.21 Premium E     SI1      59.8    61   326 1552.
## 3  0.23 Good    E     VS1      56.9    65   327 1422.
## 4  0.29 Premium I     VS2      62.4    58   334 1152.

#Ví dụ: tạo ra một biến số là price_label với điều kiện nếu giá price>5000 thì kim cương là loại đắt (expensive), ngược lại là loại rẻ (cheap)
diamonds_df %>% select(-x, -y, -z) %>% mutate(price_label = ifelse(price > 5000, "expensive", "cheap"))

## # A tibble: 53,940 × 8
##    carat cut       color clarity depth table price price_label
##    <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <chr>      
##  1  0.23 Ideal     E     SI2      61.5    55   326 cheap      
##  2  0.21 Premium   E     SI1      59.8    61   326 cheap      
##  3  0.23 Good      E     VS1      56.9    65   327 cheap      
##  4  0.29 Premium   I     VS2      62.4    58   334 cheap      
##  5  0.31 Good      J     SI2      63.3    58   335 cheap      
##  6  0.24 Very Good J     VVS2     62.8    57   336 cheap      
##  7  0.24 Very Good I     VVS1     62.3    57   336 cheap      
##  8  0.26 Very Good H     SI1      61.9    55   337 cheap      
##  9  0.22 Fair      E     VS2      65.1    61   337 cheap      
## 10  0.23 Very Good H     VS1      59.4    61   338 cheap      
## # … with 53,930 more rows

# Make the plot
diamonds_df %>% select(-x,-y,-z) %>%
  mutate(price_label = ifelse(price > 5000,'expensive','cheap')) %>%
  ggplot(aes(x=price, fill = price_label)) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#Lệnh case_when
diamonds_df %>%
  select(clarity) %>%
  mutate(clarity_group = case_when(clarity == 'IF' ~ 'flawless',
                                   str_detect(clarity, 'VVS') ~ 'VV_slight',
                                   str_detect(clarity, 'VS') ~ 'V_slight',
                                   str_detect(clarity, 'SI') ~ 'slight',
                                   clarity == 'I1' ~ 'impurity',
                                   TRUE ~ 'other')) %>% head(10)

## # A tibble: 10 × 2
##    clarity clarity_group
##    <ord>   <chr>        
##  1 SI2     slight       
##  2 SI1     slight       
##  3 VS1     V_slight     
##  4 VS2     V_slight     
##  5 SI2     slight       
##  6 VVS2    VV_slight    
##  7 VVS1    VV_slight    
##  8 SI1     slight       
##  9 VS2     V_slight     
## 10 VS1     V_slight

#Toán tử n() và summarize()
diamonds_df %>% summarize(mean_price = mean(price),
                          sd_price = sd(price),
                          min_price = min(price),
                          max_price = max(price),
                          n_rows = n())

## # A tibble: 1 × 5
##   mean_price sd_price min_price max_price n_rows
##        <dbl>    <dbl>     <int>     <int>  <int>
## 1      3933.    3989.       326     18823  53940

#Hàm summarize và group_by khi kết hợp.
diamonds_df %>% group_by(clarity) %>%
  summarize(mean_price = mean(price),
            sd_price = sd(price),
            min_price = min(price),
            max_price = max(price),
            n_rows = n()) %>% head(10)

## # A tibble: 8 × 6
##   clarity mean_price sd_price min_price max_price n_rows
##   <ord>        <dbl>    <dbl>     <int>     <int>  <int>
## 1 I1           3924.    2807.       345     18531    741
## 2 SI2          5063.    4260.       326     18804   9194
## 3 SI1          3996.    3799.       326     18818  13065
## 4 VS2          3925.    4042.       334     18823  12258
## 5 VS1          3839.    4012.       327     18795   8171
## 6 VVS2         3284.    3822.       336     18768   5066
## 7 VVS1         2523.    3335.       336     18777   3655
## 8 IF           2865.    3920.       369     18806   1790

diamonds_df %>%
  group_by(clarity, cut) %>%
  summarize(mean_price = mean(price),
            sd_price = sd(price),
            min_price = min(price),
            max_price = max(price),
            n_rows = n()) %>% head(10)

## `summarise()` has grouped output by 'clarity'. You can override using the
## `.groups` argument.

## # A tibble: 10 × 7
## # Groups:   clarity [2]
##    clarity cut       mean_price sd_price min_price max_price n_rows
##    <ord>   <ord>          <dbl>    <dbl>     <int>     <int>  <int>
##  1 I1      Fair           3704.    3099.       584     18531    210
##  2 I1      Good           3597.    2285.       361     11548     96
##  3 I1      Very Good      4078.    2720.       511     15984     84
##  4 I1      Premium        3947.    2827.       345     16193    205
##  5 I1      Ideal          4336.    2671.       413     16538    146
##  6 SI2     Fair           5174.    3928.       536     18308    466
##  7 SI2     Good           4580.    3901.       335     18788   1081
##  8 SI2     Very Good      4989.    4126.       383     18692   2100
##  9 SI2     Premium        5546.    4488.       345     18784   2949
## 10 SI2     Ideal          4756.    4252.       326     18804   2598

Untitled

Vũ Xuân Quý

2023-02-09