library(dplyr)
## Warning: package 'dplyr' was built under R version 4.2.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
data("airquality")
head(airquality)
## Ozone Solar.R Wind Temp Month Day
## 1 41 190 7.4 67 5 1
## 2 36 118 8.0 72 5 2
## 3 12 149 12.6 74 5 3
## 4 18 313 11.5 62 5 4
## 5 NA NA 14.3 56 5 5
## 6 28 NA 14.9 66 5 6
#Hàm lấy tên cột
select(airquality, Ozone, Temp) %>% head(4)
## Ozone Temp
## 1 41 67
## 2 36 72
## 3 12 74
## 4 18 62
airquality %>% select(-Ozone, -Temp) %>% head(4)
## Solar.R Wind Month Day
## 1 190 7.4 5 1
## 2 118 8.0 5 2
## 3 149 12.6 5 3
## 4 313 11.5 5 4
#Hàm lọc dữ liệu
filter(airquality, Ozone > 106) %>% head(5)
## Ozone Solar.R Wind Temp Month Day
## 1 115 223 5.7 79 5 30
## 2 135 269 4.1 84 7 1
## 3 108 223 8.0 85 7 25
## 4 122 255 4.0 89 8 7
## 5 110 207 8.0 90 8 9
filter(airquality, Ozone > 106 & Month == 8) %>% head(5)
## Ozone Solar.R Wind Temp Month Day
## 1 122 255 4.0 89 8 7
## 2 110 207 8.0 90 8 9
## 3 168 238 3.4 81 8 25
## 4 118 225 2.3 94 8 29
filter(airquality, Ozone > 106, Month == 8, Wind > 7) %>% head(5)
## Ozone Solar.R Wind Temp Month Day
## 1 110 207 8 90 8 9
mutate(airquality, temp.celsius = Temp - 31) %>% head(5)
## Ozone Solar.R Wind Temp Month Day temp.celsius
## 1 41 190 7.4 67 5 1 36
## 2 36 118 8.0 72 5 2 41
## 3 12 149 12.6 74 5 3 43
## 4 18 313 11.5 62 5 4 31
## 5 NA NA 14.3 56 5 5 25
#Hàm khái quát thống kê và hàm xếp theo nhóm
library(dplyr)
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.2.2
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 1.0.1
## ✔ tibble 3.1.8 ✔ stringr 1.5.0
## ✔ tidyr 1.3.0 ✔ forcats 1.0.0
## ✔ readr 2.1.3
## Warning: package 'ggplot2' was built under R version 4.2.2
## Warning: package 'tibble' was built under R version 4.2.2
## Warning: package 'tidyr' was built under R version 4.2.2
## Warning: package 'readr' was built under R version 4.2.2
## Warning: package 'purrr' was built under R version 4.2.2
## Warning: package 'stringr' was built under R version 4.2.2
## Warning: package 'forcats' was built under R version 4.2.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(magrittr)
## Warning: package 'magrittr' was built under R version 4.2.2
##
## Attaching package: 'magrittr'
##
## The following object is masked from 'package:purrr':
##
## set_names
##
## The following object is masked from 'package:tidyr':
##
## extract
summarize(group_by(airquality, Month), round(mean(Temp, na.rm = T)))
## # A tibble: 5 × 2
## Month `round(mean(Temp, na.rm = T))`
## <int> <dbl>
## 1 5 66
## 2 6 79
## 3 7 84
## 4 8 84
## 5 9 77
sample_n(airquality, 4) # Lấy ngẫu nhiên 4
## Ozone Solar.R Wind Temp Month Day
## 1 79 187 5.1 87 7 19
## 2 NA 138 8.0 83 6 30
## 3 30 322 11.5 68 5 19
## 4 13 238 12.6 64 9 21
sample_frac(airquality, size = 0.02) # Lấy ngẫu nhiên 2%
## Ozone Solar.R Wind Temp Month Day
## 1 20 252 10.9 80 9 7
## 2 36 139 10.3 81 9 23
## 3 NA 322 11.5 79 6 15
count(airquality, Month > 5) # Đếm xem có bao nhiêu quan sát có tháng lớn hơn 5 và nhỏ hơn
## Month > 5 n
## 1 FALSE 31
## 2 TRUE 122
count(airquality, Month) # Thống kê theo tháng
## Month n
## 1 5 31
## 2 6 30
## 3 7 31
## 4 8 31
## 5 9 30
arrange(airquality, Month, Day) %>% head(5) # mặc định là ascending- tăng dần
## Ozone Solar.R Wind Temp Month Day
## 1 41 190 7.4 67 5 1
## 2 36 118 8.0 72 5 2
## 3 12 149 12.6 74 5 3
## 4 18 313 11.5 62 5 4
## 5 NA NA 14.3 56 5 5
arrange(airquality, desc(Month), Day) %>% head(5) # để sắp giảm dần, dùng desc (tức descending)
## Ozone Solar.R Wind Temp Month Day
## 1 96 167 6.9 91 9 1
## 2 78 197 5.1 92 9 2
## 3 73 183 2.8 93 9 3
## 4 91 189 4.6 93 9 4
## 5 47 95 7.4 87 9 5
#Hàm Contains()
mpg_df <- mpg
mpg_df %>% select(contains('y')) %>% head(4)
## # A tibble: 4 × 4
## year cyl cty hwy
## <int> <int> <int> <int>
## 1 1999 4 18 29
## 2 1999 4 21 29
## 3 2008 4 20 31
## 4 2008 4 21 30
#Hàm start_with()
mpg_df %>% select(starts_with('c')) %>% head(4)
## # A tibble: 4 × 3
## cyl cty class
## <int> <int> <chr>
## 1 4 18 compact
## 2 4 21 compact
## 3 4 20 compact
## 4 4 21 compact
mpg_df %>% select( 2, 1, class, contains('y')) %>% head(4)
## # A tibble: 4 × 7
## model manufacturer class year cyl cty hwy
## <chr> <chr> <chr> <int> <int> <int> <int>
## 1 a4 audi compact 1999 4 18 29
## 2 a4 audi compact 1999 4 21 29
## 3 a4 audi compact 2008 4 20 31
## 4 a4 audi compact 2008 4 21 30
#Hàm everything()
mpg_df %>% select(class,displ,year,everything()) %>% head(4)
## # A tibble: 4 × 11
## class displ year manufacturer model cyl trans drv cty hwy fl
## <chr> <dbl> <int> <chr> <chr> <int> <chr> <chr> <int> <int> <chr>
## 1 compact 1.8 1999 audi a4 4 auto(l5) f 18 29 p
## 2 compact 1.8 1999 audi a4 4 manual(m… f 21 29 p
## 3 compact 2 2008 audi a4 4 manual(m… f 20 31 p
## 4 compact 2 2008 audi a4 4 auto(av) f 21 30 p
#Toán tử pipe
#1.Forward pipe
rnorm(100,20,5)%>%
matrix(ncol=2)%>%
data.frame(x=.[,1],y=.[,2])%>%
plot(col="red")

#2.Toán tử T pipe %T>%
rnorm(100,10,1)%T>%
ts.plot(col="red")%>%
density()%>%
plot(col="blue","densityplot")


#3.Toán tử Assigning pipe
library(magrittr)
data.frame(x=rnorm(100,10,5),
y=rnorm(100,20,5)) %$% ts.plot(x,col="red")

data.frame(x=rnorm(100,10,5),
y=rnorm(100,20,5)) %$% ts.plot(y,col="blue")

#4.Toán tử Backward pipe
library(magrittr)
x=abs(rnorm(100))
y=x
cbind(x,y)%>%head()
## x y
## [1,] 1.3264016 1.3264016
## [2,] 0.3857248 0.3857248
## [3,] 0.3728777 0.3728777
## [4,] 0.3124767 0.3124767
## [5,] 0.3979740 0.3979740
## [6,] 0.2715807 0.2715807
x %<>% .^2 %>% sqrt()
x==y
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [46] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [61] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [76] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [91] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
#Thực hành với mpg
library(tidyverse)
library(dplyr)
mpg_df <- mpg
# filter the mpg_df data for cars manufactured in the year 1999
filter(mpg_df, year == 1999) %>% head(5)
## # A tibble: 5 × 11
## manufacturer model displ year cyl trans drv cty hwy fl class
## <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
## 1 audi a4 1.8 1999 4 auto(… f 18 29 p comp…
## 2 audi a4 1.8 1999 4 manua… f 21 29 p comp…
## 3 audi a4 2.8 1999 6 auto(… f 16 26 p comp…
## 4 audi a4 2.8 1999 6 manua… f 18 26 p comp…
## 5 audi a4 quattro 1.8 1999 4 manua… 4 18 26 p comp…
# Show the dimension
filter(mpg_df, year == 1999) %>% dim()
## [1] 117 11
# let’s take all vehicles in the ‘midsize’ class:
filter(mpg_df, class == "midsize") %>% head(5)
## # A tibble: 5 × 11
## manufacturer model displ year cyl trans drv cty hwy fl class
## <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
## 1 audi a6 quattro 2.8 1999 6 auto(… 4 15 24 p mids…
## 2 audi a6 quattro 3.1 2008 6 auto(… 4 17 25 p mids…
## 3 audi a6 quattro 4.2 2008 8 auto(… 4 16 23 p mids…
## 4 chevrolet malibu 2.4 1999 4 auto(… f 19 27 r mids…
## 5 chevrolet malibu 2.4 2008 4 auto(… f 22 30 r mids…
# filter for vehicles built in 1999 and with mileage in the city (cty) greater than 18.
mpg_df %>% filter(year==1999 & cty > 18) %>% head(4)
## # A tibble: 4 × 11
## manufacturer model displ year cyl trans drv cty hwy fl class
## <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
## 1 audi a4 1.8 1999 4 manual(m5) f 21 29 p comp…
## 2 chevrolet malibu 2.4 1999 4 auto(l4) f 19 27 r mids…
## 3 honda civic 1.6 1999 4 manual(m5) f 28 33 r subc…
## 4 honda civic 1.6 1999 4 auto(l4) f 24 32 r subc…
# filter for vehicles (i.e., rows) where the manufacturer is Chevrolet or the class is ‘suv’.
mpg_df %>% filter(manufacturer=='chevrolet' | class=='suv') %>% head(4)
## # A tibble: 4 × 11
## manufacturer model displ year cyl trans drv cty hwy fl class
## <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
## 1 chevrolet c1500 subu… 5.3 2008 8 auto… r 14 20 r suv
## 2 chevrolet c1500 subu… 5.3 2008 8 auto… r 11 15 e suv
## 3 chevrolet c1500 subu… 5.3 2008 8 auto… r 14 20 r suv
## 4 chevrolet c1500 subu… 5.7 1999 8 auto… r 13 17 r suv
mpg_df %>% filter( (manufacturer=='chevrolet' | class=='suv') & hwy < 20) %>% head(4)
## # A tibble: 4 × 11
## manufacturer model displ year cyl trans drv cty hwy fl class
## <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
## 1 chevrolet c1500 subu… 5.3 2008 8 auto… r 11 15 e suv
## 2 chevrolet c1500 subu… 5.7 1999 8 auto… r 13 17 r suv
## 3 chevrolet c1500 subu… 6 2008 8 auto… r 12 17 r suv
## 4 chevrolet k1500 taho… 5.3 2008 8 auto… 4 14 19 r suv
library(tidyverse)
mpg_df %>% filter(str_detect(model,'4wd')) %>% head(5)
## # A tibble: 5 × 11
## manufacturer model displ year cyl trans drv cty hwy fl class
## <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
## 1 chevrolet k1500 taho… 5.3 2008 8 auto… 4 14 19 r suv
## 2 chevrolet k1500 taho… 5.3 2008 8 auto… 4 11 14 e suv
## 3 chevrolet k1500 taho… 5.7 1999 8 auto… 4 11 15 r suv
## 4 chevrolet k1500 taho… 6.5 1999 8 auto… 4 14 17 d suv
## 5 dodge dakota pic… 3.7 2008 6 manu… 4 15 19 r pick…
mpg_df %>% filter(cyl %in% c(4,5,6)) %>% head(5)
## # A tibble: 5 × 11
## manufacturer model displ year cyl trans drv cty hwy fl class
## <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
## 1 audi a4 1.8 1999 4 auto(l5) f 18 29 p compa…
## 2 audi a4 1.8 1999 4 manual(m5) f 21 29 p compa…
## 3 audi a4 2 2008 4 manual(m6) f 20 31 p compa…
## 4 audi a4 2 2008 4 auto(av) f 21 30 p compa…
## 5 audi a4 2.8 1999 6 auto(l5) f 16 26 p compa…
#Phát hiện dữ liệu trống
mpg %>% filter(is.na(year))
## # A tibble: 0 × 11
## # … with 11 variables: manufacturer <chr>, model <chr>, displ <dbl>,
## # year <int>, cyl <int>, trans <chr>, drv <chr>, cty <int>, hwy <int>,
## # fl <chr>, class <chr>
# To drop out all the missing value
mpg %>% filter(!is.na(year)) %>% head(4)
## # A tibble: 4 × 11
## manufacturer model displ year cyl trans drv cty hwy fl class
## <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
## 1 audi a4 1.8 1999 4 auto(l5) f 18 29 p compa…
## 2 audi a4 1.8 1999 4 manual(m5) f 21 29 p compa…
## 3 audi a4 2 2008 4 manual(m6) f 20 31 p compa…
## 4 audi a4 2 2008 4 auto(av) f 21 30 p compa…
mpg %>% filter(complete.cases(.)) %>% head(4)
## # A tibble: 4 × 11
## manufacturer model displ year cyl trans drv cty hwy fl class
## <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
## 1 audi a4 1.8 1999 4 auto(l5) f 18 29 p compa…
## 2 audi a4 1.8 1999 4 manual(m5) f 21 29 p compa…
## 3 audi a4 2 2008 4 manual(m6) f 20 31 p compa…
## 4 audi a4 2 2008 4 auto(av) f 21 30 p compa…
# And to filter for all rows with a missing value in at least one column:
mpg %>% filter( !complete.cases(.) )
## # A tibble: 0 × 11
## # … with 11 variables: manufacturer <chr>, model <chr>, displ <dbl>,
## # year <int>, cyl <int>, trans <chr>, drv <chr>, cty <int>, hwy <int>,
## # fl <chr>, class <chr>
#Chaining dplyr và ggplot
mpg_df %>%
filter(class=='midsize') %>%
select(class,manufacturer,displ,year) %>%
arrange(displ) %>%
ggplot(aes(x=class,y=displ)) + geom_boxplot()

#Dữ liệu diamond
library(ggplot2)
diamonds_df <- diamonds
diamonds_df %>% select(-x, -y, -z) %>% mutate(AUD = price*1.25) %>% head(4)
## # A tibble: 4 × 8
## carat cut color clarity depth table price AUD
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 408.
## 2 0.21 Premium E SI1 59.8 61 326 408.
## 3 0.23 Good E VS1 56.9 65 327 409.
## 4 0.29 Premium I VS2 62.4 58 334 418.
#Thêm 1 biến giá 1 carat bằng cách lấy price/carat
diamonds_df %>% select(-x, -y , -z) %>% mutate(ppc = price/carat) %>% head(4)
## # A tibble: 4 × 8
## carat cut color clarity depth table price ppc
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 1417.
## 2 0.21 Premium E SI1 59.8 61 326 1552.
## 3 0.23 Good E VS1 56.9 65 327 1422.
## 4 0.29 Premium I VS2 62.4 58 334 1152.
#Ví dụ: tạo ra một biến số là price_label với điều kiện nếu giá price>5000 thì kim cương là loại đắt (expensive), ngược lại là loại rẻ (cheap)
diamonds_df %>% select(-x, -y, -z) %>% mutate(price_label = ifelse(price > 5000, "expensive", "cheap"))
## # A tibble: 53,940 × 8
## carat cut color clarity depth table price price_label
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <chr>
## 1 0.23 Ideal E SI2 61.5 55 326 cheap
## 2 0.21 Premium E SI1 59.8 61 326 cheap
## 3 0.23 Good E VS1 56.9 65 327 cheap
## 4 0.29 Premium I VS2 62.4 58 334 cheap
## 5 0.31 Good J SI2 63.3 58 335 cheap
## 6 0.24 Very Good J VVS2 62.8 57 336 cheap
## 7 0.24 Very Good I VVS1 62.3 57 336 cheap
## 8 0.26 Very Good H SI1 61.9 55 337 cheap
## 9 0.22 Fair E VS2 65.1 61 337 cheap
## 10 0.23 Very Good H VS1 59.4 61 338 cheap
## # … with 53,930 more rows
# Make the plot
diamonds_df %>% select(-x,-y,-z) %>%
mutate(price_label = ifelse(price > 5000,'expensive','cheap')) %>%
ggplot(aes(x=price, fill = price_label)) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#Lệnh case_when
diamonds_df %>%
select(clarity) %>%
mutate(clarity_group = case_when(clarity == 'IF' ~ 'flawless',
str_detect(clarity, 'VVS') ~ 'VV_slight',
str_detect(clarity, 'VS') ~ 'V_slight',
str_detect(clarity, 'SI') ~ 'slight',
clarity == 'I1' ~ 'impurity',
TRUE ~ 'other')) %>% head(10)
## # A tibble: 10 × 2
## clarity clarity_group
## <ord> <chr>
## 1 SI2 slight
## 2 SI1 slight
## 3 VS1 V_slight
## 4 VS2 V_slight
## 5 SI2 slight
## 6 VVS2 VV_slight
## 7 VVS1 VV_slight
## 8 SI1 slight
## 9 VS2 V_slight
## 10 VS1 V_slight
#Toán tử n() và summarize()
diamonds_df %>% summarize(mean_price = mean(price),
sd_price = sd(price),
min_price = min(price),
max_price = max(price),
n_rows = n())
## # A tibble: 1 × 5
## mean_price sd_price min_price max_price n_rows
## <dbl> <dbl> <int> <int> <int>
## 1 3933. 3989. 326 18823 53940
#Hàm summarize và group_by khi kết hợp.
diamonds_df %>% group_by(clarity) %>%
summarize(mean_price = mean(price),
sd_price = sd(price),
min_price = min(price),
max_price = max(price),
n_rows = n()) %>% head(10)
## # A tibble: 8 × 6
## clarity mean_price sd_price min_price max_price n_rows
## <ord> <dbl> <dbl> <int> <int> <int>
## 1 I1 3924. 2807. 345 18531 741
## 2 SI2 5063. 4260. 326 18804 9194
## 3 SI1 3996. 3799. 326 18818 13065
## 4 VS2 3925. 4042. 334 18823 12258
## 5 VS1 3839. 4012. 327 18795 8171
## 6 VVS2 3284. 3822. 336 18768 5066
## 7 VVS1 2523. 3335. 336 18777 3655
## 8 IF 2865. 3920. 369 18806 1790
diamonds_df %>%
group_by(clarity, cut) %>%
summarize(mean_price = mean(price),
sd_price = sd(price),
min_price = min(price),
max_price = max(price),
n_rows = n()) %>% head(10)
## `summarise()` has grouped output by 'clarity'. You can override using the
## `.groups` argument.
## # A tibble: 10 × 7
## # Groups: clarity [2]
## clarity cut mean_price sd_price min_price max_price n_rows
## <ord> <ord> <dbl> <dbl> <int> <int> <int>
## 1 I1 Fair 3704. 3099. 584 18531 210
## 2 I1 Good 3597. 2285. 361 11548 96
## 3 I1 Very Good 4078. 2720. 511 15984 84
## 4 I1 Premium 3947. 2827. 345 16193 205
## 5 I1 Ideal 4336. 2671. 413 16538 146
## 6 SI2 Fair 5174. 3928. 536 18308 466
## 7 SI2 Good 4580. 3901. 335 18788 1081
## 8 SI2 Very Good 4989. 4126. 383 18692 2100
## 9 SI2 Premium 5546. 4488. 345 18784 2949
## 10 SI2 Ideal 4756. 4252. 326 18804 2598