#Cài đặt thư viện dplyr ##Hàm select,filter,mutate,sample,count,arrange,contains,starts_with,everything.
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
data("airquality")
head(airquality)
## Ozone Solar.R Wind Temp Month Day
## 1 41 190 7.4 67 5 1
## 2 36 118 8.0 72 5 2
## 3 12 149 12.6 74 5 3
## 4 18 313 11.5 62 5 4
## 5 NA NA 14.3 56 5 5
## 6 28 NA 14.9 66 5 6
data("mpg")
mpg_df <- mpg
#####Select
select(airquality, Ozone, Temp) %>% head(10)
## Ozone Temp
## 1 41 67
## 2 36 72
## 3 12 74
## 4 18 62
## 5 NA 56
## 6 28 66
## 7 23 65
## 8 19 59
## 9 8 61
## 10 NA 69
airquality %>% select(-Ozone, -Temp) %>% head(10)
## Solar.R Wind Month Day
## 1 190 7.4 5 1
## 2 118 8.0 5 2
## 3 149 12.6 5 3
## 4 313 11.5 5 4
## 5 NA 14.3 5 5
## 6 NA 14.9 5 6
## 7 299 8.6 5 7
## 8 99 13.8 5 8
## 9 19 20.1 5 9
## 10 194 8.6 5 10
#####Filter
filter(airquality, Ozone > 106) %>% head(10)
## Ozone Solar.R Wind Temp Month Day
## 1 115 223 5.7 79 5 30
## 2 135 269 4.1 84 7 1
## 3 108 223 8.0 85 7 25
## 4 122 255 4.0 89 8 7
## 5 110 207 8.0 90 8 9
## 6 168 238 3.4 81 8 25
## 7 118 225 2.3 94 8 29
filter(airquality, Ozone > 106 & Month == 8) %>% head(10)
## Ozone Solar.R Wind Temp Month Day
## 1 122 255 4.0 89 8 7
## 2 110 207 8.0 90 8 9
## 3 168 238 3.4 81 8 25
## 4 118 225 2.3 94 8 29
filter(airquality, Ozone > 106, Month == 8, Wind > 7) %>% head(10)
## Ozone Solar.R Wind Temp Month Day
## 1 110 207 8 90 8 9
#####Mutate
mutate(airquality, temp.celsius = Temp - 31) %>% head(3)
## Ozone Solar.R Wind Temp Month Day temp.celsius
## 1 41 190 7.4 67 5 1 36
## 2 36 118 8.0 72 5 2 41
## 3 12 149 12.6 74 5 3 43
summarize(group_by(airquality, Month), round(mean(Temp, na.rm = T)))
## # A tibble: 5 × 2
## Month `round(mean(Temp, na.rm = T))`
## <int> <dbl>
## 1 5 66
## 2 6 79
## 3 7 84
## 4 8 84
## 5 9 77
#####Sample
sample_n(airquality, 2) # Lấy ngẫu nhiên 2 giá trị
## Ozone Solar.R Wind Temp Month Day
## 1 85 175 7.4 89 7 10
## 2 NA 153 5.7 88 8 27
sample_frac(airquality, size = 0.1) # Lấy ngẫu nhiên 10%
## Ozone Solar.R Wind Temp Month Day
## 1 9 24 13.8 81 8 2
## 2 44 236 14.9 81 9 11
## 3 21 230 10.9 75 9 9
## 4 6 78 18.4 57 5 18
## 5 85 188 6.3 94 8 31
## 6 118 225 2.3 94 8 29
## 7 35 NA 7.4 85 8 5
## 8 91 189 4.6 93 9 4
## 9 18 131 8.0 76 9 29
## 10 23 115 7.4 76 8 18
## 11 NA 150 6.3 77 6 21
## 12 27 175 14.9 81 7 13
## 13 18 313 11.5 62 5 4
## 14 11 290 9.2 66 5 13
## 15 20 37 9.2 65 6 18
#####Count
count(airquality, Month > 5) # Đếm xem có bao nhiêu qs có tháng lớn hơn 5 và nhỏ hơn
## Month > 5 n
## 1 FALSE 31
## 2 TRUE 122
count(airquality, Month) # Thống kê theo tháng
## Month n
## 1 5 31
## 2 6 30
## 3 7 31
## 4 8 31
## 5 9 30
#####Arrange
arrange(airquality, Month, Day) %>% head(5) # mặc định là ascending- tăng dần
## Ozone Solar.R Wind Temp Month Day
## 1 41 190 7.4 67 5 1
## 2 36 118 8.0 72 5 2
## 3 12 149 12.6 74 5 3
## 4 18 313 11.5 62 5 4
## 5 NA NA 14.3 56 5 5
arrange(airquality, desc(Month), Day) %>% head(5) # để sắp giảm dần, dùng desc (tức descending)
## Ozone Solar.R Wind Temp Month Day
## 1 96 167 6.9 91 9 1
## 2 78 197 5.1 92 9 2
## 3 73 183 2.8 93 9 3
## 4 91 189 4.6 93 9 4
## 5 47 95 7.4 87 9 5
#####Contains
mpg_df %>% select(contains('a')) %>% head(4)
## # A tibble: 4 × 4
## manufacturer year trans class
## <chr> <int> <chr> <chr>
## 1 audi 1999 auto(l5) compact
## 2 audi 1999 manual(m5) compact
## 3 audi 2008 manual(m6) compact
## 4 audi 2008 auto(av) compact
#####Starts_with
mpg_df %>% select(starts_with('p')) %>% head(5)
## # A tibble: 5 × 0
mpg_df %>% select( 2, 1, class, contains('y')) %>% head(5)
## # A tibble: 5 × 7
## model manufacturer class year cyl cty hwy
## <chr> <chr> <chr> <int> <int> <int> <int>
## 1 a4 audi compact 1999 4 18 29
## 2 a4 audi compact 1999 4 21 29
## 3 a4 audi compact 2008 4 20 31
## 4 a4 audi compact 2008 4 21 30
## 5 a4 audi compact 1999 6 16 26
#####Everything
mpg_df %>% select(class,displ,year,everything()) %>% head(4)
## # A tibble: 4 × 11
## class displ year manufacturer model cyl trans drv cty hwy fl
## <chr> <dbl> <int> <chr> <chr> <int> <chr> <chr> <int> <int> <chr>
## 1 compact 1.8 1999 audi a4 4 auto(l5) f 18 29 p
## 2 compact 1.8 1999 audi a4 4 manual(m… f 21 29 p
## 3 compact 2 2008 audi a4 4 manual(m… f 20 31 p
## 4 compact 2 2008 audi a4 4 auto(av) f 21 30 p
#Toán tử pipe
#Chuyển tiếp ống ( Forward pipe )
rnorm(100,20,5)%>%
matrix(ncol=2)%>%
data.frame(x=.[,1],y=.[,2])%>%
plot(col="pink")
##Toán tử T pipe
rnorm(100,10,1)%T>%
ts.plot(col="pink")%>%
density()%>%
plot(col="red","densityplot")
##Toán tử Assigning pipe
library(magrittr)
data.frame(x=rnorm(100,10,5),
y=rnorm(100,20,5)) %$% ts.plot(x,col="red")
data.frame(x=rnorm(100,10,5),
y=rnorm(100,20,5)) %$% ts.plot(y,col="blue")
##Toán tử Backwaed pipe
library(magrittr)
x=abs(rnorm(100))
y=x
cbind(x,y)%>%head()
## x y
## [1,] 0.16079413 0.16079413
## [2,] 0.64129290 0.64129290
## [3,] 0.39291205 0.39291205
## [4,] 0.04778004 0.04778004
## [5,] 0.57212651 0.57212651
## [6,] 3.48549299 3.48549299
x %<>% .^2 %>% sqrt()
x==y
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [46] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [61] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [76] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [91] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
#library(tidyverse)
library(dplyr)
# lọc dữ liệu mpg_df cho ô tô sản xuất năm 1999
filter(mpg_df, year == 1999) %>% head(5)
## # A tibble: 5 × 11
## manufacturer model displ year cyl trans drv cty hwy fl class
## <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
## 1 audi a4 1.8 1999 4 auto(… f 18 29 p comp…
## 2 audi a4 1.8 1999 4 manua… f 21 29 p comp…
## 3 audi a4 2.8 1999 6 auto(… f 16 26 p comp…
## 4 audi a4 2.8 1999 6 manua… f 18 26 p comp…
## 5 audi a4 quattro 1.8 1999 4 manua… 4 18 26 p comp…
# Hiển thị kích thước, bộ lọc
filter(mpg_df, year == 1999) %>% dim()
## [1] 117 11
#chúng ta hãy lấy tất cả các phương tiện trong lớp 'hạng trung'
filter(mpg_df, class == "midsize") %>% head(5)
## # A tibble: 5 × 11
## manufacturer model displ year cyl trans drv cty hwy fl class
## <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
## 1 audi a6 quattro 2.8 1999 6 auto(… 4 15 24 p mids…
## 2 audi a6 quattro 3.1 2008 6 auto(… 4 17 25 p mids…
## 3 audi a6 quattro 4.2 2008 8 auto(… 4 16 23 p mids…
## 4 chevrolet malibu 2.4 1999 4 auto(… f 19 27 r mids…
## 5 chevrolet malibu 2.4 2008 4 auto(… f 22 30 r mids…
#bộ lọc cho các phương tiện được sản xuất vào năm 1999 và có số dặm trong thành phố (cty) lớn hơn 18
mpg_df %>% filter(year==1999 & cty > 18) %>% head(4)
## # A tibble: 4 × 11
## manufacturer model displ year cyl trans drv cty hwy fl class
## <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
## 1 audi a4 1.8 1999 4 manual(m5) f 21 29 p comp…
## 2 chevrolet malibu 2.4 1999 4 auto(l4) f 19 27 r mids…
## 3 honda civic 1.6 1999 4 manual(m5) f 28 33 r subc…
## 4 honda civic 1.6 1999 4 auto(l4) f 24 32 r subc…
# bộ lọc dành cho xe (tức là hàng) có nhà sản xuất là Chevrolet hoặc hạng là 'suv'.
mpg_df %>% filter(manufacturer=='chevrolet' | class=='suv') %>% head(4)
## # A tibble: 4 × 11
## manufacturer model displ year cyl trans drv cty hwy fl class
## <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
## 1 chevrolet c1500 subu… 5.3 2008 8 auto… r 14 20 r suv
## 2 chevrolet c1500 subu… 5.3 2008 8 auto… r 11 15 e suv
## 3 chevrolet c1500 subu… 5.3 2008 8 auto… r 14 20 r suv
## 4 chevrolet c1500 subu… 5.7 1999 8 auto… r 13 17 r suv
mpg_df %>% filter( (manufacturer=='chevrolet' | class=='suv') & hwy < 20) %>% head(4)
## # A tibble: 4 × 11
## manufacturer model displ year cyl trans drv cty hwy fl class
## <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
## 1 chevrolet c1500 subu… 5.3 2008 8 auto… r 11 15 e suv
## 2 chevrolet c1500 subu… 5.7 1999 8 auto… r 13 17 r suv
## 3 chevrolet c1500 subu… 6 2008 8 auto… r 12 17 r suv
## 4 chevrolet k1500 taho… 5.3 2008 8 auto… 4 14 19 r suv
##Sử dụng str_detect() và %in
#library(tidyverse)
#mpg_df %>% filter(str_detect(model,'4wd')) %>% head(5)
mpg_df %>% filter(cyl %in% c(4,5,6)) %>% head(5)
## # A tibble: 5 × 11
## manufacturer model displ year cyl trans drv cty hwy fl class
## <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
## 1 audi a4 1.8 1999 4 auto(l5) f 18 29 p compa…
## 2 audi a4 1.8 1999 4 manual(m5) f 21 29 p compa…
## 3 audi a4 2 2008 4 manual(m6) f 20 31 p compa…
## 4 audi a4 2 2008 4 auto(av) f 21 30 p compa…
## 5 audi a4 2.8 1999 6 auto(l5) f 16 26 p compa…
#Phát hiện dl trống
mpg %>% filter(is.na(year))
## # A tibble: 0 × 11
## # … with 11 variables: manufacturer <chr>, model <chr>, displ <dbl>,
## # year <int>, cyl <int>, trans <chr>, drv <chr>, cty <int>, hwy <int>,
## # fl <chr>, class <chr>
#Loại bỏ tất cả giá trị còn thiếu
mpg %>% filter(!is.na(year)) %>% head(4)
## # A tibble: 4 × 11
## manufacturer model displ year cyl trans drv cty hwy fl class
## <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
## 1 audi a4 1.8 1999 4 auto(l5) f 18 29 p compa…
## 2 audi a4 1.8 1999 4 manual(m5) f 21 29 p compa…
## 3 audi a4 2 2008 4 manual(m6) f 20 31 p compa…
## 4 audi a4 2 2008 4 auto(av) f 21 30 p compa…
#Kiểm tra giá trị bằng complete.cases Và để lọc tất cả các hàng có giá trị bị thiếu trong ít nhất một cột:
mpg %>% filter(complete.cases(.)) %>% head(4)
## # A tibble: 4 × 11
## manufacturer model displ year cyl trans drv cty hwy fl class
## <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
## 1 audi a4 1.8 1999 4 auto(l5) f 18 29 p compa…
## 2 audi a4 1.8 1999 4 manual(m5) f 21 29 p compa…
## 3 audi a4 2 2008 4 manual(m6) f 20 31 p compa…
## 4 audi a4 2 2008 4 auto(av) f 21 30 p compa…
mpg %>% filter( !complete.cases(.) )
## # A tibble: 0 × 11
## # … with 11 variables: manufacturer <chr>, model <chr>, displ <dbl>,
## # year <int>, cyl <int>, trans <chr>, drv <chr>, cty <int>, hwy <int>,
## # fl <chr>, class <chr>
##Xâu chuổi dplyr và ggplot
mpg_df %>%
filter(class=='midsize') %>%
select(class,manufacturer,displ,year) %>%
arrange(displ) %>%
ggplot(aes(x=class,y=displ)) + geom_boxplot()
#Dữ liệu với Kim cương
library(ggplot2)
data("diamonds")
diamonds_df <- diamonds
diamonds_df %>% select(-x, -y, -z) %>% mutate(AUD = price*1.25) %>% head(4)
## # A tibble: 4 × 8
## carat cut color clarity depth table price AUD
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 408.
## 2 0.21 Premium E SI1 59.8 61 326 408.
## 3 0.23 Good E VS1 56.9 65 327 409.
## 4 0.29 Premium I VS2 62.4 58 334 418.
diamonds_df %>% select(-x, -y , -z) %>% mutate(ppc = price/carat) %>% head(4)
## # A tibble: 4 × 8
## carat cut color clarity depth table price ppc
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 1417.
## 2 0.21 Premium E SI1 59.8 61 326 1552.
## 3 0.23 Good E VS1 56.9 65 327 1422.
## 4 0.29 Premium I VS2 62.4 58 334 1152.
diamonds_df %>% select(-x, -y, -z) %>% mutate(price_label = ifelse(price > 5000, "expensive", "cheap"))
## # A tibble: 53,940 × 8
## carat cut color clarity depth table price price_label
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <chr>
## 1 0.23 Ideal E SI2 61.5 55 326 cheap
## 2 0.21 Premium E SI1 59.8 61 326 cheap
## 3 0.23 Good E VS1 56.9 65 327 cheap
## 4 0.29 Premium I VS2 62.4 58 334 cheap
## 5 0.31 Good J SI2 63.3 58 335 cheap
## 6 0.24 Very Good J VVS2 62.8 57 336 cheap
## 7 0.24 Very Good I VVS1 62.3 57 336 cheap
## 8 0.26 Very Good H SI1 61.9 55 337 cheap
## 9 0.22 Fair E VS2 65.1 61 337 cheap
## 10 0.23 Very Good H VS1 59.4 61 338 cheap
## # … with 53,930 more rows
#Lập biểu đồ
diamonds_df %>% select(-x,-y,-z) %>%
mutate(price_label = ifelse(price > 5000,'expensive','cheap')) %>%
ggplot(aes(x=price, fill = price_label)) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
diamonds_df %>%
select(clarity) %>%
mutate(clarity_group = case_when(clarity == 'IF' ~ 'flawless',
#str_detect(clarity, 'VVS') ~ 'VV_slight',
#str_detect(clarity, 'VS') ~ 'V_slight',
#str_detect(clarity, 'SI') ~ 'slight',
clarity == 'I1' ~ 'impurity',
TRUE ~ 'other')) %>% head(10)
## # A tibble: 10 × 2
## clarity clarity_group
## <ord> <chr>
## 1 SI2 other
## 2 SI1 other
## 3 VS1 other
## 4 VS2 other
## 5 SI2 other
## 6 VVS2 other
## 7 VVS1 other
## 8 SI1 other
## 9 VS2 other
## 10 VS1 other
#Toán tử n() và summarize()
diamonds_df %>% summarize(mean_price = mean(price),
sd_price = sd(price),
min_price = min(price),
max_price = max(price),
n_rows = n())
## # A tibble: 1 × 5
## mean_price sd_price min_price max_price n_rows
## <dbl> <dbl> <int> <int> <int>
## 1 3933. 3989. 326 18823 53940
diamonds_df %>% group_by(clarity) %>%
summarize(mean_price = mean(price),
sd_price = sd(price),
min_price = min(price),
max_price = max(price),
n_rows = n()) %>% head(10)
## # A tibble: 8 × 6
## clarity mean_price sd_price min_price max_price n_rows
## <ord> <dbl> <dbl> <int> <int> <int>
## 1 I1 3924. 2807. 345 18531 741
## 2 SI2 5063. 4260. 326 18804 9194
## 3 SI1 3996. 3799. 326 18818 13065
## 4 VS2 3925. 4042. 334 18823 12258
## 5 VS1 3839. 4012. 327 18795 8171
## 6 VVS2 3284. 3822. 336 18768 5066
## 7 VVS1 2523. 3335. 336 18777 3655
## 8 IF 2865. 3920. 369 18806 1790
diamonds_df %>%
group_by(clarity, cut) %>%
summarize(mean_price = mean(price),
sd_price = sd(price),
min_price = min(price),
max_price = max(price),
n_rows = n()) %>% head(10)
## `summarise()` has grouped output by 'clarity'. You can override using the
## `.groups` argument.
## # A tibble: 10 × 7
## # Groups: clarity [2]
## clarity cut mean_price sd_price min_price max_price n_rows
## <ord> <ord> <dbl> <dbl> <int> <int> <int>
## 1 I1 Fair 3704. 3099. 584 18531 210
## 2 I1 Good 3597. 2285. 361 11548 96
## 3 I1 Very Good 4078. 2720. 511 15984 84
## 4 I1 Premium 3947. 2827. 345 16193 205
## 5 I1 Ideal 4336. 2671. 413 16538 146
## 6 SI2 Fair 5174. 3928. 536 18308 466
## 7 SI2 Good 4580. 3901. 335 18788 1081
## 8 SI2 Very Good 4989. 4126. 383 18692 2100
## 9 SI2 Premium 5546. 4488. 345 18784 2949
## 10 SI2 Ideal 4756. 4252. 326 18804 2598