load library
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
load data
df <- mpg
print(df)
## # A tibble: 234 × 11
## manufacturer model displ year cyl trans drv cty hwy fl class
## <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
## 1 audi a4 1.8 1999 4 auto… f 18 29 p comp…
## 2 audi a4 1.8 1999 4 manu… f 21 29 p comp…
## 3 audi a4 2 2008 4 manu… f 20 31 p comp…
## 4 audi a4 2 2008 4 auto… f 21 30 p comp…
## 5 audi a4 2.8 1999 6 auto… f 16 26 p comp…
## 6 audi a4 2.8 1999 6 manu… f 18 26 p comp…
## 7 audi a4 3.1 2008 6 auto… f 18 27 p comp…
## 8 audi a4 quattro 1.8 1999 4 manu… 4 18 26 p comp…
## 9 audi a4 quattro 1.8 1999 4 auto… 4 16 25 p comp…
## 10 audi a4 quattro 2 2008 4 manu… 4 20 28 p comp…
## # … with 224 more rows
# check null values
any(is.na(df))
## [1] FALSE
# explore number of columns and rows
ncol(df); nrow(df)
## [1] 11
## [1] 234
# explore structure of a table
str(df)
## tibble [234 × 11] (S3: tbl_df/tbl/data.frame)
## $ manufacturer: chr [1:234] "audi" "audi" "audi" "audi" ...
## $ model : chr [1:234] "a4" "a4" "a4" "a4" ...
## $ displ : num [1:234] 1.8 1.8 2 2 2.8 2.8 3.1 1.8 1.8 2 ...
## $ year : int [1:234] 1999 1999 2008 2008 1999 1999 2008 1999 1999 2008 ...
## $ cyl : int [1:234] 4 4 4 4 6 6 6 4 4 4 ...
## $ trans : chr [1:234] "auto(l5)" "manual(m5)" "manual(m6)" "auto(av)" ...
## $ drv : chr [1:234] "f" "f" "f" "f" ...
## $ cty : int [1:234] 18 21 20 21 16 18 18 18 16 20 ...
## $ hwy : int [1:234] 29 29 31 30 26 26 27 26 25 28 ...
## $ fl : chr [1:234] "p" "p" "p" "p" ...
## $ class : chr [1:234] "compact" "compact" "compact" "compact" ...
# explore column names
colnames(df)
## [1] "manufacturer" "model" "displ" "year" "cyl"
## [6] "trans" "drv" "cty" "hwy" "fl"
## [11] "class"
function to select column like select clause in SQL
# extract specified column
df %>% select(manufacturer, model, year, class)
## # A tibble: 234 × 4
## manufacturer model year class
## <chr> <chr> <int> <chr>
## 1 audi a4 1999 compact
## 2 audi a4 1999 compact
## 3 audi a4 2008 compact
## 4 audi a4 2008 compact
## 5 audi a4 1999 compact
## 6 audi a4 1999 compact
## 7 audi a4 2008 compact
## 8 audi a4 quattro 1999 compact
## 9 audi a4 quattro 1999 compact
## 10 audi a4 quattro 2008 compact
## # … with 224 more rows
# extract matched criteria
df %>% select(starts_with(match = "c"))
## # A tibble: 234 × 3
## cyl cty class
## <int> <int> <chr>
## 1 4 18 compact
## 2 4 21 compact
## 3 4 20 compact
## 4 4 21 compact
## 5 6 16 compact
## 6 6 18 compact
## 7 6 18 compact
## 8 4 18 compact
## 9 4 16 compact
## 10 4 20 compact
## # … with 224 more rows
df %>% select(contains("a"))
## # A tibble: 234 × 4
## manufacturer year trans class
## <chr> <int> <chr> <chr>
## 1 audi 1999 auto(l5) compact
## 2 audi 1999 manual(m5) compact
## 3 audi 2008 manual(m6) compact
## 4 audi 2008 auto(av) compact
## 5 audi 1999 auto(l5) compact
## 6 audi 1999 manual(m5) compact
## 7 audi 2008 auto(av) compact
## 8 audi 1999 manual(m5) compact
## 9 audi 1999 auto(l5) compact
## 10 audi 2008 manual(m6) compact
## # … with 224 more rows
# select using index
df %>% select(2:5)
## # A tibble: 234 × 4
## model displ year cyl
## <chr> <dbl> <int> <int>
## 1 a4 1.8 1999 4
## 2 a4 1.8 1999 4
## 3 a4 2 2008 4
## 4 a4 2 2008 4
## 5 a4 2.8 1999 6
## 6 a4 2.8 1999 6
## 7 a4 3.1 2008 6
## 8 a4 quattro 1.8 1999 4
## 9 a4 quattro 1.8 1999 4
## 10 a4 quattro 2 2008 4
## # … with 224 more rows
# select last n columns (little tricky)
df %>% select((ncol(df)-2):ncol(df)) # it will extract last 3 columns
## # A tibble: 234 × 3
## hwy fl class
## <int> <chr> <chr>
## 1 29 p compact
## 2 29 p compact
## 3 31 p compact
## 4 30 p compact
## 5 26 p compact
## 6 26 p compact
## 7 27 p compact
## 8 26 p compact
## 9 25 p compact
## 10 28 p compact
## # … with 224 more rows
everything is a function to grab all the remaining unspecified column
# rename all column
df1 <- df %>% rename(yes = everything())
# using select and rename simultaneously
df %>% select(mnfc = manufacturer, mod = model, display = displ, everything())
## # A tibble: 234 × 11
## mnfc mod display year cyl trans drv cty hwy fl class
## <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
## 1 audi a4 1.8 1999 4 auto(l5) f 18 29 p comp…
## 2 audi a4 1.8 1999 4 manual(m5) f 21 29 p comp…
## 3 audi a4 2 2008 4 manual(m6) f 20 31 p comp…
## 4 audi a4 2 2008 4 auto(av) f 21 30 p comp…
## 5 audi a4 2.8 1999 6 auto(l5) f 16 26 p comp…
## 6 audi a4 2.8 1999 6 manual(m5) f 18 26 p comp…
## 7 audi a4 3.1 2008 6 auto(av) f 18 27 p comp…
## 8 audi a4 quattro 1.8 1999 4 manual(m5) 4 18 26 p comp…
## 9 audi a4 quattro 1.8 1999 4 auto(l5) 4 16 25 p comp…
## 10 audi a4 quattro 2 2008 4 manual(m6) 4 20 28 p comp…
## # … with 224 more rows
# note that this code doesn't rename original data set, it just rename the displayed data, i.e aliases
#to rename the original data set, use assign
# df <-df %>% rename(mnfc = manufacturer, mod = model)
mutate is a function to create new variable in a data set
transmute is a function to create new variable in a data set, but it
will remove all others variables
# create new variable
df <- df %>% mutate(`avg mile per galon` = (cty+hwy)/2)
df <- df %>% mutate(car = paste(manufacturer, model))
df <- df %>% mutate(`cyl and trans` = paste(cyl, "cylinders and", trans, "transmissions"))
filter is a function to filter rows by criteria
slice is a function to filter rows by its indices
# use same order logic as SQL logic
df %>% select(manufacturer, year, hwy) %>% filter(manufacturer %in% c("audi","ford"))
## # A tibble: 43 × 3
## manufacturer year hwy
## <chr> <int> <int>
## 1 audi 1999 29
## 2 audi 1999 29
## 3 audi 2008 31
## 4 audi 2008 30
## 5 audi 1999 26
## 6 audi 1999 26
## 7 audi 2008 27
## 8 audi 1999 26
## 9 audi 1999 25
## 10 audi 2008 28
## # … with 33 more rows
# if we want to filter manufacturer but not want manufacturer to be displayed, then put the filter prior to select
df %>% filter(manufacturer %in% c("audi","ford")) %>% select(year, hwy)
## # A tibble: 43 × 2
## year hwy
## <int> <int>
## 1 1999 29
## 2 1999 29
## 3 2008 31
## 4 2008 30
## 5 1999 26
## 6 1999 26
## 7 2008 27
## 8 1999 26
## 9 1999 25
## 10 2008 28
## # … with 33 more rows
# if we want to filter manufacturer but not want manufacturer to be displayed but put the filter after select, it will cause an error
# combining filter and select
df %>% select(manufacturer, year, hwy) %>% filter(manufacturer %in% c("audi","ford")) %>% group_by(manufacturer, year) %>% summarise(`avg mean` = mean(hwy))
## `summarise()` has grouped output by 'manufacturer'. You can override using the
## `.groups` argument.
## # A tibble: 4 × 3
## # Groups: manufacturer [2]
## manufacturer year `avg mean`
## <chr> <int> <dbl>
## 1 audi 1999 26.1
## 2 audi 2008 26.8
## 3 ford 1999 18.6
## 4 ford 2008 20.5
slice()
# get data from row 20 to 30
df %>% slice(20:30)
## # A tibble: 11 × 14
## manufac…¹ model displ year cyl trans drv cty hwy fl class avg m…²
## <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr> <dbl>
## 1 chevrolet c150… 5.3 2008 8 auto… r 11 15 e suv 13
## 2 chevrolet c150… 5.3 2008 8 auto… r 14 20 r suv 17
## 3 chevrolet c150… 5.7 1999 8 auto… r 13 17 r suv 15
## 4 chevrolet c150… 6 2008 8 auto… r 12 17 r suv 14.5
## 5 chevrolet corv… 5.7 1999 8 manu… r 16 26 p 2sea… 21
## 6 chevrolet corv… 5.7 1999 8 auto… r 15 23 p 2sea… 19
## 7 chevrolet corv… 6.2 2008 8 manu… r 16 26 p 2sea… 21
## 8 chevrolet corv… 6.2 2008 8 auto… r 15 25 p 2sea… 20
## 9 chevrolet corv… 7 2008 8 manu… r 15 24 p 2sea… 19.5
## 10 chevrolet k150… 5.3 2008 8 auto… 4 14 19 r suv 16.5
## 11 chevrolet k150… 5.3 2008 8 auto… 4 11 14 e suv 12.5
## # … with 2 more variables: car <chr>, `cyl and trans` <chr>, and abbreviated
## # variable names ¹manufacturer, ²`avg mile per galon`
# combining slice and select
df %>% select(manufacturer, model, year) %>% slice(20:30)
## # A tibble: 11 × 3
## manufacturer model year
## <chr> <chr> <int>
## 1 chevrolet c1500 suburban 2wd 2008
## 2 chevrolet c1500 suburban 2wd 2008
## 3 chevrolet c1500 suburban 2wd 1999
## 4 chevrolet c1500 suburban 2wd 2008
## 5 chevrolet corvette 1999
## 6 chevrolet corvette 1999
## 7 chevrolet corvette 2008
## 8 chevrolet corvette 2008
## 9 chevrolet corvette 2008
## 10 chevrolet k1500 tahoe 4wd 2008
## 11 chevrolet k1500 tahoe 4wd 2008
just like order by clause in SQL
default ordering is ascending
# order cty in asc and hwy in desc, then display 20 records
print(df %>% arrange(cty, desc(hwy)), n=20)
## # A tibble: 234 × 14
## manufac…¹ model displ year cyl trans drv cty hwy fl class avg m…²
## <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr> <dbl>
## 1 dodge dako… 4.7 2008 8 auto… 4 9 12 e pick… 10.5
## 2 dodge dura… 4.7 2008 8 auto… 4 9 12 e suv 10.5
## 3 dodge ram … 4.7 2008 8 auto… 4 9 12 e pick… 10.5
## 4 dodge ram … 4.7 2008 8 manu… 4 9 12 e pick… 10.5
## 5 jeep gran… 4.7 2008 8 auto… 4 9 12 e suv 10.5
## 6 dodge cara… 3.3 2008 6 auto… f 11 17 e mini… 14
## 7 dodge dako… 5.2 1999 8 manu… 4 11 17 r pick… 14
## 8 ford expe… 4.6 1999 8 auto… r 11 17 r suv 14
## 9 ford expe… 5.4 1999 8 auto… r 11 17 r suv 14
## 10 lincoln navi… 5.4 1999 8 auto… r 11 17 r suv 14
## 11 dodge dura… 5.2 1999 8 auto… 4 11 16 r suv 13.5
## 12 dodge ram … 5.2 1999 8 manu… 4 11 16 r pick… 13.5
## 13 lincoln navi… 5.4 1999 8 auto… r 11 16 p suv 13.5
## 14 chevrolet c150… 5.3 2008 8 auto… r 11 15 e suv 13
## 15 chevrolet k150… 5.7 1999 8 auto… 4 11 15 r suv 13
## 16 dodge dako… 5.2 1999 8 auto… 4 11 15 r pick… 13
## 17 dodge dura… 5.9 1999 8 auto… 4 11 15 r suv 13
## 18 dodge ram … 5.2 1999 8 auto… 4 11 15 r pick… 13
## 19 dodge ram … 5.9 1999 8 auto… 4 11 15 r pick… 13
## 20 ford f150… 5.4 1999 8 auto… 4 11 15 r pick… 13
## # … with 214 more rows, 2 more variables: car <chr>, `cyl and trans` <chr>, and
## # abbreviated variable names ¹manufacturer, ²`avg mile per galon`
# using arrange in advance
print(df %>% select(manufacturer, year, hwy) %>% group_by(manufacturer) %>% filter(year == 1999) %>% summarise(`average of hwy` = mean(hwy)) %>% arrange(desc(`average of hwy`)), n=5)
## # A tibble: 15 × 2
## manufacturer `average of hwy`
## <chr> <dbl>
## 1 honda 31.6
## 2 volkswagen 29.7
## 3 hyundai 26.7
## 4 pontiac 26.3
## 5 audi 26.1
## # … with 10 more rows
just like distinct cluase in SQL
df %>% distinct(manufacturer, year) %>% arrange(manufacturer, year)
## # A tibble: 30 × 2
## manufacturer year
## <chr> <int>
## 1 audi 1999
## 2 audi 2008
## 3 chevrolet 1999
## 4 chevrolet 2008
## 5 dodge 1999
## 6 dodge 2008
## 7 ford 1999
## 8 ford 2008
## 9 honda 1999
## 10 honda 2008
## # … with 20 more rows
binding either row(s) or column(s) just like rbind and cbind
exmp <- data.frame(id = c(1,2,3), names = c("john", "michael", "aries"))
exmp <- bind_rows(exmp, slice(exmp, 2:3))
exmp <- arrange(exmp, id)
sample_n() to sampling number of n rows
sample_frac() to sampling percent of rows
# retrieve 20 samples from df without replacement
sample_n(df, 20, replace = F)
## # A tibble: 20 × 14
## manufac…¹ model displ year cyl trans drv cty hwy fl class avg m…²
## <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr> <dbl>
## 1 jeep gran… 6.1 2008 8 auto… 4 11 14 p suv 12.5
## 2 toyota toyo… 2.7 2008 4 manu… 4 17 22 r pick… 19.5
## 3 land rov… rang… 4.6 1999 8 auto… 4 11 15 p suv 13
## 4 subaru fore… 2.5 1999 4 manu… 4 18 25 r suv 21.5
## 5 dodge dako… 5.2 1999 8 auto… 4 11 15 r pick… 13
## 6 ford expe… 4.6 1999 8 auto… r 11 17 r suv 14
## 7 toyota coro… 1.8 1999 4 manu… f 26 35 r comp… 30.5
## 8 dodge ram … 4.7 2008 8 manu… 4 12 16 r pick… 14
## 9 volkswag… new … 1.9 1999 4 manu… f 35 44 d subc… 39.5
## 10 toyota camr… 3.3 2008 6 auto… f 18 27 r comp… 22.5
## 11 mercury moun… 5 1999 8 auto… 4 13 17 r suv 15
## 12 honda civic 1.8 2008 4 auto… f 24 36 c subc… 30
## 13 ford expe… 5.4 1999 8 auto… r 11 17 r suv 14
## 14 chevrolet mali… 3.6 2008 6 auto… f 17 26 r mids… 21.5
## 15 volkswag… jetta 2 2008 4 manu… f 21 29 p comp… 25
## 16 dodge ram … 4.7 2008 8 manu… 4 9 12 e pick… 10.5
## 17 dodge dako… 4.7 2008 8 auto… 4 9 12 e pick… 10.5
## 18 pontiac gran… 3.1 1999 6 auto… f 18 26 r mids… 22
## 19 dodge cara… 3 1999 6 auto… f 17 24 r mini… 20.5
## 20 audi a6 q… 3.1 2008 6 auto… 4 17 25 p mids… 21
## # … with 2 more variables: car <chr>, `cyl and trans` <chr>, and abbreviated
## # variable names ¹manufacturer, ²`avg mile per galon`
# retrieve 20% sample from df
sample_frac(df, 0.2)
## # A tibble: 47 × 14
## manufac…¹ model displ year cyl trans drv cty hwy fl class avg m…²
## <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr> <dbl>
## 1 hyundai tibu… 2 2008 4 auto… f 20 27 r subc… 23.5
## 2 toyota coro… 1.8 2008 4 manu… f 28 37 r comp… 32.5
## 3 ford f150… 4.2 1999 6 auto… 4 14 17 r pick… 15.5
## 4 dodge cara… 3 1999 6 auto… f 17 24 r mini… 20.5
## 5 subaru fore… 2.5 1999 4 manu… 4 18 25 r suv 21.5
## 6 chevrolet c150… 5.7 1999 8 auto… r 13 17 r suv 15
## 7 dodge dura… 4.7 2008 8 auto… 4 9 12 e suv 10.5
## 8 nissan path… 5.6 2008 8 auto… 4 12 18 p suv 15
## 9 volkswag… gti 2 2008 4 manu… f 21 29 p comp… 25
## 10 chevrolet corv… 6.2 2008 8 auto… r 15 25 p 2sea… 20
## # … with 37 more rows, 2 more variables: car <chr>, `cyl and trans` <chr>, and
## # abbreviated variable names ¹manufacturer, ²`avg mile per galon`
used with aggregate function
# count number of records and number of distinct cty records
df %>% summarise(n(), n_distinct(cty))
## # A tibble: 1 × 2
## `n()` `n_distinct(cty)`
## <int> <int>
## 1 234 21
# count number of records in advance
df %>% select(manufacturer, model) %>% group_by(manufacturer, model) %>% summarise(numrec = n())
## `summarise()` has grouped output by 'manufacturer'. You can override using the
## `.groups` argument.
## # A tibble: 38 × 3
## # Groups: manufacturer [15]
## manufacturer model numrec
## <chr> <chr> <int>
## 1 audi a4 7
## 2 audi a4 quattro 8
## 3 audi a6 quattro 3
## 4 chevrolet c1500 suburban 2wd 5
## 5 chevrolet corvette 5
## 6 chevrolet k1500 tahoe 4wd 4
## 7 chevrolet malibu 5
## 8 dodge caravan 2wd 11
## 9 dodge dakota pickup 4wd 9
## 10 dodge durango 4wd 7
## # … with 28 more rows
the same approach with simpler code line is using
count()
count(df, manufacturer, model) # will produce exactly same result as in second code line of summarise() part
## # A tibble: 38 × 3
## manufacturer model n
## <chr> <chr> <int>
## 1 audi a4 7
## 2 audi a4 quattro 8
## 3 audi a6 quattro 3
## 4 chevrolet c1500 suburban 2wd 5
## 5 chevrolet corvette 5
## 6 chevrolet k1500 tahoe 4wd 4
## 7 chevrolet malibu 5
## 8 dodge caravan 2wd 11
## 9 dodge dakota pickup 4wd 9
## 10 dodge durango 4wd 7
## # … with 28 more rows
df %>% group_by(manufacturer) %>% summarise(n())
## # A tibble: 15 × 2
## manufacturer `n()`
## <chr> <int>
## 1 audi 18
## 2 chevrolet 19
## 3 dodge 37
## 4 ford 25
## 5 honda 9
## 6 hyundai 14
## 7 jeep 8
## 8 land rover 4
## 9 lincoln 3
## 10 mercury 4
## 11 nissan 13
## 12 pontiac 5
## 13 subaru 14
## 14 toyota 34
## 15 volkswagen 27
df %>% group_by(manufacturer) %>% summarise(`min hwy` = min(hwy), `max hwy` = max(hwy),
`min cyl` = min(cyl), `max cyl` = max(cyl))
## # A tibble: 15 × 5
## manufacturer `min hwy` `max hwy` `min cyl` `max cyl`
## <chr> <int> <int> <int> <int>
## 1 audi 23 31 4 8
## 2 chevrolet 14 30 4 8
## 3 dodge 12 24 4 8
## 4 ford 15 26 6 8
## 5 honda 29 36 4 4
## 6 hyundai 24 31 4 6
## 7 jeep 12 22 6 8
## 8 land rover 15 18 8 8
## 9 lincoln 16 18 8 8
## 10 mercury 17 19 6 8
## 11 nissan 17 32 4 8
## 12 pontiac 25 28 6 8
## 13 subaru 23 27 4 4
## 14 toyota 15 37 4 8
## 15 volkswagen 23 44 4 6
df %>% group_by(manufacturer, model, class, trans) %>%
summarise(`mean hwy` = mean(hwy), cars = n()) %>%
filter(`mean hwy` > 30) %>%
arrange(desc(`mean hwy`)) %>% ungroup()
## `summarise()` has grouped output by 'manufacturer', 'model', 'class'. You can
## override using the `.groups` argument.
## # A tibble: 10 × 6
## manufacturer model class trans `mean hwy` cars
## <chr> <chr> <chr> <chr> <dbl> <int>
## 1 honda civic subcompact auto(l5) 36 2
## 2 toyota corolla compact manual(m5) 36 2
## 3 toyota corolla compact auto(l4) 34 2
## 4 volkswagen new beetle subcompact manual(m5) 33.7 3
## 5 volkswagen new beetle subcompact auto(l4) 33.5 2
## 6 honda civic subcompact auto(l4) 32 2
## 7 honda civic subcompact manual(m5) 32 4
## 8 volkswagen jetta compact manual(m5) 31.5 4
## 9 audi a4 compact manual(m6) 31 1
## 10 toyota camry midsize auto(l5) 31 1
transforming table from wide format to long format and vice versa.
# create long table example
long <- data.frame(id = 1:6, type = c("a", "b", "b", "c", "b", "a"),
total = c(20, 40, 30, 45, 12, 35))
# convert to wide format
wide <- pivot_wider(long, names_from = type, values_from = total)
# bring back wide to long
wide %>% pivot_longer(cols = c("a", "b", "c"), names_to = "type", values_to = "total", values_drop_na = T)
## # A tibble: 6 × 3
## id type total
## <int> <chr> <dbl>
## 1 1 a 20
## 2 2 b 40
## 3 3 b 30
## 4 4 c 45
## 5 5 b 12
## 6 6 a 35
# play in advance
play <- df %>% filter(manufacturer %in% c("jeep", "hyundai", "land rover")) %>%
select(model, trans, hwy)
wideplay <- play %>% group_by(model, trans) %>% summarise(`avg hwy` = mean(hwy)) %>%
pivot_wider(names_from = trans, values_from = `avg hwy`)
## `summarise()` has grouped output by 'model'. You can override using the
## `.groups` argument.
#convert back with little trick
wideplay %>% pivot_longer(cols = -model, # exclude column model and remain all the rest column
names_to = "trans",
values_to = "avg hwy",
values_drop_na = T)
## # A tibble: 10 × 3
## # Groups: model [4]
## model trans `avg hwy`
## <chr> <chr> <dbl>
## 1 grand cherokee 4wd auto(l4) 18.5
## 2 grand cherokee 4wd auto(l5) 17.3
## 3 range rover auto(l4) 15
## 4 range rover auto(s6) 18
## 5 sonata auto(l4) 27.3
## 6 sonata auto(l5) 28
## 7 sonata manual(m5) 28
## 8 tiburon auto(l4) 25.7
## 9 tiburon manual(m5) 27
## 10 tiburon manual(m6) 24
just like split and merge in excel
separate
# create dates table just for example
dates <- seq.Date(from = as.Date("2022-01-01"), to = as.Date("2022-12-31"), by = "day")
# place in table format
datedf <- data.frame(date = dates)
sptdatedf <- datedf %>% separate(col = date, into = c("year", "month", "date"), sep = "-") %>%
mutate(month = as.numeric(month),
date = as.numeric(date)) # remove leading zeros
# other ways to mutate
datedf %>% separate(col = date, into = c("year", "month", "date"), sep = "-") %>%
mutate_at(.vars = c("month", "date"), # determine what's columns to mutate
.funs = as.numeric) # determine what kind of function to apply
## year month date
## 1 2022 1 1
## 2 2022 1 2
## 3 2022 1 3
## 4 2022 1 4
## 5 2022 1 5
## 6 2022 1 6
## 7 2022 1 7
## 8 2022 1 8
## 9 2022 1 9
## 10 2022 1 10
## 11 2022 1 11
## 12 2022 1 12
## 13 2022 1 13
## 14 2022 1 14
## 15 2022 1 15
## 16 2022 1 16
## 17 2022 1 17
## 18 2022 1 18
## 19 2022 1 19
## 20 2022 1 20
## 21 2022 1 21
## 22 2022 1 22
## 23 2022 1 23
## 24 2022 1 24
## 25 2022 1 25
## 26 2022 1 26
## 27 2022 1 27
## 28 2022 1 28
## 29 2022 1 29
## 30 2022 1 30
## 31 2022 1 31
## 32 2022 2 1
## 33 2022 2 2
## 34 2022 2 3
## 35 2022 2 4
## 36 2022 2 5
## 37 2022 2 6
## 38 2022 2 7
## 39 2022 2 8
## 40 2022 2 9
## 41 2022 2 10
## 42 2022 2 11
## 43 2022 2 12
## 44 2022 2 13
## 45 2022 2 14
## 46 2022 2 15
## 47 2022 2 16
## 48 2022 2 17
## 49 2022 2 18
## 50 2022 2 19
## 51 2022 2 20
## 52 2022 2 21
## 53 2022 2 22
## 54 2022 2 23
## 55 2022 2 24
## 56 2022 2 25
## 57 2022 2 26
## 58 2022 2 27
## 59 2022 2 28
## 60 2022 3 1
## 61 2022 3 2
## 62 2022 3 3
## 63 2022 3 4
## 64 2022 3 5
## 65 2022 3 6
## 66 2022 3 7
## 67 2022 3 8
## 68 2022 3 9
## 69 2022 3 10
## 70 2022 3 11
## 71 2022 3 12
## 72 2022 3 13
## 73 2022 3 14
## 74 2022 3 15
## 75 2022 3 16
## 76 2022 3 17
## 77 2022 3 18
## 78 2022 3 19
## 79 2022 3 20
## 80 2022 3 21
## 81 2022 3 22
## 82 2022 3 23
## 83 2022 3 24
## 84 2022 3 25
## 85 2022 3 26
## 86 2022 3 27
## 87 2022 3 28
## 88 2022 3 29
## 89 2022 3 30
## 90 2022 3 31
## 91 2022 4 1
## 92 2022 4 2
## 93 2022 4 3
## 94 2022 4 4
## 95 2022 4 5
## 96 2022 4 6
## 97 2022 4 7
## 98 2022 4 8
## 99 2022 4 9
## 100 2022 4 10
## 101 2022 4 11
## 102 2022 4 12
## 103 2022 4 13
## 104 2022 4 14
## 105 2022 4 15
## 106 2022 4 16
## 107 2022 4 17
## 108 2022 4 18
## 109 2022 4 19
## 110 2022 4 20
## 111 2022 4 21
## 112 2022 4 22
## 113 2022 4 23
## 114 2022 4 24
## 115 2022 4 25
## 116 2022 4 26
## 117 2022 4 27
## 118 2022 4 28
## 119 2022 4 29
## 120 2022 4 30
## 121 2022 5 1
## 122 2022 5 2
## 123 2022 5 3
## 124 2022 5 4
## 125 2022 5 5
## 126 2022 5 6
## 127 2022 5 7
## 128 2022 5 8
## 129 2022 5 9
## 130 2022 5 10
## 131 2022 5 11
## 132 2022 5 12
## 133 2022 5 13
## 134 2022 5 14
## 135 2022 5 15
## 136 2022 5 16
## 137 2022 5 17
## 138 2022 5 18
## 139 2022 5 19
## 140 2022 5 20
## 141 2022 5 21
## 142 2022 5 22
## 143 2022 5 23
## 144 2022 5 24
## 145 2022 5 25
## 146 2022 5 26
## 147 2022 5 27
## 148 2022 5 28
## 149 2022 5 29
## 150 2022 5 30
## 151 2022 5 31
## 152 2022 6 1
## 153 2022 6 2
## 154 2022 6 3
## 155 2022 6 4
## 156 2022 6 5
## 157 2022 6 6
## 158 2022 6 7
## 159 2022 6 8
## 160 2022 6 9
## 161 2022 6 10
## 162 2022 6 11
## 163 2022 6 12
## 164 2022 6 13
## 165 2022 6 14
## 166 2022 6 15
## 167 2022 6 16
## 168 2022 6 17
## 169 2022 6 18
## 170 2022 6 19
## 171 2022 6 20
## 172 2022 6 21
## 173 2022 6 22
## 174 2022 6 23
## 175 2022 6 24
## 176 2022 6 25
## 177 2022 6 26
## 178 2022 6 27
## 179 2022 6 28
## 180 2022 6 29
## 181 2022 6 30
## 182 2022 7 1
## 183 2022 7 2
## 184 2022 7 3
## 185 2022 7 4
## 186 2022 7 5
## 187 2022 7 6
## 188 2022 7 7
## 189 2022 7 8
## 190 2022 7 9
## 191 2022 7 10
## 192 2022 7 11
## 193 2022 7 12
## 194 2022 7 13
## 195 2022 7 14
## 196 2022 7 15
## 197 2022 7 16
## 198 2022 7 17
## 199 2022 7 18
## 200 2022 7 19
## 201 2022 7 20
## 202 2022 7 21
## 203 2022 7 22
## 204 2022 7 23
## 205 2022 7 24
## 206 2022 7 25
## 207 2022 7 26
## 208 2022 7 27
## 209 2022 7 28
## 210 2022 7 29
## 211 2022 7 30
## 212 2022 7 31
## 213 2022 8 1
## 214 2022 8 2
## 215 2022 8 3
## 216 2022 8 4
## 217 2022 8 5
## 218 2022 8 6
## 219 2022 8 7
## 220 2022 8 8
## 221 2022 8 9
## 222 2022 8 10
## 223 2022 8 11
## 224 2022 8 12
## 225 2022 8 13
## 226 2022 8 14
## 227 2022 8 15
## 228 2022 8 16
## 229 2022 8 17
## 230 2022 8 18
## 231 2022 8 19
## 232 2022 8 20
## 233 2022 8 21
## 234 2022 8 22
## 235 2022 8 23
## 236 2022 8 24
## 237 2022 8 25
## 238 2022 8 26
## 239 2022 8 27
## 240 2022 8 28
## 241 2022 8 29
## 242 2022 8 30
## 243 2022 8 31
## 244 2022 9 1
## 245 2022 9 2
## 246 2022 9 3
## 247 2022 9 4
## 248 2022 9 5
## 249 2022 9 6
## 250 2022 9 7
## 251 2022 9 8
## 252 2022 9 9
## 253 2022 9 10
## 254 2022 9 11
## 255 2022 9 12
## 256 2022 9 13
## 257 2022 9 14
## 258 2022 9 15
## 259 2022 9 16
## 260 2022 9 17
## 261 2022 9 18
## 262 2022 9 19
## 263 2022 9 20
## 264 2022 9 21
## 265 2022 9 22
## 266 2022 9 23
## 267 2022 9 24
## 268 2022 9 25
## 269 2022 9 26
## 270 2022 9 27
## 271 2022 9 28
## 272 2022 9 29
## 273 2022 9 30
## 274 2022 10 1
## 275 2022 10 2
## 276 2022 10 3
## 277 2022 10 4
## 278 2022 10 5
## 279 2022 10 6
## 280 2022 10 7
## 281 2022 10 8
## 282 2022 10 9
## 283 2022 10 10
## 284 2022 10 11
## 285 2022 10 12
## 286 2022 10 13
## 287 2022 10 14
## 288 2022 10 15
## 289 2022 10 16
## 290 2022 10 17
## 291 2022 10 18
## 292 2022 10 19
## 293 2022 10 20
## 294 2022 10 21
## 295 2022 10 22
## 296 2022 10 23
## 297 2022 10 24
## 298 2022 10 25
## 299 2022 10 26
## 300 2022 10 27
## 301 2022 10 28
## 302 2022 10 29
## 303 2022 10 30
## 304 2022 10 31
## 305 2022 11 1
## 306 2022 11 2
## 307 2022 11 3
## 308 2022 11 4
## 309 2022 11 5
## 310 2022 11 6
## 311 2022 11 7
## 312 2022 11 8
## 313 2022 11 9
## 314 2022 11 10
## 315 2022 11 11
## 316 2022 11 12
## 317 2022 11 13
## 318 2022 11 14
## 319 2022 11 15
## 320 2022 11 16
## 321 2022 11 17
## 322 2022 11 18
## 323 2022 11 19
## 324 2022 11 20
## 325 2022 11 21
## 326 2022 11 22
## 327 2022 11 23
## 328 2022 11 24
## 329 2022 11 25
## 330 2022 11 26
## 331 2022 11 27
## 332 2022 11 28
## 333 2022 11 29
## 334 2022 11 30
## 335 2022 12 1
## 336 2022 12 2
## 337 2022 12 3
## 338 2022 12 4
## 339 2022 12 5
## 340 2022 12 6
## 341 2022 12 7
## 342 2022 12 8
## 343 2022 12 9
## 344 2022 12 10
## 345 2022 12 11
## 346 2022 12 12
## 347 2022 12 13
## 348 2022 12 14
## 349 2022 12 15
## 350 2022 12 16
## 351 2022 12 17
## 352 2022 12 18
## 353 2022 12 19
## 354 2022 12 20
## 355 2022 12 21
## 356 2022 12 22
## 357 2022 12 23
## 358 2022 12 24
## 359 2022 12 25
## 360 2022 12 26
## 361 2022 12 27
## 362 2022 12 28
## 363 2022 12 29
## 364 2022 12 30
## 365 2022 12 31
unite
# add leading zeros before merge dates columns
sptdatedf %>% mutate(month = str_pad(width = 2, side = "left", string = month, pad = 0),
date = str_pad(width = 2, string = date, side = "left", pad = 0)) %>%
unite(col = "full dates", c(year,month,date), sep = "-", remove = F)
## full dates year month date
## 1 2022-01-01 2022 01 01
## 2 2022-01-02 2022 01 02
## 3 2022-01-03 2022 01 03
## 4 2022-01-04 2022 01 04
## 5 2022-01-05 2022 01 05
## 6 2022-01-06 2022 01 06
## 7 2022-01-07 2022 01 07
## 8 2022-01-08 2022 01 08
## 9 2022-01-09 2022 01 09
## 10 2022-01-10 2022 01 10
## 11 2022-01-11 2022 01 11
## 12 2022-01-12 2022 01 12
## 13 2022-01-13 2022 01 13
## 14 2022-01-14 2022 01 14
## 15 2022-01-15 2022 01 15
## 16 2022-01-16 2022 01 16
## 17 2022-01-17 2022 01 17
## 18 2022-01-18 2022 01 18
## 19 2022-01-19 2022 01 19
## 20 2022-01-20 2022 01 20
## 21 2022-01-21 2022 01 21
## 22 2022-01-22 2022 01 22
## 23 2022-01-23 2022 01 23
## 24 2022-01-24 2022 01 24
## 25 2022-01-25 2022 01 25
## 26 2022-01-26 2022 01 26
## 27 2022-01-27 2022 01 27
## 28 2022-01-28 2022 01 28
## 29 2022-01-29 2022 01 29
## 30 2022-01-30 2022 01 30
## 31 2022-01-31 2022 01 31
## 32 2022-02-01 2022 02 01
## 33 2022-02-02 2022 02 02
## 34 2022-02-03 2022 02 03
## 35 2022-02-04 2022 02 04
## 36 2022-02-05 2022 02 05
## 37 2022-02-06 2022 02 06
## 38 2022-02-07 2022 02 07
## 39 2022-02-08 2022 02 08
## 40 2022-02-09 2022 02 09
## 41 2022-02-10 2022 02 10
## 42 2022-02-11 2022 02 11
## 43 2022-02-12 2022 02 12
## 44 2022-02-13 2022 02 13
## 45 2022-02-14 2022 02 14
## 46 2022-02-15 2022 02 15
## 47 2022-02-16 2022 02 16
## 48 2022-02-17 2022 02 17
## 49 2022-02-18 2022 02 18
## 50 2022-02-19 2022 02 19
## 51 2022-02-20 2022 02 20
## 52 2022-02-21 2022 02 21
## 53 2022-02-22 2022 02 22
## 54 2022-02-23 2022 02 23
## 55 2022-02-24 2022 02 24
## 56 2022-02-25 2022 02 25
## 57 2022-02-26 2022 02 26
## 58 2022-02-27 2022 02 27
## 59 2022-02-28 2022 02 28
## 60 2022-03-01 2022 03 01
## 61 2022-03-02 2022 03 02
## 62 2022-03-03 2022 03 03
## 63 2022-03-04 2022 03 04
## 64 2022-03-05 2022 03 05
## 65 2022-03-06 2022 03 06
## 66 2022-03-07 2022 03 07
## 67 2022-03-08 2022 03 08
## 68 2022-03-09 2022 03 09
## 69 2022-03-10 2022 03 10
## 70 2022-03-11 2022 03 11
## 71 2022-03-12 2022 03 12
## 72 2022-03-13 2022 03 13
## 73 2022-03-14 2022 03 14
## 74 2022-03-15 2022 03 15
## 75 2022-03-16 2022 03 16
## 76 2022-03-17 2022 03 17
## 77 2022-03-18 2022 03 18
## 78 2022-03-19 2022 03 19
## 79 2022-03-20 2022 03 20
## 80 2022-03-21 2022 03 21
## 81 2022-03-22 2022 03 22
## 82 2022-03-23 2022 03 23
## 83 2022-03-24 2022 03 24
## 84 2022-03-25 2022 03 25
## 85 2022-03-26 2022 03 26
## 86 2022-03-27 2022 03 27
## 87 2022-03-28 2022 03 28
## 88 2022-03-29 2022 03 29
## 89 2022-03-30 2022 03 30
## 90 2022-03-31 2022 03 31
## 91 2022-04-01 2022 04 01
## 92 2022-04-02 2022 04 02
## 93 2022-04-03 2022 04 03
## 94 2022-04-04 2022 04 04
## 95 2022-04-05 2022 04 05
## 96 2022-04-06 2022 04 06
## 97 2022-04-07 2022 04 07
## 98 2022-04-08 2022 04 08
## 99 2022-04-09 2022 04 09
## 100 2022-04-10 2022 04 10
## 101 2022-04-11 2022 04 11
## 102 2022-04-12 2022 04 12
## 103 2022-04-13 2022 04 13
## 104 2022-04-14 2022 04 14
## 105 2022-04-15 2022 04 15
## 106 2022-04-16 2022 04 16
## 107 2022-04-17 2022 04 17
## 108 2022-04-18 2022 04 18
## 109 2022-04-19 2022 04 19
## 110 2022-04-20 2022 04 20
## 111 2022-04-21 2022 04 21
## 112 2022-04-22 2022 04 22
## 113 2022-04-23 2022 04 23
## 114 2022-04-24 2022 04 24
## 115 2022-04-25 2022 04 25
## 116 2022-04-26 2022 04 26
## 117 2022-04-27 2022 04 27
## 118 2022-04-28 2022 04 28
## 119 2022-04-29 2022 04 29
## 120 2022-04-30 2022 04 30
## 121 2022-05-01 2022 05 01
## 122 2022-05-02 2022 05 02
## 123 2022-05-03 2022 05 03
## 124 2022-05-04 2022 05 04
## 125 2022-05-05 2022 05 05
## 126 2022-05-06 2022 05 06
## 127 2022-05-07 2022 05 07
## 128 2022-05-08 2022 05 08
## 129 2022-05-09 2022 05 09
## 130 2022-05-10 2022 05 10
## 131 2022-05-11 2022 05 11
## 132 2022-05-12 2022 05 12
## 133 2022-05-13 2022 05 13
## 134 2022-05-14 2022 05 14
## 135 2022-05-15 2022 05 15
## 136 2022-05-16 2022 05 16
## 137 2022-05-17 2022 05 17
## 138 2022-05-18 2022 05 18
## 139 2022-05-19 2022 05 19
## 140 2022-05-20 2022 05 20
## 141 2022-05-21 2022 05 21
## 142 2022-05-22 2022 05 22
## 143 2022-05-23 2022 05 23
## 144 2022-05-24 2022 05 24
## 145 2022-05-25 2022 05 25
## 146 2022-05-26 2022 05 26
## 147 2022-05-27 2022 05 27
## 148 2022-05-28 2022 05 28
## 149 2022-05-29 2022 05 29
## 150 2022-05-30 2022 05 30
## 151 2022-05-31 2022 05 31
## 152 2022-06-01 2022 06 01
## 153 2022-06-02 2022 06 02
## 154 2022-06-03 2022 06 03
## 155 2022-06-04 2022 06 04
## 156 2022-06-05 2022 06 05
## 157 2022-06-06 2022 06 06
## 158 2022-06-07 2022 06 07
## 159 2022-06-08 2022 06 08
## 160 2022-06-09 2022 06 09
## 161 2022-06-10 2022 06 10
## 162 2022-06-11 2022 06 11
## 163 2022-06-12 2022 06 12
## 164 2022-06-13 2022 06 13
## 165 2022-06-14 2022 06 14
## 166 2022-06-15 2022 06 15
## 167 2022-06-16 2022 06 16
## 168 2022-06-17 2022 06 17
## 169 2022-06-18 2022 06 18
## 170 2022-06-19 2022 06 19
## 171 2022-06-20 2022 06 20
## 172 2022-06-21 2022 06 21
## 173 2022-06-22 2022 06 22
## 174 2022-06-23 2022 06 23
## 175 2022-06-24 2022 06 24
## 176 2022-06-25 2022 06 25
## 177 2022-06-26 2022 06 26
## 178 2022-06-27 2022 06 27
## 179 2022-06-28 2022 06 28
## 180 2022-06-29 2022 06 29
## 181 2022-06-30 2022 06 30
## 182 2022-07-01 2022 07 01
## 183 2022-07-02 2022 07 02
## 184 2022-07-03 2022 07 03
## 185 2022-07-04 2022 07 04
## 186 2022-07-05 2022 07 05
## 187 2022-07-06 2022 07 06
## 188 2022-07-07 2022 07 07
## 189 2022-07-08 2022 07 08
## 190 2022-07-09 2022 07 09
## 191 2022-07-10 2022 07 10
## 192 2022-07-11 2022 07 11
## 193 2022-07-12 2022 07 12
## 194 2022-07-13 2022 07 13
## 195 2022-07-14 2022 07 14
## 196 2022-07-15 2022 07 15
## 197 2022-07-16 2022 07 16
## 198 2022-07-17 2022 07 17
## 199 2022-07-18 2022 07 18
## 200 2022-07-19 2022 07 19
## 201 2022-07-20 2022 07 20
## 202 2022-07-21 2022 07 21
## 203 2022-07-22 2022 07 22
## 204 2022-07-23 2022 07 23
## 205 2022-07-24 2022 07 24
## 206 2022-07-25 2022 07 25
## 207 2022-07-26 2022 07 26
## 208 2022-07-27 2022 07 27
## 209 2022-07-28 2022 07 28
## 210 2022-07-29 2022 07 29
## 211 2022-07-30 2022 07 30
## 212 2022-07-31 2022 07 31
## 213 2022-08-01 2022 08 01
## 214 2022-08-02 2022 08 02
## 215 2022-08-03 2022 08 03
## 216 2022-08-04 2022 08 04
## 217 2022-08-05 2022 08 05
## 218 2022-08-06 2022 08 06
## 219 2022-08-07 2022 08 07
## 220 2022-08-08 2022 08 08
## 221 2022-08-09 2022 08 09
## 222 2022-08-10 2022 08 10
## 223 2022-08-11 2022 08 11
## 224 2022-08-12 2022 08 12
## 225 2022-08-13 2022 08 13
## 226 2022-08-14 2022 08 14
## 227 2022-08-15 2022 08 15
## 228 2022-08-16 2022 08 16
## 229 2022-08-17 2022 08 17
## 230 2022-08-18 2022 08 18
## 231 2022-08-19 2022 08 19
## 232 2022-08-20 2022 08 20
## 233 2022-08-21 2022 08 21
## 234 2022-08-22 2022 08 22
## 235 2022-08-23 2022 08 23
## 236 2022-08-24 2022 08 24
## 237 2022-08-25 2022 08 25
## 238 2022-08-26 2022 08 26
## 239 2022-08-27 2022 08 27
## 240 2022-08-28 2022 08 28
## 241 2022-08-29 2022 08 29
## 242 2022-08-30 2022 08 30
## 243 2022-08-31 2022 08 31
## 244 2022-09-01 2022 09 01
## 245 2022-09-02 2022 09 02
## 246 2022-09-03 2022 09 03
## 247 2022-09-04 2022 09 04
## 248 2022-09-05 2022 09 05
## 249 2022-09-06 2022 09 06
## 250 2022-09-07 2022 09 07
## 251 2022-09-08 2022 09 08
## 252 2022-09-09 2022 09 09
## 253 2022-09-10 2022 09 10
## 254 2022-09-11 2022 09 11
## 255 2022-09-12 2022 09 12
## 256 2022-09-13 2022 09 13
## 257 2022-09-14 2022 09 14
## 258 2022-09-15 2022 09 15
## 259 2022-09-16 2022 09 16
## 260 2022-09-17 2022 09 17
## 261 2022-09-18 2022 09 18
## 262 2022-09-19 2022 09 19
## 263 2022-09-20 2022 09 20
## 264 2022-09-21 2022 09 21
## 265 2022-09-22 2022 09 22
## 266 2022-09-23 2022 09 23
## 267 2022-09-24 2022 09 24
## 268 2022-09-25 2022 09 25
## 269 2022-09-26 2022 09 26
## 270 2022-09-27 2022 09 27
## 271 2022-09-28 2022 09 28
## 272 2022-09-29 2022 09 29
## 273 2022-09-30 2022 09 30
## 274 2022-10-01 2022 10 01
## 275 2022-10-02 2022 10 02
## 276 2022-10-03 2022 10 03
## 277 2022-10-04 2022 10 04
## 278 2022-10-05 2022 10 05
## 279 2022-10-06 2022 10 06
## 280 2022-10-07 2022 10 07
## 281 2022-10-08 2022 10 08
## 282 2022-10-09 2022 10 09
## 283 2022-10-10 2022 10 10
## 284 2022-10-11 2022 10 11
## 285 2022-10-12 2022 10 12
## 286 2022-10-13 2022 10 13
## 287 2022-10-14 2022 10 14
## 288 2022-10-15 2022 10 15
## 289 2022-10-16 2022 10 16
## 290 2022-10-17 2022 10 17
## 291 2022-10-18 2022 10 18
## 292 2022-10-19 2022 10 19
## 293 2022-10-20 2022 10 20
## 294 2022-10-21 2022 10 21
## 295 2022-10-22 2022 10 22
## 296 2022-10-23 2022 10 23
## 297 2022-10-24 2022 10 24
## 298 2022-10-25 2022 10 25
## 299 2022-10-26 2022 10 26
## 300 2022-10-27 2022 10 27
## 301 2022-10-28 2022 10 28
## 302 2022-10-29 2022 10 29
## 303 2022-10-30 2022 10 30
## 304 2022-10-31 2022 10 31
## 305 2022-11-01 2022 11 01
## 306 2022-11-02 2022 11 02
## 307 2022-11-03 2022 11 03
## 308 2022-11-04 2022 11 04
## 309 2022-11-05 2022 11 05
## 310 2022-11-06 2022 11 06
## 311 2022-11-07 2022 11 07
## 312 2022-11-08 2022 11 08
## 313 2022-11-09 2022 11 09
## 314 2022-11-10 2022 11 10
## 315 2022-11-11 2022 11 11
## 316 2022-11-12 2022 11 12
## 317 2022-11-13 2022 11 13
## 318 2022-11-14 2022 11 14
## 319 2022-11-15 2022 11 15
## 320 2022-11-16 2022 11 16
## 321 2022-11-17 2022 11 17
## 322 2022-11-18 2022 11 18
## 323 2022-11-19 2022 11 19
## 324 2022-11-20 2022 11 20
## 325 2022-11-21 2022 11 21
## 326 2022-11-22 2022 11 22
## 327 2022-11-23 2022 11 23
## 328 2022-11-24 2022 11 24
## 329 2022-11-25 2022 11 25
## 330 2022-11-26 2022 11 26
## 331 2022-11-27 2022 11 27
## 332 2022-11-28 2022 11 28
## 333 2022-11-29 2022 11 29
## 334 2022-11-30 2022 11 30
## 335 2022-12-01 2022 12 01
## 336 2022-12-02 2022 12 02
## 337 2022-12-03 2022 12 03
## 338 2022-12-04 2022 12 04
## 339 2022-12-05 2022 12 05
## 340 2022-12-06 2022 12 06
## 341 2022-12-07 2022 12 07
## 342 2022-12-08 2022 12 08
## 343 2022-12-09 2022 12 09
## 344 2022-12-10 2022 12 10
## 345 2022-12-11 2022 12 11
## 346 2022-12-12 2022 12 12
## 347 2022-12-13 2022 12 13
## 348 2022-12-14 2022 12 14
## 349 2022-12-15 2022 12 15
## 350 2022-12-16 2022 12 16
## 351 2022-12-17 2022 12 17
## 352 2022-12-18 2022 12 18
## 353 2022-12-19 2022 12 19
## 354 2022-12-20 2022 12 20
## 355 2022-12-21 2022 12 21
## 356 2022-12-22 2022 12 22
## 357 2022-12-23 2022 12 23
## 358 2022-12-24 2022 12 24
## 359 2022-12-25 2022 12 25
## 360 2022-12-26 2022 12 26
## 361 2022-12-27 2022 12 27
## 362 2022-12-28 2022 12 28
## 363 2022-12-29 2022 12 29
## 364 2022-12-30 2022 12 30
## 365 2022-12-31 2022 12 31
# others way to adding leading zeros with mutate_at
sptdatedf %>% mutate_at(.vars = c("month", "date"),
.funs = str_pad, width = 2, side = "left", pad = 0) %>%
unite(col = "full date", c("year", "month", "date"), sep = "-", remove = F)
## full date year month date
## 1 2022-01-01 2022 01 01
## 2 2022-01-02 2022 01 02
## 3 2022-01-03 2022 01 03
## 4 2022-01-04 2022 01 04
## 5 2022-01-05 2022 01 05
## 6 2022-01-06 2022 01 06
## 7 2022-01-07 2022 01 07
## 8 2022-01-08 2022 01 08
## 9 2022-01-09 2022 01 09
## 10 2022-01-10 2022 01 10
## 11 2022-01-11 2022 01 11
## 12 2022-01-12 2022 01 12
## 13 2022-01-13 2022 01 13
## 14 2022-01-14 2022 01 14
## 15 2022-01-15 2022 01 15
## 16 2022-01-16 2022 01 16
## 17 2022-01-17 2022 01 17
## 18 2022-01-18 2022 01 18
## 19 2022-01-19 2022 01 19
## 20 2022-01-20 2022 01 20
## 21 2022-01-21 2022 01 21
## 22 2022-01-22 2022 01 22
## 23 2022-01-23 2022 01 23
## 24 2022-01-24 2022 01 24
## 25 2022-01-25 2022 01 25
## 26 2022-01-26 2022 01 26
## 27 2022-01-27 2022 01 27
## 28 2022-01-28 2022 01 28
## 29 2022-01-29 2022 01 29
## 30 2022-01-30 2022 01 30
## 31 2022-01-31 2022 01 31
## 32 2022-02-01 2022 02 01
## 33 2022-02-02 2022 02 02
## 34 2022-02-03 2022 02 03
## 35 2022-02-04 2022 02 04
## 36 2022-02-05 2022 02 05
## 37 2022-02-06 2022 02 06
## 38 2022-02-07 2022 02 07
## 39 2022-02-08 2022 02 08
## 40 2022-02-09 2022 02 09
## 41 2022-02-10 2022 02 10
## 42 2022-02-11 2022 02 11
## 43 2022-02-12 2022 02 12
## 44 2022-02-13 2022 02 13
## 45 2022-02-14 2022 02 14
## 46 2022-02-15 2022 02 15
## 47 2022-02-16 2022 02 16
## 48 2022-02-17 2022 02 17
## 49 2022-02-18 2022 02 18
## 50 2022-02-19 2022 02 19
## 51 2022-02-20 2022 02 20
## 52 2022-02-21 2022 02 21
## 53 2022-02-22 2022 02 22
## 54 2022-02-23 2022 02 23
## 55 2022-02-24 2022 02 24
## 56 2022-02-25 2022 02 25
## 57 2022-02-26 2022 02 26
## 58 2022-02-27 2022 02 27
## 59 2022-02-28 2022 02 28
## 60 2022-03-01 2022 03 01
## 61 2022-03-02 2022 03 02
## 62 2022-03-03 2022 03 03
## 63 2022-03-04 2022 03 04
## 64 2022-03-05 2022 03 05
## 65 2022-03-06 2022 03 06
## 66 2022-03-07 2022 03 07
## 67 2022-03-08 2022 03 08
## 68 2022-03-09 2022 03 09
## 69 2022-03-10 2022 03 10
## 70 2022-03-11 2022 03 11
## 71 2022-03-12 2022 03 12
## 72 2022-03-13 2022 03 13
## 73 2022-03-14 2022 03 14
## 74 2022-03-15 2022 03 15
## 75 2022-03-16 2022 03 16
## 76 2022-03-17 2022 03 17
## 77 2022-03-18 2022 03 18
## 78 2022-03-19 2022 03 19
## 79 2022-03-20 2022 03 20
## 80 2022-03-21 2022 03 21
## 81 2022-03-22 2022 03 22
## 82 2022-03-23 2022 03 23
## 83 2022-03-24 2022 03 24
## 84 2022-03-25 2022 03 25
## 85 2022-03-26 2022 03 26
## 86 2022-03-27 2022 03 27
## 87 2022-03-28 2022 03 28
## 88 2022-03-29 2022 03 29
## 89 2022-03-30 2022 03 30
## 90 2022-03-31 2022 03 31
## 91 2022-04-01 2022 04 01
## 92 2022-04-02 2022 04 02
## 93 2022-04-03 2022 04 03
## 94 2022-04-04 2022 04 04
## 95 2022-04-05 2022 04 05
## 96 2022-04-06 2022 04 06
## 97 2022-04-07 2022 04 07
## 98 2022-04-08 2022 04 08
## 99 2022-04-09 2022 04 09
## 100 2022-04-10 2022 04 10
## 101 2022-04-11 2022 04 11
## 102 2022-04-12 2022 04 12
## 103 2022-04-13 2022 04 13
## 104 2022-04-14 2022 04 14
## 105 2022-04-15 2022 04 15
## 106 2022-04-16 2022 04 16
## 107 2022-04-17 2022 04 17
## 108 2022-04-18 2022 04 18
## 109 2022-04-19 2022 04 19
## 110 2022-04-20 2022 04 20
## 111 2022-04-21 2022 04 21
## 112 2022-04-22 2022 04 22
## 113 2022-04-23 2022 04 23
## 114 2022-04-24 2022 04 24
## 115 2022-04-25 2022 04 25
## 116 2022-04-26 2022 04 26
## 117 2022-04-27 2022 04 27
## 118 2022-04-28 2022 04 28
## 119 2022-04-29 2022 04 29
## 120 2022-04-30 2022 04 30
## 121 2022-05-01 2022 05 01
## 122 2022-05-02 2022 05 02
## 123 2022-05-03 2022 05 03
## 124 2022-05-04 2022 05 04
## 125 2022-05-05 2022 05 05
## 126 2022-05-06 2022 05 06
## 127 2022-05-07 2022 05 07
## 128 2022-05-08 2022 05 08
## 129 2022-05-09 2022 05 09
## 130 2022-05-10 2022 05 10
## 131 2022-05-11 2022 05 11
## 132 2022-05-12 2022 05 12
## 133 2022-05-13 2022 05 13
## 134 2022-05-14 2022 05 14
## 135 2022-05-15 2022 05 15
## 136 2022-05-16 2022 05 16
## 137 2022-05-17 2022 05 17
## 138 2022-05-18 2022 05 18
## 139 2022-05-19 2022 05 19
## 140 2022-05-20 2022 05 20
## 141 2022-05-21 2022 05 21
## 142 2022-05-22 2022 05 22
## 143 2022-05-23 2022 05 23
## 144 2022-05-24 2022 05 24
## 145 2022-05-25 2022 05 25
## 146 2022-05-26 2022 05 26
## 147 2022-05-27 2022 05 27
## 148 2022-05-28 2022 05 28
## 149 2022-05-29 2022 05 29
## 150 2022-05-30 2022 05 30
## 151 2022-05-31 2022 05 31
## 152 2022-06-01 2022 06 01
## 153 2022-06-02 2022 06 02
## 154 2022-06-03 2022 06 03
## 155 2022-06-04 2022 06 04
## 156 2022-06-05 2022 06 05
## 157 2022-06-06 2022 06 06
## 158 2022-06-07 2022 06 07
## 159 2022-06-08 2022 06 08
## 160 2022-06-09 2022 06 09
## 161 2022-06-10 2022 06 10
## 162 2022-06-11 2022 06 11
## 163 2022-06-12 2022 06 12
## 164 2022-06-13 2022 06 13
## 165 2022-06-14 2022 06 14
## 166 2022-06-15 2022 06 15
## 167 2022-06-16 2022 06 16
## 168 2022-06-17 2022 06 17
## 169 2022-06-18 2022 06 18
## 170 2022-06-19 2022 06 19
## 171 2022-06-20 2022 06 20
## 172 2022-06-21 2022 06 21
## 173 2022-06-22 2022 06 22
## 174 2022-06-23 2022 06 23
## 175 2022-06-24 2022 06 24
## 176 2022-06-25 2022 06 25
## 177 2022-06-26 2022 06 26
## 178 2022-06-27 2022 06 27
## 179 2022-06-28 2022 06 28
## 180 2022-06-29 2022 06 29
## 181 2022-06-30 2022 06 30
## 182 2022-07-01 2022 07 01
## 183 2022-07-02 2022 07 02
## 184 2022-07-03 2022 07 03
## 185 2022-07-04 2022 07 04
## 186 2022-07-05 2022 07 05
## 187 2022-07-06 2022 07 06
## 188 2022-07-07 2022 07 07
## 189 2022-07-08 2022 07 08
## 190 2022-07-09 2022 07 09
## 191 2022-07-10 2022 07 10
## 192 2022-07-11 2022 07 11
## 193 2022-07-12 2022 07 12
## 194 2022-07-13 2022 07 13
## 195 2022-07-14 2022 07 14
## 196 2022-07-15 2022 07 15
## 197 2022-07-16 2022 07 16
## 198 2022-07-17 2022 07 17
## 199 2022-07-18 2022 07 18
## 200 2022-07-19 2022 07 19
## 201 2022-07-20 2022 07 20
## 202 2022-07-21 2022 07 21
## 203 2022-07-22 2022 07 22
## 204 2022-07-23 2022 07 23
## 205 2022-07-24 2022 07 24
## 206 2022-07-25 2022 07 25
## 207 2022-07-26 2022 07 26
## 208 2022-07-27 2022 07 27
## 209 2022-07-28 2022 07 28
## 210 2022-07-29 2022 07 29
## 211 2022-07-30 2022 07 30
## 212 2022-07-31 2022 07 31
## 213 2022-08-01 2022 08 01
## 214 2022-08-02 2022 08 02
## 215 2022-08-03 2022 08 03
## 216 2022-08-04 2022 08 04
## 217 2022-08-05 2022 08 05
## 218 2022-08-06 2022 08 06
## 219 2022-08-07 2022 08 07
## 220 2022-08-08 2022 08 08
## 221 2022-08-09 2022 08 09
## 222 2022-08-10 2022 08 10
## 223 2022-08-11 2022 08 11
## 224 2022-08-12 2022 08 12
## 225 2022-08-13 2022 08 13
## 226 2022-08-14 2022 08 14
## 227 2022-08-15 2022 08 15
## 228 2022-08-16 2022 08 16
## 229 2022-08-17 2022 08 17
## 230 2022-08-18 2022 08 18
## 231 2022-08-19 2022 08 19
## 232 2022-08-20 2022 08 20
## 233 2022-08-21 2022 08 21
## 234 2022-08-22 2022 08 22
## 235 2022-08-23 2022 08 23
## 236 2022-08-24 2022 08 24
## 237 2022-08-25 2022 08 25
## 238 2022-08-26 2022 08 26
## 239 2022-08-27 2022 08 27
## 240 2022-08-28 2022 08 28
## 241 2022-08-29 2022 08 29
## 242 2022-08-30 2022 08 30
## 243 2022-08-31 2022 08 31
## 244 2022-09-01 2022 09 01
## 245 2022-09-02 2022 09 02
## 246 2022-09-03 2022 09 03
## 247 2022-09-04 2022 09 04
## 248 2022-09-05 2022 09 05
## 249 2022-09-06 2022 09 06
## 250 2022-09-07 2022 09 07
## 251 2022-09-08 2022 09 08
## 252 2022-09-09 2022 09 09
## 253 2022-09-10 2022 09 10
## 254 2022-09-11 2022 09 11
## 255 2022-09-12 2022 09 12
## 256 2022-09-13 2022 09 13
## 257 2022-09-14 2022 09 14
## 258 2022-09-15 2022 09 15
## 259 2022-09-16 2022 09 16
## 260 2022-09-17 2022 09 17
## 261 2022-09-18 2022 09 18
## 262 2022-09-19 2022 09 19
## 263 2022-09-20 2022 09 20
## 264 2022-09-21 2022 09 21
## 265 2022-09-22 2022 09 22
## 266 2022-09-23 2022 09 23
## 267 2022-09-24 2022 09 24
## 268 2022-09-25 2022 09 25
## 269 2022-09-26 2022 09 26
## 270 2022-09-27 2022 09 27
## 271 2022-09-28 2022 09 28
## 272 2022-09-29 2022 09 29
## 273 2022-09-30 2022 09 30
## 274 2022-10-01 2022 10 01
## 275 2022-10-02 2022 10 02
## 276 2022-10-03 2022 10 03
## 277 2022-10-04 2022 10 04
## 278 2022-10-05 2022 10 05
## 279 2022-10-06 2022 10 06
## 280 2022-10-07 2022 10 07
## 281 2022-10-08 2022 10 08
## 282 2022-10-09 2022 10 09
## 283 2022-10-10 2022 10 10
## 284 2022-10-11 2022 10 11
## 285 2022-10-12 2022 10 12
## 286 2022-10-13 2022 10 13
## 287 2022-10-14 2022 10 14
## 288 2022-10-15 2022 10 15
## 289 2022-10-16 2022 10 16
## 290 2022-10-17 2022 10 17
## 291 2022-10-18 2022 10 18
## 292 2022-10-19 2022 10 19
## 293 2022-10-20 2022 10 20
## 294 2022-10-21 2022 10 21
## 295 2022-10-22 2022 10 22
## 296 2022-10-23 2022 10 23
## 297 2022-10-24 2022 10 24
## 298 2022-10-25 2022 10 25
## 299 2022-10-26 2022 10 26
## 300 2022-10-27 2022 10 27
## 301 2022-10-28 2022 10 28
## 302 2022-10-29 2022 10 29
## 303 2022-10-30 2022 10 30
## 304 2022-10-31 2022 10 31
## 305 2022-11-01 2022 11 01
## 306 2022-11-02 2022 11 02
## 307 2022-11-03 2022 11 03
## 308 2022-11-04 2022 11 04
## 309 2022-11-05 2022 11 05
## 310 2022-11-06 2022 11 06
## 311 2022-11-07 2022 11 07
## 312 2022-11-08 2022 11 08
## 313 2022-11-09 2022 11 09
## 314 2022-11-10 2022 11 10
## 315 2022-11-11 2022 11 11
## 316 2022-11-12 2022 11 12
## 317 2022-11-13 2022 11 13
## 318 2022-11-14 2022 11 14
## 319 2022-11-15 2022 11 15
## 320 2022-11-16 2022 11 16
## 321 2022-11-17 2022 11 17
## 322 2022-11-18 2022 11 18
## 323 2022-11-19 2022 11 19
## 324 2022-11-20 2022 11 20
## 325 2022-11-21 2022 11 21
## 326 2022-11-22 2022 11 22
## 327 2022-11-23 2022 11 23
## 328 2022-11-24 2022 11 24
## 329 2022-11-25 2022 11 25
## 330 2022-11-26 2022 11 26
## 331 2022-11-27 2022 11 27
## 332 2022-11-28 2022 11 28
## 333 2022-11-29 2022 11 29
## 334 2022-11-30 2022 11 30
## 335 2022-12-01 2022 12 01
## 336 2022-12-02 2022 12 02
## 337 2022-12-03 2022 12 03
## 338 2022-12-04 2022 12 04
## 339 2022-12-05 2022 12 05
## 340 2022-12-06 2022 12 06
## 341 2022-12-07 2022 12 07
## 342 2022-12-08 2022 12 08
## 343 2022-12-09 2022 12 09
## 344 2022-12-10 2022 12 10
## 345 2022-12-11 2022 12 11
## 346 2022-12-12 2022 12 12
## 347 2022-12-13 2022 12 13
## 348 2022-12-14 2022 12 14
## 349 2022-12-15 2022 12 15
## 350 2022-12-16 2022 12 16
## 351 2022-12-17 2022 12 17
## 352 2022-12-18 2022 12 18
## 353 2022-12-19 2022 12 19
## 354 2022-12-20 2022 12 20
## 355 2022-12-21 2022 12 21
## 356 2022-12-22 2022 12 22
## 357 2022-12-23 2022 12 23
## 358 2022-12-24 2022 12 24
## 359 2022-12-25 2022 12 25
## 360 2022-12-26 2022 12 26
## 361 2022-12-27 2022 12 27
## 362 2022-12-28 2022 12 28
## 363 2022-12-29 2022 12 29
## 364 2022-12-30 2022 12 30
## 365 2022-12-31 2022 12 31
extract column values as a vector
df %>% pull(hwy)
## [1] 29 29 31 30 26 26 27 26 25 28 27 25 25 25 25 24 25 23 20 15 20 17 17 26 23
## [26] 26 25 24 19 14 15 17 27 30 26 29 26 24 24 22 22 24 24 17 22 21 23 23 19 18
## [51] 17 17 19 19 12 17 15 17 17 12 17 16 18 15 16 12 17 17 16 12 15 16 17 15 17
## [76] 17 18 17 19 17 19 19 17 17 17 16 16 17 15 17 26 25 26 24 21 22 23 22 20 33
## [101] 32 32 29 32 34 36 36 29 26 27 30 31 26 26 28 26 29 28 27 24 24 24 22 19 20
## [126] 17 12 19 18 14 15 18 18 15 17 16 18 17 19 19 17 29 27 31 32 27 26 26 25 25
## [151] 17 17 20 18 26 26 27 28 25 25 24 27 25 26 23 26 26 26 26 25 27 25 27 20 20
## [176] 19 17 20 17 29 27 31 31 26 26 28 27 29 31 31 26 26 27 30 33 35 37 35 15 18
## [201] 20 20 22 17 19 18 20 29 26 29 29 24 44 29 26 29 29 29 29 23 24 44 41 29 26
## [226] 28 29 29 29 28 29 26 26 26
df %>% group_by(manufacturer, model) %>% mutate(`avg hwy` = mean(hwy))
## # A tibble: 234 × 15
## # Groups: manufacturer, model [38]
## manufac…¹ model displ year cyl trans drv cty hwy fl class avg m…²
## <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr> <dbl>
## 1 audi a4 1.8 1999 4 auto… f 18 29 p comp… 23.5
## 2 audi a4 1.8 1999 4 manu… f 21 29 p comp… 25
## 3 audi a4 2 2008 4 manu… f 20 31 p comp… 25.5
## 4 audi a4 2 2008 4 auto… f 21 30 p comp… 25.5
## 5 audi a4 2.8 1999 6 auto… f 16 26 p comp… 21
## 6 audi a4 2.8 1999 6 manu… f 18 26 p comp… 22
## 7 audi a4 3.1 2008 6 auto… f 18 27 p comp… 22.5
## 8 audi a4 q… 1.8 1999 4 manu… 4 18 26 p comp… 22
## 9 audi a4 q… 1.8 1999 4 auto… 4 16 25 p comp… 20.5
## 10 audi a4 q… 2 2008 4 manu… 4 20 28 p comp… 24
## # … with 224 more rows, 3 more variables: car <chr>, `cyl and trans` <chr>,
## # `avg hwy` <dbl>, and abbreviated variable names ¹manufacturer,
## # ²`avg mile per galon`
# the different between combined group_by-mutate and group_by-summarise is, the first combine
# also added summarised value to column for each corresponding row, while the other one only
# summarise grouped rows without adding it to a column.
df %>% group_by(manufacturer, model) %>% summarise(mean(hwy))
## `summarise()` has grouped output by 'manufacturer'. You can override using the
## `.groups` argument.
## # A tibble: 38 × 3
## # Groups: manufacturer [15]
## manufacturer model `mean(hwy)`
## <chr> <chr> <dbl>
## 1 audi a4 28.3
## 2 audi a4 quattro 25.8
## 3 audi a6 quattro 24
## 4 chevrolet c1500 suburban 2wd 17.8
## 5 chevrolet corvette 24.8
## 6 chevrolet k1500 tahoe 4wd 16.2
## 7 chevrolet malibu 27.6
## 8 dodge caravan 2wd 22.4
## 9 dodge dakota pickup 4wd 17
## 10 dodge durango 4wd 16
## # … with 28 more rows
just like case when in SQL
# library data.table is loaded to use like clause like in SQL
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## The following object is masked from 'package:purrr':
##
## transpose
# create a column to identify the transmission type
# whether automatic coded as AT or manual coded as MT
df1 <- df %>% mutate(`trans type` = case_when(
trans %like% "auto.*" ~ "AT",
trans %like% "manual.*" ~ "MT",
TRUE ~ "NA")
)
let’s play
df2 <- df1 %>% count(manufacturer, `trans type`) # long format
# convert to wide format
df2 %>% pivot_wider(names_from = `trans type`, values_from = n) # this result will similar
## # A tibble: 15 × 3
## manufacturer AT MT
## <chr> <int> <int>
## 1 audi 11 7
## 2 chevrolet 16 3
## 3 dodge 30 7
## 4 ford 17 8
## 5 honda 4 5
## 6 hyundai 7 7
## 7 jeep 8 NA
## 8 land rover 4 NA
## 9 lincoln 3 NA
## 10 mercury 4 NA
## 11 nissan 8 5
## 12 pontiac 5 NA
## 13 subaru 7 7
## 14 toyota 20 14
## 15 volkswagen 13 14
# to this syntax:
table(df1$manufacturer, df1$`trans type`)
##
## AT MT
## audi 11 7
## chevrolet 16 3
## dodge 30 7
## ford 17 8
## honda 4 5
## hyundai 7 7
## jeep 8 0
## land rover 4 0
## lincoln 3 0
## mercury 4 0
## nissan 8 5
## pontiac 5 0
## subaru 7 7
## toyota 20 14
## volkswagen 13 14
# but table function will cause 0 as a 0 numeric, whereas pivot function will cause 0 as null
rank a row
# adding id to car data
df %>% mutate(`car id` = row_number()) %>% print(n = 20)
## # A tibble: 234 × 15
## manufac…¹ model displ year cyl trans drv cty hwy fl class avg m…²
## <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr> <dbl>
## 1 audi a4 1.8 1999 4 auto… f 18 29 p comp… 23.5
## 2 audi a4 1.8 1999 4 manu… f 21 29 p comp… 25
## 3 audi a4 2 2008 4 manu… f 20 31 p comp… 25.5
## 4 audi a4 2 2008 4 auto… f 21 30 p comp… 25.5
## 5 audi a4 2.8 1999 6 auto… f 16 26 p comp… 21
## 6 audi a4 2.8 1999 6 manu… f 18 26 p comp… 22
## 7 audi a4 3.1 2008 6 auto… f 18 27 p comp… 22.5
## 8 audi a4 q… 1.8 1999 4 manu… 4 18 26 p comp… 22
## 9 audi a4 q… 1.8 1999 4 auto… 4 16 25 p comp… 20.5
## 10 audi a4 q… 2 2008 4 manu… 4 20 28 p comp… 24
## 11 audi a4 q… 2 2008 4 auto… 4 19 27 p comp… 23
## 12 audi a4 q… 2.8 1999 6 auto… 4 15 25 p comp… 20
## 13 audi a4 q… 2.8 1999 6 manu… 4 17 25 p comp… 21
## 14 audi a4 q… 3.1 2008 6 auto… 4 17 25 p comp… 21
## 15 audi a4 q… 3.1 2008 6 manu… 4 15 25 p comp… 20
## 16 audi a6 q… 2.8 1999 6 auto… 4 15 24 p mids… 19.5
## 17 audi a6 q… 3.1 2008 6 auto… 4 17 25 p mids… 21
## 18 audi a6 q… 4.2 2008 8 auto… 4 16 23 p mids… 19.5
## 19 chevrolet c150… 5.3 2008 8 auto… r 14 20 r suv 17
## 20 chevrolet c150… 5.3 2008 8 auto… r 11 15 e suv 13
## # … with 214 more rows, 3 more variables: car <chr>, `cyl and trans` <chr>,
## # `car id` <int>, and abbreviated variable names ¹manufacturer,
## # ²`avg mile per galon`
create a car id considering group
# it will create incremented id in each group
df %>% group_by(model) %>% mutate(`mod id` = row_number())
## # A tibble: 234 × 15
## # Groups: model [38]
## manufac…¹ model displ year cyl trans drv cty hwy fl class avg m…²
## <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr> <dbl>
## 1 audi a4 1.8 1999 4 auto… f 18 29 p comp… 23.5
## 2 audi a4 1.8 1999 4 manu… f 21 29 p comp… 25
## 3 audi a4 2 2008 4 manu… f 20 31 p comp… 25.5
## 4 audi a4 2 2008 4 auto… f 21 30 p comp… 25.5
## 5 audi a4 2.8 1999 6 auto… f 16 26 p comp… 21
## 6 audi a4 2.8 1999 6 manu… f 18 26 p comp… 22
## 7 audi a4 3.1 2008 6 auto… f 18 27 p comp… 22.5
## 8 audi a4 q… 1.8 1999 4 manu… 4 18 26 p comp… 22
## 9 audi a4 q… 1.8 1999 4 auto… 4 16 25 p comp… 20.5
## 10 audi a4 q… 2 2008 4 manu… 4 20 28 p comp… 24
## # … with 224 more rows, 3 more variables: car <chr>, `cyl and trans` <chr>,
## # `mod id` <int>, and abbreviated variable names ¹manufacturer,
## # ²`avg mile per galon`
mutate_all() apply function to all columns
mutate_if() apply function over columns that meet
condition
mutate_at() apply function to a specific columns
# install.packages("hflights")
library(hflights)
df <- hflights
class(df)
## [1] "data.frame"
glimpse(df)
## Rows: 227,496
## Columns: 21
## $ Year <int> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011…
## $ Month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ DayofMonth <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1…
## $ DayOfWeek <int> 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2…
## $ DepTime <int> 1400, 1401, 1352, 1403, 1405, 1359, 1359, 1355, 1443…
## $ ArrTime <int> 1500, 1501, 1502, 1513, 1507, 1503, 1509, 1454, 1554…
## $ UniqueCarrier <chr> "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA"…
## $ FlightNum <int> 428, 428, 428, 428, 428, 428, 428, 428, 428, 428, 42…
## $ TailNum <chr> "N576AA", "N557AA", "N541AA", "N403AA", "N492AA", "N…
## $ ActualElapsedTime <int> 60, 60, 70, 70, 62, 64, 70, 59, 71, 70, 70, 56, 63, …
## $ AirTime <int> 40, 45, 48, 39, 44, 45, 43, 40, 41, 45, 42, 41, 44, …
## $ ArrDelay <int> -10, -9, -8, 3, -3, -7, -1, -16, 44, 43, 29, 5, -9, …
## $ DepDelay <int> 0, 1, -8, 3, 5, -1, -1, -5, 43, 43, 29, 19, -2, -3, …
## $ Origin <chr> "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "IA…
## $ Dest <chr> "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "DF…
## $ Distance <int> 224, 224, 224, 224, 224, 224, 224, 224, 224, 224, 22…
## $ TaxiIn <int> 7, 6, 5, 9, 9, 6, 12, 7, 8, 6, 8, 4, 6, 5, 6, 12, 8,…
## $ TaxiOut <int> 13, 9, 17, 22, 9, 13, 15, 12, 22, 19, 20, 11, 13, 15…
## $ Cancelled <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ CancellationCode <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", …
## $ Diverted <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
there are more than 200 thousands records
# check globally
anyNA(df) # or using
## [1] TRUE
any(is.na(df))
## [1] TRUE
# find the total number of missing values in each column
colSums(is.na(df))
## Year Month DayofMonth DayOfWeek
## 0 0 0 0
## DepTime ArrTime UniqueCarrier FlightNum
## 2905 3066 0 0
## TailNum ActualElapsedTime AirTime ArrDelay
## 0 3622 3622 3622
## DepDelay Origin Dest Distance
## 2905 0 0 0
## TaxiIn TaxiOut Cancelled CancellationCode
## 3066 2947 0 0
## Diverted
## 0
# find only columns and its position that contains missing values
which(colSums(is.na(df))>0)
## DepTime ArrTime ActualElapsedTime AirTime
## 5 6 10 11
## ArrDelay DepDelay TaxiIn TaxiOut
## 12 13 17 18
# display the names of columns that contains missing values
names(which(colSums(is.na(df))>0))
## [1] "DepTime" "ArrTime" "ActualElapsedTime"
## [4] "AirTime" "ArrDelay" "DepDelay"
## [7] "TaxiIn" "TaxiOut"
another way to identify columns with null values
colnames(df)[colSums(is.na(df))>0]
## [1] "DepTime" "ArrTime" "ActualElapsedTime"
## [4] "AirTime" "ArrDelay" "DepDelay"
## [7] "TaxiIn" "TaxiOut"
or using apply
# apply(df, 2, anyNA)
# which(apply(df, 2, anyNA))
names(which(apply(df, 2, anyNA)))
## [1] "DepTime" "ArrTime" "ActualElapsedTime"
## [4] "AirTime" "ArrDelay" "DepDelay"
## [7] "TaxiIn" "TaxiOut"
but the third way take longer time to retrieve the results
place the list of columns that contains null values in a tibble
NullCol <- data.frame(col = names(which(colSums(is.na(df))>0)))
at the first step we see that some columns not formatted properly
glimpse(df)
## Rows: 227,496
## Columns: 21
## $ Year <int> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011…
## $ Month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ DayofMonth <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1…
## $ DayOfWeek <int> 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2…
## $ DepTime <int> 1400, 1401, 1352, 1403, 1405, 1359, 1359, 1355, 1443…
## $ ArrTime <int> 1500, 1501, 1502, 1513, 1507, 1503, 1509, 1454, 1554…
## $ UniqueCarrier <chr> "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA"…
## $ FlightNum <int> 428, 428, 428, 428, 428, 428, 428, 428, 428, 428, 42…
## $ TailNum <chr> "N576AA", "N557AA", "N541AA", "N403AA", "N492AA", "N…
## $ ActualElapsedTime <int> 60, 60, 70, 70, 62, 64, 70, 59, 71, 70, 70, 56, 63, …
## $ AirTime <int> 40, 45, 48, 39, 44, 45, 43, 40, 41, 45, 42, 41, 44, …
## $ ArrDelay <int> -10, -9, -8, 3, -3, -7, -1, -16, 44, 43, 29, 5, -9, …
## $ DepDelay <int> 0, 1, -8, 3, 5, -1, -1, -5, 43, 43, 29, 19, -2, -3, …
## $ Origin <chr> "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "IA…
## $ Dest <chr> "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "DF…
## $ Distance <int> 224, 224, 224, 224, 224, 224, 224, 224, 224, 224, 22…
## $ TaxiIn <int> 7, 6, 5, 9, 9, 6, 12, 7, 8, 6, 8, 4, 6, 5, 6, 12, 8,…
## $ TaxiOut <int> 13, 9, 17, 22, 9, 13, 15, 12, 22, 19, 20, 11, 13, 15…
## $ Cancelled <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ CancellationCode <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", …
## $ Diverted <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
the year, month, and day columns should be formatted as date not
integer instead.
but before formatting the columns, we have to merge these columns to
make it as a full date column
# add leading zeros to column month and dayofmonth
df <- df %>% mutate_at(.vars = c("Month", "DayofMonth"),
.funs = str_pad, width = 2, side = "left", pad = 0) %>%
# merge appropriate columns to create a full dates column
unite("Dates", c(Year, Month, DayofMonth), sep = "-")
after full dates column was generated, we convert the column’s format from string to date format
df$Dates <- as.POSIXct(df$Dates, "%Y-%m-%d")
## Warning in strptime(xx, f, tz = tz): unknown timezone '%Y-%m-%d'
## Warning in as.POSIXct.POSIXlt(x): unknown timezone '%Y-%m-%d'
## Warning in strptime(xx, f, tz = tz): unknown timezone '%Y-%m-%d'
## Warning in as.POSIXct.POSIXlt(x): unknown timezone '%Y-%m-%d'
## Warning in strptime(xx, f, tz = tz): unknown timezone '%Y-%m-%d'
## Warning in as.POSIXct.POSIXlt(x): unknown timezone '%Y-%m-%d'
## Warning in strptime(xx, f, tz = tz): unknown timezone '%Y-%m-%d'
## Warning in as.POSIXct.POSIXlt(x): unknown timezone '%Y-%m-%d'
## Warning in strptime(xx, f, tz = tz): unknown timezone '%Y-%m-%d'
## Warning in as.POSIXct.POSIXlt(x): unknown timezone '%Y-%m-%d'
## Warning in strptime(x, f, tz = tz): unknown timezone '%Y-%m-%d'
## Warning in as.POSIXct.POSIXlt(as.POSIXlt(x, tz, ...), tz, ...): unknown timezone
## '%Y-%m-%d'
total flight for each carrier
df %>% count(UniqueCarrier) %>% arrange(desc(n))
## UniqueCarrier n
## 1 XE 73053
## 2 CO 70032
## 3 WN 45343
## 4 OO 16061
## 5 MQ 4648
## 6 US 4082
## 7 AA 3244
## 8 DL 2641
## 9 EV 2204
## 10 FL 2139
## 11 UA 2072
## 12 F9 838
## 13 B6 695
## 14 AS 365
## 15 YV 79
count the number of cancellation in each carrier
df %>% count(UniqueCarrier, Cancelled) %>%
filter(Cancelled == 1) %>%
arrange(desc(n))
## UniqueCarrier Cancelled n
## 1 XE 1 1132
## 2 WN 1 703
## 3 CO 1 475
## 4 OO 1 224
## 5 MQ 1 135
## 6 EV 1 76
## 7 AA 1 60
## 8 US 1 46
## 9 DL 1 42
## 10 UA 1 34
## 11 FL 1 21
## 12 B6 1 18
## 13 F9 1 6
## 14 YV 1 1
percentage of cancelled flight per carrier
percent_of_cancel <- df %>% count(UniqueCarrier, Cancelled) %>%
group_by(UniqueCarrier) %>%
mutate(`total flights` = sum(n),
percentage = round(n/`total flights`*100, 2)) %>%
filter(Cancelled == 1) %>% select(UniqueCarrier, percentage) %>%
arrange(desc(percentage)) %>% ungroup()
takeaway:
although XE is the carrier with the most number of cancelled flights
among other carriers, but the rank of a percentage of its cancellation
is no more than the top 50% among others.
count flight per canceled code for each carriers
numcancode <- df %>% count(UniqueCarrier, CancellationCode)
numcancode %>% pivot_wider(names_from = CancellationCode, values_from = n)
the pivoting process will error because there is an empty value in cancellation code, thus we have to encoding the empty value, so that the pivoting process will succeed
numcancode %>% #coded empty type of cancellation code
mutate(CancellationCode = case_when(CancellationCode == "" ~ "0",
TRUE ~ CancellationCode)) %>%
# pivoting the table
pivot_wider(names_from = CancellationCode, values_from = n, values_fill = 0)
## # A tibble: 15 × 6
## UniqueCarrier `0` A B C D
## <chr> <int> <int> <int> <int> <int>
## 1 AA 3184 20 29 11 0
## 2 AS 365 0 0 0 0
## 3 B6 677 5 13 0 0
## 4 CO 69557 37 436 2 0
## 5 DL 2599 13 27 2 0
## 6 EV 2128 60 14 2 0
## 7 F9 832 2 4 0 0
## 8 FL 2118 8 12 1 0
## 9 MQ 4513 39 71 25 0
## 10 OO 15837 121 87 15 1
## 11 UA 2038 21 10 3 0
## 12 US 4036 27 17 2 0
## 13 WN 44640 517 181 5 0
## 14 XE 71921 331 751 50 0
## 15 YV 78 1 0 0 0
another way to do the same thing is using table()
table(df$UniqueCarrier, df$CancellationCode)
##
## A B C D
## AA 3184 20 29 11 0
## AS 365 0 0 0 0
## B6 677 5 13 0 0
## CO 69557 37 436 2 0
## DL 2599 13 27 2 0
## EV 2128 60 14 2 0
## F9 832 2 4 0 0
## FL 2118 8 12 1 0
## MQ 4513 39 71 25 0
## OO 15837 121 87 15 1
## UA 2038 21 10 3 0
## US 4036 27 17 2 0
## WN 44640 517 181 5 0
## XE 71921 331 751 50 0
## YV 78 1 0 0 0