Manipulate Data

load the library

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.8     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.1
## ✔ readr   2.1.2     ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

load the data (the mpg data set that comes with ggplot2)

df <- mpg
print(df)
## # A tibble: 234 × 11
##    manufacturer model      displ  year   cyl trans drv     cty   hwy fl    class
##    <chr>        <chr>      <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
##  1 audi         a4           1.8  1999     4 auto… f        18    29 p     comp…
##  2 audi         a4           1.8  1999     4 manu… f        21    29 p     comp…
##  3 audi         a4           2    2008     4 manu… f        20    31 p     comp…
##  4 audi         a4           2    2008     4 auto… f        21    30 p     comp…
##  5 audi         a4           2.8  1999     6 auto… f        16    26 p     comp…
##  6 audi         a4           2.8  1999     6 manu… f        18    26 p     comp…
##  7 audi         a4           3.1  2008     6 auto… f        18    27 p     comp…
##  8 audi         a4 quattro   1.8  1999     4 manu… 4        18    26 p     comp…
##  9 audi         a4 quattro   1.8  1999     4 auto… 4        16    25 p     comp…
## 10 audi         a4 quattro   2    2008     4 manu… 4        20    28 p     comp…
## # … with 224 more rows
# check for missing (NA) values
any(is.na(df))
## [1] FALSE
# explore number of columns and rows
ncol(df); nrow(df)
## [1] 11
## [1] 234
# explore structure of a table
str(df)
## tibble [234 × 11] (S3: tbl_df/tbl/data.frame)
##  $ manufacturer: chr [1:234] "audi" "audi" "audi" "audi" ...
##  $ model       : chr [1:234] "a4" "a4" "a4" "a4" ...
##  $ displ       : num [1:234] 1.8 1.8 2 2 2.8 2.8 3.1 1.8 1.8 2 ...
##  $ year        : int [1:234] 1999 1999 2008 2008 1999 1999 2008 1999 1999 2008 ...
##  $ cyl         : int [1:234] 4 4 4 4 6 6 6 4 4 4 ...
##  $ trans       : chr [1:234] "auto(l5)" "manual(m5)" "manual(m6)" "auto(av)" ...
##  $ drv         : chr [1:234] "f" "f" "f" "f" ...
##  $ cty         : int [1:234] 18 21 20 21 16 18 18 18 16 20 ...
##  $ hwy         : int [1:234] 29 29 31 30 26 26 27 26 25 28 ...
##  $ fl          : chr [1:234] "p" "p" "p" "p" ...
##  $ class       : chr [1:234] "compact" "compact" "compact" "compact" ...
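
a tidyverse-native alternative to str() is glimpse(); a quick sketch (it prints one line per column with its type and first values)

# glimpse() shows every column on its own line, with its type and first values
glimpse(df)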
# explore column names
colnames(df)
##  [1] "manufacturer" "model"        "displ"        "year"         "cyl"         
##  [6] "trans"        "drv"          "cty"          "hwy"          "fl"          
## [11] "class"

select()

select() is a function to pick columns, like the SELECT clause in SQL

# extract specified columns
df %>% select(manufacturer, model, year, class)
## # A tibble: 234 × 4
##    manufacturer model       year class  
##    <chr>        <chr>      <int> <chr>  
##  1 audi         a4          1999 compact
##  2 audi         a4          1999 compact
##  3 audi         a4          2008 compact
##  4 audi         a4          2008 compact
##  5 audi         a4          1999 compact
##  6 audi         a4          1999 compact
##  7 audi         a4          2008 compact
##  8 audi         a4 quattro  1999 compact
##  9 audi         a4 quattro  1999 compact
## 10 audi         a4 quattro  2008 compact
## # … with 224 more rows
# extract columns whose names match a criterion
df %>% select(starts_with(match = "c"))
## # A tibble: 234 × 3
##      cyl   cty class  
##    <int> <int> <chr>  
##  1     4    18 compact
##  2     4    21 compact
##  3     4    20 compact
##  4     4    21 compact
##  5     6    16 compact
##  6     6    18 compact
##  7     6    18 compact
##  8     4    18 compact
##  9     4    16 compact
## 10     4    20 compact
## # … with 224 more rows
df %>% select(contains("a"))
## # A tibble: 234 × 4
##    manufacturer  year trans      class  
##    <chr>        <int> <chr>      <chr>  
##  1 audi          1999 auto(l5)   compact
##  2 audi          1999 manual(m5) compact
##  3 audi          2008 manual(m6) compact
##  4 audi          2008 auto(av)   compact
##  5 audi          1999 auto(l5)   compact
##  6 audi          1999 manual(m5) compact
##  7 audi          2008 auto(av)   compact
##  8 audi          1999 manual(m5) compact
##  9 audi          1999 auto(l5)   compact
## 10 audi          2008 manual(m6) compact
## # … with 224 more rows
# select using index
df %>% select(2:5)
## # A tibble: 234 × 4
##    model      displ  year   cyl
##    <chr>      <dbl> <int> <int>
##  1 a4           1.8  1999     4
##  2 a4           1.8  1999     4
##  3 a4           2    2008     4
##  4 a4           2    2008     4
##  5 a4           2.8  1999     6
##  6 a4           2.8  1999     6
##  7 a4           3.1  2008     6
##  8 a4 quattro   1.8  1999     4
##  9 a4 quattro   1.8  1999     4
## 10 a4 quattro   2    2008     4
## # … with 224 more rows
# select the last n columns (a little tricky)
df %>% select((ncol(df)-2):ncol(df)) # extracts the last 3 columns
## # A tibble: 234 × 3
##      hwy fl    class  
##    <int> <chr> <chr>  
##  1    29 p     compact
##  2    29 p     compact
##  3    31 p     compact
##  4    30 p     compact
##  5    26 p     compact
##  6    26 p     compact
##  7    27 p     compact
##  8    26 p     compact
##  9    25 p     compact
## 10    28 p     compact
## # … with 224 more rows
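
a less tricky alternative, as a small sketch: the tidyselect helper last_col() counts back from the final column, so last_col(2) is the third column from the end

# alternative way to select the last 3 columns using last_col()
df %>% select(last_col(2):last_col())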

rename() and everything()

everything() is a helper function that grabs all the remaining, unspecified columns

# rename all columns at once
df1 <- df %>% rename(yes = everything())

# using select and rename simultaneously
df %>% select(mnfc = manufacturer, mod = model, display = displ, everything())
## # A tibble: 234 × 11
##    mnfc  mod        display  year   cyl trans      drv     cty   hwy fl    class
##    <chr> <chr>        <dbl> <int> <int> <chr>      <chr> <int> <int> <chr> <chr>
##  1 audi  a4             1.8  1999     4 auto(l5)   f        18    29 p     comp…
##  2 audi  a4             1.8  1999     4 manual(m5) f        21    29 p     comp…
##  3 audi  a4             2    2008     4 manual(m6) f        20    31 p     comp…
##  4 audi  a4             2    2008     4 auto(av)   f        21    30 p     comp…
##  5 audi  a4             2.8  1999     6 auto(l5)   f        16    26 p     comp…
##  6 audi  a4             2.8  1999     6 manual(m5) f        18    26 p     comp…
##  7 audi  a4             3.1  2008     6 auto(av)   f        18    27 p     comp…
##  8 audi  a4 quattro     1.8  1999     4 manual(m5) 4        18    26 p     comp…
##  9 audi  a4 quattro     1.8  1999     4 auto(l5)   4        16    25 p     comp…
## 10 audi  a4 quattro     2    2008     4 manual(m6) 4        20    28 p     comp…
## # … with 224 more rows
# note that this code doesn't rename the original data set; it only renames the displayed result, i.e. these are aliases

# to rename the original data set, assign the result back
# df <-df %>% rename(mnfc = manufacturer, mod = model)
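
a more robust way to rename many columns at once is rename_with(), which applies a function to the selected column names; a minimal sketch (no assignment back to df, so the original data stays untouched)

# apply a renaming function to every column name (default .cols = everything())
df %>% rename_with(toupper)
# apply it only to selected columns
df %>% rename_with(~ paste0("mpg_", .x), c(cty, hwy))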

mutate() and transmute()

mutate() is a function to create new variables in a data set
transmute() also creates new variables, but it drops all other variables

# create new variables
df <- df %>% mutate(`avg mile per galon` = (cty+hwy)/2)

df <- df %>% mutate(car = paste(manufacturer, model))

df <- df %>% mutate(`cyl and trans` = paste(cyl, "cylinders and", trans, "transmissions"))
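
for comparison, a small transmute() sketch: the same kind of calculation, but only the listed variables are kept in the result (df itself is not modified here)

# transmute keeps only the variables it creates or explicitly lists
df %>% transmute(manufacturer, model, `avg mpg` = (cty + hwy) / 2)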

filter() and slice()

filter() is a function to keep rows that match given criteria
slice() is a function to keep rows by their indices

# the clauses follow the same ordering logic as SQL
df %>%  select(manufacturer, year, hwy) %>% filter(manufacturer %in% c("audi","ford"))
## # A tibble: 43 × 3
##    manufacturer  year   hwy
##    <chr>        <int> <int>
##  1 audi          1999    29
##  2 audi          1999    29
##  3 audi          2008    31
##  4 audi          2008    30
##  5 audi          1999    26
##  6 audi          1999    26
##  7 audi          2008    27
##  8 audi          1999    26
##  9 audi          1999    25
## 10 audi          2008    28
## # … with 33 more rows
# to filter on manufacturer without displaying it, put filter() before select()
df %>% filter(manufacturer %in% c("audi","ford")) %>%  select(year, hwy)
## # A tibble: 43 × 2
##     year   hwy
##    <int> <int>
##  1  1999    29
##  2  1999    29
##  3  2008    31
##  4  2008    30
##  5  1999    26
##  6  1999    26
##  7  2008    27
##  8  1999    26
##  9  1999    25
## 10  2008    28
## # … with 33 more rows
# if select() drops manufacturer first and filter() on it comes afterwards, the code will throw an error

# combining select, filter, group_by, and summarise
df %>%  select(manufacturer, year, hwy) %>% filter(manufacturer %in% c("audi","ford")) %>% group_by(manufacturer, year) %>% summarise(`avg mean` = mean(hwy))
## `summarise()` has grouped output by 'manufacturer'. You can override using the
## `.groups` argument.
## # A tibble: 4 × 3
## # Groups:   manufacturer [2]
##   manufacturer  year `avg mean`
##   <chr>        <int>      <dbl>
## 1 audi          1999       26.1
## 2 audi          2008       26.8
## 3 ford          1999       18.6
## 4 ford          2008       20.5
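
filter() also accepts several conditions at once; a quick sketch (comma-separated conditions are combined with AND, while | means OR)

# commas act as AND: model year 2008 and highway mileage of at least 30
df %>% filter(year == 2008, hwy >= 30) %>% select(manufacturer, model, hwy)
# | acts as OR: keep 4- or 6-cylinder cars
df %>% filter(cyl == 4 | cyl == 6) %>% count(cyl)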

slice()

# get rows 20 through 30
df %>% slice(20:30)
## # A tibble: 11 × 14
##    manufac…¹ model displ  year   cyl trans drv     cty   hwy fl    class avg m…²
##    <chr>     <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>   <dbl>
##  1 chevrolet c150…   5.3  2008     8 auto… r        11    15 e     suv      13  
##  2 chevrolet c150…   5.3  2008     8 auto… r        14    20 r     suv      17  
##  3 chevrolet c150…   5.7  1999     8 auto… r        13    17 r     suv      15  
##  4 chevrolet c150…   6    2008     8 auto… r        12    17 r     suv      14.5
##  5 chevrolet corv…   5.7  1999     8 manu… r        16    26 p     2sea…    21  
##  6 chevrolet corv…   5.7  1999     8 auto… r        15    23 p     2sea…    19  
##  7 chevrolet corv…   6.2  2008     8 manu… r        16    26 p     2sea…    21  
##  8 chevrolet corv…   6.2  2008     8 auto… r        15    25 p     2sea…    20  
##  9 chevrolet corv…   7    2008     8 manu… r        15    24 p     2sea…    19.5
## 10 chevrolet k150…   5.3  2008     8 auto… 4        14    19 r     suv      16.5
## 11 chevrolet k150…   5.3  2008     8 auto… 4        11    14 e     suv      12.5
## # … with 2 more variables: car <chr>, `cyl and trans` <chr>, and abbreviated
## #   variable names ¹​manufacturer, ²​`avg mile per galon`
# combining slice and select
df %>% select(manufacturer, model, year) %>% slice(20:30)
## # A tibble: 11 × 3
##    manufacturer model               year
##    <chr>        <chr>              <int>
##  1 chevrolet    c1500 suburban 2wd  2008
##  2 chevrolet    c1500 suburban 2wd  2008
##  3 chevrolet    c1500 suburban 2wd  1999
##  4 chevrolet    c1500 suburban 2wd  2008
##  5 chevrolet    corvette            1999
##  6 chevrolet    corvette            1999
##  7 chevrolet    corvette            2008
##  8 chevrolet    corvette            2008
##  9 chevrolet    corvette            2008
## 10 chevrolet    k1500 tahoe 4wd     2008
## 11 chevrolet    k1500 tahoe 4wd     2008

arrange()

just like the ORDER BY clause in SQL
the default ordering is ascending

# order cty ascending and hwy descending, then display 20 records
print(df %>% arrange(cty, desc(hwy)), n=20)
## # A tibble: 234 × 14
##    manufac…¹ model displ  year   cyl trans drv     cty   hwy fl    class avg m…²
##    <chr>     <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>   <dbl>
##  1 dodge     dako…   4.7  2008     8 auto… 4         9    12 e     pick…    10.5
##  2 dodge     dura…   4.7  2008     8 auto… 4         9    12 e     suv      10.5
##  3 dodge     ram …   4.7  2008     8 auto… 4         9    12 e     pick…    10.5
##  4 dodge     ram …   4.7  2008     8 manu… 4         9    12 e     pick…    10.5
##  5 jeep      gran…   4.7  2008     8 auto… 4         9    12 e     suv      10.5
##  6 dodge     cara…   3.3  2008     6 auto… f        11    17 e     mini…    14  
##  7 dodge     dako…   5.2  1999     8 manu… 4        11    17 r     pick…    14  
##  8 ford      expe…   4.6  1999     8 auto… r        11    17 r     suv      14  
##  9 ford      expe…   5.4  1999     8 auto… r        11    17 r     suv      14  
## 10 lincoln   navi…   5.4  1999     8 auto… r        11    17 r     suv      14  
## 11 dodge     dura…   5.2  1999     8 auto… 4        11    16 r     suv      13.5
## 12 dodge     ram …   5.2  1999     8 manu… 4        11    16 r     pick…    13.5
## 13 lincoln   navi…   5.4  1999     8 auto… r        11    16 p     suv      13.5
## 14 chevrolet c150…   5.3  2008     8 auto… r        11    15 e     suv      13  
## 15 chevrolet k150…   5.7  1999     8 auto… 4        11    15 r     suv      13  
## 16 dodge     dako…   5.2  1999     8 auto… 4        11    15 r     pick…    13  
## 17 dodge     dura…   5.9  1999     8 auto… 4        11    15 r     suv      13  
## 18 dodge     ram …   5.2  1999     8 auto… 4        11    15 r     pick…    13  
## 19 dodge     ram …   5.9  1999     8 auto… 4        11    15 r     pick…    13  
## 20 ford      f150…   5.4  1999     8 auto… 4        11    15 r     pick…    13  
## # … with 214 more rows, 2 more variables: car <chr>, `cyl and trans` <chr>, and
## #   abbreviated variable names ¹​manufacturer, ²​`avg mile per galon`
# a more advanced use of arrange at the end of a pipeline
print(df %>% select(manufacturer, year, hwy) %>% group_by(manufacturer) %>% filter(year == 1999) %>% summarise(`average of hwy` = mean(hwy)) %>% arrange(desc(`average of hwy`)), n=5)
## # A tibble: 15 × 2
##   manufacturer `average of hwy`
##   <chr>                   <dbl>
## 1 honda                    31.6
## 2 volkswagen               29.7
## 3 hyundai                  26.7
## 4 pontiac                  26.3
## 5 audi                     26.1
## # … with 10 more rows

distinct()

just like the DISTINCT clause in SQL

df %>% distinct(manufacturer, year) %>% arrange(manufacturer, year)
## # A tibble: 30 × 2
##    manufacturer  year
##    <chr>        <int>
##  1 audi          1999
##  2 audi          2008
##  3 chevrolet     1999
##  4 chevrolet     2008
##  5 dodge         1999
##  6 dodge         2008
##  7 ford          1999
##  8 ford          2008
##  9 honda         1999
## 10 honda         2008
## # … with 20 more rows
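
distinct() can also keep the remaining columns of the first matching row via the .keep_all argument; a minimal sketch

# keep every column from the first row of each distinct model
df %>% distinct(model, .keep_all = TRUE)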

bind_()

bind_rows() and bind_cols() bind rows or columns together, just like rbind() and cbind() in base R

exmp <- data.frame(id = c(1,2,3), names = c("john", "michael", "aries"))

exmp <- bind_rows(exmp, slice(exmp, 2:3))

exmp <- arrange(exmp, id)
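
bind_cols() works the same way for columns; a small sketch (the score values below are made up purely for illustration, and both inputs must have the same number of rows)

# exmp now has 5 rows (3 original + 2 duplicated), so bind a 5-value column
scores <- data.frame(score = c(90, 75, 88, 60, 72))
exmp <- bind_cols(exmp, scores)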

sample_n() and sample_frac()

sample_n() samples a given number of rows
sample_frac() samples a given fraction (percentage) of rows

# draw 20 rows from df without replacement
sample_n(df, 20, replace = F)
## # A tibble: 20 × 14
##    manufac…¹ model displ  year   cyl trans drv     cty   hwy fl    class avg m…²
##    <chr>     <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>   <dbl>
##  1 jeep      gran…   6.1  2008     8 auto… 4        11    14 p     suv      12.5
##  2 toyota    toyo…   2.7  2008     4 manu… 4        17    22 r     pick…    19.5
##  3 land rov… rang…   4.6  1999     8 auto… 4        11    15 p     suv      13  
##  4 subaru    fore…   2.5  1999     4 manu… 4        18    25 r     suv      21.5
##  5 dodge     dako…   5.2  1999     8 auto… 4        11    15 r     pick…    13  
##  6 ford      expe…   4.6  1999     8 auto… r        11    17 r     suv      14  
##  7 toyota    coro…   1.8  1999     4 manu… f        26    35 r     comp…    30.5
##  8 dodge     ram …   4.7  2008     8 manu… 4        12    16 r     pick…    14  
##  9 volkswag… new …   1.9  1999     4 manu… f        35    44 d     subc…    39.5
## 10 toyota    camr…   3.3  2008     6 auto… f        18    27 r     comp…    22.5
## 11 mercury   moun…   5    1999     8 auto… 4        13    17 r     suv      15  
## 12 honda     civic   1.8  2008     4 auto… f        24    36 c     subc…    30  
## 13 ford      expe…   5.4  1999     8 auto… r        11    17 r     suv      14  
## 14 chevrolet mali…   3.6  2008     6 auto… f        17    26 r     mids…    21.5
## 15 volkswag… jetta   2    2008     4 manu… f        21    29 p     comp…    25  
## 16 dodge     ram …   4.7  2008     8 manu… 4         9    12 e     pick…    10.5
## 17 dodge     dako…   4.7  2008     8 auto… 4         9    12 e     pick…    10.5
## 18 pontiac   gran…   3.1  1999     6 auto… f        18    26 r     mids…    22  
## 19 dodge     cara…   3    1999     6 auto… f        17    24 r     mini…    20.5
## 20 audi      a6 q…   3.1  2008     6 auto… 4        17    25 p     mids…    21  
## # … with 2 more variables: car <chr>, `cyl and trans` <chr>, and abbreviated
## #   variable names ¹​manufacturer, ²​`avg mile per galon`
# draw a 20% sample from df
sample_frac(df, 0.2)
## # A tibble: 47 × 14
##    manufac…¹ model displ  year   cyl trans drv     cty   hwy fl    class avg m…²
##    <chr>     <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>   <dbl>
##  1 hyundai   tibu…   2    2008     4 auto… f        20    27 r     subc…    23.5
##  2 toyota    coro…   1.8  2008     4 manu… f        28    37 r     comp…    32.5
##  3 ford      f150…   4.2  1999     6 auto… 4        14    17 r     pick…    15.5
##  4 dodge     cara…   3    1999     6 auto… f        17    24 r     mini…    20.5
##  5 subaru    fore…   2.5  1999     4 manu… 4        18    25 r     suv      21.5
##  6 chevrolet c150…   5.7  1999     8 auto… r        13    17 r     suv      15  
##  7 dodge     dura…   4.7  2008     8 auto… 4         9    12 e     suv      10.5
##  8 nissan    path…   5.6  2008     8 auto… 4        12    18 p     suv      15  
##  9 volkswag… gti     2    2008     4 manu… f        21    29 p     comp…    25  
## 10 chevrolet corv…   6.2  2008     8 auto… r        15    25 p     2sea…    20  
## # … with 37 more rows, 2 more variables: car <chr>, `cyl and trans` <chr>, and
## #   abbreviated variable names ¹​manufacturer, ²​`avg mile per galon`
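
in newer dplyr versions both functions are superseded by slice_sample(), which covers the same two cases; a quick sketch

# slice_sample() replaces sample_n() and sample_frac()
slice_sample(df, n = 20)      # 20 random rows
slice_sample(df, prop = 0.2)  # 20% of the rows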

summarise()

summarise() is used with aggregate functions such as n(), mean(), min(), and max()

# count number of records and number of distinct cty records
df %>% summarise(n(), n_distinct(cty))
## # A tibble: 1 × 2
##   `n()` `n_distinct(cty)`
##   <int>             <int>
## 1   234                21
# count the number of records per group
df %>% select(manufacturer, model) %>% group_by(manufacturer, model) %>% summarise(numrec = n())
## `summarise()` has grouped output by 'manufacturer'. You can override using the
## `.groups` argument.
## # A tibble: 38 × 3
## # Groups:   manufacturer [15]
##    manufacturer model              numrec
##    <chr>        <chr>               <int>
##  1 audi         a4                      7
##  2 audi         a4 quattro              8
##  3 audi         a6 quattro              3
##  4 chevrolet    c1500 suburban 2wd      5
##  5 chevrolet    corvette                5
##  6 chevrolet    k1500 tahoe 4wd         4
##  7 chevrolet    malibu                  5
##  8 dodge        caravan 2wd            11
##  9 dodge        dakota pickup 4wd       9
## 10 dodge        durango 4wd             7
## # … with 28 more rows

the same result can be obtained with a simpler line of code using count()

count()

count(df, manufacturer, model) # produces the same result as the grouped summarise() example above
## # A tibble: 38 × 3
##    manufacturer model                  n
##    <chr>        <chr>              <int>
##  1 audi         a4                     7
##  2 audi         a4 quattro             8
##  3 audi         a6 quattro             3
##  4 chevrolet    c1500 suburban 2wd     5
##  5 chevrolet    corvette               5
##  6 chevrolet    k1500 tahoe 4wd        4
##  7 chevrolet    malibu                 5
##  8 dodge        caravan 2wd           11
##  9 dodge        dakota pickup 4wd      9
## 10 dodge        durango 4wd            7
## # … with 28 more rows
df %>% group_by(manufacturer) %>% summarise(n())
## # A tibble: 15 × 2
##    manufacturer `n()`
##    <chr>        <int>
##  1 audi            18
##  2 chevrolet       19
##  3 dodge           37
##  4 ford            25
##  5 honda            9
##  6 hyundai         14
##  7 jeep             8
##  8 land rover       4
##  9 lincoln          3
## 10 mercury          4
## 11 nissan          13
## 12 pontiac          5
## 13 subaru          14
## 14 toyota          34
## 15 volkswagen      27
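
count() can also sort the result directly, which saves a separate arrange() step; a small sketch

# count rows per manufacturer, largest group first
count(df, manufacturer, sort = TRUE)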
df %>% group_by(manufacturer) %>% summarise(`min hwy` = min(hwy), `max hwy` = max(hwy),
                                            `min cyl` = min(cyl), `max cyl` = max(cyl))
## # A tibble: 15 × 5
##    manufacturer `min hwy` `max hwy` `min cyl` `max cyl`
##    <chr>            <int>     <int>     <int>     <int>
##  1 audi                23        31         4         8
##  2 chevrolet           14        30         4         8
##  3 dodge               12        24         4         8
##  4 ford                15        26         6         8
##  5 honda               29        36         4         4
##  6 hyundai             24        31         4         6
##  7 jeep                12        22         6         8
##  8 land rover          15        18         8         8
##  9 lincoln             16        18         8         8
## 10 mercury             17        19         6         8
## 11 nissan              17        32         4         8
## 12 pontiac             25        28         6         8
## 13 subaru              23        27         4         4
## 14 toyota              15        37         4         8
## 15 volkswagen          23        44         4         6
df %>% group_by(manufacturer, model, class, trans) %>% 
        summarise(`mean hwy` = mean(hwy), cars = n()) %>% 
        filter(`mean hwy` > 30) %>% 
        arrange(desc(`mean hwy`)) %>% ungroup()
## `summarise()` has grouped output by 'manufacturer', 'model', 'class'. You can
## override using the `.groups` argument.
## # A tibble: 10 × 6
##    manufacturer model      class      trans      `mean hwy`  cars
##    <chr>        <chr>      <chr>      <chr>           <dbl> <int>
##  1 honda        civic      subcompact auto(l5)         36       2
##  2 toyota       corolla    compact    manual(m5)       36       2
##  3 toyota       corolla    compact    auto(l4)         34       2
##  4 volkswagen   new beetle subcompact manual(m5)       33.7     3
##  5 volkswagen   new beetle subcompact auto(l4)         33.5     2
##  6 honda        civic      subcompact auto(l4)         32       2
##  7 honda        civic      subcompact manual(m5)       32       4
##  8 volkswagen   jetta      compact    manual(m5)       31.5     4
##  9 audi         a4         compact    manual(m6)       31       1
## 10 toyota       camry      midsize    auto(l5)         31       1
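
the repeated "grouped output" message can be silenced by setting the .groups argument of summarise() explicitly; a minimal sketch of the same query with .groups = "drop", which also makes the final ungroup() unnecessary

df %>% group_by(manufacturer, model, class, trans) %>% 
        summarise(`mean hwy` = mean(hwy), cars = n(), .groups = "drop") %>% 
        filter(`mean hwy` > 30) %>% 
        arrange(desc(`mean hwy`))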

pivot_longer() and pivot_wider()

these functions transform a table from wide format to long format and vice versa.

# create an example table in long format
long <- data.frame(id = 1:6, type = c("a", "b", "b", "c", "b", "a"),
                   total = c(20, 40, 30, 45, 12, 35))

# convert to wide format
wide <- pivot_wider(long, names_from = type, values_from = total)

# convert the wide table back to long format
wide %>% pivot_longer(cols = c("a", "b", "c"), names_to = "type", values_to = "total", values_drop_na = T)
## # A tibble: 6 × 3
##      id type  total
##   <int> <chr> <dbl>
## 1     1 a        20
## 2     2 b        40
## 3     3 b        30
## 4     4 c        45
## 5     5 b        12
## 6     6 a        35
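
if the wide table should contain zeros instead of NA for missing combinations, pivot_wider() accepts a values_fill argument; a quick sketch using the same long table

# fill missing id/type combinations with 0 instead of NA
pivot_wider(long, names_from = type, values_from = total, values_fill = 0)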
# a more advanced example with the mpg data
play <- df %>% filter(manufacturer %in% c("jeep", "hyundai", "land rover")) %>% 
        select(model, trans, hwy)

wideplay <- play %>% group_by(model, trans) %>%  summarise(`avg hwy` = mean(hwy)) %>% 
        pivot_wider(names_from = trans, values_from = `avg hwy`)
## `summarise()` has grouped output by 'model'. You can override using the
## `.groups` argument.
# convert back to long format with a little trick
wideplay %>% pivot_longer(cols = -model, # exclude the model column and keep all the rest
                          names_to = "trans",
                          values_to = "avg hwy",
                          values_drop_na = T)
## # A tibble: 10 × 3
## # Groups:   model [4]
##    model              trans      `avg hwy`
##    <chr>              <chr>          <dbl>
##  1 grand cherokee 4wd auto(l4)        18.5
##  2 grand cherokee 4wd auto(l5)        17.3
##  3 range rover        auto(l4)        15  
##  4 range rover        auto(s6)        18  
##  5 sonata             auto(l4)        27.3
##  6 sonata             auto(l5)        28  
##  7 sonata             manual(m5)      28  
##  8 tiburon            auto(l4)        25.7
##  9 tiburon            manual(m5)      27  
## 10 tiburon            manual(m6)      24

separate() and unite()

just like splitting and merging cells in Excel
separate

# create a table of dates just for this example
dates <- seq.Date(from = as.Date("2022-01-01"), to = as.Date("2022-12-31"), by = "day")
# place the dates in a data frame
datedf <- data.frame(date = dates)

sptdatedf <- datedf %>% separate(col = date, into = c("year", "month", "date"), sep = "-") %>% 
        mutate(month = as.numeric(month), 
               date = as.numeric(date)) # remove leading zeros

# another way to mutate several columns at once
datedf %>% separate(col = date, into = c("year", "month", "date"), sep = "-") %>% 
        mutate_at(.vars = c("month", "date"), # which columns to mutate
                  .funs = as.numeric) # which function to apply
##     year month date
## 1   2022     1    1
## 2   2022     1    2
## 3   2022     1    3
## 4   2022     1    4
## 5   2022     1    5
## 6   2022     1    6
## 7   2022     1    7
## 8   2022     1    8
## 9   2022     1    9
## 10  2022     1   10
## 11  2022     1   11
## 12  2022     1   12
## 13  2022     1   13
## 14  2022     1   14
## 15  2022     1   15
## 16  2022     1   16
## 17  2022     1   17
## 18  2022     1   18
## 19  2022     1   19
## 20  2022     1   20
## 21  2022     1   21
## 22  2022     1   22
## 23  2022     1   23
## 24  2022     1   24
## 25  2022     1   25
## 26  2022     1   26
## 27  2022     1   27
## 28  2022     1   28
## 29  2022     1   29
## 30  2022     1   30
## 31  2022     1   31
## 32  2022     2    1
## 33  2022     2    2
## 34  2022     2    3
## 35  2022     2    4
## 36  2022     2    5
## 37  2022     2    6
## 38  2022     2    7
## 39  2022     2    8
## 40  2022     2    9
## 41  2022     2   10
## 42  2022     2   11
## 43  2022     2   12
## 44  2022     2   13
## 45  2022     2   14
## 46  2022     2   15
## 47  2022     2   16
## 48  2022     2   17
## 49  2022     2   18
## 50  2022     2   19
## 51  2022     2   20
## 52  2022     2   21
## 53  2022     2   22
## 54  2022     2   23
## 55  2022     2   24
## 56  2022     2   25
## 57  2022     2   26
## 58  2022     2   27
## 59  2022     2   28
## 60  2022     3    1
## 61  2022     3    2
## 62  2022     3    3
## 63  2022     3    4
## 64  2022     3    5
## 65  2022     3    6
## 66  2022     3    7
## 67  2022     3    8
## 68  2022     3    9
## 69  2022     3   10
## 70  2022     3   11
## 71  2022     3   12
## 72  2022     3   13
## 73  2022     3   14
## 74  2022     3   15
## 75  2022     3   16
## 76  2022     3   17
## 77  2022     3   18
## 78  2022     3   19
## 79  2022     3   20
## 80  2022     3   21
## 81  2022     3   22
## 82  2022     3   23
## 83  2022     3   24
## 84  2022     3   25
## 85  2022     3   26
## 86  2022     3   27
## 87  2022     3   28
## 88  2022     3   29
## 89  2022     3   30
## 90  2022     3   31
## 91  2022     4    1
## 92  2022     4    2
## 93  2022     4    3
## 94  2022     4    4
## 95  2022     4    5
## 96  2022     4    6
## 97  2022     4    7
## 98  2022     4    8
## 99  2022     4    9
## 100 2022     4   10
## 101 2022     4   11
## 102 2022     4   12
## 103 2022     4   13
## 104 2022     4   14
## 105 2022     4   15
## 106 2022     4   16
## 107 2022     4   17
## 108 2022     4   18
## 109 2022     4   19
## 110 2022     4   20
## 111 2022     4   21
## 112 2022     4   22
## 113 2022     4   23
## 114 2022     4   24
## 115 2022     4   25
## 116 2022     4   26
## 117 2022     4   27
## 118 2022     4   28
## 119 2022     4   29
## 120 2022     4   30
## 121 2022     5    1
## 122 2022     5    2
## 123 2022     5    3
## 124 2022     5    4
## 125 2022     5    5
## 126 2022     5    6
## 127 2022     5    7
## 128 2022     5    8
## 129 2022     5    9
## 130 2022     5   10
## 131 2022     5   11
## 132 2022     5   12
## 133 2022     5   13
## 134 2022     5   14
## 135 2022     5   15
## 136 2022     5   16
## 137 2022     5   17
## 138 2022     5   18
## 139 2022     5   19
## 140 2022     5   20
## 141 2022     5   21
## 142 2022     5   22
## 143 2022     5   23
## 144 2022     5   24
## 145 2022     5   25
## 146 2022     5   26
## 147 2022     5   27
## 148 2022     5   28
## 149 2022     5   29
## 150 2022     5   30
## 151 2022     5   31
## 152 2022     6    1
## 153 2022     6    2
## 154 2022     6    3
## 155 2022     6    4
## 156 2022     6    5
## 157 2022     6    6
## 158 2022     6    7
## 159 2022     6    8
## 160 2022     6    9
## 161 2022     6   10
## 162 2022     6   11
## 163 2022     6   12
## 164 2022     6   13
## 165 2022     6   14
## 166 2022     6   15
## 167 2022     6   16
## 168 2022     6   17
## 169 2022     6   18
## 170 2022     6   19
## 171 2022     6   20
## 172 2022     6   21
## 173 2022     6   22
## 174 2022     6   23
## 175 2022     6   24
## 176 2022     6   25
## 177 2022     6   26
## 178 2022     6   27
## 179 2022     6   28
## 180 2022     6   29
## 181 2022     6   30
## 182 2022     7    1
## 183 2022     7    2
## 184 2022     7    3
## 185 2022     7    4
## 186 2022     7    5
## 187 2022     7    6
## 188 2022     7    7
## 189 2022     7    8
## 190 2022     7    9
## 191 2022     7   10
## 192 2022     7   11
## 193 2022     7   12
## 194 2022     7   13
## 195 2022     7   14
## 196 2022     7   15
## 197 2022     7   16
## 198 2022     7   17
## 199 2022     7   18
## 200 2022     7   19
## 201 2022     7   20
## 202 2022     7   21
## 203 2022     7   22
## 204 2022     7   23
## 205 2022     7   24
## 206 2022     7   25
## 207 2022     7   26
## 208 2022     7   27
## 209 2022     7   28
## 210 2022     7   29
## 211 2022     7   30
## 212 2022     7   31
## 213 2022     8    1
## 214 2022     8    2
## 215 2022     8    3
## 216 2022     8    4
## 217 2022     8    5
## 218 2022     8    6
## 219 2022     8    7
## 220 2022     8    8
## 221 2022     8    9
## 222 2022     8   10
## 223 2022     8   11
## 224 2022     8   12
## 225 2022     8   13
## 226 2022     8   14
## 227 2022     8   15
## 228 2022     8   16
## 229 2022     8   17
## 230 2022     8   18
## 231 2022     8   19
## 232 2022     8   20
## 233 2022     8   21
## 234 2022     8   22
## 235 2022     8   23
## 236 2022     8   24
## 237 2022     8   25
## 238 2022     8   26
## 239 2022     8   27
## 240 2022     8   28
## 241 2022     8   29
## 242 2022     8   30
## 243 2022     8   31
## 244 2022     9    1
## 245 2022     9    2
## 246 2022     9    3
## 247 2022     9    4
## 248 2022     9    5
## 249 2022     9    6
## 250 2022     9    7
## 251 2022     9    8
## 252 2022     9    9
## 253 2022     9   10
## 254 2022     9   11
## 255 2022     9   12
## 256 2022     9   13
## 257 2022     9   14
## 258 2022     9   15
## 259 2022     9   16
## 260 2022     9   17
## 261 2022     9   18
## 262 2022     9   19
## 263 2022     9   20
## 264 2022     9   21
## 265 2022     9   22
## 266 2022     9   23
## 267 2022     9   24
## 268 2022     9   25
## 269 2022     9   26
## 270 2022     9   27
## 271 2022     9   28
## 272 2022     9   29
## 273 2022     9   30
## 274 2022    10    1
## 275 2022    10    2
## 276 2022    10    3
## 277 2022    10    4
## 278 2022    10    5
## 279 2022    10    6
## 280 2022    10    7
## 281 2022    10    8
## 282 2022    10    9
## 283 2022    10   10
## 284 2022    10   11
## 285 2022    10   12
## 286 2022    10   13
## 287 2022    10   14
## 288 2022    10   15
## 289 2022    10   16
## 290 2022    10   17
## 291 2022    10   18
## 292 2022    10   19
## 293 2022    10   20
## 294 2022    10   21
## 295 2022    10   22
## 296 2022    10   23
## 297 2022    10   24
## 298 2022    10   25
## 299 2022    10   26
## 300 2022    10   27
## 301 2022    10   28
## 302 2022    10   29
## 303 2022    10   30
## 304 2022    10   31
## 305 2022    11    1
## 306 2022    11    2
## 307 2022    11    3
## 308 2022    11    4
## 309 2022    11    5
## 310 2022    11    6
## 311 2022    11    7
## 312 2022    11    8
## 313 2022    11    9
## 314 2022    11   10
## 315 2022    11   11
## 316 2022    11   12
## 317 2022    11   13
## 318 2022    11   14
## 319 2022    11   15
## 320 2022    11   16
## 321 2022    11   17
## 322 2022    11   18
## 323 2022    11   19
## 324 2022    11   20
## 325 2022    11   21
## 326 2022    11   22
## 327 2022    11   23
## 328 2022    11   24
## 329 2022    11   25
## 330 2022    11   26
## 331 2022    11   27
## 332 2022    11   28
## 333 2022    11   29
## 334 2022    11   30
## 335 2022    12    1
## 336 2022    12    2
## 337 2022    12    3
## 338 2022    12    4
## 339 2022    12    5
## 340 2022    12    6
## 341 2022    12    7
## 342 2022    12    8
## 343 2022    12    9
## 344 2022    12   10
## 345 2022    12   11
## 346 2022    12   12
## 347 2022    12   13
## 348 2022    12   14
## 349 2022    12   15
## 350 2022    12   16
## 351 2022    12   17
## 352 2022    12   18
## 353 2022    12   19
## 354 2022    12   20
## 355 2022    12   21
## 356 2022    12   22
## 357 2022    12   23
## 358 2022    12   24
## 359 2022    12   25
## 360 2022    12   26
## 361 2022    12   27
## 362 2022    12   28
## 363 2022    12   29
## 364 2022    12   30
## 365 2022    12   31
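
the mutate step above can also be skipped: separate() has a convert argument that runs type conversion on the new columns, so "01" becomes the number 1; a minimal sketch

# convert = TRUE turns the split character columns into numbers automatically
datedf %>% separate(col = date, into = c("year", "month", "date"), sep = "-", convert = TRUE)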

unite

# add leading zeros before uniting the date columns
sptdatedf %>% mutate(month = str_pad(width = 2, side = "left", string = month, pad = 0),
                     date = str_pad(width = 2, string = date, side = "left", pad = 0)) %>% 
        unite(col = "full dates", c(year,month,date), sep = "-", remove = F)
##     full dates year month date
## 1   2022-01-01 2022    01   01
## 2   2022-01-02 2022    01   02
## 3   2022-01-03 2022    01   03
## 4   2022-01-04 2022    01   04
## 5   2022-01-05 2022    01   05
## 6   2022-01-06 2022    01   06
## 7   2022-01-07 2022    01   07
## 8   2022-01-08 2022    01   08
## 9   2022-01-09 2022    01   09
## 10  2022-01-10 2022    01   10
## 11  2022-01-11 2022    01   11
## 12  2022-01-12 2022    01   12
## 13  2022-01-13 2022    01   13
## 14  2022-01-14 2022    01   14
## 15  2022-01-15 2022    01   15
## 16  2022-01-16 2022    01   16
## 17  2022-01-17 2022    01   17
## 18  2022-01-18 2022    01   18
## 19  2022-01-19 2022    01   19
## 20  2022-01-20 2022    01   20
## 21  2022-01-21 2022    01   21
## 22  2022-01-22 2022    01   22
## 23  2022-01-23 2022    01   23
## 24  2022-01-24 2022    01   24
## 25  2022-01-25 2022    01   25
## 26  2022-01-26 2022    01   26
## 27  2022-01-27 2022    01   27
## 28  2022-01-28 2022    01   28
## 29  2022-01-29 2022    01   29
## 30  2022-01-30 2022    01   30
## 31  2022-01-31 2022    01   31
## 32  2022-02-01 2022    02   01
## 33  2022-02-02 2022    02   02
## 34  2022-02-03 2022    02   03
## 35  2022-02-04 2022    02   04
## 36  2022-02-05 2022    02   05
## 37  2022-02-06 2022    02   06
## 38  2022-02-07 2022    02   07
## 39  2022-02-08 2022    02   08
## 40  2022-02-09 2022    02   09
## 41  2022-02-10 2022    02   10
## 42  2022-02-11 2022    02   11
## 43  2022-02-12 2022    02   12
## 44  2022-02-13 2022    02   13
## 45  2022-02-14 2022    02   14
## 46  2022-02-15 2022    02   15
## 47  2022-02-16 2022    02   16
## 48  2022-02-17 2022    02   17
## 49  2022-02-18 2022    02   18
## 50  2022-02-19 2022    02   19
## 51  2022-02-20 2022    02   20
## 52  2022-02-21 2022    02   21
## 53  2022-02-22 2022    02   22
## 54  2022-02-23 2022    02   23
## 55  2022-02-24 2022    02   24
## 56  2022-02-25 2022    02   25
## 57  2022-02-26 2022    02   26
## 58  2022-02-27 2022    02   27
## 59  2022-02-28 2022    02   28
## 60  2022-03-01 2022    03   01
## 61  2022-03-02 2022    03   02
## 62  2022-03-03 2022    03   03
## 63  2022-03-04 2022    03   04
## 64  2022-03-05 2022    03   05
## 65  2022-03-06 2022    03   06
## 66  2022-03-07 2022    03   07
## 67  2022-03-08 2022    03   08
## 68  2022-03-09 2022    03   09
## 69  2022-03-10 2022    03   10
## 70  2022-03-11 2022    03   11
## 71  2022-03-12 2022    03   12
## 72  2022-03-13 2022    03   13
## 73  2022-03-14 2022    03   14
## 74  2022-03-15 2022    03   15
## 75  2022-03-16 2022    03   16
## 76  2022-03-17 2022    03   17
## 77  2022-03-18 2022    03   18
## 78  2022-03-19 2022    03   19
## 79  2022-03-20 2022    03   20
## 80  2022-03-21 2022    03   21
## 81  2022-03-22 2022    03   22
## 82  2022-03-23 2022    03   23
## 83  2022-03-24 2022    03   24
## 84  2022-03-25 2022    03   25
## 85  2022-03-26 2022    03   26
## 86  2022-03-27 2022    03   27
## 87  2022-03-28 2022    03   28
## 88  2022-03-29 2022    03   29
## 89  2022-03-30 2022    03   30
## 90  2022-03-31 2022    03   31
## 91  2022-04-01 2022    04   01
## 92  2022-04-02 2022    04   02
## 93  2022-04-03 2022    04   03
## 94  2022-04-04 2022    04   04
## 95  2022-04-05 2022    04   05
## 96  2022-04-06 2022    04   06
## 97  2022-04-07 2022    04   07
## 98  2022-04-08 2022    04   08
## 99  2022-04-09 2022    04   09
## 100 2022-04-10 2022    04   10
## 101 2022-04-11 2022    04   11
## 102 2022-04-12 2022    04   12
## 103 2022-04-13 2022    04   13
## 104 2022-04-14 2022    04   14
## 105 2022-04-15 2022    04   15
## 106 2022-04-16 2022    04   16
## 107 2022-04-17 2022    04   17
## 108 2022-04-18 2022    04   18
## 109 2022-04-19 2022    04   19
## 110 2022-04-20 2022    04   20
## 111 2022-04-21 2022    04   21
## 112 2022-04-22 2022    04   22
## 113 2022-04-23 2022    04   23
## 114 2022-04-24 2022    04   24
## 115 2022-04-25 2022    04   25
## 116 2022-04-26 2022    04   26
## 117 2022-04-27 2022    04   27
## 118 2022-04-28 2022    04   28
## 119 2022-04-29 2022    04   29
## 120 2022-04-30 2022    04   30
## 121 2022-05-01 2022    05   01
## 122 2022-05-02 2022    05   02
## 123 2022-05-03 2022    05   03
## 124 2022-05-04 2022    05   04
## 125 2022-05-05 2022    05   05
## 126 2022-05-06 2022    05   06
## 127 2022-05-07 2022    05   07
## 128 2022-05-08 2022    05   08
## 129 2022-05-09 2022    05   09
## 130 2022-05-10 2022    05   10
## 131 2022-05-11 2022    05   11
## 132 2022-05-12 2022    05   12
## 133 2022-05-13 2022    05   13
## 134 2022-05-14 2022    05   14
## 135 2022-05-15 2022    05   15
## 136 2022-05-16 2022    05   16
## 137 2022-05-17 2022    05   17
## 138 2022-05-18 2022    05   18
## 139 2022-05-19 2022    05   19
## 140 2022-05-20 2022    05   20
## 141 2022-05-21 2022    05   21
## 142 2022-05-22 2022    05   22
## 143 2022-05-23 2022    05   23
## 144 2022-05-24 2022    05   24
## 145 2022-05-25 2022    05   25
## 146 2022-05-26 2022    05   26
## 147 2022-05-27 2022    05   27
## 148 2022-05-28 2022    05   28
## 149 2022-05-29 2022    05   29
## 150 2022-05-30 2022    05   30
## 151 2022-05-31 2022    05   31
## 152 2022-06-01 2022    06   01
## 153 2022-06-02 2022    06   02
## 154 2022-06-03 2022    06   03
## 155 2022-06-04 2022    06   04
## 156 2022-06-05 2022    06   05
## 157 2022-06-06 2022    06   06
## 158 2022-06-07 2022    06   07
## 159 2022-06-08 2022    06   08
## 160 2022-06-09 2022    06   09
## 161 2022-06-10 2022    06   10
## 162 2022-06-11 2022    06   11
## 163 2022-06-12 2022    06   12
## 164 2022-06-13 2022    06   13
## 165 2022-06-14 2022    06   14
## 166 2022-06-15 2022    06   15
## 167 2022-06-16 2022    06   16
## 168 2022-06-17 2022    06   17
## 169 2022-06-18 2022    06   18
## 170 2022-06-19 2022    06   19
## 171 2022-06-20 2022    06   20
## 172 2022-06-21 2022    06   21
## 173 2022-06-22 2022    06   22
## 174 2022-06-23 2022    06   23
## 175 2022-06-24 2022    06   24
## 176 2022-06-25 2022    06   25
## 177 2022-06-26 2022    06   26
## 178 2022-06-27 2022    06   27
## 179 2022-06-28 2022    06   28
## 180 2022-06-29 2022    06   29
## 181 2022-06-30 2022    06   30
## 182 2022-07-01 2022    07   01
## 183 2022-07-02 2022    07   02
## 184 2022-07-03 2022    07   03
## 185 2022-07-04 2022    07   04
## 186 2022-07-05 2022    07   05
## 187 2022-07-06 2022    07   06
## 188 2022-07-07 2022    07   07
## 189 2022-07-08 2022    07   08
## 190 2022-07-09 2022    07   09
## 191 2022-07-10 2022    07   10
## 192 2022-07-11 2022    07   11
## 193 2022-07-12 2022    07   12
## 194 2022-07-13 2022    07   13
## 195 2022-07-14 2022    07   14
## 196 2022-07-15 2022    07   15
## 197 2022-07-16 2022    07   16
## 198 2022-07-17 2022    07   17
## 199 2022-07-18 2022    07   18
## 200 2022-07-19 2022    07   19
## 201 2022-07-20 2022    07   20
## 202 2022-07-21 2022    07   21
## 203 2022-07-22 2022    07   22
## 204 2022-07-23 2022    07   23
## 205 2022-07-24 2022    07   24
## 206 2022-07-25 2022    07   25
## 207 2022-07-26 2022    07   26
## 208 2022-07-27 2022    07   27
## 209 2022-07-28 2022    07   28
## 210 2022-07-29 2022    07   29
## 211 2022-07-30 2022    07   30
## 212 2022-07-31 2022    07   31
## 213 2022-08-01 2022    08   01
## 214 2022-08-02 2022    08   02
## 215 2022-08-03 2022    08   03
## 216 2022-08-04 2022    08   04
## 217 2022-08-05 2022    08   05
## 218 2022-08-06 2022    08   06
## 219 2022-08-07 2022    08   07
## 220 2022-08-08 2022    08   08
## 221 2022-08-09 2022    08   09
## 222 2022-08-10 2022    08   10
## 223 2022-08-11 2022    08   11
## 224 2022-08-12 2022    08   12
## 225 2022-08-13 2022    08   13
## 226 2022-08-14 2022    08   14
## 227 2022-08-15 2022    08   15
## 228 2022-08-16 2022    08   16
## 229 2022-08-17 2022    08   17
## 230 2022-08-18 2022    08   18
## 231 2022-08-19 2022    08   19
## 232 2022-08-20 2022    08   20
## 233 2022-08-21 2022    08   21
## 234 2022-08-22 2022    08   22
## 235 2022-08-23 2022    08   23
## 236 2022-08-24 2022    08   24
## 237 2022-08-25 2022    08   25
## 238 2022-08-26 2022    08   26
## 239 2022-08-27 2022    08   27
## 240 2022-08-28 2022    08   28
## 241 2022-08-29 2022    08   29
## 242 2022-08-30 2022    08   30
## 243 2022-08-31 2022    08   31
## 244 2022-09-01 2022    09   01
## 245 2022-09-02 2022    09   02
## 246 2022-09-03 2022    09   03
## 247 2022-09-04 2022    09   04
## 248 2022-09-05 2022    09   05
## 249 2022-09-06 2022    09   06
## 250 2022-09-07 2022    09   07
## 251 2022-09-08 2022    09   08
## 252 2022-09-09 2022    09   09
## 253 2022-09-10 2022    09   10
## 254 2022-09-11 2022    09   11
## 255 2022-09-12 2022    09   12
## 256 2022-09-13 2022    09   13
## 257 2022-09-14 2022    09   14
## 258 2022-09-15 2022    09   15
## 259 2022-09-16 2022    09   16
## 260 2022-09-17 2022    09   17
## 261 2022-09-18 2022    09   18
## 262 2022-09-19 2022    09   19
## 263 2022-09-20 2022    09   20
## 264 2022-09-21 2022    09   21
## 265 2022-09-22 2022    09   22
## 266 2022-09-23 2022    09   23
## 267 2022-09-24 2022    09   24
## 268 2022-09-25 2022    09   25
## 269 2022-09-26 2022    09   26
## 270 2022-09-27 2022    09   27
## 271 2022-09-28 2022    09   28
## 272 2022-09-29 2022    09   29
## 273 2022-09-30 2022    09   30
## 274 2022-10-01 2022    10   01
## 275 2022-10-02 2022    10   02
## 276 2022-10-03 2022    10   03
## 277 2022-10-04 2022    10   04
## 278 2022-10-05 2022    10   05
## 279 2022-10-06 2022    10   06
## 280 2022-10-07 2022    10   07
## 281 2022-10-08 2022    10   08
## 282 2022-10-09 2022    10   09
## 283 2022-10-10 2022    10   10
## 284 2022-10-11 2022    10   11
## 285 2022-10-12 2022    10   12
## 286 2022-10-13 2022    10   13
## 287 2022-10-14 2022    10   14
## 288 2022-10-15 2022    10   15
## 289 2022-10-16 2022    10   16
## 290 2022-10-17 2022    10   17
## 291 2022-10-18 2022    10   18
## 292 2022-10-19 2022    10   19
## 293 2022-10-20 2022    10   20
## 294 2022-10-21 2022    10   21
## 295 2022-10-22 2022    10   22
## 296 2022-10-23 2022    10   23
## 297 2022-10-24 2022    10   24
## 298 2022-10-25 2022    10   25
## 299 2022-10-26 2022    10   26
## 300 2022-10-27 2022    10   27
## 301 2022-10-28 2022    10   28
## 302 2022-10-29 2022    10   29
## 303 2022-10-30 2022    10   30
## 304 2022-10-31 2022    10   31
## 305 2022-11-01 2022    11   01
## 306 2022-11-02 2022    11   02
## 307 2022-11-03 2022    11   03
## 308 2022-11-04 2022    11   04
## 309 2022-11-05 2022    11   05
## 310 2022-11-06 2022    11   06
## 311 2022-11-07 2022    11   07
## 312 2022-11-08 2022    11   08
## 313 2022-11-09 2022    11   09
## 314 2022-11-10 2022    11   10
## 315 2022-11-11 2022    11   11
## 316 2022-11-12 2022    11   12
## 317 2022-11-13 2022    11   13
## 318 2022-11-14 2022    11   14
## 319 2022-11-15 2022    11   15
## 320 2022-11-16 2022    11   16
## 321 2022-11-17 2022    11   17
## 322 2022-11-18 2022    11   18
## 323 2022-11-19 2022    11   19
## 324 2022-11-20 2022    11   20
## 325 2022-11-21 2022    11   21
## 326 2022-11-22 2022    11   22
## 327 2022-11-23 2022    11   23
## 328 2022-11-24 2022    11   24
## 329 2022-11-25 2022    11   25
## 330 2022-11-26 2022    11   26
## 331 2022-11-27 2022    11   27
## 332 2022-11-28 2022    11   28
## 333 2022-11-29 2022    11   29
## 334 2022-11-30 2022    11   30
## 335 2022-12-01 2022    12   01
## 336 2022-12-02 2022    12   02
## 337 2022-12-03 2022    12   03
## 338 2022-12-04 2022    12   04
## 339 2022-12-05 2022    12   05
## 340 2022-12-06 2022    12   06
## 341 2022-12-07 2022    12   07
## 342 2022-12-08 2022    12   08
## 343 2022-12-09 2022    12   09
## 344 2022-12-10 2022    12   10
## 345 2022-12-11 2022    12   11
## 346 2022-12-12 2022    12   12
## 347 2022-12-13 2022    12   13
## 348 2022-12-14 2022    12   14
## 349 2022-12-15 2022    12   15
## 350 2022-12-16 2022    12   16
## 351 2022-12-17 2022    12   17
## 352 2022-12-18 2022    12   18
## 353 2022-12-19 2022    12   19
## 354 2022-12-20 2022    12   20
## 355 2022-12-21 2022    12   21
## 356 2022-12-22 2022    12   22
## 357 2022-12-23 2022    12   23
## 358 2022-12-24 2022    12   24
## 359 2022-12-25 2022    12   25
## 360 2022-12-26 2022    12   26
## 361 2022-12-27 2022    12   27
## 362 2022-12-28 2022    12   28
## 363 2022-12-29 2022    12   29
## 364 2022-12-30 2022    12   30
## 365 2022-12-31 2022    12   31
# another way to add leading zeros, using mutate_at
sptdatedf %>% mutate_at(.vars = c("month", "date"),
                        .funs = str_pad, width = 2, side = "left", pad = 0) %>% 
        unite(col = "full date", c("year", "month", "date"), sep = "-", remove = F)
##      full date year month date
## 1   2022-01-01 2022    01   01
## 2   2022-01-02 2022    01   02
## 3   2022-01-03 2022    01   03
## 4   2022-01-04 2022    01   04
## 5   2022-01-05 2022    01   05
## 6   2022-01-06 2022    01   06
## 7   2022-01-07 2022    01   07
## 8   2022-01-08 2022    01   08
## 9   2022-01-09 2022    01   09
## 10  2022-01-10 2022    01   10
## 11  2022-01-11 2022    01   11
## 12  2022-01-12 2022    01   12
## 13  2022-01-13 2022    01   13
## 14  2022-01-14 2022    01   14
## 15  2022-01-15 2022    01   15
## 16  2022-01-16 2022    01   16
## 17  2022-01-17 2022    01   17
## 18  2022-01-18 2022    01   18
## 19  2022-01-19 2022    01   19
## 20  2022-01-20 2022    01   20
## 21  2022-01-21 2022    01   21
## 22  2022-01-22 2022    01   22
## 23  2022-01-23 2022    01   23
## 24  2022-01-24 2022    01   24
## 25  2022-01-25 2022    01   25
## 26  2022-01-26 2022    01   26
## 27  2022-01-27 2022    01   27
## 28  2022-01-28 2022    01   28
## 29  2022-01-29 2022    01   29
## 30  2022-01-30 2022    01   30
## 31  2022-01-31 2022    01   31
## 32  2022-02-01 2022    02   01
## 33  2022-02-02 2022    02   02
## 34  2022-02-03 2022    02   03
## 35  2022-02-04 2022    02   04
## 36  2022-02-05 2022    02   05
## 37  2022-02-06 2022    02   06
## 38  2022-02-07 2022    02   07
## 39  2022-02-08 2022    02   08
## 40  2022-02-09 2022    02   09
## 41  2022-02-10 2022    02   10
## 42  2022-02-11 2022    02   11
## 43  2022-02-12 2022    02   12
## 44  2022-02-13 2022    02   13
## 45  2022-02-14 2022    02   14
## 46  2022-02-15 2022    02   15
## 47  2022-02-16 2022    02   16
## 48  2022-02-17 2022    02   17
## 49  2022-02-18 2022    02   18
## 50  2022-02-19 2022    02   19
## 51  2022-02-20 2022    02   20
## 52  2022-02-21 2022    02   21
## 53  2022-02-22 2022    02   22
## 54  2022-02-23 2022    02   23
## 55  2022-02-24 2022    02   24
## 56  2022-02-25 2022    02   25
## 57  2022-02-26 2022    02   26
## 58  2022-02-27 2022    02   27
## 59  2022-02-28 2022    02   28
## 60  2022-03-01 2022    03   01
## 61  2022-03-02 2022    03   02
## 62  2022-03-03 2022    03   03
## 63  2022-03-04 2022    03   04
## 64  2022-03-05 2022    03   05
## 65  2022-03-06 2022    03   06
## 66  2022-03-07 2022    03   07
## 67  2022-03-08 2022    03   08
## 68  2022-03-09 2022    03   09
## 69  2022-03-10 2022    03   10
## 70  2022-03-11 2022    03   11
## 71  2022-03-12 2022    03   12
## 72  2022-03-13 2022    03   13
## 73  2022-03-14 2022    03   14
## 74  2022-03-15 2022    03   15
## 75  2022-03-16 2022    03   16
## 76  2022-03-17 2022    03   17
## 77  2022-03-18 2022    03   18
## 78  2022-03-19 2022    03   19
## 79  2022-03-20 2022    03   20
## 80  2022-03-21 2022    03   21
## 81  2022-03-22 2022    03   22
## 82  2022-03-23 2022    03   23
## 83  2022-03-24 2022    03   24
## 84  2022-03-25 2022    03   25
## 85  2022-03-26 2022    03   26
## 86  2022-03-27 2022    03   27
## 87  2022-03-28 2022    03   28
## 88  2022-03-29 2022    03   29
## 89  2022-03-30 2022    03   30
## 90  2022-03-31 2022    03   31
## 91  2022-04-01 2022    04   01
## 92  2022-04-02 2022    04   02
## 93  2022-04-03 2022    04   03
## 94  2022-04-04 2022    04   04
## 95  2022-04-05 2022    04   05
## 96  2022-04-06 2022    04   06
## 97  2022-04-07 2022    04   07
## 98  2022-04-08 2022    04   08
## 99  2022-04-09 2022    04   09
## 100 2022-04-10 2022    04   10
## 101 2022-04-11 2022    04   11
## 102 2022-04-12 2022    04   12
## 103 2022-04-13 2022    04   13
## 104 2022-04-14 2022    04   14
## 105 2022-04-15 2022    04   15
## 106 2022-04-16 2022    04   16
## 107 2022-04-17 2022    04   17
## 108 2022-04-18 2022    04   18
## 109 2022-04-19 2022    04   19
## 110 2022-04-20 2022    04   20
## 111 2022-04-21 2022    04   21
## 112 2022-04-22 2022    04   22
## 113 2022-04-23 2022    04   23
## 114 2022-04-24 2022    04   24
## 115 2022-04-25 2022    04   25
## 116 2022-04-26 2022    04   26
## 117 2022-04-27 2022    04   27
## 118 2022-04-28 2022    04   28
## 119 2022-04-29 2022    04   29
## 120 2022-04-30 2022    04   30
## 121 2022-05-01 2022    05   01
## 122 2022-05-02 2022    05   02
## 123 2022-05-03 2022    05   03
## 124 2022-05-04 2022    05   04
## 125 2022-05-05 2022    05   05
## 126 2022-05-06 2022    05   06
## 127 2022-05-07 2022    05   07
## 128 2022-05-08 2022    05   08
## 129 2022-05-09 2022    05   09
## 130 2022-05-10 2022    05   10
## 131 2022-05-11 2022    05   11
## 132 2022-05-12 2022    05   12
## 133 2022-05-13 2022    05   13
## 134 2022-05-14 2022    05   14
## 135 2022-05-15 2022    05   15
## 136 2022-05-16 2022    05   16
## 137 2022-05-17 2022    05   17
## 138 2022-05-18 2022    05   18
## 139 2022-05-19 2022    05   19
## 140 2022-05-20 2022    05   20
## 141 2022-05-21 2022    05   21
## 142 2022-05-22 2022    05   22
## 143 2022-05-23 2022    05   23
## 144 2022-05-24 2022    05   24
## 145 2022-05-25 2022    05   25
## 146 2022-05-26 2022    05   26
## 147 2022-05-27 2022    05   27
## 148 2022-05-28 2022    05   28
## 149 2022-05-29 2022    05   29
## 150 2022-05-30 2022    05   30
## 151 2022-05-31 2022    05   31
## 152 2022-06-01 2022    06   01
## 153 2022-06-02 2022    06   02
## 154 2022-06-03 2022    06   03
## 155 2022-06-04 2022    06   04
## 156 2022-06-05 2022    06   05
## 157 2022-06-06 2022    06   06
## 158 2022-06-07 2022    06   07
## 159 2022-06-08 2022    06   08
## 160 2022-06-09 2022    06   09
## 161 2022-06-10 2022    06   10
## 162 2022-06-11 2022    06   11
## 163 2022-06-12 2022    06   12
## 164 2022-06-13 2022    06   13
## 165 2022-06-14 2022    06   14
## 166 2022-06-15 2022    06   15
## 167 2022-06-16 2022    06   16
## 168 2022-06-17 2022    06   17
## 169 2022-06-18 2022    06   18
## 170 2022-06-19 2022    06   19
## 171 2022-06-20 2022    06   20
## 172 2022-06-21 2022    06   21
## 173 2022-06-22 2022    06   22
## 174 2022-06-23 2022    06   23
## 175 2022-06-24 2022    06   24
## 176 2022-06-25 2022    06   25
## 177 2022-06-26 2022    06   26
## 178 2022-06-27 2022    06   27
## 179 2022-06-28 2022    06   28
## 180 2022-06-29 2022    06   29
## 181 2022-06-30 2022    06   30
## 182 2022-07-01 2022    07   01
## 183 2022-07-02 2022    07   02
## 184 2022-07-03 2022    07   03
## 185 2022-07-04 2022    07   04
## 186 2022-07-05 2022    07   05
## 187 2022-07-06 2022    07   06
## 188 2022-07-07 2022    07   07
## 189 2022-07-08 2022    07   08
## 190 2022-07-09 2022    07   09
## 191 2022-07-10 2022    07   10
## 192 2022-07-11 2022    07   11
## 193 2022-07-12 2022    07   12
## 194 2022-07-13 2022    07   13
## 195 2022-07-14 2022    07   14
## 196 2022-07-15 2022    07   15
## 197 2022-07-16 2022    07   16
## 198 2022-07-17 2022    07   17
## 199 2022-07-18 2022    07   18
## 200 2022-07-19 2022    07   19
## 201 2022-07-20 2022    07   20
## 202 2022-07-21 2022    07   21
## 203 2022-07-22 2022    07   22
## 204 2022-07-23 2022    07   23
## 205 2022-07-24 2022    07   24
## 206 2022-07-25 2022    07   25
## 207 2022-07-26 2022    07   26
## 208 2022-07-27 2022    07   27
## 209 2022-07-28 2022    07   28
## 210 2022-07-29 2022    07   29
## 211 2022-07-30 2022    07   30
## 212 2022-07-31 2022    07   31
## 213 2022-08-01 2022    08   01
## 214 2022-08-02 2022    08   02
## 215 2022-08-03 2022    08   03
## 216 2022-08-04 2022    08   04
## 217 2022-08-05 2022    08   05
## 218 2022-08-06 2022    08   06
## 219 2022-08-07 2022    08   07
## 220 2022-08-08 2022    08   08
## 221 2022-08-09 2022    08   09
## 222 2022-08-10 2022    08   10
## 223 2022-08-11 2022    08   11
## 224 2022-08-12 2022    08   12
## 225 2022-08-13 2022    08   13
## 226 2022-08-14 2022    08   14
## 227 2022-08-15 2022    08   15
## 228 2022-08-16 2022    08   16
## 229 2022-08-17 2022    08   17
## 230 2022-08-18 2022    08   18
## 231 2022-08-19 2022    08   19
## 232 2022-08-20 2022    08   20
## 233 2022-08-21 2022    08   21
## 234 2022-08-22 2022    08   22
## 235 2022-08-23 2022    08   23
## 236 2022-08-24 2022    08   24
## 237 2022-08-25 2022    08   25
## 238 2022-08-26 2022    08   26
## 239 2022-08-27 2022    08   27
## 240 2022-08-28 2022    08   28
## 241 2022-08-29 2022    08   29
## 242 2022-08-30 2022    08   30
## 243 2022-08-31 2022    08   31
## 244 2022-09-01 2022    09   01
## 245 2022-09-02 2022    09   02
## 246 2022-09-03 2022    09   03
## 247 2022-09-04 2022    09   04
## 248 2022-09-05 2022    09   05
## 249 2022-09-06 2022    09   06
## 250 2022-09-07 2022    09   07
## 251 2022-09-08 2022    09   08
## 252 2022-09-09 2022    09   09
## 253 2022-09-10 2022    09   10
## 254 2022-09-11 2022    09   11
## 255 2022-09-12 2022    09   12
## 256 2022-09-13 2022    09   13
## 257 2022-09-14 2022    09   14
## 258 2022-09-15 2022    09   15
## 259 2022-09-16 2022    09   16
## 260 2022-09-17 2022    09   17
## 261 2022-09-18 2022    09   18
## 262 2022-09-19 2022    09   19
## 263 2022-09-20 2022    09   20
## 264 2022-09-21 2022    09   21
## 265 2022-09-22 2022    09   22
## 266 2022-09-23 2022    09   23
## 267 2022-09-24 2022    09   24
## 268 2022-09-25 2022    09   25
## 269 2022-09-26 2022    09   26
## 270 2022-09-27 2022    09   27
## 271 2022-09-28 2022    09   28
## 272 2022-09-29 2022    09   29
## 273 2022-09-30 2022    09   30
## 274 2022-10-01 2022    10   01
## 275 2022-10-02 2022    10   02
## 276 2022-10-03 2022    10   03
## 277 2022-10-04 2022    10   04
## 278 2022-10-05 2022    10   05
## 279 2022-10-06 2022    10   06
## 280 2022-10-07 2022    10   07
## 281 2022-10-08 2022    10   08
## 282 2022-10-09 2022    10   09
## 283 2022-10-10 2022    10   10
## 284 2022-10-11 2022    10   11
## 285 2022-10-12 2022    10   12
## 286 2022-10-13 2022    10   13
## 287 2022-10-14 2022    10   14
## 288 2022-10-15 2022    10   15
## 289 2022-10-16 2022    10   16
## 290 2022-10-17 2022    10   17
## 291 2022-10-18 2022    10   18
## 292 2022-10-19 2022    10   19
## 293 2022-10-20 2022    10   20
## 294 2022-10-21 2022    10   21
## 295 2022-10-22 2022    10   22
## 296 2022-10-23 2022    10   23
## 297 2022-10-24 2022    10   24
## 298 2022-10-25 2022    10   25
## 299 2022-10-26 2022    10   26
## 300 2022-10-27 2022    10   27
## 301 2022-10-28 2022    10   28
## 302 2022-10-29 2022    10   29
## 303 2022-10-30 2022    10   30
## 304 2022-10-31 2022    10   31
## 305 2022-11-01 2022    11   01
## 306 2022-11-02 2022    11   02
## 307 2022-11-03 2022    11   03
## 308 2022-11-04 2022    11   04
## 309 2022-11-05 2022    11   05
## 310 2022-11-06 2022    11   06
## 311 2022-11-07 2022    11   07
## 312 2022-11-08 2022    11   08
## 313 2022-11-09 2022    11   09
## 314 2022-11-10 2022    11   10
## 315 2022-11-11 2022    11   11
## 316 2022-11-12 2022    11   12
## 317 2022-11-13 2022    11   13
## 318 2022-11-14 2022    11   14
## 319 2022-11-15 2022    11   15
## 320 2022-11-16 2022    11   16
## 321 2022-11-17 2022    11   17
## 322 2022-11-18 2022    11   18
## 323 2022-11-19 2022    11   19
## 324 2022-11-20 2022    11   20
## 325 2022-11-21 2022    11   21
## 326 2022-11-22 2022    11   22
## 327 2022-11-23 2022    11   23
## 328 2022-11-24 2022    11   24
## 329 2022-11-25 2022    11   25
## 330 2022-11-26 2022    11   26
## 331 2022-11-27 2022    11   27
## 332 2022-11-28 2022    11   28
## 333 2022-11-29 2022    11   29
## 334 2022-11-30 2022    11   30
## 335 2022-12-01 2022    12   01
## 336 2022-12-02 2022    12   02
## 337 2022-12-03 2022    12   03
## 338 2022-12-04 2022    12   04
## 339 2022-12-05 2022    12   05
## 340 2022-12-06 2022    12   06
## 341 2022-12-07 2022    12   07
## 342 2022-12-08 2022    12   08
## 343 2022-12-09 2022    12   09
## 344 2022-12-10 2022    12   10
## 345 2022-12-11 2022    12   11
## 346 2022-12-12 2022    12   12
## 347 2022-12-13 2022    12   13
## 348 2022-12-14 2022    12   14
## 349 2022-12-15 2022    12   15
## 350 2022-12-16 2022    12   16
## 351 2022-12-17 2022    12   17
## 352 2022-12-18 2022    12   18
## 353 2022-12-19 2022    12   19
## 354 2022-12-20 2022    12   20
## 355 2022-12-21 2022    12   21
## 356 2022-12-22 2022    12   22
## 357 2022-12-23 2022    12   23
## 358 2022-12-24 2022    12   24
## 359 2022-12-25 2022    12   25
## 360 2022-12-26 2022    12   26
## 361 2022-12-27 2022    12   27
## 362 2022-12-28 2022    12   28
## 363 2022-12-29 2022    12   29
## 364 2022-12-30 2022    12   30
## 365 2022-12-31 2022    12   31

pull()

extract column values as a vector

df %>% pull(hwy)
##   [1] 29 29 31 30 26 26 27 26 25 28 27 25 25 25 25 24 25 23 20 15 20 17 17 26 23
##  [26] 26 25 24 19 14 15 17 27 30 26 29 26 24 24 22 22 24 24 17 22 21 23 23 19 18
##  [51] 17 17 19 19 12 17 15 17 17 12 17 16 18 15 16 12 17 17 16 12 15 16 17 15 17
##  [76] 17 18 17 19 17 19 19 17 17 17 16 16 17 15 17 26 25 26 24 21 22 23 22 20 33
## [101] 32 32 29 32 34 36 36 29 26 27 30 31 26 26 28 26 29 28 27 24 24 24 22 19 20
## [126] 17 12 19 18 14 15 18 18 15 17 16 18 17 19 19 17 29 27 31 32 27 26 26 25 25
## [151] 17 17 20 18 26 26 27 28 25 25 24 27 25 26 23 26 26 26 26 25 27 25 27 20 20
## [176] 19 17 20 17 29 27 31 31 26 26 28 27 29 31 31 26 26 27 30 33 35 37 35 15 18
## [201] 20 20 22 17 19 18 20 29 26 29 29 24 44 29 26 29 29 29 29 23 24 44 41 29 26
## [226] 28 29 29 29 28 29 26 26 26
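
pull() is especially handy at the end of a pipeline, when a plain vector is needed to feed into a base function; a minimal sketch:

df %>% filter(manufacturer == "audi") %>% pull(hwy) %>% mean()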

combining group_by and mutate

df %>% group_by(manufacturer, model) %>% mutate(`avg hwy` = mean(hwy))
## # A tibble: 234 × 15
## # Groups:   manufacturer, model [38]
##    manufac…¹ model displ  year   cyl trans drv     cty   hwy fl    class avg m…²
##    <chr>     <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>   <dbl>
##  1 audi      a4      1.8  1999     4 auto… f        18    29 p     comp…    23.5
##  2 audi      a4      1.8  1999     4 manu… f        21    29 p     comp…    25  
##  3 audi      a4      2    2008     4 manu… f        20    31 p     comp…    25.5
##  4 audi      a4      2    2008     4 auto… f        21    30 p     comp…    25.5
##  5 audi      a4      2.8  1999     6 auto… f        16    26 p     comp…    21  
##  6 audi      a4      2.8  1999     6 manu… f        18    26 p     comp…    22  
##  7 audi      a4      3.1  2008     6 auto… f        18    27 p     comp…    22.5
##  8 audi      a4 q…   1.8  1999     4 manu… 4        18    26 p     comp…    22  
##  9 audi      a4 q…   1.8  1999     4 auto… 4        16    25 p     comp…    20.5
## 10 audi      a4 q…   2    2008     4 manu… 4        20    28 p     comp…    24  
## # … with 224 more rows, 3 more variables: car <chr>, `cyl and trans` <chr>,
## #   `avg hwy` <dbl>, and abbreviated variable names ¹​manufacturer,
## #   ²​`avg mile per galon`
# the difference between group_by + mutate and group_by + summarise is that the first
# adds the summarised value as a column to every corresponding row, while the second
# collapses each group into a single summarised row without keeping the original rows.

df %>% group_by(manufacturer, model) %>% summarise(mean(hwy))
## `summarise()` has grouped output by 'manufacturer'. You can override using the
## `.groups` argument.
## # A tibble: 38 × 3
## # Groups:   manufacturer [15]
##    manufacturer model              `mean(hwy)`
##    <chr>        <chr>                    <dbl>
##  1 audi         a4                        28.3
##  2 audi         a4 quattro                25.8
##  3 audi         a6 quattro                24  
##  4 chevrolet    c1500 suburban 2wd        17.8
##  5 chevrolet    corvette                  24.8
##  6 chevrolet    k1500 tahoe 4wd           16.2
##  7 chevrolet    malibu                    27.6
##  8 dodge        caravan 2wd               22.4
##  9 dodge        dakota pickup 4wd         17  
## 10 dodge        durango 4wd               16  
## # … with 28 more rows
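
one common reason to keep the per-group summary as a column is to compare each row against its own group; a small sketch flagging cars that beat their model's average highway mileage:

df %>% group_by(manufacturer, model) %>%
        mutate(`avg hwy` = mean(hwy),
               `above avg` = hwy > `avg hwy`) %>%
        ungroup()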

case_when()

just like case when in SQL

# data.table is loaded to use its %like% operator, similar to the LIKE clause in SQL
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## The following object is masked from 'package:purrr':
## 
##     transpose
# create a column identifying the transmission type:
# automatic is coded as AT and manual as MT
df1 <- df %>% mutate(`trans type` = case_when(
        trans %like% "auto.*" ~ "AT",
        trans %like% "manual.*" ~ "MT",
        TRUE ~ "NA")
)
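
a tidyverse-only alternative (no data.table needed) uses stringr::str_detect() for the pattern match; a sketch, noting that NA_character_ yields a real missing value, whereas "NA" above is just the literal string:

df %>% mutate(`trans type` = case_when(
        str_detect(trans, "auto") ~ "AT",
        str_detect(trans, "manual") ~ "MT",
        TRUE ~ NA_character_)
)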

let’s play

df2 <- df1 %>% count(manufacturer, `trans type`) # long format
# convert to wide format
df2 %>% pivot_wider(names_from = `trans type`, values_from = n) # this result will be similar
## # A tibble: 15 × 3
##    manufacturer    AT    MT
##    <chr>        <int> <int>
##  1 audi            11     7
##  2 chevrolet       16     3
##  3 dodge           30     7
##  4 ford            17     8
##  5 honda            4     5
##  6 hyundai          7     7
##  7 jeep             8    NA
##  8 land rover       4    NA
##  9 lincoln          3    NA
## 10 mercury          4    NA
## 11 nissan           8     5
## 12 pontiac          5    NA
## 13 subaru           7     7
## 14 toyota          20    14
## 15 volkswagen      13    14
# to this syntax:
table(df1$manufacturer, df1$`trans type`)
##             
##              AT MT
##   audi       11  7
##   chevrolet  16  3
##   dodge      30  7
##   ford       17  8
##   honda       4  5
##   hyundai     7  7
##   jeep        8  0
##   land rover  4  0
##   lincoln     3  0
##   mercury     4  0
##   nissan      8  5
##   pontiac     5  0
##   subaru      7  7
##   toyota     20 14
##   volkswagen 13 14
# but table() shows missing combinations as a numeric 0, whereas pivot_wider() leaves them as NA by default
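
to make pivot_wider() behave like table() here, the missing combinations can be filled with 0 via values_fill (available in recent tidyr versions):

df2 %>% pivot_wider(names_from = `trans type`, values_from = n, values_fill = 0)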

row_number()

assign a sequential number to each row

# adding id to car data
df %>% mutate(`car id` = row_number()) %>% print(n = 20)
## # A tibble: 234 × 15
##    manufac…¹ model displ  year   cyl trans drv     cty   hwy fl    class avg m…²
##    <chr>     <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>   <dbl>
##  1 audi      a4      1.8  1999     4 auto… f        18    29 p     comp…    23.5
##  2 audi      a4      1.8  1999     4 manu… f        21    29 p     comp…    25  
##  3 audi      a4      2    2008     4 manu… f        20    31 p     comp…    25.5
##  4 audi      a4      2    2008     4 auto… f        21    30 p     comp…    25.5
##  5 audi      a4      2.8  1999     6 auto… f        16    26 p     comp…    21  
##  6 audi      a4      2.8  1999     6 manu… f        18    26 p     comp…    22  
##  7 audi      a4      3.1  2008     6 auto… f        18    27 p     comp…    22.5
##  8 audi      a4 q…   1.8  1999     4 manu… 4        18    26 p     comp…    22  
##  9 audi      a4 q…   1.8  1999     4 auto… 4        16    25 p     comp…    20.5
## 10 audi      a4 q…   2    2008     4 manu… 4        20    28 p     comp…    24  
## 11 audi      a4 q…   2    2008     4 auto… 4        19    27 p     comp…    23  
## 12 audi      a4 q…   2.8  1999     6 auto… 4        15    25 p     comp…    20  
## 13 audi      a4 q…   2.8  1999     6 manu… 4        17    25 p     comp…    21  
## 14 audi      a4 q…   3.1  2008     6 auto… 4        17    25 p     comp…    21  
## 15 audi      a4 q…   3.1  2008     6 manu… 4        15    25 p     comp…    20  
## 16 audi      a6 q…   2.8  1999     6 auto… 4        15    24 p     mids…    19.5
## 17 audi      a6 q…   3.1  2008     6 auto… 4        17    25 p     mids…    21  
## 18 audi      a6 q…   4.2  2008     8 auto… 4        16    23 p     mids…    19.5
## 19 chevrolet c150…   5.3  2008     8 auto… r        14    20 r     suv      17  
## 20 chevrolet c150…   5.3  2008     8 auto… r        11    15 e     suv      13  
## # … with 214 more rows, 3 more variables: car <chr>, `cyl and trans` <chr>,
## #   `car id` <int>, and abbreviated variable names ¹​manufacturer,
## #   ²​`avg mile per galon`

create an id within each group

# this creates an incrementing id within each group
df %>% group_by(model) %>% mutate(`mod id` = row_number())
## # A tibble: 234 × 15
## # Groups:   model [38]
##    manufac…¹ model displ  year   cyl trans drv     cty   hwy fl    class avg m…²
##    <chr>     <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>   <dbl>
##  1 audi      a4      1.8  1999     4 auto… f        18    29 p     comp…    23.5
##  2 audi      a4      1.8  1999     4 manu… f        21    29 p     comp…    25  
##  3 audi      a4      2    2008     4 manu… f        20    31 p     comp…    25.5
##  4 audi      a4      2    2008     4 auto… f        21    30 p     comp…    25.5
##  5 audi      a4      2.8  1999     6 auto… f        16    26 p     comp…    21  
##  6 audi      a4      2.8  1999     6 manu… f        18    26 p     comp…    22  
##  7 audi      a4      3.1  2008     6 auto… f        18    27 p     comp…    22.5
##  8 audi      a4 q…   1.8  1999     4 manu… 4        18    26 p     comp…    22  
##  9 audi      a4 q…   1.8  1999     4 auto… 4        16    25 p     comp…    20.5
## 10 audi      a4 q…   2    2008     4 manu… 4        20    28 p     comp…    24  
## # … with 224 more rows, 3 more variables: car <chr>, `cyl and trans` <chr>,
## #   `mod id` <int>, and abbreviated variable names ¹​manufacturer,
## #   ²​`avg mile per galon`
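
combined with arrange(), row_number() also works as a rank; a small sketch ranking cars within each model from highest to lowest highway mileage:

df %>% group_by(model) %>%
        arrange(desc(hwy), .by_group = TRUE) %>%
        mutate(`hwy rank` = row_number()) %>%
        ungroup()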

mutate FAMILY

mutate_all() applies a function to all columns
mutate_if() applies a function to the columns that meet a condition
mutate_at() applies a function to specific columns
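
note that since dplyr 1.0 these three verbs are superseded by across(); a minimal sketch of the equivalents, still using the mpg data in df:

df %>% mutate(across(everything(), as.character))   # like mutate_all()
df %>% mutate(across(where(is.numeric), ~ . * 2))   # like mutate_if()
df %>% mutate(across(c(cty, hwy), round))           # like mutate_at()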

let’s put dplyr and tidyr into action

load data

# install.packages("hflights")
library(hflights)

df <- hflights
class(df)
## [1] "data.frame"

getting acquainted with the data

glimpse(df)
## Rows: 227,496
## Columns: 21
## $ Year              <int> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011…
## $ Month             <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ DayofMonth        <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1…
## $ DayOfWeek         <int> 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2…
## $ DepTime           <int> 1400, 1401, 1352, 1403, 1405, 1359, 1359, 1355, 1443…
## $ ArrTime           <int> 1500, 1501, 1502, 1513, 1507, 1503, 1509, 1454, 1554…
## $ UniqueCarrier     <chr> "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA"…
## $ FlightNum         <int> 428, 428, 428, 428, 428, 428, 428, 428, 428, 428, 42…
## $ TailNum           <chr> "N576AA", "N557AA", "N541AA", "N403AA", "N492AA", "N…
## $ ActualElapsedTime <int> 60, 60, 70, 70, 62, 64, 70, 59, 71, 70, 70, 56, 63, …
## $ AirTime           <int> 40, 45, 48, 39, 44, 45, 43, 40, 41, 45, 42, 41, 44, …
## $ ArrDelay          <int> -10, -9, -8, 3, -3, -7, -1, -16, 44, 43, 29, 5, -9, …
## $ DepDelay          <int> 0, 1, -8, 3, 5, -1, -1, -5, 43, 43, 29, 19, -2, -3, …
## $ Origin            <chr> "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "IA…
## $ Dest              <chr> "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "DF…
## $ Distance          <int> 224, 224, 224, 224, 224, 224, 224, 224, 224, 224, 22…
## $ TaxiIn            <int> 7, 6, 5, 9, 9, 6, 12, 7, 8, 6, 8, 4, 6, 5, 6, 12, 8,…
## $ TaxiOut           <int> 13, 9, 17, 22, 9, 13, 15, 12, 22, 19, 20, 11, 13, 15…
## $ Cancelled         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ CancellationCode  <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", …
## $ Diverted          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…

there are more than 200 thousand records

Checking null values

# check globally
anyNA(df) # or using
## [1] TRUE
any(is.na(df))
## [1] TRUE
# find the total number of missing values in each column
colSums(is.na(df))
##              Year             Month        DayofMonth         DayOfWeek 
##                 0                 0                 0                 0 
##           DepTime           ArrTime     UniqueCarrier         FlightNum 
##              2905              3066                 0                 0 
##           TailNum ActualElapsedTime           AirTime          ArrDelay 
##                 0              3622              3622              3622 
##          DepDelay            Origin              Dest          Distance 
##              2905                 0                 0                 0 
##            TaxiIn           TaxiOut         Cancelled  CancellationCode 
##              3066              2947                 0                 0 
##          Diverted 
##                 0
# find only the columns (and their positions) that contain missing values
which(colSums(is.na(df))>0)
##           DepTime           ArrTime ActualElapsedTime           AirTime 
##                 5                 6                10                11 
##          ArrDelay          DepDelay            TaxiIn           TaxiOut 
##                12                13                17                18
# display the names of the columns that contain missing values
names(which(colSums(is.na(df))>0))
## [1] "DepTime"           "ArrTime"           "ActualElapsedTime"
## [4] "AirTime"           "ArrDelay"          "DepDelay"         
## [7] "TaxiIn"            "TaxiOut"

another way to identify columns with null values

colnames(df)[colSums(is.na(df))>0]
## [1] "DepTime"           "ArrTime"           "ActualElapsedTime"
## [4] "AirTime"           "ArrDelay"          "DepDelay"         
## [7] "TaxiIn"            "TaxiOut"

or using apply

# apply(df, 2, anyNA)

# which(apply(df, 2, anyNA))

names(which(apply(df, 2, anyNA)))
## [1] "DepTime"           "ArrTime"           "ActualElapsedTime"
## [4] "AirTime"           "ArrDelay"          "DepDelay"         
## [7] "TaxiIn"            "TaxiOut"

but the third way takes longer to retrieve the results, since apply() first coerces the whole data frame to a matrix

place the list of columns that contain null values in a data frame

NullCol <- data.frame(col = names(which(colSums(is.na(df))>0)))
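
an equivalent tidyverse sketch counts the missing values per column with across() and keeps only the columns that have any:

df %>% summarise(across(everything(), ~ sum(is.na(.)))) %>%
        pivot_longer(everything(), names_to = "col", values_to = "n_missing") %>%
        filter(n_missing > 0)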

adjust the format of each column to ensure data integrity

at first glance we can see that some columns are not formatted properly

glimpse(df)
## Rows: 227,496
## Columns: 21
## $ Year              <int> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011…
## $ Month             <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ DayofMonth        <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1…
## $ DayOfWeek         <int> 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2…
## $ DepTime           <int> 1400, 1401, 1352, 1403, 1405, 1359, 1359, 1355, 1443…
## $ ArrTime           <int> 1500, 1501, 1502, 1513, 1507, 1503, 1509, 1454, 1554…
## $ UniqueCarrier     <chr> "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA"…
## $ FlightNum         <int> 428, 428, 428, 428, 428, 428, 428, 428, 428, 428, 42…
## $ TailNum           <chr> "N576AA", "N557AA", "N541AA", "N403AA", "N492AA", "N…
## $ ActualElapsedTime <int> 60, 60, 70, 70, 62, 64, 70, 59, 71, 70, 70, 56, 63, …
## $ AirTime           <int> 40, 45, 48, 39, 44, 45, 43, 40, 41, 45, 42, 41, 44, …
## $ ArrDelay          <int> -10, -9, -8, 3, -3, -7, -1, -16, 44, 43, 29, 5, -9, …
## $ DepDelay          <int> 0, 1, -8, 3, 5, -1, -1, -5, 43, 43, 29, 19, -2, -3, …
## $ Origin            <chr> "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "IA…
## $ Dest              <chr> "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "DF…
## $ Distance          <int> 224, 224, 224, 224, 224, 224, 224, 224, 224, 224, 22…
## $ TaxiIn            <int> 7, 6, 5, 9, 9, 6, 12, 7, 8, 6, 8, 4, 6, 5, 6, 12, 8,…
## $ TaxiOut           <int> 13, 9, 17, 22, 9, 13, 15, 12, 22, 19, 20, 11, 13, 15…
## $ Cancelled         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ CancellationCode  <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", …
## $ Diverted          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…

the year, month, and day columns should be stored as a date, not as integers.
before formatting, we have to merge these columns into a single full-date column

# add leading zeros to column month and dayofmonth
df <- df %>% mutate_at(.vars = c("Month", "DayofMonth"),
                 .funs = str_pad, width = 2, side = "left", pad = 0) %>%
        # merge appropriate columns to create a full dates column
        unite("Dates", c(Year, Month, DayofMonth), sep = "-")

after the full-date column has been generated, we convert it from string to date format

df$Dates <- as.POSIXct(df$Dates, "%Y-%m-%d")
## Warning in strptime(xx, f, tz = tz): unknown timezone '%Y-%m-%d'
## Warning in as.POSIXct.POSIXlt(x): unknown timezone '%Y-%m-%d'
## Warning in strptime(xx, f, tz = tz): unknown timezone '%Y-%m-%d'
## Warning in as.POSIXct.POSIXlt(x): unknown timezone '%Y-%m-%d'
## Warning in strptime(xx, f, tz = tz): unknown timezone '%Y-%m-%d'
## Warning in as.POSIXct.POSIXlt(x): unknown timezone '%Y-%m-%d'
## Warning in strptime(xx, f, tz = tz): unknown timezone '%Y-%m-%d'
## Warning in as.POSIXct.POSIXlt(x): unknown timezone '%Y-%m-%d'
## Warning in strptime(xx, f, tz = tz): unknown timezone '%Y-%m-%d'
## Warning in as.POSIXct.POSIXlt(x): unknown timezone '%Y-%m-%d'
## Warning in strptime(x, f, tz = tz): unknown timezone '%Y-%m-%d'
## Warning in as.POSIXct.POSIXlt(as.POSIXlt(x, tz, ...), tz, ...): unknown timezone
## '%Y-%m-%d'
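
the warnings above appear because the second positional argument of as.POSIXct() is tz, the time zone, so "%Y-%m-%d" is being read as a timezone name; the conversion still succeeds via the default format, but the cleaner call passes the format by name, or uses as.Date() since there is no time-of-day component:

# df$Dates <- as.POSIXct(df$Dates, format = "%Y-%m-%d")
# df$Dates <- as.Date(df$Dates, format = "%Y-%m-%d")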

exploring data

total flights for each carrier

df %>% count(UniqueCarrier) %>% arrange(desc(n))
##    UniqueCarrier     n
## 1             XE 73053
## 2             CO 70032
## 3             WN 45343
## 4             OO 16061
## 5             MQ  4648
## 6             US  4082
## 7             AA  3244
## 8             DL  2641
## 9             EV  2204
## 10            FL  2139
## 11            UA  2072
## 12            F9   838
## 13            B6   695
## 14            AS   365
## 15            YV    79

count the number of cancellations for each carrier

df %>% count(UniqueCarrier, Cancelled) %>% 
        filter(Cancelled == 1) %>% 
        arrange(desc(n))
##    UniqueCarrier Cancelled    n
## 1             XE         1 1132
## 2             WN         1  703
## 3             CO         1  475
## 4             OO         1  224
## 5             MQ         1  135
## 6             EV         1   76
## 7             AA         1   60
## 8             US         1   46
## 9             DL         1   42
## 10            UA         1   34
## 11            FL         1   21
## 12            B6         1   18
## 13            F9         1    6
## 14            YV         1    1

percentage of cancelled flights per carrier

percent_of_cancel <- df %>% count(UniqueCarrier, Cancelled) %>% 
        group_by(UniqueCarrier) %>% 
        mutate(`total flights` = sum(n), 
               percentage = round(n/`total flights`*100, 2)) %>% 
        filter(Cancelled == 1) %>% select(UniqueCarrier, percentage) %>% 
        arrange(desc(percentage)) %>% ungroup()
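
a quick look at the top of the ranking (for the takeaway below):

percent_of_cancel %>% slice_max(percentage, n = 5)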

takeaway:
although XE has the largest number of cancelled flights among the carriers, its cancellation percentage does not rank within the top half of carriers.



count flights per cancellation code for each carrier

numcancode <- df %>% count(UniqueCarrier, CancellationCode)
numcancode %>% pivot_wider(names_from = CancellationCode, values_from = n)

the pivoting process will error because there are empty values in the cancellation code, so we have to recode the empty value first so that the pivoting succeeds

numcancode %>% # recode the empty cancellation code as "0"
        mutate(CancellationCode = case_when(CancellationCode == "" ~ "0",
                                                   TRUE ~ CancellationCode)) %>% 
        # pivoting the table
        pivot_wider(names_from = CancellationCode, values_from = n, values_fill = 0)
## # A tibble: 15 × 6
##    UniqueCarrier   `0`     A     B     C     D
##    <chr>         <int> <int> <int> <int> <int>
##  1 AA             3184    20    29    11     0
##  2 AS              365     0     0     0     0
##  3 B6              677     5    13     0     0
##  4 CO            69557    37   436     2     0
##  5 DL             2599    13    27     2     0
##  6 EV             2128    60    14     2     0
##  7 F9              832     2     4     0     0
##  8 FL             2118     8    12     1     0
##  9 MQ             4513    39    71    25     0
## 10 OO            15837   121    87    15     1
## 11 UA             2038    21    10     3     0
## 12 US             4036    27    17     2     0
## 13 WN            44640   517   181     5     0
## 14 XE            71921   331   751    50     0
## 15 YV               78     1     0     0     0

another way to do the same thing is to use table()

table(df$UniqueCarrier, df$CancellationCode)
##     
##                A     B     C     D
##   AA  3184    20    29    11     0
##   AS   365     0     0     0     0
##   B6   677     5    13     0     0
##   CO 69557    37   436     2     0
##   DL  2599    13    27     2     0
##   EV  2128    60    14     2     0
##   F9   832     2     4     0     0
##   FL  2118     8    12     1     0
##   MQ  4513    39    71    25     0
##   OO 15837   121    87    15     1
##   UA  2038    21    10     3     0
##   US  4036    27    17     2     0
##   WN 44640   517   181     5     0
##   XE 71921   331   751    50     0
##   YV    78     1     0     0     0
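
to compare carriers on a like-for-like basis, the counts can also be turned into row percentages with prop.table(); a small sketch:

round(prop.table(table(df$UniqueCarrier, df$CancellationCode), margin = 1) * 100, 2)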