Manipulate Data

load the library

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.8     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.1
## ✔ readr   2.1.2     ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

load the data (the mpg data set that comes with ggplot2)

df <- mpg
print(df)
## # A tibble: 234 × 11
##    manufacturer model      displ  year   cyl trans drv     cty   hwy fl    class
##    <chr>        <chr>      <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
##  1 audi         a4           1.8  1999     4 auto… f        18    29 p     comp…
##  2 audi         a4           1.8  1999     4 manu… f        21    29 p     comp…
##  3 audi         a4           2    2008     4 manu… f        20    31 p     comp…
##  4 audi         a4           2    2008     4 auto… f        21    30 p     comp…
##  5 audi         a4           2.8  1999     6 auto… f        16    26 p     comp…
##  6 audi         a4           2.8  1999     6 manu… f        18    26 p     comp…
##  7 audi         a4           3.1  2008     6 auto… f        18    27 p     comp…
##  8 audi         a4 quattro   1.8  1999     4 manu… 4        18    26 p     comp…
##  9 audi         a4 quattro   1.8  1999     4 auto… 4        16    25 p     comp…
## 10 audi         a4 quattro   2    2008     4 manu… 4        20    28 p     comp…
## # … with 224 more rows
# check for missing (NA) values
any(is.na(df))
## [1] FALSE
# explore number of columns and rows
ncol(df); nrow(df)
## [1] 11
## [1] 234
# explore structure of a table
str(df)
## tibble [234 × 11] (S3: tbl_df/tbl/data.frame)
##  $ manufacturer: chr [1:234] "audi" "audi" "audi" "audi" ...
##  $ model       : chr [1:234] "a4" "a4" "a4" "a4" ...
##  $ displ       : num [1:234] 1.8 1.8 2 2 2.8 2.8 3.1 1.8 1.8 2 ...
##  $ year        : int [1:234] 1999 1999 2008 2008 1999 1999 2008 1999 1999 2008 ...
##  $ cyl         : int [1:234] 4 4 4 4 6 6 6 4 4 4 ...
##  $ trans       : chr [1:234] "auto(l5)" "manual(m5)" "manual(m6)" "auto(av)" ...
##  $ drv         : chr [1:234] "f" "f" "f" "f" ...
##  $ cty         : int [1:234] 18 21 20 21 16 18 18 18 16 20 ...
##  $ hwy         : int [1:234] 29 29 31 30 26 26 27 26 25 28 ...
##  $ fl          : chr [1:234] "p" "p" "p" "p" ...
##  $ class       : chr [1:234] "compact" "compact" "compact" "compact" ...
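
a tidyverse-native alternative to str() is glimpse(); a quick sketch (it prints one line per column with its type and first values)

# glimpse() shows every column on its own line, with its type and first values
glimpse(df)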
# explore column names
colnames(df)
##  [1] "manufacturer" "model"        "displ"        "year"         "cyl"         
##  [6] "trans"        "drv"          "cty"          "hwy"          "fl"          
## [11] "class"

select()

select() is a function to pick columns, like the SELECT clause in SQL

# extract specified columns
df %>% select(manufacturer, model, year, class)
## # A tibble: 234 × 4
##    manufacturer model       year class  
##    <chr>        <chr>      <int> <chr>  
##  1 audi         a4          1999 compact
##  2 audi         a4          1999 compact
##  3 audi         a4          2008 compact
##  4 audi         a4          2008 compact
##  5 audi         a4          1999 compact
##  6 audi         a4          1999 compact
##  7 audi         a4          2008 compact
##  8 audi         a4 quattro  1999 compact
##  9 audi         a4 quattro  1999 compact
## 10 audi         a4 quattro  2008 compact
## # … with 224 more rows
# extract columns whose names match a criterion
df %>% select(starts_with(match = "c"))
## # A tibble: 234 × 3
##      cyl   cty class  
##    <int> <int> <chr>  
##  1     4    18 compact
##  2     4    21 compact
##  3     4    20 compact
##  4     4    21 compact
##  5     6    16 compact
##  6     6    18 compact
##  7     6    18 compact
##  8     4    18 compact
##  9     4    16 compact
## 10     4    20 compact
## # … with 224 more rows
df %>% select(contains("a"))
## # A tibble: 234 × 4
##    manufacturer  year trans      class  
##    <chr>        <int> <chr>      <chr>  
##  1 audi          1999 auto(l5)   compact
##  2 audi          1999 manual(m5) compact
##  3 audi          2008 manual(m6) compact
##  4 audi          2008 auto(av)   compact
##  5 audi          1999 auto(l5)   compact
##  6 audi          1999 manual(m5) compact
##  7 audi          2008 auto(av)   compact
##  8 audi          1999 manual(m5) compact
##  9 audi          1999 auto(l5)   compact
## 10 audi          2008 manual(m6) compact
## # … with 224 more rows
# select using index
df %>% select(2:5)
## # A tibble: 234 × 4
##    model      displ  year   cyl
##    <chr>      <dbl> <int> <int>
##  1 a4           1.8  1999     4
##  2 a4           1.8  1999     4
##  3 a4           2    2008     4
##  4 a4           2    2008     4
##  5 a4           2.8  1999     6
##  6 a4           2.8  1999     6
##  7 a4           3.1  2008     6
##  8 a4 quattro   1.8  1999     4
##  9 a4 quattro   1.8  1999     4
## 10 a4 quattro   2    2008     4
## # … with 224 more rows
# select the last n columns (a little tricky)
df %>% select((ncol(df)-2):ncol(df)) # extracts the last 3 columns
## # A tibble: 234 × 3
##      hwy fl    class  
##    <int> <chr> <chr>  
##  1    29 p     compact
##  2    29 p     compact
##  3    31 p     compact
##  4    30 p     compact
##  5    26 p     compact
##  6    26 p     compact
##  7    27 p     compact
##  8    26 p     compact
##  9    25 p     compact
## 10    28 p     compact
## # … with 224 more rows
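
a less tricky alternative, as a small sketch: the tidyselect helper last_col() counts back from the final column, so last_col(2) is the third column from the end

# alternative way to select the last 3 columns using last_col()
df %>% select(last_col(2):last_col())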

rename() and everything()

everything() is a helper function that grabs all the remaining, unspecified columns

# rename all columns at once
df1 <- df %>% rename(yes = everything())

# using select and rename simultaneously
df %>% select(mnfc = manufacturer, mod = model, display = displ, everything())
## # A tibble: 234 × 11
##    mnfc  mod        display  year   cyl trans      drv     cty   hwy fl    class
##    <chr> <chr>        <dbl> <int> <int> <chr>      <chr> <int> <int> <chr> <chr>
##  1 audi  a4             1.8  1999     4 auto(l5)   f        18    29 p     comp…
##  2 audi  a4             1.8  1999     4 manual(m5) f        21    29 p     comp…
##  3 audi  a4             2    2008     4 manual(m6) f        20    31 p     comp…
##  4 audi  a4             2    2008     4 auto(av)   f        21    30 p     comp…
##  5 audi  a4             2.8  1999     6 auto(l5)   f        16    26 p     comp…
##  6 audi  a4             2.8  1999     6 manual(m5) f        18    26 p     comp…
##  7 audi  a4             3.1  2008     6 auto(av)   f        18    27 p     comp…
##  8 audi  a4 quattro     1.8  1999     4 manual(m5) 4        18    26 p     comp…
##  9 audi  a4 quattro     1.8  1999     4 auto(l5)   4        16    25 p     comp…
## 10 audi  a4 quattro     2    2008     4 manual(m6) 4        20    28 p     comp…
## # … with 224 more rows
# note that this code doesn't rename the original data set; it only renames the displayed result, i.e. these are aliases

# to rename the original data set, assign the result back
# df <-df %>% rename(mnfc = manufacturer, mod = model)
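
a more robust way to rename many columns at once is rename_with(), which applies a function to the selected column names; a minimal sketch (no assignment back to df, so the original data stays untouched)

# apply a renaming function to every column name (default .cols = everything())
df %>% rename_with(toupper)
# apply it only to selected columns
df %>% rename_with(~ paste0("mpg_", .x), c(cty, hwy))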

mutate() and transmute()

mutate() is a function to create new variables in a data set
transmute() also creates new variables, but it drops all other variables

# create new variables
df <- df %>% mutate(`avg mile per galon` = (cty+hwy)/2)

df <- df %>% mutate(car = paste(manufacturer, model))

df <- df %>% mutate(`cyl and trans` = paste(cyl, "cylinders and", trans, "transmissions"))
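
for comparison, a small transmute() sketch: the same kind of calculation, but only the listed variables are kept in the result (df itself is not modified here)

# transmute keeps only the variables it creates or explicitly lists
df %>% transmute(manufacturer, model, `avg mpg` = (cty + hwy) / 2)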

filter() and slice()

filter() is a function to keep rows that match given criteria
slice() is a function to keep rows by their indices

# the clauses follow the same ordering logic as SQL
df %>%  select(manufacturer, year, hwy) %>% filter(manufacturer %in% c("audi","ford"))
## # A tibble: 43 × 3
##    manufacturer  year   hwy
##    <chr>        <int> <int>
##  1 audi          1999    29
##  2 audi          1999    29
##  3 audi          2008    31
##  4 audi          2008    30
##  5 audi          1999    26
##  6 audi          1999    26
##  7 audi          2008    27
##  8 audi          1999    26
##  9 audi          1999    25
## 10 audi          2008    28
## # … with 33 more rows
# to filter on manufacturer without displaying it, put filter() before select()
df %>% filter(manufacturer %in% c("audi","ford")) %>%  select(year, hwy)
## # A tibble: 43 × 2
##     year   hwy
##    <int> <int>
##  1  1999    29
##  2  1999    29
##  3  2008    31
##  4  2008    30
##  5  1999    26
##  6  1999    26
##  7  2008    27
##  8  1999    26
##  9  1999    25
## 10  2008    28
## # … with 33 more rows
# if select() drops manufacturer first and filter() on it comes afterwards, the code will throw an error

# combining select, filter, group_by, and summarise
df %>%  select(manufacturer, year, hwy) %>% filter(manufacturer %in% c("audi","ford")) %>% group_by(manufacturer, year) %>% summarise(`avg mean` = mean(hwy))
## `summarise()` has grouped output by 'manufacturer'. You can override using the
## `.groups` argument.
## # A tibble: 4 × 3
## # Groups:   manufacturer [2]
##   manufacturer  year `avg mean`
##   <chr>        <int>      <dbl>
## 1 audi          1999       26.1
## 2 audi          2008       26.8
## 3 ford          1999       18.6
## 4 ford          2008       20.5
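
filter() also accepts several conditions at once; a quick sketch (comma-separated conditions are combined with AND, while | means OR)

# commas act as AND: model year 2008 and highway mileage of at least 30
df %>% filter(year == 2008, hwy >= 30) %>% select(manufacturer, model, hwy)
# | acts as OR: keep 4- or 6-cylinder cars
df %>% filter(cyl == 4 | cyl == 6) %>% count(cyl)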

slice()

# get rows 20 through 30
df %>% slice(20:30)
## # A tibble: 11 × 14
##    manufac…¹ model displ  year   cyl trans drv     cty   hwy fl    class avg m…²
##    <chr>     <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>   <dbl>
##  1 chevrolet c150…   5.3  2008     8 auto… r        11    15 e     suv      13  
##  2 chevrolet c150…   5.3  2008     8 auto… r        14    20 r     suv      17  
##  3 chevrolet c150…   5.7  1999     8 auto… r        13    17 r     suv      15  
##  4 chevrolet c150…   6    2008     8 auto… r        12    17 r     suv      14.5
##  5 chevrolet corv…   5.7  1999     8 manu… r        16    26 p     2sea…    21  
##  6 chevrolet corv…   5.7  1999     8 auto… r        15    23 p     2sea…    19  
##  7 chevrolet corv…   6.2  2008     8 manu… r        16    26 p     2sea…    21  
##  8 chevrolet corv…   6.2  2008     8 auto… r        15    25 p     2sea…    20  
##  9 chevrolet corv…   7    2008     8 manu… r        15    24 p     2sea…    19.5
## 10 chevrolet k150…   5.3  2008     8 auto… 4        14    19 r     suv      16.5
## 11 chevrolet k150…   5.3  2008     8 auto… 4        11    14 e     suv      12.5
## # … with 2 more variables: car <chr>, `cyl and trans` <chr>, and abbreviated
## #   variable names ¹​manufacturer, ²​`avg mile per galon`
# combining slice and select
df %>% select(manufacturer, model, year) %>% slice(20:30)
## # A tibble: 11 × 3
##    manufacturer model               year
##    <chr>        <chr>              <int>
##  1 chevrolet    c1500 suburban 2wd  2008
##  2 chevrolet    c1500 suburban 2wd  2008
##  3 chevrolet    c1500 suburban 2wd  1999
##  4 chevrolet    c1500 suburban 2wd  2008
##  5 chevrolet    corvette            1999
##  6 chevrolet    corvette            1999
##  7 chevrolet    corvette            2008
##  8 chevrolet    corvette            2008
##  9 chevrolet    corvette            2008
## 10 chevrolet    k1500 tahoe 4wd     2008
## 11 chevrolet    k1500 tahoe 4wd     2008

arrange()

just like the ORDER BY clause in SQL
the default ordering is ascending

# order cty ascending and hwy descending, then display 20 records
print(df %>% arrange(cty, desc(hwy)), n=20)
## # A tibble: 234 × 14
##    manufac…¹ model displ  year   cyl trans drv     cty   hwy fl    class avg m…²
##    <chr>     <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>   <dbl>
##  1 dodge     dako…   4.7  2008     8 auto… 4         9    12 e     pick…    10.5
##  2 dodge     dura…   4.7  2008     8 auto… 4         9    12 e     suv      10.5
##  3 dodge     ram …   4.7  2008     8 auto… 4         9    12 e     pick…    10.5
##  4 dodge     ram …   4.7  2008     8 manu… 4         9    12 e     pick…    10.5
##  5 jeep      gran…   4.7  2008     8 auto… 4         9    12 e     suv      10.5
##  6 dodge     cara…   3.3  2008     6 auto… f        11    17 e     mini…    14  
##  7 dodge     dako…   5.2  1999     8 manu… 4        11    17 r     pick…    14  
##  8 ford      expe…   4.6  1999     8 auto… r        11    17 r     suv      14  
##  9 ford      expe…   5.4  1999     8 auto… r        11    17 r     suv      14  
## 10 lincoln   navi…   5.4  1999     8 auto… r        11    17 r     suv      14  
## 11 dodge     dura…   5.2  1999     8 auto… 4        11    16 r     suv      13.5
## 12 dodge     ram …   5.2  1999     8 manu… 4        11    16 r     pick…    13.5
## 13 lincoln   navi…   5.4  1999     8 auto… r        11    16 p     suv      13.5
## 14 chevrolet c150…   5.3  2008     8 auto… r        11    15 e     suv      13  
## 15 chevrolet k150…   5.7  1999     8 auto… 4        11    15 r     suv      13  
## 16 dodge     dako…   5.2  1999     8 auto… 4        11    15 r     pick…    13  
## 17 dodge     dura…   5.9  1999     8 auto… 4        11    15 r     suv      13  
## 18 dodge     ram …   5.2  1999     8 auto… 4        11    15 r     pick…    13  
## 19 dodge     ram …   5.9  1999     8 auto… 4        11    15 r     pick…    13  
## 20 ford      f150…   5.4  1999     8 auto… 4        11    15 r     pick…    13  
## # … with 214 more rows, 2 more variables: car <chr>, `cyl and trans` <chr>, and
## #   abbreviated variable names ¹​manufacturer, ²​`avg mile per galon`
# a more advanced use of arrange at the end of a pipeline
print(df %>% select(manufacturer, year, hwy) %>% group_by(manufacturer) %>% filter(year == 1999) %>% summarise(`average of hwy` = mean(hwy)) %>% arrange(desc(`average of hwy`)), n=5)
## # A tibble: 15 × 2
##   manufacturer `average of hwy`
##   <chr>                   <dbl>
## 1 honda                    31.6
## 2 volkswagen               29.7
## 3 hyundai                  26.7
## 4 pontiac                  26.3
## 5 audi                     26.1
## # … with 10 more rows

distinct()

just like the DISTINCT clause in SQL

df %>% distinct(manufacturer, year) %>% arrange(manufacturer, year)
## # A tibble: 30 × 2
##    manufacturer  year
##    <chr>        <int>
##  1 audi          1999
##  2 audi          2008
##  3 chevrolet     1999
##  4 chevrolet     2008
##  5 dodge         1999
##  6 dodge         2008
##  7 ford          1999
##  8 ford          2008
##  9 honda         1999
## 10 honda         2008
## # … with 20 more rows
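
distinct() can also keep the remaining columns of the first matching row via the .keep_all argument; a minimal sketch

# keep every column from the first row of each distinct model
df %>% distinct(model, .keep_all = TRUE)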

bind_()

bind_rows() and bind_cols() bind rows or columns together, just like rbind() and cbind() in base R

exmp <- data.frame(id = c(1,2,3), names = c("john", "michael", "aries"))

exmp <- bind_rows(exmp, slice(exmp, 2:3))

exmp <- arrange(exmp, id)
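
bind_cols() works the same way for columns; a small sketch (the score values below are made up purely for illustration, and both inputs must have the same number of rows)

# exmp now has 5 rows (3 original + 2 duplicated), so bind a 5-value column
scores <- data.frame(score = c(90, 75, 88, 60, 72))
exmp <- bind_cols(exmp, scores)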

sample_n() and sample_frac()

sample_n() samples a given number of rows
sample_frac() samples a given fraction (percentage) of rows

# draw 20 rows from df without replacement
sample_n(df, 20, replace = F)
## # A tibble: 20 × 14
##    manufac…¹ model displ  year   cyl trans drv     cty   hwy fl    class avg m…²
##    <chr>     <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>   <dbl>
##  1 jeep      gran…   6.1  2008     8 auto… 4        11    14 p     suv      12.5
##  2 toyota    toyo…   2.7  2008     4 manu… 4        17    22 r     pick…    19.5
##  3 land rov… rang…   4.6  1999     8 auto… 4        11    15 p     suv      13  
##  4 subaru    fore…   2.5  1999     4 manu… 4        18    25 r     suv      21.5
##  5 dodge     dako…   5.2  1999     8 auto… 4        11    15 r     pick…    13  
##  6 ford      expe…   4.6  1999     8 auto… r        11    17 r     suv      14  
##  7 toyota    coro…   1.8  1999     4 manu… f        26    35 r     comp…    30.5
##  8 dodge     ram …   4.7  2008     8 manu… 4        12    16 r     pick…    14  
##  9 volkswag… new …   1.9  1999     4 manu… f        35    44 d     subc…    39.5
## 10 toyota    camr…   3.3  2008     6 auto… f        18    27 r     comp…    22.5
## 11 mercury   moun…   5    1999     8 auto… 4        13    17 r     suv      15  
## 12 honda     civic   1.8  2008     4 auto… f        24    36 c     subc…    30  
## 13 ford      expe…   5.4  1999     8 auto… r        11    17 r     suv      14  
## 14 chevrolet mali…   3.6  2008     6 auto… f        17    26 r     mids…    21.5
## 15 volkswag… jetta   2    2008     4 manu… f        21    29 p     comp…    25  
## 16 dodge     ram …   4.7  2008     8 manu… 4         9    12 e     pick…    10.5
## 17 dodge     dako…   4.7  2008     8 auto… 4         9    12 e     pick…    10.5
## 18 pontiac   gran…   3.1  1999     6 auto… f        18    26 r     mids…    22  
## 19 dodge     cara…   3    1999     6 auto… f        17    24 r     mini…    20.5
## 20 audi      a6 q…   3.1  2008     6 auto… 4        17    25 p     mids…    21  
## # … with 2 more variables: car <chr>, `cyl and trans` <chr>, and abbreviated
## #   variable names ¹​manufacturer, ²​`avg mile per galon`
# draw a 20% sample from df
sample_frac(df, 0.2)
## # A tibble: 47 × 14
##    manufac…¹ model displ  year   cyl trans drv     cty   hwy fl    class avg m…²
##    <chr>     <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>   <dbl>
##  1 hyundai   tibu…   2    2008     4 auto… f        20    27 r     subc…    23.5
##  2 toyota    coro…   1.8  2008     4 manu… f        28    37 r     comp…    32.5
##  3 ford      f150…   4.2  1999     6 auto… 4        14    17 r     pick…    15.5
##  4 dodge     cara…   3    1999     6 auto… f        17    24 r     mini…    20.5
##  5 subaru    fore…   2.5  1999     4 manu… 4        18    25 r     suv      21.5
##  6 chevrolet c150…   5.7  1999     8 auto… r        13    17 r     suv      15  
##  7 dodge     dura…   4.7  2008     8 auto… 4         9    12 e     suv      10.5
##  8 nissan    path…   5.6  2008     8 auto… 4        12    18 p     suv      15  
##  9 volkswag… gti     2    2008     4 manu… f        21    29 p     comp…    25  
## 10 chevrolet corv…   6.2  2008     8 auto… r        15    25 p     2sea…    20  
## # … with 37 more rows, 2 more variables: car <chr>, `cyl and trans` <chr>, and
## #   abbreviated variable names ¹​manufacturer, ²​`avg mile per galon`
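
in newer dplyr versions both functions are superseded by slice_sample(), which covers the same two cases; a quick sketch

# slice_sample() replaces sample_n() and sample_frac()
slice_sample(df, n = 20)      # 20 random rows
slice_sample(df, prop = 0.2)  # 20% of the rows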

summarise()

summarise() is used with aggregate functions such as n(), mean(), min(), and max()

# count number of records and number of distinct cty records
df %>% summarise(n(), n_distinct(cty))
## # A tibble: 1 × 2
##   `n()` `n_distinct(cty)`
##   <int>             <int>
## 1   234                21
# count the number of records per group
df %>% select(manufacturer, model) %>% group_by(manufacturer, model) %>% summarise(numrec = n())
## `summarise()` has grouped output by 'manufacturer'. You can override using the
## `.groups` argument.
## # A tibble: 38 × 3
## # Groups:   manufacturer [15]
##    manufacturer model              numrec
##    <chr>        <chr>               <int>
##  1 audi         a4                      7
##  2 audi         a4 quattro              8
##  3 audi         a6 quattro              3
##  4 chevrolet    c1500 suburban 2wd      5
##  5 chevrolet    corvette                5
##  6 chevrolet    k1500 tahoe 4wd         4
##  7 chevrolet    malibu                  5
##  8 dodge        caravan 2wd            11
##  9 dodge        dakota pickup 4wd       9
## 10 dodge        durango 4wd             7
## # … with 28 more rows

the same result can be obtained with a simpler line of code using count()

count()

count(df, manufacturer, model) # produces the same result as the grouped summarise() example above
## # A tibble: 38 × 3
##    manufacturer model                  n
##    <chr>        <chr>              <int>
##  1 audi         a4                     7
##  2 audi         a4 quattro             8
##  3 audi         a6 quattro             3
##  4 chevrolet    c1500 suburban 2wd     5
##  5 chevrolet    corvette               5
##  6 chevrolet    k1500 tahoe 4wd        4
##  7 chevrolet    malibu                 5
##  8 dodge        caravan 2wd           11
##  9 dodge        dakota pickup 4wd      9
## 10 dodge        durango 4wd            7
## # … with 28 more rows
df %>% group_by(manufacturer) %>% summarise(n())
## # A tibble: 15 × 2
##    manufacturer `n()`
##    <chr>        <int>
##  1 audi            18
##  2 chevrolet       19
##  3 dodge           37
##  4 ford            25
##  5 honda            9
##  6 hyundai         14
##  7 jeep             8
##  8 land rover       4
##  9 lincoln          3
## 10 mercury          4
## 11 nissan          13
## 12 pontiac          5
## 13 subaru          14
## 14 toyota          34
## 15 volkswagen      27
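
count() can also sort the result directly, which saves a separate arrange() step; a small sketch

# count rows per manufacturer, largest group first
count(df, manufacturer, sort = TRUE)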
df %>% group_by(manufacturer) %>% summarise(`min hwy` = min(hwy), `max hwy` = max(hwy),
                                            `min cyl` = min(cyl), `max cyl` = max(cyl))
## # A tibble: 15 × 5
##    manufacturer `min hwy` `max hwy` `min cyl` `max cyl`
##    <chr>            <int>     <int>     <int>     <int>
##  1 audi                23        31         4         8
##  2 chevrolet           14        30         4         8
##  3 dodge               12        24         4         8
##  4 ford                15        26         6         8
##  5 honda               29        36         4         4
##  6 hyundai             24        31         4         6
##  7 jeep                12        22         6         8
##  8 land rover          15        18         8         8
##  9 lincoln             16        18         8         8
## 10 mercury             17        19         6         8
## 11 nissan              17        32         4         8
## 12 pontiac             25        28         6         8
## 13 subaru              23        27         4         4
## 14 toyota              15        37         4         8
## 15 volkswagen          23        44         4         6
df %>% group_by(manufacturer, model, class, trans) %>% 
        summarise(`mean hwy` = mean(hwy), cars = n()) %>% 
        filter(`mean hwy` > 30) %>% 
        arrange(desc(`mean hwy`)) %>% ungroup()
## `summarise()` has grouped output by 'manufacturer', 'model', 'class'. You can
## override using the `.groups` argument.
## # A tibble: 10 × 6
##    manufacturer model      class      trans      `mean hwy`  cars
##    <chr>        <chr>      <chr>      <chr>           <dbl> <int>
##  1 honda        civic      subcompact auto(l5)         36       2
##  2 toyota       corolla    compact    manual(m5)       36       2
##  3 toyota       corolla    compact    auto(l4)         34       2
##  4 volkswagen   new beetle subcompact manual(m5)       33.7     3
##  5 volkswagen   new beetle subcompact auto(l4)         33.5     2
##  6 honda        civic      subcompact auto(l4)         32       2
##  7 honda        civic      subcompact manual(m5)       32       4
##  8 volkswagen   jetta      compact    manual(m5)       31.5     4
##  9 audi         a4         compact    manual(m6)       31       1
## 10 toyota       camry      midsize    auto(l5)         31       1
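
the repeated "grouped output" message can be silenced by setting the .groups argument of summarise() explicitly; a minimal sketch of the same query with .groups = "drop", which also makes the final ungroup() unnecessary

df %>% group_by(manufacturer, model, class, trans) %>% 
        summarise(`mean hwy` = mean(hwy), cars = n(), .groups = "drop") %>% 
        filter(`mean hwy` > 30) %>% 
        arrange(desc(`mean hwy`))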

pivot_longer() and pivot_wider()

these functions transform a table from wide format to long format and vice versa.

# create an example table in long format
long <- data.frame(id = 1:6, type = c("a", "b", "b", "c", "b", "a"),
                   total = c(20, 40, 30, 45, 12, 35))

# convert to wide format
wide <- pivot_wider(long, names_from = type, values_from = total)

# convert the wide table back to long format
wide %>% pivot_longer(cols = c("a", "b", "c"), names_to = "type", values_to = "total", values_drop_na = T)
## # A tibble: 6 × 3
##      id type  total
##   <int> <chr> <dbl>
## 1     1 a        20
## 2     2 b        40
## 3     3 b        30
## 4     4 c        45
## 5     5 b        12
## 6     6 a        35
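
if the wide table should contain zeros instead of NA for missing combinations, pivot_wider() accepts a values_fill argument; a quick sketch using the same long table

# fill missing id/type combinations with 0 instead of NA
pivot_wider(long, names_from = type, values_from = total, values_fill = 0)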
# a more advanced example with the mpg data
play <- df %>% filter(manufacturer %in% c("jeep", "hyundai", "land rover")) %>% 
        select(model, trans, hwy)

wideplay <- play %>% group_by(model, trans) %>%  summarise(`avg hwy` = mean(hwy)) %>% 
        pivot_wider(names_from = trans, values_from = `avg hwy`)
## `summarise()` has grouped output by 'model'. You can override using the
## `.groups` argument.
# convert back to long format with a little trick
wideplay %>% pivot_longer(cols = -model, # exclude the model column and keep all the rest
                          names_to = "trans",
                          values_to = "avg hwy",
                          values_drop_na = T)
## # A tibble: 10 × 3
## # Groups:   model [4]
##    model              trans      `avg hwy`
##    <chr>              <chr>          <dbl>
##  1 grand cherokee 4wd auto(l4)        18.5
##  2 grand cherokee 4wd auto(l5)        17.3
##  3 range rover        auto(l4)        15  
##  4 range rover        auto(s6)        18  
##  5 sonata             auto(l4)        27.3
##  6 sonata             auto(l5)        28  
##  7 sonata             manual(m5)      28  
##  8 tiburon            auto(l4)        25.7
##  9 tiburon            manual(m5)      27  
## 10 tiburon            manual(m6)      24

separate() and unite()

just like splitting and merging cells in Excel
separate

# create a table of dates just for this example
dates <- seq.Date(from = as.Date("2022-01-01"), to = as.Date("2022-12-31"), by = "day")
# place the dates in a data frame
datedf <- data.frame(date = dates)

sptdatedf <- datedf %>% separate(col = date, into = c("year", "month", "date"), sep = "-") %>% 
        mutate(month = as.numeric(month), 
               date = as.numeric(date)) # remove leading zeros

# another way to mutate several columns at once
datedf %>% separate(col = date, into = c("year", "month", "date"), sep = "-") %>% 
        mutate_at(.vars = c("month", "date"), # which columns to mutate
                  .funs = as.numeric) # which function to apply
##     year month date
## 1   2022     1    1
## 2   2022     1    2
## 3   2022     1    3
## 4   2022     1    4
## 5   2022     1    5
## 6   2022     1    6
## 7   2022     1    7
## 8   2022     1    8
## 9   2022     1    9
## 10  2022     1   10
## 11  2022     1   11
## 12  2022     1   12
## 13  2022     1   13
## 14  2022     1   14
## 15  2022     1   15
## 16  2022     1   16
## 17  2022     1   17
## 18  2022     1   18
## 19  2022     1   19
## 20  2022     1   20
## 21  2022     1   21
## 22  2022     1   22
## 23  2022     1   23
## 24  2022     1   24
## 25  2022     1   25
## 26  2022     1   26
## 27  2022     1   27
## 28  2022     1   28
## 29  2022     1   29
## 30  2022     1   30
## 31  2022     1   31
## 32  2022     2    1
## 33  2022     2    2
## 34  2022     2    3
## 35  2022     2    4
## 36  2022     2    5
## 37  2022     2    6
## 38  2022     2    7
## 39  2022     2    8
## 40  2022     2    9
## 41  2022     2   10
## 42  2022     2   11
## 43  2022     2   12
## 44  2022     2   13
## 45  2022     2   14
## 46  2022     2   15
## 47  2022     2   16
## 48  2022     2   17
## 49  2022     2   18
## 50  2022     2   19
## 51  2022     2   20
## 52  2022     2   21
## 53  2022     2   22
## 54  2022     2   23
## 55  2022     2   24
## 56  2022     2   25
## 57  2022     2   26
## 58  2022     2   27
## 59  2022     2   28
## 60  2022     3    1
## 61  2022     3    2
## 62  2022     3    3
## 63  2022     3    4
## 64  2022     3    5
## 65  2022     3    6
## 66  2022     3    7
## 67  2022     3    8
## 68  2022     3    9
## 69  2022     3   10
## 70  2022     3   11
## 71  2022     3   12
## 72  2022     3   13
## 73  2022     3   14
## 74  2022     3   15
## 75  2022     3   16
## 76  2022     3   17
## 77  2022     3   18
## 78  2022     3   19
## 79  2022     3   20
## 80  2022     3   21
## 81  2022     3   22
## 82  2022     3   23
## 83  2022     3   24
## 84  2022     3   25
## 85  2022     3   26
## 86  2022     3   27
## 87  2022     3   28
## 88  2022     3   29
## 89  2022     3   30
## 90  2022     3   31
## 91  2022     4    1
## 92  2022     4    2
## 93  2022     4    3
## 94  2022     4    4
## 95  2022     4    5
## 96  2022     4    6
## 97  2022     4    7
## 98  2022     4    8
## 99  2022     4    9
## 100 2022     4   10
## 101 2022     4   11
## 102 2022     4   12
## 103 2022     4   13
## 104 2022     4   14
## 105 2022     4   15
## 106 2022     4   16
## 107 2022     4   17
## 108 2022     4   18
## 109 2022     4   19
## 110 2022     4   20
## 111 2022     4   21
## 112 2022     4   22
## 113 2022     4   23
## 114 2022     4   24
## 115 2022     4   25
## 116 2022     4   26
## 117 2022     4   27
## 118 2022     4   28
## 119 2022     4   29
## 120 2022     4   30
## 121 2022     5    1
## 122 2022     5    2
## 123 2022     5    3
## 124 2022     5    4
## 125 2022     5    5
## 126 2022     5    6
## 127 2022     5    7
## 128 2022     5    8
## 129 2022     5    9
## 130 2022     5   10
## 131 2022     5   11
## 132 2022     5   12
## 133 2022     5   13
## 134 2022     5   14
## 135 2022     5   15
## 136 2022     5   16
## 137 2022     5   17
## 138 2022     5   18
## 139 2022     5   19
## 140 2022     5   20
## 141 2022     5   21
## 142 2022     5   22
## 143 2022     5   23
## 144 2022     5   24
## 145 2022     5   25
## 146 2022     5   26
## 147 2022     5   27
## 148 2022     5   28
## 149 2022     5   29
## 150 2022     5   30
## 151 2022     5   31
## 152 2022     6    1
## 153 2022     6    2
## 154 2022     6    3
## 155 2022     6    4
## 156 2022     6    5
## 157 2022     6    6
## 158 2022     6    7
## 159 2022     6    8
## 160 2022     6    9
## 161 2022     6   10
## 162 2022     6   11
## 163 2022     6   12
## 164 2022     6   13
## 165 2022     6   14
## 166 2022     6   15
## 167 2022     6   16
## 168 2022     6   17
## 169 2022     6   18
## 170 2022     6   19
## 171 2022     6   20
## 172 2022     6   21
## 173 2022     6   22
## 174 2022     6   23
## 175 2022     6   24
## 176 2022     6   25
## 177 2022     6   26
## 178 2022     6   27
## 179 2022     6   28
## 180 2022     6   29
## 181 2022     6   30
## 182 2022     7    1
## 183 2022     7    2
## 184 2022     7    3
## 185 2022     7    4
## 186 2022     7    5
## 187 2022     7    6
## 188 2022     7    7
## 189 2022     7    8
## 190 2022     7    9
## 191 2022     7   10
## 192 2022     7   11
## 193 2022     7   12
## 194 2022     7   13
## 195 2022     7   14
## 196 2022     7   15
## 197 2022     7   16
## 198 2022     7   17
## 199 2022     7   18
## 200 2022     7   19
## 201 2022     7   20
## 202 2022     7   21
## 203 2022     7   22
## 204 2022     7   23
## 205 2022     7   24
## 206 2022     7   25
## 207 2022     7   26
## 208 2022     7   27
## 209 2022     7   28
## 210 2022     7   29
## 211 2022     7   30
## 212 2022     7   31
## 213 2022     8    1
## 214 2022     8    2
## 215 2022     8    3
## 216 2022     8    4
## 217 2022     8    5
## 218 2022     8    6
## 219 2022     8    7
## 220 2022     8    8
## 221 2022     8    9
## 222 2022     8   10
## 223 2022     8   11
## 224 2022     8   12
## 225 2022     8   13
## 226 2022     8   14
## 227 2022     8   15
## 228 2022     8   16
## 229 2022     8   17
## 230 2022     8   18
## 231 2022     8   19
## 232 2022     8   20
## 233 2022     8   21
## 234 2022     8   22
## 235 2022     8   23
## 236 2022     8   24
## 237 2022     8   25
## 238 2022     8   26
## 239 2022     8   27
## 240 2022     8   28
## 241 2022     8   29
## 242 2022     8   30
## 243 2022     8   31
## 244 2022     9    1
## 245 2022     9    2
## 246 2022     9    3
## 247 2022     9    4
## 248 2022     9    5
## 249 2022     9    6
## 250 2022     9    7
## 251 2022     9    8
## 252 2022     9    9
## 253 2022     9   10
## 254 2022     9   11
## 255 2022     9   12
## 256 2022     9   13
## 257 2022     9   14
## 258 2022     9   15
## 259 2022     9   16
## 260 2022     9   17
## 261 2022     9   18
## 262 2022     9   19
## 263 2022     9   20
## 264 2022     9   21
## 265 2022     9   22
## 266 2022     9   23
## 267 2022     9   24
## 268 2022     9   25
## 269 2022     9   26
## 270 2022     9   27
## 271 2022     9   28
## 272 2022     9   29
## 273 2022     9   30
## 274 2022    10    1
## 275 2022    10    2
## 276 2022    10    3
## 277 2022    10    4
## 278 2022    10    5
## 279 2022    10    6
## 280 2022    10    7
## 281 2022    10    8
## 282 2022    10    9
## 283 2022    10   10
## 284 2022    10   11
## 285 2022    10   12
## 286 2022    10   13
## 287 2022    10   14
## 288 2022    10   15
## 289 2022    10   16
## 290 2022    10   17
## 291 2022    10   18
## 292 2022    10   19
## 293 2022    10   20
## 294 2022    10   21
## 295 2022    10   22
## 296 2022    10   23
## 297 2022    10   24
## 298 2022    10   25
## 299 2022    10   26
## 300 2022    10   27
## 301 2022    10   28
## 302 2022    10   29
## 303 2022    10   30
## 304 2022    10   31
## 305 2022    11    1
## 306 2022    11    2
## 307 2022    11    3
## 308 2022    11    4
## 309 2022    11    5
## 310 2022    11    6
## 311 2022    11    7
## 312 2022    11    8
## 313 2022    11    9
## 314 2022    11   10
## 315 2022    11   11
## 316 2022    11   12
## 317 2022    11   13
## 318 2022    11   14
## 319 2022    11   15
## 320 2022    11   16
## 321 2022    11   17
## 322 2022    11   18
## 323 2022    11   19
## 324 2022    11   20
## 325 2022    11   21
## 326 2022    11   22
## 327 2022    11   23
## 328 2022    11   24
## 329 2022    11   25
## 330 2022    11   26
## 331 2022    11   27
## 332 2022    11   28
## 333 2022    11   29
## 334 2022    11   30
## 335 2022    12    1
## 336 2022    12    2
## 337 2022    12    3
## 338 2022    12    4
## 339 2022    12    5
## 340 2022    12    6
## 341 2022    12    7
## 342 2022    12    8
## 343 2022    12    9
## 344 2022    12   10
## 345 2022    12   11
## 346 2022    12   12
## 347 2022    12   13
## 348 2022    12   14
## 349 2022    12   15
## 350 2022    12   16
## 351 2022    12   17
## 352 2022    12   18
## 353 2022    12   19
## 354 2022    12   20
## 355 2022    12   21
## 356 2022    12   22
## 357 2022    12   23
## 358 2022    12   24
## 359 2022    12   25
## 360 2022    12   26
## 361 2022    12   27
## 362 2022    12   28
## 363 2022    12   29
## 364 2022    12   30
## 365 2022    12   31
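
the mutate step above can also be skipped: separate() has a convert argument that runs type conversion on the new columns, so "01" becomes the number 1; a minimal sketch

# convert = TRUE turns the split character columns into numbers automatically
datedf %>% separate(col = date, into = c("year", "month", "date"), sep = "-", convert = TRUE)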

unite

# add leading zeros before uniting the date columns
sptdatedf %>% mutate(month = str_pad(width = 2, side = "left", string = month, pad = 0),
                     date = str_pad(width = 2, string = date, side = "left", pad = 0)) %>% 
        unite(col = "full dates", c(year,month,date), sep = "-", remove = F)
##     full dates year month date
## 1   2022-01-01 2022    01   01
## 2   2022-01-02 2022    01   02
## 3   2022-01-03 2022    01   03
## 4   2022-01-04 2022    01   04
## 5   2022-01-05 2022    01   05
## 6   2022-01-06 2022    01   06
## 7   2022-01-07 2022    01   07
## 8   2022-01-08 2022    01   08
## 9   2022-01-09 2022    01   09
## 10  2022-01-10 2022    01   10
## 11  2022-01-11 2022    01   11
## 12  2022-01-12 2022    01   12
## 13  2022-01-13 2022    01   13
## 14  2022-01-14 2022    01   14
## 15  2022-01-15 2022    01   15
## 16  2022-01-16 2022    01   16
## 17  2022-01-17 2022    01   17
## 18  2022-01-18 2022    01   18
## 19  2022-01-19 2022    01   19
## 20  2022-01-20 2022    01   20
## 21  2022-01-21 2022    01   21
## 22  2022-01-22 2022    01   22
## 23  2022-01-23 2022    01   23
## 24  2022-01-24 2022    01   24
## 25  2022-01-25 2022    01   25
## 26  2022-01-26 2022    01   26
## 27  2022-01-27 2022    01   27
## 28  2022-01-28 2022    01   28
## 29  2022-01-29 2022    01   29
## 30  2022-01-30 2022    01   30
## 31  2022-01-31 2022    01   31
## 32  2022-02-01 2022    02   01
## 33  2022-02-02 2022    02   02
## 34  2022-02-03 2022    02   03
## 35  2022-02-04 2022    02   04
## 36  2022-02-05 2022    02   05
## 37  2022-02-06 2022    02   06
## 38  2022-02-07 2022    02   07
## 39  2022-02-08 2022    02   08
## 40  2022-02-09 2022    02   09
## 41  2022-02-10 2022    02   10
## 42  2022-02-11 2022    02   11
## 43  2022-02-12 2022    02   12
## 44  2022-02-13 2022    02   13
## 45  2022-02-14 2022    02   14
## 46  2022-02-15 2022    02   15
## 47  2022-02-16 2022    02   16
## 48  2022-02-17 2022    02   17
## 49  2022-02-18 2022    02   18
## 50  2022-02-19 2022    02   19
## 51  2022-02-20 2022    02   20
## 52  2022-02-21 2022    02   21
## 53  2022-02-22 2022    02   22
## 54  2022-02-23 2022    02   23
## 55  2022-02-24 2022    02   24
## 56  2022-02-25 2022    02   25
## 57  2022-02-26 2022    02   26
## 58  2022-02-27 2022    02   27
## 59  2022-02-28 2022    02   28
## 60  2022-03-01 2022    03   01
## 61  2022-03-02 2022    03   02
## 62  2022-03-03 2022    03   03
## 63  2022-03-04 2022    03   04
## 64  2022-03-05 2022    03   05
## 65  2022-03-06 2022    03   06
## 66  2022-03-07 2022    03   07
## 67  2022-03-08 2022    03   08
## 68  2022-03-09 2022    03   09
## 69  2022-03-10 2022    03   10
## 70  2022-03-11 2022    03   11
## 71  2022-03-12 2022    03   12
## 72  2022-03-13 2022    03   13
## 73  2022-03-14 2022    03   14
## 74  2022-03-15 2022    03   15
## 75  2022-03-16 2022    03   16
## 76  2022-03-17 2022    03   17
## 77  2022-03-18 2022    03   18
## 78  2022-03-19 2022    03   19
## 79  2022-03-20 2022    03   20
## 80  2022-03-21 2022    03   21
## 81  2022-03-22 2022    03   22
## 82  2022-03-23 2022    03   23
## 83  2022-03-24 2022    03   24
## 84  2022-03-25 2022    03   25
## 85  2022-03-26 2022    03   26
## 86  2022-03-27 2022    03   27
## 87  2022-03-28 2022    03   28
## 88  2022-03-29 2022    03   29
## 89  2022-03-30 2022    03   30
## 90  2022-03-31 2022    03   31
## 91  2022-04-01 2022    04   01
## 92  2022-04-02 2022    04   02
## 93  2022-04-03 2022    04   03
## 94  2022-04-04 2022    04   04
## 95  2022-04-05 2022    04   05
## 96  2022-04-06 2022    04   06
## 97  2022-04-07 2022    04   07
## 98  2022-04-08 2022    04   08
## 99  2022-04-09 2022    04   09
## 100 2022-04-10 2022    04   10
## 101 2022-04-11 2022    04   11
## 102 2022-04-12 2022    04   12
## 103 2022-04-13 2022    04   13
## 104 2022-04-14 2022    04   14
## 105 2022-04-15 2022    04   15
## 106 2022-04-16 2022    04   16
## 107 2022-04-17 2022    04   17
## 108 2022-04-18 2022    04   18
## 109 2022-04-19 2022    04   19
## 110 2022-04-20 2022    04   20
## 111 2022-04-21 2022    04   21
## 112 2022-04-22 2022    04   22
## 113 2022-04-23 2022    04   23
## 114 2022-04-24 2022    04   24
## 115 2022-04-25 2022    04   25
## 116 2022-04-26 2022    04   26
## 117 2022-04-27 2022    04   27
## 118 2022-04-28 2022    04   28
## 119 2022-04-29 2022    04   29
## 120 2022-04-30 2022    04   30
## 121 2022-05-01 2022    05   01
## 122 2022-05-02 2022    05   02
## 123 2022-05-03 2022    05   03
## 124 2022-05-04 2022    05   04
## 125 2022-05-05 2022    05   05
## 126 2022-05-06 2022    05   06
## 127 2022-05-07 2022    05   07
## 128 2022-05-08 2022    05   08
## 129 2022-05-09 2022    05   09
## 130 2022-05-10 2022    05   10
## 131 2022-05-11 2022    05   11
## 132 2022-05-12 2022    05   12
## 133 2022-05-13 2022    05   13
## 134 2022-05-14 2022    05   14
## 135 2022-05-15 2022    05   15
## 136 2022-05-16 2022    05   16
## 137 2022-05-17 2022    05   17
## 138 2022-05-18 2022    05   18
## 139 2022-05-19 2022    05   19
## 140 2022-05-20 2022    05   20
## 141 2022-05-21 2022    05   21
## 142 2022-05-22 2022    05   22
## 143 2022-05-23 2022    05   23
## 144 2022-05-24 2022    05   24
## 145 2022-05-25 2022    05   25
## 146 2022-05-26 2022    05   26
## 147 2022-05-27 2022    05   27
## 148 2022-05-28 2022    05   28
## 149 2022-05-29 2022    05   29
## 150 2022-05-30 2022    05   30
## 151 2022-05-31 2022    05   31
## 152 2022-06-01 2022    06   01
## 153 2022-06-02 2022    06   02
## 154 2022-06-03 2022    06   03
## 155 2022-06-04 2022    06   04
## 156 2022-06-05 2022    06   05
## 157 2022-06-06 2022    06   06
## 158 2022-06-07 2022    06   07
## 159 2022-06-08 2022    06   08
## 160 2022-06-09 2022    06   09
## 161 2022-06-10 2022    06   10
## 162 2022-06-11 2022    06   11
## 163 2022-06-12 2022    06   12
## 164 2022-06-13 2022    06   13
## 165 2022-06-14 2022    06   14
## 166 2022-06-15 2022    06   15
## 167 2022-06-16 2022    06   16
## 168 2022-06-17 2022    06   17
## 169 2022-06-18 2022    06   18
## 170 2022-06-19 2022    06   19
## 171 2022-06-20 2022    06   20
## 172 2022-06-21 2022    06   21
## 173 2022-06-22 2022    06   22
## 174 2022-06-23 2022    06   23
## 175 2022-06-24 2022    06   24
## 176 2022-06-25 2022    06   25
## 177 2022-06-26 2022    06   26
## 178 2022-06-27 2022    06   27
## 179 2022-06-28 2022    06   28
## 180 2022-06-29 2022    06   29
## 181 2022-06-30 2022    06   30
## 182 2022-07-01 2022    07   01
## 183 2022-07-02 2022    07   02
## 184 2022-07-03 2022    07   03
## 185 2022-07-04 2022    07   04
## 186 2022-07-05 2022    07   05
## 187 2022-07-06 2022    07   06
## 188 2022-07-07 2022    07   07
## 189 2022-07-08 2022    07   08
## 190 2022-07-09 2022    07   09
## 191 2022-07-10 2022    07   10
## 192 2022-07-11 2022    07   11
## 193 2022-07-12 2022    07   12
## 194 2022-07-13 2022    07   13
## 195 2022-07-14 2022    07   14
## 196 2022-07-15 2022    07   15
## 197 2022-07-16 2022    07   16
## 198 2022-07-17 2022    07   17
## 199 2022-07-18 2022    07   18
## 200 2022-07-19 2022    07   19
## 201 2022-07-20 2022    07   20
## 202 2022-07-21 2022    07   21
## 203 2022-07-22 2022    07   22
## 204 2022-07-23 2022    07   23
## 205 2022-07-24 2022    07   24
## 206 2022-07-25 2022    07   25
## 207 2022-07-26 2022    07   26
## 208 2022-07-27 2022    07   27
## 209 2022-07-28 2022    07   28
## 210 2022-07-29 2022    07   29
## 211 2022-07-30 2022    07   30
## 212 2022-07-31 2022    07   31
## 213 2022-08-01 2022    08   01
## 214 2022-08-02 2022    08   02
## 215 2022-08-03 2022    08   03
## 216 2022-08-04 2022    08   04
## 217 2022-08-05 2022    08   05
## 218 2022-08-06 2022    08   06
## 219 2022-08-07 2022    08   07
## 220 2022-08-08 2022    08   08
## 221 2022-08-09 2022    08   09
## 222 2022-08-10 2022    08   10
## 223 2022-08-11 2022    08   11
## 224 2022-08-12 2022    08   12
## 225 2022-08-13 2022    08   13
## 226 2022-08-14 2022    08   14
## 227 2022-08-15 2022    08   15
## 228 2022-08-16 2022    08   16
## 229 2022-08-17 2022    08   17
## 230 2022-08-18 2022    08   18
## 231 2022-08-19 2022    08   19
## 232 2022-08-20 2022    08   20
## 233 2022-08-21 2022    08   21
## 234 2022-08-22 2022    08   22
## 235 2022-08-23 2022    08   23
## 236 2022-08-24 2022    08   24
## 237 2022-08-25 2022    08   25
## 238 2022-08-26 2022    08   26
## 239 2022-08-27 2022    08   27
## 240 2022-08-28 2022    08   28
## 241 2022-08-29 2022    08   29
## 242 2022-08-30 2022    08   30
## 243 2022-08-31 2022    08   31
## 244 2022-09-01 2022    09   01
## 245 2022-09-02 2022    09   02
## 246 2022-09-03 2022    09   03
## 247 2022-09-04 2022    09   04
## 248 2022-09-05 2022    09   05
## 249 2022-09-06 2022    09   06
## 250 2022-09-07 2022    09   07
## 251 2022-09-08 2022    09   08
## 252 2022-09-09 2022    09   09
## 253 2022-09-10 2022    09   10
## 254 2022-09-11 2022    09   11
## 255 2022-09-12 2022    09   12
## 256 2022-09-13 2022    09   13
## 257 2022-09-14 2022    09   14
## 258 2022-09-15 2022    09   15
## 259 2022-09-16 2022    09   16
## 260 2022-09-17 2022    09   17
## 261 2022-09-18 2022    09   18
## 262 2022-09-19 2022    09   19
## 263 2022-09-20 2022    09   20
## 264 2022-09-21 2022    09   21
## 265 2022-09-22 2022    09   22
## 266 2022-09-23 2022    09   23
## 267 2022-09-24 2022    09   24
## 268 2022-09-25 2022    09   25
## 269 2022-09-26 2022    09   26
## 270 2022-09-27 2022    09   27
## 271 2022-09-28 2022    09   28
## 272 2022-09-29 2022    09   29
## 273 2022-09-30 2022    09   30
## 274 2022-10-01 2022    10   01
## 275 2022-10-02 2022    10   02
## 276 2022-10-03 2022    10   03
## 277 2022-10-04 2022    10   04
## 278 2022-10-05 2022    10   05
## 279 2022-10-06 2022    10   06
## 280 2022-10-07 2022    10   07
## 281 2022-10-08 2022    10   08
## 282 2022-10-09 2022    10   09
## 283 2022-10-10 2022    10   10
## 284 2022-10-11 2022    10   11
## 285 2022-10-12 2022    10   12
## 286 2022-10-13 2022    10   13
## 287 2022-10-14 2022    10   14
## 288 2022-10-15 2022    10   15
## 289 2022-10-16 2022    10   16
## 290 2022-10-17 2022    10   17
## 291 2022-10-18 2022    10   18
## 292 2022-10-19 2022    10   19
## 293 2022-10-20 2022    10   20
## 294 2022-10-21 2022    10   21
## 295 2022-10-22 2022    10   22
## 296 2022-10-23 2022    10   23
## 297 2022-10-24 2022    10   24
## 298 2022-10-25 2022    10   25
## 299 2022-10-26 2022    10   26
## 300 2022-10-27 2022    10   27
## 301 2022-10-28 2022    10   28
## 302 2022-10-29 2022    10   29
## 303 2022-10-30 2022    10   30
## 304 2022-10-31 2022    10   31
## 305 2022-11-01 2022    11   01
## 306 2022-11-02 2022    11   02
## 307 2022-11-03 2022    11   03
## 308 2022-11-04 2022    11   04
## 309 2022-11-05 2022    11   05
## 310 2022-11-06 2022    11   06
## 311 2022-11-07 2022    11   07
## 312 2022-11-08 2022    11   08
## 313 2022-11-09 2022    11   09
## 314 2022-11-10 2022    11   10
## 315 2022-11-11 2022    11   11
## 316 2022-11-12 2022    11   12
## 317 2022-11-13 2022    11   13
## 318 2022-11-14 2022    11   14
## 319 2022-11-15 2022    11   15
## 320 2022-11-16 2022    11   16
## 321 2022-11-17 2022    11   17
## 322 2022-11-18 2022    11   18
## 323 2022-11-19 2022    11   19
## 324 2022-11-20 2022    11   20
## 325 2022-11-21 2022    11   21
## 326 2022-11-22 2022    11   22
## 327 2022-11-23 2022    11   23
## 328 2022-11-24 2022    11   24
## 329 2022-11-25 2022    11   25
## 330 2022-11-26 2022    11   26
## 331 2022-11-27 2022    11   27
## 332 2022-11-28 2022    11   28
## 333 2022-11-29 2022    11   29
## 334 2022-11-30 2022    11   30
## 335 2022-12-01 2022    12   01
## 336 2022-12-02 2022    12   02
## 337 2022-12-03 2022    12   03
## 338 2022-12-04 2022    12   04
## 339 2022-12-05 2022    12   05
## 340 2022-12-06 2022    12   06
## 341 2022-12-07 2022    12   07
## 342 2022-12-08 2022    12   08
## 343 2022-12-09 2022    12   09
## 344 2022-12-10 2022    12   10
## 345 2022-12-11 2022    12   11
## 346 2022-12-12 2022    12   12
## 347 2022-12-13 2022    12   13
## 348 2022-12-14 2022    12   14
## 349 2022-12-15 2022    12   15
## 350 2022-12-16 2022    12   16
## 351 2022-12-17 2022    12   17
## 352 2022-12-18 2022    12   18
## 353 2022-12-19 2022    12   19
## 354 2022-12-20 2022    12   20
## 355 2022-12-21 2022    12   21
## 356 2022-12-22 2022    12   22
## 357 2022-12-23 2022    12   23
## 358 2022-12-24 2022    12   24
## 359 2022-12-25 2022    12   25
## 360 2022-12-26 2022    12   26
## 361 2022-12-27 2022    12   27
## 362 2022-12-28 2022    12   28
## 363 2022-12-29 2022    12   29
## 364 2022-12-30 2022    12   30
## 365 2022-12-31 2022    12   31
# another way to add leading zeros, using mutate_at
sptdatedf %>% mutate_at(.vars = c("month", "date"),
                        .funs = str_pad, width = 2, side = "left", pad = 0) %>% 
        unite(col = "full date", c("year", "month", "date"), sep = "-", remove = F)
##      full date year month date
## 1   2022-01-01 2022    01   01
## 2   2022-01-02 2022    01   02
## 3   2022-01-03 2022    01   03
## 4   2022-01-04 2022    01   04
## 5   2022-01-05 2022    01   05
## 6   2022-01-06 2022    01   06
## 7   2022-01-07 2022    01   07
## 8   2022-01-08 2022    01   08
## 9   2022-01-09 2022    01   09
## 10  2022-01-10 2022    01   10
## 11  2022-01-11 2022    01   11
## 12  2022-01-12 2022    01   12
## 13  2022-01-13 2022    01   13
## 14  2022-01-14 2022    01   14
## 15  2022-01-15 2022    01   15
## 16  2022-01-16 2022    01   16
## 17  2022-01-17 2022    01   17
## 18  2022-01-18 2022    01   18
## 19  2022-01-19 2022    01   19
## 20  2022-01-20 2022    01   20
## 21  2022-01-21 2022    01   21
## 22  2022-01-22 2022    01   22
## 23  2022-01-23 2022    01   23
## 24  2022-01-24 2022    01   24
## 25  2022-01-25 2022    01   25
## 26  2022-01-26 2022    01   26
## 27  2022-01-27 2022    01   27
## 28  2022-01-28 2022    01   28
## 29  2022-01-29 2022    01   29
## 30  2022-01-30 2022    01   30
## 31  2022-01-31 2022    01   31
## 32  2022-02-01 2022    02   01
## 33  2022-02-02 2022    02   02
## 34  2022-02-03 2022    02   03
## 35  2022-02-04 2022    02   04
## 36  2022-02-05 2022    02   05
## 37  2022-02-06 2022    02   06
## 38  2022-02-07 2022    02   07
## 39  2022-02-08 2022    02   08
## 40  2022-02-09 2022    02   09
## 41  2022-02-10 2022    02   10
## 42  2022-02-11 2022    02   11
## 43  2022-02-12 2022    02   12
## 44  2022-02-13 2022    02   13
## 45  2022-02-14 2022    02   14
## 46  2022-02-15 2022    02   15
## 47  2022-02-16 2022    02   16
## 48  2022-02-17 2022    02   17
## 49  2022-02-18 2022    02   18
## 50  2022-02-19 2022    02   19
## 51  2022-02-20 2022    02   20
## 52  2022-02-21 2022    02   21
## 53  2022-02-22 2022    02   22
## 54  2022-02-23 2022    02   23
## 55  2022-02-24 2022    02   24
## 56  2022-02-25 2022    02   25
## 57  2022-02-26 2022    02   26
## 58  2022-02-27 2022    02   27
## 59  2022-02-28 2022    02   28
## 60  2022-03-01 2022    03   01
## 61  2022-03-02 2022    03   02
## 62  2022-03-03 2022    03   03
## 63  2022-03-04 2022    03   04
## 64  2022-03-05 2022    03   05
## 65  2022-03-06 2022    03   06
## 66  2022-03-07 2022    03   07
## 67  2022-03-08 2022    03   08
## 68  2022-03-09 2022    03   09
## 69  2022-03-10 2022    03   10
## 70  2022-03-11 2022    03   11
## 71  2022-03-12 2022    03   12
## 72  2022-03-13 2022    03   13
## 73  2022-03-14 2022    03   14
## 74  2022-03-15 2022    03   15
## 75  2022-03-16 2022    03   16
## 76  2022-03-17 2022    03   17
## 77  2022-03-18 2022    03   18
## 78  2022-03-19 2022    03   19
## 79  2022-03-20 2022    03   20
## 80  2022-03-21 2022    03   21
## 81  2022-03-22 2022    03   22
## 82  2022-03-23 2022    03   23
## 83  2022-03-24 2022    03   24
## 84  2022-03-25 2022    03   25
## 85  2022-03-26 2022    03   26
## 86  2022-03-27 2022    03   27
## 87  2022-03-28 2022    03   28
## 88  2022-03-29 2022    03   29
## 89  2022-03-30 2022    03   30
## 90  2022-03-31 2022    03   31
## 91  2022-04-01 2022    04   01
## 92  2022-04-02 2022    04   02
## 93  2022-04-03 2022    04   03
## 94  2022-04-04 2022    04   04
## 95  2022-04-05 2022    04   05
## 96  2022-04-06 2022    04   06
## 97  2022-04-07 2022    04   07
## 98  2022-04-08 2022    04   08
## 99  2022-04-09 2022    04   09
## 100 2022-04-10 2022    04   10
## 101 2022-04-11 2022    04   11
## 102 2022-04-12 2022    04   12
## 103 2022-04-13 2022    04   13
## 104 2022-04-14 2022    04   14
## 105 2022-04-15 2022    04   15
## 106 2022-04-16 2022    04   16
## 107 2022-04-17 2022    04   17
## 108 2022-04-18 2022    04   18
## 109 2022-04-19 2022    04   19
## 110 2022-04-20 2022    04   20
## 111 2022-04-21 2022    04   21
## 112 2022-04-22 2022    04   22
## 113 2022-04-23 2022    04   23
## 114 2022-04-24 2022    04   24
## 115 2022-04-25 2022    04   25
## 116 2022-04-26 2022    04   26
## 117 2022-04-27 2022    04   27
## 118 2022-04-28 2022    04   28
## 119 2022-04-29 2022    04   29
## 120 2022-04-30 2022    04   30
## 121 2022-05-01 2022    05   01
## 122 2022-05-02 2022    05   02
## 123 2022-05-03 2022    05   03
## 124 2022-05-04 2022    05   04
## 125 2022-05-05 2022    05   05
## 126 2022-05-06 2022    05   06
## 127 2022-05-07 2022    05   07
## 128 2022-05-08 2022    05   08
## 129 2022-05-09 2022    05   09
## 130 2022-05-10 2022    05   10
## 131 2022-05-11 2022    05   11
## 132 2022-05-12 2022    05   12
## 133 2022-05-13 2022    05   13
## 134 2022-05-14 2022    05   14
## 135 2022-05-15 2022    05   15
## 136 2022-05-16 2022    05   16
## 137 2022-05-17 2022    05   17
## 138 2022-05-18 2022    05   18
## 139 2022-05-19 2022    05   19
## 140 2022-05-20 2022    05   20
## 141 2022-05-21 2022    05   21
## 142 2022-05-22 2022    05   22
## 143 2022-05-23 2022    05   23
## 144 2022-05-24 2022    05   24
## 145 2022-05-25 2022    05   25
## 146 2022-05-26 2022    05   26
## 147 2022-05-27 2022    05   27
## 148 2022-05-28 2022    05   28
## 149 2022-05-29 2022    05   29
## 150 2022-05-30 2022    05   30
## 151 2022-05-31 2022    05   31
## 152 2022-06-01 2022    06   01
## 153 2022-06-02 2022    06   02
## 154 2022-06-03 2022    06   03
## 155 2022-06-04 2022    06   04
## 156 2022-06-05 2022    06   05
## 157 2022-06-06 2022    06   06
## 158 2022-06-07 2022    06   07
## 159 2022-06-08 2022    06   08
## 160 2022-06-09 2022    06   09
## 161 2022-06-10 2022    06   10
## 162 2022-06-11 2022    06   11
## 163 2022-06-12 2022    06   12
## 164 2022-06-13 2022    06   13
## 165 2022-06-14 2022    06   14
## 166 2022-06-15 2022    06   15
## 167 2022-06-16 2022    06   16
## 168 2022-06-17 2022    06   17
## 169 2022-06-18 2022    06   18
## 170 2022-06-19 2022    06   19
## 171 2022-06-20 2022    06   20
## 172 2022-06-21 2022    06   21
## 173 2022-06-22 2022    06   22
## 174 2022-06-23 2022    06   23
## 175 2022-06-24 2022    06   24
## 176 2022-06-25 2022    06   25
## 177 2022-06-26 2022    06   26
## 178 2022-06-27 2022    06   27
## 179 2022-06-28 2022    06   28
## 180 2022-06-29 2022    06   29
## 181 2022-06-30 2022    06   30
## 182 2022-07-01 2022    07   01
## 183 2022-07-02 2022    07   02
## 184 2022-07-03 2022    07   03
## 185 2022-07-04 2022    07   04
## 186 2022-07-05 2022    07   05
## 187 2022-07-06 2022    07   06
## 188 2022-07-07 2022    07   07
## 189 2022-07-08 2022    07   08
## 190 2022-07-09 2022    07   09
## 191 2022-07-10 2022    07   10
## 192 2022-07-11 2022    07   11
## 193 2022-07-12 2022    07   12
## 194 2022-07-13 2022    07   13
## 195 2022-07-14 2022    07   14
## 196 2022-07-15 2022    07   15
## 197 2022-07-16 2022    07   16
## 198 2022-07-17 2022    07   17
## 199 2022-07-18 2022    07   18
## 200 2022-07-19 2022    07   19
## 201 2022-07-20 2022    07   20
## 202 2022-07-21 2022    07   21
## 203 2022-07-22 2022    07   22
## 204 2022-07-23 2022    07   23
## 205 2022-07-24 2022    07   24
## 206 2022-07-25 2022    07   25
## 207 2022-07-26 2022    07   26
## 208 2022-07-27 2022    07   27
## 209 2022-07-28 2022    07   28
## 210 2022-07-29 2022    07   29
## 211 2022-07-30 2022    07   30
## 212 2022-07-31 2022    07   31
## 213 2022-08-01 2022    08   01
## 214 2022-08-02 2022    08   02
## 215 2022-08-03 2022    08   03
## 216 2022-08-04 2022    08   04
## 217 2022-08-05 2022    08   05
## 218 2022-08-06 2022    08   06
## 219 2022-08-07 2022    08   07
## 220 2022-08-08 2022    08   08
## 221 2022-08-09 2022    08   09
## 222 2022-08-10 2022    08   10
## 223 2022-08-11 2022    08   11
## 224 2022-08-12 2022    08   12
## 225 2022-08-13 2022    08   13
## 226 2022-08-14 2022    08   14
## 227 2022-08-15 2022    08   15
## 228 2022-08-16 2022    08   16
## 229 2022-08-17 2022    08   17
## 230 2022-08-18 2022    08   18
## 231 2022-08-19 2022    08   19
## 232 2022-08-20 2022    08   20
## 233 2022-08-21 2022    08   21
## 234 2022-08-22 2022    08   22
## 235 2022-08-23 2022    08   23
## 236 2022-08-24 2022    08   24
## 237 2022-08-25 2022    08   25
## 238 2022-08-26 2022    08   26
## 239 2022-08-27 2022    08   27
## 240 2022-08-28 2022    08   28
## 241 2022-08-29 2022    08   29
## 242 2022-08-30 2022    08   30
## 243 2022-08-31 2022    08   31
## 244 2022-09-01 2022    09   01
## 245 2022-09-02 2022    09   02
## 246 2022-09-03 2022    09   03
## 247 2022-09-04 2022    09   04
## 248 2022-09-05 2022    09   05
## 249 2022-09-06 2022    09   06
## 250 2022-09-07 2022    09   07
## 251 2022-09-08 2022    09   08
## 252 2022-09-09 2022    09   09
## 253 2022-09-10 2022    09   10
## 254 2022-09-11 2022    09   11
## 255 2022-09-12 2022    09   12
## 256 2022-09-13 2022    09   13
## 257 2022-09-14 2022    09   14
## 258 2022-09-15 2022    09   15
## 259 2022-09-16 2022    09   16
## 260 2022-09-17 2022    09   17
## 261 2022-09-18 2022    09   18
## 262 2022-09-19 2022    09   19
## 263 2022-09-20 2022    09   20
## 264 2022-09-21 2022    09   21
## 265 2022-09-22 2022    09   22
## 266 2022-09-23 2022    09   23
## 267 2022-09-24 2022    09   24
## 268 2022-09-25 2022    09   25
## 269 2022-09-26 2022    09   26
## 270 2022-09-27 2022    09   27
## 271 2022-09-28 2022    09   28
## 272 2022-09-29 2022    09   29
## 273 2022-09-30 2022    09   30
## 274 2022-10-01 2022    10   01
## 275 2022-10-02 2022    10   02
## 276 2022-10-03 2022    10   03
## 277 2022-10-04 2022    10   04
## 278 2022-10-05 2022    10   05
## 279 2022-10-06 2022    10   06
## 280 2022-10-07 2022    10   07
## 281 2022-10-08 2022    10   08
## 282 2022-10-09 2022    10   09
## 283 2022-10-10 2022    10   10
## 284 2022-10-11 2022    10   11
## 285 2022-10-12 2022    10   12
## 286 2022-10-13 2022    10   13
## 287 2022-10-14 2022    10   14
## 288 2022-10-15 2022    10   15
## 289 2022-10-16 2022    10   16
## 290 2022-10-17 2022    10   17
## 291 2022-10-18 2022    10   18
## 292 2022-10-19 2022    10   19
## 293 2022-10-20 2022    10   20
## 294 2022-10-21 2022    10   21
## 295 2022-10-22 2022    10   22
## 296 2022-10-23 2022    10   23
## 297 2022-10-24 2022    10   24
## 298 2022-10-25 2022    10   25
## 299 2022-10-26 2022    10   26
## 300 2022-10-27 2022    10   27
## 301 2022-10-28 2022    10   28
## 302 2022-10-29 2022    10   29
## 303 2022-10-30 2022    10   30
## 304 2022-10-31 2022    10   31
## 305 2022-11-01 2022    11   01
## 306 2022-11-02 2022    11   02
## 307 2022-11-03 2022    11   03
## 308 2022-11-04 2022    11   04
## 309 2022-11-05 2022    11   05
## 310 2022-11-06 2022    11   06
## 311 2022-11-07 2022    11   07
## 312 2022-11-08 2022    11   08
## 313 2022-11-09 2022    11   09
## 314 2022-11-10 2022    11   10
## 315 2022-11-11 2022    11   11
## 316 2022-11-12 2022    11   12
## 317 2022-11-13 2022    11   13
## 318 2022-11-14 2022    11   14
## 319 2022-11-15 2022    11   15
## 320 2022-11-16 2022    11   16
## 321 2022-11-17 2022    11   17
## 322 2022-11-18 2022    11   18
## 323 2022-11-19 2022    11   19
## 324 2022-11-20 2022    11   20
## 325 2022-11-21 2022    11   21
## 326 2022-11-22 2022    11   22
## 327 2022-11-23 2022    11   23
## 328 2022-11-24 2022    11   24
## 329 2022-11-25 2022    11   25
## 330 2022-11-26 2022    11   26
## 331 2022-11-27 2022    11   27
## 332 2022-11-28 2022    11   28
## 333 2022-11-29 2022    11   29
## 334 2022-11-30 2022    11   30
## 335 2022-12-01 2022    12   01
## 336 2022-12-02 2022    12   02
## 337 2022-12-03 2022    12   03
## 338 2022-12-04 2022    12   04
## 339 2022-12-05 2022    12   05
## 340 2022-12-06 2022    12   06
## 341 2022-12-07 2022    12   07
## 342 2022-12-08 2022    12   08
## 343 2022-12-09 2022    12   09
## 344 2022-12-10 2022    12   10
## 345 2022-12-11 2022    12   11
## 346 2022-12-12 2022    12   12
## 347 2022-12-13 2022    12   13
## 348 2022-12-14 2022    12   14
## 349 2022-12-15 2022    12   15
## 350 2022-12-16 2022    12   16
## 351 2022-12-17 2022    12   17
## 352 2022-12-18 2022    12   18
## 353 2022-12-19 2022    12   19
## 354 2022-12-20 2022    12   20
## 355 2022-12-21 2022    12   21
## 356 2022-12-22 2022    12   22
## 357 2022-12-23 2022    12   23
## 358 2022-12-24 2022    12   24
## 359 2022-12-25 2022    12   25
## 360 2022-12-26 2022    12   26
## 361 2022-12-27 2022    12   27
## 362 2022-12-28 2022    12   28
## 363 2022-12-29 2022    12   29
## 364 2022-12-30 2022    12   30
## 365 2022-12-31 2022    12   31

pull()

extract column values as a vector

df %>% pull(hwy)
##   [1] 29 29 31 30 26 26 27 26 25 28 27 25 25 25 25 24 25 23 20 15 20 17 17 26 23
##  [26] 26 25 24 19 14 15 17 27 30 26 29 26 24 24 22 22 24 24 17 22 21 23 23 19 18
##  [51] 17 17 19 19 12 17 15 17 17 12 17 16 18 15 16 12 17 17 16 12 15 16 17 15 17
##  [76] 17 18 17 19 17 19 19 17 17 17 16 16 17 15 17 26 25 26 24 21 22 23 22 20 33
## [101] 32 32 29 32 34 36 36 29 26 27 30 31 26 26 28 26 29 28 27 24 24 24 22 19 20
## [126] 17 12 19 18 14 15 18 18 15 17 16 18 17 19 19 17 29 27 31 32 27 26 26 25 25
## [151] 17 17 20 18 26 26 27 28 25 25 24 27 25 26 23 26 26 26 26 25 27 25 27 20 20
## [176] 19 17 20 17 29 27 31 31 26 26 28 27 29 31 31 26 26 27 30 33 35 37 35 15 18
## [201] 20 20 22 17 19 18 20 29 26 29 29 24 44 29 26 29 29 29 29 23 24 44 41 29 26
## [226] 28 29 29 29 28 29 26 26 26
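
pull() is especially handy at the end of a pipeline, when a plain vector is needed to feed into a base function; a minimal sketch:

df %>% filter(manufacturer == "audi") %>% pull(hwy) %>% mean()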

combining group_by and mutate

df %>% group_by(manufacturer, model) %>% mutate(`avg hwy` = mean(hwy))
## # A tibble: 234 × 15
## # Groups:   manufacturer, model [38]
##    manufac…¹ model displ  year   cyl trans drv     cty   hwy fl    class avg m…²
##    <chr>     <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>   <dbl>
##  1 audi      a4      1.8  1999     4 auto… f        18    29 p     comp…    23.5
##  2 audi      a4      1.8  1999     4 manu… f        21    29 p     comp…    25  
##  3 audi      a4      2    2008     4 manu… f        20    31 p     comp…    25.5
##  4 audi      a4      2    2008     4 auto… f        21    30 p     comp…    25.5
##  5 audi      a4      2.8  1999     6 auto… f        16    26 p     comp…    21  
##  6 audi      a4      2.8  1999     6 manu… f        18    26 p     comp…    22  
##  7 audi      a4      3.1  2008     6 auto… f        18    27 p     comp…    22.5
##  8 audi      a4 q…   1.8  1999     4 manu… 4        18    26 p     comp…    22  
##  9 audi      a4 q…   1.8  1999     4 auto… 4        16    25 p     comp…    20.5
## 10 audi      a4 q…   2    2008     4 manu… 4        20    28 p     comp…    24  
## # … with 224 more rows, 3 more variables: car <chr>, `cyl and trans` <chr>,
## #   `avg hwy` <dbl>, and abbreviated variable names ¹​manufacturer,
## #   ²​`avg mile per galon`
# the difference between group_by + mutate and group_by + summarise is that the first
# adds the summarised value as a column to every corresponding row, while the second
# collapses each group into a single summarised row without keeping the original rows.

df %>% group_by(manufacturer, model) %>% summarise(mean(hwy))
## `summarise()` has grouped output by 'manufacturer'. You can override using the
## `.groups` argument.
## # A tibble: 38 × 3
## # Groups:   manufacturer [15]
##    manufacturer model              `mean(hwy)`
##    <chr>        <chr>                    <dbl>
##  1 audi         a4                        28.3
##  2 audi         a4 quattro                25.8
##  3 audi         a6 quattro                24  
##  4 chevrolet    c1500 suburban 2wd        17.8
##  5 chevrolet    corvette                  24.8
##  6 chevrolet    k1500 tahoe 4wd           16.2
##  7 chevrolet    malibu                    27.6
##  8 dodge        caravan 2wd               22.4
##  9 dodge        dakota pickup 4wd         17  
## 10 dodge        durango 4wd               16  
## # … with 28 more rows
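
one common reason to keep the per-group summary as a column is to compare each row against its own group; a small sketch flagging cars that beat their model's average highway mileage:

df %>% group_by(manufacturer, model) %>%
        mutate(`avg hwy` = mean(hwy),
               `above avg` = hwy > `avg hwy`) %>%
        ungroup()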

case_when()

just like case when in SQL

# data.table is loaded to use its %like% operator, similar to the LIKE clause in SQL
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## The following object is masked from 'package:purrr':
## 
##     transpose
# create a column identifying the transmission type:
# automatic is coded as AT and manual as MT
df1 <- df %>% mutate(`trans type` = case_when(
        trans %like% "auto.*" ~ "AT",
        trans %like% "manual.*" ~ "MT",
        TRUE ~ "NA")
)
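
a tidyverse-only alternative (no data.table needed) uses stringr::str_detect() for the pattern match; a sketch, noting that NA_character_ yields a real missing value, whereas "NA" above is just the literal string:

df %>% mutate(`trans type` = case_when(
        str_detect(trans, "auto") ~ "AT",
        str_detect(trans, "manual") ~ "MT",
        TRUE ~ NA_character_)
)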

let’s play

df2 <- df1 %>% count(manufacturer, `trans type`) # long format
# convert to wide format
df2 %>% pivot_wider(names_from = `trans type`, values_from = n) # this result will be similar
## # A tibble: 15 × 3
##    manufacturer    AT    MT
##    <chr>        <int> <int>
##  1 audi            11     7
##  2 chevrolet       16     3
##  3 dodge           30     7
##  4 ford            17     8
##  5 honda            4     5
##  6 hyundai          7     7
##  7 jeep             8    NA
##  8 land rover       4    NA
##  9 lincoln          3    NA
## 10 mercury          4    NA
## 11 nissan           8     5
## 12 pontiac          5    NA
## 13 subaru           7     7
## 14 toyota          20    14
## 15 volkswagen      13    14
# to this syntax:
table(df1$manufacturer, df1$`trans type`)
##             
##              AT MT
##   audi       11  7
##   chevrolet  16  3
##   dodge      30  7
##   ford       17  8
##   honda       4  5
##   hyundai     7  7
##   jeep        8  0
##   land rover  4  0
##   lincoln     3  0
##   mercury     4  0
##   nissan      8  5
##   pontiac     5  0
##   subaru      7  7
##   toyota     20 14
##   volkswagen 13 14
# but table() shows missing combinations as a numeric 0, whereas pivot_wider() leaves them as NA by default
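
to make pivot_wider() behave like table() here, the missing combinations can be filled with 0 via values_fill (available in recent tidyr versions):

df2 %>% pivot_wider(names_from = `trans type`, values_from = n, values_fill = 0)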

row_number()

assign a sequential number to each row

# adding id to car data
df %>% mutate(`car id` = row_number()) %>% print(n = 20)
## # A tibble: 234 × 15
##    manufac…¹ model displ  year   cyl trans drv     cty   hwy fl    class avg m…²
##    <chr>     <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>   <dbl>
##  1 audi      a4      1.8  1999     4 auto… f        18    29 p     comp…    23.5
##  2 audi      a4      1.8  1999     4 manu… f        21    29 p     comp…    25  
##  3 audi      a4      2    2008     4 manu… f        20    31 p     comp…    25.5
##  4 audi      a4      2    2008     4 auto… f        21    30 p     comp…    25.5
##  5 audi      a4      2.8  1999     6 auto… f        16    26 p     comp…    21  
##  6 audi      a4      2.8  1999     6 manu… f        18    26 p     comp…    22  
##  7 audi      a4      3.1  2008     6 auto… f        18    27 p     comp…    22.5
##  8 audi      a4 q…   1.8  1999     4 manu… 4        18    26 p     comp…    22  
##  9 audi      a4 q…   1.8  1999     4 auto… 4        16    25 p     comp…    20.5
## 10 audi      a4 q…   2    2008     4 manu… 4        20    28 p     comp…    24  
## 11 audi      a4 q…   2    2008     4 auto… 4        19    27 p     comp…    23  
## 12 audi      a4 q…   2.8  1999     6 auto… 4        15    25 p     comp…    20  
## 13 audi      a4 q…   2.8  1999     6 manu… 4        17    25 p     comp…    21  
## 14 audi      a4 q…   3.1  2008     6 auto… 4        17    25 p     comp…    21  
## 15 audi      a4 q…   3.1  2008     6 manu… 4        15    25 p     comp…    20  
## 16 audi      a6 q…   2.8  1999     6 auto… 4        15    24 p     mids…    19.5
## 17 audi      a6 q…   3.1  2008     6 auto… 4        17    25 p     mids…    21  
## 18 audi      a6 q…   4.2  2008     8 auto… 4        16    23 p     mids…    19.5
## 19 chevrolet c150…   5.3  2008     8 auto… r        14    20 r     suv      17  
## 20 chevrolet c150…   5.3  2008     8 auto… r        11    15 e     suv      13  
## # … with 214 more rows, 3 more variables: car <chr>, `cyl and trans` <chr>,
## #   `car id` <int>, and abbreviated variable names ¹​manufacturer,
## #   ²​`avg mile per galon`

create an id within each group

# this creates an incrementing id within each group
df %>% group_by(model) %>% mutate(`mod id` = row_number())
## # A tibble: 234 × 15
## # Groups:   model [38]
##    manufac…¹ model displ  year   cyl trans drv     cty   hwy fl    class avg m…²
##    <chr>     <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>   <dbl>
##  1 audi      a4      1.8  1999     4 auto… f        18    29 p     comp…    23.5
##  2 audi      a4      1.8  1999     4 manu… f        21    29 p     comp…    25  
##  3 audi      a4      2    2008     4 manu… f        20    31 p     comp…    25.5
##  4 audi      a4      2    2008     4 auto… f        21    30 p     comp…    25.5
##  5 audi      a4      2.8  1999     6 auto… f        16    26 p     comp…    21  
##  6 audi      a4      2.8  1999     6 manu… f        18    26 p     comp…    22  
##  7 audi      a4      3.1  2008     6 auto… f        18    27 p     comp…    22.5
##  8 audi      a4 q…   1.8  1999     4 manu… 4        18    26 p     comp…    22  
##  9 audi      a4 q…   1.8  1999     4 auto… 4        16    25 p     comp…    20.5
## 10 audi      a4 q…   2    2008     4 manu… 4        20    28 p     comp…    24  
## # … with 224 more rows, 3 more variables: car <chr>, `cyl and trans` <chr>,
## #   `mod id` <int>, and abbreviated variable names ¹​manufacturer,
## #   ²​`avg mile per galon`
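
combined with arrange(), row_number() also works as a rank; a small sketch ranking cars within each model from highest to lowest highway mileage:

df %>% group_by(model) %>%
        arrange(desc(hwy), .by_group = TRUE) %>%
        mutate(`hwy rank` = row_number()) %>%
        ungroup()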

mutate FAMILY

mutate_all() applies a function to all columns
mutate_if() applies a function to the columns that meet a condition
mutate_at() applies a function to specific columns
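
note that since dplyr 1.0 these three verbs are superseded by across(); a minimal sketch of the equivalents, still using the mpg data in df:

df %>% mutate(across(everything(), as.character))   # like mutate_all()
df %>% mutate(across(where(is.numeric), ~ . * 2))   # like mutate_if()
df %>% mutate(across(c(cty, hwy), round))           # like mutate_at()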

let’s put dplyr and tidyr into action

load data

# install.packages("hflights")
library(hflights)

df <- hflights
class(df)
## [1] "data.frame"

getting acquainted with the data

glimpse(df)
## Rows: 227,496
## Columns: 21
## $ Year              <int> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011…
## $ Month             <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ DayofMonth        <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1…
## $ DayOfWeek         <int> 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2…
## $ DepTime           <int> 1400, 1401, 1352, 1403, 1405, 1359, 1359, 1355, 1443…
## $ ArrTime           <int> 1500, 1501, 1502, 1513, 1507, 1503, 1509, 1454, 1554…
## $ UniqueCarrier     <chr> "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA"…
## $ FlightNum         <int> 428, 428, 428, 428, 428, 428, 428, 428, 428, 428, 42…
## $ TailNum           <chr> "N576AA", "N557AA", "N541AA", "N403AA", "N492AA", "N…
## $ ActualElapsedTime <int> 60, 60, 70, 70, 62, 64, 70, 59, 71, 70, 70, 56, 63, …
## $ AirTime           <int> 40, 45, 48, 39, 44, 45, 43, 40, 41, 45, 42, 41, 44, …
## $ ArrDelay          <int> -10, -9, -8, 3, -3, -7, -1, -16, 44, 43, 29, 5, -9, …
## $ DepDelay          <int> 0, 1, -8, 3, 5, -1, -1, -5, 43, 43, 29, 19, -2, -3, …
## $ Origin            <chr> "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "IA…
## $ Dest              <chr> "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "DF…
## $ Distance          <int> 224, 224, 224, 224, 224, 224, 224, 224, 224, 224, 22…
## $ TaxiIn            <int> 7, 6, 5, 9, 9, 6, 12, 7, 8, 6, 8, 4, 6, 5, 6, 12, 8,…
## $ TaxiOut           <int> 13, 9, 17, 22, 9, 13, 15, 12, 22, 19, 20, 11, 13, 15…
## $ Cancelled         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ CancellationCode  <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", …
## $ Diverted          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…

there are more than 200 thousand records

Checking null values

# check globally
anyNA(df) # or using
## [1] TRUE
any(is.na(df))
## [1] TRUE
# find the total number of missing values in each column
colSums(is.na(df))
##              Year             Month        DayofMonth         DayOfWeek 
##                 0                 0                 0                 0 
##           DepTime           ArrTime     UniqueCarrier         FlightNum 
##              2905              3066                 0                 0 
##           TailNum ActualElapsedTime           AirTime          ArrDelay 
##                 0              3622              3622              3622 
##          DepDelay            Origin              Dest          Distance 
##              2905                 0                 0                 0 
##            TaxiIn           TaxiOut         Cancelled  CancellationCode 
##              3066              2947                 0                 0 
##          Diverted 
##                 0
# find only the columns (and their positions) that contain missing values
which(colSums(is.na(df))>0)
##           DepTime           ArrTime ActualElapsedTime           AirTime 
##                 5                 6                10                11 
##          ArrDelay          DepDelay            TaxiIn           TaxiOut 
##                12                13                17                18
# display the names of the columns that contain missing values
names(which(colSums(is.na(df))>0))
## [1] "DepTime"           "ArrTime"           "ActualElapsedTime"
## [4] "AirTime"           "ArrDelay"          "DepDelay"         
## [7] "TaxiIn"            "TaxiOut"

another way to identify columns with null values

colnames(df)[colSums(is.na(df))>0]
## [1] "DepTime"           "ArrTime"           "ActualElapsedTime"
## [4] "AirTime"           "ArrDelay"          "DepDelay"         
## [7] "TaxiIn"            "TaxiOut"

or using apply

# apply(df, 2, anyNA)

# which(apply(df, 2, anyNA))

names(which(apply(df, 2, anyNA)))
## [1] "DepTime"           "ArrTime"           "ActualElapsedTime"
## [4] "AirTime"           "ArrDelay"          "DepDelay"         
## [7] "TaxiIn"            "TaxiOut"

but the third way takes longer to retrieve the results, since apply() first coerces the whole data frame to a matrix

place the list of columns that contain null values in a data frame

NullCol <- data.frame(col = names(which(colSums(is.na(df))>0)))
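
an equivalent tidyverse sketch counts the missing values per column with across() and keeps only the columns that have any:

df %>% summarise(across(everything(), ~ sum(is.na(.)))) %>%
        pivot_longer(everything(), names_to = "col", values_to = "n_missing") %>%
        filter(n_missing > 0)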

adjust the format of each column to ensure data integrity

at first glance we can see that some columns are not formatted properly

glimpse(df)
## Rows: 227,496
## Columns: 21
## $ Year              <int> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011…
## $ Month             <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ DayofMonth        <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1…
## $ DayOfWeek         <int> 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2…
## $ DepTime           <int> 1400, 1401, 1352, 1403, 1405, 1359, 1359, 1355, 1443…
## $ ArrTime           <int> 1500, 1501, 1502, 1513, 1507, 1503, 1509, 1454, 1554…
## $ UniqueCarrier     <chr> "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AA"…
## $ FlightNum         <int> 428, 428, 428, 428, 428, 428, 428, 428, 428, 428, 42…
## $ TailNum           <chr> "N576AA", "N557AA", "N541AA", "N403AA", "N492AA", "N…
## $ ActualElapsedTime <int> 60, 60, 70, 70, 62, 64, 70, 59, 71, 70, 70, 56, 63, …
## $ AirTime           <int> 40, 45, 48, 39, 44, 45, 43, 40, 41, 45, 42, 41, 44, …
## $ ArrDelay          <int> -10, -9, -8, 3, -3, -7, -1, -16, 44, 43, 29, 5, -9, …
## $ DepDelay          <int> 0, 1, -8, 3, 5, -1, -1, -5, 43, 43, 29, 19, -2, -3, …
## $ Origin            <chr> "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "IA…
## $ Dest              <chr> "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "DF…
## $ Distance          <int> 224, 224, 224, 224, 224, 224, 224, 224, 224, 224, 22…
## $ TaxiIn            <int> 7, 6, 5, 9, 9, 6, 12, 7, 8, 6, 8, 4, 6, 5, 6, 12, 8,…
## $ TaxiOut           <int> 13, 9, 17, 22, 9, 13, 15, 12, 22, 19, 20, 11, 13, 15…
## $ Cancelled         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ CancellationCode  <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", …
## $ Diverted          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…

the year, month, and day columns should be stored as a date, not as integers.
before formatting, we have to merge these columns into a single full-date column

# add leading zeros to column month and dayofmonth
df <- df %>% mutate_at(.vars = c("Month", "DayofMonth"),
                 .funs = str_pad, width = 2, side = "left", pad = 0) %>%
        # merge appropriate columns to create a full dates column
        unite("Dates", c(Year, Month, DayofMonth), sep = "-")

after the full-date column has been generated, we convert it from string to date format

df$Dates <- as.POSIXct(df$Dates, "%Y-%m-%d")
## Warning in strptime(xx, f, tz = tz): unknown timezone '%Y-%m-%d'
## Warning in as.POSIXct.POSIXlt(x): unknown timezone '%Y-%m-%d'
## Warning in strptime(xx, f, tz = tz): unknown timezone '%Y-%m-%d'
## Warning in as.POSIXct.POSIXlt(x): unknown timezone '%Y-%m-%d'
## Warning in strptime(xx, f, tz = tz): unknown timezone '%Y-%m-%d'
## Warning in as.POSIXct.POSIXlt(x): unknown timezone '%Y-%m-%d'
## Warning in strptime(xx, f, tz = tz): unknown timezone '%Y-%m-%d'
## Warning in as.POSIXct.POSIXlt(x): unknown timezone '%Y-%m-%d'
## Warning in strptime(xx, f, tz = tz): unknown timezone '%Y-%m-%d'
## Warning in as.POSIXct.POSIXlt(x): unknown timezone '%Y-%m-%d'
## Warning in strptime(x, f, tz = tz): unknown timezone '%Y-%m-%d'
## Warning in as.POSIXct.POSIXlt(as.POSIXlt(x, tz, ...), tz, ...): unknown timezone
## '%Y-%m-%d'
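
the warnings above appear because the second positional argument of as.POSIXct() is tz, the time zone, so "%Y-%m-%d" is being read as a timezone name; the conversion still succeeds via the default format, but the cleaner call passes the format by name, or uses as.Date() since there is no time-of-day component:

# df$Dates <- as.POSIXct(df$Dates, format = "%Y-%m-%d")
# df$Dates <- as.Date(df$Dates, format = "%Y-%m-%d")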

exploring data

total flights for each carrier

df %>% count(UniqueCarrier) %>% arrange(desc(n))
##    UniqueCarrier     n
## 1             XE 73053
## 2             CO 70032
## 3             WN 45343
## 4             OO 16061
## 5             MQ  4648
## 6             US  4082
## 7             AA  3244
## 8             DL  2641
## 9             EV  2204
## 10            FL  2139
## 11            UA  2072
## 12            F9   838
## 13            B6   695
## 14            AS   365
## 15            YV    79

count the number of cancellations for each carrier

df %>% count(UniqueCarrier, Cancelled) %>% 
        filter(Cancelled == 1) %>% 
        arrange(desc(n))
##    UniqueCarrier Cancelled    n
## 1             XE         1 1132
## 2             WN         1  703
## 3             CO         1  475
## 4             OO         1  224
## 5             MQ         1  135
## 6             EV         1   76
## 7             AA         1   60
## 8             US         1   46
## 9             DL         1   42
## 10            UA         1   34
## 11            FL         1   21
## 12            B6         1   18
## 13            F9         1    6
## 14            YV         1    1

percentage of cancelled flights per carrier

percent_of_cancel <- df %>% count(UniqueCarrier, Cancelled) %>% 
        group_by(UniqueCarrier) %>% 
        mutate(`total flights` = sum(n), 
               percentage = round(n/`total flights`*100, 2)) %>% 
        filter(Cancelled == 1) %>% select(UniqueCarrier, percentage) %>% 
        arrange(desc(percentage)) %>% ungroup()
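
a quick look at the top of the ranking (for the takeaway below):

percent_of_cancel %>% slice_max(percentage, n = 5)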

takeaway:
although XE has the largest number of cancelled flights among the carriers, its cancellation percentage does not rank within the top half of carriers.



count flights per cancellation code for each carrier

numcancode <- df %>% count(UniqueCarrier, CancellationCode)
numcancode %>% pivot_wider(names_from = CancellationCode, values_from = n)

the pivoting process will error because there are empty values in the cancellation code, so we have to recode the empty value first so that the pivoting succeeds

numcancode %>% # recode the empty cancellation code as "0"
        mutate(CancellationCode = case_when(CancellationCode == "" ~ "0",
                                                   TRUE ~ CancellationCode)) %>% 
        # pivoting the table
        pivot_wider(names_from = CancellationCode, values_from = n, values_fill = 0)
## # A tibble: 15 × 6
##    UniqueCarrier   `0`     A     B     C     D
##    <chr>         <int> <int> <int> <int> <int>
##  1 AA             3184    20    29    11     0
##  2 AS              365     0     0     0     0
##  3 B6              677     5    13     0     0
##  4 CO            69557    37   436     2     0
##  5 DL             2599    13    27     2     0
##  6 EV             2128    60    14     2     0
##  7 F9              832     2     4     0     0
##  8 FL             2118     8    12     1     0
##  9 MQ             4513    39    71    25     0
## 10 OO            15837   121    87    15     1
## 11 UA             2038    21    10     3     0
## 12 US             4036    27    17     2     0
## 13 WN            44640   517   181     5     0
## 14 XE            71921   331   751    50     0
## 15 YV               78     1     0     0     0

another way to do the same thing is to use table()

table(df$UniqueCarrier, df$CancellationCode)
##     
##                A     B     C     D
##   AA  3184    20    29    11     0
##   AS   365     0     0     0     0
##   B6   677     5    13     0     0
##   CO 69557    37   436     2     0
##   DL  2599    13    27     2     0
##   EV  2128    60    14     2     0
##   F9   832     2     4     0     0
##   FL  2118     8    12     1     0
##   MQ  4513    39    71    25     0
##   OO 15837   121    87    15     1
##   UA  2038    21    10     3     0
##   US  4036    27    17     2     0
##   WN 44640   517   181     5     0
##   XE 71921   331   751    50     0
##   YV    78     1     0     0     0
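
to compare carriers on a like-for-like basis, the counts can also be turned into row percentages with prop.table(); a small sketch:

round(prop.table(table(df$UniqueCarrier, df$CancellationCode), margin = 1) * 100, 2)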