Jimmy Ng’s Tidyverse Recipe

Code can be found on Jimmy Ng’s GitHub.

Let’s load the tidyverse package and then read a sample data set from my GitHub account; the original dataset comes from Kaggle. We will look at the packages that make up the tidyverse, the functions within “dplyr”, and eventually zoom in on one specific dplyr function, “case_when”, with a sample use case.


Set-up

# load package(s)
library(tidyverse)
library(janitor)
library(kableExtra)
# load data
df <- read.csv("https://raw.githubusercontent.com/myvioletrose/school_of_professional_studies/master/607.%20Data%20Acquisition%20and%20Management/%23%20misc/Mall_Customers.csv") %>% 
        janitor::clean_names(.)  # using janitor::clean_names() to clean up column names
head(df) %>% kable() %>% kable_styling()
customer_id gender age annual_income_k spending_score_1_100
          1 Male    19              15                   39
          2 Male    21              15                   81
          3 Female  20              16                    6
          4 Female  23              16                   77
          5 Female  31              17                   40
          6 Female  22              17                   76

tidyverse packages

# how many packages live under the tidyverse?
tidyverse_packages()
##  [1] "broom"       "cli"         "crayon"      "dplyr"       "dbplyr"     
##  [6] "forcats"     "ggplot2"     "haven"       "hms"         "httr"       
## [11] "jsonlite"    "lubridate"   "magrittr"    "modelr"      "purrr"      
## [16] "readr"       "readxl\n(>=" "reprex"      "rlang"       "rstudioapi" 
## [21] "rvest"       "stringr"     "tibble"      "tidyr"       "xml2"       
## [26] "tidyverse"

dplyr

# how many functions within "dplyr"?
ls(pos = "package:dplyr")
##   [1] "%>%"                "add_count"          "add_count_"        
##   [4] "add_row"            "add_rownames"       "add_tally"         
##   [7] "add_tally_"         "all_equal"          "all_vars"          
##  [10] "anti_join"          "any_vars"           "arrange"           
##  [13] "arrange_"           "arrange_all"        "arrange_at"        
##  [16] "arrange_if"         "as.tbl"             "as.tbl_cube"       
##  [19] "as_data_frame"      "as_tibble"          "auto_copy"         
##  [22] "band_instruments"   "band_instruments2"  "band_members"      
##  [25] "bench_tbls"         "between"            "bind_cols"         
##  [28] "bind_rows"          "case_when"          "changes"           
##  [31] "check_dbplyr"       "coalesce"           "collapse"          
##  [34] "collect"            "combine"            "common_by"         
##  [37] "compare_tbls"       "compare_tbls2"      "compute"           
##  [40] "contains"           "copy_to"            "count"             
##  [43] "count_"             "cumall"             "cumany"            
##  [46] "cume_dist"          "cummean"            "current_vars"      
##  [49] "data_frame"         "data_frame_"        "db_analyze"        
##  [52] "db_begin"           "db_commit"          "db_create_index"   
##  [55] "db_create_indexes"  "db_create_table"    "db_data_type"      
##  [58] "db_desc"            "db_drop_table"      "db_explain"        
##  [61] "db_has_table"       "db_insert_into"     "db_list_tables"    
##  [64] "db_query_fields"    "db_query_rows"      "db_rollback"       
##  [67] "db_save_query"      "db_write_table"     "dense_rank"        
##  [70] "desc"               "dim_desc"           "distinct"          
##  [73] "distinct_"          "do"                 "do_"               
##  [76] "dr_dplyr"           "ends_with"          "enexpr"            
##  [79] "enexprs"            "enquo"              "enquos"            
##  [82] "ensym"              "ensyms"             "eval_tbls"         
##  [85] "eval_tbls2"         "everything"         "explain"           
##  [88] "expr"               "failwith"           "filter"            
##  [91] "filter_"            "filter_all"         "filter_at"         
##  [94] "filter_if"          "first"              "frame_data"        
##  [97] "full_join"          "funs"               "funs_"             
## [100] "glimpse"            "group_by"           "group_by_"         
## [103] "group_by_all"       "group_by_at"        "group_by_if"       
## [106] "group_by_prepare"   "group_indices"      "group_indices_"    
## [109] "group_size"         "group_vars"         "grouped_df"        
## [112] "groups"             "id"                 "ident"             
## [115] "if_else"            "inner_join"         "intersect"         
## [118] "is.grouped_df"      "is.src"             "is.tbl"            
## [121] "is_grouped_df"      "lag"                "last"              
## [124] "lead"               "left_join"          "location"          
## [127] "lst"                "lst_"               "make_tbl"          
## [130] "matches"            "min_rank"           "mutate"            
## [133] "mutate_"            "mutate_all"         "mutate_at"         
## [136] "mutate_each"        "mutate_each_"       "mutate_if"         
## [139] "n"                  "n_distinct"         "n_groups"          
## [142] "na_if"              "nasa"               "near"              
## [145] "nth"                "ntile"              "num_range"         
## [148] "one_of"             "order_by"           "percent_rank"      
## [151] "progress_estimated" "pull"               "quo"               
## [154] "quo_name"           "quos"               "rbind_all"         
## [157] "rbind_list"         "recode"             "recode_factor"     
## [160] "rename"             "rename_"            "rename_all"        
## [163] "rename_at"          "rename_if"          "rename_vars"       
## [166] "rename_vars_"       "right_join"         "row_number"        
## [169] "rowwise"            "same_src"           "sample_frac"       
## [172] "sample_n"           "select"             "select_"           
## [175] "select_all"         "select_at"          "select_if"         
## [178] "select_var"         "select_vars"        "select_vars_"      
## [181] "semi_join"          "setdiff"            "setequal"          
## [184] "show_query"         "slice"              "slice_"            
## [187] "sql"                "sql_escape_ident"   "sql_escape_string" 
## [190] "sql_join"           "sql_select"         "sql_semi_join"     
## [193] "sql_set_op"         "sql_subquery"       "sql_translate_env" 
## [196] "src"                "src_df"             "src_local"         
## [199] "src_mysql"          "src_postgres"       "src_sqlite"        
## [202] "src_tbls"           "starts_with"        "starwars"          
## [205] "storms"             "summarise"          "summarise_"        
## [208] "summarise_all"      "summarise_at"       "summarise_each"    
## [211] "summarise_each_"    "summarise_if"       "summarize"         
## [214] "summarize_"         "summarize_all"      "summarize_at"      
## [217] "summarize_each"     "summarize_each_"    "summarize_if"      
## [220] "sym"                "syms"               "tally"             
## [223] "tally_"             "tbl"                "tbl_cube"          
## [226] "tbl_df"             "tbl_nongroup_vars"  "tbl_sum"           
## [229] "tbl_vars"           "tibble"             "top_n"             
## [232] "transmute"          "transmute_"         "transmute_all"     
## [235] "transmute_at"       "transmute_if"       "tribble"           
## [238] "trunc_mat"          "type_sum"           "ungroup"           
## [241] "union"              "union_all"          "vars"              
## [244] "with_order"         "wrap_dbplyr_obj"

dplyr::case_when

Here’s the official description: “This function allows you to vectorise multiple if_else() statements. It is an R equivalent of the SQL CASE WHEN statement. If no cases match, NA is returned.”
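
For instance, here is a minimal sketch (a toy example, not part of the original recipe) of that fall-through behaviour: any element that matches none of the left-hand conditions comes back as NA unless a TRUE ~ ... catch-all is supplied.

# toy vector: 10 matches neither condition, so it falls through to NA
x <- c(1, 5, 10)
dplyr::case_when(x < 3 ~ "low",
                 x < 8 ~ "medium")
## [1] "low"    "medium" NA

This is why the recipe below finishes with TRUE ~ as.character(income_bucket) as a safety net for any unmatched value.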

# let's create a new variable using case_when by transforming a numeric variable into buckets
# let's look at the distribution - before transformation
hist(df$annual_income_k, main = "Annual Income (before transformation)")

# let's create 10 buckets to represent this numeric variable
buckets <- cut(df$annual_income_k, breaks = 10) %>% levels
buckets
##  [1] "(14.9,27.2]" "(27.2,39.4]" "(39.4,51.6]" "(51.6,63.8]" "(63.8,76]"  
##  [6] "(76,88.2]"   "(88.2,100]"  "(100,113]"   "(113,125]"   "(125,137]"
# create a new categorical variable
df$income_bucket <- cut(df$annual_income_k, breaks = 10)
df <- df %>%
        dplyr::mutate(income_bucket = dplyr::case_when(income_bucket == buckets[1] ~ "income level 0",
                                                       income_bucket == buckets[2] ~ "income level 1",
                                                       income_bucket == buckets[3] ~ "income level 2",
                                                       income_bucket == buckets[4] ~ "income level 3",
                                                       income_bucket == buckets[5] ~ "income level 4",
                                                       income_bucket == buckets[6] ~ "income level 5",
                                                       income_bucket == buckets[7] ~ "income level 6",
                                                       income_bucket == buckets[8] ~ "income level 7",
                                                       income_bucket == buckets[9] ~ "income level 8",
                                                       income_bucket == buckets[10] ~ "income level 9",
                                                       TRUE ~ as.character(income_bucket)))
head(df) %>% kable() %>% kable_styling()
customer_id gender age annual_income_k spending_score_1_100 income_bucket
          1 Male    19              15                   39 income level 0
          2 Male    21              15                   81 income level 0
          3 Female  20              16                    6 income level 0
          4 Female  23              16                   77 income level 0
          5 Female  31              17                   40 income level 0
          6 Female  22              17                   76 income level 0
# let's look at the distribution - after transformation
table(df$income_bucket) %>% barplot(., ylab = "Frequency", main = "Annual Income (after transformation)")
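
As a side note, the ten hard-coded branches above could also be generated in one step. A possible shortcut (the income_levels name below is only for illustration) is to let cut() attach the labels directly:

# sketch: the same labels without writing out ten case_when branches
income_levels <- cut(df$annual_income_k,
                     breaks = 10,
                     labels = paste("income level", 0:9)) %>% 
        as.character()
# should match df$income_bucket, since both use the same ten cut() intervals
# identical(income_levels, df$income_bucket)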


Samantha Deokinanan’s Extensions

I found Jimmy Ng’s tidyverse recipe very helpful, as I had never used dplyr::case_when, so I am extending this discussion with two other dplyr functions, namely dplyr::if_else and dplyr::recode. I selected these functions because I thought they could produce the same output as case_when (particularly recode), but they all have their own differences and uses.


dplyr::if_else

This function is stricter than base::ifelse in that it checks that the true and false values are of the same type, so the type of the output is preserved. if_else works great when a single set of conditions is to be satisfied, but when multiple sets of conditions have to be tested, nested if_else statements become cumbersome and prone to error. The following code highlights an example of nested if_else statements based on what Jimmy Ng demonstrated:

# Reset to df just before Jimmy Ng applied `dplyr::case_when` 
# i.e. create a new categorical variable
df$income_bucket <- cut(df$annual_income_k, breaks = 10)

df1 <- df %>%
        dplyr::mutate(income_bucket = 
                        dplyr::if_else(income_bucket == buckets[1], "income level 0",
                        dplyr::if_else(income_bucket == buckets[2], "income level 1",
                        dplyr::if_else(income_bucket == buckets[3], "income level 2",
                        dplyr::if_else(income_bucket == buckets[4], "income level 3",
                        dplyr::if_else(income_bucket == buckets[5], "income level 4",
                        dplyr::if_else(income_bucket == buckets[6], "income level 5",
                        dplyr::if_else(income_bucket == buckets[7], "income level 6",
                        dplyr::if_else(income_bucket == buckets[8], "income level 7",
                        dplyr::if_else(income_bucket == buckets[9], "income level 8", "income level 9"))))))))))

head(df1) %>% kable() %>% kable_styling()
customer_id gender age annual_income_k spending_score_1_100 income_bucket
          1 Male    19              15                   39 income level 0
          2 Male    21              15                   81 income level 0
          3 Female  20              16                    6 income level 0
          4 Female  23              16                   77 income level 0
          5 Female  31              17                   40 income level 0
          6 Female  22              17                   76 income level 0
str(df1)
## 'data.frame':    200 obs. of  6 variables:
##  $ customer_id         : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ gender              : Factor w/ 2 levels "Female","Male": 2 2 1 1 1 1 1 1 2 1 ...
##  $ age                 : int  19 21 20 23 31 22 35 23 64 30 ...
##  $ annual_income_k     : int  15 15 16 16 17 17 18 18 19 19 ...
##  $ spending_score_1_100: int  39 81 6 77 40 76 6 94 3 72 ...
##  $ income_bucket       : chr  "income level 0" "income level 0" "income level 0" "income level 0" ...

Since buckets was already stored as character, the income_bucket categories were preserved without any errors. If buckets were numeric, it would have to be converted to character beforehand.
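
To make that strictness concrete, here is a small sketch (not from the original post) comparing the two functions on branches of different types:

# base::ifelse() silently coerces: a character `yes` and a numeric `no` both end up as character
ifelse(c(TRUE, FALSE), "yes", 0)
## [1] "yes" "0"
# dplyr::if_else() refuses to mix types and raises an error instead of coercing
# dplyr::if_else(c(TRUE, FALSE), "yes", 0)   # error: true/false branches must share one type

This is the behaviour that keeps the output type of a pipeline predictable.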


dplyr::recode

Another solution is the dplyr::recode function; it is well suited for replacing values, but it does not allow for more complex operations. Let’s apply it to Jimmy Ng’s example:

df2 <- df %>%
        dplyr::mutate(income_bucket = dplyr::recode(income_bucket,
                                                    "(14.9,27.2]" = "income level 0",
                                                    "(27.2,39.4]" = "income level 1",
                                                    "(39.4,51.6]" = "income level 2",
                                                    "(51.6,63.8]" = "income level 3",
                                                    "(63.8,76]" = "income level 4",
                                                    "(76,88.2]" = "income level 5",
                                                    "(88.2,100]" = "income level 6",
                                                    "(100,113]" = "income level 7",
                                                    "(113,125]" = "income level 8",
                                                    "(125,137]" = "income level 9"))

head(df2) %>% kable() %>% kable_styling()
customer_id gender age annual_income_k spending_score_1_100 income_bucket
          1 Male    19              15                   39 income level 0
          2 Male    21              15                   81 income level 0
          3 Female  20              16                    6 income level 0
          4 Female  23              16                   77 income level 0
          5 Female  31              17                   40 income level 0
          6 Female  22              17                   76 income level 0
str(df2)
## 'data.frame':    200 obs. of  6 variables:
##  $ customer_id         : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ gender              : Factor w/ 2 levels "Female","Male": 2 2 1 1 1 1 1 1 2 1 ...
##  $ age                 : int  19 21 20 23 31 22 35 23 64 30 ...
##  $ annual_income_k     : int  15 15 16 16 17 17 18 18 19 19 ...
##  $ spending_score_1_100: int  39 81 6 77 40 76 6 94 3 72 ...
##  $ income_bucket       : Factor w/ 10 levels "income level 0",..: 1 1 1 1 1 1 1 1 1 1 ...

Here, the recoded income_bucket is still stored as a factor: because the input column was a factor (from cut()), recode keeps the factor class and only relabels its values.
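
For completeness, here is a minimal sketch (not part of the original extension) of recode on a plain character vector; the .default argument supplies a fallback for values that are not explicitly listed, much like the final TRUE ~ ... branch in case_when:

# unmatched values ("c" and "z") fall back to .default
x <- c("a", "b", "c", "z")
dplyr::recode(x, a = "apple", b = "banana", .default = "other")
## [1] "apple"  "banana" "other"  "other"

dplyr also offers recode_factor() for when the result should be a factor whose levels follow the order of the replacements.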


Conclusion

In the end, dplyr provides several functions for creating a new value that is conditioned on the result of a test. Choosing the right one for the job can make the transformation much less cumbersome.