Jimmy Ng’s Tidyverse Recipe

Code can be found on Jimmy Ng’s GitHub.

Let’s load the tidyverse package and then read a sample data set from my GitHub account; the original dataset comes from Kaggle. We will look at the packages that make up the tidyverse, the functions within “dplyr”, and eventually zoom in on one specific dplyr function, “case_when”, with a sample use case.


Set-up

# load package(s)
library(tidyverse)
library(janitor)
library(kableExtra)
# load data
df <- read.csv("https://raw.githubusercontent.com/myvioletrose/school_of_professional_studies/master/607.%20Data%20Acquisition%20and%20Management/%23%20misc/Mall_Customers.csv") %>% 
        janitor::clean_names(.)  # using janitor::clean_names() to clean up column names
head(df) %>% kable() %>% kable_styling()
customer_id gender age annual_income_k spending_score_1_100
          1 Male    19              15                   39
          2 Male    21              15                   81
          3 Female  20              16                    6
          4 Female  23              16                   77
          5 Female  31              17                   40
          6 Female  22              17                   76

tidyverse packages

# how many packages live under the tidyverse?
tidyverse_packages()
##  [1] "broom"       "cli"         "crayon"      "dplyr"       "dbplyr"     
##  [6] "forcats"     "ggplot2"     "haven"       "hms"         "httr"       
## [11] "jsonlite"    "lubridate"   "magrittr"    "modelr"      "purrr"      
## [16] "readr"       "readxl\n(>=" "reprex"      "rlang"       "rstudioapi" 
## [21] "rvest"       "stringr"     "tibble"      "tidyr"       "xml2"       
## [26] "tidyverse"

dplyr

# how many functions within "dplyr"?
ls(pos = "package:dplyr")
##   [1] "%>%"                "add_count"          "add_count_"        
##   [4] "add_row"            "add_rownames"       "add_tally"         
##   [7] "add_tally_"         "all_equal"          "all_vars"          
##  [10] "anti_join"          "any_vars"           "arrange"           
##  [13] "arrange_"           "arrange_all"        "arrange_at"        
##  [16] "arrange_if"         "as.tbl"             "as.tbl_cube"       
##  [19] "as_data_frame"      "as_tibble"          "auto_copy"         
##  [22] "band_instruments"   "band_instruments2"  "band_members"      
##  [25] "bench_tbls"         "between"            "bind_cols"         
##  [28] "bind_rows"          "case_when"          "changes"           
##  [31] "check_dbplyr"       "coalesce"           "collapse"          
##  [34] "collect"            "combine"            "common_by"         
##  [37] "compare_tbls"       "compare_tbls2"      "compute"           
##  [40] "contains"           "copy_to"            "count"             
##  [43] "count_"             "cumall"             "cumany"            
##  [46] "cume_dist"          "cummean"            "current_vars"      
##  [49] "data_frame"         "data_frame_"        "db_analyze"        
##  [52] "db_begin"           "db_commit"          "db_create_index"   
##  [55] "db_create_indexes"  "db_create_table"    "db_data_type"      
##  [58] "db_desc"            "db_drop_table"      "db_explain"        
##  [61] "db_has_table"       "db_insert_into"     "db_list_tables"    
##  [64] "db_query_fields"    "db_query_rows"      "db_rollback"       
##  [67] "db_save_query"      "db_write_table"     "dense_rank"        
##  [70] "desc"               "dim_desc"           "distinct"          
##  [73] "distinct_"          "do"                 "do_"               
##  [76] "dr_dplyr"           "ends_with"          "enexpr"            
##  [79] "enexprs"            "enquo"              "enquos"            
##  [82] "ensym"              "ensyms"             "eval_tbls"         
##  [85] "eval_tbls2"         "everything"         "explain"           
##  [88] "expr"               "failwith"           "filter"            
##  [91] "filter_"            "filter_all"         "filter_at"         
##  [94] "filter_if"          "first"              "frame_data"        
##  [97] "full_join"          "funs"               "funs_"             
## [100] "glimpse"            "group_by"           "group_by_"         
## [103] "group_by_all"       "group_by_at"        "group_by_if"       
## [106] "group_by_prepare"   "group_indices"      "group_indices_"    
## [109] "group_size"         "group_vars"         "grouped_df"        
## [112] "groups"             "id"                 "ident"             
## [115] "if_else"            "inner_join"         "intersect"         
## [118] "is.grouped_df"      "is.src"             "is.tbl"            
## [121] "is_grouped_df"      "lag"                "last"              
## [124] "lead"               "left_join"          "location"          
## [127] "lst"                "lst_"               "make_tbl"          
## [130] "matches"            "min_rank"           "mutate"            
## [133] "mutate_"            "mutate_all"         "mutate_at"         
## [136] "mutate_each"        "mutate_each_"       "mutate_if"         
## [139] "n"                  "n_distinct"         "n_groups"          
## [142] "na_if"              "nasa"               "near"              
## [145] "nth"                "ntile"              "num_range"         
## [148] "one_of"             "order_by"           "percent_rank"      
## [151] "progress_estimated" "pull"               "quo"               
## [154] "quo_name"           "quos"               "rbind_all"         
## [157] "rbind_list"         "recode"             "recode_factor"     
## [160] "rename"             "rename_"            "rename_all"        
## [163] "rename_at"          "rename_if"          "rename_vars"       
## [166] "rename_vars_"       "right_join"         "row_number"        
## [169] "rowwise"            "same_src"           "sample_frac"       
## [172] "sample_n"           "select"             "select_"           
## [175] "select_all"         "select_at"          "select_if"         
## [178] "select_var"         "select_vars"        "select_vars_"      
## [181] "semi_join"          "setdiff"            "setequal"          
## [184] "show_query"         "slice"              "slice_"            
## [187] "sql"                "sql_escape_ident"   "sql_escape_string" 
## [190] "sql_join"           "sql_select"         "sql_semi_join"     
## [193] "sql_set_op"         "sql_subquery"       "sql_translate_env" 
## [196] "src"                "src_df"             "src_local"         
## [199] "src_mysql"          "src_postgres"       "src_sqlite"        
## [202] "src_tbls"           "starts_with"        "starwars"          
## [205] "storms"             "summarise"          "summarise_"        
## [208] "summarise_all"      "summarise_at"       "summarise_each"    
## [211] "summarise_each_"    "summarise_if"       "summarize"         
## [214] "summarize_"         "summarize_all"      "summarize_at"      
## [217] "summarize_each"     "summarize_each_"    "summarize_if"      
## [220] "sym"                "syms"               "tally"             
## [223] "tally_"             "tbl"                "tbl_cube"          
## [226] "tbl_df"             "tbl_nongroup_vars"  "tbl_sum"           
## [229] "tbl_vars"           "tibble"             "top_n"             
## [232] "transmute"          "transmute_"         "transmute_all"     
## [235] "transmute_at"       "transmute_if"       "tribble"           
## [238] "trunc_mat"          "type_sum"           "ungroup"           
## [241] "union"              "union_all"          "vars"              
## [244] "with_order"         "wrap_dbplyr_obj"

dplyr::case_when

Here’s the official description: “This function allows you to vectorise multiple if_else() statements. It is an R equivalent of the SQL CASE WHEN statement. If no cases match, NA is returned.”
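
For instance, here is a minimal sketch (a toy example, not part of the original recipe) of that fall-through behaviour: any element that matches none of the left-hand conditions comes back as NA unless a TRUE ~ ... catch-all is supplied.

# toy vector: 10 matches neither condition, so it falls through to NA
x <- c(1, 5, 10)
dplyr::case_when(x < 3 ~ "low",
                 x < 8 ~ "medium")
## [1] "low"    "medium" NA

This is why the recipe below finishes with TRUE ~ as.character(income_bucket) as a safety net for any unmatched value.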

# let's create a new variable using case_when by transforming a numeric variable into buckets
# let's look at the distribution - before transformation
hist(df$annual_income_k, main = "Annual Income (before transformation)")

# let's create 10 buckets to represent this numeric variable
buckets <- cut(df$annual_income_k, breaks = 10) %>% levels
buckets
##  [1] "(14.9,27.2]" "(27.2,39.4]" "(39.4,51.6]" "(51.6,63.8]" "(63.8,76]"  
##  [6] "(76,88.2]"   "(88.2,100]"  "(100,113]"   "(113,125]"   "(125,137]"
# create a new categorical variable
df$income_bucket <- cut(df$annual_income_k, breaks = 10)
df <- df %>%
        dplyr::mutate(income_bucket = dplyr::case_when(income_bucket == buckets[1] ~ "income level 0",
                                                       income_bucket == buckets[2] ~ "income level 1",
                                                       income_bucket == buckets[3] ~ "income level 2",
                                                       income_bucket == buckets[4] ~ "income level 3",
                                                       income_bucket == buckets[5] ~ "income level 4",
                                                       income_bucket == buckets[6] ~ "income level 5",
                                                       income_bucket == buckets[7] ~ "income level 6",
                                                       income_bucket == buckets[8] ~ "income level 7",
                                                       income_bucket == buckets[9] ~ "income level 8",
                                                       income_bucket == buckets[10] ~ "income level 9",
                                                       TRUE ~ as.character(income_bucket)))
head(df) %>% kable() %>% kable_styling()
customer_id gender age annual_income_k spending_score_1_100 income_bucket
          1 Male    19              15                   39 income level 0
          2 Male    21              15                   81 income level 0
          3 Female  20              16                    6 income level 0
          4 Female  23              16                   77 income level 0
          5 Female  31              17                   40 income level 0
          6 Female  22              17                   76 income level 0
# let's look at the distribution - after transformation
table(df$income_bucket) %>% barplot(., ylab = "Frequency", main = "Annual Income (after transformation)")
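
As a side note, the ten hard-coded branches above could also be generated in one step. A possible shortcut (the income_levels name below is only for illustration) is to let cut() attach the labels directly:

# sketch: the same labels without writing out ten case_when branches
income_levels <- cut(df$annual_income_k,
                     breaks = 10,
                     labels = paste("income level", 0:9)) %>% 
        as.character()
# should match df$income_bucket, since both use the same ten cut() intervals
# identical(income_levels, df$income_bucket)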


Samantha Deokinanan’s Extensions

I found Jimmy Ng’s tidyverse recipe very helpful, as I had never used dplyr::case_when, so I am extending this discussion with two other dplyr functions, namely dplyr::if_else and dplyr::recode. I selected these functions because I thought they could produce the same output as case_when (particularly recode), but they all have their own differences and uses.


dplyr::if_else

This function is stricter than base::ifelse in that it checks that the true and false values are of the same type, so the type of the output is preserved. if_else works great when a single set of conditions is to be satisfied, but when multiple sets of conditions have to be tested, nested if_else statements become cumbersome and prone to error. The following code highlights an example of nested if_else statements based on what Jimmy Ng demonstrated:

# Reset to df just before Jimmy Ng applied `dplyr::case_when` 
# i.e. create a new categorical variable
df$income_bucket <- cut(df$annual_income_k, breaks = 10)

df1 <- df %>%
        dplyr::mutate(income_bucket = 
                        dplyr::if_else(income_bucket == buckets[1], "income level 0",
                        dplyr::if_else(income_bucket == buckets[2], "income level 1",
                        dplyr::if_else(income_bucket == buckets[3], "income level 2",
                        dplyr::if_else(income_bucket == buckets[4], "income level 3",
                        dplyr::if_else(income_bucket == buckets[5], "income level 4",
                        dplyr::if_else(income_bucket == buckets[6], "income level 5",
                        dplyr::if_else(income_bucket == buckets[7], "income level 6",
                        dplyr::if_else(income_bucket == buckets[8], "income level 7",
                        dplyr::if_else(income_bucket == buckets[9], "income level 8", "income level 9"))))))))))

head(df1) %>% kable() %>% kable_styling()
customer_id gender age annual_income_k spending_score_1_100 income_bucket
          1 Male    19              15                   39 income level 0
          2 Male    21              15                   81 income level 0
          3 Female  20              16                    6 income level 0
          4 Female  23              16                   77 income level 0
          5 Female  31              17                   40 income level 0
          6 Female  22              17                   76 income level 0
str(df1)
## 'data.frame':    200 obs. of  6 variables:
##  $ customer_id         : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ gender              : Factor w/ 2 levels "Female","Male": 2 2 1 1 1 1 1 1 2 1 ...
##  $ age                 : int  19 21 20 23 31 22 35 23 64 30 ...
##  $ annual_income_k     : int  15 15 16 16 17 17 18 18 19 19 ...
##  $ spending_score_1_100: int  39 81 6 77 40 76 6 94 3 72 ...
##  $ income_bucket       : chr  "income level 0" "income level 0" "income level 0" "income level 0" ...

Since buckets was already stored as character, the income_bucket categories were preserved without any errors. If buckets were numeric, it would have to be converted to character beforehand.
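
To make that strictness concrete, here is a small sketch (not from the original post) comparing the two functions on branches of different types:

# base::ifelse() silently coerces: a character `yes` and a numeric `no` both end up as character
ifelse(c(TRUE, FALSE), "yes", 0)
## [1] "yes" "0"
# dplyr::if_else() refuses to mix types and raises an error instead of coercing
# dplyr::if_else(c(TRUE, FALSE), "yes", 0)   # error: true/false branches must share one type

This is the behaviour that keeps the output type of a pipeline predictable.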


dplyr::recode

Another solution is the dplyr::recode function; it is well suited for replacing values, but it does not allow for more complex operations. Let’s apply it to Jimmy Ng’s example:

df2 <- df %>%
        dplyr::mutate(income_bucket = dplyr::recode(income_bucket,
                                                    "(14.9,27.2]" = "income level 0",
                                                    "(27.2,39.4]" = "income level 1",
                                                    "(39.4,51.6]" = "income level 2",
                                                    "(51.6,63.8]" = "income level 3",
                                                    "(63.8,76]" = "income level 4",
                                                    "(76,88.2]" = "income level 5",
                                                    "(88.2,100]" = "income level 6",
                                                    "(100,113]" = "income level 7",
                                                    "(113,125]" = "income level 8",
                                                    "(125,137]" = "income level 9"))

head(df2) %>% kable() %>% kable_styling()
customer_id gender age annual_income_k spending_score_1_100 income_bucket
          1 Male    19              15                   39 income level 0
          2 Male    21              15                   81 income level 0
          3 Female  20              16                    6 income level 0
          4 Female  23              16                   77 income level 0
          5 Female  31              17                   40 income level 0
          6 Female  22              17                   76 income level 0
str(df2)
## 'data.frame':    200 obs. of  6 variables:
##  $ customer_id         : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ gender              : Factor w/ 2 levels "Female","Male": 2 2 1 1 1 1 1 1 2 1 ...
##  $ age                 : int  19 21 20 23 31 22 35 23 64 30 ...
##  $ annual_income_k     : int  15 15 16 16 17 17 18 18 19 19 ...
##  $ spending_score_1_100: int  39 81 6 77 40 76 6 94 3 72 ...
##  $ income_bucket       : Factor w/ 10 levels "income level 0",..: 1 1 1 1 1 1 1 1 1 1 ...

Here, the recoded income_bucket is still stored as a factor: because the input column was a factor (from cut()), recode keeps the factor class and only relabels its values.
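
For completeness, here is a minimal sketch (not part of the original extension) of recode on a plain character vector; the .default argument supplies a fallback for values that are not explicitly listed, much like the final TRUE ~ ... branch in case_when:

# unmatched values ("c" and "z") fall back to .default
x <- c("a", "b", "c", "z")
dplyr::recode(x, a = "apple", b = "banana", .default = "other")
## [1] "apple"  "banana" "other"  "other"

dplyr also offers recode_factor() for when the result should be a factor whose levels follow the order of the replacements.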


Conclusion

In the end, dplyr provides several functions for creating a new value that is conditioned on the result of a test. Choosing the right one for the job can make the transformation much less cumbersome.