tidyverse_recipe: dplyr::case

(1) Create an Example

Let’s load the tidyverse package and then load a sample data set from my github account. The original dataset comes from kaggle. We will look at packages under the tidyverse, functions within “dplyr”, and eventually zoom in into a specific dplyr function “case_when” with a sample use case.

Set-up

# load package(s)
library(tidyverse)
library(janitor)
library(kableExtra)

# load data
df <- read.csv("https://raw.githubusercontent.com/myvioletrose/school_of_professional_studies/master/607.%20Data%20Acquisition%20and%20Management/%23%20misc/Mall_Customers.csv") %>% 
        janitor::clean_names(.)  # using janitor::clean_names() to clean up column names

head(df) %>% kable() %>% kable_styling()

customer_id	gender	age	annual_income_k	spending_score_1_100
1	Male	19	15	39
2	Male	21	15	81
3	Female	20	16	6
4	Female	23	16	77
5	Female	31	17	40
6	Female	22	17	76

tidyverse packages

# how many packages live under the tidyverse?
tidyverse_packages()

##  [1] "broom"       "cli"         "crayon"      "dplyr"       "dbplyr"     
##  [6] "forcats"     "ggplot2"     "haven"       "hms"         "httr"       
## [11] "jsonlite"    "lubridate"   "magrittr"    "modelr"      "purrr"      
## [16] "readr"       "readxl\n(>=" "reprex"      "rlang"       "rstudioapi" 
## [21] "rvest"       "stringr"     "tibble"      "tidyr"       "xml2"       
## [26] "tidyverse"

dplyr

# how many functions within "dplyr"?
ls(pos = "package:dplyr")

##   [1] "%>%"                 "add_count"           "add_count_"         
##   [4] "add_row"             "add_rownames"        "add_tally"          
##   [7] "add_tally_"          "all_equal"           "all_vars"           
##  [10] "anti_join"           "any_vars"            "arrange"            
##  [13] "arrange_"            "arrange_all"         "arrange_at"         
##  [16] "arrange_if"          "as.tbl"              "as.tbl_cube"        
##  [19] "as_data_frame"       "as_tibble"           "auto_copy"          
##  [22] "band_instruments"    "band_instruments2"   "band_members"       
##  [25] "bench_tbls"          "between"             "bind_cols"          
##  [28] "bind_rows"           "case_when"           "changes"            
##  [31] "check_dbplyr"        "coalesce"            "collapse"           
##  [34] "collect"             "combine"             "common_by"          
##  [37] "compare_tbls"        "compare_tbls2"       "compute"            
##  [40] "contains"            "copy_to"             "count"              
##  [43] "count_"              "cumall"              "cumany"             
##  [46] "cume_dist"           "cummean"             "current_vars"       
##  [49] "data_frame"          "data_frame_"         "db_analyze"         
##  [52] "db_begin"            "db_commit"           "db_create_index"    
##  [55] "db_create_indexes"   "db_create_table"     "db_data_type"       
##  [58] "db_desc"             "db_drop_table"       "db_explain"         
##  [61] "db_has_table"        "db_insert_into"      "db_list_tables"     
##  [64] "db_query_fields"     "db_query_rows"       "db_rollback"        
##  [67] "db_save_query"       "db_write_table"      "dense_rank"         
##  [70] "desc"                "dim_desc"            "distinct"           
##  [73] "distinct_"           "distinct_all"        "distinct_at"        
##  [76] "distinct_if"         "distinct_prepare"    "do"                 
##  [79] "do_"                 "dr_dplyr"            "ends_with"          
##  [82] "enexpr"              "enexprs"             "enquo"              
##  [85] "enquos"              "ensym"               "ensyms"             
##  [88] "eval_tbls"           "eval_tbls2"          "everything"         
##  [91] "explain"             "expr"                "failwith"           
##  [94] "filter"              "filter_"             "filter_all"         
##  [97] "filter_at"           "filter_if"           "first"              
## [100] "frame_data"          "full_join"           "funs"               
## [103] "funs_"               "glimpse"             "group_by"           
## [106] "group_by_"           "group_by_all"        "group_by_at"        
## [109] "group_by_if"         "group_by_prepare"    "group_cols"         
## [112] "group_data"          "group_indices"       "group_indices_"     
## [115] "group_keys"          "group_map"           "group_nest"         
## [118] "group_rows"          "group_size"          "group_split"        
## [121] "group_trim"          "group_vars"          "group_walk"         
## [124] "grouped_df"          "groups"              "hybrid_call"        
## [127] "id"                  "ident"               "if_else"            
## [130] "inner_join"          "intersect"           "is.grouped_df"      
## [133] "is.src"              "is.tbl"              "is_grouped_df"      
## [136] "lag"                 "last"                "last_col"           
## [139] "lead"                "left_join"           "location"           
## [142] "lst"                 "lst_"                "make_tbl"           
## [145] "matches"             "min_rank"            "mutate"             
## [148] "mutate_"             "mutate_all"          "mutate_at"          
## [151] "mutate_each"         "mutate_each_"        "mutate_if"          
## [154] "n"                   "n_distinct"          "n_groups"           
## [157] "na_if"               "nasa"                "near"               
## [160] "nest_join"           "new_grouped_df"      "nth"                
## [163] "ntile"               "num_range"           "one_of"             
## [166] "order_by"            "percent_rank"        "progress_estimated" 
## [169] "pull"                "quo"                 "quo_name"           
## [172] "quos"                "rbind_all"           "rbind_list"         
## [175] "recode"              "recode_factor"       "rename"             
## [178] "rename_"             "rename_all"          "rename_at"          
## [181] "rename_if"           "rename_vars"         "rename_vars_"       
## [184] "right_join"          "row_number"          "rowwise"            
## [187] "same_src"            "sample_frac"         "sample_n"           
## [190] "select"              "select_"             "select_all"         
## [193] "select_at"           "select_if"           "select_var"         
## [196] "select_vars"         "select_vars_"        "semi_join"          
## [199] "setdiff"             "setequal"            "show_query"         
## [202] "slice"               "slice_"              "sql"                
## [205] "sql_escape_ident"    "sql_escape_string"   "sql_join"           
## [208] "sql_select"          "sql_semi_join"       "sql_set_op"         
## [211] "sql_subquery"        "sql_translate_env"   "src"                
## [214] "src_df"              "src_local"           "src_mysql"          
## [217] "src_postgres"        "src_sqlite"          "src_tbls"           
## [220] "starts_with"         "starwars"            "storms"             
## [223] "summarise"           "summarise_"          "summarise_all"      
## [226] "summarise_at"        "summarise_each"      "summarise_each_"    
## [229] "summarise_if"        "summarize"           "summarize_"         
## [232] "summarize_all"       "summarize_at"        "summarize_each"     
## [235] "summarize_each_"     "summarize_if"        "sym"                
## [238] "syms"                "tally"               "tally_"             
## [241] "tbl"                 "tbl_cube"            "tbl_df"             
## [244] "tbl_nongroup_vars"   "tbl_sum"             "tbl_vars"           
## [247] "tibble"              "top_n"               "transmute"          
## [250] "transmute_"          "transmute_all"       "transmute_at"       
## [253] "transmute_if"        "tribble"             "trunc_mat"          
## [256] "type_sum"            "ungroup"             "union"              
## [259] "union_all"           "validate_grouped_df" "vars"               
## [262] "with_order"          "wrap_dbplyr_obj"

dplyr::case_when

Here’s an official description: This function allows you to vectorise multiple if_else() statements. It is an R equivalent of the SQL CASE WHEN statement. If no cases match, NA is returned.

# let's create a new variable using case_when by transforming a numeric variable into buckets
# let's look at the distribution - before transformation
hist(df$annual_income_k, main = "Annual Income (before transformation)")

# let's create 10 buckets to represent this numeric variable
buckets <- cut(df$annual_income_k, breaks = 10) %>% levels
buckets

##  [1] "(14.9,27.2]" "(27.2,39.4]" "(39.4,51.6]" "(51.6,63.8]" "(63.8,76]"  
##  [6] "(76,88.2]"   "(88.2,100]"  "(100,113]"   "(113,125]"   "(125,137]"

# create a new categorical variable
df$income_bucket <- cut(df$annual_income_k, breaks = 10)
df <- df %>%
        dplyr::mutate(income_bucket = dplyr::case_when(income_bucket == buckets[1] ~ "income level 0",
                                                       income_bucket == buckets[2] ~ "income level 1",
                                                       income_bucket == buckets[3] ~ "income level 2",
                                                       income_bucket == buckets[4] ~ "income level 3",
                                                       income_bucket == buckets[5] ~ "income level 4",
                                                       income_bucket == buckets[6] ~ "income level 5",
                                                       income_bucket == buckets[7] ~ "income level 6",
                                                       income_bucket == buckets[8] ~ "income level 7",
                                                       income_bucket == buckets[9] ~ "income level 8",
                                                       income_bucket == buckets[10] ~ "income level 9",
                                                       TRUE ~ as.character(income_bucket)))

head(df) %>% kable() %>% kable_styling()

customer_id	gender	age	annual_income_k	spending_score_1_100	income_bucket
1	Male	19	15	39	income level 0
2	Male	21	15	81	income level 0
3	Female	20	16	6	income level 0
4	Female	23	16	77	income level 0
5	Female	31	17	40	income level 0
6	Female	22	17	76	income level 0

# let's look at the distribution - after transformation
table(df$income_bucket) %>% barplot(., ylab = "Frequency", main = "Annual Income (after transformation)")

***

(2) Extend an Existing Example

I want to extend an existing example from Omar Pineda (“omarp120”). Here, we are looking at Poll data. Do we see higher apporving (or favoring) of the health care bill when “Obama” is mentioned in the question (instead of “American Health Care Act”)? Omar is using “str_detect” to extract “obama” from text and create a column to flag those pollsters out. We are going to transform the data and visualize the result.

dplyr::gather

ratings <- read.csv("https://raw.githubusercontent.com/fivethirtyeight/data/master/ahca-polls/ahca_polls.csv")

ratings$obama <- str_detect(ratings$Text, "Obamacare")

# before gathering
ratings %>% head() %>% kable() %>% kable_styling()

Start	End	Pollster	Favor	Oppose	Url	Text	obama
6/9/17	6/11/17	Public Policy Polling	24	55	http://www.publicpolicypolling.com/pdf/2017/PPP_Release_National_61217.pdf	Do you support or oppose the health care bill passed by the House last month, known as the American Health Care Act?	FALSE
6/4/17	6/6/17	YouGov	33	45	https://d25d2506sfb94s.cloudfront.net/cumulus_uploads/document/4vf8rhr0jz/econToplines.pdf	Do you approve or disapprove of the American Health Care Act, the Republican legislation to repeal and replace the Affordable Care Act/Obamacare?	TRUE
5/31/17	6/6/17	Qunnipiac	17	62	https://poll.qu.edu/national/release-detail?ReleaseID=2463	There is a revised Republican health care plan to replace Obamacare, known as the American Health Care Act. Do you approve or disapprove of this revised Republican health care plan?	TRUE
5/26/17	5/30/17	IPSOS	30	46	https://www.realclearpolitics.com/docs/Core_Political-Topline-2017-05-31.pdf	Based on everything you know about the new American Health Care Act (AHCA), to what extent do you support or oppose it?	FALSE
5/25/17	5/30/17	Morning Consult	38	47	https://morningconsult.com/wp-content/uploads/2017/05/170516_crosstabs_Politico_v3_TB-1.pdf	As you may know, the House of Representatives recently passed a bill called the American Health Care Act that would that repeal and replace the 2010 Affordable Care Act, sometimes referred to as Obamacare. Based on what you know, do you approve or disapprove of this health care bill?	TRUE
5/17/17	5/23/17	Quinnipiac	20	57	https://poll.qu.edu/national/release-detail?ReleaseID=2461	There is a revised Republican health care plan to replace Obamacare, known as the American Health Care Act. Do you approve or disapprove of this revised Republican health care plan?	TRUE

ratings.gather <- ratings %>%
        dplyr::select(Start, End, Pollster, Favor, Oppose, obama) %>%
        tidyr::gather(., attitude, percent, -c(Start, End, Pollster, obama)) %>%
        arrange(Start, End, Pollster)

# after gathering the Favor and Oppose columns
ratings.gather %>% kable() %>% kable_styling()

Start	End	Pollster	obama	attitude	percent
5/11/17	5/15/17	NBC/WSJ	FALSE	Favor	23
5/11/17	5/15/17	NBC/WSJ	FALSE	Oppose	48
5/12/17	5/16/17	Public Policy Polling	FALSE	Favor	25
5/12/17	5/16/17	Public Policy Polling	FALSE	Oppose	52
5/13/17	5/16/17	YouGov	FALSE	Favor	31
5/13/17	5/16/17	YouGov	FALSE	Oppose	47
5/13/17	5/20/17	Monmouth	TRUE	Favor	32
5/13/17	5/20/17	Monmouth	TRUE	Oppose	55
5/16/17	5/22/17	Kaiser Family Foundation	FALSE	Favor	31
5/16/17	5/22/17	Kaiser Family Foundation	FALSE	Oppose	55
5/17/17	5/23/17	Quinnipiac	TRUE	Favor	20
5/17/17	5/23/17	Quinnipiac	TRUE	Oppose	57
5/25/17	5/30/17	Morning Consult	TRUE	Favor	38
5/25/17	5/30/17	Morning Consult	TRUE	Oppose	47
5/26/17	5/30/17	IPSOS	FALSE	Favor	30
5/26/17	5/30/17	IPSOS	FALSE	Oppose	46
5/31/17	6/6/17	Qunnipiac	TRUE	Favor	17
5/31/17	6/6/17	Qunnipiac	TRUE	Oppose	62
5/4/17	5/10/17	Quinnipiac	TRUE	Favor	21
5/4/17	5/10/17	Quinnipiac	TRUE	Oppose	56
5/4/17	5/9/17	Morning Consult	TRUE	Favor	38
5/4/17	5/9/17	Morning Consult	TRUE	Oppose	44
5/6/17	5/13/17	YouGov	FALSE	Favor	31
5/6/17	5/13/17	YouGov	FALSE	Oppose	44
5/6/17	5/6/17	YouGov	FALSE	Favor	31
5/6/17	5/6/17	YouGov	FALSE	Oppose	47
6/4/17	6/6/17	YouGov	TRUE	Favor	33
6/4/17	6/6/17	YouGov	TRUE	Oppose	45
6/9/17	6/11/17	Public Policy Polling	FALSE	Favor	24
6/9/17	6/11/17	Public Policy Polling	FALSE	Oppose	55

ggplot - when Obama is NOT mentioned

ratings.gather %>%
        dplyr::filter(obama == F & attitude == "Favor") %>%
        ggplot(., aes(x = reorder(paste(Pollster, Start, End, sep = " - "), percent),
                      y = percent, group = attitude)) +
        geom_bar(stat = "identity", position = "stack", fill = "deepskyblue") + 
        geom_hline(yintercept = mean(
                ratings.gather$percent[ratings.gather$obama == F & 
                                               ratings.gather$attitude == "Favor"]
                ), 
                col = "black", linetype = "dashed") + 
        scale_y_continuous(limits = c(0, 40), breaks = seq(0, 40, 5)) + 
        labs(x = "", y = "") + 
        ggtitle("Percentage of Favoring the Health Care Bill \n- when Obama is NOT mentioned") +
        theme(legend.position = "none", plot.title = element_text(hjust = 0.5),
              axis.text.x = element_text(angle = 45, vjust = 0.5, hjust = 0.5))

ggplot - when Obama is mentioned

The result does not seem to make a big difference (when we look at the dashed line representing the average among all pollsters); in fact, mentioning “obama” seems to draw higher approval.

ratings.gather %>%
        dplyr::filter(obama == T & attitude == "Favor") %>%
        ggplot(., aes(x = reorder(paste(Pollster, Start, End, sep = " - "), percent),
                      y = percent, group = attitude)) +
        geom_bar(stat = "identity", position = "stack", fill = "firebrick") + 
        geom_hline(yintercept = mean(
                ratings.gather$percent[ratings.gather$obama == T & 
                                               ratings.gather$attitude == "Favor"]
                ),
                col = "black", linetype = "dashed") + 
        scale_y_continuous(limits = c(0, 40), breaks = seq(0, 40, 5)) + 
        labs(x = "", y = "") + 
        ggtitle("Percentage of Favoring the Health Care Bill \n- when Obama is mentioned") +
        theme(legend.position = "none", plot.title = element_text(hjust = 0.5),
              axis.text.x = element_text(angle = 45, vjust = 0.5, hjust = 0.5))

***

tidyverse_recipe: dplyr::case_when

Jimmy Ng

April 18, 2019