(1) Create an Example

Let’s load the tidyverse package and then load a sample data set from my github account. The original dataset comes from kaggle. We will look at packages under the tidyverse, functions within “dplyr”, and eventually zoom in into a specific dplyr function “case_when” with a sample use case.

Set-up

# load package(s)
library(tidyverse)
library(janitor)
library(kableExtra)

# load data
df <- read.csv("https://raw.githubusercontent.com/myvioletrose/school_of_professional_studies/master/607.%20Data%20Acquisition%20and%20Management/%23%20misc/Mall_Customers.csv") %>% 
        janitor::clean_names(.)  # using janitor::clean_names() to clean up column names

head(df) %>% kable() %>% kable_styling()
customer_id gender age annual_income_k spending_score_1_100
1 Male 19 15 39
2 Male 21 15 81
3 Female 20 16 6
4 Female 23 16 77
5 Female 31 17 40
6 Female 22 17 76

tidyverse packages

# how many packages live under the tidyverse?
tidyverse_packages()
##  [1] "broom"       "cli"         "crayon"      "dplyr"       "dbplyr"     
##  [6] "forcats"     "ggplot2"     "haven"       "hms"         "httr"       
## [11] "jsonlite"    "lubridate"   "magrittr"    "modelr"      "purrr"      
## [16] "readr"       "readxl\n(>=" "reprex"      "rlang"       "rstudioapi" 
## [21] "rvest"       "stringr"     "tibble"      "tidyr"       "xml2"       
## [26] "tidyverse"

dplyr

# how many functions within "dplyr"?
ls(pos = "package:dplyr")
##   [1] "%>%"                 "add_count"           "add_count_"         
##   [4] "add_row"             "add_rownames"        "add_tally"          
##   [7] "add_tally_"          "all_equal"           "all_vars"           
##  [10] "anti_join"           "any_vars"            "arrange"            
##  [13] "arrange_"            "arrange_all"         "arrange_at"         
##  [16] "arrange_if"          "as.tbl"              "as.tbl_cube"        
##  [19] "as_data_frame"       "as_tibble"           "auto_copy"          
##  [22] "band_instruments"    "band_instruments2"   "band_members"       
##  [25] "bench_tbls"          "between"             "bind_cols"          
##  [28] "bind_rows"           "case_when"           "changes"            
##  [31] "check_dbplyr"        "coalesce"            "collapse"           
##  [34] "collect"             "combine"             "common_by"          
##  [37] "compare_tbls"        "compare_tbls2"       "compute"            
##  [40] "contains"            "copy_to"             "count"              
##  [43] "count_"              "cumall"              "cumany"             
##  [46] "cume_dist"           "cummean"             "current_vars"       
##  [49] "data_frame"          "data_frame_"         "db_analyze"         
##  [52] "db_begin"            "db_commit"           "db_create_index"    
##  [55] "db_create_indexes"   "db_create_table"     "db_data_type"       
##  [58] "db_desc"             "db_drop_table"       "db_explain"         
##  [61] "db_has_table"        "db_insert_into"      "db_list_tables"     
##  [64] "db_query_fields"     "db_query_rows"       "db_rollback"        
##  [67] "db_save_query"       "db_write_table"      "dense_rank"         
##  [70] "desc"                "dim_desc"            "distinct"           
##  [73] "distinct_"           "distinct_all"        "distinct_at"        
##  [76] "distinct_if"         "distinct_prepare"    "do"                 
##  [79] "do_"                 "dr_dplyr"            "ends_with"          
##  [82] "enexpr"              "enexprs"             "enquo"              
##  [85] "enquos"              "ensym"               "ensyms"             
##  [88] "eval_tbls"           "eval_tbls2"          "everything"         
##  [91] "explain"             "expr"                "failwith"           
##  [94] "filter"              "filter_"             "filter_all"         
##  [97] "filter_at"           "filter_if"           "first"              
## [100] "frame_data"          "full_join"           "funs"               
## [103] "funs_"               "glimpse"             "group_by"           
## [106] "group_by_"           "group_by_all"        "group_by_at"        
## [109] "group_by_if"         "group_by_prepare"    "group_cols"         
## [112] "group_data"          "group_indices"       "group_indices_"     
## [115] "group_keys"          "group_map"           "group_nest"         
## [118] "group_rows"          "group_size"          "group_split"        
## [121] "group_trim"          "group_vars"          "group_walk"         
## [124] "grouped_df"          "groups"              "hybrid_call"        
## [127] "id"                  "ident"               "if_else"            
## [130] "inner_join"          "intersect"           "is.grouped_df"      
## [133] "is.src"              "is.tbl"              "is_grouped_df"      
## [136] "lag"                 "last"                "last_col"           
## [139] "lead"                "left_join"           "location"           
## [142] "lst"                 "lst_"                "make_tbl"           
## [145] "matches"             "min_rank"            "mutate"             
## [148] "mutate_"             "mutate_all"          "mutate_at"          
## [151] "mutate_each"         "mutate_each_"        "mutate_if"          
## [154] "n"                   "n_distinct"          "n_groups"           
## [157] "na_if"               "nasa"                "near"               
## [160] "nest_join"           "new_grouped_df"      "nth"                
## [163] "ntile"               "num_range"           "one_of"             
## [166] "order_by"            "percent_rank"        "progress_estimated" 
## [169] "pull"                "quo"                 "quo_name"           
## [172] "quos"                "rbind_all"           "rbind_list"         
## [175] "recode"              "recode_factor"       "rename"             
## [178] "rename_"             "rename_all"          "rename_at"          
## [181] "rename_if"           "rename_vars"         "rename_vars_"       
## [184] "right_join"          "row_number"          "rowwise"            
## [187] "same_src"            "sample_frac"         "sample_n"           
## [190] "select"              "select_"             "select_all"         
## [193] "select_at"           "select_if"           "select_var"         
## [196] "select_vars"         "select_vars_"        "semi_join"          
## [199] "setdiff"             "setequal"            "show_query"         
## [202] "slice"               "slice_"              "sql"                
## [205] "sql_escape_ident"    "sql_escape_string"   "sql_join"           
## [208] "sql_select"          "sql_semi_join"       "sql_set_op"         
## [211] "sql_subquery"        "sql_translate_env"   "src"                
## [214] "src_df"              "src_local"           "src_mysql"          
## [217] "src_postgres"        "src_sqlite"          "src_tbls"           
## [220] "starts_with"         "starwars"            "storms"             
## [223] "summarise"           "summarise_"          "summarise_all"      
## [226] "summarise_at"        "summarise_each"      "summarise_each_"    
## [229] "summarise_if"        "summarize"           "summarize_"         
## [232] "summarize_all"       "summarize_at"        "summarize_each"     
## [235] "summarize_each_"     "summarize_if"        "sym"                
## [238] "syms"                "tally"               "tally_"             
## [241] "tbl"                 "tbl_cube"            "tbl_df"             
## [244] "tbl_nongroup_vars"   "tbl_sum"             "tbl_vars"           
## [247] "tibble"              "top_n"               "transmute"          
## [250] "transmute_"          "transmute_all"       "transmute_at"       
## [253] "transmute_if"        "tribble"             "trunc_mat"          
## [256] "type_sum"            "ungroup"             "union"              
## [259] "union_all"           "validate_grouped_df" "vars"               
## [262] "with_order"          "wrap_dbplyr_obj"

dplyr::case_when

Here’s an official description: This function allows you to vectorise multiple if_else() statements. It is an R equivalent of the SQL CASE WHEN statement. If no cases match, NA is returned.

# let's create a new variable using case_when by transforming a numeric variable into buckets
# let's look at the distribution - before transformation
hist(df$annual_income_k, main = "Annual Income (before transformation)")

# let's create 10 buckets to represent this numeric variable
buckets <- cut(df$annual_income_k, breaks = 10) %>% levels
buckets
##  [1] "(14.9,27.2]" "(27.2,39.4]" "(39.4,51.6]" "(51.6,63.8]" "(63.8,76]"  
##  [6] "(76,88.2]"   "(88.2,100]"  "(100,113]"   "(113,125]"   "(125,137]"
# create a new categorical variable
df$income_bucket <- cut(df$annual_income_k, breaks = 10)
df <- df %>%
        dplyr::mutate(income_bucket = dplyr::case_when(income_bucket == buckets[1] ~ "income level 0",
                                                       income_bucket == buckets[2] ~ "income level 1",
                                                       income_bucket == buckets[3] ~ "income level 2",
                                                       income_bucket == buckets[4] ~ "income level 3",
                                                       income_bucket == buckets[5] ~ "income level 4",
                                                       income_bucket == buckets[6] ~ "income level 5",
                                                       income_bucket == buckets[7] ~ "income level 6",
                                                       income_bucket == buckets[8] ~ "income level 7",
                                                       income_bucket == buckets[9] ~ "income level 8",
                                                       income_bucket == buckets[10] ~ "income level 9",
                                                       TRUE ~ as.character(income_bucket)))

head(df) %>% kable() %>% kable_styling()
customer_id gender age annual_income_k spending_score_1_100 income_bucket
1 Male 19 15 39 income level 0
2 Male 21 15 81 income level 0
3 Female 20 16 6 income level 0
4 Female 23 16 77 income level 0
5 Female 31 17 40 income level 0
6 Female 22 17 76 income level 0
# let's look at the distribution - after transformation
table(df$income_bucket) %>% barplot(., ylab = "Frequency", main = "Annual Income (after transformation)")

***

(2) Extend an Existing Example

I want to extend an existing example from Omar Pineda (“omarp120”). Here, we are looking at Poll data. Do we see higher apporving (or favoring) of the health care bill when “Obama” is mentioned in the question (instead of “American Health Care Act”)? Omar is using “str_detect” to extract “obama” from text and create a column to flag those pollsters out. We are going to transform the data and visualize the result.

dplyr::gather

ratings <- read.csv("https://raw.githubusercontent.com/fivethirtyeight/data/master/ahca-polls/ahca_polls.csv")

ratings$obama <- str_detect(ratings$Text, "Obamacare")

# before gathering
ratings %>% head() %>% kable() %>% kable_styling()
Start End Pollster Favor Oppose Url Text obama
6/9/17 6/11/17 Public Policy Polling 24 55 http://www.publicpolicypolling.com/pdf/2017/PPP_Release_National_61217.pdf Do you support or oppose the health care bill passed by the House last month, known as the American Health Care Act? FALSE
6/4/17 6/6/17 YouGov 33 45 https://d25d2506sfb94s.cloudfront.net/cumulus_uploads/document/4vf8rhr0jz/econToplines.pdf Do you approve or disapprove of the American Health Care Act, the Republican legislation to repeal and replace the Affordable Care Act/Obamacare? TRUE
5/31/17 6/6/17 Qunnipiac 17 62 https://poll.qu.edu/national/release-detail?ReleaseID=2463 There is a revised Republican health care plan to replace Obamacare, known as the American Health Care Act. Do you approve or disapprove of this revised Republican health care plan? TRUE
5/26/17 5/30/17 IPSOS 30 46 https://www.realclearpolitics.com/docs/Core_Political-Topline-2017-05-31.pdf Based on everything you know about the new American Health Care Act (AHCA), to what extent do you support or oppose it? FALSE
5/25/17 5/30/17 Morning Consult 38 47 https://morningconsult.com/wp-content/uploads/2017/05/170516_crosstabs_Politico_v3_TB-1.pdf As you may know, the House of Representatives recently passed a bill called the American Health Care Act that would that repeal and replace the 2010 Affordable Care Act, sometimes referred to as Obamacare. Based on what you know, do you approve or disapprove of this health care bill? TRUE
5/17/17 5/23/17 Quinnipiac 20 57 https://poll.qu.edu/national/release-detail?ReleaseID=2461 There is a revised Republican health care plan to replace Obamacare, known as the American Health Care Act. Do you approve or disapprove of this revised Republican health care plan? TRUE
ratings.gather <- ratings %>%
        dplyr::select(Start, End, Pollster, Favor, Oppose, obama) %>%
        tidyr::gather(., attitude, percent, -c(Start, End, Pollster, obama)) %>%
        arrange(Start, End, Pollster)

# after gathering the Favor and Oppose columns
ratings.gather %>% kable() %>% kable_styling()
Start End Pollster obama attitude percent
5/11/17 5/15/17 NBC/WSJ FALSE Favor 23
5/11/17 5/15/17 NBC/WSJ FALSE Oppose 48
5/12/17 5/16/17 Public Policy Polling FALSE Favor 25
5/12/17 5/16/17 Public Policy Polling FALSE Oppose 52
5/13/17 5/16/17 YouGov FALSE Favor 31
5/13/17 5/16/17 YouGov FALSE Oppose 47
5/13/17 5/20/17 Monmouth TRUE Favor 32
5/13/17 5/20/17 Monmouth TRUE Oppose 55
5/16/17 5/22/17 Kaiser Family Foundation FALSE Favor 31
5/16/17 5/22/17 Kaiser Family Foundation FALSE Oppose 55
5/17/17 5/23/17 Quinnipiac TRUE Favor 20
5/17/17 5/23/17 Quinnipiac TRUE Oppose 57
5/25/17 5/30/17 Morning Consult TRUE Favor 38
5/25/17 5/30/17 Morning Consult TRUE Oppose 47
5/26/17 5/30/17 IPSOS FALSE Favor 30
5/26/17 5/30/17 IPSOS FALSE Oppose 46
5/31/17 6/6/17 Qunnipiac TRUE Favor 17
5/31/17 6/6/17 Qunnipiac TRUE Oppose 62
5/4/17 5/10/17 Quinnipiac TRUE Favor 21
5/4/17 5/10/17 Quinnipiac TRUE Oppose 56
5/4/17 5/9/17 Morning Consult TRUE Favor 38
5/4/17 5/9/17 Morning Consult TRUE Oppose 44
5/6/17 5/13/17 YouGov FALSE Favor 31
5/6/17 5/13/17 YouGov FALSE Oppose 44
5/6/17 5/6/17 YouGov FALSE Favor 31
5/6/17 5/6/17 YouGov FALSE Oppose 47
6/4/17 6/6/17 YouGov TRUE Favor 33
6/4/17 6/6/17 YouGov TRUE Oppose 45
6/9/17 6/11/17 Public Policy Polling FALSE Favor 24
6/9/17 6/11/17 Public Policy Polling FALSE Oppose 55

ggplot - when Obama is NOT mentioned

ratings.gather %>%
        dplyr::filter(obama == F & attitude == "Favor") %>%
        ggplot(., aes(x = reorder(paste(Pollster, Start, End, sep = " - "), percent),
                      y = percent, group = attitude)) +
        geom_bar(stat = "identity", position = "stack", fill = "deepskyblue") + 
        geom_hline(yintercept = mean(
                ratings.gather$percent[ratings.gather$obama == F & 
                                               ratings.gather$attitude == "Favor"]
                ), 
                col = "black", linetype = "dashed") + 
        scale_y_continuous(limits = c(0, 40), breaks = seq(0, 40, 5)) + 
        labs(x = "", y = "") + 
        ggtitle("Percentage of Favoring the Health Care Bill \n- when Obama is NOT mentioned") +
        theme(legend.position = "none", plot.title = element_text(hjust = 0.5),
              axis.text.x = element_text(angle = 45, vjust = 0.5, hjust = 0.5)) 

ggplot - when Obama is mentioned

The result does not seem to make a big difference (when we look at the dashed line representing the average among all pollsters); in fact, mentioning “obama” seems to draw higher approval.

ratings.gather %>%
        dplyr::filter(obama == T & attitude == "Favor") %>%
        ggplot(., aes(x = reorder(paste(Pollster, Start, End, sep = " - "), percent),
                      y = percent, group = attitude)) +
        geom_bar(stat = "identity", position = "stack", fill = "firebrick") + 
        geom_hline(yintercept = mean(
                ratings.gather$percent[ratings.gather$obama == T & 
                                               ratings.gather$attitude == "Favor"]
                ),
                col = "black", linetype = "dashed") + 
        scale_y_continuous(limits = c(0, 40), breaks = seq(0, 40, 5)) + 
        labs(x = "", y = "") + 
        ggtitle("Percentage of Favoring the Health Care Bill \n- when Obama is mentioned") +
        theme(legend.position = "none", plot.title = element_text(hjust = 0.5),
              axis.text.x = element_text(angle = 45, vjust = 0.5, hjust = 0.5)) 

***