Let’s load the tidyverse package and then load a sample data set from my github account. The original dataset comes from kaggle. We will look at packages under the tidyverse, functions within “dplyr”, and eventually zoom in into a specific dplyr function “case_when” with a sample use case.
# load package(s)
library(tidyverse)
library(janitor)
library(kableExtra)
# load data
df <- read.csv("https://raw.githubusercontent.com/myvioletrose/school_of_professional_studies/master/607.%20Data%20Acquisition%20and%20Management/%23%20misc/Mall_Customers.csv") %>%
janitor::clean_names(.) # using janitor::clean_names() to clean up column names
head(df) %>% kable() %>% kable_styling()
| customer_id | gender | age | annual_income_k | spending_score_1_100 |
|---|---|---|---|---|
| 1 | Male | 19 | 15 | 39 |
| 2 | Male | 21 | 15 | 81 |
| 3 | Female | 20 | 16 | 6 |
| 4 | Female | 23 | 16 | 77 |
| 5 | Female | 31 | 17 | 40 |
| 6 | Female | 22 | 17 | 76 |
# how many packages live under the tidyverse?
tidyverse_packages()
## [1] "broom" "cli" "crayon" "dplyr" "dbplyr"
## [6] "forcats" "ggplot2" "haven" "hms" "httr"
## [11] "jsonlite" "lubridate" "magrittr" "modelr" "purrr"
## [16] "readr" "readxl\n(>=" "reprex" "rlang" "rstudioapi"
## [21] "rvest" "stringr" "tibble" "tidyr" "xml2"
## [26] "tidyverse"
# how many functions within "dplyr"?
ls(pos = "package:dplyr")
## [1] "%>%" "add_count" "add_count_"
## [4] "add_row" "add_rownames" "add_tally"
## [7] "add_tally_" "all_equal" "all_vars"
## [10] "anti_join" "any_vars" "arrange"
## [13] "arrange_" "arrange_all" "arrange_at"
## [16] "arrange_if" "as.tbl" "as.tbl_cube"
## [19] "as_data_frame" "as_tibble" "auto_copy"
## [22] "band_instruments" "band_instruments2" "band_members"
## [25] "bench_tbls" "between" "bind_cols"
## [28] "bind_rows" "case_when" "changes"
## [31] "check_dbplyr" "coalesce" "collapse"
## [34] "collect" "combine" "common_by"
## [37] "compare_tbls" "compare_tbls2" "compute"
## [40] "contains" "copy_to" "count"
## [43] "count_" "cumall" "cumany"
## [46] "cume_dist" "cummean" "current_vars"
## [49] "data_frame" "data_frame_" "db_analyze"
## [52] "db_begin" "db_commit" "db_create_index"
## [55] "db_create_indexes" "db_create_table" "db_data_type"
## [58] "db_desc" "db_drop_table" "db_explain"
## [61] "db_has_table" "db_insert_into" "db_list_tables"
## [64] "db_query_fields" "db_query_rows" "db_rollback"
## [67] "db_save_query" "db_write_table" "dense_rank"
## [70] "desc" "dim_desc" "distinct"
## [73] "distinct_" "distinct_all" "distinct_at"
## [76] "distinct_if" "distinct_prepare" "do"
## [79] "do_" "dr_dplyr" "ends_with"
## [82] "enexpr" "enexprs" "enquo"
## [85] "enquos" "ensym" "ensyms"
## [88] "eval_tbls" "eval_tbls2" "everything"
## [91] "explain" "expr" "failwith"
## [94] "filter" "filter_" "filter_all"
## [97] "filter_at" "filter_if" "first"
## [100] "frame_data" "full_join" "funs"
## [103] "funs_" "glimpse" "group_by"
## [106] "group_by_" "group_by_all" "group_by_at"
## [109] "group_by_if" "group_by_prepare" "group_cols"
## [112] "group_data" "group_indices" "group_indices_"
## [115] "group_keys" "group_map" "group_nest"
## [118] "group_rows" "group_size" "group_split"
## [121] "group_trim" "group_vars" "group_walk"
## [124] "grouped_df" "groups" "hybrid_call"
## [127] "id" "ident" "if_else"
## [130] "inner_join" "intersect" "is.grouped_df"
## [133] "is.src" "is.tbl" "is_grouped_df"
## [136] "lag" "last" "last_col"
## [139] "lead" "left_join" "location"
## [142] "lst" "lst_" "make_tbl"
## [145] "matches" "min_rank" "mutate"
## [148] "mutate_" "mutate_all" "mutate_at"
## [151] "mutate_each" "mutate_each_" "mutate_if"
## [154] "n" "n_distinct" "n_groups"
## [157] "na_if" "nasa" "near"
## [160] "nest_join" "new_grouped_df" "nth"
## [163] "ntile" "num_range" "one_of"
## [166] "order_by" "percent_rank" "progress_estimated"
## [169] "pull" "quo" "quo_name"
## [172] "quos" "rbind_all" "rbind_list"
## [175] "recode" "recode_factor" "rename"
## [178] "rename_" "rename_all" "rename_at"
## [181] "rename_if" "rename_vars" "rename_vars_"
## [184] "right_join" "row_number" "rowwise"
## [187] "same_src" "sample_frac" "sample_n"
## [190] "select" "select_" "select_all"
## [193] "select_at" "select_if" "select_var"
## [196] "select_vars" "select_vars_" "semi_join"
## [199] "setdiff" "setequal" "show_query"
## [202] "slice" "slice_" "sql"
## [205] "sql_escape_ident" "sql_escape_string" "sql_join"
## [208] "sql_select" "sql_semi_join" "sql_set_op"
## [211] "sql_subquery" "sql_translate_env" "src"
## [214] "src_df" "src_local" "src_mysql"
## [217] "src_postgres" "src_sqlite" "src_tbls"
## [220] "starts_with" "starwars" "storms"
## [223] "summarise" "summarise_" "summarise_all"
## [226] "summarise_at" "summarise_each" "summarise_each_"
## [229] "summarise_if" "summarize" "summarize_"
## [232] "summarize_all" "summarize_at" "summarize_each"
## [235] "summarize_each_" "summarize_if" "sym"
## [238] "syms" "tally" "tally_"
## [241] "tbl" "tbl_cube" "tbl_df"
## [244] "tbl_nongroup_vars" "tbl_sum" "tbl_vars"
## [247] "tibble" "top_n" "transmute"
## [250] "transmute_" "transmute_all" "transmute_at"
## [253] "transmute_if" "tribble" "trunc_mat"
## [256] "type_sum" "ungroup" "union"
## [259] "union_all" "validate_grouped_df" "vars"
## [262] "with_order" "wrap_dbplyr_obj"
Here’s an official description: This function allows you to vectorise multiple if_else() statements. It is an R equivalent of the SQL CASE WHEN statement. If no cases match, NA is returned.
# let's create a new variable using case_when by transforming a numeric variable into buckets
# let's look at the distribution - before transformation
hist(df$annual_income_k, main = "Annual Income (before transformation)")
# let's create 10 buckets to represent this numeric variable
buckets <- cut(df$annual_income_k, breaks = 10) %>% levels
buckets
## [1] "(14.9,27.2]" "(27.2,39.4]" "(39.4,51.6]" "(51.6,63.8]" "(63.8,76]"
## [6] "(76,88.2]" "(88.2,100]" "(100,113]" "(113,125]" "(125,137]"
# create a new categorical variable
df$income_bucket <- cut(df$annual_income_k, breaks = 10)
df <- df %>%
dplyr::mutate(income_bucket = dplyr::case_when(income_bucket == buckets[1] ~ "income level 0",
income_bucket == buckets[2] ~ "income level 1",
income_bucket == buckets[3] ~ "income level 2",
income_bucket == buckets[4] ~ "income level 3",
income_bucket == buckets[5] ~ "income level 4",
income_bucket == buckets[6] ~ "income level 5",
income_bucket == buckets[7] ~ "income level 6",
income_bucket == buckets[8] ~ "income level 7",
income_bucket == buckets[9] ~ "income level 8",
income_bucket == buckets[10] ~ "income level 9",
TRUE ~ as.character(income_bucket)))
head(df) %>% kable() %>% kable_styling()
| customer_id | gender | age | annual_income_k | spending_score_1_100 | income_bucket |
|---|---|---|---|---|---|
| 1 | Male | 19 | 15 | 39 | income level 0 |
| 2 | Male | 21 | 15 | 81 | income level 0 |
| 3 | Female | 20 | 16 | 6 | income level 0 |
| 4 | Female | 23 | 16 | 77 | income level 0 |
| 5 | Female | 31 | 17 | 40 | income level 0 |
| 6 | Female | 22 | 17 | 76 | income level 0 |
# let's look at the distribution - after transformation
table(df$income_bucket) %>% barplot(., ylab = "Frequency", main = "Annual Income (after transformation)")
***
I want to extend an existing example from Omar Pineda (“omarp120”). Here, we are looking at Poll data. Do we see higher apporving (or favoring) of the health care bill when “Obama” is mentioned in the question (instead of “American Health Care Act”)? Omar is using “str_detect” to extract “obama” from text and create a column to flag those pollsters out. We are going to transform the data and visualize the result.
ratings <- read.csv("https://raw.githubusercontent.com/fivethirtyeight/data/master/ahca-polls/ahca_polls.csv")
ratings$obama <- str_detect(ratings$Text, "Obamacare")
# before gathering
ratings %>% head() %>% kable() %>% kable_styling()
| Start | End | Pollster | Favor | Oppose | Url | Text | obama |
|---|---|---|---|---|---|---|---|
| 6/9/17 | 6/11/17 | Public Policy Polling | 24 | 55 | http://www.publicpolicypolling.com/pdf/2017/PPP_Release_National_61217.pdf | Do you support or oppose the health care bill passed by the House last month, known as the American Health Care Act? | FALSE |
| 6/4/17 | 6/6/17 | YouGov | 33 | 45 | https://d25d2506sfb94s.cloudfront.net/cumulus_uploads/document/4vf8rhr0jz/econToplines.pdf | Do you approve or disapprove of the American Health Care Act, the Republican legislation to repeal and replace the Affordable Care Act/Obamacare? | TRUE |
| 5/31/17 | 6/6/17 | Qunnipiac | 17 | 62 | https://poll.qu.edu/national/release-detail?ReleaseID=2463 | There is a revised Republican health care plan to replace Obamacare, known as the American Health Care Act. Do you approve or disapprove of this revised Republican health care plan? | TRUE |
| 5/26/17 | 5/30/17 | IPSOS | 30 | 46 | https://www.realclearpolitics.com/docs/Core_Political-Topline-2017-05-31.pdf | Based on everything you know about the new American Health Care Act (AHCA), to what extent do you support or oppose it? | FALSE |
| 5/25/17 | 5/30/17 | Morning Consult | 38 | 47 | https://morningconsult.com/wp-content/uploads/2017/05/170516_crosstabs_Politico_v3_TB-1.pdf | As you may know, the House of Representatives recently passed a bill called the American Health Care Act that would that repeal and replace the 2010 Affordable Care Act, sometimes referred to as Obamacare. Based on what you know, do you approve or disapprove of this health care bill? | TRUE |
| 5/17/17 | 5/23/17 | Quinnipiac | 20 | 57 | https://poll.qu.edu/national/release-detail?ReleaseID=2461 | There is a revised Republican health care plan to replace Obamacare, known as the American Health Care Act. Do you approve or disapprove of this revised Republican health care plan? | TRUE |
ratings.gather <- ratings %>%
dplyr::select(Start, End, Pollster, Favor, Oppose, obama) %>%
tidyr::gather(., attitude, percent, -c(Start, End, Pollster, obama)) %>%
arrange(Start, End, Pollster)
# after gathering the Favor and Oppose columns
ratings.gather %>% kable() %>% kable_styling()
| Start | End | Pollster | obama | attitude | percent |
|---|---|---|---|---|---|
| 5/11/17 | 5/15/17 | NBC/WSJ | FALSE | Favor | 23 |
| 5/11/17 | 5/15/17 | NBC/WSJ | FALSE | Oppose | 48 |
| 5/12/17 | 5/16/17 | Public Policy Polling | FALSE | Favor | 25 |
| 5/12/17 | 5/16/17 | Public Policy Polling | FALSE | Oppose | 52 |
| 5/13/17 | 5/16/17 | YouGov | FALSE | Favor | 31 |
| 5/13/17 | 5/16/17 | YouGov | FALSE | Oppose | 47 |
| 5/13/17 | 5/20/17 | Monmouth | TRUE | Favor | 32 |
| 5/13/17 | 5/20/17 | Monmouth | TRUE | Oppose | 55 |
| 5/16/17 | 5/22/17 | Kaiser Family Foundation | FALSE | Favor | 31 |
| 5/16/17 | 5/22/17 | Kaiser Family Foundation | FALSE | Oppose | 55 |
| 5/17/17 | 5/23/17 | Quinnipiac | TRUE | Favor | 20 |
| 5/17/17 | 5/23/17 | Quinnipiac | TRUE | Oppose | 57 |
| 5/25/17 | 5/30/17 | Morning Consult | TRUE | Favor | 38 |
| 5/25/17 | 5/30/17 | Morning Consult | TRUE | Oppose | 47 |
| 5/26/17 | 5/30/17 | IPSOS | FALSE | Favor | 30 |
| 5/26/17 | 5/30/17 | IPSOS | FALSE | Oppose | 46 |
| 5/31/17 | 6/6/17 | Qunnipiac | TRUE | Favor | 17 |
| 5/31/17 | 6/6/17 | Qunnipiac | TRUE | Oppose | 62 |
| 5/4/17 | 5/10/17 | Quinnipiac | TRUE | Favor | 21 |
| 5/4/17 | 5/10/17 | Quinnipiac | TRUE | Oppose | 56 |
| 5/4/17 | 5/9/17 | Morning Consult | TRUE | Favor | 38 |
| 5/4/17 | 5/9/17 | Morning Consult | TRUE | Oppose | 44 |
| 5/6/17 | 5/13/17 | YouGov | FALSE | Favor | 31 |
| 5/6/17 | 5/13/17 | YouGov | FALSE | Oppose | 44 |
| 5/6/17 | 5/6/17 | YouGov | FALSE | Favor | 31 |
| 5/6/17 | 5/6/17 | YouGov | FALSE | Oppose | 47 |
| 6/4/17 | 6/6/17 | YouGov | TRUE | Favor | 33 |
| 6/4/17 | 6/6/17 | YouGov | TRUE | Oppose | 45 |
| 6/9/17 | 6/11/17 | Public Policy Polling | FALSE | Favor | 24 |
| 6/9/17 | 6/11/17 | Public Policy Polling | FALSE | Oppose | 55 |
ratings.gather %>%
dplyr::filter(obama == F & attitude == "Favor") %>%
ggplot(., aes(x = reorder(paste(Pollster, Start, End, sep = " - "), percent),
y = percent, group = attitude)) +
geom_bar(stat = "identity", position = "stack", fill = "deepskyblue") +
geom_hline(yintercept = mean(
ratings.gather$percent[ratings.gather$obama == F &
ratings.gather$attitude == "Favor"]
),
col = "black", linetype = "dashed") +
scale_y_continuous(limits = c(0, 40), breaks = seq(0, 40, 5)) +
labs(x = "", y = "") +
ggtitle("Percentage of Favoring the Health Care Bill \n- when Obama is NOT mentioned") +
theme(legend.position = "none", plot.title = element_text(hjust = 0.5),
axis.text.x = element_text(angle = 45, vjust = 0.5, hjust = 0.5))
The result does not seem to make a big difference (when we look at the dashed line representing the average among all pollsters); in fact, mentioning “obama” seems to draw higher approval.
ratings.gather %>%
dplyr::filter(obama == T & attitude == "Favor") %>%
ggplot(., aes(x = reorder(paste(Pollster, Start, End, sep = " - "), percent),
y = percent, group = attitude)) +
geom_bar(stat = "identity", position = "stack", fill = "firebrick") +
geom_hline(yintercept = mean(
ratings.gather$percent[ratings.gather$obama == T &
ratings.gather$attitude == "Favor"]
),
col = "black", linetype = "dashed") +
scale_y_continuous(limits = c(0, 40), breaks = seq(0, 40, 5)) +
labs(x = "", y = "") +
ggtitle("Percentage of Favoring the Health Care Bill \n- when Obama is mentioned") +
theme(legend.position = "none", plot.title = element_text(hjust = 0.5),
axis.text.x = element_text(angle = 45, vjust = 0.5, hjust = 0.5))
***