setwd("C:/Users/DellPC/Desktop/Corner/R_source_code/Julia_Silge")
#Manipulation
library(tidyverse)
#Create models
library(tidymodels)
#Feature importance
library(vip)
library(lime)
#Visualization
library(RColorBrewer)
library(scales)
library(DT)
library(ggsci)
library(ggthemes)
library(ggpubr)
library(gghalves)
library(ggridges)
library(correlationfunnel)
library(ggalluvial)
library(tidymodels)
library(cowplot)
library(fmsb)
#Auto ML
library(h2o)
df <- read.csv("bank_churner.csv")
df <- df %>%
select(-c(CLIENTNUM,
Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1,
Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2)) %>%
set_names(names(.) %>%
str_to_lower())naniar::vis_miss(df)DataExplorer::plot_missing(df)DataExplorer::plot_intro(df) + theme_light()glimpse(df)## Rows: 10,127
## Columns: 20
## $ attrition_flag <chr> "Existing Customer", "Existing Customer", "Ex~
## $ customer_age <int> 45, 49, 51, 40, 40, 44, 51, 32, 37, 48, 42, 6~
## $ gender <chr> "M", "F", "M", "F", "M", "M", "M", "M", "M", ~
## $ dependent_count <int> 3, 5, 3, 4, 3, 2, 4, 0, 3, 2, 5, 1, 1, 3, 2, ~
## $ education_level <chr> "High School", "Graduate", "Graduate", "High ~
## $ marital_status <chr> "Married", "Single", "Married", "Unknown", "M~
## $ income_category <chr> "$60K - $80K", "Less than $40K", "$80K - $120~
## $ card_category <chr> "Blue", "Blue", "Blue", "Blue", "Blue", "Blue~
## $ months_on_book <int> 39, 44, 36, 34, 21, 36, 46, 27, 36, 36, 31, 5~
## $ total_relationship_count <int> 5, 6, 4, 3, 5, 3, 6, 2, 5, 6, 5, 6, 3, 5, 5, ~
## $ months_inactive_12_mon <int> 1, 1, 1, 4, 1, 1, 1, 2, 2, 3, 3, 2, 6, 1, 2, ~
## $ contacts_count_12_mon <int> 3, 2, 0, 1, 0, 2, 3, 2, 0, 3, 2, 3, 0, 3, 2, ~
## $ credit_limit <dbl> 12691.0, 8256.0, 3418.0, 3313.0, 4716.0, 4010~
## $ total_revolving_bal <int> 777, 864, 0, 2517, 0, 1247, 2264, 1396, 2517,~
## $ avg_open_to_buy <dbl> 11914.0, 7392.0, 3418.0, 796.0, 4716.0, 2763.~
## $ total_amt_chng_q4_q1 <dbl> 1.335, 1.541, 2.594, 1.405, 2.175, 1.376, 1.9~
## $ total_trans_amt <int> 1144, 1291, 1887, 1171, 816, 1088, 1330, 1538~
## $ total_trans_ct <int> 42, 33, 20, 20, 28, 24, 31, 36, 24, 32, 42, 2~
## $ total_ct_chng_q4_q1 <dbl> 1.625, 3.714, 2.333, 2.333, 2.500, 0.846, 0.7~
## $ avg_utilization_ratio <dbl> 0.061, 0.105, 0.000, 0.760, 0.000, 0.311, 0.0~
skimr::skim(df)| Name | df |
| Number of rows | 10127 |
| Number of columns | 20 |
| _______________________ | |
| Column type frequency: | |
| character | 6 |
| numeric | 14 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| attrition_flag | 0 | 1 | 17 | 17 | 0 | 2 | 0 |
| gender | 0 | 1 | 1 | 1 | 0 | 2 | 0 |
| education_level | 0 | 1 | 7 | 13 | 0 | 7 | 0 |
| marital_status | 0 | 1 | 6 | 8 | 0 | 4 | 0 |
| income_category | 0 | 1 | 7 | 14 | 0 | 6 | 0 |
| card_category | 0 | 1 | 4 | 8 | 0 | 4 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| customer_age | 0 | 1 | 46.33 | 8.02 | 26.0 | 41.00 | 46.00 | 52.00 | 73.00 | ▂▆▇▃▁ |
| dependent_count | 0 | 1 | 2.35 | 1.30 | 0.0 | 1.00 | 2.00 | 3.00 | 5.00 | ▇▇▇▅▁ |
| months_on_book | 0 | 1 | 35.93 | 7.99 | 13.0 | 31.00 | 36.00 | 40.00 | 56.00 | ▁▃▇▃▂ |
| total_relationship_count | 0 | 1 | 3.81 | 1.55 | 1.0 | 3.00 | 4.00 | 5.00 | 6.00 | ▇▇▆▆▆ |
| months_inactive_12_mon | 0 | 1 | 2.34 | 1.01 | 0.0 | 2.00 | 2.00 | 3.00 | 6.00 | ▅▇▇▁▁ |
| contacts_count_12_mon | 0 | 1 | 2.46 | 1.11 | 0.0 | 2.00 | 2.00 | 3.00 | 6.00 | ▅▇▇▃▁ |
| credit_limit | 0 | 1 | 8631.95 | 9088.78 | 1438.3 | 2555.00 | 4549.00 | 11067.50 | 34516.00 | ▇▂▁▁▁ |
| total_revolving_bal | 0 | 1 | 1162.81 | 814.99 | 0.0 | 359.00 | 1276.00 | 1784.00 | 2517.00 | ▇▅▇▇▅ |
| avg_open_to_buy | 0 | 1 | 7469.14 | 9090.69 | 3.0 | 1324.50 | 3474.00 | 9859.00 | 34516.00 | ▇▂▁▁▁ |
| total_amt_chng_q4_q1 | 0 | 1 | 0.76 | 0.22 | 0.0 | 0.63 | 0.74 | 0.86 | 3.40 | ▅▇▁▁▁ |
| total_trans_amt | 0 | 1 | 4404.09 | 3397.13 | 510.0 | 2155.50 | 3899.00 | 4741.00 | 18484.00 | ▇▅▁▁▁ |
| total_trans_ct | 0 | 1 | 64.86 | 23.47 | 10.0 | 45.00 | 67.00 | 81.00 | 139.00 | ▂▅▇▂▁ |
| total_ct_chng_q4_q1 | 0 | 1 | 0.71 | 0.24 | 0.0 | 0.58 | 0.70 | 0.82 | 3.71 | ▇▆▁▁▁ |
| avg_utilization_ratio | 0 | 1 | 0.27 | 0.28 | 0.0 | 0.02 | 0.18 | 0.50 | 1.00 | ▇▂▂▂▁ |
DT::datatable(df %>% sample_n(100))The goal of business problem is to better understand what attributes(features) are making our clients to leave the organization. Moreover, in this business problem we have three main tasks that we will perform to come up with potential solutions and conclusion:
Many organizations eventually will face a situation in which a customer decides to leave the organization. Nevertheless, questions come to pop up especially when it comes to the reasons that customer decided to leave the company. Definitely there are reasons that are not consistent with other customers that have left an organization but what if there is a pattern as to the reason customers decided to leave the organization? If the company would be able to detect the major reasons why customers are leaving the company, the company would be able to react and prevent customers from leaving. Furthermore, if the company is able to understand from the past, the organization will be able to prevent further customer attritions in the future
Note: Before proceeding further, please the notebook if you find it useful to the community and of course if you like it. Please comment if you have a constructive criticism on the comment section.
I will keep the feature definitions to the most important however, if I see the need to analyze further any other features I will add them to the list.
ul>Here are the main libraries and how they will help us solve this business problem.
custom_theme <- theme_bw() +
theme(plot.title = element_text(face = "bold", color = "black", size=14),
plot.subtitle = element_text(face = "italic", color = "black", size=12),
axis.text = element_text(color = "black"), legend.text = element_text(size=10),
legend.title = element_text(size = 12), legend.position = "none",
strip.background =element_rect(fill="#666666"), strip.text = element_text(color="white", face="bold"),
plot.caption = element_text(face = "italic"))We will use the skimr package to gather some brief information on our dataset.
Note:
There are other features to explore but I kept them into these three as we will further analyze this later in the data exploration section.
# Some of the features can be categorical
skimr::skim(df)| Name | df |
| Number of rows | 10127 |
| Number of columns | 20 |
| _______________________ | |
| Column type frequency: | |
| character | 6 |
| numeric | 14 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| attrition_flag | 0 | 1 | 17 | 17 | 0 | 2 | 0 |
| gender | 0 | 1 | 1 | 1 | 0 | 2 | 0 |
| education_level | 0 | 1 | 7 | 13 | 0 | 7 | 0 |
| marital_status | 0 | 1 | 6 | 8 | 0 | 4 | 0 |
| income_category | 0 | 1 | 7 | 14 | 0 | 6 | 0 |
| card_category | 0 | 1 | 4 | 8 | 0 | 4 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| customer_age | 0 | 1 | 46.33 | 8.02 | 26.0 | 41.00 | 46.00 | 52.00 | 73.00 | ▂▆▇▃▁ |
| dependent_count | 0 | 1 | 2.35 | 1.30 | 0.0 | 1.00 | 2.00 | 3.00 | 5.00 | ▇▇▇▅▁ |
| months_on_book | 0 | 1 | 35.93 | 7.99 | 13.0 | 31.00 | 36.00 | 40.00 | 56.00 | ▁▃▇▃▂ |
| total_relationship_count | 0 | 1 | 3.81 | 1.55 | 1.0 | 3.00 | 4.00 | 5.00 | 6.00 | ▇▇▆▆▆ |
| months_inactive_12_mon | 0 | 1 | 2.34 | 1.01 | 0.0 | 2.00 | 2.00 | 3.00 | 6.00 | ▅▇▇▁▁ |
| contacts_count_12_mon | 0 | 1 | 2.46 | 1.11 | 0.0 | 2.00 | 2.00 | 3.00 | 6.00 | ▅▇▇▃▁ |
| credit_limit | 0 | 1 | 8631.95 | 9088.78 | 1438.3 | 2555.00 | 4549.00 | 11067.50 | 34516.00 | ▇▂▁▁▁ |
| total_revolving_bal | 0 | 1 | 1162.81 | 814.99 | 0.0 | 359.00 | 1276.00 | 1784.00 | 2517.00 | ▇▅▇▇▅ |
| avg_open_to_buy | 0 | 1 | 7469.14 | 9090.69 | 3.0 | 1324.50 | 3474.00 | 9859.00 | 34516.00 | ▇▂▁▁▁ |
| total_amt_chng_q4_q1 | 0 | 1 | 0.76 | 0.22 | 0.0 | 0.63 | 0.74 | 0.86 | 3.40 | ▅▇▁▁▁ |
| total_trans_amt | 0 | 1 | 4404.09 | 3397.13 | 510.0 | 2155.50 | 3899.00 | 4741.00 | 18484.00 | ▇▅▁▁▁ |
| total_trans_ct | 0 | 1 | 64.86 | 23.47 | 10.0 | 45.00 | 67.00 | 81.00 | 139.00 | ▂▅▇▂▁ |
| total_ct_chng_q4_q1 | 0 | 1 | 0.71 | 0.24 | 0.0 | 0.58 | 0.70 | 0.82 | 3.71 | ▇▆▁▁▁ |
| avg_utilization_ratio | 0 | 1 | 0.27 | 0.28 | 0.0 | 0.02 | 0.18 | 0.50 | 1.00 | ▇▂▂▂▁ |
df %>%
select_if(is.character) %>%
map( ~ table(.) %>% prop.table())## $attrition_flag
## .
## Attrited Customer Existing Customer
## 0.1606596 0.8393404
##
## $gender
## .
## F M
## 0.5290807 0.4709193
##
## $education_level
## .
## College Doctorate Graduate High School Post-Graduate
## 0.10002962 0.04453441 0.30887726 0.19877555 0.05095290
## Uneducated Unknown
## 0.14683519 0.14999506
##
## $marital_status
## .
## Divorced Married Single Unknown
## 0.07386195 0.46282216 0.38935519 0.07396070
##
## $income_category
## .
## $120K + $40K - $60K $60K - $80K $80K - $120K Less than $40K
## 0.07178829 0.17675521 0.13844179 0.15157500 0.35163425
## Unknown
## 0.10980547
##
## $card_category
## .
## Blue Gold Platinum Silver
## 0.931766565 0.011454528 0.001974919 0.054803989
Summary
# Step 2: Income Category ----
# Assign the level order
level_order <- c("$120K +", "Less than $40K", "$40K - $60K", "$80K - $120K", "$40K - $60K", "$60K - $80K", "Unknown")
# a. Find % of customers who are about to leave
income_tbl <- df %>%
select(attrition_flag, income_category) %>%
count(attrition_flag, income_category) %>%
group_by(income_category) %>%
mutate(pct = n / sum(n)) %>%
ungroup() %>%
arrange(desc(pct)) %>%
mutate(
pct_txt = scales::percent(pct),
income_category = income_category %>% factor(levels = c("$120K +", "Less than $40K", "$40K - $60K", "$80K - $120K", "$60K - $80K", "Unknown")) %>% fct_rev(),
income_category = income_category %>% as_factor() %>% fct_reorder(pct),
attrition_flag = attrition_flag %>% as_factor() %>% fct_rev()
)
# Plot Visualization
income_tbl %>%
ggplot(aes(x=income_category, y=pct, color= attrition_flag)) +
geom_segment(aes(yend = 0, xend = income_category), size = 1) +
geom_point() +
geom_label(aes(label = pct_txt), hjust = "inward",
size = 3) +
coord_flip() +
facet_wrap(~ attrition_flag) + custom_theme +
theme(legend.position = "none") +
scale_colour_nejm() +
labs(
title = "Levels of Attrition",
subtitle = "by Income Category",
x = "Income Category",
y = "Percentage of Attrition"
)Summary:
# Reference function: Stackoverflow
specify_decimal <- function(x, k) trimws(format(round(x, k), nsmall=k))
# From which income and educational categories are most attrited customers coming from?
# Compare attrition levels by income_category and education level
income_education_pct_tbl <- df %>%
count(attrition_flag, income_category, education_level) %>%
group_by(attrition_flag, income_category, education_level) %>%
summarise(total = sum(n)) %>%
ungroup() %>%
group_by(income_category) %>%
mutate(
pct = total / sum(total)
) %>% ungroup() %>%
mutate(
income_category = income_category %>% factor(levels = c("$120K +", "Less than $40K", "$40K - $60K", "$80K - $120K", "$60K - $80K", "Unknown")) %>% fct_rev()
) %>%
mutate(
pct = as.numeric(specify_decimal(pct,4)),
pct_txt = str_glue("{pct*100}%")
)
# Plot
income_education_pct_tbl %>%
ggplot(aes(x=education_level, y = income_category)) +
geom_tile(aes(fill = pct)) +
scale_fill_gradient2(low = "#6F99ADFF", mid = "white", high = "#BC3C29FF") +
facet_wrap(~ attrition_flag, scales = "free_x") + custom_theme +
theme(legend.position = "none",
axis.text.x = element_text(angle = 45, hjust = 1),
plot.caption = element_text(face = "italic")) +
geom_text(aes(label = pct_txt), size=2.5) +
labs(
title = "Heatmap Customer Attrition",
caption = "Highly educated and mid to high income earners have a great % of Attrition",
x = "Education Level",
y = "Income Category"
)Lets make a confirmation we saw that the group that had the “second” highest attrition rate flag was the Less than 40k income category group. In this case I wanted to explore the lowest income category to see what was the level of revolving balance to this group.
Summary:
# Revolving balance by income category
# Think of a chart.
# Correlation in credit_limit and revolving_balance
lowest_income_cat_tbl <- df %>%
filter(income_category == "Less than $40K" & attrition_flag == "Attrited Customer") %>%
summarise(
correlation = scales::percent(cor(credit_limit, total_revolving_bal))
)
df %>%
filter(attrition_flag == "Attrited Customer") %>%
ggplot(aes(x=credit_limit, y=total_revolving_bal, color=income_category)) + geom_point(alpha = 0.3) +
facet_wrap(~ income_category, scales = "free_x") + geom_smooth(method = "lm", color="red") +
custom_theme + scale_colour_nejm() +
labs(
title = "Correlation of Credit Limit and Revolving Balance",
caption = str_glue("Correlation of lowest income category: {lowest_income_cat_tbl}"),
x = "Credit Limit",
y = "Total Revolving Balance"
)The lower income category has a slightly larger utilization ratio as opposed to the other income categories. Nevertheless, this is not a significant amount but it could indicate that the lower income category has a slightly level of activity. This will be good to understand especially when we get into the phase of feature importance for our model. The utilization ratio also tells us the level of activity by income category but again we should go to the feature importance phase to see if levels of activity is an indicator whether a customer will leave the organization.
income_metrics <- df %>%
select(income_category, total_trans_ct, avg_utilization_ratio, total_trans_amt) %>%
group_by(income_category) %>%
summarise(
avg_trans_ct = round(mean(total_trans_ct, na.rm = TRUE),2),
avg_utilization_ratio = round(mean(avg_utilization_ratio, na.rm = TRUE),2)
) %>%
ungroup()
# Perform the radar chart
# Maybe create several metrics
prep_tbl <- income_metrics %>%
pivot_longer(2:3) %>%
mutate(
income_category = as.factor(income_category) %>% fct_reorder(value)
)
prep_tbl %>%
ggplot(aes(x=income_category, y=value, fill=name)) +
geom_col(color="black") +
facet_wrap(~name, scales = "free_x") +
coord_flip() + custom_theme + scale_fill_nejm() +
geom_label(aes(label=value), color="white", hjust=1.2) +
labs(
title = "Metrics by Income Category",
subtitle = "Getting deeper into levels of activity",
x = "Value",
y = "Income Category"
)The purpose of exploring the card category is to see if our clients are satisfied with the product offerings the institution offers. What are the different type of credit card the credit institution offers?
card_category_attrition_tbl <- df %>%
select(attrition_flag, card_category) %>%
count(attrition_flag, card_category) %>%
group_by(card_category) %>%
mutate(pct = n / sum(n)) %>%
ungroup() %>%
arrange(desc(pct)) %>%
mutate(
pct_txt = scales::percent(pct),
card_category = card_category %>% as_factor() %>% fct_reorder(pct)
)
card_category_attrition_tbl %>%
ggplot(aes(x = pct, y = card_category, fill = attrition_flag)) +
geom_col(position = "dodge", width = 0.5, color = "black") +
facet_wrap(~ attrition_flag, scales = "free_x") + custom_theme +
scale_fill_nejm() +
scale_x_continuous(labels = scales::percent) +
geom_label(aes(label = pct_txt), hjust = "inward", color = "white") +
labs(
title = "Customer Status by Income Cateogry",
x = "Attrition Rate",
y = "Card Category",
caption = "Platinum cards have the highest attrition rates"
)# Distribution of Utulization ratio by type of card
card_utilization_tbl <- df %>%
select(attrition_flag, card_category, avg_utilization_ratio, total_trans_amt)
card_utilization_tbl %>%
ggplot(aes(x=card_category, y=total_trans_amt,fill=card_category)) +
geom_jitter(aes(color=card_category),
alpha = 0.5,
size=0.5,
show.legend = FALSE) +
geom_half_violin(aes(fill=card_category),
side = "l",
alpha = 0.45,
show.legend = FALSE,
trim = FALSE) +
geom_half_boxplot(aes(fill = card_category),
side = "r",
outlier.size = 1,
outlier.color = "red",
width = 0.4,
alpha = 0.2,
show.legend = FALSE) +
stat_summary(fun = median, geom="line") +
facet_wrap(~ attrition_flag) + custom_theme +
scale_fill_manual(values = c("#1C366B", "#FCD16B", "#C7CEF6", "#D3DDDC")) +
scale_color_manual(values = c("#1C366B", "#FCD16B", "#C7CEF6", "#D3DDDC")) +
scale_y_continuous(labels = scales::dollar) +
labs(
title = "Distribution of Transactions by Type of Card",
x = "Card Category",
y = "Total Transactions"
)Lets analyze the level of inactivity for those customers that decided or that will leave the organization and those customers.
attrition_tbl <- df %>%
select(attrition_flag, months_inactive_12_mon, total_revolving_bal, total_trans_amt, credit_limit)
# Levels of Inactivity by attrition flag
p1 <- attrition_tbl %>%
ggplot(aes(x = months_inactive_12_mon, y = attrition_flag, fill = attrition_flag)) +
stat_density_ridges(quantile_lines = TRUE, quantiles = 2, alpha = 0.4) +
scale_fill_nejm() + custom_theme +
labs(
title = "Levels of Inactivity",
x = "Inactive Months",
y = "Attrition Flag",
caption = "Attrition Customers have higher levels of Inactivity (3 months vs 2 months Median)"
)
p1We can clearly see that the customers that had a lower revolving balance were the most likely to leave. This could indicate that those customers could have left because they found other lower interest rates to other companies that offered better products at lower rates. In the case of existing customers on average they had a much higher revolving balance but customers with higher revolving balance have a much harder time to look for other financial institutions to offer lower rates due to the risk that the customer will not pay at all what is left of the remaining balance. (High risk, higher rates.)
# Radar Chart for attrited customers
max_val <- df %>%
select(attrition_flag, income_category, total_revolving_bal) %>%
filter(attrition_flag == "Attrited Customer") %>%
group_by(income_category) %>%
summarise(
max_val = 1000
) %>% ungroup() %>%
pivot_wider(names_from = income_category, values_from = c(max_val))
min_val <- df %>%
select(attrition_flag, income_category, total_revolving_bal) %>%
filter(attrition_flag == "Attrited Customer") %>%
group_by(income_category) %>%
summarise(
min_val = min(total_revolving_bal)
) %>% ungroup() %>%
pivot_wider(names_from = income_category, values_from = c(min_val))
revo_df <- df %>%
select(attrition_flag, income_category, total_revolving_bal) %>%
filter(attrition_flag == "Attrited Customer") %>%
group_by(income_category) %>%
summarise(
avg_rev = mean(total_revolving_bal, na.rm = TRUE)
) %>%
ungroup() %>%
pivot_wider(names_from = income_category, values_from = c(avg_rev))
combined_df_attrited <- max_val %>%
rbind(min_val) %>%
rbind(revo_df)
rownames(combined_df_attrited) <- 1:nrow(combined_df_attrited)
revo_df <- df %>%
select(attrition_flag, income_category, total_revolving_bal) %>%
filter(attrition_flag == "Attrited Customer") %>%
group_by(income_category) %>%
summarise(
avg_rev = mean(total_revolving_bal, na.rm = TRUE)
) %>%
ungroup() %>%
pivot_wider(names_from = income_category, values_from = c(avg_rev))
# Customers that remained
max_val_existing <- df %>%
select(attrition_flag, income_category, total_revolving_bal) %>%
filter(attrition_flag == "Attrited Customer") %>%
group_by(income_category) %>%
summarise(
max_val = 1700
) %>% ungroup() %>%
pivot_wider(names_from = income_category, values_from = c(max_val))
min_val_existing <- df %>%
select(attrition_flag, income_category, total_revolving_bal) %>%
filter(attrition_flag == "Existing Customer") %>%
group_by(income_category) %>%
summarise(
min_val = min(total_revolving_bal)
) %>% ungroup() %>%
pivot_wider(names_from = income_category, values_from = c(min_val))
revo_df_existing <- df %>%
select(attrition_flag, income_category, total_revolving_bal) %>%
filter(attrition_flag == "Existing Customer") %>%
group_by(income_category) %>%
summarise(
avg_rev = mean(total_revolving_bal, na.rm = TRUE)
) %>%
ungroup() %>%
pivot_wider(names_from = income_category, values_from = c(avg_rev))
combined_df_existing <- max_val_existing %>%
rbind(min_val_existing) %>%
rbind(revo_df_existing)
combined_df_attrited <- max_val %>%
rbind(min_val) %>%
rbind(revo_df)
rownames(combined_df_attrited) <- 1:nrow(combined_df_attrited)
create_beautiful_radarchart <- function(data, color = "#00AFBB",
vlabels = colnames(data), vlcex = 0.7,
caxislabels = NULL, title = NULL, ...){
radarchart(
data, axistype = 1,
# Customize the polygon
pcol = color, pfcol = scales::alpha(color, 0.5), plwd = 2, plty = 1,
# Customize the grid
cglcol = "grey", cglty = 1, cglwd = 0.8,
# Customize the axis
axislabcol = "grey",
# Variable labels
vlcex = vlcex, vlabels = vlabels,
caxislabels = caxislabels, title = title, ...
)
}
op <- par(mar = c(1, 2, 2, 1))
attrited_plot <- create_beautiful_radarchart(combined_df_attrited, caxislabels = c(0,300 , 600, 900, 1200), title = "Attrited Customer", color="#F92923")existing_plot <- create_beautiful_radarchart(combined_df_existing, caxislabels = c(0, 400, 800, 1200, 1600), title = "Existing Customer", color="#23F993")In this section we will explore possible combinations of groups of people that leave the organization. Through this analysis we could quickly visualize clusters that are more likely to leave the organization.
sample_data <- df %>%
select(attrition_flag, education_level, gender, income_category) %>%
filter(attrition_flag == "Attrited Customer") %>%
count(attrition_flag, education_level, gender, income_category) %>%
mutate(
interesting_group = ifelse(gender == "F" & education_level == "Graduate" &
income_category == "Less than $40K", "Interesting", "Not Interesting")
)
sample_data %>%
ggplot(aes(y = n,
axis1 = gender, axis2 = education_level, axis3 = income_category)) +
geom_alluvium(aes(fill = interesting_group), alpha = 0.4, absolute = FALSE) +
geom_stratum(absolute = FALSE, width = 0.45) +
geom_text(stat = "stratum", aes(label = after_stat(stratum)),
absolute = FALSE) +
scale_x_discrete(limits = c("Income Category", "Education Level", "Gender"), expand = c(.1, .05)) +
custom_theme + scale_fill_nejm() +
labs(
title = "Different Groups within Customer Attrition",
subtitle = "Attrited Customer Sample Population",
y = "Amount",
caption = "Female Graduates in the low income category makes a good portion of Attrited Customers"
)Before getting into the AutoML phase, it will be interesting to explore which features are correlated with whether a customer would leave the organization. For this we will use a so called “correlation” funner which will allow us to understand which features are correlated with whether someone will leave the organization. Note: For numeric variables we have decided to bin them into 4 categories and then create dummy variables based on these features through one hot encoding.
correlation_prepared_tbl <- df %>% binarize()
# Plot correlation funnel
correlation_prepared_tbl %>%
correlate(target = attrition_flag__Attrited_Customer) %>%
plot_correlation_funnel(interactive = FALSE) + custom_theme In this section we will start implementing H2o. H2o is an automl library that simplifies the process of model selection. Not only that, but it reduces the time we would have spent performing a GridSearch process (Hyperparameter tuning.)
These will be the steps to implement H2o:library(h2o)
Sys.setenv(JAVA_HOME = "C:/Program Files/Java/jdk1.8.0_301")
print(Sys.getenv("JAVA_HOME"))## [1] "C:/Program Files/Java/jdk1.8.0_301"
#Call H20
h2o.init()## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 2 hours 25 minutes
## H2O cluster timezone: Asia/Bangkok
## H2O data parsing timezone: UTC
## H2O cluster version: 3.32.1.7
## H2O cluster version age: 2 days
## H2O cluster name: H2O_started_from_R_DellPC_uta384
## H2O cluster total nodes: 1
## H2O cluster total memory: 2.42 GB
## H2O cluster total cores: 4
## H2O cluster allowed cores: 4
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## H2O API Extensions: Amazon S3, Algos, AutoML, Core V3, TargetEncoder, Core V4
## R Version: R version 4.1.1 (2021-08-10)
In this section we will implement AutoML from a series of models. AutoML is a framework that helps us simplify the implementation of machine learning models, preprocessing steps, hyperparameter tuning among other aspects of the machine learning pipeline. We will implement the following steps:
The top models that we got are all Gradient Boosting models with different parameters.
leaderboard_data <- automl_models_h2o@leaderboard %>%
as_tibble() %>%
mutate(
# Takes the first part before the underscore
model_type = str_split(model_id, "_", simplify = T)[,1]
) %>% rownames_to_column() %>%
mutate(
model_id = model_id %>% as_factor() %>% fct_reorder(auc),
model_type = model_type %>% as_factor()
) %>%
select(rowname, model_id, model_type, everything()) %>%
pivot_longer(4:9) %>%
mutate(
model_id = paste0(rowname, ". ", model_id) %>% as_factor() %>% fct_rev()
) %>%
filter(name %in% c("auc", "logloss"))
leaderboard_data %>%
ggplot(aes(x=value, y=model_id, color = model_type)) +
geom_segment(aes(x = 0, y = model_id, xend = value, yend = model_id), color = "grey50") +
geom_point(size=3) + facet_wrap(~ name, scales = "free_x") +
scale_color_nejm() + custom_theme +
geom_label(aes(label = round(value, 2), hjust = "inward"), size = 3) +
labs(
title = "Model Leaderboard",
subtitle = paste0("Ordered by: auc")
)Precision and recall are metrics that are commonly used for classification problems. This will allow us to better evaluate the performance of our model. Furthermore, in this case we will used the model that performed the best and that is the GBM_3_AUTOML model. However, it is possible to evaluate more than one model at the same time but to keep things simple in this tutorial I will only use one model.
metrics_tbl <- performance_h2o %>%
h2o.metric() %>%
as_tibble()
pr_plot <- metrics_tbl %>%
ggplot(aes(x=threshold)) +
geom_line(aes(y=precision), color = "#F24D29", size=1) +
geom_line(aes(y=recall), color = "#1DACE8", size=1) +
geom_vline(xintercept = h2o.find_threshold_by_max_metric(performance_h2o, "f1"),
lty="dashed") +
custom_theme +
labs(
title = "Precision vis a vis Recall"
)roc_tbl <- performance_h2o %>%
h2o.metric() %>%
as_tibble() %>%
mutate(
auc = h2o.auc(performance_h2o)
) %>%
select(tpr, fpr, auc)
roc_plot <- roc_tbl %>%
ggplot(aes(fpr, tpr)) +
geom_line(color = "#F24D29", size=1) +
geom_abline() +
custom_theme +
labs(
title = "ROC Plot"
)
plot_grid(pr_plot, roc_plot, ncol = 2)Now it is time to better understand our model. We would want to understand what factors drive our model to make certain predictions. In this case what factors make our model determine whether a customer will leave the organization? In order to know this we need to understand which features are important to our model.
fi_plot <- vip(leader_model) +
custom_theme + geom_col(fill="#F24D29", color="black") +
labs(
title = "Feature Importance",
caption = "Which features our model considers important"
)
fi_ploth2o.no_progress()
total_trans_plot <- h2o.partialPlot(leader_model, data = as.h2o(train_tbl), cols = c("total_trans_ct", "total_trans_amt"))In this case we will use the Lime package which helps us to understand the features that our model considers important. However, we will explore single observations in which the customer decided to leave the organization.
Note: The 4th observation is an incorrect prediction on the side of our model. It looks like this was an exception even if it had all the characteristics our model thought was important to consider that a customer will be leaving the organization.
h2o.no_progress()
explainer <- train_tbl %>%
select(-attrition_flag) %>%
lime(
model = leader_model,
bin_continuous = TRUE,
n_bins = 4,
quantile_bins = TRUE
)
# Feature number 4 is an incorrect prediction!
explanation <- test_tbl %>%
filter(attrition_flag == "Attrited Customer") %>%
slice(1:6) %>%
select(-attrition_flag) %>%
lime::explain(
explainer = explainer,
n_labels = 1,
n_features = 8,
n_permutations = 5000,
kernel_width = 1
)
plot_features(explanation = explanation)In this plot we can see that the trend continues with the four observations. So we see that for the first three trends we see that there is a consistency and a pattern. We could also try to explore different observations of attrited customers in order to see if this patterns continues. With this you can conclude that these are the features that our model understands that are important in determining whether the customer will leave the organization.
plot_explanations(explanation = explanation)
Note: You should also look at the business implications, False Positives have a lower impact to the organizations as opposed to False Negatives!
metrics_h2o <- leader_model %>%
h2o.performance(newdata = test_h2o)
predictions_tbl <- leader_model %>%
h2o.predict(newdata = test_h2o) %>%
as_tibble() %>%
bind_cols(test_h2o %>% as_tibble())
predictions_tbl %>%
select(1:4) %>%
conf_mat(attrition_flag, predict) %>%
pluck(1) %>%
as_tibble() %>%
ggplot(aes(x=Prediction, y=Truth, fill=n)) +
geom_tile(show.legend = F, alpha=0.5) +
geom_text(aes(label=n), color="black", alpha=1, size=8) +
scale_fill_gsea()+
custom_theme +
labs(
title = "Confusion Matrix"
)The concepts of thresholds is important when we come to classification problems. In this case we show the performance of different thresholds at various levels. True Negative Rates and False Positive Rates as well as false negative rates and true positive rates will always add to 1. As the threshold increases, the more precission our model has when determining whether a customer will leave the organization.
new_performance_h2o <- leader_model %>%
h2o.performance(newdata = test_h2o)
rates_by_threshold <- new_performance_h2o %>%
h2o.metric() %>%
as_tibble()
rates_by_threshold %>%
select(threshold, f1, tnr:tpr) %>%
gather(key = "key", value = "value", tnr:tpr, factor_key = TRUE) %>%
mutate(
key = fct_reorder2(key, threshold, value)
) %>%
ggplot(aes(x=threshold, value, color=key)) +
geom_point() +
geom_smooth() +
custom_theme +
theme(legend.position = "right") +
labs(
title = "Rates",
y = "Value",
x = "Threshold"
)And thats it I hope you liked this project and how we can use AutoML to help us with certain business tasks. Thanks for the support and dont forget to upvote if you did like the notebook.