# Import Libraries
# Core
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(skimr)
library(vip)
##
## Attaching package: 'vip'
##
## The following object is masked from 'package:utils':
##
## vi
library(lime)
##
## Attaching package: 'lime'
##
## The following object is masked from 'package:dplyr':
##
## explain
# Visualization
library(ggsci)
library(ggthemes)
library(ggpubr)
library(gghalves)
library(ggridges)
library(correlationfunnel)
## ══ Using correlationfunnel? ════════════════════════════════════════════════════
## You might also be interested in applied data science training for business.
## </> Learn more at - www.business-science.io </>
library(ggalluvial)
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom 1.0.5 ✔ rsample 1.2.0
## ✔ dials 1.2.0 ✔ tune 1.1.2
## ✔ infer 1.0.5 ✔ workflows 1.1.3
## ✔ modeldata 1.2.0 ✔ workflowsets 1.0.1
## ✔ parsnip 1.1.1 ✔ yardstick 1.2.0
## ✔ recipes 1.0.8
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ lime::explain() masks dplyr::explain()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Dig deeper into tidy modeling with R at https://www.tmwr.org
library(cowplot)
##
## Attaching package: 'cowplot'
##
## The following object is masked from 'package:ggpubr':
##
## get_legend
##
## The following object is masked from 'package:ggthemes':
##
## theme_map
##
## The following object is masked from 'package:lubridate':
##
## stamp
library(fmsb)
library(h2o)
##
## ----------------------------------------------------------------------
##
## Your next step is to start H2O:
## > h2o.init()
##
## For H2O package documentation, ask for help:
## > ??h2o
##
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
##
## ----------------------------------------------------------------------
##
##
## Attaching package: 'h2o'
##
## The following objects are masked from 'package:lubridate':
##
## day, hour, month, week, year
##
## The following objects are masked from 'package:stats':
##
## cor, sd, var
##
## The following objects are masked from 'package:base':
##
## &&, %*%, %in%, ||, apply, as.factor, as.numeric, colnames,
## colnames<-, ifelse, is.character, is.factor, is.numeric, log,
## log10, log1p, log2, round, signif, trunc
suppressMessages(library(tidyverse))
df = read_csv("/Users/nguyenbuiminh/Desktop/coding_python/ml/time_series/BankChurners.csv")
## Rows: 10127 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): Attrition_Flag, Gender, Education_Level, Marital_Status, Income_Ca...
## dbl (17): CLIENTNUM, Customer_Age, Dependent_count, Months_on_book, Total_Re...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(df)
## Rows: 10,127
## Columns: 23
## $ CLIENTNUM <dbl> …
## $ Attrition_Flag <chr> …
## $ Customer_Age <dbl> …
## $ Gender <chr> …
## $ Dependent_count <dbl> …
## $ Education_Level <chr> …
## $ Marital_Status <chr> …
## $ Income_Category <chr> …
## $ Card_Category <chr> …
## $ Months_on_book <dbl> …
## $ Total_Relationship_Count <dbl> …
## $ Months_Inactive_12_mon <dbl> …
## $ Contacts_Count_12_mon <dbl> …
## $ Credit_Limit <dbl> …
## $ Total_Revolving_Bal <dbl> …
## $ Avg_Open_To_Buy <dbl> …
## $ Total_Amt_Chng_Q4_Q1 <dbl> …
## $ Total_Trans_Amt <dbl> …
## $ Total_Trans_Ct <dbl> …
## $ Total_Ct_Chng_Q4_Q1 <dbl> …
## $ Avg_Utilization_Ratio <dbl> …
## $ Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1 <dbl> …
## $ Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2 <dbl> …
df <- df %>%
select(-c(CLIENTNUM,
Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1,
Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2)) %>%
set_names(names(.) %>% str_to_lower())
custom_theme <- theme_bw() +
theme(plot.title = element_text(face = "bold", color = "black", size=14),
plot.subtitle = element_text(face = "italic", color = "black", size=12),
axis.text = element_text(color = "black"), legend.text = element_text(size=10),
legend.title = element_text(size = 12), legend.position = "none",
strip.background =element_rect(fill="#666666"), strip.text = element_text(color="white", face="bold"),
plot.caption = element_text(face = "italic"))
We will use the skimr package to gather some brief information on our dataset
Note: There are other features to explore but I kept them into these three as we will further analyze this later in the data exploration section
skim(df)
| Name | df |
| Number of rows | 10127 |
| Number of columns | 20 |
| _______________________ | |
| Column type frequency: | |
| character | 6 |
| numeric | 14 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| attrition_flag | 0 | 1 | 17 | 17 | 0 | 2 | 0 |
| gender | 0 | 1 | 1 | 1 | 0 | 2 | 0 |
| education_level | 0 | 1 | 7 | 13 | 0 | 7 | 0 |
| marital_status | 0 | 1 | 6 | 8 | 0 | 4 | 0 |
| income_category | 0 | 1 | 7 | 14 | 0 | 6 | 0 |
| card_category | 0 | 1 | 4 | 8 | 0 | 4 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| customer_age | 0 | 1 | 46.33 | 8.02 | 26.0 | 41.00 | 46.00 | 52.00 | 73.00 | ▂▆▇▃▁ |
| dependent_count | 0 | 1 | 2.35 | 1.30 | 0.0 | 1.00 | 2.00 | 3.00 | 5.00 | ▇▇▇▅▁ |
| months_on_book | 0 | 1 | 35.93 | 7.99 | 13.0 | 31.00 | 36.00 | 40.00 | 56.00 | ▁▃▇▃▂ |
| total_relationship_count | 0 | 1 | 3.81 | 1.55 | 1.0 | 3.00 | 4.00 | 5.00 | 6.00 | ▇▇▆▆▆ |
| months_inactive_12_mon | 0 | 1 | 2.34 | 1.01 | 0.0 | 2.00 | 2.00 | 3.00 | 6.00 | ▅▇▇▁▁ |
| contacts_count_12_mon | 0 | 1 | 2.46 | 1.11 | 0.0 | 2.00 | 2.00 | 3.00 | 6.00 | ▅▇▇▃▁ |
| credit_limit | 0 | 1 | 8631.95 | 9088.78 | 1438.3 | 2555.00 | 4549.00 | 11067.50 | 34516.00 | ▇▂▁▁▁ |
| total_revolving_bal | 0 | 1 | 1162.81 | 814.99 | 0.0 | 359.00 | 1276.00 | 1784.00 | 2517.00 | ▇▅▇▇▅ |
| avg_open_to_buy | 0 | 1 | 7469.14 | 9090.69 | 3.0 | 1324.50 | 3474.00 | 9859.00 | 34516.00 | ▇▂▁▁▁ |
| total_amt_chng_q4_q1 | 0 | 1 | 0.76 | 0.22 | 0.0 | 0.63 | 0.74 | 0.86 | 3.40 | ▅▇▁▁▁ |
| total_trans_amt | 0 | 1 | 4404.09 | 3397.13 | 510.0 | 2155.50 | 3899.00 | 4741.00 | 18484.00 | ▇▅▁▁▁ |
| total_trans_ct | 0 | 1 | 64.86 | 23.47 | 10.0 | 45.00 | 67.00 | 81.00 | 139.00 | ▂▅▇▂▁ |
| total_ct_chng_q4_q1 | 0 | 1 | 0.71 | 0.24 | 0.0 | 0.58 | 0.70 | 0.82 | 3.71 | ▇▆▁▁▁ |
| avg_utilization_ratio | 0 | 1 | 0.27 | 0.28 | 0.0 | 0.02 | 0.18 | 0.50 | 1.00 | ▇▂▂▂▁ |
#Imbalanced datasets, 16% of customers have Attributed
df %>%
select_if(is.character) %>%
map(~ table(.) %>% prop.table())
## $attrition_flag
## .
## Attrited Customer Existing Customer
## 0.1606596 0.8393404
##
## $gender
## .
## F M
## 0.5290807 0.4709193
##
## $education_level
## .
## College Doctorate Graduate High School Post-Graduate
## 0.10002962 0.04453441 0.30887726 0.19877555 0.05095290
## Uneducated Unknown
## 0.14683519 0.14999506
##
## $marital_status
## .
## Divorced Married Single Unknown
## 0.07386195 0.46282216 0.38935519 0.07396070
##
## $income_category
## .
## $120K + $40K - $60K $60K - $80K $80K - $120K Less than $40K
## 0.07178829 0.17675521 0.13844179 0.15157500 0.35163425
## Unknown
## 0.10980547
##
## $card_category
## .
## Blue Gold Platinum Silver
## 0.931766565 0.011454528 0.001974919 0.054803989
#Summary - High Attrition level with High income category? We see that the higest income category has a high %, compared to the other income categories.
# Step 2: Income Category
# Assign the level order
level_order <- c("$120K +", "Less than $40K", "$40K - $60K", "$80K - $120K", "$40K - $60K", "$60K - $80K", "Unknown")
# a. Find % of customers who are about to leave
income_tbl <- df %>%
select(attrition_flag, income_category) %>%
count(attrition_flag, income_category) %>%
group_by(income_category) %>%
mutate(pct = n/sum(n)) %>%
ungroup() %>%
arrange(desc(pct)) %>%
mutate(
pct_txt = scales::percent(pct),
income_category = income_category %>% factor(levels = c("$120K +", "Less than $40K", "$40K - $60K", "$80K - $120K", "$60K - $80K", "Unknown")) %>% fct_rev(),
income_category = income_category %>% as_factor() %>% fct_reorder(pct),
attrition_flag = attrition_flag %>% as_factor() %>% fct_rev()
)
income_tbl
## # A tibble: 12 × 5
## attrition_flag income_category n pct pct_txt
## <fct> <fct> <int> <dbl> <chr>
## 1 Existing Customer $60K - $80K 1213 0.865 86.52%
## 2 Existing Customer $40K - $60K 1519 0.849 84.86%
## 3 Existing Customer $80K - $120K 1293 0.842 84.23%
## 4 Existing Customer Unknown 925 0.832 83.18%
## 5 Existing Customer Less than $40K 2949 0.828 82.81%
## 6 Existing Customer $120K + 601 0.827 82.67%
## 7 Attrited Customer $120K + 126 0.173 17.33%
## 8 Attrited Customer Less than $40K 612 0.172 17.19%
## 9 Attrited Customer Unknown 187 0.168 16.82%
## 10 Attrited Customer $80K - $120K 242 0.158 15.77%
## 11 Attrited Customer $40K - $60K 271 0.151 15.14%
## 12 Attrited Customer $60K - $80K 189 0.135 13.48%
income_tbl %>%
ggplot(aes(x=income_category, y = pct, color = attrition_flag)) +
geom_segment(aes(yend = 0, xend = income_category), size = 1) +
geom_point() +
geom_label(aes(label = pct_txt), hjust = "inward", size = 3) +
coord_flip() +
facet_wrap(~ attrition_flag) +
custom_theme +
theme(legend.position = "none") +
scale_colour_nejm() +
labs(
title= "Level of Attrition",
subtitle = "by Income Category",
x = "Income Category",
y = "percentage of Attrition"
)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
Summary: - Highest customer base: Interesting enough we see that the main customer base has a graduate program. Probably meaning that our main customer base are clients at a young age - Highest attrition rate: We see that the segment group of Graduate and highschool have the highest attrition rate
specify_decimal <- function(x, k) trimws(format(round(x, k), nsmall=k))
# From which income and edicational categories are most attributed customers coming from?
# Compare attrition levels by income_category and education level
income_education_pct_tbl <- df %>%
count(attrition_flag, income_category, education_level) %>%
group_by(attrition_flag, income_category, education_level) %>%
summarise(total = sum(n)) %>%
ungroup() %>%
group_by(income_category) %>%
mutate(pct = total / sum(total)) %>%
ungroup() %>%
mutate(
income_category = income_category %>% factor(levels = c("$120K +", "Less than $40K", "$40K - $60K", "$80K - $120K", "$60K - $80K", "Unknown")) %>% fct_rev()
) %>%
mutate(
pct = as.numeric(specify_decimal(pct, 4)),
pct_txt = str_glue("{pct*100}%")
)
## `summarise()` has grouped output by 'attrition_flag', 'income_category'. You
## can override using the `.groups` argument.
# Plot
income_education_pct_tbl %>%
ggplot(aes(x=education_level, y = income_category)) +
geom_tile(aes(fill =pct)) +
scale_fill_gradient2(low = "#6F99ADFF", mid = "white", high = "#BC3C29FF") + facet_wrap(~ attrition_flag, scales = "free_x") + custom_theme +
theme(legend.position = "none",
axis.text.x = element_text(angle = 45, hjust = 1),
plot.caption = element_text(face = "italic")) +
geom_text(aes(label = pct_txt), size=2.5) +
labs(
title = "Heatmap Customer Attrition",
caption = "Highly educated and mid to high income earners have a great % of Attrition",
x = "Education Level",
y = "Income Category"
)
Lets make a confirmation we saw the group that had the “second” highest attrition rate flag was the Less than 40k Income category group. In this case, I wanted to explore the lowest category to see what was the level of revolving balance to this group.
Summary: - Low levels of credit limit: The credit limit for the low income group was low for those attributed customer - High revolving balance: this group had a high revolving balance, which could be a reason why the credit limit for them was low - Not enough credit limit for certain groups: We see that there were individuals whi attributed who did not have any revolving balance but had low credit limits. Could this be a n explnation for attrition.
# Revolving balance by income category
# Correlation in credit limit and revolving balance
lowest_income_cat_tbl <- df %>%
filter(income_category == "Less than $40K" & attrition_flag == "Attrited Customer") %>%
summarise(
correlation = scales::percent(cor(credit_limit, total_revolving_bal))
)
df %>%
filter(attrition_flag == "Attrited Customer") %>%
ggplot(aes(x=credit_limit, y=total_revolving_bal, color=income_category)) + geom_point(alpha = 0.3) +
facet_wrap(~ income_category, scales = "free_x") + geom_smooth(method = "lm", color="red") +
custom_theme + scale_colour_nejm() +
labs(
title = "Correlation of Credit Limit and Revolving Balance",
caption = str_glue("Correlation of lowest income category: {lowest_income_cat_tbl}"),
x = "Credit Limit",
y = "Total Revolving Balance"
)
## `geom_smooth()` using formula = 'y ~ x'
The lower income category has a slightly larger utilization ratio as opposed to the other income categories. Nevertheless, this is not a significant amount but it could indicate that the lower income category has a slightly level of activity. This will be good to understand especially when we get into the phase of feature importance of our model.
income_metrics <- df %>%
select(income_category, total_trans_ct, avg_utilization_ratio, total_trans_amt) %>%
group_by(income_category) %>%
summarise(
avg_trans_ct = round(mean(total_trans_ct, na.rm = TRUE), 2),
avg_utilization_ratio = round(mean(avg_utilization_ratio, na.rm = TRUE), 2)
) %>%
ungroup()
prep_tbl <- income_metrics %>%
pivot_longer(2:3) %>%
mutate(
income_category = as.factor(income_category) %>% fct_reorder(value)
)
prep_tbl %>%
ggplot(aes(x=income_category, y=value, fill=name)) +
geom_col(color = "black") +
facet_wrap(~name, scales = "free_x") +
coord_flip() +
custom_theme +
scale_fill_nejm() +
geom_label(aes(label = value), color = "white", hjust = 1.2) +
labs(
title = "Metrics by Income Category",
subtitle = "Getting depper into levels of activity",
x = "Value",
y = "Income Category"
)
The purpose of exploring the card category is to see if our clients are satisfied with the product offerings the institution offers. What are the different type of credit card the credit institution offers - Blue Card - Gold Card - Silver Card - Platinum Card
card_category_attrition_tbl <- df %>%
select(attrition_flag, card_category) %>%
count(attrition_flag, card_category) %>%
group_by(card_category) %>%
mutate(pct = n / sum(n)) %>%
ungroup() %>%
arrange(desc(pct)) %>%
mutate(
pct_txt = scales::percent(pct),
card_category = card_category %>% as_factor() %>% fct_reorder(pct)
)
card_category_attrition_tbl %>%
ggplot(aes(x = pct, y = card_category, fill = attrition_flag)) +
geom_col(position = "dodge", width = 0.5, color = "black") +
facet_wrap(~ attrition_flag, scales = "free_x") + custom_theme +
scale_fill_nejm() +
scale_x_continuous(labels = scales::percent) +
geom_label(aes(label = pct_txt), hjust = "inward", color = "white") +
labs(
title = "Customer Status by Income Cateogry",
x = "Attrition Rate",
y = "Card Category",
caption = "Platinum cards have the highest attrition rates"
)
# Distribution of utilization ratio by type of card
card_utilization_tbl <- df %>%
select(attrition_flag, card_category, avg_utilization_ratio, total_trans_amt)
card_utilization_tbl %>%
ggplot(aes(x=card_category, y=total_trans_amt, fill = card_category)) +
geom_jitter(aes(color=card_category),
alpha = 0.5,
size =0.5,
show_legend = FALSE ) +
geom_half_violin(aes(fill = card_category),
alpha = 0.45,
side = "l",
show.legend = FALSE,
trim = FALSE) +
geom_half_boxplot(aes(fill = card_category),
side = "r",
outlier.size = 1,
outlier.color = "red",
width = 0.4,
alpha = 0.2,
show.legend = FALSE) +
stat_summary(fun = median, geom ="line") +
facet_wrap(~ attrition_flag) + custom_theme +
scale_fill_manual(values = c("#1C366B", "#FCD16B", "#C7CEF6", "#D3DDDC")) +
scale_color_manual(values = c("#1C366B", "#FCD16B", "#C7CEF6", "#D3DDDC")) +
scale_y_continuous(labels = scales::dollar) +
labs(
title = "Distribution of Transactions by Type of Card",
x = "Card Category",
y = "Total Transactions"
)
## Warning in geom_jitter(aes(color = card_category), alpha = 0.5, size = 0.5, :
## Ignoring unknown parameters: `show_legend`
## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?
## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?
Lets analyze the level of inactivity for those customers that decided or that will leave the organization and those customers
attrition_tbl <- df %>%
select(attrition_flag, months_inactive_12_mon, total_revolving_bal, total_trans_amt, credit_limit)
#Levels of Inactivity by attrition flag
p1 <- attrition_tbl %>%
ggplot(aes(x = months_inactive_12_mon, y = attrition_flag, fill = attrition_flag)) +
stat_density_ridges(quantile_lines = TRUE, quantiles = 2, alpha = 0.4) +
scale_fill_nejm() + custom_theme +
labs(title = "Levels of Inactivity",
x = "Inactive Months",
y = "Attrition FLag",
caption = "Attrition Customers have higher levels of Inactivity (3 months vs 2 months Median)"
)
p1
## Picking joint bandwidth of 0.151
## Warning: Using the `size` aesthetic with geom_segment was deprecated in ggplot2 3.4.0.
## ℹ Please use the `linewidth` aesthetic instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
We can clearly see that the customers that had a lower revolving balance were the most likely to leave. This could indicate that those customers could have left because they found other lower interest rates to other companies that offered better products at lower rates. In the case of existing customers on average they had a much higher revolving balance but customers with higher revolving balance have a much harder time to look for other financial institution to offer lower rates due to the risk the customer will not pay at all what is left of the remaining balance.
# Radar Chart for attrited customers
max_val <- df %>%
select(attrition_flag, income_category, total_revolving_bal) %>%
filter(attrition_flag == "Attrited Customer") %>%
group_by(income_category) %>%
summarise(
max_val = 1000
) %>% ungroup() %>%
pivot_wider(names_from = income_category, values_from = c(max_val))
min_val <- df %>%
select(attrition_flag, income_category, total_revolving_bal) %>%
filter(attrition_flag == "Attrited Customer") %>%
group_by(income_category) %>%
summarise(
min_val = min(total_revolving_bal)
) %>% ungroup() %>%
pivot_wider(names_from = income_category, values_from = c(min_val))
revo_df <- df %>%
select(attrition_flag, income_category, total_revolving_bal) %>%
filter(attrition_flag == "Attrited Customer") %>%
group_by(income_category) %>%
summarise(
avg_rev = mean(total_revolving_bal, na.rm = TRUE)
) %>%
ungroup() %>%
pivot_wider(names_from = income_category, values_from = c(avg_rev))
combined_df_attrited <- max_val %>%
rbind(min_val) %>%
rbind(revo_df)
rownames(combined_df_attrited) <- 1:nrow(combined_df_attrited)
## Warning: Setting row names on a tibble is deprecated.
revo_df <- df %>%
select(attrition_flag, income_category, total_revolving_bal) %>%
filter(attrition_flag == "Attrited Customer") %>%
group_by(income_category) %>%
summarise(
avg_rev = mean(total_revolving_bal, na.rm = TRUE)
) %>%
ungroup() %>%
pivot_wider(names_from = income_category, values_from = c(avg_rev))
# Customers that remained
max_val_existing <- df %>%
select(attrition_flag, income_category, total_revolving_bal) %>%
filter(attrition_flag == "Attrited Customer") %>%
group_by(income_category) %>%
summarise(
max_val = 1700
) %>% ungroup() %>%
pivot_wider(names_from = income_category, values_from = c(max_val))
min_val_existing <- df %>%
select(attrition_flag, income_category, total_revolving_bal) %>%
filter(attrition_flag == "Existing Customer") %>%
group_by(income_category) %>%
summarise(
min_val = min(total_revolving_bal)
) %>% ungroup() %>%
pivot_wider(names_from = income_category, values_from = c(min_val))
revo_df_existing <- df %>%
select(attrition_flag, income_category, total_revolving_bal) %>%
filter(attrition_flag == "Existing Customer") %>%
group_by(income_category) %>%
summarise(
avg_rev = mean(total_revolving_bal, na.rm = TRUE)
) %>%
ungroup() %>%
pivot_wider(names_from = income_category, values_from = c(avg_rev))
combined_df_existing <- max_val_existing %>%
rbind(min_val_existing) %>%
rbind(revo_df_existing)
combined_df_attrited <- max_val %>%
rbind(min_val) %>%
rbind(revo_df)
rownames(combined_df_attrited) <- 1:nrow(combined_df_attrited)
## Warning: Setting row names on a tibble is deprecated.
create_beautiful_radarchart <- function(data, color = "#00AFBB",
vlabels = colnames(data), vlcex = 0.7,
caxislabels = NULL, title = NULL, ...){
radarchart(
data, axistype = 1,
# Customize the polygon
pcol = color, pfcol = scales::alpha(color, 0.5), plwd = 2, plty = 1,
# Customize the grid
cglcol = "grey", cglty = 1, cglwd = 0.8,
# Customize the axis
axislabcol = "grey",
# Variable labels
vlcex = vlcex, vlabels = vlabels,
caxislabels = caxislabels, title = title, ...
)
}
op <- par(mar = c(1, 2, 2, 1))
attrited_plot <- create_beautiful_radarchart(combined_df_attrited, caxislabels = c(0,300 , 600, 900, 1200), title = "Attrited Customer", color="#F92923")
existing_plot <- create_beautiful_radarchart(combined_df_existing, caxislabels = c(0, 400, 800, 1200, 1600), title = "Existing Customer", color="#23F993")
In this section we will explore possible combinations of groups of people that leave the organization. Through this analysis we could quickly visualize clusters that are more likely to leave the organization
sample_data <- df %>%
select(attrition_flag, education_level, gender, income_category) %>%
filter(attrition_flag == "Attrited Customer") %>%
count(attrition_flag, education_level, gender, income_category) %>%
mutate(
interesting_group = ifelse(gender == "F" & education_level == "Graduate" &
income_category == "Less than $40K", "Interesting", "Not Interesting")
)
sample_data %>%
ggplot(aes(y = n,
axis1 = gender, axis2 = education_level, axis3 = income_category)) +
geom_alluvium(aes(fill = interesting_group), alpha = 0.4, absolute = FALSE) +
geom_stratum(absolute = FALSE, width = 0.45) +
geom_text(stat = "stratum", aes(label = after_stat(stratum)),
absolute = FALSE) +
scale_x_discrete(limits = c("Income Category", "Education Level", "Gender"), expand = c(.1, .05)) +
custom_theme + scale_fill_nejm() +
labs(
title = "Different Groups within Customer Attrition",
subtitle = "Attrited Customer Sample Population",
y = "Amount",
caption = "Female Graduates in the low income category makes a good portion of Attrited Customers"
)
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
Before getting into the AutoML phase, it will be interesting to explore which features are correlated with whether a customer would leave the organization.
correlation_prepared_tbl <- df %>% binarize()
# Plot correlation funnel
correlation_prepared_tbl %>%
correlate(target = attrition_flag__Attrited_Customer) %>%
plot_correlation_funnel(interactive = FALSE) + custom_theme
## Warning: ggrepel: 17 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps