# Import Libraries
# Core 

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(skimr)

library(vip)
## 
## Attaching package: 'vip'
## 
## The following object is masked from 'package:utils':
## 
##     vi
library(lime)
## 
## Attaching package: 'lime'
## 
## The following object is masked from 'package:dplyr':
## 
##     explain
# Visualization 

library(ggsci)
library(ggthemes)
library(ggpubr)
library(gghalves)
library(ggridges)
library(correlationfunnel)
## ══ Using correlationfunnel? ════════════════════════════════════════════════════
## You might also be interested in applied data science training for business.
## </> Learn more at - www.business-science.io </>
library(ggalluvial)
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom        1.0.5     ✔ rsample      1.2.0
## ✔ dials        1.2.0     ✔ tune         1.1.2
## ✔ infer        1.0.5     ✔ workflows    1.1.3
## ✔ modeldata    1.2.0     ✔ workflowsets 1.0.1
## ✔ parsnip      1.1.1     ✔ yardstick    1.2.0
## ✔ recipes      1.0.8     
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ lime::explain()   masks dplyr::explain()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Dig deeper into tidy modeling with R at https://www.tmwr.org
library(cowplot)
## 
## Attaching package: 'cowplot'
## 
## The following object is masked from 'package:ggpubr':
## 
##     get_legend
## 
## The following object is masked from 'package:ggthemes':
## 
##     theme_map
## 
## The following object is masked from 'package:lubridate':
## 
##     stamp
library(fmsb)

library(h2o)
## 
## ----------------------------------------------------------------------
## 
## Your next step is to start H2O:
##     > h2o.init()
## 
## For H2O package documentation, ask for help:
##     > ??h2o
## 
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
## 
## ----------------------------------------------------------------------
## 
## 
## Attaching package: 'h2o'
## 
## The following objects are masked from 'package:lubridate':
## 
##     day, hour, month, week, year
## 
## The following objects are masked from 'package:stats':
## 
##     cor, sd, var
## 
## The following objects are masked from 'package:base':
## 
##     &&, %*%, %in%, ||, apply, as.factor, as.numeric, colnames,
##     colnames<-, ifelse, is.character, is.factor, is.numeric, log,
##     log10, log1p, log2, round, signif, trunc

Import the data

suppressMessages(library(tidyverse))

df = read_csv("/Users/nguyenbuiminh/Desktop/coding_python/ml/time_series/BankChurners.csv")
## Rows: 10127 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (6): Attrition_Flag, Gender, Education_Level, Marital_Status, Income_Ca...
## dbl (17): CLIENTNUM, Customer_Age, Dependent_count, Months_on_book, Total_Re...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(df)
## Rows: 10,127
## Columns: 23
## $ CLIENTNUM                                                                                                                          <dbl> …
## $ Attrition_Flag                                                                                                                     <chr> …
## $ Customer_Age                                                                                                                       <dbl> …
## $ Gender                                                                                                                             <chr> …
## $ Dependent_count                                                                                                                    <dbl> …
## $ Education_Level                                                                                                                    <chr> …
## $ Marital_Status                                                                                                                     <chr> …
## $ Income_Category                                                                                                                    <chr> …
## $ Card_Category                                                                                                                      <chr> …
## $ Months_on_book                                                                                                                     <dbl> …
## $ Total_Relationship_Count                                                                                                           <dbl> …
## $ Months_Inactive_12_mon                                                                                                             <dbl> …
## $ Contacts_Count_12_mon                                                                                                              <dbl> …
## $ Credit_Limit                                                                                                                       <dbl> …
## $ Total_Revolving_Bal                                                                                                                <dbl> …
## $ Avg_Open_To_Buy                                                                                                                    <dbl> …
## $ Total_Amt_Chng_Q4_Q1                                                                                                               <dbl> …
## $ Total_Trans_Amt                                                                                                                    <dbl> …
## $ Total_Trans_Ct                                                                                                                     <dbl> …
## $ Total_Ct_Chng_Q4_Q1                                                                                                                <dbl> …
## $ Avg_Utilization_Ratio                                                                                                              <dbl> …
## $ Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1 <dbl> …
## $ Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2 <dbl> …
df <- df %>% 
  select(-c(CLIENTNUM, 
            Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1,
            Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2)) %>% 
  set_names(names(.) %>% str_to_lower())

Exploratory Data Analysis (EDA)

Create theme parameters

custom_theme <- theme_bw() + 
      theme(plot.title = element_text(face = "bold", color = "black", size=14),
        plot.subtitle = element_text(face = "italic", color = "black", size=12),
        axis.text = element_text(color = "black"), legend.text = element_text(size=10),
        legend.title = element_text(size = 12), legend.position = "none",
        strip.background =element_rect(fill="#666666"), strip.text = element_text(color="white", face="bold"),
        plot.caption = element_text(face = "italic"))

Summarization

We will use the skimr package to gather some brief information on our dataset

  • No missing values
  • Average Credit Limit: The average credit limit is 8,631.95\(, while the median is 4,549\) which cound indicate that there are some outliers in our dataset due to this wide discrepancy between the mean and the median
  • Total Revolving Balance: the revolving balance has no specific distribution and the average revolving balance lies in the 1000$

Note: There are other features to explore but I kept them into these three as we will further analyze this later in the data exploration section

skim(df)
Data summary
Name df
Number of rows 10127
Number of columns 20
_______________________
Column type frequency:
character 6
numeric 14
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
attrition_flag 0 1 17 17 0 2 0
gender 0 1 1 1 0 2 0
education_level 0 1 7 13 0 7 0
marital_status 0 1 6 8 0 4 0
income_category 0 1 7 14 0 6 0
card_category 0 1 4 8 0 4 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
customer_age 0 1 46.33 8.02 26.0 41.00 46.00 52.00 73.00 ▂▆▇▃▁
dependent_count 0 1 2.35 1.30 0.0 1.00 2.00 3.00 5.00 ▇▇▇▅▁
months_on_book 0 1 35.93 7.99 13.0 31.00 36.00 40.00 56.00 ▁▃▇▃▂
total_relationship_count 0 1 3.81 1.55 1.0 3.00 4.00 5.00 6.00 ▇▇▆▆▆
months_inactive_12_mon 0 1 2.34 1.01 0.0 2.00 2.00 3.00 6.00 ▅▇▇▁▁
contacts_count_12_mon 0 1 2.46 1.11 0.0 2.00 2.00 3.00 6.00 ▅▇▇▃▁
credit_limit 0 1 8631.95 9088.78 1438.3 2555.00 4549.00 11067.50 34516.00 ▇▂▁▁▁
total_revolving_bal 0 1 1162.81 814.99 0.0 359.00 1276.00 1784.00 2517.00 ▇▅▇▇▅
avg_open_to_buy 0 1 7469.14 9090.69 3.0 1324.50 3474.00 9859.00 34516.00 ▇▂▁▁▁
total_amt_chng_q4_q1 0 1 0.76 0.22 0.0 0.63 0.74 0.86 3.40 ▅▇▁▁▁
total_trans_amt 0 1 4404.09 3397.13 510.0 2155.50 3899.00 4741.00 18484.00 ▇▅▁▁▁
total_trans_ct 0 1 64.86 23.47 10.0 45.00 67.00 81.00 139.00 ▂▅▇▂▁
total_ct_chng_q4_q1 0 1 0.71 0.24 0.0 0.58 0.70 0.82 3.71 ▇▆▁▁▁
avg_utilization_ratio 0 1 0.27 0.28 0.0 0.02 0.18 0.50 1.00 ▇▂▂▂▁
#Imbalanced datasets, 16% of customers have Attributed

df %>% 
  select_if(is.character) %>% 
  map(~ table(.) %>% prop.table())
## $attrition_flag
## .
## Attrited Customer Existing Customer 
##         0.1606596         0.8393404 
## 
## $gender
## .
##         F         M 
## 0.5290807 0.4709193 
## 
## $education_level
## .
##       College     Doctorate      Graduate   High School Post-Graduate 
##    0.10002962    0.04453441    0.30887726    0.19877555    0.05095290 
##    Uneducated       Unknown 
##    0.14683519    0.14999506 
## 
## $marital_status
## .
##   Divorced    Married     Single    Unknown 
## 0.07386195 0.46282216 0.38935519 0.07396070 
## 
## $income_category
## .
##        $120K +    $40K - $60K    $60K - $80K   $80K - $120K Less than $40K 
##     0.07178829     0.17675521     0.13844179     0.15157500     0.35163425 
##        Unknown 
##     0.10980547 
## 
## $card_category
## .
##        Blue        Gold    Platinum      Silver 
## 0.931766565 0.011454528 0.001974919 0.054803989

Income Category

Comparing Attrition Levels

#Summary - High Attrition level with High income category? We see that the higest income category has a high %, compared to the other income categories.

# Step 2: Income Category 

# Assign the level order
level_order <- c("$120K +", "Less than $40K", "$40K - $60K", "$80K - $120K", "$40K - $60K", "$60K - $80K", "Unknown")

# a. Find % of customers who are about to leave

income_tbl <- df %>%
  select(attrition_flag, income_category) %>%
  count(attrition_flag, income_category) %>%
  group_by(income_category) %>%
  mutate(pct = n/sum(n)) %>%
  ungroup() %>%
  arrange(desc(pct)) %>%
    mutate(
    pct_txt = scales::percent(pct),
    income_category = income_category %>% factor(levels = c("$120K +", "Less than $40K", "$40K - $60K", "$80K - $120K", "$60K - $80K", "Unknown")) %>% fct_rev(),
    income_category = income_category %>% as_factor() %>% fct_reorder(pct),
    attrition_flag = attrition_flag %>% as_factor() %>% fct_rev()
  ) 

income_tbl
## # A tibble: 12 × 5
##    attrition_flag    income_category     n   pct pct_txt
##    <fct>             <fct>           <int> <dbl> <chr>  
##  1 Existing Customer $60K - $80K      1213 0.865 86.52% 
##  2 Existing Customer $40K - $60K      1519 0.849 84.86% 
##  3 Existing Customer $80K - $120K     1293 0.842 84.23% 
##  4 Existing Customer Unknown           925 0.832 83.18% 
##  5 Existing Customer Less than $40K   2949 0.828 82.81% 
##  6 Existing Customer $120K +           601 0.827 82.67% 
##  7 Attrited Customer $120K +           126 0.173 17.33% 
##  8 Attrited Customer Less than $40K    612 0.172 17.19% 
##  9 Attrited Customer Unknown           187 0.168 16.82% 
## 10 Attrited Customer $80K - $120K      242 0.158 15.77% 
## 11 Attrited Customer $40K - $60K       271 0.151 15.14% 
## 12 Attrited Customer $60K - $80K       189 0.135 13.48%
income_tbl %>%
  ggplot(aes(x=income_category, y = pct, color = attrition_flag)) + 
  geom_segment(aes(yend = 0, xend = income_category), size = 1) +
  geom_point() +
  geom_label(aes(label = pct_txt), hjust = "inward", size = 3) + 
  coord_flip() +
  facet_wrap(~ attrition_flag) + 
  custom_theme + 
  theme(legend.position = "none") + 
  scale_colour_nejm() + 
  labs(
    title= "Level of Attrition",
    subtitle = "by Income Category",
    x = "Income Category",
    y = "percentage of Attrition"
  )
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Influence of the Education Level

Summary: - Highest customer base: Interesting enough we see that the main customer base has a graduate program. Probably meaning that our main customer base are clients at a young age - Highest attrition rate: We see that the segment group of Graduate and highschool have the highest attrition rate

specify_decimal <- function(x, k) trimws(format(round(x, k), nsmall=k))

# From which income and edicational categories are most attributed customers coming from?

# Compare attrition levels by income_category and education level 

income_education_pct_tbl <- df %>%
  count(attrition_flag, income_category, education_level) %>%
  group_by(attrition_flag, income_category, education_level) %>%
  summarise(total = sum(n)) %>%
  ungroup() %>%
  group_by(income_category) %>% 
  mutate(pct = total / sum(total)) %>%
  ungroup() %>%
  mutate(
    income_category = income_category %>% factor(levels = c("$120K +", "Less than $40K", "$40K - $60K", "$80K - $120K", "$60K - $80K", "Unknown")) %>% fct_rev()
  ) %>%
  mutate(
    pct = as.numeric(specify_decimal(pct, 4)),
    pct_txt = str_glue("{pct*100}%")
  )
## `summarise()` has grouped output by 'attrition_flag', 'income_category'. You
## can override using the `.groups` argument.
# Plot 

income_education_pct_tbl %>%
  ggplot(aes(x=education_level, y = income_category)) +
  geom_tile(aes(fill =pct)) + 
  scale_fill_gradient2(low = "#6F99ADFF", mid = "white", high = "#BC3C29FF") + facet_wrap(~ attrition_flag, scales = "free_x") + custom_theme + 
  theme(legend.position = "none",
        axis.text.x = element_text(angle = 45, hjust = 1),
        plot.caption = element_text(face = "italic")) + 
  geom_text(aes(label = pct_txt), size=2.5) + 
  labs(
    title = "Heatmap Customer Attrition",
    caption = "Highly educated and mid to high income earners have a great % of Attrition",
    x = "Education Level", 
    y = "Income Category"
  )

Revolving Balance vs Credit Limit

Lets make a confirmation we saw the group that had the “second” highest attrition rate flag was the Less than 40k Income category group. In this case, I wanted to explore the lowest category to see what was the level of revolving balance to this group.

Summary: - Low levels of credit limit: The credit limit for the low income group was low for those attributed customer - High revolving balance: this group had a high revolving balance, which could be a reason why the credit limit for them was low - Not enough credit limit for certain groups: We see that there were individuals whi attributed who did not have any revolving balance but had low credit limits. Could this be a n explnation for attrition.

# Revolving balance by income category 

# Correlation in credit  limit and revolving balance 

lowest_income_cat_tbl <- df %>%
  filter(income_category == "Less than $40K" & attrition_flag == "Attrited Customer") %>%
  summarise(
    correlation = scales::percent(cor(credit_limit, total_revolving_bal))
  ) 



df %>% 
  filter(attrition_flag == "Attrited Customer") %>% 
  ggplot(aes(x=credit_limit, y=total_revolving_bal, color=income_category)) + geom_point(alpha = 0.3) +
  facet_wrap(~ income_category, scales = "free_x") + geom_smooth(method = "lm", color="red") + 
  custom_theme + scale_colour_nejm() + 
  labs(
    title = "Correlation of Credit Limit and Revolving Balance",
    caption = str_glue("Correlation of lowest income category: {lowest_income_cat_tbl}"),
    x = "Credit Limit",
    y = "Total Revolving Balance"
  )
## `geom_smooth()` using formula = 'y ~ x'

Activity Levels Metrics

The lower income category has a slightly larger utilization ratio as opposed to the other income categories. Nevertheless, this is not a significant amount but it could indicate that the lower income category has a slightly level of activity. This will be good to understand especially when we get into the phase of feature importance of our model.

income_metrics <- df %>%
  select(income_category, total_trans_ct, avg_utilization_ratio, total_trans_amt) %>%
  group_by(income_category) %>%
  summarise(
    avg_trans_ct = round(mean(total_trans_ct, na.rm = TRUE), 2),
    avg_utilization_ratio = round(mean(avg_utilization_ratio, na.rm = TRUE), 2)
  ) %>%
  ungroup()

prep_tbl <- income_metrics %>%
  pivot_longer(2:3) %>%
  mutate(
    income_category = as.factor(income_category) %>% fct_reorder(value)
  )


prep_tbl %>%
  ggplot(aes(x=income_category, y=value, fill=name)) + 
  geom_col(color = "black") +
  facet_wrap(~name, scales = "free_x") +
  coord_flip() + 
  custom_theme +
  scale_fill_nejm() +
  geom_label(aes(label = value), color = "white", hjust = 1.2) +
  labs(
    title = "Metrics by Income Category",
    subtitle = "Getting depper into levels of activity",
    x = "Value",
    y = "Income Category"
  )

Card Category

The purpose of exploring the card category is to see if our clients are satisfied with the product offerings the institution offers. What are the different type of credit card the credit institution offers - Blue Card - Gold Card - Silver Card - Platinum Card

Attrition Rates

  • Levels of attrition: The level highest percentage of attrition are coming from platinum and gold card users
  • Lowest levels of attrition: The lowest levels of attrition are coming from silver and blue cards
card_category_attrition_tbl <- df %>% 
  select(attrition_flag, card_category) %>% 
  count(attrition_flag, card_category) %>% 
  group_by(card_category) %>% 
  mutate(pct = n / sum(n)) %>% 
  ungroup() %>% 
  arrange(desc(pct)) %>% 
    mutate(
    pct_txt = scales::percent(pct),
    card_category = card_category %>% as_factor() %>% fct_reorder(pct)
    )



card_category_attrition_tbl %>% 
  ggplot(aes(x = pct, y = card_category,  fill = attrition_flag)) +
  geom_col(position = "dodge", width = 0.5, color = "black") +
  facet_wrap(~ attrition_flag, scales = "free_x") + custom_theme + 
  scale_fill_nejm() +
  scale_x_continuous(labels = scales::percent) + 
  geom_label(aes(label = pct_txt), hjust = "inward", color = "white") +
  labs(
    title = "Customer Status by Income Cateogry",
    x = "Attrition Rate",
    y = "Card Category",
    caption = "Platinum cards have the highest attrition rates"
  )

Total Transactions

  • Total transactions much lower for platinum: This can indicate 2 things from the customers leaving in this group, either they dont have anough credit limit or they belong to a lower category. You can check that the median of the distribution for the platinum category for the total transactions is much lower than the customers that stay
  • Gold card group: For the gold card group we dont see any major discrepancies
  • Different clusters in our distribution: Note that we see different clusters in some of our card group distributions, this could mean that these are clusters of income level which could be correlated to levels of transactions.
# Distribution of utilization ratio by type of card 

card_utilization_tbl <- df %>%
  select(attrition_flag, card_category, avg_utilization_ratio, total_trans_amt) 

card_utilization_tbl %>%
  ggplot(aes(x=card_category, y=total_trans_amt, fill = card_category)) + 
  geom_jitter(aes(color=card_category), 
              alpha = 0.5, 
              size =0.5,
              show_legend = FALSE ) + 
  geom_half_violin(aes(fill = card_category), 
                   alpha = 0.45,
                   side = "l",
                   show.legend = FALSE,
                   trim = FALSE) +
  geom_half_boxplot(aes(fill = card_category),
                    side = "r",
                    outlier.size = 1,
                    outlier.color = "red",
                    width = 0.4,
                    alpha = 0.2,
                    show.legend = FALSE) + 
  stat_summary(fun = median, geom ="line") +
  facet_wrap(~ attrition_flag) + custom_theme + 
    scale_fill_manual(values = c("#1C366B", "#FCD16B", "#C7CEF6", "#D3DDDC")) + 
  scale_color_manual(values = c("#1C366B", "#FCD16B", "#C7CEF6", "#D3DDDC")) + 
  scale_y_continuous(labels = scales::dollar) +
  labs(
    title = "Distribution of Transactions by Type of Card",
    x = "Card Category",
    y = "Total Transactions"
  )
## Warning in geom_jitter(aes(color = card_category), alpha = 0.5, size = 0.5, :
## Ignoring unknown parameters: `show_legend`
## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?
## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?

Attrition Flags

Level of Activity

Lets analyze the level of inactivity for those customers that decided or that will leave the organization and those customers

  • Attrited Customers: We see that the level of interactivity has a median of 3 months, 1 month higher than those who still stay with the org
  • Understanding this distribution: This could give us an indication that when the level of inactivity starts getting “beyond” the 2 months threshold, then 3 is higher chance that the person will decide to leave the org
  • Possible alternative: Contact the client to see if they are satisfied with the service and what could the org do to improve as a whole
attrition_tbl <- df %>% 
  select(attrition_flag, months_inactive_12_mon, total_revolving_bal, total_trans_amt, credit_limit)

#Levels of Inactivity by attrition flag

p1 <- attrition_tbl %>%
  ggplot(aes(x = months_inactive_12_mon, y = attrition_flag, fill = attrition_flag)) + 
  stat_density_ridges(quantile_lines = TRUE, quantiles = 2, alpha = 0.4) +
  scale_fill_nejm() + custom_theme + 
  labs(title = "Levels of Inactivity",
       x = "Inactive Months",
       y = "Attrition FLag",
       caption = "Attrition Customers have higher levels of Inactivity (3 months vs 2 months Median)"
       )

p1
## Picking joint bandwidth of 0.151
## Warning: Using the `size` aesthetic with geom_segment was deprecated in ggplot2 3.4.0.
## ℹ Please use the `linewidth` aesthetic instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Levels of Revolving Balance

We can clearly see that the customers that had a lower revolving balance were the most likely to leave. This could indicate that those customers could have left because they found other lower interest rates to other companies that offered better products at lower rates. In the case of existing customers on average they had a much higher revolving balance but customers with higher revolving balance have a much harder time to look for other financial institution to offer lower rates due to the risk the customer will not pay at all what is left of the remaining balance.

# Radar Chart for attrited customers

max_val <- df %>% 
  select(attrition_flag, income_category, total_revolving_bal) %>% 
  filter(attrition_flag == "Attrited Customer") %>% 
  group_by(income_category) %>% 
  summarise(
    max_val = 1000
  ) %>% ungroup() %>% 
  pivot_wider(names_from = income_category, values_from = c(max_val))


min_val <- df %>% 
  select(attrition_flag, income_category, total_revolving_bal) %>% 
  filter(attrition_flag == "Attrited Customer") %>% 
  group_by(income_category) %>% 
  summarise(
    min_val = min(total_revolving_bal)
  ) %>% ungroup() %>% 
  pivot_wider(names_from = income_category, values_from = c(min_val))



revo_df <- df %>% 
  select(attrition_flag, income_category, total_revolving_bal) %>% 
  filter(attrition_flag == "Attrited Customer") %>% 
  group_by(income_category) %>% 
  summarise(
    avg_rev = mean(total_revolving_bal, na.rm = TRUE)
  ) %>% 
  ungroup() %>% 
  pivot_wider(names_from = income_category, values_from = c(avg_rev))



combined_df_attrited <- max_val %>% 
  rbind(min_val) %>% 
  rbind(revo_df)


rownames(combined_df_attrited) <- 1:nrow(combined_df_attrited) 
## Warning: Setting row names on a tibble is deprecated.
revo_df <- df %>% 
  select(attrition_flag, income_category, total_revolving_bal) %>% 
  filter(attrition_flag == "Attrited Customer") %>% 
  group_by(income_category) %>% 
  summarise(
    avg_rev = mean(total_revolving_bal, na.rm = TRUE)
  ) %>% 
  ungroup() %>% 
  pivot_wider(names_from = income_category, values_from = c(avg_rev))



# Customers that remained

max_val_existing <- df %>% 
  select(attrition_flag, income_category, total_revolving_bal) %>% 
  filter(attrition_flag == "Attrited Customer") %>% 
  group_by(income_category) %>% 
  summarise(
    max_val = 1700
  ) %>% ungroup() %>% 
  pivot_wider(names_from = income_category, values_from = c(max_val))


min_val_existing <- df %>% 
  select(attrition_flag, income_category, total_revolving_bal) %>% 
  filter(attrition_flag == "Existing Customer") %>% 
  group_by(income_category) %>% 
  summarise(
    min_val = min(total_revolving_bal)
  ) %>% ungroup() %>% 
  pivot_wider(names_from = income_category, values_from = c(min_val))



revo_df_existing <- df %>% 
  select(attrition_flag, income_category, total_revolving_bal) %>% 
  filter(attrition_flag == "Existing Customer") %>% 
  group_by(income_category) %>% 
  summarise(
    avg_rev = mean(total_revolving_bal, na.rm = TRUE)
  ) %>% 
  ungroup() %>% 
  pivot_wider(names_from = income_category, values_from = c(avg_rev))


combined_df_existing <- max_val_existing %>% 
  rbind(min_val_existing) %>% 
  rbind(revo_df_existing)


combined_df_attrited <- max_val %>% 
  rbind(min_val) %>% 
  rbind(revo_df)


rownames(combined_df_attrited) <- 1:nrow(combined_df_attrited) 
## Warning: Setting row names on a tibble is deprecated.
create_beautiful_radarchart <- function(data, color = "#00AFBB", 
                                        vlabels = colnames(data), vlcex = 0.7,
                                        caxislabels = NULL, title = NULL, ...){
  radarchart(
    data, axistype = 1,
    # Customize the polygon
    pcol = color, pfcol = scales::alpha(color, 0.5), plwd = 2, plty = 1,
    # Customize the grid
    cglcol = "grey", cglty = 1, cglwd = 0.8,
    # Customize the axis
    axislabcol = "grey", 
    # Variable labels
    vlcex = vlcex, vlabels = vlabels,
    caxislabels = caxislabels, title = title, ...
  )
}




op <- par(mar = c(1, 2, 2, 1))
attrited_plot <- create_beautiful_radarchart(combined_df_attrited, caxislabels = c(0,300 , 600, 900, 1200), title = "Attrited Customer", color="#F92923")

existing_plot <- create_beautiful_radarchart(combined_df_existing, caxislabels = c(0, 400, 800, 1200, 1600), title = "Existing Customer", color="#23F993")

Customer Attrition Groups

In this section we will explore possible combinations of groups of people that leave the organization. Through this analysis we could quickly visualize clusters that are more likely to leave the organization

  • Low Income Female Graduates: This cluster group composes the highest amount of people who of attrited customers.
sample_data <- df %>% 
  select(attrition_flag, education_level, gender, income_category) %>% 
  filter(attrition_flag == "Attrited Customer") %>% 
  count(attrition_flag, education_level, gender, income_category) %>% 
  mutate(
    interesting_group = ifelse(gender == "F" & education_level == "Graduate" &
                                 income_category == "Less than $40K", "Interesting", "Not Interesting")
  )



sample_data %>% 
  ggplot(aes(y = n,
             axis1 = gender, axis2 = education_level, axis3 = income_category)) +
  geom_alluvium(aes(fill = interesting_group), alpha = 0.4,  absolute = FALSE) + 
  geom_stratum(absolute = FALSE, width = 0.45) + 
    geom_text(stat = "stratum", aes(label = after_stat(stratum)),
            absolute = FALSE) + 
  scale_x_discrete(limits = c("Income Category", "Education Level", "Gender"), expand = c(.1, .05)) + 
  custom_theme + scale_fill_nejm() + 
  labs(
    title = "Different Groups within Customer Attrition",
    subtitle = "Attrited Customer Sample Population",
    y = "Amount",
    caption = "Female Graduates in the low income category makes a good portion of Attrited Customers"
  )
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.

## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.

## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.

Correlation Analysis

Before getting into the AutoML phase, it will be interesting to explore which features are correlated with whether a customer would leave the organization.

  • Low revolving balance: People that usually pay their credit cards on time are more likely to leave the organization
  • Low levels of inactivity: If the transaction count is less than 0.582, it is most likely the customer will leave. Same for total transaction count if it is less than 45 for a specific client then the client is more likely to leave the organization
  • Low transaction amount: TH e lower the transaction amounts the higher the correlation with attrition
correlation_prepared_tbl <- df %>% binarize()


# Plot correlation funnel
correlation_prepared_tbl %>% 
  correlate(target = attrition_flag__Attrited_Customer) %>% 
    plot_correlation_funnel(interactive = FALSE) + custom_theme 
## Warning: ggrepel: 17 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps