setwd("C:/Users/DellPC/Desktop/Corner/R_source_code/Julia_Silge")

#Manipulation
library(tidyverse)

#Create models
library(tidymodels)

#Feature importance
library(vip)
library(lime)

#Visualization
library(RColorBrewer)
library(scales)
library(DT)
library(ggsci)
library(ggthemes)
library(ggpubr)
library(gghalves)
library(ggridges)
library(correlationfunnel)
library(ggalluvial)
library(tidymodels)
library(cowplot)
library(fmsb)

#Auto ML
library(h2o)


df <- read.csv("bank_churner.csv")

df <- df %>% 
  select(-c(CLIENTNUM, 
            Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1,
            Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2)) %>% 
               set_names(names(.) %>%
                           str_to_lower())

1 Overal Explroe

naniar::vis_miss(df)

DataExplorer::plot_missing(df)

DataExplorer::plot_intro(df) + theme_light()

glimpse(df)
## Rows: 10,127
## Columns: 20
## $ attrition_flag           <chr> "Existing Customer", "Existing Customer", "Ex~
## $ customer_age             <int> 45, 49, 51, 40, 40, 44, 51, 32, 37, 48, 42, 6~
## $ gender                   <chr> "M", "F", "M", "F", "M", "M", "M", "M", "M", ~
## $ dependent_count          <int> 3, 5, 3, 4, 3, 2, 4, 0, 3, 2, 5, 1, 1, 3, 2, ~
## $ education_level          <chr> "High School", "Graduate", "Graduate", "High ~
## $ marital_status           <chr> "Married", "Single", "Married", "Unknown", "M~
## $ income_category          <chr> "$60K - $80K", "Less than $40K", "$80K - $120~
## $ card_category            <chr> "Blue", "Blue", "Blue", "Blue", "Blue", "Blue~
## $ months_on_book           <int> 39, 44, 36, 34, 21, 36, 46, 27, 36, 36, 31, 5~
## $ total_relationship_count <int> 5, 6, 4, 3, 5, 3, 6, 2, 5, 6, 5, 6, 3, 5, 5, ~
## $ months_inactive_12_mon   <int> 1, 1, 1, 4, 1, 1, 1, 2, 2, 3, 3, 2, 6, 1, 2, ~
## $ contacts_count_12_mon    <int> 3, 2, 0, 1, 0, 2, 3, 2, 0, 3, 2, 3, 0, 3, 2, ~
## $ credit_limit             <dbl> 12691.0, 8256.0, 3418.0, 3313.0, 4716.0, 4010~
## $ total_revolving_bal      <int> 777, 864, 0, 2517, 0, 1247, 2264, 1396, 2517,~
## $ avg_open_to_buy          <dbl> 11914.0, 7392.0, 3418.0, 796.0, 4716.0, 2763.~
## $ total_amt_chng_q4_q1     <dbl> 1.335, 1.541, 2.594, 1.405, 2.175, 1.376, 1.9~
## $ total_trans_amt          <int> 1144, 1291, 1887, 1171, 816, 1088, 1330, 1538~
## $ total_trans_ct           <int> 42, 33, 20, 20, 28, 24, 31, 36, 24, 32, 42, 2~
## $ total_ct_chng_q4_q1      <dbl> 1.625, 3.714, 2.333, 2.333, 2.500, 0.846, 0.7~
## $ avg_utilization_ratio    <dbl> 0.061, 0.105, 0.000, 0.760, 0.000, 0.311, 0.0~
skimr::skim(df)
Data summary
Name df
Number of rows 10127
Number of columns 20
_______________________
Column type frequency:
character 6
numeric 14
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
attrition_flag 0 1 17 17 0 2 0
gender 0 1 1 1 0 2 0
education_level 0 1 7 13 0 7 0
marital_status 0 1 6 8 0 4 0
income_category 0 1 7 14 0 6 0
card_category 0 1 4 8 0 4 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
customer_age 0 1 46.33 8.02 26.0 41.00 46.00 52.00 73.00 ▂▆▇▃▁
dependent_count 0 1 2.35 1.30 0.0 1.00 2.00 3.00 5.00 ▇▇▇▅▁
months_on_book 0 1 35.93 7.99 13.0 31.00 36.00 40.00 56.00 ▁▃▇▃▂
total_relationship_count 0 1 3.81 1.55 1.0 3.00 4.00 5.00 6.00 ▇▇▆▆▆
months_inactive_12_mon 0 1 2.34 1.01 0.0 2.00 2.00 3.00 6.00 ▅▇▇▁▁
contacts_count_12_mon 0 1 2.46 1.11 0.0 2.00 2.00 3.00 6.00 ▅▇▇▃▁
credit_limit 0 1 8631.95 9088.78 1438.3 2555.00 4549.00 11067.50 34516.00 ▇▂▁▁▁
total_revolving_bal 0 1 1162.81 814.99 0.0 359.00 1276.00 1784.00 2517.00 ▇▅▇▇▅
avg_open_to_buy 0 1 7469.14 9090.69 3.0 1324.50 3474.00 9859.00 34516.00 ▇▂▁▁▁
total_amt_chng_q4_q1 0 1 0.76 0.22 0.0 0.63 0.74 0.86 3.40 ▅▇▁▁▁
total_trans_amt 0 1 4404.09 3397.13 510.0 2155.50 3899.00 4741.00 18484.00 ▇▅▁▁▁
total_trans_ct 0 1 64.86 23.47 10.0 45.00 67.00 81.00 139.00 ▂▅▇▂▁
total_ct_chng_q4_q1 0 1 0.71 0.24 0.0 0.58 0.70 0.82 3.71 ▇▆▁▁▁
avg_utilization_ratio 0 1 0.27 0.28 0.0 0.02 0.18 0.50 1.00 ▇▂▂▂▁
DT::datatable(df %>% sample_n(100))

2 Business Problem

The goal of business problem is to better understand what attributes(features) are making our clients to leave the organization. Moreover, in this business problem we have three main tasks that we will perform to come up with potential solutions and conclusion:

3 Introduction

Many organizations eventually will face a situation in which a customer decides to leave the organization. Nevertheless, questions come to pop up especially when it comes to the reasons that customer decided to leave the company. Definitely there are reasons that are not consistent with other customers that have left an organization but what if there is a pattern as to the reason customers decided to leave the organization? If the company would be able to detect the major reasons why customers are leaving the company, the company would be able to react and prevent customers from leaving. Furthermore, if the company is able to understand from the past, the organization will be able to prevent further customer attritions in the future

Note: Before proceeding further, please the notebook if you find it useful to the community and of course if you like it. Please comment if you have a constructive criticism on the comment section.

Feature Definitions:

I will keep the feature definitions to the most important however, if I see the need to analyze further any other features I will add them to the list.

ul>
  • Attrition Flag: This is our target variable, means whether our customer decided to leave the organization or that there is a high probability the customer will leave.
  • Gender: Male or Female
  • Customer age: Age of the customer
  • Income category: To which income category does the customer belongs to.
  • Card category: Which card category does the customer have?
  • Months Inactive: Amount of inactivity when using the credit card.
  • Credit Limit: Credit Limit the customer currently has.
  • Total Revolving Balance: The unpaid portion that carries over to the next month when a customer does not pay.
  • Average Utilization Ratio: Measures how much credit you are using compared to how much you have available.
  • Open to buy: The amount of credit available at a given time on a credit card holders account. Thus, the average open to buy is the average credit available allocated to a specific customer.
  • 3.1 Import Libraries

    Here are the main libraries and how they will help us solve this business problem.

    • GGplot: Visualization library that will help us identify patterns through exploratory data analysis.
    • LIME: Help us understand how our ML models are making decisions and also understand what features have the biggest impact on the decision making of our model.
    • H2o: AutoML library that we will use in order to decide which model will perform the best when anticipating which customer would decide to leave the organization.

    4 Exploratory Data Analysis (EDA)

    4.1 Create theme parameters

    custom_theme <- theme_bw() + 
          theme(plot.title = element_text(face = "bold", color = "black", size=14),
            plot.subtitle = element_text(face = "italic", color = "black", size=12),
            axis.text = element_text(color = "black"), legend.text = element_text(size=10),
            legend.title = element_text(size = 12), legend.position = "none",
            strip.background =element_rect(fill="#666666"), strip.text = element_text(color="white", face="bold"),
            plot.caption = element_text(face = "italic"))

    4.2 Summarization

    We will use the skimr package to gather some brief information on our dataset.

    • No Missing values: There are no missing values in our dataset (we are lucky, because no imputation method has to be implemented), which we do use when having lots of missing values.
    • Average Credit Limit: The average credit limit is 8,631.95 USD, while the median is 4,549 USD which could indicate that there are some outliers in our dataset due to this wide discrepancy between the mean and the median.
    • Total Revolving Balance: The revolving balance has no specific distribution and the average revolving balance lies in the 1,000 USD.

    Note:
    There are other features to explore but I kept them into these three as we will further analyze this later in the data exploration section.

    # Some of the features can be categorical
    
    skimr::skim(df)
    Data summary
    Name df
    Number of rows 10127
    Number of columns 20
    _______________________
    Column type frequency:
    character 6
    numeric 14
    ________________________
    Group variables None

    Variable type: character

    skim_variable n_missing complete_rate min max empty n_unique whitespace
    attrition_flag 0 1 17 17 0 2 0
    gender 0 1 1 1 0 2 0
    education_level 0 1 7 13 0 7 0
    marital_status 0 1 6 8 0 4 0
    income_category 0 1 7 14 0 6 0
    card_category 0 1 4 8 0 4 0

    Variable type: numeric

    skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
    customer_age 0 1 46.33 8.02 26.0 41.00 46.00 52.00 73.00 ▂▆▇▃▁
    dependent_count 0 1 2.35 1.30 0.0 1.00 2.00 3.00 5.00 ▇▇▇▅▁
    months_on_book 0 1 35.93 7.99 13.0 31.00 36.00 40.00 56.00 ▁▃▇▃▂
    total_relationship_count 0 1 3.81 1.55 1.0 3.00 4.00 5.00 6.00 ▇▇▆▆▆
    months_inactive_12_mon 0 1 2.34 1.01 0.0 2.00 2.00 3.00 6.00 ▅▇▇▁▁
    contacts_count_12_mon 0 1 2.46 1.11 0.0 2.00 2.00 3.00 6.00 ▅▇▇▃▁
    credit_limit 0 1 8631.95 9088.78 1438.3 2555.00 4549.00 11067.50 34516.00 ▇▂▁▁▁
    total_revolving_bal 0 1 1162.81 814.99 0.0 359.00 1276.00 1784.00 2517.00 ▇▅▇▇▅
    avg_open_to_buy 0 1 7469.14 9090.69 3.0 1324.50 3474.00 9859.00 34516.00 ▇▂▁▁▁
    total_amt_chng_q4_q1 0 1 0.76 0.22 0.0 0.63 0.74 0.86 3.40 ▅▇▁▁▁
    total_trans_amt 0 1 4404.09 3397.13 510.0 2155.50 3899.00 4741.00 18484.00 ▇▅▁▁▁
    total_trans_ct 0 1 64.86 23.47 10.0 45.00 67.00 81.00 139.00 ▂▅▇▂▁
    total_ct_chng_q4_q1 0 1 0.71 0.24 0.0 0.58 0.70 0.82 3.71 ▇▆▁▁▁
    avg_utilization_ratio 0 1 0.27 0.28 0.0 0.02 0.18 0.50 1.00 ▇▂▂▂▁
    df %>% 
                   select_if(is.character) %>% 
                    map( ~ table(.) %>% prop.table())
    ## $attrition_flag
    ## .
    ## Attrited Customer Existing Customer 
    ##         0.1606596         0.8393404 
    ## 
    ## $gender
    ## .
    ##         F         M 
    ## 0.5290807 0.4709193 
    ## 
    ## $education_level
    ## .
    ##       College     Doctorate      Graduate   High School Post-Graduate 
    ##    0.10002962    0.04453441    0.30887726    0.19877555    0.05095290 
    ##    Uneducated       Unknown 
    ##    0.14683519    0.14999506 
    ## 
    ## $marital_status
    ## .
    ##   Divorced    Married     Single    Unknown 
    ## 0.07386195 0.46282216 0.38935519 0.07396070 
    ## 
    ## $income_category
    ## .
    ##        $120K +    $40K - $60K    $60K - $80K   $80K - $120K Less than $40K 
    ##     0.07178829     0.17675521     0.13844179     0.15157500     0.35163425 
    ##        Unknown 
    ##     0.10980547 
    ## 
    ## $card_category
    ## .
    ##        Blue        Gold    Platinum      Silver 
    ## 0.931766565 0.011454528 0.001974919 0.054803989

    4.3 Income Category

    4.3.1 Comparing Attribution Levels

    Summary

    • High Attrition level with High income category? We see that the highest income category has a high % level of attrition (although not by much), compared to the other income categories. Follow by the lowest income category.
    # Step 2: Income Category  ----
    
    # Assign the level order
    level_order <- c("$120K +", "Less than $40K", "$40K - $60K", "$80K - $120K", "$40K - $60K", "$60K - $80K", "Unknown")
    
    # a. Find % of customers who are about to leave
    
    income_tbl <- df %>% 
      select(attrition_flag, income_category) %>% 
      count(attrition_flag, income_category) %>% 
      group_by(income_category) %>% 
      mutate(pct = n / sum(n)) %>% 
      ungroup() %>% 
      arrange(desc(pct)) %>% 
      mutate(
        pct_txt = scales::percent(pct),
        income_category = income_category %>% factor(levels = c("$120K +", "Less than $40K", "$40K - $60K", "$80K - $120K", "$60K - $80K", "Unknown")) %>% fct_rev(),
        income_category = income_category %>% as_factor() %>% fct_reorder(pct),
        attrition_flag = attrition_flag %>% as_factor() %>% fct_rev()
      ) 
    
    
    # Plot Visualization
    
    income_tbl %>% 
      ggplot(aes(x=income_category, y=pct, color= attrition_flag)) + 
      geom_segment(aes(yend = 0, xend = income_category), size = 1) + 
      geom_point() + 
                   geom_label(aes(label = pct_txt), hjust = "inward",
                                size = 3) + 
                   coord_flip() +
      facet_wrap(~ attrition_flag)  + custom_theme + 
      theme(legend.position = "none") + 
      scale_colour_nejm() + 
      labs(
        title = "Levels of Attrition",
        subtitle = "by Income Category",
        x = "Income Category",
        y = "Percentage of Attrition"
      )

    4.3.2 Influence of the Education Level

    Summary:

    • Highest customer base: Interesting enough we see that the main customer base has a graduate program. Probably meaning that our main customer base are clients at a younger age.
    • Highest attrition rate: We see that the segment group of Graduate and Highschool have the highest attrition rate.
    # Reference function: Stackoverflow
    specify_decimal <- function(x, k) trimws(format(round(x, k), nsmall=k))
    
    
    # From which income and educational categories are most attrited customers coming from? 
    
    # Compare attrition levels by income_category and education level
    income_education_pct_tbl <- df %>% 
      count(attrition_flag, income_category, education_level) %>% 
      group_by(attrition_flag, income_category, education_level) %>% 
      summarise(total = sum(n)) %>% 
      ungroup() %>% 
      group_by(income_category) %>% 
      mutate(
        pct = total / sum(total)
      ) %>% ungroup() %>% 
      mutate(
        income_category = income_category %>% factor(levels = c("$120K +", "Less than $40K", "$40K - $60K", "$80K - $120K", "$60K - $80K", "Unknown")) %>% fct_rev()
      ) %>% 
      mutate(
        pct = as.numeric(specify_decimal(pct,4)),
        pct_txt = str_glue("{pct*100}%")
      )
      
    
    
    # Plot
    
    income_education_pct_tbl %>% 
      ggplot(aes(x=education_level, y = income_category)) + 
      geom_tile(aes(fill = pct)) + 
                   scale_fill_gradient2(low = "#6F99ADFF", mid = "white", high = "#BC3C29FF") + 
                   facet_wrap(~ attrition_flag, scales = "free_x") + custom_theme + 
      theme(legend.position = "none",
            axis.text.x = element_text(angle = 45, hjust = 1),
            plot.caption = element_text(face = "italic")) + 
      geom_text(aes(label = pct_txt), size=2.5) + 
      labs(
        title = "Heatmap Customer Attrition",
        caption = "Highly educated and mid to high income earners have a great % of Attrition",
        x = "Education Level", 
        y = "Income Category"
      )

    4.3.3 Revolving Balance vs Credit Limit

    Lets make a confirmation we saw that the group that had the “second” highest attrition rate flag was the Less than 40k income category group. In this case I wanted to explore the lowest income category to see what was the level of revolving balance to this group.

    Summary:

    • Low levels of credit limit: The credit limit for the low income group was low for those attrited customers.
    • High Revolving Balance: This group had a high revolving balance, which could be a reason why the credit limit for them was low.
    • Not enough credit limits for certain groups? We see that there were individuals who attrited who did not have any revolving balance but had low credit limits. Could this be an explanation for attrition?
    # Revolving balance by income category
    # Think of a chart.
    
    # Correlation in credit_limit and revolving_balance
    lowest_income_cat_tbl <- df %>% 
      filter(income_category == "Less than $40K" & attrition_flag == "Attrited Customer") %>% 
      summarise(
        correlation = scales::percent(cor(credit_limit, total_revolving_bal))
      )
    
    
    df %>% 
      filter(attrition_flag == "Attrited Customer") %>% 
      ggplot(aes(x=credit_limit, y=total_revolving_bal, color=income_category)) + geom_point(alpha = 0.3) +
      facet_wrap(~ income_category, scales = "free_x") + geom_smooth(method = "lm", color="red") + 
      custom_theme + scale_colour_nejm() + 
      labs(
        title = "Correlation of Credit Limit and Revolving Balance",
        caption = str_glue("Correlation of lowest income category: {lowest_income_cat_tbl}"),
        x = "Credit Limit",
        y = "Total Revolving Balance"
      )

    4.3.4 Activity Levels Metrics

    The lower income category has a slightly larger utilization ratio as opposed to the other income categories. Nevertheless, this is not a significant amount but it could indicate that the lower income category has a slightly level of activity. This will be good to understand especially when we get into the phase of feature importance for our model. The utilization ratio also tells us the level of activity by income category but again we should go to the feature importance phase to see if levels of activity is an indicator whether a customer will leave the organization.

    income_metrics <- df %>% 
      select(income_category, total_trans_ct, avg_utilization_ratio, total_trans_amt) %>% 
      group_by(income_category) %>% 
      summarise(
        avg_trans_ct = round(mean(total_trans_ct, na.rm = TRUE),2),
        avg_utilization_ratio = round(mean(avg_utilization_ratio, na.rm = TRUE),2)
      ) %>% 
      ungroup()
    
    # Perform the radar chart
    # Maybe create several metrics
    prep_tbl <- income_metrics %>% 
      pivot_longer(2:3) %>% 
      mutate(
        income_category = as.factor(income_category) %>% fct_reorder(value)
      )
    
    prep_tbl %>% 
      ggplot(aes(x=income_category, y=value, fill=name)) + 
      geom_col(color="black") +
      facet_wrap(~name, scales = "free_x") + 
      coord_flip() + custom_theme + scale_fill_nejm() + 
      geom_label(aes(label=value), color="white", hjust=1.2) + 
      labs(
        title = "Metrics by Income Category",
        subtitle = "Getting deeper into levels of activity",
        x = "Value",
        y = "Income Category"
      )

    4.4 Card Category

    The purpose of exploring the card category is to see if our clients are satisfied with the product offerings the institution offers. What are the different type of credit card the credit institution offers?

    • Blue Card
    • Gold Card
    • Silver Card
    • Platinum Card

    4.4.1 Attrition Rates

    • Levels of attrition: The levels highest percentage of attrition are coming from platinum and gold card users.
    • Lowest levels of attrition: The lowest levels of attrition are coming from the silver and blue cards.
    card_category_attrition_tbl <- df %>% 
      select(attrition_flag, card_category) %>% 
      count(attrition_flag, card_category) %>% 
      group_by(card_category) %>% 
      mutate(pct = n / sum(n)) %>% 
      ungroup() %>% 
      arrange(desc(pct)) %>% 
        mutate(
        pct_txt = scales::percent(pct),
        card_category = card_category %>% as_factor() %>% fct_reorder(pct)
        )
    
    
    
    card_category_attrition_tbl %>% 
      ggplot(aes(x = pct, y = card_category,  fill = attrition_flag)) +
      geom_col(position = "dodge", width = 0.5, color = "black") +
      facet_wrap(~ attrition_flag, scales = "free_x") + custom_theme + 
      scale_fill_nejm() +
      scale_x_continuous(labels = scales::percent) + 
      geom_label(aes(label = pct_txt), hjust = "inward", color = "white") +
      labs(
        title = "Customer Status by Income Cateogry",
        x = "Attrition Rate",
        y = "Card Category",
        caption = "Platinum cards have the highest attrition rates"
      )

    4.4.2 Total Transactions

    • Total transactions much lower for platinum: This can indicate two things from the customers leaving in this group, either they dont have enough credit limit or they belong to a lower income category. You can check that the median of the distribution for the platinum category for the total transactions is much lower than the customers that stay.
    • Gold card group: For the gold card group we dont see any major discrepancies.
    • Different clusters in our distribution: Note that we see different clusters in some of our card group distributions, this could mean that these are clusters of income level which could be correlated to levels of transactions.
    # Distribution of Utulization ratio by type of card
    
    card_utilization_tbl <- df %>% 
      select(attrition_flag, card_category, avg_utilization_ratio, total_trans_amt)
    
    
    card_utilization_tbl %>% 
      ggplot(aes(x=card_category, y=total_trans_amt,fill=card_category)) + 
      geom_jitter(aes(color=card_category),
                  alpha = 0.5,
                  size=0.5,
                  show.legend = FALSE) + 
      geom_half_violin(aes(fill=card_category),
                       side = "l",
                       alpha = 0.45,
                       show.legend = FALSE,
                       trim = FALSE) +
      geom_half_boxplot(aes(fill = card_category),
                        side = "r",
                        outlier.size = 1,
                        outlier.color = "red",
                        width = 0.4, 
                        alpha = 0.2,
                        show.legend = FALSE) + 
      stat_summary(fun = median, geom="line") + 
      facet_wrap(~ attrition_flag) + custom_theme + 
      scale_fill_manual(values = c("#1C366B", "#FCD16B", "#C7CEF6", "#D3DDDC")) + 
      scale_color_manual(values = c("#1C366B", "#FCD16B", "#C7CEF6", "#D3DDDC")) + 
      scale_y_continuous(labels = scales::dollar) +
      labs(
        title = "Distribution of Transactions by Type of Card",
        x = "Card Category",
        y = "Total Transactions"
      )

    4.5 Attrition Flags

    4.5.1 Level of Activity

    Lets analyze the level of inactivity for those customers that decided or that will leave the organization and those customers.

    • Attrited Customers: We see that the level of interactivity has a median of three months, 1 month higher than those who still stay with the organization.
    • Understanding this distribution: This could give us an indication that when the level of inactivity starts getting “beyond” the 2 month threshold, then there is a higher chance that the person will decide to leave the organization.
    • Possible alternative: Contact the client to see if they are satisfied with the service and what could the organization do to improve as a whole?
    attrition_tbl <- df %>% 
      select(attrition_flag, months_inactive_12_mon, total_revolving_bal, total_trans_amt, credit_limit)
    
    
    # Levels of Inactivity by attrition flag
    p1 <- attrition_tbl %>% 
      ggplot(aes(x = months_inactive_12_mon, y = attrition_flag, fill = attrition_flag)) + 
      stat_density_ridges(quantile_lines = TRUE, quantiles = 2, alpha = 0.4) + 
      scale_fill_nejm() + custom_theme + 
      labs(
        title = "Levels of Inactivity",
        x = "Inactive Months", 
        y = "Attrition Flag",
        caption = "Attrition Customers have higher levels of Inactivity (3 months vs 2 months Median)"
      )
    
    
    p1

    4.5.2 Levels of Revolving Balance

    We can clearly see that the customers that had a lower revolving balance were the most likely to leave. This could indicate that those customers could have left because they found other lower interest rates to other companies that offered better products at lower rates. In the case of existing customers on average they had a much higher revolving balance but customers with higher revolving balance have a much harder time to look for other financial institutions to offer lower rates due to the risk that the customer will not pay at all what is left of the remaining balance. (High risk, higher rates.)

    # Radar Chart for attrited customers
    
    max_val <- df %>% 
      select(attrition_flag, income_category, total_revolving_bal) %>% 
      filter(attrition_flag == "Attrited Customer") %>% 
      group_by(income_category) %>% 
      summarise(
        max_val = 1000
      ) %>% ungroup() %>% 
      pivot_wider(names_from = income_category, values_from = c(max_val))
    
    
    min_val <- df %>% 
      select(attrition_flag, income_category, total_revolving_bal) %>% 
      filter(attrition_flag == "Attrited Customer") %>% 
      group_by(income_category) %>% 
      summarise(
        min_val = min(total_revolving_bal)
      ) %>% ungroup() %>% 
      pivot_wider(names_from = income_category, values_from = c(min_val))
    
    
    
    revo_df <- df %>% 
      select(attrition_flag, income_category, total_revolving_bal) %>% 
      filter(attrition_flag == "Attrited Customer") %>% 
      group_by(income_category) %>% 
      summarise(
        avg_rev = mean(total_revolving_bal, na.rm = TRUE)
      ) %>% 
      ungroup() %>% 
      pivot_wider(names_from = income_category, values_from = c(avg_rev))
    
    
    
    combined_df_attrited <- max_val %>% 
      rbind(min_val) %>% 
      rbind(revo_df)
    
    
    rownames(combined_df_attrited) <- 1:nrow(combined_df_attrited) 
    
    
    
    revo_df <- df %>% 
      select(attrition_flag, income_category, total_revolving_bal) %>% 
      filter(attrition_flag == "Attrited Customer") %>% 
      group_by(income_category) %>% 
      summarise(
        avg_rev = mean(total_revolving_bal, na.rm = TRUE)
      ) %>% 
      ungroup() %>% 
      pivot_wider(names_from = income_category, values_from = c(avg_rev))
    
    
    
    # Customers that remained
    
    max_val_existing <- df %>% 
      select(attrition_flag, income_category, total_revolving_bal) %>% 
      filter(attrition_flag == "Attrited Customer") %>% 
      group_by(income_category) %>% 
      summarise(
        max_val = 1700
      ) %>% ungroup() %>% 
      pivot_wider(names_from = income_category, values_from = c(max_val))
    
    
    min_val_existing <- df %>% 
      select(attrition_flag, income_category, total_revolving_bal) %>% 
      filter(attrition_flag == "Existing Customer") %>% 
      group_by(income_category) %>% 
      summarise(
        min_val = min(total_revolving_bal)
      ) %>% ungroup() %>% 
      pivot_wider(names_from = income_category, values_from = c(min_val))
    
    
    
    revo_df_existing <- df %>% 
      select(attrition_flag, income_category, total_revolving_bal) %>% 
      filter(attrition_flag == "Existing Customer") %>% 
      group_by(income_category) %>% 
      summarise(
        avg_rev = mean(total_revolving_bal, na.rm = TRUE)
      ) %>% 
      ungroup() %>% 
      pivot_wider(names_from = income_category, values_from = c(avg_rev))
    
    
    combined_df_existing <- max_val_existing %>% 
      rbind(min_val_existing) %>% 
      rbind(revo_df_existing)
    
    
    combined_df_attrited <- max_val %>% 
      rbind(min_val) %>% 
      rbind(revo_df)
    
    
    rownames(combined_df_attrited) <- 1:nrow(combined_df_attrited) 
    
    
    create_beautiful_radarchart <- function(data, color = "#00AFBB", 
                                            vlabels = colnames(data), vlcex = 0.7,
                                            caxislabels = NULL, title = NULL, ...){
      radarchart(
        data, axistype = 1,
        # Customize the polygon
        pcol = color, pfcol = scales::alpha(color, 0.5), plwd = 2, plty = 1,
        # Customize the grid
        cglcol = "grey", cglty = 1, cglwd = 0.8,
        # Customize the axis
        axislabcol = "grey", 
        # Variable labels
        vlcex = vlcex, vlabels = vlabels,
        caxislabels = caxislabels, title = title, ...
      )
    }
    
    
    
    
    op <- par(mar = c(1, 2, 2, 1))
    attrited_plot <- create_beautiful_radarchart(combined_df_attrited, caxislabels = c(0,300 , 600, 900, 1200), title = "Attrited Customer", color="#F92923")

    existing_plot <- create_beautiful_radarchart(combined_df_existing, caxislabels = c(0, 400, 800, 1200, 1600), title = "Existing Customer", color="#23F993")

    4.5.3 Customer Attrition Groups

    In this section we will explore possible combinations of groups of people that leave the organization. Through this analysis we could quickly visualize clusters that are more likely to leave the organization.

    • Explanation of the chart: This chart is called a sanky diagram, and what basically its telling you is the amount of people allocated to each categorical variable from the attrited customers.
    • Low Income Female Graduates: This cluster group composes the highest amount of people who of attrited customers..
    sample_data <- df %>% 
      select(attrition_flag, education_level, gender, income_category) %>% 
      filter(attrition_flag == "Attrited Customer") %>% 
      count(attrition_flag, education_level, gender, income_category) %>% 
      mutate(
        interesting_group = ifelse(gender == "F" & education_level == "Graduate" &
                                     income_category == "Less than $40K", "Interesting", "Not Interesting")
      )
    
    
    
    
    
    sample_data %>% 
      ggplot(aes(y = n,
                 axis1 = gender, axis2 = education_level, axis3 = income_category)) +
      geom_alluvium(aes(fill = interesting_group), alpha = 0.4,  absolute = FALSE) + 
      geom_stratum(absolute = FALSE, width = 0.45) + 
        geom_text(stat = "stratum", aes(label = after_stat(stratum)),
                absolute = FALSE) + 
      scale_x_discrete(limits = c("Income Category", "Education Level", "Gender"), expand = c(.1, .05)) + 
      custom_theme + scale_fill_nejm() + 
      labs(
        title = "Different Groups within Customer Attrition",
        subtitle = "Attrited Customer Sample Population",
        y = "Amount",
        caption = "Female Graduates in the low income category makes a good portion of Attrited Customers"
      )

    4.6 Correlation Analysis

    Before getting into the AutoML phase, it will be interesting to explore which features are correlated with whether a customer would leave the organization. For this we will use a so called “correlation” funner which will allow us to understand which features are correlated with whether someone will leave the organization. Note: For numeric variables we have decided to bin them into 4 categories and then create dummy variables based on these features through one hot encoding.

    • Low revolving balance: People that usually pay their credit cards on time are more likely to leave the organization.
    • Low levels of inactivity: If the transaction count is less than 0.582 it is most likely that the customer will leave. Same for total transaction count if it is less than 45 for a specific client then the client is more likely to leave the organization
    • Low transaction amount: The lower the transaction amounts the higher the correlation with attrition.
    correlation_prepared_tbl <- df %>% binarize()
    
    
    # Plot correlation funnel
    correlation_prepared_tbl %>% 
      correlate(target = attrition_flag__Attrited_Customer) %>% 
      plot_correlation_funnel(interactive = FALSE) + custom_theme 

    4.7 AutoML with H2O

    In this section we will start implementing H2o. H2o is an automl library that simplifies the process of model selection. Not only that, but it reduces the time we would have spent performing a GridSearch process (Hyperparameter tuning.)

    These will be the steps to implement H2o:
    • Importing H2o
    • Split our dataframe
    • Pick the model
    • Evaluate performance metrics:

    4.7.1 Importing H2o

    library(h2o)
    Sys.setenv(JAVA_HOME = "C:/Program Files/Java/jdk1.8.0_301")
    print(Sys.getenv("JAVA_HOME"))
    ## [1] "C:/Program Files/Java/jdk1.8.0_301"
    #Call H20
    h2o.init()
    ##  Connection successful!
    ## 
    ## R is connected to the H2O cluster: 
    ##     H2O cluster uptime:         2 hours 25 minutes 
    ##     H2O cluster timezone:       Asia/Bangkok 
    ##     H2O data parsing timezone:  UTC 
    ##     H2O cluster version:        3.32.1.7 
    ##     H2O cluster version age:    2 days  
    ##     H2O cluster name:           H2O_started_from_R_DellPC_uta384 
    ##     H2O cluster total nodes:    1 
    ##     H2O cluster total memory:   2.42 GB 
    ##     H2O cluster total cores:    4 
    ##     H2O cluster allowed cores:  4 
    ##     H2O cluster healthy:        TRUE 
    ##     H2O Connection ip:          localhost 
    ##     H2O Connection port:        54321 
    ##     H2O Connection proxy:       NA 
    ##     H2O Internal Security:      FALSE 
    ##     H2O API Extensions:         Amazon S3, Algos, AutoML, Core V3, TargetEncoder, Core V4 
    ##     R Version:                  R version 4.1.1 (2021-08-10)

    4.7.2 H2o Dataframe Splits

    4.7.3 AutoML

    In this section we will implement AutoML from a series of models. AutoML is a framework that helps us simplify the implementation of machine learning models, preprocessing steps, hyperparameter tuning among other aspects of the machine learning pipeline. We will implement the following steps:

    • Implement a series of models and determine which is the best model to use.
    • Evaluate our model with a recall vs precision plot and ROC plot


    The top models that we got are all Gradient Boosting models with different parameters.

    leaderboard_data <- automl_models_h2o@leaderboard %>% 
      as_tibble() %>% 
      mutate(
        # Takes the first part before the underscore
        model_type = str_split(model_id, "_", simplify = T)[,1]
      ) %>% rownames_to_column() %>% 
      mutate(
        model_id = model_id %>% as_factor() %>% fct_reorder(auc),
        model_type = model_type %>% as_factor()
      ) %>% 
      select(rowname, model_id, model_type, everything()) %>% 
      pivot_longer(4:9) %>% 
      mutate(
        model_id = paste0(rowname, ". ", model_id) %>% as_factor() %>% fct_rev()
      ) %>% 
      filter(name %in% c("auc", "logloss"))
    
    
    
    leaderboard_data %>% 
      ggplot(aes(x=value, y=model_id, color = model_type)) + 
      geom_segment(aes(x = 0, y = model_id, xend = value, yend = model_id), color = "grey50") +
      geom_point(size=3) + facet_wrap(~ name, scales = "free_x") +
      scale_color_nejm() + custom_theme + 
      geom_label(aes(label = round(value, 2), hjust = "inward"), size = 3) +
      labs(
        title = "Model Leaderboard",
        subtitle = paste0("Ordered by: auc")
      )

    4.7.4 Performance Metrics

    Precision and recall are metrics that are commonly used for classification problems. This will allow us to better evaluate the performance of our model. Furthermore, in this case we will used the model that performed the best and that is the GBM_3_AUTOML model. However, it is possible to evaluate more than one model at the same time but to keep things simple in this tutorial I will only use one model.

    • Precision: This is the ratio of true positives against all positives. In this case, a positive will be the customers leaving the organization. A True positive will be a correct prediction by our model that a customer is leaving the organization and a false positive will be our model predicting incorrectly that a customer will be leaving.
    • Recall: is a metric of our model correctly identifying True positives (in other words customers leaving the organization.) The recall is basically the ratio of how many customers did our model predict correctly from the total customers within the institution.
    • Receiver Operating Characteristic (ROC) and Area Under Curve (AUC): ROC is a probability curve that measures how correctly the model is able to predict the correct outcome while AUC represents the degree of separability. The higher the AUC (closer to 1) the better our model is.
    metrics_tbl <- performance_h2o %>% 
      h2o.metric() %>% 
      as_tibble()
    
    
    
    
    pr_plot <- metrics_tbl %>% 
      ggplot(aes(x=threshold)) + 
      geom_line(aes(y=precision), color = "#F24D29", size=1) + 
      geom_line(aes(y=recall), color = "#1DACE8", size=1) + 
      geom_vline(xintercept = h2o.find_threshold_by_max_metric(performance_h2o, "f1"),
                 lty="dashed") + 
      custom_theme + 
      labs(
        title = "Precision vis a vis Recall"
      )
    roc_tbl <- performance_h2o %>% 
      h2o.metric() %>% 
      as_tibble() %>% 
      mutate(
      auc = h2o.auc(performance_h2o)
      ) %>% 
      select(tpr, fpr, auc)
    
    
    roc_plot <- roc_tbl %>% 
      ggplot(aes(fpr, tpr)) + 
      geom_line(color = "#F24D29", size=1) +
      geom_abline() +
      custom_theme + 
      labs(
        title = "ROC Plot"
      )
    
    plot_grid(pr_plot, roc_plot, ncol = 2)

    4.8 Understanding Feature Importance

    4.8.1 Showcasing Feature Importance

    Now it is time to better understand our model. We would want to understand what factors drive our model to make certain predictions. In this case what factors make our model determine whether a customer will leave the organization? In order to know this we need to understand which features are important to our model.



    • Important features: The features that our model considers the most important in determining whether a customer will leave the organization include: Total Transaction Amount, Total transaction count and total revolving balance.
    • Total transaction Count: When the total amount of transactions gets lower than 45 transactions it is most likely that the customer will leave the organization.
    fi_plot <- vip(leader_model) + 
      custom_theme + geom_col(fill="#F24D29", color="black") + 
      labs(
        title = "Feature Importance",
        caption = "Which features our model considers important"
      ) 
    
    
    
    
    fi_plot

    h2o.no_progress()
    
    total_trans_plot <- h2o.partialPlot(leader_model, data = as.h2o(train_tbl), cols = c("total_trans_ct", "total_trans_amt"))

    4.8.2 What Features Influences Attrition the Most?

    In this case we will use the Lime package which helps us to understand the features that our model considers important. However, we will explore single observations in which the customer decided to leave the organization.

    • Total Transaction Counts: When total transaction counts are less than 45 we see that there is a greater chance that the individual will leave the organization meaning that levels of inactivity can be a risk for a customer to leave the organization.
    • Customers that owe less to the institution: Customers that have a lower revolving balance have a greater chance to leave the organization. This is something interesting since what we would like to have are customers that owe less to the institution.



    Note: The 4th observation is an incorrect prediction on the side of our model. It looks like this was an exception even if it had all the characteristics our model thought was important to consider that a customer will be leaving the organization.

    h2o.no_progress()
    
    explainer <- train_tbl %>%
        select(-attrition_flag) %>%
        lime(
            model           = leader_model,
            bin_continuous  = TRUE,
            n_bins          = 4,
            quantile_bins   = TRUE
        )
    
    
    # Feature number 4 is an incorrect prediction!
    
    explanation <- test_tbl %>%
        filter(attrition_flag == "Attrited Customer") %>%
      slice(1:6) %>% 
        select(-attrition_flag) %>%
        lime::explain(
            explainer = explainer,
            n_labels   = 1,
            n_features = 8,
            n_permutations = 5000,
            kernel_width   = 1
        )
    
    
    plot_features(explanation = explanation)

    In this plot we can see that the trend continues with the four observations. So we see that for the first three trends we see that there is a consistency and a pattern. We could also try to explore different observations of attrited customers in order to see if this patterns continues. With this you can conclude that these are the features that our model understands that are important in determining whether the customer will leave the organization.

    plot_explanations(explanation = explanation)

    4.8.3 Interpreting Confusion Matrix

    In this section we will explain the concepts needed to interpret the metrics of a classification problem. The goal is to understand whether our model is able to get
    • True Negatives: This means that our model predicts our customer will stay and not leave the organization. The customer actually stays in the organization meaning these are the correct number of predictions from our model.
    • False Positives: Our model predicts that our employee will leave the financial institution however, the customer actually stayed.
    • False Negatives: Our model predicts that the customer will stay in the organization however, the customer leaves the organization. This has the biggest negative impact for this business case.
    • True Positives: Our model predicts the customer will leave the organization and the customer does decide to leave the institution. Meaning our model is able to correctly predict when the model leaves the organization.


    Note: You should also look at the business implications, False Positives have a lower impact to the organizations as opposed to False Negatives!

    metrics_h2o <- leader_model %>% 
      h2o.performance(newdata = test_h2o)
    
    
    predictions_tbl <- leader_model %>% 
      h2o.predict(newdata = test_h2o) %>%
      as_tibble() %>% 
      bind_cols(test_h2o %>% as_tibble())
    
    
    
    predictions_tbl %>% 
      select(1:4) %>% 
      conf_mat(attrition_flag, predict) %>% 
      pluck(1) %>% 
      as_tibble() %>% 
      ggplot(aes(x=Prediction, y=Truth, fill=n)) +
      geom_tile(show.legend = F, alpha=0.5) +
      geom_text(aes(label=n), color="black", alpha=1, size=8) + 
      scale_fill_gsea()+ 
      custom_theme + 
      labs(
        title = "Confusion Matrix"
      )

    4.8.4 Thresholds

    The concepts of thresholds is important when we come to classification problems. In this case we show the performance of different thresholds at various levels. True Negative Rates and False Positive Rates as well as false negative rates and true positive rates will always add to 1. As the threshold increases, the more precission our model has when determining whether a customer will leave the organization.

    new_performance_h2o <- leader_model %>% 
      h2o.performance(newdata = test_h2o)
    
    
    rates_by_threshold <- new_performance_h2o %>% 
      h2o.metric() %>% 
      as_tibble()
    
    
    
    
    rates_by_threshold %>% 
      select(threshold, f1, tnr:tpr) %>% 
      gather(key = "key", value = "value", tnr:tpr, factor_key = TRUE) %>% 
      mutate(
        key = fct_reorder2(key, threshold, value)
      ) %>% 
      ggplot(aes(x=threshold, value, color=key)) + 
      geom_point() + 
      geom_smooth() + 
      custom_theme + 
      theme(legend.position = "right") + 
      labs(
        title = "Rates",
        y = "Value", 
        x = "Threshold"
      )

    4.9 Strategies to Reduce Customer Attrition

    • Focus on the lower income group: Although the purchasing power is not that significant, most of our customers are from the lower income group. Implementing promotions to benefit the lower income group could be a good alternative to reduce customer churn among that cluster group.
    • Act when activity levels are low: We saw that customers that had a lower level of activity (transactions lower than 45), had a higher probability of leaving the organization. If employees call those customers with lower levels of activity to offer new products for their needs or to ask them if customers are happy with the services we provide and if there is something we can do to improve, we would probably get better insights as to what we can do to increase the level of activity.
    • Increase credit limits to those with lower revolving balance? According to our model customers with lower revolving balance are more likely to leave the organization. Maybe by implementing higher credit balance to those customers the probability of that segment group to leave the organization could be lowered.

    And thats it I hope you liked this project and how we can use AutoML to help us with certain business tasks. Thanks for the support and dont forget to upvote if you did like the notebook.