Predict attrition using the data from a credit card company, using
the classification algorithm from the rpart
package
library(tidyverse)
library(rpart)
library(rpart.plot)
library(gmodels)
library(AER)
random
to the datasetCreditCardData <- read_csv("~/Downloads/CreditCardData.csv")
set.seed(123)
attrition <- CreditCardData %>%
mutate(random = runif(10127))
train_tree <- attrition %>%
filter(random < .7) %>%
select(-random)
validation_tree <- attrition %>%
filter(random >= .7) %>%
select(-random)
str(attrition)
## tibble [10,127 × 22] (S3: tbl_df/tbl/data.frame)
## $ CLIENTNUM : num [1:10127] 7.69e+08 8.19e+08 7.14e+08 7.70e+08 7.09e+08 ...
## $ Attrition_Flag : chr [1:10127] "Existing Customer" "Existing Customer" "Existing Customer" "Existing Customer" ...
## $ Customer_Age : num [1:10127] 45 49 51 40 40 44 51 32 37 48 ...
## $ Gender : chr [1:10127] "M" "F" "M" "F" ...
## $ Dependent_count : num [1:10127] 3 5 3 4 3 2 4 0 3 2 ...
## $ Education_Level : chr [1:10127] "High School" "Graduate" "Graduate" "High School" ...
## $ Marital_Status : chr [1:10127] "Married" "Single" "Married" "Unknown" ...
## $ Income_Category : chr [1:10127] "$60K - $80K" "Less than $40K" "$80K - $120K" "Less than $40K" ...
## $ Card_Category : chr [1:10127] "Blue" "Blue" "Blue" "Blue" ...
## $ Months_on_book : num [1:10127] 39 44 36 34 21 36 46 27 36 36 ...
## $ Total_Relationship_Count: num [1:10127] 5 6 4 3 5 3 6 2 5 6 ...
## $ Months_Inactive_12_mon : num [1:10127] 1 1 1 4 1 1 1 2 2 3 ...
## $ Contacts_Count_12_mon : num [1:10127] 3 2 0 1 0 2 3 2 0 3 ...
## $ Credit_Limit : num [1:10127] 12691 8256 3418 3313 4716 ...
## $ Total_Revolving_Bal : num [1:10127] 777 864 0 2517 0 ...
## $ Avg_Open_To_Buy : num [1:10127] 11914 7392 3418 796 4716 ...
## $ Total_Amt_Chng_Q4_Q1 : num [1:10127] 1.33 1.54 2.59 1.41 2.17 ...
## $ Total_Trans_Amt : num [1:10127] 1144 1291 1887 1171 816 ...
## $ Total_Trans_Ct : num [1:10127] 42 33 20 20 28 24 31 36 24 32 ...
## $ Total_Ct_Chng_Q4_Q1 : num [1:10127] 1.62 3.71 2.33 2.33 2.5 ...
## $ Avg_Utilization_Ratio : num [1:10127] 0.061 0.105 0 0.76 0 0.311 0.066 0.048 0.113 0.144 ...
## $ random : num [1:10127] 0.288 0.788 0.409 0.883 0.94 ...
ct1 <- rpart(Attrition_Flag ~ ., data = train_tree , method = 'class')
rpart.plot(ct1)
ct1$variable.importance %>%
data.frame() %>%
rownames_to_column(var = "Feature") %>%
rename(Overall = '.') %>%
ggplot(aes(x = fct_reorder(Feature, Overall), y = Overall)) +
geom_pointrange(aes(ymin = 0, ymax = Overall), color = "cadetblue", size = .3) +
theme_minimal() +
coord_flip() +
labs(x = "", y = "", title = "Variable Importance with Simple Classication")
Using the validation data, it is observed that the model is able to:
validation_tree$Attrition_Flag_predicted <- predict(ct1 , validation_tree , type = 'class')
CrossTable(validation_tree$Attrition_Flag , validation_tree$Attrition_Flag_predicted , chisq = T)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 2990
##
##
## | validation_tree$Attrition_Flag_predicted
## validation_tree$Attrition_Flag | Attrited Customer | Existing Customer | Row Total |
## -------------------------------|-------------------|-------------------|-------------------|
## Attrited Customer | 364 | 118 | 482 |
## | 1155.572 | 206.873 | |
## | 0.755 | 0.245 | 0.161 |
## | 0.802 | 0.047 | |
## | 0.122 | 0.039 | |
## -------------------------------|-------------------|-------------------|-------------------|
## Existing Customer | 90 | 2418 | 2508 |
## | 222.084 | 39.758 | |
## | 0.036 | 0.964 | 0.839 |
## | 0.198 | 0.953 | |
## | 0.030 | 0.809 | |
## -------------------------------|-------------------|-------------------|-------------------|
## Column Total | 454 | 2536 | 2990 |
## | 0.152 | 0.848 | |
## -------------------------------|-------------------|-------------------|-------------------|
##
##
## Statistics for All Table Factors
##
##
## Pearson's Chi-squared test
## ------------------------------------------------------------
## Chi^2 = 1624.287 d.f. = 1 p = 0
##
## Pearson's Chi-squared test with Yates' continuity correction
## ------------------------------------------------------------
## Chi^2 = 1618.706 d.f. = 1 p = 0
##
##