#Remove variables not needed and recode if needed
Data.Full <- Data %>% dplyr::select(-c(Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1,
                              Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2,
                              CLIENTNUM))

#recode gender variable
Data.Full$Gender[Data.Full$Gender == 'M'] = "Male"
Data.Full$Gender[Data.Full$Gender == 'F'] = "Female"

#recode outcome variable
Data.Full$Attrition_Flag[Data.Full$Attrition_Flag == 'Existing Customer'] = "not_churned"
Data.Full$Attrition_Flag[Data.Full$Attrition_Flag == 'Attrited Customer'] = "churned"


#set factors
Data.Full$Attrition_Flag = as.factor(Data.Full$Attrition_Flag)
Data.Full$Gender = as.factor(Data.Full$Gender)
Data.Full$Education_Level= as.factor(Data.Full$Education_Level)
Data.Full$Marital_Status = as.factor(Data.Full$Marital_Status)
Data.Full$Income_Category = as.factor(Data.Full$Income_Category)
Data.Full$Card_Category = as.factor(Data.Full$Card_Category)

str(Data.Full)
## 'data.frame':    10127 obs. of  20 variables:
##  $ Attrition_Flag          : Factor w/ 2 levels "churned","not_churned": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Customer_Age            : int  45 49 51 40 40 44 51 32 37 48 ...
##  $ Gender                  : Factor w/ 2 levels "Female","Male": 2 1 2 1 2 2 2 2 2 2 ...
##  $ Dependent_count         : int  3 5 3 4 3 2 4 0 3 2 ...
##  $ Education_Level         : Factor w/ 7 levels "College","Doctorate",..: 4 3 3 4 6 3 7 4 6 3 ...
##  $ Marital_Status          : Factor w/ 4 levels "Divorced","Married",..: 2 3 2 4 2 2 2 4 3 3 ...
##  $ Income_Category         : Factor w/ 6 levels "$120K +","$40K - $60K",..: 3 5 4 5 3 2 1 3 3 4 ...
##  $ Card_Category           : Factor w/ 4 levels "Blue","Gold",..: 1 1 1 1 1 1 2 4 1 1 ...
##  $ Months_on_book          : int  39 44 36 34 21 36 46 27 36 36 ...
##  $ Total_Relationship_Count: int  5 6 4 3 5 3 6 2 5 6 ...
##  $ Months_Inactive_12_mon  : int  1 1 1 4 1 1 1 2 2 3 ...
##  $ Contacts_Count_12_mon   : int  3 2 0 1 0 2 3 2 0 3 ...
##  $ Credit_Limit            : num  12691 8256 3418 3313 4716 ...
##  $ Total_Revolving_Bal     : int  777 864 0 2517 0 1247 2264 1396 2517 1677 ...
##  $ Avg_Open_To_Buy         : num  11914 7392 3418 796 4716 ...
##  $ Total_Amt_Chng_Q4_Q1    : num  1.33 1.54 2.59 1.4 2.17 ...
##  $ Total_Trans_Amt         : int  1144 1291 1887 1171 816 1088 1330 1538 1350 1441 ...
##  $ Total_Trans_Ct          : int  42 33 20 20 28 24 31 36 24 32 ...
##  $ Total_Ct_Chng_Q4_Q1     : num  1.62 3.71 2.33 2.33 2.5 ...
##  $ Avg_Utilization_Ratio   : num  0.061 0.105 0 0.76 0 0.311 0.066 0.048 0.113 0.144 ...

Data Explorations

#check for missing values
sum(is.na(Data.Full))
## [1] 0
#plot of missing values 
plot_missing(Data.Full, 
             group = c("Excellent" = 0.0, "Good" = .01, "Ok" = .05, "Bad" = .1),
             missing_only = F,
             title= "Missing on Churn Data",
             ggtheme = theme_bw(),
             theme_config = list(legend.position = c("below")) )

Plot of outcome variable

ggplot(Data.Full, aes(x = Attrition_Flag)) +
  geom_bar(aes(fill = Attrition_Flag),width = .8, stat = "count", position = "dodge") +
  geom_text(stat='count', aes(label=..count..), vjust= 1, colour = "black") +
  labs(title= "Distribution of Attrition", y = "Count", x = "Attrition Flag") +
  scale_fill_discrete(name = "Attrition Flag") +
  scale_fill_manual("Attrition Flag", values = c("churned" = "#cc6666", "not_churned" = "#999999"))+ 
  theme(axis.title.y=element_blank(), 
        axis.title.x=element_blank(),
        axis.ticks = element_blank(),
        legend.position = "none", 
        panel.grid = element_blank())
## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.

Histograms of all numeric variables

Data.Full %>%
  keep(is.numeric) %>% 
  gather() %>% 
  ggplot(aes(value)) +
  facet_wrap(~ key, scales = "free") +
  geom_histogram(aes(y=..density..), colour="black", fill="white")+
  geom_density(alpha=.2, fill="#3399CC")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Boxplots of all numeric variables

par(mar=c(2,2,2,2))
par(mfrow=c(5,3)) 
boxplot(Data.Full$Customer_Age, col="#FF9999" , main="Customer's Age in Years" )
boxplot(Data.Full$Dependent_count,col="#FF9999" , main="Number of dependents")
boxplot(Data.Full$Months_on_book, col="#FF9999" , main="Period of relationship with bank")
boxplot(Data.Full$Total_Relationship_Count,col="#FF9999" , main="Total no. of products held by the customer")
boxplot(Data.Full$Months_Inactive_12_mon,col="#FF9999" , main="No. of months inactive in the last 12 months")
boxplot(Data.Full$Contacts_Count_12_mon,col="#FF9999" , main="No. of Contacts in the last 12 months")
boxplot(Data.Full$Credit_Limit,col="#FF9999" , main="Credit Limit on the Credit Card") 
boxplot(Data.Full$Total_Revolving_Bal,col="#FF9999" , main="Total Revolving Balance on the Credit Card")
boxplot(Data.Full$Avg_Open_To_Buy,col="#FF9999" , main="Credit Line Average of last 12 months")
boxplot(Data.Full$Total_Amt_Chng_Q4_Q1,col="#FF9999" , main="Change in Transaction Amount (Q4 over Q1)")
boxplot(Data.Full$Total_Trans_Amt,col="#FF9999" , main="Total Transaction Amount (Last 12 months)")
boxplot(Data.Full$Total_Trans_Ct, col="#FF9999" , main="Total Transaction Count (Last 12 months)" )
boxplot(Data.Full$Total_Ct_Chng_Q4_Q1, col="#FF9999" , main="Change in Transaction Count (Q4 over Q1)" )
boxplot(Data.Full$Avg_Utilization_Ratio, col="#FF9999" , main="Average Card Utilization Ratio" )

Plots of all factor variables to the outcome variables

GENDER

ggplot(Data.Full, aes(x = Gender)) +
  geom_bar(aes(fill = Attrition_Flag),width = .8, stat = "count", position = "dodge") +
  labs(title= "Distribution of Attrition by Gender", y = "Count", x = "Attrition Flag") +
  scale_fill_discrete(name = "Attrition Flag") +
  scale_fill_manual("Attrition Flag", values = c("churned" = "#cc6666", "not_churned" = "#999999"))+ 
  theme(axis.title.y=element_blank(), 
        axis.title.x=element_blank(),
        axis.ticks = element_blank(),
        panel.grid = element_blank())
## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.

EDUCATION LEVEL

ggplot(Data.Full, aes(x = Education_Level)) +
  geom_bar(aes(fill = Attrition_Flag),width = .8, stat = "count", position = "dodge") +
  labs(title= "Distribution of Attrition by Education Level", y = "Count", x = "Attrition Flag") +
  scale_fill_discrete(name = "Attrition Flag") +
  scale_fill_manual("Attrition Flag", values = c("churned" = "#cc6666", "not_churned" = "#999999"))+ 
  theme(axis.title.y=element_blank(), 
        axis.title.x=element_blank(),
        axis.ticks = element_blank(),
        panel.grid = element_blank())
## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.

MARITAL STATUS

ggplot(Data.Full, aes(x = Marital_Status)) +
  geom_bar(aes(fill = Attrition_Flag),width = .8, stat = "count", position = "dodge") +
  labs(title= "Distribution of Attrition by Marital Status", y = "Count", x = "Attrition Flag") +
  scale_fill_discrete(name = "Attrition Flag") +
  scale_fill_manual("Attrition Flag", values = c("churned" = "#cc6666", "not_churned" = "#999999"))+ 
  theme(axis.title.y=element_blank(), 
        axis.title.x=element_blank(),
        axis.ticks = element_blank(),
        panel.grid = element_blank())
## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.

INCOME CATEGORY

ggplot(Data.Full, aes(x = Income_Category)) +
  geom_bar(aes(fill = Attrition_Flag),width = .8, stat = "count", position = "dodge") +
  labs(title= "Distribution of Attrition by Income Category", y = "Count", x = "Attrition Flag") +
  scale_fill_discrete(name = "Attrition Flag") +
  scale_fill_manual("Attrition Flag", values = c("churned" = "#cc6666", "not_churned" = "#999999"))+ 
  theme(axis.title.y=element_blank(), 
        axis.title.x=element_blank(),
        axis.ticks = element_blank(),
        panel.grid = element_blank())
## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.

CARD CATEGORY

ggplot(Data.Full, aes(x = Card_Category)) +
  geom_bar(aes(fill = Attrition_Flag),width = .8, stat = "count", position = "dodge") +
  labs(title= "Distribution of Attrition by Card Category", y = "Count", x = "Attrition Flag") +
  scale_fill_discrete(name = "Attrition Flag") +
  scale_fill_manual("Attrition Flag", values = c("churned" = "#cc6666", "not_churned" = "#999999"))+ 
  theme(axis.title.y=element_blank(), 
        axis.title.x=element_blank(),
        axis.ticks = element_blank(),
        panel.grid = element_blank())
## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.

Correlation

#Correlation of numeric variables 

NumVar1 <- Data.Full[, sapply(Data.Full, class) %in% c('integer', 'numeric')]
cormatrix1 <- cor(NumVar1)
corrplot(cormatrix1, method = "number", tl.col = "black", tl.cex = .80, number.cex = .80)

#get name of the suggested variable to be remove due to being highly correlated with other variables
NumVar1.High.Cor <- findCorrelation(cormatrix1, .75, names = T)
NumVar1.High.Cor
## [1] "Avg_Open_To_Buy" "Total_Trans_Amt" "Customer_Age"
#Remove highly correlated variable "Average open to buy " first 
NumVar2 <- select(NumVar1, -Avg_Open_To_Buy)
cormatrix2 <- cor(NumVar2)
corrplot(cormatrix2, method = "number", tl.col = "black", tl.cex = .70, number.cex = .80)

#get name of the suggested variable to be remove due to being highly correlated with other variables
NumVar2.High.Cor <- findCorrelation(cormatrix2, .75, names = T)
NumVar2.High.Cor
## [1] "Total_Trans_Amt" "Customer_Age"
#Remove highly correlated variable "Total Trans Amount" because it is correlated with "Total Trans Amt" at 0.81
NumVar3 <- select(NumVar2, -Total_Trans_Amt)
cormatrix3 <- cor(NumVar3)
corrplot(cormatrix3, method = "number", tl.col = "black", tl.cex = .70, number.cex = .80)

#get name of the suggested variable to be remove due to being highly correlated with other variables
NumVar3.High.Cor <- findCorrelation(cormatrix3, .75, names = T)
NumVar3.High.Cor
## [1] "Customer_Age"
#Remove highly correlated variable "Customer Age" 
NumVar4 <- select(NumVar3, -Customer_Age)
cormatrix4 <- cor(NumVar4)
corrplot(cormatrix4, method = "number", tl.col = "black", tl.cex = .90, number.cex = .90)

#get name of the suggested variable to be remove due to being highly correlated with other variables
NumVar4.High.Cor <- findCorrelation(cormatrix4, .75, names = T)
NumVar4.High.Cor
## character(0)

Remove highly correlated variables and create REDUCED dataset

#Produce reduced dataset without the highly correlated data, the data should have 17 variables
Data.Reduced <- dplyr::select(Data.Full, -c(Avg_Open_To_Buy, Total_Trans_Amt, Customer_Age))
str(Data.Reduced)
## 'data.frame':    10127 obs. of  17 variables:
##  $ Attrition_Flag          : Factor w/ 2 levels "churned","not_churned": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Gender                  : Factor w/ 2 levels "Female","Male": 2 1 2 1 2 2 2 2 2 2 ...
##  $ Dependent_count         : int  3 5 3 4 3 2 4 0 3 2 ...
##  $ Education_Level         : Factor w/ 7 levels "College","Doctorate",..: 4 3 3 4 6 3 7 4 6 3 ...
##  $ Marital_Status          : Factor w/ 4 levels "Divorced","Married",..: 2 3 2 4 2 2 2 4 3 3 ...
##  $ Income_Category         : Factor w/ 6 levels "$120K +","$40K - $60K",..: 3 5 4 5 3 2 1 3 3 4 ...
##  $ Card_Category           : Factor w/ 4 levels "Blue","Gold",..: 1 1 1 1 1 1 2 4 1 1 ...
##  $ Months_on_book          : int  39 44 36 34 21 36 46 27 36 36 ...
##  $ Total_Relationship_Count: int  5 6 4 3 5 3 6 2 5 6 ...
##  $ Months_Inactive_12_mon  : int  1 1 1 4 1 1 1 2 2 3 ...
##  $ Contacts_Count_12_mon   : int  3 2 0 1 0 2 3 2 0 3 ...
##  $ Credit_Limit            : num  12691 8256 3418 3313 4716 ...
##  $ Total_Revolving_Bal     : int  777 864 0 2517 0 1247 2264 1396 2517 1677 ...
##  $ Total_Amt_Chng_Q4_Q1    : num  1.33 1.54 2.59 1.4 2.17 ...
##  $ Total_Trans_Ct          : int  42 33 20 20 28 24 31 36 24 32 ...
##  $ Total_Ct_Chng_Q4_Q1     : num  1.62 3.71 2.33 2.33 2.5 ...
##  $ Avg_Utilization_Ratio   : num  0.061 0.105 0 0.76 0 0.311 0.066 0.048 0.113 0.144 ...

Create test and train set from the full dataset

set.seed(1)
full.inTrain <- createDataPartition(Data.Full$Attrition_Flag, p = .8)[[1]]
full.train <- Data.Full[full.inTrain,]
full.test <- Data.Full[-full.inTrain,]

Create test and train set from the reduced dataset

set.seed(1)
reduced.inTrain <- createDataPartition(Data.Reduced$Attrition_Flag, p = .8)[[1]]
reduced.train <- Data.Reduced[reduced.inTrain,]
reduced.test <- Data.Reduced[-reduced.inTrain,]
#Remove unneeded objects from the environment
rm(cormatrix1, cormatrix2, cormatrix3, cormatrix4, NumVar1, NumVar2, NumVar3, NumVar4, NumVar1.High.Cor, NumVar2.High.Cor, NumVar3.High.Cor, NumVar4.High.Cor)

Set control

set.seed(1)
ctrl <- trainControl(method = "repeatedcv",
                     number =10, 
                     repeats=3,  
                     summaryFunction=twoClassSummary,
                     classProbs = TRUE)

MODELING

Full Logistic Regression

Fit the full logistic model **Removed Avg_Open_To_Buy because it is perfectly correlated with credit limit and VIF will not run if there are variables that are perfectly correlated

set.seed(1)
glm.full <- train(Attrition_Flag~ . -Avg_Open_To_Buy , data = full.train, method = "glm", metric = "ROC", trControl = ctrl)
summary(glm.full)
## 
## Call:
## NULL
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.5268   0.0688   0.1736   0.3644   2.8020  
## 
## Coefficients:
##                                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                     -6.404e+00  5.294e-01 -12.098  < 2e-16 ***
## Customer_Age                     6.893e-03  8.650e-03   0.797 0.425508    
## GenderMale                       8.497e-01  1.609e-01   5.279 1.30e-07 ***
## Dependent_count                 -1.321e-01  3.351e-02  -3.940 8.14e-05 ***
## Education_LevelDoctorate        -4.023e-01  2.279e-01  -1.765 0.077486 .  
## Education_LevelGraduate          7.764e-03  1.558e-01   0.050 0.960255    
## `Education_LevelHigh School`    -8.308e-02  1.654e-01  -0.502 0.615529    
## `Education_LevelPost-Graduate`  -2.086e-01  2.325e-01  -0.897 0.369529    
## Education_LevelUneducated       -6.068e-02  1.754e-01  -0.346 0.729331    
## Education_LevelUnknown          -1.608e-01  1.726e-01  -0.932 0.351313    
## Marital_StatusMarried            5.246e-01  1.737e-01   3.021 0.002520 ** 
## Marital_StatusSingle            -3.970e-02  1.739e-01  -0.228 0.819399    
## Marital_StatusUnknown            9.720e-03  2.220e-01   0.044 0.965083    
## `Income_Category$40K - $60K`     8.409e-01  2.275e-01   3.696 0.000219 ***
## `Income_Category$60K - $80K`     5.339e-01  2.022e-01   2.640 0.008284 ** 
## `Income_Category$80K - $120K`    2.595e-01  1.882e-01   1.379 0.167930    
## `Income_CategoryLess than $40K`  6.359e-01  2.458e-01   2.587 0.009668 ** 
## Income_CategoryUnknown           6.689e-01  2.573e-01   2.599 0.009336 ** 
## Card_CategoryGold               -5.885e-01  4.322e-01  -1.362 0.173297    
## Card_CategoryPlatinum           -5.265e-01  8.126e-01  -0.648 0.517067    
## Card_CategorySilver             -3.139e-01  2.212e-01  -1.419 0.155977    
## Months_on_book                   3.871e-03  8.579e-03   0.451 0.651803    
## Total_Relationship_Count         4.467e-01  3.073e-02  14.537  < 2e-16 ***
## Months_Inactive_12_mon          -5.118e-01  4.217e-02 -12.137  < 2e-16 ***
## Contacts_Count_12_mon           -5.356e-01  4.061e-02 -13.189  < 2e-16 ***
## Credit_Limit                     1.919e-05  7.711e-06   2.488 0.012845 *  
## Total_Revolving_Bal              9.075e-04  8.048e-05  11.276  < 2e-16 ***
## Total_Amt_Chng_Q4_Q1             3.603e-01  2.081e-01   1.731 0.083425 .  
## Total_Trans_Amt                 -4.755e-04  2.531e-05 -18.788  < 2e-16 ***
## Total_Trans_Ct                   1.164e-01  4.115e-03  28.292  < 2e-16 ***
## Total_Ct_Chng_Q4_Q1              2.787e+00  2.116e-01  13.169  < 2e-16 ***
## Avg_Utilization_Ratio            2.390e-01  2.794e-01   0.855 0.392348    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 7143.2  on 8101  degrees of freedom
## Residual deviance: 3795.7  on 8070  degrees of freedom
## AIC: 3859.7
## 
## Number of Fisher Scoring iterations: 6

Check VIF of the full logistic regression

car::vif(glm.full$finalModel)
##                    Customer_Age                      GenderMale 
##                        2.880910                        3.655510 
##                 Dependent_count        Education_LevelDoctorate 
##                        1.057423                        1.475645 
##         Education_LevelGraduate    `Education_LevelHigh School` 
##                        2.911585                        2.468186 
##  `Education_LevelPost-Graduate`       Education_LevelUneducated 
##                        1.449436                        2.137589 
##          Education_LevelUnknown           Marital_StatusMarried 
##                        2.220887                        4.247873 
##            Marital_StatusSingle           Marital_StatusUnknown 
##                        4.101392                        1.961592 
##    `Income_Category$40K - $60K`    `Income_Category$60K - $80K` 
##                        3.973161                        2.627034 
##   `Income_Category$80K - $120K` `Income_CategoryLess than $40K` 
##                        2.650383                        7.863067 
##          Income_CategoryUnknown               Card_CategoryGold 
##                        3.922451                        1.124715 
##           Card_CategoryPlatinum             Card_CategorySilver 
##                        1.037942                        1.397398 
##                  Months_on_book        Total_Relationship_Count 
##                        2.877328                        1.191303 
##          Months_Inactive_12_mon           Contacts_Count_12_mon 
##                        1.059515                        1.048592 
##                    Credit_Limit             Total_Revolving_Bal 
##                        2.689645                        2.552419 
##            Total_Amt_Chng_Q4_Q1                 Total_Trans_Amt 
##                        1.161880                        4.343041 
##                  Total_Trans_Ct             Total_Ct_Chng_Q4_Q1 
##                        4.486725                        1.186426 
##           Avg_Utilization_Ratio 
##                        2.976799

Confusion matrix of the full logistic model

set.seed(1)
glm.probs.full <- predict(glm.full, newdata = full.test, type = "raw")
glm.cm.full<- caret::confusionMatrix(as.factor(glm.probs.full),as.factor(full.test$Attrition_Flag), positive = "churned")
glm.cm.full
## Confusion Matrix and Statistics
## 
##              Reference
## Prediction    churned not_churned
##   churned         195          54
##   not_churned     130        1646
##                                           
##                Accuracy : 0.9091          
##                  95% CI : (0.8958, 0.9213)
##     No Information Rate : 0.8395          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.6276          
##                                           
##  Mcnemar's Test P-Value : 3.219e-08       
##                                           
##             Sensitivity : 0.6000          
##             Specificity : 0.9682          
##          Pos Pred Value : 0.7831          
##          Neg Pred Value : 0.9268          
##              Prevalence : 0.1605          
##          Detection Rate : 0.0963          
##    Detection Prevalence : 0.1230          
##       Balanced Accuracy : 0.7841          
##                                           
##        'Positive' Class : churned         
## 

Plot of the full logistic model confusion matrix

draw_confusion_matrix <- function(glm.cm.full) {

  layout(matrix(c(1,1,2)))
  par(mar=c(2,2,2,2))
  plot(c(100, 345), c(300, 450), type = "n", xlab="", ylab="", xaxt='n', yaxt='n')
  title('Full Logistic Regression Confusion Matrix', cex.main=2)

  # create the matrix 
  rect(150, 430, 240, 370, col='#72a376')
  text(195, 435, 'churned', cex=1.2)
  rect(250, 430, 340, 370, col='#c0504d')
  text(295, 435, 'not churned', cex=1.2)
  text(125, 370, 'Predicted', cex=1.3, srt=90, font=2)
  text(245, 450, 'Actual', cex=1.3, font=2)
  rect(150, 305, 240, 365, col='#c0504d')
  rect(250, 305, 340, 365, col='#72a376')
  text(140, 400, 'churned', cex=1.2, srt=90)
  text(140, 335, 'not churned', cex=1.2, srt=90)

  # add in the glm.cm results 
  res <- as.numeric(glm.cm.full$table)
  text(195, 400, res[1], cex=2, font=2, col='white')
  text(195, 335, res[2], cex=2, font=2, col='white')
  text(295, 400, res[3], cex=2, font=2, col='white')
  text(295, 335, res[4], cex=2, font=2, col='white')

  # add in the specifics 
  plot(c(100, 0), c(100, 0), type = "n", xlab="", ylab="", main = "DETAILS", xaxt='n', yaxt='n')
  text(10, 85, names(glm.cm.full$byClass[1]), cex=1.2, font=2)
  text(10, 70, round(as.numeric(glm.cm.full$byClass[1]), 3), cex=1.2)
  text(30, 85, names(glm.cm.full$byClass[2]), cex=1.2, font=2)
  text(30, 70, round(as.numeric(glm.cm.full$byClass[2]), 3), cex=1.2)
  text(50, 85, names(glm.cm.full$byClass[5]), cex=1.2, font=2)
  text(50, 70, round(as.numeric(glm.cm.full$byClass[5]), 3), cex=1.2)
  text(70, 85, names(glm.cm.full$byClass[6]), cex=1.2, font=2)
  text(70, 70, round(as.numeric(glm.cm.full$byClass[6]), 3), cex=1.2)
  text(90, 85, names(glm.cm.full$byClass[7]), cex=1.2, font=2)
  text(90, 70, round(as.numeric(glm.cm.full$byClass[7]), 3), cex=1.2)

  # add in the accuracy information 
  text(30, 35, names(glm.cm.full$overall[1]), cex=1.5, font=2)
  text(30, 20, round(as.numeric(glm.cm.full$overall[1]), 3), cex=1.4)
  text(70, 35, names(glm.cm.full$overall[2]), cex=1.5, font=2)
  text(70, 20, round(as.numeric(glm.cm.full$overall[2]), 3), cex=1.4)
}  

draw_confusion_matrix(glm.cm.full)

Variable importance of the full logistic model

varImp(glm.full, scale = FALSE)
## glm variable importance
## 
##   only 20 most important variables shown (out of 31)
## 
##                                 Overall
## Total_Trans_Ct                   28.292
## Total_Trans_Amt                  18.788
## Total_Relationship_Count         14.537
## Contacts_Count_12_mon            13.189
## Total_Ct_Chng_Q4_Q1              13.169
## Months_Inactive_12_mon           12.137
## Total_Revolving_Bal              11.276
## GenderMale                        5.279
## Dependent_count                   3.940
## `Income_Category$40K - $60K`      3.696
## Marital_StatusMarried             3.021
## `Income_Category$60K - $80K`      2.640
## Income_CategoryUnknown            2.599
## `Income_CategoryLess than $40K`   2.587
## Credit_Limit                      2.488
## Education_LevelDoctorate          1.765
## Total_Amt_Chng_Q4_Q1              1.731
## Card_CategorySilver               1.419
## `Income_Category$80K - $120K`     1.379
## Card_CategoryGold                 1.362
plot(varImp(glm.full, scale = FALSE))

Reduced Logistic Regression

Fit the reduced logistic model

set.seed(1)
glm.reduced <- train(Attrition_Flag~., data = reduced.train, method = "glm", metric = "ROC", trControl = ctrl)
summary(glm.reduced)
## 
## Call:
## NULL
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.7187   0.0890   0.2066   0.4129   2.5978  
## 
## Coefficients:
##                                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                     -4.992e+00  4.649e-01 -10.738  < 2e-16 ***
## GenderMale                       6.849e-01  1.549e-01   4.422 9.76e-06 ***
## Dependent_count                 -1.149e-01  3.209e-02  -3.582 0.000342 ***
## Education_LevelDoctorate        -4.373e-01  2.164e-01  -2.021 0.043284 *  
## Education_LevelGraduate         -8.449e-02  1.491e-01  -0.567 0.570934    
## `Education_LevelHigh School`    -1.835e-01  1.586e-01  -1.157 0.247228    
## `Education_LevelPost-Graduate`  -3.051e-01  2.221e-01  -1.374 0.169568    
## Education_LevelUneducated       -1.435e-01  1.682e-01  -0.853 0.393423    
## Education_LevelUnknown          -2.486e-01  1.655e-01  -1.502 0.133106    
## Marital_StatusMarried            3.951e-01  1.653e-01   2.389 0.016873 *  
## Marital_StatusSingle            -7.506e-02  1.657e-01  -0.453 0.650481    
## Marital_StatusUnknown           -1.304e-01  2.122e-01  -0.615 0.538808    
## `Income_Category$40K - $60K`     6.552e-01  2.202e-01   2.976 0.002923 ** 
## `Income_Category$60K - $80K`     4.442e-01  1.970e-01   2.254 0.024167 *  
## `Income_Category$80K - $120K`    2.006e-01  1.824e-01   1.100 0.271384    
## `Income_CategoryLess than $40K`  4.372e-01  2.382e-01   1.836 0.066386 .  
## Income_CategoryUnknown           5.739e-01  2.482e-01   2.312 0.020777 *  
## Card_CategoryGold               -7.216e-01  4.313e-01  -1.673 0.094289 .  
## Card_CategoryPlatinum           -8.979e-01  8.418e-01  -1.067 0.286127    
## Card_CategorySilver             -5.461e-01  2.143e-01  -2.549 0.010816 *  
## Months_on_book                   9.645e-03  5.021e-03   1.921 0.054707 .  
## Total_Relationship_Count         5.273e-01  2.958e-02  17.825  < 2e-16 ***
## Months_Inactive_12_mon          -4.791e-01  4.019e-02 -11.921  < 2e-16 ***
## Contacts_Count_12_mon           -4.949e-01  3.848e-02 -12.860  < 2e-16 ***
## Credit_Limit                     1.484e-05  7.490e-06   1.981 0.047610 *  
## Total_Revolving_Bal              8.028e-04  7.807e-05  10.284  < 2e-16 ***
## Total_Amt_Chng_Q4_Q1             6.491e-03  1.930e-01   0.034 0.973175    
## Total_Trans_Ct                   6.526e-02  2.415e-03  27.025  < 2e-16 ***
## Total_Ct_Chng_Q4_Q1              2.682e+00  2.056e-01  13.046  < 2e-16 ***
## Avg_Utilization_Ratio            5.259e-01  2.673e-01   1.968 0.049116 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 7143.2  on 8101  degrees of freedom
## Residual deviance: 4134.5  on 8072  degrees of freedom
## AIC: 4194.5
## 
## Number of Fisher Scoring iterations: 6

Check VIF of the reduced logistic regression

car::vif(glm.reduced$finalModel)
##                      GenderMale                 Dependent_count 
##                        3.676797                        1.055243 
##        Education_LevelDoctorate         Education_LevelGraduate 
##                        1.484708                        2.915320 
##    `Education_LevelHigh School`  `Education_LevelPost-Graduate` 
##                        2.464830                        1.450609 
##       Education_LevelUneducated          Education_LevelUnknown 
##                        2.130946                        2.215503 
##           Marital_StatusMarried            Marital_StatusSingle 
##                        4.193971                        4.068851 
##           Marital_StatusUnknown    `Income_Category$40K - $60K` 
##                        1.938676                        4.114081 
##    `Income_Category$60K - $80K`   `Income_Category$80K - $120K` 
##                        2.637744                        2.682099 
## `Income_CategoryLess than $40K`          Income_CategoryUnknown 
##                        8.096989                        4.023133 
##               Card_CategoryGold           Card_CategoryPlatinum 
##                        1.121417                        1.035684 
##             Card_CategorySilver                  Months_on_book 
##                        1.412281                        1.048936 
##        Total_Relationship_Count          Months_Inactive_12_mon 
##                        1.119741                        1.047173 
##           Contacts_Count_12_mon                    Credit_Limit 
##                        1.035410                        2.756137 
##             Total_Revolving_Bal            Total_Amt_Chng_Q4_Q1 
##                        2.566631                        1.150400 
##                  Total_Trans_Ct             Total_Ct_Chng_Q4_Q1 
##                        1.295491                        1.188908 
##           Avg_Utilization_Ratio 
##                        2.975631

Confusion matrix of the reduced logistic model

set.seed(1)
glm.probs.reduced <- predict(glm.reduced, newdata = reduced.test, type = "raw")
glm.cm.reduced<- caret::confusionMatrix(as.factor(glm.probs.reduced),as.factor(reduced.test$Attrition_Flag), positive = "churned")
glm.cm.reduced
## Confusion Matrix and Statistics
## 
##              Reference
## Prediction    churned not_churned
##   churned         175          55
##   not_churned     150        1645
##                                           
##                Accuracy : 0.8988          
##                  95% CI : (0.8848, 0.9116)
##     No Information Rate : 0.8395          
##     P-Value [Acc > NIR] : 9.185e-15       
##                                           
##                   Kappa : 0.574           
##                                           
##  Mcnemar's Test P-Value : 5.195e-11       
##                                           
##             Sensitivity : 0.53846         
##             Specificity : 0.96765         
##          Pos Pred Value : 0.76087         
##          Neg Pred Value : 0.91643         
##              Prevalence : 0.16049         
##          Detection Rate : 0.08642         
##    Detection Prevalence : 0.11358         
##       Balanced Accuracy : 0.75305         
##                                           
##        'Positive' Class : churned         
## 

Plot of the reduced logistic model confusion matrix

draw_confusion_matrix <- function(glm.cm.reduced) {

  layout(matrix(c(1,1,2)))
  par(mar=c(2,2,2,2))
  plot(c(100, 345), c(300, 450), type = "n", xlab="", ylab="", xaxt='n', yaxt='n')
  title('Reduced Logistic Regression Confusion Matrix', cex.main=2)

  # create the matrix 
  rect(150, 430, 240, 370, col='#72a376')
  text(195, 435, 'churned', cex=1.2)
  rect(250, 430, 340, 370, col='#c0504d')
  text(295, 435, 'not churned', cex=1.2)
  text(125, 370, 'Predicted', cex=1.3, srt=90, font=2)
  text(245, 450, 'Actual', cex=1.3, font=2)
  rect(150, 305, 240, 365, col='#c0504d')
  rect(250, 305, 340, 365, col='#72a376')
  text(140, 400, 'churned', cex=1.2, srt=90)
  text(140, 335, 'not churned', cex=1.2, srt=90)

  # add in the glm.cm.reduced results 
  res <- as.numeric(glm.cm.reduced$table)
  text(195, 400, res[1], cex=2, font=2, col='white')
  text(195, 335, res[2], cex=2, font=2, col='white')
  text(295, 400, res[3], cex=2, font=2, col='white')
  text(295, 335, res[4], cex=2, font=2, col='white')

  # add in the specifics 
  plot(c(100, 0), c(100, 0), type = "n", xlab="", ylab="", main = "DETAILS", xaxt='n', yaxt='n')
  text(10, 85, names(glm.cm.reduced$byClass[1]), cex=1.2, font=2)
  text(10, 70, round(as.numeric(glm.cm.reduced$byClass[1]), 3), cex=1.2)
  text(30, 85, names(glm.cm.reduced$byClass[2]), cex=1.2, font=2)
  text(30, 70, round(as.numeric(glm.cm.reduced$byClass[2]), 3), cex=1.2)
  text(50, 85, names(glm.cm.reduced$byClass[5]), cex=1.2, font=2)
  text(50, 70, round(as.numeric(glm.cm.reduced$byClass[5]), 3), cex=1.2)
  text(70, 85, names(glm.cm.reduced$byClass[6]), cex=1.2, font=2)
  text(70, 70, round(as.numeric(glm.cm.reduced$byClass[6]), 3), cex=1.2)
  text(90, 85, names(glm.cm.reduced$byClass[7]), cex=1.2, font=2)
  text(90, 70, round(as.numeric(glm.cm.reduced$byClass[7]), 3), cex=1.2)

  # add in the accuracy information 
  text(30, 35, names(glm.cm.reduced$overall[1]), cex=1.5, font=2)
  text(30, 20, round(as.numeric(glm.cm.reduced$overall[1]), 3), cex=1.4)
  text(70, 35, names(glm.cm.reduced$overall[2]), cex=1.5, font=2)
  text(70, 20, round(as.numeric(glm.cm.reduced$overall[2]), 3), cex=1.4)
}  

draw_confusion_matrix(glm.cm.reduced)

Variable importance of the reduced logistic model

varImp(glm.reduced, scale = FALSE)
## glm variable importance
## 
##   only 20 most important variables shown (out of 29)
## 
##                                 Overall
## Total_Trans_Ct                   27.025
## Total_Relationship_Count         17.825
## Total_Ct_Chng_Q4_Q1              13.046
## Contacts_Count_12_mon            12.860
## Months_Inactive_12_mon           11.921
## Total_Revolving_Bal              10.284
## GenderMale                        4.422
## Dependent_count                   3.582
## `Income_Category$40K - $60K`      2.976
## Card_CategorySilver               2.549
## Marital_StatusMarried             2.389
## Income_CategoryUnknown            2.312
## `Income_Category$60K - $80K`      2.254
## Education_LevelDoctorate          2.021
## Credit_Limit                      1.981
## Avg_Utilization_Ratio             1.968
## Months_on_book                    1.921
## `Income_CategoryLess than $40K`   1.836
## Card_CategoryGold                 1.673
## Education_LevelUnknown            1.502
plot(varImp(glm.reduced, scale = FALSE))

Full Random Forest

Fit the full random forest model

#fit full random forest
# set.seed(1)
# rf.full <- train(Attrition_Flag~., data = full.train, method = "rf", metric= "Accuracy", trControl = ctrl , tuneGrid = NULL)
# print(rf.full)
#save the rf model to an object to avoid running it again for results
# saveRDS(rf.full, "rf.full.rds")
rf.full <- readRDS("rf.full.rds")
print(rf.full)
## Random Forest 
## 
## 8102 samples
##   19 predictor
##    2 classes: 'churned', 'not_churned' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 7292, 7292, 7292, 7291, 7292, 7292, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##    2    0.9193600  0.6370257
##   17    0.9638362  0.8620252
##   32    0.9610387  0.8522121
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 17.

Confusion matrix of the full random forest model

set.seed(1)
rf.probs.full <- predict(rf.full, newdata = full.test, type = "raw")
rf.cm.full<- caret::confusionMatrix(as.factor(rf.probs.full),as.factor(full.test$Attrition_Flag), positive = "churned")
rf.cm.full
## Confusion Matrix and Statistics
## 
##              Reference
## Prediction    churned not_churned
##   churned         286          19
##   not_churned      39        1681
##                                           
##                Accuracy : 0.9714          
##                  95% CI : (0.9631, 0.9782)
##     No Information Rate : 0.8395          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.891           
##                                           
##  Mcnemar's Test P-Value : 0.0126          
##                                           
##             Sensitivity : 0.8800          
##             Specificity : 0.9888          
##          Pos Pred Value : 0.9377          
##          Neg Pred Value : 0.9773          
##              Prevalence : 0.1605          
##          Detection Rate : 0.1412          
##    Detection Prevalence : 0.1506          
##       Balanced Accuracy : 0.9344          
##                                           
##        'Positive' Class : churned         
## 

Plot of the full random forest model confusion matrix

draw_confusion_matrix <- function(rf.cm.full) {

  layout(matrix(c(1,1,2)))
  par(mar=c(2,2,2,2))
  plot(c(100, 345), c(300, 450), type = "n", xlab="", ylab="", xaxt='n', yaxt='n')
  title('Full Random Forest Confusion Matrix', cex.main=2)

  # create the matrix 
  rect(150, 430, 240, 370, col='#72a376')
  text(195, 435, 'churned', cex=1.2)
  rect(250, 430, 340, 370, col='#c0504d')
  text(295, 435, 'not churned', cex=1.2)
  text(125, 370, 'Predicted', cex=1.3, srt=90, font=2)
  text(245, 450, 'Actual', cex=1.3, font=2)
  rect(150, 305, 240, 365, col='#c0504d')
  rect(250, 305, 340, 365, col='#72a376')
  text(140, 400, 'churned', cex=1.2, srt=90)
  text(140, 335, 'not churned', cex=1.2, srt=90)

  # add in the glm.cm results 
  res <- as.numeric(rf.cm.full$table)
  text(195, 400, res[1], cex=2, font=2, col='white')
  text(195, 335, res[2], cex=2, font=2, col='white')
  text(295, 400, res[3], cex=2, font=2, col='white')
  text(295, 335, res[4], cex=2, font=2, col='white')

  # add in the specifics 
  plot(c(100, 0), c(100, 0), type = "n", xlab="", ylab="", main = "DETAILS", xaxt='n', yaxt='n')
  text(10, 85, names(rf.cm.full$byClass[1]), cex=1.2, font=2)
  text(10, 70, round(as.numeric(rf.cm.full$byClass[1]), 3), cex=1.2)
  text(30, 85, names(rf.cm.full$byClass[2]), cex=1.2, font=2)
  text(30, 70, round(as.numeric(rf.cm.full$byClass[2]), 3), cex=1.2)
  text(50, 85, names(rf.cm.full$byClass[5]), cex=1.2, font=2)
  text(50, 70, round(as.numeric(rf.cm.full$byClass[5]), 3), cex=1.2)
  text(70, 85, names(rf.cm.full$byClass[6]), cex=1.2, font=2)
  text(70, 70, round(as.numeric(rf.cm.full$byClass[6]), 3), cex=1.2)
  text(90, 85, names(rf.cm.full$byClass[7]), cex=1.2, font=2)
  text(90, 70, round(as.numeric(rf.cm.full$byClass[7]), 3), cex=1.2)

  # add in the accuracy information 
  text(30, 35, names(rf.cm.full$overall[1]), cex=1.5, font=2)
  text(30, 20, round(as.numeric(rf.cm.full$overall[1]), 3), cex=1.4)
  text(70, 35, names(rf.cm.full$overall[2]), cex=1.5, font=2)
  text(70, 20, round(as.numeric(rf.cm.full$overall[2]), 3), cex=1.4)
}  

draw_confusion_matrix(rf.cm.full)

Reduced Random Forest

Fit the reduced random forest model

#fit reduced random forest
# set.seed(1)
# rf.reduced <- train(Attrition_Flag~., data = reduced.train, method = "rf", metric= "Accuracy", trControl = ctrl , tuneGrid = NULL)
# print(rf.reduced)
#save the rf model to an object to avoid running it again for results
# saveRDS(rf.reduced, "rf.reduced.rds")
rf.reduced <- readRDS("rf.reduced.rds")
print(rf.reduced)
## Random Forest 
## 
## 8102 samples
##   16 predictor
##    2 classes: 'churned', 'not_churned' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 7292, 7292, 7292, 7291, 7292, 7292, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##    2    0.9036037  0.5432447
##   15    0.9308395  0.7238281
##   29    0.9293996  0.7205106
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 15.

Confusion matrix of the reduced random forest model

set.seed(1)
rf.probs.reduced <- predict(rf.reduced, newdata = reduced.test, type = "raw")
rf.cm.reduced<- caret::confusionMatrix(as.factor(rf.probs.reduced),as.factor(reduced.test$Attrition_Flag), positive = "churned")
rf.cm.reduced
## Confusion Matrix and Statistics
## 
##              Reference
## Prediction    churned not_churned
##   churned         231          40
##   not_churned      94        1660
##                                           
##                Accuracy : 0.9338          
##                  95% CI : (0.9221, 0.9443)
##     No Information Rate : 0.8395          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.7367          
##                                           
##  Mcnemar's Test P-Value : 4.683e-06       
##                                           
##             Sensitivity : 0.7108          
##             Specificity : 0.9765          
##          Pos Pred Value : 0.8524          
##          Neg Pred Value : 0.9464          
##              Prevalence : 0.1605          
##          Detection Rate : 0.1141          
##    Detection Prevalence : 0.1338          
##       Balanced Accuracy : 0.8436          
##                                           
##        'Positive' Class : churned         
## 

Plot of the reduced random forest model confusion matrix

draw_confusion_matrix <- function(rf.cm.reduced) {

  layout(matrix(c(1,1,2)))
  par(mar=c(2,2,2,2))
  plot(c(100, 345), c(300, 450), type = "n", xlab="", ylab="", xaxt='n', yaxt='n')
  title('Reduced Random Forest Confusion Matrix', cex.main=2)

  # create the matrix 
  rect(150, 430, 240, 370, col='#72a376')
  text(195, 435, 'churned', cex=1.2)
  rect(250, 430, 340, 370, col='#c0504d')
  text(295, 435, 'not churned', cex=1.2)
  text(125, 370, 'Predicted', cex=1.3, srt=90, font=2)
  text(245, 450, 'Actual', cex=1.3, font=2)
  rect(150, 305, 240, 365, col='#c0504d')
  rect(250, 305, 340, 365, col='#72a376')
  text(140, 400, 'churned', cex=1.2, srt=90)
  text(140, 335, 'not churned', cex=1.2, srt=90)

  # add in the glm.cm results 
  res <- as.numeric(rf.cm.reduced$table)
  text(195, 400, res[1], cex=2, font=2, col='white')
  text(195, 335, res[2], cex=2, font=2, col='white')
  text(295, 400, res[3], cex=2, font=2, col='white')
  text(295, 335, res[4], cex=2, font=2, col='white')

  # add in the specifics 
  plot(c(100, 0), c(100, 0), type = "n", xlab="", ylab="", main = "DETAILS", xaxt='n', yaxt='n')
  text(10, 85, names(rf.cm.reduced$byClass[1]), cex=1.2, font=2)
  text(10, 70, round(as.numeric(rf.cm.reduced$byClass[1]), 3), cex=1.2)
  text(30, 85, names(rf.cm.reduced$byClass[2]), cex=1.2, font=2)
  text(30, 70, round(as.numeric(rf.cm.reduced$byClass[2]), 3), cex=1.2)
  text(50, 85, names(rf.cm.reduced$byClass[5]), cex=1.2, font=2)
  text(50, 70, round(as.numeric(rf.cm.reduced$byClass[5]), 3), cex=1.2)
  text(70, 85, names(rf.cm.reduced$byClass[6]), cex=1.2, font=2)
  text(70, 70, round(as.numeric(rf.cm.reduced$byClass[6]), 3), cex=1.2)
  text(90, 85, names(rf.cm.reduced$byClass[7]), cex=1.2, font=2)
  text(90, 70, round(as.numeric(rf.cm.reduced$byClass[7]), 3), cex=1.2)

  # add in the accuracy information 
  text(30, 35, names(rf.cm.reduced$overall[1]), cex=1.5, font=2)
  text(30, 20, round(as.numeric(rf.cm.reduced$overall[1]), 3), cex=1.4)
  text(70, 35, names(rf.cm.reduced$overall[2]), cex=1.5, font=2)
  text(70, 20, round(as.numeric(rf.cm.reduced$overall[2]), 3), cex=1.4)
}  

draw_confusion_matrix(rf.cm.reduced)

Full Support Ventor Machine

Fit the full support vector machine model

#fit full support vector machine model
# set.seed(1)
# svm.full <- train(Attrition_Flag~., data = full.train, method = "svmRadial", tuneLength = 5, preProc = c("center", "scale"), metric = "ROC", trControl = ctrl)
# print(svm.full)
#save the rf model to an object to avoid running it again for results
# saveRDS(svm.full, "svm.full.rds")
svm.full <- readRDS("svm.full.rds")
print(svm.full)
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 8102 samples
##   19 predictor
##    2 classes: 'churned', 'not_churned' 
## 
## Pre-processing: centered (32), scaled (32) 
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 7292, 7292, 7292, 7291, 7292, 7292, ... 
## Resampling results across tuning parameters:
## 
##   C     ROC        Sens       Spec     
##   0.25  0.9281944  0.6041632  0.9682353
##   0.50  0.9396694  0.6241358  0.9700490
##   1.00  0.9480545  0.6538188  0.9710294
##   2.00  0.9522997  0.6640517  0.9715686
##   4.00  0.9545427  0.6773693  0.9711275
## 
## Tuning parameter 'sigma' was held constant at a value of 0.01978265
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.01978265 and C = 4.

Confusion matrix of the full support vector machine model

set.seed(1)
svm.probs.full <- predict(svm.full, newdata = full.test, type = "raw")
svm.cm.full<- caret::confusionMatrix(as.factor(svm.probs.full),as.factor(full.test$Attrition_Flag), positive = "churned")
svm.cm.full
## Confusion Matrix and Statistics
## 
##              Reference
## Prediction    churned not_churned
##   churned         230          30
##   not_churned      95        1670
##                                           
##                Accuracy : 0.9383          
##                  95% CI : (0.9269, 0.9484)
##     No Information Rate : 0.8395          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.7508          
##                                           
##  Mcnemar's Test P-Value : 1.038e-08       
##                                           
##             Sensitivity : 0.7077          
##             Specificity : 0.9824          
##          Pos Pred Value : 0.8846          
##          Neg Pred Value : 0.9462          
##              Prevalence : 0.1605          
##          Detection Rate : 0.1136          
##    Detection Prevalence : 0.1284          
##       Balanced Accuracy : 0.8450          
##                                           
##        'Positive' Class : churned         
## 

Plot of the full support vector machine confusion matrix

draw_confusion_matrix <- function(svm.cm.full) {

  layout(matrix(c(1,1,2)))
  par(mar=c(2,2,2,2))
  plot(c(100, 345), c(300, 450), type = "n", xlab="", ylab="", xaxt='n', yaxt='n')
  title('Full Support Vector Machine Matrix', cex.main=2)

  # create the matrix 
  rect(150, 430, 240, 370, col='#72a376')
  text(195, 435, 'churned', cex=1.2)
  rect(250, 430, 340, 370, col='#c0504d')
  text(295, 435, 'not churned', cex=1.2)
  text(125, 370, 'Predicted', cex=1.3, srt=90, font=2)
  text(245, 450, 'Actual', cex=1.3, font=2)
  rect(150, 305, 240, 365, col='#c0504d')
  rect(250, 305, 340, 365, col='#72a376')
  text(140, 400, 'churned', cex=1.2, srt=90)
  text(140, 335, 'not churned', cex=1.2, srt=90)

  # add in the glm.cm results 
  res <- as.numeric(svm.cm.full$table)
  text(195, 400, res[1], cex=2, font=2, col='white')
  text(195, 335, res[2], cex=2, font=2, col='white')
  text(295, 400, res[3], cex=2, font=2, col='white')
  text(295, 335, res[4], cex=2, font=2, col='white')

  # add in the specifics 
  plot(c(100, 0), c(100, 0), type = "n", xlab="", ylab="", main = "DETAILS", xaxt='n', yaxt='n')
  text(10, 85, names(svm.cm.full$byClass[1]), cex=1.2, font=2)
  text(10, 70, round(as.numeric(svm.cm.full$byClass[1]), 3), cex=1.2)
  text(30, 85, names(svm.cm.full$byClass[2]), cex=1.2, font=2)
  text(30, 70, round(as.numeric(svm.cm.full$byClass[2]), 3), cex=1.2)
  text(50, 85, names(svm.cm.full$byClass[5]), cex=1.2, font=2)
  text(50, 70, round(as.numeric(svm.cm.full$byClass[5]), 3), cex=1.2)
  text(70, 85, names(svm.cm.full$byClass[6]), cex=1.2, font=2)
  text(70, 70, round(as.numeric(svm.cm.full$byClass[6]), 3), cex=1.2)
  text(90, 85, names(svm.cm.full$byClass[7]), cex=1.2, font=2)
  text(90, 70, round(as.numeric(svm.cm.full$byClass[7]), 3), cex=1.2)

  # add in the accuracy information 
  text(30, 35, names(svm.cm.full$overall[1]), cex=1.5, font=2)
  text(30, 20, round(as.numeric(svm.cm.full$overall[1]), 3), cex=1.4)
  text(70, 35, names(svm.cm.full$overall[2]), cex=1.5, font=2)
  text(70, 20, round(as.numeric(svm.cm.full$overall[2]), 3), cex=1.4)
}  

draw_confusion_matrix(svm.cm.full)

Reduced Support Vector Machine

Fit the reduced support vector machine model

#fit reduced support vector machine
# set.seed(1)
# svm.reduced <- train(Attrition_Flag~., data = reduced.train, method = "svmRadial", tuneLength = 5, preProc = c("center", "scale"), metric = "ROC", trControl = ctrl)
# print(svm.reduced)
#save the rf model to an object to avoid running it again for results
# saveRDS(svm.reduced, "svm.reduced.rds")
svm.reduced <- readRDS("svm.reduced.rds")
print(svm.reduced)
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 8102 samples
##   16 predictor
##    2 classes: 'churned', 'not_churned' 
## 
## Pre-processing: centered (29), scaled (29) 
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 7292, 7292, 7292, 7291, 7292, 7292, ... 
## Resampling results across tuning parameters:
## 
##   C     ROC        Sens       Spec     
##   0.25  0.9146633  0.5798532  0.9692157
##   0.50  0.9211653  0.5900783  0.9705882
##   1.00  0.9270046  0.5913701  0.9718137
##   2.00  0.9315260  0.6013525  0.9718627
##   4.00  0.9342141  0.6016109  0.9716176
## 
## Tuning parameter 'sigma' was held constant at a value of 0.02197041
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.02197041 and C = 4.

Confusion matrix of the reduced support vector machine model

set.seed(1)
svm.probs.reduced <- predict(svm.reduced, newdata = reduced.test, type = "raw")
svm.cm.reduced<- caret::confusionMatrix(as.factor(svm.probs.reduced),as.factor(reduced.test$Attrition_Flag), positive = "churned")
svm.cm.reduced
## Confusion Matrix and Statistics
## 
##              Reference
## Prediction    churned not_churned
##   churned         205          36
##   not_churned     120        1664
##                                           
##                Accuracy : 0.923           
##                  95% CI : (0.9105, 0.9342)
##     No Information Rate : 0.8395          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.6807          
##                                           
##  Mcnemar's Test P-Value : 3.026e-11       
##                                           
##             Sensitivity : 0.6308          
##             Specificity : 0.9788          
##          Pos Pred Value : 0.8506          
##          Neg Pred Value : 0.9327          
##              Prevalence : 0.1605          
##          Detection Rate : 0.1012          
##    Detection Prevalence : 0.1190          
##       Balanced Accuracy : 0.8048          
##                                           
##        'Positive' Class : churned         
## 

Plot of the reduced support vector machine confusion matrix

draw_confusion_matrix <- function(svm.cm.reduced) {

  layout(matrix(c(1,1,2)))
  par(mar=c(2,2,2,2))
  plot(c(100, 345), c(300, 450), type = "n", xlab="", ylab="", xaxt='n', yaxt='n')
  title('Reduced Support Vector Machine Confusion Matrix', cex.main=2)

  # create the matrix 
  rect(150, 430, 240, 370, col='#72a376')
  text(195, 435, 'churned', cex=1.2)
  rect(250, 430, 340, 370, col='#c0504d')
  text(295, 435, 'not churned', cex=1.2)
  text(125, 370, 'Predicted', cex=1.3, srt=90, font=2)
  text(245, 450, 'Actual', cex=1.3, font=2)
  rect(150, 305, 240, 365, col='#c0504d')
  rect(250, 305, 340, 365, col='#72a376')
  text(140, 400, 'churned', cex=1.2, srt=90)
  text(140, 335, 'not churned', cex=1.2, srt=90)

  # add in the glm.cm results 
  res <- as.numeric(svm.cm.reduced$table)
  text(195, 400, res[1], cex=2, font=2, col='white')
  text(195, 335, res[2], cex=2, font=2, col='white')
  text(295, 400, res[3], cex=2, font=2, col='white')
  text(295, 335, res[4], cex=2, font=2, col='white')

  # add in the specifics 
  plot(c(100, 0), c(100, 0), type = "n", xlab="", ylab="", main = "DETAILS", xaxt='n', yaxt='n')
  text(10, 85, names(svm.cm.reduced$byClass[1]), cex=1.2, font=2)
  text(10, 70, round(as.numeric(svm.cm.reduced$byClass[1]), 3), cex=1.2)
  text(30, 85, names(svm.cm.reduced$byClass[2]), cex=1.2, font=2)
  text(30, 70, round(as.numeric(svm.cm.reduced$byClass[2]), 3), cex=1.2)
  text(50, 85, names(svm.cm.reduced$byClass[5]), cex=1.2, font=2)
  text(50, 70, round(as.numeric(svm.cm.reduced$byClass[5]), 3), cex=1.2)
  text(70, 85, names(svm.cm.reduced$byClass[6]), cex=1.2, font=2)
  text(70, 70, round(as.numeric(svm.cm.reduced$byClass[6]), 3), cex=1.2)
  text(90, 85, names(svm.cm.reduced$byClass[7]), cex=1.2, font=2)
  text(90, 70, round(as.numeric(svm.cm.reduced$byClass[7]), 3), cex=1.2)

  # add in the accuracy information 
  text(30, 35, names(svm.cm.reduced$overall[1]), cex=1.5, font=2)
  text(30, 20, round(as.numeric(svm.cm.reduced$overall[1]), 3), cex=1.4)
  text(70, 35, names(svm.cm.reduced$overall[2]), cex=1.5, font=2)
  text(70, 20, round(as.numeric(svm.cm.reduced$overall[2]), 3), cex=1.4)
}  

draw_confusion_matrix(svm.cm.reduced)