Bankchurners.R

pacman::p_load(pacman,tidyr,stringr,GGally,ggplot2,ggthemes,ggvis,
               rio,rmarkdown,shiny,plotly,psych,httr,tidyverse,lubridate,dummies,
               rpart.plot,caret,pROC,DT,rpart,randomForest,ROCR)

bank_churners<-read.csv("c:\\Users\\xholi\\Downloads\\BankChurners\\BankChurners.csv")
sum(is.na(bank_churners))

## [1] 0

#our data set has no missing values
bank_churners1<-bank_churners[,1:21] #removing the last 2 variables
Churners<-bank_churners1

#set seed and create Training and testing Data sets
str(Churners)

## 'data.frame':    10127 obs. of  21 variables:
##  $ CLIENTNUM               : int  768805383 818770008 713982108 769911858 709106358 713061558 810347208 818906208 710930508 719661558 ...
##  $ Attrition_Flag          : chr  "Existing Customer" "Existing Customer" "Existing Customer" "Existing Customer" ...
##  $ Customer_Age            : int  45 49 51 40 40 44 51 32 37 48 ...
##  $ Gender                  : chr  "M" "F" "M" "F" ...
##  $ Dependent_count         : int  3 5 3 4 3 2 4 0 3 2 ...
##  $ Education_Level         : chr  "High School" "Graduate" "Graduate" "High School" ...
##  $ Marital_Status          : chr  "Married" "Single" "Married" "Unknown" ...
##  $ Income_Category         : chr  "$60K - $80K" "Less than $40K" "$80K - $120K" "Less than $40K" ...
##  $ Card_Category           : chr  "Blue" "Blue" "Blue" "Blue" ...
##  $ Months_on_book          : int  39 44 36 34 21 36 46 27 36 36 ...
##  $ Total_Relationship_Count: int  5 6 4 3 5 3 6 2 5 6 ...
##  $ Months_Inactive_12_mon  : int  1 1 1 4 1 1 1 2 2 3 ...
##  $ Contacts_Count_12_mon   : int  3 2 0 1 0 2 3 2 0 3 ...
##  $ Credit_Limit            : num  12691 8256 3418 3313 4716 ...
##  $ Total_Revolving_Bal     : int  777 864 0 2517 0 1247 2264 1396 2517 1677 ...
##  $ Avg_Open_To_Buy         : num  11914 7392 3418 796 4716 ...
##  $ Total_Amt_Chng_Q4_Q1    : num  1.33 1.54 2.59 1.4 2.17 ...
##  $ Total_Trans_Amt         : int  1144 1291 1887 1171 816 1088 1330 1538 1350 1441 ...
##  $ Total_Trans_Ct          : int  42 33 20 20 28 24 31 36 24 32 ...
##  $ Total_Ct_Chng_Q4_Q1     : num  1.62 3.71 2.33 2.33 2.5 ...
##  $ Avg_Utilization_Ratio   : num  0.061 0.105 0 0.76 0 0.311 0.066 0.048 0.113 0.144 ...

set.seed(191)
intrain<-createDataPartition(y=Churners$Attrition_Flag,p=0.8,list = F)
training<-Churners[intrain,]
testing<-Churners[-intrain,]

names(Churners)

##  [1] "CLIENTNUM"                "Attrition_Flag"          
##  [3] "Customer_Age"             "Gender"                  
##  [5] "Dependent_count"          "Education_Level"         
##  [7] "Marital_Status"           "Income_Category"         
##  [9] "Card_Category"            "Months_on_book"          
## [11] "Total_Relationship_Count" "Months_Inactive_12_mon"  
## [13] "Contacts_Count_12_mon"    "Credit_Limit"            
## [15] "Total_Revolving_Bal"      "Avg_Open_To_Buy"         
## [17] "Total_Amt_Chng_Q4_Q1"     "Total_Trans_Amt"         
## [19] "Total_Trans_Ct"           "Total_Ct_Chng_Q4_Q1"     
## [21] "Avg_Utilization_Ratio"

#converting the variables to factors
Churners$Attrition_Flag<-as.factor(Churners$Attrition_Flag)
Churners$Gender<-as.factor(Churners$Gender)
Churners$Education_Level<-as.factor(Churners$Education_Level)
Churners$Dependent_count<-as.factor(Churners$Dependent_count)
Churners$Marital_Status<-as.factor(Churners$Marital_Status)
Churners$Income_Category<-as.factor(Churners$Income_Category)
Churners$Card_Category<-as.factor(Churners$Card_Category)


#BASIC VISUALIZATIONS OF THE DATA

#Total Attrition 
x<-Churners$Attrition_Flag
plot(x,col=rainbow(2),
     ylab="Number of Clients",
     main="Total number of Attritions")
box()

summary(x)

## Attrited Customer Existing Customer 
##              1627              8500

#Attrited Customer Existing Customer 
# 1627              8500 
#that is about 16% of the Clients left our Bank
#we are more concerned about the 16% than the 84% who are stil with us, 
#reason being, we want to know what made them leave our company and what classification group they fall in
# as to alert us to which class of individuals to directly focus our market at 
#in the hopes of reducing this number of leaving clients and thus in return 
# save the profit deficit created by the percentage of leaving clients
Churners$Attrition_Flag<-as.numeric(Churners$Attrition_Flag)
#Attrition by Gender
x<-Churners$Gender[Churners$Attrition_Flag==1]
plot(x,col=rainbow(2),
     main="Number clients who Attrited by Gender",
     xlab="Gender",
     ylab="Number of clients")
box()

summary(x)

##   F   M 
## 930 697

# F   M 
# 930 697 
#It becomes clear that most attrited clients were females and the percentage is 57.16%
# the percentage for men is 42.84%

#Attrition by number of Dependents
x<-Churners$Dependent_count[Churners$Attrition_Flag==1]
plot(x,col=rainbow(6),
     main="Number of Attrited Client by Number of Dependents",
     ylab="Number of Clients",
     xlab="Number of Dependents")
box()

summary(x)

##   0   1   2   3   4   5 
## 135 269 417 482 260  64

#  0   1   2   3   4   5 
# 135 269 417 482 260  64

#its clear that most attrited clients had 3 dependents followed by those with 2 and 4
# and since we saw that by gender females attrited the most
# thus its safe to assume that this were females mostly with 3 dependents,2 and 4 respectively
#clients with 5 dependents attrited the least followed by those with no dependents

#Attrition by Education level
x<-Churners$Education_Level[Churners$Attrition_Flag==1]
plot(x,col=rainbow(7),
     xlab="Education Level",
     ylab="Number of Cliets",
     main="Attrited Clients by Education Level")
box()

summary(x)

##       College     Doctorate      Graduate   High School Post-Graduate 
##           154            95           487           306            92 
##    Uneducated       Unknown 
##           237           256

#  College     Doctorate      Graduate   High School Post-Graduate    Uneducated 
# 154            95           487           306            92           237 
# Unknown 
# 256 

#when comparing attrited clients by Education, it becomes clear that Graduates attrited the most followed by
#High school  ,then Unknown and Uneducated individuals.
# clients with education level Post-Grad attrited the least follwed by Doctorate
# we will focus on Under grads and High school students since they contribute the largest in terms of propotion

#Attrition by Marital Status
x<-Churners$Marital_Status[Churners$Attrition_Flag==1]
plot(x,col=rainbow(4),
     main="Attrited Clients by Marital status",
     ylab="Number of Clients",
     xlab="Marital Status")
box()

summary(x)

## Divorced  Married   Single  Unknown 
##      121      709      668      129

#Divorced  Married   Single  Unknown 
# 121      709      668      129
#the data implies that most of the attrited clients were married followed by those who are single
# we can assume from the above info that it was mostly females,who are mostly Under grads or High school students
#with 3 or 2 Dependents and are married or single.
str(Churners)

## 'data.frame':    10127 obs. of  21 variables:
##  $ CLIENTNUM               : int  768805383 818770008 713982108 769911858 709106358 713061558 810347208 818906208 710930508 719661558 ...
##  $ Attrition_Flag          : num  2 2 2 2 2 2 2 2 2 2 ...
##  $ Customer_Age            : int  45 49 51 40 40 44 51 32 37 48 ...
##  $ Gender                  : Factor w/ 2 levels "F","M": 2 1 2 1 2 2 2 2 2 2 ...
##  $ Dependent_count         : Factor w/ 6 levels "0","1","2","3",..: 4 6 4 5 4 3 5 1 4 3 ...
##  $ Education_Level         : Factor w/ 7 levels "College","Doctorate",..: 4 3 3 4 6 3 7 4 6 3 ...
##  $ Marital_Status          : Factor w/ 4 levels "Divorced","Married",..: 2 3 2 4 2 2 2 4 3 3 ...
##  $ Income_Category         : Factor w/ 6 levels "$120K +","$40K - $60K",..: 3 5 4 5 3 2 1 3 3 4 ...
##  $ Card_Category           : Factor w/ 4 levels "Blue","Gold",..: 1 1 1 1 1 1 2 4 1 1 ...
##  $ Months_on_book          : int  39 44 36 34 21 36 46 27 36 36 ...
##  $ Total_Relationship_Count: int  5 6 4 3 5 3 6 2 5 6 ...
##  $ Months_Inactive_12_mon  : int  1 1 1 4 1 1 1 2 2 3 ...
##  $ Contacts_Count_12_mon   : int  3 2 0 1 0 2 3 2 0 3 ...
##  $ Credit_Limit            : num  12691 8256 3418 3313 4716 ...
##  $ Total_Revolving_Bal     : int  777 864 0 2517 0 1247 2264 1396 2517 1677 ...
##  $ Avg_Open_To_Buy         : num  11914 7392 3418 796 4716 ...
##  $ Total_Amt_Chng_Q4_Q1    : num  1.33 1.54 2.59 1.4 2.17 ...
##  $ Total_Trans_Amt         : int  1144 1291 1887 1171 816 1088 1330 1538 1350 1441 ...
##  $ Total_Trans_Ct          : int  42 33 20 20 28 24 31 36 24 32 ...
##  $ Total_Ct_Chng_Q4_Q1     : num  1.62 3.71 2.33 2.33 2.5 ...
##  $ Avg_Utilization_Ratio   : num  0.061 0.105 0 0.76 0 0.311 0.066 0.048 0.113 0.144 ...

#Attrition by Income Category
x<-Churners$Income_Category[Churners$Attrition_Flag==1]
plot(x,col=rainbow(6),xlab="Income Category",
     ylab="Number of Clients",
     main="Attrited Clients by Income Cat")
box()

summary(x)

##        $120K +    $40K - $60K    $60K - $80K   $80K - $120K Less than $40K 
##            126            271            189            242            612 
##        Unknown 
##            187

#($120K +)    ($40K - $60K)    ($60K - $80K)   ($80K - $120K) (Less than $40K) 
# 126            271            189            242            612 
# (Unknown) 
# 187 

#in terms of income category, most attrited clients are in the Less than $40K category
#followed by those who earn between $40K-$60k then $80k-$120k and those who are in the category
# $60k-$80k.
# the ones that attrited the least earn around $120k +

#Attrition by Card Category
x<-Churners$Card_Category[Churners$Attrition_Flag==1]
plot(x,col=rainbow(4),
     xlab="Card Category",
     ylab="Number of Clients",
     main="Attrited Clients by Card category")
box()

#as expected the blue card holders attrited the most since most attritions were from the income rage of 
#less tha $40k .
summary(x)

##     Blue     Gold Platinum   Silver 
##     1519       21        5       82

# Blue     Gold Platinum   Silver 
# 1519       21        5       82 

#Attrition by Months on book
x<-Churners$Months_on_book[Churners$Attrition_Flag==1]
boxplot(x,horizontal = T,col="purple",main="Box plot of attrited clients by Month on books")

summary(x)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   13.00   32.00   36.00   36.18   40.00   56.00

#the average Month on book for Clients who Attrited is 36.18 months with a min of 13 months and a maz of 56
#with a few outliers


#MODELLING

#Model1
#STEPWISE LOGISTIC REGRESSION
md_glm<-glm(as.factor(Attrition_Flag)~.,data = training,
            family = binomial(logit))
md_step<-step(md_glm,direction = "both",
              steps = 10000,
              trace = F)
prediction_step<-predict(md_step,testing,type="response")

#to decide the cut off point
pred_step<-prediction(prediction_step,testing$Attrition_Flag)
plot(performance(pred_step,"tpr","fpr"),colorize=T)

auc_step<-performance(pred_step,"auc")
auc_step

## A performance instance
##   'Area under the ROC curve'

roc_step<-roc(response=testing$Attrition_Flag,predictor=prediction_step)

## Setting levels: control = Attrited Customer, case = Existing Customer

## Setting direction: controls < cases

d<-coords(roc_step,"best","threshold",transpose=T)
d

##   threshold specificity sensitivity 
##   0.7813074   0.7907692   0.8994118

#threshold specificity sensitivity 
# 0.7813074   0.7907692   0.8994118 

#We find that the best cut off point to distinguish the class probabilities into 1 and 2 is 
# 0.7813
testing$Attrition_Flag<-as.character(testing$Attrition_Flag)
#forming confusion matrix from the decided cutoff point
Attrition_step<-if_else(prediction_step>=d[[1]],"Existing Customer","Attrited Customer")
cm_1<-confusionMatrix(table(testing$Attrition_Flag,Attrition_step))
str(testing$Attrition_Flag)

##  chr [1:2025] "Existing Customer" "Existing Customer" "Existing Customer" ...

str(Attrition_step)

##  chr [1:2025] "Existing Customer" "Existing Customer" "Existing Customer" ...

cm_1

## Confusion Matrix and Statistics
## 
##                    Attrition_step
##                     Attrited Customer Existing Customer
##   Attrited Customer               257                68
##   Existing Customer               171              1529
##                                            
##                Accuracy : 0.882            
##                  95% CI : (0.8671, 0.8957) 
##     No Information Rate : 0.7886           
##     P-Value [Acc > NIR] : < 2.2e-16        
##                                            
##                   Kappa : 0.6118           
##                                            
##  Mcnemar's Test P-Value : 4.172e-11        
##                                            
##             Sensitivity : 0.6005           
##             Specificity : 0.9574           
##          Pos Pred Value : 0.7908           
##          Neg Pred Value : 0.8994           
##              Prevalence : 0.2114           
##          Detection Rate : 0.1269           
##    Detection Prevalence : 0.1605           
##       Balanced Accuracy : 0.7789           
##                                            
##        'Positive' Class : Attrited Customer
##

# cm_1 --------------------------------------------------------------------


#MODEL 2
#RANDOM FOREST MODELLING
tc_rf<-trainControl(method = "repeatedcv",
                    repeats = 10,
                    number = 3,search = "random")
rf_train<-train(Attrition_Flag~.,data=training,
                method="rf",
                trainControl=tc_rf)
plot(varImp(rf_train,scale = F))

testing$Attrition_Flag<-as.factor(testing$Attrition_Flag)
predict_rftrain<-predict(rf_train,testing)
cm_2<-confusionMatrix(predict_rftrain,testing$Attrition_Flag)
cm_2

## Confusion Matrix and Statistics
## 
##                    Reference
## Prediction          Attrited Customer Existing Customer
##   Attrited Customer               274                27
##   Existing Customer                51              1673
##                                            
##                Accuracy : 0.9615           
##                  95% CI : (0.9522, 0.9694) 
##     No Information Rate : 0.8395           
##     P-Value [Acc > NIR] : < 2.2e-16        
##                                            
##                   Kappa : 0.8527           
##                                            
##  Mcnemar's Test P-Value : 0.009208         
##                                            
##             Sensitivity : 0.8431           
##             Specificity : 0.9841           
##          Pos Pred Value : 0.9103           
##          Neg Pred Value : 0.9704           
##              Prevalence : 0.1605           
##          Detection Rate : 0.1353           
##    Detection Prevalence : 0.1486           
##       Balanced Accuracy : 0.9136           
##                                            
##        'Positive' Class : Attrited Customer
##

#MODEL 3
#DECISION TREES-RPART
rp<-rpart(Attrition_Flag~.,data=training,
          method = "class")
predictions_rp<-predict(rp,testing,type = "class")
rpart.plot(rp,type = 2,
           extra = 100,
           branch.lty=1,
           box.palette = "RdYlGn",
           tweak = 1.5,
           fallen.leaves = F)

cm_rpart<-confusionMatrix(predictions_rp,testing$Attrition_Flag)
cm_rpart

## Confusion Matrix and Statistics
## 
##                    Reference
## Prediction          Attrited Customer Existing Customer
##   Attrited Customer               233                52
##   Existing Customer                92              1648
##                                            
##                Accuracy : 0.9289           
##                  95% CI : (0.9168, 0.9397) 
##     No Information Rate : 0.8395           
##     P-Value [Acc > NIR] : < 2.2e-16        
##                                            
##                   Kappa : 0.7223           
##                                            
##  Mcnemar's Test P-Value : 0.001154         
##                                            
##             Sensitivity : 0.7169           
##             Specificity : 0.9694           
##          Pos Pred Value : 0.8175           
##          Neg Pred Value : 0.9471           
##              Prevalence : 0.1605           
##          Detection Rate : 0.1151           
##    Detection Prevalence : 0.1407           
##       Balanced Accuracy : 0.8432           
##                                            
##        'Positive' Class : Attrited Customer
##

#model 2 is the best among the three, since it has 
#accuracy of 0.9615 and Specificity of =0.9841
# model 3 has accuracy=0.9289 and Specificity=0.9694
#while model1 has accuracy=0.882  and Specificity= 0.9574
                                      #ACCURACY SPECIFICITY SENSITIVITY
#MODEL 1: STEPWISE LOGISTIC REGRESSION- 0.882     0.9574     0.6005  
#MODEL 2: RANDOM FOREST               - 0.9615    0.9894     0.8431
#MODEL 3: DECISION TREES, RPART       - 0.9289    0.9694     0.7169

Bankchurners.R

xholi

2021-01-28