pacman::p_load(pacman,tidyr,stringr,GGally,ggplot2,ggthemes,ggvis,
rio,rmarkdown,shiny,plotly,psych,httr,tidyverse,lubridate,dummies,
rpart.plot,caret,pROC,DT,rpart,randomForest,ROCR)
bank_churners<-read.csv("c:\\Users\\xholi\\Downloads\\BankChurners\\BankChurners.csv")
sum(is.na(bank_churners))
## [1] 0
#our data set has no missing values
bank_churners1<-bank_churners[,1:21] #removing the last 2 variables
Churners<-bank_churners1
#set seed and create Training and testing Data sets
str(Churners)
## 'data.frame': 10127 obs. of 21 variables:
## $ CLIENTNUM : int 768805383 818770008 713982108 769911858 709106358 713061558 810347208 818906208 710930508 719661558 ...
## $ Attrition_Flag : chr "Existing Customer" "Existing Customer" "Existing Customer" "Existing Customer" ...
## $ Customer_Age : int 45 49 51 40 40 44 51 32 37 48 ...
## $ Gender : chr "M" "F" "M" "F" ...
## $ Dependent_count : int 3 5 3 4 3 2 4 0 3 2 ...
## $ Education_Level : chr "High School" "Graduate" "Graduate" "High School" ...
## $ Marital_Status : chr "Married" "Single" "Married" "Unknown" ...
## $ Income_Category : chr "$60K - $80K" "Less than $40K" "$80K - $120K" "Less than $40K" ...
## $ Card_Category : chr "Blue" "Blue" "Blue" "Blue" ...
## $ Months_on_book : int 39 44 36 34 21 36 46 27 36 36 ...
## $ Total_Relationship_Count: int 5 6 4 3 5 3 6 2 5 6 ...
## $ Months_Inactive_12_mon : int 1 1 1 4 1 1 1 2 2 3 ...
## $ Contacts_Count_12_mon : int 3 2 0 1 0 2 3 2 0 3 ...
## $ Credit_Limit : num 12691 8256 3418 3313 4716 ...
## $ Total_Revolving_Bal : int 777 864 0 2517 0 1247 2264 1396 2517 1677 ...
## $ Avg_Open_To_Buy : num 11914 7392 3418 796 4716 ...
## $ Total_Amt_Chng_Q4_Q1 : num 1.33 1.54 2.59 1.4 2.17 ...
## $ Total_Trans_Amt : int 1144 1291 1887 1171 816 1088 1330 1538 1350 1441 ...
## $ Total_Trans_Ct : int 42 33 20 20 28 24 31 36 24 32 ...
## $ Total_Ct_Chng_Q4_Q1 : num 1.62 3.71 2.33 2.33 2.5 ...
## $ Avg_Utilization_Ratio : num 0.061 0.105 0 0.76 0 0.311 0.066 0.048 0.113 0.144 ...
set.seed(191)
intrain<-createDataPartition(y=Churners$Attrition_Flag,p=0.8,list = F)
training<-Churners[intrain,]
testing<-Churners[-intrain,]
names(Churners)
## [1] "CLIENTNUM" "Attrition_Flag"
## [3] "Customer_Age" "Gender"
## [5] "Dependent_count" "Education_Level"
## [7] "Marital_Status" "Income_Category"
## [9] "Card_Category" "Months_on_book"
## [11] "Total_Relationship_Count" "Months_Inactive_12_mon"
## [13] "Contacts_Count_12_mon" "Credit_Limit"
## [15] "Total_Revolving_Bal" "Avg_Open_To_Buy"
## [17] "Total_Amt_Chng_Q4_Q1" "Total_Trans_Amt"
## [19] "Total_Trans_Ct" "Total_Ct_Chng_Q4_Q1"
## [21] "Avg_Utilization_Ratio"
#converting the variables to factors
Churners$Attrition_Flag<-as.factor(Churners$Attrition_Flag)
Churners$Gender<-as.factor(Churners$Gender)
Churners$Education_Level<-as.factor(Churners$Education_Level)
Churners$Dependent_count<-as.factor(Churners$Dependent_count)
Churners$Marital_Status<-as.factor(Churners$Marital_Status)
Churners$Income_Category<-as.factor(Churners$Income_Category)
Churners$Card_Category<-as.factor(Churners$Card_Category)
#BASIC VISUALIZATIONS OF THE DATA
#Total Attrition
x<-Churners$Attrition_Flag
plot(x,col=rainbow(2),
ylab="Number of Clients",
main="Total number of Attritions")
box()

summary(x)
## Attrited Customer Existing Customer
## 1627 8500
#Attrited Customer Existing Customer
# 1627 8500
#that is about 16% of the Clients left our Bank
#we are more concerned about the 16% than the 84% who are stil with us,
#reason being, we want to know what made them leave our company and what classification group they fall in
# as to alert us to which class of individuals to directly focus our market at
#in the hopes of reducing this number of leaving clients and thus in return
# save the profit deficit created by the percentage of leaving clients
Churners$Attrition_Flag<-as.numeric(Churners$Attrition_Flag)
#Attrition by Gender
x<-Churners$Gender[Churners$Attrition_Flag==1]
plot(x,col=rainbow(2),
main="Number clients who Attrited by Gender",
xlab="Gender",
ylab="Number of clients")
box()

summary(x)
## F M
## 930 697
# F M
# 930 697
#It becomes clear that most attrited clients were females and the percentage is 57.16%
# the percentage for men is 42.84%
#Attrition by number of Dependents
x<-Churners$Dependent_count[Churners$Attrition_Flag==1]
plot(x,col=rainbow(6),
main="Number of Attrited Client by Number of Dependents",
ylab="Number of Clients",
xlab="Number of Dependents")
box()

summary(x)
## 0 1 2 3 4 5
## 135 269 417 482 260 64
# 0 1 2 3 4 5
# 135 269 417 482 260 64
#its clear that most attrited clients had 3 dependents followed by those with 2 and 4
# and since we saw that by gender females attrited the most
# thus its safe to assume that this were females mostly with 3 dependents,2 and 4 respectively
#clients with 5 dependents attrited the least followed by those with no dependents
#Attrition by Education level
x<-Churners$Education_Level[Churners$Attrition_Flag==1]
plot(x,col=rainbow(7),
xlab="Education Level",
ylab="Number of Cliets",
main="Attrited Clients by Education Level")
box()

summary(x)
## College Doctorate Graduate High School Post-Graduate
## 154 95 487 306 92
## Uneducated Unknown
## 237 256
# College Doctorate Graduate High School Post-Graduate Uneducated
# 154 95 487 306 92 237
# Unknown
# 256
#when comparing attrited clients by Education, it becomes clear that Graduates attrited the most followed by
#High school ,then Unknown and Uneducated individuals.
# clients with education level Post-Grad attrited the least follwed by Doctorate
# we will focus on Under grads and High school students since they contribute the largest in terms of propotion
#Attrition by Marital Status
x<-Churners$Marital_Status[Churners$Attrition_Flag==1]
plot(x,col=rainbow(4),
main="Attrited Clients by Marital status",
ylab="Number of Clients",
xlab="Marital Status")
box()

summary(x)
## Divorced Married Single Unknown
## 121 709 668 129
#Divorced Married Single Unknown
# 121 709 668 129
#the data implies that most of the attrited clients were married followed by those who are single
# we can assume from the above info that it was mostly females,who are mostly Under grads or High school students
#with 3 or 2 Dependents and are married or single.
str(Churners)
## 'data.frame': 10127 obs. of 21 variables:
## $ CLIENTNUM : int 768805383 818770008 713982108 769911858 709106358 713061558 810347208 818906208 710930508 719661558 ...
## $ Attrition_Flag : num 2 2 2 2 2 2 2 2 2 2 ...
## $ Customer_Age : int 45 49 51 40 40 44 51 32 37 48 ...
## $ Gender : Factor w/ 2 levels "F","M": 2 1 2 1 2 2 2 2 2 2 ...
## $ Dependent_count : Factor w/ 6 levels "0","1","2","3",..: 4 6 4 5 4 3 5 1 4 3 ...
## $ Education_Level : Factor w/ 7 levels "College","Doctorate",..: 4 3 3 4 6 3 7 4 6 3 ...
## $ Marital_Status : Factor w/ 4 levels "Divorced","Married",..: 2 3 2 4 2 2 2 4 3 3 ...
## $ Income_Category : Factor w/ 6 levels "$120K +","$40K - $60K",..: 3 5 4 5 3 2 1 3 3 4 ...
## $ Card_Category : Factor w/ 4 levels "Blue","Gold",..: 1 1 1 1 1 1 2 4 1 1 ...
## $ Months_on_book : int 39 44 36 34 21 36 46 27 36 36 ...
## $ Total_Relationship_Count: int 5 6 4 3 5 3 6 2 5 6 ...
## $ Months_Inactive_12_mon : int 1 1 1 4 1 1 1 2 2 3 ...
## $ Contacts_Count_12_mon : int 3 2 0 1 0 2 3 2 0 3 ...
## $ Credit_Limit : num 12691 8256 3418 3313 4716 ...
## $ Total_Revolving_Bal : int 777 864 0 2517 0 1247 2264 1396 2517 1677 ...
## $ Avg_Open_To_Buy : num 11914 7392 3418 796 4716 ...
## $ Total_Amt_Chng_Q4_Q1 : num 1.33 1.54 2.59 1.4 2.17 ...
## $ Total_Trans_Amt : int 1144 1291 1887 1171 816 1088 1330 1538 1350 1441 ...
## $ Total_Trans_Ct : int 42 33 20 20 28 24 31 36 24 32 ...
## $ Total_Ct_Chng_Q4_Q1 : num 1.62 3.71 2.33 2.33 2.5 ...
## $ Avg_Utilization_Ratio : num 0.061 0.105 0 0.76 0 0.311 0.066 0.048 0.113 0.144 ...
#Attrition by Income Category
x<-Churners$Income_Category[Churners$Attrition_Flag==1]
plot(x,col=rainbow(6),xlab="Income Category",
ylab="Number of Clients",
main="Attrited Clients by Income Cat")
box()

summary(x)
## $120K + $40K - $60K $60K - $80K $80K - $120K Less than $40K
## 126 271 189 242 612
## Unknown
## 187
#($120K +) ($40K - $60K) ($60K - $80K) ($80K - $120K) (Less than $40K)
# 126 271 189 242 612
# (Unknown)
# 187
#in terms of income category, most attrited clients are in the Less than $40K category
#followed by those who earn between $40K-$60k then $80k-$120k and those who are in the category
# $60k-$80k.
# the ones that attrited the least earn around $120k +
#Attrition by Card Category
x<-Churners$Card_Category[Churners$Attrition_Flag==1]
plot(x,col=rainbow(4),
xlab="Card Category",
ylab="Number of Clients",
main="Attrited Clients by Card category")
box()

#as expected the blue card holders attrited the most since most attritions were from the income rage of
#less tha $40k .
summary(x)
## Blue Gold Platinum Silver
## 1519 21 5 82
# Blue Gold Platinum Silver
# 1519 21 5 82
#Attrition by Months on book
x<-Churners$Months_on_book[Churners$Attrition_Flag==1]
boxplot(x,horizontal = T,col="purple",main="Box plot of attrited clients by Month on books")

summary(x)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 13.00 32.00 36.00 36.18 40.00 56.00
#the average Month on book for Clients who Attrited is 36.18 months with a min of 13 months and a maz of 56
#with a few outliers
#MODELLING
#Model1
#STEPWISE LOGISTIC REGRESSION
md_glm<-glm(as.factor(Attrition_Flag)~.,data = training,
family = binomial(logit))
md_step<-step(md_glm,direction = "both",
steps = 10000,
trace = F)
prediction_step<-predict(md_step,testing,type="response")
#to decide the cut off point
pred_step<-prediction(prediction_step,testing$Attrition_Flag)
plot(performance(pred_step,"tpr","fpr"),colorize=T)

auc_step<-performance(pred_step,"auc")
auc_step
## A performance instance
## 'Area under the ROC curve'
roc_step<-roc(response=testing$Attrition_Flag,predictor=prediction_step)
## Setting levels: control = Attrited Customer, case = Existing Customer
## Setting direction: controls < cases
d<-coords(roc_step,"best","threshold",transpose=T)
d
## threshold specificity sensitivity
## 0.7813074 0.7907692 0.8994118
#threshold specificity sensitivity
# 0.7813074 0.7907692 0.8994118
#We find that the best cut off point to distinguish the class probabilities into 1 and 2 is
# 0.7813
testing$Attrition_Flag<-as.character(testing$Attrition_Flag)
#forming confusion matrix from the decided cutoff point
Attrition_step<-if_else(prediction_step>=d[[1]],"Existing Customer","Attrited Customer")
cm_1<-confusionMatrix(table(testing$Attrition_Flag,Attrition_step))
str(testing$Attrition_Flag)
## chr [1:2025] "Existing Customer" "Existing Customer" "Existing Customer" ...
str(Attrition_step)
## chr [1:2025] "Existing Customer" "Existing Customer" "Existing Customer" ...
cm_1
## Confusion Matrix and Statistics
##
## Attrition_step
## Attrited Customer Existing Customer
## Attrited Customer 257 68
## Existing Customer 171 1529
##
## Accuracy : 0.882
## 95% CI : (0.8671, 0.8957)
## No Information Rate : 0.7886
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6118
##
## Mcnemar's Test P-Value : 4.172e-11
##
## Sensitivity : 0.6005
## Specificity : 0.9574
## Pos Pred Value : 0.7908
## Neg Pred Value : 0.8994
## Prevalence : 0.2114
## Detection Rate : 0.1269
## Detection Prevalence : 0.1605
## Balanced Accuracy : 0.7789
##
## 'Positive' Class : Attrited Customer
##
# cm_1 --------------------------------------------------------------------
#MODEL 2
#RANDOM FOREST MODELLING
tc_rf<-trainControl(method = "repeatedcv",
repeats = 10,
number = 3,search = "random")
rf_train<-train(Attrition_Flag~.,data=training,
method="rf",
trainControl=tc_rf)
plot(varImp(rf_train,scale = F))

testing$Attrition_Flag<-as.factor(testing$Attrition_Flag)
predict_rftrain<-predict(rf_train,testing)
cm_2<-confusionMatrix(predict_rftrain,testing$Attrition_Flag)
cm_2
## Confusion Matrix and Statistics
##
## Reference
## Prediction Attrited Customer Existing Customer
## Attrited Customer 274 27
## Existing Customer 51 1673
##
## Accuracy : 0.9615
## 95% CI : (0.9522, 0.9694)
## No Information Rate : 0.8395
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8527
##
## Mcnemar's Test P-Value : 0.009208
##
## Sensitivity : 0.8431
## Specificity : 0.9841
## Pos Pred Value : 0.9103
## Neg Pred Value : 0.9704
## Prevalence : 0.1605
## Detection Rate : 0.1353
## Detection Prevalence : 0.1486
## Balanced Accuracy : 0.9136
##
## 'Positive' Class : Attrited Customer
##
#MODEL 3
#DECISION TREES-RPART
rp<-rpart(Attrition_Flag~.,data=training,
method = "class")
predictions_rp<-predict(rp,testing,type = "class")
rpart.plot(rp,type = 2,
extra = 100,
branch.lty=1,
box.palette = "RdYlGn",
tweak = 1.5,
fallen.leaves = F)

cm_rpart<-confusionMatrix(predictions_rp,testing$Attrition_Flag)
cm_rpart
## Confusion Matrix and Statistics
##
## Reference
## Prediction Attrited Customer Existing Customer
## Attrited Customer 233 52
## Existing Customer 92 1648
##
## Accuracy : 0.9289
## 95% CI : (0.9168, 0.9397)
## No Information Rate : 0.8395
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7223
##
## Mcnemar's Test P-Value : 0.001154
##
## Sensitivity : 0.7169
## Specificity : 0.9694
## Pos Pred Value : 0.8175
## Neg Pred Value : 0.9471
## Prevalence : 0.1605
## Detection Rate : 0.1151
## Detection Prevalence : 0.1407
## Balanced Accuracy : 0.8432
##
## 'Positive' Class : Attrited Customer
##
#model 2 is the best among the three, since it has
#accuracy of 0.9615 and Specificity of =0.9841
# model 3 has accuracy=0.9289 and Specificity=0.9694
#while model1 has accuracy=0.882 and Specificity= 0.9574
#ACCURACY SPECIFICITY SENSITIVITY
#MODEL 1: STEPWISE LOGISTIC REGRESSION- 0.882 0.9574 0.6005
#MODEL 2: RANDOM FOREST - 0.9615 0.9894 0.8431
#MODEL 3: DECISION TREES, RPART - 0.9289 0.9694 0.7169