Bank Marketing Analysis

ARJUN SIROHI - MS Business Analytics

2022-07-22

About the Data

Methodology

ML-Classification Models

Exploratory Data Analysis

# Tabulate how many customers fall into each job category and draw the counts
# as horizontal bars, ordered by frequency.
job_tab <- data.frame(table(bankdata$job))
colnames(job_tab) <- c("Job", "count")
ggplot(data = job_tab, aes(x = count, y = reorder(Job, count), fill = Job)) +
  geom_bar(stat = "identity") +
  # Aesthetic names are case-sensitive: it has to be `x` (not `X`) for the
  # x-axis title to actually be suppressed.
  labs(x = NULL,
       y = NULL,
       title = "Customers Job Distribution") +
  theme_pander()

Response of the recent campaign

Response Percentage(%)
No 88.30
Yes 11.70

Response from previous campaign

# Cross-tabulate the previous campaign outcome against the current response,
# then show the counts as side-by-side bars (one colour per response).
p_out_tab <- setNames(
  data.frame(table(bankdata$poutcome, bankdata$y)),
  c("PreviousOutcome", "Response", "Count")
)
ggplot(p_out_tab, aes(x = PreviousOutcome, y = Count, fill = Response)) +
  geom_col(position = "dodge") +
  scale_fill_manual(values = c("darkorange", "dodgerblue4")) +
  labs(title = " Contact-Response Outcome") +
  theme_pander()

Response from Each Occupation

# Count customers per (job, response) pair and plot them as dodged horizontal
# bars so the two responses can be compared within every job.
job_y_tab <- data.frame(table(bankdata$job, bankdata$y))
colnames(job_y_tab) <- c("job", "Response", "count")
ggplot(data = job_y_tab,
       aes(x = count, y = reorder(job, count), fill = Response)) +
  geom_bar(stat = "identity", position = "dodge") +
  # `x` must be lower case; a capital `X` never reaches the x axis and the
  # default title "count" would be shown instead.
  labs(x = "Number of customers",
       y = NULL,
       title = "Campaign result by Job distribution") +
  theme_pander() +
  scale_fill_manual(values = c("yellowgreen", "gray25"))

Customer jobs and Median balance distribution

# One horizontal box per job showing the spread of account balances;
# the job axis carries no title.
ggplot(data = bankdata, mapping = aes(y = job, x = balance)) +
  geom_boxplot(color = "red3", fill = "yellow2") +
  theme_pander() +
  labs(y = NULL)

Customer Loan Status

# Bar chart of how many customers fall into each level of `loan`.
ggplot(data = bankdata, aes(x = loan, fill = loan)) +
  geom_bar(position = "dodge") +
  # The bars are vertical, so the customer count lives on the y axis. The
  # previous `X = ...` (capital) was ignored and `y = NULL` removed the only
  # count label; label the y axis instead and keep the default x title.
  labs(y = "Number of customers") +
  theme_pander() +
  scale_fill_manual(values = c("orange", "gray25"))

Loan Status & Customer response

# Count customers per (loan, response) pair and plot the counts as dodged
# vertical bars.
loan_tab <- data.frame(table(bankdata$loan, bankdata$y))
colnames(loan_tab) <- c("Loan", "Response", "Count")
ggplot(data = loan_tab, aes(x = Loan, y = Count, fill = Response)) +
  geom_bar(stat = "identity", position = "dodge") +
  # Counts are mapped to y here, so that is the axis that should read
  # "Number of customers"; the former `X = ...` (capital) had no effect and
  # `y = NULL` blanked the count axis.
  labs(y = "Number of customers") +
  theme_pander() +
  scale_fill_manual(values = c("orange", "red3"))

Logistic Regression Performance

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# repeatable between runs.
set.seed(22)
inTrain <- createDataPartition(y = bankdata$y, p = 0.8, list = FALSE)
bank_test <- bankdata[-inTrain, ]
bank_train <- bankdata[inTrain, ]

## Resampling setup handed to caret::train(): 10-fold cross-validation with
## class probabilities enabled.

ctrl <- trainControl(
  classProbs = TRUE,
  method = "cv",
  number = 10
)

## Logistic regression: fit `y` on every other column via caret's "glm" method,
## resampled according to `ctrl`.
log_fit <- train(
  y ~ .,
  data = bank_train,
  method = "glm",
  trControl = ctrl
)
#### Score the held-out rows and compare against the true labels:
pred_log <- predict(log_fit, newdata = bank_test)
confusionMatrix(data = pred_log, reference = bank_test$y)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  7805  698
##        yes  179  359
##                                          
##                Accuracy : 0.903          
##                  95% CI : (0.8967, 0.909)
##     No Information Rate : 0.8831         
##     P-Value [Acc > NIR] : 8.351e-10      
##                                          
##                   Kappa : 0.4031         
##                                          
##  Mcnemar's Test P-Value : < 2.2e-16      
##                                          
##             Sensitivity : 0.9776         
##             Specificity : 0.3396         
##          Pos Pred Value : 0.9179         
##          Neg Pred Value : 0.6673         
##              Prevalence : 0.8831         
##          Detection Rate : 0.8633         
##    Detection Prevalence : 0.9405         
##       Balanced Accuracy : 0.6586         
##                                          
##        'Positive' Class : no             
## 

Important Variables

# Variable importance for the logistic model, reported on its raw
# (unscaled) values and plotted.
# NOTE: the former `competes = FALSE` argument was dropped. It is an option of
# the rpart importance method and is silently ignored for a "glm" fit, so it
# only suggested a setting that was never applied.
log_imp <- varImp(log_fit, scale = FALSE)
plot(log_imp)

Adaptive-Boosting Model

# Boosted classifier on the training rows: discrete AdaBoost with exponential
# loss, run for 50 iterations.
model_ada <- ada(
  y ~ .,
  data = bank_train,
  type = "discrete",
  loss = "exponential",
  iter = 50
)
## Variable-importance plot for the boosted model
varplot(model_ada)

Adaptive-Boosting Performance

### Predict on the held-out rows and evaluate against the true labels.
pred_ada <- predict(model_ada, bank_test)
# confusionMatrix() expects (data = predictions, reference = truth). The
# arguments used to be passed the other way round, which transposed the table
# and made Sensitivity/Specificity, PPV/NPV, Prevalence and the No Information
# Rate incomparable with the logistic-regression results.
# NOTE(review): any printed output captured with the old call (e.g. Prevalence
# 0.9361) is stale and should be regenerated.
truth_ada <- as.factor(bank_test$y)
# Align the prediction levels with the reference so the call also works if one
# class is never predicted.
confusionMatrix(
  data = factor(pred_ada, levels = levels(truth_ada)),
  reference = truth_ada
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  7763  221
##        yes  700  357
##                                           
##                Accuracy : 0.8981          
##                  95% CI : (0.8917, 0.9043)
##     No Information Rate : 0.9361          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.3859          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.9173          
##             Specificity : 0.6176          
##          Pos Pred Value : 0.9723          
##          Neg Pred Value : 0.3377          
##              Prevalence : 0.9361          
##          Detection Rate : 0.8586          
##    Detection Prevalence : 0.8831          
##       Balanced Accuracy : 0.7675          
##                                           
##        'Positive' Class : no              
## 

Performance Analysis & Conclusion