#Goal : predict behavior to retain customers

#The data set includes information about:
  
#The column "Existed" is customers who left Bank.

#data source https://www.kaggle.com/nswapnil31/bank-customer-churn


#########################################################################################################

                                           # Churn Analysis 

#########################################################################################################

# Load Libraries

library("dplyr")
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library("tidytext")
library("ggplot2")
library("caTools")
## Warning: package 'caTools' was built under R version 3.5.3
library("ROCR")
## Warning: package 'ROCR' was built under R version 3.5.3
## Loading required package: gplots
## Warning: package 'gplots' was built under R version 3.5.3
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
library("magrittr")
## Warning: package 'magrittr' was built under R version 3.5.3
library("forecast")
## Warning: package 'forecast' was built under R version 3.5.3
library("nnet")
## Warning: package 'nnet' was built under R version 3.5.3
getwd()
## [1] "C:/Data Science A-Z/Part 3 Data Prep/Logistic Regression"
#Import Data
setwd("C:\\Data Science A-Z\\Part 3 Data Prep\\Logistic Regression")
data1 <- read.csv("P12-Churn-Modelling.csv", header = TRUE)
# changing character variables to factors
data1 <- data1 %>% mutate_if(is.character, as.factor)
attach(data1)
# looking for missing values
colSums(is.na(data1))
##       RowNumber      CustomerId         Surname     CreditScore 
##               0               0               0               0 
##       Geography          Gender             Age          Tenure 
##               0               0               0               0 
##         Balance   NumOfProducts       HasCrCard  IsActiveMember 
##               0               0               0               0 
## EstimatedSalary          Exited 
##               0               0
summary(data1)
##    RowNumber       CustomerId           Surname      CreditScore   
##  Min.   :    1   Min.   :15565701   Smith   :  32   Min.   :350.0  
##  1st Qu.: 2501   1st Qu.:15628528   Martin  :  29   1st Qu.:584.0  
##  Median : 5000   Median :15690738   Scott   :  29   Median :652.0  
##  Mean   : 5000   Mean   :15690941   Walker  :  28   Mean   :650.5  
##  3rd Qu.: 7500   3rd Qu.:15753234   Brown   :  26   3rd Qu.:718.0  
##  Max.   :10000   Max.   :15815690   Genovese:  25   Max.   :850.0  
##                                     (Other) :9831                  
##    Geography       Gender          Age            Tenure      
##  France :5014   Female:4543   Min.   :18.00   Min.   : 0.000  
##  Germany:2509   Male  :5457   1st Qu.:32.00   1st Qu.: 3.000  
##  Spain  :2477                 Median :37.00   Median : 5.000  
##                               Mean   :38.92   Mean   : 5.013  
##                               3rd Qu.:44.00   3rd Qu.: 7.000  
##                               Max.   :92.00   Max.   :10.000  
##                                                               
##     Balance       NumOfProducts    HasCrCard      IsActiveMember  
##  Min.   :     0   Min.   :1.00   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:     0   1st Qu.:1.00   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median : 97199   Median :1.00   Median :1.0000   Median :1.0000  
##  Mean   : 76486   Mean   :1.53   Mean   :0.7055   Mean   :0.5151  
##  3rd Qu.:127644   3rd Qu.:2.00   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :250898   Max.   :4.00   Max.   :1.0000   Max.   :1.0000  
##                                                                   
##  EstimatedSalary         Exited      
##  Min.   :    11.58   Min.   :0.0000  
##  1st Qu.: 51002.11   1st Qu.:0.0000  
##  Median :100193.91   Median :0.0000  
##  Mean   :100090.24   Mean   :0.2037  
##  3rd Qu.:149388.25   3rd Qu.:0.0000  
##  Max.   :199992.48   Max.   :1.0000  
## 
# removing RowNumber,customerID,Surname; doesn't add any value to the model
data1$RowNumber <- NULL
data1$CustomerId <- NULL
data1$Surname <-NULL

str(data1)
## 'data.frame':    10000 obs. of  11 variables:
##  $ CreditScore    : int  619 608 502 699 850 645 822 376 501 684 ...
##  $ Geography      : Factor w/ 3 levels "France","Germany",..: 1 3 1 1 3 3 1 2 1 1 ...
##  $ Gender         : Factor w/ 2 levels "Female","Male": 1 1 1 1 1 2 2 1 2 2 ...
##  $ Age            : int  42 41 42 39 43 44 50 29 44 27 ...
##  $ Tenure         : int  2 1 8 1 2 8 7 4 4 2 ...
##  $ Balance        : num  0 83808 159661 0 125511 ...
##  $ NumOfProducts  : int  1 1 3 2 1 2 2 4 2 1 ...
##  $ HasCrCard      : int  1 0 1 0 1 1 1 1 0 1 ...
##  $ IsActiveMember : int  1 1 0 0 1 0 1 0 1 1 ...
##  $ EstimatedSalary: num  101349 112543 113932 93827 79084 ...
##  $ Exited         : int  1 0 1 0 0 1 0 1 0 0 ...
head(data1)
##   CreditScore Geography Gender Age Tenure   Balance NumOfProducts
## 1         619    France Female  42      2      0.00             1
## 2         608     Spain Female  41      1  83807.86             1
## 3         502    France Female  42      8 159660.80             3
## 4         699    France Female  39      1      0.00             2
## 5         850     Spain Female  43      2 125510.82             1
## 6         645     Spain   Male  44      8 113755.78             2
##   HasCrCard IsActiveMember EstimatedSalary Exited
## 1         1              1       101348.88      1
## 2         0              1       112542.58      0
## 3         1              0       113931.57      1
## 4         0              0        93826.63      0
## 5         1              1        79084.10      0
## 6         1              0       149756.71      1
# fitting the model

model <- glm(Exited~.,data1,family = "binomial")
summary(model)
## 
## Call:
## glm(formula = Exited ~ ., family = "binomial", data = data1)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3097  -0.6589  -0.4560  -0.2697   2.9940  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -3.392e+00  2.448e-01 -13.857  < 2e-16 ***
## CreditScore      -6.683e-04  2.803e-04  -2.384   0.0171 *  
## GeographyGermany  7.747e-01  6.767e-02  11.448  < 2e-16 ***
## GeographySpain    3.522e-02  7.064e-02   0.499   0.6181    
## GenderMale       -5.285e-01  5.449e-02  -9.699  < 2e-16 ***
## Age               7.271e-02  2.576e-03  28.230  < 2e-16 ***
## Tenure           -1.595e-02  9.355e-03  -1.705   0.0882 .  
## Balance           2.637e-06  5.142e-07   5.128 2.92e-07 ***
## NumOfProducts    -1.015e-01  4.713e-02  -2.154   0.0312 *  
## HasCrCard        -4.468e-02  5.934e-02  -0.753   0.4515    
## IsActiveMember   -1.075e+00  5.769e-02 -18.643  < 2e-16 ***
## EstimatedSalary   4.807e-07  4.737e-07   1.015   0.3102    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 10109.8  on 9999  degrees of freedom
## Residual deviance:  8561.4  on 9988  degrees of freedom
## AIC: 8585.4
## 
## Number of Fisher Scoring iterations: 5
# making predictions
res <- predict(model,data1,type="response")
table(Actualvalue=data1$Exited,Predictedvalue=res>0.5)
##            Predictedvalue
## Actualvalue FALSE TRUE
##           0  7666  297
##           1  1600  437
#Removing Geography from model

model1 <- glm(Exited~(CreditScore) +(Age)+(Tenure)+(Gender)+Balance+NumOfProducts+HasCrCard+EstimatedSalary+IsActiveMember ,data1,family = "binomial")
 summary(model1)
## 
## Call:
## glm(formula = Exited ~ (CreditScore) + (Age) + (Tenure) + (Gender) + 
##     Balance + NumOfProducts + HasCrCard + EstimatedSalary + IsActiveMember, 
##     family = "binomial", data = data1)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1373  -0.6713  -0.4660  -0.2797   2.9715  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     -3.476e+00  2.414e-01 -14.397   <2e-16 ***
## CreditScore     -6.521e-04  2.777e-04  -2.349   0.0188 *  
## Age              7.287e-02  2.551e-03  28.561   <2e-16 ***
## Tenure          -1.471e-02  9.278e-03  -1.585   0.1129    
## GenderMale      -5.429e-01  5.397e-02 -10.060   <2e-16 ***
## Balance          5.045e-06  4.601e-07  10.964   <2e-16 ***
## NumOfProducts   -3.617e-02  4.640e-02  -0.780   0.4356    
## HasCrCard       -2.899e-02  5.875e-02  -0.493   0.6217    
## EstimatedSalary  4.978e-07  4.698e-07   1.060   0.2893    
## IsActiveMember  -1.079e+00  5.723e-02 -18.860   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 10109.8  on 9999  degrees of freedom
## Residual deviance:  8706.5  on 9990  degrees of freedom
## AIC: 8726.5
## 
## Number of Fisher Scoring iterations: 5
res <- predict(model1,data1,type="response")
table(Actualvalue=data1$Exited,Predictedvalue=res>0.5)
##            Predictedvalue
## Actualvalue FALSE TRUE
##           0  7717  246
##           1  1683  354
model2 <- glm(Exited~(CreditScore) +(Age)+(Tenure)+(Gender)+Balance+NumOfProducts+EstimatedSalary+IsActiveMember ,data1,family = "binomial")
summary(model2)
## 
## Call:
## glm(formula = Exited ~ (CreditScore) + (Age) + (Tenure) + (Gender) + 
##     Balance + NumOfProducts + EstimatedSalary + IsActiveMember, 
##     family = "binomial", data = data1)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1410  -0.6711  -0.4667  -0.2804   2.9687  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     -3.498e+00  2.375e-01 -14.725   <2e-16 ***
## CreditScore     -6.507e-04  2.776e-04  -2.344   0.0191 *  
## Age              7.287e-02  2.551e-03  28.564   <2e-16 ***
## Tenure          -1.481e-02  9.275e-03  -1.597   0.1104    
## GenderMale      -5.430e-01  5.397e-02 -10.061   <2e-16 ***
## Balance          5.048e-06  4.600e-07  10.974   <2e-16 ***
## NumOfProducts   -3.616e-02  4.639e-02  -0.780   0.4357    
## EstimatedSalary  4.981e-07  4.698e-07   1.060   0.2890    
## IsActiveMember  -1.079e+00  5.721e-02 -18.855   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 10109.8  on 9999  degrees of freedom
## Residual deviance:  8706.7  on 9991  degrees of freedom
## AIC: 8724.7
## 
## Number of Fisher Scoring iterations: 5
res <- predict(model2,data1,type="response")
table(Actualvalue=data1$Exited,Predictedvalue=res>0.5)
##            Predictedvalue
## Actualvalue FALSE TRUE
##           0  7719  244
##           1  1681  356
model3 <- glm(Exited~(CreditScore) +(Age)+(Tenure)+(Gender)+Balance+NumOfProducts+IsActiveMember ,data1,family = "binomial")
summary(model3)
## 
## Call:
## glm(formula = Exited ~ (CreditScore) + (Age) + (Tenure) + (Gender) + 
##     Balance + NumOfProducts + IsActiveMember, family = "binomial", 
##     data = data1)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1538  -0.6711  -0.4671  -0.2790   2.9612  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -3.447e+00  2.326e-01 -14.817   <2e-16 ***
## CreditScore    -6.527e-04  2.776e-04  -2.351   0.0187 *  
## Age             7.284e-02  2.551e-03  28.560   <2e-16 ***
## Tenure         -1.466e-02  9.274e-03  -1.581   0.1139    
## GenderMale     -5.434e-01  5.396e-02 -10.070   <2e-16 ***
## Balance         5.057e-06  4.599e-07  10.995   <2e-16 ***
## NumOfProducts  -3.520e-02  4.638e-02  -0.759   0.4480    
## IsActiveMember -1.080e+00  5.721e-02 -18.870   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 10109.8  on 9999  degrees of freedom
## Residual deviance:  8707.8  on 9992  degrees of freedom
## AIC: 8723.8
## 
## Number of Fisher Scoring iterations: 5
res <- predict(model3,data1,type="response")
table(Actualvalue=data1$Exited,Predictedvalue=res>0.5)
##            Predictedvalue
## Actualvalue FALSE TRUE
##           0  7719  244
##           1  1680  357
model4 <- glm(Exited~(CreditScore) +(Age)+(Gender)+Balance+NumOfProducts+IsActiveMember ,data1,family = "binomial")
summary(model4)
## 
## Call:
## glm(formula = Exited ~ (CreditScore) + (Age) + (Gender) + Balance + 
##     NumOfProducts + IsActiveMember, family = "binomial", data = data1)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1598  -0.6715  -0.4659  -0.2797   2.9510  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -3.519e+00  2.283e-01 -15.412   <2e-16 ***
## CreditScore    -6.528e-04  2.776e-04  -2.352   0.0187 *  
## Age             7.283e-02  2.550e-03  28.559   <2e-16 ***
## GenderMale     -5.448e-01  5.395e-02 -10.099   <2e-16 ***
## Balance         5.053e-06  4.598e-07  10.990   <2e-16 ***
## NumOfProducts  -3.587e-02  4.636e-02  -0.774   0.4391    
## IsActiveMember -1.077e+00  5.717e-02 -18.834   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 10109.8  on 9999  degrees of freedom
## Residual deviance:  8710.3  on 9993  degrees of freedom
## AIC: 8724.3
## 
## Number of Fisher Scoring iterations: 5
res <- predict(model4,data1,type="response")
table(Actualvalue=data1$Exited,Predictedvalue=res>0.5)
##            Predictedvalue
## Actualvalue FALSE TRUE
##           0  7715  248
##           1  1690  347
#SEMI FINAL MODEL

model5 <- glm(Exited~(CreditScore) +(Age)+(Gender)+Tenure+Geography+Balance+NumOfProducts+IsActiveMember ,data1,family = "binomial")
summary(model5)
## 
## Call:
## glm(formula = Exited ~ (CreditScore) + (Age) + (Gender) + Tenure + 
##     Geography + Balance + NumOfProducts + IsActiveMember, family = "binomial", 
##     data = data1)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3270  -0.6592  -0.4553  -0.2682   2.9826  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -3.377e+00  2.359e-01 -14.312  < 2e-16 ***
## CreditScore      -6.682e-04  2.803e-04  -2.384   0.0171 *  
## Age               7.268e-02  2.575e-03  28.230  < 2e-16 ***
## GenderMale       -5.291e-01  5.448e-02  -9.712  < 2e-16 ***
## Tenure           -1.595e-02  9.350e-03  -1.706   0.0879 .  
## GeographyGermany  7.741e-01  6.766e-02  11.441  < 2e-16 ***
## GeographySpain    3.586e-02  7.063e-02   0.508   0.6116    
## Balance           2.653e-06  5.140e-07   5.162 2.45e-07 ***
## NumOfProducts    -1.007e-01  4.712e-02  -2.137   0.0326 *  
## IsActiveMember   -1.075e+00  5.767e-02 -18.648  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 10110  on 9999  degrees of freedom
## Residual deviance:  8563  on 9990  degrees of freedom
## AIC: 8583
## 
## Number of Fisher Scoring iterations: 5
res <- predict(model5,data1,type="response")
table(Actualvalue=data1$Exited,Predictedvalue=res>0.5)
##            Predictedvalue
## Actualvalue FALSE TRUE
##           0  7676  287
##           1  1597  440
#VAriable Transform

log_BL<-log(data1$Balance+1)

#Final Model
model6 <- glm(Exited~(CreditScore) +(Age)+(Gender)+Tenure+Geography+log_BL+NumOfProducts+IsActiveMember ,data1,family = "binomial")
summary(model6)
## 
## Call:
## glm(formula = Exited ~ (CreditScore) + (Age) + (Gender) + Tenure + 
##     Geography + log_BL + NumOfProducts + IsActiveMember, family = "binomial", 
##     data = data1)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3101  -0.6587  -0.4552  -0.2686   2.9867  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -3.3957408  0.2374592 -14.300  < 2e-16 ***
## CreditScore      -0.0006767  0.0002803  -2.414   0.0158 *  
## Age               0.0726380  0.0025746  28.213  < 2e-16 ***
## GenderMale       -0.5268640  0.0544604  -9.674  < 2e-16 ***
## Tenure           -0.0158492  0.0093469  -1.696   0.0900 .  
## GeographyGermany  0.7597409  0.0692535  10.970  < 2e-16 ***
## GeographySpain    0.0363329  0.0706155   0.515   0.6069    
## log_BL            0.0299784  0.0060625   4.945 7.62e-07 ***
## NumOfProducts    -0.0952465  0.0475445  -2.003   0.0451 *  
## IsActiveMember   -1.0760359  0.0576486 -18.665  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 10109.8  on 9999  degrees of freedom
## Residual deviance:  8564.9  on 9990  degrees of freedom
## AIC: 8584.9
## 
## Number of Fisher Scoring iterations: 5
res <- predict(model6,data1,type="response")
table(Actualvalue=data1$Exited,Predictedvalue=res>0.5)
##            Predictedvalue
## Actualvalue FALSE TRUE
##           0  7687  276
##           1  1597  440
############################################################

                 # Performance Measure

#############################################################


#CIs using profiled log-lkelihood
confint(model6)
## Waiting for profiling to be done...
##                         2.5 %        97.5 %
## (Intercept)      -3.862834028 -2.9318935713
## CreditScore      -0.001226397 -0.0001274082
## Age               0.067617390  0.0777113655
## GenderMale       -0.633763743 -0.4202583080
## Tenure           -0.034178447  0.0024649705
## GeographyGermany  0.624239365  0.8957423318
## GeographySpain   -0.102657442  0.1742082600
## log_BL            0.018117927  0.0418866846
## NumOfProducts    -0.188767410 -0.0023595984
## IsActiveMember   -1.189584783 -0.9635748790
#CIs using standard errors
confint.default(model6)
##                         2.5 %        97.5 %
## (Intercept)      -3.861152378 -2.9303292253
## CreditScore      -0.001226134 -0.0001272788
## Age               0.067591752  0.0776841550
## GenderMale       -0.633604544 -0.4201235347
## Tenure           -0.034168776  0.0024704571
## GeographyGermany  0.624006441  0.8954753097
## GeographySpain   -0.102070943  0.1747367105
## log_BL            0.018096080  0.0418607579
## NumOfProducts    -0.188431942 -0.0020610113
## IsActiveMember   -1.189025151 -0.9630467216
#Put the coefficents and CI in a format onto a useful Scale.

exp(confint(model6))
## Waiting for profiling to be done...
##                       2.5 %     97.5 %
## (Intercept)      0.02100838 0.05329602
## CreditScore      0.99877436 0.99987260
## Age              1.06995585 1.08081065
## GenderMale       0.53059103 0.65687712
## Tenure           0.96639904 1.00246801
## GeographyGermany 1.86682545 2.44915320
## GeographySpain   0.90243606 1.19030343
## log_BL           1.01828305 1.04277631
## NumOfProducts    0.82797906 0.99764318
## IsActiveMember   0.30434761 0.38152653
# odds ratios only
exp(coef(model6))
##      (Intercept)      CreditScore              Age       GenderMale 
##       0.03351572       0.99932352       1.07534114       0.59045371 
##           Tenure GeographyGermany   GeographySpain           log_BL 
##       0.98427578       2.13772221       1.03700099       1.03043230 
##    NumOfProducts   IsActiveMember 
##       0.90914882       0.34094438
# odds ratios and 95% CI
exp(cbind(OR = coef(model6),confint(model6)))
## Waiting for profiling to be done...
##                          OR      2.5 %     97.5 %
## (Intercept)      0.03351572 0.02100838 0.05329602
## CreditScore      0.99932352 0.99877436 0.99987260
## Age              1.07534114 1.06995585 1.08081065
## GenderMale       0.59045371 0.53059103 0.65687712
## Tenure           0.98427578 0.96639904 1.00246801
## GeographyGermany 2.13772221 1.86682545 2.44915320
## GeographySpain   1.03700099 0.90243606 1.19030343
## log_BL           1.03043230 1.01828305 1.04277631
## NumOfProducts    0.90914882 0.82797906 0.99764318
## IsActiveMember   0.34094438 0.30434761 0.38152653
#Model Testing Using Test Data

setwd("C:\\Data Science A-Z\\Part 3 Data Prep\\Logistic Regression")
Testdata <- read.csv("P12-Churn-Modelling-Test-Data.csv", header = TRUE)
# changing character variables to factors
Testdata <- Testdata %>% mutate_if(is.character, as.factor)
attach(Testdata)
## The following objects are masked from data1:
## 
##     Age, Balance, CreditScore, CustomerId, EstimatedSalary,
##     Exited, Gender, Geography, HasCrCard, IsActiveMember,
##     NumOfProducts, RowNumber, Surname, Tenure
# looking for missing values
colSums(is.na(Testdata))
##       RowNumber      CustomerId         Surname     CreditScore 
##               0               0               0               0 
##       Geography          Gender             Age          Tenure 
##               0               0               0               0 
##         Balance   NumOfProducts       HasCrCard  IsActiveMember 
##               0               0               0               0 
## EstimatedSalary          Exited 
##               0               0
summary(Testdata)
##    RowNumber       CustomerId            Surname     CreditScore   
##  Min.   :10001   Min.   :15565862   King     :  6   Min.   :366.0  
##  1st Qu.:10251   1st Qu.:15629656   Colombo  :  5   1st Qu.:582.0  
##  Median :10500   Median :15695155   Henderson:  5   Median :656.0  
##  Mean   :10500   Mean   :15692736   Hsia     :  5   Mean   :652.2  
##  3rd Qu.:10750   3rd Qu.:15754996   Mitchell :  5   3rd Qu.:719.0  
##  Max.   :11000   Max.   :15815463   Boyle    :  4   Max.   :850.0  
##                                     (Other)  :970                  
##    Geography      Gender         Age            Tenure     
##  France :501   Female:433   Min.   :18.00   Min.   : 0.00  
##  Germany:256   Male  :567   1st Qu.:32.00   1st Qu.: 2.00  
##  Spain  :243                Median :38.00   Median : 5.00  
##                             Mean   :39.22   Mean   : 4.86  
##                             3rd Qu.:44.00   3rd Qu.: 7.00  
##                             Max.   :91.00   Max.   :10.00  
##                                                            
##     Balance       NumOfProducts     HasCrCard    IsActiveMember 
##  Min.   :     0   Min.   :1.000   Min.   :0.00   Min.   :0.000  
##  1st Qu.:     0   1st Qu.:1.000   1st Qu.:0.00   1st Qu.:0.000  
##  Median : 97927   Median :1.000   Median :1.00   Median :1.000  
##  Mean   : 75160   Mean   :1.521   Mean   :0.68   Mean   :0.529  
##  3rd Qu.:128142   3rd Qu.:2.000   3rd Qu.:1.00   3rd Qu.:1.000  
##  Max.   :211520   Max.   :4.000   Max.   :1.00   Max.   :1.000  
##                                                                 
##  EstimatedSalary        Exited    
##  Min.   :   245.5   Min.   :0.00  
##  1st Qu.: 49099.9   1st Qu.:0.00  
##  Median :104081.6   Median :0.00  
##  Mean   :101500.4   Mean   :0.26  
##  3rd Qu.:151514.4   3rd Qu.:1.00  
##  Max.   :199633.7   Max.   :1.00  
## 
# removing RowNumber,customerID,Surname; doesn't add any value to the model
Testdata$RowNumber <- NULL
Testdata$CustomerId <- NULL
Testdata$Surname <-NULL

log_BL<-log(Testdata$Balance+1)

Testmodel <- glm(Exited~(CreditScore) +(Age)+(Gender)+Tenure+Geography+log_BL+NumOfProducts+IsActiveMember ,Testdata,family = "binomial")
summary(Testmodel)
## 
## Call:
## glm(formula = Exited ~ (CreditScore) + (Age) + (Gender) + Tenure + 
##     Geography + log_BL + NumOfProducts + IsActiveMember, family = "binomial", 
##     data = Testdata)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9508  -0.7481  -0.5271   0.7255   2.4931  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -3.5694575  0.6848742  -5.212 1.87e-07 ***
## CreditScore       0.0007295  0.0008007   0.911  0.36230    
## Age               0.0703750  0.0075305   9.345  < 2e-16 ***
## GenderMale       -0.5031528  0.1577396  -3.190  0.00142 ** 
## Tenure           -0.0265185  0.0269771  -0.983  0.32561    
## GeographyGermany  0.2958640  0.2021623   1.463  0.14333    
## GeographySpain   -0.3397193  0.2104439  -1.614  0.10646    
## log_BL            0.0314575  0.0171925   1.830  0.06729 .  
## NumOfProducts    -0.1703151  0.1379350  -1.235  0.21692    
## IsActiveMember   -0.8678126  0.1632340  -5.316 1.06e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1146.11  on 999  degrees of freedom
## Residual deviance:  990.07  on 990  degrees of freedom
## AIC: 1010.1
## 
## Number of Fisher Scoring iterations: 4
res <- predict(Testmodel,Testdata,type="response")
table(Actualvalue=Testdata$Exited,Predictedvalue=res>0.5)
##            Predictedvalue
## Actualvalue FALSE TRUE
##           0   698   42
##           1   195   65