#Goal : predict behavior to retain customers
#The data set includes information about:
#The column "Existed" is customers who left Bank.
#data source https://www.kaggle.com/nswapnil31/bank-customer-churn
#########################################################################################################
# Churn Analysis
#########################################################################################################
# Load Libraries
library("dplyr")
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library("tidytext")
library("ggplot2")
library("caTools")
## Warning: package 'caTools' was built under R version 3.5.3
library("ROCR")
## Warning: package 'ROCR' was built under R version 3.5.3
## Loading required package: gplots
## Warning: package 'gplots' was built under R version 3.5.3
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library("magrittr")
## Warning: package 'magrittr' was built under R version 3.5.3
library("forecast")
## Warning: package 'forecast' was built under R version 3.5.3
library("nnet")
## Warning: package 'nnet' was built under R version 3.5.3
getwd()
## [1] "C:/Data Science A-Z/Part 3 Data Prep/Logistic Regression"
#Import Data
setwd("C:\\Data Science A-Z\\Part 3 Data Prep\\Logistic Regression")
data1 <- read.csv("P12-Churn-Modelling.csv", header = TRUE)
# changing character variables to factors
data1 <- data1 %>% mutate_if(is.character, as.factor)
attach(data1)
# looking for missing values
colSums(is.na(data1))
## RowNumber CustomerId Surname CreditScore
## 0 0 0 0
## Geography Gender Age Tenure
## 0 0 0 0
## Balance NumOfProducts HasCrCard IsActiveMember
## 0 0 0 0
## EstimatedSalary Exited
## 0 0
summary(data1)
## RowNumber CustomerId Surname CreditScore
## Min. : 1 Min. :15565701 Smith : 32 Min. :350.0
## 1st Qu.: 2501 1st Qu.:15628528 Martin : 29 1st Qu.:584.0
## Median : 5000 Median :15690738 Scott : 29 Median :652.0
## Mean : 5000 Mean :15690941 Walker : 28 Mean :650.5
## 3rd Qu.: 7500 3rd Qu.:15753234 Brown : 26 3rd Qu.:718.0
## Max. :10000 Max. :15815690 Genovese: 25 Max. :850.0
## (Other) :9831
## Geography Gender Age Tenure
## France :5014 Female:4543 Min. :18.00 Min. : 0.000
## Germany:2509 Male :5457 1st Qu.:32.00 1st Qu.: 3.000
## Spain :2477 Median :37.00 Median : 5.000
## Mean :38.92 Mean : 5.013
## 3rd Qu.:44.00 3rd Qu.: 7.000
## Max. :92.00 Max. :10.000
##
## Balance NumOfProducts HasCrCard IsActiveMember
## Min. : 0 Min. :1.00 Min. :0.0000 Min. :0.0000
## 1st Qu.: 0 1st Qu.:1.00 1st Qu.:0.0000 1st Qu.:0.0000
## Median : 97199 Median :1.00 Median :1.0000 Median :1.0000
## Mean : 76486 Mean :1.53 Mean :0.7055 Mean :0.5151
## 3rd Qu.:127644 3rd Qu.:2.00 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :250898 Max. :4.00 Max. :1.0000 Max. :1.0000
##
## EstimatedSalary Exited
## Min. : 11.58 Min. :0.0000
## 1st Qu.: 51002.11 1st Qu.:0.0000
## Median :100193.91 Median :0.0000
## Mean :100090.24 Mean :0.2037
## 3rd Qu.:149388.25 3rd Qu.:0.0000
## Max. :199992.48 Max. :1.0000
##
# removing RowNumber,customerID,Surname; doesn't add any value to the model
data1$RowNumber <- NULL
data1$CustomerId <- NULL
data1$Surname <-NULL
str(data1)
## 'data.frame': 10000 obs. of 11 variables:
## $ CreditScore : int 619 608 502 699 850 645 822 376 501 684 ...
## $ Geography : Factor w/ 3 levels "France","Germany",..: 1 3 1 1 3 3 1 2 1 1 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 1 1 1 1 1 2 2 1 2 2 ...
## $ Age : int 42 41 42 39 43 44 50 29 44 27 ...
## $ Tenure : int 2 1 8 1 2 8 7 4 4 2 ...
## $ Balance : num 0 83808 159661 0 125511 ...
## $ NumOfProducts : int 1 1 3 2 1 2 2 4 2 1 ...
## $ HasCrCard : int 1 0 1 0 1 1 1 1 0 1 ...
## $ IsActiveMember : int 1 1 0 0 1 0 1 0 1 1 ...
## $ EstimatedSalary: num 101349 112543 113932 93827 79084 ...
## $ Exited : int 1 0 1 0 0 1 0 1 0 0 ...
head(data1)
## CreditScore Geography Gender Age Tenure Balance NumOfProducts
## 1 619 France Female 42 2 0.00 1
## 2 608 Spain Female 41 1 83807.86 1
## 3 502 France Female 42 8 159660.80 3
## 4 699 France Female 39 1 0.00 2
## 5 850 Spain Female 43 2 125510.82 1
## 6 645 Spain Male 44 8 113755.78 2
## HasCrCard IsActiveMember EstimatedSalary Exited
## 1 1 1 101348.88 1
## 2 0 1 112542.58 0
## 3 1 0 113931.57 1
## 4 0 0 93826.63 0
## 5 1 1 79084.10 0
## 6 1 0 149756.71 1
# fitting the model
model <- glm(Exited~.,data1,family = "binomial")
summary(model)
##
## Call:
## glm(formula = Exited ~ ., family = "binomial", data = data1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3097 -0.6589 -0.4560 -0.2697 2.9940
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.392e+00 2.448e-01 -13.857 < 2e-16 ***
## CreditScore -6.683e-04 2.803e-04 -2.384 0.0171 *
## GeographyGermany 7.747e-01 6.767e-02 11.448 < 2e-16 ***
## GeographySpain 3.522e-02 7.064e-02 0.499 0.6181
## GenderMale -5.285e-01 5.449e-02 -9.699 < 2e-16 ***
## Age 7.271e-02 2.576e-03 28.230 < 2e-16 ***
## Tenure -1.595e-02 9.355e-03 -1.705 0.0882 .
## Balance 2.637e-06 5.142e-07 5.128 2.92e-07 ***
## NumOfProducts -1.015e-01 4.713e-02 -2.154 0.0312 *
## HasCrCard -4.468e-02 5.934e-02 -0.753 0.4515
## IsActiveMember -1.075e+00 5.769e-02 -18.643 < 2e-16 ***
## EstimatedSalary 4.807e-07 4.737e-07 1.015 0.3102
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 10109.8 on 9999 degrees of freedom
## Residual deviance: 8561.4 on 9988 degrees of freedom
## AIC: 8585.4
##
## Number of Fisher Scoring iterations: 5
# making predictions
res <- predict(model,data1,type="response")
table(Actualvalue=data1$Exited,Predictedvalue=res>0.5)
## Predictedvalue
## Actualvalue FALSE TRUE
## 0 7666 297
## 1 1600 437
#Removing Geography from model
model1 <- glm(Exited~(CreditScore) +(Age)+(Tenure)+(Gender)+Balance+NumOfProducts+HasCrCard+EstimatedSalary+IsActiveMember ,data1,family = "binomial")
summary(model1)
##
## Call:
## glm(formula = Exited ~ (CreditScore) + (Age) + (Tenure) + (Gender) +
## Balance + NumOfProducts + HasCrCard + EstimatedSalary + IsActiveMember,
## family = "binomial", data = data1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1373 -0.6713 -0.4660 -0.2797 2.9715
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.476e+00 2.414e-01 -14.397 <2e-16 ***
## CreditScore -6.521e-04 2.777e-04 -2.349 0.0188 *
## Age 7.287e-02 2.551e-03 28.561 <2e-16 ***
## Tenure -1.471e-02 9.278e-03 -1.585 0.1129
## GenderMale -5.429e-01 5.397e-02 -10.060 <2e-16 ***
## Balance 5.045e-06 4.601e-07 10.964 <2e-16 ***
## NumOfProducts -3.617e-02 4.640e-02 -0.780 0.4356
## HasCrCard -2.899e-02 5.875e-02 -0.493 0.6217
## EstimatedSalary 4.978e-07 4.698e-07 1.060 0.2893
## IsActiveMember -1.079e+00 5.723e-02 -18.860 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 10109.8 on 9999 degrees of freedom
## Residual deviance: 8706.5 on 9990 degrees of freedom
## AIC: 8726.5
##
## Number of Fisher Scoring iterations: 5
res <- predict(model1,data1,type="response")
table(Actualvalue=data1$Exited,Predictedvalue=res>0.5)
## Predictedvalue
## Actualvalue FALSE TRUE
## 0 7717 246
## 1 1683 354
model2 <- glm(Exited~(CreditScore) +(Age)+(Tenure)+(Gender)+Balance+NumOfProducts+EstimatedSalary+IsActiveMember ,data1,family = "binomial")
summary(model2)
##
## Call:
## glm(formula = Exited ~ (CreditScore) + (Age) + (Tenure) + (Gender) +
## Balance + NumOfProducts + EstimatedSalary + IsActiveMember,
## family = "binomial", data = data1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1410 -0.6711 -0.4667 -0.2804 2.9687
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.498e+00 2.375e-01 -14.725 <2e-16 ***
## CreditScore -6.507e-04 2.776e-04 -2.344 0.0191 *
## Age 7.287e-02 2.551e-03 28.564 <2e-16 ***
## Tenure -1.481e-02 9.275e-03 -1.597 0.1104
## GenderMale -5.430e-01 5.397e-02 -10.061 <2e-16 ***
## Balance 5.048e-06 4.600e-07 10.974 <2e-16 ***
## NumOfProducts -3.616e-02 4.639e-02 -0.780 0.4357
## EstimatedSalary 4.981e-07 4.698e-07 1.060 0.2890
## IsActiveMember -1.079e+00 5.721e-02 -18.855 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 10109.8 on 9999 degrees of freedom
## Residual deviance: 8706.7 on 9991 degrees of freedom
## AIC: 8724.7
##
## Number of Fisher Scoring iterations: 5
res <- predict(model2,data1,type="response")
table(Actualvalue=data1$Exited,Predictedvalue=res>0.5)
## Predictedvalue
## Actualvalue FALSE TRUE
## 0 7719 244
## 1 1681 356
model3 <- glm(Exited~(CreditScore) +(Age)+(Tenure)+(Gender)+Balance+NumOfProducts+IsActiveMember ,data1,family = "binomial")
summary(model3)
##
## Call:
## glm(formula = Exited ~ (CreditScore) + (Age) + (Tenure) + (Gender) +
## Balance + NumOfProducts + IsActiveMember, family = "binomial",
## data = data1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1538 -0.6711 -0.4671 -0.2790 2.9612
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.447e+00 2.326e-01 -14.817 <2e-16 ***
## CreditScore -6.527e-04 2.776e-04 -2.351 0.0187 *
## Age 7.284e-02 2.551e-03 28.560 <2e-16 ***
## Tenure -1.466e-02 9.274e-03 -1.581 0.1139
## GenderMale -5.434e-01 5.396e-02 -10.070 <2e-16 ***
## Balance 5.057e-06 4.599e-07 10.995 <2e-16 ***
## NumOfProducts -3.520e-02 4.638e-02 -0.759 0.4480
## IsActiveMember -1.080e+00 5.721e-02 -18.870 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 10109.8 on 9999 degrees of freedom
## Residual deviance: 8707.8 on 9992 degrees of freedom
## AIC: 8723.8
##
## Number of Fisher Scoring iterations: 5
res <- predict(model3,data1,type="response")
table(Actualvalue=data1$Exited,Predictedvalue=res>0.5)
## Predictedvalue
## Actualvalue FALSE TRUE
## 0 7719 244
## 1 1680 357
model4 <- glm(Exited~(CreditScore) +(Age)+(Gender)+Balance+NumOfProducts+IsActiveMember ,data1,family = "binomial")
summary(model4)
##
## Call:
## glm(formula = Exited ~ (CreditScore) + (Age) + (Gender) + Balance +
## NumOfProducts + IsActiveMember, family = "binomial", data = data1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1598 -0.6715 -0.4659 -0.2797 2.9510
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.519e+00 2.283e-01 -15.412 <2e-16 ***
## CreditScore -6.528e-04 2.776e-04 -2.352 0.0187 *
## Age 7.283e-02 2.550e-03 28.559 <2e-16 ***
## GenderMale -5.448e-01 5.395e-02 -10.099 <2e-16 ***
## Balance 5.053e-06 4.598e-07 10.990 <2e-16 ***
## NumOfProducts -3.587e-02 4.636e-02 -0.774 0.4391
## IsActiveMember -1.077e+00 5.717e-02 -18.834 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 10109.8 on 9999 degrees of freedom
## Residual deviance: 8710.3 on 9993 degrees of freedom
## AIC: 8724.3
##
## Number of Fisher Scoring iterations: 5
res <- predict(model4,data1,type="response")
table(Actualvalue=data1$Exited,Predictedvalue=res>0.5)
## Predictedvalue
## Actualvalue FALSE TRUE
## 0 7715 248
## 1 1690 347
#SEMI FINAL MODEL
model5 <- glm(Exited~(CreditScore) +(Age)+(Gender)+Tenure+Geography+Balance+NumOfProducts+IsActiveMember ,data1,family = "binomial")
summary(model5)
##
## Call:
## glm(formula = Exited ~ (CreditScore) + (Age) + (Gender) + Tenure +
## Geography + Balance + NumOfProducts + IsActiveMember, family = "binomial",
## data = data1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3270 -0.6592 -0.4553 -0.2682 2.9826
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.377e+00 2.359e-01 -14.312 < 2e-16 ***
## CreditScore -6.682e-04 2.803e-04 -2.384 0.0171 *
## Age 7.268e-02 2.575e-03 28.230 < 2e-16 ***
## GenderMale -5.291e-01 5.448e-02 -9.712 < 2e-16 ***
## Tenure -1.595e-02 9.350e-03 -1.706 0.0879 .
## GeographyGermany 7.741e-01 6.766e-02 11.441 < 2e-16 ***
## GeographySpain 3.586e-02 7.063e-02 0.508 0.6116
## Balance 2.653e-06 5.140e-07 5.162 2.45e-07 ***
## NumOfProducts -1.007e-01 4.712e-02 -2.137 0.0326 *
## IsActiveMember -1.075e+00 5.767e-02 -18.648 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 10110 on 9999 degrees of freedom
## Residual deviance: 8563 on 9990 degrees of freedom
## AIC: 8583
##
## Number of Fisher Scoring iterations: 5
res <- predict(model5,data1,type="response")
table(Actualvalue=data1$Exited,Predictedvalue=res>0.5)
## Predictedvalue
## Actualvalue FALSE TRUE
## 0 7676 287
## 1 1597 440
#VAriable Transform
log_BL<-log(data1$Balance+1)
#Final Model
model6 <- glm(Exited~(CreditScore) +(Age)+(Gender)+Tenure+Geography+log_BL+NumOfProducts+IsActiveMember ,data1,family = "binomial")
summary(model6)
##
## Call:
## glm(formula = Exited ~ (CreditScore) + (Age) + (Gender) + Tenure +
## Geography + log_BL + NumOfProducts + IsActiveMember, family = "binomial",
## data = data1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3101 -0.6587 -0.4552 -0.2686 2.9867
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.3957408 0.2374592 -14.300 < 2e-16 ***
## CreditScore -0.0006767 0.0002803 -2.414 0.0158 *
## Age 0.0726380 0.0025746 28.213 < 2e-16 ***
## GenderMale -0.5268640 0.0544604 -9.674 < 2e-16 ***
## Tenure -0.0158492 0.0093469 -1.696 0.0900 .
## GeographyGermany 0.7597409 0.0692535 10.970 < 2e-16 ***
## GeographySpain 0.0363329 0.0706155 0.515 0.6069
## log_BL 0.0299784 0.0060625 4.945 7.62e-07 ***
## NumOfProducts -0.0952465 0.0475445 -2.003 0.0451 *
## IsActiveMember -1.0760359 0.0576486 -18.665 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 10109.8 on 9999 degrees of freedom
## Residual deviance: 8564.9 on 9990 degrees of freedom
## AIC: 8584.9
##
## Number of Fisher Scoring iterations: 5
res <- predict(model6,data1,type="response")
table(Actualvalue=data1$Exited,Predictedvalue=res>0.5)
## Predictedvalue
## Actualvalue FALSE TRUE
## 0 7687 276
## 1 1597 440
############################################################
# Performance Measure
#############################################################
#CIs using profiled log-lkelihood
confint(model6)
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) -3.862834028 -2.9318935713
## CreditScore -0.001226397 -0.0001274082
## Age 0.067617390 0.0777113655
## GenderMale -0.633763743 -0.4202583080
## Tenure -0.034178447 0.0024649705
## GeographyGermany 0.624239365 0.8957423318
## GeographySpain -0.102657442 0.1742082600
## log_BL 0.018117927 0.0418866846
## NumOfProducts -0.188767410 -0.0023595984
## IsActiveMember -1.189584783 -0.9635748790
#CIs using standard errors
confint.default(model6)
## 2.5 % 97.5 %
## (Intercept) -3.861152378 -2.9303292253
## CreditScore -0.001226134 -0.0001272788
## Age 0.067591752 0.0776841550
## GenderMale -0.633604544 -0.4201235347
## Tenure -0.034168776 0.0024704571
## GeographyGermany 0.624006441 0.8954753097
## GeographySpain -0.102070943 0.1747367105
## log_BL 0.018096080 0.0418607579
## NumOfProducts -0.188431942 -0.0020610113
## IsActiveMember -1.189025151 -0.9630467216
#Put the coefficents and CI in a format onto a useful Scale.
exp(confint(model6))
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) 0.02100838 0.05329602
## CreditScore 0.99877436 0.99987260
## Age 1.06995585 1.08081065
## GenderMale 0.53059103 0.65687712
## Tenure 0.96639904 1.00246801
## GeographyGermany 1.86682545 2.44915320
## GeographySpain 0.90243606 1.19030343
## log_BL 1.01828305 1.04277631
## NumOfProducts 0.82797906 0.99764318
## IsActiveMember 0.30434761 0.38152653
# odds ratios only
exp(coef(model6))
## (Intercept) CreditScore Age GenderMale
## 0.03351572 0.99932352 1.07534114 0.59045371
## Tenure GeographyGermany GeographySpain log_BL
## 0.98427578 2.13772221 1.03700099 1.03043230
## NumOfProducts IsActiveMember
## 0.90914882 0.34094438
# odds ratios and 95% CI
exp(cbind(OR = coef(model6),confint(model6)))
## Waiting for profiling to be done...
## OR 2.5 % 97.5 %
## (Intercept) 0.03351572 0.02100838 0.05329602
## CreditScore 0.99932352 0.99877436 0.99987260
## Age 1.07534114 1.06995585 1.08081065
## GenderMale 0.59045371 0.53059103 0.65687712
## Tenure 0.98427578 0.96639904 1.00246801
## GeographyGermany 2.13772221 1.86682545 2.44915320
## GeographySpain 1.03700099 0.90243606 1.19030343
## log_BL 1.03043230 1.01828305 1.04277631
## NumOfProducts 0.90914882 0.82797906 0.99764318
## IsActiveMember 0.34094438 0.30434761 0.38152653
#Model Testing Using Test Data
setwd("C:\\Data Science A-Z\\Part 3 Data Prep\\Logistic Regression")
Testdata <- read.csv("P12-Churn-Modelling-Test-Data.csv", header = TRUE)
# changing character variables to factors
Testdata <- Testdata %>% mutate_if(is.character, as.factor)
attach(Testdata)
## The following objects are masked from data1:
##
## Age, Balance, CreditScore, CustomerId, EstimatedSalary,
## Exited, Gender, Geography, HasCrCard, IsActiveMember,
## NumOfProducts, RowNumber, Surname, Tenure
# looking for missing values
colSums(is.na(Testdata))
## RowNumber CustomerId Surname CreditScore
## 0 0 0 0
## Geography Gender Age Tenure
## 0 0 0 0
## Balance NumOfProducts HasCrCard IsActiveMember
## 0 0 0 0
## EstimatedSalary Exited
## 0 0
summary(Testdata)
## RowNumber CustomerId Surname CreditScore
## Min. :10001 Min. :15565862 King : 6 Min. :366.0
## 1st Qu.:10251 1st Qu.:15629656 Colombo : 5 1st Qu.:582.0
## Median :10500 Median :15695155 Henderson: 5 Median :656.0
## Mean :10500 Mean :15692736 Hsia : 5 Mean :652.2
## 3rd Qu.:10750 3rd Qu.:15754996 Mitchell : 5 3rd Qu.:719.0
## Max. :11000 Max. :15815463 Boyle : 4 Max. :850.0
## (Other) :970
## Geography Gender Age Tenure
## France :501 Female:433 Min. :18.00 Min. : 0.00
## Germany:256 Male :567 1st Qu.:32.00 1st Qu.: 2.00
## Spain :243 Median :38.00 Median : 5.00
## Mean :39.22 Mean : 4.86
## 3rd Qu.:44.00 3rd Qu.: 7.00
## Max. :91.00 Max. :10.00
##
## Balance NumOfProducts HasCrCard IsActiveMember
## Min. : 0 Min. :1.000 Min. :0.00 Min. :0.000
## 1st Qu.: 0 1st Qu.:1.000 1st Qu.:0.00 1st Qu.:0.000
## Median : 97927 Median :1.000 Median :1.00 Median :1.000
## Mean : 75160 Mean :1.521 Mean :0.68 Mean :0.529
## 3rd Qu.:128142 3rd Qu.:2.000 3rd Qu.:1.00 3rd Qu.:1.000
## Max. :211520 Max. :4.000 Max. :1.00 Max. :1.000
##
## EstimatedSalary Exited
## Min. : 245.5 Min. :0.00
## 1st Qu.: 49099.9 1st Qu.:0.00
## Median :104081.6 Median :0.00
## Mean :101500.4 Mean :0.26
## 3rd Qu.:151514.4 3rd Qu.:1.00
## Max. :199633.7 Max. :1.00
##
# removing RowNumber,customerID,Surname; doesn't add any value to the model
Testdata$RowNumber <- NULL
Testdata$CustomerId <- NULL
Testdata$Surname <-NULL
log_BL<-log(Testdata$Balance+1)
Testmodel <- glm(Exited~(CreditScore) +(Age)+(Gender)+Tenure+Geography+log_BL+NumOfProducts+IsActiveMember ,Testdata,family = "binomial")
summary(Testmodel)
##
## Call:
## glm(formula = Exited ~ (CreditScore) + (Age) + (Gender) + Tenure +
## Geography + log_BL + NumOfProducts + IsActiveMember, family = "binomial",
## data = Testdata)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9508 -0.7481 -0.5271 0.7255 2.4931
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.5694575 0.6848742 -5.212 1.87e-07 ***
## CreditScore 0.0007295 0.0008007 0.911 0.36230
## Age 0.0703750 0.0075305 9.345 < 2e-16 ***
## GenderMale -0.5031528 0.1577396 -3.190 0.00142 **
## Tenure -0.0265185 0.0269771 -0.983 0.32561
## GeographyGermany 0.2958640 0.2021623 1.463 0.14333
## GeographySpain -0.3397193 0.2104439 -1.614 0.10646
## log_BL 0.0314575 0.0171925 1.830 0.06729 .
## NumOfProducts -0.1703151 0.1379350 -1.235 0.21692
## IsActiveMember -0.8678126 0.1632340 -5.316 1.06e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1146.11 on 999 degrees of freedom
## Residual deviance: 990.07 on 990 degrees of freedom
## AIC: 1010.1
##
## Number of Fisher Scoring iterations: 4
res <- predict(Testmodel,Testdata,type="response")
table(Actualvalue=Testdata$Exited,Predictedvalue=res>0.5)
## Predictedvalue
## Actualvalue FALSE TRUE
## 0 698 42
## 1 195 65