Loading packages that we will be using

library(ggplot2)
library(ROCR)
library(popbio)

Load and explore the data

churnData = read.csv("churn.csv")
churnData = churnData[,2:6]
head(churnData)
##   Churned Age Married Cust_years Churned_contacts
## 1       0  61       1          3                1
## 2       0  50       1          3                2
## 3       0  47       1          2                0
## 4       0  50       1          3                3
## 5       0  29       1          1                3
## 6       0  43       1          4                3

How many customers are churned?

sum(churnData$Churned)
## [1] 1743

Plot the data

 qplot(Age, Churned, data=churnData) + geom_point(colour = "#3366FF", size = 3)

Plot the conditional denisty plot

Computes and plots conditional densities describing how the conditional distribution of a categorical variable Chruned changes over a numerical variable Age.

cdplot(factor(Churned) ~ Age, data=churnData, main="Estimated categ prob", ylab='Churned')

qplot(Age, ..count.., data=churnData, geom="density", fill=factor(Churned), position="fill") + 
  ylab('Probability')+theme(legend.position='bottom')

Perform the logistic regression

Function glm is used to fit generalized linear models, specified by giving a symbolic description of the linear predictor and a description of the error distribution.

churnLogreg = glm(Churned ~ ., data=churnData, family=binomial(link="logit"))
summary(churnLogreg)
## 
## Call:
## glm(formula = Churned ~ ., family = binomial(link = "logit"), 
##     data = churnData)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4542  -0.5206  -0.1971  -0.0728   3.3786  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       3.415201   0.163734  20.858   <2e-16 ***
## Age              -0.156643   0.004088 -38.320   <2e-16 ***
## Married           0.066432   0.068302   0.973    0.331    
## Cust_years        0.017857   0.030497   0.586    0.558    
## Churned_contacts  0.382324   0.027313  13.998   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 8387.3  on 7999  degrees of freedom
## Residual deviance: 5357.9  on 7995  degrees of freedom
## AIC: 5367.9
## 
## Number of Fisher Scoring iterations: 6

Generate confidence intervals for regression coefficients

confint(churnLogreg)
## Waiting for profiling to be done...
##                        2.5 %      97.5 %
## (Intercept)       3.09678395  3.73872531
## Age              -0.16478103 -0.14875424
## Married          -0.06740737  0.20037431
## Cust_years       -0.04202648  0.07755425
## Churned_contacts  0.32902320  0.43611325

Split the data as Training and Test sets

set.seed(2015)
splitChurn = caret::createDataPartition(churnData[,1], p = 0.8, list=F, times=1)
head(splitChurn)
##      Resample1
## [1,]         2
## [2,]         3
## [3,]         4
## [4,]         7
## [5,]         8
## [6,]         9
trainChurn = churnData[splitChurn,]
head(trainChurn)
##   Churned Age Married Cust_years Churned_contacts
## 2       0  50       1          3                2
## 3       0  47       1          2                0
## 4       0  50       1          3                3
## 7       0  50       0          2                1
## 8       0  29       0          3                2
## 9       0  32       1          3                3
testChurn = churnData[!row.names(churnData) %in% row.names(trainChurn),]
head(testChurn)
##    Churned Age Married Cust_years Churned_contacts
## 1        0  61       1          3                1
## 5        0  29       1          1                3
## 6        0  43       1          4                3
## 11       0  59       1          5                2
## 15       0  34       1          2                2
## 16       0  59       1          2                1

Apply Logistic Regression on Training set

trainChurnLR = glm(Churned ~ ., data=trainChurn, family=binomial(link="logit"))
summary(trainChurnLR)
## 
## Call:
## glm(formula = Churned ~ ., family = binomial(link = "logit"), 
##     data = trainChurn)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4067  -0.5190  -0.1940  -0.0736   3.3821  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       3.442674   0.180464  19.077   <2e-16 ***
## Age              -0.156591   0.004551 -34.405   <2e-16 ***
## Married           0.136188   0.076362   1.783   0.0745 .  
## Cust_years        0.010409   0.033916   0.307   0.7589    
## Churned_contacts  0.362264   0.030413  11.912   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6726.7  on 6399  degrees of freedom
## Residual deviance: 4290.3  on 6395  degrees of freedom
## AIC: 4300.3
## 
## Number of Fisher Scoring iterations: 6

Predict on Test data set

testChurn$Predicted = round(predict(trainChurnLR, testChurn[,2:5], type="response"), 2)
head(testChurn)
##    Churned Age Married Cust_years Churned_contacts Predicted
## 1        0  61       1          3                1      0.00
## 5        0  29       1          1                3      0.53
## 6        0  43       1          4                3      0.12
## 11       0  59       1          5                2      0.01
## 15       0  34       1          2                2      0.27
## 16       0  59       1          2                1      0.01

Visualization of Logistic Regression results

plot(trainChurn$Age, trainChurn$Churned, xlab="Age", ylab="P(Churned)")
trainLR = glm(Churned ~ Age, data=trainChurn, family=binomial(link="logit"))
curve(predict(trainLR,data.frame(Age=x),type="resp"),add=TRUE)
points(trainChurn$Age,fitted(trainLR),pch=20)

popbio::logi.hist.plot(trainChurn$Age, trainChurn$Churned, boxp=FALSE,type="hist",col="gray")

logReg = glm(Churned ~ ., data=trainChurn, family=binomial(link="logit"))
visreg::visreg(logReg, "Age", scale="response", partial=FALSE, xlab="Age", ylab="P(Churned)", rug=2)

Model evaluation - Receiver Operating Characteristic (ROC) Curve

Every classifier evaluation using ROCR starts with creating a prediction object. This function is used to transform the input data into a standardized format.

pred = predict(trainChurnLR, testChurn[,2:5], type="response")
pObject = ROCR::prediction(pred, testChurn$Churned )

All kinds of predictor evaluations are performed using performance function

rocObj = ROCR::performance(pObject, measure="tpr", x.measure="fpr")
aucObj = ROCR::performance(pObject, measure="auc")  
plot(rocObj, main = paste("Area under the curve:", round(aucObj@y.values[[1]] ,4))) 

Random Test data

rTest = data.frame(
                   'Age' = round(rnorm(nrow(testChurn), mean(testChurn$Age), sd(testChurn$Age))), 
                   'Married' = sample(c(1,0), nrow(testChurn), replace=T),
                   'Cust_years' = round(rnorm(nrow(testChurn), mean(testChurn$Cust_years), sd(testChurn$Cust_years))),
                   'Churned_contacts' = round(rnorm(nrow(testChurn), mean(testChurn$Churned_contacts), sd(testChurn$Churned_contacts))),
                   'Churned' = sample(c(1,0), nrow(testChurn), replace=T)
                   )

rand_pred = predict(trainChurnLR, rTest[,1:4], type="response")
randObject = ROCR::prediction(rand_pred, rTest$Churned )
rocRandObj = ROCR::performance(randObject, measure="tpr", x.measure="fpr")
aucRandObj = ROCR::performance(randObject, measure="auc")  
plot(rocRandObj, main = paste("Area under the curve:", round(aucRandObj@y.values[[1]] ,4))) 

Assess model fit

Phat = predict(trainChurnLR,testChurn,type="response")
head(Phat)
##           1           5           6          11          15          16 
## 0.003759550 0.533681236 0.116488135 0.007514151 0.268964943 0.005082195
prop.table(xtabs(~ Churned, data=testChurn))
## Churned
##       0       1 
## 0.78625 0.21375
thresh = 0.5
facHat = cut(Phat, breaks=c(-Inf, thresh, Inf), labels=c(0, 1))
cTab   = xtabs(~ Churned + facHat, data=testChurn)
addmargins(cTab)
##        facHat
## Churned    0    1  Sum
##     0   1157  101 1258
##     1    148  194  342
##     Sum 1305  295 1600
CCR = sum(diag(cTab)) / sum(cTab)
CCR
## [1] 0.844375

Updating models

If you want to modify a model you may consider using the special function update

lrUpdate = update(trainChurnLR, ~ . -Married-Churned_contacts-Cust_years)
summary(lrUpdate)
## 
## Call:
## glm(formula = Churned ~ Age, family = binomial(link = "logit"), 
##     data = trainChurn)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.7484  -0.5360  -0.2084  -0.0917   3.3087  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  3.985340   0.139029   28.66   <2e-16 ***
## Age         -0.150076   0.004357  -34.45   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6726.7  on 6399  degrees of freedom
## Residual deviance: 4441.4  on 6398  degrees of freedom
## AIC: 4445.4
## 
## Number of Fisher Scoring iterations: 6
pred = predict(lrUpdate, testChurn[,2:5], type="response")
pObject = ROCR::prediction(pred, testChurn$Churned )
rocObj = ROCR::performance(pObject, measure="tpr", x.measure="fpr")
aucObj = ROCR::performance(pObject, measure="auc")  
plot(rocObj, main = paste("Area under the curve:", round(aucObj@y.values[[1]] ,4)))