Document Contents

Problem Statement

Come up with a churn model for a telecom firm using account information like data plan, data usage, account age, monthly billing and many more.

Use Logistic Regression for this classification problem.

Data Preperation

#Load Data
library(XLConnect)
## Warning: package 'XLConnect' was built under R version 3.3.3
## Loading required package: XLConnectJars
## Warning: package 'XLConnectJars' was built under R version 3.3.3
## XLConnect 0.2-13 by Mirai Solutions GmbH [aut],
##   Martin Studer [cre],
##   The Apache Software Foundation [ctb, cph] (Apache POI),
##   Graph Builder [ctb, cph] (Curvesapi Java library)
## http://www.mirai-solutions.com ,
## http://miraisolutions.wordpress.com
wb = loadWorkbook("./input/PM_GA_Datset.xlsx")
accounts = readWorksheet(wb, sheet = "Data", header = TRUE)
str(accounts)
## 'data.frame':    3333 obs. of  11 variables:
##  $ Churn          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ AccountWeeks   : num  128 107 137 84 75 118 121 147 117 141 ...
##  $ ContractRenewal: num  1 1 1 0 0 0 1 0 1 0 ...
##  $ DataPlan       : num  1 1 0 0 0 0 1 0 0 1 ...
##  $ DataUsage      : num  2.7 3.7 0 0 0 0 2.03 0 0.19 3.02 ...
##  $ CustServCalls  : num  1 1 0 2 3 0 3 0 1 0 ...
##  $ DayMins        : num  265 162 243 299 167 ...
##  $ DayCalls       : num  110 123 114 71 113 98 88 79 97 84 ...
##  $ MonthlyCharge  : num  89 82 52 57 41 57 87.3 36 63.9 93.2 ...
##  $ OverageFee     : num  9.87 9.78 6.06 3.1 7.42 ...
##  $ RoamMins       : num  10 13.7 12.2 6.6 10.1 6.3 7.5 7.1 8.7 11.2 ...
accounts_n <- accounts
# Convert to factors
accounts$Churn = as.factor(accounts$Churn)
accounts$ContractRenewal = as.factor(accounts$ContractRenewal)
accounts$DataPlan = as.factor(accounts$DataPlan)

Exploratory Data Analysis

Churn

table(accounts$Churn)
## 
##    0    1 
## 2850  483
prop.table(table(accounts$Churn))
## 
##         0         1 
## 0.8550855 0.1449145
library(ggplot2)
ggplot(accounts, aes(Churn)) + geom_bar(fill="salmon")

14.5% is the churn rate, 483/3333 have churned.

Account Weeks

summary(accounts$AccountWeeks)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     1.0    74.0   101.0   101.1   127.0   243.0
ActWeeks <- cut(accounts$AccountWeeks, breaks = seq(0, 250, by = 25))
ggplot(accounts, aes(ActWeeks, ..count.., fill = Churn)) + geom_bar(position="dodge")

One would expect a decreasing churn rate with the increase in the time (account weeks) of an account, but it does not seem to be the case. There is no clear trend visible.

Contract Renewal

ggplot(accounts, aes(ContractRenewal, ..count.., fill = Churn)) + geom_bar(position="dodge")

Clearly, there is a good probability (approx 40%) of an account churning if the contract has not been renewed.

Data Plan

ggplot(accounts, aes(DataPlan, ..count.., fill = Churn)) + geom_bar(position="dodge")

table(accounts$Churn, accounts$DataPlan)
##    
##        0    1
##   0 2008  842
##   1  403   80

The probability of an account churning is higher if the account has not subscribed to a data plan.

Data Usage

summary(accounts$DataUsage)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.8165  1.7800  5.4000
dataUsage <- cut(accounts$DataUsage, include.lowest = TRUE, breaks = seq(0, 5.5, by = 0.5))
ggplot(accounts, aes(dataUsage, ..count.., fill = Churn)) + geom_bar(position="dodge")

Clearly, maximum churn is in the 0-0.5 data usage category.

CustServCall

summary(accounts$CustServCall)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   1.000   1.000   1.563   2.000   9.000
custServCalls <- cut(accounts$CustServCalls, include.lowest = TRUE, breaks = seq(0, 9, by = 1))
ggplot(accounts, aes(CustServCalls, ..count.., fill = Churn)) + geom_bar(position="dodge")

The churn rate increases if a customer makes 4 or more calls to the customer service.

DayMins

summary(accounts$DayMins)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0   143.7   179.4   179.8   216.4   350.8
dayMins <- cut(accounts$DayMins, include.lowest = TRUE, breaks = seq(0, 385, by = 35))
ggplot(accounts, aes(dayMins, ..count.., fill = Churn)) + geom_bar(position="dodge")

The churn rate increases if the monthly average daytime minutes are greater than 245.

DayCalls

summary(accounts$DayCalls)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0    87.0   101.0   100.4   114.0   165.0
dayCalls <- cut(accounts$DayCalls, include.lowest = TRUE, breaks = seq(0, 165, by = 16.5))
ggplot(accounts, aes(dayCalls, ..count.., fill = Churn)) + geom_bar(position="dodge")

There is no clear churn pattern visible vis-Ă -vis day calls.

MonthlyCharge

summary(accounts$MonthlyCharge)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   14.00   45.00   53.50   56.31   66.20  111.30
monthlyCharge <- cut(accounts$MonthlyCharge, include.lowest = TRUE, breaks = seq(14, 114, by = 10))
ggplot(accounts, aes(monthlyCharge, ..count.., fill = Churn)) + geom_bar(position="dodge")

The churn Rate is maximum if the monthly bill is between 64-74.

OverageFee

summary(accounts$OverageFee)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    8.33   10.07   10.05   11.77   18.19
overageFee <- cut(accounts$OverageFee, include.lowest = TRUE, breaks = seq(0, 19, by = 1.9))
ggplot(accounts, aes(overageFee, ..count.., fill = Churn)) + geom_bar(position="dodge")

There is no clear churn pattern visible vis-Ă -vis Overage Fee.

RoamMins

summary(accounts$RoamMins)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    8.50   10.30   10.24   12.10   20.00
roamMins <- cut(accounts$RoamMins, include.lowest = TRUE, breaks = seq(0, 20, by = 2))
ggplot(accounts, aes(roamMins, ..count.., fill = Churn)) + geom_bar(position="dodge")

There is no clear churn pattern visible vis-Ă -vis roaming minutes.

Correlation

Let’s look at the correlation between all the variables.

library(corrplot)
## Warning: package 'corrplot' was built under R version 3.3.3
corrplot(cor(accounts_n))

Data Usage and Data Plan are highly corelated. Monthly Charge is also highly correlated with Data Usage, Data Plan and Day Mins. Churn does not seem to be highly corelated with any of the variables. Churn has maximum correlation with Contract Renewal, Customer Service Calls and Day Mins.

Model

Preprocessing

Divide the data into training and testing sets in the ratio of 70:30.

library(caTools)
## Warning: package 'caTools' was built under R version 3.3.3
set.seed(777)
spl = sample.split(accounts$Churn, SplitRatio = 0.7)
train = subset(accounts, spl==TRUE)
test = subset(accounts, spl==FALSE)

Model 1 - Logistic Regression | All variables

model1 <- glm(Churn ~ ., data= accounts, family=binomial)
summary(model1)
## 
## Call:
## glm(formula = Churn ~ ., family = binomial, data = accounts)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0058  -0.5112  -0.3477  -0.2093   2.9981  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -5.9510252  0.5486763 -10.846  < 2e-16 ***
## AccountWeeks      0.0006525  0.0013873   0.470 0.638112    
## ContractRenewal1 -1.9855172  0.1436107 -13.826  < 2e-16 ***
## DataPlan1        -1.1841611  0.5363668  -2.208 0.027262 *  
## DataUsage         0.3636565  1.9231751   0.189 0.850021    
## CustServCalls     0.5081349  0.0389682  13.040  < 2e-16 ***
## DayMins           0.0174407  0.0324841   0.537 0.591337    
## DayCalls          0.0036523  0.0027497   1.328 0.184097    
## MonthlyCharge    -0.0275526  0.1909074  -0.144 0.885244    
## OverageFee        0.1868114  0.3256902   0.574 0.566248    
## RoamMins          0.0789226  0.0220522   3.579 0.000345 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2758.3  on 3332  degrees of freedom
## Residual deviance: 2188.4  on 3322  degrees of freedom
## AIC: 2210.4
## 
## Number of Fisher Scoring iterations: 5
library(car)
## Warning: package 'car' was built under R version 3.3.3
vif(model1)
##    AccountWeeks ContractRenewal        DataPlan       DataUsage 
##        1.003246        1.058705       14.087816     1601.163095 
##   CustServCalls         DayMins        DayCalls   MonthlyCharge 
##        1.081250      952.539781        1.004592     2829.804947 
##      OverageFee        RoamMins 
##      211.716226        1.193368

The multicolliniearity has caused the inflated VIF values for correlated variables, making the model unreliable. We will use a stepwise variable reduction function using VIF values. The function works like this:

  • It uses the full set of explanatory variables.
  • It calculates VIF for each variable,
  • It removes the variable with the single highest value,
  • It then recalculates all VIF values with the new set of variables,
  • It removes the variable with the next highest value, and so on, until all values are below the threshold. (Source: https://beckmw.wordpress.com/2013/02/05/collinearity-and-stepwise-vif-selection/)
vif_func <- dget("vif_func.R")
vif_func(in_frame=accounts_n[,-1],thresh=5,trace=T)
## Warning: package 'fmsb' was built under R version 3.3.3
##  var             vif             
##  AccountWeeks    1.00379056966792
##  ContractRenewal 1.0072163639093 
##  DataPlan        12.4734695151247
##  DataUsage       1964.8002067194 
##  CustServCalls   1.00194507320884
##  DayMins         1031.49060758217
##  DayCalls        1.00293512970177
##  MonthlyCharge   3243.30055507161
##  OverageFee      224.639750372869
##  RoamMins        1.34658276919068
## 
## removed:  MonthlyCharge 3243.301 
## 
##  var             vif             
##  AccountWeeks    1.00349668958413
##  ContractRenewal 1.00651641850144
##  DataPlan        12.4695602982927
##  DataUsage       12.8138032553607
##  CustServCalls   1.00177759301348
##  DayMins         1.00333362434266
##  DayCalls        1.00292948433251
##  OverageFee      1.0016574227944 
##  RoamMins        1.346470768479  
## 
## removed:  DataUsage 12.8138
## [1] "AccountWeeks"    "ContractRenewal" "DataPlan"        "CustServCalls"  
## [5] "DayMins"         "DayCalls"        "OverageFee"      "RoamMins"

Model 2 - Logistic Regression | Remove correlated variables

We will not use the MonthlyCharge and DataUsage variables and create a new model.

model2 <- glm(Churn ~ . -MonthlyCharge - DataUsage  , data= accounts, family=binomial)
summary(model2)
## 
## Call:
## glm(formula = Churn ~ . - MonthlyCharge - DataUsage, family = binomial, 
##     data = accounts)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0113  -0.5099  -0.3496  -0.2100   2.9978  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -5.9945437  0.5377103 -11.148  < 2e-16 ***
## AccountWeeks      0.0006621  0.0013873   0.477    0.633    
## ContractRenewal1 -1.9880114  0.1435423 -13.850  < 2e-16 ***
## DataPlan1        -0.9353165  0.1441298  -6.489 8.62e-11 ***
## CustServCalls     0.5072934  0.0389173  13.035  < 2e-16 ***
## DayMins           0.0127543  0.0010725  11.892  < 2e-16 ***
## DayCalls          0.0036213  0.0027486   1.318    0.188    
## OverageFee        0.1398147  0.0226568   6.171 6.79e-10 ***
## RoamMins          0.0831284  0.0203211   4.091 4.30e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2758.3  on 3332  degrees of freedom
## Residual deviance: 2188.6  on 3324  degrees of freedom
## AIC: 2206.6
## 
## Number of Fisher Scoring iterations: 5
library(car)
vif(model2)
##    AccountWeeks ContractRenewal        DataPlan   CustServCalls 
##        1.002458        1.057352        1.018550        1.078674 
##         DayMins        DayCalls      OverageFee        RoamMins 
##        1.038782        1.004109        1.025100        1.010804

AccountWeeks and DayCalls are insignificant for the model, we will remove these variables a create a new model

Model 3 - Logistic Regression | Remove insignificant variables

model3 <- glm(Churn ~ . -MonthlyCharge - DataUsage -AccountWeeks -DayCalls, data= accounts, family=binomial)
summary(model3)
## 
## Call:
## glm(formula = Churn ~ . - MonthlyCharge - DataUsage - AccountWeeks - 
##     DayCalls, family = binomial, data = accounts)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9932  -0.5154  -0.3480  -0.2095   2.9906  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -5.552897   0.432757 -12.831  < 2e-16 ***
## ContractRenewal1 -1.989219   0.143452 -13.867  < 2e-16 ***
## DataPlan1        -0.934814   0.144015  -6.491 8.52e-11 ***
## CustServCalls     0.505651   0.038834  13.021  < 2e-16 ***
## DayMins           0.012774   0.001073  11.907  < 2e-16 ***
## OverageFee        0.138612   0.022648   6.120 9.34e-10 ***
## RoamMins          0.083476   0.020304   4.111 3.93e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2758.3  on 3332  degrees of freedom
## Residual deviance: 2190.6  on 3326  degrees of freedom
## AIC: 2204.6
## 
## Number of Fisher Scoring iterations: 5
library(car)
vif(model2)
##    AccountWeeks ContractRenewal        DataPlan   CustServCalls 
##        1.002458        1.057352        1.018550        1.078674 
##         DayMins        DayCalls      OverageFee        RoamMins 
##        1.038782        1.004109        1.025100        1.010804

The value of coeffients signify how they affect the the log odds ratio of a customer churning and not churning. The signs of the coefficents are in line with the insignts found in the exploratory data analysis.

Prediction

predictTest = predict(model3, type="response", newdata=test)

Accuracy

# Accuracy | Base Line Model
 nrow(accounts[accounts$Churn == 0,])/nrow(accounts)
## [1] 0.8550855
# Confusion matrix with threshold of 0.5
table(test$Churn, predictTest > 0.5)
##    
##     FALSE TRUE
##   0   835   20
##   1   129   16
# Accuracy | Logistic Model
(835+16)/(835+20+129+16)
## [1] 0.851
# Sensitivity (True Positive Rate)
16/(129+16)
## [1] 0.1103448

Although, the logistic regression has not bettered the baseline accuracy, the purpose of a churn model is to identify those customers who are likely to churn. Let’s review the ROC curve and find the optimal threshold value to increase the sensitivity by decreasing the threshold. The current Sensitivity is a poor 0.11.

ROC Curve

library(ROCR)
## Warning: package 'ROCR' was built under R version 3.3.3
## Warning: package 'gplots' was built under R version 3.3.3
predictROC = predict(model2, newdata = test)
pred = prediction(predictROC, test$Churn)
perf = performance(pred, "tpr", "tnr")

plot(perf)

The threshold values on the curve range from 0 to 1(left to right). A lower threshold value of 0.2 towards the left of the curve will improve the Sensitivity (true positive rate).

Accuracy | Lower Threshold

# Accuracy | Base Line Model
 nrow(accounts[accounts$Churn == 0,])/nrow(accounts)
## [1] 0.8550855
# Confusion matrix with threshold of 0.2
table(test$Churn, predictTest > 0.2)
##    
##     FALSE TRUE
##   0   724  131
##   1    55   90
# Accuracy | Logistic Model 
(724+90)/(724+90+131+55)
## [1] 0.814
# Sensitivity
90/(90+55)
## [1] 0.6206897

Conclusion

By lowering the threshold, we have improved the sensitivity of the model to 0.61 from earlier 0.11, of course compromising on the overall accuracy which stands lower at 0.81. The specifity of the model is also compromised. The threshold can be further reduced to further improve the sensitivity.