Churn Analysis using Logistic Regression with Telco data

(It’s a work in progress)

Goal : predict behavior to retain customers

The data set includes information about:

  • The column “Churn” is customers who left within the last month.
  • Services that each customer has signed up for ? phone, multiple lines, internet, online security, online backup, device protection, tech support, and streaming TV and movies.
  • Customer account information: how long they have been a customer, contract, payment method, paperless billing, monthly charges, and total charges.
  • Demographic info about customers: gender, age range, and if they have partners and dependents

data source https://www.kaggle.com/blastchar/telco-customer-churn

library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(MASS)
## Warning: package 'MASS' was built under R version 3.4.4
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
library(boot)
library(caret)
## Warning: package 'caret' was built under R version 3.4.4
## Loading required package: lattice
## 
## Attaching package: 'lattice'
## The following object is masked from 'package:boot':
## 
##     melanoma
library(e1071)
library(SDMTools)
## 
## Attaching package: 'SDMTools'
## The following objects are masked from 'package:caret':
## 
##     sensitivity, specificity
#import data (default stringAsFactor = TRUE )
customer <- read.csv("WA_Fn-UseC_-Telco-Customer-Churn.csv",stringsAsFactors = T)

View(customer)

Data wrangling

# Make ID back to correct format
customer$customerID <- as.character(customer$customerID)

#reorder
customer <- customer[c(1,21,2:20)]

# scan missing value
colSums(is.na(customer))
##       customerID            Churn           gender    SeniorCitizen 
##                0                0                0                0 
##          Partner       Dependents           tenure     PhoneService 
##                0                0                0                0 
##    MultipleLines  InternetService   OnlineSecurity     OnlineBackup 
##                0                0                0                0 
## DeviceProtection      TechSupport      StreamingTV  StreamingMovies 
##                0                0                0                0 
##         Contract PaperlessBilling    PaymentMethod   MonthlyCharges 
##                0                0                0                0 
##     TotalCharges 
##               11
#observation with missing values 
missing <- filter(customer, is.na(customer$TotalCharges) == TRUE )

# since TotalCharges is roughly equals to tenure * monthly charges, I replace the missing value accordingly.
customer_m <- customer %>% mutate(TotalCharges = ifelse(is.na(customer$TotalCharges), customer$MonthlyCharges*customer$tenure, TotalCharges) )

check for outliers

boxplot(customer_m$tenure)

boxplot(customer_m$MonthlyCharges)

# not outliers 

Exploring data

How many churns in this dataset ?

ggplot(customer_m, aes(x = Churn))+
  geom_histogram(stat = "count", fill = c("sky blue", "orange"))
## Warning: Ignoring unknown parameters: binwidth, bins, pad

# churn yes : 1869 , no: 5174

When do customer churns ?

# churn mostly happens in short tenure
customer_m %>% filter(customer_m$Churn == "Yes") %>% 
  ggplot( aes(x=  tenure))+
  geom_bar(fill = "orange" )

some more exploratory on different variables

variables <- list(  'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService','OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport','StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling','PaymentMethod' )

plotG <- list() #x

for (i in variables){
 plotG <-  ggplot(customer_m, aes_string(x = i, fill = as.factor(customer_m$Churn)))+
    geom_bar( position = "stack")+ scale_fill_discrete(name = "churn")
 
 print(plotG)
}

Model Specification

# full logistic model 

logisticModelFull <- glm(Churn ~  gender + SeniorCitizen + Partner + Dependents + tenure + PhoneService + MultipleLines + InternetService + OnlineSecurity + OnlineBackup + DeviceProtection + TechSupport + StreamingTV + StreamingMovies + Contract + PaperlessBilling + PaymentMethod + MonthlyCharges , family = "binomial", customer_m)

summary(logisticModelFull)
## 
## Call:
## glm(formula = Churn ~ gender + SeniorCitizen + Partner + Dependents + 
##     tenure + PhoneService + MultipleLines + InternetService + 
##     OnlineSecurity + OnlineBackup + DeviceProtection + TechSupport + 
##     StreamingTV + StreamingMovies + Contract + PaperlessBilling + 
##     PaymentMethod + MonthlyCharges, family = "binomial", data = customer_m)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9780  -0.6707  -0.2946   0.6918   3.1454  
## 
## Coefficients: (7 not defined because of singularities)
##                                       Estimate Std. Error z value Pr(>|z|)
## (Intercept)                           0.612080   0.811986   0.754  0.45097
## genderMale                           -0.020514   0.064885  -0.316  0.75189
## SeniorCitizen                         0.217015   0.084920   2.556  0.01060
## PartnerYes                           -0.002440   0.077741  -0.031  0.97496
## DependentsYes                        -0.167071   0.089678  -1.863  0.06246
## tenure                               -0.034172   0.002366 -14.443  < 2e-16
## PhoneServiceYes                       0.165499   0.652460   0.254  0.79976
## MultipleLinesNo phone service               NA         NA      NA       NA
## MultipleLinesYes                      0.462796   0.178054   2.599  0.00934
## InternetServiceFiber optic            1.720069   0.803709   2.140  0.03234
## InternetServiceNo                    -1.622325   0.811846  -1.998  0.04568
## OnlineSecurityNo internet service           NA         NA      NA       NA
## OnlineSecurityYes                    -0.199497   0.179719  -1.110  0.26698
## OnlineBackupNo internet service             NA         NA      NA       NA
## OnlineBackupYes                       0.049975   0.176251   0.284  0.77676
## DeviceProtectionNo internet service         NA         NA      NA       NA
## DeviceProtectionYes                   0.162576   0.177303   0.917  0.35918
## TechSupportNo internet service              NA         NA      NA       NA
## TechSupportYes                       -0.168836   0.181586  -0.930  0.35248
## StreamingTVNo internet service              NA         NA      NA       NA
## StreamingTVYes                        0.593806   0.328488   1.808  0.07065
## StreamingMoviesNo internet service          NA         NA      NA       NA
## StreamingMoviesYes                    0.608397   0.328840   1.850  0.06429
## ContractOne year                     -0.666321   0.106644  -6.248 4.15e-10
## ContractTwo year                     -1.356836   0.173956  -7.800 6.20e-15
## PaperlessBillingYes                   0.335906   0.074277   4.522 6.12e-06
## PaymentMethodCredit card (automatic) -0.086598   0.114085  -0.759  0.44782
## PaymentMethodElectronic check         0.314319   0.094582   3.323  0.00089
## PaymentMethodMailed check            -0.005299   0.113719  -0.047  0.96283
## MonthlyCharges                       -0.032716   0.031940  -1.024  0.30570
##                                         
## (Intercept)                             
## genderMale                              
## SeniorCitizen                        *  
## PartnerYes                              
## DependentsYes                        .  
## tenure                               ***
## PhoneServiceYes                         
## MultipleLinesNo phone service           
## MultipleLinesYes                     ** 
## InternetServiceFiber optic           *  
## InternetServiceNo                    *  
## OnlineSecurityNo internet service       
## OnlineSecurityYes                       
## OnlineBackupNo internet service         
## OnlineBackupYes                         
## DeviceProtectionNo internet service     
## DeviceProtectionYes                     
## TechSupportNo internet service          
## TechSupportYes                          
## StreamingTVNo internet service          
## StreamingTVYes                       .  
## StreamingMoviesNo internet service      
## StreamingMoviesYes                   .  
## ContractOne year                     ***
## ContractTwo year                     ***
## PaperlessBillingYes                  ***
## PaymentMethodCredit card (automatic)    
## PaymentMethodElectronic check        ***
## PaymentMethodMailed check               
## MonthlyCharges                          
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 8150.1  on 7042  degrees of freedom
## Residual deviance: 5851.0  on 7020  degrees of freedom
## AIC: 5897
## 
## Number of Fisher Scoring iterations: 6

new model (optimize model by finding the min. AIC value)

logisticModelNew <-  stepAIC(logisticModelFull, trace = 0)

summary(logisticModelNew)
## 
## Call:
## glm(formula = Churn ~ SeniorCitizen + Dependents + tenure + MultipleLines + 
##     InternetService + OnlineSecurity + TechSupport + StreamingTV + 
##     StreamingMovies + Contract + PaperlessBilling + PaymentMethod, 
##     family = "binomial", data = customer_m)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9707  -0.6703  -0.2945   0.6969   3.1704  
## 
## Coefficients: (4 not defined because of singularities)
##                                       Estimate Std. Error z value Pr(>|z|)
## (Intercept)                          -0.721834   0.124473  -5.799 6.67e-09
## SeniorCitizen                         0.215385   0.084324   2.554 0.010642
## DependentsYes                        -0.170407   0.081230  -2.098 0.035920
## tenure                               -0.035008   0.002232 -15.681  < 2e-16
## MultipleLinesNo phone service         0.488421   0.126689   3.855 0.000116
## MultipleLinesYes                      0.297143   0.078485   3.786 0.000153
## InternetServiceFiber optic            0.903428   0.091324   9.893  < 2e-16
## InternetServiceNo                    -0.774775   0.135591  -5.714 1.10e-08
## OnlineSecurityNo internet service           NA         NA      NA       NA
## OnlineSecurityYes                    -0.364489   0.084334  -4.322 1.55e-05
## TechSupportNo internet service              NA         NA      NA       NA
## TechSupportYes                       -0.336474   0.085088  -3.954 7.67e-05
## StreamingTVNo internet service              NA         NA      NA       NA
## StreamingTVYes                        0.265842   0.079022   3.364 0.000768
## StreamingMoviesNo internet service          NA         NA      NA       NA
## StreamingMoviesYes                    0.282268   0.078875   3.579 0.000345
## ContractOne year                     -0.668486   0.106275  -6.290 3.17e-10
## ContractTwo year                     -1.355075   0.173584  -7.806 5.88e-15
## PaperlessBillingYes                   0.335235   0.074208   4.518 6.26e-06
## PaymentMethodCredit card (automatic) -0.087953   0.113990  -0.772 0.440360
## PaymentMethodElectronic check         0.314533   0.094484   3.329 0.000872
## PaymentMethodMailed check            -0.006216   0.113561  -0.055 0.956347
##                                         
## (Intercept)                          ***
## SeniorCitizen                        *  
## DependentsYes                        *  
## tenure                               ***
## MultipleLinesNo phone service        ***
## MultipleLinesYes                     ***
## InternetServiceFiber optic           ***
## InternetServiceNo                    ***
## OnlineSecurityNo internet service       
## OnlineSecurityYes                    ***
## TechSupportNo internet service          
## TechSupportYes                       ***
## StreamingTVNo internet service          
## StreamingTVYes                       ***
## StreamingMoviesNo internet service      
## StreamingMoviesYes                   ***
## ContractOne year                     ***
## ContractTwo year                     ***
## PaperlessBillingYes                  ***
## PaymentMethodCredit card (automatic)    
## PaymentMethodElectronic check        ***
## PaymentMethodMailed check               
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 8150.1  on 7042  degrees of freedom
## Residual deviance: 5854.3  on 7025  degrees of freedom
## AIC: 5890.3
## 
## Number of Fisher Scoring iterations: 6

Interpretation of coefficient (odds)

interpretation of odds : eg. user with Multiple Lines (MultipleLinesYes) increase the odd of churn by 35%. Being a senior citizen is 24% more likely to churn than non-senior citizen group.

# transform Coefficient to Odds

odds <-  coef(logisticModelNew) %>% exp() %>% round(2)

odds
##                          (Intercept)                        SeniorCitizen 
##                                 0.49                                 1.24 
##                        DependentsYes                               tenure 
##                                 0.84                                 0.97 
##        MultipleLinesNo phone service                     MultipleLinesYes 
##                                 1.63                                 1.35 
##           InternetServiceFiber optic                    InternetServiceNo 
##                                 2.47                                 0.46 
##    OnlineSecurityNo internet service                    OnlineSecurityYes 
##                                   NA                                 0.69 
##       TechSupportNo internet service                       TechSupportYes 
##                                   NA                                 0.71 
##       StreamingTVNo internet service                       StreamingTVYes 
##                                   NA                                 1.30 
##   StreamingMoviesNo internet service                   StreamingMoviesYes 
##                                   NA                                 1.33 
##                     ContractOne year                     ContractTwo year 
##                                 0.51                                 0.26 
##                  PaperlessBillingYes PaymentMethodCredit card (automatic) 
##                                 1.40                                 0.92 
##        PaymentMethodElectronic check            PaymentMethodMailed check 
##                                 1.37                                 0.99

Out-of-sample validation and cross validation

devide data into training & testing set

  • Training set : 2/3 of data
  • Testing set : 1/3 of data
#generating random index for training & testing set
set.seed(27345)

customer_m$isTrain <- rbinom(nrow(customer_m),1,0.66)

train <-  customer_m %>% filter(customer_m$isTrain =="1")
test <- customer_m %>%  filter(customer_m$isTrain == "0")

train model

# modeling 
LogisticTrainNew <- glm( Churn ~ gender + SeniorCitizen + Partner + Dependents + tenure + PhoneService + MultipleLines + InternetService + OnlineSecurity + OnlineBackup + DeviceProtection + TechSupport + StreamingTV + StreamingMovies + Contract + PaperlessBilling + PaymentMethod + MonthlyCharges , family = "binomial", train)


#prediciton 
test$predictNew <- predict(LogisticTrainNew , type = "response" , newdata = test)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
test$Churn <- ifelse(test$Churn == "Yes", 1, 0)

Accuracy

  • out-of-sample accuracy
# calculating confusion matrix


confMatrix <- confusion.matrix(test$Churn, test$predictNew ,threshold = 0.5) # tested different thresholds where 0.5 got highest accuracy 

confMatrix
##     obs
## pred    0   1
##    0 1636 300
##    1  168 354
## attr(,"class")
## [1] "confusion.matrix"
accurracyNew <- sum(diag(confMatrix))/sum(confMatrix)

accurracyNew
## [1] 0.8096013
  • cross-validation accuracy
# Accuracy function with threshold 0.5 

Accu <- function(r , pi = 0){
  cm <- confusion.matrix( r , pi , 0.5)
  acc <- sum(diag(cm))/sum(cm)
  return(acc)
} 

# cross-validation
set.seed(34238)
crossAcc <- cv.glm(customer_m, logisticModelNew, cost = Accu , K =8  )$delta
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
crossAcc
## [1] 0.8010791 0.8020909