Predicting customer churn

Team Members:

Sloan Jinks
Burhanuddin Samiwala

Inputting the various libraries

library(tidyverse) 
library(MASS)
library(ggcorrplot)
library(olsrr)
library(caret)
library('car')
library(randomForest)
df<- read.table(file = "C:/Users/17che/OneDrive - Georgia Institute of Technology/Desktop/telco.csv"
                , sep=",", header=T,stringsAsFactors=T)

df <- subset(df, select = -customerID)
(head(df));

Get names of the columns that have NA values in them

colnames(df)[colSums(is.na(df)) > 0]

## [1] "TotalCharges"

Omitting the 11 rows that have NA values in them

df <- na.omit(df)

Seeing the variation of churn with respect to the continuous variables

library(ggpubr)

a <- ggplot(df, aes(y= tenure, x = "", fill = Churn)) + 
  geom_boxplot()+  theme_bw()+ ylab("Tenure ")+ xlab(" ") 

b <- ggplot(df, aes(y= MonthlyCharges, x = "", fill = Churn, )) + 
  geom_boxplot()+ theme_bw()+ ylab("Monthly Charges")+ xlab(" ")

c <- ggplot(df, aes(y= TotalCharges, x = "", fill = Churn, colors ='b')) + 
  geom_boxplot()+ theme_bw()+ ylab("Total Charges")+  xlab(" ")

ggarrange(a,b,c, ncol = 3, nrow = 1)

Visualizing churn with respect to the categorical variables (Part 1)

color <- scale_fill_manual(values=c("chartreuse3","red"))
d <- ggplot(df,aes(x=gender,fill=Churn))+geom_bar(position = 'fill')+xlab("Gender ")+ 
  ylab("") +color+theme_bw()

e <- ggplot(df,aes(x=Dependents,fill=Churn))+geom_bar(position = 'fill') +xlab("Dependents")+ ylab("")+color+theme_bw()

f <- ggplot(df,aes(x=Partner,fill=Churn))+geom_bar(position = 'fill') + xlab("Partner")+ ylab("")+color+theme_bw()

g <- ggplot(df,aes(x=InternetService,fill=Churn))+geom_bar(position = 'fill')+ xlab("Internet Service")+ ylab("")+color+theme_bw()

ggarrange(d,e,f,g, ncol = 2, nrow = 2)

(Part 2)

h <- ggplot(df,aes(x=OnlineSecurity,fill=Churn))+geom_bar(position = 'fill')+ xlab("Online Security")+ ylab("")+ scale_x_discrete(labels=function(x) str_wrap(x,width=5))+color+theme_bw()
                       
i <- ggplot(df,aes(x=PaymentMethod,fill=Churn))+geom_bar(position = 'fill')+ xlab("Payment Method")+ ylab("") + scale_x_discrete(labels=function(x) str_wrap(x,width=10))+color+theme_bw()

j <- ggplot(df,aes(x=Contract,fill=Churn))+geom_bar(position = 'fill')+ xlab("Contract")+ ylab("")+ scale_x_discrete(labels=function(x) str_wrap(x,width=5))+color+theme_bw()

k <- ggplot(df,aes(x= DeviceProtection,fill=Churn))+ geom_bar(position = 'fill')+ xlab("Device Protection")+ ylab("") + scale_x_discrete(labels=function(x) str_wrap(x,width=5))+color+theme_bw()
ggarrange(h,i,j,k, ncol = 2, nrow = 2)

(Part 3)

l <- ggplot(df,aes(x=TechSupport,fill=Churn))+geom_bar(position = 'fill')+theme_bw()+ xlab("Tech Support")+ ylab("")+color

m <- ggplot(df,aes(x=StreamingTV,fill=Churn))+geom_bar(position = 'fill')+theme_bw()+ xlab("Streaming TV")+ ylab("")+color

n <- m <- ggplot(df,aes(x=PaperlessBilling,fill=Churn))+geom_bar(position = 'fill')+theme_bw()+ xlab("Paperless Billing")+ ylab("")+color

o <- ggplot(df,aes(x=StreamingMovies,fill=Churn))+geom_bar(position = 'fill')+theme_bw()+ xlab("Streaming Movies")+ ylab("")+color

ggarrange(l,m,n,o, ncol = 2, nrow = 2)

Visualizing the correlations between the continuous variables

corr <- cor(df[,c('tenure', 'MonthlyCharges', 'TotalCharges')])
ggcorrplot(corr,hc.order = TRUE, outline.col = "white")

Considering No phone service and No internet as No, thus cleaning the data Also setting ‘Male’ as 1 and ‘Female’ as 0

df <- data.frame(lapply(df, function(x) {
                  gsub("No internet service", "No", x)}))
df <- data.frame(lapply(df, function(x) {
                  gsub("No phone service", "No", x)}))
df$gender<-ifelse(df$gender=="Male",1,0)

Pre-processing the data by replacing all the data that has ‘No’ as 0 and ‘Yes’ as 1

df <- data.frame(lapply(df, function(x) {
                  gsub("No", 0, x)}))
df <- data.frame(lapply(df, function(x) {
                  gsub("Yes", 1, x)}))

Creating dummy variables for the categorical variables that have multiple levels

library('fastDummies')
categorical<- dummy_cols(df, select_columns = c('InternetService', 'Contract', 'PaymentMethod'), remove_first_dummy = FALSE)
cat<- categorical[,-c(5,8,15,17,18,19)]

Combining the categorical variables, dummy variables and continuous variables in one dataframe

cont <- df[,c(5,18,19)]
df_final <- cbind(cont,cat)
df_final <- sapply(df_final, as.numeric)
df_final <- as.data.frame(df_final)
head(df_final)

Renaming the variables for ease of understanding Splitting the data into testing and training data

library(caTools)
colnames(df_final)[20] <- "FiberOptic"
colnames(df_final)[21] <- "Contract_month_to_month"
colnames(df_final)[22] <- "Contract_one_year"
colnames(df_final)[23] <- "Contract_two_year"
colnames(df_final)[24] <- "Payment_bank_transfer"
colnames(df_final)[25] <- "Payment_credit_card"
colnames(df_final)[26] <- "Payment_electronic_check"
colnames(df_final)[27] <- "Payment_mailed_check"

set.seed(123)
x=sample.split (df_final, SplitRatio= 0.7)
Train= df_final[x, 1:27]
Test= df_final[!x, 1:27]

Fitting all the variables on the training data to get the full model

model <- glm(Churn ~ ., family = binomial(link="logit"), data = Train);
summary(model)

## 
## Call:
## glm(formula = Churn ~ ., family = binomial(link = "logit"), data = Train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8969  -0.6780  -0.3014   0.7238   3.3806  
## 
## Coefficients: (3 not defined because of singularities)
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               3.059e+00  1.942e+00   1.575 0.115150    
## tenure                   -5.623e-02  7.535e-03  -7.463 8.43e-14 ***
## MonthlyCharges           -7.101e-02  3.837e-02  -1.850 0.064260 .  
## TotalCharges              2.558e-04  8.512e-05   3.006 0.002649 ** 
## gender                   -4.326e-02  7.932e-02  -0.545 0.585512    
## SeniorCitizen             1.630e-01  1.041e-01   1.566 0.117422    
## Partner                   1.383e-01  9.563e-02   1.446 0.148129    
## Dependents               -2.338e-01  1.100e-01  -2.125 0.033547 *  
## PhoneService              9.277e-01  7.870e-01   1.179 0.238490    
## MultipleLines             6.221e-01  2.146e-01   2.899 0.003740 ** 
## OnlineSecurity           -8.279e-02  2.159e-01  -0.383 0.701367    
## OnlineBackup              2.309e-01  2.120e-01   1.089 0.276151    
## DeviceProtection          3.475e-01  2.145e-01   1.620 0.105216    
## TechSupport               4.375e-02  2.168e-01   0.202 0.840046    
## StreamingTV               8.913e-01  3.936e-01   2.265 0.023538 *  
## StreamingMovies           9.470e-01  3.959e-01   2.392 0.016753 *  
## PaperlessBilling          3.137e-01  9.160e-02   3.425 0.000615 ***
## InternetService_0        -5.244e+00  1.927e+00  -2.721 0.006501 ** 
## InternetService_DSL      -2.615e+00  9.644e-01  -2.711 0.006700 ** 
## FiberOptic                       NA         NA      NA       NA    
## Contract_month_to_month   1.167e+00  2.094e-01   5.575 2.47e-08 ***
## Contract_one_year         5.844e-01  2.066e-01   2.828 0.004682 ** 
## Contract_two_year                NA         NA      NA       NA    
## Payment_bank_transfer     6.516e-02  1.429e-01   0.456 0.648497    
## Payment_credit_card       6.787e-02  1.425e-01   0.476 0.633881    
## Payment_electronic_check  3.879e-01  1.202e-01   3.228 0.001247 ** 
## Payment_mailed_check             NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 5428.3  on 4686  degrees of freedom
## Residual deviance: 3909.5  on 4663  degrees of freedom
## AIC: 3957.5
## 
## Number of Fisher Scoring iterations: 6

Predicting the churn using the fitted model

Test$model_prob <- predict(model, Test, type = "response")
Test <- Test  %>% mutate(model_pred = 1*(model_prob > .5) + 0,
                                 visit_binary = 1*(Churn == 1) + 0)

Checking the accuracy of the full model

Test <- Test %>% mutate(accurate = 1*(model_pred == visit_binary))
sum(Test$accurate, na.rm = T)/nrow(Test)

## [1] 0.7991471

Hence the accuracy of the logistic regression- full model is 79.9%

Calculating the accuracy for the reduced model determined using AIC

Test$model_prob <- predict(model_AIC, Test, type = "response")
Test <- Test  %>% mutate(model_pred = 1*(model_prob > .5) + 0,
                                 visit_binary = 1*(Churn == 1) + 0)

Test <- Test %>% mutate(accurate = 1*(model_pred == visit_binary))
sum(Test$accurate, na.rm = T)/nrow(Test)

## [1] 0.7987207

The accuracy of the model is now 79.87% when only retaining the variables decided by AIC

VIF is used for checking the multicollinearity between the variables selected by AIC

vif(model_AIC)

##                   tenure           MonthlyCharges             TotalCharges 
##                15.490829                82.076936                20.278638 
##            SeniorCitizen                  Partner               Dependents 
##                 1.138209                 1.401656                 1.300379 
##             PhoneService            MultipleLines             OnlineBackup 
##                 5.431540                 2.120849                 1.893234 
##         DeviceProtection              StreamingTV          StreamingMovies 
##                 1.986773                 4.251463                 4.426197 
##         PaperlessBilling        InternetService_0      InternetService_DSL 
##                 1.118060                26.599814                14.879846 
##  Contract_month_to_month        Contract_one_year Payment_electronic_check 
##                 4.475069                 3.451895                 1.140161

Variables removed:

Removing Total charges from the model since it is highly correlated to Monthly Charges, but is less significant
Removing Streaming Movies since it is explained by Streaming TV.
No Internet service is also highly correlated to DSL, but DSL is greatly less significant
Contract_one year is correlated to Contract_Month-to-month
Phone service is removed due to low significance and high VIF value

model_vif <- glm(formula = Churn ~ tenure + MonthlyCharges + SeniorCitizen + 
    InternetService_0 + OnlineBackup + DeviceProtection + StreamingTV+
    OnlineSecurity + OnlineBackup + TechSupport + MultipleLines +
    + `Contract_month_to_month` + PaperlessBilling + 
    `Payment_electronic_check`, family = "binomial", data = Train)
summary(model_vif)

## 
## Call:
## glm(formula = Churn ~ tenure + MonthlyCharges + SeniorCitizen + 
##     InternetService_0 + OnlineBackup + DeviceProtection + StreamingTV + 
##     OnlineSecurity + OnlineBackup + TechSupport + MultipleLines + 
##     +Contract_month_to_month + PaperlessBilling + Payment_electronic_check, 
##     family = "binomial", data = Train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9639  -0.6508  -0.3329   0.6998   3.0207  
## 
## Coefficients:
##                           Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              -2.318793   0.214849 -10.793  < 2e-16 ***
## tenure                   -0.033959   0.002724 -12.465  < 2e-16 ***
## MonthlyCharges            0.023140   0.002862   8.086 6.15e-16 ***
## SeniorCitizen             0.264976   0.101608   2.608 0.009112 ** 
## InternetService_0        -0.677124   0.187495  -3.611 0.000305 ***
## OnlineBackup             -0.197324   0.092709  -2.128 0.033302 *  
## DeviceProtection         -0.060840   0.096472  -0.631 0.528269    
## StreamingTV               0.071674   0.101641   0.705 0.480705    
## OnlineSecurity           -0.576618   0.100596  -5.732 9.92e-09 ***
## TechSupport              -0.445492   0.101351  -4.396 1.11e-05 ***
## MultipleLines             0.144893   0.098591   1.470 0.141659    
## Contract_month_to_month   0.757402   0.122975   6.159 7.32e-10 ***
## PaperlessBilling          0.354271   0.090412   3.918 8.91e-05 ***
## Payment_electronic_check  0.403909   0.083875   4.816 1.47e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 5428.3  on 4686  degrees of freedom
## Residual deviance: 3971.5  on 4673  degrees of freedom
## AIC: 3999.5
## 
## Number of Fisher Scoring iterations: 5

vif(model_vif)

##                   tenure           MonthlyCharges            SeniorCitizen 
##                 2.098769                 3.570934                 1.082891 
##        InternetService_0             OnlineBackup         DeviceProtection 
##                 1.879425                 1.220498                 1.325454 
##              StreamingTV           OnlineSecurity              TechSupport 
##                 1.621973                 1.110586                 1.169765 
##            MultipleLines  Contract_month_to_month         PaperlessBilling 
##                 1.545203                 1.596986                 1.107892 
## Payment_electronic_check 
##                 1.128538

Fitting another model determined using the variance inflation factor

model_vif2 <- glm(formula = Churn ~ tenure + MonthlyCharges + SeniorCitizen + 
    InternetService_0 + OnlineBackup + OnlineSecurity + OnlineBackup + TechSupport + 
    + `Contract_month_to_month` + PaperlessBilling + 
    `Payment_electronic_check`, family = "binomial", data = Train)
final <- model_vif2
summary(model_vif2)

## 
## Call:
## glm(formula = Churn ~ tenure + MonthlyCharges + SeniorCitizen + 
##     InternetService_0 + OnlineBackup + OnlineSecurity + OnlineBackup + 
##     TechSupport + +Contract_month_to_month + PaperlessBilling + 
##     Payment_electronic_check, family = "binomial", data = Train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9193  -0.6541  -0.3314   0.7139   3.0074  
## 
## Coefficients:
##                           Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              -2.421909   0.204018 -11.871  < 2e-16 ***
## tenure                   -0.033237   0.002615 -12.710  < 2e-16 ***
## MonthlyCharges            0.025167   0.002268  11.098  < 2e-16 ***
## SeniorCitizen             0.268023   0.101434   2.642  0.00823 ** 
## InternetService_0        -0.615371   0.183543  -3.353  0.00080 ***
## OnlineBackup             -0.205191   0.092483  -2.219  0.02651 *  
## OnlineSecurity           -0.583323   0.100285  -5.817 6.00e-09 ***
## TechSupport              -0.459340   0.100753  -4.559 5.14e-06 ***
## Contract_month_to_month   0.769366   0.121407   6.337 2.34e-10 ***
## PaperlessBilling          0.361056   0.090321   3.997 6.40e-05 ***
## Payment_electronic_check  0.405412   0.083700   4.844 1.27e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 5428.3  on 4686  degrees of freedom
## Residual deviance: 3974.4  on 4676  degrees of freedom
## AIC: 3996.4
## 
## Number of Fisher Scoring iterations: 5

vif(model_vif2)

##                   tenure           MonthlyCharges            SeniorCitizen 
##                 1.934739                 2.245331                 1.081630 
##        InternetService_0             OnlineBackup           OnlineSecurity 
##                 1.800568                 1.215665                 1.104441 
##              TechSupport  Contract_month_to_month         PaperlessBilling 
##                 1.156389                 1.556317                 1.105920 
## Payment_electronic_check 
##                 1.124904

Hence, variables selected in our final model:

tenure
MonthlyCharges
SeniorCitizen
InternetService
OnlineBackup
OnlineSecurity
TechSupport
Contract
PaperlessBilling
PaymentMethod

predicted <- predict(final, type = "response", newdata = Test[,-17])
Test$prob <- predicted
predicted_churn <- factor(ifelse(predicted >= 0.50, "Yes", "No"))
actual_churn <- factor(ifelse(Test$Churn==1,"Yes","No"))

Get the values of accuracy, sensitivity and specificity for cutoff = 0.5

cutoff_churn <- factor(ifelse(predicted >=0.50, "Yes", "No"))
confusionMatrix_final <- confusionMatrix(cutoff_churn, actual_churn, positive = "Yes")
accuracy <- confusionMatrix_final$overall[1]
sensitivity <- confusionMatrix_final$byClass[1]
specificity <- confusionMatrix_final$byClass[2]
accuracy

##  Accuracy 
## 0.7970149

sensitivity

## Sensitivity 
##   0.5280899

specificity

## Specificity 
##   0.8943089

Hence, the obtained values are:

Accuracy = 79.7%
Sensitivity = 52.8%
Specificity = 89.4%

perform_fn <- function(cutoff) 
{
  predicted_churn <- factor(ifelse(predicted >= cutoff, "Yes", "No"))
  conf <- confusionMatrix(predicted_churn, actual_churn, positive = "Yes")
  accuray <- conf$overall[1]
  sensitivity <- conf$byClass[1]
  specificity <- conf$byClass[2]
  out <- t(as.matrix(c(sensitivity, specificity, accuray))) 
  colnames(out) <- c("sensitivity", "specificity", "accuracy")
  return(out)
}

Plotting the value of these three metrics for different cutoff values

s = seq(0.01,0.90,length=100)
OUT = matrix(0,100,3)

for(i in 1:100)
{
  OUT[i,] = perform_fn(s[i])
} 

plot(s, OUT[,1],xlab="Cutoff",ylab="Value",axes=FALSE,col=2,type="l")
axis(1,seq(0,1,length=5),seq(0,1,length=5),cex.lab=1.5)
axis(2,seq(0,1,length=5),seq(0,1,length=5),cex.lab=1.5)
lines(s,OUT[,2],col='black',lwd=2)
lines(s,OUT[,3],col=4,lwd=2)
box()
legend("bottom",col=c(2,"black",4,"red")
       ,lwd=c(2,2,2,2),c("Sensitivity","Specificity","Accuracy"))
axis(1, at = seq(0.1, 1, by = 0.1))

Hence, the cutoff value is taken to be = 0.3 as that is where accuracy, specificity and sensitivity intersect. This gives a high value for all three parameters

cutoff_churn <- factor(ifelse(predicted >=0.3, "Yes", "No"))
conf_final <- confusionMatrix(cutoff_churn, actual_churn, positive = "Yes")
accuracy <- conf_final$overall[1]
sensitivity <- conf_final$byClass[1]
specificity <- conf_final$byClass[2]
accuracy

##  Accuracy 
## 0.7688699

sensitivity

## Sensitivity 
##   0.7495987

specificity

## Specificity 
##    0.775842

Hence the obtained values are:

Accuracy = 76.9%
Sensitivity = 75.0%
Specificity = 77.6%

Get the Area under the curve value

library(pROC)

## Type 'citation("pROC")' for a citation.

## 
## Attaching package: 'pROC'

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

model_final.roc <- roc(response = Test$Churn, predictor = as.numeric(predicted))

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

plot(model_final.roc, print.auc = TRUE,legacy.axes = TRUE, col='blue')

Training a random forest model

Train$Churn <- as.character(Train$Churn)
Train$Churn <- as.factor(Train$Churn)
model.rf <- randomForest(Churn ~ ., data=Train, proximity=FALSE,importance = FALSE,
                        ntree=500,mtry=4, do.trace=FALSE)
model.rf

## 
## Call:
##  randomForest(formula = Churn ~ ., data = Train, proximity = FALSE,      importance = FALSE, ntree = 500, mtry = 4, do.trace = FALSE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 4
## 
##         OOB estimate of  error rate: 20.03%
## Confusion matrix:
##      0   1 class.error
## 0 3075 366   0.1063644
## 1  573 673   0.4598716

The ten most important variables according to the Random Forest

varImpPlot(model.rf, main="Random Forest Variable importance", n.var = 10)

Plotting the ROC curve for the random forest

predicted <- predict(model.rf, type = "response", newdata = Test[,-17])
model.rf.roc <- roc(response = Test$Churn, predictor = as.numeric(predicted))

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

plot(model.rf.roc, print.auc = TRUE,legacy.axes = TRUE, col='red')

Area under ROC curve is lesser for a random forest model as compared to the logistic regression case

Hence, logistic regression model with cutoff = 0.3 is the best model for this data