Predicting Telecom Customer Churn using Logistic Regression

# Setting the working directory
setwd("G:\\IIMK DABS\\Session Files\\Datasets and R files")
getwd()

## [1] "G:/IIMK DABS/Session Files/Datasets and R files"

# Loading data and converting variables into proper classes
data <- read.csv("telecom_churn.csv", header = T)
sum(is.na(data))

## [1] 0

data$Churn <- as.factor(data$Churn)
data$ContractRenewal <- as.factor(data$ContractRenewal)
data$DataPlan <- as.factor(data$DataPlan)

# Loading necessary packages
library(caret)
library(ROSE)
library(car)
library(pROC)

# Estimation using the data
model <- glm(Churn~., data = data, family = binomial)
summary(model)

## 
## Call:
## glm(formula = Churn ~ ., family = binomial, data = data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0058  -0.5112  -0.3477  -0.2093   2.9981  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -5.9510252  0.5486763 -10.846  < 2e-16 ***
## AccountWeeks      0.0006525  0.0013873   0.470 0.638112    
## ContractRenewal1 -1.9855172  0.1436107 -13.826  < 2e-16 ***
## DataPlan1        -1.1841611  0.5363668  -2.208 0.027262 *  
## DataUsage         0.3636565  1.9231751   0.189 0.850021    
## CustServCalls     0.5081349  0.0389682  13.040  < 2e-16 ***
## DayMins           0.0174407  0.0324841   0.537 0.591337    
## DayCalls          0.0036523  0.0027497   1.328 0.184097    
## MonthlyCharge    -0.0275526  0.1909074  -0.144 0.885244    
## OverageFee        0.1868114  0.3256902   0.574 0.566248    
## RoamMins          0.0789226  0.0220522   3.579 0.000345 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2758.3  on 3332  degrees of freedom
## Residual deviance: 2188.4  on 3322  degrees of freedom
## AIC: 2210.4
## 
## Number of Fisher Scoring iterations: 5

# Stepwise Regression to identify significant variables
model1 <- step(model, trace = F)
summary(model1)

## 
## Call:
## glm(formula = Churn ~ ContractRenewal + DataPlan + CustServCalls + 
##     DayMins + OverageFee + RoamMins, family = binomial, data = data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9932  -0.5154  -0.3480  -0.2095   2.9906  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -5.552897   0.432757 -12.831  < 2e-16 ***
## ContractRenewal1 -1.989219   0.143452 -13.867  < 2e-16 ***
## DataPlan1        -0.934814   0.144015  -6.491 8.52e-11 ***
## CustServCalls     0.505651   0.038834  13.021  < 2e-16 ***
## DayMins           0.012774   0.001073  11.907  < 2e-16 ***
## OverageFee        0.138612   0.022648   6.120 9.34e-10 ***
## RoamMins          0.083476   0.020304   4.111 3.93e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2758.3  on 3332  degrees of freedom
## Residual deviance: 2190.6  on 3326  degrees of freedom
## AIC: 2204.6
## 
## Number of Fisher Scoring iterations: 5

# Obtaining model coefficients to understand the direction of independent variables
coefficients(model1)

##      (Intercept) ContractRenewal1        DataPlan1    CustServCalls 
##      -5.55289657      -1.98921921      -0.93481403       0.50565134 
##          DayMins       OverageFee         RoamMins 
##       0.01277352       0.13861170       0.08347648

# Obtaining odds ratio to understand the magnitude of impact of independent variables
exp(coefficients(model1))

##      (Intercept) ContractRenewal1        DataPlan1    CustServCalls 
##      0.003876213      0.136802198      0.392658882      1.658065131 
##          DayMins       OverageFee         RoamMins 
##      1.012855446      1.148677979      1.087059649

# Checking for multicollinearity (variable inflation factor)
vif(model1)

## ContractRenewal        DataPlan   CustServCalls         DayMins      OverageFee 
##        1.056179        1.018476        1.076219        1.039028        1.022948 
##        RoamMins 
##        1.010395

# Using caret package to create data partition
set.seed(1234)
index <- createDataPartition(data$Churn, p=0.8, list = F)
train_data <- data [index,]
prop.table(table(train_data$Churn))

## 
##         0         1 
## 0.8548931 0.1451069

test_data <- data [-index,]
prop.table(table(test_data$Churn))

## 
##         0         1 
## 0.8558559 0.1441441

# Training the model using train data
model3 <- glm(Churn~., train_data, family = binomial)

# Stepwise Regression to identify significant variables
model4 <- step(model3, trace = F)

# predicting the test data using the model
predict <- predict(model4, newdata = test_data, type= "response")
test_data$predicted <- predict
test_data$class <- ifelse(test_data$predicted >= 0.5, 1, 0)

test_data$Churn <- as.factor(test_data$Churn)
test_data$class <- as.factor(test_data$class)

# Creating confusion matrix to identify accuracy and sensitivity
confusionMatrix(test_data$class, test_data$Churn, positive = "1")

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 561  85
##          1   9  11
##                                           
##                Accuracy : 0.8589          
##                  95% CI : (0.8301, 0.8844)
##     No Information Rate : 0.8559          
##     P-Value [Acc > NIR] : 0.4393          
##                                           
##                   Kappa : 0.1473          
##                                           
##  Mcnemar's Test P-Value : 1.029e-14       
##                                           
##             Sensitivity : 0.11458         
##             Specificity : 0.98421         
##          Pos Pred Value : 0.55000         
##          Neg Pred Value : 0.86842         
##              Prevalence : 0.14414         
##          Detection Rate : 0.01652         
##    Detection Prevalence : 0.03003         
##       Balanced Accuracy : 0.54940         
##                                           
##        'Positive' Class : 1               
##

The sensitivity value is very low. Sensitivity is the True positive rate. The value of sensitivity is low because the data is imbalanced. The proportion of event class and non-event class in the dependent variable is not same. In order to balance that, data balancing needs to be done. This can be done using ROSE package and creating over-sampled, under-sampled and ‘both’ sampled data

set.seed(1234)
over<- ovun.sample(Churn~., data = train_data, method = "over", N = 4560)$data
table(over$Churn)

## 
##    0    1 
## 2280 2280

set.seed(1234)
under<- ovun.sample(Churn~., data = train_data, method = "under", N = 774)$data
table (under$Churn)

## 
##   0   1 
## 387 387

both<- ovun.sample(Churn~., data = train_data, method = "both", p = .50, seed = 1234, N = 2667 )$data
prop.table (table (both$Churn))

## 
##         0         1 
## 0.4983127 0.5016873

# Creating model using over-sampled data, predicitng test data using the model and creating confusion matrix

model5 <- glm (Churn~., family = binomial, data = over)
predicted1 <- predict (model5, newdata = test_data, type = "response")
class1 <- ifelse (predicted1 >= .50, 1, 0)
class1 <- factor (class1)
confusionMatrix(class1, test_data$Churn, positive = "1")

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 455  24
##          1 115  72
##                                           
##                Accuracy : 0.7913          
##                  95% CI : (0.7584, 0.8216)
##     No Information Rate : 0.8559          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.3933          
##                                           
##  Mcnemar's Test P-Value : 2.281e-14       
##                                           
##             Sensitivity : 0.7500          
##             Specificity : 0.7982          
##          Pos Pred Value : 0.3850          
##          Neg Pred Value : 0.9499          
##              Prevalence : 0.1441          
##          Detection Rate : 0.1081          
##    Detection Prevalence : 0.2808          
##       Balanced Accuracy : 0.7741          
##                                           
##        'Positive' Class : 1               
##

# Creating model using under-sampled data, predicitng test data using the model and creating confusion matrix

model6 <- glm (Churn~., family = binomial, data = under)
predicted2 <- predict (model6, newdata = test_data, type = "response")
class2 <- ifelse (predicted2 >= .50, 1, 0)
class2 <- factor (class2)
confusionMatrix(class2, test_data$Churn, positive = "1")

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 457  26
##          1 113  70
##                                           
##                Accuracy : 0.7913          
##                  95% CI : (0.7584, 0.8216)
##     No Information Rate : 0.8559          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.3856          
##                                           
##  Mcnemar's Test P-Value : 2.999e-13       
##                                           
##             Sensitivity : 0.7292          
##             Specificity : 0.8018          
##          Pos Pred Value : 0.3825          
##          Neg Pred Value : 0.9462          
##              Prevalence : 0.1441          
##          Detection Rate : 0.1051          
##    Detection Prevalence : 0.2748          
##       Balanced Accuracy : 0.7655          
##                                           
##        'Positive' Class : 1               
##

# Creating model using under-sampled data, predicitng test data using the model and creating confusion matrix

model7 <- glm (Churn~., family = binomial, data = both)
predicted3 <- predict (model7, newdata = test_data, type = "response")
class3 <- ifelse (predicted3 >= .50, 1, 0)
class3 <- factor (class3)
confusionMatrix(class3, test_data$Churn, positive = "1")

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 455  24
##          1 115  72
##                                           
##                Accuracy : 0.7913          
##                  95% CI : (0.7584, 0.8216)
##     No Information Rate : 0.8559          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.3933          
##                                           
##  Mcnemar's Test P-Value : 2.281e-14       
##                                           
##             Sensitivity : 0.7500          
##             Specificity : 0.7982          
##          Pos Pred Value : 0.3850          
##          Neg Pred Value : 0.9499          
##              Prevalence : 0.1441          
##          Detection Rate : 0.1081          
##    Detection Prevalence : 0.2808          
##       Balanced Accuracy : 0.7741          
##                                           
##        'Positive' Class : 1               
##

# Using library proc, creating ROC curve and calculatng AUCvalues for all 3 models
roc(test_data$Churn, predicted1, plot=TRUE, legacy.axes=TRUE, percent=TRUE, xlab="False Positive Percentage", ylab="True Postive Percentage", col="green4", lwd=2, print.auc=TRUE)

## 
## Call:
## roc.default(response = test_data$Churn, predictor = predicted1,     percent = TRUE, plot = TRUE, legacy.axes = TRUE, xlab = "False Positive Percentage",     ylab = "True Postive Percentage", col = "green4", lwd = 2,     print.auc = TRUE)
## 
## Data: predicted1 in 570 controls (test_data$Churn 0) < 96 cases (test_data$Churn 1).
## Area under the curve: 82.93%

plot.roc(test_data$Churn, predicted2, percent=TRUE, col="red", lwd=2, print.auc=TRUE, add=TRUE, print.auc.y=30)
plot.roc(test_data$Churn, predicted3, percent=TRUE, col ="blue", lwd=2,print.auc=TRUE, add=TRUE,print.auc.y=40)
legend("topright", bg ="transparent", legend=c("Over sampled", "Under sampled", "Both"), col=c("green4", "red","blue"), cex = .5,lwd=2)

Predicting Telecom Customer Churn using Logistic Regression

Aravind S

16/09/2022