Predicting customer churn at QWE Inc

Group 1

# Importing Data

  library(ISLR)
  # reading inbuilt data as data frame
  churn <- read.csv(paste("ChurnData.csv", sep =))

Number of rows and columns

nrow(churn)
[1] 6347
ncol(churn)
[1] 13

Data Summary

summary(churn)
       ID            Age       Churn          CHIDec        CHIChange     
 Min.   :   1   Min.   : 1.0   No :6024   Min.   :  1.0   Min.   :  1.00  
 1st Qu.:1588   1st Qu.: 8.0   Yes: 323   1st Qu.: 16.0   1st Qu.:  2.00  
 Median :3174   Median :18.0              Median : 73.0   Median : 49.00  
 Mean   :3174   Mean   :24.5              Mean   :103.1   Mean   : 80.39  
 3rd Qu.:4760   3rd Qu.:38.0              3rd Qu.:204.0   3rd Qu.:160.50  
 Max.   :6347   Max.   :61.0              Max.   :263.0   Max.   :242.00  
 SupportCasesDec  SupportCasesChange SupportPriorityDec
 Min.   : 1.000   Min.   : 1.000     Min.   : 1.000    
 1st Qu.: 1.000   1st Qu.: 1.000     1st Qu.: 1.000    
 Median : 1.000   Median : 1.000     Median : 1.000    
 Mean   : 3.032   Mean   : 6.419     Mean   : 3.604    
 3rd Qu.: 2.000   3rd Qu.:11.000     3rd Qu.: 8.000    
 Max.   :21.000   Max.   :37.000     Max.   :17.000    
 SupportPriorityChange  LoginsChange   BlogPostsChange   ViewsChange    
 Min.   : 1.000        Min.   :  1.0   Min.   : 1.000   Min.   :   1.0  
 1st Qu.: 1.000        1st Qu.: 12.0   1st Qu.: 1.000   1st Qu.:   1.0  
 Median : 1.000        Median : 98.0   Median : 1.000   Median : 372.0  
 Mean   : 8.794        Mean   :115.9   Mean   : 9.944   Mean   : 471.7  
 3rd Qu.:17.000        3rd Qu.:210.0   3rd Qu.:15.000   3rd Qu.: 877.0  
 Max.   :46.000        Max.   :294.0   Max.   :57.000   Max.   :1360.0  
 DaysSinceLoginChange
 Min.   :  1.0       
 1st Qu.:  1.0       
 Median : 11.0       
 Mean   : 52.8       
 3rd Qu.:121.0       
 Max.   :143.0       
table(churn$Churn)

  No  Yes 
6024  323 
count <-table(churn$Churn)
View(count)

Data Types

str(churn)
'data.frame':   6347 obs. of  13 variables:
 $ ID                   : int  1 1112 2223 3334 4445 5556 6015 6126 6237 1002 ...
 $ Age                  : int  58 58 52 57 54 55 54 42 53 53 ...
 $ Churn                : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
 $ CHIDec               : int  1 223 1 149 202 45 92 21 240 240 ...
 $ CHIChange            : int  1 179 1 95 2 3 53 8 75 39 ...
 $ SupportCasesDec      : int  1 1 1 2 1 1 2 1 2 1 ...
 $ SupportCasesChange   : int  1 1 1 2 1 1 20 1 11 1 ...
 $ SupportPriorityDec   : int  1 1 1 11 1 1 11 1 11 1 ...
 $ SupportPriorityChange: int  1 1 1 1 1 1 42 1 1 1 ...
 $ LoginsChange         : int  1 1 1 152 1 228 120 1 84 72 ...
 $ BlogPostsChange      : int  1 1 1 23 1 1 2 1 25 1 ...
 $ ViewsChange          : int  1 145 1 887 1331 319 1334 1044 1 989 ...
 $ DaysSinceLoginChange : int  134 134 134 1 134 1 1 139 141 115 ...

Data Visualization - Discrete

Pie chart of customers (percentage) who churned/not churned

count

  No  Yes 
6024  323 
count_percent <-count*100/6347
count_percent

       No       Yes 
94.910982  5.089018 
pie(count_percent, col = c("red","blue"))

plot of chunk unnamed-chunk-5

Bar plot of customers (percentage) who churned/not churned

barplot(count_percent, xlab = "Churn", ylab = "Percentage", las = 1, col = c("red","blue") )

plot of chunk unnamed-chunk-6

Data Visualization - Continuous

Box Plot of Age and Churn

boxplot(Age ~ Churn, data = churn, horizontal = TRUE, col = "blue", xlab = "Age", ylab = "churn")

plot of chunk unnamed-chunk-7

Box Plot of CHI and Churn

boxplot(CHIDec ~ Churn, data = churn, horizontal = TRUE, col = "blue", xlab = "CHI", ylab = "churn")

plot of chunk unnamed-chunk-8

Box Plot of CHI Chnage and Churn

boxplot(CHIChange ~ Churn, data = churn, horizontal = TRUE, col = "blue", xlab = "CHI Change", ylab = "churn")

plot of chunk unnamed-chunk-9

Box Plot of Support Cases and Churn

boxplot(SupportCasesDec ~ Churn, data = churn, horizontal = TRUE, col = "blue", xlab = "NO. of Suport Cases", ylab = "churn")

plot of chunk unnamed-chunk-10

Box Plot of Support Cases Change and Churn

boxplot(SupportCasesChange ~ Churn, data = churn, horizontal = TRUE, col = "blue", xlab = "CHnage in Support Cases", ylab = "churn")

plot of chunk unnamed-chunk-11

Box Plot of Support Priority and Churn

boxplot(SupportPriorityDec ~ Churn, data = churn, horizontal = TRUE, col = "blue", xlab = "Support priority type", ylab = "churn")

plot of chunk unnamed-chunk-12

Box Plot of Support Priority Change and Churn

boxplot(SupportPriorityChange ~ Churn, data = churn, horizontal = TRUE, col = "blue", xlab = "Support priority Change", ylab = "churn")

plot of chunk unnamed-chunk-13

Box Plot of Logins Change and Churn

boxplot(LoginsChange ~ Churn, data = churn, horizontal = TRUE, col = "blue", xlab = "No. of Logins change", ylab = "churn")

plot of chunk unnamed-chunk-14

Box Plot of Blog Posts Change and Churn

boxplot(BlogPostsChange ~ Churn, data = churn, horizontal = TRUE, col = "blue", xlab = "Blog Posts Change", ylab = "churn")

plot of chunk unnamed-chunk-15

Box Plot of Views Change and Churn

boxplot(ViewsChange ~ Churn, data = churn, horizontal = TRUE, col = "blue", xlab = "No. of views change", ylab = "churn")

plot of chunk unnamed-chunk-16

Box Plot of Days Since Login Change and Churn

boxplot(DaysSinceLoginChange ~ Churn, data = churn, horizontal = TRUE, col = "blue", xlab = "No. of days since login change", ylab = "churn")

plot of chunk unnamed-chunk-17

Data Preparation

Creating Training and Testing Data

library(caTools)
# use set.seed to use the same random number sequence
set.seed(123)
# craeting 70% data for training 
split <- sample.split(churn$Churn, SplitRatio = 0.70)
trainData <- subset(churn, split == TRUE)
# dimensions of training data
dim(trainData)
[1] 4443   13
# creating 30% data for testing
testData <- subset(churn, split == FALSE)
# dimensions of testing data
dim(testData)
[1] 1904   13

Classification using Binomial Logistic Model

Classifier 1

logitClassifier1 <- glm(Churn ~ DaysSinceLoginChange,
data = trainData, 
family = binomial())
summary(logitClassifier1)

Call:
glm(formula = Churn ~ DaysSinceLoginChange, family = binomial(), 
    data = trainData)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-0.4393  -0.3991  -0.2508  -0.2498   2.6392  

Coefficients:
                      Estimate Std. Error z value Pr(>|z|)    
(Intercept)          -3.459623   0.113939 -30.364  < 2e-16 ***
DaysSinceLoginChange  0.008183   0.001163   7.037 1.96e-12 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1786.6  on 4442  degrees of freedom
Residual deviance: 1735.7  on 4441  degrees of freedom
AIC: 1739.7

Number of Fisher Scoring iterations: 6

Classifier 2

logitClassifier2 <- glm(Churn ~ Age,
data = trainData, 
family = binomial())
summary(logitClassifier2)

Call:
glm(formula = Churn ~ Age, family = binomial(), data = trainData)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-0.3506  -0.3405  -0.3259  -0.2972   2.5495  

Coefficients:
             Estimate Std. Error z value Pr(>|z|)    
(Intercept) -2.750736   0.105856 -25.986   <2e-16 ***
Age         -0.007537   0.003664  -2.057   0.0397 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1786.6  on 4442  degrees of freedom
Residual deviance: 1782.2  on 4441  degrees of freedom
AIC: 1786.2

Number of Fisher Scoring iterations: 5

Classifier 3

logitClassifier3 <- glm(Churn ~ Age + CHIDec + CHIChange + SupportCasesDec + SupportCasesChange + SupportPriorityDec + SupportPriorityChange + LoginsChange + BlogPostsChange + ViewsChange + DaysSinceLoginChange, 
data = trainData, 
family = binomial())
summary(logitClassifier3)

Call:
glm(formula = Churn ~ Age + CHIDec + CHIChange + SupportCasesDec + 
    SupportCasesChange + SupportPriorityDec + SupportPriorityChange + 
    LoginsChange + BlogPostsChange + ViewsChange + DaysSinceLoginChange, 
    family = binomial(), data = trainData)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-0.5818  -0.3653  -0.2924  -0.2260   2.9497  

Coefficients:
                        Estimate Std. Error z value Pr(>|z|)    
(Intercept)           -3.0004030  0.1661893 -18.054  < 2e-16 ***
Age                   -0.0078014  0.0037800  -2.064   0.0390 *  
CHIDec                 0.0002225  0.0007690   0.289   0.7724    
CHIChange             -0.0024404  0.0011366  -2.147   0.0318 *  
SupportCasesDec       -0.0164333  0.0314598  -0.522   0.6014    
SupportCasesChange    -0.0067348  0.0191888  -0.351   0.7256    
SupportPriorityDec    -0.0133276  0.0289308  -0.461   0.6450    
SupportPriorityChange  0.0069379  0.0093564   0.742   0.4584    
LoginsChange          -0.0008033  0.0008359  -0.961   0.3365    
BlogPostsChange       -0.0050106  0.0062331  -0.804   0.4215    
ViewsChange            0.0001539  0.0001613   0.954   0.3400    
DaysSinceLoginChange   0.0080484  0.0012157   6.620 3.58e-11 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1786.6  on 4442  degrees of freedom
Residual deviance: 1709.8  on 4431  degrees of freedom
AIC: 1733.8

Number of Fisher Scoring iterations: 6

Prediction

Predicting Test Data results using Classifier 1

predProbClass1 <- predict(logitClassifier1, type = 'response', newdata = testData[-3])
#table(predProbClass1)
yPred1 <- ifelse(predProbClass1 > 0.5, "Yes", "No")
table(yPred1)
yPred1
  No 
1904 

Predicting Test Data results using Classifier 2

predProbClass2 <- predict(logitClassifier2, type = 'response', newdata = testData[-3])
yPred2 <- ifelse(predProbClass2 > 0.5, "Yes", "No")
table(yPred2)
yPred2
  No 
1904 

Predicting Test Data results using Classifier 3

predProbClass3 <- predict(logitClassifier3, type = 'response', newdata = testData[-3])
yPred3 <- ifelse(predProbClass3 > 0.5, "Yes", "No")
table(yPred3)
yPred3
  No 
1904 

Confusion Matrix

Classifier 1

confMatrix1 <- table(yActual = testData[, 3], yPred1)
confMatrix1
       yPred1
yActual   No
    No  1807
    Yes   97

Classifier 2

confMatrix2 <- table(yActual = testData[, 3], yPred2)
confMatrix2
       yPred2
yActual   No
    No  1807
    Yes   97

Classifier 3

confMatrix3 <- table(yActual = testData[, 3], yPred3)
confMatrix3
       yPred3
yActual   No
    No  1807
    Yes   97

Accuracy

#Model 1
library(MLmetrics)
Accuracy(y_pred = yPred1, y_true = testData$Churn)
[1] 0.9490546
#Model 2
Accuracy(y_pred = yPred2, y_true = testData$Churn)
[1] 0.9490546
#Model 3
Accuracy(y_pred = yPred3, y_true = testData$Churn)
[1] 0.9490546

ROC Plot

library(ROCR)
#Every classifier evaluation using ROCR starts with creating a prediction object. This function is used to transform the input data into a standardized format.
PredictObject1 <- prediction(predProbClass3, testData$Churn)

# All kinds of predictor evaluations are performed using the performance function
PerformObject1 <- performance(PredictObject1, "tpr","fpr")

# Plot the ROC Curve for Credit Card Default
plot(PerformObject1, main = "ROC Curve for Churn", col = "black", lwd = 2)
abline(a = 0,b = 1, lwd = 2, lty = 3, col = "black")

plot of chunk unnamed-chunk-29

Decision Tree Approach

library(caret)
# data partition
set.seed(2341)
trainIndex <- createDataPartition(churn$Churn, p = 0.70, list = FALSE)
# 70% training data
trainChurn <- churn[trainIndex, ]
# 30% testing data
testChurn <- churn[-trainIndex, ]
table(trainChurn$Churn)

  No  Yes 
4217  227 
table(testChurn$Churn)

  No  Yes 
1807   96 

Information Gain

dTreeInfoGain <- train(Churn ~ ., 
                       data = trainChurn, 
                       method = "rpart", 
                       parms = list(split = "information"), 
                       trControl = trainControl(method = "cv"))
dTreeInfoGain
CART 

4444 samples
  12 predictor
   2 classes: 'No', 'Yes' 

No pre-processing
Resampling: Cross-Validated (10 fold) 
Summary of sample sizes: 3999, 3999, 4000, 4000, 4000, 3999, ... 
Resampling results across tuning parameters:

  cp           Accuracy   Kappa     
  0.002202643  0.9446436  0.11174868
  0.002936858  0.9450931  0.11335721
  0.004405286  0.9475716  0.07675207

Accuracy was used to select the optimal model using the largest value.
The final value used for the model was cp = 0.004405286.

Vizualization

# viasulaziation
library(rpart.plot)
prp(dTreeInfoGain$finalModel, box.palette = "Reds", tweak = 1.2, varlen = 20)

plot of chunk unnamed-chunk-32

Prediction

# prediction of churn = {no, yes} on test data (information gain)
predClassInfoGain <- predict(dTreeInfoGain, testChurn, type = 'raw')
table(predClassInfoGain)
predClassInfoGain
  No  Yes 
1891   12 

Confusion Matrix

# prediction on test data
predClassTestInfoGain <- predict(dTreeInfoGain, 
                         testChurn[, 1:13], 
                         type = 'raw')
# confusion matrix
confusionMatrix(predClassTestInfoGain, testChurn$Churn, 
                positive = "Yes")
Confusion Matrix and Statistics

          Reference
Prediction   No  Yes
       No  1801   90
       Yes    6    6

               Accuracy : 0.9496          
                 95% CI : (0.9387, 0.9589)
    No Information Rate : 0.9496          
    P-Value [Acc > NIR] : 0.5271          

                  Kappa : 0.101           
 Mcnemar's Test P-Value : <2e-16          

            Sensitivity : 0.062500        
            Specificity : 0.996680        
         Pos Pred Value : 0.500000        
         Neg Pred Value : 0.952406        
             Prevalence : 0.050447        
         Detection Rate : 0.003153        
   Detection Prevalence : 0.006306        
      Balanced Accuracy : 0.529590        

       'Positive' Class : Yes             

ROC

library(ROCR)
predClassTestInfoGain2 <- predict(dTreeInfoGain, 
                         testChurn, 
                         type = 'prob')

PredictObject2 <- prediction(predClassTestInfoGain2[2], testChurn$Churn)
PerformObject2 <- performance(PredictObject2, "tpr","fpr")

plot(PerformObject2, main = "ROC Curve for Churn", col = "black", lwd = 2)
abline(a = 0,b = 1, lwd = 2, lty = 3, col = "black")

plot of chunk unnamed-chunk-35

Area Under the Curve

aucInfoGain <- performance(PredictObject2, measure = "auc")
aucInfoGain <- aucInfoGain@y.values[[1]]
aucInfoGain
[1] 0.6458016

Probability of Churn

# predicted probabilities (information gain)
predTestProbInfoGain <- predict(dTreeInfoGain, testChurn, type = "prob")
# plot of probabilities
plot(predTestProbInfoGain[,2], 
     main = "Scatterplot of Probabilities of churn (test data)", 
     xlab = "Customer ID", ylab = "Predicted Probability of Churn")

plot of chunk unnamed-chunk-37

Gini Index

set.seed(4321)
dTreeGiniIndex <- train(Churn ~., 
                        data = trainChurn, 
                        method = "rpart", 
                        parms = list(split = "gini"))
dTreeGiniIndex
CART 

4444 samples
  12 predictor
   2 classes: 'No', 'Yes' 

No pre-processing
Resampling: Bootstrapped (25 reps) 
Summary of sample sizes: 4444, 4444, 4444, 4444, 4444, 4444, ... 
Resampling results across tuning parameters:

  cp           Accuracy   Kappa     
  0.002202643  0.9343059  0.08866110
  0.002936858  0.9366505  0.08679528
  0.004405286  0.9402744  0.08634553

Accuracy was used to select the optimal model using the largest value.
The final value used for the model was cp = 0.004405286.

Visualization

library(rpart.plot)
prp(dTreeGiniIndex$finalModel, box.palette = "Reds", tweak = 1.2, varlen = 20)

plot of chunk unnamed-chunk-39

Prediction

# prediction of Churn = {no, yes} on test data (gini Index)
predClassGiniIndex <- predict(dTreeGiniIndex, testChurn, type = 'raw')
table(predClassGiniIndex)
predClassGiniIndex
  No  Yes 
1903    0 

Confusion Matrix

#confusion matrix (gini index)
confusionMatrix(predClassGiniIndex, testChurn$Churn, positive = "Yes")
Confusion Matrix and Statistics

          Reference
Prediction   No  Yes
       No  1807   96
       Yes    0    0

               Accuracy : 0.9496          
                 95% CI : (0.9387, 0.9589)
    No Information Rate : 0.9496          
    P-Value [Acc > NIR] : 0.5271          

                  Kappa : 0               
 Mcnemar's Test P-Value : <2e-16          

            Sensitivity : 0.00000         
            Specificity : 1.00000         
         Pos Pred Value :     NaN         
         Neg Pred Value : 0.94955         
             Prevalence : 0.05045         
         Detection Rate : 0.00000         
   Detection Prevalence : 0.00000         
      Balanced Accuracy : 0.50000         

       'Positive' Class : Yes             

ROC Plot

library(ROCR)
# prediction
predTestProbGiniIndex <- predict(dTreeGiniIndex, testChurn, type = 'prob')
PredictObjectGiniIndex <- prediction(predTestProbGiniIndex[2], testChurn$Churn)

# performance
PerformObjectGiniIndex <- performance(PredictObjectGiniIndex, "tpr","fpr")

# plot of the ROC curve for Customer Churn
plot(PerformObjectGiniIndex, 
     main = "ROC Curve for Customer Churn (Gini Index)",
     col = "red",
     lwd = 2)
abline(a = 0,b = 1,lwd = 2,lty = 3,col = "black")

plot of chunk unnamed-chunk-42

Area under curve

aucGiniIndex <- performance(PredictObjectGiniIndex, measure = "auc")
aucGiniIndex <- aucGiniIndex@y.values[[1]]
aucGiniIndex
[1] 0.5

Probability of Churn

predTestProbGiniIndex <- predict(dTreeGiniIndex, testChurn, type = "prob")
# plot of probabilities
plot(predTestProbGiniIndex[,2], 
     main = "Scatterplot of Probabilities of Customer Churn (test data)", 
     xlab = "Customer ID", ylab = "Predicted Probability of Customer Churn")

plot of chunk unnamed-chunk-44