Reading the dataset

cellphonedata<-read.csv("D:/Freelancer_questions/cellphone_churn/Cellphone_convt.csv", header = TRUE)
str(cellphonedata)
## 'data.frame':    3333 obs. of  11 variables:
##  $ Churn          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ AccountWeeks   : int  128 107 137 84 75 118 121 147 117 141 ...
##  $ ContractRenewal: int  1 1 1 0 0 0 1 0 1 0 ...
##  $ DataPlan       : int  1 1 0 0 0 0 1 0 0 1 ...
##  $ DataUsage      : num  2.7 3.7 0 0 0 0 2.03 0 0.19 3.02 ...
##  $ CustServCalls  : int  1 1 0 2 3 0 3 0 1 0 ...
##  $ DayMins        : num  265 162 243 299 167 ...
##  $ DayCalls       : int  110 123 114 71 113 98 88 79 97 84 ...
##  $ MonthlyCharge  : num  89 82 52 57 41 57 87.3 36 63.9 93.2 ...
##  $ OverageFee     : num  9.87 9.78 6.06 3.1 7.42 ...
##  $ RoamMins       : num  10 13.7 12.2 6.6 10.1 6.3 7.5 7.1 8.7 11.2 ...

Correcting the column names

colnames(cellphonedata)<-c("Churn","AccountWeeks","ContractRenewal","DataPlan","DataUsage","CustServCalls","DayMins",
                           "DayCalls","MonthlyCharge","OverageFee","RoamMins")

Checking the dimensions of the data

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ purrr   0.3.5 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(miscset)
## 
## Attaching package: 'miscset'
## 
## The following object is masked from 'package:dplyr':
## 
##     collapse
# dimensions of the data
dim_desc(cellphonedata)
## [1] "[3,333 x 11]"

We have exactly 3333 rows and 11 columns in the data

Converting the target variable (Churn) and two binary predictors to factors

##Convert the dependent variable and 2 other binary predictors into factors
cellphonedata$Churn<-factor(cellphonedata$Churn)
cellphonedata$ContractRenewal<-factor(cellphonedata$ContractRenewal)
cellphonedata$DataPlan<-factor(cellphonedata$DataPlan)
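
Accuracy alone can be misleading on an imbalanced target, so it is worth checking the class balance before modeling. A quick base-R check (output not shown):

# class counts and proportions of the target variable
table(cellphonedata$Churn)
prop.table(table(cellphonedata$Churn))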

Checking the correlation between the numeric variables

library(corrplot)
## corrplot 0.92 loaded
numeric.var <- sapply(cellphonedata, is.numeric)
corr.matrix <- cor(cellphonedata[,numeric.var])
corrplot(corr.matrix, method="number")

Inference

  1. DataUsage and MonthlyCharge are correlated

  2. There is not much correlation between the other features (a programmatic check is sketched below)
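
Rather than reading pairs off the plot, caret's findCorrelation can flag highly correlated columns programmatically. A minimal sketch, assuming a cutoff of 0.75 (output not shown):

# names of numeric columns whose pairwise correlation exceeds the cutoff
library(caret)
findCorrelation(corr.matrix, cutoff = 0.75, names = TRUE)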

Univariate analysis of the continuous variables

## Univariate analysis: distribution of each continuous variable

par(mfrow = c(3, 4))  # lay the histograms out in one grid
hist(cellphonedata$AccountWeeks,
     main = "Histogram of AccountWeeks",
     xlab = "Weeks")

hist(cellphonedata$DataUsage,
     main = "Histogram of DataUsage",
     xlab = "Data usage")

hist(cellphonedata$DayMins,
     main = "Histogram of DayMins",
     xlab = "Minutes")

hist(cellphonedata$DayCalls,
     main = "Histogram of DayCalls",
     xlab = "Calls")

hist(cellphonedata$MonthlyCharge,
     main = "Histogram of MonthlyCharge",
     xlab = "Monthly charge")

hist(cellphonedata$OverageFee,
     main = "Histogram of OverageFee",
     xlab = "Overage fee")

hist(cellphonedata$RoamMins,
     main = "Histogram of RoamMins",
     xlab = "Minutes")

Inference

  1. AccountWeeks is close to normally distributed

  2. DataUsage and MonthlyCharge are not close to normal

  3. A log transformation could bring the skewed variables closer to normal (see the sketch below)
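
A minimal sketch of that transformation, using log1p (log(1 + x)) so that the zero values in DataUsage stay defined:

# compare the raw and log-transformed distributions of DataUsage
par(mfrow = c(1, 2))
hist(cellphonedata$DataUsage, main = "Raw DataUsage", xlab = "Data usage")
hist(log1p(cellphonedata$DataUsage), main = "log1p(DataUsage)", xlab = "log(1 + data usage)")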

Using logistic regression to predict customer churn after dividing the data into training and test sets

Splitting the data into training (70%) and test (30%) sets; createDataPartition samples randomly while preserving the churn class proportions in both sets

##Split Data into Train and test
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
set.seed(101)
spindex<-createDataPartition(cellphonedata$Churn, p=0.7, list = FALSE)
train<-cellphonedata[spindex,]
test<-cellphonedata[-spindex,]
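
Because createDataPartition stratifies on Churn, the churn rate should be nearly identical in both sets. A quick check (output not shown):

# verify that the split preserved the churn proportions
prop.table(table(train$Churn))
prop.table(table(test$Churn))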

Running the logistic regression model

LG<-glm(Churn~., data = train, family = binomial(link = 'logit'))
summary(LG)
## 
## Call:
## glm(formula = Churn ~ ., family = binomial(link = "logit"), data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9723  -0.5199  -0.3371  -0.1916   2.9710  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -5.9260775  0.6609324  -8.966  < 2e-16 ***
## AccountWeeks     -0.0002855  0.0016580  -0.172 0.863295    
## ContractRenewal1 -1.8539716  0.1749537 -10.597  < 2e-16 ***
## DataPlan1        -1.2914261  0.6560678  -1.968 0.049018 *  
## DataUsage         2.9261911  2.2952866   1.275 0.202355    
## CustServCalls     0.5176240  0.0472205  10.962  < 2e-16 ***
## DayMins           0.0624299  0.0387759   1.610 0.107394    
## DayCalls         -0.0020701  0.0032534  -0.636 0.524597    
## MonthlyCharge    -0.2834968  0.2277530  -1.245 0.213222    
## OverageFee        0.6356891  0.3880935   1.638 0.101426    
## RoamMins          0.0993264  0.0265857   3.736 0.000187 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1934.3  on 2333  degrees of freedom
## Residual deviance: 1520.7  on 2323  degrees of freedom
## AIC: 1542.7
## 
## Number of Fisher Scoring iterations: 6

Inference

The variables AccountWeeks, DataUsage, DayMins, DayCalls, MonthlyCharge, and OverageFee are not significant at the 5% level, so we could remove them and refit a more parsimonious model (a sketch follows below). The predictions later in this section use a probability threshold of 0.5.
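
A minimal sketch of that reduced model, keeping only the predictors significant at the 5% level (shown for illustration; its summary output is not reproduced here):

# refit the logistic regression on the significant predictors only
LG_reduced <- glm(Churn ~ ContractRenewal + DataPlan + CustServCalls + RoamMins,
                  data = train, family = binomial(link = 'logit'))
summary(LG_reduced)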

Using an analysis-of-deviance (ANOVA) test to check the top features

anova(LG, test="Chisq")
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: Churn
## 
## Terms added sequentially (first to last)
## 
## 
##                 Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                             2333     1934.3              
## AccountWeeks     1    0.020      2332     1934.3 0.8881513    
## ContractRenewal  1   95.365      2331     1838.9 < 2.2e-16 ***
## DataPlan         1   35.555      2330     1803.3 2.479e-09 ***
## DataUsage        1    2.199      2329     1801.1 0.1380825    
## CustServCalls    1  101.794      2328     1699.3 < 2.2e-16 ***
## DayMins          1  130.687      2327     1568.7 < 2.2e-16 ***
## DayCalls         1    0.665      2326     1568.0 0.4148001    
## MonthlyCharge    1   30.327      2325     1537.7 3.650e-08 ***
## OverageFee       1    2.629      2324     1535.0 0.1049500    
## RoamMins         1   14.327      2323     1520.7 0.0001536 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

From the ANOVA table, the largest sequential drops in deviance come from DayMins, CustServCalls, and ContractRenewal, so these are the strongest predictors

##Predict churn probabilities on the test set (columns 2:11 exclude the target)
prdprob<-predict(LG,test[,2:11], type="response")
##Classify as churn when the predicted probability exceeds 0.5
Pred<-ifelse(prdprob>0.5,1,0)
Pred<-as.factor(Pred)
##Confusion Matrix
confusionMatrix(Pred,test$Churn)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 830 115
##          1  25  29
##                                           
##                Accuracy : 0.8599          
##                  95% CI : (0.8368, 0.8808)
##     No Information Rate : 0.8559          
##     P-Value [Acc > NIR] : 0.38            
##                                           
##                   Kappa : 0.2326          
##                                           
##  Mcnemar's Test P-Value : 5.4e-14         
##                                           
##             Sensitivity : 0.9708          
##             Specificity : 0.2014          
##          Pos Pred Value : 0.8783          
##          Neg Pred Value : 0.5370          
##              Prevalence : 0.8559          
##          Detection Rate : 0.8308          
##    Detection Prevalence : 0.9459          
##       Balanced Accuracy : 0.5861          
##                                           
##        'Positive' Class : 0               
## 

Inference

  1. Based on the model, the accuracy of logistic regression on the test set is close to 86%
  2. Sensitivity is high, but the positive class here is 0 (no churn); specificity is only about 20%, so the model misses most actual churners
##Attach the predicted probabilities to the test set for ROC analysis
mydata<-test
mydata$prob<-prdprob
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
g <- roc(Churn ~ prob, data = mydata)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(g)    
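
The plot by itself does not report the area under the curve. A short follow-up using pROC, which also suggests a probability threshold that balances sensitivity and specificity (output not shown):

# area under the ROC curve
auc(g)
# threshold maximizing sensitivity + specificity (Youden's J)
coords(g, "best", ret = c("threshold", "sensitivity", "specificity"))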

Understanding the odds ratios of the model

An odds ratio expresses how the odds of an event (here, churn) change for a one-unit increase in a predictor, holding the others fixed. For example, the value of about 1.68 for CustServCalls below means that each additional customer-service call multiplies the odds of churning by roughly 1.68.

library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
exp(cbind(OR=coef(LG), confint(LG)))
## Waiting for profiling to be done...
##                           OR        2.5 %       97.5 %
## (Intercept)       0.00266893 0.0007180842 9.596245e-03
## AccountWeeks      0.99971457 0.9964676952 1.002969e+00
## ContractRenewal1  0.15661393 0.1110639982 2.206682e-01
## DataPlan1         0.27487849 0.0742012559 9.747084e-01
## DataUsage        18.65643375 0.2086376229 1.696788e+03
## CustServCalls     1.67803593 1.5307947012 1.842407e+00
## DayMins           1.06441985 0.9866513452 1.148741e+00
## DayCalls          0.99793207 0.9915854516 1.004320e+00
## MonthlyCharge     0.75314556 0.4813976750 1.176257e+00
## OverageFee        1.88832293 0.8836106058 4.049621e+00
## RoamMins          1.10442675 1.0487539745 1.164026e+00

Using a random forest model for prediction

library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
rfModel <- randomForest(Churn ~., data = train)
print(rfModel)
## 
## Call:
##  randomForest(formula = Churn ~ ., data = train) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 5.96%
## Confusion matrix:
##      0   1 class.error
## 0 1965  30  0.01503759
## 1  109 230  0.32153392

Checking the confusion matrix of the random forest on the test data

pred_rf <- predict(rfModel, test)
caret::confusionMatrix(pred_rf, test$Churn)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 833  53
##          1  22  91
##                                           
##                Accuracy : 0.9249          
##                  95% CI : (0.9068, 0.9405)
##     No Information Rate : 0.8559          
##     P-Value [Acc > NIR] : 1.157e-11       
##                                           
##                   Kappa : 0.6658          
##                                           
##  Mcnemar's Test P-Value : 0.000532        
##                                           
##             Sensitivity : 0.9743          
##             Specificity : 0.6319          
##          Pos Pred Value : 0.9402          
##          Neg Pred Value : 0.8053          
##              Prevalence : 0.8559          
##          Detection Rate : 0.8338          
##    Detection Prevalence : 0.8869          
##       Balanced Accuracy : 0.8031          
##                                           
##        'Positive' Class : 0               
## 

Inference

  1. Using the random forest, the test accuracy has increased to about 92.5%

  2. The random forest performs better than logistic regression, especially on specificity (about 63% vs. 20%)

Checking the best number of trees for the random forest

plot(rfModel)  # OOB error rate as a function of the number of trees

Tuning mtry for the random forest, using 200 trees per fit

# search over mtry (scaling by stepFactor), fitting 200 trees per candidate
t <- tuneRF(train[, -1], train[, 1], stepFactor = 0.5, plot = TRUE, ntreeTry = 200, trace = TRUE, improve = 0.05)
## mtry = 3  OOB error = 6.04% 
## Searching left ...
## mtry = 6     OOB error = 6.04% 
## 0 0.05 
## Searching right ...
## mtry = 1     OOB error = 10.45% 
## -0.7304965 0.05

With mtry = 3 the OOB error is lowest

Let's retrain the model with mtry = 3 and 300 trees

rfModel_new <- randomForest(Churn ~., data = train, ntree = 300, mtry = 3 ,importance = TRUE, proximity = TRUE)
print(rfModel_new)
## 
## Call:
##  randomForest(formula = Churn ~ ., data = train, ntree = 300,      mtry = 3, importance = TRUE, proximity = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 300
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 6.13%
## Confusion matrix:
##      0   1 class.error
## 0 1965  30  0.01503759
## 1  113 226  0.33333333

Evaluating on the testing data

pred_rf_new <- predict(rfModel_new, test)
caret::confusionMatrix(pred_rf_new, test$Churn)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 832  52
##          1  23  92
##                                           
##                Accuracy : 0.9249          
##                  95% CI : (0.9068, 0.9405)
##     No Information Rate : 0.8559          
##     P-Value [Acc > NIR] : 1.157e-11       
##                                           
##                   Kappa : 0.6679          
##                                           
##  Mcnemar's Test P-Value : 0.001224        
##                                           
##             Sensitivity : 0.9731          
##             Specificity : 0.6389          
##          Pos Pred Value : 0.9412          
##          Neg Pred Value : 0.8000          
##              Prevalence : 0.8559          
##          Detection Rate : 0.8328          
##    Detection Prevalence : 0.8849          
##       Balanced Accuracy : 0.8060          
##                                           
##        'Positive' Class : 0               
## 

Based on the error plot, 200-300 trees are enough: the OOB error flattens out in that range, so growing more trees adds computation without improving the model

Finding the best features from the random forest

varImpPlot(rfModel_new, sort=T, n.var = 10, main = 'Top 10 Feature Importance')

Inference

The variable importance plot ranks the features in descending order of importance (a numeric version of the ranking is sketched below)
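
For the exact values behind the plot, randomForest's importance() returns the underlying measures (we set importance = TRUE when fitting). A minimal sketch (output not shown):

# mean decrease in accuracy and in Gini impurity for each predictor
round(importance(rfModel_new), 2)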

Naive Bayes model

#Fitting the Naive Bayes model
library(e1071)
Naive_Bayes_Model <- naiveBayes(Churn ~ ., data = train)
#What does the model say? Print the model summary
summary(Naive_Bayes_Model)
##           Length Class  Mode     
## apriori    2     table  numeric  
## tables    10     -none- list     
## levels     2     -none- character
## isnumeric 10     -none- logical  
## call       4     -none- call
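
The summary above only lists the components of the fitted object. To see what the model has learned, we can inspect the posterior class probabilities it assigns. A minimal sketch (output not shown):

# posterior churn probabilities for the first few test customers
head(predict(Naive_Bayes_Model, test, type = "raw"))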

Confusion matrix for the Naive Bayes model

pred_nb_new <- predict(Naive_Bayes_Model, test)
caret::confusionMatrix(pred_nb_new, test$Churn)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 828  98
##          1  27  46
##                                           
##                Accuracy : 0.8749          
##                  95% CI : (0.8527, 0.8948)
##     No Information Rate : 0.8559          
##     P-Value [Acc > NIR] : 0.04581         
##                                           
##                   Kappa : 0.3621          
##                                           
##  Mcnemar's Test P-Value : 3.825e-10       
##                                           
##             Sensitivity : 0.9684          
##             Specificity : 0.3194          
##          Pos Pred Value : 0.8942          
##          Neg Pred Value : 0.6301          
##              Prevalence : 0.8559          
##          Detection Rate : 0.8288          
##    Detection Prevalence : 0.9269          
##       Balanced Accuracy : 0.6439          
##                                           
##        'Positive' Class : 0               
## 

Inference

The Naive Bayes model gives an accuracy close to 87.5% (the 97% figure in the output is its sensitivity, not its accuracy)

Running a KNN model on the dataset

##load the class package
library(class)
##run the knn function; knn() needs numeric predictors, so drop the
##target (column 1) and convert the factor predictors back to numeric 0/1
to_numeric <- function(d) data.frame(lapply(d, function(x) as.numeric(as.character(x))))
tel.train <- to_numeric(train[, -1])
tel.train.target <- train[, 1]
tel.test <- to_numeric(test[, -1])
tel.test.target <- test[, 1]
model1 <- knn(train = tel.train, test = tel.test, cl = tel.train.target, k = 60)
##confusionMatrix expects the predictions first and the reference labels second
caret::confusionMatrix(model1, tel.test.target)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 841  14
##          1 122  22
##                                          
##                Accuracy : 0.8639         
##                  95% CI : (0.841, 0.8845)
##     No Information Rate : 0.964          
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0.1982         
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.8733         
##             Specificity : 0.6111         
##          Pos Pred Value : 0.9836         
##          Neg Pred Value : 0.1528         
##              Prevalence : 0.9640         
##          Detection Rate : 0.8418         
##    Detection Prevalence : 0.8559         
##       Balanced Accuracy : 0.7422         
##                                          
##        'Positive' Class : 0              
## 

Inference

The KNN classifier with k = 60 gives an accuracy close to 86.4%. The value k = 60 was not validated here, so it is worth tuning; see the sketch below
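
A minimal sketch of tuning k by cross-validation with caret, which also centers and scales the predictors (important for a distance-based method); the fold count and grid size here are arbitrary choices:

# 5-fold cross-validation over 10 candidate values of k, with standardized predictors
set.seed(101)
knn_cv <- caret::train(Churn ~ ., data = train, method = "knn",
                       preProcess = c("center", "scale"),
                       tuneLength = 10,
                       trControl = caret::trainControl(method = "cv", number = 5))
knn_cv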

Model Comparison

  1. Logistic regression : accuracy 86%, sensitivity 97%, specificity 20%

  2. Random forest : accuracy 92.5%, sensitivity 97%, specificity 64%

  3. Naive Bayes : accuracy 87.5%, sensitivity 97%, specificity 32%

  4. KNN : accuracy 86.4%, sensitivity 87%, specificity 61% (a consolidated table is sketched after this list)
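
To keep the comparison in one place, the metrics above can be collected into a single data frame (values copied from the confusion matrices earlier in this report):

# side-by-side comparison of the four models
model_comparison <- data.frame(
  Model = c("Logistic regression", "Random forest", "Naive Bayes", "KNN"),
  Accuracy = c(0.860, 0.925, 0.875, 0.864),
  Sensitivity = c(0.971, 0.973, 0.968, 0.873),
  Specificity = c(0.201, 0.639, 0.319, 0.611))
model_comparison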

Since the positive class in every confusion matrix above is 0 (no churn), specificity measures how well a model catches the customers who actually churn. On that measure the random forest is strongest (about 64%), and it also has the highest accuracy, so it is the best of the four models