## Loading required package: carData
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
##Load the customer data set from CSV (the first row holds the column names)
cellphonedata <- read.csv(file = "Dataset_Cellphone.csv", header = TRUE)
##Show the structure (column names, types, first values) of the loaded data
str(object = cellphonedata)
## 'data.frame': 3333 obs. of 11 variables:
## $ Churn : int 0 0 0 0 0 0 0 0 0 0 ...
## $ AccountWeeks : int 128 107 137 84 75 118 121 147 117 141 ...
## $ ContractRenewal: int 1 1 1 0 0 0 1 0 1 0 ...
## $ DataPlan : int 1 1 0 0 0 0 1 0 0 1 ...
## $ DataUsage : num 2.7 3.7 0 0 0 0 2.03 0 0.19 3.02 ...
## $ CustServCalls : int 1 1 0 2 3 0 3 0 1 0 ...
## $ DayMins : num 265 162 243 299 167 ...
## $ DayCalls : int 110 123 114 71 113 98 88 79 97 84 ...
## $ MonthlyCharge : num 89 82 52 57 41 57 87.3 36 63.9 93.2 ...
## $ OverageFee : num 9.87 9.78 6.06 3.1 7.42 ...
## $ RoamMins : num 10 13.7 12.2 6.6 10.1 6.3 7.5 7.1 8.7 11.2 ...
##Convert the dependent variable (Churn) into a factor.
##The conversions of ContractRenewal and DataPlan below are disabled,
##so those two predictors stay as integers.
cellphonedata[["Churn"]] <- factor(cellphonedata[["Churn"]])
##cellphonedata$ContractRenewal<-factor(cellphonedata$ContractRenewal)
##cellphonedata$DataPlan<-factor(cellphonedata$DataPlan)
##Split Data into Train and test
library(caret)
##
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
##
## cluster
##Fix the random seed so that the partition is reproducible
set.seed(seed = 101)
##Draw the row indices of a 70% training sample, partitioned on Churn
spindex <- createDataPartition(y = cellphonedata$Churn, p = 0.7, list = FALSE)
##Indexed rows form the training set; all remaining rows form the test set
cellphonetrain <- cellphonedata[spindex, ]
cellphonetest <- cellphonedata[-spindex, ]
##Fit a logistic regression of Churn on every other column of the training set
LogRegModel <- glm(
  formula = Churn ~ .,
  family = binomial(link = "logit"),
  data = cellphonetrain
)
##Print the coefficient table and the deviance / AIC fit statistics
summary(object = LogRegModel)
##
## Call:
## glm(formula = Churn ~ ., family = binomial(link = "logit"), data = cellphonetrain)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0101 -0.5135 -0.3485 -0.2039 3.0493
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.456862 0.662712 -9.743 < 2e-16 ***
## AccountWeeks 0.001063 0.001667 0.638 0.52370
## ContractRenewal -1.919653 0.171185 -11.214 < 2e-16 ***
## DataPlan -1.424391 0.672065 -2.119 0.03405 *
## DataUsage -0.397500 2.307736 -0.172 0.86324
## CustServCalls 0.509929 0.046259 11.023 < 2e-16 ***
## DayMins 0.003815 0.038976 0.098 0.92203
## DayCalls 0.006983 0.003265 2.139 0.03246 *
## MonthlyCharge 0.051165 0.229109 0.223 0.82328
## OverageFee 0.063334 0.390369 0.162 0.87112
## RoamMins 0.078190 0.026144 2.991 0.00278 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1934.3 on 2333 degrees of freedom
## Residual deviance: 1532.5 on 2323 degrees of freedom
## AIC: 1554.5
##
## Number of Fisher Scoring iterations: 6
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##Likelihood-ratio test of the fitted model
##(with a single model supplied, lrtest compares it to the intercept-only model)
lmtest::lrtest(LogRegModel)
library(pscl)
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
##Pseudo R-squared measures for the fitted logistic regression
pscl::pR2(LogRegModel)
## llh llhNull G2 McFadden r2ML
## -766.2572201 -967.1400908 401.7657413 0.2077081 0.1581354
## r2CU
## 0.2806800
##Predict the churn probability for every row of the held-out test set.
##predict() looks up the predictors it needs by name, so the whole test frame
##is passed rather than a hard-coded 2:11 column range that would break if
##the column order of the data ever changed.
predictprob <- predict(LogRegModel, newdata = cellphonetest, type = "response")
##Classify with a 0.5 probability cut-off.
##The factor levels are pinned to those of the observed Churn so that both
##classes are always present, even when one of them is never predicted;
##as.factor() would drop the missing class and leave a one-level factor that
##no longer matches the reference passed to confusionMatrix().
predictedresponse <- factor(
  ifelse(predictprob > 0.5, 1, 0),
  levels = levels(cellphonetest$Churn)
)
##Confusion matrix of predicted vs. observed churn on the test set.
##No positive class is given, so the first factor level ("0") is used as the
##'Positive' class in the statistics reported below.
confusionMatrix(data = predictedresponse, reference = cellphonetest$Churn)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 840 115
## 1 15 29
##
## Accuracy : 0.8699
## 95% CI : (0.8474, 0.8901)
## No Information Rate : 0.8559
## P-Value [Acc > NIR] : 0.1109
##
## Kappa : 0.2585
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.9825
## Specificity : 0.2014
## Pos Pred Value : 0.8796
## Neg Pred Value : 0.6591
## Prevalence : 0.8559
## Detection Rate : 0.8408
## Detection Prevalence : 0.9560
## Balanced Accuracy : 0.5919
##
## 'Positive' Class : 0
##
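The overall accuracy is 87%. Note that the 'Positive' class here is 0 (no churn), so sensitivity (the share of actual non-churners that are predicted correctly) is 98%, while specificity (the share of actual churners that are predicted correctly) is only 20%. The positive predictive value is 88% and the negative predictive value is 66%.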
Let us look at the odds ratios (the exponentiated coefficients) of the model we built.
##Exponentiate the fitted coefficients and display the result
oddModel <- exp(stats::coef(LogRegModel))
print(x = oddModel)
## (Intercept) AccountWeeks ContractRenewal DataPlan
## 0.001569714 1.001063383 0.146657862 0.240654971
## DataUsage CustServCalls DayMins DayCalls
## 0.671998121 1.665172875 1.003821928 1.007007090
## MonthlyCharge OverageFee RoamMins
## 1.052497001 1.065382434 1.081327662
##Save the exponentiated coefficients to a CSV file
write.csv(x = oddModel, file = "CellPhoneOdds.csv")