cellphonedata<-read.csv("D:/Freelancer_questions/cellphone_churn/Cellphone_convt.csv", header = TRUE)
str(cellphonedata)
## 'data.frame': 3333 obs. of 11 variables:
## $ Churn : int 0 0 0 0 0 0 0 0 0 0 ...
## $ AccountWeeks : int 128 107 137 84 75 118 121 147 117 141 ...
## $ ContractRenewal: int 1 1 1 0 0 0 1 0 1 0 ...
## $ DataPlan : int 1 1 0 0 0 0 1 0 0 1 ...
## $ DataUsage : num 2.7 3.7 0 0 0 0 2.03 0 0.19 3.02 ...
## $ CustServCalls : int 1 1 0 2 3 0 3 0 1 0 ...
## $ DayMins : num 265 162 243 299 167 ...
## $ DayCalls : int 110 123 114 71 113 98 88 79 97 84 ...
## $ MonthlyCharge : num 89 82 52 57 41 57 87.3 36 63.9 93.2 ...
## $ OverageFee : num 9.87 9.78 6.06 3.1 7.42 ...
## $ RoamMins : num 10 13.7 12.2 6.6 10.1 6.3 7.5 7.1 8.7 11.2 ...
colnames(cellphonedata)<-c("Churn","AccountWeeks","ContractRenewal","DataPlan","DataUsage","CustServCalls","DayMins",
"DayCalls","MonthlyCharge","OverageFee","RoamMins")
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 0.3.5
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(miscset)
##
## Attaching package: 'miscset'
##
## The following object is masked from 'package:dplyr':
##
## collapse
# dimensions of the data
dim_desc(cellphonedata)
## [1] "[3,333 x 11]"
We have close to 3333 rows and 11 columns in the data
##Convert the Dependent variable and 2 other predicors into factor
cellphonedata$Churn<-factor(cellphonedata$Churn)
cellphonedata$ContractRenewal<-factor(cellphonedata$ContractRenewal)
cellphonedata$DataPlan<-factor(cellphonedata$DataPlan)
library(corrplot)
## corrplot 0.92 loaded
numeric.var <- sapply(cellphonedata, is.numeric)
corr.matrix <- cor(cellphonedata[,numeric.var])
corrplot(corr.matrix, method="number")
Inference
The Data usage and Monthly charge are correlated
There is not much correlation between the other features
## Univariate analysis;price distribution
par(mfrow = c(3, 4))
hist(cellphonedata$AccountWeeks,
main = "Histogram of AccountWeeks",
xlab = "Units")
hist(cellphonedata$DataUsage,
main = "Histogram of DataUsage",
xlab = "Units")
hist(cellphonedata$DayMins,
main = "Histogram of DayMins",
xlab = "Units")
hist(cellphonedata$DayCalls,
main = "Histogram of DayCalls",
xlab = "Units")
hist(cellphonedata$MonthlyCharge,
main = "Histogram of MonthlyCharge",
xlab = "Units")
hist(cellphonedata$OverageFee,
main = "Histogram of OverageFee",
xlab = "Units")
hist(cellphonedata$RoamMins,
main = "Histogram of RoamMins",
xlab = "Units")
Inference
1)The AccountWeek is close to normal distributed
The DataUsage and Monthly charge are not close to normal
We can use a logit transformation for all the variables to make it close to normal
Splitting the data into training and testing based on random sampling and training and test ratio of 70% and 30%
##Split Data into Train and test
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
set.seed(101)
spindex<-createDataPartition(cellphonedata$Churn, p=0.7, list = FALSE)
train<-cellphonedata[spindex,]
test<-cellphonedata[-spindex,]
LG<-glm(Churn~., data = train, family = binomial(link = 'logit'))
summary(LG)
##
## Call:
## glm(formula = Churn ~ ., family = binomial(link = "logit"), data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9723 -0.5199 -0.3371 -0.1916 2.9710
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.9260775 0.6609324 -8.966 < 2e-16 ***
## AccountWeeks -0.0002855 0.0016580 -0.172 0.863295
## ContractRenewal1 -1.8539716 0.1749537 -10.597 < 2e-16 ***
## DataPlan1 -1.2914261 0.6560678 -1.968 0.049018 *
## DataUsage 2.9261911 2.2952866 1.275 0.202355
## CustServCalls 0.5176240 0.0472205 10.962 < 2e-16 ***
## DayMins 0.0624299 0.0387759 1.610 0.107394
## DayCalls -0.0020701 0.0032534 -0.636 0.524597
## MonthlyCharge -0.2834968 0.2277530 -1.245 0.213222
## OverageFee 0.6356891 0.3880935 1.638 0.101426
## RoamMins 0.0993264 0.0265857 3.736 0.000187 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1934.3 on 2333 degrees of freedom
## Residual deviance: 1520.7 on 2323 degrees of freedom
## AIC: 1542.7
##
## Number of Fisher Scoring iterations: 6
Inference
The variables AccountWeeks,DataUsage, DayMins, Monthly charge, OverageFee is not significant so we can remove these variables
After setting the probability threshold to 0.5 for
anova(LG, test="Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: Churn
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 2333 1934.3
## AccountWeeks 1 0.020 2332 1934.3 0.8881513
## ContractRenewal 1 95.365 2331 1838.9 < 2.2e-16 ***
## DataPlan 1 35.555 2330 1803.3 2.479e-09 ***
## DataUsage 1 2.199 2329 1801.1 0.1380825
## CustServCalls 1 101.794 2328 1699.3 < 2.2e-16 ***
## DayMins 1 130.687 2327 1568.7 < 2.2e-16 ***
## DayCalls 1 0.665 2326 1568.0 0.4148001
## MonthlyCharge 1 30.327 2325 1537.7 3.650e-08 ***
## OverageFee 1 2.629 2324 1535.0 0.1049500
## RoamMins 1 14.327 2323 1520.7 0.0001536 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
From the Anova test the top 2 features are ContractRenewal and CustomerServCalls
##Predict the outcome
prdprob<-predict(LG,test[,2:11], type="response")
Pred<-ifelse(prdprob>0.5,1,0)
Pred<-as.factor(Pred)
##Confusion Matrix
confusionMatrix(Pred,test$Churn)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 830 115
## 1 25 29
##
## Accuracy : 0.8599
## 95% CI : (0.8368, 0.8808)
## No Information Rate : 0.8559
## P-Value [Acc > NIR] : 0.38
##
## Kappa : 0.2326
##
## Mcnemar's Test P-Value : 5.4e-14
##
## Sensitivity : 0.9708
## Specificity : 0.2014
## Pos Pred Value : 0.8783
## Neg Pred Value : 0.5370
## Prevalence : 0.8559
## Detection Rate : 0.8308
## Detection Prevalence : 0.9459
## Balanced Accuracy : 0.5861
##
## 'Positive' Class : 0
##
Inference
prob=predict(LG,test[,2:11], type="response")
mydata<-test
mydata$prob=prob
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
g <- roc(Churn ~ prob, data = mydata)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(g)
Basically, Odds ratio is what the odds of an event is happening
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
exp(cbind(OR=coef(LG), confint(LG)))
## Waiting for profiling to be done...
## OR 2.5 % 97.5 %
## (Intercept) 0.00266893 0.0007180842 9.596245e-03
## AccountWeeks 0.99971457 0.9964676952 1.002969e+00
## ContractRenewal1 0.15661393 0.1110639982 2.206682e-01
## DataPlan1 0.27487849 0.0742012559 9.747084e-01
## DataUsage 18.65643375 0.2086376229 1.696788e+03
## CustServCalls 1.67803593 1.5307947012 1.842407e+00
## DayMins 1.06441985 0.9866513452 1.148741e+00
## DayCalls 0.99793207 0.9915854516 1.004320e+00
## MonthlyCharge 0.75314556 0.4813976750 1.176257e+00
## OverageFee 1.88832293 0.8836106058 4.049621e+00
## RoamMins 1.10442675 1.0487539745 1.164026e+00
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
rfModel <- randomForest(Churn ~., data = train)
print(rfModel)
##
## Call:
## randomForest(formula = Churn ~ ., data = train)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 5.96%
## Confusion matrix:
## 0 1 class.error
## 0 1965 30 0.01503759
## 1 109 230 0.32153392
pred_rf <- predict(rfModel, test)
caret::confusionMatrix(pred_rf, test$Churn)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 833 53
## 1 22 91
##
## Accuracy : 0.9249
## 95% CI : (0.9068, 0.9405)
## No Information Rate : 0.8559
## P-Value [Acc > NIR] : 1.157e-11
##
## Kappa : 0.6658
##
## Mcnemar's Test P-Value : 0.000532
##
## Sensitivity : 0.9743
## Specificity : 0.6319
## Pos Pred Value : 0.9402
## Neg Pred Value : 0.8053
## Prevalence : 0.8559
## Detection Rate : 0.8338
## Detection Prevalence : 0.8869
## Balanced Accuracy : 0.8031
##
## 'Positive' Class : 0
##
Inference
Using Random forest the test accuracy has increased to 94%
Random forest is performing better than logistic regression
plot(rfModel)
t <- tuneRF(train[, -1], train[, 1], stepFactor = 0.5, plot = TRUE, ntreeTry = 200, trace = TRUE, improve = 0.05)
## mtry = 3 OOB error = 6.04%
## Searching left ...
## mtry = 6 OOB error = 6.04%
## 0 0.05
## Searching right ...
## mtry = 1 OOB error = 10.45%
## -0.7304965 0.05
With Mtree 3 the error is lowest
lets retrain the model
rfModel_new <- randomForest(Churn ~., data = train, ntree = 300, mtry = 3 ,importance = TRUE, proximity = TRUE)
print(rfModel_new)
##
## Call:
## randomForest(formula = Churn ~ ., data = train, ntree = 300, mtry = 3, importance = TRUE, proximity = TRUE)
## Type of random forest: classification
## Number of trees: 300
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 6.13%
## Confusion matrix:
## 0 1 class.error
## 0 1965 30 0.01503759
## 1 113 226 0.33333333
pred_rf_new <- predict(rfModel_new, test)
caret::confusionMatrix(pred_rf_new, test$Churn)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 832 52
## 1 23 92
##
## Accuracy : 0.9249
## 95% CI : (0.9068, 0.9405)
## No Information Rate : 0.8559
## P-Value [Acc > NIR] : 1.157e-11
##
## Kappa : 0.6679
##
## Mcnemar's Test P-Value : 0.001224
##
## Sensitivity : 0.9731
## Specificity : 0.6389
## Pos Pred Value : 0.9412
## Neg Pred Value : 0.8000
## Prevalence : 0.8559
## Detection Rate : 0.8328
## Detection Prevalence : 0.8849
## Balanced Accuracy : 0.8060
##
## 'Positive' Class : 0
##
The best number of trees is between 200-300 for the model not to overfit
varImpPlot(rfModel_new, sort=T, n.var = 10, main = 'Top 10 Feature Importance')
Inference
Here is the feature significance based on descending order
#Fitting the Naive Bayes model
library(e1071)
Naive_Bayes_Model=naiveBayes(Churn ~., data=train)
#What does the model say? Print the model summary
summary(Naive_Bayes_Model)
## Length Class Mode
## apriori 2 table numeric
## tables 10 -none- list
## levels 2 -none- character
## isnumeric 10 -none- logical
## call 4 -none- call
pred_nb_new <- predict(Naive_Bayes_Model, test)
caret::confusionMatrix(pred_nb_new, test$Churn)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 828 98
## 1 27 46
##
## Accuracy : 0.8749
## 95% CI : (0.8527, 0.8948)
## No Information Rate : 0.8559
## P-Value [Acc > NIR] : 0.04581
##
## Kappa : 0.3621
##
## Mcnemar's Test P-Value : 3.825e-10
##
## Sensitivity : 0.9684
## Specificity : 0.3194
## Pos Pred Value : 0.8942
## Neg Pred Value : 0.6301
## Prevalence : 0.8559
## Detection Rate : 0.8288
## Detection Prevalence : 0.9269
## Balanced Accuracy : 0.6439
##
## 'Positive' Class : 0
##
Inference
Naive Bayes model is giving accuracy close to 97.5%
##load the package class
library(class)
##run knn function
# subset the dataset
tel.train<- train[1:2334,]
tel.train.target<- train[1:2334,1]
tel.test<- test[1:999,]
tel.test.target<- test[1:999,1]
model1<- knn(train=tel.train, test=tel.test, cl=tel.train.target, k=60)
caret::confusionMatrix(tel.test.target, model1)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 841 14
## 1 122 22
##
## Accuracy : 0.8639
## 95% CI : (0.841, 0.8845)
## No Information Rate : 0.964
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1982
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.8733
## Specificity : 0.6111
## Pos Pred Value : 0.9836
## Neg Pred Value : 0.1528
## Prevalence : 0.9640
## Detection Rate : 0.8418
## Detection Prevalence : 0.8559
## Balanced Accuracy : 0.7422
##
## 'Positive' Class : 0
##
Inference
The KNN classification with optimal k 60 is giving accuracy close to 86.8%
Logistic regression : Accuracy 87, sensitivity 98, specificity 20
Random Forest : Accuracy 94, sensitivity 98, specificity 65
NAive Bayes : Accuracy 87, sensitivity 97, specificity 31
KNN : Accuracy 86.8, sensitivity 87, specificity 74
Based on the specificity KNN model is giving the highest value and also providing decent accuracy as well