This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Ctrl+Shift+Enter.
library(dplyr)
library(rpart)
library(randomForest)
library(ROCR)
library(rpart.plot)
library(dummies)
library(caret)
library(ggplot2)
library(pROC)
library(DT)
# Compact structural overview of the raw data: column types and first values
utils::str(telcod)
'data.frame': 7043 obs. of 21 variables:
$ customerID : Factor w/ 7043 levels "0002-ORFBO","0003-MKNFE",..: 5376 3963 2565 5536 6512 6552 1003 4771 5605 4535 ...
$ gender : Factor w/ 2 levels "Female","Male": 1 2 2 2 1 1 2 1 1 2 ...
$ SeniorCitizen : int 0 0 0 0 0 0 0 0 0 0 ...
$ Partner : Factor w/ 2 levels "No","Yes": 2 1 1 1 1 1 1 1 2 1 ...
$ Dependents : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 2 1 1 2 ...
$ tenure : int 1 34 2 45 2 8 22 10 28 62 ...
$ PhoneService : Factor w/ 2 levels "No","Yes": 1 2 2 1 2 2 2 1 2 2 ...
$ MultipleLines : Factor w/ 3 levels "No","No phone service",..: 2 1 1 2 1 3 3 2 3 1 ...
$ InternetService : Factor w/ 3 levels "DSL","Fiber optic",..: 1 1 1 1 2 2 2 1 2 1 ...
$ OnlineSecurity : Factor w/ 3 levels "No","No internet service",..: 1 3 3 3 1 1 1 3 1 3 ...
$ OnlineBackup : Factor w/ 3 levels "No","No internet service",..: 3 1 3 1 1 1 3 1 1 3 ...
$ DeviceProtection: Factor w/ 3 levels "No","No internet service",..: 1 3 1 3 1 3 1 1 3 1 ...
$ TechSupport : Factor w/ 3 levels "No","No internet service",..: 1 1 1 3 1 1 1 1 3 1 ...
$ StreamingTV : Factor w/ 3 levels "No","No internet service",..: 1 1 1 1 1 3 3 1 3 1 ...
$ StreamingMovies : Factor w/ 3 levels "No","No internet service",..: 1 1 1 1 1 3 1 1 3 1 ...
$ Contract : Factor w/ 3 levels "Month-to-month",..: 1 2 1 2 1 1 1 1 1 2 ...
$ PaperlessBilling: Factor w/ 2 levels "No","Yes": 2 1 2 1 2 2 2 1 2 1 ...
$ PaymentMethod : Factor w/ 4 levels "Bank transfer (automatic)",..: 3 4 4 1 3 3 2 4 3 1 ...
$ MonthlyCharges : num 29.9 57 53.9 42.3 70.7 ...
$ TotalCharges : num 29.9 1889.5 108.2 1840.8 151.7 ...
$ Churn : Factor w/ 2 levels "No","Yes": 1 1 2 1 2 2 1 1 2 1 ...
# Total number of missing cells across every column of the working copy
print(sum(is.na(telcod_copy)))
[1] 11
# Resampling plan handed to caret::train(): 10-fold cross-validation
TrainingParameters <- trainControl(number = 10, method = "cv")

# ---- C5.0 ----
# Fit Churn on every other column of trainData; rows containing NA are dropped
DecTreeModel <- train(
  Churn ~ .,
  data = trainData,
  na.action = na.omit,
  trControl = TrainingParameters,
  method = "C5.0"
)
# Show the confusion matrix stored in DTcm
print(DTcm)
Confusion Matrix and Statistics
Reference
Prediction No Yes
No 1397 275
Yes 151 285
Accuracy : 0.7979
95% CI : (0.7801, 0.8149)
No Information Rate : 0.7343
P-Value [Acc > NIR] : 6.398e-12
Kappa : 0.4427
Mcnemar's Test P-Value : 2.532e-09
Sensitivity : 0.9025
Specificity : 0.5089
Pos Pred Value : 0.8355
Neg Pred Value : 0.6537
Prevalence : 0.7343
Detection Rate : 0.6627
Detection Prevalence : 0.7932
Balanced Accuracy : 0.7057
'Positive' Class : No
##############Ensemble#################
library(ggplot2)
library(kernlab)
library(caret)
library(plyr)
library(dplyr)
library(C50)
library(kernlab)
library(e1071)
library(caretEnsemble)
# ---- Ensemble: resampling setup ----
# 10-fold CV that keeps class probabilities and the hold-out predictions of
# every base learner so they can be combined afterwards. twoClassSummary makes
# ROC / Sens / Spec the resampling metrics of the base models.
econtrol <- trainControl(
  method = "cv",
  number = 10,
  summaryFunction = twoClassSummary,
  savePredictions = TRUE,
  classProbs = TRUE
)
econtrol

# ---- Base learners ----
# Train the four candidate models on trainData with the shared control object
Corrmodels <- caretList(
  Churn ~ .,
  data = trainData,
  methodList = c("svmPoly", "nnet", "C5.0", "naive_bayes"),
  trControl = econtrol
)

# Compare the resampled performance of the base learners
Corresults <- resamples(Corrmodels)
dotplot(Corresults)

# Between-model correlation of the resampled metric
mcr <- modelCor(Corresults)
mcr
# Scatter-plot matrix of the per-fold results. The resamples object is passed
# here; plotting the 4 x 4 correlation matrix itself is not informative.
splom(Corresults)

# ---- Ensemble of C5.0 and nnet ----
smallmodels <- c(Corrmodels$C5.0, Corrmodels$nnet)
ensmodel <- caretEnsemble(
  smallmodels,
  metric = "Accuracy",
  trControl = trainControl(method = "cv", number = 10, classProbs = TRUE)
)

# ---- Evaluation on the hold-out set ----
# Score only complete test rows so that the predictions and the reference
# labels always have the same length (na.omit inside predict() would otherwise
# drop rows that testData$Churn still contains).
evalData <- testData[complete.cases(testData), , drop = FALSE]

# Predict with the ensemble fitted above
enstackpredictions <- predict(ensmodel, evalData, na.action = na.omit)

# Create confusion matrix; print it once, including precision / recall / F1
cmPIMA <- confusionMatrix(enstackpredictions, evalData$Churn)
print(cmPIMA, mode = "everything")
# Show the confusion matrix stored in cm_rf
print(cm_rf)
Confusion Matrix and Statistics
Reference
Prediction No Yes
No 1467 352
Yes 81 208
Accuracy : 0.7946
95% CI : (0.7767, 0.8117)
No Information Rate : 0.7343
P-Value [Acc > NIR] : 7.528e-11
Kappa : 0.3774
Mcnemar's Test P-Value : < 2.2e-16
Sensitivity : 0.9477
Specificity : 0.3714
Pos Pred Value : 0.8065
Neg Pred Value : 0.7197
Prevalence : 0.7343
Detection Rate : 0.6959
Detection Prevalence : 0.8629
Balanced Accuracy : 0.6596
'Positive' Class : No
# Show the confusion matrix stored in cmSVMup
print(cmSVMup)
Confusion Matrix and Statistics
Reference
Prediction No Yes
No 1102 130
Yes 446 430
Accuracy : 0.7268
95% CI : (0.7072, 0.7457)
No Information Rate : 0.7343
P-Value [Acc > NIR] : 0.7925
Kappa : 0.4065
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.7119
Specificity : 0.7679
Pos Pred Value : 0.8945
Neg Pred Value : 0.4909
Prevalence : 0.7343
Detection Rate : 0.5228
Detection Prevalence : 0.5844
Balanced Accuracy : 0.7399
'Positive' Class : No
# ---- Gradient boosting (gbm) ----
# Fit Churn on all remaining columns of trainData using the resampling
# settings in TrainingParameters; rows containing NA are dropped
GBTreeModel <- train(
  Churn ~ .,
  data = trainData,
  trControl = TrainingParameters,
  na.action = na.omit,
  method = "gbm"
)
# Show the confusion matrix stored in GBcm
print(GBcm)
Confusion Matrix and Statistics
Reference
Prediction No Yes
No 1414 287
Yes 134 273
Accuracy : 0.8003
95% CI : (0.7826, 0.8172)
No Information Rate : 0.7343
P-Value [Acc > NIR] : 1.009e-12
Kappa : 0.4392
Mcnemar's Test P-Value : 1.282e-13
Sensitivity : 0.9134
Specificity : 0.4875
Pos Pred Value : 0.8313
Neg Pred Value : 0.6708
Prevalence : 0.7343
Detection Rate : 0.6708
Detection Prevalence : 0.8069
Balanced Accuracy : 0.7005
'Positive' Class : No
# Print every confusion matrix collected in CMList, one list element at a time
print(CMList)
[[1]]
Confusion Matrix and Statistics
Reference
Prediction No Yes
No 1397 275
Yes 151 285
Accuracy : 0.7979
95% CI : (0.7801, 0.8149)
No Information Rate : 0.7343
P-Value [Acc > NIR] : 6.398e-12
Kappa : 0.4427
Mcnemar's Test P-Value : 2.532e-09
Sensitivity : 0.9025
Specificity : 0.5089
Pos Pred Value : 0.8355
Neg Pred Value : 0.6537
Prevalence : 0.7343
Detection Rate : 0.6627
Detection Prevalence : 0.7932
Balanced Accuracy : 0.7057
'Positive' Class : No
[[2]]
Confusion Matrix and Statistics
Reference
Prediction No Yes
No 908 68
Yes 640 492
Accuracy : 0.6641
95% CI : (0.6435, 0.6843)
No Information Rate : 0.7343
P-Value [Acc > NIR] : 1
Kappa : 0.3508
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.5866
Specificity : 0.8786
Pos Pred Value : 0.9303
Neg Pred Value : 0.4346
Prevalence : 0.7343
Detection Rate : 0.4307
Detection Prevalence : 0.4630
Balanced Accuracy : 0.7326
'Positive' Class : No
[[3]]
Confusion Matrix and Statistics
Reference
Prediction No Yes
No 1362 241
Yes 186 319
Accuracy : 0.7974
95% CI : (0.7796, 0.8144)
No Information Rate : 0.7343
P-Value [Acc > NIR] : 9.178e-12
Kappa : 0.464
Mcnemar's Test P-Value : 0.008969
Sensitivity : 0.8798
Specificity : 0.5696
Pos Pred Value : 0.8497
Neg Pred Value : 0.6317
Prevalence : 0.7343
Detection Rate : 0.6461
Detection Prevalence : 0.7604
Balanced Accuracy : 0.7247
'Positive' Class : No
[[4]]
Confusion Matrix and Statistics
Reference
Prediction No Yes
No 1467 352
Yes 81 208
Accuracy : 0.7946
95% CI : (0.7767, 0.8117)
No Information Rate : 0.7343
P-Value [Acc > NIR] : 7.528e-11
Kappa : 0.3774
Mcnemar's Test P-Value : < 2.2e-16
Sensitivity : 0.9477
Specificity : 0.3714
Pos Pred Value : 0.8065
Neg Pred Value : 0.7197
Prevalence : 0.7343
Detection Rate : 0.6959
Detection Prevalence : 0.8629
Balanced Accuracy : 0.6596
'Positive' Class : No
[[5]]
Confusion Matrix and Statistics
Reference
Prediction No Yes
No 1102 130
Yes 446 430
Accuracy : 0.7268
95% CI : (0.7072, 0.7457)
No Information Rate : 0.7343
P-Value [Acc > NIR] : 0.7925
Kappa : 0.4065
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.7119
Specificity : 0.7679
Pos Pred Value : 0.8945
Neg Pred Value : 0.4909
Prevalence : 0.7343
Detection Rate : 0.5228
Detection Prevalence : 0.5844
Balanced Accuracy : 0.7399
'Positive' Class : No
[[6]]
Confusion Matrix and Statistics
Reference
Prediction No Yes
No 1414 287
Yes 134 273
Accuracy : 0.8003
95% CI : (0.7826, 0.8172)
No Information Rate : 0.7343
P-Value [Acc > NIR] : 1.009e-12
Kappa : 0.4392
Mcnemar's Test P-Value : 1.282e-13
Sensitivity : 0.9134
Specificity : 0.4875
Pos Pred Value : 0.8313
Neg Pred Value : 0.6708
Prevalence : 0.7343
Detection Rate : 0.6708
Detection Prevalence : 0.8069
Balanced Accuracy : 0.7005
'Positive' Class : No
[[7]]
Confusion Matrix and Statistics
Reference
Prediction No Yes
No 1395 294
Yes 153 266
Accuracy : 0.788
95% CI : (0.7699, 0.8052)
No Information Rate : 0.7343
P-Value [Acc > NIR] : 6.868e-09
Kappa : 0.409
Mcnemar's Test P-Value : 3.549e-11
Sensitivity : 0.9012
Specificity : 0.4750
Pos Pred Value : 0.8259
Neg Pred Value : 0.6348
Prevalence : 0.7343
Detection Rate : 0.6618
Detection Prevalence : 0.8012
Balanced Accuracy : 0.6881
'Positive' Class : No
[[8]]
Confusion Matrix and Statistics
Reference
Prediction No Yes
No 1419 290
Yes 129 270
Accuracy : 0.8012
95% CI : (0.7835, 0.8181)
No Information Rate : 0.7343
P-Value [Acc > NIR] : 4.720e-13
Kappa : 0.4391
Mcnemar's Test P-Value : 5.431e-15
Sensitivity : 0.9167
Specificity : 0.4821
Pos Pred Value : 0.8303
Neg Pred Value : 0.6767
Prevalence : 0.7343
Detection Rate : 0.6731
Detection Prevalence : 0.8107
Balanced Accuracy : 0.6994
'Positive' Class : No
## In this project we aimed to build a model that predicts whether a customer will churn. Because the problem statement prioritises specificity, and Naïve Bayes yields the highest specificity of all the models compared (about 88%; 0.8786 in the confusion matrices above), we select it as the final model.
Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Ctrl+Alt+I.
When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Ctrl+Shift+K to preview the HTML file).
The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike Knit, Preview does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.