library(C50)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(mlbench)
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
library(modeldata)
library(rpart)
##Preparing the Dataset
set.seed(1881)
data(mlc_churn)
str(mlc_churn)
## tibble [5,000 × 20] (S3: tbl_df/tbl/data.frame)
## $ state : Factor w/ 51 levels "AK","AL","AR",..: 17 36 32 36 37 2 20 25 19 50 ...
## $ account_length : int [1:5000] 128 107 137 84 75 118 121 147 117 141 ...
## $ area_code : Factor w/ 3 levels "area_code_408",..: 2 2 2 1 2 3 3 2 1 2 ...
## $ international_plan : Factor w/ 2 levels "no","yes": 1 1 1 2 2 2 1 2 1 2 ...
## $ voice_mail_plan : Factor w/ 2 levels "no","yes": 2 2 1 1 1 1 2 1 1 2 ...
## $ number_vmail_messages : int [1:5000] 25 26 0 0 0 0 24 0 0 37 ...
## $ total_day_minutes : num [1:5000] 265 162 243 299 167 ...
## $ total_day_calls : int [1:5000] 110 123 114 71 113 98 88 79 97 84 ...
## $ total_day_charge : num [1:5000] 45.1 27.5 41.4 50.9 28.3 ...
## $ total_eve_minutes : num [1:5000] 197.4 195.5 121.2 61.9 148.3 ...
## $ total_eve_calls : int [1:5000] 99 103 110 88 122 101 108 94 80 111 ...
## $ total_eve_charge : num [1:5000] 16.78 16.62 10.3 5.26 12.61 ...
## $ total_night_minutes : num [1:5000] 245 254 163 197 187 ...
## $ total_night_calls : int [1:5000] 91 103 104 89 121 118 118 96 90 97 ...
## $ total_night_charge : num [1:5000] 11.01 11.45 7.32 8.86 8.41 ...
## $ total_intl_minutes : num [1:5000] 10 13.7 12.2 6.6 10.1 6.3 7.5 7.1 8.7 11.2 ...
## $ total_intl_calls : int [1:5000] 3 3 5 7 3 6 7 6 4 5 ...
## $ total_intl_charge : num [1:5000] 2.7 3.7 3.29 1.78 2.73 1.7 2.03 1.92 2.35 3.02 ...
## $ number_customer_service_calls: int [1:5000] 1 1 0 2 3 0 3 0 1 0 ...
## $ churn : Factor w/ 2 levels "yes","no": 2 2 2 2 2 2 2 2 2 2 ...
table(mlc_churn$churn)
##
## yes no
## 707 4293
churnTrainIndex<-createDataPartition(mlc_churn$churn,p=0.80, list=FALSE)
churnTrain<-mlc_churn[churnTrainIndex,]
churnTest<-mlc_churn[-churnTrainIndex, ]
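A quick sanity check, not in the original run: createDataPartition() samples within each level of churn, so the class mix of the two partitions should be almost identical.
prop.table(table(churnTrain$churn))  # class proportions in the training partition
prop.table(table(churnTest$churn))   # should closely match the test partition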
names (churnTrain)
## [1] "state" "account_length"
## [3] "area_code" "international_plan"
## [5] "voice_mail_plan" "number_vmail_messages"
## [7] "total_day_minutes" "total_day_calls"
## [9] "total_day_charge" "total_eve_minutes"
## [11] "total_eve_calls" "total_eve_charge"
## [13] "total_night_minutes" "total_night_calls"
## [15] "total_night_charge" "total_intl_minutes"
## [17] "total_intl_calls" "total_intl_charge"
## [19] "number_customer_service_calls" "churn"
summary (churnTrain)
## state account_length area_code international_plan
## WV : 120 Min. : 1.0 area_code_408: 992 no :3617
## AL : 99 1st Qu.: 73.0 area_code_415:1993 yes: 384
## MN : 97 Median : 99.0 area_code_510:1016
## NJ : 97 Mean :100.1
## OH : 96 3rd Qu.:127.0
## NY : 95 Max. :243.0
## (Other):3397
## voice_mail_plan number_vmail_messages total_day_minutes total_day_calls
## no :2946 Min. : 0.00 Min. : 0.0 Min. : 0.0
## yes:1055 1st Qu.: 0.00 1st Qu.:143.7 1st Qu.: 87.0
## Median : 0.00 Median :179.6 Median :100.0
## Mean : 7.77 Mean :180.1 Mean :100.1
## 3rd Qu.:17.00 3rd Qu.:215.5 3rd Qu.:114.0
## Max. :51.00 Max. :351.5 Max. :165.0
##
## total_day_charge total_eve_minutes total_eve_calls total_eve_charge
## Min. : 0.00 Min. : 0.0 Min. : 0.0 Min. : 0.00
## 1st Qu.:24.43 1st Qu.:165.9 1st Qu.: 87.0 1st Qu.:14.10
## Median :30.53 Median :201.4 Median :100.0 Median :17.12
## Mean :30.62 Mean :201.0 Mean :100.1 Mean :17.09
## 3rd Qu.:36.64 3rd Qu.:234.9 3rd Qu.:113.0 3rd Qu.:19.97
## Max. :59.76 Max. :361.8 Max. :169.0 Max. :30.75
##
## total_night_minutes total_night_calls total_night_charge total_intl_minutes
## Min. : 23.2 Min. : 33.00 Min. : 1.040 Min. : 0.00
## 1st Qu.:167.3 1st Qu.: 87.00 1st Qu.: 7.530 1st Qu.: 8.60
## Median :200.2 Median :100.00 Median : 9.010 Median :10.40
## Mean :200.5 Mean : 99.74 Mean : 9.021 Mean :10.27
## 3rd Qu.:234.0 3rd Qu.:113.00 3rd Qu.:10.530 3rd Qu.:12.00
## Max. :395.0 Max. :175.00 Max. :17.770 Max. :20.00
##
## total_intl_calls total_intl_charge number_customer_service_calls churn
## Min. : 0.00 Min. :0.000 Min. :0.000 yes: 566
## 1st Qu.: 3.00 1st Qu.:2.320 1st Qu.:1.000 no :3435
## Median : 4.00 Median :2.810 Median :1.000
## Mean : 4.45 Mean :2.773 Mean :1.564
## 3rd Qu.: 6.00 3rd Qu.:3.240 3rd Qu.:2.000
## Max. :20.00 Max. :5.400 Max. :9.000
##
##Model Training and Testing
dt_model <- train (churn ~ ., data = churnTrain, method = "rpart")
dt_pred <- predict (dt_model, newdata = churnTest)
histogram (dt_pred)
table (dt_pred, churnTest$churn)
##
## dt_pred yes no
## yes 57 34
## no 84 824
prop.table (table(dt_pred, churnTest$churn), margin = NULL)
##
## dt_pred yes no
## yes 0.05705706 0.03403403
## no 0.08408408 0.82482482
confusionMatrix (dt_pred, churnTest$churn)
## Confusion Matrix and Statistics
##
## Reference
## Prediction yes no
## yes 57 34
## no 84 824
##
## Accuracy : 0.8819
## 95% CI : (0.8602, 0.9012)
## No Information Rate : 0.8589
## P-Value [Acc > NIR] : 0.01866
##
## Kappa : 0.4281
##
## Mcnemar's Test P-Value : 6.458e-06
##
## Sensitivity : 0.40426
## Specificity : 0.96037
## Pos Pred Value : 0.62637
## Neg Pred Value : 0.90749
## Prevalence : 0.14114
## Detection Rate : 0.05706
## Detection Prevalence : 0.09109
## Balanced Accuracy : 0.68231
##
## 'Positive' Class : yes
##
#Precision, Recall and F Measure
precision <- posPredValue(dt_pred, churnTest$churn, positive = "yes")
recall <- sensitivity(dt_pred, churnTest$churn, positive = "yes")
f <- 2 * precision * recall / (precision + recall)
sprintf("Precision is %.2f; recall is %.2f; F measure is %.2f", precision, recall, f)
## [1] "Precision is 0.63; recall is 0.40; F measure is 0.49"
train_index <-sample (1:nrow(churnTrain), replace = F, size = nrow(churnTrain) * 0.8)
churnTrain2 <- churnTrain[train_index, ]
churnTest2<-churnTrain[-train_index, ]
prop.table(table (churnTrain2$churn))
##
## yes no
## 0.14125 0.85875
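Note that sample() draws rows uniformly rather than within each class, so this second split is not stratified; it is worth checking the held-out fold's mix as well (a quick check, not part of the original output):
prop.table(table(churnTest2$churn))  # class mix of the held-out fold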
#Cross Validation
tr_control <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
# Note: tr_control is not passed to train() below, so caret falls back to its
# default of 25 bootstrap resamples (visible in the output); the repeated-CV
# control is only applied to dt_model_cv2 further down.
dt_model_cv <- train(churn ~ ., data = churnTrain2, method = "rpart", metric = "Accuracy")
dt_model_cv
## CART
##
## 3200 samples
## 19 predictor
## 2 classes: 'yes', 'no'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 3200, 3200, 3200, 3200, 3200, 3200, ...
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.06858407 0.8888245 0.4368219
## 0.08296460 0.8695920 0.2743505
## 0.09513274 0.8623577 0.1726141
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.06858407.
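The tuning profile behind this table can be inspected graphically with caret's plot method for train objects; a minimal sketch (plot not shown):
plot(dt_model_cv)  # resampled accuracy across the candidate cp values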
tr_control <- trainControl (method = "repeatedcv", number = 10, repeats = 3)
dt_model_cv2 <- train(churn ~ ., data = churnTrain2, method = "rpart",
                      trControl = tr_control, metric = "Accuracy", na.action = na.omit,
                      control = rpart.control(minsplit = 30, minbucket = 10,
                                              maxdepth = 6, cp = 0.07))
pred_cv2<-predict (dt_model_cv2, newdata = churnTest2)
table (pred_cv2, churnTest2$churn)
##
## pred_cv2 yes no
## yes 48 19
## no 66 668
prop.table (table(pred_cv2, churnTest2$churn), margin = NULL)
##
## pred_cv2 yes no
## yes 0.05992509 0.02372035
## no 0.08239700 0.83395755
confusionMatrix (pred_cv2, churnTest2$churn)
## Confusion Matrix and Statistics
##
## Reference
## Prediction yes no
## yes 48 19
## no 66 668
##
## Accuracy : 0.8939
## 95% CI : (0.8705, 0.9144)
## No Information Rate : 0.8577
## P-Value [Acc > NIR] : 0.001429
##
## Kappa : 0.4751
##
## Mcnemar's Test P-Value : 6.057e-07
##
## Sensitivity : 0.42105
## Specificity : 0.97234
## Pos Pred Value : 0.71642
## Neg Pred Value : 0.91008
## Prevalence : 0.14232
## Detection Rate : 0.05993
## Detection Prevalence : 0.08365
## Balanced Accuracy : 0.69670
##
## 'Positive' Class : yes
##
#Precision, Recall and F Measure
precision <- posPredValue(pred_cv2, churnTest2$churn, positive = "yes")
recall <- sensitivity(pred_cv2, churnTest2$churn, positive = "yes")
f <- 2 * precision * recall / (precision + recall)
sprintf("Precision is %.2f; recall is %.2f; F measure is %.2f", precision, recall, f)
## [1] "Precision is 0.72; recall is 0.42; F measure is 0.53"
#Hold-out method (2): Training and Testing
train_index <-createDataPartition (churnTrain$churn, p = 0.8, list = F)
length (train_index)
## [1] 3201
nrow (churnTrain)
## [1] 4001
churnTrain3 <- churnTrain [train_index, ]
churnTest3 <-churnTrain [- train_index, ]
table(churnTrain3$churn)
##
## yes no
## 453 2748
prop.table (table(churnTest3$churn))
##
## yes no
## 0.14125 0.85875
tr_control <- trainControl (method = "repeatedcv", number = 10, repeats = 3)
dt_model_cv3 <- train(churn ~ ., data = churnTrain3, method = "rpart",
                      trControl = tr_control, metric = "Accuracy", na.action = na.omit,
                      control = rpart.control(minsplit = 30, minbucket = 10,
                                              maxdepth = 6, cp = 0.07))
pred_cv3<-predict (dt_model_cv3, newdata = churnTest3)
table (pred_cv3, churnTest3$churn)
##
## pred_cv3 yes no
## yes 48 26
## no 65 661
prop.table (table(pred_cv3, churnTest3$churn), margin = NULL)
##
## pred_cv3 yes no
## yes 0.06000 0.03250
## no 0.08125 0.82625
confusionMatrix (pred_cv3, churnTest3$churn)
## Confusion Matrix and Statistics
##
## Reference
## Prediction yes no
## yes 48 26
## no 65 661
##
## Accuracy : 0.8862
## 95% CI : (0.8622, 0.9074)
## No Information Rate : 0.8588
## P-Value [Acc > NIR] : 0.01277
##
## Kappa : 0.4521
##
## Mcnemar's Test P-Value : 6.791e-05
##
## Sensitivity : 0.4248
## Specificity : 0.9622
## Pos Pred Value : 0.6486
## Neg Pred Value : 0.9105
## Prevalence : 0.1412
## Detection Rate : 0.0600
## Detection Prevalence : 0.0925
## Balanced Accuracy : 0.6935
##
## 'Positive' Class : yes
##
#Precision, Recall and F Measure
precision <- posPredValue(pred_cv3, churnTest3$churn, positive = "yes")
recall <- sensitivity(pred_cv3, churnTest3$churn, positive = "yes")
f <- 2 * precision * recall / (precision + recall)
sprintf("Precision is %.2f; recall is %.2f; F measure is %.2f", precision, recall, f)
## [1] "Precision is 0.65; recall is 0.42; F measure is 0.51"
#Bootstrapping
tr_control <-trainControl (method = "boot", number = 10)
dt_model_cv4 <- train(churn ~ ., data = churnTrain2, method = "rpart",
                      trControl = tr_control, metric = "Accuracy", na.action = na.omit,
                      control = rpart.control(minsplit = 30, minbucket = 10,
                                              maxdepth = 6, cp = 0.07))
pred_cv4<-predict (dt_model_cv4, newdata = churnTest2)
table (pred_cv4, churnTest2$churn)
##
## pred_cv4 yes no
## yes 48 19
## no 66 668
prop.table (table(pred_cv4, churnTest2$churn), margin = NULL)
##
## pred_cv4 yes no
## yes 0.05992509 0.02372035
## no 0.08239700 0.83395755
confusionMatrix (pred_cv4, churnTest2$churn)
## Confusion Matrix and Statistics
##
## Reference
## Prediction yes no
## yes 48 19
## no 66 668
##
## Accuracy : 0.8939
## 95% CI : (0.8705, 0.9144)
## No Information Rate : 0.8577
## P-Value [Acc > NIR] : 0.001429
##
## Kappa : 0.4751
##
## Mcnemar's Test P-Value : 6.057e-07
##
## Sensitivity : 0.42105
## Specificity : 0.97234
## Pos Pred Value : 0.71642
## Neg Pred Value : 0.91008
## Prevalence : 0.14232
## Detection Rate : 0.05993
## Detection Prevalence : 0.08365
## Balanced Accuracy : 0.69670
##
## 'Positive' Class : yes
##
#Precision, Recall and F Measure
precision <- posPredValue(pred_cv4, churnTest2$churn, positive = "yes")
recall <- sensitivity(pred_cv4, churnTest2$churn, positive = "yes")
f <- 2 * precision * recall / (precision + recall)
sprintf("Precision is %.2f; recall is %.2f; F measure is %.2f", precision, recall, f)
## [1] "Precision is 0.72; recall is 0.42; F measure is 0.53"
#Leave One Out
tr_control <- trainControl(method = "LOOCV")  # the 'number' argument has no effect with LOOCV
dt_model_cv5 <- train(churn ~ ., data = churnTrain2, method = "rpart",
                      trControl = tr_control, metric = "Accuracy", na.action = na.omit,
                      control = rpart.control(minsplit = 30, minbucket = 10,
                                              maxdepth = 6, cp = 0.07))
pred_cv5<-predict (dt_model_cv5, newdata = churnTest2)
table (pred_cv5, churnTest2$churn)
##
## pred_cv5 yes no
## yes 48 19
## no 66 668
prop.table (table(pred_cv5, churnTest2$churn), margin = NULL)
##
## pred_cv5 yes no
## yes 0.05992509 0.02372035
## no 0.08239700 0.83395755
confusionMatrix (pred_cv5, churnTest2$churn)
## Confusion Matrix and Statistics
##
## Reference
## Prediction yes no
## yes 48 19
## no 66 668
##
## Accuracy : 0.8939
## 95% CI : (0.8705, 0.9144)
## No Information Rate : 0.8577
## P-Value [Acc > NIR] : 0.001429
##
## Kappa : 0.4751
##
## Mcnemar's Test P-Value : 6.057e-07
##
## Sensitivity : 0.42105
## Specificity : 0.97234
## Pos Pred Value : 0.71642
## Neg Pred Value : 0.91008
## Prevalence : 0.14232
## Detection Rate : 0.05993
## Detection Prevalence : 0.08365
## Balanced Accuracy : 0.69670
##
## 'Positive' Class : yes
##
#Precision, Recall and F Measure
precision <- posPredValue(pred_cv5, churnTest2$churn, positive = "yes")
recall <- sensitivity(pred_cv5, churnTest2$churn, positive = "yes")
f <- 2 * precision * recall / (precision + recall)
sprintf("Precision is %.2f; recall is %.2f; F measure is %.2f", precision, recall, f)
## [1] "Precision is 0.72; recall is 0.42; F measure is 0.53"