# Setting up libraries

library(C50)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(mlbench)
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(modeldata)
library(rpart)

# Preparing the Dataset

# Seed the RNG so the random partitions and resampling below are repeatable.
set.seed(1881)

# Bring the mlc_churn data set into the workspace and show its column layout.
data("mlc_churn")
str(mlc_churn)
## tibble [5,000 × 20] (S3: tbl_df/tbl/data.frame)
##  $ state                        : Factor w/ 51 levels "AK","AL","AR",..: 17 36 32 36 37 2 20 25 19 50 ...
##  $ account_length               : int [1:5000] 128 107 137 84 75 118 121 147 117 141 ...
##  $ area_code                    : Factor w/ 3 levels "area_code_408",..: 2 2 2 1 2 3 3 2 1 2 ...
##  $ international_plan           : Factor w/ 2 levels "no","yes": 1 1 1 2 2 2 1 2 1 2 ...
##  $ voice_mail_plan              : Factor w/ 2 levels "no","yes": 2 2 1 1 1 1 2 1 1 2 ...
##  $ number_vmail_messages        : int [1:5000] 25 26 0 0 0 0 24 0 0 37 ...
##  $ total_day_minutes            : num [1:5000] 265 162 243 299 167 ...
##  $ total_day_calls              : int [1:5000] 110 123 114 71 113 98 88 79 97 84 ...
##  $ total_day_charge             : num [1:5000] 45.1 27.5 41.4 50.9 28.3 ...
##  $ total_eve_minutes            : num [1:5000] 197.4 195.5 121.2 61.9 148.3 ...
##  $ total_eve_calls              : int [1:5000] 99 103 110 88 122 101 108 94 80 111 ...
##  $ total_eve_charge             : num [1:5000] 16.78 16.62 10.3 5.26 12.61 ...
##  $ total_night_minutes          : num [1:5000] 245 254 163 197 187 ...
##  $ total_night_calls            : int [1:5000] 91 103 104 89 121 118 118 96 90 97 ...
##  $ total_night_charge           : num [1:5000] 11.01 11.45 7.32 8.86 8.41 ...
##  $ total_intl_minutes           : num [1:5000] 10 13.7 12.2 6.6 10.1 6.3 7.5 7.1 8.7 11.2 ...
##  $ total_intl_calls             : int [1:5000] 3 3 5 7 3 6 7 6 4 5 ...
##  $ total_intl_charge            : num [1:5000] 2.7 3.7 3.29 1.78 2.73 1.7 2.03 1.92 2.35 3.02 ...
##  $ number_customer_service_calls: int [1:5000] 1 1 0 2 3 0 3 0 1 0 ...
##  $ churn                        : Factor w/ 2 levels "yes","no": 2 2 2 2 2 2 2 2 2 2 ...
# Count customers per churn class to see how balanced the outcome is.
table(mlc_churn$churn)
## 
##  yes   no 
##  707 4293
# Draw row indices for an 80% training sample based on the churn outcome;
# list = FALSE asks for the indices in a form usable directly for subsetting.
churnTrainIndex <- createDataPartition(
  y = mlc_churn$churn,
  p = 0.80,
  list = FALSE
)

# Selected rows form the training set; every remaining row is held out.
churnTrain <- mlc_churn[churnTrainIndex, ]
churnTest <- mlc_churn[-churnTrainIndex, ]

# List the columns available in the training set.
names(churnTrain)
##  [1] "state"                         "account_length"               
##  [3] "area_code"                     "international_plan"           
##  [5] "voice_mail_plan"               "number_vmail_messages"        
##  [7] "total_day_minutes"             "total_day_calls"              
##  [9] "total_day_charge"              "total_eve_minutes"            
## [11] "total_eve_calls"               "total_eve_charge"             
## [13] "total_night_minutes"           "total_night_calls"            
## [15] "total_night_charge"            "total_intl_minutes"           
## [17] "total_intl_calls"              "total_intl_charge"            
## [19] "number_customer_service_calls" "churn"
# Per-column summary statistics of the training set.
summary (churnTrain)
##      state      account_length          area_code    international_plan
##  WV     : 120   Min.   :  1.0   area_code_408: 992   no :3617          
##  AL     :  99   1st Qu.: 73.0   area_code_415:1993   yes: 384          
##  MN     :  97   Median : 99.0   area_code_510:1016                     
##  NJ     :  97   Mean   :100.1                                          
##  OH     :  96   3rd Qu.:127.0                                          
##  NY     :  95   Max.   :243.0                                          
##  (Other):3397                                                          
##  voice_mail_plan number_vmail_messages total_day_minutes total_day_calls
##  no :2946        Min.   : 0.00         Min.   :  0.0     Min.   :  0.0  
##  yes:1055        1st Qu.: 0.00         1st Qu.:143.7     1st Qu.: 87.0  
##                  Median : 0.00         Median :179.6     Median :100.0  
##                  Mean   : 7.77         Mean   :180.1     Mean   :100.1  
##                  3rd Qu.:17.00         3rd Qu.:215.5     3rd Qu.:114.0  
##                  Max.   :51.00         Max.   :351.5     Max.   :165.0  
##                                                                         
##  total_day_charge total_eve_minutes total_eve_calls total_eve_charge
##  Min.   : 0.00    Min.   :  0.0     Min.   :  0.0   Min.   : 0.00   
##  1st Qu.:24.43    1st Qu.:165.9     1st Qu.: 87.0   1st Qu.:14.10   
##  Median :30.53    Median :201.4     Median :100.0   Median :17.12   
##  Mean   :30.62    Mean   :201.0     Mean   :100.1   Mean   :17.09   
##  3rd Qu.:36.64    3rd Qu.:234.9     3rd Qu.:113.0   3rd Qu.:19.97   
##  Max.   :59.76    Max.   :361.8     Max.   :169.0   Max.   :30.75   
##                                                                     
##  total_night_minutes total_night_calls total_night_charge total_intl_minutes
##  Min.   : 23.2       Min.   : 33.00    Min.   : 1.040     Min.   : 0.00     
##  1st Qu.:167.3       1st Qu.: 87.00    1st Qu.: 7.530     1st Qu.: 8.60     
##  Median :200.2       Median :100.00    Median : 9.010     Median :10.40     
##  Mean   :200.5       Mean   : 99.74    Mean   : 9.021     Mean   :10.27     
##  3rd Qu.:234.0       3rd Qu.:113.00    3rd Qu.:10.530     3rd Qu.:12.00     
##  Max.   :395.0       Max.   :175.00    Max.   :17.770     Max.   :20.00     
##                                                                             
##  total_intl_calls total_intl_charge number_customer_service_calls churn     
##  Min.   : 0.00    Min.   :0.000     Min.   :0.000                 yes: 566  
##  1st Qu.: 3.00    1st Qu.:2.320     1st Qu.:1.000                 no :3435  
##  Median : 4.00    Median :2.810     Median :1.000                           
##  Mean   : 4.45    Mean   :2.773     Mean   :1.564                           
##  3rd Qu.: 6.00    3rd Qu.:3.240     3rd Qu.:2.000                           
##  Max.   :20.00    Max.   :5.400     Max.   :9.000                           
## 

# Model Training and Testing

# Fit a decision tree (caret method "rpart") on every predictor, leaving
# train()'s resampling settings at their defaults.
dt_model <- train(
  churn ~ .,
  data = churnTrain,
  method = "rpart"
)

# Predict the held-out test rows and plot the predicted classes.
dt_pred <- predict(dt_model, newdata = churnTest)
histogram(dt_pred)

# Cross-tabulate predicted class (rows) against the true class (columns).
table (dt_pred, churnTest$churn)
##        
## dt_pred yes  no
##     yes  57  34
##     no   84 824
# Same cross-tabulation expressed as proportions of all test rows.
prop.table (table(dt_pred, churnTest$churn), margin = NULL)
##        
## dt_pred        yes         no
##     yes 0.05705706 0.03403403
##     no  0.08408408 0.82482482

# Check the Prediction Performance

# Counts of predicted class (rows) versus true class (columns) on the test set.
table (dt_pred, churnTest$churn)
##        
## dt_pred yes  no
##     yes  57  34
##     no   84 824
# The same table as proportions of the whole test set.
prop.table (table(dt_pred, churnTest$churn), margin = NULL)
##        
## dt_pred        yes         no
##     yes 0.05705706 0.03403403
##     no  0.08408408 0.82482482
# Full confusion-matrix report: predictions first, reference labels second.
confusionMatrix (dt_pred, churnTest$churn)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction yes  no
##        yes  57  34
##        no   84 824
##                                           
##                Accuracy : 0.8819          
##                  95% CI : (0.8602, 0.9012)
##     No Information Rate : 0.8589          
##     P-Value [Acc > NIR] : 0.01866         
##                                           
##                   Kappa : 0.4281          
##                                           
##  Mcnemar's Test P-Value : 6.458e-06       
##                                           
##             Sensitivity : 0.40426         
##             Specificity : 0.96037         
##          Pos Pred Value : 0.62637         
##          Neg Pred Value : 0.90749         
##              Prevalence : 0.14114         
##          Detection Rate : 0.05706         
##    Detection Prevalence : 0.09109         
##       Balanced Accuracy : 0.68231         
##                                           
##        'Positive' Class : yes             
## 

# Precision, Recall and F Measure

# Precision (positive predictive value) and recall (sensitivity) for the
# "yes" class of the baseline tree on the test set.
precision <- posPredValue(dt_pred, churnTest$churn, positive = "yes")
recall <- sensitivity(dt_pred, churnTest$churn, positive = "yes")
# F measure = harmonic mean of precision and recall. The previous
# `f <-- 2 * ...` parsed as `f <- -2 * ...` and reported a negative value.
f <- 2 * precision * recall / (precision + recall)
sprintf("Precision is %.2f; recall is  %.2f;  F measure is  %.2f",  precision, recall, f)
## [1] "Precision is 0.63; recall is  0.40;  F measure is  0.49"

# Hold Out Method (1)

# Simple random 80% sample of churnTrain row positions, without replacement.
# seq_len() and FALSE replace 1:nrow() and the reassignable shorthand F.
train_index <- sample(
  seq_len(nrow(churnTrain)),
  replace = FALSE,
  size = nrow(churnTrain) * 0.8
)

# Sampled rows become the inner training set; the rest the inner test set.
churnTrain2 <- churnTrain[train_index, ]
churnTest2 <- churnTrain[-train_index, ]

# Class proportions in the inner training set.
prop.table(table(churnTrain2$churn))
## 
##     yes      no 
## 0.14125 0.85875

# Cross Validation

# 10-fold cross-validation repeated 3 times.
tr_control <- trainControl(method = "repeatedcv", number = 10, repeats = 3)

# Hand the resampling scheme to train(). The previous call omitted
# `trControl`, so train() used its default resampling and the recorded
# output below still reports "Bootstrapped (25 reps)"; re-run to refresh it.
dt_model_cv <- train(
  churn ~ .,
  data = churnTrain2,
  method = "rpart",
  trControl = tr_control,
  metric = "Accuracy"
)
dt_model_cv
## CART 
## 
## 3200 samples
##   19 predictor
##    2 classes: 'yes', 'no' 
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 3200, 3200, 3200, 3200, 3200, 3200, ... 
## Resampling results across tuning parameters:
## 
##   cp          Accuracy   Kappa    
##   0.06858407  0.8888245  0.4368219
##   0.08296460  0.8695920  0.2743505
##   0.09513274  0.8623577  0.1726141
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.06858407.
# 10-fold cross-validation repeated 3 times.
tr_control <- trainControl(method = "repeatedcv", number = 10, repeats = 3)

# Train the tree on churnTrain2 under that resampling scheme, omitting rows
# with missing values and passing rpart growth limits through `control`.
dt_model_cv2 <- train(
  churn ~ .,
  data = churnTrain2,
  method = "rpart",
  trControl = tr_control,
  metric = "Accuracy",
  na.action = na.omit,
  control = rpart.control(
    minsplit = 30,
    minbucket = 10,
    maxdepth = 6,
    cp = 0.07
  )
)

# Score the inner test split and cross-tabulate predictions against truth.
pred_cv2 <- predict(dt_model_cv2, newdata = churnTest2)
table(pred_cv2, churnTest2$churn)
##         
## pred_cv2 yes  no
##      yes  48  19
##      no   66 668
# Prediction-versus-truth table as proportions of all churnTest2 rows.
prop.table (table(pred_cv2, churnTest2$churn), margin = NULL)
##         
## pred_cv2        yes         no
##      yes 0.05992509 0.02372035
##      no  0.08239700 0.83395755
# Full confusion-matrix report for the repeated-CV model on churnTest2.
confusionMatrix (pred_cv2, churnTest2$churn)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction yes  no
##        yes  48  19
##        no   66 668
##                                           
##                Accuracy : 0.8939          
##                  95% CI : (0.8705, 0.9144)
##     No Information Rate : 0.8577          
##     P-Value [Acc > NIR] : 0.001429        
##                                           
##                   Kappa : 0.4751          
##                                           
##  Mcnemar's Test P-Value : 6.057e-07       
##                                           
##             Sensitivity : 0.42105         
##             Specificity : 0.97234         
##          Pos Pred Value : 0.71642         
##          Neg Pred Value : 0.91008         
##              Prevalence : 0.14232         
##          Detection Rate : 0.05993         
##    Detection Prevalence : 0.08365         
##       Balanced Accuracy : 0.69670         
##                                           
##        'Positive' Class : yes             
## 
# Precision, Recall and F Measure
# Precision and recall for the "yes" class of the repeated-CV model.
precision <- posPredValue(pred_cv2, churnTest2$churn, positive = "yes")
recall <- sensitivity(pred_cv2, churnTest2$churn, positive = "yes")
# F measure = harmonic mean of precision and recall. The previous
# `f <-- 2 * ...` parsed as `f <- -2 * ...` and reported a negative value.
f <- 2 * precision * recall / (precision + recall)
sprintf("Precision is %.2f; recall is  %.2f;  F measure is  %.2f",  precision, recall, f)
## [1] "Precision is 0.72; recall is  0.42;  F measure is  0.53"

# Hold out method (2), Training and Testing

# 80% partition of churnTrain drawn on the churn outcome
# (contrast with the plain sample() used in hold out method (1)).
train_index <- createDataPartition(churnTrain$churn, p = 0.8, list = FALSE)
length(train_index)
## [1] 3201
nrow(churnTrain)
## [1] 4001
churnTrain3 <- churnTrain[train_index, ]
churnTest3 <- churnTrain[-train_index, ]
# Class proportions in the new training split. The previous
# `table(table(...))` tabulated the two counts themselves (453 and 2748 each
# occurring once) instead of describing the class balance; the output below
# is recomputed from those recorded counts.
prop.table(table(churnTrain3$churn))
## 
##       yes        no 
## 0.1415183 0.8584817 
# Class proportions in the matching test split.
prop.table(table(churnTest3$churn))
## 
##     yes      no 
## 0.14125 0.85875
# 10-fold cross-validation repeated 3 times.
tr_control <- trainControl(method = "repeatedcv", number = 10, repeats = 3)

# Train the tree on churnTrain3 under that resampling scheme, omitting rows
# with missing values and passing rpart growth limits through `control`.
dt_model_cv3 <- train(
  churn ~ .,
  data = churnTrain3,
  method = "rpart",
  trControl = tr_control,
  metric = "Accuracy",
  na.action = na.omit,
  control = rpart.control(
    minsplit = 30,
    minbucket = 10,
    maxdepth = 6,
    cp = 0.07
  )
)

# Score the matching test split and cross-tabulate predictions against truth.
pred_cv3 <- predict(dt_model_cv3, newdata = churnTest3)
table(pred_cv3, churnTest3$churn)
##         
## pred_cv3 yes  no
##      yes  48  26
##      no   65 661
# Prediction-versus-truth table as proportions of all churnTest3 rows.
prop.table (table(pred_cv3, churnTest3$churn), margin = NULL)
##         
## pred_cv3     yes      no
##      yes 0.06000 0.03250
##      no  0.08125 0.82625
# Full confusion-matrix report for the churnTrain3 model on churnTest3.
confusionMatrix (pred_cv3, churnTest3$churn)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction yes  no
##        yes  48  26
##        no   65 661
##                                           
##                Accuracy : 0.8862          
##                  95% CI : (0.8622, 0.9074)
##     No Information Rate : 0.8588          
##     P-Value [Acc > NIR] : 0.01277         
##                                           
##                   Kappa : 0.4521          
##                                           
##  Mcnemar's Test P-Value : 6.791e-05       
##                                           
##             Sensitivity : 0.4248          
##             Specificity : 0.9622          
##          Pos Pred Value : 0.6486          
##          Neg Pred Value : 0.9105          
##              Prevalence : 0.1412          
##          Detection Rate : 0.0600          
##    Detection Prevalence : 0.0925          
##       Balanced Accuracy : 0.6935          
##                                           
##        'Positive' Class : yes             
## 
# Precision, Recall and F Measure
# Precision and recall for the "yes" class of the churnTrain3 model.
precision <- posPredValue(pred_cv3, churnTest3$churn, positive = "yes")
recall <- sensitivity(pred_cv3, churnTest3$churn, positive = "yes")
# F measure = harmonic mean of precision and recall. The previous
# `f <-- 2 * ...` parsed as `f <- -2 * ...` and reported a negative value.
f <- 2 * precision * recall / (precision + recall)
sprintf("Precision is %.2f; recall is  %.2f;  F measure is  %.2f",  precision, recall, f)
## [1] "Precision is 0.65; recall is  0.42;  F measure is  0.51"

# Bootstrapping

# Bootstrap resampling with number = 10.
tr_control <- trainControl(method = "boot", number = 10)

# Train the tree on churnTrain2 under bootstrap resampling, omitting rows
# with missing values and passing rpart growth limits through `control`.
dt_model_cv4 <- train(
  churn ~ .,
  data = churnTrain2,
  method = "rpart",
  trControl = tr_control,
  metric = "Accuracy",
  na.action = na.omit,
  control = rpart.control(
    minsplit = 30,
    minbucket = 10,
    maxdepth = 6,
    cp = 0.07
  )
)

# Score the inner test split and cross-tabulate predictions against truth.
pred_cv4 <- predict(dt_model_cv4, newdata = churnTest2)
table(pred_cv4, churnTest2$churn)
##         
## pred_cv4 yes  no
##      yes  48  19
##      no   66 668
# Prediction-versus-truth table as proportions of all churnTest2 rows.
prop.table (table(pred_cv4, churnTest2$churn), margin = NULL)
##         
## pred_cv4        yes         no
##      yes 0.05992509 0.02372035
##      no  0.08239700 0.83395755
# Full confusion-matrix report for the bootstrap-resampled model on churnTest2.
confusionMatrix (pred_cv4, churnTest2$churn)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction yes  no
##        yes  48  19
##        no   66 668
##                                           
##                Accuracy : 0.8939          
##                  95% CI : (0.8705, 0.9144)
##     No Information Rate : 0.8577          
##     P-Value [Acc > NIR] : 0.001429        
##                                           
##                   Kappa : 0.4751          
##                                           
##  Mcnemar's Test P-Value : 6.057e-07       
##                                           
##             Sensitivity : 0.42105         
##             Specificity : 0.97234         
##          Pos Pred Value : 0.71642         
##          Neg Pred Value : 0.91008         
##              Prevalence : 0.14232         
##          Detection Rate : 0.05993         
##    Detection Prevalence : 0.08365         
##       Balanced Accuracy : 0.69670         
##                                           
##        'Positive' Class : yes             
## 
# Precision, Recall and F Measure
# Precision and recall for the "yes" class of the bootstrap-resampled model.
precision <- posPredValue(pred_cv4, churnTest2$churn, positive = "yes")
recall <- sensitivity(pred_cv4, churnTest2$churn, positive = "yes")
# F measure = harmonic mean of precision and recall. The previous
# `f <-- 2 * ...` parsed as `f <- -2 * ...` and reported a negative value.
f <- 2 * precision * recall / (precision + recall)
sprintf("Precision is %.2f; recall is  %.2f;  F measure is  %.2f",  precision, recall, f)
## [1] "Precision is 0.72; recall is  0.42;  F measure is  0.53"

# Leave One Out

# Leave-one-out cross-validation.
# NOTE(review): `number = 10` is probably unused with method = "LOOCV" —
# confirm against the caret documentation before removing it.
tr_control <- trainControl(method = "LOOCV", number = 10)

# Train the tree on churnTrain2 under leave-one-out resampling, omitting rows
# with missing values and passing rpart growth limits through `control`.
dt_model_cv5 <- train(
  churn ~ .,
  data = churnTrain2,
  method = "rpart",
  trControl = tr_control,
  metric = "Accuracy",
  na.action = na.omit,
  control = rpart.control(
    minsplit = 30,
    minbucket = 10,
    maxdepth = 6,
    cp = 0.07
  )
)

# Score the inner test split and cross-tabulate predictions against truth.
pred_cv5 <- predict(dt_model_cv5, newdata = churnTest2)
table(pred_cv5, churnTest2$churn)
##         
## pred_cv5 yes  no
##      yes  48  19
##      no   66 668
# Prediction-versus-truth table as proportions of all churnTest2 rows.
prop.table (table(pred_cv5, churnTest2$churn), margin = NULL)
##         
## pred_cv5        yes         no
##      yes 0.05992509 0.02372035
##      no  0.08239700 0.83395755
# Full confusion-matrix report for the leave-one-out model on churnTest2.
confusionMatrix (pred_cv5, churnTest2$churn)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction yes  no
##        yes  48  19
##        no   66 668
##                                           
##                Accuracy : 0.8939          
##                  95% CI : (0.8705, 0.9144)
##     No Information Rate : 0.8577          
##     P-Value [Acc > NIR] : 0.001429        
##                                           
##                   Kappa : 0.4751          
##                                           
##  Mcnemar's Test P-Value : 6.057e-07       
##                                           
##             Sensitivity : 0.42105         
##             Specificity : 0.97234         
##          Pos Pred Value : 0.71642         
##          Neg Pred Value : 0.91008         
##              Prevalence : 0.14232         
##          Detection Rate : 0.05993         
##    Detection Prevalence : 0.08365         
##       Balanced Accuracy : 0.69670         
##                                           
##        'Positive' Class : yes             
## 
# Precision, Recall and F Measure
# Precision and recall for the "yes" class of the leave-one-out model.
precision <- posPredValue(pred_cv5, churnTest2$churn, positive = "yes")
recall <- sensitivity(pred_cv5, churnTest2$churn, positive = "yes")
# F measure = harmonic mean of precision and recall. The previous
# `f <-- 2 * ...` parsed as `f <- -2 * ...` and reported a negative value.
f <- 2 * precision * recall / (precision + recall)
sprintf("Precision is %.2f; recall is  %.2f;  F measure is  %.2f",  precision, recall, f)
## [1] "Precision is 0.72; recall is  0.42;  F measure is  0.53"