# Dataset: churn
# The workspace is deliberately NOT cleared with rm(list = ls()): doing so
# would silently delete the objects of whoever sources this script, and every
# object used below is created by the script itself.
library(C50)
library(modeldata)
# The churn dataset is no longer shipped with the C50 package, so the copy in
# modeldata (mlc_churn) is used and split into train and test parts here.
data(mlc_churn)
# Fixed seed so the random train/test split is reproducible.
set.seed(123)
# The train part of the churn dataset in earlier C50 versions had 3333 rows,
# so the same number of rows is drawn at random from mlc_churn. The row count
# is taken from the data instead of being hard-coded.
train_sample <- sample(nrow(mlc_churn), 3333)
str(train_sample)
##  int [1:3333] 2463 2511 2227 526 4291 2986 1842 1142 3371 3446 ...
# Sampled rows form the training set; all remaining rows form the test set.
churnTrain <- mlc_churn[train_sample, ]
churnTest <- mlc_churn[-train_sample, ]
# Train and test rows stacked back into a single table.
churn <- rbind(churnTrain, churnTest)
str(churnTrain)
## tibble [3,333 × 20] (S3: tbl_df/tbl/data.frame)
##  $ state                        : Factor w/ 51 levels "AK","AL","AR",..: 30 46 20 37 38 16 50 6 36 7 ...
##  $ account_length               : int [1:3333] 92 118 120 176 170 7 52 157 54 60 ...
##  $ area_code                    : Factor w/ 3 levels "area_code_408",..: 2 1 2 1 1 2 3 2 2 2 ...
##  $ international_plan           : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ voice_mail_plan              : Factor w/ 2 levels "no","yes": 1 1 1 1 2 1 2 1 1 2 ...
##  $ number_vmail_messages        : int [1:3333] 0 0 0 0 22 0 21 0 0 20 ...
##  $ total_day_minutes            : num [1:3333] 181.4 189.3 137.3 47.4 173.6 ...
##  $ total_day_calls              : int [1:3333] 98 119 100 125 66 87 119 74 122 113 ...
##  $ total_day_charge             : num [1:3333] 30.84 32.18 23.34 8.06 29.51 ...
##  $ total_eve_minutes            : num [1:3333] 164 234 212 168 142 ...
##  $ total_eve_calls              : int [1:3333] 98 112 129 90 83 83 95 96 87 97 ...
##  $ total_eve_charge             : num [1:3333] 14 19.9 18 14.3 12.1 ...
##  $ total_night_minutes          : num [1:3333] 171 271 153 163 156 ...
##  $ total_night_calls            : int [1:3333] 110 104 92 107 93 77 94 81 72 78 ...
##  $ total_night_charge           : num [1:3333] 7.69 12.19 6.87 7.34 7.04 ...
##  $ total_intl_minutes           : num [1:3333] 10.9 10 10.5 10.5 5.2 11 5.3 7.9 11.1 9.4 ...
##  $ total_intl_calls             : int [1:3333] 4 1 2 8 3 5 3 6 6 4 ...
##  $ total_intl_charge            : num [1:3333] 2.94 2.7 2.84 2.84 1.4 2.97 1.43 2.13 3 2.54 ...
##  $ number_customer_service_calls: int [1:3333] 2 2 1 2 1 3 2 1 0 1 ...
##  $ churn                        : Factor w/ 2 levels "yes","no": 2 2 2 2 2 2 2 2 2 2 ...
# Structure (column types and first values) of the held-out test rows.
str(churnTest)
## tibble [1,667 × 20] (S3: tbl_df/tbl/data.frame)
##  $ state                        : Factor w/ 51 levels "AK","AL","AR",..: 36 2 25 19 27 35 6 4 41 27 ...
##  $ account_length               : int [1:1667] 84 118 147 117 95 161 77 130 111 54 ...
##  $ area_code                    : Factor w/ 3 levels "area_code_408",..: 1 3 2 1 3 2 1 2 2 1 ...
##  $ international_plan           : Factor w/ 2 levels "no","yes": 2 2 2 1 1 1 1 1 1 1 ...
##  $ voice_mail_plan              : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ number_vmail_messages        : int [1:1667] 0 0 0 0 0 0 0 0 0 0 ...
##  $ total_day_minutes            : num [1:1667] 299 223 157 184 157 ...
##  $ total_day_calls              : int [1:1667] 71 98 79 97 88 67 89 112 103 73 ...
##  $ total_day_charge             : num [1:1667] 50.9 38 26.7 31.4 26.6 ...
##  $ total_eve_minutes            : num [1:1667] 61.9 220.6 103.1 351.6 247.6 ...
##  $ total_eve_calls              : int [1:1667] 88 101 94 80 75 97 121 99 102 100 ...
##  $ total_eve_charge             : num [1:1667] 5.26 18.75 8.76 29.89 21.05 ...
##  $ total_night_minutes          : num [1:1667] 197 204 212 216 192 ...
##  $ total_night_calls            : int [1:1667] 89 118 96 90 115 128 64 78 105 68 ...
##  $ total_night_charge           : num [1:1667] 8.86 9.18 9.53 9.71 8.65 7.23 9.43 8.18 8.53 4.59 ...
##  $ total_intl_minutes           : num [1:1667] 6.6 6.3 7.1 8.7 12.3 5.4 5.7 9.5 7.7 14.7 ...
##  $ total_intl_calls             : int [1:1667] 7 6 6 4 5 9 6 19 6 4 ...
##  $ total_intl_charge            : num [1:1667] 1.78 1.7 1.92 2.35 3.32 1.46 1.54 2.57 2.08 3.97 ...
##  $ number_customer_service_calls: int [1:1667] 2 0 0 1 3 4 5 0 2 3 ...
##  $ churn                        : Factor w/ 2 levels "yes","no": 2 2 2 2 2 1 1 2 2 2 ...
# Column names of the training set.
names(churnTrain)
##  [1] "state"                         "account_length"               
##  [3] "area_code"                     "international_plan"           
##  [5] "voice_mail_plan"               "number_vmail_messages"        
##  [7] "total_day_minutes"             "total_day_calls"              
##  [9] "total_day_charge"              "total_eve_minutes"            
## [11] "total_eve_calls"               "total_eve_charge"             
## [13] "total_night_minutes"           "total_night_calls"            
## [15] "total_night_charge"            "total_intl_minutes"           
## [17] "total_intl_calls"              "total_intl_charge"            
## [19] "number_customer_service_calls" "churn"
# Per-column summary statistics of the training set.
summary(churnTrain)
##      state      account_length          area_code    international_plan
##  WV     : 105   Min.   :  1.0   area_code_408: 841   no :3009          
##  VA     :  85   1st Qu.: 73.0   area_code_415:1685   yes: 324          
##  AL     :  82   Median :100.0   area_code_510: 807                     
##  ID     :  82   Mean   :100.8                                          
##  NY     :  82   3rd Qu.:128.0                                          
##  MN     :  80   Max.   :243.0                                          
##  (Other):2817                                                          
##  voice_mail_plan number_vmail_messages total_day_minutes total_day_calls
##  no :2444        Min.   : 0.00         Min.   :  0.0     Min.   :  0.0  
##  yes: 889        1st Qu.: 0.00         1st Qu.:144.0     1st Qu.: 87.0  
##                  Median : 0.00         Median :179.7     Median :100.0  
##                  Mean   : 7.84         Mean   :180.2     Mean   :100.3  
##                  3rd Qu.:17.00         3rd Qu.:215.8     3rd Qu.:114.0  
##                  Max.   :51.00         Max.   :351.5     Max.   :165.0  
##                                                                         
##  total_day_charge total_eve_minutes total_eve_calls total_eve_charge
##  Min.   : 0.00    Min.   : 22.3     Min.   : 37.0   Min.   : 1.90   
##  1st Qu.:24.48    1st Qu.:166.2     1st Qu.: 87.0   1st Qu.:14.13   
##  Median :30.55    Median :201.3     Median :101.0   Median :17.11   
##  Mean   :30.63    Mean   :200.3     Mean   :100.4   Mean   :17.03   
##  3rd Qu.:36.69    3rd Qu.:233.9     3rd Qu.:113.0   3rd Qu.:19.88   
##  Max.   :59.76    Max.   :361.8     Max.   :170.0   Max.   :30.75   
##                                                                     
##  total_night_minutes total_night_calls total_night_charge total_intl_minutes
##  Min.   :  0.0       Min.   :  0.00    Min.   : 0.000     Min.   : 0.00     
##  1st Qu.:166.5       1st Qu.: 86.00    1st Qu.: 7.490     1st Qu.: 8.50     
##  Median :200.6       Median :100.00    Median : 9.030     Median :10.30     
##  Mean   :200.6       Mean   : 99.87    Mean   : 9.027     Mean   :10.23     
##  3rd Qu.:234.9       3rd Qu.:114.00    3rd Qu.:10.570     3rd Qu.:12.00     
##  Max.   :395.0       Max.   :175.00    Max.   :17.770     Max.   :20.00     
##                                                                             
##  total_intl_calls total_intl_charge number_customer_service_calls churn     
##  Min.   : 0.000   Min.   :0.000     Min.   :0.000                 yes: 475  
##  1st Qu.: 3.000   1st Qu.:2.300     1st Qu.:1.000                 no :2858  
##  Median : 4.000   Median :2.780     Median :1.000                           
##  Mean   : 4.402   Mean   :2.764     Mean   :1.559                           
##  3rd Qu.: 6.000   3rd Qu.:3.240     3rd Qu.:2.000                           
##  Max.   :20.000   Max.   :5.400     Max.   :9.000                           
## 
# Model Training and Testing
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(rpart)
# Fit a decision tree (caret's "rpart" method) predicting churn from all
# other columns of the training set.
dt_model <- train(
  churn ~ .,
  data = churnTrain,
  method = "rpart"
)
# Predicted class labels for the held-out test rows.
dt_pred <- predict(dt_model, newdata = churnTest)
# Distribution of the predicted classes.
histogram(dt_pred)

# Check the Prediction Performance
# Confusion counts: rows are the predictions, columns are the true labels.
table(dt_pred, churnTest$churn)
##        
## dt_pred  yes   no
##     yes  149   24
##     no    83 1411
# The same confusion table expressed as proportions of the test set.
# prop.table() keeps the class labels; tabulating the table a second time
# would only count how often each cell count occurs.
prop.table(table(dt_pred, churnTest$churn))
##        
## dt_pred        yes         no
##     yes 0.08938212 0.01439712
##     no  0.04979004 0.84643071
# Full set of caret performance statistics for the predictions.
confusionMatrix(dt_pred, churnTest$churn)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  yes   no
##        yes  149   24
##        no    83 1411
##                                          
##                Accuracy : 0.9358         
##                  95% CI : (0.923, 0.9471)
##     No Information Rate : 0.8608         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.7002         
##                                          
##  Mcnemar's Test P-Value : 2.058e-08      
##                                          
##             Sensitivity : 0.64224        
##             Specificity : 0.98328        
##          Pos Pred Value : 0.86127        
##          Neg Pred Value : 0.94444        
##              Prevalence : 0.13917        
##          Detection Rate : 0.08938        
##    Detection Prevalence : 0.10378        
##       Balanced Accuracy : 0.81276        
##                                          
##        'Positive' Class : yes            
## 
# Precision, Recall, and F Measure
# The "yes" level of churn is treated as the positive class.
# Plain `<-` is required here: `<--` parses as `<- -` and negates the value.
precision <- posPredValue(dt_pred, churnTest$churn, positive = "yes")
recall <- sensitivity(dt_pred, churnTest$churn, positive = "yes")
# F measure: harmonic mean of precision and recall.
f <- 2 * precision * recall / (precision + recall)
sprintf("Precision is %.2f; recall is  %.2f;  F measure is  %.2f ", precision, recall, f)
## [1] "Precision is 0.86; recall is  0.64;  F measure is  0.74 "
# Hold Out Method (1)
# Draw 80% of the training rows uniformly at random, without replacement.
# seq_len() is safe for an empty table, and floor() makes the truncation of
# the fractional sample size explicit.
train_index <- sample(
  seq_len(nrow(churnTrain)),
  size = floor(nrow(churnTrain) * 0.8),
  replace = FALSE
)
churnTrain2 <- churnTrain[train_index, ]
# Class proportions in the random subset; plain random sampling does not
# guarantee that they match the proportions of the full training set.
prop.table(table(churnTrain2$churn))
## 
##       yes        no 
## 0.1470368 0.8529632
# Hold Out Method (2)
# createDataPartition() draws the 80% split within the levels of churn
# (see the caret documentation), returning row indices as a matrix because
# list = FALSE.
train_index <- createDataPartition(churnTrain$churn, p = 0.8, list = FALSE)
length(train_index)
## [1] 2667
nrow(churnTrain)
## [1] 3333
# Selected rows become the new training subset, the rest a validation subset.
churnTrain2 <- churnTrain[train_index, ]
churnTest2 <- churnTrain[-train_index, ]
# Class proportions of the subset and of the full training set, printed one
# after the other so the two can be compared directly.
prop.table(table(churnTrain2$churn))
## 
##       yes        no 
## 0.1424822 0.8575178
prop.table(table(churnTrain$churn))
## 
##       yes        no 
## 0.1425143 0.8574857
 # Cross Validation Method
# Resampling scheme: 10-fold cross-validation, repeated 3 times.
tr_control <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 3
)
# Decision tree evaluated under that scheme, selected on accuracy; rows with
# missing values are dropped and the tree-growing limits go to rpart.
dt_model_cv <- train(
  churn ~ .,
  data = churnTrain,
  method = "rpart",
  metric = "Accuracy",
  control = rpart.control(
    minsplit = 30,
    minbucket = 10,
    maxdepth = 6,
    cp = 0.07
  ),
  trControl = tr_control,
  na.action = na.omit
)
 
# ********************
# Other Types of Evaluation Method to Try
# To try these other methods, replace the tr_control statement with the following:
 
# Bootstrapping

# Resampling scheme: bootstrap with 10 resamples.
tr_control <- trainControl(
  method = "boot",
  number = 10
)
# Same decision-tree specification as above, evaluated with the bootstrap.
dt_model_cv <- train(
  churn ~ .,
  data = churnTrain,
  method = "rpart",
  metric = "Accuracy",
  control = rpart.control(
    minsplit = 30,
    minbucket = 10,
    maxdepth = 6,
    cp = 0.07
  ),
  trControl = tr_control,
  na.action = na.omit
)
 
# Leave One Out

# Resampling scheme: leave-one-out cross-validation.
# NOTE(review): `number` is presumably not used by the LOOCV method - confirm
# against the caret trainControl documentation before relying on it.
tr_control <- trainControl(
  method = "LOOCV",
  number = 10
)
# Same decision-tree specification as above, evaluated with leave-one-out.
dt_model_cv <- train(
  churn ~ .,
  data = churnTrain,
  method = "rpart",
  metric = "Accuracy",
  control = rpart.control(
    minsplit = 30,
    minbucket = 10,
    maxdepth = 6,
    cp = 0.07
  ),
  trControl = tr_control,
  na.action = na.omit
)

# Use Random Forest

# Dataset: churn
# The workspace is deliberately NOT cleared with rm(list = ls()): doing so
# would silently delete the objects of whoever sources this script, and every
# object used below is created by the script itself.
library(C50)
library(modeldata)
# The churn dataset is no longer shipped with the C50 package, so the copy in
# modeldata (mlc_churn) is used and split into train and test parts here.
data(mlc_churn)
# Fixed seed so the random train/test split is reproducible.
set.seed(123)
# The train part of the churn dataset in earlier C50 versions had 3333 rows,
# so the same number of rows is drawn at random from mlc_churn. The row count
# is taken from the data instead of being hard-coded.
train_sample <- sample(nrow(mlc_churn), 3333)
str(train_sample)
##  int [1:3333] 2463 2511 2227 526 4291 2986 1842 1142 3371 3446 ...
# Sampled rows form the training set; all remaining rows form the test set.
churnTrain <- mlc_churn[train_sample, ]
churnTest <- mlc_churn[-train_sample, ]
# Train and test rows stacked back into a single table.
churn <- rbind(churnTrain, churnTest)
str(churnTrain)
## tibble [3,333 × 20] (S3: tbl_df/tbl/data.frame)
##  $ state                        : Factor w/ 51 levels "AK","AL","AR",..: 30 46 20 37 38 16 50 6 36 7 ...
##  $ account_length               : int [1:3333] 92 118 120 176 170 7 52 157 54 60 ...
##  $ area_code                    : Factor w/ 3 levels "area_code_408",..: 2 1 2 1 1 2 3 2 2 2 ...
##  $ international_plan           : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ voice_mail_plan              : Factor w/ 2 levels "no","yes": 1 1 1 1 2 1 2 1 1 2 ...
##  $ number_vmail_messages        : int [1:3333] 0 0 0 0 22 0 21 0 0 20 ...
##  $ total_day_minutes            : num [1:3333] 181.4 189.3 137.3 47.4 173.6 ...
##  $ total_day_calls              : int [1:3333] 98 119 100 125 66 87 119 74 122 113 ...
##  $ total_day_charge             : num [1:3333] 30.84 32.18 23.34 8.06 29.51 ...
##  $ total_eve_minutes            : num [1:3333] 164 234 212 168 142 ...
##  $ total_eve_calls              : int [1:3333] 98 112 129 90 83 83 95 96 87 97 ...
##  $ total_eve_charge             : num [1:3333] 14 19.9 18 14.3 12.1 ...
##  $ total_night_minutes          : num [1:3333] 171 271 153 163 156 ...
##  $ total_night_calls            : int [1:3333] 110 104 92 107 93 77 94 81 72 78 ...
##  $ total_night_charge           : num [1:3333] 7.69 12.19 6.87 7.34 7.04 ...
##  $ total_intl_minutes           : num [1:3333] 10.9 10 10.5 10.5 5.2 11 5.3 7.9 11.1 9.4 ...
##  $ total_intl_calls             : int [1:3333] 4 1 2 8 3 5 3 6 6 4 ...
##  $ total_intl_charge            : num [1:3333] 2.94 2.7 2.84 2.84 1.4 2.97 1.43 2.13 3 2.54 ...
##  $ number_customer_service_calls: int [1:3333] 2 2 1 2 1 3 2 1 0 1 ...
##  $ churn                        : Factor w/ 2 levels "yes","no": 2 2 2 2 2 2 2 2 2 2 ...
# Structure (column types and first values) of the held-out test rows.
str(churnTest)
## tibble [1,667 × 20] (S3: tbl_df/tbl/data.frame)
##  $ state                        : Factor w/ 51 levels "AK","AL","AR",..: 36 2 25 19 27 35 6 4 41 27 ...
##  $ account_length               : int [1:1667] 84 118 147 117 95 161 77 130 111 54 ...
##  $ area_code                    : Factor w/ 3 levels "area_code_408",..: 1 3 2 1 3 2 1 2 2 1 ...
##  $ international_plan           : Factor w/ 2 levels "no","yes": 2 2 2 1 1 1 1 1 1 1 ...
##  $ voice_mail_plan              : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ number_vmail_messages        : int [1:1667] 0 0 0 0 0 0 0 0 0 0 ...
##  $ total_day_minutes            : num [1:1667] 299 223 157 184 157 ...
##  $ total_day_calls              : int [1:1667] 71 98 79 97 88 67 89 112 103 73 ...
##  $ total_day_charge             : num [1:1667] 50.9 38 26.7 31.4 26.6 ...
##  $ total_eve_minutes            : num [1:1667] 61.9 220.6 103.1 351.6 247.6 ...
##  $ total_eve_calls              : int [1:1667] 88 101 94 80 75 97 121 99 102 100 ...
##  $ total_eve_charge             : num [1:1667] 5.26 18.75 8.76 29.89 21.05 ...
##  $ total_night_minutes          : num [1:1667] 197 204 212 216 192 ...
##  $ total_night_calls            : int [1:1667] 89 118 96 90 115 128 64 78 105 68 ...
##  $ total_night_charge           : num [1:1667] 8.86 9.18 9.53 9.71 8.65 7.23 9.43 8.18 8.53 4.59 ...
##  $ total_intl_minutes           : num [1:1667] 6.6 6.3 7.1 8.7 12.3 5.4 5.7 9.5 7.7 14.7 ...
##  $ total_intl_calls             : int [1:1667] 7 6 6 4 5 9 6 19 6 4 ...
##  $ total_intl_charge            : num [1:1667] 1.78 1.7 1.92 2.35 3.32 1.46 1.54 2.57 2.08 3.97 ...
##  $ number_customer_service_calls: int [1:1667] 2 0 0 1 3 4 5 0 2 3 ...
##  $ churn                        : Factor w/ 2 levels "yes","no": 2 2 2 2 2 1 1 2 2 2 ...
# Column names of the training set.
names(churnTrain)
##  [1] "state"                         "account_length"               
##  [3] "area_code"                     "international_plan"           
##  [5] "voice_mail_plan"               "number_vmail_messages"        
##  [7] "total_day_minutes"             "total_day_calls"              
##  [9] "total_day_charge"              "total_eve_minutes"            
## [11] "total_eve_calls"               "total_eve_charge"             
## [13] "total_night_minutes"           "total_night_calls"            
## [15] "total_night_charge"            "total_intl_minutes"           
## [17] "total_intl_calls"              "total_intl_charge"            
## [19] "number_customer_service_calls" "churn"
# Per-column summary statistics of the training set.
summary(churnTrain)
##      state      account_length          area_code    international_plan
##  WV     : 105   Min.   :  1.0   area_code_408: 841   no :3009          
##  VA     :  85   1st Qu.: 73.0   area_code_415:1685   yes: 324          
##  AL     :  82   Median :100.0   area_code_510: 807                     
##  ID     :  82   Mean   :100.8                                          
##  NY     :  82   3rd Qu.:128.0                                          
##  MN     :  80   Max.   :243.0                                          
##  (Other):2817                                                          
##  voice_mail_plan number_vmail_messages total_day_minutes total_day_calls
##  no :2444        Min.   : 0.00         Min.   :  0.0     Min.   :  0.0  
##  yes: 889        1st Qu.: 0.00         1st Qu.:144.0     1st Qu.: 87.0  
##                  Median : 0.00         Median :179.7     Median :100.0  
##                  Mean   : 7.84         Mean   :180.2     Mean   :100.3  
##                  3rd Qu.:17.00         3rd Qu.:215.8     3rd Qu.:114.0  
##                  Max.   :51.00         Max.   :351.5     Max.   :165.0  
##                                                                         
##  total_day_charge total_eve_minutes total_eve_calls total_eve_charge
##  Min.   : 0.00    Min.   : 22.3     Min.   : 37.0   Min.   : 1.90   
##  1st Qu.:24.48    1st Qu.:166.2     1st Qu.: 87.0   1st Qu.:14.13   
##  Median :30.55    Median :201.3     Median :101.0   Median :17.11   
##  Mean   :30.63    Mean   :200.3     Mean   :100.4   Mean   :17.03   
##  3rd Qu.:36.69    3rd Qu.:233.9     3rd Qu.:113.0   3rd Qu.:19.88   
##  Max.   :59.76    Max.   :361.8     Max.   :170.0   Max.   :30.75   
##                                                                     
##  total_night_minutes total_night_calls total_night_charge total_intl_minutes
##  Min.   :  0.0       Min.   :  0.00    Min.   : 0.000     Min.   : 0.00     
##  1st Qu.:166.5       1st Qu.: 86.00    1st Qu.: 7.490     1st Qu.: 8.50     
##  Median :200.6       Median :100.00    Median : 9.030     Median :10.30     
##  Mean   :200.6       Mean   : 99.87    Mean   : 9.027     Mean   :10.23     
##  3rd Qu.:234.9       3rd Qu.:114.00    3rd Qu.:10.570     3rd Qu.:12.00     
##  Max.   :395.0       Max.   :175.00    Max.   :17.770     Max.   :20.00     
##                                                                             
##  total_intl_calls total_intl_charge number_customer_service_calls churn     
##  Min.   : 0.000   Min.   :0.000     Min.   :0.000                 yes: 475  
##  1st Qu.: 3.000   1st Qu.:2.300     1st Qu.:1.000                 no :2858  
##  Median : 4.000   Median :2.780     Median :1.000                           
##  Mean   : 4.402   Mean   :2.764     Mean   :1.559                           
##  3rd Qu.: 6.000   3rd Qu.:3.240     3rd Qu.:2.000                           
##  Max.   :20.000   Max.   :5.400     Max.   :9.000                           
## 
# Model Training and Testing
library(caret)
library(rpart)
# Fit a random forest (caret's "rf" method) predicting churn from all other
# columns, selected on accuracy; ntree = 50 is forwarded to the model fit.
# The dt_* names are kept from the decision-tree section above.
dt_model <- train(
  churn ~ .,
  data = churnTrain,
  method = "rf",
  metric = "Accuracy",
  ntree = 50
)
# Predicted class labels for the held-out test rows.
dt_pred <- predict(dt_model, newdata = churnTest)
# Distribution of the predicted classes.
histogram(dt_pred)

# Check the Prediction Performance
# Confusion counts: rows are the predictions, columns are the true labels.
table(dt_pred, churnTest$churn)
##        
## dt_pred  yes   no
##     yes  178   14
##     no    54 1421
# The same confusion table expressed as proportions of the test set.
# prop.table() keeps the class labels; tabulating the table a second time
# would only count how often each cell count occurs.
prop.table(table(dt_pred, churnTest$churn))
##        
## dt_pred        yes         no
##     yes 0.10677864 0.00839832
##     no  0.03239352 0.85242951
# Full set of caret performance statistics for the predictions.
confusionMatrix(dt_pred, churnTest$churn)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  yes   no
##        yes  178   14
##        no    54 1421
##                                           
##                Accuracy : 0.9592          
##                  95% CI : (0.9486, 0.9682)
##     No Information Rate : 0.8608          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.8165          
##                                           
##  Mcnemar's Test P-Value : 2.251e-06       
##                                           
##             Sensitivity : 0.7672          
##             Specificity : 0.9902          
##          Pos Pred Value : 0.9271          
##          Neg Pred Value : 0.9634          
##              Prevalence : 0.1392          
##          Detection Rate : 0.1068          
##    Detection Prevalence : 0.1152          
##       Balanced Accuracy : 0.8787          
##                                           
##        'Positive' Class : yes             
## 
# Precision, Recall, and F Measure
# The "yes" level of churn is treated as the positive class.
# Plain `<-` is required here: `<--` parses as `<- -` and negates the value.
precision <- posPredValue(dt_pred, churnTest$churn, positive = "yes")
recall <- sensitivity(dt_pred, churnTest$churn, positive = "yes")
# F measure: harmonic mean of precision and recall.
f <- 2 * precision * recall / (precision + recall)
sprintf("Precision is %.2f; recall is  %.2f;  F measure is  %.2f ", precision, recall, f)
## [1] "Precision is 0.93; recall is  0.77;  F measure is  0.84 "