# Dataset: churn
# NOTE: the former `rm(list = ls())` was dropped on purpose. Wiping the
# caller's global environment is a side effect a script should not have;
# restart R if a clean session is wanted.
library(C50)
library(modeldata)
# The churn dataset is no longer shipped with the C50 package, so the copy in
# modeldata (mlc_churn) is used and split into train and test parts here.
data(mlc_churn)
# Fixed seed so the random split below is reproducible.
set.seed(123)
# The train part of the churn dataset in the previous version of C50 had 3333
# samples, which is why 3333 rows are drawn at random from mlc_churn.
# The pool size is read from the data instead of being hard-coded as 5000
# (mlc_churn has 5000 rows here, so the draw itself is unchanged).
train_sample <- sample(nrow(mlc_churn), 3333)
str(train_sample)
## int [1:3333] 2463 2511 2227 526 4291 2986 1842 1142 3371 3446 ...
# Sampled rows form the training split; all remaining rows form the test split.
churnTrain <- mlc_churn[train_sample, ]
churnTest <- mlc_churn[-train_sample, ]
# Train rows followed by test rows, stacked back into one table.
churn <- rbind(churnTrain, churnTest)
str(churnTrain)
## tibble [3,333 × 20] (S3: tbl_df/tbl/data.frame)
## $ state : Factor w/ 51 levels "AK","AL","AR",..: 30 46 20 37 38 16 50 6 36 7 ...
## $ account_length : int [1:3333] 92 118 120 176 170 7 52 157 54 60 ...
## $ area_code : Factor w/ 3 levels "area_code_408",..: 2 1 2 1 1 2 3 2 2 2 ...
## $ international_plan : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ voice_mail_plan : Factor w/ 2 levels "no","yes": 1 1 1 1 2 1 2 1 1 2 ...
## $ number_vmail_messages : int [1:3333] 0 0 0 0 22 0 21 0 0 20 ...
## $ total_day_minutes : num [1:3333] 181.4 189.3 137.3 47.4 173.6 ...
## $ total_day_calls : int [1:3333] 98 119 100 125 66 87 119 74 122 113 ...
## $ total_day_charge : num [1:3333] 30.84 32.18 23.34 8.06 29.51 ...
## $ total_eve_minutes : num [1:3333] 164 234 212 168 142 ...
## $ total_eve_calls : int [1:3333] 98 112 129 90 83 83 95 96 87 97 ...
## $ total_eve_charge : num [1:3333] 14 19.9 18 14.3 12.1 ...
## $ total_night_minutes : num [1:3333] 171 271 153 163 156 ...
## $ total_night_calls : int [1:3333] 110 104 92 107 93 77 94 81 72 78 ...
## $ total_night_charge : num [1:3333] 7.69 12.19 6.87 7.34 7.04 ...
## $ total_intl_minutes : num [1:3333] 10.9 10 10.5 10.5 5.2 11 5.3 7.9 11.1 9.4 ...
## $ total_intl_calls : int [1:3333] 4 1 2 8 3 5 3 6 6 4 ...
## $ total_intl_charge : num [1:3333] 2.94 2.7 2.84 2.84 1.4 2.97 1.43 2.13 3 2.54 ...
## $ number_customer_service_calls: int [1:3333] 2 2 1 2 1 3 2 1 0 1 ...
## $ churn : Factor w/ 2 levels "yes","no": 2 2 2 2 2 2 2 2 2 2 ...
# Compact structure listing of the test split.
str(object = churnTest)
## tibble [1,667 × 20] (S3: tbl_df/tbl/data.frame)
## $ state : Factor w/ 51 levels "AK","AL","AR",..: 36 2 25 19 27 35 6 4 41 27 ...
## $ account_length : int [1:1667] 84 118 147 117 95 161 77 130 111 54 ...
## $ area_code : Factor w/ 3 levels "area_code_408",..: 1 3 2 1 3 2 1 2 2 1 ...
## $ international_plan : Factor w/ 2 levels "no","yes": 2 2 2 1 1 1 1 1 1 1 ...
## $ voice_mail_plan : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ number_vmail_messages : int [1:1667] 0 0 0 0 0 0 0 0 0 0 ...
## $ total_day_minutes : num [1:1667] 299 223 157 184 157 ...
## $ total_day_calls : int [1:1667] 71 98 79 97 88 67 89 112 103 73 ...
## $ total_day_charge : num [1:1667] 50.9 38 26.7 31.4 26.6 ...
## $ total_eve_minutes : num [1:1667] 61.9 220.6 103.1 351.6 247.6 ...
## $ total_eve_calls : int [1:1667] 88 101 94 80 75 97 121 99 102 100 ...
## $ total_eve_charge : num [1:1667] 5.26 18.75 8.76 29.89 21.05 ...
## $ total_night_minutes : num [1:1667] 197 204 212 216 192 ...
## $ total_night_calls : int [1:1667] 89 118 96 90 115 128 64 78 105 68 ...
## $ total_night_charge : num [1:1667] 8.86 9.18 9.53 9.71 8.65 7.23 9.43 8.18 8.53 4.59 ...
## $ total_intl_minutes : num [1:1667] 6.6 6.3 7.1 8.7 12.3 5.4 5.7 9.5 7.7 14.7 ...
## $ total_intl_calls : int [1:1667] 7 6 6 4 5 9 6 19 6 4 ...
## $ total_intl_charge : num [1:1667] 1.78 1.7 1.92 2.35 3.32 1.46 1.54 2.57 2.08 3.97 ...
## $ number_customer_service_calls: int [1:1667] 2 0 0 1 3 4 5 0 2 3 ...
## $ churn : Factor w/ 2 levels "yes","no": 2 2 2 2 2 1 1 2 2 2 ...
# Column names of the training split.
names(x = churnTrain)
## [1] "state" "account_length"
## [3] "area_code" "international_plan"
## [5] "voice_mail_plan" "number_vmail_messages"
## [7] "total_day_minutes" "total_day_calls"
## [9] "total_day_charge" "total_eve_minutes"
## [11] "total_eve_calls" "total_eve_charge"
## [13] "total_night_minutes" "total_night_calls"
## [15] "total_night_charge" "total_intl_minutes"
## [17] "total_intl_calls" "total_intl_charge"
## [19] "number_customer_service_calls" "churn"
# Per-column summary of the training split.
summary(object = churnTrain)
## state account_length area_code international_plan
## WV : 105 Min. : 1.0 area_code_408: 841 no :3009
## VA : 85 1st Qu.: 73.0 area_code_415:1685 yes: 324
## AL : 82 Median :100.0 area_code_510: 807
## ID : 82 Mean :100.8
## NY : 82 3rd Qu.:128.0
## MN : 80 Max. :243.0
## (Other):2817
## voice_mail_plan number_vmail_messages total_day_minutes total_day_calls
## no :2444 Min. : 0.00 Min. : 0.0 Min. : 0.0
## yes: 889 1st Qu.: 0.00 1st Qu.:144.0 1st Qu.: 87.0
## Median : 0.00 Median :179.7 Median :100.0
## Mean : 7.84 Mean :180.2 Mean :100.3
## 3rd Qu.:17.00 3rd Qu.:215.8 3rd Qu.:114.0
## Max. :51.00 Max. :351.5 Max. :165.0
##
## total_day_charge total_eve_minutes total_eve_calls total_eve_charge
## Min. : 0.00 Min. : 22.3 Min. : 37.0 Min. : 1.90
## 1st Qu.:24.48 1st Qu.:166.2 1st Qu.: 87.0 1st Qu.:14.13
## Median :30.55 Median :201.3 Median :101.0 Median :17.11
## Mean :30.63 Mean :200.3 Mean :100.4 Mean :17.03
## 3rd Qu.:36.69 3rd Qu.:233.9 3rd Qu.:113.0 3rd Qu.:19.88
## Max. :59.76 Max. :361.8 Max. :170.0 Max. :30.75
##
## total_night_minutes total_night_calls total_night_charge total_intl_minutes
## Min. : 0.0 Min. : 0.00 Min. : 0.000 Min. : 0.00
## 1st Qu.:166.5 1st Qu.: 86.00 1st Qu.: 7.490 1st Qu.: 8.50
## Median :200.6 Median :100.00 Median : 9.030 Median :10.30
## Mean :200.6 Mean : 99.87 Mean : 9.027 Mean :10.23
## 3rd Qu.:234.9 3rd Qu.:114.00 3rd Qu.:10.570 3rd Qu.:12.00
## Max. :395.0 Max. :175.00 Max. :17.770 Max. :20.00
##
## total_intl_calls total_intl_charge number_customer_service_calls churn
## Min. : 0.000 Min. :0.000 Min. :0.000 yes: 475
## 1st Qu.: 3.000 1st Qu.:2.300 1st Qu.:1.000 no :2858
## Median : 4.000 Median :2.780 Median :1.000
## Mean : 4.402 Mean :2.764 Mean :1.559
## 3rd Qu.: 6.000 3rd Qu.:3.240 3rd Qu.:2.000
## Max. :20.000 Max. :5.400 Max. :9.000
##
# Model Training and Testing
library (caret)
## Loading required package: ggplot2
## Loading required package: lattice
library (rpart)
# Fit caret's "rpart" (decision tree) method to predict `churn` from every
# other column of the training split. No trControl is passed, so caret's
# default resampling settings are used.
dt_model <- train(
  churn ~ .,
  data = churnTrain,
  method = "rpart"
)
# Predicted class label for each row of the test split.
dt_pred <- predict(dt_model, newdata = churnTest)
# Plot the distribution of the predicted classes.
histogram(dt_pred)

# Check the Prediction Performance
# Cross-tabulate predicted labels (rows) against the actual labels (columns).
table(dt_pred, churnTest$churn)
##
## dt_pred yes no
## yes 149 24
## no 83 1411
# Tabulating the confusion table itself counts how often each cell value
# occurs (each of the four counts appears once).
# NOTE(review): prop.table(...) may have been intended here - confirm.
table(table(dt_pred, churnTest$churn))
##
## 24 83 149 1411
## 1 1 1 1
# Confusion matrix plus the derived statistics reported by caret.
confusionMatrix(data = dt_pred, reference = churnTest$churn)
## Confusion Matrix and Statistics
##
## Reference
## Prediction yes no
## yes 149 24
## no 83 1411
##
## Accuracy : 0.9358
## 95% CI : (0.923, 0.9471)
## No Information Rate : 0.8608
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7002
##
## Mcnemar's Test P-Value : 2.058e-08
##
## Sensitivity : 0.64224
## Specificity : 0.98328
## Pos Pred Value : 0.86127
## Neg Pred Value : 0.94444
## Prevalence : 0.13917
## Detection Rate : 0.08938
## Detection Prevalence : 0.10378
## Balanced Accuracy : 0.81276
##
## 'Positive' Class : yes
##
# Precision, Recall, and F Measure
# Metrics for the positive class "yes", computed on the test-split predictions.
# Fix: the original wrote `<--`, which R parses as `<- -` and therefore stored
# NEGATED precision and recall (the F value only looked right because the
# signs cancelled). Plain `<-` is used now.
precision <- posPredValue(dt_pred, churnTest$churn, positive = "yes")
recall <- sensitivity(dt_pred, churnTest$churn, positive = "yes")
# F measure: harmonic mean of precision and recall.
f <- 2 * precision * recall / (precision + recall)
# Fix: the label read "Prediction" although the value printed is precision.
sprintf("Precision is %.2f; recall is %.2f; F measure is %.2f ", precision, recall, f)
## [1] "Precision is 0.86; recall is 0.64; F measure is 0.74 "
# Hold Out Method (1)
# Simple random (non-stratified) draw of 80% of the training rows.
# seq_len() replaces 1:nrow() (which yields c(1, 0) for an empty table),
# FALSE replaces the reassignable variable `F`, and floor() makes explicit the
# truncation that sample() previously applied to the fractional size.
train_index <- sample(
  seq_len(nrow(churnTrain)),
  size = floor(nrow(churnTrain) * 0.8),
  replace = FALSE
)
churnTrain2 <- churnTrain[train_index, ]
# Tabulates the class counts themselves (each count occurs once).
# NOTE(review): prop.table(table(...)) may have been intended - confirm.
table(table(churnTrain2$churn))
##
## 392 2274
## 1 1
# Hold Out Method (2)
# 80% partition built by caret on the outcome column `churn`.
# `list = FALSE` replaces `list = F`: `F` is an ordinary variable that can be
# reassigned, FALSE cannot.
train_index <- createDataPartition(churnTrain$churn, p = 0.8, list = FALSE)
length(train_index)
## [1] 2667
nrow(churnTrain)
## [1] 3333
# Selected rows become the new training part; the rest become the new test part.
churnTrain2 <- churnTrain[train_index, ]
churnTest2 <- churnTrain[-train_index, ]
# Class counts in the partition versus the full training split; the outer
# table() only counts how often each count value occurs.
# NOTE(review): prop.table(table(...)) may have been intended - confirm.
table(table(churnTrain2$churn))
##
## 380 2287
## 1 1
table(table(churnTrain$churn))
##
## 475 2858
## 1 1
##
## 475 2858
## 1 1
# Cross Validation Method
# Resampling scheme: 10-fold cross-validation repeated 3 times.
tr_control <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
# Decision tree selected by Accuracy under that scheme; rows with missing
# values are dropped (na.omit) and the rpart.control() settings are forwarded
# to the fitting routine.
# NOTE(review): caret tunes `cp` itself for method = "rpart", so the cp given
# here is presumably overridden - confirm.
dt_model_cv <- train(
  churn ~ .,
  data = churnTrain,
  method = "rpart",
  metric = "Accuracy",
  trControl = tr_control,
  control = rpart.control(minsplit = 30, minbucket = 10, maxdepth = 6, cp = 0.07),
  na.action = na.omit
)
# ********************
# Other Types of Evaluation Method to Try
# To try these other methods, replace the tr_control statement with the following:
# Bootstrapping
# Resampling scheme: bootstrap with 10 resamples.
tr_control <- trainControl(method = "boot", number = 10)
# Same decision-tree specification as before, refit under the bootstrap
# scheme; this overwrites the previous dt_model_cv.
dt_model_cv <- train(
  churn ~ .,
  data = churnTrain,
  method = "rpart",
  metric = "Accuracy",
  trControl = tr_control,
  control = rpart.control(minsplit = 30, minbucket = 10, maxdepth = 6, cp = 0.07),
  na.action = na.omit
)
# Leave One Out
# Resampling scheme: leave-one-out cross-validation.
# The former `number = 10` was removed: a fold/iteration count has no meaning
# for leave-one-out (one resample per training row), so it only misled readers.
# NOTE(review): with one fit per training row this is presumably slow on the
# full training split - confirm before running.
tr_control <- trainControl(method = "LOOCV")
# Same decision-tree specification, refit under leave-one-out; this overwrites
# the previous dt_model_cv.
dt_model_cv <- train(
  churn ~ .,
  data = churnTrain,
  method = "rpart",
  metric = "Accuracy",
  trControl = tr_control,
  control = rpart.control(minsplit = 30, minbucket = 10, maxdepth = 6, cp = 0.07),
  na.action = na.omit
)
# Use Random Forest
# Dataset: churn
# NOTE: the former `rm(list = ls())` was dropped on purpose. Clearing the
# global environment is a side effect a script should not have; every object
# this section needs is (re)assigned below anyway.
library(C50)
library(modeldata)
# The churn dataset is no longer shipped with the C50 package, so the copy in
# modeldata (mlc_churn) is used and split into train and test parts here.
data(mlc_churn)
# Same seed as the decision-tree section, so the same rows are drawn.
set.seed(123)
# The train part of the churn dataset in the previous version of C50 had 3333
# samples, which is why 3333 rows are drawn at random from mlc_churn.
# The pool size is read from the data instead of being hard-coded as 5000
# (mlc_churn has 5000 rows here, so the draw itself is unchanged).
train_sample <- sample(nrow(mlc_churn), 3333)
str(train_sample)
## int [1:3333] 2463 2511 2227 526 4291 2986 1842 1142 3371 3446 ...
# Sampled rows form the training split; all remaining rows form the test split.
churnTrain <- mlc_churn[train_sample, ]
churnTest <- mlc_churn[-train_sample, ]
# Train rows followed by test rows, stacked back into one table.
churn <- rbind(churnTrain, churnTest)
str(churnTrain)
## tibble [3,333 × 20] (S3: tbl_df/tbl/data.frame)
## $ state : Factor w/ 51 levels "AK","AL","AR",..: 30 46 20 37 38 16 50 6 36 7 ...
## $ account_length : int [1:3333] 92 118 120 176 170 7 52 157 54 60 ...
## $ area_code : Factor w/ 3 levels "area_code_408",..: 2 1 2 1 1 2 3 2 2 2 ...
## $ international_plan : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ voice_mail_plan : Factor w/ 2 levels "no","yes": 1 1 1 1 2 1 2 1 1 2 ...
## $ number_vmail_messages : int [1:3333] 0 0 0 0 22 0 21 0 0 20 ...
## $ total_day_minutes : num [1:3333] 181.4 189.3 137.3 47.4 173.6 ...
## $ total_day_calls : int [1:3333] 98 119 100 125 66 87 119 74 122 113 ...
## $ total_day_charge : num [1:3333] 30.84 32.18 23.34 8.06 29.51 ...
## $ total_eve_minutes : num [1:3333] 164 234 212 168 142 ...
## $ total_eve_calls : int [1:3333] 98 112 129 90 83 83 95 96 87 97 ...
## $ total_eve_charge : num [1:3333] 14 19.9 18 14.3 12.1 ...
## $ total_night_minutes : num [1:3333] 171 271 153 163 156 ...
## $ total_night_calls : int [1:3333] 110 104 92 107 93 77 94 81 72 78 ...
## $ total_night_charge : num [1:3333] 7.69 12.19 6.87 7.34 7.04 ...
## $ total_intl_minutes : num [1:3333] 10.9 10 10.5 10.5 5.2 11 5.3 7.9 11.1 9.4 ...
## $ total_intl_calls : int [1:3333] 4 1 2 8 3 5 3 6 6 4 ...
## $ total_intl_charge : num [1:3333] 2.94 2.7 2.84 2.84 1.4 2.97 1.43 2.13 3 2.54 ...
## $ number_customer_service_calls: int [1:3333] 2 2 1 2 1 3 2 1 0 1 ...
## $ churn : Factor w/ 2 levels "yes","no": 2 2 2 2 2 2 2 2 2 2 ...
# Compact structure listing of the test split.
str(object = churnTest)
## tibble [1,667 × 20] (S3: tbl_df/tbl/data.frame)
## $ state : Factor w/ 51 levels "AK","AL","AR",..: 36 2 25 19 27 35 6 4 41 27 ...
## $ account_length : int [1:1667] 84 118 147 117 95 161 77 130 111 54 ...
## $ area_code : Factor w/ 3 levels "area_code_408",..: 1 3 2 1 3 2 1 2 2 1 ...
## $ international_plan : Factor w/ 2 levels "no","yes": 2 2 2 1 1 1 1 1 1 1 ...
## $ voice_mail_plan : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ number_vmail_messages : int [1:1667] 0 0 0 0 0 0 0 0 0 0 ...
## $ total_day_minutes : num [1:1667] 299 223 157 184 157 ...
## $ total_day_calls : int [1:1667] 71 98 79 97 88 67 89 112 103 73 ...
## $ total_day_charge : num [1:1667] 50.9 38 26.7 31.4 26.6 ...
## $ total_eve_minutes : num [1:1667] 61.9 220.6 103.1 351.6 247.6 ...
## $ total_eve_calls : int [1:1667] 88 101 94 80 75 97 121 99 102 100 ...
## $ total_eve_charge : num [1:1667] 5.26 18.75 8.76 29.89 21.05 ...
## $ total_night_minutes : num [1:1667] 197 204 212 216 192 ...
## $ total_night_calls : int [1:1667] 89 118 96 90 115 128 64 78 105 68 ...
## $ total_night_charge : num [1:1667] 8.86 9.18 9.53 9.71 8.65 7.23 9.43 8.18 8.53 4.59 ...
## $ total_intl_minutes : num [1:1667] 6.6 6.3 7.1 8.7 12.3 5.4 5.7 9.5 7.7 14.7 ...
## $ total_intl_calls : int [1:1667] 7 6 6 4 5 9 6 19 6 4 ...
## $ total_intl_charge : num [1:1667] 1.78 1.7 1.92 2.35 3.32 1.46 1.54 2.57 2.08 3.97 ...
## $ number_customer_service_calls: int [1:1667] 2 0 0 1 3 4 5 0 2 3 ...
## $ churn : Factor w/ 2 levels "yes","no": 2 2 2 2 2 1 1 2 2 2 ...
# Column names of the training split.
names(x = churnTrain)
## [1] "state" "account_length"
## [3] "area_code" "international_plan"
## [5] "voice_mail_plan" "number_vmail_messages"
## [7] "total_day_minutes" "total_day_calls"
## [9] "total_day_charge" "total_eve_minutes"
## [11] "total_eve_calls" "total_eve_charge"
## [13] "total_night_minutes" "total_night_calls"
## [15] "total_night_charge" "total_intl_minutes"
## [17] "total_intl_calls" "total_intl_charge"
## [19] "number_customer_service_calls" "churn"
# Per-column summary of the training split.
summary(object = churnTrain)
## state account_length area_code international_plan
## WV : 105 Min. : 1.0 area_code_408: 841 no :3009
## VA : 85 1st Qu.: 73.0 area_code_415:1685 yes: 324
## AL : 82 Median :100.0 area_code_510: 807
## ID : 82 Mean :100.8
## NY : 82 3rd Qu.:128.0
## MN : 80 Max. :243.0
## (Other):2817
## voice_mail_plan number_vmail_messages total_day_minutes total_day_calls
## no :2444 Min. : 0.00 Min. : 0.0 Min. : 0.0
## yes: 889 1st Qu.: 0.00 1st Qu.:144.0 1st Qu.: 87.0
## Median : 0.00 Median :179.7 Median :100.0
## Mean : 7.84 Mean :180.2 Mean :100.3
## 3rd Qu.:17.00 3rd Qu.:215.8 3rd Qu.:114.0
## Max. :51.00 Max. :351.5 Max. :165.0
##
## total_day_charge total_eve_minutes total_eve_calls total_eve_charge
## Min. : 0.00 Min. : 22.3 Min. : 37.0 Min. : 1.90
## 1st Qu.:24.48 1st Qu.:166.2 1st Qu.: 87.0 1st Qu.:14.13
## Median :30.55 Median :201.3 Median :101.0 Median :17.11
## Mean :30.63 Mean :200.3 Mean :100.4 Mean :17.03
## 3rd Qu.:36.69 3rd Qu.:233.9 3rd Qu.:113.0 3rd Qu.:19.88
## Max. :59.76 Max. :361.8 Max. :170.0 Max. :30.75
##
## total_night_minutes total_night_calls total_night_charge total_intl_minutes
## Min. : 0.0 Min. : 0.00 Min. : 0.000 Min. : 0.00
## 1st Qu.:166.5 1st Qu.: 86.00 1st Qu.: 7.490 1st Qu.: 8.50
## Median :200.6 Median :100.00 Median : 9.030 Median :10.30
## Mean :200.6 Mean : 99.87 Mean : 9.027 Mean :10.23
## 3rd Qu.:234.9 3rd Qu.:114.00 3rd Qu.:10.570 3rd Qu.:12.00
## Max. :395.0 Max. :175.00 Max. :17.770 Max. :20.00
##
## total_intl_calls total_intl_charge number_customer_service_calls churn
## Min. : 0.000 Min. :0.000 Min. :0.000 yes: 475
## 1st Qu.: 3.000 1st Qu.:2.300 1st Qu.:1.000 no :2858
## Median : 4.000 Median :2.780 Median :1.000
## Mean : 4.402 Mean :2.764 Mean :1.559
## 3rd Qu.: 6.000 3rd Qu.:3.240 3rd Qu.:2.000
## Max. :20.000 Max. :5.400 Max. :9.000
##
# Model Training and Testing
library (caret)
library (rpart)
# Fit caret's "rf" (random forest) method to predict `churn` from every other
# column of the training split, selecting by Accuracy; ntree = 50 is passed on
# to the underlying fitting routine. The object keeps the dt_ prefix that the
# rest of this section refers to.
dt_model <- train(
  churn ~ .,
  data = churnTrain,
  method = "rf",
  metric = "Accuracy",
  ntree = 50
)
# Predicted class label for each row of the test split.
dt_pred <- predict(dt_model, newdata = churnTest)
# Plot the distribution of the predicted classes.
histogram(dt_pred)

# Check the Prediction Performance
# Cross-tabulate predicted labels (rows) against the actual labels (columns).
table(dt_pred, churnTest$churn)
##
## dt_pred yes no
## yes 178 14
## no 54 1421
# Tabulating the confusion table itself counts how often each cell value
# occurs (each of the four counts appears once).
# NOTE(review): prop.table(...) may have been intended here - confirm.
table(table(dt_pred, churnTest$churn))
##
## 14 54 178 1421
## 1 1 1 1
# Confusion matrix plus the derived statistics reported by caret.
confusionMatrix(data = dt_pred, reference = churnTest$churn)
## Confusion Matrix and Statistics
##
## Reference
## Prediction yes no
## yes 178 14
## no 54 1421
##
## Accuracy : 0.9592
## 95% CI : (0.9486, 0.9682)
## No Information Rate : 0.8608
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8165
##
## Mcnemar's Test P-Value : 2.251e-06
##
## Sensitivity : 0.7672
## Specificity : 0.9902
## Pos Pred Value : 0.9271
## Neg Pred Value : 0.9634
## Prevalence : 0.1392
## Detection Rate : 0.1068
## Detection Prevalence : 0.1152
## Balanced Accuracy : 0.8787
##
## 'Positive' Class : yes
##
# Precision, Recall, and F Measure
# Metrics for the positive class "yes", computed on the test-split predictions.
# Fix: the original wrote `<--`, which R parses as `<- -` and therefore stored
# NEGATED precision and recall (the F value only looked right because the
# signs cancelled). Plain `<-` is used now.
precision <- posPredValue(dt_pred, churnTest$churn, positive = "yes")
recall <- sensitivity(dt_pred, churnTest$churn, positive = "yes")
# F measure: harmonic mean of precision and recall.
f <- 2 * precision * recall / (precision + recall)
# Fix: the label read "Prediction" although the value printed is precision.
sprintf("Precision is %.2f; recall is %.2f; F measure is %.2f ", precision, recall, f)
## [1] "Precision is 0.93; recall is 0.77; F measure is 0.84 "