setwd("~/MVC/AI4OPT/Data Mining II/Assignments")
library(readr)
library(dplyr)
library(stats)
library(tidyverse)
library(modeldata)
library(caret)
library(rpart)
library(C50)
library(rattle)
library(rpart.plot)
data(mlc_churn)
churn = mlc_churn
remove(mlc_churn)
churn
## # A tibble: 5,000 × 20
## state accou…¹ area_…² inter…³ voice…⁴ numbe…⁵ total…⁶ total…⁷ total…⁸ total…⁹
## <fct> <int> <fct> <fct> <fct> <int> <dbl> <int> <dbl> <dbl>
## 1 KS 128 area_c… no yes 25 265. 110 45.1 197.
## 2 OH 107 area_c… no yes 26 162. 123 27.5 196.
## 3 NJ 137 area_c… no no 0 243. 114 41.4 121.
## 4 OH 84 area_c… yes no 0 299. 71 50.9 61.9
## 5 OK 75 area_c… yes no 0 167. 113 28.3 148.
## 6 AL 118 area_c… yes no 0 223. 98 38.0 221.
## 7 MA 121 area_c… no yes 24 218. 88 37.1 348.
## 8 MO 147 area_c… yes no 0 157 79 26.7 103.
## 9 LA 117 area_c… no no 0 184. 97 31.4 352.
## 10 WV 141 area_c… yes yes 37 259. 84 44.0 222
## # … with 4,990 more rows, 10 more variables: total_eve_calls <int>,
## # total_eve_charge <dbl>, total_night_minutes <dbl>, total_night_calls <int>,
## # total_night_charge <dbl>, total_intl_minutes <dbl>, total_intl_calls <int>,
## # total_intl_charge <dbl>, number_customer_service_calls <int>, churn <fct>,
## # and abbreviated variable names ¹account_length, ²area_code,
## # ³international_plan, ⁴voice_mail_plan, ⁵number_vmail_messages,
## # ⁶total_day_minutes, ⁷total_day_calls, ⁸total_day_charge, …
## # ℹ Use `print(n = ...)` to see more rows, and `colnames()` to see all variable names
80% of the data will be training data and 20% of the data will be testing data. (For a reproducible split, we would call set.seed() before sampling.)
split1 = sample(c(rep(0, 0.8*nrow(churn)), rep(1, 0.2*nrow(churn)))) # shuffled vector of 4,000 zeros and 1,000 ones
churnTrain = churn[split1 == 0,] # rows labeled 0 become the training set
churnTest = churn[split1 == 1,]  # rows labeled 1 become the testing set
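As an aside, caret's createDataPartition() performs a stratified split that preserves the yes/no churn ratio in both sets; a minimal sketch (churnTrain2 and churnTest2 are hypothetical names, not used below):
set.seed(42) # hypothetical seed, for a reproducible split
idx <- createDataPartition(churn$churn, p = 0.8, list = FALSE) # stratified 80% index
churnTrain2 <- churn[idx, ]
churnTest2 <- churn[-idx, ]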
str(churnTrain)
## tibble [4,000 × 20] (S3: tbl_df/tbl/data.frame)
## $ state : Factor w/ 51 levels "AK","AL","AR",..: 17 32 36 37 2 20 25 19 50 13 ...
## $ account_length : int [1:4000] 128 137 84 75 118 121 147 117 141 168 ...
## $ area_code : Factor w/ 3 levels "area_code_408",..: 2 2 1 2 3 3 2 1 2 1 ...
## $ international_plan : Factor w/ 2 levels "no","yes": 1 1 2 2 2 1 2 1 2 1 ...
## $ voice_mail_plan : Factor w/ 2 levels "no","yes": 2 1 1 1 1 2 1 1 2 1 ...
## $ number_vmail_messages : int [1:4000] 25 0 0 0 0 24 0 0 37 0 ...
## $ total_day_minutes : num [1:4000] 265 243 299 167 223 ...
## $ total_day_calls : int [1:4000] 110 114 71 113 98 88 79 97 84 96 ...
## $ total_day_charge : num [1:4000] 45.1 41.4 50.9 28.3 38 ...
## $ total_eve_minutes : num [1:4000] 197.4 121.2 61.9 148.3 220.6 ...
## $ total_eve_calls : int [1:4000] 99 110 88 122 101 108 94 80 111 71 ...
## $ total_eve_charge : num [1:4000] 16.78 10.3 5.26 12.61 18.75 ...
## $ total_night_minutes : num [1:4000] 245 163 197 187 204 ...
## $ total_night_calls : int [1:4000] 91 104 89 121 118 118 96 90 97 128 ...
## $ total_night_charge : num [1:4000] 11.01 7.32 8.86 8.41 9.18 ...
## $ total_intl_minutes : num [1:4000] 10 12.2 6.6 10.1 6.3 7.5 7.1 8.7 11.2 11.2 ...
## $ total_intl_calls : int [1:4000] 3 5 7 3 6 7 6 4 5 2 ...
## $ total_intl_charge : num [1:4000] 2.7 3.29 1.78 2.73 1.7 2.03 1.92 2.35 3.02 3.02 ...
## $ number_customer_service_calls: int [1:4000] 1 0 2 3 0 3 0 1 0 1 ...
## $ churn : Factor w/ 2 levels "yes","no": 2 2 2 2 2 2 2 2 2 2 ...
str(churnTest)
## tibble [1,000 × 20] (S3: tbl_df/tbl/data.frame)
## $ state : Factor w/ 51 levels "AK","AL","AR",..: 36 16 40 27 46 46 31 4 11 21 ...
## $ account_length : int [1:1000] 107 65 74 95 76 132 75 12 98 135 ...
## $ area_code : Factor w/ 3 levels "area_code_408",..: 2 2 2 3 3 3 3 1 1 1 ...
## $ international_plan : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 2 ...
## $ voice_mail_plan : Factor w/ 2 levels "no","yes": 2 1 1 1 2 1 1 1 1 2 ...
## $ number_vmail_messages : int [1:1000] 26 0 0 0 33 0 0 0 0 41 ...
## $ total_day_minutes : num [1:1000] 162 129 188 157 190 ...
## $ total_day_calls : int [1:1000] 123 137 127 88 66 86 105 118 102 85 ...
## $ total_day_charge : num [1:1000] 27.5 21.9 31.9 26.6 32.2 ...
## $ total_eve_minutes : num [1:1000] 196 228 163 248 213 ...
## $ total_eve_calls : int [1:1000] 103 83 148 75 65 72 107 119 85 107 ...
## $ total_eve_charge : num [1:1000] 16.6 19.4 13.9 21.1 18.1 ...
## $ total_night_minutes : num [1:1000] 254 209 196 192 166 ...
## $ total_night_calls : int [1:1000] 103 111 94 115 108 115 98 90 135 78 ...
## $ total_night_charge : num [1:1000] 11.45 9.4 8.82 8.65 7.46 ...
## $ total_intl_minutes : num [1:1000] 13.7 12.7 9.1 12.3 10 10.3 10.3 11.8 9.4 14.6 ...
## $ total_intl_calls : int [1:1000] 3 6 5 5 5 2 5 3 2 15 ...
## $ total_intl_charge : num [1:1000] 3.7 3.43 2.46 3.32 2.7 2.78 2.78 3.19 2.54 3.94 ...
## $ number_customer_service_calls: int [1:1000] 1 4 0 3 1 0 1 1 3 0 ...
## $ churn : Factor w/ 2 levels "yes","no": 2 1 2 2 2 2 2 1 2 1 ...
We are telling the program to find out how the churn column is affected by all the other columns. The formula churn ~ . tells the program to use every other column as a predictor. We will use rpart as the training method.
dt_model = train(churn ~., data = churnTrain, metric = "Accuracy", method = "rpart")
Below are the descriptive features of the model. The function typeof() shows the storage type of the model object (a list), and names() lists its components.
typeof(dt_model)
## [1] "list"
names(dt_model)
## [1] "method" "modelInfo" "modelType" "results" "pred"
## [6] "bestTune" "call" "dots" "metric" "control"
## [11] "finalModel" "preProcess" "trainingData" "ptype" "resample"
## [16] "resampledCM" "perfNames" "maximize" "yLimits" "times"
## [21] "levels" "terms" "coefnames" "contrasts" "xlevels"
Our model was trained on 4,000 samples, which is 80% of the original data.
print(dt_model)
## CART
##
## 4000 samples
## 19 predictor
## 2 classes: 'yes', 'no'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 4000, 4000, 4000, 4000, 4000, 4000, ...
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.05184534 0.9183780 0.6121341
## 0.07908612 0.8842500 0.3811028
## 0.10896309 0.8610094 0.1338099
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.05184534.
Based on the results from our model: method = "rpart" fits CART (Classification and Regression Trees), a decision tree algorithm well suited to a nominal target like churn. We have 19 predictors, and the selected cp (complexity parameter) is 0.0518 with an accuracy of 0.9184. The cp acts as a split threshold: if a split does not improve the model's relative fit by at least roughly 5.2%, the node is not split.
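For intuition, fitting rpart directly with the cp that caret selected would look roughly like this (a sketch; caret's internal call may differ in its details):
# Direct rpart fit using the selected cp; splits that do not improve the
# relative error by at least cp (~5.2%) are not attempted.
fit_cart <- rpart(churn ~ ., data = churnTrain, method = "class",
                  control = rpart.control(cp = 0.05184534))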
Kappa is a classification accuracy measure that corrects for chance agreement between the predicted and actual classes, in our case yes or no. Our best kappa is 0.61, which falls at the bottom of the "substantial agreement" band (0.61-0.80) on the common Landis-Koch scale. Since there is still room for stronger agreement, we will tune the model to improve our predictions.
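For reference, kappa can be computed from any confusion table as (observed accuracy - chance accuracy) / (1 - chance accuracy); kappa_from_table below is a hypothetical helper, not part of caret:
# Cohen's kappa: p_o is observed accuracy, p_e is the agreement expected by
# chance from the row/column totals (assumes rows and columns share class order).
kappa_from_table <- function(tab) {
  p_o <- sum(diag(tab)) / sum(tab)
  p_e <- sum(rowSums(tab) * colSums(tab)) / sum(tab)^2
  (p_o - p_e) / (1 - p_e)
}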
print(dt_model$finalModel)
## n= 4000
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 4000 569 no (0.14225000 0.85775000)
## 2) total_day_minutes>=265.75 242 90 yes (0.62809917 0.37190083)
## 4) voice_mail_planyes< 0.5 191 42 yes (0.78010471 0.21989529) *
## 5) voice_mail_planyes>=0.5 51 3 no (0.05882353 0.94117647) *
## 3) total_day_minutes< 265.75 3758 417 no (0.11096328 0.88903672)
## 6) number_customer_service_calls>=3.5 295 144 no (0.48813559 0.51186441)
## 12) total_day_minutes< 162.95 120 15 yes (0.87500000 0.12500000) *
## 13) total_day_minutes>=162.95 175 39 no (0.22285714 0.77714286) *
## 7) number_customer_service_calls< 3.5 3463 273 no (0.07883338 0.92116662) *
dt_predict <- predict(dt_model, newdata = churnTest, na.action = na.omit, type = "prob")
head(dt_predict, 5)
## yes no
## 1 0.07883338 0.9211666
## 2 0.87500000 0.1250000
## 3 0.07883338 0.9211666
## 4 0.07883338 0.9211666
## 5 0.07883338 0.9211666
The prediction returns a probability for each observation: the probability that churn will be yes or no according to our decision tree. Let's convert these probabilities into class labels so we can evaluate them with a confusion matrix.
for (rw in 1:nrow(dt_predict)) { # for each row in dt_predict
  # the prediction is the column name where the max probability is found
  dt_predict$pred_churn[rw] =
    colnames(dt_predict[rw, ][which.max(dt_predict[rw, ])])
}
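The loop works, but the same conversion can be done without looping; a vectorized sketch using base R's max.col():
# max.col() returns the index (1 or 2) of the larger probability in each row;
# use it to look up the corresponding class name.
probs <- as.matrix(dt_predict[, c("yes", "no")])
dt_predict$pred_churn <- colnames(probs)[max.col(probs)]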
# Time for confusion matrix!
confusion_mat = table(churnTest$churn, dt_predict$pred_churn)
confusion_mat
##
## no yes
## yes 85 53
## no 842 20
Based on this confusion matrix, our model accuracy is (53 + 842)/1000 = 89.5%, i.e. a 10.5% misclassification rate. Note that the rows are the true labels and the columns the predictions, and their orders differ here, so the correct counts sit off the diagonal. There is still room for improvement, so we will continue working with this data and do some tuning.
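As a quick sanity check, we can compute accuracy directly by comparing predictions with the true labels (safer than sum(diag(...)) here, since the table's row and column orders differ):
# Proportion of predicted labels that match the actual labels.
mean(dt_predict$pred_churn == as.character(churnTest$churn))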
dt_predict2 <- predict(dt_model, newdata = churnTest, type = "raw")
head(dt_predict2)
## [1] no yes no no no no
## Levels: yes no
The second prediction gives the same results as the previous one, but instead of probabilities it outputs the classification of yes or no directly. This lets us verify that we reach the same conclusion without the for loop.
conf_mat2 = table(churnTest$churn, dt_predict2)
conf_mat2
## dt_predict2
## yes no
## yes 53 85
## no 20 842
From the result we can see that the confusion matrix for the second prediction gives the same 89.5% accuracy; the rows and columns now line up, so the correct counts sit on the diagonal (53 + 842 = 895 out of 1,000).
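caret's confusionMatrix() computes accuracy, kappa, sensitivity, and specificity in one call, which saves building the table by hand:
# Full performance summary: accuracy, kappa, sensitivity, specificity, etc.
confusionMatrix(dt_predict2, churnTest$churn)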
dt_model_tune <- train(churn ~., data = churnTrain, method="rpart", metric ="Accuracy", tuneLength =8)
print(dt_model_tune$finalModel)
## n= 4000
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 4000 569 no (0.14225000 0.85775000)
## 2) total_day_minutes>=265.75 242 90 yes (0.62809917 0.37190083)
## 4) voice_mail_planyes< 0.5 191 42 yes (0.78010471 0.21989529)
## 8) total_eve_minutes>=138.55 167 21 yes (0.87425150 0.12574850)
## 16) total_night_minutes>=130.15 156 13 yes (0.91666667 0.08333333) *
## 17) total_night_minutes< 130.15 11 3 no (0.27272727 0.72727273) *
## 9) total_eve_minutes< 138.55 24 3 no (0.12500000 0.87500000) *
## 5) voice_mail_planyes>=0.5 51 3 no (0.05882353 0.94117647) *
## 3) total_day_minutes< 265.75 3758 417 no (0.11096328 0.88903672)
## 6) number_customer_service_calls>=3.5 295 144 no (0.48813559 0.51186441)
## 12) total_day_minutes< 162.95 120 15 yes (0.87500000 0.12500000) *
## 13) total_day_minutes>=162.95 175 39 no (0.22285714 0.77714286)
## 26) total_eve_minutes< 141.45 19 7 yes (0.63157895 0.36842105) *
## 27) total_eve_minutes>=141.45 156 27 no (0.17307692 0.82692308) *
## 7) number_customer_service_calls< 3.5 3463 273 no (0.07883338 0.92116662)
## 14) international_planyes>=0.5 318 117 no (0.36792453 0.63207547)
## 28) total_intl_minutes>=13.05 59 0 yes (1.00000000 0.00000000) *
## 29) total_intl_minutes< 13.05 259 58 no (0.22393822 0.77606178)
## 58) total_intl_calls< 2.5 51 0 yes (1.00000000 0.00000000) *
## 59) total_intl_calls>=2.5 208 7 no (0.03365385 0.96634615) *
## 15) international_planyes< 0.5 3145 156 no (0.04960254 0.95039746)
## 30) total_day_minutes>=221.85 529 88 no (0.16635161 0.83364839)
## 60) total_eve_minutes>=242.35 104 46 yes (0.55769231 0.44230769)
## 120) voice_mail_planyes< 0.5 81 23 yes (0.71604938 0.28395062)
## 240) total_night_minutes>=172.5 58 8 yes (0.86206897 0.13793103) *
## 241) total_night_minutes< 172.5 23 8 no (0.34782609 0.65217391) *
## 121) voice_mail_planyes>=0.5 23 0 no (0.00000000 1.00000000) *
## 61) total_eve_minutes< 242.35 425 30 no (0.07058824 0.92941176) *
## 31) total_day_minutes< 221.85 2616 68 no (0.02599388 0.97400612) *
The argument method = "rpart" tells the model to use the training method rpart, which stands for recursive partitioning. The argument metric = "Accuracy" uses accuracy to select the optimal model, and tuneLength = 8 makes caret evaluate eight candidate cp values instead of its default of three.
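To see all eight cp values caret tried, along with their resampled accuracy and kappa, we can inspect the results table or plot the tuning profile:
# Resampled performance for each candidate cp, plus the tuning profile plot.
dt_model_tune$results
plot(dt_model_tune)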
dt_model_tune_2 <- train(churn ~., data = churnTrain, method="rpart", tuneGrid = expand.grid(cp=seq(0,0.1,0.01)))
print(dt_model_tune_2$finalModel)
## n= 4000
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 4000 569 no (0.14225000 0.85775000)
## 2) total_day_minutes>=265.75 242 90 yes (0.62809917 0.37190083)
## 4) voice_mail_planyes< 0.5 191 42 yes (0.78010471 0.21989529)
## 8) total_eve_minutes>=138.55 167 21 yes (0.87425150 0.12574850)
## 16) total_night_minutes>=130.15 156 13 yes (0.91666667 0.08333333) *
## 17) total_night_minutes< 130.15 11 3 no (0.27272727 0.72727273) *
## 9) total_eve_minutes< 138.55 24 3 no (0.12500000 0.87500000) *
## 5) voice_mail_planyes>=0.5 51 3 no (0.05882353 0.94117647) *
## 3) total_day_minutes< 265.75 3758 417 no (0.11096328 0.88903672)
## 6) number_customer_service_calls>=3.5 295 144 no (0.48813559 0.51186441)
## 12) total_day_minutes< 162.95 120 15 yes (0.87500000 0.12500000) *
## 13) total_day_minutes>=162.95 175 39 no (0.22285714 0.77714286)
## 26) total_eve_minutes< 141.45 19 7 yes (0.63157895 0.36842105) *
## 27) total_eve_minutes>=141.45 156 27 no (0.17307692 0.82692308) *
## 7) number_customer_service_calls< 3.5 3463 273 no (0.07883338 0.92116662)
## 14) international_planyes>=0.5 318 117 no (0.36792453 0.63207547)
## 28) total_intl_minutes>=13.05 59 0 yes (1.00000000 0.00000000) *
## 29) total_intl_minutes< 13.05 259 58 no (0.22393822 0.77606178)
## 58) total_intl_calls< 2.5 51 0 yes (1.00000000 0.00000000) *
## 59) total_intl_calls>=2.5 208 7 no (0.03365385 0.96634615) *
## 15) international_planyes< 0.5 3145 156 no (0.04960254 0.95039746)
## 30) total_day_minutes>=221.85 529 88 no (0.16635161 0.83364839)
## 60) total_eve_minutes>=242.35 104 46 yes (0.55769231 0.44230769)
## 120) voice_mail_planyes< 0.5 81 23 yes (0.71604938 0.28395062)
## 240) total_night_minutes>=172.5 58 8 yes (0.86206897 0.13793103) *
## 241) total_night_minutes< 172.5 23 8 no (0.34782609 0.65217391) *
## 121) voice_mail_planyes>=0.5 23 0 no (0.00000000 1.00000000) *
## 61) total_eve_minutes< 242.35 425 30 no (0.07058824 0.92941176) *
## 31) total_day_minutes< 221.85 2616 68 no (0.02599388 0.97400612) *
rpart's default complexity parameter is cp = 0.01; here tuneGrid searches cp values from 0 to 0.1 in steps of 0.01. Having a cp threshold saves computing time by eliminating splits that are insignificant to the model. The results show a deeper decision tree, with terminal nodes indicated by the character *. I hope this means our accuracy will increase.
We can use the rpart control parameters minsplit, minbucket, and maxdepth to pre-prune the tree and prevent overfitting the training data.
dt_model_preprune <- train(churn ~ ., data = churnTrain, method = "rpart",
                           metric = "Accuracy", tuneLength = 8,
                           control = rpart.control(minsplit = 50, minbucket = 20, maxdepth = 5))
print(dt_model_preprune)
## CART
##
## 4000 samples
## 19 predictor
## 2 classes: 'yes', 'no'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 4000, 4000, 4000, 4000, 4000, 4000, ...
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.005858231 0.9347213 0.7131986
## 0.008787346 0.9342325 0.7056393
## 0.012302285 0.9344851 0.7042524
## 0.020503808 0.9342836 0.6977766
## 0.031634446 0.9324698 0.6879739
## 0.051845343 0.9192851 0.6159049
## 0.079086116 0.8823769 0.3694415
## 0.108963093 0.8627178 0.1454675
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.005858231.
print(dt_model_preprune$finalModel)
## n= 4000
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 4000 569 no (0.14225000 0.85775000)
## 2) total_day_minutes>=265.75 242 90 yes (0.62809917 0.37190083)
## 4) voice_mail_planyes< 0.5 191 42 yes (0.78010471 0.21989529)
## 8) total_eve_minutes>=138.55 167 21 yes (0.87425150 0.12574850) *
## 9) total_eve_minutes< 138.55 24 3 no (0.12500000 0.87500000) *
## 5) voice_mail_planyes>=0.5 51 3 no (0.05882353 0.94117647) *
## 3) total_day_minutes< 265.75 3758 417 no (0.11096328 0.88903672)
## 6) number_customer_service_calls>=3.5 295 144 no (0.48813559 0.51186441)
## 12) total_day_minutes< 162.95 120 15 yes (0.87500000 0.12500000) *
## 13) total_day_minutes>=162.95 175 39 no (0.22285714 0.77714286) *
## 7) number_customer_service_calls< 3.5 3463 273 no (0.07883338 0.92116662)
## 14) international_planyes>=0.5 318 117 no (0.36792453 0.63207547)
## 28) total_intl_minutes>=13.05 59 0 yes (1.00000000 0.00000000) *
## 29) total_intl_minutes< 13.05 259 58 no (0.22393822 0.77606178)
## 58) total_intl_calls< 2.5 51 0 yes (1.00000000 0.00000000) *
## 59) total_intl_calls>=2.5 208 7 no (0.03365385 0.96634615) *
## 15) international_planyes< 0.5 3145 156 no (0.04960254 0.95039746)
## 30) total_day_minutes>=221.85 529 88 no (0.16635161 0.83364839)
## 60) total_eve_minutes>=242.35 104 46 yes (0.55769231 0.44230769) *
## 61) total_eve_minutes< 242.35 425 30 no (0.07058824 0.92941176) *
## 31) total_day_minutes< 221.85 2616 68 no (0.02599388 0.97400612) *
The results show that we are heading in the right direction, because our accuracy and kappa values are increasing (the best kappa is now 0.71 at cp = 0.0059).
dt_model_postprune <- prune(dt_model$finalModel, cp=0.2)
print(dt_model_postprune)
## n= 4000
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 4000 569 no (0.1422500 0.8577500) *
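Pruning at cp = 0.2 was an arbitrary choice. A more principled approach is to pick the cp that minimizes the cross-validated error (xerror) in the model's cptable; a sketch, assuming the cptable includes the xerror column (it does under rpart's default xval = 10):
# Pick the cp with the lowest cross-validated error, then prune with it.
cpt <- dt_model$finalModel$cptable
cp_best <- cpt[which.min(cpt[, "xerror"]), "CP"]
dt_model_postprune2 <- prune(dt_model$finalModel, cp = cp_best)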
plot(dt_model$finalModel)
text(dt_model$finalModel)
#summary(dt_model$finalModel)
Our decision tree is very small; with so few splits, I don't think it captures enough information to make a good prediction about churn.
fancyRpartPlot(dt_model$finalModel)
I am a visual learner, so I appreciate being able to see the decision tree and the splits.
prp(dt_model$finalModel)
rpart.plot(dt_model$finalModel)