Load the C50 and pander packages (character.only allows the package names to be passed as strings):
lapply(c("C50", "pander"), library, character.only=TRUE)
## [[1]]
## [1] "C50" "stats" "graphics" "grDevices" "utils" "datasets"
## [7] "methods" "base"
##
## [[2]]
## [1] "pander" "C50" "stats" "graphics" "grDevices" "utils"
## [7] "datasets" "methods" "base"
Load the churn dataset bundled with the C50 package, then use str to read the structure of the training data:
data(churn)
str(churnTrain)
## 'data.frame': 3333 obs. of 20 variables:
## $ state : Factor w/ 51 levels "AK","AL","AR",..: 17 36 32 36 37 2 20 25 19 50 ...
## $ account_length : int 128 107 137 84 75 118 121 147 117 141 ...
## $ area_code : Factor w/ 3 levels "area_code_408",..: 2 2 2 1 2 3 3 2 1 2 ...
## $ international_plan : Factor w/ 2 levels "no","yes": 1 1 1 2 2 2 1 2 1 2 ...
## $ voice_mail_plan : Factor w/ 2 levels "no","yes": 2 2 1 1 1 1 2 1 1 2 ...
## $ number_vmail_messages : int 25 26 0 0 0 0 24 0 0 37 ...
## $ total_day_minutes : num 265 162 243 299 167 ...
## $ total_day_calls : int 110 123 114 71 113 98 88 79 97 84 ...
## $ total_day_charge : num 45.1 27.5 41.4 50.9 28.3 ...
## $ total_eve_minutes : num 197.4 195.5 121.2 61.9 148.3 ...
## $ total_eve_calls : int 99 103 110 88 122 101 108 94 80 111 ...
## $ total_eve_charge : num 16.78 16.62 10.3 5.26 12.61 ...
## $ total_night_minutes : num 245 254 163 197 187 ...
## $ total_night_calls : int 91 103 104 89 121 118 118 96 90 97 ...
## $ total_night_charge : num 11.01 11.45 7.32 8.86 8.41 ...
## $ total_intl_minutes : num 10 13.7 12.2 6.6 10.1 6.3 7.5 7.1 8.7 11.2 ...
## $ total_intl_calls : int 3 3 5 7 3 6 7 6 4 5 ...
## $ total_intl_charge : num 2.7 3.7 3.29 1.78 2.73 1.7 2.03 1.92 2.35 3.02 ...
## $ number_customer_service_calls: int 1 1 0 2 3 0 3 0 1 0 ...
## $ churn : Factor w/ 2 levels "yes","no": 2 2 2 2 2 2 2 2 2 2 ...
Remove the state, area_code, and account_length attributes, which are not appropriate classification features:
churnTrain <- churnTest[,!names(churnTrain) %in% c("state", "area_code", "account_length")]
set.seed(2)
ind <- sample(2, nrow(churnTrain), replace=T, prob=c(0.7, 0.3))
trainset <- churnTrain[ind==1,]
testset <- churnTrain[ind==2,]
Use dim to explore the dimensions of both the training and testing datasets:
pander(dim(trainset))
1153 and 17
pander(dim(testset))
514 and 17
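It can also help to confirm that the churn class proportions are similar in the two partitions (a quick check, not one of the original steps):
prop.table(table(trainset$churn))
prop.table(table(testset$churn))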
#' Function to split data into training and testing sets
splitData <- function(data, p=0.7, s=555){
  set.seed(s)
  index <- sample(1:dim(data)[1])
  train <- data[index[1:floor(dim(data)[1]*p)],]
  # start the test partition right after the training rows so that no row is dropped
  test <- data[index[(floor(dim(data)[1]*p)+1):dim(data)[1]],]
  return(list(train=train, test=test))
}
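A quick usage sketch for this helper (an alternative to the sample-based split used above; sets2 is just an illustrative name):
sets2 <- splitData(churnTrain, p=0.7, s=555)
dim(sets2$train)
dim(sets2$test)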
Perform the following steps to build a classification tree for the churn data with recursive partitioning:
Load the rpart package:
library(rpart)
Use the rpart function to build a classification tree model:
churn.rp <- rpart(churn~., data=trainset)
Type churn.rp to retrieve the node details of the classification tree:
churn.rp
## n= 1153
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 1153 151 no (0.13096271 0.86903729)
## 2) total_day_minutes>=265.25 71 30 yes (0.57746479 0.42253521)
## 4) voice_mail_plan=no 56 16 yes (0.71428571 0.28571429)
## 8) total_eve_minutes>=155.15 35 2 yes (0.94285714 0.05714286) *
## 9) total_eve_minutes< 155.15 21 7 no (0.33333333 0.66666667)
## 18) total_day_minutes>=284.95 7 2 yes (0.71428571 0.28571429) *
## 19) total_day_minutes< 284.95 14 2 no (0.14285714 0.85714286) *
## 5) voice_mail_plan=yes 15 1 no (0.06666667 0.93333333) *
## 3) total_day_minutes< 265.25 1082 110 no (0.10166359 0.89833641)
## 6) number_customer_service_calls>=3.5 89 39 no (0.43820225 0.56179775)
## 12) total_day_minutes< 163.95 29 2 yes (0.93103448 0.06896552) *
## 13) total_day_minutes>=163.95 60 12 no (0.20000000 0.80000000)
## 26) total_night_minutes< 174.35 20 8 no (0.40000000 0.60000000)
## 52) total_day_minutes< 198.6 8 1 yes (0.87500000 0.12500000) *
## 53) total_day_minutes>=198.6 12 1 no (0.08333333 0.91666667) *
## 27) total_night_minutes>=174.35 40 4 no (0.10000000 0.90000000) *
## 7) number_customer_service_calls< 3.5 993 71 no (0.07150050 0.92849950)
## 14) international_plan=yes 100 35 no (0.35000000 0.65000000)
## 28) total_intl_calls< 2.5 18 0 yes (1.00000000 0.00000000) *
## 29) total_intl_calls>=2.5 82 17 no (0.20731707 0.79268293)
## 58) total_intl_minutes>=13.1 16 0 yes (1.00000000 0.00000000) *
## 59) total_intl_minutes< 13.1 66 1 no (0.01515152 0.98484848) *
## 15) international_plan=no 893 36 no (0.04031355 0.95968645) *
Use the printcp function to examine the complexity parameter:
printcp(churn.rp)
##
## Classification tree:
## rpart(formula = churn ~ ., data = trainset)
##
## Variables actually used in tree construction:
## [1] international_plan number_customer_service_calls
## [3] total_day_minutes total_eve_minutes
## [5] total_intl_calls total_intl_minutes
## [7] total_night_minutes voice_mail_plan
##
## Root node error: 151/1153 = 0.13096
##
## n= 1153
##
## CP nsplit rel error xerror xstd
## 1 0.081126 0 1.00000 1.00000 0.075863
## 2 0.059603 4 0.67550 0.78808 0.068414
## 3 0.046358 7 0.45033 0.53642 0.057471
## 4 0.019868 8 0.40397 0.47682 0.054411
## 5 0.010000 11 0.34437 0.47020 0.054057
Use the plotcp function to plot the cost complexity parameters:
plotcp(churn.rp)
Use the summary function to examine the built model:
summary(churn.rp)
## Call:
## rpart(formula = churn ~ ., data = trainset)
## n= 1153
##
## CP nsplit rel error xerror xstd
## 1 0.08112583 0 1.0000000 1.0000000 0.07586313
## 2 0.05960265 4 0.6754967 0.7880795 0.06841355
## 3 0.04635762 7 0.4503311 0.5364238 0.05747094
## 4 0.01986755 8 0.4039735 0.4768212 0.05441110
## 5 0.01000000 11 0.3443709 0.4701987 0.05405691
##
## Variable importance
## total_day_charge total_day_minutes
## 20 20
## total_intl_charge total_intl_minutes
## 8 8
## number_customer_service_calls total_intl_calls
## 7 7
## international_plan total_eve_charge
## 6 4
## total_eve_minutes number_vmail_messages
## 4 3
## voice_mail_plan total_night_minutes
## 3 2
## total_night_charge total_day_calls
## 2 1
## total_night_calls total_eve_calls
## 1 1
##
## Node number 1: 1153 observations, complexity param=0.08112583
## predicted class=no expected loss=0.1309627 P(node) =1
## class counts: 151 1002
## probabilities: 0.131 0.869
## left son=2 (71 obs) right son=3 (1082 obs)
## Primary splits:
## total_day_minutes < 265.25 to the right, improve=30.167360, (0 missing)
## total_day_charge < 45.095 to the right, improve=30.167360, (0 missing)
## international_plan splits as RL, improve=22.877360, (0 missing)
## number_customer_service_calls < 3.5 to the right, improve=22.050500, (0 missing)
## voice_mail_plan splits as LR, improve= 3.989194, (0 missing)
## Surrogate splits:
## total_day_charge < 45.095 to the right, agree=1.00, adj=1.000, (0 split)
## total_night_minutes < 357.5 to the right, agree=0.94, adj=0.028, (0 split)
## total_night_charge < 16.09 to the right, agree=0.94, adj=0.028, (0 split)
##
## Node number 2: 71 observations, complexity param=0.08112583
## predicted class=yes expected loss=0.4225352 P(node) =0.06157849
## class counts: 41 30
## probabilities: 0.577 0.423
## left son=4 (56 obs) right son=5 (15 obs)
## Primary splits:
## voice_mail_plan splits as LR, improve=9.924078, (0 missing)
## number_vmail_messages < 7 to the left, improve=9.924078, (0 missing)
## total_day_minutes < 299.6 to the right, improve=7.375160, (0 missing)
## total_day_charge < 50.935 to the right, improve=7.375160, (0 missing)
## total_eve_minutes < 156.85 to the right, improve=6.828757, (0 missing)
## Surrogate splits:
## number_vmail_messages < 7 to the left, agree=1.000, adj=1.000, (0 split)
## total_night_calls < 58.5 to the right, agree=0.803, adj=0.067, (0 split)
##
## Node number 3: 1082 observations, complexity param=0.08112583
## predicted class=no expected loss=0.1016636 P(node) =0.9384215
## class counts: 110 972
## probabilities: 0.102 0.898
## left son=6 (89 obs) right son=7 (993 obs)
## Primary splits:
## number_customer_service_calls < 3.5 to the right, improve=21.966860, (0 missing)
## international_plan splits as RL, improve=18.820730, (0 missing)
## total_night_minutes < 298.45 to the right, improve= 2.271806, (0 missing)
## total_night_charge < 13.43 to the right, improve= 2.271806, (0 missing)
## total_intl_calls < 2.5 to the left, improve= 2.126618, (0 missing)
##
## Node number 4: 56 observations, complexity param=0.04635762
## predicted class=yes expected loss=0.2857143 P(node) =0.04856895
## class counts: 40 16
## probabilities: 0.714 0.286
## left son=8 (35 obs) right son=9 (21 obs)
## Primary splits:
## total_eve_minutes < 155.15 to the right, improve=9.752381, (0 missing)
## total_eve_charge < 13.19 to the right, improve=9.752381, (0 missing)
## total_day_minutes < 289.15 to the right, improve=5.079365, (0 missing)
## total_day_charge < 49.155 to the right, improve=5.079365, (0 missing)
## total_night_minutes < 196.95 to the right, improve=2.627258, (0 missing)
## Surrogate splits:
## total_eve_charge < 13.19 to the right, agree=1.000, adj=1.000, (0 split)
## total_day_minutes < 269.25 to the right, agree=0.679, adj=0.143, (0 split)
## total_day_charge < 45.775 to the right, agree=0.679, adj=0.143, (0 split)
## total_day_calls < 75.5 to the right, agree=0.643, adj=0.048, (0 split)
## total_night_minutes < 259.15 to the left, agree=0.643, adj=0.048, (0 split)
##
## Node number 5: 15 observations
## predicted class=no expected loss=0.06666667 P(node) =0.01300954
## class counts: 1 14
## probabilities: 0.067 0.933
##
## Node number 6: 89 observations, complexity param=0.08112583
## predicted class=no expected loss=0.4382022 P(node) =0.07718994
## class counts: 39 50
## probabilities: 0.438 0.562
## left son=12 (29 obs) right son=13 (60 obs)
## Primary splits:
## total_day_minutes < 163.95 to the left, improve=20.896090, (0 missing)
## total_day_charge < 27.875 to the left, improve=20.896090, (0 missing)
## total_eve_calls < 92.5 to the left, improve= 4.799904, (0 missing)
## international_plan splits as RL, improve= 2.666915, (0 missing)
## total_night_minutes < 176.7 to the left, improve= 2.476909, (0 missing)
## Surrogate splits:
## total_day_charge < 27.875 to the left, agree=1.000, adj=1.000, (0 split)
## total_eve_minutes < 112.2 to the left, agree=0.708, adj=0.103, (0 split)
## total_eve_charge < 9.54 to the left, agree=0.708, adj=0.103, (0 split)
## total_eve_calls < 90.5 to the left, agree=0.697, adj=0.069, (0 split)
## number_vmail_messages < 34.5 to the right, agree=0.685, adj=0.034, (0 split)
##
## Node number 7: 993 observations, complexity param=0.05960265
## predicted class=no expected loss=0.0715005 P(node) =0.8612316
## class counts: 71 922
## probabilities: 0.072 0.928
## left son=14 (100 obs) right son=15 (893 obs)
## Primary splits:
## international_plan splits as RL, improve=17.249500, (0 missing)
## total_day_minutes < 251.2 to the right, improve= 2.805061, (0 missing)
## total_day_charge < 42.705 to the right, improve= 2.805061, (0 missing)
## total_intl_minutes < 13.35 to the right, improve= 2.613941, (0 missing)
## total_intl_charge < 3.605 to the right, improve= 2.613941, (0 missing)
## Surrogate splits:
## total_night_minutes < 333.15 to the right, agree=0.9, adj=0.01, (0 split)
## total_night_charge < 14.995 to the right, agree=0.9, adj=0.01, (0 split)
##
## Node number 8: 35 observations
## predicted class=yes expected loss=0.05714286 P(node) =0.03035559
## class counts: 33 2
## probabilities: 0.943 0.057
##
## Node number 9: 21 observations, complexity param=0.01986755
## predicted class=no expected loss=0.3333333 P(node) =0.01821336
## class counts: 7 14
## probabilities: 0.333 0.667
## left son=18 (7 obs) right son=19 (14 obs)
## Primary splits:
## total_day_minutes < 284.95 to the right, improve=3.047619, (0 missing)
## total_day_charge < 48.44 to the right, improve=3.047619, (0 missing)
## total_day_calls < 107.5 to the left, improve=2.871795, (0 missing)
## total_night_minutes < 200.6 to the right, improve=2.715152, (0 missing)
## total_night_charge < 9.03 to the right, improve=2.715152, (0 missing)
## Surrogate splits:
## total_day_charge < 48.44 to the right, agree=1.000, adj=1.000, (0 split)
## total_day_calls < 74 to the left, agree=0.762, adj=0.286, (0 split)
## total_night_minutes < 200.6 to the right, agree=0.762, adj=0.286, (0 split)
## total_night_charge < 9.03 to the right, agree=0.762, adj=0.286, (0 split)
## number_customer_service_calls < 3.5 to the right, agree=0.762, adj=0.286, (0 split)
##
## Node number 12: 29 observations
## predicted class=yes expected loss=0.06896552 P(node) =0.02515178
## class counts: 27 2
## probabilities: 0.931 0.069
##
## Node number 13: 60 observations, complexity param=0.01986755
## predicted class=no expected loss=0.2 P(node) =0.05203816
## class counts: 12 48
## probabilities: 0.200 0.800
## left son=26 (20 obs) right son=27 (40 obs)
## Primary splits:
## total_night_minutes < 174.35 to the left, improve=2.400000, (0 missing)
## total_night_charge < 7.845 to the left, improve=2.400000, (0 missing)
## total_day_minutes < 200.1 to the left, improve=1.960181, (0 missing)
## total_day_charge < 34.02 to the left, improve=1.960181, (0 missing)
## total_night_calls < 119.5 to the right, improve=1.908075, (0 missing)
## Surrogate splits:
## total_night_charge < 7.845 to the left, agree=1.000, adj=1.0, (0 split)
## total_day_calls < 76.5 to the left, agree=0.733, adj=0.2, (0 split)
## total_intl_minutes < 6.75 to the left, agree=0.700, adj=0.1, (0 split)
## total_intl_calls < 1.5 to the left, agree=0.700, adj=0.1, (0 split)
## total_intl_charge < 1.825 to the left, agree=0.700, adj=0.1, (0 split)
##
## Node number 14: 100 observations, complexity param=0.05960265
## predicted class=no expected loss=0.35 P(node) =0.08673027
## class counts: 35 65
## probabilities: 0.350 0.650
## left son=28 (18 obs) right son=29 (82 obs)
## Primary splits:
## total_intl_calls < 2.5 to the left, improve=18.548780, (0 missing)
## total_intl_minutes < 13.1 to the right, improve=17.307230, (0 missing)
## total_intl_charge < 3.535 to the right, improve=17.307230, (0 missing)
## total_eve_calls < 96.5 to the left, improve= 2.545455, (0 missing)
## total_eve_minutes < 180.25 to the left, improve= 2.479897, (0 missing)
## Surrogate splits:
## total_day_calls < 70.5 to the left, agree=0.83, adj=0.056, (0 split)
## total_night_minutes < 291.35 to the right, agree=0.83, adj=0.056, (0 split)
## total_night_charge < 13.11 to the right, agree=0.83, adj=0.056, (0 split)
##
## Node number 15: 893 observations
## predicted class=no expected loss=0.04031355 P(node) =0.7745013
## class counts: 36 857
## probabilities: 0.040 0.960
##
## Node number 18: 7 observations
## predicted class=yes expected loss=0.2857143 P(node) =0.006071119
## class counts: 5 2
## probabilities: 0.714 0.286
##
## Node number 19: 14 observations
## predicted class=no expected loss=0.1428571 P(node) =0.01214224
## class counts: 2 12
## probabilities: 0.143 0.857
##
## Node number 26: 20 observations, complexity param=0.01986755
## predicted class=no expected loss=0.4 P(node) =0.01734605
## class counts: 8 12
## probabilities: 0.400 0.600
## left son=52 (8 obs) right son=53 (12 obs)
## Primary splits:
## total_day_minutes < 198.6 to the left, improve=6.016667, (0 missing)
## total_day_charge < 33.765 to the left, improve=6.016667, (0 missing)
## total_eve_calls < 99 to the left, improve=2.327273, (0 missing)
## total_eve_minutes < 220.95 to the left, improve=2.016667, (0 missing)
## total_eve_charge < 18.78 to the left, improve=2.016667, (0 missing)
## Surrogate splits:
## total_day_charge < 33.765 to the left, agree=1.0, adj=1.00, (0 split)
## total_night_calls < 119.5 to the right, agree=0.8, adj=0.50, (0 split)
## total_eve_minutes < 220.95 to the left, agree=0.7, adj=0.25, (0 split)
## total_eve_calls < 76 to the left, agree=0.7, adj=0.25, (0 split)
## total_eve_charge < 18.78 to the left, agree=0.7, adj=0.25, (0 split)
##
## Node number 27: 40 observations
## predicted class=no expected loss=0.1 P(node) =0.03469211
## class counts: 4 36
## probabilities: 0.100 0.900
##
## Node number 28: 18 observations
## predicted class=yes expected loss=0 P(node) =0.01561145
## class counts: 18 0
## probabilities: 1.000 0.000
##
## Node number 29: 82 observations, complexity param=0.05960265
## predicted class=no expected loss=0.2073171 P(node) =0.07111882
## class counts: 17 65
## probabilities: 0.207 0.793
## left son=58 (16 obs) right son=59 (66 obs)
## Primary splits:
## total_intl_minutes < 13.1 to the right, improve=24.981520, (0 missing)
## total_intl_charge < 3.535 to the right, improve=24.981520, (0 missing)
## total_eve_calls < 90.5 to the left, improve= 1.696828, (0 missing)
## total_night_calls < 115.5 to the left, improve= 1.451220, (0 missing)
## total_day_calls < 100.5 to the left, improve= 1.341696, (0 missing)
## Surrogate splits:
## total_intl_charge < 3.535 to the right, agree=1.000, adj=1.000, (0 split)
## total_day_calls < 140 to the right, agree=0.817, adj=0.062, (0 split)
## total_night_minutes < 272.95 to the right, agree=0.817, adj=0.062, (0 split)
## total_night_charge < 12.285 to the right, agree=0.817, adj=0.062, (0 split)
## total_intl_calls < 10 to the right, agree=0.817, adj=0.062, (0 split)
##
## Node number 52: 8 observations
## predicted class=yes expected loss=0.125 P(node) =0.006938422
## class counts: 7 1
## probabilities: 0.875 0.125
##
## Node number 53: 12 observations
## predicted class=no expected loss=0.08333333 P(node) =0.01040763
## class counts: 1 11
## probabilities: 0.083 0.917
##
## Node number 58: 16 observations
## predicted class=yes expected loss=0 P(node) =0.01387684
## class counts: 16 0
## probabilities: 1.000 0.000
##
## Node number 59: 66 observations
## predicted class=no expected loss=0.01515152 P(node) =0.05724198
## class counts: 1 65
## probabilities: 0.015 0.985
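The rounded importance percentages shown in the summary above are derived from raw scores stored on the fitted rpart object; they can also be inspected directly (a quick aside, not one of the original steps):
churn.rp$variable.importance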
From the last recipe, we learned how to print the classification tree in a text format. To make the tree more readable, we can use the plot function to obtain the graphical display of a built classification tree.
Perform the following steps to visualise the classification tree:
Use the plot function and the text function to plot the classification tree:
plot(churn.rp, margin=0.1)
text(churn.rp, all=T, use.n=T)
Specify the uniform, branch, and margin parameters to adjust the layout:
plot(churn.rp, uniform=T, branch=0.6, margin=0.1)
text(churn.rp, all=T, use.n=T)
Since we have built a classification tree in the previous recipes, we can use it to predict the category (class label) of new observations. Before making a prediction, we first validate the predictive power of the classification tree, which can be done by generating a classification table on the testing dataset. In this recipe, we will introduce how to generate a predicted-label versus real-label table with the predict and table functions, and explain how to generate a confusion matrix to measure the performance.
Use the predict function to generate predicted labels for the testing dataset:
predictions <- predict(churn.rp, testset, type="class")
Use the table function to generate a classification table for the testing dataset:
table(testset$churn, predictions)
## predictions
## yes no
## yes 41 32
## no 7 434
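As a quick check, the overall accuracy can be computed directly from this classification table; it should agree with the Accuracy value reported by confusionMatrix below (about 0.924):
sum(diag(table(testset$churn, predictions))) / nrow(testset)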
One can further generate a confusion matrix with the confusionMatrix function provided in the caret package:
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
confusionMatrix(table(predictions, testset$churn))
## Confusion Matrix and Statistics
##
##
## predictions yes no
## yes 41 7
## no 32 434
##
## Accuracy : 0.9241
## 95% CI : (0.8977, 0.9455)
## No Information Rate : 0.858
## P-Value [Acc > NIR] : 2.54e-06
##
## Kappa : 0.6368
## Mcnemar's Test P-Value : 0.0001215
##
## Sensitivity : 0.56164
## Specificity : 0.98413
## Pos Pred Value : 0.85417
## Neg Pred Value : 0.93133
## Prevalence : 0.14202
## Detection Rate : 0.07977
## Detection Prevalence : 0.09339
## Balanced Accuracy : 0.77289
##
## 'Positive' Class : yes
##
Perform the following steps to prune the classification tree:
Find the minimum cross-validation error of the classification tree model:
min(churn.rp$cptable[,"xerror"])
## [1] 0.4701987
Locate the record with the minimum cross-validation error:
which.min(churn.rp$cptable[,"xerror"])
## 5
## 5
Get the cost complexity parameter (CP) of the record with the minimum cross-validation error:
churn.cp <- churn.rp$cptable[which.min(churn.rp$cptable[,"xerror"]), "CP"]
churn.cp
## [1] 0.01
Prune the tree by setting the cp parameter to the CP value of the record with the minimum cross-validation error:
prune.tree = prune(churn.rp, cp=churn.cp)
Visualise the pruned classification tree with the plot and text functions:
plot(prune.tree, uniform=T, branch=0.6, margin=0.1)
text(prune.tree, all=T, use.n=T)
Generate a classification table based on the pruned tree model:
predictions <- predict(prune.tree, testset, type="class")
table(testset$churn, predictions)
## predictions
## yes no
## yes 41 32
## no 7 434
confusionMatrix(table(predictions, testset$churn))
## Confusion Matrix and Statistics
##
##
## predictions yes no
## yes 41 7
## no 32 434
##
## Accuracy : 0.9241
## 95% CI : (0.8977, 0.9455)
## No Information Rate : 0.858
## P-Value [Acc > NIR] : 2.54e-06
##
## Kappa : 0.6368
## Mcnemar's Test P-Value : 0.0001215
##
## Sensitivity : 0.56164
## Specificity : 0.98413
## Pos Pred Value : 0.85417
## Neg Pred Value : 0.93133
## Prevalence : 0.14202
## Detection Rate : 0.07977
## Detection Prevalence : 0.09339
## Balanced Accuracy : 0.77289
##
## 'Positive' Class : yes
##
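Note that the classification table and confusion matrix are identical to those of the unpruned tree: the selected cp (0.01) equals rpart's default complexity parameter, so pruning removes no splits here. A quick way to confirm that the tree size is unchanged is to compare the node counts stored in the model frames:
nrow(churn.rp$frame)
nrow(prune.tree$frame)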
In addition to traditional decision trees (rpart), conditional inference trees (ctree) are another popular tree-based classification method. Like traditional decision trees, conditional inference trees recursively partition the data with univariate splits on the input variables. However, what makes them different is that conditional inference trees use significance-test procedures to select variables, rather than selecting variables by maximising an information measure (rpart employs the Gini coefficient). In this recipe, we will introduce how to use a conditional inference tree to build a classification model.
Use the ctree function from the party package to build the classification model. First, load the package:
library(party)
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Loading required package: sandwich
Build the conditional inference tree, then type ctree.model to examine it:
ctree.model = ctree(churn ~ ., data=trainset)
ctree.model
##
## Conditional inference tree with 12 terminal nodes
##
## Response: churn
## Inputs: international_plan, voice_mail_plan, number_vmail_messages, total_day_minutes, total_day_calls, total_day_charge, total_eve_minutes, total_eve_calls, total_eve_charge, total_night_minutes, total_night_calls, total_night_charge, total_intl_minutes, total_intl_calls, total_intl_charge, number_customer_service_calls
## Number of observations: 1153
##
## 1) international_plan == {no}; criterion = 1, statistic = 100.418
## 2) number_customer_service_calls <= 3; criterion = 1, statistic = 66.785
## 3) total_day_minutes <= 251.1; criterion = 1, statistic = 106.263
## 4)* weights = 869
## 3) total_day_minutes > 251.1
## 5) voice_mail_plan == {yes}; criterion = 0.999, statistic = 16.628
## 6)* weights = 19
## 5) voice_mail_plan == {no}
## 7) total_eve_charge <= 13.3; criterion = 1, statistic = 23.854
## 8) total_day_minutes <= 277.7; criterion = 0.989, statistic = 11.486
## 9)* weights = 16
## 8) total_day_minutes > 277.7
## 10)* weights = 7
## 7) total_eve_charge > 13.3
## 11)* weights = 38
## 2) number_customer_service_calls > 3
## 12) total_day_charge <= 27.63; criterion = 1, statistic = 27.504
## 13)* weights = 27
## 12) total_day_charge > 27.63
## 14)* weights = 60
## 1) international_plan == {yes}
## 15) total_intl_charge <= 3.51; criterion = 0.983, statistic = 10.648
## 16) total_intl_calls <= 2; criterion = 1, statistic = 24.939
## 17)* weights = 22
## 16) total_intl_calls > 2
## 18) number_customer_service_calls <= 3; criterion = 0.999, statistic = 17.025
## 19) total_day_minutes <= 245; criterion = 0.999, statistic = 17.017
## 20)* weights = 63
## 19) total_day_minutes > 245
## 21)* weights = 7
## 18) number_customer_service_calls > 3
## 22)* weights = 7
## 15) total_intl_charge > 3.51
## 23)* weights = 18
summary(ctree.model)
## Length Class Mode
## 1 BinaryTree S4
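The criterion values printed in the tree output correspond to 1 minus the p-value of the association test used at each split. As a sketch beyond the original recipe, the significance threshold can be tightened through ctree_control, which typically yields a smaller tree:
ctree.strict <- ctree(churn ~ ., data=trainset,
                      controls=ctree_control(mincriterion=0.99))
ctree.strict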
Similar to rpart, the party package provides a visualisation method for plotting conditional inference trees. In the following recipe, we will introduce how to use the plot function to visualise them.
Perform the following steps to visualise the conditional inference tree:
Use the plot function to plot ctree.model, built in the last recipe:
plot(ctree.model)
To obtain a simpler tree to read, build and plot a model that uses total_day_charge as the only predictor:
daycharge.model = ctree(churn ~ total_day_charge, data=trainset)
plot(daycharge.model)
After building a conditional inference tree as a classification model, we can use the treeresponse and predict functions to predict categories of the testing dataset, testset, and further validate the predictive power with a classification table and a confusion matrix.
Use the predict function to predict the categories of the testing dataset:
ctree.predict <- predict(ctree.model, testset)
Use the table function to generate a classification table:
table(ctree.predict, testset$churn)
##
## ctree.predict yes no
## yes 50 9
## no 23 432
Use confusionMatrix from the caret package to generate the performance measurements of the prediction result:
confusionMatrix(table(ctree.predict, testset$churn))
## Confusion Matrix and Statistics
##
##
## ctree.predict yes no
## yes 50 9
## no 23 432
##
## Accuracy : 0.9377
## 95% CI : (0.9132, 0.957)
## No Information Rate : 0.858
## P-Value [Acc > NIR] : 8.501e-09
##
## Kappa : 0.7223
## Mcnemar's Test P-Value : 0.02156
##
## Sensitivity : 0.68493
## Specificity : 0.97959
## Pos Pred Value : 0.84746
## Neg Pred Value : 0.94945
## Prevalence : 0.14202
## Detection Rate : 0.09728
## Detection Prevalence : 0.11479
## Balanced Accuracy : 0.83226
##
## 'Positive' Class : yes
##
You can also use the treeresponse function, which returns a list of class probabilities:
tr <- treeresponse(ctree.model, newdata=testset[1:5,])
tr
## [[1]]
## [1] 0.03222094 0.96777906
##
## [[2]]
## [1] 0.03222094 0.96777906
##
## [[3]]
## [1] 0.03222094 0.96777906
##
## [[4]]
## [1] 0.03222094 0.96777906
##
## [[5]]
## [1] 0.03222094 0.96777906
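The returned list can be bound into a probability matrix whose columns follow the factor levels of churn, "yes" and "no" (a short sketch; prob is just an illustrative name):
prob <- do.call(rbind, tr)
colnames(prob) <- levels(testset$churn)
prob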
K-nearest neighbour (KNN) is a non-parametric, lazy learning method. Being non-parametric, it makes no assumptions about the data distribution; being lazy, it requires no explicit learning phase for generalisation. The following recipe introduces how to apply the k-nearest neighbour algorithm to the churn dataset.
Ensure the class package is installed and loaded in the R session:
library(class)
Replace the yes and no levels of the voice_mail_plan and international_plan attributes in both the training and testing datasets with 1 and 0:
levels(trainset$international_plan) <- list("0"="no", "1"="yes")
levels(trainset$voice_mail_plan) <- list("0" ="no", "1"="yes")
levels(testset$international_plan) <- list("0" ="no", "1"="yes")
levels(testset$voice_mail_plan) <- list("0" ="no", "1"="yes")
Use the knn classification method on the training and testing datasets:
churn.knn <- knn(trainset[,!names(trainset) %in% c("churn")],
testset[,!names(testset) %in% c("churn")],
trainset$churn, k=3)
Use the summary function to retrieve the counts of the predicted labels:
summary(churn.knn)
## yes no
## 42 472
Generate the classification table with the table function:
table(testset$churn, churn.knn)
## churn.knn
## yes no
## yes 27 46
## no 15 426
Generate a confusion matrix from the classification table with the confusionMatrix function:
confusionMatrix(table(testset$churn, churn.knn))
## Confusion Matrix and Statistics
##
## churn.knn
## yes no
## yes 27 46
## no 15 426
##
## Accuracy : 0.8813
## 95% CI : (0.8502, 0.908)
## No Information Rate : 0.9183
## P-Value [Acc > NIR] : 0.9985330
##
## Kappa : 0.4082
## Mcnemar's Test P-Value : 0.0001225
##
## Sensitivity : 0.64286
## Specificity : 0.90254
## Pos Pred Value : 0.36986
## Neg Pred Value : 0.96599
## Prevalence : 0.08171
## Detection Rate : 0.05253
## Detection Prevalence : 0.14202
## Balanced Accuracy : 0.77270
##
## 'Positive' Class : yes
##
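The choice of k affects the result; as a small sketch beyond the original steps, the testing accuracy can be compared for a few candidate values of k:
ks <- c(1, 3, 5, 7, 9)
sapply(ks, function(k) {
  pred.k <- knn(trainset[,!names(trainset) %in% c("churn")],
                testset[,!names(testset) %in% c("churn")],
                trainset$churn, k=k)
  mean(pred.k == testset$churn)
})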
Logistic regression is a form of probabilistic statistical classification model that can be used to predict class labels based on one or more features. The classification is done by using the logit function to estimate the outcome probability. One can fit a logistic regression by specifying the family as binomial in the glm function. In this recipe, we will introduce how to classify data using logistic regression.
Use the glm function on the trainset data, with churn as the class label and the rest of the variables as input features:
fit <- glm(churn ~., data=trainset, family=binomial)
Use the summary function to obtain summary information of the built logistic regression model:
summary(fit)
##
## Call:
## glm(formula = churn ~ ., family = binomial, data = trainset)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.1029 0.1760 0.3009 0.4556 2.0471
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 6.802847 1.268728 5.362 8.23e-08 ***
## international_plan1 -2.485414 0.265020 -9.378 < 2e-16 ***
## voice_mail_plan1 2.557456 1.123261 2.277 0.02280 *
## number_vmail_messages -0.038171 0.035315 -1.081 0.27976
## total_day_minutes -15.727733 6.000102 -2.621 0.00876 **
## total_day_calls 0.003972 0.005339 0.744 0.45690
## total_day_charge 92.443304 35.294053 2.619 0.00881 **
## total_eve_minutes 2.231732 3.029727 0.737 0.46136
## total_eve_calls 0.007340 0.005195 1.413 0.15767
## total_eve_charge -26.317420 35.644893 -0.738 0.46032
## total_night_minutes -0.267550 1.617910 -0.165 0.86865
## total_night_calls 0.004920 0.005133 0.958 0.33782
## total_night_charge 5.833745 35.952222 0.162 0.87110
## total_intl_minutes -2.380082 9.665829 -0.246 0.80550
## total_intl_calls 0.064672 0.045013 1.437 0.15079
## total_intl_charge 8.419632 35.806890 0.235 0.81410
## number_customer_service_calls -0.558023 0.074306 -7.510 5.92e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 895.22 on 1152 degrees of freedom
## Residual deviance: 660.78 on 1136 degrees of freedom
## AIC: 694.78
##
## Number of Fisher Scoring iterations: 6
Since several of the variables are not significant, refit the model using only the significant variables:
fit <- glm(churn ~ international_plan + voice_mail_plan + total_intl_calls + number_customer_service_calls, data=trainset, family = binomial)
summary(fit)
##
## Call:
## glm(formula = churn ~ international_plan + voice_mail_plan +
## total_intl_calls + number_customer_service_calls, family = binomial,
## data = trainset)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.8270 0.2437 0.3674 0.5049 1.7744
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.81964 0.26751 10.540 < 2e-16 ***
## international_plan1 -2.19558 0.23899 -9.187 < 2e-16 ***
## voice_mail_plan1 1.31543 0.29595 4.445 8.8e-06 ***
## total_intl_calls 0.06362 0.04361 1.459 0.145
## number_customer_service_calls -0.53926 0.06873 -7.846 4.3e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 895.22 on 1152 degrees of freedom
## Residual deviance: 729.53 on 1148 degrees of freedom
## AIC: 739.53
##
## Number of Fisher Scoring iterations: 5
Use the fitted model, fit, to predict the outcome of testset. You can determine the class by judging whether the probability is above 0.5:
pred <- predict(fit, testset, type="response")
class = pred >.5
Use the summary function to show the counts of the binary outcome (whether the probability is above 0.5):
summary(class)
## Mode FALSE TRUE NA's
## logical 12 502 0
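As a sanity check (a sketch, not one of the original steps), type="response" is simply the inverse logit, plogis, applied to the linear predictor returned with type="link":
eta <- predict(fit, testset, type="link")
all.equal(as.numeric(plogis(eta)), as.numeric(pred))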
Generate a counting table from the actual labels and the predicted classes:
tb <- table(testset$churn, class)
churn.mod <- ifelse(testset$churn == "yes" ,1, 0)
pred_class <- churn.mod
pred_class[pred<=0.5] = 1-pred_class[pred<=.5]
ctb <- table(churn.mod, pred_class)
ctb
## pred_class
## churn.mod 0 1
## 0 435 6
## 1 6 67
confusionMatrix(ctb)
## Confusion Matrix and Statistics
##
## pred_class
## churn.mod 0 1
## 0 435 6
## 1 6 67
##
## Accuracy : 0.9767
## 95% CI : (0.9596, 0.9879)
## No Information Rate : 0.858
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9042
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9864
## Specificity : 0.9178
## Pos Pred Value : 0.9864
## Neg Pred Value : 0.9178
## Prevalence : 0.8580
## Detection Rate : 0.8463
## Detection Prevalence : 0.8580
## Balanced Accuracy : 0.9521
##
## 'Positive' Class : 0
##
The Naive Bayes classifier is also a probability-based classifier; it applies Bayes' theorem with a strong (naive) independence assumption between features. In this recipe, we will introduce how to classify data with the Naive Bayes classifier.
Load the e1071 library and employ the naiveBayes function to build the classifier:
library(e1071)
classifier = naiveBayes(trainset[,!names(trainset) %in% c("churn")],
trainset$churn)
Type classifier into the console to examine the function call, the a priori probabilities, and the conditional probabilities:
classifier
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = trainset[, !names(trainset) %in% c("churn")],
## y = trainset$churn)
##
## A-priori probabilities:
## trainset$churn
## yes no
## 0.1309627 0.8690373
##
## Conditional probabilities:
## international_plan
## trainset$churn 0 1
## yes 0.66887417 0.33112583
## no 0.93313373 0.06686627
##
## voice_mail_plan
## trainset$churn 0 1
## yes 0.8940397 0.1059603
## no 0.7375250 0.2624750
##
## number_vmail_messages
## trainset$churn [,1] [,2]
## yes 3.271523 9.755637
## no 7.626747 13.488975
##
## total_day_minutes
## trainset$churn [,1] [,2]
## yes 204.0152 68.88358
## no 176.2123 48.09356
##
## total_day_calls
## trainset$churn [,1] [,2]
## yes 98.52980 19.56964
## no 99.73752 18.90277
##
## total_day_charge
## trainset$churn [,1] [,2]
## yes 34.68252 11.710554
## no 29.95666 8.175785
##
## total_eve_minutes
## trainset$churn [,1] [,2]
## yes 204.1808 50.27204
## no 198.4301 48.24539
##
## total_eve_calls
## trainset$churn [,1] [,2]
## yes 97.94702 19.29656
## no 100.13972 19.63560
##
## total_eve_charge
## trainset$churn [,1] [,2]
## yes 17.35550 4.273422
## no 16.86679 4.100651
##
## total_night_minutes
## trainset$churn [,1] [,2]
## yes 208.3510 57.88027
## no 197.5606 50.31016
##
## total_night_calls
## trainset$churn [,1] [,2]
## yes 98.49669 17.97549
## no 99.34631 20.88633
##
## total_night_charge
## trainset$churn [,1] [,2]
## yes 9.376026 2.604645
## no 8.890369 2.263923
##
## total_intl_minutes
## trainset$churn [,1] [,2]
## yes 10.83179 2.745794
## no 10.27964 2.702924
##
## total_intl_calls
## trainset$churn [,1] [,2]
## yes 3.980132 2.319397
## no 4.306387 2.330131
##
## total_intl_charge
## trainset$churn [,1] [,2]
## yes 2.925166 0.7410451
## no 2.775988 0.7297420
##
## number_customer_service_calls
## trainset$churn [,1] [,2]
## yes 2.443709 1.738335
## no 1.495010 1.194591
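To make the role of the priors and class-conditional distributions concrete, here is a minimal sketch of the naive Bayes calculation for one observation using only total_day_minutes; the means, standard deviations, and prior probabilities are the values printed above:
x <- testset$total_day_minutes[1]
lik <- c(yes=dnorm(x, mean=204.0152, sd=68.88358),
         no =dnorm(x, mean=176.2123, sd=48.09356))
prior <- c(yes=0.1309627, no=0.8690373)
(prior * lik) / sum(prior * lik)   # posterior from this single feature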
Generate a classification table for the testing dataset:
bayes.table <- table(predict(classifier, testset[,!names(testset) %in% c("churn")]), testset$churn)
bayes.table
##
## yes no
## yes 33 15
## no 40 426
confusionMatrix(bayes.table)
## Confusion Matrix and Statistics
##
##
## yes no
## yes 33 15
## no 40 426
##
## Accuracy : 0.893
## 95% CI : (0.863, 0.9184)
## No Information Rate : 0.858
## P-Value [Acc > NIR] : 0.011362
##
## Kappa : 0.4877
## Mcnemar's Test P-Value : 0.001211
##
## Sensitivity : 0.45205
## Specificity : 0.96599
## Pos Pred Value : 0.68750
## Neg Pred Value : 0.91416
## Prevalence : 0.14202
## Detection Rate : 0.06420
## Detection Prevalence : 0.09339
## Balanced Accuracy : 0.70902
##
## 'Positive' Class : yes
##