准备训练和测试数据

require(C50)
## Loading required package: C50
data(churn)
str(churnTrain)
## 'data.frame':    3333 obs. of  20 variables:
##  $ state                        : Factor w/ 51 levels "AK","AL","AR",..: 17 36 32 36 37 2 20 25 19 50 ...
##  $ account_length               : int  128 107 137 84 75 118 121 147 117 141 ...
##  $ area_code                    : Factor w/ 3 levels "area_code_408",..: 2 2 2 1 2 3 3 2 1 2 ...
##  $ international_plan           : Factor w/ 2 levels "no","yes": 1 1 1 2 2 2 1 2 1 2 ...
##  $ voice_mail_plan              : Factor w/ 2 levels "no","yes": 2 2 1 1 1 1 2 1 1 2 ...
##  $ number_vmail_messages        : int  25 26 0 0 0 0 24 0 0 37 ...
##  $ total_day_minutes            : num  265 162 243 299 167 ...
##  $ total_day_calls              : int  110 123 114 71 113 98 88 79 97 84 ...
##  $ total_day_charge             : num  45.1 27.5 41.4 50.9 28.3 ...
##  $ total_eve_minutes            : num  197.4 195.5 121.2 61.9 148.3 ...
##  $ total_eve_calls              : int  99 103 110 88 122 101 108 94 80 111 ...
##  $ total_eve_charge             : num  16.78 16.62 10.3 5.26 12.61 ...
##  $ total_night_minutes          : num  245 254 163 197 187 ...
##  $ total_night_calls            : int  91 103 104 89 121 118 118 96 90 97 ...
##  $ total_night_charge           : num  11.01 11.45 7.32 8.86 8.41 ...
##  $ total_intl_minutes           : num  10 13.7 12.2 6.6 10.1 6.3 7.5 7.1 8.7 11.2 ...
##  $ total_intl_calls             : int  3 3 5 7 3 6 7 6 4 5 ...
##  $ total_intl_charge            : num  2.7 3.7 3.29 1.78 2.73 1.7 2.03 1.92 2.35 3.02 ...
##  $ number_customer_service_calls: int  1 1 0 2 3 0 3 0 1 0 ...
##  $ churn                        : Factor w/ 2 levels "yes","no": 2 2 2 2 2 2 2 2 2 2 ...
churnTrain <- churnTrain[,!names(churnTrain) %in% c('state', 'area_code', 'account_length')]
# 将70%的数据划分为训练集, 30%划分为测试数据集
set.seed(2)
ind <- sample(2, nrow(churnTrain), replace = T, prob = c(0.7,0.3))
trainset <- churnTrain[ind == 1,]
testset <- churnTrain[ind == 2,]
dim(trainset)
## [1] 2315   17
dim(testset)
## [1] 1018   17

递归分割树

# 使用递归分割树构建分类模型
require(rpart)
## Loading required package: rpart
churn.rp <- rpart(churn ~ ., data = trainset)
churn.rp
## n= 2315 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 2315 342 no (0.14773218 0.85226782)  
##    2) total_day_minutes>=265.45 144  59 yes (0.59027778 0.40972222)  
##      4) voice_mail_plan=no 110  29 yes (0.73636364 0.26363636)  
##        8) total_eve_minutes>=188.5 67   3 yes (0.95522388 0.04477612) *
##        9) total_eve_minutes< 188.5 43  17 no (0.39534884 0.60465116)  
##         18) total_day_minutes>=282.7 19   6 yes (0.68421053 0.31578947) *
##         19) total_day_minutes< 282.7 24   4 no (0.16666667 0.83333333) *
##      5) voice_mail_plan=yes 34   4 no (0.11764706 0.88235294) *
##    3) total_day_minutes< 265.45 2171 257 no (0.11837863 0.88162137)  
##      6) number_customer_service_calls>=3.5 168  82 yes (0.51190476 0.48809524)  
##       12) total_day_minutes< 160.2 71  10 yes (0.85915493 0.14084507) *
##       13) total_day_minutes>=160.2 97  25 no (0.25773196 0.74226804)  
##         26) total_eve_minutes< 155.5 20   7 yes (0.65000000 0.35000000) *
##         27) total_eve_minutes>=155.5 77  12 no (0.15584416 0.84415584) *
##      7) number_customer_service_calls< 3.5 2003 171 no (0.08537194 0.91462806)  
##       14) international_plan=yes 188  76 no (0.40425532 0.59574468)  
##         28) total_intl_calls< 2.5 38   0 yes (1.00000000 0.00000000) *
##         29) total_intl_calls>=2.5 150  38 no (0.25333333 0.74666667)  
##           58) total_intl_minutes>=13.1 32   0 yes (1.00000000 0.00000000) *
##           59) total_intl_minutes< 13.1 118   6 no (0.05084746 0.94915254) *
##       15) international_plan=no 1815  95 no (0.05234160 0.94765840)  
##         30) total_day_minutes>=224.15 251  50 no (0.19920319 0.80079681)  
##           60) total_eve_minutes>=259.8 36  10 yes (0.72222222 0.27777778) *
##           61) total_eve_minutes< 259.8 215  24 no (0.11162791 0.88837209) *
##         31) total_day_minutes< 224.15 1564  45 no (0.02877238 0.97122762) *
# 调用printcp函数检查复杂性参数
printcp(churn.rp)
## 
## Classification tree:
## rpart(formula = churn ~ ., data = trainset)
## 
## Variables actually used in tree construction:
## [1] international_plan            number_customer_service_calls
## [3] total_day_minutes             total_eve_minutes            
## [5] total_intl_calls              total_intl_minutes           
## [7] voice_mail_plan              
## 
## Root node error: 342/2315 = 0.14773
## 
## n= 2315 
## 
##         CP nsplit rel error  xerror     xstd
## 1 0.076023      0   1.00000 1.00000 0.049920
## 2 0.074561      2   0.84795 0.99708 0.049860
## 3 0.055556      4   0.69883 0.76023 0.044421
## 4 0.026316      7   0.49415 0.52632 0.037673
## 5 0.023392      8   0.46784 0.52047 0.037481
## 6 0.020468     10   0.42105 0.50877 0.037092
## 7 0.017544     11   0.40058 0.47076 0.035788
## 8 0.010000     12   0.38304 0.47661 0.035993
# 调用plotcp函数绘制成本复杂性参数
plotcp(churn.rp)

# 检查模型
summary(churn.rp)
## Call:
## rpart(formula = churn ~ ., data = trainset)
##   n= 2315 
## 
##           CP nsplit rel error    xerror       xstd
## 1 0.07602339      0 1.0000000 1.0000000 0.04992005
## 2 0.07456140      2 0.8479532 0.9970760 0.04985964
## 3 0.05555556      4 0.6988304 0.7602339 0.04442127
## 4 0.02631579      7 0.4941520 0.5263158 0.03767329
## 5 0.02339181      8 0.4678363 0.5204678 0.03748096
## 6 0.02046784     10 0.4210526 0.5087719 0.03709209
## 7 0.01754386     11 0.4005848 0.4707602 0.03578773
## 8 0.01000000     12 0.3830409 0.4766082 0.03599261
## 
## Variable importance
##             total_day_minutes              total_day_charge 
##                            18                            18 
## number_customer_service_calls            total_intl_minutes 
##                            10                             8 
##             total_intl_charge              total_eve_charge 
##                             8                             8 
##             total_eve_minutes            international_plan 
##                             8                             7 
##              total_intl_calls         number_vmail_messages 
##                             6                             3 
##               voice_mail_plan             total_night_calls 
##                             3                             1 
##               total_eve_calls 
##                             1 
## 
## Node number 1: 2315 observations,    complexity param=0.07602339
##   predicted class=no   expected loss=0.1477322  P(node) =1
##     class counts:   342  1973
##    probabilities: 0.148 0.852 
##   left son=2 (144 obs) right son=3 (2171 obs)
##   Primary splits:
##       total_day_minutes             < 265.45 to the right, improve=60.145020, (0 missing)
##       total_day_charge              < 45.125 to the right, improve=60.145020, (0 missing)
##       number_customer_service_calls < 3.5    to the right, improve=53.641430, (0 missing)
##       international_plan            splits as  RL,         improve=43.729370, (0 missing)
##       voice_mail_plan               splits as  LR,         improve= 6.089388, (0 missing)
##   Surrogate splits:
##       total_day_charge < 45.125 to the right, agree=1, adj=1, (0 split)
## 
## Node number 2: 144 observations,    complexity param=0.07602339
##   predicted class=yes  expected loss=0.4097222  P(node) =0.06220302
##     class counts:    85    59
##    probabilities: 0.590 0.410 
##   left son=4 (110 obs) right son=5 (34 obs)
##   Primary splits:
##       voice_mail_plan       splits as  LR,         improve=19.884860, (0 missing)
##       number_vmail_messages < 9.5    to the left,  improve=19.884860, (0 missing)
##       total_eve_minutes     < 167.05 to the right, improve=14.540020, (0 missing)
##       total_eve_charge      < 14.2   to the right, improve=14.540020, (0 missing)
##       total_day_minutes     < 283.9  to the right, improve= 6.339827, (0 missing)
##   Surrogate splits:
##       number_vmail_messages < 9.5    to the left,  agree=1.000, adj=1.000, (0 split)
##       total_night_minutes   < 110.3  to the right, agree=0.785, adj=0.088, (0 split)
##       total_night_charge    < 4.965  to the right, agree=0.785, adj=0.088, (0 split)
##       total_night_calls     < 50     to the right, agree=0.778, adj=0.059, (0 split)
##       total_intl_minutes    < 15.3   to the left,  agree=0.771, adj=0.029, (0 split)
## 
## Node number 3: 2171 observations,    complexity param=0.0745614
##   predicted class=no   expected loss=0.1183786  P(node) =0.937797
##     class counts:   257  1914
##    probabilities: 0.118 0.882 
##   left son=6 (168 obs) right son=7 (2003 obs)
##   Primary splits:
##       number_customer_service_calls < 3.5    to the right, improve=56.398210, (0 missing)
##       international_plan            splits as  RL,         improve=43.059160, (0 missing)
##       total_day_minutes             < 224.15 to the right, improve=10.847440, (0 missing)
##       total_day_charge              < 38.105 to the right, improve=10.847440, (0 missing)
##       total_intl_minutes            < 13.15  to the right, improve= 6.347319, (0 missing)
## 
## Node number 4: 110 observations,    complexity param=0.02631579
##   predicted class=yes  expected loss=0.2636364  P(node) =0.0475162
##     class counts:    81    29
##    probabilities: 0.736 0.264 
##   left son=8 (67 obs) right son=9 (43 obs)
##   Primary splits:
##       total_eve_minutes   < 188.5  to the right, improve=16.419610, (0 missing)
##       total_eve_charge    < 16.025 to the right, improve=16.419610, (0 missing)
##       total_night_minutes < 206.85 to the right, improve= 5.350500, (0 missing)
##       total_night_charge  < 9.305  to the right, improve= 5.350500, (0 missing)
##       total_day_minutes   < 281.15 to the right, improve= 5.254545, (0 missing)
##   Surrogate splits:
##       total_eve_charge   < 16.025 to the right, agree=1.000, adj=1.000, (0 split)
##       total_night_calls  < 82     to the right, agree=0.655, adj=0.116, (0 split)
##       total_intl_minutes < 3.35   to the right, agree=0.636, adj=0.070, (0 split)
##       total_intl_charge  < 0.905  to the right, agree=0.636, adj=0.070, (0 split)
##       total_day_minutes  < 268.55 to the right, agree=0.627, adj=0.047, (0 split)
## 
## Node number 5: 34 observations
##   predicted class=no   expected loss=0.1176471  P(node) =0.01468683
##     class counts:     4    30
##    probabilities: 0.118 0.882 
## 
## Node number 6: 168 observations,    complexity param=0.0745614
##   predicted class=yes  expected loss=0.4880952  P(node) =0.07257019
##     class counts:    86    82
##    probabilities: 0.512 0.488 
##   left son=12 (71 obs) right son=13 (97 obs)
##   Primary splits:
##       total_day_minutes             < 160.2  to the left,  improve=29.655880, (0 missing)
##       total_day_charge              < 27.235 to the left,  improve=29.655880, (0 missing)
##       total_eve_minutes             < 180.65 to the left,  improve= 8.556953, (0 missing)
##       total_eve_charge              < 15.355 to the left,  improve= 8.556953, (0 missing)
##       number_customer_service_calls < 4.5    to the right, improve= 5.975362, (0 missing)
##   Surrogate splits:
##       total_day_charge              < 27.235 to the left,  agree=1.000, adj=1.000, (0 split)
##       total_night_calls             < 79     to the left,  agree=0.625, adj=0.113, (0 split)
##       total_intl_calls              < 2.5    to the left,  agree=0.619, adj=0.099, (0 split)
##       number_customer_service_calls < 4.5    to the right, agree=0.607, adj=0.070, (0 split)
##       total_eve_calls               < 89.5   to the left,  agree=0.601, adj=0.056, (0 split)
## 
## Node number 7: 2003 observations,    complexity param=0.05555556
##   predicted class=no   expected loss=0.08537194  P(node) =0.8652268
##     class counts:   171  1832
##    probabilities: 0.085 0.915 
##   left son=14 (188 obs) right son=15 (1815 obs)
##   Primary splits:
##       international_plan splits as  RL,         improve=42.194510, (0 missing)
##       total_day_minutes  < 224.15 to the right, improve=16.838410, (0 missing)
##       total_day_charge   < 38.105 to the right, improve=16.838410, (0 missing)
##       total_intl_minutes < 13.15  to the right, improve= 6.210678, (0 missing)
##       total_intl_charge  < 3.55   to the right, improve= 6.210678, (0 missing)
## 
## Node number 8: 67 observations
##   predicted class=yes  expected loss=0.04477612  P(node) =0.02894168
##     class counts:    64     3
##    probabilities: 0.955 0.045 
## 
## Node number 9: 43 observations,    complexity param=0.02046784
##   predicted class=no   expected loss=0.3953488  P(node) =0.01857451
##     class counts:    17    26
##    probabilities: 0.395 0.605 
##   left son=18 (19 obs) right son=19 (24 obs)
##   Primary splits:
##       total_day_minutes   < 282.7  to the right, improve=5.680947, (0 missing)
##       total_day_charge    < 48.06  to the right, improve=5.680947, (0 missing)
##       total_night_minutes < 212.65 to the right, improve=4.558140, (0 missing)
##       total_night_charge  < 9.57   to the right, improve=4.558140, (0 missing)
##       total_eve_minutes   < 145.4  to the right, improve=4.356169, (0 missing)
##   Surrogate splits:
##       total_day_charge   < 48.06  to the right, agree=1.000, adj=1.000, (0 split)
##       total_day_calls    < 103    to the left,  agree=0.674, adj=0.263, (0 split)
##       total_eve_calls    < 104.5  to the left,  agree=0.674, adj=0.263, (0 split)
##       total_intl_minutes < 11.55  to the left,  agree=0.651, adj=0.211, (0 split)
##       total_intl_charge  < 3.12   to the left,  agree=0.651, adj=0.211, (0 split)
## 
## Node number 12: 71 observations
##   predicted class=yes  expected loss=0.1408451  P(node) =0.03066955
##     class counts:    61    10
##    probabilities: 0.859 0.141 
## 
## Node number 13: 97 observations,    complexity param=0.01754386
##   predicted class=no   expected loss=0.257732  P(node) =0.04190065
##     class counts:    25    72
##    probabilities: 0.258 0.742 
##   left son=26 (20 obs) right son=27 (77 obs)
##   Primary splits:
##       total_eve_minutes             < 155.5  to the left,  improve=7.753662, (0 missing)
##       total_eve_charge              < 13.22  to the left,  improve=7.753662, (0 missing)
##       total_intl_minutes            < 13.55  to the right, improve=2.366149, (0 missing)
##       total_intl_charge             < 3.66   to the right, improve=2.366149, (0 missing)
##       number_customer_service_calls < 4.5    to the right, improve=2.297667, (0 missing)
##   Surrogate splits:
##       total_eve_charge  < 13.22  to the left,  agree=1.000, adj=1.00, (0 split)
##       total_night_calls < 143.5  to the right, agree=0.814, adj=0.10, (0 split)
##       total_eve_calls   < 62     to the left,  agree=0.804, adj=0.05, (0 split)
## 
## Node number 14: 188 observations,    complexity param=0.05555556
##   predicted class=no   expected loss=0.4042553  P(node) =0.0812095
##     class counts:    76   112
##    probabilities: 0.404 0.596 
##   left son=28 (38 obs) right son=29 (150 obs)
##   Primary splits:
##       total_intl_calls   < 2.5    to the left,  improve=33.806520, (0 missing)
##       total_intl_minutes < 13.1   to the right, improve=30.527050, (0 missing)
##       total_intl_charge  < 3.535  to the right, improve=30.527050, (0 missing)
##       total_day_minutes  < 221.95 to the right, improve= 3.386095, (0 missing)
##       total_day_charge   < 37.735 to the right, improve= 3.386095, (0 missing)
## 
## Node number 15: 1815 observations,    complexity param=0.02339181
##   predicted class=no   expected loss=0.0523416  P(node) =0.7840173
##     class counts:    95  1720
##    probabilities: 0.052 0.948 
##   left son=30 (251 obs) right son=31 (1564 obs)
##   Primary splits:
##       total_day_minutes   < 224.15 to the right, improve=12.5649300, (0 missing)
##       total_day_charge    < 38.105 to the right, improve=12.5649300, (0 missing)
##       total_eve_minutes   < 244.95 to the right, improve= 4.7875890, (0 missing)
##       total_eve_charge    < 20.825 to the right, improve= 4.7875890, (0 missing)
##       total_night_minutes < 163.85 to the right, improve= 0.9074391, (0 missing)
##   Surrogate splits:
##       total_day_charge < 38.105 to the right, agree=1, adj=1, (0 split)
## 
## Node number 18: 19 observations
##   predicted class=yes  expected loss=0.3157895  P(node) =0.008207343
##     class counts:    13     6
##    probabilities: 0.684 0.316 
## 
## Node number 19: 24 observations
##   predicted class=no   expected loss=0.1666667  P(node) =0.01036717
##     class counts:     4    20
##    probabilities: 0.167 0.833 
## 
## Node number 26: 20 observations
##   predicted class=yes  expected loss=0.35  P(node) =0.008639309
##     class counts:    13     7
##    probabilities: 0.650 0.350 
## 
## Node number 27: 77 observations
##   predicted class=no   expected loss=0.1558442  P(node) =0.03326134
##     class counts:    12    65
##    probabilities: 0.156 0.844 
## 
## Node number 28: 38 observations
##   predicted class=yes  expected loss=0  P(node) =0.01641469
##     class counts:    38     0
##    probabilities: 1.000 0.000 
## 
## Node number 29: 150 observations,    complexity param=0.05555556
##   predicted class=no   expected loss=0.2533333  P(node) =0.06479482
##     class counts:    38   112
##    probabilities: 0.253 0.747 
##   left son=58 (32 obs) right son=59 (118 obs)
##   Primary splits:
##       total_intl_minutes < 13.1   to the right, improve=45.356840, (0 missing)
##       total_intl_charge  < 3.535  to the right, improve=45.356840, (0 missing)
##       total_day_calls    < 95.5   to the left,  improve= 4.036407, (0 missing)
##       total_day_minutes  < 237.75 to the right, improve= 1.879020, (0 missing)
##       total_day_charge   < 40.42  to the right, improve= 1.879020, (0 missing)
##   Surrogate splits:
##       total_intl_charge < 3.535  to the right, agree=1.0, adj=1.000, (0 split)
##       total_day_minutes < 52.45  to the left,  agree=0.8, adj=0.063, (0 split)
##       total_day_charge  < 8.92   to the left,  agree=0.8, adj=0.063, (0 split)
## 
## Node number 30: 251 observations,    complexity param=0.02339181
##   predicted class=no   expected loss=0.1992032  P(node) =0.1084233
##     class counts:    50   201
##    probabilities: 0.199 0.801 
##   left son=60 (36 obs) right son=61 (215 obs)
##   Primary splits:
##       total_eve_minutes     < 259.8  to the right, improve=22.993380, (0 missing)
##       total_eve_charge      < 22.08  to the right, improve=22.993380, (0 missing)
##       voice_mail_plan       splits as  LR,         improve= 4.745664, (0 missing)
##       number_vmail_messages < 7.5    to the left,  improve= 4.745664, (0 missing)
##       total_night_minutes   < 181.15 to the right, improve= 3.509731, (0 missing)
##   Surrogate splits:
##       total_eve_charge < 22.08  to the right, agree=1, adj=1, (0 split)
## 
## Node number 31: 1564 observations
##   predicted class=no   expected loss=0.02877238  P(node) =0.675594
##     class counts:    45  1519
##    probabilities: 0.029 0.971 
## 
## Node number 58: 32 observations
##   predicted class=yes  expected loss=0  P(node) =0.01382289
##     class counts:    32     0
##    probabilities: 1.000 0.000 
## 
## Node number 59: 118 observations
##   predicted class=no   expected loss=0.05084746  P(node) =0.05097192
##     class counts:     6   112
##    probabilities: 0.051 0.949 
## 
## Node number 60: 36 observations
##   predicted class=yes  expected loss=0.2777778  P(node) =0.01555076
##     class counts:    26    10
##    probabilities: 0.722 0.278 
## 
## Node number 61: 215 observations
##   predicted class=no   expected loss=0.1116279  P(node) =0.09287257
##     class counts:    24   191
##    probabilities: 0.112 0.888
# 递归分割树的可视化:plot and text函数
plot(churn.rp,margin = 0.1)
text(churn.rp,all = T, use.n = T)

# 优化可视化的效果:设置uniform, branch, margin参数
plot(churn.rp, uniform = T, branch = 0.6, margin = 0.1)
text(churn.rp, all = T, use.n = T)

# 递归分割树模型的评估
predictions <- predict(churn.rp, testset, type = 'class')
table(testset$churn, predictions)
##      predictions
##       yes  no
##   yes 100  41
##   no   18 859
# 调用caret包中的confusionMatrix函数生成混淆矩阵
require(caret)
## Loading required package: caret
## Loading required package: lattice
## Loading required package: ggplot2
confusionMatrix(table(predictions,testset$churn))
## Confusion Matrix and Statistics
## 
##            
## predictions yes  no
##         yes 100  18
##         no   41 859
##                                           
##                Accuracy : 0.942           
##                  95% CI : (0.9259, 0.9556)
##     No Information Rate : 0.8615          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.7393          
##  Mcnemar's Test P-Value : 0.004181        
##                                           
##             Sensitivity : 0.70922         
##             Specificity : 0.97948         
##          Pos Pred Value : 0.84746         
##          Neg Pred Value : 0.95444         
##              Prevalence : 0.13851         
##          Detection Rate : 0.09823         
##    Detection Prevalence : 0.11591         
##       Balanced Accuracy : 0.84435         
##                                           
##        'Positive' Class : yes             
## 
# 递归分割树剪枝:避免过度适应(使用成本复杂度剪枝方法)
# 最小交叉检验误差
min(churn.rp$cptable[,"xerror"])
## [1] 0.4707602
# 定位交叉检验误差最小的记录
which.min(churn.rp$cptable[,"xerror"])
## 7 
## 7
# 获取交叉检验误差最小记录的成本复杂度参数值
churn.cp <- churn.rp$cptable[7,'CP']
# 设置参数cp为交叉检验误差最小记录的CP值以进行剪枝
prune.tree <- prune(churn.rp, cp = churn.cp)
# 可视化
plot(prune.tree,margin = 0.1)
text(prune.tree, all = T, use.n = T)

# 剪枝后模型评估
predictions <- predict(prune.tree, testset, type = 'class')
table(testset$churn, predictions)
##      predictions
##       yes  no
##   yes  95  46
##   no   14 863
confusionMatrix(table(predictions, testset$churn))
## Confusion Matrix and Statistics
## 
##            
## predictions yes  no
##         yes  95  14
##         no   46 863
##                                           
##                Accuracy : 0.9411          
##                  95% CI : (0.9248, 0.9547)
##     No Information Rate : 0.8615          
##     P-Value [Acc > NIR] : 2.786e-16       
##                                           
##                   Kappa : 0.727           
##  Mcnemar's Test P-Value : 6.279e-05       
##                                           
##             Sensitivity : 0.67376         
##             Specificity : 0.98404         
##          Pos Pred Value : 0.87156         
##          Neg Pred Value : 0.94939         
##              Prevalence : 0.13851         
##          Detection Rate : 0.09332         
##    Detection Prevalence : 0.10707         
##       Balanced Accuracy : 0.82890         
##                                           
##        'Positive' Class : yes             
## 

条件推理树

# 调用party包,使用其中的ctree函数
require(party)
## Loading required package: party
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich
ctree.model <- ctree(churn ~ ., data = trainset)
ctree.model
## 
##   Conditional inference tree with 18 terminal nodes
## 
## Response:  churn 
## Inputs:  international_plan, voice_mail_plan, number_vmail_messages, total_day_minutes, total_day_calls, total_day_charge, total_eve_minutes, total_eve_calls, total_eve_charge, total_night_minutes, total_night_calls, total_night_charge, total_intl_minutes, total_intl_calls, total_intl_charge, number_customer_service_calls 
## Number of observations:  2315 
## 
## 1) international_plan == {no}; criterion = 1, statistic = 173.582
##   2) number_customer_service_calls <= 3; criterion = 1, statistic = 133.882
##     3) total_day_minutes <= 259.3; criterion = 1, statistic = 232.371
##       4) total_eve_minutes <= 258.7; criterion = 1, statistic = 39.065
##         5)*  weights = 1544 
##       4) total_eve_minutes > 258.7
##         6) total_day_minutes <= 222.9; criterion = 1, statistic = 47.453
##           7)*  weights = 209 
##         6) total_day_minutes > 222.9
##           8) voice_mail_plan == {yes}; criterion = 1, statistic = 20
##             9)*  weights = 8 
##           8) voice_mail_plan == {no}
##             10)*  weights = 28 
##     3) total_day_minutes > 259.3
##       11) voice_mail_plan == {no}; criterion = 1, statistic = 46.262
##         12) total_eve_charge <= 14.09; criterion = 1, statistic = 37.877
##           13)*  weights = 21 
##         12) total_eve_charge > 14.09
##           14) total_night_minutes <= 178.3; criterion = 1, statistic = 19.789
##             15)*  weights = 23 
##           14) total_night_minutes > 178.3
##             16)*  weights = 60 
##       11) voice_mail_plan == {yes}
##         17)*  weights = 34 
##   2) number_customer_service_calls > 3
##     18) total_day_minutes <= 159.4; criterion = 1, statistic = 34.903
##       19) total_eve_minutes <= 233.2; criterion = 0.991, statistic = 11.885
##         20) voice_mail_plan == {no}; criterion = 0.99, statistic = 11.683
##           21)*  weights = 40 
##         20) voice_mail_plan == {yes}
##           22)*  weights = 7 
##       19) total_eve_minutes > 233.2
##         23)*  weights = 16 
##     18) total_day_minutes > 159.4
##       24)*  weights = 96 
## 1) international_plan == {yes}
##   25) total_intl_charge <= 3.51; criterion = 1, statistic = 35.28
##     26) total_intl_calls <= 2; criterion = 1, statistic = 28.013
##       27)*  weights = 40 
##     26) total_intl_calls > 2
##       28) number_customer_service_calls <= 3; criterion = 0.957, statistic = 8.954
##         29) total_day_minutes <= 271.5; criterion = 1, statistic = 25.328
##           30) total_eve_charge <= 25.82; criterion = 0.987, statistic = 11.167
##             31)*  weights = 116 
##           30) total_eve_charge > 25.82
##             32)*  weights = 7 
##         29) total_day_minutes > 271.5
##           33)*  weights = 11 
##       28) number_customer_service_calls > 3
##         34)*  weights = 14 
##   25) total_intl_charge > 3.51
##     35)*  weights = 41
summary(ctree.model)
##     Length      Class       Mode 
##          1 BinaryTree         S4
# 可视化
plot(ctree.model)

# 减少特征参数,重新绘图
daycharge.model <- ctree(churn ~ total_day_charge, data = trainset)
plot(daycharge.model)

# 模型评估
ctree.predict <- predict(ctree.model, testset)
table(ctree.predict, testset$churn)
##              
## ctree.predict yes  no
##           yes  99  15
##           no   42 862
# 性能评测:confusionMatrix
confusionMatrix(table(ctree.predict, testset$churn))
## Confusion Matrix and Statistics
## 
##              
## ctree.predict yes  no
##           yes  99  15
##           no   42 862
##                                           
##                Accuracy : 0.944           
##                  95% CI : (0.9281, 0.9573)
##     No Information Rate : 0.8615          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.7449          
##  Mcnemar's Test P-Value : 0.0005736       
##                                           
##             Sensitivity : 0.70213         
##             Specificity : 0.98290         
##          Pos Pred Value : 0.86842         
##          Neg Pred Value : 0.95354         
##              Prevalence : 0.13851         
##          Detection Rate : 0.09725         
##    Detection Prevalence : 0.11198         
##       Balanced Accuracy : 0.84251         
##                                           
##        'Positive' Class : yes             
## 
# 调用treeresponse函数,输出类概率
tr <- treeresponse(ctree.model, newdata = testset[1:5,])
tr
## [[1]]
## [1] 0.03497409 0.96502591
## 
## [[2]]
## [1] 0.02586207 0.97413793
## 
## [[3]]
## [1] 0.02586207 0.97413793
## 
## [[4]]
## [1] 0.02586207 0.97413793
## 
## [[5]]
## [1] 0.03497409 0.96502591

K近邻分类算法

require(class)
## Loading required package: class
# 替换voice_mail_plan 和international_plan属性原来的yes和no 为1和0
levels(trainset$voice_mail_plan) <- list('0' = 'no','1' = 'yes')
levels(trainset$international_plan) <- list('0' = 'no','1' = 'yes')
levels(testset$voice_mail_plan) <- list('0' = 'no','1' = 'yes')
levels(testset$international_plan) <- list('0' = 'no','1' = 'yes')
# 使用knn分类算法处理训练和测试数据集
churn.knn <- knn(trainset[,!names(trainset) %in% c('churn')], testset[,!names(testset) %in% c('churn')], trainset$churn, k = 3)
summary(churn.knn)
## yes  no 
##  77 941
table(testset$churn, churn.knn)
##      churn.knn
##       yes  no
##   yes  44  97
##   no   33 844
confusionMatrix(table(testset$churn, churn.knn))
## Confusion Matrix and Statistics
## 
##      churn.knn
##       yes  no
##   yes  44  97
##   no   33 844
##                                           
##                Accuracy : 0.8723          
##                  95% CI : (0.8502, 0.8922)
##     No Information Rate : 0.9244          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.339           
##  Mcnemar's Test P-Value : 3.286e-08       
##                                           
##             Sensitivity : 0.57143         
##             Specificity : 0.89692         
##          Pos Pred Value : 0.31206         
##          Neg Pred Value : 0.96237         
##              Prevalence : 0.07564         
##          Detection Rate : 0.04322         
##    Detection Prevalence : 0.13851         
##       Balanced Accuracy : 0.73417         
##                                           
##        'Positive' Class : yes             
## 

逻辑回归分类算法

# 逻辑回归属于基于概率统计分类模型的方法,通过调用logit函数执行逻辑回归分类算法并预测输出概率
# 通过调用glm函数,设置响应分布参数family = binominal
fit <- glm(churn ~ ., data = trainset, family = binomial)
summary(fit)
## 
## Call:
## glm(formula = churn ~ ., family = binomial, data = trainset)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.1519   0.1983   0.3460   0.5186   2.1284  
## 
## Coefficients:
##                                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                    8.3462866  0.8364914   9.978  < 2e-16 ***
## international_plan1           -2.0534243  0.1726694 -11.892  < 2e-16 ***
## voice_mail_plan1               1.3445887  0.6618905   2.031 0.042211 *  
## number_vmail_messages         -0.0155101  0.0209220  -0.741 0.458496    
## total_day_minutes              0.2398946  3.9168466   0.061 0.951163    
## total_day_calls               -0.0014003  0.0032769  -0.427 0.669141    
## total_day_charge              -1.4855284 23.0402950  -0.064 0.948592    
## total_eve_minutes              0.3600678  1.9349825   0.186 0.852379    
## total_eve_calls               -0.0028484  0.0033061  -0.862 0.388928    
## total_eve_charge              -4.3204432 22.7644698  -0.190 0.849475    
## total_night_minutes            0.4431210  1.0478105   0.423 0.672367    
## total_night_calls              0.0003978  0.0033188   0.120 0.904588    
## total_night_charge            -9.9162795 23.2836376  -0.426 0.670188    
## total_intl_minutes             0.4587114  6.3524560   0.072 0.942435    
## total_intl_calls               0.1065264  0.0304318   3.500 0.000464 ***
## total_intl_charge             -2.0803428 23.5262100  -0.088 0.929538    
## number_customer_service_calls -0.5109077  0.0476289 -10.727  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1938.8  on 2314  degrees of freedom
## Residual deviance: 1515.3  on 2298  degrees of freedom
## AIC: 1549.3
## 
## Number of Fisher Scoring iterations: 6
# 根据p值,仅用显著的变量重新建模
fit <- glm(churn ~ international_plan + voice_mail_plan + total_intl_calls + number_customer_service_calls, data = trainset,family = binomial)
summary(fit)
## 
## Call:
## glm(formula = churn ~ international_plan + voice_mail_plan + 
##     total_intl_calls + number_customer_service_calls, family = binomial, 
##     data = trainset)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7308   0.3103   0.4196   0.5381   1.6716  
## 
## Coefficients:
##                               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                    2.32304    0.16770  13.852  < 2e-16 ***
## international_plan1           -2.00346    0.16096 -12.447  < 2e-16 ***
## voice_mail_plan1               0.79228    0.16380   4.837 1.32e-06 ***
## total_intl_calls               0.08414    0.02862   2.939  0.00329 ** 
## number_customer_service_calls -0.44227    0.04451  -9.937  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1938.8  on 2314  degrees of freedom
## Residual deviance: 1669.4  on 2310  degrees of freedom
## AIC: 1679.4
## 
## Number of Fisher Scoring iterations: 5
# 在测试数据集testset上进行模型测试(输出预测类别的概率 type = 'response')
pred <- predict(fit, newdata = testset, type = 'response')
# 通过调整概率改变输出结果(0.5)
Class <- pred > 0.5
summary(Class)
##    Mode   FALSE    TRUE 
## logical      29     989
# 模型评估
tb <- table(testset$churn, Class)
tb
##      Class
##       FALSE TRUE
##   yes    18  123
##   no     11  866
churn.mod <- ifelse(testset$churn == 'yes',1,0)
pred_class <- churn.mod
pred_class[pred <= 0.5] = 1 - pred_class[pred <= 0.5]
ctb <- table(churn.mod, pred_class)
ctb
##          pred_class
## churn.mod   0   1
##         0 866  11
##         1  18 123
confusionMatrix(ctb)
## Confusion Matrix and Statistics
## 
##          pred_class
## churn.mod   0   1
##         0 866  11
##         1  18 123
##                                           
##                Accuracy : 0.9715          
##                  95% CI : (0.9593, 0.9808)
##     No Information Rate : 0.8684          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8781          
##  Mcnemar's Test P-Value : 0.2652          
##                                           
##             Sensitivity : 0.9796          
##             Specificity : 0.9179          
##          Pos Pred Value : 0.9875          
##          Neg Pred Value : 0.8723          
##              Prevalence : 0.8684          
##          Detection Rate : 0.8507          
##    Detection Prevalence : 0.8615          
##       Balanced Accuracy : 0.9488          
##                                           
##        'Positive' Class : 0               
## 

朴素贝叶斯分类算法

library(e1071)
# 使用e1071包中的naiveBayes函数构建分类器
classifier <- naiveBayes(trainset[,!names(trainset) %in% c('churn')], trainset$churn)
classifier
## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = trainset[, !names(trainset) %in% c("churn")], 
##     y = trainset$churn)
## 
## A-priori probabilities:
## trainset$churn
##       yes        no 
## 0.1477322 0.8522678 
## 
## Conditional probabilities:
##               international_plan
## trainset$churn          0          1
##            yes 0.70467836 0.29532164
##            no  0.93512418 0.06487582
## 
##               voice_mail_plan
## trainset$churn         0         1
##            yes 0.8333333 0.1666667
##            no  0.7045109 0.2954891
## 
##               number_vmail_messages
## trainset$churn     [,1]     [,2]
##            yes 5.099415 11.80618
##            no  8.674607 14.03670
## 
##               total_day_minutes
## trainset$churn     [,1]     [,2]
##            yes 205.8877 69.10294
##            no  174.2555 50.16357
## 
##               total_day_calls
## trainset$churn     [,1]     [,2]
##            yes 101.0234 22.02903
##            no  100.5509 19.67038
## 
##               total_day_charge
## trainset$churn     [,1]      [,2]
##            yes 35.00143 11.747587
##            no  29.62402  8.527769
## 
##               total_eve_minutes
## trainset$churn     [,1]     [,2]
##            yes 213.7269 51.92206
##            no  199.6197 50.53780
## 
##               total_eve_calls
## trainset$churn     [,1]     [,2]
##            yes 101.4123 19.48658
##            no   99.9478 20.16161
## 
##               total_eve_charge
## trainset$churn     [,1]     [,2]
##            yes 18.16702 4.413058
##            no  16.96789 4.295730
## 
##               total_night_minutes
## trainset$churn     [,1]     [,2]
##            yes 205.4640 47.11434
##            no  201.4184 51.34049
## 
##               total_night_calls
## trainset$churn     [,1]     [,2]
##            yes 100.2573 20.32690
##            no  100.0193 19.68094
## 
##               total_night_charge
## trainset$churn     [,1]    [,2]
##            yes 9.245994 2.12038
##            no  9.063882 2.31040
## 
##               total_intl_minutes
## trainset$churn     [,1]     [,2]
##            yes 10.73684 2.752784
##            no  10.15119 2.819086
## 
##               total_intl_calls
## trainset$churn     [,1]     [,2]
##            yes 4.134503 2.487395
##            no  4.514445 2.394724
## 
##               total_intl_charge
## trainset$churn     [,1]      [,2]
##            yes 2.899386 0.7432760
##            no  2.741343 0.7611755
## 
##               number_customer_service_calls
## trainset$churn     [,1]     [,2]
##            yes 2.204678 1.808803
##            no  1.441460 1.150114
# 生成测试数据集的分类表
bayes.table <- table(predict(classifier, testset[,!names(testset) %in% c('churn')]), testset$churn)
bayes.table
##      
##       yes  no
##   yes  68  45
##   no   73 832
# 利用分类表生成混淆矩阵
confusionMatrix(bayes.table)
## Confusion Matrix and Statistics
## 
##      
##       yes  no
##   yes  68  45
##   no   73 832
##                                           
##                Accuracy : 0.8841          
##                  95% CI : (0.8628, 0.9031)
##     No Information Rate : 0.8615          
##     P-Value [Acc > NIR] : 0.01880         
##                                           
##                   Kappa : 0.4701          
##  Mcnemar's Test P-Value : 0.01294         
##                                           
##             Sensitivity : 0.4823          
##             Specificity : 0.9487          
##          Pos Pred Value : 0.6018          
##          Neg Pred Value : 0.9193          
##              Prevalence : 0.1385          
##          Detection Rate : 0.0668          
##    Detection Prevalence : 0.1110          
##       Balanced Accuracy : 0.7155          
##                                           
##        'Positive' Class : yes             
##