dplyr補充:連接資料庫範例(以sqlite3為例)

# sqlite3 download page: https://www.sqlite.org/download.html
#install.packages('dbplyr')
#install.packages('RSQLite')
library('dplyr')
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library('dbplyr')
## 
## Attaching package: 'dbplyr'
## The following objects are masked from 'package:dplyr':
## 
##     ident, sql
library('RSQLite')

# Switch to the course directory so the relative paths used below resolve,
# then restore the objects stored in the .RData file (the following steps
# expect it to provide the `applenews` data frame).
setwd(dir = "~/lecture/riii")
load(file = "Statistics/applenews.RData")

# Create (or open) an on-disk SQLite database in the working directory.
# NOTE: src_sqlite() is deprecated in recent dplyr releases in favour of
# DBI::dbConnect(RSQLite::SQLite(), path); it is kept here so the script only
# relies on the packages it already loads.
my_database <- src_sqlite("./mydatabase", create = TRUE)

# Persist `applenews` as a permanent (non-temporary) table.
# overwrite = TRUE keeps the script re-runnable: without it a second run aborts
# because the table already exists in ./mydatabase.
copy_to(
  my_database,
  applenews,
  name = "applenews",
  temporary = FALSE,
  overwrite = TRUE
)

# Lazy reference to the remote table; printing it only fetches a preview.
tbl(my_database, "applenews")
## # Source:   table<applenews> [?? x 5]
## # Database: sqlite 3.19.3 [./mydatabase]
##                                                                                                        content
##                                                                                                          <chr>
##  1    "\n                                        (更新:新增影片)想要透過刮刮樂彩券一夕致富,但他卻用錯方法!台
##  2 "\n                                        澳洲一名就讀雪梨大學的華裔博士生,日前公開一段燒毀中國護照的影片
##  3 "\n                                        【行銷專題企劃】房價高高在上,沒錢買房沒關係,但你認為自己是聰明
##  4             "\n                                        本內容由中央廣播電臺提供        美國國防部長卡特(Ash
##  5                   "\n                                        俄羅斯夫妻Murad和Nataly,因牽手背景照「Follow 
##  6           "\n                                        台灣浩鼎生技股份有限公司(4174)今(15)日中午在法務部公司
##  7         "\n                                        本內容由中央廣播電臺提供        衛生福利部國民健康署今天
##  8    "\n                                        連結嘉義縣民雄與竹崎交流道的嘉166線道民雄鄉大崎村段,近來出現
##  9   "\n                                        日本九州熊本縣遭強震重創,民航局表示,華航即日起到5月8日止,從
## 10  "\n                                        台中市政府二月下旬抽驗麵龜,發現2家業者麵龜製品檢出非法色素「鹽
## # ... with more rows, and 4 more variables: title <chr>, dt <dbl>,
## #   category <chr>, clicked <int>
# collect() runs the query and pulls every row into a local tibble.
my_database %>%
  tbl("applenews") %>%
  collect()
## # A tibble: 1,500 x 5
##                                                                                                        content
##                                                                                                          <chr>
##  1    "\n                                        (更新:新增影片)想要透過刮刮樂彩券一夕致富,但他卻用錯方法!台
##  2 "\n                                        澳洲一名就讀雪梨大學的華裔博士生,日前公開一段燒毀中國護照的影片
##  3 "\n                                        【行銷專題企劃】房價高高在上,沒錢買房沒關係,但你認為自己是聰明
##  4             "\n                                        本內容由中央廣播電臺提供        美國國防部長卡特(Ash
##  5                   "\n                                        俄羅斯夫妻Murad和Nataly,因牽手背景照「Follow 
##  6           "\n                                        台灣浩鼎生技股份有限公司(4174)今(15)日中午在法務部公司
##  7         "\n                                        本內容由中央廣播電臺提供        衛生福利部國民健康署今天
##  8    "\n                                        連結嘉義縣民雄與竹崎交流道的嘉166線道民雄鄉大崎村段,近來出現
##  9   "\n                                        日本九州熊本縣遭強震重創,民航局表示,華航即日起到5月8日止,從
## 10  "\n                                        台中市政府二月下旬抽驗麵龜,發現2家業者麵龜製品檢出非法色素「鹽
## # ... with 1,490 more rows, and 4 more variables: title <chr>, dt <dbl>,
## #   category <chr>, clicked <int>
# Per-category click statistics, computed inside SQLite and then collected.
# funs() is deprecated (dplyr >= 0.8) and summarise_at() is superseded, so the
# three aggregates are spelled out explicitly; the result keeps the same
# columns as before (category, min, max, mean).
# na.rm = TRUE matches what the SQL aggregates do anyway (NULLs are skipped)
# and avoids dbplyr's "missing values are always removed" warning.
category_stat <- tbl(my_database, "applenews") %>%
  group_by(category) %>%
  summarise(
    min = min(clicked, na.rm = TRUE),
    max = max(clicked, na.rm = TRUE),
    mean = mean(clicked, na.rm = TRUE)
  ) %>%
  arrange(desc(mean)) %>%
  collect()

library('ggplot2')
# Bar chart of the mean click count per category.
# `limits` pins the x axis to the row order of category_stat (sorted by
# descending mean above). The "Songti SC" family is presumably chosen so the
# Chinese category labels render; it must be installed on the machine.
g <- ggplot(category_stat, aes(x = category, y = mean))
g +
  geom_bar(stat = "identity") +
  scale_x_discrete(limits = category_stat$category) +
  theme(text = element_text(size = 16, family = "Songti SC"))

Learning map

Classification

Decision Tree - using churn data in C50 package

#install.packages("C50")
library(C50)

# Load the churn example data and inspect the structure of the training set.
# NOTE(review): recent C50 releases may no longer bundle `churn` - confirm the
# installed version still provides churnTrain.
data("churn")
str(object = churnTrain)
## 'data.frame':    3333 obs. of  20 variables:
##  $ state                        : Factor w/ 51 levels "AK","AL","AR",..: 17 36 32 36 37 2 20 25 19 50 ...
##  $ account_length               : int  128 107 137 84 75 118 121 147 117 141 ...
##  $ area_code                    : Factor w/ 3 levels "area_code_408",..: 2 2 2 1 2 3 3 2 1 2 ...
##  $ international_plan           : Factor w/ 2 levels "no","yes": 1 1 1 2 2 2 1 2 1 2 ...
##  $ voice_mail_plan              : Factor w/ 2 levels "no","yes": 2 2 1 1 1 1 2 1 1 2 ...
##  $ number_vmail_messages        : int  25 26 0 0 0 0 24 0 0 37 ...
##  $ total_day_minutes            : num  265 162 243 299 167 ...
##  $ total_day_calls              : int  110 123 114 71 113 98 88 79 97 84 ...
##  $ total_day_charge             : num  45.1 27.5 41.4 50.9 28.3 ...
##  $ total_eve_minutes            : num  197.4 195.5 121.2 61.9 148.3 ...
##  $ total_eve_calls              : int  99 103 110 88 122 101 108 94 80 111 ...
##  $ total_eve_charge             : num  16.78 16.62 10.3 5.26 12.61 ...
##  $ total_night_minutes          : num  245 254 163 197 187 ...
##  $ total_night_calls            : int  91 103 104 89 121 118 118 96 90 97 ...
##  $ total_night_charge           : num  11.01 11.45 7.32 8.86 8.41 ...
##  $ total_intl_minutes           : num  10 13.7 12.2 6.6 10.1 6.3 7.5 7.1 8.7 11.2 ...
##  $ total_intl_calls             : int  3 3 5 7 3 6 7 6 4 5 ...
##  $ total_intl_charge            : num  2.7 3.7 3.29 1.78 2.73 1.7 2.03 1.92 2.35 3.02 ...
##  $ number_customer_service_calls: int  1 1 0 2 3 0 3 0 1 0 ...
##  $ churn                        : Factor w/ 2 levels "yes","no": 2 2 2 2 2 2 2 2 2 2 ...
# Logical mask over the column names: TRUE marks the three columns that will
# be excluded from the model.
names(churnTrain) %in% c("state", "area_code", "account_length")
##  [1]  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [12] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
# Negated mask: TRUE marks the columns that are kept.
!names(churnTrain) %in% c("state", "area_code", "account_length")
##  [1] FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [12]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
# Select the modelling variables: keep every column except state, area_code
# and account_length.
variable.list <- !(names(churnTrain) %in%
  c("state", "area_code", "account_length"))
churnTrain <- churnTrain[, variable.list]

str(churnTrain)
## 'data.frame':    3333 obs. of  17 variables:
##  $ international_plan           : Factor w/ 2 levels "no","yes": 1 1 1 2 2 2 1 2 1 2 ...
##  $ voice_mail_plan              : Factor w/ 2 levels "no","yes": 2 2 1 1 1 1 2 1 1 2 ...
##  $ number_vmail_messages        : int  25 26 0 0 0 0 24 0 0 37 ...
##  $ total_day_minutes            : num  265 162 243 299 167 ...
##  $ total_day_calls              : int  110 123 114 71 113 98 88 79 97 84 ...
##  $ total_day_charge             : num  45.1 27.5 41.4 50.9 28.3 ...
##  $ total_eve_minutes            : num  197.4 195.5 121.2 61.9 148.3 ...
##  $ total_eve_calls              : int  99 103 110 88 122 101 108 94 80 111 ...
##  $ total_eve_charge             : num  16.78 16.62 10.3 5.26 12.61 ...
##  $ total_night_minutes          : num  245 254 163 197 187 ...
##  $ total_night_calls            : int  91 103 104 89 121 118 118 96 90 97 ...
##  $ total_night_charge           : num  11.01 11.45 7.32 8.86 8.41 ...
##  $ total_intl_minutes           : num  10 13.7 12.2 6.6 10.1 6.3 7.5 7.1 8.7 11.2 ...
##  $ total_intl_calls             : int  3 3 5 7 3 6 7 6 4 5 ...
##  $ total_intl_charge            : num  2.7 3.7 3.29 1.78 2.73 1.7 2.03 1.92 2.35 3.02 ...
##  $ number_customer_service_calls: int  1 1 0 2 3 0 3 0 1 0 ...
##  $ churn                        : Factor w/ 2 levels "yes","no": 2 2 2 2 2 2 2 2 2 2 ...
# sample(): open the help page for the sampling function used in the demos and
# in the train/test split below.
?sample
## Help on topic 'sample' was found in the following packages:
## 
##   Package               Library
##   dplyr                 /Library/Frameworks/R.framework/Versions/3.3/Resources/library
##   base                  /Library/Frameworks/R.framework/Resources/library
## 
## 
## Using the first match ...
# Random permutation of 1..10.
sample(x = 1:10)
##  [1]  8  9  3  5  7  1  4 10  2  6
# Five distinct values drawn from 1..10.
sample(x = 1:10, size = 5)
## [1] 5 4 7 8 3
# Ten draws from {0, 1} with replacement.
sample(x = c(0, 1), size = 10, replace = TRUE)
##  [1] 0 1 0 0 1 0 1 1 0 1
# Both arguments must be integers; here 12 distinct values are drawn from 1:20.
sample.int(n = 20, size = 12)
##  [1] 12  3  6  2 11  5  7 13 19  9 10 18
# Fix the RNG seed so the split below is reproducible.
set.seed(2)
# Split the data into training and testing sets: every row is independently
# labelled 1 (train, probability 0.7) or 2 (test, probability 0.3).
ind <- sample(
  x = 1:2,
  size = nrow(churnTrain),
  replace = TRUE,
  prob = c(0.7, 0.3)
)
trainset <- churnTrain[ind == 1, ]
testset <- churnTrain[ind == 2, ]

rpart

#install.packages('rpart')
library('rpart')
# Build a decision tree model with rpart (CART).
?rpart
?rpart.control
# cp = 0.01 is the complexity parameter handed to rpart() through `control`.
con <- rpart.control(cp = 0.01)
churn.rp <- rpart(churn ~ ., data = trainset, control = con)
# Alternative fit restricted to two predictors:
# churn.rp <- rpart(churn ~ total_day_charge + international_plan, data = trainset)

# Print the fitted tree (one line per node).
churn.rp
## n= 2315 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 2315 342 no (0.14773218 0.85226782)  
##    2) total_day_minutes>=265.45 144  59 yes (0.59027778 0.40972222)  
##      4) voice_mail_plan=no 110  29 yes (0.73636364 0.26363636)  
##        8) total_eve_minutes>=188.5 67   3 yes (0.95522388 0.04477612) *
##        9) total_eve_minutes< 188.5 43  17 no (0.39534884 0.60465116)  
##         18) total_day_minutes>=282.7 19   6 yes (0.68421053 0.31578947) *
##         19) total_day_minutes< 282.7 24   4 no (0.16666667 0.83333333) *
##      5) voice_mail_plan=yes 34   4 no (0.11764706 0.88235294) *
##    3) total_day_minutes< 265.45 2171 257 no (0.11837863 0.88162137)  
##      6) number_customer_service_calls>=3.5 168  82 yes (0.51190476 0.48809524)  
##       12) total_day_minutes< 160.2 71  10 yes (0.85915493 0.14084507) *
##       13) total_day_minutes>=160.2 97  25 no (0.25773196 0.74226804)  
##         26) total_eve_minutes< 155.5 20   7 yes (0.65000000 0.35000000) *
##         27) total_eve_minutes>=155.5 77  12 no (0.15584416 0.84415584) *
##      7) number_customer_service_calls< 3.5 2003 171 no (0.08537194 0.91462806)  
##       14) international_plan=yes 188  76 no (0.40425532 0.59574468)  
##         28) total_intl_calls< 2.5 38   0 yes (1.00000000 0.00000000) *
##         29) total_intl_calls>=2.5 150  38 no (0.25333333 0.74666667)  
##           58) total_intl_minutes>=13.1 32   0 yes (1.00000000 0.00000000) *
##           59) total_intl_minutes< 13.1 118   6 no (0.05084746 0.94915254) *
##       15) international_plan=no 1815  95 no (0.05234160 0.94765840)  
##         30) total_day_minutes>=224.15 251  50 no (0.19920319 0.80079681)  
##           60) total_eve_minutes>=259.8 36  10 yes (0.72222222 0.27777778) *
##           61) total_eve_minutes< 259.8 215  24 no (0.11162791 0.88837209) *
##         31) total_day_minutes< 224.15 1564  45 no (0.02877238 0.97122762) *
# Detailed report of the fitted tree: CP table, variable importance and the
# primary/surrogate splits considered at every node.
summary(churn.rp)
## Call:
## rpart(formula = churn ~ ., data = trainset, control = con)
##   n= 2315 
## 
##           CP nsplit rel error    xerror       xstd
## 1 0.07602339      0 1.0000000 1.0000000 0.04992005
## 2 0.07456140      2 0.8479532 0.9970760 0.04985964
## 3 0.05555556      4 0.6988304 0.7602339 0.04442127
## 4 0.02631579      7 0.4941520 0.5263158 0.03767329
## 5 0.02339181      8 0.4678363 0.5204678 0.03748096
## 6 0.02046784     10 0.4210526 0.5087719 0.03709209
## 7 0.01754386     11 0.4005848 0.4707602 0.03578773
## 8 0.01000000     12 0.3830409 0.4766082 0.03599261
## 
## Variable importance
##             total_day_minutes              total_day_charge 
##                            18                            18 
## number_customer_service_calls            total_intl_minutes 
##                            10                             8 
##             total_intl_charge              total_eve_charge 
##                             8                             8 
##             total_eve_minutes            international_plan 
##                             8                             7 
##              total_intl_calls         number_vmail_messages 
##                             6                             3 
##               voice_mail_plan             total_night_calls 
##                             3                             1 
##               total_eve_calls 
##                             1 
## 
## Node number 1: 2315 observations,    complexity param=0.07602339
##   predicted class=no   expected loss=0.1477322  P(node) =1
##     class counts:   342  1973
##    probabilities: 0.148 0.852 
##   left son=2 (144 obs) right son=3 (2171 obs)
##   Primary splits:
##       total_day_minutes             < 265.45 to the right, improve=60.145020, (0 missing)
##       total_day_charge              < 45.125 to the right, improve=60.145020, (0 missing)
##       number_customer_service_calls < 3.5    to the right, improve=53.641430, (0 missing)
##       international_plan            splits as  RL,         improve=43.729370, (0 missing)
##       voice_mail_plan               splits as  LR,         improve= 6.089388, (0 missing)
##   Surrogate splits:
##       total_day_charge < 45.125 to the right, agree=1, adj=1, (0 split)
## 
## Node number 2: 144 observations,    complexity param=0.07602339
##   predicted class=yes  expected loss=0.4097222  P(node) =0.06220302
##     class counts:    85    59
##    probabilities: 0.590 0.410 
##   left son=4 (110 obs) right son=5 (34 obs)
##   Primary splits:
##       voice_mail_plan       splits as  LR,         improve=19.884860, (0 missing)
##       number_vmail_messages < 9.5    to the left,  improve=19.884860, (0 missing)
##       total_eve_minutes     < 167.05 to the right, improve=14.540020, (0 missing)
##       total_eve_charge      < 14.2   to the right, improve=14.540020, (0 missing)
##       total_day_minutes     < 283.9  to the right, improve= 6.339827, (0 missing)
##   Surrogate splits:
##       number_vmail_messages < 9.5    to the left,  agree=1.000, adj=1.000, (0 split)
##       total_night_minutes   < 110.3  to the right, agree=0.785, adj=0.088, (0 split)
##       total_night_charge    < 4.965  to the right, agree=0.785, adj=0.088, (0 split)
##       total_night_calls     < 50     to the right, agree=0.778, adj=0.059, (0 split)
##       total_intl_minutes    < 15.3   to the left,  agree=0.771, adj=0.029, (0 split)
## 
## Node number 3: 2171 observations,    complexity param=0.0745614
##   predicted class=no   expected loss=0.1183786  P(node) =0.937797
##     class counts:   257  1914
##    probabilities: 0.118 0.882 
##   left son=6 (168 obs) right son=7 (2003 obs)
##   Primary splits:
##       number_customer_service_calls < 3.5    to the right, improve=56.398210, (0 missing)
##       international_plan            splits as  RL,         improve=43.059160, (0 missing)
##       total_day_minutes             < 224.15 to the right, improve=10.847440, (0 missing)
##       total_day_charge              < 38.105 to the right, improve=10.847440, (0 missing)
##       total_intl_minutes            < 13.15  to the right, improve= 6.347319, (0 missing)
## 
## Node number 4: 110 observations,    complexity param=0.02631579
##   predicted class=yes  expected loss=0.2636364  P(node) =0.0475162
##     class counts:    81    29
##    probabilities: 0.736 0.264 
##   left son=8 (67 obs) right son=9 (43 obs)
##   Primary splits:
##       total_eve_minutes   < 188.5  to the right, improve=16.419610, (0 missing)
##       total_eve_charge    < 16.025 to the right, improve=16.419610, (0 missing)
##       total_night_minutes < 206.85 to the right, improve= 5.350500, (0 missing)
##       total_night_charge  < 9.305  to the right, improve= 5.350500, (0 missing)
##       total_day_minutes   < 281.15 to the right, improve= 5.254545, (0 missing)
##   Surrogate splits:
##       total_eve_charge   < 16.025 to the right, agree=1.000, adj=1.000, (0 split)
##       total_night_calls  < 82     to the right, agree=0.655, adj=0.116, (0 split)
##       total_intl_minutes < 3.35   to the right, agree=0.636, adj=0.070, (0 split)
##       total_intl_charge  < 0.905  to the right, agree=0.636, adj=0.070, (0 split)
##       total_day_minutes  < 268.55 to the right, agree=0.627, adj=0.047, (0 split)
## 
## Node number 5: 34 observations
##   predicted class=no   expected loss=0.1176471  P(node) =0.01468683
##     class counts:     4    30
##    probabilities: 0.118 0.882 
## 
## Node number 6: 168 observations,    complexity param=0.0745614
##   predicted class=yes  expected loss=0.4880952  P(node) =0.07257019
##     class counts:    86    82
##    probabilities: 0.512 0.488 
##   left son=12 (71 obs) right son=13 (97 obs)
##   Primary splits:
##       total_day_minutes             < 160.2  to the left,  improve=29.655880, (0 missing)
##       total_day_charge              < 27.235 to the left,  improve=29.655880, (0 missing)
##       total_eve_minutes             < 180.65 to the left,  improve= 8.556953, (0 missing)
##       total_eve_charge              < 15.355 to the left,  improve= 8.556953, (0 missing)
##       number_customer_service_calls < 4.5    to the right, improve= 5.975362, (0 missing)
##   Surrogate splits:
##       total_day_charge              < 27.235 to the left,  agree=1.000, adj=1.000, (0 split)
##       total_night_calls             < 79     to the left,  agree=0.625, adj=0.113, (0 split)
##       total_intl_calls              < 2.5    to the left,  agree=0.619, adj=0.099, (0 split)
##       number_customer_service_calls < 4.5    to the right, agree=0.607, adj=0.070, (0 split)
##       total_eve_calls               < 89.5   to the left,  agree=0.601, adj=0.056, (0 split)
## 
## Node number 7: 2003 observations,    complexity param=0.05555556
##   predicted class=no   expected loss=0.08537194  P(node) =0.8652268
##     class counts:   171  1832
##    probabilities: 0.085 0.915 
##   left son=14 (188 obs) right son=15 (1815 obs)
##   Primary splits:
##       international_plan splits as  RL,         improve=42.194510, (0 missing)
##       total_day_minutes  < 224.15 to the right, improve=16.838410, (0 missing)
##       total_day_charge   < 38.105 to the right, improve=16.838410, (0 missing)
##       total_intl_minutes < 13.15  to the right, improve= 6.210678, (0 missing)
##       total_intl_charge  < 3.55   to the right, improve= 6.210678, (0 missing)
## 
## Node number 8: 67 observations
##   predicted class=yes  expected loss=0.04477612  P(node) =0.02894168
##     class counts:    64     3
##    probabilities: 0.955 0.045 
## 
## Node number 9: 43 observations,    complexity param=0.02046784
##   predicted class=no   expected loss=0.3953488  P(node) =0.01857451
##     class counts:    17    26
##    probabilities: 0.395 0.605 
##   left son=18 (19 obs) right son=19 (24 obs)
##   Primary splits:
##       total_day_minutes   < 282.7  to the right, improve=5.680947, (0 missing)
##       total_day_charge    < 48.06  to the right, improve=5.680947, (0 missing)
##       total_night_minutes < 212.65 to the right, improve=4.558140, (0 missing)
##       total_night_charge  < 9.57   to the right, improve=4.558140, (0 missing)
##       total_eve_minutes   < 145.4  to the right, improve=4.356169, (0 missing)
##   Surrogate splits:
##       total_day_charge   < 48.06  to the right, agree=1.000, adj=1.000, (0 split)
##       total_day_calls    < 103    to the left,  agree=0.674, adj=0.263, (0 split)
##       total_eve_calls    < 104.5  to the left,  agree=0.674, adj=0.263, (0 split)
##       total_intl_minutes < 11.55  to the left,  agree=0.651, adj=0.211, (0 split)
##       total_intl_charge  < 3.12   to the left,  agree=0.651, adj=0.211, (0 split)
## 
## Node number 12: 71 observations
##   predicted class=yes  expected loss=0.1408451  P(node) =0.03066955
##     class counts:    61    10
##    probabilities: 0.859 0.141 
## 
## Node number 13: 97 observations,    complexity param=0.01754386
##   predicted class=no   expected loss=0.257732  P(node) =0.04190065
##     class counts:    25    72
##    probabilities: 0.258 0.742 
##   left son=26 (20 obs) right son=27 (77 obs)
##   Primary splits:
##       total_eve_minutes             < 155.5  to the left,  improve=7.753662, (0 missing)
##       total_eve_charge              < 13.22  to the left,  improve=7.753662, (0 missing)
##       total_intl_minutes            < 13.55  to the right, improve=2.366149, (0 missing)
##       total_intl_charge             < 3.66   to the right, improve=2.366149, (0 missing)
##       number_customer_service_calls < 4.5    to the right, improve=2.297667, (0 missing)
##   Surrogate splits:
##       total_eve_charge  < 13.22  to the left,  agree=1.000, adj=1.00, (0 split)
##       total_night_calls < 143.5  to the right, agree=0.814, adj=0.10, (0 split)
##       total_eve_calls   < 62     to the left,  agree=0.804, adj=0.05, (0 split)
## 
## Node number 14: 188 observations,    complexity param=0.05555556
##   predicted class=no   expected loss=0.4042553  P(node) =0.0812095
##     class counts:    76   112
##    probabilities: 0.404 0.596 
##   left son=28 (38 obs) right son=29 (150 obs)
##   Primary splits:
##       total_intl_calls   < 2.5    to the left,  improve=33.806520, (0 missing)
##       total_intl_minutes < 13.1   to the right, improve=30.527050, (0 missing)
##       total_intl_charge  < 3.535  to the right, improve=30.527050, (0 missing)
##       total_day_minutes  < 221.95 to the right, improve= 3.386095, (0 missing)
##       total_day_charge   < 37.735 to the right, improve= 3.386095, (0 missing)
## 
## Node number 15: 1815 observations,    complexity param=0.02339181
##   predicted class=no   expected loss=0.0523416  P(node) =0.7840173
##     class counts:    95  1720
##    probabilities: 0.052 0.948 
##   left son=30 (251 obs) right son=31 (1564 obs)
##   Primary splits:
##       total_day_minutes   < 224.15 to the right, improve=12.5649300, (0 missing)
##       total_day_charge    < 38.105 to the right, improve=12.5649300, (0 missing)
##       total_eve_minutes   < 244.95 to the right, improve= 4.7875890, (0 missing)
##       total_eve_charge    < 20.825 to the right, improve= 4.7875890, (0 missing)
##       total_night_minutes < 163.85 to the right, improve= 0.9074391, (0 missing)
##   Surrogate splits:
##       total_day_charge < 38.105 to the right, agree=1, adj=1, (0 split)
## 
## Node number 18: 19 observations
##   predicted class=yes  expected loss=0.3157895  P(node) =0.008207343
##     class counts:    13     6
##    probabilities: 0.684 0.316 
## 
## Node number 19: 24 observations
##   predicted class=no   expected loss=0.1666667  P(node) =0.01036717
##     class counts:     4    20
##    probabilities: 0.167 0.833 
## 
## Node number 26: 20 observations
##   predicted class=yes  expected loss=0.35  P(node) =0.008639309
##     class counts:    13     7
##    probabilities: 0.650 0.350 
## 
## Node number 27: 77 observations
##   predicted class=no   expected loss=0.1558442  P(node) =0.03326134
##     class counts:    12    65
##    probabilities: 0.156 0.844 
## 
## Node number 28: 38 observations
##   predicted class=yes  expected loss=0  P(node) =0.01641469
##     class counts:    38     0
##    probabilities: 1.000 0.000 
## 
## Node number 29: 150 observations,    complexity param=0.05555556
##   predicted class=no   expected loss=0.2533333  P(node) =0.06479482
##     class counts:    38   112
##    probabilities: 0.253 0.747 
##   left son=58 (32 obs) right son=59 (118 obs)
##   Primary splits:
##       total_intl_minutes < 13.1   to the right, improve=45.356840, (0 missing)
##       total_intl_charge  < 3.535  to the right, improve=45.356840, (0 missing)
##       total_day_calls    < 95.5   to the left,  improve= 4.036407, (0 missing)
##       total_day_minutes  < 237.75 to the right, improve= 1.879020, (0 missing)
##       total_day_charge   < 40.42  to the right, improve= 1.879020, (0 missing)
##   Surrogate splits:
##       total_intl_charge < 3.535  to the right, agree=1.0, adj=1.000, (0 split)
##       total_day_minutes < 52.45  to the left,  agree=0.8, adj=0.063, (0 split)
##       total_day_charge  < 8.92   to the left,  agree=0.8, adj=0.063, (0 split)
## 
## Node number 30: 251 observations,    complexity param=0.02339181
##   predicted class=no   expected loss=0.1992032  P(node) =0.1084233
##     class counts:    50   201
##    probabilities: 0.199 0.801 
##   left son=60 (36 obs) right son=61 (215 obs)
##   Primary splits:
##       total_eve_minutes     < 259.8  to the right, improve=22.993380, (0 missing)
##       total_eve_charge      < 22.08  to the right, improve=22.993380, (0 missing)
##       voice_mail_plan       splits as  LR,         improve= 4.745664, (0 missing)
##       number_vmail_messages < 7.5    to the left,  improve= 4.745664, (0 missing)
##       total_night_minutes   < 181.15 to the right, improve= 3.509731, (0 missing)
##   Surrogate splits:
##       total_eve_charge < 22.08  to the right, agree=1, adj=1, (0 split)
## 
## Node number 31: 1564 observations
##   predicted class=no   expected loss=0.02877238  P(node) =0.675594
##     class counts:    45  1519
##    probabilities: 0.029 0.971 
## 
## Node number 58: 32 observations
##   predicted class=yes  expected loss=0  P(node) =0.01382289
##     class counts:    32     0
##    probabilities: 1.000 0.000 
## 
## Node number 59: 118 observations
##   predicted class=no   expected loss=0.05084746  P(node) =0.05097192
##     class counts:     6   112
##    probabilities: 0.051 0.949 
## 
## Node number 60: 36 observations
##   predicted class=yes  expected loss=0.2777778  P(node) =0.01555076
##     class counts:    26    10
##    probabilities: 0.722 0.278 
## 
## Node number 61: 215 observations
##   predicted class=no   expected loss=0.1116279  P(node) =0.09287257
##     class counts:    24   191
##    probabilities: 0.112 0.888
# Draw the decision tree.
?plot.rpart
# Single-panel layout, then the tree skeleton followed by its node labels.
par(mfrow = c(1, 1))
plot(churn.rp, uniform = TRUE, branch = 0.6, margin = 0.1)
text(churn.rp, all = TRUE, use.n = TRUE, cex = 0.7)

# Print the complexity-parameter table (CP, number of splits, relative error,
# cross-validated error and its standard error) for the fitted tree.
printcp(churn.rp)
## 
## Classification tree:
## rpart(formula = churn ~ ., data = trainset, control = con)
## 
## Variables actually used in tree construction:
## [1] international_plan            number_customer_service_calls
## [3] total_day_minutes             total_eve_minutes            
## [5] total_intl_calls              total_intl_minutes           
## [7] voice_mail_plan              
## 
## Root node error: 342/2315 = 0.14773
## 
## n= 2315 
## 
##         CP nsplit rel error  xerror     xstd
## 1 0.076023      0   1.00000 1.00000 0.049920
## 2 0.074561      2   0.84795 0.99708 0.049860
## 3 0.055556      4   0.69883 0.76023 0.044421
## 4 0.026316      7   0.49415 0.52632 0.037673
## 5 0.023392      8   0.46784 0.52047 0.037481
## 6 0.020468     10   0.42105 0.50877 0.037092
## 7 0.017544     11   0.40058 0.47076 0.035788
## 8 0.010000     12   0.38304 0.47661 0.035993
# Plot the complexity-parameter table of the fitted tree to help choose a
# pruning threshold.
plotcp(churn.rp)

Prune

# Find the cptable row with the minimum cross-validation error (xerror) and
# read off the CP value stored in that row.
min_row <- which.min(churn.rp[["cptable"]][, "xerror"])
churn.cp <- churn.rp[["cptable"]][min_row, "CP"]
# Use churn.cp as the threshold to prune the tree.
prune.tree <- prune(tree = churn.rp, cp = churn.cp)

# Draw the pruned tree and label its nodes.
plot(prune.tree, margin = 0.1)
text(prune.tree, all = TRUE, use.n = TRUE, cex = 0.7)

# Class predictions of the pruned tree on the held-out test set, cross-tabulated
# against the true labels (rows: predicted, columns: actual).
predictions <- predict(prune.tree, newdata = testset, type = "class")
table(predictions, testset$churn)
##            
## predictions yes  no
##         yes  95  14
##         no   46 863
#install.packages('caret')
#install.packages('e1071')
library('caret')
## Loading required package: lattice
library('e1071')
# Accuracy, kappa, sensitivity/specificity and related statistics for the
# pruned rpart tree, derived from the predicted-vs-actual table.
confusionMatrix(table(predictions, testset$churn))
## Confusion Matrix and Statistics
## 
##            
## predictions yes  no
##         yes  95  14
##         no   46 863
##                                           
##                Accuracy : 0.9411          
##                  95% CI : (0.9248, 0.9547)
##     No Information Rate : 0.8615          
##     P-Value [Acc > NIR] : 2.786e-16       
##                                           
##                   Kappa : 0.727           
##  Mcnemar's Test P-Value : 6.279e-05       
##                                           
##             Sensitivity : 0.67376         
##             Specificity : 0.98404         
##          Pos Pred Value : 0.87156         
##          Neg Pred Value : 0.94939         
##              Prevalence : 0.13851         
##          Detection Rate : 0.09332         
##    Detection Prevalence : 0.10707         
##       Balanced Accuracy : 0.82890         
##                                           
##        'Positive' Class : yes             
## 
# Help lookup. The recorded output shows the topic exists in both ModelMetrics
# and caret and that the first match is opened; use ?caret::confusionMatrix to
# open caret's page explicitly.
?confusionMatrix
## Help on topic 'confusionMatrix' was found in the following
## packages:
## 
##   Package               Library
##   ModelMetrics          /Library/Frameworks/R.framework/Versions/3.3/Resources/library
##   caret                 /Library/Frameworks/R.framework/Versions/3.3/Resources/library
## 
## 
## Using the first match ...

ctree

#install.packages("party")
library('party')
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich
# Conditional inference tree (party::ctree) fitted on every predictor.
ctree.model <- ctree(churn ~ ., data = trainset)
plot(ctree.model, margin = 0.1)

# Smaller tree restricted to two predictors, which is easier to read.
daycharge.model <- ctree(
  churn ~ total_day_charge + international_plan,
  data = trainset
)
plot(daycharge.model)

# Predict on the held-out test set and cross-tabulate against the true labels
# (rows: predicted, columns: actual).
ctree.predict <- predict(ctree.model, testset)
table(ctree.predict, testset$churn)
##              
## ctree.predict yes  no
##           yes  99  15
##           no   42 862
# Same evaluation statistics as for the rpart tree, here for the ctree
# predictions on the test set.
confusionMatrix(table(ctree.predict, testset$churn))
## Confusion Matrix and Statistics
## 
##              
## ctree.predict yes  no
##           yes  99  15
##           no   42 862
##                                           
##                Accuracy : 0.944           
##                  95% CI : (0.9281, 0.9573)
##     No Information Rate : 0.8615          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.7449          
##  Mcnemar's Test P-Value : 0.0005736       
##                                           
##             Sensitivity : 0.70213         
##             Specificity : 0.98290         
##          Pos Pred Value : 0.86842         
##          Neg Pred Value : 0.95354         
##              Prevalence : 0.13851         
##          Detection Rate : 0.09725         
##    Detection Prevalence : 0.11198         
##       Balanced Accuracy : 0.84251         
##                                           
##        'Positive' Class : yes             
## 

C5.0

#install.packages("C50")
library(C50)
# Baseline C5.0 tree with default settings; it is overwritten by the
# controlled fit below.
c50.model <- C5.0(churn ~ ., data = trainset)

# Open the help page describing the options accepted by C5.0Control().
help("C5.0Control")

# Refit with the minCases control option set to 20.
# NOTE(review): the variable name `c` shadows base::c as a variable; calls
# such as c(...) still resolve to the function, but a more descriptive name
# would be clearer. It is kept here because the fitted model records
# `control = c` in its call.
c <- C5.0Control(minCases = 20)
c50.model <- C5.0(churn ~ ., data = trainset, control = c)

# Print the tree, its training-set error and attribute usage.
summary(c50.model)
## 
## Call:
## C5.0.formula(formula = churn ~ ., data = trainset, control = c)
## 
## 
## C5.0 [Release 2.07 GPL Edition]      Mon Feb  5 11:52:34 2018
## -------------------------------
## 
## Class specified by attribute `outcome'
## 
## Read 2315 cases (17 attributes) from undefined.data
## 
## Decision tree:
## 
## number_customer_service_calls > 3:
## :...total_day_minutes <= 160.1: yes (71/10)
## :   total_day_minutes > 160.1: no (108/32)
## number_customer_service_calls <= 3:
## :...international_plan = yes:
##     :...total_intl_calls <= 2: yes (41)
##     :   total_intl_calls > 2:
##     :   :...total_intl_minutes <= 13.1: no (134/13)
##     :       total_intl_minutes > 13.1: yes (34)
##     international_plan = no:
##     :...total_day_minutes <= 224.1: no (1564/45)
##         total_day_minutes > 224.1:
##         :...voice_mail_plan = yes: no (97/4)
##             voice_mail_plan = no:
##             :...total_eve_charge <= 17.47:
##                 :...total_day_minutes <= 278.4: no (124/10)
##                 :   total_day_minutes > 278.4: yes (20/5)
##                 total_eve_charge > 17.47:
##                 :...total_day_minutes > 264: yes (46)
##                     total_day_minutes <= 264:
##                     :...total_eve_charge > 22.04: yes (29/4)
##                         total_eve_charge <= 22.04:
##                         :...total_night_charge <= 9.04: no (23/1)
##                             total_night_charge > 9.04: yes (24/9)
## 
## 
## Evaluation on training data (2315 cases):
## 
##      Decision Tree   
##    ----------------  
##    Size      Errors  
## 
##      13  133( 5.7%)   <<
## 
## 
##     (a)   (b)    <-classified as
##    ----  ----
##     237   105    (a): class yes
##      28  1945    (b): class no
## 
## 
##  Attribute usage:
## 
##  100.00% number_customer_service_calls
##   92.27% international_plan
##   90.97% total_day_minutes
##   15.68% voice_mail_plan
##   11.49% total_eve_charge
##    9.03% total_intl_calls
##    7.26% total_intl_minutes
##    2.03% total_night_charge
## 
## 
## Time: 0.0 secs
# Draw the fitted C5.0 tree.
plot(c50.model)

# Predict on testset and cross-tabulate the predictions (rows) against the
# observed churn labels (columns).
c50.predict <- predict(c50.model, newdata = testset)
table(c50.predict, testset$churn)
##            
## c50.predict yes  no
##         yes  97  15
##         no   44 862
# Report accuracy, kappa, sensitivity, specificity and related statistics for
# the table of C5.0 predictions (rows) against testset$churn (columns).
confusionMatrix(table(c50.predict, testset$churn))
## Confusion Matrix and Statistics
## 
##            
## c50.predict yes  no
##         yes  97  15
##         no   44 862
##                                           
##                Accuracy : 0.942           
##                  95% CI : (0.9259, 0.9556)
##     No Information Rate : 0.8615          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.7342          
##  Mcnemar's Test P-Value : 0.0002671       
##                                           
##             Sensitivity : 0.68794         
##             Specificity : 0.98290         
##          Pos Pred Value : 0.86607         
##          Neg Pred Value : 0.95143         
##              Prevalence : 0.13851         
##          Detection Rate : 0.09528         
##    Detection Prevalence : 0.11002         
##       Balanced Accuracy : 0.83542         
##                                           
##        'Positive' Class : yes             
## 

Estimating model performance with k-fold cross-validation

# Assign every row of churnTrain to one of 10 folds by cutting the row
# positions into 10 equal-width intervals. Folds are therefore contiguous
# runs of rows (no shuffling). labels = FALSE makes cut() return the integer
# fold id instead of a factor.
# seq_len() and FALSE replace 1:nrow(...) and the reassignable shorthand F.
ind <- cut(seq_len(nrow(churnTrain)), breaks = 10, labels = FALSE)
ind
##    [1]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
##   [24]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
##   [47]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
##   [70]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
##   [93]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
##  [116]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
##  [139]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
##  [162]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
##  [185]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
##  [208]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
##  [231]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
##  [254]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
##  [277]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
##  [300]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
##  [323]  1  1  1  1  1  1  1  1  1  1  1  1  2  2  2  2  2  2  2  2  2  2  2
##  [346]  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2
##  [369]  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2
##  [392]  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2
##  [415]  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2
##  [438]  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2
##  [461]  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2
##  [484]  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2
##  [507]  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2
##  [530]  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2
##  [553]  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2
##  [576]  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2
##  [599]  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2
##  [622]  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2
##  [645]  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2
##  [668]  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3
##  [691]  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3
##  [714]  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3
##  [737]  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3
##  [760]  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3
##  [783]  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3
##  [806]  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3
##  [829]  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3
##  [852]  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3
##  [875]  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3
##  [898]  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3
##  [921]  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3
##  [944]  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3
##  [967]  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3
##  [990]  3  3  3  3  3  3  3  3  3  3  3  4  4  4  4  4  4  4  4  4  4  4  4
## [1013]  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4
## [1036]  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4
## [1059]  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4
## [1082]  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4
## [1105]  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4
## [1128]  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4
## [1151]  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4
## [1174]  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4
## [1197]  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4
## [1220]  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4
## [1243]  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4
## [1266]  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4
## [1289]  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4
## [1312]  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  5
## [1335]  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5
## [1358]  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5
## [1381]  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5
## [1404]  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5
## [1427]  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5
## [1450]  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5
## [1473]  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5
## [1496]  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5
## [1519]  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5
## [1542]  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5
## [1565]  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5
## [1588]  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5
## [1611]  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5
## [1634]  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5
## [1657]  5  5  5  5  5  5  5  5  5  5  5  6  6  6  6  6  6  6  6  6  6  6  6
## [1680]  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6
## [1703]  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6
## [1726]  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6
## [1749]  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6
## [1772]  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6
## [1795]  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6
## [1818]  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6
## [1841]  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6
## [1864]  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6
## [1887]  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6
## [1910]  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6
## [1933]  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6
## [1956]  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6
## [1979]  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  7
## [2002]  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7
## [2025]  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7
## [2048]  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7
## [2071]  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7
## [2094]  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7
## [2117]  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7
## [2140]  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7
## [2163]  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7
## [2186]  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7
## [2209]  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7
## [2232]  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7
## [2255]  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7
## [2278]  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7
## [2301]  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7
## [2324]  7  7  7  7  7  7  7  7  7  7  8  8  8  8  8  8  8  8  8  8  8  8  8
## [2347]  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8
## [2370]  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8
## [2393]  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8
## [2416]  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8
## [2439]  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8
## [2462]  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8
## [2485]  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8
## [2508]  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8
## [2531]  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8
## [2554]  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8
## [2577]  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8
## [2600]  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8
## [2623]  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8
## [2646]  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  9  9
## [2669]  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9
## [2692]  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9
## [2715]  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9
## [2738]  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9
## [2761]  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9
## [2784]  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9
## [2807]  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9
## [2830]  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9
## [2853]  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9
## [2876]  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9
## [2899]  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9
## [2922]  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9
## [2945]  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9
## [2968]  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9
## [2991]  9  9  9  9  9  9  9  9  9 10 10 10 10 10 10 10 10 10 10 10 10 10 10
## [3014] 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
## [3037] 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
## [3060] 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
## [3083] 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
## [3106] 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
## [3129] 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
## [3152] 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
## [3175] 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
## [3198] 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
## [3221] 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
## [3244] 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
## [3267] 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
## [3290] 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
## [3313] 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
# k-fold cross-validation of an rpart classification tree on churnTrain,
# using the fold ids in `ind` as the partition.
#
# The result vector is preallocated and filled by fold index, so
# accuracies[i] is the hold-out accuracy of fold i. (Prepending each new
# value to a growing vector would leave the results in reverse fold order.)
n_folds <- max(ind)
accuracies <- numeric(n_folds)
for (i in seq_len(n_folds)) {
  # Logical mask of the rows held out in this round; computed once per fold.
  in_test <- ind == i

  # Train on every row outside fold i.
  fit <- rpart(formula = churn ~ ., data = churnTrain[!in_test, ])

  # Predict class labels for fold i with the response column removed.
  predictions <- predict(
    fit,
    churnTrain[in_test, !names(churnTrain) %in% c("churn")],
    type = "class"
  )

  # Share of held-out rows whose predicted label matches the observed one.
  correct_count <- sum(predictions == churnTrain[in_test, c("churn")])
  accuracies[i] <- correct_count / sum(in_test)
}
accuracies
##  [1] 0.9341317 0.9399399 0.9429429 0.9369369 0.9371257 0.9369369 0.9519520
##  [8] 0.9519520 0.9099099 0.9640719
mean(accuracies)
## [1] 0.94059

caret cross-validation

#install.packages("caret")
library(caret)
# Resampling scheme: 10-fold cross-validation repeated 3 times.
control <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 3
)

# Train an rpart model for churn on trainset under that scheme, then print
# the resampling summary.
model <- train(
  churn ~ .,
  data = trainset,
  method = "rpart",
  trControl = control
)
model
## CART 
## 
## 2315 samples
##   16 predictor
##    2 classes: 'yes', 'no' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 2083, 2084, 2083, 2084, 2084, 2083, ... 
## Resampling results across tuning parameters:
## 
##   cp          Accuracy   Kappa    
##   0.05555556  0.9036795  0.5396942
##   0.07456140  0.8663810  0.2526744
##   0.07602339  0.8574617  0.1761057
## 
## Accuracy was used to select the optimal model using  the largest value.
## The final value used for the model was cp = 0.05555556.
# Predict on testset with the caret-trained model.
predictions <- predict(model, newdata = testset)

# Cross-tabulate predictions (rows) against observed churn labels (columns).
table(predictions, testset$churn)
##            
## predictions yes  no
##         yes  64  10
##         no   77 867

Find variable importance

library('caret')
# Variable-importance scores for the caret-trained model; scale = FALSE is
# passed through to varImp() so the scores are left unscaled.
importance <- varImp(model, scale = FALSE)
importance
## rpart variable importance
## 
##                               Overall
## number_customer_service_calls 116.015
## total_day_minutes             106.988
## total_day_charge              100.648
## international_planyes          86.789
## voice_mail_planyes             25.974
## total_eve_minutes              23.097
## total_eve_charge               23.097
## number_vmail_messages          19.885
## total_intl_minutes              6.347
## total_day_calls                 0.000
## total_night_minutes             0.000
## total_night_calls               0.000
## total_intl_calls                0.000
## total_night_charge              0.000
## total_intl_charge               0.000
## total_eve_calls                 0.000
# Plot the variable-importance scores stored in `importance`.
plot(importance)

ROC

#install.packages("ROCR")
library(ROCR)
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
# Class probabilities for testset from the rpart model churn.rp; the result
# has one column per class.
predictions <- predict(churn.rp, newdata = testset, type = "prob")
head(predictions)
##           yes         no
## 2  0.02877238 0.97122762
## 5  0.05084746 0.94915254
## 6  0.05084746 0.94915254
## 8  0.05084746 0.94915254
## 13 0.02877238 0.97122762
## 16 0.95522388 0.04477612
# Keep only the "yes" probability column (the first column of `predictions`)
# as the score used for the ROC analysis.
pred.to.roc <- predictions[, "yes"]
head(pred.to.roc)
##          2          5          6          8         13         16 
## 0.02877238 0.05084746 0.05084746 0.05084746 0.02877238 0.95522388
# Pair the scores with the observed churn labels in a ROCR prediction object
# and print it.
# NOTE(review): ROCR decides which label counts as the positive class from
# the label ordering; confirm that "yes" is treated as positive here, since
# pred.to.roc holds the "yes" probabilities.
pred.rocr <- prediction(predictions = pred.to.roc, labels = testset$churn)
pred.rocr
## An object of class "prediction"
## Slot "predictions":
## [[1]]
##          2          5          6          8         13         16 
## 0.02877238 0.05084746 0.05084746 0.05084746 0.02877238 0.95522388 
##         17         23         29         33         34         37 
## 0.02877238 0.02877238 0.02877238 0.02877238 0.11162791 0.02877238 
##         41         45         46         47         50         54 
## 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 
##         56         57         58         60         61         62 
## 0.02877238 0.02877238 0.02877238 0.11162791 0.02877238 0.11764706 
##         65         76         78         86         87         89 
## 0.02877238 0.02877238 0.85915493 0.02877238 0.85915493 0.02877238 
##         91        103        104        107        112        115 
## 0.02877238 0.02877238 0.02877238 0.11764706 0.02877238 0.02877238 
##        116        117        119        123        124        126 
## 1.00000000 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 
##        132        134        138        146        153        155 
## 0.02877238 0.15584416 0.02877238 0.11162791 0.02877238 0.65000000 
##        157        161        162        164        168        171 
## 0.95522388 0.02877238 0.02877238 0.02877238 0.02877238 0.11162791 
##        174        176        181        183        184        186 
## 0.02877238 0.11162791 0.15584416 0.02877238 0.02877238 0.02877238 
##        188        193        201        204        206        211 
## 0.95522388 0.02877238 0.02877238 0.02877238 0.02877238 0.11162791 
##        213        215        222        226        227        229 
## 0.02877238 1.00000000 0.02877238 0.02877238 0.02877238 0.11764706 
##        230        233        235        241        244        245 
## 0.02877238 0.02877238 0.05084746 0.02877238 0.11162791 0.02877238 
##        246        251        255        261        265        269 
## 0.02877238 0.85915493 0.05084746 0.02877238 0.11162791 0.02877238 
##        271        272        273        275        277        282 
## 0.02877238 0.02877238 0.02877238 0.02877238 0.05084746 0.02877238 
##        286        289        293        295        301        304 
## 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 
##        305        309        313        319        332        333 
## 0.02877238 0.02877238 0.02877238 0.11162791 0.95522388 0.85915493 
##        339        340        341        342        343        348 
## 0.72222222 0.11162791 0.02877238 0.02877238 0.02877238 0.02877238 
##        354        356        357        369        371        378 
## 0.02877238 0.02877238 0.05084746 0.02877238 0.02877238 0.02877238 
##        389        390        391        392        398        400 
## 0.02877238 0.02877238 0.02877238 0.02877238 0.72222222 0.02877238 
##        405        406        410        412        417        420 
## 0.15584416 0.05084746 0.02877238 0.02877238 1.00000000 0.02877238 
##        421        425        432        433        436        438 
## 0.02877238 0.02877238 0.02877238 0.11162791 0.11162791 0.16666667 
##        439        443        447        452        455        456 
## 0.02877238 0.02877238 0.02877238 0.11162791 0.72222222 0.11162791 
##        458        459        461        466        468        476 
## 0.02877238 0.02877238 0.11162791 1.00000000 0.02877238 0.02877238 
##        480        482        483        484        485        487 
## 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 0.05084746 
##        489        490        494        498        506        509 
## 0.11162791 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 
##        517        520        521        524        530        532 
## 0.02877238 0.02877238 0.11162791 0.02877238 0.05084746 0.02877238 
##        533        536        537        540        542        544 
## 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 0.11162791 
##        548        550        551        555        560        561 
## 0.85915493 0.15584416 0.15584416 0.02877238 0.02877238 0.02877238 
##        563        565        569        571        573        574 
## 0.02877238 0.11162791 0.68421053 0.02877238 0.05084746 0.11162791 
##        576        577        583        586        589        590 
## 0.11764706 0.02877238 0.02877238 0.02877238 0.85915493 0.02877238 
##        596        605        609        611        615        617 
## 0.02877238 0.02877238 0.02877238 0.11162791 0.11162791 0.05084746 
##        620        628        633        642        643        645 
## 0.95522388 0.15584416 0.02877238 0.02877238 0.02877238 0.02877238 
##        648        651        653        655        657        661 
## 0.02877238 0.11162791 0.02877238 0.02877238 0.02877238 0.72222222 
##        664        667        669        673        676        677 
## 0.02877238 0.11162791 0.11162791 0.02877238 0.11162791 0.11162791 
##        678        689        695        697        701        702 
## 0.02877238 0.11764706 0.85915493 0.02877238 0.02877238 0.02877238 
##        703        707        709        711        715        717 
## 0.02877238 0.02877238 0.05084746 0.02877238 0.02877238 0.05084746 
##        721        722        723        726        728        734 
## 0.02877238 0.85915493 0.02877238 0.02877238 0.11162791 0.11162791 
##        735        743        747        754        755        759 
## 0.05084746 0.15584416 0.02877238 0.02877238 0.02877238 0.11764706 
##        760        764        769        777        778        779 
## 0.02877238 0.02877238 0.11162791 0.11162791 0.05084746 0.15584416 
##        783        785        787        792        795        798 
## 0.11162791 0.11764706 0.02877238 0.02877238 0.02877238 1.00000000 
##        799        800        806        809        814        817 
## 0.02877238 0.02877238 0.02877238 0.05084746 0.02877238 0.11162791 
##        820        830        833        837        838        841 
## 0.02877238 0.02877238 0.95522388 0.02877238 0.05084746 0.02877238 
##        845        846        847        849        851        853 
## 0.02877238 0.11764706 0.02877238 0.11162791 0.16666667 0.02877238 
##        855        858        864        866        867        868 
## 0.02877238 0.02877238 0.02877238 0.15584416 0.02877238 0.02877238 
##        874        890        892        895        906        908 
## 0.02877238 0.02877238 0.02877238 0.85915493 0.65000000 0.02877238 
##        909        915        917        922        929        932 
## 0.15584416 0.95522388 0.02877238 0.02877238 0.02877238 0.02877238 
##        934        937        938        941        943        945 
## 1.00000000 0.02877238 0.11162791 0.02877238 0.11162791 0.15584416 
##        946        954        956        962        966        968 
## 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 0.11162791 
##        969        974        978        981        985        988 
## 0.02877238 0.02877238 0.85915493 0.02877238 0.02877238 0.11764706 
##        992        998       1003       1008       1011       1015 
## 0.02877238 0.02877238 0.02877238 0.02877238 0.11162791 0.02877238 
##       1019       1022       1026       1027       1030       1033 
## 0.11162791 0.02877238 0.72222222 0.02877238 0.02877238 0.11162791 
##       1034       1035       1036       1039       1043       1044 
## 0.02877238 0.02877238 0.02877238 0.85915493 0.05084746 0.02877238 
##       1048       1050       1051       1053       1054       1057 
## 0.02877238 0.02877238 0.02877238 0.05084746 0.02877238 0.02877238 
##       1065       1067       1068       1072       1073       1076 
## 0.05084746 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 
##       1077       1079       1086       1087       1091       1094 
## 0.02877238 0.95522388 0.11162791 0.02877238 0.02877238 0.02877238 
##       1099       1103       1105       1107       1108       1111 
## 0.02877238 0.65000000 0.15584416 0.02877238 0.02877238 0.02877238 
##       1112       1115       1117       1120       1121       1125 
## 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 
##       1131       1132       1133       1135       1136       1137 
## 0.02877238 0.11162791 0.02877238 0.02877238 0.11162791 0.11764706 
##       1141       1142       1143       1144       1146       1148 
## 0.02877238 0.02877238 0.85915493 0.02877238 0.02877238 0.11162791 
##       1149       1150       1153       1157       1159       1160 
## 0.02877238 0.05084746 0.02877238 0.02877238 0.05084746 0.02877238 
##       1163       1164       1167       1172       1174       1179 
## 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 
##       1186       1188       1190       1193       1209       1211 
## 0.02877238 0.02877238 0.02877238 0.15584416 0.02877238 0.16666667 
##       1212       1214       1215       1216       1217       1222 
## 0.02877238 0.11162791 0.02877238 0.02877238 0.02877238 0.02877238 
##       1224       1226       1227       1228       1232       1233 
## 0.02877238 0.02877238 0.02877238 0.02877238 0.95522388 0.11162791 
##       1236       1242       1243       1246       1249       1252 
## 0.15584416 0.15584416 0.05084746 0.02877238 0.05084746 0.02877238 
##       1256       1257       1260       1262       1263       1266 
## 0.02877238 0.02877238 0.02877238 0.02877238 0.65000000 0.11162791 
##       1267       1270       1273       1276       1279       1291 
## 0.02877238 0.85915493 0.15584416 0.02877238 0.72222222 0.11162791 
##       1294       1296       1300       1301       1303       1306 
## 0.02877238 0.02877238 0.95522388 0.02877238 0.95522388 0.02877238 
##       1309       1310       1316       1322       1323       1325 
## 0.02877238 0.05084746 0.11162791 0.02877238 1.00000000 0.02877238 
##       1326       1329       1332       1333       1339       1347 
## 0.85915493 0.02877238 0.02877238 0.11162791 0.02877238 1.00000000 
##       1368       1384       1385       1386       1390       1396 
## 0.05084746 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 
##       1397       1399       1401       1406       1407       1411 
## 0.15584416 0.02877238 0.02877238 0.85915493 0.02877238 0.02877238 
##       1412       1414       1422       1423       1424       1425 
## 0.02877238 0.02877238 0.11162791 0.02877238 0.05084746 0.02877238 
##       1426       1427       1431       1433       1437       1439 
## 0.02877238 0.02877238 0.02877238 0.02877238 0.11162791 0.02877238 
##       1444       1445       1446       1450       1451       1454 
## 0.05084746 0.02877238 0.02877238 0.15584416 0.02877238 0.11162791 
##       1455       1460       1461       1462       1463       1465 
## 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 
##       1469       1471       1476       1477       1483       1484 
## 0.02877238 0.02877238 0.02877238 0.02877238 0.11162791 0.05084746 
##       1485       1487       1495       1497       1500       1508 
## 1.00000000 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 
##       1510       1528       1529       1531       1532       1538 
## 0.11162791 0.02877238 0.02877238 0.11162791 0.85915493 0.85915493 
##       1540       1541       1546       1550       1554       1560 
## 0.05084746 0.02877238 0.02877238 0.02877238 0.02877238 0.11162791 
##       1564       1569       1573       1574       1575       1580 
## 0.02877238 0.02877238 0.02877238 0.02877238 0.05084746 0.02877238 
##       1581       1596       1597       1599       1600       1604 
## 0.72222222 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 
##       1606       1613       1615       1617       1618       1628 
## 0.02877238 0.02877238 0.72222222 0.02877238 0.05084746 0.02877238 
##       1636       1638       1642       1643       1645       1650 
## 0.85915493 0.11162791 0.05084746 0.02877238 0.02877238 0.02877238 
##       1658       1659       1660       1663       1669       1673 
## 0.02877238 0.02877238 0.02877238 0.85915493 0.02877238 0.02877238 
##       1678       1681       1682       1683       1684       1685 
## 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 
##       1686       1694       1702       1704       1705       1708 
## 0.02877238 0.02877238 0.95522388 0.02877238 0.95522388 0.85915493 
##       1709       1713       1714       1718       1719       1721 
## 0.15584416 0.15584416 0.85915493 0.02877238 0.16666667 0.02877238 
##       1725       1727       1728       1730       1731       1736 
## 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 
##       1738       1743       1754       1759       1761       1763 
## 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 
##       1765       1766       1771       1775       1777       1788 
## 0.11162791 0.85915493 0.02877238 0.11162791 0.02877238 0.02877238 
##       1795       1800       1803       1813       1821       1824 
## 0.95522388 0.02877238 0.15584416 0.95522388 0.02877238 0.11162791 
##       1825       1827       1828       1829       1831       1835 
## 0.02877238 0.02877238 0.02877238 0.02877238 0.68421053 0.02877238 
##       1836       1837       1838       1841       1843       1848 
## 0.15584416 0.02877238 1.00000000 0.11162791 0.95522388 0.02877238 
##       1850       1852       1856       1858       1859       1866 
## 0.68421053 0.15584416 0.02877238 0.02877238 0.72222222 0.85915493 
##       1869       1873       1875       1876       1882       1889 
## 0.02877238 0.05084746 0.02877238 0.02877238 0.95522388 0.02877238 
##       1892       1894       1902       1907       1911       1912 
## 0.02877238 0.11162791 0.02877238 0.02877238 0.02877238 0.15584416 
##       1915       1916       1926       1930       1932       1937 
## 0.02877238 0.02877238 0.02877238 0.02877238 0.05084746 0.02877238 
##       1938       1939       1940       1941       1946       1961 
## 0.02877238 0.02877238 0.02877238 0.02877238 0.11162791 0.02877238 
##       1962       1966       1968       1972       1982       1983 
## 0.02877238 0.11162791 0.02877238 0.02877238 0.02877238 0.02877238 
##       1985       1987       1988       1990       1991       1992 
## 0.95522388 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 
##       1996       2000       2003       2008       2011       2014 
## 0.11162791 0.02877238 0.02877238 0.11162791 0.02877238 0.02877238 
##       2019       2023       2025       2027       2029       2032 
## 0.02877238 0.02877238 0.02877238 0.02877238 0.68421053 0.85915493 
##       2034       2037       2038       2042       2043       2046 
## 0.02877238 0.11162791 0.02877238 0.02877238 0.72222222 0.15584416 
##       2051       2055       2057       2059       2060       2063 
## 0.02877238 0.02877238 0.02877238 0.02877238 0.95522388 0.02877238 
##       2065       2069       2070       2075       2077       2078 
## 0.02877238 0.02877238 0.11162791 0.02877238 0.15584416 0.11162791 
##       2079       2080       2096       2097       2102       2110 
## 0.02877238 0.11162791 0.11162791 0.11162791 0.02877238 0.02877238 
##       2112       2116       2122       2125       2128       2131 
## 0.02877238 1.00000000 0.02877238 0.02877238 0.02877238 0.11162791 
##       2133       2139       2141       2145       2147       2153 
## 0.02877238 0.02877238 0.65000000 0.02877238 0.05084746 0.05084746 
##       2154       2157       2161       2163       2165       2167 
## 0.02877238 0.11162791 1.00000000 0.02877238 1.00000000 0.02877238 
##       2168       2178       2180       2182       2189       2194 
## 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 0.11162791 
##       2196       2199       2200       2208       2211       2212 
## 0.02877238 0.11162791 0.11162791 0.02877238 0.95522388 0.15584416 
##       2213       2219       2224       2233       2238       2243 
## 0.11162791 0.85915493 0.65000000 0.02877238 0.15584416 0.02877238 
##       2245       2246       2250       2257       2258       2259 
## 0.11162791 0.02877238 0.02877238 0.02877238 0.02877238 0.95522388 
##       2261       2262       2269       2270       2274       2282 
## 0.11764706 0.15584416 0.02877238 0.02877238 0.02877238 0.02877238 
##       2283       2285       2288       2289       2290       2292 
## 0.02877238 0.02877238 0.16666667 0.02877238 0.05084746 0.02877238 
##       2293       2295       2297       2305       2313       2316 
## 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 
##       2319       2320       2321       2323       2325       2328 
## 0.02877238 0.11764706 0.02877238 0.15584416 0.85915493 0.65000000 
##       2331       2332       2340       2343       2344       2351 
## 0.02877238 0.02877238 0.02877238 0.02877238 1.00000000 0.02877238 
##       2354       2355       2356       2360       2362       2369 
## 0.05084746 0.95522388 0.11162791 0.02877238 0.02877238 0.11162791 
##       2370       2372       2374       2377       2379       2381 
## 0.65000000 0.02877238 0.11162791 0.95522388 0.02877238 0.65000000 
##       2382       2386       2393       2396       2397       2399 
## 0.02877238 0.02877238 0.02877238 0.15584416 0.05084746 0.02877238 
##       2407       2413       2416       2427       2428       2436 
## 0.02877238 0.68421053 0.85915493 0.02877238 0.02877238 0.11764706 
##       2440       2441       2444       2445       2448       2449 
## 0.02877238 0.02877238 0.02877238 0.15584416 0.02877238 0.11162791 
##       2452       2454       2456       2468       2470       2472 
## 0.95522388 0.02877238 0.02877238 0.05084746 0.02877238 0.02877238 
##       2474       2475       2476       2477       2480       2482 
## 0.02877238 0.02877238 0.02877238 0.02877238 1.00000000 0.02877238 
##       2485       2487       2488       2492       2495       2496 
## 0.02877238 0.02877238 0.02877238 0.02877238 0.11162791 0.02877238 
##       2497       2507       2508       2509       2512       2523 
## 0.02877238 0.11162791 0.02877238 0.02877238 0.02877238 0.02877238 
##       2524       2527       2529       2533       2539       2544 
## 0.02877238 1.00000000 0.02877238 0.02877238 0.02877238 0.02877238 
##       2549       2551       2552       2555       2556       2561 
## 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 
##       2565       2569       2571       2574       2579       2580 
## 0.02877238 0.05084746 0.02877238 0.68421053 0.02877238 0.02877238 
##       2582       2583       2584       2585       2590       2592 
## 0.85915493 0.02877238 0.02877238 0.02877238 0.02877238 0.72222222 
##       2593       2598       2600       2602       2604       2605 
## 0.15584416 0.02877238 0.72222222 0.02877238 0.11764706 0.02877238 
##       2606       2611       2614       2616       2617       2619 
## 0.72222222 0.02877238 0.11162791 0.02877238 0.02877238 0.05084746 
##       2620       2623       2632       2634       2635       2636 
## 1.00000000 0.85915493 0.02877238 0.02877238 0.02877238 0.02877238 
##       2637       2640       2643       2645       2647       2651 
## 0.02877238 0.02877238 0.05084746 0.02877238 0.15584416 0.02877238 
##       2655       2656       2658       2660       2661       2662 
## 0.02877238 0.02877238 0.11162791 0.02877238 0.95522388 0.11162791 
##       2665       2666       2668       2671       2678       2688 
## 1.00000000 0.05084746 0.02877238 0.02877238 0.68421053 1.00000000 
##       2689       2697       2700       2705       2710       2712 
## 0.02877238 0.95522388 0.02877238 0.02877238 0.02877238 0.02877238 
##       2714       2718       2720       2725       2728       2730 
## 0.02877238 0.02877238 0.02877238 0.16666667 0.05084746 0.02877238 
##       2740       2743       2752       2753       2754       2756 
## 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 
##       2761       2768       2770       2772       2780       2782 
## 1.00000000 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 
##       2783       2785       2786       2787       2791       2793 
## 0.02877238 0.11162791 0.15584416 0.85915493 0.02877238 0.02877238 
##       2795       2798       2801       2803       2810       2812 
## 0.05084746 0.02877238 1.00000000 0.02877238 0.02877238 0.02877238 
##       2813       2822       2823       2827       2835       2837 
## 0.02877238 0.11162791 0.05084746 0.02877238 0.02877238 0.02877238 
##       2838       2839       2840       2850       2851       2855 
## 0.02877238 0.02877238 0.95522388 0.11162791 0.02877238 0.02877238 
##       2862       2865       2866       2868       2869       2872 
## 0.15584416 0.11162791 0.02877238 0.11162791 1.00000000 0.02877238 
##       2874       2877       2878       2881       2882       2891 
## 0.02877238 0.11162791 0.02877238 0.11162791 0.02877238 0.11162791 
##       2892       2900       2905       2906       2908       2910 
## 0.02877238 0.02877238 0.05084746 0.02877238 0.02877238 0.02877238 
##       2920       2923       2938       2940       2944       2948 
## 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 0.72222222 
##       2951       2954       2955       2960       2962       2963 
## 0.02877238 0.15584416 0.02877238 0.11162791 0.15584416 0.11162791 
##       2966       2973       2975       2981       2982       2984 
## 0.02877238 0.02877238 0.02877238 0.72222222 0.02877238 0.02877238 
##       2985       2992       2996       2997       2998       3002 
## 0.02877238 0.05084746 0.02877238 0.02877238 0.11162791 0.02877238 
##       3005       3008       3013       3016       3018       3026 
## 0.02877238 0.02877238 0.02877238 0.02877238 0.15584416 0.02877238 
##       3027       3032       3034       3037       3038       3043 
## 0.15584416 0.05084746 0.02877238 0.02877238 0.02877238 0.02877238 
##       3048       3049       3050       3051       3057       3061 
## 0.02877238 0.02877238 0.02877238 0.85915493 0.11162791 0.02877238 
##       3063       3064       3069       3071       3072       3073 
## 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 0.68421053 
##       3084       3091       3092       3097       3098       3099 
## 0.11162791 0.02877238 0.02877238 0.11764706 0.02877238 0.02877238 
##       3107       3110       3112       3115       3116       3123 
## 0.02877238 0.05084746 0.02877238 0.02877238 0.15584416 0.02877238 
##       3125       3127       3129       3131       3132       3135 
## 0.11162791 0.11162791 0.02877238 0.02877238 0.15584416 0.02877238 
##       3140       3143       3144       3150       3152       3154 
## 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 
##       3155       3159       3165       3168       3169       3176 
## 0.02877238 0.02877238 0.02877238 0.11162791 0.72222222 0.11764706 
##       3180       3184       3190       3193       3196       3198 
## 0.02877238 0.02877238 1.00000000 0.02877238 0.02877238 0.11162791 
##       3200       3207       3217       3218       3223       3226 
## 0.02877238 0.02877238 0.02877238 0.11162791 0.05084746 0.11162791 
##       3227       3229       3232       3235       3245       3254 
## 0.11162791 0.15584416 0.02877238 0.02877238 0.11162791 0.02877238 
##       3258       3259       3260       3261       3267       3270 
## 0.02877238 0.11162791 0.02877238 0.11162791 0.02877238 0.05084746 
##       3277       3279       3285       3290       3292       3296 
## 0.11162791 0.11162791 0.02877238 0.02877238 1.00000000 0.11162791 
##       3301       3302       3307       3311       3315       3316 
## 0.02877238 0.95522388 0.02877238 0.02877238 0.02877238 0.02877238 
##       3323       3325       3329       3333 
## 0.95522388 0.02877238 0.02877238 0.72222222 
## 
## 
## Slot "labels":
## [[1]]
##    [1] no  no  no  no  no  yes no  no  no  no  yes no  no  no  no  no  no 
##   [18] no  no  no  yes no  no  no  no  no  yes no  yes no  no  no  no  no 
##   [35] no  no  yes no  no  no  no  no  no  no  no  no  no  no  yes no  no 
##   [52] no  no  no  no  no  no  no  no  no  no  no  no  no  no  no  no  yes
##   [69] no  no  no  no  no  no  no  no  no  yes no  yes no  no  no  no  no 
##   [86] no  no  no  no  no  no  no  no  no  no  no  no  no  no  no  yes yes
##  [103] no  no  yes no  no  no  no  no  no  no  no  no  no  no  no  no  yes
##  [120] yes no  no  no  no  yes no  no  no  no  no  no  yes no  no  no  no 
##  [137] yes yes no  no  no  yes no  no  no  no  no  no  no  no  no  no  no 
##  [154] no  no  no  no  no  no  no  no  no  no  no  no  no  no  no  yes no 
##  [171] yes no  no  no  no  no  no  no  no  no  no  no  no  no  yes no  no 
##  [188] no  no  no  no  no  yes no  no  no  no  no  no  no  no  no  no  yes
##  [205] no  no  no  no  no  no  no  no  no  no  no  no  no  no  no  no  no 
##  [222] no  no  yes no  no  no  no  no  no  no  no  no  no  no  no  no  no 
##  [239] no  no  no  no  no  no  no  yes no  no  no  no  no  no  no  no  yes
##  [256] no  no  no  no  no  yes no  no  no  no  no  no  no  no  no  no  no 
##  [273] no  yes yes no  no  yes no  no  no  no  yes no  no  no  no  yes no 
##  [290] no  no  no  no  no  yes no  yes no  no  yes no  no  no  no  no  no 
##  [307] yes no  no  no  no  no  no  no  no  yes no  no  no  no  no  no  no 
##  [324] no  no  no  no  no  no  no  no  yes no  no  no  no  no  yes no  no 
##  [341] no  no  no  no  no  no  no  no  no  no  no  no  no  yes no  no  yes
##  [358] no  no  no  no  no  no  no  no  no  no  no  no  no  no  no  no  no 
##  [375] no  yes no  no  no  no  no  no  no  no  no  no  no  no  yes yes no 
##  [392] no  no  no  no  no  no  no  no  no  no  no  no  yes no  no  yes no 
##  [409] no  no  yes no  yes no  no  no  no  no  yes no  yes no  no  no  no 
##  [426] yes no  no  no  no  no  no  no  no  no  yes no  no  no  no  no  no 
##  [443] no  no  no  no  no  no  no  no  no  no  no  no  no  no  yes no  no 
##  [460] no  no  no  no  no  no  no  no  no  yes no  no  no  no  no  no  no 
##  [477] yes yes yes yes no  no  no  no  no  no  no  no  no  no  no  no  no 
##  [494] no  no  no  no  no  no  no  yes no  no  no  yes no  no  no  no  no 
##  [511] no  no  no  yes no  no  no  no  no  no  no  no  no  no  yes no  yes
##  [528] yes no  no  yes no  yes no  no  no  no  no  no  no  no  no  no  no 
##  [545] no  no  yes yes no  no  no  no  yes no  no  no  no  no  no  no  no 
##  [562] no  no  no  no  no  yes no  yes no  yes yes no  yes no  yes no  no 
##  [579] no  no  yes no  no  yes no  no  no  no  no  no  no  no  no  no  no 
##  [596] no  yes no  no  no  no  yes no  no  no  no  yes no  no  no  no  no 
##  [613] no  no  no  no  no  no  no  no  no  no  yes no  yes no  no  no  no 
##  [630] no  no  no  no  no  yes no  no  no  yes no  yes no  no  no  no  no 
##  [647] no  no  no  yes no  no  no  no  no  no  no  no  no  no  no  no  yes
##  [664] no  yes no  no  no  no  no  no  no  no  no  no  no  yes no  no  yes
##  [681] no  no  yes no  no  no  no  no  no  yes no  no  no  no  no  no  no 
##  [698] no  no  no  no  no  no  no  no  no  no  no  no  no  no  no  yes yes
##  [715] no  no  no  no  yes no  no  yes no  no  no  no  yes no  no  yes no 
##  [732] yes no  no  no  no  no  no  no  yes yes no  no  no  no  no  no  no 
##  [749] no  no  yes no  no  no  no  no  no  no  no  no  yes no  no  no  no 
##  [766] no  no  no  no  no  no  no  no  no  no  yes no  no  no  no  no  no 
##  [783] no  no  no  no  no  no  no  yes no  no  yes no  no  no  no  yes no 
##  [800] no  yes no  no  no  no  no  no  no  no  no  yes no  no  no  no  no 
##  [817] no  no  no  no  no  no  no  no  no  no  yes no  yes no  no  no  yes
##  [834] yes no  yes no  no  no  no  no  no  no  yes no  no  no  no  no  no 
##  [851] no  no  yes no  no  no  no  no  no  yes yes yes no  no  yes no  yes
##  [868] no  no  no  no  no  no  no  no  no  no  no  yes no  no  no  no  no 
##  [885] yes no  yes no  no  no  no  no  no  no  no  no  no  no  no  no  no 
##  [902] no  no  no  no  yes yes no  no  no  yes no  no  yes no  yes no  no 
##  [919] no  no  no  no  no  no  no  no  no  no  no  no  no  no  no  no  no 
##  [936] no  no  no  no  yes no  no  no  no  no  yes no  yes no  no  no  no 
##  [953] no  no  no  no  no  no  no  no  no  no  no  no  no  no  no  no  no 
##  [970] no  yes no  no  no  no  no  yes no  no  no  yes no  no  no  no  no 
##  [987] no  no  no  no  yes no  no  no  no  no  no  no  no  no  no  no  no 
## [1004] no  no  no  yes no  no  yes no  no  no  no  yes no  no  no 
## Levels: no < yes
## 
## 
## Slot "cutoffs":
## [[1]]
##                  3292       3323       3051       3333       3073 
##        Inf 1.00000000 0.95522388 0.85915493 0.72222222 0.68421053 
##       2381       2725       3229       3176       3296       3270 
## 0.65000000 0.16666667 0.15584416 0.11764706 0.11162791 0.05084746 
##       3329 
## 0.02877238 
## 
## 
## Slot "fp":
## [[1]]
##  [1]   0   0   2   5  12  14  18  21  56  70 174 228 877
## 
## 
## Slot "tp":
## [[1]]
##  [1]   0  24  51  79  89  95 100 103 111 113 124 125 141
## 
## 
## Slot "tn":
## [[1]]
##  [1] 877 877 875 872 865 863 859 856 821 807 703 649   0
## 
## 
## Slot "fn":
## [[1]]
##  [1] 141 117  90  62  52  46  41  38  30  28  17  16   0
## 
## 
## Slot "n.pos":
## [[1]]
## [1] 141
## 
## 
## Slot "n.neg":
## [[1]]
## [1] 877
## 
## 
## Slot "n.pos.pred":
## [[1]]
##  [1]    0   24   53   84  101  109  118  124  167  183  298  353 1018
## 
## 
## Slot "n.neg.pred":
## [[1]]
##  [1] 1018  994  965  934  917  909  900  894  851  835  720  665    0
# Area under the ROC curve for the ROCR prediction object built above;
# the value ends up in the @y.values slot of the returned object.
perf.rocr <- performance(pred.rocr, measure = "auc", x.measure = "cutoff")

# ROC curve itself: true positive rate (y axis) against false positive
# rate (x axis).
perf.tpr.rocr <- performance(pred.rocr, measure = "tpr", x.measure = "fpr")

# Draw the curve and report the AUC in the plot title.
plot(perf.tpr.rocr, main = paste("AUC:", perf.rocr@y.values))

Model comparison: ROC curves for rpart, ctree and C5.0

# Fit three tree-based classifiers on the same training set and compare
# their ROC curves on the test set.

# rpart decision tree
library('rpart')
churn.rp <- rpart(churn ~ ., data = trainset)

# ctree (conditional inference tree from the party package)
#install.packages("party")
library('party')
ctree.model <- ctree(churn ~ ., data = trainset)

# C5.0
library(C50)
c50.model <- C5.0(churn ~ ., data = trainset)

# ROCR orders these labels as no < yes (see the "labels" slot printed
# earlier), i.e. "yes" is the positive class. Every score handed to
# prediction() therefore has to be P(churn == "yes"). Look that
# probability up by level instead of assuming it is the first column, so a
# differently ordered factor cannot silently invert the ROC curves.
positive.class <- "yes"
positive.idx <- match(positive.class, levels(trainset$churn))
stopifnot(!is.na(positive.idx))

# Class-probability matrices: one row per test observation, one column per
# class level.
rp.predict.prob <- predict(churn.rp, testset, type = "prob")
c50.predict.prob <- predict(c50.model, testset, type = "prob")

# predict() on the ctree model returns a list with one probability vector
# per observation (presumably in factor-level order -- confirm against the
# party documentation); pull out the positive-class entry of each.
# vapply() guarantees a plain numeric vector even for empty input.
ctree.predict.prob <- vapply(
  predict(ctree.model, testset, type = "prob"),
  function(e) unlist(e)[[positive.idx]],
  numeric(1)
)

# ROCR prediction objects: scores paired with the true labels.
rp.prediction <- prediction(rp.predict.prob[, positive.class], testset$churn)
c50.prediction <- prediction(c50.predict.prob[, positive.class], testset$churn)
ctree.prediction <- prediction(ctree.predict.prob, testset$churn)

# True positive rate vs. false positive rate for each model.
rp.performance <- performance(rp.prediction, measure = "tpr", x.measure = "fpr")
c50.performance <- performance(c50.prediction, measure = "tpr", x.measure = "fpr")
ctree.performance <- performance(ctree.prediction, measure = "tpr", x.measure = "fpr")

# Overlay the three ROC curves; the legend identifies which colour belongs
# to which model.
plot(rp.performance, col = "red")
plot(c50.performance, add = TRUE, col = "green")
plot(ctree.performance, add = TRUE, col = "blue")
legend("bottomright",
       legend = c("rpart", "C5.0", "ctree"),
       col = c("red", "green", "blue"),
       lty = 1)

補充:隨機森林(Random Forest)

library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
# Fit a random forest with 200 trees on the training set. Variable
# importance and the proximity matrix are requested as in the original call.
forest <- randomForest(churn ~ ., data = trainset, ntree = 200,
                       importance = TRUE, proximity = TRUE)

# Class-probability matrix for the test set (one column per class level).
rf.predict.prob <- predict(forest, testset, type = "prob")

# ROCR orders these labels as no < yes (see the "labels" slot printed
# earlier), i.e. "yes" is the positive class, so the score must be
# P(churn == "yes"). Select that column by name rather than by position so
# the curve stays correct whatever the factor's level order is.
rf.prediction <- prediction(rf.predict.prob[, "yes"], testset$churn)
rf.auc <- performance(rf.prediction, measure = "auc", x.measure = "cutoff")
rf.performance <- performance(rf.prediction, measure = "tpr", x.measure = "fpr")

# ROC curve: rpart (colour 1, computed earlier in the script) versus the
# random forest (colour 2). The legend boxes are filled with the same two
# colours.
plot(rp.performance, main = 'ROC Curve', col = 1)
legend(0.7, 0.2, legend = c('rpart', 'randomforest'), fill = 1:2)
plot(rf.performance, col = 2, add = TRUE)