setwd('~/lecture/riii')
load('Statistics/applenews.RData')
str(applenews)
## 'data.frame': 1500 obs. of 5 variables:
## $ content : chr "\n (更新:新增影片)想要透過刮刮樂彩券一夕致富,但他卻用錯方法!台中市一名"| __truncated__ "\n 澳洲一名就讀雪梨大學的華裔博士生,日前公開一段燒毀中國護照的影片,還"| __truncated__ "\n 【行銷專題企劃】房價高高在上,沒錢買房沒關係,但你認為自己是聰明的租"| __truncated__ "\n 本內容由中央廣播電臺提供 美國國防部長卡特(Ash Carter)今天(15日"| __truncated__ ...
## $ title : chr "【更新】搶2.2萬彩券刮中1.4萬 沒發財還得入獄" "拿到澳洲護照後 他放火燒中國護照" "【特企】房市大追擊- 租屋這些事情要小心" "【央廣RTI】美菲軍演 美防長南海登艦" ...
## $ dt : POSIXct, format: "2016-04-15 14:32:00" "2016-04-15 14:32:00" ...
## $ category: chr "社會" "國際" "地產" "國際" ...
## $ clicked : int 1754 0 0 0 311 24 20 314 27 308 ...
applenews = applenews[,-1]  # drop the full-text content column
#install.packages('dplyr')
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Filtering rows with base R subsetting
head(applenews[applenews$category == "娛樂",])
## title dt category
## 16 澎恰恰收女弟子 拱當台灣第一名伶 2016-04-15 14:17:00 娛樂
## 21 【唱新聞】詐騙嗎?R.O.C.有CHINA但不是CHINA 2016-04-15 14:00:00 娛樂
## 32 白曉燕命案19年了 白冰冰「不能忘」 2016-04-15 13:49:00 娛樂
## 40 好萊塢男神好威 女友再當高齡產婦 2016-04-15 13:40:00 娛樂
## 47 隋棠帶兒遠征南台灣 吃成膨皮母子檔 2016-04-15 13:30:00 娛樂
## 50 伊勢谷友介掰了長澤雅美 半同居小16歲辣模 2016-04-15 13:23:00 娛樂
## clicked
## 16 1749
## 21 11696
## 32 3329
## 40 4307
## 47 4651
## 50 5141
# Filtering with dplyr's filter()
head(filter(applenews, category == "娛樂"))
## title dt category
## 1 澎恰恰收女弟子 拱當台灣第一名伶 2016-04-15 14:17:00 娛樂
## 2 【唱新聞】詐騙嗎?R.O.C.有CHINA但不是CHINA 2016-04-15 14:00:00 娛樂
## 3 白曉燕命案19年了 白冰冰「不能忘」 2016-04-15 13:49:00 娛樂
## 4 好萊塢男神好威 女友再當高齡產婦 2016-04-15 13:40:00 娛樂
## 5 隋棠帶兒遠征南台灣 吃成膨皮母子檔 2016-04-15 13:30:00 娛樂
## 6 伊勢谷友介掰了長澤雅美 半同居小16歲辣模 2016-04-15 13:23:00 娛樂
## clicked
## 1 1749
## 2 11696
## 3 3329
## 4 4307
## 5 4651
## 6 5141
# AND / OR conditions in filter
head(filter(applenews, category == "娛樂" & clicked > 10000))
## title dt
## 1 【唱新聞】詐騙嗎?R.O.C.有CHINA但不是CHINA 2016-04-15 14:00:00
## 2 徐佳瑩開唱前驚傳失聲 急診照曝光 2016-04-15 13:08:00
## 3 【更新】《太陽》完美結局 沉船灣之吻收視最熱 2016-04-15 12:29:00
## 4 宋慧喬愛國拒代言 台大教授批韓軍「應被唾棄」 2016-04-15 11:52:00
## 5 【請回答】六月李易對質實況 出軌X次說清楚! 2016-04-15 11:40:00
## 6 劉愷威被問楊冪超囧 擠不出上次何時見 2016-04-15 10:30:00
## category clicked
## 1 娛樂 11696
## 2 娛樂 11588
## 3 娛樂 27010
## 4 娛樂 10455
## 5 娛樂 88386
## 6 娛樂 10478
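# in filter(), comma-separated conditions are combined with AND, so the call below is equivalent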
head(filter(applenews, category == '娛樂', clicked > 10000))
## title dt
## 1 【唱新聞】詐騙嗎?R.O.C.有CHINA但不是CHINA 2016-04-15 14:00:00
## 2 徐佳瑩開唱前驚傳失聲 急診照曝光 2016-04-15 13:08:00
## 3 【更新】《太陽》完美結局 沉船灣之吻收視最熱 2016-04-15 12:29:00
## 4 宋慧喬愛國拒代言 台大教授批韓軍「應被唾棄」 2016-04-15 11:52:00
## 5 【請回答】六月李易對質實況 出軌X次說清楚! 2016-04-15 11:40:00
## 6 劉愷威被問楊冪超囧 擠不出上次何時見 2016-04-15 10:30:00
## category clicked
## 1 娛樂 11696
## 2 娛樂 11588
## 3 娛樂 27010
## 4 娛樂 10455
## 5 娛樂 88386
## 6 娛樂 10478
head(filter(applenews, category == "娛樂" | clicked > 10000))
## title
## 1 又要下雨了 中南部6縣市大雨特報
## 2 澎恰恰收女弟子 拱當台灣第一名伶
## 3 【驚險有片】BMW撞翻撞公車 後方機車神穿越
## 4 【唱新聞】詐騙嗎?R.O.C.有CHINA但不是CHINA
## 5 蔡英文、林全人事公佈記者會 14:30【蘋果Live】同步直播
## 6 白曉燕命案19年了 白冰冰「不能忘」
## dt category clicked
## 1 2016-04-15 14:19:00 生活 12347
## 2 2016-04-15 14:17:00 娛樂 1749
## 3 2016-04-15 14:12:00 社會 11886
## 4 2016-04-15 14:00:00 娛樂 11696
## 5 2016-04-15 13:50:00 政治 12425
## 6 2016-04-15 13:49:00 娛樂 3329
# Filter on multiple categories with %in%
head(filter(applenews, category %in% c("娛樂", "社會")))
## title dt
## 1 【更新】搶2.2萬彩券刮中1.4萬 沒發財還得入獄 2016-04-15 14:32:00
## 2 同居人女兒熟睡 淫男伸狼爪 2016-04-15 14:22:00
## 3 澎恰恰收女弟子 拱當台灣第一名伶 2016-04-15 14:17:00
## 4 【驚險有片】BMW撞翻撞公車 後方機車神穿越 2016-04-15 14:12:00
## 5 【唱新聞】詐騙嗎?R.O.C.有CHINA但不是CHINA 2016-04-15 14:00:00
## 6 九巡翁霸坐展售車? 原因好心酸 2016-04-15 13:52:00
## category clicked
## 1 社會 1754
## 2 社會 1076
## 3 娛樂 1749
## 4 社會 11886
## 5 娛樂 11696
## 6 社會 4582
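# negating %in% keeps every other category (a quick sketch)
head(filter(applenews, !category %in% c("娛樂", "社會")))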
# Column selection with base R
head(applenews[, c("category","clicked")])
## category clicked
## 1 社會 1754
## 2 國際 0
## 3 地產 0
## 4 國際 0
## 5 時尚 311
## 6 財經 24
# Column selection with dplyr
# select the listed columns
head(select(applenews,category,clicked))
## category clicked
## 1 社會 1754
## 2 國際 0
## 3 地產 0
## 4 國際 0
## 5 時尚 311
## 6 財經 24
# select the range of columns from dt through clicked
head(select(applenews,dt:clicked))
## dt category clicked
## 1 2016-04-15 14:32:00 社會 1754
## 2 2016-04-15 14:32:00 國際 0
## 3 2016-04-15 14:31:00 地產 0
## 4 2016-04-15 14:30:00 國際 0
## 5 2016-04-15 14:28:00 時尚 311
## 6 2016-04-15 14:28:00 財經 24
# select columns whose names contain the string 'click'
head(select(applenews,contains('click')))
## clicked
## 1 1754
## 2 0
## 3 0
## 4 0
## 5 311
## 6 24
## iris - select() helpers
head(select(iris,starts_with("Sepal")))
## Sepal.Length Sepal.Width
## 1 5.1 3.5
## 2 4.9 3.0
## 3 4.7 3.2
## 4 4.6 3.1
## 5 5.0 3.6
## 6 5.4 3.9
head(select(iris,ends_with("Length")))
## Sepal.Length Petal.Length
## 1 5.1 1.4
## 2 4.9 1.4
## 3 4.7 1.3
## 4 4.6 1.5
## 5 5.0 1.4
## 6 5.4 1.7
?matches
## Help on topic 'matches' was found in the following packages:
##
## Package Library
## dplyr /Library/Frameworks/R.framework/Versions/3.5/Resources/library
## tidyselect /Library/Frameworks/R.framework/Versions/3.5/Resources/library
##
##
## Using the first match ...
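# matches() selects by regular expression; a minimal sketch on iris:
head(select(iris, matches("^Sepal|Width$")))  # columns starting with Sepal or ending with Width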
# Combining filter and select in one expression
head(filter(select(applenews,category:clicked),category == '娛樂'))
## category clicked
## 1 娛樂 1749
## 2 娛樂 11696
## 3 娛樂 3329
## 4 娛樂 4307
## 5 娛樂 4651
## 6 娛樂 5141
# Chaining with the pipe operator %>%
select(applenews, category:clicked) %>%
  filter(category == '娛樂') %>%
  head()
## category clicked
## 1 娛樂 1749
## 2 娛樂 11696
## 3 娛樂 3329
## 4 娛樂 4307
## 5 娛樂 4651
## 6 娛樂 5141
applenews %>%
  select(category:clicked) %>%
  filter(category == '娛樂') %>%
  head()
## category clicked
## 1 娛樂 1749
## 2 娛樂 11696
## 3 娛樂 3329
## 4 娛樂 4307
## 5 娛樂 4651
## 6 娛樂 5141
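# The pipe x %>% f(y) is shorthand for f(x, y), so the piped and nested forms
# above are the same computation; this check should return TRUE:
identical(filter(applenews, category == '娛樂'),
          applenews %>% filter(category == '娛樂'))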
# Create new columns with mutate
# total clicks across all articles
freqsum = applenews %>%
  select(clicked) %>%
  sum()
# add a portion column: each article's share of total clicks
applenews = applenews %>%
  select(title, category, clicked) %>%
  mutate(portion = clicked / freqsum)
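# sanity check: the portion column should sum to 1
sum(applenews$portion)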
# arrange (sort rows)
applenews %>% arrange(desc(clicked)) %>% head()
## title category clicked portion
## 1 泰國超正變性人徵兵處報到 網友:這我可以 正妹 344733 0.01597908
## 2 本土劇女神整形前照出土?!黑皮塌鼻引戰火 娛樂 299235 0.01387016
## 3 【狗仔偷拍】好大的派頭 李㼈違停霸氣外露 娛樂 265355 0.01229975
## 4 【更新】正晶揭露新詐騙案 7百萬存款不翼而飛 社會 241842 0.01120987
## 5 有內情?辣模女友控MP廷廷 「對我做可怕的事」 娛樂 239697 0.01111045
## 6 貴婦人妻太閒了 她只好和一些網友嘿咻 社會 228203 0.01057768
#group_by & summarise
applenews %>%
  group_by(category) %>%
  summarise(clicked_sum = sum(clicked, na.rm=TRUE)) %>%
  arrange(desc(clicked_sum))
## # A tibble: 14 x 2
## category clicked_sum
## <chr> <int>
## 1 社會 5721750
## 2 娛樂 3571005
## 3 生活 3417804
## 4 國際 2540411
## 5 政治 1701980
## 6 體育 1598067
## 7 正妹 672949
## 8 搜奇 668307
## 9 財經 618243
## 10 論壇 312592
## 11 時尚 260499
## 12 地產 220812
## 13 3C 146308
## 14 動物 123287
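# n() adds a per-group article count alongside the click totals (a quick sketch):
applenews %>%
  group_by(category) %>%
  summarise(n = n(), clicked_sum = sum(clicked, na.rm=TRUE)) %>%
  arrange(desc(clicked_sum))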
# Summarise several columns at once
applenews %>%
  group_by(category) %>%
  summarise_at(.vars=vars(clicked, portion), .funs=funs(sum, mean, min, max, sd))
## # A tibble: 14 x 11
## category clicked_sum portion_sum clicked_mean portion_mean clicked_min
## <chr> <int> <dbl> <dbl> <dbl> <dbl>
## 1 3C 146308 0.00678 3954. 0.000183 267
## 2 財經 618243 0.0287 5109. 0.000237 24
## 3 地產 220812 0.0102 6900. 0.000320 0
## 4 動物 123287 0.00571 4742. 0.000220 1211
## 5 國際 2540411 0.118 8914. 0.000413 0
## 6 論壇 312592 0.0145 5683. 0.000263 275
## 7 社會 5721750 0.265 29494. 0.00137 918
## 8 生活 3417804 0.158 11469. 0.000532 20
## 9 時尚 260499 0.0121 6855. 0.000318 311
## 10 搜奇 668307 0.0310 12151. 0.000563 199
## 11 體育 1598067 0.0741 16822. 0.000780 523
## 12 娛樂 3571005 0.166 31602. 0.00146 1631
## 13 正妹 672949 0.0312 84119. 0.00390 7999
## 14 政治 1701980 0.0789 11902. 0.000552 221
## # ... with 5 more variables: portion_min <dbl>, clicked_max <dbl>,
## # portion_max <dbl>, clicked_sd <dbl>, portion_sd <dbl>
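# Note: funs() was deprecated in later dplyr releases; with dplyr >= 1.0 the
# same summary can be written with across() (a sketch, assuming the newer version):
# applenews %>%
#   group_by(category) %>%
#   summarise(across(c(clicked, portion), list(sum=sum, mean=mean, min=min, max=max, sd=sd)))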
applenews %>%
  group_by(category) %>%
  summarise_at(.vars=vars(clicked), .funs=funs(sum, mean))
## # A tibble: 14 x 3
## category sum mean
## <chr> <int> <dbl>
## 1 3C 146308 3954.
## 2 財經 618243 5109.
## 3 地產 220812 6900.
## 4 動物 123287 4742.
## 5 國際 2540411 8914.
## 6 論壇 312592 5683.
## 7 社會 5721750 29494.
## 8 生活 3417804 11469.
## 9 時尚 260499 6855.
## 10 搜奇 668307 12151.
## 11 體育 1598067 16822.
## 12 娛樂 3571005 31602.
## 13 正妹 672949 84119.
## 14 政治 1701980 11902.
applenews %>%
  group_by(category) %>%
  summarise_at(.funs=funs(min, max), .vars=vars(matches('clicked')), na.rm=T)
## # A tibble: 14 x 3
## category min max
## <chr> <dbl> <dbl>
## 1 3C 267 20509
## 2 財經 24 54886
## 3 地產 0 80691
## 4 動物 1211 11753
## 5 國際 0 150825
## 6 論壇 275 68208
## 7 社會 918 241842
## 8 生活 20 132880
## 9 時尚 311 67086
## 10 搜奇 199 83036
## 11 體育 523 162907
## 12 娛樂 1631 299235
## 13 正妹 7999 344733
## 14 政治 221 83059
cat_stat = applenews %>%
  group_by(category) %>%
  summarise(clicked_sum = sum(clicked))
cat_stat
## # A tibble: 14 x 2
## category clicked_sum
## <chr> <int>
## 1 3C 146308
## 2 財經 618243
## 3 地產 220812
## 4 動物 123287
## 5 國際 2540411
## 6 論壇 312592
## 7 社會 5721750
## 8 生活 3417804
## 9 時尚 260499
## 10 搜奇 668307
## 11 體育 1598067
## 12 娛樂 3571005
## 13 正妹 672949
## 14 政治 1701980
# Bar chart of total clicks per category (family="Songti SC" supplies a CJK font on macOS)
barplot(cat_stat$clicked_sum, names.arg=cat_stat$category,
        col=rainbow(length(cat_stat$category)), family="Songti SC")
# Pie chart of the same totals
pie(cat_stat$clicked_sum, labels = cat_stat$category, family="Songti SC")
# dplyr database connection example (MySQL); the RMariaDB package can be used instead
# First create the target database on the MySQL side:
#   CREATE DATABASE test CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;
#install.packages('RMariaDB')
#install.packages('dbplyr')
library('dbplyr')
##
## Attaching package: 'dbplyr'
## The following objects are masked from 'package:dplyr':
##
## ident, sql
library('RMariaDB')
## iris example
conn = dbConnect(MariaDB(),dbname='test',host='127.0.0.1',port=3306,user='root',password='pythonetl')
db_drop_table(conn,'iris')
## [1] 0
copy_to(conn,iris,temporary = F)
tbl(conn,"iris") %>%
select(starts_with('Sepal'),'Species') %>%
group_by(Species) %>%
summarise_at(.funs=funs(mean(.,na.rm=T),sd(.,na.rm=T)),.vars=vars(starts_with('Sepal'))) %>%
collect()
## # A tibble: 3 x 5
## Species Sepal.Length_me… Sepal.Width_mean Sepal.Length_sd Sepal.Width_sd
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 setosa 5.01 3.43 0.352 0.379
## 2 versic… 5.94 2.77 0.516 0.314
## 3 virgin… 6.59 2.97 0.636 0.322
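# show_query() prints the SQL that dbplyr generates instead of executing it:
tbl(conn,"iris") %>%
  select(starts_with('Sepal')) %>%
  show_query()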
dbGetQuery(conn,'select * from iris') %>% filter(Species == 'setosa')
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
## 7 4.6 3.4 1.4 0.3 setosa
## 8 5.0 3.4 1.5 0.2 setosa
## 9 4.4 2.9 1.4 0.2 setosa
## 10 4.9 3.1 1.5 0.1 setosa
## 11 5.4 3.7 1.5 0.2 setosa
## 12 4.8 3.4 1.6 0.2 setosa
## 13 4.8 3.0 1.4 0.1 setosa
## 14 4.3 3.0 1.1 0.1 setosa
## 15 5.8 4.0 1.2 0.2 setosa
## 16 5.7 4.4 1.5 0.4 setosa
## 17 5.4 3.9 1.3 0.4 setosa
## 18 5.1 3.5 1.4 0.3 setosa
## 19 5.7 3.8 1.7 0.3 setosa
## 20 5.1 3.8 1.5 0.3 setosa
## 21 5.4 3.4 1.7 0.2 setosa
## 22 5.1 3.7 1.5 0.4 setosa
## 23 4.6 3.6 1.0 0.2 setosa
## 24 5.1 3.3 1.7 0.5 setosa
## 25 4.8 3.4 1.9 0.2 setosa
## 26 5.0 3.0 1.6 0.2 setosa
## 27 5.0 3.4 1.6 0.4 setosa
## 28 5.2 3.5 1.5 0.2 setosa
## 29 5.2 3.4 1.4 0.2 setosa
## 30 4.7 3.2 1.6 0.2 setosa
## 31 4.8 3.1 1.6 0.2 setosa
## 32 5.4 3.4 1.5 0.4 setosa
## 33 5.2 4.1 1.5 0.1 setosa
## 34 5.5 4.2 1.4 0.2 setosa
## 35 4.9 3.1 1.5 0.2 setosa
## 36 5.0 3.2 1.2 0.2 setosa
## 37 5.5 3.5 1.3 0.2 setosa
## 38 4.9 3.6 1.4 0.1 setosa
## 39 4.4 3.0 1.3 0.2 setosa
## 40 5.1 3.4 1.5 0.2 setosa
## 41 5.0 3.5 1.3 0.3 setosa
## 42 4.5 2.3 1.3 0.3 setosa
## 43 4.4 3.2 1.3 0.2 setosa
## 44 5.0 3.5 1.6 0.6 setosa
## 45 5.1 3.8 1.9 0.4 setosa
## 46 4.8 3.0 1.4 0.3 setosa
## 47 5.1 3.8 1.6 0.2 setosa
## 48 4.6 3.2 1.4 0.2 setosa
## 49 5.3 3.7 1.5 0.2 setosa
## 50 5.0 3.3 1.4 0.2 setosa
dbListTables(conn)
## [1] "applenews" "iris" "pcc" "user"
## applenews example
copy_to(conn,applenews,temporary = F,overwrite=T)
tbl(conn,"applenews")
## # Source: table<applenews> [?? x 4]
## # Database: mysql 5.7.22 [root@127.0.0.1:/test]
## title category clicked portion
## <chr> <chr> <int> <dbl>
## 1 【更新】搶2.2萬彩券刮中1.4萬 沒發財還得入獄… 社會 1754 8.13e-5
## 2 拿到澳洲護照後 他放火燒中國護照 國際 0 0.
## 3 【特企】房市大追擊- 租屋這些事情要小心 地產 0 0.
## 4 【央廣RTI】美菲軍演 美防長南海登艦 國際 0 0.
## 5 全球最閃牽手夫妻 絕美禮服出自台灣… 時尚 311 1.44e-5
## 6 公司遭搜索 浩鼎籲檢調勿公開商業機密 財經 24 1.11e-6
## 7 【央廣RTI】每318秒就有1人罹癌 大腸癌名列第一… 生活 20 9.27e-7
## 8 垃圾掉滿地 村民請神明幫忙 生活 314 1.46e-5
## 9 【熊本強震】取消去九州 華航5月8日前退改票免手續費… 生活 27 1.25e-6
## 10 麵龜摻非工業色素 千顆不良品早下肚 生活 308 1.43e-5
## # ... with more rows
dbSendQuery(conn, "SET NAMES utf8")  # the pending result set triggers the warning below
## <MariaDBResult>
## SQL SET NAMES utf8
## ROWS Fetched: 0 [complete]
## Changed: 0
tbl(conn,"applenews")
## Warning in result_create(conn@ptr, statement, is_statement): Cancelling
## previous query
## # Source: table<applenews> [?? x 4]
## # Database: mysql 5.7.22 [root@127.0.0.1:/test]
## title category clicked portion
## <chr> <chr> <int> <dbl>
## 1 【更新】搶2.2萬彩券刮中1.4萬 沒發財還得入獄… 社會 1754 8.13e-5
## 2 拿到澳洲護照後 他放火燒中國護照 國際 0 0.
## 3 【特企】房市大追擊- 租屋這些事情要小心 地產 0 0.
## 4 【央廣RTI】美菲軍演 美防長南海登艦 國際 0 0.
## 5 全球最閃牽手夫妻 絕美禮服出自台灣… 時尚 311 1.44e-5
## 6 公司遭搜索 浩鼎籲檢調勿公開商業機密 財經 24 1.11e-6
## 7 【央廣RTI】每318秒就有1人罹癌 大腸癌名列第一… 生活 20 9.27e-7
## 8 垃圾掉滿地 村民請神明幫忙 生活 314 1.46e-5
## 9 【熊本強震】取消去九州 華航5月8日前退改票免手續費… 生活 27 1.25e-6
## 10 麵龜摻非工業色素 千顆不良品早下肚 生活 308 1.43e-5
## # ... with more rows
category_stat = tbl(conn,"applenews") %>%
  group_by(category) %>%
  summarise_at(.funs=funs(min(.,na.rm=T), max(.,na.rm=T), mean(.,na.rm=T)), .vars=vars(matches('clicked'))) %>%
  arrange(desc(mean)) %>%
  collect()
library('ggplot2')
g <- ggplot(category_stat, aes(x=category, y=mean))
g + geom_bar(stat='identity') +
  theme(text=element_text(size=16, family="Songti SC")) +
  scale_x_discrete(limits=category_stat$category)
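# Close the database connection when finished:
dbDisconnect(conn)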
# References:
# http://www.r-bloggers.com/whats-the-difference-between-machine-learning-statistics-and-data-mining/
# https://hk.saowen.com/a/2d78153a4263c35e9889ebb0cd07e731d79ed1c1ab0e712c68dba24ffd4367f4
# https://www.youtube.com/watch?v=mRro1Ge_OCg
#install.packages("C50")
library(C50)
data(churn)
str(churnTrain)
## 'data.frame': 3333 obs. of 20 variables:
## $ state : Factor w/ 51 levels "AK","AL","AR",..: 17 36 32 36 37 2 20 25 19 50 ...
## $ account_length : int 128 107 137 84 75 118 121 147 117 141 ...
## $ area_code : Factor w/ 3 levels "area_code_408",..: 2 2 2 1 2 3 3 2 1 2 ...
## $ international_plan : Factor w/ 2 levels "no","yes": 1 1 1 2 2 2 1 2 1 2 ...
## $ voice_mail_plan : Factor w/ 2 levels "no","yes": 2 2 1 1 1 1 2 1 1 2 ...
## $ number_vmail_messages : int 25 26 0 0 0 0 24 0 0 37 ...
## $ total_day_minutes : num 265 162 243 299 167 ...
## $ total_day_calls : int 110 123 114 71 113 98 88 79 97 84 ...
## $ total_day_charge : num 45.1 27.5 41.4 50.9 28.3 ...
## $ total_eve_minutes : num 197.4 195.5 121.2 61.9 148.3 ...
## $ total_eve_calls : int 99 103 110 88 122 101 108 94 80 111 ...
## $ total_eve_charge : num 16.78 16.62 10.3 5.26 12.61 ...
## $ total_night_minutes : num 245 254 163 197 187 ...
## $ total_night_calls : int 91 103 104 89 121 118 118 96 90 97 ...
## $ total_night_charge : num 11.01 11.45 7.32 8.86 8.41 ...
## $ total_intl_minutes : num 10 13.7 12.2 6.6 10.1 6.3 7.5 7.1 8.7 11.2 ...
## $ total_intl_calls : int 3 3 5 7 3 6 7 6 4 5 ...
## $ total_intl_charge : num 2.7 3.7 3.29 1.78 2.73 1.7 2.03 1.92 2.35 3.02 ...
## $ number_customer_service_calls: int 1 1 0 2 3 0 3 0 1 0 ...
## $ churn : Factor w/ 2 levels "yes","no": 2 2 2 2 2 2 2 2 2 2 ...
names(churnTrain) %in% c("state", "area_code", "account_length")
## [1] TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [12] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
!names(churnTrain) %in% c("state", "area_code", "account_length")
## [1] FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [12] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
# Select the modeling variables (drop state, area_code, account_length)
variable.list = !names(churnTrain) %in% c('state','area_code','account_length')
churnTrain=churnTrain[,variable.list]
churnTest=churnTest[,variable.list]
str(churnTrain)
## 'data.frame': 3333 obs. of 17 variables:
## $ international_plan : Factor w/ 2 levels "no","yes": 1 1 1 2 2 2 1 2 1 2 ...
## $ voice_mail_plan : Factor w/ 2 levels "no","yes": 2 2 1 1 1 1 2 1 1 2 ...
## $ number_vmail_messages : int 25 26 0 0 0 0 24 0 0 37 ...
## $ total_day_minutes : num 265 162 243 299 167 ...
## $ total_day_calls : int 110 123 114 71 113 98 88 79 97 84 ...
## $ total_day_charge : num 45.1 27.5 41.4 50.9 28.3 ...
## $ total_eve_minutes : num 197.4 195.5 121.2 61.9 148.3 ...
## $ total_eve_calls : int 99 103 110 88 122 101 108 94 80 111 ...
## $ total_eve_charge : num 16.78 16.62 10.3 5.26 12.61 ...
## $ total_night_minutes : num 245 254 163 197 187 ...
## $ total_night_calls : int 91 103 104 89 121 118 118 96 90 97 ...
## $ total_night_charge : num 11.01 11.45 7.32 8.86 8.41 ...
## $ total_intl_minutes : num 10 13.7 12.2 6.6 10.1 6.3 7.5 7.1 8.7 11.2 ...
## $ total_intl_calls : int 3 3 5 7 3 6 7 6 4 5 ...
## $ total_intl_charge : num 2.7 3.7 3.29 1.78 2.73 1.7 2.03 1.92 2.35 3.02 ...
## $ number_customer_service_calls: int 1 1 0 2 3 0 3 0 1 0 ...
## $ churn : Factor w/ 2 levels "yes","no": 2 2 2 2 2 2 2 2 2 2 ...
#sample
?sample
## Help on topic 'sample' was found in the following packages:
##
## Package Library
## dplyr /Library/Frameworks/R.framework/Versions/3.5/Resources/library
## base /Library/Frameworks/R.framework/Resources/library
##
##
## Using the first match ...
sample(1:10)
## [1] 2 1 6 9 5 8 4 10 7 3
sample(1:10, size = 5)
## [1] 5 9 6 8 2
sample(c(0,1), size= 10, replace = T)
## [1] 1 1 1 1 0 0 0 1 1 1
sample.int(20, 12) # both arguments must be integers; here we draw 12 of 1:20 without replacement
## [1] 10 18 12 7 11 19 3 5 15 2 9 14
set.seed(2)
# Split the data into training and validation sets (70% / 30%)
ind <- sample(1:2, size=nrow(churnTrain), replace=T, prob=c(0.7, 0.3))
trainset=churnTrain[ind==1,]
testset=churnTrain[ind==2,]
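# check the realized split proportions (should be close to 0.7 / 0.3):
prop.table(table(ind))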
#install.packages('rpart')
library('rpart')
# Build a decision tree model with rpart (CART)
?rpart
con = rpart.control(minsplit=20, cp=0.01)
?rpart.control
churn.rp <- rpart(churn ~ ., data=trainset, control = con)
#churn.rp<-rpart(churn ~ total_day_charge + international_plan, data=trainset)
churn.rp
## n= 2315
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 2315 342 no (0.14773218 0.85226782)
## 2) total_day_minutes>=265.45 144 59 yes (0.59027778 0.40972222)
## 4) voice_mail_plan=no 110 29 yes (0.73636364 0.26363636)
## 8) total_eve_minutes>=188.5 67 3 yes (0.95522388 0.04477612) *
## 9) total_eve_minutes< 188.5 43 17 no (0.39534884 0.60465116)
## 18) total_day_minutes>=282.7 19 6 yes (0.68421053 0.31578947) *
## 19) total_day_minutes< 282.7 24 4 no (0.16666667 0.83333333) *
## 5) voice_mail_plan=yes 34 4 no (0.11764706 0.88235294) *
## 3) total_day_minutes< 265.45 2171 257 no (0.11837863 0.88162137)
## 6) number_customer_service_calls>=3.5 168 82 yes (0.51190476 0.48809524)
## 12) total_day_minutes< 160.2 71 10 yes (0.85915493 0.14084507) *
## 13) total_day_minutes>=160.2 97 25 no (0.25773196 0.74226804)
## 26) total_eve_minutes< 155.5 20 7 yes (0.65000000 0.35000000) *
## 27) total_eve_minutes>=155.5 77 12 no (0.15584416 0.84415584) *
## 7) number_customer_service_calls< 3.5 2003 171 no (0.08537194 0.91462806)
## 14) international_plan=yes 188 76 no (0.40425532 0.59574468)
## 28) total_intl_calls< 2.5 38 0 yes (1.00000000 0.00000000) *
## 29) total_intl_calls>=2.5 150 38 no (0.25333333 0.74666667)
## 58) total_intl_minutes>=13.1 32 0 yes (1.00000000 0.00000000) *
## 59) total_intl_minutes< 13.1 118 6 no (0.05084746 0.94915254) *
## 15) international_plan=no 1815 95 no (0.05234160 0.94765840)
## 30) total_day_minutes>=224.15 251 50 no (0.19920319 0.80079681)
## 60) total_eve_minutes>=259.8 36 10 yes (0.72222222 0.27777778) *
## 61) total_eve_minutes< 259.8 215 24 no (0.11162791 0.88837209) *
## 31) total_day_minutes< 224.15 1564 45 no (0.02877238 0.97122762) *
summary(churn.rp)
## Call:
## rpart(formula = churn ~ ., data = trainset, control = con)
## n= 2315
##
## CP nsplit rel error xerror xstd
## 1 0.07602339 0 1.0000000 1.0000000 0.04992005
## 2 0.07456140 2 0.8479532 0.9970760 0.04985964
## 3 0.05555556 4 0.6988304 0.7602339 0.04442127
## 4 0.02631579 7 0.4941520 0.5263158 0.03767329
## 5 0.02339181 8 0.4678363 0.5204678 0.03748096
## 6 0.02046784 10 0.4210526 0.5087719 0.03709209
## 7 0.01754386 11 0.4005848 0.4707602 0.03578773
## 8 0.01000000 12 0.3830409 0.4766082 0.03599261
##
## Variable importance
## total_day_minutes total_day_charge
## 18 18
## number_customer_service_calls total_intl_minutes
## 10 8
## total_intl_charge total_eve_charge
## 8 8
## total_eve_minutes international_plan
## 8 7
## total_intl_calls number_vmail_messages
## 6 3
## voice_mail_plan total_night_calls
## 3 1
## total_eve_calls
## 1
##
## Node number 1: 2315 observations, complexity param=0.07602339
## predicted class=no expected loss=0.1477322 P(node) =1
## class counts: 342 1973
## probabilities: 0.148 0.852
## left son=2 (144 obs) right son=3 (2171 obs)
## Primary splits:
## total_day_minutes < 265.45 to the right, improve=60.145020, (0 missing)
## total_day_charge < 45.125 to the right, improve=60.145020, (0 missing)
## number_customer_service_calls < 3.5 to the right, improve=53.641430, (0 missing)
## international_plan splits as RL, improve=43.729370, (0 missing)
## voice_mail_plan splits as LR, improve= 6.089388, (0 missing)
## Surrogate splits:
## total_day_charge < 45.125 to the right, agree=1, adj=1, (0 split)
##
## Node number 2: 144 observations, complexity param=0.07602339
## predicted class=yes expected loss=0.4097222 P(node) =0.06220302
## class counts: 85 59
## probabilities: 0.590 0.410
## left son=4 (110 obs) right son=5 (34 obs)
## Primary splits:
## voice_mail_plan splits as LR, improve=19.884860, (0 missing)
## number_vmail_messages < 9.5 to the left, improve=19.884860, (0 missing)
## total_eve_minutes < 167.05 to the right, improve=14.540020, (0 missing)
## total_eve_charge < 14.2 to the right, improve=14.540020, (0 missing)
## total_day_minutes < 283.9 to the right, improve= 6.339827, (0 missing)
## Surrogate splits:
## number_vmail_messages < 9.5 to the left, agree=1.000, adj=1.000, (0 split)
## total_night_minutes < 110.3 to the right, agree=0.785, adj=0.088, (0 split)
## total_night_charge < 4.965 to the right, agree=0.785, adj=0.088, (0 split)
## total_night_calls < 50 to the right, agree=0.778, adj=0.059, (0 split)
## total_intl_minutes < 15.3 to the left, agree=0.771, adj=0.029, (0 split)
##
## Node number 3: 2171 observations, complexity param=0.0745614
## predicted class=no expected loss=0.1183786 P(node) =0.937797
## class counts: 257 1914
## probabilities: 0.118 0.882
## left son=6 (168 obs) right son=7 (2003 obs)
## Primary splits:
## number_customer_service_calls < 3.5 to the right, improve=56.398210, (0 missing)
## international_plan splits as RL, improve=43.059160, (0 missing)
## total_day_minutes < 224.15 to the right, improve=10.847440, (0 missing)
## total_day_charge < 38.105 to the right, improve=10.847440, (0 missing)
## total_intl_minutes < 13.15 to the right, improve= 6.347319, (0 missing)
##
## Node number 4: 110 observations, complexity param=0.02631579
## predicted class=yes expected loss=0.2636364 P(node) =0.0475162
## class counts: 81 29
## probabilities: 0.736 0.264
## left son=8 (67 obs) right son=9 (43 obs)
## Primary splits:
## total_eve_minutes < 188.5 to the right, improve=16.419610, (0 missing)
## total_eve_charge < 16.025 to the right, improve=16.419610, (0 missing)
## total_night_minutes < 206.85 to the right, improve= 5.350500, (0 missing)
## total_night_charge < 9.305 to the right, improve= 5.350500, (0 missing)
## total_day_minutes < 281.15 to the right, improve= 5.254545, (0 missing)
## Surrogate splits:
## total_eve_charge < 16.025 to the right, agree=1.000, adj=1.000, (0 split)
## total_night_calls < 82 to the right, agree=0.655, adj=0.116, (0 split)
## total_intl_minutes < 3.35 to the right, agree=0.636, adj=0.070, (0 split)
## total_intl_charge < 0.905 to the right, agree=0.636, adj=0.070, (0 split)
## total_day_minutes < 268.55 to the right, agree=0.627, adj=0.047, (0 split)
##
## Node number 5: 34 observations
## predicted class=no expected loss=0.1176471 P(node) =0.01468683
## class counts: 4 30
## probabilities: 0.118 0.882
##
## Node number 6: 168 observations, complexity param=0.0745614
## predicted class=yes expected loss=0.4880952 P(node) =0.07257019
## class counts: 86 82
## probabilities: 0.512 0.488
## left son=12 (71 obs) right son=13 (97 obs)
## Primary splits:
## total_day_minutes < 160.2 to the left, improve=29.655880, (0 missing)
## total_day_charge < 27.235 to the left, improve=29.655880, (0 missing)
## total_eve_minutes < 180.65 to the left, improve= 8.556953, (0 missing)
## total_eve_charge < 15.355 to the left, improve= 8.556953, (0 missing)
## number_customer_service_calls < 4.5 to the right, improve= 5.975362, (0 missing)
## Surrogate splits:
## total_day_charge < 27.235 to the left, agree=1.000, adj=1.000, (0 split)
## total_night_calls < 79 to the left, agree=0.625, adj=0.113, (0 split)
## total_intl_calls < 2.5 to the left, agree=0.619, adj=0.099, (0 split)
## number_customer_service_calls < 4.5 to the right, agree=0.607, adj=0.070, (0 split)
## total_eve_calls < 89.5 to the left, agree=0.601, adj=0.056, (0 split)
##
## Node number 7: 2003 observations, complexity param=0.05555556
## predicted class=no expected loss=0.08537194 P(node) =0.8652268
## class counts: 171 1832
## probabilities: 0.085 0.915
## left son=14 (188 obs) right son=15 (1815 obs)
## Primary splits:
## international_plan splits as RL, improve=42.194510, (0 missing)
## total_day_minutes < 224.15 to the right, improve=16.838410, (0 missing)
## total_day_charge < 38.105 to the right, improve=16.838410, (0 missing)
## total_intl_minutes < 13.15 to the right, improve= 6.210678, (0 missing)
## total_intl_charge < 3.55 to the right, improve= 6.210678, (0 missing)
##
## Node number 8: 67 observations
## predicted class=yes expected loss=0.04477612 P(node) =0.02894168
## class counts: 64 3
## probabilities: 0.955 0.045
##
## Node number 9: 43 observations, complexity param=0.02046784
## predicted class=no expected loss=0.3953488 P(node) =0.01857451
## class counts: 17 26
## probabilities: 0.395 0.605
## left son=18 (19 obs) right son=19 (24 obs)
## Primary splits:
## total_day_minutes < 282.7 to the right, improve=5.680947, (0 missing)
## total_day_charge < 48.06 to the right, improve=5.680947, (0 missing)
## total_night_minutes < 212.65 to the right, improve=4.558140, (0 missing)
## total_night_charge < 9.57 to the right, improve=4.558140, (0 missing)
## total_eve_minutes < 145.4 to the right, improve=4.356169, (0 missing)
## Surrogate splits:
## total_day_charge < 48.06 to the right, agree=1.000, adj=1.000, (0 split)
## total_day_calls < 103 to the left, agree=0.674, adj=0.263, (0 split)
## total_eve_calls < 104.5 to the left, agree=0.674, adj=0.263, (0 split)
## total_intl_minutes < 11.55 to the left, agree=0.651, adj=0.211, (0 split)
## total_intl_charge < 3.12 to the left, agree=0.651, adj=0.211, (0 split)
##
## Node number 12: 71 observations
## predicted class=yes expected loss=0.1408451 P(node) =0.03066955
## class counts: 61 10
## probabilities: 0.859 0.141
##
## Node number 13: 97 observations, complexity param=0.01754386
## predicted class=no expected loss=0.257732 P(node) =0.04190065
## class counts: 25 72
## probabilities: 0.258 0.742
## left son=26 (20 obs) right son=27 (77 obs)
## Primary splits:
## total_eve_minutes < 155.5 to the left, improve=7.753662, (0 missing)
## total_eve_charge < 13.22 to the left, improve=7.753662, (0 missing)
## total_intl_minutes < 13.55 to the right, improve=2.366149, (0 missing)
## total_intl_charge < 3.66 to the right, improve=2.366149, (0 missing)
## number_customer_service_calls < 4.5 to the right, improve=2.297667, (0 missing)
## Surrogate splits:
## total_eve_charge < 13.22 to the left, agree=1.000, adj=1.00, (0 split)
## total_night_calls < 143.5 to the right, agree=0.814, adj=0.10, (0 split)
## total_eve_calls < 62 to the left, agree=0.804, adj=0.05, (0 split)
##
## Node number 14: 188 observations, complexity param=0.05555556
## predicted class=no expected loss=0.4042553 P(node) =0.0812095
## class counts: 76 112
## probabilities: 0.404 0.596
## left son=28 (38 obs) right son=29 (150 obs)
## Primary splits:
## total_intl_calls < 2.5 to the left, improve=33.806520, (0 missing)
## total_intl_minutes < 13.1 to the right, improve=30.527050, (0 missing)
## total_intl_charge < 3.535 to the right, improve=30.527050, (0 missing)
## total_day_minutes < 221.95 to the right, improve= 3.386095, (0 missing)
## total_day_charge < 37.735 to the right, improve= 3.386095, (0 missing)
##
## Node number 15: 1815 observations, complexity param=0.02339181
## predicted class=no expected loss=0.0523416 P(node) =0.7840173
## class counts: 95 1720
## probabilities: 0.052 0.948
## left son=30 (251 obs) right son=31 (1564 obs)
## Primary splits:
## total_day_minutes < 224.15 to the right, improve=12.5649300, (0 missing)
## total_day_charge < 38.105 to the right, improve=12.5649300, (0 missing)
## total_eve_minutes < 244.95 to the right, improve= 4.7875890, (0 missing)
## total_eve_charge < 20.825 to the right, improve= 4.7875890, (0 missing)
## total_night_minutes < 163.85 to the right, improve= 0.9074391, (0 missing)
## Surrogate splits:
## total_day_charge < 38.105 to the right, agree=1, adj=1, (0 split)
##
## Node number 18: 19 observations
## predicted class=yes expected loss=0.3157895 P(node) =0.008207343
## class counts: 13 6
## probabilities: 0.684 0.316
##
## Node number 19: 24 observations
## predicted class=no expected loss=0.1666667 P(node) =0.01036717
## class counts: 4 20
## probabilities: 0.167 0.833
##
## Node number 26: 20 observations
## predicted class=yes expected loss=0.35 P(node) =0.008639309
## class counts: 13 7
## probabilities: 0.650 0.350
##
## Node number 27: 77 observations
## predicted class=no expected loss=0.1558442 P(node) =0.03326134
## class counts: 12 65
## probabilities: 0.156 0.844
##
## Node number 28: 38 observations
## predicted class=yes expected loss=0 P(node) =0.01641469
## class counts: 38 0
## probabilities: 1.000 0.000
##
## Node number 29: 150 observations, complexity param=0.05555556
## predicted class=no expected loss=0.2533333 P(node) =0.06479482
## class counts: 38 112
## probabilities: 0.253 0.747
## left son=58 (32 obs) right son=59 (118 obs)
## Primary splits:
## total_intl_minutes < 13.1 to the right, improve=45.356840, (0 missing)
## total_intl_charge < 3.535 to the right, improve=45.356840, (0 missing)
## total_day_calls < 95.5 to the left, improve= 4.036407, (0 missing)
## total_day_minutes < 237.75 to the right, improve= 1.879020, (0 missing)
## total_day_charge < 40.42 to the right, improve= 1.879020, (0 missing)
## Surrogate splits:
## total_intl_charge < 3.535 to the right, agree=1.0, adj=1.000, (0 split)
## total_day_minutes < 52.45 to the left, agree=0.8, adj=0.063, (0 split)
## total_day_charge < 8.92 to the left, agree=0.8, adj=0.063, (0 split)
##
## Node number 30: 251 observations, complexity param=0.02339181
## predicted class=no expected loss=0.1992032 P(node) =0.1084233
## class counts: 50 201
## probabilities: 0.199 0.801
## left son=60 (36 obs) right son=61 (215 obs)
## Primary splits:
## total_eve_minutes < 259.8 to the right, improve=22.993380, (0 missing)
## total_eve_charge < 22.08 to the right, improve=22.993380, (0 missing)
## voice_mail_plan splits as LR, improve= 4.745664, (0 missing)
## number_vmail_messages < 7.5 to the left, improve= 4.745664, (0 missing)
## total_night_minutes < 181.15 to the right, improve= 3.509731, (0 missing)
## Surrogate splits:
## total_eve_charge < 22.08 to the right, agree=1, adj=1, (0 split)
##
## Node number 31: 1564 observations
## predicted class=no expected loss=0.02877238 P(node) =0.675594
## class counts: 45 1519
## probabilities: 0.029 0.971
##
## Node number 58: 32 observations
## predicted class=yes expected loss=0 P(node) =0.01382289
## class counts: 32 0
## probabilities: 1.000 0.000
##
## Node number 59: 118 observations
## predicted class=no expected loss=0.05084746 P(node) =0.05097192
## class counts: 6 112
## probabilities: 0.051 0.949
##
## Node number 60: 36 observations
## predicted class=yes expected loss=0.2777778 P(node) =0.01555076
## class counts: 26 10
## probabilities: 0.722 0.278
##
## Node number 61: 215 observations
## predicted class=no expected loss=0.1116279 P(node) =0.09287257
## class counts: 24 191
## probabilities: 0.112 0.888
# Plot the decision tree
par(mfrow=c(1,1))
?plot.rpart
plot(churn.rp, uniform=TRUE,branch = 0.6, margin=0.1)
text(churn.rp, all=TRUE, use.n=TRUE, cex=0.7)
library('rpart.plot')
rpart.plot(churn.rp)
printcp(churn.rp)
##
## Classification tree:
## rpart(formula = churn ~ ., data = trainset, control = con)
##
## Variables actually used in tree construction:
## [1] international_plan number_customer_service_calls
## [3] total_day_minutes total_eve_minutes
## [5] total_intl_calls total_intl_minutes
## [7] voice_mail_plan
##
## Root node error: 342/2315 = 0.14773
##
## n= 2315
##
## CP nsplit rel error xerror xstd
## 1 0.076023 0 1.00000 1.00000 0.049920
## 2 0.074561 2 0.84795 0.99708 0.049860
## 3 0.055556 4 0.69883 0.76023 0.044421
## 4 0.026316 7 0.49415 0.52632 0.037673
## 5 0.023392 8 0.46784 0.52047 0.037481
## 6 0.020468 10 0.42105 0.50877 0.037092
## 7 0.017544 11 0.40058 0.47076 0.035788
## 8 0.010000 12 0.38304 0.47661 0.035993
plotcp(churn.rp)
# Find the cp value with the minimum cross-validation error
min_row = which.min(churn.rp$cptable[,"xerror"])
churn.cp = churn.rp$cptable[min_row, "CP"]
# Prune the tree using churn.cp as the complexity threshold
prune.tree = prune(churn.rp, cp=churn.cp)
plot(prune.tree, margin=0.1)
text(prune.tree, all=TRUE, use.n=TRUE, cex=0.7)
test_tree = prune(churn.rp, cp=0.06)  # a more aggressive prune, for comparison
plot(test_tree, margin=0.1)
text(test_tree, all=TRUE, use.n=TRUE, cex=0.7)
predictions <- predict(prune.tree, testset, type='class')
table(predictions,testset$churn)
##
## predictions yes no
## yes 95 14
## no 46 863
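# Accuracy can be read straight off the confusion table:
tab = table(predictions, testset$churn)
sum(diag(tab)) / sum(tab)  # (95 + 863) / 1018, about 0.941; matches confusionMatrix below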
#install.packages('caret')
#install.packages('e1071')
library('caret')
## Loading required package: lattice
library('e1071')
confusionMatrix(table(predictions, testset$churn))
## Confusion Matrix and Statistics
##
##
## predictions yes no
## yes 95 14
## no 46 863
##
## Accuracy : 0.9411
## 95% CI : (0.9248, 0.9547)
## No Information Rate : 0.8615
## P-Value [Acc > NIR] : 2.786e-16
##
## Kappa : 0.727
## Mcnemar's Test P-Value : 6.279e-05
##
## Sensitivity : 0.67376
## Specificity : 0.98404
## Pos Pred Value : 0.87156
## Neg Pred Value : 0.94939
## Prevalence : 0.13851
## Detection Rate : 0.09332
## Detection Prevalence : 0.10707
## Balanced Accuracy : 0.82890
##
## 'Positive' Class : yes
##
?confusionMatrix
## Help on topic 'confusionMatrix' was found in the following
## packages:
##
## Package Library
## caret /Library/Frameworks/R.framework/Versions/3.5/Resources/library
## ModelMetrics /Library/Frameworks/R.framework/Versions/3.5/Resources/library
##
##
## Using the first match ...
#install.packages("party")
library('party')
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
ctree.model = ctree(churn ~ ., data = trainset, controls = ctree_control(minsplit = 10))
plot(ctree.model, margin=0.1)
daycharge.model = ctree(churn ~ total_day_charge + international_plan, data = trainset)
plot(daycharge.model)
ctree.predict = predict(ctree.model, testset, type='response')
table(ctree.predict, testset$churn)
##
## ctree.predict yes no
## yes 98 13
## no 43 864
confusionMatrix(table(ctree.predict, testset$churn))
## Confusion Matrix and Statistics
##
##
## ctree.predict yes no
## yes 98 13
## no 43 864
##
## Accuracy : 0.945
## 95% CI : (0.9292, 0.9582)
## No Information Rate : 0.8615
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7469
## Mcnemar's Test P-Value : 0.0001065
##
## Sensitivity : 0.69504
## Specificity : 0.98518
## Pos Pred Value : 0.88288
## Neg Pred Value : 0.95259
## Prevalence : 0.13851
## Detection Rate : 0.09627
## Detection Prevalence : 0.10904
## Balanced Accuracy : 0.84011
##
## 'Positive' Class : yes
##
#install.packages("C50")
library(C50)
c50.model = C5.0(churn ~., data=trainset)
?C5.0Control
c=C5.0Control(minCases = 20)
c50.model = C5.0(churn ~., data=trainset,control = c)
summary(c50.model)
##
## Call:
## C5.0.formula(formula = churn ~ ., data = trainset, control = c)
##
##
## C5.0 [Release 2.07 GPL Edition] Tue Feb 12 17:12:52 2019
## -------------------------------
##
## Class specified by attribute `outcome'
##
## Read 2315 cases (17 attributes) from undefined.data
##
## Decision tree:
##
## number_customer_service_calls > 3:
## :...total_day_minutes <= 160.1: yes (71/10)
## : total_day_minutes > 160.1: no (108/32)
## number_customer_service_calls <= 3:
## :...international_plan = yes:
## :...total_intl_calls <= 2: yes (41)
## : total_intl_calls > 2:
## : :...total_intl_minutes <= 13.1: no (134/13)
## : total_intl_minutes > 13.1: yes (34)
## international_plan = no:
## :...total_day_minutes <= 224.1: no (1564/45)
## total_day_minutes > 224.1:
## :...voice_mail_plan = yes: no (97/4)
## voice_mail_plan = no:
## :...total_eve_charge <= 17.47:
## :...total_day_minutes <= 278.4: no (124/10)
## : total_day_minutes > 278.4: yes (20/5)
## total_eve_charge > 17.47:
## :...total_day_minutes > 264: yes (46)
## total_day_minutes <= 264:
## :...total_eve_charge > 22.04: yes (29/4)
## total_eve_charge <= 22.04:
## :...total_night_charge <= 9.04: no (23/1)
## total_night_charge > 9.04: yes (24/9)
##
##
## Evaluation on training data (2315 cases):
##
## Decision Tree
## ----------------
## Size Errors
##
## 13 133( 5.7%) <<
##
##
## (a) (b) <-classified as
## ---- ----
## 237 105 (a): class yes
## 28 1945 (b): class no
##
##
## Attribute usage:
##
## 100.00% number_customer_service_calls
## 92.27% international_plan
## 90.97% total_day_minutes
## 15.68% voice_mail_plan
## 11.49% total_eve_charge
## 9.03% total_intl_calls
## 7.26% total_intl_minutes
## 2.03% total_night_charge
##
##
## Time: 0.0 secs
plot(c50.model)
c50.predict = predict(c50.model,testset,type='class')
table(c50.predict, testset$churn)
##
## c50.predict yes no
## yes 97 15
## no 44 862
confusionMatrix(table(c50.predict, testset$churn))
## Confusion Matrix and Statistics
##
##
## c50.predict yes no
## yes 97 15
## no 44 862
##
## Accuracy : 0.942
## 95% CI : (0.9259, 0.9556)
## No Information Rate : 0.8615
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7342
## Mcnemar's Test P-Value : 0.0002671
##
## Sensitivity : 0.68794
## Specificity : 0.98290
## Pos Pred Value : 0.86607
## Neg Pred Value : 0.95143
## Prevalence : 0.13851
## Detection Rate : 0.09528
## Detection Prevalence : 0.11002
## Balanced Accuracy : 0.83542
##
## 'Positive' Class : yes
##
#install.packages("caret")
library(caret)
control = trainControl(method="repeatedcv", number=10, repeats=3)
model = train(churn~., data=churnTrain, method="rpart", trControl=control)
control = trainControl(method="repeatedcv", number=10, repeats=3, classProbs=TRUE, summaryFunction=multiClassSummary)
tune_funs = expand.grid(cp=seq(0.01, 0.1, 0.01))
model = train(churn~., data=churnTrain, method="rpart", trControl=control, tuneGrid=tune_funs)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info =
## trainInfo, : There were missing values in resampled performance measures.
model
## CART
##
## 3333 samples
## 16 predictor
## 2 classes: 'yes', 'no'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 2999, 3000, 3000, 2999, 3000, 3000, ...
## Resampling results across tuning parameters:
##
## cp logLoss AUC prAUC Accuracy Kappa
## 0.01 0.2183740 0.8993844 0.748816542 0.9410941 0.74180070
## 0.02 0.2412734 0.8541321 0.671743381 0.9267945 0.66829253
## 0.03 0.2362443 0.8425755 0.732258969 0.9286940 0.67811511
## 0.04 0.2362443 0.8425755 0.732258969 0.9286940 0.67811511
## 0.05 0.2474592 0.8276578 0.686167398 0.9245938 0.65323752
## 0.06 0.3134127 0.7350591 0.258464836 0.9004909 0.50678549
## 0.07 0.3134127 0.7350591 0.258464836 0.9004909 0.50678549
## 0.08 0.3545264 0.6493653 0.281334959 0.8822898 0.36579912
## 0.09 0.4033295 0.5386085 0.057210892 0.8561885 0.10038175
## 0.10 0.4130831 0.5045595 0.009106077 0.8545878 0.01213431
## F1 Sensitivity Specificity Pos_Pred_Value Neg_Pred_Value
## 0.7751921 0.70522959 0.9810526 0.8643794 0.9516527
## 0.7088791 0.61848073 0.9790643 0.8385450 0.9381492
## 0.7178292 0.62872732 0.9795322 0.8408383 0.9397181
## 0.7178292 0.62872732 0.9795322 0.8408383 0.9397181
## 0.6945537 0.60043934 0.9795322 0.8323579 0.9355206
## 0.5575400 0.43408447 0.9795322 0.7836547 0.9108595
## 0.5575400 0.43408447 0.9795322 0.7836547 0.9108595
## 0.4305091 0.30308957 0.9804678 0.7261865 0.8927735
## 0.3164827 0.09136905 0.9858480 0.5194401 0.8652928
## 0.2502704 0.01169218 0.9974269 0.4368421 0.8562896
## Precision Recall Detection_Rate Balanced_Accuracy
## 0.8643794 0.70522959 0.102207297 0.8431411
## 0.8385450 0.61848073 0.089610569 0.7987725
## 0.8408383 0.62872732 0.091108474 0.8041297
## 0.8408383 0.62872732 0.091108474 0.8041297
## 0.8323579 0.60043934 0.087008266 0.7899858
## 0.7836547 0.43408447 0.062905420 0.7068083
## 0.7836547 0.43408447 0.062905420 0.7068083
## 0.7261865 0.30308957 0.043906481 0.6417787
## 0.5194401 0.09136905 0.013206919 0.5386085
## 0.4368421 0.01169218 0.001699304 0.5045595
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.01.
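# Plot accuracy across the cp grid (plot() has a method for caret train objects):
plot(model)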
predictions = predict(model, churnTest)
confusionMatrix(table(predictions,churnTest$churn))
## Confusion Matrix and Statistics
##
##
## predictions yes no
## yes 145 15
## no 79 1428
##
## Accuracy : 0.9436
## 95% CI : (0.9314, 0.9542)
## No Information Rate : 0.8656
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7243
## Mcnemar's Test P-Value : 8.142e-11
##
## Sensitivity : 0.64732
## Specificity : 0.98960
## Pos Pred Value : 0.90625
## Neg Pred Value : 0.94758
## Prevalence : 0.13437
## Detection Rate : 0.08698
## Detection Prevalence : 0.09598
## Balanced Accuracy : 0.81846
##
## 'Positive' Class : yes
##
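# caret can also report variable importance for the tuned model:
varImp(model)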
# List all algorithms implemented in the caret package
names(getModelInfo())
## [1] "ada" "AdaBag" "AdaBoost.M1"
## [4] "adaboost" "amdai" "ANFIS"
## [7] "avNNet" "awnb" "awtan"
## [10] "bag" "bagEarth" "bagEarthGCV"
## [13] "bagFDA" "bagFDAGCV" "bam"
## [16] "bartMachine" "bayesglm" "binda"
## [19] "blackboost" "blasso" "blassoAveraged"
## [22] "bridge" "brnn" "BstLm"
## [25] "bstSm" "bstTree" "C5.0"
## [28] "C5.0Cost" "C5.0Rules" "C5.0Tree"
## [31] "cforest" "chaid" "CSimca"
## [34] "ctree" "ctree2" "cubist"
## [37] "dda" "deepboost" "DENFIS"
## [40] "dnn" "dwdLinear" "dwdPoly"
## [43] "dwdRadial" "earth" "elm"
## [46] "enet" "evtree" "extraTrees"
## [49] "fda" "FH.GBML" "FIR.DM"
## [52] "foba" "FRBCS.CHI" "FRBCS.W"
## [55] "FS.HGD" "gam" "gamboost"
## [58] "gamLoess" "gamSpline" "gaussprLinear"
## [61] "gaussprPoly" "gaussprRadial" "gbm_h2o"
## [64] "gbm" "gcvEarth" "GFS.FR.MOGUL"
## [67] "GFS.LT.RS" "GFS.THRIFT" "glm.nb"
## [70] "glm" "glmboost" "glmnet_h2o"
## [73] "glmnet" "glmStepAIC" "gpls"
## [76] "hda" "hdda" "hdrda"
## [79] "HYFIS" "icr" "J48"
## [82] "JRip" "kernelpls" "kknn"
## [85] "knn" "krlsPoly" "krlsRadial"
## [88] "lars" "lars2" "lasso"
## [91] "lda" "lda2" "leapBackward"
## [94] "leapForward" "leapSeq" "Linda"
## [97] "lm" "lmStepAIC" "LMT"
## [100] "loclda" "logicBag" "LogitBoost"
## [103] "logreg" "lssvmLinear" "lssvmPoly"
## [106] "lssvmRadial" "lvq" "M5"
## [109] "M5Rules" "manb" "mda"
## [112] "Mlda" "mlp" "mlpKerasDecay"
## [115] "mlpKerasDecayCost" "mlpKerasDropout" "mlpKerasDropoutCost"
## [118] "mlpML" "mlpSGD" "mlpWeightDecay"
## [121] "mlpWeightDecayML" "monmlp" "msaenet"
## [124] "multinom" "mxnet" "mxnetAdam"
## [127] "naive_bayes" "nb" "nbDiscrete"
## [130] "nbSearch" "neuralnet" "nnet"
## [133] "nnls" "nodeHarvest" "null"
## [136] "OneR" "ordinalNet" "ORFlog"
## [139] "ORFpls" "ORFridge" "ORFsvm"
## [142] "ownn" "pam" "parRF"
## [145] "PART" "partDSA" "pcaNNet"
## [148] "pcr" "pda" "pda2"
## [151] "penalized" "PenalizedLDA" "plr"
## [154] "pls" "plsRglm" "polr"
## [157] "ppr" "PRIM" "protoclass"
## [160] "qda" "QdaCov" "qrf"
## [163] "qrnn" "randomGLM" "ranger"
## [166] "rbf" "rbfDDA" "Rborist"
## [169] "rda" "regLogistic" "relaxo"
## [172] "rf" "rFerns" "RFlda"
## [175] "rfRules" "ridge" "rlda"
## [178] "rlm" "rmda" "rocc"
## [181] "rotationForest" "rotationForestCp" "rpart"
## [184] "rpart1SE" "rpart2" "rpartCost"
## [187] "rpartScore" "rqlasso" "rqnc"
## [190] "RRF" "RRFglobal" "rrlda"
## [193] "RSimca" "rvmLinear" "rvmPoly"
## [196] "rvmRadial" "SBC" "sda"
## [199] "sdwd" "simpls" "SLAVE"
## [202] "slda" "smda" "snn"
## [205] "sparseLDA" "spikeslab" "spls"
## [208] "stepLDA" "stepQDA" "superpc"
## [211] "svmBoundrangeString" "svmExpoString" "svmLinear"
## [214] "svmLinear2" "svmLinear3" "svmLinearWeights"
## [217] "svmLinearWeights2" "svmPoly" "svmRadial"
## [220] "svmRadialCost" "svmRadialSigma" "svmRadialWeights"
## [223] "svmSpectrumString" "tan" "tanSearch"
## [226] "treebag" "vbmpRadial" "vglmAdjCat"
## [229] "vglmContRatio" "vglmCumulative" "widekernelpls"
## [232] "WM" "wsrf" "xgbDART"
## [235] "xgbLinear" "xgbTree" "xyf"
# Check whether the caret package implements the rpart algorithm
names(getModelInfo())[grep('rpart',names(getModelInfo()))]
## [1] "rpart" "rpart1SE" "rpart2" "rpartCost" "rpartScore"
# Inspect the rpart model's metadata in caret
getModelInfo('rpart')
## $rpart
## $rpart$label
## [1] "CART"
##
## $rpart$library
## [1] "rpart"
##
## $rpart$type
## [1] "Regression" "Classification"
##
## $rpart$parameters
## parameter class label
## 1 cp numeric Complexity Parameter
##
## $rpart$grid
## function (x, y, len = NULL, search = "grid")
## {
## dat <- if (is.data.frame(x))
## x
## else as.data.frame(x)
## dat$.outcome <- y
## initialFit <- rpart::rpart(.outcome ~ ., data = dat, control = rpart::rpart.control(cp = 0))$cptable
## initialFit <- initialFit[order(-initialFit[, "CP"]), , drop = FALSE]
## if (search == "grid") {
## if (nrow(initialFit) < len) {
## tuneSeq <- data.frame(cp = seq(min(initialFit[, "CP"]),
## max(initialFit[, "CP"]), length = len))
## }
## else tuneSeq <- data.frame(cp = initialFit[1:len, "CP"])
## colnames(tuneSeq) <- "cp"
## }
## else {
## tuneSeq <- data.frame(cp = unique(sample(initialFit[,
## "CP"], size = len, replace = TRUE)))
## }
## tuneSeq
## }
##
## $rpart$loop
## function (grid)
## {
## grid <- grid[order(grid$cp, decreasing = FALSE), , drop = FALSE]
## loop <- grid[1, , drop = FALSE]
## submodels <- list(grid[-1, , drop = FALSE])
## list(loop = loop, submodels = submodels)
## }
##
## $rpart$fit
## function (x, y, wts, param, lev, last, classProbs, ...)
## {
## cpValue <- if (!last)
## param$cp
## else 0
## theDots <- list(...)
## if (any(names(theDots) == "control")) {
## theDots$control$cp <- cpValue
## theDots$control$xval <- 0
## ctl <- theDots$control
## theDots$control <- NULL
## }
## else ctl <- rpart::rpart.control(cp = cpValue, xval = 0)
## if (!is.null(wts))
## theDots$weights <- wts
## modelArgs <- c(list(formula = as.formula(".outcome ~ ."),
## data = if (is.data.frame(x)) x else as.data.frame(x),
## control = ctl), theDots)
## modelArgs$data$.outcome <- y
## out <- do.call(rpart::rpart, modelArgs)
## if (last)
## out <- rpart::prune.rpart(out, cp = param$cp)
## out
## }
##
## $rpart$predict
## function (modelFit, newdata, submodels = NULL)
## {
## if (!is.data.frame(newdata))
## newdata <- as.data.frame(newdata)
## pType <- if (modelFit$problemType == "Classification")
## "class"
## else "vector"
## out <- predict(modelFit, newdata, type = pType)
## if (!is.null(submodels)) {
## tmp <- vector(mode = "list", length = nrow(submodels) +
## 1)
## tmp[[1]] <- out
## for (j in seq(along = submodels$cp)) {
## prunedFit <- rpart::prune.rpart(modelFit, cp = submodels$cp[j])
## tmp[[j + 1]] <- predict(prunedFit, newdata, type = pType)
## }
## out <- tmp
## }
## out
## }
##
## $rpart$prob
## function (modelFit, newdata, submodels = NULL)
## {
## if (!is.data.frame(newdata))
## newdata <- as.data.frame(newdata)
## out <- predict(modelFit, newdata, type = "prob")
## if (!is.null(submodels)) {
## tmp <- vector(mode = "list", length = nrow(submodels) +
## 1)
## tmp[[1]] <- out
## for (j in seq(along = submodels$cp)) {
## prunedFit <- rpart::prune.rpart(modelFit, cp = submodels$cp[j])
## tmpProb <- predict(prunedFit, newdata, type = "prob")
## tmp[[j + 1]] <- as.data.frame(tmpProb[, modelFit$obsLevels,
## drop = FALSE])
## }
## out <- tmp
## }
## out
## }
##
## $rpart$predictors
## function (x, surrogate = TRUE, ...)
## {
## out <- as.character(x$frame$var)
## out <- out[!(out %in% c("<leaf>"))]
## if (surrogate) {
## splits <- x$splits
## splits <- splits[splits[, "adj"] > 0, ]
## out <- c(out, rownames(splits))
## }
## unique(out)
## }
##
## $rpart$varImp
## function (object, surrogates = FALSE, competes = TRUE, ...)
## {
## if (nrow(object$splits) > 0) {
## tmp <- rownames(object$splits)
## rownames(object$splits) <- 1:nrow(object$splits)
## splits <- data.frame(object$splits)
## splits$var <- tmp
## splits$type <- ""
## frame <- as.data.frame(object$frame)
## index <- 0
## for (i in 1:nrow(frame)) {
## if (frame$var[i] != "<leaf>") {
## index <- index + 1
## splits$type[index] <- "primary"
## if (frame$ncompete[i] > 0) {
## for (j in 1:frame$ncompete[i]) {
## index <- index + 1
## splits$type[index] <- "competing"
## }
## }
## if (frame$nsurrogate[i] > 0) {
## for (j in 1:frame$nsurrogate[i]) {
## index <- index + 1
## splits$type[index] <- "surrogate"
## }
## }
## }
## }
## splits$var <- factor(as.character(splits$var))
## if (!surrogates)
## splits <- subset(splits, type != "surrogate")
## if (!competes)
## splits <- subset(splits, type != "competing")
## out <- aggregate(splits$improve, list(Variable = splits$var),
## sum, na.rm = TRUE)
## }
## else {
## out <- data.frame(x = numeric(), Vaiable = character())
## }
## allVars <- colnames(attributes(object$terms)$factors)
## if (!all(allVars %in% out$Variable)) {
## missingVars <- allVars[!(allVars %in% out$Variable)]
## zeros <- data.frame(x = rep(0, length(missingVars)),
## Variable = missingVars)
## out <- rbind(out, zeros)
## }
## out2 <- data.frame(Overall = out$x)
## rownames(out2) <- out$Variable
## out2
## }
##
## $rpart$levels
## function (x)
## x$obsLevels
##
## $rpart$trim
## function (x)
## {
## x$call <- list(na.action = (x$call)$na.action)
## x$x <- NULL
## x$y <- NULL
## x$where <- NULL
## x
## }
##
## $rpart$tags
## [1] "Tree-Based Model" "Implicit Feature Selection"
## [3] "Handle Missing Predictor Data" "Accepts Case Weights"
##
## $rpart$sort
## function (x)
## x[order(x[, 1], decreasing = TRUE), ]
##
##
## $rpart1SE
## $rpart1SE$label
## [1] "CART"
##
## $rpart1SE$library
## [1] "rpart"
##
## $rpart1SE$type
## [1] "Regression" "Classification"
##
## $rpart1SE$parameters
## parameter class label
## 1 parameter character parameter
##
## $rpart1SE$grid
## function (x, y, len = NULL, search = "grid")
## data.frame(parameter = "none")
##
## $rpart1SE$loop
## NULL
##
## $rpart1SE$fit
## function (x, y, wts, param, lev, last, classProbs, ...)
## {
## dat <- if (is.data.frame(x))
## x
## else as.data.frame(x)
## dat$.outcome <- y
## if (!is.null(wts)) {
## out <- rpart::rpart(.outcome ~ ., data = dat, ...)
## }
## else {
## out <- rpart::rpart(.outcome ~ ., data = dat, weights = wts,
## ...)
## }
## out
## }
##
## $rpart1SE$predict
## function (modelFit, newdata, submodels = NULL)
## {
## if (!is.data.frame(newdata))
## newdata <- as.data.frame(newdata)
## out <- if (modelFit$problemType == "Classification")
## predict(modelFit, newdata, type = "class")
## else predict(modelFit, newdata)
## out
## }
##
## $rpart1SE$prob
## function (modelFit, newdata, submodels = NULL)
## {
## if (!is.data.frame(newdata))
## newdata <- as.data.frame(newdata)
## predict(modelFit, newdata, type = "prob")
## }
##
## $rpart1SE$predictors
## function (x, surrogate = TRUE, ...)
## {
## out <- as.character(x$frame$var)
## out <- out[!(out %in% c("<leaf>"))]
## if (surrogate) {
## splits <- x$splits
## splits <- splits[splits[, "adj"] > 0, ]
## out <- c(out, rownames(splits))
## }
## unique(out)
## }
##
## $rpart1SE$varImp
## function (object, surrogates = FALSE, competes = TRUE, ...)
## {
## tmp <- rownames(object$splits)
## rownames(object$splits) <- 1:nrow(object$splits)
## splits <- data.frame(object$splits)
## splits$var <- tmp
## splits$type <- ""
## frame <- as.data.frame(object$frame)
## index <- 0
## for (i in 1:nrow(frame)) {
## if (frame$var[i] != "<leaf>") {
## index <- index + 1
## splits$type[index] <- "primary"
## if (frame$ncompete[i] > 0) {
## for (j in 1:frame$ncompete[i]) {
## index <- index + 1
## splits$type[index] <- "competing"
## }
## }
## if (frame$nsurrogate[i] > 0) {
## for (j in 1:frame$nsurrogate[i]) {
## index <- index + 1
## splits$type[index] <- "surrogate"
## }
## }
## }
## }
## splits$var <- factor(as.character(splits$var))
## if (!surrogates)
## splits <- subset(splits, type != "surrogate")
## if (!competes)
## splits <- subset(splits, type != "competing")
## out <- aggregate(splits$improve, list(Variable = splits$var),
## sum, na.rm = TRUE)
## allVars <- colnames(attributes(object$terms)$factors)
## if (!all(allVars %in% out$Variable)) {
## missingVars <- allVars[!(allVars %in% out$Variable)]
## zeros <- data.frame(x = rep(0, length(missingVars)),
## Variable = missingVars)
## out <- rbind(out, zeros)
## }
## out2 <- data.frame(Overall = out$x)
## rownames(out2) <- out$Variable
## out2
## }
##
## $rpart1SE$levels
## function (x)
## x$obsLevels
##
## $rpart1SE$trim
## function (x)
## {
## x$call <- list(na.action = (x$call)$na.action)
## x$x <- NULL
## x$y <- NULL
## x$where <- NULL
## x
## }
##
## $rpart1SE$notes
## [1] "This CART model replicates the same process used by the `rpart` function where the model complexity is determined using the one-standard error method. This procedure is replicated inside of the resampling done by `train` so that an external resampling estimate can be obtained."
##
## $rpart1SE$tags
## [1] "Tree-Based Model" "Implicit Feature Selection"
## [3] "Handle Missing Predictor Data" "Accepts Case Weights"
##
## $rpart1SE$sort
## function (x)
## x[order(x[, 1], decreasing = TRUE), ]
##
##
## $rpart2
## $rpart2$label
## [1] "CART"
##
## $rpart2$library
## [1] "rpart"
##
## $rpart2$type
## [1] "Regression" "Classification"
##
## $rpart2$parameters
## parameter class label
## 1 maxdepth numeric Max Tree Depth
##
## $rpart2$grid
## function (x, y, len = NULL, search = "grid")
## {
## dat <- if (is.data.frame(x))
## x
## else as.data.frame(x)
## dat$.outcome <- y
## initialFit <- rpart::rpart(.outcome ~ ., data = dat, control = rpart::rpart.control(cp = 0))$cptable
## initialFit <- initialFit[order(-initialFit[, "CP"]), "nsplit",
## drop = FALSE]
## initialFit <- initialFit[initialFit[, "nsplit"] > 0 & initialFit[,
## "nsplit"] <= 30, , drop = FALSE]
## if (search == "grid") {
## if (dim(initialFit)[1] < len) {
## cat("note: only", nrow(initialFit), "possible values of the max tree depth from the initial fit.\n",
## "Truncating the grid to", nrow(initialFit), ".\n\n")
## tuneSeq <- as.data.frame(initialFit)
## }
## else tuneSeq <- as.data.frame(initialFit[1:len, ])
## colnames(tuneSeq) <- "maxdepth"
## }
## else {
## tuneSeq <- data.frame(maxdepth = unique(sample(as.vector(initialFit[,
## 1]), size = len, replace = TRUE)))
## }
## tuneSeq
## }
##
## $rpart2$loop
## function (grid)
## {
## grid <- grid[order(grid$maxdepth, decreasing = TRUE), , drop = FALSE]
## loop <- grid[1, , drop = FALSE]
## submodels <- list(grid[-1, , drop = FALSE])
## list(loop = loop, submodels = submodels)
## }
##
## $rpart2$fit
## function (x, y, wts, param, lev, last, classProbs, ...)
## {
## theDots <- list(...)
## if (any(names(theDots) == "control")) {
## theDots$control$maxdepth <- param$maxdepth
## theDots$control$xval <- 0
## ctl <- theDots$control
## theDots$control <- NULL
## }
## else ctl <- rpart::rpart.control(maxdepth = param$maxdepth,
## xval = 0)
## if (!is.null(wts))
## theDots$weights <- wts
## modelArgs <- c(list(formula = as.formula(".outcome ~ ."),
## data = if (is.data.frame(x)) x else as.data.frame(x),
## control = ctl), theDots)
## modelArgs$data$.outcome <- y
## out <- do.call(rpart::rpart, modelArgs)
## out
## }
##
## $rpart2$predict
## function (modelFit, newdata, submodels = NULL)
## {
## depth2cp <- function(x, depth) {
## out <- approx(x[, "nsplit"], x[, "CP"], depth)$y
## out[depth > max(x[, "nsplit"])] <- min(x[, "CP"]) * 0.99
## out
## }
## if (!is.data.frame(newdata))
## newdata <- as.data.frame(newdata)
## pType <- if (modelFit$problemType == "Classification")
## "class"
## else "vector"
## out <- predict(modelFit, newdata, type = pType)
## if (!is.null(submodels)) {
## tmp <- vector(mode = "list", length = nrow(submodels) +
## 1)
## tmp[[1]] <- out
## cpValues <- depth2cp(modelFit$cptable, submodels$maxdepth)
## for (j in seq(along = cpValues)) {
## prunedFit <- rpart::prune.rpart(modelFit, cp = cpValues[j])
## tmp[[j + 1]] <- predict(prunedFit, newdata, type = pType)
## }
## out <- tmp
## }
## out
## }
##
## $rpart2$prob
## function (modelFit, newdata, submodels = NULL)
## {
## depth2cp <- function(x, depth) {
## out <- approx(x[, "nsplit"], x[, "CP"], depth)$y
## out[depth > max(x[, "nsplit"])] <- min(x[, "CP"]) * 0.99
## out
## }
## if (!is.data.frame(newdata))
## newdata <- as.data.frame(newdata)
## out <- predict(modelFit, newdata, type = "prob")
## if (!is.null(submodels)) {
## tmp <- vector(mode = "list", length = nrow(submodels) +
## 1)
## tmp[[1]] <- out
## cpValues <- depth2cp(modelFit$cptable, submodels$maxdepth)
## for (j in seq(along = cpValues)) {
## prunedFit <- rpart::prune.rpart(modelFit, cp = cpValues[j])
## tmpProb <- predict(prunedFit, newdata, type = "prob")
## tmp[[j + 1]] <- as.data.frame(tmpProb[, modelFit$obsLevels,
## drop = FALSE])
## }
## out <- tmp
## }
## out
## }
##
## $rpart2$predictors
## function (x, surrogate = TRUE, ...)
## {
## out <- as.character(x$frame$var)
## out <- out[!(out %in% c("<leaf>"))]
## if (surrogate) {
## splits <- x$splits
## splits <- splits[splits[, "adj"] > 0, ]
## out <- c(out, rownames(splits))
## }
## unique(out)
## }
##
## $rpart2$varImp
## function (object, surrogates = FALSE, competes = TRUE, ...)
## {
## tmp <- rownames(object$splits)
## rownames(object$splits) <- 1:nrow(object$splits)
## splits <- data.frame(object$splits)
## splits$var <- tmp
## splits$type <- ""
## frame <- as.data.frame(object$frame)
## index <- 0
## for (i in 1:nrow(frame)) {
## if (frame$var[i] != "<leaf>") {
## index <- index + 1
## splits$type[index] <- "primary"
## if (frame$ncompete[i] > 0) {
## for (j in 1:frame$ncompete[i]) {
## index <- index + 1
## splits$type[index] <- "competing"
## }
## }
## if (frame$nsurrogate[i] > 0) {
## for (j in 1:frame$nsurrogate[i]) {
## index <- index + 1
## splits$type[index] <- "surrogate"
## }
## }
## }
## }
## splits$var <- factor(as.character(splits$var))
## if (!surrogates)
## splits <- subset(splits, type != "surrogate")
## if (!competes)
## splits <- subset(splits, type != "competing")
## out <- aggregate(splits$improve, list(Variable = splits$var),
## sum, na.rm = TRUE)
## allVars <- colnames(attributes(object$terms)$factors)
## if (!all(allVars %in% out$Variable)) {
## missingVars <- allVars[!(allVars %in% out$Variable)]
## zeros <- data.frame(x = rep(0, length(missingVars)),
## Variable = missingVars)
## out <- rbind(out, zeros)
## }
## out2 <- data.frame(Overall = out$x)
## rownames(out2) <- out$Variable
## out2
## }
##
## $rpart2$levels
## function (x)
## x$obsLevels
##
## $rpart2$trim
## function (x)
## {
## x$call <- list(na.action = (x$call)$na.action)
## x$x <- NULL
## x$y <- NULL
## x$where <- NULL
## x
## }
##
## $rpart2$tags
## [1] "Tree-Based Model" "Implicit Feature Selection"
## [3] "Handle Missing Predictor Data" "Accepts Case Weights"
##
## $rpart2$sort
## function (x)
## x[order(x[, 1]), ]
##
##
## $rpartCost
## $rpartCost$label
## [1] "Cost-Sensitive CART"
##
## $rpartCost$library
## [1] "rpart" "plyr"
##
## $rpartCost$type
## [1] "Classification"
##
## $rpartCost$parameters
## parameter class label
## 1 cp numeric Complexity Parameter
## 2 Cost numeric Cost
##
## $rpartCost$grid
## function (x, y, len = NULL, search = "grid")
## {
## dat <- if (is.data.frame(x))
## x
## else as.data.frame(x)
## dat$.outcome <- y
## initialFit <- rpart::rpart(.outcome ~ ., data = dat, control = rpart::rpart.control(cp = 0))$cptable
## initialFit <- initialFit[order(-initialFit[, "CP"]), , drop = FALSE]
## if (search == "grid") {
## if (nrow(initialFit) < len) {
## tuneSeq <- expand.grid(cp = seq(min(initialFit[,
## "CP"]), max(initialFit[, "CP"]), length = len),
## Cost = 1:len)
## }
## else tuneSeq <- data.frame(cp = initialFit[1:len, "CP"],
## Cost = 1:len)
## colnames(tuneSeq) <- c("cp", "Cost")
## }
## else {
## tuneSeq <- data.frame(cp = 10^runif(len, min = -8, max = -1),
## Cost = runif(len, min = 1, max = 30))
## }
## tuneSeq
## }
##
## $rpartCost$loop
## function (grid)
## {
## loop <- plyr::ddply(grid, plyr::.(Cost), function(x) c(cp = min(x$cp)))
## submodels <- vector(mode = "list", length = nrow(loop))
## for (i in seq(along = submodels)) {
## larger_cp <- subset(grid, subset = Cost == loop$Cost[i] &
## cp > loop$cp[i])
## submodels[[i]] <- data.frame(cp = sort(larger_cp$cp))
## }
## list(loop = loop, submodels = submodels)
## }
##
## $rpartCost$fit
## function (x, y, wts, param, lev, last, classProbs, ...)
## {
## theDots <- list(...)
## if (any(names(theDots) == "control")) {
## theDots$control$cp <- param$cp
## theDots$control$xval <- 0
## ctl <- theDots$control
## theDots$control <- NULL
## }
## else ctl <- rpart::rpart.control(cp = param$cp, xval = 0)
## lmat <- matrix(c(0, 1, param$Cost, 0), ncol = 2)
## rownames(lmat) <- colnames(lmat) <- levels(y)
## if (any(names(theDots) == "parms")) {
## theDots$parms$loss <- lmat
## }
## else parms <- list(loss = lmat)
## if (!is.null(wts))
## theDots$weights <- wts
## modelArgs <- c(list(formula = as.formula(".outcome ~ ."),
## data = if (is.data.frame(x)) x else as.data.frame(x),
## parms = parms, control = ctl), theDots)
## modelArgs$data$.outcome <- y
## out <- do.call(rpart::rpart, modelArgs)
## out
## }
##
## $rpartCost$predict
## function (modelFit, newdata, submodels = NULL)
## {
## if (!is.data.frame(newdata))
## newdata <- as.data.frame(newdata)
## pType <- if (modelFit$problemType == "Classification")
## "class"
## else "vector"
## out <- predict(modelFit, newdata, type = pType)
## if (!is.null(submodels)) {
## tmp <- vector(mode = "list", length = nrow(submodels) +
## 1)
## tmp[[1]] <- out
## for (j in seq(along = submodels$cp)) {
## prunedFit <- rpart::prune.rpart(modelFit, cp = submodels$cp[j])
## tmp[[j + 1]] <- predict(prunedFit, newdata, type = pType)
## }
## out <- tmp
## }
## out
## }
##
## $rpartCost$levels
## function (x)
## x$obsLevels
##
## $rpartCost$prob
## NULL
##
## $rpartCost$tags
## [1] "Tree-Based Model" "Implicit Feature Selection"
## [3] "Cost Sensitive Learning" "Two Class Only"
## [5] "Handle Missing Predictor Data" "Accepts Case Weights"
##
## $rpartCost$sort
## function (x)
## x[order(-x$cp, -x$Cost), ]
##
##
## $rpartScore
## $rpartScore$label
## [1] "CART or Ordinal Responses"
##
## $rpartScore$library
## [1] "rpartScore" "plyr"
##
## $rpartScore$type
## [1] "Classification"
##
## $rpartScore$parameters
## parameter class label
## 1 cp numeric Complexity Parameter
## 2 split character Split Function
## 3 prune character Pruning Measure
##
## $rpartScore$grid
## function (x, y, len = NULL, search = "grid")
## {
## dat <- if (is.data.frame(x))
## x
## else as.data.frame(x)
## dat$.outcome <- y
## initialFit <- rpart::rpart(.outcome ~ ., data = dat, control = rpart::rpart.control(cp = 0))$cptable
## initialFit <- initialFit[order(-initialFit[, "CP"]), , drop = FALSE]
## if (search == "grid") {
## if (nrow(initialFit) < len) {
## tuneSeq <- expand.grid(cp = seq(min(initialFit[,
## "CP"]), max(initialFit[, "CP"]), length = len),
## split = c("abs", "quad"), prune = c("mr", "mc"))
## }
## else tuneSeq <- expand.grid(cp = initialFit[1:len, "CP"],
## split = c("abs", "quad"), prune = c("mr", "mc"))
## colnames(tuneSeq)[1] <- "cp"
## }
## else {
## tuneSeq <- expand.grid(cp = unique(sample(initialFit[,
## "CP"], size = len, replace = TRUE)), split = c("abs",
## "quad"), prune = c("mr", "mc"))
## }
## tuneSeq
## }
##
## $rpartScore$fit
## function (x, y, wts, param, lev, last, classProbs, ...)
## {
## cpValue <- if (!last)
## param$cp
## else 0
## theDots <- list(...)
## if (any(names(theDots) == "control")) {
## theDots$control$cp <- cpValue
## theDots$control$xval <- 0
## ctl <- theDots$control
## theDots$control <- NULL
## }
## else ctl <- rpart::rpart.control(cp = cpValue, xval = 0)
## if (!is.null(wts))
## theDots$weights <- wts
## modelArgs <- c(list(formula = as.formula(".outcome ~ ."),
## data = if (is.data.frame(x)) x else as.data.frame(x),
## split = as.character(param$split), prune = as.character(param$prune),
## control = ctl), theDots)
## modelArgs$data$.outcome <- as.numeric(y)
## out <- do.call(rpartScore::rpartScore, modelArgs)
## if (last)
## out <- rpart::prune.rpart(out, cp = param$cp)
## out
## }
##
## $rpartScore$predict
## function (modelFit, newdata, submodels = NULL)
## {
## if (!is.data.frame(newdata))
## newdata <- as.data.frame(newdata)
## out <- modelFit$obsLevels[predict(modelFit, newdata)]
## if (!is.null(submodels)) {
## tmp <- vector(mode = "list", length = nrow(submodels) +
## 1)
## tmp[[1]] <- out
## for (j in seq(along = submodels$cp)) {
## prunedFit <- rpart::prune.rpart(modelFit, cp = submodels$cp[j])
## tmp[[j + 1]] <- modelFit$obsLevels[predict(prunedFit,
## newdata)]
## }
## out <- tmp
## }
## out
## }
##
## $rpartScore$prob
## NULL
##
## $rpartScore$predictors
## function (x, surrogate = TRUE, ...)
## {
## out <- as.character(x$frame$var)
## out <- out[!(out %in% c("<leaf>"))]
## if (surrogate) {
## splits <- x$splits
## splits <- splits[splits[, "adj"] > 0, ]
## out <- c(out, rownames(splits))
## }
## unique(out)
## }
##
## $rpartScore$varImp
## function (object, surrogates = FALSE, competes = TRUE, ...)
## {
## allVars <- all.vars(object$terms)
## allVars <- allVars[allVars != ".outcome"]
## out <- data.frame(Overall = object$variable.importance, Variable = names(object$variable.importance))
## rownames(out) <- names(object$variable.importance)
## if (!all(allVars %in% out$Variable)) {
## missingVars <- allVars[!(allVars %in% out$Variable)]
## zeros <- data.frame(Overall = rep(0, length(missingVars)),
## Variable = missingVars)
## out <- rbind(out, zeros)
## }
## rownames(out) <- out$Variable
## out$Variable <- NULL
## out
## }
##
## $rpartScore$levels
## function (x)
## x$obsLevels
##
## $rpartScore$trim
## function (x)
## {
## x$call <- list(na.action = (x$call)$na.action)
## x$x <- NULL
## x$y <- NULL
## x$where <- NULL
## x
## }
##
## $rpartScore$tags
## [1] "Tree-Based Model" "Implicit Feature Selection"
## [3] "Handle Missing Predictor Data" "Accepts Case Weights"
## [5] "Ordinal Outcomes"
##
## $rpartScore$sort
## function (x)
## x[order(x[, 1], decreasing = TRUE), ]
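The listing above is everything `getModelInfo('rpart')` knows about the matched models: for each one, the tuning `parameters`, the `grid`/`fit`/`predict`/`prob` hooks that `train` invokes during resampling, and metadata such as `tags` (note `rpartCost` is tagged "Two Class Only", so it only fits binary outcomes). Passing any of these names as the `method` argument to `train` selects the corresponding spec. As a minimal sketch (on the illustrative built-in `iris` data, not the course dataset, with an assumed seed), this is how `rpart2` tunes `maxdepth` instead of `cp`:

library(caret)
set.seed(22)                           # assumed seed, for reproducible resampling
fit_depth <- train(Species ~ ., data = iris,
                   method = "rpart2",  # tunes maxdepth, per its $parameters above
                   tuneLength = 5)
fit_depth$bestTune                     # maxdepth chosen by resampling

To inspect a single component of the registry rather than the whole dump, index into the returned list, as the next chunk does for the `rpart` spec's parameter table.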
# look up which parameters of the rpart model can be tuned
getModelInfo('rpart')$rpart$parameters
## parameter class label
## 1 cp numeric Complexity Parameter
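Since plain `rpart` exposes only the complexity parameter `cp`, we can hand `train` an explicit grid of candidate values via `tuneGrid` instead of relying on `tuneLength`. A minimal sketch follows (the grid bounds are assumptions chosen for illustration, again on `iris`); the column name in `tuneGrid` must match the parameter name reported above:

library(caret)
set.seed(22)
fit_cp <- train(Species ~ ., data = iris, method = "rpart",
                tuneGrid = expand.grid(cp = seq(0.001, 0.1, length.out = 10)))
fit_cp$bestTune   # the cp value picked by resampling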