serim_tele_tree_0509

library(dplyr)

## Warning: package 'dplyr' was built under R version 3.6.3

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(C50)

## Warning: package 'C50' was built under R version 3.6.1

library(tree)

## Warning: package 'tree' was built under R version 3.6.1

## Registered S3 method overwritten by 'tree':
##   method     from
##   print.tree cli

library(e1071)

## Warning: package 'e1071' was built under R version 3.6.1

library(caret)

## Warning: package 'caret' was built under R version 3.6.1

## Loading required package: lattice

## Loading required package: ggplot2

## Warning: package 'ggplot2' was built under R version 3.6.3

library(tree)
library(rpart)

## Warning: package 'rpart' was built under R version 3.6.1

library(party)

## Warning: package 'party' was built under R version 3.6.1

## Loading required package: grid

## Loading required package: mvtnorm

## Loading required package: modeltools

## Loading required package: stats4

## Loading required package: strucchange

## Warning: package 'strucchange' was built under R version 3.6.1

## Loading required package: zoo

## Warning: package 'zoo' was built under R version 3.6.1

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

## Loading required package: sandwich

## Warning: package 'sandwich' was built under R version 3.6.1

library(C50)

## 1. 0509 decision-tree practice, using the tele_churn data
  # Read the raw churn table. stringsAsFactors = TRUE is requested explicitly:
  # since R 4.0 read.csv() keeps text columns as character by default, while
  # the classifiers and confusionMatrix() calls further down need factor
  # columns (in particular a factor `churn`).
  raw <- read.csv("tele_churn.csv", stringsAsFactors = TRUE)
  # Lower-case every column name so later code can use a single spelling.
  names(raw) <- tolower(names(raw))
  
  # Per-column summary: level counts for factors, quantiles and NA counts
  # for numeric columns.
  summary(raw)

##       customerid      gender     seniorcitizen    partner    dependents
##  0002-ORFBO:   1   Female:3488   Min.   :0.0000   No :3641   No :4933  
##  0003-MKNFE:   1   Male  :3555   1st Qu.:0.0000   Yes:3402   Yes:2110  
##  0004-TLHLJ:   1                 Median :0.0000                        
##  0011-IGKFF:   1                 Mean   :0.1621                        
##  0013-EXCHZ:   1                 3rd Qu.:0.0000                        
##  0013-MHZWF:   1                 Max.   :1.0000                        
##  (Other)   :7037                                                       
##      tenure      phoneservice          multiplelines     internetservice
##  Min.   : 0.00   No : 682     No              :3390   DSL        :2421  
##  1st Qu.: 9.00   Yes:6361     No phone service: 682   Fiber optic:3096  
##  Median :29.00                Yes             :2971   No         :1526  
##  Mean   :32.37                                                          
##  3rd Qu.:55.00                                                          
##  Max.   :72.00                                                          
##                                                                         
##              onlinesecurity              onlinebackup 
##  No                 :3498   No                 :3088  
##  No internet service:1526   No internet service:1526  
##  Yes                :2019   Yes                :2429  
##                                                       
##                                                       
##                                                       
##                                                       
##             deviceprotection              techsupport  
##  No                 :3095    No                 :3473  
##  No internet service:1526    No internet service:1526  
##  Yes                :2422    Yes                :2044  
##                                                        
##                                                        
##                                                        
##                                                        
##               streamingtv              streamingmovies
##  No                 :2810   No                 :2785  
##  No internet service:1526   No internet service:1526  
##  Yes                :2707   Yes                :2732  
##                                                       
##                                                       
##                                                       
##                                                       
##            contract    paperlessbilling                   paymentmethod 
##  Month-to-month:3875   No :2872         Bank transfer (automatic):1544  
##  One year      :1473   Yes:4171         Credit card (automatic)  :1522  
##  Two year      :1695                    Electronic check         :2365  
##                                         Mailed check             :1612  
##                                                                         
##                                                                         
##                                                                         
##  monthlycharges    totalcharges    churn     
##  Min.   : 18.25   Min.   :  18.8   No :5174  
##  1st Qu.: 35.50   1st Qu.: 401.4   Yes:1869  
##  Median : 70.35   Median :1397.5             
##  Mean   : 64.76   Mean   :2283.3             
##  3rd Qu.: 89.85   3rd Qu.:3794.7             
##  Max.   :118.75   Max.   :8684.8             
##                   NA's   :11

  # Compact overview of the table: one line per column with its type and
  # the first few values.
  raw %>% glimpse()

## Rows: 7,043
## Columns: 21
## $ customerid       <fct> 7590-VHVEG, 5575-GNVDE, 3668-QPYBK, 7795-CFOC...
## $ gender           <fct> Female, Male, Male, Male, Female, Female, Mal...
## $ seniorcitizen    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ partner          <fct> Yes, No, No, No, No, No, No, No, Yes, No, Yes...
## $ dependents       <fct> No, No, No, No, No, No, Yes, No, No, Yes, Yes...
## $ tenure           <int> 1, 34, 2, 45, 2, 8, 22, 10, 28, 62, 13, 16, 5...
## $ phoneservice     <fct> No, Yes, Yes, No, Yes, Yes, Yes, No, Yes, Yes...
## $ multiplelines    <fct> No phone service, No, No, No phone service, N...
## $ internetservice  <fct> DSL, DSL, DSL, DSL, Fiber optic, Fiber optic,...
## $ onlinesecurity   <fct> No, Yes, Yes, Yes, No, No, No, Yes, No, Yes, ...
## $ onlinebackup     <fct> Yes, No, Yes, No, No, No, Yes, No, No, Yes, N...
## $ deviceprotection <fct> No, Yes, No, Yes, No, Yes, No, No, Yes, No, N...
## $ techsupport      <fct> No, No, No, Yes, No, No, No, No, Yes, No, No,...
## $ streamingtv      <fct> No, No, No, No, No, Yes, Yes, No, Yes, No, No...
## $ streamingmovies  <fct> No, No, No, No, No, Yes, No, No, Yes, No, No,...
## $ contract         <fct> Month-to-month, One year, Month-to-month, One...
## $ paperlessbilling <fct> Yes, No, Yes, No, Yes, Yes, Yes, No, Yes, No,...
## $ paymentmethod    <fct> Electronic check, Mailed check, Mailed check,...
## $ monthlycharges   <dbl> 29.85, 56.95, 53.85, 42.30, 70.70, 99.65, 89....
## $ totalcharges     <dbl> 29.85, 1889.50, 108.15, 1840.75, 151.65, 820....
## $ churn            <fct> No, No, Yes, No, Yes, Yes, No, No, Yes, No, N...

    # Surprisingly many factor columns; the charge columns are dbl.
  
  
  ## The goal here is just to run every model once, so careful preprocessing
  ## is skipped: the 11 rows with a missing totalcharges are simply dropped.
  # Steps:
  #   1. recode the 0/1 seniorcitizen flag as a "junior"/"senior" factor
  #      (appended as the last column, so existing column positions are kept),
  #   2. remove the id column and the original 0/1 flag,
  #   3. drop every row that still contains an NA, as announced above.
  # NOTE: the outputs recorded further down in this document were produced
  # without step 3 (4931 training + 2112 test rows = all 7043 rows), so the
  # figures will shift slightly when the script is re-run.
  raw2 <- raw %>% 
    mutate(senior = as.factor(ifelse(seniorcitizen == 0, "junior", "senior"))) %>%
    select(-c(customerid, seniorcitizen)) %>%
    na.omit()

  # View() opens a data viewer, which only makes sense in an interactive
  # session; skip it when the script is run non-interactively.
  if (interactive()) {
    View(head(raw2))
  }

  ##a. sampling: 70 % training / 30 % test
  # Fix the RNG state so the partition is reproducible.
  set.seed(123)
  # Plain random alternative, kept for reference:
  #idx <- sample(1:nrow(raw2), size = nrow(raw2)*0.7)  
  # caret's partition is drawn on the outcome column `churn`; list = FALSE
  # returns row indices that can be used directly for subsetting.
  idx <- createDataPartition(raw2$churn, p = 0.7, list = FALSE)
  tr_raw <- raw2[idx, ]
  te_raw <- raw2[-idx, ]

  ##b-1-1. modelling with tree: binary recursive partitioning
  #library(tree)
  # Author's notes: entropy is used as the impurity measure; the higher the
  # impurity, the higher the entropy. Entropy lies between 0 and 1.
  tree_m <- tree(formula = churn ~ ., data = tr_raw)
  # Draw the fitted tree, then overlay the split labels.
  plot(tree_m)
  text(tree_m)

    ##b-1-2. cross-validated tree: look for the best number of leaves
    # prune.misclass makes the CV score the number of misclassifications.
    cv_tree <- cv.tree(object = tree_m, FUN = prune.misclass)
    str(cv_tree)

## List of 4
##  $ size  : int [1:3] 6 4 1
##  $ dev   : num [1:3] 1048 1048 1309
##  $ k     : num [1:3] -Inf 0 91
##  $ method: chr "misclass"
##  - attr(*, "class")= chr [1:2] "prune" "tree.sequence"

    # Candidate tree sizes evaluated by the cross-validation.
    cv_tree[["size"]]

## [1] 6 4 1

    # Number of misclassified cases for each candidate size.
    cv_tree[["dev"]]

## [1] 1048 1048 1309

    # Plot the CV result -- a size of 4 looks like the one to pick.
    plot(cv_tree)

    ##b-1-3. prune
    # Cut the tree back to the 4 leaves suggested by the cross-validation.
    prune_tree <- prune.misclass(tree_m, best = 4)
    plot(prune_tree)
    text(prune_tree, pretty = 0)

    ##b-1-4. predict
    # Class predictions on the hold-out set, compared with the true labels.
    tree_pred <- predict(prune_tree, newdata = te_raw, type = "class")
    confusionMatrix(data = tree_pred, reference = te_raw$churn)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  1464  353
##        Yes   88  207
##                                           
##                Accuracy : 0.7912          
##                  95% CI : (0.7732, 0.8084)
##     No Information Rate : 0.7348          
##     P-Value [Acc > NIR] : 1.074e-09       
##                                           
##                   Kappa : 0.3687          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9433          
##             Specificity : 0.3696          
##          Pos Pred Value : 0.8057          
##          Neg Pred Value : 0.7017          
##              Prevalence : 0.7348          
##          Detection Rate : 0.6932          
##    Detection Prevalence : 0.8603          
##       Balanced Accuracy : 0.6565          
##                                           
##        'Positive' Class : No              
##

  ##b-2-1. modelling with rpart: uses gini, needs pruning afterwards
  #library(rpart)
  # Classification and regression trees (CART); splits are chosen in the
  # direction that makes gini smaller.
  rpart_tree <- rpart(formula = churn ~ ., data = tr_raw, method = "class")
  #summary(rpart_tree)
  #print(rpart_tree)
  # Draw the tree and add the split labels.
  plot(rpart_tree)
  text(rpart_tree)

  # Complexity-parameter table (CP, nsplit, rel error, xerror, xstd).
  rpart_tree[["cptable"]]

##           CP nsplit rel error    xerror       xstd
## 1 0.05423988      0 1.0000000 1.0000000 0.02368846
## 2 0.01184110      3 0.7914439 0.7983193 0.02192312
## 3 0.01000000      5 0.7677617 0.8090145 0.02202970

    ##b-2-2. predict
    # Evaluate the unpruned rpart tree on the hold-out set.
    rpart_tree_pred <- predict(rpart_tree, newdata = te_raw, type = "class")
    confusionMatrix(data = rpart_tree_pred, reference = te_raw$churn)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  1404  296
##        Yes  148  264
##                                          
##                Accuracy : 0.7898         
##                  95% CI : (0.7718, 0.807)
##     No Information Rate : 0.7348         
##     P-Value [Acc > NIR] : 2.764e-09      
##                                          
##                   Kappa : 0.4108         
##                                          
##  Mcnemar's Test P-Value : 3.030e-12      
##                                          
##             Sensitivity : 0.9046         
##             Specificity : 0.4714         
##          Pos Pred Value : 0.8259         
##          Neg Pred Value : 0.6408         
##              Prevalence : 0.7348         
##          Detection Rate : 0.6648         
##    Detection Prevalence : 0.8049         
##       Balanced Accuracy : 0.6880         
##                                          
##        'Positive' Class : No             
##

    ##b-2-3. prune: the best cp value has to be found first
    ## (tree optimisation, the counterpart of cv.tree above)
    printcp(x = rpart_tree)

## 
## Classification tree:
## rpart(formula = churn ~ ., data = tr_raw, method = "class")
## 
## Variables actually used in tree construction:
## [1] contract        internetservice tenure         
## 
## Root node error: 1309/4931 = 0.26546
## 
## n= 4931 
## 
##         CP nsplit rel error  xerror     xstd
## 1 0.054240      0   1.00000 1.00000 0.023688
## 2 0.011841      3   0.79144 0.79832 0.021923
## 3 0.010000      5   0.76776 0.80901 0.022030

    # Plot cross-validated error against cp -- 4 is the best size here too.
    plotcp(x = rpart_tree)

    # Prune at the CP value of the cptable row with the smallest
    # cross-validated error (xerror).
    prune_rpart_tree <- prune(
      rpart_tree,
      cp = rpart_tree$cptable[which.min(rpart_tree$cptable[, "xerror"]), "CP"]
    )
    summary(prune_rpart_tree)

## Call:
## rpart(formula = churn ~ ., data = tr_raw, method = "class")
##   n= 4931 
## 
##           CP nsplit rel error    xerror       xstd
## 1 0.05423988      0 1.0000000 1.0000000 0.02368846
## 2 0.01184110      3 0.7914439 0.7983193 0.02192312
## 
## Variable importance
##         contract           tenure     totalcharges      techsupport 
##               23               17               13               11 
##   onlinesecurity   monthlycharges deviceprotection  internetservice 
##               10                7                7                7 
##    multiplelines          partner     onlinebackup 
##                3                1                1 
## 
## Node number 1: 4931 observations,    complexity param=0.05423988
##   predicted class=No   expected loss=0.2654634  P(node) =1
##     class counts:  3622  1309
##    probabilities: 0.735 0.265 
##   left son=2 (2212 obs) right son=3 (2719 obs)
##   Primary splits:
##       contract        splits as  RLL,          improve=326.4672, (0 missing)
##       onlinesecurity  splits as  RLL,          improve=229.0817, (0 missing)
##       techsupport     splits as  RLL,          improve=215.8435, (0 missing)
##       tenure          < 16.5     to the right, improve=202.0216, (0 missing)
##       internetservice splits as  LRL,          improve=183.0848, (0 missing)
##   Surrogate splits:
##       tenure           < 34.5     to the right, agree=0.789, adj=0.529, (0 split)
##       techsupport      splits as  RLL,          agree=0.718, adj=0.371, (0 split)
##       onlinesecurity   splits as  RLL,          agree=0.706, adj=0.344, (0 split)
##       totalcharges     < 3000.775 to the right, agree=0.696, adj=0.323, (0 split)
##       deviceprotection splits as  RLL,          agree=0.691, adj=0.311, (0 split)
## 
## Node number 2: 2212 observations
##   predicted class=No   expected loss=0.06374322  P(node) =0.4485905
##     class counts:  2071   141
##    probabilities: 0.936 0.064 
## 
## Node number 3: 2719 observations,    complexity param=0.05423988
##   predicted class=No   expected loss=0.4295697  P(node) =0.5514095
##     class counts:  1551  1168
##    probabilities: 0.570 0.430 
##   left son=6 (1227 obs) right son=7 (1492 obs)
##   Primary splits:
##       internetservice splits as  LRL,          improve=92.09919, (0 missing)
##       onlinesecurity  splits as  RLL,          improve=76.17279, (0 missing)
##       monthlycharges  < 69.175   to the left,  improve=72.86882, (0 missing)
##       techsupport     splits as  RLL,          improve=64.30651, (0 missing)
##       paymentmethod   splits as  LLRL,         improve=52.71926, (0 missing)
##   Surrogate splits:
##       monthlycharges < 68.975   to the left,  agree=0.972, adj=0.937, (0 split)
##       onlinesecurity splits as  RLL,          agree=0.697, adj=0.328, (0 split)
##       multiplelines  splits as  LLR,          agree=0.696, adj=0.326, (0 split)
##       totalcharges   < 781.325  to the left,  agree=0.688, adj=0.308, (0 split)
##       techsupport    splits as  RLL,          agree=0.687, adj=0.307, (0 split)
## 
## Node number 6: 1227 observations
##   predicted class=No   expected loss=0.2860636  P(node) =0.2488339
##     class counts:   876   351
##    probabilities: 0.714 0.286 
## 
## Node number 7: 1492 observations,    complexity param=0.05423988
##   predicted class=Yes  expected loss=0.4524129  P(node) =0.3025755
##     class counts:   675   817
##    probabilities: 0.452 0.548 
##   left son=14 (795 obs) right son=15 (697 obs)
##   Primary splits:
##       tenure         < 14.5     to the right, improve=57.49984, (0 missing)
##       totalcharges   < 1556.25  to the right, improve=51.85579, (0 missing)
##       paymentmethod  splits as  LLRR,         improve=15.45871, (0 missing)
##       onlinesecurity splits as  R-L,          improve=14.74208, (0 missing)
##       techsupport    splits as  R-L,          improve=10.76805, (0 missing)
##   Surrogate splits:
##       totalcharges   < 1267.025 to the right, agree=0.975, adj=0.945, (0 split)
##       monthlycharges < 82.975   to the right, agree=0.662, adj=0.275, (0 split)
##       multiplelines  splits as  R-L,          agree=0.634, adj=0.217, (0 split)
##       partner        splits as  RL,           agree=0.617, adj=0.179, (0 split)
##       onlinebackup   splits as  R-L,          agree=0.611, adj=0.168, (0 split)
## 
## Node number 14: 795 observations
##   predicted class=No   expected loss=0.4176101  P(node) =0.1612249
##     class counts:   463   332
##    probabilities: 0.582 0.418 
## 
## Node number 15: 697 observations
##   predicted class=Yes  expected loss=0.3041607  P(node) =0.1413506
##     class counts:   212   485
##    probabilities: 0.304 0.696

    # Base-graphics drawing of the pruned rpart tree plus its split labels.
    plot(prune_rpart_tree)
    text(prune_rpart_tree)

    # Same tree drawn with rpart.plot. TRUE is spelled out because `T` is an
    # ordinary variable that can be reassigned.
    rpart.plot::prp(prune_rpart_tree, type = 2, extra = 103, fallen.leaves = TRUE)

    ##b-2-4. predict with the pruned tree
    # Class predictions on the hold-out set, compared with the true labels.
    rpart_tree_pr_pred <- predict(prune_rpart_tree, te_raw, type = "class")
    confusionMatrix(rpart_tree_pr_pred, te_raw$churn)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  1464  353
##        Yes   88  207
##                                           
##                Accuracy : 0.7912          
##                  95% CI : (0.7732, 0.8084)
##     No Information Rate : 0.7348          
##     P-Value [Acc > NIR] : 1.074e-09       
##                                           
##                   Kappa : 0.3687          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9433          
##             Specificity : 0.3696          
##          Pos Pred Value : 0.8057          
##          Neg Pred Value : 0.7017          
##              Prevalence : 0.7348          
##          Detection Rate : 0.6932          
##    Detection Prevalence : 0.8603          
##       Balanced Accuracy : 0.6565          
##                                           
##        'Positive' Class : No              
##

    ##b-2-5. graph package rpart 조금 예쁜 그래프 그리자!
    #install.packages("rattle")
    library(rattle)

## Warning: package 'rattle' was built under R version 3.6.3

## Rattle: A free graphical interface for data science with R.
## Version 5.3.0 Copyright (c) 2006-2018 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.

    # Prettier drawing of the pruned rpart tree via rattle.
    fancyRpartPlot(prune_rpart_tree)

  ##b-3-1. modelling with ctree: no pruning needed
    # Unbiased recursive partitioning based on permutation tests.
    # The split variable is chosen by the significance (p-value) of the test.
    # Input variables are limited to 31 levels.
  #library(party)
  party_tree <- ctree(formula = churn ~ ., data = tr_raw)
  plot(party_tree)

    ##b-3-2. predict
    # Predictions on the hold-out set, compared with the true labels.
    party_tree_pred <- predict(party_tree, newdata = te_raw)
    confusionMatrix(data = party_tree_pred, reference = te_raw$churn)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  1435  337
##        Yes  117  223
##                                           
##                Accuracy : 0.785           
##                  95% CI : (0.7669, 0.8024)
##     No Information Rate : 0.7348          
##     P-Value [Acc > NIR] : 5.396e-08       
##                                           
##                   Kappa : 0.3692          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9246          
##             Specificity : 0.3982          
##          Pos Pred Value : 0.8098          
##          Neg Pred Value : 0.6559          
##              Prevalence : 0.7348          
##          Detection Rate : 0.6795          
##    Detection Prevalence : 0.8390          
##       Balanced Accuracy : 0.6614          
##                                           
##        'Positive' Class : No              
##

    ##b-3-3. try a different tree-depth setting -- turned out no different
      # ctree_control() allows fine-grained tuning of the parameters.
    # Evaluated on its own, this call only builds and prints a control
    # object; the result is not stored, so it changes no fitted model unless
    # it is handed to ctree() through its `controls` argument.
    party::ctree_control(maxdepth = 4)

## An object of class "TreeControl"
## Slot "varctrl":
## An object of class "VariableControl"
## Slot "teststat":
## [1] quad
## Levels: max quad
## 
## Slot "pvalue":
## [1] TRUE
## 
## Slot "tol":
## [1] 1e-10
## 
## Slot "maxpts":
## [1] 25000
## 
## Slot "abseps":
## [1] 1e-04
## 
## Slot "releps":
## [1] 0
## 
## 
## Slot "splitctrl":
## An object of class "SplitControl"
## Slot "minprob":
## [1] 0.01
## 
## Slot "minsplit":
## [1] 20
## 
## Slot "minbucket":
## [1] 7
## 
## Slot "tol":
## [1] 1e-10
## 
## Slot "maxsurrogate":
## [1] 0
## 
## 
## Slot "gtctrl":
## An object of class "GlobalTestControl"
## Slot "testtype":
## [1] Bonferroni
## Levels: Bonferroni MonteCarlo Aggregated Univariate Teststatistic
## 
## Slot "nresample":
## [1] 9999
## 
## Slot "randomsplits":
## [1] FALSE
## 
## Slot "mtry":
## [1] 0
## 
## Slot "mincriterion":
## [1] 0.95
## 
## 
## Slot "tgctrl":
## An object of class "TreeGrowControl"
## Slot "stump":
## [1] FALSE
## 
## Slot "maxdepth":
## [1] 4
## 
## Slot "savesplitstats":
## [1] TRUE
## 
## Slot "remove_weights":
## [1] FALSE

    # Fit the depth-limited conditional inference tree. The control object
    # must be passed through `controls`; without it ctree() falls back to its
    # defaults and the model is the same as the unrestricted `party_tree`.
    # NOTE: the confusion matrix recorded below in this document was produced
    # without the control (it is identical to the b-3-2 result) and has to be
    # regenerated.
    party_tree_m4 <- ctree(
      churn ~ .,
      data = tr_raw,
      controls = ctree_control(maxdepth = 4)
    )
    plot(party_tree_m4)

    # Predictions of the depth-limited tree on the hold-out set.
    party_tree_pred_m4 <- predict(party_tree_m4, te_raw)
    confusionMatrix(party_tree_pred_m4, te_raw$churn)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  1435  337
##        Yes  117  223
##                                           
##                Accuracy : 0.785           
##                  95% CI : (0.7669, 0.8024)
##     No Information Rate : 0.7348          
##     P-Value [Acc > NIR] : 5.396e-08       
##                                           
##                   Kappa : 0.3692          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9246          
##             Specificity : 0.3982          
##          Pos Pred Value : 0.8098          
##          Neg Pred Value : 0.6559          
##              Prevalence : 0.7348          
##          Detection Rate : 0.6795          
##    Detection Prevalence : 0.8390          
##       Balanced Accuracy : 0.6614          
##                                           
##        'Positive' Class : No              
##

  ##b-4-1. modelling with C5.0
  #library(C50)
  # Predictors are every column except the outcome. The outcome is removed
  # by name rather than by position, so the call keeps working if columns
  # are added, dropped or reordered upstream.
  c5_tree <- C5.0(
    tr_raw[names(tr_raw) != "churn"],
    tr_raw$churn,
    trials = 10,   # number of boosting iterations
    costs = NULL   # no misclassification costs in this run
  )
    # With 10 trials the error rate on test data is said to drop by about 25 %.
  c5_tree

## 
## Call:
## C5.0.default(x = tr_raw[-19], y = tr_raw$churn, trials = 10, costs = NULL)
## 
## Classification Tree
## Number of samples: 4931 
## Number of predictors: 19 
## 
## Number of boosting iterations: 10 
## Average tree size: 8.1 
## 
## Non-standard options: attempt to group attributes

  # Full report: the tree of every boosting trial, training-set errors and
  # attribute usage.
  summary(object = c5_tree)

## 
## Call:
## C5.0.default(x = tr_raw[-19], y = tr_raw$churn, trials = 10, costs = NULL)
## 
## 
## C5.0 [Release 2.07 GPL Edition]      Sat May 09 16:55:22 2020
## -------------------------------
## 
## Class specified by attribute `outcome'
## 
## Read 4931 cases (20 attributes) from undefined.data
## 
## -----  Trial 0:  -----
## 
## Decision tree:
## 
## contract in {One year,Two year}: No (2212/141)
## contract = Month-to-month:
## :...internetservice in {DSL,No}:
##     :...tenure > 4: No (751/144)
##     :   tenure <= 4:
##     :   :...senior = junior: No (438/175)
##     :       senior = senior: Yes (38/6)
##     internetservice = Fiber optic:
##     :...tenure > 14:
##         :...tenure > 51: No (148/37)
##         :   tenure <= 51:
##         :   :...multiplelines in {No,No phone service}: No (218/79)
##         :       multiplelines = Yes:
##         :       :...paymentmethod in {Bank transfer (automatic),
##         :           :                 Credit card (automatic),
##         :           :                 Mailed check}: No (175/72)
##         :           paymentmethod = Electronic check: Yes (254/110)
##         tenure <= 14:
##         :...tenure <= 1: Yes (151/19)
##             tenure > 1:
##             :...streamingtv in {No internet service,Yes}: Yes (216/51)
##                 streamingtv = No:
##                 :...onlinesecurity = No internet service: Yes (0)
##                     onlinesecurity = Yes: No (40/14)
##                     onlinesecurity = No:
##                     :...multiplelines in {No phone service,
##                         :                 Yes}: Yes (138/40)
##                         multiplelines = No:
##                         :...techsupport in {No,
##                             :               No internet service}: No (141/67)
##                             techsupport = Yes: Yes (11/2)
## 
## -----  Trial 1:  -----
## 
## Decision tree:
## 
## contract in {One year,Two year}: No (1930.1/252.1)
## contract = Month-to-month:
## :...tenure <= 5: Yes (1015.4/377.4)
##     tenure > 5:
##     :...internetservice in {DSL,No}: No (704.5/237.8)
##         internetservice = Fiber optic:
##         :...techsupport = No internet service: Yes (0)
##             techsupport = Yes: No (238.8/100.7)
##             techsupport = No:
##             :...multiplelines in {No,No phone service}: Yes (359.9/149.3)
##                 multiplelines = Yes:
##                 :...monthlycharges <= 95.5: No (455.3/194.5)
##                     monthlycharges > 95.5: Yes (227.1/87.3)
## 
## -----  Trial 2:  -----
## 
## Decision tree:
## 
## contract in {One year,Two year}: No (1740.5/315.9)
## contract = Month-to-month:
## :...onlinesecurity in {No internet service,Yes}: No (978/367.1)
##     onlinesecurity = No:
##     :...phoneservice = No: Yes (229.4/87.7)
##         phoneservice = Yes:
##         :...tenure <= 1: Yes (257/92.5)
##             tenure > 1:
##             :...paperlessbilling = No: No (399.6/173.4)
##                 paperlessbilling = Yes: Yes (1326.5/622)
## 
## -----  Trial 3:  -----
## 
## Decision tree:
## 
## techsupport = No internet service: No (844.7/171.1)
## techsupport in {No,Yes}:
## :...tenure > 16: No (2427.5/882.2)
##     tenure <= 16:
##     :...monthlycharges > 97.45: Yes (90.5/24.2)
##         monthlycharges <= 97.45:
##         :...contract in {One year,Two year}: No (47.7/10.3)
##             contract = Month-to-month:
##             :...multiplelines = No: No (928.9/444.7)
##                 multiplelines in {No phone service,Yes}: Yes (591.8/257.1)
## 
## -----  Trial 4:  -----
## 
## Decision tree:
## 
## contract = Two year: No (717.8/97.8)
## contract in {Month-to-month,One year}:
## :...internetservice = No: No (548.6/183.8)
##     internetservice = DSL:
##     :...totalcharges > 1195.95: No (438.4/143.2)
##     :   totalcharges <= 1195.95:
##     :   :...onlinebackup in {No,No internet service}: Yes (711.6/327.2)
##     :       onlinebackup = Yes: No (188.2/76.5)
##     internetservice = Fiber optic:
##     :...tenure <= 1: Yes (151.7/44.4)
##         tenure > 1:
##         :...streamingmovies in {No internet service,Yes}: Yes (1203.4/531.3)
##             streamingmovies = No:
##             :...contract = One year: No (67.7/23)
##                 contract = Month-to-month:
##                 :...paymentmethod in {Bank transfer (automatic),
##                     :                 Credit card (automatic),
##                     :                 Electronic check}: Yes (812.9/387.2)
##                     paymentmethod = Mailed check: No (90.6/30.9)
## 
## -----  Trial 5:  -----
## 
## Decision tree:
## 
## tenure <= 1: Yes (527.6/219.9)
## tenure > 1: No (4403.4/1730)
## 
## -----  Trial 6:  -----
## 
## Decision tree:
## 
## contract = Two year: No (522.4)
## contract = One year:
## :...monthlycharges <= 99.15: No (429.2/51.5)
## :   monthlycharges > 99.15: Yes (253.5/103)
## contract = Month-to-month:
## :...tenure > 55: No (158.7/60.5)
##     tenure <= 55:
##     :...internetservice = Fiber optic: Yes (2051.9/889.2)
##         internetservice = No: No (344.4/123.6)
##         internetservice = DSL:
##         :...totalcharges > 1175.6: No (139.8/41.5)
##             totalcharges <= 1175.6:
##             :...paymentmethod in {Bank transfer (automatic),
##                 :                 Mailed check}: No (426.7/191.7)
##                 paymentmethod in {Credit card (automatic),
##                                   Electronic check}: Yes (499.3/221.8)
## 
## -----  Trial 7:  -----
## 
## Decision tree:
## 
## contract in {One year,Two year}: No (1109.4/135)
## contract = Month-to-month:
## :...tenure <= 2: Yes (769.3/327.5)
##     tenure > 2:
##     :...internetservice in {DSL,No}: No (813.3/269.6)
##         internetservice = Fiber optic:
##         :...streamingtv in {No,No internet service}: No (1018.7/434.7)
##             streamingtv = Yes:
##             :...tenure <= 14: Yes (256.5/100.2)
##                 tenure > 14: No (777.8/368.5)
## 
## -----  Trial 8:  -----
## 
## Decision tree:
## 
## contract in {One year,Two year}: No (886.5)
## contract = Month-to-month:
## :...internetservice = No: No (382.5/106.4)
##     internetservice = DSL:
##     :...totalcharges > 1175.6: No (91.4)
##     :   totalcharges <= 1175.6:
##     :   :...monthlycharges > 55.25: No (184.3/45)
##     :       monthlycharges <= 55.25:
##     :       :...deviceprotection = No: No (558.9/270.3)
##     :           deviceprotection in {No internet service,Yes}: Yes (84.4/24.2)
##     internetservice = Fiber optic:
##     :...tenure <= 1: Yes (73.2)
##         tenure > 1:
##         :...tenure > 51: No (171.9/62)
##             tenure <= 51:
##             :...multiplelines in {No phone service,Yes}: Yes (1209.2/465.4)
##                 multiplelines = No:
##                 :...totalcharges <= 3478.15: Yes (848.4/382.2)
##                     totalcharges > 3478.15: No (52.3/13.9)
## 
## -----  Trial 9:  -----
## 
## Decision tree:
## 
## contract in {One year,Two year}: No (782.7)
## contract = Month-to-month:
## :...onlinesecurity = No internet service: No (249.7/5.9)
##     onlinesecurity in {No,Yes}:
##     :...tenure <= 5: Yes (946.2/314)
##         tenure > 5:
##         :...tenure > 55: No (70.7)
##             tenure <= 55:
##             :...internetservice in {DSL,No}: No (338/53.7)
##                 internetservice = Fiber optic:
##                 :...tenure <= 15: Yes (499.9/199.5)
##                     tenure > 15:
##                     :...paperlessbilling = No: No (255.4/66.4)
##                         paperlessbilling = Yes:
##                         :...techsupport in {No internet service,
##                             :               Yes}: No (259.6/77)
##                             techsupport = No:
##                             :...monthlycharges <= 95.4: No (729/285.8)
##                                 monthlycharges > 95.4: Yes (235.7/76.8)
## 
## 
## Evaluation on training data (4931 cases):
## 
## Trial        Decision Tree   
## -----      ----------------  
##    Size      Errors  
## 
##    0     14  957(19.4%)
##    1      7 1179(23.9%)
##    2      6 1181(24.0%)
##    3      6 1145(23.2%)
##    4     10 1405(28.5%)
##    5      2 1216(24.7%)
##    6      9 1243(25.2%)
##    7      6 1088(22.1%)
##    8     11 1069(21.7%)
##    9     10  986(20.0%)
## boost            938(19.0%)   <<
## 
## 
##     (a)   (b)    <-classified as
##    ----  ----
##    3299   323    (a): class No
##     615   694    (b): class Yes
## 
## 
##  Attribute usage:
## 
##  100.00% tenure
##  100.00% techsupport
##  100.00% contract
##   75.95% internetservice
##   60.09% monthlycharges
##   55.14% onlinesecurity
##   40.42% multiplelines
##   37.48% phoneservice
##   35.39% totalcharges
##   34.76% streamingmovies
##   31.49% paperlessbilling
##   31.27% paymentmethod
##   27.20% streamingtv
##   14.07% onlinebackup
##    9.65% senior
##    9.27% deviceprotection
## 
## 
## Time: 0.2 secs

    ##b-4-2. predict
    # Class predictions of the boosted C5.0 model on the hold-out set.
    c5_tree_pred <- predict(c5_tree, newdata = te_raw, type = "class")
    confusionMatrix(data = c5_tree_pred, reference = te_raw$churn)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  1407  281
##        Yes  145  279
##                                           
##                Accuracy : 0.7983          
##                  95% CI : (0.7805, 0.8152)
##     No Information Rate : 0.7348          
##     P-Value [Acc > NIR] : 6.491e-12       
##                                           
##                   Kappa : 0.4388          
##                                           
##  Mcnemar's Test P-Value : 6.120e-11       
##                                           
##             Sensitivity : 0.9066          
##             Specificity : 0.4982          
##          Pos Pred Value : 0.8335          
##          Neg Pred Value : 0.6580          
##              Prevalence : 0.7348          
##          Detection Rate : 0.6662          
##    Detection Prevalence : 0.7992          
##       Balanced Accuracy : 0.7024          
##                                           
##        'Positive' Class : No              
##

            #      Reference
            # Prediction   No  Yes
            #        No  1439  298
            #        Yes  113  262
    
    ##b-4-3. add a costs matrix
      # matrix_dimensions <- list(c("no", "yes"), c("no", "yes"))
      # names(matrix_dimensions) <- c("pre", "act")
      # matrix_dimensions 
      # The costs matrix can also be built this way, but the approach below is cleaner.
    
      # 2 x 2 misclassification costs, filled column by column. Per the C5.0
      # documentation rows are predicted classes and columns are true classes,
      # so the 4 is the cost of predicting "No" for a customer who actually
      # churned ("Yes"), and the 1 is the cost of the opposite error.
      # The dimnames must match the levels of tr_raw$churn exactly.
      error_cost <- matrix(
        c(0, 1, 4, 0),
        nrow = 2,
        dimnames = list(c("No", "Yes"), c("No", "Yes"))
      )
    
      # c5_tree_cost <- C5.0(tr_raw[,-19], tr_raw$churn, trials = 10, costs = error_cost)
      # c5_tree_cost_pred <- predict(c5_tree_cost, te_raw, type = "class")
      # Using costs raised an error. The cost matrix itself looked fine, so what was wrong?
      # The colnames: tr_raw$churn has "No"/"Yes" but the costs used "no"/"yes". Works after the change.
    
      # Outcome column dropped by name instead of by the hard-coded position 19.
      c5_tree_cost <- C5.0(
        tr_raw[names(tr_raw) != "churn"],
        tr_raw$churn,
        trials = 10,
        costs = error_cost
      )
      # Evaluate the cost-sensitive model on the hold-out set.
      c5_tree_cost_pred <- predict(c5_tree_cost, te_raw, type = "class")
      confusionMatrix(c5_tree_cost_pred, te_raw$churn)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  1420  284
##        Yes  132  276
##                                           
##                Accuracy : 0.803           
##                  95% CI : (0.7854, 0.8198)
##     No Information Rate : 0.7348          
##     P-Value [Acc > NIR] : 1.503e-13       
##                                           
##                   Kappa : 0.4465          
##                                           
##  Mcnemar's Test P-Value : 1.328e-13       
##                                           
##             Sensitivity : 0.9149          
##             Specificity : 0.4929          
##          Pos Pred Value : 0.8333          
##          Neg Pred Value : 0.6765          
##              Prevalence : 0.7348          
##          Detection Rate : 0.6723          
##    Detection Prevalence : 0.8068          
##       Balanced Accuracy : 0.7039          
##                                           
##        'Positive' Class : No              
##

          #        Reference
          # Prediction   No  Yes
          #        No  1418  270
          #        Yes  134  290