library(dplyr)
## Warning: package 'dplyr' was built under R version 3.6.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(C50)
## Warning: package 'C50' was built under R version 3.6.1
library(tree)
## Warning: package 'tree' was built under R version 3.6.1
## Registered S3 method overwritten by 'tree':
## method from
## print.tree cli
library(e1071)
## Warning: package 'e1071' was built under R version 3.6.1
library(caret)
## Warning: package 'caret' was built under R version 3.6.1
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.6.3
library(tree)
library(rpart)
## Warning: package 'rpart' was built under R version 3.6.1
library(party)
## Warning: package 'party' was built under R version 3.6.1
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Warning: package 'strucchange' was built under R version 3.6.1
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 3.6.1
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
## Warning: package 'sandwich' was built under R version 3.6.1
library(C50)
## 1. (0509) Decision trees on the tele_churn data
# stringsAsFactors = TRUE is spelled out because read.csv() only defaulted to
# it before R 4.0.0. The summary()/glimpse() output recorded in this file shows
# the categorical columns (including churn) as factors, and the rest of the
# script relies on that.
raw <- read.csv("tele_churn.csv", stringsAsFactors = TRUE)
# Lower-case every column name so later code can refer to them uniformly.
names(raw) <- tolower(names(raw))
summary(raw)
## customerid gender seniorcitizen partner dependents
## 0002-ORFBO: 1 Female:3488 Min. :0.0000 No :3641 No :4933
## 0003-MKNFE: 1 Male :3555 1st Qu.:0.0000 Yes:3402 Yes:2110
## 0004-TLHLJ: 1 Median :0.0000
## 0011-IGKFF: 1 Mean :0.1621
## 0013-EXCHZ: 1 3rd Qu.:0.0000
## 0013-MHZWF: 1 Max. :1.0000
## (Other) :7037
## tenure phoneservice multiplelines internetservice
## Min. : 0.00 No : 682 No :3390 DSL :2421
## 1st Qu.: 9.00 Yes:6361 No phone service: 682 Fiber optic:3096
## Median :29.00 Yes :2971 No :1526
## Mean :32.37
## 3rd Qu.:55.00
## Max. :72.00
##
## onlinesecurity onlinebackup
## No :3498 No :3088
## No internet service:1526 No internet service:1526
## Yes :2019 Yes :2429
##
##
##
##
## deviceprotection techsupport
## No :3095 No :3473
## No internet service:1526 No internet service:1526
## Yes :2422 Yes :2044
##
##
##
##
## streamingtv streamingmovies
## No :2810 No :2785
## No internet service:1526 No internet service:1526
## Yes :2707 Yes :2732
##
##
##
##
## contract paperlessbilling paymentmethod
## Month-to-month:3875 No :2872 Bank transfer (automatic):1544
## One year :1473 Yes:4171 Credit card (automatic) :1522
## Two year :1695 Electronic check :2365
## Mailed check :1612
##
##
##
## monthlycharges totalcharges churn
## Min. : 18.25 Min. : 18.8 No :5174
## 1st Qu.: 35.50 1st Qu.: 401.4 Yes:1869
## Median : 70.35 Median :1397.5
## Mean : 64.76 Mean :2283.3
## 3rd Qu.: 89.85 3rd Qu.:3794.7
## Max. :118.75 Max. :8684.8
## NA's :11
# Column-by-column preview of raw: the type and leading values of each column.
glimpse(raw)
## Rows: 7,043
## Columns: 21
## $ customerid <fct> 7590-VHVEG, 5575-GNVDE, 3668-QPYBK, 7795-CFOC...
## $ gender <fct> Female, Male, Male, Male, Female, Female, Mal...
## $ seniorcitizen <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ partner <fct> Yes, No, No, No, No, No, No, No, Yes, No, Yes...
## $ dependents <fct> No, No, No, No, No, No, Yes, No, No, Yes, Yes...
## $ tenure <int> 1, 34, 2, 45, 2, 8, 22, 10, 28, 62, 13, 16, 5...
## $ phoneservice <fct> No, Yes, Yes, No, Yes, Yes, Yes, No, Yes, Yes...
## $ multiplelines <fct> No phone service, No, No, No phone service, N...
## $ internetservice <fct> DSL, DSL, DSL, DSL, Fiber optic, Fiber optic,...
## $ onlinesecurity <fct> No, Yes, Yes, Yes, No, No, No, Yes, No, Yes, ...
## $ onlinebackup <fct> Yes, No, Yes, No, No, No, Yes, No, No, Yes, N...
## $ deviceprotection <fct> No, Yes, No, Yes, No, Yes, No, No, Yes, No, N...
## $ techsupport <fct> No, No, No, Yes, No, No, No, No, Yes, No, No,...
## $ streamingtv <fct> No, No, No, No, No, Yes, Yes, No, Yes, No, No...
## $ streamingmovies <fct> No, No, No, No, No, Yes, No, No, Yes, No, No,...
## $ contract <fct> Month-to-month, One year, Month-to-month, One...
## $ paperlessbilling <fct> Yes, No, Yes, No, Yes, Yes, Yes, No, Yes, No,...
## $ paymentmethod <fct> Electronic check, Mailed check, Mailed check,...
## $ monthlycharges <dbl> 29.85, 56.95, 53.85, 42.30, 70.70, 99.65, 89....
## $ totalcharges <dbl> 29.85, 1889.50, 108.15, 1840.75, 151.65, 820....
## $ churn <fct> No, No, Yes, No, Yes, Yes, No, No, Yes, No, N...
# Observation: a lot of the columns are factors; the charge columns are doubles.
## The goal is only to run every model, so careful preprocessing is skipped.
## The 11 rows with a missing totalcharges (see summary(raw)) are dropped here,
## as the original note intended; previously they were silently kept.
## NOTE(review): the console output recorded further down was produced before
## this filter existed, so the counts will shift slightly on a re-run.
raw2 <- raw %>%
  filter(!is.na(totalcharges)) %>%
  # Recode the 0/1 seniorcitizen flag as a labelled factor.
  mutate(senior = as.factor(ifelse(seniorcitizen == 0, "junior", "senior"))) %>%
  # customerid is an identifier and seniorcitizen is replaced by senior.
  select(-c(customerid, seniorcitizen))
# View() opens a data viewer, so only call it in an interactive session.
if (interactive()) {
  View(head(raw2))
}
## a. Sampling: 70/30 train/test split
set.seed(123)
# Plain random sampling alternative, kept for reference:
# idx <- sample(1:nrow(raw2), size = nrow(raw2)*0.7)
# caret partition on the churn outcome; list = FALSE returns row indices that
# can be used directly for subsetting.
idx <- createDataPartition(raw2$churn, p = 0.7, list = FALSE)
tr_raw <- raw2[idx, ]
te_raw <- raw2[-idx, ]
## b-1-1. Modeling with tree: binary recursive partitioning
# library(tree)
# Author's notes: entropy is used as the impurity measure; the more impure a
# node, the higher its entropy, which lies between 0 and 1.
# NOTE(review): the split criterion is whatever tree() defaults to - confirm.
tree_m <- tree(churn ~ ., data = tr_raw)
plot(tree_m)
text(tree_m)

## b-1-2. CV tree (cross-validation): find the best number of leaves
cv_tree <- cv.tree(tree_m, FUN = prune.misclass)
str(cv_tree)
## List of 4
## $ size : int [1:3] 6 4 1
## $ dev : num [1:3] 1048 1048 1309
## $ k : num [1:3] -Inf 0 91
## $ method: chr "misclass"
## - attr(*, "class")= chr [1:2] "prune" "tree.sequence"
cv_tree[["size"]] # tree sizes
## [1] 6 4 1
cv_tree[["dev"]] # number of misclassifications
## [1] 1048 1048 1309
plot(cv_tree) # author's reading of the plot: size 4

## b-1-3. Prune
# best = 4 is the size the author read off the cross-validation result.
prune_tree <- prune.misclass(tree = tree_m, best = 4)
plot(prune_tree)
text(x = prune_tree, pretty = 0)

## b-1-4. Predict
# Class predictions of the pruned tree on the hold-out set, scored against
# the observed churn labels.
tree_pred <- predict(prune_tree, newdata = te_raw, type = "class")
confusionMatrix(data = tree_pred, reference = te_raw$churn)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 1464 353
## Yes 88 207
##
## Accuracy : 0.7912
## 95% CI : (0.7732, 0.8084)
## No Information Rate : 0.7348
## P-Value [Acc > NIR] : 1.074e-09
##
## Kappa : 0.3687
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9433
## Specificity : 0.3696
## Pos Pred Value : 0.8057
## Neg Pred Value : 0.7017
## Prevalence : 0.7348
## Detection Rate : 0.6932
## Detection Prevalence : 0.8603
## Balanced Accuracy : 0.6565
##
## 'Positive' Class : No
##
## b-2-1. Modeling with rpart: uses Gini, pruning is required
# library(rpart)
# Classification and regression trees (CART); splits move toward lower Gini.
rpart_tree <- rpart(formula = churn ~ ., data = tr_raw, method = "class")
# summary(rpart_tree)
# print(rpart_tree)
plot(rpart_tree)
text(rpart_tree)

# Complexity-parameter table of the fitted tree.
rpart_tree[["cptable"]]
## CP nsplit rel error xerror xstd
## 1 0.05423988 0 1.0000000 1.0000000 0.02368846
## 2 0.01184110 3 0.7914439 0.7983193 0.02192312
## 3 0.01000000 5 0.7677617 0.8090145 0.02202970
## b-2-2. Predict
# Class predictions of the unpruned rpart tree on the hold-out set.
rpart_tree_pred <- predict(rpart_tree, newdata = te_raw, type = "class")
confusionMatrix(data = rpart_tree_pred, reference = te_raw$churn)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 1404 296
## Yes 148 264
##
## Accuracy : 0.7898
## 95% CI : (0.7718, 0.807)
## No Information Rate : 0.7348
## P-Value [Acc > NIR] : 2.764e-09
##
## Kappa : 0.4108
##
## Mcnemar's Test P-Value : 3.030e-12
##
## Sensitivity : 0.9046
## Specificity : 0.4714
## Pos Pred Value : 0.8259
## Neg Pred Value : 0.6408
## Prevalence : 0.7348
## Detection Rate : 0.6648
## Detection Prevalence : 0.8049
## Balanced Accuracy : 0.6880
##
## 'Positive' Class : No
##
## b-2-3. Prune: find the best cp value (tree optimisation, like cv.tree)
printcp(x = rpart_tree)
##
## Classification tree:
## rpart(formula = churn ~ ., data = tr_raw, method = "class")
##
## Variables actually used in tree construction:
## [1] contract internetservice tenure
##
## Root node error: 1309/4931 = 0.26546
##
## n= 4931
##
## CP nsplit rel error xerror xstd
## 1 0.054240 0 1.00000 1.00000 0.023688
## 2 0.011841 3 0.79144 0.79832 0.021923
## 3 0.010000 5 0.76776 0.80901 0.022030
plotcp(rpart_tree) # author's reading: 4 is the best here as well

# Prune at the CP of the cptable row with the smallest cross-validated error.
prune_rpart_tree <- prune(
  rpart_tree,
  cp = rpart_tree$cptable[which.min(rpart_tree$cptable[, "xerror"]), "CP"]
)
summary(prune_rpart_tree)
## Call:
## rpart(formula = churn ~ ., data = tr_raw, method = "class")
## n= 4931
##
## CP nsplit rel error xerror xstd
## 1 0.05423988 0 1.0000000 1.0000000 0.02368846
## 2 0.01184110 3 0.7914439 0.7983193 0.02192312
##
## Variable importance
## contract tenure totalcharges techsupport
## 23 17 13 11
## onlinesecurity monthlycharges deviceprotection internetservice
## 10 7 7 7
## multiplelines partner onlinebackup
## 3 1 1
##
## Node number 1: 4931 observations, complexity param=0.05423988
## predicted class=No expected loss=0.2654634 P(node) =1
## class counts: 3622 1309
## probabilities: 0.735 0.265
## left son=2 (2212 obs) right son=3 (2719 obs)
## Primary splits:
## contract splits as RLL, improve=326.4672, (0 missing)
## onlinesecurity splits as RLL, improve=229.0817, (0 missing)
## techsupport splits as RLL, improve=215.8435, (0 missing)
## tenure < 16.5 to the right, improve=202.0216, (0 missing)
## internetservice splits as LRL, improve=183.0848, (0 missing)
## Surrogate splits:
## tenure < 34.5 to the right, agree=0.789, adj=0.529, (0 split)
## techsupport splits as RLL, agree=0.718, adj=0.371, (0 split)
## onlinesecurity splits as RLL, agree=0.706, adj=0.344, (0 split)
## totalcharges < 3000.775 to the right, agree=0.696, adj=0.323, (0 split)
## deviceprotection splits as RLL, agree=0.691, adj=0.311, (0 split)
##
## Node number 2: 2212 observations
## predicted class=No expected loss=0.06374322 P(node) =0.4485905
## class counts: 2071 141
## probabilities: 0.936 0.064
##
## Node number 3: 2719 observations, complexity param=0.05423988
## predicted class=No expected loss=0.4295697 P(node) =0.5514095
## class counts: 1551 1168
## probabilities: 0.570 0.430
## left son=6 (1227 obs) right son=7 (1492 obs)
## Primary splits:
## internetservice splits as LRL, improve=92.09919, (0 missing)
## onlinesecurity splits as RLL, improve=76.17279, (0 missing)
## monthlycharges < 69.175 to the left, improve=72.86882, (0 missing)
## techsupport splits as RLL, improve=64.30651, (0 missing)
## paymentmethod splits as LLRL, improve=52.71926, (0 missing)
## Surrogate splits:
## monthlycharges < 68.975 to the left, agree=0.972, adj=0.937, (0 split)
## onlinesecurity splits as RLL, agree=0.697, adj=0.328, (0 split)
## multiplelines splits as LLR, agree=0.696, adj=0.326, (0 split)
## totalcharges < 781.325 to the left, agree=0.688, adj=0.308, (0 split)
## techsupport splits as RLL, agree=0.687, adj=0.307, (0 split)
##
## Node number 6: 1227 observations
## predicted class=No expected loss=0.2860636 P(node) =0.2488339
## class counts: 876 351
## probabilities: 0.714 0.286
##
## Node number 7: 1492 observations, complexity param=0.05423988
## predicted class=Yes expected loss=0.4524129 P(node) =0.3025755
## class counts: 675 817
## probabilities: 0.452 0.548
## left son=14 (795 obs) right son=15 (697 obs)
## Primary splits:
## tenure < 14.5 to the right, improve=57.49984, (0 missing)
## totalcharges < 1556.25 to the right, improve=51.85579, (0 missing)
## paymentmethod splits as LLRR, improve=15.45871, (0 missing)
## onlinesecurity splits as R-L, improve=14.74208, (0 missing)
## techsupport splits as R-L, improve=10.76805, (0 missing)
## Surrogate splits:
## totalcharges < 1267.025 to the right, agree=0.975, adj=0.945, (0 split)
## monthlycharges < 82.975 to the right, agree=0.662, adj=0.275, (0 split)
## multiplelines splits as R-L, agree=0.634, adj=0.217, (0 split)
## partner splits as RL, agree=0.617, adj=0.179, (0 split)
## onlinebackup splits as R-L, agree=0.611, adj=0.168, (0 split)
##
## Node number 14: 795 observations
## predicted class=No expected loss=0.4176101 P(node) =0.1612249
## class counts: 463 332
## probabilities: 0.582 0.418
##
## Node number 15: 697 observations
## predicted class=Yes expected loss=0.3041607 P(node) =0.1413506
## class counts: 212 485
## probabilities: 0.304 0.696
# Base-graphics rendering of the pruned rpart tree.
plot(prune_rpart_tree)
text(prune_rpart_tree)

# rpart.plot rendering of the same tree. TRUE is spelled out because T is an
# ordinary variable that can be reassigned.
rpart.plot::prp(prune_rpart_tree, type = 2, extra = 103, fallen.leaves = TRUE)

## b-2-4. Predict with the pruned tree
rpart_tree_pr_pred <- predict(prune_rpart_tree, newdata = te_raw, type = "class")
confusionMatrix(data = rpart_tree_pr_pred, reference = te_raw$churn)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 1464 353
## Yes 88 207
##
## Accuracy : 0.7912
## 95% CI : (0.7732, 0.8084)
## No Information Rate : 0.7348
## P-Value [Acc > NIR] : 1.074e-09
##
## Kappa : 0.3687
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9433
## Specificity : 0.3696
## Pos Pred Value : 0.8057
## Neg Pred Value : 0.7017
## Prevalence : 0.7348
## Detection Rate : 0.6932
## Detection Prevalence : 0.8603
## Balanced Accuracy : 0.6565
##
## 'Positive' Class : No
##
##b-2-5. Graph package for rpart: draw a somewhat prettier tree plot.
#install.packages("rattle")
library(rattle)
## Warning: package 'rattle' was built under R version 3.6.3
## Rattle: A free graphical interface for data science with R.
## Version 5.3.0 Copyright (c) 2006-2018 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
# Plot the pruned rpart tree with rattle's fancyRpartPlot().
fancyRpartPlot(prune_rpart_tree)

## b-3-1. Modeling with ctree: no pruning needed
# Unbiased recursive partitioning based on permutation tests.
# Splitting variables are chosen by the significance of a p-value test.
# Author's note: input variables are limited to 31 levels.
# library(party)
party_tree <- ctree(formula = churn ~ ., data = tr_raw)
plot(party_tree)

## b-3-2. Predict
# Predictions of the conditional inference tree on the hold-out set.
party_tree_pred <- predict(party_tree, newdata = te_raw)
confusionMatrix(data = party_tree_pred, reference = te_raw$churn)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 1435 337
## Yes 117 223
##
## Accuracy : 0.785
## 95% CI : (0.7669, 0.8024)
## No Information Rate : 0.7348
## P-Value [Acc > NIR] : 5.396e-08
##
## Kappa : 0.3692
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9246
## Specificity : 0.3982
## Pos Pred Value : 0.8098
## Neg Pred Value : 0.6559
## Prevalence : 0.7348
## Detection Rate : 0.6795
## Detection Prevalence : 0.8390
## Balanced Accuracy : 0.6614
##
## 'Positive' Class : No
##
##b-3-3. Try a different tree-depth setting; author's observation: no difference.
#ctree_control() allows fine-grained parameter tuning.
# NOTE(review): this call only prints a TreeControl object with maxdepth = 4;
# the result is not assigned to anything here.
ctree_control(maxdepth = 4)
## An object of class "TreeControl"
## Slot "varctrl":
## An object of class "VariableControl"
## Slot "teststat":
## [1] quad
## Levels: max quad
##
## Slot "pvalue":
## [1] TRUE
##
## Slot "tol":
## [1] 1e-10
##
## Slot "maxpts":
## [1] 25000
##
## Slot "abseps":
## [1] 1e-04
##
## Slot "releps":
## [1] 0
##
##
## Slot "splitctrl":
## An object of class "SplitControl"
## Slot "minprob":
## [1] 0.01
##
## Slot "minsplit":
## [1] 20
##
## Slot "minbucket":
## [1] 7
##
## Slot "tol":
## [1] 1e-10
##
## Slot "maxsurrogate":
## [1] 0
##
##
## Slot "gtctrl":
## An object of class "GlobalTestControl"
## Slot "testtype":
## [1] Bonferroni
## Levels: Bonferroni MonteCarlo Aggregated Univariate Teststatistic
##
## Slot "nresample":
## [1] 9999
##
## Slot "randomsplits":
## [1] FALSE
##
## Slot "mtry":
## [1] 0
##
## Slot "mincriterion":
## [1] 0.95
##
##
## Slot "tgctrl":
## An object of class "TreeGrowControl"
## Slot "stump":
## [1] FALSE
##
## Slot "maxdepth":
## [1] 4
##
## Slot "savesplitstats":
## [1] TRUE
##
## Slot "remove_weights":
## [1] FALSE
# Fit the depth-limited tree. The limit has to be handed to ctree() through
# `controls`; without it the model is fit with the default controls and is the
# same as party_tree.
# NOTE(review): the confusion matrix recorded below was produced before the
# control was passed, so it will differ on a re-run.
party_tree_m4 <- ctree(
  churn ~ .,
  data = tr_raw,
  controls = ctree_control(maxdepth = 4)
)
plot(party_tree_m4)

# Score the depth-limited tree on the hold-out set.
party_tree_pred_m4 <- predict(party_tree_m4, te_raw)
confusionMatrix(party_tree_pred_m4, te_raw$churn)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 1435 337
## Yes 117 223
##
## Accuracy : 0.785
## 95% CI : (0.7669, 0.8024)
## No Information Rate : 0.7348
## P-Value [Acc > NIR] : 5.396e-08
##
## Kappa : 0.3692
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9246
## Specificity : 0.3982
## Pos Pred Value : 0.8098
## Neg Pred Value : 0.6559
## Prevalence : 0.7348
## Detection Rate : 0.6795
## Detection Prevalence : 0.8390
## Balanced Accuracy : 0.6614
##
## 'Positive' Class : No
##
## b-4-1. Modeling with C5.0
# library(C50)
# The response is removed from the predictors by name rather than by the
# hard-coded position 19, so a change in column order cannot leak churn into x
# or drop the wrong predictor.
c5_tree <- C5.0(
  x = tr_raw[names(tr_raw) != "churn"],
  y = tr_raw$churn,
  trials = 10,
  costs = NULL
)
# Author's note: 10 boosting trials are said to cut the error rate on test
# data by roughly 25%.
c5_tree
##
## Call:
## C5.0.default(x = tr_raw[-19], y = tr_raw$churn, trials = 10, costs = NULL)
##
## Classification Tree
## Number of samples: 4931
## Number of predictors: 19
##
## Number of boosting iterations: 10
## Average tree size: 8.1
##
## Non-standard options: attempt to group attributes
# Detailed report of the boosted model; the recorded output lists the tree of
# each trial, the training errors and the attribute usage.
summary(c5_tree)
##
## Call:
## C5.0.default(x = tr_raw[-19], y = tr_raw$churn, trials = 10, costs = NULL)
##
##
## C5.0 [Release 2.07 GPL Edition] Sat May 09 16:55:22 2020
## -------------------------------
##
## Class specified by attribute `outcome'
##
## Read 4931 cases (20 attributes) from undefined.data
##
## ----- Trial 0: -----
##
## Decision tree:
##
## contract in {One year,Two year}: No (2212/141)
## contract = Month-to-month:
## :...internetservice in {DSL,No}:
## :...tenure > 4: No (751/144)
## : tenure <= 4:
## : :...senior = junior: No (438/175)
## : senior = senior: Yes (38/6)
## internetservice = Fiber optic:
## :...tenure > 14:
## :...tenure > 51: No (148/37)
## : tenure <= 51:
## : :...multiplelines in {No,No phone service}: No (218/79)
## : multiplelines = Yes:
## : :...paymentmethod in {Bank transfer (automatic),
## : : Credit card (automatic),
## : : Mailed check}: No (175/72)
## : paymentmethod = Electronic check: Yes (254/110)
## tenure <= 14:
## :...tenure <= 1: Yes (151/19)
## tenure > 1:
## :...streamingtv in {No internet service,Yes}: Yes (216/51)
## streamingtv = No:
## :...onlinesecurity = No internet service: Yes (0)
## onlinesecurity = Yes: No (40/14)
## onlinesecurity = No:
## :...multiplelines in {No phone service,
## : Yes}: Yes (138/40)
## multiplelines = No:
## :...techsupport in {No,
## : No internet service}: No (141/67)
## techsupport = Yes: Yes (11/2)
##
## ----- Trial 1: -----
##
## Decision tree:
##
## contract in {One year,Two year}: No (1930.1/252.1)
## contract = Month-to-month:
## :...tenure <= 5: Yes (1015.4/377.4)
## tenure > 5:
## :...internetservice in {DSL,No}: No (704.5/237.8)
## internetservice = Fiber optic:
## :...techsupport = No internet service: Yes (0)
## techsupport = Yes: No (238.8/100.7)
## techsupport = No:
## :...multiplelines in {No,No phone service}: Yes (359.9/149.3)
## multiplelines = Yes:
## :...monthlycharges <= 95.5: No (455.3/194.5)
## monthlycharges > 95.5: Yes (227.1/87.3)
##
## ----- Trial 2: -----
##
## Decision tree:
##
## contract in {One year,Two year}: No (1740.5/315.9)
## contract = Month-to-month:
## :...onlinesecurity in {No internet service,Yes}: No (978/367.1)
## onlinesecurity = No:
## :...phoneservice = No: Yes (229.4/87.7)
## phoneservice = Yes:
## :...tenure <= 1: Yes (257/92.5)
## tenure > 1:
## :...paperlessbilling = No: No (399.6/173.4)
## paperlessbilling = Yes: Yes (1326.5/622)
##
## ----- Trial 3: -----
##
## Decision tree:
##
## techsupport = No internet service: No (844.7/171.1)
## techsupport in {No,Yes}:
## :...tenure > 16: No (2427.5/882.2)
## tenure <= 16:
## :...monthlycharges > 97.45: Yes (90.5/24.2)
## monthlycharges <= 97.45:
## :...contract in {One year,Two year}: No (47.7/10.3)
## contract = Month-to-month:
## :...multiplelines = No: No (928.9/444.7)
## multiplelines in {No phone service,Yes}: Yes (591.8/257.1)
##
## ----- Trial 4: -----
##
## Decision tree:
##
## contract = Two year: No (717.8/97.8)
## contract in {Month-to-month,One year}:
## :...internetservice = No: No (548.6/183.8)
## internetservice = DSL:
## :...totalcharges > 1195.95: No (438.4/143.2)
## : totalcharges <= 1195.95:
## : :...onlinebackup in {No,No internet service}: Yes (711.6/327.2)
## : onlinebackup = Yes: No (188.2/76.5)
## internetservice = Fiber optic:
## :...tenure <= 1: Yes (151.7/44.4)
## tenure > 1:
## :...streamingmovies in {No internet service,Yes}: Yes (1203.4/531.3)
## streamingmovies = No:
## :...contract = One year: No (67.7/23)
## contract = Month-to-month:
## :...paymentmethod in {Bank transfer (automatic),
## : Credit card (automatic),
## : Electronic check}: Yes (812.9/387.2)
## paymentmethod = Mailed check: No (90.6/30.9)
##
## ----- Trial 5: -----
##
## Decision tree:
##
## tenure <= 1: Yes (527.6/219.9)
## tenure > 1: No (4403.4/1730)
##
## ----- Trial 6: -----
##
## Decision tree:
##
## contract = Two year: No (522.4)
## contract = One year:
## :...monthlycharges <= 99.15: No (429.2/51.5)
## : monthlycharges > 99.15: Yes (253.5/103)
## contract = Month-to-month:
## :...tenure > 55: No (158.7/60.5)
## tenure <= 55:
## :...internetservice = Fiber optic: Yes (2051.9/889.2)
## internetservice = No: No (344.4/123.6)
## internetservice = DSL:
## :...totalcharges > 1175.6: No (139.8/41.5)
## totalcharges <= 1175.6:
## :...paymentmethod in {Bank transfer (automatic),
## : Mailed check}: No (426.7/191.7)
## paymentmethod in {Credit card (automatic),
## Electronic check}: Yes (499.3/221.8)
##
## ----- Trial 7: -----
##
## Decision tree:
##
## contract in {One year,Two year}: No (1109.4/135)
## contract = Month-to-month:
## :...tenure <= 2: Yes (769.3/327.5)
## tenure > 2:
## :...internetservice in {DSL,No}: No (813.3/269.6)
## internetservice = Fiber optic:
## :...streamingtv in {No,No internet service}: No (1018.7/434.7)
## streamingtv = Yes:
## :...tenure <= 14: Yes (256.5/100.2)
## tenure > 14: No (777.8/368.5)
##
## ----- Trial 8: -----
##
## Decision tree:
##
## contract in {One year,Two year}: No (886.5)
## contract = Month-to-month:
## :...internetservice = No: No (382.5/106.4)
## internetservice = DSL:
## :...totalcharges > 1175.6: No (91.4)
## : totalcharges <= 1175.6:
## : :...monthlycharges > 55.25: No (184.3/45)
## : monthlycharges <= 55.25:
## : :...deviceprotection = No: No (558.9/270.3)
## : deviceprotection in {No internet service,Yes}: Yes (84.4/24.2)
## internetservice = Fiber optic:
## :...tenure <= 1: Yes (73.2)
## tenure > 1:
## :...tenure > 51: No (171.9/62)
## tenure <= 51:
## :...multiplelines in {No phone service,Yes}: Yes (1209.2/465.4)
## multiplelines = No:
## :...totalcharges <= 3478.15: Yes (848.4/382.2)
## totalcharges > 3478.15: No (52.3/13.9)
##
## ----- Trial 9: -----
##
## Decision tree:
##
## contract in {One year,Two year}: No (782.7)
## contract = Month-to-month:
## :...onlinesecurity = No internet service: No (249.7/5.9)
## onlinesecurity in {No,Yes}:
## :...tenure <= 5: Yes (946.2/314)
## tenure > 5:
## :...tenure > 55: No (70.7)
## tenure <= 55:
## :...internetservice in {DSL,No}: No (338/53.7)
## internetservice = Fiber optic:
## :...tenure <= 15: Yes (499.9/199.5)
## tenure > 15:
## :...paperlessbilling = No: No (255.4/66.4)
## paperlessbilling = Yes:
## :...techsupport in {No internet service,
## : Yes}: No (259.6/77)
## techsupport = No:
## :...monthlycharges <= 95.4: No (729/285.8)
## monthlycharges > 95.4: Yes (235.7/76.8)
##
##
## Evaluation on training data (4931 cases):
##
## Trial Decision Tree
## ----- ----------------
## Size Errors
##
## 0 14 957(19.4%)
## 1 7 1179(23.9%)
## 2 6 1181(24.0%)
## 3 6 1145(23.2%)
## 4 10 1405(28.5%)
## 5 2 1216(24.7%)
## 6 9 1243(25.2%)
## 7 6 1088(22.1%)
## 8 11 1069(21.7%)
## 9 10 986(20.0%)
## boost 938(19.0%) <<
##
##
## (a) (b) <-classified as
## ---- ----
## 3299 323 (a): class No
## 615 694 (b): class Yes
##
##
## Attribute usage:
##
## 100.00% tenure
## 100.00% techsupport
## 100.00% contract
## 75.95% internetservice
## 60.09% monthlycharges
## 55.14% onlinesecurity
## 40.42% multiplelines
## 37.48% phoneservice
## 35.39% totalcharges
## 34.76% streamingmovies
## 31.49% paperlessbilling
## 31.27% paymentmethod
## 27.20% streamingtv
## 14.07% onlinebackup
## 9.65% senior
## 9.27% deviceprotection
##
##
## Time: 0.2 secs
## b-4-2. Predict
# Class predictions of the boosted C5.0 model on the hold-out set.
c5_tree_pred <- predict(c5_tree, newdata = te_raw, type = "class")
confusionMatrix(data = c5_tree_pred, reference = te_raw$churn)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 1407 281
## Yes 145 279
##
## Accuracy : 0.7983
## 95% CI : (0.7805, 0.8152)
## No Information Rate : 0.7348
## P-Value [Acc > NIR] : 6.491e-12
##
## Kappa : 0.4388
##
## Mcnemar's Test P-Value : 6.120e-11
##
## Sensitivity : 0.9066
## Specificity : 0.4982
## Pos Pred Value : 0.8335
## Neg Pred Value : 0.6580
## Prevalence : 0.7348
## Detection Rate : 0.6662
## Detection Prevalence : 0.7992
## Balanced Accuracy : 0.7024
##
## 'Positive' Class : No
##
# Reference
# Prediction No Yes
# No 1439 298
# Yes 113 262
## b-4-3. Add a costs matrix
# One way to build it is via a named dimnames list:
# matrix_dimensions <- list(c("no", "yes"), c("no", "yes"))
# names(matrix_dimensions) <- c("pre", "act")
# matrix_dimensions
# The form below is tidier. Values fill column by column, so the single cost
# of 4 sits in row "No", column "Yes" and the cost of 1 in row "Yes",
# column "No".
error_cost <- matrix(
  c(0, 1, 4, 0),
  nrow = 2,
  dimnames = list(c("No", "Yes"), c("No", "Yes"))
)
# First attempt, which raised an error:
# c5_tree_cost <- C5.0(tr_raw[,-19], tr_raw$churn, trials = 10, costs = error_cost)
# c5_tree_cost_pred <- predict(c5_tree_cost, te_raw, type = "class")
# Author's note: the cost values themselves were fine. The problem was the
# matrix names: tr_raw$churn uses "No"/"Yes" while the costs matrix had been
# labelled "no"/"yes". It works once the labels match.
# The response is removed from the predictors by name rather than by the
# hard-coded position 19, so a change in column order cannot leak churn into x.
c5_tree_cost <- C5.0(
  x = tr_raw[names(tr_raw) != "churn"],
  y = tr_raw$churn,
  trials = 10,
  costs = error_cost
)
c5_tree_cost_pred <- predict(c5_tree_cost, te_raw, type = "class")
confusionMatrix(c5_tree_cost_pred, te_raw$churn)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 1420 284
## Yes 132 276
##
## Accuracy : 0.803
## 95% CI : (0.7854, 0.8198)
## No Information Rate : 0.7348
## P-Value [Acc > NIR] : 1.503e-13
##
## Kappa : 0.4465
##
## Mcnemar's Test P-Value : 1.328e-13
##
## Sensitivity : 0.9149
## Specificity : 0.4929
## Pos Pred Value : 0.8333
## Neg Pred Value : 0.6765
## Prevalence : 0.7348
## Detection Rate : 0.6723
## Detection Prevalence : 0.8068
## Balanced Accuracy : 0.7039
##
## 'Positive' Class : No
##
# Reference
# Prediction No Yes
# No 1418 270
# Yes 134 290