library(C50)

data(churn)  # loads the churnTrain and churnTest data frames

# which columns are the non-predictive identifier variables?
names(churnTrain) %in% c("state", "area_code", "account_length")
##  [1]  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [12] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
# negate the mask to keep every other column
!names(churnTrain) %in% c("state", "area_code", "account_length")
##  [1] FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [12]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
# select the modeling variables (drop the three identifier columns)
variable.list = !names(churnTrain) %in% c('state','area_code','account_length')
churnTrain=churnTrain[,variable.list]
churnTest=churnTest[,variable.list]
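
A quick check (not in the original walkthrough) that the filtering worked: 17 columns should remain, with churn as the last one. The column index 17 used with tuneRF further below relies on this layout.

# confirm the dropped columns are gone; churn is now column 17
ncol(churnTrain)
names(churnTrain)[17]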

set.seed(2)
# split the data into a training set and a validation set
ind<-sample(1:2, size=nrow(churnTrain), replace=T, prob=c(0.7, 0.3))
trainset=churnTrain[ind==1,]
testset=churnTrain[ind==2,]
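
As a sanity check (a minimal sketch using base R only), the realized split should sit near the requested 70/30 ratio:

# share of rows assigned to each partition
prop.table(table(ind))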

Supplement: Random Forest

#install.packages('randomForest')
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
library('caret')
## Loading required package: lattice
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
## 
##     margin
library('e1071')
library(ROCR)
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
rf_model = randomForest(formula=churn ~ .,data=churnTrain)
#find best ntree
plot(rf_model)
legend("topright",colnames(rf_model$err.rate),col=1:3,cex=0.8,fill=1:3)

#find best mtry
# churn is column 17 after dropping the three identifier variables
tuneRF(churnTrain[,-17], churnTrain[,17])
## mtry = 4  OOB error = 4.71% 
## Searching left ...
## mtry = 2     OOB error = 6.21% 
## -0.3184713 0.05 
## Searching right ...
## mtry = 8     OOB error = 4.68% 
## 0.006369427 0.05

##       mtry   OOBError
## 2.OOB    2 0.06210621
## 4.OOB    4 0.04710471
## 8.OOB    8 0.04680468
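
tuneRF also returns the mtry/OOBError table it prints, so the best value can be picked programmatically instead of read off the console (a small sketch reusing the same column layout):

mtry_res <- tuneRF(churnTrain[,-17], churnTrain[,17], plot=FALSE)
# row with the lowest OOB error gives the best mtry
best_mtry <- mtry_res[which.min(mtry_res[, "OOBError"]), "mtry"]
best_mtry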
# keep mtry = 4: mtry = 8 improves OOB error by less than the 0.05 threshold
rf_model <- randomForest(churn ~ ., data = churnTrain, ntree = 50, mtry = 4)
# rf_model = train(churn~.,data=churnTrain,method='rf')
confusionMatrix(table(predict(rf_model,churnTest),churnTest$churn))
## Confusion Matrix and Statistics
## 
##      
##        yes   no
##   yes  162    4
##   no    62 1439
##                                           
##                Accuracy : 0.9604          
##                  95% CI : (0.9499, 0.9692)
##     No Information Rate : 0.8656          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.8089          
##  Mcnemar's Test P-Value : 2.28e-12        
##                                           
##             Sensitivity : 0.72321         
##             Specificity : 0.99723         
##          Pos Pred Value : 0.97590         
##          Neg Pred Value : 0.95869         
##              Prevalence : 0.13437         
##          Detection Rate : 0.09718         
##    Detection Prevalence : 0.09958         
##       Balanced Accuracy : 0.86022         
##                                           
##        'Positive' Class : yes             
## 
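Beyond accuracy, randomForest tracks how much each predictor contributes to the splits; importance() and varImpPlot() expose this as an extra diagnostic (not part of the original output):

# mean decrease in Gini impurity per predictor
importance(rf_model)
varImpPlot(rf_model)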
# column 1 of the probability matrix is the positive class ("yes")
rf.predict.prob <- predict(rf_model, churnTest, type="prob")
rf.prediction <- prediction(rf.predict.prob[,1], as.factor(churnTest$churn))
rf.auc <- performance(rf.prediction, measure = "auc", x.measure = "cutoff")
rf.performance <- performance(rf.prediction, "tpr","fpr")
plot(rf.performance)
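
The AUC object computed above is never printed; its value sits in the y.values slot of the ROCR performance object:

# area under the ROC curve for the random forest
rf.auc@y.values[[1]]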

# compare CART (rpart) and Random Forest
# grid of complexity-parameter (cp) values for caret to search
tune_funs = expand.grid(cp=seq(0.01,0.1,0.01))
rpart_model =train(churn~., data=churnTrain, method="rpart",tuneGrid=tune_funs)

# again take column 1: the predicted probability of "yes"
rpart_prob_yes = predict(rpart_model,churnTest,type='prob')[,1]
rpart_pred.rocr = prediction(rpart_prob_yes,churnTest$churn)
rpart_perf.rocr = performance(rpart_pred.rocr,measure = 'tpr',x.measure = 'fpr')

plot(rpart_perf.rocr,col='red')
plot(rf.performance,col='black',add=T)
legend(0.7, 0.2, c('randomforest','rpart'), col=1:2, lty=1)
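
To back the visual comparison with numbers, the same AUC measure can be computed for the rpart model and shown next to the random forest's (a sketch reusing the objects defined above):

rpart_auc.rocr = performance(rpart_pred.rocr, measure = 'auc')
# AUC of each model on churnTest
c(randomForest = rf.auc@y.values[[1]], rpart = rpart_auc.rocr@y.values[[1]])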