RPART (Recursive Partitioning and Regression Trees)

data(iris)
head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
library(rpart)
## Warning: package 'rpart' was built under R version 3.4.2
fit <- rpart(Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width, data = iris)
fit
## n= 150 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 150 100 setosa (0.33333333 0.33333333 0.33333333)  
##   2) Petal.Length< 2.45 50   0 setosa (1.00000000 0.00000000 0.00000000) *
##   3) Petal.Length>=2.45 100  50 versicolor (0.00000000 0.50000000 0.50000000)  
##     6) Petal.Width< 1.75 54   5 versicolor (0.00000000 0.90740741 0.09259259) *
##     7) Petal.Width>=1.75 46   1 virginica (0.00000000 0.02173913 0.97826087) *
plot(fit, margin = 0.1)
text(fit)
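
For a more readable tree diagram, the optional rpart.plot package can be used instead of the base plot/text pair (a sketch; rpart.plot is not used elsewhere in these notes):

# install.packages('rpart.plot')   # optional helper package
library(rpart.plot)
rpart.plot(fit)   # same tree, with nicer node and split labels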

plot(Petal.Width ~ Petal.Length, data= iris, col=Species)
abline(v = 2.45, col = 'orange')
abline(h = 1.75, col = 'blue')

predicted <- predict(fit, iris, type='class')

sum(iris$Species == predicted) / length(iris$Species)
## [1] 0.96
tb <- table(iris$Species, predicted)
tb
##             predicted
##              setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         49         1
##   virginica       0          5        45
library(caret)
## Warning: package 'caret' was built under R version 3.4.2
## Loading required package: lattice
## Loading required package: ggplot2

confusionMatrix(tb)
## Confusion Matrix and Statistics
## 
##             predicted
##              setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         49         1
##   virginica       0          5        45
## 
## Overall Statistics
##                                          
##                Accuracy : 0.96           
##                  95% CI : (0.915, 0.9852)
##     No Information Rate : 0.36           
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.94           
##  Mcnemar's Test P-Value : NA             
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            0.9074           0.9783
## Specificity                 1.0000            0.9896           0.9519
## Pos Pred Value              1.0000            0.9800           0.9000
## Neg Pred Value              1.0000            0.9500           0.9900
## Prevalence                  0.3333            0.3600           0.3067
## Detection Rate              0.3333            0.3267           0.3000
## Detection Prevalence        0.3333            0.3333           0.3333
## Balanced Accuracy           1.0000            0.9485           0.9651
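
The overall accuracy reported above is simply the proportion of cases on the diagonal of the confusion matrix, which can be checked by hand:

sum(diag(tb)) / sum(tb)   # (50 + 49 + 45) / 150 = 0.96, matching the Accuracy field above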

Split Data into Training & Testing Datasets

set.seed(123)
sample.int(42,6)
## [1] 13 33 17 35 36  2
set.seed(123)
idx <- sample.int(2, nrow(iris), replace = TRUE, prob = c(0.7, 0.3))
trainset <- iris[idx == 1, ]
testset <- iris[idx == 2, ]

dim(trainset)
## [1] 106   5
dim(testset)
## [1] 44  5
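
Because sample.int() assigns the group labels at random, the split is only approximately 70/30; the realised counts can be checked directly:

table(idx)   # should agree with the dimensions above: 106 rows in group 1, 44 in group 2
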
fit2 <- rpart(Species ~ . , data = trainset)
predicted2 <- predict(fit2, testset, type='class')
table(testset$Species, predicted2)
##             predicted2
##              setosa versicolor virginica
##   setosa         15          0         0
##   versicolor      0         10         4
##   virginica       0          1        14
sum(testset$Species == predicted2) / length(testset$Species)
## [1] 0.8863636

CTREE (Conditional Inference Trees)

# install.packages('party')
library(party)
## Warning: package 'party' was built under R version 3.4.2
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Warning: package 'strucchange' was built under R version 3.4.2
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich
## Warning: package 'sandwich' was built under R version 3.4.2
fit <- ctree(Species ~ ., data = iris)
plot(fit)

predicted <- predict(fit, iris)
table(iris$Species, predicted)
##             predicted
##              setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         49         1
##   virginica       0          5        45
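
The resubstitution accuracy of the conditional inference tree therefore matches the rpart tree fitted earlier:

sum(predicted == iris$Species) / length(iris$Species)   # (50 + 49 + 45) / 150 = 0.96
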
plot(Petal.Width ~ Petal.Length, data= iris, col=Species)
abline(h = 1.7, col = 'orange')
abline(v = 1.9, col = 'blue')
abline(v = 4.8, col = 'red')

## Logistic Regression

dataset <- iris[iris$Species != 'setosa', ]
dataset$Species <- factor(dataset$Species)
fit <- glm(Species ~ ., data = dataset, family='binomial')
summary(fit)
## 
## Call:
## glm(formula = Species ~ ., family = "binomial", data = dataset)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.01105  -0.00541  -0.00001   0.00677   1.78065  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)  
## (Intercept)   -42.638     25.707  -1.659   0.0972 .
## Sepal.Length   -2.465      2.394  -1.030   0.3032  
## Sepal.Width    -6.681      4.480  -1.491   0.1359  
## Petal.Length    9.429      4.737   1.991   0.0465 *
## Petal.Width    18.286      9.743   1.877   0.0605 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 138.629  on 99  degrees of freedom
## Residual deviance:  11.899  on 95  degrees of freedom
## AIC: 21.899
## 
## Number of Fisher Scoring iterations: 10
predict(fit, dataset)
##          51          52          53          54          55          56 
## -11.3544818  -9.9326130  -6.7253803 -10.0730364  -6.5638417  -9.1918314 
##          57          58          59          60          61          62 
##  -6.6396889 -21.3484037 -11.1356798 -11.1201500 -17.0366939 -10.1926410 
##          63          64          65          66          67          68 
## -16.1233989  -7.1315175 -18.0998447 -12.7756425  -6.6242594 -18.0278599 
##          69          70          71          72          73          74 
##  -2.7586819 -16.2559018  -0.3853463 -14.8926121  -1.2377160 -10.1206562 
##          75          76          77          78          79          80 
## -13.4714513 -11.8610318  -7.2461444  -0.9640817  -6.9422588 -22.7708802 
##          81          82          83          84          85          86 
## -16.2842296 -19.0557818 -16.2565095   1.8801634  -6.1312154  -8.4540886 
##          87          88          89          90          91          92 
##  -8.1182133  -8.2734585 -14.0532409 -11.4092139 -10.1341622  -8.7425447 
##          93          94          95          96          97          98 
## -14.6454823 -20.9268370 -11.1060362 -15.1854381 -12.6887357 -12.9784072 
##          99         100         101         102         103         104 
## -23.5097383 -12.9635855  22.0760350   7.8590485  13.8507316   8.1763399 
##         105         106         107         108         109         110 
##  16.2155389  19.2186911   2.0990656  12.3116893  11.7484836  18.7960092 
##         111         112         113         114         115         116 
##   4.6215645   8.2657934  10.8185436  10.3274231  16.3340282  12.2398047 
##         117         118         119         120         121         122 
##   6.0722686  16.3990117  28.1305669   2.4490547  14.7789486   7.6267405 
##         123         124         125         126         127         128 
##  19.4226714   2.9119477  10.9466766   7.7251297   1.5474425   1.4007256 
##         129         130         131         132         133         134 
##  14.0837476   3.5182027  12.6759926   9.4199248  15.9123613  -1.3561051 
##         135         136         137         138         139         140 
##   3.3591953  17.9147039  15.8075785   5.6507019   0.7043091   8.9609944 
##         141         142         143         144         145         146 
##  16.8257565   9.7894062   7.8590485  16.9113477  18.2611313  11.8934775 
##         147         148         149         150 
##   7.0196773   6.9006805  12.3396098   3.7796467
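
The values above are on the log-odds (link) scale. To turn them into probabilities and class labels, predict on the response scale; since virginica is the second level of the re-coded factor, the probabilities refer to virginica (the 0.5 cutoff below is just the usual convention):

prob <- predict(fit, dataset, type = 'response')            # P(Species == 'virginica')
predicted.species <- ifelse(prob > 0.5, 'virginica', 'versicolor')
table(dataset$Species, predicted.species)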

SVM (Support Vector Machines)

#install.packages('e1071')
library(e1071)
fit <- svm(Species ~ ., data = iris)
predicted <- predict(fit, iris)
sum(predicted == iris$Species) / length(iris$Species)
## [1] 0.9733333
table(iris$Species, predicted)
##             predicted
##              setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         48         2
##   virginica       0          2        48
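
e1071 also provides tune.svm() for a simple cross-validated grid search over the kernel parameters (a minimal sketch; the grid below is arbitrary):

set.seed(123)
tuned <- tune.svm(Species ~ ., data = iris, gamma = 10^(-2:0), cost = 10^(0:2))
summary(tuned)           # cross-validated error for every (gamma, cost) combination
tuned$best.parameters    # the combination with the lowest error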

Customer Churn Prediction

#install.packages('C50')
library(C50)
## Warning: package 'C50' was built under R version 3.4.2
data(churn)
head(churnTrain)
##   state account_length     area_code international_plan voice_mail_plan
## 1    KS            128 area_code_415                 no             yes
## 2    OH            107 area_code_415                 no             yes
## 3    NJ            137 area_code_415                 no              no
## 4    OH             84 area_code_408                yes              no
## 5    OK             75 area_code_415                yes              no
## 6    AL            118 area_code_510                yes              no
##   number_vmail_messages total_day_minutes total_day_calls total_day_charge
## 1                    25             265.1             110            45.07
## 2                    26             161.6             123            27.47
## 3                     0             243.4             114            41.38
## 4                     0             299.4              71            50.90
## 5                     0             166.7             113            28.34
## 6                     0             223.4              98            37.98
##   total_eve_minutes total_eve_calls total_eve_charge total_night_minutes
## 1             197.4              99            16.78               244.7
## 2             195.5             103            16.62               254.4
## 3             121.2             110            10.30               162.6
## 4              61.9              88             5.26               196.9
## 5             148.3             122            12.61               186.9
## 6             220.6             101            18.75               203.9
##   total_night_calls total_night_charge total_intl_minutes total_intl_calls
## 1                91              11.01               10.0                3
## 2               103              11.45               13.7                3
## 3               104               7.32               12.2                5
## 4                89               8.86                6.6                7
## 5               121               8.41               10.1                3
## 6               118               9.18                6.3                6
##   total_intl_charge number_customer_service_calls churn
## 1              2.70                             1    no
## 2              3.70                             1    no
## 3              3.29                             0    no
## 4              1.78                             2    no
## 5              2.73                             3    no
## 6              1.70                             0    no
churnTrain <- churnTrain[,! names(churnTrain) %in% c('state', 'account_length', 'area_code')]

head(churnTrain)
##   international_plan voice_mail_plan number_vmail_messages
## 1                 no             yes                    25
## 2                 no             yes                    26
## 3                 no              no                     0
## 4                yes              no                     0
## 5                yes              no                     0
## 6                yes              no                     0
##   total_day_minutes total_day_calls total_day_charge total_eve_minutes
## 1             265.1             110            45.07             197.4
## 2             161.6             123            27.47             195.5
## 3             243.4             114            41.38             121.2
## 4             299.4              71            50.90              61.9
## 5             166.7             113            28.34             148.3
## 6             223.4              98            37.98             220.6
##   total_eve_calls total_eve_charge total_night_minutes total_night_calls
## 1              99            16.78               244.7                91
## 2             103            16.62               254.4               103
## 3             110            10.30               162.6               104
## 4              88             5.26               196.9                89
## 5             122            12.61               186.9               121
## 6             101            18.75               203.9               118
##   total_night_charge total_intl_minutes total_intl_calls total_intl_charge
## 1              11.01               10.0                3              2.70
## 2              11.45               13.7                3              3.70
## 3               7.32               12.2                5              3.29
## 4               8.86                6.6                7              1.78
## 5               8.41               10.1                3              2.73
## 6               9.18                6.3                6              1.70
##   number_customer_service_calls churn
## 1                             1    no
## 2                             1    no
## 3                             0    no
## 4                             2    no
## 5                             3    no
## 6                             0    no
set.seed(2)
idx <- sample.int(2, nrow(churnTrain), replace=TRUE, prob=c(0.7,0.3))
trainset <- churnTrain[idx == 1,]
testset  <- churnTrain[idx == 2,]

dim(trainset)
## [1] 2315   17
dim(testset)
## [1] 1018   17
library(rpart)
fit <- rpart(churn ~ ., data = trainset)
plot(fit, margin = 0.1)
text(fit)
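
rpart also records a cross-validated complexity-parameter table, which can be inspected and used to prune the tree back if the cross-validated error suggests overfitting (a sketch, not part of the run above):

printcp(fit)                                    # cp table with cross-validated error (xerror)
best.cp <- fit$cptable[which.min(fit$cptable[, 'xerror']), 'CP']
fit.pruned <- prune(fit, cp = best.cp)
plot(fit.pruned, margin = 0.1)
text(fit.pruned)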

predicted <- predict(fit, testset, type= 'class')
table(testset$churn, predicted)
##      predicted
##       yes  no
##   yes 100  41
##   no   18 859
sum(predicted == testset$churn) / length(testset$churn)
## [1] 0.9420432
table(testset$churn)
## 
## yes  no 
## 141 877
877 / (141 + 877)   # baseline accuracy of always predicting 'no' (the majority class)
## [1] 0.8614931
library(e1071)
fit2 <- svm(churn ~., data = trainset, kernel='polynomial')
#?svm
predicted2 <- predict(fit2, testset)
table(testset$churn, predicted2)
##      predicted2
##       yes  no
##   yes  67  74
##   no    8 869
sum(testset$churn == predicted2) / length(testset$churn)
## [1] 0.9194499
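
The polynomial kernel is only one choice; the default radial kernel can be tried in exactly the same way for comparison (not run here):

fit3 <- svm(churn ~ ., data = trainset)   # kernel = 'radial' is the default
predicted3 <- predict(fit3, testset)
sum(testset$churn == predicted3) / length(testset$churn)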

Confusion Matrix

library(rpart)
fit <- rpart(churn ~ ., data = trainset)
plot(fit, margin = 0.1)
text(fit)

predicted <- predict(fit, testset, type= 'class')
tb <- table(testset$churn, predicted)

sum(predicted == testset$churn) / length(testset$churn)
## [1] 0.9420432
library(caret)
confusionMatrix(tb)
## Confusion Matrix and Statistics
## 
##      predicted
##       yes  no
##   yes 100  41
##   no   18 859
##                                           
##                Accuracy : 0.942           
##                  95% CI : (0.9259, 0.9556)
##     No Information Rate : 0.8841          
##     P-Value [Acc > NIR] : 2.052e-10       
##                                           
##                   Kappa : 0.7393          
##  Mcnemar's Test P-Value : 0.004181        
##                                           
##             Sensitivity : 0.84746         
##             Specificity : 0.95444         
##          Pos Pred Value : 0.70922         
##          Neg Pred Value : 0.97948         
##              Prevalence : 0.11591         
##          Detection Rate : 0.09823         
##    Detection Prevalence : 0.13851         
##       Balanced Accuracy : 0.90095         
##                                           
##        'Positive' Class : yes             
## 

caret

library(caret)
control <- trainControl(method="repeatedcv", number=10, repeats=3)
model <- train(churn~., data=trainset, method="rpart", preProcess="scale", trControl=control)

predicted <- predict(model, testset, type='prob')
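
When no type argument is given, predict() on a caret model returns class labels, which can be passed straight to confusionMatrix() (a sketch of the usual check):

predicted.class <- predict(model, testset)
confusionMatrix(table(predicted.class, testset$churn))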

ROC Curve

library(rpart)
fit <- rpart(churn ~ ., data = trainset)
plot(fit, margin = 0.1)
text(fit)

predicted <- predict(fit, testset)   # default for a classification tree: class probabilities, column 1 = P(yes)
#head(predicted)
predicted[,1]
##          2          5          6          8         13         16 
## 0.02877238 0.05084746 0.05084746 0.05084746 0.02877238 0.95522388 
##         17         23         29         33         34         37 
## 0.02877238 0.02877238 0.02877238 0.02877238 0.11162791 0.02877238 
##         41         45         46         47         50         54 
## 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 
##         56         57         58         60         61         62 
## 0.02877238 0.02877238 0.02877238 0.11162791 0.02877238 0.11764706 
##         65         76         78         86         87         89 
## 0.02877238 0.02877238 0.85915493 0.02877238 0.85915493 0.02877238 
##         91        103        104        107        112        115 
## 0.02877238 0.02877238 0.02877238 0.11764706 0.02877238 0.02877238 
##        116        117        119        123        124        126 
## 1.00000000 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 
##        132        134        138        146        153        155 
## 0.02877238 0.15584416 0.02877238 0.11162791 0.02877238 0.65000000 
##        157        161        162        164        168        171 
## 0.95522388 0.02877238 0.02877238 0.02877238 0.02877238 0.11162791 
##        174        176        181        183        184        186 
## 0.02877238 0.11162791 0.15584416 0.02877238 0.02877238 0.02877238 
##        188        193        201        204        206        211 
## 0.95522388 0.02877238 0.02877238 0.02877238 0.02877238 0.11162791 
##        213        215        222        226        227        229 
## 0.02877238 1.00000000 0.02877238 0.02877238 0.02877238 0.11764706 
##        230        233        235        241        244        245 
## 0.02877238 0.02877238 0.05084746 0.02877238 0.11162791 0.02877238 
##        246        251        255        261        265        269 
## 0.02877238 0.85915493 0.05084746 0.02877238 0.11162791 0.02877238 
##        271        272        273        275        277        282 
## 0.02877238 0.02877238 0.02877238 0.02877238 0.05084746 0.02877238 
##        286        289        293        295        301        304 
## 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 
##        305        309        313        319        332        333 
## 0.02877238 0.02877238 0.02877238 0.11162791 0.95522388 0.85915493 
##        339        340        341        342        343        348 
## 0.72222222 0.11162791 0.02877238 0.02877238 0.02877238 0.02877238 
##        354        356        357        369        371        378 
## 0.02877238 0.02877238 0.05084746 0.02877238 0.02877238 0.02877238 
##        389        390        391        392        398        400 
## 0.02877238 0.02877238 0.02877238 0.02877238 0.72222222 0.02877238 
##        405        406        410        412        417        420 
## 0.15584416 0.05084746 0.02877238 0.02877238 1.00000000 0.02877238 
##        421        425        432        433        436        438 
## 0.02877238 0.02877238 0.02877238 0.11162791 0.11162791 0.16666667 
##        439        443        447        452        455        456 
## 0.02877238 0.02877238 0.02877238 0.11162791 0.72222222 0.11162791 
##        458        459        461        466        468        476 
## 0.02877238 0.02877238 0.11162791 1.00000000 0.02877238 0.02877238 
##        480        482        483        484        485        487 
## 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 0.05084746 
##        489        490        494        498        506        509 
## 0.11162791 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 
##        517        520        521        524        530        532 
## 0.02877238 0.02877238 0.11162791 0.02877238 0.05084746 0.02877238 
##        533        536        537        540        542        544 
## 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 0.11162791 
##        548        550        551        555        560        561 
## 0.85915493 0.15584416 0.15584416 0.02877238 0.02877238 0.02877238 
##        563        565        569        571        573        574 
## 0.02877238 0.11162791 0.68421053 0.02877238 0.05084746 0.11162791 
##        576        577        583        586        589        590 
## 0.11764706 0.02877238 0.02877238 0.02877238 0.85915493 0.02877238 
##        596        605        609        611        615        617 
## 0.02877238 0.02877238 0.02877238 0.11162791 0.11162791 0.05084746 
##        620        628        633        642        643        645 
## 0.95522388 0.15584416 0.02877238 0.02877238 0.02877238 0.02877238 
##        648        651        653        655        657        661 
## 0.02877238 0.11162791 0.02877238 0.02877238 0.02877238 0.72222222 
##        664        667        669        673        676        677 
## 0.02877238 0.11162791 0.11162791 0.02877238 0.11162791 0.11162791 
##        678        689        695        697        701        702 
## 0.02877238 0.11764706 0.85915493 0.02877238 0.02877238 0.02877238 
##        703        707        709        711        715        717 
## 0.02877238 0.02877238 0.05084746 0.02877238 0.02877238 0.05084746 
##        721        722        723        726        728        734 
## 0.02877238 0.85915493 0.02877238 0.02877238 0.11162791 0.11162791 
##        735        743        747        754        755        759 
## 0.05084746 0.15584416 0.02877238 0.02877238 0.02877238 0.11764706 
##        760        764        769        777        778        779 
## 0.02877238 0.02877238 0.11162791 0.11162791 0.05084746 0.15584416 
##        783        785        787        792        795        798 
## 0.11162791 0.11764706 0.02877238 0.02877238 0.02877238 1.00000000 
##        799        800        806        809        814        817 
## 0.02877238 0.02877238 0.02877238 0.05084746 0.02877238 0.11162791 
##        820        830        833        837        838        841 
## 0.02877238 0.02877238 0.95522388 0.02877238 0.05084746 0.02877238 
##        845        846        847        849        851        853 
## 0.02877238 0.11764706 0.02877238 0.11162791 0.16666667 0.02877238 
##        855        858        864        866        867        868 
## 0.02877238 0.02877238 0.02877238 0.15584416 0.02877238 0.02877238 
##        874        890        892        895        906        908 
## 0.02877238 0.02877238 0.02877238 0.85915493 0.65000000 0.02877238 
##        909        915        917        922        929        932 
## 0.15584416 0.95522388 0.02877238 0.02877238 0.02877238 0.02877238 
##        934        937        938        941        943        945 
## 1.00000000 0.02877238 0.11162791 0.02877238 0.11162791 0.15584416 
##        946        954        956        962        966        968 
## 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 0.11162791 
##        969        974        978        981        985        988 
## 0.02877238 0.02877238 0.85915493 0.02877238 0.02877238 0.11764706 
##        992        998       1003       1008       1011       1015 
## 0.02877238 0.02877238 0.02877238 0.02877238 0.11162791 0.02877238 
##       1019       1022       1026       1027       1030       1033 
## 0.11162791 0.02877238 0.72222222 0.02877238 0.02877238 0.11162791 
##       1034       1035       1036       1039       1043       1044 
## 0.02877238 0.02877238 0.02877238 0.85915493 0.05084746 0.02877238 
##       1048       1050       1051       1053       1054       1057 
## 0.02877238 0.02877238 0.02877238 0.05084746 0.02877238 0.02877238 
##       1065       1067       1068       1072       1073       1076 
## 0.05084746 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 
##       1077       1079       1086       1087       1091       1094 
## 0.02877238 0.95522388 0.11162791 0.02877238 0.02877238 0.02877238 
##       1099       1103       1105       1107       1108       1111 
## 0.02877238 0.65000000 0.15584416 0.02877238 0.02877238 0.02877238 
##       1112       1115       1117       1120       1121       1125 
## 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 
##       1131       1132       1133       1135       1136       1137 
## 0.02877238 0.11162791 0.02877238 0.02877238 0.11162791 0.11764706 
##       1141       1142       1143       1144       1146       1148 
## 0.02877238 0.02877238 0.85915493 0.02877238 0.02877238 0.11162791 
##       1149       1150       1153       1157       1159       1160 
## 0.02877238 0.05084746 0.02877238 0.02877238 0.05084746 0.02877238 
##       1163       1164       1167       1172       1174       1179 
## 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 
##       1186       1188       1190       1193       1209       1211 
## 0.02877238 0.02877238 0.02877238 0.15584416 0.02877238 0.16666667 
##       1212       1214       1215       1216       1217       1222 
## 0.02877238 0.11162791 0.02877238 0.02877238 0.02877238 0.02877238 
##       1224       1226       1227       1228       1232       1233 
## 0.02877238 0.02877238 0.02877238 0.02877238 0.95522388 0.11162791 
##       1236       1242       1243       1246       1249       1252 
## 0.15584416 0.15584416 0.05084746 0.02877238 0.05084746 0.02877238 
##       1256       1257       1260       1262       1263       1266 
## 0.02877238 0.02877238 0.02877238 0.02877238 0.65000000 0.11162791 
##       1267       1270       1273       1276       1279       1291 
## 0.02877238 0.85915493 0.15584416 0.02877238 0.72222222 0.11162791 
##       1294       1296       1300       1301       1303       1306 
## 0.02877238 0.02877238 0.95522388 0.02877238 0.95522388 0.02877238 
##       1309       1310       1316       1322       1323       1325 
## 0.02877238 0.05084746 0.11162791 0.02877238 1.00000000 0.02877238 
##       1326       1329       1332       1333       1339       1347 
## 0.85915493 0.02877238 0.02877238 0.11162791 0.02877238 1.00000000 
##       1368       1384       1385       1386       1390       1396 
## 0.05084746 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 
##       1397       1399       1401       1406       1407       1411 
## 0.15584416 0.02877238 0.02877238 0.85915493 0.02877238 0.02877238 
##       1412       1414       1422       1423       1424       1425 
## 0.02877238 0.02877238 0.11162791 0.02877238 0.05084746 0.02877238 
##       1426       1427       1431       1433       1437       1439 
## 0.02877238 0.02877238 0.02877238 0.02877238 0.11162791 0.02877238 
##       1444       1445       1446       1450       1451       1454 
## 0.05084746 0.02877238 0.02877238 0.15584416 0.02877238 0.11162791 
##       1455       1460       1461       1462       1463       1465 
## 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 
##       1469       1471       1476       1477       1483       1484 
## 0.02877238 0.02877238 0.02877238 0.02877238 0.11162791 0.05084746 
##       1485       1487       1495       1497       1500       1508 
## 1.00000000 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 
##       1510       1528       1529       1531       1532       1538 
## 0.11162791 0.02877238 0.02877238 0.11162791 0.85915493 0.85915493 
##       1540       1541       1546       1550       1554       1560 
## 0.05084746 0.02877238 0.02877238 0.02877238 0.02877238 0.11162791 
##       1564       1569       1573       1574       1575       1580 
## 0.02877238 0.02877238 0.02877238 0.02877238 0.05084746 0.02877238 
##       1581       1596       1597       1599       1600       1604 
## 0.72222222 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 
##       1606       1613       1615       1617       1618       1628 
## 0.02877238 0.02877238 0.72222222 0.02877238 0.05084746 0.02877238 
##       1636       1638       1642       1643       1645       1650 
## 0.85915493 0.11162791 0.05084746 0.02877238 0.02877238 0.02877238 
##       1658       1659       1660       1663       1669       1673 
## 0.02877238 0.02877238 0.02877238 0.85915493 0.02877238 0.02877238 
##       1678       1681       1682       1683       1684       1685 
## 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 
##       1686       1694       1702       1704       1705       1708 
## 0.02877238 0.02877238 0.95522388 0.02877238 0.95522388 0.85915493 
##       1709       1713       1714       1718       1719       1721 
## 0.15584416 0.15584416 0.85915493 0.02877238 0.16666667 0.02877238 
##       1725       1727       1728       1730       1731       1736 
## 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 
##       1738       1743       1754       1759       1761       1763 
## 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 
##       1765       1766       1771       1775       1777       1788 
## 0.11162791 0.85915493 0.02877238 0.11162791 0.02877238 0.02877238 
##       1795       1800       1803       1813       1821       1824 
## 0.95522388 0.02877238 0.15584416 0.95522388 0.02877238 0.11162791 
##       1825       1827       1828       1829       1831       1835 
## 0.02877238 0.02877238 0.02877238 0.02877238 0.68421053 0.02877238 
##       1836       1837       1838       1841       1843       1848 
## 0.15584416 0.02877238 1.00000000 0.11162791 0.95522388 0.02877238 
##       1850       1852       1856       1858       1859       1866 
## 0.68421053 0.15584416 0.02877238 0.02877238 0.72222222 0.85915493 
##       1869       1873       1875       1876       1882       1889 
## 0.02877238 0.05084746 0.02877238 0.02877238 0.95522388 0.02877238 
##       1892       1894       1902       1907       1911       1912 
## 0.02877238 0.11162791 0.02877238 0.02877238 0.02877238 0.15584416 
##       1915       1916       1926       1930       1932       1937 
## 0.02877238 0.02877238 0.02877238 0.02877238 0.05084746 0.02877238 
##       1938       1939       1940       1941       1946       1961 
## 0.02877238 0.02877238 0.02877238 0.02877238 0.11162791 0.02877238 
##       1962       1966       1968       1972       1982       1983 
## 0.02877238 0.11162791 0.02877238 0.02877238 0.02877238 0.02877238 
##       1985       1987       1988       1990       1991       1992 
## 0.95522388 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 
##       1996       2000       2003       2008       2011       2014 
## 0.11162791 0.02877238 0.02877238 0.11162791 0.02877238 0.02877238 
##       2019       2023       2025       2027       2029       2032 
## 0.02877238 0.02877238 0.02877238 0.02877238 0.68421053 0.85915493 
##       2034       2037       2038       2042       2043       2046 
## 0.02877238 0.11162791 0.02877238 0.02877238 0.72222222 0.15584416 
##       2051       2055       2057       2059       2060       2063 
## 0.02877238 0.02877238 0.02877238 0.02877238 0.95522388 0.02877238 
##       2065       2069       2070       2075       2077       2078 
## 0.02877238 0.02877238 0.11162791 0.02877238 0.15584416 0.11162791 
##       2079       2080       2096       2097       2102       2110 
## 0.02877238 0.11162791 0.11162791 0.11162791 0.02877238 0.02877238 
##       2112       2116       2122       2125       2128       2131 
## 0.02877238 1.00000000 0.02877238 0.02877238 0.02877238 0.11162791 
##       2133       2139       2141       2145       2147       2153 
## 0.02877238 0.02877238 0.65000000 0.02877238 0.05084746 0.05084746 
##       2154       2157       2161       2163       2165       2167 
## 0.02877238 0.11162791 1.00000000 0.02877238 1.00000000 0.02877238 
##       2168       2178       2180       2182       2189       2194 
## 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 0.11162791 
##       2196       2199       2200       2208       2211       2212 
## 0.02877238 0.11162791 0.11162791 0.02877238 0.95522388 0.15584416 
##       2213       2219       2224       2233       2238       2243 
## 0.11162791 0.85915493 0.65000000 0.02877238 0.15584416 0.02877238 
##       2245       2246       2250       2257       2258       2259 
## 0.11162791 0.02877238 0.02877238 0.02877238 0.02877238 0.95522388 
##       2261       2262       2269       2270       2274       2282 
## 0.11764706 0.15584416 0.02877238 0.02877238 0.02877238 0.02877238 
##       2283       2285       2288       2289       2290       2292 
## 0.02877238 0.02877238 0.16666667 0.02877238 0.05084746 0.02877238 
##       2293       2295       2297       2305       2313       2316 
## 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 
##       2319       2320       2321       2323       2325       2328 
## 0.02877238 0.11764706 0.02877238 0.15584416 0.85915493 0.65000000 
##       2331       2332       2340       2343       2344       2351 
## 0.02877238 0.02877238 0.02877238 0.02877238 1.00000000 0.02877238 
##       2354       2355       2356       2360       2362       2369 
## 0.05084746 0.95522388 0.11162791 0.02877238 0.02877238 0.11162791 
##       2370       2372       2374       2377       2379       2381 
## 0.65000000 0.02877238 0.11162791 0.95522388 0.02877238 0.65000000 
##       2382       2386       2393       2396       2397       2399 
## 0.02877238 0.02877238 0.02877238 0.15584416 0.05084746 0.02877238 
##       2407       2413       2416       2427       2428       2436 
## 0.02877238 0.68421053 0.85915493 0.02877238 0.02877238 0.11764706 
##       2440       2441       2444       2445       2448       2449 
## 0.02877238 0.02877238 0.02877238 0.15584416 0.02877238 0.11162791 
##       2452       2454       2456       2468       2470       2472 
## 0.95522388 0.02877238 0.02877238 0.05084746 0.02877238 0.02877238 
##       2474       2475       2476       2477       2480       2482 
## 0.02877238 0.02877238 0.02877238 0.02877238 1.00000000 0.02877238 
##       2485       2487       2488       2492       2495       2496 
## 0.02877238 0.02877238 0.02877238 0.02877238 0.11162791 0.02877238 
##       2497       2507       2508       2509       2512       2523 
## 0.02877238 0.11162791 0.02877238 0.02877238 0.02877238 0.02877238 
##       2524       2527       2529       2533       2539       2544 
## 0.02877238 1.00000000 0.02877238 0.02877238 0.02877238 0.02877238 
##       2549       2551       2552       2555       2556       2561 
## 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 
##       2565       2569       2571       2574       2579       2580 
## 0.02877238 0.05084746 0.02877238 0.68421053 0.02877238 0.02877238 
##       2582       2583       2584       2585       2590       2592 
## 0.85915493 0.02877238 0.02877238 0.02877238 0.02877238 0.72222222 
##       2593       2598       2600       2602       2604       2605 
## 0.15584416 0.02877238 0.72222222 0.02877238 0.11764706 0.02877238 
##       2606       2611       2614       2616       2617       2619 
## 0.72222222 0.02877238 0.11162791 0.02877238 0.02877238 0.05084746 
##       2620       2623       2632       2634       2635       2636 
## 1.00000000 0.85915493 0.02877238 0.02877238 0.02877238 0.02877238 
##       2637       2640       2643       2645       2647       2651 
## 0.02877238 0.02877238 0.05084746 0.02877238 0.15584416 0.02877238 
##       2655       2656       2658       2660       2661       2662 
## 0.02877238 0.02877238 0.11162791 0.02877238 0.95522388 0.11162791 
##       2665       2666       2668       2671       2678       2688 
## 1.00000000 0.05084746 0.02877238 0.02877238 0.68421053 1.00000000 
##       2689       2697       2700       2705       2710       2712 
## 0.02877238 0.95522388 0.02877238 0.02877238 0.02877238 0.02877238 
##       2714       2718       2720       2725       2728       2730 
## 0.02877238 0.02877238 0.02877238 0.16666667 0.05084746 0.02877238 
##       2740       2743       2752       2753       2754       2756 
## 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 
##       2761       2768       2770       2772       2780       2782 
## 1.00000000 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 
##       2783       2785       2786       2787       2791       2793 
## 0.02877238 0.11162791 0.15584416 0.85915493 0.02877238 0.02877238 
##       2795       2798       2801       2803       2810       2812 
## 0.05084746 0.02877238 1.00000000 0.02877238 0.02877238 0.02877238 
##       2813       2822       2823       2827       2835       2837 
## 0.02877238 0.11162791 0.05084746 0.02877238 0.02877238 0.02877238 
##       2838       2839       2840       2850       2851       2855 
## 0.02877238 0.02877238 0.95522388 0.11162791 0.02877238 0.02877238 
##       2862       2865       2866       2868       2869       2872 
## 0.15584416 0.11162791 0.02877238 0.11162791 1.00000000 0.02877238 
##       2874       2877       2878       2881       2882       2891 
## 0.02877238 0.11162791 0.02877238 0.11162791 0.02877238 0.11162791 
##       2892       2900       2905       2906       2908       2910 
## 0.02877238 0.02877238 0.05084746 0.02877238 0.02877238 0.02877238 
##       2920       2923       2938       2940       2944       2948 
## 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 0.72222222 
##       2951       2954       2955       2960       2962       2963 
## 0.02877238 0.15584416 0.02877238 0.11162791 0.15584416 0.11162791 
##       2966       2973       2975       2981       2982       2984 
## 0.02877238 0.02877238 0.02877238 0.72222222 0.02877238 0.02877238 
##       2985       2992       2996       2997       2998       3002 
## 0.02877238 0.05084746 0.02877238 0.02877238 0.11162791 0.02877238 
##       3005       3008       3013       3016       3018       3026 
## 0.02877238 0.02877238 0.02877238 0.02877238 0.15584416 0.02877238 
##       3027       3032       3034       3037       3038       3043 
## 0.15584416 0.05084746 0.02877238 0.02877238 0.02877238 0.02877238 
##       3048       3049       3050       3051       3057       3061 
## 0.02877238 0.02877238 0.02877238 0.85915493 0.11162791 0.02877238 
##       3063       3064       3069       3071       3072       3073 
## 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 0.68421053 
##       3084       3091       3092       3097       3098       3099 
## 0.11162791 0.02877238 0.02877238 0.11764706 0.02877238 0.02877238 
##       3107       3110       3112       3115       3116       3123 
## 0.02877238 0.05084746 0.02877238 0.02877238 0.15584416 0.02877238 
##       3125       3127       3129       3131       3132       3135 
## 0.11162791 0.11162791 0.02877238 0.02877238 0.15584416 0.02877238 
##       3140       3143       3144       3150       3152       3154 
## 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 0.02877238 
##       3155       3159       3165       3168       3169       3176 
## 0.02877238 0.02877238 0.02877238 0.11162791 0.72222222 0.11764706 
##       3180       3184       3190       3193       3196       3198 
## 0.02877238 0.02877238 1.00000000 0.02877238 0.02877238 0.11162791 
##       3200       3207       3217       3218       3223       3226 
## 0.02877238 0.02877238 0.02877238 0.11162791 0.05084746 0.11162791 
##       3227       3229       3232       3235       3245       3254 
## 0.11162791 0.15584416 0.02877238 0.02877238 0.11162791 0.02877238 
##       3258       3259       3260       3261       3267       3270 
## 0.02877238 0.11162791 0.02877238 0.11162791 0.02877238 0.05084746 
##       3277       3279       3285       3290       3292       3296 
## 0.11162791 0.11162791 0.02877238 0.02877238 1.00000000 0.11162791 
##       3301       3302       3307       3311       3315       3316 
## 0.02877238 0.95522388 0.02877238 0.02877238 0.02877238 0.02877238 
##       3323       3325       3329       3333 
## 0.95522388 0.02877238 0.02877238 0.72222222
res <- ifelse(predicted[,1] > 0.2, 'yes', 'no' )                       # call a customer 'yes' when P(yes) exceeds the 0.2 cutoff
res <- factor(res, levels = c('yes','no'), labels = c('yes', 'no'))    # fix the level order so both columns always appear in the table
#res
tb <- table(testset$churn,res)
TP <- tb[1,1]
FN <- tb[2,1]
FP <- tb[1,2]
TN <- tb[2,2] 

FPR <- FP / (TN + FP)
TPR <- TP / (TP + FN)

FPR
## [1] 0.04555556
TPR
## [1] 0.8474576
# accumulate FPR, TPR and the corresponding threshold, starting from the origin
x <- c(0)
y <- c(0)
t <- c(0)
for (threshold in seq(0,1,0.01)){
  res <- ifelse(predicted[,1] >= threshold, 'yes', 'no' )
  res <- factor(res, levels = c('yes','no'), labels = c('yes', 'no'))
  tb <- table(testset$churn,res)
  #print(tb)
  TP <- tb[1,1]
  FN <- tb[2,1]
  FP <- tb[1,2]
  TN <- tb[2,2] 
  FPR <- FP / (TN + FP)
  TPR <- TP / (TP + FN)
  if (! is.na(FPR)){
    x <- c(x, FPR)
    y <- c(y, TPR)
    t <- c(t, threshold)
  }
}
x <- c(x, 1)
y <- c(y, 1)
t <- c(t, 1)

t[order(y / x, decreasing = TRUE)]   # thresholds ranked by the TPR/FPR ratio
##   [1] 0.16 0.12 0.13 0.14 0.15 0.17 0.18 0.19 0.20 0.21 0.22 0.23 0.24 0.25
##  [15] 0.26 0.27 0.28 0.29 0.30 0.31 0.32 0.33 0.34 0.35 0.36 0.37 0.38 0.39
##  [29] 0.40 0.41 0.42 0.43 0.44 0.45 0.46 0.47 0.48 0.49 0.50 0.51 0.52 0.53
##  [43] 0.54 0.55 0.56 0.57 0.58 0.59 0.60 0.61 0.62 0.63 0.64 0.65 0.06 0.07
##  [57] 0.08 0.09 0.10 0.11 0.66 0.67 0.68 0.69 0.70 0.71 0.72 0.03 0.04 0.05
##  [71] 0.73 0.74 0.75 0.76 0.77 0.78 0.79 0.80 0.81 0.82 0.83 0.84 0.85 0.86
##  [85] 0.87 0.88 0.89 0.90 0.91 0.92 0.93 0.94 0.95 0.96 0.97 0.98 0.99 1.00
##  [99] 1.00 0.00
plot(x, y, type = 'b', xlab = 'FPR', ylab ='TPR', main = 'ROC Curve', col='blue', xlim = c(0,1), ylim=c(0,1))
lines(c(0,1), c(0,1), col='red')

set.seed

set.seed(23)
#.Random.seed
sample.int(42,6)
## [1] 25 10 14 28 32 16
sample.int(42,6)
## [1] 41 42 34 39 33 26
sample.int(42,6)
## [1] 17 13 34  6 20 22
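
Re-setting the same seed restarts the generator, so the first draw above is reproduced exactly:

set.seed(23)
sample.int(42,6)
## [1] 25 10 14 28 32 16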

ROC Curve Comparison

# Tree Model
library(rpart)
fit1  <- rpart(churn ~ ., data = trainset)

# Random Forest
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
fit2 <- randomForest(churn ~., data = trainset)

# Confusion Matrix
## tree
predicted.tree <- predict(fit1, testset, type='class')
table(testset$churn,predicted.tree)
##      predicted.tree
##       yes  no
##   yes 100  41
##   no   18 859
## forest
predicted.forest <- predict(fit2, testset)
table(testset$churn,predicted.forest)
##      predicted.forest
##       yes  no
##   yes 109  32
##   no    6 871
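
Computed the same way as for the single tree, the forest's test-set accuracy is (109 + 871) / 1018, roughly 0.96, with noticeably fewer missed churners:

sum(predicted.forest == testset$churn) / length(testset$churn)
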
# ROC Curve

### ROC Generation Function
ROCCurve <- function(model){
  predicted <- predict(model, testset, type='prob')
  x1 <- c(0)
  y1 <- c(0)
  for (threshold in seq(0,1,0.01)){
    res <- ifelse(predicted[,1] >= threshold, 'yes', 'no' )
    res <- factor(res, levels = c('yes','no'), labels = c('yes', 'no'))
    tb <- table(testset$churn,res)
    #print(tb)
    TP <- tb[1,1]
    FN <- tb[2,1]
    FP <- tb[1,2]
    TN <- tb[2,2] 
    FPR <- FP / (TN + FP)
    TPR <- TP / (TP + FN)
    if (! is.na(FPR)){
      x1 <- c(x1, FPR)
      y1 <- c(y1, TPR)
    }
  }
  x1 <- c(x1, 1)
  y1 <- c(y1, 1)
  return(list(x=x1, y = y1))
}

### Compare Model
tree   <- ROCCurve(fit1)
forest <- ROCCurve(fit2)
plot(c(0,1),c(0,1), type= 'n',xlab = 'FPR', ylab ='TPR', main = 'ROC Curve', col='blue', xlim = c(0,1), ylim=c(0,1))
lines(tree$x,   tree$y, col='red')
lines(forest$x, forest$y, col='orange')
legend(0.7,0.2, legend = c('tree', 'forest'), col = c('red', 'orange'),lwd = 3)

## Using the ROCR Package

#install.packages('ROCR')
library(ROCR)
## Warning: package 'ROCR' was built under R version 3.4.2
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
predicted.tree <- predict(fit1, testset, type='prob')
tree.to.roc <- predicted.tree[,1]       # column 1 = probability of 'yes'
tree.pred.rocr <- prediction(tree.to.roc, testset$churn)

tree.perf.rocr <- performance(tree.pred.rocr, measure = "auc", x.measure = "cutoff") 
tree.perf.tpr.rocr <- performance(tree.pred.rocr, "tpr","fpr") 


predicted.forest <- predict(fit2, testset, type='prob')
forest.to.roc <- predicted.forest[,1]   # column 1 = probability of 'yes'
forest.pred.rocr <- prediction(forest.to.roc, testset$churn)

forest.perf.rocr <- performance(forest.pred.rocr, measure = "auc", x.measure = "cutoff") 
#forest.perf.rocr
forest.perf.tpr.rocr <- performance(forest.pred.rocr, "tpr","fpr") 


plot(tree.perf.tpr.rocr, col="red",main="ROC Comparison")
plot(forest.perf.tpr.rocr, col="blue", add=TRUE)
legend(0.7,0.2, legend = c(paste0('tree:',as.character(round(tree.perf.rocr@y.values[[1]],2))), paste0('forest:',as.character(round(forest.perf.rocr@y.values[[1]],2) ))), col = c('red', 'blue'),lwd = 3)

## Find the Most Important Variables

#install.packages('rminer')
library(rminer)
## Warning: package 'rminer' was built under R version 3.4.2
## 
## Attaching package: 'rminer'
## The following object is masked from 'package:party':
## 
##     fit
## The following object is masked from 'package:modeltools':
## 
##     fit
model <- fit(churn~., data = trainset,model="svm")

VariableImportance <- Importance(model,trainset,method="sensv")

L <- list(runs=1,sen=t(VariableImportance$imp),sresponses=VariableImportance$sresponses)

mgraph(L,graph="IMP",leg=names(trainset),col="gray",Grid=10)
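
The raw sensitivity scores used by the plot are stored in VariableImportance$imp (one value per attribute; field name as used in the code above), so they can also be inspected numerically:

round(VariableImportance$imp, 3)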

## Text Classification - https://github.com/ywchiu/rtibame/blob/master/data/applenews.RData

#load('applenews.RData')
#head(applenews)

apple.subset <- applenews[applenews$category %in% c('社會', '財經'), ]   # keep only the society (社會) and finance (財經) articles

library(jiebaR)
mixseg <- worker()   # jiebaR word segmenter for Chinese text
apple.seg <- lapply(apple.subset$content, function(e) segment(e, jiebar = mixseg))

library(tm)
corpus <- Corpus(VectorSource(apple.seg))
dtm <- DocumentTermMatrix(corpus)
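
The resulting document-term matrix is usually very wide and sparse; if memory becomes a problem, tm's removeSparseTerms() can drop rare terms before the counts are converted (an optional step, not part of the original run; the 0.99 threshold is arbitrary):

# dtm <- removeSparseTerms(dtm, 0.99)   # keep only terms appearing in at least ~1% of documents
# dim(dtm)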


# recode raw term counts into a binary No/Yes factor, as expected by naiveBayes()
convert_counts <- function(x) {
  x <- ifelse(x > 0, 1, 0)
  x <- factor(x, levels = c(0, 1), labels = c("No", "Yes"))
  return(x)
}
dtm.count <- apply(dtm, MARGIN = 2, convert_counts)

#dtm.count[1:100,1:100]

m <- as.data.frame(dtm.count)

idx <- sample.int(2, nrow(m), replace=TRUE, prob=c(0.7,0.3))
trainset <- m[idx==1,]
testset <- m[idx==2,]
traintag <- apple.subset[idx==1,"category"]
testtag <-apple.subset[idx==2,"category"]

library(e1071)
model <- naiveBayes(trainset,as.factor(traintag) )
pred <- predict(model, testset)
tb   <- table(pred, as.factor(testtag) )
tb
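
The classification accuracy again follows from the diagonal of the table, exactly as in the earlier examples:

sum(diag(tb)) / sum(tb)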