DATA PREPARATION: IT INCLUDES THE FOLLOWING STEPS

Structure of the data set
Skewness of the target class
Check for missing (NA) values and constant columns

library(C50)

## Warning: package 'C50' was built under R version 3.3.3

# Load the churn example data from C50; this creates the data frames
# `churnTrain` and `churnTest` in the workspace.
# NOTE(review): newer C50 releases reportedly no longer ship this data set
# (it was moved to the modeldata package) -- confirm against the installed
# C50 version before re-running.
data("churn", package = "C50")

# Inspect the structure of the test portion.
str(churnTest)

## 'data.frame':    1667 obs. of  20 variables:
##  $ state                        : Factor w/ 51 levels "AK","AL","AR",..: 12 27 36 33 41 13 29 19 25 44 ...
##  $ account_length               : int  101 137 103 99 108 117 63 94 138 128 ...
##  $ area_code                    : Factor w/ 3 levels "area_code_408",..: 3 3 1 2 2 2 2 1 3 2 ...
##  $ international_plan           : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ voice_mail_plan              : Factor w/ 2 levels "no","yes": 1 1 2 1 1 1 2 1 1 2 ...
##  $ number_vmail_messages        : int  0 0 29 0 0 0 32 0 0 43 ...
##  $ total_day_minutes            : num  70.9 223.6 294.7 216.8 197.4 ...
##  $ total_day_calls              : int  123 86 95 123 78 85 124 97 117 100 ...
##  $ total_day_charge             : num  12.1 38 50.1 36.9 33.6 ...
##  $ total_eve_minutes            : num  212 245 237 126 124 ...
##  $ total_eve_calls              : int  73 139 105 88 101 68 125 112 46 89 ...
##  $ total_eve_charge             : num  18 20.8 20.2 10.7 10.5 ...
##  $ total_night_minutes          : num  236 94.2 300.3 220.6 204.5 ...
##  $ total_night_calls            : int  73 81 127 82 107 90 120 106 71 92 ...
##  $ total_night_charge           : num  10.62 4.24 13.51 9.93 9.2 ...
##  $ total_intl_minutes           : num  10.6 9.5 13.7 15.7 7.7 6.9 12.9 11.1 9.9 11.9 ...
##  $ total_intl_calls             : int  3 7 6 2 4 5 3 6 4 1 ...
##  $ total_intl_charge            : num  2.86 2.57 3.7 4.24 2.08 1.86 3.48 3 2.67 3.21 ...
##  $ number_customer_service_calls: int  3 0 1 1 2 1 1 0 2 0 ...
##  $ churn                        : Factor w/ 2 levels "yes","no": 2 2 2 2 2 2 2 2 2 2 ...

# Inspect the structure of the training portion of the churn data.
str(churnTrain)

## 'data.frame':    3333 obs. of  20 variables:
##  $ state                        : Factor w/ 51 levels "AK","AL","AR",..: 17 36 32 36 37 2 20 25 19 50 ...
##  $ account_length               : int  128 107 137 84 75 118 121 147 117 141 ...
##  $ area_code                    : Factor w/ 3 levels "area_code_408",..: 2 2 2 1 2 3 3 2 1 2 ...
##  $ international_plan           : Factor w/ 2 levels "no","yes": 1 1 1 2 2 2 1 2 1 2 ...
##  $ voice_mail_plan              : Factor w/ 2 levels "no","yes": 2 2 1 1 1 1 2 1 1 2 ...
##  $ number_vmail_messages        : int  25 26 0 0 0 0 24 0 0 37 ...
##  $ total_day_minutes            : num  265 162 243 299 167 ...
##  $ total_day_calls              : int  110 123 114 71 113 98 88 79 97 84 ...
##  $ total_day_charge             : num  45.1 27.5 41.4 50.9 28.3 ...
##  $ total_eve_minutes            : num  197.4 195.5 121.2 61.9 148.3 ...
##  $ total_eve_calls              : int  99 103 110 88 122 101 108 94 80 111 ...
##  $ total_eve_charge             : num  16.78 16.62 10.3 5.26 12.61 ...
##  $ total_night_minutes          : num  245 254 163 197 187 ...
##  $ total_night_calls            : int  91 103 104 89 121 118 118 96 90 97 ...
##  $ total_night_charge           : num  11.01 11.45 7.32 8.86 8.41 ...
##  $ total_intl_minutes           : num  10 13.7 12.2 6.6 10.1 6.3 7.5 7.1 8.7 11.2 ...
##  $ total_intl_calls             : int  3 3 5 7 3 6 7 6 4 5 ...
##  $ total_intl_charge            : num  2.7 3.7 3.29 1.78 2.73 1.7 2.03 1.92 2.35 3.02 ...
##  $ number_customer_service_calls: int  1 1 0 2 3 0 3 0 1 0 ...
##  $ churn                        : Factor w/ 2 levels "yes","no": 2 2 2 2 2 2 2 2 2 2 ...

# Pool the shipped test and training sets into a single data frame; the
# train/test partition is redrawn later from this pooled data.
Data <- rbind(churnTest, churnTrain)

# Number of missing (NA) values in each column.
vapply(Data, function(column) sum(is.na(column)), integer(1))

##                         state                account_length 
##                             0                             0 
##                     area_code            international_plan 
##                             0                             0 
##               voice_mail_plan         number_vmail_messages 
##                             0                             0 
##             total_day_minutes               total_day_calls 
##                             0                             0 
##              total_day_charge             total_eve_minutes 
##                             0                             0 
##               total_eve_calls              total_eve_charge 
##                             0                             0 
##           total_night_minutes             total_night_calls 
##                             0                             0 
##            total_night_charge            total_intl_minutes 
##                             0                             0 
##              total_intl_calls             total_intl_charge 
##                             0                             0 
## number_customer_service_calls                         churn 
##                             0                             0

library(caret)

## Loading required package: lattice

## Loading required package: ggplot2

## Warning: package 'ggplot2' was built under R version 3.3.3

# Flag constant / near-constant columns; saveMetrics = TRUE returns the full
# per-column diagnostics table instead of only the flagged column positions.
nearZeroVar(x = Data, saveMetrics = TRUE)

##                               freqRatio percentUnique zeroVar   nzv
## state                          1.264000          1.02   FALSE FALSE
## account_length                 1.101695          4.36   FALSE FALSE
## area_code                      1.981732          0.06   FALSE FALSE
## international_plan             9.570825          0.04   FALSE FALSE
## voice_mail_plan                2.779289          0.04   FALSE FALSE
## number_vmail_messages         44.313253          0.96   FALSE  TRUE
## total_day_minutes              1.000000         39.22   FALSE FALSE
## total_day_calls                1.035398          2.46   FALSE FALSE
## total_day_charge               1.000000         39.22   FALSE FALSE
## total_eve_minutes              1.000000         37.58   FALSE FALSE
## total_eve_calls                1.045455          2.52   FALSE FALSE
## total_eve_charge               1.000000         33.18   FALSE FALSE
## total_night_minutes            1.000000         37.06   FALSE FALSE
## total_night_calls              1.110092          2.62   FALSE FALSE
## total_night_charge             1.000000         20.56   FALSE FALSE
## total_intl_minutes             1.022727          3.40   FALSE FALSE
## total_intl_calls               1.040923          0.42   FALSE FALSE
## total_intl_charge              1.022727          3.40   FALSE FALSE
## number_customer_service_calls  1.584738          0.20   FALSE FALSE
## churn                          6.072136          0.04   FALSE FALSE

Visualisation of categorical and continuous variables

Dropping IDs
Separating categorical and continuous variables

# Drop `account_length` (the second column of the pooled data). Selecting by
# name rather than by position makes this step idempotent: re-running the
# chunk no longer removes a further column and shifts every index below.
Data <- Data[, setdiff(names(Data), "account_length")]

# Categorical predictors (all factors in the pooled data).
cat_cols <- c("state", "area_code", "international_plan", "voice_mail_plan")

# Continuous predictors plus the `churn` target (kept as the last column).
Cont_Data <- Data[, setdiff(names(Data), cat_cols)]

# Categorical predictors plus the `churn` target.
Cat_Data <- Data[, c(cat_cols, "churn")]



library(AppliedPredictiveModeling)
library(caret)

# Use a semi-transparent lattice theme for the plots that follow.
transparentTheme(trans = 0.4)

# Box plots of the first 14 columns of Cont_Data (the continuous predictors),
# one panel per predictor, grouped by the churn class.
featurePlot(
  x = Cont_Data[, 1:14],
  y = Cont_Data$churn,
  plot = "box",
  # The remaining arguments are passed through to bwplot():
  # independent y-axis per panel, x-axis labels rotated by 90 degrees,
  # panels arranged as 7 columns by 2 rows.
  scales = list(
    y = list(relation = "free"),
    x = list(rot = 90)
  ),
  layout = c(7, 2)
)

Running the Decision Tree and Random Forest model

Steps include: 1. Partitioning the data set into train and test sets 2. Running the model (first result) 3. Hyperparameter tuning using a grid approach

library(rpart)

## Warning: package 'rpart' was built under R version 3.3.3

library(rpart.plot)
library(rattle)

## Warning: package 'rattle' was built under R version 3.3.3

## Rattle: A free graphical interface for data mining with R.
## Versión 4.1.0 Copyright (c) 2006-2015 Togaware Pty Ltd.
## Escriba 'rattle()' para agitar, sacudir y  rotar sus datos.

library(caret)

# Fix the RNG seed so the partition below is reproducible.
set.seed(3456)

# Draw one 60% training partition based on the churn outcome;
# list = FALSE returns the selected row indices as a matrix, not a list.
trainIndex <- createDataPartition(
  y = Data$churn,
  p = 0.6,
  list = FALSE,
  times = 1
)

# Rows in the partition form the training set; the rest form the test set.
Train <- Data[trainIndex, ]
Test <- Data[-trainIndex, ]




# Fit a classification tree for churn on all remaining predictors, using
# rpart's default control settings, then print the detailed node summary.
Tree_model <- rpart(formula = churn ~ ., data = Train)
summary(Tree_model)

## Call:
## rpart(formula = churn ~ ., data = Train)
##   n= 3001 
## 
##            CP nsplit rel error    xerror       xstd
## 1  0.07058824      0 1.0000000 1.0000000 0.04494128
## 2  0.06000000      3 0.7717647 0.8423529 0.04177997
## 3  0.05176471      5 0.6517647 0.6588235 0.03749051
## 4  0.04000000      8 0.4564706 0.4941176 0.03288271
## 5  0.01882353      9 0.4164706 0.4658824 0.03199795
## 6  0.01647059     11 0.3788235 0.4635294 0.03192274
## 7  0.01529412     12 0.3623529 0.4635294 0.03192274
## 8  0.01411765     14 0.3317647 0.4682353 0.03207293
## 9  0.01176471     16 0.3035294 0.4635294 0.03192274
## 10 0.01000000     18 0.2800000 0.4611765 0.03184729
## 
## Variable importance
##             total_day_minutes              total_day_charge 
##                            16                            16 
##              total_eve_charge             total_eve_minutes 
##                             8                             8 
##                         state            total_intl_minutes 
##                             8                             7 
##             total_intl_charge number_customer_service_calls 
##                             7                             7 
##              total_intl_calls            international_plan 
##                             5                             5 
##         number_vmail_messages               voice_mail_plan 
##                             5                             4 
##           total_night_minutes            total_night_charge 
##                             2                             2 
##               total_day_calls 
##                             1 
## 
## Node number 1: 3001 observations,    complexity param=0.07058824
##   predicted class=no   expected loss=0.1416195  P(node) =1
##     class counts:   425  2576
##    probabilities: 0.142 0.858 
##   left son=2 (301 obs) right son=3 (2700 obs)
##   Primary splits:
##       total_day_minutes             < 248.65 to the right, improve=85.14364, (0 missing)
##       total_day_charge              < 42.27  to the right, improve=85.14364, (0 missing)
##       number_customer_service_calls < 3.5    to the right, improve=60.91735, (0 missing)
##       international_plan            splits as  RL, improve=51.33479, (0 missing)
##       state                         splits as  RRLRLRLRRRRRRRRLRRRLLLRRRLLLRRRLRLLRLLRRLRLLRRRLRRR, improve=12.70748, (0 missing)
##   Surrogate splits:
##       total_day_charge  < 42.27  to the right, agree=1.0, adj=1.000, (0 split)
##       total_eve_minutes < 41.95  to the left,  agree=0.9, adj=0.003, (0 split)
##       total_eve_charge  < 3.565  to the left,  agree=0.9, adj=0.003, (0 split)
## 
## Node number 2: 301 observations,    complexity param=0.07058824
##   predicted class=no   expected loss=0.4983389  P(node) =0.1002999
##     class counts:   150   151
##    probabilities: 0.498 0.502 
##   left son=4 (234 obs) right son=5 (67 obs)
##   Primary splits:
##       voice_mail_plan       splits as  LR,         improve=35.45930, (0 missing)
##       number_vmail_messages < 6.5    to the left,  improve=35.45930, (0 missing)
##       total_eve_minutes     < 201.15 to the right, improve=28.62253, (0 missing)
##       total_eve_charge      < 17.1   to the right, improve=28.62253, (0 missing)
##       total_day_minutes     < 285.5  to the right, improve=17.58056, (0 missing)
##   Surrogate splits:
##       number_vmail_messages < 6.5    to the left,  agree=1.000, adj=1.000, (0 split)
##       state                 splits as  LRLLLLLLLLLLLLLLLLRLLLLLLLLRLLLLLLLLLLLLLLLLLLLLLLR, agree=0.791, adj=0.060, (0 split)
##       total_night_minutes   < 298.75 to the left,  agree=0.781, adj=0.015, (0 split)
##       total_night_charge    < 13.445 to the left,  agree=0.781, adj=0.015, (0 split)
## 
## Node number 3: 2700 observations,    complexity param=0.06
##   predicted class=no   expected loss=0.1018519  P(node) =0.8997001
##     class counts:   275  2425
##    probabilities: 0.102 0.898 
##   left son=6 (207 obs) right son=7 (2493 obs)
##   Primary splits:
##       number_customer_service_calls < 3.5    to the right, improve=65.168630, (0 missing)
##       international_plan            splits as  RL, improve=46.007020, (0 missing)
##       state                         splits as  RRLRLRLRLRRRRRLLRRLLLLLLRLLLLRRLRLLRLLRRLLLLRRRLRLR, improve= 7.779143, (0 missing)
##       total_intl_minutes            < 13.45  to the right, improve= 6.884975, (0 missing)
##       total_intl_charge             < 3.635  to the right, improve= 6.884975, (0 missing)
## 
## Node number 4: 234 observations,    complexity param=0.07058824
##   predicted class=yes  expected loss=0.3717949  P(node) =0.07797401
##     class counts:   147    87
##    probabilities: 0.628 0.372 
##   left son=8 (125 obs) right son=9 (109 obs)
##   Primary splits:
##       total_eve_minutes < 195.6  to the right, improve=36.22351, (0 missing)
##       total_eve_charge  < 16.625 to the right, improve=36.22351, (0 missing)
##       total_day_minutes < 285.5  to the right, improve=17.80445, (0 missing)
##       total_day_charge  < 48.535 to the right, improve=17.80445, (0 missing)
##       state             splits as  LLRLRRLRRLLRLLRLLL-LLLLLRLLRRRLLRLLLLLLRLLLRRRLLRLR, improve=14.52764, (0 missing)
##   Surrogate splits:
##       total_eve_charge   < 16.625 to the right, agree=1.000, adj=1.000, (0 split)
##       state              splits as  RLRLLLLRRLLLLLRLRR-RRLLLLRLLLLRLLLRLLLRRLLLRRRLLRLL, agree=0.697, adj=0.349, (0 split)
##       total_day_calls    < 85.5   to the right, agree=0.568, adj=0.073, (0 split)
##       total_intl_minutes < 6.65   to the right, agree=0.564, adj=0.064, (0 split)
##       total_intl_calls   < 1.5    to the right, agree=0.564, adj=0.064, (0 split)
## 
## Node number 5: 67 observations
##   predicted class=no   expected loss=0.04477612  P(node) =0.02232589
##     class counts:     3    64
##    probabilities: 0.045 0.955 
## 
## Node number 6: 207 observations,    complexity param=0.06
##   predicted class=no   expected loss=0.4830918  P(node) =0.06897701
##     class counts:   100   107
##    probabilities: 0.483 0.517 
##   left son=12 (59 obs) right son=13 (148 obs)
##   Primary splits:
##       total_day_minutes < 144.65 to the left,  improve=33.288880, (0 missing)
##       total_day_charge  < 24.595 to the left,  improve=33.288880, (0 missing)
##       state             splits as  LRLRLRRRRRLRRLRRRRLLRLRRRLLR-RLRRRLRRRLRRLRRLRRLLRR, improve=17.409420, (0 missing)
##       total_eve_minutes < 190.8  to the left,  improve= 8.907583, (0 missing)
##       total_eve_charge  < 16.22  to the left,  improve= 8.907583, (0 missing)
##   Surrogate splits:
##       total_day_charge      < 24.595 to the left,  agree=1.000, adj=1.000, (0 split)
##       state                 splits as  RRLRRRRRRRRRLRRRRRLRRRRRRRLR-RRRRRRRRRLRRLRRLRRRLRL, agree=0.783, adj=0.237, (0 split)
##       total_eve_minutes     < 98.85  to the left,  agree=0.725, adj=0.034, (0 split)
##       total_eve_charge      < 8.405  to the left,  agree=0.725, adj=0.034, (0 split)
##       number_vmail_messages < 43.5   to the right, agree=0.720, adj=0.017, (0 split)
## 
## Node number 7: 2493 observations,    complexity param=0.05176471
##   predicted class=no   expected loss=0.07019655  P(node) =0.8307231
##     class counts:   175  2318
##    probabilities: 0.070 0.930 
##   left son=14 (238 obs) right son=15 (2255 obs)
##   Primary splits:
##       international_plan splits as  RL,         improve=44.607750, (0 missing)
##       total_intl_minutes < 13.45  to the right, improve= 6.570178, (0 missing)
##       total_intl_charge  < 3.635  to the right, improve= 6.570178, (0 missing)
##       total_day_minutes  < 217.75 to the right, improve= 6.152075, (0 missing)
##       total_day_charge   < 37.02  to the right, improve= 6.152075, (0 missing)
## 
## Node number 8: 125 observations,    complexity param=0.01176471
##   predicted class=yes  expected loss=0.112  P(node) =0.04165278
##     class counts:   111    14
##    probabilities: 0.888 0.112 
##   left son=16 (105 obs) right son=17 (20 obs)
##   Primary splits:
##       state               splits as  -LLLRRL-LLLRLL-LLL-LLLLLLLLLRRLLLLLLLL--LLRLRRLLLLL, improve=9.135429, (0 missing)
##       total_night_minutes < 157.9  to the right, improve=6.724000, (0 missing)
##       total_night_charge  < 7.105  to the right, improve=6.724000, (0 missing)
##       total_day_minutes   < 264.65 to the right, improve=2.586222, (0 missing)
##       total_day_charge    < 44.99  to the right, improve=2.586222, (0 missing)
##   Surrogate splits:
##       total_night_minutes < 110.15 to the right, agree=0.856, adj=0.1, (0 split)
##       total_night_charge  < 4.955  to the right, agree=0.856, adj=0.1, (0 split)
## 
## Node number 9: 109 observations,    complexity param=0.04
##   predicted class=no   expected loss=0.3302752  P(node) =0.03632123
##     class counts:    36    73
##    probabilities: 0.330 0.670 
##   left son=18 (23 obs) right son=19 (86 obs)
##   Primary splits:
##       total_day_minutes   < 286.9  to the right, improve=16.956280, (0 missing)
##       total_day_charge    < 48.775 to the right, improve=16.956280, (0 missing)
##       state               splits as  L-LL-RRLR---RLRLLL-LLR-RRLRRRRLLRLLRLLLL-R-LLLLRLRR, improve= 9.030310, (0 missing)
##       total_night_minutes < 186.45 to the right, improve= 6.784356, (0 missing)
##       total_night_charge  < 8.39   to the right, improve= 6.784356, (0 missing)
##   Surrogate splits:
##       total_day_charge < 48.775 to the right, agree=1.000, adj=1.00, (0 split)
##       state            splits as  R-RR-RRRR---RRRLRL-RRR-RRRRRLRRRRRRRRRRR-R-RRRRRRRR, agree=0.817, adj=0.13, (0 split)
## 
## Node number 12: 59 observations
##   predicted class=yes  expected loss=0.06779661  P(node) =0.01966011
##     class counts:    55     4
##    probabilities: 0.932 0.068 
## 
## Node number 13: 148 observations,    complexity param=0.01529412
##   predicted class=no   expected loss=0.3040541  P(node) =0.04931689
##     class counts:    45   103
##    probabilities: 0.304 0.696 
##   left son=26 (77 obs) right son=27 (71 obs)
##   Primary splits:
##       state             splits as  LRLRLRRLRRLRRLRRLRLLRLLLRL-L-RLLRLLRRR-RLRLLRRRLLLR, improve=11.521910, (0 missing)
##       total_eve_minutes < 191.6  to the left,  improve= 8.649079, (0 missing)
##       total_eve_charge  < 16.29  to the left,  improve= 8.649079, (0 missing)
##       total_day_minutes < 196.95 to the left,  improve= 4.095429, (0 missing)
##       total_day_charge  < 33.48  to the left,  improve= 4.095429, (0 missing)
##   Surrogate splits:
##       total_eve_minutes   < 189.55 to the left,  agree=0.649, adj=0.268, (0 split)
##       total_eve_charge    < 16.11  to the left,  agree=0.649, adj=0.268, (0 split)
##       total_intl_minutes  < 10.2   to the left,  agree=0.622, adj=0.211, (0 split)
##       total_intl_charge   < 2.755  to the left,  agree=0.622, adj=0.211, (0 split)
##       total_night_minutes < 240.75 to the left,  agree=0.595, adj=0.155, (0 split)
## 
## Node number 14: 238 observations,    complexity param=0.05176471
##   predicted class=no   expected loss=0.3613445  P(node) =0.0793069
##     class counts:    86   152
##    probabilities: 0.361 0.639 
##   left son=28 (44 obs) right son=29 (194 obs)
##   Primary splits:
##       total_intl_calls   < 2.5    to the left,  improve=44.034310, (0 missing)
##       total_intl_minutes < 13.05  to the right, improve=41.603840, (0 missing)
##       total_intl_charge  < 3.525  to the right, improve=41.603840, (0 missing)
##       state              splits as  RLLRLRRRLLRRLRL-LLRLLLRLRRRLRRRLRLRLLRRRLRLLRRRRRRR, improve=14.736300, (0 missing)
##       total_eve_calls    < 103.5  to the left,  improve= 2.067076, (0 missing)
##   Surrogate splits:
##       state               splits as  RRRRLRRRRRRRLRR-RRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR, agree=0.824, adj=0.045, (0 split)
##       total_night_minutes < 303.15 to the right, agree=0.819, adj=0.023, (0 split)
##       total_night_charge  < 13.64  to the right, agree=0.819, adj=0.023, (0 split)
##       total_intl_minutes  < 4.35   to the left,  agree=0.819, adj=0.023, (0 split)
##       total_intl_charge   < 1.175  to the left,  agree=0.819, adj=0.023, (0 split)
## 
## Node number 15: 2255 observations,    complexity param=0.01882353
##   predicted class=no   expected loss=0.03946785  P(node) =0.7514162
##     class counts:    89  2166
##    probabilities: 0.039 0.961 
##   left son=30 (273 obs) right son=31 (1982 obs)
##   Primary splits:
##       total_day_minutes < 221.95 to the right, improve=6.640284, (0 missing)
##       total_day_charge  < 37.73  to the right, improve=6.640284, (0 missing)
##       total_eve_minutes < 257.95 to the right, improve=5.995820, (0 missing)
##       total_eve_charge  < 21.925 to the right, improve=5.995820, (0 missing)
##       state             splits as  RRRRLRRLLRRRRRRLRRLRLRLLRLLLLRRLRRRRRLRRLLLRRRRLRLR, improve=2.648549, (0 missing)
##   Surrogate splits:
##       total_day_charge < 37.73  to the right, agree=1, adj=1, (0 split)
## 
## Node number 16: 105 observations
##   predicted class=yes  expected loss=0.02857143  P(node) =0.03498834
##     class counts:   102     3
##    probabilities: 0.971 0.029 
## 
## Node number 17: 20 observations,    complexity param=0.01176471
##   predicted class=no   expected loss=0.45  P(node) =0.006664445
##     class counts:     9    11
##    probabilities: 0.450 0.550 
##   left son=34 (8 obs) right son=35 (12 obs)
##   Primary splits:
##       total_night_minutes < 200.15 to the right, improve=8.066667, (0 missing)
##       total_night_charge  < 9.01   to the right, improve=8.066667, (0 missing)
##       total_eve_calls     < 102    to the left,  improve=3.570330, (0 missing)
##       total_day_minutes   < 263.85 to the right, improve=2.500000, (0 missing)
##       total_day_charge    < 44.855 to the right, improve=2.500000, (0 missing)
##   Surrogate splits:
##       total_night_charge < 9.01   to the right, agree=1.00, adj=1.000, (0 split)
##       total_eve_calls    < 102    to the left,  agree=0.75, adj=0.375, (0 split)
##       total_day_minutes  < 263.85 to the right, agree=0.70, adj=0.250, (0 split)
##       total_day_charge   < 44.855 to the right, agree=0.70, adj=0.250, (0 split)
##       total_intl_minutes < 11.85  to the right, agree=0.70, adj=0.250, (0 split)
## 
## Node number 18: 23 observations
##   predicted class=yes  expected loss=0.1304348  P(node) =0.007664112
##     class counts:    20     3
##    probabilities: 0.870 0.130 
## 
## Node number 19: 86 observations
##   predicted class=no   expected loss=0.1860465  P(node) =0.02865711
##     class counts:    16    70
##    probabilities: 0.186 0.814 
## 
## Node number 26: 77 observations,    complexity param=0.01529412
##   predicted class=no   expected loss=0.4935065  P(node) =0.02565811
##     class counts:    38    39
##    probabilities: 0.494 0.506 
##   left son=52 (53 obs) right son=53 (24 obs)
##   Primary splits:
##       total_eve_minutes < 211.2  to the left,  improve=5.671179, (0 missing)
##       total_eve_charge  < 17.95  to the left,  improve=5.671179, (0 missing)
##       total_day_minutes < 200.1  to the left,  improve=4.407300, (0 missing)
##       total_day_charge  < 34.02  to the left,  improve=4.407300, (0 missing)
##       state             splits as  L-R-R--R--L--R--R-LL-RRR-L-R--RR-RR-----R-RR---LLR-, improve=4.110310, (0 missing)
##   Surrogate splits:
##       total_eve_charge    < 17.95  to the left,  agree=1.000, adj=1.000, (0 split)
##       state               splits as  L-L-L--L--R--L--L-LL-LLR-L-L--LL-LL-----R-LL---LLL-, agree=0.727, adj=0.125, (0 split)
##       total_night_minutes < 146.75 to the right, agree=0.714, adj=0.083, (0 split)
##       total_night_charge  < 6.605  to the right, agree=0.714, adj=0.083, (0 split)
##       total_intl_minutes  < 5.75   to the right, agree=0.714, adj=0.083, (0 split)
## 
## Node number 27: 71 observations
##   predicted class=no   expected loss=0.09859155  P(node) =0.02365878
##     class counts:     7    64
##    probabilities: 0.099 0.901 
## 
## Node number 28: 44 observations
##   predicted class=yes  expected loss=0  P(node) =0.01466178
##     class counts:    44     0
##    probabilities: 1.000 0.000 
## 
## Node number 29: 194 observations,    complexity param=0.05176471
##   predicted class=no   expected loss=0.2164948  P(node) =0.06464512
##     class counts:    42   152
##    probabilities: 0.216 0.784 
##   left son=58 (39 obs) right son=59 (155 obs)
##   Primary splits:
##       total_intl_minutes < 13.05  to the right, improve=59.930560, (0 missing)
##       total_intl_charge  < 3.525  to the right, improve=59.930560, (0 missing)
##       state              splits as  RLLRRRRRLLRRRRL-LLRLLLRRRRRLRRRLLLLLLRRRLRLLRRLRRRR, improve=10.501880, (0 missing)
##       total_eve_calls    < 73     to the left,  improve= 2.056191, (0 missing)
##       total_day_calls    < 100.5  to the left,  improve= 1.947298, (0 missing)
##   Surrogate splits:
##       total_intl_charge     < 3.525  to the right, agree=1.000, adj=1.000, (0 split)
##       state                 splits as  RRRRRRRRRRRRRRR-RRRRRLRRRRRRRRRRRRRRRRRRRRRRRRRRRRR, agree=0.809, adj=0.051, (0 split)
##       number_vmail_messages < 40.5   to the right, agree=0.804, adj=0.026, (0 split)
##       total_day_minutes     < 52.45  to the left,  agree=0.804, adj=0.026, (0 split)
##       total_day_calls       < 143.5  to the right, agree=0.804, adj=0.026, (0 split)
## 
## Node number 30: 273 observations,    complexity param=0.01882353
##   predicted class=no   expected loss=0.1428571  P(node) =0.09096968
##     class counts:    39   234
##    probabilities: 0.143 0.857 
##   left son=60 (30 obs) right son=61 (243 obs)
##   Primary splits:
##       total_eve_minutes   < 267.35 to the right, improve=26.230810, (0 missing)
##       total_eve_charge    < 22.725 to the right, improve=26.230810, (0 missing)
##       state               splits as  RRLRLRLRLLRRRRRLRLRLRRLLLLLRLRRLLRRLRLRRRLRLRRRRRLR, improve= 9.714764, (0 missing)
##       total_night_minutes < 191.85 to the right, improve= 3.223335, (0 missing)
##       total_night_charge  < 8.635  to the right, improve= 3.223335, (0 missing)
##   Surrogate splits:
##       total_eve_charge   < 22.725 to the right, agree=1.000, adj=1.000, (0 split)
##       total_intl_minutes < 2.05   to the left,  agree=0.894, adj=0.033, (0 split)
##       total_intl_charge  < 0.555  to the left,  agree=0.894, adj=0.033, (0 split)
## 
## Node number 31: 1982 observations
##   predicted class=no   expected loss=0.02522704  P(node) =0.6604465
##     class counts:    50  1932
##    probabilities: 0.025 0.975 
## 
## Node number 34: 8 observations
##   predicted class=yes  expected loss=0  P(node) =0.002665778
##     class counts:     8     0
##    probabilities: 1.000 0.000 
## 
## Node number 35: 12 observations
##   predicted class=no   expected loss=0.08333333  P(node) =0.003998667
##     class counts:     1    11
##    probabilities: 0.083 0.917 
## 
## Node number 52: 53 observations,    complexity param=0.01411765
##   predicted class=yes  expected loss=0.3773585  P(node) =0.01766078
##     class counts:    33    20
##    probabilities: 0.623 0.377 
##   left son=104 (18 obs) right son=105 (35 obs)
##   Primary splits:
##       state             splits as  L-L-L--R--L--R--R-LL-LLR-L-R--RR-RR-----L-RR---LLR-, improve=7.762803, (0 missing)
##       total_day_minutes < 182.75 to the left,  improve=7.563398, (0 missing)
##       total_day_charge  < 31.065 to the left,  improve=7.563398, (0 missing)
##       total_eve_minutes < 138.55 to the left,  improve=2.092474, (0 missing)
##       total_eve_charge  < 11.78  to the left,  improve=2.092474, (0 missing)
##   Surrogate splits:
##       total_day_minutes   < 174.45 to the left,  agree=0.736, adj=0.222, (0 split)
##       total_day_calls     < 66     to the left,  agree=0.736, adj=0.222, (0 split)
##       total_day_charge    < 29.66  to the left,  agree=0.736, adj=0.222, (0 split)
##       total_night_minutes < 178.75 to the left,  agree=0.736, adj=0.222, (0 split)
##       total_night_charge  < 8.045  to the left,  agree=0.736, adj=0.222, (0 split)
## 
## Node number 53: 24 observations
##   predicted class=no   expected loss=0.2083333  P(node) =0.007997334
##     class counts:     5    19
##    probabilities: 0.208 0.792 
## 
## Node number 58: 39 observations
##   predicted class=yes  expected loss=0  P(node) =0.01299567
##     class counts:    39     0
##    probabilities: 1.000 0.000 
## 
## Node number 59: 155 observations
##   predicted class=no   expected loss=0.01935484  P(node) =0.05164945
##     class counts:     3   152
##    probabilities: 0.019 0.981 
## 
## Node number 60: 30 observations,    complexity param=0.01647059
##   predicted class=yes  expected loss=0.2333333  P(node) =0.009996668
##     class counts:    23     7
##    probabilities: 0.767 0.233 
##   left son=120 (23 obs) right son=121 (7 obs)
##   Primary splits:
##       state             splits as  -RL-L-L-L-RR---LRL----LLLLL-LR--L--L-L---L------RL-, improve=10.7333300, (0 missing)
##       total_day_calls   < 102.5  to the left,  improve= 2.1777780, (0 missing)
##       total_night_calls < 113.5  to the right, improve= 1.1878790, (0 missing)
##       total_day_minutes < 239.75 to the left,  improve= 0.6960663, (0 missing)
##       total_day_charge  < 40.76  to the left,  improve= 0.6960663, (0 missing)
##   Surrogate splits:
##       voice_mail_plan       splits as  LR,         agree=0.9, adj=0.571, (0 split)
##       number_vmail_messages < 12     to the left,  agree=0.9, adj=0.571, (0 split)
##       total_night_minutes   < 179.15 to the right, agree=0.8, adj=0.143, (0 split)
##       total_night_charge    < 8.06   to the right, agree=0.8, adj=0.143, (0 split)
## 
## Node number 61: 243 observations
##   predicted class=no   expected loss=0.06584362  P(node) =0.08097301
##     class counts:    16   227
##    probabilities: 0.066 0.934 
## 
## Node number 104: 18 observations
##   predicted class=yes  expected loss=0  P(node) =0.005998001
##     class counts:    18     0
##    probabilities: 1.000 0.000 
## 
## Node number 105: 35 observations,    complexity param=0.01411765
##   predicted class=no   expected loss=0.4285714  P(node) =0.01166278
##     class counts:    15    20
##    probabilities: 0.429 0.571 
##   left son=210 (9 obs) right son=211 (26 obs)
##   Primary splits:
##       total_day_minutes < 182.5  to the left,  improve=5.134310, (0 missing)
##       total_day_charge  < 31.025 to the left,  improve=5.134310, (0 missing)
##       total_eve_minutes < 138.55 to the left,  improve=3.862857, (0 missing)
##       total_eve_charge  < 11.78  to the left,  improve=3.862857, (0 missing)
##       total_eve_calls   < 87.5   to the left,  improve=2.142857, (0 missing)
##   Surrogate splits:
##       total_day_charge   < 31.025 to the left,  agree=1.0, adj=1.000, (0 split)
##       state              splits as  -------R-----R--R------R---R--RR-LL-------RR-----R-, agree=0.8, adj=0.222, (0 split)
##       total_eve_calls    < 67.5   to the left,  agree=0.8, adj=0.222, (0 split)
##       total_intl_minutes < 14.9   to the right, agree=0.8, adj=0.222, (0 split)
##       total_intl_charge  < 4.025  to the right, agree=0.8, adj=0.222, (0 split)
## 
## Node number 120: 23 observations
##   predicted class=yes  expected loss=0  P(node) =0.007664112
##     class counts:    23     0
##    probabilities: 1.000 0.000 
## 
## Node number 121: 7 observations
##   predicted class=no   expected loss=0  P(node) =0.002332556
##     class counts:     0     7
##    probabilities: 0.000 1.000 
## 
## Node number 210: 9 observations
##   predicted class=yes  expected loss=0.1111111  P(node) =0.002999
##     class counts:     8     1
##    probabilities: 0.889 0.111 
## 
## Node number 211: 26 observations
##   predicted class=no   expected loss=0.2692308  P(node) =0.008663779
##     class counts:     7    19
##    probabilities: 0.269 0.731

# Draw the fitted tree.
fancyRpartPlot(Tree_model)

# Score the held-out test set: predicted class labels, then the per-class
# probability matrix.
Tree_pre <- predict(Tree_model, newdata = Test, type = "class")
Tree_pre2 <- predict(Tree_model, newdata = Test, type = "prob")



library(randomForest)

## Warning: package 'randomForest' was built under R version 3.3.3

## randomForest 4.6-12

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:ggplot2':
## 
##     margin

# Random forest of 50 trees, each limited to at most 4 terminal nodes;
# importance = TRUE stores the variable importance measures reported below.
rf <- randomForest(
  churn ~ .,
  data = Train,
  ntree = 50,
  maxnodes = 4,
  importance = TRUE
)

# Per-class variable importance table.
varImp(rf)

##                                      yes         no
## state                          0.3297402  0.3297402
## area_code                      0.0000000  0.0000000
## international_plan             2.5833397  2.5833397
## voice_mail_plan                2.0002049  2.0002049
## number_vmail_messages          1.8125712  1.8125712
## total_day_minutes              4.3514624  4.3514624
## total_day_calls                0.0000000  0.0000000
## total_day_charge               4.9337185  4.9337185
## total_eve_minutes              2.0165740  2.0165740
## total_eve_calls                0.0000000  0.0000000
## total_eve_charge               1.2499870  1.2499870
## total_night_minutes            1.3547760  1.3547760
## total_night_calls              1.1197686  1.1197686
## total_night_charge            -1.0101525 -1.0101525
## total_intl_minutes             1.4228727  1.4228727
## total_intl_calls               1.7747736  1.7747736
## total_intl_charge             -0.4899153 -0.4899153
## number_customer_service_calls  1.9553956  1.9553956

# Plot the variable importance measures of the forest.
varImpPlot(rf)

# Score the held-out test set: predicted class labels, then the per-class
# probability matrix.
rf_pre <- predict(rf, newdata = Test, type = "class")
rf_pre2 <- predict(rf, newdata = Test, type = "prob")

Model validation using Precision Recall curve and ROC curve

library(ggplot2)
library(plotROC)

## Warning: package 'plotROC' was built under R version 3.3.3

# One data frame per model: the predicted class in the first column, followed
# by the class-probability columns taken from the prediction matrix.
Ds_df <- data.frame(Tree_pre, Tree_pre2, check.names = FALSE)

Rf_df <- data.frame(rf_pre, rf_pre2, check.names = FALSE)



# ROC curve for the decision tree on the held-out test set.
# The response `d` must be the OBSERVED churn label from Test, not the model's
# own predicted class; scoring a model against its own predictions says
# nothing about how well it generalises. The label is coded explicitly as
# 1 = "yes" (churn) and 0 = "no", and the marker `m` is the predicted
# probability of that same "yes" event, so the curve is not inverted.
# The rows of Ds_df are the predictions for the rows of Test, in order.
Ds_roc <- cbind.data.frame(Ds_df, actual = as.integer(Test$churn == "yes"))

ggplot(Ds_roc, aes(d = actual, m = yes)) +
  geom_roc() +
  style_roc()

# ROC curve for the random forest on the held-out test set.
# The response `d` must be the OBSERVED churn label from Test, not the model's
# own predicted class; the label is coded explicitly as 1 = "yes" (churn) and
# 0 = "no", and the marker `m` is the predicted probability of that same
# "yes" event, so the curve is not inverted.
# The rows of Rf_df are the predictions for the rows of Test, in order.
Rf_roc <- cbind.data.frame(Rf_df, actual = as.integer(Test$churn == "yes"))

ggplot(Rf_roc, aes(d = actual, m = yes)) +
  geom_roc() +
  style_roc()

Churn_Prediction

Swarnava

15 de marzo de 2017

DATA PREPARATION: IT INCLUDES THE FOLLOWING STEPS

Visualisation of categorical and continuous variables

Running the Decision Tree and Random Forest model

Model validation using Precision Recall curve and ROC curve