options(width=100)

# Basic (instructional) logistic regression analysis script
# Built from R Commander script file
# Includes nonessential instructional elements
#   (suboptimal) Ordinary least-squares regression
#   Additional prediction logic
#   Histograms and other plots to compare alternative predictions

# Suggested usage:
#   Copy the required data files, ChurnExampleTrain.csv, ChurnExamplePred.csv 
#     and ChurnExampleTest.csv into a new project folder <folder>
#   Change the working directory (setwd below) to point to the project <folder>
#   Run this entire code block

# Set up working directory for file locations
#   and load required libraries

setwd("D:/MBA/SecondTerm/Data/churn")

library(RcmdrMisc)
library(car)

# DATA ENTRY AND SUMMARIZATION
# Read Training data set, list, summarize,
#   convert AreaCode to factor

ChurnTrain <- 
  read.table("ChurnExampleTrain.csv", 
  header=TRUE, sep=",", na.strings="NA", dec=".", strip.white=TRUE)
TrainRows <- nrow(ChurnTrain)
TrainRows
## [1] 508
ChurnTrain[1:5,]
##   State AreaCode PhoneNum Churn ChurnL ChurnTF ChurnYN Tenure VmailPlan IntPlan CustServCall
## 1    NJ      415 358-1921     0 False.   FALSE      no    137        no      no            0
## 2    IN      415 329-6603     1  True.    TRUE     yes     65        no      no            4
## 3    IA      408 363-1107     0 False.   FALSE      no    168        no      no            1
## 4    VT      510 386-2923     0 False.   FALSE      no     93        no      no            3
## 5    TX      415 373-2782     0 False.   FALSE      no     73        no      no            1
##   VmailMsgs DayCall DayMin DayCharge EveCall EveMin EveCharge NightCall NightMin NightCharge
## 1         0     114  243.4     41.38     110  121.2     10.30       104    162.6        7.32
## 2         0     137  129.1     21.95      83  228.5     19.42       111    208.8        9.40
## 3         0      96  128.8     21.90      71  104.9      8.92       128    141.1        6.35
## 4         0     114  190.7     32.42     111  218.2     18.55       121    129.6        5.83
## 5         0      90  224.4     38.15      88  159.5     13.56        74    192.8        8.68
##   IntCall IntMin IntCharge
## 1       5   12.2      3.29
## 2       6   12.7      3.43
## 3       2   11.2      3.02
## 4       3    8.1      2.19
## 5       2   13.0      3.51
ChurnTrain <- within(ChurnTrain, {
  AreaCode <- as.factor(AreaCode)
})

# Demonstration of Simple Logistic Regression vs Simple Linear Regression

#   Churn vs DayMin with fitted regression line

scatterplot(Churn~DayMin, reg.line=lm, smooth=FALSE, spread=FALSE, 
  boxplots=FALSE, span=0.5, ellipse=FALSE, levels=c(.5, .9), data=ChurnTrain)

#   Churn vs DayMin with fitted logistic probabilities

SimpleLogistic <- glm(Churn ~ DayMin, family=binomial(logit), data=ChurnTrain)
summary(SimpleLogistic)
## 
## Call:
## glm(formula = Churn ~ DayMin, family = binomial(logit), data = ChurnTrain)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.0657  -0.6039  -0.4798  -0.3464   2.8830  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -4.140170   0.506363  -8.176 2.93e-16 ***
## DayMin       0.012268   0.002366   5.184 2.17e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 425.29  on 507  degrees of freedom
## Residual deviance: 395.43  on 506  degrees of freedom
## AIC: 399.43
## 
## Number of Fisher Scoring iterations: 5
exp(coef(SimpleLogistic))  # Exponentiated coefficients ("odds ratios")
## (Intercept)      DayMin 
##  0.01592015  1.01234340
LogisticProb <- predict(SimpleLogistic, newdata=ChurnTrain, type = "response",
                        interval="none", level=.95, se.fit=FALSE)
DayMin <- ChurnTrain$DayMin
Churn <- ChurnTrain$Churn
plot(x = DayMin, y = Churn)
points(x = DayMin, y = LogisticProb, col = "red")

# Data Understanding -- Numerical summaries and scatterplot matrix

numSummary(ChurnTrain[,c("Churn", "CustServCall", "DayCall", "DayCharge", 
  "DayMin", "EveCall", "EveCharge", "EveMin", "IntCall", "IntCharge", 
  "IntMin", "NightCall", "NightCharge", "NightMin", "Tenure", "VmailMsgs")],
  statistics=c("mean", "sd", "quantiles", "skewness"), 
  quantiles=c(0,.25,.5,.75,1), type="2")
##                     mean         sd    skewness    0%      25%     50%     75%   100%   n
## Churn          0.1476378  0.3550902  1.99247906  0.00   0.0000   0.000   0.000   1.00 508
## CustServCall   1.6377953  1.3683909  1.07708652  0.00   1.0000   1.000   2.000   8.00 508
## DayCall      101.0984252 20.2879216 -0.40160587  0.00  87.7500 102.000 115.000 151.00 508
## DayCharge     30.6920669 10.0253583 -0.05716419  0.00  24.3550  30.975  37.465  58.96 508
## DayMin       180.5377953 58.9715977 -0.05720994  0.00 143.2750 182.200 220.400 346.80 508
## EveCall      100.7814961 20.0806434  0.07542696 46.00  87.0000 101.000 114.000 168.00 508
## EveCharge     16.9373031  4.4196795 -0.05282099  3.61  13.9075  16.875  19.985  28.56 508
## EveMin       199.2576772 51.9954637 -0.05283473 42.50 163.5750 198.550 235.150 336.00 508
## IntCall        4.4094488  2.3191318  1.03693287  0.00   3.0000   4.000   6.000  14.00 508
## IntCharge      2.7086417  0.7440602 -0.21759992  0.00   2.2700   2.730   3.190   4.97 508
## IntMin        10.0297244  2.7553574 -0.21620538  0.00   8.4000  10.100  11.800  18.40 508
## NightCall    100.1358268 20.2707331 -0.01736662 33.00  86.0000 100.000 115.250 157.00 508
## NightCharge    8.9138189  2.2598655 -0.01784827  2.86   7.3150   8.860  10.590  16.99 508
## NightMin     198.0807087 50.2123410 -0.01805058 63.60 162.5000 196.800 235.400 377.50 508
## Tenure       101.7755906 39.7799772  0.09847105  1.00  76.0000 100.500 127.000 232.00 508
## VmailMsgs      8.1515748 13.8050193  1.28510519  0.00   0.0000   0.000  20.000  51.00 508
scatterplotMatrix(~Churn+Tenure+VmailMsgs+CustServCall+
  DayMin+EveMin+IntMin+NightMin, reg.line=lm, smooth=TRUE, 
  spread=FALSE, span=0.5, id.n=0, diagonal = 'histogram', 
  data=ChurnTrain)

# ANALYSIS AND MODELING OF TRAINING DATA
# Fit full and reduced regression models

FullRegression <- 
  lm(Churn~CustServCall+DayCall+DayCharge+DayMin+EveCall+EveCharge+EveMin+
  IntCall+IntCharge+IntMin+NightCall+NightCharge+NightMin+Tenure+VmailMsgs,
  data=ChurnTrain)
summary(FullRegression)
## 
## Call:
## lm(formula = Churn ~ CustServCall + DayCall + DayCharge + DayMin + 
##     EveCall + EveCharge + EveMin + IntCall + IntCharge + IntMin + 
##     NightCall + NightCharge + NightMin + Tenure + VmailMsgs, 
##     data = ChurnTrain)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.5270 -0.1957 -0.1042  0.0048  1.0841 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -5.312e-01  1.798e-01  -2.955 0.003281 ** 
## CustServCall  3.780e-02  1.104e-02   3.425 0.000667 ***
## DayCall       5.264e-05  7.466e-04   0.071 0.943822    
## DayCharge    -1.590e+00  5.171e+00  -0.307 0.758626    
## DayMin        2.716e-01  8.790e-01   0.309 0.757463    
## EveCall       8.711e-04  7.493e-04   1.163 0.245574    
## EveCharge    -1.076e+00  5.098e+00  -0.211 0.832915    
## EveMin        9.218e-02  4.333e-01   0.213 0.831636    
## IntCall      -1.786e-02  6.515e-03  -2.741 0.006347 ** 
## IntCharge     4.901e+00  5.039e+00   0.973 0.331269    
## IntMin       -1.317e+00  1.361e+00  -0.968 0.333493    
## NightCall     6.678e-04  7.487e-04   0.892 0.372864    
## NightCharge   4.746e+00  5.530e+00   0.858 0.391193    
## NightMin     -2.129e-01  2.489e-01  -0.855 0.392732    
## Tenure       -2.614e-04  3.783e-04  -0.691 0.489869    
## VmailMsgs    -1.999e-03  1.090e-03  -1.835 0.067177 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.337 on 492 degrees of freedom
## Multiple R-squared:  0.1262, Adjusted R-squared:  0.09953 
## F-statistic: 4.736 on 15 and 492 DF,  p-value: 1.389e-08
ReducedRegression <- stepwise(FullRegression, direction='forward/backward', criterion='BIC')
## 
## Direction:  forward/backward
## Criterion:  BIC 
## 
## Start:  AIC=-1046.72
## Churn ~ 1
## 
##                Df Sum of Sq    RSS     AIC
## + DayMin        1    3.6136 60.314 -1070.0
## + DayCharge     1    3.6134 60.314 -1070.0
## + CustServCall  1    1.3777 62.549 -1051.6
## + IntCall       1    0.9062 63.021 -1047.7
## + EveMin        1    0.8051 63.122 -1046.9
## + EveCharge     1    0.8047 63.122 -1046.9
## <none>                      63.927 -1046.7
## + NightCharge   1    0.5400 63.387 -1044.8
## + NightMin      1    0.5398 63.387 -1044.8
## + VmailMsgs     1    0.5397 63.387 -1044.8
## + EveCall       1    0.2104 63.717 -1042.2
## + Tenure        1    0.1409 63.786 -1041.6
## + IntCharge     1    0.1005 63.827 -1041.3
## + IntMin        1    0.0995 63.828 -1041.3
## + NightCall     1    0.0124 63.915 -1040.6
## + DayCall       1    0.0064 63.921 -1040.5
## 
## Step:  AIC=-1070.05
## Churn ~ DayMin
## 
##                Df Sum of Sq    RSS     AIC
## + CustServCall  1    1.3037 59.010 -1074.9
## + IntCall       1    0.8004 59.513 -1070.6
## <none>                      60.314 -1070.0
## + EveMin        1    0.6638 59.650 -1069.4
## + EveCharge     1    0.6636 59.650 -1069.4
## + NightCharge   1    0.5963 59.717 -1068.9
## + NightMin      1    0.5959 59.718 -1068.9
## + VmailMsgs     1    0.4207 59.893 -1067.4
## + IntCharge     1    0.1314 60.182 -1064.9
## + IntMin        1    0.1305 60.183 -1064.9
## + EveCall       1    0.1291 60.184 -1064.9
## + Tenure        1    0.1204 60.193 -1064.8
## + DayCharge     1    0.0287 60.285 -1064.1
## + NightCall     1    0.0233 60.290 -1064.0
## + DayCall       1    0.0001 60.313 -1063.8
## - DayMin        1    3.6136 63.927 -1046.7
## 
## Step:  AIC=-1074.92
## Churn ~ DayMin + CustServCall
## 
##                Df Sum of Sq    RSS     AIC
## + IntCall       1    0.7498 58.260 -1075.2
## <none>                      59.010 -1074.9
## + EveMin        1    0.6502 58.360 -1074.3
## + EveCharge     1    0.6501 58.360 -1074.3
## + NightCharge   1    0.6457 58.364 -1074.3
## + NightMin      1    0.6453 58.365 -1074.3
## + VmailMsgs     1    0.4189 58.591 -1072.3
## + EveCall       1    0.1708 58.839 -1070.2
## - CustServCall  1    1.3037 60.314 -1070.0
## + Tenure        1    0.1243 58.886 -1069.8
## + IntCharge     1    0.1192 58.891 -1069.7
## + IntMin        1    0.1183 58.892 -1069.7
## + NightCall     1    0.0575 58.952 -1069.2
## + DayCharge     1    0.0133 58.997 -1068.8
## + DayCall       1    0.0012 59.009 -1068.7
## - DayMin        1    3.5396 62.549 -1051.6
## 
## Step:  AIC=-1075.18
## Churn ~ DayMin + CustServCall + IntCall
## 
##                Df Sum of Sq    RSS     AIC
## <none>                      58.260 -1075.2
## + EveMin        1    0.7006 57.559 -1075.1
## + EveCharge     1    0.7004 57.560 -1075.1
## - IntCall       1    0.7498 59.010 -1074.9
## + NightCharge   1    0.6585 57.602 -1074.7
## + NightMin      1    0.6580 57.602 -1074.7
## + VmailMsgs     1    0.4093 57.851 -1072.5
## - CustServCall  1    1.2531 59.513 -1070.6
## + EveCall       1    0.1744 58.086 -1070.5
## + IntCharge     1    0.1403 58.120 -1070.2
## + IntMin        1    0.1394 58.121 -1070.2
## + Tenure        1    0.1068 58.153 -1069.9
## + NightCall     1    0.0846 58.175 -1069.7
## + DayCharge     1    0.0100 58.250 -1069.0
## + DayCall       1    0.0027 58.257 -1069.0
## - DayMin        1    3.4396 61.700 -1052.3
summary(ReducedRegression)
## 
## Call:
## lm(formula = Churn ~ DayMin + CustServCall + IntCall, data = ChurnTrain)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.45779 -0.19007 -0.11183 -0.01495  0.99278 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -0.0910259  0.0599865  -1.517  0.12978    
## DayMin        0.0013975  0.0002562   5.455 7.69e-08 ***
## CustServCall  0.0363484  0.0110397   3.293  0.00106 ** 
## IntCall      -0.0165953  0.0065160  -2.547  0.01117 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.34 on 504 degrees of freedom
## Multiple R-squared:  0.08865,    Adjusted R-squared:  0.08322 
## F-statistic: 16.34 on 3 and 504 DF,  p-value: 3.775e-10
oldpar <- par(oma=c(0,0,3,0), mfrow=c(2,2))
plot(ReducedRegression)

par(oldpar)

#   Compute predictions for reduced regression model
#     Better/smarter prediction planning

ChurnPred <- 
  read.table("ChurnExamplePred.csv", 
  header=TRUE, sep=",", na.strings="NA", dec=".", strip.white=TRUE)
ChurnPred # Sample prediction data, n = 5
##   State AreaCode PhoneNum Churn ChurnL ChurnTF ChurnYN Tenure VmailPlan IntPlan CustServCall
## 1    KS      415 413-9854     0 False.   FALSE      no     90       yes      no            1
## 2    NJ      408 402-8337     0 False.   FALSE      no    113        no     yes            3
## 3    AL      408 351-4935     0 False.   FALSE      no     74       yes      no            1
## 4    MI      510 355-9594     0 False.   FALSE      no     52        no      no            3
## 5    NE      415 329-9540     0 False.   FALSE      no    115        no      no            1
##   VmailMsgs DayCall DayMin DayCharge EveCall EveMin EveCharge NightCall NightMin NightCharge
## 1        30     104  191.3     32.52     109  257.1     21.85        89    177.7        8.00
## 2         0      98  122.4     20.81     116  223.8     19.02       148    267.5       12.04
## 3        46     109  277.7     47.21     114  270.0     22.95        82    165.3        7.44
## 4         0     100  219.0     37.23     110  148.9     12.66       129    151.8        6.83
## 5         0     104  135.4     23.02      91  138.8     11.80        92    208.5        9.38
##   IntCall IntMin IntCharge
## 1       1   11.0      2.97
## 2       5   11.6      3.13
## 3       2   10.4      2.81
## 4       8   11.9      3.21
## 5       3    5.4      1.46
predict(ReducedRegression, newdata=ChurnPred, interval="prediction", 
  level=.95, se.fit=FALSE)
##          fit        lwr       upr
## 1 0.19607649 -0.4741610 0.8663140
## 2 0.10610147 -0.5638881 0.7760910
## 3 0.30022860 -0.3710101 0.9714673
## 4 0.19131783 -0.4799034 0.8625391
## 5 0.08476337 -0.5846646 0.7541913
#   Compute, examine and save full-sample predictions for reduced regression

RegPred <- predict(ReducedRegression, newdata=ChurnTrain, interval="none")
with(ChurnTrain, Hist(RegPred, scale="frequency", breaks="Sturges", 
  col="darkgray"))

# Fit full and reduced logistic regression models

FullLogistic <- glm(Churn ~ CustServCall + DayCall + DayCharge + 
  DayMin + EveCall + EveCharge + EveMin + IntCall + IntCharge + 
  IntMin + IntPlan + NightCall + NightCharge + NightMin + 
  Tenure + VmailMsgs + VmailPlan, 
  family=binomial(logit), 
  data=ChurnTrain)
summary(FullLogistic)
## 
## Call:
## glm(formula = Churn ~ CustServCall + DayCall + DayCharge + DayMin + 
##     EveCall + EveCharge + EveMin + IntCall + IntCharge + IntMin + 
##     IntPlan + NightCall + NightCharge + NightMin + Tenure + VmailMsgs + 
##     VmailPlan, family = binomial(logit), data = ChurnTrain)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.7021  -0.5193  -0.3071  -0.1529   3.1766  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -8.853535   1.792735  -4.939 7.87e-07 ***
## CustServCall   0.364789   0.096916   3.764 0.000167 ***
## DayCall       -0.003032   0.007220  -0.420 0.674568    
## DayCharge    -21.870697  49.357515  -0.443 0.657688    
## DayMin         3.732121   8.390839   0.445 0.656475    
## EveCall        0.008711   0.007410   1.176 0.239781    
## EveCharge    -27.354617  49.843629  -0.549 0.583137    
## EveMin         2.331303   4.236754   0.550 0.582143    
## IntCall       -0.225158   0.077349  -2.911 0.003603 ** 
## IntCharge     62.930868  51.101898   1.231 0.218144    
## IntMin       -16.927849  13.799308  -1.227 0.219929    
## IntPlanyes     2.165740   0.381130   5.682 1.33e-08 ***
## NightCall      0.005313   0.007606   0.699 0.484803    
## NightCharge   32.613200  54.785074   0.595 0.551647    
## NightMin      -1.459913   2.465650  -0.592 0.553783    
## Tenure        -0.001558   0.003693  -0.422 0.673210    
## VmailMsgs      0.079129   0.043244   1.830 0.067276 .  
## VmailPlanyes  -3.430729   1.428155  -2.402 0.016296 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 425.29  on 507  degrees of freedom
## Residual deviance: 319.09  on 490  degrees of freedom
## AIC: 355.09
## 
## Number of Fisher Scoring iterations: 6
#    Full: Pseudo (Deviance) R2 = (425.29 - 319.09)/425.29 = 0.2497

ReducedLogistic <- stepwise(FullLogistic, direction='forward/backward', criterion='BIC')
## 
## Direction:  forward/backward
## Criterion:  BIC 
## 
## Start:  AIC=431.52
## Churn ~ 1
## 
##                Df Deviance    AIC
## + DayMin        1   395.43 407.89
## + DayCharge     1   395.43 407.89
## + IntPlan       1   398.55 411.01
## + CustServCall  1   415.18 427.64
## + IntCall       1   417.35 429.81
## + VmailPlan     1   418.63 431.09
## + EveMin        1   418.82 431.28
## + EveCharge     1   418.82 431.29
## <none>              425.29 431.52
## + VmailMsgs     1   420.56 433.03
## + NightCharge   1   420.97 433.43
## + NightMin      1   420.97 433.43
## + EveCall       1   423.62 436.08
## + Tenure        1   424.16 436.62
## + IntCharge     1   424.48 436.94
## + IntMin        1   424.49 436.95
## + NightCall     1   425.19 437.65
## + DayCall       1   425.24 437.70
## 
## Step:  AIC=407.89
## Churn ~ DayMin
## 
##                Df Deviance    AIC
## + IntPlan       1   369.16 387.85
## + CustServCall  1   385.13 403.83
## + IntCall       1   387.25 405.94
## <none>              395.43 407.89
## + VmailPlan     1   389.24 407.93
## + EveMin        1   389.55 408.24
## + EveCharge     1   389.55 408.24
## + NightCharge   1   390.82 409.51
## + NightMin      1   390.82 409.51
## + VmailMsgs     1   391.22 409.91
## + IntCharge     1   394.29 412.98
## + IntMin        1   394.29 412.98
## + EveCall       1   394.41 413.10
## + Tenure        1   394.59 413.29
## + DayCharge     1   395.10 413.80
## + NightCall     1   395.31 414.00
## + DayCall       1   395.40 414.09
## - DayMin        1   425.29 431.52
## 
## Step:  AIC=387.85
## Churn ~ DayMin + IntPlan
## 
##                Df Deviance    AIC
## + CustServCall  1   356.34 381.26
## + VmailPlan     1   361.21 386.13
## + IntCall       1   361.97 386.89
## <none>              369.16 387.85
## + VmailMsgs     1   363.78 388.70
## + NightCharge   1   363.84 388.76
## + NightMin      1   363.84 388.76
## + EveMin        1   364.01 388.94
## + EveCharge     1   364.02 388.94
## + EveCall       1   367.86 392.78
## + DayCharge     1   368.35 393.28
## + Tenure        1   368.36 393.29
## + IntCharge     1   368.48 393.40
## + IntMin        1   368.48 393.40
## + NightCall     1   369.13 394.05
## + DayCall       1   369.16 394.08
## - IntPlan       1   395.43 407.89
## - DayMin        1   398.55 411.01
## 
## Step:  AIC=381.26
## Churn ~ DayMin + IntPlan + CustServCall
## 
##                Df Deviance    AIC
## + VmailPlan     1   348.11 379.26
## + IntCall       1   349.02 380.17
## <none>              356.34 381.26
## + NightCharge   1   350.61 381.76
## + NightMin      1   350.61 381.76
## + VmailMsgs     1   350.68 381.83
## + EveMin        1   351.03 382.18
## + EveCharge     1   351.03 382.18
## + EveCall       1   354.71 385.87
## + Tenure        1   355.36 386.51
## + DayCharge     1   355.76 386.91
## + IntCharge     1   355.82 386.98
## + IntMin        1   355.83 386.98
## + NightCall     1   356.09 387.24
## + DayCall       1   356.33 387.48
## - CustServCall  1   369.16 387.85
## - IntPlan       1   385.13 403.83
## - DayMin        1   386.84 405.53
## 
## Step:  AIC=379.26
## Churn ~ DayMin + IntPlan + CustServCall + VmailPlan
## 
##                Df Deviance    AIC
## + IntCall       1   340.52 377.90
## + NightCharge   1   341.60 378.98
## + NightMin      1   341.60 378.98
## <none>              348.11 379.26
## + EveMin        1   342.67 380.05
## + EveCharge     1   342.67 380.05
## - VmailPlan     1   356.34 381.26
## + VmailMsgs     1   345.96 383.34
## + EveCall       1   346.53 383.92
## + Tenure        1   347.57 384.95
## + IntCharge     1   347.61 385.00
## + IntMin        1   347.62 385.00
## + DayCharge     1   347.87 385.26
## + NightCall     1   347.93 385.32
## + DayCall       1   348.11 385.49
## - CustServCall  1   361.21 386.13
## - IntPlan       1   378.74 403.66
## - DayMin        1   379.38 404.30
## 
## Step:  AIC=377.9
## Churn ~ DayMin + IntPlan + CustServCall + VmailPlan + IntCall
## 
##                Df Deviance    AIC
## + NightCharge   1   333.68 377.29
## + NightMin      1   333.68 377.30
## <none>              340.52 377.90
## + EveMin        1   334.55 378.17
## + EveCharge     1   334.55 378.17
## - IntCall       1   348.11 379.26
## - VmailPlan     1   349.02 380.17
## + VmailMsgs     1   338.34 381.96
## + EveCall       1   338.97 382.59
## + IntCharge     1   339.24 382.85
## + IntMin        1   339.25 382.86
## + NightCall     1   339.93 383.54
## + Tenure        1   340.25 383.86
## + DayCharge     1   340.48 384.10
## + DayCall       1   340.52 384.13
## - CustServCall  1   353.77 384.92
## - IntPlan       1   370.28 401.43
## - DayMin        1   373.06 404.22
## 
## Step:  AIC=377.29
## Churn ~ DayMin + IntPlan + CustServCall + VmailPlan + IntCall + 
##     NightCharge
## 
##                Df Deviance    AIC
## <none>              333.68 377.29
## - NightCharge   1   340.52 377.90
## + EveMin        1   328.28 378.12
## + EveCharge     1   328.28 378.12
## - IntCall       1   341.60 378.98
## + VmailMsgs     1   330.38 380.23
## - VmailPlan     1   343.03 380.41
## + EveCall       1   331.99 381.84
## + IntCharge     1   332.44 382.28
## + IntMin        1   332.45 382.29
## + NightMin      1   333.13 382.98
## + NightCall     1   333.24 383.08
## + Tenure        1   333.52 383.36
## + DayCharge     1   333.64 383.48
## + DayCall       1   333.67 383.52
## - CustServCall  1   347.53 384.91
## - IntPlan       1   364.54 401.92
## - DayMin        1   367.09 404.47
summary(ReducedLogistic)
## 
## Call:
## glm(formula = Churn ~ DayMin + IntPlan + CustServCall + VmailPlan + 
##     IntCall + NightCharge, family = binomial(logit), data = ChurnTrain)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6054  -0.5235  -0.3381  -0.1870   2.7745  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -5.990306   0.913782  -6.556 5.55e-11 ***
## DayMin        0.014215   0.002641   5.383 7.31e-08 ***
## IntPlanyes    2.061300   0.363443   5.672 1.41e-08 ***
## CustServCall  0.350339   0.093376   3.752 0.000175 ***
## VmailPlanyes -1.071071   0.376938  -2.842 0.004490 ** 
## IntCall      -0.191472   0.071325  -2.684 0.007264 ** 
## NightCharge   0.164072   0.063666   2.577 0.009964 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 425.29  on 507  degrees of freedom
## Residual deviance: 333.68  on 501  degrees of freedom
## AIC: 347.68
## 
## Number of Fisher Scoring iterations: 6
#    Reduced: Pseudo (Deviance) R2 = (425.29 - 333.68)/425.29 = 0.2154

# Exponentiate and interpret logistic coefficients
#   Multiplicative change in odds per unit change in predictor

LogisticCoef <- coef(summary(ReducedLogistic))
LogisticCoef
##                 Estimate  Std. Error   z value     Pr(>|z|)
## (Intercept)  -5.99030568 0.913781875 -6.555509 5.545225e-11
## DayMin        0.01421539 0.002640608  5.383379 7.310051e-08
## IntPlanyes    2.06130037 0.363443128  5.671590 1.414783e-08
## CustServCall  0.35033876 0.093375946  3.751917 1.754878e-04
## VmailPlanyes -1.07107135 0.376938333 -2.841503 4.490145e-03
## IntCall      -0.19147186 0.071325210 -2.684491 7.264040e-03
## NightCharge   0.16407217 0.063666230  2.577067 9.964251e-03
LogisticCoef[,1]
##  (Intercept)       DayMin   IntPlanyes CustServCall VmailPlanyes      IntCall  NightCharge 
##  -5.99030568   0.01421539   2.06130037   0.35033876  -1.07107135  -0.19147186   0.16407217
ExpCoef <- (exp(LogisticCoef[,1]))
ExpCoef
##  (Intercept)       DayMin   IntPlanyes CustServCall VmailPlanyes      IntCall  NightCharge 
##  0.002502899  1.014316910  7.856179119  1.419548356  0.342641233  0.825742858  1.178299346
cbind(LogisticCoef, ExpCoef)
##                 Estimate  Std. Error   z value     Pr(>|z|)     ExpCoef
## (Intercept)  -5.99030568 0.913781875 -6.555509 5.545225e-11 0.002502899
## DayMin        0.01421539 0.002640608  5.383379 7.310051e-08 1.014316910
## IntPlanyes    2.06130037 0.363443128  5.671590 1.414783e-08 7.856179119
## CustServCall  0.35033876 0.093375946  3.751917 1.754878e-04 1.419548356
## VmailPlanyes -1.07107135 0.376938333 -2.841503 4.490145e-03 0.342641233
## IntCall      -0.19147186 0.071325210 -2.684491 7.264040e-03 0.825742858
## NightCharge   0.16407217 0.063666230  2.577067 9.964251e-03 1.178299346
# Compute and examine training-sample predictions 
#   for reduced logistic --
#   Logistic Pred, Logistic Prob, Classification
#   Better/smarter prediction planning

ChurnPred # Sample prediction data, n = 5
##   State AreaCode PhoneNum Churn ChurnL ChurnTF ChurnYN Tenure VmailPlan IntPlan CustServCall
## 1    KS      415 413-9854     0 False.   FALSE      no     90       yes      no            1
## 2    NJ      408 402-8337     0 False.   FALSE      no    113        no     yes            3
## 3    AL      408 351-4935     0 False.   FALSE      no     74       yes      no            1
## 4    MI      510 355-9594     0 False.   FALSE      no     52        no      no            3
## 5    NE      415 329-9540     0 False.   FALSE      no    115        no      no            1
##   VmailMsgs DayCall DayMin DayCharge EveCall EveMin EveCharge NightCall NightMin NightCharge
## 1        30     104  191.3     32.52     109  257.1     21.85        89    177.7        8.00
## 2         0      98  122.4     20.81     116  223.8     19.02       148    267.5       12.04
## 3        46     109  277.7     47.21     114  270.0     22.95        82    165.3        7.44
## 4         0     100  219.0     37.23     110  148.9     12.66       129    151.8        6.83
## 5         0     104  135.4     23.02      91  138.8     11.80        92    208.5        9.38
##   IntCall IntMin IntCharge
## 1       1   11.0      2.97
## 2       5   11.6      3.13
## 3       2   10.4      2.81
## 4       8   11.9      3.21
## 5       3    5.4      1.46
predict(ReducedLogistic, newdata=ChurnPred)
##          1          2          3          4          5 
## -2.8705286 -0.1199557 -1.9256711 -2.2372809 -2.7506217
predict(ReducedLogistic, newdata=ChurnPred, type="response")
##          1          2          3          4          5 
## 0.05362982 0.47004699 0.12723050 0.09645225 0.06005155
#   Compute and examine full-sample predictions for reduced logistic

LogisticPred <- predict(ReducedLogistic, newdata=ChurnTrain)
with(ChurnTrain, Hist(LogisticPred, scale="frequency", 
  breaks="Sturges", col="darkgray"))

LogisticProb <- predict(ReducedLogistic, newdata=ChurnTrain, 
  type="response")
with(ChurnTrain, Hist(LogisticProb, scale="frequency", 
  breaks="Sturges", col="darkgray"))

scatterplot(LogisticProb~RegPred, reg.line=FALSE, smooth=TRUE, spread=FALSE,
   id.method='mahal', id.n = 2, boxplots='xy', span=0.5, data=ChurnTrain)

## 181 280 
## 181 280
scatterplot(LogisticProb~LogisticPred, reg.line=FALSE, 
  smooth=FALSE, spread=FALSE, id.method='mahal', id.n = 2, 
  boxplots='xy', span=0.5)

## 181 368 
## 181 368
# Classification analysis, training sample
Threshold <- 0.5
LogisticClass <- rep(0, TrainRows)
LogisticClass[LogisticProb > Threshold] <- 1
LogisticClass
##   [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
##  [48] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
##  [95] 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [142] 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0
## [189] 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## [236] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
## [283] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
## [330] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
## [377] 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
## [424] 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
## [471] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
table(LogisticClass, ChurnTrain$Churn)
##              
## LogisticClass   0   1
##             0 420  58
##             1  13  17
mean(LogisticClass==ChurnTrain$Churn)
## [1] 0.8602362
# Add predictions to data set, save to .Rdata and .csv

ChurnTrain$RegPred <- RegPred
ChurnTrain$LogisticPred <- LogisticPred
ChurnTrain$LogisticProb <- LogisticProb
ChurnTrain$LogisticClass <- LogisticClass
ChurnTrain[1:5,]
##   State AreaCode PhoneNum Churn ChurnL ChurnTF ChurnYN Tenure VmailPlan IntPlan CustServCall
## 1    NJ      415 358-1921     0 False.   FALSE      no    137        no      no            0
## 2    IN      415 329-6603     1  True.    TRUE     yes     65        no      no            4
## 3    IA      408 363-1107     0 False.   FALSE      no    168        no      no            1
## 4    VT      510 386-2923     0 False.   FALSE      no     93        no      no            3
## 5    TX      415 373-2782     0 False.   FALSE      no     73        no      no            1
##   VmailMsgs DayCall DayMin DayCharge EveCall EveMin EveCharge NightCall NightMin NightCharge
## 1         0     114  243.4     41.38     110  121.2     10.30       104    162.6        7.32
## 2         0     137  129.1     21.95      83  228.5     19.42       111    208.8        9.40
## 3         0      96  128.8     21.90      71  104.9      8.92       128    141.1        6.35
## 4         0     114  190.7     32.42     111  218.2     18.55       121    129.6        5.83
## 5         0      90  224.4     38.15      88  159.5     13.56        74    192.8        8.68
##   IntCall IntMin IntCharge    RegPred LogisticPred LogisticProb LogisticClass
## 1       5   12.2      3.29 0.16615862    -2.286631   0.09223627             0
## 2       6   12.7      3.43 0.13521804    -2.360297   0.08625082             0
## 3       2   11.2      3.02 0.09213493    -3.150110   0.04108694             0
## 4       3    8.1      2.19 0.23474408    -1.846289   0.13630917             0
## 5       2   13.0      3.51 0.22573973    -1.408831   0.19641857             0
save("ChurnTrain", file="ChurnExampleTrainAnalysis.RData")
write.table(ChurnTrain, 
  "ChurnExampleTrainAnalysis.csv", sep=",", 
  col.names=TRUE, row.names=FALSE, quote=FALSE, na="NA")

# MODEL EVALUATION USING TEST DATA
# Read Test data set, list, summarize,
#   convert AreaCode to factor

ChurnTest <- 
  read.table("ChurnExampleTest.csv", 
  header=TRUE, sep=",", na.strings="NA", dec=".", strip.white=TRUE)
TestRows <- nrow(ChurnTest)
TestRows
## [1] 106
ChurnTest[1:5,]
##   State AreaCode PhoneNum Churn ChurnL ChurnTF ChurnYN Tenure VmailPlan IntPlan CustServCall
## 1    KS      415 413-9854     0 False.   FALSE      no     90       yes      no            1
## 2    NJ      408 402-8337     0 False.   FALSE      no    113        no     yes            3
## 3    AL      408 351-4935     0 False.   FALSE      no     74       yes      no            1
## 4    MI      510 355-9594     0 False.   FALSE      no     52        no      no            3
## 5    NE      415 329-9540     0 False.   FALSE      no    115        no      no            1
##   VmailMsgs DayCall DayMin DayCharge EveCall EveMin EveCharge NightCall NightMin NightCharge
## 1        30     104  191.3     32.52     109  257.1     21.85        89    177.7        8.00
## 2         0      98  122.4     20.81     116  223.8     19.02       148    267.5       12.04
## 3        46     109  277.7     47.21     114  270.0     22.95        82    165.3        7.44
## 4         0     100  219.0     37.23     110  148.9     12.66       129    151.8        6.83
## 5         0     104  135.4     23.02      91  138.8     11.80        92    208.5        9.38
##   IntCall IntMin IntCharge
## 1       1   11.0      2.97
## 2       5   11.6      3.13
## 3       2   10.4      2.81
## 4       8   11.9      3.21
## 5       3    5.4      1.46
ChurnTest <- within(ChurnTest, {
  AreaCode <- as.factor(AreaCode)
})

# Data Understanding -- Numerical summaries and scatterplot matrix

numSummary(ChurnTest[,c("Churn", "CustServCall", "DayCall", "DayCharge", 
  "DayMin", "EveCall", "EveCharge", "EveMin", "IntCall", "IntCharge", 
  "IntMin", "NightCall", "NightCharge", "NightMin", "Tenure", "VmailMsgs")],
  statistics=c("mean", "sd", "quantiles", "skewness"), 
  quantiles=c(0,.25,.5,.75,1), type="2")
##                     mean         sd     skewness    0%      25%     50%      75%   100%   n
## Churn          0.1603774  0.3686989  1.877704622  0.00   0.0000   0.000   0.0000   1.00 106
## CustServCall   1.5377358  1.3393284  0.826951321  0.00   1.0000   1.000   2.0000   6.00 106
## DayCall      101.3773585 18.7816794 -0.126190599 50.00  88.0000 104.000 114.0000 156.00 106
## DayCharge     31.6448113  9.6023418 -0.264342776  7.16  24.2025  31.375  39.2350  53.11 106
## DayMin       186.1443396 56.4871820 -0.264436491 42.10 142.3750 184.550 230.8000 312.40 106
## EveCall       99.4811321 18.9777971 -0.045083789 47.00  87.5000  98.500 114.0000 147.00 106
## EveCharge     17.7786792  4.1822027 -0.440479653  7.09  15.4900  18.115  20.6725  26.64 106
## EveMin       209.1594340 49.2020979 -0.440627070 83.40 182.2500 213.150 243.2500 313.40 106
## IntCall        3.8962264  2.6472960  2.628713145  1.00   2.0000   4.000   5.0000  18.00 106
## IntCharge      2.7464151  0.7067622  0.147656965  1.35   2.2100   2.810   3.2100   5.00 106
## IntMin        10.1698113  2.6182571  0.144306310  5.00   8.1750  10.400  11.9000  18.50 106
## NightCall     98.6415094 20.3573731  0.066433941 51.00  84.2500  99.000 110.7500 151.00 106
## NightCharge    8.8598113  2.2876476  0.233862423  3.47   7.3825   8.760  10.2600  16.20 106
## NightMin     196.8632075 50.8252680  0.233439322 77.10 164.0500 194.650 228.0250 359.90 106
## Tenure        95.9622642 42.7386130 -0.004030175  1.00  62.7500 103.000 127.0000 222.00 106
## VmailMsgs      7.7830189 13.4078902  1.327406636  0.00   0.0000   0.000  15.0000  46.00 106
scatterplotMatrix(~Churn+Tenure+VmailMsgs+CustServCall+
  DayMin+EveMin+IntMin+NightMin, reg.line=lm, smooth=TRUE, 
  spread=FALSE, span=0.5, id.n=0, diagonal = 'histogram', 
  data=ChurnTest)

# Generate test set predictions -- 
#   Regression Pred, Logistic Pred, Logistic Prob, Classification

RegPred <- predict(ReducedRegression, newdata=ChurnTest, 
  interval="none")
LogisticPred <- predict(ReducedLogistic, newdata=ChurnTest, 
  interval="none")
LogisticProb <- predict(ReducedLogistic, newdata=ChurnTest, 
  interval="none", type="response")
Threshold <- 0.5
LogisticClass <- rep(0, TestRows)
LogisticClass[LogisticProb > Threshold] <- 1
LogisticClass
##   [1] 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [48] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [95] 0 0 0 0 0 1 0 0 0 0 1 0
table(LogisticClass, ChurnTest$Churn)
##              
## LogisticClass  0  1
##             0 89 11
##             1  0  6
mean(LogisticClass==ChurnTest$Churn)
## [1] 0.8962264
# Add predictions to test set, save and export to csv

ChurnTest$RegPred <- RegPred
ChurnTest$LogisticPred <- LogisticPred
ChurnTest$LogisticProb <- LogisticProb
ChurnTest$LogisticClass <- LogisticClass
ChurnTest[1:5,]
##   State AreaCode PhoneNum Churn ChurnL ChurnTF ChurnYN Tenure VmailPlan IntPlan CustServCall
## 1    KS      415 413-9854     0 False.   FALSE      no     90       yes      no            1
## 2    NJ      408 402-8337     0 False.   FALSE      no    113        no     yes            3
## 3    AL      408 351-4935     0 False.   FALSE      no     74       yes      no            1
## 4    MI      510 355-9594     0 False.   FALSE      no     52        no      no            3
## 5    NE      415 329-9540     0 False.   FALSE      no    115        no      no            1
##   VmailMsgs DayCall DayMin DayCharge EveCall EveMin EveCharge NightCall NightMin NightCharge
## 1        30     104  191.3     32.52     109  257.1     21.85        89    177.7        8.00
## 2         0      98  122.4     20.81     116  223.8     19.02       148    267.5       12.04
## 3        46     109  277.7     47.21     114  270.0     22.95        82    165.3        7.44
## 4         0     100  219.0     37.23     110  148.9     12.66       129    151.8        6.83
## 5         0     104  135.4     23.02      91  138.8     11.80        92    208.5        9.38
##   IntCall IntMin IntCharge    RegPred LogisticPred LogisticProb LogisticClass
## 1       1   11.0      2.97 0.19607649   -2.8705286   0.05362982             0
## 2       5   11.6      3.13 0.10610147   -0.1199557   0.47004699             0
## 3       2   10.4      2.81 0.30022860   -1.9256711   0.12723050             0
## 4       8   11.9      3.21 0.19131783   -2.2372809   0.09645225             0
## 5       3    5.4      1.46 0.08476337   -2.7506217   0.06005155             0
save("ChurnTest", file="ChurnTestAnalysis.RData")
write.table(ChurnTest, 
  "ChurnTestAnalysis.csv", sep=",", 
  col.names=TRUE, row.names=FALSE, quote=FALSE, na="NA")