options(width=100)
# Basic (instructional) logistic regression analysis script
# Built from R Commander script file
# Includes nonessential instructional elements
# (suboptimal) Ordinary least-squares regression
# Additional prediction logic
# Histograms and other plots to compare alternative predictions
# Suggested usage:
# Copy the required data files, ChurnExampleTrain.csv, ChurnExamplePred.csv
# and ChurnExampleTest.csv into a new project folder <folder>
# Change the working directory (setwd below) to point to the project <folder>
# Run this entire code block
# Set up working directory for file locations
# and load required libraries
setwd("D:/MBA/SecondTerm/Data/churn")
library(RcmdrMisc)
library(car)
# DATA ENTRY AND SUMMARIZATION
# Read Training data set, list, summarize,
# convert AreaCode to factor
ChurnTrain <-
read.table("ChurnExampleTrain.csv",
header=TRUE, sep=",", na.strings="NA", dec=".", strip.white=TRUE)
TrainRows <- nrow(ChurnTrain)
TrainRows
## [1] 508
ChurnTrain[1:5,]
## State AreaCode PhoneNum Churn ChurnL ChurnTF ChurnYN Tenure VmailPlan IntPlan CustServCall
## 1 NJ 415 358-1921 0 False. FALSE no 137 no no 0
## 2 IN 415 329-6603 1 True. TRUE yes 65 no no 4
## 3 IA 408 363-1107 0 False. FALSE no 168 no no 1
## 4 VT 510 386-2923 0 False. FALSE no 93 no no 3
## 5 TX 415 373-2782 0 False. FALSE no 73 no no 1
## VmailMsgs DayCall DayMin DayCharge EveCall EveMin EveCharge NightCall NightMin NightCharge
## 1 0 114 243.4 41.38 110 121.2 10.30 104 162.6 7.32
## 2 0 137 129.1 21.95 83 228.5 19.42 111 208.8 9.40
## 3 0 96 128.8 21.90 71 104.9 8.92 128 141.1 6.35
## 4 0 114 190.7 32.42 111 218.2 18.55 121 129.6 5.83
## 5 0 90 224.4 38.15 88 159.5 13.56 74 192.8 8.68
## IntCall IntMin IntCharge
## 1 5 12.2 3.29
## 2 6 12.7 3.43
## 3 2 11.2 3.02
## 4 3 8.1 2.19
## 5 2 13.0 3.51
ChurnTrain <- within(ChurnTrain, {
AreaCode <- as.factor(AreaCode)
})
# Demonstration of Simple Logistic Regression vs Simple Linear Regression
# Churn vs DayMin with fitted regression line
scatterplot(Churn~DayMin, reg.line=lm, smooth=FALSE, spread=FALSE,
boxplots=FALSE, span=0.5, ellipse=FALSE, levels=c(.5, .9), data=ChurnTrain)

# Churn vs DayMin with fitted logistic probabilities
SimpleLogistic <- glm(Churn ~ DayMin, family=binomial(logit), data=ChurnTrain)
summary(SimpleLogistic)
##
## Call:
## glm(formula = Churn ~ DayMin, family = binomial(logit), data = ChurnTrain)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.0657 -0.6039 -0.4798 -0.3464 2.8830
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.140170 0.506363 -8.176 2.93e-16 ***
## DayMin 0.012268 0.002366 5.184 2.17e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 425.29 on 507 degrees of freedom
## Residual deviance: 395.43 on 506 degrees of freedom
## AIC: 399.43
##
## Number of Fisher Scoring iterations: 5
exp(coef(SimpleLogistic)) # Exponentiated coefficients ("odds ratios")
## (Intercept) DayMin
## 0.01592015 1.01234340
LogisticProb <- predict(SimpleLogistic, newdata=ChurnTrain, type = "response",
interval="none", level=.95, se.fit=FALSE)
DayMin <- ChurnTrain$DayMin
Churn <- ChurnTrain$Churn
plot(x = DayMin, y = Churn)
points(x = DayMin, y = LogisticProb, col = "red")

# Data Understanding -- Numerical summaries and scatterplot matrix
numSummary(ChurnTrain[,c("Churn", "CustServCall", "DayCall", "DayCharge",
"DayMin", "EveCall", "EveCharge", "EveMin", "IntCall", "IntCharge",
"IntMin", "NightCall", "NightCharge", "NightMin", "Tenure", "VmailMsgs")],
statistics=c("mean", "sd", "quantiles", "skewness"),
quantiles=c(0,.25,.5,.75,1), type="2")
## mean sd skewness 0% 25% 50% 75% 100% n
## Churn 0.1476378 0.3550902 1.99247906 0.00 0.0000 0.000 0.000 1.00 508
## CustServCall 1.6377953 1.3683909 1.07708652 0.00 1.0000 1.000 2.000 8.00 508
## DayCall 101.0984252 20.2879216 -0.40160587 0.00 87.7500 102.000 115.000 151.00 508
## DayCharge 30.6920669 10.0253583 -0.05716419 0.00 24.3550 30.975 37.465 58.96 508
## DayMin 180.5377953 58.9715977 -0.05720994 0.00 143.2750 182.200 220.400 346.80 508
## EveCall 100.7814961 20.0806434 0.07542696 46.00 87.0000 101.000 114.000 168.00 508
## EveCharge 16.9373031 4.4196795 -0.05282099 3.61 13.9075 16.875 19.985 28.56 508
## EveMin 199.2576772 51.9954637 -0.05283473 42.50 163.5750 198.550 235.150 336.00 508
## IntCall 4.4094488 2.3191318 1.03693287 0.00 3.0000 4.000 6.000 14.00 508
## IntCharge 2.7086417 0.7440602 -0.21759992 0.00 2.2700 2.730 3.190 4.97 508
## IntMin 10.0297244 2.7553574 -0.21620538 0.00 8.4000 10.100 11.800 18.40 508
## NightCall 100.1358268 20.2707331 -0.01736662 33.00 86.0000 100.000 115.250 157.00 508
## NightCharge 8.9138189 2.2598655 -0.01784827 2.86 7.3150 8.860 10.590 16.99 508
## NightMin 198.0807087 50.2123410 -0.01805058 63.60 162.5000 196.800 235.400 377.50 508
## Tenure 101.7755906 39.7799772 0.09847105 1.00 76.0000 100.500 127.000 232.00 508
## VmailMsgs 8.1515748 13.8050193 1.28510519 0.00 0.0000 0.000 20.000 51.00 508
scatterplotMatrix(~Churn+Tenure+VmailMsgs+CustServCall+
DayMin+EveMin+IntMin+NightMin, reg.line=lm, smooth=TRUE,
spread=FALSE, span=0.5, id.n=0, diagonal = 'histogram',
data=ChurnTrain)

# ANALYSIS AND MODELING OF TRAINING DATA
# Fit full and reduced regression models
FullRegression <-
lm(Churn~CustServCall+DayCall+DayCharge+DayMin+EveCall+EveCharge+EveMin+
IntCall+IntCharge+IntMin+NightCall+NightCharge+NightMin+Tenure+VmailMsgs,
data=ChurnTrain)
summary(FullRegression)
##
## Call:
## lm(formula = Churn ~ CustServCall + DayCall + DayCharge + DayMin +
## EveCall + EveCharge + EveMin + IntCall + IntCharge + IntMin +
## NightCall + NightCharge + NightMin + Tenure + VmailMsgs,
## data = ChurnTrain)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.5270 -0.1957 -0.1042 0.0048 1.0841
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.312e-01 1.798e-01 -2.955 0.003281 **
## CustServCall 3.780e-02 1.104e-02 3.425 0.000667 ***
## DayCall 5.264e-05 7.466e-04 0.071 0.943822
## DayCharge -1.590e+00 5.171e+00 -0.307 0.758626
## DayMin 2.716e-01 8.790e-01 0.309 0.757463
## EveCall 8.711e-04 7.493e-04 1.163 0.245574
## EveCharge -1.076e+00 5.098e+00 -0.211 0.832915
## EveMin 9.218e-02 4.333e-01 0.213 0.831636
## IntCall -1.786e-02 6.515e-03 -2.741 0.006347 **
## IntCharge 4.901e+00 5.039e+00 0.973 0.331269
## IntMin -1.317e+00 1.361e+00 -0.968 0.333493
## NightCall 6.678e-04 7.487e-04 0.892 0.372864
## NightCharge 4.746e+00 5.530e+00 0.858 0.391193
## NightMin -2.129e-01 2.489e-01 -0.855 0.392732
## Tenure -2.614e-04 3.783e-04 -0.691 0.489869
## VmailMsgs -1.999e-03 1.090e-03 -1.835 0.067177 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.337 on 492 degrees of freedom
## Multiple R-squared: 0.1262, Adjusted R-squared: 0.09953
## F-statistic: 4.736 on 15 and 492 DF, p-value: 1.389e-08
ReducedRegression <- stepwise(FullRegression, direction='forward/backward', criterion='BIC')
##
## Direction: forward/backward
## Criterion: BIC
##
## Start: AIC=-1046.72
## Churn ~ 1
##
## Df Sum of Sq RSS AIC
## + DayMin 1 3.6136 60.314 -1070.0
## + DayCharge 1 3.6134 60.314 -1070.0
## + CustServCall 1 1.3777 62.549 -1051.6
## + IntCall 1 0.9062 63.021 -1047.7
## + EveMin 1 0.8051 63.122 -1046.9
## + EveCharge 1 0.8047 63.122 -1046.9
## <none> 63.927 -1046.7
## + NightCharge 1 0.5400 63.387 -1044.8
## + NightMin 1 0.5398 63.387 -1044.8
## + VmailMsgs 1 0.5397 63.387 -1044.8
## + EveCall 1 0.2104 63.717 -1042.2
## + Tenure 1 0.1409 63.786 -1041.6
## + IntCharge 1 0.1005 63.827 -1041.3
## + IntMin 1 0.0995 63.828 -1041.3
## + NightCall 1 0.0124 63.915 -1040.6
## + DayCall 1 0.0064 63.921 -1040.5
##
## Step: AIC=-1070.05
## Churn ~ DayMin
##
## Df Sum of Sq RSS AIC
## + CustServCall 1 1.3037 59.010 -1074.9
## + IntCall 1 0.8004 59.513 -1070.6
## <none> 60.314 -1070.0
## + EveMin 1 0.6638 59.650 -1069.4
## + EveCharge 1 0.6636 59.650 -1069.4
## + NightCharge 1 0.5963 59.717 -1068.9
## + NightMin 1 0.5959 59.718 -1068.9
## + VmailMsgs 1 0.4207 59.893 -1067.4
## + IntCharge 1 0.1314 60.182 -1064.9
## + IntMin 1 0.1305 60.183 -1064.9
## + EveCall 1 0.1291 60.184 -1064.9
## + Tenure 1 0.1204 60.193 -1064.8
## + DayCharge 1 0.0287 60.285 -1064.1
## + NightCall 1 0.0233 60.290 -1064.0
## + DayCall 1 0.0001 60.313 -1063.8
## - DayMin 1 3.6136 63.927 -1046.7
##
## Step: AIC=-1074.92
## Churn ~ DayMin + CustServCall
##
## Df Sum of Sq RSS AIC
## + IntCall 1 0.7498 58.260 -1075.2
## <none> 59.010 -1074.9
## + EveMin 1 0.6502 58.360 -1074.3
## + EveCharge 1 0.6501 58.360 -1074.3
## + NightCharge 1 0.6457 58.364 -1074.3
## + NightMin 1 0.6453 58.365 -1074.3
## + VmailMsgs 1 0.4189 58.591 -1072.3
## + EveCall 1 0.1708 58.839 -1070.2
## - CustServCall 1 1.3037 60.314 -1070.0
## + Tenure 1 0.1243 58.886 -1069.8
## + IntCharge 1 0.1192 58.891 -1069.7
## + IntMin 1 0.1183 58.892 -1069.7
## + NightCall 1 0.0575 58.952 -1069.2
## + DayCharge 1 0.0133 58.997 -1068.8
## + DayCall 1 0.0012 59.009 -1068.7
## - DayMin 1 3.5396 62.549 -1051.6
##
## Step: AIC=-1075.18
## Churn ~ DayMin + CustServCall + IntCall
##
## Df Sum of Sq RSS AIC
## <none> 58.260 -1075.2
## + EveMin 1 0.7006 57.559 -1075.1
## + EveCharge 1 0.7004 57.560 -1075.1
## - IntCall 1 0.7498 59.010 -1074.9
## + NightCharge 1 0.6585 57.602 -1074.7
## + NightMin 1 0.6580 57.602 -1074.7
## + VmailMsgs 1 0.4093 57.851 -1072.5
## - CustServCall 1 1.2531 59.513 -1070.6
## + EveCall 1 0.1744 58.086 -1070.5
## + IntCharge 1 0.1403 58.120 -1070.2
## + IntMin 1 0.1394 58.121 -1070.2
## + Tenure 1 0.1068 58.153 -1069.9
## + NightCall 1 0.0846 58.175 -1069.7
## + DayCharge 1 0.0100 58.250 -1069.0
## + DayCall 1 0.0027 58.257 -1069.0
## - DayMin 1 3.4396 61.700 -1052.3
summary(ReducedRegression)
##
## Call:
## lm(formula = Churn ~ DayMin + CustServCall + IntCall, data = ChurnTrain)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.45779 -0.19007 -0.11183 -0.01495 0.99278
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.0910259 0.0599865 -1.517 0.12978
## DayMin 0.0013975 0.0002562 5.455 7.69e-08 ***
## CustServCall 0.0363484 0.0110397 3.293 0.00106 **
## IntCall -0.0165953 0.0065160 -2.547 0.01117 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.34 on 504 degrees of freedom
## Multiple R-squared: 0.08865, Adjusted R-squared: 0.08322
## F-statistic: 16.34 on 3 and 504 DF, p-value: 3.775e-10
oldpar <- par(oma=c(0,0,3,0), mfrow=c(2,2))
plot(ReducedRegression)

par(oldpar)
# Compute predictions for reduced regression model
# Better/smarter prediction planning
ChurnPred <-
read.table("ChurnExamplePred.csv",
header=TRUE, sep=",", na.strings="NA", dec=".", strip.white=TRUE)
ChurnPred # Sample prediction data, n = 5
## State AreaCode PhoneNum Churn ChurnL ChurnTF ChurnYN Tenure VmailPlan IntPlan CustServCall
## 1 KS 415 413-9854 0 False. FALSE no 90 yes no 1
## 2 NJ 408 402-8337 0 False. FALSE no 113 no yes 3
## 3 AL 408 351-4935 0 False. FALSE no 74 yes no 1
## 4 MI 510 355-9594 0 False. FALSE no 52 no no 3
## 5 NE 415 329-9540 0 False. FALSE no 115 no no 1
## VmailMsgs DayCall DayMin DayCharge EveCall EveMin EveCharge NightCall NightMin NightCharge
## 1 30 104 191.3 32.52 109 257.1 21.85 89 177.7 8.00
## 2 0 98 122.4 20.81 116 223.8 19.02 148 267.5 12.04
## 3 46 109 277.7 47.21 114 270.0 22.95 82 165.3 7.44
## 4 0 100 219.0 37.23 110 148.9 12.66 129 151.8 6.83
## 5 0 104 135.4 23.02 91 138.8 11.80 92 208.5 9.38
## IntCall IntMin IntCharge
## 1 1 11.0 2.97
## 2 5 11.6 3.13
## 3 2 10.4 2.81
## 4 8 11.9 3.21
## 5 3 5.4 1.46
predict(ReducedRegression, newdata=ChurnPred, interval="prediction",
level=.95, se.fit=FALSE)
## fit lwr upr
## 1 0.19607649 -0.4741610 0.8663140
## 2 0.10610147 -0.5638881 0.7760910
## 3 0.30022860 -0.3710101 0.9714673
## 4 0.19131783 -0.4799034 0.8625391
## 5 0.08476337 -0.5846646 0.7541913
# Compute, examine and save full-sample predictions for reduced regression
RegPred <- predict(ReducedRegression, newdata=ChurnTrain, interval="none")
with(ChurnTrain, Hist(RegPred, scale="frequency", breaks="Sturges",
col="darkgray"))

# Fit full and reduced logistic regression models
FullLogistic <- glm(Churn ~ CustServCall + DayCall + DayCharge +
DayMin + EveCall + EveCharge + EveMin + IntCall + IntCharge +
IntMin + IntPlan + NightCall + NightCharge + NightMin +
Tenure + VmailMsgs + VmailPlan,
family=binomial(logit),
data=ChurnTrain)
summary(FullLogistic)
##
## Call:
## glm(formula = Churn ~ CustServCall + DayCall + DayCharge + DayMin +
## EveCall + EveCharge + EveMin + IntCall + IntCharge + IntMin +
## IntPlan + NightCall + NightCharge + NightMin + Tenure + VmailMsgs +
## VmailPlan, family = binomial(logit), data = ChurnTrain)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7021 -0.5193 -0.3071 -0.1529 3.1766
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.853535 1.792735 -4.939 7.87e-07 ***
## CustServCall 0.364789 0.096916 3.764 0.000167 ***
## DayCall -0.003032 0.007220 -0.420 0.674568
## DayCharge -21.870697 49.357515 -0.443 0.657688
## DayMin 3.732121 8.390839 0.445 0.656475
## EveCall 0.008711 0.007410 1.176 0.239781
## EveCharge -27.354617 49.843629 -0.549 0.583137
## EveMin 2.331303 4.236754 0.550 0.582143
## IntCall -0.225158 0.077349 -2.911 0.003603 **
## IntCharge 62.930868 51.101898 1.231 0.218144
## IntMin -16.927849 13.799308 -1.227 0.219929
## IntPlanyes 2.165740 0.381130 5.682 1.33e-08 ***
## NightCall 0.005313 0.007606 0.699 0.484803
## NightCharge 32.613200 54.785074 0.595 0.551647
## NightMin -1.459913 2.465650 -0.592 0.553783
## Tenure -0.001558 0.003693 -0.422 0.673210
## VmailMsgs 0.079129 0.043244 1.830 0.067276 .
## VmailPlanyes -3.430729 1.428155 -2.402 0.016296 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 425.29 on 507 degrees of freedom
## Residual deviance: 319.09 on 490 degrees of freedom
## AIC: 355.09
##
## Number of Fisher Scoring iterations: 6
# Full: Pseudo (Deviance) R2 = (425.29 - 319.09)/425.29 = 0.2497
ReducedLogistic <- stepwise(FullLogistic, direction='forward/backward', criterion='BIC')
##
## Direction: forward/backward
## Criterion: BIC
##
## Start: AIC=431.52
## Churn ~ 1
##
## Df Deviance AIC
## + DayMin 1 395.43 407.89
## + DayCharge 1 395.43 407.89
## + IntPlan 1 398.55 411.01
## + CustServCall 1 415.18 427.64
## + IntCall 1 417.35 429.81
## + VmailPlan 1 418.63 431.09
## + EveMin 1 418.82 431.28
## + EveCharge 1 418.82 431.29
## <none> 425.29 431.52
## + VmailMsgs 1 420.56 433.03
## + NightCharge 1 420.97 433.43
## + NightMin 1 420.97 433.43
## + EveCall 1 423.62 436.08
## + Tenure 1 424.16 436.62
## + IntCharge 1 424.48 436.94
## + IntMin 1 424.49 436.95
## + NightCall 1 425.19 437.65
## + DayCall 1 425.24 437.70
##
## Step: AIC=407.89
## Churn ~ DayMin
##
## Df Deviance AIC
## + IntPlan 1 369.16 387.85
## + CustServCall 1 385.13 403.83
## + IntCall 1 387.25 405.94
## <none> 395.43 407.89
## + VmailPlan 1 389.24 407.93
## + EveMin 1 389.55 408.24
## + EveCharge 1 389.55 408.24
## + NightCharge 1 390.82 409.51
## + NightMin 1 390.82 409.51
## + VmailMsgs 1 391.22 409.91
## + IntCharge 1 394.29 412.98
## + IntMin 1 394.29 412.98
## + EveCall 1 394.41 413.10
## + Tenure 1 394.59 413.29
## + DayCharge 1 395.10 413.80
## + NightCall 1 395.31 414.00
## + DayCall 1 395.40 414.09
## - DayMin 1 425.29 431.52
##
## Step: AIC=387.85
## Churn ~ DayMin + IntPlan
##
## Df Deviance AIC
## + CustServCall 1 356.34 381.26
## + VmailPlan 1 361.21 386.13
## + IntCall 1 361.97 386.89
## <none> 369.16 387.85
## + VmailMsgs 1 363.78 388.70
## + NightCharge 1 363.84 388.76
## + NightMin 1 363.84 388.76
## + EveMin 1 364.01 388.94
## + EveCharge 1 364.02 388.94
## + EveCall 1 367.86 392.78
## + DayCharge 1 368.35 393.28
## + Tenure 1 368.36 393.29
## + IntCharge 1 368.48 393.40
## + IntMin 1 368.48 393.40
## + NightCall 1 369.13 394.05
## + DayCall 1 369.16 394.08
## - IntPlan 1 395.43 407.89
## - DayMin 1 398.55 411.01
##
## Step: AIC=381.26
## Churn ~ DayMin + IntPlan + CustServCall
##
## Df Deviance AIC
## + VmailPlan 1 348.11 379.26
## + IntCall 1 349.02 380.17
## <none> 356.34 381.26
## + NightCharge 1 350.61 381.76
## + NightMin 1 350.61 381.76
## + VmailMsgs 1 350.68 381.83
## + EveMin 1 351.03 382.18
## + EveCharge 1 351.03 382.18
## + EveCall 1 354.71 385.87
## + Tenure 1 355.36 386.51
## + DayCharge 1 355.76 386.91
## + IntCharge 1 355.82 386.98
## + IntMin 1 355.83 386.98
## + NightCall 1 356.09 387.24
## + DayCall 1 356.33 387.48
## - CustServCall 1 369.16 387.85
## - IntPlan 1 385.13 403.83
## - DayMin 1 386.84 405.53
##
## Step: AIC=379.26
## Churn ~ DayMin + IntPlan + CustServCall + VmailPlan
##
## Df Deviance AIC
## + IntCall 1 340.52 377.90
## + NightCharge 1 341.60 378.98
## + NightMin 1 341.60 378.98
## <none> 348.11 379.26
## + EveMin 1 342.67 380.05
## + EveCharge 1 342.67 380.05
## - VmailPlan 1 356.34 381.26
## + VmailMsgs 1 345.96 383.34
## + EveCall 1 346.53 383.92
## + Tenure 1 347.57 384.95
## + IntCharge 1 347.61 385.00
## + IntMin 1 347.62 385.00
## + DayCharge 1 347.87 385.26
## + NightCall 1 347.93 385.32
## + DayCall 1 348.11 385.49
## - CustServCall 1 361.21 386.13
## - IntPlan 1 378.74 403.66
## - DayMin 1 379.38 404.30
##
## Step: AIC=377.9
## Churn ~ DayMin + IntPlan + CustServCall + VmailPlan + IntCall
##
## Df Deviance AIC
## + NightCharge 1 333.68 377.29
## + NightMin 1 333.68 377.30
## <none> 340.52 377.90
## + EveMin 1 334.55 378.17
## + EveCharge 1 334.55 378.17
## - IntCall 1 348.11 379.26
## - VmailPlan 1 349.02 380.17
## + VmailMsgs 1 338.34 381.96
## + EveCall 1 338.97 382.59
## + IntCharge 1 339.24 382.85
## + IntMin 1 339.25 382.86
## + NightCall 1 339.93 383.54
## + Tenure 1 340.25 383.86
## + DayCharge 1 340.48 384.10
## + DayCall 1 340.52 384.13
## - CustServCall 1 353.77 384.92
## - IntPlan 1 370.28 401.43
## - DayMin 1 373.06 404.22
##
## Step: AIC=377.29
## Churn ~ DayMin + IntPlan + CustServCall + VmailPlan + IntCall +
## NightCharge
##
## Df Deviance AIC
## <none> 333.68 377.29
## - NightCharge 1 340.52 377.90
## + EveMin 1 328.28 378.12
## + EveCharge 1 328.28 378.12
## - IntCall 1 341.60 378.98
## + VmailMsgs 1 330.38 380.23
## - VmailPlan 1 343.03 380.41
## + EveCall 1 331.99 381.84
## + IntCharge 1 332.44 382.28
## + IntMin 1 332.45 382.29
## + NightMin 1 333.13 382.98
## + NightCall 1 333.24 383.08
## + Tenure 1 333.52 383.36
## + DayCharge 1 333.64 383.48
## + DayCall 1 333.67 383.52
## - CustServCall 1 347.53 384.91
## - IntPlan 1 364.54 401.92
## - DayMin 1 367.09 404.47
summary(ReducedLogistic)
##
## Call:
## glm(formula = Churn ~ DayMin + IntPlan + CustServCall + VmailPlan +
## IntCall + NightCharge, family = binomial(logit), data = ChurnTrain)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6054 -0.5235 -0.3381 -0.1870 2.7745
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.990306 0.913782 -6.556 5.55e-11 ***
## DayMin 0.014215 0.002641 5.383 7.31e-08 ***
## IntPlanyes 2.061300 0.363443 5.672 1.41e-08 ***
## CustServCall 0.350339 0.093376 3.752 0.000175 ***
## VmailPlanyes -1.071071 0.376938 -2.842 0.004490 **
## IntCall -0.191472 0.071325 -2.684 0.007264 **
## NightCharge 0.164072 0.063666 2.577 0.009964 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 425.29 on 507 degrees of freedom
## Residual deviance: 333.68 on 501 degrees of freedom
## AIC: 347.68
##
## Number of Fisher Scoring iterations: 6
# Reduced: Pseudo (Deviance) R2 = (425.29 - 333.68)/425.29 = 0.2154
# Exponentiate and interpret logistic coefficients
# Multiplicative change in odds per unit change in predictor
LogisticCoef <- coef(summary(ReducedLogistic))
LogisticCoef
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.99030568 0.913781875 -6.555509 5.545225e-11
## DayMin 0.01421539 0.002640608 5.383379 7.310051e-08
## IntPlanyes 2.06130037 0.363443128 5.671590 1.414783e-08
## CustServCall 0.35033876 0.093375946 3.751917 1.754878e-04
## VmailPlanyes -1.07107135 0.376938333 -2.841503 4.490145e-03
## IntCall -0.19147186 0.071325210 -2.684491 7.264040e-03
## NightCharge 0.16407217 0.063666230 2.577067 9.964251e-03
LogisticCoef[,1]
## (Intercept) DayMin IntPlanyes CustServCall VmailPlanyes IntCall NightCharge
## -5.99030568 0.01421539 2.06130037 0.35033876 -1.07107135 -0.19147186 0.16407217
ExpCoef <- (exp(LogisticCoef[,1]))
ExpCoef
## (Intercept) DayMin IntPlanyes CustServCall VmailPlanyes IntCall NightCharge
## 0.002502899 1.014316910 7.856179119 1.419548356 0.342641233 0.825742858 1.178299346
cbind(LogisticCoef, ExpCoef)
## Estimate Std. Error z value Pr(>|z|) ExpCoef
## (Intercept) -5.99030568 0.913781875 -6.555509 5.545225e-11 0.002502899
## DayMin 0.01421539 0.002640608 5.383379 7.310051e-08 1.014316910
## IntPlanyes 2.06130037 0.363443128 5.671590 1.414783e-08 7.856179119
## CustServCall 0.35033876 0.093375946 3.751917 1.754878e-04 1.419548356
## VmailPlanyes -1.07107135 0.376938333 -2.841503 4.490145e-03 0.342641233
## IntCall -0.19147186 0.071325210 -2.684491 7.264040e-03 0.825742858
## NightCharge 0.16407217 0.063666230 2.577067 9.964251e-03 1.178299346
# Compute and examine training-sample predictions
# for reduced logistic --
# Logistic Pred, Logistic Prob, Classification
# Better/smarter prediction planning
ChurnPred # Sample prediction data, n = 5
## State AreaCode PhoneNum Churn ChurnL ChurnTF ChurnYN Tenure VmailPlan IntPlan CustServCall
## 1 KS 415 413-9854 0 False. FALSE no 90 yes no 1
## 2 NJ 408 402-8337 0 False. FALSE no 113 no yes 3
## 3 AL 408 351-4935 0 False. FALSE no 74 yes no 1
## 4 MI 510 355-9594 0 False. FALSE no 52 no no 3
## 5 NE 415 329-9540 0 False. FALSE no 115 no no 1
## VmailMsgs DayCall DayMin DayCharge EveCall EveMin EveCharge NightCall NightMin NightCharge
## 1 30 104 191.3 32.52 109 257.1 21.85 89 177.7 8.00
## 2 0 98 122.4 20.81 116 223.8 19.02 148 267.5 12.04
## 3 46 109 277.7 47.21 114 270.0 22.95 82 165.3 7.44
## 4 0 100 219.0 37.23 110 148.9 12.66 129 151.8 6.83
## 5 0 104 135.4 23.02 91 138.8 11.80 92 208.5 9.38
## IntCall IntMin IntCharge
## 1 1 11.0 2.97
## 2 5 11.6 3.13
## 3 2 10.4 2.81
## 4 8 11.9 3.21
## 5 3 5.4 1.46
predict(ReducedLogistic, newdata=ChurnPred)
## 1 2 3 4 5
## -2.8705286 -0.1199557 -1.9256711 -2.2372809 -2.7506217
predict(ReducedLogistic, newdata=ChurnPred, type="response")
## 1 2 3 4 5
## 0.05362982 0.47004699 0.12723050 0.09645225 0.06005155
# Compute and examine full-sample predictions for reduced logistic
LogisticPred <- predict(ReducedLogistic, newdata=ChurnTrain)
with(ChurnTrain, Hist(LogisticPred, scale="frequency",
breaks="Sturges", col="darkgray"))

LogisticProb <- predict(ReducedLogistic, newdata=ChurnTrain,
type="response")
with(ChurnTrain, Hist(LogisticProb, scale="frequency",
breaks="Sturges", col="darkgray"))

scatterplot(LogisticProb~RegPred, reg.line=FALSE, smooth=TRUE, spread=FALSE,
id.method='mahal', id.n = 2, boxplots='xy', span=0.5, data=ChurnTrain)

## 181 280
## 181 280
scatterplot(LogisticProb~LogisticPred, reg.line=FALSE,
smooth=FALSE, spread=FALSE, id.method='mahal', id.n = 2,
boxplots='xy', span=0.5)

## 181 368
## 181 368
# Classification analysis, training sample
Threshold <- 0.5
LogisticClass <- rep(0, TrainRows)
LogisticClass[LogisticProb > Threshold] <- 1
LogisticClass
## [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
## [48] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
## [95] 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [142] 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0
## [189] 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## [236] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
## [283] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
## [330] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
## [377] 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
## [424] 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
## [471] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
table(LogisticClass, ChurnTrain$Churn)
##
## LogisticClass 0 1
## 0 420 58
## 1 13 17
mean(LogisticClass==ChurnTrain$Churn)
## [1] 0.8602362
# Add predictions to data set, save to .Rdata and .csv
ChurnTrain$RegPred <- RegPred
ChurnTrain$LogisticPred <- LogisticPred
ChurnTrain$LogisticProb <- LogisticProb
ChurnTrain$LogisticClass <- LogisticClass
ChurnTrain[1:5,]
## State AreaCode PhoneNum Churn ChurnL ChurnTF ChurnYN Tenure VmailPlan IntPlan CustServCall
## 1 NJ 415 358-1921 0 False. FALSE no 137 no no 0
## 2 IN 415 329-6603 1 True. TRUE yes 65 no no 4
## 3 IA 408 363-1107 0 False. FALSE no 168 no no 1
## 4 VT 510 386-2923 0 False. FALSE no 93 no no 3
## 5 TX 415 373-2782 0 False. FALSE no 73 no no 1
## VmailMsgs DayCall DayMin DayCharge EveCall EveMin EveCharge NightCall NightMin NightCharge
## 1 0 114 243.4 41.38 110 121.2 10.30 104 162.6 7.32
## 2 0 137 129.1 21.95 83 228.5 19.42 111 208.8 9.40
## 3 0 96 128.8 21.90 71 104.9 8.92 128 141.1 6.35
## 4 0 114 190.7 32.42 111 218.2 18.55 121 129.6 5.83
## 5 0 90 224.4 38.15 88 159.5 13.56 74 192.8 8.68
## IntCall IntMin IntCharge RegPred LogisticPred LogisticProb LogisticClass
## 1 5 12.2 3.29 0.16615862 -2.286631 0.09223627 0
## 2 6 12.7 3.43 0.13521804 -2.360297 0.08625082 0
## 3 2 11.2 3.02 0.09213493 -3.150110 0.04108694 0
## 4 3 8.1 2.19 0.23474408 -1.846289 0.13630917 0
## 5 2 13.0 3.51 0.22573973 -1.408831 0.19641857 0
save("ChurnTrain", file="ChurnExampleTrainAnalysis.RData")
write.table(ChurnTrain,
"ChurnExampleTrainAnalysis.csv", sep=",",
col.names=TRUE, row.names=FALSE, quote=FALSE, na="NA")
# MODEL EVALUATION USING TEST DATA
# Read Test data set, list, summarize,
# convert AreaCode to factor
ChurnTest <-
read.table("ChurnExampleTest.csv",
header=TRUE, sep=",", na.strings="NA", dec=".", strip.white=TRUE)
TestRows <- nrow(ChurnTest)
TestRows
## [1] 106
ChurnTest[1:5,]
## State AreaCode PhoneNum Churn ChurnL ChurnTF ChurnYN Tenure VmailPlan IntPlan CustServCall
## 1 KS 415 413-9854 0 False. FALSE no 90 yes no 1
## 2 NJ 408 402-8337 0 False. FALSE no 113 no yes 3
## 3 AL 408 351-4935 0 False. FALSE no 74 yes no 1
## 4 MI 510 355-9594 0 False. FALSE no 52 no no 3
## 5 NE 415 329-9540 0 False. FALSE no 115 no no 1
## VmailMsgs DayCall DayMin DayCharge EveCall EveMin EveCharge NightCall NightMin NightCharge
## 1 30 104 191.3 32.52 109 257.1 21.85 89 177.7 8.00
## 2 0 98 122.4 20.81 116 223.8 19.02 148 267.5 12.04
## 3 46 109 277.7 47.21 114 270.0 22.95 82 165.3 7.44
## 4 0 100 219.0 37.23 110 148.9 12.66 129 151.8 6.83
## 5 0 104 135.4 23.02 91 138.8 11.80 92 208.5 9.38
## IntCall IntMin IntCharge
## 1 1 11.0 2.97
## 2 5 11.6 3.13
## 3 2 10.4 2.81
## 4 8 11.9 3.21
## 5 3 5.4 1.46
ChurnTest <- within(ChurnTest, {
AreaCode <- as.factor(AreaCode)
})
# Data Understanding -- Numerical summaries and scatterplot matrix
numSummary(ChurnTest[,c("Churn", "CustServCall", "DayCall", "DayCharge",
"DayMin", "EveCall", "EveCharge", "EveMin", "IntCall", "IntCharge",
"IntMin", "NightCall", "NightCharge", "NightMin", "Tenure", "VmailMsgs")],
statistics=c("mean", "sd", "quantiles", "skewness"),
quantiles=c(0,.25,.5,.75,1), type="2")
## mean sd skewness 0% 25% 50% 75% 100% n
## Churn 0.1603774 0.3686989 1.877704622 0.00 0.0000 0.000 0.0000 1.00 106
## CustServCall 1.5377358 1.3393284 0.826951321 0.00 1.0000 1.000 2.0000 6.00 106
## DayCall 101.3773585 18.7816794 -0.126190599 50.00 88.0000 104.000 114.0000 156.00 106
## DayCharge 31.6448113 9.6023418 -0.264342776 7.16 24.2025 31.375 39.2350 53.11 106
## DayMin 186.1443396 56.4871820 -0.264436491 42.10 142.3750 184.550 230.8000 312.40 106
## EveCall 99.4811321 18.9777971 -0.045083789 47.00 87.5000 98.500 114.0000 147.00 106
## EveCharge 17.7786792 4.1822027 -0.440479653 7.09 15.4900 18.115 20.6725 26.64 106
## EveMin 209.1594340 49.2020979 -0.440627070 83.40 182.2500 213.150 243.2500 313.40 106
## IntCall 3.8962264 2.6472960 2.628713145 1.00 2.0000 4.000 5.0000 18.00 106
## IntCharge 2.7464151 0.7067622 0.147656965 1.35 2.2100 2.810 3.2100 5.00 106
## IntMin 10.1698113 2.6182571 0.144306310 5.00 8.1750 10.400 11.9000 18.50 106
## NightCall 98.6415094 20.3573731 0.066433941 51.00 84.2500 99.000 110.7500 151.00 106
## NightCharge 8.8598113 2.2876476 0.233862423 3.47 7.3825 8.760 10.2600 16.20 106
## NightMin 196.8632075 50.8252680 0.233439322 77.10 164.0500 194.650 228.0250 359.90 106
## Tenure 95.9622642 42.7386130 -0.004030175 1.00 62.7500 103.000 127.0000 222.00 106
## VmailMsgs 7.7830189 13.4078902 1.327406636 0.00 0.0000 0.000 15.0000 46.00 106
scatterplotMatrix(~Churn+Tenure+VmailMsgs+CustServCall+
DayMin+EveMin+IntMin+NightMin, reg.line=lm, smooth=TRUE,
spread=FALSE, span=0.5, id.n=0, diagonal = 'histogram',
data=ChurnTest)

# Generate test set predictions --
# Regression Pred, Logistic Pred, Logistic Prob, Classification
RegPred <- predict(ReducedRegression, newdata=ChurnTest,
interval="none")
LogisticPred <- predict(ReducedLogistic, newdata=ChurnTest,
interval="none")
LogisticProb <- predict(ReducedLogistic, newdata=ChurnTest,
interval="none", type="response")
Threshold <- 0.5
LogisticClass <- rep(0, TestRows)
LogisticClass[LogisticProb > Threshold] <- 1
LogisticClass
## [1] 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [48] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [95] 0 0 0 0 0 1 0 0 0 0 1 0
table(LogisticClass, ChurnTest$Churn)
##
## LogisticClass 0 1
## 0 89 11
## 1 0 6
mean(LogisticClass==ChurnTest$Churn)
## [1] 0.8962264
# Add predictions to test set, save and export to csv
ChurnTest$RegPred <- RegPred
ChurnTest$LogisticPred <- LogisticPred
ChurnTest$LogisticProb <- LogisticProb
ChurnTest$LogisticClass <- LogisticClass
ChurnTest[1:5,]
## State AreaCode PhoneNum Churn ChurnL ChurnTF ChurnYN Tenure VmailPlan IntPlan CustServCall
## 1 KS 415 413-9854 0 False. FALSE no 90 yes no 1
## 2 NJ 408 402-8337 0 False. FALSE no 113 no yes 3
## 3 AL 408 351-4935 0 False. FALSE no 74 yes no 1
## 4 MI 510 355-9594 0 False. FALSE no 52 no no 3
## 5 NE 415 329-9540 0 False. FALSE no 115 no no 1
## VmailMsgs DayCall DayMin DayCharge EveCall EveMin EveCharge NightCall NightMin NightCharge
## 1 30 104 191.3 32.52 109 257.1 21.85 89 177.7 8.00
## 2 0 98 122.4 20.81 116 223.8 19.02 148 267.5 12.04
## 3 46 109 277.7 47.21 114 270.0 22.95 82 165.3 7.44
## 4 0 100 219.0 37.23 110 148.9 12.66 129 151.8 6.83
## 5 0 104 135.4 23.02 91 138.8 11.80 92 208.5 9.38
## IntCall IntMin IntCharge RegPred LogisticPred LogisticProb LogisticClass
## 1 1 11.0 2.97 0.19607649 -2.8705286 0.05362982 0
## 2 5 11.6 3.13 0.10610147 -0.1199557 0.47004699 0
## 3 2 10.4 2.81 0.30022860 -1.9256711 0.12723050 0
## 4 8 11.9 3.21 0.19131783 -2.2372809 0.09645225 0
## 5 3 5.4 1.46 0.08476337 -2.7506217 0.06005155 0
save("ChurnTest", file="ChurnTestAnalysis.RData")
write.table(ChurnTest,
"ChurnTestAnalysis.csv", sep=",",
col.names=TRUE, row.names=FALSE, quote=FALSE, na="NA")