# Marketing Midterm by Hanyue Kuang: Predicting Customer Churn at QWE Inc.
## Warning: package 'readxl' was built under R version 3.5.3
## Warning: package 'rpart' was built under R version 3.5.3
## Warning: package 'rpart.plot' was built under R version 3.5.3
# Compare the distribution of customer age between churned (1) and
# retained (0) customers with side-by-side box plots.
boxplot(
  customerage ~ churn,
  data = customers,
  xlab = "customer attrition",
  ylab = "customer age",
  main = "Churn VS Customer Age",
  sub = "1 = YES 0 = NO"
)
# Fit a logistic regression (binomial GLM) of churn on the eleven
# predictors listed in the formula. x = TRUE stores the model matrix in
# the fitted object.
fm1 <- glm(
  churn ~ customerage + chi0 + chichange + spcase0 + spcase1 +
    spmonth0 + spmonth1 + login1 + blog1 + view1 + dayssin1,
  family = binomial,
  data = customers,
  x = TRUE
)
# Coefficient table, deviances and AIC for the fitted model.
summary(fm1)
##
## Call:
## glm(formula = churn ~ customerage + chi0 + chichange + spcase0 +
## spcase1 + spmonth0 + spmonth1 + login1 + blog1 + view1 +
## dayssin1, family = binomial, data = customers, x = TRUE)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.0047 -0.3542 -0.2957 -0.2328 3.0660
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.763e+00 1.069e-01 -25.841 < 2e-16 ***
## customerage 1.271e-02 5.370e-03 2.366 0.0180 *
## chi0 -1.493e-02 2.566e-03 -5.820 5.88e-09 ***
## chichange 1.027e-02 2.474e-03 4.153 3.29e-05 ***
## spcase0 -1.524e-01 1.049e-01 -1.452 0.1464
## spcase1 1.703e-01 9.050e-02 1.881 0.0599 .
## spmonth0 1.593e-02 1.022e-01 0.156 0.8761
## spmonth1 -5.194e-02 7.852e-02 -0.661 0.5083
## login1 2.893e-04 2.092e-03 0.138 0.8900
## blog1 2.905e-04 1.960e-02 0.015 0.9882
## view1 -1.098e-04 4.071e-05 -2.697 0.0070 **
## dayssin1 1.724e-02 4.289e-03 4.020 5.81e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2553.1 on 6346 degrees of freedom
## Residual deviance: 2440.3 on 6335 degrees of freedom
## AIC: 2464.3
##
## Number of Fisher Scoring iterations: 7
# Predicted churn probabilities (response scale) for the rows of
# customerspci, which is defined outside this section.
predict(fm1, newdata = customerspci, type = "response")
## 1 2 3
## 0.03810451 0.04779791 0.04273919
# Test accuracy of the first model on the data it was fitted to, using a
# 0.5 probability cut-off.
threshold <- 0.5
predicted_values <- ifelse(predict(fm1, type = "response") > threshold, 1, 0)
actual_values <- fm1$y
# Tabulate the predictions as a factor with both class levels fixed.
# No fitted probability exceeds the threshold here, so a plain table() would
# silently drop the "1" row and return a 1 x 2 table instead of a proper
# 2 x 2 confusion matrix. predicted_values itself stays numeric.
conf_matrix <- table(
  predicted_values = factor(predicted_values, levels = c(0, 1)),
  actual_values = actual_values
)
conf_matrix
## actual_values
## predicted_values 0 1
## 0 6024 323
## 1 0 0
# Decision tree model: classification tree for churn on four predictors.
# With cp = 0 no split is rejected for too small an improvement in fit, so
# growth is limited by minsplit and minbucket.
ft.tree <- rpart(
  formula = churn ~ dayssin1 + login1 + view1 + customerage,
  data = customers,
  method = "class",
  control = rpart.control(minsplit = 30, minbucket = 10, cp = 0)
)
# Print the fitted tree as text (one line per node).
ft.tree
## n= 6347
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 6347 323 0 (0.94910982 0.05089018)
## 2) dayssin1< 17.5 5624 218 0 (0.96123755 0.03876245) *
## 3) dayssin1>=17.5 723 105 0 (0.85477178 0.14522822)
## 6) login1>=2.5 229 11 0 (0.95196507 0.04803493) *
## 7) login1< 2.5 494 94 0 (0.80971660 0.19028340)
## 14) customerage>=21.5 180 17 0 (0.90555556 0.09444444) *
## 15) customerage< 21.5 314 77 0 (0.75477707 0.24522293)
## 30) view1>=-140.5 304 69 0 (0.77302632 0.22697368)
## 60) customerage< 11.5 130 16 0 (0.87692308 0.12307692) *
## 61) customerage>=11.5 174 53 0 (0.69540230 0.30459770)
## 122) customerage>=12.5 141 31 0 (0.78014184 0.21985816) *
## 123) customerage< 12.5 33 11 1 (0.33333333 0.66666667)
## 246) view1>=1 10 4 0 (0.60000000 0.40000000) *
## 247) view1< 1 23 5 1 (0.21739130 0.78260870) *
## 31) view1< -140.5 10 2 1 (0.20000000 0.80000000) *
# Draw the fitted tree.
rpart.plot(ft.tree)
# Accuracy of the decision tree: cross-tabulate the tree's in-sample class
# predictions against actual_values (taken from the logistic model's
# response, fm1$y).
conf_matrix2 <- table(
  predict(ft.tree, type = "class"),
  actual_values
)
# NOTE(review): confusionMatrix is presumably caret's; the library call is
# not visible in this section. The statistics are reported with class 0 as
# the positive class, so "Specificity" is the share of actual churners (1)
# that the tree flags.
confusionMatrix(conf_matrix2)
## Confusion Matrix and Statistics
##
## actual_values
## 0 1
## 0 6017 297
## 1 7 26
##
## Accuracy : 0.9521
## 95% CI : (0.9466, 0.9572)
## No Information Rate : 0.9491
## P-Value [Acc > NIR] : 0.1451
##
## Kappa : 0.1379
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.9988
## Specificity : 0.0805
## Pos Pred Value : 0.9530
## Neg Pred Value : 0.7879
## Prevalence : 0.9491
## Detection Rate : 0.9480
## Detection Prevalence : 0.9948
## Balanced Accuracy : 0.5397
##
## 'Positive' Class : 0
##