# Marketing Midterm by Hanyue Kuang: Predicting Customer Churn at QWE Inc.

## Warning: package 'readxl' was built under R version 3.5.3
## Warning: package 'rpart' was built under R version 3.5.3
## Warning: package 'rpart.plot' was built under R version 3.5.3
# Compare the distribution of customer age between customers who churned (1)
# and those who did not (0), using side-by-side boxplots.
boxplot(
  customerage ~ churn,
  data = customers,
  main = "Churn VS Customer Age",
  xlab = "customer attrition",
  ylab = "customer age",
  sub = "1 = YES 0 = NO"
)

# Fit a logistic regression (binomial GLM) of churn on customerage, chi0,
# chichange, spcase0/1, spmonth0/1, login1, blog1, view1 and dayssin1.
# x = TRUE keeps the model matrix on the fitted object.
fm1 <- glm(
  churn ~ customerage + chi0 + chichange + spcase0 + spcase1 +
    spmonth0 + spmonth1 + login1 + blog1 + view1 + dayssin1,
  data = customers,
  family = binomial,
  x = TRUE
)
# Coefficient table and deviance summary for the fitted model.
summary(fm1)
## 
## Call:
## glm(formula = churn ~ customerage + chi0 + chichange + spcase0 + 
##     spcase1 + spmonth0 + spmonth1 + login1 + blog1 + view1 + 
##     dayssin1, family = binomial, data = customers, x = TRUE)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.0047  -0.3542  -0.2957  -0.2328   3.0660  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -2.763e+00  1.069e-01 -25.841  < 2e-16 ***
## customerage  1.271e-02  5.370e-03   2.366   0.0180 *  
## chi0        -1.493e-02  2.566e-03  -5.820 5.88e-09 ***
## chichange    1.027e-02  2.474e-03   4.153 3.29e-05 ***
## spcase0     -1.524e-01  1.049e-01  -1.452   0.1464    
## spcase1      1.703e-01  9.050e-02   1.881   0.0599 .  
## spmonth0     1.593e-02  1.022e-01   0.156   0.8761    
## spmonth1    -5.194e-02  7.852e-02  -0.661   0.5083    
## login1       2.893e-04  2.092e-03   0.138   0.8900    
## blog1        2.905e-04  1.960e-02   0.015   0.9882    
## view1       -1.098e-04  4.071e-05  -2.697   0.0070 ** 
## dayssin1     1.724e-02  4.289e-03   4.020 5.81e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2553.1  on 6346  degrees of freedom
## Residual deviance: 2440.3  on 6335  degrees of freedom
## AIC: 2464.3
## 
## Number of Fisher Scoring iterations: 7
# Predicted churn probabilities (type = "response") for the rows of
# `customerspci`. Use the predict() generic rather than calling the
# predict.glm method directly, so S3 dispatch follows the class of `fm1`.
predict(fm1, newdata = customerspci, type = "response")
##          1          2          3 
## 0.03810451 0.04779791 0.04273919
# In-sample accuracy of the logistic model at a 0.5 probability cut-off.
# NOTE(review): only about 5% of customers churn (323 of 6347) and no fitted
# probability exceeds 0.5, so this cut-off classifies every customer as
# "no churn"; a lower threshold is probably more useful -- confirm with the
# analyst before changing it.
threshold <- 0.5
predicted_values <- ifelse(predict(fm1, type = "response") > threshold, 1, 0)
actual_values <- fm1$y
# Pin both class levels so the confusion matrix is always 2 x 2; table() on
# the raw vectors silently drops the row of a class that is never predicted.
conf_matrix <- table(
  predicted_values = factor(predicted_values, levels = c(0, 1)),
  actual_values = factor(actual_values, levels = c(0, 1))
)
conf_matrix
##                 actual_values
## predicted_values    0    1
##                0 6024  323
##                1    0    0
# Classification tree for churn using dayssin1, login1, view1 and customerage.
# With cp = 0 no split is rejected on complexity grounds; minsplit and
# minbucket are the only size limits set here.
ft.tree <- rpart(
  churn ~ dayssin1 + login1 + view1 + customerage,
  data = customers,
  method = "class",
  control = rpart.control(minsplit = 30, minbucket = 10, cp = 0)
)
# Print the fitted tree's node table.
ft.tree
## n= 6347 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##   1) root 6347 323 0 (0.94910982 0.05089018)  
##     2) dayssin1< 17.5 5624 218 0 (0.96123755 0.03876245) *
##     3) dayssin1>=17.5 723 105 0 (0.85477178 0.14522822)  
##       6) login1>=2.5 229  11 0 (0.95196507 0.04803493) *
##       7) login1< 2.5 494  94 0 (0.80971660 0.19028340)  
##        14) customerage>=21.5 180  17 0 (0.90555556 0.09444444) *
##        15) customerage< 21.5 314  77 0 (0.75477707 0.24522293)  
##          30) view1>=-140.5 304  69 0 (0.77302632 0.22697368)  
##            60) customerage< 11.5 130  16 0 (0.87692308 0.12307692) *
##            61) customerage>=11.5 174  53 0 (0.69540230 0.30459770)  
##             122) customerage>=12.5 141  31 0 (0.78014184 0.21985816) *
##             123) customerage< 12.5 33  11 1 (0.33333333 0.66666667)  
##               246) view1>=1 10   4 0 (0.60000000 0.40000000) *
##               247) view1< 1 23   5 1 (0.21739130 0.78260870) *
##          31) view1< -140.5 10   2 1 (0.20000000 0.80000000) *
# Draw the fitted classification tree.
rpart.plot(x = ft.tree)

# In-sample accuracy of the decision tree (predictions on the training data).
# Churn (1) is the event of interest, so it is set as the positive class;
# confusionMatrix() otherwise defaults to the first level (0) and reports
# sensitivity/specificity for the "no churn" class.
# `actual_values` is the response vector taken from the logistic model (fm1$y).
# NOTE(review): the class statistics below were recomputed from the table for
# positive = "1"; re-knit to refresh the exact printed formatting.
conf_matrix2 <- table(predict(ft.tree, type = "class"), actual_values)
confusionMatrix(conf_matrix2, positive = "1")
## Confusion Matrix and Statistics
## 
##    actual_values
##        0    1
##   0 6017  297
##   1    7   26
##                                           
##                Accuracy : 0.9521          
##                  95% CI : (0.9466, 0.9572)
##     No Information Rate : 0.9491          
##     P-Value [Acc > NIR] : 0.1451          
##                                           
##                   Kappa : 0.1379          
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.080495        
##             Specificity : 0.998838        
##          Pos Pred Value : 0.787879        
##          Neg Pred Value : 0.952962        
##              Prevalence : 0.050890        
##          Detection Rate : 0.004096        
##    Detection Prevalence : 0.005199        
##       Balanced Accuracy : 0.539667        
##                                           
##        'Positive' Class : 1               
##