setwd("~/Documents/ESCP/Consumer Insight Analytics/Berlin Semester")
kkbox <- read.csv("kkbox.csv")
kkbox$X <- NULL
kkbox$msno <- NULL
kkbox$payment_method_id <- NULL
kkbox$is_duplicate <- NULL
kkbox$city <- factor(kkbox$city)
kkbox$gender <- factor(kkbox$gender)
kkbox$registered_via <- factor(kkbox$registered_via)
kkbox$is_churn <- factor(kkbox$is_churn)
kkbox$registration_init_year = factor(substr(kkbox$registration_init_time,1,4))
kkbox$registration_init_month = factor(substr(kkbox$registration_init_time,5,6))
kkbox$registration_init_time<- NULL

Summary of Variables

skimr::skim(kkbox)
Data summary
Name kkbox
Number of rows 10000
Number of columns 14
_______________________
Column type frequency:
factor 6
numeric 8
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
is_churn 0 1 FALSE 2 0: 9104, 1: 896
city 0 1 FALSE 21 13: 2253, 5: 1606, 4: 1107, 15: 1007
gender 0 1 FALSE 2 2: 5326, 1: 4674
registered_via 0 1 FALSE 5 9: 5162, 3: 2267, 7: 1916, 4: 650
registration_init_year 0 1 FALSE 14 201: 1253, 201: 1222, 201: 1138, 201: 1118
registration_init_month 0 1 FALSE 12 10: 973, 12: 965, 01: 927, 02: 866

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
age 0 1 30.02 13.52 1 24.00 28.00 34.00 1028 ▇▁▁▁▁
payment_plan_days 0 1 37.87 46.39 1 28.58 29.04 30.00 410 ▇▁▁▁▁
plan_list_price 0 1 176.82 192.87 0 133.32 143.27 149.00 1788 ▇▁▁▁▁
actual_amount_paid 0 1 184.01 191.73 0 149.00 149.00 149.00 1788 ▇▁▁▁▁
is_auto_renew 0 1 0.78 0.38 0 0.75 1.00 1.00 1 ▂▁▁▁▇
is_cancel 0 1 0.02 0.04 0 0.00 0.00 0.03 1 ▇▁▁▁▁
n_transactions 0 1 17.38 8.30 1 10.00 19.00 25.00 50 ▅▆▇▁▁
payment_price_diff 0 1 -7.19 17.62 -596 -7.84 0.00 0.00 144 ▁▁▁▇▇

Correlations

library(stargazer)
numeric <-kkbox[,sapply(kkbox, function (x) is.numeric(x))]
stargazer(cor(numeric), type = "html")
age payment_plan_days plan_list_price actual_amount_paid is_auto_renew is_cancel n_transactions payment_price_diff
age 1 -0.020 -0.023 -0.014 0.187 -0.011 0.129 -0.106
payment_plan_days -0.020 1 0.981 0.978 -0.370 -0.035 -0.369 0.103
plan_list_price -0.023 0.981 1 0.996 -0.375 -0.030 -0.365 0.110
actual_amount_paid -0.014 0.978 0.996 1 -0.364 -0.041 -0.348 0.019
is_auto_renew 0.187 -0.370 -0.375 -0.364 1 0.128 0.509 -0.149
is_cancel -0.011 -0.035 -0.030 -0.041 0.128 1 0.040 0.127
n_transactions 0.129 -0.369 -0.365 -0.348 0.509 0.040 1 -0.204
payment_price_diff -0.106 0.103 0.110 0.019 -0.149 0.127 -0.204 1

We see that payment_plan_days and actual_amount_paid are heavily correlated with plan_list_price, so we should remove them

Remove payment_plan_days and actual_amount_paid

numeric$payment_plan_days <- NULL
numeric$actual_amount_paid <- NULL
stargazer(cor(numeric), type = "html")
age plan_list_price is_auto_renew is_cancel n_transactions payment_price_diff
age 1 -0.023 0.187 -0.011 0.129 -0.106
plan_list_price -0.023 1 -0.375 -0.030 -0.365 0.110
is_auto_renew 0.187 -0.375 1 0.128 0.509 -0.149
is_cancel -0.011 -0.030 0.128 1 0.040 0.127
n_transactions 0.129 -0.365 0.509 0.040 1 -0.204
payment_price_diff -0.106 0.110 -0.149 0.127 -0.204 1
kkbox$payment_plan_days <- NULL
kkbox$actual_amount_paid <- NULL

Correlations look good now

Standardize Data

for(i in 1:ncol(kkbox) ){
  if(!is.factor(kkbox[,i])){
    kkbox[,i] = BBmisc::normalize(kkbox[,i], method="standardize")
  }
}

80/20 Train/Test Split

library(caret)
set.seed(123)
train_index <- createDataPartition(kkbox$is_churn, p=0.7999)
train <- kkbox[train_index$Resample1,]
test <- kkbox[-train_index$Resample1,]

Logistic Regression Model For Churn (Task 1)

Training

logit_mod <- glm(is_churn~., family=binomial, data = train)
stargazer::stargazer(logit_mod, type = "html", report = "vc*", star.cutoffs = c(.05, .01, .001))
Dependent variable:
is_churn
city3 -0.260
city4 -0.196
city5 -0.305
city6 -0.163
city7 0.323
city8 -0.446
city9 -0.121
city10 -0.620
city11 -0.003
city12 -0.461
city13 -0.315
city14 -0.918**
city15 -0.194
city16 -0.913
city17 -0.159
city18 -0.093
city19 -11.142
city20 -11.667
city21 -0.329
city22 -0.184
age 0.197**
gender2 0.064
registered_via4 -0.151
registered_via7 0.039
registered_via9 -0.147
registered_via13 -10.554
plan_list_price 0.126***
is_auto_renew -1.162***
is_cancel 0.644***
n_transactions -0.265***
payment_price_diff 0.019
registration_init_year2005 -0.153
registration_init_year2006 -0.122
registration_init_year2007 -0.555
registration_init_year2008 -0.544
registration_init_year2009 -0.276
registration_init_year2010 -0.593
registration_init_year2011 -0.588
registration_init_year2012 -0.408
registration_init_year2013 -0.227
registration_init_year2014 -0.266
registration_init_year2015 -0.508
registration_init_year2016 -0.654
registration_init_year2017 -0.824
registration_init_month02 0.261
registration_init_month03 0.116
registration_init_month04 -0.427
registration_init_month05 -0.128
registration_init_month06 -0.147
registration_init_month07 -0.087
registration_init_month08 -0.109
registration_init_month09 -0.096
registration_init_month10 0.209
registration_init_month11 -0.267
registration_init_month12 -0.340
Constant -2.350***
Observations 8,000
Log Likelihood -1,726.998
Akaike Inf. Crit. 3,565.995
Note: p<0.05; p<0.01; p<0.001

From the logistic regression model above we can see that variables age, plan_list_price, and is_cancel have a significantly positive impact on the odds of a customer churning (at a 5% level). For example, age has a coefficient of 0.197, meaning that when age increases by 1, the odds of a customer churning increase by 0.197. On the other, variables city14, is_auto_renew and n_transactions have a have significantly negative impact on customer churn (5% lvl). For example, when the number of transactions increases by 1, the odds of churning decreases by 0.265.

Testing

prediction <- predict(logit_mod, type='response', newdata = test)
prediction <- factor(ifelse(prediction > .5,1,0))
confusionMatrix(prediction,test$is_churn, positive = "1")

Confusion Matrix and Statistics

      Reference

Prediction 0 1 0 1794 148 1 27 31

           Accuracy : 0.9125          
             95% CI : (0.8993, 0.9245)
No Information Rate : 0.9105          
P-Value [Acc > NIR] : 0.3958          
                                      
              Kappa : 0.2278          
                                      

Mcnemar’s Test P-Value : <2e-16

        Sensitivity : 0.1732          
        Specificity : 0.9852          
     Pos Pred Value : 0.5345          
     Neg Pred Value : 0.9238          
         Prevalence : 0.0895          
     Detection Rate : 0.0155          

Detection Prevalence : 0.0290
Balanced Accuracy : 0.5792

   'Positive' Class : 1               
                                      

Classification Tree Model For Churn (Task 2)

Training

library(rpart)
set.seed(123)
churn_tree <- rpart(is_churn ~ ., method="class", data = train,control = rpart.control(minsplit = 20, cp = .005))
stargazer(printcp(churn_tree), type="html")

Classification tree: rpart(formula = is_churn ~ ., data = train, method = “class”, control = rpart.control(minsplit = 20, cp = 0.005))

Variables actually used in tree construction: [1] age city is_auto_renew
[4] n_transactions plan_list_price registration_init_month [7] registration_init_year

Root node error: 717/8000 = 0.089625

n= 8000

     CP nsplit rel error  xerror     xstd

1 0.0425384 0 1.00000 1.00000 0.035633 2 0.0223152 2 0.91492 0.91492 0.034226 3 0.0083682 3 0.89261 0.90516 0.034059 4 0.0055788 7 0.85495 0.95258 0.034859 5 0.0050000 9 0.84379 0.96234 0.035020

CP nsplit rel error xerror xstd
1 0.043 0 1 1 0.036
2 0.022 2 0.915 0.915 0.034
3 0.008 3 0.893 0.905 0.034
4 0.006 7 0.855 0.953 0.035
5 0.005 9 0.844 0.962 0.035
rpart.plot::prp(churn_tree, type=0, extra=4, varlen=0)

Pruned Tree

set.seed(123)
churn_tree_pruned <- rpart(is_churn ~ ., method="class", data = train,
                            control = rpart.control(minsplit = 20, cp = .025))
stargazer(printcp(churn_tree_pruned), type="html")

Classification tree: rpart(formula = is_churn ~ ., data = train, method = “class”, control = rpart.control(minsplit = 20, cp = 0.025))

Variables actually used in tree construction: [1] is_auto_renew n_transactions

Root node error: 717/8000 = 0.089625

n= 8000

    CP nsplit rel error  xerror     xstd

1 0.042538 0 1.00000 1.00000 0.035633 2 0.025000 2 0.91492 0.91492 0.034226

CP nsplit rel error xerror xstd
1 0.043 0 1 1 0.036
2 0.025 2 0.915 0.915 0.034
rpart.plot::prp(churn_tree_pruned, type=0, extra=4, varlen=0)

Based on the pruned tree model, it seems that is_auto_renew and n_transactions are the most influential variables for predicting customer churn

Testing Non-pruned model

prediction <- predict(churn_tree, test, type="class")
prediction <- factor(ifelse(prediction =="1", 1, 0))
confusionMatrix(prediction,test$is_churn, positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1791  145
##          1   30   34
##                                           
##                Accuracy : 0.9125          
##                  95% CI : (0.8993, 0.9245)
##     No Information Rate : 0.9105          
##     P-Value [Acc > NIR] : 0.3958          
##                                           
##                   Kappa : 0.2442          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.1899          
##             Specificity : 0.9835          
##          Pos Pred Value : 0.5312          
##          Neg Pred Value : 0.9251          
##              Prevalence : 0.0895          
##          Detection Rate : 0.0170          
##    Detection Prevalence : 0.0320          
##       Balanced Accuracy : 0.5867          
##                                           
##        'Positive' Class : 1               
## 

Testing Pruned model

prediction <- predict(churn_tree_pruned, test, type="class")
prediction <- factor(ifelse(prediction =="1", 1, 0))
confusionMatrix(prediction,test$is_churn, positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1805  148
##          1   16   31
##                                           
##                Accuracy : 0.918           
##                  95% CI : (0.9051, 0.9297)
##     No Information Rate : 0.9105          
##     P-Value [Acc > NIR] : 0.1273          
##                                           
##                   Kappa : 0.2463          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.1732          
##             Specificity : 0.9912          
##          Pos Pred Value : 0.6596          
##          Neg Pred Value : 0.9242          
##              Prevalence : 0.0895          
##          Detection Rate : 0.0155          
##    Detection Prevalence : 0.0235          
##       Balanced Accuracy : 0.5822          
##                                           
##        'Positive' Class : 1               
## 

Original model vs pruned model yield pretty much the same results