setwd("~/Documents/ESCP/Consumer Insight Analytics/Berlin Semester")
kkbox <- read.csv("kkbox.csv")
kkbox$X <- NULL
kkbox$msno <- NULL
kkbox$payment_method_id <- NULL
kkbox$is_duplicate <- NULL
kkbox$city <- factor(kkbox$city)
kkbox$gender <- factor(kkbox$gender)
kkbox$registered_via <- factor(kkbox$registered_via)
kkbox$is_churn <- factor(kkbox$is_churn)
kkbox$registration_init_year = factor(substr(kkbox$registration_init_time,1,4))
kkbox$registration_init_month = factor(substr(kkbox$registration_init_time,5,6))
kkbox$registration_init_time<- NULL
skimr::skim(kkbox)
Name | kkbox |
Number of rows | 10000 |
Number of columns | 14 |
_______________________ | |
Column type frequency: | |
factor | 6 |
numeric | 8 |
________________________ | |
Group variables | None |
Variable type: factor
skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
---|---|---|---|---|---|
is_churn | 0 | 1 | FALSE | 2 | 0: 9104, 1: 896 |
city | 0 | 1 | FALSE | 21 | 13: 2253, 5: 1606, 4: 1107, 15: 1007 |
gender | 0 | 1 | FALSE | 2 | 2: 5326, 1: 4674 |
registered_via | 0 | 1 | FALSE | 5 | 9: 5162, 3: 2267, 7: 1916, 4: 650 |
registration_init_year | 0 | 1 | FALSE | 14 | 201: 1253, 201: 1222, 201: 1138, 201: 1118 |
registration_init_month | 0 | 1 | FALSE | 12 | 10: 973, 12: 965, 01: 927, 02: 866 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
age | 0 | 1 | 30.02 | 13.52 | 1 | 24.00 | 28.00 | 34.00 | 1028 | ▇▁▁▁▁ |
payment_plan_days | 0 | 1 | 37.87 | 46.39 | 1 | 28.58 | 29.04 | 30.00 | 410 | ▇▁▁▁▁ |
plan_list_price | 0 | 1 | 176.82 | 192.87 | 0 | 133.32 | 143.27 | 149.00 | 1788 | ▇▁▁▁▁ |
actual_amount_paid | 0 | 1 | 184.01 | 191.73 | 0 | 149.00 | 149.00 | 149.00 | 1788 | ▇▁▁▁▁ |
is_auto_renew | 0 | 1 | 0.78 | 0.38 | 0 | 0.75 | 1.00 | 1.00 | 1 | ▂▁▁▁▇ |
is_cancel | 0 | 1 | 0.02 | 0.04 | 0 | 0.00 | 0.00 | 0.03 | 1 | ▇▁▁▁▁ |
n_transactions | 0 | 1 | 17.38 | 8.30 | 1 | 10.00 | 19.00 | 25.00 | 50 | ▅▆▇▁▁ |
payment_price_diff | 0 | 1 | -7.19 | 17.62 | -596 | -7.84 | 0.00 | 0.00 | 144 | ▁▁▁▇▇ |
library(stargazer)
numeric <-kkbox[,sapply(kkbox, function (x) is.numeric(x))]
stargazer(cor(numeric), type = "html")
age | payment_plan_days | plan_list_price | actual_amount_paid | is_auto_renew | is_cancel | n_transactions | payment_price_diff | |
age | 1 | -0.020 | -0.023 | -0.014 | 0.187 | -0.011 | 0.129 | -0.106 |
payment_plan_days | -0.020 | 1 | 0.981 | 0.978 | -0.370 | -0.035 | -0.369 | 0.103 |
plan_list_price | -0.023 | 0.981 | 1 | 0.996 | -0.375 | -0.030 | -0.365 | 0.110 |
actual_amount_paid | -0.014 | 0.978 | 0.996 | 1 | -0.364 | -0.041 | -0.348 | 0.019 |
is_auto_renew | 0.187 | -0.370 | -0.375 | -0.364 | 1 | 0.128 | 0.509 | -0.149 |
is_cancel | -0.011 | -0.035 | -0.030 | -0.041 | 0.128 | 1 | 0.040 | 0.127 |
n_transactions | 0.129 | -0.369 | -0.365 | -0.348 | 0.509 | 0.040 | 1 | -0.204 |
payment_price_diff | -0.106 | 0.103 | 0.110 | 0.019 | -0.149 | 0.127 | -0.204 | 1 |
We see that payment_plan_days and actual_amount_paid are heavily correlated with plan_list_price, so we should remove them
numeric$payment_plan_days <- NULL
numeric$actual_amount_paid <- NULL
stargazer(cor(numeric), type = "html")
age | plan_list_price | is_auto_renew | is_cancel | n_transactions | payment_price_diff | |
age | 1 | -0.023 | 0.187 | -0.011 | 0.129 | -0.106 |
plan_list_price | -0.023 | 1 | -0.375 | -0.030 | -0.365 | 0.110 |
is_auto_renew | 0.187 | -0.375 | 1 | 0.128 | 0.509 | -0.149 |
is_cancel | -0.011 | -0.030 | 0.128 | 1 | 0.040 | 0.127 |
n_transactions | 0.129 | -0.365 | 0.509 | 0.040 | 1 | -0.204 |
payment_price_diff | -0.106 | 0.110 | -0.149 | 0.127 | -0.204 | 1 |
kkbox$payment_plan_days <- NULL
kkbox$actual_amount_paid <- NULL
Correlations look good now
for(i in 1:ncol(kkbox) ){
if(!is.factor(kkbox[,i])){
kkbox[,i] = BBmisc::normalize(kkbox[,i], method="standardize")
}
}
library(caret)
set.seed(123)
train_index <- createDataPartition(kkbox$is_churn, p=0.7999)
train <- kkbox[train_index$Resample1,]
test <- kkbox[-train_index$Resample1,]
logit_mod <- glm(is_churn~., family=binomial, data = train)
stargazer::stargazer(logit_mod, type = "html", report = "vc*", star.cutoffs = c(.05, .01, .001))
Dependent variable: | |
is_churn | |
city3 | -0.260 |
city4 | -0.196 |
city5 | -0.305 |
city6 | -0.163 |
city7 | 0.323 |
city8 | -0.446 |
city9 | -0.121 |
city10 | -0.620 |
city11 | -0.003 |
city12 | -0.461 |
city13 | -0.315 |
city14 | -0.918** |
city15 | -0.194 |
city16 | -0.913 |
city17 | -0.159 |
city18 | -0.093 |
city19 | -11.142 |
city20 | -11.667 |
city21 | -0.329 |
city22 | -0.184 |
age | 0.197** |
gender2 | 0.064 |
registered_via4 | -0.151 |
registered_via7 | 0.039 |
registered_via9 | -0.147 |
registered_via13 | -10.554 |
plan_list_price | 0.126*** |
is_auto_renew | -1.162*** |
is_cancel | 0.644*** |
n_transactions | -0.265*** |
payment_price_diff | 0.019 |
registration_init_year2005 | -0.153 |
registration_init_year2006 | -0.122 |
registration_init_year2007 | -0.555 |
registration_init_year2008 | -0.544 |
registration_init_year2009 | -0.276 |
registration_init_year2010 | -0.593 |
registration_init_year2011 | -0.588 |
registration_init_year2012 | -0.408 |
registration_init_year2013 | -0.227 |
registration_init_year2014 | -0.266 |
registration_init_year2015 | -0.508 |
registration_init_year2016 | -0.654 |
registration_init_year2017 | -0.824 |
registration_init_month02 | 0.261 |
registration_init_month03 | 0.116 |
registration_init_month04 | -0.427 |
registration_init_month05 | -0.128 |
registration_init_month06 | -0.147 |
registration_init_month07 | -0.087 |
registration_init_month08 | -0.109 |
registration_init_month09 | -0.096 |
registration_init_month10 | 0.209 |
registration_init_month11 | -0.267 |
registration_init_month12 | -0.340 |
Constant | -2.350*** |
Observations | 8,000 |
Log Likelihood | -1,726.998 |
Akaike Inf. Crit. | 3,565.995 |
Note: | p<0.05; p<0.01; p<0.001 |
From the logistic regression model above we can see that variables age, plan_list_price, and is_cancel have a significantly positive impact on the odds of a customer churning (at a 5% level). For example, age has a coefficient of 0.197, meaning that when age increases by 1, the odds of a customer churning increase by 0.197. On the other, variables city14, is_auto_renew and n_transactions have a have significantly negative impact on customer churn (5% lvl). For example, when the number of transactions increases by 1, the odds of churning decreases by 0.265.
prediction <- predict(logit_mod, type='response', newdata = test)
prediction <- factor(ifelse(prediction > .5,1,0))
confusionMatrix(prediction,test$is_churn, positive = "1")
Confusion Matrix and Statistics
Reference
Prediction 0 1 0 1794 148 1 27 31
Accuracy : 0.9125
95% CI : (0.8993, 0.9245)
No Information Rate : 0.9105
P-Value [Acc > NIR] : 0.3958
Kappa : 0.2278
Mcnemar’s Test P-Value : <2e-16
Sensitivity : 0.1732
Specificity : 0.9852
Pos Pred Value : 0.5345
Neg Pred Value : 0.9238
Prevalence : 0.0895
Detection Rate : 0.0155
Detection Prevalence : 0.0290
Balanced Accuracy : 0.5792
'Positive' Class : 1
library(rpart)
set.seed(123)
churn_tree <- rpart(is_churn ~ ., method="class", data = train,control = rpart.control(minsplit = 20, cp = .005))
stargazer(printcp(churn_tree), type="html")
Classification tree: rpart(formula = is_churn ~ ., data = train, method = “class”, control = rpart.control(minsplit = 20, cp = 0.005))
Variables actually used in tree construction: [1] age city is_auto_renew
[4] n_transactions plan_list_price registration_init_month [7] registration_init_year
Root node error: 717/8000 = 0.089625
n= 8000
CP nsplit rel error xerror xstd
1 0.0425384 0 1.00000 1.00000 0.035633 2 0.0223152 2 0.91492 0.91492 0.034226 3 0.0083682 3 0.89261 0.90516 0.034059 4 0.0055788 7 0.85495 0.95258 0.034859 5 0.0050000 9 0.84379 0.96234 0.035020
CP | nsplit | rel error | xerror | xstd | |
1 | 0.043 | 0 | 1 | 1 | 0.036 |
2 | 0.022 | 2 | 0.915 | 0.915 | 0.034 |
3 | 0.008 | 3 | 0.893 | 0.905 | 0.034 |
4 | 0.006 | 7 | 0.855 | 0.953 | 0.035 |
5 | 0.005 | 9 | 0.844 | 0.962 | 0.035 |
rpart.plot::prp(churn_tree, type=0, extra=4, varlen=0)
set.seed(123)
churn_tree_pruned <- rpart(is_churn ~ ., method="class", data = train,
control = rpart.control(minsplit = 20, cp = .025))
stargazer(printcp(churn_tree_pruned), type="html")
Classification tree: rpart(formula = is_churn ~ ., data = train, method = “class”, control = rpart.control(minsplit = 20, cp = 0.025))
Variables actually used in tree construction: [1] is_auto_renew n_transactions
Root node error: 717/8000 = 0.089625
n= 8000
CP nsplit rel error xerror xstd
1 0.042538 0 1.00000 1.00000 0.035633 2 0.025000 2 0.91492 0.91492 0.034226
CP | nsplit | rel error | xerror | xstd | |
1 | 0.043 | 0 | 1 | 1 | 0.036 |
2 | 0.025 | 2 | 0.915 | 0.915 | 0.034 |
rpart.plot::prp(churn_tree_pruned, type=0, extra=4, varlen=0)
Based on the pruned tree model, it seems that is_auto_renew and n_transactions are the most influential variables for predicting customer churn
prediction <- predict(churn_tree, test, type="class")
prediction <- factor(ifelse(prediction =="1", 1, 0))
confusionMatrix(prediction,test$is_churn, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1791 145
## 1 30 34
##
## Accuracy : 0.9125
## 95% CI : (0.8993, 0.9245)
## No Information Rate : 0.9105
## P-Value [Acc > NIR] : 0.3958
##
## Kappa : 0.2442
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.1899
## Specificity : 0.9835
## Pos Pred Value : 0.5312
## Neg Pred Value : 0.9251
## Prevalence : 0.0895
## Detection Rate : 0.0170
## Detection Prevalence : 0.0320
## Balanced Accuracy : 0.5867
##
## 'Positive' Class : 1
##
prediction <- predict(churn_tree_pruned, test, type="class")
prediction <- factor(ifelse(prediction =="1", 1, 0))
confusionMatrix(prediction,test$is_churn, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1805 148
## 1 16 31
##
## Accuracy : 0.918
## 95% CI : (0.9051, 0.9297)
## No Information Rate : 0.9105
## P-Value [Acc > NIR] : 0.1273
##
## Kappa : 0.2463
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.1732
## Specificity : 0.9912
## Pos Pred Value : 0.6596
## Neg Pred Value : 0.9242
## Prevalence : 0.0895
## Detection Rate : 0.0155
## Detection Prevalence : 0.0235
## Balanced Accuracy : 0.5822
##
## 'Positive' Class : 1
##
Original model vs pruned model yield pretty much the same results