The telecommunications market is saturated and customer-base growth rates are low, so the main focus of telecom providers should be retaining their current clients. This project examines fictional data with the aim of identifying key predictors of customer churn by evaluating different machine learning models against standard, validated metrics.
library(tidyverse)
library(plyr)
library(lubridate)
library(skimr)
library(visdat)
library(naniar)
library(simputation)
library(correlationfunnel)
library(DataExplorer)
library(gridExtra)
library(corrplot)
library(pROC)
library(C50)
library(caret)
library(rpart)
library(WRS2)
library(afex)
library(ggthemes)
library(ggstatsplot)
SDV_DAT_JAN <- read.csv("SDV_DAT_JAN.csv", header = TRUE, sep =";")
skim(SDV_DAT_JAN)
Data summary

| | Value |
|---|---|
| Name | SDV_DAT_JAN |
| Number of rows | 500000 |
| Number of columns | 47 |
| Column type frequency: | |
| character | 15 |
| logical | 1 |
| numeric | 31 |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| PP_GROUP_NAME | 0 | 1 | 7 | 9 | 0 | 2 | 0 |
| SUBSCRIPTION_STATUS | 0 | 1 | 1 | 1 | 0 | 1 | 0 |
| GENDER | 0 | 1 | 0 | 6 | 7799 | 3 | 0 |
| PRICE_PLAN_DESC | 0 | 1 | 10 | 20 | 0 | 36 | 0 |
| PORTIN_COMPETITOR_NAME_ENG | 0 | 1 | 0 | 7 | 440437 | 3 | 0 |
| REF_PRICE_PLAN_DESC | 0 | 1 | 0 | 46 | 44749 | 36 | 0 |
| LE_TYPE_DESC | 0 | 1 | 7 | 22 | 0 | 7 | 0 |
| TERMINAL_TYPE | 0 | 1 | 0 | 16 | 282133 | 4 | 0 |
| TERMINAL_BRAND | 0 | 1 | 0 | 9 | 291555 | 21 | 0 |
| TERMINAL_CAT | 0 | 1 | 0 | 15 | 291231 | 6 | 0 |
| DEVICE_TYPE_DESC | 0 | 1 | 0 | 25 | 9728 | 11 | 0 |
| DEVICE_MANUFACTURER_DESC | 0 | 1 | 0 | 25 | 14564 | 26 | 0 |
| SILENT_SMS | 0 | 1 | 1 | 3 | 0 | 26 | 0 |
| SILENT_DATA | 0 | 1 | 1 | 3 | 0 | 26 | 0 |
| MTA_MAU_CURRENT | 0 | 1 | 1 | 1 | 0 | 2 | 0 |
Variable type: logical
| skim_variable | n_missing | complete_rate | mean | count |
|---|---|---|---|---|
| SUBSCRIPTION_STATUS_NEXT_MNT | 5e+05 | 0 | NaN | : |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| DATE_ID_CALC | 0 | 1.00 | 20210131.00 | 0.00 | 20210131.00 | 20210131.00 | 20210131.00 | 20210131.00 | 20210131.00 | ▁▁▇▁▁ |
| CNT_BARRED_STATUS | 262485 | 0.48 | 9.26 | 7.61 | 1.00 | 2.00 | 7.00 | 14.00 | 38.00 | ▇▅▂▁▁ |
| CNT_BARRED_DAYS | 263417 | 0.47 | 23.39 | 29.75 | 0.00 | 1.27 | 12.82 | 40.25 | 303.71 | ▇▁▁▁▁ |
| CNT_SUSPENDED_STATUS | 259637 | 0.48 | 0.02 | 0.15 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| CNT_SUSPENDED_DAYS | 264225 | 0.47 | 1.15 | 4.72 | 0.00 | 0.00 | 0.00 | 0.01 | 39.12 | ▇▁▁▁▁ |
| CNT_SUB_CUST | 0 | 1.00 | 1.81 | 1.27 | 0.00 | 1.00 | 2.00 | 3.00 | 8.00 | ▇▆▂▁▁ |
| NETWORK_AGE | 0 | 1.00 | 94.20 | 56.79 | 16.95 | 40.97 | 86.59 | 149.64 | 235.69 | ▇▃▅▅▁ |
| CUST_AGE | 19628 | 0.96 | 45.20 | 15.31 | 14.00 | 33.00 | 43.00 | 59.00 | 91.00 | ▅▇▆▅▁ |
| REAL_MF_WITH_VAT | 0 | 1.00 | 20.72 | 10.56 | 3.46 | 12.28 | 17.90 | 30.56 | 54.52 | ▇▇▇▁▁ |
| GA_TYPE_FLAG | 242133 | 0.52 | 4.35 | 2.38 | 1.00 | 1.00 | 5.00 | 7.00 | 7.00 | ▇▁▃▆▇ |
| MONTHS_TO_EXPIRY_CRNT | 0 | 1.00 | 2.43 | 0.22 | 1.94 | 2.28 | 2.45 | 2.56 | 3.04 | ▂▇▇▃▁ |
| REF_REAL_MF_WITH_VAT | 56800 | 0.89 | 19.24 | 8.52 | 0.00 | 11.29 | 19.24 | 21.13 | 66.29 | ▅▇▁▁▁ |
| LEASING_CNT | 288866 | 0.42 | 1.87 | 1.08 | 1.00 | 1.00 | 2.00 | 2.00 | 5.00 | ▇▅▂▁▁ |
| TERMINAL_CNT | 150379 | 0.70 | 1.64 | 0.68 | 1.00 | 1.00 | 2.00 | 2.00 | 4.00 | ▇▇▁▂▁ |
| AVG_MONTHLY_BILL_6M_NO_VAT | 0 | 1.00 | 36.28 | 32.08 | 0.88 | 18.84 | 24.80 | 43.71 | 224.58 | ▇▂▁▁▁ |
| INVOICE_CNT | 0 | 1.00 | 5.79 | 0.41 | 5.00 | 6.00 | 6.00 | 6.00 | 6.00 | ▂▁▁▁▇ |
| ROAMING_MIN_CHARGE_6M | 0 | 1.00 | 0.08 | 0.55 | 0.00 | 0.00 | 0.00 | 0.00 | 8.55 | ▇▁▁▁▁ |
| ROAMING_GPRS_CHARGE_6M | 0 | 1.00 | 0.13 | 1.14 | 0.00 | 0.00 | 0.00 | 0.01 | 23.00 | ▇▁▁▁▁ |
| VAS_CHARGE_6M | 0 | 1.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.01 | ▇▂▁▁▁ |
| DOB_CHARGE_6M | 488003 | 0.02 | 12.50 | 4.34 | 1.68 | 12.30 | 12.31 | 12.32 | 29.91 | ▁▇▁▁▁ |
| GPRS_USAGE_MB_L3M | 0 | 1.00 | 1776.90 | 2859.71 | 0.00 | 0.00 | 239.21 | 2850.23 | 21289.22 | ▇▁▁▁▁ |
| ROAMING_GPRS_USAGE_L3M | 0 | 1.00 | 33.93 | 293.06 | 0.00 | 0.00 | 0.00 | 0.00 | 6053.26 | ▇▁▁▁▁ |
| CNT_PAYMENTS_L12M | 0 | 1.00 | 11.27 | 1.62 | 1.00 | 11.00 | 11.00 | 12.00 | 25.00 | ▁▁▇▁▁ |
| TNSHOPS_PAY_L12M | 165812 | 0.67 | 5.80 | 3.77 | 1.00 | 3.00 | 5.00 | 8.00 | 28.00 | ▇▃▂▁▁ |
| EASYPAY_PAY_L12M | 254293 | 0.49 | 6.69 | 3.85 | 0.54 | 3.10 | 7.41 | 10.25 | 12.66 | ▇▅▇▂▇ |
| EPAY_PAY_L12M | 408505 | 0.18 | 7.90 | 3.64 | 0.00 | 6.00 | 8.00 | 12.00 | 12.00 | ▃▂▂▇▇ |
| MTAPP_PAY_L12M | 411465 | 0.18 | 4.06 | 2.23 | 1.00 | 3.00 | 4.00 | 4.00 | 15.00 | ▅▇▂▁▁ |
| BGPOST_PAY_L12M | 400421 | 0.20 | 6.19 | 3.25 | 1.00 | 4.00 | 6.00 | 8.00 | 12.00 | ▃▁▇▂▃ |
| OTHER_PAY_L12M | 415186 | 0.17 | 5.21 | 2.75 | 1.00 | 4.00 | 5.00 | 5.00 | 13.00 | ▂▇▁▂▁ |
| PLAY_DIEMA_XTRA_YES | 0 | 1.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | ▁▁▇▁▁ |
| churn_flag | 0 | 1.00 | 0.01 | 0.09 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
SDV_DAT_JAN %>% plot_intro()
EDA of the data shows that some columns can be removed: DATE_ID_CALC and SUBSCRIPTION_STATUS because they lack relevance to the model (both are constant), and SUBSCRIPTION_STATUS_NEXT_MNT because it is completely empty.
SDV_DAT_JAN %>% plot_missing()
SDV_DAT_JAN %>% vis_miss(warn_large_data = FALSE)
SDV_DAT_JAN %>% gg_miss_upset()
In order to perform a thorough analysis of all independent variables and their significance as predictors, certain columns require manipulation before they can be included in the model.
Performing these steps makes it possible to include the columns in the subsequent correlation and t-test analysis and, finally, to evaluate their predictive power for churn and their statistical significance. An alternative approach would be to remove them from the model completely, but this carries the risk of omitting potentially important factors.
DF <- within(SDV_DAT_JAN, rm(DATE_ID_CALC,
SUBSCRIPTION_STATUS,
SUBSCRIPTION_STATUS_NEXT_MNT,
DOB_CHARGE_6M,
OTHER_PAY_L12M,
MTAPP_PAY_L12M,
EPAY_PAY_L12M,
BGPOST_PAY_L12M,
PLAY_DIEMA_XTRA_YES))
churn_yes <- DF %>%
filter(churn_flag == "1")
churn_no <- DF %>%
filter(churn_flag == "0") %>%
na.omit()
clean_df <- join(churn_no, churn_yes, type = "full")
This procedure groups all subscribers with a positive churn flag (1) into a separate sub-table containing 4,300 observations, in order to preserve all true positive observations. All other subscribers are grouped into a second table based on the non-churn flag (0), after which all rows with missing data in the churn_no table are removed. This reduces the number of non-churn observations from the initial 495,700 to only 9,995. Applying the same removal to the churn_yes table is not feasible because the model would overfit and become useless: only 163 observations in churn_yes contain complete data, which is less than 2% of the total.
This operation sanitizes the data and provides a more accurate overview of the distribution. Separating and then merging the two tables results in a clean and tidy dataframe, which helps the modeling process downstream.
# Wrangle, Transform & Impute
clean_tbl <- clean_df %>%
# Round all numerical columns across the dataset
mutate_if(is.numeric, round, digits = 3) %>%
# Invert the coding of CNT_SUSPENDED_STATUS (0 becomes 1, anything else becomes 0)
mutate(CNT_SUSPENDED_STATUS = ifelse(CNT_SUSPENDED_STATUS == "0", 1, 0)) %>%
# Replace all NAs across the dataset using base R
replace(is.na(.), 0) %>%
# Convert the churn flag from 1 and 0 to Yes and No
mutate(churn_flag = ifelse(churn_flag == "1", "Yes", "No")) %>%
# Convert churn to factor post imputation
mutate(churn_flag = as.factor(churn_flag))
# Fill in null rows with "NoData" character value
clean_tbl[clean_tbl == ""] <- "NoData"
exp1 <- clean_tbl %>%
filter(GENDER != "NoData") %>%
ggplot(aes(GENDER, fill = churn_flag)) +
geom_bar(position = "fill") +
labs(x = "Gender", y = "") +
theme(legend.position = "none")
exp3 <- clean_tbl %>%
filter(GA_TYPE_FLAG != "0") %>%
ggplot(aes(GA_TYPE_FLAG, fill = churn_flag)) +
geom_bar(position = "fill") +
labs(x = "How the sub joined", y = "") +
theme(legend.position = "none")
exp4 <- ggplot(clean_tbl, aes(PP_GROUP_NAME, fill = churn_flag)) +
geom_bar(position = "fill") +
labs(x = "Segment", y = "") +
theme(legend.position = "none")
grid.arrange(exp1, exp3, exp4, ncol = 3, nrow = 1, top = "Churn/Non-churn Proportion")
exp2 <- ggplot(clean_tbl, aes(LE_TYPE_DESC, fill = churn_flag)) +
geom_bar(position = "fill") +
labs(x = "Last Event Type", y = "") +
theme(legend.position = "none")
grid.arrange(exp2, ncol = 1, nrow = 1, top = "Churn/Non-churn Proportion")
Among these variables, whether a customer churns appears to depend largely on the type of contract.
Subscribers who use mainly voice services without Internet / data services seem much more likely to churn than subscribers on a full voice-and-data contract. It can be speculated that voice-only customers come from a lower socio-economic background or are senior citizens.
Additionally, the way subscribers join the telecom also has an impact: the GA_TYPE_FLAG column shows that subscribers with code numbers 4 and 5 differ noticeably in their churn rates.
The way subscribers change their plan from REF_PRICE_PLAN to PRICE_PLAN (Migration / Retention / Saving, etc.) also seems to affect the level of churn.
It is important to note that these hypotheses are based solely on the graphical distributions. Without testing for significance, it would not be prudent to draw any conclusions. Some of these hypotheses are subsequently tested through correlation analysis and t-tests.
exp5 <- ggplot(clean_tbl, aes(churn_flag, CNT_BARRED_STATUS, fill = churn_flag)) +
geom_boxplot(alpha = 0.8) +
  theme(legend.position = "none")
exp6 <- ggplot(clean_tbl, aes(churn_flag, GPRS_USAGE_MB_L3M, fill = churn_flag)) +
geom_boxplot(alpha = 0.8) +
  theme(legend.position = "none")
exp7 <- ggplot(clean_tbl, aes(churn_flag, TERMINAL_CNT, fill = churn_flag)) +
geom_boxplot(alpha = 0.8) +
  theme(legend.position = "none")
exp8 <- ggplot(clean_tbl, aes(churn_flag, MONTHS_TO_EXPIRY_CRNT, fill = churn_flag)) +
geom_boxplot(alpha = 0.8) +
  theme(legend.position = "none")
exp9 <- ggplot(clean_tbl, aes(churn_flag, CNT_SUB_CUST, fill = churn_flag)) +
geom_boxplot(alpha = 0.8) +
  theme(legend.position = "none")
exp10 <- ggplot(clean_tbl, aes(churn_flag, CUST_AGE, fill = churn_flag)) +
geom_boxplot(alpha = 0.8) +
  theme(legend.position = "none")
grid.arrange(exp5, exp6, exp7,
exp8, exp9, exp10,
ncol = 3, nrow = 2)
It is difficult to draw conclusions from the above plots of this set of factors; graphically, the differences appear negligible. A t-test may reveal more.
The t-test is a standard statistical tool for testing hypotheses. In this analysis, its use is intended to demonstrate familiarity with such methods rather than to confirm or reject specific assumptions and hypotheses. Performing rigorous scientific testing takes time :)
This analysis looks at the distribution of customer age across contract change types (Migration / Retention / Saving). The graph contains a number of statistical indicators, such as significance levels and confidence intervals.
s <- clean_df %>%
mutate(id = rownames(.)) %>%
select(id, CUST_AGE, GENDER, NETWORK_AGE, LE_TYPE_DESC, churn_flag) %>%
filter(GENDER != "")
ggwithinstats(
data = s,
x = LE_TYPE_DESC,
y = CUST_AGE,
type = "r", # robust statistics
xlab = "REF_PRICE_PLAN to PRICE_PLAN",
ylab = "Customer Age",
outlier.tagging = FALSE)
The following t-test is applied to subscription duration (NETWORK_AGE) and the churn flag.
w <- clean_df %>%
mutate(id = rownames(.)) %>%
select(id, CUST_AGE, GENDER, NETWORK_AGE, LE_TYPE_DESC, churn_flag) %>%
filter(GENDER != "")
ggwithinstats(
data = w,
x = churn_flag,
y = NETWORK_AGE,
type = "p", # Parametric test
xlab = "Churn",
ylab = "Subscription duration in months",
outlier.tagging = FALSE)
For this last t-test, applied to customer age and the churn flag, the following hypotheses are formulated: the null hypothesis states that there is no difference in mean age between churners and non-churners, and the alternative states that such a difference exists.
ggbetweenstats(s,
churn_flag,
CUST_AGE
)
Welch’s t-test on 13,843 observations revealed that the difference in age between subscribers who churned and those who did not was not statistically significant.
The effect size (g = -0.08) is very low, practically zero; by Cohen’s (1988) convention it is negligible. The Bayes factor (log BF = -4.96) indicates that the alternative hypothesis can be rejected, which can be considered strong evidence (Jeffreys, 1961) in favor of the null hypothesis.
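For reference, the same Welch comparison can be reproduced with base R (a minimal sketch using the s table defined above, where churn_flag is still coded 0/1):
# Welch two-sample t-test of customer age by churn flag; rows with a missing
# age are dropped automatically by the default na.action.
t.test(CUST_AGE ~ churn_flag, data = s)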
The application of correlation analysis helps to identify factors associated with customer churn. This is a well established method for illustrating relationships between variables.
corrplot(cor(clean_tbl[sapply(clean_tbl, is.numeric)]))
The correlogram complements the conclusions made above by adding correlation coefficients. Relationships range from low (r ≈ ±0.4) to moderate (r ≈ ±0.5 to ±0.7). It is important to note that some of the moderate correlations are between directly related variables, for example CNT_BARRED and CNT_SUSPENDED, and therefore add no value to the analysis.
churn_bin_tbl <- clean_tbl %>% binarize()
options(ggrepel.max.overlaps = 20)
churn_bin_tbl %>%
correlate(target = churn_flag__Yes) %>%
plot_correlation_funnel() +
geom_point(size = 2, color = "#2c3e50")
A correlation coefficient of up to about ±0.7 is considered a moderate relationship. With this sample, observing high or very high correlations between the variables is unlikely; such strong relationships are difficult to detect and may be misleading, since a practically unlimited number of factors potentially influence each variable.
For the purpose of the current analysis, all models are subject to three-fold cross-validation.
In k-fold cross-validation, the original sample is randomly divided into k sub-samples of equal size. One sub-sample is held out as validation and test data, the remaining k-1 sub-samples are used as training data, and the procedure is repeated k times so that each sub-sample serves as the hold-out set exactly once.
This ensures that every observation from the original dataset appears in both a training and a test sample. It is one of the most established approaches for model validation.
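To make the mechanics concrete, the sketch below performs a manual 3-fold split on clean_tbl with a deliberately simplified logistic model (the two predictors and the 0.5 cut-off are chosen only for illustration); the caret trainControl() call used later automates the same procedure.
# Manual 3-fold cross-validation: each fold is held out once while the
# remaining folds train a (deliberately simple) logistic model.
k <- 3
set.seed(123)
fold_id <- sample(rep(1:k, length.out = nrow(clean_tbl)))
fold_accuracy <- sapply(1:k, function(i) {
  train_fold <- clean_tbl[fold_id != i, ]  # k-1 folds for training
  test_fold  <- clean_tbl[fold_id == i, ]  # held-out fold for validation
  fit <- glm(churn_flag ~ NETWORK_AGE + CUST_AGE,
             data = train_fold, family = binomial)
  pred <- ifelse(predict(fit, newdata = test_fold, type = "response") > 0.5,
                 "Yes", "No")
  mean(pred == test_fold$churn_flag)
})
mean(fold_accuracy)  # average hold-out accuracy across the k folds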
Before proceeding to the cross-validation split, the data were divided into a churnTrain table (10,720 observations) and a churnTest table (3,573 observations). churnTrain is used to build the models, while churnTest is used to evaluate them.
## 75% of the sample size
smp_size <- floor(0.75 * nrow(clean_tbl))
set.seed(123)
train_ind <- sample(seq_len(nrow(clean_tbl)), size = smp_size)
churnTrain <- clean_tbl[train_ind, ] %>%
  filter(!TERMINAL_BRAND %in% c("LENOVO", "PHILIPS"))  # drop rare brands from the training set
churnTest <- clean_tbl[-train_ind, ] %>%
  filter(!TERMINAL_BRAND %in% c("LENOVO", "PHILIPS"))  # drop the same brands from the test set
Logistic regression models are fast and easy to interpret. In general, this type of model does not require scaling of the input variables or hyperparameter tuning, is easy to fit, and produces well-calibrated probabilities.
modelCtrl <- trainControl(method = "cv", number = 3)
set.seed(123)
glm.model <- train(churn_flag ~ .,
method = "glm",
family = "binomial",
data = churnTrain,
trControl = modelCtrl)
# summary(glm.model)
# Deviance Residuals:
# Min 1Q Median 3Q Max
# -3.5160 -0.3849 -0.1697 0.0405 3.8338
#
# Coefficients:
# Estimate Std. Error z value Pr(>|z|)
# CNT_BARRED_STATUS -1.799e-02 5.107e-03 -3.523 0.000426 ***
# CNT_SUSPENDED_STATUS -3.640e+00 1.446e-01 -25.165 < 2e-16 ***
# NETWORK_AGE 3.078e-03 8.740e-04 3.521 0.000429 ***
# CUST_AGE -1.124e-02 2.875e-03 -3.908 9.31e-05 ***
# GA_TYPE_FLAG -2.269e-01 1.496e-02 -15.168 < 2e-16 ***
# PORTIN_COMPETITOR_NAME_ENGNoData -6.804e-01 1.623e-01 -4.192 2.77e-05 ***
# LE_TYPE_DESCHandset only Migration -8.689e-01 2.029e-01 -4.282 1.85e-05 ***
# LE_TYPE_DESCPrice Plan Migration -8.962e-01 1.689e-01 -5.306 1.12e-07 ***
# LE_TYPE_DESCRetention -8.226e-01 1.778e-01 -4.626 3.74e-06 ***
# TERMINAL_BRANDAPPLE 1.441e+00 3.759e-01 3.835 0.000126 ***
# LEASING_CNT -9.881e-01 4.456e-02 -22.175 < 2e-16 ***
# TERMINAL_CNT -2.693e-01 5.019e-02 -5.366 8.04e-08 ***
# INVOICE_CNT -3.080e-01 8.332e-02 -3.696 0.000219 ***
#
# ---
# Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#
# (Dispersion parameter for binomial family taken to be 1)
#
# Null deviance: 13021.5 on 10719 degrees of freedom
# Residual deviance: 5147.8 on 10514 degrees of freedom
# AIC: 5559.8
#
# Number of Fisher Scoring iterations: 17
The summary of the logistic model reveals the main predictors of churn. At a significance level of 5% (0.05) and for a sample size of 10,720, the variables marked with *** (p < 0.001) show a statistically significant relationship with churn.
varImp(glm.model)
## glm variable importance
##
## only 20 most important variables shown (out of 205)
##
## Overall
## EASYPAY_PAY_L12M 100.000
## CNT_SUSPENDED_STATUS 78.574
## LEASING_CNT 69.237
## GA_TYPE_FLAG 47.361
## MTA_MAU_CURRENTY 20.986
## TNSHOPS_PAY_L12M 19.696
## TERMINAL_CNT 16.755
## `LE_TYPE_DESCPrice Plan Migration` 16.567
## GPRS_USAGE_MB_L3M 15.549
## LE_TYPE_DESCRetention 14.443
## `LE_TYPE_DESCHandset only Migration` 13.371
## PORTIN_COMPETITOR_NAME_ENGNoData 13.088
## CUST_AGE 12.202
## TERMINAL_BRANDAPPLE 11.974
## INVOICE_CNT 11.541
## CNT_BARRED_STATUS 11.001
## NETWORK_AGE 10.995
## CNT_BARRED_DAYS 10.193
## TERMINAL_BRANDHONOR 9.839
## TERMINAL_BRANDNoData 9.815
The results of the statistical model partially confirm the earlier observations from the correlation analysis between the dependent (target) and independent (predictor) variables.
An important predictor is CNT_SUSPENDED_STATUS, which captures how many times the subscriber’s service has been suspended.
Other important predictors of churn include whether the subscriber uses leasing services, how the subscriber signed a contract with the telecom (through port-in / from Sales / Pre2Post migration), how long the customer has been an active subscriber, and whether mobile internet and/or voice services are used, among others.
Common practice is to evaluate models against new, “unseen” data. In this case, the churnTest table is used to measure the performance of the model.
glm.pred <- predict(glm.model, newdata = churnTest)
glm.accuracy <- round(1 - mean(glm.pred != churnTest$churn_flag), 2)
table(actual = churnTest$churn_flag, predicted = glm.pred)
## predicted
## actual No Yes
## No 2353 92
## Yes 244 884
The confusion matrix is used to compare predicted against actual values. It is important to note that while the accuracy of the model may seem high, much of this result is attributable to the disproportionate number of correct non-churn predictions. To evaluate the logistic model fully, its results are compared with those of a second model based on a decision tree (CART).
True positives (TP): the number of customers correctly predicted as churning.
True negatives (TN): the number of customers correctly predicted as not churning.
False positives (FP): the number of customers incorrectly predicted as churning when they did not churn (type I error).
False negatives (FN): the number of customers incorrectly predicted as not churning when they actually did churn (type II error).
Accuracy metric: how often the classifier is correct? (TP + TN) / total = (2353 + 884) / 3573 = 0.91
Percentage of misclassification: how often the classifier makes an error? (FP + FN) / total = (92 + 244) / 3573 = 0.09
A few more metrics can be derived from the confusion matrix; they are not discussed in detail here, but the sketch below shows how they could be obtained.
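A minimal sketch using caret, assuming "Yes" is treated as the positive (churn) class:
# confusionMatrix() reports accuracy, sensitivity, specificity, kappa and
# related metrics for the predicted vs. actual classes.
confusionMatrix(data = glm.pred,
                reference = churnTest$churn_flag,
                positive = "Yes")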
set.seed(123)
cart.model <- train(churn_flag ~ ., method = "rpart2", data = churnTrain, trControl = modelCtrl)
cart.pred <- predict(cart.model, newdata = churnTest)
cart.accuracy <- 1-mean(cart.pred != churnTest$churn_flag)
table(actual = churnTest$churn_flag, predicted = cart.pred)
## predicted
## actual No Yes
## No 2445 0
## Yes 76 1052
Accuracy: how often the classifier is correct? (TP + TN) / total = (2445 + 1052) / 3573 = 0.98
Percentage of misclassification: how often the classifier makes an error? (FP + FN) / total = (76 + 0) / 3573 = 0.02
The second model, based on CART, improves accuracy. More importantly, this accuracy is driven by a higher share of true positives, which is a significant improvement over the previous logistic model.
There is still room for improvement, in particular in the true positive rate for the churn class. It would be logical to continue the search for a more efficient and accurate model :)
To identify the best model, accuracy will be compared between the following models: Single Decision Tree (CART), Bagged Trees, Boosted Tree and Random Forest.
#CART (Classification And Regression Tree) - gini
set.seed(123)
gini.cart.model <- train(churn_flag ~ ., method = "rpart", data = churnTrain, parms = list(split = "gini"), trControl = modelCtrl)
#Bagging
set.seed(123)
treebag.model <- train(churn_flag ~ ., method = "treebag", data = churnTrain, trControl = modelCtrl)
# Boosted Tree
set.seed(123)
C5.model <- train(churn_flag ~ ., method = "C5.0", data = churnTrain, trControl = modelCtrl)
#Random Forest
set.seed(123)
rf.model <- train(churn_flag ~ ., method = "rf", data = churnTrain, trControl = modelCtrl)
results <- resamples(list(CART=gini.cart.model, BAG=treebag.model, BOOST=C5.model, RF=rf.model))
dotplot(results)
Bagging (bootstrap aggregating) is an ensemble meta-algorithm designed to improve the stability and accuracy of machine learning models used in statistical classification and regression. It reduces variance and helps to avoid overfitting.
Boosting, of which gradient boosting is a popular variant, is a regression and classification technique that builds a prediction model as an ensemble of weak learners, usually decision trees.
Random Forest is an ensemble method for classification and regression that builds multiple decision trees during training. For classification tasks, the output of a random forest is the class chosen by most trees.
The Boosted tree, Random Forest and Bagged Trees models seem to have the highest accuracy. To select the winner, the models are compared using a ROC curve.
gb.pred <- predict(C5.model, newdata = churnTest)
gb.accuracy <- 1-mean(gb.pred != churnTest$churn_flag)
rf.pred1 <- predict(rf.model, newdata = churnTest)
rf.accuracy <- 1-mean(rf.pred1 != churnTest$churn_flag)
glm.roc <- roc(response = churnTest$churn_flag, predictor = as.numeric(glm.pred))
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases
gb.roc <- roc(response = churnTest$churn_flag, predictor = as.numeric(gb.pred))
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases
rf.roc <- roc(response = churnTest$churn_flag, predictor = as.numeric(rf.pred1))
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases
plot(glm.roc, legacy.axes = TRUE, print.auc.y = 0.4, print.auc = TRUE)
plot(gb.roc, col = "blue", add = TRUE, print.auc.y = 0.5, print.auc = TRUE)
plot(rf.roc, col = "red" , add = TRUE, print.auc.y = 0.6, print.auc = TRUE)
legend(0.0,0.2, c("Random Forest", "Boosted Tree", "Logistic"), lty = c(1,1), lwd = c(2, 2), col = c("red", "blue", "black"), cex = 0.75)
The ROC curve (receiver operating characteristic curve) is a graphical way to review the performance of classification models. ROC is a good indicator of efficiency, especially when the focus is on the true positive percentage of models. The area under the curve (AUC) measures the ability of models to classify correctly. The higher the AUC, the more accurate the model.
In the scientific community, the generally accepted interpretation of AUC values is as follows:
# 0.5 = This suggests no discrimination, so we might as well flip a coin.
# 0.5-0.7 = We consider this poor discrimination, not much better than a coin toss.
# 0.7-0.8 = Acceptable discrimination
# 0.8-0.9 = Excellent discrimination
# > 0.9 = Outstanding discrimination
According to the ROC curves, the difference between Random Forest and Boosted Tree is minimal, but both show an improvement over the logistic model. In this sample and with these parameters, Random Forest is the best-performing model.
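Note that the ROC curves above are built from hard class labels, which yields only a few threshold points. A common refinement, sketched below under the assumption that the caret models return class probabilities via predict(type = "prob"), is to feed the predicted probability of the "Yes" class into roc():
# ROC/AUC from predicted class probabilities instead of hard labels
glm.prob <- predict(glm.model, newdata = churnTest, type = "prob")[, "Yes"]
rf.prob  <- predict(rf.model,  newdata = churnTest, type = "prob")[, "Yes"]
glm.roc.prob <- roc(response = churnTest$churn_flag, predictor = glm.prob, levels = c("No", "Yes"))
rf.roc.prob  <- roc(response = churnTest$churn_flag, predictor = rf.prob,  levels = c("No", "Yes"))
plot(glm.roc.prob, legacy.axes = TRUE, print.auc = TRUE, print.auc.y = 0.4)
plot(rf.roc.prob, col = "red", add = TRUE, print.auc = TRUE, print.auc.y = 0.5)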
Given the results achieved, further tuning and cleaning of the models may not be necessary. It is worth considering whether additional factors could be introduced during the modeling phase; performing additional t-tests on the remaining factors is also advised.
Concerning the modeling part, this analysis relies on relatively old and “simple” algorithms such as logistic regression. For a real business case, modeling with H2O, XGBoost and tidymodels in combination with parsnip would be preferable, as it would significantly reduce processing time and be in line with current trends in machine learning. High-performing frameworks are generally preferred, as they are considerably more powerful.
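As a rough illustration of that direction, a minimal tidymodels/parsnip sketch of the same logistic model might look as follows (object names are hypothetical and the tidymodels meta-package is assumed to be installed):
# Re-expressing the logistic model with tidymodels/parsnip
library(tidymodels)
churn_split <- initial_split(clean_tbl, prop = 0.75, strata = churn_flag)
churn_folds <- vfold_cv(training(churn_split), v = 3, strata = churn_flag)
log_spec <- logistic_reg() %>%
  set_engine("glm") %>%
  set_mode("classification")
log_wf <- workflow() %>%
  add_model(log_spec) %>%
  add_formula(churn_flag ~ .)
fit_resamples(log_wf, resamples = churn_folds,
              metrics = metric_set(roc_auc, accuracy)) %>%
  collect_metrics()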
Additionally, it is possible to develop a dedicated function to assess the potential profit for the company when an anti-churn strategy, implemented through various marketing / promotional campaigns aimed at at-risk subscribers, is put in place. A sample function is shown below:
benefit <- function(confMatrix) {
  # Assumes the same orientation as table(actual, predicted) used above:
  # rows = actual (No, Yes), columns = predicted (No, Yes)
  tn <- confMatrix[1, 1]; fp <- confMatrix[1, 2]
  fn <- confMatrix[2, 1]; tp <- confMatrix[2, 2]
  # Hypothesized subscriber revenue for a period of one year upon successful upsell
  tp.bnft <- 1000 ; fn.cost <- -2000
  fp.cost <- -250 ; tn.bnft <- 50
  # 0.75: assumed share of correctly flagged churners retained by the campaign
  # 0.90: assumed share of falsely flagged subscribers who still incur the campaign cost
  return(0.75*tp*tp.bnft +
         (1-0.75)*tp*fn.cost + fn*fn.cost +
         tn*tn.bnft +
         0.90*fp*fp.cost + (1-0.9)*fp*tn.bnft)
}
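A hypothetical usage example, applying the function to the Random Forest confusion matrix computed earlier:
# Estimated net benefit of a campaign targeting subscribers flagged by the
# Random Forest model, under the assumed revenue/cost figures above
rf.conf <- table(actual = churnTest$churn_flag, predicted = rf.pred1)
benefit(rf.conf)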
In order to put this particular workflow into production, it is recommended to build a {targets} pipeline (https://github.com/ropensci/targets), which is conceptually similar to Airflow in Python. The package was developed within the data science team at Eli Lilly and has recently gained a lot of exposure and positive feedback across the data science community.
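A minimal sketch of what such a pipeline could look like is given below; clean_churn_data() and fit_churn_models() are hypothetical wrapper functions around the cleaning and modeling steps in this document.
# _targets.R -- sketch of a {targets} pipeline for this workflow
library(targets)
tar_option_set(packages = c("tidyverse", "caret", "pROC"))
list(
  # Track the raw CSV so downstream targets rebuild when it changes
  tar_target(raw_file, "SDV_DAT_JAN.csv", format = "file"),
  tar_target(raw_data, read.csv(raw_file, header = TRUE, sep = ";")),
  # Hypothetical wrappers around the cleaning and modeling code above
  tar_target(clean_data, clean_churn_data(raw_data)),
  tar_target(models, fit_churn_models(clean_data))
)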
# Appendix: alternative imputation and wrangling snippets, kept for reference (not run) ----

# Linear Imputation - impute_lm() ----
# air_quality_tbl %>%   # assumed to be the base R airquality dataset as a tibble
#
# # Label if Ozone is missing
# add_label_missings(Ozone) %>%
#
# # Imputation - Linear Regression
# mutate(Ozone = as.double(Ozone)) %>%
# impute_lm(Ozone ~ Temp + Wind) %>%
#
# # Visualize
# ggplot(aes(Solar.R, Ozone, color = any_missing)) +
# geom_point()
#
# # Random Forest - impute_rf() ----
#
# air_quality_tbl %>%
#
# # Label if Ozone is missing
# add_label_missings(Ozone) %>%
#
# # Imputation - Ozone
# mutate(Ozone = as.double(Ozone)) %>%
# impute_rf(Ozone ~ Temp + Wind) %>%
#
# # Imputation - Solar.R
# mutate(Solar.R = as.double(Solar.R)) %>%
# impute_rf(Solar.R ~ Temp + Wind) %>%
#
# # Visualize
# ggplot(aes(Solar.R, Ozone, color = any_missing)) +
# geom_point()
#
# Example factor conversion written against a different (bank churn) dataset:
# Churn <- DF %>%
# select(-RowNumber, -CustomerId) %>% #remove unwanted column
# mutate(Geography = as.factor(Geography),
# Gender = as.factor(Gender),
# HasCrCard = as.factor(HasCrCard),
# IsActiveMember = as.factor(IsActiveMember),
# Exited = as.factor(Exited),
# Tenure = as.factor(Tenure),
# NumOfProducts = as.factor(NumOfProducts))
# mutate(LE_TYPE_DESC = ifelse(LE_TYPE_DESC == "Price Plan Migration", "Mig", LE_TYPE_DESC))
# Transform and impute the Gender column
# mutate(GENDER = ifelse(GENDER == "Male", 1,0)) %>%
# Replace all NAs across the dataset using dplyr
# mutate(across(where(anyNA), ~ replace_na(., 0))) %>%
# Populate blank rows with dummy data
# mutate(across(c("PORTIN_COMPETITOR_NAME_ENG",
# "REF_PRICE_PLAN_DESC",
# "LE_TYPE_DESC",
# "TERMINAL_TYPE",
# "TERMINAL_BRAND",
# "TERMINAL_CAT",
# "DEVICE_TYPE_DESC",
# "DEVICE_MANUFACTURER_DESC",
# "CNT_BARRED_STATUS"), ~ifelse(. == "", "NoData", as.character(.)))) %>%
# Convert churn to factor post imputation
#mutate(GENDER = as.factor(GENDER))
#mutate_if(is.character, as.factor)
# churnTrain %>%
# filter(TERMINAL_BRAND == "LENOVO")
# Generate ID column using row_number
# mutate(id = row_number())
# churn_no_clean_tbl <- DF[rowSums(is.na(churn_no)) == 0,]