Churn Analysis

Synopsis

The market of telecommunication services is saturated and the growth rates of the customer base are low. Therefore the main focus of telecom providers should be maintaining their current client base. This project examines fictional data with the aim of identifying key predictors of customer churn by evaluating different machine learning models, using standard and validated metrics.

library(tidyverse)
library(plyr)
library(lubridate)
library(skimr)
library(visdat)
library(naniar)
library(simputation)
library(correlationfunnel)
library(DataExplorer)
library(gridExtra)
library(corrplot)
library(gridExtra)
library(pROC)
library(C50)
library(caret)
library(rpart)
library(WRS2)
library(afex)
library(ggthemes)
library(ggstatsplot)

SDV_DAT_JAN <- read.csv("SDV_DAT_JAN.csv", header = TRUE, sep =";")

skim(SDV_DAT_JAN)

Data summary
Name	SDV_DAT_JAN
Number of rows	500000
Number of columns	47
_______________________
Column type frequency:
character	15
logical	1
numeric	31
________________________
Group variables	None

Variable type: character

skim_variable	complete_rate	min	max	empty	n_unique
PP_GROUP_NAME	1	7	9	0	2
SUBSCRIPTION_STATUS	1	1	1	0	1
GENDER	1	0	6	7799	3
PRICE_PLAN_DESC	1	10	20	0	36
PORTIN_COMPETITOR_NAME_ENG	1	0	7	440437	3
REF_PRICE_PLAN_DESC	1	0	46	44749	36
LE_TYPE_DESC	1	7	22	0	7
TERMINAL_TYPE	1	0	16	282133	4
TERMINAL_BRAND	1	0	9	291555	21
TERMINAL_CAT	1	0	15	291231	6
DEVICE_TYPE_DESC	1	0	25	9728	11
DEVICE_MANUFACTURER_DESC	1	0	25	14564	26
SILENT_SMS	1	1	3	0	26
SILENT_DATA	1	1	3	0	26
MTA_MAU_CURRENT	1	1	1	0	2

Variable type: logical

skim_variable	n_missing	complete_rate	mean	count
SUBSCRIPTION_STATUS_NEXT_MNT	5e+05	0	NaN	:

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
DATE_ID_CALC	0	1.00	20210131.00	0.00	20210131.00	20210131.00	20210131.00	20210131.00	20210131.00	▁▁▇▁▁
CNT_BARRED_STATUS	262485	0.48	9.26	7.61	1.00	2.00	7.00	14.00	38.00	▇▅▂▁▁
CNT_BARRED_DAYS	263417	0.47	23.39	29.75	0.00	1.27	12.82	40.25	303.71	▇▁▁▁▁
CNT_SUSPENDED_STATUS	259637	0.48	0.02	0.15	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
CNT_SUSPENDED_DAYS	264225	0.47	1.15	4.72	0.00	0.00	0.00	0.01	39.12	▇▁▁▁▁
CNT_SUB_CUST	0	1.00	1.81	1.27	0.00	1.00	2.00	3.00	8.00	▇▆▂▁▁
NETWORK_AGE	0	1.00	94.20	56.79	16.95	40.97	86.59	149.64	235.69	▇▃▅▅▁
CUST_AGE	19628	0.96	45.20	15.31	14.00	33.00	43.00	59.00	91.00	▅▇▆▅▁
REAL_MF_WITH_VAT	0	1.00	20.72	10.56	3.46	12.28	17.90	30.56	54.52	▇▇▇▁▁
GA_TYPE_FLAG	242133	0.52	4.35	2.38	1.00	1.00	5.00	7.00	7.00	▇▁▃▆▇
MONTHS_TO_EXPIRY_CRNT	0	1.00	2.43	0.22	1.94	2.28	2.45	2.56	3.04	▂▇▇▃▁
REF_REAL_MF_WITH_VAT	56800	0.89	19.24	8.52	0.00	11.29	19.24	21.13	66.29	▅▇▁▁▁
LEASING_CNT	288866	0.42	1.87	1.08	1.00	1.00	2.00	2.00	5.00	▇▅▂▁▁
TERMINAL_CNT	150379	0.70	1.64	0.68	1.00	1.00	2.00	2.00	4.00	▇▇▁▂▁
AVG_MONTHLY_BILL_6M_NO_VAT	0	1.00	36.28	32.08	0.88	18.84	24.80	43.71	224.58	▇▂▁▁▁
INVOICE_CNT	0	1.00	5.79	0.41	5.00	6.00	6.00	6.00	6.00	▂▁▁▁▇
ROAMING_MIN_CHARGE_6M	0	1.00	0.08	0.55	0.00	0.00	0.00	0.00	8.55	▇▁▁▁▁
ROAMING_GPRS_CHARGE_6M	0	1.00	0.13	1.14	0.00	0.00	0.00	0.01	23.00	▇▁▁▁▁
VAS_CHARGE_6M	0	1.00	0.00	0.00	0.00	0.00	0.00	0.00	0.01	▇▂▁▁▁
DOB_CHARGE_6M	488003	0.02	12.50	4.34	1.68	12.30	12.31	12.32	29.91	▁▇▁▁▁
GPRS_USAGE_MB_L3M	0	1.00	1776.90	2859.71	0.00	0.00	239.21	2850.23	21289.22	▇▁▁▁▁
ROAMING_GPRS_USAGE_L3M	0	1.00	33.93	293.06	0.00	0.00	0.00	0.00	6053.26	▇▁▁▁▁
CNT_PAYMENTS_L12M	0	1.00	11.27	1.62	1.00	11.00	11.00	12.00	25.00	▁▁▇▁▁
TNSHOPS_PAY_L12M	165812	0.67	5.80	3.77	1.00	3.00	5.00	8.00	28.00	▇▃▂▁▁
EASYPAY_PAY_L12M	254293	0.49	6.69	3.85	0.54	3.10	7.41	10.25	12.66	▇▅▇▂▇
EPAY_PAY_L12M	408505	0.18	7.90	3.64	0.00	6.00	8.00	12.00	12.00	▃▂▂▇▇
MTAPP_PAY_L12M	411465	0.18	4.06	2.23	1.00	3.00	4.00	4.00	15.00	▅▇▂▁▁
BGPOST_PAY_L12M	400421	0.20	6.19	3.25	1.00	4.00	6.00	8.00	12.00	▃▁▇▂▃
OTHER_PAY_L12M	415186	0.17	5.21	2.75	1.00	4.00	5.00	5.00	13.00	▂▇▁▂▁
PLAY_DIEMA_XTRA_YES	0	1.00	0.00	0.00	0.00	0.00	0.00	0.00	0.00	▁▁▇▁▁
churn_flag	0	1.00	0.01	0.09	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁

SDV_DAT_JAN %>% plot_intro()

Preliminary conclusions

EDA of the data shows that some columns can be removed due to the following reasons:

their values are not directly relevant to customer churn.
the number of missing observations is too high for using imputation techniques.

Such columns are: DATE_ID_CALC, SUBSCRIPTION_STATUS - lac of relevance to the model; and SUBSCRIPTION_STATUS_NEXT_MNT - empty column.

SDV_DAT_JAN %>% plot_missing()

SDV_DAT_JAN %>% vis_miss(warn_large_data = FALSE)

SDV_DAT_JAN %>% gg_miss_upset()

In order to perform thorough analysis of all independent variables and their significance as predictors, certain columns require manipulation to be included in the model. For example:

CNT_SUSPENDED_STATUS will be recoded from 0 to 1, and NA values will be converted to 0.
PP_GROUP_NAME will be converted to factor for easier consumption of the ml model.
NA values in columns such as CNT_BARRED_STATUS and CNT_SUSPENDED_DAYS will be converted to 0.
Blank rows (nulls) will be converted to character type and will be imputed with “NoData”.

By performing these steps it becomes possible to include these columns in a subsequent correlation and t-test analysis and finally to evaluate them for predictability of churn and their statistical significance. An alternative approach will be to remove them completely from the model, but this leads to the risk of omitting potentially important factors.

DF <- within(SDV_DAT_JAN, rm(DATE_ID_CALC,
                             SUBSCRIPTION_STATUS,
                             SUBSCRIPTION_STATUS_NEXT_MNT,
                             DOB_CHARGE_6M,
                             OTHER_PAY_L12M,
                             MTAPP_PAY_L12M,
                             EPAY_PAY_L12M,
                             BGPOST_PAY_L12M,
                             PLAY_DIEMA_XTRA_YES))

churn_yes <- DF %>% 
    filter(churn_flag == "1")

churn_no <- DF %>% 
    filter(churn_flag == "0") %>% 
    na.omit() 

clean_df <- join(churn_no, churn_yes, type = "full")

This procedure groups all subscribers with a positive churn flag (1) in a separate sub-table containing 4300 observations. This is done in order to preserve all true positive observations. All other subscribers are grouped in another table based on churn = no flag (0), after which all rows with missing data in churn_no table are removed. This reduces the total number of observations to only 9995 from the initial 495,700. Applying this method over the churn_yes table is not applicable because the model overfits and becomes useless. Additionally, only 163 observations in churn_yes table contain complete data, which is less than 2% of the total.

This operation sanitizes the data and provides more accurate overview of the distribution. Separating and merging the two tables results in a clean and tidy dataframe, which helps the modeling processes downstream.

# Wrangle, Transform & Impute

clean_tbl <- clean_df %>%
   
    # Round all numerical columns across the dataset
    mutate_if(is.numeric, round, digits = 3) %>% 
    
    # Revert data coding from 0 to 1 in CNT_SUSPENDED_STATUS
    mutate(CNT_SUSPENDED_STATUS = ifelse(CNT_SUSPENDED_STATUS == "0", 1,0)) %>%
    
    # Replace all NAs across the dataset using base R
    replace(is.na(.), 0) %>% 
    
    # Convert the churn flag from 1 and 0 to Yes and No
    mutate(churn_flag = ifelse(churn_flag == "1", "Yes", "No")) %>%
    
    # Convert churn to factor post imputation
    mutate(churn_flag = as.factor(churn_flag))
    
    # Fill in null rows with "NoData" character value
    clean_tbl[clean_tbl == ""] <- "NoData"

Graphical distribution of categorical predictors

exp1 <- clean_tbl %>% 
    filter(GENDER != "NoData") %>% 
    ggplot(aes(GENDER, fill = churn_flag)) + 
    geom_bar(position = "fill") + 
    labs(x = "Gender", y = "") + 
    theme(legend.position = "none")

exp3 <- clean_tbl %>% 
    filter(GA_TYPE_FLAG != "0") %>% 
    ggplot(aes(GA_TYPE_FLAG, fill = churn_flag)) + 
    geom_bar(position = "fill") + 
    labs(x = "How the sub joined", y = "") + 
    theme(legend.position = "none") 

exp4 <- ggplot(clean_tbl, aes(PP_GROUP_NAME, fill = churn_flag)) + 
    geom_bar(position = "fill") + 
    labs(x = "Segment", y = "") + 
    theme(legend.position = "none")

grid.arrange(exp1, exp3, exp4, ncol = 4, nrow = 1, top = "Churn/Non-churn Proportion")

exp2 <- ggplot(clean_tbl, aes(LE_TYPE_DESC, fill = churn_flag)) + 
    geom_bar(position = "fill") + 
    labs(x = "Last Event Type", y = "") + 
    theme(legend.position = "none")

grid.arrange(exp2, ncol = 1, nrow = 1, top = "Churn/Non-churn Proportion")

We can observe that among these variables whether a customer would churn depends largely on the type of contract.

It seems that subscribers who use mainly voice services without Internet / data services are much more likely to churn than subscribers who use full data and voice contract. It can be speculated that customers using only voice services are likely from poor socio-economic background or just senior citizens.

Additionally, the way subscribers join the telecom also has an impact. The GA_TYPE_FLAG column illustrates that for code numbers 4 and 5, subscribers differ in the total number of churn.

Also, the way subscribers change their plan from REF_PRICE_PLAN to PRICE_PLAN - with the following options - Migration / Retention / Saving, etc., seem to affect the level of churn.

It is important to note that these hypotheses are based solely on graphical distribution. Without being tested for significance, it would not be prudent to draw any conclusions. Subsequently, by correlation analysis and t-test, some of these hypotheses are tested.

exp5 <- ggplot(clean_tbl, aes(churn_flag, CNT_BARRED_STATUS, fill = churn_flag)) + 
    geom_boxplot(alpha = 0.8) + 
    theme(legend.position = "null")

exp6 <- ggplot(clean_tbl, aes(churn_flag, GPRS_USAGE_MB_L3M, fill = churn_flag)) + 
    geom_boxplot(alpha = 0.8) + 
    theme(legend.position = "null")

exp7 <- ggplot(clean_tbl, aes(churn_flag, TERMINAL_CNT, fill = churn_flag)) + 
    geom_boxplot(alpha = 0.8) + 
    theme(legend.position = "null")

exp8 <- ggplot(clean_tbl, aes(churn_flag, MONTHS_TO_EXPIRY_CRNT, fill = churn_flag)) + 
    geom_boxplot(alpha = 0.8) + 
    theme(legend.position = "null")

exp9 <- ggplot(clean_tbl, aes(churn_flag, CNT_SUB_CUST, fill = churn_flag)) + 
    geom_boxplot(alpha = 0.8) + 
    theme(legend.position = "null")

exp10 <- ggplot(clean_tbl, aes(churn_flag, CUST_AGE, fill = churn_flag)) + 
    geom_boxplot(alpha = 0.8) + 
    theme(legend.position = "null")

grid.arrange(exp5, exp6, exp7, 
 
                         exp8, exp9, exp10, 
             ncol = 3, nrow = 2)

From the above graph of this set of factors it is difficult to draw conclusions. Graphically, the differences seem negligible. Conducting t-test may reveal other conclusions.

T-test analysis

T-test is a standard statistical tool for testing hypotheses. In this analysis, its use is intended to show comprehension of such methods, rather than confirming / rejecting certain assumptions and hypotheses. Performing scientific testing takes time :)

This analysis looks at the distribution between contract change type (Migration / Retention / Saving) and customer age. The graph contains many statistical indicators, for example: significance, confidence intervals, and many others.

s <- clean_df %>% 
    mutate(id = rownames(.)) %>% 
    select(id, CUST_AGE, GENDER, NETWORK_AGE, LE_TYPE_DESC, churn_flag) %>% 
    filter(GENDER != "")

ggwithinstats(
  data = s,
  x = LE_TYPE_DESC,
  y = CUST_AGE,
  type = "r", # robust statistics
  xlab = "REF_PRICE_PLAN to PRICE_PLAN",
  ylab = "Customer Age",
  outlier.tagging = FALSE)

The following T-test is applied over contract duration and churn flag.

w <- clean_df %>% 
    mutate(id = rownames(.)) %>% 
    select(id, CUST_AGE, GENDER, NETWORK_AGE, LE_TYPE_DESC, churn_flag) %>% 
    filter(GENDER != "")

ggwithinstats(
  data = w,
  x = churn_flag,
  y = NETWORK_AGE,
  type = "p", # Parametric test
  xlab = "Churn",
  ylab = "Subscription duration in months",
  outlier.tagging = FALSE)

For the purpose of this last t-test which is applied over customer age and churn flag the following hypotheses are formulated.

H0 - the age of the subscribers does not affect whether they are likely to churn.
Ha - the age of the subscribers affects whether they are likely to churn.

ggbetweenstats(s, 
               churn_flag, 
               CUST_AGE
               )

Welch’s T-test revealed that in 13,843 observations, the difference in subscribers’ age and whether they were willing to opt out was not statistically significant.

The effect size (g = -0.08) is very low, practically zero. According to the Cohen Convention (1988), this indicator is not statistically significant. The Bayes factor revealed that the alternative hypothesis can be completely rejected (BF = -4.96). This can be considered serious evidence (Jeffreys, 1961) in favor of the null hypothesis.

Correlation analysis

The application of correlation analysis helps to identify factors associated with customer churn. This is a well established method for illustrating relationships between variables.

corrplot(cor(clean_tbl[sapply(clean_tbl, is.numeric)]))

The following correlogram complements the conclusions made above with the addition of correlation coefficients. Relationships range from low (r = + -0.4) to moderate (r = + -0.5-0.7). It is important to note that some of the moderate correlation coefficients are between directly related variables, for example: CNT_BARRED and CNT_SUSPENDED, which do not add value to the analysis.

Alternative approach to correlation analysis is the CORRELATION FUNNEL package

Pros:
- Works with categories (converts into bins)
- Shows positive and negative relationship
- Works with nonlinear data (uses binary conversion)
- Quick operation - Uses Pearson’s correlation formula and is multithreaded
Cons:
- Nonlinear relationship is based on bins
- May become poor performer if the data is not normally distributed (skew, kurtosis)

churn_bin_tbl <- clean_tbl %>% binarize()

options(ggrepel.max.overlaps = 20)

churn_bin_tbl %>%
    correlate(target = churn_flag__Yes) %>%
    plot_correlation_funnel() +
    geom_point(size = 2, color = "#2c3e50")

Binary version of correlation matrix

A correlation coefficient of up to + -0.7 is considered a moderate relationship. It is apparent that with this sample, observing high / very high correlation between the variables is unlikely. Very high relationships are difficult to detect and observe and they may be misleading. There are infinite amount of factors that have potential impact on each variable.

Modeling

For the purpose of the current analysis all models are subject to three-step cross-validation.

In the case of k-fold cross-validation, the original sample is randomly divided into k samples of the same size. Of the k sub-samples, one sub-sample is saved as model validation and testing data, and the remaining k-1 sub-samples are used as model training data.

This ensures that each observation from the original dataset has a chance to appear in a training and test sample. This is one of the most established approaches for model validation.

Before proceeding to the cross-validation split, the data were divided into churnTrain table (10720 observations) and churnTest table (3537 observations). churnTrain will be used to build the models, while churnTest will be used to evaluate them.

## 75% of the sample size
smp_size <- floor(0.75 * nrow(clean_tbl))

set.seed(123)
train_ind <- sample(seq_len(nrow(clean_tbl)), size = smp_size)

churnTrain <- clean_tbl[train_ind, ] %>% filter(TERMINAL_BRAND != "LENOVO")
churnTrain <- clean_tbl[train_ind, ] %>% filter(TERMINAL_BRAND != "PHILIPS") 
churnTest <- clean_tbl[-train_ind, ] %>% filter(TERMINAL_BRAND != "LENOVO")
churnTest <- clean_tbl[-train_ind, ] %>% filter(TERMINAL_BRAND != "PHILIPS")

Logistic regression models are fast and easy to interpret. In general, this type of modeling does not require scaling of the input variables, no additional settings (hyperparameter tuning) are required, they are easily adjusted and well-calibrated probabilities are displayed.

modelCtrl <- trainControl(method = "cv", number = 3)

Logistic regression

set.seed(123)
glm.model <- train(churn_flag ~ .,
                   method = "glm",
                   family = "binomial",
                   data = churnTrain,
                   trControl = modelCtrl)

# summary(glm.model)

Statistically significant factors to churn

# Deviance Residuals: 
#     Min       1Q   Median       3Q      Max  
# -3.5160  -0.3849  -0.1697   0.0405   3.8338  
# 
# Coefficients:
#                                                                     Estimate Std. Error z value Pr(>|z|)    

# CNT_BARRED_STATUS                                                 -1.799e-02  5.107e-03  -3.523 0.000426 ***
# CNT_SUSPENDED_STATUS                                              -3.640e+00  1.446e-01 -25.165  < 2e-16 ***
# NETWORK_AGE                                                        3.078e-03  8.740e-04   3.521 0.000429 ***
# CUST_AGE                                                          -1.124e-02  2.875e-03  -3.908 9.31e-05 ***
# GA_TYPE_FLAG                                                      -2.269e-01  1.496e-02 -15.168  < 2e-16 ***
# PORTIN_COMPETITOR_NAME_ENGNoData                                  -6.804e-01  1.623e-01  -4.192 2.77e-05 ***
# LE_TYPE_DESCHandset only Migration                                -8.689e-01  2.029e-01  -4.282 1.85e-05 ***
# LE_TYPE_DESCPrice Plan Migration                                  -8.962e-01  1.689e-01  -5.306 1.12e-07 ***
# LE_TYPE_DESCRetention                                             -8.226e-01  1.778e-01  -4.626 3.74e-06 ***
# TERMINAL_BRANDAPPLE                                                1.441e+00  3.759e-01   3.835 0.000126 ***
# LEASING_CNT                                                       -9.881e-01  4.456e-02 -22.175  < 2e-16 ***
# TERMINAL_CNT                                                      -2.693e-01  5.019e-02  -5.366 8.04e-08 *** 
# INVOICE_CNT                                                       -3.080e-01  8.332e-02  -3.696 0.000219 ***
# 
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# 
# (Dispersion parameter for binomial family taken to be 1)
# 
#     Null deviance: 13021.5  on 10719  degrees of freedom
# Residual deviance:  5147.8  on 10514  degrees of freedom
# AIC: 5559.8
# 
# Number of Fisher Scoring iterations: 17

Results of the glm model

The summary of the logistics model reveals the main predictors of churn. At a significance level of 5% (0.05) and for a sample size of 10720, the variables denoted by *** (p = 0.001) showed a statistically significant relationship between churn and predictor variables.

varImp(glm.model)

## glm variable importance
## 
##   only 20 most important variables shown (out of 205)
## 
##                                      Overall
## EASYPAY_PAY_L12M                     100.000
## CNT_SUSPENDED_STATUS                  78.574
## LEASING_CNT                           69.237
## GA_TYPE_FLAG                          47.361
## MTA_MAU_CURRENTY                      20.986
## TNSHOPS_PAY_L12M                      19.696
## TERMINAL_CNT                          16.755
## `LE_TYPE_DESCPrice Plan Migration`    16.567
## GPRS_USAGE_MB_L3M                     15.549
## LE_TYPE_DESCRetention                 14.443
## `LE_TYPE_DESCHandset only Migration`  13.371
## PORTIN_COMPETITOR_NAME_ENGNoData      13.088
## CUST_AGE                              12.202
## TERMINAL_BRANDAPPLE                   11.974
## INVOICE_CNT                           11.541
## CNT_BARRED_STATUS                     11.001
## NETWORK_AGE                           10.995
## CNT_BARRED_DAYS                       10.193
## TERMINAL_BRANDHONOR                    9.839
## TERMINAL_BRANDNoData                   9.815

The results of the statistical model are partially related to the previous observations conducted through the initial phase of the correlation analysis between the dependent (forecasted) and independent (predictor) variables.

An important predictor is CNT_SUSPENDED_STATUS, which depicts how many times the subscriber’s service has been suspended.
Other important predictors of churn are factors such as whether the subscribers use leasing services, how the subscribers have signed a contract with the telecom (through PortIN / from Sales / Pre2Post Migration), the length of the period in which the customer is an active subscriber, whether mobile internet and/or voice services are used and others.

Model evaluation

Common practice for evaluating models is to evaluate them against new, “unseen” data. In this case, the churnTest table is used to evaluate and measure the performance of the model.

glm.pred <- predict(glm.model, newdata = churnTest)
glm.accuracy <- round(1 - mean(glm.pred != churnTest$churn_flag), 2)
table(actual = churnTest$churn_flag, predicted = glm.pred)

##       predicted
## actual   No  Yes
##    No  2353   92
##    Yes  244  884

Results of a confusion matrix are used to evaluate the predicted vs actual values. It is important to note that while the accuracy of the model may seem high, much of this result is attributed to the disproportionate number of non-churn correct predictions. In order to fully evaluate the logistic model, its results are compared with the results of a second model based on a decision tree (CART) model.

True positive results (TP): represents the number of customers that were correctly predicted as intending to churn.
True negatives (TN): represents the number of customers that were correctly predicted as not intending to churn.
False positives (FP): represents the number of customers that were incorrectly predicted as intending to churn but never churned or intended to either (type I error)
False negatives (FN): represents the number of customers that were incorrectly predicted as not intending to churn but actually did churn (type II error)

Accuracy metric: how often the classifier is correct? (TP + TN) / total = (2353 + 884) / 3573 = 0.91

Percentage of misclassification: how often the classifier makes an error? (FP + FN) / total = (92 + 244) / 3573 = 0.09

A few more metrics can be derived from the confusion matrix, but in this case they will not be considered.

set.seed(123)
cart.model <- train(churn_flag ~ ., method = "rpart2", data = churnTrain, trControl = modelCtrl)
cart.pred <- predict(cart.model, newdata = churnTest)
cart.accuracy <- 1-mean(cart.pred != churnTest$churn_flag)
table(actual = churnTest$churn_flag, predicted = cart.pred)

##       predicted
## actual   No  Yes
##    No  2445    0
##    Yes   76 1052

Accuracy: how often the classifier is correct? (TP + TN) / total = (2445 + 1052) / 3573 = 0.98

Percentage of misclassification: how often the classifier makes an error? (FP + FN) / total = (76 + 0) / 3573 = 0.02

It seems that the second CART model improves accuracy. More importantly, a higher degree of true positive results constitute this accuracy. This is significant improvement over the previous logistics model.

There is room for improvement, in particular in the true positive percentage (model specificity). It would be logical to continue the search for a more efficient and accurate model :)

Choosing the best model

To identify the best model, accuracy will be compared between the following models: Single Decision Tree (CART), Bagged Trees, Boosted Tree and Random Forest.

#CART (Classification And Regression Tree) - gini
set.seed(123)
gini.cart.model <- train(churn_flag ~ ., method = "rpart", data = churnTrain,  parms = list(split = "gini"), trControl = modelCtrl)

#Bagging
set.seed(123)
treebag.model <- train(churn_flag ~ ., method = "treebag", data = churnTrain, trControl = modelCtrl)

# Boosted Tree
set.seed(123)
C5.model <- train(churn_flag ~ ., method = "C5.0", data = churnTrain, trControl = modelCtrl)

#Random Forest
set.seed(123)
rf.model <- train(churn_flag ~ ., method = "rf", data = churnTrain, trControl = modelCtrl)
results <- resamples(list(CART=gini.cart.model, BAG=treebag.model, BOOST=C5.model, RF=rf.model))

dotplot(results)

Bagging (from bootstrap aggregating) is a meta-algorithm machine learning ensemble designed to improve the stability and accuracy, used in statistical classification and regression. This method reduces deviations and helps to avoid overfitting.

Gradient Boosting is a regression and classification technique that creates a prediction model in the form of an ensemble of weak prediction models, usually decision trees.

Random Forest is an ensemble method of classification and regression that works by building multiple decision trees during training. For classification tasks, the result of a random forest model is the class chosen by most trees.

The Boosted tree, Random Forest and Bagged Trees models seem to have the highest accuracy. To select the winner, the models are compared using a ROC curve.

gb.pred <- predict(C5.model, newdata = churnTest)
gb.accuracy <- 1-mean(gb.pred != churnTest$churn_flag)

rf.pred1 <- predict(rf.model, newdata = churnTest)
rf.accuracy <- 1-mean(rf.pred1 != churnTest$churn_flag)

glm.roc <- roc(response = churnTest$churn_flag, predictor = as.numeric(glm.pred))

## Setting levels: control = No, case = Yes

## Setting direction: controls < cases

gb.roc <- roc(response = churnTest$churn_flag, predictor = as.numeric(gb.pred))

## Setting levels: control = No, case = Yes
## Setting direction: controls < cases

rf.roc <- roc(response = churnTest$churn_flag, predictor = as.numeric(rf.pred1))

## Setting levels: control = No, case = Yes
## Setting direction: controls < cases

plot(glm.roc,      legacy.axes = TRUE, print.auc.y = 0.4, print.auc = TRUE)
plot(gb.roc, col = "blue", add = TRUE, print.auc.y = 0.5, print.auc = TRUE)
plot(rf.roc, col = "red" , add = TRUE, print.auc.y = 0.6, print.auc = TRUE)
legend(0.0,0.2, c("Random Forest", "Boosted Tree", "Logistic"), lty = c(1,1), lwd = c(2, 2), col = c("red", "blue", "black"), cex = 0.75)

Results

The ROC curve (receiver operating characteristic curve) is a graphical way to review the performance of classification models. ROC is a good indicator of efficiency, especially when the focus is on the true positive percentage of models. The area under the curve (AUC) measures the ability of models to classify correctly. The higher the AUC, the more accurate the model.

In the scientific community generally accepted AUC values are as follows:

#  0.5     = This suggests no discrimination, so we might as well flip a coin.
#  0.5-0.7 = We consider this poor discrimination, not much better than a coin toss.
#  0.7-0.8 = Acceptable discrimination
#  0.8-0.9 = Excellent discrimination
#    > 0.9 = Outstanding discrimination

Reference: https://www.researchgate.net/publication/7511660_Rice_ME_Harris_GTComparing_effect_sizes_in_follow-up_studies_ROC_Area_Cohen's_d_and_r_Law_Hum_Behav_29_615-620

According to the result of the ROC curve, the difference between Random Forest and Boosted Tree is minimal, but both models show an improvement over the logistics model. In this sample and with these parameters, Random Forest is the best predicting model.

Model improvement and conclusion

Given the achieved results, further tuning and cleaning the models may not be necessary. It is worth considering whether it would be possible to introduce additional factors during the modeling phase. Moreover, performing additional t-tests on the remaining factors is advised.

Concerning the modeling part, this analysis uses relatively old and “simple” logistic regression algorithms. For a real business case, modeling using H2O, XGBoost and TidyModels in combination with Parsnip would be optimal, as this would significantly reduce the processing time and would be in line with current trends in machine learning. High-performing algorithms are always preferred as they are much more powerful.

Additionally, it is possible to develop specific function to assess potential profits for the company when anti-churn strategy through various marketing / promotional campaigns aimed at subscribers at risk is implemented. Sample function below:

benefit <- function(confMatrix) {
  tp <- confMatrix[1,1]; fn <- confMatrix[1,2]
  fp <- confMatrix[2,1]; tn <- confMatrix[2,2]
  
  # Hypothesized subscriber revenue for a period of one year upon successful upsell
  
  tp.bnft <- 1000      ; fn.cost <- -2000  
  fp.cost <- -250      ; tn.bnft <- 50
  
  return(0.75*tp*tp.bnft + 
           (1-0.75)*tp*fn.cost + fn*fn.cost + 
           tn*tn.bnft + 
           0.90*fp*fp.cost + (1-0.9)*fp*tn.bnft)
}

In order to implement this particular workflow in production, it is recommended to develop Targets pipeline (https://github.com/ropensci/targets), which is very similar to Airflow in Python. This packages has been developed by the data science team of Eli Lilly and as of recently it has gained a lot of exposure and positive feedback across the data science community.

Backup code

# Linear Imputation - impute_lm() ----

# DF %>%
# 
#     # Label if Ozone is missing
#     add_label_missings(Ozone) %>%
# 
#     # Imputation - Linear Regression
#     mutate(Ozone = as.double(Ozone)) %>%
#     impute_lm(Ozone ~ Temp + Wind) %>%
# 
#     # Visualize
#     ggplot(aes(Solar.R, Ozone, color = any_missing)) +
#     geom_point()
# 
# # Random Forest - impute_rf() ----
# 
# air_quality_tbl %>%
# 
#     # Label if Ozone is missing
#     add_label_missings(Ozone) %>%
# 
#     # Imputation - Ozone
#     mutate(Ozone = as.double(Ozone)) %>%
#     impute_rf(Ozone ~ Temp + Wind) %>%
# 
#     # Imputation - Solar.R
#     mutate(Solar.R = as.double(Solar.R)) %>%
#     impute_rf(Solar.R ~ Temp + Wind) %>%
# 
#     # Visualize
#     ggplot(aes(Solar.R, Ozone, color = any_missing)) +
#     geom_point()
#

# Churn <- DF %>% 
#   select(-RowNumber, -CustomerId) %>% #remove unwanted column 
#   mutate(Geography = as.factor(Geography),
#          Gender = as.factor(Gender),
#          HasCrCard = as.factor(HasCrCard),
#          IsActiveMember = as.factor(IsActiveMember),
#          Exited = as.factor(Exited),
#          Tenure = as.factor(Tenure),
#          NumOfProducts = as.factor(NumOfProducts))

# mutate(LE_TYPE_DESC %in% "Price Plan Migration" <- "Mig")

# Transform and impute the Gender column 
# mutate(GENDER = ifelse(GENDER == "Male", 1,0)) %>%

# Replace all NAs across the dataset using dplyr
# mutate(across(where(anyNA), ~ replace_na(., 0))) %>% 
    
# Populate blank rows with dummy data
# mutate(across(c("PORTIN_COMPETITOR_NAME_ENG",
#                 "REF_PRICE_PLAN_DESC",
#                 "LE_TYPE_DESC",
#                 "TERMINAL_TYPE",
#                 "TERMINAL_BRAND",
#                 "TERMINAL_CAT",
#                 "DEVICE_TYPE_DESC",
#                 "DEVICE_MANUFACTURER_DESC",
#                 "CNT_BARRED_STATUS"), ~ifelse(. == "", "NoData", as.character(.)))) %>% 
    
# Convert churn to factor post imputation
#mutate(GENDER = as.factor(GENDER))
#mutate_if(is.character, as.factor)

# churnTrain %>%
#     filter(TERMINAL_BRAND == "LENOVO")

# Generate ID column using row_number
# mutate(id = row_number())

# churn_no_clean_tbl <- DF[rowSums(is.na(churn_no)) == 0,]