Forecasting Churn at a Telecom Company

The dataset comes from a CRM system holding demographic, financial, product and communication information on customers, along with whether or not each customer churned. To be included in the dataset, a customer must have signed up and paid for at least one month of service. Each record covers a single tenure period; if a churned customer is won back, a new record is logged.
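
The code below assumes the usual tidyverse packages plus the modelling libraries are attached, and that the CRM extract has already been read into a data frame called telecom. A minimal setup sketch (the file name and NA handling are assumptions):

#packages used throughout the analysis
library(dplyr); library(tidyr); library(purrr)
library(ggplot2); library(corrplot)
library(caret); library(ROSE); library(randomForest)
library(keras)

#hypothetical file name; blank TotalCharges entries are read as NA
telecom <- read.csv("telecom_churn.csv", na.strings = c("", " ", "NA"))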

sapply(telecom, class)
##       customerID           gender    SeniorCitizen          Partner 
##      "character"      "character"        "numeric"      "character" 
##       Dependents           tenure     PhoneService    MultipleLines 
##      "character"        "numeric"      "character"      "character" 
##  InternetService   OnlineSecurity     OnlineBackup DeviceProtection 
##      "character"      "character"      "character"      "character" 
##      TechSupport      StreamingTV  StreamingMovies         Contract 
##      "character"      "character"      "character"      "character" 
## PaperlessBilling    PaymentMethod   MonthlyCharges     TotalCharges 
##      "character"      "character"        "numeric"        "numeric" 
##  numAdminTickets   numTechTickets            Churn 
##        "numeric"        "numeric"      "character"
#very few explicit NA values
sapply(telecom, function(x) sum(is.na(x)))
##       customerID           gender    SeniorCitizen          Partner 
##                0                0                0                0 
##       Dependents           tenure     PhoneService    MultipleLines 
##                0                0                0                0 
##  InternetService   OnlineSecurity     OnlineBackup DeviceProtection 
##                0                0                0                0 
##      TechSupport      StreamingTV  StreamingMovies         Contract 
##                0                0                0                0 
## PaperlessBilling    PaymentMethod   MonthlyCharges     TotalCharges 
##                0                0                0               11 
##  numAdminTickets   numTechTickets            Churn 
##                0                0                0
#number of levels per category: all are reasonable
telecom |>
  select(!customerID) |>
  select_if(is.character) |>
  mutate_if(is.character,as.factor) |>
  summarise_all(~length(levels(.))) |>
  print(width=Inf)
## # A tibble: 1 × 16
##   gender Partner Dependents PhoneService MultipleLines InternetService
##    <int>   <int>      <int>        <int>         <int>           <int>
## 1      2       2          2            2             3               3
##   OnlineSecurity OnlineBackup DeviceProtection TechSupport StreamingTV
##            <int>        <int>            <int>       <int>       <int>
## 1              3            3                3           3           3
##   StreamingMovies Contract PaperlessBilling PaymentMethod Churn
##             <int>    <int>            <int>         <int> <int>
## 1               3        3                2             4     2

First mutation: convert the character columns to factors

telecom |>
  select(!customerID) |>
  mutate(across(where(is.character),as.factor)) -> telecom.f


length(levels(as.factor(c('one','two'))))
## [1] 2

Distribution of numeric fields

numAdminTickets is heavily skewed: very few accounts raise admin tickets. We should check the distribution of the response for numAdminTickets = 0 versus numAdminTickets > 0.

numTechTickets: conduct the same analysis as for numAdminTickets, since this variable is also heavily skewed (a quick sketch of the check follows).
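
Neither distribution is plotted in this write-up; a quick sketch of the kind of check described (both ticket counts are integers, so a binwidth of 1 makes the skew obvious):

telecom.f |>
  select(numAdminTickets, numTechTickets) |>
  pivot_longer(everything(), names_to = 'ticket_type', values_to = 'count') |>
  ggplot(aes(count)) +
  geom_histogram(binwidth = 1) +
  facet_wrap(~ticket_type, scales = 'free_y')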

Exploratory Data Analysis

telecom.f |>
  mutate(numAdminTickets.s = ifelse(numAdminTickets>0, 'greater', 'zero')) |>
  mutate(numTechTickets.f = ifelse(numTechTickets>0, 'greater','zero')) -> telecom.f

#there is little difference in the Churn split between numAdminTickets = 0 and numAdminTickets > 0; this field is a candidate for removal
telecom.f |>
  ggplot(aes(numAdminTickets.s, fill = Churn)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent)

#there is a significant difference in the Churn proportion for this variable... but is each bin equally populated?
telecom.f |>
  ggplot(aes(numTechTickets.f, fill = Churn)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent)

Data mutation: binning numTechTickets

telecom.f |>
  mutate(numTechTickets.bin = cut(numTechTickets, breaks = c(0,0.5,3,6,9), include.lowest=T)) -> telecom.f


#relabel the cut() intervals: [0,0.5] = zero tickets, then (0.5,3], (3,6], (6,9]
levels(telecom.f$numTechTickets.bin) <- c('0','0<x<=3','3<x<=6','6<x<=9')
telecom.f |>
  ggplot(aes(x=numTechTickets.bin, fill=Churn))+
  geom_bar(position="fill") +
  scale_y_continuous(labels = scales::percent) +
  labs(title = "Yes|No churn split on Number of Tech Tickets opened")

table(telecom.f$numTechTickets.bin)
## 
##      0 0<x<=3 3<x<=6 6<x<=9 
##   6073    608    321     41
#whittle down the data frame: drop the raw/temporary ticket fields and the rows with missing TotalCharges
telecom.f |>
  select(!c(numTechTickets,numTechTickets.f,numAdminTickets.s,numAdminTickets)) |>
  filter(!is.na(TotalCharges))-> telecom.f

Convert SeniorCitizen to categorical

telecom.f |>
  mutate(seniorCitizen.f = as.factor(SeniorCitizen)) |>
  select(-SeniorCitizen) -> telecom.f

sort(names(telecom.f))
##  [1] "Churn"              "Contract"           "Dependents"        
##  [4] "DeviceProtection"   "gender"             "InternetService"   
##  [7] "MonthlyCharges"     "MultipleLines"      "numTechTickets.bin"
## [10] "OnlineBackup"       "OnlineSecurity"     "PaperlessBilling"  
## [13] "Partner"            "PaymentMethod"      "PhoneService"      
## [16] "seniorCitizen.f"    "StreamingMovies"    "StreamingTV"       
## [19] "TechSupport"        "tenure"             "TotalCharges"

OUTLIER ANALYSIS: the IQR is the spread of the middle 50% of the data (the distance between the 25th and 75th percentiles)

Identify records that fall more than 1.5 × IQR below the 25th percentile or above the 75th percentile (the IQR is a single number per variable).
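
As a tiny worked example of the 1.5 × IQR fences (illustrative numbers, not from the dataset):

x <- c(1, 2, 3, 4, 100)
q1 <- quantile(x, .25)                  # 2
q3 <- quantile(x, .75)                  # 4
iqr <- stats::IQR(x)                    # 2
c(q1 - 1.5*iqr, q3 + 1.5*iqr)           # fences at -1 and 7
x[x < q1 - 1.5*iqr | x > q3 + 1.5*iqr]  # only 100 falls outside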

prop_outside <- function(vec){
  iqr <- stats::IQR(vec, na.rm = TRUE)
  prop_under = sum(vec < (quantile(vec, .25, na.rm = TRUE) - 1.5*iqr), na.rm = TRUE)/length(vec)
  prop_ovr   = sum(vec > (quantile(vec, .75, na.rm = TRUE) + 1.5*iqr), na.rm = TRUE)/length(vec)
  return(prop_under + prop_ovr)
}

#apply across all numeric variables and print the output
#this shows there are no 1.5*IQR outliers to worry about
telecom.f %>%
  select_if(is.numeric) %>%
  pivot_longer(cols=everything(), names_to='variables', values_to='value')  %>%
  group_by(variables) %>%
  summarize(proportion_outliers = prop_outside(value))
## # A tibble: 3 × 2
##   variables      proportion_outliers
##   <chr>                        <dbl>
## 1 MonthlyCharges                   0
## 2 TotalCharges                     0
## 3 tenure                           0

Corrplot of numeric variables

The numeric variables are strongly correlated with one another, which makes sense: TotalCharges accumulates MonthlyCharges over a customer's tenure.

telecom.f %>%
  keep(is.numeric) %>%
  cor() %>%
  corrplot(type = "upper")

Theory: tenure and TotalCharges are linearly related

Plot tenure against TotalCharges to verify that TotalCharges is essentially the cumulative charge over the customer's tenure. Note that tenure is discrete (whole months), not continuous.

telecom.f |>
  ggplot(aes(x=tenure,y=TotalCharges)) +
  geom_point() +
  labs(title="Linear relationship between tenure and TotalCharges")

#I'll select tenure, and drop TotalCharges
telecom.f |>
  select(-TotalCharges) -> telecom.f

END OF EDA

DATA PRE-PROCESSING for the Random Forest model

#check the balance of the dataset: the split is roughly 73/27; ROSE may help

Split of the response variable

table(telecom.f$Churn)/length(telecom.f$Churn)
## 
##       No      Yes 
## 0.734215 0.265785
#no preprocessing necessary for random forest
train_ix <- caret::createDataPartition(telecom.f$Churn, p=.8,list=FALSE,times=1)

telecom.f_train <- telecom.f[train_ix,]
telecom.f_test <- telecom.f[-train_ix,]

#creating a balanced dataset
#ROSE will oversample the minority class (Churn = Yes)
table(telecom.f_train$Churn) # target a resampled set of ~2x the "No" count, split roughly 50/50
## 
##   No  Yes 
## 4131 1496
no_cases <- nrow(telecom.f_train[telecom.f_train$Churn=="No",])
rose_train <- ROSE(Churn ~ ., data = telecom.f_train, N = no_cases*2, p = 0.5)$data
#verify class balance
table(rose_train$Churn)
## 
##   No  Yes 
## 4087 4175
rf_dflt <- randomForest(
  Churn ~ ., 
  data = telecom.f_train
  #ntree = 100, 
  #mtry = 6, 
  #na.action = na.roughfix 
)


rf_balanced <- randomForest(
  Churn ~ ., 
  data = rose_train 
  #ntree = 100, 
  #mtry = 6, 
  #na.action = na.roughfix 
)

#predicted class probabilities: a 2-column matrix (one column per class)
rf_dflt_prob <- predict(rf_dflt, telecom.f_test, type = "prob")
dim(rf_dflt_prob)
## [1] 1405    2
#a factor vector
rf_predict <- predict(rf_dflt, telecom.f_test, type="class")
rf_predict_bal <- predict(rf_balanced, telecom.f_test, type="class")

cm_rf_dflt <- confusionMatrix(rf_predict, telecom.f_test$Churn, positive = "Yes")
cm_rf_bal <- confusionMatrix(rf_predict_bal, telecom.f_test$Churn, positive = "Yes")

cm_rf_dflt$byClass
##          Sensitivity          Specificity       Pos Pred Value 
##            0.7184987            0.9273256            0.7813411 
##       Neg Pred Value            Precision               Recall 
##            0.9011299            0.7813411            0.7184987 
##                   F1           Prevalence       Detection Rate 
##            0.7486034            0.2654804            0.1907473 
## Detection Prevalence    Balanced Accuracy 
##            0.2441281            0.8229121
cm_rf_bal$byClass
##          Sensitivity          Specificity       Pos Pred Value 
##            0.8954424            0.8246124            0.6485437 
##       Neg Pred Value            Precision               Recall 
##            0.9561798            0.6485437            0.8954424 
##                   F1           Prevalence       Detection Rate 
##            0.7522523            0.2654804            0.2377224 
## Detection Prevalence    Balanced Accuracy 
##            0.3665480            0.8600274
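
The probability matrix rf_dflt_prob computed above is not used again; a hedged sketch of how it could drive an ROC/AUC comparison between the two forests (assumes the pROC package):

library(pROC)

rf_bal_prob <- predict(rf_balanced, telecom.f_test, type = "prob")

#roc() takes the true labels plus the predicted probability of the positive class
roc_dflt <- roc(telecom.f_test$Churn, rf_dflt_prob[, "Yes"], levels = c("No", "Yes"))
roc_bal  <- roc(telecom.f_test$Churn, rf_bal_prob[, "Yes"],  levels = c("No", "Yes"))
auc(roc_dflt)
auc(roc_bal)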

Variable importance

#type = 1 (mean decrease in accuracy) requires the forest to be grown with importance = TRUE,
#so only the Gini-based measure (type = 2) is extracted here
vim.mat.2 <- randomForest::importance(rf_balanced, type = 2)
vim.mat.2[order(vim.mat.2[, 1], decreasing=TRUE), ]
##             tenure numTechTickets.bin           Contract     MonthlyCharges 
##          673.86095          564.30363          462.91216          376.79826 
##      PaymentMethod     OnlineSecurity        TechSupport    InternetService 
##          206.81881          193.16346          175.19971          133.79516 
##       OnlineBackup   DeviceProtection        StreamingTV    StreamingMovies 
##          120.19357           96.94393           85.08181           80.47320 
##             gender      MultipleLines   PaperlessBilling            Partner 
##           78.57071           77.95508           68.47034           68.35103 
##         Dependents    seniorCitizen.f       PhoneService 
##           56.64703           52.43403           17.77561
dim(vim.mat.2)
## [1] 19  1
as.data.frame(vim.mat.2) -> df
df$vars <- rownames(df)

df |>
  arrange(desc(MeanDecreaseGini)) -> df 

df|>
  ggplot(aes(x=reorder(vars, MeanDecreaseGini),y=MeanDecreaseGini)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(title = "Variable Importance")

Fit a decision tree in order to identify some key thresholds

#keep the top-importance predictors (plus the response) as a reduced set for a possible simpler model;
#the tree below is still fit on the full rose_train
rose_train |>
  select(Churn, tenure, numTechTickets.bin, Contract, MonthlyCharges, PaymentMethod,
         OnlineSecurity, TechSupport, InternetService, OnlineBackup, DeviceProtection) -> rose.train.k

tree_mod <- 
  rpart::rpart(
    Churn ~ ., 
    method = "class",
    xval = 5,
    data = rose_train,
    #original dataset has no NAs
    #na.action = na.roughfix
    )

#model evaluation on the held-out test set
dt_predict_bal <- predict(tree_mod, telecom.f_test, type="class")
cm <- confusionMatrix(dt_predict_bal, telecom.f_test$Churn, positive = "Yes")
cm$byClass
##          Sensitivity          Specificity       Pos Pred Value 
##            0.8981233            0.7567829            0.5716724 
##       Neg Pred Value            Precision               Recall 
##            0.9536020            0.5716724            0.8981233 
##                   F1           Prevalence       Detection Rate 
##            0.6986444            0.2654804            0.2384342 
## Detection Prevalence    Balanced Accuracy 
##            0.4170819            0.8274531
rpart.plot::rpart.plot(tree_mod, main="Balanced Dataset Decision Tree Model")
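
Since the goal of this tree is to surface key thresholds, the split rules can also be printed as text rather than read off the plot (a sketch using rpart.plot's rule printer):

#one row per leaf: the conditions leading to it and the predicted class
rpart.plot::rpart.rules(tree_mod)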

NEURAL NETWORK

Manual preprocessing for keras

sapply(telecom.f,class)
##             gender            Partner         Dependents             tenure 
##           "factor"           "factor"           "factor"          "numeric" 
##       PhoneService      MultipleLines    InternetService     OnlineSecurity 
##           "factor"           "factor"           "factor"           "factor" 
##       OnlineBackup   DeviceProtection        TechSupport        StreamingTV 
##           "factor"           "factor"           "factor"           "factor" 
##    StreamingMovies           Contract   PaperlessBilling      PaymentMethod 
##           "factor"           "factor"           "factor"           "factor" 
##     MonthlyCharges              Churn numTechTickets.bin    seniorCitizen.f 
##          "numeric"           "factor"           "factor"           "factor"
test.dummy <- caret::dummyVars( ~. , telecom.f, sep='_',fullRank=TRUE) #with fullRank=TRUE each factor drops one reference level; Churn is encoded as Churn_Yes
test.dummy #provides a summary
## Dummy Variable Object
## 
## Formula: ~.
## 20 variables, 18 factors
## Variables and levels will be separated by '_'
## A full rank encoding is used
df <- predict(test.dummy,telecom.f)
df <- data.frame(df)
names(df)
##  [1] "gender_Male"                          
##  [2] "Partner_Yes"                          
##  [3] "Dependents_Yes"                       
##  [4] "tenure"                               
##  [5] "PhoneService_Yes"                     
##  [6] "MultipleLines_No.phone.service"       
##  [7] "MultipleLines_Yes"                    
##  [8] "InternetService_Fiber.optic"          
##  [9] "InternetService_No"                   
## [10] "OnlineSecurity_No.internet.service"   
## [11] "OnlineSecurity_Yes"                   
## [12] "OnlineBackup_No.internet.service"     
## [13] "OnlineBackup_Yes"                     
## [14] "DeviceProtection_No.internet.service" 
## [15] "DeviceProtection_Yes"                 
## [16] "TechSupport_No.internet.service"      
## [17] "TechSupport_Yes"                      
## [18] "StreamingTV_No.internet.service"      
## [19] "StreamingTV_Yes"                      
## [20] "StreamingMovies_No.internet.service"  
## [21] "StreamingMovies_Yes"                  
## [22] "Contract_One.year"                    
## [23] "Contract_Two.year"                    
## [24] "PaperlessBilling_Yes"                 
## [25] "PaymentMethod_Credit.card..automatic."
## [26] "PaymentMethod_Electronic.check"       
## [27] "PaymentMethod_Mailed.check"           
## [28] "MonthlyCharges"                       
## [29] "Churn_Yes"                            
## [30] "numTechTickets.bin_..3"               
## [31] "numTechTickets.bin_3.x..6"            
## [32] "numTechTickets.bin_6.x..9"            
## [33] "seniorCitizen.f_1"
df_ix <- caret::createDataPartition(df$Churn_Yes, p=.8,list=FALSE,times=1)

#scale/normalize the training and test sets separately (see the note below for an alternative)
df_train <- df[df_ix,]
df_test <- df[-df_ix,]

#identify the numeric columns to scale (tenure, MonthlyCharges)
num.cols <- names(telecom.f)[sapply(telecom.f,is.numeric)]

#scale numeric values separately
train.scale <- scale(df_train[num.cols])
test.scale <- scale(df_test[num.cols])


#check that scaling worked: the means should be ~0 and the variances ~1
as.data.frame(train.scale) |> 
  summarize(across(everything(),c(mean,var)))
##       tenure_1 tenure_2 MonthlyCharges_1 MonthlyCharges_2
## 1 3.637608e-17        1    -1.868858e-16                1
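
A note on this design choice: the numeric columns are standardized independently in the training and test sets above. An alternative, not used in this analysis, is to reuse the training-set means and standard deviations on the test set, so test data are transformed exactly as new data would be at prediction time. A sketch:

#scale() stores the centering/scaling values as attributes of its result
train_center <- attr(train.scale, "scaled:center")
train_sd <- attr(train.scale, "scaled:scale")
test.scale.alt <- scale(df_test[num.cols], center = train_center, scale = train_sd)
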
#convert the response to a one-hot (binary) matrix; Churn_Yes is dropped from the predictor sets below
y_train <- to_categorical(df_train$Churn_Yes)
y_test <- to_categorical(df_test$Churn_Yes)

df_train |> 
  select(-any_of(c(num.cols,'Churn_Yes'))) |>
  cbind(train.scale) -> df_train.fin

df_test |> 
  select(-any_of(c(num.cols,'Churn_Yes'))) |>
  cbind(test.scale) -> df_test.fin

#take a look at the final results: all the columns of the data.frame
sort(names(df_train.fin))
##  [1] "Contract_One.year"                    
##  [2] "Contract_Two.year"                    
##  [3] "Dependents_Yes"                       
##  [4] "DeviceProtection_No.internet.service" 
##  [5] "DeviceProtection_Yes"                 
##  [6] "gender_Male"                          
##  [7] "InternetService_Fiber.optic"          
##  [8] "InternetService_No"                   
##  [9] "MonthlyCharges"                       
## [10] "MultipleLines_No.phone.service"       
## [11] "MultipleLines_Yes"                    
## [12] "numTechTickets.bin_..3"               
## [13] "numTechTickets.bin_3.x..6"            
## [14] "numTechTickets.bin_6.x..9"            
## [15] "OnlineBackup_No.internet.service"     
## [16] "OnlineBackup_Yes"                     
## [17] "OnlineSecurity_No.internet.service"   
## [18] "OnlineSecurity_Yes"                   
## [19] "PaperlessBilling_Yes"                 
## [20] "Partner_Yes"                          
## [21] "PaymentMethod_Credit.card..automatic."
## [22] "PaymentMethod_Electronic.check"       
## [23] "PaymentMethod_Mailed.check"           
## [24] "PhoneService_Yes"                     
## [25] "seniorCitizen.f_1"                    
## [26] "StreamingMovies_No.internet.service"  
## [27] "StreamingMovies_Yes"                  
## [28] "StreamingTV_No.internet.service"      
## [29] "StreamingTV_Yes"                      
## [30] "TechSupport_No.internet.service"      
## [31] "TechSupport_Yes"                      
## [32] "tenure"

Build the neural net model: a sequential, fully connected network with multiple hidden layers, trained by backpropagation. The learning rate follows an exponential decay schedule to keep training stable and help avoid overfitting. The loss is binary cross-entropy, which is appropriate for binary classification problems.

mat.train <- as.matrix(df_train.fin)
mat.test <- as.matrix(df_test.fin)


#build model architecture
model <- keras_model_sequential() 

model %>% 
  layer_dense(units = 256, activation = 'relu', input_shape = c(ncol(mat.train)), name ="input_layer") %>% 
  layer_dropout(rate = 0.4) %>% 
  layer_dense(units = 128, activation = 'relu', name = 'first_hidden') %>%
  layer_dropout(rate = 0.3) %>%
  layer_dense(units = 64, activation = 'relu', name = 'second_hidden') %>%
  layer_dropout(rate = 0.2) %>%
  layer_dense(units = 2, activation = 'sigmoid', name = 'output_layer') #this activation function can be changed (e.g. to softmax)

Compile and fit: adjusting the learning-rate decay
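
The compile and fit calls were not echoed here; a hedged sketch of what they could look like, given the description above (exponential learning-rate decay, binary cross-entropy) and the epoch logs below (10 epochs, roughly 90 batches per epoch, a validation split). The initial learning rate, decay parameters, batch size and split are assumptions:

#exponential decay of the learning rate, as described above
lr_schedule <- learning_rate_schedule_exponential_decay(
  initial_learning_rate = 0.001,
  decay_steps = 90,   #roughly one epoch's worth of batches
  decay_rate = 0.96
)

model %>% compile(
  loss = 'binary_crossentropy',
  optimizer = optimizer_adam(learning_rate = lr_schedule),
  metrics = 'accuracy'
)

history <- model %>% fit(
  mat.train, y_train,
  epochs = 10,
  batch_size = 50,
  validation_split = 0.2,
  verbose = 2
)

Plotting the captured history (plot(history)) shows the per-epoch loss and accuracy curves.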

## Epoch 1/10
## 90/90 - 1s - loss: 0.4155 - accuracy: 0.8062 - val_loss: 0.3503 - val_accuracy: 0.8526 - 662ms/epoch - 7ms/step
## Epoch 2/10
## 90/90 - 0s - loss: 0.3578 - accuracy: 0.8287 - val_loss: 0.3173 - val_accuracy: 0.8597 - 132ms/epoch - 1ms/step
## Epoch 3/10
## 90/90 - 0s - loss: 0.3412 - accuracy: 0.8407 - val_loss: 0.3299 - val_accuracy: 0.8570 - 131ms/epoch - 1ms/step
## Epoch 4/10
## 90/90 - 0s - loss: 0.3375 - accuracy: 0.8442 - val_loss: 0.3271 - val_accuracy: 0.8632 - 132ms/epoch - 1ms/step
## Epoch 5/10
## 90/90 - 0s - loss: 0.3277 - accuracy: 0.8451 - val_loss: 0.3104 - val_accuracy: 0.8597 - 124ms/epoch - 1ms/step
## Epoch 6/10
## 90/90 - 0s - loss: 0.3325 - accuracy: 0.8431 - val_loss: 0.3095 - val_accuracy: 0.8641 - 118ms/epoch - 1ms/step
## Epoch 7/10
## 90/90 - 0s - loss: 0.3304 - accuracy: 0.8462 - val_loss: 0.3079 - val_accuracy: 0.8641 - 115ms/epoch - 1ms/step
## Epoch 8/10
## 90/90 - 0s - loss: 0.3286 - accuracy: 0.8502 - val_loss: 0.3123 - val_accuracy: 0.8659 - 127ms/epoch - 1ms/step
## Epoch 9/10
## 90/90 - 0s - loss: 0.3218 - accuracy: 0.8491 - val_loss: 0.3057 - val_accuracy: 0.8632 - 114ms/epoch - 1ms/step
## Epoch 10/10
## 90/90 - 0s - loss: 0.3288 - accuracy: 0.8502 - val_loss: 0.3282 - val_accuracy: 0.8641 - 133ms/epoch - 1ms/step

Evaluate the loss and accuracy on the held-out test set

test_results <- model %>% evaluate(mat.test, y_test)
## 44/44 - 0s - loss: 0.3169 - accuracy: 0.8642 - 39ms/epoch - 893us/step
print(test_results)
##      loss  accuracy 
## 0.3169046 0.8641536