Introduction

In this report we aim to highlight customer churn in relation to customer tenure and total monthly charge. we will be using ‘status’ as a response variable and ‘Tenure’ and ‘TotalCharge’ as predictor variables

Data Preparation

library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom        1.0.6     ✔ recipes      1.1.0
## ✔ dials        1.3.0     ✔ rsample      1.2.1
## ✔ dplyr        1.1.4     ✔ tibble       3.2.1
## ✔ ggplot2      3.5.1     ✔ tidyr        1.3.1
## ✔ infer        1.0.7     ✔ tune         1.2.1
## ✔ modeldata    1.4.0     ✔ workflows    1.1.4
## ✔ parsnip      1.2.1     ✔ workflowsets 1.1.0
## ✔ purrr        1.0.2     ✔ yardstick    1.3.1
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## ✖ recipes::step()  masks stats::step()
## • Dig deeper into tidy modeling with R at https://www.tmwr.org
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ lubridate 1.9.3     ✔ stringr   1.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ readr::col_factor() masks scales::col_factor()
## ✖ purrr::discard()    masks scales::discard()
## ✖ dplyr::filter()     masks stats::filter()
## ✖ stringr::fixed()    masks recipes::fixed()
## ✖ dplyr::lag()        masks stats::lag()
## ✖ readr::spec()       masks yardstick::spec()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(baguette)
## Warning: package 'baguette' was built under R version 4.4.2
library(vip)
## 
## Attaching package: 'vip'
## 
## The following object is masked from 'package:utils':
## 
##     vi
library(pdp)
## Warning: package 'pdp' was built under R version 4.4.2
## 
## Attaching package: 'pdp'
## 
## The following object is masked from 'package:purrr':
## 
##     partial
library(here)
## here() starts at C:/Users/ethan/OneDrive - University of Cincinnati/Documents/BANA 4080/Final Project/BANA4080 Group Final
library(dplyr)
library(ggplot2)
library(yardstick)
library(earth)
## Warning: package 'earth' was built under R version 4.4.2
## Loading required package: Formula
## Loading required package: plotmo
## Warning: package 'plotmo' was built under R version 4.4.2
## Loading required package: plotrix
## 
## Attaching package: 'plotrix'
## 
## The following object is masked from 'package:scales':
## 
##     rescale
library(caret)
## Warning: package 'caret' was built under R version 4.4.2
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following objects are masked from 'package:yardstick':
## 
##     precision, recall, sensitivity, specificity
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(pROC)
## Warning: package 'pROC' was built under R version 4.4.2
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## 
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

Importing Data

#Import data file

retention <- read.csv("customer_retention.csv")

# Convert response variable to factor
retention <- retention %>%
  mutate(Status = as.factor(Status))

# Remove missing values
retention <- na.omit(retention)

glimpse(retention)
## Rows: 6,988
## Columns: 20
## $ Gender           <chr> "Female", "Male", "Male", "Male", "Female", "Female",…
## $ SeniorCitizen    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ Partner          <chr> "Yes", "No", "No", "No", "No", "No", "No", "No", "Yes…
## $ Dependents       <chr> "No", "No", "No", "No", "No", "No", "Yes", "No", "No"…
## $ Tenure           <int> 1, 34, 2, 45, 2, 8, 22, 10, 28, 62, 13, 16, 58, 49, 2…
## $ PhoneService     <chr> "No", "Yes", "Yes", "No", "Yes", "Yes", "Yes", "No", …
## $ MultipleLines    <chr> "No phone service", "No", "No", "No phone service", "…
## $ InternetService  <chr> "DSL", "DSL", "DSL", "DSL", "Fiber optic", "Fiber opt…
## $ OnlineSecurity   <chr> "No", "Yes", "Yes", "Yes", "No", "No", "No", "Yes", "…
## $ OnlineBackup     <chr> "Yes", "No", "Yes", "No", "No", "No", "Yes", "No", "N…
## $ DeviceProtection <chr> "No", "Yes", "No", "Yes", "No", "Yes", "No", "No", "Y…
## $ TechSupport      <chr> "No", "No", "No", "Yes", "No", "No", "No", "No", "Yes…
## $ StreamingTV      <chr> "No", "No", "No", "No", "No", "Yes", "Yes", "No", "Ye…
## $ StreamingMovies  <chr> "No", "No", "No", "No", "No", "Yes", "No", "No", "Yes…
## $ Contract         <chr> "Month-to-month", "One year", "Month-to-month", "One …
## $ PaperlessBilling <chr> "Yes", "No", "Yes", "No", "Yes", "Yes", "Yes", "No", …
## $ PaymentMethod    <chr> "Electronic check", "Mailed check", "Mailed check", "…
## $ MonthlyCharges   <dbl> 29.85, 56.95, 53.85, 42.30, 70.70, 99.65, 89.10, 29.7…
## $ TotalCharges     <dbl> 29.85, 1889.50, 108.15, 1840.75, 151.65, 820.50, 1949…
## $ Status           <fct> Current, Current, Left, Current, Left, Left, Current,…

Data Preprocessing

set.seed(123)
split <- initial_split(retention, prop = 0.7, strata = Status)
train <- training(split)
test <- testing(split)

#Set predictor variables

train <- select(train, Tenure, TotalCharges, Status)
test <- select(test, Tenure, TotalCharges, Status)
glimpse(train)
## Rows: 4,891
## Columns: 3
## $ Tenure       <int> 1, 34, 45, 10, 13, 25, 69, 52, 71, 21, 49, 30, 72, 71, 2,…
## $ TotalCharges <dbl> 29.85, 1889.50, 1840.75, 301.90, 587.45, 2686.05, 7895.15…
## $ Status       <fct> Current, Current, Current, Current, Current, Current, Cur…
glimpse(test)
## Rows: 2,097
## Columns: 3
## $ Tenure       <int> 22, 28, 62, 16, 58, 12, 58, 1, 1, 5, 70, 17, 2, 2, 52, 60…
## $ TotalCharges <dbl> 1949.40, 3046.05, 3487.95, 326.80, 5681.10, 202.25, 3505.…
## $ Status       <fct> Current, Left, Current, Current, Current, Current, Curren…

Exploratory Data Analysis (EDA)

# Distribution of Tenure by Status
ggplot(train, aes(x = Tenure, fill = Status)) +
  geom_histogram(bins = 30, position = "dodge") +
  labs(title = "Distribution of Tenure by Status", x = "Tenure (months)", y = "# of Customer Accounts")

# Distribution of Tenure by Status
ggplot(train, aes(x = TotalCharges, fill = Status)) +
  geom_histogram(bins = 30, position = "dodge") +
  labs(title = "Distribution of TotalCharges by Status", x = "Total Charges (USD)", y = "# of Customer Accounts")

# Check Correlation
correlation <- train %>%
  mutate(Status = as.numeric(Status) - 1) %>%
  select(-Status) %>%
  cor()

correlation
##                Tenure TotalCharges
## Tenure       1.000000     0.825092
## TotalCharges 0.825092     1.000000

Logististic Model

# Set seed for reproducibility
set.seed(123)
logistic_split <- initial_split(retention, prop = 0.7, strata = Status)
logistic_train <- training(logistic_split)
logistic_test <- testing(logistic_split)

# Create 5-fold cross-validation folds
set.seed(123)
logistic_kfolds <- vfold_cv(logistic_train, v = 5, strata = Status)

# Fit model
logistic_results <- logistic_reg() %>%
  set_engine("glm") %>%
  set_mode("classification") %>%
  fit_resamples(Status ~ Tenure + TotalCharges, logistic_kfolds)

# Collect metrics
logistic_metrics <- logistic_results %>%
  collect_metrics()

logistic_metrics
## # A tibble: 3 × 6
##   .metric     .estimator  mean     n std_err .config             
##   <chr>       <chr>      <dbl> <int>   <dbl> <chr>               
## 1 accuracy    binary     0.770     5 0.00600 Preprocessor1_Model1
## 2 brier_class binary     0.158     5 0.00132 Preprocessor1_Model1
## 3 roc_auc     binary     0.787     5 0.00517 Preprocessor1_Model1

Bagged Tree

set.seed(123)
bagged_split <- initial_split(retention, prop = 0.7, strata = Status)
bagged_train <- training(bagged_split)
bagged_test <- testing(bagged_split)

# Create 5-fold cross-validation folds
set.seed(123)
bagged_kfolds <- vfold_cv(bagged_train, v = 5, strata = Status)

# Fit model
bagged_results <- bag_tree() %>%
  set_engine("rpart") %>%
  set_mode("classification") %>%
  fit_resamples(Status ~ Tenure + TotalCharges, bagged_kfolds)

# Collect metrics
bagged_metrics <- bagged_results %>%
  collect_metrics()

bagged_metrics
## # A tibble: 3 × 6
##   .metric     .estimator  mean     n  std_err .config             
##   <chr>       <chr>      <dbl> <int>    <dbl> <chr>               
## 1 accuracy    binary     0.736     5 0.00119  Preprocessor1_Model1
## 2 brier_class binary     0.195     5 0.000842 Preprocessor1_Model1
## 3 roc_auc     binary     0.730     5 0.00272  Preprocessor1_Model1

MARS Model

# Train Model

set.seed(123)
mars <- train(
  Status ~ Tenure + TotalCharges,
  data = train,
  method = "earth",
  metric = "ROC",
  trControl =trainControl(
    method = "cv",
    number =5,
    classProbs = TRUE,
    summaryFunction = twoClassSummary
  )
  
)


# Print summary

print(mars)
## Multivariate Adaptive Regression Spline 
## 
## 4891 samples
##    2 predictor
##    2 classes: 'Current', 'Left' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 3913, 3914, 3912, 3913, 3912 
## Resampling results across tuning parameters:
## 
##   nprune  ROC        Sens       Spec     
##   2       0.6940389  0.9649513  0.1246154
##   4       0.7808040  0.9039543  0.4033739
##   7       0.8077495  0.8892016  0.4841907
## 
## Tuning parameter 'degree' was held constant at a value of 1
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were nprune = 7 and degree = 1.
# model

# Data Splitting
set.seed(123)
split <- initial_split(retention, prop = 0.7, strata = Status)
train <- training(split)
test <- testing(split)

# Cross-Validation Folds
set.seed(123)
mars_kfolds <- vfold_cv(train, v = 5, strata = Status)

# Define MARS Model 
mars_mod <- mars(num_terms = tune(), prod_degree = tune()) %>%
  set_engine("earth") %>%
  set_mode("classification")

# Create Workflow
mars_wf <- workflow() %>%
  add_model(mars_mod) %>%
  add_formula(Status ~ Tenure + TotalCharges)

# Define Grid for Tuning
mars_grid <- grid_regular(
  num_terms(range = c(2, 20)),
  prod_degree(range = c(1, 2)),
  levels = 5
)

# Perform Tuning
mars_tuned <- mars_wf %>%
  tune_grid(
    resamples = mars_kfolds,
    grid = mars_grid
  )

# Collect Metrics
mars_metrics <- mars_tuned %>%
  collect_metrics()

mars_metrics
## # A tibble: 30 × 8
##    num_terms prod_degree .metric     .estimator  mean     n std_err .config     
##        <int>       <int> <chr>       <chr>      <dbl> <int>   <dbl> <chr>       
##  1         2           1 accuracy    binary     0.738     5 0.00211 Preprocesso…
##  2         2           1 brier_class binary     0.175     5 0.00225 Preprocesso…
##  3         2           1 roc_auc     binary     0.696     5 0.0231  Preprocesso…
##  4         6           1 accuracy    binary     0.778     5 0.00309 Preprocesso…
##  5         6           1 brier_class binary     0.151     5 0.00184 Preprocesso…
##  6         6           1 roc_auc     binary     0.805     5 0.00635 Preprocesso…
##  7        11           1 accuracy    binary     0.779     5 0.00324 Preprocesso…
##  8        11           1 brier_class binary     0.150     5 0.00173 Preprocesso…
##  9        11           1 roc_auc     binary     0.808     5 0.00574 Preprocesso…
## 10        15           1 accuracy    binary     0.779     5 0.00324 Preprocesso…
## # ℹ 20 more rows

Evaluate Logistic Regression

# Refit Logistic Regression directly on the full training set
logistic_model <- logistic_reg() %>%
  set_engine("glm") %>%
  set_mode("classification") %>%
  fit(Status ~ Tenure + TotalCharges, data = train)

# Make predictions on the test set
logistic_predictions <- logistic_model %>%
  predict(new_data = test, type = "prob")

# Evaluate using ROC-AUC
logistic_roc <- roc_auc(
  data = bind_cols(test, logistic_predictions),
  truth = Status,
  .pred_Left
)

logistic_roc
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 roc_auc binary         0.214

Evaluate Bagged Tree

# Define and Fit Model

set.seed(123)
bagged_model <- bag_tree() %>%
  set_engine("rpart") %>%
  set_mode("classification") %>%
  fit(Status ~ Tenure + TotalCharges, data = train)

# Make Predictions on the Test Set
bagged_predictions <- bagged_model %>%
  predict(new_data = test, type = "prob")

# Evaluate Bagged Tree Model Using ROC-AUC
bagged_roc <- roc_auc(
  data = bind_cols(test, bagged_predictions),
  truth = Status,
  .pred_Left
)

bagged_roc
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 roc_auc binary         0.298

Evaluate MARS

# Define and Fit the MARS Model

set.seed(123)
mars_model <- mars(num_terms = 10, prod_degree = 1) %>%  # Fixed parameters
  set_engine("earth") %>%
  set_mode("classification") %>%
  fit(Status ~ Tenure + TotalCharges, data = train)

# Make Predictions on the Test Set
mars_predictions <- mars_model %>%
  predict(new_data = test, type = "prob")

# Evaluate MARS Model Using ROC-AUC
mars_roc <- roc_auc(
  data = bind_cols(test, mars_predictions),
  truth = Status,
  .pred_Left
)

mars_roc
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 roc_auc binary         0.193
# Generate partial dependence data for Tenure
partial_tenure <- partial(mars_model, pred.var = "Tenure", train = train)

# Plot the partial dependence for Tenure
autoplot(partial_tenure) +
  labs(title = "Partial Dependence for Tenure", x = "Tenure (Months)", y = "Predicted Probability")

# Generate partial dependence data for TotalCharges
partial_totalcharges <- partial(mars_model, pred.var = "TotalCharges", train = train)

# Plot the partial dependence for TotalCharges
autoplot(partial_totalcharges) +
  labs(title = "Partial Dependence for Total Charges", x = "Total Charges (USD)", y = "Predicted Probability")