In this report we aim to highlight customer churn in relation to customer tenure and total monthly charge. we will be using ‘status’ as a response variable and ‘Tenure’ and ‘TotalCharge’ as predictor variables
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom 1.0.6 ✔ recipes 1.1.0
## ✔ dials 1.3.0 ✔ rsample 1.2.1
## ✔ dplyr 1.1.4 ✔ tibble 3.2.1
## ✔ ggplot2 3.5.1 ✔ tidyr 1.3.1
## ✔ infer 1.0.7 ✔ tune 1.2.1
## ✔ modeldata 1.4.0 ✔ workflows 1.1.4
## ✔ parsnip 1.2.1 ✔ workflowsets 1.1.0
## ✔ purrr 1.0.2 ✔ yardstick 1.3.1
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ recipes::step() masks stats::step()
## • Dig deeper into tidy modeling with R at https://www.tmwr.org
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ lubridate 1.9.3 ✔ stringr 1.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ readr::col_factor() masks scales::col_factor()
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ stringr::fixed() masks recipes::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ readr::spec() masks yardstick::spec()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(baguette)
## Warning: package 'baguette' was built under R version 4.4.2
library(vip)
##
## Attaching package: 'vip'
##
## The following object is masked from 'package:utils':
##
## vi
library(pdp)
## Warning: package 'pdp' was built under R version 4.4.2
##
## Attaching package: 'pdp'
##
## The following object is masked from 'package:purrr':
##
## partial
library(here)
## here() starts at C:/Users/ethan/OneDrive - University of Cincinnati/Documents/BANA 4080/Final Project/BANA4080 Group Final
library(dplyr)
library(ggplot2)
library(yardstick)
library(earth)
## Warning: package 'earth' was built under R version 4.4.2
## Loading required package: Formula
## Loading required package: plotmo
## Warning: package 'plotmo' was built under R version 4.4.2
## Loading required package: plotrix
##
## Attaching package: 'plotrix'
##
## The following object is masked from 'package:scales':
##
## rescale
library(caret)
## Warning: package 'caret' was built under R version 4.4.2
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following objects are masked from 'package:yardstick':
##
## precision, recall, sensitivity, specificity
##
## The following object is masked from 'package:purrr':
##
## lift
library(pROC)
## Warning: package 'pROC' was built under R version 4.4.2
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
##
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
#Import data file
retention <- read.csv("customer_retention.csv")
# Convert response variable to factor
retention <- retention %>%
mutate(Status = as.factor(Status))
# Remove missing values
retention <- na.omit(retention)
glimpse(retention)
## Rows: 6,988
## Columns: 20
## $ Gender <chr> "Female", "Male", "Male", "Male", "Female", "Female",…
## $ SeniorCitizen <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ Partner <chr> "Yes", "No", "No", "No", "No", "No", "No", "No", "Yes…
## $ Dependents <chr> "No", "No", "No", "No", "No", "No", "Yes", "No", "No"…
## $ Tenure <int> 1, 34, 2, 45, 2, 8, 22, 10, 28, 62, 13, 16, 58, 49, 2…
## $ PhoneService <chr> "No", "Yes", "Yes", "No", "Yes", "Yes", "Yes", "No", …
## $ MultipleLines <chr> "No phone service", "No", "No", "No phone service", "…
## $ InternetService <chr> "DSL", "DSL", "DSL", "DSL", "Fiber optic", "Fiber opt…
## $ OnlineSecurity <chr> "No", "Yes", "Yes", "Yes", "No", "No", "No", "Yes", "…
## $ OnlineBackup <chr> "Yes", "No", "Yes", "No", "No", "No", "Yes", "No", "N…
## $ DeviceProtection <chr> "No", "Yes", "No", "Yes", "No", "Yes", "No", "No", "Y…
## $ TechSupport <chr> "No", "No", "No", "Yes", "No", "No", "No", "No", "Yes…
## $ StreamingTV <chr> "No", "No", "No", "No", "No", "Yes", "Yes", "No", "Ye…
## $ StreamingMovies <chr> "No", "No", "No", "No", "No", "Yes", "No", "No", "Yes…
## $ Contract <chr> "Month-to-month", "One year", "Month-to-month", "One …
## $ PaperlessBilling <chr> "Yes", "No", "Yes", "No", "Yes", "Yes", "Yes", "No", …
## $ PaymentMethod <chr> "Electronic check", "Mailed check", "Mailed check", "…
## $ MonthlyCharges <dbl> 29.85, 56.95, 53.85, 42.30, 70.70, 99.65, 89.10, 29.7…
## $ TotalCharges <dbl> 29.85, 1889.50, 108.15, 1840.75, 151.65, 820.50, 1949…
## $ Status <fct> Current, Current, Left, Current, Left, Left, Current,…
set.seed(123)
split <- initial_split(retention, prop = 0.7, strata = Status)
train <- training(split)
test <- testing(split)
#Set predictor variables
train <- select(train, Tenure, TotalCharges, Status)
test <- select(test, Tenure, TotalCharges, Status)
glimpse(train)
## Rows: 4,891
## Columns: 3
## $ Tenure <int> 1, 34, 45, 10, 13, 25, 69, 52, 71, 21, 49, 30, 72, 71, 2,…
## $ TotalCharges <dbl> 29.85, 1889.50, 1840.75, 301.90, 587.45, 2686.05, 7895.15…
## $ Status <fct> Current, Current, Current, Current, Current, Current, Cur…
glimpse(test)
## Rows: 2,097
## Columns: 3
## $ Tenure <int> 22, 28, 62, 16, 58, 12, 58, 1, 1, 5, 70, 17, 2, 2, 52, 60…
## $ TotalCharges <dbl> 1949.40, 3046.05, 3487.95, 326.80, 5681.10, 202.25, 3505.…
## $ Status <fct> Current, Left, Current, Current, Current, Current, Curren…
# Distribution of Tenure by Status
ggplot(train, aes(x = Tenure, fill = Status)) +
geom_histogram(bins = 30, position = "dodge") +
labs(title = "Distribution of Tenure by Status", x = "Tenure (months)", y = "# of Customer Accounts")
# Distribution of Tenure by Status
ggplot(train, aes(x = TotalCharges, fill = Status)) +
geom_histogram(bins = 30, position = "dodge") +
labs(title = "Distribution of TotalCharges by Status", x = "Total Charges (USD)", y = "# of Customer Accounts")
# Check Correlation
correlation <- train %>%
mutate(Status = as.numeric(Status) - 1) %>%
select(-Status) %>%
cor()
correlation
## Tenure TotalCharges
## Tenure 1.000000 0.825092
## TotalCharges 0.825092 1.000000
# Set seed for reproducibility
set.seed(123)
logistic_split <- initial_split(retention, prop = 0.7, strata = Status)
logistic_train <- training(logistic_split)
logistic_test <- testing(logistic_split)
# Create 5-fold cross-validation folds
set.seed(123)
logistic_kfolds <- vfold_cv(logistic_train, v = 5, strata = Status)
# Fit model
logistic_results <- logistic_reg() %>%
set_engine("glm") %>%
set_mode("classification") %>%
fit_resamples(Status ~ Tenure + TotalCharges, logistic_kfolds)
# Collect metrics
logistic_metrics <- logistic_results %>%
collect_metrics()
logistic_metrics
## # A tibble: 3 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 accuracy binary 0.770 5 0.00600 Preprocessor1_Model1
## 2 brier_class binary 0.158 5 0.00132 Preprocessor1_Model1
## 3 roc_auc binary 0.787 5 0.00517 Preprocessor1_Model1
set.seed(123)
bagged_split <- initial_split(retention, prop = 0.7, strata = Status)
bagged_train <- training(bagged_split)
bagged_test <- testing(bagged_split)
# Create 5-fold cross-validation folds
set.seed(123)
bagged_kfolds <- vfold_cv(bagged_train, v = 5, strata = Status)
# Fit model
bagged_results <- bag_tree() %>%
set_engine("rpart") %>%
set_mode("classification") %>%
fit_resamples(Status ~ Tenure + TotalCharges, bagged_kfolds)
# Collect metrics
bagged_metrics <- bagged_results %>%
collect_metrics()
bagged_metrics
## # A tibble: 3 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 accuracy binary 0.736 5 0.00119 Preprocessor1_Model1
## 2 brier_class binary 0.195 5 0.000842 Preprocessor1_Model1
## 3 roc_auc binary 0.730 5 0.00272 Preprocessor1_Model1
# Train Model
set.seed(123)
mars <- train(
Status ~ Tenure + TotalCharges,
data = train,
method = "earth",
metric = "ROC",
trControl =trainControl(
method = "cv",
number =5,
classProbs = TRUE,
summaryFunction = twoClassSummary
)
)
# Print summary
print(mars)
## Multivariate Adaptive Regression Spline
##
## 4891 samples
## 2 predictor
## 2 classes: 'Current', 'Left'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 3913, 3914, 3912, 3913, 3912
## Resampling results across tuning parameters:
##
## nprune ROC Sens Spec
## 2 0.6940389 0.9649513 0.1246154
## 4 0.7808040 0.9039543 0.4033739
## 7 0.8077495 0.8892016 0.4841907
##
## Tuning parameter 'degree' was held constant at a value of 1
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were nprune = 7 and degree = 1.
# model
# Data Splitting
set.seed(123)
split <- initial_split(retention, prop = 0.7, strata = Status)
train <- training(split)
test <- testing(split)
# Cross-Validation Folds
set.seed(123)
mars_kfolds <- vfold_cv(train, v = 5, strata = Status)
# Define MARS Model
mars_mod <- mars(num_terms = tune(), prod_degree = tune()) %>%
set_engine("earth") %>%
set_mode("classification")
# Create Workflow
mars_wf <- workflow() %>%
add_model(mars_mod) %>%
add_formula(Status ~ Tenure + TotalCharges)
# Define Grid for Tuning
mars_grid <- grid_regular(
num_terms(range = c(2, 20)),
prod_degree(range = c(1, 2)),
levels = 5
)
# Perform Tuning
mars_tuned <- mars_wf %>%
tune_grid(
resamples = mars_kfolds,
grid = mars_grid
)
# Collect Metrics
mars_metrics <- mars_tuned %>%
collect_metrics()
mars_metrics
## # A tibble: 30 × 8
## num_terms prod_degree .metric .estimator mean n std_err .config
## <int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 2 1 accuracy binary 0.738 5 0.00211 Preprocesso…
## 2 2 1 brier_class binary 0.175 5 0.00225 Preprocesso…
## 3 2 1 roc_auc binary 0.696 5 0.0231 Preprocesso…
## 4 6 1 accuracy binary 0.778 5 0.00309 Preprocesso…
## 5 6 1 brier_class binary 0.151 5 0.00184 Preprocesso…
## 6 6 1 roc_auc binary 0.805 5 0.00635 Preprocesso…
## 7 11 1 accuracy binary 0.779 5 0.00324 Preprocesso…
## 8 11 1 brier_class binary 0.150 5 0.00173 Preprocesso…
## 9 11 1 roc_auc binary 0.808 5 0.00574 Preprocesso…
## 10 15 1 accuracy binary 0.779 5 0.00324 Preprocesso…
## # ℹ 20 more rows
# Refit Logistic Regression directly on the full training set
logistic_model <- logistic_reg() %>%
set_engine("glm") %>%
set_mode("classification") %>%
fit(Status ~ Tenure + TotalCharges, data = train)
# Make predictions on the test set
logistic_predictions <- logistic_model %>%
predict(new_data = test, type = "prob")
# Evaluate using ROC-AUC
logistic_roc <- roc_auc(
data = bind_cols(test, logistic_predictions),
truth = Status,
.pred_Left
)
logistic_roc
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 roc_auc binary 0.214
# Define and Fit Model
set.seed(123)
bagged_model <- bag_tree() %>%
set_engine("rpart") %>%
set_mode("classification") %>%
fit(Status ~ Tenure + TotalCharges, data = train)
# Make Predictions on the Test Set
bagged_predictions <- bagged_model %>%
predict(new_data = test, type = "prob")
# Evaluate Bagged Tree Model Using ROC-AUC
bagged_roc <- roc_auc(
data = bind_cols(test, bagged_predictions),
truth = Status,
.pred_Left
)
bagged_roc
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 roc_auc binary 0.298
# Define and Fit the MARS Model
set.seed(123)
mars_model <- mars(num_terms = 10, prod_degree = 1) %>% # Fixed parameters
set_engine("earth") %>%
set_mode("classification") %>%
fit(Status ~ Tenure + TotalCharges, data = train)
# Make Predictions on the Test Set
mars_predictions <- mars_model %>%
predict(new_data = test, type = "prob")
# Evaluate MARS Model Using ROC-AUC
mars_roc <- roc_auc(
data = bind_cols(test, mars_predictions),
truth = Status,
.pred_Left
)
mars_roc
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 roc_auc binary 0.193
# Generate partial dependence data for Tenure
partial_tenure <- partial(mars_model, pred.var = "Tenure", train = train)
# Plot the partial dependence for Tenure
autoplot(partial_tenure) +
labs(title = "Partial Dependence for Tenure", x = "Tenure (Months)", y = "Predicted Probability")
# Generate partial dependence data for TotalCharges
partial_totalcharges <- partial(mars_model, pred.var = "TotalCharges", train = train)
# Plot the partial dependence for TotalCharges
autoplot(partial_totalcharges) +
labs(title = "Partial Dependence for Total Charges", x = "Total Charges (USD)", y = "Predicted Probability")