Introduction

This document presents an exploratory data analysis of a customer retention data set for Regork. The goal is to identify and use trends and patterns to develop a solution to improve customer retention.

Data Preparation

First, we load the necessary libraries and the dataset.

library(fastDummies)
library(ggplot2)
library(dplyr)
library(tidyr)

data <- read.csv("customer_retention.csv")

Data Exploration

# Load necessary library
library(dplyr)

# Check for missing values in each column
missing_values <- sapply(data, function(x) sum(is.na(x)))
print(missing_values)
print(table(data$Status))

# Remove rows with any missing value
clean_data <- na.omit(data)
print(table(clean_data$Status))

# Check again for missing values
missing_values_clean <- sapply(clean_data, function(x) sum(is.na(x)))
print(missing_values_clean)

print(unique(clean_data$Status))

Visualization

ggplot(data, aes(x = Status)) +
  geom_bar() +
  labs(title = "Distribution of Customer Status", x = "Status", y = "Number of Customers")

Correlation Analysis

data_without_status <- clean_data[, names(clean_data) != 'Status']
data_dummy <- fastDummies::dummy_cols(data_without_status, remove_first_dummy = TRUE)
data_dummy$Status <- clean_data$Status
data_dummy$Status <- as.numeric(data_dummy$Status == 'Left')
correlation_matrix <- cor(data_dummy[, sapply(data_dummy, is.numeric)], use = "complete.obs")
status_correlation <- correlation_matrix[,'Status']
status_correlation <- status_correlation[names(status_correlation) != 'Status']

# Visualize
corr_data <- data.frame(Variable = names(status_correlation), Correlation = status_correlation)
ggplot(corr_data, aes(x = reorder(Variable, Correlation), y = Correlation)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(title = "Correlation with Status ('Left')", x = "Variables", y = "Correlation Coefficient")

Machine Learning

# Load necessary libraries
library(caret)
library(rpart)
library(randomForest)

# Set seed for reproducibility
set.seed(123)

# Inspect the 'Status' variable
print(unique(clean_data$Status))
print(table(clean_data$Status))

# Splitting the data into training and testing sets
index <- createDataPartition(clean_data$Status, p = 0.8, list = FALSE)
train_data <- clean_data[index, ]
test_data <- clean_data[-index, ]

# Ensure all factor levels in the test set also exist in the training set
train_data$Status <- factor(train_data$Status, levels = c('Current', 'Left'))
test_data$Status <- factor(test_data$Status, levels = c('Current', 'Left'))

# Control object for training with 5-fold cross-validation
control <- trainControl(method = "cv", number = 5, classProbs = TRUE, summaryFunction = twoClassSummary)

# Metric for model comparison
metric <- "ROC"

# Logistic Regression Model
model_log <- train(Status ~ ., data = train_data, method = "glm", family = "binomial", trControl = control, metric = metric)

# Decision Tree Model
model_tree <- train(Status ~ ., data = train_data, method = "rpart", trControl = control, metric = metric)

# Random Forest Model
model_rf <- train(Status ~ ., data = train_data, method = "rf", trControl = control, metric = metric)

# Comparing Models
results <- resamples(list(logistic = model_log, tree = model_tree, randomForest = model_rf))
summary(results)

# Plotting model comparisons
dotplot(results)

# Assessing feature importance for the selected model (e.g., Random Forest)
importance <- varImp(model_rf, scale = FALSE)
plot(importance)

# Evaluating the final model on the test set
predictions <- predict(model_rf, test_data)
confusionMatrix(predictions, test_data$Status)

Business Analysis & Conclusion

importance <- varImp(model_rf, scale = FALSE)
# Predict the status on the test dataset
predicted_status <- predict(model_rf, test_data, type = "prob")
at_risk_customers <- test_data[predicted_status[, "Left"] > 0.5, ]
predicted_loss <- sum(at_risk_customers$MonthlyCharges)
show(predicted_loss)

## [1] 15648.15

# Calculate the cost of the incentive
incentive_cost <- sum(at_risk_customers$MonthlyCharges) * 0.10 * 6

# Benefit is the predicted loss minus the incentive cost
benefit <- predicted_loss - incentive_cost

# Cost-Benefit Analysis
cost_benefit_analysis <- data.frame(IncentiveCost = incentive_cost, PredictedLoss = predicted_loss, NetBenefit = benefit)
show(cost_benefit_analysis)

##   IncentiveCost PredictedLoss NetBenefit
## 1       9388.89      15648.15    6259.26

Regork Customer Retention Final Project

Matthew Beckman

2023-12-04