This document presents an exploratory data analysis of a customer retention data set for Regork. The goal is to identify and use trends and patterns to develop a solution to improve customer retention.
First, we load the necessary libraries and the dataset.
library(fastDummies)
library(ggplot2)
library(dplyr)
library(tidyr)
data <- read.csv("customer_retention.csv")
# Load necessary library
library(dplyr)
# Check for missing values in each column
missing_values <- sapply(data, function(x) sum(is.na(x)))
print(missing_values)
print(table(data$Status))
# Remove rows with any missing value
clean_data <- na.omit(data)
print(table(clean_data$Status))
# Check again for missing values
missing_values_clean <- sapply(clean_data, function(x) sum(is.na(x)))
print(missing_values_clean)
print(unique(clean_data$Status))
ggplot(data, aes(x = Status)) +
geom_bar() +
labs(title = "Distribution of Customer Status", x = "Status", y = "Number of Customers")
data_without_status <- clean_data[, names(clean_data) != 'Status']
data_dummy <- fastDummies::dummy_cols(data_without_status, remove_first_dummy = TRUE)
data_dummy$Status <- clean_data$Status
data_dummy$Status <- as.numeric(data_dummy$Status == 'Left')
correlation_matrix <- cor(data_dummy[, sapply(data_dummy, is.numeric)], use = "complete.obs")
status_correlation <- correlation_matrix[,'Status']
status_correlation <- status_correlation[names(status_correlation) != 'Status']
# Visualize
corr_data <- data.frame(Variable = names(status_correlation), Correlation = status_correlation)
ggplot(corr_data, aes(x = reorder(Variable, Correlation), y = Correlation)) +
geom_bar(stat = "identity") +
coord_flip() +
labs(title = "Correlation with Status ('Left')", x = "Variables", y = "Correlation Coefficient")
# Load necessary libraries
library(caret)
library(rpart)
library(randomForest)
# Set seed for reproducibility
set.seed(123)
# Inspect the 'Status' variable
print(unique(clean_data$Status))
print(table(clean_data$Status))
# Splitting the data into training and testing sets
index <- createDataPartition(clean_data$Status, p = 0.8, list = FALSE)
train_data <- clean_data[index, ]
test_data <- clean_data[-index, ]
# Ensure all factor levels in the test set also exist in the training set
train_data$Status <- factor(train_data$Status, levels = c('Current', 'Left'))
test_data$Status <- factor(test_data$Status, levels = c('Current', 'Left'))
# Control object for training with 5-fold cross-validation
control <- trainControl(method = "cv", number = 5, classProbs = TRUE, summaryFunction = twoClassSummary)
# Metric for model comparison
metric <- "ROC"
# Logistic Regression Model
model_log <- train(Status ~ ., data = train_data, method = "glm", family = "binomial", trControl = control, metric = metric)
# Decision Tree Model
model_tree <- train(Status ~ ., data = train_data, method = "rpart", trControl = control, metric = metric)
# Random Forest Model
model_rf <- train(Status ~ ., data = train_data, method = "rf", trControl = control, metric = metric)
# Comparing Models
results <- resamples(list(logistic = model_log, tree = model_tree, randomForest = model_rf))
summary(results)
# Plotting model comparisons
dotplot(results)
# Assessing feature importance for the selected model (e.g., Random Forest)
importance <- varImp(model_rf, scale = FALSE)
plot(importance)
# Evaluating the final model on the test set
predictions <- predict(model_rf, test_data)
confusionMatrix(predictions, test_data$Status)
importance <- varImp(model_rf, scale = FALSE)
# Predict the status on the test dataset
predicted_status <- predict(model_rf, test_data, type = "prob")
at_risk_customers <- test_data[predicted_status[, "Left"] > 0.5, ]
predicted_loss <- sum(at_risk_customers$MonthlyCharges)
show(predicted_loss)
## [1] 15648.15
# Calculate the cost of the incentive
incentive_cost <- sum(at_risk_customers$MonthlyCharges) * 0.10 * 6
# Benefit is the predicted loss minus the incentive cost
benefit <- predicted_loss - incentive_cost
# Cost-Benefit Analysis
cost_benefit_analysis <- data.frame(IncentiveCost = incentive_cost, PredictedLoss = predicted_loss, NetBenefit = benefit)
show(cost_benefit_analysis)
## IncentiveCost PredictedLoss NetBenefit
## 1 9388.89 15648.15 6259.26