This automated churn predictor generates sample customer data (with company names), trains a machine-learning model, evaluates performance, and saves risk predictions for a connected Shiny dashboard.
# Reuse the cached synthetic dataset when it is already on disk; otherwise
# simulate 1,000 customers, persist them to CSV, and carry on with that table.
if (!file.exists("data/synthetic_churn_data.csv")) {
  dir.create("data", showWarnings = FALSE)
  n <- 1000

  # Raw behavioural signals, one row per company.
  churn_data <- tibble(
    company = paste("Company", seq_len(n)),
    usage_score = runif(n, 0, 100),
    nps = sample(0:10, n, replace = TRUE),
    tickets = rpois(n, 3),
    days_since_login = rpois(n, 30),
    contract_length_months = sample(c(6, 12, 24), n, replace = TRUE)
  )

  # Derived columns: a composite health score, the logistic churn probability
  # implied by the signals, and a Bernoulli churn label drawn from it.
  churn_data <- mutate(
    churn_data,
    health_score = (usage_score * 0.35 + nps * 10 * 0.35 - tickets * 0.1),
    churn_prob = plogis(
      -3 + 0.04 * days_since_login - 0.02 * usage_score - 0.1 * nps + 0.3 * tickets
    ),
    churn = rbinom(n, 1, churn_prob)
  )

  write_csv(churn_data, "data/synthetic_churn_data.csv")
  cat("✅ Synthetic churn dataset created with company names.\n")
} else {
  churn_data <- read_csv("data/synthetic_churn_data.csv", show_col_types = FALSE)
  cat("Loaded existing churn dataset.\n")
}
## ✅ Synthetic churn dataset created with company names.
# Build the modelling table, hold out 20% of customers for evaluation, and fit
# a 200-tree random forest on the remaining 80%.

# churn_prob is the simulated probability the churn label was drawn from, so
# it is removed from the modelling table.
data_model <- churn_data %>% select(-churn_prob)

# Fix the RNG so the split and the forest are reproducible between renders.
set.seed(42)

# Stratify on the class label. createDataPartition() samples within classes
# only when y is a factor; a numeric 0/1 vector is binned by percentiles
# instead, which leaves the rare churners unevenly spread across the split.
train_index <- createDataPartition(
  factor(data_model$churn),
  p = 0.8,
  list = FALSE
)

# train_index is a one-column matrix (list = FALSE); subset the tibble with a
# plain integer vector rather than a matrix subscript.
train_data <- data_model[as.vector(train_index), ]
test_data <- data_model[-as.vector(train_index), ]

# The outcome is wrapped in factor() so randomForest runs in classification
# mode; importance = TRUE keeps the permutation importance measures as well.
rf_model <- randomForest(
  factor(churn) ~ usage_score + nps + tickets + days_since_login +
    contract_length_months + health_score,
  data = train_data,
  ntree = 200,
  importance = TRUE
)
rf_model
##
## Call:
## randomForest(formula = factor(churn) ~ usage_score + nps + tickets + days_since_login + contract_length_months + health_score, data = train_data, ntree = 200, importance = TRUE)
## Type of random forest: classification
## Number of trees: 200
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 11.62%
## Confusion matrix:
## 0 1 class.error
## 0 702 7 0.009873061
## 1 86 5 0.945054945
# Score the hold-out set with hard class labels and summarise the result.
preds <- predict(rf_model, test_data, type = "response")

# Churn ("1") is the event of interest, so make it the positive class. Left to
# its default, caret uses the first level ("0"), and sensitivity, precision
# and F1 then describe how well non-churners are recognised.
# Sharing the model's levels between both factors keeps the table 2x2 even
# when the predictions or the hold-out labels contain only one class.
conf_matrix <- confusionMatrix(
  data = preds,
  reference = factor(test_data$churn, levels = levels(preds)),
  positive = "1"
)
conf_matrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 182 16
## 1 2 0
##
## Accuracy : 0.91
## 95% CI : (0.8615, 0.9458)
## No Information Rate : 0.92
## P-Value [Acc > NIR] : 0.749439
##
## Kappa : -0.0181
##
## Mcnemar's Test P-Value : 0.002183
##
## Sensitivity : 0.9891
## Specificity : 0.0000
## Pos Pred Value : 0.9192
## Neg Pred Value : 0.0000
## Prevalence : 0.9200
## Detection Rate : 0.9100
## Detection Prevalence : 0.9900
## Balanced Accuracy : 0.4946
##
## 'Positive' Class : 0
##
# Collect the forest's importance measures into a data frame (one row per
# predictor) and plot mean decrease in Gini as horizontal bars, with the most
# important feature at the top.
importance_df <- rf_model %>%
  importance() %>%
  as.data.frame() %>%
  rownames_to_column("Feature")

importance_df %>%
  ggplot(aes(x = reorder(Feature, MeanDecreaseGini), y = MeanDecreaseGini)) +
  geom_col(fill = "#2b8cbe") +
  coord_flip() +
  labs(
    title = "Feature Importance for Churn Model",
    x = "Feature",
    y = "Importance"
  ) +
  theme_minimal()
# Attach each hold-out customer's predicted churn probability (second column
# of the class-probability matrix), rank customers from highest to lowest
# risk, and export the table to CSV.
preds_prob <- predict(rf_model, test_data, type = "prob")[, 2]
test_results <- arrange(
  mutate(test_data, predicted_prob = preds_prob),
  desc(predicted_prob)
)
write_csv(test_results, "data/latest_churn_predictions.csv")
cat("Saved predictions to data/latest_churn_predictions.csv\n")
## Saved predictions to data/latest_churn_predictions.csv
# Print overall hold-out accuracy, rounded to three decimals.
cat(
  "Model Accuracy:",
  round(conf_matrix$overall[["Accuracy"]], digits = 3),
  "\n"
)
## Model Accuracy: 0.91
# Print the F1 score stored in the confusion-matrix summary (computed for
# that summary's positive class), rounded to three decimals.
cat(
  "F1 Score:",
  round(conf_matrix$byClass[["F1"]], digits = 3),
  "\n"
)
## F1 Score: 0.953
0 6 * * * /usr/local/bin/Rscript -e "rmarkdown::render('~/Projects/churn_predictor_with_company_names.Rmd')"
The output file below feeds the live dashboard.
data/latest_churn_predictions.csv
The Shiny app reads this file automatically and displays company names in the Top 10 At‑Risk Customers table.