Gabriel Elías Chanchí Golondrino
Tarea 4
Business
Intelligence<> Presentado al profesor: Sixyel Jeyson Castaneda
Coronado
The development of Exercise 1 is presented below.
pkgs <- c("tidyverse", "lubridate", "janitor", "scales", "gridExtra", "car", "caret")
to_install <- setdiff(pkgs, rownames(installed.packages()))
if (length(to_install)) install.packages(to_install, dependencies = TRUE)
invisible(lapply(pkgs, library, character.only = TRUE))
data_path <- "Appendix 3 - Database.csv"
df_raw <- read_csv(data_path, show_col_types = FALSE) %>% clean_names()
head(df_raw)
## # A tibble: 6 × 8
## invoice_no stock_code description quantity invoice_date unit_price customer_id
## <chr> <chr> <chr> <dbl> <chr> <dbl> <dbl>
## 1 536365 85123A WHITE HANG… 6 12/1/2010 8… 2.55 17850
## 2 536365 71053 WHITE META… 6 12/1/2010 8… 3.39 17850
## 3 536365 84406B CREAM CUPI… 8 12/1/2010 8… 2.75 17850
## 4 536365 84029G KNITTED UN… 6 12/1/2010 8… 3.39 17850
## 5 536365 84029E RED WOOLLY… 6 12/1/2010 8… 3.39 17850
## 6 536365 22752 SET 7 BABU… 2 12/1/2010 8… 7.65 17850
## # ℹ 1 more variable: country <chr>
df <- df_raw %>%
mutate(
# Parse flexible date-time formats (prevents negative recency bugs)
invoice_date = parse_date_time(
invoice_date,
orders = c("dmy HMS", "dmy HM", "dmy",
"mdy HMS", "mdy HM", "mdy",
"Ymd HMS", "Ymd HM", "Ymd"),
tz = "UTC"
),
invoice_date = as_datetime(invoice_date)
) %>%
# Keep only rows with a customer id and a valid date
filter(!is.na(customer_id), !is.na(invoice_date)) %>%
# Basic data sanity: drop zero-quantity rows and negative prices
filter(quantity != 0, unit_price >= 0) %>%
# IDs are identifiers, not numbers → keep as character
mutate(
customer_id = as.character(customer_id),
customer_uid = customer_id, # canonical id
amount = quantity * unit_price # line value
)
# Safety check: analysis date = max observed date (prevents hardcoding)
analysis_date <- as_date(max(df$invoice_date, na.rm = TRUE))
head(df)
## # A tibble: 6 × 10
## invoice_no stock_code description quantity invoice_date unit_price
## <chr> <chr> <chr> <dbl> <dttm> <dbl>
## 1 536365 85123A WHITE HANGING H… 6 2010-12-01 08:26:00 2.55
## 2 536365 71053 WHITE METAL LAN… 6 2010-12-01 08:26:00 3.39
## 3 536365 84406B CREAM CUPID HEA… 8 2010-12-01 08:26:00 2.75
## 4 536365 84029G KNITTED UNION F… 6 2010-12-01 08:26:00 3.39
## 5 536365 84029E RED WOOLLY HOTT… 6 2010-12-01 08:26:00 3.39
## 6 536365 22752 SET 7 BABUSHKA … 2 2010-12-01 08:26:00 7.65
## # ℹ 4 more variables: customer_id <chr>, country <chr>, customer_uid <chr>,
## # amount <dbl>
rfm <- df %>%
group_by(customer_uid) %>%
summarise(
last_purchase = max(invoice_date, na.rm = TRUE), # last tx
recency_days = as.numeric(analysis_date - as_date(last_purchase)), # recency
frequency = n_distinct(invoice_no), # #invoices
monetary_total = sum(amount, na.rm = TRUE), # total spend
monetary_avg = mean(amount, na.rm = TRUE), # avg line
monetary_min = min(amount, na.rm = TRUE), # min line
.groups = "drop"
) %>%
# Guard against any negative due to oddities (clip to 0)
mutate(recency_days = pmax(recency_days, 0))
head(rfm)
## # A tibble: 6 × 7
## customer_uid last_purchase recency_days frequency monetary_total
## <chr> <dttm> <dbl> <int> <dbl>
## 1 12346 2011-01-18 10:17:00 325 2 0
## 2 12347 2011-12-07 15:52:00 2 7 4310
## 3 12348 2011-09-25 13:13:00 75 4 1797.
## 4 12349 2011-11-21 09:51:00 18 1 1758.
## 5 12350 2011-02-02 16:01:00 310 1 334.
## 6 12352 2011-11-03 14:37:00 36 11 1545.
## # ℹ 2 more variables: monetary_avg <dbl>, monetary_min <dbl>
rfm <- rfm %>%
mutate(churn = if_else(recency_days > 90, 1L, 0L))
# Overall churn rate
churn_rate <- mean(rfm$churn)
cat("Churn rate:", percent(churn_rate), "\n")
## Churn rate: 33%
head(rfm$churn)
## [1] 1 0 0 0 1 0
print(summary(select(rfm, recency_days, frequency, monetary_total,
monetary_avg, monetary_min, churn)))
## recency_days frequency monetary_total monetary_avg
## Min. : 0.00 Min. : 1.000 Min. : -4287.6 Min. :-4287.63
## 1st Qu.: 16.00 1st Qu.: 1.000 1st Qu.: 293.4 1st Qu.: 10.99
## Median : 50.00 Median : 3.000 Median : 648.1 Median : 16.92
## Mean : 91.58 Mean : 5.075 Mean : 1898.5 Mean : 28.84
## 3rd Qu.:143.00 3rd Qu.: 5.000 3rd Qu.: 1611.7 3rd Qu.: 23.54
## Max. :373.00 Max. :248.000 Max. :279489.0 Max. : 3861.00
## monetary_min churn
## Min. :-168469.60 Min. :0.0000
## 1st Qu.: -11.50 1st Qu.:0.0000
## Median : 0.79 Median :0.0000
## Mean : -86.43 Mean :0.3326
## 3rd Qu.: 6.96 3rd Qu.:1.0000
## Max. : 3861.00 Max. :1.0000
Display of histograms for churn analysis.
# Hist: Recency (colored by churn)
p1 <- ggplot(rfm, aes(x = recency_days, fill = factor(churn))) +
geom_histogram(bins = 30, alpha = 0.7, position = "identity") +
scale_fill_manual(values = c("steelblue", "tomato"),
name = "Churn", labels = c("Active (0)", "Churned (1)")) +
labs(title = "Distribution of Recency (days)",
x = "Days since last purchase", y = "Customers")
# Hist: Frequency (clip x-axis at 99th percentile to tame long tail)
p2 <- ggplot(rfm, aes(x = frequency, fill = factor(churn))) +
geom_histogram(bins = 30, alpha = 0.7, position = "identity") +
scale_x_continuous(limits = c(0, quantile(rfm$frequency, 0.99))) +
scale_fill_manual(values = c("steelblue", "tomato"),
name = "Churn", labels = c("Active (0)", "Churned (1)")) +
labs(title = "Frequency of Transactions",
x = "Unique invoices", y = "Customers")
# Density: Monetary total (by churn)
p3 <- ggplot(rfm, aes(x = monetary_total, fill = factor(churn))) +
geom_density(alpha = 0.6) +
scale_x_continuous(labels = dollar_format(prefix = "£")) +
scale_fill_manual(values = c("steelblue", "tomato"),
name = "Churn", labels = c("Active (0)", "Churned (1)")) +
labs(title = "Density of Total Monetary Value",
x = "Total spend (£)", y = "Density")
# Bar: Class balance (churn distribution)
p4 <- ggplot(rfm, aes(x = factor(churn))) +
geom_bar(fill = "skyblue") +
geom_text(stat = "count", aes(label = ..count..), vjust = -0.5) +
labs(title = "Churn Distribution (Class Balance)",
x = "Churn (1 = inactive, 0 = active)", y = "Customers")
# Show a 2x2 grid of the above plots
gridExtra::grid.arrange(p1, p2, p3, p4, ncol = 2)
Boxplots to visually spot outliers in RFM
rfm_long <- rfm %>%
select(recency_days, frequency, monetary_total) %>%
pivot_longer(everything(), names_to = "metric", values_to = "value")
ggplot(rfm_long, aes(x = metric, y = value, fill = metric)) +
geom_boxplot(alpha = 0.6, outlier.alpha = 0.5) +
scale_y_continuous(labels = comma) +
labs(title = "Boxplots of RFM Variables (Outlier Check)",
x = "Metric", y = "Value") +
theme(legend.position = "none")
cat(
"\nKey findings:\n",
"- Churn rate: ", percent(churn_rate), " of customers have not purchased in the last 90 days.\n",
"- Recency is now correctly computed (0 = purchased on last day; higher = older purchase).\n",
"- Frequency and Monetary show heavy right tails (a few large/wholesale buyers).\n",
"- Boxplots confirm outliers in spend/frequency; consider winsorizing for modeling.\n",
sep = ""
)
##
## Key findings:
## - Churn rate: 33% of customers have not purchased in the last 90 days.
## - Recency is now correctly computed (0 = purchased on last day; higher = older purchase).
## - Frequency and Monetary show heavy right tails (a few large/wholesale buyers).
## - Boxplots confirm outliers in spend/frequency; consider winsorizing for modeling.
Optional: save for next exercises (modeling, correlation, etc.)
write_csv(rfm, "rfm_churn_dataset.csv")
cat("\nSaved: rfm_churn_dataset.csv (customer-level RFM + churn)\n")
##
## Saved: rfm_churn_dataset.csv (customer-level RFM + churn)
Objective: 1) Standardize numeric RFM variables with scale()
2)
Apply K-means clustering (start with k = 3)
3) Determine optimal k
using:
- Elbow method (factoextra::fviz_nbclust)
- Silhouette
method
4) Retrain K-means with optimal k and visualize results
5) Perform Hierarchical Clustering (hclust) and compare
6) Describe
clusters: size, average RFM values, key differences
pkgs <- c("tidyverse", "factoextra", "cluster", "gridExtra")
to_install <- setdiff(pkgs, rownames(installed.packages()))
if (length(to_install)) install.packages(to_install)
invisible(lapply(pkgs, library, character.only = TRUE))
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Select only numerical RFM variables (excluding churn)
rfm_vars <- rfm %>%
select(recency_days, frequency, monetary_total, monetary_avg, monetary_min)
# Standardize using scale()
rfm_scaled <- scale(rfm_vars)
# Check result: means ~ 0, SD ~ 1
summary(as.data.frame(rfm_scaled))
## recency_days frequency monetary_total monetary_avg
## Min. :-0.9088 Min. :-0.436405 Min. :-0.75263 Min. :-33.90148
## 1st Qu.:-0.7500 1st Qu.:-0.436405 1st Qu.:-0.19528 1st Qu.: -0.14022
## Median :-0.4126 Median :-0.222244 Median :-0.15213 Median : -0.09362
## Mean : 0.0000 Mean : 0.000000 Mean : 0.00000 Mean : 0.00000
## 3rd Qu.: 0.5102 3rd Qu.:-0.008082 3rd Qu.:-0.03488 3rd Qu.: -0.04160
## Max. : 2.7926 Max. :26.012519 Max. :33.77283 Max. : 30.09773
## monetary_min
## Min. :-58.53958
## 1st Qu.: 0.02605
## Median : 0.03032
## Mean : 0.00000
## 3rd Qu.: 0.03247
## Max. : 1.37235
set.seed(123) # for reproducibility
k_init <- 3
kmeans_model <- kmeans(rfm_scaled, centers = k_init, nstart = 25)
# Add cluster labels to dataset
rfm_kmeans <- rfm %>%
mutate(cluster_kmeans = factor(kmeans_model$cluster))
head(rfm_kmeans$cluster_kmeans)
## [1] 2 3 3 3 2 3
## Levels: 1 2 3
fviz_nbclust(rfm_scaled, kmeans, method = "wss") +
labs(title = "Elbow Method – Optimal Number of Clusters (K-means)")
4b. Silhouette Method
fviz_nbclust(rfm_scaled, kmeans, method = "silhouette") +
labs(title = "Silhouette Method – Optimal Number of Clusters (K-means)")
After checking both plots visually, choose the best k (5)
optimal_k <- 5 # adjust this after observing the plots
# Retrain K-means with optimal k
set.seed(123)
kmeans_final <- kmeans(rfm_scaled, centers = optimal_k, nstart = 25)
# Add new cluster assignments
rfm_kmeans <- rfm_kmeans %>%
mutate(cluster_final = factor(kmeans_final$cluster))
head(rfm_kmeans$cluster_final)
## [1] 3 4 4 4 1 4
## Levels: 1 2 3 4 5
# Use first two principal components for visualization
fviz_cluster(kmeans_final, data = rfm_scaled,
geom = "point", ellipse.type = "norm",
ggtheme = theme_minimal(),
main = "K-means Clustering (after scaling)")
## Too few points to calculate an ellipse
Scatterplot using ggplot2 (colored by cluster)
ggplot(rfm_kmeans, aes(x = frequency, y = monetary_total, color = cluster_final)) +
geom_point(alpha = 0.7) +
scale_color_brewer(palette = "Set1") +
labs(title = "K-means Clusters by Frequency and Monetary Value",
x = "Frequency (transactions)", y = "Total Spend (£)")
cluster_profile <- rfm_kmeans %>%
group_by(cluster_final) %>%
summarise(
n_customers = n(),
avg_recency = mean(recency_days, na.rm = TRUE),
avg_frequency = mean(frequency, na.rm = TRUE),
avg_monetary = mean(monetary_total, na.rm = TRUE),
avg_monetary_avg = mean(monetary_avg, na.rm = TRUE),
churn_rate = mean(churn, na.rm = TRUE)
)
print(cluster_profile)
## # A tibble: 5 × 7
## cluster_final n_customers avg_recency avg_frequency avg_monetary
## <fct> <int> <dbl> <dbl> <dbl>
## 1 1 1105 246. 1.85 439.
## 2 2 26 5.35 83.3 75966.
## 3 3 2 162. 2.5 1.45
## 4 4 3233 39.6 5.55 1791.
## 5 5 6 89.5 2.17 8278.
## # ℹ 2 more variables: avg_monetary_avg <dbl>, churn_rate <dbl>
library(kableExtra)
##
## Adjuntando el paquete: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
kable(cluster_profile)
| cluster_final | n_customers | avg_recency | avg_frequency | avg_monetary | avg_monetary_avg | churn_rate |
|---|---|---|---|---|---|---|
| 1 | 1105 | 245.605430 | 1.851584 | 439.3184 | 21.09466 | 1.0000000 |
| 2 | 26 | 5.346154 | 83.346154 | 75966.3873 | 149.33882 | 0.0000000 |
| 3 | 2 | 162.500000 | 2.500000 | 1.4500 | 0.36250 | 0.5000000 |
| 4 | 3233 | 39.591092 | 5.554903 | 1790.8511 | 26.41538 | 0.1070213 |
| 5 | 6 | 89.500000 | 2.166667 | 8278.0833 | 2248.35074 | 0.3333333 |
# Compute Euclidean distance matrix on scaled data
dist_matrix <- dist(rfm_scaled, method = "euclidean")
# Apply hierarchical clustering (Ward's method = minimizes variance)
hc <- hclust(dist_matrix, method = "ward.D2")
# Plot dendrogram
plot(hc, labels = FALSE, hang = -1, main = "Hierarchical Clustering Dendrogram")
# Cut the tree at the same number of clusters as K-means for comparison
rfm_hclust <- rfm %>%
mutate(cluster_hclust = factor(cutree(hc, k = optimal_k)))
head(rfm_hclust)
## # A tibble: 6 × 9
## customer_uid last_purchase recency_days frequency monetary_total
## <chr> <dttm> <dbl> <int> <dbl>
## 1 12346 2011-01-18 10:17:00 325 2 0
## 2 12347 2011-12-07 15:52:00 2 7 4310
## 3 12348 2011-09-25 13:13:00 75 4 1797.
## 4 12349 2011-11-21 09:51:00 18 1 1758.
## 5 12350 2011-02-02 16:01:00 310 1 334.
## 6 12352 2011-11-03 14:37:00 36 11 1545.
## # ℹ 4 more variables: monetary_avg <dbl>, monetary_min <dbl>, churn <int>,
## # cluster_hclust <fct>
p_kmeans <- ggplot(rfm_kmeans, aes(x = frequency, y = monetary_total, color = cluster_final)) +
geom_point(alpha = 0.7) +
labs(title = "K-means Clustering", x = "Frequency", y = "Total Spend (£)") +
theme_minimal()
p_hclust <- ggplot(rfm_hclust, aes(x = frequency, y = monetary_total, color = cluster_hclust)) +
geom_point(alpha = 0.7) +
labs(title = "Hierarchical Clustering", x = "Frequency", y = "Total Spend (£)") +
theme_minimal()
gridExtra::grid.arrange(p_kmeans, p_hclust, ncol = 2)
cat("\nSegment notes (k=5):\n",
"- C1: highest avg_monetary & frequency, low recency → core loyal/wholesale.\n",
"- C2: medium spend, recent, moderate frequency → regulars with growth potential.\n",
"- C3: low spend, high recency → inactive/at-risk.\n",
"- C4: low frequency but decent ticket → infrequent, higher-value orders.\n",
"- C5: very low spend & frequency, mixed recency → one-time/near-lost.\n",
sep = "")
##
## Segment notes (k=5):
## - C1: highest avg_monetary & frequency, low recency → core loyal/wholesale.
## - C2: medium spend, recent, moderate frequency → regulars with growth potential.
## - C3: low spend, high recency → inactive/at-risk.
## - C4: low frequency but decent ticket → infrequent, higher-value orders.
## - C5: very low spend & frequency, mixed recency → one-time/near-lost.
# Save for your report if needed
write_csv(cluster_profile, "customer_segments_summary_k5.csv")
rfm_class <- rfm %>%
select(recency_days, frequency, monetary_total, monetary_avg, monetary_min, churn)
head(rfm_class)
## # A tibble: 6 × 6
## recency_days frequency monetary_total monetary_avg monetary_min churn
## <dbl> <int> <dbl> <dbl> <dbl> <int>
## 1 325 2 0 0 -77184. 1
## 2 2 7 4310 23.7 5.04 0
## 3 75 4 1797. 58.0 13.2 0
## 4 18 1 1758. 24.1 6.64 0
## 5 310 1 334. 19.7 8.5 1
## 6 36 11 1545. 16.3 -376. 0
#Installation of the required packages.
pkgs <- c("caret", "dplyr", "e1071")
to_install <- setdiff(pkgs, rownames(installed.packages()))
if (length(to_install)) install.packages(to_install, dependencies = TRUE)
invisible(lapply(pkgs, library, character.only = TRUE))
##
## Adjuntando el paquete: 'e1071'
## The following object is masked from 'package:ggplot2':
##
## element
#Ensure churn is a factor (recommended for stratified sampling)
rfm_class$churn <- as.factor(rfm_class$churn)
set.seed(123) # for reproducibility
#Create stratified training partition (70% of the data)
idx_train <- createDataPartition(rfm_class$churn, p = 0.7, list = FALSE)
#Train and test datasets
train <- rfm_class[idx_train, ]
test <- rfm_class[-idx_train, ]
# Ensure target variable is a factor
train$churn <- as.factor(train$churn)
test$churn <- as.factor(test$churn)
# Define X and y
x_train <- train %>% select(-churn)
y_train <- train$churn
x_test <- test %>% select(-churn)
y_test <- test$churn
head(x_train)
## # A tibble: 6 × 5
## recency_days frequency monetary_total monetary_avg monetary_min
## <dbl> <int> <dbl> <dbl> <dbl>
## 1 325 2 0 0 -77184.
## 2 2 7 4310 23.7 5.04
## 3 75 4 1797. 58.0 13.2
## 4 310 1 334. 19.7 8.5
## 5 36 11 1545. 16.3 -376.
## 6 204 1 89 22.2 11.6
set.seed(123)
logit_model <- train(
churn ~ .,
data = train,
method = "glm",
family = "binomial",
trControl = trainControl(method = "none")
)
# Predict classes
logit_pred <- predict(logit_model, newdata = test)
head(logit_pred)
## [1] 0 0 1 0 1 1
## Levels: 0 1
2a) Confusion matrix and performance metrics.
logit_cm <- confusionMatrix(logit_pred, y_test, positive = "1")
logit_cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 875 1
## 1 0 435
##
## Accuracy : 0.9992
## 95% CI : (0.9958, 1)
## No Information Rate : 0.6674
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9983
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9977
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.9989
## Prevalence : 0.3326
## Detection Rate : 0.3318
## Detection Prevalence : 0.3318
## Balanced Accuracy : 0.9989
##
## 'Positive' Class : 1
##
2b) The performance metrics are printed
# Accuracy
logit_accuracy <- logit_cm$overall["Accuracy"]
# Precision, Recall, F1
logit_precision <- logit_cm$byClass["Precision"]
logit_recall <- logit_cm$byClass["Recall"]
logit_f1 <- logit_cm$byClass["F1"]
print(logit_accuracy)
## Accuracy
## 0.9992372
print(logit_precision)
## Precision
## 1
print(logit_recall)
## Recall
## 0.9977064
print(logit_f1)
## F1
## 0.9988519
2c) Analysis of the results
cat("\nConfusion Matrix Notes (Logistic Regression):\n",
"- FN extremely low (only 1) → almost no true churners missed.\n",
"- FP = 0 → model never flags active customers as churn.\n",
"- Strong balance between FP & FN → minimal business risk in retention decisions.\n",
"- Near-perfect accuracy and class separation → highly reliable churn predictor.\n",
sep = "")
##
## Confusion Matrix Notes (Logistic Regression):
## - FN extremely low (only 1) → almost no true churners missed.
## - FP = 0 → model never flags active customers as churn.
## - Strong balance between FP & FN → minimal business risk in retention decisions.
## - Near-perfect accuracy and class separation → highly reliable churn predictor.
set.seed(123)
svm_linear_model <- train(
churn ~ .,
data = train,
method = "svmLinear",
trControl = trainControl(method = "none")
)
# Predict classes
svm_pred <- predict(svm_linear_model, newdata = test)
head(svm_pred)
## [1] 0 0 1 0 1 1
## Levels: 0 1
3a) Confusion matrix and performance metrics.
svm_cm <- confusionMatrix(svm_pred, y_test, positive = "1")
svm_cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 873 6
## 1 2 430
##
## Accuracy : 0.9939
## 95% CI : (0.988, 0.9974)
## No Information Rate : 0.6674
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9862
##
## Mcnemar's Test P-Value : 0.2888
##
## Sensitivity : 0.9862
## Specificity : 0.9977
## Pos Pred Value : 0.9954
## Neg Pred Value : 0.9932
## Prevalence : 0.3326
## Detection Rate : 0.3280
## Detection Prevalence : 0.3295
## Balanced Accuracy : 0.9920
##
## 'Positive' Class : 1
##
3b) The performance metrics are printed
# Accuracy
svm_accuracy <- svm_cm$overall["Accuracy"]
# Precision, Recall, F1
svm_precision <- svm_cm$byClass["Precision"]
svm_recall <- svm_cm$byClass["Recall"]
svm_f1 <- svm_cm$byClass["F1"]
print(svm_accuracy)
## Accuracy
## 0.9938978
print(svm_precision)
## Precision
## 0.9953704
print(svm_recall)
## Recall
## 0.9862385
print(svm_f1)
## F1
## 0.9907834
3c) Analysis of the results
cat("\nConfusion Matrix Notes (SVM Linear):\n",
"- FN moderately low (6) → a few true churners missed.\n",
"- FP very low (2) → minimal misclassification of active customers as churn.\n",
"- Slightly higher FN than FP → greater risk of overlooking at-risk customers.\n",
"- Accuracy and balance remain very high → strong, reliable classifier overall.\n",
sep = "")
##
## Confusion Matrix Notes (SVM Linear):
## - FN moderately low (6) → a few true churners missed.
## - FP very low (2) → minimal misclassification of active customers as churn.
## - Slightly higher FN than FP → greater risk of overlooking at-risk customers.
## - Accuracy and balance remain very high → strong, reliable classifier overall.
Summary of the results
results <- data.frame(
Model = c("Logistic Regression", "SVM Linear"),
Accuracy = c(logit_accuracy, svm_accuracy),
Precision = c(logit_precision, svm_precision),
Recall = c(logit_recall, svm_recall),
F1_Score = c(logit_f1, svm_f1)
)
print(results)
## Model Accuracy Precision Recall F1_Score
## 1 Logistic Regression 0.9992372 1.0000000 0.9977064 0.9988519
## 2 SVM Linear 0.9938978 0.9953704 0.9862385 0.9907834
cat("\nModel Performance Summary:\n",
"- Logistic Regression: highest accuracy (0.9992) and perfect precision (1.00), very strong recall and F1.\n",
"- SVM Linear: slightly lower accuracy (0.9939) but still excellent precision, recall, and F1.\n",
"- Overall: Logistic Regression is the most balanced and stable across all metrics.\n",
sep = "")
##
## Model Performance Summary:
## - Logistic Regression: highest accuracy (0.9992) and perfect precision (1.00), very strong recall and F1.
## - SVM Linear: slightly lower accuracy (0.9939) but still excellent precision, recall, and F1.
## - Overall: Logistic Regression is the most balanced and stable across all metrics.
rfm_class <- rfm %>%
select(recency_days, frequency, monetary_total, monetary_avg, monetary_min, churn)
head(rfm_class)
## # A tibble: 6 × 6
## recency_days frequency monetary_total monetary_avg monetary_min churn
## <dbl> <int> <dbl> <dbl> <dbl> <int>
## 1 325 2 0 0 -77184. 1
## 2 2 7 4310 23.7 5.04 0
## 3 75 4 1797. 58.0 13.2 0
## 4 18 1 1758. 24.1 6.64 0
## 5 310 1 334. 19.7 8.5 1
## 6 36 11 1545. 16.3 -376. 0
Ensure churn is a factor and the positive class is “1”
rfm_class$churn <- factor(rfm_class$churn, levels = c(0, 1))
my_summary <- function(data, lev = NULL, model = NULL) {
# data$obs: true labels, data$pred: predicted labels
cm <- confusionMatrix(
data = data$pred,
reference = data$obs,
positive = lev[2] # use "1" as positive class
)
out <- c(
Accuracy = cm$overall["Accuracy"],
Precision = cm$byClass["Precision"],
Recall = cm$byClass["Recall"],
F1 = cm$byClass["F1"]
)
return(out)
}
set.seed(123)
ctrl <- trainControl(
method = "cv",
number = 10,
savePredictions = "final", # keep fold-level predictions
classProbs = FALSE,
summaryFunction = my_summary
)
logit_cv <- train(
churn ~ .,
data = rfm_class,
method = "glm",
family = "binomial",
metric = "Accuracy",
trControl = ctrl
)
3a. Metrics for each fold
# Metrics for each fold (one row per resample)
fold_metrics <- logit_cv$resample
fold_metrics
## Accuracy.Accuracy Precision.Precision Recall.Recall F1.F1 Resample
## 1 1.0000000 1.0000000 1.0000000 1.0000000 Fold01
## 2 1.0000000 1.0000000 1.0000000 1.0000000 Fold02
## 3 0.9702517 0.9782609 0.9310345 0.9540636 Fold03
## 4 1.0000000 1.0000000 1.0000000 1.0000000 Fold04
## 5 0.9748858 0.9720280 0.9520548 0.9619377 Fold05
## 6 0.9885321 0.9794521 0.9862069 0.9828179 Fold06
## 7 0.9977169 0.9931973 1.0000000 0.9965870 Fold07
## 8 0.9977117 0.9931507 1.0000000 0.9965636 Fold08
## 9 0.9977064 0.9931507 1.0000000 0.9965636 Fold09
## 10 1.0000000 1.0000000 1.0000000 1.0000000 Fold10
names(fold_metrics)
## [1] "Accuracy.Accuracy" "Precision.Precision" "Recall.Recall"
## [4] "F1.F1" "Resample"
3b. Mean and standard deviation per metric
library(dplyr)
library(tidyr)
# Remove non-numeric columns (e.g., "Resample") and summarise
metrics_summary <- fold_metrics %>%
select(where(is.numeric)) %>% # keep only numeric metrics
summarise(across(
.cols = everything(),
.fns = list(mean = mean, sd = sd) # mean and sd for each metric
))
metrics_summary
## Accuracy.Accuracy_mean Accuracy.Accuracy_sd Precision.Precision_mean
## 1 0.9926805 0.01119101 0.990924
## Precision.Precision_sd Recall.Recall_mean Recall.Recall_sd F1.F1_mean
## 1 0.01050714 0.9869296 0.02480342 0.9888533
## F1.F1_sd
## 1 0.01714935
3c. Same information in a compact table (metric x mean x sd)
results_table <- metrics_summary %>%
pivot_longer(
cols = everything(),
names_to = c("Metric", ".value"), # split names by "_" into Metric / mean / sd
names_sep = "_"
)
results_table
## # A tibble: 4 × 3
## Metric mean sd
## <chr> <dbl> <dbl>
## 1 Accuracy.Accuracy 0.993 0.0112
## 2 Precision.Precision 0.991 0.0105
## 3 Recall.Recall 0.987 0.0248
## 4 F1.F1 0.989 0.0171
3d. Analysis of the results obtained with logistic regression.
cat("\nCross-Validation Summary (10-fold):\n",
"- Accuracy ~0.993 with low variability → model performs consistently well across folds.\n",
"- Precision ~0.991 → very few active customers are misclassified as churn.\n",
"- Recall ~0.987 → model reliably detects most true churn cases with minimal misses.\n",
"- F1 ~0.989 → strong balance between precision and recall.\n",
"- Overall: metrics show a highly stable and well-generalized model with minimal variance.\n",
sep = "")
##
## Cross-Validation Summary (10-fold):
## - Accuracy ~0.993 with low variability → model performs consistently well across folds.
## - Precision ~0.991 → very few active customers are misclassified as churn.
## - Recall ~0.987 → model reliably detects most true churn cases with minimal misses.
## - F1 ~0.989 → strong balance between precision and recall.
## - Overall: metrics show a highly stable and well-generalized model with minimal variance.
set.seed(123)
ctrl <- trainControl(
method = "cv",
number = 10,
savePredictions = "final",
classProbs = FALSE,
summaryFunction = my_summary
)
svm_linear_cv <- train(
churn ~ .,
data = rfm_class,
method = "svmLinear", # SVM lineal
metric = "Accuracy",
trControl = ctrl
)
4a. Metrics for each fold
# Metrics for each fold (one row per resample)
fold_metrics <- svm_linear_cv$resample
fold_metrics
## Accuracy.Accuracy Precision.Precision Recall.Recall F1.F1 Resample
## 1 0.9977169 1.0000000 0.9931507 0.9965636 Fold01
## 2 0.9931507 1.0000000 0.9794521 0.9896194 Fold02
## 3 0.9931350 0.9930556 0.9862069 0.9896194 Fold03
## 4 0.9977117 1.0000000 0.9931034 0.9965398 Fold04
## 5 0.9931507 1.0000000 0.9794521 0.9896194 Fold05
## 6 0.9954128 1.0000000 0.9862069 0.9930556 Fold06
## 7 0.9977169 1.0000000 0.9931507 0.9965636 Fold07
## 8 0.9931350 1.0000000 0.9793103 0.9895470 Fold08
## 9 0.9931193 0.9930556 0.9862069 0.9896194 Fold09
## 10 1.0000000 1.0000000 1.0000000 1.0000000 Fold10
names(fold_metrics)
## [1] "Accuracy.Accuracy" "Precision.Precision" "Recall.Recall"
## [4] "F1.F1" "Resample"
4b. Mean and standard deviation per metric
library(dplyr)
library(tidyr)
# Remove non-numeric columns (e.g., "Resample") and summarise
metrics_summary <- fold_metrics %>%
select(where(is.numeric)) %>% # keep only numeric metrics
summarise(across(
.cols = everything(),
.fns = list(mean = mean, sd = sd) # mean and sd for each metric
))
metrics_summary
## Accuracy.Accuracy_mean Accuracy.Accuracy_sd Precision.Precision_mean
## 1 0.9954249 0.002641858 0.9986111
## Precision.Precision_sd Recall.Recall_mean Recall.Recall_sd F1.F1_mean
## 1 0.002928035 0.987624 0.00709445 0.9930747
## F1.F1_sd
## 1 0.004007117
4c. Same information in a compact table (metric x mean x sd)
results_table <- metrics_summary %>%
pivot_longer(
cols = everything(),
names_to = c("Metric", ".value"), # split names by "_" into Metric / mean / sd
names_sep = "_"
)
results_table
## # A tibble: 4 × 3
## Metric mean sd
## <chr> <dbl> <dbl>
## 1 Accuracy.Accuracy 0.995 0.00264
## 2 Precision.Precision 0.999 0.00293
## 3 Recall.Recall 0.988 0.00709
## 4 F1.F1 0.993 0.00401
4d. Analysis of the results obtained with logistic regression.
cat("\nCross-Validation Summary (10-fold):\n",
"- Accuracy ~0.995 with very low variability → the model performs consistently well across folds.\n",
"- Precision ~0.999 → almost no active customers are misclassified as churn.\n",
"- Recall ~0.988 → the model successfully identifies most true churn cases.\n",
"- F1 ~0.993 → strong balance between precision and recall.\n",
"- Overall: results indicate a highly stable and well-generalized linear SVM model with minimal variance.\n",
sep = "")
##
## Cross-Validation Summary (10-fold):
## - Accuracy ~0.995 with very low variability → the model performs consistently well across folds.
## - Precision ~0.999 → almost no active customers are misclassified as churn.
## - Recall ~0.988 → the model successfully identifies most true churn cases.
## - F1 ~0.993 → strong balance between precision and recall.
## - Overall: results indicate a highly stable and well-generalized linear SVM model with minimal variance.
4e. Application of the resamples() method for both models
cv_models <- resamples(list(
Logistic = logit_cv,
SVM_Linear = svm_linear_cv
))
# Quick summary (mean, min, max, quantiles) for each metric & model
summary(cv_models)
##
## Call:
## summary.resamples(object = cv_models)
##
## Models: Logistic, SVM_Linear
## Number of resamples: 10
##
## Accuracy.Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## Logistic 0.9702517 0.9908257 0.9977143 0.9926805 1.0000000 1 0
## SVM_Linear 0.9931193 0.9931389 0.9942818 0.9954249 0.9977156 1 0
##
## F1.F1
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## Logistic 0.9540636 0.9862543 0.9965753 0.9888533 1.0000000 1 0
## SVM_Linear 0.9895470 0.9896194 0.9913375 0.9930747 0.9965576 1 0
##
## Precision.Precision
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## Logistic 0.9720280 0.9828767 0.993174 0.9909240 1 1 0
## SVM_Linear 0.9930556 1.0000000 1.000000 0.9986111 1 1 0
##
## Recall.Recall
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## Logistic 0.9310345 0.9896552 1.0000000 0.9869296 1.0000000 1 0
## SVM_Linear 0.9793103 0.9811408 0.9862069 0.9876240 0.9931389 1 0
Analysis of the results derived from the application of the resamples() method.
cat("\nCross-Validation Comparison (Logistic vs SVM Linear):\n",
"- Both models show very high and stable performance across folds.\n",
"- Logistic Regression: slightly higher Accuracy mean (~0.9927) but more variability than SVM.\n",
"- SVM Linear: best overall mean Accuracy (~0.9954) and more consistent fold-to-fold performance.\n",
"- F1-score: Logistic (~0.9889) strong, but SVM slightly higher (~0.9933), showing better balance.\n",
"- Precision: both models near perfect; Logistic ~0.9909, SVM ~0.9986 (SVM wins).\n",
"- Recall: Logistic ~0.9867, SVM ~0.9876 (very similar, SVM marginally higher).\n",
"- Overall conclusion: SVM Linear is the most consistently balanced and robust across CV folds, \n",
" while Logistic Regression also performs exceptionally but with slightly greater variability.\n",
sep = "")
##
## Cross-Validation Comparison (Logistic vs SVM Linear):
## - Both models show very high and stable performance across folds.
## - Logistic Regression: slightly higher Accuracy mean (~0.9927) but more variability than SVM.
## - SVM Linear: best overall mean Accuracy (~0.9954) and more consistent fold-to-fold performance.
## - F1-score: Logistic (~0.9889) strong, but SVM slightly higher (~0.9933), showing better balance.
## - Precision: both models near perfect; Logistic ~0.9909, SVM ~0.9986 (SVM wins).
## - Recall: Logistic ~0.9867, SVM ~0.9876 (very similar, SVM marginally higher).
## - Overall conclusion: SVM Linear is the most consistently balanced and robust across CV folds,
## while Logistic Regression also performs exceptionally but with slightly greater variability.
Generation of the comparative boxplot chart.
bwplot(cv_models) # boxplots per metric
Analysis of the boxplot chart of the models.
cat("\nCross-Validation Summary (10-fold):\n",
"- Accuracy ~0.995 with very low variability → the model performs consistently well across folds.\n",
"- Precision ~0.999 → almost no active customers are misclassified as churn.\n",
"- Recall ~0.988 → the model successfully identifies most true churn cases.\n",
"- F1 ~0.993 → strong balance between precision and recall.\n",
"- Overall: results indicate a highly stable and well-generalized linear SVM model with minimal variance.\n",
sep = "")
##
## Cross-Validation Summary (10-fold):
## - Accuracy ~0.995 with very low variability → the model performs consistently well across folds.
## - Precision ~0.999 → almost no active customers are misclassified as churn.
## - Recall ~0.988 → the model successfully identifies most true churn cases.
## - F1 ~0.993 → strong balance between precision and recall.
## - Overall: results indicate a highly stable and well-generalized linear SVM model with minimal variance.
library(dplyr)
library(caret)
library(glmnet) # for glmnet backend
## Cargando paquete requerido: Matrix
##
## Adjuntando el paquete: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loaded glmnet 4.1-10
library(e1071) # required for SVM in caret
rfm_class <- rfm %>%
select(recency_days, frequency, monetary_total, monetary_avg, monetary_min, churn)
head(rfm_class)
## # A tibble: 6 × 6
## recency_days frequency monetary_total monetary_avg monetary_min churn
## <dbl> <int> <dbl> <dbl> <dbl> <int>
## 1 325 2 0 0 -77184. 1
## 2 2 7 4310 23.7 5.04 0
## 3 75 4 1797. 58.0 13.2 0
## 4 18 1 1758. 24.1 6.64 0
## 5 310 1 334. 19.7 8.5 1
## 6 36 11 1545. 16.3 -376. 0
# Ensure churn is a factor and set positive class = "1"
rfm_class$churn <- factor(rfm_class$churn, levels = c(0, 1))
my_summary <- function(data, lev = NULL, model = NULL) {
cm <- confusionMatrix(
data = data$pred,
reference = data$obs,
positive = lev[2] # use "1" as positive class
)
c(
Accuracy = cm$overall["Accuracy"],
Precision = cm$byClass["Precision"],
Recall = cm$byClass["Recall"],
F1 = cm$byClass["F1"]
)
}
set.seed(123)
ctrl <- trainControl(
method = "repeatedcv",
number = 5,
repeats = 3,
savePredictions = "final",
classProbs = FALSE,
summaryFunction = my_summary
)
4.1 Grid for glmnet (elastic-net logistic regression)
grid_glmnet <- expand.grid(
alpha = seq(0, 1, by = 0.25), # mixing parameter (ridge → lasso)
lambda = 10 ^ seq(-4, 1, length.out = 10) # regularization strength
)
4.2 Grid for SVM with radial kernel (C and sigma)
grid_svm <- expand.grid(
C = c(0.1, 1, 10, 100),
sigma = c(0.001, 0.01, 0.05, 0.1)
)
5.1 Logistic regression via glmnet
set.seed(123)
fit_glmnet <- train(
churn ~ .,
data = rfm_class,
method = "glmnet",
tuneGrid = grid_glmnet,
metric = "F1", # select best tune based on F1-score
maximize = TRUE,
trControl = ctrl
)
5.2 SVM with radial basis kernel
set.seed(123)
fit_svm_radial <- train(
churn ~ .,
data = rfm_class,
method = "svmRadial",
tuneGrid = grid_svm,
metric = "F1", # select best tune based on F1-score
maximize = TRUE,
trControl = ctrl
)
# Best hyperparameters based on F1-score
fit_glmnet$bestTune
## alpha lambda
## 42 1 0.0003593814
fit_svm_radial$bestTune
## sigma C
## 15 0.05 100
# Full tuning results (one row per hyperparameter combination)
head(fit_glmnet$results)
## alpha lambda Accuracy.Accuracy Precision.Precision Recall.Recall
## 1 0 0.0001000000 0.9374051 0.9997245 0.8120149
## 2 0 0.0003593814 0.9374051 0.9997245 0.8120149
## 3 0 0.0012915497 0.9374051 0.9997245 0.8120149
## 4 0 0.0046415888 0.9374051 0.9997245 0.8120149
## 5 0 0.0166810054 0.9374051 0.9997245 0.8120149
## 6 0 0.0599484250 0.9260465 0.9997089 0.7778623
## F1.F1 Accuracy.AccuracySD Precision.PrecisionSD Recall.RecallSD
## 1 0.8960064 0.006864509 0.001066938 0.02079821
## 2 0.8960064 0.006864509 0.001066938 0.02079821
## 3 0.8960064 0.006864509 0.001066938 0.02079821
## 4 0.8960064 0.006864509 0.001066938 0.02079821
## 5 0.8960064 0.006864509 0.001066938 0.02079821
## 6 0.8747243 0.008480456 0.001127506 0.02551633
## F1.F1SD
## 1 0.01265890
## 2 0.01265890
## 3 0.01265890
## 4 0.01265890
## 5 0.01265890
## 6 0.01614326
head(fit_svm_radial$results)
## C sigma Accuracy.Accuracy Precision.Precision Recall.Recall F1.F1
## 1 0.1 0.001 0.7382581 1.0000000 0.2129676 0.3504579
## 5 1.0 0.001 0.9292478 0.9988602 0.7881795 0.8809092
## 9 10.0 0.001 0.9736960 0.9992609 0.9215942 0.9587909
## 13 100.0 0.001 0.9898590 0.9992965 0.9701955 0.9845031
## 2 0.1 0.010 0.9277992 1.0000000 0.7829063 0.8780529
## 6 1.0 0.010 0.9729333 0.9994978 0.9190718 0.9575303
## Accuracy.AccuracySD Precision.PrecisionSD Recall.RecallSD F1.F1SD
## 1 0.008515557 0.000000000 0.02580373 0.034928836
## 5 0.007682144 0.001956781 0.02358191 0.014541451
## 9 0.005096753 0.001530272 0.01555409 0.008319426
## 13 0.003402062 0.001456376 0.01037370 0.005271209
## 2 0.007849565 0.000000000 0.02354712 0.014862256
## 6 0.005532224 0.001325480 0.01633389 0.009033237
library(caret)
library(glmnet)
set.seed(123)
logit_model <- train(
churn ~ .,
data = train,
method = "glmnet",
family = "binomial",
trControl = trainControl(method = "none"),
# Set the exact hyperparameters you requested
tuneGrid = expand.grid(
alpha = 1,
lambda = 0.0003593814
)
)
# Predict classes
logit_pred <- predict(logit_model, newdata = test)
head(logit_pred)
## [1] 0 0 1 0 1 1
## Levels: 0 1
7a. Obtaining the confusion matrix.
logit_cm <- confusionMatrix(logit_pred, y_test, positive = "1")
logit_cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 875 0
## 1 0 436
##
## Accuracy : 1
## 95% CI : (0.9972, 1)
## No Information Rate : 0.6674
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.3326
## Detection Rate : 0.3326
## Detection Prevalence : 0.3326
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : 1
##
7b Confusion matrix metrics.
# Accuracy
logit_accuracy <- logit_cm$overall["Accuracy"]
# Precision, Recall, F1
logit_precision <- logit_cm$byClass["Precision"]
logit_recall <- logit_cm$byClass["Recall"]
logit_f1 <- logit_cm$byClass["F1"]
print(logit_accuracy)
## Accuracy
## 1
print(logit_precision)
## Precision
## 1
print(logit_recall)
## Recall
## 1
print(logit_f1)
## F1
## 1
7c. Analysis of the results obtained with logistic regression.
cat("\nLogistic Regression (alpha=1, lambda=0.000359) Summary:\n",
"- Perfect classification: Accuracy = 1.00 with zero errors.\n",
"- Sensitivity and Specificity both = 1.00 → no churners or non-churners misclassified.\n",
"- Precision and Recall = 1.00 → flawless detection of positive and negative classes.\n",
"- Balanced Accuracy = 1.00 → ideal performance across both classes.\n",
"- Overall: the model achieves perfect separation on the test set.\n",
sep = "")
##
## Logistic Regression (alpha=1, lambda=0.000359) Summary:
## - Perfect classification: Accuracy = 1.00 with zero errors.
## - Sensitivity and Specificity both = 1.00 → no churners or non-churners misclassified.
## - Precision and Recall = 1.00 → flawless detection of positive and negative classes.
## - Balanced Accuracy = 1.00 → ideal performance across both classes.
## - Overall: the model achieves perfect separation on the test set.
library(caret)
library(e1071)
set.seed(123)
svm_radial_model <- train(
churn ~ .,
data = train,
method = "svmRadial", # radial kernel
tuneGrid = expand.grid(
C = 100, # cost parameter
sigma = 0.05 # radial kernel width
),
trControl = trainControl(method = "none")
)
# Predict classes
svm_radial_pred <- predict(svm_radial_model, newdata = test)
head(svm_radial_pred)
## [1] 0 0 1 0 1 1
## Levels: 0 1
8a.Obtaining the confusion matrix.
svm_cm <- confusionMatrix(svm_radial_pred, y_test, positive = "1")
svm_cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 872 2
## 1 3 434
##
## Accuracy : 0.9962
## 95% CI : (0.9911, 0.9988)
## No Information Rate : 0.6674
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9914
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9954
## Specificity : 0.9966
## Pos Pred Value : 0.9931
## Neg Pred Value : 0.9977
## Prevalence : 0.3326
## Detection Rate : 0.3310
## Detection Prevalence : 0.3333
## Balanced Accuracy : 0.9960
##
## 'Positive' Class : 1
##
8b Confusion matrix metrics.
# Accuracy
svm_accuracy <- svm_cm$overall["Accuracy"]
# Precision, Recall, F1
svm_precision <- svm_cm$byClass["Precision"]
svm_recall <- svm_cm$byClass["Recall"]
svm_f1 <- svm_cm$byClass["F1"]
print(svm_accuracy)
## Accuracy
## 0.9961861
print(svm_precision)
## Precision
## 0.993135
print(svm_recall)
## Recall
## 0.9954128
print(svm_f1)
## F1
## 0.9942726
8c. Analysis of the results obtained with the SVM model using a radial kernel.
cat("\nSVM Radial (C=100, sigma=0.05) Summary:\n",
"- Accuracy ~0.9962 → extremely high overall performance with very few errors.\n",
"- Sensitivity ~0.9954 → the model correctly identifies almost all churners.\n",
"- Specificity ~0.9966 → very strong detection of non-churn customers.\n",
"- Precision ~0.9931 and Recall ~0.9954 → balanced and reliable prediction quality.\n",
"- Only 5 misclassifications (2 FN, 3 FP) → minimal business impact.\n",
"- Balanced Accuracy ~0.9960 → robust performance across both classes.\n",
"- Overall: the radial SVM achieves near-perfect classification with excellent stability.\n",
sep = "")
##
## SVM Radial (C=100, sigma=0.05) Summary:
## - Accuracy ~0.9962 → extremely high overall performance with very few errors.
## - Sensitivity ~0.9954 → the model correctly identifies almost all churners.
## - Specificity ~0.9966 → very strong detection of non-churn customers.
## - Precision ~0.9931 and Recall ~0.9954 → balanced and reliable prediction quality.
## - Only 5 misclassifications (2 FN, 3 FP) → minimal business impact.
## - Balanced Accuracy ~0.9960 → robust performance across both classes.
## - Overall: the radial SVM achieves near-perfect classification with excellent stability.
results <- data.frame(
Model = c("Logistic Regression", "SVM Radial"),
Accuracy = c(logit_accuracy, svm_accuracy),
Precision = c(logit_precision, svm_precision),
Recall = c(logit_recall, svm_recall),
F1_Score = c(logit_f1, svm_f1)
)
print(results)
## Model Accuracy Precision Recall F1_Score
## 1 Logistic Regression 1.0000000 1.000000 1.0000000 1.0000000
## 2 SVM Radial 0.9961861 0.993135 0.9954128 0.9942726
9.a Analysis of the consolidated results.
cat("\nModel Comparison Summary:\n",
"1. With tuned hyperparameters, Logistic Regression achieves perfect performance (Accuracy, Precision, Recall, F1 = 1.00).\n",
"2. The tuned SVM Radial model performs extremely well (Accuracy ~0.996), but remains slightly below Logistic Regression.\n",
"3. The optimized SVM Radial offers a balanced and robust alternative, with only marginal losses compared to the perfect Logistic model.\n",
"4. When comparing to non-tuned models, both Logistic Regression and SVM Linear show a small but clear improvement after hyperparameter optimization.\n",
"5. Overall, hyperparameter tuning boosts performance and stability, with the tuned Logistic model being the top performer across all metrics.\n",
"6. Although all models are strong candidates for production, the tuned Logistic Regression model stands out as the only one achieving perfect predictive performance.\n",
sep = "")
##
## Model Comparison Summary:
## 1. With tuned hyperparameters, Logistic Regression achieves perfect performance (Accuracy, Precision, Recall, F1 = 1.00).
## 2. The tuned SVM Radial model performs extremely well (Accuracy ~0.996), but remains slightly below Logistic Regression.
## 3. The optimized SVM Radial offers a balanced and robust alternative, with only marginal losses compared to the perfect Logistic model.
## 4. When comparing to non-tuned models, both Logistic Regression and SVM Linear show a small but clear improvement after hyperparameter optimization.
## 5. Overall, hyperparameter tuning boosts performance and stability, with the tuned Logistic model being the top performer across all metrics.
## 6. Although all models are strong candidates for production, the tuned Logistic Regression model stands out as the only one achieving perfect predictive performance.