Gabriel Elías Chanchí Golondrino
Tarea 4
Business Intelligence<> Presentado al profesor: Sixyel Jeyson Castaneda Coronado

Exercise 1

The development of Exercise 1 is presented below.

  1. Installation of the required packages.
pkgs <- c("tidyverse", "lubridate", "janitor", "scales", "gridExtra", "car", "caret")

to_install <- setdiff(pkgs, rownames(installed.packages()))
if (length(to_install)) install.packages(to_install, dependencies = TRUE)

invisible(lapply(pkgs, library, character.only = TRUE))
  1. Import CSV and inspect
data_path <- "Appendix 3 - Database.csv"
df_raw <- read_csv(data_path, show_col_types = FALSE) %>% clean_names()
head(df_raw)
## # A tibble: 6 × 8
##   invoice_no stock_code description quantity invoice_date unit_price customer_id
##   <chr>      <chr>      <chr>          <dbl> <chr>             <dbl>       <dbl>
## 1 536365     85123A     WHITE HANG…        6 12/1/2010 8…       2.55       17850
## 2 536365     71053      WHITE META…        6 12/1/2010 8…       3.39       17850
## 3 536365     84406B     CREAM CUPI…        8 12/1/2010 8…       2.75       17850
## 4 536365     84029G     KNITTED UN…        6 12/1/2010 8…       3.39       17850
## 5 536365     84029E     RED WOOLLY…        6 12/1/2010 8…       3.39       17850
## 6 536365     22752      SET 7 BABU…        2 12/1/2010 8…       7.65       17850
## # ℹ 1 more variable: country <chr>
  1. Robust date parsing + base cleaning
df <- df_raw %>%
  mutate(
    # Parse flexible date-time formats (prevents negative recency bugs)
    invoice_date = parse_date_time(
      invoice_date,
      orders = c("dmy HMS", "dmy HM", "dmy",
                 "mdy HMS", "mdy HM", "mdy",
                 "Ymd HMS", "Ymd HM", "Ymd"),
      tz = "UTC"
    ),
    invoice_date = as_datetime(invoice_date)
  ) %>%
  
  # Keep only rows with a customer id and a valid date
  filter(!is.na(customer_id), !is.na(invoice_date)) %>%
  
  # Basic data sanity: drop zero-quantity rows and negative prices
  filter(quantity != 0, unit_price >= 0) %>%
  
  # IDs are identifiers, not numbers → keep as character
  mutate(
    customer_id = as.character(customer_id),
    customer_uid = customer_id,        # canonical id
    amount = quantity * unit_price     # line value
  )

# Safety check: analysis date = max observed date (prevents hardcoding)
analysis_date <- as_date(max(df$invoice_date, na.rm = TRUE))
head(df)
## # A tibble: 6 × 10
##   invoice_no stock_code description      quantity invoice_date        unit_price
##   <chr>      <chr>      <chr>               <dbl> <dttm>                   <dbl>
## 1 536365     85123A     WHITE HANGING H…        6 2010-12-01 08:26:00       2.55
## 2 536365     71053      WHITE METAL LAN…        6 2010-12-01 08:26:00       3.39
## 3 536365     84406B     CREAM CUPID HEA…        8 2010-12-01 08:26:00       2.75
## 4 536365     84029G     KNITTED UNION F…        6 2010-12-01 08:26:00       3.39
## 5 536365     84029E     RED WOOLLY HOTT…        6 2010-12-01 08:26:00       3.39
## 6 536365     22752      SET 7 BABUSHKA …        2 2010-12-01 08:26:00       7.65
## # ℹ 4 more variables: customer_id <chr>, country <chr>, customer_uid <chr>,
## #   amount <dbl>
  1. Aggregate to customer level (RFM)
rfm <- df %>%
  group_by(customer_uid) %>%
  summarise(
    last_purchase  = max(invoice_date, na.rm = TRUE),                    # last tx
    recency_days   = as.numeric(analysis_date - as_date(last_purchase)), # recency
    frequency      = n_distinct(invoice_no),                             # #invoices
    monetary_total = sum(amount, na.rm = TRUE),                          # total spend
    monetary_avg   = mean(amount, na.rm = TRUE),                         # avg line
    monetary_min   = min(amount, na.rm = TRUE),                          # min line
    .groups = "drop"
  ) %>%
  
  # Guard against any negative due to oddities (clip to 0)
  mutate(recency_days = pmax(recency_days, 0))

head(rfm)
## # A tibble: 6 × 7
##   customer_uid last_purchase       recency_days frequency monetary_total
##   <chr>        <dttm>                     <dbl>     <int>          <dbl>
## 1 12346        2011-01-18 10:17:00          325         2             0 
## 2 12347        2011-12-07 15:52:00            2         7          4310 
## 3 12348        2011-09-25 13:13:00           75         4          1797.
## 4 12349        2011-11-21 09:51:00           18         1          1758.
## 5 12350        2011-02-02 16:01:00          310         1           334.
## 6 12352        2011-11-03 14:37:00           36        11          1545.
## # ℹ 2 more variables: monetary_avg <dbl>, monetary_min <dbl>
  1. Create churn variable (1 = inactive in last 90 days)
rfm <- rfm %>%
  mutate(churn = if_else(recency_days > 90, 1L, 0L))

# Overall churn rate
churn_rate <- mean(rfm$churn)
cat("Churn rate:", percent(churn_rate), "\n")
## Churn rate: 33%
head(rfm$churn)
## [1] 1 0 0 0 1 0
  1. Descriptive EDA Numeric summary (good for detecting scale/outliers)
print(summary(select(rfm, recency_days, frequency, monetary_total,
                     monetary_avg, monetary_min, churn)))
##   recency_days      frequency       monetary_total      monetary_avg     
##  Min.   :  0.00   Min.   :  1.000   Min.   : -4287.6   Min.   :-4287.63  
##  1st Qu.: 16.00   1st Qu.:  1.000   1st Qu.:   293.4   1st Qu.:   10.99  
##  Median : 50.00   Median :  3.000   Median :   648.1   Median :   16.92  
##  Mean   : 91.58   Mean   :  5.075   Mean   :  1898.5   Mean   :   28.84  
##  3rd Qu.:143.00   3rd Qu.:  5.000   3rd Qu.:  1611.7   3rd Qu.:   23.54  
##  Max.   :373.00   Max.   :248.000   Max.   :279489.0   Max.   : 3861.00  
##   monetary_min            churn       
##  Min.   :-168469.60   Min.   :0.0000  
##  1st Qu.:    -11.50   1st Qu.:0.0000  
##  Median :      0.79   Median :0.0000  
##  Mean   :    -86.43   Mean   :0.3326  
##  3rd Qu.:      6.96   3rd Qu.:1.0000  
##  Max.   :   3861.00   Max.   :1.0000

Display of histograms for churn analysis.

# Hist: Recency (colored by churn)
p1 <- ggplot(rfm, aes(x = recency_days, fill = factor(churn))) +
  geom_histogram(bins = 30, alpha = 0.7, position = "identity") +
  scale_fill_manual(values = c("steelblue", "tomato"),
                    name = "Churn", labels = c("Active (0)", "Churned (1)")) +
  labs(title = "Distribution of Recency (days)",
       x = "Days since last purchase", y = "Customers")

# Hist: Frequency (clip x-axis at 99th percentile to tame long tail)
p2 <- ggplot(rfm, aes(x = frequency, fill = factor(churn))) +
  geom_histogram(bins = 30, alpha = 0.7, position = "identity") +
  scale_x_continuous(limits = c(0, quantile(rfm$frequency, 0.99))) +
  scale_fill_manual(values = c("steelblue", "tomato"),
                    name = "Churn", labels = c("Active (0)", "Churned (1)")) +
  labs(title = "Frequency of Transactions",
       x = "Unique invoices", y = "Customers")

# Density: Monetary total (by churn)
p3 <- ggplot(rfm, aes(x = monetary_total, fill = factor(churn))) +
  geom_density(alpha = 0.6) +
  scale_x_continuous(labels = dollar_format(prefix = "£")) +
  scale_fill_manual(values = c("steelblue", "tomato"),
                    name = "Churn", labels = c("Active (0)", "Churned (1)")) +
  labs(title = "Density of Total Monetary Value",
       x = "Total spend (£)", y = "Density")

# Bar: Class balance (churn distribution)
p4 <- ggplot(rfm, aes(x = factor(churn))) +
  geom_bar(fill = "skyblue") +
  geom_text(stat = "count", aes(label = ..count..), vjust = -0.5) +
  labs(title = "Churn Distribution (Class Balance)",
       x = "Churn (1 = inactive, 0 = active)", y = "Customers")

# Show a 2x2 grid of the above plots
gridExtra::grid.arrange(p1, p2, p3, p4, ncol = 2)

Boxplots to visually spot outliers in RFM

rfm_long <- rfm %>%
  select(recency_days, frequency, monetary_total) %>%
  pivot_longer(everything(), names_to = "metric", values_to = "value")

ggplot(rfm_long, aes(x = metric, y = value, fill = metric)) +
  geom_boxplot(alpha = 0.6, outlier.alpha = 0.5) +
  scale_y_continuous(labels = comma) +
  labs(title = "Boxplots of RFM Variables (Outlier Check)",
       x = "Metric", y = "Value") +
  theme(legend.position = "none")

  1. Key findings (print concise summary for the report)
cat(
  "\nKey findings:\n",
  "- Churn rate: ", percent(churn_rate), " of customers have not purchased in the last 90 days.\n",
  "- Recency is now correctly computed (0 = purchased on last day; higher = older purchase).\n",
  "- Frequency and Monetary show heavy right tails (a few large/wholesale buyers).\n",
  "- Boxplots confirm outliers in spend/frequency; consider winsorizing for modeling.\n",
  sep = ""
)
## 
## Key findings:
## - Churn rate: 33% of customers have not purchased in the last 90 days.
## - Recency is now correctly computed (0 = purchased on last day; higher = older purchase).
## - Frequency and Monetary show heavy right tails (a few large/wholesale buyers).
## - Boxplots confirm outliers in spend/frequency; consider winsorizing for modeling.

Optional: save for next exercises (modeling, correlation, etc.)

write_csv(rfm, "rfm_churn_dataset.csv")
cat("\nSaved: rfm_churn_dataset.csv (customer-level RFM + churn)\n")
## 
## Saved: rfm_churn_dataset.csv (customer-level RFM + churn)

Exercise 2 – Unsupervised Clustering and Segment Characterization

Objective: 1) Standardize numeric RFM variables with scale()
2) Apply K-means clustering (start with k = 3)
3) Determine optimal k using:
- Elbow method (factoextra::fviz_nbclust)
- Silhouette method
4) Retrain K-means with optimal k and visualize results
5) Perform Hierarchical Clustering (hclust) and compare
6) Describe clusters: size, average RFM values, key differences

  1. Load packages
pkgs <- c("tidyverse", "factoextra", "cluster", "gridExtra")
to_install <- setdiff(pkgs, rownames(installed.packages()))
if (length(to_install)) install.packages(to_install)
invisible(lapply(pkgs, library, character.only = TRUE))
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
  1. Prepare data for clustering
# Select only numerical RFM variables (excluding churn)
rfm_vars <- rfm %>%
  select(recency_days, frequency, monetary_total, monetary_avg, monetary_min)

# Standardize using scale()
rfm_scaled <- scale(rfm_vars)

# Check result: means ~ 0, SD ~ 1
summary(as.data.frame(rfm_scaled))
##   recency_days       frequency         monetary_total      monetary_avg      
##  Min.   :-0.9088   Min.   :-0.436405   Min.   :-0.75263   Min.   :-33.90148  
##  1st Qu.:-0.7500   1st Qu.:-0.436405   1st Qu.:-0.19528   1st Qu.: -0.14022  
##  Median :-0.4126   Median :-0.222244   Median :-0.15213   Median : -0.09362  
##  Mean   : 0.0000   Mean   : 0.000000   Mean   : 0.00000   Mean   :  0.00000  
##  3rd Qu.: 0.5102   3rd Qu.:-0.008082   3rd Qu.:-0.03488   3rd Qu.: -0.04160  
##  Max.   : 2.7926   Max.   :26.012519   Max.   :33.77283   Max.   : 30.09773  
##   monetary_min      
##  Min.   :-58.53958  
##  1st Qu.:  0.02605  
##  Median :  0.03032  
##  Mean   :  0.00000  
##  3rd Qu.:  0.03247  
##  Max.   :  1.37235
  1. K-means clustering
set.seed(123)   # for reproducibility
k_init <- 3
kmeans_model <- kmeans(rfm_scaled, centers = k_init, nstart = 25)

# Add cluster labels to dataset
rfm_kmeans <- rfm %>%
  mutate(cluster_kmeans = factor(kmeans_model$cluster))
head(rfm_kmeans$cluster_kmeans)
## [1] 2 3 3 3 2 3
## Levels: 1 2 3
  1. Determine optimal k 4a. Elbow Method (total within-cluster sum of squares)v
fviz_nbclust(rfm_scaled, kmeans, method = "wss") +
  labs(title = "Elbow Method – Optimal Number of Clusters (K-means)")

4b. Silhouette Method

fviz_nbclust(rfm_scaled, kmeans, method = "silhouette") +
  labs(title = "Silhouette Method – Optimal Number of Clusters (K-means)")

After checking both plots visually, choose the best k (5)

optimal_k <- 5   # adjust this after observing the plots

# Retrain K-means with optimal k
set.seed(123)
kmeans_final <- kmeans(rfm_scaled, centers = optimal_k, nstart = 25)

# Add new cluster assignments
rfm_kmeans <- rfm_kmeans %>%
  mutate(cluster_final = factor(kmeans_final$cluster))
head(rfm_kmeans$cluster_final)
## [1] 3 4 4 4 1 4
## Levels: 1 2 3 4 5
  1. Visualize K-means clusters
# Use first two principal components for visualization
fviz_cluster(kmeans_final, data = rfm_scaled,
             geom = "point", ellipse.type = "norm",
             ggtheme = theme_minimal(),
             main = "K-means Clustering (after scaling)")
## Too few points to calculate an ellipse

Scatterplot using ggplot2 (colored by cluster)

ggplot(rfm_kmeans, aes(x = frequency, y = monetary_total, color = cluster_final)) +
  geom_point(alpha = 0.7) +
  scale_color_brewer(palette = "Set1") +
  labs(title = "K-means Clusters by Frequency and Monetary Value",
       x = "Frequency (transactions)", y = "Total Spend (£)")

  1. Cluster profiling – mean values per cluster
cluster_profile <- rfm_kmeans %>%
  group_by(cluster_final) %>%
  summarise(
    n_customers     = n(),
    avg_recency     = mean(recency_days, na.rm = TRUE),
    avg_frequency   = mean(frequency, na.rm = TRUE),
    avg_monetary    = mean(monetary_total, na.rm = TRUE),
    avg_monetary_avg = mean(monetary_avg, na.rm = TRUE),
    churn_rate      = mean(churn, na.rm = TRUE)
  )

print(cluster_profile)
## # A tibble: 5 × 7
##   cluster_final n_customers avg_recency avg_frequency avg_monetary
##   <fct>               <int>       <dbl>         <dbl>        <dbl>
## 1 1                    1105      246.            1.85       439.  
## 2 2                      26        5.35         83.3      75966.  
## 3 3                       2      162.            2.5          1.45
## 4 4                    3233       39.6           5.55      1791.  
## 5 5                       6       89.5           2.17      8278.  
## # ℹ 2 more variables: avg_monetary_avg <dbl>, churn_rate <dbl>
library(kableExtra)
## 
## Adjuntando el paquete: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
kable(cluster_profile)
cluster_final n_customers avg_recency avg_frequency avg_monetary avg_monetary_avg churn_rate
1 1105 245.605430 1.851584 439.3184 21.09466 1.0000000
2 26 5.346154 83.346154 75966.3873 149.33882 0.0000000
3 2 162.500000 2.500000 1.4500 0.36250 0.5000000
4 3233 39.591092 5.554903 1790.8511 26.41538 0.1070213
5 6 89.500000 2.166667 8278.0833 2248.35074 0.3333333
  1. Hierarchical Clustering (Agglomerative)
# Compute Euclidean distance matrix on scaled data
dist_matrix <- dist(rfm_scaled, method = "euclidean")

# Apply hierarchical clustering (Ward's method = minimizes variance)
hc <- hclust(dist_matrix, method = "ward.D2")

# Plot dendrogram
plot(hc, labels = FALSE, hang = -1, main = "Hierarchical Clustering Dendrogram")

# Cut the tree at the same number of clusters as K-means for comparison
rfm_hclust <- rfm %>%
  mutate(cluster_hclust = factor(cutree(hc, k = optimal_k)))
head(rfm_hclust)
## # A tibble: 6 × 9
##   customer_uid last_purchase       recency_days frequency monetary_total
##   <chr>        <dttm>                     <dbl>     <int>          <dbl>
## 1 12346        2011-01-18 10:17:00          325         2             0 
## 2 12347        2011-12-07 15:52:00            2         7          4310 
## 3 12348        2011-09-25 13:13:00           75         4          1797.
## 4 12349        2011-11-21 09:51:00           18         1          1758.
## 5 12350        2011-02-02 16:01:00          310         1           334.
## 6 12352        2011-11-03 14:37:00           36        11          1545.
## # ℹ 4 more variables: monetary_avg <dbl>, monetary_min <dbl>, churn <int>,
## #   cluster_hclust <fct>
  1. Compare clustering solutions
p_kmeans <- ggplot(rfm_kmeans, aes(x = frequency, y = monetary_total, color = cluster_final)) +
  geom_point(alpha = 0.7) +
  labs(title = "K-means Clustering", x = "Frequency", y = "Total Spend (£)") +
  theme_minimal()

p_hclust <- ggplot(rfm_hclust, aes(x = frequency, y = monetary_total, color = cluster_hclust)) +
  geom_point(alpha = 0.7) +
  labs(title = "Hierarchical Clustering", x = "Frequency", y = "Total Spend (£)") +
  theme_minimal()

gridExtra::grid.arrange(p_kmeans, p_hclust, ncol = 2)

  1. Concise interpretation (printed summary)
cat("\nSegment notes (k=5):\n",
    "- C1: highest avg_monetary & frequency, low recency → core loyal/wholesale.\n",
    "- C2: medium spend, recent, moderate frequency → regulars with growth potential.\n",
    "- C3: low spend, high recency → inactive/at-risk.\n",
    "- C4: low frequency but decent ticket → infrequent, higher-value orders.\n",
    "- C5: very low spend & frequency, mixed recency → one-time/near-lost.\n",
    sep = "")
## 
## Segment notes (k=5):
## - C1: highest avg_monetary & frequency, low recency → core loyal/wholesale.
## - C2: medium spend, recent, moderate frequency → regulars with growth potential.
## - C3: low spend, high recency → inactive/at-risk.
## - C4: low frequency but decent ticket → infrequent, higher-value orders.
## - C5: very low spend & frequency, mixed recency → one-time/near-lost.
# Save for your report if needed
write_csv(cluster_profile, "customer_segments_summary_k5.csv")

Exercise 3 – Initial Supervised Modeling and Confusion Matrix

  1. The variables of interest are selected
rfm_class <- rfm %>%
  select(recency_days, frequency, monetary_total, monetary_avg, monetary_min, churn)
head(rfm_class)
## # A tibble: 6 × 6
##   recency_days frequency monetary_total monetary_avg monetary_min churn
##          <dbl>     <int>          <dbl>        <dbl>        <dbl> <int>
## 1          325         2             0           0      -77184.       1
## 2            2         7          4310          23.7         5.04     0
## 3           75         4          1797.         58.0        13.2      0
## 4           18         1          1758.         24.1         6.64     0
## 5          310         1           334.         19.7         8.5      1
## 6           36        11          1545.         16.3      -376.       0
  1. Separation of the training and testing datasets.
#Installation of the required packages.
pkgs <- c("caret", "dplyr", "e1071")
to_install <- setdiff(pkgs, rownames(installed.packages()))
if (length(to_install)) install.packages(to_install, dependencies = TRUE)
invisible(lapply(pkgs, library, character.only = TRUE))
## 
## Adjuntando el paquete: 'e1071'
## The following object is masked from 'package:ggplot2':
## 
##     element
#Ensure churn is a factor (recommended for stratified sampling)
rfm_class$churn <- as.factor(rfm_class$churn)

set.seed(123)  # for reproducibility

#Create stratified training partition (70% of the data)
idx_train <- createDataPartition(rfm_class$churn, p = 0.7, list = FALSE)

#Train and test datasets
train <- rfm_class[idx_train, ]
test  <- rfm_class[-idx_train, ]

# Ensure target variable is a factor
train$churn <- as.factor(train$churn)
test$churn  <- as.factor(test$churn)

# Define X and y
x_train <- train %>% select(-churn)
y_train <- train$churn

x_test  <- test %>% select(-churn)
y_test  <- test$churn
head(x_train)
## # A tibble: 6 × 5
##   recency_days frequency monetary_total monetary_avg monetary_min
##          <dbl>     <int>          <dbl>        <dbl>        <dbl>
## 1          325         2             0           0      -77184.  
## 2            2         7          4310          23.7         5.04
## 3           75         4          1797.         58.0        13.2 
## 4          310         1           334.         19.7         8.5 
## 5           36        11          1545.         16.3      -376.  
## 6          204         1            89          22.2        11.6
  1. Train Logistic Regression Model
set.seed(123)

logit_model <- train(
  churn ~ ., 
  data = train,
  method = "glm",
  family = "binomial",
  trControl = trainControl(method = "none")
)

# Predict classes
logit_pred <- predict(logit_model, newdata = test)
head(logit_pred)
## [1] 0 0 1 0 1 1
## Levels: 0 1

2a) Confusion matrix and performance metrics.

logit_cm <- confusionMatrix(logit_pred, y_test, positive = "1")
logit_cm
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 875   1
##          1   0 435
##                                      
##                Accuracy : 0.9992     
##                  95% CI : (0.9958, 1)
##     No Information Rate : 0.6674     
##     P-Value [Acc > NIR] : <2e-16     
##                                      
##                   Kappa : 0.9983     
##                                      
##  Mcnemar's Test P-Value : 1          
##                                      
##             Sensitivity : 0.9977     
##             Specificity : 1.0000     
##          Pos Pred Value : 1.0000     
##          Neg Pred Value : 0.9989     
##              Prevalence : 0.3326     
##          Detection Rate : 0.3318     
##    Detection Prevalence : 0.3318     
##       Balanced Accuracy : 0.9989     
##                                      
##        'Positive' Class : 1          
## 

2b) The performance metrics are printed

# Accuracy
logit_accuracy <- logit_cm$overall["Accuracy"]

# Precision, Recall, F1
logit_precision <- logit_cm$byClass["Precision"]
logit_recall    <- logit_cm$byClass["Recall"]
logit_f1        <- logit_cm$byClass["F1"]

print(logit_accuracy)
##  Accuracy 
## 0.9992372
print(logit_precision)
## Precision 
##         1
print(logit_recall)
##    Recall 
## 0.9977064
print(logit_f1)
##        F1 
## 0.9988519

2c) Analysis of the results

    cat("\nConfusion Matrix Notes (Logistic Regression):\n",
    "- FN extremely low (only 1) → almost no true churners missed.\n",
    "- FP = 0 → model never flags active customers as churn.\n",
    "- Strong balance between FP & FN → minimal business risk in retention decisions.\n",
    "- Near-perfect accuracy and class separation → highly reliable churn predictor.\n",
    sep = "")
## 
## Confusion Matrix Notes (Logistic Regression):
## - FN extremely low (only 1) → almost no true churners missed.
## - FP = 0 → model never flags active customers as churn.
## - Strong balance between FP & FN → minimal business risk in retention decisions.
## - Near-perfect accuracy and class separation → highly reliable churn predictor.
  1. Train SVM Model (Linear Kernel)
set.seed(123)

svm_linear_model <- train(
  churn ~ .,
  data = train,
  method = "svmLinear",
  trControl = trainControl(method = "none")
)

# Predict classes
svm_pred <- predict(svm_linear_model, newdata = test)
head(svm_pred)
## [1] 0 0 1 0 1 1
## Levels: 0 1

3a) Confusion matrix and performance metrics.

svm_cm <- confusionMatrix(svm_pred, y_test, positive = "1")
svm_cm
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 873   6
##          1   2 430
##                                          
##                Accuracy : 0.9939         
##                  95% CI : (0.988, 0.9974)
##     No Information Rate : 0.6674         
##     P-Value [Acc > NIR] : <2e-16         
##                                          
##                   Kappa : 0.9862         
##                                          
##  Mcnemar's Test P-Value : 0.2888         
##                                          
##             Sensitivity : 0.9862         
##             Specificity : 0.9977         
##          Pos Pred Value : 0.9954         
##          Neg Pred Value : 0.9932         
##              Prevalence : 0.3326         
##          Detection Rate : 0.3280         
##    Detection Prevalence : 0.3295         
##       Balanced Accuracy : 0.9920         
##                                          
##        'Positive' Class : 1              
## 

3b) The performance metrics are printed

# Accuracy
svm_accuracy <- svm_cm$overall["Accuracy"]

# Precision, Recall, F1
svm_precision <- svm_cm$byClass["Precision"]
svm_recall    <- svm_cm$byClass["Recall"]
svm_f1        <- svm_cm$byClass["F1"]

print(svm_accuracy)
##  Accuracy 
## 0.9938978
print(svm_precision)
## Precision 
## 0.9953704
print(svm_recall)
##    Recall 
## 0.9862385
print(svm_f1)
##        F1 
## 0.9907834

3c) Analysis of the results

    cat("\nConfusion Matrix Notes (SVM Linear):\n",
    "- FN moderately low (6) → a few true churners missed.\n",
    "- FP very low (2) → minimal misclassification of active customers as churn.\n",
    "- Slightly higher FN than FP → greater risk of overlooking at-risk customers.\n",
    "- Accuracy and balance remain very high → strong, reliable classifier overall.\n",
    sep = "")
## 
## Confusion Matrix Notes (SVM Linear):
## - FN moderately low (6) → a few true churners missed.
## - FP very low (2) → minimal misclassification of active customers as churn.
## - Slightly higher FN than FP → greater risk of overlooking at-risk customers.
## - Accuracy and balance remain very high → strong, reliable classifier overall.

Summary of the results

results <- data.frame(
  Model = c("Logistic Regression", "SVM Linear"),
  Accuracy = c(logit_accuracy, svm_accuracy),
  Precision = c(logit_precision, svm_precision),
  Recall = c(logit_recall, svm_recall),
  F1_Score = c(logit_f1, svm_f1)
)

print(results)
##                 Model  Accuracy Precision    Recall  F1_Score
## 1 Logistic Regression 0.9992372 1.0000000 0.9977064 0.9988519
## 2          SVM Linear 0.9938978 0.9953704 0.9862385 0.9907834
  cat("\nModel Performance Summary:\n",
    "- Logistic Regression: highest accuracy (0.9992) and perfect precision (1.00), very strong recall and F1.\n",
    "- SVM Linear: slightly lower accuracy (0.9939) but still excellent precision, recall, and F1.\n",
    "- Overall: Logistic Regression is the most balanced and stable across all metrics.\n",
    sep = "")
## 
## Model Performance Summary:
## - Logistic Regression: highest accuracy (0.9992) and perfect precision (1.00), very strong recall and F1.
## - SVM Linear: slightly lower accuracy (0.9939) but still excellent precision, recall, and F1.
## - Overall: Logistic Regression is the most balanced and stable across all metrics.

Exercise 4 – Cross-Validation and Model Comparison

  1. The variables of interest are selected
rfm_class <- rfm %>%
  select(recency_days, frequency, monetary_total, monetary_avg, monetary_min, churn)
head(rfm_class)
## # A tibble: 6 × 6
##   recency_days frequency monetary_total monetary_avg monetary_min churn
##          <dbl>     <int>          <dbl>        <dbl>        <dbl> <int>
## 1          325         2             0           0      -77184.       1
## 2            2         7          4310          23.7         5.04     0
## 3           75         4          1797.         58.0        13.2      0
## 4           18         1          1758.         24.1         6.64     0
## 5          310         1           334.         19.7         8.5      1
## 6           36        11          1545.         16.3      -376.       0

Ensure churn is a factor and the positive class is “1”

rfm_class$churn <- factor(rfm_class$churn, levels = c(0, 1))
  1. Custom summary function (per fold metrics)
my_summary <- function(data, lev = NULL, model = NULL) {
  # data$obs: true labels, data$pred: predicted labels
  cm <- confusionMatrix(
    data = data$pred,
    reference = data$obs,
    positive = lev[2]   # use "1" as positive class
  )
  
  out <- c(
    Accuracy  = cm$overall["Accuracy"],
    Precision = cm$byClass["Precision"],
    Recall    = cm$byClass["Recall"],
    F1        = cm$byClass["F1"]
  )
  return(out)
}
  1. Train logistic regression with 10-fold CV
set.seed(123)

ctrl <- trainControl(
  method = "cv",
  number = 10,
  savePredictions = "final",  # keep fold-level predictions
  classProbs = FALSE,
  summaryFunction = my_summary
)

logit_cv <- train(
  churn ~ .,
  data = rfm_class,
  method = "glm",
  family = "binomial",
  metric = "Accuracy",
  trControl = ctrl
)

3a. Metrics for each fold

# Metrics for each fold (one row per resample)
fold_metrics <- logit_cv$resample
fold_metrics
##    Accuracy.Accuracy Precision.Precision Recall.Recall     F1.F1 Resample
## 1          1.0000000           1.0000000     1.0000000 1.0000000   Fold01
## 2          1.0000000           1.0000000     1.0000000 1.0000000   Fold02
## 3          0.9702517           0.9782609     0.9310345 0.9540636   Fold03
## 4          1.0000000           1.0000000     1.0000000 1.0000000   Fold04
## 5          0.9748858           0.9720280     0.9520548 0.9619377   Fold05
## 6          0.9885321           0.9794521     0.9862069 0.9828179   Fold06
## 7          0.9977169           0.9931973     1.0000000 0.9965870   Fold07
## 8          0.9977117           0.9931507     1.0000000 0.9965636   Fold08
## 9          0.9977064           0.9931507     1.0000000 0.9965636   Fold09
## 10         1.0000000           1.0000000     1.0000000 1.0000000   Fold10
names(fold_metrics)
## [1] "Accuracy.Accuracy"   "Precision.Precision" "Recall.Recall"      
## [4] "F1.F1"               "Resample"

3b. Mean and standard deviation per metric

library(dplyr)
library(tidyr)

# Remove non-numeric columns (e.g., "Resample") and summarise
metrics_summary <- fold_metrics %>%
  select(where(is.numeric)) %>%          # keep only numeric metrics
  summarise(across(
    .cols = everything(),
    .fns  = list(mean = mean, sd = sd)   # mean and sd for each metric
  ))

metrics_summary
##   Accuracy.Accuracy_mean Accuracy.Accuracy_sd Precision.Precision_mean
## 1              0.9926805           0.01119101                 0.990924
##   Precision.Precision_sd Recall.Recall_mean Recall.Recall_sd F1.F1_mean
## 1             0.01050714          0.9869296       0.02480342  0.9888533
##     F1.F1_sd
## 1 0.01714935

3c. Same information in a compact table (metric x mean x sd)

results_table <- metrics_summary %>%
  pivot_longer(
    cols = everything(),
    names_to  = c("Metric", ".value"),   # split names by "_" into Metric / mean / sd
    names_sep = "_"
  )

results_table
## # A tibble: 4 × 3
##   Metric               mean     sd
##   <chr>               <dbl>  <dbl>
## 1 Accuracy.Accuracy   0.993 0.0112
## 2 Precision.Precision 0.991 0.0105
## 3 Recall.Recall       0.987 0.0248
## 4 F1.F1               0.989 0.0171

3d. Analysis of the results obtained with logistic regression.

cat("\nCross-Validation Summary (10-fold):\n",
    "- Accuracy ~0.993 with low variability → model performs consistently well across folds.\n",
    "- Precision ~0.991 → very few active customers are misclassified as churn.\n",
    "- Recall ~0.987 → model reliably detects most true churn cases with minimal misses.\n",
    "- F1 ~0.989 → strong balance between precision and recall.\n",
    "- Overall: metrics show a highly stable and well-generalized model with minimal variance.\n",
    sep = "")
## 
## Cross-Validation Summary (10-fold):
## - Accuracy ~0.993 with low variability → model performs consistently well across folds.
## - Precision ~0.991 → very few active customers are misclassified as churn.
## - Recall ~0.987 → model reliably detects most true churn cases with minimal misses.
## - F1 ~0.989 → strong balance between precision and recall.
## - Overall: metrics show a highly stable and well-generalized model with minimal variance.
  1. Train SVM with 10-fold CV
set.seed(123)

ctrl <- trainControl(
  method = "cv",
  number = 10,
  savePredictions = "final", 
  classProbs = FALSE,
  summaryFunction = my_summary
)

svm_linear_cv <- train(
  churn ~ .,
  data = rfm_class,
  method = "svmLinear",   # SVM lineal
  metric = "Accuracy",
  trControl = ctrl
)

4a. Metrics for each fold

# Metrics for each fold (one row per resample)
fold_metrics <- svm_linear_cv$resample
fold_metrics
##    Accuracy.Accuracy Precision.Precision Recall.Recall     F1.F1 Resample
## 1          0.9977169           1.0000000     0.9931507 0.9965636   Fold01
## 2          0.9931507           1.0000000     0.9794521 0.9896194   Fold02
## 3          0.9931350           0.9930556     0.9862069 0.9896194   Fold03
## 4          0.9977117           1.0000000     0.9931034 0.9965398   Fold04
## 5          0.9931507           1.0000000     0.9794521 0.9896194   Fold05
## 6          0.9954128           1.0000000     0.9862069 0.9930556   Fold06
## 7          0.9977169           1.0000000     0.9931507 0.9965636   Fold07
## 8          0.9931350           1.0000000     0.9793103 0.9895470   Fold08
## 9          0.9931193           0.9930556     0.9862069 0.9896194   Fold09
## 10         1.0000000           1.0000000     1.0000000 1.0000000   Fold10
names(fold_metrics)
## [1] "Accuracy.Accuracy"   "Precision.Precision" "Recall.Recall"      
## [4] "F1.F1"               "Resample"

4b. Mean and standard deviation per metric

library(dplyr)
library(tidyr)

# Remove non-numeric columns (e.g., "Resample") and summarise
metrics_summary <- fold_metrics %>%
  select(where(is.numeric)) %>%          # keep only numeric metrics
  summarise(across(
    .cols = everything(),
    .fns  = list(mean = mean, sd = sd)   # mean and sd for each metric
  ))

metrics_summary
##   Accuracy.Accuracy_mean Accuracy.Accuracy_sd Precision.Precision_mean
## 1              0.9954249          0.002641858                0.9986111
##   Precision.Precision_sd Recall.Recall_mean Recall.Recall_sd F1.F1_mean
## 1            0.002928035           0.987624       0.00709445  0.9930747
##      F1.F1_sd
## 1 0.004007117

4c. Same information in a compact table (metric x mean x sd)

results_table <- metrics_summary %>%
  pivot_longer(
    cols = everything(),
    names_to  = c("Metric", ".value"),   # split names by "_" into Metric / mean / sd
    names_sep = "_"
  )

results_table
## # A tibble: 4 × 3
##   Metric               mean      sd
##   <chr>               <dbl>   <dbl>
## 1 Accuracy.Accuracy   0.995 0.00264
## 2 Precision.Precision 0.999 0.00293
## 3 Recall.Recall       0.988 0.00709
## 4 F1.F1               0.993 0.00401

4d. Analysis of the results obtained with logistic regression.

cat("\nCross-Validation Summary (10-fold):\n",
    "- Accuracy ~0.995 with very low variability → the model performs consistently well across folds.\n",
    "- Precision ~0.999 → almost no active customers are misclassified as churn.\n",
    "- Recall ~0.988 → the model successfully identifies most true churn cases.\n",
    "- F1 ~0.993 → strong balance between precision and recall.\n",
    "- Overall: results indicate a highly stable and well-generalized linear SVM model with minimal variance.\n",
    sep = "")
## 
## Cross-Validation Summary (10-fold):
## - Accuracy ~0.995 with very low variability → the model performs consistently well across folds.
## - Precision ~0.999 → almost no active customers are misclassified as churn.
## - Recall ~0.988 → the model successfully identifies most true churn cases.
## - F1 ~0.993 → strong balance between precision and recall.
## - Overall: results indicate a highly stable and well-generalized linear SVM model with minimal variance.

4e. Application of the resamples() method for both models

cv_models <- resamples(list(
  Logistic = logit_cv,
  SVM_Linear = svm_linear_cv
))
# Quick summary (mean, min, max, quantiles) for each metric & model
summary(cv_models)
## 
## Call:
## summary.resamples(object = cv_models)
## 
## Models: Logistic, SVM_Linear 
## Number of resamples: 10 
## 
## Accuracy.Accuracy 
##                 Min.   1st Qu.    Median      Mean   3rd Qu. Max. NA's
## Logistic   0.9702517 0.9908257 0.9977143 0.9926805 1.0000000    1    0
## SVM_Linear 0.9931193 0.9931389 0.9942818 0.9954249 0.9977156    1    0
## 
## F1.F1 
##                 Min.   1st Qu.    Median      Mean   3rd Qu. Max. NA's
## Logistic   0.9540636 0.9862543 0.9965753 0.9888533 1.0000000    1    0
## SVM_Linear 0.9895470 0.9896194 0.9913375 0.9930747 0.9965576    1    0
## 
## Precision.Precision 
##                 Min.   1st Qu.   Median      Mean 3rd Qu. Max. NA's
## Logistic   0.9720280 0.9828767 0.993174 0.9909240       1    1    0
## SVM_Linear 0.9930556 1.0000000 1.000000 0.9986111       1    1    0
## 
## Recall.Recall 
##                 Min.   1st Qu.    Median      Mean   3rd Qu. Max. NA's
## Logistic   0.9310345 0.9896552 1.0000000 0.9869296 1.0000000    1    0
## SVM_Linear 0.9793103 0.9811408 0.9862069 0.9876240 0.9931389    1    0

Analysis of the results derived from the application of the resamples() method.

cat("\nCross-Validation Comparison (Logistic vs SVM Linear):\n",
    "- Both models show very high and stable performance across folds.\n",
    "- Logistic Regression: slightly higher Accuracy mean (~0.9927) but more variability than SVM.\n",
    "- SVM Linear: best overall mean Accuracy (~0.9954) and more consistent fold-to-fold performance.\n",
    "- F1-score: Logistic (~0.9889) strong, but SVM slightly higher (~0.9933), showing better balance.\n",
    "- Precision: both models near perfect; Logistic ~0.9909, SVM ~0.9986 (SVM wins).\n",
    "- Recall: Logistic ~0.9867, SVM ~0.9876 (very similar, SVM marginally higher).\n",
    "- Overall conclusion: SVM Linear is the most consistently balanced and robust across CV folds, \n",
    "  while Logistic Regression also performs exceptionally but with slightly greater variability.\n",
    sep = "")
## 
## Cross-Validation Comparison (Logistic vs SVM Linear):
## - Both models show very high and stable performance across folds.
## - Logistic Regression: slightly higher Accuracy mean (~0.9927) but more variability than SVM.
## - SVM Linear: best overall mean Accuracy (~0.9954) and more consistent fold-to-fold performance.
## - F1-score: Logistic (~0.9889) strong, but SVM slightly higher (~0.9933), showing better balance.
## - Precision: both models near perfect; Logistic ~0.9909, SVM ~0.9986 (SVM wins).
## - Recall: Logistic ~0.9867, SVM ~0.9876 (very similar, SVM marginally higher).
## - Overall conclusion: SVM Linear is the most consistently balanced and robust across CV folds, 
##   while Logistic Regression also performs exceptionally but with slightly greater variability.

Generation of the comparative boxplot chart.

bwplot(cv_models)   # boxplots per metric

Analysis of the boxplot chart of the models.

cat("\nCross-Validation Summary (10-fold):\n",
    "- Accuracy ~0.995 with very low variability → the model performs consistently well across folds.\n",
    "- Precision ~0.999 → almost no active customers are misclassified as churn.\n",
    "- Recall ~0.988 → the model successfully identifies most true churn cases.\n",
    "- F1 ~0.993 → strong balance between precision and recall.\n",
    "- Overall: results indicate a highly stable and well-generalized linear SVM model with minimal variance.\n",
    sep = "")
## 
## Cross-Validation Summary (10-fold):
## - Accuracy ~0.995 with very low variability → the model performs consistently well across folds.
## - Precision ~0.999 → almost no active customers are misclassified as churn.
## - Recall ~0.988 → the model successfully identifies most true churn cases.
## - F1 ~0.993 → strong balance between precision and recall.
## - Overall: results indicate a highly stable and well-generalized linear SVM model with minimal variance.

Exercise 5: Hyperparameter Tuning and Final Evaluation

  1. The variables of interest from the dataset are selected.
library(dplyr)
library(caret)
library(glmnet)   # for glmnet backend
## Cargando paquete requerido: Matrix
## 
## Adjuntando el paquete: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## Loaded glmnet 4.1-10
library(e1071)    # required for SVM in caret

rfm_class <- rfm %>%
  select(recency_days, frequency, monetary_total, monetary_avg, monetary_min, churn)
head(rfm_class)
## # A tibble: 6 × 6
##   recency_days frequency monetary_total monetary_avg monetary_min churn
##          <dbl>     <int>          <dbl>        <dbl>        <dbl> <int>
## 1          325         2             0           0      -77184.       1
## 2            2         7          4310          23.7         5.04     0
## 3           75         4          1797.         58.0        13.2      0
## 4           18         1          1758.         24.1         6.64     0
## 5          310         1           334.         19.7         8.5      1
## 6           36        11          1545.         16.3      -376.       0
# Ensure churn is a factor and set positive class = "1"
rfm_class$churn <- factor(rfm_class$churn, levels = c(0, 1))
  1. Custom summary function (returns F1 + others)
my_summary <- function(data, lev = NULL, model = NULL) {
  cm <- confusionMatrix(
    data = data$pred,
    reference = data$obs,
    positive = lev[2]   # use "1" as positive class
  )
  
  c(
    Accuracy  = cm$overall["Accuracy"],
    Precision = cm$byClass["Precision"],
    Recall    = cm$byClass["Recall"],
    F1        = cm$byClass["F1"]
  )
}
  1. Repeated CV control: 5-fold, 3 repetitions
set.seed(123)

ctrl <- trainControl(
  method = "repeatedcv",
  number = 5,
  repeats = 3,
  savePredictions = "final",
  classProbs = FALSE,
  summaryFunction = my_summary
)
  1. Hyperparameter grids

4.1 Grid for glmnet (elastic-net logistic regression)

grid_glmnet <- expand.grid(
  alpha  = seq(0, 1, by = 0.25),                # mixing parameter (ridge → lasso)
  lambda = 10 ^ seq(-4, 1, length.out = 10)     # regularization strength
)

4.2 Grid for SVM with radial kernel (C and sigma)

grid_svm <- expand.grid(
  C     = c(0.1, 1, 10, 100),
  sigma = c(0.001, 0.01, 0.05, 0.1)
)
  1. Train models using F1-score as selection metric

5.1 Logistic regression via glmnet

set.seed(123)
fit_glmnet <- train(
  churn ~ .,
  data = rfm_class,
  method = "glmnet",
  tuneGrid = grid_glmnet,
  metric = "F1",          # select best tune based on F1-score
  maximize = TRUE,
  trControl = ctrl
)

5.2 SVM with radial basis kernel

set.seed(123)
fit_svm_radial <- train(
  churn ~ .,
  data = rfm_class,
  method = "svmRadial",
  tuneGrid = grid_svm,
  metric = "F1",          # select best tune based on F1-score
  maximize = TRUE,
  trControl = ctrl
)
  1. Inspect optimal hyperparameters and results
# Best hyperparameters based on F1-score
fit_glmnet$bestTune
##    alpha       lambda
## 42     1 0.0003593814
fit_svm_radial$bestTune
##    sigma   C
## 15  0.05 100
# Full tuning results (one row per hyperparameter combination)
head(fit_glmnet$results)
##   alpha       lambda Accuracy.Accuracy Precision.Precision Recall.Recall
## 1     0 0.0001000000         0.9374051           0.9997245     0.8120149
## 2     0 0.0003593814         0.9374051           0.9997245     0.8120149
## 3     0 0.0012915497         0.9374051           0.9997245     0.8120149
## 4     0 0.0046415888         0.9374051           0.9997245     0.8120149
## 5     0 0.0166810054         0.9374051           0.9997245     0.8120149
## 6     0 0.0599484250         0.9260465           0.9997089     0.7778623
##       F1.F1 Accuracy.AccuracySD Precision.PrecisionSD Recall.RecallSD
## 1 0.8960064         0.006864509           0.001066938      0.02079821
## 2 0.8960064         0.006864509           0.001066938      0.02079821
## 3 0.8960064         0.006864509           0.001066938      0.02079821
## 4 0.8960064         0.006864509           0.001066938      0.02079821
## 5 0.8960064         0.006864509           0.001066938      0.02079821
## 6 0.8747243         0.008480456           0.001127506      0.02551633
##      F1.F1SD
## 1 0.01265890
## 2 0.01265890
## 3 0.01265890
## 4 0.01265890
## 5 0.01265890
## 6 0.01614326
head(fit_svm_radial$results)
##        C sigma Accuracy.Accuracy Precision.Precision Recall.Recall     F1.F1
## 1    0.1 0.001         0.7382581           1.0000000     0.2129676 0.3504579
## 5    1.0 0.001         0.9292478           0.9988602     0.7881795 0.8809092
## 9   10.0 0.001         0.9736960           0.9992609     0.9215942 0.9587909
## 13 100.0 0.001         0.9898590           0.9992965     0.9701955 0.9845031
## 2    0.1 0.010         0.9277992           1.0000000     0.7829063 0.8780529
## 6    1.0 0.010         0.9729333           0.9994978     0.9190718 0.9575303
##    Accuracy.AccuracySD Precision.PrecisionSD Recall.RecallSD     F1.F1SD
## 1          0.008515557           0.000000000      0.02580373 0.034928836
## 5          0.007682144           0.001956781      0.02358191 0.014541451
## 9          0.005096753           0.001530272      0.01555409 0.008319426
## 13         0.003402062           0.001456376      0.01037370 0.005271209
## 2          0.007849565           0.000000000      0.02354712 0.014862256
## 6          0.005532224           0.001325480      0.01633389 0.009033237
  1. Retraining of the logistic regression model with the hyperparameters.
library(caret)
library(glmnet)

set.seed(123)

logit_model <- train(
  churn ~ .,
  data = train,
  method = "glmnet",
  family = "binomial",
  trControl = trainControl(method = "none"),
  
  # Set the exact hyperparameters you requested
  tuneGrid = expand.grid(
    alpha  = 1,
    lambda = 0.0003593814
  )
)

# Predict classes
logit_pred <- predict(logit_model, newdata = test)

head(logit_pred)
## [1] 0 0 1 0 1 1
## Levels: 0 1

7a. Obtaining the confusion matrix.

logit_cm <- confusionMatrix(logit_pred, y_test, positive = "1")
logit_cm
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 875   0
##          1   0 436
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9972, 1)
##     No Information Rate : 0.6674     
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
##                                      
##             Sensitivity : 1.0000     
##             Specificity : 1.0000     
##          Pos Pred Value : 1.0000     
##          Neg Pred Value : 1.0000     
##              Prevalence : 0.3326     
##          Detection Rate : 0.3326     
##    Detection Prevalence : 0.3326     
##       Balanced Accuracy : 1.0000     
##                                      
##        'Positive' Class : 1          
## 

7b Confusion matrix metrics.

# Accuracy
logit_accuracy <- logit_cm$overall["Accuracy"]

# Precision, Recall, F1
logit_precision <- logit_cm$byClass["Precision"]
logit_recall    <- logit_cm$byClass["Recall"]
logit_f1        <- logit_cm$byClass["F1"]

print(logit_accuracy)
## Accuracy 
##        1
print(logit_precision)
## Precision 
##         1
print(logit_recall)
## Recall 
##      1
print(logit_f1)
## F1 
##  1

7c. Analysis of the results obtained with logistic regression.

cat("\nLogistic Regression (alpha=1, lambda=0.000359) Summary:\n",
    "- Perfect classification: Accuracy = 1.00 with zero errors.\n",
    "- Sensitivity and Specificity both = 1.00 → no churners or non-churners misclassified.\n",
    "- Precision and Recall = 1.00 → flawless detection of positive and negative classes.\n",
    "- Balanced Accuracy = 1.00 → ideal performance across both classes.\n",
    "- Overall: the model achieves perfect separation on the test set.\n",
    sep = "")
## 
## Logistic Regression (alpha=1, lambda=0.000359) Summary:
## - Perfect classification: Accuracy = 1.00 with zero errors.
## - Sensitivity and Specificity both = 1.00 → no churners or non-churners misclassified.
## - Precision and Recall = 1.00 → flawless detection of positive and negative classes.
## - Balanced Accuracy = 1.00 → ideal performance across both classes.
## - Overall: the model achieves perfect separation on the test set.
  1. Retraining of the SVM model, but in this case using a radial kernel.
library(caret)
library(e1071)

set.seed(123)

svm_radial_model <- train(
  churn ~ .,
  data = train,
  method = "svmRadial",   # radial kernel
  tuneGrid = expand.grid(
    C     = 100,          # cost parameter
    sigma = 0.05          # radial kernel width
  ),
  trControl = trainControl(method = "none")
)

# Predict classes
svm_radial_pred <- predict(svm_radial_model, newdata = test)

head(svm_radial_pred)
## [1] 0 0 1 0 1 1
## Levels: 0 1

8a.Obtaining the confusion matrix.

svm_cm <- confusionMatrix(svm_radial_pred, y_test, positive = "1")
svm_cm
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 872   2
##          1   3 434
##                                           
##                Accuracy : 0.9962          
##                  95% CI : (0.9911, 0.9988)
##     No Information Rate : 0.6674          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9914          
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9954          
##             Specificity : 0.9966          
##          Pos Pred Value : 0.9931          
##          Neg Pred Value : 0.9977          
##              Prevalence : 0.3326          
##          Detection Rate : 0.3310          
##    Detection Prevalence : 0.3333          
##       Balanced Accuracy : 0.9960          
##                                           
##        'Positive' Class : 1               
## 

8b Confusion matrix metrics.

# Accuracy
svm_accuracy <- svm_cm$overall["Accuracy"]

# Precision, Recall, F1
svm_precision <- svm_cm$byClass["Precision"]
svm_recall    <- svm_cm$byClass["Recall"]
svm_f1        <- svm_cm$byClass["F1"]

print(svm_accuracy)
##  Accuracy 
## 0.9961861
print(svm_precision)
## Precision 
##  0.993135
print(svm_recall)
##    Recall 
## 0.9954128
print(svm_f1)
##        F1 
## 0.9942726

8c. Analysis of the results obtained with the SVM model using a radial kernel.

cat("\nSVM Radial (C=100, sigma=0.05) Summary:\n",
    "- Accuracy ~0.9962 → extremely high overall performance with very few errors.\n",
    "- Sensitivity ~0.9954 → the model correctly identifies almost all churners.\n",
    "- Specificity ~0.9966 → very strong detection of non-churn customers.\n",
    "- Precision ~0.9931 and Recall ~0.9954 → balanced and reliable prediction quality.\n",
    "- Only 5 misclassifications (2 FN, 3 FP) → minimal business impact.\n",
    "- Balanced Accuracy ~0.9960 → robust performance across both classes.\n",
    "- Overall: the radial SVM achieves near-perfect classification with excellent stability.\n",
    sep = "")
## 
## SVM Radial (C=100, sigma=0.05) Summary:
## - Accuracy ~0.9962 → extremely high overall performance with very few errors.
## - Sensitivity ~0.9954 → the model correctly identifies almost all churners.
## - Specificity ~0.9966 → very strong detection of non-churn customers.
## - Precision ~0.9931 and Recall ~0.9954 → balanced and reliable prediction quality.
## - Only 5 misclassifications (2 FN, 3 FP) → minimal business impact.
## - Balanced Accuracy ~0.9960 → robust performance across both classes.
## - Overall: the radial SVM achieves near-perfect classification with excellent stability.
  1. Comparison of the results.
results <- data.frame(
  Model = c("Logistic Regression", "SVM Radial"),
  Accuracy = c(logit_accuracy, svm_accuracy),
  Precision = c(logit_precision, svm_precision),
  Recall = c(logit_recall, svm_recall),
  F1_Score = c(logit_f1, svm_f1)
)

print(results)
##                 Model  Accuracy Precision    Recall  F1_Score
## 1 Logistic Regression 1.0000000  1.000000 1.0000000 1.0000000
## 2          SVM Radial 0.9961861  0.993135 0.9954128 0.9942726

9.a Analysis of the consolidated results.

cat("\nModel Comparison Summary:\n",
    "1. With tuned hyperparameters, Logistic Regression achieves perfect performance (Accuracy, Precision, Recall, F1 = 1.00).\n",
    "2. The tuned SVM Radial model performs extremely well (Accuracy ~0.996), but remains slightly below Logistic Regression.\n",
    "3. The optimized SVM Radial offers a balanced and robust alternative, with only marginal losses compared to the perfect Logistic model.\n",
    "4. When comparing to non-tuned models, both Logistic Regression and SVM Linear show a small but clear improvement after hyperparameter optimization.\n",
    "5. Overall, hyperparameter tuning boosts performance and stability, with the tuned Logistic model being the top performer across all metrics.\n",
    "6. Although all models are strong candidates for production, the tuned Logistic Regression model stands out as the only one achieving perfect predictive performance.\n",
    sep = "")
## 
## Model Comparison Summary:
## 1. With tuned hyperparameters, Logistic Regression achieves perfect performance (Accuracy, Precision, Recall, F1 = 1.00).
## 2. The tuned SVM Radial model performs extremely well (Accuracy ~0.996), but remains slightly below Logistic Regression.
## 3. The optimized SVM Radial offers a balanced and robust alternative, with only marginal losses compared to the perfect Logistic model.
## 4. When comparing to non-tuned models, both Logistic Regression and SVM Linear show a small but clear improvement after hyperparameter optimization.
## 5. Overall, hyperparameter tuning boosts performance and stability, with the tuned Logistic model being the top performer across all metrics.
## 6. Although all models are strong candidates for production, the tuned Logistic Regression model stands out as the only one achieving perfect predictive performance.