About Datasets

The data is record of baby milk purchases by each customer with membership. The transaction period taken for 12 months (January to December 2019) was withdrawn on 01 January 2020.

Business Understanding

Company XYZ produces and sells baby products, one of which is baby milk. You can purchase baby milk with a membership or not. Purchasing with membership allows customers to take part in loyalty programs, and also makes it possible for companies to carry out Customer Segmentation using Data Science on customers with membership.

library(readxl)
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
setwd("D:/Kuliah/Mat/TSA Kominfo/Praktikum/segmentation-tsa-main")
data <- read.csv("customer_segmentation.csv", sep = ',')
head(data)
##   MemberID NoChild tenure_months youngest_kid_age_join    recency frequency
## 1    32269       1      12.91170              1.084189  15.502778        43
## 2   219312       2      28.74743              2.475017   1.388889        18
## 3   198412       3      22.57084              1.333333  14.277778        60
## 4   308087       1      22.89938              4.019165 332.597917         4
## 5   349520       1      36.82957              2.546201  10.598611        69
## 6   357289       2      31.77002              1.032170 325.288889         2
##   monetary avg_monthly_frequency avg_monetary freq_last3mo avg_interpurchase
## 1    13951              3.583333    324.44186            6          8.309524
## 2    10576              1.800000    587.55556            5         21.352941
## 3    22409              5.000000    373.48333           14          5.932203
## 4      729              2.000000    182.25000            0         10.666667
## 5     5668              5.750000     82.14493           21          5.205882
## 6     1076              1.000000    538.00000            0         39.000000
##   avg_consumption
## 1        14.30233
## 2        24.00000
## 3        17.70000
## 4         9.75000
## 5         4.00000
## 6        25.50000

1 Explorasi Data

str(data)
## 'data.frame':    313 obs. of  12 variables:
##  $ MemberID             : int  32269 219312 198412 308087 349520 357289 390918 146715 304517 115868 ...
##  $ NoChild              : int  1 2 3 1 1 2 2 1 1 1 ...
##  $ tenure_months        : num  12.9 28.7 22.6 22.9 36.8 ...
##  $ youngest_kid_age_join: num  1.08 2.48 1.33 4.02 2.55 ...
##  $ recency              : num  15.5 1.39 14.28 332.6 10.6 ...
##  $ frequency            : int  43 18 60 4 69 2 4 3 7 45 ...
##  $ monetary             : int  13951 10576 22409 729 5668 1076 994 375 822 4245 ...
##  $ avg_monthly_frequency: num  3.58 1.8 5 2 5.75 ...
##  $ avg_monetary         : num  324.4 587.6 373.5 182.2 82.1 ...
##  $ freq_last3mo         : int  6 5 14 0 21 0 0 0 0 0 ...
##  $ avg_interpurchase    : num  8.31 21.35 5.93 10.67 5.21 ...
##  $ avg_consumption      : num  14.3 24 17.7 9.75 4 ...

1.1 About Dataset

Variabel Description
MemberID Customer ID is unique
NoChild No of Child
tenure_months Length of stay as member (month)
youngest_kid_age_join Youngest kid’s age when join membership (year)
recency Length of the last transaction from the date of data withdrawal (days)
frequency Number of transaction within last 12 months
monetary Total amount spent (x1000 rupiah)
avg_monthly_frequency Average number of transaction per month (only on month which there are transaction)
avg_monetary Average amount spent each transaction (x1000 rupiah)
avg_comsuption Average weight of product purchases per transaction (ounces)
freq_last3mo Number of transactions in the last 3 months
avg_interpurchase Average time distance between a transaction and the next transaction (days);
(0 = no next transaction / continues shopping every day)

Statistik untuk setiap variabel:

summary(data$youngest_kid_age_join)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.002   1.185   1.678   2.071   2.456  10.804
hist(data$youngest_kid_age_join, n = 100)

table(data$NoChild)
## 
##   1   2   3   5 
## 253  51   8   1
hist(data$freq_last3mo, n = 50)

summary(data$freq_last3mo)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   1.000   2.326   3.000  25.000

merupakan plot histogram lama tergabung sebagai member.

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.4
## ✔ lubridate 1.9.2     ✔ stringr   1.5.0
## ✔ purrr     1.0.2     ✔ tibble    3.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(scales)
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
data %>% 
  ggplot(aes(x = NoChild)) +
  geom_bar() +
  scale_x_continuous(breaks = seq(1, 10, by = 1)) +
  scale_y_continuous(breaks = seq(0, 22000, by = 5000), 
                     labels = number_format(big.mark = ",")) +
  labs(x = "No of Child") +
  theme_bw() +
  theme(
    panel.grid.minor = element_blank(),
    panel.grid.major.x = element_blank()
  )

x_mean <- mean(data$tenure_months, na.rm = TRUE)
data %>%
  ggplot(aes(x = tenure_months)) + 
  geom_density(fill = "skyblue", alpha = 0.75) +
  geom_boxplot(width = 0.005) + 
  geom_point(x = x_mean, y = 0, 
             inherit.aes = FALSE, color = "red") + 
  scale_x_continuous(breaks = seq(0, 150, by = 10)) + 
  labs(x = "Tenure (month)") + 
  theme_bw() + 
  theme(panel.grid.minor = element_blank())

x_mean <- mean(data$youngest_kid_age_join, na.rm = TRUE)
data %>%
  ggplot(aes(x = youngest_kid_age_join)) + 
  scale_x_continuous(breaks = seq(1, 10, by = 1)) +
  geom_density(fill = "skyblue", alpha = 0.75) +
  geom_boxplot(width = 0.1) + 
  geom_point(x = x_mean, y = 0, 
             inherit.aes = FALSE, color = "red") + 
  labs(x = "Youngest Kid Age When Join (year)") + 
  theme_bw() + 
  theme(panel.grid.minor = element_blank())

x_mean <- mean(data$recency, na.rm = TRUE)
data %>% 
  ggplot(aes(x = recency)) + 
  geom_density(fill = "skyblue", alpha = 0.75) +
  geom_boxplot(width = 0.005) + 
  geom_point(x = x_mean, y = 0,
             inherit.aes = FALSE, color = "red") +
  scale_x_continuous(breaks = seq(0, 100, by = 5)) +
  labs(x = "Recency (day)") + 
  theme_bw() + 
  theme(panel.grid.minor = element_blank())

x_mean <- mean(data$avg_monthly_frequency, na.rm = TRUE)
data %>% 
  ggplot(aes(x = avg_monthly_frequency)) + 
  geom_density(fill = "skyblue", alpha = 0.75) +
  geom_boxplot(width = 0.05) + 
  geom_point(x = x_mean, y = 0,
             inherit.aes = FALSE, color = "red") +
  scale_x_continuous(breaks = seq(0, 50, by = 5)) +
  labs(x = "Avg Monthly Visit") + 
  theme_bw() + 
  theme(panel.grid.minor = element_blank())

x_mean <- mean(data$avg_monetary, na.rm = TRUE)
data %>% 
  ggplot(aes(x = avg_monetary)) + 
  geom_density(fill = "skyblue", alpha = 0.75) +
  geom_boxplot(width = 0.0005) + 
  geom_point(aes(x = x_mean, y = 0),
             inherit.aes = FALSE, color = "red") +
  scale_x_continuous(
    breaks = seq(0, 10000, by = 1000), 
    labels = number_format(big.mark = ",")
    ) +
  labs(x = "Avg Monthly Monetary") + 
  theme_bw() + 
  theme(panel.grid.minor = element_blank())

x_mean <- mean(data$avg_interpurchase, na.rm = TRUE)
data %>% 
  ggplot(aes(x = avg_interpurchase)) + 
  geom_density(fill = "skyblue", alpha = 0.75) +
  geom_boxplot(width = 0.005) + 
  geom_point(x = x_mean, y = 0,
             inherit.aes = FALSE, color = "red") +
  scale_x_continuous(breaks = seq(0, 200, by = 10)) +
  labs(x = "Avg Interpurchase") + 
  theme_bw() + 
  theme(panel.grid.minor = element_blank())

x_mean <- mean(data$freq_last3mo, na.rm = TRUE)
data %>% 
  ggplot(aes(x = freq_last3mo)) + 
  geom_density(fill = "skyblue", alpha = 0.75) +
  geom_boxplot(width = 0.01) + 
  geom_point(x = x_mean, y = 0,
             inherit.aes = FALSE, color = "red") +
  scale_x_continuous(breaks = seq(0, 200, by = 10)) +
  labs(x = "Freq Last 3 Months") + 
  theme_bw() + 
  theme(panel.grid.minor = element_blank())

library(ggcorrplot)
## Warning: package 'ggcorrplot' was built under R version 4.3.2
pmat <- data %>% 
  select(-MemberID) %>%
  cor_pmat()
data %>% 
  select(-MemberID) %>%
  cor() %>% 
  ggcorrplot(colors = c("firebrick", "white", "green"), 
             hc.order = TRUE)

data %>% 
  select(-MemberID) %>%
  cor() %>% 
  ggcorrplot(colors = c("firebrick", "white", "green"), 
             hc.order = TRUE, p.mat = pmat, 
             lab = TRUE, lab_size = 4, digits = 1, 
             ggtheme = theme_bw())

2 Optimasi

library(tidymodels)
## Warning: package 'tidymodels' was built under R version 4.3.2
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom        1.0.5     ✔ rsample      1.2.0
## ✔ dials        1.2.0     ✔ tune         1.1.2
## ✔ infer        1.0.5     ✔ workflows    1.1.3
## ✔ modeldata    1.2.0     ✔ workflowsets 1.0.1
## ✔ parsnip      1.1.1     ✔ yardstick    1.2.0
## ✔ recipes      1.0.8
## Warning: package 'dials' was built under R version 4.3.2
## Warning: package 'infer' was built under R version 4.3.2
## Warning: package 'modeldata' was built under R version 4.3.2
## Warning: package 'parsnip' was built under R version 4.3.2
## Warning: package 'rsample' was built under R version 4.3.2
## Warning: package 'tune' was built under R version 4.3.2
## Warning: package 'workflows' was built under R version 4.3.2
## Warning: package 'workflowsets' was built under R version 4.3.2
## Warning: package 'yardstick' was built under R version 4.3.2
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Use suppressPackageStartupMessages() to eliminate package startup messages
library(tidyclust)
## Warning: package 'tidyclust' was built under R version 4.3.2
## 
## Attaching package: 'tidyclust'
## The following objects are masked from 'package:parsnip':
## 
##     knit_engine_docs, list_md_problems
data_cluster <- data %>% 
  select(-MemberID)
pc <- data_cluster %>%
  prcomp(retx = TRUE, scale. = TRUE, center = TRUE)
pc
## Standard deviations (1, .., p=11):
##  [1] 1.7861028 1.5790299 1.1569356 1.0077414 0.9683249 0.9164862 0.7257820
##  [8] 0.5341338 0.5164178 0.2827819 0.1617002
## 
## Rotation (n x k) = (11 x 11):
##                               PC1          PC2          PC3         PC4
## NoChild                0.14859382 -0.174448842  0.386588909  0.17516129
## tenure_months          0.02074758 -0.207848497  0.700252033 -0.14144613
## youngest_kid_age_join -0.12135659 -0.003538326  0.013724414 -0.75900654
## recency               -0.34433471 -0.032176495  0.467106683  0.16973060
## frequency              0.48881181  0.180963072  0.205559588 -0.09942156
## monetary               0.48265452 -0.164103383  0.043186301  0.01233098
## avg_monthly_frequency  0.33375803  0.321508455  0.209052296  0.01368195
## avg_monetary           0.15563821 -0.559743114 -0.156421591  0.13024450
## freq_last3mo           0.46069981  0.184219229 -0.126459341 -0.08702848
## avg_interpurchase      0.02775600 -0.308333969 -0.002211109 -0.54627617
## avg_consumption        0.15385333 -0.565645297 -0.116326530  0.10598576
##                               PC5         PC6         PC7         PC8
## NoChild                0.23679502  0.79154970  0.28553887 -0.04597101
## tenure_months          0.13027355 -0.20144178 -0.47896564  0.36894939
## youngest_kid_age_join -0.53029727  0.34969369 -0.01546362  0.01778577
## recency               -0.36424675 -0.20370631  0.23689423 -0.32692398
## frequency             -0.10547522 -0.14708392  0.01161943 -0.21320702
## monetary              -0.12802011 -0.04734074 -0.22628030 -0.63813181
## avg_monthly_frequency -0.19858534 -0.22021757  0.60203582  0.34405981
## avg_monetary          -0.24451992 -0.06981420  0.11550698  0.19148419
## freq_last3mo           0.03158551  0.11988205 -0.20197957  0.28203060
## avg_interpurchase      0.56317774 -0.27104073  0.39575767 -0.16378154
## avg_consumption       -0.26107263 -0.06331612  0.09030397  0.20121866
##                                PC9        PC10         PC11
## NoChild                0.002879705  0.03260916 -0.003843385
## tenure_months         -0.122471166 -0.08330400 -0.031434890
## youngest_kid_age_join -0.061579423 -0.02582784 -0.021571739
## recency                0.542121420 -0.04706985 -0.010946475
## frequency             -0.017481267  0.77299170 -0.001812218
## monetary              -0.175710953 -0.47975934 -0.002990623
## avg_monthly_frequency -0.253608234 -0.32903550  0.003184833
## avg_monetary           0.028072791  0.09438137 -0.704744670
## freq_last3mo           0.747970869 -0.18717598 -0.007170914
## avg_interpurchase      0.173989599 -0.03694782  0.004351662
## avg_consumption        0.035873499  0.08864006  0.708275462
pc %>% 
  broom::tidy("pcs") %>% 
  mutate(eigenvalue = std.dev^2) %>% 
  filter(eigenvalue >= 0.7 | cumulative <= 0.95)
## # A tibble: 7 × 5
##      PC std.dev percent cumulative eigenvalue
##   <dbl>   <dbl>   <dbl>      <dbl>      <dbl>
## 1     1   1.79   0.290       0.290      3.19 
## 2     2   1.58   0.227       0.517      2.49 
## 3     3   1.16   0.122       0.638      1.34 
## 4     4   1.01   0.0923      0.731      1.02 
## 5     5   0.968  0.0852      0.816      0.938
## 6     6   0.916  0.0764      0.892      0.840
## 7     7   0.726  0.0479      0.940      0.527
pc %>% 
  broom::tidy("rotation") %>% 
  filter(PC <= 6) %>%  
  pivot_wider(id_cols = column, 
              names_from = PC, names_prefix = "PC", 
              values_from = value)
## # A tibble: 11 × 7
##    column                    PC1      PC2      PC3     PC4     PC5     PC6
##    <chr>                   <dbl>    <dbl>    <dbl>   <dbl>   <dbl>   <dbl>
##  1 NoChild                0.149  -0.174    0.387    0.175   0.237   0.792 
##  2 tenure_months          0.0207 -0.208    0.700   -0.141   0.130  -0.201 
##  3 youngest_kid_age_join -0.121  -0.00354  0.0137  -0.759  -0.530   0.350 
##  4 recency               -0.344  -0.0322   0.467    0.170  -0.364  -0.204 
##  5 frequency              0.489   0.181    0.206   -0.0994 -0.105  -0.147 
##  6 monetary               0.483  -0.164    0.0432   0.0123 -0.128  -0.0473
##  7 avg_monthly_frequency  0.334   0.322    0.209    0.0137 -0.199  -0.220 
##  8 avg_monetary           0.156  -0.560   -0.156    0.130  -0.245  -0.0698
##  9 freq_last3mo           0.461   0.184   -0.126   -0.0870  0.0316  0.120 
## 10 avg_interpurchase      0.0278 -0.308   -0.00221 -0.546   0.563  -0.271 
## 11 avg_consumption        0.154  -0.566   -0.116    0.106  -0.261  -0.0633
pc %>% 
  broom::tidy("loadings") %>% 
  filter(PC <= 6) %>% 
  mutate(#value = if_else(abs(value) < 0.25, 0, value), 
    influence = if_else(value < 0, "negative", "positive"), 
    PC = factor(PC, labels = paste0("PC", PC), levels = PC), 
    column = factor(column, levels = column %>%  unique() %>%  sort())
  ) %>% 
  ggplot(aes(x = value, y = column, fill = influence)) + 
  geom_col() + 
  geom_vline(xintercept = 0) + 
  labs(x = "Loadings", 
       y = "Column") + 
  facet_wrap(~PC, nrow = 1) + 
  theme_light() + 
  theme(legend.position = "none")

cs_recipe <- recipe(~ ., data = data_cluster) %>%  
  step_pca(all_numeric_predictors(), threshold = 0.95, 
           options = list(retx = TRUE, scale. = TRUE, center = TRUE)) %>%  
  prep()
kmeans_spec <- k_means(num_clusters = tune()) %>% 
  set_engine(engine = "stats", iter.max = 5000, nstart = 25)
kmeans_wf <- workflow(
  preprocessor = cs_recipe, 
  spec = kmeans_spec
  )
set.seed(1001)
bs <- vfold_cv(data = data_cluster, v = 3)
maxk <- 10
grid_cluster <- tibble(num_clusters = 1:maxk)
bs_sse <- tune_cluster(
  object = kmeans_wf, 
  resamples = bs, 
  grid = grid_cluster, 
  metrics = cluster_metric_set(sse_within_total)
  )
# Script berikut ini membutuhkan waktu proses cukup lama!
bs_silhouette <- tune_cluster(
  object = kmeans_wf, 
  resamples = bs, 
  grid = grid_cluster, 
  metrics = cluster_metric_set(silhouette_avg)
)
gg_sse <- bs_sse %>% 
  collect_metrics() %>%  
  ggplot(aes(x = num_clusters, y = mean)) + 
  geom_line() + 
  geom_point() + 
  scale_x_continuous(breaks = 1:10) + 
  scale_y_continuous(labels = scales::number_format(big.mark = ",")) + 
  labs(
    x = NULL, 
    y = "SSE Within Total"
  ) + 
  theme_light() + 
  theme(
    panel.grid.minor = element_blank(), 
    axis.text = element_blank(), 
    axis.ticks = element_blank()
    )
gg_sse

gg_silhouette <- bs_silhouette %>% 
  collect_metrics() %>% 
  ggplot(aes(x = num_clusters, y = mean)) + 
  geom_line() + 
  geom_point() + 
  scale_x_continuous(breaks = 1:10) + 
  scale_y_continuous(labels = scales::number_format(big.mark = ",")) + 
  labs(
    x = "Number of Cluster", 
    y = "Avg Silhouette"
  ) + 
  theme_light() + 
  theme(
    panel.grid.minor = element_blank(), 
    axis.text.y = element_blank(), 
    axis.ticks = element_blank()
    )
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
grid.arrange(gg_sse, gg_silhouette, ncol = 1)
## Warning: Removed 1 row containing missing values (`geom_line()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).

Sys.time()
## [1] "2023-12-15 00:27:22 WIB"

3 Clustering

library(tidymodels)
library(tidyclust)
data_cluster <- data %>% 
  select(-MemberID)
pc <- data_cluster %>% 
  prcomp(retx = TRUE, scale. = TRUE, center = TRUE)
optimk <- 3
kmeans_spec <- k_means(num_clusters = optimk) %>% 
  set_engine(engine = "stats", iter.max = 5000, nstart = 25)
kmeans_wf <- kmeans_wf %>% 
  update_model(kmeans_spec)
set.seed(1001)
kmeans_fit <- kmeans_wf %>% 
  fit(data = data_cluster)
kmeans_fit %>% 
  extract_centroids()
## # A tibble: 3 × 9
##   .cluster     PC1    PC2     PC3     PC4    PC5     PC6     PC7     PC8
##   <fct>      <dbl>  <dbl>   <dbl>   <dbl>  <dbl>   <dbl>   <dbl>   <dbl>
## 1 Cluster_1  2.61   1.06   0.187  -0.133  -0.177  0.0171 -0.0367 -0.0380
## 2 Cluster_2 -0.900  0.340 -0.0469  0.0248  0.109  0.0345 -0.0165 -0.0311
## 3 Cluster_3  0.653 -2.76  -0.0286  0.0568 -0.249 -0.169   0.116   0.180
kmeans_fit %>% 
  extract_cluster_assignment() %>% 
  count(.cluster) %>% 
  mutate(pct = n/sum(n))
## # A tibble: 3 × 3
##   .cluster      n   pct
##   <fct>     <int> <dbl>
## 1 Cluster_1    59 0.188
## 2 Cluster_2   206 0.658
## 3 Cluster_3    48 0.153
kmeans_fit %>% 
  sse_within_total(new_data = data_cluster)
## # A tibble: 1 × 3
##   .metric          .estimator .estimate
##   <chr>            <chr>          <dbl>
## 1 sse_within_total standard       2257.
kmeans_fit %>%  
  silhouette_avg(new_data = data_cluster)
## # A tibble: 1 × 3
##   .metric        .estimator .estimate
##   <chr>          <chr>          <dbl>
## 1 silhouette_avg standard       0.271
cs_cluster <- data_cluster %>% 
  bind_cols(
    kmeans_fit %>%  
      extract_cluster_assignment()
  )
cs_cluster %>% 
  count(.cluster) %>% 
  mutate(Variable = "member") %>% 
  pivot_wider(id_cols = Variable, names_from = .cluster, values_from = n) %>% 
  bind_rows(
    cs_cluster %>% 
      count(.cluster) %>% 
      mutate(Variable = "percent member", 
             pct = n/sum(n)) %>% 
      pivot_wider(id_cols = Variable, names_from = .cluster, values_from = pct), 
    cs_cluster %>% 
      rownames_to_column("id") %>% 
      pivot_longer(cols = -c(id, .cluster), names_to = "Variable", values_to = "Values") %>% 
      group_by(.cluster, Variable) %>% 
      summarise(
        avg = mean(Values), 
        .groups = "drop"
      ) %>% 
      pivot_wider(id_cols = Variable, names_from = .cluster, values_from = avg)
  ) 
## # A tibble: 13 × 4
##    Variable              Cluster_1 Cluster_2 Cluster_3
##    <chr>                     <dbl>     <dbl>     <dbl>
##  1 member                   59       206        48    
##  2 percent member            0.188     0.658     0.153
##  3 NoChild                   1.34      1.15      1.44 
##  4 avg_consumption          13.1      10.8      36.0  
##  5 avg_interpurchase        12.2      16.3      35.6  
##  6 avg_monetary            311.      267.      909.   
##  7 avg_monthly_frequency     3.18      1.52      1.28 
##  8 freq_last3mo              7.41      1.05      1.56 
##  9 frequency                24.3       4.00      6.40 
## 10 monetary               7279.     1098.     5549.   
## 11 recency                  31.5     146.      114.   
## 12 tenure_months            14.8      14.0      22.6  
## 13 youngest_kid_age_join     1.91      2.13      2.02

4 Cluster Profile

library(gridExtra)
library(ggridges)
## Warning: package 'ggridges' was built under R version 4.3.2
cluster_profiling <- function(.data, .variable, .cluster = .cluster, 
                              discrete = FALSE, pop.box.width = 0.05, 
                              cluster.box.width = 0.1){
  .data <- .data %>% 
    select({{.variable}}, {{.cluster}})
  # names(.data) <- c("Variable", "Cluster")
  
  if(discrete == FALSE){
    grid.arrange(
      ggplot(data = .data, aes(x = {{.variable}})) + 
        geom_density(
          fill = "skyblue", 
          color = "grey", 
          alpha = 0.5
        ) + 
        geom_boxplot(fill = "skyblue", alpha = 0.5, width = pop.box.width) + 
        labs(y = "Population") + 
        theme_light() + 
        theme(
          panel.grid = element_blank(), 
          axis.text.y = element_blank(), 
          axis.text.x = element_blank(), 
          axis.title.x = element_blank(), 
          axis.ticks.y = element_blank()
        ),
      ggplot(data = .data, 
             aes(x = {{.variable}}, y = {{.cluster}}, fill = {{.cluster}})) + 
        geom_density_ridges(
          color = "grey", 
          alpha = 0.5
        ) + 
        geom_boxplot(width = cluster.box.width, alpha = 0.5) + 
        stat_summary(fun = mean, color = "red") + 
        labs(y = "Cluster") +  
        xlab(vars({{.variable}})) + 
        theme_light() + 
        theme(
          axis.text.y = element_blank(), 
          axis.ticks.y = element_blank(), 
          legend.position = "top"
        ), 
      ncol = 1, 
      heights = c(0.3, 0.7)
    )
  } else {
    
    grid.arrange(
      ggplot(data = .data, aes(x = {{.variable}})) + 
        geom_bar(
          fill = "skyblue", 
          color = "grey", 
          alpha = 0.5
        ) + 
        scale_x_continuous(breaks = 1:10) + 
        labs(y = "Population") + 
        theme_light() + 
        theme(
          panel.grid = element_blank(), 
          axis.text.y = element_blank(), 
          axis.title.x = element_blank(), 
          axis.ticks.y = element_blank()
        ),
      .data %>% 
        count({{.cluster}}, {{.variable}}) %>% 
        mutate(pct = n/sum(n)) %>% 
        mutate({{.variable}} := factor({{.variable}})) %>% 
        ggplot(aes(x = pct, y = {{.cluster}}, fill = {{.variable}})) + 
        geom_col(
          position = position_fill(reverse = TRUE), 
          color = "grey", 
          alpha = 0.75
        ) + 
        scale_fill_brewer(type = "qual") +
        labs(x = "Proportion", 
              y = "Cluster") +  
        theme_light() + 
        theme(
          panel.grid.major.x = element_blank(), 
          axis.text.y = element_text(hjust = 1.0), 
          axis.ticks.y = element_blank(), 
          legend.position = "top"
        ), 
      ncol = 1, 
      heights = c(0.3, 0.7)
    )
  }
  
}
cs_cluster %>%
  ggplot(aes(x = NoChild)) +
  geom_bar(
    fill = "skyblue",
    color = "grey",
    alpha = 0.5
  ) +
  scale_x_continuous(breaks = 1:10) +
  labs(y = "Population") +
  theme_light() +
  theme(
    panel.grid = element_blank(),
    axis.text.y = element_blank(),
    # axis.text.x = element_blank(),
    axis.title.x = element_blank(),
    axis.ticks.y = element_blank()
  )

cs_cluster %>%
  count(.cluster, NoChild) %>%
  mutate(pct = n/sum(n)) %>%
  ggplot(aes(x = pct, y = .cluster, fill = factor(NoChild))) +
  geom_col(
    position = position_fill(reverse = TRUE),
    color = "grey",
    alpha = 0.75
  ) +
  # geom_boxplot(width = cluster.box.width, alpha = 0.75) +
  # stat_summary(fun = mean, color = "red") +
  scale_x_continuous(labels = percent_format()) +
  scale_fill_brewer(type = "qual") +
  labs(x = "Proportion",
       y = "Cluster") +
  theme_light() +
  theme(
    panel.grid.major.x = element_blank(),
    axis.text.y = element_text(hjust = 1.0),
    axis.ticks.y = element_blank(),
    legend.position = "top"
  )

cs_cluster %>% 
  cluster_profiling(
    .variable = NoChild, 
    .cluster = .cluster, 
    discrete = TRUE, 
    pop.box.width = 0.005, 
    cluster.box.width = 0.2
    )

cs_cluster %>% 
  cluster_profiling(
    .variable = avg_monetary, 
    .cluster = .cluster, 
    discrete = FALSE, 
    pop.box.width = 0.0005, 
    cluster.box.width = 0.2
    )
## Picking joint bandwidth of 73.2
## Warning: Removed 3 rows containing missing values (`geom_segment()`).

cs_cluster %>% 
  cluster_profiling(
    .variable = avg_monthly_frequency, 
    .cluster = .cluster, 
    discrete = FALSE, 
    pop.box.width = 0.3, 
    cluster.box.width = 0.2
    )
## Picking joint bandwidth of 0.266
## Warning: Removed 3 rows containing missing values (`geom_segment()`).

cs_cluster %>% 
  cluster_profiling(
    .variable = avg_interpurchase,
    pop.box.width = 0.005, 
    cluster.box.width = 0.2
    )
## Picking joint bandwidth of 4.18
## Warning: Removed 3 rows containing missing values (`geom_segment()`).

cs_cluster %>% 
  cluster_profiling(
    freq_last3mo, 
    pop.box.width = 0.01, 
    cluster.box.width = 0.2
    )
## Picking joint bandwidth of 0.943
## Warning: Removed 3 rows containing missing values (`geom_segment()`).

cs_cluster %>% 
  cluster_profiling(
    tenure_months, 
    pop.box.width = 0.005, 
    cluster.box.width = 0.2
    )
## Picking joint bandwidth of 4.22
## Warning: Removed 3 rows containing missing values (`geom_segment()`).

cs_cluster %>% 
  cluster_profiling(
    avg_consumption, 
    pop.box.width = 0.005, 
    cluster.box.width = 0.2
    )
## Picking joint bandwidth of 3.07
## Warning: Removed 3 rows containing missing values (`geom_segment()`).

library(rpart)
## 
## Attaching package: 'rpart'
## The following object is masked from 'package:dials':
## 
##     prune
library(rpart.plot)
library(dplyr)
dtree <- cs_cluster %>%
  rpart(formula = .cluster ~ ., 
        data = ., 
        control = rpart.control(
          maxdepth = 4, 
          minsplit = 1
          )
        )
dtree %>% 
  rpart.plot()
## Warning: Cannot retrieve the data used to build the model (so cannot determine roundint and is.binary for the variables).
## To silence this warning:
##     Call rpart.plot with roundint=FALSE,
##     or rebuild the rpart model with model=TRUE.

dtree %>% 
  rpart.plot(
    type = 4, 
    extra = 104, 
    tweak = 1.2, 
    under = TRUE, 
    box.palette = "auto", 
    fallen.leaves = TRUE, 
    legend.cex = 1
    )
## Warning: Cannot retrieve the data used to build the model (so cannot determine roundint and is.binary for the variables).
## To silence this warning:
##     Call rpart.plot with roundint=FALSE,
##     or rebuild the rpart model with model=TRUE.