About Datasets
The data is record of baby milk purchases by each customer with membership. The transaction period taken for 12 months (January to December 2019) was withdrawn on 01 January 2020.
Business Understanding
Company XYZ produces and sells baby products, one of which is baby milk. You can purchase baby milk with a membership or not. Purchasing with membership allows customers to take part in loyalty programs, and also makes it possible for companies to carry out Customer Segmentation using Data Science on customers with membership.
library(readxl)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
setwd("D:/Kuliah/Mat/TSA Kominfo/Praktikum/segmentation-tsa-main")
data <- read.csv("customer_segmentation.csv", sep = ',')
head(data)
## MemberID NoChild tenure_months youngest_kid_age_join recency frequency
## 1 32269 1 12.91170 1.084189 15.502778 43
## 2 219312 2 28.74743 2.475017 1.388889 18
## 3 198412 3 22.57084 1.333333 14.277778 60
## 4 308087 1 22.89938 4.019165 332.597917 4
## 5 349520 1 36.82957 2.546201 10.598611 69
## 6 357289 2 31.77002 1.032170 325.288889 2
## monetary avg_monthly_frequency avg_monetary freq_last3mo avg_interpurchase
## 1 13951 3.583333 324.44186 6 8.309524
## 2 10576 1.800000 587.55556 5 21.352941
## 3 22409 5.000000 373.48333 14 5.932203
## 4 729 2.000000 182.25000 0 10.666667
## 5 5668 5.750000 82.14493 21 5.205882
## 6 1076 1.000000 538.00000 0 39.000000
## avg_consumption
## 1 14.30233
## 2 24.00000
## 3 17.70000
## 4 9.75000
## 5 4.00000
## 6 25.50000
str(data)
## 'data.frame': 313 obs. of 12 variables:
## $ MemberID : int 32269 219312 198412 308087 349520 357289 390918 146715 304517 115868 ...
## $ NoChild : int 1 2 3 1 1 2 2 1 1 1 ...
## $ tenure_months : num 12.9 28.7 22.6 22.9 36.8 ...
## $ youngest_kid_age_join: num 1.08 2.48 1.33 4.02 2.55 ...
## $ recency : num 15.5 1.39 14.28 332.6 10.6 ...
## $ frequency : int 43 18 60 4 69 2 4 3 7 45 ...
## $ monetary : int 13951 10576 22409 729 5668 1076 994 375 822 4245 ...
## $ avg_monthly_frequency: num 3.58 1.8 5 2 5.75 ...
## $ avg_monetary : num 324.4 587.6 373.5 182.2 82.1 ...
## $ freq_last3mo : int 6 5 14 0 21 0 0 0 0 0 ...
## $ avg_interpurchase : num 8.31 21.35 5.93 10.67 5.21 ...
## $ avg_consumption : num 14.3 24 17.7 9.75 4 ...
| Variabel | Description |
|---|---|
| MemberID | Customer ID is unique |
| NoChild | No of Child |
| tenure_months | Length of stay as member (month) |
| youngest_kid_age_join | Youngest kid’s age when join membership (year) |
| recency | Length of the last transaction from the date of data withdrawal (days) |
| frequency | Number of transaction within last 12 months |
| monetary | Total amount spent (x1000 rupiah) |
| avg_monthly_frequency | Average number of transaction per month (only on month which there are transaction) |
| avg_monetary | Average amount spent each transaction (x1000 rupiah) |
| avg_comsuption | Average weight of product purchases per transaction (ounces) |
| freq_last3mo | Number of transactions in the last 3 months |
| avg_interpurchase | Average time distance between a transaction and the next transaction (days); |
| (0 = no next transaction / continues shopping every day) |
Statistik untuk setiap variabel:
summary(data$youngest_kid_age_join)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.002 1.185 1.678 2.071 2.456 10.804
hist(data$youngest_kid_age_join, n = 100)
table(data$NoChild)
##
## 1 2 3 5
## 253 51 8 1
hist(data$freq_last3mo, n = 50)
summary(data$freq_last3mo)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 1.000 2.326 3.000 25.000
merupakan plot histogram lama tergabung sebagai member.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.4
## ✔ lubridate 1.9.2 ✔ stringr 1.5.0
## ✔ purrr 1.0.2 ✔ tibble 3.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(scales)
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
data %>%
ggplot(aes(x = NoChild)) +
geom_bar() +
scale_x_continuous(breaks = seq(1, 10, by = 1)) +
scale_y_continuous(breaks = seq(0, 22000, by = 5000),
labels = number_format(big.mark = ",")) +
labs(x = "No of Child") +
theme_bw() +
theme(
panel.grid.minor = element_blank(),
panel.grid.major.x = element_blank()
)
x_mean <- mean(data$tenure_months, na.rm = TRUE)
data %>%
ggplot(aes(x = tenure_months)) +
geom_density(fill = "skyblue", alpha = 0.75) +
geom_boxplot(width = 0.005) +
geom_point(x = x_mean, y = 0,
inherit.aes = FALSE, color = "red") +
scale_x_continuous(breaks = seq(0, 150, by = 10)) +
labs(x = "Tenure (month)") +
theme_bw() +
theme(panel.grid.minor = element_blank())
x_mean <- mean(data$youngest_kid_age_join, na.rm = TRUE)
data %>%
ggplot(aes(x = youngest_kid_age_join)) +
scale_x_continuous(breaks = seq(1, 10, by = 1)) +
geom_density(fill = "skyblue", alpha = 0.75) +
geom_boxplot(width = 0.1) +
geom_point(x = x_mean, y = 0,
inherit.aes = FALSE, color = "red") +
labs(x = "Youngest Kid Age When Join (year)") +
theme_bw() +
theme(panel.grid.minor = element_blank())
x_mean <- mean(data$recency, na.rm = TRUE)
data %>%
ggplot(aes(x = recency)) +
geom_density(fill = "skyblue", alpha = 0.75) +
geom_boxplot(width = 0.005) +
geom_point(x = x_mean, y = 0,
inherit.aes = FALSE, color = "red") +
scale_x_continuous(breaks = seq(0, 100, by = 5)) +
labs(x = "Recency (day)") +
theme_bw() +
theme(panel.grid.minor = element_blank())
x_mean <- mean(data$avg_monthly_frequency, na.rm = TRUE)
data %>%
ggplot(aes(x = avg_monthly_frequency)) +
geom_density(fill = "skyblue", alpha = 0.75) +
geom_boxplot(width = 0.05) +
geom_point(x = x_mean, y = 0,
inherit.aes = FALSE, color = "red") +
scale_x_continuous(breaks = seq(0, 50, by = 5)) +
labs(x = "Avg Monthly Visit") +
theme_bw() +
theme(panel.grid.minor = element_blank())
x_mean <- mean(data$avg_monetary, na.rm = TRUE)
data %>%
ggplot(aes(x = avg_monetary)) +
geom_density(fill = "skyblue", alpha = 0.75) +
geom_boxplot(width = 0.0005) +
geom_point(aes(x = x_mean, y = 0),
inherit.aes = FALSE, color = "red") +
scale_x_continuous(
breaks = seq(0, 10000, by = 1000),
labels = number_format(big.mark = ",")
) +
labs(x = "Avg Monthly Monetary") +
theme_bw() +
theme(panel.grid.minor = element_blank())
x_mean <- mean(data$avg_interpurchase, na.rm = TRUE)
data %>%
ggplot(aes(x = avg_interpurchase)) +
geom_density(fill = "skyblue", alpha = 0.75) +
geom_boxplot(width = 0.005) +
geom_point(x = x_mean, y = 0,
inherit.aes = FALSE, color = "red") +
scale_x_continuous(breaks = seq(0, 200, by = 10)) +
labs(x = "Avg Interpurchase") +
theme_bw() +
theme(panel.grid.minor = element_blank())
x_mean <- mean(data$freq_last3mo, na.rm = TRUE)
data %>%
ggplot(aes(x = freq_last3mo)) +
geom_density(fill = "skyblue", alpha = 0.75) +
geom_boxplot(width = 0.01) +
geom_point(x = x_mean, y = 0,
inherit.aes = FALSE, color = "red") +
scale_x_continuous(breaks = seq(0, 200, by = 10)) +
labs(x = "Freq Last 3 Months") +
theme_bw() +
theme(panel.grid.minor = element_blank())
library(ggcorrplot)
## Warning: package 'ggcorrplot' was built under R version 4.3.2
pmat <- data %>%
select(-MemberID) %>%
cor_pmat()
data %>%
select(-MemberID) %>%
cor() %>%
ggcorrplot(colors = c("firebrick", "white", "green"),
hc.order = TRUE)
data %>%
select(-MemberID) %>%
cor() %>%
ggcorrplot(colors = c("firebrick", "white", "green"),
hc.order = TRUE, p.mat = pmat,
lab = TRUE, lab_size = 4, digits = 1,
ggtheme = theme_bw())
library(tidymodels)
## Warning: package 'tidymodels' was built under R version 4.3.2
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom 1.0.5 ✔ rsample 1.2.0
## ✔ dials 1.2.0 ✔ tune 1.1.2
## ✔ infer 1.0.5 ✔ workflows 1.1.3
## ✔ modeldata 1.2.0 ✔ workflowsets 1.0.1
## ✔ parsnip 1.1.1 ✔ yardstick 1.2.0
## ✔ recipes 1.0.8
## Warning: package 'dials' was built under R version 4.3.2
## Warning: package 'infer' was built under R version 4.3.2
## Warning: package 'modeldata' was built under R version 4.3.2
## Warning: package 'parsnip' was built under R version 4.3.2
## Warning: package 'rsample' was built under R version 4.3.2
## Warning: package 'tune' was built under R version 4.3.2
## Warning: package 'workflows' was built under R version 4.3.2
## Warning: package 'workflowsets' was built under R version 4.3.2
## Warning: package 'yardstick' was built under R version 4.3.2
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Use suppressPackageStartupMessages() to eliminate package startup messages
library(tidyclust)
## Warning: package 'tidyclust' was built under R version 4.3.2
##
## Attaching package: 'tidyclust'
## The following objects are masked from 'package:parsnip':
##
## knit_engine_docs, list_md_problems
data_cluster <- data %>%
select(-MemberID)
pc <- data_cluster %>%
prcomp(retx = TRUE, scale. = TRUE, center = TRUE)
pc
## Standard deviations (1, .., p=11):
## [1] 1.7861028 1.5790299 1.1569356 1.0077414 0.9683249 0.9164862 0.7257820
## [8] 0.5341338 0.5164178 0.2827819 0.1617002
##
## Rotation (n x k) = (11 x 11):
## PC1 PC2 PC3 PC4
## NoChild 0.14859382 -0.174448842 0.386588909 0.17516129
## tenure_months 0.02074758 -0.207848497 0.700252033 -0.14144613
## youngest_kid_age_join -0.12135659 -0.003538326 0.013724414 -0.75900654
## recency -0.34433471 -0.032176495 0.467106683 0.16973060
## frequency 0.48881181 0.180963072 0.205559588 -0.09942156
## monetary 0.48265452 -0.164103383 0.043186301 0.01233098
## avg_monthly_frequency 0.33375803 0.321508455 0.209052296 0.01368195
## avg_monetary 0.15563821 -0.559743114 -0.156421591 0.13024450
## freq_last3mo 0.46069981 0.184219229 -0.126459341 -0.08702848
## avg_interpurchase 0.02775600 -0.308333969 -0.002211109 -0.54627617
## avg_consumption 0.15385333 -0.565645297 -0.116326530 0.10598576
## PC5 PC6 PC7 PC8
## NoChild 0.23679502 0.79154970 0.28553887 -0.04597101
## tenure_months 0.13027355 -0.20144178 -0.47896564 0.36894939
## youngest_kid_age_join -0.53029727 0.34969369 -0.01546362 0.01778577
## recency -0.36424675 -0.20370631 0.23689423 -0.32692398
## frequency -0.10547522 -0.14708392 0.01161943 -0.21320702
## monetary -0.12802011 -0.04734074 -0.22628030 -0.63813181
## avg_monthly_frequency -0.19858534 -0.22021757 0.60203582 0.34405981
## avg_monetary -0.24451992 -0.06981420 0.11550698 0.19148419
## freq_last3mo 0.03158551 0.11988205 -0.20197957 0.28203060
## avg_interpurchase 0.56317774 -0.27104073 0.39575767 -0.16378154
## avg_consumption -0.26107263 -0.06331612 0.09030397 0.20121866
## PC9 PC10 PC11
## NoChild 0.002879705 0.03260916 -0.003843385
## tenure_months -0.122471166 -0.08330400 -0.031434890
## youngest_kid_age_join -0.061579423 -0.02582784 -0.021571739
## recency 0.542121420 -0.04706985 -0.010946475
## frequency -0.017481267 0.77299170 -0.001812218
## monetary -0.175710953 -0.47975934 -0.002990623
## avg_monthly_frequency -0.253608234 -0.32903550 0.003184833
## avg_monetary 0.028072791 0.09438137 -0.704744670
## freq_last3mo 0.747970869 -0.18717598 -0.007170914
## avg_interpurchase 0.173989599 -0.03694782 0.004351662
## avg_consumption 0.035873499 0.08864006 0.708275462
pc %>%
broom::tidy("pcs") %>%
mutate(eigenvalue = std.dev^2) %>%
filter(eigenvalue >= 0.7 | cumulative <= 0.95)
## # A tibble: 7 × 5
## PC std.dev percent cumulative eigenvalue
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 1.79 0.290 0.290 3.19
## 2 2 1.58 0.227 0.517 2.49
## 3 3 1.16 0.122 0.638 1.34
## 4 4 1.01 0.0923 0.731 1.02
## 5 5 0.968 0.0852 0.816 0.938
## 6 6 0.916 0.0764 0.892 0.840
## 7 7 0.726 0.0479 0.940 0.527
pc %>%
broom::tidy("rotation") %>%
filter(PC <= 6) %>%
pivot_wider(id_cols = column,
names_from = PC, names_prefix = "PC",
values_from = value)
## # A tibble: 11 × 7
## column PC1 PC2 PC3 PC4 PC5 PC6
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 NoChild 0.149 -0.174 0.387 0.175 0.237 0.792
## 2 tenure_months 0.0207 -0.208 0.700 -0.141 0.130 -0.201
## 3 youngest_kid_age_join -0.121 -0.00354 0.0137 -0.759 -0.530 0.350
## 4 recency -0.344 -0.0322 0.467 0.170 -0.364 -0.204
## 5 frequency 0.489 0.181 0.206 -0.0994 -0.105 -0.147
## 6 monetary 0.483 -0.164 0.0432 0.0123 -0.128 -0.0473
## 7 avg_monthly_frequency 0.334 0.322 0.209 0.0137 -0.199 -0.220
## 8 avg_monetary 0.156 -0.560 -0.156 0.130 -0.245 -0.0698
## 9 freq_last3mo 0.461 0.184 -0.126 -0.0870 0.0316 0.120
## 10 avg_interpurchase 0.0278 -0.308 -0.00221 -0.546 0.563 -0.271
## 11 avg_consumption 0.154 -0.566 -0.116 0.106 -0.261 -0.0633
pc %>%
broom::tidy("loadings") %>%
filter(PC <= 6) %>%
mutate(#value = if_else(abs(value) < 0.25, 0, value),
influence = if_else(value < 0, "negative", "positive"),
PC = factor(PC, labels = paste0("PC", PC), levels = PC),
column = factor(column, levels = column %>% unique() %>% sort())
) %>%
ggplot(aes(x = value, y = column, fill = influence)) +
geom_col() +
geom_vline(xintercept = 0) +
labs(x = "Loadings",
y = "Column") +
facet_wrap(~PC, nrow = 1) +
theme_light() +
theme(legend.position = "none")
cs_recipe <- recipe(~ ., data = data_cluster) %>%
step_pca(all_numeric_predictors(), threshold = 0.95,
options = list(retx = TRUE, scale. = TRUE, center = TRUE)) %>%
prep()
kmeans_spec <- k_means(num_clusters = tune()) %>%
set_engine(engine = "stats", iter.max = 5000, nstart = 25)
kmeans_wf <- workflow(
preprocessor = cs_recipe,
spec = kmeans_spec
)
set.seed(1001)
bs <- vfold_cv(data = data_cluster, v = 3)
maxk <- 10
grid_cluster <- tibble(num_clusters = 1:maxk)
bs_sse <- tune_cluster(
object = kmeans_wf,
resamples = bs,
grid = grid_cluster,
metrics = cluster_metric_set(sse_within_total)
)
# Script berikut ini membutuhkan waktu proses cukup lama!
bs_silhouette <- tune_cluster(
object = kmeans_wf,
resamples = bs,
grid = grid_cluster,
metrics = cluster_metric_set(silhouette_avg)
)
gg_sse <- bs_sse %>%
collect_metrics() %>%
ggplot(aes(x = num_clusters, y = mean)) +
geom_line() +
geom_point() +
scale_x_continuous(breaks = 1:10) +
scale_y_continuous(labels = scales::number_format(big.mark = ",")) +
labs(
x = NULL,
y = "SSE Within Total"
) +
theme_light() +
theme(
panel.grid.minor = element_blank(),
axis.text = element_blank(),
axis.ticks = element_blank()
)
gg_sse
gg_silhouette <- bs_silhouette %>%
collect_metrics() %>%
ggplot(aes(x = num_clusters, y = mean)) +
geom_line() +
geom_point() +
scale_x_continuous(breaks = 1:10) +
scale_y_continuous(labels = scales::number_format(big.mark = ",")) +
labs(
x = "Number of Cluster",
y = "Avg Silhouette"
) +
theme_light() +
theme(
panel.grid.minor = element_blank(),
axis.text.y = element_blank(),
axis.ticks = element_blank()
)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
grid.arrange(gg_sse, gg_silhouette, ncol = 1)
## Warning: Removed 1 row containing missing values (`geom_line()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).
Sys.time()
## [1] "2023-12-15 00:27:22 WIB"
library(tidymodels)
library(tidyclust)
data_cluster <- data %>%
select(-MemberID)
pc <- data_cluster %>%
prcomp(retx = TRUE, scale. = TRUE, center = TRUE)
optimk <- 3
kmeans_spec <- k_means(num_clusters = optimk) %>%
set_engine(engine = "stats", iter.max = 5000, nstart = 25)
kmeans_wf <- kmeans_wf %>%
update_model(kmeans_spec)
set.seed(1001)
kmeans_fit <- kmeans_wf %>%
fit(data = data_cluster)
kmeans_fit %>%
extract_centroids()
## # A tibble: 3 × 9
## .cluster PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Cluster_1 2.61 1.06 0.187 -0.133 -0.177 0.0171 -0.0367 -0.0380
## 2 Cluster_2 -0.900 0.340 -0.0469 0.0248 0.109 0.0345 -0.0165 -0.0311
## 3 Cluster_3 0.653 -2.76 -0.0286 0.0568 -0.249 -0.169 0.116 0.180
kmeans_fit %>%
extract_cluster_assignment() %>%
count(.cluster) %>%
mutate(pct = n/sum(n))
## # A tibble: 3 × 3
## .cluster n pct
## <fct> <int> <dbl>
## 1 Cluster_1 59 0.188
## 2 Cluster_2 206 0.658
## 3 Cluster_3 48 0.153
kmeans_fit %>%
sse_within_total(new_data = data_cluster)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 sse_within_total standard 2257.
kmeans_fit %>%
silhouette_avg(new_data = data_cluster)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 silhouette_avg standard 0.271
cs_cluster <- data_cluster %>%
bind_cols(
kmeans_fit %>%
extract_cluster_assignment()
)
cs_cluster %>%
count(.cluster) %>%
mutate(Variable = "member") %>%
pivot_wider(id_cols = Variable, names_from = .cluster, values_from = n) %>%
bind_rows(
cs_cluster %>%
count(.cluster) %>%
mutate(Variable = "percent member",
pct = n/sum(n)) %>%
pivot_wider(id_cols = Variable, names_from = .cluster, values_from = pct),
cs_cluster %>%
rownames_to_column("id") %>%
pivot_longer(cols = -c(id, .cluster), names_to = "Variable", values_to = "Values") %>%
group_by(.cluster, Variable) %>%
summarise(
avg = mean(Values),
.groups = "drop"
) %>%
pivot_wider(id_cols = Variable, names_from = .cluster, values_from = avg)
)
## # A tibble: 13 × 4
## Variable Cluster_1 Cluster_2 Cluster_3
## <chr> <dbl> <dbl> <dbl>
## 1 member 59 206 48
## 2 percent member 0.188 0.658 0.153
## 3 NoChild 1.34 1.15 1.44
## 4 avg_consumption 13.1 10.8 36.0
## 5 avg_interpurchase 12.2 16.3 35.6
## 6 avg_monetary 311. 267. 909.
## 7 avg_monthly_frequency 3.18 1.52 1.28
## 8 freq_last3mo 7.41 1.05 1.56
## 9 frequency 24.3 4.00 6.40
## 10 monetary 7279. 1098. 5549.
## 11 recency 31.5 146. 114.
## 12 tenure_months 14.8 14.0 22.6
## 13 youngest_kid_age_join 1.91 2.13 2.02
library(gridExtra)
library(ggridges)
## Warning: package 'ggridges' was built under R version 4.3.2
cluster_profiling <- function(.data, .variable, .cluster = .cluster,
discrete = FALSE, pop.box.width = 0.05,
cluster.box.width = 0.1){
.data <- .data %>%
select({{.variable}}, {{.cluster}})
# names(.data) <- c("Variable", "Cluster")
if(discrete == FALSE){
grid.arrange(
ggplot(data = .data, aes(x = {{.variable}})) +
geom_density(
fill = "skyblue",
color = "grey",
alpha = 0.5
) +
geom_boxplot(fill = "skyblue", alpha = 0.5, width = pop.box.width) +
labs(y = "Population") +
theme_light() +
theme(
panel.grid = element_blank(),
axis.text.y = element_blank(),
axis.text.x = element_blank(),
axis.title.x = element_blank(),
axis.ticks.y = element_blank()
),
ggplot(data = .data,
aes(x = {{.variable}}, y = {{.cluster}}, fill = {{.cluster}})) +
geom_density_ridges(
color = "grey",
alpha = 0.5
) +
geom_boxplot(width = cluster.box.width, alpha = 0.5) +
stat_summary(fun = mean, color = "red") +
labs(y = "Cluster") +
xlab(vars({{.variable}})) +
theme_light() +
theme(
axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "top"
),
ncol = 1,
heights = c(0.3, 0.7)
)
} else {
grid.arrange(
ggplot(data = .data, aes(x = {{.variable}})) +
geom_bar(
fill = "skyblue",
color = "grey",
alpha = 0.5
) +
scale_x_continuous(breaks = 1:10) +
labs(y = "Population") +
theme_light() +
theme(
panel.grid = element_blank(),
axis.text.y = element_blank(),
axis.title.x = element_blank(),
axis.ticks.y = element_blank()
),
.data %>%
count({{.cluster}}, {{.variable}}) %>%
mutate(pct = n/sum(n)) %>%
mutate({{.variable}} := factor({{.variable}})) %>%
ggplot(aes(x = pct, y = {{.cluster}}, fill = {{.variable}})) +
geom_col(
position = position_fill(reverse = TRUE),
color = "grey",
alpha = 0.75
) +
scale_fill_brewer(type = "qual") +
labs(x = "Proportion",
y = "Cluster") +
theme_light() +
theme(
panel.grid.major.x = element_blank(),
axis.text.y = element_text(hjust = 1.0),
axis.ticks.y = element_blank(),
legend.position = "top"
),
ncol = 1,
heights = c(0.3, 0.7)
)
}
}
cs_cluster %>%
ggplot(aes(x = NoChild)) +
geom_bar(
fill = "skyblue",
color = "grey",
alpha = 0.5
) +
scale_x_continuous(breaks = 1:10) +
labs(y = "Population") +
theme_light() +
theme(
panel.grid = element_blank(),
axis.text.y = element_blank(),
# axis.text.x = element_blank(),
axis.title.x = element_blank(),
axis.ticks.y = element_blank()
)
cs_cluster %>%
count(.cluster, NoChild) %>%
mutate(pct = n/sum(n)) %>%
ggplot(aes(x = pct, y = .cluster, fill = factor(NoChild))) +
geom_col(
position = position_fill(reverse = TRUE),
color = "grey",
alpha = 0.75
) +
# geom_boxplot(width = cluster.box.width, alpha = 0.75) +
# stat_summary(fun = mean, color = "red") +
scale_x_continuous(labels = percent_format()) +
scale_fill_brewer(type = "qual") +
labs(x = "Proportion",
y = "Cluster") +
theme_light() +
theme(
panel.grid.major.x = element_blank(),
axis.text.y = element_text(hjust = 1.0),
axis.ticks.y = element_blank(),
legend.position = "top"
)
cs_cluster %>%
cluster_profiling(
.variable = NoChild,
.cluster = .cluster,
discrete = TRUE,
pop.box.width = 0.005,
cluster.box.width = 0.2
)
cs_cluster %>%
cluster_profiling(
.variable = avg_monetary,
.cluster = .cluster,
discrete = FALSE,
pop.box.width = 0.0005,
cluster.box.width = 0.2
)
## Picking joint bandwidth of 73.2
## Warning: Removed 3 rows containing missing values (`geom_segment()`).
cs_cluster %>%
cluster_profiling(
.variable = avg_monthly_frequency,
.cluster = .cluster,
discrete = FALSE,
pop.box.width = 0.3,
cluster.box.width = 0.2
)
## Picking joint bandwidth of 0.266
## Warning: Removed 3 rows containing missing values (`geom_segment()`).
cs_cluster %>%
cluster_profiling(
.variable = avg_interpurchase,
pop.box.width = 0.005,
cluster.box.width = 0.2
)
## Picking joint bandwidth of 4.18
## Warning: Removed 3 rows containing missing values (`geom_segment()`).
cs_cluster %>%
cluster_profiling(
freq_last3mo,
pop.box.width = 0.01,
cluster.box.width = 0.2
)
## Picking joint bandwidth of 0.943
## Warning: Removed 3 rows containing missing values (`geom_segment()`).
cs_cluster %>%
cluster_profiling(
tenure_months,
pop.box.width = 0.005,
cluster.box.width = 0.2
)
## Picking joint bandwidth of 4.22
## Warning: Removed 3 rows containing missing values (`geom_segment()`).
cs_cluster %>%
cluster_profiling(
avg_consumption,
pop.box.width = 0.005,
cluster.box.width = 0.2
)
## Picking joint bandwidth of 3.07
## Warning: Removed 3 rows containing missing values (`geom_segment()`).
library(rpart)
##
## Attaching package: 'rpart'
## The following object is masked from 'package:dials':
##
## prune
library(rpart.plot)
library(dplyr)
dtree <- cs_cluster %>%
rpart(formula = .cluster ~ .,
data = .,
control = rpart.control(
maxdepth = 4,
minsplit = 1
)
)
dtree %>%
rpart.plot()
## Warning: Cannot retrieve the data used to build the model (so cannot determine roundint and is.binary for the variables).
## To silence this warning:
## Call rpart.plot with roundint=FALSE,
## or rebuild the rpart model with model=TRUE.
dtree %>%
rpart.plot(
type = 4,
extra = 104,
tweak = 1.2,
under = TRUE,
box.palette = "auto",
fallen.leaves = TRUE,
legend.cex = 1
)
## Warning: Cannot retrieve the data used to build the model (so cannot determine roundint and is.binary for the variables).
## To silence this warning:
## Call rpart.plot with roundint=FALSE,
## or rebuild the rpart model with model=TRUE.