chooseCRANmirror(graphics = FALSE, ind = 1)
install.packages("tidyverse")
##
## The downloaded binary packages are in
## /var/folders/8w/l_qbjg8n5v3_5v1lkl2ww_z80000gn/T//RtmpmxehNm/downloaded_packages
# Load Required Libraries
library(tidyverse) # For data manipulation and analysis
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr) # For data manipulation
library(data.table) # For K-means clustering
##
## Attaching package: 'data.table'
##
## The following objects are masked from 'package:lubridate':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
##
## The following objects are masked from 'package:dplyr':
##
## between, first, last
##
## The following object is masked from 'package:purrr':
##
## transpose
library(ggplot2) # For data visualization
install.packages("factoextra")
##
## The downloaded binary packages are in
## /var/folders/8w/l_qbjg8n5v3_5v1lkl2ww_z80000gn/T//RtmpmxehNm/downloaded_packages
library(factoextra) # For clustering visualization
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
install.packages("plotly")
##
## The downloaded binary packages are in
## /var/folders/8w/l_qbjg8n5v3_5v1lkl2ww_z80000gn/T//RtmpmxehNm/downloaded_packages
library(plotly) # For interactive visualization
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
# Load retail.segmentation.csv from your working directory
retail_segmentation <-read.csv("retail_segmentation.csv")
head(retail_segmentation)
## Cust_No avg_order_size avg_order_freq crossbuy multichannel per_sale tenure
## 1 1 23.400000 2.2222222 3 2 0.00000000 3
## 2 2 34.260377 6.6250000 7 2 0.11111111 35
## 3 3 43.575641 4.8750000 5 2 0.07407407 12
## 4 4 26.316667 0.9000000 4 2 0.25000000 9
## 5 5 8.269231 1.0833333 3 1 0.50000000 40
## 6 6 21.500000 0.2222222 1 2 0.00000000 7
## return_rate married own_home household_size loyalty_card income age
## 1 0.1175214 1 1 1 1 35 47
## 2 0.2818684 1 1 3 1 140 70
## 3 0.2741769 1 0 4 0 35 21
## 4 0.1435508 0 0 1 1 35 62
## 5 0.0000000 0 0 2 0 140 21
## 6 0.0000000 0 1 1 1 80 21
## avg_mktg_cnt zip_code
## 1 56.000000 21230
## 2 14.914286 22301
## 3 20.083333 19002
## 4 8.222222 22304
## 5 1.350000 20124
## 6 2.714286 22033
ggplot(retail_segmentation, aes(x = avg_order_size, y = avg_order_freq)) +
geom_point() +
labs(x = "Average Order Size", y = "Average Order Frequency")

# Log scaled axes
ggplot(retail_segmentation, aes(x = avg_order_size, y = avg_order_freq)) +
theme_bw() +
geom_point(aes(color = factor(own_home))) +
coord_trans(x = "log10", y = "log10") +
labs(x = "Average Order Size", y = "Average Order Frequency") +
ggtitle("Relationship between Average Order Size and Average Order Frequency")

retail_segmentation |>
keep(is.numeric) |>
gather() |>
ggplot(aes(value)) +
facet_wrap( ~ key, scales = "free") +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# In this dataset, Sales is simply the average order size times
# the average order frequency. Profit is defined as a 52% margin
# on sales minus a cost of $0.75 per marketing contact.
df <- retail_segmentation |>
mutate(
sales = avg_order_size * avg_order_freq * (1 - return_rate),
profit = sales * 0.52 - avg_mktg_cnt * 0.75)
seg_data <- df |>
select(sales, profit, income, married, own_home, tenure)
# Compute WSS for k from 1 to 10
wss <- sapply(1:10, function(k) {
kmeans(seg_data, centers = k, nstart = 25)$tot.withinss})
# Plot the elbow curve
ggplot(data.frame(k = 1:10, wss = wss), aes(x = k, y = wss)) +
geom_line() +
geom_point() +
scale_x_continuous(breaks = 1:10)

set.seed(123)
km <- kmeans(seg_data, centers = 4, nstart = 25)
km
## K-means clustering with 4 clusters of sizes 248, 28, 643, 1081
##
## Cluster means:
## sales profit income married own_home tenure
## 1 128.47196 52.123579 71.49194 0.5241935 0.4677419 16.50403
## 2 372.36812 169.286353 95.00000 0.3214286 0.4642857 14.25000
## 3 24.00274 4.283970 127.25505 0.5396579 0.5132193 16.44323
## 4 20.91801 1.801579 45.66142 0.4079556 0.6262720 12.19056
##
## Clustering vector:
## [1] 4 1 1 4 3 4 4 4 4 4 4 3 4 1 4 4 4 3 4 3 4 3 4 1 3 1 3 3 3 3 3 4 4 4 4 4 4
## [38] 4 4 3 4 3 4 4 4 4 4 1 1 4 4 4 4 4 3 4 3 4 3 4 2 3 4 1 4 4 4 4 3 4 4 4 4 1
## [75] 1 4 1 4 4 1 3 3 4 4 3 4 3 1 4 4 4 3 4 4 1 4 3 4 1 4 4 4 4 1 4 3 3 4 4 1 3
## [112] 4 4 4 4 4 4 4 3 4 4 3 4 4 4 3 1 3 4 4 1 3 4 4 3 1 4 4 4 4 4 4 4 4 3 4 3 4
## [149] 3 3 4 3 3 4 3 3 4 4 4 4 3 3 4 4 3 4 3 4 1 1 4 3 3 4 3 4 4 4 3 4 1 1 3 4 4
## [186] 4 4 3 4 1 3 3 1 4 4 4 1 3 4 4 1 4 3 4 3 4 1 4 4 4 4 4 4 3 1 4 3 4 3 3 4 3
## [223] 4 3 3 4 4 4 4 1 4 4 3 4 4 4 4 4 1 4 4 4 3 3 1 3 4 1 1 4 1 4 4 4 4 3 4 1 4
## [260] 1 4 1 3 4 3 3 4 4 4 3 3 4 3 3 4 3 3 4 4 1 4 1 4 3 3 4 4 3 4 3 3 4 4 4 3 1
## [297] 1 3 4 4 3 4 3 4 4 4 4 1 3 4 3 4 4 4 3 4 1 4 3 4 4 4 4 1 4 4 4 4 3 1 3 3 4
## [334] 4 4 3 4 3 3 3 3 3 3 4 4 1 3 3 4 4 4 4 3 4 1 4 4 2 4 1 1 3 3 4 4 4 4 3 3 4
## [371] 4 3 3 3 3 2 1 4 3 4 4 1 4 1 4 3 4 4 3 1 4 4 1 3 3 4 4 1 3 3 4 4 4 4 4 3 1
## [408] 1 1 3 3 4 4 4 4 4 4 3 4 3 1 4 4 4 4 4 4 4 1 4 3 4 4 1 4 4 1 4 3 1 1 4 1 4
## [445] 3 3 4 4 3 4 4 2 3 4 4 4 3 4 3 3 4 4 4 1 4 3 4 1 4 4 3 3 4 3 3 3 4 4 4 4 1
## [482] 3 4 4 3 4 4 3 4 3 4 4 3 3 1 3 3 4 3 1 4 4 3 4 3 1 4 3 3 3 3 4 3 4 4 3 4 4
## [519] 4 4 4 4 4 3 3 3 4 4 3 4 1 4 4 3 3 3 3 3 1 4 3 3 3 4 3 4 4 1 4 4 3 4 2 4 2
## [556] 4 4 4 4 3 4 4 1 4 4 3 4 3 3 1 4 3 3 1 1 3 4 3 3 4 3 4 3 1 3 4 2 4 4 1 4 3
## [593] 4 1 4 3 4 3 3 1 3 3 4 4 3 3 4 4 4 1 4 3 4 2 4 4 4 4 1 3 4 3 1 4 4 4 4 3 3
## [630] 4 4 1 4 4 4 4 3 4 4 3 4 3 4 4 4 4 4 1 1 3 1 1 1 3 2 1 2 4 4 4 4 4 3 3 4 4
## [667] 4 3 1 4 4 3 1 4 1 3 4 1 3 4 4 4 4 1 1 4 3 3 4 3 4 4 4 4 4 4 3 3 4 3 1 4 3
## [704] 3 3 3 3 1 4 4 2 1 1 3 2 4 3 4 4 3 4 4 4 1 4 4 4 4 4 4 3 3 4 3 4 4 3 4 1 4
## [741] 4 4 4 4 4 4 3 4 4 3 3 4 3 3 4 4 3 4 1 4 1 4 4 1 3 1 1 3 3 3 3 4 3 4 3 4 4
## [778] 4 3 4 4 2 1 4 1 4 1 4 4 4 3 3 3 4 4 4 4 3 4 4 4 3 4 4 3 3 4 4 3 3 4 4 4 3
## [815] 4 1 3 4 3 4 3 4 3 4 1 1 3 4 1 4 4 3 4 3 3 4 3 3 4 4 3 4 3 4 4 4 4 4 4 4 3
## [852] 4 4 3 3 4 4 3 4 4 3 3 3 4 3 3 4 1 4 4 1 4 4 3 4 4 3 4 3 4 4 4 3 4 4 3 3 4
## [889] 3 1 4 4 4 4 3 3 4 3 3 4 3 4 1 3 3 3 4 4 4 4 1 4 3 4 4 4 4 4 4 3 4 4 3 4 1
## [926] 3 3 4 4 4 4 4 1 4 4 4 4 4 4 4 3 4 3 4 4 1 4 3 1 4 4 4 4 1 3 3 4 4 4 4 3 3
## [963] 1 4 4 4 3 4 1 1 4 3 4 4 4 4 3 4 4 1 1 4 4 3 4 3 3 4 3 4 4 4 4 4 1 3 4 3 3
## [1000] 3 3 3 4 4 1 3 4 3 4 4 3 3 4 3 4 1 3 3 3 4 4 4 4 3 4 4 3 4 3 2 3 3 3 4 4 1
## [1037] 4 4 4 4 4 4 4 3 4 4 4 4 4 3 3 3 4 3 4 4 4 1 1 4 3 4 3 4 3 1 4 4 4 4 4 3 3
## [1074] 3 4 3 4 3 4 4 4 4 1 1 3 4 3 4 4 4 4 4 3 3 4 4 4 4 3 4 4 1 3 1 3 3 4 4 1 4
## [1111] 3 4 4 3 4 3 4 3 4 3 3 1 4 4 4 4 4 3 3 3 4 3 3 3 3 3 3 3 1 4 3 3 3 3 4 4 3
## [1148] 1 4 3 3 3 4 4 4 4 3 3 1 4 4 4 4 1 3 4 4 3 3 3 4 1 4 3 4 1 4 3 4 3 4 4 1 4
## [1185] 1 3 3 4 3 3 4 3 1 3 4 4 4 4 4 4 3 3 4 4 1 3 3 4 4 4 3 3 3 4 4 4 4 4 4 4 3
## [1222] 4 3 4 3 3 4 4 1 4 3 3 4 3 3 4 3 3 4 3 2 3 3 3 3 4 4 4 4 4 3 3 3 3 4 4 4 4
## [1259] 4 4 3 1 1 4 3 4 4 3 4 4 4 3 3 4 3 4 4 4 4 4 4 3 3 4 4 4 3 4 1 4 3 4 4 4 4
## [1296] 1 3 3 3 4 4 4 4 3 4 4 3 4 4 4 4 3 4 4 4 3 1 4 4 4 3 4 4 4 4 3 3 4 4 4 4 4
## [1333] 3 4 4 4 3 4 4 1 4 1 1 4 4 3 4 3 3 3 3 4 4 1 4 3 4 4 4 4 2 4 3 3 4 1 3 3 4
## [1370] 3 4 3 3 4 4 4 3 3 1 1 4 4 4 3 4 4 1 3 3 3 3 4 1 4 4 3 3 1 4 4 4 3 1 3 4 4
## [1407] 3 4 4 3 4 3 3 1 1 4 1 3 4 3 4 3 3 1 4 4 4 4 2 4 3 4 4 4 3 4 4 4 4 4 3 3 4
## [1444] 3 4 4 4 3 1 4 3 4 3 2 4 3 4 4 4 3 3 3 4 3 3 4 1 3 4 1 2 4 3 4 4 4 3 4 3 4
## [1481] 3 4 4 3 4 4 4 4 1 4 4 4 4 4 4 2 1 4 1 3 3 3 4 3 4 4 4 3 1 4 4 3 4 3 4 4 1
## [1518] 4 4 4 4 4 4 1 4 3 3 3 3 3 1 1 3 4 1 3 4 3 3 3 4 1 4 4 4 1 4 4 4 4 4 4 3 4
## [1555] 3 4 1 1 1 1 3 4 1 4 4 4 4 1 1 3 4 4 4 4 3 4 1 1 3 4 4 4 4 1 1 4 1 3 4 2 4
## [1592] 4 4 4 4 4 4 4 4 4 4 4 2 4 1 4 1 4 3 4 4 4 3 4 3 4 4 3 1 4 4 4 3 3 1 4 4 3
## [1629] 4 4 1 4 4 4 4 3 4 1 4 3 3 3 3 4 4 4 4 4 3 4 4 4 1 4 4 4 4 4 3 1 4 3 4 4 4
## [1666] 3 4 3 4 4 4 4 3 1 3 3 3 3 3 4 4 4 3 4 3 3 3 4 3 4 3 4 1 4 3 4 3 2 2 1 3 3
## [1703] 3 4 1 4 3 1 2 4 1 3 4 4 4 4 3 1 3 3 1 4 3 4 4 4 1 4 4 4 4 4 4 4 3 4 3 3 4
## [1740] 3 4 4 4 3 4 3 4 4 3 3 4 4 4 3 4 1 4 4 3 1 4 3 4 4 1 4 3 4 3 2 1 1 4 4 3 3
## [1777] 4 3 3 3 3 3 4 4 4 4 3 4 3 4 4 4 3 1 1 4 4 4 4 1 4 4 4 1 4 3 4 3 3 3 4 4 4
## [1814] 3 4 4 3 4 3 3 4 3 4 3 4 4 4 3 4 4 4 4 4 3 3 4 1 4 4 3 4 4 4 1 4 3 4 4 3 3
## [1851] 3 3 1 4 2 3 3 4 3 3 3 4 4 4 4 4 4 4 4 4 3 4 4 4 4 3 3 4 2 4 4 3 1 4 1 4 3
## [1888] 4 3 3 3 4 3 3 1 3 4 4 4 4 4 3 4 4 3 1 1 3 4 4 4 4 1 1 4 4 3 4 3 4 3 3 3 3
## [1925] 1 3 4 3 1 3 4 3 4 4 4 4 4 4 4 1 3 4 3 3 4 3 3 4 4 3 4 3 4 3 3 4 4 4 1 3 4
## [1962] 4 3 3 1 3 4 3 1 4 3 4 4 3 3 3 3 1 4 4 3 4 1 4 1 4 4 3 3 4 4 4 3 4 4 4 3 4
## [1999] 3 3
##
## Within cluster sum of squares by cluster:
## [1] 1028457.3 582524.9 1021490.5 1078508.5
## (between_SS / total_SS = 72.0 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
aggregate(df, by = list(cluster = km$cluster), mean)
## cluster Cust_No avg_order_size avg_order_freq crossbuy multichannel
## 1 1 960.6734 42.85727 4.5130665 4.423387 2.024194
## 2 2 1084.3214 38.26149 12.1950857 5.464286 2.214286
## 3 3 1026.2551 34.00571 0.9989449 2.391913 1.508554
## 4 4 992.1462 34.39422 0.9341166 2.246068 1.461610
## per_sale tenure return_rate married own_home household_size
## 1 0.13538171 16.50403 0.1337130 0.5241935 0.4677419 2.931452
## 2 0.13341252 14.25000 0.1250334 0.3214286 0.4642857 2.678571
## 3 0.13042920 16.44323 0.1937715 0.5396579 0.5132193 2.744946
## 4 0.07904374 12.19056 0.1777547 0.4079556 0.6262720 2.933395
## loyalty_card income age avg_mktg_cnt zip_code sales profit
## 1 0.6451613 71.49194 41.28629 19.57579 18292.33 128.47196 52.123579
## 2 0.5000000 95.00000 46.64286 32.46009 17249.71 372.36812 169.286353
## 3 0.6189736 127.25505 43.32504 10.92994 18490.43 24.00274 4.283970
## 4 0.6151711 45.66142 42.98057 12.10105 18263.23 20.91801 1.801579
# Log scaled axes
ggplot(df, aes(x = avg_order_size, y = avg_order_freq)) +
theme_bw() +
geom_point(aes(color = factor(km$cluster))) +
coord_trans(x = "log10", y = "log10") +
labs(x = "Average Order Size", y = "Average Order Frequency") +
ggtitle("Relationship between Average Order Size and Average Order Frequency")

# Clustering visualization
fviz_nbclust(seg_data, kmeans, method = "wss")

fviz_nbclust(seg_data, kmeans, method = "silhouette")

km <- kmeans(seg_data, centers = 4, nstart = 25)
fviz_cluster(km, seg_data, geom = "point", ellipse.type = "norm")

library(plotly)
seg_data <- seg_data |>
mutate(cluster = as.factor(km$cluster))
plot_ly(
x = seg_data$tenure,
y = seg_data$income,
z = seg_data$profit,
type = "scatter3d",
mode = "markers",
color = seg_data$cluster,
size = 1
) |>
layout(scene = list(
xaxis = list(title = "Tenure"),
yaxis = list(title = "Income"),
zaxis = list(title = "Profit")
))