Project 2 Part 2

chooseCRANmirror(graphics = FALSE, ind = 1)
install.packages("tidyverse")

## 
## The downloaded binary packages are in
##  /var/folders/8w/l_qbjg8n5v3_5v1lkl2ww_z80000gn/T//RtmpmxehNm/downloaded_packages

# Load Required Libraries 

library(tidyverse)   # For data manipulation and analysis

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(dplyr)       # For data manipulation
library(data.table)  # For K-means clustering

## 
## Attaching package: 'data.table'
## 
## The following objects are masked from 'package:lubridate':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## 
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## 
## The following object is masked from 'package:purrr':
## 
##     transpose

library(ggplot2)     # For data visualization
install.packages("factoextra")

## 
## The downloaded binary packages are in
##  /var/folders/8w/l_qbjg8n5v3_5v1lkl2ww_z80000gn/T//RtmpmxehNm/downloaded_packages

library(factoextra) # For clustering visualization

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

install.packages("plotly")

## 
## The downloaded binary packages are in
##  /var/folders/8w/l_qbjg8n5v3_5v1lkl2ww_z80000gn/T//RtmpmxehNm/downloaded_packages

library(plotly)     # For interactive visualization

## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout

# Load retail.segmentation.csv from your working directory
retail_segmentation <-read.csv("retail_segmentation.csv")

head(retail_segmentation)

##   Cust_No avg_order_size avg_order_freq crossbuy multichannel   per_sale tenure
## 1       1      23.400000      2.2222222        3            2 0.00000000      3
## 2       2      34.260377      6.6250000        7            2 0.11111111     35
## 3       3      43.575641      4.8750000        5            2 0.07407407     12
## 4       4      26.316667      0.9000000        4            2 0.25000000      9
## 5       5       8.269231      1.0833333        3            1 0.50000000     40
## 6       6      21.500000      0.2222222        1            2 0.00000000      7
##   return_rate married own_home household_size loyalty_card income age
## 1   0.1175214       1        1              1            1     35  47
## 2   0.2818684       1        1              3            1    140  70
## 3   0.2741769       1        0              4            0     35  21
## 4   0.1435508       0        0              1            1     35  62
## 5   0.0000000       0        0              2            0    140  21
## 6   0.0000000       0        1              1            1     80  21
##   avg_mktg_cnt zip_code
## 1    56.000000    21230
## 2    14.914286    22301
## 3    20.083333    19002
## 4     8.222222    22304
## 5     1.350000    20124
## 6     2.714286    22033

ggplot(retail_segmentation, aes(x = avg_order_size, y = avg_order_freq)) +
  geom_point() +
  labs(x = "Average Order Size", y = "Average Order Frequency")

# Log scaled axes
ggplot(retail_segmentation, aes(x = avg_order_size, y = avg_order_freq)) +
  theme_bw() +
  geom_point(aes(color = factor(own_home))) +
  coord_trans(x = "log10", y = "log10") +
  labs(x = "Average Order Size", y = "Average Order Frequency") +
  ggtitle("Relationship between Average Order Size and Average Order Frequency")

retail_segmentation |>
  keep(is.numeric) |>
  gather() |>
  ggplot(aes(value)) +
  facet_wrap( ~ key, scales = "free") +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# In this dataset, Sales is simply the average order size times
# the average order frequency. Profit is defined as a 52% margin
# on sales minus a cost of $0.75 per marketing contact.

df <- retail_segmentation |>
  mutate(
    sales = avg_order_size * avg_order_freq * (1 - return_rate),
    profit = sales * 0.52 - avg_mktg_cnt * 0.75)

seg_data <- df |>
  select(sales, profit, income, married, own_home, tenure)

# Compute WSS for k from 1 to 10
wss <- sapply(1:10, function(k) {
  kmeans(seg_data, centers = k, nstart = 25)$tot.withinss})

# Plot the elbow curve
ggplot(data.frame(k = 1:10, wss = wss), aes(x = k, y = wss)) +
  geom_line() +
  geom_point() +
  scale_x_continuous(breaks = 1:10)

set.seed(123)

km <- kmeans(seg_data, centers = 4, nstart = 25)

km

## K-means clustering with 4 clusters of sizes 248, 28, 643, 1081
## 
## Cluster means:
##       sales     profit    income   married  own_home   tenure
## 1 128.47196  52.123579  71.49194 0.5241935 0.4677419 16.50403
## 2 372.36812 169.286353  95.00000 0.3214286 0.4642857 14.25000
## 3  24.00274   4.283970 127.25505 0.5396579 0.5132193 16.44323
## 4  20.91801   1.801579  45.66142 0.4079556 0.6262720 12.19056
## 
## Clustering vector:
##    [1] 4 1 1 4 3 4 4 4 4 4 4 3 4 1 4 4 4 3 4 3 4 3 4 1 3 1 3 3 3 3 3 4 4 4 4 4 4
##   [38] 4 4 3 4 3 4 4 4 4 4 1 1 4 4 4 4 4 3 4 3 4 3 4 2 3 4 1 4 4 4 4 3 4 4 4 4 1
##   [75] 1 4 1 4 4 1 3 3 4 4 3 4 3 1 4 4 4 3 4 4 1 4 3 4 1 4 4 4 4 1 4 3 3 4 4 1 3
##  [112] 4 4 4 4 4 4 4 3 4 4 3 4 4 4 3 1 3 4 4 1 3 4 4 3 1 4 4 4 4 4 4 4 4 3 4 3 4
##  [149] 3 3 4 3 3 4 3 3 4 4 4 4 3 3 4 4 3 4 3 4 1 1 4 3 3 4 3 4 4 4 3 4 1 1 3 4 4
##  [186] 4 4 3 4 1 3 3 1 4 4 4 1 3 4 4 1 4 3 4 3 4 1 4 4 4 4 4 4 3 1 4 3 4 3 3 4 3
##  [223] 4 3 3 4 4 4 4 1 4 4 3 4 4 4 4 4 1 4 4 4 3 3 1 3 4 1 1 4 1 4 4 4 4 3 4 1 4
##  [260] 1 4 1 3 4 3 3 4 4 4 3 3 4 3 3 4 3 3 4 4 1 4 1 4 3 3 4 4 3 4 3 3 4 4 4 3 1
##  [297] 1 3 4 4 3 4 3 4 4 4 4 1 3 4 3 4 4 4 3 4 1 4 3 4 4 4 4 1 4 4 4 4 3 1 3 3 4
##  [334] 4 4 3 4 3 3 3 3 3 3 4 4 1 3 3 4 4 4 4 3 4 1 4 4 2 4 1 1 3 3 4 4 4 4 3 3 4
##  [371] 4 3 3 3 3 2 1 4 3 4 4 1 4 1 4 3 4 4 3 1 4 4 1 3 3 4 4 1 3 3 4 4 4 4 4 3 1
##  [408] 1 1 3 3 4 4 4 4 4 4 3 4 3 1 4 4 4 4 4 4 4 1 4 3 4 4 1 4 4 1 4 3 1 1 4 1 4
##  [445] 3 3 4 4 3 4 4 2 3 4 4 4 3 4 3 3 4 4 4 1 4 3 4 1 4 4 3 3 4 3 3 3 4 4 4 4 1
##  [482] 3 4 4 3 4 4 3 4 3 4 4 3 3 1 3 3 4 3 1 4 4 3 4 3 1 4 3 3 3 3 4 3 4 4 3 4 4
##  [519] 4 4 4 4 4 3 3 3 4 4 3 4 1 4 4 3 3 3 3 3 1 4 3 3 3 4 3 4 4 1 4 4 3 4 2 4 2
##  [556] 4 4 4 4 3 4 4 1 4 4 3 4 3 3 1 4 3 3 1 1 3 4 3 3 4 3 4 3 1 3 4 2 4 4 1 4 3
##  [593] 4 1 4 3 4 3 3 1 3 3 4 4 3 3 4 4 4 1 4 3 4 2 4 4 4 4 1 3 4 3 1 4 4 4 4 3 3
##  [630] 4 4 1 4 4 4 4 3 4 4 3 4 3 4 4 4 4 4 1 1 3 1 1 1 3 2 1 2 4 4 4 4 4 3 3 4 4
##  [667] 4 3 1 4 4 3 1 4 1 3 4 1 3 4 4 4 4 1 1 4 3 3 4 3 4 4 4 4 4 4 3 3 4 3 1 4 3
##  [704] 3 3 3 3 1 4 4 2 1 1 3 2 4 3 4 4 3 4 4 4 1 4 4 4 4 4 4 3 3 4 3 4 4 3 4 1 4
##  [741] 4 4 4 4 4 4 3 4 4 3 3 4 3 3 4 4 3 4 1 4 1 4 4 1 3 1 1 3 3 3 3 4 3 4 3 4 4
##  [778] 4 3 4 4 2 1 4 1 4 1 4 4 4 3 3 3 4 4 4 4 3 4 4 4 3 4 4 3 3 4 4 3 3 4 4 4 3
##  [815] 4 1 3 4 3 4 3 4 3 4 1 1 3 4 1 4 4 3 4 3 3 4 3 3 4 4 3 4 3 4 4 4 4 4 4 4 3
##  [852] 4 4 3 3 4 4 3 4 4 3 3 3 4 3 3 4 1 4 4 1 4 4 3 4 4 3 4 3 4 4 4 3 4 4 3 3 4
##  [889] 3 1 4 4 4 4 3 3 4 3 3 4 3 4 1 3 3 3 4 4 4 4 1 4 3 4 4 4 4 4 4 3 4 4 3 4 1
##  [926] 3 3 4 4 4 4 4 1 4 4 4 4 4 4 4 3 4 3 4 4 1 4 3 1 4 4 4 4 1 3 3 4 4 4 4 3 3
##  [963] 1 4 4 4 3 4 1 1 4 3 4 4 4 4 3 4 4 1 1 4 4 3 4 3 3 4 3 4 4 4 4 4 1 3 4 3 3
## [1000] 3 3 3 4 4 1 3 4 3 4 4 3 3 4 3 4 1 3 3 3 4 4 4 4 3 4 4 3 4 3 2 3 3 3 4 4 1
## [1037] 4 4 4 4 4 4 4 3 4 4 4 4 4 3 3 3 4 3 4 4 4 1 1 4 3 4 3 4 3 1 4 4 4 4 4 3 3
## [1074] 3 4 3 4 3 4 4 4 4 1 1 3 4 3 4 4 4 4 4 3 3 4 4 4 4 3 4 4 1 3 1 3 3 4 4 1 4
## [1111] 3 4 4 3 4 3 4 3 4 3 3 1 4 4 4 4 4 3 3 3 4 3 3 3 3 3 3 3 1 4 3 3 3 3 4 4 3
## [1148] 1 4 3 3 3 4 4 4 4 3 3 1 4 4 4 4 1 3 4 4 3 3 3 4 1 4 3 4 1 4 3 4 3 4 4 1 4
## [1185] 1 3 3 4 3 3 4 3 1 3 4 4 4 4 4 4 3 3 4 4 1 3 3 4 4 4 3 3 3 4 4 4 4 4 4 4 3
## [1222] 4 3 4 3 3 4 4 1 4 3 3 4 3 3 4 3 3 4 3 2 3 3 3 3 4 4 4 4 4 3 3 3 3 4 4 4 4
## [1259] 4 4 3 1 1 4 3 4 4 3 4 4 4 3 3 4 3 4 4 4 4 4 4 3 3 4 4 4 3 4 1 4 3 4 4 4 4
## [1296] 1 3 3 3 4 4 4 4 3 4 4 3 4 4 4 4 3 4 4 4 3 1 4 4 4 3 4 4 4 4 3 3 4 4 4 4 4
## [1333] 3 4 4 4 3 4 4 1 4 1 1 4 4 3 4 3 3 3 3 4 4 1 4 3 4 4 4 4 2 4 3 3 4 1 3 3 4
## [1370] 3 4 3 3 4 4 4 3 3 1 1 4 4 4 3 4 4 1 3 3 3 3 4 1 4 4 3 3 1 4 4 4 3 1 3 4 4
## [1407] 3 4 4 3 4 3 3 1 1 4 1 3 4 3 4 3 3 1 4 4 4 4 2 4 3 4 4 4 3 4 4 4 4 4 3 3 4
## [1444] 3 4 4 4 3 1 4 3 4 3 2 4 3 4 4 4 3 3 3 4 3 3 4 1 3 4 1 2 4 3 4 4 4 3 4 3 4
## [1481] 3 4 4 3 4 4 4 4 1 4 4 4 4 4 4 2 1 4 1 3 3 3 4 3 4 4 4 3 1 4 4 3 4 3 4 4 1
## [1518] 4 4 4 4 4 4 1 4 3 3 3 3 3 1 1 3 4 1 3 4 3 3 3 4 1 4 4 4 1 4 4 4 4 4 4 3 4
## [1555] 3 4 1 1 1 1 3 4 1 4 4 4 4 1 1 3 4 4 4 4 3 4 1 1 3 4 4 4 4 1 1 4 1 3 4 2 4
## [1592] 4 4 4 4 4 4 4 4 4 4 4 2 4 1 4 1 4 3 4 4 4 3 4 3 4 4 3 1 4 4 4 3 3 1 4 4 3
## [1629] 4 4 1 4 4 4 4 3 4 1 4 3 3 3 3 4 4 4 4 4 3 4 4 4 1 4 4 4 4 4 3 1 4 3 4 4 4
## [1666] 3 4 3 4 4 4 4 3 1 3 3 3 3 3 4 4 4 3 4 3 3 3 4 3 4 3 4 1 4 3 4 3 2 2 1 3 3
## [1703] 3 4 1 4 3 1 2 4 1 3 4 4 4 4 3 1 3 3 1 4 3 4 4 4 1 4 4 4 4 4 4 4 3 4 3 3 4
## [1740] 3 4 4 4 3 4 3 4 4 3 3 4 4 4 3 4 1 4 4 3 1 4 3 4 4 1 4 3 4 3 2 1 1 4 4 3 3
## [1777] 4 3 3 3 3 3 4 4 4 4 3 4 3 4 4 4 3 1 1 4 4 4 4 1 4 4 4 1 4 3 4 3 3 3 4 4 4
## [1814] 3 4 4 3 4 3 3 4 3 4 3 4 4 4 3 4 4 4 4 4 3 3 4 1 4 4 3 4 4 4 1 4 3 4 4 3 3
## [1851] 3 3 1 4 2 3 3 4 3 3 3 4 4 4 4 4 4 4 4 4 3 4 4 4 4 3 3 4 2 4 4 3 1 4 1 4 3
## [1888] 4 3 3 3 4 3 3 1 3 4 4 4 4 4 3 4 4 3 1 1 3 4 4 4 4 1 1 4 4 3 4 3 4 3 3 3 3
## [1925] 1 3 4 3 1 3 4 3 4 4 4 4 4 4 4 1 3 4 3 3 4 3 3 4 4 3 4 3 4 3 3 4 4 4 1 3 4
## [1962] 4 3 3 1 3 4 3 1 4 3 4 4 3 3 3 3 1 4 4 3 4 1 4 1 4 4 3 3 4 4 4 3 4 4 4 3 4
## [1999] 3 3
## 
## Within cluster sum of squares by cluster:
## [1] 1028457.3  582524.9 1021490.5 1078508.5
##  (between_SS / total_SS =  72.0 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

aggregate(df, by = list(cluster = km$cluster), mean)

##   cluster   Cust_No avg_order_size avg_order_freq crossbuy multichannel
## 1       1  960.6734       42.85727      4.5130665 4.423387     2.024194
## 2       2 1084.3214       38.26149     12.1950857 5.464286     2.214286
## 3       3 1026.2551       34.00571      0.9989449 2.391913     1.508554
## 4       4  992.1462       34.39422      0.9341166 2.246068     1.461610
##     per_sale   tenure return_rate   married  own_home household_size
## 1 0.13538171 16.50403   0.1337130 0.5241935 0.4677419       2.931452
## 2 0.13341252 14.25000   0.1250334 0.3214286 0.4642857       2.678571
## 3 0.13042920 16.44323   0.1937715 0.5396579 0.5132193       2.744946
## 4 0.07904374 12.19056   0.1777547 0.4079556 0.6262720       2.933395
##   loyalty_card    income      age avg_mktg_cnt zip_code     sales     profit
## 1    0.6451613  71.49194 41.28629     19.57579 18292.33 128.47196  52.123579
## 2    0.5000000  95.00000 46.64286     32.46009 17249.71 372.36812 169.286353
## 3    0.6189736 127.25505 43.32504     10.92994 18490.43  24.00274   4.283970
## 4    0.6151711  45.66142 42.98057     12.10105 18263.23  20.91801   1.801579

# Log scaled axes
ggplot(df, aes(x = avg_order_size, y = avg_order_freq)) +
  theme_bw() +
  geom_point(aes(color = factor(km$cluster))) +
  coord_trans(x = "log10", y = "log10") +
  labs(x = "Average Order Size", y = "Average Order Frequency") +
  ggtitle("Relationship between Average Order Size and Average Order Frequency")

# Clustering visualization

fviz_nbclust(seg_data, kmeans, method = "wss")

fviz_nbclust(seg_data, kmeans, method = "silhouette")

km <- kmeans(seg_data, centers = 4, nstart = 25)

fviz_cluster(km, seg_data, geom = "point", ellipse.type = "norm")

library(plotly)

seg_data <- seg_data |>
  mutate(cluster = as.factor(km$cluster))

plot_ly(
  x = seg_data$tenure,
  y = seg_data$income,
  z = seg_data$profit,
  type = "scatter3d",
  mode = "markers",
  color = seg_data$cluster,
  size = 1
) |>
  layout(scene = list(
    xaxis = list(title = "Tenure"),
    yaxis = list(title = "Income"),
    zaxis = list(title = "Profit")
  ))

Project 2 Part 2

Kaitlyn

2025-02-26