INTRODUCTION

This file contains the r code for the second half of Project Two. Here we run a K-means analysis on the code for the retail segmentation. The data set contains information about a retailer with a range of variables like demographics, frequency, purchase size, tenure with retailer etc.

Step 1: Here I am setting up the directory and loading in the file used for the Project. As Well, all libraries used are loaded here as well. We displayed the first few rows of the loaded retail data set.

getwd()
## [1] "C:/Users/kylev/OneDrive/Documents"
setwd("C:/Users/kylev/OneDrive/Documents")
library(tidyverse)   # For data manipulation and analysis
## Warning: package 'tidyverse' was built under R version 4.3.3
## Warning: package 'ggplot2' was built under R version 4.3.3
## Warning: package 'tidyr' was built under R version 4.3.3
## Warning: package 'dplyr' was built under R version 4.3.3
## Warning: package 'lubridate' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)       # For data manipulation
library(data.table)  # For K-means clustering
## Warning: package 'data.table' was built under R version 4.3.3
## 
## Attaching package: 'data.table'
## 
## The following objects are masked from 'package:lubridate':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## 
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## 
## The following object is masked from 'package:purrr':
## 
##     transpose
library(ggplot2)     # For data visualization
library (factoextra) # For clustering visualization
## Warning: package 'factoextra' was built under R version 4.3.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library (plotly)     # For interactive visualization
## Warning: package 'plotly' was built under R version 4.3.3
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
library(pastecs)     # descripive statistics of data
## Warning: package 'pastecs' was built under R version 4.3.3
## 
## Attaching package: 'pastecs'
## 
## The following objects are masked from 'package:data.table':
## 
##     first, last
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
library (cluster)    # For clustering analysis
## Warning: package 'cluster' was built under R version 4.3.3

Step 2: Here we are loading in the file used for the Project, naming it retail_segmentation. We explore the data set, checking the first few rows of the data, the structure of the data, and the summary statistics of the data. We also constructed a few graphs to start to visualize the relationship between the variables.

retail_segmentation <-read.csv("retail_segmentation.csv")

head(retail_segmentation)
##   Cust_No avg_order_size avg_order_freq crossbuy multichannel   per_sale tenure
## 1       1      23.400000      2.2222222        3            2 0.00000000      3
## 2       2      34.260377      6.6250000        7            2 0.11111111     35
## 3       3      43.575641      4.8750000        5            2 0.07407407     12
## 4       4      26.316667      0.9000000        4            2 0.25000000      9
## 5       5       8.269231      1.0833333        3            1 0.50000000     40
## 6       6      21.500000      0.2222222        1            2 0.00000000      7
##   return_rate married own_home household_size loyalty_card income age
## 1   0.1175214       1        1              1            1     35  47
## 2   0.2818684       1        1              3            1    140  70
## 3   0.2741769       1        0              4            0     35  21
## 4   0.1435508       0        0              1            1     35  62
## 5   0.0000000       0        0              2            0    140  21
## 6   0.0000000       0        1              1            1     80  21
##   avg_mktg_cnt zip_code
## 1    56.000000    21230
## 2    14.914286    22301
## 3    20.083333    19002
## 4     8.222222    22304
## 5     1.350000    20124
## 6     2.714286    22033
str(retail_segmentation)
## 'data.frame':    2000 obs. of  16 variables:
##  $ Cust_No       : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ avg_order_size: num  23.4 34.26 43.58 26.32 8.27 ...
##  $ avg_order_freq: num  2.22 6.62 4.88 0.9 1.08 ...
##  $ crossbuy      : int  3 7 5 4 3 1 2 3 1 1 ...
##  $ multichannel  : int  2 2 2 2 1 2 1 2 1 1 ...
##  $ per_sale      : num  0 0.1111 0.0741 0.25 0.5 ...
##  $ tenure        : int  3 35 12 9 40 7 8 17 14 3 ...
##  $ return_rate   : num  0.118 0.282 0.274 0.144 0 ...
##  $ married       : int  1 1 1 0 0 0 1 0 0 0 ...
##  $ own_home      : int  1 1 0 0 0 1 0 1 1 1 ...
##  $ household_size: int  1 3 4 1 2 1 1 1 2 8 ...
##  $ loyalty_card  : int  1 1 0 1 0 1 0 1 1 1 ...
##  $ income        : int  35 140 35 35 140 80 70 35 35 35 ...
##  $ age           : int  47 70 21 62 21 21 86 70 57 21 ...
##  $ avg_mktg_cnt  : num  56 14.91 20.08 8.22 1.35 ...
##  $ zip_code      : int  21230 22301 19002 22304 20124 22033 8757 8109 21122 21208 ...
summary(retail_segmentation)
##     Cust_No       avg_order_size    avg_order_freq        crossbuy    
##  Min.   :   1.0   Min.   :  1.833   Min.   : 0.02778   Min.   :1.000  
##  1st Qu.: 500.8   1st Qu.: 23.157   1st Qu.: 0.30769   1st Qu.:1.000  
##  Median :1000.5   Median : 30.790   Median : 0.76923   Median :2.000  
##  Mean   :1000.5   Mean   : 35.373   Mean   : 1.55640   Mean   :2.608  
##  3rd Qu.:1500.2   3rd Qu.: 40.959   3rd Qu.: 1.90584   3rd Qu.:4.000  
##  Max.   :2000.0   Max.   :528.250   Max.   :31.87500   Max.   :7.000  
##   multichannel      per_sale          tenure       return_rate     
##  Min.   :1.000   Min.   :0.0000   Min.   : 1.00   Min.   :0.00000  
##  1st Qu.:1.000   1st Qu.:0.0000   1st Qu.: 4.00   1st Qu.:0.00000  
##  Median :1.000   Median :0.0000   Median :10.00   Median :0.01947  
##  Mean   :1.557   Mean   :0.1033   Mean   :14.12   Mean   :0.17671  
##  3rd Qu.:2.000   3rd Qu.:0.1400   3rd Qu.:20.00   3rd Qu.:0.24560  
##  Max.   :3.000   Max.   :1.0000   Max.   :40.00   Max.   :6.90909  
##     married          own_home     household_size   loyalty_card   
##  Min.   :0.0000   Min.   :0.000   Min.   :1.000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.000   1st Qu.:1.000   1st Qu.:0.0000  
##  Median :0.0000   Median :1.000   Median :2.000   Median :1.0000  
##  Mean   :0.4635   Mean   :0.568   Mean   :2.869   Mean   :0.6185  
##  3rd Qu.:1.0000   3rd Qu.:1.000   3rd Qu.:4.000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :1.000   Max.   :8.000   Max.   :1.0000  
##      income            age         avg_mktg_cnt       zip_code    
##  Min.   : 35.00   Min.   :21.00   Min.   :  0.00   Min.   : 7726  
##  1st Qu.: 35.00   1st Qu.:21.00   1st Qu.:  4.00   1st Qu.:19010  
##  Median : 60.00   Median :37.00   Median :  7.75   Median :20854  
##  Mean   : 75.79   Mean   :42.93   Mean   : 12.94   Mean   :18326  
##  3rd Qu.:110.00   3rd Qu.:61.25   3rd Qu.: 15.00   3rd Qu.:21777  
##  Max.   :175.00   Max.   :99.00   Max.   :297.00   Max.   :24060

The following graph generated uses the retail segmentation data set and plots a scatter graph comparing and show the relationship between the average order size and the order frequency

ggplot(retail_segmentation, aes(x = avg_order_size, y = avg_order_freq)) +
  geom_point() +
  labs(x = "Average Order Size", y = "Average Order Frequency")

Here we smooth out the graph and add in color to better show the relationship between the order size and frequency. As well we differentiate the customers who own a home and who do not own a home.

# Log scaled axes
ggplot(retail_segmentation, aes(x = avg_order_size, y = avg_order_freq)) +
  theme_bw() +
  geom_point(aes(color = factor(own_home))) +
  coord_trans(x = "log10", y = "log10") +
  labs(x = "Average Order Size", y = "Average Order Frequency") +
  ggtitle("Relationship between Average Order Size and Average Order Frequency")

This next chunk of code shows the distribution of each customer and how the data is spread across the different variables. We want to target the people with higher frequences, higher order size. The x axis displays the values of each variable, the Y- axis displays the count of each value.

retail_segmentation |>
  keep(is.numeric) |>
  gather() |>
  ggplot(aes(value)) +
  facet_wrap( ~ key, scales = "free") +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Step 3: Next we took the data and manipulated it to better process it for the rest of the assignment. We created the variable sales and profit variables for each customer. Sales is the average size of order times frequency. Profit is a 52% margin on sales minus a cost of $0.75 per marketing contact. We then updated the data frame to see the new rows that were created.

retail_segmentation <- retail_segmentation |>
  mutate(
    sales = avg_order_size * avg_order_freq * (1 - return_rate),
    profit = sales * 0.52 - avg_mktg_cnt * 0.75)

view(retail_segmentation)

Step 4: Here we select the variables that we wanted to focus on for our analysis. Our main analysis goal is to see the impact of how profit is impacted by different variables. In summarry the variables that we want to focus on are the variables: sales, profit, income, married, avg_mktg_cnt and tenure. Some other variables do get used.

seg_data <- retail_segmentation |>
  select(sales, profit, income, married, avg_mktg_cnt, tenure, crossbuy)

Step 5: In section we use the elbow method to determine our optimal number of clusters. We find the center of the clusters, and then we use the sum of the squares to plot them them one by one in our elbow curve. After 5 clusters there is a noticeable drop in the difference between each added cluster.

# Compute WSS for k from 1 to 10
wss <- sapply(1:10, function(k) {
  kmeans(seg_data, centers = k, nstart = 25)$tot.withinss})

# Plot the elbow curve
ggplot(data.frame(k = 1:10, wss = wss), aes(x = k, y = wss)) +
  geom_line() +
  geom_point() +
  scale_x_continuous(breaks = 1:10)

Step 6: By assigning a seed we are set the randomness of the clusters to become consistent and ensures reproducibility in the number generator. We then ran the kmeans for 5 clusters. Next we display the information from the k means. This contains the cluster assignments, the centers and more. The metric between_SS/total_ss shows how well the clusters are seperated. 72.2% is the amount of variation that is explained with our model.

set.seed(123)

kmeans_model <- kmeans(seg_data, centers = 5, nstart = 25)

kmeans_model
## K-means clustering with 5 clusters of sizes 916, 479, 240, 28, 337
## 
## Cluster means:
##       sales     profit    income   married avg_mktg_cnt   tenure crossbuy
## 1  21.01485   1.800171  40.04913 0.4159389     12.17007 11.99672 2.242358
## 2  21.85945   2.954788  94.39457 0.4488518     11.21617 13.68267 2.277662
## 3 129.63654  52.452462  68.97917 0.5041667     19.94472 16.07917 4.425000
## 4 372.36812 169.286353  95.00000 0.3214286     32.46009 14.25000 5.464286
## 5  26.92618   5.863075 149.73294 0.5964392     10.85139 19.11573 2.540059
## 
## Clustering vector:
##    [1] 1 3 3 1 5 2 2 1 1 1 1 2 2 5 2 1 1 5 1 2 2 5 2 3 5 3 5 5 5 2 5 2 1 1 1 1 1
##   [38] 1 2 5 1 2 2 1 1 1 1 3 3 1 1 1 1 1 5 1 5 1 5 1 4 2 1 3 1 1 1 1 5 1 1 1 1 3
##   [75] 3 1 3 2 1 3 2 5 1 1 2 1 2 3 1 1 1 2 1 1 3 1 5 1 3 1 1 1 2 3 1 5 5 1 1 3 5
##  [112] 1 1 2 1 1 1 1 5 1 1 2 1 1 1 2 3 2 2 2 3 2 1 2 2 5 1 1 1 1 1 1 1 2 5 1 2 1
##  [149] 2 2 1 5 2 1 2 5 2 1 1 1 2 5 2 1 2 2 2 1 3 3 1 2 2 1 2 1 1 1 5 1 3 3 2 1 1
##  [186] 2 1 2 1 3 2 5 3 1 1 1 3 2 1 1 5 2 5 1 2 2 3 2 1 1 1 1 1 5 3 1 2 1 2 5 1 5
##  [223] 1 5 5 1 1 2 1 3 1 2 5 1 1 1 1 2 3 1 1 1 2 5 3 2 1 3 3 1 3 1 1 1 1 2 1 3 1
##  [260] 3 1 3 2 1 5 5 1 1 1 2 2 1 2 5 1 5 5 1 1 3 1 3 1 5 5 1 1 2 1 2 2 2 1 1 5 3
##  [297] 3 5 1 1 2 1 5 1 1 1 1 3 5 2 2 1 1 1 2 2 3 1 2 2 1 2 1 3 1 1 1 1 5 3 5 2 1
##  [334] 2 1 2 1 5 5 5 5 5 2 1 1 3 5 2 1 1 1 1 2 2 3 1 1 4 1 3 3 5 5 1 1 1 2 5 5 1
##  [371] 1 5 2 2 5 4 3 1 5 1 1 3 2 3 1 2 1 1 2 3 1 1 3 5 5 1 1 3 5 2 1 1 1 1 1 2 3
##  [408] 3 3 5 5 1 1 1 1 2 1 2 1 2 3 1 1 1 2 1 1 2 3 2 2 1 1 3 2 1 3 1 5 3 3 1 3 1
##  [445] 5 5 1 1 2 1 1 4 5 1 1 1 5 1 5 2 1 1 1 3 1 5 2 3 1 1 5 5 1 5 2 2 2 2 2 2 3
##  [482] 5 1 1 5 1 1 5 1 5 1 1 5 5 3 2 2 2 2 3 1 1 5 1 2 3 1 5 2 5 2 1 5 1 1 2 1 1
##  [519] 1 1 1 1 1 5 2 5 1 1 5 1 3 1 2 2 5 2 5 5 3 1 2 2 5 1 5 2 1 3 1 1 5 1 4 1 4
##  [556] 1 1 2 1 5 1 1 3 1 1 5 1 5 2 3 1 2 5 3 3 5 1 5 2 2 5 1 5 3 5 1 4 2 1 3 1 2
##  [593] 1 3 1 2 1 2 5 3 5 5 1 1 5 5 1 1 1 3 1 5 1 4 1 1 1 2 3 2 2 5 3 2 1 1 1 2 2
##  [630] 2 1 3 1 1 1 1 2 1 1 2 1 5 1 1 2 1 2 3 2 2 3 3 3 5 4 3 4 2 1 1 1 1 5 2 1 1
##  [667] 1 5 3 1 2 2 3 2 3 2 2 5 5 1 1 1 1 3 3 1 5 2 1 2 1 1 2 2 1 1 2 5 2 5 3 1 2
##  [704] 2 5 5 2 3 1 1 4 3 3 5 4 1 5 1 2 2 1 3 1 3 1 1 2 1 1 2 5 5 1 5 1 1 2 1 3 1
##  [741] 2 1 1 1 1 1 2 1 1 5 2 1 5 5 1 1 5 1 3 1 3 1 1 3 5 3 3 5 5 2 5 1 2 1 5 1 1
##  [778] 1 2 1 1 4 3 1 3 1 3 1 1 1 2 2 5 1 2 1 1 5 1 1 1 5 1 1 2 2 1 1 5 5 1 2 1 2
##  [815] 1 3 2 2 2 1 5 1 5 1 3 3 2 1 3 1 1 2 1 2 5 1 5 2 1 2 2 1 2 2 1 1 1 1 1 1 5
##  [852] 1 1 5 2 1 1 2 1 1 5 2 2 1 5 2 1 3 1 1 3 1 2 2 1 1 2 1 5 1 2 1 5 1 2 5 5 1
##  [889] 5 3 1 1 1 1 5 2 2 5 2 1 2 1 2 5 2 2 1 2 1 1 3 1 2 1 2 2 1 1 1 2 1 1 2 2 3
##  [926] 2 5 1 1 2 1 1 3 1 1 1 1 1 2 1 5 1 2 1 1 3 1 5 3 1 2 1 1 3 5 5 1 1 1 2 5 5
##  [963] 3 1 1 1 2 2 3 3 1 2 1 1 1 1 2 1 1 3 3 1 2 5 2 5 5 1 5 1 1 1 1 1 3 2 1 2 2
## [1000] 2 5 2 1 1 3 5 1 2 1 1 5 2 1 2 2 3 2 5 2 1 1 2 1 5 1 1 5 1 2 4 5 2 5 1 1 3
## [1037] 1 1 1 1 1 1 1 5 1 1 1 1 1 2 5 5 1 5 1 1 1 3 3 2 2 1 5 1 2 3 1 1 1 1 1 2 2
## [1074] 2 1 5 1 5 1 1 1 1 3 3 2 1 5 1 1 1 2 1 5 2 1 1 1 1 5 2 1 3 2 3 5 5 1 1 3 1
## [1111] 5 1 1 2 1 5 2 5 2 5 5 3 1 1 1 1 1 5 5 2 1 2 5 2 5 5 2 5 3 1 5 5 5 2 1 2 2
## [1148] 3 2 5 5 5 1 1 1 1 2 2 3 1 1 1 1 3 2 1 1 2 2 5 1 3 1 2 1 3 1 5 1 5 1 2 3 1
## [1185] 3 5 2 1 5 5 1 2 3 5 1 1 1 1 1 2 5 5 1 1 3 2 5 1 1 1 5 2 2 1 1 1 1 2 1 1 2
## [1222] 1 2 1 2 2 1 1 3 1 2 5 1 2 5 1 2 2 1 2 4 5 5 5 2 1 1 2 1 1 2 5 5 2 1 1 2 2
## [1259] 1 1 5 3 3 1 2 1 1 5 1 1 1 5 2 1 2 2 2 1 2 1 1 5 5 1 1 1 2 1 3 1 2 1 1 1 1
## [1296] 3 2 5 2 1 1 1 1 2 2 1 2 1 1 1 1 2 1 1 1 5 3 1 1 1 2 1 1 1 2 2 5 1 1 2 1 1
## [1333] 5 1 1 1 5 1 1 3 1 3 3 1 1 2 1 5 5 2 2 1 2 3 1 2 1 1 1 1 4 1 2 5 1 3 5 2 1
## [1370] 2 1 2 2 1 1 2 5 5 3 5 1 1 1 5 2 2 3 2 5 2 2 1 3 1 1 5 2 3 1 1 1 2 3 5 1 1
## [1407] 2 1 1 5 1 5 5 3 3 1 3 2 2 5 1 2 5 3 1 1 1 1 4 1 5 1 1 1 5 1 1 1 1 1 2 2 1
## [1444] 2 2 1 1 5 3 1 5 1 5 4 1 5 1 1 1 2 5 5 1 5 5 1 3 5 1 3 4 1 2 1 1 2 2 2 2 2
## [1481] 2 1 1 5 1 1 1 1 3 2 1 1 1 2 1 4 3 1 3 5 5 5 1 2 1 1 1 2 3 1 1 5 1 2 1 1 3
## [1518] 1 1 1 1 1 2 3 1 2 2 5 5 2 3 5 2 1 3 5 1 5 2 2 2 3 1 1 1 3 2 2 1 2 1 1 5 1
## [1555] 2 1 3 3 3 3 5 2 2 1 1 1 1 3 3 2 1 1 1 1 2 1 3 3 2 1 2 1 2 3 3 1 3 5 2 4 1
## [1592] 1 1 1 1 1 1 1 1 2 2 1 4 1 3 1 3 1 5 2 1 1 2 1 5 2 1 2 3 1 1 1 2 2 3 1 1 2
## [1629] 1 2 3 1 1 1 1 5 2 3 1 5 2 5 5 1 1 1 1 1 2 1 1 1 3 1 2 1 2 2 5 3 2 5 1 2 1
## [1666] 2 1 5 1 1 1 1 2 3 5 5 5 5 5 1 1 1 2 1 2 5 2 1 5 1 5 1 3 1 5 1 5 4 4 3 5 5
## [1703] 5 1 3 1 2 3 4 1 3 5 1 1 1 1 2 3 5 5 3 1 2 1 1 1 3 1 1 1 1 2 1 1 2 2 5 5 1
## [1740] 2 1 1 2 2 1 2 1 1 2 2 2 1 1 2 1 3 1 1 5 3 2 2 1 1 3 1 2 1 5 4 3 3 1 1 2 2
## [1777] 1 2 5 5 2 2 1 2 1 1 2 1 2 1 1 1 5 3 3 1 1 1 1 3 2 1 1 3 2 5 1 5 5 2 1 1 1
## [1814] 2 1 1 2 1 5 2 1 2 1 5 1 1 1 5 2 2 1 1 1 2 2 1 3 1 2 5 1 2 1 3 2 5 1 1 2 5
## [1851] 5 2 3 1 4 5 2 2 2 5 5 1 1 2 1 1 1 1 1 1 2 1 1 1 1 2 5 1 4 1 1 2 3 1 3 1 2
## [1888] 1 2 2 2 1 2 2 3 2 2 1 1 1 1 2 1 1 5 3 3 2 1 1 2 1 3 3 1 2 5 1 5 1 5 5 2 5
## [1925] 3 2 1 2 3 5 1 2 2 1 1 2 1 1 1 3 2 1 2 2 2 2 5 1 2 5 1 5 1 2 2 1 1 1 3 2 1
## [1962] 1 2 5 3 2 1 2 3 1 2 1 1 5 2 2 2 3 1 1 5 2 3 1 3 1 1 2 2 1 1 1 5 1 1 1 5 2
## [1999] 5 5
## 
## Within cluster sum of squares by cluster:
## [1] 1100462.4  564388.7 1047339.6  600268.6  559310.9
##  (between_SS / total_SS =  72.2 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

This aggregation allows us to find the different means based on the clusters that we created. It gives us the mean for each variable of each cluster

aggregate(seg_data, by = list(cluster = kmeans_model$cluster), mean)
##   cluster     sales     profit    income   married avg_mktg_cnt   tenure
## 1       1  21.01485   1.800171  40.04913 0.4159389     12.17007 11.99672
## 2       2  21.85945   2.954788  94.39457 0.4488518     11.21617 13.68267
## 3       3 129.63654  52.452462  68.97917 0.5041667     19.94472 16.07917
## 4       4 372.36812 169.286353  95.00000 0.3214286     32.46009 14.25000
## 5       5  26.92618   5.863075 149.73294 0.5964392     10.85139 19.11573
##   crossbuy
## 1 2.242358
## 2 2.277662
## 3 4.425000
## 4 5.464286
## 5 2.540059

Step 7: Using the clusters that we created, we start out cluster analysis. The first step we did was create a cluster variable to the original data frame to identify which cluster a given customer belonged to. Next we view the first few rows to ensure the cluster column was added to the data. We generate our first chart to analysis which customer segments are better to target when comparing customer income vs company profit. Looking at the graph below, we can see that the clusters to target as a retail store are clusters 4 and 3. These clusters are the most profitable across the same level of income.

retail_segmentation <- retail_segmentation %>%
  mutate(cluster = kmeans_model$cluster)

head(retail_segmentation)
##   Cust_No avg_order_size avg_order_freq crossbuy multichannel   per_sale tenure
## 1       1      23.400000      2.2222222        3            2 0.00000000      3
## 2       2      34.260377      6.6250000        7            2 0.11111111     35
## 3       3      43.575641      4.8750000        5            2 0.07407407     12
## 4       4      26.316667      0.9000000        4            2 0.25000000      9
## 5       5       8.269231      1.0833333        3            1 0.50000000     40
## 6       6      21.500000      0.2222222        1            2 0.00000000      7
##   return_rate married own_home household_size loyalty_card income age
## 1   0.1175214       1        1              1            1     35  47
## 2   0.2818684       1        1              3            1    140  70
## 3   0.2741769       1        0              4            0     35  21
## 4   0.1435508       0        0              1            1     35  62
## 5   0.0000000       0        0              2            0    140  21
## 6   0.0000000       0        1              1            1     80  21
##   avg_mktg_cnt zip_code      sales      profit cluster
## 1    56.000000    21230  45.888889 -18.1377778       1
## 2    14.914286    22301 162.997917  73.5732024       3
## 3    20.083333    19002 154.187500  65.1150000       3
## 4     8.222222    22304  20.285000   4.3815333       1
## 5     1.350000    20124   8.958333   3.6458333       5
## 6     2.714286    22033   4.777778   0.4487302       2
ggplot(retail_segmentation, aes(x = income, y = profit, 
                                color = factor(cluster))) +
  geom_point(size=3) +
  labs(x = "Income", y = "Profit", color = "Cluster")

Using the code kmeans_model$centers, we are able to create the cluster centers which tell us the average values of the variables for each cluster. This data confirms that cluster 4 and 3 are the most profitable. Using the factoextra package we can enhance the visualization of the clusters

kmeans_model$centers %>%
  round(2) %>%
  as.data.frame() %>%
  tibble::rownames_to_column(var = "Cluster") 
##   Cluster  sales profit income married avg_mktg_cnt tenure crossbuy
## 1       1  21.01   1.80  40.05    0.42        12.17  12.00     2.24
## 2       2  21.86   2.95  94.39    0.45        11.22  13.68     2.28
## 3       3 129.64  52.45  68.98    0.50        19.94  16.08     4.42
## 4       4 372.37 169.29  95.00    0.32        32.46  14.25     5.46
## 5       5  26.93   5.86 149.73    0.60        10.85  19.12     2.54
fviz_cluster(kmeans_model, data=seg_data, geom="point", 
             ellipse.type="norm")

Step 8: The next section of data are the analysis’s that we have chosen to do to see the impact of profit agaisnt the other different variables and customers that we have chosen.

This graph shows the relationship between marketing contact and order frequency. Note Cluster 3 and 4 have the highest order frequency across the same level of Marketing Contact.

ggplot(retail_segmentation, aes(x = sales, y = avg_mktg_cnt, 
                                color = factor(cluster))) +
  geom_point(size=2) +
  labs(x = "Sales ", y = "Average Marketing Contaact", color = "Cluster")+
  ggtitle("Relationship between Sales and Marketing Contact")

This following graph below highlights the relationship between marketing contact and income. Again this shows that cluster 3 and 4 are the most profitable clusters across the different sizes of households.

ggplot(retail_segmentation, aes(x = household_size, y = profit, 
                                color = factor(cluster))) +
  geom_point(size=3) +
  labs(x = "Household Size", y = " Profit", color = "Cluster")+
  ggtitle("Relationship between Household Size and Profit")

The following graph shows the relationship between profit and household size. Again, cluster 4 and 3 show that they are the target clusters between contact and profit.

ggplot(retail_segmentation, aes(x = avg_mktg_cnt, y = profit , 
                                color = factor(cluster))) +
  geom_point(size=3) +
  labs(x = "Average Marketing Contact", y = " Profit", color = "Cluster")+
  ggtitle("Relationship between Average Order Frequency and Average Marketing Contact")

The following graph shows the relationship of sales, which is order size across levels of order frequency that are similar. This graph show that clusters 4 and 3 are the best clusters again to target.

# Log scaled axes
ggplot(retail_segmentation, aes(x = avg_order_size, y = avg_order_freq)) +
  theme_bw() +
  geom_point(aes(color = factor(kmeans_model$cluster))) +
  coord_trans(x = "log10", y = "log10") +
  labs(x = "Average Order Size", y = "Average Order Frequency") +
  ggtitle("Relationship between Average Order Size and Average Order Frequency")

This graph below is based off of profit, income and tenure, and displays the clusters that we have created earlier.Again, this model shows that cluster 3 and 4 are the most profitable across employment tenure and income.

seg_data <- seg_data |>
  mutate(cluster = as.factor(kmeans_model$cluster))

plot_ly(
  x = seg_data$tenure,
  y = seg_data$income,
  z = seg_data$profit,
  type = "scatter3d",
  mode = "markers",
  color = seg_data$cluster,
  size = 1
) |>
  layout(scene = list(
    xaxis = list(title = "Tenure"),
    yaxis = list(title = "Income"),
    zaxis = list(title = "Profit")
  ))

This graph below displays the variables of tenure, income and profit within the lense of marriage.

plot_ly(
  x = seg_data$tenure,
  y = seg_data$income,
  z = seg_data$profit,
  type = "scatter3d",
  mode = "markers",
  color = as.factor(seg_data$married),
  colors = c( 'blue', 'red'),
  marker = list(size = 3)

) |>
  layout(scene = list(
    xaxis = list(title = "Tenure"),
    yaxis = list(title = "Income"),
    zaxis = list(title = "Profit")
  ),
  legend = list(title =list(text = "Married"))
  )