R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# Set the working directory so that the relative CSV path used below resolves
# against the project folder (you need to replace this path with your own).
# NOTE(review): a hard-coded setwd() ties the script to one machine; consider
# an RStudio project or a path relative to the document instead.

setwd("/Users/macbookair/desktop/MKTG R Project 2")

STEP 1: Load Required Libraries

# Load Required Libraries 

library(tidyverse)   # For data manipulation and analysis
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.5.1     ✔ purrr   1.0.4
## ✔ tibble  3.2.1     ✔ dplyr   1.1.4
## ✔ tidyr   1.3.1     ✔ stringr 1.5.1
## ✔ readr   2.1.2     ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(dplyr)       # For data manipulation (already attached by tidyverse above)
library(data.table)  # Fast tabular data tools; note k-means itself comes from stats::kmeans(), not from this package
## 
## Attaching package: 'data.table'
## 
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## 
## The following object is masked from 'package:purrr':
## 
##     transpose
library(ggplot2)     # For data visualization
library (factoextra) # For clustering visualization
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library (plotly)     # For interactive visualization
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout

STEP 2: Load & Explore the dataset (retail_segmentation.csv)

## Import the customer data; the CSV is looked up in the working directory
retail_segmentation <- read.csv(file = "retail_segmentation.csv")

## Preview the first six rows of the imported data
head(x = retail_segmentation, n = 6L)
##   Cust_No avg_order_size avg_order_freq crossbuy multichannel   per_sale tenure
## 1       1      23.400000      2.2222222        3            2 0.00000000      3
## 2       2      34.260377      6.6250000        7            2 0.11111111     35
## 3       3      43.575641      4.8750000        5            2 0.07407407     12
## 4       4      26.316667      0.9000000        4            2 0.25000000      9
## 5       5       8.269231      1.0833333        3            1 0.50000000     40
## 6       6      21.500000      0.2222222        1            2 0.00000000      7
##   return_rate married own_home household_size loyalty_card income age
## 1   0.1175214       1        1              1            1     35  47
## 2   0.2818684       1        1              3            1    140  70
## 3   0.2741769       1        0              4            0     35  21
## 4   0.1435508       0        0              1            1     35  62
## 5   0.0000000       0        0              2            0    140  21
## 6   0.0000000       0        1              1            1     80  21
##   avg_mktg_cnt zip_code
## 1    56.000000    21230
## 2    14.914286    22301
## 3    20.083333    19002
## 4     8.222222    22304
## 5     1.350000    20124
## 6     2.714286    22033
## Inspect the column names, types and leading values of the data frame
str(object = retail_segmentation)
## 'data.frame':    2000 obs. of  16 variables:
##  $ Cust_No       : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ avg_order_size: num  23.4 34.26 43.58 26.32 8.27 ...
##  $ avg_order_freq: num  2.22 6.62 4.88 0.9 1.08 ...
##  $ crossbuy      : int  3 7 5 4 3 1 2 3 1 1 ...
##  $ multichannel  : int  2 2 2 2 1 2 1 2 1 1 ...
##  $ per_sale      : num  0 0.1111 0.0741 0.25 0.5 ...
##  $ tenure        : int  3 35 12 9 40 7 8 17 14 3 ...
##  $ return_rate   : num  0.118 0.282 0.274 0.144 0 ...
##  $ married       : int  1 1 1 0 0 0 1 0 0 0 ...
##  $ own_home      : int  1 1 0 0 0 1 0 1 1 1 ...
##  $ household_size: int  1 3 4 1 2 1 1 1 2 8 ...
##  $ loyalty_card  : int  1 1 0 1 0 1 0 1 1 1 ...
##  $ income        : int  35 140 35 35 140 80 70 35 35 35 ...
##  $ age           : int  47 70 21 62 21 21 86 70 57 21 ...
##  $ avg_mktg_cnt  : num  56 14.91 20.08 8.22 1.35 ...
##  $ zip_code      : int  21230 22301 19002 22304 20124 22033 8757 8109 21122 21208 ...
## Per-column descriptive statistics (min, quartiles, mean, max)
summary(object = retail_segmentation)
##     Cust_No       avg_order_size    avg_order_freq        crossbuy    
##  Min.   :   1.0   Min.   :  1.833   Min.   : 0.02778   Min.   :1.000  
##  1st Qu.: 500.8   1st Qu.: 23.157   1st Qu.: 0.30769   1st Qu.:1.000  
##  Median :1000.5   Median : 30.790   Median : 0.76923   Median :2.000  
##  Mean   :1000.5   Mean   : 35.373   Mean   : 1.55640   Mean   :2.608  
##  3rd Qu.:1500.2   3rd Qu.: 40.959   3rd Qu.: 1.90584   3rd Qu.:4.000  
##  Max.   :2000.0   Max.   :528.250   Max.   :31.87500   Max.   :7.000  
##   multichannel      per_sale          tenure       return_rate     
##  Min.   :1.000   Min.   :0.0000   Min.   : 1.00   Min.   :0.00000  
##  1st Qu.:1.000   1st Qu.:0.0000   1st Qu.: 4.00   1st Qu.:0.00000  
##  Median :1.000   Median :0.0000   Median :10.00   Median :0.01947  
##  Mean   :1.557   Mean   :0.1033   Mean   :14.12   Mean   :0.17671  
##  3rd Qu.:2.000   3rd Qu.:0.1400   3rd Qu.:20.00   3rd Qu.:0.24560  
##  Max.   :3.000   Max.   :1.0000   Max.   :40.00   Max.   :6.90909  
##     married          own_home     household_size   loyalty_card   
##  Min.   :0.0000   Min.   :0.000   Min.   :1.000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.000   1st Qu.:1.000   1st Qu.:0.0000  
##  Median :0.0000   Median :1.000   Median :2.000   Median :1.0000  
##  Mean   :0.4635   Mean   :0.568   Mean   :2.869   Mean   :0.6185  
##  3rd Qu.:1.0000   3rd Qu.:1.000   3rd Qu.:4.000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :1.000   Max.   :8.000   Max.   :1.0000  
##      income            age         avg_mktg_cnt       zip_code    
##  Min.   : 35.00   Min.   :21.00   Min.   :  0.00   Min.   : 7726  
##  1st Qu.: 35.00   1st Qu.:21.00   1st Qu.:  4.00   1st Qu.:19010  
##  Median : 60.00   Median :37.00   Median :  7.75   Median :20854  
##  Mean   : 75.79   Mean   :42.93   Mean   : 12.94   Mean   :18326  
##  3rd Qu.:110.00   3rd Qu.:61.25   3rd Qu.: 15.00   3rd Qu.:21776  
##  Max.   :175.00   Max.   :99.00   Max.   :297.00   Max.   :24060

STEP 3: Preprocessing the Data (Manipulation)

## Derive two new columns for every customer:
##   sales  = avg_order_freq x avg_order_size
##   profit = 0.52 x sales - 0.75 x avg_mktg_cnt
## `profit` reuses the `sales` column created earlier in the same mutate() call.

retail_segmentation <- mutate(
  retail_segmentation,
  sales = avg_order_freq * avg_order_size,
  profit = 0.52 * sales - 0.75 * avg_mktg_cnt
)

### Open the updated data frame in the data viewer to see the new
### sales and profit columns. View() is only useful in an interactive
### session and can fail when the document is knitted or the script is run
### in batch mode, so it is skipped there.
if (interactive()) {
  View(retail_segmentation)
}

STEP 4: Select variables for clustering analysis ## In your work, you can replace one or two variables from the list

## Keep only the ten variables that feed the k-means model
cluster_data <- select(
  retail_segmentation,
  tenure, own_home, married, household_size, income,
  sales, profit, avg_mktg_cnt, loyalty_card, avg_order_freq
)

STEP 5: Determine the Optimal Number of Clusters (Elbow Method)

## Elbow method: plot the within-cluster sum of squares (WCSS) against
## the number of clusters, trying up to 20 clusters. The "elbow" of the
## curve suggests the optimal number of clusters.

fviz_nbclust(
  x = cluster_data,
  FUNcluster = kmeans,
  method = "wss",
  k.max = 20
)

## Notes on the elbow chart.
### Look for the "elbow point" where the WCSS (within sum of squares)
### stops decreasing significantly.
### For our example, we will choose 6 clusters (a judgment call based on
### the "elbow point" of the chart).

STEP 6: Perform k-means clustering (Assuming k=6 from elbow point)

## kmeans() chooses its starting centers at random, so without a fixed seed
## the cluster numbering (and sometimes the solution itself) changes on every
## run. The cluster names and interpretation notes further down refer to
## clusters by number, so the seed is fixed to keep them valid across runs.
set.seed(123)

## Fit k-means with 6 clusters; nstart = 25 tries 25 random initialisations
## and keeps the one with the lowest total within-cluster sum of squares.
## NOTE(review): the variables are on very different scales (e.g. income vs.
## own_home); consider standardising them with scale() before clustering.
kmeans_model <- kmeans(cluster_data, centers = 6, nstart = 25)

## View model results
kmeans_model
## K-means clustering with 6 clusters of sizes 31, 447, 280, 35, 880, 327
## 
## Cluster means:
##      tenure  own_home   married household_size    income     sales     profit
## 1 17.903226 0.4838710 0.4193548       2.838710  94.35484 421.91801 198.351647
## 2 14.029083 0.5346756 0.4429530       2.843400  94.30649  25.15432   6.377704
## 3 16.339286 0.4892857 0.5321429       2.817857  72.16071 145.67261  61.218801
## 4  1.857143 0.4285714 0.3714286       2.771429  66.14286  50.93522 -51.869758
## 5 12.109091 0.6386364 0.4125000       2.979545  39.92614  24.26219   5.145437
## 6 18.718654 0.5137615 0.5840979       2.663609 149.35780  28.63241   7.639105
##   avg_mktg_cnt loyalty_card avg_order_freq
## 1    28.060959    0.5483871     12.4101222
## 2     8.936721    0.6219239      0.8698941
## 3    19.374608    0.6357143      4.4103521
## 4   104.474762    0.7142857      1.5262847
## 5     9.961202    0.6147727      0.8408859
## 6     9.666334    0.6055046      0.9509172
## 
## Clustering vector:
##    [1] 5 3 3 5 6 2 2 5 5 5 5 2 2 3 2 5 5 6 5 2 2 6 2 1 6 3 6 6 6 2 6 2 5 5 5 5 5
##   [38] 5 2 6 5 2 2 5 5 5 5 3 3 5 5 5 5 5 6 5 6 5 6 5 1 2 4 3 5 5 5 5 6 5 5 5 5 3
##   [75] 3 3 3 2 5 3 2 6 5 3 2 5 6 3 5 5 5 2 5 5 3 5 6 4 3 5 5 5 2 3 5 6 4 5 5 3 6
##  [112] 5 5 2 5 5 5 5 6 5 5 2 5 5 5 2 3 2 2 2 3 2 4 2 2 3 5 5 5 5 5 5 5 2 6 5 2 5
##  [149] 2 2 5 6 2 5 2 6 2 5 5 5 2 6 2 5 2 2 2 5 3 3 5 2 2 5 2 5 5 5 6 5 3 3 2 5 5
##  [186] 2 5 2 5 3 2 6 3 5 5 5 3 2 5 5 3 2 6 5 2 2 3 2 5 5 5 5 5 6 3 5 2 5 2 6 5 6
##  [223] 5 6 6 5 5 2 5 3 5 3 6 5 5 5 5 4 3 5 5 5 3 6 3 2 5 3 3 5 3 5 5 5 5 2 5 1 5
##  [260] 3 5 5 2 5 6 6 5 4 5 2 2 5 2 6 5 6 6 5 5 3 5 3 5 6 6 3 5 2 5 2 2 2 5 5 6 3
##  [297] 3 6 5 5 2 5 6 5 5 5 5 3 6 2 2 5 5 5 2 2 3 5 2 2 5 2 5 3 5 5 5 5 6 3 6 2 5
##  [334] 2 5 2 5 6 6 6 6 6 2 5 5 3 6 2 5 5 5 5 2 2 3 5 5 1 5 3 3 6 6 4 3 5 2 6 6 5
##  [371] 5 6 2 2 6 1 3 3 6 5 5 3 2 3 5 2 5 5 2 3 4 5 3 6 6 5 5 3 6 2 5 5 5 5 5 2 3
##  [408] 3 3 6 6 5 5 5 5 2 5 2 5 2 3 5 5 5 4 5 5 2 2 2 2 5 5 3 2 5 3 5 6 3 3 5 3 5
##  [445] 6 6 5 5 2 5 5 1 6 5 5 5 6 5 6 2 5 5 5 3 5 6 2 3 5 5 6 6 5 6 2 2 2 2 2 2 3
##  [482] 6 5 5 6 5 5 6 5 6 5 5 6 6 5 2 2 2 2 3 5 5 6 5 2 3 5 6 2 6 2 5 6 5 5 2 5 5
##  [519] 5 5 5 5 5 6 2 6 3 5 6 5 3 5 2 2 6 2 6 6 3 5 2 2 6 5 6 4 5 3 5 5 6 5 3 5 1
##  [556] 5 5 2 5 6 5 5 3 5 5 6 5 6 2 3 5 2 6 3 3 6 5 6 3 3 6 5 3 3 6 5 1 2 5 3 5 2
##  [593] 5 3 5 2 5 6 6 3 6 6 5 3 6 6 5 5 5 3 5 6 5 1 5 5 5 4 3 2 2 6 5 2 5 5 5 2 2
##  [630] 2 5 3 5 5 5 5 4 5 5 2 5 6 5 4 2 5 2 3 2 2 3 3 3 6 1 3 1 2 5 5 5 5 4 2 5 5
##  [667] 5 6 3 5 2 2 4 2 3 2 2 3 6 5 5 5 5 3 3 5 6 2 5 2 5 4 2 2 5 5 2 4 2 6 1 5 2
##  [704] 2 6 6 2 3 5 5 1 3 3 6 1 5 6 5 2 2 5 3 5 3 5 5 2 5 5 2 6 6 5 6 5 5 2 5 3 5
##  [741] 2 5 5 5 5 5 2 5 5 6 2 5 6 3 5 5 6 5 3 5 3 5 4 3 6 3 5 6 6 2 6 5 3 5 6 5 5
##  [778] 5 2 5 5 1 3 5 3 5 3 5 5 5 2 2 6 5 2 5 3 6 5 5 5 6 5 5 2 2 5 5 6 6 5 2 5 2
##  [815] 5 3 2 4 2 5 6 5 6 5 3 3 2 5 3 5 5 2 5 2 6 5 6 2 5 2 2 5 2 2 5 5 5 5 5 5 6
##  [852] 5 3 6 2 5 5 2 5 5 6 2 2 5 6 2 5 3 5 5 3 5 2 2 5 3 2 5 6 5 2 5 6 5 2 6 6 5
##  [889] 6 5 5 5 4 5 6 2 2 6 2 5 3 5 3 6 2 2 5 2 5 5 3 5 2 5 2 2 5 5 5 2 5 5 2 3 3
##  [926] 2 6 5 5 2 5 5 3 5 5 5 5 5 2 5 6 5 2 5 5 3 5 3 3 5 2 5 5 3 6 6 3 5 5 2 6 6
##  [963] 3 5 5 5 6 2 5 3 5 2 5 5 5 5 2 5 5 3 3 5 2 6 2 6 6 5 6 5 5 5 5 5 3 2 5 2 2
## [1000] 2 6 2 5 3 3 6 5 2 5 5 6 2 5 2 2 3 2 6 2 5 5 2 5 6 5 5 6 5 2 1 6 2 6 5 5 3
## [1037] 5 5 5 5 5 5 5 6 5 5 5 5 5 2 6 6 5 6 5 5 5 3 3 2 2 5 6 5 2 3 5 5 5 5 5 2 2
## [1074] 2 5 6 5 6 5 4 4 5 3 3 2 5 6 5 5 5 2 5 6 2 5 5 5 5 6 2 5 3 2 3 6 6 5 5 3 5
## [1111] 6 5 5 2 5 6 2 6 2 6 6 3 5 5 5 5 3 3 6 2 5 2 6 2 6 6 2 6 3 5 6 6 6 3 5 2 2
## [1148] 3 2 6 6 6 5 5 5 5 2 3 3 5 5 5 5 3 2 5 5 2 2 6 4 3 5 2 3 3 5 6 5 6 5 2 3 5
## [1185] 3 6 2 5 6 6 5 2 3 6 5 5 5 5 5 2 6 6 5 5 2 4 6 5 5 5 6 2 2 5 5 5 5 2 5 5 2
## [1222] 5 2 5 2 2 5 5 3 5 2 6 5 3 6 5 2 3 5 2 1 6 6 6 2 5 5 2 5 5 2 6 6 4 5 5 2 2
## [1259] 5 5 6 3 3 5 2 5 5 6 5 5 5 6 2 5 2 2 2 5 2 5 5 6 6 5 5 5 2 5 3 5 2 5 5 5 5
## [1296] 3 2 6 2 5 5 5 5 2 2 5 2 3 5 5 5 2 5 5 5 6 3 5 5 5 2 5 5 5 2 2 6 5 5 2 5 5
## [1333] 3 5 5 5 6 5 4 3 5 3 3 5 5 2 5 6 6 2 2 5 2 3 5 2 5 5 3 5 1 5 2 6 4 3 6 2 4
## [1370] 2 5 2 2 3 5 2 6 6 3 6 5 5 5 6 2 2 3 3 6 2 2 5 3 5 5 6 2 3 5 5 5 2 3 6 5 5
## [1407] 2 5 5 6 5 6 6 3 3 5 3 2 2 3 5 2 6 3 5 5 5 5 1 5 6 5 5 5 6 5 3 5 5 5 2 2 5
## [1444] 3 2 5 5 6 3 5 6 5 6 1 5 6 5 5 5 2 3 6 5 6 6 5 3 6 5 3 1 5 2 5 5 2 2 2 2 2
## [1481] 2 5 5 6 5 5 5 5 3 2 5 5 5 2 5 1 3 5 3 6 6 6 5 2 5 5 5 2 3 5 5 3 5 2 5 5 3
## [1518] 5 5 5 5 5 2 3 5 2 2 6 6 2 3 6 4 5 3 6 5 6 2 2 2 3 5 5 5 3 2 2 5 2 5 5 6 5
## [1555] 2 5 3 3 3 3 6 2 2 5 5 5 5 3 3 2 5 5 5 3 2 5 3 3 2 5 2 3 2 3 3 5 3 6 2 1 5
## [1592] 5 5 5 5 5 5 5 5 2 2 5 1 5 5 5 3 5 6 2 5 5 2 5 6 2 5 2 3 5 5 5 2 2 3 5 5 2
## [1629] 5 2 3 5 5 5 5 6 2 3 5 6 2 6 6 5 5 5 5 5 2 5 5 5 3 5 3 5 2 2 6 3 2 6 5 3 5
## [1666] 2 5 6 5 5 5 5 2 3 6 6 6 6 6 5 5 5 2 5 2 6 2 5 6 5 6 5 3 5 6 5 6 1 1 3 6 6
## [1703] 6 5 3 3 2 3 1 5 3 6 5 5 5 5 2 3 6 6 3 5 2 5 5 5 3 5 5 3 5 2 5 4 6 2 6 6 5
## [1740] 2 5 5 2 2 5 2 5 5 2 2 3 5 5 2 5 3 5 5 6 3 2 2 5 5 3 5 2 5 6 3 3 3 5 5 2 6
## [1777] 4 2 6 6 2 2 5 2 5 5 2 5 2 5 5 3 6 3 3 5 5 5 5 3 2 5 5 3 2 6 5 6 6 2 5 5 5
## [1814] 2 5 5 2 5 6 2 5 2 5 6 5 5 5 6 2 2 5 5 4 2 2 5 3 5 2 6 5 2 5 3 2 6 5 5 2 6
## [1851] 6 2 3 5 1 6 2 2 2 6 6 5 5 2 5 5 5 5 5 5 2 5 5 5 5 2 6 5 1 5 5 2 3 5 3 5 2
## [1888] 5 2 2 2 5 2 2 3 2 2 5 5 5 5 2 5 5 6 3 3 2 5 5 2 5 1 3 5 2 6 5 6 5 6 6 2 6
## [1925] 3 4 5 2 3 6 5 2 2 5 5 2 5 5 5 3 2 5 4 2 2 2 6 5 2 6 5 6 5 4 2 5 5 5 3 2 3
## [1962] 5 2 6 3 2 5 2 3 5 2 5 5 6 2 2 2 3 5 5 6 2 1 5 3 5 5 2 2 5 5 5 6 5 5 5 6 2
## [1999] 6 6
## 
## Within cluster sum of squares by cluster:
## [1]  578933.7  479665.3 1424176.7  233579.7  780322.4  444458.3
##  (between_SS / total_SS =  77.0 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
### Notes about the output of kmeans model
#### The output of the k-means model is a list 
#### containing the cluster assignments, the cluster centers, 
#### and other information.

#### The metric between_SS/total_SS helps evaluate how well the 
#### clusters are separated. It represents the proportion of total
#### variance that is explained by the clusters,
#### which is 77.0% in the output shown above. 

## Cluster centers rounded to two decimals, with the cluster id as a column
tibble::rownames_to_column(
  as.data.frame(round(kmeans_model$centers, digits = 2)),
  var = "Cluster"
)
##   Cluster tenure own_home married household_size income  sales profit
## 1       1  17.90     0.48    0.42           2.84  94.35 421.92 198.35
## 2       2  14.03     0.53    0.44           2.84  94.31  25.15   6.38
## 3       3  16.34     0.49    0.53           2.82  72.16 145.67  61.22
## 4       4   1.86     0.43    0.37           2.77  66.14  50.94 -51.87
## 5       5  12.11     0.64    0.41           2.98  39.93  24.26   5.15
## 6       6  18.72     0.51    0.58           2.66 149.36  28.63   7.64
##   avg_mktg_cnt loyalty_card avg_order_freq
## 1        28.06         0.55          12.41
## 2         8.94         0.62           0.87
## 3        19.37         0.64           4.41
## 4       104.47         0.71           1.53
## 5         9.96         0.61           0.84
## 6         9.67         0.61           0.95

STEP 7.1: Clustering Analysis (Income vs Profit) scatterpoint

## Attach each customer's k-means cluster id to the main data frame
retail_segmentation <- mutate(
  retail_segmentation,
  cluster = kmeans_model$cluster
)

## Preview the first rows to confirm the new `cluster` column is present
head(x = retail_segmentation, n = 6L)
##   Cust_No avg_order_size avg_order_freq crossbuy multichannel   per_sale tenure
## 1       1      23.400000      2.2222222        3            2 0.00000000      3
## 2       2      34.260377      6.6250000        7            2 0.11111111     35
## 3       3      43.575641      4.8750000        5            2 0.07407407     12
## 4       4      26.316667      0.9000000        4            2 0.25000000      9
## 5       5       8.269231      1.0833333        3            1 0.50000000     40
## 6       6      21.500000      0.2222222        1            2 0.00000000      7
##   return_rate married own_home household_size loyalty_card income age
## 1   0.1175214       1        1              1            1     35  47
## 2   0.2818684       1        1              3            1    140  70
## 3   0.2741769       1        0              4            0     35  21
## 4   0.1435508       0        0              1            1     35  62
## 5   0.0000000       0        0              2            0    140  21
## 6   0.0000000       0        1              1            1     80  21
##   avg_mktg_cnt zip_code      sales      profit cluster
## 1    56.000000    21230  52.000000 -14.9600000       5
## 2    14.914286    22301 226.975000 106.8412857       3
## 3    20.083333    19002 212.431250  95.4017500       3
## 4     8.222222    22304  23.685000   6.1495333       5
## 5     1.350000    20124   8.958333   3.6458333       6
## 6     2.714286    22033   4.777778   0.4487302       2
## Income (x) against profit (y), one point per customer, coloured by
## cluster id (converted to a factor so the colours are discrete)
ggplot(
  data = retail_segmentation,
  mapping = aes(x = income, y = profit, color = factor(cluster))
) +
  geom_point(size = 3) +
  labs(x = "Income", y = "profit", color = "Cluster")

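### Notes for interpretation: in the cluster means shown above, clusters 1
### and 3 have the highest mean profit (about 198 and 61), while cluster 4
### loses money on average (about -52). Cluster numbers are assigned
### arbitrarily by kmeans(), so re-check them whenever the model is refit.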

Step 7.2 Clustering Analysis (Income vs Profit) cluster graph

##We can use the "kmeans_model$centers" to get the cluster centers 
##that contains the average values of the variables for each cluster. 

## See the average of the variables per cluster

## Notes on the results: in the cluster means shown above, clusters 1 and 3 are the most profitable, and cluster 4 has a negative mean profit.

# Human-readable segment names; position i names cluster i
# NOTE(review): the names are tied to cluster numbers, which kmeans() assigns
# arbitrarily - confirm the mapping against the cluster means after refitting.
cluster_labels <- c(
  "Best Customers",
  "Loyal Customers",
  "Potential Loyalists",
  "At-Risk Customers",
  "Lost Customers",
  "Hesitant Shoppers"
)

# Look up each customer's segment name by indexing with the cluster id
retail_segmentation[["cluster_name"]] <-
  cluster_labels[retail_segmentation[["cluster"]]]

head(x = retail_segmentation, n = 6L)
##   Cust_No avg_order_size avg_order_freq crossbuy multichannel   per_sale tenure
## 1       1      23.400000      2.2222222        3            2 0.00000000      3
## 2       2      34.260377      6.6250000        7            2 0.11111111     35
## 3       3      43.575641      4.8750000        5            2 0.07407407     12
## 4       4      26.316667      0.9000000        4            2 0.25000000      9
## 5       5       8.269231      1.0833333        3            1 0.50000000     40
## 6       6      21.500000      0.2222222        1            2 0.00000000      7
##   return_rate married own_home household_size loyalty_card income age
## 1   0.1175214       1        1              1            1     35  47
## 2   0.2818684       1        1              3            1    140  70
## 3   0.2741769       1        0              4            0     35  21
## 4   0.1435508       0        0              1            1     35  62
## 5   0.0000000       0        0              2            0    140  21
## 6   0.0000000       0        1              1            1     80  21
##   avg_mktg_cnt zip_code      sales      profit cluster        cluster_name
## 1    56.000000    21230  52.000000 -14.9600000       5      Lost Customers
## 2    14.914286    22301 226.975000 106.8412857       3 Potential Loyalists
## 3    20.083333    19002 212.431250  95.4017500       3 Potential Loyalists
## 4     8.222222    22304  23.685000   6.1495333       5      Lost Customers
## 5     1.350000    20124   8.958333   3.6458333       6   Hesitant Shoppers
## 6     2.714286    22033   4.777778   0.4487302       2     Loyal Customers
## Income against profit, coloured by the named customer segment
ggplot(
  data = retail_segmentation,
  mapping = aes(x = income, y = profit, color = factor(cluster_name))
) +
  geom_point(size = 3) +
  labs(x = "Income", y = "Profit", color = "Customer Segment") +
  theme_minimal()

## Cluster plot for the two variables this step is about (income and profit).
## Without choose.vars, fviz_cluster() projects all clustering variables onto
## their first two principal components, which is not an income-vs-profit
## view and is identical for every step that reuses the same call.
fviz_cluster(
  kmeans_model,
  data = cluster_data,
  choose.vars = c("income", "profit"),
  geom = "point",
  ellipse.type = "norm"
) +
  ## Colours and legend labels are listed in cluster order 1..6
  scale_color_manual(values = c("red", "green", "blue", "yellow", "pink", "cyan"),
                     labels = cluster_labels) +
  labs(title = "Cluster Segmentation", color = "Customer Segment")

Step 8.1 (return_rate vs profit) scatter point

## Scatter plot of the segments (using 'return_rate' versus 'profit')
ggplot(
  data = retail_segmentation,
  mapping = aes(x = return_rate, y = profit, color = factor(cluster_name))
) +
  geom_point(size = 3) +
  labs(x = "return_rate", y = "Profit", color = "Customer Segment") +
  theme_minimal()

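### Notes for interpretation: in the cluster means shown earlier, clusters 1
### and 3 have the highest mean profit; this plot shows how profit varies
### with the return rate (not annual income) within each segment.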

Step. 8.2 (return_rate vs profit) cluster graph

## Cluster plot for return_rate vs profit. return_rate was not one of the
## clustering variables, so it is not in cluster_data; the two columns are
## taken from retail_segmentation instead (same rows, same order as the
## fitted model). With exactly two columns fviz_cluster() plots them directly
## rather than a principal-component projection of all clustering variables.
fviz_cluster(
  kmeans_model,
  data = retail_segmentation[, c("return_rate", "profit")],
  geom = "point",
  ellipse.type = "norm"
) +
  ## Colours and legend labels are listed in cluster order 1..6
  scale_color_manual(values = c("red", "green", "blue", "yellow", "pink", "cyan"),
                     labels = cluster_labels) +
  labs(title = "Cluster Segmentation", color = "Customer Segment")

Step 9.1 (average order frequency vs average marketing count) scatter point

## Average marketing count (x) against average order frequency (y),
## coloured by the named customer segment
ggplot(
  data = retail_segmentation,
  mapping = aes(
    x = avg_mktg_cnt,
    y = avg_order_freq,
    color = factor(cluster_name)
  )
) +
  geom_point(size = 3) +
  labs(x = "average marketing count", y = "average order frequency", color = "Customer Segment") +
  theme_minimal()

Step 9.2 (average order frequency vs average marketing count) cluster graph

## Cluster plot for the two variables this step is about (average marketing
## count and average order frequency). Without choose.vars, fviz_cluster()
## projects all clustering variables onto their first two principal
## components, which repeats the same picture as the earlier steps.
fviz_cluster(
  kmeans_model,
  data = cluster_data,
  choose.vars = c("avg_mktg_cnt", "avg_order_freq"),
  geom = "point",
  ellipse.type = "norm"
) +
  ## Colours and legend labels are listed in cluster order 1..6
  scale_color_manual(values = c("red", "green", "blue", "yellow", "pink", "cyan"),
                     labels = cluster_labels) +
  labs(title = "Cluster Segmentation", color = "Customer Segment")