R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# Print summary statistics for each column of the `cars` data set
summary(cars)
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

# Set up working directory
# The data folder defaults to the original author's path. Set the
# RETAIL_DATA_DIR environment variable to run this analysis on another
# machine without editing the script.
data_dir <- Sys.getenv(
  "RETAIL_DATA_DIR",
  unset = "C:/Users/blake/OneDrive/R MKTG"
)
# Fail early with an actionable message instead of setwd()'s generic error.
if (!dir.exists(data_dir)) {
  stop(
    "Data directory not found: ", data_dir,
    ". Set RETAIL_DATA_DIR to the folder containing retail_segmentation.csv.",
    call. = FALSE
  )
}
setwd(data_dir)

# Load necessary libraries
library(ggplot2)    # For data visualization
library(dplyr)      # For data manipulation
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(data.table) # For handling large datasets
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
library(cluster)    # For clustering analysis
library(factoextra) # For clustering visualization
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(plotly)     # For interactive visualization
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(tidyverse)  # For data manipulation and analysis
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.4     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ✔ readr     2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ data.table::between() masks dplyr::between()
## ✖ plotly::filter()      masks dplyr::filter(), stats::filter()
## ✖ data.table::first()   masks dplyr::first()
## ✖ lubridate::hour()     masks data.table::hour()
## ✖ lubridate::isoweek()  masks data.table::isoweek()
## ✖ dplyr::lag()          masks stats::lag()
## ✖ data.table::last()    masks dplyr::last()
## ✖ lubridate::mday()     masks data.table::mday()
## ✖ lubridate::minute()   masks data.table::minute()
## ✖ lubridate::month()    masks data.table::month()
## ✖ lubridate::quarter()  masks data.table::quarter()
## ✖ lubridate::second()   masks data.table::second()
## ✖ purrr::transpose()    masks data.table::transpose()
## ✖ lubridate::wday()     masks data.table::wday()
## ✖ lubridate::week()     masks data.table::week()
## ✖ lubridate::yday()     masks data.table::yday()
## ✖ lubridate::year()     masks data.table::year()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# STEP 2: Load & Explore the dataset (retail_segmentation.csv)
# Read the dataset (the path is relative to the working directory set above)
retail_segmentation <- read.csv("retail_segmentation.csv")

# Check the first few rows
head(retail_segmentation)
##   Cust_No avg_order_size avg_order_freq crossbuy multichannel   per_sale tenure
## 1       1      23.400000      2.2222222        3            2 0.00000000      3
## 2       2      34.260377      6.6250000        7            2 0.11111111     35
## 3       3      43.575641      4.8750000        5            2 0.07407407     12
## 4       4      26.316667      0.9000000        4            2 0.25000000      9
## 5       5       8.269231      1.0833333        3            1 0.50000000     40
## 6       6      21.500000      0.2222222        1            2 0.00000000      7
##   return_rate married own_home household_size loyalty_card income age
## 1   0.1175214       1        1              1            1     35  47
## 2   0.2818684       1        1              3            1    140  70
## 3   0.2741769       1        0              4            0     35  21
## 4   0.1435508       0        0              1            1     35  62
## 5   0.0000000       0        0              2            0    140  21
## 6   0.0000000       0        1              1            1     80  21
##   avg_mktg_cnt zip_code
## 1    56.000000    21230
## 2    14.914286    22301
## 3    20.083333    19002
## 4     8.222222    22304
## 5     1.350000    20124
## 6     2.714286    22033
# Check structure of data: column names, types and the first few values
str(retail_segmentation)
## 'data.frame':    2000 obs. of  16 variables:
##  $ Cust_No       : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ avg_order_size: num  23.4 34.26 43.58 26.32 8.27 ...
##  $ avg_order_freq: num  2.22 6.62 4.88 0.9 1.08 ...
##  $ crossbuy      : int  3 7 5 4 3 1 2 3 1 1 ...
##  $ multichannel  : int  2 2 2 2 1 2 1 2 1 1 ...
##  $ per_sale      : num  0 0.1111 0.0741 0.25 0.5 ...
##  $ tenure        : int  3 35 12 9 40 7 8 17 14 3 ...
##  $ return_rate   : num  0.118 0.282 0.274 0.144 0 ...
##  $ married       : int  1 1 1 0 0 0 1 0 0 0 ...
##  $ own_home      : int  1 1 0 0 0 1 0 1 1 1 ...
##  $ household_size: int  1 3 4 1 2 1 1 1 2 8 ...
##  $ loyalty_card  : int  1 1 0 1 0 1 0 1 1 1 ...
##  $ income        : int  35 140 35 35 140 80 70 35 35 35 ...
##  $ age           : int  47 70 21 62 21 21 86 70 57 21 ...
##  $ avg_mktg_cnt  : num  56 14.91 20.08 8.22 1.35 ...
##  $ zip_code      : int  21230 22301 19002 22304 20124 22033 8757 8109 21122 21208 ...
# Summary statistics (min, quartiles, mean, max) for every column
summary(retail_segmentation)
##     Cust_No       avg_order_size    avg_order_freq        crossbuy    
##  Min.   :   1.0   Min.   :  1.833   Min.   : 0.02778   Min.   :1.000  
##  1st Qu.: 500.8   1st Qu.: 23.157   1st Qu.: 0.30769   1st Qu.:1.000  
##  Median :1000.5   Median : 30.790   Median : 0.76923   Median :2.000  
##  Mean   :1000.5   Mean   : 35.373   Mean   : 1.55640   Mean   :2.608  
##  3rd Qu.:1500.2   3rd Qu.: 40.959   3rd Qu.: 1.90584   3rd Qu.:4.000  
##  Max.   :2000.0   Max.   :528.250   Max.   :31.87500   Max.   :7.000  
##   multichannel      per_sale          tenure       return_rate     
##  Min.   :1.000   Min.   :0.0000   Min.   : 1.00   Min.   :0.00000  
##  1st Qu.:1.000   1st Qu.:0.0000   1st Qu.: 4.00   1st Qu.:0.00000  
##  Median :1.000   Median :0.0000   Median :10.00   Median :0.01947  
##  Mean   :1.557   Mean   :0.1033   Mean   :14.12   Mean   :0.17671  
##  3rd Qu.:2.000   3rd Qu.:0.1400   3rd Qu.:20.00   3rd Qu.:0.24560  
##  Max.   :3.000   Max.   :1.0000   Max.   :40.00   Max.   :6.90909  
##     married          own_home     household_size   loyalty_card   
##  Min.   :0.0000   Min.   :0.000   Min.   :1.000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.000   1st Qu.:1.000   1st Qu.:0.0000  
##  Median :0.0000   Median :1.000   Median :2.000   Median :1.0000  
##  Mean   :0.4635   Mean   :0.568   Mean   :2.869   Mean   :0.6185  
##  3rd Qu.:1.0000   3rd Qu.:1.000   3rd Qu.:4.000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :1.000   Max.   :8.000   Max.   :1.0000  
##      income            age         avg_mktg_cnt       zip_code    
##  Min.   : 35.00   Min.   :21.00   Min.   :  0.00   Min.   : 7726  
##  1st Qu.: 35.00   1st Qu.:21.00   1st Qu.:  4.00   1st Qu.:19010  
##  Median : 60.00   Median :37.00   Median :  7.75   Median :20854  
##  Mean   : 75.79   Mean   :42.93   Mean   : 12.94   Mean   :18326  
##  3rd Qu.:110.00   3rd Qu.:61.25   3rd Qu.: 15.00   3rd Qu.:21777  
##  Max.   :175.00   Max.   :99.00   Max.   :297.00   Max.   :24060
# STEP 3: Preprocessing the Data (Manipulation)
# Derive two per-customer columns:
#   sales  = average order size x average order frequency
#   profit = 0.52 * sales - 0.75 * avg_mktg_cnt
retail_segmentation <- retail_segmentation %>%
  mutate(sales = avg_order_size * avg_order_freq) %>%
  mutate(profit = sales * 0.52 - avg_mktg_cnt * 0.75)

# View the updated dataframe to see the new columns created.
# View() opens a data viewer, which is only available in an interactive
# session; skip it when the document is knitted or run in batch mode.
if (interactive()) {
  View(retail_segmentation)
}

# STEP 4: Select variables for clustering analysis
# Keep only the seven columns that are fed to k-means.
cluster_data <- select(
  retail_segmentation,
  tenure, own_home, married, household_size, income, sales, profit
)

# STEP 5: Determine the Optimal Number of Clusters (Elbow Method)
# fviz_nbclust() runs kmeans for k = 1..k.max and plots the total
# within-cluster sum of squares ("wss"). kmeans starts from random centers,
# so fix the seed first to make the elbow curve reproducible between runs.
set.seed(123)
fviz_nbclust(cluster_data, kmeans, k.max = 20, method = "wss")

# STEP 6: Perform k-means clustering (Assuming k=6 from elbow point)
# NOTE(review): cluster_data is passed to kmeans() unscaled, so distances are
# dominated by the wide-range columns (sales, profit, income) and the 0/1
# flags (own_home, married) carry very little weight. If that is not
# intended, cluster on scale(cluster_data) instead -- the assignments and all
# downstream output would change.
# NOTE(review): nstart is left at its default, so the solution comes from a
# single random initialisation; a larger nstart is usually more stable.
set.seed(123)  # Ensure reproducibility in random number generation
kmeans_model <- kmeans(cluster_data, centers = 6)

## View model results
kmeans_model
## K-means clustering with 6 clusters of sizes 28, 93, 571, 130, 912, 266
## 
## Cluster means:
##     tenure  own_home   married household_size    income     sales      profit
## 1 17.82143 0.5000000 0.4285714       2.785714  96.07143 433.91012 204.7848768
## 2 15.16129 0.6129032 0.4516129       3.290323  60.48387 205.77286  89.9889359
## 3 16.17863 0.5183888 0.5271454       2.760070 127.76708  22.36598   4.3885284
## 4 18.75385 0.4000000 0.6230769       2.569231 125.34615 111.75165  44.1802056
## 5 11.61294 0.6250000 0.3980263       2.968202  45.78947  17.75938   0.5171155
## 6 15.28947 0.5526316 0.4812030       2.770677  46.05263  85.81361  32.0590063
## 
## Clustering vector:
##    [1] 5 2 2 5 3 5 5 5 5 5 5 3 5 4 6 5 5 3 5 3 5 3 5 1 3 2 3 3 3 3 3 5 5 6 6 5 5
##   [38] 6 6 3 6 4 5 5 5 5 5 6 4 5 5 5 5 5 3 5 3 5 3 5 1 3 5 2 5 5 5 5 3 5 5 6 6 6
##   [75] 4 6 4 5 6 2 3 3 5 6 3 5 3 2 5 5 5 3 5 5 2 6 3 5 2 5 5 6 5 2 5 3 4 5 5 6 3
##  [112] 5 5 5 5 5 5 5 3 5 5 3 5 5 5 3 2 3 5 6 2 4 6 6 3 4 5 5 5 5 5 5 5 5 3 5 3 5
##  [149] 3 3 5 3 3 6 3 3 5 5 5 5 3 4 5 5 3 5 3 5 6 4 5 3 3 5 3 5 5 5 3 5 2 2 3 5 5
##  [186] 5 5 3 6 4 3 3 2 5 5 5 2 4 6 5 4 5 3 5 3 5 4 5 5 5 5 5 5 3 2 5 3 5 3 3 5 3
##  [223] 5 3 3 5 5 5 5 2 5 6 3 6 5 5 5 5 2 5 5 5 4 3 6 3 5 4 2 5 2 5 5 5 5 3 5 1 5
##  [260] 6 5 6 3 5 3 3 5 5 5 3 3 6 3 3 5 3 3 6 6 4 5 4 5 3 3 6 5 3 5 3 6 6 5 5 3 6
##  [297] 6 3 5 5 3 6 3 5 5 5 6 4 3 5 3 5 6 5 6 5 4 5 3 5 6 5 5 2 5 5 5 5 3 2 4 3 6
##  [334] 5 5 3 5 3 3 3 3 3 3 5 5 4 3 3 5 5 5 5 3 5 6 5 6 1 5 2 6 3 4 6 6 5 5 3 3 6
##  [371] 5 3 3 3 3 1 6 6 3 5 5 2 5 2 5 3 5 6 3 4 5 5 2 3 3 5 6 4 4 3 5 5 5 5 5 3 6
##  [408] 6 2 3 3 5 5 5 6 5 5 3 6 3 2 5 5 5 5 5 5 5 4 6 3 5 6 2 5 5 2 5 3 4 6 5 2 5
##  [445] 3 3 5 5 3 5 6 1 3 5 5 5 3 5 3 3 5 5 5 2 5 3 5 2 5 5 3 3 5 3 3 3 5 5 5 5 2
##  [482] 3 5 5 3 5 5 3 5 3 5 5 3 3 6 3 4 5 3 2 5 5 3 5 3 6 6 3 3 3 3 5 3 6 5 3 6 5
##  [519] 5 6 5 5 5 3 3 3 6 6 3 5 4 5 5 3 3 3 3 3 4 5 3 3 3 5 4 5 6 4 6 6 3 5 2 5 1
##  [556] 5 5 5 5 3 5 5 4 6 6 3 5 3 3 4 5 6 3 2 6 3 5 3 4 6 3 6 4 4 3 5 1 5 5 2 5 3
##  [593] 6 6 5 3 5 3 3 6 3 3 5 6 3 3 5 5 5 2 6 3 5 1 5 5 5 5 4 3 5 3 6 5 5 5 5 3 3
##  [630] 5 5 2 5 5 5 5 3 5 5 3 5 3 5 5 5 5 5 6 6 3 6 6 6 3 1 2 2 5 5 5 5 5 4 3 5 5
##  [667] 5 4 6 5 6 3 6 6 2 3 5 4 3 5 5 5 5 4 4 5 3 3 5 3 5 5 6 6 5 5 3 4 5 3 2 5 3
##  [704] 3 4 3 3 6 5 5 1 6 2 4 1 5 3 5 5 3 5 6 6 6 5 5 5 5 5 5 4 3 5 3 6 5 3 5 6 6
##  [741] 5 5 5 5 5 5 3 5 5 4 3 5 4 4 5 5 3 5 4 5 6 5 5 4 3 6 6 3 3 3 3 6 4 5 4 5 5
##  [778] 5 3 5 6 1 6 5 2 5 6 5 5 5 3 3 3 6 5 5 6 3 5 5 6 3 6 6 3 3 5 5 3 3 5 5 5 4
##  [815] 5 2 3 5 3 5 3 5 4 5 6 4 3 5 4 5 5 3 6 4 3 6 3 3 5 5 3 5 3 5 5 5 5 5 5 5 3
##  [852] 5 6 3 3 5 5 3 5 5 3 3 3 5 3 3 5 2 5 5 6 5 5 3 6 6 3 5 3 5 5 5 3 5 5 3 3 5
##  [889] 3 6 5 6 5 5 3 4 5 3 3 6 4 5 4 3 3 3 5 5 5 5 4 5 3 5 5 5 6 5 5 3 5 5 3 6 6
##  [926] 3 3 5 6 5 5 5 4 5 5 5 5 5 5 5 3 5 3 5 6 2 5 4 6 5 5 5 6 4 3 3 6 5 5 5 4 3
##  [963] 6 5 5 5 3 6 6 6 5 3 5 5 5 5 3 5 5 2 6 6 5 3 5 3 3 5 4 5 5 5 5 5 4 3 5 3 3
## [1000] 3 3 3 6 6 6 3 6 3 5 5 3 3 5 3 5 2 3 3 3 5 6 5 5 3 5 5 3 6 3 1 3 4 3 5 5 2
## [1037] 5 5 5 5 5 6 5 3 5 5 5 5 5 3 4 3 5 4 5 5 5 4 6 5 3 6 3 5 3 2 5 5 5 5 5 3 3
## [1074] 3 5 3 6 3 6 6 5 5 6 2 3 6 3 5 5 6 5 6 3 3 5 6 5 5 3 5 5 2 3 6 3 3 5 5 6 5
## [1111] 3 5 5 3 6 4 5 3 5 3 3 4 5 6 5 5 6 4 3 3 5 4 3 3 3 3 3 3 4 6 3 3 3 4 5 5 3
## [1148] 2 5 3 3 3 5 5 5 5 3 4 6 5 6 5 5 6 3 5 5 3 3 3 5 6 5 3 6 6 5 3 6 3 5 5 4 5
## [1185] 6 3 3 5 3 3 5 3 6 4 5 5 5 5 5 6 3 3 5 5 4 3 3 5 5 5 3 3 3 6 5 5 5 5 6 5 3
## [1222] 5 6 5 3 3 5 5 6 5 4 3 5 4 4 5 3 4 5 3 1 3 3 3 4 5 5 5 5 5 3 3 3 3 5 5 6 5
## [1259] 5 5 3 6 4 5 3 5 5 3 5 5 5 3 3 5 3 5 5 5 5 5 5 3 3 5 5 5 3 5 4 5 3 6 5 5 5
## [1296] 6 3 3 3 5 5 5 5 3 5 5 3 6 5 5 5 3 5 5 5 3 6 5 5 5 3 5 5 5 5 3 3 5 5 5 5 5
## [1333] 4 5 5 5 3 5 5 2 5 4 2 6 5 3 5 3 3 3 3 5 5 2 5 3 5 5 6 5 1 5 3 3 5 6 4 3 5
## [1370] 3 5 3 3 6 5 5 3 3 6 4 5 5 5 3 5 5 2 4 3 3 3 5 2 5 5 3 3 6 5 5 6 3 4 3 5 5
## [1407] 3 5 5 4 5 3 3 2 6 5 2 3 5 4 5 3 3 2 5 5 5 6 1 5 3 5 5 5 3 5 2 5 5 5 3 4 5
## [1444] 4 6 5 6 3 6 5 3 5 3 1 5 3 5 5 5 3 4 3 5 3 3 5 6 3 5 4 2 5 3 5 5 5 3 5 3 6
## [1481] 3 5 5 3 5 6 5 5 4 5 5 5 5 5 5 1 2 5 6 3 3 3 5 3 5 5 5 3 4 5 5 4 5 3 5 5 2
## [1518] 5 6 5 5 5 6 6 6 3 3 3 4 3 6 4 6 5 2 3 5 3 3 4 6 2 6 5 6 2 5 5 5 5 5 5 3 5
## [1555] 3 6 6 2 6 2 3 5 4 5 5 5 5 6 6 3 5 5 5 6 3 5 4 2 3 5 5 6 6 2 4 5 2 3 5 1 5
## [1592] 5 6 5 5 5 5 5 5 5 5 5 1 5 6 5 2 5 3 5 5 5 3 5 3 5 5 4 2 5 5 5 3 3 4 5 5 3
## [1629] 5 5 4 5 6 5 5 3 5 2 5 3 3 3 3 5 5 5 5 5 3 5 5 6 2 5 4 5 6 5 3 4 5 3 5 6 5
## [1666] 3 6 3 5 5 5 5 3 6 3 3 3 3 3 5 5 5 3 5 3 3 3 6 3 5 3 5 6 5 3 5 3 1 1 6 3 3
## [1703] 3 5 6 6 3 6 1 5 2 3 5 5 5 5 3 4 4 3 4 5 3 5 5 5 2 5 5 2 6 6 5 5 3 5 3 3 5
## [1740] 3 5 5 5 3 5 3 6 5 3 3 6 5 6 3 5 6 5 5 3 2 5 3 6 5 6 5 3 5 3 2 6 4 5 5 3 3
## [1777] 5 3 3 3 3 3 5 6 5 5 3 5 3 5 5 6 3 6 6 5 5 5 5 6 6 5 5 6 5 3 5 3 3 6 5 6 5
## [1814] 3 6 5 3 5 3 3 5 3 5 3 5 5 5 3 5 5 5 5 5 3 3 5 6 6 5 3 5 5 5 6 5 3 5 5 3 3
## [1851] 3 3 2 5 1 3 3 5 3 3 3 5 5 5 5 5 5 5 6 5 6 5 6 6 5 3 3 5 1 5 5 4 2 5 4 5 3
## [1888] 5 3 3 3 6 3 3 4 3 5 6 6 5 5 3 5 5 3 2 2 3 5 5 5 5 1 2 5 5 4 5 3 5 3 3 3 3
## [1925] 4 3 5 3 2 4 5 3 5 6 5 5 5 5 6 4 3 5 3 3 6 3 3 5 5 3 6 3 5 5 3 5 5 5 6 4 6
## [1962] 5 3 3 6 3 5 3 2 5 4 5 5 3 3 3 3 2 5 6 3 5 1 5 6 5 5 3 3 5 5 5 3 5 5 5 3 5
## [1999] 3 3
## 
## Within cluster sum of squares by cluster:
## [1] 503852.1 323019.0 768792.8 323478.8 691495.1 344053.9
##  (between_SS / total_SS =  82.1 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
# STEP 7: Clustering Analysis
# Attach each customer's k-means cluster assignment as a new `cluster` column
retail_segmentation <- mutate(
  retail_segmentation,
  cluster = kmeans_model$cluster
)

# Preview the first rows, now including sales, profit and cluster
head(retail_segmentation)
##   Cust_No avg_order_size avg_order_freq crossbuy multichannel   per_sale tenure
## 1       1      23.400000      2.2222222        3            2 0.00000000      3
## 2       2      34.260377      6.6250000        7            2 0.11111111     35
## 3       3      43.575641      4.8750000        5            2 0.07407407     12
## 4       4      26.316667      0.9000000        4            2 0.25000000      9
## 5       5       8.269231      1.0833333        3            1 0.50000000     40
## 6       6      21.500000      0.2222222        1            2 0.00000000      7
##   return_rate married own_home household_size loyalty_card income age
## 1   0.1175214       1        1              1            1     35  47
## 2   0.2818684       1        1              3            1    140  70
## 3   0.2741769       1        0              4            0     35  21
## 4   0.1435508       0        0              1            1     35  62
## 5   0.0000000       0        0              2            0    140  21
## 6   0.0000000       0        1              1            1     80  21
##   avg_mktg_cnt zip_code      sales      profit cluster
## 1    56.000000    21230  52.000000 -14.9600000       5
## 2    14.914286    22301 226.975000 106.8412857       2
## 3    20.083333    19002 212.431250  95.4017500       2
## 4     8.222222    22304  23.685000   6.1495333       5
## 5     1.350000    20124   8.958333   3.6458333       3
## 6     2.714286    22033   4.777778   0.4487302       5
# Scatter plot of income (x) against profit (y), one colour per cluster
ggplot(
  data = retail_segmentation,
  mapping = aes(x = income, y = profit, colour = factor(cluster))
) +
  geom_point(size = 3) +
  theme_minimal() +
  labs(x = "Income", y = "Profit", colour = "Cluster")

# Per-cluster means of the clustering variables (the k-means centers),
# rounded to two decimals, with the cluster id moved into a "Cluster" column
tibble::rownames_to_column(
  as.data.frame(round(kmeans_model$centers, 2)),
  var = "Cluster"
)
##   Cluster tenure own_home married household_size income  sales profit
## 1       1  17.82     0.50    0.43           2.79  96.07 433.91 204.78
## 2       2  15.16     0.61    0.45           3.29  60.48 205.77  89.99
## 3       3  16.18     0.52    0.53           2.76 127.77  22.37   4.39
## 4       4  18.75     0.40    0.62           2.57 125.35 111.75  44.18
## 5       5  11.61     0.62    0.40           2.97  45.79  17.76   0.52
## 6       6  15.29     0.55    0.48           2.77  46.05  85.81  32.06
# Enhanced cluster visualization using factoextra package
# Points only, with a normal-distribution ellipse drawn around each cluster.
fviz_cluster(
  kmeans_model,
  data = cluster_data,
  geom = "point",
  ellipse.type = "norm"
)

# STEP 8: Further Analysis and Visualization of Clusters

# 1. Cluster Distribution (Bar Plot)
# One bar per cluster; bar height is the number of customers assigned to it.
ggplot(
  data = retail_segmentation,
  mapping = aes(x = factor(cluster), fill = factor(cluster))
) +
  geom_bar() +
  theme_minimal() +
  labs(
    x = "Cluster",
    y = "Number of Customers",
    title = "Distribution of Customers Across Clusters"
  )

# 2. Boxplots for Cluster Characteristics
# Helper: boxplot of one column of retail_segmentation, split by cluster.
#   y_col   - name of the column to plot, as a string
#   y_label - axis label, also used as the first word(s) of the title
plot_cluster_boxplot <- function(y_col, y_label) {
  ggplot(
    retail_segmentation,
    aes(x = factor(cluster), y = .data[[y_col]], fill = factor(cluster))
  ) +
    geom_boxplot() +
    labs(
      x = "Cluster",
      y = y_label,
      title = paste(y_label, "Distribution Across Clusters")
    ) +
    theme_minimal()
}

# Boxplot for income across clusters
plot_cluster_boxplot("income", "Income")

# Boxplot for sales across clusters
plot_cluster_boxplot("sales", "Sales")

# Boxplot for profit across clusters
plot_cluster_boxplot("profit", "Profit")

# 4. Interactive 3D Scatter Plot
# Income, sales and profit on the three axes; marker colour encodes cluster.
plot_ly(
  data = retail_segmentation,
  x = ~income,
  y = ~sales,
  z = ~profit,
  color = ~factor(cluster),
  colors = rainbow(6)
) %>%
  add_markers() %>%
  layout(
    title = "3D Scatter Plot of Clusters",
    scene = list(
      xaxis = list(title = "Income"),
      yaxis = list(title = "Sales"),
      zaxis = list(title = "Profit")
    )
  )
# 5. Cluster Profiling
# Mean of every clustering variable within each cluster; result columns are
# named avg_<variable> (avg_tenure, avg_own_home, ..., avg_profit).
cluster_profile <- retail_segmentation %>%
  group_by(cluster) %>%
  summarise(
    across(
      c(tenure, own_home, married, household_size, income, sales, profit),
      mean,
      .names = "avg_{.col}"
    )
  )

# View cluster profile
print(cluster_profile)
## # A tibble: 6 × 8
##   cluster avg_tenure avg_own_home avg_married avg_household_size avg_income
##     <int>      <dbl>        <dbl>       <dbl>              <dbl>      <dbl>
## 1       1       17.8        0.5         0.429               2.79       96.1
## 2       2       15.2        0.613       0.452               3.29       60.5
## 3       3       16.2        0.518       0.527               2.76      128. 
## 4       4       18.8        0.4         0.623               2.57      125. 
## 5       5       11.6        0.625       0.398               2.97       45.8
## 6       6       15.3        0.553       0.481               2.77       46.1
## # ℹ 2 more variables: avg_sales <dbl>, avg_profit <dbl>
# Boxplot for average order frequency across clusters
# (the title previously said "Sales", copied from the sales boxplot, although
# the y variable plotted here is avg_order_freq)
ggplot(
  retail_segmentation,
  aes(x = factor(cluster), y = avg_order_freq, fill = factor(cluster))
) +
  geom_boxplot() +
  labs(
    x = "Cluster",
    y = "avg_order_freq",
    title = "Average Order Frequency Distribution Across Clusters"
  ) +
  theme_minimal()