This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
# Set up the working directory (replace the path with your own project folder).
# The path is a machine-specific absolute path, so only change directory when
# it actually exists; otherwise stay in the current directory and warn rather
# than aborting the whole script with "cannot change working directory".
project_dir <- "/Users/macbookair/desktop/MKTG R Project 2"
if (dir.exists(project_dir)) {
  setwd(project_dir)
} else {
  warning(
    "Working directory not changed: '", project_dir, "' does not exist. ",
    "Files will be read relative to ", getwd(), ".",
    call. = FALSE
  )
}
STEP 1: Load Required Libraries
# Load Required Libraries
library(tidyverse) # For data manipulation and analysis
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.5.1 ✔ purrr 1.0.4
## ✔ tibble 3.2.1 ✔ dplyr 1.1.4
## ✔ tidyr 1.3.1 ✔ stringr 1.5.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(dplyr) # For data manipulation (already attached by tidyverse above)
library(data.table) # Fast tabular-data tooling. NOTE(review): the k-means steps below call kmeans() from stats, not data.table -- confirm this package is still needed
##
## Attaching package: 'data.table'
##
## The following objects are masked from 'package:dplyr':
##
## between, first, last
##
## The following object is masked from 'package:purrr':
##
## transpose
library(ggplot2) # For data visualization
library (factoextra) # For clustering visualization
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library (plotly) # For interactive visualization
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
STEP 2: Load & Explore the dataset (retail_segmentation.csv)
## Load the customer data from the CSV file in the working directory
retail_segmentation <- utils::read.csv(file = "retail_segmentation.csv")
## Preview the first six rows
head(retail_segmentation, n = 6L)
## Cust_No avg_order_size avg_order_freq crossbuy multichannel per_sale tenure
## 1 1 23.400000 2.2222222 3 2 0.00000000 3
## 2 2 34.260377 6.6250000 7 2 0.11111111 35
## 3 3 43.575641 4.8750000 5 2 0.07407407 12
## 4 4 26.316667 0.9000000 4 2 0.25000000 9
## 5 5 8.269231 1.0833333 3 1 0.50000000 40
## 6 6 21.500000 0.2222222 1 2 0.00000000 7
## return_rate married own_home household_size loyalty_card income age
## 1 0.1175214 1 1 1 1 35 47
## 2 0.2818684 1 1 3 1 140 70
## 3 0.2741769 1 0 4 0 35 21
## 4 0.1435508 0 0 1 1 35 62
## 5 0.0000000 0 0 2 0 140 21
## 6 0.0000000 0 1 1 1 80 21
## avg_mktg_cnt zip_code
## 1 56.000000 21230
## 2 14.914286 22301
## 3 20.083333 19002
## 4 8.222222 22304
## 5 1.350000 20124
## 6 2.714286 22033
## Inspect column names, types, and the first few values of each column
utils::str(retail_segmentation)
## 'data.frame': 2000 obs. of 16 variables:
## $ Cust_No : int 1 2 3 4 5 6 7 8 9 10 ...
## $ avg_order_size: num 23.4 34.26 43.58 26.32 8.27 ...
## $ avg_order_freq: num 2.22 6.62 4.88 0.9 1.08 ...
## $ crossbuy : int 3 7 5 4 3 1 2 3 1 1 ...
## $ multichannel : int 2 2 2 2 1 2 1 2 1 1 ...
## $ per_sale : num 0 0.1111 0.0741 0.25 0.5 ...
## $ tenure : int 3 35 12 9 40 7 8 17 14 3 ...
## $ return_rate : num 0.118 0.282 0.274 0.144 0 ...
## $ married : int 1 1 1 0 0 0 1 0 0 0 ...
## $ own_home : int 1 1 0 0 0 1 0 1 1 1 ...
## $ household_size: int 1 3 4 1 2 1 1 1 2 8 ...
## $ loyalty_card : int 1 1 0 1 0 1 0 1 1 1 ...
## $ income : int 35 140 35 35 140 80 70 35 35 35 ...
## $ age : int 47 70 21 62 21 21 86 70 57 21 ...
## $ avg_mktg_cnt : num 56 14.91 20.08 8.22 1.35 ...
## $ zip_code : int 21230 22301 19002 22304 20124 22033 8757 8109 21122 21208 ...
## Per-column summary statistics (min, quartiles, mean, max)
summary(object = retail_segmentation)
## Cust_No avg_order_size avg_order_freq crossbuy
## Min. : 1.0 Min. : 1.833 Min. : 0.02778 Min. :1.000
## 1st Qu.: 500.8 1st Qu.: 23.157 1st Qu.: 0.30769 1st Qu.:1.000
## Median :1000.5 Median : 30.790 Median : 0.76923 Median :2.000
## Mean :1000.5 Mean : 35.373 Mean : 1.55640 Mean :2.608
## 3rd Qu.:1500.2 3rd Qu.: 40.959 3rd Qu.: 1.90584 3rd Qu.:4.000
## Max. :2000.0 Max. :528.250 Max. :31.87500 Max. :7.000
## multichannel per_sale tenure return_rate
## Min. :1.000 Min. :0.0000 Min. : 1.00 Min. :0.00000
## 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.: 4.00 1st Qu.:0.00000
## Median :1.000 Median :0.0000 Median :10.00 Median :0.01947
## Mean :1.557 Mean :0.1033 Mean :14.12 Mean :0.17671
## 3rd Qu.:2.000 3rd Qu.:0.1400 3rd Qu.:20.00 3rd Qu.:0.24560
## Max. :3.000 Max. :1.0000 Max. :40.00 Max. :6.90909
## married own_home household_size loyalty_card
## Min. :0.0000 Min. :0.000 Min. :1.000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.:1.000 1st Qu.:0.0000
## Median :0.0000 Median :1.000 Median :2.000 Median :1.0000
## Mean :0.4635 Mean :0.568 Mean :2.869 Mean :0.6185
## 3rd Qu.:1.0000 3rd Qu.:1.000 3rd Qu.:4.000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.000 Max. :8.000 Max. :1.0000
## income age avg_mktg_cnt zip_code
## Min. : 35.00 Min. :21.00 Min. : 0.00 Min. : 7726
## 1st Qu.: 35.00 1st Qu.:21.00 1st Qu.: 4.00 1st Qu.:19010
## Median : 60.00 Median :37.00 Median : 7.75 Median :20854
## Mean : 75.79 Mean :42.93 Mean : 12.94 Mean :18326
## 3rd Qu.:110.00 3rd Qu.:61.25 3rd Qu.: 15.00 3rd Qu.:21776
## Max. :175.00 Max. :99.00 Max. :297.00 Max. :24060
STEP 3: Preprocessing the Data (Manipulation)
## Derive two per-customer measures used in the rest of the analysis:
##   sales  = avg_order_freq x avg_order_size
##   profit = 0.52 x sales - 0.75 x avg_mktg_cnt
retail_segmentation <- mutate(
  retail_segmentation,
  sales = avg_order_size * avg_order_freq,
  profit = sales * 0.52 - avg_mktg_cnt * 0.75
)
### View the updated dataframe
# View() opens a data-viewer window, which is only meaningful when someone is
# sitting at the R session; skip it when the script is run or knitted
# non-interactively so rendering does not depend on a viewer being available.
if (interactive()) {
  View(retail_segmentation) ## see the new columns created.
}
STEP 4: Select variables for clustering analysis ## In your work, you can replace one or two variables from the list
# Keep only the columns that are fed to the clustering steps below
# (column order is preserved because kmeans() reports centers in this order).
cluster_data <- select(
  retail_segmentation,
  tenure, own_home, married, household_size, income,
  sales, profit, avg_mktg_cnt, loyalty_card, avg_order_freq
)
STEP 5: Determine the Optimal Number of Clusters (Elbow Method)
## Elbow method for choosing the number of clusters: plot the within-cluster
## sum of squares (WCSS) for k = 1..20 and look for the "elbow" where adding
## more clusters stops reducing WCSS noticeably.
## kmeans() starts from randomly chosen centers, so fix the seed to make the
## curve reproducible, and pass nstart = 25 (forwarded to kmeans()) so every k
## uses the best of 25 random starts instead of a single, possibly poor, one.
set.seed(123)
fviz_nbclust(cluster_data, kmeans, method = "wss", k.max = 20, nstart = 25)
## Notes on the Elbow chart.
### Look for the "elbow point" where the WCSS (within-cluster sum of squares)
### stops decreasing significantly.
### For our example, we will choose 6 clusters (a judgment call based on
### the "elbow" point in the chart).
STEP 6: Perform k-means clustering (Assuming k=6 from elbow point)
# kmeans() picks its starting centers at random, so without a fixed seed both
# the solution and the cluster numbering (1..6) can change from run to run.
# The segment names assigned later are looked up by cluster number, so the
# numbering must be stable: fix the seed, and use nstart = 25 so the model
# kept is the best of 25 random starts rather than a single one.
set.seed(123)
kmeans_model <- kmeans(cluster_data, centers = 6, nstart = 25)
## View model results (sizes, centers, assignments, sums of squares)
kmeans_model
## K-means clustering with 6 clusters of sizes 31, 447, 280, 35, 880, 327
##
## Cluster means:
## tenure own_home married household_size income sales profit
## 1 17.903226 0.4838710 0.4193548 2.838710 94.35484 421.91801 198.351647
## 2 14.029083 0.5346756 0.4429530 2.843400 94.30649 25.15432 6.377704
## 3 16.339286 0.4892857 0.5321429 2.817857 72.16071 145.67261 61.218801
## 4 1.857143 0.4285714 0.3714286 2.771429 66.14286 50.93522 -51.869758
## 5 12.109091 0.6386364 0.4125000 2.979545 39.92614 24.26219 5.145437
## 6 18.718654 0.5137615 0.5840979 2.663609 149.35780 28.63241 7.639105
## avg_mktg_cnt loyalty_card avg_order_freq
## 1 28.060959 0.5483871 12.4101222
## 2 8.936721 0.6219239 0.8698941
## 3 19.374608 0.6357143 4.4103521
## 4 104.474762 0.7142857 1.5262847
## 5 9.961202 0.6147727 0.8408859
## 6 9.666334 0.6055046 0.9509172
##
## Clustering vector:
## [1] 5 3 3 5 6 2 2 5 5 5 5 2 2 3 2 5 5 6 5 2 2 6 2 1 6 3 6 6 6 2 6 2 5 5 5 5 5
## [38] 5 2 6 5 2 2 5 5 5 5 3 3 5 5 5 5 5 6 5 6 5 6 5 1 2 4 3 5 5 5 5 6 5 5 5 5 3
## [75] 3 3 3 2 5 3 2 6 5 3 2 5 6 3 5 5 5 2 5 5 3 5 6 4 3 5 5 5 2 3 5 6 4 5 5 3 6
## [112] 5 5 2 5 5 5 5 6 5 5 2 5 5 5 2 3 2 2 2 3 2 4 2 2 3 5 5 5 5 5 5 5 2 6 5 2 5
## [149] 2 2 5 6 2 5 2 6 2 5 5 5 2 6 2 5 2 2 2 5 3 3 5 2 2 5 2 5 5 5 6 5 3 3 2 5 5
## [186] 2 5 2 5 3 2 6 3 5 5 5 3 2 5 5 3 2 6 5 2 2 3 2 5 5 5 5 5 6 3 5 2 5 2 6 5 6
## [223] 5 6 6 5 5 2 5 3 5 3 6 5 5 5 5 4 3 5 5 5 3 6 3 2 5 3 3 5 3 5 5 5 5 2 5 1 5
## [260] 3 5 5 2 5 6 6 5 4 5 2 2 5 2 6 5 6 6 5 5 3 5 3 5 6 6 3 5 2 5 2 2 2 5 5 6 3
## [297] 3 6 5 5 2 5 6 5 5 5 5 3 6 2 2 5 5 5 2 2 3 5 2 2 5 2 5 3 5 5 5 5 6 3 6 2 5
## [334] 2 5 2 5 6 6 6 6 6 2 5 5 3 6 2 5 5 5 5 2 2 3 5 5 1 5 3 3 6 6 4 3 5 2 6 6 5
## [371] 5 6 2 2 6 1 3 3 6 5 5 3 2 3 5 2 5 5 2 3 4 5 3 6 6 5 5 3 6 2 5 5 5 5 5 2 3
## [408] 3 3 6 6 5 5 5 5 2 5 2 5 2 3 5 5 5 4 5 5 2 2 2 2 5 5 3 2 5 3 5 6 3 3 5 3 5
## [445] 6 6 5 5 2 5 5 1 6 5 5 5 6 5 6 2 5 5 5 3 5 6 2 3 5 5 6 6 5 6 2 2 2 2 2 2 3
## [482] 6 5 5 6 5 5 6 5 6 5 5 6 6 5 2 2 2 2 3 5 5 6 5 2 3 5 6 2 6 2 5 6 5 5 2 5 5
## [519] 5 5 5 5 5 6 2 6 3 5 6 5 3 5 2 2 6 2 6 6 3 5 2 2 6 5 6 4 5 3 5 5 6 5 3 5 1
## [556] 5 5 2 5 6 5 5 3 5 5 6 5 6 2 3 5 2 6 3 3 6 5 6 3 3 6 5 3 3 6 5 1 2 5 3 5 2
## [593] 5 3 5 2 5 6 6 3 6 6 5 3 6 6 5 5 5 3 5 6 5 1 5 5 5 4 3 2 2 6 5 2 5 5 5 2 2
## [630] 2 5 3 5 5 5 5 4 5 5 2 5 6 5 4 2 5 2 3 2 2 3 3 3 6 1 3 1 2 5 5 5 5 4 2 5 5
## [667] 5 6 3 5 2 2 4 2 3 2 2 3 6 5 5 5 5 3 3 5 6 2 5 2 5 4 2 2 5 5 2 4 2 6 1 5 2
## [704] 2 6 6 2 3 5 5 1 3 3 6 1 5 6 5 2 2 5 3 5 3 5 5 2 5 5 2 6 6 5 6 5 5 2 5 3 5
## [741] 2 5 5 5 5 5 2 5 5 6 2 5 6 3 5 5 6 5 3 5 3 5 4 3 6 3 5 6 6 2 6 5 3 5 6 5 5
## [778] 5 2 5 5 1 3 5 3 5 3 5 5 5 2 2 6 5 2 5 3 6 5 5 5 6 5 5 2 2 5 5 6 6 5 2 5 2
## [815] 5 3 2 4 2 5 6 5 6 5 3 3 2 5 3 5 5 2 5 2 6 5 6 2 5 2 2 5 2 2 5 5 5 5 5 5 6
## [852] 5 3 6 2 5 5 2 5 5 6 2 2 5 6 2 5 3 5 5 3 5 2 2 5 3 2 5 6 5 2 5 6 5 2 6 6 5
## [889] 6 5 5 5 4 5 6 2 2 6 2 5 3 5 3 6 2 2 5 2 5 5 3 5 2 5 2 2 5 5 5 2 5 5 2 3 3
## [926] 2 6 5 5 2 5 5 3 5 5 5 5 5 2 5 6 5 2 5 5 3 5 3 3 5 2 5 5 3 6 6 3 5 5 2 6 6
## [963] 3 5 5 5 6 2 5 3 5 2 5 5 5 5 2 5 5 3 3 5 2 6 2 6 6 5 6 5 5 5 5 5 3 2 5 2 2
## [1000] 2 6 2 5 3 3 6 5 2 5 5 6 2 5 2 2 3 2 6 2 5 5 2 5 6 5 5 6 5 2 1 6 2 6 5 5 3
## [1037] 5 5 5 5 5 5 5 6 5 5 5 5 5 2 6 6 5 6 5 5 5 3 3 2 2 5 6 5 2 3 5 5 5 5 5 2 2
## [1074] 2 5 6 5 6 5 4 4 5 3 3 2 5 6 5 5 5 2 5 6 2 5 5 5 5 6 2 5 3 2 3 6 6 5 5 3 5
## [1111] 6 5 5 2 5 6 2 6 2 6 6 3 5 5 5 5 3 3 6 2 5 2 6 2 6 6 2 6 3 5 6 6 6 3 5 2 2
## [1148] 3 2 6 6 6 5 5 5 5 2 3 3 5 5 5 5 3 2 5 5 2 2 6 4 3 5 2 3 3 5 6 5 6 5 2 3 5
## [1185] 3 6 2 5 6 6 5 2 3 6 5 5 5 5 5 2 6 6 5 5 2 4 6 5 5 5 6 2 2 5 5 5 5 2 5 5 2
## [1222] 5 2 5 2 2 5 5 3 5 2 6 5 3 6 5 2 3 5 2 1 6 6 6 2 5 5 2 5 5 2 6 6 4 5 5 2 2
## [1259] 5 5 6 3 3 5 2 5 5 6 5 5 5 6 2 5 2 2 2 5 2 5 5 6 6 5 5 5 2 5 3 5 2 5 5 5 5
## [1296] 3 2 6 2 5 5 5 5 2 2 5 2 3 5 5 5 2 5 5 5 6 3 5 5 5 2 5 5 5 2 2 6 5 5 2 5 5
## [1333] 3 5 5 5 6 5 4 3 5 3 3 5 5 2 5 6 6 2 2 5 2 3 5 2 5 5 3 5 1 5 2 6 4 3 6 2 4
## [1370] 2 5 2 2 3 5 2 6 6 3 6 5 5 5 6 2 2 3 3 6 2 2 5 3 5 5 6 2 3 5 5 5 2 3 6 5 5
## [1407] 2 5 5 6 5 6 6 3 3 5 3 2 2 3 5 2 6 3 5 5 5 5 1 5 6 5 5 5 6 5 3 5 5 5 2 2 5
## [1444] 3 2 5 5 6 3 5 6 5 6 1 5 6 5 5 5 2 3 6 5 6 6 5 3 6 5 3 1 5 2 5 5 2 2 2 2 2
## [1481] 2 5 5 6 5 5 5 5 3 2 5 5 5 2 5 1 3 5 3 6 6 6 5 2 5 5 5 2 3 5 5 3 5 2 5 5 3
## [1518] 5 5 5 5 5 2 3 5 2 2 6 6 2 3 6 4 5 3 6 5 6 2 2 2 3 5 5 5 3 2 2 5 2 5 5 6 5
## [1555] 2 5 3 3 3 3 6 2 2 5 5 5 5 3 3 2 5 5 5 3 2 5 3 3 2 5 2 3 2 3 3 5 3 6 2 1 5
## [1592] 5 5 5 5 5 5 5 5 2 2 5 1 5 5 5 3 5 6 2 5 5 2 5 6 2 5 2 3 5 5 5 2 2 3 5 5 2
## [1629] 5 2 3 5 5 5 5 6 2 3 5 6 2 6 6 5 5 5 5 5 2 5 5 5 3 5 3 5 2 2 6 3 2 6 5 3 5
## [1666] 2 5 6 5 5 5 5 2 3 6 6 6 6 6 5 5 5 2 5 2 6 2 5 6 5 6 5 3 5 6 5 6 1 1 3 6 6
## [1703] 6 5 3 3 2 3 1 5 3 6 5 5 5 5 2 3 6 6 3 5 2 5 5 5 3 5 5 3 5 2 5 4 6 2 6 6 5
## [1740] 2 5 5 2 2 5 2 5 5 2 2 3 5 5 2 5 3 5 5 6 3 2 2 5 5 3 5 2 5 6 3 3 3 5 5 2 6
## [1777] 4 2 6 6 2 2 5 2 5 5 2 5 2 5 5 3 6 3 3 5 5 5 5 3 2 5 5 3 2 6 5 6 6 2 5 5 5
## [1814] 2 5 5 2 5 6 2 5 2 5 6 5 5 5 6 2 2 5 5 4 2 2 5 3 5 2 6 5 2 5 3 2 6 5 5 2 6
## [1851] 6 2 3 5 1 6 2 2 2 6 6 5 5 2 5 5 5 5 5 5 2 5 5 5 5 2 6 5 1 5 5 2 3 5 3 5 2
## [1888] 5 2 2 2 5 2 2 3 2 2 5 5 5 5 2 5 5 6 3 3 2 5 5 2 5 1 3 5 2 6 5 6 5 6 6 2 6
## [1925] 3 4 5 2 3 6 5 2 2 5 5 2 5 5 5 3 2 5 4 2 2 2 6 5 2 6 5 6 5 4 2 5 5 5 3 2 3
## [1962] 5 2 6 3 2 5 2 3 5 2 5 5 6 2 2 2 3 5 5 6 2 1 5 3 5 5 2 2 5 5 5 6 5 5 5 6 2
## [1999] 6 6
##
## Within cluster sum of squares by cluster:
## [1] 578933.7 479665.3 1424176.7 233579.7 780322.4 444458.3
## (between_SS / total_SS = 77.0 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
### Notes about the output of kmeans model
#### The output of the k-means model is a list
#### containing the cluster assignments, the cluster centers,
#### and other information.
#### The metric between_SS/total_SS helps evaluate how well the
#### clusters are separated. It represents the proportion of total
#### variance that is explained by the clusters,
#### which is 77.0% in the output shown above (it can differ between runs when no random seed is set).
# Show the cluster centers (per-cluster variable means) rounded to two
# decimals, with the cluster number moved from the row names into a
# "Cluster" column.
tibble::rownames_to_column(
  as.data.frame(round(kmeans_model$centers, 2)),
  var = "Cluster"
)
## Cluster tenure own_home married household_size income sales profit
## 1 1 17.90 0.48 0.42 2.84 94.35 421.92 198.35
## 2 2 14.03 0.53 0.44 2.84 94.31 25.15 6.38
## 3 3 16.34 0.49 0.53 2.82 72.16 145.67 61.22
## 4 4 1.86 0.43 0.37 2.77 66.14 50.94 -51.87
## 5 5 12.11 0.64 0.41 2.98 39.93 24.26 5.15
## 6 6 18.72 0.51 0.58 2.66 149.36 28.63 7.64
## avg_mktg_cnt loyalty_card avg_order_freq
## 1 28.06 0.55 12.41
## 2 8.94 0.62 0.87
## 3 19.37 0.64 4.41
## 4 104.47 0.71 1.53
## 5 9.96 0.61 0.84
## 6 9.67 0.61 0.95
STEP 7.1: Clustering Analysis (Income vs Profit) scatterpoint
## Attach each customer's k-means cluster number to the original data frame
retail_segmentation <- mutate(
  retail_segmentation,
  cluster = kmeans_model$cluster
)
## Preview the first rows to confirm the new `cluster` column is present
head(retail_segmentation, n = 6L)
## Cust_No avg_order_size avg_order_freq crossbuy multichannel per_sale tenure
## 1 1 23.400000 2.2222222 3 2 0.00000000 3
## 2 2 34.260377 6.6250000 7 2 0.11111111 35
## 3 3 43.575641 4.8750000 5 2 0.07407407 12
## 4 4 26.316667 0.9000000 4 2 0.25000000 9
## 5 5 8.269231 1.0833333 3 1 0.50000000 40
## 6 6 21.500000 0.2222222 1 2 0.00000000 7
## return_rate married own_home household_size loyalty_card income age
## 1 0.1175214 1 1 1 1 35 47
## 2 0.2818684 1 1 3 1 140 70
## 3 0.2741769 1 0 4 0 35 21
## 4 0.1435508 0 0 1 1 35 62
## 5 0.0000000 0 0 2 0 140 21
## 6 0.0000000 0 1 1 1 80 21
## avg_mktg_cnt zip_code sales profit cluster
## 1 56.000000 21230 52.000000 -14.9600000 5
## 2 14.914286 22301 226.975000 106.8412857 3
## 3 20.083333 19002 212.431250 95.4017500 3
## 4 8.222222 22304 23.685000 6.1495333 5
## 5 1.350000 20124 8.958333 3.6458333 6
## 6 2.714286 22033 4.777778 0.4487302 2
## Income (x) against profit (y), one point per customer, colored by
## cluster number
ggplot(
  data = retail_segmentation,
  mapping = aes(x = income, y = profit, color = factor(cluster))
) +
  geom_point(size = 3) +
  labs(x = "Income", y = "profit", color = "Cluster")
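### Notes for interpretation: judging by the cluster means printed above,
### Clusters 1 and 3 are the most profitable on average (mean profit 198.35
### and 61.22), while Cluster 4 has a negative mean profit (-51.87).
### Cluster numbers can change between runs, so re-check after re-running.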
Step 7.2 Clustering Analysis (Income vs Profit) cluster graph
## We can use "kmeans_model$centers" to get the cluster centers,
## which contain the average values of the variables for each cluster.
## See the average of the variables per cluster.
## Notes on the results: in the centers printed above, Clusters 1 and 3 have
## the highest average profit, and Cluster 4 has a negative average profit.
# Human-readable segment names; element k is the name given to cluster
# number k, so this mapping depends on the numbering produced by kmeans().
cluster_labels <- c(
  "Best Customers",
  "Loyal Customers",
  "Potential Loyalists",
  "At-Risk Customers",
  "Lost Customers",
  "Hesitant Shoppers"
)
# Look up each customer's segment name from its cluster number
retail_segmentation[["cluster_name"]] <-
  cluster_labels[retail_segmentation[["cluster"]]]
head(retail_segmentation, n = 6L)
## Cust_No avg_order_size avg_order_freq crossbuy multichannel per_sale tenure
## 1 1 23.400000 2.2222222 3 2 0.00000000 3
## 2 2 34.260377 6.6250000 7 2 0.11111111 35
## 3 3 43.575641 4.8750000 5 2 0.07407407 12
## 4 4 26.316667 0.9000000 4 2 0.25000000 9
## 5 5 8.269231 1.0833333 3 1 0.50000000 40
## 6 6 21.500000 0.2222222 1 2 0.00000000 7
## return_rate married own_home household_size loyalty_card income age
## 1 0.1175214 1 1 1 1 35 47
## 2 0.2818684 1 1 3 1 140 70
## 3 0.2741769 1 0 4 0 35 21
## 4 0.1435508 0 0 1 1 35 62
## 5 0.0000000 0 0 2 0 140 21
## 6 0.0000000 0 1 1 1 80 21
## avg_mktg_cnt zip_code sales profit cluster cluster_name
## 1 56.000000 21230 52.000000 -14.9600000 5 Lost Customers
## 2 14.914286 22301 226.975000 106.8412857 3 Potential Loyalists
## 3 20.083333 19002 212.431250 95.4017500 3 Potential Loyalists
## 4 8.222222 22304 23.685000 6.1495333 5 Lost Customers
## 5 1.350000 20124 8.958333 3.6458333 6 Hesitant Shoppers
## 6 2.714286 22033 4.777778 0.4487302 2 Loyal Customers
# Income (x) against profit (y), colored by the named customer segment
ggplot(
  data = retail_segmentation,
  mapping = aes(x = income, y = profit, color = factor(cluster_name))
) +
  geom_point(size = 3) +
  labs(x = "Income", y = "Profit", color = "Customer Segment") +
  theme_minimal()
# Cluster graph for the two variables named in this step (income vs profit).
# Without choose.vars, fviz_cluster() projects all ten clustering variables
# onto their first two principal components, which is not an income/profit
# plot and is identical for every step that repeats the call.
# NOTE: fviz_cluster() standardizes the plotted variables by default
# (stand = TRUE), so the axes are in standardized units.
fviz_cluster(
  kmeans_model,
  data = cluster_data,
  choose.vars = c("income", "profit"),
  geom = "point",
  ellipse.type = "norm"
) +
  scale_color_manual(
    values = c("red", "green", "blue", "yellow", "pink", "cyan"),
    labels = cluster_labels
  ) +
  labs(title = "Cluster Segmentation", color = "Customer Segment")
Step 8.1 (return_rate vs profit) scatter point
## Return rate (x) against profit (y), one point per customer, colored by
## the named customer segment
ggplot(
  data = retail_segmentation,
  mapping = aes(x = return_rate, y = profit, color = factor(cluster_name))
) +
  geom_point(size = 3) +
  labs(x = "return_rate", y = "Profit", color = "Customer Segment") +
  theme_minimal()
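### Notes for interpretation: compare how profit varies across return-rate
### levels for each segment. By mean profit (see the cluster means above),
### Cluster 1 ("Best Customers") and Cluster 3 ("Potential Loyalists") are the
### most profitable, while Cluster 4 ("At-Risk Customers") is negative on average.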
Step 8.2 (return_rate vs profit) cluster graph
# Cluster graph for the two variables named in this step (return_rate vs
# profit). Without choose.vars, fviz_cluster() would draw the same
# principal-component projection of cluster_data as the other cluster graphs.
# return_rate is not one of the clustering columns in cluster_data, so the
# coordinates are taken from retail_segmentation instead; its rows are in the
# same order as cluster_data, so they line up with kmeans_model$cluster.
# NOTE: fviz_cluster() standardizes the plotted variables by default
# (stand = TRUE), so the axes are in standardized units.
fviz_cluster(
  kmeans_model,
  data = retail_segmentation,
  choose.vars = c("return_rate", "profit"),
  geom = "point",
  ellipse.type = "norm"
) +
  scale_color_manual(
    values = c("red", "green", "blue", "yellow", "pink", "cyan"),
    labels = cluster_labels
  ) +
  labs(title = "Cluster Segmentation", color = "Customer Segment")
Step 9.1 (average order frequency vs average marketing count) scatter point
# Average marketing count (x) against average order frequency (y), colored
# by the named customer segment
ggplot(
  data = retail_segmentation,
  mapping = aes(
    x = avg_mktg_cnt,
    y = avg_order_freq,
    color = factor(cluster_name)
  )
) +
  geom_point(size = 3) +
  labs(
    x = "average marketing count",
    y = "average order frequency",
    color = "Customer Segment"
  ) +
  theme_minimal()
Step 9.2 (average order frequency vs average marketing count) cluster graph
# Cluster graph for the two variables named in this step (average marketing
# count vs average order frequency). Without choose.vars, fviz_cluster()
# would draw the same principal-component projection of cluster_data as the
# other cluster graphs.
# NOTE: fviz_cluster() standardizes the plotted variables by default
# (stand = TRUE), so the axes are in standardized units.
fviz_cluster(
  kmeans_model,
  data = cluster_data,
  choose.vars = c("avg_mktg_cnt", "avg_order_freq"),
  geom = "point",
  ellipse.type = "norm"
) +
  scale_color_manual(
    values = c("red", "green", "blue", "yellow", "pink", "cyan"),
    labels = cluster_labels
  ) +
  labs(title = "Cluster Segmentation", color = "Customer Segment")