setwd("/Users/elyseterceira/Documents/year 3/semester 2/MKTG 3P98/Rstudio/assignment 1")
library (ggplot2)
library (dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library (data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
library (cluster)
library (factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library (plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.4 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ✔ readr 2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ data.table::between() masks dplyr::between()
## ✖ plotly::filter() masks dplyr::filter(), stats::filter()
## ✖ data.table::first() masks dplyr::first()
## ✖ lubridate::hour() masks data.table::hour()
## ✖ lubridate::isoweek() masks data.table::isoweek()
## ✖ dplyr::lag() masks stats::lag()
## ✖ data.table::last() masks dplyr::last()
## ✖ lubridate::mday() masks data.table::mday()
## ✖ lubridate::minute() masks data.table::minute()
## ✖ lubridate::month() masks data.table::month()
## ✖ lubridate::quarter() masks data.table::quarter()
## ✖ lubridate::second() masks data.table::second()
## ✖ purrr::transpose() masks data.table::transpose()
## ✖ lubridate::wday() masks data.table::wday()
## ✖ lubridate::week() masks data.table::week()
## ✖ lubridate::yday() masks data.table::yday()
## ✖ lubridate::year() masks data.table::year()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
retail_segmentation <-read.csv("retail_segmentation.csv")
head(retail_segmentation)
## Cust_No avg_order_size avg_order_freq crossbuy multichannel per_sale tenure
## 1 1 23.400000 2.2222222 3 2 0.00000000 3
## 2 2 34.260377 6.6250000 7 2 0.11111111 35
## 3 3 43.575641 4.8750000 5 2 0.07407407 12
## 4 4 26.316667 0.9000000 4 2 0.25000000 9
## 5 5 8.269231 1.0833333 3 1 0.50000000 40
## 6 6 21.500000 0.2222222 1 2 0.00000000 7
## return_rate married own_home household_size loyalty_card income age
## 1 0.1175214 1 1 1 1 35 47
## 2 0.2818684 1 1 3 1 140 70
## 3 0.2741769 1 0 4 0 35 21
## 4 0.1435508 0 0 1 1 35 62
## 5 0.0000000 0 0 2 0 140 21
## 6 0.0000000 0 1 1 1 80 21
## avg_mktg_cnt zip_code
## 1 56.000000 21230
## 2 14.914286 22301
## 3 20.083333 19002
## 4 8.222222 22304
## 5 1.350000 20124
## 6 2.714286 22033
str(retail_segmentation)
## 'data.frame': 2000 obs. of 16 variables:
## $ Cust_No : int 1 2 3 4 5 6 7 8 9 10 ...
## $ avg_order_size: num 23.4 34.26 43.58 26.32 8.27 ...
## $ avg_order_freq: num 2.22 6.62 4.88 0.9 1.08 ...
## $ crossbuy : int 3 7 5 4 3 1 2 3 1 1 ...
## $ multichannel : int 2 2 2 2 1 2 1 2 1 1 ...
## $ per_sale : num 0 0.1111 0.0741 0.25 0.5 ...
## $ tenure : int 3 35 12 9 40 7 8 17 14 3 ...
## $ return_rate : num 0.118 0.282 0.274 0.144 0 ...
## $ married : int 1 1 1 0 0 0 1 0 0 0 ...
## $ own_home : int 1 1 0 0 0 1 0 1 1 1 ...
## $ household_size: int 1 3 4 1 2 1 1 1 2 8 ...
## $ loyalty_card : int 1 1 0 1 0 1 0 1 1 1 ...
## $ income : int 35 140 35 35 140 80 70 35 35 35 ...
## $ age : int 47 70 21 62 21 21 86 70 57 21 ...
## $ avg_mktg_cnt : num 56 14.91 20.08 8.22 1.35 ...
## $ zip_code : int 21230 22301 19002 22304 20124 22033 8757 8109 21122 21208 ...
summary(retail_segmentation)
## Cust_No avg_order_size avg_order_freq crossbuy
## Min. : 1.0 Min. : 1.833 Min. : 0.02778 Min. :1.000
## 1st Qu.: 500.8 1st Qu.: 23.157 1st Qu.: 0.30769 1st Qu.:1.000
## Median :1000.5 Median : 30.790 Median : 0.76923 Median :2.000
## Mean :1000.5 Mean : 35.373 Mean : 1.55640 Mean :2.608
## 3rd Qu.:1500.2 3rd Qu.: 40.959 3rd Qu.: 1.90584 3rd Qu.:4.000
## Max. :2000.0 Max. :528.250 Max. :31.87500 Max. :7.000
## multichannel per_sale tenure return_rate
## Min. :1.000 Min. :0.0000 Min. : 1.00 Min. :0.00000
## 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.: 4.00 1st Qu.:0.00000
## Median :1.000 Median :0.0000 Median :10.00 Median :0.01947
## Mean :1.557 Mean :0.1033 Mean :14.12 Mean :0.17671
## 3rd Qu.:2.000 3rd Qu.:0.1400 3rd Qu.:20.00 3rd Qu.:0.24560
## Max. :3.000 Max. :1.0000 Max. :40.00 Max. :6.90909
## married own_home household_size loyalty_card
## Min. :0.0000 Min. :0.000 Min. :1.000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.:1.000 1st Qu.:0.0000
## Median :0.0000 Median :1.000 Median :2.000 Median :1.0000
## Mean :0.4635 Mean :0.568 Mean :2.869 Mean :0.6185
## 3rd Qu.:1.0000 3rd Qu.:1.000 3rd Qu.:4.000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.000 Max. :8.000 Max. :1.0000
## income age avg_mktg_cnt zip_code
## Min. : 35.00 Min. :21.00 Min. : 0.00 Min. : 7726
## 1st Qu.: 35.00 1st Qu.:21.00 1st Qu.: 4.00 1st Qu.:19010
## Median : 60.00 Median :37.00 Median : 7.75 Median :20854
## Mean : 75.79 Mean :42.93 Mean : 12.94 Mean :18326
## 3rd Qu.:110.00 3rd Qu.:61.25 3rd Qu.: 15.00 3rd Qu.:21776
## Max. :175.00 Max. :99.00 Max. :297.00 Max. :24060
retail_segmentation <- retail_segmentation %>%
mutate(sales = avg_order_freq * avg_order_size,
profit = 0.52 * sales - 0.75 * avg_mktg_cnt)
View(retail_segmentation)
cluster_data <-retail_segmentation %>% select(age, zip_code,
avg_order_size, household_size, income, avg_order_freq, profit)
fviz_nbclust(cluster_data, kmeans, k.max = 20, method = "wss")
set.seed (123)
kmeans_model <- kmeans(cluster_data, centers = 6)
kmeans_model
## K-means clustering with 6 clusters of sizes 773, 289, 428, 241, 93, 176
##
## Cluster means:
## age zip_code avg_order_size household_size income avg_order_freq
## 1 42.69082 20788.577 36.30394 2.776197 74.89004 1.461444
## 2 43.42215 22267.201 35.56425 2.958478 77.38754 1.792427
## 3 42.85981 8284.979 34.16271 2.927570 76.29673 1.596838
## 4 44.70124 19076.979 35.43431 2.879668 75.97510 1.522804
## 5 41.32258 23207.118 36.34726 3.258065 80.69892 1.435899
## 6 41.79545 21845.523 33.31323 2.767045 73.01136 1.597251
## profit
## 1 14.34727
## 2 19.27478
## 3 15.31798
## 4 14.97264
## 5 13.31554
## 6 18.68085
##
## Clustering vector:
## [1] 1 2 4 2 1 6 3 3 1 1 1 2 1 1 1 4 4 5 3 4 1 1 2 2 2 4 1 6 1 3 1 2 1 4 1 4 3
## [38] 6 2 2 3 1 1 6 4 2 2 4 1 5 2 1 2 2 1 1 2 4 3 6 1 1 2 1 1 2 3 1 6 1 3 2 3 2
## [75] 3 1 5 4 1 3 1 2 1 1 2 2 1 1 1 6 1 1 4 4 1 1 2 1 1 6 3 1 1 3 3 4 5 6 3 2 6
## [112] 1 3 4 2 2 1 2 1 3 2 4 5 1 1 1 3 3 3 1 3 3 6 1 1 3 3 5 1 1 1 2 1 4 6 4 1 3
## [149] 1 2 1 3 4 1 6 4 1 1 1 1 1 2 1 2 1 1 4 1 1 6 3 2 5 2 3 1 2 3 3 1 2 1 1 1 4
## [186] 1 3 5 1 2 1 6 3 2 2 4 1 1 6 3 1 4 6 6 1 2 2 2 2 3 1 4 6 1 3 3 3 3 3 5 2 1
## [223] 1 1 6 6 3 4 3 2 4 1 2 1 1 6 4 4 3 3 1 3 1 3 3 2 1 6 5 1 6 1 1 3 3 1 3 4 4
## [260] 3 2 2 1 1 4 3 1 6 5 4 1 1 1 5 2 5 3 2 1 2 1 4 2 1 1 5 1 3 4 6 4 1 3 6 1 3
## [297] 3 1 3 3 1 6 6 1 2 3 1 4 4 1 1 2 3 3 1 1 5 2 2 3 1 1 3 1 5 1 1 2 6 3 6 1 1
## [334] 1 1 6 2 1 3 1 3 3 1 1 2 1 1 1 2 1 1 1 2 5 2 3 4 1 1 4 1 1 1 3 1 6 6 6 5 3
## [371] 6 4 3 4 1 1 1 1 2 4 2 1 3 3 2 1 1 1 4 1 4 2 6 3 3 1 3 1 6 4 5 1 4 1 1 1 1
## [408] 2 4 3 4 1 6 6 1 2 1 4 3 1 1 4 3 3 1 1 6 1 3 2 5 2 1 2 1 3 4 1 4 1 6 3 4 1
## [445] 1 6 4 5 4 3 3 2 5 5 1 1 1 1 3 1 4 6 2 1 1 3 5 2 3 1 2 1 4 3 6 4 4 3 1 3 1
## [482] 1 3 1 4 5 1 1 3 6 2 6 3 3 1 3 1 3 3 3 6 1 2 1 1 1 3 3 6 1 2 4 5 2 2 5 4 1
## [519] 6 1 2 6 1 3 6 4 1 1 3 3 3 1 1 3 3 6 1 6 1 3 1 1 3 2 1 3 2 1 1 4 1 6 3 1 2
## [556] 1 3 6 4 1 3 1 2 1 2 1 3 1 6 3 3 2 1 3 1 2 2 1 1 3 2 1 4 3 1 1 3 4 3 1 6 1
## [593] 2 3 1 1 1 1 3 2 1 1 1 1 2 5 3 3 1 6 1 3 4 2 2 1 5 3 1 3 3 2 4 3 1 4 1 2 1
## [630] 4 3 1 5 6 6 2 1 1 3 4 2 2 2 3 3 6 3 1 3 4 1 1 6 4 3 3 3 6 1 1 1 1 1 1 2 2
## [667] 5 1 1 1 1 5 1 4 1 1 6 6 1 3 2 1 1 1 4 3 3 6 1 3 6 1 1 2 3 5 4 4 1 4 5 1 3
## [704] 3 1 1 2 2 1 4 2 2 4 4 6 2 2 1 1 2 2 3 1 4 1 3 3 4 4 1 6 3 3 2 4 2 1 3 3 6
## [741] 1 3 1 3 1 1 1 1 1 1 4 4 1 1 1 3 1 2 1 1 6 3 3 3 3 3 1 1 6 3 1 4 3 3 5 1 6
## [778] 3 4 2 1 6 3 1 4 4 1 1 2 1 2 1 3 3 3 1 4 4 1 6 2 2 6 1 1 1 4 2 6 4 2 1 1 5
## [815] 6 1 1 4 1 2 3 3 1 2 1 1 2 1 3 1 1 5 1 2 1 4 1 1 4 2 1 1 3 1 1 1 3 1 1 3 3
## [852] 3 5 2 1 1 3 1 6 1 3 3 1 2 4 1 3 6 1 2 1 5 1 2 5 4 1 1 1 2 2 1 5 3 1 5 1 4
## [889] 1 1 5 4 3 3 1 2 5 1 1 1 6 2 2 4 1 3 3 2 3 6 1 1 2 6 1 3 6 1 4 6 1 6 3 1 6
## [926] 6 6 4 2 1 6 3 1 3 3 1 3 2 3 6 1 2 3 2 4 1 3 5 3 1 3 6 3 2 1 3 2 3 1 2 6 5
## [963] 4 1 1 6 1 1 1 3 1 6 1 3 1 6 1 3 1 2 3 1 3 1 1 1 1 1 1 2 1 4 2 1 3 3 3 2 2
## [1000] 1 4 6 3 5 1 3 1 3 1 4 3 5 1 1 2 3 3 1 2 1 1 5 2 4 1 2 1 6 1 2 4 3 1 1 4 6
## [1037] 6 3 3 3 2 3 1 1 1 2 4 3 1 1 3 2 2 3 6 6 4 1 6 5 3 3 2 4 2 1 2 1 1 4 2 2 2
## [1074] 6 1 1 1 3 3 5 4 4 2 3 1 3 1 4 1 1 1 1 3 1 1 1 4 1 6 6 1 6 3 5 4 3 6 3 2 4
## [1111] 3 4 5 1 1 4 1 1 5 3 3 4 1 2 1 2 1 3 1 2 1 4 1 3 1 3 1 1 4 3 1 1 1 1 3 1 4
## [1148] 5 1 1 1 4 1 1 3 1 3 2 1 3 1 3 3 1 1 1 2 1 2 1 2 6 1 3 1 4 4 1 3 2 2 5 4 1
## [1185] 1 3 1 4 1 1 1 1 1 2 3 1 5 4 3 3 4 3 2 1 2 3 1 1 6 1 6 1 1 3 3 2 6 4 1 3 1
## [1222] 1 2 4 1 1 6 4 1 4 6 2 1 2 3 4 1 4 3 3 3 2 3 1 1 2 1 1 1 2 2 1 1 1 6 2 1 3
## [1259] 3 3 4 3 3 3 5 6 3 3 1 1 1 3 1 3 3 4 3 2 2 2 1 4 6 6 4 1 4 3 1 2 1 3 1 2 3
## [1296] 1 1 3 1 4 5 1 1 1 6 1 2 1 6 3 4 2 1 2 5 1 3 4 2 2 3 1 1 1 1 6 5 1 4 2 3 6
## [1333] 6 1 3 1 1 3 4 1 5 1 1 2 1 5 3 2 3 4 1 1 1 1 6 4 3 1 3 3 3 1 3 1 3 3 2 1 1
## [1370] 1 4 3 3 3 1 4 2 6 2 3 2 2 3 1 1 3 2 1 6 2 1 5 2 1 4 2 1 4 5 4 3 2 1 2 2 3
## [1407] 4 2 3 2 1 1 2 3 3 3 2 1 1 1 1 2 2 4 3 1 5 6 6 3 4 6 1 1 3 4 1 3 4 2 1 1 3
## [1444] 1 1 6 1 1 6 1 1 1 3 2 4 1 1 1 3 4 1 1 1 1 4 1 2 2 4 4 6 3 6 1 5 1 1 3 1 1
## [1481] 2 1 1 4 2 3 1 1 2 5 1 1 1 3 6 1 1 3 1 1 1 3 1 2 1 2 3 3 4 3 1 4 1 4 1 3 1
## [1518] 5 3 4 1 1 4 1 5 1 1 1 2 6 2 2 4 3 3 2 1 5 3 1 1 4 4 1 1 4 3 1 1 1 1 2 2 1
## [1555] 5 2 3 1 2 5 4 1 4 1 4 2 1 1 1 6 5 4 6 1 1 1 1 2 1 4 1 1 4 5 2 3 3 3 3 3 3
## [1592] 3 5 1 1 1 5 4 6 4 3 1 1 1 2 1 1 1 1 3 6 1 6 2 3 1 3 4 3 5 3 1 1 3 6 2 1 2
## [1629] 3 1 1 3 6 1 6 3 1 2 6 1 1 2 4 3 2 2 3 1 2 3 1 1 1 1 1 1 5 3 3 2 4 1 1 2 3
## [1666] 3 1 4 6 2 3 1 3 3 3 3 4 4 3 2 3 4 6 6 1 2 1 6 1 1 4 1 6 4 2 6 1 4 6 4 1 1
## [1703] 4 3 3 1 3 4 1 1 1 4 1 3 1 1 3 3 1 1 1 3 2 1 1 1 1 1 1 3 3 6 1 2 1 1 1 1 5
## [1740] 3 3 3 3 4 3 3 4 1 1 6 1 1 1 3 4 3 1 4 2 4 6 1 4 1 1 6 1 6 5 3 1 1 2 3 2 1
## [1777] 2 4 4 5 1 6 6 1 1 6 1 6 1 4 5 3 1 1 1 4 5 1 3 1 1 3 1 2 1 2 4 3 1 4 3 3 4
## [1814] 1 3 1 1 1 3 5 1 6 1 2 5 3 1 1 1 1 3 1 3 1 5 1 1 1 3 2 1 6 4 4 2 2 4 4 1 6
## [1851] 5 4 1 4 3 2 3 1 1 3 5 2 3 6 3 2 1 3 6 3 2 1 6 6 4 4 2 6 6 1 1 5 3 4 2 3 3
## [1888] 1 1 4 1 1 2 3 1 2 4 3 1 5 6 4 3 6 3 3 2 2 4 1 1 1 4 2 1 3 3 3 3 1 3 2 1 1
## [1925] 4 3 1 2 3 1 1 3 1 2 3 4 1 1 5 6 1 1 3 6 3 3 1 3 1 4 1 3 1 1 3 1 3 1 1 1 1
## [1962] 2 1 4 2 3 2 4 1 5 4 3 1 3 4 1 4 1 1 6 1 2 2 6 1 4 1 2 3 4 1 2 3 1 4 1 1 3
## [1999] 1 1
##
## Within cluster sum of squares by cluster:
## [1] 135617966 7658759 44326795 4905060 9211629 10855460
## (between_SS / total_SS = 99.6 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
retail_segmentation <- retail_segmentation %>%
mutate(cluster = kmeans_model$cluster)
head(retail_segmentation)
## Cust_No avg_order_size avg_order_freq crossbuy multichannel per_sale tenure
## 1 1 23.400000 2.2222222 3 2 0.00000000 3
## 2 2 34.260377 6.6250000 7 2 0.11111111 35
## 3 3 43.575641 4.8750000 5 2 0.07407407 12
## 4 4 26.316667 0.9000000 4 2 0.25000000 9
## 5 5 8.269231 1.0833333 3 1 0.50000000 40
## 6 6 21.500000 0.2222222 1 2 0.00000000 7
## return_rate married own_home household_size loyalty_card income age
## 1 0.1175214 1 1 1 1 35 47
## 2 0.2818684 1 1 3 1 140 70
## 3 0.2741769 1 0 4 0 35 21
## 4 0.1435508 0 0 1 1 35 62
## 5 0.0000000 0 0 2 0 140 21
## 6 0.0000000 0 1 1 1 80 21
## avg_mktg_cnt zip_code sales profit cluster
## 1 56.000000 21230 52.000000 -14.9600000 1
## 2 14.914286 22301 226.975000 106.8412857 2
## 3 20.083333 19002 212.431250 95.4017500 4
## 4 8.222222 22304 23.685000 6.1495333 2
## 5 1.350000 20124 8.958333 3.6458333 1
## 6 2.714286 22033 4.777778 0.4487302 6
clustercolours <- c("pink", "orchid1", "lightblue", "deepskyblue1", "aquamarine1", "darkorchid1")
ggplot(retail_segmentation, aes(x = income, y = profit, color = factor(cluster))) +
geom_point(size = 3) +
scale_color_manual(values = clustercolours) + # Apply custom colors
labs(x = "Income", y = "Profit", color = "Cluster") +
theme_minimal()
kmeans_model$centers %>%
round(2) %>%
as.data.frame() %>%
tibble::rownames_to_column(var = "Cluster")
## Cluster age zip_code avg_order_size household_size income avg_order_freq
## 1 1 42.69 20788.58 36.30 2.78 74.89 1.46
## 2 2 43.42 22267.20 35.56 2.96 77.39 1.79
## 3 3 42.86 8284.98 34.16 2.93 76.30 1.60
## 4 4 44.70 19076.98 35.43 2.88 75.98 1.52
## 5 5 41.32 23207.12 36.35 3.26 80.70 1.44
## 6 6 41.80 21845.52 33.31 2.77 73.01 1.60
## profit
## 1 14.35
## 2 19.27
## 3 15.32
## 4 14.97
## 5 13.32
## 6 18.68
fviz_cluster(kmeans_model, data=cluster_data, geom="point",
ellipse.type="norm")