knitr::opts_chunk$set(echo = TRUE)
#' Clean slate
rm(list = ls())
#' Get working directory
getwd()
## [1] "/Users/rexlei/Documents/GitHub/BDM-Assignment-Group-4/main./BDM Group 4"
#' Import Libraries
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(writexl)
library(summarytools)
library(Hmisc)
## Loading required package: survival
##
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
##
## cluster
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:summarytools':
##
## label, label<-
## The following objects are masked from 'package:base':
##
## format.pval, units
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:Hmisc':
##
## src, summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(magrittr)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ tibble 3.1.6 ✔ purrr 0.3.4
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::extract() masks magrittr::extract()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ✖ purrr::set_names() masks magrittr::set_names()
## ✖ dplyr::src() masks Hmisc::src()
## ✖ dplyr::summarize() masks Hmisc::summarize()
## ✖ tibble::view() masks summarytools::view()
library(data.table)
##
## Attaching package: 'data.table'
## The following object is masked from 'package:purrr':
##
## transpose
## The following objects are masked from 'package:dplyr':
##
## between, first, last
library(mclust)
## Package 'mclust' version 5.4.9
## Type 'citation("mclust")' for citing this R package in publications.
##
## Attaching package: 'mclust'
## The following object is masked from 'package:purrr':
##
## map
library(scatterplot3d)
library(lattice)
library(kohonen)
##
## Attaching package: 'kohonen'
## The following object is masked from 'package:mclust':
##
## map
## The following object is masked from 'package:purrr':
##
## map
library(ggpubr)
library(corrplot)
## corrplot 0.92 loaded
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:purrr':
##
## some
## The following object is masked from 'package:dplyr':
##
## recode
library(paletteer)
library(cluster)
library(purrr)
library(dendextend)
##
## ---------------------
## Welcome to dendextend version 1.15.2
## Type citation('dendextend') for how to cite the package.
##
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
##
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## You may ask questions at stackoverflow, use the r and dendextend tags:
## https://stackoverflow.com/questions/tagged/dendextend
##
## To suppress this message use: suppressPackageStartupMessages(library(dendextend))
## ---------------------
##
## Attaching package: 'dendextend'
## The following object is masked from 'package:ggpubr':
##
## rotate
## The following object is masked from 'package:data.table':
##
## set
## The following object is masked from 'package:stats':
##
## cutree
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(hopkins)
library(plotrix)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:Hmisc':
##
## subplot
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(xlsx)
#' Open text file
df <- as.data.frame(read.table("Group 4.txt",sep = "\t", header = TRUE, row.names = NULL))
#' Show column names and check variable dtype
names(df)
## [1] "customer_id" "Last.purchased" "frequency" "amount"
var_dtype <- str((df)[1:4])
## 'data.frame': 1000 obs. of 4 variables:
## $ customer_id : int 80 90 130 190 220 290 330 400 410 480 ...
## $ Last.purchased: chr "343" "758" "2970" "2592" ...
## $ frequency : int 4 4 1 2 1 2 3 1 4 3 ...
## $ amount : num 75 126 60 65 30 ...
#' Check for duplicates in the dataset within "customer_id"
table(duplicated(df["customer_id"]))
##
## FALSE
## 1000
#' Check its dimension
dim(df)
## [1] 1000 4
#' Check for missing values
table(is.na(df))
##
## FALSE TRUE
## 3972 28
#' Count missing values for each attribute
na_count <- sapply(df,function(y) sum(length(which(is.na(y)))))
na_count
## customer_id Last.purchased frequency amount
## 0 2 9 17
#' Remove any rows with missing values
non_NAN_df <- as.data.frame(na.omit(df))
sapply(non_NAN_df,function(y) sum(length(which(is.na(y)))))
## customer_id Last.purchased frequency amount
## 0 0 0 0
#' Check var_dtype
var_dtype <- str((non_NAN_df)[1:4])
## 'data.frame': 972 obs. of 4 variables:
## $ customer_id : int 80 90 130 190 220 290 330 400 410 480 ...
## $ Last.purchased: chr "343" "758" "2970" "2592" ...
## $ frequency : int 4 4 1 2 1 2 3 1 4 3 ...
## $ amount : num 75 126 60 65 30 ...
#' Convert "recency" variable to numeric
non_NAN_df[,2] <- as.numeric(non_NAN_df[,2])
## Warning: NAs introduced by coercion
sapply(non_NAN_df,
function(y)sum(length(which(is.na(y))))) # Check for new missing values introduced by the coercion
## customer_id Last.purchased frequency amount
## 0 4 0 0
which(non_NAN_df[,2] %in% NA) # Which rows contain the new missing values
## [1] 75 114 115 148
#' Manually input missing values from original df
non_NAN_df[75,2] <- 2969
non_NAN_df[114,2] <- 2192
non_NAN_df[115,2] <- 379
non_NAN_df[148,2] <- 26
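#' A hedged alternative to typing the values in by hand (assumption: the four coercion
#' failures were caused by stray non-numeric characters such as thousands separators or
#' spaces in the original strings): strip everything except digits and the decimal point
#' before coercing, and check that no NAs remain.
clean_recency <- as.numeric(gsub("[^0-9.]", "", df[rownames(non_NAN_df), "Last.purchased"]))
sum(is.na(clean_recency)) # 0 if the assumption holds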
#' Check for missing values if any
sapply(non_NAN_df,
function(y)sum(length(which(is.na(y)))))
## customer_id Last.purchased frequency amount
## 0 0 0 0
#' Check var_dtype
var_dtype <- str((non_NAN_df)[1:4])
## 'data.frame': 972 obs. of 4 variables:
## $ customer_id : int 80 90 130 190 220 290 330 400 410 480 ...
## $ Last.purchased: num 343 758 2970 2592 3664 ...
## $ frequency : int 4 4 1 2 1 2 3 1 4 3 ...
## $ amount : num 75 126 60 65 30 ...
#' Name variables
cus <- non_NAN_df[,1]
rec <- non_NAN_df[,2]
freq <- non_NAN_df[,3]
mon <- non_NAN_df[,4]
#' View Descriptive Statistics
non_NAN_df <- rename(non_NAN_df, recency = Last.purchased, monetary = amount)
descriptive_stats<- descr(non_NAN_df, style = "rmarkdown")
dfSummary(non_NAN_df)
## Data Frame Summary
## non_NAN_df
## Dimensions: 972 x 4
## Duplicates: 0
##
## ------------------------------------------------------------------------------------------------------------------
## No Variable Stats / Values Freqs (% of Valid) Graph Valid Missing
## ---- ------------- ------------------------------ --------------------- --------------------- ---------- ---------
## 1 customer_id Mean (sd) : 12408.3 (5738.6) 972 distinct values : : . . 972 0
## [integer] min < med < max: . : : : : (100.0%) (0.0%)
## 80 < 12595 < 27180 : : : : : .
## IQR (CV) : 8277.5 (0.5) : : : : : : : :
## : : : : : : : : . .
##
## 2 recency Mean (sd) : 1911.3 (1299.2) 529 distinct values : 972 0
## [numeric] min < med < max: : . : (100.0%) (0.0%)
## 1 < 1836.5 < 4014 : . : . . :
## IQR (CV) : 2402.5 (0.7) : : : : : : : :
## : : : : : : : :
##
## 3 frequency Mean (sd) : 2.6 (1.9) 14 distinct values : 972 0
## [integer] min < med < max: : (100.0%) (0.0%)
## 1 < 2 < 18 : .
## IQR (CV) : 3 (0.7) : :
## : : :
##
## 4 monetary Mean (sd) : 69.1 (166.8) 203 distinct values : 972 0
## [numeric] min < med < max: : (100.0%) (0.0%)
## 7.2 < 39.5 < 3100 :
## IQR (CV) : 31.5 (2.4) :
## :
## ------------------------------------------------------------------------------------------------------------------
write.csv(descriptive_stats,"Table 0.0")
# Copy the cleaned data frame "non_NAN_df" into a new working data frame "new_df" for the RFM scoring
new_df <- copy(non_NAN_df)
#' Utilised approach from https://medium.com/analytics-vidhya/customer-segmentation-using-rfm-analysis-in-r-cd8ba4e6891 to classify variables
summary(new_df)
## customer_id recency frequency monetary
## Min. : 80 Min. : 1 Min. : 1.000 Min. : 7.25
## 1st Qu.: 8335 1st Qu.: 731 1st Qu.: 1.000 1st Qu.: 28.51
## Median :12595 Median :1836 Median : 2.000 Median : 39.50
## Mean :12408 Mean :1911 Mean : 2.647 Mean : 69.07
## 3rd Qu.:16612 3rd Qu.:3134 3rd Qu.: 4.000 3rd Qu.: 60.00
## Max. :27180 Max. :4014 Max. :18.000 Max. :3100.00
new_df$R_score <- 0
new_df$F_score <- 0
new_df$M_score <- 0
new_df$score <- 0
new_df$Segment3<- "0"
new_df$Segment6<- "0"
quantile(new_df$recency, probs = seq(0, 1, 1/4))
## 0% 25% 50% 75% 100%
## 1.0 731.0 1836.5 3133.5 4014.0
quantile(new_df$frequency, probs = seq(0, 1, 1/5)) # Since frequency takes whole-number values, we split its percentiles into quintiles (five groups)
## 0% 20% 40% 60% 80% 100%
## 1 1 2 3 4 18
quantile(new_df$monetary, probs = seq(0, 1, 1/4))
## 0% 25% 50% 75% 100%
## 7.2500 28.5119 39.5000 60.0000 3100.0000
# Classify the variables into quantile-based scores
#' R scored
new_df$R_score[new_df$recency >= 3133.5]<- 1
new_df$R_score[new_df$recency >= 1836.5 & new_df$recency <3133.5] <- 2
new_df$R_score[new_df$recency >= 731 & new_df$recency <1836.5] <- 3
new_df$R_score[new_df$recency < 731] <- 4
#' F scored
new_df$F_score[new_df$frequency >=4] <- 4
new_df$F_score[new_df$frequency == 3] <- 3
new_df$F_score[new_df$frequency == 2] <- 2
new_df$F_score[new_df$frequency == 1] <- 1
#' M scored
new_df$M_score[new_df$monetary >= 60] <- 4
new_df$M_score[new_df$monetary < 60.00 & new_df$monetary >= 39.5] <- 3
new_df$M_score[new_df$monetary < 39.50 & new_df$monetary >= 28.5119] <- 2
new_df$M_score[new_df$monetary <28.5119] <- 1
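#' For reference, a more compact scoring sketch using findInterval() (assumption: the same
#' quartile breakpoints as the manual rules above; recency is scored in reverse so that the
#' most recent customers receive the highest R score).
r_breaks <- quantile(new_df$recency,  probs = c(0.25, 0.50, 0.75))
m_breaks <- quantile(new_df$monetary, probs = c(0.25, 0.50, 0.75))
R_alt <- 4L - findInterval(new_df$recency,  r_breaks)  # 4 = most recent quartile
M_alt <- 1L + findInterval(new_df$monetary, m_breaks)  # 4 = highest-spending quartile
table(R_alt == new_df$R_score)                         # sanity check against the manual scoring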
#' Combine each of the scores together
new_df <- new_df %>% mutate(score = 100 *R_score +10 * F_score + M_score)
new_df$Segment6 <- 0
new_df$Segment6[which(new_df$score %in% c(444,434,443, 344, 442, 244, 424, 441))] <-"1 (Best)"
new_df$Segment6[which(new_df$score %in% c(332,333,342, 343, 334, 412,413,414,431,432,441,421,422,423, 424, 433))] <- "2"
new_df$Segment6[which(new_df$score %in% c(233,234, 241,311, 312, 313,314,321,322,323,324, 331, 341))] <- "3"
new_df$Segment6[which(new_df$score %in% c(124, 133, 134, 142, 143, 144, 214,224,234, 242, 243, 232 ))] <- "4"
new_df$Segment6[which(new_df$score %in% c(122, 123,131 ,132, 141, 212, 213, 221, 222, 223, 231,411,114))] <- "5"
new_df$Segment6[which(new_df$score %in% c(111, 112, 113, 121, 131, 211, 311))] <-"6 (Worst)"
new_df$Segment3 <- 0
new_df$Segment3[which(new_df$score %in% c(444,434,443, 344, 442, 244, 424, 441,332,333,342, 343, 334, 412,413,414,431,432,441,421,422,423, 424, 433))] <-"1 (Best)"
new_df$Segment3[which(new_df$score %in% c(233,234, 241,311, 312, 313,314,321,322,323,324, 331, 341, 124, 133, 134, 142, 143, 144, 214,224,234, 242, 243, 232))] <- "2"
new_df$Segment3[which(new_df$score %in% c(122, 123,131 ,132, 141, 212, 213, 221, 222, 223, 231,411,114,111, 112, 113, 121, 131, 211, 311))] <- "3 (Worst)"
##
## 1 (Best) 2 3 4 5 6 (Worst)
## 166 193 130 100 172 211
##
## 1 (Best) 2 3 (Worst)
## 359 230 383
## [1] 61
## [1] 52
## [1] 166
## [1] 359
## The best customers are the ones on the far right of the score distribution, encoded 444 (52 customers). We can also see a pattern here:
## 1. The best class contains 166 customers under the 6-segment scheme and 359 under the 3-segment scheme
## 2. There are more customers at the extreme ends (111 & 444) than towards the middle
## 3. The RFM encoding with 4 levels produces 61 distinct segments
## Non-numerical variable(s) ignored: Segment3, Segment6
## ### Descriptive Statistics
## #### new_df
## **N:** 972
##
## | | customer_id | F_score | frequency | M_score | monetary | R_score |
## |----------------:|------------:|--------:|----------:|--------:|---------:|--------:|
## | **Mean** | 12408.25 | 2.33 | 2.65 | 2.53 | 69.07 | 2.49 |
## | **Std.Dev** | 5738.64 | 1.21 | 1.90 | 1.14 | 166.84 | 1.11 |
## | **Min** | 80.00 | 1.00 | 1.00 | 1.00 | 7.25 | 1.00 |
## | **Q1** | 8330.00 | 1.00 | 1.00 | 1.50 | 28.45 | 1.50 |
## | **Median** | 12595.00 | 2.00 | 2.00 | 2.50 | 39.50 | 2.50 |
## | **Q3** | 16615.00 | 4.00 | 4.00 | 4.00 | 60.00 | 3.00 |
## | **Max** | 27180.00 | 4.00 | 18.00 | 4.00 | 3100.00 | 4.00 |
## | **MAD** | 6078.66 | 1.48 | 1.48 | 2.22 | 21.50 | 0.74 |
## | **IQR** | 8277.50 | 3.00 | 3.00 | 2.25 | 31.49 | 1.25 |
## | **CV** | 0.46 | 0.52 | 0.72 | 0.45 | 2.42 | 0.45 |
## | **Skewness** | -0.02 | 0.23 | 1.94 | -0.01 | 11.94 | 0.00 |
## | **SE.Skewness** | 0.08 | 0.08 | 0.08 | 0.08 | 0.08 | 0.08 |
## | **Kurtosis** | -0.48 | -1.52 | 7.40 | -1.42 | 176.74 | -1.35 |
## | **N.Valid** | 972.00 | 972.00 | 972.00 | 972.00 | 972.00 | 972.00 |
## | **Pct.Valid** | 100.00 | 100.00 | 100.00 | 100.00 | 100.00 | 100.00 |
##
## Table: Table continues below
##
##
##
## | | recency | score |
## |----------------:|--------:|-------:|
## | **Mean** | 1911.31 | 275.36 |
## | **Std.Dev** | 1299.21 | 120.02 |
## | **Min** | 1.00 | 111.00 |
## | **Q1** | 731.00 | 177.00 |
## | **Median** | 1836.50 | 277.50 |
## | **Q3** | 3137.00 | 344.00 |
## | **Max** | 4014.00 | 444.00 |
## | **MAD** | 1769.48 | 98.59 |
## | **IQR** | 2402.50 | 150.00 |
## | **CV** | 0.68 | 0.44 |
## | **Skewness** | 0.09 | -0.01 |
## | **SE.Skewness** | 0.08 | 0.08 |
## | **Kurtosis** | -1.37 | -1.35 |
## | **N.Valid** | 972.00 | 972.00 |
## | **Pct.Valid** | 100.00 | 100.00 |
## Data Frame Summary
## new_df
## Dimensions: 972 x 10
## Duplicates: 0
##
## ------------------------------------------------------------------------------------------------------------------
## No Variable Stats / Values Freqs (% of Valid) Graph Valid Missing
## ---- ------------- ------------------------------ --------------------- --------------------- ---------- ---------
## 1 customer_id Mean (sd) : 12408.3 (5738.6) 972 distinct values : : . . 972 0
## [integer] min < med < max: . : : : : (100.0%) (0.0%)
## 80 < 12595 < 27180 : : : : : .
## IQR (CV) : 8277.5 (0.5) : : : : : : : :
## : : : : : : : : . .
##
## 2 recency Mean (sd) : 1911.3 (1299.2) 529 distinct values : 972 0
## [numeric] min < med < max: : . : (100.0%) (0.0%)
## 1 < 1836.5 < 4014 : . : . . :
## IQR (CV) : 2402.5 (0.7) : : : : : : : :
## : : : : : : : :
##
## 3 frequency Mean (sd) : 2.6 (1.9) 14 distinct values : 972 0
## [integer] min < med < max: : (100.0%) (0.0%)
## 1 < 2 < 18 : .
## IQR (CV) : 3 (0.7) : :
## : : :
##
## 4 monetary Mean (sd) : 69.1 (166.8) 203 distinct values : 972 0
## [numeric] min < med < max: : (100.0%) (0.0%)
## 7.2 < 39.5 < 3100 :
## IQR (CV) : 31.5 (2.4) :
## :
##
## 5 R_score Mean (sd) : 2.5 (1.1) 1 : 243 (25.0%) IIIII 972 0
## [numeric] min < med < max: 2 : 243 (25.0%) IIIII (100.0%) (0.0%)
## 1 < 2.5 < 4 3 : 248 (25.5%) IIIII
## IQR (CV) : 1.2 (0.4) 4 : 238 (24.5%) IIII
##
## 6 F_score Mean (sd) : 2.3 (1.2) 1 : 350 (36.0%) IIIIIII 972 0
## [numeric] min < med < max: 2 : 204 (21.0%) IIII (100.0%) (0.0%)
## 1 < 2 < 4 3 : 161 (16.6%) III
## IQR (CV) : 3 (0.5) 4 : 257 (26.4%) IIIII
##
## 7 M_score Mean (sd) : 2.5 (1.1) 1 : 243 (25.0%) IIIII 972 0
## [numeric] min < med < max: 2 : 243 (25.0%) IIIII (100.0%) (0.0%)
## 1 < 2.5 < 4 3 : 216 (22.2%) IIII
## IQR (CV) : 2.2 (0.5) 4 : 270 (27.8%) IIIII
##
## 8 score Mean (sd) : 275.4 (120) 61 distinct values : : : : 972 0
## [numeric] min < med < max: : : : : (100.0%) (0.0%)
## 111 < 277.5 < 444 : : : :
## IQR (CV) : 150 (0.4) : : : :
## : : : :
##
## 9 Segment3 1. 1 (Best) 359 (36.9%) IIIIIII 972 0
## [character] 2. 2 230 (23.7%) IIII (100.0%) (0.0%)
## 3. 3 (Worst) 383 (39.4%) IIIIIII
##
## 10 Segment6 1. 1 (Best) 166 (17.1%) III 972 0
## [character] 2. 2 193 (19.9%) III (100.0%) (0.0%)
## 3. 3 130 (13.4%) II
## 4. 4 100 (10.3%) II
## 5. 5 172 (17.7%) III
## 6. 6 (Worst) 211 (21.7%) IIII
## ------------------------------------------------------------------------------------------------------------------
#' Transform "monetary" feature to log(monetary) to make dataset more normal
#' But before transforming "monetary" feature to log(monetary),
#' we want to see if there are any negative values in monetary
neg_count <- sum(new_df$monetary < 0) # count negative monetary values
print(neg_count) # returns zero
## [1] 0
# Transform "frequency", monetary" to "log(frequency) + log(monetary)"
logfreq <- log(new_df$frequency)
logmonetary <- log(new_df$monetary)
new_df <- new_df %>%
add_column(logfreq,.after = "frequency")
new_df <- new_df %>%
add_column(logmonetary,.after = "monetary")
# Trying out transforming "monetary" to monetary^(lambda), with lambda estimated by a Box-Cox power transform
lamda <- powerTransform(new_df$monetary,family = "bcPower")
parameter <- lamda[["lambda"]][["new_df$monetary"]] %>% as.numeric()
BCPower_monetary <- (new_df$monetary)^(parameter)
new_df <- new_df %>%
add_column(BCPower_monetary,.after = "logmonetary")
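#' For reference (no assumptions beyond the car package already loaded above):
#' car::bcPower() computes the Box-Cox transform (y^lambda - 1)/lambda, whereas the line
#' above uses the simple power y^lambda; the two are monotonically equivalent, so the
#' ordering of customers is unchanged.
bc_check <- car::bcPower(new_df$monetary, parameter)
all.equal(bc_check, ((new_df$monetary)^parameter - 1) / parameter)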
#' Identify Outliers
outliers_R <- quantile(new_df$recency, c(0.05, 0.95), type = 1)
outliers_F <- quantile(new_df$frequency, c(0.05, 0.95), type = 1)
outliers_M <- quantile(new_df$monetary, c(0.05, 0.95), type = 1)
outliers_F_log <- quantile(new_df$logfreq, c(0.05, 0.95), type = 1)
outliers_M_log <- quantile(new_df$logmonetary, c(0.05, 0.95), type = 1)
outliers_M_BC <- quantile(new_df$BCPower_monetary, c(0.05, 0.95), type = 1)
outliers <- table(outliers_R,outliers_F,outliers_M,outliers_F_log,
outliers_M_log,outliers_M_BC)
#' Winsorise outliers (cap values below the 5th and above the 95th percentile)
new_df %>% setDT()
main_df <- new_df %>%
.[frequency <= outliers_F[1], frequency := outliers_F[1]] %>%
.[frequency >= outliers_F[2], frequency := outliers_F[2]]
main_df <- new_df %>%
.[monetary <= outliers_M[1], monetary := outliers_M[1]] %>%
.[monetary >= outliers_M[2], monetary := outliers_M[2]]
main_df <- new_df %>%
.[logfreq <= outliers_F_log[1], logfreq := outliers_F_log[1]] %>%
.[logfreq >= outliers_F_log[2], logfreq := outliers_F_log[2]]
main_df <- new_df %>%
.[logmonetary <= outliers_M_log[1], logmonetary := outliers_M_log[1]] %>%
.[logmonetary >= outliers_M_log[2], logmonetary := outliers_M_log[2]]
main_df <- new_df %>%
.[BCPower_monetary <= outliers_M_BC[1], BCPower_monetary := outliers_M_BC[1]] %>%
.[BCPower_monetary >= outliers_M_BC[2], BCPower_monetary := outliers_M_BC[2]]
main_df <- main_df %>% as.data.frame()
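#' A reusable sketch of the winsorising step above (assumption: the same 5th/95th
#' percentile caps); it clamps a numeric vector to its empirical quantiles and could
#' replace the repeated data.table blocks.
winsorise <- function(x, probs = c(0.05, 0.95)) {
  caps <- quantile(x, probs = probs, type = 1, na.rm = TRUE)
  pmin(pmax(x, caps[1]), caps[2]) # cap below at the lower and above at the upper percentile
}
# Usage sketch: main_df$logfreq <- winsorise(main_df$logfreq)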
# F-transformed Boxplot code
F_wins_boxplot <-ggplot(main_df, aes(x= frequency, y= "freq + winsor")) + geom_boxplot(color="orange",
outlier.colour="red",
outlier.shape=8,
outlier.size=4) +
stat_summary(fun=mean, geom="point", shape=23, size=4)
F_winslog_boxplot <-ggplot(main_df, aes(x= logfreq, y= "log(freq) + winsor")) + geom_boxplot(color="orange",
outlier.colour="red",
outlier.shape=8,
outlier.size=4) +
stat_summary(fun=mean, geom="point", shape=23, size=4)
# M-transformed (logs) Boxplot code
M_winslog_boxplot <-ggplot(main_df,
aes(x = logmonetary, y="log(mon) + winsor")) +
geom_boxplot(color="orange",
outlier.colour="red",
outlier.shape=8, outlier.size=4) +
stat_summary(fun=mean, geom="point", shape=23, size=4)
# M-transformed (BCPower) Boxplot code
M_winsBC_boxplot <-ggplot(main_df,
aes(x = BCPower_monetary, y="BCPower(mon) + winsor")) +
geom_boxplot(color="orange",
outlier.colour="red",
outlier.shape=8, outlier.size=4) +
stat_summary(fun=mean, geom="point", shape=23, size=4)
F_wins_boxplot
F_winslog_boxplot
M_winslog_boxplot
M_winsBC_boxplot
# Remove unnecessary variables
colnames(main_df)
## [1] "customer_id" "recency" "frequency" "logfreq"
## [5] "monetary" "logmonetary" "BCPower_monetary" "R_score"
## [9] "F_score" "M_score" "score" "Segment3"
## [13] "Segment6"
# Center and Scale df
scalars <- as.data.frame(scale(main_df[,c(2,4,6)]))
# Drop the untransformed columns, then redefine and rename the variables
scaled_df <-select(main_df,-c(recency,frequency,logfreq,monetary,logmonetary))
scaled_df <-scaled_df %>%
add_column(scalars,.after = "customer_id") # no scaling + centering as this is already done through BC transformation
cusID <- scaled_df[,1]
rec <- scaled_df[,2]
lfreq <- scaled_df[,3]
lmon <- scaled_df[,4]
BC_mon <- scaled_df[,5]
R_score <- scaled_df[,6]
F_score <- scaled_df[,7]
M_score <- scaled_df[,8]
RFM_score <-scaled_df[,9]
class3 <- scaled_df[,10]
class6 <- scaled_df[,11]
colnames(scaled_df)
## [1] "customer_id" "recency" "logfreq" "logmonetary"
## [5] "BCPower_monetary" "R_score" "F_score" "M_score"
## [9] "score" "Segment3" "Segment6"
#' Draw 3D scatterplot of RFM analysis, grouping each by R,F,M
scatterplot3d(scaled_df[,2:4],color = scaled_df$R_score, pch = 5, angle = 80, main = "Grouped by R") # color coded by R
scatterplot3d(scaled_df[,2:4],color = scaled_df$F_score, pch = 5, angle = 80, main = "Grouped by F") # color coded by F
scatterplot3d(scaled_df[,2:4],color = scaled_df$M_score, pch = 5, angle = 80, main = "Grouped by M") # color coded by M
scatterplot3d(scaled_df[,2:4],color = scaled_df$score, pch = 5, angle = 80, main = "Grouped by RFM") # color coded by RFM
group6 <- unclass(factor(scaled_df$Segment6))
group3 <- unclass(factor(scaled_df$Segment3))
scatterplot3d(scaled_df[,2:4],color = group6, pch = 5, angle = 80, main = "Grouped by 6 segments") # color coded by class6
scatterplot3d(scaled_df[,2:4],color = group3, pch = 5, angle = 80, main = "Grouped by 3 segments") # color coded by class3
ggplot(data = scaled_df) +
geom_bar(mapping = aes(x = score))
cat("Segmentation is much clearer especially when segmenting across 3 classes and 6 classes. We also show frequency and monetary segments are not as clear as recency, given that recency is more normally distributed than the others.")
## Segmentation is much clearer especially when segmenting across 3 classes and 6 classes. We also show frequency and monetary segments are not as clear as recency, given that recency is more normally distributed than the others.
# Check Correlation prior and after preprocessing treatment on outliers + transformation
corr1 <- cor(non_NAN_df)
corrmatrix <- corrplot(corr1, type = "upper", order = "original", method = "number",
tl.col = "black", tl.srt = 45)
corr2 <- cor(scaled_df[,c(2:4)])
corrmatrix2 <- corrplot(corr2, type = "upper", order = "original", method = "number" ,tl.col = "black", tl.srt = 45)
cat("We are expecting people who came to shops recently to come more often, as newer customers often visit to the same shop if they like it. Plus, given that all variables within thee analysis (RFM) are all crucial for the analysis, we must keep the variables.")
## We are expecting people who came to shops recently to come more often, as newer customers often visit to the same shop if they like it. Plus, given that all variables within thee analysis (RFM) are all crucial for the analysis, we must keep the variables.
## The parameters which I will need to consider for SOM model are:
## 0.grid
## 1.rlen
## 2.alpha
## 3.radius.
## grid = size of the codebook (how the SOM will be visualised).
## rlen = the number of times the complete dataset is presented to the network. Once the grid size is fixed, rlen is chosen by inspecting a training-progress plot.
## alpha = the learning rate, which controls how strongly the codebook (weight) vectors are adjusted during training.
## radius = the radius of the neighbourhood.
## For simplicity, we keep alpha and radius at their default settings.
## Alpha: the learning rate declines linearly from 0.05 to 0.01 over rlen updates.
## Radius: 2/3 of all unit distances.
## We want around 5 to 10 samples per node, and a hexagonal topology is preferred (more neighbours).
## Hence, given n = 972 and 5 to 10 samples per node, we want roughly 97 to 194 nodes.
## Therefore, we need grid sizes of about 10x10 to 14x14 (a short sketch of this calculation follows below). We test each grid size to see which works best while minimising nodes with no samples.
## Note that most SOMs use square (n x n) grids, so we use a square grid too.
## Because grids of 10x10 and above leave many nodes empty, we also test 8x8 and 9x9.
## With 8x8 the range of samples per node increases, which we do not want since we prefer few samples per node; 9x9 and 10x10 give similar samples per node and differ by only one empty node.
## However, in every configuration there is at least one node with more than 40 samples, i.e. a persistent concentration of customers with identical RFM characteristics.
## Since we want as large a grid as is practical, we proceed with the 10x10 grid.
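#' As a minimal sketch of the node-count calculation referred to above (assumption:
#' n = 972 cleaned customers and the 5-to-10-samples-per-node rule of thumb):
n_obs <- nrow(scaled_df)                                 # 972
node_range <- c(ceiling(n_obs / 10), floor(n_obs / 5))   # roughly 97 to 194 nodes
grid_side <- ceiling(sqrt(node_range))                   # candidate square grid sides: 10 to 14
node_range
grid_side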
## Section B Part 1 - Creating the SOM Model
set.seed(123)
scaled_df.grid <- somgrid(xdim = 10,
ydim = 10,
topo = "hexagonal")
RFM_SOM <- as.matrix(scaled_df[, 2:4]) # assumed input: the scaled recency, log(F) and log(M) columns (this definition is not shown in the original code)
scaled_df.SOM <- som(X = RFM_SOM, grid = scaled_df.grid,rlen = 500,keep.data = TRUE)
plot(scaled_df.SOM, type = "counts", main = "Node counts (10x10)", pch = 15)
# Check whether the model is good = test for efficiency
summary(scaled_df.SOM)
## SOM of size 10x10 with a hexagonal topology and a bubble neighbourhood function.
## The number of data layers is 1.
## Distance measure(s) used: sumofsquares.
## Training data included: 972 objects.
## Mean distance to the closest unit in the map: 0.052.
meandist <- mean(scaled_df.SOM$distances) # mean distance between each observation and its closest codebook vector (unit) in the map; the lower, the better
meandist
## [1] 0.05235707
meanunitD <- mean(unit.distances(scaled_df.grid)) #Distances between units give us a clue on the distances prior to the mapping.
meanunitD
## [1] 4.855542
AccountedD <- meanunitD - meandist # meandist is the distance "left over" after mapping; the difference between the mean unit distance going in and what is left when we are done is the distance the map has accounted for.
AccountedD
## [1] 4.803185
EF <- AccountedD/meanunitD #Divide the distances accounted for by the mean distances prior to mapping.
#The ratio should say something about the efficiency.
EF # Close to 1 = good.
## [1] 0.9892171
## In the counts plot, most nodes contain around 10 to 20 samples, and there are clearly 2 nodes with more than 40 samples.
## In the neighbour-distance plot, we can clearly see 2 obvious clusters plus 2 other less obvious clusters (k = 4).
## The quality plot shows the quantisation error: the smaller the distance, the better the objects are represented by their codebook vectors. As in the neighbour-distance plot, we see 2 obvious clusters of higher-quality codes plus 2 other less obvious clusters.
## In the codes plot, we see
## high values of R & M and low F in the bottom-right
## high values of F & M and low R in the bottom-left
## low values of R,F,M in the middle
## high values of R and low values of F & M in the top-right
## high F and low R & M in the middle and top-left. Less obviously, nodes with average values of R, F and M sit in the top-middle and bottom-middle.
##
## The customers we want to target are those in the bottom-left (high F & M, low R).
## These heatmaps show scaled data of each variable. We can clearly see 3-4 clear clusters in R & F. This is more unclear with M
## These heatmaps show unscaled data of each variable. We can clearly see 3-4 clear clusters in R & F. This is more unclear with M but we can roughly see 3-5 clusters.
cat("Before moving on, we can use some metrics to see how clusterable our data is")
## Before moving on, we can use some metrics to see how clusterable our data is
# Use the Hopkins statistic to measure clustering tendency - how feasible it is to cluster the data (values close to 1 indicate high clustering tendency; around 0.5 indicates random data)
set.seed(123)
hopkins_data <- hopkins(scaled_df[2:4])
hopkins_SOM<- hopkins(scaled_df.SOM$unit.classif %>% as.data.frame())
hopkins_data # value > 0.5 so the data is more clusterable
## [1] 0.9641223
hopkins_SOM # value > 0.5 so the data is more clusterable
## [1] 1
cat("This clearly shows SOM does make data more clusterable given that",hopkins_SOM, "is higher than",hopkins_data, "hence, SOM should not be used in this case. Nevertheless, we evaluate the SOM data in the next section")
## This clearly shows SOM does make data more clusterable given that 1 is higher than 0.9641223 hence, SOM should not be used in this case. Nevertheless, we evaluate the SOM data in the next section
# Implement Kmeans on scaled data.SOM codes using Elbow method
set.seed(123) # for reproducibility
par(mfrow=c(1,1)) # back to one plot
clusterdata <- getCodes(scaled_df.SOM)
# Elbow method
fviz_nbclust(clusterdata, kmeans, method = "wss",print.summary = TRUE) +
geom_vline(xintercept = 3, linetype = 2)+
labs(subtitle = "Elbow method")
tot_withinss <- map_dbl(1:10, function(k){
model <- kmeans(x = clusterdata, centers = k)
model$tot.withinss
})
elbow_df <- data.frame(
k = 1:10,
tot_withinss = tot_withinss
)
cat("The Elbow method shows clusters is the best because after 6 the elbow of the curve properly flattens.")
## The Elbow method shows clusters is the best because after 6 the elbow of the curve properly flattens.
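#' The elbow_df computed above is not plotted anywhere; a minimal sketch of the
#' corresponding scree plot (no assumptions beyond ggplot2, which is already loaded):
ggplot(elbow_df, aes(x = k, y = tot_withinss)) +
  geom_line() +
  geom_point() +
  scale_x_continuous(breaks = 1:10) +
  labs(title = "Total within-cluster sum of squares by k", x = "k", y = "Total WSS")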
# Silhouette method
fviz_nbclust(clusterdata, kmeans, method = "silhouette")+
labs(subtitle = "Silhouette method")
# Gap Statistic
GAP <- fviz_nbclust(clusterdata, kmeans, nstart = 25, method = "gap_stat", nboot = 200)+
labs(subtitle = "Gap statistic method")
# Apply NbClust() to view other performance metrics
NbClust::NbClust(data = clusterdata, diss = NULL, distance = "euclidean",min.nc = 3, max.nc = 15, method = "kmeans")
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 8 proposed 3 as the best number of clusters
## * 4 proposed 4 as the best number of clusters
## * 5 proposed 6 as the best number of clusters
## * 2 proposed 12 as the best number of clusters
## * 2 proposed 13 as the best number of clusters
## * 2 proposed 15 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 3
##
##
## *******************************************************************
## $All.index
## KL CH Hartigan CCC Scott Marriot TrCovW TraceW
## 3 1.5766 57.1190 23.7963 -2.2790 195.8664 744186.2 1958.4890 131.7482
## 4 3.3530 54.7825 14.4759 -2.3192 262.5698 679000.3 1176.1741 105.7945
## 5 0.1118 50.3711 20.8600 -2.6729 309.8748 661069.5 730.2879 91.9320
## 6 33.5295 52.7560 9.9260 -1.5964 368.1594 531474.9 691.9410 75.3801
## 7 0.1349 49.7253 10.4847 -1.9301 400.8062 521906.7 478.6726 68.1805
## 8 0.7830 48.3985 5.2835 -1.9415 430.8329 504862.1 458.4106 61.2728
## 9 0.4587 44.9473 11.6397 -2.5834 447.2850 542034.6 393.6300 57.9450
## 10 4.8885 45.8474 6.9234 -2.1214 480.4730 480184.9 343.8238 51.3738
## 11 0.1954 44.6278 10.7390 -2.2669 512.7879 420583.4 308.2712 47.7041
## 12 8.0499 45.9205 5.4913 -1.7609 545.2030 361953.1 234.4606 42.5677
## 13 0.5123 44.6647 5.7083 -1.9706 569.9850 331550.3 211.2091 40.0675
## 14 5.7129 43.8632 4.2731 -2.0830 585.5502 329093.9 191.4074 37.6004
## 15 14.1311 42.5584 3.7213 -2.3615 585.6897 377260.0 155.1240 35.8206
## Friedman Rubin Cindex DB Silhouette Duda Pseudot2 Beale Ratkowsky
## 3 3.6442 2.2049 0.3847 1.1697 0.3085 1.4515 -16.4863 -0.5156 0.4236
## 4 5.4495 2.7458 0.4010 1.0941 0.2869 0.9533 2.1057 0.0811 0.3942
## 5 6.8499 3.1598 0.4029 1.2294 0.2561 1.3352 -7.2797 -0.4036 0.3677
## 6 8.8753 3.8536 0.4120 1.0266 0.3066 1.4918 -7.2529 -0.5238 0.3510
## 7 10.3375 4.2606 0.4115 1.1043 0.2784 1.1803 -3.6656 -0.2414 0.3304
## 8 11.6355 4.7409 0.4275 1.0802 0.2804 1.1247 -1.8849 -0.1753 0.3140
## 9 12.4361 5.0132 0.4143 1.1644 0.2648 1.5509 -6.3938 -0.5375 0.2982
## 10 14.2307 5.6544 0.4689 1.0846 0.2600 1.8022 -6.2315 -0.6630 0.2865
## 11 16.7386 6.0894 0.4064 1.0980 0.2638 2.0845 -6.7634 -0.7381 0.2755
## 12 18.9044 6.8241 0.3841 0.9679 0.2824 1.4936 -4.2959 -0.4822 0.2665
## 13 21.5439 7.2499 0.3756 0.9749 0.2741 0.6192 4.9191 0.9305 0.2573
## 14 22.6290 7.7256 0.4003 0.9999 0.2751 1.5059 -3.0233 -0.4575 0.2490
## 15 21.4708 8.1095 0.5281 1.0342 0.2507 1.0622 -0.6445 -0.0887 0.2414
## Ball Ptbiserial Frey McClain Dunn Hubert SDindex Dindex SDbw
## 3 43.9161 0.5605 0.4231 1.1768 0.1343 0.0065 2.0645 1.0751 1.1334
## 4 26.4486 0.5604 0.6498 1.5326 0.1214 0.0072 1.8933 0.9771 0.5966
## 5 18.3864 0.5223 0.1674 2.1241 0.1556 0.0078 2.1198 0.9029 0.5346
## 6 12.5633 0.5295 0.6630 2.3757 0.1499 0.0082 1.8866 0.8220 0.2856
## 7 9.7401 0.5014 0.4740 2.8044 0.1556 0.0086 2.0219 0.7819 0.3024
## 8 7.6591 0.4772 0.3953 3.2711 0.1707 0.0088 1.9586 0.7429 0.2284
## 9 6.4383 0.4660 0.2953 3.5195 0.1707 0.0090 2.3234 0.7225 0.2201
## 10 5.1374 0.4516 0.4663 3.9001 0.2018 0.0093 2.2168 0.6845 0.1921
## 11 4.3367 0.4304 0.0890 4.4040 0.1838 0.0096 2.3952 0.6519 0.1814
## 12 3.5473 0.4305 0.4835 4.5039 0.1838 0.0099 2.1309 0.6181 0.1811
## 13 3.0821 0.4178 0.1634 4.8325 0.1838 0.0101 2.2230 0.5999 0.1658
## 14 2.6857 0.4131 0.7163 5.0029 0.2018 0.0102 2.1885 0.5830 0.1594
## 15 2.3880 0.3942 0.4898 5.5425 0.2281 0.0103 2.6015 0.5741 0.1413
##
## $All.CriticalValues
## CritValue_Duda CritValue_PseudoT2 Fvalue_Beale
## 3 0.4304 70.1484 1.0000
## 4 0.4208 59.1761 0.9702
## 5 0.2757 76.1793 1.0000
## 6 0.2298 73.7463 1.0000
## 7 0.2115 89.4894 1.0000
## 8 0.2115 63.3884 1.0000
## 9 0.0819 201.6502 1.0000
## 10 0.0438 305.8720 1.0000
## 11 -0.0559 -245.7553 1.0000
## 12 -0.0014 -9535.2353 1.0000
## 13 0.0819 89.6223 0.4412
## 14 -0.1234 -81.9049 1.0000
## 15 0.0819 123.2307 1.0000
##
## $Best.nc
## KL CH Hartigan CCC Scott Marriot TrCovW
## Number_clusters 6.0000 3.000 6.0000 6.0000 4.0000 6.0 4.0000
## Value_Index 33.5295 57.119 10.9341 -1.5964 66.7034 120026.4 782.3149
## TraceW Friedman Rubin Cindex DB Silhouette Duda
## Number_clusters 4.0000 13.0000 12.0000 13.0000 12.0000 3.0000 3.0000
## Value_Index 12.0913 2.6395 -0.3089 0.3756 0.9679 0.3085 1.4515
## PseudoT2 Beale Ratkowsky Ball PtBiserial Frey McClain
## Number_clusters 3.0000 3.0000 3.0000 4.0000 3.0000 2 3.0000
## Value_Index -16.4863 -0.5156 0.4236 17.4675 0.5605 NA 1.1768
## Dunn Hubert SDindex Dindex SDbw
## Number_clusters 15.0000 0 6.0000 0 15.0000
## Value_Index 0.2281 0 1.8866 0 0.1413
##
## $Best.partition
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16
## 3 3 3 3 2 2 2 2 2 2 3 3 3 3 3 2
## V17 V18 V19 V20 V21 V22 V23 V24 V25 V26 V27 V28 V29 V30 V31 V32
## 2 2 2 2 3 3 3 3 2 2 2 2 2 2 3 3
## V33 V34 V35 V36 V37 V38 V39 V40 V41 V42 V43 V44 V45 V46 V47 V48
## 3 3 3 2 2 2 2 2 3 3 3 3 3 1 2 2
## V49 V50 V51 V52 V53 V54 V55 V56 V57 V58 V59 V60 V61 V62 V63 V64
## 2 2 3 3 3 1 3 1 1 1 2 2 1 1 1 1
## V65 V66 V67 V68 V69 V70 V71 V72 V73 V74 V75 V76 V77 V78 V79 V80
## 1 1 1 1 2 2 1 1 1 1 1 1 1 1 1 2
## V81 V82 V83 V84 V85 V86 V87 V88 V89 V90 V91 V92 V93 V94 V95 V96
## 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1
## V97 V98 V99 V100
## 1 1 3 2
cat("The Silhouette width estimates the average distance between clusters. The higher the Silhouette width, the better the observations are clustered. Hence, despite k=3 is the best measure, the 2nd best score is 6 clusters. The majority rule follows this result too")
## The Silhouette width estimates the average distance between clusters. The higher the Silhouette width, the better the observations are clustered. Hence, despite k=3 is the best measure, the 2nd best score is 6 clusters. The majority rule follows this result too
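#' A minimal sketch of the average silhouette width behind fviz_nbclust(), computed
#' directly on the SOM codes (clusterdata) for k = 2..10 (no new assumptions; the
#' cluster package is already loaded):
set.seed(123)
avg_sil <- sapply(2:10, function(k) {
  km <- kmeans(clusterdata, centers = k, nstart = 25)
  mean(cluster::silhouette(km$cluster, dist(clusterdata))[, "sil_width"])
})
round(avg_sil, 3)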
fit_kmeans <- kmeans(clusterdata[,1:3], 3) # use 3 clusters, as indicated by the WSS development and the NbClust majority rule.
cl_assignmentk <- fit_kmeans$cluster[scaled_df.SOM$unit.classif]
main_df$clustersKm <- cl_assignmentk #back to original data.
cat("In the last subsection, we cluster the codes into 6 segments given k=6 is the best according to k-means. We have 100 (since grid=10*10) codes with asigned clusters, but we have 968 units. The above is to assign units to clusters based on their customer-id in the SOM model.")
## In the last subsection, we cluster the codes into 6 segments given k=6 is the best according to k-means. We have 100 (since grid=10*10) codes with asigned clusters, but we have 968 units. The above is to assign units to clusters based on their customer-id in the SOM model.
# Visualising Silhouette Plot
sil <- silhouette(fit_kmeans$cluster, dist(clusterdata))
fviz_silhouette(sil)
## cluster size ave.sil.width
## 1 1 28 0.22
## 2 2 39 0.31
## 3 3 33 0.38
# Visualising 3 clusters using original data
colors3 <- c("black", "blue","green")
colors3 <- colors3[as.numeric(main_df$clustersKm)]
pairs(main_df[,c(2,4,6)],col = colors3, pch = 16, gap = 0, main = "K means clustering - Optimal K = 3")
legend("bottom", col =c("black", "blue","green"), legend = c("1 -best","2","3 - worst"), pch = 20, xpd = NA, ncol = 3, bty = "n", inset = -0.04, lwd =1, pt.cex = 1)
p <- plotly::plot_ly(main_df, x=~recency, y=~logfreq,
z=~logmonetary, color=~clustersKm) %>%
add_markers(size=1.5)
print(p)
set.seed(123)
# Create distance matrix
DistanceM <- dist(clusterdata[,1:3], method = "euclidean") #create the distance matrix
fit_hc <- hclust(DistanceM, method="ward.D2") #fit the distances. "D2", since "DistanceM" is not squared. Ward.D2 squares dissimilarities before clustering
height <- fit_hc$height
barplot(diff(height),las = 2) # k = 3 or 5
plot(fit_hc, hang = -1) # display dendrogram; visually it suggests roughly a 3 to 5 cluster solution
abline(h = c(8,5), col = c("blue","darkgreen"), lty = c(3,3), lwd = c(1,1))
fit_hc.cut1 <- cutree(fit_hc, k=3) # cut tree into k clusters
fit_hc.cut2 <- cutree(fit_hc, k=5) # cut tree into k clusters
# draw rectangles on the dendrogram around the 3-cluster (blue) and 5-cluster (pink) solutions
rect.hclust(fit_hc, k=3, border="blue")
rect.hclust(fit_hc, k=5, border="pink")
# Silhouette method
fviz_nbclust(clusterdata, hcut, method = "silhouette")+
labs(subtitle = "Silhouette method")
# Gap Statistic
fviz_nbclust(clusterdata, hcut, nstart = 25, method = "gap_stat", nboot = 200)+
labs(subtitle = "Gap statistic method")
# Apply NbClust() to view other performance metrics
NbClust::NbClust(data = clusterdata, diss = NULL, distance = "euclidean",min.nc = 3, max.nc = 15, method = "ward.D2")
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 6 proposed 3 as the best number of clusters
## * 3 proposed 4 as the best number of clusters
## * 8 proposed 5 as the best number of clusters
## * 1 proposed 6 as the best number of clusters
## * 2 proposed 12 as the best number of clusters
## * 2 proposed 14 as the best number of clusters
## * 1 proposed 15 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 5
##
##
## *******************************************************************
## $All.index
## KL CH Hartigan CCC Scott Marriot TrCovW TraceW
## 3 0.4952 41.7362 25.4605 -5.1838 172.4496 940545.2 3831.1451 154.2077
## 4 0.5978 43.1646 23.8493 -5.1275 228.5886 953780.4 2126.6613 122.1466
## 5 4.1537 45.8952 10.3539 -3.7888 287.1949 829363.0 1156.7942 97.8402
## 6 10.9564 42.3382 10.3448 -4.2545 322.4075 839809.3 1056.4396 88.2247
## 7 0.1078 40.4537 10.2291 -4.4118 354.7475 827224.7 837.9521 79.4781
## 8 0.4432 39.5202 11.1092 -4.3817 385.0810 797756.8 732.1478 71.6025
## 9 1.6185 39.7082 8.8497 -4.0733 426.6127 666508.4 589.0595 63.8879
## 10 0.8857 39.2756 8.4408 -3.9914 462.9887 571928.4 528.3847 58.2255
## 11 0.9598 39.0683 7.9881 -3.8758 487.4729 541743.4 430.8135 53.2329
## 12 0.7300 38.9876 8.5723 -3.7496 530.4215 419611.5 385.6260 48.8486
## 13 1.0003 39.4806 8.2520 -3.4709 549.9042 405284.0 286.3830 44.5125
## 14 2.0253 40.0691 5.9583 -3.1846 569.6057 385981.3 231.8899 40.6562
## 15 1.0548 39.7429 5.5583 -3.1939 586.6607 373614.4 188.5291 38.0220
## Friedman Rubin Cindex DB Silhouette Duda Pseudot2 Beale Ratkowsky
## 3 3.4066 1.8837 0.3888 1.2586 0.2541 0.5524 34.0369 1.3476 0.3880
## 4 4.6848 2.3782 0.3747 1.2164 0.2300 0.5725 26.1357 1.2359 0.3726
## 5 6.0343 2.9690 0.4052 1.1783 0.2463 0.6189 14.7790 1.0064 0.3627
## 6 7.2144 3.2926 0.3873 1.1793 0.2360 0.6600 8.7563 0.8282 0.3399
## 7 8.4059 3.6549 0.3875 1.2461 0.2316 0.4505 19.5180 1.9546 0.3217
## 8 9.6408 4.0569 0.3729 1.1893 0.2292 0.5430 15.1511 1.3575 0.3060
## 9 11.7982 4.5468 0.3612 1.1330 0.2242 0.6387 8.4867 0.9030 0.2936
## 10 14.3159 4.9890 0.3447 1.1802 0.2279 0.3676 10.3199 2.5098 0.2819
## 11 15.4542 5.4569 0.3362 1.0988 0.2259 0.5647 10.7928 1.2249 0.2722
## 12 19.8152 5.9467 0.3210 1.0840 0.2276 0.5228 8.2158 1.3987 0.2630
## 13 20.7431 6.5260 0.3817 1.0710 0.2351 0.4296 15.9322 2.0864 0.2551
## 14 21.8800 7.1450 0.3819 1.0402 0.2288 0.5248 7.2446 1.3704 0.2475
## 15 23.0834 7.6400 0.3692 1.0683 0.2292 0.4243 8.1405 1.9798 0.2405
## Ball Ptbiserial Frey McClain Dunn Hubert SDindex Dindex SDbw
## 3 51.4026 0.4863 0.4539 1.1444 0.1324 0.0056 2.1094 1.1700 0.9459
## 4 30.5367 0.4860 0.2379 1.6683 0.1379 0.0065 1.9723 1.0455 0.7729
## 5 19.5680 0.4966 0.6556 2.2114 0.1698 0.0074 2.0487 0.9378 0.6668
## 6 14.7041 0.4704 0.1601 2.6832 0.1408 0.0075 2.2797 0.8896 0.4547
## 7 11.3540 0.4726 0.4011 2.9141 0.1500 0.0081 2.2062 0.8427 0.3213
## 8 8.9503 0.4597 0.2447 3.2328 0.1500 0.0087 2.2020 0.7915 0.3423
## 9 7.0987 0.4499 0.2207 3.6052 0.1548 0.0092 2.1501 0.7419 0.2912
## 10 5.8225 0.4398 0.1121 3.9865 0.1581 0.0095 2.2985 0.7083 0.2971
## 11 4.8394 0.4397 0.3721 4.0535 0.1581 0.0096 2.1369 0.6814 0.2502
## 12 4.0707 0.4213 0.1360 4.5497 0.1327 0.0100 2.2844 0.6496 0.2035
## 13 3.4240 0.4180 0.3793 4.7288 0.1629 0.0101 2.2691 0.6231 0.1738
## 14 2.9040 0.4004 0.2298 5.2529 0.1419 0.0105 2.2910 0.5970 0.1563
## 15 2.5348 0.3924 0.2244 5.5368 0.1419 0.0106 2.5218 0.5778 0.1473
##
## $All.CriticalValues
## CritValue_Duda CritValue_PseudoT2 Fvalue_Beale
## 3 0.4513 51.0655 0.2620
## 4 0.4208 48.1666 0.3005
## 5 0.3500 44.5622 0.3950
## 6 0.2757 44.6568 0.4845
## 7 0.2617 45.1416 0.1334
## 8 0.2887 44.3516 0.2656
## 9 0.2464 45.8680 0.4472
## 10 -0.0014 -4400.8778 0.0914
## 11 0.2298 46.9294 0.3126
## 12 0.1148 69.3995 0.2647
## 13 0.1912 50.7466 0.1192
## 14 0.0819 89.6223 0.2757
## 15 -0.0014 -4400.8778 0.1531
##
## $Best.nc
## KL CH Hartigan CCC Scott Marriot TrCovW
## Number_clusters 6.0000 5.0000 5.0000 14.0000 5.0000 5.0 4.000
## Value_Index 10.9564 45.8952 13.4954 -3.1846 58.6063 134863.6 1704.484
## TraceW Friedman Rubin Cindex DB Silhouette Duda
## Number_clusters 5.000 12.000 5.0000 12.000 14.0000 3.0000 3.0000
## Value_Index 14.691 4.361 -0.2672 0.321 1.0402 0.2541 0.5524
## PseudoT2 Beale Ratkowsky Ball PtBiserial Frey McClain
## Number_clusters 3.0000 3.0000 3.000 4.0000 5.0000 2 3.0000
## Value_Index 34.0369 1.3476 0.388 20.8659 0.4966 NA 1.1444
## Dunn Hubert SDindex Dindex SDbw
## Number_clusters 5.0000 0 4.0000 0 15.0000
## Value_Index 0.1698 0 1.9723 0 0.1473
##
## $Best.partition
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16
## 1 2 2 2 2 2 2 3 3 3 1 1 2 2 2 2
## V17 V18 V19 V20 V21 V22 V23 V24 V25 V26 V27 V28 V29 V30 V31 V32
## 3 3 3 3 1 1 2 2 2 4 4 3 3 3 1 1
## V33 V34 V35 V36 V37 V38 V39 V40 V41 V42 V43 V44 V45 V46 V47 V48
## 2 2 2 4 4 4 3 3 1 1 4 4 4 4 4 4
## V49 V50 V51 V52 V53 V54 V55 V56 V57 V58 V59 V60 V61 V62 V63 V64
## 3 3 1 1 1 1 4 4 5 5 3 3 1 1 1 4
## V65 V66 V67 V68 V69 V70 V71 V72 V73 V74 V75 V76 V77 V78 V79 V80
## 4 4 5 3 3 3 5 1 1 5 5 4 5 5 5 3
## V81 V82 V83 V84 V85 V86 V87 V88 V89 V90 V91 V92 V93 V94 V95 V96
## 5 5 5 5 5 5 5 5 5 2 5 5 5 5 5 5
## V97 V98 V99 V100
## 5 5 2 2
cat("Using both the dendogram and the height difference plot, we can see k=3 & k=5 being good measures because of two reason.\n Reason 1: Using the dendogram, there are 3 clear clusters when cutting the dendogram around 8cm while there are six clusters around 5cm.\n Reason 2: Using the barplot, we can see there is a drastic drop in height difference between the first bar and second bar, which means k=3 is a good, while there is drop in height difference between the fifth bar and sixth bar, which means k=5 is good too. We will prove this using the Dunn index. However, given that we have found k = 3 has already been covered in Kmeans, we use k = 5 instead.")
## Using both the dendogram and the height difference plot, we can see k=3 & k=5 being good measures because of two reason.
## Reason 1: Using the dendogram, there are 3 clear clusters when cutting the dendogram around 8cm while there are six clusters around 5cm.
## Reason 2: Using the barplot, we can see there is a drastic drop in height difference between the first bar and second bar, which means k=3 is a good, while there is drop in height difference between the fifth bar and sixth bar, which means k=5 is good too. We will prove this using the Dunn index. However, given that we have found k = 3 has already been covered in Kmeans, we use k = 5 instead.
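#' A sketch of the Dunn index comparison referred to above (assumption: the clValid
#' package is installed; higher Dunn values indicate better-separated clusters):
library(clValid)
c(k3 = dunn(distance = as.matrix(DistanceM), clusters = fit_hc.cut1),
  k5 = dunn(distance = as.matrix(DistanceM), clusters = fit_hc.cut2))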
# Visualise dendrogram in another format + apply silhouette method
hc.cut <- hcut(clusterdata[,1:3], k = 5 ,method="ward.D2")
fviz_dend(hc.cut, show_labels = FALSE, rect = TRUE)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
fviz_silhouette(hc.cut)
## cluster size ave.sil.width
## 1 1 18 0.33
## 2 2 19 0.13
## 3 3 20 0.37
## 4 4 17 0.18
## 5 5 26 0.22
# Bring that back to the original data
cl_assignmenth <- fit_hc.cut2[scaled_df.SOM$unit.classif] # Retrieve the clustered assignments for each sample
main_df$clustersHac <- cl_assignmenth #back to original data.
cat("Again, we have 100 (since grid=10*10) codes with assigned clusters,but we have 968 units. The above code assigns units to clusters based on their class-id (code-id) in the SOM model.")
## Again, we have 100 (since grid=10*10) codes with assigned clusters,but we have 968 units. The above code assigns units to clusters based on their class-id (code-id) in the SOM model.
# Visualise plots
colors5 <- c("black", "purple", "blue","cyan","green")
colors5 <- colors5[as.numeric(main_df$clustersHac)]
pairs(main_df[,c(2,4,6)],col = colors5,pch = 16, gap = 0, main = "HAC clustering - Optimal K = 5")
legend("bottom", col = c("black", "purple", "blue","cyan","green"), legend = c("1 -best","2","3","4","5 - worst"), lwd =1, pch = 20, xpd = NA, ncol = 5, bty = "n", inset = -0.04, pt.cex = 0.5)
p <- plotly::plot_ly(main_df, x=~recency, y=~logfreq,
z=~logmonetary, color=~clustersHac) %>%
add_markers(size=1.5)
print(p)
rownames1 <- rep("K-means",7)
Metrics1 <- c("Elbow","SIL","Gap Stat","Dunn","Davies-Bouldin","NBClust() Majority rule","Comments")
Results1 <- c(3,3,3,15,12,3,"Despite k=3 being best, k=6 was 2nd best in NBClust which gives a better cluster interpretation for the customer segmentation model")
table1 <- data.frame(Model = rownames1, Metrics = Metrics1 ,Results = Results1)
rownames2 <- rep("HAC",7)
Metrics2 <- c("Dend","SIL","Gap Stat","Dunn","Davies-Bouldin", "NBClust() Majority rule","Comments")
Results2 <- c(5,3,5,5,14,5,"k = 5 was best, but k=3 is still strong")
table2 <- data.frame(Model = rownames2, Metrics = Metrics2 ,Results = Results2)
write_xlsx(table1, "KMeans")
write_xlsx(table2, "HAC")
cat("We use k =3 & k = 5, given that is the best in HAC. Despite hacing relatively poorer performance in kmeans, this is not significant enough. Plus, for simplicitiy, we will compare 2 models k =3 and k = 5")
## We use k =3 & k = 5, given that is the best in HAC. Despite hacing relatively poorer performance in kmeans, this is not significant enough. Plus, for simplicitiy, we will compare 2 models k =3 and k = 5
main_df$trueN_k5 <- ifelse(main_df$Segment6 == "1 (Best)", 1, ifelse(main_df$Segment6 == "2", 2, ifelse(main_df$Segment6 == "3", 3, ifelse(main_df$Segment6 == "4", 4, ifelse(main_df$Segment6 == "5", 5, ifelse(main_df$Segment6 == "6 (Worst)", 5,NA)))))) # segments 5 and "6 (Worst)" are both mapped to class 5 so the six RFM segments align with the five HAC clusters
main_df$trueN_k3 <- ifelse(main_df$Segment3 == "1 (Best)", 1, ifelse(main_df$Segment3 == "2", 2, ifelse(main_df$Segment3 == "3 (Worst)", 3, NA)))
kmeancolor <- main_df[,14] #set color indicator
haccolor <- main_df[,15] #set color indicator
trueccolor5 <- main_df[,16] #set color indicator
trueccolor3 <- main_df[,17] #set color indicator
# Set colour palettes
color_grid <- tricolor(scaled_df.SOM$grid, phi = c(pi/8, 6, -pi/6), offset = 0.1)
colors5 <- c("black", "darkviolet", "blue","cyan","seagreen")
colors5 <- colors5[as.numeric(main_df$clustersHac)]
colors3 <- c("black", "blue","seagreen")
colors3 <- colors3[as.numeric(main_df$clustersKm)]
contrast <- c("#FA4925", "#22693E", "#D4D40F", "#2C4382", "#F0F0F0", "#3D3D3D")
# Do a Mapping Plot
par(mfrow=c(1,1))
plot_KM <- plot(scaled_df.SOM, type="mapping", bg = rgb(color_grid), shape = "straight", border = "grey", labels = unique(main_df$Segment3),col=colors3, pch = 30, main = "Kmeans mapping plot")
add.cluster.boundaries(scaled_df.SOM, fit_kmeans$cluster, lwd = 3, lty = 2, col=contrast[4])
legend("right", col = c("black", "blue","seagreen"), legend = c("1 -best","2","3 - worst"), pch = 20, xpd = NA, ncol = 1, bty = "n", inset = 0.01, pt.cex = 1.5)
# Do a K means Mapping Plot for 3 segments
par(mfrow=c(1,2))
plot(scaled_df.SOM, type="mapping", bg = rgb(color_grid), shape = "straight", border = "grey", labels = unique(main_df$Segment3),col=contrast[kmeancolor], pch = 30, main = "Kmean (K=5), mapping, col=cluster, no=trueSp")
add.cluster.boundaries(scaled_df.SOM, fit_kmeans$cluster, lwd = 3, lty = 2, col=contrast[4])
legend("bottom", col = c("black", "blue","seagreen"), legend = c("1 -best","2","3 - worst"), pch = 20, xpd = NA, ncol = 3, bty = "n", inset = -0.04, pt.cex = 1.5)
plot(scaled_df.SOM, type="mapping", bg = rgb(color_grid), shape = "straight", border = "grey", labels = unique(main_df$Segment3),col=contrast[kmeancolor], pch = 30, main = "Kmean (K=5), mapping, col=true, no=cluster")
add.cluster.boundaries(scaled_df.SOM, fit_kmeans$cluster, lwd = 3, lty = 2, col=contrast[4])
legend("bottom", col = c("black", "blue","seagreen"), legend = c("1 -best","2","3 - worst"), pch = 20, xpd = NA, ncol = 3, bty = "n", inset = -0.04, pt.cex = 1.5)
# Test the accuracy of the k-means solution. The cluster numbering may not match the segment ordering, but that does not mean the clustering itself is off. Inspect the dataset with cluster assignments.
main_df$Segment3 <- main_df$Segment3 %>% as.factor()
summary(main_df$Segment3) # and with guidance from the plots above:
## 1 (Best) 2 3 (Worst)
## 359 230 383
length(which(main_df$clustersKm == 3)) # Under by 29
## [1] 330
length(which(main_df$clustersKm == 1)) # Under by 24
## [1] 206
length(which(main_df$clustersKm == 2)) # Over by 53
## [1] 436
table1 <- data.frame(clusterKM_3 = c(length(which(main_df$clustersKm == 3)),length(which(main_df$clustersKm == 1)),length(which(main_df$clustersKm == 2))))
table1
## clusterKM_3
## 1 330
## 2 206
## 3 436
#Then:
main_df$testkm <- 1 # start by marking every row as "wrong" (i.e. 1).
# A cluster assignment can only be right in one way, but wrong in (k - 1) ways,
# so it is easier to look for the "rights" rather than the "wrongs".
main_df$testkm[main_df$clustersKm == 3 & main_df$trueN_k3 == 1] <- 0 # cluster 3 corresponds to true class 1, so mark as right (0).
main_df$testkm[main_df$clustersKm == 1 & main_df$trueN_k3 == 2] <- 0 # cluster 1 corresponds to true class 2.
main_df$testkm[main_df$clustersKm == 2 & main_df$trueN_k3 == 3] <- 0 # cluster 2 corresponds to true class 3.
table2 <- data.frame(truecluster_3 = c(length(which(main_df$trueN_k3 == 1)),length(which(main_df$trueN_k3 == 2)),length(which(main_df$trueN_k3 == 3))))
Residual <- table2 - table1
testKM <- sum(main_df$testkm)
testKM
## [1] 266
Accuracy_Score<- 1-(testKM)/972
Accuracy_Score # how accurate is the km model
## [1] 0.7263374
KM3_df <- data.frame(summary(main_df$Segment3),table1,table2, Residual, Accuracy_Score)
colnames(KM3_df)
## [1] "summary.main_df.Segment3." "clusterKM_3"
## [3] "truecluster_3" "truecluster_3.1"
## [5] "Accuracy_Score"
KM3_df <- KM3_df %>% rename(Residual = truecluster_3.1)
write.csv(KM3_df,"Table 1.4")
par(mfrow=c(1,1))
plot_HAC <- plot(scaled_df.SOM, type="mapping", bg = rgb(color_grid), shape = "straight", border = "grey", labels = unique(main_df$Segment6),col=colors5, pch = 30, main = "HAC mapping plot")
add.cluster.boundaries(scaled_df.SOM, fit_hc.cut2, lwd = 3, lty = 2, col=contrast[4])
legend("right", col = c("black", "darkviolet", "blue","cyan","seagreen"), legend = c("1 -best","2","3","4","5 - worst"), pch = 20, xpd = NA, ncol = 1, bty = "n", inset = 0.01, pt.cex = 1.5)
par(mfrow=c(1,2))
plot(scaled_df.SOM, type="mapping", bg = rgb(color_grid), shape = "straight", border = "grey", labels =unique(main_df$Segment6),col=contrast[haccolor], main = "HAC, mapping, col=cluster, no=trueSp")
add.cluster.boundaries(scaled_df.SOM, fit_hc.cut2, lwd = 3, lty = 2, col=contrast[4]) #name your pallette and choose color...
legend("bottom", col = c("black", "darkviolet", "blue","cyan","seagreen"), legend = c("1 -best","2","3","4","5 - worst"), pch = 20, xpd = NA, ncol = 1, bty = "n", inset = 0.01, pt.cex = 1.5)
plot(scaled_df.SOM, type="mapping", bg = rgb(color_grid), shape = "straight", border = "grey", labels = unique(main_df$Segment6),col=contrast[trueccolor5], main = "HAC, mapping, col=true, no=cluster")
add.cluster.boundaries(scaled_df.SOM, fit_hc.cut2, lwd = 3, lty = 2, col=contrast[4]) #name your pallette and choose color...
legend("bottom", col = c("black", "darkviolet", "blue","cyan","seagreen"), legend = c("1 -best","2","3","4","5 - worst"), pch = 20, xpd = NA, ncol = 1, bty = "n", inset = 0.01, pt.cex = 1.5)
# Test the accuracy of the HAC solution. The cluster numbering may not match the segment ordering, but that does not mean the clustering itself is off. Inspect the dataset with cluster assignments.
main_df$Segment6 <- main_df$Segment6 %>% as.factor()
summary(main_df$Segment6) # and with guidance from the plots above:
## 1 (Best) 2 3 4 5 6 (Worst)
## 166 193 130 100 172 211
length(which(main_df$clustersHac == 1)) # Over by 8
## [1] 174
length(which(main_df$clustersHac == 5)) # Over by 42
## [1] 235
length(which(main_df$clustersHac == 4)) # Over by 19
## [1] 149
length(which(main_df$clustersHac == 2)) # Over by 53
## [1] 153
length(which(main_df$clustersHac == 3)) # Under by 122
## [1] 261
table3 <- data.frame(clusterHac = c(length(which(main_df$clustersHac == 1)),length(which(main_df$clustersHac == 5)),length(which(main_df$clustersHac == 4)),length(which(main_df$clustersHac == 2)),length(which(main_df$clustersHac == 3))))
#Then:
main_df$testhc <- 1 # start by marking every row as "wrong" (i.e. 1).
# A cluster assignment can only be right in one way, but wrong in (k - 1) ways,
# so it is easier to look for the "rights" rather than the "wrongs".
main_df$testhc[main_df$clustersHac == 1 & main_df$trueN_k5 == 1] <- 0 # cluster 1 corresponds to true class 1, so mark as right (0).
main_df$testhc[main_df$clustersHac == 5 & main_df$trueN_k5 == 2] <- 0 # cluster 5 corresponds to true class 2.
main_df$testhc[main_df$clustersHac == 4 & main_df$trueN_k5 == 3] <- 0 # cluster 4 corresponds to true class 3.
main_df$testhc[main_df$clustersHac == 2 & main_df$trueN_k5 == 4] <- 0 # cluster 2 corresponds to true class 4.
main_df$testhc[main_df$clustersHac == 3 & main_df$trueN_k5 == 5] <- 0 # cluster 3 corresponds to true class 5.
table4 <- data.frame(truecluster_5 = c(length(which(main_df$trueN_k5 == 1)),length(which(main_df$trueN_k5 == 2)),length(which(main_df$trueN_k5 == 3)),length(which(main_df$trueN_k5 == 4)),length(which(main_df$trueN_k5 == 5))))
Residual <- table4 - table3
testHC <- sum(main_df$testhc)
testHC
## [1] 354
Accuracy_Score <- 1-(testHC)/972
Accuracy_Score # how accurate is the HAC model
## [1] 0.6358025
HAC5_df <- data.frame(k = c("1 -best","2","3","4","5 - worst"),table3,table4, Residual, Accuracy_Score)
colnames(HAC5_df)
## [1] "k" "clusterHac" "truecluster_5" "truecluster_5.1"
## [5] "Accuracy_Score"
HAC5_df <- HAC5_df %>% rename(Residual = truecluster_5.1)
HAC5_df
## k clusterHac truecluster_5 Residual Accuracy_Score
## 1 1 -best 174 166 -8 0.6358025
## 2 2 235 193 -42 0.6358025
## 3 3 149 130 -19 0.6358025
## 4 4 153 100 -53 0.6358025
## 5 5 - worst 261 383 122 0.6358025
write.csv(HAC5_df,"Table 1.5")
cat("After testing how well the model fitted the original dataset, we evaluate k=3 has a higher accuracy then k=5, so we use k=3. However, in k =5, the best customers (cluster number 1, is fitted well at 95%.")
## After testing how well the model fitted the original dataset, we evaluate k=3 has a higher accuracy then k=5, so we use k=3. However, in k =5, the best customers (cluster number 1, is fitted well at 95%.
str(scaled_df)
## 'data.frame': 972 obs. of 11 variables:
## $ customer_id : int 80 90 130 190 220 290 330 400 410 480 ...
## $ recency : num -1.207 -0.888 0.815 0.524 1.349 ...
## $ logfreq : num 1.0144 1.0144 -1.1626 -0.0741 -1.1626 ...
## $ logmonetary : num 0.893 1.678 0.554 0.675 -0.499 ...
## $ BCPower_monetary: num 0.196 0.161 0.213 0.206 0.277 ...
## $ R_score : num 4 3 2 2 1 2 2 1 3 4 ...
## $ F_score : num 4 4 1 2 1 2 3 1 4 3 ...
## $ M_score : num 4 4 4 4 2 2 2 1 2 3 ...
## $ score : num 444 344 214 224 112 222 232 111 342 433 ...
## $ Segment3 : chr "1 (Best)" "1 (Best)" "2" "2" ...
## $ Segment6 : chr "1 (Best)" "1 (Best)" "4" "4" ...
## - attr(*, "na.action")= 'omit' Named int [1:28] 21 33 78 79 80 81 82 83 84 85 ...
## ..- attr(*, "names")= chr [1:28] "21" "33" "78" "79" ...
y_km <- main_df["clustersKm"]
y_hc <- main_df["clustersHac"]
exp = scaled_df[,c(2,3,4)] %>% add_column(y_km,.after = "logmonetary")
exp = exp %>% add_column(y_hc,.after = "clustersKm")
str(exp)
## 'data.frame': 972 obs. of 5 variables:
## $ recency : num -1.207 -0.888 0.815 0.524 1.349 ...
## $ logfreq : num 1.0144 1.0144 -1.1626 -0.0741 -1.1626 ...
## $ logmonetary: num 0.893 1.678 0.554 0.675 -0.499 ...
## $ clustersKm : int 3 1 2 2 2 2 3 2 3 3 ...
## $ clustersHac: int 1 1 3 2 3 3 5 3 5 1 ...
set.seed(123)
validation_index = createDataPartition(exp$clustersKm, p = 0.8, list=FALSE)
training = exp[validation_index,]
testing = exp[-validation_index,]
# Predicting KM
library(rpart)
##
## Attaching package: 'rpart'
## The following object is masked from 'package:dendextend':
##
## prune
library(rpart.plot)
fit_km <- rpart(clustersKm ~ recency + logfreq + logmonetary, data = training, method = 'class')
rpart.plot(fit_km, extra = 104)
predict_unseen1 <-predict(fit_km, testing, type = 'class')
table_mat <- table(testing$clustersKm, predict_unseen1)
write.csv(table_mat,"Prediction Model Kmeans")
accuracy_Test1 <- sum(diag(table_mat)) / sum(table_mat)
print(paste('Accuracy for test', accuracy_Test1))
## [1] "Accuracy for test 0.917525773195876"
# Predicting HC
fit_hc <- rpart(clustersHac ~ recency + logfreq + logmonetary, data = training, method = 'class')
rpart.plot(fit_hc, extra = 104)
predict_unseen2 <-predict(fit_hc, testing, type = 'class')
table_matrix <- table(testing$clustersHac, predict_unseen2)
write.csv(table_matrix,"Prediction Model HAC")
accuracy_Test2 <- sum(diag(table_matrix)) / sum(table_matrix)
print(paste('Accuracy for test', accuracy_Test2))
## [1] "Accuracy for test 0.912371134020619"