knitr::opts_chunk$set(echo = TRUE)
#' Clean slate
rm(list = ls())
#' Get working directory
getwd()
## [1] "/Users/rexlei/Documents/GitHub/BDM-Assignment-Group-4/main./BDM Group 4"
#' Import Libraries
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(writexl)
library(summarytools)
library(Hmisc)
## Loading required package: survival
##
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
##
## cluster
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:summarytools':
##
## label, label<-
## The following objects are masked from 'package:base':
##
## format.pval, units
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:Hmisc':
##
## src, summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(magrittr)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ tibble 3.1.6 ✔ purrr 0.3.4
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::extract() masks magrittr::extract()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ✖ purrr::set_names() masks magrittr::set_names()
## ✖ dplyr::src() masks Hmisc::src()
## ✖ dplyr::summarize() masks Hmisc::summarize()
## ✖ tibble::view() masks summarytools::view()
library(data.table)
##
## Attaching package: 'data.table'
## The following object is masked from 'package:purrr':
##
## transpose
## The following objects are masked from 'package:dplyr':
##
## between, first, last
library(mclust)
## Package 'mclust' version 5.4.9
## Type 'citation("mclust")' for citing this R package in publications.
##
## Attaching package: 'mclust'
## The following object is masked from 'package:purrr':
##
## map
library(scatterplot3d)
library(lattice)
library(kohonen)
##
## Attaching package: 'kohonen'
## The following object is masked from 'package:mclust':
##
## map
## The following object is masked from 'package:purrr':
##
## map
library(ggpubr)
library(corrplot)
## corrplot 0.92 loaded
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:purrr':
##
## some
## The following object is masked from 'package:dplyr':
##
## recode
library(paletteer)
library(cluster)
library(purrr)
library(dendextend)
##
## ---------------------
## Welcome to dendextend version 1.15.2
## Type citation('dendextend') for how to cite the package.
##
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
##
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## You may ask questions at stackoverflow, use the r and dendextend tags:
## https://stackoverflow.com/questions/tagged/dendextend
##
## To suppress this message use: suppressPackageStartupMessages(library(dendextend))
## ---------------------
##
## Attaching package: 'dendextend'
## The following object is masked from 'package:ggpubr':
##
## rotate
## The following object is masked from 'package:data.table':
##
## set
## The following object is masked from 'package:stats':
##
## cutree
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(hopkins)
library(plotrix)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:Hmisc':
##
## subplot
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(xlsx)
#' Open text file
df <- as.data.frame(read.table("Group 4.txt",sep = "\t", header = TRUE, row.names = NULL))
#' Show column names and check variable dtype
names(df)
## [1] "customer_id" "Last.purchased" "frequency" "amount"
var_dtype <- str((df)[1:4])
## 'data.frame': 1000 obs. of 4 variables:
## $ customer_id : int 80 90 130 190 220 290 330 400 410 480 ...
## $ Last.purchased: chr "343" "758" "2970" "2592" ...
## $ frequency : int 4 4 1 2 1 2 3 1 4 3 ...
## $ amount : num 75 126 60 65 30 ...
#' Check for duplicates in the dataset within "customer_id"
table(duplicated(df["customer_id"]))
##
## FALSE
## 1000
#' Check its dimension
dim(df)
## [1] 1000 4
#' Check for missing values
table(is.na(df))
##
## FALSE TRUE
## 3972 28
#' Count missing values for each attribute
na_count <- sapply(df,function(y) sum(length(which(is.na(y)))))
na_count
## customer_id Last.purchased frequency amount
## 0 2 9 17
#' Remove any rows with missing values
non_NAN_df <- as.data.frame(na.omit(df))
sapply(non_NAN_df,function(y) sum(length(which(is.na(y)))))
## customer_id Last.purchased frequency amount
## 0 0 0 0
#' Check var_dtype
var_dtype <- str((non_NAN_df)[1:4])
## 'data.frame': 972 obs. of 4 variables:
## $ customer_id : int 80 90 130 190 220 290 330 400 410 480 ...
## $ Last.purchased: chr "343" "758" "2970" "2592" ...
## $ frequency : int 4 4 1 2 1 2 3 1 4 3 ...
## $ amount : num 75 126 60 65 30 ...
#' Convert "recency" variable to numeric
non_NAN_df[,2] <- as.numeric(non_NAN_df[,2])
## Warning: NAs introduced by coercion
sapply(non_NAN_df,
function(y)sum(length(which(is.na(y))))) # Check for new missing values introduced by the coercion
## customer_id Last.purchased frequency amount
## 0 4 0 0
which(non_NAN_df[,2] %in% NA) # Which rows contain the new missing values
## [1] 75 114 115 148
#' Manually input missing values from original df
non_NAN_df[75,2] <- 2969
non_NAN_df[114,2] <- 2192
non_NAN_df[115,2] <- 379
non_NAN_df[148,2] <- 26
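#' A hedged alternative to typing the values in by hand (assumption: the four coercion
#' failures were caused by stray non-numeric characters such as thousands separators or
#' spaces in the original strings): strip everything except digits and the decimal point
#' before coercing, and check that no NAs remain.
clean_recency <- as.numeric(gsub("[^0-9.]", "", df[rownames(non_NAN_df), "Last.purchased"]))
sum(is.na(clean_recency)) # 0 if the assumption holds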
#' Check for missing values if any
sapply(non_NAN_df,
function(y)sum(length(which(is.na(y)))))
## customer_id Last.purchased frequency amount
## 0 0 0 0
#' Check var_dtype
var_dtype <- str((non_NAN_df)[1:4])
## 'data.frame': 972 obs. of 4 variables:
## $ customer_id : int 80 90 130 190 220 290 330 400 410 480 ...
## $ Last.purchased: num 343 758 2970 2592 3664 ...
## $ frequency : int 4 4 1 2 1 2 3 1 4 3 ...
## $ amount : num 75 126 60 65 30 ...
#' Name variables
cus <- non_NAN_df[,1]
rec <- non_NAN_df[,2]
freq <- non_NAN_df[,3]
mon <- non_NAN_df[,4]
#' View Descriptive Statistics
non_NAN_df <- rename(non_NAN_df, recency = Last.purchased, monetary = amount)
descriptive_stats<- descr(non_NAN_df, style = "rmarkdown")
dfSummary(non_NAN_df)
## Data Frame Summary
## non_NAN_df
## Dimensions: 972 x 4
## Duplicates: 0
##
## ------------------------------------------------------------------------------------------------------------------
## No Variable Stats / Values Freqs (% of Valid) Graph Valid Missing
## ---- ------------- ------------------------------ --------------------- --------------------- ---------- ---------
## 1 customer_id Mean (sd) : 12408.3 (5738.6) 972 distinct values : : . . 972 0
## [integer] min < med < max: . : : : : (100.0%) (0.0%)
## 80 < 12595 < 27180 : : : : : .
## IQR (CV) : 8277.5 (0.5) : : : : : : : :
## : : : : : : : : . .
##
## 2 recency Mean (sd) : 1911.3 (1299.2) 529 distinct values : 972 0
## [numeric] min < med < max: : . : (100.0%) (0.0%)
## 1 < 1836.5 < 4014 : . : . . :
## IQR (CV) : 2402.5 (0.7) : : : : : : : :
## : : : : : : : :
##
## 3 frequency Mean (sd) : 2.6 (1.9) 14 distinct values : 972 0
## [integer] min < med < max: : (100.0%) (0.0%)
## 1 < 2 < 18 : .
## IQR (CV) : 3 (0.7) : :
## : : :
##
## 4 monetary Mean (sd) : 69.1 (166.8) 203 distinct values : 972 0
## [numeric] min < med < max: : (100.0%) (0.0%)
## 7.2 < 39.5 < 3100 :
## IQR (CV) : 31.5 (2.4) :
## :
## ------------------------------------------------------------------------------------------------------------------
write.csv(descriptive_stats,"Table 0.0")
# Copy the cleaned data frame "non_NAN_df" into a new working data frame "new_df" for the RFM scoring
new_df <- copy(non_NAN_df)
#' Utilised approach from https://medium.com/analytics-vidhya/customer-segmentation-using-rfm-analysis-in-r-cd8ba4e6891 to classify variables
summary(new_df)
## customer_id recency frequency monetary
## Min. : 80 Min. : 1 Min. : 1.000 Min. : 7.25
## 1st Qu.: 8335 1st Qu.: 731 1st Qu.: 1.000 1st Qu.: 28.51
## Median :12595 Median :1836 Median : 2.000 Median : 39.50
## Mean :12408 Mean :1911 Mean : 2.647 Mean : 69.07
## 3rd Qu.:16612 3rd Qu.:3134 3rd Qu.: 4.000 3rd Qu.: 60.00
## Max. :27180 Max. :4014 Max. :18.000 Max. :3100.00
new_df$R_score <- 0
new_df$F_score <- 0
new_df$M_score <- 0
new_df$score <- 0
new_df$Segment3<- "0"
new_df$Segment6<- "0"
quantile(new_df$recency, probs = seq(0, 1, 1/4))
## 0% 25% 50% 75% 100%
## 1.0 731.0 1836.5 3133.5 4014.0
quantile(new_df$frequency, probs = seq(0, 1, 1/5)) # Since frequency takes whole-number values, we split its percentiles into quintiles (five groups)
## 0% 20% 40% 60% 80% 100%
## 1 1 2 3 4 18
quantile(new_df$monetary, probs = seq(0, 1, 1/4))
## 0% 25% 50% 75% 100%
## 7.2500 28.5119 39.5000 60.0000 3100.0000
# Classify the variables into quantile-based scores
#' R scored
new_df$R_score[new_df$recency >= 3133.5]<- 1
new_df$R_score[new_df$recency >= 1836.5 & new_df$recency <3133.5] <- 2
new_df$R_score[new_df$recency >= 731 & new_df$recency <1836.5] <- 3
new_df$R_score[new_df$recency < 731] <- 4
#' F scored
new_df$F_score[new_df$frequency >=4] <- 4
new_df$F_score[new_df$frequency == 3] <- 3
new_df$F_score[new_df$frequency == 2] <- 2
new_df$F_score[new_df$frequency == 1] <- 1
#' M scored
new_df$M_score[new_df$monetary >= 60] <- 4
new_df$M_score[new_df$monetary < 60.00 & new_df$monetary >= 39.5] <- 3
new_df$M_score[new_df$monetary < 39.50 & new_df$monetary >= 28.5119] <- 2
new_df$M_score[new_df$monetary <28.5119] <- 1
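#' For reference, a more compact scoring sketch using findInterval() (assumption: the same
#' quartile breakpoints as the manual rules above; recency is scored in reverse so that the
#' most recent customers receive the highest R score).
r_breaks <- quantile(new_df$recency,  probs = c(0.25, 0.50, 0.75))
m_breaks <- quantile(new_df$monetary, probs = c(0.25, 0.50, 0.75))
R_alt <- 4L - findInterval(new_df$recency,  r_breaks)  # 4 = most recent quartile
M_alt <- 1L + findInterval(new_df$monetary, m_breaks)  # 4 = highest-spending quartile
table(R_alt == new_df$R_score)                         # sanity check against the manual scoring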
#' Combine each of the scores together
new_df <- new_df %>% mutate(score = 100 *R_score +10 * F_score + M_score)
new_df$Segment6 <- 0
new_df$Segment6[which(new_df$score %in% c(444,434,443, 344, 442, 244, 424, 441))] <-"1 (Best)"
new_df$Segment6[which(new_df$score %in% c(332,333,342, 343, 334, 412,413,414,431,432,441,421,422,423, 424, 433))] <- "2"
new_df$Segment6[which(new_df$score %in% c(233,234, 241,311, 312, 313,314,321,322,323,324, 331, 341))] <- "3"
new_df$Segment6[which(new_df$score %in% c(124, 133, 134, 142, 143, 144, 214,224,234, 242, 243, 232 ))] <- "4"
new_df$Segment6[which(new_df$score %in% c(122, 123,131 ,132, 141, 212, 213, 221, 222, 223, 231,411,114))] <- "5"
new_df$Segment6[which(new_df$score %in% c(111, 112, 113, 121, 131, 211, 311))] <-"6 (Worst)"
new_df$Segment3 <- 0
new_df$Segment3[which(new_df$score %in% c(444,434,443, 344, 442, 244, 424, 441,332,333,342, 343, 334, 412,413,414,431,432,441,421,422,423, 424, 433))] <-"1 (Best)"
new_df$Segment3[which(new_df$score %in% c(233,234, 241,311, 312, 313,314,321,322,323,324, 331, 341, 124, 133, 134, 142, 143, 144, 214,224,234, 242, 243, 232))] <- "2"
new_df$Segment3[which(new_df$score %in% c(122, 123,131 ,132, 141, 212, 213, 221, 222, 223, 231,411,114,111, 112, 113, 121, 131, 211, 311))] <- "3 (Worst)"
##
## 1 (Best) 2 3 4 5 6 (Worst)
## 166 193 130 100 172 211
##
## 1 (Best) 2 3 (Worst)
## 359 230 383
## [1] 61
## [1] 52
## [1] 166
## [1] 359
## The best customers are the ones on the far right of the score distribution, encoded 444 (52 customers). We can also see a pattern here:
## 1. The best class contains 166 customers under the 6-segment scheme and 359 under the 3-segment scheme
## 2. There are more customers at the extreme ends (111 & 444) than towards the middle
## 3. The RFM encoding with 4 levels produces 61 distinct segments
## Non-numerical variable(s) ignored: Segment3, Segment6
## ### Descriptive Statistics
## #### new_df
## **N:** 972
##
## | | customer_id | F_score | frequency | M_score | monetary | R_score |
## |----------------:|------------:|--------:|----------:|--------:|---------:|--------:|
## | **Mean** | 12408.25 | 2.33 | 2.65 | 2.53 | 69.07 | 2.49 |
## | **Std.Dev** | 5738.64 | 1.21 | 1.90 | 1.14 | 166.84 | 1.11 |
## | **Min** | 80.00 | 1.00 | 1.00 | 1.00 | 7.25 | 1.00 |
## | **Q1** | 8330.00 | 1.00 | 1.00 | 1.50 | 28.45 | 1.50 |
## | **Median** | 12595.00 | 2.00 | 2.00 | 2.50 | 39.50 | 2.50 |
## | **Q3** | 16615.00 | 4.00 | 4.00 | 4.00 | 60.00 | 3.00 |
## | **Max** | 27180.00 | 4.00 | 18.00 | 4.00 | 3100.00 | 4.00 |
## | **MAD** | 6078.66 | 1.48 | 1.48 | 2.22 | 21.50 | 0.74 |
## | **IQR** | 8277.50 | 3.00 | 3.00 | 2.25 | 31.49 | 1.25 |
## | **CV** | 0.46 | 0.52 | 0.72 | 0.45 | 2.42 | 0.45 |
## | **Skewness** | -0.02 | 0.23 | 1.94 | -0.01 | 11.94 | 0.00 |
## | **SE.Skewness** | 0.08 | 0.08 | 0.08 | 0.08 | 0.08 | 0.08 |
## | **Kurtosis** | -0.48 | -1.52 | 7.40 | -1.42 | 176.74 | -1.35 |
## | **N.Valid** | 972.00 | 972.00 | 972.00 | 972.00 | 972.00 | 972.00 |
## | **Pct.Valid** | 100.00 | 100.00 | 100.00 | 100.00 | 100.00 | 100.00 |
##
## Table: Table continues below
##
##
##
## | | recency | score |
## |----------------:|--------:|-------:|
## | **Mean** | 1911.31 | 275.36 |
## | **Std.Dev** | 1299.21 | 120.02 |
## | **Min** | 1.00 | 111.00 |
## | **Q1** | 731.00 | 177.00 |
## | **Median** | 1836.50 | 277.50 |
## | **Q3** | 3137.00 | 344.00 |
## | **Max** | 4014.00 | 444.00 |
## | **MAD** | 1769.48 | 98.59 |
## | **IQR** | 2402.50 | 150.00 |
## | **CV** | 0.68 | 0.44 |
## | **Skewness** | 0.09 | -0.01 |
## | **SE.Skewness** | 0.08 | 0.08 |
## | **Kurtosis** | -1.37 | -1.35 |
## | **N.Valid** | 972.00 | 972.00 |
## | **Pct.Valid** | 100.00 | 100.00 |
## Data Frame Summary
## new_df
## Dimensions: 972 x 10
## Duplicates: 0
##
## ------------------------------------------------------------------------------------------------------------------
## No Variable Stats / Values Freqs (% of Valid) Graph Valid Missing
## ---- ------------- ------------------------------ --------------------- --------------------- ---------- ---------
## 1 customer_id Mean (sd) : 12408.3 (5738.6) 972 distinct values : : . . 972 0
## [integer] min < med < max: . : : : : (100.0%) (0.0%)
## 80 < 12595 < 27180 : : : : : .
## IQR (CV) : 8277.5 (0.5) : : : : : : : :
## : : : : : : : : . .
##
## 2 recency Mean (sd) : 1911.3 (1299.2) 529 distinct values : 972 0
## [numeric] min < med < max: : . : (100.0%) (0.0%)
## 1 < 1836.5 < 4014 : . : . . :
## IQR (CV) : 2402.5 (0.7) : : : : : : : :
## : : : : : : : :
##
## 3 frequency Mean (sd) : 2.6 (1.9) 14 distinct values : 972 0
## [integer] min < med < max: : (100.0%) (0.0%)
## 1 < 2 < 18 : .
## IQR (CV) : 3 (0.7) : :
## : : :
##
## 4 monetary Mean (sd) : 69.1 (166.8) 203 distinct values : 972 0
## [numeric] min < med < max: : (100.0%) (0.0%)
## 7.2 < 39.5 < 3100 :
## IQR (CV) : 31.5 (2.4) :
## :
##
## 5 R_score Mean (sd) : 2.5 (1.1) 1 : 243 (25.0%) IIIII 972 0
## [numeric] min < med < max: 2 : 243 (25.0%) IIIII (100.0%) (0.0%)
## 1 < 2.5 < 4 3 : 248 (25.5%) IIIII
## IQR (CV) : 1.2 (0.4) 4 : 238 (24.5%) IIII
##
## 6 F_score Mean (sd) : 2.3 (1.2) 1 : 350 (36.0%) IIIIIII 972 0
## [numeric] min < med < max: 2 : 204 (21.0%) IIII (100.0%) (0.0%)
## 1 < 2 < 4 3 : 161 (16.6%) III
## IQR (CV) : 3 (0.5) 4 : 257 (26.4%) IIIII
##
## 7 M_score Mean (sd) : 2.5 (1.1) 1 : 243 (25.0%) IIIII 972 0
## [numeric] min < med < max: 2 : 243 (25.0%) IIIII (100.0%) (0.0%)
## 1 < 2.5 < 4 3 : 216 (22.2%) IIII
## IQR (CV) : 2.2 (0.5) 4 : 270 (27.8%) IIIII
##
## 8 score Mean (sd) : 275.4 (120) 61 distinct values : : : : 972 0
## [numeric] min < med < max: : : : : (100.0%) (0.0%)
## 111 < 277.5 < 444 : : : :
## IQR (CV) : 150 (0.4) : : : :
## : : : :
##
## 9 Segment3 1. 1 (Best) 359 (36.9%) IIIIIII 972 0
## [character] 2. 2 230 (23.7%) IIII (100.0%) (0.0%)
## 3. 3 (Worst) 383 (39.4%) IIIIIII
##
## 10 Segment6 1. 1 (Best) 166 (17.1%) III 972 0
## [character] 2. 2 193 (19.9%) III (100.0%) (0.0%)
## 3. 3 130 (13.4%) II
## 4. 4 100 (10.3%) II
## 5. 5 172 (17.7%) III
## 6. 6 (Worst) 211 (21.7%) IIII
## ------------------------------------------------------------------------------------------------------------------
#' Transform "monetary" feature to log(monetary) to make dataset more normal
#' But before transforming "monetary" feature to log(monetary),
#' we want to see if there are any negative values in monetary
neg_count <- sum(new_df$monetary < 0) # count negative monetary values
print(neg_count) # returns zero
## [1] 0
# Transform "frequency", monetary" to "log(frequency) + log(monetary)"
logfreq <- log(new_df$frequency)
logmonetary <- log(new_df$monetary)
new_df <- new_df %>%
add_column(logfreq,.after = "frequency")
new_df <- new_df %>%
add_column(logmonetary,.after = "monetary")
# Trying out transforming "monetary" to monetary^(lambda), with lambda estimated by a Box-Cox power transform
lamda <- powerTransform(new_df$monetary,family = "bcPower")
parameter <- lamda[["lambda"]][["new_df$monetary"]] %>% as.numeric()
BCPower_monetary <- (new_df$monetary)^(parameter)
new_df <- new_df %>%
add_column(BCPower_monetary,.after = "logmonetary")
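#' For reference (no assumptions beyond the car package already loaded above):
#' car::bcPower() computes the Box-Cox transform (y^lambda - 1)/lambda, whereas the line
#' above uses the simple power y^lambda; the two are monotonically equivalent, so the
#' ordering of customers is unchanged.
bc_check <- car::bcPower(new_df$monetary, parameter)
all.equal(bc_check, ((new_df$monetary)^parameter - 1) / parameter)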
#' Identify Outliers
outliers_R <- quantile(new_df$recency, c(0.05, 0.95), type = 1)
outliers_F <- quantile(new_df$frequency, c(0.05, 0.95), type = 1)
outliers_M <- quantile(new_df$monetary, c(0.05, 0.95), type = 1)
outliers_F_log <- quantile(new_df$logfreq, c(0.05, 0.95), type = 1)
outliers_M_log <- quantile(new_df$logmonetary, c(0.05, 0.95), type = 1)
outliers_M_BC <- quantile(new_df$BCPower_monetary, c(0.05, 0.95), type = 1)
outliers <- table(outliers_R,outliers_F,outliers_M,outliers_F_log,
outliers_M_log,outliers_M_BC)
#' Winsorise outliers (cap values below the 5th and above the 95th percentile)
new_df %>% setDT()
main_df <- new_df %>%
.[frequency <= outliers_F[1], frequency := outliers_F[1]] %>%
.[frequency >= outliers_F[2], frequency := outliers_F[2]]
main_df <- new_df %>%
.[monetary <= outliers_M[1], monetary := outliers_M[1]] %>%
.[monetary >= outliers_M[2], monetary := outliers_M[2]]
main_df <- new_df %>%
.[logfreq <= outliers_F_log[1], logfreq := outliers_F_log[1]] %>%
.[logfreq >= outliers_F_log[2], logfreq := outliers_F_log[2]]
main_df <- new_df %>%
.[logmonetary <= outliers_M_log[1], logmonetary := outliers_M_log[1]] %>%
.[logmonetary >= outliers_M_log[2], logmonetary := outliers_M_log[2]]
main_df <- new_df %>%
.[BCPower_monetary <= outliers_M_BC[1], BCPower_monetary := outliers_M_BC[1]] %>%
.[BCPower_monetary >= outliers_M_BC[2], BCPower_monetary := outliers_M_BC[2]]
main_df <- main_df %>% as.data.frame()
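#' A reusable sketch of the winsorising step above (assumption: the same 5th/95th
#' percentile caps); it clamps a numeric vector to its empirical quantiles and could
#' replace the repeated data.table blocks.
winsorise <- function(x, probs = c(0.05, 0.95)) {
  caps <- quantile(x, probs = probs, type = 1, na.rm = TRUE)
  pmin(pmax(x, caps[1]), caps[2]) # cap below at the lower and above at the upper percentile
}
# Usage sketch: main_df$logfreq <- winsorise(main_df$logfreq)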
# F-transformed Boxplot code
F_wins_boxplot <-ggplot(main_df, aes(x= frequency, y= "freq + winsor")) + geom_boxplot(color="orange",
outlier.colour="red",
outlier.shape=8,
outlier.size=4) +
stat_summary(fun=mean, geom="point", shape=23, size=4)
F_winslog_boxplot <-ggplot(main_df, aes(x= logfreq, y= "log(freq) + winsor")) + geom_boxplot(color="orange",
outlier.colour="red",
outlier.shape=8,
outlier.size=4) +
stat_summary(fun=mean, geom="point", shape=23, size=4)
# M-transformed (logs) Boxplot code
M_winslog_boxplot <-ggplot(main_df,
aes(x = logmonetary, y="log(mon) + winsor")) +
geom_boxplot(color="orange",
outlier.colour="red",
outlier.shape=8, outlier.size=4) +
stat_summary(fun=mean, geom="point", shape=23, size=4)
# M-transformed (BCPower) Boxplot code
M_winsBC_boxplot <-ggplot(main_df,
aes(x = BCPower_monetary, y="BCPower(mon) + winsor")) +
geom_boxplot(color="orange",
outlier.colour="red",
outlier.shape=8, outlier.size=4) +
stat_summary(fun=mean, geom="point", shape=23, size=4)
F_wins_boxplot
F_winslog_boxplot
M_winslog_boxplot
M_winsBC_boxplot
# Remove unnecessary variables
colnames(main_df)
## [1] "customer_id" "recency" "frequency" "logfreq"
## [5] "monetary" "logmonetary" "BCPower_monetary" "R_score"
## [9] "F_score" "M_score" "score" "Segment3"
## [13] "Segment6"
# Center and Scale df
scalars <- as.data.frame(scale(main_df[,c(2,4,6)]))
# Drop the untransformed columns, then redefine and rename the variables
scaled_df <-select(main_df,-c(recency,frequency,logfreq,monetary,logmonetary))
scaled_df <-scaled_df %>%
add_column(scalars,.after = "customer_id") # no scaling + centering as this is already done through BC transformation
cusID <- scaled_df[,1]
rec <- scaled_df[,2]
lfreq <- scaled_df[,3]
lmon <- scaled_df[,4]
BC_mon <- scaled_df[,5]
R_score <- scaled_df[,6]
F_score <- scaled_df[,7]
M_score <- scaled_df[,8]
RFM_score <-scaled_df[,9]
class3 <- scaled_df[,10]
class6 <- scaled_df[,11]
colnames(scaled_df)
## [1] "customer_id" "recency" "logfreq" "logmonetary"
## [5] "BCPower_monetary" "R_score" "F_score" "M_score"
## [9] "score" "Segment3" "Segment6"
#' Draw 3D scatterplot of RFM analysis, grouping each by R,F,M
scatterplot3d(scaled_df[,2:4],color = scaled_df$R_score, pch = 5, angle = 80, main = "Grouped by R") # color coded by R
scatterplot3d(scaled_df[,2:4],color = scaled_df$F_score, pch = 5, angle = 80, main = "Grouped by F") # color coded by F
scatterplot3d(scaled_df[,2:4],color = scaled_df$M_score, pch = 5, angle = 80, main = "Grouped by M") # color coded by M
scatterplot3d(scaled_df[,2:4],color = scaled_df$score, pch = 5, angle = 80, main = "Grouped by RFM") # color coded by RFM
group6 <- unclass(factor(scaled_df$Segment6))
group3 <- unclass(factor(scaled_df$Segment3))
scatterplot3d(scaled_df[,2:4],color = group6, pch = 5, angle = 80, main = "Grouped by 6 segments") # color coded by class6
scatterplot3d(scaled_df[,2:4],color = group3, pch = 5, angle = 80, main = "Grouped by 3 segments") # color coded by class3
ggplot(data = scaled_df) +
geom_bar(mapping = aes(x = score))
cat("Segmentation is much clearer especially when segmenting across 3 classes and 6 classes. We also show frequency and monetary segments are not as clear as recency, given that recency is more normally distributed than the others.")
## Segmentation is much clearer especially when segmenting across 3 classes and 6 classes. We also show frequency and monetary segments are not as clear as recency, given that recency is more normally distributed than the others.
# Check Correlation prior and after preprocessing treatment on outliers + transformation
corr1 <- cor(non_NAN_df)
corrmatrix <- corrplot(corr1, type = "upper", order = "original", method = "number",
tl.col = "black", tl.srt = 45)
corr2 <- cor(scaled_df[,c(2:4)])
corrmatrix2 <- corrplot(corr2, type = "upper", order = "original", method = "number" ,tl.col = "black", tl.srt = 45)
cat("We are expecting people who came to shops recently to come more often, as newer customers often visit to the same shop if they like it. Plus, given that all variables within thee analysis (RFM) are all crucial for the analysis, we must keep the variables.")
## We are expecting people who came to shops recently to come more often, as newer customers often visit to the same shop if they like it. Plus, given that all variables within thee analysis (RFM) are all crucial for the analysis, we must keep the variables.
## The parameters which I will need to consider for SOM model are:
## 0.grid
## 1.rlen
## 2.alpha
## 3.radius.
## grid = size of the codebook (how the SOM will be visualised).
## rlen = the number of times the complete dataset is presented to the network. Once the grid size is fixed, rlen is chosen by inspecting a training-progress plot.
## alpha = the learning rate, which controls how strongly the codebook (weight) vectors are adjusted during training.
## radius = the radius of the neighbourhood.
## For simplicity, we keep alpha and radius at their default settings.
## Alpha: the learning rate declines linearly from 0.05 to 0.01 over rlen updates.
## Radius: 2/3 of all unit distances.
## We want around 5 to 10 samples per node, and a hexagonal topology is preferred (more neighbours).
## Hence, given n = 972 and 5 to 10 samples per node, we want roughly 97 to 194 nodes.
## Therefore, we need grid sizes of about 10x10 to 14x14 (a short sketch of this calculation follows below). We test each grid size to see which works best while minimising nodes with no samples.
## Note that most SOMs use square (n x n) grids, so we use a square grid too.
## Because grids of 10x10 and above leave many nodes empty, we also test 8x8 and 9x9.
## With 8x8 the range of samples per node increases, which we do not want since we prefer few samples per node; 9x9 and 10x10 give similar samples per node and differ by only one empty node.
## However, in every configuration there is at least one node with more than 40 samples, i.e. a persistent concentration of customers with identical RFM characteristics.
## Since we want as large a grid as is practical, we proceed with the 10x10 grid.
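#' As a minimal sketch of the node-count calculation referred to above (assumption:
#' n = 972 cleaned customers and the 5-to-10-samples-per-node rule of thumb):
n_obs <- nrow(scaled_df)                                 # 972
node_range <- c(ceiling(n_obs / 10), floor(n_obs / 5))   # roughly 97 to 194 nodes
grid_side <- ceiling(sqrt(node_range))                   # candidate square grid sides: 10 to 14
node_range
grid_side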
## Section B Part 1 - Creating the SOM Model
set.seed(123)
scaled_df.grid <- somgrid(xdim = 10,
ydim = 10,
topo = "hexagonal")
RFM_SOM <- as.matrix(scaled_df[, 2:4]) # assumed input: the scaled recency, log(F) and log(M) columns (this definition is not shown in the original code)
scaled_df.SOM <- som(X = RFM_SOM, grid = scaled_df.grid,rlen = 500,keep.data = TRUE)
plot(scaled_df.SOM, type = "counts", main = "Node counts (10x10)", pch = 15)
# Check whether the model is good = test for efficiency
summary(scaled_df.SOM)
## SOM of size 10x10 with a hexagonal topology and a bubble neighbourhood function.
## The number of data layers is 1.
## Distance measure(s) used: sumofsquares.
## Training data included: 972 objects.
## Mean distance to the closest unit in the map: 0.052.
meandist <- mean(scaled_df.SOM$distances) # mean distance between each observation and its closest codebook vector (unit) in the map; the lower, the better
meandist
## [1] 0.05235707
meanunitD <- mean(unit.distances(scaled_df.grid)) #Distances between units give us a clue on the distances prior to the mapping.
meanunitD
## [1] 4.855542
AccountedD <- meanunitD - meandist # meandist is the distance "left over" after mapping; the difference between the mean unit distance going in and what is left when we are done is the distance the map has accounted for.
AccountedD
## [1] 4.803185
EF <- AccountedD/meanunitD #Divide the distances accounted for by the mean distances prior to mapping.
#The ratio should say something about the efficiency.
EF # Close to 1 = good.
## [1] 0.9892171
## In the counts plot, most nodes contain around 10 to 20 samples, and there are clearly 2 nodes with more than 40 samples.
## In the neighbour-distance plot, we can clearly see 2 obvious clusters plus 2 other less obvious clusters (k = 4).
## The quality plot shows the quantisation error: the smaller the distance, the better the objects are represented by their codebook vectors. As in the neighbour-distance plot, we see 2 obvious clusters of higher-quality codes plus 2 other less obvious clusters.
## In the codes plot, we see
## high values of R & M and low F in the bottom-right
## high values of F & M and low R in the bottom-left
## low values of R,F,M in the middle
## high values of R and low values of F & M in the top-right
## high F and low R & M in the middle and top-left. Less obviously, nodes with average values of R, F and M sit in the top-middle and bottom-middle.
##
## The customers we want to target are those in the bottom-left (high F & M, low R).
## These heatmaps show scaled data of each variable. We can clearly see 3-4 clear clusters in R & F. This is more unclear with M
## These heatmaps show unscaled data of each variable. We can clearly see 3-4 clear clusters in R & F. This is more unclear with M but we can roughly see 3-5 clusters.
cat("Before moving on, we can use some metrics to see how clusterable our data is")
## Before moving on, we can use some metrics to see how clusterable our data is
# Use the Hopkins statistic to measure clustering tendency - how feasible it is to cluster the data (values close to 1 indicate high clustering tendency; around 0.5 indicates random data)
set.seed(123)
hopkins_data <- hopkins(scaled_df[2:4])
hopkins_SOM<- hopkins(scaled_df.SOM$unit.classif %>% as.data.frame())
hopkins_data # value > 0.5 so the data is more clusterable
## [1] 0.9641223
hopkins_SOM # value > 0.5 so the data is more clusterable
## [1] 1
cat("This clearly shows SOM does make data more clusterable given that",hopkins_SOM, "is higher than",hopkins_data, "hence, SOM should not be used in this case. Nevertheless, we evaluate the SOM data in the next section")
## This clearly shows SOM does make data more clusterable given that 1 is higher than 0.9641223 hence, SOM should not be used in this case. Nevertheless, we evaluate the SOM data in the next section
# Implement Kmeans on scaled data.SOM codes using Elbow method
set.seed(123) # for reproducibility
par(mfrow=c(1,1)) # back to one plot
clusterdata <- getCodes(scaled_df.SOM)
# Elbow method
fviz_nbclust(clusterdata, kmeans, method = "wss",print.summary = TRUE) +
geom_vline(xintercept = 3, linetype = 2)+
labs(subtitle = "Elbow method")
tot_withinss <- map_dbl(1:10, function(k){
model <- kmeans(x = clusterdata, centers = k)
model$tot.withinss
})
elbow_df <- data.frame(
k = 1:10,
tot_withinss = tot_withinss
)
cat("The Elbow method shows clusters is the best because after 6 the elbow of the curve properly flattens.")
## The Elbow method shows clusters is the best because after 6 the elbow of the curve properly flattens.
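#' The elbow_df computed above is not plotted anywhere; a minimal sketch of the
#' corresponding scree plot (no assumptions beyond ggplot2, which is already loaded):
ggplot(elbow_df, aes(x = k, y = tot_withinss)) +
  geom_line() +
  geom_point() +
  scale_x_continuous(breaks = 1:10) +
  labs(title = "Total within-cluster sum of squares by k", x = "k", y = "Total WSS")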
# Silhouette method
fviz_nbclust(clusterdata, kmeans, method = "silhouette")+
labs(subtitle = "Silhouette method")
# Gap Statistic
GAP <- fviz_nbclust(clusterdata, kmeans, nstart = 25, method = "gap_stat", nboot = 200)+
labs(subtitle = "Gap statistic method")
# Apply NbClust() to view other performance metrics
NbClust::NbClust(data = clusterdata, diss = NULL, distance = "euclidean",min.nc = 3, max.nc = 15, method = "kmeans")
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 8 proposed 3 as the best number of clusters
## * 4 proposed 4 as the best number of clusters
## * 5 proposed 6 as the best number of clusters
## * 2 proposed 12 as the best number of clusters
## * 2 proposed 13 as the best number of clusters
## * 2 proposed 15 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 3
##
##
## *******************************************************************
## $All.index
## KL CH Hartigan CCC Scott Marriot TrCovW TraceW
## 3 1.5766 57.1190 23.7963 -2.2790 195.8664 744186.2 1958.4890 131.7482
## 4 3.3530 54.7825 14.4759 -2.3192 262.5698 679000.3 1176.1741 105.7945
## 5 0.1118 50.3711 20.8600 -2.6729 309.8748 661069.5 730.2879 91.9320
## 6 33.5295 52.7560 9.9260 -1.5964 368.1594 531474.9 691.9410 75.3801
## 7 0.1349 49.7253 10.4847 -1.9301 400.8062 521906.7 478.6726 68.1805
## 8 0.7830 48.3985 5.2835 -1.9415 430.8329 504862.1 458.4106 61.2728
## 9 0.4587 44.9473 11.6397 -2.5834 447.2850 542034.6 393.6300 57.9450
## 10 4.8885 45.8474 6.9234 -2.1214 480.4730 480184.9 343.8238 51.3738
## 11 0.1954 44.6278 10.7390 -2.2669 512.7879 420583.4 308.2712 47.7041
## 12 8.0499 45.9205 5.4913 -1.7609 545.2030 361953.1 234.4606 42.5677
## 13 0.5123 44.6647 5.7083 -1.9706 569.9850 331550.3 211.2091 40.0675
## 14 5.7129 43.8632 4.2731 -2.0830 585.5502 329093.9 191.4074 37.6004
## 15 14.1311 42.5584 3.7213 -2.3615 585.6897 377260.0 155.1240 35.8206
## Friedman Rubin Cindex DB Silhouette Duda Pseudot2 Beale Ratkowsky
## 3 3.6442 2.2049 0.3847 1.1697 0.3085 1.4515 -16.4863 -0.5156 0.4236
## 4 5.4495 2.7458 0.4010 1.0941 0.2869 0.9533 2.1057 0.0811 0.3942
## 5 6.8499 3.1598 0.4029 1.2294 0.2561 1.3352 -7.2797 -0.4036 0.3677
## 6 8.8753 3.8536 0.4120 1.0266 0.3066 1.4918 -7.2529 -0.5238 0.3510
## 7 10.3375 4.2606 0.4115 1.1043 0.2784 1.1803 -3.6656 -0.2414 0.3304
## 8 11.6355 4.7409 0.4275 1.0802 0.2804 1.1247 -1.8849 -0.1753 0.3140
## 9 12.4361 5.0132 0.4143 1.1644 0.2648 1.5509 -6.3938 -0.5375 0.2982
## 10 14.2307 5.6544 0.4689 1.0846 0.2600 1.8022 -6.2315 -0.6630 0.2865
## 11 16.7386 6.0894 0.4064 1.0980 0.2638 2.0845 -6.7634 -0.7381 0.2755
## 12 18.9044 6.8241 0.3841 0.9679 0.2824 1.4936 -4.2959 -0.4822 0.2665
## 13 21.5439 7.2499 0.3756 0.9749 0.2741 0.6192 4.9191 0.9305 0.2573
## 14 22.6290 7.7256 0.4003 0.9999 0.2751 1.5059 -3.0233 -0.4575 0.2490
## 15 21.4708 8.1095 0.5281 1.0342 0.2507 1.0622 -0.6445 -0.0887 0.2414
## Ball Ptbiserial Frey McClain Dunn Hubert SDindex Dindex SDbw
## 3 43.9161 0.5605 0.4231 1.1768 0.1343 0.0065 2.0645 1.0751 1.1334
## 4 26.4486 0.5604 0.6498 1.5326 0.1214 0.0072 1.8933 0.9771 0.5966
## 5 18.3864 0.5223 0.1674 2.1241 0.1556 0.0078 2.1198 0.9029 0.5346
## 6 12.5633 0.5295 0.6630 2.3757 0.1499 0.0082 1.8866 0.8220 0.2856
## 7 9.7401 0.5014 0.4740 2.8044 0.1556 0.0086 2.0219 0.7819 0.3024
## 8 7.6591 0.4772 0.3953 3.2711 0.1707 0.0088 1.9586 0.7429 0.2284
## 9 6.4383 0.4660 0.2953 3.5195 0.1707 0.0090 2.3234 0.7225 0.2201
## 10 5.1374 0.4516 0.4663 3.9001 0.2018 0.0093 2.2168 0.6845 0.1921
## 11 4.3367 0.4304 0.0890 4.4040 0.1838 0.0096 2.3952 0.6519 0.1814
## 12 3.5473 0.4305 0.4835 4.5039 0.1838 0.0099 2.1309 0.6181 0.1811
## 13 3.0821 0.4178 0.1634 4.8325 0.1838 0.0101 2.2230 0.5999 0.1658
## 14 2.6857 0.4131 0.7163 5.0029 0.2018 0.0102 2.1885 0.5830 0.1594
## 15 2.3880 0.3942 0.4898 5.5425 0.2281 0.0103 2.6015 0.5741 0.1413
##
## $All.CriticalValues
## CritValue_Duda CritValue_PseudoT2 Fvalue_Beale
## 3 0.4304 70.1484 1.0000
## 4 0.4208 59.1761 0.9702
## 5 0.2757 76.1793 1.0000
## 6 0.2298 73.7463 1.0000
## 7 0.2115 89.4894 1.0000
## 8 0.2115 63.3884 1.0000
## 9 0.0819 201.6502 1.0000
## 10 0.0438 305.8720 1.0000
## 11 -0.0559 -245.7553 1.0000
## 12 -0.0014 -9535.2353 1.0000
## 13 0.0819 89.6223 0.4412
## 14 -0.1234 -81.9049 1.0000
## 15 0.0819 123.2307 1.0000
##
## $Best.nc
## KL CH Hartigan CCC Scott Marriot TrCovW
## Number_clusters 6.0000 3.000 6.0000 6.0000 4.0000 6.0 4.0000
## Value_Index 33.5295 57.119 10.9341 -1.5964 66.7034 120026.4 782.3149
## TraceW Friedman Rubin Cindex DB Silhouette Duda
## Number_clusters 4.0000 13.0000 12.0000 13.0000 12.0000 3.0000 3.0000
## Value_Index 12.0913 2.6395 -0.3089 0.3756 0.9679 0.3085 1.4515
## PseudoT2 Beale Ratkowsky Ball PtBiserial Frey McClain
## Number_clusters 3.0000 3.0000 3.0000 4.0000 3.0000 2 3.0000
## Value_Index -16.4863 -0.5156 0.4236 17.4675 0.5605 NA 1.1768
## Dunn Hubert SDindex Dindex SDbw
## Number_clusters 15.0000 0 6.0000 0 15.0000
## Value_Index 0.2281 0 1.8866 0 0.1413
##
## $Best.partition
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16
## 3 3 3 3 2 2 2 2 2 2 3 3 3 3 3 2
## V17 V18 V19 V20 V21 V22 V23 V24 V25 V26 V27 V28 V29 V30 V31 V32
## 2 2 2 2 3 3 3 3 2 2 2 2 2 2 3 3
## V33 V34 V35 V36 V37 V38 V39 V40 V41 V42 V43 V44 V45 V46 V47 V48
## 3 3 3 2 2 2 2 2 3 3 3 3 3 1 2 2
## V49 V50 V51 V52 V53 V54 V55 V56 V57 V58 V59 V60 V61 V62 V63 V64
## 2 2 3 3 3 1 3 1 1 1 2 2 1 1 1 1
## V65 V66 V67 V68 V69 V70 V71 V72 V73 V74 V75 V76 V77 V78 V79 V80
## 1 1 1 1 2 2 1 1 1 1 1 1 1 1 1 2
## V81 V82 V83 V84 V85 V86 V87 V88 V89 V90 V91 V92 V93 V94 V95 V96
## 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1
## V97 V98 V99 V100
## 1 1 3 2
cat("The Silhouette width estimates the average distance between clusters. The higher the Silhouette width, the better the observations are clustered. Hence, despite k=3 is the best measure, the 2nd best score is 6 clusters. The majority rule follows this result too")
## The Silhouette width estimates the average distance between clusters. The higher the Silhouette width, the better the observations are clustered. Hence, despite k=3 is the best measure, the 2nd best score is 6 clusters. The majority rule follows this result too
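#' A minimal sketch of the average silhouette width behind fviz_nbclust(), computed
#' directly on the SOM codes (clusterdata) for k = 2..10 (no new assumptions; the
#' cluster package is already loaded):
set.seed(123)
avg_sil <- sapply(2:10, function(k) {
  km <- kmeans(clusterdata, centers = k, nstart = 25)
  mean(cluster::silhouette(km$cluster, dist(clusterdata))[, "sil_width"])
})
round(avg_sil, 3)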
fit_kmeans <- kmeans(clusterdata[,1:3], 3) # use 3 clusters, as indicated by the WSS development and the NbClust majority rule.
cl_assignmentk <- fit_kmeans$cluster[scaled_df.SOM$unit.classif]
main_df$clustersKm <- cl_assignmentk #back to original data.
cat("In the last subsection, we cluster the codes into 6 segments given k=6 is the best according to k-means. We have 100 (since grid=10*10) codes with asigned clusters, but we have 968 units. The above is to assign units to clusters based on their customer-id in the SOM model.")
## In the last subsection, we cluster the codes into 6 segments given k=6 is the best according to k-means. We have 100 (since grid=10*10) codes with asigned clusters, but we have 968 units. The above is to assign units to clusters based on their customer-id in the SOM model.
# Visualising Silhouette Plot
sil <- silhouette(fit_kmeans$cluster, dist(clusterdata))
fviz_silhouette(sil)
## cluster size ave.sil.width
## 1 1 28 0.22
## 2 2 39 0.31
## 3 3 33 0.38
# Visualising 3 clusters using original data
colors3 <- c("black", "blue","green")
colors3 <- colors3[as.numeric(main_df$clustersKm)]
pairs(main_df[,c(2,4,6)],col = colors3, pch = 16, gap = 0, main = "K means clustering - Optimal K = 3")
legend("bottom", col =c("black", "blue","green"), legend = c("1 -best","2","3 - worst"), pch = 20, xpd = NA, ncol = 3, bty = "n", inset = -0.04, lwd =1, pt.cex = 1)
p <- plotly::plot_ly(main_df, x=~recency, y=~logfreq,
z=~logmonetary, color=~clustersKm) %>%
add_markers(size=1.5)
print(p)
set.seed(123)
# Create distance matrix
DistanceM <- dist(clusterdata[,1:3], method = "euclidean") #create the distance matrix
fit_hc <- hclust(DistanceM, method="ward.D2") #fit the distances. "D2", since "DistanceM" is not squared. Ward.D2 squares dissimilarities before clustering
height <- fit_hc$height
barplot(diff(height),las = 2) # k = 3 or 5
plot(fit_hc, hang = -1) # display dendrogram; visually it suggests roughly a 3 to 5 cluster solution
abline(h = c(8,5), col = c("blue","darkgreen"), lty = c(3,3), lwd = c(1,1))
fit_hc.cut1 <- cutree(fit_hc, k=3) # cut tree into k clusters
fit_hc.cut2 <- cutree(fit_hc, k=5) # cut tree into k clusters
# draw rectangles on the dendrogram around the 3-cluster (blue) and 5-cluster (pink) solutions
rect.hclust(fit_hc, k=3, border="blue")
rect.hclust(fit_hc, k=5, border="pink")
# Silhouette method
fviz_nbclust(clusterdata, hcut, method = "silhouette")+
labs(subtitle = "Silhouette method")
# Gap Statistic
fviz_nbclust(clusterdata, hcut, nstart = 25, method = "gap_stat", nboot = 200)+
labs(subtitle = "Gap statistic method")
# Apply NbClust() to view other performance metrics
NbClust::NbClust(data = clusterdata, diss = NULL, distance = "euclidean",min.nc = 3, max.nc = 15, method = "ward.D2")
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 6 proposed 3 as the best number of clusters
## * 3 proposed 4 as the best number of clusters
## * 8 proposed 5 as the best number of clusters
## * 1 proposed 6 as the best number of clusters
## * 2 proposed 12 as the best number of clusters
## * 2 proposed 14 as the best number of clusters
## * 1 proposed 15 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 5
##
##
## *******************************************************************
## $All.index
## KL CH Hartigan CCC Scott Marriot TrCovW TraceW
## 3 0.4952 41.7362 25.4605 -5.1838 172.4496 940545.2 3831.1451 154.2077
## 4 0.5978 43.1646 23.8493 -5.1275 228.5886 953780.4 2126.6613 122.1466
## 5 4.1537 45.8952 10.3539 -3.7888 287.1949 829363.0 1156.7942 97.8402
## 6 10.9564 42.3382 10.3448 -4.2545 322.4075 839809.3 1056.4396 88.2247
## 7 0.1078 40.4537 10.2291 -4.4118 354.7475 827224.7 837.9521 79.4781
## 8 0.4432 39.5202 11.1092 -4.3817 385.0810 797756.8 732.1478 71.6025
## 9 1.6185 39.7082 8.8497 -4.0733 426.6127 666508.4 589.0595 63.8879
## 10 0.8857 39.2756 8.4408 -3.9914 462.9887 571928.4 528.3847 58.2255
## 11 0.9598 39.0683 7.9881 -3.8758 487.4729 541743.4 430.8135 53.2329
## 12 0.7300 38.9876 8.5723 -3.7496 530.4215 419611.5 385.6260 48.8486
## 13 1.0003 39.4806 8.2520 -3.4709 549.9042 405284.0 286.3830 44.5125
## 14 2.0253 40.0691 5.9583 -3.1846 569.6057 385981.3 231.8899 40.6562
## 15 1.0548 39.7429 5.5583 -3.1939 586.6607 373614.4 188.5291 38.0220
## Friedman Rubin Cindex DB Silhouette Duda Pseudot2 Beale Ratkowsky
## 3 3.4066 1.8837 0.3888 1.2586 0.2541 0.5524 34.0369 1.3476 0.3880
## 4 4.6848 2.3782 0.3747 1.2164 0.2300 0.5725 26.1357 1.2359 0.3726
## 5 6.0343 2.9690 0.4052 1.1783 0.2463 0.6189 14.7790 1.0064 0.3627
## 6 7.2144 3.2926 0.3873 1.1793 0.2360 0.6600 8.7563 0.8282 0.3399
## 7 8.4059 3.6549 0.3875 1.2461 0.2316 0.4505 19.5180 1.9546 0.3217
## 8 9.6408 4.0569 0.3729 1.1893 0.2292 0.5430 15.1511 1.3575 0.3060
## 9 11.7982 4.5468 0.3612 1.1330 0.2242 0.6387 8.4867 0.9030 0.2936
## 10 14.3159 4.9890 0.3447 1.1802 0.2279 0.3676 10.3199 2.5098 0.2819
## 11 15.4542 5.4569 0.3362 1.0988 0.2259 0.5647 10.7928 1.2249 0.2722
## 12 19.8152 5.9467 0.3210 1.0840 0.2276 0.5228 8.2158 1.3987 0.2630
## 13 20.7431 6.5260 0.3817 1.0710 0.2351 0.4296 15.9322 2.0864 0.2551
## 14 21.8800 7.1450 0.3819 1.0402 0.2288 0.5248 7.2446 1.3704 0.2475
## 15 23.0834 7.6400 0.3692 1.0683 0.2292 0.4243 8.1405 1.9798 0.2405
## Ball Ptbiserial Frey McClain Dunn Hubert SDindex Dindex SDbw
## 3 51.4026 0.4863 0.4539 1.1444 0.1324 0.0056 2.1094 1.1700 0.9459
## 4 30.5367 0.4860 0.2379 1.6683 0.1379 0.0065 1.9723 1.0455 0.7729
## 5 19.5680 0.4966 0.6556 2.2114 0.1698 0.0074 2.0487 0.9378 0.6668
## 6 14.7041 0.4704 0.1601 2.6832 0.1408 0.0075 2.2797 0.8896 0.4547
## 7 11.3540 0.4726 0.4011 2.9141 0.1500 0.0081 2.2062 0.8427 0.3213
## 8 8.9503 0.4597 0.2447 3.2328 0.1500 0.0087 2.2020 0.7915 0.3423
## 9 7.0987 0.4499 0.2207 3.6052 0.1548 0.0092 2.1501 0.7419 0.2912
## 10 5.8225 0.4398 0.1121 3.9865 0.1581 0.0095 2.2985 0.7083 0.2971
## 11 4.8394 0.4397 0.3721 4.0535 0.1581 0.0096 2.1369 0.6814 0.2502
## 12 4.0707 0.4213 0.1360 4.5497 0.1327 0.0100 2.2844 0.6496 0.2035
## 13 3.4240 0.4180 0.3793 4.7288 0.1629 0.0101 2.2691 0.6231 0.1738
## 14 2.9040 0.4004 0.2298 5.2529 0.1419 0.0105 2.2910 0.5970 0.1563
## 15 2.5348 0.3924 0.2244 5.5368 0.1419 0.0106 2.5218 0.5778 0.1473
##
## $All.CriticalValues
## CritValue_Duda CritValue_PseudoT2 Fvalue_Beale
## 3 0.4513 51.0655 0.2620
## 4 0.4208 48.1666 0.3005
## 5 0.3500 44.5622 0.3950
## 6 0.2757 44.6568 0.4845
## 7 0.2617 45.1416 0.1334
## 8 0.2887 44.3516 0.2656
## 9 0.2464 45.8680 0.4472
## 10 -0.0014 -4400.8778 0.0914
## 11 0.2298 46.9294 0.3126
## 12 0.1148 69.3995 0.2647
## 13 0.1912 50.7466 0.1192
## 14 0.0819 89.6223 0.2757
## 15 -0.0014 -4400.8778 0.1531
##
## $Best.nc
## KL CH Hartigan CCC Scott Marriot TrCovW
## Number_clusters 6.0000 5.0000 5.0000 14.0000 5.0000 5.0 4.000
## Value_Index 10.9564 45.8952 13.4954 -3.1846 58.6063 134863.6 1704.484
## TraceW Friedman Rubin Cindex DB Silhouette Duda
## Number_clusters 5.000 12.000 5.0000 12.000 14.0000 3.0000 3.0000
## Value_Index 14.691 4.361 -0.2672 0.321 1.0402 0.2541 0.5524
## PseudoT2 Beale Ratkowsky Ball PtBiserial Frey McClain
## Number_clusters 3.0000 3.0000 3.000 4.0000 5.0000 2 3.0000
## Value_Index 34.0369 1.3476 0.388 20.8659 0.4966 NA 1.1444
## Dunn Hubert SDindex Dindex SDbw
## Number_clusters 5.0000 0 4.0000 0 15.0000
## Value_Index 0.1698 0 1.9723 0 0.1473
##
## $Best.partition
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16
## 1 2 2 2 2 2 2 3 3 3 1 1 2 2 2 2
## V17 V18 V19 V20 V21 V22 V23 V24 V25 V26 V27 V28 V29 V30 V31 V32
## 3 3 3 3 1 1 2 2 2 4 4 3 3 3 1 1
## V33 V34 V35 V36 V37 V38 V39 V40 V41 V42 V43 V44 V45 V46 V47 V48
## 2 2 2 4 4 4 3 3 1 1 4 4 4 4 4 4
## V49 V50 V51 V52 V53 V54 V55 V56 V57 V58 V59 V60 V61 V62 V63 V64
## 3 3 1 1 1 1 4 4 5 5 3 3 1 1 1 4
## V65 V66 V67 V68 V69 V70 V71 V72 V73 V74 V75 V76 V77 V78 V79 V80
## 4 4 5 3 3 3 5 1 1 5 5 4 5 5 5 3
## V81 V82 V83 V84 V85 V86 V87 V88 V89 V90 V91 V92 V93 V94 V95 V96
## 5 5 5 5 5 5 5 5 5 2 5 5 5 5 5 5
## V97 V98 V99 V100
## 5 5 2 2
cat("Using both the dendogram and the height difference plot, we can see k=3 & k=5 being good measures because of two reason.\n Reason 1: Using the dendogram, there are 3 clear clusters when cutting the dendogram around 8cm while there are six clusters around 5cm.\n Reason 2: Using the barplot, we can see there is a drastic drop in height difference between the first bar and second bar, which means k=3 is a good, while there is drop in height difference between the fifth bar and sixth bar, which means k=5 is good too. We will prove this using the Dunn index. However, given that we have found k = 3 has already been covered in Kmeans, we use k = 5 instead.")
## Using both the dendogram and the height difference plot, we can see k=3 & k=5 being good measures because of two reason.
## Reason 1: Using the dendogram, there are 3 clear clusters when cutting the dendogram around 8cm while there are six clusters around 5cm.
## Reason 2: Using the barplot, we can see there is a drastic drop in height difference between the first bar and second bar, which means k=3 is a good, while there is drop in height difference between the fifth bar and sixth bar, which means k=5 is good too. We will prove this using the Dunn index. However, given that we have found k = 3 has already been covered in Kmeans, we use k = 5 instead.
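#' A sketch of the Dunn index comparison referred to above (assumption: the clValid
#' package is installed; higher Dunn values indicate better-separated clusters):
library(clValid)
c(k3 = dunn(distance = as.matrix(DistanceM), clusters = fit_hc.cut1),
  k5 = dunn(distance = as.matrix(DistanceM), clusters = fit_hc.cut2))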
# Visualise dendrogram in another format + apply silhouette method
hc.cut <- hcut(clusterdata[,1:3], k = 5 ,method="ward.D2")
fviz_dend(hc.cut, show_labels = FALSE, rect = TRUE)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
fviz_silhouette(hc.cut)
## cluster size ave.sil.width
## 1 1 18 0.33
## 2 2 19 0.13
## 3 3 20 0.37
## 4 4 17 0.18
## 5 5 26 0.22
# Bring that back to the original data
cl_assignmenth <- fit_hc.cut2[scaled_df.SOM$unit.classif] # Retrieve the clustered assignments for each sample
main_df$clustersHac <- cl_assignmenth #back to original data.
cat("Again, we have 100 (since grid=10*10) codes with assigned clusters,but we have 968 units. The above code assigns units to clusters based on their class-id (code-id) in the SOM model.")
## Again, we have 100 (since grid=10*10) codes with assigned clusters,but we have 968 units. The above code assigns units to clusters based on their class-id (code-id) in the SOM model.
# Visualise plots
colors5 <- c("black", "purple", "blue","cyan","green")
colors5 <- colors5[as.numeric(main_df$clustersHac)]
pairs(main_df[,c(2,4,6)],col = colors5,pch = 16, gap = 0, main = "HAC clustering - Optimal K = 5")
legend("bottom", col = c("black", "purple", "blue","cyan","green"), legend = c("1 -best","2","3","4","5 - worst"), lwd =1, pch = 20, xpd = NA, ncol = 5, bty = "n", inset = -0.04, pt.cex = 0.5)
p <- plotly::plot_ly(main_df, x=~recency, y=~logfreq,
z=~logmonetary, color=~clustersHac) %>%
add_markers(size=1.5)
print(p)
rownames1 <- rep("K-means",7)
Metrics1 <- c("Elbow","SIL","Gap Stat","Dunn","Davies-Bouldin","NBClust() Majority rule","Comments")
Results1 <- c(3,3,3,15,12,3,"Despite k=3 being best, k=6 was 2nd best in NBClust which gives a better cluster interpretation for the customer segmentation model")
table1 <- data.frame(Model = rownames1, Metrics = Metrics1 ,Results = Results1)
rownames2 <- rep("HAC",7)
Metrics2 <- c("Dend","SIL","Gap Stat","Dunn","Davies-Bouldin", "NBClust() Majority rule","Comments")
Results2 <- c(5,3,5,5,14,5,"k = 5 was best, but k=3 is still strong")
table2 <- data.frame(Model = rownames2, Metrics = Metrics2 ,Results = Results2)
write_xlsx(table1, "KMeans")
write_xlsx(table2, "HAC")
cat("We use k =3 & k = 5, given that is the best in HAC. Despite hacing relatively poorer performance in kmeans, this is not significant enough. Plus, for simplicitiy, we will compare 2 models k =3 and k = 5")
## We use k =3 & k = 5, given that is the best in HAC. Despite hacing relatively poorer performance in kmeans, this is not significant enough. Plus, for simplicitiy, we will compare 2 models k =3 and k = 5
main_df$trueN_k5 <- ifelse(main_df$Segment6 == "1 (Best)", 1, ifelse(main_df$Segment6 == "2", 2, ifelse(main_df$Segment6 == "3", 3, ifelse(main_df$Segment6 == "4", 4, ifelse(main_df$Segment6 == "5", 5, ifelse(main_df$Segment6 == "6 (Worst)", 5,NA)))))) # segments 5 and "6 (Worst)" are both mapped to class 5 so the six RFM segments align with the five HAC clusters
main_df$trueN_k3 <- ifelse(main_df$Segment3 == "1 (Best)", 1, ifelse(main_df$Segment3 == "2", 2, ifelse(main_df$Segment3 == "3 (Worst)", 3, NA)))
kmeancolor <- main_df[,14] #set color indicator
haccolor <- main_df[,15] #set color indicator
trueccolor5 <- main_df[,16] #set color indicator
trueccolor3 <- main_df[,17] #set color indicator
# Set colour palettes
color_grid <- tricolor(scaled_df.SOM$grid, phi = c(pi/8, 6, -pi/6), offset = 0.1)
colors5 <- c("black", "darkviolet", "blue","cyan","seagreen")
colors5 <- colors5[as.numeric(main_df$clustersHac)]
colors3 <- c("black", "blue","seagreen")
colors3 <- colors3[as.numeric(main_df$clustersKm)]
contrast <- c("#FA4925", "#22693E", "#D4D40F", "#2C4382", "#F0F0F0", "#3D3D3D")
# Do a Mapping Plot
par(mfrow=c(1,1))
plot_KM <- plot(scaled_df.SOM, type="mapping", bg = rgb(color_grid), shape = "straight", border = "grey", labels = unique(main_df$Segment3),col=colors3, pch = 30, main = "Kmeans mapping plot")
add.cluster.boundaries(scaled_df.SOM, fit_kmeans$cluster, lwd = 3, lty = 2, col=contrast[4])
legend("right", col = c("black", "blue","seagreen"), legend = c("1 -best","2","3 - worst"), pch = 20, xpd = NA, ncol = 1, bty = "n", inset = 0.01, pt.cex = 1.5)
# Do a K means Mapping Plot for 3 segments
par(mfrow=c(1,2))
plot(scaled_df.SOM, type="mapping", bg = rgb(color_grid), shape = "straight", border = "grey", labels = unique(main_df$Segment3),col=contrast[kmeancolor], pch = 30, main = "Kmean (K=5), mapping, col=cluster, no=trueSp")
add.cluster.boundaries(scaled_df.SOM, fit_kmeans$cluster, lwd = 3, lty = 2, col=contrast[4])
legend("bottom", col = c("black", "blue","seagreen"), legend = c("1 -best","2","3 - worst"), pch = 20, xpd = NA, ncol = 3, bty = "n", inset = -0.04, pt.cex = 1.5)
plot(scaled_df.SOM, type="mapping", bg = rgb(color_grid), shape = "straight", border = "grey", labels = unique(main_df$Segment3),col=contrast[kmeancolor], pch = 30, main = "Kmean (K=5), mapping, col=true, no=cluster")
add.cluster.boundaries(scaled_df.SOM, fit_kmeans$cluster, lwd = 3, lty = 2, col=contrast[4])
legend("bottom", col = c("black", "blue","seagreen"), legend = c("1 -best","2","3 - worst"), pch = 20, xpd = NA, ncol = 3, bty = "n", inset = -0.04, pt.cex = 1.5)
# Test the accuracy of the k-means solution. The cluster numbering may not match the segment ordering, but that does not mean the clustering itself is off. Inspect the dataset with cluster assignments.
main_df$Segment3 <- main_df$Segment3 %>% as.factor()
summary(main_df$Segment3) # and with guidance from the plots above:
## 1 (Best) 2 3 (Worst)
## 359 230 383
length(which(main_df$clustersKm == 3)) # Under by 29
## [1] 330
length(which(main_df$clustersKm == 1)) # Under by 24
## [1] 206
length(which(main_df$clustersKm == 2)) # Over by 53
## [1] 436
table1 <- data.frame(clusterKM_3 = c(length(which(main_df$clustersKm == 3)),length(which(main_df$clustersKm == 1)),length(which(main_df$clustersKm == 2))))
table1
## clusterKM_3
## 1 330
## 2 206
## 3 436
#Then:
main_df$testkm <- 1 # start by marking every row as "wrong" (i.e. 1).
# A cluster assignment can only be right in one way, but wrong in (k - 1) ways,
# so it is easier to look for the "rights" rather than the "wrongs".
main_df$testkm[main_df$clustersKm == 3 & main_df$trueN_k3 == 1] <- 0 # cluster 3 corresponds to true class 1, so mark as right (0).
main_df$testkm[main_df$clustersKm == 1 & main_df$trueN_k3 == 2] <- 0 # cluster 1 corresponds to true class 2.
main_df$testkm[main_df$clustersKm == 2 & main_df$trueN_k3 == 3] <- 0 # cluster 2 corresponds to true class 3.
table2 <- data.frame(truecluster_3 = c(length(which(main_df$trueN_k3 == 1)),length(which(main_df$trueN_k3 == 2)),length(which(main_df$trueN_k3 == 3))))
Residual <- table2 - table1
testKM <- sum(main_df$testkm)
testKM
## [1] 266
Accuracy_Score<- 1-(testKM)/972
Accuracy_Score # how accurate is the km model
## [1] 0.7263374
KM3_df <- data.frame(summary(main_df$Segment3),table1,table2, Residual, Accuracy_Score)
colnames(KM3_df)
## [1] "summary.main_df.Segment3." "clusterKM_3"
## [3] "truecluster_3" "truecluster_3.1"
## [5] "Accuracy_Score"
KM3_df <- KM3_df %>% rename(Residual = truecluster_3.1)
write.csv(KM3_df,"Table 1.4")
par(mfrow=c(1,1))
plot_HAC <- plot(scaled_df.SOM, type="mapping", bg = rgb(color_grid), shape = "straight", border = "grey", labels = unique(main_df$Segment6),col=colors5, pch = 30, main = "HAC mapping plot")
add.cluster.boundaries(scaled_df.SOM, fit_hc.cut2, lwd = 3, lty = 2, col=contrast[4])
legend("right", col = c("black", "darkviolet", "blue","cyan","seagreen"), legend = c("1 -best","2","3","4","5 - worst"), pch = 20, xpd = NA, ncol = 1, bty = "n", inset = 0.01, pt.cex = 1.5)
par(mfrow=c(1,2))
plot(scaled_df.SOM, type="mapping", bg = rgb(color_grid), shape = "straight", border = "grey", labels =unique(main_df$Segment6),col=contrast[haccolor], main = "HAC, mapping, col=cluster, no=trueSp")
add.cluster.boundaries(scaled_df.SOM, fit_hc.cut2, lwd = 3, lty = 2, col=contrast[4]) #name your pallette and choose color...
legend("bottom", col = c("black", "darkviolet", "blue","cyan","seagreen"), legend = c("1 -best","2","3","4","5 - worst"), pch = 20, xpd = NA, ncol = 1, bty = "n", inset = 0.01, pt.cex = 1.5)
plot(scaled_df.SOM, type="mapping", bg = rgb(color_grid), shape = "straight", border = "grey", labels = unique(main_df$Segment6),col=contrast[trueccolor5], main = "HAC, mapping, col=true, no=cluster")
add.cluster.boundaries(scaled_df.SOM, fit_hc.cut2, lwd = 3, lty = 2, col=contrast[4]) #name your pallette and choose color...
legend("bottom", col = c("black", "darkviolet", "blue","cyan","seagreen"), legend = c("1 -best","2","3","4","5 - worst"), pch = 20, xpd = NA, ncol = 1, bty = "n", inset = 0.01, pt.cex = 1.5)
# Test the accuracy of the HAC solution. The cluster numbering may not match the segment ordering, but that does not mean the clustering itself is off. Inspect the dataset with cluster assignments.
main_df$Segment6 <- main_df$Segment6 %>% as.factor()
summary(main_df$Segment6) # and with guidance from the plots above:
## 1 (Best) 2 3 4 5 6 (Worst)
## 166 193 130 100 172 211
length(which(main_df$clustersHac == 1)) # Over by 8
## [1] 174
length(which(main_df$clustersHac == 5)) # Over by 42
## [1] 235
length(which(main_df$clustersHac == 4)) # Over by 19
## [1] 149
length(which(main_df$clustersHac == 2)) # Over by 53
## [1] 153
length(which(main_df$clustersHac == 3)) # Under by 122
## [1] 261
table3 <- data.frame(clusterHac = c(length(which(main_df$clustersHac == 1)),length(which(main_df$clustersHac == 5)),length(which(main_df$clustersHac == 4)),length(which(main_df$clustersHac == 2)),length(which(main_df$clustersHac == 3))))
#Then:
main_df$testhc <- 1 # start by marking every row as "wrong" (i.e. 1).
# A cluster assignment can only be right in one way, but wrong in (k - 1) ways,
# so it is easier to look for the "rights" rather than the "wrongs".
main_df$testhc[main_df$clustersHac == 1 & main_df$trueN_k5 == 1] <- 0 # cluster 1 corresponds to true class 1, so mark as right (0).
main_df$testhc[main_df$clustersHac == 5 & main_df$trueN_k5 == 2] <- 0 # cluster 5 corresponds to true class 2.
main_df$testhc[main_df$clustersHac == 4 & main_df$trueN_k5 == 3] <- 0 # cluster 4 corresponds to true class 3.
main_df$testhc[main_df$clustersHac == 2 & main_df$trueN_k5 == 4] <- 0 # cluster 2 corresponds to true class 4.
main_df$testhc[main_df$clustersHac == 3 & main_df$trueN_k5 == 5] <- 0 # cluster 3 corresponds to true class 5.
table4 <- data.frame(truecluster_5 = c(length(which(main_df$trueN_k5 == 1)),length(which(main_df$trueN_k5 == 2)),length(which(main_df$trueN_k5 == 3)),length(which(main_df$trueN_k5 == 4)),length(which(main_df$trueN_k5 == 5))))
Residual <- table4 - table3
testHC <- sum(main_df$testhc)
testHC
## [1] 354
Accuracy_Score <- 1-(testHC)/972
Accuracy_Score # how accurate is the HAC model
## [1] 0.6358025
HAC5_df <- data.frame(k = c("1 -best","2","3","4","5 - worst"),table3,table4, Residual, Accuracy_Score)
colnames(HAC5_df)
## [1] "k" "clusterHac" "truecluster_5" "truecluster_5.1"
## [5] "Accuracy_Score"
HAC5_df <- HAC5_df %>% rename(Residual = truecluster_5.1)
HAC5_df
## k clusterHac truecluster_5 Residual Accuracy_Score
## 1 1 -best 174 166 -8 0.6358025
## 2 2 235 193 -42 0.6358025
## 3 3 149 130 -19 0.6358025
## 4 4 153 100 -53 0.6358025
## 5 5 - worst 261 383 122 0.6358025
write.csv(HAC5_df,"Table 1.5")
cat("After testing how well the model fitted the original dataset, we evaluate k=3 has a higher accuracy then k=5, so we use k=3. However, in k =5, the best customers (cluster number 1, is fitted well at 95%.")
## After testing how well the model fitted the original dataset, we evaluate k=3 has a higher accuracy then k=5, so we use k=3. However, in k =5, the best customers (cluster number 1, is fitted well at 95%.
str(scaled_df)
## 'data.frame': 972 obs. of 11 variables:
## $ customer_id : int 80 90 130 190 220 290 330 400 410 480 ...
## $ recency : num -1.207 -0.888 0.815 0.524 1.349 ...
## $ logfreq : num 1.0144 1.0144 -1.1626 -0.0741 -1.1626 ...
## $ logmonetary : num 0.893 1.678 0.554 0.675 -0.499 ...
## $ BCPower_monetary: num 0.196 0.161 0.213 0.206 0.277 ...
## $ R_score : num 4 3 2 2 1 2 2 1 3 4 ...
## $ F_score : num 4 4 1 2 1 2 3 1 4 3 ...
## $ M_score : num 4 4 4 4 2 2 2 1 2 3 ...
## $ score : num 444 344 214 224 112 222 232 111 342 433 ...
## $ Segment3 : chr "1 (Best)" "1 (Best)" "2" "2" ...
## $ Segment6 : chr "1 (Best)" "1 (Best)" "4" "4" ...
## - attr(*, "na.action")= 'omit' Named int [1:28] 21 33 78 79 80 81 82 83 84 85 ...
## ..- attr(*, "names")= chr [1:28] "21" "33" "78" "79" ...
y_km <- main_df["clustersKm"]
y_hc <- main_df["clustersHac"]
exp = scaled_df[,c(2,3,4)] %>% add_column(y_km,.after = "logmonetary")
exp = exp %>% add_column(y_hc,.after = "clustersKm")
str(exp)
## 'data.frame': 972 obs. of 5 variables:
## $ recency : num -1.207 -0.888 0.815 0.524 1.349 ...
## $ logfreq : num 1.0144 1.0144 -1.1626 -0.0741 -1.1626 ...
## $ logmonetary: num 0.893 1.678 0.554 0.675 -0.499 ...
## $ clustersKm : int 3 1 2 2 2 2 3 2 3 3 ...
## $ clustersHac: int 1 1 3 2 3 3 5 3 5 1 ...
set.seed(123)
validation_index = createDataPartition(exp$clustersKm, p = 0.8, list=FALSE)
training = exp[validation_index,]
testing = exp[-validation_index,]
# Predicting KM
library(rpart)
##
## Attaching package: 'rpart'
## The following object is masked from 'package:dendextend':
##
## prune
library(rpart.plot)
fit_km <- rpart(clustersKm ~ recency + logfreq + logmonetary, data = training, method = 'class')
rpart.plot(fit_km, extra = 104)
predict_unseen1 <-predict(fit_km, testing, type = 'class')
table_mat <- table(testing$clustersKm, predict_unseen1)
write.csv(table_mat,"Prediction Model Kmeans")
accuracy_Test1 <- sum(diag(table_mat)) / sum(table_mat)
print(paste('Accuracy for test', accuracy_Test1))
## [1] "Accuracy for test 0.917525773195876"
# Predicting HC
fit_hc <- rpart(clustersHac ~ recency + logfreq + logmonetary, data = training, method = 'class')
rpart.plot(fit_hc, extra = 104)
predict_unseen2 <-predict(fit_hc, testing, type = 'class')
table_matrix <- table(testing$clustersHac, predict_unseen2)
write.csv(table_matrix,"Prediction Model HAC")
accuracy_Test2 <- sum(diag(table_matrix)) / sum(table_matrix)
print(paste('Accuracy for test', accuracy_Test2))
## [1] "Accuracy for test 0.912371134020619"