MENENTUKAN GEROMBOL

Package

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.2 --
## v ggplot2 3.4.0      v purrr   0.3.4 
## v tibble  3.1.8      v dplyr   1.0.10
## v tidyr   1.2.1      v stringr 1.4.1 
## v readr   2.1.2      v forcats 0.5.2 
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(rlang)
## 
## Attaching package: 'rlang'
## 
## The following objects are masked from 'package:purrr':
## 
##     %@%, as_function, flatten, flatten_chr, flatten_dbl, flatten_int,
##     flatten_lgl, flatten_raw, invoke, splice
library(ggplot2)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(StatMatch)
## Loading required package: proxy
## 
## Attaching package: 'proxy'
## 
## The following objects are masked from 'package:stats':
## 
##     as.dist, dist
## 
## The following object is masked from 'package:base':
## 
##     as.matrix
## 
## Loading required package: survey
## Loading required package: grid
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Loading required package: survival
## 
## Attaching package: 'survey'
## 
## The following object is masked from 'package:graphics':
## 
##     dotchart
## 
## Loading required package: lpSolve

Input Data

dt <- read.csv("C:/Users/ASUS/OneDrive/Documents/STK SMT 5/Tugas/Individu/PSD/Data Mall_Customer.csv", sep = ";")
dt1 <- dt[,-c(1,2)]
str(dt1)
## 'data.frame':    200 obs. of  3 variables:
##  $ Age           : int  19 21 20 23 31 22 35 23 64 30 ...
##  $ Annual.Income : int  15 15 16 16 17 17 18 18 19 19 ...
##  $ Spending.Score: int  39 81 6 77 40 76 6 94 3 72 ...
head(dt1)
##   Age Annual.Income Spending.Score
## 1  19            15             39
## 2  21            15             81
## 3  20            16              6
## 4  23            16             77
## 5  31            17             40
## 6  22            17             76
data.stdz = scale(dt1)
apply(data.stdz, 2, mean) #cek mean = 0
##            Age  Annual.Income Spending.Score 
##  -1.016906e-16  -8.144310e-17  -1.096708e-16
apply(data.stdz, 2, sd) # cek 
##            Age  Annual.Income Spending.Score 
##              1              1              1

Penggerombolan HIRARKI

Pemilihan Banyaknya Klaster

Complete Lingkage

fviz_nbclust(data.stdz, FUNcluster = hcut, method = "silhouette", hc_method = "complete", hc_metric="euclidean")

Average Lingkage

fviz_nbclust(data.stdz, FUNcluster = hcut, method = "silhouette", hc_method = "average", hc_metric="euclidean")

Centroid Lingkage

fviz_nbclust(data.stdz, FUNcluster = hcut, method = "silhouette", hc_method = "centroid", hc_metric="euclidean")

Single Lingkage

fviz_nbclust(data.stdz, FUNcluster = hcut, method = "silhouette", hc_method = "single", hc_metric="euclidean")

Memilih Complete Lingkage

Menampilkan Dendogram

fviz_dend(hclust(dist(data.stdz, method = "euclidean"), method = "complete"))
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## i The deprecated feature was likely used in the factoextra package.
##   Please report the issue at <]8;;https://github.com/kassambara/factoextra/issueshttps://github.com/kassambara/factoextra/issues]8;;>.

#Interpretasi, utk melihat anggota dari setiap gerombol
hc.data <- eclust(dt1, stand = TRUE, FUNcluster = "hclust", k=2, hc_method = "complete", hc_metric = "euclidean", graph = F)

hc.data$cluster #cluster dari setiap pengamatan
##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [75] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [112] 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2
## [149] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [186] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
aggregate(dt1, by=list(cluster=hc.data$cluster), FUN = mean)
##   cluster      Age Annual.Income Spending.Score
## 1       1 39.96825      44.83333       49.46032
## 2       2 36.94595      87.33784       51.45946
fviz_cluster(hc.data)

# Penggerombolan NON-HIRARKI (K-Means) ## K-Means pemilihan cluster optimum

fviz_nbclust(dt1, FUNcluster = hcut, method = "wss")

kmeans <- eclust(dt1, stand = F, FUNcluster = "kmeans", k = 6, graph = F)
kmeans$cluster
##   [1] 3 3 6 3 6 3 6 3 6 3 6 3 6 3 6 3 6 3 6 3 6 3 6 3 6 3 6 3 6 3 6 3 6 3 6 3 6
##  [38] 3 6 3 6 3 6 3 6 3 6 6 6 6 6 3 6 6 6 6 6 6 6 6 6 3 6 6 6 3 6 6 3 6 6 6 6 6
##  [75] 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
## [112] 6 6 6 6 6 6 6 6 6 6 6 6 5 1 5 2 5 2 5 2 5 1 5 1 5 2 5 1 5 2 5 1 5 1 5 2 5
## [149] 1 5 2 5 2 5 2 5 1 5 1 5 2 5 1 5 2 5 2 5 2 5 2 5 1 5 2 5 2 5 2 5 4 5 4 5 4
## [186] 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5
kmeans$centers
##        Age Annual.Income Spending.Score
## 1 27.27273      76.72727       15.36364
## 2 48.52941      80.29412       18.76471
## 3 24.82143      28.71429       74.25000
## 4 41.00000     109.70000       22.00000
## 5 32.69231      86.53846       82.12821
## 6 44.89474      48.70526       42.63158
aggregate(dt1, by=list(cluster=kmeans$cluster), FUN = mean)
##   cluster      Age Annual.Income Spending.Score
## 1       1 27.27273      76.72727       15.36364
## 2       2 48.52941      80.29412       18.76471
## 3       3 24.82143      28.71429       74.25000
## 4       4 41.00000     109.70000       22.00000
## 5       5 32.69231      86.53846       82.12821
## 6       6 44.89474      48.70526       42.63158

Visualisasi K-Means

fviz_cluster(kmeans)