library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.2 --
## v ggplot2 3.4.0 v purrr 0.3.4
## v tibble 3.1.8 v dplyr 1.0.10
## v tidyr 1.2.1 v stringr 1.4.1
## v readr 2.1.2 v forcats 0.5.2
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(rlang)
##
## Attaching package: 'rlang'
##
## The following objects are masked from 'package:purrr':
##
## %@%, as_function, flatten, flatten_chr, flatten_dbl, flatten_int,
## flatten_lgl, flatten_raw, invoke, splice
library(ggplot2)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(StatMatch)
## Loading required package: proxy
##
## Attaching package: 'proxy'
##
## The following objects are masked from 'package:stats':
##
## as.dist, dist
##
## The following object is masked from 'package:base':
##
## as.matrix
##
## Loading required package: survey
## Loading required package: grid
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Loading required package: survival
##
## Attaching package: 'survey'
##
## The following object is masked from 'package:graphics':
##
## dotchart
##
## Loading required package: lpSolve
dt <- read.csv("C:/Users/ASUS/OneDrive/Documents/STK SMT 5/Tugas/Individu/PSD/Data Mall_Customer.csv", sep = ";")
dt1 <- dt[,-c(1,2)]
str(dt1)
## 'data.frame': 200 obs. of 3 variables:
## $ Age : int 19 21 20 23 31 22 35 23 64 30 ...
## $ Annual.Income : int 15 15 16 16 17 17 18 18 19 19 ...
## $ Spending.Score: int 39 81 6 77 40 76 6 94 3 72 ...
head(dt1)
## Age Annual.Income Spending.Score
## 1 19 15 39
## 2 21 15 81
## 3 20 16 6
## 4 23 16 77
## 5 31 17 40
## 6 22 17 76
data.stdz = scale(dt1)
apply(data.stdz, 2, mean) #cek mean = 0
## Age Annual.Income Spending.Score
## -1.016906e-16 -8.144310e-17 -1.096708e-16
apply(data.stdz, 2, sd) # cek
## Age Annual.Income Spending.Score
## 1 1 1
fviz_nbclust(data.stdz, FUNcluster = hcut, method = "silhouette", hc_method = "complete", hc_metric="euclidean")
fviz_nbclust(data.stdz, FUNcluster = hcut, method = "silhouette", hc_method = "average", hc_metric="euclidean")
fviz_nbclust(data.stdz, FUNcluster = hcut, method = "silhouette", hc_method = "centroid", hc_metric="euclidean")
fviz_nbclust(data.stdz, FUNcluster = hcut, method = "silhouette", hc_method = "single", hc_metric="euclidean")
fviz_dend(hclust(dist(data.stdz, method = "euclidean"), method = "complete"))
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## i The deprecated feature was likely used in the factoextra package.
## Please report the issue at <]8;;https://github.com/kassambara/factoextra/issueshttps://github.com/kassambara/factoextra/issues]8;;>.
#Interpretasi, utk melihat anggota dari setiap gerombol
hc.data <- eclust(dt1, stand = TRUE, FUNcluster = "hclust", k=2, hc_method = "complete", hc_metric = "euclidean", graph = F)
hc.data$cluster #cluster dari setiap pengamatan
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [75] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [112] 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2
## [149] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [186] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
aggregate(dt1, by=list(cluster=hc.data$cluster), FUN = mean)
## cluster Age Annual.Income Spending.Score
## 1 1 39.96825 44.83333 49.46032
## 2 2 36.94595 87.33784 51.45946
fviz_cluster(hc.data)
# Penggerombolan NON-HIRARKI (K-Means) ##
K-Means pemilihan cluster optimum
fviz_nbclust(dt1, FUNcluster = hcut, method = "wss")
kmeans <- eclust(dt1, stand = F, FUNcluster = "kmeans", k = 6, graph = F)
kmeans$cluster
## [1] 3 3 6 3 6 3 6 3 6 3 6 3 6 3 6 3 6 3 6 3 6 3 6 3 6 3 6 3 6 3 6 3 6 3 6 3 6
## [38] 3 6 3 6 3 6 3 6 3 6 6 6 6 6 3 6 6 6 6 6 6 6 6 6 3 6 6 6 3 6 6 3 6 6 6 6 6
## [75] 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
## [112] 6 6 6 6 6 6 6 6 6 6 6 6 5 1 5 2 5 2 5 2 5 1 5 1 5 2 5 1 5 2 5 1 5 1 5 2 5
## [149] 1 5 2 5 2 5 2 5 1 5 1 5 2 5 1 5 2 5 2 5 2 5 2 5 1 5 2 5 2 5 2 5 4 5 4 5 4
## [186] 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5
kmeans$centers
## Age Annual.Income Spending.Score
## 1 27.27273 76.72727 15.36364
## 2 48.52941 80.29412 18.76471
## 3 24.82143 28.71429 74.25000
## 4 41.00000 109.70000 22.00000
## 5 32.69231 86.53846 82.12821
## 6 44.89474 48.70526 42.63158
aggregate(dt1, by=list(cluster=kmeans$cluster), FUN = mean)
## cluster Age Annual.Income Spending.Score
## 1 1 27.27273 76.72727 15.36364
## 2 2 48.52941 80.29412 18.76471
## 3 3 24.82143 28.71429 74.25000
## 4 4 41.00000 109.70000 22.00000
## 5 5 32.69231 86.53846 82.12821
## 6 6 44.89474 48.70526 42.63158
fviz_cluster(kmeans)