1. Pendahuluan
2. Data
library(ggplot2)## Warning: package 'ggplot2' was built under R version 4.1.3
library(factoextra)## Warning: package 'factoextra' was built under R version 4.1.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(knitr)## Warning: package 'knitr' was built under R version 4.1.3
library(dplyr)## Warning: package 'dplyr' was built under R version 4.1.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggcorrplot)
# Load the mall-customer data set; the file is semicolon-delimited.
# NOTE(review): absolute, machine-specific path -- the script only runs where
# this exact file exists.
datapsd_2 <- read.csv("C:/Users/user/Downloads/Data Mall_Customer.csv", sep = ";")
datapsd_2## CustomerID Genre Age Annual.Income Spending.Score
## 1 1 Male 19 15 39
## 2 2 Male 21 15 81
## 3 3 Female 20 16 6
## 4 4 Female 23 16 77
## 5 5 Female 31 17 40
## 6 6 Female 22 17 76
## 7 7 Female 35 18 6
## 8 8 Female 23 18 94
## 9 9 Male 64 19 3
## 10 10 Female 30 19 72
## 11 11 Male 67 19 14
## 12 12 Female 35 19 99
## 13 13 Female 58 20 15
## 14 14 Female 24 20 77
## 15 15 Male 37 20 13
## 16 16 Male 22 20 79
## 17 17 Female 35 21 35
## 18 18 Male 20 21 66
## 19 19 Male 52 23 29
## 20 20 Female 35 23 98
## 21 21 Male 35 24 35
## 22 22 Male 25 24 73
## 23 23 Female 46 25 5
## 24 24 Male 31 25 73
## 25 25 Female 54 28 14
## 26 26 Male 29 28 82
## 27 27 Female 45 28 32
## 28 28 Male 35 28 61
## 29 29 Female 40 29 31
## 30 30 Female 23 29 87
## 31 31 Male 60 30 4
## 32 32 Female 21 30 73
## 33 33 Male 53 33 4
## 34 34 Male 18 33 92
## 35 35 Female 49 33 14
## 36 36 Female 21 33 81
## 37 37 Female 42 34 17
## 38 38 Female 30 34 73
## 39 39 Female 36 37 26
## 40 40 Female 20 37 75
## 41 41 Female 65 38 35
## 42 42 Male 24 38 92
## 43 43 Male 48 39 36
## 44 44 Female 31 39 61
## 45 45 Female 49 39 28
## 46 46 Female 24 39 65
## 47 47 Female 50 40 55
## 48 48 Female 27 40 47
## 49 49 Female 29 40 42
## 50 50 Female 31 40 42
## 51 51 Female 49 42 52
## 52 52 Male 33 42 60
## 53 53 Female 31 43 54
## 54 54 Male 59 43 60
## 55 55 Female 50 43 45
## 56 56 Male 47 43 41
## 57 57 Female 51 44 50
## 58 58 Male 69 44 46
## 59 59 Female 27 46 51
## 60 60 Male 53 46 46
## 61 61 Male 70 46 56
## 62 62 Male 19 46 55
## 63 63 Female 67 47 52
## 64 64 Female 54 47 59
## 65 65 Male 63 48 51
## 66 66 Male 18 48 59
## 67 67 Female 43 48 50
## 68 68 Female 68 48 48
## 69 69 Male 19 48 59
## 70 70 Female 32 48 47
## 71 71 Male 70 49 55
## 72 72 Female 47 49 42
## 73 73 Female 60 50 49
## 74 74 Female 60 50 56
## 75 75 Male 59 54 47
## 76 76 Male 26 54 54
## 77 77 Female 45 54 53
## 78 78 Male 40 54 48
## 79 79 Female 23 54 52
## 80 80 Female 49 54 42
## 81 81 Male 57 54 51
## 82 82 Male 38 54 55
## 83 83 Male 67 54 41
## 84 84 Female 46 54 44
## 85 85 Female 21 54 57
## 86 86 Male 48 54 46
## 87 87 Female 55 57 58
## 88 88 Female 22 57 55
## 89 89 Female 34 58 60
## 90 90 Female 50 58 46
## 91 91 Female 68 59 55
## 92 92 Male 18 59 41
## 93 93 Male 48 60 49
## 94 94 Female 40 60 40
## 95 95 Female 32 60 42
## 96 96 Male 24 60 52
## 97 97 Female 47 60 47
## 98 98 Female 27 60 50
## 99 99 Male 48 61 42
## 100 100 Male 20 61 49
## 101 101 Female 23 62 41
## 102 102 Female 49 62 48
## 103 103 Male 67 62 59
## 104 104 Male 26 62 55
## 105 105 Male 49 62 56
## 106 106 Female 21 62 42
## 107 107 Female 66 63 50
## 108 108 Male 54 63 46
## 109 109 Male 68 63 43
## 110 110 Male 66 63 48
## 111 111 Male 65 63 52
## 112 112 Female 19 63 54
## 113 113 Female 38 64 42
## 114 114 Male 19 64 46
## 115 115 Female 18 65 48
## 116 116 Female 19 65 50
## 117 117 Female 63 65 43
## 118 118 Female 49 65 59
## 119 119 Female 51 67 43
## 120 120 Female 50 67 57
## 121 121 Male 27 67 56
## 122 122 Female 38 67 40
## 123 123 Female 40 69 58
## 124 124 Male 39 69 91
## 125 125 Female 23 70 29
## 126 126 Female 31 70 77
## 127 127 Male 43 71 35
## 128 128 Male 40 71 95
## 129 129 Male 59 71 11
## 130 130 Male 38 71 75
## 131 131 Male 47 71 9
## 132 132 Male 39 71 75
## 133 133 Female 25 72 34
## 134 134 Female 31 72 71
## 135 135 Male 20 73 5
## 136 136 Female 29 73 88
## 137 137 Female 44 73 7
## 138 138 Male 32 73 73
## 139 139 Male 19 74 10
## 140 140 Female 35 74 72
## 141 141 Female 57 75 5
## 142 142 Male 32 75 93
## 143 143 Female 28 76 40
## 144 144 Female 32 76 87
## 145 145 Male 25 77 12
## 146 146 Male 28 77 97
## 147 147 Male 48 77 36
## 148 148 Female 32 77 74
## 149 149 Female 34 78 22
## 150 150 Male 34 78 90
## 151 151 Male 43 78 17
## 152 152 Male 39 78 88
## 153 153 Female 44 78 20
## 154 154 Female 38 78 76
## 155 155 Female 47 78 16
## 156 156 Female 27 78 89
## 157 157 Male 37 78 1
## 158 158 Female 30 78 78
## 159 159 Male 34 78 1
## 160 160 Female 30 78 73
## 161 161 Female 56 79 35
## 162 162 Female 29 79 83
## 163 163 Male 19 81 5
## 164 164 Female 31 81 93
## 165 165 Male 50 85 26
## 166 166 Female 36 85 75
## 167 167 Male 42 86 20
## 168 168 Female 33 86 95
## 169 169 Female 36 87 27
## 170 170 Male 32 87 63
## 171 171 Male 40 87 13
## 172 172 Male 28 87 75
## 173 173 Male 36 87 10
## 174 174 Male 36 87 92
## 175 175 Female 52 88 13
## 176 176 Female 30 88 86
## 177 177 Male 58 88 15
## 178 178 Male 27 88 69
## 179 179 Male 59 93 14
## 180 180 Male 35 93 90
## 181 181 Female 37 97 32
## 182 182 Female 32 97 86
## 183 183 Male 46 98 15
## 184 184 Female 29 98 88
## 185 185 Female 41 99 39
## 186 186 Male 30 99 97
## 187 187 Female 54 101 24
## 188 188 Male 28 101 68
## 189 189 Female 41 103 17
## 190 190 Female 36 103 85
## 191 191 Female 34 103 23
## 192 192 Female 32 103 69
## 193 193 Male 33 113 8
## 194 194 Female 38 113 91
## 195 195 Female 47 120 16
## 196 196 Female 35 120 79
## 197 197 Female 45 126 28
## 198 198 Male 32 126 74
## 199 199 Male 32 137 18
## 200 200 Male 30 137 83
str(datapsd_2)## 'data.frame': 200 obs. of 5 variables:
## $ CustomerID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Genre : chr "Male" "Male" "Female" "Female" ...
## $ Age : int 19 21 20 23 31 22 35 23 64 30 ...
## $ Annual.Income : int 15 15 16 16 17 17 18 18 19 19 ...
## $ Spending.Score: int 39 81 6 77 40 76 6 94 3 72 ...
datapsd_3 <- datapsd_2[,-c(2)]
Standardisasi peubah merupakan proses transformasi peubah menjadi peubah yang memiliki rata-rata nol dan simpangan baku satu. Proses standardisasi dilakukan apabila terdapat perbedaan satuan pengukuran pada peubah-peubah yang digunakan. Standardisasi dilakukan karena metode k-means menggunakan konsep jarak antarobjek/amatan yang sensitif terhadap satuan pengukuran.
# Standardise the variables: they are measured on different scales and the
# clustering below is distance-based.
# Drop the first two columns (CustomerID and Genre) so only the numeric
# clustering variables remain.
datapsd <- datapsd_2[, -c(1:2)]
datapsd.stdz <- scale(datapsd)
apply(datapsd.stdz, 2, mean) # check: column means should be (numerically) 0
## Age Annual.Income Spending.Score
## -1.016906e-16 -8.144310e-17 -1.096708e-16
apply(datapsd.stdz, 2, sd) # cek ## Age Annual.Income Spending.Score
## 1 1 1
3. Eksplorasi Data
4. Analisis Gerombol Berhierarki
Korelasi
GGally::ggpairs(datapsd)## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
# Mean of each clustering variable, one row per variable, for plotting.
dfall <- data.frame(
  Var = c("Age", "Annual.Income", "Spending.Score"),
  Mean = c(
    mean(datapsd$Age),
    mean(datapsd$Annual.Income),
    mean(datapsd$Spending.Score)
  )
)
# View() opens an interactive data viewer, so only call it when a user is
# actually at the console (not while the report is being rendered).
if (interactive()) {
  View(dfall)
}
dfall$Var <- as.factor(dfall$Var)

# Bar chart of the three means on a fixed 0-100 y axis.
ggplot(dfall, aes(x = Var, y = Mean, fill = Var)) +
  geom_bar(position = "dodge", stat = "identity", col = "black") +
  labs(title = "Mean Value for All Variable") +
  ylim(0, 100)

# Pairwise correlation matrix of the clustering variables.
korelasi <- cor(datapsd)
korelasi## Age Annual.Income Spending.Score
## Age 1.00000000 -0.012398043 -0.327226846
## Annual.Income -0.01239804 1.000000000 0.009902848
## Spending.Score -0.32722685 0.009902848 1.000000000
library(corrplot)## Warning: package 'corrplot' was built under R version 4.1.2
## corrplot 0.92 loaded
# `korelasi` already is the correlation matrix, so plot it directly; calling
# cor() on it again would display correlations between its columns instead of
# the correlations between the variables.
corrplot(corr = korelasi, method = "number", type = "upper")
# Same matrix as a labelled lower-triangle heat map.
ggcorrplot(korelasi, type = "lower", lab = TRUE)
Pemilihan Banyak Kluster
Complete Linkage Jarak Euclidean Metode Silhouette
fviz_nbclust(datapsd.stdz, FUNcluster = hcut, method = "silhouette", hc_method = "complete", hc_metric="euclidean")
Interpretasi: Jumlah kluster optimal pada complete linkage dengan jarak Euclidean berdasarkan metode silhouette adalah 5 kluster. Hal ini karena koefisien silhouette rata-rata tertinggi dicapai pada jumlah kluster tersebut; semakin besar nilai koefisien, semakin sesuai kluster yang terbentuk.
Average Linkage Jarak Euclidean Metode Silhouette
fviz_nbclust(datapsd.stdz, FUNcluster = hcut, method = "silhouette", hc_method = "average", hc_metric="euclidean")
Interpretasi: Jumlah kluster optimal pada average linkage dengan jarak Euclidean berdasarkan metode silhouette adalah 5 kluster. Hal ini karena koefisien silhouette rata-rata tertinggi dicapai pada jumlah kluster tersebut; semakin besar nilai koefisien, semakin sesuai kluster yang terbentuk.
Centroid Linkage Jarak Euclidean Metode Silhouette
fviz_nbclust(datapsd.stdz, FUNcluster = hcut, method = "silhouette", hc_method = "centroid", hc_metric="euclidean")
Interpretasi: Jumlah kluster optimal pada centroid linkage dengan jarak Euclidean berdasarkan metode silhouette adalah 2 kluster. Hal ini karena koefisien silhouette rata-rata tertinggi dicapai pada jumlah kluster tersebut; semakin besar nilai koefisien, semakin sesuai kluster yang terbentuk.
Single Linkage Jarak Euclidean Metode Silhouette
fviz_nbclust(datapsd.stdz, FUNcluster = hcut, method = "silhouette", hc_method = "single", hc_metric="euclidean")
Interpretasi: Jumlah kluster optimal pada single linkage dengan jarak Euclidean berdasarkan metode silhouette adalah 2 kluster. Hal ini karena koefisien silhouette rata-rata tertinggi dicapai pada jumlah kluster tersebut; semakin besar nilai koefisien, semakin sesuai kluster yang terbentuk.
Memunculkan Dendrogram
Complete Linkage Jarak Euclidean
fviz_dend(hclust(dist(datapsd.stdz, method = "euclidean"), method = "complete"))## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
Average Linkage Jarak Euclidean
fviz_dend(hclust(dist(datapsd.stdz, method = "euclidean"), method = "average"))## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
Centroid Linkage Jarak Euclidean
fviz_dend(hclust(dist(datapsd.stdz, method = "euclidean"), method = "centroid"))## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
Single Linkage Jarak Euclidean
fviz_dend(hclust(dist(datapsd.stdz, method = "euclidean"), method = "single"))## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
## Interpretasi tiap Individu ke Kluster
### Complete
# Interpretation: cut the complete-linkage tree into k = 5 groups so the
# members of each cluster can be inspected. Plotting is suppressed here.
hc.data.com <- eclust(
  datapsd,
  stand = TRUE,
  FUNcluster = "hclust",
  k = 5,
  hc_method = "complete",
  hc_metric = "euclidean",
  graph = FALSE # spelled out: `F` is an ordinary variable that can be reassigned
)
hc.data.com$cluster #cluster dari setiap pengamatan## [1] 1 2 1 2 1 2 1 2 3 2 3 2 3 2 1 2 1 2 3 2 1 2 3 2 3 2 3 1 3 2 3 2 3 2 3 2 3
## [38] 2 3 2 3 2 3 1 3 2 3 1 1 1 3 1 1 3 3 3 3 3 1 3 3 1 3 3 3 1 1 3 1 1 3 3 3 3
## [75] 3 1 1 1 1 3 3 1 3 3 1 3 3 1 1 3 3 1 3 1 1 1 3 1 3 1 1 3 3 1 3 1 3 3 3 3 3
## [112] 1 1 1 1 1 3 3 3 3 1 1 1 4 1 4 5 4 5 4 5 4 1 4 5 4 5 4 5 4 5 4 1 4 5 4 5 4
## [149] 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5
## [186] 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4
# Pair every observation index with its complete-linkage cluster label.
# The index is derived from the cluster vector instead of a hard-coded 1:200,
# so the table stays valid if the number of rows in the data changes.
cekcom <- data.frame(
  Observasi = seq_along(hc.data.com$cluster),
  Kluster = hc.data.com$cluster
)
cekcom$Observasi <- as.character(cekcom$Observasi)
cekcom$Kluster <- as.character(cekcom$Kluster)

# Row positions of the members of each cluster, one list element per cluster.
com <- cekcom %>%
  group_by(Kluster) %>%
  group_rows()
com## <list_of<integer>[5]>
## [[1]]
## [1] 1 3 5 7 15 17 21 28 44 48 49 50 52 53 59 62 66 67 69
## [20] 70 76 77 78 79 82 85 88 89 92 94 95 96 98 100 101 104 106 112
## [39] 113 114 115 116 121 122 123 125 133 143
##
## [[2]]
## [1] 2 4 6 8 10 12 14 16 18 20 22 24 26 30 32 34 36 38 40 42 46
##
## [[3]]
## [1] 9 11 13 19 23 25 27 29 31 33 35 37 39 41 43 45 47 51 54
## [20] 55 56 57 58 60 61 63 64 65 68 71 72 73 74 75 80 81 83 84
## [39] 86 87 90 91 93 97 99 102 103 105 107 108 109 110 111 117 118 119 120
##
## [[4]]
## [1] 124 126 128 130 132 134 136 138 140 142 144 146 148 150 152 154 156 158 160
## [20] 162 164 166 168 170 172 174 176 178 180 182 184 186 188 190 192 194 196 198
## [39] 200
##
## [[5]]
## [1] 127 129 131 135 137 139 141 145 147 149 151 153 155 157 159 161 163 165 167
## [20] 169 171 173 175 177 179 181 183 185 187 189 191 193 195 197 199
cekcom%>% count(Kluster)## Kluster n
## 1 1 48
## 2 2 21
## 3 3 57
## 4 4 39
## 5 5 35
Average
# Interpretation: cut the average-linkage tree into k = 5 groups so the
# members of each cluster can be inspected. Plotting is suppressed here.
hc.data.ave <- eclust(
  datapsd,
  stand = TRUE,
  FUNcluster = "hclust",
  k = 5,
  hc_method = "average",
  hc_metric = "euclidean",
  graph = FALSE # spelled out: `F` is an ordinary variable that can be reassigned
)
hc.data.ave$cluster #cluster dari setiap pengamatan## [1] 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1
## [38] 2 1 2 3 2 3 2 3 2 3 2 2 2 3 2 2 3 3 3 3 3 2 3 3 2 3 3 3 2 3 3 2 2 3 3 3 3
## [75] 3 2 3 3 2 3 3 3 3 3 2 3 3 2 3 3 3 2 3 3 3 2 3 2 3 2 2 3 3 2 3 2 3 3 3 3 3
## [112] 2 3 2 2 2 3 3 3 3 2 3 3 4 2 4 5 4 5 4 5 4 2 4 5 4 5 4 5 4 5 4 2 4 5 4 5 4
## [149] 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5
## [186] 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4
# Pair every observation index with its average-linkage cluster label.
# The index is derived from the cluster vector instead of a hard-coded 1:200,
# so the table stays valid if the number of rows in the data changes.
cekave <- data.frame(
  Observasi = seq_along(hc.data.ave$cluster),
  Kluster = hc.data.ave$cluster
)
cekave$Observasi <- as.character(cekave$Observasi)
cekave$Kluster <- as.character(cekave$Kluster)

# Row positions of the members of each cluster, one list element per cluster.
ave <- cekave %>%
  group_by(Kluster) %>%
  group_rows()
ave## <list_of<integer>[5]>
## [[1]]
## [1] 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31 33 35 37 39
##
## [[2]]
## [1] 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30 32 34 36 38
## [20] 40 42 44 46 48 49 50 52 53 59 62 66 69 70 76 79 85 88 92
## [39] 96 98 100 101 104 106 112 114 115 116 121 125 133 143
##
## [[3]]
## [1] 41 43 45 47 51 54 55 56 57 58 60 61 63 64 65 67 68 71 72
## [20] 73 74 75 77 78 80 81 82 83 84 86 87 89 90 91 93 94 95 97
## [39] 99 102 103 105 107 108 109 110 111 113 117 118 119 120 122 123
##
## [[4]]
## [1] 124 126 128 130 132 134 136 138 140 142 144 146 148 150 152 154 156 158 160
## [20] 162 164 166 168 170 172 174 176 178 180 182 184 186 188 190 192 194 196 198
## [39] 200
##
## [[5]]
## [1] 127 129 131 135 137 139 141 145 147 149 151 153 155 157 159 161 163 165 167
## [20] 169 171 173 175 177 179 181 183 185 187 189 191 193 195 197 199
cekave%>% count(Kluster)## Kluster n
## 1 1 20
## 2 2 52
## 3 3 54
## 4 4 39
## 5 5 35
Centroid
# Interpretation: cut the centroid-linkage tree into k = 2 groups so the
# members of each cluster can be inspected. Plotting is suppressed here.
hc.data.centro <- eclust(
  datapsd,
  stand = TRUE,
  FUNcluster = "hclust",
  k = 2,
  hc_method = "centroid",
  hc_metric = "euclidean",
  graph = FALSE # spelled out: `F` is an ordinary variable that can be reassigned
)
hc.data.centro$cluster #cluster dari setiap pengamatan## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [75] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [112] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [149] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [186] 1 1 1 1 1 1 1 1 1 1 2 1 2 1 2
# Pair every observation index with its centroid-linkage cluster label.
# The index is derived from the cluster vector instead of a hard-coded 1:200,
# so the table stays valid if the number of rows in the data changes.
cekcentro <- data.frame(
  Observasi = seq_along(hc.data.centro$cluster),
  Kluster = hc.data.centro$cluster
)
cekcentro$Observasi <- as.character(cekcentro$Observasi)
cekcentro$Kluster <- as.character(cekcentro$Kluster)

# Row positions of the members of each cluster, one list element per cluster.
centro <- cekcentro %>%
  group_by(Kluster) %>%
  group_rows()
centro## <list_of<integer>[2]>
## [[1]]
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
## [19] 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
## [37] 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54
## [55] 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
## [73] 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90
## [91] 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108
## [109] 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126
## [127] 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144
## [145] 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162
## [163] 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
## [181] 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 197 199
##
## [[2]]
## [1] 196 198 200
cekcentro%>% count(Kluster)## Kluster n
## 1 1 197
## 2 2 3
Single
# Interpretation: cut the single-linkage tree into k = 2 groups so the
# members of each cluster can be inspected. Plotting is suppressed here.
hc.data.sing <- eclust(
  datapsd,
  stand = TRUE,
  FUNcluster = "hclust",
  k = 2,
  hc_method = "single",
  hc_metric = "euclidean",
  graph = FALSE # spelled out: `F` is an ordinary variable that can be reassigned
)
hc.data.sing$cluster #cluster dari setiap pengamatan## [1] 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [75] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [112] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [149] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [186] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
# Pair every observation index with its single-linkage cluster label.
# The index is derived from the cluster vector instead of a hard-coded 1:200,
# so the table stays valid if the number of rows in the data changes.
ceksing <- data.frame(
  Observasi = seq_along(hc.data.sing$cluster),
  Kluster = hc.data.sing$cluster
)
ceksing$Observasi <- as.character(ceksing$Observasi)
ceksing$Kluster <- as.character(ceksing$Kluster)

# Row positions of the members of each cluster, one list element per cluster.
sing <- ceksing %>%
  group_by(Kluster) %>%
  group_rows()
sing## <list_of<integer>[2]>
## [[1]]
## [1] 1 2 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
## [19] 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37
## [37] 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55
## [55] 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73
## [73] 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91
## [91] 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109
## [109] 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127
## [127] 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145
## [145] 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163
## [163] 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181
## [181] 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199
## [199] 200
##
## [[2]]
## [1] 3
ceksing%>% count(Kluster)## Kluster n
## 1 1 199
## 2 2 1
Cluster plot
Complete
fviz_cluster(hc.data.com)
Average
fviz_cluster(hc.data.ave)
Centroid
fviz_cluster(hc.data.centro)
Single
fviz_cluster(hc.data.sing)
Karakteristik
Complete
aggregate(datapsd, by=list(cluster=hc.data.com$cluster), FUN = mean)## cluster Age Annual.Income Spending.Score
## 1 1 28.35417 50.29167 45.93750
## 2 2 24.80952 25.61905 80.23810
## 3 3 55.33333 47.31579 41.08772
## 4 4 32.69231 86.53846 82.12821
## 5 5 41.68571 88.22857 17.28571
Average
aggregate(datapsd, by=list(cluster=hc.data.ave$cluster), FUN = mean)## cluster Age Annual.Income Spending.Score
## 1 1 43.90000 24.45000 19.10000
## 2 2 24.65385 42.94231 62.07692
## 3 3 53.25926 54.20370 48.55556
## 4 4 32.69231 86.53846 82.12821
## 5 5 41.68571 88.22857 17.28571
Centroid
aggregate(datapsd, by=list(cluster=hc.data.centro$cluster), FUN = mean)## cluster Age Annual.Income Spending.Score
## 1 1 38.94924 59.53807 49.76650
## 2 2 32.33333 127.66667 78.66667
Single
aggregate(datapsd, by=list(cluster=hc.data.sing$cluster), FUN = mean)## cluster Age Annual.Income Spending.Score
## 1 1 38.94472 60.78392 50.42211
## 2 2 20.00000 16.00000 6.00000
5. Analisis Gerombol Tak Berhierarki (K-Means)
Penentuan Cluster
# Choose k for k-means from the within-cluster sum of squares plot ...
fviz_nbclust(datapsd.stdz, FUNcluster = kmeans, method = "wss")
# ... and from the silhouette plot.
fviz_nbclust(datapsd.stdz, FUNcluster = kmeans, method = "silhouette")

# k-means solution with k = 6 on the raw variables, standardised via
# stand = TRUE; plotting is suppressed here.
kmeans.data6 <- eclust(
  datapsd,
  stand = TRUE,
  FUNcluster = "kmeans",
  k = 6,
  graph = FALSE # spelled out: `F` is an ordinary variable that can be reassigned
)
kmeans.data6$cluster## [1] 6 3 6 3 6 3 6 3 2 3 2 3 2 3 6 3 6 3 2 3 6 3 2 3 2 3 2 3 6 3 2 3 2 3 2 3 2
## [38] 3 6 3 2 3 2 6 2 3 2 6 6 6 2 6 6 2 2 2 2 2 6 2 2 6 2 2 2 6 2 2 6 6 2 2 2 2
## [75] 2 6 2 6 6 2 2 6 2 2 6 2 2 6 6 2 2 6 2 6 6 6 2 6 2 6 6 2 2 6 2 6 2 2 2 2 2
## [112] 6 6 6 6 6 2 2 2 2 6 6 5 5 6 5 4 5 4 5 4 5 6 5 1 5 4 5 1 5 4 5 6 5 1 5 4 5
## [149] 1 5 4 5 4 5 4 5 1 5 1 5 4 5 1 5 4 5 4 5 1 5 4 5 1 5 4 5 4 5 4 5 1 5 4 5 4
## [186] 5 4 5 4 5 1 5 1 5 4 5 4 5 1 5
# Pair every observation index with its cluster label from the k = 6 solution.
# The index is derived from the cluster vector instead of a hard-coded 1:200,
# so the table stays valid if the number of rows in the data changes.
cekk6 <- data.frame(
  Observasi = seq_along(kmeans.data6$cluster),
  Kluster = kmeans.data6$cluster
)
cekk6$Observasi <- as.character(cekk6$Observasi)
cekk6$Kluster <- as.character(cekk6$Kluster)

# Row positions of the members of each cluster, one list element per cluster.
k6 <- cekk6 %>%
  group_by(Kluster) %>%
  group_rows()
k6## <list_of<integer>[6]>
## [[1]]
## [1] 135 139 145 149 157 159 163 169 173 181 191 193 199
##
## [[2]]
## [1] 9 11 13 19 23 25 27 31 33 35 37 41 43 45 47 51 54 55 56
## [20] 57 58 60 61 63 64 65 67 68 71 72 73 74 75 77 80 81 83 84
## [39] 86 87 90 91 93 97 99 102 103 105 107 108 109 110 111 117 118 119 120
##
## [[3]]
## [1] 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30 32 34 36 38 40 42 46
##
## [[4]]
## [1] 127 129 131 137 141 147 151 153 155 161 165 167 171 175 177 179 183 185 187
## [20] 189 195 197
##
## [[5]]
## [1] 123 124 126 128 130 132 134 136 138 140 142 144 146 148 150 152 154 156 158
## [20] 160 162 164 166 168 170 172 174 176 178 180 182 184 186 188 190 192 194 196
## [39] 198 200
##
## [[6]]
## [1] 1 3 5 7 15 17 21 29 39 44 48 49 50 52 53 59 62 66 69
## [20] 70 76 78 79 82 85 88 89 92 94 95 96 98 100 101 104 106 112 113
## [39] 114 115 116 121 122 125 133 143
cekk6 %>% count(Kluster)## Kluster n
## 1 1 13
## 2 2 57
## 3 3 22
## 4 4 22
## 5 5 40
## 6 6 46
# k-means solution with k = 4 (stand = TRUE); plotting is suppressed here.
kmeans.data4 <- eclust(
  datapsd,
  stand = TRUE,
  FUNcluster = "kmeans",
  k = 4,
  graph = FALSE # spelled out: `F` is an ordinary variable that can be reassigned
)
kmeans.data4$cluster## [1] 3 3 3 3 3 3 2 3 2 3 2 3 2 3 2 3 3 3 2 3 3 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2
## [38] 3 2 3 2 3 2 3 2 3 2 3 3 3 2 3 3 2 2 2 2 2 3 2 2 3 2 2 2 3 2 2 3 3 2 2 2 2
## [75] 2 3 2 2 3 2 2 3 2 2 3 2 2 3 3 2 2 3 2 2 3 3 2 3 2 3 3 2 2 3 2 3 2 2 2 2 2
## [112] 3 1 3 3 3 2 2 2 2 3 1 4 4 1 4 1 4 2 4 1 4 1 4 1 4 1 4 1 4 1 4 1 4 1 4 1 4
## [149] 1 4 1 4 1 4 1 4 1 4 1 4 2 4 1 4 1 4 1 4 1 4 1 4 1 4 1 4 1 4 1 4 1 4 1 4 1
## [186] 4 1 4 1 4 1 4 1 4 1 4 1 4 1 4
# Pair every observation index with its cluster label from the k = 4 solution.
# The index is derived from the cluster vector instead of a hard-coded 1:200,
# so the table stays valid if the number of rows in the data changes.
cekk4 <- data.frame(
  Observasi = seq_along(kmeans.data4$cluster),
  Kluster = kmeans.data4$cluster
)
cekk4$Observasi <- as.character(cekk4$Observasi)
cekk4$Kluster <- as.character(cekk4$Kluster)

# Row positions of the members of each cluster, one list element per cluster.
k4 <- cekk4 %>%
  group_by(Kluster) %>%
  group_rows()
k4## <list_of<integer>[4]>
## [[1]]
## [1] 113 122 125 127 131 133 135 137 139 141 143 145 147 149 151 153 155 157 159
## [20] 163 165 167 169 171 173 175 177 179 181 183 185 187 189 191 193 195 197 199
##
## [[2]]
## [1] 7 9 11 13 15 19 23 25 27 29 31 33 35 37 39 41 43 45 47
## [20] 51 54 55 56 57 58 60 61 63 64 65 67 68 71 72 73 74 75 77
## [39] 78 80 81 83 84 86 87 90 91 93 94 97 99 102 103 105 107 108 109
## [58] 110 111 117 118 119 120 129 161
##
## [[3]]
## [1] 1 2 3 4 5 6 8 10 12 14 16 17 18 20 21 22 24 26 28
## [20] 30 32 34 36 38 40 42 44 46 48 49 50 52 53 59 62 66 69 70
## [39] 76 79 82 85 88 89 92 95 96 98 100 101 104 106 112 114 115 116 121
##
## [[4]]
## [1] 123 124 126 128 130 132 134 136 138 140 142 144 146 148 150 152 154 156 158
## [20] 160 162 164 166 168 170 172 174 176 178 180 182 184 186 188 190 192 194 196
## [39] 198 200
cekk4 %>% count(Kluster)## Kluster n
## 1 1 38
## 2 2 65
## 3 3 57
## 4 4 40
# k-means solution with k = 8 (stand = TRUE); plotting is suppressed here.
kmeans.data8 <- eclust(
  datapsd,
  stand = TRUE,
  FUNcluster = "kmeans",
  k = 8,
  graph = FALSE # spelled out: `F` is an ordinary variable that can be reassigned
)
kmeans.data8$cluster## [1] 3 3 8 3 8 3 8 3 8 3 8 3 8 3 8 3 8 3 8 3 8 3 8 3 8 3 8 3 8 3 8 3 8 3 8 3 8
## [38] 3 8 3 7 3 8 6 8 3 7 6 6 6 7 6 6 7 7 7 7 7 6 7 7 6 7 7 7 6 7 7 6 6 7 7 7 7
## [75] 7 6 7 6 6 7 7 6 7 7 6 7 7 6 6 7 7 6 7 6 6 6 7 6 7 6 6 7 7 6 7 6 7 7 7 7 7
## [112] 6 6 6 6 6 7 7 7 7 6 6 6 5 1 5 2 5 2 5 2 5 1 5 1 5 2 5 1 5 2 5 6 5 1 5 2 5
## [149] 1 5 2 5 2 5 2 5 1 5 1 5 2 5 1 5 2 5 2 5 1 5 2 5 1 5 2 5 2 5 2 5 4 5 2 5 4
## [186] 5 2 5 4 5 4 5 4 5 4 5 4 5 4 5
# Pair every observation index with its cluster label from the k = 8 solution.
# The index is derived from the cluster vector instead of a hard-coded 1:200,
# so the table stays valid if the number of rows in the data changes.
cekk8 <- data.frame(
  Observasi = seq_along(kmeans.data8$cluster),
  Kluster = kmeans.data8$cluster
)
cekk8$Observasi <- as.character(cekk8$Observasi)
cekk8$Kluster <- as.character(cekk8$Kluster)

# Row positions of the members of each cluster, one list element per cluster.
k8 <- cekk8 %>%
  group_by(Kluster) %>%
  group_rows()
k8## <list_of<integer>[8]>
## [[1]]
## [1] 125 133 135 139 145 149 157 159 163 169 173
##
## [[2]]
## [1] 127 129 131 137 141 147 151 153 155 161 165 167 171 175 177 179 183 187
##
## [[3]]
## [1] 1 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30 32 34 36 38 40 42 46
##
## [[4]]
## [1] 181 185 189 191 193 195 197 199
##
## [[5]]
## [1] 124 126 128 130 132 134 136 138 140 142 144 146 148 150 152 154 156 158 160
## [20] 162 164 166 168 170 172 174 176 178 180 182 184 186 188 190 192 194 196 198
## [39] 200
##
## [[6]]
## [1] 44 48 49 50 52 53 59 62 66 69 70 76 78 79 82 85 88 89 92
## [20] 94 95 96 98 100 101 104 106 112 113 114 115 116 121 122 123 143
##
## [[7]]
## [1] 41 47 51 54 55 56 57 58 60 61 63 64 65 67 68 71 72 73 74
## [20] 75 77 80 81 83 84 86 87 90 91 93 97 99 102 103 105 107 108 109
## [39] 110 111 117 118 119 120
##
## [[8]]
## [1] 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31 33 35 37 39 43 45
cekk8 %>% count(Kluster)## Kluster n
## 1 1 11
## 2 2 18
## 3 3 23
## 4 4 8
## 5 5 39
## 6 6 36
## 7 7 44
## 8 8 21
kmeans.data4$centers## Age Annual.Income Spending.Score
## 1 0.03711223 0.9876366 -1.1857814
## 2 1.08344244 -0.4893373 -0.3961802
## 3 -0.96008279 -0.7827991 0.3910484
## 4 -0.42773261 0.9724070 1.2130414
kmeans.data6$centers## Age Annual.Income Spending.Score
## 1 -0.6005052 1.1003939 -1.4256531
## 2 1.1950641 -0.4802119 -0.3216162
## 3 -0.9719569 -1.3262173 1.1293439
## 4 0.6777992 1.0257105 -1.1853182
## 5 -0.4277326 0.9724070 1.2130414
## 6 -0.7985068 -0.4177864 -0.2266218
kmeans.data8$centers## Age Annual.Income Spending.Score
## 1 -0.776719472 0.6536248 -1.394781947
## 2 0.754447946 0.8184193 -1.231865720
## 3 -0.991480616 -1.3439751 1.061384888
## 4 -0.007158705 1.9680392 -1.067824922
## 5 -0.440811016 0.9891010 1.236400114
## 6 -0.836375346 -0.1789050 -0.003442168
## 7 1.252122551 -0.2610138 -0.031507568
## 8 0.477758324 -1.3049552 -1.193448671
aggregate(datapsd_3, by=list(cluster=kmeans.data4$cluster), FUN = mean)## cluster CustomerID Age Annual.Income Spending.Score
## 1 1 160.55263 39.36842 86.50000 19.57895
## 2 2 69.41538 53.98462 47.70769 39.96923
## 3 3 53.43860 25.43860 40.00000 60.29825
## 4 4 161.02500 32.87500 86.10000 81.52500
aggregate(datapsd_3, by=list(cluster=kmeans.data6$cluster), FUN = mean)## cluster CustomerID Age Annual.Income Spending.Score
## 1 1 165.61538 30.46154 89.46154 13.38462
## 2 2 69.47368 55.54386 47.94737 41.89474
## 3 3 23.09091 25.27273 25.72727 79.36364
## 4 4 163.72727 48.31818 87.50000 19.59091
## 5 5 161.02500 32.87500 86.10000 81.52500
## 6 6 74.69565 27.69565 49.58696 44.34783
aggregate(datapsd_3, by=list(cluster=kmeans.data8$cluster), FUN = mean)## cluster CustomerID Age Annual.Income Spending.Score
## 1 1 149.72727 28.00000 77.72727 14.18182
## 2 2 157.55556 49.38889 82.05556 18.38889
## 3 3 22.13043 25.00000 25.26087 77.60870
## 4 4 191.25000 38.75000 112.25000 22.62500
## 5 5 162.00000 32.69231 86.53846 82.12821
## 6 6 87.88889 27.16667 55.86111 50.11111
## 7 7 82.02273 56.34091 53.70455 49.38636
## 8 8 23.19048 45.52381 26.28571 19.38095
# Cluster plots for the k = 4, k = 6 and k = 8 k-means solutions, one call per
# statement so each plot is produced separately.
fviz_cluster(kmeans.data4)
fviz_cluster(kmeans.data6)
fviz_cluster(kmeans.data8)