Membaca data
# Load the wholesale customers data set; the path is relative to the
# current working directory.
whole.sale <- read.csv(file = "Wholesale customers data.csv")
Melihat 6 baris data teratas
# Preview the first six rows (six is head()'s default, spelled out here).
head(whole.sale, n = 6)
## Channel Region Fresh Milk Grocery Frozen Detergents_Paper Delicassen
## 1 2 3 12669 9656 7561 214 2674 1338
## 2 2 3 7057 9810 9568 1762 3293 1776
## 3 2 3 6353 8808 7684 2405 3516 7844
## 4 1 3 13265 1196 4221 6404 507 1788
## 5 2 3 22615 5410 7198 3915 1777 5185
## 6 2 3 9413 8259 5126 666 1795 1451
Melihat struktur data
# Compact overview of the data frame: dimensions, column names and types.
str(object = whole.sale)
## 'data.frame': 440 obs. of 8 variables:
## $ Channel : int 2 2 2 1 2 2 2 2 1 2 ...
## $ Region : int 3 3 3 3 3 3 3 3 3 3 ...
## $ Fresh : int 12669 7057 6353 13265 22615 9413 12126 7579 5963 6006 ...
## $ Milk : int 9656 9810 8808 1196 5410 8259 3199 4956 3648 11093 ...
## $ Grocery : int 7561 9568 7684 4221 7198 5126 6975 9426 6192 18881 ...
## $ Frozen : int 214 1762 2405 6404 3915 666 480 1669 425 1159 ...
## $ Detergents_Paper: int 2674 3293 3516 507 1777 1795 3140 3321 1716 7425 ...
## $ Delicassen : int 1338 1776 7844 1788 5185 1451 545 2566 750 2098 ...
Deskripsi statistik data
# Per-column summary (min, quartiles, mean, max) of the raw data.
summary(object = whole.sale)
## Channel Region Fresh Milk
## Min. :1.000 Min. :1.000 Min. : 3 Min. : 55
## 1st Qu.:1.000 1st Qu.:2.000 1st Qu.: 3128 1st Qu.: 1533
## Median :1.000 Median :3.000 Median : 8504 Median : 3627
## Mean :1.323 Mean :2.543 Mean : 12000 Mean : 5796
## 3rd Qu.:2.000 3rd Qu.:3.000 3rd Qu.: 16934 3rd Qu.: 7190
## Max. :2.000 Max. :3.000 Max. :112151 Max. :73498
## Grocery Frozen Detergents_Paper Delicassen
## Min. : 3 Min. : 25.0 Min. : 3.0 Min. : 3.0
## 1st Qu.: 2153 1st Qu.: 742.2 1st Qu.: 256.8 1st Qu.: 408.2
## Median : 4756 Median : 1526.0 Median : 816.5 Median : 965.5
## Mean : 7951 Mean : 3071.9 Mean : 2881.5 Mean : 1524.9
## 3rd Qu.:10656 3rd Qu.: 3554.2 3rd Qu.: 3922.0 3rd Qu.: 1820.2
## Max. :92780 Max. :60869.0 Max. :40827.0 Max. :47943.0
Normalisasi data
# Standardise the spending columns: drop the first two columns (Channel and
# Region), then centre each remaining column and divide it by its standard
# deviation. scale() returns a numeric matrix.
whole.sale.matrix <- scale(
  x = whole.sale[, -(1:2)],
  center = TRUE,
  scale = TRUE
)
Melihat struktur data hasil normalisasi
# Inspect the scaled matrix, including the centring/scaling attributes
# that scale() attaches.
str(object = whole.sale.matrix)
## num [1:440, 1:6] 0.0529 -0.3909 -0.4465 0.1 0.8393 ...
## - attr(*, "dimnames")=List of 2
## ..$ : NULL
## ..$ : chr [1:6] "Fresh" "Milk" "Grocery" "Frozen" ...
## - attr(*, "scaled:center")= Named num [1:6] 12000 5796 7951 3072 2881 ...
## ..- attr(*, "names")= chr [1:6] "Fresh" "Milk" "Grocery" "Frozen" ...
## - attr(*, "scaled:scale")= Named num [1:6] 12647 7380 9503 4855 4768 ...
## ..- attr(*, "names")= chr [1:6] "Fresh" "Milk" "Grocery" "Frozen" ...
Menentukan jumlah kluster
# Elbow method: total within-cluster sum of squares (WSS) for k = 1..7.
wss <- numeric(7)

# With a single cluster the WSS is the total sum of squares around the column
# means. var() already divides by (n - 1), so the multiplier must be
# (n - 1), not n.
wss[1] <- (nrow(whole.sale.matrix) - 1) *
  sum(apply(whole.sale.matrix, 2, var))

for (i in 2:7) {
  # tot.withinss is kmeans()'s own sum of the per-cluster withinss values.
  wss[i] <- kmeans(whole.sale.matrix, centers = i)$tot.withinss
}

# The y axis shows the within-cluster sum of squares, not a cluster count.
plot(
  1:7, wss,
  type = "b",
  xlab = "Jumlah Kluster",
  ylab = "Jumlah kuadrat dalam kluster"
)
Membuat model K-Means
# Fit the final 5-cluster k-means model on the standardised matrix.
# k-means starts from random centres, so without a fixed seed the cluster
# numbering (and possibly the membership) changes on every run. The seed makes
# the result reproducible; nstart = 25 keeps the best of 25 random starts to
# reduce the chance of a poor local optimum.
set.seed(123)
model.kmeans <- kmeans(whole.sale.matrix, centers = 5, nstart = 25)
Rata-rata setiap variabel pada masing-masing kluster
# Mean of every column of the raw data within each k-means cluster; the
# unnamed grouping list makes aggregate() label the cluster column "Group.1".
aggregate(
  x = whole.sale,
  by = list(model.kmeans$cluster),
  FUN = mean
)
## Group.1 Channel Region Fresh Milk Grocery Frozen
## 1 1 1.131579 2.631579 29879.276 4316.895 5297.592 6588.539
## 2 2 1.904255 2.510638 5355.106 10606.330 16579.479 1409.436
## 3 3 2.000000 2.500000 15964.900 34708.500 48536.900 3054.600
## 4 4 1.250000 2.750000 52022.000 31696.000 18490.750 29825.500
## 5 5 1.140625 2.527344 8352.301 2935.191 3820.867 2221.039
## Detergents_Paper Delicassen
## 1 861.2895 2177.3026
## 2 7262.3085 1614.7234
## 3 24875.2000 2942.8000
## 4 2699.0000 19656.2500
## 5 1016.3828 959.4961
# Attach each row's cluster assignment to the raw data as column `kluster`,
# then show the first ten rows.
whole.sale[["kluster"]] <- model.kmeans$cluster
head(whole.sale, n = 10)
## Channel Region Fresh Milk Grocery Frozen Detergents_Paper Delicassen
## 1 2 3 12669 9656 7561 214 2674 1338
## 2 2 3 7057 9810 9568 1762 3293 1776
## 3 2 3 6353 8808 7684 2405 3516 7844
## 4 1 3 13265 1196 4221 6404 507 1788
## 5 2 3 22615 5410 7198 3915 1777 5185
## 6 2 3 9413 8259 5126 666 1795 1451
## 7 2 3 12126 3199 6975 480 3140 545
## 8 2 3 7579 4956 9426 1669 3321 2566
## 9 1 3 5963 3648 6192 425 1716 750
## 10 2 3 6006 11093 18881 1159 7425 2098
## kluster
## 1 5
## 2 2
## 3 2
## 4 5
## 5 1
## 6 5
## 7 5
## 8 5
## 9 5
## 10 2
Visualisasi kluster
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.5.3
# Scatter plots of pairs of spending variables, coloured by cluster.
# `kluster` is stored as an integer, which ggplot2 would map to a continuous
# colour gradient. Cluster ids are categorical, so factor() is used to get one
# distinct colour per cluster. labs() keeps the legend title as "kluster".

# Grocery vs. Milk
ggplot(whole.sale, aes(x = Grocery,
                       y = Milk,
                       color = factor(kluster))) +
  geom_point() +
  labs(color = "kluster")

# Grocery vs. Frozen
ggplot(whole.sale, aes(x = Grocery,
                       y = Frozen,
                       color = factor(kluster))) +
  geom_point() +
  labs(color = "kluster")

# Detergents_Paper vs. Delicassen
ggplot(whole.sale, aes(x = Detergents_Paper,
                       y = Delicassen,
                       color = factor(kluster))) +
  geom_point() +
  labs(color = "kluster")
Pengelompokan dengan metode K-Means menghasilkan 5 kluster berdasarkan variabel Fresh, Milk, Grocery, Frozen, Detergents_Paper, dan Delicassen.