library(ggplot2)
library(cluster)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(FactoMineR)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
Wholesale <- read.csv("wholesale.csv")
str(Wholesale)
## 'data.frame': 440 obs. of 8 variables:
## $ Channel : int 2 2 2 1 2 2 2 2 1 2 ...
## $ Region : int 3 3 3 3 3 3 3 3 3 3 ...
## $ Fresh : int 12669 7057 6353 13265 22615 9413 12126 7579 5963 6006 ...
## $ Milk : int 9656 9810 8808 1196 5410 8259 3199 4956 3648 11093 ...
## $ Grocery : int 7561 9568 7684 4221 7198 5126 6975 9426 6192 18881 ...
## $ Frozen : int 214 1762 2405 6404 3915 666 480 1669 425 1159 ...
## $ Detergents_Paper: int 2674 3293 3516 507 1777 1795 3140 3321 1716 7425 ...
## $ Delicassen : int 1338 1776 7844 1788 5185 1451 545 2566 750 2098 ...
# Drop the categorical Channel and Region columns before clustering
Wholesale_k <- Wholesale[,-c(1:2)]
head(Wholesale_k)
## Fresh Milk Grocery Frozen Detergents_Paper Delicassen
## 1 12669 9656 7561 214 2674 1338
## 2 7057 9810 9568 1762 3293 1776
## 3 6353 8808 7684 2405 3516 7844
## 4 13265 1196 4221 6404 507 1788
## 5 22615 5410 7198 3915 1777 5185
## 6 9413 8259 5126 666 1795 1451
In this unsupervised learning exercise I will use K-means clustering, because it is one of the simplest and most popular unsupervised machine learning algorithms. It can be used to confirm business assumptions about what types of groups exist, or to identify unknown groups in complex data sets.
Scaling centers all variables and puts them on a similar scale, so that distances between data points are comparable. It is therefore advisable to bring all features to the same scale before applying a distance-based algorithm such as K-means clustering.
Wholesale_scale <- scale(Wholesale_k)
summary(Wholesale_scale)
## Fresh Milk Grocery Frozen
## Min. :-0.9486 Min. :-0.7779 Min. :-0.8364 Min. :-0.62763
## 1st Qu.:-0.7015 1st Qu.:-0.5776 1st Qu.:-0.6101 1st Qu.:-0.47988
## Median :-0.2764 Median :-0.2939 Median :-0.3363 Median :-0.31844
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.00000
## 3rd Qu.: 0.3901 3rd Qu.: 0.1889 3rd Qu.: 0.2846 3rd Qu.: 0.09935
## Max. : 7.9187 Max. : 9.1732 Max. : 8.9264 Max. :11.90545
## Detergents_Paper Delicassen
## Min. :-0.6037 Min. :-0.5396
## 1st Qu.:-0.5505 1st Qu.:-0.3960
## Median :-0.4331 Median :-0.1984
## Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.2182 3rd Qu.: 0.1047
## Max. : 7.9586 Max. :16.4597
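As a quick sanity check (an illustrative aside, with fresh_manual as a throwaway name), scale() standardizes each column by subtracting its mean and dividing by its standard deviation; verifying this on the Fresh column:
# scale() is equivalent to (x - mean(x)) / sd(x), column-wise
fresh_manual <- (Wholesale_k$Fresh - mean(Wholesale_k$Fresh)) / sd(Wholesale_k$Fresh)
all.equal(as.numeric(Wholesale_scale[, "Fresh"]), fresh_manual)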
# Initial K-means fit with k = 4 clusters on the scaled data
k.means.fit <- kmeans(Wholesale_scale, 4)
K-means clustering is an unsupervised learning algorithm that groups an unlabelled dataset into distinct clusters.
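To make the mechanics concrete, here is a minimal sketch of a single K-means (Lloyd) iteration using Euclidean distance; the variable names (centers, d2, cl) are illustrative, and kmeans() itself repeats these two steps until the assignments stabilize:
# One illustrative Lloyd iteration (a sketch, not the kmeans() internals):
# assign each point to its nearest center, then recompute centers as means
set.seed(10)
centers <- Wholesale_scale[sample(nrow(Wholesale_scale), 4), ]  # random initial centers
d2 <- apply(centers, 1, function(ctr) rowSums(sweep(Wholesale_scale, 2, ctr)^2))
cl <- max.col(-d2)  # index of the nearest center for each point
centers <- apply(Wholesale_scale, 2, function(col) tapply(col, cl, mean))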
In cluster analysis, the elbow method is a heuristic for determining the number of clusters in a data set. WSS (within sum of squares) is the total squared distance of the data points from their respective cluster centers.
# Elbow plot: total within-cluster sum of squares for k = 1..maxCluster
wss <- function(data, maxCluster = 10) {
  # For k = 1, WSS equals the total sum of squares of the data
  SSw <- (nrow(data) - 1) * sum(apply(data, 2, var))
  for (i in 2:maxCluster) {
    set.seed(10)
    SSw[i] <- sum(kmeans(data, centers = i)$withinss)
  }
  plot(1:maxCluster, SSw, type = "o",
       xlab = "Number of Clusters",
       ylab = "Within groups sum of squares", pch = 20)
}
wss(Wholesale_scale)
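factoextra ships the same diagnostic as a one-liner; fviz_nbclust() reproduces the elbow plot above:
# Elbow diagnostic via factoextra (equivalent to the hand-rolled wss() above)
fviz_nbclust(Wholesale_scale, kmeans, method = "wss", k.max = 10)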
# Fit K-means with k = 2 and k = 4 for comparison
clust2 <- kmeans(Wholesale_scale, 2)
clust4 <- kmeans(Wholesale_scale, 4)
Wholesale$clust2 <- as.factor(clust2$cluster)
Wholesale$clust4 <- as.factor(clust4$cluster)
head(Wholesale,10)
## Channel Region Fresh Milk Grocery Frozen Detergents_Paper Delicassen clust2
## 1 2 3 12669 9656 7561 214 2674 1338 1
## 2 2 3 7057 9810 9568 1762 3293 1776 1
## 3 2 3 6353 8808 7684 2405 3516 7844 1
## 4 1 3 13265 1196 4221 6404 507 1788 1
## 5 2 3 22615 5410 7198 3915 1777 5185 1
## 6 2 3 9413 8259 5126 666 1795 1451 1
## 7 2 3 12126 3199 6975 480 3140 545 1
## 8 2 3 7579 4956 9426 1669 3321 2566 1
## 9 1 3 5963 3648 6192 425 1716 750 1
## 10 2 3 6006 11093 18881 1159 7425 2098 2
## clust4
## 1 4
## 2 4
## 3 4
## 4 4
## 5 3
## 6 4
## 7 4
## 8 4
## 9 4
## 10 2
table(Wholesale$clust2)
##
## 1 2
## 391 49
table(Wholesale$clust4)
##
## 1 2 3 4
## 7 80 68 285
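To get a feel for what the two clusters capture, we can profile the average spend per category (a quick sketch using the dplyr verbs loaded above):
# Average spend per product category within each of the two clusters
Wholesale %>%
  group_by(clust2) %>%
  summarise(across(Fresh:Delicassen, mean))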
table(Wholesale$clust2, Wholesale$Channel)
##
## 1 2
## 1 292 99
## 2 6 43
# Accuracy of the 2-cluster solution against the known Channel labels
num1 <- nrow(Wholesale %>% filter(clust2 == "1" & Channel == "1"))
num2 <- nrow(Wholesale %>% filter(clust2 == "2" & Channel == "2"))
accuracy <- (num1 + num2) / nrow(Wholesale)
accuracy
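Note that kmeans() assigns cluster labels arbitrarily, so cluster "1" need not line up with Channel 1. A safer sketch scores both possible label-to-channel mappings and keeps the better one:
# Labels are arbitrary: score both mappings of the 2x2 confusion table
conf <- table(Wholesale$clust2, Wholesale$Channel)
max(sum(diag(conf)), conf[1, 2] + conf[2, 1]) / nrow(Wholesale)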
# Transform the scaled data to principal components
Wholesale_pr <- prcomp(Wholesale_scale)
# Add each observation's PC coordinates to the data frame
Wholesale <- cbind(Wholesale, Wholesale_pr$x)
# Variable map (correlation circle) from FactoMineR's PCA
plot.PCA(PCA(Wholesale_scale, graph = F), choix = "var")
The cluster package allows us to represent (with the aid of PCA) the cluster solution in two dimensions:
library(cluster)
clusplot(Wholesale_scale, k.means.fit$cluster,
         main = "2D representation of the cluster solution",
         color = TRUE, shade = TRUE,
         labels = 2, lines = 0)
The first two principal components, PC1 and PC2, together explain 72.46% of the point variability.
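The same figure can be read directly off the prcomp object computed earlier:
# Proportion of variance explained by the first few principal components
summary(Wholesale_pr)$importance[, 1:3]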
Below is the visualization of the K-means cluster assignments projected onto the first two principal components.
ggplot(Wholesale, aes(x = PC1, y = PC2)) +
  geom_point(aes(col = clust2))
fviz_cluster(clust2, data = Wholesale_scale)
ggplot(Wholesale, aes(x = PC1, y = PC2)) +
  geom_point(aes(col = clust4))
fviz_cluster(clust4, data = Wholesale_scale)