Load libraries
library(dplyr)
library(ggplot2)
library(cluster)
library(gridExtra)
Read the dataset
data <- read.csv("Wholesale customers data.csv", stringsAsFactors = FALSE)
Explore the dataset for names of variables, structure and get the first and last rows of the dataset
names(data)
## [1] "Channel" "Region" "Fresh"
## [4] "Milk" "Grocery" "Frozen"
## [7] "Detergents_Paper" "Delicassen"
str(data)
## 'data.frame': 440 obs. of 8 variables:
## $ Channel : int 2 2 2 1 2 2 2 2 1 2 ...
## $ Region : int 3 3 3 3 3 3 3 3 3 3 ...
## $ Fresh : int 12669 7057 6353 13265 22615 9413 12126 7579 5963 6006 ...
## $ Milk : int 9656 9810 8808 1196 5410 8259 3199 4956 3648 11093 ...
## $ Grocery : int 7561 9568 7684 4221 7198 5126 6975 9426 6192 18881 ...
## $ Frozen : int 214 1762 2405 6404 3915 666 480 1669 425 1159 ...
## $ Detergents_Paper: int 2674 3293 3516 507 1777 1795 3140 3321 1716 7425 ...
## $ Delicassen : int 1338 1776 7844 1788 5185 1451 545 2566 750 2098 ...
head(data)
## Channel Region Fresh Milk Grocery Frozen Detergents_Paper Delicassen
## 1 2 3 12669 9656 7561 214 2674 1338
## 2 2 3 7057 9810 9568 1762 3293 1776
## 3 2 3 6353 8808 7684 2405 3516 7844
## 4 1 3 13265 1196 4221 6404 507 1788
## 5 2 3 22615 5410 7198 3915 1777 5185
## 6 2 3 9413 8259 5126 666 1795 1451
tail(data)
## Channel Region Fresh Milk Grocery Frozen Detergents_Paper Delicassen
## 435 1 3 16731 3922 7994 688 2371 838
## 436 1 3 29703 12051 16027 13135 182 2204
## 437 1 3 39228 1431 764 4510 93 2346
## 438 2 3 14531 15488 30243 437 14841 1867
## 439 1 3 10290 1981 2232 1038 168 2125
## 440 1 3 2787 1698 2510 65 477 52
The variables Channel and Region must be converted to factors as they have finite values
data$Channel <- as.factor(data$Channel)
data$Region <- as.factor(data$Region)
Convert all the continuous variables to numeric
data[3:8] <- lapply(data[3:8], as.numeric)
Statistical summary of the continuous variables
summary(data[,3:8])
## Fresh Milk Grocery Frozen
## Min. : 3 Min. : 55 Min. : 3 Min. : 25.0
## 1st Qu.: 3128 1st Qu.: 1533 1st Qu.: 2153 1st Qu.: 742.2
## Median : 8504 Median : 3627 Median : 4756 Median : 1526.0
## Mean : 12000 Mean : 5796 Mean : 7951 Mean : 3071.9
## 3rd Qu.: 16934 3rd Qu.: 7190 3rd Qu.:10656 3rd Qu.: 3554.2
## Max. :112151 Max. :73498 Max. :92780 Max. :60869.0
## Detergents_Paper Delicassen
## Min. : 3.0 Min. : 3.0
## 1st Qu.: 256.8 1st Qu.: 408.2
## Median : 816.5 Median : 965.5
## Mean : 2881.5 Mean : 1524.9
## 3rd Qu.: 3922.0 3rd Qu.: 1820.2
## Max. :40827.0 Max. :47943.0
Barplots of the discrete variables
ggplot(data, aes(x = Channel)) +
geom_bar(stat = "count", width = 0.5, fill = "steelblue") +
theme_minimal() +
labs(title = "Barplot to display Channel Comparison", xlab = "Channel")
ggplot(data, aes(x = Region)) +
geom_bar(stat = "count", width = 0.5, fill = "steelblue") +
theme_minimal() +
labs(title = "Barplot to display Region Comparison", xlab = "Region")
Histograms of the distribution of the continuous variables
p1 <- ggplot(data, aes(x= Fresh)) + geom_histogram() + labs(title = "Histogram for Fresh distribution")
p2 <- ggplot(data, aes(x= Milk)) + geom_histogram() + labs(title = "Histogram for Milk distribution")
p3 <- ggplot(data, aes(x= Grocery)) + geom_histogram() + labs(title = "Histogram for Grocery distribution")
p4 <- ggplot(data, aes(x= Frozen)) + geom_histogram() + labs(title = "Histogram for Frozen distribution")
p5 <- ggplot(data, aes(x= Detergents_Paper)) + geom_histogram() + labs(title = "Histogram for Detegerents Paper distribution")
p6 <- ggplot(data, aes(x= Delicassen)) + geom_histogram() + labs(title = "Histogram for Delicassen distribution")
grid.arrange(p1, p2, p3, p4, p5, p6, nrow=2)
Given the skewed distributions, we shall perform a logarithmic transformation of all of the continuous variables
data[,3:8] <- log(data[,3:8])
Hisograms of the continuous variables by Channel
p1 <- ggplot(data, aes(x= Fresh, fill = Channel, color = Channel)) + geom_histogram(bins = 10, position = "identity", alpha = 0.5)
p2 <- ggplot(data, aes(x= Milk, fill = Channel, color = Channel)) + geom_histogram(bins = 10, position = "identity", alpha = 0.5)
p3 <- ggplot(data, aes(x= Grocery, fill = Channel, color = Channel)) + geom_histogram(bins = 10, position = "identity", alpha = 0.5)
p4 <- ggplot(data, aes(x= Frozen, fill = Channel, color = Channel)) + geom_histogram(bins = 10, position = "identity", alpha = 0.5)
p5 <- ggplot(data, aes(x= Detergents_Paper, fill = Channel, color = Channel)) + geom_histogram(bins = 10, position = "identity", alpha = 0.5)
p6 <- ggplot(data, aes(x= Delicassen, fill = Channel, color = Channel)) + geom_histogram(bins = 10, position = "identity", alpha = 0.5)
grid.arrange(p1, p2, p3, p4, p5, p6, nrow=3)
Density plots of the continuous variables
p1 <- ggplot(data, aes(x = Fresh)) + geom_density(fill = "blue") + labs(title = "Fresh Density")
p2 <- ggplot(data, aes(x = Milk)) + geom_density(fill = "blue") + labs(title = "Milk Density")
p3 <- ggplot(data, aes(x = Grocery)) + geom_density(fill = "blue") + labs(title = "Grocery Density")
p4 <- ggplot(data, aes(x = Frozen)) + geom_density(fill = "blue") + labs(title = "Frozen Density")
p5 <- ggplot(data, aes(x = Detergents_Paper)) + geom_density(fill = "blue") + labs(title = "Detergents Paper Density")
p6 <- ggplot(data, aes(x = Delicassen)) + geom_density(fill = "blue") + labs(title = "Delicassen Density")
grid.arrange(p1, p2, p3, p4, p5, p6, nrow=3)
The distributions may not have all become normalized, but are not as heavily skewed as before.
Boxplots of the continuous variables by Region and Channel
p1 <- ggplot(data, aes(x = Region, y= Fresh, fill = Channel)) + geom_boxplot() + labs(title = "Fresh Boxplot")
p2 <- ggplot(data, aes(x = Region, y= Milk, fill = Channel)) + geom_boxplot() + labs(title = "Milk Boxplot")
p3 <- ggplot(data, aes(x = Region, y= Grocery, fill = Channel)) + geom_boxplot() + labs(title = "Grocery Boxplot")
p4 <- ggplot(data, aes(x = Region, y= Frozen, fill = Channel)) + geom_boxplot() + labs(title = "Frozen Boxplot")
p5 <- ggplot(data, aes(x = Region, y= Detergents_Paper, fill = Channel)) + geom_boxplot() + labs(title = "Detergents Paper Boxplot")
p6 <- ggplot(data, aes(x = Region, y= Delicassen, fill = Channel)) + geom_boxplot() + labs(title = "Delicassen Boxplot")
grid.arrange(p1, p2, p3, p4, p5, p6, nrow=3)
Get the optimal number of clusters using the Gap statistic method
set.seed(123)
stat_gap <- clusGap(data[, 3:8], FUN = kmeans, nstart = 25, K.max = 10, B = 50)
## Warning: did not converge in 10 iterations
plot(stat_gap)
Find the optimal number of clusters using the elbow method
set.seed(123)
k.max <- 15
wss <- sapply(1:k.max, function(k){kmeans(data[,3:8], k, nstart=50,iter.max = 15 )$tot.withinss})
wss
## [1] 4800.919 3314.947 2807.248 2489.194 2283.392 2093.718 1963.929
## [8] 1833.298 1747.028 1656.294 1578.267 1504.479 1437.381 1380.279
## [15] 1329.492
plot(1:k.max, wss,
type="b", pch = 19, frame = FALSE,
xlab="Number of clusters K",
ylab="Total within-clusters sum of squares")
Using k = 2 from the Gap statistic method, create a k-means clustering model
k2 <- kmeans(data[, 3:8], 2, iter.max = 100, nstart = 50, algorithm = "Lloyd")
k2
## K-means clustering with 2 clusters of sizes 182, 258
##
## Cluster means:
## Fresh Milk Grocery Frozen Detergents_Paper Delicassen
## 1 8.260679 8.921031 9.384720 6.750228 8.431986 6.817742
## 2 9.061999 7.556718 7.775563 7.690204 5.624831 6.557479
##
## Clustering vector:
## [1] 1 1 1 2 1 1 1 1 1 1 1 2 1 1 1 2 1 2 1 1 1 2 2 1 1 1 2 2 1 2 1 2 2 2 2
## [36] 1 2 1 1 2 2 1 1 1 1 1 1 1 1 1 2 1 1 1 2 1 1 1 2 1 1 1 1 1 2 1 1 1 2 2
## [71] 2 1 2 2 1 2 2 1 2 1 2 1 1 2 1 1 1 2 2 2 2 2 1 2 1 1 1 2 2 2 1 1 1 2 2
## [106] 2 1 1 1 1 2 1 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 1 1 2 2 2 2 2 2 2 1 1 2 2
## [141] 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 1 1 2 1 1 1 2 2 1 1 1 1 1 2 2 1 1 1 1 2
## [176] 1 2 2 2 2 1 1 1 2 2 2 2 1 1 1 2 2 2 1 2 2 2 1 2 2 1 1 2 2 2 1 2 1 1 1
## [211] 2 1 2 1 1 1 1 2 1 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 1 2 2 2 2 2 2 2 1 1
## [246] 1 2 2 2 2 2 1 2 2 1 2 2 2 2 2 2 2 2 1 1 1 1 2 1 2 2 2 2 2 2 2 2 2 2 1
## [281] 2 1 2 2 2 2 2 2 2 2 2 2 2 1 2 1 2 1 1 2 1 1 1 1 1 1 1 2 2 1 2 2 1 2 2
## [316] 1 2 1 2 1 2 2 2 1 2 2 2 2 2 2 2 1 2 1 2 1 2 2 2 2 1 1 1 1 2 1 1 1 2 1
## [351] 2 1 2 1 2 2 2 1 2 2 2 2 2 1 2 1 2 2 2 2 2 2 2 1 2 2 1 2 2 1 2 2 1 2 2
## [386] 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 1 2 2 2 1 1 1 1 2 1 2 2 1 1 1 1 1
## [421] 1 1 2 1 1 2 1 2 2 2 2 2 2 2 1 2 2 1 2 2
##
## Within cluster sum of squares by cluster:
## [1] 1401.941 1913.006
## (between_SS / total_SS = 31.0 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
clusplot(data, k2$cluster, color=TRUE, shade=TRUE, labels=0, lines=0)
Using k = 3 from the elbow method, create a k-means clustering model
k3 <- kmeans(data[, 3:8], 3, iter.max = 100, nstart = 50, algorithm = "Lloyd")
k3
## K-means clustering with 3 clusters of sizes 75, 216, 149
##
## Cluster means:
## Fresh Milk Grocery Frozen Detergents_Paper Delicassen
## 1 6.827410 8.586721 9.187840 5.831026 8.209773 5.725304
## 2 8.991553 7.402535 7.624570 7.569569 5.430247 6.441322
## 3 9.310122 8.928249 9.249122 7.652753 8.034640 7.462652
##
## Clustering vector:
## [1] 3 3 3 2 3 3 3 3 3 3 3 2 3 3 3 2 1 2 3 3 3 2 3 3 3 1 2 2 3 2 3 2 2 2 2
## [36] 1 3 3 1 2 3 3 1 1 1 3 3 3 3 3 2 1 3 1 2 3 3 1 2 1 1 3 3 3 2 1 1 3 3 2
## [71] 2 3 2 3 3 2 2 3 2 1 2 1 3 2 3 3 3 3 2 3 2 2 3 2 1 1 1 2 2 2 3 3 3 3 2
## [106] 2 1 3 1 1 2 3 2 2 2 2 2 2 2 2 2 2 2 3 2 3 2 3 1 2 2 2 2 2 2 2 3 1 2 2
## [141] 3 2 2 2 2 3 2 2 2 2 2 2 2 2 2 1 3 2 3 1 3 2 2 3 3 3 3 1 2 2 1 1 1 1 2
## [176] 1 3 2 2 2 3 3 1 3 1 2 2 1 3 1 2 2 2 1 2 2 3 3 2 2 3 3 3 1 2 1 2 1 1 3
## [211] 2 3 2 3 1 3 1 2 1 2 2 1 2 2 2 2 3 2 1 2 3 1 2 1 2 1 2 2 2 3 2 2 2 3 3
## [246] 1 2 2 2 2 2 3 2 3 3 2 2 3 3 3 2 2 2 3 1 3 1 2 3 2 2 2 2 2 2 2 3 2 2 3
## [281] 2 3 3 2 3 2 2 2 2 2 2 2 2 3 3 1 2 3 3 1 3 3 1 1 1 1 3 2 2 1 2 2 1 2 3
## [316] 3 2 1 2 3 2 2 2 3 2 3 2 2 2 2 2 3 2 1 2 3 2 2 2 2 1 3 1 1 2 1 3 3 2 3
## [351] 2 3 2 1 3 1 2 1 3 2 2 2 2 1 2 3 2 2 2 2 3 2 2 3 2 2 3 2 2 1 2 2 3 2 3
## [386] 2 2 2 2 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 3 2 3 3 3 3 3 2 1 3 2 3 3 3 1 3
## [421] 1 3 2 3 3 2 3 3 2 2 2 3 2 2 3 3 2 3 2 1
##
## Within cluster sum of squares by cluster:
## [1] 711.6976 1354.2249 741.3352
## (between_SS / total_SS = 41.5 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
clusplot(data, k3$cluster, color=TRUE, shade=TRUE, labels=0, lines=0)
Perform PCA and apply it to the dataset
pcclust <- prcomp(data[, 3:8], scale = FALSE)
summary(pcclust)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 2.1995 1.7391 1.1272 1.02557 0.70739 0.50095
## Proportion of Variance 0.4424 0.2766 0.1162 0.09618 0.04576 0.02295
## Cumulative Proportion 0.4424 0.7189 0.8351 0.93130 0.97705 1.00000
pcclust$rotation[, 1:2]
## PC1 PC2
## Fresh -0.1737170 0.68513571
## Milk 0.3944630 0.16239926
## Grocery 0.4543636 0.06937908
## Frozen -0.1721960 0.48769100
## Detergents_Paper 0.7455150 0.04191162
## Delicassen 0.1494356 0.50970874
Plot the clusters for k = 2
set.seed(123)
ggplot(data, aes(x = Detergents_Paper , y = Fresh)) +
geom_point(stat = "identity", aes(color = as.factor(k2$cluster))) +
scale_color_discrete(name = " ",
breaks=c("1", "2"),
labels=c("Cluster 1", "Cluster 2")) +
ggtitle("Segments of Wholesale Customers",
subtitle = "Using K-means Clustering")
Plot the clusters for k = 3
set.seed(123)
ggplot(data, aes(x = Fresh, y = Detergents_Paper)) +
geom_point(stat = "identity", aes(color = as.factor(k3$cluster))) +
scale_color_discrete(name = " ",
breaks=c("1", "2", "3"),
labels=c("Cluster 1", "Cluster 2", "Cluster 3")) +
ggtitle("Segments of Wholesale Customers",
subtitle = "Using K-means Clustering")