Data Description:
Customer ID: — Customer (buyer) identification number
Age: — Age of the customer
Age Group: — Age Category
Postcode: — Customer Location
Gender: — Sex of the customer
Favourite Cookie: — Cookie Purchased
Cookies bought each week: — No. of Cookies bought each week.
# Use an HTTPS CRAN mirror so package downloads cannot be tampered with in
# transit (the original pointed at the plain-HTTP URL of the same mirror).
options(repos = c(CRAN = "https://cran.rstudio.com/"))
library(readr)

# Read the customer survey from the working directory and preview the first
# rows to check that the columns were parsed as expected.
cookie_business <- read_csv("cookie_business.csv")
head(cookie_business)
## # A tibble: 6 × 7
## `Customer ID` Age `Age Group` Postcode Gender `Favourite Cookie` Cookies b…¹
## <dbl> <dbl> <chr> <dbl> <chr> <chr> <dbl>
## 1 1001 60 60-69 2000 M Choc chip 1
## 2 1002 53 50-59 2010 M Choc chip 1
## 3 1003 22 20-29 2010 F Choc chip 2
## 4 1004 30 30-39 2010 F Choc chip 6
## 5 1005 52 50-59 2010 F Macadamia 3
## 6 1006 22 20-29 2022 F Macadamia 3
## # … with abbreviated variable name ¹​`Cookies bought each week`
# Total number of missing cells in the table; is.na() already yields a logical
# matrix, so it can be summed directly.
sum(is.na(cookie_business))
## [1] 0
# Packages used by the clustering analysis below. The vector keeps its original
# name in case later code refers to it.
requiredPackages <- c(
  "factoextra", "flexclust", "fpc", "clustertend", "cluster", "ClusterR",
  "grid", "lattice", "modeltools", "stats4", "seriation", "devtools", "moments"
)

# Install only what is missing. requireNamespace() checks availability without
# attaching anything; require() merely returns FALSE on failure, and the
# original second loop called library() only after such a failed load.
is_installed <- vapply(
  requiredPackages, requireNamespace, logical(1), quietly = TRUE
)
if (!all(is_installed)) {
  install.packages(requiredPackages[!is_installed])
}

# Attach in the same order as requiredPackages so that masking between
# packages is unchanged. library() stops with an error if a package is still
# unavailable, instead of failing later at first use.
library(factoextra)
library(flexclust)
library(fpc)
library(clustertend)
library(cluster)
library(ClusterR)
library(grid)
library(lattice)
library(modeltools)
library(stats4)
library(seriation)
library(devtools)
library(moments)
# Per-column descriptive statistics (minimum, quartiles, mean, maximum).
summary(cookie_business)
## Customer ID Age Postcode Gender Favourite Cookie
## Min. :1001 Min. :12.00 Min. :2000 Min. :1.000 Min. :1.000
## 1st Qu.:1012 1st Qu.:20.25 1st Qu.:2000 1st Qu.:1.000 1st Qu.:1.250
## Median :1024 Median :31.50 Median :2014 Median :1.000 Median :2.000
## Mean :1024 Mean :34.17 Mean :2136 Mean :1.413 Mean :2.826
## 3rd Qu.:1035 3rd Qu.:44.75 3rd Qu.:2296 3rd Qu.:2.000 3rd Qu.:4.750
## Max. :1046 Max. :68.00 Max. :2873 Max. :2.000 Max. :6.000
## Cookies bought each week
## Min. : 1.000
## 1st Qu.: 1.250
## Median : 3.000
## Mean : 3.957
## 3rd Qu.: 5.000
## Max. :20.000
# Skewness of every column, computed with the moments package.
moments::skewness(cookie_business)
## Customer ID Age Postcode
## 0.0000000 0.5806652 1.7950816
## Gender Favourite Cookie Cookies bought each week
## 0.3532086 0.6955363 2.2700018
# Kurtosis of every column, computed with the moments package.
moments::kurtosis(cookie_business)
## Customer ID Age Postcode
## 1.798865 2.137290 6.251993
## Gender Favourite Cookie Cookies bought each week
## 1.124756 2.007562 9.471059
# Variance-covariance matrix over all columns of the table.
stats::var(cookie_business)
## Customer ID Age Postcode Gender
## Customer ID 180.1666667 -75.622222 828.511111 0.12222222
## Age -75.6222222 262.102415 -1306.705314 1.05990338
## Postcode 828.5111111 -1306.705314 41744.796135 8.73043478
## Gender 0.1222222 1.059903 8.730435 0.24782609
## Favourite Cookie 3.7333333 -1.057971 5.816425 0.05120773
## Cookies bought each week 3.4888889 -12.592271 149.320773 -0.29275362
## Favourite Cookie Cookies bought each week
## Customer ID 3.73333333 3.4888889
## Age -1.05797101 -12.5922705
## Postcode 5.81642512 149.3207729
## Gender 0.05120773 -0.2927536
## Favourite Cookie 3.16908213 -1.2299517
## Cookies bought each week -1.22995169 13.3758454
# Hopkins statistic (clustertend) for df, sampling all rows but one.
# NOTE(review): the value printed here (0.33) and factoextra's hopkins_stat for
# the same data (0.71) fall on opposite sides of 0.5, so the two functions seem
# to use opposite conventions -- confirm before interpreting either number.
set.seed(123)
clustertend::hopkins(df, n = nrow(df) - 1)
## $H
## [1] 0.3261212
# Hopkins statistic (clustertend) for the random reference data, sampling all
# rows but one; the seed matches the one used for df.
set.seed(123)
clustertend::hopkins(random_df, n = nrow(random_df) - 1)
## $H
## [1] 0.5656129
# Clustering tendency of df: Hopkins statistic plus the ordered dissimilarity
# image drawn on a white-to-grey gradient.
set.seed(123)
get_clust_tendency(
  df,
  n = nrow(df) - 1,
  graph = TRUE,
  gradient = list(low = "white", mid = "Grey", high = "Dark grey")
)
## $hopkins_stat
## [1] 0.7135737
##
## $plot
# Same clustering-tendency check for the random reference data.
get_clust_tendency(
  random_df,
  n = nrow(random_df) - 1,
  graph = TRUE,
  gradient = list(low = "white", mid = "Grey", high = "Dark grey")
)
## $hopkins_stat
## [1] 0.3789834
##
## $plot
# Two-cluster k-means (25 random starts) on df, shown as a cluster plot.
k2 <- kmeans(x = df, centers = 2, nstart = 25)
fviz_cluster(object = k2, data = df)

# Same fit on the random reference data.
# NOTE: `k2` is overwritten here, so from this point on it holds the
# random_df fit, not the df fit.
k2 <- kmeans(x = random_df, centers = 2, nstart = 25)
fviz_cluster(object = k2, data = random_df)
# Choose the number of clusters for k-means with the gap statistic and the
# average silhouette width.
fviz_nbclust(x = df, FUNcluster = kmeans, method = "gap_stat")
fviz_nbclust(x = df, FUNcluster = kmeans, method = "silhouette")

# Repeat both criteria with PAM (k-medoids) as the clustering function.
fviz_nbclust(x = df, FUNcluster = pam, method = "gap_stat") +
  theme_minimal()
fviz_nbclust(x = df, FUNcluster = pam, method = "silhouette") +
  theme_minimal()
# Two-cluster k-means solution (25 random starts) on df.
k2cluster <- kmeans(df, centers = 2, nstart = 25)

# The original passed `elipse.type = "concave"`: the argument name is
# misspelled (so it never reached `ellipse.type`) and "concave" is not a
# supported ellipse type. Spell out the convex hull that was actually drawn.
fviz_cluster(k2cluster, data = df, ellipse.type = "convex", geom = "point") +
  ggtitle("Kmeans for 2 clusters")

# Silhouette widths of the k-means partition on Euclidean distances.
sil <- silhouette(k2cluster$cluster, dist(df))
fviz_silhouette(sil)
## cluster size ave.sil.width
## 1 1 30 0.61
## 2 2 16 0.55
library(ggpubr)
# Cluster plot and silhouette plot side by side for the 2-cluster k-means fit.
# Fix: the original plotted `k2`, which at this point holds the k-means fit of
# random_df, so the left panel did not match `sil`; use `k2cluster`, the fit
# on df. The misspelled `elipse.type = "concave"` is replaced by the convex
# hull that was effectively drawn.
ggarrange(
  fviz_cluster(k2cluster, data = df, ellipse.type = "convex", geom = "point") +
    ggtitle("Kmeans for 2 clusters"),
  fviz_silhouette(sil),
  ncol = 2, nrow = 1
)
## cluster size ave.sil.width
## 1 1 30 0.61
## 2 2 16 0.55
# Four-cluster k-means solution (25 random starts) on df.
k4cluster <- kmeans(df, centers = 4, nstart = 25)

# `elipse.type = "concave"` was a misspelled argument with an unsupported
# value; state the convex hull explicitly instead.
fviz_cluster(k4cluster, data = df, ellipse.type = "convex", geom = "point") +
  ggtitle("Kmeans for 4 clusters")

# Silhouette widths of the k-means partition on Euclidean distances.
sil <- silhouette(k4cluster$cluster, dist(df))
fviz_silhouette(sil)
## cluster size ave.sil.width
## 1 1 10 0.51
## 2 2 17 0.43
## 3 3 13 0.45
## 4 4 6 0.53
library(ggpubr)
# Cluster plot and silhouette plot side by side for the 4-cluster k-means fit.
# The misspelled `elipse.type = "concave"` is replaced by the correctly named
# argument with the convex hull that was effectively drawn.
ggarrange(
  fviz_cluster(k4cluster, data = df, ellipse.type = "convex", geom = "point") +
    ggtitle("Kmeans for 4 clusters"),
  fviz_silhouette(sil),
  ncol = 2, nrow = 1
)
## cluster size ave.sil.width
## 1 1 10 0.51
## 2 2 17 0.43
## 3 3 13 0.45
## 4 4 6 0.53
# Two-cluster PAM (k-medoids) solution on df. This reuses the name `k2cluster`
# and therefore replaces the k-means fit stored under it.
# NOTE(review): per the cluster documentation `nstart` is only used with
# medoids = "random", so it looks like a no-op here -- confirm.
k2cluster <- pam(df, 2, nstart = 25)

# `elipse.type = "concave"` was a misspelled argument with an unsupported
# value; state the convex hull explicitly instead.
fviz_cluster(k2cluster, data = df, ellipse.type = "convex", geom = "point") +
  ggtitle("PAM for 2 clusters")

# pam objects store assignments in `$clustering`; the original `$cluster` only
# worked through partial matching of `$`.
sil <- silhouette(k2cluster$clustering, dist(df))
fviz_silhouette(sil)
## cluster size ave.sil.width
## 1 1 17 0.51
## 2 2 29 0.62
library(ggpubr)
# Cluster plot and silhouette plot side by side for the 2-cluster PAM fit.
# Fix: the original plotted `k2`, a k-means fit of random_df, under a "PAM"
# title; use `k2cluster`, which holds the PAM fit on df at this point. The
# misspelled `elipse.type = "concave"` is replaced by the convex hull that was
# effectively drawn.
ggarrange(
  fviz_cluster(k2cluster, data = df, ellipse.type = "convex", geom = "point") +
    ggtitle("PAM for 2 clusters"),
  fviz_silhouette(sil),
  ncol = 2, nrow = 1
)
## cluster size ave.sil.width
## 1 1 17 0.51
## 2 2 29 0.62
# Four-cluster PAM (k-medoids) solution on df. This reuses the name `k4cluster`
# and therefore replaces the k-means fit stored under it.
# NOTE(review): per the cluster documentation `nstart` is only used with
# medoids = "random", so it looks like a no-op here -- confirm.
k4cluster <- pam(df, 4, nstart = 25)

# `elipse.type = "concave"` was a misspelled argument with an unsupported
# value; state the convex hull explicitly instead.
fviz_cluster(k4cluster, data = df, ellipse.type = "convex", geom = "point") +
  ggtitle("PAM for 4 clusters")

# pam objects store assignments in `$clustering`; the original `$cluster` only
# worked through partial matching of `$`.
sil <- silhouette(k4cluster$clustering, dist(df))
fviz_silhouette(sil)
## cluster size ave.sil.width
## 1 1 9 0.52
## 2 2 17 0.43
## 3 3 13 0.49
## 4 4 7 0.40
library(ggpubr)
# Cluster plot and silhouette plot side by side for the 4-cluster PAM fit.
# The misspelled `elipse.type = "concave"` is replaced by the correctly named
# argument with the convex hull that was effectively drawn.
ggarrange(
  fviz_cluster(k4cluster, data = df, ellipse.type = "convex", geom = "point") +
    ggtitle("PAM for 4 clusters"),
  fviz_silhouette(sil),
  ncol = 2, nrow = 1
)
## cluster size ave.sil.width
## 1 1 9 0.52
## 2 2 17 0.43
## 3 3 13 0.49
## 4 4 7 0.40