# Import the customer segmentation data (comma-separated, header row, "." as
# the decimal mark) and preview the first rows.
mydata <- read.table(
  file = "C:/Users/PC/OneDrive/Desktop/hw43/customer_segmentation.csv",
  header = TRUE,
  sep = ",",
  dec = "."
)
head(mydata)
## ID Year_Birth Education Marital_Status Income Kidhome Teenhome
## 1 5524 1957 Graduation Single 58138 0 0
## 2 2174 1954 Graduation Single 46344 1 1
## 3 4141 1965 Graduation Together 71613 0 0
## 4 6182 1984 Graduation Together 26646 1 0
## 5 5324 1981 PhD Married 58293 1 0
## 6 7446 1967 Master Together 62513 0 1
## Dt_Customer Recency MntWines MntFruits MntMeatProducts
## 1 04-09-2012 58 635 88 546
## 2 08-03-2014 38 11 1 6
## 3 21-08-2013 26 426 49 127
## 4 10-02-2014 26 11 4 20
## 5 19-01-2014 94 173 43 118
## 6 09-09-2013 16 520 42 98
## MntFishProducts MntSweetProducts MntGoldProds NumDealsPurchases
## 1 172 88 88 3
## 2 2 1 6 2
## 3 111 21 42 1
## 4 10 3 5 2
## 5 46 27 15 5
## 6 0 42 14 2
## NumWebPurchases NumCatalogPurchases NumStorePurchases
## 1 8 10 4
## 2 1 1 2
## 3 8 2 10
## 4 2 0 4
## 5 5 3 6
## 6 6 4 10
## NumWebVisitsMonth AcceptedCmp3 AcceptedCmp4 AcceptedCmp5
## 1 7 0 0 0
## 2 5 0 0 0
## 3 4 0 0 0
## 4 6 0 0 0
## 5 5 0 0 0
## 6 6 0 0 0
## AcceptedCmp1 AcceptedCmp2 Complain Z_CostContact Z_Revenue Response
## 1 0 0 0 3 11 1
## 2 0 0 0 3 11 0
## 3 0 0 0 3 11 0
## 4 0 0 0 3 11 0
## 5 0 0 0 3 11 0
## 6 0 0 0 3 11 0
Data source: https://www.kaggle.com/datasets/vishakhdapat/customer-segmentation-clustering?resource=download
# Does the data set contain at least one missing value?
anyNA(mydata)
## [1] TRUE
library(tidyr)
# Remove every row that contains at least one missing value.
mydata <- drop_na(mydata)

# Keep the identifier, the variables later used for validation and the six
# spending variables used for clustering.
data <- subset(
  mydata,
  select = c(
    ID, Education, Marital_Status, Income, Kidhome,
    MntWines, MntFruits, MntMeatProducts,
    MntFishProducts, MntSweetProducts, MntGoldProds
  )
)

# Draw a reproducible random sample of 400 customers.
set.seed(1)
sampledata <- data[sample(nrow(data), size = 400), ]

# Convert the categorical variables to factors with an explicit level order.
# Every level is listed exactly once and the labels default to the levels;
# any value that is not listed becomes NA.
sampledata$EducationF <- factor(
  sampledata$Education,
  levels = c("Graduation", "Master", "PhD", "2n Cycle", "Basic")
)
sampledata$MaritalStatusF <- factor(
  sampledata$Marital_Status,
  levels = c("Married", "Single", "Together", "Divorced", "Widow")
)

# Descriptive statistics of the six cluster variables.
summary(
  sampledata[c(
    "MntWines", "MntFruits", "MntMeatProducts",
    "MntFishProducts", "MntSweetProducts", "MntGoldProds"
  )]
)
## MntWines MntFruits MntMeatProducts MntFishProducts
## Min. : 0.0 Min. : 0.00 Min. : 1.0 Min. : 0.00
## 1st Qu.: 30.0 1st Qu.: 1.00 1st Qu.: 16.0 1st Qu.: 3.00
## Median : 169.0 Median : 9.00 Median : 66.0 Median : 13.00
## Mean : 299.9 Mean : 24.23 Mean : 175.0 Mean : 38.05
## 3rd Qu.: 489.2 3rd Qu.: 28.00 3rd Qu.: 239.8 3rd Qu.: 51.00
## Max. :1396.0 Max. :199.00 Max. :1622.0 Max. :258.00
## MntSweetProducts MntGoldProds
## Min. : 0.00 Min. : 0.00
## 1st Qu.: 1.00 1st Qu.: 8.75
## Median : 9.00 Median : 26.00
## Mean : 27.39 Mean : 44.73
## 3rd Qu.: 35.00 3rd Qu.: 55.00
## Max. :194.00 Max. :291.00
# Standardise (centre and scale) the six spending variables so that each one
# contributes equally to the distance calculations.
data_cluster <- as.data.frame(
  scale(
    sampledata[, c(
      "MntWines", "MntFruits", "MntMeatProducts",
      "MntFishProducts", "MntSweetProducts", "MntGoldProds"
    )]
  )
)
library(Hmisc)
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
# Pearson correlations (with p-values) between the standardised cluster
# variables.
rcorr(
  as.matrix(
    data_cluster[, c(
      "MntWines", "MntFruits", "MntMeatProducts",
      "MntFishProducts", "MntSweetProducts", "MntGoldProds"
    )]
  ),
  type = "pearson"
)
## MntWines MntFruits MntMeatProducts MntFishProducts
## MntWines 1.00 0.39 0.55 0.45
## MntFruits 0.39 1.00 0.48 0.51
## MntMeatProducts 0.55 0.48 1.00 0.57
## MntFishProducts 0.45 0.51 0.57 1.00
## MntSweetProducts 0.43 0.49 0.57 0.62
## MntGoldProds 0.40 0.31 0.31 0.39
## MntSweetProducts MntGoldProds
## MntWines 0.43 0.40
## MntFruits 0.49 0.31
## MntMeatProducts 0.57 0.31
## MntFishProducts 0.62 0.39
## MntSweetProducts 1.00 0.31
## MntGoldProds 0.31 1.00
##
## n= 400
##
##
## P
## MntWines MntFruits MntMeatProducts MntFishProducts
## MntWines 0 0 0
## MntFruits 0 0 0
## MntMeatProducts 0 0 0
## MntFishProducts 0 0 0
## MntSweetProducts 0 0 0 0
## MntGoldProds 0 0 0 0
## MntSweetProducts MntGoldProds
## MntWines 0 0
## MntFruits 0 0
## MntMeatProducts 0 0
## MntFishProducts 0 0
## MntSweetProducts 0
## MntGoldProds 0
From the correlation matrix, some problematic correlations can be observed.
However, we have a solution for this. If variables are correlated, we can use the same number of variables from each group of related variables:
*Note: We had a similar problem in one lecture example, so I hope this is how it is solved.
# Dissimilarity = Euclidean distance of each unit from the origin of the
# standardised variables (i.e. from the "average" customer).
sampledata$Dissimilarity <- with(
  data_cluster,
  sqrt(
    MntWines^2 + MntFruits^2 + MntMeatProducts^2 +
      MntFishProducts^2 + MntSweetProducts^2 + MntGoldProds^2
  )
)

# Show the 15 most dissimilar units to look for potential outliers.
head(
  sampledata[
    order(sampledata$Dissimilarity, decreasing = TRUE),
    c("ID", "Dissimilarity")
  ],
  15
)
## ID Dissimilarity
## 843 1456 6.434335
## 634 4611 6.404922
## 998 5236 6.397162
## 675 1501 6.226904
## 444 4947 6.181341
## 1966 3334 5.744786
## 1167 5735 5.743609
## 912 8931 5.537889
## 549 3179 5.391720
## 1040 4475 5.163738
## 1894 5832 5.130645
## 975 4580 5.090701
## 2171 8722 5.070093
## 1301 7143 4.942641
## 1596 9242 4.895167
I will remove the five units with dissimilarities above 6 (the five largest values in the list), because there is a clear gap between them and the remaining units (6.18 versus 5.74). I will standardize again afterwards.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:Hmisc':
##
## src, summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Drop the five units with the largest dissimilarities, identified by their
# customer IDs.
sampledata <- filter(
  sampledata,
  !(ID %in% c(1456, 4611, 5236, 1501, 4947))
)

# Standardise again, because removing units changes the means and standard
# deviations of the cluster variables.
data_cluster <- as.data.frame(
  scale(
    sampledata[, c(
      "MntWines", "MntFruits", "MntMeatProducts",
      "MntFishProducts", "MntSweetProducts", "MntGoldProds"
    )]
  )
)
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.3.2
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Euclidean distances between all pairs of units on the standardised cluster
# variables. The method is spelled "euclidean", the documented name of the
# distance measure.
distance <- get_dist(
  data_cluster[c(
    "MntWines", "MntFruits", "MntMeatProducts",
    "MntFishProducts", "MntSweetProducts", "MntGoldProds"
  )],
  method = "euclidean"
)

# Squared Euclidean distances.
distance2 <- distance^2

# Heat map of the distance matrix for a visual check of cluster structure.
fviz_dist(distance)
The pink areas suggest some natural groups.
# Hopkins statistic for clustering tendency, computed on the standardised
# cluster variables with n - 1 sampled points and without the plot.
get_clust_tendency(
  data_cluster[, c(
    "MntWines", "MntFruits", "MntMeatProducts",
    "MntFishProducts", "MntSweetProducts", "MntGoldProds"
  )],
  n = nrow(data_cluster) - 1,
  graph = FALSE
)
## $hopkins_stat
## [1] 0.7888769
##
## $plot
## NULL
It is above 0.5, meaning that data is suitable and that there are natural groups.
library(dplyr)
# Hierarchical clustering: Euclidean distances between units, merged with
# Ward's minimum-variance criterion ("ward.D2").
WARD <- data_cluster %>%
  get_dist(method = "euclidean") %>%
  hclust(method = "ward.D2")

# Summary of the fitted hierarchical clustering object.
print(WARD)
##
## Call:
## hclust(d = ., method = "ward.D2")
##
## Cluster method : ward.D2
## Distance : euclidean
## Number of objects: 395
library(factoextra)
# Draw the dendrogram of the Ward clustering to help choose the number of
# clusters.
fviz_dend(WARD)
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none"
## instead as of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
## Please report the issue at
## <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning
## was generated.
Using the dendrogram, it can be concluded that 3 groups are most appropriate for clustering.
# Cut the dendrogram into three groups and store the membership per unit.
sampledata$ClusterWard <- cutree(WARD, k = 3)

# First rows: customer ID with its Ward cluster.
head(sampledata[, c("ID", "ClusterWard")])
## ID ClusterWard
## 1 8985 1
## 2 143 2
## 3 9014 3
## 4 6810 1
## 5 2217 3
## 6 1676 3
# Initial leaders: mean of every standardised cluster variable within each
# Ward cluster.
Leaders_initial <- aggregate(
  x = data_cluster,
  by = list(sampledata$ClusterWard),
  FUN = mean
)
print(Leaders_initial)
## Group.1 MntWines MntFruits MntMeatProducts MntFishProducts
## 1 1 0.7752190 1.0987793 1.26203295 1.2488743
## 2 2 0.7429790 -0.1751276 -0.03453963 -0.1576263
## 3 3 -0.7192361 -0.4870040 -0.63158591 -0.5714868
## MntSweetProducts MntGoldProds
## 1 1.2080110 0.3946003
## 2 -0.2032689 0.8658640
## 3 -0.5307659 -0.5775097
library(factoextra)
# Hierarchical k-means with three clusters; the hierarchical step uses
# Euclidean distances and Ward's method ("ward.D2").
kmeans_clu <- hkmeans(
  x = data_cluster,
  k = 3,
  hc.metric = "euclidean",
  hc.method = "ward.D2"
)
print(kmeans_clu)
## Hierarchical K-means clustering with 3 clusters of sizes 80, 85, 230
##
## Cluster means:
## MntWines MntFruits MntMeatProducts MntFishProducts
## 1 0.7758660 1.346413423 1.2889280 1.4682302
## 2 0.9298703 -0.009391192 0.4202140 0.1213956
## 3 -0.6135142 -0.464847054 -0.6036193 -0.5555523
## MntSweetProducts MntGoldProds
## 1 1.52224574 0.4458134
## 2 -0.03602783 0.9570749
## 3 -0.51616215 -0.5087671
##
## Clustering vector:
## [1] 1 2 3 1 3 3 3 3 1 1 1 2 3 3 1 3 3 3 3 2 3 1 2 3 3 3 3 3 3 3 3 3
## [33] 1 3 2 3 3 3 3 2 3 1 2 2 3 3 2 3 3 3 3 3 2 3 3 1 3 1 3 3 3 3 2 2
## [65] 3 3 2 3 2 2 3 3 3 1 1 1 3 3 3 3 3 1 3 1 2 1 3 1 3 3 3 2 3 2 2 2
## [97] 3 2 3 3 3 3 2 3 2 3 3 1 2 1 3 3 2 1 2 3 3 3 2 3 3 3 2 2 1 2 2 1
## [129] 3 2 3 3 3 2 2 3 3 3 3 1 3 2 3 2 3 3 3 1 3 3 3 2 3 3 2 2 3 2 1 3
## [161] 2 2 3 3 3 2 2 3 3 3 3 1 3 3 3 3 2 1 2 3 3 1 2 1 1 3 3 3 1 2 3 1
## [193] 3 1 3 3 1 3 3 1 1 2 3 3 1 1 3 3 1 3 2 1 1 1 2 3 3 2 3 2 3 3 3 2
## [225] 3 3 3 2 2 1 1 1 3 3 1 3 1 2 3 3 3 1 2 2 1 2 3 2 3 3 3 3 3 3 3 1
## [257] 3 1 3 1 1 3 3 1 3 1 2 3 3 3 1 2 3 3 3 3 3 3 1 3 1 3 3 3 1 3 2 3
## [289] 2 3 1 3 2 2 2 2 1 3 1 3 3 3 2 1 3 2 3 2 3 3 3 3 1 2 1 3 2 3 1 3
## [321] 1 2 3 3 3 3 3 2 2 3 2 3 3 1 2 3 1 3 1 3 3 3 3 3 3 2 3 2 3 1 3 1
## [353] 1 2 3 3 3 3 1 1 3 3 3 3 3 3 3 2 3 3 3 3 1 3 3 2 3 3 3 3 3 2 3 3
## [385] 1 1 3 3 3 3 1 3 3 3 3
##
## Within cluster sum of squares by cluster:
## [1] 567.5895 398.3432 118.6359
## (between_SS / total_SS = 54.1 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault" "data" "hclust"
The ratio between the between-cluster sum of squares and the total sum of squares is 54.1%, i.e. the clustering explains 54.1% of the total variability. This number indicates a relatively good fit.
library(factoextra)
# Plot the k-means solution.
# `repel` must be passed as a named argument (`repel = TRUE`) so that the
# point labels are pushed apart; an expression such as `repel - TRUE` would be
# taken as a positional argument and label repelling would stay switched off.
fviz_cluster(
  kmeans_clu,
  palette = "Set1",
  repel = TRUE,
  ggtheme = theme_bw()
)
I see some units that could be removed, however, for educational purposes I will just continue without removing them.
# Store the final k-means membership next to the Ward membership.
sampledata$ClusterK_Means <- kmeans_clu[["cluster"]]

# First rows: compare the two classifications side by side.
head(sampledata[, c("ClusterWard", "ClusterK_Means")])
## ClusterWard ClusterK_Means
## 1 1 1
## 2 2 2
## 3 3 3
## 4 1 1
## 5 3 3
## 6 3 3
# Number of units in each Ward cluster.
table(sampledata$ClusterWard)
##
## 1 2 3
## 104 88 203
# Number of units in each k-means cluster.
table(sampledata$ClusterK_Means)
##
## 1 2 3
## 80 85 230
# Cross-tabulation (rows: Ward, columns: k-means) showing how many units were
# reclassified by the k-means step.
table(sampledata$ClusterWard, sampledata$ClusterK_Means)
##
## 1 2 3
## 1 80 22 2
## 2 0 63 25
## 3 0 0 203
# Final leaders: the cluster centres of the k-means solution.
Leaders_final <- kmeans_clu[["centers"]]
print(Leaders_final)
## MntWines MntFruits MntMeatProducts MntFishProducts
## 1 0.7758660 1.346413423 1.2889280 1.4682302
## 2 0.9298703 -0.009391192 0.4202140 0.1213956
## 3 -0.6135142 -0.464847054 -0.6036193 -0.5555523
## MntSweetProducts MntGoldProds
## 1 1.52224574 0.4458134
## 2 -0.03602783 0.9570749
## 3 -0.51616215 -0.5087671
# Profile plot of the final cluster centres (means of the standardised
# cluster variables).

# One row per cluster; ID carries the cluster number. seq_len() is used so
# the sequence stays valid for any number of rows.
Figure <- as.data.frame(Leaders_final)
Figure$ID <- seq_len(nrow(Figure))

library(tidyr)
# Long format: one row per (cluster, variable) pair, with the variable name in
# `name` and the centre in `value`. Every column except ID is a cluster
# variable.
Figure <- pivot_longer(Figure, cols = -ID)

# The group factor is derived from the clusters actually present instead of a
# hard-coded list of six groups.
Figure$Group <- factor(Figure$ID)

# Fix the order of the variables on the x axis.
Figure$NameF <- factor(
  Figure$name,
  levels = c(
    "MntWines", "MntFruits", "MntMeatProducts",
    "MntFishProducts", "MntSweetProducts", "MntGoldProds"
  )
)

library(ggplot2)
ggplot(Figure, aes(x = NameF, y = value)) +
  geom_hline(yintercept = 0) + # 0 = overall average of a standardised variable
  theme_bw() +
  geom_point(aes(shape = Group, col = Group), size = 3) +
  geom_line(aes(group = ID), linewidth = 1) +
  ylab("Averages") +
  xlab("Cluster variables") +
  ylim(-2, 2)
From the graph we can see how the clusters differ from each other based on the 6 cluster variables. The 0 line represents the average. Group 1 is above average on each of the 6 variables, while group 3 is below average on each of the 6 variables. Group 3 spends the least on wines, fruits, meat, fish, sweets and gold (all variables) of all 3 groups. Group 2 spends the most on wines and gold products (I called them luxury products after the correlation check) of all groups, while group 1 spends the most on fruits and sweets (sugar products) and on fish and meat (animal products) of all 3 groups.
# One-way ANOVA for each of the six cluster variables, with the k-means
# cluster as the grouping factor.
fit <- aov(
  cbind(
    MntWines, MntFruits, MntMeatProducts,
    MntFishProducts, MntSweetProducts, MntGoldProds
  ) ~ as.factor(ClusterK_Means),
  data = sampledata
)
summary(fit)
## Response MntWines :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(ClusterK_Means) 2 22868997 11434499 219.69 < 2.2e-16 ***
## Residuals 392 20403278 52049
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response MntFruits :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(ClusterK_Means) 2 253659 126830 191.54 < 2.2e-16 ***
## Residuals 392 259565 662
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response MntMeatProducts :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(ClusterK_Means) 2 11595247 5797624 279.86 < 2.2e-16 ***
## Residuals 392 8120642 20716
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response MntFishProducts :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(ClusterK_Means) 2 609113 304557 321.22 < 2.2e-16 ***
## Residuals 392 371659 948
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response MntSweetProducts :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(ClusterK_Means) 2 379148 189574 328.5 < 2.2e-16 ***
## Residuals 392 226220 577
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response MntGoldProds :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(ClusterK_Means) 2 405988 202994 124.82 < 2.2e-16 ***
## Residuals 392 637496 1626
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Six one-way ANOVAs - to check whether each cluster variable differs between the 3 groups.
I will write a generalized hypothesis for all 6 tests (because there are 6 variables).
H0: $\mu_1 = \mu_2 = \mu_3$ (because we have 3 groups)
H1: At least one $\mu$ is different from the others.
All p-values are <0.0001, so we can reject the null hypothesis for each of the 6 tests and conclude that all cluster variables successfully differentiate between the groups.
I will try to validate clusters with the following variables:
I will start with the numerical ones by checking the means of both variables.
# Mean annual income within each k-means cluster.
aggregate(
  x = sampledata$Income,
  by = list(sampledata$ClusterK_Means),
  FUN = mean
)
## Group.1 x
## 1 1 73947.18
## 2 2 66899.55
## 3 3 40108.93
# Mean number of young children at home within each k-means cluster.
aggregate(
  x = sampledata$Kidhome,
  by = list(sampledata$ClusterK_Means),
  FUN = mean
)
## Group.1 x
## 1 1 0.05000000
## 2 2 0.08235294
## 3 3 0.68260870
# One-way ANOVA for each numerical validation variable (Income, Kidhome),
# with the k-means cluster as the grouping factor.
fit1 <- aov(
  cbind(Income, Kidhome) ~ as.factor(ClusterK_Means),
  data = sampledata
)
summary(fit1)
## Response Income :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(ClusterK_Means) 2 8.9717e+10 4.4858e+10 200.59 < 2.2e-16
## Residuals 392 8.7662e+10 2.2363e+08
##
## as.factor(ClusterK_Means) ***
## Residuals
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response Kidhome :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(ClusterK_Means) 2 36.493 18.2464 99.267 < 2.2e-16 ***
## Residuals 392 72.054 0.1838
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Two one-way ANOVAs - to check whether each numerical variable differs between the 3 groups.
I will write a generalized hypothesis for both tests (because there are 2 checked variables).
H0: $\mu_1 = \mu_2 = \mu_3$ (because we have 3 groups)
H1: At least one $\mu$ is different from the others.
Both p-values are <0.0001, so we can reject the null hypothesis for both tests and conclude that both Income and Kidhome successfully differentiate between the groups.
We validate clustering with Income and Kidhome.
Units in first group have largest annual incomes and the units in the third group are the ones with the highest number of young children in the household (this is concluded from the above computed means).
Now I will move on to categorical variables for validation.
# Pearson chi-squared test of association between education level and the
# k-means cluster.
chisquare <- chisq.test(sampledata$EducationF, as.factor(sampledata$ClusterK_Means))
## Warning in chisq.test(sampledata$EducationF,
## as.factor(sampledata$ClusterK_Means)): Chi-squared approximation may
## be incorrect
# Print the test statistic, degrees of freedom and p-value.
chisquare
##
## Pearson's Chi-squared test
##
## data: sampledata$EducationF and as.factor(sampledata$ClusterK_Means)
## X-squared = 16.766, df = 8, p-value = 0.03264
Pearson Chi2 test for association between two categorical variables
H0: There is no association between the variables
H1: There is association between the variables
# Observed frequencies with row and column totals.
addmargins(chisquare$observed)
##
## sampledata$EducationF 1 2 3 Sum
## Graduation 48 42 118 208
## Master 8 8 41 57
## PhD 11 26 45 82
## 2n Cycle 13 9 22 44
## Basic 0 0 4 4
## Sum 80 85 230 395
# Expected frequencies under independence, with row and column totals.
# The margins are added before rounding so that the totals are sums of the
# exact expected counts rather than sums of already rounded cells.
round(addmargins(chisquare$expected), 2)
##
## sampledata$EducationF 1 2 3 Sum
## Graduation 42.13 44.76 121.11 208.00
## Master 11.54 12.27 33.19 57.00
## PhD 16.61 17.65 47.75 82.01
## 2n Cycle 8.91 9.47 25.62 44.00
## Basic 0.81 0.86 2.33 4.00
## Sum 80.00 85.01 230.00 395.01
3 expected frequencies are less than 5; however, 3 out of 15 is okay, because it means that 20% of the expected frequencies are below 5.
*Note: This reduces the power of the test.
# Pearson residuals, (observed - expected) / sqrt(expected), per cell.
# The component is referred to by its full name instead of relying on partial
# matching of `$res`.
round(chisquare$residuals, 2)
##
## sampledata$EducationF 1 2 3
## Graduation 0.90 -0.41 -0.28
## Master -1.04 -1.22 1.36
## PhD -1.38 1.99 -0.40
## 2n Cycle 1.37 -0.15 -0.72
## Basic -0.90 -0.93 1.09
Result of the chi-squared test: We can reject the null hypothesis at p = 0.033 and conclude that there is a statistically significant association between education and the clustering groups. There are more customers with a PhD in the second group than expected (α = 0.05).
We validate clustering with Education as well.
Let's check for marital status as well.
# Pearson chi-squared test of association between marital status and the
# k-means cluster.
chisquare1 <- chisq.test(sampledata$MaritalStatusF, as.factor(sampledata$ClusterK_Means))
## Warning in chisq.test(sampledata$MaritalStatusF,
## as.factor(sampledata$ClusterK_Means)): Chi-squared approximation may
## be incorrect
# Print the test statistic, degrees of freedom and p-value.
chisquare1
##
## Pearson's Chi-squared test
##
## data: sampledata$MaritalStatusF and as.factor(sampledata$ClusterK_Means)
## X-squared = 5.6951, df = 8, p-value = 0.6813
Pearson Chi2 test for association between two categorical variables
H0: There is no association between the variables
H1: There is association between the variables
# Observed frequencies with row and column totals.
addmargins(chisquare1$observed)
##
## sampledata$MaritalStatusF 1 2 3 Sum
## Married 32 34 94 160
## Single 22 15 48 85
## Together 21 27 62 110
## Divorced 5 7 18 30
## Widow 0 2 8 10
## Sum 80 85 230 395
# Expected frequencies under independence, with row and column totals.
# The margins are added before rounding so that the totals are sums of the
# exact expected counts rather than sums of already rounded cells.
round(addmargins(chisquare1$expected), 2)
##
## sampledata$MaritalStatusF 1 2 3 Sum
## Married 32.41 34.43 93.16 160.00
## Single 17.22 18.29 49.49 85.00
## Together 22.28 23.67 64.05 110.00
## Divorced 6.08 6.46 17.47 30.01
## Widow 2.03 2.15 5.82 10.00
## Sum 80.02 85.00 229.99 395.01
2 expected frequencies are less than 5, but this is okay since it is less than 20% of all expected frequencies.
# Pearson residuals, (observed - expected) / sqrt(expected), per cell.
# The component is referred to by its full name instead of relying on partial
# matching of `$res`.
round(chisquare1$residuals, 2)
##
## sampledata$MaritalStatusF 1 2 3
## Married -0.07 -0.07 0.09
## Single 1.15 -0.77 -0.21
## Together -0.27 0.68 -0.26
## Divorced -0.44 0.21 0.13
## Widow -1.42 -0.10 0.90
The differences between expected and actual frequencies are not statistically significant
Result of Chi squared: We do not have enough evidence to conclude that there is association between clustering groups and marital status (p>5%).
We cannot validate clusters with marital status.
I divided 395 customers into 3 segments based on six standardized variables.
For hierarchical clustering, we used Ward's clustering algorithm and decided to divide them into 3 groups based on the analysis of the dendrogram. The classification was further optimized using K-Means clustering.
Group 3 contains the most customers (around 58% - 230/395), characterized by lower than average values of all cluster variables. The customers in group 3 have the lowest mean amounts spent on wines, fruits, meat, fish, sweets and gold. On average, they have the lowest annual income and the highest number of young children in the household. There are more customers with a Master's education in this group than expected (this difference is not significant, but I explained it for educational purposes).