Part 1: Cluster Analysis Using White Wine Data

1. Introduction

This study segments white wines using clustering techniques based on their chemical properties. The analysis provides insights into wine differentiation and quality perception.

2. Data Overview

wine <- read.table("C:/Users/nicho/OneDrive/Desktop/MSDA/MK 6460 - Marketing Research & Analytics/Week 2 - Market Segmentation (Cluster Analysis) & Association Mining (Market Basket Analysis)/datasets/winequality-white(UCI).csv", sep=";", header=TRUE)
summary(wine)
##  fixed.acidity    volatile.acidity  citric.acid     residual.sugar  
##  Min.   : 3.800   Min.   :0.0800   Min.   :0.0000   Min.   : 0.600  
##  1st Qu.: 6.300   1st Qu.:0.2100   1st Qu.:0.2700   1st Qu.: 1.700  
##  Median : 6.800   Median :0.2600   Median :0.3200   Median : 5.200  
##  Mean   : 6.855   Mean   :0.2782   Mean   :0.3342   Mean   : 6.391  
##  3rd Qu.: 7.300   3rd Qu.:0.3200   3rd Qu.:0.3900   3rd Qu.: 9.900  
##  Max.   :14.200   Max.   :1.1000   Max.   :1.6600   Max.   :65.800  
##    chlorides       free.sulfur.dioxide total.sulfur.dioxide    density      
##  Min.   :0.00900   Min.   :  2.00      Min.   :  9.0        Min.   :0.9871  
##  1st Qu.:0.03600   1st Qu.: 23.00      1st Qu.:108.0        1st Qu.:0.9917  
##  Median :0.04300   Median : 34.00      Median :134.0        Median :0.9937  
##  Mean   :0.04577   Mean   : 35.31      Mean   :138.4        Mean   :0.9940  
##  3rd Qu.:0.05000   3rd Qu.: 46.00      3rd Qu.:167.0        3rd Qu.:0.9961  
##  Max.   :0.34600   Max.   :289.00      Max.   :440.0        Max.   :1.0390  
##        pH          sulphates         alcohol         quality     
##  Min.   :2.720   Min.   :0.2200   Min.   : 8.00   Min.   :3.000  
##  1st Qu.:3.090   1st Qu.:0.4100   1st Qu.: 9.50   1st Qu.:5.000  
##  Median :3.180   Median :0.4700   Median :10.40   Median :6.000  
##  Mean   :3.188   Mean   :0.4898   Mean   :10.51   Mean   :5.878  
##  3rd Qu.:3.280   3rd Qu.:0.5500   3rd Qu.:11.40   3rd Qu.:6.000  
##  Max.   :3.820   Max.   :1.0800   Max.   :14.20   Max.   :9.000
str(wine)
## 'data.frame':    4898 obs. of  12 variables:
##  $ fixed.acidity       : num  7 6.3 8.1 7.2 7.2 8.1 6.2 7 6.3 8.1 ...
##  $ volatile.acidity    : num  0.27 0.3 0.28 0.23 0.23 0.28 0.32 0.27 0.3 0.22 ...
##  $ citric.acid         : num  0.36 0.34 0.4 0.32 0.32 0.4 0.16 0.36 0.34 0.43 ...
##  $ residual.sugar      : num  20.7 1.6 6.9 8.5 8.5 6.9 7 20.7 1.6 1.5 ...
##  $ chlorides           : num  0.045 0.049 0.05 0.058 0.058 0.05 0.045 0.045 0.049 0.044 ...
##  $ free.sulfur.dioxide : num  45 14 30 47 47 30 30 45 14 28 ...
##  $ total.sulfur.dioxide: num  170 132 97 186 186 97 136 170 132 129 ...
##  $ density             : num  1.001 0.994 0.995 0.996 0.996 ...
##  $ pH                  : num  3 3.3 3.26 3.19 3.19 3.26 3.18 3 3.3 3.22 ...
##  $ sulphates           : num  0.45 0.49 0.44 0.4 0.4 0.44 0.47 0.45 0.49 0.45 ...
##  $ alcohol             : num  8.8 9.5 10.1 9.9 9.9 10.1 9.6 8.8 9.5 11 ...
##  $ quality             : int  6 6 6 6 6 6 6 6 6 6 ...

3. K-Means Clustering (with sampling)

wine1 <- wine[, 1:11]
wine2 <- as.data.frame(scale(wine1))

set.seed(123)
wine_sample <- wine2[sample(1:nrow(wine2), 1000), ]

library(NbClust)
nbc_kmeans <- NbClust(wine_sample, min.nc=2, max.nc=10, method="kmeans")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 
## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 9 proposed 2 as the best number of clusters 
## * 4 proposed 3 as the best number of clusters 
## * 1 proposed 4 as the best number of clusters 
## * 3 proposed 5 as the best number of clusters 
## * 4 proposed 6 as the best number of clusters 
## * 1 proposed 7 as the best number of clusters 
## * 1 proposed 9 as the best number of clusters 
## * 1 proposed 10 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  2 
##  
##  
## *******************************************************************
km <- kmeans(wine_sample, 2, nstart=25)
table(km$cluster)
## 
##   1   2 
## 624 376
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.2.2
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.2.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

km.res <- eclust(wine_sample, "kmeans", k=2, nstart=25, graph=FALSE)
fviz_cluster(km.res, geom="point", ellipse.type="norm", palette="jco", ggtheme=theme_minimal())

4. Hierarchical Clustering

dist_mat <- dist(wine_sample, method="euclidean")
hc <- hclust(dist_mat, method="complete")
plot(hc, hang=-1, main="Complete-Linkage")

hcWard <- hclust(dist_mat, method="ward.D2")
plot(hcWard, main="Ward's-Linkage")

5. PAM with Mixed Data

wine2_mixed <- wine1
wine2_mixed$pH <- as.factor(ifelse(wine1$pH > 3.2, "High", "Low"))

library(cluster)
disMat <- daisy(wine2_mixed, metric="gower")

sil_width <- c(NA)
for(i in 2:10){
  pam_fit <- pam(disMat, diss = TRUE, k = i)
  sil_width[i] <- pam_fit$silinfo$avg.width
}
plot(2:10, sil_width[2:10], type="b", xlab="k", ylab="Silhouette Width")

pamFit <- pam(disMat, k=3)
table(pamFit$clustering)
## 
##    1    2    3 
## 1424 2056 1418

6. Clusters vs. Quality

wine_quality_merge <- cbind(wine, cluster=pamFit$clustering)
table(wine_quality_merge$quality, wine_quality_merge$cluster)
##    
##       1   2   3
##   3   6  10   4
##   4  48  62  53
##   5 673 506 278
##   6 579 950 669
##   7 102 430 348
##   8  15  94  66
##   9   1   4   0

7. Marketing Insights

Clusters show different chemical profiles: - Cluster 1: Higher alcohol, lower sugar — may appeal to premium or bold-wine customers. - Cluster 2: Softer profile — could target casual drinkers or dessert wine enthusiasts.

Implications: - Tailor messaging to highlight these qualities (e.g., “crisp & bold” vs. “smooth & sweet”). - Distribute to relevant venues (e.g., restaurants, wine clubs). - Offer samples or bundle promotions by cluster type.

Part 2: RFM and Marketing Segmentation

1. Introduction

RFM (Recency, Frequency, Monetary) analysis segments customers based on purchasing behavior.

2. RFM Processing

rfm <- read.csv("C:/Users/nicho/OneDrive/Desktop/MSDA/MK 6460 - Marketing Research & Analytics/Week 2 - Market Segmentation (Cluster Analysis) & Association Mining (Market Basket Analysis)/datasets/RFM_data.csv", header=TRUE)
library(lubridate)
## Warning: package 'lubridate' was built under R version 4.2.2
## Loading required package: timechange
## Warning: package 'timechange' was built under R version 4.2.2
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
rfm$trans_date <- dmy(rfm$trans_date)
ref_date <- max(rfm$trans_date)

library(dplyr)
## Warning: package 'dplyr' was built under R version 4.2.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
rfm_summary <- rfm %>%
  group_by(customer_id) %>%
  summarise(
    recency = as.numeric(ref_date - max(trans_date)),
    frequency = n(),
    monetary = sum(tran_amount)
  )

3. K-Means Clustering

rfm_norm <- scale(rfm_summary[,-1])
set.seed(123)
kmeans_rfm <- kmeans(rfm_norm, 4, nstart=25)
rfm_clustered <- cbind(rfm_summary, cluster=kmeans_rfm$cluster)

aggregate(rfm_summary[,-1], by=list(kmeans_rfm$cluster), mean)
boxplot(rfm_summary$monetary ~ kmeans_rfm$cluster, col="skyblue", main="Monetary by Cluster")

4. Non-Cluster RFM Segmentation

rfm_quantiles <- rfm_summary
rfm_quantiles$R_score <- ntile(-rfm_quantiles$recency, 4)
rfm_quantiles$F_score <- ntile(rfm_quantiles$frequency, 4)
rfm_quantiles$M_score <- ntile(rfm_quantiles$monetary, 4)
rfm_quantiles$RFM_total <- rfm_quantiles$R_score + rfm_quantiles$F_score + rfm_quantiles$M_score

rfm_quantiles$segment <- cut(rfm_quantiles$RFM_total,
                             breaks=c(2, 6, 9, 12),
                             labels=c("Low Value", "Mid Value", "High Value"))

table(rfm_quantiles$segment)
## 
##  Low Value  Mid Value High Value 
##       2611       2408       1870

5. Marketing Strategy Insights

  • High Value: Recent, frequent buyers with high spend — target for loyalty programs, early access, premium offers, exclusive offers and benefits, etc.
  • Mid Value: Show potential — send personalized promotions or upgrade nudges.
  • Low Value: Use reactivation campaigns or surveys to understand disengagement.

More info on high value customers to lapsed customers and reengagement linked here -What is RFM analysis (recency, frequency, monetary)? | Definition from TechTarget