load the libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
Hierarchical clustering
data preparation
# Build the clustering input from `shop`: drop the `caseno` column, discard
# rows that contain any missing value, then standardize the remaining columns
# so that variables measured on different scales contribute comparably.
shop_slt <- shop %>%
  mutate(caseno = NULL) %>%
  na.omit() %>%
  scale()

# Distance matrix between the standardized rows (dist() defaults to Euclidean).
d <- dist(shop_slt)
cut the tree into a specified number of clusters
# Assign every clustered observation to one of 3 groups by cutting the tree.
# NOTE(review): `result2` is not defined in the visible part of this script;
# presumably it is the hclust() fit built from `d` -- confirm.
hc_grp <- cutree(result2, k = 3) # cut into 3 clusters
table(hc_grp) # number of members in each cluster
## hc_grp
## 1 2 3
## 8 6 6
# Attach the cluster labels to the original data. The clustering input went
# through na.omit(), so `hc_grp` only covers the rows of `shop` that have no
# missing values (ignoring `caseno`). Keep exactly those rows here; binding to
# the full `shop` would fail or silently recycle labels onto the wrong rows
# whenever a row had been dropped.
complete_rows <- complete.cases(mutate(shop, caseno = NULL))
shop_new <- cbind(shop[complete_rows, , drop = FALSE], hc_grp)
The following applies to both k-means and hierarchical clustering methods
Determine the optimal number of clusters (elbow method - based on the within-cluster sum of squares)
# Elbow plot: total within-cluster sum of squares against the number of
# clusters, using hierarchical clustering (hcut) on the standardized data.
# The clustering-function argument is spelled out in full (`FUNcluster`)
# instead of relying on partial matching of `FUN`, and both functions are
# namespace-qualified so the call does not depend on factoextra being attached.
factoextra::fviz_nbclust(
  shop_slt,
  FUNcluster = factoextra::hcut,
  method = "wss"
)

Discriminant Analysis
read the data (modify to your own directory)
# Load the discriminant-analysis data set from a comma-separated file whose
# first row holds the column names. `TRUE` is written out because the
# shorthand `T` is an ordinary variable that can be reassigned.
shop1 <- read.csv("~/Desktop/R/Shopping_discrim.csv", header = TRUE, sep = ",")
check the group mean
# Per-cluster averages of purchase intention, income and age.
shop1 %>%
  group_by(Cluster) %>%
  summarise(
    ave_purchase_int = mean(Purchase_intention),
    ave_income = mean(Income),
    ave_age = mean(Age)
  )
## # A tibble: 3 x 4
## Cluster ave_purchase_int ave_income ave_age
## <int> <dbl> <dbl> <dbl>
## 1 1 4.17 44.0 46.7
## 2 2 3.83 56.6 49.2
## 3 3 5.88 69.8 48.1
select variables and run the analysis
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
# Fit a linear discriminant model that predicts Cluster from Income alone,
# then auto-print the fitted object (priors, group means, coefficients).
result3 <- lda(Cluster ~ Income, data = shop1)
result3
## Call:
## lda(Cluster ~ Income, data = shop1)
##
## Prior probabilities of groups:
## 1 2 3
## 0.3 0.3 0.4
##
## Group means:
## Income
## 1 44.03333
## 2 56.65000
## 3 69.83750
##
## Coefficients of linear discriminants:
## LD1
## Income 0.1567633
predict the grouping, and merge with original data
# Score every row of shop1 with the fitted discriminant model.
# NOTE(review): the result is stored under the name `predict`, the same name
# as the predict() generic; calls to predict() still resolve to the function.
predict <- predict(result3, newdata = shop1)

# The `class` element holds the predicted cluster for each customer.
predict_cluster <- predict[["class"]]

# Append the predicted cluster as an extra column beside the original data.
shop1_new <- cbind(shop1, predict_cluster)
calculate the prediction accuracy
# Flag each row with 1 when the predicted cluster equals the recorded cluster
# and 0 otherwise.
shop1_new$correct <- with(
  shop1_new,
  ifelse(Cluster == predict_cluster, 1, 0)
)

# Share of correctly classified rows within each recorded cluster.
shop1_new %>%
  group_by(Cluster) %>%
  summarise(accuracy = mean(correct))
## # A tibble: 3 x 2
## Cluster accuracy
## <int> <dbl>
## 1 1 0.667
## 2 2 0.5
## 3 3 0.875