load the libraries
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
read the file “Shopping_cluster.csv” (modify to your own directory)
# Read the shopping data used for clustering (adjust the path to your own directory).
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
shop <- read.csv("~/Desktop/R/Shopping_cluster.csv", header = TRUE, sep = ",")

Hierarchical clustering

data preparation
# Build the clustering input in one pipeline: drop the caseno column, discard
# rows with missing values, then standardize the variables (needed when they
# are measured on different scales).
shop_slt <- shop %>%
  mutate(caseno = NULL) %>%
  na.omit() %>%
  scale()
# Distance matrix between the standardized rows (dist() with its default metric).
d <- dist(shop_slt)
form clusters
# Hierarchical clustering on the distance matrix using the "ward.D2" linkage,
# the author's preferred method; ?hclust lists the other available methods.
result2 <- hclust(d, method = "ward.D2")
# Draw the resulting dendrogram.
plot(result2)

cut tree into certain number of clusters
# Cut the tree into three clusters; hc_grp holds the cluster label of every
# clustered row.
hc_grp <- cutree(result2, k = 3)
# Number of members in each cluster.
table(hc_grp)
## hc_grp
## 1 2 3 
## 8 6 6
# Attach the cluster labels to the original data. hc_grp has one entry per row
# that survived na.omit() during data preparation, so keep only those rows of
# shop; otherwise cbind() fails on a row-count mismatch whenever a row with a
# missing value was dropped. With no missing values every row is kept.
kept_rows <- complete.cases(shop[setdiff(names(shop), "caseno")])
shop_new <- cbind(shop[kept_rows, , drop = FALSE], hc_grp)

The following applies to both k-means and hierarchical cluster methods

Visualize the clusters (run install.packages("factoextra") first if the package is not yet installed)
library(factoextra)
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Plot the clusters on the data that was actually clustered (the standardized
# matrix shop_slt). Passing shop_new instead would feed the caseno column and
# the hc_grp labels themselves into the projection and distort the picture.
fviz_cluster(list(data = shop_slt, cluster = hc_grp))

Determine the optimal number of clusters (elbow method - based on the within-cluster sum of squares)
# Elbow plot of the within-cluster sum of squares for a range of cluster counts,
# using hierarchical clustering (hcut) as the partitioning function.
# The argument is spelled out in full: "FUN" only worked through partial
# matching of "FUNcluster".
fviz_nbclust(shop_slt, FUNcluster = hcut, method = "wss")

Discriminant Analysis

read the data (modify to your own directory)
# Read the data for the discriminant analysis (adjust the path to your own directory).
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
shop1 <- read.csv("~/Desktop/R/Shopping_discrim.csv", header = TRUE, sep = ",")
check the group mean
# Per-cluster averages of purchase intention, income and age.
shop1 %>%
  group_by(Cluster) %>%
  summarise(
    ave_purchase_int = mean(Purchase_intention),
    ave_income = mean(Income),
    ave_age = mean(Age)
  )
## # A tibble: 3 x 4
##   Cluster ave_purchase_int ave_income ave_age
##     <int>            <dbl>      <dbl>   <dbl>
## 1       1             4.17       44.0    46.7
## 2       2             3.83       56.6    49.2
## 3       3             5.88       69.8    48.1
select variables and run the analysis
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
# Linear discriminant analysis with Income as the only predictor of Cluster.
result3 <- lda(Cluster ~ Income, data = shop1)
# Print the fitted model (priors, group means, discriminant coefficients).
print(result3)
## Call:
## lda(Cluster ~ Income, data = shop1)
## 
## Prior probabilities of groups:
##   1   2   3 
## 0.3 0.3 0.4 
## 
## Group means:
##     Income
## 1 44.03333
## 2 56.65000
## 3 69.83750
## 
## Coefficients of linear discriminants:
##              LD1
## Income 0.1567633
predict the grouping, and merge with original data
# Apply the fitted discriminant model to the same data it was fitted on.
# NOTE(review): the result is stored in a variable named `predict`, which
# shadows the function of the same name; the name is kept in case later code
# relies on it.
predict <- predict(result3, newdata = shop1)
# The `class` component contains the predicted cluster for each customer.
predict_cluster <- predict[["class"]]
# Append the predicted cluster as an extra column of the original data.
shop1_new <- cbind(shop1, predict_cluster)
calculate the prediction accuracy
# Flag each row with 1 when the predicted cluster equals the actual one, else 0.
shop1_new$correct <- as.numeric(shop1_new$Cluster == shop1_new$predict_cluster)
# Share of correctly classified rows within each actual cluster.
shop1_new %>%
  group_by(Cluster) %>%
  summarise(accuracy = mean(correct))
## # A tibble: 3 x 2
##   Cluster accuracy
##     <int>    <dbl>
## 1       1    0.667
## 2       2    0.5  
## 3       3    0.875