#loading dataset
data6 <- read.csv("clustering.csv")
View(data6)
#Channel codes: Hotel = 1, Restaurant = 2, Cafe = 3
#Clean, augment, and preprocess the data into a more convenient form, if needed.
any(is.na(data6))
## [1] FALSE
data6$Region <- NULL  # drop the Region column; it is not used in this analysis
#Perform exploratory analysis of the data to better understand the data
plot(data6)  # pairwise scatterplot matrix of all variables

plot(data6$Channel,data6$Milk)

plot(data6$Channel,data6$Grocery)

plot(data6$Channel,data6$Detergents_Paper)

plot(data6$Channel,data6$Fresh)
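#Since Channel is a categorical code, boxplots of spending per channel
#may be easier to read than the scatterplots above; a minimal sketch
#using the same columns:
boxplot(Milk ~ Channel, data = data6, main = "Milk by Channel")
boxplot(Grocery ~ Channel, data = data6, main = "Grocery by Channel")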

#Describe a problem statement using association
#rule mining to find associations between interesting variables.
#Which product categories are purchased most frequently by hotels and
#restaurants?
#Normalizing the data
norm_data <- scale(data6)
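#scale() centers each column to mean 0 and rescales it to standard
#deviation 1; a quick sanity check:
round(colMeans(norm_data), 3)   # all approximately 0
apply(norm_data, 2, sd)         # all exactly 1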
#Create a PCA model of the data and inspect model output
pc <- princomp(norm_data,cor = TRUE,scores = TRUE)
#Inspect the results
summary(pc)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## Standard deviation 1.7603202 1.3373485 0.8603248 0.75161147 0.68073486
## Proportion of Variance 0.4426753 0.2555001 0.1057370 0.08070283 0.06619999
## Cumulative Proportion 0.4426753 0.6981755 0.8039124 0.88461525 0.95081525
## Comp.6 Comp.7
## Standard deviation 0.53056661 0.250584015
## Proportion of Variance 0.04021442 0.008970336
## Cumulative Proportion 0.99102966 1.000000000
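#As a cross-check of the summary above, the proportions of variance
#explained can be computed directly from the component standard
#deviations stored in the model:
pve <- pc$sdev^2 / sum(pc$sdev^2)  # proportion of variance explained
round(pve, 4)
round(cumsum(pve), 4)              # cumulative proportion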
#d. Obtain some visualizations to better understand
#the PCA model. Create a biplot. Describe the output.
#Three plots are produced below. In the first (variance bar plot),
#component 1 clearly explains more variance than components 3 and 5.
#The second is the elbow curve: components 1 and 2 are more
#significant than the remaining components.
#In the third (the biplot), Fresh, Delicassen, and Frozen point apart
#from the other variables, i.e. they contribute to a different
#direction of the variation.
plot(pc)

plot(pc,type='l')

biplot(pc)
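#The biplot arrows correspond to the loadings; inspecting them directly
#shows how each original variable contributes to each component:
round(unclass(pc$loadings), 2)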

#e. Describe how many and which components will
#be retained to preserve 90 to 95% of the variance.
summary(pc)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## Standard deviation 1.7603202 1.3373485 0.8603248 0.75161147 0.68073486
## Proportion of Variance 0.4426753 0.2555001 0.1057370 0.08070283 0.06619999
## Cumulative Proportion 0.4426753 0.6981755 0.8039124 0.88461525 0.95081525
## Comp.6 Comp.7
## Standard deviation 0.53056661 0.250584015
## Proportion of Variance 0.04021442 0.008970336
## Cumulative Proportion 0.99102966 1.000000000
#From the summary, the cumulative proportion of variance is 0.804
#after Comp.3, 0.885 after Comp.4, and 0.951 after Comp.5, so the
#first five components should be retained to preserve 90 to 95% of
#the variance.
#f.Obtain a scree plot showing the proportion of variance explained by
#each principal component as well as the cumulative proportion of
#variance explained.Use the scree plot to select a natural number of
#principal components.
screeplot(pc, type = "l", main = "Scree Plot")
abline(h = 1)  # Kaiser rule: retain components whose variance exceeds 1
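#The default screeplot shows raw variances only; a small sketch that
#also plots the proportion and cumulative proportion of variance
#explained (recomputing pve from the model for self-containment):
pve <- pc$sdev^2 / sum(pc$sdev^2)
par(mfrow = c(1, 2))
plot(pve, type = "b", xlab = "Principal component",
     ylab = "Proportion of variance explained", ylim = c(0, 1))
plot(cumsum(pve), type = "b", xlab = "Principal component",
     ylab = "Cumulative proportion", ylim = c(0, 1))
par(mfrow = c(1, 1))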

#The horizontal line at height 1 (Kaiser rule) retains only
#components 1 and 2, while the elbow in the curve falls around the
#second or third component, so two to three principal components is a
#natural choice.
#g.Evaluate and describe the results in context of the problem
#statement
#To identify which product categories dominate the different channels,
#components 1, 2, and 3 capture most of the structure in the data and
#will play the important role.
#a.Describe the problem statement for using the
#hierarchical clustering model
#Which purchasing patterns are common to hotels, restaurants, and
#cafes, and can the customers be grouped accordingly?
#b.Scale the data as needed
norm_data <- scale(data6)
#c. Create a hierarchical clustering model using appropriate distance metrics
#and linkage methods
hclust.data <- hclust(dist(norm_data))  # Euclidean distance; complete linkage by default
# using linkage method
# Cluster using complete linkage: hclust.complete
hclust.complete <- hclust(dist(norm_data), method = "complete")
# Cluster using average linkage: hclust.average
hclust.average <- hclust(dist(norm_data), method = "average")
# Cluster using single linkage: hclust.single
hclust.single <- hclust(dist(norm_data), method = "single")
# Plot dendrogram of hclust.complete
plot(hclust.complete, main = "Complete")

# Plot dendrogram of hclust.average
plot(hclust.average, main = "Average")

# Plot dendrogram of hclust.single
plot(hclust.single, main = "Single")
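#dist() defaults to Euclidean distance; as a sketch, other metrics such
#as Manhattan distance can be swapped in to see whether the dendrogram
#changes:
hclust.manhattan <- hclust(dist(norm_data, method = "manhattan"),
                           method = "complete")
plot(hclust.manhattan, main = "Complete (Manhattan distance)")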

#inspect the result
summary(hclust.data)
## Length Class Mode
## merge 878 -none- numeric
## height 439 -none- numeric
## order 440 -none- numeric
## labels 0 -none- NULL
## method 1 -none- character
## call 2 -none- call
## dist.method 1 -none- character
#d.Obtain plot of the hierarchical clustering with dendrogram
#e.Determine where to cut the dendrogram to obtain clusters
plot(hclust.data)
abline(h = 7, col = "red")

clusterCut <- cutree(hclust.data, 3)
table(clusterCut, data6$Channel)
##
## clusterCut 1 2
## 1 297 137
## 2 0 5
## 3 1 0
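#As a sketch, cutree() can also cut by height directly, which shows
#how the number of clusters varies with the chosen cut:
sapply(c(5, 6, 7, 8), function(h) max(cutree(hclust.data, h = h)))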
#f.Evaluate and describe the clusters obtained in the context of the
#problem
#The red line cuts the dendrogram at height 7, which produces 3
#clusters. The table shows that cluster 1 mixes both channels (297
#hotel and 137 restaurant observations), cluster 2 contains only 5
#restaurant observations, and cluster 3 a single hotel observation,
#so the hierarchical clusters do not separate the channels cleanly.
#a.Describe the problem statement for using K-means
#Which observations fall into which segment: hotel, restaurant, or
#cafe?
#b.Scale the data as needed
norm_data <- scale(data6)
#c.Create a K-means clustering model on the data
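#Note: kmeans() starts from random centers, so cluster labels can vary
#between runs; nstart = 20 keeps the best of 20 random starts. For
#exactly reproducible labels one could fix the RNG seed first, e.g.
#set.seed(123) (a hypothetical seed; the output below was produced
#without one).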
km.data <- kmeans(norm_data, centers = 3, nstart = 20)
#Print the cluster membership component of the model
km.data$cluster
## [1] 1 1 1 2 1 1 1 1 2 1 1 1 1 1 1 2 1 2 1 2 1 2 2 3 1 1 2 2 1 2 2 2 2 2 2
## [36] 1 2 1 1 2 2 2 1 1 1 1 1 3 1 1 2 2 1 1 2 2 3 1 2 2 1 3 1 1 2 3 2 1 2 2
## [71] 2 2 2 1 1 2 2 1 2 2 2 1 1 2 1 3 3 2 2 2 2 2 3 2 1 2 1 2 2 2 1 1 1 2 2
## [106] 2 1 1 1 1 2 1 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2
## [141] 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 1 1 2 1 1 1 2 2 1 1 1 1 2 2 2 1 1 2 1 2
## [176] 1 2 2 2 2 2 3 2 3 2 2 2 2 1 1 2 2 2 1 2 2 2 1 2 2 1 1 2 2 2 1 2 1 2 1
## [211] 2 3 2 2 1 2 1 2 1 2 2 2 2 1 2 2 1 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [246] 1 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 1 2 1 2 1 2 2 2 2 2 2 2 2 2 2 1
## [281] 2 1 2 2 2 2 2 2 2 2 2 2 2 1 2 1 2 1 1 2 1 1 1 1 1 1 1 2 2 1 2 2 1 2 2
## [316] 1 2 2 2 1 2 2 2 2 2 3 2 2 2 2 2 1 2 3 2 1 2 2 2 2 1 1 2 1 2 2 1 1 2 1
## [351] 2 1 2 1 2 2 2 1 2 2 2 2 2 2 2 1 2 2 2 2 1 2 2 1 2 2 1 2 2 1 2 2 2 2 2
## [386] 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 1 1 2 2 2 2 2 2 1 1 2 1 2
## [421] 2 1 2 1 1 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2
# Inspect the result
summary(km.data)
## Length Class Mode
## cluster 440 -none- numeric
## centers 21 -none- numeric
## totss 1 -none- numeric
## withinss 3 -none- numeric
## tot.withinss 1 -none- numeric
## betweenss 1 -none- numeric
## size 3 -none- numeric
## iter 1 -none- numeric
## ifault 1 -none- numeric
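#To sanity-check the choice of k = 3, a small sketch of the elbow
#method: total within-cluster sum of squares over a range of k values
#(assumes norm_data as above).
wss <- sapply(1:10, function(k)
  kmeans(norm_data, centers = k, nstart = 20)$tot.withinss)
plot(1:10, wss, type = "b", xlab = "Number of clusters k",
     ylab = "Total within-cluster sum of squares")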
# 2D cluster plot of norm_data (clusplot projects the data onto its
# first two principal components)
library(cluster)
clusplot(norm_data,km.data$cluster,main = "2D representation of cluster",
shade = TRUE, labels = 2, lines = 0)

#Components 1 and 2 explain 69.82% of the point variability, as stated
#in the plot subtitle.
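#The cluster package (already loaded above) also provides silhouette
#widths, another way to judge how well k = 3 separates the data; a
#minimal sketch:
sil <- silhouette(km.data$cluster, dist(norm_data))
plot(sil)
mean(sil[, "sil_width"])  # average silhouette width (closer to 1 is better)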
#8) Compare K-means result to Hierarchical clustering
#a. Evaluate and describe your results
cut.data <- cutree(hclust.data, k = 3)
# Compare methods
table(km.data$cluster, cut.data)
## cut.data
## 1 2 3
## 1 130 0 0
## 2 296 0 0
## 3 8 5 1
#Hierarchical clustering places almost all observations (434 of 440)
#into its first cluster, with only 5 and 1 observations in clusters 2
#and 3, while the k-means algorithm distributes the observations more
#evenly (130, 296, and 14).
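#Agreement between the two partitions can also be quantified with the
#adjusted Rand index (a sketch; assumes the mclust package is
#installed):
library(mclust)
adjustedRandIndex(km.data$cluster, cut.data)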
#9) Since PCA decorrelates the variables, it can be used to improve the performance of other modeling methods.
#Try to see if PCA improves the performance of hierarchical clustering.
pr.hclust <- hclust(dist(pc$scores), method = 'complete')
# Cut model into 3 clusters:pr.hclust.clusters
pr.hclust.clusters <- cutree(pr.hclust,k=3)
# Compare to the actual Channel labels
table(pr.hclust.clusters, data6$Channel)
##
## pr.hclust.clusters 1 2
## 1 297 137
## 2 0 5
## 3 1 0
table(pr.hclust.clusters, km.data$cluster)
##
## pr.hclust.clusters 1 2 3
## 1 130 296 8
## 2 0 0 5
## 3 0 0 1
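#Note that clustering on all seven scores is equivalent to clustering
#the original scaled data, because PCA is an orthogonal rotation that
#preserves Euclidean distances (hence the identical table above). A
#fairer test of whether PCA helps is to cluster on only the retained
#components; a sketch using the first five scores:
pr.hclust5 <- hclust(dist(pc$scores[, 1:5]), method = "complete")
table(cutree(pr.hclust5, k = 3), data6$Channel)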
#Which method would you select to perform the clustering for your
#study and why?
#K-means is the better choice for this study: its clusters are far
#more balanced than the hierarchical ones, and the clusplot shows that
#components 1 and 2 explain almost 70% of the point variability, so
#the structure it finds in two dimensions is well supported.