#loading dataset
data6 <- read.csv("clustering.csv")
View(data6)
#Channel codes: Hotel = 1, Restaurant = 2, Cafe = 3
#Clean, augment, and preprocess the data into a more convenient form, if needed.
any(is.na(data6))
## [1] FALSE
data6$Region <- NULL  # drop the Region column; it is not used in this analysis
#Perform exploratory analysis of the data to better understand the data
plot(data6)  # pairwise scatterplot matrix of all variables

plot(data6$Channel,data6$Milk)

plot(data6$Channel,data6$Grocery)

plot(data6$Channel,data6$Detergents_Paper)

plot(data6$Channel,data6$Fresh)
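#Since Channel is a categorical code, boxplots of spending per channel
#may be easier to read than the scatterplots above; a minimal sketch
#using the same columns:
boxplot(Milk ~ Channel, data = data6, main = "Milk by Channel")
boxplot(Grocery ~ Channel, data = data6, main = "Grocery by Channel")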

#Describe a problem statement using association
#rule mining to find associations between interesting variables.
#Which product categories are purchased most frequently by hotels and
#restaurants?
#Normalizing the data
norm_data <- scale(data6)
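#scale() centers each column to mean 0 and rescales it to standard
#deviation 1; a quick sanity check:
round(colMeans(norm_data), 3)   # all approximately 0
apply(norm_data, 2, sd)         # all exactly 1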
#Create a PCA model of the data and inspect model output
pc <- princomp(norm_data,cor = TRUE,scores = TRUE)
#Inspect the results
summary(pc)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## Standard deviation 1.7603202 1.3373485 0.8603248 0.75161147 0.68073486
## Proportion of Variance 0.4426753 0.2555001 0.1057370 0.08070283 0.06619999
## Cumulative Proportion 0.4426753 0.6981755 0.8039124 0.88461525 0.95081525
## Comp.6 Comp.7
## Standard deviation 0.53056661 0.250584015
## Proportion of Variance 0.04021442 0.008970336
## Cumulative Proportion 0.99102966 1.000000000
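#As a cross-check of the summary above, the proportions of variance
#explained can be computed directly from the component standard
#deviations stored in the model:
pve <- pc$sdev^2 / sum(pc$sdev^2)  # proportion of variance explained
round(pve, 4)
round(cumsum(pve), 4)              # cumulative proportion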
#d. Obtain some visualizations to better understand
#the PCA model. Create a biplot. Describe the output.
#Three plots are produced below. In the first (variance bar plot),
#component 1 clearly explains more variance than components 3 and 5.
#The second is the elbow curve: components 1 and 2 are more
#significant than the remaining components.
#In the third (the biplot), Fresh, Delicassen, and Frozen point apart
#from the other variables, i.e. they contribute to a different
#direction of the variation.
plot(pc)

plot(pc,type='l')

biplot(pc)
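#The biplot arrows correspond to the loadings; inspecting them directly
#shows how each original variable contributes to each component:
round(unclass(pc$loadings), 2)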

#e. Describe how many and which components will
#be retained to preserve 90 to 95% of the variance.
summary(pc)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## Standard deviation 1.7603202 1.3373485 0.8603248 0.75161147 0.68073486
## Proportion of Variance 0.4426753 0.2555001 0.1057370 0.08070283 0.06619999
## Cumulative Proportion 0.4426753 0.6981755 0.8039124 0.88461525 0.95081525
## Comp.6 Comp.7
## Standard deviation 0.53056661 0.250584015
## Proportion of Variance 0.04021442 0.008970336
## Cumulative Proportion 0.99102966 1.000000000
#From the summary, the cumulative proportion of variance is 0.804
#after Comp.3, 0.885 after Comp.4, and 0.951 after Comp.5, so the
#first five components should be retained to preserve 90 to 95% of
#the variance.
#f.Obtain a scree plot showing the proportion of variance explained by
#each principal component as well as the cumulative proportion of
#variance explained.Use the scree plot to select a natural number of
#principal components.
screeplot(pc, type = "l", main = "Scree Plot")
abline(h = 1)  # Kaiser rule: retain components whose variance exceeds 1
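#The default screeplot shows raw variances only; a small sketch that
#also plots the proportion and cumulative proportion of variance
#explained (recomputing pve from the model for self-containment):
pve <- pc$sdev^2 / sum(pc$sdev^2)
par(mfrow = c(1, 2))
plot(pve, type = "b", xlab = "Principal component",
     ylab = "Proportion of variance explained", ylim = c(0, 1))
plot(cumsum(pve), type = "b", xlab = "Principal component",
     ylab = "Cumulative proportion", ylim = c(0, 1))
par(mfrow = c(1, 1))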

#The horizontal line at height 1 (Kaiser rule) retains only
#components 1 and 2, while the elbow in the curve falls around the
#second or third component, so two to three principal components is a
#natural choice.
#g.Evaluate and describe the results in context of the problem
#statement
#To identify which product categories dominate the different channels,
#components 1, 2, and 3 capture most of the structure in the data and
#will play the important role.
#a.Describe the problem statement for using the
#hierarchical clustering model
#Which purchasing patterns are common to hotels, restaurants, and
#cafes, and can the customers be grouped accordingly?
#b.Scale the data as needed
norm_data <- scale(data6)
#c. Create a hierarchical clustering model using appropriate distance metrics
#and linkage methods
hclust.data <- hclust(dist(norm_data))  # Euclidean distance; complete linkage by default
# using linkage method
# Cluster using complete linkage: hclust.complete
hclust.complete <- hclust(dist(norm_data), method = "complete")
# Cluster using average linkage: hclust.average
hclust.average <- hclust(dist(norm_data), method = "average")
# Cluster using single linkage: hclust.single
hclust.single <- hclust(dist(norm_data), method = "single")
# Plot dendrogram of hclust.complete
plot(hclust.complete, main = "Complete")

# Plot dendrogram of hclust.average
plot(hclust.average, main = "Average")

# Plot dendrogram of hclust.single
plot(hclust.single, main = "Single")
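#dist() defaults to Euclidean distance; as a sketch, other metrics such
#as Manhattan distance can be swapped in to see whether the dendrogram
#changes:
hclust.manhattan <- hclust(dist(norm_data, method = "manhattan"),
                           method = "complete")
plot(hclust.manhattan, main = "Complete (Manhattan distance)")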

#inspect the result
summary(hclust.data)
## Length Class Mode
## merge 878 -none- numeric
## height 439 -none- numeric
## order 440 -none- numeric
## labels 0 -none- NULL
## method 1 -none- character
## call 2 -none- call
## dist.method 1 -none- character
#d.Obtain plot of the hierarchical clustering with dendrogram
#e.Determine where to cut the dendrogram to obtain clusters
plot(hclust.data)
abline(h = 7, col = "red")

clusterCut <- cutree(hclust.data, 3)
table(clusterCut, data6$Channel)
##
## clusterCut 1 2
## 1 297 137
## 2 0 5
## 3 1 0
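#As a sketch, cutree() can also cut by height directly, which shows
#how the number of clusters varies with the chosen cut:
sapply(c(5, 6, 7, 8), function(h) max(cutree(hclust.data, h = h)))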
#f.Evaluate and describe the clusters obtained in the context of the
#problem
#The red line cuts the dendrogram at height 7, which produces 3
#clusters. The table shows that cluster 1 mixes both channels (297
#hotel and 137 restaurant observations), cluster 2 contains only 5
#restaurant observations, and cluster 3 a single hotel observation,
#so the hierarchical clusters do not separate the channels cleanly.
#a.Describe the problem statement for using K-means
#Which observations fall into which segment: hotel, restaurant, or
#cafe?
#b.Scale the data as needed
norm_data <- scale(data6)
#c.Create a K-means clustering model on the data
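#Note: kmeans() starts from random centers, so cluster labels can vary
#between runs; nstart = 20 keeps the best of 20 random starts. For
#exactly reproducible labels one could fix the RNG seed first, e.g.
#set.seed(123) (a hypothetical seed; the output below was produced
#without one).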
km.data <- kmeans(norm_data, centers = 3, nstart = 20)
#Print the cluster membership component of the model
km.data$cluster
## [1] 1 1 1 2 1 1 1 1 2 1 1 1 1 1 1 2 1 2 1 2 1 2 2 3 1 1 2 2 1 2 2 2 2 2 2
## [36] 1 2 1 1 2 2 2 1 1 1 1 1 3 1 1 2 2 1 1 2 2 3 1 2 2 1 3 1 1 2 3 2 1 2 2
## [71] 2 2 2 1 1 2 2 1 2 2 2 1 1 2 1 3 3 2 2 2 2 2 3 2 1 2 1 2 2 2 1 1 1 2 2
## [106] 2 1 1 1 1 2 1 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2
## [141] 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 1 1 2 1 1 1 2 2 1 1 1 1 2 2 2 1 1 2 1 2
## [176] 1 2 2 2 2 2 3 2 3 2 2 2 2 1 1 2 2 2 1 2 2 2 1 2 2 1 1 2 2 2 1 2 1 2 1
## [211] 2 3 2 2 1 2 1 2 1 2 2 2 2 1 2 2 1 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [246] 1 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 1 2 1 2 1 2 2 2 2 2 2 2 2 2 2 1
## [281] 2 1 2 2 2 2 2 2 2 2 2 2 2 1 2 1 2 1 1 2 1 1 1 1 1 1 1 2 2 1 2 2 1 2 2
## [316] 1 2 2 2 1 2 2 2 2 2 3 2 2 2 2 2 1 2 3 2 1 2 2 2 2 1 1 2 1 2 2 1 1 2 1
## [351] 2 1 2 1 2 2 2 1 2 2 2 2 2 2 2 1 2 2 2 2 1 2 2 1 2 2 1 2 2 1 2 2 2 2 2
## [386] 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 1 1 2 2 2 2 2 2 1 1 2 1 2
## [421] 2 1 2 1 1 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2
# Inspect the result
summary(km.data)
## Length Class Mode
## cluster 440 -none- numeric
## centers 21 -none- numeric
## totss 1 -none- numeric
## withinss 3 -none- numeric
## tot.withinss 1 -none- numeric
## betweenss 1 -none- numeric
## size 3 -none- numeric
## iter 1 -none- numeric
## ifault 1 -none- numeric
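#To sanity-check the choice of k = 3, a small sketch of the elbow
#method: total within-cluster sum of squares over a range of k values
#(assumes norm_data as above).
wss <- sapply(1:10, function(k)
  kmeans(norm_data, centers = k, nstart = 20)$tot.withinss)
plot(1:10, wss, type = "b", xlab = "Number of clusters k",
     ylab = "Total within-cluster sum of squares")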
# 2D cluster plot of norm_data (clusplot projects the data onto its
# first two principal components)
library(cluster)
clusplot(norm_data,km.data$cluster,main = "2D representation of cluster",
shade = TRUE, labels = 2, lines = 0)

#Components 1 and 2 explain 69.82% of the point variability, as stated
#in the plot subtitle.
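#The cluster package (already loaded above) also provides silhouette
#widths, another way to judge how well k = 3 separates the data; a
#minimal sketch:
sil <- silhouette(km.data$cluster, dist(norm_data))
plot(sil)
mean(sil[, "sil_width"])  # average silhouette width (closer to 1 is better)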
#8) Compare K-means result to Hierarchical clustering
#a. Evaluate and describe your results
cut.data <- cutree(hclust.data, k = 3)
# Compare methods
table(km.data$cluster, cut.data)
## cut.data
## 1 2 3
## 1 130 0 0
## 2 296 0 0
## 3 8 5 1
#Hierarchical clustering places almost all observations (434 of 440)
#into its first cluster, with only 5 and 1 observations in clusters 2
#and 3, while the k-means algorithm distributes the observations more
#evenly (130, 296, and 14).
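#Agreement between the two partitions can also be quantified with the
#adjusted Rand index (a sketch; assumes the mclust package is
#installed):
library(mclust)
adjustedRandIndex(km.data$cluster, cut.data)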
#9) Since PCA decorrelates the variables, it can be used to improve the performance of other modeling methods.
#Try to see if PCA improves the performance of hierarchical clustering.
pr.hclust <- hclust(dist(pc$scores), method = 'complete')
# Cut model into 3 clusters:pr.hclust.clusters
pr.hclust.clusters <- cutree(pr.hclust,k=3)
# Compare to the actual Channel labels
table(pr.hclust.clusters, data6$Channel)
##
## pr.hclust.clusters 1 2
## 1 297 137
## 2 0 5
## 3 1 0
table(pr.hclust.clusters, km.data$cluster)
##
## pr.hclust.clusters 1 2 3
## 1 130 296 8
## 2 0 0 5
## 3 0 0 1
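#Note that clustering on all seven scores is equivalent to clustering
#the original scaled data, because PCA is an orthogonal rotation that
#preserves Euclidean distances (hence the identical table above). A
#fairer test of whether PCA helps is to cluster on only the retained
#components; a sketch using the first five scores:
pr.hclust5 <- hclust(dist(pc$scores[, 1:5]), method = "complete")
table(cutree(pr.hclust5, k = 3), data6$Channel)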
#Which method would you select to perform the clustering for your
#study and why?
#K-means is the better choice for this study: its clusters are far
#more balanced than the hierarchical ones, and the clusplot shows that
#components 1 and 2 explain almost 70% of the point variability, so
#the structure it finds in two dimensions is well supported.