# DATABASE-8 CLUSTERING

# DATABASE-8

# Setup: attach the clustering package and read the input data.
# install.packages() only has to be run when the package is not installed yet;
# library() has to be run once per R session.
#install.packages("cluster")
library(cluster)
# The workspace is intentionally not cleared with rm(list = ls()): that call
# deletes every object already present in the global environment, while
# leaving loaded packages and options untouched. Run this script in a fresh
# R session instead.
#===================================
#Change the path here
#===================================
AdventureWorksDW <- read.csv("~/git-R/clustering/clustering/AdventureWorksClusteringInput.csv")
# Preview the first rows to confirm the file was read as expected.
head(AdventureWorksDW)
# Console output recorded from a previous run (kept as a comment so the file
# remains valid R):
#   CustomerKey SalesAmount YearlyIncome
# 1       11012          81       100000
# 2       11013         114       100000
# 3       11014         138       100000
# 4       11015        2501        30000
# 5       11016        2332        30000
# 6       11019         184        40000
# Lay out two panels side by side (1 row x 2 columns) and draw the kernel
# density estimate of each numeric column into its own panel.
par(mfrow = c(1, 2))
with(AdventureWorksDW, {
  plot(density(SalesAmount), main = "Density Sales Amount")
  plot(density(YearlyIncome), main = "Density Yearly Income")
})

# Derive an income tier for every customer from YearlyIncome:
#   key 1 / "Low"  : YearlyIncome <= 30000
#   key 2 / "Mid"  : 30000 < YearlyIncome <= 60000
#   key 3 / "High" : YearlyIncome > 60000
# With left.open = TRUE, findInterval() counts the cut-offs lying strictly
# below each income, so adding 1 gives the tier key directly. A missing
# income yields an NA key and an NA label.
income_cutoffs <- c(30000, 60000)
IncCategoryKey <- findInterval(AdventureWorksDW$YearlyIncome, income_cutoffs,
                               left.open = TRUE) + 1
# Spell out the levels so every key always maps to the same label. Without
# them factor() infers the levels from the data and fails on the three labels
# as soon as one tier has no customers.
IncCategoryLabel <- factor(IncCategoryKey, levels = 1:3,
                           labels = c("Low", "Mid", "High"))
# Assign by column name rather than cbind() so that re-running this section
# overwrites the two columns instead of appending duplicates.
AdventureWorksDW$IncCategoryKey <- IncCategoryKey
AdventureWorksDW$IncCategoryLabel <- IncCategoryLabel
# Back to a single plot per page.
par(mfrow = c(1, 1))
#Screen snip 1
# The box plots are vertical (horizontal = FALSE): the income tiers run along
# the x-axis and the plotted values along the y-axis, so the axis titles are
# assigned to match that orientation.
boxplot(YearlyIncome ~ IncCategoryLabel, data = AdventureWorksDW,
        main = "gg21- Boxplot Income",
        xlab = "Income Cluster",
        ylab = "Income in USD", horizontal = FALSE)

#Screen snip 2
# Same grouping, but showing the sales amount of the customers in each tier.
boxplot(SalesAmount ~ IncCategoryLabel, data = AdventureWorksDW,
        main = "gg21- Boxplot Sales Amount",
        xlab = "Income Cluster",
        ylab = "Sales Amount", horizontal = FALSE)

# Keep the two numeric features plus the income label used for comparison.
selectedSubset <- subset(AdventureWorksDW,
                         select = c("SalesAmount", "YearlyIncome", "IncCategoryLabel"))
# Cluster on a copy without the label column so only numeric data is passed on.
selectedSubsetNoLabel <- selectedSubset
selectedSubsetNoLabel$IncCategoryLabel <- NULL
# Fix the RNG seed so the random starting centres are the same on every run.
set.seed(1234)
# NOTE(review): the features are clustered on their raw scales; if YearlyIncome
# spans a much wider range than SalesAmount it will dominate the distances.
# Consider scale() if both features should contribute equally.
gg21_kmeans.result <- kmeans(selectedSubsetNoLabel, 3)
#gg21_kmeans.result
#Screen snip 3
# Cross-tabulate the income tiers (rows) against the k-means clusters (columns).
table(selectedSubset$IncCategoryLabel, gg21_kmeans.result$cluster)
# Console output recorded from a previous run (kept as a comment so the file
# remains valid R):
#
#           1    2    3
#   Low  3190    0    0
#   Mid  1883    0 2784
#   High    0 1252 2709
#Screen snip 4 after the points command
# Scatter plot of the two features, one colour per k-means cluster.
plot(
  x = selectedSubsetNoLabel[c("SalesAmount", "YearlyIncome")],
  col = gg21_kmeans.result[["cluster"]]
)
# Overlay the three cluster centres as large stars in the matching colours.
points(
  x = gg21_kmeans.result[["centers"]][, c("SalesAmount", "YearlyIncome")],
  col = seq_len(3), pch = 8, cex = 2
)

#Screen snip 5 after the plot command, just the clusplot
# Partition the numeric features around 3 medoids with pam() from the cluster
# package.
gg21_pam.result <- pam(selectedSubsetNoLabel, 3)
# Cross-tabulate the PAM clusters (rows) against the income tiers (columns).
table(gg21_pam.result$clustering, selectedSubset$IncCategoryLabel)
# Console output recorded from a previous run (kept as a comment so the file
# remains valid R):
#
#      Low  Mid High
#   1    0    0 1714
#   2 3190 1883    0
#   3    0 2784 2247
# Draw the diagnostic plots for the PAM result.
plot(gg21_pam.result)

#reduce the number of rows
# Draw a random sample of at most 100 row indices. Capping the size at the
# number of available rows keeps the draw valid for small inputs; sampling
# more rows than exist without replacement is an error.
# No seed is set in this section, so the draw depends on the state of the
# random-number generator when it runs.
sample_size <- min(100, nrow(selectedSubset))
rowsToKeep <- sample.int(nrow(selectedSubset), sample_size)
reducedRows <- selectedSubset[rowsToKeep, ]
# Numeric-only copy of the sample for the clustering functions.
reducedRowsNoLabel <- reducedRows
reducedRowsNoLabel$IncCategoryLabel <- NULL
#Screen snip 6 and 7 after the plot command (clusplot and silhouette plot)
# Repeat the 3-medoid PAM clustering on the sampled rows; this overwrites the
# result computed on the full data.
gg21_pam.result <- pam(reducedRowsNoLabel, 3)
# Cross-tabulate the PAM clusters (rows) against the income tiers (columns).
table(gg21_pam.result$clustering, reducedRows$IncCategoryLabel)
# Console output recorded from a previous run (kept as a comment so the file
# remains valid R):
#
#     Low Mid High
#   1   0   0   18
#   2   0  20   15
#   3  32  15    0
# Draw the clusplot and the silhouette plot for the sampled data.
plot(gg21_pam.result)

#Screen snip 8 after the rect command
# Hierarchical clustering of the sampled rows; "ave" abbreviates the "average"
# linkage method. The dist() call stays nested inside hclust() because the
# dendrogram's default annotation is built from the stored call.
hc <- hclust(d = dist(reducedRowsNoLabel), method = "ave")
# hang = -1 lines all leaf labels up at the bottom of the dendrogram; the
# leaves are labelled with the income tier of each sampled customer.
plot(hc, labels = reducedRows[["IncCategoryLabel"]], hang = -1)
# Outline the three clusters obtained by cutting the tree at k = 3.
rect.hclust(tree = hc, k = 3)