# These install and library commands only need to be run once after opening
# RStudio.
# install.packages("cluster")
library(cluster)

# DATABASE-8 CLUSTERING
# (Section title carried over from the original write-up; kept as a comment
# because bare text is not valid R.)

# NOTE(review): rm(list = ls()) deletes every object in the global
# environment of whoever runs this script. Restarting the R session is the
# safer way to get a clean workspace; consider removing this line.
rm(list = ls())

#===================================
# Change the path here
#===================================
AdventureWorksDW <- read.csv(
  "~/git-R/clustering/clustering/AdventureWorksClusteringInput.csv"
)

# Preview the first rows of the input. Output recorded in the original
# write-up:
head(AdventureWorksDW)
#>   CustomerKey SalesAmount YearlyIncome
#> 1       11012          81       100000
#> 2       11013         114       100000
#> 3       11014         138       100000
#> 4       11015        2501        30000
#> 5       11016        2332        30000
#> 6       11019         184        40000
# Draw the density of both numeric inputs side by side (1 row, 2 panels).
par(mfrow = c(1, 2))
plot(density(AdventureWorksDW$SalesAmount), main = "Density Sales Amount")
plot(density(AdventureWorksDW$YearlyIncome), main = "Density Yearly Income")

# Define income clusters. Each indicator vector holds its category key
# (1 = Low, 2 = Mid, 3 = High) for rows inside that income bracket and 0
# for every other row.
IncHigh.cluster <- ifelse(AdventureWorksDW$YearlyIncome > 60000, 3, 0)
IncMid.cluster <- ifelse(
  AdventureWorksDW$YearlyIncome <= 60000 &
    AdventureWorksDW$YearlyIncome > 30000,
  2,
  0
)
IncLow.cluster <- ifelse(AdventureWorksDW$YearlyIncome <= 30000, 1, 0)

# Bind clusters together.
# NOTE(review): this matrix is not referenced again in the visible part of
# the script and its name masks base::cat; consider renaming or removing it.
cat <- cbind(IncLow.cluster, IncMid.cluster, IncHigh.cluster)

# The three brackets are mutually exclusive, so the sum of the indicators is
# the single non-zero key (1, 2 or 3) for each row.
IncCategoryKey <- IncLow.cluster + IncMid.cluster + IncHigh.cluster

# Bind data set and category keys.
AdventureWorksDW <- cbind(AdventureWorksDW, IncCategoryKey)

# Assign labels to the data set. The levels are spelled out so the labels
# stay attached to the right key even if one income bracket has no rows
# (without explicit levels, factor() would fail on the label count).
IncCategoryLabel <- factor(
  IncCategoryKey,
  levels = 1:3,
  labels = c("Low", "Mid", "High")
)

# Bind data set and category labels.
AdventureWorksDW <- cbind(AdventureWorksDW, IncCategoryLabel)

# Back to a single plot panel for the charts that follow.
par(mfrow = c(1, 1))
# Screen snip 1
# Yearly income grouped by income category.
# NOTE(review): with horizontal = FALSE the categories run along the x axis
# and the values along the y axis, so the xlab/ylab texts look swapped --
# confirm the intended labels before changing them.
boxplot(
  AdventureWorksDW$YearlyIncome ~ AdventureWorksDW$IncCategoryLabel,
  main = "gg21- Boxplot Income",
  xlab = "Income in USD",
  ylab = "Income Cluster",
  horizontal = FALSE
)

# Screen snip 2
# Sales amount grouped by income category.
boxplot(
  AdventureWorksDW$SalesAmount ~ AdventureWorksDW$IncCategoryLabel,
  main = "gg21- Boxplot Sales Amount",
  xlab = "Income in USD",
  ylab = "Sales Amount Cluster",
  horizontal = FALSE
)

# Keep only the two numeric clustering inputs plus the income label.
selectedSubset <- subset(
  AdventureWorksDW,
  select = c("SalesAmount", "YearlyIncome", "IncCategoryLabel")
)

# Copy without the label column: the clustering calls get numeric columns
# only, while selectedSubset keeps the label for cross-tabulation.
selectedSubsetNoLabel <- selectedSubset
selectedSubsetNoLabel$IncCategoryLabel <- NULL

# Fix the random number generator state so the randomised steps that follow
# are reproducible.
set.seed(1234)
# k-means on the unlabeled numeric columns, asking for 3 clusters.
# NOTE(review): the columns are clustered on their raw scales; if
# YearlyIncome spans a much larger range than SalesAmount it will dominate
# the distances -- confirm whether scaling is wanted.
gg21_kmeans.result <- kmeans(selectedSubsetNoLabel, 3)
# gg21_kmeans.result

# Screen snip 3
# Cross-tabulate the income labels (rows) against the k-means cluster ids
# (columns). Output recorded in the original write-up:
table(selectedSubset$IncCategoryLabel, gg21_kmeans.result$cluster)
#>           1    2    3
#>   Low  3190    0    0
#>   Mid  1883    0 2784
#>   High    0 1252 2709

# Screen snip 4 after the points command
# Scatter plot coloured by cluster id, then the 3 cluster centres drawn on
# top as large star symbols (pch = 8).
plot(
  selectedSubsetNoLabel[, c("SalesAmount", "YearlyIncome")],
  col = gg21_kmeans.result$cluster
)
points(
  gg21_kmeans.result$centers[, c("SalesAmount", "YearlyIncome")],
  col = 1:3,
  pch = 8,
  cex = 2
)

# Screen snip 5 after the plot command, just the clusplot
# PAM (partitioning around medoids, from the cluster package) on the full
# unlabeled data, asking for 3 clusters.
gg21_pam.result <- pam(selectedSubsetNoLabel, 3)

# Cross-tabulate the PAM cluster ids (rows) against the income labels
# (columns). Output recorded in the original write-up:
table(gg21_pam.result$clustering, selectedSubset$IncCategoryLabel)
#>      Low  Mid High
#>   1    0    0 1714
#>   2 3190 1883    0
#>   3    0 2784 2247

plot(gg21_pam.result)

# reduce the number of rows
# Draw a random sample of 100 row indices. The size is capped at the number
# of available rows so sample() cannot fail on a smaller input; with 100 or
# more rows the call is the same as before.
rowsToKeep <- sample(nrow(selectedSubset), min(100, nrow(selectedSubset)))
reducedRows <- selectedSubset[rowsToKeep, ]

# Copy without the label column for clustering; reducedRows keeps the label
# for the cross-tabulation and the dendrogram labels.
reducedRowsNoLabel <- reducedRows
reducedRowsNoLabel$IncCategoryLabel <- NULL

# Screen snip 6 and 7 after the plot command (clusplot and silhouette plot)
gg21_pam.result <- pam(reducedRowsNoLabel, 3)

# Cross-tabulate the PAM cluster ids (rows) against the income labels
# (columns). Output recorded in the original write-up:
table(gg21_pam.result$clustering, reducedRows$IncCategoryLabel)
#>     Low Mid High
#>   1   0   0   18
#>   2   0  20   15
#>   3  32  15    0

plot(gg21_pam.result)

# Screen snip 8 after the rect command
# Hierarchical clustering of the reduced sample: dist() (default settings)
# builds the pairwise distance matrix of the unlabeled rows, and hclust()
# agglomerates on it. "ave" is an abbreviation that hclust resolves to its
# "average" linkage method.
hc<-hclust(dist(reducedRowsNoLabel), method="ave")
# Draw the dendrogram, labelling each leaf with the row's income category
# instead of its row name; the negative hang makes the labels hang down
# from 0.
plot(hc, hang = -1, labels = reducedRows$IncCategoryLabel)
# Outline the 3 clusters obtained by cutting the tree at k = 3 on the
# dendrogram that was just drawn.
rect.hclust(hc, k =3)