#########################
##### Assignment 2 ######
#### Andrew Williams ####
####### 104431205 #######
#########################

library(plyr)
library(ggplot2)
library(cluster)
library(lattice)
library(graphics)
library(grid)
library(gridExtra)
library(cowplot)
## 
## ********************************************************
## Note: As of version 1.0.0, cowplot does not change the
##   default ggplot2 theme anymore. To recover the previous
##   behavior, execute:
##   theme_set(theme_cowplot())
## ********************************************************

Import clustering.csv data as a data frame

# Load the raw observations from clustering.csv (comma-separated, first row
# is the header) and make sure the result is a data frame.
packs <- as.data.frame(
  read.csv(file = "clustering.csv", header = TRUE, sep = ",")
)

Rename the columns in the data frame packs

# The source file labels its columns x and y:
#   x = weight in kilograms -> Weight_kg
#   y = volume              -> Volume
colnames(packs)[colnames(packs) == "x"] <- "Weight_kg"
colnames(packs)[colnames(packs) == "y"] <- "Volume"

Create a second, identical data frame for pounds

# Start the pounds data set as a copy of packs; the weight column of this
# copy is converted to pounds afterwards, leaving packs itself in kilograms.
packs1 <- packs

Create a third, identical data frame for standardized data

# Start the standardized data set as a copy of packs; its columns are
# replaced by scaled values afterwards.
packs.standard <- packs

Multiply the Weight_kg column in the packs1 data frame by 2.20462 to convert it into pounds

# Convert the weight column from kilograms to pounds (1 kg = 2.20462 lb).
packs1[["Weight_kg"]] <- 2.20462 * packs1[["Weight_kg"]]

Rename the "Weight_kg" column as "Weight_lbs" in the packs1 data frame

# The column now holds pounds, so rename it accordingly.
colnames(packs1)[colnames(packs1) == "Weight_kg"] <- "Weight_lbs"

Standardize the data in packs.standard data frame

# Replace both columns with their z-scores (centered to mean 0, scaled to
# standard deviation 1).
# scale() returns a one-column matrix carrying "scaled:center" and
# "scaled:scale" attributes; as.numeric() drops the dimensions and attributes
# so each column is stored as a plain numeric vector instead of a matrix
# embedded in the data frame.
# Both columns are read from packs.standard itself so the two lines follow
# the same pattern.
packs.standard$Weight_kg <- as.numeric(scale(packs.standard$Weight_kg))
packs.standard$Volume <- as.numeric(scale(packs.standard$Volume))

Rename the columns in packs.standard data frame

# The columns now hold standardized values, so label them as such.
colnames(packs.standard)[colnames(packs.standard) == "Volume"] <-
  "Standard_Volume"
colnames(packs.standard)[colnames(packs.standard) == "Weight_kg"] <-
  "Standard_Weight"

Find the optimal value of k

# Numeric matrices passed to kmeans(): one each for the kilogram, pound and
# standardized versions of the data.
packs_km <- as.matrix(packs[c("Weight_kg", "Volume")])
# Separate copy of the kilogram matrix that the WSS loop runs on.
packs_km.process <- packs_km

packs1_km <- as.matrix(packs1[c("Weight_lbs", "Volume")])

packs_stand_km <- as.matrix(
  packs.standard[c("Standard_Weight", "Standard_Volume")]
)

WSS to determine optimal k

# Total within-cluster sum of squares (WSS) for k = 1..10, used for the
# elbow plot that follows.
# kmeans() starts from randomly chosen centers, so fix the seed to make the
# curve reproducible from run to run.
set.seed(123)
wss <- numeric(10)
# seq_along(wss) keeps the loop bound tied to the length of the result vector.
for (k in seq_along(wss)) {
  # tot.withinss is the fit's own total of the per-cluster withinss values.
  wss[k] <- kmeans(packs_km.process, centers = k, nstart = 30)$tot.withinss
}

Plot WSS against k

# Elbow plot: WSS for each candidate number of clusters, points joined by lines.
plot(
  x = 1:10,
  y = wss,
  type = "b",
  xlab = "Number of Clusters",
  ylab = "Within Sum of Squares"
)

Examine K-means with k = 3 (packs)

# k-means with k = 3 on the kilogram data, using 30 random starts.
# kmeans() draws its starting centers at random, so fix the seed first;
# otherwise the cluster numbering (and the plot colors derived from it) can
# differ between runs.
set.seed(123)
km.3 <- kmeans(packs_km, centers = 3, nstart = 30)
km.3
## K-means clustering with 3 clusters of sizes 7, 3, 6
## 
## Cluster means:
##   Weight_kg    Volume
## 1  4.685714 10.971429
## 2  6.600000 18.600000
## 3  6.900000  5.016667
## 
## Clustering vector:
##  [1] 1 1 1 1 1 3 3 2 2 2 1 3 3 3 3 1
## 
## Within cluster sum of squares by cluster:
## [1] 78.88286  4.66000 50.26833
##  (between_SS / total_SS =  74.7 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

Examine K-means with k = 3 (packs1)

# k-means with k = 3 on the pound data, using 30 random starts.
# kmeans() draws its starting centers at random, so fix the seed first;
# otherwise the cluster numbering (and the plot colors derived from it) can
# differ between runs.
set.seed(123)
km1.3 <- kmeans(packs1_km, centers = 3, nstart = 30)
km1.3
## K-means clustering with 3 clusters of sizes 6, 5, 5
## 
## Cluster means:
##   Weight_lbs   Volume
## 1  16.167213 15.28333
## 2   5.732012  9.48000
## 3  16.314188  4.72000
## 
## Clustering vector:
##  [1] 1 2 2 2 2 2 3 1 1 1 1 3 3 3 3 1
## 
## Within cluster sum of squares by cluster:
## [1] 112.98980  61.88267  84.90805
##  (between_SS / total_SS =  72.6 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

Examine K-means with k = 3 (packs.standard)

# k-means with k = 3 on the standardized data, using 30 random starts.
# kmeans() draws its starting centers at random, so fix the seed first;
# otherwise the cluster numbering (and the plot colors derived from it) can
# differ between runs.
set.seed(123)
km_stand.3 <- kmeans(packs_stand_km, centers = 3, nstart = 30)
km_stand.3
## K-means clustering with 3 clusters of sizes 6, 5, 5
## 
## Cluster means:
##   Standard_Weight Standard_Volume
## 1       0.5427104       0.9662039
## 2       0.5675200      -1.0293318
## 3      -1.2187725      -0.1301128
## 
## Clustering vector:
##  [1] 1 3 3 3 3 3 2 1 1 1 1 2 2 2 2 1
## 
## Within cluster sum of squares by cluster:
## [1] 3.725592 2.624595 1.861612
##  (between_SS / total_SS =  72.6 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

Plot cluster with kilograms

# Attach each observation's cluster label as a factor so it maps to a
# discrete color, and pull out the fitted cluster centers for overlaying.
packs.plot <- as.data.frame(packs)
packs.plot$cluster <- factor(km.3$cluster)
km.centers <- as.data.frame(km.3$centers)

# Scatter plot of the observations colored by cluster; the three centers are
# drawn on top as large, semi-transparent points and kept out of the legend.
plot.1 <- ggplot(
  data = packs.plot,
  aes(x = Weight_kg, y = Volume, color = cluster)
) +
  geom_point() +
  theme(legend.position = "right") +
  geom_point(
    data = km.centers,
    aes(x = Weight_kg, y = Volume, color = as.factor(c(1, 2, 3))),
    size = 10,
    alpha = .3,
    show.legend = FALSE
  )
# Draw explicitly with plot(), as the pounds and standardized plots do, so
# the figure also appears when the script is run through source(), where a
# bare (plot.1) is not auto-printed.
plot(plot.1)

Plot cluster with pounds

# Plotting data for the pound version: observations with their cluster label
# as a factor, plus the fitted cluster centers.
packs1.plot <- as.data.frame(packs1)
packs1.plot[["cluster"]] <- factor(km1.3[["cluster"]])
km1.centers <- as.data.frame(km1.3[["centers"]])

# Observations colored by cluster; centers overlaid as large, semi-transparent
# points that are kept out of the legend.
plot.2 <- ggplot(
  data = packs1.plot,
  aes(x = Weight_lbs, y = Volume, color = cluster)
) +
  geom_point() +
  theme(legend.position = "right") +
  geom_point(
    data = km1.centers,
    aes(x = Weight_lbs, y = Volume, color = as.factor(c(1, 2, 3))),
    size = 10,
    alpha = .3,
    show.legend = FALSE
  )
plot(plot.2)

Plot cluster with standardized values

# Plotting data for the standardized version: observations with their cluster
# label as a factor, plus the fitted cluster centers.
packs.stand.plot <- as.data.frame(packs.standard)
packs.stand.plot[["cluster"]] <- factor(km_stand.3[["cluster"]])
km_stand.centers <- as.data.frame(km_stand.3[["centers"]])

# Observations colored by cluster; centers overlaid as large, semi-transparent
# points that are kept out of the legend.
plot.3 <- ggplot(
  data = packs.stand.plot,
  aes(x = Standard_Weight, y = Standard_Volume, color = cluster)
) +
  geom_point() +
  theme(legend.position = "right") +
  geom_point(
    data = km_stand.centers,
    aes(
      x = Standard_Weight,
      y = Standard_Volume,
      color = as.factor(c(1, 2, 3))
    ),
    size = 10,
    alpha = .3,
    show.legend = FALSE
  )
plot(plot.3)