#########################
##### Assignment 2 ######
#### Andrew Williams ####
####### 104431205 #######
#########################
library(plyr)
library(ggplot2)
library(cluster)
library(lattice)
library(graphics)
library(grid)
library(gridExtra)
library(cowplot)
##
## ********************************************************
## Note: As of version 1.0.0, cowplot does not change the
## default ggplot2 theme anymore. To recover the previous
## behavior, execute:
## theme_set(theme_cowplot())
## ********************************************************
Import clustering.csv data as a data frame
# Load the clustering data set from the current working directory.
# read.csv() already returns a data frame, so no extra coercion is needed.
packs <- read.csv("clustering.csv", header = TRUE, sep = ",")
Rename the columns in the data frame packs
# Give the raw columns descriptive names:
#   x -> Weight_kg
#   y -> Volume
colnames(packs)[colnames(packs) == "x"] <- "Weight_kg"
colnames(packs)[colnames(packs) == "y"] <- "Volume"
Create a second, identical data frame for pounds
# Copy of packs; the weight column of this copy is converted to pounds below.
packs1 <- packs
Create a third, identical data frame for standardized data
# Copy of packs; both columns of this copy are standardized with scale() below.
packs.standard <- packs
Multiply the Weight_kg column in the packs1 data frame by 2.20462 to convert it into pounds
# Convert the weight column from kilograms to pounds (factor 2.20462).
# The column still carries the name Weight_kg after this statement.
packs1[["Weight_kg"]] <- 2.20462 * packs1[["Weight_kg"]]
Rename the "Weight_kg" column as "Weight_lbs" in the packs1 data frame
# Relabel the weight column of packs1 so its name reflects the pound unit.
colnames(packs1)[colnames(packs1) == "Weight_kg"] <- "Weight_lbs"
Standardize the data in packs.standard data frame
# Standardize both variables with scale() (default centering and scaling).
# Both columns are read from packs.standard itself, so the block does not
# depend on packs and packs.standard still holding identical values.
# Note: scale() returns a one-column matrix (carrying the centering/scaling
# attributes), and that matrix is what gets stored in each column.
packs.standard$Weight_kg <- scale(packs.standard$Weight_kg)
packs.standard$Volume <- scale(packs.standard$Volume)
Rename the columns in packs.standard data frame
# Relabel the standardized columns so they are not mistaken for raw values:
#   Weight_kg -> Standard_Weight
#   Volume    -> Standard_Volume
colnames(packs.standard)[colnames(packs.standard) == "Weight_kg"] <-
  "Standard_Weight"
colnames(packs.standard)[colnames(packs.standard) == "Volume"] <-
  "Standard_Volume"
Find the optimal value of k
# Two-column matrices passed to kmeans(), one per version of the data
# (kilograms, pounds, standardized).
packs_km <- as.matrix(packs[c("Weight_kg", "Volume")])
packs1_km <- as.matrix(packs1[c("Weight_lbs", "Volume")])
packs_stand_km <- as.matrix(
  packs.standard[c("Standard_Weight", "Standard_Volume")]
)

# Copy of the kilogram matrix used when searching for the optimal k.
packs_km.process <- packs_km
WSS to determine optimal k
# Total within-cluster sum of squares for k = 1..max_k (elbow method).
# kmeans() reports this total directly as tot.withinss, so the per-cluster
# withinss values do not have to be summed by hand. The upper bound for k is
# named once instead of being repeated as a literal.
max_k <- 10L
wss <- vapply(
  seq_len(max_k),
  function(k) {
    kmeans(packs_km.process, centers = k, nstart = 30)$tot.withinss
  },
  numeric(1)
)
Plot WSS against k
# Elbow plot: within sum of squares against the number of clusters,
# drawn as points joined by lines.
plot(
  x = 1:10,
  y = wss,
  type = "b",
  xlab = "Number of Clusters",
  ylab = "Within Sum of Squares"
)
Examine k-means with k = 3 (packs)
# Fit k-means with 3 clusters and 30 random starts on the kilogram matrix,
# then display the fitted object.
km.3 <- kmeans(x = packs_km, centers = 3, nstart = 30)
print(km.3)
## K-means clustering with 3 clusters of sizes 7, 3, 6
##
## Cluster means:
## Weight_kg Volume
## 1 4.685714 10.971429
## 2 6.600000 18.600000
## 3 6.900000 5.016667
##
## Clustering vector:
## [1] 1 1 1 1 1 3 3 2 2 2 1 3 3 3 3 1
##
## Within cluster sum of squares by cluster:
## [1] 78.88286 4.66000 50.26833
## (between_SS / total_SS = 74.7 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
Examine k-means with k = 3 (packs1)
# Fit k-means with 3 clusters and 30 random starts on the pound matrix,
# then display the fitted object.
km1.3 <- kmeans(x = packs1_km, centers = 3, nstart = 30)
print(km1.3)
## K-means clustering with 3 clusters of sizes 6, 5, 5
##
## Cluster means:
## Weight_lbs Volume
## 1 16.167213 15.28333
## 2 5.732012 9.48000
## 3 16.314188 4.72000
##
## Clustering vector:
## [1] 1 2 2 2 2 2 3 1 1 1 1 3 3 3 3 1
##
## Within cluster sum of squares by cluster:
## [1] 112.98980 61.88267 84.90805
## (between_SS / total_SS = 72.6 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
Examine k-means with k = 3 (packs.standard)
# Fit k-means with 3 clusters and 30 random starts on the standardized
# matrix, then display the fitted object.
km_stand.3 <- kmeans(x = packs_stand_km, centers = 3, nstart = 30)
print(km_stand.3)
## K-means clustering with 3 clusters of sizes 6, 5, 5
##
## Cluster means:
## Standard_Weight Standard_Volume
## 1 0.5427104 0.9662039
## 2 0.5675200 -1.0293318
## 3 -1.2187725 -0.1301128
##
## Clustering vector:
## [1] 1 3 3 3 3 3 2 1 1 1 1 2 2 2 2 1
##
## Within cluster sum of squares by cluster:
## [1] 3.725592 2.624595 1.861612
## (between_SS / total_SS = 72.6 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
Plot cluster with kilograms
# Scatter plot of the kilogram data coloured by k-means cluster, with the
# cluster centres overlaid as large translucent points.
# packs is already a data frame, so it is copied without coercion.
packs.plot <- packs
packs.plot$cluster <- factor(km.3$cluster)

# Tag each centre with its cluster id (row order of km.3$centers) instead of
# a hard-coded c(1, 2, 3), so the colours stay aligned if k is changed.
km.centers <- as.data.frame(km.3$centers)
km.centers$cluster <- factor(seq_len(nrow(km.centers)))

plot.1 <- ggplot(
  data = packs.plot,
  aes(x = Weight_kg, y = Volume, color = cluster)
) +
  geom_point() +
  # The centre layer inherits x, y and color from the plot-level aes().
  geom_point(
    data = km.centers,
    size = 10, alpha = 0.3, show.legend = FALSE
  ) +
  theme(legend.position = "right")
print(plot.1)
Plot cluster with pounds
# Scatter plot of the pound data coloured by k-means cluster, with the
# cluster centres overlaid as large translucent points.
# packs1 is already a data frame, so it is copied without coercion.
packs1.plot <- packs1
packs1.plot$cluster <- factor(km1.3$cluster)

# Tag each centre with its cluster id (row order of km1.3$centers) instead of
# a hard-coded c(1, 2, 3), so the colours stay aligned if k is changed.
km1.centers <- as.data.frame(km1.3$centers)
km1.centers$cluster <- factor(seq_len(nrow(km1.centers)))

plot.2 <- ggplot(
  data = packs1.plot,
  aes(x = Weight_lbs, y = Volume, color = cluster)
) +
  geom_point() +
  # The centre layer inherits x, y and color from the plot-level aes().
  geom_point(
    data = km1.centers,
    size = 10, alpha = 0.3, show.legend = FALSE
  ) +
  theme(legend.position = "right")
print(plot.2)
Plot cluster with standardized values
# Scatter plot of the standardized data coloured by k-means cluster, with
# the cluster centres overlaid as large translucent points.
# packs.standard is already a data frame, so it is copied without coercion.
packs.stand.plot <- packs.standard
packs.stand.plot$cluster <- factor(km_stand.3$cluster)

# Tag each centre with its cluster id (row order of km_stand.3$centers)
# instead of a hard-coded c(1, 2, 3), so the colours stay aligned if k is
# changed.
km_stand.centers <- as.data.frame(km_stand.3$centers)
km_stand.centers$cluster <- factor(seq_len(nrow(km_stand.centers)))

plot.3 <- ggplot(
  data = packs.stand.plot,
  aes(x = Standard_Weight, y = Standard_Volume, color = cluster)
) +
  geom_point() +
  # The centre layer inherits x, y and color from the plot-level aes().
  geom_point(
    data = km_stand.centers,
    size = 10, alpha = 0.3, show.legend = FALSE
  ) +
  theme(legend.position = "right")
print(plot.3)