# Clustering Assignment
library(plyr)
library(ggplot2)
library(cluster)
library(lattice)
library(graphics)
library(grid)
library(gridExtra)
library(cowplot)
##
## ********************************************************
## Note: As of version 1.0.0, cowplot does not change the
## default ggplot2 theme anymore. To recover the previous
## behavior, execute:
## theme_set(theme_cowplot())
## ********************************************************
# NOTE(review): machine-specific absolute path. Every read.csv() call in this
# script resolves relative to it, so the script only runs where this directory
# exists.
setwd("C:/Users/Nick/Desktop/School/Year 4/Winter/Data Science/Assignment 2")
########################################################################
# Load the raw data and keep rows 6-16 as the clustering input.
clustering <- read.csv("clustering.csv", header = TRUE)
clustering.process <- clustering[6:16, ]

# Elbow curve: within-cluster sum of squares for k = 1..max.k.
# The vector length, the loop range and the plotted x-axis are all derived
# from max.k, so no never-computed zero entries end up on the plot.
max.k <- 6
wss <- numeric(max.k)
for (k in seq_len(max.k)) {
  wss[k] <- sum(kmeans(clustering.process, centers = k, nstart = 1)$withinss)
}
plot(seq_len(max.k), wss, type = "b", xlab = "Number of Clusters", ylab = "Within Sum of Squares")

# Final model: 3 clusters, 6 random starts.
km.3 <- kmeans(clustering.process, centers = 3, nstart = 6)
km.3
## K-means clustering with 3 clusters of sizes 2, 6, 3
##
## Cluster means:
## x y
## 1 8.7 11.650000
## 2 6.9 5.016667
## 3 6.6 18.600000
##
## Clustering vector:
## 6 7 8 9 10 11 12 13 14 15 16
## 2 2 3 3 3 1 2 2 2 2 1
##
## Within cluster sum of squares by cluster:
## [1] 2.22500 50.26833 4.66000
## (between_SS / total_SS = 87.0 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# Attach the fitted cluster label to each clustered row for plotting.
clustering.plot <- as.data.frame(clustering[6:16, ])
clustering.plot$cluster <- factor(km.3$cluster)
km.centers <- as.data.frame(km.3$centers)
# Scatter of the points coloured by cluster, with each cluster centre overlaid
# as a large translucent marker. theme() is closed before the centre layer is
# added; with the parenthesis misplaced, the layer was combined with the string
# "right" inside theme() instead of being added to the plot.
plot.1 <- ggplot(data = clustering.plot, aes(x = x, y = y, color = cluster)) +
  geom_point() +
  theme(legend.position = "right") +
  geom_point(
    data = km.centers,
    aes(x = x, y = y, color = as.factor(c(1, 2, 3))),
    size = 10, alpha = .3, show.legend = FALSE
  )
plot.1
################################################################################################################################################################################################################################
# Load the second data set (columns v and p) and keep rows 6-16 as the
# clustering input.
lbsclustering <- read.csv("lbsclustering.csv", header = TRUE)
lbsclustering.process <- lbsclustering[6:16, ]

# Elbow curve: within-cluster sum of squares for k = 1..max.k.
# The vector length, the loop range and the plotted x-axis are all derived
# from max.k, so no never-computed zero entries end up on the plot.
max.k <- 6
wss <- numeric(max.k)
for (k in seq_len(max.k)) {
  wss[k] <- sum(kmeans(lbsclustering.process, centers = k, nstart = 1)$withinss)
}
plot(seq_len(max.k), wss, type = "b", xlab = "Number of Clusters", ylab = "Within Sum of Squares")

# Final model: 3 clusters, 6 random starts.
km.4 <- kmeans(lbsclustering.process, centers = 3, nstart = 6)
km.4
## K-means clustering with 3 clusters of sizes 6, 2, 3
##
## Cluster means:
## v p
## 1 6.9 11.05984
## 2 8.7 25.68382
## 3 6.6 41.00593
##
## Clustering vector:
## 6 7 8 9 10 11 12 13 14 15 16
## 1 1 3 3 3 2 1 1 1 1 2
##
## Within cluster sum of squares by cluster:
## [1] 170.588988 4.560511 16.781497
## (between_SS / total_SS = 90.5 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# Attach the fitted cluster label to each clustered row for plotting.
lbsclustering.plot <- as.data.frame(lbsclustering[6:16, ])
lbsclustering.plot$cluster <- factor(km.4$cluster)
km.centers <- as.data.frame(km.4$centers)
# Scatter of the points coloured by cluster, with each cluster centre overlaid
# as a large translucent marker. theme() is closed before the centre layer is
# added; with the parenthesis misplaced, the layer was combined with the string
# "right" inside theme() instead of being added to the plot.
plot.2 <- ggplot(data = lbsclustering.plot, aes(x = v, y = p, color = cluster)) +
  geom_point() +
  theme(legend.position = "right") +
  geom_point(
    data = km.centers,
    aes(x = v, y = p, color = as.factor(c(1, 2, 3))),
    size = 10, alpha = .3, show.legend = FALSE
  )
plot.2
###################################################################################################################################################################################################################################
# Stack the two cluster plots vertically as panels A and B.
plot_grid(
  plot.1, plot.2,
  nrow = 2, ncol = 1,
  labels = c("A", "B")
)
###################################################################################################################################################################################################################################
# Robust standardisation of y: centre on the median and scale by the MAD.
# The centre and scale are kept as `med` and `ma` next to the scaled column.
med <- median(clustering$y)
ma <- mad(clustering$y)
clustering.scaled <- scale(x = clustering$y, center = med, scale = ma)
clustering.scaled
## [,1]
## [1,] 0.65082442
## [2,] -0.01183317
## [3,] 0.41416099
## [4,] -0.05916586
## [5,] 0.01183317
## [6,] -0.79282247
## [7,] -2.07080496
## [8,] 2.37846741
## [9,] 2.04713862
## [10,] 1.78680885
## [11,] 0.55615905
## [12,] -0.50882636
## [13,] -1.26614932
## [14,] -0.69815710
## [15,] -1.52647909
## [16,] 0.29582928
## attr(,"scaled:center")
## [1] 9.85
## attr(,"scaled:scale")
## [1] 4.22541
# Robust standardisation of x: centre on the median and scale by the MAD.
# The centre and scale are kept as `med2` and `ma2` next to the scaled column.
med2 <- median(clustering$x)
ma2 <- mad(clustering$x)
clustering.scaled2 <- scale(x = clustering$x, center = med2, scale = ma2)
clustering.scaled2
## [,1]
## [1,] 0.14199805
## [2,] -1.98797276
## [3,] -1.84597471
## [4,] -1.27798249
## [5,] -0.92298736
## [6,] -0.70999027
## [7,] -0.56799222
## [8,] -0.14199805
## [9,] -0.07099903
## [10,] 0.42599416
## [11,] 0.49699319
## [12,] 0.07099903
## [13,] 0.63899125
## [14,] 0.70999027
## [15,] 0.92298736
## [16,] 1.13598444
## attr(,"scaled:center")
## [1] 6.4
## attr(,"scaled:scale")
## [1] 2.81694
####################################################################################################################################################################################################################################
# Load the standardized data set (columns a and b) and keep rows 6-16 as the
# clustering input.
standardized <- read.csv("standardized.csv", header = TRUE)
standardized.process <- standardized[6:16, ]

# Elbow curve: within-cluster sum of squares for k = 1..max.k.
# The vector length, the loop range and the plotted x-axis are all derived
# from max.k, so no never-computed zero entries end up on the plot.
max.k <- 6
wss <- numeric(max.k)
for (k in seq_len(max.k)) {
  wss[k] <- sum(kmeans(standardized.process, centers = k, nstart = 1)$withinss)
}
plot(seq_len(max.k), wss, type = "b", xlab = "Number of Clusters", ylab = "Within Sum of Squares")

# Final model: 3 clusters, 6 random starts.
km.5 <- kmeans(standardized.process, centers = 3, nstart = 6)
km.5
## K-means clustering with 3 clusters of sizes 3, 2, 6
##
## Cluster means:
## a b
## 1 2.0708050 0.07099903
## 2 0.4259942 0.81648882
## 3 -1.1438732 0.17749757
##
## Clustering vector:
## 6 7 8 9 10 11 12 13 14 15 16
## 3 3 1 1 1 2 3 3 3 3 2
##
## Within cluster sum of squares by cluster:
## [1] 0.3674228 0.2380407 4.1527367
## (between_SS / total_SS = 82.1 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# Attach the fitted cluster label to each clustered row for plotting.
standardized.plot <- as.data.frame(standardized[6:16, ])
standardized.plot$cluster <- factor(km.5$cluster)
km.centers <- as.data.frame(km.5$centers)
# Scatter of the points coloured by cluster, with each cluster centre overlaid
# as a large translucent marker. theme() is closed before the centre layer is
# added; with the parenthesis misplaced, the layer was combined with the string
# "right" inside theme() instead of being added to the plot.
plot.3 <- ggplot(data = standardized.plot, aes(x = a, y = b, color = cluster)) +
  geom_point() +
  theme(legend.position = "right") +
  geom_point(
    data = km.centers,
    aes(x = a, y = b, color = as.factor(c(1, 2, 3))),
    size = 10, alpha = .3, show.legend = FALSE
  )
plot.3
##################################################################################################################################################################################################################################
# Robust standardisation of v: centre on the median and scale by the MAD.
# The centre must be med3 (the median of v). The earlier version passed `med`,
# the median of clustering$y (9.85), which shifted every scaled value by
# (9.85 - 6.4) / 2.81694.
# NOTE(review): lbsstandardized.csv column `c` looks like it was built from the
# mis-centred values (the km.6 centres for `c` equal the km.5 centres for `b`
# minus ~1.2247); regenerate that file from the corrected values - TODO confirm.
med3 <- median(lbsclustering$v, na.rm = FALSE)
ma3 <- mad(lbsclustering$v)
lbsclustering.scaled <- scale(lbsclustering$v, center = med3, scale = ma3)
lbsclustering.scaled
## [,1]
## [1,] 0.14199805
## [2,] -1.98797276
## [3,] -1.84597471
## [4,] -1.27798249
## [5,] -0.92298736
## [6,] -0.70999027
## [7,] -0.56799222
## [8,] -0.14199805
## [9,] -0.07099903
## [10,] 0.42599416
## [11,] 0.49699319
## [12,] 0.07099903
## [13,] 0.63899125
## [14,] 0.70999027
## [15,] 0.92298736
## [16,] 1.13598444
## attr(,"scaled:center")
## [1] 6.4
## attr(,"scaled:scale")
## [1] 2.81694
# Robust standardisation of p: centre on the median and scale by the MAD.
# The centre and scale are kept as `med4` and `ma4` next to the scaled column.
med4 <- median(lbsclustering$p)
ma4 <- mad(lbsclustering$p)
lbsclustering.scaled2 <- scale(x = lbsclustering$p, center = med4, scale = ma4)
lbsclustering.scaled2
## [,1]
## [1,] 0.65082442
## [2,] -0.01183317
## [3,] 0.41416099
## [4,] -0.05916586
## [5,] 0.01183317
## [6,] -0.79282247
## [7,] -2.07080496
## [8,] 2.37846741
## [9,] 2.04713862
## [10,] 1.78680885
## [11,] 0.55615905
## [12,] -0.50882636
## [13,] -1.26614932
## [14,] -0.69815710
## [15,] -1.52647909
## [16,] 0.29582928
## attr(,"scaled:center")
## [1] 21.71551
## attr(,"scaled:scale")
## [1] 9.315423
###################################################################################################################################################################################################################################
# Load the standardized second data set (columns c and d) and keep rows 6-16
# as the clustering input.
lbsstandardized <- read.csv("lbsstandardized.csv", header = TRUE)
lbsstandardized.process <- lbsstandardized[6:16, ]

# Elbow curve: within-cluster sum of squares for k = 1..max.k.
# The vector length, the loop range and the plotted x-axis are all derived
# from max.k, so no never-computed zero entries end up on the plot.
max.k <- 6
wss <- numeric(max.k)
for (k in seq_len(max.k)) {
  wss[k] <- sum(kmeans(lbsstandardized.process, centers = k, nstart = 1)$withinss)
}
plot(seq_len(max.k), wss, type = "b", xlab = "Number of Clusters", ylab = "Within Sum of Squares")

# Final model: 3 clusters, 6 random starts.
km.6 <- kmeans(lbsstandardized.process, centers = 3, nstart = 6)
km.6
## K-means clustering with 3 clusters of sizes 3, 2, 6
##
## Cluster means:
## c d
## 1 -1.1537342 2.0708050
## 2 -0.4082444 0.4259942
## 3 -1.0472357 -1.1438732
##
## Clustering vector:
## 6 7 8 9 10 11 12 13 14 15 16
## 3 3 1 1 1 2 3 3 3 3 2
##
## Within cluster sum of squares by cluster:
## [1] 0.3674228 0.2380407 4.1527367
## (between_SS / total_SS = 82.1 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# Attach the fitted cluster label to each clustered row for plotting.
lbsstandardized.plot <- as.data.frame(lbsstandardized[6:16, ])
lbsstandardized.plot$cluster <- factor(km.6$cluster)
km.centers <- as.data.frame(km.6$centers)
# Scatter of the points coloured by cluster, with each cluster centre overlaid
# as a large translucent marker. theme() is closed before the centre layer is
# added; with the parenthesis misplaced, the layer was combined with the string
# "right" inside theme() instead of being added to the plot.
plot.4 <- ggplot(data = lbsstandardized.plot, aes(x = c, y = d, color = cluster)) +
  geom_point() +
  theme(legend.position = "right") +
  geom_point(
    data = km.centers,
    aes(x = c, y = d, color = as.factor(c(1, 2, 3))),
    size = 10, alpha = .3, show.legend = FALSE
  )
plot.4
#####################################################################################################################################################################################################################################
# Stack the two standardized cluster plots vertically as panels C and D.
plot_grid(
  plot.3, plot.4,
  nrow = 2, ncol = 1,
  labels = c("C", "D")
)