R Markdown

Clustering Assignment

library(plyr)
library(ggplot2)
library(cluster)
library(lattice)
library(graphics)
library(grid)
library(gridExtra)
library(cowplot)
## 
## ********************************************************
## Note: As of version 1.0.0, cowplot does not change the
##   default ggplot2 theme anymore. To recover the previous
##   behavior, execute:
##   theme_set(theme_cowplot())
## ********************************************************
# Switch to the folder that holds the assignment's CSV files when it exists
# on this machine. On any other computer the path is absent, so stay in the
# current working directory instead of aborting; the relative read.csv()
# calls below are then resolved from there.
data.dir <- "C:/Users/Nick/Desktop/School/Year 4/Winter/Data Science/Assignment 2"
if (dir.exists(data.dir)) {
  setwd(data.dir)
} else {
  message("Data directory not found; reading CSV files from ", getwd())
}
########################################################################
# Load the raw observations (columns x and y).
clustering <- read.csv("clustering.csv", header = TRUE)

# Only rows 6 through 16 take part in the clustering.
clustering.process <- clustering[6:16, ]

# Elbow curve: total within-cluster sum of squares for k = 1..k.max.
# wss is sized to k.max so the plot shows only values that were actually
# computed; with a length-16 vector the slots for k = 7..16 kept their
# initial 0 and were drawn as if they were real results.
k.max <- 6
wss <- numeric(k.max)
for (k in seq_len(k.max)) {
  wss[k] <- sum(kmeans(clustering.process, centers = k, nstart = 1)$withinss)
}

plot(
  seq_len(k.max), wss,
  type = "b", xlab = "Number of Clusters", ylab = "Within Sum of Squares"
)

# Final model: three clusters, keeping the best of six random starts.
km.3 <- kmeans(clustering.process, centers = 3, nstart = 6)
km.3
## K-means clustering with 3 clusters of sizes 2, 6, 3
## 
## Cluster means:
##     x         y
## 1 8.7 11.650000
## 2 6.9  5.016667
## 3 6.6 18.600000
## 
## Clustering vector:
##  6  7  8  9 10 11 12 13 14 15 16 
##  2  2  3  3  3  1  2  2  2  2  1 
## 
## Within cluster sum of squares by cluster:
## [1]  2.22500 50.26833  4.66000
##  (between_SS / total_SS =  87.0 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
# Rows 6-16 again, each labelled with the cluster k-means assigned to it.
clustering.plot <- as.data.frame(clustering[6:16, ])
clustering.plot$cluster <- factor(km.3$cluster)

# One row per cluster centre, tagged with its cluster id so each centre is
# coloured like its members. The id is stored as a column (not computed inside
# aes()) because km.centers is reassigned further down the script.
km.centers <- as.data.frame(km.3$centers)
km.centers$cluster <- factor(seq_len(nrow(km.centers)))

# Scatter of the points plus a large translucent marker on every centre.
# theme() is closed right after legend.position so that the centre layer is
# added to the plot with `+` rather than nested inside the theme() call.
plot.1 <- ggplot(data = clustering.plot, aes(x = x, y = y, color = cluster)) +
  geom_point() +
  theme(legend.position = "right") +
  geom_point(
    data = km.centers, aes(x = x, y = y, color = cluster),
    size = 10, alpha = .3, show.legend = FALSE
  )
plot.1

################################################################################################################################################################################################################################
# Load the second data set (columns v and p).
lbsclustering <- read.csv("lbsclustering.csv", header = TRUE)

# Only rows 6 through 16 take part in the clustering.
lbsclustering.process <- lbsclustering[6:16, ]

# Elbow curve: total within-cluster sum of squares for k = 1..k.max.
# wss is sized to k.max so the plot shows only values that were actually
# computed; with a length-16 vector the slots for k = 7..16 kept their
# initial 0 and were drawn as if they were real results.
k.max <- 6
wss <- numeric(k.max)
for (k in seq_len(k.max)) {
  wss[k] <- sum(kmeans(lbsclustering.process, centers = k, nstart = 1)$withinss)
}

plot(
  seq_len(k.max), wss,
  type = "b", xlab = "Number of Clusters", ylab = "Within Sum of Squares"
)

# Final model: three clusters, keeping the best of six random starts.
km.4 <- kmeans(lbsclustering.process, centers = 3, nstart = 6)

km.4
## K-means clustering with 3 clusters of sizes 6, 2, 3
## 
## Cluster means:
##     v        p
## 1 6.9 11.05984
## 2 8.7 25.68382
## 3 6.6 41.00593
## 
## Clustering vector:
##  6  7  8  9 10 11 12 13 14 15 16 
##  1  1  3  3  3  2  1  1  1  1  2 
## 
## Within cluster sum of squares by cluster:
## [1] 170.588988   4.560511  16.781497
##  (between_SS / total_SS =  90.5 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
# Rows 6-16 again, each labelled with the cluster k-means assigned to it.
lbsclustering.plot <- as.data.frame(lbsclustering[6:16, ])
lbsclustering.plot$cluster <- factor(km.4$cluster)

# One row per cluster centre, tagged with its cluster id so each centre is
# coloured like its members. The id is stored as a column (not computed inside
# aes()) because km.centers is reassigned further down the script.
km.centers <- as.data.frame(km.4$centers)
km.centers$cluster <- factor(seq_len(nrow(km.centers)))

# Scatter of the points plus a large translucent marker on every centre.
# theme() is closed right after legend.position so that the centre layer is
# added to the plot with `+` rather than nested inside the theme() call.
plot.2 <- ggplot(data = lbsclustering.plot, aes(x = v, y = p, color = cluster)) +
  geom_point() +
  theme(legend.position = "right") +
  geom_point(
    data = km.centers, aes(x = v, y = p, color = cluster),
    size = 10, alpha = .3, show.legend = FALSE
  )
plot.2

###################################################################################################################################################################################################################################
# Stack the two cluster plots in a single column, tagged "A" and "B".
plot_grid(
  plotlist = list(plot.1, plot.2),
  labels = c("A", "B"),
  nrow = 2,
  ncol = 1
)

###################################################################################################################################################################################################################################
# Robust standardisation of y: subtract the median, divide by the MAD.
med <- median(clustering[["y"]])
ma <- mad(clustering[["y"]])
clustering.scaled <- scale(clustering[["y"]], center = med, scale = ma)
clustering.scaled
##              [,1]
##  [1,]  0.65082442
##  [2,] -0.01183317
##  [3,]  0.41416099
##  [4,] -0.05916586
##  [5,]  0.01183317
##  [6,] -0.79282247
##  [7,] -2.07080496
##  [8,]  2.37846741
##  [9,]  2.04713862
## [10,]  1.78680885
## [11,]  0.55615905
## [12,] -0.50882636
## [13,] -1.26614932
## [14,] -0.69815710
## [15,] -1.52647909
## [16,]  0.29582928
## attr(,"scaled:center")
## [1] 9.85
## attr(,"scaled:scale")
## [1] 4.22541
# Robust standardisation of x: subtract the median, divide by the MAD.
med2 <- median(clustering[["x"]])
ma2 <- mad(clustering[["x"]])
clustering.scaled2 <- scale(clustering[["x"]], center = med2, scale = ma2)
clustering.scaled2
##              [,1]
##  [1,]  0.14199805
##  [2,] -1.98797276
##  [3,] -1.84597471
##  [4,] -1.27798249
##  [5,] -0.92298736
##  [6,] -0.70999027
##  [7,] -0.56799222
##  [8,] -0.14199805
##  [9,] -0.07099903
## [10,]  0.42599416
## [11,]  0.49699319
## [12,]  0.07099903
## [13,]  0.63899125
## [14,]  0.70999027
## [15,]  0.92298736
## [16,]  1.13598444
## attr(,"scaled:center")
## [1] 6.4
## attr(,"scaled:scale")
## [1] 2.81694
####################################################################################################################################################################################################################################
# Load the standardised data set (columns a and b).
standardized <- read.csv("standardized.csv", header = TRUE)

# Only rows 6 through 16 take part in the clustering.
standardized.process <- standardized[6:16, ]

# Elbow curve: total within-cluster sum of squares for k = 1..k.max.
# wss is sized to k.max so the plot shows only values that were actually
# computed; with a length-16 vector the slots for k = 7..16 kept their
# initial 0 and were drawn as if they were real results.
k.max <- 6
wss <- numeric(k.max)
for (k in seq_len(k.max)) {
  wss[k] <- sum(kmeans(standardized.process, centers = k, nstart = 1)$withinss)
}

plot(
  seq_len(k.max), wss,
  type = "b", xlab = "Number of Clusters", ylab = "Within Sum of Squares"
)

# Final model: three clusters, keeping the best of six random starts.
km.5 <- kmeans(standardized.process, centers = 3, nstart = 6)

km.5
## K-means clustering with 3 clusters of sizes 3, 2, 6
## 
## Cluster means:
##            a          b
## 1  2.0708050 0.07099903
## 2  0.4259942 0.81648882
## 3 -1.1438732 0.17749757
## 
## Clustering vector:
##  6  7  8  9 10 11 12 13 14 15 16 
##  3  3  1  1  1  2  3  3  3  3  2 
## 
## Within cluster sum of squares by cluster:
## [1] 0.3674228 0.2380407 4.1527367
##  (between_SS / total_SS =  82.1 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
# Rows 6-16 again, each labelled with the cluster k-means assigned to it.
standardized.plot <- as.data.frame(standardized[6:16, ])
standardized.plot$cluster <- factor(km.5$cluster)

# One row per cluster centre, tagged with its cluster id so each centre is
# coloured like its members. The id is stored as a column (not computed inside
# aes()) because km.centers is reassigned further down the script.
km.centers <- as.data.frame(km.5$centers)
km.centers$cluster <- factor(seq_len(nrow(km.centers)))

# Scatter of the points plus a large translucent marker on every centre.
# theme() is closed right after legend.position so that the centre layer is
# added to the plot with `+` rather than nested inside the theme() call.
plot.3 <- ggplot(data = standardized.plot, aes(x = a, y = b, color = cluster)) +
  geom_point() +
  theme(legend.position = "right") +
  geom_point(
    data = km.centers, aes(x = a, y = b, color = cluster),
    size = 10, alpha = .3, show.legend = FALSE
  )
plot.3

##################################################################################################################################################################################################################################
# Robust standardisation of v: subtract v's own median (med3) and divide by
# v's MAD (ma3). The centre used to be `med`, which is the median of
# clustering$y, so every scaled value was shifted by a constant.
# NOTE(review): the cluster means of column c printed for lbsstandardized.csv
# look like they were derived from the old, shifted values - regenerate that
# file from the corrected output if so.
med3 <- median(lbsclustering$v, na.rm = FALSE)
ma3 <- mad(lbsclustering$v)
lbsclustering.scaled <- scale(lbsclustering$v, center = med3, scale = ma3)
lbsclustering.scaled
##              [,1]
##  [1,]  0.14199805
##  [2,] -1.98797276
##  [3,] -1.84597471
##  [4,] -1.27798249
##  [5,] -0.92298736
##  [6,] -0.70999027
##  [7,] -0.56799222
##  [8,] -0.14199805
##  [9,] -0.07099903
## [10,]  0.42599416
## [11,]  0.49699319
## [12,]  0.07099903
## [13,]  0.63899125
## [14,]  0.70999027
## [15,]  0.92298736
## [16,]  1.13598444
## attr(,"scaled:center")
## [1] 6.4
## attr(,"scaled:scale")
## [1] 2.81694
# Robust standardisation of p: subtract the median, divide by the MAD.
med4 <- median(lbsclustering[["p"]])
ma4 <- mad(lbsclustering[["p"]])
lbsclustering.scaled2 <- scale(lbsclustering[["p"]], center = med4, scale = ma4)
lbsclustering.scaled2
##              [,1]
##  [1,]  0.65082442
##  [2,] -0.01183317
##  [3,]  0.41416099
##  [4,] -0.05916586
##  [5,]  0.01183317
##  [6,] -0.79282247
##  [7,] -2.07080496
##  [8,]  2.37846741
##  [9,]  2.04713862
## [10,]  1.78680885
## [11,]  0.55615905
## [12,] -0.50882636
## [13,] -1.26614932
## [14,] -0.69815710
## [15,] -1.52647909
## [16,]  0.29582928
## attr(,"scaled:center")
## [1] 21.71551
## attr(,"scaled:scale")
## [1] 9.315423
###################################################################################################################################################################################################################################
# Load the standardised second data set (columns c and d).
lbsstandardized <- read.csv("lbsstandardized.csv", header = TRUE)

# Only rows 6 through 16 take part in the clustering.
lbsstandardized.process <- lbsstandardized[6:16, ]

# Elbow curve: total within-cluster sum of squares for k = 1..k.max.
# wss is sized to k.max so the plot shows only values that were actually
# computed; with a length-16 vector the slots for k = 7..16 kept their
# initial 0 and were drawn as if they were real results.
k.max <- 6
wss <- numeric(k.max)
for (k in seq_len(k.max)) {
  wss[k] <- sum(kmeans(lbsstandardized.process, centers = k, nstart = 1)$withinss)
}

plot(
  seq_len(k.max), wss,
  type = "b", xlab = "Number of Clusters", ylab = "Within Sum of Squares"
)

# Final model: three clusters, keeping the best of six random starts.
km.6 <- kmeans(lbsstandardized.process, centers = 3, nstart = 6)

km.6
## K-means clustering with 3 clusters of sizes 3, 2, 6
## 
## Cluster means:
##            c          d
## 1 -1.1537342  2.0708050
## 2 -0.4082444  0.4259942
## 3 -1.0472357 -1.1438732
## 
## Clustering vector:
##  6  7  8  9 10 11 12 13 14 15 16 
##  3  3  1  1  1  2  3  3  3  3  2 
## 
## Within cluster sum of squares by cluster:
## [1] 0.3674228 0.2380407 4.1527367
##  (between_SS / total_SS =  82.1 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
# Rows 6-16 again, each labelled with the cluster k-means assigned to it.
lbsstandardized.plot <- as.data.frame(lbsstandardized[6:16, ])
lbsstandardized.plot$cluster <- factor(km.6$cluster)

# One row per cluster centre, tagged with its cluster id so each centre is
# coloured like its members. The id is stored as a column (not computed inside
# aes()) because km.centers is reassigned elsewhere in the script.
km.centers <- as.data.frame(km.6$centers)
km.centers$cluster <- factor(seq_len(nrow(km.centers)))

# Scatter of the points plus a large translucent marker on every centre.
# theme() is closed right after legend.position so that the centre layer is
# added to the plot with `+` rather than nested inside the theme() call.
plot.4 <- ggplot(data = lbsstandardized.plot, aes(x = c, y = d, color = cluster)) +
  geom_point() +
  theme(legend.position = "right") +
  geom_point(
    data = km.centers, aes(x = c, y = d, color = cluster),
    size = 10, alpha = .3, show.legend = FALSE
  )
plot.4

#####################################################################################################################################################################################################################################
# Stack the two standardised cluster plots in a single column, tagged "C"
# and "D".
plot_grid(
  plotlist = list(plot.3, plot.4),
  labels = c("C", "D"),
  nrow = 2,
  ncol = 1
)