# Cluster analysis of the city data stored in cluster.csv ----
# The file path is relative, so it is resolved against the working directory.
city <- read.csv(file = "cluster.csv")
str(city)
## 'data.frame': 15 obs. of 3 variables:
## $ CityID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Income : num 1.14 -1.25 1.62 1.64 0.55 -0.94 0.89 -0.87 -0.44 0.08 ...
## $ Population: num 1.72 -1.17 0.89 1.35 0.1 -1.25 1.32 -0.63 -0.07 -0.55 ...

# Scatter plot of Income (y) against Population (x); every point is tagged
# with its CityID, drawn to the left of the point (pos = 2).
plot(x = city$Population, y = city$Income)
text(
  x = city$Population,
  y = city$Income,
  labels = city$CityID,
  pos = 2,
  cex = 0.9
)
# Hierarchical clustering ----
# Pairwise Euclidean distances between cities, using columns 2:3 only
# (Income and Population per the str() output; CityID is left out).
distance <- dist(city[2:3], method = "euclidean")

# The method name "ward" was renamed to "ward.D" in R 3.1.0; newer versions
# still accept "ward" but emit a message and treat it as "ward.D". Spelling
# it out gives the same tree without the message.
# NOTE(review): per ?hclust, "ward.D" applied to unsquared distances does not
# implement Ward's (1963) criterion; "ward.D2" does. Switching would change
# the merge heights (and possibly the groups), so it is not done here.
city_hclust <- hclust(distance, method = "ward.D")

# Components available on the fitted hclust object.
ls(city_hclust)
## [1] "call" "dist.method" "height" "labels" "merge"
## [6] "method" "order"

# Dendrogram labelled by CityID, with the 3-cluster cut outlined in red.
plot(city_hclust, labels = city$CityID)
rect.hclust(city_hclust, k = 3, border = "red")

# Cluster membership (1..3) for each city, and the resulting cluster sizes.
clusterGp <- cutree(city_hclust, k = 3)
table(clusterGp)
## clusterGp
## 1 2 3
## 4 5 6
# Five-number summary plus mean of Income within each hierarchical cluster.
tapply(X = city[["Income"]], INDEX = clusterGp, FUN = summary)
## $`1`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.89 1.08 1.38 1.32 1.62 1.64
##
## $`2`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.29 -1.25 -1.07 -1.08 -0.94 -0.87
##
## $`3`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.4400 -0.1580 -0.0050 0.0217 0.1780 0.5500

# Income distribution per cluster; clusterGp is not a column of `city`, so
# the formula picks it up from the calling environment.
boxplot(
  Income ~ clusterGp,
  data = city,
  main = "Income by City cluster",
  xlab = "cluster (hierarchical)",
  ylab = "Income"
)

# Same summary for Population.
tapply(X = city[["Population"]], INDEX = clusterGp, FUN = summary)
## $`1`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.89 1.21 1.34 1.32 1.44 1.72
##
## $`2`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.38 -1.25 -1.17 -1.06 -0.86 -0.63
##
## $`3`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.5500 -0.1000 -0.0250 0.0017 0.0800 0.6200

# Population distribution per cluster.
boxplot(
  Population ~ clusterGp,
  data = city,
  main = "Population by City cluster",
  xlab = "cluster (hierarchical)",
  ylab = "Population"
)
# K-means clustering ----
# kmeans() picks its starting centres at random, so the seed is fixed right
# before the call to make the partition reproducible.
set.seed(123)
city_km <- kmeans(x = city[2:3], centers = 3)

# Number of cities assigned to each of the three clusters.
table(city_km$cluster)
##
## 1 2 3
## 4 6 5

# Components available on the fitted kmeans object.
ls(city_km)
## [1] "betweenss" "centers" "cluster" "ifault"
## [5] "iter" "size" "tot.withinss" "totss"
## [9] "withinss"

# Income summary within each k-means cluster. Cluster numbers are arbitrary
# labels: in the output below, k-means clusters 2 and 3 carry the summaries
# that the hierarchical run listed under 3 and 2.
tapply(X = city[["Income"]], INDEX = city_km$cluster, FUN = summary)
## $`1`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.89 1.08 1.38 1.32 1.62 1.64
##
## $`2`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.4400 -0.1580 -0.0050 0.0217 0.1780 0.5500
##
## $`3`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.29 -1.25 -1.07 -1.08 -0.94 -0.87

# Income distribution per k-means cluster.
boxplot(
  Income ~ city_km$cluster,
  data = city,
  main = "Income by City cluster",
  xlab = "cluster (kmeans)",
  ylab = "Income"
)