# Cluster Analysis --- data file cluster.csv

# Load the city data set from the working directory and inspect its structure.
city <- read.csv(file = "cluster.csv")
str(city)

## 'data.frame':    15 obs. of  3 variables:
##  $ CityID    : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Income    : num  1.14 -1.25 1.62 1.64 0.55 -0.94 0.89 -0.87 -0.44 0.08 ...
##  $ Population: num  1.72 -1.17 0.89 1.35 0.1 -1.25 1.32 -0.63 -0.07 -0.55 ...

# Scatter plot of Income against Population. Each point is then tagged with
# its CityID, drawn to the left of the point (pos = 2) at 90% text size.
plot(city$Population, city$Income)
text(
  x = city$Population,
  y = city$Income,
  labels = city$CityID,
  pos = 2,
  cex = 0.9
)

## [Figure: plot of chunk unnamed-chunk-1 -- scatter plot of Income vs. Population, points labelled by CityID]


# hierarchical clustering

# Pairwise Euclidean distances between cities on the two clustering variables.
# Columns are selected by name (Income, Population -- columns 2 and 3 of
# `city`) so the analysis does not depend on column position.
distance <- dist(city[c("Income", "Population")], method = "euclidean")

# Agglomerative clustering with Ward's method.
# The method name "ward" was renamed to "ward.D" in R 3.1.0; the old spelling
# still runs but emits a message on every call. "ward.D" is the same algorithm,
# so the results below are unchanged.
# NOTE(review): on unsquared Euclidean distances, "ward.D2" is the variant that
# implements Ward's criterion. Switching would change merge heights and
# possibly the tree, so confirm with the analysis owner before adopting it.
city_hclust <- hclust(distance, method = "ward.D")

# List the components of the fitted hclust object.
ls(city_hclust)

## [1] "call"        "dist.method" "height"      "labels"      "merge"      
## [6] "method"      "order"


# Dendrogram, with leaves labelled by CityID.
plot(city_hclust, labels = city$CityID)


# Outline the 3-cluster solution on the dendrogram just drawn; rect.hclust()
# adds to the existing plot, so it must follow the plot() call above.
rect.hclust(city_hclust, k = 3, border = "red")

## [Figure: plot of chunk unnamed-chunk-1 -- dendrogram of the hierarchical clustering, 3 clusters outlined in red]



# Cut the dendrogram into three groups and count the cities in each group.
clusterGp <- cutree(tree = city_hclust, k = 3)
table(clusterGp)

## clusterGp
## 1 2 3 
## 4 5 6


# Five-number summary (plus mean) of Income within each hierarchical cluster.
tapply(X = city$Income, INDEX = clusterGp, FUN = summary)

## $`1`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.89    1.08    1.38    1.32    1.62    1.64 
## 
## $`2`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   -1.29   -1.25   -1.07   -1.08   -0.94   -0.87 
## 
## $`3`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -0.4400 -0.1580 -0.0050  0.0217  0.1780  0.5500

# Side-by-side boxplots of Income, one box per hierarchical cluster.
boxplot(
  Income ~ clusterGp,
  data = city,
  main = "Income by City cluster",
  xlab = "cluster (hierarchical)",
  ylab = "Income"
)

## [Figure: plot of chunk unnamed-chunk-1 -- boxplot of Income by hierarchical cluster]


# Five-number summary (plus mean) of Population within each hierarchical
# cluster, using the group labels stored in `clusterGp`.
tapply(X = city$Population, INDEX = clusterGp, FUN = summary)

## $`1`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.89    1.21    1.34    1.32    1.44    1.72 
## 
## $`2`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   -1.38   -1.25   -1.17   -1.06   -0.86   -0.63 
## 
## $`3`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -0.5500 -0.1000 -0.0250  0.0017  0.0800  0.6200

# Side-by-side boxplots of Population, one box per hierarchical cluster.
boxplot(
  Population ~ clusterGp,
  data = city,
  main = "Population by City cluster",
  xlab = "cluster (hierarchical)",
  ylab = "Population"
)

## [Figure: plot of chunk unnamed-chunk-1 -- boxplot of Population by hierarchical cluster]


# k-means clustering

# Fix the RNG seed so the random initial centres chosen by kmeans() -- and
# therefore the cluster numbering -- are reproducible.
set.seed(123)

# Partition the cities into three clusters on columns 2 and 3 of `city`,
# then count the cities assigned to each cluster.
city_km <- kmeans(x = city[2:3], centers = 3)
table(city_km$cluster)

## 
## 1 2 3 
## 4 6 5

# List the components of the fitted kmeans object.
ls(city_km)

## [1] "betweenss"    "centers"      "cluster"      "ifault"      
## [5] "iter"         "size"         "tot.withinss" "totss"       
## [9] "withinss"

# Five-number summary (plus mean) of Income within each k-means cluster.
tapply(X = city$Income, INDEX = city_km$cluster, FUN = summary)

## $`1`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.89    1.08    1.38    1.32    1.62    1.64 
## 
## $`2`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -0.4400 -0.1580 -0.0050  0.0217  0.1780  0.5500 
## 
## $`3`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   -1.29   -1.25   -1.07   -1.08   -0.94   -0.87

# Side-by-side boxplots of Income, one box per k-means cluster.
boxplot(
  Income ~ city_km$cluster,
  data = city,
  main = "Income by City cluster",
  xlab = "cluster (kmeans)",
  ylab = "Income"
)

## [Figure: plot of chunk unnamed-chunk-1 -- boxplot of Income by k-means cluster]