Exercise 13 - Data Reduction

Creating & Viewing Data with Groups

# 2D DATA create data using data.frame() function rnorm(500) + 5 indicates
# 500 points with a normalized distribution and a mean of 5
a <- data.frame(rnorm(500) + 5, rnorm(500) + 5)
b <- data.frame(rnorm(500) + 1, rnorm(500) + 2)
c <- data.frame(rnorm(500) + 3, rnorm(500) - 3)
# draw groups using plot add anothe set of points with points() function
plot(a, xlim = c(-2, 9), ylim = c(-7, 9), pch = 16)
points(b, pch = 16)
points(c, pch = 16)

plot of chunk unnamed-chunk-1


# 3D DATA
library(scatterplot3d)
# create data
d <- data.frame(x = rnorm(500) + 5, y = rnorm(500) + 5, z = rnorm(500) + 3, 
    group = 1)
e <- data.frame(x = rnorm(500) + 1, y = rnorm(500) + 2, z = rnorm(500) + 1, 
    group = 2)
f <- data.frame(x = rnorm(500) + 3, y = rnorm(500) - 3, z = rnorm(500) - 1, 
    group = 3)

# place all three groups into one data frame using rbind:
g = rbind(d, e, f)

# draw plot use cex.symbols instead of just cex to set point size
scatterplot3d(x = g$x, y = g$y, z = g$z, pch = 16, cex.symbols = 0.5)

plot of chunk unnamed-chunk-1

# plot with color
scatterplot3d(x = g$x, y = g$y, z = g$z, pch = 16, cex.symbols = 0.5, color = g$group)

plot of chunk unnamed-chunk-1


# Interactive 3D graph library(rgl) plot3d(x=g$x,y=g$y,z=g$z, pch=16)

K-means Clustering

# Run kmeans k-means iteratively finds the best clusters and assignments
# (given a set number of clusters to use) by randomly placing cluster
# means, assigning points to the nearest cluster, recalculatig the mean of
# each cluster, and shifting the estimated means to match observed means
# -- theoretically until the esitmated and observed are equivalent.  the
# second number is the number of clusters to find iter.max= sets the max
# number of mean shifts allowed (higher = more precise) nstart = the
# number of times to run this process; kmeans is sensitive to the initial
# conditions.
groups <- kmeans(g, 3, iter.max = 20, nstart = 100)
g$cl <- groups$cluster

km <- scatterplot3d(x = g$x, y = g$y, z = g$z, pch = 16, color = g$cl, cex.symbols = 0.5)
km$points3d(groups$centers, pch = 14, lwd = 6, col = 7)

plot of chunk unnamed-chunk-2


# Example with USA data
library(maptools)
## Loading required package: foreign
## Loading required package: sp
## Loading required package: grid
## Loading required package: lattice
## Checking rgeos availability: TRUE
library(classInt)
## Loading required package: class
## Loading required package: e1071
library(sp)
USA <- readShapePoly("/Users/telekineticturtle/Desktop/Colorado 13/Quant Methods/Data/USAcopy.shp")
USA <- USA[, 16:30]
## How many groups?
USAg <- kmeans(USA@data, 7, iter.max = 1000)
USA$cl <- USAg$cluster
library(RColorBrewer)
spplot(USA, zcol = "cl", col.regions = brewer.pal(7, "Spectral"), cuts = 6)

plot of chunk unnamed-chunk-2

How many groups?

Scree Plots

# 4 Groups
groups <- kmeans(g, 4)
g$cl <- groups$cluster
km <- scatterplot3d(x = g$x, y = g$y, z = g$z, pch = 16, color = g$cl, cex.symbols = 0.5)
km$points3d(groups$centers, pch = 14, lwd = 6, col = 7)

plot of chunk unnamed-chunk-3


# 2 Groups
groups <- kmeans(g, 2)
g$cl <- groups$cluster
km <- scatterplot3d(x = g$x, y = g$y, z = g$z, pch = 16, color = g$cl, cex.symbols = 0.5)
km$points3d(groups$centers, pch = 14, lwd = 6, col = 7)

plot of chunk unnamed-chunk-3


# A scree plot -- plot within SS against number of clusters This will
# ALWAYS decrease as you add clusters, so looking for a 'kink' or 'elbow'
fit <- NA
for (i in 1:5) {
    groups <- kmeans(g, i, nstart = 99)
    fit[i] <- groups$tot.withinss
}
plot(fit, type = "l", ylab = "Within Sum of Squares", main = "Scree Plot for Clustering")

plot of chunk unnamed-chunk-3


# A scree plot with error bars
fit <- data.frame(fit = 1:2000, k = 1:2000)
res <- list()
ctr <- 1
# This loop conducts 100 kmeans clusterings for 1 to 20 clusters The
# counter is prsent so that they can all be stored in the same data frame,
# each instance in a new row.
for (i in 1:20) {
    for (j in 1:100) {
        groups <- kmeans(USA@data, i, iter.max = 100, nstart = 1)
        fit[ctr, "fit"] <- groups$tot.withinss
        fit[ctr, "k"] <- i
        ctr <- ctr + 1
    }
}
fit$k <- as.factor(fit$k)
library(ggplot2)
ggplot(fit, aes(x = k, y = fit, group = k, color = k)) + geom_boxplot()

plot of chunk unnamed-chunk-3

Use of similarity (silhouette width)

# The silhouette function:
library(cluster)
# loop over range of k from 2 to 20 clusters the distance between all
# pairs of rows is calculated first
diss <- dist(USA@data)
# avg.width output of silhouette function gives silhouette, comparing
# within a cluster to between cluster A and cluster B, where B is the
# closest cluster to clsuter A
numk <- c(2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20)
res <- list()  # hold cluster solutions in a list
for (i in 2:20) {
    res[i - 1] <- summary(silhouette(kmeans(USA@data, i, iter.max = 100, nstart = 10)$cluster, 
        diss))$avg.width
}
plot(numk, res, typ = "b", main = "Average Silhouette Width by k")

# Cascade Function:
library(vegan)
## Loading required package: permute
## This is vegan 2.0-7
# set smallest and largest possible k with inf.gr and sup.gr iter sets the
# number of iterations for each k
km <- cascadeKM(USA@data, inf.gr = 2, sup.gr = 20, iter = 100)
plot(km, sortg = TRUE)

plot of chunk unnamed-chunk-4 plot of chunk unnamed-chunk-4

Hierarchical Clustering:

library(maptools)
library(classInt)
library(sp)
UT <- readShapePoly("/Users/telekineticturtle/Desktop/Colorado 13/Quant Methods/Data/USAcopy.shp")
UT <- UT@data[UT$STATE_NAME == "Utah", ]

# hclust() only requires a dist object. Including row names is useful for
# legibility.  There are several methods, each of which will yield
# different results.  Seth likes Ward's.
rownames(UT) <- UT$NAME
diss <- dist(UT[, 16:30])
hc <- hclust(diss, method = "ward")
plot(hc)

plot of chunk unnamed-chunk-5


hcc <- hclust(diss, method = "complete")
plot(hcc)

plot of chunk unnamed-chunk-5


hca <- hclust(diss, method = "average")
plot(hca)

plot of chunk unnamed-chunk-5


# Visualize as heat map:
heatmap(as.matrix(diss), hclustfun = function(d) hclust(d, method = "ward"))

plot of chunk unnamed-chunk-5


# Extract cluster assignments from a denrogram: Need to specify the number
# of clusters to define
cutree(hca, 3)
##      Cache  Box Elder       Rich      Weber     Morgan     Summit 
##          1          1          2          1          1          3 
##      Davis     Tooele    Daggett  Salt Lake     Uintah   Duchesne 
##          1          1          2          1          2          2 
##    Wasatch       Utah       Juab     Carbon    Sanpete      Emery 
##          1          1          2          2          2          2 
##    Millard      Grand     Sevier     Beaver      Wayne      Piute 
##          2          2          2          2          2          2 
##   San Juan   Garfield       Iron Washington       Kane 
##          2          2          1          1          1

Exercise Step 1:

USA <- readShapePoly("/Users/telekineticturtle/Desktop/Colorado 13/Quant Methods/Data/USAcopy.shp")
USA.8 <- kmeans(USA@data[, 16:30], 8, iter.max = 100, nstart = 100)
plotDataZ <- as.data.frame(scale(USA@data[, 16:30], center = TRUE, scale = TRUE))
plotDataZ$Cluster <- as.factor(USA.8$cluster)

library(ggplot2)
library(reshape2)
plotDataM <- melt(plotDataZ, id = "Cluster")
plot1 <- ggplot(plotDataM, aes(group = variable, x = variable, y = value, fill = variable))
plot1 + stat_summary(fun.y = "mean", geom = "bar") + facet_grid(. ~ Cluster) + 
    opts(axis.text.x = element_text(size = 8, angle = 45, hjust = 1, vjust = 1), 
        title = "Community Type Profiles") + ylab("Variable Mean (Standard Score)") + 
    xlab("Variables by Community Type")
## 'opts' is deprecated. Use 'theme' instead. (Deprecated; last used in
## version 0.9.1)
## Setting the plot title with opts(title="...") is deprecated.  Use
## labs(title="...") or ggtitle("...") instead. (Deprecated; last used in
## version 0.9.1)

plot of chunk unnamed-chunk-6

Exercise Step 2:

library(rgeos)
## rgeos version: 0.2-16, (SVN revision 389) GEOS runtime version:
## 3.3.3-CAPI-1.7.4 Polygon checking: TRUE
library(gpclib)
## Warning: the specification for class "gpc.poly" in package 'gpclib' seems
## equivalent to one from package 'rgeos' and is not turning on duplicate
## class definitions for this class
## Warning: the specification for class "gpc.poly.nohole" in package 'gpclib'
## seems equivalent to one from package 'rgeos' and is not turning on
## duplicate class definitions for this class
## General Polygon Clipper Library for R (version 1.5-5) Type 'class ?
## gpc.poly' for help
## Attaching package: 'gpclib'
## The following objects are masked from 'package:rgeos':
## 
## append.poly, area.poly, get.bbox, get.pts, read.polyfile, scale.poly,
## triangulate, tristrip, write.polyfile
library(RColorBrewer)
gpclibPermit()  # Need to do this; I don't know why
## [1] TRUE

# Attach clusters to USA shapefile:
USA$cluster <- as.character(USA.8$cluster)  # use as.character to make discrete variable instead of continuous

# Step 1: Fortify
usa_geom <- fortify(USA, region = "FIPS")
# Step 2: Merge
usa_map_df <- merge(usa_geom, USA, by.x = "id", by.y = "FIPS")
# Step 3: Map
map1 <- ggplot(usa_map_df, aes(long, lat, group = group)) + geom_polygon(data = usa_map_df, 
    aes(fill = cluster)) + coord_equal() + scale_fill_brewer("Clusters", palette = "YlGnBu") + 
    ggtitle("Clusters for US Counties based on Socioeconomic Data") + geom_path(data = usa_geom, 
    aes(long, lat, group = group), lty = 3, lwd = 0.1, color = "white")
map1

plot of chunk unnamed-chunk-7