# 2D DATA create data using data.frame() function rnorm(500) + 5 indicates
# 500 points with a normalized distribution and a mean of 5
a <- data.frame(rnorm(500) + 5, rnorm(500) + 5)
b <- data.frame(rnorm(500) + 1, rnorm(500) + 2)
c <- data.frame(rnorm(500) + 3, rnorm(500) - 3)
# draw groups using plot add anothe set of points with points() function
plot(a, xlim = c(-2, 9), ylim = c(-7, 9), pch = 16)
points(b, pch = 16)
points(c, pch = 16)
# 3D DATA
library(scatterplot3d)
# create data
d <- data.frame(x = rnorm(500) + 5, y = rnorm(500) + 5, z = rnorm(500) + 3,
group = 1)
e <- data.frame(x = rnorm(500) + 1, y = rnorm(500) + 2, z = rnorm(500) + 1,
group = 2)
f <- data.frame(x = rnorm(500) + 3, y = rnorm(500) - 3, z = rnorm(500) - 1,
group = 3)
# place all three groups into one data frame using rbind:
g = rbind(d, e, f)
# draw plot use cex.symbols instead of just cex to set point size
scatterplot3d(x = g$x, y = g$y, z = g$z, pch = 16, cex.symbols = 0.5)
# plot with color
scatterplot3d(x = g$x, y = g$y, z = g$z, pch = 16, cex.symbols = 0.5, color = g$group)
# Interactive 3D graph library(rgl) plot3d(x=g$x,y=g$y,z=g$z, pch=16)
# Run kmeans k-means iteratively finds the best clusters and assignments
# (given a set number of clusters to use) by randomly placing cluster
# means, assigning points to the nearest cluster, recalculatig the mean of
# each cluster, and shifting the estimated means to match observed means
# -- theoretically until the esitmated and observed are equivalent. the
# second number is the number of clusters to find iter.max= sets the max
# number of mean shifts allowed (higher = more precise) nstart = the
# number of times to run this process; kmeans is sensitive to the initial
# conditions.
groups <- kmeans(g, 3, iter.max = 20, nstart = 100)
g$cl <- groups$cluster
km <- scatterplot3d(x = g$x, y = g$y, z = g$z, pch = 16, color = g$cl, cex.symbols = 0.5)
km$points3d(groups$centers, pch = 14, lwd = 6, col = 7)
# Example with USA data
library(maptools)
## Loading required package: foreign
## Loading required package: sp
## Loading required package: grid
## Loading required package: lattice
## Checking rgeos availability: TRUE
library(classInt)
## Loading required package: class
## Loading required package: e1071
library(sp)
USA <- readShapePoly("/Users/telekineticturtle/Desktop/Colorado 13/Quant Methods/Data/USAcopy.shp")
USA <- USA[, 16:30]
## How many groups?
USAg <- kmeans(USA@data, 7, iter.max = 1000)
USA$cl <- USAg$cluster
library(RColorBrewer)
spplot(USA, zcol = "cl", col.regions = brewer.pal(7, "Spectral"), cuts = 6)
# 4 Groups
groups <- kmeans(g, 4)
g$cl <- groups$cluster
km <- scatterplot3d(x = g$x, y = g$y, z = g$z, pch = 16, color = g$cl, cex.symbols = 0.5)
km$points3d(groups$centers, pch = 14, lwd = 6, col = 7)
# 2 Groups
groups <- kmeans(g, 2)
g$cl <- groups$cluster
km <- scatterplot3d(x = g$x, y = g$y, z = g$z, pch = 16, color = g$cl, cex.symbols = 0.5)
km$points3d(groups$centers, pch = 14, lwd = 6, col = 7)
# A scree plot -- plot within SS against number of clusters This will
# ALWAYS decrease as you add clusters, so looking for a 'kink' or 'elbow'
fit <- NA
for (i in 1:5) {
groups <- kmeans(g, i, nstart = 99)
fit[i] <- groups$tot.withinss
}
plot(fit, type = "l", ylab = "Within Sum of Squares", main = "Scree Plot for Clustering")
# A scree plot with error bars
fit <- data.frame(fit = 1:2000, k = 1:2000)
res <- list()
ctr <- 1
# This loop conducts 100 kmeans clusterings for 1 to 20 clusters The
# counter is prsent so that they can all be stored in the same data frame,
# each instance in a new row.
for (i in 1:20) {
for (j in 1:100) {
groups <- kmeans(USA@data, i, iter.max = 100, nstart = 1)
fit[ctr, "fit"] <- groups$tot.withinss
fit[ctr, "k"] <- i
ctr <- ctr + 1
}
}
fit$k <- as.factor(fit$k)
library(ggplot2)
ggplot(fit, aes(x = k, y = fit, group = k, color = k)) + geom_boxplot()
# The silhouette function:
library(cluster)
# loop over range of k from 2 to 20 clusters the distance between all
# pairs of rows is calculated first
diss <- dist(USA@data)
# avg.width output of silhouette function gives silhouette, comparing
# within a cluster to between cluster A and cluster B, where B is the
# closest cluster to clsuter A
numk <- c(2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20)
res <- list() # hold cluster solutions in a list
for (i in 2:20) {
res[i - 1] <- summary(silhouette(kmeans(USA@data, i, iter.max = 100, nstart = 10)$cluster,
diss))$avg.width
}
plot(numk, res, typ = "b", main = "Average Silhouette Width by k")
# Cascade Function:
library(vegan)
## Loading required package: permute
## This is vegan 2.0-7
# set smallest and largest possible k with inf.gr and sup.gr iter sets the
# number of iterations for each k
km <- cascadeKM(USA@data, inf.gr = 2, sup.gr = 20, iter = 100)
plot(km, sortg = TRUE)
library(maptools)
library(classInt)
library(sp)
UT <- readShapePoly("/Users/telekineticturtle/Desktop/Colorado 13/Quant Methods/Data/USAcopy.shp")
UT <- UT@data[UT$STATE_NAME == "Utah", ]
# hclust() only requires a dist object. Including row names is useful for
# legibility. There are several methods, each of which will yield
# different results. Seth likes Ward's.
rownames(UT) <- UT$NAME
diss <- dist(UT[, 16:30])
hc <- hclust(diss, method = "ward")
plot(hc)
hcc <- hclust(diss, method = "complete")
plot(hcc)
hca <- hclust(diss, method = "average")
plot(hca)
# Visualize as heat map:
heatmap(as.matrix(diss), hclustfun = function(d) hclust(d, method = "ward"))
# Extract cluster assignments from a denrogram: Need to specify the number
# of clusters to define
cutree(hca, 3)
## Cache Box Elder Rich Weber Morgan Summit
## 1 1 2 1 1 3
## Davis Tooele Daggett Salt Lake Uintah Duchesne
## 1 1 2 1 2 2
## Wasatch Utah Juab Carbon Sanpete Emery
## 1 1 2 2 2 2
## Millard Grand Sevier Beaver Wayne Piute
## 2 2 2 2 2 2
## San Juan Garfield Iron Washington Kane
## 2 2 1 1 1
USA <- readShapePoly("/Users/telekineticturtle/Desktop/Colorado 13/Quant Methods/Data/USAcopy.shp")
USA.8 <- kmeans(USA@data[, 16:30], 8, iter.max = 100, nstart = 100)
plotDataZ <- as.data.frame(scale(USA@data[, 16:30], center = TRUE, scale = TRUE))
plotDataZ$Cluster <- as.factor(USA.8$cluster)
library(ggplot2)
library(reshape2)
plotDataM <- melt(plotDataZ, id = "Cluster")
plot1 <- ggplot(plotDataM, aes(group = variable, x = variable, y = value, fill = variable))
plot1 + stat_summary(fun.y = "mean", geom = "bar") + facet_grid(. ~ Cluster) +
opts(axis.text.x = element_text(size = 8, angle = 45, hjust = 1, vjust = 1),
title = "Community Type Profiles") + ylab("Variable Mean (Standard Score)") +
xlab("Variables by Community Type")
## 'opts' is deprecated. Use 'theme' instead. (Deprecated; last used in
## version 0.9.1)
## Setting the plot title with opts(title="...") is deprecated. Use
## labs(title="...") or ggtitle("...") instead. (Deprecated; last used in
## version 0.9.1)
library(rgeos)
## rgeos version: 0.2-16, (SVN revision 389) GEOS runtime version:
## 3.3.3-CAPI-1.7.4 Polygon checking: TRUE
library(gpclib)
## Warning: the specification for class "gpc.poly" in package 'gpclib' seems
## equivalent to one from package 'rgeos' and is not turning on duplicate
## class definitions for this class
## Warning: the specification for class "gpc.poly.nohole" in package 'gpclib'
## seems equivalent to one from package 'rgeos' and is not turning on
## duplicate class definitions for this class
## General Polygon Clipper Library for R (version 1.5-5) Type 'class ?
## gpc.poly' for help
## Attaching package: 'gpclib'
## The following objects are masked from 'package:rgeos':
##
## append.poly, area.poly, get.bbox, get.pts, read.polyfile, scale.poly,
## triangulate, tristrip, write.polyfile
library(RColorBrewer)
gpclibPermit() # Need to do this; I don't know why
## [1] TRUE
# Attach clusters to USA shapefile:
USA$cluster <- as.character(USA.8$cluster) # use as.character to make discrete variable instead of continuous
# Step 1: Fortify
usa_geom <- fortify(USA, region = "FIPS")
# Step 2: Merge
usa_map_df <- merge(usa_geom, USA, by.x = "id", by.y = "FIPS")
# Step 3: Map
map1 <- ggplot(usa_map_df, aes(long, lat, group = group)) + geom_polygon(data = usa_map_df,
aes(fill = cluster)) + coord_equal() + scale_fill_brewer("Clusters", palette = "YlGnBu") +
ggtitle("Clusters for US Counties based on Socioeconomic Data") + geom_path(data = usa_geom,
aes(long, lat, group = group), lty = 3, lwd = 0.1, color = "white")
map1