Emily Gill - CVEN6833 - Homework #2

November 11, 2013

Question 6: Cluster Analysis

Part (i)

Identify the number of clusters, K.

rm(list = ls())

library(fields)

# Read in the precip data and convert to one column of average precipitation
# per division
pdata = matrix(scan("http://iridl.ldeo.columbia.edu/expert/SOURCES/.NOAA/.NCDC/.CIRS/.ClimateDivision/.pcp/T/%28Dec%201900%29%28Dec%202012%29RANGE/T/4/boxAverage/T/12/STEP/IDIV/201/202/203/204/205/206/207/401/402/403/404/405/406/407/502/504/505/1001/1002/1003/1004/1005/1006/1007/1008/1009/1010/2401/2402/2403/2404/2405/2406/2407/2601/2602/2603/2604/2901/2902/2904/2905/2906/2907/2908/3501/3502/3503/3504/3505/3506/3507/3508/3509/4201/4202/4203/4204/4205/4206/4207/4501/4502/4503/4504/4505/4506/4507/4508/4509/4510/4801/4802/4803/4804/4805/4806/4807/4808/4809/4810/VALUES/data.ch"), 
    ncol = 112, byrow = T)
precip <- t(pdata)
precip <- matrix(apply(precip, 2, mean), 81, 1)

# Read in the coordinate information so we can plot and later include lat
# and lon
coords <- read.table("http://civil.colorado.edu/~balajir/CVEN6833/R-sessions/session2/files-4HW2/climdivcoords.txt", 
    header = TRUE, sep = "\t")
coords1 <- coords[coords$X < -104, ]
coords1 <- coords1[order(coords1$CLIMDIV), ]
latlonUS <- coords1[, 4:5]
lons <- coords1$X
lats <- coords1$Y

# Determine the numer of clusters by looking at WSS
nk = 30
wss <- (nrow(precip) - 1) * sum(apply(precip, 2, var))
par(mfrow = c(1, 1))
for (i in 2:nk) wss[i] <- sum(kmeans(precip, centers = i)$withinss)
plot(1:nk, wss, type = "b", xlab = "# of Clusters", ylab = "WSS", main = "Within Group Sum of Squares \n to Choose Cluster Number")

plot of chunk unnamed-chunk-1

Comments on WSS kink plot:

From this WSS plot, we will choose the best number of clusters as 5. This is the point on the plot where adding another cluster doesn't gain much in terms of WSS.

kbest <- 5  # Choosing 5 based on saturation in above plot

Part (ii)

Cluster the data into K clusters and display them.

zclust <- kmeans(precip, centers = kbest)
quilt.plot(lons, lats, zclust$cluster, xlab = "lon", ylab = "lat", main = "Clustering Using only Precip \n and Five Clusters")
# plot(lons,lats,col=c(zclust$cluster),xlab='lon',ylab='lat',main='Clustering
# Using only Precip \n and Five Clusters')
US(add = TRUE, col = "grey", lwd = 2, xlim = range(-125, -100))

plot of chunk unnamed-chunk-3

Comments on cluster plot with 5 clusters:

This plot uses 5 clusters and only precip to group the western winter US precipitation. Even with only five clusters, we see clear regions associated with each cluster.

Part (iii)

Repeat the first two parts but include latitude and longitude.

# Choose the number of clusters, this time including lat and lon
precipll <- cbind(precip, coords1$X, coords1$Y)
nk = 30
wss <- (nrow(precipll) - 1) * sum(apply(precipll, 2, var))
for (i in 2:nk) wss[i] <- sum(kmeans(precipll, centers = i)$withinss)
plot(1:nk, wss, type = "b", xlab = "Number of Clusters", ylab = "Within groups sum of squares")

plot of chunk unnamed-chunk-4


# Cluster with lat and lon.
kbest <- 10
zclust <- kmeans(precipll, centers = kbest)
quilt.plot(lons, lats, zclust$cluster, xlab = "lon", ylab = "lat", main = "Clustering Using P, Lat, Lon \n and 10 Clusters")
# plot(lons,lats,col=c(zclust$cluster),xlab='lon',ylab='lat',main='Clustering
# Using P, Lat, Lon \n and 10 Clusters')
US(add = TRUE, col = "grey", lwd = 2, xlim = range(-125, -100))

plot of chunk unnamed-chunk-4

Comments on cluster plot with 10 clusters, using lat and lon:

This plot uses 10 clusters and includes latitude and longitude in with precipitation to group the western winter US precipitation. The clustering is much better in this method, clearly groupd regions of the western US with amounts of precipitation.