# HW 2 Q 6 JLCY, 9 Nov 2013
############################################################
# part i
# cluster analysis on 112 years of winter PPT: 1900-01 to 2011-12
ntime = 112 # Dec 1900 to Mar 1901 - Dec 2011 to Mar 2012
ntimep = ntime
nlocation = 81 # 81 stations in W US
# Lat - Long grid.. create grid below:
# read coord online
# locs=matrix(scan('http://civil.colorado.edu/~balajir/CVEN6833/R-sessions/session2/files-4HW2/WesternUS-coords.txt'),
# ncol=5, byrow=T)
# read coord from pc
locs = matrix(scan("C:\\Users\\jfklee90\\Desktop\\CVEN 6833\\HW 2\\WesternUS-coords.txt"),
ncol = 5, byrow = T)
xlat = locs[, 3]
ylon = locs[, 4]
xygrid = matrix(0, nrow = 81, ncol = 2)
xygrid[, 1] = xlat
xygrid[, 2] = ylon
############################################################
# Read W US PPT data -- already annual averaged
# data=scan('http://civil.colorado.edu/~balajir/CVEN6833/R-sessions/session2/files-4HW2/western-US-cdiv-winter-1900-2012-precip.txt')
data = scan("C:\\Users\\jfklee90\\Desktop\\CVEN 6833\\HW 2\\W-US-cdiv-ppt-DJFM-1900-2012.txt")
data_m <- array(dim = c(ntime, nlocation))
# set data matrix: 112 years x 81 locations
i = 1
j = 1
for (i in 1:81) {
data_m[, i] = data[j:(j + 111)]
i = i + 1
j = j + 112
}
############################################################
nk = 10 # number of clusters to consider, j
# x is the data matrix to cluster - columns are the attributes to cluster on
# rows are records/observations to cluster
# x1 as 81 locations x 1 column of averged W US ppt
x1 = apply(data_m, 2, mean)
x1 = as.matrix(x1)
wss = 1:nk
wss <- (nrow(x1) - 1) * sum(apply(x1, 2, var))
for (i in 2:nk) wss[i] <- sum(kmeans(x1, centers = i)$withinss)
plot(1:10, wss, type = "b", xlab = "Number of Clusters", ylab = "Within groups Sum of Squares WSS",
main = "Number of Clusters vs WSS, using PPT")
par(ask = TRUE)
# select the K corresponding to the the minimum WSS or WSS beyond with the
# drop is small. Similar to the Eigen spectrum kbest = the best number of
# clusters.
kbest = 4 # by eye-balling
############################################################
# part ii
lats = locs[, 3]
lons = locs[, 4]
lons = -lons #W longitude to be negative.
library(maps)
library(mapdata)
library(akima)
library(fields)
## Loading required package: spam
## Loading required package: grid
## Spam version 0.30-1 (2013-08-25) is loaded.
## Type 'help( Spam)' or 'demo( spam)' for a short introduction
## and overview of this package.
## Help for individual functions is also obtained by adding the
## suffix '.spam' to the function name, e.g. 'help( chol.spam)'.
##
## Attaching package: 'spam'
##
## The following object is masked from 'package:base':
##
## backsolve, forwardsolve
# x is ppt
zclust_p = kmeans(x1, centers = kbest)
plot(lons, lats, col = c(zclust_p$cluster), xlab = "Longitude", ylab = "Latitude",
main = "Clusters of W US PPT 1900-2012, kbest = 4, using PPT", pch = 16)
US(add = TRUE, col = "grey", lwd = 1, xlim = range(-125, -100))
par(ask = TRUE)
############################################################
# Use BIC to get best number of clusters found out 3 clusters are optimal
# comment all the codes out because it crashes package 'maps'
# library(mclust)
# Run the function to see how many clusters it finds to be optimal, set it
# to search for at least 1 model and up 20.
# d_clust <- Mclust(as.matrix(x1), G=1:20) m.best <- dim(d_clust$z)[2]
# cat('model-based optimal number of clusters:', m.best, '\n') # = 3
# clusters
# plot(d_clust)
kbest = 3 # by BIC
############################################################
# kbest = 3
zclust_p = kmeans(x1, centers = kbest)
plot(lons, lats, col = c(zclust_p$cluster), xlab = "Longitude", ylab = "Latitude",
main = "Clusters of W US PPT 1900-2012, kbest = 3, using PPT", pch = 16)
US(add = TRUE, col = "grey", lwd = 1, xlim = range(-125, -100))
############################################################
# part iii
# create data matrix of 3 columns: lat, lon, ppt
x3 = matrix(0, nrow = 81, ncol = 3)
x3[, 1] = ylon
x3[, 2] = xlat
x3[, 3] = x1
x3 = as.matrix(x3)
nk = 20 # number of clusters to consider, j
wss = 1:nk
wss <- (nrow(x3) - 1) * sum(apply(x3, 2, var))
for (i in 2:nk) wss[i] <- sum(kmeans(x3, centers = i)$withinss)
plot(1:20, wss, type = "b", xlab = "Number of Clusters", ylab = "Within groups Sum of Squares WSS",
main = "Number of Clusters vs WSS, using lat, lon, PPT")
par(ask = TRUE)
kbest = 7 # by eye-balling
############################################################
# kbest = 7
# x is lon, lat and precip
zclust_llp = kmeans(x3, centers = kbest)
plot(lons, lats, col = c(zclust_llp$cluster), xlab = "Longitude", ylab = "Latitude",
main = "Clusters of W US PPT 1900-2012, kbest = 7, using lat, lon, PPT",
pch = 16)
US(add = TRUE, col = "grey", lwd = 1, xlim = range(-125, -100))
par(ask = TRUE)
############################################################
# Use BIC to get best number of clusters found out 3 clusters are optimal
# comment all the codes out because it crashes package 'maps'
# library(mclust)
# Run the function to see how many clusters it finds to be optimal, set it
# to search for at least 1 model and up 20.
# d_clust3 <- Mclust(as.matrix(x3), G=1:20) m.best3 <- dim(d_clust$z)[2]
# cat('model-based optimal number of clusters:', m.best3, '\n') # = 3
# clusters
# plot(d_clust)
kbest = 3 # same as BIC from above
############################################################
# x is lon, lat and precip
zclust_llp = kmeans(x3, centers = kbest)
plot(lons, lats, col = c(zclust_llp$cluster), xlab = "Longitude", ylab = "Latitude",
main = "Clusters of W US PPT 1900-2012, kbest = 3, using lat, lon, PPT",
pch = 16)
US(add = TRUE, col = "grey", lwd = 1, xlim = range(-125, -100))
par(ask = TRUE)
############################################################