# HW 2 Q 6 JLCY, 9 Nov 2013

############################################################ 

# part i

# cluster analysis on 112 years of winter PPT: 1900-01 to 2011-12

ntime = 112  # Dec 1900 to Mar 1901 - Dec 2011 to Mar 2012

ntimep = ntime

nlocation = 81  # 81 stations in W US

# Lat - Long grid..  create grid below:

# read coord online
# locs=matrix(scan('http://civil.colorado.edu/~balajir/CVEN6833/R-sessions/session2/files-4HW2/WesternUS-coords.txt'),
# ncol=5, byrow=T)

# read coord from pc
locs = matrix(scan("C:\\Users\\jfklee90\\Desktop\\CVEN 6833\\HW 2\\WesternUS-coords.txt"), 
    ncol = 5, byrow = T)

xlat = locs[, 3]
ylon = locs[, 4]

xygrid = matrix(0, nrow = 81, ncol = 2)

xygrid[, 1] = xlat
xygrid[, 2] = ylon

############################################################ 

# Read W US PPT data -- already annual averaged

# data=scan('http://civil.colorado.edu/~balajir/CVEN6833/R-sessions/session2/files-4HW2/western-US-cdiv-winter-1900-2012-precip.txt')

data = scan("C:\\Users\\jfklee90\\Desktop\\CVEN 6833\\HW 2\\W-US-cdiv-ppt-DJFM-1900-2012.txt")

data_m <- array(dim = c(ntime, nlocation))

# set data matrix: 112 years x 81 locations
i = 1
j = 1
for (i in 1:81) {
    data_m[, i] = data[j:(j + 111)]
    i = i + 1
    j = j + 112
}

############################################################ 

nk = 10  # number of clusters to consider, j

# x is the data matrix to cluster - columns are the attributes to cluster on
# rows are records/observations to cluster

# x1 as 81 locations x 1 column of averged W US ppt
x1 = apply(data_m, 2, mean)
x1 = as.matrix(x1)

wss = 1:nk

wss <- (nrow(x1) - 1) * sum(apply(x1, 2, var))

for (i in 2:nk) wss[i] <- sum(kmeans(x1, centers = i)$withinss)

plot(1:10, wss, type = "b", xlab = "Number of Clusters", ylab = "Within groups Sum of Squares WSS", 
    main = "Number of Clusters vs WSS, using PPT")

plot of chunk unnamed-chunk-1


par(ask = TRUE)

# select the K corresponding to the the minimum WSS or WSS beyond with the
# drop is small. Similar to the Eigen spectrum kbest = the best number of
# clusters.

kbest = 4  # by eye-balling

############################################################ 

# part ii

lats = locs[, 3]
lons = locs[, 4]
lons = -lons  #W longitude to be negative.

library(maps)
library(mapdata)
library(akima)
library(fields)

## Loading required package: spam
## Loading required package: grid
## Spam version 0.30-1 (2013-08-25) is loaded.
## Type 'help( Spam)' or 'demo( spam)' for a short introduction 
## and overview of this package.
## Help for individual functions is also obtained by adding the
## suffix '.spam' to the function name, e.g. 'help( chol.spam)'.
## 
## Attaching package: 'spam'
## 
## The following object is masked from 'package:base':
## 
##     backsolve, forwardsolve


# x is ppt
zclust_p = kmeans(x1, centers = kbest)
plot(lons, lats, col = c(zclust_p$cluster), xlab = "Longitude", ylab = "Latitude", 
    main = "Clusters of W US PPT 1900-2012, kbest = 4, using PPT", pch = 16)
US(add = TRUE, col = "grey", lwd = 1, xlim = range(-125, -100))

plot of chunk unnamed-chunk-1


par(ask = TRUE)

############################################################ 

# Use BIC to get best number of clusters found out 3 clusters are optimal
# comment all the codes out because it crashes package 'maps'

# library(mclust)

# Run the function to see how many clusters it finds to be optimal, set it
# to search for at least 1 model and up 20.

# d_clust <- Mclust(as.matrix(x1), G=1:20) m.best <- dim(d_clust$z)[2]
# cat('model-based optimal number of clusters:', m.best, '\n') # = 3
# clusters

# plot(d_clust)

kbest = 3  # by BIC

############################################################ 

# kbest = 3

zclust_p = kmeans(x1, centers = kbest)
plot(lons, lats, col = c(zclust_p$cluster), xlab = "Longitude", ylab = "Latitude", 
    main = "Clusters of W US PPT 1900-2012, kbest = 3, using PPT", pch = 16)
US(add = TRUE, col = "grey", lwd = 1, xlim = range(-125, -100))

plot of chunk unnamed-chunk-1


############################################################ 

# part iii

# create data matrix of 3 columns: lat, lon, ppt

x3 = matrix(0, nrow = 81, ncol = 3)

x3[, 1] = ylon
x3[, 2] = xlat
x3[, 3] = x1

x3 = as.matrix(x3)

nk = 20  # number of clusters to consider, j

wss = 1:nk

wss <- (nrow(x3) - 1) * sum(apply(x3, 2, var))

for (i in 2:nk) wss[i] <- sum(kmeans(x3, centers = i)$withinss)

plot(1:20, wss, type = "b", xlab = "Number of Clusters", ylab = "Within groups Sum of Squares WSS", 
    main = "Number of Clusters vs WSS, using lat, lon, PPT")

plot of chunk unnamed-chunk-1


par(ask = TRUE)

kbest = 7  # by eye-balling

############################################################ 

# kbest = 7

# x is lon, lat and precip
zclust_llp = kmeans(x3, centers = kbest)
plot(lons, lats, col = c(zclust_llp$cluster), xlab = "Longitude", ylab = "Latitude", 
    main = "Clusters of W US PPT 1900-2012, kbest = 7, using lat, lon, PPT", 
    pch = 16)
US(add = TRUE, col = "grey", lwd = 1, xlim = range(-125, -100))

plot of chunk unnamed-chunk-1


par(ask = TRUE)

############################################################ 

# Use BIC to get best number of clusters found out 3 clusters are optimal
# comment all the codes out because it crashes package 'maps'

# library(mclust)

# Run the function to see how many clusters it finds to be optimal, set it
# to search for at least 1 model and up 20.

# d_clust3 <- Mclust(as.matrix(x3), G=1:20) m.best3 <- dim(d_clust$z)[2]
# cat('model-based optimal number of clusters:', m.best3, '\n') # = 3
# clusters

# plot(d_clust)

kbest = 3  # same as BIC from above

############################################################ 

# x is lon, lat and precip
zclust_llp = kmeans(x3, centers = kbest)
plot(lons, lats, col = c(zclust_llp$cluster), xlab = "Longitude", ylab = "Latitude", 
    main = "Clusters of W US PPT 1900-2012, kbest = 3, using lat, lon, PPT", 
    pch = 16)
US(add = TRUE, col = "grey", lwd = 1, xlim = range(-125, -100))

plot of chunk unnamed-chunk-1


par(ask = TRUE)

############################################################