####################
# Gabbi (Gabrielle) Allen
# Homework #4
# Math 5553
# Dr. Coberly

####################
# Exercise 2
# Suppose that we have four observations, for which we compute a dissimilarity 
# matrix.

# Exercise 2.a:
# On the basis of this dissimilarity matrix, sketch the dendrogram that results
# from hierarchically clustering these four observations using complete linkage. Be 
# sure to indicate on the plot the height at which each fusion occurs, as well as 
# the observations corresponding to each leaf in the dendrogram.

set.seed(1)

dis.mat <- as.dist(matrix(c(0.0, 0.3, 0.4, 0.7, 
                            0.3, 0.0, 0.5, 0.8, 
                            0.4, 0.5, 0.0, 0.45, 
                            0.7, 0.8, 0.45, 0.0), ncol = 4))

plot(hclust(dis.mat, method = "complete"))  

# In this case, we first fuse observations 1 and 2 together, as they have the 
# smallest inter-cluster dissimilarity. We fuse them at height 0.3, the given 
# dissimilarity. After that, we merge rows/columns 1 and 2 of the matrix into a 
# single cluster and, because we are using complete linkage, record the larger of 
# the two dissimilarities to each remaining observation. The new matrix (for the 
# clusters {1, 2}, {3}, {4}) looks like:
# 0.0 0.5  0.8
# 0.5 0.0  0.45
# 0.8 0.45 0.0
# So we then fuse observations 3 and 4 together at 0.45, the smallest remaining
# dissimilarity. We are left with the matrix
# 0.0 0.8
# 0.8 0.0
# so the final fusion, of clusters {1, 2} and {3, 4}, occurs at height 0.8.
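
# As a quick optional check, the fusion heights stored by hclust() should match the
# hand calculation above (0.30, 0.45, 0.80). (The name hc.complete is my own.)
hc.complete <- hclust(dis.mat, method = "complete")
hc.complete$merge   # which observations/clusters are joined at each step
hc.complete$height  # the heights at which the fusions occur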

# Exercise 2.b:
# Repeat (a), this time using single linkage clustering.

plot(hclust(dis.mat, method = "single"))

# We first fuse observations 1 and 2 at 0.3, the smallest dissimilarity. We then 
# update the matrix, this time keeping the smaller of each pair of dissimilarities 
# (single linkage). It looks like
# 0.0 0.4  0.7
# 0.4 0.0  0.45
# 0.7 0.45 0.0
# Then, we fuse observation 3 with the cluster containing observations 1 and 2 at 
# 0.4, the new smallest dissimilarity, and update the matrix again, keeping the 
# smaller values. 
# 0.0  0.45
# 0.45 0.0
# Then, we fuse observation 4 with the cluster containing observations 1, 2, and 3
# at 0.45, the smallest remaining dissimilarity.
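
# The same optional check for single linkage - the heights should be 0.30, 0.40,
# and 0.45. (The name hc.single is my own.)
hc.single <- hclust(dis.mat, method = "single")
hc.single$height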

# Exercise 2.c:
# Suppose that we cut the dendrogram obtained in (a) such that two clusters result.
# Which observations are in each cluster?

# Observations 1 and 2 are in a cluster, and observations 3 and 4 are in a cluster.

# Exercise 2.d: 
# Suppose that we cut the dendrogram obtained in (b) such that two clusters result.
# Which observations are in each cluster?

# Observations 1, 2, and 3 are in a cluster, and observation 4 is in a 
# single-observation cluster by itself.
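
# As an optional check on (c) and (d), cutree() reports the cluster memberships for
# a given number of clusters:
cutree(hclust(dis.mat, method = "complete"), k = 2)  # expect {1, 2} and {3, 4}
cutree(hclust(dis.mat, method = "single"), k = 2)    # expect {1, 2, 3} and {4}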

# Exercise 2.e: 
# It is mentioned in the chapter that at each fusion in the dendrogram, the position
# of the two clusters being fused can be swapped without changing the meaning of the
# dendrogram. Draw a dendrogram that is equivalent to the dendrogram in (a), for
# which two or more of the leaves are repositioned, but for which the meaning of the
# dendrogram is the same.

plot(hclust(dis.mat, method = "complete"), labels = c(2, 1, 4, 3))

# This dendrogram carries exactly the same information - observations 1 and 2 have 
# swapped positions, as have observations 3 and 4. Within a fusion it doesn't matter 
# which cluster is drawn on the left, as long as the fusion heights are unchanged.
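
# Another equivalent drawing (optional): rev() reverses the leaf order of a
# dendrogram, which swaps the two branches at every fusion without changing any of
# the fusion heights.
plot(rev(as.dendrogram(hclust(dis.mat, method = "complete"))))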

rm(list = ls()) # clear memory for new exercise

#################
# Exercise 3
# In this problem, you will perform K-means clustering manually, with K = 2, on a
# small example with n = 6 observations and p = 2 features. The observations are 
# as follows.

obs <- cbind(c(1, 1, 0, 5, 6, 4), c(4, 3, 4, 1, 2, 0))

# Exercise 3.a: 
# Plot the observations

plot(obs)

# Exercise 3.b: 
# Randomly assign a cluster label to each observation. You can use the sample()
# command in R to do this. Report the cluster labels for each observation.

set.seed(13)
clust.label <- sample(x = 2, size = nrow(obs), replace = TRUE)
clust.label
## [1] 2 1 1 1 2 1
plot(obs[clust.label == 1, 1], obs[clust.label == 1, 2], col = 4, pch = 20, cex = 3,
     xlim = c(0, 6))
points(obs[clust.label == 2, 1], obs[clust.label == 2, 2], 
       col = 3, pch = 20, cex = 3)

# Exercise 3.c:
# Compute the centroid for each cluster.

# With only two features, each centroid is simply an ordered pair: the mean of the 
# first feature is the x-coordinate and the mean of the second is the y-coordinate.
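
# (A compact way to get both coordinate means at once is colMeans(); below I compute
#  them feature by feature with mean(), which gives the same result. drop = FALSE
#  just guards against a cluster that happens to contain a single observation.)
colMeans(obs[clust.label == 1, , drop = FALSE])
colMeans(obs[clust.label == 2, , drop = FALSE])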

centroid1 <- c(mean(obs[clust.label == 1, 1]), mean(obs[clust.label == 1, 2]))
centroid2 <- c(mean(obs[clust.label == 2, 1]), mean(obs[clust.label == 2, 2]))

centroid1
## [1] 2.5 2.0
centroid2
## [1] 3.5 3.0
# So the centroid of cluster 1 is (2.5, 2.0) and the centroid of cluster 2 is
# (3.5, 3.0). 
points(centroid1[1], centroid1[2], col = 4, pch = 4)
points(centroid2[1], centroid2[2], col = 3, pch = 4)

# Exercise 3.d:
# Assign each observation to the centroid to which it is closest, in terms of 
# Euclidean distance. Report the cluster labels for each observation.

distance <- function (x, y){
  # Euclidean distance between two points (x[1], x[2]) and (y[1], y[2])
  return(sqrt((x[1] - y[1])^2 + (x[2] - y[2])^2))
}
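
# (Optional note: an equivalent version that works for any number of features sums
#  the squared coordinate differences directly. The name distance2 is my own.)
distance2 <- function(x, y) {
  sqrt(sum((x - y)^2))  # Euclidean distance between vectors of any length
}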

# For observation 1:
distance(obs[1,], centroid1)
## [1] 2.5
distance(obs[1,], centroid2)
## [1] 2.692582
# This observation is closer to centroid 1.

clust.label[1] <- 1

# For observation 2:
distance(obs[2,], centroid1)
## [1] 1.802776
distance(obs[2,], centroid2)
## [1] 2.5
# This observation is closer to centroid 1.

clust.label[2] <- 1

# For observation 3:
distance(obs[3,], centroid1)
## [1] 3.201562
distance(obs[3,], centroid2)
## [1] 3.640055
# This observation is closer to centroid 1.

clust.label[3] <- 1

# For observation 4:
distance(obs[4,], centroid1)
## [1] 2.692582
distance(obs[4,], centroid2)
## [1] 2.5
# This observation is closer to centroid 2.

clust.label[4] <- 2

# For observation 5:

distance(obs[5,], centroid1)
## [1] 3.5
distance(obs[5,], centroid2)
## [1] 2.692582
# This observation is closer to centroid 2.

clust.label[5] <- 2

# For observation 6:

distance(obs[6,], centroid1)
## [1] 2.5
distance(obs[6,], centroid2)
## [1] 3.041381
# This observation is closer to centroid 1.

clust.label[6] <- 1

# Exercise 3.e:
# Repeat (c) and (d) until the answers obtained stop changing.

plot(obs[clust.label == 1, 1], obs[clust.label == 1, 2], col = 4, pch = 20, cex = 3,
     xlim = c(0, 6))
points(obs[clust.label == 2, 1], obs[clust.label == 2, 2], 
       col = 3, pch = 20, cex = 3)

centroid1 <- c(mean(obs[clust.label == 1, 1]), mean(obs[clust.label == 1, 2]))
centroid2 <- c(mean(obs[clust.label == 2, 1]), mean(obs[clust.label == 2, 2]))

points(centroid1[1], centroid1[2], col = 4, pch = 4)
points(centroid2[1], centroid2[2], col = 3, pch = 4)

# For observation 1:
distance(obs[1,], centroid1)
## [1] 1.346291
distance(obs[1,], centroid2)
## [1] 5.147815
# This observation is closer to centroid 1.

clust.label[1] <- 1

# For observation 2:
distance(obs[2,], centroid1)
## [1] 0.559017
distance(obs[2,], centroid2)
## [1] 4.743416
# This observation is closer to centroid 1.

clust.label[2] <- 1

# For observation 3:
distance(obs[3,], centroid1)
## [1] 1.952562
distance(obs[3,], centroid2)
## [1] 6.041523
# This observation is closer to centroid 1.

clust.label[3] <- 1

# For observation 4:
distance(obs[4,], centroid1)
## [1] 3.913119
distance(obs[4,], centroid2)
## [1] 0.7071068
# This observation is closer to centroid 2.

clust.label[4] <- 2

# For observation 5:

distance(obs[5,], centroid1)
## [1] 4.562072
distance(obs[5,], centroid2)
## [1] 0.7071068
# This observation is closer to centroid 2.

clust.label[5] <- 2

# For observation 6:

distance(obs[6,], centroid1)
## [1] 3.716517
distance(obs[6,], centroid2)
## [1] 2.12132
# This observation is closer to centroid 2 - different from the first time!

clust.label[6] <- 2

# Plot new clusters and compute and plot new centroids:

plot(obs[clust.label == 1, 1], obs[clust.label == 1, 2], col = 4, pch = 20, cex = 3,
     xlim = c(0, 6), ylim = c(0, 4))
points(obs[clust.label == 2, 1], obs[clust.label == 2, 2], 
       col = 3, pch = 20, cex = 3)

centroid1 <- c(mean(obs[clust.label == 1, 1]), mean(obs[clust.label == 1, 2]))
centroid2 <- c(mean(obs[clust.label == 2, 1]), mean(obs[clust.label == 2, 2]))

points(centroid1[1], centroid1[2], col = 4, pch = 4)
points(centroid2[1], centroid2[2], col = 3, pch = 4)

# The clusters now look much more sensible than the initial random assignment. 
# Repeat the centroid and assignment steps one more time to verify that nothing 
# changes - if nothing does, the algorithm has converged and we're done!

# For observation 1:
distance(obs[1,], centroid1)
## [1] 0.4714045
distance(obs[1,], centroid2)
## [1] 5
# This observation is closer to centroid 1.

clust.label[1] <- 1

# For observation 2:
distance(obs[2,], centroid1)
## [1] 0.745356
distance(obs[2,], centroid2)
## [1] 4.472136
# This observation is closer to centroid 1.

clust.label[2] <- 1

# For observation 3:
distance(obs[3,], centroid1)
## [1] 0.745356
distance(obs[3,], centroid2)
## [1] 5.830952
# This observation is closer to centroid 1.

clust.label[3] <- 1

# For observation 4:
distance(obs[4,], centroid1)
## [1] 5.088113
distance(obs[4,], centroid2)
## [1] 0
# This observation is closer to centroid 2.

clust.label[4] <- 2

# For observation 5:

distance(obs[5,], centroid1)
## [1] 5.587685
distance(obs[5,], centroid2)
## [1] 1.414214
# This observation is closer to centroid 2.

clust.label[5] <- 2

# For observation 6:

distance(obs[6,], centroid1)
## [1] 4.955356
distance(obs[6,], centroid2)
## [1] 1.414214
# This observation is again closer to centroid 2 - the same as in the previous
# iteration, so its label does not change.

clust.label[6] <- 2

# Plot new clusters and compute and plot new centroids:

plot(obs[clust.label == 1, 1], obs[clust.label == 1, 2], col = 4, pch = 20, cex = 3,
     xlim = c(0, 6), ylim = c(0, 4))
points(obs[clust.label == 2, 1], obs[clust.label == 2, 2], 
       col = 3, pch = 20, cex = 3)

centroid1 <- c(mean(obs[clust.label == 1, 1]), mean(obs[clust.label == 1, 2]))
centroid2 <- c(mean(obs[clust.label == 2, 1]), mean(obs[clust.label == 2, 2]))

points(centroid1[1], centroid1[2], col = 4, pch = 4)
points(centroid2[1], centroid2[2], col = 3, pch = 4)

# Nothing changed - the cluster labels and the plot are the same as in the previous 
# iteration, so the algorithm has converged. We're done!
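
# As an optional sanity check (not part of the manual exercise), R's built-in
# kmeans() should settle on the same two clusters, {1, 2, 3} and {4, 5, 6} (the
# numeric labels themselves are arbitrary). The name km.check is my own.
km.check <- kmeans(obs, centers = 2, nstart = 20)
km.check$cluster  # cluster assignment for each observation
km.check$centers  # final centroids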

# Exercise 3.f: 
# In your plot from (a), color the observations according to the cluster labels
# obtained.

plot(obs[clust.label == 1, 1], obs[clust.label == 1, 2], col = 4, pch = 20, cex = 3,
     xlim = c(0, 6), ylim = c(0, 4))
points(obs[clust.label == 2, 1], obs[clust.label == 2, 2], 
       col = 3, pch = 20, cex = 3)

rm(list = ls()) # clear memory for new exercise

##################
# Exercise 7:
# In the chapter, we mentioned the use of correlation-based distance and Euclidean
# distance as dissimilarity measures for hierarchical clustering. It turns out that
# these two measures are almost equivalent: if each observation has been centered
# to have mean zero and standard deviation one, and if we let rij denote the
# correlation between the ith and jth observations, then the quantity 1 - rij
# is proportional to the squared Euclidean distance between the ith and jth 
# observations. On the USArrests data, show that this proportionality holds.

# Hint: The Euclidean distance can be calculated using the dist() function, and 
# correlations can be calculated using the cor() function.

library(ISLR)
## Warning: package 'ISLR' was built under R version 3.4.4
set.seed(1)
us.scaled <- scale(USArrests)

distance <- dist(us.scaled)
dist.sq <- distance^2
r.ij <- cor(t(us.scaled))       # correlations between observations (rows)
minus.rij <- as.dist(1 - r.ij)  # r.ij is symmetric, so no transpose is needed

last <- minus.rij / dist.sq
summary(last)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.000086 0.069135 0.133943 0.234193 0.262589 4.887686
plot(last)

# The plot does suggest a rough proportionality - most of the ratios fall in a 
# fairly narrow band (the middle half lies between about 0.07 and 0.26), although 
# there is noticeable spread. Some spread is expected: scale() standardizes the 
# columns (variables), whereas the exact proportionality in the exercise statement 
# assumes that each observation (row) has been centered and scaled, so the 
# relationship only holds approximately here.
sum(last > 0.06 & last < 0.3)
## [1] 703
sum(last < 1)
## [1] 1189
sum(last > 0)
## [1] 1225
703/1225
## [1] 0.5738776
1189/1225
## [1] 0.9706122
# So approximately 57% of the ratios fall between 0.06 and 0.3, and nearly all of 
# them (97%) are less than 1. This is consistent with 1 - r_ij being roughly 
# proportional to the squared Euclidean distance rather than unrelated to it.
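
# Optional cross-check (my own addition, not required by the exercise): the exact
# proportionality in the problem statement assumes that each *observation* has been
# centered and scaled. Standardizing the rows instead of the columns should make the
# ratio exactly constant: with p = 4 features, ||x_i - x_j||^2 = 2(p - 1)(1 - r_ij)
# for row-standardized data, so (1 - r_ij) / d_ij^2 should equal 1/6 for every pair.
row.std <- t(scale(t(USArrests)))  # scale() works column-wise, so transpose twice
ratio.std <- as.dist(1 - cor(t(row.std))) / dist(row.std)^2
summary(ratio.std)  # every value should be (essentially) 1/6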

# Let's take a slightly closer look using just the first five rows.

new.us <- USArrests[1:5,]
rij2 <- cor(t(new.us))
minus.rij2 <- as.dist(1 - rij2)
dist.rij2 <- dist(new.us)
dist.rij2.sqrd <- dist.rij2^2

minus.rij2/dist.rij2.sqrd
##                 Alabama       Alaska      Arizona     Arkansas
## Alaska     6.565935e-06                                       
## Arizona    3.602367e-07 4.746410e-06                          
## Arkansas   1.729206e-07 1.567899e-06 2.828392e-08             
## California 2.006828e-06 7.941079e-06 3.149473e-06 3.690105e-07
# All of these ratios are tiny: on the raw data the row correlations are all very 
# close to 1 (every state's profile is dominated by its large Assault value) and 
# the squared distances are large, so 1 - r_ij divided by the squared distance ends 
# up very close to zero.

summary(minus.rij2/dist.rij2.sqrd)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 2.828e-08 3.624e-07 1.787e-06 2.691e-06 4.347e-06 7.941e-06
# The ratios are all of the same very small order of magnitude, which is again 
# consistent with a rough proportionality on the raw data; the relationship is only 
# exact once the observations themselves have been centered and scaled.

rm(list = ls()) # Clear for new problem


##########################
# Exercise 9
# Consider the USArrests data. We will now perform hierarchical clustering on the
# states.

# Exercise 9.a:
# Using hierarchical clustering with complete linkage and Euclidean distance, 
# cluster the states.

library(ISLR)
data(USArrests)
set.seed(1)

us.clust <- hclust(dist(USArrests), method = "complete")
plot(us.clust)

# For the most part, it looks like similar states end up in the same clusters - 
# especially within the larger clusters. Illinois and New York are together, which 
# makes sense given that each contains a very large city with a somewhat high crime 
# rate. However, some states are clustered together somewhat unexpectedly - North 
# Dakota and Vermont, for example, are both fairly rural northern states, but 
# otherwise don't tend to have much in common.

# Exercise 9.b:
# Cut the dendrogram at a height that results in three distinct clusters. Which 
# states belong to which clusters?

abline(h = 150)

us.cut <- cutree(us.clust, 3)
us.cut
##        Alabama         Alaska        Arizona       Arkansas     California 
##              1              1              1              2              1 
##       Colorado    Connecticut       Delaware        Florida        Georgia 
##              2              3              1              1              2 
##         Hawaii          Idaho       Illinois        Indiana           Iowa 
##              3              3              1              3              3 
##         Kansas       Kentucky      Louisiana          Maine       Maryland 
##              3              3              1              3              1 
##  Massachusetts       Michigan      Minnesota    Mississippi       Missouri 
##              2              1              3              1              2 
##        Montana       Nebraska         Nevada  New Hampshire     New Jersey 
##              3              3              1              3              2 
##     New Mexico       New York North Carolina   North Dakota           Ohio 
##              1              1              1              3              3 
##       Oklahoma         Oregon   Pennsylvania   Rhode Island South Carolina 
##              2              2              3              2              1 
##   South Dakota      Tennessee          Texas           Utah        Vermont 
##              3              2              2              3              3 
##       Virginia     Washington  West Virginia      Wisconsin        Wyoming 
##              2              2              3              3              2
# Cluster 1 is kind of a hodgepodge - it has some southern states, some western 
# states, California, Nevada, and a few mid-Atlantic states such as New York, 
# Maryland, and Delaware. These may be states that contain a larger city and are 
# perhaps grouped because of a similar "urban population" value. Cluster 2 contains 
# more rural states - some southern ones, as well as some western states that are a 
# little more rural. However, cluster 2 also contains Massachusetts and New Jersey - 
# states that don't fit well with the rest of that cluster. Cluster 3 contains the 
# more rural states (Montana, the Dakotas, and midwestern states like Indiana and 
# Iowa). I was surprised that the clusters don't line up more consistently with 
# geography - the grouping seems to have more to do with urban population than with 
# region, although each cluster contains outliers that surprised me, which suggests 
# a closer analysis would be needed to pin down the intra-cluster commonalities.
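
# A compact way to list the states in each cluster (optional):
split(rownames(USArrests), us.cut)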

# Exercise 9.c: 
# Hierarchically cluster the states using complete linkage and Euclidean distance,
# after scaling the variables to have standard deviation one.

us.arrest <- scale(USArrests)

us.clust.scld <- hclust(dist(us.arrest), method = "complete")
plot(us.clust.scld)

# Now we're starting to see some geographical correlation! We'll look more closely 
# at these clusters and compare the three-cluster cut in the next part.

# Exercise 9.d:
# What effect does scaling the variables have on the hierarchical clustering 
# obtained? In your opinion, should the variables be scaled before the 
# inter-observation dissimilarities are computed? Provide a justification for your
# answer.

# It's hard to tell exactly where the three-cluster cutoff falls on this dendrogram,
# so we draw an approximate line and let the cutree command do the work for us.
abline(h = 4.4)

us.cut.scld <- cutree(us.clust.scld, 3)
us.cut.scld
##        Alabama         Alaska        Arizona       Arkansas     California 
##              1              1              2              3              2 
##       Colorado    Connecticut       Delaware        Florida        Georgia 
##              2              3              3              2              1 
##         Hawaii          Idaho       Illinois        Indiana           Iowa 
##              3              3              2              3              3 
##         Kansas       Kentucky      Louisiana          Maine       Maryland 
##              3              3              1              3              2 
##  Massachusetts       Michigan      Minnesota    Mississippi       Missouri 
##              3              2              3              1              3 
##        Montana       Nebraska         Nevada  New Hampshire     New Jersey 
##              3              3              2              3              3 
##     New Mexico       New York North Carolina   North Dakota           Ohio 
##              2              2              1              3              3 
##       Oklahoma         Oregon   Pennsylvania   Rhode Island South Carolina 
##              3              3              3              3              1 
##   South Dakota      Tennessee          Texas           Utah        Vermont 
##              3              1              2              3              3 
##       Virginia     Washington  West Virginia      Wisconsin        Wyoming 
##              3              3              3              3              3
# Here, it looks like we're getting somewhat more geographical coherence. Alabama,
# Georgia, and Mississippi are all together, as are Connecticut, Delaware, and 
# Rhode Island. Although none of the clusters captures an entire geographical 
# region of the United States, the states grouped together now share more similar 
# characteristics, and the more rural and the more heavily urbanized states are 
# still kept apart. Texas is now with New York, Illinois, and California, which 
# makes sense given the large cities (and the relatively high crime rates within 
# them) in those states.

table(us.cut, us.cut.scld)
##       us.cut.scld
## us.cut  1  2  3
##      1  6  9  1
##      2  2  2 10
##      3  0  0 20
28/50 
## [1] 0.56
# Reading off the diagonal of the table, 28 of the 50 states (56%) land in the 
# same-numbered cluster under both the unscaled and the scaled clustering.
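
# (The same figure computed directly from the table, taking the cluster numbers at
#  face value:)
sum(diag(table(us.cut, us.cut.scld))) / length(us.cut)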

# In my opinion, the variables should be scaled before the inter-observation 
# dissimilarities are computed. Without scaling, the variable with by far the 
# largest variance - Assault, whose values run into the hundreds - dominates the 
# Euclidean distances, so the other three variables contribute relatively little to 
# the clustering. While there is a general relationship between crime and the 
# percentage of a state's population living in urban areas, there are also regional 
# trends in crime rates that the unscaled dendrogram largely misses. That said, in 
# an unsupervised learning setting it is worth looking at both versions - there may 
# be more to the story in that first dendrogram - but if I had to pick only one, 
# I'd use the scaled dendrogram. Additionally, the help page for the data set shows 
# that the urban population is given as a percentage while the murder, assault, and 
# rape figures are arrests per 100,000 residents - different units - so the data 
# should be scaled to put everything on a comparable footing.
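
# A quick supporting check (optional): the per-variable standard deviations show how
# different the scales are - Assault's is several times larger than any of the
# others', so it dominates the unscaled Euclidean distances.
sapply(USArrests, sd)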