Problem Set 8

Introduction:

Migratory Geese patterns

dat1 <- read.csv("barnaclegeese.csv") ##greenland
dat2 <- read.csv("geese-svalbard.csv")
dat1$Year <- "1"
dat2 <- dat2[,-6] ##remove comments
#dat <- dat2
#dat <- dat1
dat <- rbind(dat1,dat2)

dat$id <- paste(dat$tag.local.identifier,dat$Year,sep="-")


#library(RgoogleMaps)
#library(ggmap)
#library(gplots)
#library(leaflet)
#query google map
library(maps)
#al1 <- GetMap(center=c(66.56,0),
#              zoom=3, destfile = "geese.png", 
#       format="png",maptype="satellite")

#gm <- get_map(location=c(66.56,0),zoom=3,maptype="satellite")
levs <- levels(as.factor(dat$id))
dat$Year <- as.factor(dat$Year)
addit<- F
map('world',xlim=c(-30,25),ylim=c(45,90))
for(lev in levs[1:33])
    {

        tmp <- dat[dat$id == lev,]
        col <- as.factor(tmp$Year)[1]
                                        #        plot(tmp$utm.easting,tmp$utm.northing,type="o")
        #path1 <- SimplifyPath(cbind(tmp$location.long,tmp$location.lat),tol=.25)

        long <- tmp$location.long
#        long[long<0] <- long[long<0] 
        path1 <- cbind(long,tmp$location.lat)
        
    #    tmp1 <- PlotOnStaticMap(al1,lat=path1[,2],lon=path1[,1],col="gold",lwd=2,
    #                          FUN=lines,add=addit)
      points(path1[,1],path1[,2],col="gold",type="l",cex=.5,pch=16)

#        plot(1:nrow(path1),path1[,1])
        addit <- T
}

distances <-read.csv("birdsim.csv")
colnames(distances) <- c("X",levs)
dd <-as.dist(distances[,-1])

Problem 1:

Overall Multidimensional Scaling:

Create MDS solutions using classic, sammon, and nonmetric scaling (cmdscale, sammon, and isoMDS). For each method, examine 1 through 4 dimensional solutions. Plot the results visually for the 1- and 2- 2 dimensional solution for each scaling method.

For each method, determine the most appropriate number of dimensions. Discuss what the MDS shows you about the different geese. Based on the results, try to identify which geese went to Greenland and which went to Svalbard. Do you see any geese that appear to be outliers? Hypothesize what this arose from, perhaps by examining the corresponding row(s) of the distance matrix.

Solutions:

For all the methods, the appropriate number of dimensions are 2. The Geese to the right side of the plot went to Greenland and the left of the plot went to Svalbard. There is an outlier which has not gone to the above 2 places which is no.10.

Code:

library(MASS)

model1 <- cmdscale(dd, eig = TRUE)
model1

## $points
##                   [,1]        [,2]
## 170563-2007 -141.58814  30.8738516
## 170563-2008 -132.26867  45.8488929
## 178199-2008 -147.50675 -16.0359520
## 186827-2009 -167.01941 -47.3055563
## 33102-2011  -115.57025 -25.7672062
## 33103-2011  -107.54905   6.9530106
## 33104-2011  -173.58808 -24.1600404
## 33145-2011  -125.14523  -7.8242065
## 33953-2010  -107.38084  16.5903030
## 33953-2011   162.34413 -55.7592622
## 33954-2010  -140.43879  10.3914219
## 64685-2006  -100.09280  -3.4644598
## 64687-2006  -131.73351  -9.8293602
## 64687-2007  -190.28721 -43.4234767
## 65698-1      406.16304  21.0254108
## 70563-1      489.59701 -79.6713063
## 70564-2007  -106.53922   1.5009855
## 70565-2007  -136.67468   4.5391751
## 70566-2007  -127.31670  30.1947274
## 70567-2007  -148.42171  36.9374732
## 70568-2007   -70.00743  35.2463937
## 70618-2007  -114.65569  -4.9626061
## 70619-2007  -169.76697  -4.5508813
## 78198-2008  -178.98163  -0.9164807
## 78199-1      400.43455  26.7821363
## 78207-1      509.51683   8.2706114
## 78208-1      430.22677  28.0078609
## 78209-1      514.82067  21.8982584
## 78210-1      457.57386  -2.7748828
## 78378-2008  -142.46477 -10.2105713
## 78378-2009  -116.79437  -6.5116018
## 86824-2009  -160.38456   8.8593596
## 86828-2009  -118.50040   9.2479783
## 
## $eig
##  [1]  1.984116e+06  2.513909e+04  2.437996e+04  9.509465e+03  6.379278e+03
##  [6]  4.826514e+03  3.825354e+03  2.401280e+03  1.881346e+03  1.119390e+03
## [11]  8.697979e+02  7.051432e+02  3.594075e+02  3.216826e+02  2.252216e+02
## [16]  1.924003e+02  1.379600e+02  6.440121e+01  1.967767e+01 -1.096652e-10
## [21] -2.771413e+01 -6.887677e+01 -9.260534e+01 -1.828054e+02 -2.764200e+02
## [26] -1.271938e+03 -1.772690e+03 -3.150443e+03 -3.690735e+03 -7.290932e+03
## [31] -1.289046e+04 -3.387611e+04 -7.940520e+04
## 
## $x
## NULL
## 
## $ac
## [1] 0
## 
## $GOF
## [1] 0.9089718 0.9723112

plot(model1$points, main="CMD Scale", xlab="Values", ylab="Eigen Values")
text(model1$points, rownames(model1), cex = 0.8)
abline(v=c(0,300),col="blue")

model2<- sammon(dd, tol = 0.001)

## Initial stress        : 0.09859
## stress after  10 iters: 0.03069, magic = 0.500
## stress after  20 iters: 0.02982, magic = 0.500

model2

## $points
##                   [,1]         [,2]
## 170563-2007 -132.39539  26.29468405
## 170563-2008 -131.63439   3.45407985
## 178199-2008 -138.58815  -0.04018823
## 186827-2009 -122.42215  -9.61600435
## 33102-2011  -103.77162 -15.75184485
## 33103-2011  -104.43803  12.88957743
## 33104-2011  -150.72528  -5.96562172
## 33145-2011  -125.98295  12.42253558
## 33953-2010  -115.63100  16.27053670
## 33953-2011   101.69216  -4.27818817
## 33954-2010  -129.33907 -27.13370522
## 64685-2006   -99.38494 -11.14416497
## 64687-2006  -116.17249  -4.16784242
## 64687-2007  -158.28416 -32.92540432
## 65698-1      394.08081  -6.04632759
## 70563-1      446.42722 -41.84753619
## 70564-2007  -111.34045   3.97928053
## 70565-2007  -133.28623  -7.99737776
## 70566-2007  -103.62929  26.15546980
## 70567-2007  -131.81971  14.55301125
## 70568-2007   -98.28341   5.22310818
## 70618-2007  -125.42798   2.92266723
## 70619-2007  -157.17742   4.63352267
## 78198-2008  -159.65227  30.72870257
## 78199-1      401.14300 -20.07135328
## 78207-1      477.26129   0.93879132
## 78208-1      413.03134   9.16347100
## 78209-1      487.39169  35.98328387
## 78210-1      432.79756 -13.07956176
## 78378-2008  -129.43502  -8.02850533
## 78378-2009  -112.17195 -12.83648272
## 86824-2009  -143.63208  14.51871194
## 86828-2009  -119.19965   0.79867493
## 
## $stress
## [1] 0.02981568
## 
## $call
## sammon(d = dd, tol = 0.001)

plot(model2$points, main="Sammon Scale", xlab="Dimension 1", ylab="Dimension 2")
text(model2$points, rownames(model2), cex = 0.8)
abline(v=c(0,300),col="blue")

model3 <- isoMDS(dd, tol = 0.001)

## initial  value 5.381891 
## final  value 5.380866 
## converged

model3

## $points
##                   [,1]        [,2]
## 170563-2007 -141.58964  30.8737550
## 170563-2008 -132.26980  45.8413658
## 178199-2008 -147.50655 -16.0357698
## 186827-2009 -167.00787 -47.2938414
## 33102-2011  -115.56988 -25.7687132
## 33103-2011  -107.54925   6.9542786
## 33104-2011  -173.58159 -24.1558926
## 33145-2011  -125.14722  -7.8254003
## 33953-2010  -107.38213  16.5911174
## 33953-2011   162.34687 -55.7590834
## 33954-2010  -140.44115  10.3926576
## 64685-2006  -100.09139  -3.4666189
## 64687-2006  -131.73410  -9.8303755
## 64687-2007  -190.27995 -43.4186510
## 65698-1      406.16557  21.0227481
## 70563-1      489.59201 -79.6689537
## 70564-2007  -106.54131   1.5004302
## 70565-2007  -136.67686   4.5391390
## 70566-2007  -127.31576  30.1966412
## 70567-2007  -148.42224  36.9344357
## 70568-2007   -70.02361  35.2364265
## 70618-2007  -114.65806  -4.9633825
## 70619-2007  -169.76560  -4.5503482
## 78198-2008  -178.98448  -0.9142418
## 78199-1      400.43857  26.7804040
## 78207-1      509.51528   8.2698407
## 78208-1      430.22982  28.0070220
## 78209-1      514.82257  21.9025436
## 78210-1      457.57501  -2.7757332
## 78378-2008  -142.46498 -10.2108619
## 78378-2009  -116.79434  -6.5139645
## 86824-2009  -160.38433   8.8605541
## 86828-2009  -118.50362   9.2484723
## 
## $stress
## [1] 5.380866

plot(model3$points, main="Iso MDS Scale", xlab="Dimension 1", ylab="Dimension 2")
text(model3$points, rownames(model3), cex = 0.8)
abline(v=c(0,300),col="blue")

Problem 2:

Hierachical Clustering:

Create and display divisive and agglomerative clustering solutions. For each, try at least two link methods before settling on the solution you feel best descibes the data. Discuss whether the hierarchical solutions would indicate two groups, or if they might involve additional groups within these two main groups.

Solution:

The 2 link methods tried are Single and Complete. All the methods show a similar trend in clustering and hence it can be assumed that the clusters are same. The hierachial clusters involve additional groups with the two main groups.

Code:

library(MASS)
library(ggdendro)

## Warning: package 'ggdendro' was built under R version 3.6.2

library(cluster)

## 
## Attaching package: 'cluster'

## The following object is masked from 'package:maps':
## 
##     votes.repub

library(gridExtra)
library(ggplot2)
library(fpc)

## Warning: package 'fpc' was built under R version 3.6.2

AC1<-hclust(dd, method = "single")
plot(AC1) 
rect.hclust(AC1, k = 3)

AC2<-hclust(dd, method = "complete")
plot(AC2) 
rect.hclust(AC2, k = 3)

DC1 <- diana(dd)
plot(DC1)

rect.hclust(DC1, k = 3)

Problem 3:

K-means and Fanny Clustering

Create a set of k-means clustering solutions for different values of k. To do this, you cannot use the similarity space directly. Instead, use an MDS solution with at least 5 dimensions as the features, and then use k-means on this dimensional solution. Compare goodness-of-fit statistics to determine which you think is the best number of clusters to use. Compare that solution to your conclusions for part 2.1. Then, do a fanny clustering, and determine the groups. Does this method produce a substantially different solution? Are there specific geese that are not easily clasified into just one group?

Solution:

Comparing the goodness of fit, we see that Cluster with 5 centers has the best similarity percent. From 2.1, we see that the agglomerative clustering with 3 centers is best. The Fanny clustering has a different solution as the clusters are divided differently. All the Geese are classified into groups.

Code:

newdd<-dist(distances[,-1])
model4 <- cmdscale(newdd)
k1 <- kmeans(model4, centers = 3)
sort(k1$cluster)

##  [1] 1 1 1 1 1 1 1 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3

k1$betweenss/k1$totss

## [1] 0.9895183

model5 <- cmdscale(newdd)
k2 <- kmeans(model5, centers = 5)
sort(k2$cluster)

##  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 4 4 4 5

k2$betweenss/k2$totss

## [1] 0.9974476

model6 <- cmdscale(newdd)
k3 <- kmeans(model6, centers = 7)
sort(k3$cluster)

##  [1] 1 1 1 1 1 2 2 2 2 3 4 4 4 4 4 4 4 5 5 5 5 5 5 6 6 6 6 6 6 6 6 6 7

k3$betweenss/k3$totss

## [1] 0.9927061

library(factoextra)

## Warning: package 'factoextra' was built under R version 3.6.2

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

fviz_cluster(k1, data = newdd)

fviz_cluster(k2, data = newdd)

fviz_cluster(k3, data = newdd)

f1 <- fanny(model4, k = 3)
matplot(f1$membership, type = "b", xaxt = "n")

plot(f1)

f2 <- fanny(model5, k = 5)
matplot(f2$membership, type = "b", xaxt = "n")

plot(f2)

f3 <- fanny(model6, k = 7)
matplot(f3$membership, type = "b", xaxt = "n")

plot(f3)

Problem 4:

Plotting back into space

Regardless of which solution you thought was best in part 3, look at the For the 2-cluster solution in part 3 k-means. Re-plot the map trajectories, using the cluster to determine the color. Do this again for 3- and 4-cleuster solutions. For each solution, describe what appears to be the important aspects of the migration path that define or divide the different groups.

Solution:

The most important aspect of migration path is the ending point of the latitude.

Code:

library(ggmap)
k4 <- kmeans(model4, centers = 2)
k5 <- kmeans(model4, centers = 3)
k6 <- kmeans(model4, centers = 4)
k4

## K-means clustering with 2 clusters of sizes 7, 26
## 
## Cluster means:
##        [,1]       [,2]
## 1 2453.4160 -30.804892
## 2 -660.5351   8.293625
## 
## Clustering vector:
##  [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 2 2 2 2 2 2 2 2 1 1 1 1 1 2 2 2 2
## 
## Within cluster sum of squares by cluster:
## [1]  385411.8 2068380.8
##  (between_SS / total_SS =  95.6 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

#######################################################################################
library(maps)
#al1 <- GetMap(center=c(66.56,0),
#              zoom=3, destfile = "geese.png", 
#       format="png",maptype="satellite")

#gm <- get_map(location=c(66.56,0),zoom=3,maptype="satellite")
levs <- levels(as.factor(dat$id))
dat$Year <- as.factor(dat$Year)
addit<- F
map('world',xlim=c(-30,25),ylim=c(45,90))
for(lev in levs[1:33])
    {

        tmp <- dat[dat$id == lev,]
        col <- as.factor(tmp$Year)[1]
                                        #        plot(tmp$utm.easting,tmp$utm.northing,type="o")
        #path1 <- SimplifyPath(cbind(tmp$location.long,tmp$location.lat),tol=.25)

        long <- tmp$location.long
#        long[long<0] <- long[long<0] 
        path1 <- cbind(long,tmp$location.lat)
        
    #    tmp1 <- PlotOnStaticMap(al1,lat=path1[,2],lon=path1[,1],col="gold",lwd=2,
    #                          FUN=lines,add=addit)
        
      points(path1[,1],path1[,2],col=k4$cluster,type="l",cex=.5,pch=16)

#        plot(1:nrow(path1),path1[,1])
        addit <- T
}

###########

library(maps)
#al1 <- GetMap(center=c(66.56,0),
#              zoom=3, destfile = "geese.png", 
#       format="png",maptype="satellite")

#gm <- get_map(location=c(66.56,0),zoom=3,maptype="satellite")
levs <- levels(as.factor(dat$id))
dat$Year <- as.factor(dat$Year)
addit<- F
map('world',xlim=c(-30,25),ylim=c(45,90))
for(lev in levs[1:33])
    {

        tmp <- dat[dat$id == lev,]
        col <- as.factor(tmp$Year)[1]
                                        #        plot(tmp$utm.easting,tmp$utm.northing,type="o")
        #path1 <- SimplifyPath(cbind(tmp$location.long,tmp$location.lat),tol=.25)

        long <- tmp$location.long
#        long[long<0] <- long[long<0] 
        path1 <- cbind(long,tmp$location.lat)
        
    #    tmp1 <- PlotOnStaticMap(al1,lat=path1[,2],lon=path1[,1],col="gold",lwd=2,
    #                          FUN=lines,add=addit)
        
      points(path1[,1],path1[,2],col=c("blue","green","red","black","yellow")[k5$cluster[lev]],type="l",cex=.5,pch=16)

#        plot(1:nrow(path1),path1[,1])
        addit <- T
}

###########

library(maps)
#al1 <- GetMap(center=c(66.56,0),
#              zoom=3, destfile = "geese.png", 
#       format="png",maptype="satellite")

#gm <- get_map(location=c(66.56,0),zoom=3,maptype="satellite")
levs <- levels(as.factor(dat$id))
dat$Year <- as.factor(dat$Year)
addit<- F
map('world',xlim=c(-30,25),ylim=c(45,90))
for(lev in levs[1:33])
    {

        tmp <- dat[dat$id == lev,]
        col <- as.factor(tmp$Year)[1]
                                        #        plot(tmp$utm.easting,tmp$utm.northing,type="o")
        #path1 <- SimplifyPath(cbind(tmp$location.long,tmp$location.lat),tol=.25)

        long <- tmp$location.long
#        long[long<0] <- long[long<0] 
        path1 <- cbind(long,tmp$location.lat)
        
    #    tmp1 <- PlotOnStaticMap(al1,lat=path1[,2],lon=path1[,1],col="gold",lwd=2,
    #                          FUN=lines,add=addit)
        
      points(path1[,1],path1[,2],col=path1[,2],type="l",cex=.5,pch=16)

#        plot(1:nrow(path1),path1[,1])
        addit <- T
}

Problem Set 8

Sneha Nimmagadda

March 31, 2019