Introduction:
Migratory Geese patterns
dat1 <- read.csv("barnaclegeese.csv") ##greenland
dat2 <- read.csv("geese-svalbard.csv")
dat1$Year <- "1"
dat2 <- dat2[,-6] ##remove comments
#dat <- dat2
#dat <- dat1
dat <- rbind(dat1,dat2)
dat$id <- paste(dat$tag.local.identifier,dat$Year,sep="-")
#library(RgoogleMaps)
#library(ggmap)
#library(gplots)
#library(leaflet)
#query google map
library(maps)
#al1 <- GetMap(center=c(66.56,0),
# zoom=3, destfile = "geese.png",
# format="png",maptype="satellite")
#gm <- get_map(location=c(66.56,0),zoom=3,maptype="satellite")
levs <- levels(as.factor(dat$id))
dat$Year <- as.factor(dat$Year)
addit<- F
map('world',xlim=c(-30,25),ylim=c(45,90))
for(lev in levs[1:33])
{
tmp <- dat[dat$id == lev,]
col <- as.factor(tmp$Year)[1]
# plot(tmp$utm.easting,tmp$utm.northing,type="o")
#path1 <- SimplifyPath(cbind(tmp$location.long,tmp$location.lat),tol=.25)
long <- tmp$location.long
# long[long<0] <- long[long<0]
path1 <- cbind(long,tmp$location.lat)
# tmp1 <- PlotOnStaticMap(al1,lat=path1[,2],lon=path1[,1],col="gold",lwd=2,
# FUN=lines,add=addit)
points(path1[,1],path1[,2],col="gold",type="l",cex=.5,pch=16)
# plot(1:nrow(path1),path1[,1])
addit <- T
}
distances <-read.csv("birdsim.csv")
colnames(distances) <- c("X",levs)
dd <-as.dist(distances[,-1])
Problem 1:
Overall Multidimensional Scaling:
Create MDS solutions using classic, sammon, and nonmetric scaling (cmdscale, sammon, and isoMDS). For each method, examine 1 through 4 dimensional solutions. Plot the results visually for the 1- and 2- 2 dimensional solution for each scaling method.
For each method, determine the most appropriate number of dimensions. Discuss what the MDS shows you about the different geese. Based on the results, try to identify which geese went to Greenland and which went to Svalbard. Do you see any geese that appear to be outliers? Hypothesize what this arose from, perhaps by examining the corresponding row(s) of the distance matrix.
Solutions:
For all the methods, the appropriate number of dimensions are 2. The Geese to the right side of the plot went to Greenland and the left of the plot went to Svalbard. There is an outlier which has not gone to the above 2 places which is no.10.
Code:
library(MASS)
model1 <- cmdscale(dd, eig = TRUE)
model1
## $points
## [,1] [,2]
## 170563-2007 -141.58814 30.8738516
## 170563-2008 -132.26867 45.8488929
## 178199-2008 -147.50675 -16.0359520
## 186827-2009 -167.01941 -47.3055563
## 33102-2011 -115.57025 -25.7672062
## 33103-2011 -107.54905 6.9530106
## 33104-2011 -173.58808 -24.1600404
## 33145-2011 -125.14523 -7.8242065
## 33953-2010 -107.38084 16.5903030
## 33953-2011 162.34413 -55.7592622
## 33954-2010 -140.43879 10.3914219
## 64685-2006 -100.09280 -3.4644598
## 64687-2006 -131.73351 -9.8293602
## 64687-2007 -190.28721 -43.4234767
## 65698-1 406.16304 21.0254108
## 70563-1 489.59701 -79.6713063
## 70564-2007 -106.53922 1.5009855
## 70565-2007 -136.67468 4.5391751
## 70566-2007 -127.31670 30.1947274
## 70567-2007 -148.42171 36.9374732
## 70568-2007 -70.00743 35.2463937
## 70618-2007 -114.65569 -4.9626061
## 70619-2007 -169.76697 -4.5508813
## 78198-2008 -178.98163 -0.9164807
## 78199-1 400.43455 26.7821363
## 78207-1 509.51683 8.2706114
## 78208-1 430.22677 28.0078609
## 78209-1 514.82067 21.8982584
## 78210-1 457.57386 -2.7748828
## 78378-2008 -142.46477 -10.2105713
## 78378-2009 -116.79437 -6.5116018
## 86824-2009 -160.38456 8.8593596
## 86828-2009 -118.50040 9.2479783
##
## $eig
## [1] 1.984116e+06 2.513909e+04 2.437996e+04 9.509465e+03 6.379278e+03
## [6] 4.826514e+03 3.825354e+03 2.401280e+03 1.881346e+03 1.119390e+03
## [11] 8.697979e+02 7.051432e+02 3.594075e+02 3.216826e+02 2.252216e+02
## [16] 1.924003e+02 1.379600e+02 6.440121e+01 1.967767e+01 -1.096652e-10
## [21] -2.771413e+01 -6.887677e+01 -9.260534e+01 -1.828054e+02 -2.764200e+02
## [26] -1.271938e+03 -1.772690e+03 -3.150443e+03 -3.690735e+03 -7.290932e+03
## [31] -1.289046e+04 -3.387611e+04 -7.940520e+04
##
## $x
## NULL
##
## $ac
## [1] 0
##
## $GOF
## [1] 0.9089718 0.9723112
plot(model1$points, main="CMD Scale", xlab="Values", ylab="Eigen Values")
text(model1$points, rownames(model1), cex = 0.8)
abline(v=c(0,300),col="blue")
model2<- sammon(dd, tol = 0.001)
## Initial stress : 0.09859
## stress after 10 iters: 0.03069, magic = 0.500
## stress after 20 iters: 0.02982, magic = 0.500
model2
## $points
## [,1] [,2]
## 170563-2007 -132.39539 26.29468405
## 170563-2008 -131.63439 3.45407985
## 178199-2008 -138.58815 -0.04018823
## 186827-2009 -122.42215 -9.61600435
## 33102-2011 -103.77162 -15.75184485
## 33103-2011 -104.43803 12.88957743
## 33104-2011 -150.72528 -5.96562172
## 33145-2011 -125.98295 12.42253558
## 33953-2010 -115.63100 16.27053670
## 33953-2011 101.69216 -4.27818817
## 33954-2010 -129.33907 -27.13370522
## 64685-2006 -99.38494 -11.14416497
## 64687-2006 -116.17249 -4.16784242
## 64687-2007 -158.28416 -32.92540432
## 65698-1 394.08081 -6.04632759
## 70563-1 446.42722 -41.84753619
## 70564-2007 -111.34045 3.97928053
## 70565-2007 -133.28623 -7.99737776
## 70566-2007 -103.62929 26.15546980
## 70567-2007 -131.81971 14.55301125
## 70568-2007 -98.28341 5.22310818
## 70618-2007 -125.42798 2.92266723
## 70619-2007 -157.17742 4.63352267
## 78198-2008 -159.65227 30.72870257
## 78199-1 401.14300 -20.07135328
## 78207-1 477.26129 0.93879132
## 78208-1 413.03134 9.16347100
## 78209-1 487.39169 35.98328387
## 78210-1 432.79756 -13.07956176
## 78378-2008 -129.43502 -8.02850533
## 78378-2009 -112.17195 -12.83648272
## 86824-2009 -143.63208 14.51871194
## 86828-2009 -119.19965 0.79867493
##
## $stress
## [1] 0.02981568
##
## $call
## sammon(d = dd, tol = 0.001)
plot(model2$points, main="Sammon Scale", xlab="Dimension 1", ylab="Dimension 2")
text(model2$points, rownames(model2), cex = 0.8)
abline(v=c(0,300),col="blue")
model3 <- isoMDS(dd, tol = 0.001)
## initial value 5.381891
## final value 5.380866
## converged
model3
## $points
## [,1] [,2]
## 170563-2007 -141.58964 30.8737550
## 170563-2008 -132.26980 45.8413658
## 178199-2008 -147.50655 -16.0357698
## 186827-2009 -167.00787 -47.2938414
## 33102-2011 -115.56988 -25.7687132
## 33103-2011 -107.54925 6.9542786
## 33104-2011 -173.58159 -24.1558926
## 33145-2011 -125.14722 -7.8254003
## 33953-2010 -107.38213 16.5911174
## 33953-2011 162.34687 -55.7590834
## 33954-2010 -140.44115 10.3926576
## 64685-2006 -100.09139 -3.4666189
## 64687-2006 -131.73410 -9.8303755
## 64687-2007 -190.27995 -43.4186510
## 65698-1 406.16557 21.0227481
## 70563-1 489.59201 -79.6689537
## 70564-2007 -106.54131 1.5004302
## 70565-2007 -136.67686 4.5391390
## 70566-2007 -127.31576 30.1966412
## 70567-2007 -148.42224 36.9344357
## 70568-2007 -70.02361 35.2364265
## 70618-2007 -114.65806 -4.9633825
## 70619-2007 -169.76560 -4.5503482
## 78198-2008 -178.98448 -0.9142418
## 78199-1 400.43857 26.7804040
## 78207-1 509.51528 8.2698407
## 78208-1 430.22982 28.0070220
## 78209-1 514.82257 21.9025436
## 78210-1 457.57501 -2.7757332
## 78378-2008 -142.46498 -10.2108619
## 78378-2009 -116.79434 -6.5139645
## 86824-2009 -160.38433 8.8605541
## 86828-2009 -118.50362 9.2484723
##
## $stress
## [1] 5.380866
plot(model3$points, main="Iso MDS Scale", xlab="Dimension 1", ylab="Dimension 2")
text(model3$points, rownames(model3), cex = 0.8)
abline(v=c(0,300),col="blue")
Problem 2:
Hierachical Clustering:
Create and display divisive and agglomerative clustering solutions. For each, try at least two link methods before settling on the solution you feel best descibes the data. Discuss whether the hierarchical solutions would indicate two groups, or if they might involve additional groups within these two main groups.
Solution:
The 2 link methods tried are Single and Complete. All the methods show a similar trend in clustering and hence it can be assumed that the clusters are same. The hierachial clusters involve additional groups with the two main groups.
Code:
library(MASS)
library(ggdendro)
## Warning: package 'ggdendro' was built under R version 3.6.2
library(cluster)
##
## Attaching package: 'cluster'
## The following object is masked from 'package:maps':
##
## votes.repub
library(gridExtra)
library(ggplot2)
library(fpc)
## Warning: package 'fpc' was built under R version 3.6.2
AC1<-hclust(dd, method = "single")
plot(AC1)
rect.hclust(AC1, k = 3)
AC2<-hclust(dd, method = "complete")
plot(AC2)
rect.hclust(AC2, k = 3)
DC1 <- diana(dd)
plot(DC1)
rect.hclust(DC1, k = 3)
Problem 3:
K-means and Fanny Clustering
Create a set of k-means clustering solutions for different values of k. To do this, you cannot use the similarity space directly. Instead, use an MDS solution with at least 5 dimensions as the features, and then use k-means on this dimensional solution. Compare goodness-of-fit statistics to determine which you think is the best number of clusters to use. Compare that solution to your conclusions for part 2.1. Then, do a fanny clustering, and determine the groups. Does this method produce a substantially different solution? Are there specific geese that are not easily clasified into just one group?
Solution:
Comparing the goodness of fit, we see that Cluster with 5 centers has the best similarity percent. From 2.1, we see that the agglomerative clustering with 3 centers is best. The Fanny clustering has a different solution as the clusters are divided differently. All the Geese are classified into groups.
Code:
newdd<-dist(distances[,-1])
model4 <- cmdscale(newdd)
k1 <- kmeans(model4, centers = 3)
sort(k1$cluster)
## [1] 1 1 1 1 1 1 1 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
k1$betweenss/k1$totss
## [1] 0.9895183
model5 <- cmdscale(newdd)
k2 <- kmeans(model5, centers = 5)
sort(k2$cluster)
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 4 4 4 5
k2$betweenss/k2$totss
## [1] 0.9974476
model6 <- cmdscale(newdd)
k3 <- kmeans(model6, centers = 7)
sort(k3$cluster)
## [1] 1 1 1 1 1 2 2 2 2 3 4 4 4 4 4 4 4 5 5 5 5 5 5 6 6 6 6 6 6 6 6 6 7
k3$betweenss/k3$totss
## [1] 0.9927061
library(factoextra)
## Warning: package 'factoextra' was built under R version 3.6.2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
fviz_cluster(k1, data = newdd)
fviz_cluster(k2, data = newdd)
fviz_cluster(k3, data = newdd)
f1 <- fanny(model4, k = 3)
matplot(f1$membership, type = "b", xaxt = "n")
plot(f1)
f2 <- fanny(model5, k = 5)
matplot(f2$membership, type = "b", xaxt = "n")
plot(f2)
f3 <- fanny(model6, k = 7)
matplot(f3$membership, type = "b", xaxt = "n")
plot(f3)
Problem 4:
Plotting back into space
Regardless of which solution you thought was best in part 3, look at the For the 2-cluster solution in part 3 k-means. Re-plot the map trajectories, using the cluster to determine the color. Do this again for 3- and 4-cleuster solutions. For each solution, describe what appears to be the important aspects of the migration path that define or divide the different groups.
Solution:
The most important aspect of migration path is the ending point of the latitude.
Code:
library(ggmap)
k4 <- kmeans(model4, centers = 2)
k5 <- kmeans(model4, centers = 3)
k6 <- kmeans(model4, centers = 4)
k4
## K-means clustering with 2 clusters of sizes 7, 26
##
## Cluster means:
## [,1] [,2]
## 1 2453.4160 -30.804892
## 2 -660.5351 8.293625
##
## Clustering vector:
## [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 2 2 2 2 2 2 2 2 1 1 1 1 1 2 2 2 2
##
## Within cluster sum of squares by cluster:
## [1] 385411.8 2068380.8
## (between_SS / total_SS = 95.6 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
#######################################################################################
library(maps)
#al1 <- GetMap(center=c(66.56,0),
# zoom=3, destfile = "geese.png",
# format="png",maptype="satellite")
#gm <- get_map(location=c(66.56,0),zoom=3,maptype="satellite")
levs <- levels(as.factor(dat$id))
dat$Year <- as.factor(dat$Year)
addit<- F
map('world',xlim=c(-30,25),ylim=c(45,90))
for(lev in levs[1:33])
{
tmp <- dat[dat$id == lev,]
col <- as.factor(tmp$Year)[1]
# plot(tmp$utm.easting,tmp$utm.northing,type="o")
#path1 <- SimplifyPath(cbind(tmp$location.long,tmp$location.lat),tol=.25)
long <- tmp$location.long
# long[long<0] <- long[long<0]
path1 <- cbind(long,tmp$location.lat)
# tmp1 <- PlotOnStaticMap(al1,lat=path1[,2],lon=path1[,1],col="gold",lwd=2,
# FUN=lines,add=addit)
points(path1[,1],path1[,2],col=k4$cluster,type="l",cex=.5,pch=16)
# plot(1:nrow(path1),path1[,1])
addit <- T
}
###########
library(maps)
#al1 <- GetMap(center=c(66.56,0),
# zoom=3, destfile = "geese.png",
# format="png",maptype="satellite")
#gm <- get_map(location=c(66.56,0),zoom=3,maptype="satellite")
levs <- levels(as.factor(dat$id))
dat$Year <- as.factor(dat$Year)
addit<- F
map('world',xlim=c(-30,25),ylim=c(45,90))
for(lev in levs[1:33])
{
tmp <- dat[dat$id == lev,]
col <- as.factor(tmp$Year)[1]
# plot(tmp$utm.easting,tmp$utm.northing,type="o")
#path1 <- SimplifyPath(cbind(tmp$location.long,tmp$location.lat),tol=.25)
long <- tmp$location.long
# long[long<0] <- long[long<0]
path1 <- cbind(long,tmp$location.lat)
# tmp1 <- PlotOnStaticMap(al1,lat=path1[,2],lon=path1[,1],col="gold",lwd=2,
# FUN=lines,add=addit)
points(path1[,1],path1[,2],col=c("blue","green","red","black","yellow")[k5$cluster[lev]],type="l",cex=.5,pch=16)
# plot(1:nrow(path1),path1[,1])
addit <- T
}
###########
library(maps)
#al1 <- GetMap(center=c(66.56,0),
# zoom=3, destfile = "geese.png",
# format="png",maptype="satellite")
#gm <- get_map(location=c(66.56,0),zoom=3,maptype="satellite")
levs <- levels(as.factor(dat$id))
dat$Year <- as.factor(dat$Year)
addit<- F
map('world',xlim=c(-30,25),ylim=c(45,90))
for(lev in levs[1:33])
{
tmp <- dat[dat$id == lev,]
col <- as.factor(tmp$Year)[1]
# plot(tmp$utm.easting,tmp$utm.northing,type="o")
#path1 <- SimplifyPath(cbind(tmp$location.long,tmp$location.lat),tol=.25)
long <- tmp$location.long
# long[long<0] <- long[long<0]
path1 <- cbind(long,tmp$location.lat)
# tmp1 <- PlotOnStaticMap(al1,lat=path1[,2],lon=path1[,1],col="gold",lwd=2,
# FUN=lines,add=addit)
points(path1[,1],path1[,2],col=path1[,2],type="l",cex=.5,pch=16)
# plot(1:nrow(path1),path1[,1])
addit <- T
}