Cluster_Memo_01_2020_REV

2. Clustering by Physical Characteristics

2.1. Accessing physical characteristics table

This code chuck accesses the a table with the physical characteristics of the 671 gauged stations, as follows:LAT. LONG. AREA(km2) ELEV.(m) SLOPE, PERCENTAGE OF FOREST).

#Clean R memory
rm(list=ls(all=TRUE))

#Loading packge to read .mat files
library(R.matlab)

## Registered S3 method overwritten by 'R.oo':
##   method        from       
##   throw.default R.methodsS3

## R.matlab v3.6.2 (2018-09-26) successfully loaded. See ?R.matlab for help.

## 
## Attaching package: 'R.matlab'

## The following objects are masked from 'package:base':
## 
##     getOption, isOpen

#Loading clustering packages
library(cluster)
library(devtools)

## Loading required package: usethis

library("factoextra")

## Loading required package: ggplot2

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

#Setting work directory
setwd("C:/Users/rampincg/Desktop/Clarkson_Pc/cassio_PC/AGU_2019/camels")
data.Set<-read.table(file="camelsData.txt",header=T,sep="\t")

2.2 Physical Characteristics Distributions

##############################################################
###DENSITIES WITH GGPLOT2#####################################
##############################################################
library(reshape2)
library(ggplot2)
library(cowplot)

## 
## ********************************************************

## Note: As of version 1.0.0, cowplot does not change the

##   default ggplot2 theme anymore. To recover the previous

##   behavior, execute:
##   theme_set(theme_cowplot())

## ********************************************************

index=seq(from=1,to=nrow(data.Set),by=1)

dataX1<-cbind(index,data.Set[,3])
dataX1<-as.data.frame(dataX1)
cnames=c("index","LAT.")
colnames(dataX1)=cnames
dataX1<-reshape2::melt(dataX1, id.var='index')


dataX2<-cbind(index,data.Set[,4])
dataX2<-as.data.frame(dataX2)
cnames=c("index","LONG.")
colnames(dataX2)=cnames
dataX2<-reshape2::melt(dataX2, id.var='index')


dataX3<-cbind(index,data.Set[,5])
dataX3<-as.data.frame(dataX3)
cnames=c("index","Area")
colnames(dataX3)=cnames
dataX3<-reshape2::melt(dataX3, id.var='index')

dataX4<-cbind(index,data.Set[,6])
dataX4<-as.data.frame(dataX4)
cnames=c("index","Elev.")
colnames(dataX4)=cnames
dataX4<-reshape2::melt(dataX4, id.var='index')

dataX5<-cbind(index,data.Set[,7])
dataX5<-as.data.frame(dataX5)
cnames=c("index","Slope")
colnames(dataX5)=cnames
dataX5<-reshape2::melt(dataX5, id.var='index')

dataX6<-cbind(index,data.Set[,8])
dataX6<-as.data.frame(dataX6)
cnames=c("index","Forest Perc.")
colnames(dataX6)=cnames
dataX6<-reshape2::melt(dataX6, id.var='index')



#Together with ggplot2
plot1<-ggplot(dataX1) + stat_density(aes(x=value, y=..scaled..,color=variable,linetype=variable),lwd=1,  geom="line",position = "identity")+
  ylab("Density")+
  xlab("LAT.")+
  xlim(27,49) + 
  theme_bw() +
  theme(legend.position="top")+
  theme(legend.title = element_blank())+
  theme(axis.title.x=element_text(size=14,face="bold"),axis.title.y=element_text(size=14))+
  theme(legend.text=element_text(size=14))+
  theme(axis.text.x = element_text(size = 16),axis.text.y = element_text(size = 16))


plot2<-ggplot(dataX2) + stat_density(aes(x=value, y=..scaled..,color=variable,linetype=variable),lwd=1,  geom="line",position = "identity")+
  ylab("Density")+
  xlab("LONG.")+
  xlim(-124,-111) +
  theme_bw() +
  theme(legend.position="top")+
  theme(legend.title = element_blank())+
  theme(axis.title.x=element_text(size=14,face="bold"),axis.title.y=element_text(size=14))+
  theme(legend.text=element_text(size=14))+
  theme(axis.text.x = element_text(size = 16),axis.text.y = element_text(size = 16))


plot3<-ggplot(dataX3) + stat_density(aes(x=value, y=..scaled..,color=variable,linetype=variable),lwd=1,  geom="line",position = "identity")+
  ylab("Density")+
  xlab("Area")+
  xlim(4,26000) +
  theme_bw() +
  theme(legend.position="top")+
  theme(legend.title = element_blank())+
  theme(axis.title.x=element_text(size=14,face="bold"),axis.title.y=element_text(size=14))+
  theme(legend.text=element_text(size=14))+
  theme(axis.text.x = element_text(size = 16),axis.text.y = element_text(size = 16))

plot4<-ggplot(dataX4) + stat_density(aes(x=value, y=..scaled..,color=variable,linetype=variable),lwd=1,  geom="line",position = "identity")+
  ylab("Density")+
  xlab("Elev")+
  xlim(10,3600) +
  theme_bw() +
  theme(legend.position="top")+
  theme(legend.title = element_blank())+
  theme(axis.title.x=element_text(size=14,face="bold"),axis.title.y=element_text(size=14))+
  theme(legend.text=element_text(size=14))+
  theme(axis.text.x = element_text(size = 16),axis.text.y = element_text(size = 16))


plot5<-ggplot(dataX5) + stat_density(aes(x=value, y=..scaled..,color=variable,linetype=variable),lwd=1,  geom="line",position = "identity")+
  ylab("Density")+
  xlab("Slope")+
  xlim(0.8,256) +
  theme_bw() +
  theme(legend.position="top")+
  theme(legend.title = element_blank())+
  theme(axis.title.x=element_text(size=14,face="bold"),axis.title.y=element_text(size=14))+
  theme(legend.text=element_text(size=14))+
  theme(axis.text.x = element_text(size = 16),axis.text.y = element_text(size = 16))

plot6<-ggplot(dataX6) + stat_density(aes(x=value, y=..scaled..,color=variable,linetype=variable),lwd=1,  geom="line",position = "identity")+
  ylab("Density")+
  xlab("Forest Perc.")+
  xlim(0,1) +
  theme_bw() +
  theme(legend.position="top")+
  theme(legend.title = element_blank())+
  theme(axis.title.x=element_text(size=14,face="bold"),axis.title.y=element_text(size=14))+
  theme(legend.text=element_text(size=14))+
  theme(axis.text.x = element_text(size = 16),axis.text.y = element_text(size = 16))


plot_grid(plot1,plot2,plot3,plot4,plot5,plot6, labels = "AUTO")

summary(data.Set)

##     gaugeID        
##  Min.   : 1013500  
##  1st Qu.: 2370650  
##  Median : 6278300  
##  Mean   : 6265831  
##  3rd Qu.: 9382765  
##  Max.   :14400000  
##                    
##                                               gaugeName        Lat       
##                                   DEER C NR VINA CA:  1   Min.   :27.05  
##                                   SALT C NR ADA, KS:  1   1st Qu.:35.70  
##                                  BULL C NR WEOTT CA:  1   Median :39.25  
##                                Menard Ck nr Rye, TX:  1   Mean   :39.24  
##                                MILL C NR PAXICO, KS:  1   3rd Qu.:43.21  
##                                REDWOOD C A ORICK CA:  1   Max.   :48.82  
##  (Other)                                           :665                  
##       Long              Area              Elev             Slope         
##  Min.   :-124.39   Min.   :    4.1   Min.   :  10.21   Min.   :  0.8222  
##  1st Qu.:-110.41   1st Qu.:  128.0   1st Qu.: 249.67   1st Qu.:  7.4268  
##  Median : -92.78   Median :  340.7   Median : 462.72   Median : 28.8016  
##  Mean   : -95.79   Mean   :  808.1   Mean   : 759.42   Mean   : 46.1953  
##  3rd Qu.: -81.77   3rd Qu.:  804.5   3rd Qu.: 928.88   3rd Qu.: 73.1695  
##  Max.   : -67.94   Max.   :25817.8   Max.   :3571.18   Max.   :255.6884  
##                                                                          
##   Forest.Perc.   
##  Min.   :0.0000  
##  1st Qu.:0.2771  
##  Median :0.8137  
##  Mean   :0.6395  
##  3rd Qu.:0.9724  
##  Max.   :1.0000  
##

2.3 Scaling the variables - range (0-1)

###############################
#Scaling data set
###############################
df.scaled<-data.Set[,3:8]
rownames(df.scaled)=data.Set[,1]


col1=col2=col3=col4=col5=col6=0

col1<-(df.scaled[,1]-min(df.scaled[,1]))/(max(df.scaled[,1])-min(df.scaled[,1]))
col2<-(df.scaled[,2]-min(df.scaled[,2]))/(max(df.scaled[,2])-min(df.scaled[,2]))
col3<-(df.scaled[,3]-min(df.scaled[,3]))/(max(df.scaled[,3])-min(df.scaled[,3]))
col4<-(df.scaled[,4]-min(df.scaled[,4]))/(max(df.scaled[,4])-min(df.scaled[,4]))
col5<-(df.scaled[,5]-min(df.scaled[,5]))/(max(df.scaled[,5])-min(df.scaled[,5]))
col6<-(df.scaled[,6]-min(df.scaled[,6]))/(max(df.scaled[,6])-min(df.scaled[,6]))


df.scaled[,1]=col1
df.scaled[,2]=col2
df.scaled[,3]=col3
df.scaled[,4]=col4
df.scaled[,5]=col5
df.scaled[,6]=col6

2.4 Clusters

2.4.1 Euclidean Distance Matrix

#Euclidian Distance
dist.eucl<-dist(df.scaled,method="euclidean")
#Visualizing the distance matrices
fviz_dist(dist.eucl)

2.4.2 Mahalanobis Distance Matrix

#Package to Mahalanobis Distance
library(StatMatch)

## Loading required package: proxy

## 
## Attaching package: 'proxy'

## The following objects are masked from 'package:stats':
## 
##     as.dist, dist

## The following object is masked from 'package:base':
## 
##     as.matrix

## Loading required package: clue

## Loading required package: survey

## Loading required package: grid

## Loading required package: Matrix

## Loading required package: survival

## 
## Attaching package: 'survey'

## The following object is masked from 'package:graphics':
## 
##     dotchart

## Loading required package: RANN

## Loading required package: lpSolve

#Computing Mahalanobis distance
dist.maha<-mahalanobis.dist(df.scaled)

#Conerting the Mahalanobis distance to a dist class
dist.maha=as.dist(dist.maha)

#Visualizing the distance matrices
fviz_dist(dist.maha)

2.4.3 Number of Clusters

K-Means (Elbow Method)

#Loading package
library(factoextra)

###########
#K-Means###
###########

#Elbow method
fviz_nbclust(df.scaled,kmeans,method="wss")+geom_vline(xintercept=3,linetype=2)+labs(subtitle="Elbow method")

K-Means (Silhouette Method)

#Silhouette method
fviz_nbclust(df.scaled,kmeans,method="silhouette")+labs(subtitle="Silhouette method")

K-Means (Gap Statistic)

#Gap Statistic
#nboot=50 to keep the function speedy.
#recommended value: nboot=500 for your analysis.
#Use verbose=FALSE to hide computing progression.
fviz_nbclust(df.scaled,kmeans,method="gap_stat",nboot=50)+labs(subtitle="Gap statistic method")

#Compute k-means k=3
km.res<-kmeans(df.scaled,3,nstart=15)


#Plotting Cluster
fviz_cluster(km.res,data=df.scaled,palette="jco",repel=TRUE,ggtheme=theme_classic(),show.clust.cent = FALSE,stand=FALSE,geom="point")

#Compute k-means k=10
km.res<-kmeans(df.scaled,4,nstart=15)

#Plotting Cluster
fviz_cluster(km.res,data=df.scaled,palette="jco",repel=TRUE,ggtheme=theme_classic(),show.clust.cent = FALSE,stand=FALSE,geom="point")

K-Means- Clusters Plots

#Compute k-means k=3
km.res<-kmeans(df.scaled,3,nstart=15)

#Plotting Cluster
fviz_cluster(km.res,data=df.scaled,palette="jco",repel=TRUE,ggtheme=theme_classic(),show.clust.cent = FALSE,stand=FALSE,geom="point")

#Compute k-means k=4
km.res<-kmeans(df.scaled,4,nstart=15)

#Plotting Cluster
fviz_cluster(km.res,data=df.scaled,palette="jco",repel=TRUE,ggtheme=theme_classic(),show.clust.cent = FALSE,stand=FALSE,geom="point")

2.4.4 Hierarchical Clustering

Ward’s linkage

res.hc.ward<-hclust(d=dist.eucl,method="ward.D2")
fviz_dend(res.hc.ward,cex=0.2)

Average linkage

res.hc.average<-hclust(d=dist.eucl,method="average")
fviz_dend(res.hc.average,cex=0.2)

Ward’s X Average linkage

#Wards X Average Linkage

#Ward's linkage
#Computing cophentic distance
res.coph.ward<-cophenetic(res.hc.ward)
#Correlation between cophenetic distance and the original distance
cor(dist.eucl,res.coph.ward)

## [1] 0.7888279

#Average Linkage
#Computing cophentic distance
res.coph.average<-cophenetic(res.hc.average)
#Correlation between cophenetic distance and the original distance
cor(dist.eucl,res.coph.average)

## [1] 0.8301809

Cluster_Memo_01_2020_REV_00

Cassio Rampinelli (RA-Clarkson University)

1/27/2020

Advisor: Prof. Tyler Smith

1. Purpose

2. Clustering by Physical Characteristics

2.1. Accessing physical characteristics table

2.2 Physical Characteristics Distributions

2.3 Scaling the variables - range (0-1)

2.4 Clusters

2.4.1 Euclidean Distance Matrix

2.4.2 Mahalanobis Distance Matrix

2.4.3 Number of Clusters

K-Means (Elbow Method)

K-Means (Silhouette Method)

K-Means (Gap Statistic)

K-Means- Clusters Plots

2.4.4 Hierarchical Clustering

Ward’s linkage

Average linkage

Ward’s X Average linkage