Advisor: Prof. Tyler Smith

1. Purpose

This Memo aims to organize and briefly presents the main simulations and assumptions regarding a cluster analysis developed for 671 gauged catchments located in USA. The GR4J hydrologic model is calibrated for each of the catchments and hierarchical clustering is performed to classify catchments based on their Physical characteristics (Latitude, Longitude, Drainage Area and Elevation) and Hydrologic Signatures (Mean of Total Annual Runoff, Mean of Total Annual Potential Evapotranspiration, Mean of Total annual runoff, Runoff Ratio and Aridity index) and Model Parameters.

Most of the used code and text employed here comes from the book Pratical Guide To Cluster Analysis in R, Unsupervised Machine Learning, by Alboukadel Kassambara, Edition 1 (2017) and for the sake of simplicity the references will be omitted.

In addition, the book Regionalization of Watersheds-An Approach Based on Cluster Analysis, by A. Ramachandra Rao and V.V. Srinivas, (2008) and the book Data Mining Concepts and Techniques by Jiawei Han, Micheline Kamber, and Jian Pei, Edition 3, (2012) were also used as reference.

2. Clustering by Physical Characteristics

2.1. Accessing physical characteristics table

This code chuck accesses the a table with the physical characteristics of the 671 gauged stations, as follows:LAT. LONG. AREA(km2) ELEV.(m) SLOPE, PERCENTAGE OF FOREST).

#Clean R memory
rm(list=ls(all=TRUE))

#Loading packge to read .mat files
library(R.matlab)
## Registered S3 method overwritten by 'R.oo':
##   method        from       
##   throw.default R.methodsS3
## R.matlab v3.6.2 (2018-09-26) successfully loaded. See ?R.matlab for help.
## 
## Attaching package: 'R.matlab'
## The following objects are masked from 'package:base':
## 
##     getOption, isOpen
#Loading clustering packages
library(cluster)
library(devtools)
## Loading required package: usethis
library("factoextra")
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
#Setting work directory
setwd("C:/Users/rampincg/Desktop/Clarkson_Pc/cassio_PC/AGU_2019/camels")
data.Set<-read.table(file="camelsData.txt",header=T,sep="\t")

2.2 Physical Characteristics Distributions

##############################################################
###DENSITIES WITH GGPLOT2#####################################
##############################################################
library(reshape2)
library(ggplot2)
library(cowplot)
## 
## ********************************************************
## Note: As of version 1.0.0, cowplot does not change the
##   default ggplot2 theme anymore. To recover the previous
##   behavior, execute:
##   theme_set(theme_cowplot())
## ********************************************************
index=seq(from=1,to=nrow(data.Set),by=1)

dataX1<-cbind(index,data.Set[,3])
dataX1<-as.data.frame(dataX1)
cnames=c("index","LAT.")
colnames(dataX1)=cnames
dataX1<-reshape2::melt(dataX1, id.var='index')


dataX2<-cbind(index,data.Set[,4])
dataX2<-as.data.frame(dataX2)
cnames=c("index","LONG.")
colnames(dataX2)=cnames
dataX2<-reshape2::melt(dataX2, id.var='index')


dataX3<-cbind(index,data.Set[,5])
dataX3<-as.data.frame(dataX3)
cnames=c("index","Area")
colnames(dataX3)=cnames
dataX3<-reshape2::melt(dataX3, id.var='index')

dataX4<-cbind(index,data.Set[,6])
dataX4<-as.data.frame(dataX4)
cnames=c("index","Elev.")
colnames(dataX4)=cnames
dataX4<-reshape2::melt(dataX4, id.var='index')

dataX5<-cbind(index,data.Set[,7])
dataX5<-as.data.frame(dataX5)
cnames=c("index","Slope")
colnames(dataX5)=cnames
dataX5<-reshape2::melt(dataX5, id.var='index')

dataX6<-cbind(index,data.Set[,8])
dataX6<-as.data.frame(dataX6)
cnames=c("index","Forest Perc.")
colnames(dataX6)=cnames
dataX6<-reshape2::melt(dataX6, id.var='index')



#Together with ggplot2
plot1<-ggplot(dataX1) + stat_density(aes(x=value, y=..scaled..,color=variable,linetype=variable),lwd=1,  geom="line",position = "identity")+
  ylab("Density")+
  xlab("LAT.")+
  xlim(27,49) + 
  theme_bw() +
  theme(legend.position="top")+
  theme(legend.title = element_blank())+
  theme(axis.title.x=element_text(size=14,face="bold"),axis.title.y=element_text(size=14))+
  theme(legend.text=element_text(size=14))+
  theme(axis.text.x = element_text(size = 16),axis.text.y = element_text(size = 16))


plot2<-ggplot(dataX2) + stat_density(aes(x=value, y=..scaled..,color=variable,linetype=variable),lwd=1,  geom="line",position = "identity")+
  ylab("Density")+
  xlab("LONG.")+
  xlim(-124,-111) +
  theme_bw() +
  theme(legend.position="top")+
  theme(legend.title = element_blank())+
  theme(axis.title.x=element_text(size=14,face="bold"),axis.title.y=element_text(size=14))+
  theme(legend.text=element_text(size=14))+
  theme(axis.text.x = element_text(size = 16),axis.text.y = element_text(size = 16))


plot3<-ggplot(dataX3) + stat_density(aes(x=value, y=..scaled..,color=variable,linetype=variable),lwd=1,  geom="line",position = "identity")+
  ylab("Density")+
  xlab("Area")+
  xlim(4,26000) +
  theme_bw() +
  theme(legend.position="top")+
  theme(legend.title = element_blank())+
  theme(axis.title.x=element_text(size=14,face="bold"),axis.title.y=element_text(size=14))+
  theme(legend.text=element_text(size=14))+
  theme(axis.text.x = element_text(size = 16),axis.text.y = element_text(size = 16))

plot4<-ggplot(dataX4) + stat_density(aes(x=value, y=..scaled..,color=variable,linetype=variable),lwd=1,  geom="line",position = "identity")+
  ylab("Density")+
  xlab("Elev")+
  xlim(10,3600) +
  theme_bw() +
  theme(legend.position="top")+
  theme(legend.title = element_blank())+
  theme(axis.title.x=element_text(size=14,face="bold"),axis.title.y=element_text(size=14))+
  theme(legend.text=element_text(size=14))+
  theme(axis.text.x = element_text(size = 16),axis.text.y = element_text(size = 16))


plot5<-ggplot(dataX5) + stat_density(aes(x=value, y=..scaled..,color=variable,linetype=variable),lwd=1,  geom="line",position = "identity")+
  ylab("Density")+
  xlab("Slope")+
  xlim(0.8,256) +
  theme_bw() +
  theme(legend.position="top")+
  theme(legend.title = element_blank())+
  theme(axis.title.x=element_text(size=14,face="bold"),axis.title.y=element_text(size=14))+
  theme(legend.text=element_text(size=14))+
  theme(axis.text.x = element_text(size = 16),axis.text.y = element_text(size = 16))

plot6<-ggplot(dataX6) + stat_density(aes(x=value, y=..scaled..,color=variable,linetype=variable),lwd=1,  geom="line",position = "identity")+
  ylab("Density")+
  xlab("Forest Perc.")+
  xlim(0,1) +
  theme_bw() +
  theme(legend.position="top")+
  theme(legend.title = element_blank())+
  theme(axis.title.x=element_text(size=14,face="bold"),axis.title.y=element_text(size=14))+
  theme(legend.text=element_text(size=14))+
  theme(axis.text.x = element_text(size = 16),axis.text.y = element_text(size = 16))


plot_grid(plot1,plot2,plot3,plot4,plot5,plot6, labels = "AUTO")

summary(data.Set)
##     gaugeID        
##  Min.   : 1013500  
##  1st Qu.: 2370650  
##  Median : 6278300  
##  Mean   : 6265831  
##  3rd Qu.: 9382765  
##  Max.   :14400000  
##                    
##                                               gaugeName        Lat       
##                                   DEER C NR VINA CA:  1   Min.   :27.05  
##                                   SALT C NR ADA, KS:  1   1st Qu.:35.70  
##                                  BULL C NR WEOTT CA:  1   Median :39.25  
##                                Menard Ck nr Rye, TX:  1   Mean   :39.24  
##                                MILL C NR PAXICO, KS:  1   3rd Qu.:43.21  
##                                REDWOOD C A ORICK CA:  1   Max.   :48.82  
##  (Other)                                           :665                  
##       Long              Area              Elev             Slope         
##  Min.   :-124.39   Min.   :    4.1   Min.   :  10.21   Min.   :  0.8222  
##  1st Qu.:-110.41   1st Qu.:  128.0   1st Qu.: 249.67   1st Qu.:  7.4268  
##  Median : -92.78   Median :  340.7   Median : 462.72   Median : 28.8016  
##  Mean   : -95.79   Mean   :  808.1   Mean   : 759.42   Mean   : 46.1953  
##  3rd Qu.: -81.77   3rd Qu.:  804.5   3rd Qu.: 928.88   3rd Qu.: 73.1695  
##  Max.   : -67.94   Max.   :25817.8   Max.   :3571.18   Max.   :255.6884  
##                                                                          
##   Forest.Perc.   
##  Min.   :0.0000  
##  1st Qu.:0.2771  
##  Median :0.8137  
##  Mean   :0.6395  
##  3rd Qu.:0.9724  
##  Max.   :1.0000  
## 

2.3 Scaling the variables - range (0-1)

###############################
#Scaling data set
###############################
df.scaled<-data.Set[,3:8]
rownames(df.scaled)=data.Set[,1]


col1=col2=col3=col4=col5=col6=0

col1<-(df.scaled[,1]-min(df.scaled[,1]))/(max(df.scaled[,1])-min(df.scaled[,1]))
col2<-(df.scaled[,2]-min(df.scaled[,2]))/(max(df.scaled[,2])-min(df.scaled[,2]))
col3<-(df.scaled[,3]-min(df.scaled[,3]))/(max(df.scaled[,3])-min(df.scaled[,3]))
col4<-(df.scaled[,4]-min(df.scaled[,4]))/(max(df.scaled[,4])-min(df.scaled[,4]))
col5<-(df.scaled[,5]-min(df.scaled[,5]))/(max(df.scaled[,5])-min(df.scaled[,5]))
col6<-(df.scaled[,6]-min(df.scaled[,6]))/(max(df.scaled[,6])-min(df.scaled[,6]))


df.scaled[,1]=col1
df.scaled[,2]=col2
df.scaled[,3]=col3
df.scaled[,4]=col4
df.scaled[,5]=col5
df.scaled[,6]=col6

2.4 Clusters

2.4.1 Euclidean Distance Matrix

#Euclidian Distance
dist.eucl<-dist(df.scaled,method="euclidean")
#Visualizing the distance matrices
fviz_dist(dist.eucl)

2.4.2 Mahalanobis Distance Matrix

#Package to Mahalanobis Distance
library(StatMatch)
## Loading required package: proxy
## 
## Attaching package: 'proxy'
## The following objects are masked from 'package:stats':
## 
##     as.dist, dist
## The following object is masked from 'package:base':
## 
##     as.matrix
## Loading required package: clue
## Loading required package: survey
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
## 
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
## 
##     dotchart
## Loading required package: RANN
## Loading required package: lpSolve
#Computing Mahalanobis distance
dist.maha<-mahalanobis.dist(df.scaled)

#Conerting the Mahalanobis distance to a dist class
dist.maha=as.dist(dist.maha)

#Visualizing the distance matrices
fviz_dist(dist.maha)

2.4.3 Number of Clusters

K-Means (Elbow Method)

#Loading package
library(factoextra)

###########
#K-Means###
###########

#Elbow method
fviz_nbclust(df.scaled,kmeans,method="wss")+geom_vline(xintercept=3,linetype=2)+labs(subtitle="Elbow method")

K-Means (Silhouette Method)

#Silhouette method
fviz_nbclust(df.scaled,kmeans,method="silhouette")+labs(subtitle="Silhouette method")

K-Means (Gap Statistic)

#Gap Statistic
#nboot=50 to keep the function speedy.
#recommended value: nboot=500 for your analysis.
#Use verbose=FALSE to hide computing progression.
fviz_nbclust(df.scaled,kmeans,method="gap_stat",nboot=50)+labs(subtitle="Gap statistic method")

#Compute k-means k=3
km.res<-kmeans(df.scaled,3,nstart=15)


#Plotting Cluster
fviz_cluster(km.res,data=df.scaled,palette="jco",repel=TRUE,ggtheme=theme_classic(),show.clust.cent = FALSE,stand=FALSE,geom="point")

#Compute k-means k=10
km.res<-kmeans(df.scaled,4,nstart=15)

#Plotting Cluster
fviz_cluster(km.res,data=df.scaled,palette="jco",repel=TRUE,ggtheme=theme_classic(),show.clust.cent = FALSE,stand=FALSE,geom="point")

K-Means- Clusters Plots

#Compute k-means k=3
km.res<-kmeans(df.scaled,3,nstart=15)

#Plotting Cluster
fviz_cluster(km.res,data=df.scaled,palette="jco",repel=TRUE,ggtheme=theme_classic(),show.clust.cent = FALSE,stand=FALSE,geom="point")

#Compute k-means k=4
km.res<-kmeans(df.scaled,4,nstart=15)

#Plotting Cluster
fviz_cluster(km.res,data=df.scaled,palette="jco",repel=TRUE,ggtheme=theme_classic(),show.clust.cent = FALSE,stand=FALSE,geom="point")

2.4.4 Hierarchical Clustering

Ward’s linkage

res.hc.ward<-hclust(d=dist.eucl,method="ward.D2")
fviz_dend(res.hc.ward,cex=0.2)

Average linkage

res.hc.average<-hclust(d=dist.eucl,method="average")
fviz_dend(res.hc.average,cex=0.2)

Ward’s X Average linkage

#Wards X Average Linkage

#Ward's linkage
#Computing cophentic distance
res.coph.ward<-cophenetic(res.hc.ward)
#Correlation between cophenetic distance and the original distance
cor(dist.eucl,res.coph.ward)
## [1] 0.7888279
#Average Linkage
#Computing cophentic distance
res.coph.average<-cophenetic(res.hc.average)
#Correlation between cophenetic distance and the original distance
cor(dist.eucl,res.coph.average)
## [1] 0.8301809