1. Purpose

This memo organizes and briefly presents the main simulations and assumptions of a cluster analysis developed for 159 gauged catchments located in southeastern Australia. A conceptual rainfall-runoff model is calibrated for each catchment, and hierarchical clustering is performed to classify the catchments based on their Physical Characteristics (Latitude, Longitude, Drainage Area and Elevation) and Hydrologic Signatures (Mean Total Annual Rainfall, Mean Total Annual Potential Evapotranspiration, Mean Daily Discharge, Runoff Ratio and Aridity Index).

Most of the code and text employed here comes from the book Practical Guide to Cluster Analysis in R: Unsupervised Machine Learning, by Alboukadel Kassambara, Edition 1 (2017); for the sake of simplicity, references to it are omitted.

In addition, the books Regionalization of Watersheds: An Approach Based on Cluster Analysis, by A. Ramachandra Rao and V.V. Srinivas (2008), and Data Mining: Concepts and Techniques, by Jiawei Han, Micheline Kamber and Jian Pei, Edition 3 (2012), were also used as references.

2. Accessing the Files

This code chunk accesses the gauging station numbers and the precipitation, discharge and potential evapotranspiration time series that are saved in MATLAB files.
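
The extraction loops below rely on the layout of the .mat database: it is read as a single list in which each gauge occupies seven consecutive entries (gauge number, year, month, day, discharge, precipitation and potential evapotranspiration), hence the index step of seven. As a minimal sketch, the same fields could be pulled with a generic helper; the function name extract.field is hypothetical and assumes exactly that seven-entry layout.

extract.field<-function(mat.list,field,n.fields=7){
  total<-length(mat.list)/n.fields
  lapply(seq_len(total),function(i) mat.list[[(i-1)*n.fields+field]])
}
#Example usage, equivalent to the while loops below:
#gauging.number<-as.numeric(unlist(extract.field(data.base.mat,1)))
#discharges<-extract.field(data.base.mat,5)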

#################################################
#################################################
#Part I - Accessing the precipitation, discharges
#and potential evapotranspiration data
#################################################
#################################################

#Clean R memory
rm(list=ls(all=TRUE))

#Loading package to read .mat files
library(R.matlab)
## Registered S3 method overwritten by 'R.oo':
##   method        from       
##   throw.default R.methodsS3
## R.matlab v3.6.2 (2018-09-26) successfully loaded. See ?R.matlab for help.
## 
## Attaching package: 'R.matlab'
## The following objects are masked from 'package:base':
## 
##     getOption, isOpen
#Loading clustering packages
library(cluster)
library(devtools)
## Loading required package: usethis
library(factoextra)
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
#Setting the base working directory where all the sub-folders
# are saved

setwd("C:/Users/rampincg/Desktop/Clarkson_Pc/cassio_PC/AGU_2019/dds_seaus240_gr4j/_gr4j_kge")



# Saving the path of the base directory
base.Directory<-getwd()
files<-list.files()
data.base.mat<-readMat(files[length(files)])
data.base.mat<-data.base.mat[[1]]


###########################
#Extracting gauging number
###########################

gauging.number<-0
total.gauging<-length(data.base.mat)/7
i=1
j=1
while(i<=total.gauging){
  gauging.number[i]=data.base.mat[[j]]
  j=j+7
  i=i+1
}

gauging.number=as.numeric(gauging.number)

###########################
#Extracting discharges lists
###########################

discharges<-0
discharges<-as.list(discharges)
temp=0
i=1
j=5

while(i<=total.gauging){
  temp=data.base.mat[[j]]
  discharges[[i]]=list(temp)
  j=j+7
  i=i+1
}

###########################
#Extracting precipitation lists
###########################

precipitation<-0
precipitation<-as.list(precipitation)
temp=0
i=1
j=6

while(i<=total.gauging){
  temp=data.base.mat[[j]]
  precipitation[[i]]=list(temp)
  j=j+7
  i=i+1
}

###############################################
#Extracting potential evapotranspiration lists
##################################################

pet<-0
pet<-as.list(pet)
temp=0
i=1
j=7

while(i<=total.gauging){
  temp=data.base.mat[[j]]
  pet[[i]]=list(temp)
  j=j+7
  i=i+1
}


3. Time Series Range

This code chunk extracts the initial and final date of each time series. The day, month and year vectors are also saved in the MATLAB files.
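
With the start and end dates in hand, a quick check of each record length can be made; below is a minimal sketch (the name record.length is hypothetical, the +1 counts both end years, and gauges without data simply propagate NA).

#Approximate record length in years (NA for empty gauges)
record.length<-final.year-initial.year+1
#summary(record.length)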

#################################################
#################################################
#Part II - Extracting initial and final date from 
#the time series
#################################################
#################################################

initial.year=final.year=0
initial.month=final.month=0
initial.day=final.day=0
i=1
j=2
k=1
empty.gauging=0

while(i<=total.gauging){
  
  #Year
  year.temp=data.base.mat[[j]]
  
  if(length(year.temp)==0){
    
    initial.year[i]=NA
    final.year[i]=NA
    year.temp=0
    empty.gauging[k]=k
    
    #Month
    
    initial.month[i]=NA
    final.month[i]=NA
   
    
    #Day
    initial.day[i]=NA
    final.day[i]=NA
    
    j=j+7
    i=i+1
    k=k+1
    
  }
  
  else{
    
    year.temp=data.base.mat[[j]]
    initial.year[i]=year.temp[1]
    final.year[i]=year.temp[length(year.temp)]
    year.temp=0
    empty.gauging[k]=0
    
    #Month
    month.temp=data.base.mat[[j+1]]
    initial.month[i]=month.temp[1]
    final.month[i]=month.temp[length(month.temp)]
    month.temp=0
    
    #Day
    day.temp=data.base.mat[[j+2]]
    initial.day[i]=day.temp[1]
    final.day[i]=day.temp[length(day.temp)]
    day.temp=0
    
    j=j+7
    i=i+1
    k=k+1
    
  }
  
}

4. Hydrologic Signatures

The code chunk below computes the following signatures for each site: the mean total annual rainfall, the mean total annual potential evapotranspiration, the mean daily discharge, the runoff ratio and the aridity index.
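
The three loops in this chunk repeat one pattern: pair a daily series with its date range, sum it to annual totals, and average those totals. A minimal helper sketch of that pattern is shown below (it assumes dplyr, tibble and lubridate, which are loaded at the top of the chunk; the name mean.annual.total is hypothetical).

mean.annual.total<-function(x,start.date,end.date){
  period<-tibble(date=seq(as.Date(start.date),as.Date(end.date),by=1),value=as.numeric(x))
  annual<-period %>% group_by(year=floor_date(date,"year")) %>% summarize(value=sum(value))
  mean(annual$value)
}
#Example usage, equivalent to annual.Rain[i] inside the first loop:
#mean.annual.total(precipitation[[i]][[1]],
#                  paste(initial.year[i],initial.month[i],initial.day[i],sep="-"),
#                  paste(final.year[i],final.month[i],final.day[i],sep="-"))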

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tibble)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
#####################################################################################
#####################################################################################
#####################################################################################
#Computing Signatures
#####################################################################################
#####################################################################################
mean.Rain=0 #Mean daily rain
mean.Pet=0 #Mean daily PET
mean.discharges=0 #Mean daily discharge
temp=0 #Dummy variable
annual.Rain=0 #Mean of the total annual precipitation
annual.Pet=0 #Mean of the total annual PET
annual.Q=0 #Mean of the total annual runoff

#Mean daily Rain
for(i in 1:length(precipitation)){
  

  empty.gauging[i]
  
  if(empty.gauging[i]==0){
    
    temp=precipitation[[i]]
    temp=temp[[1]]
    mean.Rain[i]=mean(temp)
    initial.date=c(initial.year[i],initial.month[i],initial.day[i])
    initial.date=paste(initial.date,collapse="-")
    final.date=c(final.year[i],final.month[i],final.day[i])
    final.date=paste(final.date,collapse="-")
    period<-data_frame(date=seq(as.Date(initial.date), as.Date(final.date), by=1))
    
    P=cbind(period,temp)
    #Rename columns
    colnames(P)<- c("date", "plu")
    #Aggregate daily values to annual totals
    P.annual=P %>% group_by(month=floor_date(date, "year"))%>%summarize(plu=sum(plu))
    annual.Rain[i]=mean(P.annual$plu)
    temp=0
    initial.date=0
    final.date=0
    period=0
    P=0
    P.annual=0
   
  }
  
  else{
    
    mean.Rain[i]=NA
    annual.Rain[i]=NA
    
  }
  
}



#Mean daily pet
for(i in 1:length(pet)){
  
  empty.gauging[i]
  
  if(empty.gauging[i]==0){
    
    temp=pet[[i]]
    temp=temp[[1]]
    mean.Pet[i]=mean(temp)
    initial.date=c(initial.year[i],initial.month[i],initial.day[i])
    initial.date=paste(initial.date,collapse="-")
    final.date=c(final.year[i],final.month[i],final.day[i])
    final.date=paste(final.date,collapse="-")
    period<-data_frame(date=seq(as.Date(initial.date), as.Date(final.date), by=1))
    
    P=cbind(period,temp)
    #Rename columns
    colnames(P)<- c("date", "plu")
    #Aggregate daily values to annual totals
    P.annual=P %>% group_by(month=floor_date(date, "year"))%>%summarize(plu=sum(plu))
    annual.Pet[i]=mean(P.annual$plu)
    temp=0
    initial.date=0
    final.date=0
    period=0
    P=0
    P.annual=0
    
  }
  
  else{
    
    mean.Pet[i]=NA
    annual.Pet[i]=NA
    
  }
}


#Mean daily discharge
for(i in 1:length(discharges)){

  empty.gauging[i]
  
  if(empty.gauging[i]==0){
    
    temp=discharges[[i]]
    temp=temp[[1]]
    mean.discharges[i]=mean(temp)
    initial.date=c(initial.year[i],initial.month[i],initial.day[i])
    initial.date=paste(initial.date,collapse="-")
    final.date=c(final.year[i],final.month[i],final.day[i])
    final.date=paste(final.date,collapse="-")
    period<-data_frame(date=seq(as.Date(initial.date), as.Date(final.date), by=1))
    
    P=cbind(period,temp)
    #Rename columns
    colnames(P)<- c("date", "plu")
    #Aggregate daily values to annual totals
    P.annual=P %>% group_by(month=floor_date(date, "year"))%>%summarize(plu=sum(plu))
    annual.Q[i]=mean(P.annual$plu)
    temp=0
    initial.date=0
    final.date=0
    period=0
    P=0
    P.annual=0
    
  }
  
  else{
    
    mean.discharges[i]=NA
    annual.Q[i]=NA
    
  }

}

#Computing the aridity index and runoff ratio
ai=annual.Rain/annual.Pet
runnof.ratio=annual.Q/annual.Rain

#Creating a matrix with all signatures
signatures.data.frame<-cbind(annual.Rain,annual.Pet,mean.discharges,runnof.ratio,ai)

head(signatures.data.frame)
##      annual.Rain annual.Pet mean.discharges runnof.ratio        ai
## [1,]   1543.7029   1343.543       1.5662172    0.3706278 1.1489787
## [2,]          NA         NA              NA           NA        NA
## [3,]   1842.1200   1365.594       2.2037736    0.4370174 1.3489514
## [4,]   1025.7024   1420.295       0.5330344    0.1898382 0.7221755
## [5,]   1228.4457   1439.578       0.7715280    0.2294275 0.8533373
## [6,]    973.1329   1419.397       0.2832855    0.1063413 0.6855960

5. Physical Characteristics

The physical characteristics considered are Latitude, Longitude, Elevation and Drainage Area.

setwd("C:/Users/rampincg/Desktop/Clarkson_Pc/cassio_PC/AGU_2019/Reports/Memo_02")

physical.characteristics<-read.table(file="PhysicalCharacteristicsTable5.txt",sep="\t",header=T,quote = "")

head(physical.characteristics)
##   Order Gauging Field             Station        River      Lat     Long
## 1     1  201001     1            Eungella        Oxley -28.3550 153.2917
## 2     2  203002     3          Repentance   Coopers Ck -28.6433 153.4100
## 3     3  203005     4           Wiangaree     Richmond -28.5067 152.9667
## 4     4  203010     5         Rock Valley    Leycester -28.7383 153.1633
## 5     5  203030     6           Rappville    Myrtle Ck -29.1117 152.9983
## 6     6  204017     7 Dorrigo No.2 & No.3 Bielsdown Ck -30.3067 152.7133
##    El Area
## 1  15  213
## 2  45   62
## 3  65  702
## 4  15  179
## 5  35  332
## 6 632   82

6. Clustering by Physical Characteristics

6.1 Physical Characteristics Distributions

#Loading clustering packages
library(cluster)
library(devtools)
library("factoextra")


#Extracting the physical characteristics and field column from the original data matrix
data.Set<-cbind(physical.characteristics[,3],physical.characteristics[,6:9])
#Setting column names
col.names<-c("Field","Lat.","Long.","Elv.","Area")  
colnames(data.Set)<-col.names


################################
##Accessing the characteristics density plots
################################
##############################################################
###DENSITIES WITH GGPLOT2#####################################
##############################################################
library(reshape2)
library(ggplot2)
library(cowplot)
## 
## ********************************************************
## Note: As of version 1.0.0, cowplot does not change the
##   default ggplot2 theme anymore. To recover the previous
##   behavior, execute:
##   theme_set(theme_cowplot())
## ********************************************************
## 
## Attaching package: 'cowplot'
## The following object is masked from 'package:lubridate':
## 
##     stamp
index=seq(from=1,to=nrow(data.Set),by=1)

dataX1<-cbind(index,data.Set[,2])
dataX1<-as.data.frame(dataX1)
cnames=c("index","LAT.")
colnames(dataX1)=cnames
dataX1<-reshape2::melt(dataX1, id.var='index')


dataX2<-cbind(index,data.Set[,3])
dataX2<-as.data.frame(dataX2)
cnames=c("index","LONG.")
colnames(dataX2)=cnames
dataX2<-reshape2::melt(dataX2, id.var='index')


dataX3<-cbind(index,data.Set[,4])
dataX3<-as.data.frame(dataX3)
cnames=c("index","Elev.")
colnames(dataX3)=cnames
dataX3<-reshape2::melt(dataX3, id.var='index')

dataX4<-cbind(index,data.Set[,5])
dataX4<-as.data.frame(dataX4)
cnames=c("index","AREA")
colnames(dataX4)=cnames
dataX4<-reshape2::melt(dataX4, id.var='index')


#Together with ggplot2
plot1<-ggplot(dataX1) + stat_density(aes(x=value, y=..scaled..,color=variable,linetype=variable),lwd=1,  geom="line",position = "identity")+
  ylab("Density")+
  xlab("LAT.")+
  xlim(-27,-35) + 
  theme_bw() +
  theme(legend.position="top")+
  theme(legend.title = element_blank())+
  theme(axis.title.x=element_text(size=14,face="bold"),axis.title.y=element_text(size=14))+
  theme(legend.text=element_text(size=14))+
  theme(axis.text.x = element_text(size = 16),axis.text.y = element_text(size = 16))


plot2<-ggplot(dataX2) + stat_density(aes(x=value, y=..scaled..,color=variable,linetype=variable),lwd=1,  geom="line",position = "identity")+
  ylab("Density")+
  xlab("LONG.")+
  xlim(142,155) +
  theme_bw() +
  theme(legend.position="top")+
  theme(legend.title = element_blank())+
  theme(axis.title.x=element_text(size=14,face="bold"),axis.title.y=element_text(size=14))+
  theme(legend.text=element_text(size=14))+
  theme(axis.text.x = element_text(size = 16),axis.text.y = element_text(size = 16))


plot3<-ggplot(dataX3) + stat_density(aes(x=value, y=..scaled..,color=variable,linetype=variable),lwd=1,  geom="line",position = "identity")+
  ylab("Density")+
  xlab("ELEV.")+
  xlim(0,1000) +
  theme_bw() +
  theme(legend.position="top")+
  theme(legend.title = element_blank())+
  theme(axis.title.x=element_text(size=14,face="bold"),axis.title.y=element_text(size=14))+
  theme(legend.text=element_text(size=14))+
  theme(axis.text.x = element_text(size = 16),axis.text.y = element_text(size = 16))

plot4<-ggplot(dataX4) + stat_density(aes(x=value, y=..scaled..,color=variable,linetype=variable),lwd=1,  geom="line",position = "identity")+
  ylab("Density")+
  xlab("AREA")+
  xlim(0,300) +
  theme_bw() +
  theme(legend.position="top")+
  theme(legend.title = element_blank())+
  theme(axis.title.x=element_text(size=14,face="bold"),axis.title.y=element_text(size=14))+
  theme(legend.text=element_text(size=14))+
  theme(axis.text.x = element_text(size = 16),axis.text.y = element_text(size = 16))


plot_grid(plot1,plot2,plot3,plot4, labels = "AUTO")

summary(data.Set)
##      Field            Lat.            Long.            Elv.       
##  Min.   :  1.0   Min.   :-38.54   Min.   :139.0   Min.   :   1.0  
##  1st Qu.: 48.5   1st Qu.:-36.61   1st Qu.:146.8   1st Qu.: 125.5  
##  Median :135.0   Median :-35.02   Median :149.3   Median : 276.0  
##  Mean   :123.5   Mean   :-33.66   Mean   :149.1   Mean   : 368.7  
##  3rd Qu.:188.5   3rd Qu.:-30.49   3rd Qu.:151.7   3rd Qu.: 542.5  
##  Max.   :240.0   Max.   :-27.30   Max.   :153.4   Max.   :1254.0  
##       Area         
##  Min.   :    51.0  
##  1st Qu.:   165.0  
##  Median :   376.0  
##  Mean   :  8908.1  
##  3rd Qu.:   678.5  
##  Max.   :502500.0

6.2 Scaling Variables Between 0 and 1

###############################
#Scaling data set
###############################
df.scaled<-data.Set[,2:5]
rownames(df.scaled)=data.Set[,1]


col1=col2=col3=col4=0

col1<-(df.scaled[,1]-min(df.scaled[,1]))/(max(df.scaled[,1])-min(df.scaled[,1]))
col2<-(df.scaled[,2]-min(df.scaled[,2]))/(max(df.scaled[,2])-min(df.scaled[,2]))
col3<-(df.scaled[,3]-min(df.scaled[,3]))/(max(df.scaled[,3])-min(df.scaled[,3]))
col4<-(df.scaled[,4]-min(df.scaled[,4]))/(max(df.scaled[,4])-min(df.scaled[,4]))


df.scaled[,1]=col1
df.scaled[,2]=col2
df.scaled[,3]=col3
df.scaled[,4]=col4
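
The column-wise min-max scaling above can be written more compactly; a minimal equivalent sketch (same result, same row and column names) is:

#Compact equivalent of the column-wise min-max scaling
df.scaled<-as.data.frame(apply(data.Set[,2:5],2,function(x)(x-min(x))/(max(x)-min(x))))
rownames(df.scaled)<-data.Set[,1]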

6.3 Clustering by Coordinates

6.3.1 Euclidean Distance Matrix

#################################################
#################################################
#Cluster by LAT and LONG.
#################################################
#################################################
data.Coordinates<-df.scaled[,1:2]
data.Coordinates<-as.data.frame(data.Coordinates)

#Distance matrix computation-Euclidean

#Euclidean Distance (computed on the coordinate columns only)
dist.eucl.lat<-dist(data.Coordinates,method="euclidean")
#Visualizing the distance matrices
fviz_dist(dist.eucl.lat)

6.3.2 Number of Clusters

fviz_nbclust(data.Coordinates,hcut,method="wss")

fviz_nbclust(data.Coordinates,hcut,method="silhouette")

fviz_nbclust(data.Coordinates,hcut,method="gap_stat")

pam.res<-pam(data.Coordinates,2)

fviz_cluster(pam.res,repel=TRUE,ggtheme=theme_classic())

library("NbClust")
nb<-NbClust(data.Coordinates,distance="euclidean",min.nc=2,max.nc=10,method="average")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 8 proposed 2 as the best number of clusters 
## * 3 proposed 3 as the best number of clusters 
## * 4 proposed 4 as the best number of clusters 
## * 6 proposed 5 as the best number of clusters 
## * 2 proposed 10 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  2 
##  
##  
## *******************************************************************
fviz_nbclust(nb)
## Among all indices: 
## ===================
## * 2 proposed  0 as the best number of clusters
## * 1 proposed  1 as the best number of clusters
## * 8 proposed  2 as the best number of clusters
## * 3 proposed  3 as the best number of clusters
## * 4 proposed  4 as the best number of clusters
## * 6 proposed  5 as the best number of clusters
## * 2 proposed  10 as the best number of clusters
## 
## Conclusion
## =========================
## * According to the majority rule, the best number of clusters is  2 .

6.3.3 Hierarchical Clustering

#AVERAGE LINKAGE, EUCLIDEAN DISTANCE
res.hc.kge2<-hclust(d=dist.eucl.lat,method="average")
fviz_dend(res.hc.kge2,cex=0.5)
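
The dendrogram alone does not assign catchments to groups. To obtain the memberships at the number of clusters suggested above (two, for the coordinates), the tree can be cut; a minimal sketch follows (the object name grp.coord is hypothetical, and the same pattern applies to the dendrograms in the later sections).

#Cutting the tree at the suggested number of clusters
grp.coord<-cutree(res.hc.kge2,k=2)
table(grp.coord)
#Dendrogram colored by the resulting groups
fviz_dend(res.hc.kge2,k=2,cex=0.5)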

6.4 Clustering by Elevation and Drainage Area

6.4.1 Euclidean Distance Matrix

#################################################
#################################################
#Clustering by Elevation and Area
#################################################
#################################################
data.Elevation<-df.scaled[,3:4]
data.Elevation<-as.data.frame(data.Elevation)

#Distance matrix computation-Euclidean

#Euclidean Distance
dist.eucl.elev<-dist(data.Elevation,method="euclidean")
#Visualizing the distance matrices
fviz_dist(dist.eucl.elev)

6.4.2 Number of Clusters

fviz_nbclust(data.Elevation,hcut,method="wss")

fviz_nbclust(data.Elevation,hcut,method="silhouette")

fviz_nbclust(data.Elevation,hcut,method="gap_stat")

pam.res<-pam(data.Elevation,3)

fviz_cluster(pam.res,repel=TRUE,ggtheme=theme_classic())

library("NbClust")
nb<-NbClust(data.Elevation,distance="euclidean",min.nc=2,max.nc=10,method="average")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 3 proposed 2 as the best number of clusters 
## * 8 proposed 3 as the best number of clusters 
## * 1 proposed 4 as the best number of clusters 
## * 3 proposed 5 as the best number of clusters 
## * 3 proposed 6 as the best number of clusters 
## * 1 proposed 8 as the best number of clusters 
## * 1 proposed 9 as the best number of clusters 
## * 4 proposed 10 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  3 
##  
##  
## *******************************************************************
fviz_nbclust(nb)
## Among all indices: 
## ===================
## * 2 proposed  0 as the best number of clusters
## * 3 proposed  2 as the best number of clusters
## * 8 proposed  3 as the best number of clusters
## * 1 proposed  4 as the best number of clusters
## * 3 proposed  5 as the best number of clusters
## * 3 proposed  6 as the best number of clusters
## * 1 proposed  8 as the best number of clusters
## * 1 proposed  9 as the best number of clusters
## * 4 proposed  10 as the best number of clusters
## 
## Conclusion
## =========================
## * According to the majority rule, the best number of clusters is  3 .

6.4.3 Hierarchical Clustering

#AVERAGE LINKAGE, EUCLIDEAN DISTANCE
res.hc.kge2<-hclust(d=dist.eucl.elev,method="average")
fviz_dend(res.hc.kge2,cex=0.5)

6.5 Clustering by All Physical Characteristics

6.5.1 Euclidean Distance Matrix

#Euclidean Distance
dist.eucl<-dist(df.scaled,method="euclidean")
#Visualizing the distance matrices
fviz_dist(dist.eucl)

6.5.2 Number of Clusters

fviz_nbclust(df.scaled,hcut,method="wss")

fviz_nbclust(df.scaled,hcut,method="silhouette")

fviz_nbclust(df.scaled,hcut,method="gap_stat")

pam.res<-pam(df.scaled,5)

fviz_cluster(pam.res,repel=TRUE,ggtheme=theme_classic())

6.5.3 Hierarchical Clustering

res.hc.kge2<-hclust(d=dist.eucl,method="average")
fviz_dend(res.hc.kge2,cex=0.5)

7. Clustering by Hydrologic Signatures

7.1 Hydrologic Signatures Distributions

data.signatures<-signatures.data.frame
data.signatures<-cbind(gauging.number,signatures.data.frame)
data.signatures<-data.signatures[complete.cases(data.signatures), ]
row.names<-data.signatures[,1]
data.signatures<-data.signatures[,-1]
rownames(data.signatures)<-row.names


################################
##Accessing the signatures density plots
################################
##############################################################
###DENSITIES WITH GGPLOT2#####################################
##############################################################

index=seq(from=1,to=nrow(data.signatures),by=1)

dataX1<-cbind(index,data.signatures[,1])
dataX1<-as.data.frame(dataX1)
cnames=c("index","Annual Rain")
colnames(dataX1)=cnames
dataX1<-reshape2::melt(dataX1, id.var='index')


dataX2<-cbind(index,data.signatures[,2])
dataX2<-as.data.frame(dataX2)
cnames=c("index","Annual PET")
colnames(dataX2)=cnames
dataX2<-reshape2::melt(dataX2, id.var='index')


dataX3<-cbind(index,data.signatures[,3])
dataX3<-as.data.frame(dataX3)
cnames=c("index","Mean Discharge")
colnames(dataX3)=cnames
dataX3<-reshape2::melt(dataX3, id.var='index')

dataX4<-cbind(index,data.signatures[,4])
dataX4<-as.data.frame(dataX4)
cnames=c("index","Runnof Ratio")
colnames(dataX4)=cnames
dataX4<-reshape2::melt(dataX4, id.var='index')

dataX5<-cbind(index,data.signatures[,5])
dataX5<-as.data.frame(dataX5)
cnames=c("index","AI")
colnames(dataX5)=cnames
dataX5<-reshape2::melt(dataX5, id.var='index')




#Together with ggplot2
plot1<-ggplot(dataX1) + stat_density(aes(x=value, y=..scaled..,color=variable,linetype=variable),lwd=1,  geom="line",position = "identity")+
  ylab("Density")+
  xlab("Annual Rain")+
  xlim(300,2000) + 
  theme_bw() +
  theme(legend.position="top")+
  theme(legend.title = element_blank())+
  theme(axis.title.x=element_text(size=14,face="bold"),axis.title.y=element_text(size=10))+
  theme(legend.text=element_text(size=14))+
  theme(axis.text.x = element_text(size = 10),axis.text.y = element_text(size = 10))


plot2<-ggplot(dataX2) + stat_density(aes(x=value, y=..scaled..,color=variable,linetype=variable),lwd=1,  geom="line",position = "identity")+
  ylab("Density")+
  xlab("Annual PET")+
  xlim(1000,1500) +
  theme_bw() +
  theme(legend.position="top")+
  theme(legend.title = element_blank())+
  theme(axis.title.x=element_text(size=14,face="bold"),axis.title.y=element_text(size=14))+
  theme(legend.text=element_text(size=14))+
  theme(axis.text.x = element_text(size = 8),axis.text.y = element_text(size = 8))


plot3<-ggplot(dataX3) + stat_density(aes(x=value, y=..scaled..,color=variable,linetype=variable),lwd=1,  geom="line",position = "identity")+
  ylab("Density")+
  xlab("Mean Discharge")+
  xlim(0,4) +
  theme_bw() +
  theme(legend.position="top")+
  theme(legend.title = element_blank())+
  theme(axis.title.x=element_text(size=14,face="bold"),axis.title.y=element_text(size=14))+
  theme(legend.text=element_text(size=14))+
  theme(axis.text.x = element_text(size = 10),axis.text.y = element_text(size = 10))

plot4<-ggplot(dataX4) + stat_density(aes(x=value, y=..scaled..,color=variable,linetype=variable),lwd=1,  geom="line",position = "identity")+
  ylab("Density")+
  xlab("Runnof Ratio")+
  xlim(0,1) +
  theme_bw() +
  theme(legend.position="top")+
  theme(legend.title = element_blank())+
  theme(axis.title.x=element_text(size=14,face="bold"),axis.title.y=element_text(size=14))+
  theme(legend.text=element_text(size=14))+
  theme(axis.text.x = element_text(size = 10),axis.text.y = element_text(size = 10))

plot5<-ggplot(dataX5) + stat_density(aes(x=value, y=..scaled..,color=variable,linetype=variable),lwd=1,  geom="line",position = "identity")+
  ylab("Density")+
  xlab("AI")+
  xlim(0,2) +
  theme_bw() +
  theme(legend.position="top")+
  theme(legend.title = element_blank())+
  theme(axis.title.x=element_text(size=14,face="bold"),axis.title.y=element_text(size=14))+
  theme(legend.text=element_text(size=14))+
  theme(axis.text.x = element_text(size = 10),axis.text.y = element_text(size = 16))


plot_grid(plot1,plot2,plot3,plot4,plot5, labels = "AUTO")

summary(data.signatures)
##   annual.Rain       annual.Pet   mean.discharges     runnof.ratio     
##  Min.   : 385.4   Min.   :1009   Min.   :0.002399   Min.   :0.002217  
##  1st Qu.: 716.1   1st Qu.:1141   1st Qu.:0.139697   1st Qu.:0.069136  
##  Median : 865.1   Median :1212   Median :0.287660   Median :0.128169  
##  Mean   : 918.4   Mean   :1225   Mean   :0.535170   Mean   :0.174394  
##  3rd Qu.:1077.1   3rd Qu.:1294   3rd Qu.:0.671439   3rd Qu.:0.218621  
##  Max.   :1972.9   Max.   :1491   Max.   :3.853985   Max.   :0.933637  
##        ai        
##  Min.   :0.3087  
##  1st Qu.:0.5709  
##  Median :0.7095  
##  Mean   :0.7574  
##  3rd Qu.:0.8800  
##  Max.   :1.5464

7.2 Scaling Variables

###############################
#Scaling data set
###############################
df.scaled.sig<-data.signatures



col1=col2=col3=col4=col5=0

col1<-(df.scaled.sig[,1]-min(df.scaled.sig[,1]))/(max(df.scaled.sig[,1])-min(df.scaled.sig[,1]))
col2<-(df.scaled.sig[,2]-min(df.scaled.sig[,2]))/(max(df.scaled.sig[,2])-min(df.scaled.sig[,2]))
col3<-(df.scaled.sig[,3]-min(df.scaled.sig[,3]))/(max(df.scaled.sig[,3])-min(df.scaled.sig[,3]))
col4<-(df.scaled.sig[,4]-min(df.scaled.sig[,4]))/(max(df.scaled.sig[,4])-min(df.scaled.sig[,4]))
col5<-(df.scaled.sig[,5]-min(df.scaled.sig[,5]))/(max(df.scaled.sig[,5])-min(df.scaled.sig[,5]))


df.scaled.sig[,1]=col1
df.scaled.sig[,2]=col2
df.scaled.sig[,3]=col3
df.scaled.sig[,4]=col4
df.scaled.sig[,5]=col5

7.3 Clustering by Annual Rain and Annual PET

7.3.1 Euclidean Distance Matrix

#################################################
#################################################
#Clustering by Rain and PET
#################################################
#################################################
data.rain.pet<-df.scaled.sig[,1:2]
data.rain.pet<-as.data.frame(data.rain.pet)

#Distance matrix computation-Euclidean

#Euclidean Distance
dist.eucl.rain.pet<-dist(data.rain.pet,method="euclidean")
#Visualizing the distance matrices
fviz_dist(dist.eucl.rain.pet)

7.3.2 Number of Clusters

#################################################
#################################################
##Number of Clusters####################
#################################################
#################################################

fviz_nbclust(data.rain.pet,hcut,method="wss")

fviz_nbclust(data.rain.pet,hcut,method="silhouette")

fviz_nbclust(data.rain.pet,hcut,method="gap_stat")

pam.res<-pam(data.rain.pet,5)

fviz_cluster(pam.res,repel=TRUE,ggtheme=theme_classic())

library("NbClust")
nb<-NbClust(data.rain.pet,distance="euclidean",min.nc=2,max.nc=10,method="average")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 5 proposed 2 as the best number of clusters 
## * 4 proposed 3 as the best number of clusters 
## * 2 proposed 4 as the best number of clusters 
## * 7 proposed 5 as the best number of clusters 
## * 1 proposed 8 as the best number of clusters 
## * 3 proposed 9 as the best number of clusters 
## * 1 proposed 10 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  5 
##  
##  
## *******************************************************************
fviz_nbclust(nb)
## Among all indices: 
## ===================
## * 2 proposed  0 as the best number of clusters
## * 1 proposed  1 as the best number of clusters
## * 5 proposed  2 as the best number of clusters
## * 4 proposed  3 as the best number of clusters
## * 2 proposed  4 as the best number of clusters
## * 7 proposed  5 as the best number of clusters
## * 1 proposed  8 as the best number of clusters
## * 3 proposed  9 as the best number of clusters
## * 1 proposed  10 as the best number of clusters
## 
## Conclusion
## =========================
## * According to the majority rule, the best number of clusters is  5 .

7.3.3 Hierarchical Clustering

#AVERAGE LINKAGE, EUCLIDEAN DISTANCE
res.hc.kge2<-hclust(d=dist.eucl.rain.pet,method="average")
fviz_dend(res.hc.kge2,cex=0.5)

7.4 Clustering by Annual Rain and Mean Discharge

7.4.1 Euclidean Distance Matrix

#################################################
#################################################
#Clustering by Annual Rain and Mean Discharge
#################################################
#################################################
data.rain.discharge<-cbind(df.scaled.sig[,1],df.scaled.sig[,3])
data.rain.discharge<-as.data.frame(data.rain.discharge)

#Distance matrix computation-Euclidean

#Euclidean Distance
dist.eucl.rain.discharge<-dist(data.rain.discharge,method="euclidean")
#Visualizing the distance matrices
fviz_dist(dist.eucl.rain.discharge)

7.4.2 Number of Clusters

#################################################
#################################################
##Number of Clusters####################
#################################################
#################################################

fviz_nbclust(data.rain.discharge,hcut,method="wss")

fviz_nbclust(data.rain.discharge,hcut,method="silhouette")

fviz_nbclust(data.rain.discharge,hcut,method="gap_stat")

pam.res<-pam(data.rain.discharge,2)

fviz_cluster(pam.res,repel=TRUE,ggtheme=theme_classic())

library("NbClust")
nb<-NbClust(data.rain.discharge,distance="euclidean",min.nc=2,max.nc=10,method="average")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 8 proposed 2 as the best number of clusters 
## * 2 proposed 3 as the best number of clusters 
## * 4 proposed 5 as the best number of clusters 
## * 5 proposed 6 as the best number of clusters 
## * 3 proposed 9 as the best number of clusters 
## * 2 proposed 10 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  2 
##  
##  
## *******************************************************************
fviz_nbclust(nb)
## Among all indices: 
## ===================
## * 2 proposed  0 as the best number of clusters
## * 8 proposed  2 as the best number of clusters
## * 2 proposed  3 as the best number of clusters
## * 4 proposed  5 as the best number of clusters
## * 5 proposed  6 as the best number of clusters
## * 3 proposed  9 as the best number of clusters
## * 2 proposed  10 as the best number of clusters
## 
## Conclusion
## =========================
## * According to the majority rule, the best number of clusters is  2 .

7.4.3 Hierarchical Clustering

#################################################
#################################################
#Part III - Hierarchical Clustering
#################################################
#################################################

#AVERAGE LINKAGE, EUCLIDEAN DISTANCE
res.hc.kge2<-hclust(d=dist.eucl.rain.discharge,method="average")
fviz_dend(res.hc.kge2,cex=0.5)

7.5 Clustering by Runoff Ratio and Aridity Index

7.5.1 Euclidean Distance Matrix

#################################################
#################################################
#Clustering by Runoff Ratio and Aridity Index
#################################################
#################################################
data.runnof.ratio<-cbind(df.scaled.sig[,4],df.scaled.sig[,5])
data.runnof.ratio<-as.data.frame(data.runnof.ratio)

#Distance matrix computation-Euclidean

#Euclidean Distance
dist.eucl.runnof.ratio<-dist(data.runnof.ratio,method="euclidean")
#Visualizing the distance matrices
fviz_dist(dist.eucl.runnof.ratio)

7.5.2 Number of Clusters

#################################################
#################################################
##Number of Clusters####################
#################################################
#################################################

fviz_nbclust(data.runnof.ratio,hcut,method="wss")

fviz_nbclust(data.runnof.ratio,hcut,method="silhouette")

fviz_nbclust(data.runnof.ratio,hcut,method="gap_stat")

pam.res<-pam(data.runnof.ratio,3)

fviz_cluster(pam.res,repel=TRUE,ggtheme=theme_classic())

library("NbClust")
nb<-NbClust(data.runnof.ratio,distance="euclidean",min.nc=2,max.nc=10,method="average")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 6 proposed 2 as the best number of clusters 
## * 9 proposed 3 as the best number of clusters 
## * 2 proposed 4 as the best number of clusters 
## * 5 proposed 8 as the best number of clusters 
## * 1 proposed 9 as the best number of clusters 
## * 1 proposed 10 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  3 
##  
##  
## *******************************************************************
fviz_nbclust(nb)
## Among all indices: 
## ===================
## * 2 proposed  0 as the best number of clusters
## * 6 proposed  2 as the best number of clusters
## * 9 proposed  3 as the best number of clusters
## * 2 proposed  4 as the best number of clusters
## * 5 proposed  8 as the best number of clusters
## * 1 proposed  9 as the best number of clusters
## * 1 proposed  10 as the best number of clusters
## 
## Conclusion
## =========================
## * According to the majority rule, the best number of clusters is  3 .

7.5.3 Hierarchical Clustering

#################################################
#################################################
#Part III - Hierarchical Clustering
#################################################
#################################################

#AVERAGE LINKAGE, EUCLIDEAN DISTANCE
res.hc.kge2<-hclust(d=dist.eucl.runnof.ratio,method="average")
fviz_dend(res.hc.kge2,cex=0.5)

7.6 Clustering by All Hydrologic Signatures

7.6.1 Euclidean Distance Matrix

#Euclidean Distance
dist.eucl<-dist(df.scaled.sig,method="euclidean")
#Visualizing the distance matrices
fviz_dist(dist.eucl)

7.6.2 Number of Clusters

#################################################
#################################################
##Number of Clusters####################
#################################################
#################################################

fviz_nbclust(df.scaled.sig,hcut,method="wss")

fviz_nbclust(df.scaled.sig,hcut,method="silhouette")

fviz_nbclust(df.scaled.sig,hcut,method="gap_stat")

library("NbClust")
nb<-NbClust(df.scaled.sig,distance="euclidean",min.nc=2,max.nc=10,method="average")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 6 proposed 2 as the best number of clusters 
## * 3 proposed 3 as the best number of clusters 
## * 11 proposed 4 as the best number of clusters 
## * 1 proposed 5 as the best number of clusters 
## * 1 proposed 6 as the best number of clusters 
## * 2 proposed 10 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  4 
##  
##  
## *******************************************************************
fviz_nbclust(nb)
## Among all indices: 
## ===================
## * 2 proposed  0 as the best number of clusters
## * 6 proposed  2 as the best number of clusters
## * 3 proposed  3 as the best number of clusters
## * 11 proposed  4 as the best number of clusters
## * 1 proposed  5 as the best number of clusters
## * 1 proposed  6 as the best number of clusters
## * 2 proposed  10 as the best number of clusters
## 
## Conclusion
## =========================
## * According to the majority rule, the best number of clusters is  4 .

pam.res<-pam(df.scaled.sig,4)

fviz_cluster(pam.res,repel=TRUE,ggtheme=theme_classic())

7.6.3 Hierarchical Clustering

#AVERAGE LINKAGE, EUCLIDEAN DISTANCE
res.hc.kge2<-hclust(d=dist.eucl,method="average")
fviz_dend(res.hc.kge2,cex=0.5)