This Memo aims to organize and briefly presents the main simulations and assumptions regarding a cluster analysis developed for 159 gauged catchments located in southeastern Australia. A conceptual rainfall-runoff model is calibrated for each of the catchments and hierarchical clustering is performed to classify catchments based on their Physical characteristics (Latitude, Longitude, Drainage Area and Elevation) and Hydrologic Signatures (Mean of Total Annual Runoff, Mean of Total Annual Potential Evapotranspiration, Mean of Total annual runoff, Runoff Ratio and Aridity index).
Most of the used code and text employed here comes from the book Pratical Guide To Cluster Analysis in R, Unsupervised Machine Learning, by Alboukadel Kassambara, Edition 1 (2017) and for the sake of simplicity the references will be omitted.
In addition, the book Regionalization of Watersheds-An Approach Based on Cluster Analysis, by A. Ramachandra Rao and V.V. Srinivas, (2008) and the book Data Mining Concepts and Techniques by Jiawei Han, Micheline Kamber, and Jian Pei, Edition 3, (2012) were also used as reference.
This code chuck accesses the gauging stations, precipitation, discharges and potential evapotranspiration times series that are saved in MATLAB files.
#################################################
#################################################
#Part I - Accessing the precipitation, discharges
#and potential evapotranspiration data
#################################################
#################################################
#Clean R memory
rm(list=ls(all=TRUE))
#Loading packge to read .mat files
library(R.matlab)
## Registered S3 method overwritten by 'R.oo':
## method from
## throw.default R.methodsS3
## R.matlab v3.6.2 (2018-09-26) successfully loaded. See ?R.matlab for help.
##
## Attaching package: 'R.matlab'
## The following objects are masked from 'package:base':
##
## getOption, isOpen
#Loading clustering packages
library(cluster)
library(devtools)
## Loading required package: usethis
library(factoextra)
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
#Setting the base working directory where all the sub-folders
# are saved
setwd("C:/Users/rampincg/Desktop/Clarkson_Pc/cassio_PC/AGU_2019/dds_seaus240_gr4j/_gr4j_kge")
# Saving the path of the base directory
base.Directory<-getwd()
files<-list.files()
data.base.mat<-readMat(files[length(files)])
data.base.mat<-data.base.mat[[1]]
###########################
#Extracting gauging number
###########################
gauging.number<-0
total.gauging<-length(data.base.mat)/7
i=1
j=1
while(i<=total.gauging){
gauging.number[i]=data.base.mat[[j]]
j=j+7
i=i+1
}
gauging.number=as.numeric(gauging.number)
###########################
#Extracting discharges lists
###########################
discharges<-0
discharges<-as.list(discharges)
temp=0
i=1
j=5
while(i<=total.gauging){
temp=data.base.mat[[j]]
discharges[[i]]=list(temp)
j=j+7
i=i+1
}
###########################
#Extracting precipitation lists
###########################
precipitation<-0
precipitation<-as.list(precipitation)
temp=0
i=1
j=6
while(i<=total.gauging){
temp=data.base.mat[[j]]
precipitation[[i]]=list(temp)
j=j+7
i=i+1
}
###############################################
#Extracting potential evapotranspiration lists
##################################################
pet<-0
pet<-as.list(pet)
temp=0
i=1
j=7
while(i<=total.gauging){
temp=data.base.mat[[j]]
pet[[i]]=list(temp)
j=j+7
i=i+1
}
#################################################
#################################################
#Part II - Extracting initial and final date from
#the time series
#################################################
#################################################
initial.year=final.year=0
initial.month=final.month=0
initial.day=final.day=0
i=1
j=2
k=1
empty.gauging=0
while(i<=total.gauging){
#Year
year.temp=data.base.mat[[j]]
if(length(year.temp)==0){
initial.year[i]=NA
final.year[i]=NA
year.temp=0
empty.gauging[k]=k
#Month
initial.month[i]=NA
final.month[i]=NA
#Day
initial.day[i]=NA
final.day[i]=NA
j=j+7
i=i+1
k=k+1
}
else{
year.temp=data.base.mat[[j]]
initial.year[i]=year.temp[1]
final.year[i]=year.temp[length(year.temp)]
year.temp=0
empty.gauging[k]=0
#Month
month.temp=data.base.mat[[j+1]]
initial.month[i]=month.temp[1]
final.month[i]=month.temp[length(month.temp)]
month.temp=0
#Day
day.temp=data.base.mat[[j+2]]
initial.day[i]=day.temp[1]
final.day[i]=day.temp[length(day.temp)]
month.temp=0
j=j+7
i=i+1
k=k+1
}
}
This code chuck extracts the initial and final date from the time series. The day, month and year are saved in MATLAB files.
#################################################
#################################################
#Part II - Extracting initial and final date from
#the time series
#################################################
#################################################
initial.year=final.year=0
initial.month=final.month=0
initial.day=final.day=0
i=1
j=2
k=1
empty.gauging=0
while(i<=total.gauging){
#Year
year.temp=data.base.mat[[j]]
if(length(year.temp)==0){
initial.year[i]=NA
final.year[i]=NA
year.temp=0
empty.gauging[k]=k
#Month
initial.month[i]=NA
final.month[i]=NA
#Day
initial.day[i]=NA
final.day[i]=NA
j=j+7
i=i+1
k=k+1
}
else{
year.temp=data.base.mat[[j]]
initial.year[i]=year.temp[1]
final.year[i]=year.temp[length(year.temp)]
year.temp=0
empty.gauging[k]=0
#Month
month.temp=data.base.mat[[j+1]]
initial.month[i]=month.temp[1]
final.month[i]=month.temp[length(month.temp)]
month.temp=0
#Day
day.temp=data.base.mat[[j+2]]
initial.day[i]=day.temp[1]
final.day[i]=day.temp[length(day.temp)]
month.temp=0
j=j+7
i=i+1
k=k+1
}
}
The code chunk below computes the following signatures for each site: Mean of the total annual rain, Mean of the total potential evapotranspiration, mean of the daily discharges, runnof ration and the aridity index.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tibble)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
#####################################################################################
#####################################################################################
#####################################################################################
#Computing Signatures
#####################################################################################
#####################################################################################
mean.Rain=0#Mean daily rain
mean.Pet=0#Mean daily PET
mean.discharges=0#Mean daily discharges
temp=0#dummy varibale
annual.Rain=0# mean of all total annual precipitation
annual.Pet=0#mean of all total annual PET
annual.Q=0#mean of all total Q
#Mean daily Rain
for(i in 1:length(precipitation)){
empty.gauging[i]
if(empty.gauging[i]==0){
temp=precipitation[[i]]
temp=temp[[1]]
mean.Rain[i]=mean(temp)
initial.date=c(initial.year[i],initial.month[i],initial.day[i])
initial.date=paste(initial.date,collapse="-")
final.date=c(final.year[i],final.month[i],final.day[i])
final.date=paste(final.date,collapse="-")
period<-data_frame(date=seq(as.Date(initial.date), as.Date(final.date), by=1))
P=cbind(period,temp)
#Rename columns
colnames(P)<- c("date", "plu")
#Reorder columns from days to month
P.annual=P %>% group_by(month=floor_date(date, "year"))%>%summarize(plu=sum(plu))
annual.Rain[i]=mean(P.annual$plu)
temp=0
initial.date=0
final.date=0
period=0
P=0
P.annual=0
}
else{
mean.Rain[i]=NA
annual.Rain[i]=NA
}
}
#Mean daily pet
for(i in 1:length(pet)){
empty.gauging[i]
if(empty.gauging[i]==0){
temp=pet[[i]]
temp=temp[[1]]
mean.Pet[i]=mean(temp)
initial.date=c(initial.year[i],initial.month[i],initial.day[i])
initial.date=paste(initial.date,collapse="-")
final.date=c(final.year[i],final.month[i],final.day[i])
final.date=paste(final.date,collapse="-")
period<-data_frame(date=seq(as.Date(initial.date), as.Date(final.date), by=1))
P=cbind(period,temp)
#Rename columns
colnames(P)<- c("date", "plu")
#Reorder columns from days to month
P.annual=P %>% group_by(month=floor_date(date, "year"))%>%summarize(plu=sum(plu))
annual.Pet[i]=mean(P.annual$plu)
temp=0
initial.date=0
final.date=0
period=0
P=0
P.annual=0
}
else{
mean.Pet[i]=NA
annual.Pet[i]=NA
}
}
#Mean daily discharge
for(i in 1:length(discharges)){
empty.gauging[i]
if(empty.gauging[i]==0){
temp=discharges[[i]]
temp=temp[[1]]
mean.discharges[i]=mean(temp)
initial.date=c(initial.year[i],initial.month[i],initial.day[i])
initial.date=paste(initial.date,collapse="-")
final.date=c(final.year[i],final.month[i],final.day[i])
final.date=paste(final.date,collapse="-")
period<-data_frame(date=seq(as.Date(initial.date), as.Date(final.date), by=1))
P=cbind(period,temp)
#Rename columns
colnames(P)<- c("date", "plu")
#Reorder columns from days to month
P.annual=P %>% group_by(month=floor_date(date, "year"))%>%summarize(plu=sum(plu))
annual.Q[i]=mean(P.annual$plu)
temp=0
initial.date=0
final.date=0
period=0
P=0
P.annual=0
}
else{
mean.discharges[i]=NA
annual.Q[i]=NA
}
}
#Computing aridity index
ai=annual.Rain/annual.Pet
runnof.ratio=annual.Q/annual.Rain
#Creating a data frame with all discharges
signatures.data.frame<-cbind(annual.Rain,annual.Pet,mean.discharges,runnof.ratio,ai)
head(signatures.data.frame)
## annual.Rain annual.Pet mean.discharges runnof.ratio ai
## [1,] 1543.7029 1343.543 1.5662172 0.3706278 1.1489787
## [2,] NA NA NA NA NA
## [3,] 1842.1200 1365.594 2.2037736 0.4370174 1.3489514
## [4,] 1025.7024 1420.295 0.5330344 0.1898382 0.7221755
## [5,] 1228.4457 1439.578 0.7715280 0.2294275 0.8533373
## [6,] 973.1329 1419.397 0.2832855 0.1063413 0.6855960
The considered physical characteristics are Latitude, Longitude, Elevation and Drainage Area.
setwd("C:/Users/rampincg/Desktop/Clarkson_Pc/cassio_PC/AGU_2019/Reports/Memo_02")
physical.characteristics<-read.table(file="PhysicalCharacteristicsTable5.txt",sep="\t",header=T,quote = "")
head(physical.characteristics)
## Order Gauging Field Station River Lat Long
## 1 1 201001 1 Eungella Oxley -28.3550 153.2917
## 2 2 203002 3 Repentance Coopers Ck -28.6433 153.4100
## 3 3 203005 4 Wiangaree Richmond -28.5067 152.9667
## 4 4 203010 5 Rock Valley Leycester -28.7383 153.1633
## 5 5 203030 6 Rappville Myrtle Ck -29.1117 152.9983
## 6 6 204017 7 Dorrigo No.2 & No.3 Bielsdown Ck -30.3067 152.7133
## El Area
## 1 15 213
## 2 45 62
## 3 65 702
## 4 15 179
## 5 35 332
## 6 632 82
#Loading clustering packages
library(cluster)
library(devtools)
library("factoextra")
#Extracting the physical characteristics and field column from the original data matrix
data.Set<-cbind(physical.characteristics[,3],physical.characteristics[,6:9])
#Setting collumns names
col.names<-c("Field","Lat.","Long.","Elv.","Area")
colnames(data.Set)<-col.names
################################
##Acessing characteristics histograms
################################
##############################################################
###DENSITIES WITH GGPLOT2#####################################
##############################################################
library(reshape2)
library(ggplot2)
library(cowplot)
##
## ********************************************************
## Note: As of version 1.0.0, cowplot does not change the
## default ggplot2 theme anymore. To recover the previous
## behavior, execute:
## theme_set(theme_cowplot())
## ********************************************************
##
## Attaching package: 'cowplot'
## The following object is masked from 'package:lubridate':
##
## stamp
index=seq(from=1,to=nrow(data.Set),by=1)
dataX1<-cbind(index,data.Set[,2])
dataX1<-as.data.frame(dataX1)
cnames=c("index","LAT.")
colnames(dataX1)=cnames
dataX1<-reshape2::melt(dataX1, id.var='index')
dataX2<-cbind(index,data.Set[,3])
dataX2<-as.data.frame(dataX2)
cnames=c("index","LONG.")
colnames(dataX2)=cnames
dataX2<-reshape2::melt(dataX2, id.var='index')
dataX3<-cbind(index,data.Set[,4])
dataX3<-as.data.frame(dataX3)
cnames=c("index","Elev.")
colnames(dataX3)=cnames
dataX3<-reshape2::melt(dataX3, id.var='index')
dataX4<-cbind(index,data.Set[,5])
dataX4<-as.data.frame(dataX4)
cnames=c("index","AREA")
colnames(dataX4)=cnames
dataX4<-reshape2::melt(dataX4, id.var='index')
#Together with ggplot2
plot1<-ggplot(dataX1) + stat_density(aes(x=value, y=..scaled..,color=variable,linetype=variable),lwd=1, geom="line",position = "identity")+
ylab("Density")+
xlab("LAT.")+
xlim(-27,-35) +
theme_bw() +
theme(legend.position="top")+
theme(legend.title = element_blank())+
theme(axis.title.x=element_text(size=14,face="bold"),axis.title.y=element_text(size=14))+
theme(legend.text=element_text(size=14))+
theme(axis.text.x = element_text(size = 16),axis.text.y = element_text(size = 16))
plot2<-ggplot(dataX2) + stat_density(aes(x=value, y=..scaled..,color=variable,linetype=variable),lwd=1, geom="line",position = "identity")+
ylab("Density")+
xlab("LONG.")+
xlim(142,155) +
theme_bw() +
theme(legend.position="top")+
theme(legend.title = element_blank())+
theme(axis.title.x=element_text(size=14,face="bold"),axis.title.y=element_text(size=14))+
theme(legend.text=element_text(size=14))+
theme(axis.text.x = element_text(size = 16),axis.text.y = element_text(size = 16))
plot3<-ggplot(dataX3) + stat_density(aes(x=value, y=..scaled..,color=variable,linetype=variable),lwd=1, geom="line",position = "identity")+
ylab("Density")+
xlab("ELEV.")+
xlim(0,1000) +
theme_bw() +
theme(legend.position="top")+
theme(legend.title = element_blank())+
theme(axis.title.x=element_text(size=14,face="bold"),axis.title.y=element_text(size=14))+
theme(legend.text=element_text(size=14))+
theme(axis.text.x = element_text(size = 16),axis.text.y = element_text(size = 16))
plot4<-ggplot(dataX4) + stat_density(aes(x=value, y=..scaled..,color=variable,linetype=variable),lwd=1, geom="line",position = "identity")+
ylab("Density")+
xlab("AREA")+
xlim(0,300) +
theme_bw() +
theme(legend.position="top")+
theme(legend.title = element_blank())+
theme(axis.title.x=element_text(size=14,face="bold"),axis.title.y=element_text(size=14))+
theme(legend.text=element_text(size=14))+
theme(axis.text.x = element_text(size = 16),axis.text.y = element_text(size = 16))
plot_grid(plot1,plot2,plot3,plot4, labels = "AUTO")
summary(data.Set)
## Field Lat. Long. Elv.
## Min. : 1.0 Min. :-38.54 Min. :139.0 Min. : 1.0
## 1st Qu.: 48.5 1st Qu.:-36.61 1st Qu.:146.8 1st Qu.: 125.5
## Median :135.0 Median :-35.02 Median :149.3 Median : 276.0
## Mean :123.5 Mean :-33.66 Mean :149.1 Mean : 368.7
## 3rd Qu.:188.5 3rd Qu.:-30.49 3rd Qu.:151.7 3rd Qu.: 542.5
## Max. :240.0 Max. :-27.30 Max. :153.4 Max. :1254.0
## Area
## Min. : 51.0
## 1st Qu.: 165.0
## Median : 376.0
## Mean : 8908.1
## 3rd Qu.: 678.5
## Max. :502500.0
###############################
#Scaling data set
###############################
df.scaled<-data.Set[,2:5]
rownames(df.scaled)=data.Set[,1]
col1=col2=col3=col4=0
col1<-(df.scaled[,1]-min(df.scaled[,1]))/(max(df.scaled[,1])-min(df.scaled[,1]))
col2<-(df.scaled[,2]-min(df.scaled[,2]))/(max(df.scaled[,2])-min(df.scaled[,2]))
col3<-(df.scaled[,3]-min(df.scaled[,3]))/(max(df.scaled[,3])-min(df.scaled[,3]))
col4<-(df.scaled[,4]-min(df.scaled[,4]))/(max(df.scaled[,4])-min(df.scaled[,4]))
df.scaled[,1]=col1
df.scaled[,2]=col2
df.scaled[,3]=col3
df.scaled[,4]=col4
#################################################
#################################################
#Cluster by LAT and LONG.
#################################################
#################################################
data.Coordinates<-df.scaled[,1:2]
data.Coordinates<-as.data.frame(data.Coordinates)
#Distance matrix computation-Euclidean
#Euclidian Distance
dist.eucl.lat<-dist(df.scaled,method="euclidean")
#Visualizing the distance matrices
fviz_dist(dist.eucl.lat)
fviz_nbclust(data.Coordinates,hcut,method="wss")
fviz_nbclust(data.Coordinates,hcut,method="silhouette")
fviz_nbclust(data.Coordinates,hcut,method="gap_stat")
pam.res<-pam(data.Coordinates,2)
fviz_cluster(pam.res,repel=TRUE,ggtheme=theme_classic())
library("NbClust")
nb<-NbClust(data.Coordinates,distance="euclidean",min.nc=2,max.nc=10,method="average")
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 8 proposed 2 as the best number of clusters
## * 3 proposed 3 as the best number of clusters
## * 4 proposed 4 as the best number of clusters
## * 6 proposed 5 as the best number of clusters
## * 2 proposed 10 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 2
##
##
## *******************************************************************
fviz_nbclust(nb)
## Among all indices:
## ===================
## * 2 proposed 0 as the best number of clusters
## * 1 proposed 1 as the best number of clusters
## * 8 proposed 2 as the best number of clusters
## * 3 proposed 3 as the best number of clusters
## * 4 proposed 4 as the best number of clusters
## * 6 proposed 5 as the best number of clusters
## * 2 proposed 10 as the best number of clusters
##
## Conclusion
## =========================
## * According to the majority rule, the best number of clusters is 2 .
#AVERAGE LINKAGE CORRELATION
res.hc.kge2<-hclust(d=dist.eucl.lat,method="average")
fviz_dend(res.hc.kge2,cex=0.5)
#################################################
#################################################
#Clustering by Elevation and Area
#################################################
#################################################
data.Elevation<-df.scaled[,3:4]
data.Elevation<-as.data.frame(data.Elevation)
#Distance matrix computation-Euclidean
#Euclidian Distance
dist.eucl.elev<-dist(data.Elevation,method="euclidean")
#Visualizing the distance matrices
fviz_dist(dist.eucl.elev)
fviz_nbclust(data.Elevation,hcut,method="wss")
fviz_nbclust(data.Elevation,hcut,method="silhouette")
fviz_nbclust(data.Elevation,hcut,method="gap_stat")
pam.res<-pam(data.Elevation,3)
fviz_cluster(pam.res,repel=TRUE,ggtheme=theme_classic())
library("NbClust")
nb<-NbClust(data.Elevation,distance="euclidean",min.nc=2,max.nc=10,method="average")
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 3 proposed 2 as the best number of clusters
## * 8 proposed 3 as the best number of clusters
## * 1 proposed 4 as the best number of clusters
## * 3 proposed 5 as the best number of clusters
## * 3 proposed 6 as the best number of clusters
## * 1 proposed 8 as the best number of clusters
## * 1 proposed 9 as the best number of clusters
## * 4 proposed 10 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 3
##
##
## *******************************************************************
fviz_nbclust(nb)
## Among all indices:
## ===================
## * 2 proposed 0 as the best number of clusters
## * 3 proposed 2 as the best number of clusters
## * 8 proposed 3 as the best number of clusters
## * 1 proposed 4 as the best number of clusters
## * 3 proposed 5 as the best number of clusters
## * 3 proposed 6 as the best number of clusters
## * 1 proposed 8 as the best number of clusters
## * 1 proposed 9 as the best number of clusters
## * 4 proposed 10 as the best number of clusters
##
## Conclusion
## =========================
## * According to the majority rule, the best number of clusters is 3 .
#AVERAGE LINKAGE CORRELATION
res.hc.kge2<-hclust(d=dist.eucl.elev,method="average")
fviz_dend(res.hc.kge2,cex=0.5)
#Euclidian Distance
dist.eucl<-dist(df.scaled,method="euclidean")
#Visualizing the distance matrices
fviz_dist(dist.eucl)
fviz_nbclust(df.scaled,hcut,method="wss")
fviz_nbclust(df.scaled,hcut,method="silhouette")
fviz_nbclust(df.scaled,hcut,method="gap_stat")
pam.res<-pam(df.scaled,5)
fviz_cluster(pam.res,repel=TRUE,ggtheme=theme_classic())
res.hc.kge2<-hclust(d=dist.eucl,method="average")
fviz_dend(res.hc.kge2,cex=0.5)
data.signatures<-signatures.data.frame
data.signatures<-cbind(gauging.number,signatures.data.frame)
data.signatures<-data.signatures[complete.cases(data.signatures), ]
row.names<-data.signatures[,1]
data.signatures<-data.signatures[,-1]
rownames(data.signatures)<-row.names
################################
##Acessing characteristics histograms
################################
##############################################################
###DENSITIES WITH GGPLOT2#####################################
##############################################################
index=seq(from=1,to=nrow(data.signatures),by=1)
dataX1<-cbind(index,data.signatures[,1])
dataX1<-as.data.frame(dataX1)
cnames=c("index","Annual Rain")
colnames(dataX1)=cnames
dataX1<-reshape2::melt(dataX1, id.var='index')
dataX2<-cbind(index,data.signatures[,2])
dataX2<-as.data.frame(dataX2)
cnames=c("index","Annual PET")
colnames(dataX2)=cnames
dataX2<-reshape2::melt(dataX2, id.var='index')
dataX3<-cbind(index,data.signatures[,3])
dataX3<-as.data.frame(dataX3)
cnames=c("index","Mean Discharge")
colnames(dataX3)=cnames
dataX3<-reshape2::melt(dataX3, id.var='index')
dataX4<-cbind(index,data.signatures[,4])
dataX4<-as.data.frame(dataX4)
cnames=c("index","Runnof Ratio")
colnames(dataX4)=cnames
dataX4<-reshape2::melt(dataX4, id.var='index')
dataX5<-cbind(index,data.signatures[,5])
dataX5<-as.data.frame(dataX5)
cnames=c("index","AI")
colnames(dataX5)=cnames
dataX5<-reshape2::melt(dataX5, id.var='index')
#Together with ggplot2
plot1<-ggplot(dataX1) + stat_density(aes(x=value, y=..scaled..,color=variable,linetype=variable),lwd=1, geom="line",position = "identity")+
ylab("Density")+
xlab("Annual Rain")+
xlim(300,2000) +
theme_bw() +
theme(legend.position="top")+
theme(legend.title = element_blank())+
theme(axis.title.x=element_text(size=14,face="bold"),axis.title.y=element_text(size=10))+
theme(legend.text=element_text(size=14))+
theme(axis.text.x = element_text(size = 10),axis.text.y = element_text(size = 10))
plot2<-ggplot(dataX2) + stat_density(aes(x=value, y=..scaled..,color=variable,linetype=variable),lwd=1, geom="line",position = "identity")+
ylab("Density")+
xlab("Annual PET")+
xlim(1000,1500) +
theme_bw() +
theme(legend.position="top")+
theme(legend.title = element_blank())+
theme(axis.title.x=element_text(size=14,face="bold"),axis.title.y=element_text(size=14))+
theme(legend.text=element_text(size=14))+
theme(axis.text.x = element_text(size = 8),axis.text.y = element_text(size = 8))
plot3<-ggplot(dataX3) + stat_density(aes(x=value, y=..scaled..,color=variable,linetype=variable),lwd=1, geom="line",position = "identity")+
ylab("Density")+
xlab("Mean Discharge")+
xlim(0,4) +
theme_bw() +
theme(legend.position="top")+
theme(legend.title = element_blank())+
theme(axis.title.x=element_text(size=14,face="bold"),axis.title.y=element_text(size=14))+
theme(legend.text=element_text(size=14))+
theme(axis.text.x = element_text(size = 10),axis.text.y = element_text(size = 10))
plot4<-ggplot(dataX4) + stat_density(aes(x=value, y=..scaled..,color=variable,linetype=variable),lwd=1, geom="line",position = "identity")+
ylab("Density")+
xlab("Runnof Ratio")+
xlim(0,1) +
theme_bw() +
theme(legend.position="top")+
theme(legend.title = element_blank())+
theme(axis.title.x=element_text(size=14,face="bold"),axis.title.y=element_text(size=14))+
theme(legend.text=element_text(size=14))+
theme(axis.text.x = element_text(size = 10),axis.text.y = element_text(size = 10))
plot5<-ggplot(dataX4) + stat_density(aes(x=value, y=..scaled..,color=variable,linetype=variable),lwd=1, geom="line",position = "identity")+
ylab("Density")+
xlab("AI")+
xlim(0,2) +
theme_bw() +
theme(legend.position="top")+
theme(legend.title = element_blank())+
theme(axis.title.x=element_text(size=14,face="bold"),axis.title.y=element_text(size=14))+
theme(legend.text=element_text(size=14))+
theme(axis.text.x = element_text(size = 10),axis.text.y = element_text(size = 16))
plot_grid(plot1,plot2,plot3,plot4,plot5, labels = "AUTO")
summary(data.signatures)
## annual.Rain annual.Pet mean.discharges runnof.ratio
## Min. : 385.4 Min. :1009 Min. :0.002399 Min. :0.002217
## 1st Qu.: 716.1 1st Qu.:1141 1st Qu.:0.139697 1st Qu.:0.069136
## Median : 865.1 Median :1212 Median :0.287660 Median :0.128169
## Mean : 918.4 Mean :1225 Mean :0.535170 Mean :0.174394
## 3rd Qu.:1077.1 3rd Qu.:1294 3rd Qu.:0.671439 3rd Qu.:0.218621
## Max. :1972.9 Max. :1491 Max. :3.853985 Max. :0.933637
## ai
## Min. :0.3087
## 1st Qu.:0.5709
## Median :0.7095
## Mean :0.7574
## 3rd Qu.:0.8800
## Max. :1.5464
###############################
#Scaling data set
###############################
df.scaled.sig<-data.signatures
col1=col2=col3=col4=col5=0
col1<-(df.scaled.sig[,1]-min(df.scaled.sig[,1]))/(max(df.scaled.sig[,1])-min(df.scaled.sig[,1]))
col2<-(df.scaled.sig[,2]-min(df.scaled.sig[,2]))/(max(df.scaled.sig[,2])-min(df.scaled.sig[,2]))
col3<-(df.scaled.sig[,3]-min(df.scaled.sig[,3]))/(max(df.scaled.sig[,3])-min(df.scaled.sig[,3]))
col4<-(df.scaled.sig[,4]-min(df.scaled.sig[,4]))/(max(df.scaled.sig[,4])-min(df.scaled.sig[,4]))
col5<-(df.scaled.sig[,5]-min(df.scaled.sig[,5]))/(max(df.scaled.sig[,5])-min(df.scaled.sig[,5]))
df.scaled.sig[,1]=col1
df.scaled.sig[,2]=col2
df.scaled.sig[,3]=col3
df.scaled.sig[,4]=col4
df.scaled.sig[,5]=col5
#################################################
#################################################
#Clustering by Rain and PET
#################################################
#################################################
data.rain.pet<-df.scaled.sig[,1:2]
data.rain.pet<-as.data.frame(data.rain.pet)
#Distance matrix computation-Euclidean
#Euclidian Distance
dist.eucl.rain.pet<-dist(data.rain.pet,method="euclidean")
#Visualizing the distance matrices
fviz_dist(dist.eucl.rain.pet)
#################################################
#################################################
##Number of Clusters####################
#################################################
#################################################
fviz_nbclust(data.rain.pet,hcut,method="wss")
fviz_nbclust(data.rain.pet,hcut,method="silhouette")
fviz_nbclust(data.rain.pet,hcut,method="gap_stat")
pam.res<-pam(data.rain.pet,5)
fviz_cluster(pam.res,repel=TRUE,ggtheme=theme_classic())
library("NbClust")
nb<-NbClust(data.rain.pet,distance="euclidean",min.nc=2,max.nc=10,method="average")
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 5 proposed 2 as the best number of clusters
## * 4 proposed 3 as the best number of clusters
## * 2 proposed 4 as the best number of clusters
## * 7 proposed 5 as the best number of clusters
## * 1 proposed 8 as the best number of clusters
## * 3 proposed 9 as the best number of clusters
## * 1 proposed 10 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 5
##
##
## *******************************************************************
fviz_nbclust(nb)
## Among all indices:
## ===================
## * 2 proposed 0 as the best number of clusters
## * 1 proposed 1 as the best number of clusters
## * 5 proposed 2 as the best number of clusters
## * 4 proposed 3 as the best number of clusters
## * 2 proposed 4 as the best number of clusters
## * 7 proposed 5 as the best number of clusters
## * 1 proposed 8 as the best number of clusters
## * 3 proposed 9 as the best number of clusters
## * 1 proposed 10 as the best number of clusters
##
## Conclusion
## =========================
## * According to the majority rule, the best number of clusters is 5 .
#AVERAGE LINKAGE CORRELATION
res.hc.kge2<-hclust(d=dist.eucl.rain.pet,method="average")
fviz_dend(res.hc.kge2,cex=0.5)
#################################################
#################################################
#Clustering by Annual Rain and Mean Discharge
#################################################
#################################################
data.rain.discharge<-cbind(df.scaled.sig[,1],df.scaled.sig[,3])
data.rain.discharge<-as.data.frame(data.rain.discharge)
#Distance matrix computation-Euclidean
#Euclidian Distance
dist.eucl.rain.discharge<-dist(data.rain.discharge,method="euclidean")
#Visualizing the distance matrices
fviz_dist(dist.eucl.rain.discharge)
#################################################
#################################################
##Number of Clusters####################
#################################################
#################################################
fviz_nbclust(data.rain.discharge,hcut,method="wss")
fviz_nbclust(data.rain.discharge,hcut,method="silhouette")
fviz_nbclust(data.rain.discharge,hcut,method="gap_stat")
pam.res<-pam(data.rain.discharge,2)
fviz_cluster(pam.res,repel=TRUE,ggtheme=theme_classic())
library("NbClust")
nb<-NbClust(data.rain.discharge,distance="euclidean",min.nc=2,max.nc=10,method="average")
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 8 proposed 2 as the best number of clusters
## * 2 proposed 3 as the best number of clusters
## * 4 proposed 5 as the best number of clusters
## * 5 proposed 6 as the best number of clusters
## * 3 proposed 9 as the best number of clusters
## * 2 proposed 10 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 2
##
##
## *******************************************************************
fviz_nbclust(nb)
## Among all indices:
## ===================
## * 2 proposed 0 as the best number of clusters
## * 8 proposed 2 as the best number of clusters
## * 2 proposed 3 as the best number of clusters
## * 4 proposed 5 as the best number of clusters
## * 5 proposed 6 as the best number of clusters
## * 3 proposed 9 as the best number of clusters
## * 2 proposed 10 as the best number of clusters
##
## Conclusion
## =========================
## * According to the majority rule, the best number of clusters is 2 .
#################################################
#################################################
#Part III-#Hierarchical Clustering#################
#################################################
#################################################
#############################
##KGE################
#############################
#AVERAGE LINKAGE CORRELATION
res.hc.kge2<-hclust(d=dist.eucl.rain.discharge,method="average")
fviz_dend(res.hc.kge2,cex=0.5)
#################################################
#################################################
#Clustering by Runnof Ratio and Aridity Index
#################################################
#################################################
data.runnof.ratio<-cbind(df.scaled.sig[,4],df.scaled.sig[,5])
data.runnof.ratio<-as.data.frame(data.runnof.ratio)
#Distance matrix computation-Euclidean
#Euclidian Distance
dist.eucl.runnof.ratio<-dist(data.runnof.ratio,method="euclidean")
#Visualizing the distance matrices
fviz_dist(dist.eucl.runnof.ratio)
#################################################
#################################################
##Number of Clusters####################
#################################################
#################################################
fviz_nbclust(data.runnof.ratio,hcut,method="wss")
fviz_nbclust(data.runnof.ratio,hcut,method="silhouette")
fviz_nbclust(data.runnof.ratio,hcut,method="gap_stat")
pam.res<-pam(data.runnof.ratio,3)
fviz_cluster(pam.res,repel=TRUE,ggtheme=theme_classic())
library("NbClust")
nb<-NbClust(data.runnof.ratio,distance="euclidean",min.nc=2,max.nc=10,method="average")
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 6 proposed 2 as the best number of clusters
## * 9 proposed 3 as the best number of clusters
## * 2 proposed 4 as the best number of clusters
## * 5 proposed 8 as the best number of clusters
## * 1 proposed 9 as the best number of clusters
## * 1 proposed 10 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 3
##
##
## *******************************************************************
fviz_nbclust(nb)
## Among all indices:
## ===================
## * 2 proposed 0 as the best number of clusters
## * 6 proposed 2 as the best number of clusters
## * 9 proposed 3 as the best number of clusters
## * 2 proposed 4 as the best number of clusters
## * 5 proposed 8 as the best number of clusters
## * 1 proposed 9 as the best number of clusters
## * 1 proposed 10 as the best number of clusters
##
## Conclusion
## =========================
## * According to the majority rule, the best number of clusters is 3 .
#################################################
#################################################
#Part III-#Hierarchical Clustering#################
#################################################
#################################################
#############################
##KGE################
#############################
#AVERAGE LINKAGE CORRELATION
res.hc.kge2<-hclust(d=dist.eucl.runnof.ratio,method="average")
fviz_dend(res.hc.kge2,cex=0.5)
#Euclidian Distance
dist.eucl<-dist(df.scaled.sig,method="euclidean")
#Visualizing the distance matrices
fviz_dist(dist.eucl)
#################################################
#################################################
##Number of Clusters####################
#################################################
#################################################
fviz_nbclust(df.scaled.sig,hcut,method="wss")
fviz_nbclust(df.scaled.sig,hcut,method="silhouette")
fviz_nbclust(df.scaled.sig,hcut,method="gap_stat")
library("NbClust")
nb<-NbClust(df.scaled.sig,distance="euclidean",min.nc=2,max.nc=10,method="average")
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 6 proposed 2 as the best number of clusters
## * 3 proposed 3 as the best number of clusters
## * 11 proposed 4 as the best number of clusters
## * 1 proposed 5 as the best number of clusters
## * 1 proposed 6 as the best number of clusters
## * 2 proposed 10 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 4
##
##
## *******************************************************************
fviz_nbclust(nb)
## Among all indices:
## ===================
## * 2 proposed 0 as the best number of clusters
## * 6 proposed 2 as the best number of clusters
## * 3 proposed 3 as the best number of clusters
## * 11 proposed 4 as the best number of clusters
## * 1 proposed 5 as the best number of clusters
## * 1 proposed 6 as the best number of clusters
## * 2 proposed 10 as the best number of clusters
##
## Conclusion
## =========================
## * According to the majority rule, the best number of clusters is 4 .
pam.res<-pam(df.scaled.sig,4)
fviz_cluster(pam.res,repel=TRUE,ggtheme=theme_classic())
#AVERAGE LINKAGE CORRELATION
res.hc.kge2<-hclust(d=dist.eucl,method="average")
fviz_dend(res.hc.kge2,cex=0.5)