Let's load the “Chennai water reservoir level” dataset that the models below are built on.
# Read the reservoir-level records from the CSV in the working directory.
rain_lvl <- read.csv(file = "chennai_reservoir_levels.csv")
# Per-column summary statistics of the raw data.
summary(object = rain_lvl)
##          Date          POONDI         CHOLAVARAM       REDHILLS     
##  01-01-2004:   1   Min.   :   0.9   Min.   :  0.0   Min.   :   0.0  
##  01-01-2005:   1   1st Qu.: 198.0   1st Qu.: 17.0   1st Qu.: 804.2  
##  01-01-2006:   1   Median : 749.0   Median : 90.5   Median :1605.5  
##  01-01-2007:   1   Mean   :1115.6   Mean   :236.2   Mean   :1543.5  
##  01-01-2008:   1   3rd Qu.:1990.0   3rd Qu.:453.8   3rd Qu.:2223.0  
##  01-01-2009:   1   Max.   :3231.0   Max.   :896.0   Max.   :3300.0  
##  (Other)   :5732                                                    
##  CHEMBARAMBAKKAM 
##  Min.   :   0.0  
##  1st Qu.: 431.2  
##  Median :1207.0  
##  Mean   :1300.4  
##  3rd Qu.:2064.8  
##  Max.   :3396.0  
## 
# Show each column's type together with its first few values.
str(rain_lvl)
## 'data.frame':    5738 obs. of  5 variables:
##  $ Date           : Factor w/ 5738 levels "01-01-2004","01-01-2005",..: 1 190 379 568 757 946 1135 1324 1513 1702 ...
##  $ POONDI         : num  3.9 3.9 3.9 3.9 3.8 3.8 3.8 3.7 3.7 3.7 ...
##  $ CHOLAVARAM     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ REDHILLS       : num  268 268 267 267 267 266 266 265 264 264 ...
##  $ CHEMBARAMBAKKAM: num  0 0 0 0 0 0 0 0 0 0 ...
# Number of rows and columns in the raw data.
dim(rain_lvl)
## [1] 5738    5
# Column names of the raw data.
names(rain_lvl)
## [1] "Date"            "POONDI"          "CHOLAVARAM"      "REDHILLS"       
## [5] "CHEMBARAMBAKKAM"
Cleaning the outlier data.
library(tidyverse)
## -- Attaching packages ----------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1     v purrr   0.3.2
## v tibble  2.1.3     v dplyr   0.8.3
## v tidyr   0.8.3     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0
## -- Conflicts -------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
# Compact column-by-column preview of the data before cleaning.
glimpse(rain_lvl)
## Observations: 5,738
## Variables: 5
## $ Date            <fct> 01-01-2004, 02-01-2004, 03-01-2004, 04-01-2004...
## $ POONDI          <dbl> 3.9, 3.9, 3.9, 3.9, 3.8, 3.8, 3.8, 3.7, 3.7, 3...
## $ CHOLAVARAM      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ REDHILLS        <dbl> 268, 268, 267, 267, 267, 266, 266, 265, 264, 2...
## $ CHEMBARAMBAKKAM <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
# Keep only the rows in which all four reservoirs report a level above zero.
lvl_res <- rain_lvl %>%
  filter(POONDI > 0, CHOLAVARAM > 0, REDHILLS > 0, CHEMBARAMBAKKAM > 0)
# Split Date on "-" into Day / Month / Year; rain_lvl is overwritten with the
# cleaned, seven-column version that the rest of the script works on.
rain_lvl <- lvl_res %>%
  separate(Date, into = c("Day", "Month", "Year"), sep = "-")
names(rain_lvl)
## [1] "Day"             "Month"           "Year"            "POONDI"         
## [5] "CHOLAVARAM"      "REDHILLS"        "CHEMBARAMBAKKAM"

Exploratory Data Analysis

POONDI column data analysis
library(ggplot2)
# Boxplot: spread of the POONDI level within each year.
# Columns are referenced by bare name inside aes(); `rain_lvl$col` bypasses
# ggplot2's data masking and is discouraged by the ggplot2 documentation.
ggplot(data = rain_lvl, aes(x = Year, y = POONDI)) +
  geom_boxplot() +
  labs(title = "Rain Level in POONDI", x = "Year", y = "Rain level in Poondi")

# Heat map: Month on the x-axis, Year on the y-axis, colour = POONDI level.
# The axis labels now match the mapped variables, and the level label moved to
# the fill legend where it belongs.
# NOTE(review): rain_lvl has several rows per Month/Year cell, so tiles are
# drawn on top of each other; consider aggregating (e.g. a monthly mean) first.
ggplot(data = rain_lvl, aes(x = Month, y = Year, fill = POONDI)) +
  geom_tile() +
  labs(title = "Rain Level in POONDI", x = "Month", y = "Year",
       fill = "Rain level in Poondi")

# Density plot: distribution of the POONDI level per year on a log10 x-axis.
ggplot(data = rain_lvl, aes(x = POONDI, fill = Year)) +
  geom_density(alpha = 0.1) +
  scale_x_log10()

# Bar chart: geom_col() stacks rows that share an x value, so each bar is the
# sum of that year's readings.
ggplot(data = rain_lvl, aes(x = Year, y = POONDI)) +
  geom_col() +
  labs(title = "Rain Level in POONDI", x = "Year", y = "Rain level in Poondi")

REDHILLS reservoir data analysis
# Boxplot: spread of the REDHILLS level within each year.
# Columns are referenced by bare name inside aes(); `rain_lvl$col` bypasses
# ggplot2's data masking and is discouraged by the ggplot2 documentation.
ggplot(data = rain_lvl, aes(x = Year, y = REDHILLS)) +
  geom_boxplot() +
  labs(title = "Rain Level in REDHILLS", x = "Year",
       y = "Rain level in Redhills")

# Heat map: Month on the x-axis, Year on the y-axis, colour = REDHILLS level.
# The axis labels now match the mapped variables, and the level label moved to
# the fill legend where it belongs.
# NOTE(review): rain_lvl has several rows per Month/Year cell, so tiles are
# drawn on top of each other; consider aggregating (e.g. a monthly mean) first.
ggplot(data = rain_lvl, aes(x = Month, y = Year, fill = REDHILLS)) +
  geom_tile() +
  labs(title = "Rain Level in REDHILLS", x = "Month", y = "Year",
       fill = "Rain level in Redhills")

# Density plot: distribution of the REDHILLS level per year.
ggplot(data = rain_lvl, aes(x = REDHILLS, fill = Year)) +
  geom_density(alpha = 0.1)

# Bar chart: geom_col() stacks rows that share an x value, so each bar is the
# sum of that year's readings.
ggplot(data = rain_lvl, aes(x = Year, y = REDHILLS)) +
  geom_col() +
  labs(title = "Rain Level in REDHILLS", x = "Year",
       y = "Rain level in Redhills")

CHOLAVARAM reservoir data analysis
# Boxplot: spread of the CHOLAVARAM level within each year.
# Columns are referenced by bare name inside aes(); `rain_lvl$col` bypasses
# ggplot2's data masking and is discouraged by the ggplot2 documentation.
ggplot(data = rain_lvl, aes(x = Year, y = CHOLAVARAM)) +
  geom_boxplot() +
  labs(title = "Rain Level in CHOLAVARAM", x = "Year",
       y = "Rain level in Cholavaram")

# Heat map: Month on the x-axis, Year on the y-axis, colour = CHOLAVARAM level.
# The axis labels now match the mapped variables, and the level label moved to
# the fill legend where it belongs.
# NOTE(review): rain_lvl has several rows per Month/Year cell, so tiles are
# drawn on top of each other; consider aggregating (e.g. a monthly mean) first.
ggplot(data = rain_lvl, aes(x = Month, y = Year, fill = CHOLAVARAM)) +
  geom_tile() +
  labs(title = "Rain Level in CHOLAVARAM", x = "Month", y = "Year",
       fill = "Rain level in Cholavaram")

# Density plot: distribution of the CHOLAVARAM level per year on a log10 x-axis.
ggplot(data = rain_lvl, aes(x = CHOLAVARAM, fill = Year)) +
  geom_density(alpha = 0.1) +
  scale_x_log10()

# Bar chart: geom_col() stacks rows that share an x value, so each bar is the
# sum of that year's readings.
ggplot(data = rain_lvl, aes(x = Year, y = CHOLAVARAM)) +
  geom_col() +
  labs(title = "Rain Level in CHOLAVARAM", x = "Year",
       y = "Rain level in Cholavaram")

CHEMBARAMBAKKAM reservoir data analysis
# Boxplot: spread of the CHEMBARAMBAKKAM level within each year.
# Columns are referenced by bare name inside aes(); `rain_lvl$col` bypasses
# ggplot2's data masking and is discouraged by the ggplot2 documentation.
ggplot(data = rain_lvl, aes(x = Year, y = CHEMBARAMBAKKAM)) +
  geom_boxplot() +
  labs(title = "Rain Level in CHEMBARAMBAKKAM", x = "Year",
       y = "Rain level in Chembarambakkam")

# Heat map: Month on the x-axis, Year on the y-axis, colour = CHEMBARAMBAKKAM
# level. The axis labels now match the mapped variables, and the level label
# moved to the fill legend where it belongs.
# NOTE(review): rain_lvl has several rows per Month/Year cell, so tiles are
# drawn on top of each other; consider aggregating (e.g. a monthly mean) first.
ggplot(data = rain_lvl, aes(x = Month, y = Year, fill = CHEMBARAMBAKKAM)) +
  geom_tile() +
  labs(title = "Rain Level in CHEMBARAMBAKKAM", x = "Month", y = "Year",
       fill = "Rain level in Chembarambakkam")

# Density plot: distribution of the CHEMBARAMBAKKAM level per year on a log10
# x-axis.
ggplot(data = rain_lvl, aes(x = CHEMBARAMBAKKAM, fill = Year)) +
  geom_density(alpha = 0.1) +
  scale_x_log10()

# Bar chart: geom_col() stacks rows that share an x value, so each bar is the
# sum of that year's readings.
ggplot(data = rain_lvl, aes(x = Year, y = CHEMBARAMBAKKAM)) +
  geom_col() +
  labs(title = "Rain Level in CHEMBARAMBAKKAM", x = "Year",
       y = "Rain level in Chembarambakkam")

Machine Learning Algorithm for Chennai Reservoir Level dataset

# Feature set for clustering: columns 3:7 of the cleaned data, i.e. Year plus
# the four reservoir columns (Day and Month are left out).
rain_lvl1 <- rain_lvl[3:7]
#Elbow plot
library(cluster)
set.seed(5)
# Numeric copy for kmeans(): Year is a text column after the Date split.
# rain_lvl1 itself is left untouched for later use.
km_input <- rain_lvl1
km_input$Year <- as.numeric(km_input$Year)
# Total within-cluster sum of squares for k = 1..10. The loop now clusters the
# rain_lvl1 features; previously it was run on rain_lvl, which silently pulled
# the Day and Month text columns into the distance computation.
k_values <- 1:10
wcss <- numeric(length(k_values))
for (i in k_values) {
  wcss[i] <- kmeans(km_input, centers = i)$tot.withinss
}
wcss
# NOTE: the two output lines below come from the earlier run on all seven
# columns of rain_lvl; re-run the chunk to refresh them.
##  [1] 11075615105  3947107319  2522634465  1836678451  1500189826
##  [6]  1381666215  1228609990  1126126192  1025936477   996526094
plot(k_values, wcss, type = "b", main = "The Elbow Method",
     xlab = "Number of Cluster", ylab = "WCSS")

Analysis & Modeling

Unsupervised learning [kmeans clustering]

Tests to find the optimal number of clusters

# Fit k-means with 4 clusters on rain_lvl1 (Year plus the four reservoir
# columns) instead of rain_lvl, which also carries the Day and Month text
# columns. Year is text after the Date split, so a numeric copy is clustered.
km_input <- rain_lvl1
km_input$Year <- as.numeric(km_input$Year)
# The result keeps its original name `kmeans`; calls such as kmeans(...) still
# resolve to the stats function because R skips non-function objects there.
kmeans <- kmeans(x = km_input, centers = 4)
y_kmeans <- kmeans$cluster
# clusplot() draws the observations on the first two principal components, so
# the axes are labelled as components rather than as raw columns.
clusplot(km_input, y_kmeans, lines = 0, color = TRUE, shade = TRUE,
         main = "Cluster of Rain Level",
         xlab = "Component 1", ylab = "Component 2")

#mclust
library(mclust)
## Package 'mclust' version 5.4.5
## Type 'citation("mclust")' for citing this R package in publications.
## 
## Attaching package: 'mclust'
## The following object is masked from 'package:purrr':
## 
##     map
# Fit a Gaussian finite mixture model to the rain_lvl1 feature set.
# NOTE(review): the summary reports 9 components, which looks like the top of
# Mclust's default search range (G = 1:9) - confirm, and consider widening G.
fit <- Mclust(data = rain_lvl1)
# Diagnostic plots for the fitted mixture model.
plot(x = fit)

# Chosen model, log-likelihood/BIC and the cluster sizes.
summary(object = fit)
## ---------------------------------------------------- 
## Gaussian finite mixture model fitted by EM algorithm 
## ---------------------------------------------------- 
## 
## Mclust VVV (ellipsoidal, varying volume, shape, and orientation) model
## with 9 components: 
## 
##  log-likelihood    n  df       BIC       ICL
##       -137133.5 4534 188 -275849.9 -276216.4
## 
## Clustering table:
##    1    2    3    4    5    6    7    8    9 
##  393  489  606  449  342 1234  609  170  242