Load the Chennai reservoir rainfall data for predicting the rainfall level.
# Read the reservoir rainfall CSV (header row, comma-separated).
# TRUE is spelled out because `T` is an ordinary variable that can be reassigned.
rainfall <- read.csv(
  file = "chennai_reservoir_rainfall.csv",
  header = TRUE,
  sep = ","
)
# List the column names to confirm what was loaded.
names(rainfall)
## [1] "Date"            "POONDI"          "CHOLAVARAM"      "REDHILLS"       
## [5] "CHEMBARAMBAKKAM"
# Per-column summary statistics. In the output below the median and the 3rd
# quartile are 0 for every reservoir, so at least three quarters of the rows
# record no rainfall.
summary(rainfall)
##          Date          POONDI          CHOLAVARAM        REDHILLS      
##  01-01-2004:   1   Min.   :  0.000   Min.   :  0.00   Min.   :  0.000  
##  01-01-2005:   1   1st Qu.:  0.000   1st Qu.:  0.00   1st Qu.:  0.000  
##  01-01-2006:   1   Median :  0.000   Median :  0.00   Median :  0.000  
##  01-01-2007:   1   Mean   :  3.377   Mean   :  3.64   Mean   :  3.764  
##  01-01-2008:   1   3rd Qu.:  0.000   3rd Qu.:  0.00   3rd Qu.:  0.000  
##  01-01-2009:   1   Max.   :300.000   Max.   :293.00   Max.   :320.000  
##  (Other)   :5732                                                       
##  CHEMBARAMBAKKAM  
##  Min.   :  0.000  
##  1st Qu.:  0.000  
##  Median :  0.000  
##  Mean   :  3.934  
##  3rd Qu.:  0.000  
##  Max.   :475.000  
## 
# Number of rows and columns in the loaded data frame.
dim(rainfall)
## [1] 5738    5
# Column types. Note in the output below that Date was read as a factor,
# not as a Date.
str(rainfall)
## 'data.frame':    5738 obs. of  5 variables:
##  $ Date           : Factor w/ 5738 levels "01-01-2004","01-01-2005",..: 1 190 379 568 757 946 1135 1324 1513 1702 ...
##  $ POONDI         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CHOLAVARAM     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ REDHILLS       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CHEMBARAMBAKKAM: num  0 0 0 0 0 0 0 0 0 0 ...
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Compact column-wise preview of the same data frame (types and first values).
glimpse(rainfall)
## Observations: 5,738
## Variables: 5
## $ Date            <fct> 01-01-2004, 02-01-2004, 03-01-2004, 04-01-2004...
## $ POONDI          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ CHOLAVARAM      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ REDHILLS        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ CHEMBARAMBAKKAM <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...

Cleaning the data

Remove the days without rain (the filter below keeps only the days on which all four reservoirs recorded rainfall)
# Keep only the days on which all four reservoirs recorded rainfall (> 0).
# Columns are referenced by bare name so each condition is evaluated on the
# data frame flowing through the pipe, not on the global `rainfall` object.
rainfall1 <- rainfall %>%
  filter(POONDI > 0, CHOLAVARAM > 0, REDHILLS > 0, CHEMBARAMBAKKAM > 0)

The Date column has the format ‘DD-MM-YYYY’, so we split it into three columns (“Day”, “Month”, “Year”) using separate() from the tidyr package.

library(tidyr)
# Split Date on "-" into Day, Month and Year; the Date column itself is
# replaced by the three new columns (see the names printed below).
rainfall_data <- rainfall1 %>%
  separate(col = Date, into = c("Day", "Month", "Year"), sep = "-")
names(rainfall_data)
## [1] "Day"             "Month"           "Year"            "POONDI"         
## [5] "CHOLAVARAM"      "REDHILLS"        "CHEMBARAMBAKKAM"
Remove the outliers (readings of 150 or more) separately for each reservoir
# One data set per reservoir. Each drops the rows where that reservoir's own
# reading is 150 or more; the other three reservoir columns are not filtered.
rainfall_poondi <- filter(rainfall_data, POONDI < 150)
rainfall_cholavaram <- filter(rainfall_data, CHOLAVARAM < 150)
rainfall_redhills <- filter(rainfall_data, REDHILLS < 150)
rainfall_chembarambakkam <- filter(rainfall_data, CHEMBARAMBAKKAM < 150)
Convert the Year column to a factor for visualization
# Only rainfall_data is changed here; the per-reservoir data sets derived
# from it earlier keep Year as it was.
rainfall_data[["Year"]] <- as.factor(rainfall_data[["Year"]])

Exploratory Data Analysis

Load ggplot2 for Visualization
library(ggplot2)
Visualize the Poondi data field for rainfall level
# Box plot: spread of the Poondi readings within each year.
ggplot(rainfall_poondi, aes(x = Year, y = POONDI)) +
  geom_boxplot() +
  labs(title = "Rainfall level in Poondi", x = "Year", y = "POONDI")

# Density plot: one semi-transparent curve per year on a log10 x axis.
ggplot(rainfall_poondi, aes(x = POONDI, fill = Year)) +
  geom_density(alpha = 0.1) +
  scale_x_log10() +
  labs(title = "Rainfall level in Poondi", x = "POONDI")

# Yearly maximum Poondi reading, used by the time-series and scatter plots.
time_poondi <- rainfall_poondi %>%
  group_by(Year) %>%
  summarise(Max = max(POONDI)) %>%
  as.data.frame()
# Convert Year to numeric for plotting.
# NOTE(review): this yields calendar years only if Year is character here;
# on a factor as.numeric() would return level codes -- confirm.
time_poondi$Year <- as.numeric(time_poondi$Year)
# Time series of the yearly maximum.
ggplot(time_poondi, aes(x = Year, y = Max)) +
  geom_point() +
  geom_line() +
  labs(title = "Rainfall level in Poondi", x = "Year", y = "POONDI")

# Scatter plot of the yearly maximum (point size follows Max) with a loess
# smoother drawn underneath.
ggplot(time_poondi, aes(x = Year, y = Max, size = Max)) +
  geom_smooth(method = "loess") +
  geom_point() +
  labs(title = "Rainfall level in Poondi", x = "Year", y = "POONDI")

# Heat map: Month by Year tiles filled by the Poondi reading.
ggplot(rainfall_poondi, aes(x = Month, y = Year, fill = POONDI)) +
  geom_tile() +
  labs(title = "Rainfall level in Poondi", x = "Month", y = "Year")

# Bar plot: Poondi readings per year.
ggplot(rainfall_poondi, aes(x = Year, y = POONDI)) +
  geom_col() +
  labs(title = "Rainfall level in Poondi", x = "Year", y = "POONDI")

# Violin plot: distribution of the Poondi readings per year.
ggplot(rainfall_poondi, aes(x = Year, y = POONDI, fill = Year)) +
  geom_violin() +
  labs(title = "Rainfall level in Poondi", x = "Year", y = "POONDI")

Visualize the Cholavaram data field for rainfall level
# Box plot: spread of the Cholavaram readings within each year.
ggplot(rainfall_cholavaram, aes(x = Year, y = CHOLAVARAM)) +
  geom_boxplot() +
  labs(title = "Rainfall level in Cholavaram", x = "Year", y = "CHOLAVARAM")

# Density plot: one semi-transparent curve per year on a log10 x axis.
ggplot(rainfall_cholavaram, aes(x = CHOLAVARAM, fill = Year)) +
  geom_density(alpha = 0.1) +
  scale_x_log10() +
  labs(title = "Rainfall level in Cholavaram", x = "CHOLAVARAM")

# Yearly maximum Cholavaram reading, used by the time-series and scatter plots.
time_cholavaram <- rainfall_cholavaram %>%
  group_by(Year) %>%
  summarise(Max = max(CHOLAVARAM)) %>%
  as.data.frame()
# Convert Year to numeric for plotting.
# NOTE(review): this yields calendar years only if Year is character here;
# on a factor as.numeric() would return level codes -- confirm.
time_cholavaram$Year <- as.numeric(time_cholavaram$Year)
# Time series of the yearly maximum.
ggplot(time_cholavaram, aes(x = Year, y = Max)) +
  geom_line() +
  geom_point() +
  labs(title = "Rainfall level in Cholavaram", x = "Year", y = "CHOLAVARAM")

# Scatter plot of the yearly maximum (point size follows Max) with a loess
# smoother drawn underneath.
ggplot(time_cholavaram, aes(x = Year, y = Max, size = Max)) +
  geom_smooth(method = "loess") +
  geom_point() +
  labs(title = "Rainfall level in Cholavaram", x = "Year", y = "CHOLAVARAM")

# Heat map: Month by Year tiles filled by the Cholavaram reading.
ggplot(rainfall_cholavaram, aes(x = Month, y = Year, fill = CHOLAVARAM)) +
  geom_tile() +
  labs(title = "Rainfall level in Cholavaram", x = "Month", y = "Year")

# Bar plot: Cholavaram readings per year.
ggplot(rainfall_cholavaram, aes(x = Year, y = CHOLAVARAM)) +
  geom_col() +
  labs(title = "Rainfall level in Cholavaram", x = "Year", y = "CHOLAVARAM")

# Violin plot: distribution of the Cholavaram readings per year.
ggplot(rainfall_cholavaram, aes(x = Year, y = CHOLAVARAM, fill = Year)) +
  geom_violin() +
  labs(title = "Rainfall level in Cholavaram", x = "Year", y = "CHOLAVARAM")

Visualize the Redhills data field for rainfall level
# Box plot: spread of the Redhills readings within each year.
ggplot(rainfall_redhills, aes(x = Year, y = REDHILLS)) +
  geom_boxplot() +
  labs(title = "Rainfall level in Redhills", x = "Year", y = "REDHILLS")

# Density plot: one semi-transparent curve per year on a log10 x axis.
ggplot(rainfall_redhills, aes(x = REDHILLS, fill = Year)) +
  geom_density(alpha = 0.1) +
  scale_x_log10() +
  labs(title = "Rainfall level in Redhills", x = "REDHILLS")

# Yearly maximum Redhills reading, used by the time-series and scatter plots.
time_redhills <- rainfall_redhills %>%
  group_by(Year) %>%
  summarise(Max = max(REDHILLS)) %>%
  as.data.frame()
# Convert Year to numeric for plotting.
# NOTE(review): this yields calendar years only if Year is character here;
# on a factor as.numeric() would return level codes -- confirm.
time_redhills$Year <- as.numeric(time_redhills$Year)
# Time series of the yearly maximum.
ggplot(time_redhills, aes(x = Year, y = Max)) +
  geom_line() +
  geom_point() +
  labs(title = "Rainfall level in Redhills", x = "Year", y = "REDHILLS")

# Scatter plot of the yearly maximum (point size follows Max) with a loess
# smoother drawn underneath.
ggplot(time_redhills, aes(x = Year, y = Max, size = Max)) +
  geom_smooth(method = "loess") +
  geom_point() +
  labs(title = "Rainfall level in Redhills", x = "Year", y = "REDHILLS")

# Heat map: Month by Year tiles filled by the Redhills reading.
ggplot(rainfall_redhills, aes(x = Month, y = Year, fill = REDHILLS)) +
  geom_tile() +
  labs(title = "Rainfall level in Redhills", x = "Month", y = "Year")

# Bar plot: Redhills readings per year.
ggplot(rainfall_redhills, aes(x = Year, y = REDHILLS)) +
  geom_col() +
  labs(title = "Rainfall level in Redhills", x = "Year", y = "REDHILLS")

# Violin plot: distribution of the Redhills readings per year.
ggplot(rainfall_redhills, aes(x = Year, y = REDHILLS, fill = Year)) +
  geom_violin() +
  labs(title = "Rainfall level in Redhills", x = "Year", y = "REDHILLS")

Visualize the Chembarambakkam data field for rainfall level
# Box plot: spread of the Chembarambakkam readings within each year.
ggplot(rainfall_chembarambakkam, aes(x = Year, y = CHEMBARAMBAKKAM)) +
  geom_boxplot() +
  labs(title = "Rainfall level in CHEMBARAMBAKKAM", x = "Year",
       y = "CHEMBARAMBAKKAM")

# Density plot: one semi-transparent curve per year on a log10 x axis.
ggplot(rainfall_chembarambakkam, aes(x = CHEMBARAMBAKKAM, fill = Year)) +
  geom_density(alpha = 0.1) +
  scale_x_log10() +
  labs(title = "Rainfall level in CHEMBARAMBAKKAM", x = "CHEMBARAMBAKKAM")

# Yearly maximum Chembarambakkam reading, used by the time-series and
# scatter plots. The maximum is taken over CHEMBARAMBAKKAM (it previously
# used CHOLAVARAM, so both plots showed the wrong reservoir).
time_chembarambakkam <- rainfall_chembarambakkam %>%
  group_by(Year) %>%
  summarise(Max = max(CHEMBARAMBAKKAM)) %>%
  as.data.frame()
# Convert Year to numeric for plotting.
# NOTE(review): this yields calendar years only if Year is character here;
# on a factor as.numeric() would return level codes -- confirm.
time_chembarambakkam$Year <- as.numeric(time_chembarambakkam$Year)
# Time series of the yearly maximum.
ggplot(time_chembarambakkam, aes(x = Year, y = Max)) +
  geom_line() +
  geom_point() +
  labs(title = "Rainfall level in CHEMBARAMBAKKAM", x = "Year",
       y = "CHEMBARAMBAKKAM")

# Scatter plot of the yearly maximum (point size follows Max) with a loess
# smoother drawn underneath.
ggplot(time_chembarambakkam, aes(x = Year, y = Max, size = Max)) +
  geom_smooth(method = "loess") +
  geom_point() +
  labs(title = "Rainfall level in CHEMBARAMBAKKAM", x = "Year",
       y = "CHEMBARAMBAKKAM")

# Heat map: Month by Year tiles filled by the Chembarambakkam reading.
ggplot(rainfall_chembarambakkam,
       aes(x = Month, y = Year, fill = CHEMBARAMBAKKAM)) +
  geom_tile() +
  labs(title = "Rainfall level in CHEMBARAMBAKKAM", x = "Month", y = "Year")

# Bar plot: Chembarambakkam readings per year.
ggplot(rainfall_chembarambakkam, aes(x = Year, y = CHEMBARAMBAKKAM)) +
  geom_col() +
  labs(title = "Rainfall level in CHEMBARAMBAKKAM", x = "Year",
       y = "CHEMBARAMBAKKAM")

# Violin plot: distribution of the Chembarambakkam readings per year.
ggplot(rainfall_chembarambakkam,
       aes(x = Year, y = CHEMBARAMBAKKAM, fill = Year)) +
  geom_violin() +
  labs(title = "Rainfall level in CHEMBARAMBAKKAM", x = "Year",
       y = "CHEMBARAMBAKKAM")

Visualization of all the data fields together
# Comparing Chembarambakkam, Redhills, Cholavaram and Poondi.
# Pie chart: geom_bar() has no y aesthetic here, so each slice is the number
# of rows in rainfall_data for that year.
ggplot(rainfall_data, aes(x = (""), fill = Year)) +
  geom_bar() +
  coord_polar(theta = "y") +
  labs(title = "Pie-chart for Rainfall level in all the areas")

# Reshape to long form: one row per original row and reservoir, with the
# reservoir name in Area and its reading in Rainfall_lvl. The four reservoir
# columns are named explicitly instead of being selected by position.
rainfall_gather <- rainfall_data %>%
  gather("Area", "Rainfall_lvl", POONDI, CHOLAVARAM, REDHILLS, CHEMBARAMBAKKAM)

# Bar plot of total rainfall per reservoir and year. The readings are summed
# first because a bare geom_bar() only counts rows, and the row count per
# year is the same for every reservoir after the reshape.
rainfall_totals <- rainfall_gather %>%
  group_by(Area, Year) %>%
  summarise(Total = sum(Rainfall_lvl)) %>%
  ungroup()
ggplot(rainfall_totals, aes(x = Area, y = Total, fill = Year)) +
  geom_col(position = "dodge") +
  facet_wrap(~Area, scales = "free") +
  labs(title = "Comparision of rainfall level in all the area",
       y = "Total rainfall")

Analysis & Modeling

Machine Learning Algorithm for Chennai Reservoir Rainfall dataset

Tests to find the optimal number of clusters

data_rainfall <- rainfall_data
# Clustering input: columns 3 to 7, i.e. Year plus the four reservoir columns.
data_rainfall1 <- data_rainfall[3:7]
# Year was converted to a factor for plotting. Turn it into its numeric value
# explicitly so every clustering function receives the same all-numeric input
# instead of applying its own coercion to the factor.
# NOTE(review): the columns are not scaled, so the rainfall columns (much
# larger range than Year) dominate the distances -- confirm this is intended.
data_rainfall1$Year <- as.numeric(as.character(data_rainfall1$Year))

# Elbow plot: total within-cluster sum of squares for k = 1..10, used to
# choose the number of centroids.
set.seed(5)
cluster_counts <- 1:10
wcss <- vapply(
  cluster_counts,
  function(k) kmeans(data_rainfall1, centers = k)$tot.withinss,
  numeric(1)
)
plot(cluster_counts, wcss, type = "b", main = "The Elbow Method",
     xlab = "Number of Cluster", ylab = "WCSS")

Unsupervised learning [K-means clustering]
# Fit k-means with 5 clusters. The result keeps the name `kmeans` so that
# existing references to it continue to work; R still finds the function
# when `kmeans(...)` is called.
kmeans <- kmeans(x = data_rainfall1, centers = 5)
# Cluster assignment of every row.
y_kmeans <- kmeans$cluster

# Visualise the clusters. With more than two input columns clusplot() draws
# the points on the first two principal components, so the axes are labelled
# as components rather than as Year / rainfall.
library(cluster)
clusplot(data_rainfall1, y_kmeans, shade = TRUE, lines = 1, color = TRUE,
         main = "Cluster of Rainfall",
         xlab = "Component 1", ylab = "Component 2")

#mclust
library(mclust)
## Package 'mclust' version 5.4.5
## Type 'citation("mclust")' for citing this R package in publications.
# Fit a model-based clustering on the same input as k-means. No model name or
# number of components is passed, so the choice is left to Mclust.
fit <- Mclust(data_rainfall1)
plot(fit) # plot results

# The output below reports a Gaussian finite mixture model fitted by EM: VVE
# with 5 components on 628 observations.
summary(fit) # display the best model
## ---------------------------------------------------- 
## Gaussian finite mixture model fitted by EM algorithm 
## ---------------------------------------------------- 
## 
## Mclust VVE (ellipsoidal, equal orientation) model with 5 components: 
## 
##  log-likelihood   n df       BIC       ICL
##       -12153.75 628 64 -24719.81 -24899.77
## 
## Clustering table:
##   1   2   3   4   5 
## 136 199  56  93 144