# Load the Chennai reservoir rainfall CSV from the working directory.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned, which would silently change how the header row is handled.
rainfall <- read.csv(
  file = "chennai_reservoir_rainfall.csv",
  header = TRUE,
  sep = ","
)
# List the column names as a quick sanity check on the import.
names(rainfall)
## [1] "Date" "POONDI" "CHOLAVARAM" "REDHILLS"
## [5] "CHEMBARAMBAKKAM"
# Per-column summary statistics. In the output below the median and third
# quartile of every reservoir column are 0, i.e. most rows record no rainfall.
summary(rainfall)
## Date POONDI CHOLAVARAM REDHILLS
## 01-01-2004: 1 Min. : 0.000 Min. : 0.00 Min. : 0.000
## 01-01-2005: 1 1st Qu.: 0.000 1st Qu.: 0.00 1st Qu.: 0.000
## 01-01-2006: 1 Median : 0.000 Median : 0.00 Median : 0.000
## 01-01-2007: 1 Mean : 3.377 Mean : 3.64 Mean : 3.764
## 01-01-2008: 1 3rd Qu.: 0.000 3rd Qu.: 0.00 3rd Qu.: 0.000
## 01-01-2009: 1 Max. :300.000 Max. :293.00 Max. :320.000
## (Other) :5732
## CHEMBARAMBAKKAM
## Min. : 0.000
## 1st Qu.: 0.000
## Median : 0.000
## Mean : 3.934
## 3rd Qu.: 0.000
## Max. :475.000
##
# Number of rows and columns in the raw data.
dim(rainfall)
## [1] 5738 5
# Column types together with a preview of the first values of each column.
str(rainfall)
## 'data.frame': 5738 obs. of 5 variables:
## $ Date : Factor w/ 5738 levels "01-01-2004","01-01-2005",..: 1 190 379 568 757 946 1135 1324 1513 1702 ...
## $ POONDI : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CHOLAVARAM : num 0 0 0 0 0 0 0 0 0 0 ...
## $ REDHILLS : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CHEMBARAMBAKKAM: num 0 0 0 0 0 0 0 0 0 0 ...
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Compact one-line-per-column preview of the data (types and leading values).
glimpse(rainfall)
## Observations: 5,738
## Variables: 5
## $ Date <fct> 01-01-2004, 02-01-2004, 03-01-2004, 04-01-2004...
## $ POONDI <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ CHOLAVARAM <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ REDHILLS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ CHEMBARAMBAKKAM <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
# Keep only the rows on which all four reservoirs recorded rainfall (> 0).
# Columns are referenced by bare name so each condition is evaluated against
# the data flowing through the pipe, not against the global `rainfall` object
# (which is what `rainfall$COL` inside filter() would read). Conditions
# separated by commas are combined with AND.
rainfall1 <- rainfall %>%
  filter(
    POONDI > 0,
    CHOLAVARAM > 0,
    REDHILLS > 0,
    CHEMBARAMBAKKAM > 0
  )
# The Date column is formatted as "DD-MM-YYYY", so we split it into three columns ("Day", "Month", "Year") using separate() from the tidyr package.
library(tidyr)
# Split Date on "-" into Day, Month and Year columns. The three new columns
# take the place of Date (see the column names printed below).
rainfall_data <- rainfall1 %>%
  separate(col = Date, into = c("Day", "Month", "Year"), sep = "-")
# Confirm the new column layout.
names(rainfall_data)
## [1] "Day" "Month" "Year" "POONDI"
## [5] "CHOLAVARAM" "REDHILLS" "CHEMBARAMBAKKAM"
# One subset per reservoir, each keeping only the rows where that reservoir's
# own reading is below 150. The other reservoirs' columns are left unfiltered.
rainfall_poondi <- filter(rainfall_data, POONDI < 150)
rainfall_cholavaram <- filter(rainfall_data, CHOLAVARAM < 150)
rainfall_redhills <- filter(rainfall_data, REDHILLS < 150)
rainfall_chembarambakkam <- filter(rainfall_data, CHEMBARAMBAKKAM < 150)
# Convert Year to a factor on the combined table only. This runs after the
# four subsets were taken, so their Year column is NOT a factor; the later
# as.numeric(Year) calls on the per-reservoir summaries depend on that.
rainfall_data[["Year"]] <- as.factor(rainfall_data[["Year"]])
library(ggplot2)
# Box plot: spread of the Poondi readings within each year.
ggplot(rainfall_poondi, aes(x = Year, y = POONDI)) +
  geom_boxplot() +
  labs(title = "Rainfall level in Poondi", x = "Year", y = "POONDI")

# Density plot: one translucent density per year on a log10 x axis.
ggplot(rainfall_poondi, aes(x = POONDI, fill = Year)) +
  geom_density(alpha = 0.1) +
  scale_x_log10() +
  labs(title = "Rainfall level in Poondi", x = "POONDI")

# Largest Poondi reading of each year, with Year converted to numeric so the
# plots below get a continuous x axis.
time_poondi <- rainfall_poondi %>%
  group_by(Year) %>%
  summarise(Max = max(POONDI)) %>%
  mutate(Year = as.numeric(Year)) %>%
  as.data.frame()

# Time series of the yearly maximum.
ggplot(time_poondi, aes(x = Year, y = Max)) +
  geom_point() +
  geom_line() +
  labs(title = "Rainfall level in Poondi", x = "Year", y = "POONDI")

# Scatter plot of the yearly maximum (point size follows Max) over a loess
# trend line.
ggplot(time_poondi, aes(x = Year, y = Max, size = Max)) +
  geom_smooth(method = "loess") +
  geom_point() +
  labs(title = "Rainfall level in Poondi", x = "Year", y = "POONDI")

# Heat map: Month on x, Year on y, tile colour from the Poondi reading.
# NOTE(review): several rows share the same Month/Year, so their tiles
# presumably overplot; consider aggregating per cell first.
ggplot(rainfall_poondi, aes(x = Month, y = Year, fill = POONDI)) +
  geom_tile() +
  labs(title = "Rainfall level in Poondi", x = "Month", y = "Year")

# Bar plot: the readings of a year are stacked into one bar.
ggplot(rainfall_poondi, aes(x = Year, y = POONDI)) +
  geom_col() +
  labs(title = "Rainfall level in Poondi", x = "Year", y = "POONDI")

# Violin plot: distribution of the readings per year, coloured by year.
ggplot(rainfall_poondi, aes(x = Year, y = POONDI, fill = Year)) +
  geom_violin() +
  labs(title = "Rainfall level in Poondi", x = "Year", y = "POONDI")
# Box plot: spread of the Cholavaram readings within each year.
ggplot(rainfall_cholavaram, aes(x = Year, y = CHOLAVARAM)) +
  geom_boxplot() +
  labs(title = "Rainfall level in Cholavaram", x = "Year", y = "CHOLAVARAM")

# Density plot: one translucent density per year on a log10 x axis.
ggplot(rainfall_cholavaram, aes(x = CHOLAVARAM, fill = Year)) +
  geom_density(alpha = 0.1) +
  scale_x_log10() +
  labs(title = "Rainfall level in Cholavaram", x = "CHOLAVARAM")

# Largest Cholavaram reading of each year, with Year converted to numeric so
# the plots below get a continuous x axis.
time_cholavaram <- rainfall_cholavaram %>%
  group_by(Year) %>%
  summarise(Max = max(CHOLAVARAM)) %>%
  mutate(Year = as.numeric(Year)) %>%
  as.data.frame()

# Time series of the yearly maximum.
ggplot(time_cholavaram, aes(x = Year, y = Max)) +
  geom_line() +
  geom_point() +
  labs(title = "Rainfall level in Cholavaram", x = "Year", y = "CHOLAVARAM")

# Scatter plot of the yearly maximum (point size follows Max) over a loess
# trend line.
ggplot(time_cholavaram, aes(x = Year, y = Max, size = Max)) +
  geom_smooth(method = "loess") +
  geom_point() +
  labs(title = "Rainfall level in Cholavaram", x = "Year", y = "CHOLAVARAM")

# Heat map: Month on x, Year on y, tile colour from the Cholavaram reading.
# NOTE(review): several rows share the same Month/Year, so their tiles
# presumably overplot; consider aggregating per cell first.
ggplot(rainfall_cholavaram, aes(x = Month, y = Year, fill = CHOLAVARAM)) +
  geom_tile() +
  labs(title = "Rainfall level in Cholavaram", x = "Month", y = "Year")

# Bar plot: the readings of a year are stacked into one bar.
ggplot(rainfall_cholavaram, aes(x = Year, y = CHOLAVARAM)) +
  geom_col() +
  labs(title = "Rainfall level in Cholavaram", x = "Year", y = "CHOLAVARAM")

# Violin plot: distribution of the readings per year, coloured by year.
ggplot(rainfall_cholavaram, aes(x = Year, y = CHOLAVARAM, fill = Year)) +
  geom_violin() +
  labs(title = "Rainfall level in Cholavaram", x = "Year", y = "CHOLAVARAM")
# Box plot: spread of the Redhills readings within each year.
ggplot(rainfall_redhills, aes(x = Year, y = REDHILLS)) +
  geom_boxplot() +
  labs(title = "Rainfall level in Redhills", x = "Year", y = "REDHILLS")

# Density plot: one translucent density per year on a log10 x axis.
ggplot(rainfall_redhills, aes(x = REDHILLS, fill = Year)) +
  geom_density(alpha = 0.1) +
  scale_x_log10() +
  labs(title = "Rainfall level in Redhills", x = "REDHILLS")

# Largest Redhills reading of each year, with Year converted to numeric so the
# plots below get a continuous x axis.
time_redhills <- rainfall_redhills %>%
  group_by(Year) %>%
  summarise(Max = max(REDHILLS)) %>%
  mutate(Year = as.numeric(Year)) %>%
  as.data.frame()

# Time series of the yearly maximum.
ggplot(time_redhills, aes(x = Year, y = Max)) +
  geom_line() +
  geom_point() +
  labs(title = "Rainfall level in Redhills", x = "Year", y = "REDHILLS")

# Scatter plot of the yearly maximum (point size follows Max) over a loess
# trend line.
ggplot(time_redhills, aes(x = Year, y = Max, size = Max)) +
  geom_smooth(method = "loess") +
  geom_point() +
  labs(title = "Rainfall level in Redhills", x = "Year", y = "REDHILLS")

# Heat map: Month on x, Year on y, tile colour from the Redhills reading.
# NOTE(review): several rows share the same Month/Year, so their tiles
# presumably overplot; consider aggregating per cell first.
ggplot(rainfall_redhills, aes(x = Month, y = Year, fill = REDHILLS)) +
  geom_tile() +
  labs(title = "Rainfall level in Redhills", x = "Month", y = "Year")

# Bar plot: the readings of a year are stacked into one bar.
ggplot(rainfall_redhills, aes(x = Year, y = REDHILLS)) +
  geom_col() +
  labs(title = "Rainfall level in Redhills", x = "Year", y = "REDHILLS")

# Violin plot: distribution of the readings per year, coloured by year.
ggplot(rainfall_redhills, aes(x = Year, y = REDHILLS, fill = Year)) +
  geom_violin() +
  labs(title = "Rainfall level in Redhills", x = "Year", y = "REDHILLS")
# Box plot: spread of the Chembarambakkam readings within each year.
ggplot(rainfall_chembarambakkam, aes(x = Year, y = CHEMBARAMBAKKAM)) +
  geom_boxplot() +
  labs(
    title = "Rainfall level in CHEMBARAMBAKKAM",
    x = "Year",
    y = "CHEMBARAMBAKKAM"
  )

# Density plot: one translucent density per year on a log10 x axis.
ggplot(rainfall_chembarambakkam, aes(x = CHEMBARAMBAKKAM, fill = Year)) +
  geom_density(alpha = 0.1) +
  scale_x_log10() +
  labs(title = "Rainfall level in CHEMBARAMBAKKAM", x = "CHEMBARAMBAKKAM")
# Largest Chembarambakkam reading of each year.
# The maximum is taken from the CHEMBARAMBAKKAM column; it previously used
# CHOLAVARAM, so the two plots below showed another reservoir's values under
# Chembarambakkam labels.
time_chembarambakkam <- rainfall_chembarambakkam %>%
  group_by(Year) %>%
  summarise(Max = max(CHEMBARAMBAKKAM)) %>%
  as.data.frame()
# Year must be numeric so the plots below get a continuous x axis.
time_chembarambakkam$Year <- as.numeric(time_chembarambakkam$Year)

# Time series of the yearly maximum.
ggplot(time_chembarambakkam, aes(x = Year, y = Max)) +
  geom_line() +
  geom_point() +
  labs(
    title = "Rainfall level in CHEMBARAMBAKKAM",
    x = "Year",
    y = "CHEMBARAMBAKKAM"
  )

# Scatter plot of the yearly maximum (point size follows Max) over a loess
# trend line.
ggplot(time_chembarambakkam, aes(x = Year, y = Max, size = Max)) +
  geom_smooth(method = "loess") +
  geom_point() +
  labs(
    title = "Rainfall level in CHEMBARAMBAKKAM",
    x = "Year",
    y = "CHEMBARAMBAKKAM"
  )
# Heat map: Month on x, Year on y, tile colour from the Chembarambakkam
# reading.
# NOTE(review): several rows share the same Month/Year, so their tiles
# presumably overplot; consider aggregating per cell first.
ggplot(
  rainfall_chembarambakkam,
  aes(x = Month, y = Year, fill = CHEMBARAMBAKKAM)
) +
  geom_tile() +
  labs(title = "Rainfall level in CHEMBARAMBAKKAM", x = "Month", y = "Year")

# Bar plot: the readings of a year are stacked into one bar.
ggplot(rainfall_chembarambakkam, aes(x = Year, y = CHEMBARAMBAKKAM)) +
  geom_col() +
  labs(
    title = "Rainfall level in CHEMBARAMBAKKAM",
    x = "Year",
    y = "CHEMBARAMBAKKAM"
  )

# Violin plot: distribution of the readings per year, coloured by year.
ggplot(
  rainfall_chembarambakkam,
  aes(x = Year, y = CHEMBARAMBAKKAM, fill = Year)
) +
  geom_violin() +
  labs(
    title = "Rainfall level in CHEMBARAMBAKKAM",
    x = "Year",
    y = "CHEMBARAMBAKKAM"
  )
# Comparing Chembarambakkam, Redhills, Cholavaram and Poondi.
# Pie chart: a single stacked bar of row counts per Year, bent into a circle
# by mapping the y axis to the angle.
# NOTE(review): the slices are row counts per year, not rainfall amounts;
# confirm that this is what the title is meant to convey.
ggplot(rainfall_data, aes(x = "", fill = Year)) +
  geom_bar() +
  coord_polar(theta = "y") +
  labs(title = "Pie-chart for Rainfall level in all the areas")
# Reshape to long format for comparing the four reservoirs: columns 4 to 7
# (the reservoir readings) become Area / Rainfall_lvl pairs.
rainfall_gather <- rainfall_data %>%
  gather(key = "Area", value = "Rainfall_lvl", 4:7)
# Bar plot: total rainfall per year for each area.
# The previous geom_bar() only counted rows and never used Rainfall_lvl.
# Because every row is replicated once per area by the reshape, all four
# facets showed identical bars. The readings are now summed per Area/Year and
# drawn with geom_col().
# NOTE(review): a yearly total is assumed to be the intended measure; swap
# sum() for another summary if that is not the case.
rainfall_gather %>%
  group_by(Area, Year) %>%
  summarise(Rainfall_lvl = sum(Rainfall_lvl)) %>%
  ungroup() %>%
  ggplot(aes(x = Area, y = Rainfall_lvl, fill = Year)) +
  geom_col(position = "dodge") +
  facet_wrap(~Area, scales = "free") +
  labs(title = "Comparision of rainfall level in all the area")
# Clustering input: columns 3 to 7 of the combined table, i.e. Year plus the
# four reservoir readings.
# NOTE(review): Year is included as a clustering variable; confirm this is
# intended rather than clustering on the four rainfall columns only.
data_rainfall <- rainfall_data
data_rainfall1 <- data_rainfall[3:7]
# Elbow plot - to find the number of centroids.
# Total within-cluster sum of squares for k = 1..10. vapply() runs the fits in
# the same order as a loop would (so the seeded random starts are unchanged)
# and returns a numeric vector without growing it element by element.
set.seed(5)
wcss <- vapply(
  1:10,
  function(k) sum(kmeans(data_rainfall1, k)$withinss),
  numeric(1)
)
plot(
  1:10, wcss,
  type = "b",
  main = "The Elbow Method",
  xlab = "Number of Cluster",
  ylab = "WCSS"
)
# Fit k-means with 5 clusters to the clustering input.
# The result is stored under the name `kmeans`, which shadows the function for
# value lookups; the function is therefore called as stats::kmeans to keep the
# call unambiguous. The variable name is kept in case later code reads it.
kmeans <- stats::kmeans(x = data_rainfall1, centers = 5)
# Cluster index assigned to each row.
y_kmeans <- kmeans$cluster
# Visualising the clusters.
library(cluster)
# NOTE(review): clusplot presumably draws the data on derived component axes
# rather than on Year and rainfall directly; verify the axis labels below.
clusplot(
  data_rainfall1, y_kmeans,
  shade = TRUE,
  lines = 1,
  color = TRUE,
  main = "Cluster of Rainfall",
  xlab = "Year",
  ylab = "OVERALL RAINFALL LEVEL"
)
#mclust
library(mclust)
## Package 'mclust' version 5.4.5
## Type 'citation("mclust")' for citing this R package in publications.
# Fit a Gaussian finite mixture model to the clustering input. No model or
# component count is passed, so Mclust picks them itself (the summary below
# reports a VVE model with 5 components).
fit <- Mclust(data_rainfall1)
plot(fit) # draw the diagnostic plots for the fitted model
summary(fit) # print the selected model and its clustering table
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust VVE (ellipsoidal, equal orientation) model with 5 components:
##
## log-likelihood n df BIC ICL
## -12153.75 628 64 -24719.81 -24899.77
##
## Clustering table:
## 1 2 3 4 5
## 136 199 56 93 144