# Waves Dataset
# Import the data
# Read the Waves dataset from a CSV file located in the working directory.
path <- "CDS.csv"
data <- read.csv(path)
# View() opens an interactive data viewer, so only call it when a user is
# actually at the console; batch runs (Rscript, knitting) skip it.
if (interactive()) {
  View(data)
}
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
# Split the combined Date.Time column at the space into "Date" and "Time".
dataset <- separate(data, Date.Time, c("Date", "Time"), sep = " ")
if (interactive()) {
  View(dataset)
}
# A former `date <- as.double(dataset$date)` statement was dropped here: `$` is
# case-sensitive and there is no lowercase `date` column, so it always yielded
# an empty vector that nothing in this script used.
# Split the "/"-delimited date string into three character columns.
dataset <- separate(dataset, Date, c("Date", "Month", "Year"), sep = "/")
if (interactive()) {
  View(dataset)
}
library(ggplot2)
# Dimensions of the dataset (number of rows, number of columns)
dim(dataset)
## [1] 43728 10
# Structure of the dataset: one line per column with its type and first values
str(dataset)
## 'data.frame': 43728 obs. of 10 variables:
## $ Date : chr "01" "01" "01" "01" ...
## $ Month : chr "01" "01" "01" "01" ...
## $ Year : chr "2017" "2017" "2017" "2017" ...
## $ Time : chr "00:00" "00:30" "01:00" "01:30" ...
## $ Hs : num -99.9 0.875 0.763 0.77 0.747 0.718 0.707 0.729 0.733 0.711 ...
## $ Hmax : num -99.9 1.39 1.15 1.41 1.16 1.61 1.34 1.21 1.2 1.29 ...
## $ Tz : num -99.9 4.42 4.52 4.58 4.51 ...
## $ Tp : num -99.9 4.51 5.51 5.65 5.08 ...
## $ Peak.Direction: num -99.9 -99.9 49 75 91 68 73 63 68 66 ...
## $ SST : num -99.9 -99.9 25.6 25.5 25.4 ...
# Column names of the dataset after the Date.Time column has been split
names(dataset)
## [1] "Date" "Month" "Year" "Time"
## [5] "Hs" "Hmax" "Tz" "Tp"
## [9] "Peak.Direction" "SST"
# First six rows of the dataset
head(dataset)
## Date Month Year Time Hs Hmax Tz Tp Peak.Direction
## 1 01 01 2017 00:00 -99.900 -99.90 -99.900 -99.900 -99.9
## 2 01 01 2017 00:30 0.875 1.39 4.421 4.506 -99.9
## 3 01 01 2017 01:00 0.763 1.15 4.520 5.513 49.0
## 4 01 01 2017 01:30 0.770 1.41 4.582 5.647 75.0
## 5 01 01 2017 02:00 0.747 1.16 4.515 5.083 91.0
## 6 01 01 2017 02:30 0.718 1.61 4.614 6.181 68.0
## SST
## 1 -99.90
## 2 -99.90
## 3 25.65
## 4 25.50
## 5 25.45
## 6 25.45
# Last six rows of the dataset
tail(dataset)
## Date Month Year Time Hs Hmax Tz Tp Peak.Direction SST
## 43723 30 06 2019 21:00 2.174 3.30 9.557 12.875 94 21.95
## 43724 30 06 2019 21:30 2.299 3.60 9.281 12.765 94 21.95
## 43725 30 06 2019 22:00 2.075 3.04 9.303 12.722 95 21.95
## 43726 30 06 2019 22:30 2.157 3.43 9.168 12.890 97 21.95
## 43727 30 06 2019 23:00 2.087 2.84 8.706 10.963 92 21.95
## 43728 30 06 2019 23:30 1.926 2.98 8.509 12.228 84 21.95
# Structure of the dataset again (same call as earlier; dataset is unchanged
# in between, so the output is identical)
str(dataset)
## 'data.frame': 43728 obs. of 10 variables:
## $ Date : chr "01" "01" "01" "01" ...
## $ Month : chr "01" "01" "01" "01" ...
## $ Year : chr "2017" "2017" "2017" "2017" ...
## $ Time : chr "00:00" "00:30" "01:00" "01:30" ...
## $ Hs : num -99.9 0.875 0.763 0.77 0.747 0.718 0.707 0.729 0.733 0.711 ...
## $ Hmax : num -99.9 1.39 1.15 1.41 1.16 1.61 1.34 1.21 1.2 1.29 ...
## $ Tz : num -99.9 4.42 4.52 4.58 4.51 ...
## $ Tp : num -99.9 4.51 5.51 5.65 5.08 ...
## $ Peak.Direction: num -99.9 -99.9 49 75 91 68 73 63 68 66 ...
## $ SST : num -99.9 -99.9 25.6 25.5 25.4 ...
# Per-column summary statistics. The character columns only report their
# length; the numeric columns report quartiles, mean and range.
summary(dataset)
## Date Month Year
## Length:43728 Length:43728 Length:43728
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## Time Hs Hmax Tz
## Length:43728 Min. :-99.900 Min. :-99.900 Min. :-99.900
## Class :character 1st Qu.: 0.838 1st Qu.: 1.410 1st Qu.: 4.974
## Mode :character Median : 1.129 Median : 1.900 Median : 5.525
## Mean : 1.041 Mean : 1.892 Mean : 5.412
## 3rd Qu.: 1.542 3rd Qu.: 2.600 3rd Qu.: 6.162
## Max. : 4.257 Max. : 7.906 Max. : 10.921
## Tp Peak.Direction SST
## Min. :-99.900 Min. :-99.9 Min. :-99.90
## 1st Qu.: 7.286 1st Qu.: 85.0 1st Qu.: 21.90
## Median : 8.881 Median : 99.0 Median : 23.95
## Mean : 8.795 Mean : 97.4 Mean : 23.21
## 3rd Qu.: 10.663 3rd Qu.:116.0 3rd Qu.: 26.05
## Max. : 21.121 Max. :358.0 Max. : 28.65
library(ggplot2)
library(dplyr)
# Cleaning the data
# Every measurement column has a minimum of -99.9 in the raw data, which
# appears to be a placeholder for a missing reading. Keep only the rows where
# all six measurements are positive, so no placeholder leaks into later plots
# of Hmax, Tz, Tp or SST.
clean_data <- dataset %>%
  filter(Hs > 0, Hmax > 0, Tz > 0, Tp > 0, Peak.Direction > 0, SST > 0)
# View() needs an interactive session; skip it in batch runs.
if (interactive()) {
  View(clean_data)
}
# Structure of the clean data
str(clean_data)
## 'data.frame': 43454 obs. of 10 variables:
## $ Date : chr "01" "01" "01" "01" ...
## $ Month : chr "01" "01" "01" "01" ...
## $ Year : chr "2017" "2017" "2017" "2017" ...
## $ Time : chr "01:00" "01:30" "02:00" "02:30" ...
## $ Hs : num 0.763 0.77 0.747 0.718 0.707 0.729 0.733 0.711 0.698 0.686 ...
## $ Hmax : num 1.15 1.41 1.16 1.61 1.34 1.21 1.2 1.29 1.11 1.14 ...
## $ Tz : num 4.52 4.58 4.51 4.61 4.57 ...
## $ Tp : num 5.51 5.65 5.08 6.18 4.71 ...
## $ Peak.Direction: num 49 75 91 68 73 63 68 66 64 56 ...
## $ SST : num 25.6 25.5 25.4 25.4 25.5 ...
# Coerce the clean data: Year and Month become factors (for grouping and
# colouring in plots), Date becomes numeric.
clean_data <- clean_data %>%
  mutate(
    Year = as.factor(Year),
    Month = as.factor(Month),
    Date = as.numeric(Date)
  )
# Structure of the coerced clean data
str(clean_data)
## 'data.frame': 43454 obs. of 10 variables:
## $ Date : num 1 1 1 1 1 1 1 1 1 1 ...
## $ Month : Factor w/ 31 levels "01","02","03",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Year : Factor w/ 3 levels "2017","2018",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Time : chr "01:00" "01:30" "02:00" "02:30" ...
## $ Hs : num 0.763 0.77 0.747 0.718 0.707 0.729 0.733 0.711 0.698 0.686 ...
## $ Hmax : num 1.15 1.41 1.16 1.61 1.34 1.21 1.2 1.29 1.11 1.14 ...
## $ Tz : num 4.52 4.58 4.51 4.61 4.57 ...
## $ Tp : num 5.51 5.65 5.08 6.18 4.71 ...
## $ Peak.Direction: num 49 75 91 68 73 63 68 66 64 56 ...
## $ SST : num 25.6 25.5 25.4 25.4 25.5 ...
# Visualizing
# Histogram plots
# Peak direction in 30-unit bins, one panel per year.
ggplot(data = clean_data, aes(x = Peak.Direction)) +
  geom_histogram(binwidth = 30) +
  facet_wrap(~Year) +
  ggtitle("Distribution of Peak Direction")

# Sea surface temperature. The -99.9 placeholder readings are dropped first,
# and the bin width is small relative to the observed range (roughly 20-29 in
# the summary output) so that the shape of the distribution is visible.
ggplot(data = filter(dataset, SST > 0), aes(x = SST)) +
  geom_histogram(binwidth = 0.5) +
  ggtitle("Distribution of Sea Surface Temperature")

# Frequency polygon of peak direction
# The colour is a fixed setting, so it belongs outside aes(); inside aes() the
# string "orange" is treated as a data value and gets a default palette colour
# plus a spurious legend. Rows holding the -99.9 placeholder are dropped so
# they do not stretch the x axis.
ggplot(
  data = filter(dataset, Peak.Direction > 0),
  mapping = aes(x = Peak.Direction)
) +
  geom_freqpoly(binwidth = 0.1, colour = "orange")

# Tile plot
# Count how often each (Peak.Direction, SST) pair occurs and colour each tile
# by that count `n`. Filling by SST would only repeat the y axis and leave the
# count unused.
dataset %>%
  count(Peak.Direction, SST) %>%
  ggplot(mapping = aes(x = Peak.Direction, y = SST)) +
  geom_tile(mapping = aes(fill = n)) +
  xlim(c(0, 200)) +
  ylim(c(20, 30))

# Boxplot: side-by-side boxes for Hs (first) and Hmax (second)
with(clean_data, boxplot(Hs, Hmax))

# Extract the rows whose Month value is "01"
clean_month <- clean_data %>%
  filter(Month == "01")
# View() needs an interactive session; skip it in batch runs.
if (interactive()) {
  View(clean_month)
}
library(dplyr)
library(ggplot2)
library(tidyr)
# Hex plot: hexagonal binning of Hs against Hmax for the clean_month subset
#install.packages("hexbin")
library(hexbin)
ggplot(clean_month, aes(x = Hs, y = Hmax)) +
  geom_hex()

# Treemap: one rectangle per Date value, sized by Hmax
#install.packages("treemap")
library(treemap)
treemap(clean_data, index = "Date", vSize = "Hmax")

# Violin plot of peak direction per year
# Rows holding the -99.9 placeholder are dropped first; otherwise each violin
# is stretched down to -99.9 and the real distribution is compressed.
ggplot(
  data = filter(dataset, Peak.Direction > 0),
  aes(x = Year, y = Peak.Direction, fill = Year)
) +
  geom_violin()

# Base plot of Hmax against Year (Year is a factor in clean_data)
with(clean_data, plot(Year, Hmax))

# Point plot of Hmax against Date, coloured by Hmax and sized by Date
# Columns are referenced by bare name inside aes(): writing clean_data$col
# there bypasses ggplot2's data masking (it breaks with facets or a changed
# `data` argument) and produces legend titles such as "clean_data$Hmax".
ggplot(clean_data, aes(x = Date, y = Hmax, col = Hmax, size = Date)) +
  geom_point(shape = 36) +
  xlab("Date") +
  ylab("Hmax") +
  ggtitle("Date VS Hmax")

# Bar charts
# Counts per Tz value, stacked by year.
ggplot(clean_data, aes(x = Tz, fill = Year)) +
  geom_bar() +
  labs(x = "Tz", title = "Bar chart for Tz")

# Counts per SST value with the years placed side by side.
ggplot(clean_data, aes(x = SST, fill = Year)) +
  geom_bar(position = "dodge") +
  labs(x = "SST")

# Share of each year within every SST value (bars scaled to full height).
ggplot(clean_data, aes(x = SST, fill = Year)) +
  geom_bar(position = "fill")

# Same proportional bars, one panel per year, with vertical x-axis labels.
ggplot(clean_data, aes(x = SST, fill = Year)) +
  geom_bar(position = "fill") +
  facet_wrap(~Year) +
  theme(axis.text.x = element_text(angle = 90))

# Density plot of Hs, one semi-transparent curve per Month level
ggplot(clean_data, aes(x = Hs, fill = Month)) +
  geom_density(alpha = 0.4)

# "Time series" chart
# NOTE(review): this draws Hs on the x axis against Month on the y axis; no
# time variable is on either axis, so confirm this is the intended chart.
ggplot(clean_data, aes(x = Hs, y = Month)) +
  geom_line() +
  ggtitle("Time Series Chart")

# Horizontal segments from 0 to Hs at each Date value, with a point at the end
ggplot(clean_data, aes(x = Hs, y = Date)) +
  geom_segment(aes(x = 0, xend = Hs, yend = Date)) +
  geom_point()

# Filtering the data for clustering
# Start again from the raw import, drop every row with a non-positive
# measurement, and keep only the two columns to cluster on. Hs and Hmax are
# selected by name; they are columns 2 and 3 of the raw data, but a positional
# index would silently pick other columns if the file layout changed.
data_c <- data %>%
  filter(Hs > 0, Hmax > 0, Tz > 0, Tp > 0, Peak.Direction > 0, SST > 0) %>%
  select(Hs, Hmax)
# View() needs an interactive session; skip it in batch runs.
if (interactive()) {
  View(data_c)
}
# Elbow method to find the optimal number of clusters
# For k = 1..10, fit k-means on data_c and record the total within-cluster sum
# of squares. vapply() runs the fits in the same order as a loop would, so the
# seed gives the same sequence of results, and it returns a numeric vector of
# the right length instead of growing one element at a time.
set.seed(123)
wcss <- vapply(
  1:10,
  function(k) sum(kmeans(data_c, k)$withinss),
  numeric(1)
)
plot(1:10, wcss,
  type = "b", main = "Elbow method",
  xlab = "Number of cluster", ylab = "WCSS"
)

# Fitting k-means to the dataset with 5 clusters
# The fit is stored as `kmeans_fit` rather than `kmeans`, so the variable does
# not shadow the stats::kmeans function it was created with.
set.seed(24)
kmeans_fit <- kmeans(x = data_c, centers = 5)
# Cluster assignment (integer label) for every row of data_c.
y_kmeans <- kmeans_fit$cluster
library(cluster)
# Visualize the clusters
# NOTE(review): clusplot() appears to draw the data on principal-component
# axes rather than the raw columns, so the "Hs"/"Hmax" axis labels may be
# misleading; confirm against the cluster package documentation.
clusplot(data_c, y_kmeans,
  lines = 0, color = TRUE, shade = TRUE,
  main = "cluster of waves", xlab = "Hs", ylab = "Hmax"
)
