# Waves Dataset
# Import the Data 
# Read the wave readings from CDS.csv (resolved against the working
# directory) into `data`, then open the result in the data viewer.
path <- "CDS.csv"
data <- read.csv(file = path)
View(data)

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr) 
# Split the combined Date.Time column at the space into separate
# "Date" and "Time" columns.
dataset <- data %>%
  separate(col = Date.Time, into = c("Date", "Time"), sep = " ")
View(dataset)





# Break the slash-separated Date string into three columns named
# "Date", "Month" and "Year" (in that order of appearance in the string).
# Column names are case-sensitive: the column is `Date`; there is no `date`.
dataset <- separate(
  dataset,
  col = Date,
  into = c("Date", "Month", "Year"),
  sep = "/"
)
View(dataset)

library(ggplot2)


# Dimensions of the dataset (number of rows, number of columns)
dim(dataset) 
## [1] 43728    10
# Structure of the dataset: one line per column with its type and the
# first few values
str(dataset) 
## 'data.frame':    43728 obs. of  10 variables:
##  $ Date          : chr  "01" "01" "01" "01" ...
##  $ Month         : chr  "01" "01" "01" "01" ...
##  $ Year          : chr  "2017" "2017" "2017" "2017" ...
##  $ Time          : chr  "00:00" "00:30" "01:00" "01:30" ...
##  $ Hs            : num  -99.9 0.875 0.763 0.77 0.747 0.718 0.707 0.729 0.733 0.711 ...
##  $ Hmax          : num  -99.9 1.39 1.15 1.41 1.16 1.61 1.34 1.21 1.2 1.29 ...
##  $ Tz            : num  -99.9 4.42 4.52 4.58 4.51 ...
##  $ Tp            : num  -99.9 4.51 5.51 5.65 5.08 ...
##  $ Peak.Direction: num  -99.9 -99.9 49 75 91 68 73 63 68 66 ...
##  $ SST           : num  -99.9 -99.9 25.6 25.5 25.4 ...
# Column names of the dataset after the Date.Time column has been split
names(dataset) 
##  [1] "Date"           "Month"          "Year"           "Time"          
##  [5] "Hs"             "Hmax"           "Tz"             "Tp"            
##  [9] "Peak.Direction" "SST"
# First rows of the dataset
head(dataset) 
##   Date Month Year  Time      Hs   Hmax      Tz      Tp Peak.Direction
## 1   01    01 2017 00:00 -99.900 -99.90 -99.900 -99.900          -99.9
## 2   01    01 2017 00:30   0.875   1.39   4.421   4.506          -99.9
## 3   01    01 2017 01:00   0.763   1.15   4.520   5.513           49.0
## 4   01    01 2017 01:30   0.770   1.41   4.582   5.647           75.0
## 5   01    01 2017 02:00   0.747   1.16   4.515   5.083           91.0
## 6   01    01 2017 02:30   0.718   1.61   4.614   6.181           68.0
##      SST
## 1 -99.90
## 2 -99.90
## 3  25.65
## 4  25.50
## 5  25.45
## 6  25.45
# Last rows of the dataset
tail(dataset) 
##       Date Month Year  Time    Hs Hmax    Tz     Tp Peak.Direction   SST
## 43723   30    06 2019 21:00 2.174 3.30 9.557 12.875             94 21.95
## 43724   30    06 2019 21:30 2.299 3.60 9.281 12.765             94 21.95
## 43725   30    06 2019 22:00 2.075 3.04 9.303 12.722             95 21.95
## 43726   30    06 2019 22:30 2.157 3.43 9.168 12.890             97 21.95
## 43727   30    06 2019 23:00 2.087 2.84 8.706 10.963             92 21.95
## 43728   30    06 2019 23:30 1.926 2.98 8.509 12.228             84 21.95
# Structure of the dataset again; nothing has modified `dataset` since the
# earlier str() call, so the output is the same.
str(dataset) 
## 'data.frame':    43728 obs. of  10 variables:
##  $ Date          : chr  "01" "01" "01" "01" ...
##  $ Month         : chr  "01" "01" "01" "01" ...
##  $ Year          : chr  "2017" "2017" "2017" "2017" ...
##  $ Time          : chr  "00:00" "00:30" "01:00" "01:30" ...
##  $ Hs            : num  -99.9 0.875 0.763 0.77 0.747 0.718 0.707 0.729 0.733 0.711 ...
##  $ Hmax          : num  -99.9 1.39 1.15 1.41 1.16 1.61 1.34 1.21 1.2 1.29 ...
##  $ Tz            : num  -99.9 4.42 4.52 4.58 4.51 ...
##  $ Tp            : num  -99.9 4.51 5.51 5.65 5.08 ...
##  $ Peak.Direction: num  -99.9 -99.9 49 75 91 68 73 63 68 66 ...
##  $ SST           : num  -99.9 -99.9 25.6 25.5 25.4 ...
# Per-column summary statistics. In the output below every numeric column
# has a minimum of -99.9 -- presumably a missing-value placeholder; confirm
# against the data source.
summary(dataset)     
##      Date              Month               Year          
##  Length:43728       Length:43728       Length:43728      
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##      Time                 Hs               Hmax               Tz         
##  Length:43728       Min.   :-99.900   Min.   :-99.900   Min.   :-99.900  
##  Class :character   1st Qu.:  0.838   1st Qu.:  1.410   1st Qu.:  4.974  
##  Mode  :character   Median :  1.129   Median :  1.900   Median :  5.525  
##                     Mean   :  1.041   Mean   :  1.892   Mean   :  5.412  
##                     3rd Qu.:  1.542   3rd Qu.:  2.600   3rd Qu.:  6.162  
##                     Max.   :  4.257   Max.   :  7.906   Max.   : 10.921  
##        Tp          Peak.Direction       SST        
##  Min.   :-99.900   Min.   :-99.9   Min.   :-99.90  
##  1st Qu.:  7.286   1st Qu.: 85.0   1st Qu.: 21.90  
##  Median :  8.881   Median : 99.0   Median : 23.95  
##  Mean   :  8.795   Mean   : 97.4   Mean   : 23.21  
##  3rd Qu.: 10.663   3rd Qu.:116.0   3rd Qu.: 26.05  
##  Max.   : 21.121   Max.   :358.0   Max.   : 28.65
library(ggplot2) 
library(dplyr) 

# Clean the data ----
# Keep only the rows where both Hs and Peak.Direction are positive.
# NOTE(review): Hmax, Tz, Tp and SST are not filtered here, so non-positive
# values in those columns can still be present in `clean_data`.
clean_data <- filter(dataset, Hs > 0, Peak.Direction > 0)
View(clean_data)

# Structure of the cleaned data
str(clean_data)
## 'data.frame':    43454 obs. of  10 variables:
##  $ Date          : chr  "01" "01" "01" "01" ...
##  $ Month         : chr  "01" "01" "01" "01" ...
##  $ Year          : chr  "2017" "2017" "2017" "2017" ...
##  $ Time          : chr  "01:00" "01:30" "02:00" "02:30" ...
##  $ Hs            : num  0.763 0.77 0.747 0.718 0.707 0.729 0.733 0.711 0.698 0.686 ...
##  $ Hmax          : num  1.15 1.41 1.16 1.61 1.34 1.21 1.2 1.29 1.11 1.14 ...
##  $ Tz            : num  4.52 4.58 4.51 4.61 4.57 ...
##  $ Tp            : num  5.51 5.65 5.08 6.18 4.71 ...
##  $ Peak.Direction: num  49 75 91 68 73 63 68 66 64 56 ...
##  $ SST           : num  25.6 25.5 25.4 25.4 25.5 ...
# Coerce the date parts of the cleaned data: Year and Month become factors
# (they are used as grouping variables in the plots), Date becomes numeric.
clean_data <- clean_data %>%
  mutate(
    Year = as.factor(Year),
    Month = as.factor(Month),
    Date = as.numeric(Date)
  )

# Structure of the coerced, cleaned data
str(clean_data)
## 'data.frame':    43454 obs. of  10 variables:
##  $ Date          : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ Month         : Factor w/ 31 levels "01","02","03",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Year          : Factor w/ 3 levels "2017","2018",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Time          : chr  "01:00" "01:30" "02:00" "02:30" ...
##  $ Hs            : num  0.763 0.77 0.747 0.718 0.707 0.729 0.733 0.711 0.698 0.686 ...
##  $ Hmax          : num  1.15 1.41 1.16 1.61 1.34 1.21 1.2 1.29 1.11 1.14 ...
##  $ Tz            : num  4.52 4.58 4.51 4.61 4.57 ...
##  $ Tp            : num  5.51 5.65 5.08 6.18 4.71 ...
##  $ Peak.Direction: num  49 75 91 68 73 63 68 66 64 56 ...
##  $ SST           : num  25.6 25.5 25.4 25.4 25.5 ...
# Histograms ----
# Peak direction in bins of width 30, one panel per year (cleaned data).
ggplot(data = clean_data, aes(x = Peak.Direction)) +
  geom_histogram(binwidth = 30) +
  facet_wrap(~Year) +
  ggtitle("Distribution of Peak Direction")

# Sea surface temperature. Rows with a non-positive SST (the -99.9 values
# visible in summary(dataset)) are dropped first, and the bin width is small
# enough to resolve the observed range instead of lumping it into one bar.
dataset %>%
  filter(SST > 0) %>%
  ggplot(aes(x = SST)) +
  geom_histogram(binwidth = 0.5) +
  ggtitle("Distribution of Sea Surface Temperature")

# Frequency polygon of peak direction ----
# The colour is a fixed setting, so it is passed to the geom directly;
# inside aes() the string "orange" would be treated as a data value and
# produce a legend instead of an orange line.
# Uses the cleaned data so non-positive Peak.Direction rows are excluded,
# matching the faceted histogram of the same variable.
ggplot(data = clean_data, mapping = aes(x = Peak.Direction)) +
  geom_freqpoly(binwidth = 0.1, colour = "orange")

# Tile plot ----
# Count the rows for every (Peak.Direction, SST) combination and colour each
# tile by that count `n`. The axes are restricted to 0-200 for
# Peak.Direction and 20-30 for SST.
dataset %>%
  count(Peak.Direction, SST) %>%
  ggplot(mapping = aes(x = Peak.Direction, y = SST)) +
  geom_tile(mapping = aes(fill = n)) +
  xlim(c(0, 200)) +
  ylim(c(20, 30))

# Box plots ----
# Hs and Hmax from the cleaned data, drawn side by side in one figure.
with(clean_data, boxplot(Hs, Hmax))

# First-month subset ----
# Rows of the cleaned data whose Month value is "01".
clean_month <- filter(clean_data, Month == "01")
View(clean_month)




library(dplyr)
library(ggplot2)
library(tidyr)

# Hexagonal-bin plot ----
# install.packages("hexbin")
library(hexbin)
# Hs against Hmax for the first-month subset.
ggplot(clean_month, aes(x = Hs, y = Hmax)) +
  geom_hex()

# Treemap ----
# install.packages("treemap")

library(treemap)
# One group per Date value, sized by Hmax.
treemap(clean_data, index = "Date", vSize = "Hmax")

# Violin plot ----
# Peak direction per year. Drawn from the cleaned data so the non-positive
# Peak.Direction rows removed by the cleaning step do not stretch the violins.
ggplot(data = clean_data, aes(x = Year, y = Peak.Direction, fill = Year)) +
  geom_violin()

# Base-graphics plot ----
# Hmax against Year (Year is a factor in the cleaned data).
with(clean_data, plot(Year, Hmax))

# Scatter plot of Date against Hmax ----
# Columns are referenced by bare name inside aes(); ggplot2 looks them up in
# the `data` argument, which also keeps the legend titles readable.
# Points are coloured by Hmax and sized by Date; shape 36 is kept as in the
# original call.
ggplot(clean_data, aes(x = Date, y = Hmax, colour = Hmax, size = Date)) +
  geom_point(shape = 36) +
  xlab("Date") +
  ylab("Hmax") +
  ggtitle("Date VS Hmax")

# Bar charts ----
# Row counts per Tz value, bars filled by year.
ggplot(clean_data, aes(x = Tz, fill = Year)) +
  geom_bar() +
  xlab("Tz") +
  ggtitle("Bar chart for Tz")

# Row counts per SST value with the years placed next to each other.
ggplot(clean_data, aes(x = SST, fill = Year)) +
  geom_bar(position = "dodge") +
  xlab("SST")

# Same data with position "fill".
ggplot(clean_data, aes(x = SST, fill = Year)) +
  geom_bar(position = "fill")

# Position "fill" again, one panel per year, x-axis labels rotated by 90.
ggplot(clean_data, aes(x = SST, fill = Year)) +
  geom_bar(position = "fill") +
  facet_wrap(~Year) +
  theme(axis.text.x = element_text(angle = 90))

# Density plot ----
# Density of Hs with one semi-transparent fill per Month level.
ggplot(clean_data) +
  geom_density(aes(x = Hs, fill = Month), alpha = 0.4)

# "Time series" chart ----
# Line through Hs (x) against Month (y).
# NOTE(review): neither axis is a time variable, so despite the title this is
# not a time series -- confirm what was intended.
ggplot(clean_data, aes(x = Hs, y = Month)) +
  geom_line() +
  labs(title = "Time Series Chart")

# One horizontal segment per row from x = 0 to Hs at height Date, with a
# point at the Hs end.
ggplot(clean_data, aes(x = Hs, y = Date)) +
  geom_segment(aes(x = 0, xend = Hs, yend = Date)) +
  geom_point()

# Clustering input ----
# Start again from the raw `data`, keep only rows in which every measurement
# column is positive, and retain the two columns used for clustering.
# The columns are selected by name (Hs, Hmax) rather than by position so the
# selection does not silently change if the CSV column order does.
data_c <- data %>%
  filter(Hs > 0, Hmax > 0, Tz > 0, Tp > 0, Peak.Direction > 0, SST > 0) %>%
  select(Hs, Hmax)
View(data_c)




# Elbow method: within-cluster sum of squares for k = 1..10 ----
set.seed(123)

# One k-means fit per candidate k, in increasing order of k (the same order
# as a plain loop, so the random-number stream is consumed identically).
# vapply() returns a typed numeric vector instead of growing one in a loop.
wcss <- vapply(
  1:10,
  function(k) sum(kmeans(data_c, k)$withinss),
  numeric(1)
)

plot(1:10, wcss, type = "b", main = "Elbow method",
     xlab = "Number of cluster", ylab = "WCSS")

# Fit k-means with 5 clusters and plot the result ----
library(cluster)

set.seed(24)
# Stored as `kmeans_fit` so the result does not shadow the kmeans() function.
kmeans_fit <- kmeans(x = data_c, centers = 5)

# Cluster assignment of every row of data_c.
y_kmeans <- kmeans_fit$cluster

# Visualise the clusters.
clusplot(data_c, y_kmeans, lines = 0, color = TRUE, shade = TRUE,
         main = "cluster of waves", xlab = "Hs", ylab = "Hmax")