The two variables’ distribution on their own:

par(mfrow = c(1,2))

wvprd <- density(buoy$WavePeriod, na.rm = TRUE)
plot(wvprd,  main = "Wave Period Density Plot")
abline(v = mean(buoy$WavePeriod, na.rm = TRUE), col = "red")

wvhgt <- density(buoy$WaveHeight, na.rm = TRUE)
plot(wvhgt,  main = "Wave Height Density Plot")
abline(v = mean(buoy$WaveHeight, na.rm = TRUE), col = "red")

Overall heatmap of the two vairables together:

ggplot(buoy, aes(y = WaveHeight, x = WavePeriod)) + 
    geom_point(aes(shape = station_id), size = 0.8) + 
    geom_density_2d_filled(aes(fill = ..level..), alpha = 0.5) +
    scale_shape_manual(values = c("M1" = 0, "M2" = 1, "M3" = 2, "FS1" = 3, 
                                  "M4-Archive" = 4, "M5" = 5, "M6" = 6, 
                                  "M4" = 7, "Belmullet-AMETS" = 8)) +
    scale_fill_viridis_d(option = "C")
## Warning: The dot-dot notation (`..level..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(level)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 2893 rows containing non-finite outside the scale range
## (`stat_density2d_filled()`).
## Warning: Removed 2893 rows containing missing values or values outside the scale range
## (`geom_point()`).

Split scatter depending on buoy:

# name of stations
sttns <- unique(buoy$station_id)
sttns
## [1] "M1"              "M2"              "M3"              "FS1"            
## [5] "M4-Archive"      "M5"              "M6"              "M4"             
## [9] "Belmullet-AMETS"
# seeing the counts
ggplot(buoy, aes(y = WaveHeight, x = WavePeriod)) + 
    geom_hex() + geom_smooth(method = "lm", alpha = 0.6) +
    facet_wrap(~station_id) + 
    scale_fill_viridis_c()
## Warning: Removed 2893 rows containing non-finite outside the scale range
## (`stat_binhex()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2893 rows containing non-finite outside the scale range
## (`stat_smooth()`).

- Most of the points are located in the bottom left corner, however certain buoys tend to have a higher variation (M1, M3, M4, M4 archive and M6) especially when their wave height and wave period values are higher.
- FS1, M2 and M5 don’t have variables as high as buoys mentioned prior and tend to stick close to the line of best fit.
- However for AMETS I am not too sure as there very little data compared to the others.

- Image was obtained here: https://www.researchgate.net/figure/rish-Moored-Weather-Buoy-Network-retrieved-from-http-datamarineie_fig1_277328871

Analysing Each buoy

# Are there outliers =, box plot of the two:

buoy$station_group <- ifelse(buoy$station_id %in% c("M1", "M3", "M4", "M4-Archive", "M6", "Belmullet-AMETS"), "Group_A", "Group_B")

ggplot(buoy, aes(x = station_id, y = WaveHeight, fill = station_group)) + 
    geom_boxplot() +
    scale_fill_manual(values = c("Group_A" = "maroon", "Group_B" = "yellow")) +
    labs(title = "Wave Height by Station",
         x = "Station ID",
         y = "Wave Height",
         fill = "Ocean") +
    theme_minimal()
## Warning: Removed 2890 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

ggplot(buoy, aes(x = station_id, y = WavePeriod, fill = station_group)) + 
    geom_boxplot() +
    scale_fill_manual(values = c("Group_A" = "maroon", "Group_B" = "yellow")) +
    labs(title = "Wave Period by Station",
         x = "Station ID",
         y = "Wave Period",
         fill = "Ocean") +
    theme_minimal()
## Warning: Removed 2893 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

buoy %>% filter(station_id == "Belmullet-AMETS") -> problbuoy
problbuoy
# summarize some key variables:
buoy %>% group_by(station_id) %>% 
    summarise(mWaveHeight=mean(WaveHeight, na.rm = TRUE))  ->
    waveheight_summary

buoy %>% group_by(station_id) %>% 
    summarise(mWavePeriod=mean(WavePeriod, na.rm = TRUE))  ->
    waveperiod_summary

# merge(waveheight_summary, waveperiod_summary)

Hypothesis:

ggplot(buoy,aes(x=as.Date(date),y=WaveHeight, colour = WindSpeed)) +
    geom_line() + 
    scale_color_gradient(low = "blue", high = "red") +
    facet_wrap(~station_id)

- It appears that the non-pacific buoys have lower wave heights due whenever wind speed is lower and visa versa.
- the larger values seem to be swayed by higher wind speeds.

# 
# ggplot(buoy,aes(x=as.Date(date),y=WavePeriod, colour = WindSpeed ))+
#     geom_line() + 
#     scale_color_gradient(low = "blue", high = "red") +
#     facet_wrap(~station_id)

ggplot(buoy,aes(y=WindSpeed,x=WavePeriod ))+
    geom_point(size = 0.5) +
    facet_wrap(~station_id)
## Warning: Removed 4876 rows containing missing values or values outside the scale range
## (`geom_point()`).

# 
# 
# buoy %>% group_by(station_id) %>%
#     summarise( corr = cor(WavePeriod, WindSpeed )) -> corr_mat
# 
# co

\[ \text{WaveHeight} = \alpha + \beta_o \beta_1 x\]

Fitting Wave height to wave period and wind speed

# Remove rows with missing values in the relevant columns
clean_buoy <- na.omit(buoy[, c("WaveHeight", "WavePeriod", "WindSpeed")])


lin_fit <- lm(WaveHeight~1 + WavePeriod,clean_buoy)
# summary(lin_fit)
lin_fit_A <- lm(WaveHeight~1 + WavePeriod + WindSpeed, clean_buoy)
# summary(lin_fit_A)
lin_fit_B <- lm(WaveHeight~1 + WavePeriod*WindSpeed, clean_buoy)
summary(lin_fit_B)
## 
## Call:
## lm(formula = WaveHeight ~ 1 + WavePeriod * WindSpeed, data = clean_buoy)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.7027 -0.1859  0.0422  0.2110  3.9197 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -0.6930735  0.0307952  -22.51   <2e-16 ***
## WavePeriod            0.2523710  0.0048072   52.50   <2e-16 ***
## WindSpeed            -0.0661739  0.0019580  -33.80   <2e-16 ***
## WavePeriod:WindSpeed  0.0257426  0.0002856   90.12   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4418 on 21472 degrees of freedom
## Multiple R-squared:  0.9095, Adjusted R-squared:  0.9095 
## F-statistic: 7.191e+04 on 3 and 21472 DF,  p-value: < 2.2e-16
# the t-values are high here for all coeffiecient
# compare the two with anova

lin_fits_anova_base <- anova(lin_fit, lin_fit_A)
lin_fits_anova <- anova(lin_fit_A, lin_fit_B)

lin_fits_anova_base
print("anova of lin_fit_A and lin_fit_B")
## [1] "anova of lin_fit_A and lin_fit_B"
lin_fits_anova
# they are significant

\[ \text{WaveHeight}_{l.o.b.f} \text{~ intercept + WavePeriod} \] \[ \text{WaveHeight}_{A} \text{~ intercept + WavePeriod + WindSpeed} \] \[ \text{WaveHeight}_{B} \text{~ intercept + WavePeriod*WindSpeed} \]

Anova:

buoy$predicted <- predict(lin_fit, newdata = buoy)
buoy$predicted_A <- predict(lin_fit_A, newdata = buoy)
buoy$predicted_B <- predict(lin_fit_B, newdata = buoy)


ggplot(buoy, aes(y = WaveHeight, x = WavePeriod)) + 
    geom_point(size = 1, alpha = 0.5) + 
    geom_smooth(method = "lm") +
    geom_line(aes(y = predicted_B), color = "red", alpha = 0.5) +
    facet_wrap(~station_id)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2893 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 2893 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 480 rows containing missing values or values outside the scale range
## (`geom_line()`).

### Conclusion ###