# Setup ----
# Use the CRAN cloud mirror so package installation works non-interactively.
options(repos = c(CRAN = "https://cloud.r-project.org/"))

# Install only the packages that are not already available, instead of
# re-downloading them on every run of the script.
required_packages <- c("ggplot2", "corrplot", "factoextra", "dplyr", "cluster")
missing_packages <- required_packages[
  !vapply(required_packages, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}

# Load necessary libraries (order kept so name masking is unchanged:
# dplyr is attached after ggplot2 and masks stats::filter / stats::lag).
library(corrplot)
library(factoextra)
library(ggplot2)
library(dplyr)
library(cluster)
# Load the dataset
# The CSV location defaults to the original hard-coded download folder; set
# the FW_SWIM_CSV environment variable to point at the file on another machine.
data_path <- Sys.getenv(
  "FW_SWIM_CSV",
  unset = "C:\\Users\\Naman\\Downloads\\FW Swim.csv"
)
# Fail early with a readable message rather than read.csv()'s connection error.
if (!file.exists(data_path)) {
  stop("Dataset not found: ", data_path, call. = FALSE)
}
beach_data <- read.csv(data_path)
# Inspect the freshly loaded dataset: column types, summary statistics, and
# the number of missing values per column.
# NOTE: the lines starting with `##` below appear to be console output captured
# from an earlier run and kept for reference. They are comments, are not
# regenerated when the script runs, and may not match the current CSV.

# Check the structure of the dataset
str(beach_data)
## 'data.frame': 3443 obs. of 11 variables:
## $ Beach : chr "Angle Lake" "Angle Lake" "Angle Lake" "Angle Lake" ...
## $ Jurisdiction : chr "SeaTac" "SeaTac" "SeaTac" "SeaTac" ...
## $ Locator : chr "A732SB" "A732SB" "A732SB" "A732SB" ...
## $ Date : chr "2024-07-15" "2024-07-22" "2024-07-29" "2024-08-05" ...
## $ Day : chr "Mon" "Mon" "Mon" "Mon" ...
## $ Time : chr "08:24" "08:16" "09:26" "08:33" ...
## $ Geomean30d : num 14.54 11.41 9.93 9.66 10.89 ...
## $ nSamplesHigh30d: int 0 0 0 0 0 0 0 0 0 0 ...
## $ HighToday : chr "false" "false" "false" "false" ...
## $ WaterTempC : num 24.4 24.3 22.7 23.3 23.5 24.3 22 21.1 21.7 20 ...
## $ WaterTempF : num 75.9 75.7 72.9 73.9 74.3 ...
# Summary statistics
summary(beach_data)
## Beach Jurisdiction Locator Date
## Length:3443 Length:3443 Length:3443 Length:3443
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Day Time Geomean30d nSamplesHigh30d
## Length:3443 Length:3443 Min. : 1.000 Min. : 0.000
## Class :character Class :character 1st Qu.: 7.201 1st Qu.: 0.000
## Mode :character Mode :character Median : 17.209 Median : 0.000
## Mean : 34.201 Mean : 0.846
## 3rd Qu.: 40.939 3rd Qu.: 1.000
## Max. :610.551 Max. :12.000
## NA's :566 NA's :566
## HighToday WaterTempC WaterTempF
## Length:3443 Min. : 9.90 Min. :49.82
## Class :character 1st Qu.:18.00 1st Qu.:64.40
## Mode :character Median :20.90 Median :69.62
## Mean :20.45 Mean :68.81
## 3rd Qu.:23.00 3rd Qu.:73.40
## Max. :29.40 Max. :84.92
## NA's :46 NA's :46
# Check for missing values
# is.na() yields a logical matrix; colSums() counts the TRUEs in each column.
# In the captured run only Geomean30d, nSamplesHigh30d (566 each) and
# WaterTempC, WaterTempF (46 each) contain NAs.
colSums(is.na(beach_data))
## Beach Jurisdiction Locator Date Day
## 0 0 0 0 0
## Time Geomean30d nSamplesHigh30d HighToday WaterTempC
## 0 566 566 0 46
## WaterTempF
## 46
# Keep only complete rows. Removing every row that contains an NA leaves no
# missing values in any column, so no additional per-column (WaterTempC) or
# column-dropping NA filter is needed before plotting, correlation, and
# clustering. Original row names are retained.
beach_data <- beach_data[complete.cases(beach_data), , drop = FALSE]
# Plotting histograms for Geomean30d, nSamplesHigh30d, and WaterTempC

# Build a 30-bin histogram of one column of `data`.
#   data:    data frame holding the column
#   column:  name of the column to plot, as a single string
#   fill:    fill colour of the bars
#   x_label: x-axis label (defaults to the column name)
# Returns the ggplot object; it is drawn when printed, which happens
# automatically for the top-level calls below.
plot_histogram <- function(data, column, fill, x_label = column) {
  stopifnot(
    is.character(column),
    length(column) == 1L,
    column %in% names(data)
  )
  # .data[[column]] looks the column up by its string name inside aes().
  ggplot(data, aes(x = .data[[column]])) +
    geom_histogram(bins = 30, fill = fill, color = "black", alpha = 0.7) +
    theme_minimal() +
    labs(title = paste("Distribution of", column), x = x_label)
}

plot_histogram(beach_data, "Geomean30d", fill = "blue")

plot_histogram(beach_data, "nSamplesHigh30d", fill = "green")

plot_histogram(
  beach_data,
  "WaterTempC",
  fill = "orange",
  x_label = "WaterTempC (°C)"
)

# Pairwise correlations (cor() defaults) between the three numeric measures,
# then a corrplot of that matrix drawn with the "circle" method.
cor_matrix <- cor(
  beach_data[, c("Geomean30d", "nSamplesHigh30d", "WaterTempC")]
)
corrplot(corr = cor_matrix, method = "circle")

# K-means Clustering
# Scale the data: scale() centres each column and divides it by its standard
# deviation, so no single variable dominates the distance computation.
scaled_data <- scale(
  beach_data[, c("Geomean30d", "nSamplesHigh30d", "WaterTempC")]
)
# Determine the optimal number of clusters using the Elbow Method.
# kmeans() starts from random centres, so seed the RNG to make the elbow plot
# reproducible, and forward nstart = 25 (the value used for the final fit) so
# each candidate k is not judged on a single, possibly poor, random start.
set.seed(123)
fviz_nbclust(scaled_data, kmeans, method = "wss", nstart = 25) +
  labs(title = "Elbow Method for Optimal Number of Clusters")

# Fit the final k-means model on the scaled measures. Three clusters is an
# illustrative choice; nstart = 25 tries 25 random initialisations.
# Seed the RNG first so the fit, and therefore the labels, are reproducible.
set.seed(123)
kmeans_result <- kmeans(x = scaled_data, centers = 3, nstart = 25)
# Attach each row's cluster label to the original dataset as a factor column.
beach_data[["Cluster"]] <- as.factor(kmeans_result[["cluster"]])
# Scatter plot of the clustering result in the Geomean30d / WaterTempC plane,
# one fixed colour per cluster (three colours for the three clusters).
ggplot(
  data = beach_data,
  mapping = aes(x = Geomean30d, y = WaterTempC, color = Cluster)
) +
  geom_point() +
  scale_color_manual(values = c("red", "green", "blue")) +
  labs(
    title = "K-means Clustering of Beaches",
    x = "Geomean30d",
    y = "WaterTempC"
  )
