Clustering and EDA for Freshwater Swim Beach Data

# Use a fixed CRAN mirror so install.packages() can run non-interactively
options(repos = c(CRAN = "https://cloud.r-project.org/"))

# Load necessary libraries
# Install ggplot2 only when it is missing, so that re-running the script does
# not download and reinstall the package every time
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}

## Installing package into 'C:/Users/Naman/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)

## package 'ggplot2' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Naman\AppData\Local\Temp\RtmpE9DMKh\downloaded_packages

# Install corrplot only when it is missing (avoids a reinstall on every run)
if (!requireNamespace("corrplot", quietly = TRUE)) {
  install.packages("corrplot")
}

## Installing package into 'C:/Users/Naman/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)

## package 'corrplot' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Naman\AppData\Local\Temp\RtmpE9DMKh\downloaded_packages

library(corrplot)

## corrplot 0.95 loaded

# Install factoextra only when it is missing (avoids a reinstall on every run)
if (!requireNamespace("factoextra", quietly = TRUE)) {
  install.packages("factoextra")
}

## Installing package into 'C:/Users/Naman/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)

## package 'factoextra' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Naman\AppData\Local\Temp\RtmpE9DMKh\downloaded_packages

library(factoextra)

## Loading required package: ggplot2

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

library(ggplot2)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(cluster)

# Read the swim-beach samples from the CSV export into a data frame
# NOTE(review): this is an absolute, machine-specific path; the script only
# runs as-is where this file exists at this location
beach_data <- read.csv(file = "C:\\Users\\Naman\\Downloads\\FW Swim.csv")

# Check the structure of the dataset: prints each column's name and type
# together with its first few values
str(beach_data)

## 'data.frame':    3443 obs. of  11 variables:
##  $ Beach          : chr  "Angle Lake" "Angle Lake" "Angle Lake" "Angle Lake" ...
##  $ Jurisdiction   : chr  "SeaTac" "SeaTac" "SeaTac" "SeaTac" ...
##  $ Locator        : chr  "A732SB" "A732SB" "A732SB" "A732SB" ...
##  $ Date           : chr  "2024-07-15" "2024-07-22" "2024-07-29" "2024-08-05" ...
##  $ Day            : chr  "Mon" "Mon" "Mon" "Mon" ...
##  $ Time           : chr  "08:24" "08:16" "09:26" "08:33" ...
##  $ Geomean30d     : num  14.54 11.41 9.93 9.66 10.89 ...
##  $ nSamplesHigh30d: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ HighToday      : chr  "false" "false" "false" "false" ...
##  $ WaterTempC     : num  24.4 24.3 22.7 23.3 23.5 24.3 22 21.1 21.7 20 ...
##  $ WaterTempF     : num  75.9 75.7 72.9 73.9 74.3 ...

# Summary statistics for every column (quartiles, mean and NA counts for the
# numeric columns; length, class and mode for the character columns)
summary(beach_data)

##     Beach           Jurisdiction         Locator              Date          
##  Length:3443        Length:3443        Length:3443        Length:3443       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##      Day                Time             Geomean30d      nSamplesHigh30d 
##  Length:3443        Length:3443        Min.   :  1.000   Min.   : 0.000  
##  Class :character   Class :character   1st Qu.:  7.201   1st Qu.: 0.000  
##  Mode  :character   Mode  :character   Median : 17.209   Median : 0.000  
##                                        Mean   : 34.201   Mean   : 0.846  
##                                        3rd Qu.: 40.939   3rd Qu.: 1.000  
##                                        Max.   :610.551   Max.   :12.000  
##                                        NA's   :566       NA's   :566     
##   HighToday           WaterTempC      WaterTempF   
##  Length:3443        Min.   : 9.90   Min.   :49.82  
##  Class :character   1st Qu.:18.00   1st Qu.:64.40  
##  Mode  :character   Median :20.90   Median :69.62  
##                     Mean   :20.45   Mean   :68.81  
##                     3rd Qu.:23.00   3rd Qu.:73.40  
##                     Max.   :29.40   Max.   :84.92  
##                     NA's   :46      NA's   :46

# Check for missing values: is.na() flags each NA cell and colSums() totals
# the flags per column
colSums(is.na(beach_data))

##           Beach    Jurisdiction         Locator            Date             Day 
##               0               0               0               0               0 
##            Time      Geomean30d nSamplesHigh30d       HighToday      WaterTempC 
##               0             566             566               0              46 
##      WaterTempF 
##              46

# Drop every row that has a missing value in any column. na.omit() already
# removes the rows with a missing WaterTempC and leaves no column containing
# NA, so no additional row or column filtering is needed afterwards.
beach_data <- na.omit(beach_data)

# Histograms (30 bins each) of the three numeric variables used in the
# analysis: Geomean30d, nSamplesHigh30d and WaterTempC
ggplot(data = beach_data, mapping = aes(x = Geomean30d)) +
  geom_histogram(bins = 30, colour = "black", fill = "blue", alpha = 0.7) +
  labs(title = "Distribution of Geomean30d", x = "Geomean30d") +
  theme_minimal()

ggplot(data = beach_data, mapping = aes(x = nSamplesHigh30d)) +
  geom_histogram(bins = 30, colour = "black", fill = "green", alpha = 0.7) +
  labs(title = "Distribution of nSamplesHigh30d", x = "nSamplesHigh30d") +
  theme_minimal()

ggplot(data = beach_data, mapping = aes(x = WaterTempC)) +
  geom_histogram(bins = 30, colour = "black", fill = "orange", alpha = 0.7) +
  labs(title = "Distribution of WaterTempC", x = "WaterTempC (°C)") +
  theme_minimal()

# Pairwise correlations between the three numeric variables, drawn as a
# circle-style correlation plot
cor_matrix <- cor(
  beach_data[, c("Geomean30d", "nSamplesHigh30d", "WaterTempC")]
)
corrplot(corr = cor_matrix, method = "circle")

# K-means Clustering
# Scale the data: centre and scale the three clustering variables so that they
# contribute on a comparable scale
scaled_data <- scale(beach_data[, c("Geomean30d", "nSamplesHigh30d", "WaterTempC")])

# Determine the optimal number of clusters using the Elbow Method.
# kmeans() starts from random centres, so seed the RNG and use several random
# starts per k (nstart is forwarded to kmeans()); otherwise the WSS curve
# changes between runs and can be distorted by a poor local optimum.
set.seed(123)  # For reproducibility
fviz_nbclust(scaled_data, kmeans, method = "wss", nstart = 25) +
  labs(title = "Elbow Method for Optimal Number of Clusters")

# Fit k-means with 3 clusters (chosen for illustration) on the scaled data,
# using 25 random starts; the seed makes the result repeatable
set.seed(123)
kmeans_result <- kmeans(x = scaled_data, centers = 3, nstart = 25)

# Store each row's cluster assignment on the original data as a factor
beach_data[["Cluster"]] <- factor(kmeans_result$cluster)

# Scatter plot of Geomean30d against WaterTempC, coloured by assigned cluster
# (one fixed colour per cluster)
ggplot(
  data = beach_data,
  mapping = aes(x = Geomean30d, y = WaterTempC, colour = Cluster)
) +
  geom_point() +
  scale_colour_manual(values = c("red", "green", "blue")) +
  labs(title = "K-means Clustering of Beaches", x = "Geomean30d", y = "WaterTempC")

Clustering and EDA for Freshwater Swim Beach Data

Namana Herle G

2024-10-24