# Package setup ----
# Pin a CRAN mirror so install.packages() also works in non-interactive runs
# (e.g. when knitting), where no mirror can be chosen interactively.
options(repos = c(CRAN = "https://cran.rstudio.com/"))

# Packages required by this analysis:
#   dplyr        - data manipulation
#   ggplot2      - plotting
#   caret        - machine learning helpers (data splitting, confusion matrix)
#   randomForest - random forest classifier
required_packages <- c("dplyr", "ggplot2", "caret", "randomForest")

# Install only the packages that are not available yet. Re-installing on every
# run is slow and, on Windows, fails for packages that are already loaded: the
# original run reported "cannot remove prior installation of package 'dplyr'"
# and "Permission denied" while copying dplyr.dll / caret.dll.
is_available <- vapply(
  required_packages,
  requireNamespace,
  logical(1),
  quietly = TRUE
)
missing_packages <- required_packages[!is_available]
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}

# Attach in the original order so name masking is unchanged:
#   - dplyr masks stats::filter(), stats::lag() and base::intersect(),
#     setdiff(), setequal(), union().
#   - randomForest (attached last) masks ggplot2::margin() and dplyr::combine().
# Attaching caret also loads lattice.
library(dplyr)
library(ggplot2)
library(caret)
library(randomForest)
# Load the dataset ----
# The CSV location can be overridden by setting the FW_SWIM_CSV environment
# variable; when it is unset or empty, the original hard-coded location is used.
data_path <- Sys.getenv("FW_SWIM_CSV", unset = "")
if (!nzchar(data_path)) {
  data_path <- "C:\\Users\\Naman\\Downloads\\FW Swim.csv"
}

# Fail early with a readable message instead of read.csv()'s connection error.
if (!file.exists(data_path)) {
  stop("Dataset not found: ", data_path, call. = FALSE)
}

data <- read.csv(data_path)

# View the structure of the dataset
str(data)
## 'data.frame': 3443 obs. of 11 variables:
## $ Beach : chr "Angle Lake" "Angle Lake" "Angle Lake" "Angle Lake" ...
## $ Jurisdiction : chr "SeaTac" "SeaTac" "SeaTac" "SeaTac" ...
## $ Locator : chr "A732SB" "A732SB" "A732SB" "A732SB" ...
## $ Date : chr "2024-07-15" "2024-07-22" "2024-07-29" "2024-08-05" ...
## $ Day : chr "Mon" "Mon" "Mon" "Mon" ...
## $ Time : chr "08:24" "08:16" "09:26" "08:33" ...
## $ Geomean30d : num 14.54 11.41 9.93 9.66 10.89 ...
## $ nSamplesHigh30d: int 0 0 0 0 0 0 0 0 0 0 ...
## $ HighToday : chr "false" "false" "false" "false" ...
## $ WaterTempC : num 24.4 24.3 22.7 23.3 23.5 24.3 22 21.1 21.7 20 ...
## $ WaterTempF : num 75.9 75.7 72.9 73.9 74.3 ...
# Data Preprocessing ----
# We need to:
#   1. Convert HighToday into a binary target variable.
#   2. Handle missing values (if any).
#   3. Convert categorical variables (like Beach and Jurisdiction) into factors.

# Convert HighToday into a binary target (1 = Unsafe, 0 = Safe).
#
# read.csv() loads HighToday as character ("false", ...). The previous
# `data$HighToday > 0` therefore compared *strings* against "0": every value
# that sorts after "0" -- including "false" -- was coded as 1 (Unsafe), which
# is why the captured summary reported a HighToday mean of 0.8507. Match the
# text explicitly instead. "1"/"0" are accepted as well so that re-running this
# block on already-converted data gives the same result.
high_today_text <- tolower(trimws(as.character(data$HighToday)))

# Show the raw values so unexpected codes are visible.
# NOTE(review): values other than true/false (presumably blank cells) are
# treated as "unknown" below -- confirm against this table.
table(high_today_text, useNA = "ifany")

data$HighToday <- ifelse(
  high_today_text %in% c("true", "1"), 1,
  ifelse(high_today_text %in% c("false", "0"), 0, NA_real_)
)

# Rows with an unknown target cannot be used to train or evaluate a classifier.
unknown_target <- is.na(data$HighToday)
message(
  sum(unknown_target),
  " row(s) dropped because HighToday is neither true nor false"
)
data <- data[!unknown_target, ]

# Convert Beach and Jurisdiction to factors (done after dropping rows so that
# no empty factor levels are created).
data$Beach <- as.factor(data$Beach)
data$Jurisdiction <- as.factor(data$Jurisdiction)

# Check for missing values in the remaining columns.
# (Before the target fix this reported 1224: 566 each in Geomean30d and
# nSamplesHigh30d, 46 each in WaterTempC and WaterTempF.)
sum(is.na(data))

# Rows with missing predictors are left in place here.
# If you prefer to remove them up front (or impute), do it here, e.g.:
# data <- na.omit(data) # Or you can use other imputation techniques

# View a summary of the processed data
# (The previously captured summary output was removed because it reflected the
# mis-coded target; re-run to regenerate it.)
summary(data)
# Split the Dataset into Train and Test Sets ----
# caret is installed (if needed) during package setup at the top of the script;
# re-running install.packages("caret") here only produced
# "package 'caret' is in use and will not be installed", so it was removed.
library(caret)

# Set seed for reproducibility of the random partition
set.seed(123)

# Create a 70/30 partition of 'data'; list = FALSE returns a matrix of row
# indices for the training set.
# NOTE(review): HighToday is numeric (0/1) at this point, so
# createDataPartition() stratifies on percentile groups rather than on the two
# classes. Consider passing factor(data$HighToday) for class-stratified
# sampling -- this would change which rows are selected.
trainIndex <- createDataPartition(data$HighToday, p = 0.7,
                                  list = FALSE)

# Create training and test datasets
train_data <- data[trainIndex, ]
test_data <- data[-trainIndex, ]

# Check the dimensions of the datasets
dim(train_data) # Should show number of rows and columns in train_data
## [1] 2411 11
dim(test_data) # Should show number of rows and columns in test_data
## [1] 1032 11
# Build the Logistic Regression Model ----
# Train a logistic regression model for the binary target HighToday.
#
# Jurisdiction is intentionally NOT included. With both Beach and Jurisdiction
# in the formula, the original fit reported "Coefficients: (11 not defined
# because of singularities)" and all but one Jurisdiction coefficient was NA,
# i.e. Jurisdiction is (almost) entirely determined by Beach.
# NOTE(review): presumably each beach lies in exactly one jurisdiction --
# confirm with table(data$Beach, data$Jurisdiction).
#
# Rows with missing predictor values are dropped by glm() when fitting (the
# original run deleted 387 observations due to missingness).
log_model <- glm(
  HighToday ~ WaterTempC + Geomean30d + nSamplesHigh30d + Beach,
  data = train_data,
  family = binomial
)

# View the model summary.
# The original run also warned "glm.fit: algorithm did not converge" and
# "fitted probabilities numerically 0 or 1 occurred", with estimates around
# 1e14. That pattern indicates (quasi-)complete separation, so those
# coefficients were not interpretable. If the warnings persist, check the class
# balance of train_data$HighToday before trusting the summary.
# (The previously captured coefficient table was removed because it belongs to
# the old formula; re-run to regenerate it.)
summary(log_model)
# Model Evaluation ----
# Load necessary libraries
library(caret)

# Predicted probability of HighToday == 1 (Unsafe) for each test row.
# Test rows with missing predictors get an NA prediction and are not counted in
# the confusion matrix (the original run tabulated 834 of the 1032 test rows).
log_predictions <- predict(log_model, newdata = test_data, type = "response")

# Convert probabilities into binary outcomes using a 0.5 cut-off
log_pred_binary <- ifelse(log_predictions > 0.5, 1, 0)

# Give predictions and reference the SAME explicit levels. Deriving the levels
# separately with as.factor() yields different level sets whenever only one
# class is predicted, which triggered the warning "Levels are not in the same
# order for reference and data. Refactoring data to match."
class_levels <- c(0, 1)
predicted_class <- factor(log_pred_binary, levels = class_levels)
actual_class <- factor(test_data$HighToday, levels = class_levels)

# Confusion matrix and accuracy.
# positive = "1" makes sensitivity / precision describe the Unsafe class;
# by default caret uses the first level ("0", Safe) as the positive class.
conf_matrix <- confusionMatrix(
  data = predicted_class,
  reference = actual_class,
  positive = "1"
)

# View confusion matrix
# (The previously captured output -- accuracy 1 with every tabulated row in
# the Prediction 1 / Reference 1 cell and 'Positive' Class : 0 -- was removed;
# re-run to regenerate it.)
print(conf_matrix)