# Setup ----
# Use a fixed CRAN mirror so any installation below is non-interactive.
options(repos = c(CRAN = "https://cran.rstudio.com/"))

# Install a package only when it is not already available. Reinstalling on
# every run is slow and, on Windows, fails with "cannot remove prior
# installation" / "Permission denied" when the package's DLL is already
# loaded in the current session.
required_packages <- c("dplyr", "ggplot2", "caret", "randomForest")
is_installed <- vapply(
  required_packages,
  requireNamespace,
  logical(1),
  quietly = TRUE
)
if (any(!is_installed)) {
  install.packages(required_packages[!is_installed])
}

# Load required libraries (attach order is kept, so masking is unchanged:
# randomForest masks ggplot2::margin and dplyr::combine).
library(dplyr)
library(ggplot2)
library(caret)         # For machine learning (also loads lattice)
library(randomForest)  # Random forest classifier
# Read the beach-sampling CSV into the data frame `data`
data <- utils::read.csv(file = "C:\\Users\\Naman\\Downloads\\FW Swim.csv")

# Inspect the column names, types and first few values
utils::str(object = data)
## 'data.frame':    3443 obs. of  11 variables:
##  $ Beach          : chr  "Angle Lake" "Angle Lake" "Angle Lake" "Angle Lake" ...
##  $ Jurisdiction   : chr  "SeaTac" "SeaTac" "SeaTac" "SeaTac" ...
##  $ Locator        : chr  "A732SB" "A732SB" "A732SB" "A732SB" ...
##  $ Date           : chr  "2024-07-15" "2024-07-22" "2024-07-29" "2024-08-05" ...
##  $ Day            : chr  "Mon" "Mon" "Mon" "Mon" ...
##  $ Time           : chr  "08:24" "08:16" "09:26" "08:33" ...
##  $ Geomean30d     : num  14.54 11.41 9.93 9.66 10.89 ...
##  $ nSamplesHigh30d: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ HighToday      : chr  "false" "false" "false" "false" ...
##  $ WaterTempC     : num  24.4 24.3 22.7 23.3 23.5 24.3 22 21.1 21.7 20 ...
##  $ WaterTempF     : num  75.9 75.7 72.9 73.9 74.3 ...

## Data Preprocessing
# We need to:
#   1. Handle missing values (if any).
#   2. Convert HighToday into a binary target variable.
#   3. Convert categorical variables (like Beach and Jurisdiction) into factors.

# Convert HighToday into a binary target (1 = Unsafe, 0 = Safe).
#
# HighToday is read as text ("true"/"false"). The previous test
# `data$HighToday > 0` therefore compared strings: R coerces 0 to "0", and
# both "true" and "false" sort after "0", so both were coded 1. Only labels
# sorting at or before "0" were coded 0, which left the target nearly
# constant. Map the labels explicitly instead.
# as.character() also makes this step safe to re-run on an already-encoded
# 0/1 or logical column.
raw_label <- tolower(trimws(as.character(data$HighToday)))
high_today <- rep(NA_real_, length(raw_label))
high_today[raw_label %in% c("true", "t", "1")] <- 1
high_today[raw_label %in% c("false", "f", "0")] <- 0

# Rows whose label is neither true nor false cannot be used to train or
# score a classifier; report them and drop them rather than guessing a class.
# NOTE(review): confirm that unrecognised labels really mean "no result".
unlabelled <- is.na(high_today)
if (any(unlabelled)) {
  message(
    "Dropping ", sum(unlabelled),
    " row(s) with unrecognised HighToday value(s): ",
    paste(unique(data$HighToday[unlabelled]), collapse = ", ")
  )
}
data$HighToday <- high_today
data <- data[!unlabelled, ]

# Convert Beach and Jurisdiction to factors (done after the row filter so no
# empty factor levels are left behind)
data$Beach <- as.factor(data$Beach)
data$Jurisdiction <- as.factor(data$Jurisdiction)

# Check for missing values: overall count, then per column
sum(is.na(data))
colSums(is.na(data))
# If there are missing values, fill them (you can use imputation if necessary)
# data <- na.omit(data)  # Or you can use other imputation techniques

# View a summary of the processed data
summary(data)

# Split the Dataset into Train and Test Sets

# caret is installed and attached at the top of the script. Calling
# install.packages("caret") here only produced the warning "package 'caret'
# is in use and will not be installed", so it is not repeated.
library(caret)

# Set seed for reproducibility
set.seed(123)

# Create a 70/30 partition. The outcome is passed as a factor so that
# createDataPartition() samples within each class (stratified split); with a
# numeric outcome it groups the values by percentiles instead, which is
# meant for regression targets.
trainIndex <- createDataPartition(
  factor(data$HighToday),
  p = 0.7,
  list = FALSE
)

# Create training and test datasets
train_data <- data[trainIndex, ]
test_data <- data[-trainIndex, ]

# Check the dimensions (rows, columns) of both datasets
dim(train_data)
dim(test_data)

# Check that both classes are present in each split
table(train_data$HighToday, useNA = "ifany")
table(test_data$HighToday, useNA = "ifany")

# Build the Logistic Regression Model

# Train a logistic regression model for HighToday.
#
# Jurisdiction is intentionally left out of the formula: when it was included
# together with Beach, summary() reported "11 not defined because of
# singularities" and the Jurisdiction coefficients were NA, i.e. the
# Jurisdiction dummies are linear combinations of the Beach dummies and add
# no information.
#
# Rows with missing predictor values are dropped by glm() under the default
# na.action. If glm() warns "algorithm did not converge" or "fitted
# probabilities numerically 0 or 1 occurred", check
# table(train_data$HighToday): an almost constant response leads to
# (quasi-)separation and meaningless coefficients.
log_model <- glm(
  HighToday ~ WaterTempC + Geomean30d + nSamplesHigh30d + Beach,
  data = train_data,
  family = binomial
)

# View the model summary
summary(log_model)

# Model Evaluation

# Load necessary libraries
library(caret)

# Predicted probability of HighToday == 1 for each test row. Rows with
# missing predictor values get an NA prediction.
log_predictions <- predict(log_model, newdata = test_data, type = "response")

# Convert probabilities into binary outcomes using a 0.5 cut-off
log_pred_binary <- ifelse(log_predictions > 0.5, 1, 0)

# Confusion matrix and accuracy.
# Both factors are built with the same explicit levels, so a class that is
# never predicted still appears in the table and caret does not have to
# re-level the inputs ("Levels are not in the same order" warning).
# `positive = "1"` makes sensitivity/specificity refer to the unsafe class
# rather than to the first level (0).
class_levels <- c(0, 1)
conf_matrix <- confusionMatrix(
  data = factor(log_pred_binary, levels = class_levels),
  reference = factor(test_data$HighToday, levels = class_levels),
  positive = "1"
)

# View confusion matrix
print(conf_matrix)