*Source: https://www.pathlightpro.com/your-guide-to-turbidity-testing-in-florida-know-true-water-quality/*

For this project, I will be working with the Water Quality dataset from the U.S. Fish and Wildlife Service. Water quality is an essential environmental factor because it influences every living thing, which is why I decided to work with this dataset. For my topic, I wanted to determine how individual factors such as Water Depth (m), Salinity (ppt), and Dissolved Oxygen (mg/L) affect Water Temperature (C). To give my graphs a more in-depth look, I used categorical variables such as Field_Tech and Site_Id. Before the analysis, I need to clean my dataset: I will select all the variables I am going to explore and then filter out all missing values. Also, since Field_Tech is not consistent in the capitalization of names, I decided to convert all the names to lower case to address that issue.

#Reads the water quality CSV into WaterData; readr is called through its namespace
#NOTE(review): the path is absolute and machine-specific - confirm before running elsewhere
WaterData<-readr::read_csv(
  file="C:/Users/panca/Downloads/BKB_WaterQualityData_2020084.csv"
)
## Rows: 2371 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (7): Site_Id, Unit_Id, Read_Date, Time (24:00), Field_Tech, DateVerifie...
## dbl (10): Salinity (ppt), Dissolved Oxygen (mg/L), pH (standard units), Secc...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#Loading necessary libraries
library(ggplot2)
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.5.3
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ lubridate 1.9.5     ✔ tibble    3.3.1
## ✔ purrr     1.2.1     ✔ tidyr     1.3.2
## ✔ readr     2.1.6     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
#Looking at the dataset to determine what variables to use
#View() opens a data viewer window, so it is only called when R is running interactively
#(it is skipped when the document is knitted or the script is run in batch mode)
if (interactive()) {
  View(WaterData)
}
#Lists every column name so the exact (back-tick quoted) names can be copied into the analysis
colnames(WaterData)
##  [1] "Site_Id"                 "Unit_Id"                
##  [3] "Read_Date"               "Salinity (ppt)"         
##  [5] "Dissolved Oxygen (mg/L)" "pH (standard units)"    
##  [7] "Secchi Depth (m)"        "Water Depth (m)"        
##  [9] "Water Temp (?C)"         "Air Temp-Celsius"       
## [11] "Air Temp (?F)"           "Time (24:00)"           
## [13] "Field_Tech"              "DateVerified"           
## [15] "WhoVerified"             "AirTemp (C)"            
## [17] "Year"
#Cleaning the dataset
WaterData_clean <- WaterData |>
  #Selecting the variables used for analysis
  select(
    `Salinity (ppt)`,
    `Dissolved Oxygen (mg/L)`,
    `Water Depth (m)`,
    `Water Temp (?C)`,
    Site_Id,
    Field_Tech
  ) |>
  #Removing rows with a missing value in any of the selected variables
  drop_na() |>
  #Standardizing names BEFORE filtering, so the placeholder below is matched
  #whatever capitalization it was entered with
  mutate(Field_Tech = tolower(Field_Tech)) |>
  #Determined that "Not Recorded" was the same as NA
  filter(Field_Tech != "not recorded")
#View final result to make sure everything worked correctly
#(only in an interactive session; View() is skipped when knitting)
if (interactive()) {
  View(WaterData_clean)
}
#First scatterplot: water temperature against water depth, points colored by Site_Id
G1<-ggplot(
  data=WaterData_clean,
  mapping=aes(x=`Water Depth (m)`,y=`Water Temp (?C)`,color=Site_Id)
)+
  #Half-transparent points
  geom_point(alpha=0.5)+
  #Linear regression line in black dot-dash, without the confidence band
  geom_smooth(method="lm",se=FALSE,linetype="dotdash",color="black")+
  #Minimal theme for a cleaner look
  theme_minimal()+
  #ColorBrewer "Set2" palette for the site colors
  scale_color_brewer(palette="Set2")+
  #Title, axis labels, and data source caption
  labs(
    title="Relationship between Water Depth(m) and Water Temperature(C)",
    x="Water Depth(m)",
    y="Water Temperature(C)",
    caption="Source: U.S. Fish and Wildlife Service - Water Quality Data"
  )
#Converts the ggplot object into an interactive plotly chart
ggplotly(G1)
## `geom_smooth()` using formula = 'y ~ x'
#Second scatterplot: water temperature against salinity, points colored by Field_Tech
G2<-ggplot(
  data=WaterData_clean,
  mapping=aes(x=`Salinity (ppt)`,y=`Water Temp (?C)`,color=Field_Tech)
)+
  #Fully opaque points
  geom_point(alpha=1)+
  #Linear regression line in white dot-dash, without the confidence band
  geom_smooth(method="lm",se=FALSE,linetype="dotdash",color="white")+
  #Dark theme for a different look
  theme_dark()+
  #ColorBrewer "Set3" palette for the technician colors
  #NOTE(review): confirm the number of distinct Field_Tech values fits within this palette
  scale_color_brewer(palette="Set3")+
  #Title, axis labels, and data source caption
  labs(
    title="Amount of Salinity(ppt) on Water Temperature(C)",
    x="Salinity(ppt)",
    y="Water Temperature(C)",
    caption="Source: U.S. Fish and Wildlife Service - Water Quality Data"
  )
#Converts the ggplot object into an interactive plotly chart
ggplotly(G2)
## `geom_smooth()` using formula = 'y ~ x'
#Third scatterplot: water temperature against dissolved oxygen, points colored by Site_Id
G3<-ggplot(
  data=WaterData_clean,
  mapping=aes(x=`Dissolved Oxygen (mg/L)`,y=`Water Temp (?C)`,color=Site_Id)
)+
  #Half-transparent points
  geom_point(alpha=0.5)+
  #Linear regression line in black dot-dash, without the confidence band
  geom_smooth(method="lm",se=FALSE,linetype="dotdash",color="black")+
  #Classic theme for a different look
  theme_classic()+
  #Hand-picked hex color for each Site_Id value
  scale_color_manual(
    values=c(
      "A"="#fcba03",
      "B"="#1ac952",
      "Bay"="#94d1f2",
      "C"="#c439af",
      "D"="#db2a38"
    )
  )+
  #Title, axis labels, and data source caption
  labs(
    title="Dissolved Oxygen(mg/L) on Water Temperature(C)",
    x="Dissolved Oxygen(mg/L)",
    y="Water Temperature(C)",
    caption="Source: U.S. Fish and Wildlife Service - Water Quality Data"
  )
#Converts the ggplot object into an interactive plotly chart
ggplotly(G3)
## `geom_smooth()` using formula = 'y ~ x'
#Creates a bubble chart that shows all three predictors at once:
#water depth on the x-axis, dissolved oxygen as point color, salinity as point size
#The plot is stored in G4 and handed to ggplotly() explicitly, like the other plots,
#instead of relying on ggplotly()'s last-plot default
G4 <- ggplot(
  WaterData_clean,
  aes(
    x = `Water Depth (m)`,
    y = `Water Temp (?C)`,
    color = `Dissolved Oxygen (mg/L)`,
    size = `Salinity (ppt)`
  )
) +
  geom_point(alpha = 0.5) +
  #Adds labels for title, x-axis, y-axis, and the caption
  labs(
    title = "Water Quality Variables: Water Depth, Salinity, and Dissolved Oxygen",
    x = "Water Depth (m)",
    y = "Water Temperature (C)",
    caption = "Source: U.S. Fish and Wildlife Service - Water Quality Data"
  ) +
  #Minimal theme for a cleaner look
  theme_minimal()
#Makes the bubble chart interactive
ggplotly(G4)
#Fits a multiple linear regression with water temperature as the response and
#water depth, salinity, and dissolved oxygen as the predictors
model <- lm(
  formula = `Water Temp (?C)` ~ `Water Depth (m)` + `Salinity (ppt)` + `Dissolved Oxygen (mg/L)`,
  data = WaterData_clean
)
#Prints the fit summary: coefficient estimates, p-values, and R-squared
summary(model)
## 
## Call:
## lm(formula = `Water Temp (?C)` ~ `Water Depth (m)` + `Salinity (ppt)` + 
##     `Dissolved Oxygen (mg/L)`, data = WaterData_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -22.670  -4.058   0.266   4.122  54.358 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               29.31982    0.76053  38.552  < 2e-16 ***
## `Water Depth (m)`          1.28761    0.38212   3.370 0.000788 ***
## `Salinity (ppt)`           1.33062    0.15493   8.588  < 2e-16 ***
## `Dissolved Oxygen (mg/L)` -1.96130    0.09613 -20.402  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.472 on 805 degrees of freedom
## Multiple R-squared:  0.3504, Adjusted R-squared:  0.348 
## F-statistic: 144.8 on 3 and 805 DF,  p-value: < 2.2e-16