1 R packages used

library(dataRetrieval)
library(lubridate)
library(tidyverse)
library(plotly)
library(devtools)
library(stringr)

2 Data Prep and Cleaning

2.1 Site Information for years 2016-2017

2.1.1 import site information from NWIS (years 2016-2017 only)

2.2 Dissolved Oxygen Time Series Table (2016 and 2017)

2.3 2015 data

### Write dataframes to files

2.4 Read in/Modify tables

2.4.1 Calculate Day night differnce as ordered pairs

library(RColorBrewer)
head(DO.data)
weather <- read.csv('X:/transfer/RobW/TNC_DO/44069h2017 (1).txt/wind_2017.txt', header=F, sep=' ', stringsAsFactors = FALSE) %>% select('year'=V1, 
         'month'=V2,
         'day'=V3,
         'hour'=V4,
         'min'=V5,
         'wind_speed'=V8,
         'wind_gust'=V10,
         'air_temp'=V17,
         'water_temp'=V20) %>% 
  slice(3:n())

# get all into numeric data types
i <- c(1:9)
weather[ , i] <- apply(weather[ , i], 2,            # Specify own function within apply
                    function(x) as.numeric(as.character(x)))

weather$month <- str_pad(weather$month, 2, pad = "0")
weather$day <- str_pad(weather$day,2,pad="0")

weather <- weather %>% unite(monthday, month,day,sep='-', remove=FALSE)
weather$monthday <- as.Date(weather$monthday, format='%m-%d')

# compute daily weather values for 2017 - months June-October (hours midnight to 7am)
weather <- 
  weather %>% 
  filter(month==c('06','07','08','09','10'), 
         hour %in% c(16:24)) %>% group_by(monthday) %>% 
  summarize('mean_wind_speed'=mean(wind_speed, na.rm=T),
            'mean_wind_gust'=mean(wind_gust,na.rm=T),
            'mean_air_temp'=mean(air_temp,na.rm=T),
            'mean_water_temp'=mean(water_temp,na.rm=T))




# get max day temp
day.2017 <- 
    DO.2017.all %>%  
      filter(year==2017, TOD=='day') %>% 
      group_by(SiteNum, monthday) %>% 
      summarize('day_max_DO'=max(DO_mgL))

day.2017$monthday <- as.Date(day.2017$monthday, format='%m-%d')



day.2017$seq_1 <- lead(ifelse(difftime(day.2017$monthday, lag(day.2017$monthday), units="days")==1, TRUE, FALSE))


# get DO min night daily values from the next day 
night.2017 <-
  DO.2017.all %>%  
    filter(year==2017, TOD=='night') %>%
    group_by(SiteNum, monthday) %>%    
    summarize('night_min_DO'=min(DO_mgL)) %>% 
    mutate(next_night_minDO= lead(night_min_DO))

night.2017$monthday <- as.Date(night.2017$monthday, format='%m-%d')
night.2017$seq_2 <- lead(ifelse(difftime(night.2017$monthday, lag(night.2017$monthday), units="days")==1, TRUE, FALSE))
#subtract 1 from night.2017$monthday for join purposes



night.2017$monthday <- night.2017$monthday - 1



water_temp <- DO.data %>%filter(year==2017) %>%  select(SiteNum, water.temp, monthday) 
water_temp$monthday <- as.Date(water_temp$monthday, format='%m-%d')
water_temp$SiteNum <- as.numeric(water_temp$SiteNum)
water_temp <- water_temp %>% 
  group_by(SiteNum,monthday) %>% summarize(mean_sensor_water_temp=mean(water.temp,na.rm=T)) 


dnw <- day.2017 %>% 
  left_join(night.2017, by=c("SiteNum"="SiteNum", "monthday"="monthday")) %>% 
  left_join(weather, by="monthday") %>%
  left_join(site.info, by="SiteNum") %>%
  left_join(water_temp, by=c("SiteNum"="SiteNum", "monthday"="monthday")) %>% 
  filter(seq_1==TRUE, seq_2==TRUE) 



# make a plot
dnw %>%
  #filter(StationName=="Great South Bay 15 ") %>% 
  filter(complete.cases(mean_sensor_water_temp)) %>% 
  #select(SiteNum,StationName, monthday,night_min_DO, day_max_DO, mean_wind_speed) %>% 
  ggplot(aes(y=next_night_minDO,x=day_max_DO, col=mean_sensor_water_temp)) + 
    geom_point(size=2) +
    geom_abline(slope=1,intercept=0) + 
    #facet_wrap(~StationName, ncol=4) +
    scale_color_gradientn(colours = rainbow(6)) + 
    labs(title='ALL')

# try some mulitple linear regression - 

### fit1 - night as a function of day DO and Station (Station 11,12,14,15 and 7 are the strongest)
fit1 <- lm(next_night_minDO ~ day_max_DO + StationName, data=dnw)
summary(fit1)
## 
## Call:
## lm(formula = next_night_minDO ~ day_max_DO + StationName, data = dnw)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.1432 -1.0083  0.3413  1.2920  4.2443 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     1.33580    0.33706   3.963 7.81e-05 ***
## day_max_DO                      0.38471    0.03403  11.306  < 2e-16 ***
## StationNameGreat South Bay 11   0.76309    0.26093   2.925 0.003510 ** 
## StationNameGreat South Bay 12   0.87654    0.28262   3.102 0.001967 ** 
## StationNameGreat South Bay 13   0.04183    0.25909   0.161 0.871756    
## StationNameGreat South Bay 14   0.85906    0.25076   3.426 0.000632 ***
## StationNameGreat South Bay 15   1.22960    0.25613   4.801 1.77e-06 ***
## StationNameGreat South Bay 4   -0.15437    0.25885  -0.596 0.551029    
## StationNameGreat South Bay 5   -0.66548    0.25731  -2.586 0.009811 ** 
## StationNameGreat South Bay 6    0.16608    0.25214   0.659 0.510200    
## StationNameGreat South Bay 7    0.46955    0.25151   1.867 0.062135 .  
## StationNameGreat South Bay 8    0.26851    0.26984   0.995 0.319895    
## StationNameGreat South Bay 9    0.29135    0.25331   1.150 0.250301    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.849 on 1277 degrees of freedom
## Multiple R-squared:  0.1708, Adjusted R-squared:  0.1631 
## F-statistic: 21.93 on 12 and 1277 DF,  p-value: < 2.2e-16
plot(fit1)

fit2 <- lm(next_night_minDO ~  mean_wind_speed, data=dnw)
summary(fit2)  
## 
## Call:
## lm(formula = next_night_minDO ~ mean_wind_speed, data = dnw)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.0034 -1.1587  0.4467  1.4111  5.3824 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      4.25517    0.14089  30.201   <2e-16 ***
## mean_wind_speed  0.06166    0.02522   2.445   0.0146 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.026 on 1191 degrees of freedom
##   (97 observations deleted due to missingness)
## Multiple R-squared:  0.004995,   Adjusted R-squared:  0.00416 
## F-statistic: 5.979 on 1 and 1191 DF,  p-value: 0.01462
plot(fit2)

2.4.2 Tide Stage Classification - Method 2 (works but only have for 2017 - processing steps need commenting)

3 Data Analysis and Figures

3.1 FIGURE1- Description of Study Area

Figure 1 - Locations of continuous DO sensors in Great South Bay. Map processed in ArcGIS v. 10.7.1.

Figure 1 - Locations of continuous DO sensors in Great South Bay. Map processed in ArcGIS v. 10.7.1.

Notes: Need Bluepoints polygon

3.2 FIGURE2- AUV Survey

AUV survey of GSB performed August 23-26 2016. Products (Sample points and interpolated DO surface) released on ScienceBase at https://www.sciencebase.gov/catalog/item/59284998e4b0c6c925587b35.

The interpolated surface below shows the the spatial distribution of DO in GSB under the conditions associated with 8/23/16 between 1200 and 0500.
Figure 3a - Interpolated DO surface of August 23, 2016 AUV survey

The interpolated surface below is a heat map from the continuous sensors. Even with a dense sampling setup, interpolation misses key information.
Figure 2b - Interpolated DO surface from the continuous sensors during the time period of the AUV survey

The map below shows areas where AUV and continuous sensor interpolations vary. Created via ArcMap ‘Raster Calculator’ tool. Figure 3c - Difference between AUV and HOBO sensor spatial distributions

Notes: Will be 2 figures - describe conditions of AUV survey ie time of day, tidal stage, weather etc

3.3 Figure3- Distributions of DO daily minimum daily values - using boxplots

3.3.1 download daily values from NWIS

3.3.2 FIGURE 3 - for report - Boxplot of daily minimums by site

3.4 Figure4- DEC DO standards

3.4.1 Build the theoretical table based on governing equation (EPA via NYSDEC)

### Create DO intervals in increments of 0.1

3.4.2 Find chronic events - for 2016/2017 - ie same POR record for each year

# get daily means from NWIS 
daily_means.1617 <- readNWISdv(site.numbers,"00300", "2016-06-01", "2017-11-03", statCd='00003') %>% select(Date,SiteNum=site_no, daily_mean="X_00300_00003" )
daily_means.1617 <- daily_means.1617 %>% filter(daily_mean > 3.0 & daily_mean < 4.9)
DO.data$date <- as.Date(DO.data$date)
DO.data$chronic <- FALSE

for(i in 1:nrow(daily_means.1617)){
  
  # get subset of data from the site that is within a 66 day range 
  a <- daily_means.1617 %>% filter(Date >= daily_means.1617$Date[i] & Date <= daily_means.1617$Date[i] + days(66), SiteNum==SiteNum[i])
                      
  # store variables for start of 66 day window, end of window and site number
  start.event <- min(a$Date) 
  end.event <- max(a$Date)
  site <- a$SiteNum

  # calculate the statistic - cumulative fraction
  b <- a %>%   count(daily_mean) %>% 
    left_join(allowed.chronic, by='daily_mean') %>% 
    mutate(cum_fraction=n/time_allowed) %>% 
    summarize(cum_fraction=sum(cum_fraction)) 
    
  # if cumulative fraction is greater than 1, then the entire 66 day window is labeled as chronic
  if(b > 1){
    DO.data$chronic[DO.data$date >= start.event & DO.data$date <=end.event & DO.data$SiteNum==site] <- TRUE
  
    }
}
sum(DO.data$chronic)
## [1] 102567

3.4.3 Find chronic events - for all of 2017

# get daily mean DO for each site in 2017
daily_means.17 <- readNWISdv(site.numbers,"00300", "2017-06-01", "2017-12-03", statCd='00003') %>% select(Date,SiteNum=site_no, daily_mean="X_00300_00003")

# filter to get means between 3.0 and 4.9 only 
daily_means.17 <- daily_means.17 %>% filter(daily_mean > 3.0 & daily_mean < 4.9)

# get date into date format
DO.2017.all$date <- as.Date(DO.2017.all$date)

# set chronic variable to NA 
DO.2017.all$chronic <- FALSE

# run the for loop. 
# Each iteration step is a daily value
#1. retrieve all data from that day within a 66 day window
#2. store the beginning date and end date of the window, and the site number
#3. tally the number of each daily min value (interval frequency)
#4. join in the DEC allowable table
#5. calculate the cumulative fraction
#6. sum the cumulative fraction
#7. if the sum is greater than 1, all dates between the start and end dates(STEP#2) are chronic, otherwise they remain as NA
for(i in 1:nrow(daily_means.17)){
  
  # get subset of data from the site that is within a 66 day range 
  a <- daily_means.17 %>% filter(Date >= daily_means.17$Date[i], Date <= daily_means.17$Date[i] + days(66), SiteNum==SiteNum[i])
                      
  # store variables for start of 66 day window, end of window and site number
  start.event <- min(a$Date) 
  end.event <- max(a$Date)
  site <- a$SiteNum

  # calculate the statistic - cumulative fraction
  b <- a %>%   count(daily_mean) %>% 
    left_join(allowed.chronic, by='daily_mean') %>% 
    mutate(cum_fraction=n/time_allowed) %>% 
    summarize(cum_fraction=sum(cum_fraction)) 
    
  # if cumulative fraction is greater than 1, then the entire 66 day window is labeled as chronic
  if(b > 1){
    DO.2017.all$chronic[DO.2017.all$date >= start.event & DO.2017.all$date <=end.event & DO.2017.all$SiteNum==site] <- TRUE
  
    }
}
sum(DO.2017.all$chronic)
## [1] 85333
### acute daily
# get daily mean DO for each site in 2017
daily_means.17 <- readNWISdv(site.numbers,"00300", "2017-06-01", "2017-12-03", statCd='00003') %>% select(Date,SiteNum=site_no, daily_mean="X_00300_00003")

# filter to get means between 3.0 and 4.9 only 
daily_means.17.lt3 <- daily_means.17 %>% filter( daily_mean < 3.0)

DO.2017.all$dailyacute <- FALSE
# get date into date format
DO.2017.all$date <- as.Date(DO.2017.all$date)

for(i in 1:nrow(daily_means.17.lt3)){
    c <- daily_means.17.lt3 %>% filter(Date == daily_means.17.lt3$Date[i], SiteNum==SiteNum[i])
                      
  # store variables for start of 66 day window, end of window and site number
  start.event <- c$Date 
  site <- c$SiteNum
  
DO.2017.all$dailyacute[DO.2017.all$date >= start.event  & DO.2017.all$SiteNum==site] <- TRUE
 
  }

3.4.4 Acute violations

NYSDEC Crietria: The DO concentration shall not fall below the acute standard of 3.0 mg/L at any time. Exceedence proportion is defined as the number of observations that are above 3.0 divided by the total number of observations. ### Summary of acute violations

3.4.5 Figure 2 - Time series of chronic and acute violations

Figure 4 - Spatial distribution of acute violations

Figure 4 - Spatial distribution of acute violations

3.5 FIGURE5 - Water temperature curves

3.5.1 get daily max WT from NWIS and join to daily mins

3.5.2 WT curve - all 2016

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

### WT curve - all 2017

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

### USGS daily min from NWIS

daily_mins %>%
      filter(year %in% c(2016,2017)) %>% 
      group_by(monthday, year) %>%
      ggplot(aes(x=max_temp,y=daily_min)) + 
          geom_point(position='jitter') + 
          geom_smooth() +
          xlab('Temp daily max [C]') +
          ylab('DO daily min [mg/L') + 
          facet_grid(~watershed)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

###daily min as a function of max Temp faceting by site (one for each year)

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

3.5.3 daily min as a function of max Temp by TOD faceting by site (not a daily value)

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

### Water Temp Curves (2016 and 2017) not daily value

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

### Water Temperature Curves (2017 only)

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

### WT 2016 only

 DO.data %>% 
      filter(year==2016) %>% 
      #filter(TOD == c('day', 'night')) %>% 
      group_by(monthday,StationName, month) %>%
      summarize(minDO = min(DO_mgL), maxWT= max(water.temp, na.rm=T)) %>% 
   ggplot(aes(x=maxWT,y=minDO)) + 
          geom_point(aes(color=as.factor(month)),position='jitter') + 
          geom_smooth() +
          xlab('Temp daily max [C]') +
          ylab('DO daily min [mg/L') + 
          guides(color=guide_legend(order=1)) +
          facet_wrap(~StationName, ncol=4) + 
          scale_color_manual(values=c('#999999','#E69F00','#56B4E9','blue'))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

### Water Temperature Curves (2017 only July and August )

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

3.6 Tides revisited

DO.data %>% filter(year==2017) %>% ggplot(aes(x=TOD,y=DO_mgL,color=stage)) + geom_boxplot()

3.7 summary of land use by watershed

3.8 Tables of DO averages

3.8.1 Mean, Median and std by year (2016-2017)

3.8.2 mean, median, std (2015)

3.8.3 Mean DO by year and watershed (2016-2017)

3.8.4 Median DO by year and watershed (2016-2017)

3.8.5 STD of DO by year and watershed (2016-2017)

3.8.6 Mean of DO By site (2016-2017)

3.8.7 DO by time of day (2016-2017)

3.8.8 DO by time of day (2015)

3.8.9 DO by nearshore location (2016-2017)

3.8.10 DO by nearshore location by site

3.8.11 Water temperature by year (2016-2017)

3.8.12 Mean WT by site - write to a file and plot in GIS

3.8.13 Table for GIS

included in map below

Figure 2a - Mean DO for 2016 at each location with interplated surface of mean water temperature.

Figure 2a - Mean DO for 2016 at each location with interplated surface of mean water temperature.

Figure 2b - Mean DO for 2017 at each location with interpolated surface of mean water temperature.

Figure 2b - Mean DO for 2017 at each location with interpolated surface of mean water temperature.

3.9 Boxplots- daily values - not included in final report

3.10 Other graphs

3.10.1 DO mean by year/watershed/time of day

### DO Cycle

## Selecting by hr_mean

3.10.2 Day/Night exceedences

### Day/Night exceedences by site number

Figure 2a - Mean DO for 2016 ### Tide Stage

3.10.3 Distribution of DO values

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
### Exceedences/Acute Violations

3.11 Boxplots showing time of day distribution

3.12 Time Series Plot

3.13 Distributions of DO real-time data - using boxplots

3.14 Tide stage classification - Method 1 (DOESNT WORK)

3.14.1 Tide stage test (2017)

DO.data %>% filter(year==2016,monthday==c('08-24','08-25','08-26'), StationName=='Great South Bay 11 ')
A <-matrix(c(1,3,2,4,1,3,1,0,1), byrow=TRUE, ncol=3)
det(A)
## [1] -4