Bioenergetics Data Input Selection

EPSCoR SCTC Aquatic Ecology

Bioenergetics data selection and subsetting – Analysis in progress

R Session Info

sessionInfo()
## R version 3.4.3 (2017-11-30)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.3
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## loaded via a namespace (and not attached):
##  [1] compiler_3.4.3  backports_1.1.2 magrittr_1.5    rprojroot_1.3-2
##  [5] tools_3.4.3     htmltools_0.3.6 yaml_2.1.16     Rcpp_0.12.15   
##  [9] stringi_1.1.6   rmarkdown_1.8   knitr_1.19      stringr_1.2.0  
## [13] digest_0.6.15   evaluate_0.10.1

Bioenergetics modeling efforts will require the following fieldwork-generated data as inputs: mean weight at start and end of simulation, proportional daily mean diet inputs, and daily mean water temperature.

When these data sources are grouped by Year, Species, Age, and River (e.g. “Beaver Creek Coho 2015”), a total of 31 groupings emerge (all age 2 fish excluded). However, sufficiently robust data does not exist to support modeling effort in all of these iterations (low sample size; temperature data missing; etc.) Additionally, in some of these groupings an increasing size trend (positive slope) is not observed; likely due to insufficent sample size or complex habitat.

Table X below

Create table to aid in selecting proposed bioenergetics simulations with sufficient data inputs.

a.) Calculate temporal extent of available water temperature data.


# struggling to automate the process of reading in final csv temperature files from local folder.  Accomplished with manual read.csv for now

dir15 <-"/Users/bmeyer/Google Drive/Thesis/R Analysis/Temperature Data/Final Temperature Data/2015/"
dir16 <-"/Users/bmeyer/Google Drive/Thesis/R Analysis/Temperature Data/Final Temperature Data/2016/"

BC15 <- read.csv(paste0(dir15,"2015_","LBC","_H2OTemp_Final",".csv"))
RR15 <- read.csv(paste0(dir15,"2015_","LRR","_H2OTemp_Final",".csv"))
PC15 <- read.csv(paste0(dir15,"2015_","LPC","_H2OTemp_Final",".csv"))
LKR15 <- read.csv(paste0(dir15,"2015_","LKR","_H2OTemp_Final",".csv"))

BC16 <- read.csv(paste0(dir16,"2016_","LBC","_H2OTemp_Final",".csv"))
RR16 <- read.csv(paste0(dir16,"2016_","LRR","_H2OTemp_Final",".csv"))
PC16 <- read.csv(paste0(dir16,"2016_","MPC","_H2OTemp_Final",".csv"))
LKR16 <- read.csv(paste0(dir16,"2016_","LKR","_H2OTemp_Final",".csv"))

# bind all temperature files together
# binding and transformation performed seperately for 2015 and 2016 because for some reason POSIX format of DateTime column between the years is incompatible... (2/8/2018)
fnl_temps15 <- bind_rows(BC15,RR15,PC15,LKR15) %>%
transform(DateTime = mdy_hm(DateTime)) %>%
  mutate(year = year(DateTime)) %>%
  transform(Stream_Name= as.factor(Stream_Name),
            year = as.factor(year),
            Date = mdy(Date)) %>%
  select(-X,-X.1,-X.2,-X.3)

fnl_temps16 <- bind_rows(BC16,RR16,PC16,LKR16) %>%
  # something weird here eliminates 2016 datetime data
  transform(DateTime = as.POSIXct(DateTime)) %>%
  mutate(year = year(DateTime)) %>%
  transform(Stream_Name= as.factor(Stream_Name),
            year = as.factor(year),
            Date = as.Date(Date)) %>%
  select(-X)

# bind together 2015 and 2016
fnl_temps <- bind_rows(fnl_temps15,fnl_temps16)

# remove temporalry objects
rm(fnl_temps15,fnl_temps16,BC15,RR15,PC15,LKR15,BC16,RR16,PC16,LKR16)

# calculate min and max extent of water temperature logger deployments
h2o_logger_times <- fnl_temps %>%
  group_by(Stream_Name,year) %>% 
  summarize(logger.deploy.start = min(Date),
            logger.deploy.end = max(Date)) %>%
  transform(year = as.factor(year))

# to do: make chart that shows temporal extent of logger deployments
# in progress
# munge
#h2o_logger_times_x <- h2o_logger_times %>%
#  transform(logger.deploy.start = yday(logger.deploy.start),
#            logger.deploy.end = yday(logger.deploy.end),
#           year = as.factor(year))

#p <- ggplot(h2o_logger_times_x, aes(x=logger.deploy.start, xend = logger.deploy.end,
#                                  y=Stream_Name, yend=Stream_Name)) +
  
 # geom_segment(aes(color = Stream_Name),size = 24)

b.) Calculate catch summary data

# Import resultant data from "fish_ages_all.R" script.  The script uses methods described in Ch. 5 of Ogle 2016 ("Introductory fisheries analyses with R") to age all captured fish from a subset of fish scale samples.  Resultant csv file was copied to the google sheet referenced here.

# ID googlesheet with fish age data
fish_ages <- gs_title("all_fish_ages.csv") 
Sys.sleep(6)

#read in worksheet containing diet data
dat <- fish_ages %>% 
  gs_read(ws = "all_fish_ages.csv",col_names=TRUE) %>% 
  select (-X1) %>%
  separate(sample.event, c("reach","year","sample.event.num"), sep = "-", remove = F) %>%
  unite(river_spp, river, spp, remove = F, sep = " ") %>%
  # manually order appearance of seasons and rivers for plots
  transform(year = as.factor(year),
            river = factor(river, levels=c("Beaver Creek",
                                              'Russian River',
                                              'Ptarmigan Creek',
                                              'Kenai River')),
            season = factor(season, levels = c("Early Summer",
                                     "Mid-Summer",
                                     "Late Summer",
                                     "Fall")), 
            river_spp = as.factor(river_spp)) %>%
  mutate(river_spp_yr = paste(year,river,"\n",spp)) %>%
 # age 2 fish: this cohort is sparse and likely departing for the ocean.
  filter(age != 2) %>%
  # mutate column for yes/no diet sampled
  mutate(diet_sample= ifelse(!is.na(sample.id), "Y", "N")) %>%
  mutate(Count = 1) %>%
  transform(Date = mdy(Date)) %>%
  mutate(doy = yday(Date))  %>%
  # filter all except coho and chinook
  filter(spp %in% c("Chinook", "Coho"))


# catch summary table
catch_summary <- dat %>%
  group_by(year,river,spp,age,diet_sample) %>%
  summarise(ct = sum(Count)) %>%
  spread(diet_sample,ct) %>%
  replace(is.na(.), 0) %>%
  mutate(total_fish = sum(N,Y)) %>%
  rename(diet_sample = Y,
         len_wt_only = N)

c.) Calculate slopes of length and weight growth trends for grouping level: year ~ river ~ species ~ age


# calc slope of each size trend for len and wt...??
all_slopes <- dat %>%
  transform(river_spp_yr = as.factor(river_spp_yr),
            year = as.factor(year),
            spp = as.factor(spp),
            len_mm = as.numeric(len_mm)) %>%
  select(river_spp_yr,river,year,spp,age,len_mm,wt_g,doy) 

# length model formulation for growth trend
len_growth_model <- function(df) {
  lm(len_mm ~ doy, data = df)
}

# weight model formulation for growth trend
wt_growth_model <- function(df) {
  lm(wt_g ~ doy, data = df)
}


# create table of length and weight trend slopes for each simulation
size_age_spp_year <- all_slopes %>% 
  group_by(river, spp, age, year) %>% 
  nest() %>% 
  # calculate slopes & intercept for len_mm ~ day of year
  mutate(
    age_len = map(data, len_growth_model),
    tidy_age_len = map(age_len, broom::tidy),
    len_slope = map_dbl(tidy_age_len, ~.$estimate[2]),
    len_intercept = map_dbl(tidy_age_len, ~.$estimate[[1]])) %>%
  # calculate slopes for wt_g ~ day of year
  mutate(
    age_wt = map(data, wt_growth_model),
    tidy_age_wt = map(age_wt, broom::tidy),
    wt_slope = map_dbl(tidy_age_wt, ~.$estimate[2]),
    wt_intercept = map_dbl(tidy_age_wt, ~.$estimate[[1]])) %>%
  select(year,river, spp, age, len_slope, len_intercept, wt_slope, wt_intercept)

d.) Define temporal extents of proposed simulations

# Define temporal extents of proposed simulations

# NOTE 1/22/18: this code also exists as a script at "/Users/bmeyer/Google Drive/Thesis/R Analysis/Thesis Analyses R Project/Overall_scripts_2015_2016/sampling_periods.R".  Attempted to just source("\path") this script in to the markdown document but, 'source' function prevents knitting through markdown for some reason.  

############
#
# 2015
#
###########

# Verify that sheet is present, create object of sheet with all tabs
SCTC2015 <- gs_title("2015 EPSCoR SCTC.xlsx")

#read in specific worksheet containing fishing data
effort2015 <- SCTC2015 %>% 
  gs_read(ws = "B Fishing Data",col_names=TRUE, skip=2) %>%
  #rename needed columns  
  rename(site.id=Site_ID,
         sample.event=Sample_Event) %>%
  
  #select desired columns
  select(site.id,River,sample.event,sample.event.num,deploy_dt,collect_dt) %>%
  
  #filter out blanks
  filter(!is.na(sample.event)) %>%
  
  # transform columns to desired classes
  transform(site.id=as.factor(site.id),
            sample.event=as.factor(sample.event),
            sample.event.num=as.factor(sample.event.num)) 


###################
#
# 2016
#
###################

# Verify that sheet is present, create object of sheet with all tabs
SCTC2016 <- gs_title("2016_EPSCoR_Aquatic_Ecology_Database")

#read in specific worksheet containing fishing data
effort2016 <- SCTC2016 %>% 
  gs_read(ws = "B Fishing Data",col_names=TRUE, skip=2) %>%
  
  #rename needed columns  
  rename(site.id=Site_ID,
         sample.event=Sample_Event) %>%
  
  #select desired columns
  select(site.id,River,sample.event,sample.event.num,deploy_dt,collect_dt) %>%
  
  #filter our blanks
  filter(!is.na(sample.event)) %>%
  
  # transform columns to desired classes
  transform(site.id=as.factor(site.id),
            sample.event=as.factor(sample.event),
            sample.event.num=as.factor(sample.event.num)) 


# bind 2015 & 2016 data; format datetimes
effort <- bind_rows(effort2015,effort2016) %>%
  transform(collect_dt = anytime(collect_dt),
            deploy_dt = anytime(deploy_dt)) %>%
  mutate(year = year(collect_dt)) 



# Define Temporal Extents of proposed simulations 

# "Simulation" temporal extent
# Time period from first date of each season to first date of following season

# prepare data to tibble form
sim_extents <- effort %>% 
  select(-site.id) %>%
  group_by(River, year, sample.event.num) %>% 
  nest() %>% 
  mutate(season.start = map_dbl(data, ~min(.$deploy_dt)),
         season.end = map_dbl(data, ~min(.$deploy_dt))
         ) %>%
  transform(season.start = anytime(season.start),
            season.end = anytime(season.end)) %>%
  arrange(River,year,sample.event.num)

# prepare data such that end of one season is beginning of next
t <- sim_extents %>%
  mutate(season.end_x = lead(season.end)) %>%
  select(-season.end) %>%
  arrange(River,year,sample.event.num)

# prepare data such that final season of each year has duration of 30 days.
# 2015 had 3 seasons, 2016 had 4 seasons.

t_end15 <- t %>%
  filter(sample.event.num == 3,
         year == 2015) %>%
  mutate(season.end = season.start + days(30)) %>%
  select(-season.end_x)

t_end16 <- t %>%
  filter(sample.event.num == 4,
         year == 2016) %>%
  mutate(season.end = season.start + days(30)) %>%
  select(-season.end_x)

t_end15_1 <- t %>%
  filter(sample.event.num != 3,
         year != 2016) %>%
  mutate(season.end = season.start + days(30)) %>%
  select(-season.end_x)

t_end16_1 <- t %>%
  filter(sample.event.num != 4,
         year != 2015) %>%
  mutate(season.end = season.start + days(30)) %>%
  select(-season.end_x)

# final version of simulation extents
sim_extents <- bind_rows(t_end15_1,t_end15,t_end16_1,t_end16) %>%
  rename(Stream_Name = River,
         sim.start = season.start,
         sim.end = season.end) %>%
  transform(year = as.factor(year))

# remove temporary objects
rm(t,t_end15,t_end15_1,t_end16,t_end16_1,effort2015,effort2016)

# to do: make plot that visualizes extent of each simulation period faceted by year and river

e.) Calculate degree of overlap between proposed simulatin extents and availability of water tempeature data


# does water temperature data overlap with all of the simulation extents?

# NOTE: to do; fix kenai river water temp import for longer time extent

# by year, river, season, species
# join to simulation selection datatable when considering at scale of SEASON start -> finish 
sim_logger_overlap <- full_join(sim_extents,h2o_logger_times,by=c("Stream_Name","year")) %>%
  mutate(start_overlap = ifelse(sim.start > logger.deploy.start,"Yes","No")) %>%
  mutate(end_overlap = ifelse(sim.end < logger.deploy.end,"Yes","No")) %>%
  mutate(full_overlap = ifelse(start_overlap == "Yes" & end_overlap == "Yes","Yes","No")) %>%
  rename(river = Stream_Name) %>%
  select(river,year,sample.event.num,full_overlap) %>%
  transform(year = as.factor(year),
            river = as.factor(river))

# by year, river, species (NOT season)
# join to simulation selection datatable when considering at scale of YEAR start -> finish 
sim_logger_overlap_yr <- sim_extents %>%
  group_by(Stream_Name,year) %>%
  summarise(yr.sim.start = min(sim.start),
            yr.sim.end = max(sim.end)) %>%
  full_join(h2o_logger_times,by=c("Stream_Name","year")) %>%
  mutate(start_overlap = ifelse(yr.sim.start > logger.deploy.start,"Yes","No")) %>%
  mutate(end_overlap = ifelse(yr.sim.end < logger.deploy.end,"Yes","No")) %>%
  mutate(full_overlap = ifelse(start_overlap == "Yes" & end_overlap == "Yes","Yes","No")) %>%
  rename(river = Stream_Name) %>%
  select(river,year,full_overlap) %>%
  transform(year = as.factor(year),
            river = as.factor(river))


# join catch table and size trend slopes table
tbl <- full_join(catch_summary, size_age_spp_year, by=c("year","river","spp","age")) 

# join water temperature extent/overlap data summary to catch/growth data summary
tbl <- full_join(tbl,sim_logger_overlap_yr,by=c("river","year"))

Data selection table, grouping level: year ~ river ~ species ~ age


datatable(tbl, caption = "Juvenile Chinook and Coho Catch and Size Trend Summary Data", 
          filter = 'top', options = list(
  pageLength = 5, autoWidth = TRUE)) %>%
  formatSignif(c("len_slope","wt_slope","len_intercept","wt_intercept"), digits = 2)

Table X - Summary of values used to determine if sufficient data exists for bioenergetics modeling efforts at the scale of year ~ river ~ species ~ age. The column “full_overlap” refers to whether or not water temperature data exists during the full extent of the proposed simulation period.

Filtered data selection table: year ~ river ~ species ~ age

The following filters are applied to the data table below:

Total fish > 10
Slope of weight gain trend > 0
Slope of fork length gain trend > 0


tbl_filt <- tbl %>%
  filter(total_fish > 10,
         len_slope > 0,
         wt_slope > 0)

datatable(tbl_filt, filter = 'top', options = list(
  pageLength = 5, autoWidth = TRUE)) %>%
  formatSignif(c("len_slope","wt_slope","len_intercept","wt_intercept"), digits = 2)

Table X - Filtered summary of values used to determine if sufficient data exists for bioenergetics modeling efforts at the scale of year ~ river ~ species ~ age. The column “full_overlap” refers to whether or not water temperature data exists during the full extent of the proposed simulation period.

Discussion 1.1

At the grouping level of year ~ river ~ species ~ age, about 1/3 (13 of 31) of the total potential simulations meet the ad hoc standards for sufficient data to perform a bioenergetics simulation. An additional consideration is that not all of these simulations will have 100% coverage for water temperature data.

Calculate empirical start and end weights for chosen subset of simulations

# find start and end weights

# summary table
size_summary <- dat %>%
  group_by(river, season, age, spp, year) %>%
  summarise(average_wt = mean(wt_g),
            stdev_wt = sd(wt_g),
            average_len = mean(len_mm),
            stdev_len = sd(len_mm),
            count = n()) %>%
  select(river,season,year,spp,age,count,average_wt,stdev_wt,average_len,stdev_len) %>%
  rename("_age_" = age)

# join chosen simulations (from filter) to seasonal mean length/weight data;

# fix "_age_" name
size_summary <- size_summary %>%
  rename(age = `_age_`)

# join tables; use inner_join becuase we want to include only the simulations selected by the previous table
tbl2 <- inner_join(size_summary,tbl_filt,by=c("river","year","spp","age")) %>%
  transform(year = as.factor(year))

Linear model start and end weights for chosen subset of simulations


#from start doy to end doy
#want output for wt from y=mx+b

#need: day of year for the sim extent start and finish
sim_extents <- sim_extents %>%
  mutate(sim.start.doy = yday(sim.start),
         sim.end.doy = yday(sim.end)) %>%
  mutate(season = fct_recode(as.factor(sample.event.num), 
                             "Early Summer"="1", 
                             "Mid-Summer" = "2",
                             "Late Summer" ="3",
                             "Fall" = "4")) %>%
  rename(river = Stream_Name)
  
# use slope and intercept values from empirical growth observations to generate simulation start and end weights/lengths; use y = mx + b linear model
s <- left_join(tbl2,sim_extents,by=c("season","river","year")) %>%
  mutate(lm_wt_start = wt_slope*sim.start.doy + wt_intercept,
         lm_wt_end = wt_slope*sim.end.doy + wt_intercept,
         lm_len_start = len_slope*sim.start.doy + len_intercept,
         lm_len_end = len_slope*sim.end.doy + len_intercept)

Chosen subset of simulations by season


s1 <- s %>%
  select(-c(data,sample.event.num))
  

datatable(s1, filter = 'top', options = list(
  pageLength = 5)) %>%
  formatSignif(c("average_wt","stdev_wt","average_len","stdev_len","len_slope","wt_slope","len_intercept","wt_intercept","lm_wt_start","lm_wt_end","lm_len_start","lm_len_end"), digits = 2)