EPSCoR SCTC Aquatic Ecology

Fish weight-length relationships

R Session Info

sessionInfo()
## R version 3.5.3 (2019-03-11)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Mojave 10.14.4
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## loaded via a namespace (and not attached):
##  [1] compiler_3.5.3  magrittr_1.5    tools_3.5.3     htmltools_0.3.6
##  [5] yaml_2.2.0      Rcpp_1.0.1      stringi_1.4.3   rmarkdown_1.12 
##  [9] knitr_1.22      stringr_1.4.0   xfun_0.6        digest_0.6.18  
## [13] evaluate_0.13


What level of temporal/spatial segregation do we need in order to back-calculate fish weights from length-weight relationships?


Import weight/length data

# Fish ages used here were assigned by visual inspection of thresholds in length-frequency distribution charts as described in the script "fish_age_structure.R".  As time permits, these age assignment results will be compared with bayesian methods described in Sethi et al. 

# Prior to this approach, age-length key methods as described in Ch. 5 of Ogle 2016 ("Introductory fisheries analyses with R") were used. This appraoch was deemed to be unsuccessful in Spring 2018.


# Verify that sheets are present, create object of sheet with all tabs
SCTC2015 <- gs_title("2015 EPSCoR SCTC.xlsx")
SCTC2016 <- gs_title("2016_EPSCoR_Aquatic_Ecology_Database")

# read in desired worksheet 2015
fish_ages15 <- SCTC2015 %>% 
  gs_read(ws = "C 2015 Diet & LW Data",col_names=TRUE) %>%
  select(river,sample.id,sample.event,spp,Weight_g,Length_mm,Age_manual2,sample.event.num,Date,`Site ID`,reach_name, DS, Count,DOY,`Gear ID`) %>%
  mutate(season = fct_recode(as.factor(sample.event.num),         #code sampling events as seasons
                             "Early Summer"="1", 
                             "Mid-Summer" = "2",
                             "Late Summer" ="3")) %>%
  rename(wt_g = Weight_g,
         len_mm = Length_mm,
         age = Age_manual2,
         Hydrologic.Segment = reach_name,
         diet_sample = DS,
         Gear_ID = `Gear ID`) %>%
  transform(Date = anydate(Date)) %>%
  filter(spp %in% c("Coho", "Chinook")) %>% # filter out all species except Chinook and Coho
  filter(Gear_ID != "Aq. Net")  # filter out fish caught opportunistically with aquarium net for size evaluations

# read in desired worksheet 2016
fish_ages16 <- SCTC2016 %>% 
  gs_read(ws = "C 2016 Diet & LW Data",col_names=TRUE) %>%
  select(river,sample.id,sample.event,spp,Weight_g,Length_mm,Age_manual2,sample.event.num,Date,`Site ID`,reach_name,DS,Count,DOY) %>%
  mutate(season = fct_recode(as.factor(sample.event.num),         #code sampling events as seasons
                             "Early Summer"="1", 
                             "Mid-Summer" = "2",
                             "Late Summer" ="3",
                             "Fall" ="4")) %>%
  rename(wt_g = Weight_g,
         len_mm = Length_mm,
         age = Age_manual2,
         Hydrologic.Segment = reach_name,
         diet_sample = DS) %>%
  transform(Date = anydate(Date),
            wt_g = as.numeric(wt_g)) %>%
  filter(spp %in% c("Coho", "Chinook")) # filter out all species except Chinook and Coho


#join 2015 and 2016 data
fish_ages <- bind_rows(fish_ages15,fish_ages16)

#prep dataframe containing age data
dat <- fish_ages %>% 
  separate(sample.event, c("reach","year","sample.event.num"), sep = "-", remove = F) %>%
  unite(river_spp, river, spp, remove = F, sep = " ") %>%
  # manually order appearance of seasons and rivers for plots
  transform(year = as.factor(year),
            river = factor(river, levels=c("Beaver Creek",
                                              'Russian River',
                                              'Ptarmigan Creek',
                                              'Kenai River')),
            season = factor(season, levels = c("Early Summer",
                                     "Mid-Summer",
                                     "Late Summer",
                                     "Fall")), 
            river_spp = as.factor(river_spp),
            age = as.factor(age)) %>%
  mutate(river_spp = paste(river,"\n",spp)) %>%
  mutate(river_reach = paste(Hydrologic.Segment,river)) %>%
  mutate(river_spp_yr = paste(year,river,"\n",spp)) %>%
  mutate(doy = yday(Date)) %>%
  # filter out blanks where fish age/length/weight data is missing
  filter(!is.na(age),
         !is.na(wt_g),
         !is.na(len_mm)) %>%
 # age 2 fish: this cohort is sparse and likely departing for the ocean.
  filter(age != 2) %>%
    select(river_spp,river,river_reach,sample.id, sample.event,reach,year,sample.event.num,spp,wt_g,len_mm,
           age,season,Date,Site.ID,Hydrologic.Segment,Count,diet_sample,river_spp_yr,doy)

# housekeeping: remove Age 1 Chinook; not considered for growth models
age1chnk <- dat %>% filter(spp == "Chinook",
                           age == 1)
dat <- anti_join(dat,age1chnk)
rm(age1chnk)

# ensure that all age 2 fish are excluded!
dat <- dat %>% filter(age != 2)

# catch summary table; with age 1 chnk removed and all age 2 coho removed
catch_summary <- dat %>%
  group_by(year,river,season,river_reach,spp,age,diet_sample) %>%
  summarise(ct = sum(Count)) %>%
  spread(diet_sample,ct) %>%
  replace(is.na(.), 0) %>%
  mutate(total_fish = sum(N,Y)) %>%
  rename(diet_sample = Y,
         len_wt_only = N)

rm(fish_ages15,fish_ages16)

# assign row number to later match output to objects in other scripts
dat <- dat %>%
  mutate(rownum = row_number())


Fit basic models

## Fit basic models
# Age 0 
basic.lm.0 <- lm(wt_g ~ len_mm, data = subset(dat, age == 0))
tidy(summary(basic.lm.0))

# Age 1
basic.lm.1 <- lm(wt_g ~ len_mm, data = subset(dat, age == 1))
tidy(summary(basic.lm.1))

## Plot
dat %>%
ggplot(aes(len_mm,wt_g)) +
  facet_grid(spp~age) +
  geom_jitter(width = 0.1)+
  geom_smooth(method = "lm") +
  ggtitle("Length-Weight, All Age 0 and Age 1 Fish") + 
  theme_bw()



# diagnotsic plots
layout(matrix(c(1,2,3,4),2,2)) # optional 4 graphs/page 
plot(basic.lm.0)

plot(basic.lm.1)


Fit log-transformed models


### what do these relationships look like w/ log-transformed data?

dat.log <- dat %>%
  mutate(log.wt_g = log10(wt_g),
         log.len_mm = log10(len_mm))

## Fit basic models
# Age 0 
log.lm.0 <- lm(log.wt_g ~ log.len_mm, data = subset(dat.log, age == 0))
tidy(summary(log.lm.0))
glance(log.lm.0)

# Age 1
log.lm.1 <- lm(log.wt_g ~ log.len_mm, data = subset(dat.log, age == 1))
tidy(summary(log.lm.1))
glance(log.lm.1)

## Plot
dat.log %>%
ggplot(aes(log.len_mm,log.wt_g)) +
  facet_grid(spp~age) +
  geom_jitter(width = 0.1)+
  geom_smooth(method = "lm") +
  ggtitle("Length-Weight, All Age 0 and Age 1 Fish") + 
  theme_bw()


# diagnotsic plots
layout(matrix(c(1,2,3,4),2,2)) # optional 4 graphs/page 
plot(log.lm.0)

plot(log.lm.1)


The log-transformed overall model for length ~ weight has a higher r2 (0.9514) than the untransformed data (0.9269) and exhibits better adherence to linear model assumptions.




Following example of back-transformation of log.length values to acquire weight values (from Ogle 2016, Ch. 7, p 134):


Fit overall linear model

# fit overall lm
fit1 <- lm(log.wt_g ~ log.len_mm, data = dat.log)

# get details
coef(fit1)
## (Intercept)  log.len_mm 
##   -4.956608    3.003857
confint(fit1)
##                 2.5 %    97.5 %
## (Intercept) -4.981668 -4.931547
## log.len_mm   2.989967  3.017747


Verify that the model has a significant explanatory variable

Anova(fit1)
summary(fit1)
## 
## Call:
## lm(formula = log.wt_g ~ log.len_mm, data = dat.log)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.40994 -0.02479  0.00147  0.02698  0.52776 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -4.956608   0.012783  -387.8   <2e-16 ***
## log.len_mm   3.003857   0.007085   424.0   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.04939 on 4273 degrees of freedom
## Multiple R-squared:  0.9768, Adjusted R-squared:  0.9768 
## F-statistic: 1.798e+05 on 1 and 4273 DF,  p-value: < 2.2e-16

Large F-value and small p-val indicate that model w/ explanatory variable explains more variability than model w/ just intercept.


Use the relationship to make a prediction about weight from length

lens <- dat$len_mm                                   # vector of lengths
nd <- data.frame(log.len_mm = log10(lens))           # df of log(lengths)
plogW <- predict(fit1,nd)                            # predicted log(weights)


We want to back-transform the predicted log-weights back to the original scale. However, values back-transformed as such are commonly biased. Apply correction factor (Sprugel 1983):

# calculate correction factor
 ( cf <- logbtcf(fit1,10) )
## [1] 1.006487


The corrected back-transformed predicted value of the response variable is then calculated by multiplying the back-transformed predicted value by the correction factor:

# apply correction factor
backcal.wts <- as.data.frame(cf*(10^plogW))


We also want confidence intervals for the predicted values:

mlogW <- predict(fit1,nd,interval = "confidence")
head(cf*10^mlogW)
##         fit       lwr       upr
## 1 0.5261507 0.5216771 0.5306626
## 2 0.6691608 0.6641393 0.6742204
## 3 0.6691608 0.6641393 0.6742204
## 4 0.7220364 0.7168415 0.7272689
## 5 0.7220364 0.7168415 0.7272689
## 6 0.7220364 0.7168415 0.7272689
plogW <- predict(fit1,nd,interval = "prediction")
head(cf*10^plogW)
##         fit       lwr       upr
## 1 0.5261507 0.4209343 0.6576668
## 2 0.6691608 0.5353656 0.8363933
## 3 0.6691608 0.5353656 0.8363933
## 4 0.7220364 0.5776749 0.9024739
## 5 0.7220364 0.5776749 0.9024739
## 6 0.7220364 0.5776749 0.9024739


Visualize the fit

# transformed plot
plot(log.wt_g~log.len_mm, data = dat.log,
     pch = 19, col = rgb(0,0,0,1/4),
     ylab = "log Weight (g)", xlab = "log Fork Length (mm)",
     main = "Transformed Data")
tmp <- range(dat.log$log.len_mm)
xs <- seq(tmp[1],tmp[2],length.out = 99)
ys <- predict(fit1,data.frame(log.len_mm = xs))
lines(ys~xs,lwd=2)


# back-transformed data
plot(wt_g~len_mm, data = dat,
     pch = 19, col = rgb(0,0,0,1/4),
     ylab = "log Weight (g)", xlab = "log Fork Length (mm)",
     main = "Back-Transformed Fish Weight Data")
btxs <- 10^xs
btys <- cf*10^ys
lines(btys~btxs,lwd=2)


Next, we should check the assumptions of our work, i.e. : -measurement errors are: –normally distributed –have constant variance

r <- residuals(fit1)
fv <- fitted(fit1)
residPlot(fit1)


It appears the model exhibits heteroscedastity, likely because error is not constant. Maybe segregate data by age – it would make sense have parameters for each age class!









Could we get a better model fit when data is segregated by age?

fit2 <- lm(log.wt_g ~ log.len_mm*age, data = dat.log)
Anova(fit2)
residPlot(fit2)

summary(fit2)
## 
## Call:
## lm(formula = log.wt_g ~ log.len_mm * age, data = dat.log)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.40518 -0.02642  0.00106  0.02751  0.52385 
## 
## Coefficients:
##                 Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)     -4.87853    0.02527 -193.079   <2e-16 ***
## log.len_mm       2.95816    0.01456  203.176   <2e-16 ***
## age1            -0.07576    0.04408   -1.719   0.0857 .  
## log.len_mm:age1  0.04544    0.02388    1.903   0.0571 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.04932 on 4271 degrees of freedom
## Multiple R-squared:  0.9769, Adjusted R-squared:  0.9768 
## F-statistic: 6.009e+04 on 3 and 4271 DF,  p-value: < 2.2e-16

Blue = Age 0 Red = Age 1


Yes; clearly the residual distribution is quite different for the two age classes.

So: repeat whole procedure from above, but segregated by age!



AGE 0

Age 0 length-weight parameters

Fit linear model

dat.log.0 <- dat.log %>%
  filter(age == 0)
# fit overall age 0 lm
fit0 <- lm(log.wt_g ~ log.len_mm, data = dat.log.0)

# get details
tidy(coef(fit0))
confint(fit0)
##                 2.5 %    97.5 %
## (Intercept) -4.931416 -4.825641
## log.len_mm   2.927688  2.988639


Verify that the model has a significant explanatory variable

tidy(Anova(fit0))
tidy(summary(fit0))

Large F-value and small p-val indicate that model w/ explanatory variable explains more variability than model w/ just intercept.


Use the relationship to make a prediction about weight from length

lens <- dat.log.0$len_mm                                  # vector of lengths
ages <- dat.log.0$age
nd <- data.frame(log.len_mm = log10(lens), age = ages)           # df of log(lengths)
plogW <- predict(fit0,nd)                            # predicted log(weights)

Back-transform the predicted log-weights back to the original scale:

# calculate correction factor
 ( cf <- logbtcf(fit0,10) )
## [1] 1.007375

The corrected back-transformed predicted value of the response variable is then calculated by multiplying the back-transformed predicted value by the correction factor:

# apply correction factor
backcal.wts <- as.data.frame(cf*(10^plogW))

Confidence intervals for the predicted mean values:

mlogW <- predict(fit0,nd,interval = "confidence")

# create df for export
age0meanCI <- as.data.frame(cf*10^mlogW) %>%
  rename(lwrCImean = lwr,
         uprCImean = upr,
        backcal.wts = fit)

Upper and lower CI boundaries indicate we are 95% sure that MEAN weight of fish is between those values


Confidence intervals for the predicted individual values:

plogW <- predict(fit0,nd,interval = "prediction")
head(cf*10^plogW)
##         fit       lwr       upr
## 1 0.5351303 0.4217581 0.6789780
## 2 0.6780965 0.5344973 0.8602755
## 3 0.6780965 0.5344973 0.8602755
## 4 0.7308322 0.5760832 0.9271502
## 5 0.7308322 0.5760832 0.9271502
## 6 0.7308322 0.5760832 0.9271502

# create df for export
age0indvCI <- as.data.frame(cf*10^plogW) %>%
  rename(lwrCIindv = lwr,
         uprCIindv = upr,
        backcal.wts = fit)

Upper and lower CI boundaries indicate we are 95% sure that INDIVIDUAL weight of fish is between those values


Visualize the fit

# transformed plot
plot(log.wt_g~log.len_mm, data = dat.log.0,
     pch = 19, col = rgb(0,0,0,1/4),
     ylab = "log Weight (g)", xlab = "log Fork Length (mm)",
     main = "Transformed Data, Age 0")
tmp <- range(dat.log.0$log.len_mm)
xs <- seq(tmp[1],tmp[2],length.out = 99)
ys <- predict(fit0,data.frame(log.len_mm = xs))
lines(ys~xs,lwd=2)


# back-transformed data
plot(wt_g~len_mm, data = dat.log.0,
     pch = 19, col = rgb(0,0,0,1/4),
     ylab = "log Weight (g)", xlab = "log Fork Length (mm)",
     main = "Back-Transformed Data, Age 0")
btxs <- 10^xs
btys <- cf*10^ys
lines(btys~btxs,lwd=2)

We should check the assumptions of our work : -measurement errors are: –normally distributed –have constant variance

r <- residuals(fit0)
fv <- fitted(fit0)
residPlot(fit0)


Prep for export Age 0 back-calculated weight values for later use in other contexts

# finalize age 0 data
age0fnl <- bind_cols(age0meanCI,age0indvCI) %>%
  select(-backcal.wts1) %>%
  bind_cols(dat.log.0)



AGE 1

Age 1 length-weight parameters

Fit linear model

dat.log.1 <- dat.log %>%
  filter(age == 1)
# fit overall age 0 lm
fit1 <- lm(log.wt_g ~ log.len_mm, data = dat.log.1)

# get details
tidy(coef(fit1))
confint(fit1)
##                 2.5 %    97.5 %
## (Intercept) -5.016800 -4.891770
## log.len_mm   2.970842  3.036365


Verify that the model has a significant explanatory variable


tidy(Anova(fit1))
tidy(summary(fit1))

Large F-value and small p-val indicate that model w/ explanatory variable explains more variability than model w/ just intercept.


Use the relationship to make a prediction about weight from length

lens <- dat.log.1$len_mm                                  # vector of lengths
ages <- dat.log.1$age
nd <- data.frame(log.len_mm = log10(lens), age = ages)           # df of log(lengths)
plogW <- predict(fit1,nd)                            # predicted log(weights)

We want to back-transform the predicted log-weights back to the original scale. However, values back-transformed as such are commonly biased. Apply correction factor (Sprugel 1983):

# calculate correction factor
 ( cf <- logbtcf(fit1,10) )
## [1] 1.005035

The corrected back-transformed predicted value of the response variable is then calculated by multiplying the back-transformed predicted value by the correction factor:

# apply correction factor
backcal.wts <- as.data.frame(cf*(10^plogW))

We also want confidence intervals for the predicted values:

mlogW <- predict(fit1,nd,interval = "confidence")
head(cf*10^mlogW)
##        fit      lwr      upr
## 1 1.502310 1.478747 1.526248
## 2 1.783691 1.758827 1.808907
## 3 1.783691 1.758827 1.808907
## 4 1.884756 1.859530 1.910325
## 5 1.884756 1.859530 1.910325
## 6 1.884756 1.859530 1.910325

# create df for export
age1meanCI <- as.data.frame(cf*10^mlogW) %>%
  rename(lwrCImean = lwr,
         uprCImean = upr,
        backcal.wts = fit)
plogW <- predict(fit1,nd,interval = "prediction")
head(cf*10^plogW)
##        fit      lwr      upr
## 1 1.502310 1.233410 1.829834
## 2 1.783691 1.464623 2.172269
## 3 1.783691 1.464623 2.172269
## 4 1.884756 1.547670 2.295261
## 5 1.884756 1.547670 2.295261
## 6 1.884756 1.547670 2.295261

# create df for export
age1indvCI <- as.data.frame(cf*10^plogW) %>%
  rename(lwrCIindv = lwr,
         uprCIindv = upr,
        backcal.wts = fit)


Visualize the fit

# transformed plot
plot(log.wt_g~log.len_mm, data = dat.log.1,
     pch = 19, col = rgb(0,0,0,1/4),
     ylab = "log Weight (g)", xlab = "log Fork Length (mm)",
     main = "Transformed Data, Age 1")
tmp <- range(dat.log.1$log.len_mm)
xs <- seq(tmp[1],tmp[2],length.out = 99)
ys <- predict(fit1,data.frame(log.len_mm = xs))
lines(ys~xs,lwd=2)


# back-transformed data
plot(wt_g~len_mm, data = dat.log.1,
     pch = 19, col = rgb(0,0,0,1/4),
     ylab = "log Weight (g)", xlab = "log Fork Length (mm)",
     main = "Back-Transformed Data, Age 1")
btxs <- 10^xs
btys <- cf*10^ys
lines(btys~btxs,lwd=2)


We should check the assumptions of our work : -measurement errors are: –normally distributed –have constant variance

r <- residuals(fit1)
fv <- fitted(fit1)
residPlot(fit1)


Prep for export Age 1 back-calculated weight values for later use in other contexts

# finalize age 0 data
age1fnl <- bind_cols(age1meanCI,age1indvCI) %>%
  select(-backcal.wts1) %>%
  bind_cols(dat.log.1)


Join age 0 and age 1 data

# join age 0 and age 1 backcal data
fnl <- bind_rows(age0fnl,age1fnl)


Export backcal data for later use