R Session Info
sessionInfo()
## R version 3.5.3 (2019-03-11)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Mojave 10.14.4
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## loaded via a namespace (and not attached):
## [1] compiler_3.5.3 magrittr_1.5 tools_3.5.3 htmltools_0.3.6
## [5] yaml_2.2.0 Rcpp_1.0.1 stringi_1.4.3 rmarkdown_1.12
## [9] knitr_1.22 stringr_1.4.0 xfun_0.6 digest_0.6.18
## [13] evaluate_0.13
What level of temporal/spatial segregation do we need in order to back-calculate fish weights from length-weight relationships?
Import weight/length data
# Fish ages used here were assigned by visual inspection of thresholds in length-frequency distribution charts as described in the script "fish_age_structure.R". As time permits, these age assignment results will be compared with bayesian methods described in Sethi et al.
# Prior to this approach, age-length key methods as described in Ch. 5 of Ogle 2016 ("Introductory fisheries analyses with R") were used. This appraoch was deemed to be unsuccessful in Spring 2018.
# Verify that sheets are present, create object of sheet with all tabs
SCTC2015 <- gs_title("2015 EPSCoR SCTC.xlsx")
SCTC2016 <- gs_title("2016_EPSCoR_Aquatic_Ecology_Database")
# read in desired worksheet 2015
fish_ages15 <- SCTC2015 %>%
gs_read(ws = "C 2015 Diet & LW Data",col_names=TRUE) %>%
select(river,sample.id,sample.event,spp,Weight_g,Length_mm,Age_manual2,sample.event.num,Date,`Site ID`,reach_name, DS, Count,DOY,`Gear ID`) %>%
mutate(season = fct_recode(as.factor(sample.event.num), #code sampling events as seasons
"Early Summer"="1",
"Mid-Summer" = "2",
"Late Summer" ="3")) %>%
rename(wt_g = Weight_g,
len_mm = Length_mm,
age = Age_manual2,
Hydrologic.Segment = reach_name,
diet_sample = DS,
Gear_ID = `Gear ID`) %>%
transform(Date = anydate(Date)) %>%
filter(spp %in% c("Coho", "Chinook")) %>% # filter out all species except Chinook and Coho
filter(Gear_ID != "Aq. Net") # filter out fish caught opportunistically with aquarium net for size evaluations
# read in desired worksheet 2016
fish_ages16 <- SCTC2016 %>%
gs_read(ws = "C 2016 Diet & LW Data",col_names=TRUE) %>%
select(river,sample.id,sample.event,spp,Weight_g,Length_mm,Age_manual2,sample.event.num,Date,`Site ID`,reach_name,DS,Count,DOY) %>%
mutate(season = fct_recode(as.factor(sample.event.num), #code sampling events as seasons
"Early Summer"="1",
"Mid-Summer" = "2",
"Late Summer" ="3",
"Fall" ="4")) %>%
rename(wt_g = Weight_g,
len_mm = Length_mm,
age = Age_manual2,
Hydrologic.Segment = reach_name,
diet_sample = DS) %>%
transform(Date = anydate(Date),
wt_g = as.numeric(wt_g)) %>%
filter(spp %in% c("Coho", "Chinook")) # filter out all species except Chinook and Coho
#join 2015 and 2016 data
fish_ages <- bind_rows(fish_ages15,fish_ages16)
#prep dataframe containing age data
dat <- fish_ages %>%
separate(sample.event, c("reach","year","sample.event.num"), sep = "-", remove = F) %>%
unite(river_spp, river, spp, remove = F, sep = " ") %>%
# manually order appearance of seasons and rivers for plots
transform(year = as.factor(year),
river = factor(river, levels=c("Beaver Creek",
'Russian River',
'Ptarmigan Creek',
'Kenai River')),
season = factor(season, levels = c("Early Summer",
"Mid-Summer",
"Late Summer",
"Fall")),
river_spp = as.factor(river_spp),
age = as.factor(age)) %>%
mutate(river_spp = paste(river,"\n",spp)) %>%
mutate(river_reach = paste(Hydrologic.Segment,river)) %>%
mutate(river_spp_yr = paste(year,river,"\n",spp)) %>%
mutate(doy = yday(Date)) %>%
# filter out blanks where fish age/length/weight data is missing
filter(!is.na(age),
!is.na(wt_g),
!is.na(len_mm)) %>%
# age 2 fish: this cohort is sparse and likely departing for the ocean.
filter(age != 2) %>%
select(river_spp,river,river_reach,sample.id, sample.event,reach,year,sample.event.num,spp,wt_g,len_mm,
age,season,Date,Site.ID,Hydrologic.Segment,Count,diet_sample,river_spp_yr,doy)
# housekeeping: remove Age 1 Chinook; not considered for growth models
age1chnk <- dat %>% filter(spp == "Chinook",
age == 1)
dat <- anti_join(dat,age1chnk)
rm(age1chnk)
# ensure that all age 2 fish are excluded!
dat <- dat %>% filter(age != 2)
# catch summary table; with age 1 chnk removed and all age 2 coho removed
catch_summary <- dat %>%
group_by(year,river,season,river_reach,spp,age,diet_sample) %>%
summarise(ct = sum(Count)) %>%
spread(diet_sample,ct) %>%
replace(is.na(.), 0) %>%
mutate(total_fish = sum(N,Y)) %>%
rename(diet_sample = Y,
len_wt_only = N)
rm(fish_ages15,fish_ages16)
# assign row number to later match output to objects in other scripts
dat <- dat %>%
mutate(rownum = row_number())
Fit basic models
## Fit basic models
# Age 0
basic.lm.0 <- lm(wt_g ~ len_mm, data = subset(dat, age == 0))
tidy(summary(basic.lm.0))
# Age 1
basic.lm.1 <- lm(wt_g ~ len_mm, data = subset(dat, age == 1))
tidy(summary(basic.lm.1))
## Plot
dat %>%
ggplot(aes(len_mm,wt_g)) +
facet_grid(spp~age) +
geom_jitter(width = 0.1)+
geom_smooth(method = "lm") +
ggtitle("Length-Weight, All Age 0 and Age 1 Fish") +
theme_bw()
# diagnotsic plots
layout(matrix(c(1,2,3,4),2,2)) # optional 4 graphs/page
plot(basic.lm.0)
plot(basic.lm.1)
Fit log-transformed models
### what do these relationships look like w/ log-transformed data?
dat.log <- dat %>%
mutate(log.wt_g = log10(wt_g),
log.len_mm = log10(len_mm))
## Fit basic models
# Age 0
log.lm.0 <- lm(log.wt_g ~ log.len_mm, data = subset(dat.log, age == 0))
tidy(summary(log.lm.0))
glance(log.lm.0)
# Age 1
log.lm.1 <- lm(log.wt_g ~ log.len_mm, data = subset(dat.log, age == 1))
tidy(summary(log.lm.1))
glance(log.lm.1)
## Plot
dat.log %>%
ggplot(aes(log.len_mm,log.wt_g)) +
facet_grid(spp~age) +
geom_jitter(width = 0.1)+
geom_smooth(method = "lm") +
ggtitle("Length-Weight, All Age 0 and Age 1 Fish") +
theme_bw()
# diagnotsic plots
layout(matrix(c(1,2,3,4),2,2)) # optional 4 graphs/page
plot(log.lm.0)
plot(log.lm.1)
The log-transformed overall model for length ~ weight has a higher r2 (0.9514) than the untransformed data (0.9269) and exhibits better adherence to linear model assumptions.
Following example of back-transformation of log.length values to acquire weight values (from Ogle 2016, Ch. 7, p 134):
Fit overall linear model
# fit overall lm
fit1 <- lm(log.wt_g ~ log.len_mm, data = dat.log)
# get details
coef(fit1)
## (Intercept) log.len_mm
## -4.956608 3.003857
confint(fit1)
## 2.5 % 97.5 %
## (Intercept) -4.981668 -4.931547
## log.len_mm 2.989967 3.017747
Verify that the model has a significant explanatory variable
Anova(fit1)
summary(fit1)
##
## Call:
## lm(formula = log.wt_g ~ log.len_mm, data = dat.log)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.40994 -0.02479 0.00147 0.02698 0.52776
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.956608 0.012783 -387.8 <2e-16 ***
## log.len_mm 3.003857 0.007085 424.0 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.04939 on 4273 degrees of freedom
## Multiple R-squared: 0.9768, Adjusted R-squared: 0.9768
## F-statistic: 1.798e+05 on 1 and 4273 DF, p-value: < 2.2e-16
Large F-value and small p-val indicate that model w/ explanatory variable explains more variability than model w/ just intercept.
Use the relationship to make a prediction about weight from length
lens <- dat$len_mm # vector of lengths
nd <- data.frame(log.len_mm = log10(lens)) # df of log(lengths)
plogW <- predict(fit1,nd) # predicted log(weights)
We want to back-transform the predicted log-weights back to the original scale. However, values back-transformed as such are commonly biased. Apply correction factor (Sprugel 1983):
# calculate correction factor
( cf <- logbtcf(fit1,10) )
## [1] 1.006487
The corrected back-transformed predicted value of the response variable is then calculated by multiplying the back-transformed predicted value by the correction factor:
# apply correction factor
backcal.wts <- as.data.frame(cf*(10^plogW))
We also want confidence intervals for the predicted values:
mlogW <- predict(fit1,nd,interval = "confidence")
head(cf*10^mlogW)
## fit lwr upr
## 1 0.5261507 0.5216771 0.5306626
## 2 0.6691608 0.6641393 0.6742204
## 3 0.6691608 0.6641393 0.6742204
## 4 0.7220364 0.7168415 0.7272689
## 5 0.7220364 0.7168415 0.7272689
## 6 0.7220364 0.7168415 0.7272689
plogW <- predict(fit1,nd,interval = "prediction")
head(cf*10^plogW)
## fit lwr upr
## 1 0.5261507 0.4209343 0.6576668
## 2 0.6691608 0.5353656 0.8363933
## 3 0.6691608 0.5353656 0.8363933
## 4 0.7220364 0.5776749 0.9024739
## 5 0.7220364 0.5776749 0.9024739
## 6 0.7220364 0.5776749 0.9024739
Visualize the fit
# transformed plot
plot(log.wt_g~log.len_mm, data = dat.log,
pch = 19, col = rgb(0,0,0,1/4),
ylab = "log Weight (g)", xlab = "log Fork Length (mm)",
main = "Transformed Data")
tmp <- range(dat.log$log.len_mm)
xs <- seq(tmp[1],tmp[2],length.out = 99)
ys <- predict(fit1,data.frame(log.len_mm = xs))
lines(ys~xs,lwd=2)
# back-transformed data
plot(wt_g~len_mm, data = dat,
pch = 19, col = rgb(0,0,0,1/4),
ylab = "log Weight (g)", xlab = "log Fork Length (mm)",
main = "Back-Transformed Fish Weight Data")
btxs <- 10^xs
btys <- cf*10^ys
lines(btys~btxs,lwd=2)
Next, we should check the assumptions of our work, i.e. : -measurement errors are: –normally distributed –have constant variance
r <- residuals(fit1)
fv <- fitted(fit1)
residPlot(fit1)
It appears the model exhibits heteroscedastity, likely because error is not constant. Maybe segregate data by age – it would make sense have parameters for each age class!
Could we get a better model fit when data is segregated by age?
fit2 <- lm(log.wt_g ~ log.len_mm*age, data = dat.log)
Anova(fit2)
residPlot(fit2)
summary(fit2)
##
## Call:
## lm(formula = log.wt_g ~ log.len_mm * age, data = dat.log)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.40518 -0.02642 0.00106 0.02751 0.52385
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.87853 0.02527 -193.079 <2e-16 ***
## log.len_mm 2.95816 0.01456 203.176 <2e-16 ***
## age1 -0.07576 0.04408 -1.719 0.0857 .
## log.len_mm:age1 0.04544 0.02388 1.903 0.0571 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.04932 on 4271 degrees of freedom
## Multiple R-squared: 0.9769, Adjusted R-squared: 0.9768
## F-statistic: 6.009e+04 on 3 and 4271 DF, p-value: < 2.2e-16
Blue = Age 0 Red = Age 1
Yes; clearly the residual distribution is quite different for the two age classes.
So: repeat whole procedure from above, but segregated by age!
AGE 0
Fit linear model
dat.log.0 <- dat.log %>%
filter(age == 0)
# fit overall age 0 lm
fit0 <- lm(log.wt_g ~ log.len_mm, data = dat.log.0)
# get details
tidy(coef(fit0))
confint(fit0)
## 2.5 % 97.5 %
## (Intercept) -4.931416 -4.825641
## log.len_mm 2.927688 2.988639
Verify that the model has a significant explanatory variable
tidy(Anova(fit0))
tidy(summary(fit0))
Large F-value and small p-val indicate that model w/ explanatory variable explains more variability than model w/ just intercept.
Use the relationship to make a prediction about weight from length
lens <- dat.log.0$len_mm # vector of lengths
ages <- dat.log.0$age
nd <- data.frame(log.len_mm = log10(lens), age = ages) # df of log(lengths)
plogW <- predict(fit0,nd) # predicted log(weights)
Back-transform the predicted log-weights back to the original scale:
# calculate correction factor
( cf <- logbtcf(fit0,10) )
## [1] 1.007375
The corrected back-transformed predicted value of the response variable is then calculated by multiplying the back-transformed predicted value by the correction factor:
# apply correction factor
backcal.wts <- as.data.frame(cf*(10^plogW))
Confidence intervals for the predicted mean values:
mlogW <- predict(fit0,nd,interval = "confidence")
# create df for export
age0meanCI <- as.data.frame(cf*10^mlogW) %>%
rename(lwrCImean = lwr,
uprCImean = upr,
backcal.wts = fit)
Upper and lower CI boundaries indicate we are 95% sure that MEAN weight of fish is between those values
Confidence intervals for the predicted individual values:
plogW <- predict(fit0,nd,interval = "prediction")
head(cf*10^plogW)
## fit lwr upr
## 1 0.5351303 0.4217581 0.6789780
## 2 0.6780965 0.5344973 0.8602755
## 3 0.6780965 0.5344973 0.8602755
## 4 0.7308322 0.5760832 0.9271502
## 5 0.7308322 0.5760832 0.9271502
## 6 0.7308322 0.5760832 0.9271502
# create df for export
age0indvCI <- as.data.frame(cf*10^plogW) %>%
rename(lwrCIindv = lwr,
uprCIindv = upr,
backcal.wts = fit)
Upper and lower CI boundaries indicate we are 95% sure that INDIVIDUAL weight of fish is between those values
Visualize the fit
# transformed plot
plot(log.wt_g~log.len_mm, data = dat.log.0,
pch = 19, col = rgb(0,0,0,1/4),
ylab = "log Weight (g)", xlab = "log Fork Length (mm)",
main = "Transformed Data, Age 0")
tmp <- range(dat.log.0$log.len_mm)
xs <- seq(tmp[1],tmp[2],length.out = 99)
ys <- predict(fit0,data.frame(log.len_mm = xs))
lines(ys~xs,lwd=2)
# back-transformed data
plot(wt_g~len_mm, data = dat.log.0,
pch = 19, col = rgb(0,0,0,1/4),
ylab = "log Weight (g)", xlab = "log Fork Length (mm)",
main = "Back-Transformed Data, Age 0")
btxs <- 10^xs
btys <- cf*10^ys
lines(btys~btxs,lwd=2)
We should check the assumptions of our work : -measurement errors are: –normally distributed –have constant variance
r <- residuals(fit0)
fv <- fitted(fit0)
residPlot(fit0)
Prep for export Age 0 back-calculated weight values for later use in other contexts
# finalize age 0 data
age0fnl <- bind_cols(age0meanCI,age0indvCI) %>%
select(-backcal.wts1) %>%
bind_cols(dat.log.0)
AGE 1
Fit linear model
dat.log.1 <- dat.log %>%
filter(age == 1)
# fit overall age 0 lm
fit1 <- lm(log.wt_g ~ log.len_mm, data = dat.log.1)
# get details
tidy(coef(fit1))
confint(fit1)
## 2.5 % 97.5 %
## (Intercept) -5.016800 -4.891770
## log.len_mm 2.970842 3.036365
Verify that the model has a significant explanatory variable
tidy(Anova(fit1))
tidy(summary(fit1))
Large F-value and small p-val indicate that model w/ explanatory variable explains more variability than model w/ just intercept.
Use the relationship to make a prediction about weight from length
lens <- dat.log.1$len_mm # vector of lengths
ages <- dat.log.1$age
nd <- data.frame(log.len_mm = log10(lens), age = ages) # df of log(lengths)
plogW <- predict(fit1,nd) # predicted log(weights)
We want to back-transform the predicted log-weights back to the original scale. However, values back-transformed as such are commonly biased. Apply correction factor (Sprugel 1983):
# calculate correction factor
( cf <- logbtcf(fit1,10) )
## [1] 1.005035
The corrected back-transformed predicted value of the response variable is then calculated by multiplying the back-transformed predicted value by the correction factor:
# apply correction factor
backcal.wts <- as.data.frame(cf*(10^plogW))
We also want confidence intervals for the predicted values:
mlogW <- predict(fit1,nd,interval = "confidence")
head(cf*10^mlogW)
## fit lwr upr
## 1 1.502310 1.478747 1.526248
## 2 1.783691 1.758827 1.808907
## 3 1.783691 1.758827 1.808907
## 4 1.884756 1.859530 1.910325
## 5 1.884756 1.859530 1.910325
## 6 1.884756 1.859530 1.910325
# create df for export
age1meanCI <- as.data.frame(cf*10^mlogW) %>%
rename(lwrCImean = lwr,
uprCImean = upr,
backcal.wts = fit)
plogW <- predict(fit1,nd,interval = "prediction")
head(cf*10^plogW)
## fit lwr upr
## 1 1.502310 1.233410 1.829834
## 2 1.783691 1.464623 2.172269
## 3 1.783691 1.464623 2.172269
## 4 1.884756 1.547670 2.295261
## 5 1.884756 1.547670 2.295261
## 6 1.884756 1.547670 2.295261
# create df for export
age1indvCI <- as.data.frame(cf*10^plogW) %>%
rename(lwrCIindv = lwr,
uprCIindv = upr,
backcal.wts = fit)
Visualize the fit
# transformed plot
plot(log.wt_g~log.len_mm, data = dat.log.1,
pch = 19, col = rgb(0,0,0,1/4),
ylab = "log Weight (g)", xlab = "log Fork Length (mm)",
main = "Transformed Data, Age 1")
tmp <- range(dat.log.1$log.len_mm)
xs <- seq(tmp[1],tmp[2],length.out = 99)
ys <- predict(fit1,data.frame(log.len_mm = xs))
lines(ys~xs,lwd=2)
# back-transformed data
plot(wt_g~len_mm, data = dat.log.1,
pch = 19, col = rgb(0,0,0,1/4),
ylab = "log Weight (g)", xlab = "log Fork Length (mm)",
main = "Back-Transformed Data, Age 1")
btxs <- 10^xs
btys <- cf*10^ys
lines(btys~btxs,lwd=2)
We should check the assumptions of our work : -measurement errors are: –normally distributed –have constant variance
r <- residuals(fit1)
fv <- fitted(fit1)
residPlot(fit1)
Prep for export Age 1 back-calculated weight values for later use in other contexts
# finalize age 0 data
age1fnl <- bind_cols(age1meanCI,age1indvCI) %>%
select(-backcal.wts1) %>%
bind_cols(dat.log.1)
Join age 0 and age 1 data
# join age 0 and age 1 backcal data
fnl <- bind_rows(age0fnl,age1fnl)
Export backcal data for later use