# Inputting Data
abalone <- read.csv("abalone2.csv")
head(abalone)
##   Age Sex Length Diameter Height Whole.wt Shucked.wt Viscera.wt Shell.wt
## 1   9   F  0.530    0.420  0.135   0.6770     0.2565     0.1415    0.210
## 2  20   F  0.530    0.415  0.150   0.7775     0.2370     0.1415    0.330
## 3  16   F  0.545    0.425  0.125   0.7680     0.2940     0.1495    0.260
## 4  19   F  0.550    0.440  0.150   0.8945     0.3145     0.1510    0.320
## 5  14   F  0.525    0.380  0.140   0.6065     0.1940     0.1475    0.210
## 6  10   F  0.535    0.405  0.145   0.6845     0.2725     0.1710    0.205
# Subsetting dataframe to include only variables of interest.
aba <- abalone[,c("Age", "Sex", "Diameter", "Shell.wt", "Shucked.wt")]
head(aba)
##   Age Sex Diameter Shell.wt Shucked.wt
## 1   9   F    0.420    0.210     0.2565
## 2  20   F    0.415    0.330     0.2370
## 3  16   F    0.425    0.260     0.2940
## 4  19   F    0.440    0.320     0.3145
## 5  14   F    0.380    0.210     0.1940
## 6  10   F    0.405    0.205     0.2725
abaF <- subset(aba, Sex == "F")
abaM <- subset(aba, Sex == "M")
# color palettes
names(pnw_palettes)
##  [1] "Starfish" "Shuksan"  "Bay"      "Winter"   "Lake"     "Sunset"  
##  [7] "Shuksan2" "Cascades" "Sailboat" "Moth"     "Spring"   "Mushroom"
## [13] "Sunset2"  "Anemone"
pal1 <- pnw_palette("Lake",2)
pal1

aba$Sex <- as.factor(aba$Sex)
colors <- pal1[unclass(aba$Sex)]
# Assessing distributions
par(mfrow=c(2,2))

hist(aba$Age, main = "Age (years)", xlab = "Age (years)", col = pnw_palette("Sunset2")) 
  # appears somewhat normal, but age is a positive, discrete variable, so Poisson is likely more appropriate.

hist(aba$Diameter, main = "Diameter", xlab = "Diameter", col = pnw_palette("Sunset2")) 
  # appears normal, with some left skew

hist(aba$Shell.wt, main = "Shell Weight", xlab = "Shell Weight", col = pnw_palette("Sunset2"))
  # appears normal, with some right skew

hist(aba$Shucked.wt, main = "Shucked weight", xlab = "Shucked Weight", col = pnw_palette("Sunset2"))

  # appears normal, with some right skew

The three numeric predictors - diameter, shell weight, and shucked weight - are all normally distributed, with some skew to the left or right. Age appears somewhat normal, but age is a positive, discrete variable, so a Poisson distribution may be more appropriate.

stripchart(Age ~ Sex, data = aba, 
           vertical = TRUE,
           method = "jitter", 
           pch = 16, 
           col = pal1, 
           cex = 0.5,
           main = "Abalone Age by Sex",
           ylab = "Age (years)")

Variance in age is similar between male and female abalone.

# Comparing variables.
pairs(aba[,-2], 
    #  col = aba$Sex,
      pch = 16, 
      cex = 0.25, 
      col = colors,
      main = "All (F = green, M = blue)")

pairs(abaF[,-2], 
      col = colors[1],
      pch = 16, 
      cex = 0.25, 
      main = "Female") # only Female abalone

pairs(abaM[,-2],
      col = colors[2835], 
      pch = 16, 
      cex = 0.25,
      main = "Male") # only Male abalone

All three predictors have some relationship to age (as expected, given the introductory information); it appears funnel shaped, with tighter data at smaller values and greater values as the Xi’s increase. Diameter, shell weight, and shucked weight are also interrelated, with exponential/logarithmic funneling patterns.

These patterns do not change when comparing male and female abalone.

# Testing for correlation
kable(cor(aba[,-c(1,2)]))
Diameter Shell.wt Shucked.wt
Diameter 1.0000000 0.8771149 0.873718
Shell.wt 0.8771149 1.0000000 0.822242
Shucked.wt 0.8737180 0.8222420 1.000000

The predictors are all strongly correlated to each other, as shown by the r values from Pearson correlation tests in the table above. However, since we are exploring main effects only for the purposes of the exams, I will omit any interactions from my candidate models.

# Evaluating univariate patterns with link - log()
par(mfrow=c(3,3))

plot(Age ~ Diameter, data = aba, col = colors, pch = 1, cex = 0.5)
legend("topright", legend = levels(aba$Sex), col = pal1, pch = 1, cex = 0.8, bty = "o")

plot(log(Age) ~ Diameter, data = aba, col = colors, pch = 1, cex = 0.5)
legend("topright", legend = levels(aba$Sex), col = pal1, pch = 1, cex = 0.8, bty = "o")

plot(log(Age) ~ log(Diameter), data = aba, col = colors, pch = 1, cex = 0.5)
legend("topright", legend = levels(aba$Sex), col = pal1, pch = 1, cex = 0.8, bty = "o")

plot(Age ~ Shell.wt, data = aba, col = colors, pch = 1, cex = 0.5)
legend("topright", legend = levels(aba$Sex), col = pal1, pch = 1, cex = 0.8, bty = "o")

plot(log(Age) ~ Shell.wt, data = aba, col = colors, pch = 1, cex = 0.5)
legend("topright", legend = levels(aba$Sex), col = pal1, pch = 1, cex = 0.8, bty = "o")

plot(log(Age) ~ log(Shell.wt), data = aba, col = colors, pch = 1, cex = 0.5)
legend("topright", legend = levels(aba$Sex), col = pal1, pch = 1, cex = 0.8, bty = "o")

plot(Age ~ Shucked.wt, data = aba, col = colors, pch = 1, cex = 0.5)
legend("topright", legend = levels(aba$Sex), col = pal1, pch = 1, cex = 0.8, bty = "o")

plot(log(Age) ~ Shucked.wt, data = aba, col = colors, pch = 1, cex = 0.5)
legend("topright", legend = levels(aba$Sex), col = pal1, pch = 1, cex = 0.8, bty = "o")

plot(log(Age) ~ log(Shucked.wt), data = aba, col = colors, pch = 1, cex = 0.5)
legend("topright", legend = levels(aba$Sex), col = pal1, pch = 1, cex = 0.8, bty = "o")

Relationships between variables and log of age (column 2) are a little less funneled and appear more linear/logarithmic in pattern than the raw data (column 1). Taking the log of both x and y variables (column 3) reveals a more linear pattern between the log of each predictor and the log response. However, this becomes very difficult to interpret, and since the predictors are already normally distributed, there is no need to log-transform them.

# Full Model.
m1 <- glm(Age ~ Diameter + Shell.wt + Shucked.wt + Sex, data = aba, family = "poisson")
summary(m1)
## 
## Call:
## glm(formula = Age ~ Diameter + Shell.wt + Shucked.wt + Sex, family = "poisson", 
##     data = aba)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  1.8288218  0.0572745  31.931  < 2e-16 ***
## Diameter     1.0987078  0.1859380   5.909 3.44e-09 ***
## Shell.wt     1.6569206  0.0834098  19.865  < 2e-16 ***
## Shucked.wt  -0.9688535  0.0576405 -16.809  < 2e-16 ***
## SexM        -0.0009045  0.0115182  -0.079    0.937    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 2272.1  on 2834  degrees of freedom
## Residual deviance: 1428.6  on 2830  degrees of freedom
## AIC: 13367
## 
## Number of Fisher Scoring iterations: 4
anova(m1)
## Analysis of Deviance Table
## 
## Model: poisson, link: log
## 
## Response: Age
## 
## Terms added sequentially (first to last)
## 
## 
##            Df Deviance Resid. Df Resid. Dev
## NULL                        2834     2272.2
## Diameter    1  289.779      2833     1982.4
## Shell.wt    1  256.662      2832     1725.7
## Shucked.wt  1  297.076      2831     1428.6
## Sex         1    0.006      2830     1428.6
# Checking for collinearity
vif(m1)
##   Diameter   Shell.wt Shucked.wt        Sex 
##   6.119518   3.927146   4.359157   1.021480
kable(vif(m1))
x
Diameter 6.119518
Shell.wt 3.927146
Shucked.wt 4.359157
Sex 1.021480

There is some indication that Diameter is slightly collinear with the other explanatory variables (VIF = 6.12), but with all values lower than 10, we can assume no strong collinearity and continue with the model.

# Checking for overdispersion
D <- deviance(m1)
degf <- summary(m1)$df[2]
phi <- D/degf
phi
## [1] 0.5048138
# using Chi^2 test
pp <- sum(resid(m1, type = "pearson")^2)
1 - pchisq(pp, m1$df.resid)
## [1] 1

There is no overdispersion in the model (\(phi <\approx 1\)).

Three variables are contributing significantly to the model: diameter (\(z=5.91, p < 0.001\)), shell weight (\(z = 19.86, p < 0.001\)), and shucked weight (\(z=-16.81, p < 0.001\)). The categorical variable of sex did not affect the model fit (\(z = -0.08, p < 0.94\)). However, I will include sex in a few model variations as it is a variable of interest (per instructions, gonad type may affect the relationship between age and size, and we were told to include sex in the model options).

m1.1 <- glm(Age ~ Diameter + Shell.wt + Shucked.wt, data = aba, family = "poisson")
summary(m1.1)
## 
## Call:
## glm(formula = Age ~ Diameter + Shell.wt + Shucked.wt, family = "poisson", 
##     data = aba)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  1.82781    0.05580  32.756  < 2e-16 ***
## Diameter     1.10032    0.18480   5.954 2.61e-09 ***
## Shell.wt     1.65701    0.08340  19.868  < 2e-16 ***
## Shucked.wt  -0.96935    0.05729 -16.921  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 2272.1  on 2834  degrees of freedom
## Residual deviance: 1428.6  on 2831  degrees of freedom
## AIC: 13365
## 
## Number of Fisher Scoring iterations: 4
anova(m1.1)
## Analysis of Deviance Table
## 
## Model: poisson, link: log
## 
## Response: Age
## 
## Terms added sequentially (first to last)
## 
## 
##            Df Deviance Resid. Df Resid. Dev
## NULL                        2834     2272.2
## Diameter    1   289.78      2833     1982.4
## Shell.wt    1   256.66      2832     1725.7
## Shucked.wt  1   297.08      2831     1428.6
# 2-predictor combination models, including Sex
m2 <- glm(Age ~ Diameter + Shell.wt + Sex, data = aba, family = "poisson")
m2.1 <- glm(Age ~ Diameter + Shell.wt, data = aba, family = "poisson")

m3 <- glm(Age ~ Diameter + Shucked.wt + Sex, data = aba, family = "poisson")
m3.1 <- glm(Age ~ Diameter + Shucked.wt, data = aba, family = "poisson")

m4 <- glm(Age ~ Shucked.wt + Shell.wt + Sex, data = aba, family = "poisson")
m4.1 <- glm(Age ~ Shucked.wt + Shell.wt, data = aba, family = "poisson")
# 1-predictor models, including Sex
m5 <- glm(Age ~ Diameter + Sex, data = aba, family = "poisson")
m5.1 <- glm(Age ~ Diameter + Sex, data = aba, family = "poisson")


m6 <- glm(Age ~ Shell.wt, data = aba, family = "poisson")
m6.1 <- glm(Age ~ Shell.wt + Sex, data = aba, family = "poisson")

m7 <- glm(Age ~ Shucked.wt + Sex, data = aba, family = "poisson")
m7.1 <- glm(Age ~ Shucked.wt, data = aba, family = "poisson")
# model comparison with AIC
aba.aic<-AIC(m1, m1.1, m2, m2.1, m3, m3.1, m4, m4.1, m5, m5.1, m6, m6.1, m7, m7.1)
formula <- c("Age ~ Diameter + Shell.wt + Shucked.wt + Sex",
             "Age ~ Diameter + Shell.wt + Shucked.wt",
             "Age ~ Diameter + Shell.wt + Sex",
             "Age ~ Diameter + Shell.wt",
             "Age ~ Diameter + Shucked.wt + Sex",
             "Age ~ Diameter + Shucked.wt",
             "Age ~ Shucked.wt + Shell.wt + Sex",
             "Age ~ Shucked.wt + Shell.wt",
             "Age ~ Diameter + Sex",
             "Age ~ Diameter",
             "Age ~ Shell.wt + Sex",
             "Age ~ Shell.wt",
             "Age ~ Shucked.wt + Sex",
             "Age ~ Shucked.wt")
aba.aic<- data.frame(aba.aic, formula)
kable(aba.aic)
df AIC formula
m1 5 13366.79 Age ~ Diameter + Shell.wt + Shucked.wt + Sex
m1.1 4 13364.80 Age ~ Diameter + Shell.wt + Shucked.wt
m2 4 13657.65 Age ~ Diameter + Shell.wt + Sex
m2.1 3 13659.87 Age ~ Diameter + Shell.wt
m3 4 13731.31 Age ~ Diameter + Shucked.wt + Sex
m3.1 3 13729.42 Age ~ Diameter + Shucked.wt
m4 4 13400.22 Age ~ Shucked.wt + Shell.wt + Sex
m4.1 3 13398.79 Age ~ Shucked.wt + Shell.wt
m5 3 13913.16 Age ~ Diameter + Sex
m5.1 3 13913.16 Age ~ Diameter
m6 2 13686.60 Age ~ Shell.wt + Sex
m6.1 3 13685.58 Age ~ Shell.wt
m7 3 14128.00 Age ~ Shucked.wt + Sex
m7.1 2 14136.07 Age ~ Shucked.wt
min(aba.aic$AIC)
## [1] 13364.8

By AIC comparison, the best fit model includes main effects of diameter, shell weight, and shucked weight, excluding sex. However, its AIC is very slightly lower than the full model (including sex), and there is biological reasoning to include sex despite its low significance. Moreover, with an n of 2835, there are plenty of data to justify including more predictor variables. I ultimately settled on the full model (\(Age \sim Diameter + Shell.wt + Shucked.wt + Sex\)) as the best model.

To be sure that a Poisson distribution was the best for the model, I compared my GLM to a Gaussian model.

# Comparing to Gaussian model
m1.2 <- glm(Age ~ Diameter + Shell.wt + Shucked.wt + Sex, data = aba, family = "gaussian")
summary(m1.2)
## 
## Call:
## glm(formula = Age ~ Diameter + Shell.wt + Shucked.wt + Sex, family = "gaussian", 
##     data = aba)
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   5.87705    0.44079  13.333  < 2e-16 ***
## Diameter      7.90255    1.47111   5.372 8.43e-08 ***
## Shell.wt     21.72073    0.76459  28.408  < 2e-16 ***
## Shucked.wt  -10.98454    0.46119 -23.818  < 2e-16 ***
## SexM         -0.01190    0.09317  -0.128    0.898    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 5.979576)
## 
##     Null deviance: 26697  on 2834  degrees of freedom
## Residual deviance: 16922  on 2830  degrees of freedom
## AIC: 13122
## 
## Number of Fisher Scoring iterations: 2
anova(m1.2)
## Analysis of Deviance Table
## 
## Model: gaussian, link: identity
## 
## Response: Age
## 
## Terms added sequentially (first to last)
## 
## 
##            Df Deviance Resid. Df Resid. Dev
## NULL                        2834      26697
## Diameter    1   3075.3      2833      23622
## Shell.wt    1   3260.8      2832      20361
## Shucked.wt  1   3438.7      2831      16922
## Sex         1      0.1      2830      16922

The Gaussian-fit model has a lower AIC (13122) than the original Poisson model (13367). However, comparing AIC between Gaussian and Poisson GLMs is controversial given the different fitting of each model, so I used log likelihood as well.

# log likelihood comparison
logLik(m1)
## 'log Lik.' -6678.396 (df=5)
logLik(m1.2)
## 'log Lik.' -6555.174 (df=6)

The Gaussian model has a higher log likelihood than the Poisson, indicating that a normal distribution might be a better fit for the age data. Nonetheless, since age is a positive, discrete numeric variable and the difference in log likelihood is relatively small, I will continue assessing the Poisson model.

par(mfrow=c(2,2))
plot(m1, main = "Full Model (Poisson)")

The residuals from the full model show distinct patterns. There is a strong clumping of points in the Pearson residuals, and the upper tail in the QQ plot strays significantly. These plots show that the model is not a very good fit for the data; if this were a more complete project, significant work would be needed to improve the model.

Just to be sure, I compared the residual plots with the Gaussian model, and found similar patterns.

par(mfrow=c(2,2))
plot(m1.2, main ="Gaussian")

# goodness of fit: pseudo-R^2
pR2 <- 1 - (m1$deviance / m1$null)
pR2
## [1] 0.3712452
# Added-variable Plots
avPlots(m1, col = colors, pch = 16, cex = 0.5)

Plotting the residuals with Added-Variable plots highlights the direction and strength of each variable’s relationship to age, holding all others constant at their means. Diameter has a slight, positive relationship to age; shell weight has the strongest, positive influence; shucked weight has a weaker and negative relationship.

# Diameter
aba.pred.D <- emmeans(m1, spec = ~ Diameter + Shell.wt + Shucked.wt + Sex,
                      at = list(Diameter = 
                                  seq(min(aba$Diameter),
                                      max(aba$Diameter), 
                                      length.out = 100)))
aba.pred.D <- as.data.frame(aba.pred.D)

D.plot <- ggplot(aba, aes(x = Diameter, y = Age, col = Sex)) +
  geom_point(size = 0.75) +
  scale_color_manual(name = "Sex", values = pal1) +
  geom_line(data = aba.pred.D, 
            aes(y = exp(emmean)), col = "blue3") +
  geom_line(data = aba.pred.D, 
            aes(y = exp(emmean + SE)), col = "orchid3", linetype = 2) +
  geom_line(data = aba.pred.D, 
            aes(y = exp(emmean - SE)), col = "orchid3", linetype = 2) +
  theme_minimal() +
  xlab("Diameter")+
  labs(title = bquote("Abalone Age ~ " ~ bold("Diameter") ~ " + Shell Weight + Shucked Weight + Sex")) 

D.plot

#Shell weight
aba.pred.Sw <- emmeans(m1, spec = ~ Diameter + Shell.wt + Shucked.wt + Sex, 
                       at = list(Shell.wt = 
                                   seq(min(aba$Shell.wt), 
                                       max(aba$Shell.wt), 
                                       length.out = 100)))
aba.pred.Sw <- as.data.frame(aba.pred.Sw)

Sw.plot <- ggplot(aba, aes(x = Shell.wt, y = Age, col = Sex)) +
  geom_point(size = 0.75) +
  scale_color_manual(name = "Sex", values = pal1) +
  geom_line(data = as.data.frame(aba.pred.Sw), 
            aes(y = exp(emmean)), col = "blue") +
  geom_line(data = aba.pred.Sw, 
            aes(y = exp(emmean + SE)), col = "orchid3", linetype = 2) +
  geom_line(data = aba.pred.Sw, 
            aes(y = exp(emmean - SE)), col = "orchid3", linetype = 2) +
  theme_minimal()+
  xlab("Shell Weight") +
  labs(title = bquote("Abalone Age ~ Diameter + " ~ bold("Shell Weight") ~ " + Shucked Weight + Sex"))

Sw.plot

#Shucked weight
aba.pred.Skw <- emmeans(m1, spec = ~ Diameter + Shell.wt + Shucked.wt + Sex, 
                        at = list(Shucked.wt 
                                  = seq(min(aba$Shucked.wt),
                                        max(aba$Shucked.wt), 
                                        length.out = 100)))
aba.pred.Skw <- as.data.frame(aba.pred.Skw)

Skw.plot <- ggplot(aba, aes(x = Shucked.wt, y = Age, col = Sex)) +
  geom_point(size = 0.75) +
  scale_color_manual(name = "Sex", values = pal1) +
  geom_line(data = aba.pred.Skw,
            aes(y = exp(emmean)), col = "blue") +
  geom_line(data = aba.pred.Skw, 
            aes(y = exp(emmean + SE)), col = "orchid3", linetype = 2) +
  geom_line(data = aba.pred.Skw, 
            aes(y = exp(emmean - SE)), col = "orchid3", linetype = 2) +
  theme_minimal()+
  xlab("Shucked Weight")+
  labs(title = bquote("Abalone Age ~ Diameter + Shell Weight + " ~ bold("Shucked Weight") ~ " + Sex"))

Skw.plot

#goodness of fit G^2
1-pchisq(m1$deviance, m1$df.resid)
## [1] 1

Abalone age can be described by the generalized linear model \(Age ~ Diameter + Shell Weight + Shucked Weight + Sex\) fitted with a Poisson distribution (intercept = 1.83, se = 0.057, z = 31.9, p < 0.001). Shell weight had the strongest relationship to age (slope = 1.657, se = 0.083 z = 19.87, p < 0.001), followed by shucked weight (slope = -0.969, se = 0.058, z = -16.81, p < 0.001), then diameter (slope = 1.099, se = 0.186, z = 5.91, p < 0.001). Although I kept it in my final model for biological/assignment reasons, sex did not contribute significantly (slope = -0.0009, z = -0.079, p < 0.937).

The model is not a great fit, only explaining 37.1% of the variance in the data (pseudo-R\(^2\) = 0.371, G\(^2\) = 1). This can also be seen best in the plot of Age vs. Shucked Weight, where the model poorly predicts age at low or high shucked weights.

Figures show real data for abalone age by each main effect - diameter, shell weight, and shucked weight - as well as sex. Blue lines show exponentially transformed estimated marginal means from the model across the range of predictor values; purple lines show transformed EMM +/- standard error. Values were transformed per the log link in a Poisson generalized linear model.

aba2 <- abalone[,c("Age", "Sex", "Diameter", "Shell.wt", "Shucked.wt")]
# resetting the dataframe for re-ordering purposes

plot(aba2$Shell.wt, aba2$Age,
     ylab="Abalone Age (years)", 
     xlab="Shell Weight", 
     main = "Abalone Age - Real and Predicted",
     pch=16, col=colors, cex = 0.5)

aba2$Sex <- as.factor(aba2$Sex)
aba2 <- aba2[order(aba2$Shell.wt),]
lines(aba2$Shell.wt, predict(m1, type="response"), col="blue", lwd=1)
legend("topright", legend = levels(aba2$Sex), col = pal1, pch = 16, cex = 0.75)