They define relative lactation duration as:
“Lactation length was analysed relative to the total period of reproduction so that relative lactation length = absolute lactation length/(absolute gestation length + absolute lactation length).”
Create a variable for relative lactation length. Use the with() command to make the code clean
dat2$rel.lactation <- with(dat2, lacatation.months/(gestation.month + lacatation.months))
As the authors did, Log transform relative lactation
dat2$rel.lact.log10 <- log10(dat2$rel.lactation)
Relative Lactation duration is a proportion, so a Logit transformatioN may be more appropriate.
library(car)
dat2$rel.lact.logit <- logit(dat2$rel.lactation,percents = T)
library(ggplot2)
qplot(y = fat.log10,
x = rel.lactation,
data = dat2) +
ggtitle("no transformation of x")
Interesting. This makes the plot really ugly.
qplot(y = fat.log10,
x = rel.lact.log10,
data = dat2)+
ggtitle("log10 transformation of x")
Logit makes it look bad too!
qplot(y = fat.log10,
x = rel.lact.logit,
data = dat2)
Still ugly. Any transformation of the x variable seems bad. I believe the original authors did do use the log transform, but perhaps they didn’t check what it did and just assumed it was a good idea.
qplot(y = fat.logit,
x = rel.lact.logit,
data = dat2)
Fit models raw lactation duration and compare to log and logit transformations
#model 1: log
f.rellact.raw <- lm(fat.log10 ~ rel.lactation, data = dat2)
#modlel 2: log-log
f.rellact.log10 <- lm(fat.log10 ~ rel.lact.log10, data = dat2)
#model 3: log-logit
f.rellact.logit <- lm(fat.log10 ~ rel.lact.logit, data = dat2)
The anova() command gives us the p value for the slpe
anova(f.rellact.raw)
## Analysis of Variance Table
##
## Response: fat.log10
## Df Sum Sq Mean Sq F value Pr(>F)
## rel.lactation 1 2.7321 2.73211 14.467 0.0002199 ***
## Residuals 128 24.1737 0.18886
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(f.rellact.log10)
## Analysis of Variance Table
##
## Response: fat.log10
## Df Sum Sq Mean Sq F value Pr(>F)
## rel.lact.log10 1 4.5748 4.5748 26.223 1.089e-06 ***
## Residuals 128 22.3310 0.1745
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(f.rellact.logit)
## Analysis of Variance Table
##
## Response: fat.log10
## Df Sum Sq Mean Sq F value Pr(>F)
## rel.lact.logit 1 4.5703 4.5703 26.191 1.104e-06 ***
## Residuals 128 22.3355 0.1745
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
We can’t compare these 3 models directly, but we can assess how well the fit the data by looking at their residuals
Resid vs. fitted looks pretty bad - defiante triangle shape.
par(mfrow = c(2,2))
plot(f.rellact.raw)
Resid vs. fitted looks pretty bad - can’t say its really worse necessarily, but my intuition is that it is.
plot(f.rellact.log10)
plot(f.rellact.logit)
We’ll now fit a series of models as the author did then use AIC to assess which one is best. They used relative lactation duration as a continous predictor of the fat content of milk, & also several categorical variables:
They looked at different combinations of these predictors
Fit models with all of the combinations of these predictors - this is a long list
#NB: authors don't fit this model
f.null <- lm(fat.log10 ~ 1,
data = dat2)
#Biome (categorical)
f.biome <- lm(fat.log10 ~ biome,
data = dat2)
#rel lactation length (continous)
f.rellact <- lm(fat.log10 ~ rel.lact.log10,
data = dat2)
#Diet (cat.)
f.diet <- lm(fat.log10 ~ diet,
data = dat2)
#Arid-adapted (cat)
f.arid <- lm(fat.log10 ~ arid,
data = dat2)
#Biome + diet (2 cat vars)
f.biome.diet <- lm(fat.log10 ~ biome +
diet,
data = dat2)
#Rel lac length + diet (1 continous, 1 cat)
f.lact.diet <- lm(fat.log10 ~ rel.lact.log10 +
diet,
data = dat2)
#Rel lac length + biome (1 cont, 1 cat)
f.lact.biome <- lm(fat.log10 ~ rel.lact.log10 +
biome,
data = dat2)
#Re lac + diet + biome (1 cont, 2 cat)
f.lact.biome.diet <- lm(fat.log10 ~
rel.lact.log10 +
biome +
diet, data = dat2)
We can get individual AIC values for a model using AIC()
AIC(f.biome)
## [1] 109.3536
We can get ALL AIC values using bbmle:AICtab. “base=TRUE” returns the raw AIC value in addition to the delta AIC. base = TRUE gives us the raw AIC value
library(bbmle)
AICtab(base= TRUE,
f.null,
f.biome,
f.rellact,
f.diet,
f.arid,
f.biome.diet,
f.lact.diet,
f.lact.biome,
f.lact.biome.diet
)
## AIC dAIC df
## f.lact.biome.diet 97.2 0.0 6
## f.lact.diet 99.3 2.1 5
## f.biome.diet 101.8 4.6 5
## f.lact.biome 105.6 8.4 4
## f.diet 108.1 10.9 4
## f.biome 109.4 12.2 3
## f.rellact 145.9 48.8 3
## f.arid 163.3 66.2 3
## f.null 168.1 71.0 2
To get AICc values (small sample correction) we use the ICtab function
ICtab(type = "AICc",
base = TRUE,
f.null,
f.biome,
f.rellact,
f.diet,
f.arid,
f.biome.diet,
f.lact.diet,
f.lact.biome,
f.lact.biome.diet)
## AICc dAICc df
## f.lact.biome.diet 97.8 0.0 6
## f.lact.diet 99.8 1.9 5
## f.biome.diet 102.3 4.4 5
## f.lact.biome 105.9 8.1 4
## f.diet 108.4 10.6 4
## f.biome 109.5 11.7 3
## f.rellact 146.1 48.3 3
## f.arid 163.5 65.7 3
## f.null 168.2 70.4 2
AIC(f.lact.biome.diet)
## [1] 97.15102
plot(f.lact.biome.diet)
First, I’ll create indices for 3 diet groups using which()
#indices for each diet group
i.omni <- which(dat2$diet == "omnivore")
i.carn <- which(dat2$diet == "carnivore")
i.herb <- which(dat2$diet == "herbivore")
Then I’ll plot all the data in black using plot(), then overlay carnivores in red and herbivores in green using points()
#plot all raw data
par(mfrow = c(1,1))
plot(fat.log10 ~ rel.lact.log10, data = dat2)
#color carnivores
points(fat.log10 ~ rel.lact.log10, data = dat2[i.carn,],col = "red")
#color herbivores
points(fat.log10 ~ rel.lact.log10, data = dat2[i.herb,],col = "green")
#add a legend
legend("bottomleft",
legend = c("Carnivore",
"Omnivore",
"Herbiore"),
col = c("red",
"green",
"black"),
pch = 1)
Generate predictions for plotting works best if we create a new dataframe with organized data the span the entire original dataset.
rng <- range(dat2[,"rel.lact.log10"])
levs1 <- levels(dat2[,"diet"])
levs2 <- levels(dat2[,"biome"])
newdat <- expand.grid(rel.lact.log10 = rng,
diet = levs1,
biome = levs2)
Note: I’m using the model with lactation and diet (f.lact.diet), NOT the one best model which also included “biome” (f.lact.biome.diet).
y.hat <- predict(f.lact.diet,
newdata = newdat)
Add them to the newdat datafrme
newdat$fat.y.hat <- y.hat
#Plot points
## plot base plot
plot(fat.log10 ~ rel.lact.log10, data = dat2)
##plot red points
points(fat.log10 ~ rel.lact.log10, data = dat2[i.carn,],col = "red")
##plot green points
points(fat.log10 ~ rel.lact.log10, data = dat2[i.herb,],col = "green")
#Plot regression lines
## Indices for each subset
i.carn.newdat <- which(newdat$diet == "carnivore")
i.omni.newdat <- which(newdat$diet == "omnivore")
i.herb.newdat <- which(newdat$diet == "herbivore")
## Plot lines
### type = "l" makes points() plot lines
points(fat.y.hat ~ rel.lact.log10,
data = newdat[i.carn.newdat,],
col = "red",
type = "l")
points(fat.y.hat ~ rel.lact.log10,
data = newdat[i.omni.newdat,],
col = "black",
type = "l")
points(fat.y.hat ~ rel.lact.log10,
data = newdat[i.herb.newdat,],
col = "green",
type = "l")
QUESTION It looks like the herbiore and omnivore lines are almost exaclty on top of each other. What does this mean?
ANSWER This means the must have very similar intercept parameters. I can check this with coef()
coef(f.lact.diet)
## (Intercept) rel.lact.log10 dietherbivore dietomnivore
## 1.2124103 -0.4570772 -0.5800950 -0.5714318
Both “dietherbivore” and “dietomnivore” have intercepts of about -0.57.
QUESTION How do you write this as a regression equation?
#Plot points
plot(fat.log10 ~ rel.lact.log10, data = dat2)
# Plot outliers
i.aust <- which(dat2$Australia == "Australia")
i.lemur <- which(dat2$lemurs == "lemur")
i.odd.ung <- which(dat2$order == "Perrissodactyla")
#plot points
## australia spp
points(fat.log10 ~ rel.lact.log10,
data = dat2[i.aust,],
col = "black",
pch = 16)
##lemurs
points(fat.log10 ~ rel.lact.log10,
data = dat2[i.lemur,],
col = "black",
pch = 22)
## odd-toed ungulates
points(fat.log10 ~ rel.lact.log10,
data = dat2[i.odd.ung,],
col = "black",
pch = 3)
#Plot colors
points(fat.log10 ~ rel.lact.log10, data = dat2[i.carn,],col = "red")
points(fat.log10 ~ rel.lact.log10, data = dat2[i.herb,],col = "green")
#plot legend
legend("bottomleft",pch = c(16,22,3,1), legend = c("Australia","lemur","even-toed ungulates","other"))
#Plot regression lines
## Indices
i.carn.newdat <- which(newdat$diet == "carnivore")
i.omni.newdat <- which(newdat$diet == "omnivore")
i.herb.newdat <- which(newdat$diet == "herbivore")
## Plot
points(fat.y.hat ~ rel.lact.log10,
data = newdat[i.carn.newdat,],
col = "red",
type = "l")
points(fat.y.hat ~ rel.lact.log10,
data = newdat[i.omni.newdat,],
col = "black",
type = "l")
points(fat.y.hat ~ rel.lact.log10,
data = newdat[i.herb.newdat,],
col = "green",
type = "l")