#To see what is in the object Benthos, type:
names(Benthos)
## [1] "Period" "Fishing" "OrganicM" "Mud"
## [5] "Silt" "Clay" "Ampeliscidae" "Cirratulidae"
str(Benthos)
## 'data.frame': 80 obs. of 8 variables:
## $ Period : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Fishing : Factor w/ 2 levels "no","yes": 1 1 1 1 1 2 2 2 2 2 ...
## $ OrganicM : num 1.16 1.38 1.84 1.62 1.62 2.13 1.78 1.87 1.56 1.15 ...
## $ Mud : num 35.3 34.5 38.4 30.7 32.6 ...
## $ Silt : num 25.1 21.9 23.5 17.7 23.3 ...
## $ Clay : num 10.16 12.52 14.94 12.99 9.27 ...
## $ Ampeliscidae: int 7 4 7 3 4 2 2 0 2 3 ...
## $ Cirratulidae: int 29 52 68 43 41 22 11 7 5 4 ...
To model Ampeliscidae as a function of the covariates:
| Variable | Definition |
|---|---|
| Fishing | Fishing vs no fishing |
| Period | Three time periods |
#Converting Period and Fishing into factors
Benthos$fPeriod <- factor(Benthos$Period)
Benthos$fFishing <- factor(Benthos$Fishing,
levels = c("no", "yes"),
labels = c("No Fishing", "Fishing"))
Benthos %>%
select(Ampeliscidae, OrganicM, Silt, Clay, Mud) %>%
gather(Var, Value) %>%
ggplot() +
geom_jitter(aes(factor(1), Value)) +
facet_wrap(~Var)
Great because response var ranges till 15 (<20-25), Poisson may be good.
xtable(table(Benthos$Ampeliscidae))
% latex table generated in R 3.1.1 by xtable 1.7-4 package % Tue May 5 01:05:48 2015
Calculate percentage of zeros in data
100 * sum(Benthos$Ampeliscidae == 0) / nrow(Benthos)
## [1] 10
#Not a problem
#???? Why is it not a problem
Do we have a reasonable number of observations per level of a categorical covariate?
with(Benthos, table(fFishing)) #with the object "Benthos", give me a table
## fFishing
## No Fishing Fishing
## 40 40
with(Benthos, table(fPeriod))
## fPeriod
## 1 2 3
## 20 30 30
with(Benthos, table(fPeriod,fFishing)) #if I was interested in their interaction
## fFishing
## fPeriod No Fishing Fishing
## 1 10 10
## 2 15 15
## 3 15 15
#Nice, more or less balanced design
pairs(Benthos[, c("OrganicM", "Mud", "Silt", "Clay")])
#gives you a correlation matrix
cor(Benthos[, c("OrganicM", "Mud", "Silt", "Clay")]) %>% stargazer(type='html')
| OrganicM | Mud | Silt | Clay | |
| OrganicM | 1 | 0.061 | 0.027 | 0.058 |
| Mud | 0.061 | 1 | 0.658 | 0.755 |
| Silt | 0.027 | 0.658 | 1 | 0.002 |
| Clay | 0.058 | 0.755 | 0.002 | 1 |
Too much collinearity!
df = Benthos %>%
select(Ampeliscidae, OrganicM, Silt, Clay, Mud, fPeriod, fFishing) %>%
gather(Var, Value, -fFishing, -fPeriod)
df %>% ggplot() +
geom_boxplot(aes(x=fPeriod, y=Value)) +
facet_wrap(~Var, scales="free")+ggtitle("Period")
df %>%
ggplot() +
geom_boxplot(aes(x=fFishing, y=Value)) +
facet_wrap(~Var, scales="free")+ggtitle("Fishing")
pPeriod = df %>% filter(Var == 'Silt') %>%
ggplot() +
geom_boxplot(aes(x=fPeriod, y=Value))
pFishing = df %>% filter(Var == 'Silt') %>%
ggplot() +
geom_boxplot(aes(x=fFishing, y=Value))
grid.arrange(pPeriod, pFishing, nrow=1, main="Silt")
pPeriod = df %>% filter(Var == 'Clay') %>%
ggplot() +
geom_boxplot(aes(x=fPeriod, y=Value))
pFishing = df %>% filter(Var == 'Clay') %>%
ggplot() +
geom_boxplot(aes(x=fFishing, y=Value))
grid.arrange(pPeriod, pFishing, nrow=1, main="Clay")
Either use mud or silt and clay. Fishing seems to be collinear with some of these. If you use mud or clay you cannot use fishing as it is collinear!
Based on biology we would expect that Ampeliscidae vs O_Material, changes depending on the dredging effect (CT)!!!
qplot(OrganicM, Ampeliscidae, data=Benthos) + facet_wrap(~fFishing) +
stat_smooth(method = "lm", formula = y ~ x, na.rm=T)
#There is indication for interaction!!
#Fit the model E(Ampeliscidae ) = mu = exp(alpha + A + B + A:B + C )
M1 <- glm(Ampeliscidae ~ OrganicM * fFishing + fPeriod,
data = Benthos,
family = poisson)
#Exactly the same model
M1 <- glm(Ampeliscidae ~ OrganicM + fFishing +
OrganicM : fFishing +
fPeriod,
data = Benthos,
family = poisson)
#What is the model that we are fitting?
#Ampeliscidae_i ~ P(mu_i)
#E(Ampeliscidae_i) = mu_i
#and var(Ampeliscidae_i) = mu_i
#
#log(mu_i) = alpha + OrganicM_i + fFishing_i +
# Period_i + OrganicM_i x fFishing_i
Check for overdispersion
`#Pearson residuals:
# (Y - E(y)) (Y - mu)
# -------------- = -----------------
# sqrt(var(Y)) sqrt(mu)
#mu = exp(blah blah blah)
E1 <- resid(M1, type = "pearson")
N <- nrow(Benthos)
p <- length(coef(M1))
sum(E1^2) / (N - p)
## [1] 1.422794
stargazer(M1, type='html')
| Dependent variable: | |
| Ampeliscidae | |
| OrganicM | 0.658** |
| (0.323) | |
| fFishingFishing | 1.369** |
| (0.656) | |
| fPeriod2 | 1.192*** |
| (0.179) | |
| fPeriod3 | 1.437*** |
| (0.176) | |
| OrganicM:fFishingFishing | -1.131** |
| (0.449) | |
| Constant | -0.177 |
| (0.506) | |
| Observations | 80 |
| Log Likelihood | -184.027 |
| Akaike Inf. Crit. | 380.054 |
| Note: | p<0.1; p<0.05; p<0.01 |
drop1(M1, test = "Chi") %>% stargazer(type='html')
| Statistic | N | Mean | St. Dev. | Min | Max |
| Df | 2 | 1.500 | 0.707 | 1 | 2 |
| Deviance | 3 | 142.526 | 51.804 | 109.491 | 202.231 |
| AIC | 3 | 411.089 | 50.022 | 380.054 | 468.794 |
| LRT | 2 | 49.552 | 61.077 | 6.364 | 92.740 |
| Pr(> Chi) | 2 | 0.006 | 0.008 | 0.000 | 0.012 |
Just ok!
Instead of pvalue you have (LRT) Likelihood Ratio Test
#Suppose that the interaction is not significant:
#Mtest <- glm(Ampeliscidae ~ OrganicM + fFishing + fPeriod,
# data = Benthos,
# family = poisson)
#summary(Mtest)
#drop1(Mtest, test = "Chi")
Plot residuals vs fitted values #Influential observations Plot residuals vs each covariate (in the model, and not in the model)
F1 <- fitted(M1) #command fitted gives already e^model
E1 <- resid(M1, type = "pearson")
plot(x = F1,
y = E1,
xlab = "Fitted values",
ylab = "Pearson residuals")
abline(h = 0, lty = 2)
#Cook's distance: influential obs
#There's a paper in the email which you can use (not usable with random effects)
par(mfrow = c(1, 1))
plot(M1, which = 4)
#Plot Pearson residuals versus each covariate
plot(x = Benthos$OrganicM,
y = E1)
abline(h = 0, lty = 2)
#do the same for other covariates not in model
plot(x = Benthos$Clay,
y = E1)
abline(h = 0, lty = 2)
plot(x = Benthos$Mud,
y = E1)
abline(h = 0, lty = 2)
plot(x = Benthos$Silt,
y = E1)
abline(h = 0, lty = 2)
boxplot(E1 ~ fPeriod, data = Benthos)
boxplot(E1 ~ fFishing, data = Benthos)
#Looks all ok
#Sketch fitted values for the GLM Poisson model
M1 <- glm(Ampeliscidae ~ OrganicM * fFishing + fPeriod,
data = Benthos,
family = poisson)
| Dependent variable: | |
| Ampeliscidae | |
| OrganicM | 0.658** |
| (0.323) | |
| fFishingFishing | 1.369** |
| (0.656) | |
| fPeriod2 | 1.192*** |
| (0.179) | |
| fPeriod3 | 1.437*** |
| (0.176) | |
| OrganicM:fFishingFishing | -1.131** |
| (0.449) | |
| Constant | -0.177 |
| (0.506) | |
| Observations | 80 |
| Log Likelihood | -184.027 |
| Akaike Inf. Crit. | 380.054 |
| Note: | p<0.1; p<0.05; p<0.01 |
Ampeliscidae_i ~ Poisson(mu_i)
E(Ampeliscidae_i) = mu_i
var(Ampeliscidae_i) = mu_i
alpha + OrganicM + Fishing + Period + OrganicM:Fishing
mu_i = e
###### What is the equation for Period = 1 & Fishing = Non Fishing
-0.1766 + 0.657 * OrganicM_i
mu_i = e
-0.1766 + 1.1919 + 0.657 * OrganicM_i
mu_i = e
-0.1766 + 1.4371 + 0.657 * OrganicM_i
mu_i = e
-0.1766 + 1.3694 + (0.657 -1.1314) * OrganicM_i
mu_i = e
-0.1766 + 1.3694 + 1.19 + (0.657 -1.1314) * OrganicM_i
#mu_i = e
#Sketch them in a graph
# range(Benthos$OrganicM)
myData = Benthos %>% select(fPeriod, fFishing) %>% unique %$%
do.call(rbind,mapply(
function(fFishing,fPeriod){
data.frame(
OrganicM = with(Benthos, seq(min(OrganicM),max(OrganicM),len=10)),
fPeriod,
fFishing
)
},
fFishing = fFishing,
fPeriod = fPeriod,
SIMPLIFY = FALSE))
myData %<>%
group_by(fPeriod, fFishing) %>%
do({
data.frame(.,Ampeliscidae= predict(M1, newdata= ., type="response", se=TRUE)) #whats the difference between type=link
})
p0 = ggplot(Benthos)+
geom_point(aes(OrganicM, Ampeliscidae, color = factor(fFishing))) +
labs(y = "Ampeliscidae",x = "Organic Matter")+scale_color_discrete("Fishing")
p1 = p0 +
geom_smooth(data = myData,
aes(y=Ampeliscidae.fit, x=OrganicM,
ymin = Ampeliscidae.fit-Ampeliscidae.se.fit, ymax = Ampeliscidae.fit+Ampeliscidae.se.fit,
color=fFishing,
linetype=fPeriod
), stat = "identity")
p1
Similar data exploration steps