Basic Exploratory Analysis

#To see what is in the object Benthos, type:
names(Benthos)
## [1] "Period"       "Fishing"      "OrganicM"     "Mud"         
## [5] "Silt"         "Clay"         "Ampeliscidae" "Cirratulidae"
str(Benthos)
## 'data.frame':    80 obs. of  8 variables:
##  $ Period      : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Fishing     : Factor w/ 2 levels "no","yes": 1 1 1 1 1 2 2 2 2 2 ...
##  $ OrganicM    : num  1.16 1.38 1.84 1.62 1.62 2.13 1.78 1.87 1.56 1.15 ...
##  $ Mud         : num  35.3 34.5 38.4 30.7 32.6 ...
##  $ Silt        : num  25.1 21.9 23.5 17.7 23.3 ...
##  $ Clay        : num  10.16 12.52 14.94 12.99 9.27 ...
##  $ Ampeliscidae: int  7 4 7 3 4 2 2 0 2 3 ...
##  $ Cirratulidae: int  29 52 68 43 41 22 11 7 5 4 ...

Aim of the analysis:

To model Ampeliscidae as a function of the covariates:

Variable Definition
Fishing Fishing vs no fishing
Period Three time periods
#Converting Period and Fishing into factors
Benthos$fPeriod <- factor(Benthos$Period)
Benthos$fFishing <- factor(Benthos$Fishing,
                           levels = c("no", "yes"),
                           labels = c("No Fishing", "Fishing"))

BASIC DATA EXPLORATION:

  1. Outliers: Y and X
  2. Collinearity X
  3. Relationships Y vs X but also interactions
  4. Zero inflation: Make a frequency plot

1. Are there any outliers?

  • Outliers in the response variable?
  • Outliers in the explanatory variables?

Benthos                                         %>%
    select(Ampeliscidae, OrganicM, Silt, Clay, Mud) %>%
    gather(Var, Value)                              %>%
    ggplot()                           +
        geom_jitter(aes(factor(1), Value)) +
        facet_wrap(~Var)

Great because response var ranges till 15 (<20-25), Poisson may be good.

Numer of zeros in the response variable

xtable(table(Benthos$Ampeliscidae))
% latex table generated in R 3.1.1 by xtable 1.7-4 package % Tue May 5 01:05:48 2015

Calculate percentage of zeros in data

100 * sum(Benthos$Ampeliscidae == 0) / nrow(Benthos)
## [1] 10
#Not a problem
#???? Why is it not a problem

What about categorical covariates?

Do we have a reasonable number of observations per level of a categorical covariate?

with(Benthos, table(fFishing)) #with the object "Benthos", give me a table
## fFishing
## No Fishing    Fishing 
##         40         40
with(Benthos, table(fPeriod))
## fPeriod
##  1  2  3 
## 20 30 30
with(Benthos, table(fPeriod,fFishing)) #if I was interested in their interaction
##        fFishing
## fPeriod No Fishing Fishing
##       1         10      10
##       2         15      15
##       3         15      15
#Nice, more or less balanced design

2. Collinearity

pairs(Benthos[, c("OrganicM", "Mud", "Silt", "Clay")])

#gives you a correlation matrix
cor(Benthos[, c("OrganicM", "Mud", "Silt", "Clay")]) %>% stargazer(type='html')
OrganicM Mud Silt Clay
OrganicM 1 0.061 0.027 0.058
Mud 0.061 1 0.658 0.755
Silt 0.027 0.658 1 0.002
Clay 0.058 0.755 0.002 1

Too much collinearity!

df = Benthos                                         %>%
    select(Ampeliscidae, OrganicM, Silt, Clay, Mud, fPeriod, fFishing) %>%
    gather(Var, Value, -fFishing, -fPeriod)                             
df %>% ggplot()                           +
    geom_boxplot(aes(x=fPeriod, y=Value)) +
    facet_wrap(~Var, scales="free")+ggtitle("Period")

df %>% 
    ggplot()                           +
    geom_boxplot(aes(x=fFishing, y=Value)) +
    facet_wrap(~Var, scales="free")+ggtitle("Fishing")

pPeriod = df %>% filter(Var == 'Silt') %>%
    ggplot()                           +
    geom_boxplot(aes(x=fPeriod, y=Value))

pFishing = df %>% filter(Var == 'Silt') %>%
    ggplot()                           +
    geom_boxplot(aes(x=fFishing, y=Value))
grid.arrange(pPeriod, pFishing, nrow=1, main="Silt")

pPeriod = df %>% filter(Var == 'Clay') %>%
    ggplot()                           +
    geom_boxplot(aes(x=fPeriod, y=Value))

pFishing = df %>% filter(Var == 'Clay') %>%
    ggplot()                           +
    geom_boxplot(aes(x=fFishing, y=Value))
grid.arrange(pPeriod, pFishing, nrow=1, main="Clay")

Either use mud or silt and clay. Fishing seems to be collinear with some of these. If you use mud or clay you cannot use fishing as it is collinear!

Interactions?

Based on biology we would expect that Ampeliscidae vs O_Material, changes depending on the dredging effect (CT)!!!

qplot(OrganicM, Ampeliscidae, data=Benthos) + facet_wrap(~fFishing) + 
  stat_smooth(method = "lm", formula = y ~ x, na.rm=T)

#There is indication for interaction!!

Start analysis

#Fit the model E(Ampeliscidae ) = mu = exp(alpha + A + B + A:B + C )
M1 <- glm(Ampeliscidae ~ OrganicM * fFishing + fPeriod, 
          data = Benthos,
          family = poisson)
         
#Exactly the same model
M1 <- glm(Ampeliscidae ~ OrganicM + fFishing + 
                         OrganicM : fFishing + 
                         fPeriod, 
          data = Benthos,
          family = poisson)
#What is the model that we are fitting?
#Ampeliscidae_i ~ P(mu_i)
#E(Ampeliscidae_i) = mu_i   
#and   var(Ampeliscidae_i) = mu_i
#
#log(mu_i) = alpha + OrganicM_i + fFishing_i + 
#            Period_i + OrganicM_i x fFishing_i

Check for overdispersion

`#Pearson residuals:
#  (Y - E(y))          (Y - mu)
# --------------  = -----------------
#  sqrt(var(Y))         sqrt(mu)
#mu = exp(blah blah blah)

Is the model overdispersed?

E1 <- resid(M1, type = "pearson")
N  <- nrow(Benthos)
p  <- length(coef(M1))
sum(E1^2) / (N - p)
## [1] 1.422794
stargazer(M1, type='html')
Dependent variable:
Ampeliscidae
OrganicM 0.658**
(0.323)
fFishingFishing 1.369**
(0.656)
fPeriod2 1.192***
(0.179)
fPeriod3 1.437***
(0.176)
OrganicM:fFishingFishing -1.131**
(0.449)
Constant -0.177
(0.506)
Observations 80
Log Likelihood -184.027
Akaike Inf. Crit. 380.054
Note: p<0.1; p<0.05; p<0.01
drop1(M1, test = "Chi") %>% stargazer(type='html')
Statistic N Mean St. Dev. Min Max
Df 2 1.500 0.707 1 2
Deviance 3 142.526 51.804 109.491 202.231
AIC 3 411.089 50.022 380.054 468.794
LRT 2 49.552 61.077 6.364 92.740
Pr(> Chi) 2 0.006 0.008 0.000 0.012

Just ok!

Instead of pvalue you have (LRT) Likelihood Ratio Test

#Suppose that the interaction is not significant:
#Mtest <- glm(Ampeliscidae ~ OrganicM + fFishing  + fPeriod, 
#          data = Benthos,
#          family = poisson)
#summary(Mtest)
#drop1(Mtest, test =  "Chi")

Model validation

Plot residuals vs fitted values #Influential observations Plot residuals vs each covariate (in the model, and not in the model)

F1 <- fitted(M1) #command fitted gives already e^model
E1 <- resid(M1, type = "pearson")
plot(x = F1, 
     y = E1,
     xlab = "Fitted values",
     ylab = "Pearson residuals")
abline(h = 0, lty = 2)

#Cook's distance: influential obs
#There's a paper in the email which you can use (not usable with random effects)
par(mfrow = c(1, 1))
plot(M1, which = 4)

#Plot Pearson residuals versus each covariate
plot(x = Benthos$OrganicM, 
     y = E1)
abline(h = 0, lty = 2)

#do the same for other covariates not in model 
plot(x = Benthos$Clay, 
     y = E1)
abline(h = 0, lty = 2)

plot(x = Benthos$Mud, 
     y = E1)
abline(h = 0, lty = 2)

plot(x = Benthos$Silt, 
     y = E1)
abline(h = 0, lty = 2)

boxplot(E1 ~ fPeriod, data = Benthos) 

boxplot(E1 ~ fFishing, data = Benthos) 

#Looks all ok

Model interpretation

#Sketch fitted values  for the GLM Poisson model
M1 <- glm(Ampeliscidae ~ OrganicM * fFishing + fPeriod, 
          data = Benthos,
          family = poisson)
Dependent variable:
Ampeliscidae
OrganicM 0.658**
(0.323)
fFishingFishing 1.369**
(0.656)
fPeriod2 1.192***
(0.179)
fPeriod3 1.437***
(0.176)
OrganicM:fFishingFishing -1.131**
(0.449)
Constant -0.177
(0.506)
Observations 80
Log Likelihood -184.027
Akaike Inf. Crit. 380.054
Note: p<0.1; p<0.05; p<0.01

A. What is the model that we are fitting?

Ampeliscidae_i ~ Poisson(mu_i)
E(Ampeliscidae_i)   = mu_i
var(Ampeliscidae_i) = mu_i
         alpha + OrganicM + Fishing + Period + OrganicM:Fishing  
mu_i = e

B. What are the equations for each period/Fishing combination?

###### What is the equation for Period = 1 & Fishing = Non Fishing
         -0.1766 + 0.657 * OrganicM_i  
mu_i = e
What is the equation for Period = 2 & CT = Non-Fishing
         -0.1766 +  1.1919 + 0.657 * OrganicM_i  
mu_i = e
What is the equation for Period = 3 & CT = Non-Fishing
         -0.1766 +  1.4371 + 0.657 * OrganicM_i  
mu_i = e
What is the equation for Period = 1 & CT = Fishing
         -0.1766 + 1.3694 + (0.657 -1.1314) * OrganicM_i  
mu_i = e
What is the equation for Period = 2 & CT = Fishing
-0.1766 + 1.3694 + 1.19 + (0.657 -1.1314) * OrganicM_i  
#mu_i = e
#Sketch them in a graph
# range(Benthos$OrganicM)
myData = Benthos %>% select(fPeriod, fFishing) %>% unique %$%
do.call(rbind,mapply(
    function(fFishing,fPeriod){
          data.frame(
              OrganicM = with(Benthos, seq(min(OrganicM),max(OrganicM),len=10)),
              fPeriod,
              fFishing
          )
    },
    fFishing = fFishing,
    fPeriod  = fPeriod,
    SIMPLIFY = FALSE))

myData %<>%
    group_by(fPeriod, fFishing) %>%
    do({
       data.frame(.,Ampeliscidae= predict(M1, newdata= ., type="response", se=TRUE))    #whats the difference between type=link
    })

p0 = ggplot(Benthos)+
        geom_point(aes(OrganicM, Ampeliscidae, color = factor(fFishing))) +
        labs(y = "Ampeliscidae",x = "Organic Matter")+scale_color_discrete("Fishing")

p1 = p0 +
geom_smooth(data = myData, 
            aes(y=Ampeliscidae.fit, x=OrganicM, 
                ymin = Ampeliscidae.fit-Ampeliscidae.se.fit, ymax = Ampeliscidae.fit+Ampeliscidae.se.fit,
                color=fFishing,
                linetype=fPeriod
                ), stat = "identity")
p1

Cirratulidae (Second Species)

Similar data exploration steps

Sketch model fit