This assignment is to fitting glms and making predictions. The data is hanson_birds / Abundances was collected by Andrea Hanson during her MSc. project in the School of Natural Resources. She sampled 14 transects in CRP fields scattered across SE Nebraska 4 times throughout the summer. She measured Visual Obstruction Readings (VOR) at 12 spots along each transect; these were averaged to obtain a single number representing vegetation structure within the CRP field. She also recorded the amount of 4 types of land use in the surrounding 2 km radius circle. These landscape metrics were highly correlated, so she used a Principle Components analysis to derive two combined metrics that were uncorrelated with each other. PC1 represents a gradient of increasing row crops, while PC2 represents decreasing woodland.
Dickcissels:
Thanks to the owner (Andrea Hanson) of the dataset Analysis of Dickcissels (column “DICK” in the data.frame) by plotting the response variable against all three covariates individually. Checking for any signs of non-linearities.
Abundances <- read.csv("Abundances.csv")
Abundances <- Abundances[,1:7] # drop bogus excel columns
str(Abundances)
## 'data.frame': 56 obs. of 7 variables:
## $ Site: int 3 3 3 3 6 6 6 6 8 8 ...
## $ GRSP: int 2 0 0 2 3 6 5 9 2 0 ...
## $ DICK: int 12 14 19 15 6 7 13 14 1 0 ...
## $ ME : int 1 1 0 1 0 0 0 0 0 0 ...
## $ vor : num 6.54 6.54 6.54 6.54 7.21 7.21 7.21 7.21 3.88 3.88 ...
## $ pc1 : num -411 -411 -411 -411 537 ...
## $ pc2 : num 46.6 46.6 46.6 46.6 -29.8 ...
head(Abundances)
## Site GRSP DICK ME vor pc1 pc2
## 1 3 2 12 1 6.54 -410.7957 46.57120
## 2 3 0 14 1 6.54 -410.7957 46.57120
## 3 3 0 19 0 6.54 -410.7957 46.57120
## 4 3 2 15 1 6.54 -410.7957 46.57120
## 5 6 3 6 0 7.21 537.1697 -29.75741
## 6 6 6 7 0 7.21 537.1697 -29.75741
library(ggplot2)
library(tidyverse) # load all the things
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.1 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.1 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(gridExtra) # to put multiple plots on one page
##
## Attaching package: 'gridExtra'
##
## The following object is masked from 'package:dplyr':
##
## combine
# theme_classic(
# base_size = 11,
# base_family = "",
# base_line_size = base_size/22,
# base_rect_size = base_size/22
# )
# make up a theme object that does what I want
my_theme <- theme_classic() +
theme(text=element_text(size=rel(4)))
#some points overlap so use geom_count()
ggvor <- ggplot(Abundances, aes(x=vor, y=DICK)) +
geom_count() + geom_smooth() +
scale_size_area(max_size=4) +
guides(size="none") +
my_theme
## OMG I just figured out how to make these plots
## with different x axis variables!!!
ggpc1 <- ggvor + aes(x = pc1)
ggpc2 <- ggvor + aes(x = pc2)
# geom_bar with stat="count" nice for discrete data
ggdick <- ggplot(Abundances, aes(x=DICK)) +
geom_bar(stat="count") + my_theme
grid.arrange(ggvor, ggpc1, ggpc2, ggdick)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
Here in the datset, the response variable (# of birds ) is discrete as this is count data and no negative value, therefore poisson distribution is good choice. For first step, a glm model with poisson distribution using all 3 covariates can be fitted.
library(cli)
library(broom)
dick.1 <- glm(DICK~(vor + pc1 + pc2)^2,data=Abundances,
family=poisson)
sum.dick.1 <- glance(dick.1)
gof.p.dick.1 <- with(sum.dick.1,pchisq(deviance,df.residual,lower=FALSE))
The model is significantly over-dispersed (χ^2= 88.06, df = 49, p = 0.0005217, therefore the estimated std errs are too small.
Checking linearity for each covariate:
library(ggplot2)
#install.packages("gridExtra")
library(gridExtra)
dick.1.test <- augment(dick.1)
lin.vor <- ggplot(dick.1.test, aes(x = vor, y = .std.resid)) + geom_count() +
scale_size_area(max_size=4) +
geom_hline(yintercept = 0, linetype = 2) +
geom_smooth() + guides(size = FALSE) +
my_theme
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
lin.pc1 <- lin.vor + aes(x = pc1)
lin.pc2 <- lin.vor + aes(x = pc2)
grid.arrange(lin.vor, lin.pc1, lin.pc2)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
The horizontal line is inside the confidence polygon of the smooth fit throughout, so there is no indication that the linearity assumption is violated.
the residual plots
resd1 <- ggplot(dick.1.test, aes(x = .fitted, y = .resid)) +
geom_point() +
geom_smooth() +
geom_hline(yintercept = 0, linetype=2) +
my_theme
resd2 <- resd1 + aes(y = sqrt(abs(.std.resid))) +
geom_hline(yintercept = 1, linetype=2)
resd3 <- ggplot(dick.1.test, aes(.hat, .std.resid)) +
geom_vline(size = 2, colour = "white", xintercept = 0) +
geom_hline(size = 2, colour = "white", yintercept = 0) +
geom_point(aes(size = .cooksd)) + geom_smooth(se = FALSE) +
my_theme
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
probs <- c(0.25,0.75)
y <- quantile(dick.1.test$.std.resid, probs, names = FALSE, na.rm = TRUE)
x <- qnorm(probs)
slope <- diff(y)/diff(x)
int <- y[1L] - slope * x[1L]
qq.dick <- ggplot(dick.1.test, aes(sample=.std.resid)) +
geom_qq(size=rel(4)) +
geom_abline(intercept = int, slope = slope, linetype = 2, size = 2) +
my_theme
grid.arrange(resd1, resd2, resd3, qq.dick)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
There are some particularly large negative residuals that are probably contributing to the overdispersion, but they don’t look too bad.
Ratio of the summed Pearson residuals can be used to residual degrees of freedom to correct the AICc values. First we want to create a list of possible models. With 3 main effects and their 3 interactions, we have 48 possible models to consider. That is far too many for such a limited dataset. Background knowledge suggests that we know vor will be important, so all models we consider should include that effect. Then we’ll add each of the landscape variables in turn, together with the interaction with VOR.
models = list(DICK~1,DICK~vor,
DICK~vor + pc1,
DICK~vor + pc1 + vor:pc1,
DICK~vor + pc2,
DICK~vor + pc2 + vor:pc2,
DICK~vor + pc1 + pc2,
DICK~vor + pc1 + pc2 + vor:pc1,
DICK~vor + pc1 + pc2 + vor:pc2,
DICK~vor + pc1 + pc2 + vor:pc1 + vor:pc2,
DICK~(vor + pc1 + pc2)^2)
fits = lapply(models,glm,data=Abundances,family="poisson")
QAICc values used when dealing with overdispersed count data and when estimating a c-hat parameter.
library(AICcmodavg)
library(pander)
# get shorter model names (remove DICK ~)
modnames=sapply(models,function(ff)deparse(ff[[3]]))
pander(aictab(fits,c.hat=c_hat(fits[[11]]),modname=modnames), caption="1. AICc model selection table for Dickcissel relative abundance in CRP fields.", split.tables=Inf)
| Modnames | K | QAICc | Delta_QAICc | ModelLik | QAICcWt | Quasi.LL | c_hat | Cum.Wt | |
|---|---|---|---|---|---|---|---|---|---|
| 6 | vor + pc2 + vor:pc2 | 5 | 209.4 | 0 | 1 | 0.5156 | -99.09 | 1.517 | 0.5156 |
| 9 | vor + pc1 + pc2 + vor:pc2 | 6 | 211.2 | 1.779 | 0.4108 | 0.2118 | -98.72 | 1.517 | 0.7274 |
| 10 | vor + pc1 + pc2 + vor:pc1 + vor:pc2 | 7 | 211.4 | 2.055 | 0.358 | 0.1846 | -97.55 | 1.517 | 0.9119 |
| 11 | (vor + pc1 + pc2)^2 | 8 | 213.8 | 4.422 | 0.1096 | 0.0565 | -97.37 | 1.517 | 0.9684 |
| 5 | vor + pc2 | 4 | 216.3 | 6.965 | 0.03074 | 0.01585 | -103.8 | 1.517 | 0.9843 |
| 7 | vor + pc1 + pc2 | 5 | 217.4 | 8.023 | 0.01811 | 0.009335 | -103.1 | 1.517 | 0.9936 |
| 8 | vor + pc1 + pc2 + vor:pc1 | 6 | 219.3 | 9.922 | 0.007004 | 0.003611 | -102.8 | 1.517 | 0.9972 |
| 2 | vor | 3 | 221.1 | 11.75 | 0.002803 | 0.001445 | -107.3 | 1.517 | 0.9987 |
| 3 | vor + pc1 | 4 | 222 | 12.57 | 0.001864 | 0.0009609 | -106.6 | 1.517 | 0.9996 |
| 4 | vor + pc1 + vor:pc1 | 5 | 223.8 | 14.41 | 0.0007424 | 0.0003828 | -106.3 | 1.517 | 1 |
| 1 | 1 | 2 | 256 | 46.6 | 7.606e-11 | 3.921e-11 | -125.9 | 1.517 | 1 |
Here lowest QAICc values are found in top 4 models. The second through fourth ranked models are all within 2 likelihood units of the top model, suggesting that nesting is playing a role here. Models 6, 9, 10, and 11 are the top models to consider, as they have the highest likelihood of being the best models to explain the data.
nd = data.frame(vor=seq(2,9,0.1),
pc1=median(Abundances$pc1),
pc2=median(Abundances$pc2))
pred.dick = augment(fits[[6]], newdata = nd, type.predict="response")
pred.mavg = modavgPred(fits, newdata=nd, type="response",
c.hat=c_hat(fits[[11]]),modnames=modnames)
pred.dick$mod.avg.pred <- pred.mavg$mod.avg.pred
pred.dick$uncond.se <- pred.mavg$uncond.se
ggplot(Abundances, aes(x = vor, y = DICK)) +
geom_count() +
scale_size_area(max_size = 4) +
geom_line(aes(y=.fitted), data=pred.dick, size=2) +
geom_line(aes(y=mod.avg.pred), data=pred.dick, size=2, linetype = 2) +
my_theme + guides(size=FALSE)
The model averaged prediction is right on top of the top model prediction in this case. The model coefficients for those top four models:
library(tidyverse)
# poo. easy way with stargazer doesn't play with tidyverse
# make a dataframe
tidy_w_name <- function(.x, .y){
df <- tidy(.x) #get the coefficient table
df$modname <- .y #add a column with the modname
df # return df
}
# just the top 4 models to save grief later
etc <- map2_df(fits[c(6,9,10,11)], modnames[c(6,9,10,11)], tidy_w_name)
etc <- mutate(etc, Estimate = paste(formatC(estimate,digits=3, format="f")," (", formatC(std.error,digits=3, format="f"),")", sep=""))
tab <- spread(etc[,c(1,6,7)], key=modname, value = Estimate )
# arrange the models in order ...
pander(tab[,5:2], split.tables=Inf, caption="Table 2. Estimates (SE) for each of the top four models.")
| vor + pc2 + vor:pc2 | vor + pc1 + pc2 + vor:pc2 | vor + pc1 + pc2 + vor:pc1 + vor:pc2 | (vor + pc1 + pc2)^2 |
|---|---|---|---|
| 0.746 (0.193) | 0.643 (0.217) | 0.673 (0.218) | 0.675 (0.218) |
| NA | -0.000 (0.000) | 0.002 (0.001) | 0.001 (0.001) |
| NA | NA | NA | -0.000 (0.000) |
| -0.008 (0.002) | -0.008 (0.002) | -0.009 (0.002) | -0.010 (0.002) |
| 0.247 (0.031) | 0.266 (0.036) | 0.270 (0.036) | 0.268 (0.036) |
| NA | NA | -0.000 (0.000) | -0.000 (0.000) |
| 0.001 (0.000) | 0.001 (0.000) | 0.002 (0.000) | 0.002 (0.000) |
Check to see if any of the coefficients from the second through fourth ranked models have estimates more than 2 SE from zero. For example, pc1 is at most 2 times the SE in those models. The value of the other two coefficients pc1:pc2 and vor:pc1 also have small magnitudes relative to their SE. This increases our confidence that nesting effects are responsible for the model selection uncertainty. Ignoring the model selection uncertainty might make our predictions a bit too precise, but the parameters vor, pc2 and vor:pc2 are very similar across the four models. Finally, I want to make a plot showing the prediction from the top model that demonstrates the interaction with pc2. To do this, I’ll make predictions from the lower quartile, median and upper quartile of pc2 for a range of vor.
nd = crossing(vor=seq(2,9,0.1),
pc2=quantile(Abundances$pc2))
nd$pc2Labels <- factor(rep_len(c("Min","25%","50%","75%","Max"), nrow(nd)))
pred.dick = augment(fits[[6]], newdata = nd, type.predict="response")
ggplot(Abundances, aes(x = vor, y = DICK)) +
geom_count() +
scale_size_area(max_size = 4) +
geom_line(aes(y=.fitted, colour=pc2Labels), data=pred.dick, size=2) +
my_theme +
guides(size=FALSE) +
theme(legend.text=element_text(size=16))+
scale_color_discrete(breaks=c("Min","25%","50%","75%","Max"), name="PC2") +
scale_y_continuous(limits=c(0,25)) +
xlab("Visual Obstruction Reading") +
ylab("Dickcissel Abundance")
## Warning: Removed 17 rows containing missing values (`geom_line()`).
Working with binary data is similar, but back-transforming the link function is a bit more involved. Also the global goodness of fit test doesn’t work when the data are zero/one. Download the file isolation.txt, which has three variables, incidence (presence/absence of a species), area of island (km^2), and isolation (distance to mainland in km). This exercise is modified from pg. 595 of Michael Crawley’s “The R Book”.
Top model in Table 2 suggests the interaction between (vor:pc2) has the strongest positive effect on Dickcissel, while the effect of (pc2) alone is weakly positive. ###########################################################################################
Incidence function model
Area and isolation can have both positive and negative effects on incidence.
WE might expect negative effects of isolation and potentially positive effects of area on the outcome the presence and abscence of species
isolation <- read.table("isolation.txt",header=TRUE)
names(isolation)
## [1] "incidence" "area" "isolation"
head(isolation)
## incidence area isolation
## 1 1 7.928 3.317
## 2 0 1.925 7.554
## 3 1 2.045 5.883
## 4 0 4.781 5.932
## 5 0 1.536 5.308
## 6 1 7.369 4.934
ggplot(isolation, aes(x = factor(incidence), y = area)) +
geom_boxplot() +
xlab("Presence") +
ylab("Area")
#try
ggplot(isolation, aes(x = factor(incidence), y = isolation)) +
geom_boxplot() +
xlab("Presence") +
ylab("isolation")
Now Binomial GLM model
isolation.1 = glm(incidence~area * isolation, data=isolation,family=binomial)
isolation.2 = glm(incidence~area + isolation, data=isolation,family=binomial)
anova(isolation.2,isolation.1,test="Chisq")
## Analysis of Deviance Table
##
## Model 1: incidence ~ area + isolation
## Model 2: incidence ~ area * isolation
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 47 28.402
## 2 46 28.252 1 0.15043 0.6981
#summary(isolation.2)
Since devience is 28.402, Simple model is fine.
summary(isolation.2)
##
## Call:
## glm(formula = incidence ~ area + isolation, family = binomial,
## data = isolation)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8189 -0.3089 0.0490 0.3635 2.1192
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 6.6417 2.9218 2.273 0.02302 *
## area 0.5807 0.2478 2.344 0.01909 *
## isolation -1.3719 0.4769 -2.877 0.00401 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 68.029 on 49 degrees of freedom
## Residual deviance: 28.402 on 47 degrees of freedom
## AIC: 34.402
##
## Number of Fisher Scoring iterations: 6
isolation.3 = update(isolation.2,.~.-area)
anova(isolation.3,isolation.2,test="Chisq")
## Analysis of Deviance Table
##
## Model 1: incidence ~ isolation
## Model 2: incidence ~ area + isolation
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 48 36.640
## 2 47 28.402 1 8.2375 0.004103 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
So removing area does make a significant difference with the other model here. Therefore,the model island.2 is best fitted model.
Thought provoker - Does the sign of the coefficients for area and isolation match your expectations from the boxplots? Though provoker - Have a look at the plots of the residuals - do they look good to you? Next we want to plot the effects.
Since the boxplots represents that the presence of spcies increased significantly when area is higher 0.58(2.92), however, it decreases when the distance from main island increased -1.37(0.477).
Plots of the residuals
old.par = par(mfrow=c(2,2))
plot(isolation.2)
par(old.par)
nd = data.frame(area=seq(0,9,0.01),isolation=median(isolation$isolation))
pred.p = predict(isolation.2,newdata=nd,type="response")
plot(nd$area,pred.p,type="l",ylim=c(0,1),xlab="Island Area [km^2]",ylab="Probability of Presence")
rug(isolation[isolation$incidence==0,"area"])
rug(isolation[isolation$incidence==1,"area"],side=3)
The “rug plots” are a way to show where the data points are located along the x-axis without cluttering up the graph too much. Thought provoker - can you find another way to get a vector of island areas when incidence == 0? Make a plot of the effect of isolation on incidence when island area is held at the median value. Adding confidence limits to this sort of plot requires making the prediction on the linear predictor scale, constructing the confidence limits, and then back-transforming the predictions to the response scale using the inverse link function. There are several ways to write the inverse link function for a logit link model, but the easiest to use in my opinion is p ̂=1/(1+e^(-βX) ) where βX is the prediction on the link scale. 4. Go back and make a new plot for the effect of isolation (not area) that includes the 95% confidence region.
nd = data.frame(area=seq(0,9,0.01),isolation=median(isolation$isolation))
pred.p = predict(isolation.2,newdata=nd,type="link",se.fit=TRUE)
plot(nd$area,1/(1+exp(-pred.p$fit)),type="l",ylim=c(0,1),
xlab="Island Area [km^2]",ylab="Probability of Presence")
rug(isolation[isolation$incidence==0,"area"])
rug(isolation[isolation$incidence==1,"area"],side=3)
lo.95 = pred.p$fit - 2*pred.p$se.fit
up.95 = pred.p$fit + 2*pred.p$se.fit
polygon(c(nd$area,rev(nd$area)),1/(1+exp(-c(lo.95,rev(up.95)))),
col=rgb(0.8,0.8,0.8,0.5),border=NA)
What would happen if we used AIC model selection on this dataset?
models = list(incidence~1,
incidence~area,
incidence~isolation,
incidence~area + isolation,
incidence~area + isolation + area:isolation)
fits = lapply(models,glm,family=binomial,data=isolation)
aictab(fits,as.character(models))
##
## Model selection based on AICc:
##
## K AICc Delta_AICc AICcWt Cum.Wt
## incidence ~ area + isolation 3 34.92 0.00 0.72 0.72
## incidence ~ area + isolation + area:isolation 4 37.14 2.22 0.24 0.96
## incidence ~ isolation 2 40.90 5.97 0.04 1.00
## incidence ~ area 2 54.43 19.50 0.00 1.00
## incidence ~ 1 1 70.11 35.19 0.00 1.00
## LL
## incidence ~ area + isolation -14.20
## incidence ~ area + isolation + area:isolation -14.13
## incidence ~ isolation -18.32
## incidence ~ area -25.09
## incidence ~ 1 -34.01
There is strong evidence that both area and isolation have an effect on incidence, with the model including both variables and their interaction having the highest AIC weight. There is weaker evidence for a model including only isolation, and no evidence for a model including only area or a model with just an intercept.AIC suggests that the model with both area and isolation as predictors is the best model