#1) Problem 3.11 from the text

11.a) Load ragweed data

library(HRW)
data(ragweed)
#help(ragweed)

#reordering data
ord<-order(ragweed$dayInSeason)
DayInSeason<-ragweed$dayInSeason[ord]
Year<-ragweed$year[ord]
TemperatureResidual<-ragweed$temperatureResidual[ord]
Rain<-ragweed$rain[ord]
WindSpeed<-ragweed$windSpeed[ord]
PollenCount<-ragweed$pollenCount[ord]

11.b) Fit a Poisson GAM to the ragweed data

library(mgcv)
## Loading required package: nlme
## This is mgcv 1.8-38. For overview type 'help("mgcv-package")'.
fit1<-gam(PollenCount~factor(Year)+s(DayInSeason,k=27,by=factor(Year))+
            TemperatureResidual+Rain+s(WindSpeed,k=27), 
          data=ragweed, family=poisson)

summary(fit1)
## 
## Family: poisson 
## Link function: log 
## 
## Formula:
## PollenCount ~ factor(Year) + s(DayInSeason, k = 27, by = factor(Year)) + 
##     TemperatureResidual + Rain + s(WindSpeed, k = 27)
## 
## Parametric coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           1.797348   0.083187  21.606   <2e-16 ***
## factor(Year)1992     -2.939181  19.585551  -0.150   0.8807    
## factor(Year)1993     -1.422067   0.633147  -2.246   0.0247 *  
## factor(Year)1994    -44.641951  74.797155  -0.597   0.5506    
## TemperatureResidual   0.054357   0.004504  12.068   <2e-16 ***
## Rain                  0.819443   0.059913  13.677   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                   edf Ref.df Chi.sq p-value    
## s(DayInSeason):factor(Year)1991 25.89  25.99 3287.6  <2e-16 ***
## s(DayInSeason):factor(Year)1992 22.23  22.48 2343.0  <2e-16 ***
## s(DayInSeason):factor(Year)1993 24.37  24.95 2439.6  <2e-16 ***
## s(DayInSeason):factor(Year)1994 22.87  23.21 1103.9  <2e-16 ***
## s(WindSpeed)                    25.40  25.94  711.1  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.888   Deviance explained = 94.6%
## UBRE = 4.4524  Scale est. = 1         n = 334

Checking the residual plots for fit1

par(mfrow=c(2,2))
gam.check(fit1)

## 
## Method: UBRE   Optimizer: outer newton
## full convergence after 16 iterations.
## Gradient range [-1.461271e-05,9.722198e-07]
## (score 4.452403 & scale 1).
## Hessian positive definite, eigenvalue range [0.0004693913,0.1070829].
## Model rank =  136 / 136 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##                                   k'  edf k-index p-value  
## s(DayInSeason):factor(Year)1991 26.0 25.9    0.92   0.085 .
## s(DayInSeason):factor(Year)1992 26.0 22.2    0.92   0.070 .
## s(DayInSeason):factor(Year)1993 26.0 24.4    0.92   0.055 .
## s(DayInSeason):factor(Year)1994 26.0 22.9    0.92   0.035 *
## s(WindSpeed)                    26.0 25.4    1.15   1.000  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
par(mfrow=c(1,1))
qqnorm(residuals(fit1,type="scaled.pearson"))
abline(0,1,col="red")

An analysis of the residual plots and QQ plot for fit1 reveals that the Poisson GAM does not appear to be a very good fit for this data. The plot of the predictor versus the residuals reveals nonconstant variance, with the variance increasing as the linear predictor increases. Although the distribution of the residuals does appear to be symmetrical, the variance of the residuals is larger than expected. Both the residuals histogram and the Normal QQ plot suggest that the data are overdispersed.

In addition, gam.check reveals issues with the splines in the model; the levels of the dayInSeason variable all have a k-index value less than 1 which is indicative of a flawed model, especially when the number of knots is already large (k=27).

11.c) Create other GAM fits for the data

#i) Quasipoisson
fit2<-gam(PollenCount~factor(Year)+s(DayInSeason,k=27,by=factor(Year))+
            TemperatureResidual+Rain+s(WindSpeed,k=27), 
          data=ragweed, family=quasipoisson)
summary(fit2)
## 
## Family: quasipoisson 
## Link function: log 
## 
## Formula:
## PollenCount ~ factor(Year) + s(DayInSeason, k = 27, by = factor(Year)) + 
##     TemperatureResidual + Rain + s(WindSpeed, k = 27)
## 
## Parametric coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          2.008789   0.184479  10.889  < 2e-16 ***
## factor(Year)1992    -0.483085   0.389142  -1.241   0.2155    
## factor(Year)1993    -0.556736   0.229638  -2.424   0.0160 *  
## factor(Year)1994    -1.129297   0.449265  -2.514   0.0125 *  
## TemperatureResidual  0.047169   0.006997   6.741 8.94e-11 ***
## Rain                 0.853755   0.161041   5.301 2.33e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                    edf Ref.df      F p-value    
## s(DayInSeason):factor(Year)1991  7.905  9.782 47.793  <2e-16 ***
## s(DayInSeason):factor(Year)1992 11.733 14.158 24.058  <2e-16 ***
## s(DayInSeason):factor(Year)1993  7.667  9.517 32.056  <2e-16 ***
## s(DayInSeason):factor(Year)1994  5.875  7.336 21.071  <2e-16 ***
## s(WindSpeed)                    14.654 17.530  5.367  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.879   Deviance explained = 92.3%
## GCV = 9.5977  Scale est. = 8.5443    n = 334
#ii) sqrt Gaussian model
fit3<-gam(sqrt(PollenCount)~factor(Year)+s(DayInSeason,k=27,by=factor(Year))+
            TemperatureResidual+Rain+s(WindSpeed,k=27), 
          data=ragweed, family=gaussian)
summary(fit3)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## sqrt(PollenCount) ~ factor(Year) + s(DayInSeason, k = 27, by = factor(Year)) + 
##     TemperatureResidual + Rain + s(WindSpeed, k = 27)
## 
## Parametric coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          3.60601    0.35948  10.031  < 2e-16 ***
## factor(Year)1992     0.09090    0.29382   0.309 0.757277    
## factor(Year)1993    -0.20278    0.26391  -0.768 0.442920    
## factor(Year)1994    -1.13137    0.31720  -3.567 0.000425 ***
## TemperatureResidual  0.10174    0.01902   5.348 1.85e-07 ***
## Rain                 1.53507    0.34149   4.495 1.02e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                    edf Ref.df      F p-value    
## s(DayInSeason):factor(Year)1991 15.226 18.434 29.971  <2e-16 ***
## s(DayInSeason):factor(Year)1992  9.305 11.504 44.530  <2e-16 ***
## s(DayInSeason):factor(Year)1993 10.170 12.578 48.865  <2e-16 ***
## s(DayInSeason):factor(Year)1994  5.689  7.078 47.483  <2e-16 ***
## s(WindSpeed)                     7.798  9.655  6.015  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.867   Deviance explained = 88.8%
## GCV = 3.5274  Scale est. = 2.9551    n = 334

11.d) Examine residual plots for fit2 & fit3

Checking the residuals for fit2:

par(mfrow=c(2,2))
gam.check(fit2)

## 
## Method: GCV   Optimizer: outer newton
## full convergence after 6 iterations.
## Gradient range [-6.282682e-07,3.073244e-05]
## (score 9.597656 & scale 8.54425).
## Hessian positive definite, eigenvalue range [0.01099335,0.05950797].
## Model rank =  136 / 136 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##                                    k'   edf k-index p-value
## s(DayInSeason):factor(Year)1991 26.00  7.90    0.96    0.32
## s(DayInSeason):factor(Year)1992 26.00 11.73    0.96    0.22
## s(DayInSeason):factor(Year)1993 26.00  7.67    0.96    0.32
## s(DayInSeason):factor(Year)1994 26.00  5.88    0.96    0.32
## s(WindSpeed)                    26.00 14.65    1.10    0.99
par(mfrow=c(1,1))
qqnorm(residuals(fit2,type="scaled.pearson"))
abline(0,1,col="red")

An analysis of the residual plots and QQ plot for fit2 shows that this is a better model than fit1, but there are still some serious concerns. The plot of the predictor versus the residuals reveals nonconstant variance, with the variance consistently increasing as the linear predictor increases. The distribution of the residuals is closer to Normal for this model, although it is slightly skewed to the right as indicated by the histogram and Normal QQ plot. Even so, the distribution is acceptably Normal. This leaves the nonconstant variance as the main issue with this model.

Checking the residuals for fit3:

par(mfrow=c(2,2))
gam.check(fit3)

## 
## Method: GCV   Optimizer: magic
## Smoothing parameter selection converged after 7 iterations.
## The RMS GCV score gradient at convergence was 1.73366e-07 .
## The Hessian was positive definite.
## Model rank =  136 / 136 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##                                    k'   edf k-index p-value
## s(DayInSeason):factor(Year)1991 26.00 15.23    0.97    0.29
## s(DayInSeason):factor(Year)1992 26.00  9.31    0.97    0.25
## s(DayInSeason):factor(Year)1993 26.00 10.17    0.97    0.21
## s(DayInSeason):factor(Year)1994 26.00  5.69    0.97    0.27
## s(WindSpeed)                    26.00  7.80    1.05    0.84
par(mfrow=c(1,1))
qqnorm(residuals(fit3,type="scaled.pearson"))
abline(0,1,col="red")

An analysis of the residual plots and QQ plot for fit3 reveals that the square root transformation on the response appears to provide a fairly good fit. The plot of the predictor versus the residuals shows the most randomness out of the three models, and although the variance still appears to be increasing slightly it is not immediately obvious whether it is enough to violate the assumption of a constant variance. The distribution of the residuals appears to be the most Normal of the models, with a bell-curve histogram distribution and a Normal QQ plot that is very close to a linear fit.

11.e) Choose a model

Based on the analysis given in the previous section for each of the models, the square root transformation in fit3 provides the best fit for the data. Of all the models, it appears to do the best job of meeting the model conditions of having a constant variance and Normally distributed residuals.

11.f) Graph fit3 showing the dayInSeason effect on the response for each year.

par(mfrow=c(1,1))

#create graph
myLnCols=c(1,2,3,4)

plot(DayInSeason,sqrt(PollenCount),type="n",
     xlab="Day In Season",
     ylab="Square Root of Pollen Count",
     main="Predicted Polled Counts by Year")

ng=3000

#1991 model
range = seq(min(DayInSeason),max(DayInSeason),length = ng)
fit1991 = predict(fit3, newdata=data.frame(
  Year=1991,
  DayInSeason=range,
  TemperatureResidual=rep(mean(TemperatureResidual),ng),
  Rain=rep(1,ng),
  WindSpeed=rep(mean(WindSpeed),ng)))
lines(range,fit1991,col = myLnCols[1])

#1992 model
fit1992 = predict(fit3, newdata=data.frame(
  Year=1992,
  DayInSeason=range,
  TemperatureResidual=rep(mean(TemperatureResidual),ng),
  Rain=rep(1,ng),
  WindSpeed=rep(mean(WindSpeed),ng)))
lines(range,fit1992,col = myLnCols[2])

#1993 model
fit1993 = predict(fit3, newdata=data.frame(
  Year=1993,
  DayInSeason=range,
  TemperatureResidual=rep(mean(TemperatureResidual),ng),
  Rain=rep(1,ng),
  WindSpeed=rep(mean(WindSpeed),ng)))
lines(range,fit1993,col = myLnCols[3])

#1994 model
fit1994 = predict(fit3, newdata=data.frame(
  Year=1994,
  DayInSeason=range,
  TemperatureResidual=rep(mean(TemperatureResidual),ng),
  Rain=rep(1,ng),
  WindSpeed=rep(mean(WindSpeed),ng)))
lines(range,fit1994,col = myLnCols[4])

legend(70,18,legend = c("1991","1992","1993","1994"),
       col = c(1,2,3,4),lty=rep(1,4),cex=0.7)

For the plot above I used rain=1 mainly because the vertical shift allows more of the curves to be visible for the higher DayInSeason values. From this graph we see the pollen counts peaking at 20-30 days into the season, with the peaks occurring earlier in the season as the years increase. We also see a tend of lower peak pollen counts as the years increase.

#2) Test for nonconstant variance in fit3

#create absolute residuals plot
par(mfrow=c(1,1))
plot(DayInSeason, abs(fit3$residuals),
     ylab="Absolute Residuals",
     main="Absolute Residuals: Square Root Transformation",
     xlab="Day in Season")

#add fit to residual model
x1 = DayInSeason
y1 = abs(fit3$residuals)
fit3ar = gam(y1~s(x1,bs="cr"))
lines(x1,fitted(fit3ar),lwd=3,col="blue")

max(fitted(fit3ar))/min(fitted(fit3ar))
## [1] 6.382269

Based on the fitted residuals graph, there is some concern for heteroscedasticity; the variability is steadily increasing from 0 to about 25, then decreasing from about 25 to about 50. The ratio of max fitted value to min fitted value is about 6.4, which is greater than the threshold value of 3 and gives reason to believe that there is heteroscedasticity in the data. We do not however see evidence of severe heteroscedasticity, as the predicted value curve mostly has values between 0.75 and 2 which is within the ratio of 3 limit - it is only the upper 7% of days in the season where the fitted residual curve sinks lower.

#3) Analysis of overdispersion

Yes there is concern for overdispersion based on the quasi-Poisson model. Running the summary of fit2 yields a scale estimate c^2 of 8.54 which is strong evidence that the data is overdispersed - our estimated variance is roughly 8.5 times greater than our estimated mean instead of being equal.

#4) Significance of year & day in season interaction

anova(fit1)
## 
## Family: poisson 
## Link function: log 
## 
## Formula:
## PollenCount ~ factor(Year) + s(DayInSeason, k = 27, by = factor(Year)) + 
##     TemperatureResidual + Rain + s(WindSpeed, k = 27)
## 
## Parametric Terms:
##                     df  Chi.sq p-value
## factor(Year)         3   5.423   0.143
## TemperatureResidual  1 145.645  <2e-16
## Rain                 1 187.064  <2e-16
## 
## Approximate significance of smooth terms:
##                                   edf Ref.df Chi.sq p-value
## s(DayInSeason):factor(Year)1991 25.89  25.99 3287.6  <2e-16
## s(DayInSeason):factor(Year)1992 22.23  22.48 2343.0  <2e-16
## s(DayInSeason):factor(Year)1993 24.37  24.95 2439.6  <2e-16
## s(DayInSeason):factor(Year)1994 22.87  23.21 1103.9  <2e-16
## s(WindSpeed)                    25.40  25.94  711.1  <2e-16
#no interaction Poisson model
fit0<-gam(PollenCount~factor(Year)+s(DayInSeason,k=27)+
            TemperatureResidual+Rain+s(WindSpeed,k=27), 
          data=ragweed, family=poisson)

anova(fit0)
## 
## Family: poisson 
## Link function: log 
## 
## Formula:
## PollenCount ~ factor(Year) + s(DayInSeason, k = 27) + TemperatureResidual + 
##     Rain + s(WindSpeed, k = 27)
## 
## Parametric Terms:
##                     df Chi.sq p-value
## factor(Year)         3  344.5  <2e-16
## TemperatureResidual  1  699.0  <2e-16
## Rain                 1  260.5  <2e-16
## 
## Approximate significance of smooth terms:
##                  edf Ref.df Chi.sq p-value
## s(DayInSeason) 25.76  25.97   9725  <2e-16
## s(WindSpeed)   25.13  25.88   1192  <2e-16

The interaction between year and dayInSeason does appear to be significant for the Poisson model. Looking at the summary for fit1, the year predictor on its own is not significant but each of the dayInSeason:year spline terms is statistically significant. Looking at the model with no interaction, the year predictor is now very significant, so there is strong evidence that year should be in the model and that the interaction between year and dayInSeason is significant.

#5) Stepwise regression on the square root transformation model

#no interaction model
detach("package:mgcv", unload=TRUE)
library(gam)
## Warning: package 'gam' was built under R version 4.1.3
## Loading required package: splines
## Loading required package: foreach
## Loaded gam 1.20
fitinit<-gam::gam(sqrt(PollenCount)~factor(Year)+DayInSeason+TemperatureResidual+Rain+WindSpeed, data=ragweed, family=gaussian)

fitstep<-gam::step.Gam(fitinit, scope=
                         list("Year" = ~1+factor(Year),
                              "DayInSeason"=~1+DayInSeason+s(DayInSeason,df=3),
                              "TemperatureResidual"=~1+TemperatureResidual+
                                s(TemperatureResidual,df=3),
                              "Rain"=~1+factor(Rain),
                              "WindSpeed"=~1+WindSpeed+s(WindSpeed,df=3)),
                       family=gaussian, data=ragweed)
## Start:  sqrt(PollenCount) ~ factor(Year) + DayInSeason + TemperatureResidual +      Rain + WindSpeed; AIC= 1839.174 
## Step:1 sqrt(PollenCount) ~ Rain + factor(Year) + s(DayInSeason, df = 3) +      TemperatureResidual + WindSpeed ; AIC= 1594.542
print(names(fitstep$"model")[-1])
## [1] "Rain"                   "factor(Year)"           "s(DayInSeason, df = 3)"
## [4] "TemperatureResidual"    "WindSpeed"

The stepwise selection chooses a model that includes Rain, Year, TemperatureResidual, and WindSpeed as linear predictors and DayInSeason as the only spline in the model.