# Load the Boston Housing data
library("MASS")
data(Boston)
# Set up a cubic spline basis for variable `lstat`
library("splines")
bs_x = bs(Boston$lstat, df = 5)
# A matrix of dimension `c(length(x), df)`,
# where `df = length(knots) + degree` plus one if there is an intercept.
head(bs_x)
## 1 2 3 4 5
## [1,] 0.5910036 0.26790606 0.0110909274 0.000000e+00 0
## [2,] 0.1599432 0.70905115 0.1309062042 9.948541e-05 0
## [3,] 0.5711464 0.14927001 0.0039309841 0.000000e+00 0
## [4,] 0.4093920 0.04610658 0.0005723661 0.000000e+00 0
## [5,] 0.5766033 0.31509124 0.0150738879 0.000000e+00 0
## [6,] 0.5826565 0.29880037 0.0136161871 0.000000e+00 0
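A quick check of this arithmetic (a small aside, not part of the original exercise): the number of basis functions should equal the number of interior knots plus the degree, since no intercept column is included by default.
ncol(bs_x)                                          # 5 basis functions
length(attr(bs_x, "knots")) + attr(bs_x, "degree")  # 2 knots + degree 3 = 5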
# (a) Where are the knots located?
attr(bs_x, "knots")
## 33.33333% 66.66667%
## 8.316667 14.696667
# (b) How many basis functions were generated?
# There are df = 5 basis functions (the columns of bs_x); the degree of the
# piecewise polynomials is:
attr(bs_x, "degree")
## [1] 3
# (c) Create a plot with the value of each basis function on the y-axis
# and the `lstat` variable on the x-axis
matplot(
Boston$lstat[order(Boston$lstat)],
bs_x[order(Boston$lstat),],
type = "l", lwd = 2,
xlab = "x: lower status of the population (percent)",
ylab = expression(b[k](x))
)
# Build a cubic spline regression model
library("gam")
## Loading required package: foreach
## Warning: package 'foreach' was built under R version 4.0.5
## Loaded gam 1.22
# (d) Regress response variable `medv` on the cubic spline
# Compare models with df = 2, 5, 8
################### df = 5
mod_df5 = gam(medv ~ bs(lstat, df = 5), data = Boston)
summary(mod_df5)
##
## Call: gam(formula = medv ~ bs(lstat, df = 5), data = Boston)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -15.1774 -3.1790 -0.7981 2.0964 26.6755
##
## (Dispersion Parameter for gaussian family taken to be 27.109)
##
## Null Deviance: 42716.3 on 505 degrees of freedom
## Residual Deviance: 13554.52 on 500 degrees of freedom
## AIC: 3113.663
##
## Number of Local Scoring Iterations: 2
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## bs(lstat, df = 5) 5 29162 5832.4 215.14 < 2.2e-16 ***
## Residuals 500 13554 27.1
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(mod_df5, residuals = TRUE, col = "blue", cex = .5,
xlab="lstat: lower status of the population (percent)",
ylab="medv: median value owner-occupied homes")
################### df = 8
mod_df8 = gam(medv ~ bs(lstat, df = 8), data = Boston)
summary(mod_df8)
##
## Call: gam(formula = medv ~ bs(lstat, df = 8), data = Boston)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -14.9627 -3.1253 -0.6612 2.0831 26.0972
##
## (Dispersion Parameter for gaussian family taken to be 26.7118)
##
## Null Deviance: 42716.3 on 505 degrees of freedom
## Residual Deviance: 13275.77 on 497 degrees of freedom
## AIC: 3109.148
##
## Number of Local Scoring Iterations: 2
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## bs(lstat, df = 8) 8 29441 3680.1 137.77 < 2.2e-16 ***
## Residuals 497 13276 26.7
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(mod_df8, residuals = TRUE, col = "blue", cex = .5,
xlab="lstat: lower status of the population (percent)",
ylab="medv: median value owner-occupied homes")
################### df = 2
mod_df2 = gam(medv ~ bs(lstat, df = 2), data = Boston)
## Warning in bs(lstat, df = 2): 'df' was too small; have used 3
summary(mod_df2)
##
## Call: gam(formula = medv ~ bs(lstat, df = 2), data = Boston)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -14.5441 -3.7122 -0.5145 2.4846 26.4153
##
## (Dispersion Parameter for gaussian family taken to be 29.1145)
##
## Null Deviance: 42716.3 on 505 degrees of freedom
## Residual Deviance: 14615.48 on 502 degrees of freedom
## AIC: 3147.796
##
## Number of Local Scoring Iterations: 2
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## bs(lstat, df = 2) 3 28101 9366.9 321.73 < 2.2e-16 ***
## Residuals 502 14616 29.1
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(mod_df2, residuals = TRUE, col = "blue", cex = .5,
xlab="lstat: lower status of the population (percent)",
ylab="medv: median value owner-occupied homes")
###============== compare
c("df=2"=BIC(mod_df2), "df=5"=BIC(mod_df5), "df=8"=BIC(mod_df8))
## df=2 df=5 df=8
## 3168.928 3143.249 3151.414
According to the BIC, the df=5 cubic spline fits best. The plots suggest the
same: df=8 yields a somewhat too wiggly function, while the df=2 fit (which R
bumped up to df=3, i.e., a cubic polynomial without interior knots) seems too
inflexible. Note that models with a different number or location of knots are
not nested, so we should not compare them with a deviance-difference test
(e.g., using the anova function).
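For reference, BIC() is computed from the log-likelihood; a minimal sketch of the same quantity computed by hand for the df = 5 model (BIC = -2 log-likelihood + df * log(n)):
ll = logLik(mod_df5)
-2 * as.numeric(ll) + attr(ll, "df") * log(nrow(Boston))  # should match the df=5 value above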
ns3 = ns(Boston$lstat, df = 3)
## (e) Where are the knots located for the 3 df natural spline?
attr(ns3, "knots")
## 33.33333% 66.66667%
## 8.316667 14.696667
## (f) Inspect, interpret and compare the fitted models
################### df = 3
mod_ns3 = gam(medv ~ ns(lstat, df = 3), data = Boston)
summary(mod_ns3)
##
## Call: gam(formula = medv ~ ns(lstat, df = 3), data = Boston)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -13.7595 -3.3628 -0.6468 2.3062 27.2857
##
## (Dispersion Parameter for gaussian family taken to be 28.4261)
##
## Null Deviance: 42716.3 on 505 degrees of freedom
## Residual Deviance: 14269.9 on 502 degrees of freedom
## AIC: 3135.688
##
## Number of Local Scoring Iterations: 2
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## ns(lstat, df = 3) 3 28446 9482.1 333.57 < 2.2e-16 ***
## Residuals 502 14270 28.4
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(mod_ns3, residuals = TRUE, col = "blue", cex = .5)
The number and location of the knots of the df=3 natural spline are identical to those of the df=5 cubic spline. By using a natural spline, we thus save 2 df.
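A quick illustration of this saving (a sketch, not part of the original code): with the same two interior knots, the natural-spline basis has two fewer columns than the cubic-spline basis.
ncol(bs_x)  # 5 columns for the cubic spline with df = 5
ncol(ns3)   # 3 columns for the natural spline with df = 3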
################### df = 5
ns5_x = ns(Boston$lstat, df = 5)
attr(ns5_x, "knots")
## 20% 40% 60% 80%
## 6.29 9.53 13.33 18.06
matplot(
Boston$lstat[order(Boston$lstat)],
ns5_x[order(Boston$lstat),],
type = "l", lwd = 2,
xlab = "x: lower status of the population (percent)",
ylab = expression(b[k](x))
)
mod_ns5 = gam(medv ~ ns(lstat, df = 5), data = Boston)
summary(mod_ns5)
##
## Call: gam(formula = medv ~ ns(lstat, df = 5), data = Boston)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -13.9811 -3.0266 -0.7252 2.1416 26.5111
##
## (Dispersion Parameter for gaussian family taken to be 26.9021)
##
## Null Deviance: 42716.3 on 505 degrees of freedom
## Residual Deviance: 13451.03 on 500 degrees of freedom
## AIC: 3109.785
##
## Number of Local Scoring Iterations: 2
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## ns(lstat, df = 5) 5 29265 5853.1 217.57 < 2.2e-16 ***
## Residuals 500 13451 26.9
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(mod_ns5, residuals = TRUE, col = "blue", cex = .5)
###============== compare
c("df=3"=BIC(mod_ns3), "df=5"=BIC(mod_ns5))
## df=3 df=5
## 3156.82 3139.37
The residual deviance is lower for the natural spline than for the cubic spline with the same df. According to the BIC, the natural spline with 5 df fits best, and it also outperforms the earlier cubic spline with 5 df. The plots suggest the same. The natural spline thus indeed seems an improvement over the cubic spline.
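To make the latter comparison explicit (a small sketch; the two values correspond to the BICs reported above):
c("cubic df=5" = BIC(mod_df5), "natural df=5" = BIC(mod_ns5))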
Next, fit smoothing splines: apply function s() to the lstat variable and predict medv. Use both a high (e.g., > 100) and a low value (e.g., a value between 3 and 8) for the df argument of the s() function. Inspect the fitted models with summary(), plot() and BIC(). Do the smoothing splines provide an improvement over the parametric natural and cubic splines fitted earlier?
## more complex smoothing spline:
mod_sc = gam(medv ~ s(lstat, df = 100), data = Boston)
summary(mod_sc)
##
## Call: gam(formula = medv ~ s(lstat, df = 100), data = Boston)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -13.4068 -2.8770 -0.5317 2.0195 21.8413
##
## (Dispersion Parameter for gaussian family taken to be 27.156)
##
## Null Deviance: 42716.3 on 505 degrees of freedom
## Residual Deviance: 10998.23 on 405.0015 degrees of freedom
## AIC: 3197.913
##
## Number of Local Scoring Iterations: NA
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## s(lstat, df = 100) 1 23244 23243.9 855.94 < 2.2e-16 ***
## Residuals 405 10998 27.2
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## s(lstat, df = 100) 99 3.1521 4.441e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(mod_sc, residuals = TRUE, cex = .5, col = "blue")
## simpler smoothing splines:
mod_ss = gam(medv ~ s(lstat, df = 5), data = Boston)
summary(mod_ss)
##
## Call: gam(formula = medv ~ s(lstat, df = 5), data = Boston)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -13.6332 -3.2159 -0.6577 2.2051 26.8386
##
## (Dispersion Parameter for gaussian family taken to be 27.6492)
##
## Null Deviance: 42716.3 on 505 degrees of freedom
## Residual Deviance: 13824.6 on 499.9999 degrees of freedom
## AIC: 3123.646
##
## Number of Local Scoring Iterations: NA
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## s(lstat, df = 5) 1 23244 23243.9 840.67 < 2.2e-16 ***
## Residuals 500 13825 27.6
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## s(lstat, df = 5) 4 51.065 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(mod_ss, residuals = TRUE, col = "blue", cex = .5)
The parametric effect represents the linear slope, thus using only 1 df. The non-parametric effects represent the non-linear part of the fit. With df = 100, 1 df goes to the linear effect and the other 99 go to the non-linear part. This is a lot of df for the effect of a single variable, which is reflected in the very wiggly curve.
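To see this directly, we can overlay the two fitted curves on the data (a sketch, not part of the original exercise; it reuses mod_ss and mod_sc from above, and the object name ord is arbitrary):
ord = order(Boston$lstat)
plot(Boston$lstat, Boston$medv, col = "grey", cex = .5,
     xlab = "lstat", ylab = "medv")
lines(Boston$lstat[ord], fitted(mod_ss)[ord], col = "blue", lwd = 2)  # df = 5
lines(Boston$lstat[ord], fitted(mod_sc)[ord], col = "red", lwd = 2)   # df = 100
legend("topright", legend = c("df = 5", "df = 100"),
       col = c("blue", "red"), lwd = 2)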
c("mod_ss"=BIC(mod_ss), "mod_sc"=BIC(mod_sc), "mod_ns5"=BIC(mod_ns5))
## mod_ss mod_sc mod_ns5
## 3136.326 3210.593 3139.370
According to the BIC, the smoothing spline with df = 5 fits better than the barely regularized smoothing spline with df = 100. This is in line with the visual inspection of the fitted smoothing splines. The smoothing spline with 5 df also outperforms the natural spline with 5 df from the previous exercise. Thus, the non-parametric smoothing-spline approach appears to improve on the parametric spline approaches.
Model the effects of lstat, rm, ptratio, crim and dis on medv using smoothing splines (i.e., apply function s() to each of the possible predictors).
# Before we fit the model, we first inspect the variable distributions:
vars = c("medv", "lstat", "rm", "ptratio", "crim", "dis")
plot(Boston[ , vars])
Most variable pairs are associated, and most variables have somewhat skewed distributions. Crime rate (crim) is the most heavily skewed.
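A quick way to see this skewness (a hypothetical aside, not part of the original exercise):
par(mfrow = c(1, 2))
hist(Boston$crim, main = "crim", xlab = "per capita crime rate")
hist(log(Boston$crim), main = "log(crim)", xlab = "log per capita crime rate")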
To avoid masking of function names, and to make sure we use the functions from package mgcv rather than package gam, we first unload package gam before loading mgcv:
detach("package:gam", unload = TRUE)
library("mgcv")
## Loading required package: nlme
## This is mgcv 1.8-33. For overview type 'help("mgcv-package")'.
boston_GAM = gam(medv ~ s(lstat) + s(rm) + s(ptratio) + s(crim) + s(dis),
data = Boston, method = "REML")
I used restricted maximum likelihood estimation to fit the model (method = "REML"). Later, we will repeat the analysis using generalized cross-validation to illustrate its slightly different smoothing behavior.
Note that I did not specify the degrees of freedom, that is, the number of basis functions of which each smoothing spline is composed. These initial degrees of freedom should not be confused with the effective degrees of freedom that remain after the penalty has been applied. By default, for each predictor variable for which a smoothing spline is specified (by wrapping it in s(...)), a thin-plate regression spline basis (bs = "tp") is set up; the default k = -1 yields a basis dimension of 10 for univariate thin-plate splines, of which one basis function is absorbed by the identifiability (centering) constraint, leaving nine columns per smooth in the model matrix.
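If desired, the basis dimension can be set explicitly via the k argument of s(), and its adequacy can be checked with gam.check(). A minimal sketch (the object name boston_GAM_k20 and the value k = 20 are arbitrary choices for illustration):
boston_GAM_k20 = gam(medv ~ s(lstat, k = 20) + s(rm) + s(ptratio) + s(crim) + s(dis),
                     data = Boston, method = "REML")
gam.check(boston_GAM_k20)  # reports k' and edf per smooth, plus residual diagnostics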
Although we would normally not do this, for illustration we extract the model matrix, which contains all basis functions of all predictors, using function model.matrix():
mod_mat = model.matrix(boston_GAM)
colnames(mod_mat)
## [1] "(Intercept)" "s(lstat).1" "s(lstat).2" "s(lstat).3" "s(lstat).4"
## [6] "s(lstat).5" "s(lstat).6" "s(lstat).7" "s(lstat).8" "s(lstat).9"
## [11] "s(rm).1" "s(rm).2" "s(rm).3" "s(rm).4" "s(rm).5"
## [16] "s(rm).6" "s(rm).7" "s(rm).8" "s(rm).9" "s(ptratio).1"
## [21] "s(ptratio).2" "s(ptratio).3" "s(ptratio).4" "s(ptratio).5" "s(ptratio).6"
## [26] "s(ptratio).7" "s(ptratio).8" "s(ptratio).9" "s(crim).1" "s(crim).2"
## [31] "s(crim).3" "s(crim).4" "s(crim).5" "s(crim).6" "s(crim).7"
## [36] "s(crim).8" "s(crim).9" "s(dis).1" "s(dis).2" "s(dis).3"
## [41] "s(dis).4" "s(dis).5" "s(dis).6" "s(dis).7" "s(dis).8"
## [46] "s(dis).9"
Use plot() (with residuals = TRUE) and summary() to evaluate the fitted model and curves. Which predictor variables seem to be most important? Which variables' effects deviate from a linear effect?
# We can plot the basis functions created, e.g., for rm (number of rooms):
mod_mat_rm = mod_mat[ , grepl("rm", colnames(mod_mat))]
matplot(Boston$rm, mod_mat_rm, pch = 1, cex = .2, col = rainbow(9),
cex.main = .7, xlab = "rm", ylab = expression(b[k](x)),
cex.axis = .7, cex.lab = .7,
main = "Basis functions for predictor variable: rm")
legend("top", legend = 1:9, title = "basis function", col = rainbow(9),
pch = 1, cex = .7, pt.cex = .5, ncol = 3)
Note that the basis functions, as set up by mgcv, do not add up to 1. Also note that the 9th basis function is simply a linear function of the predictor: it is used to estimate the linear, unpenalized effect of the predictor variable.
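If the 9th basis function is indeed (a centered and scaled version of) the linear term, its correlation with rm should be close to ±1; a quick check (a sketch, under that assumption):
cor(mod_mat[ , "s(rm).9"], Boston$rm)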
After setting up the thin-plate regression spline bases with function s(), function gam() estimates the coefficients of all basis functions. The coefficients of the non-linear basis functions are estimated with a quadratic penalty (i.e., they are treated as random effects), while the coefficient of the linear basis function is estimated in an unpenalized manner (i.e., treated as a fixed effect). We are not directly interested in the estimated coefficient values, because the non-linear basis functions are relatively complex and the coefficients are therefore difficult to interpret, but we can still extract them:
head(coef(boston_GAM))
## (Intercept) s(lstat).1 s(lstat).2 s(lstat).3 s(lstat).4 s(lstat).5
## 22.532806 0.569823 -8.812774 -1.187634 5.395840 1.492320
summary(boston_GAM)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## medv ~ s(lstat) + s(rm) + s(ptratio) + s(crim) + s(dis)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.5328 0.1763 127.8 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(lstat) 5.758 6.945 46.39 < 2e-16 ***
## s(rm) 6.614 7.762 25.21 < 2e-16 ***
## s(ptratio) 1.620 2.019 12.98 3.26e-06 ***
## s(crim) 1.364 1.637 24.16 2.77e-07 ***
## s(dis) 1.002 1.004 27.99 6.46e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.814 Deviance explained = 82%
## -REML = 1431.1 Scale est. = 15.724 n = 506
Package mgcv does not distinguish between the parametric and non-parametric parts of a smoothing spline. The edf values (effective degrees of freedom), however, quantify the amount of non-linearity: with a value of 1, the spline takes up only 1 df, reflecting a purely linear slope; with edf > 1, non-linear effects were captured by the smoothing spline. The p-values test the null hypothesis of no effect of the predictor variable.
The results indicate that all predictors contribute significantly to predicting median values of owner-occupied homes. The strongest effect (highest F-value) is observed for lstat (percentage of lower-status population), followed by dis (weighted mean of distances to five Boston employment centres), rm (average number of rooms per dwelling) and crim (per capita crime rate). ptratio (pupil-teacher ratio by town) has the smallest, but still significant, effect. The edf values indicate that dis, crim and ptratio have close-to-linear effects.
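As an illustration of the edf interpretation (a sketch, not part of the original exercise; the object name boston_GAM_lin is arbitrary): since the edf for s(dis) is essentially 1, a practically equivalent model includes dis as a parametric (linear) term instead of a smooth:
boston_GAM_lin = gam(medv ~ s(lstat) + s(rm) + s(ptratio) + s(crim) + dis,
                     data = Boston, method = "REML")
summary(boston_GAM_lin)  # dis now appears among the parametric coefficients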
par(mfrow = c(2, 3))
plot(boston_GAM, residuals = TRUE)
The plots show the conditional effects of the predictors. The effects of lstat and rm appear strongly non-linear. The plots also show pointwise 95% confidence intervals for the fitted effects.
Smoothness selection in mgcv can also be performed using generalized cross-validation (this is the default, so we can specify method = "GCV.Cp", or simply omit the method argument):
boston_GAM2 = gam(medv ~ s(lstat) + s(rm) + s(ptratio) + s(crim) + s(dis),
data = Boston)
summary(boston_GAM2)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## medv ~ s(lstat) + s(rm) + s(ptratio) + s(crim) + s(dis)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.5328 0.1679 134.2 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(lstat) 6.157 7.341 40.865 < 2e-16 ***
## s(rm) 6.992 8.075 23.742 < 2e-16 ***
## s(ptratio) 3.454 4.253 7.550 4.94e-06 ***
## s(crim) 1.784 2.213 22.106 < 2e-16 ***
## s(dis) 8.662 8.964 9.028 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.831 Deviance explained = 84%
## GCV = 15.099 Scale est. = 14.262 n = 506
par(mfrow = c(2, 3))
plot(boston_GAM2, residuals = TRUE)
We obtained higher edf values using GCV, and the plots show that with GCV the fitted curves are accordingly more strongly non-linear.
We can use 10-fold CV to evaluate which estimation procedure performs best for these data:
set.seed(42)
fold_ids = sample(1:10, nrow(Boston), replace = TRUE)
preds = data.frame(matrix(rep(NA, times = nrow(Boston), each = 2), ncol = 2))
names(preds) = c("REML", "GCV")
for (i in 1:10) {
GCV_fit = gam(medv ~ s(lstat) + s(rm) + s(ptratio) + s(crim) + s(dis),
data = Boston[fold_ids != i,])
REML_fit = gam(medv ~ s(lstat) + s(rm) + s(ptratio) + s(crim) + s(dis),
data = Boston[fold_ids != i,], method = "REML")
preds$GCV[fold_ids == i] = predict(GCV_fit, newdata=Boston[fold_ids == i,])
preds$REML[fold_ids == i] = predict(REML_fit, newdata=Boston[fold_ids == i,])
}
colMeans((preds - Boston$medv)^2) ## MSE
## REML GCV
## 17.32625 16.16959
The CV results indicate a slightly lower test MSE for GCV on these data, so the lower amount of smoothing provided by GCV may be preferable here. Note that asymptotically GCV performs better than REML, but in finite samples REML may perform better. We have a relatively large sample here, and the Boston data are relatively low-noise, which may explain why GCV performs better in this case. Both GAMs explain about 80% of the variance in median housing prices (which were measured in units of 1,000 USD).
Fit a GAM (using function gam from package mgcv) to the NESDA data. We first load and inspect the data:
library("foreign")
# nesda.sav: (Netherlands Study of Depression and Anxiety
NESDA = read.spss("nesda.sav", use.value.labels=F,to.data.frame=T)
## re-encoding from CP1252
dim(NESDA)
## [1] 600 15
plot(NESDA[ , c("mdd", "aconscie", "neurot", "openes")])
Use mdd (Major Depressive Disorder diagnosis) as the response; make sure to also specify family = binomial. Use neurot (Neuroticism, one of the big five personality traits as measured by a questionnaire), openes (Openness, idem) and aconscie (Conscientiousness, idem) as predictors, wrapping each in function s():
NESDA_GAM = gam(mdd ~ s(aconscie) + s(neurot) + s(openes), family = binomial,
data = NESDA, method = "REML")
summary(NESDA_GAM)
##
## Family: binomial
## Link function: logit
##
## Formula:
## mdd ~ s(aconscie) + s(neurot) + s(openes)
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.9452 0.1185 -7.977 1.5e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(aconscie) 1.001 1.001 3.707 0.0542 .
## s(neurot) 2.426 3.094 62.405 <2e-16 ***
## s(openes) 1.000 1.000 0.600 0.4387
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.186 Deviance explained = 17%
## -REML = 326.27 Scale est. = 1 n = 600
As noted above, package mgcv does not distinguish between the parametric and non-parametric parts of a smoothing spline; the edf values (effective degrees of freedom) quantify the amount of non-linearity. With a value of 1, the spline takes up only 1 df, reflecting a purely linear slope; with edf > 1, non-linear effects were captured by the smoothing spline. The p-values test the null hypothesis of no effect. Thus, only neuroticism clearly contributes to predicting the presence of major depressive disorder (conscientiousness is only marginally significant, p = .054). The edf values indicate that only neuroticism has a non-linear effect.
par(mfrow = c(1, 3))
plot(NESDA_GAM, residuals = TRUE)
The plots show the conditional effects of the predictors. We see a non-linear effect of neuroticism. The plots also show pointwise 95% confidence intervals for the fitted effects; for openness and conscientiousness, these intervals include 0 over the whole range of the predictor.
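To aid interpretation on the probability scale, we can plot the predicted probability of MDD as a function of neuroticism, holding the other predictors at their sample means (a sketch, not part of the original exercise; the object names nd and p_hat are arbitrary, and predict() with type = "response" returns fitted probabilities for the binomial GAM):
nd = data.frame(neurot = seq(min(NESDA$neurot, na.rm = TRUE),
                             max(NESDA$neurot, na.rm = TRUE), length.out = 200),
                aconscie = mean(NESDA$aconscie, na.rm = TRUE),
                openes = mean(NESDA$openes, na.rm = TRUE))
p_hat = predict(NESDA_GAM, newdata = nd, type = "response")
plot(nd$neurot, p_hat, type = "l", lwd = 2,
     xlab = "neuroticism", ylab = "predicted probability of MDD")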
set.seed(42)
fold_ids = sample(1:10, nrow(NESDA), replace = TRUE)
preds = data.frame(matrix(rep(NA, times = nrow(NESDA), each = 2), ncol = 2))
names(preds) = c("REML", "GCV")
for (i in 1:10) {
GCV_fit = gam(mdd ~ s(aconscie) + s(neurot) + s(openes),
family = binomial,data = NESDA[fold_ids != i,])
REML_fit = gam(mdd ~ s(aconscie) + s(neurot) + s(openes),
family = binomial, data = NESDA[fold_ids != i,], method = "REML")
preds$GCV[fold_ids == i] = predict(GCV_fit, newdata=NESDA[fold_ids == i,])
preds$REML[fold_ids == i] = predict(REML_fit, newdata=NESDA[fold_ids == i,])
}
## Predicted cross-validated probabilities
preds = exp(preds) / (1 + exp(preds))
y = as.numeric(NESDA$mdd) ## observed outcome
cor(cbind(preds, y))
## REML GCV y
## REML 1.0000000 0.9777990 0.4099189
## GCV 0.9777990 1.0000000 0.3971731
## y 0.4099189 0.3971731 1.0000000
colMeans((preds - y)^2) ## MSE on predicted probabilities
## REML GCV
## 0.1868823 0.1894893
sapply(((preds - y)^2), sd)/sqrt(nrow(NESDA)) ## SE of MSE
## REML GCV
## 0.007499788 0.007723749
1 - colMeans((preds - y)^2)/var(y) ## Pseudo R-squared
## REML GCV
## 0.1685791 0.1569809
We see that the predictions obtained with REML and GCV are very strongly correlated. REML performed slightly better in terms of squared-error loss on the predicted probabilities (a.k.a. the Brier score). According to cross-validation, the fitted GAMs explain about 16-17% of the variance in depressive disorder diagnoses.
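As an additional check (not part of the original exercise, and assuming mdd is coded 0/1, as in the Brier-score computation above), we can also compare the cross-validated misclassification rates at a 0.5 cut-off:
colMeans((preds > 0.5) != y)  # proportion misclassified per estimation method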