# Load the Boston Housing data
library("MASS")
data(Boston)
# Set up a cubic spline basis for variable `lstat`
library("splines")
bs_x = bs(Boston$lstat, df = 5)
# A matrix of dimension `c(length(x), df)`,
# where `df = length(knots) + degree` plus one if there is an intercept.
head(bs_x)
## 1 2 3 4 5
## [1,] 0.5910036 0.26790606 0.0110909274 0.000000e+00 0
## [2,] 0.1599432 0.70905115 0.1309062042 9.948541e-05 0
## [3,] 0.5711464 0.14927001 0.0039309841 0.000000e+00 0
## [4,] 0.4093920 0.04610658 0.0005723661 0.000000e+00 0
## [5,] 0.5766033 0.31509124 0.0150738879 0.000000e+00 0
## [6,] 0.5826565 0.29880037 0.0136161871 0.000000e+00 0
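A quick check of this arithmetic (a small aside, not part of the original exercise): the number of basis functions should equal the number of interior knots plus the degree, since no intercept column is included by default.
ncol(bs_x)                                          # 5 basis functions
length(attr(bs_x, "knots")) + attr(bs_x, "degree")  # 2 knots + degree 3 = 5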
# (a) Where are the knots located?
attr(bs_x, "knots")
## 33.33333% 66.66667%
## 8.316667 14.696667
# (b) How many basis functions were generated?
# There are df = 5 basis functions (the columns of bs_x); the degree of the
# piecewise polynomials is:
attr(bs_x, "degree")
## [1] 3
# (c) Create a plot with the value of each basis function on the y-axis
# and the `lstat` variable on the x-axis
matplot(
Boston$lstat[order(Boston$lstat)],
bs_x[order(Boston$lstat),],
type = "l", lwd = 2,
xlab = "x: lower status of the population (percent)",
ylab = expression(b[k](x))
)
# Build a cubic spline regression model
library("gam")
## Loading required package: foreach
## Warning: package 'foreach' was built under R version 4.0.5
## Loaded gam 1.22
# (d) Regress response variable `medv` on the cubic spline
# Compare models with df = 2, 5, 8
################### df = 5
mod_df5 = gam(medv ~ bs(lstat, df = 5), data = Boston)
summary(mod_df5)
##
## Call: gam(formula = medv ~ bs(lstat, df = 5), data = Boston)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -15.1774 -3.1790 -0.7981 2.0964 26.6755
##
## (Dispersion Parameter for gaussian family taken to be 27.109)
##
## Null Deviance: 42716.3 on 505 degrees of freedom
## Residual Deviance: 13554.52 on 500 degrees of freedom
## AIC: 3113.663
##
## Number of Local Scoring Iterations: 2
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## bs(lstat, df = 5) 5 29162 5832.4 215.14 < 2.2e-16 ***
## Residuals 500 13554 27.1
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(mod_df5, residuals = TRUE, col = "blue", cex = .5,
xlab="lstat: lower status of the population (percent)",
ylab="medv: median value owner-occupied homes")
################### df = 8
mod_df8 = gam(medv ~ bs(lstat, df = 8), data = Boston)
summary(mod_df8)
##
## Call: gam(formula = medv ~ bs(lstat, df = 8), data = Boston)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -14.9627 -3.1253 -0.6612 2.0831 26.0972
##
## (Dispersion Parameter for gaussian family taken to be 26.7118)
##
## Null Deviance: 42716.3 on 505 degrees of freedom
## Residual Deviance: 13275.77 on 497 degrees of freedom
## AIC: 3109.148
##
## Number of Local Scoring Iterations: 2
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## bs(lstat, df = 8) 8 29441 3680.1 137.77 < 2.2e-16 ***
## Residuals 497 13276 26.7
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(mod_df8, residuals = TRUE, col = "blue", cex = .5,
xlab="lstat: lower status of the population (percent)",
ylab="medv: median value owner-occupied homes")
################### df = 2
mod_df2 = gam(medv ~ bs(lstat, df = 2), data = Boston)
## Warning in bs(lstat, df = 2): 'df' was too small; have used 3
summary(mod_df2)
##
## Call: gam(formula = medv ~ bs(lstat, df = 2), data = Boston)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -14.5441 -3.7122 -0.5145 2.4846 26.4153
##
## (Dispersion Parameter for gaussian family taken to be 29.1145)
##
## Null Deviance: 42716.3 on 505 degrees of freedom
## Residual Deviance: 14615.48 on 502 degrees of freedom
## AIC: 3147.796
##
## Number of Local Scoring Iterations: 2
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## bs(lstat, df = 2) 3 28101 9366.9 321.73 < 2.2e-16 ***
## Residuals 502 14616 29.1
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(mod_df2, residuals = TRUE, col = "blue", cex = .5,
xlab="lstat: lower status of the population (percent)",
ylab="medv: median value owner-occupied homes")
###============== compare
c("df=2"=BIC(mod_df2), "df=5"=BIC(mod_df5), "df=8"=BIC(mod_df8))
## df=2 df=5 df=8
## 3168.928 3143.249 3151.414
According to the BIC, the df=5 cubic spline fits best. The plots suggest the
same: df=8 yields a somewhat too wiggly function, while the df=2 fit (which R
bumped up to df=3, i.e., a cubic polynomial without interior knots) seems too
inflexible. Note that models with a different number or location of knots are
not nested, so we should not compare them with a deviance-difference test
(e.g., using the anova function).
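For reference, BIC() is computed from the log-likelihood; a minimal sketch of the same quantity computed by hand for the df = 5 model (BIC = -2 log-likelihood + df * log(n)):
ll = logLik(mod_df5)
-2 * as.numeric(ll) + attr(ll, "df") * log(nrow(Boston))  # should match the df=5 value above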
ns3 = ns(Boston$lstat, df = 3)
## (e) Where are the knots located for the 3 df natural spline?
attr(ns3, "knots")
## 33.33333% 66.66667%
## 8.316667 14.696667
## (f) Inspect, interpret and compare the fitted models
################### df = 3
mod_ns3 = gam(medv ~ ns(lstat, df = 3), data = Boston)
summary(mod_ns3)
##
## Call: gam(formula = medv ~ ns(lstat, df = 3), data = Boston)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -13.7595 -3.3628 -0.6468 2.3062 27.2857
##
## (Dispersion Parameter for gaussian family taken to be 28.4261)
##
## Null Deviance: 42716.3 on 505 degrees of freedom
## Residual Deviance: 14269.9 on 502 degrees of freedom
## AIC: 3135.688
##
## Number of Local Scoring Iterations: 2
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## ns(lstat, df = 3) 3 28446 9482.1 333.57 < 2.2e-16 ***
## Residuals 502 14270 28.4
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(mod_ns3, residuals = TRUE, col = "blue", cex = .5)
The number and location of the knots of the df=3 natural spline are identical to those of the df=5 cubic spline. By using a natural spline, we thus save 2 df.
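A quick illustration of this saving (a sketch, not part of the original code): with the same two interior knots, the natural-spline basis has two fewer columns than the cubic-spline basis.
ncol(bs_x)  # 5 columns for the cubic spline with df = 5
ncol(ns3)   # 3 columns for the natural spline with df = 3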
################### df = 5
ns5_x = ns(Boston$lstat, df = 5)
attr(ns5_x, "knots")
## 20% 40% 60% 80%
## 6.29 9.53 13.33 18.06
matplot(
Boston$lstat[order(Boston$lstat)],
ns5_x[order(Boston$lstat),],
type = "l", lwd = 2,
xlab = "x: lower status of the population (percent)",
ylab = expression(b[k](x))
)
mod_ns5 = gam(medv ~ ns(lstat, df = 5), data = Boston)
summary(mod_ns5)
##
## Call: gam(formula = medv ~ ns(lstat, df = 5), data = Boston)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -13.9811 -3.0266 -0.7252 2.1416 26.5111
##
## (Dispersion Parameter for gaussian family taken to be 26.9021)
##
## Null Deviance: 42716.3 on 505 degrees of freedom
## Residual Deviance: 13451.03 on 500 degrees of freedom
## AIC: 3109.785
##
## Number of Local Scoring Iterations: 2
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## ns(lstat, df = 5) 5 29265 5853.1 217.57 < 2.2e-16 ***
## Residuals 500 13451 26.9
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(mod_ns5, residuals = TRUE, col = "blue", cex = .5)
###============== compare
c("df=3"=BIC(mod_ns3), "df=5"=BIC(mod_ns5))
## df=3 df=5
## 3156.82 3139.37
The residual deviance is lower for the natural spline than for the cubic spline with the same df. According to the BIC, the natural spline with 5 df fits best, and it also outperforms the earlier cubic spline with 5 df. The plots suggest the same. The natural spline thus indeed seems an improvement over the cubic spline.
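To make the latter comparison explicit (a small sketch; the two values correspond to the BICs reported above):
c("cubic df=5" = BIC(mod_df5), "natural df=5" = BIC(mod_ns5))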
Next, fit smoothing splines: apply function s() to the lstat variable and predict medv. Use both a high (e.g., > 100) and a low value (e.g., a value between 3 and 8) for the df argument of the s() function. Inspect the fitted models with summary(), plot() and BIC(). Do the smoothing splines provide an improvement over the parametric natural and cubic splines fitted earlier?
## more complex smoothing spline:
mod_sc = gam(medv ~ s(lstat, df = 100), data = Boston)
summary(mod_sc)
##
## Call: gam(formula = medv ~ s(lstat, df = 100), data = Boston)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -13.4068 -2.8770 -0.5317 2.0195 21.8413
##
## (Dispersion Parameter for gaussian family taken to be 27.156)
##
## Null Deviance: 42716.3 on 505 degrees of freedom
## Residual Deviance: 10998.23 on 405.0015 degrees of freedom
## AIC: 3197.913
##
## Number of Local Scoring Iterations: NA
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## s(lstat, df = 100) 1 23244 23243.9 855.94 < 2.2e-16 ***
## Residuals 405 10998 27.2
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## s(lstat, df = 100) 99 3.1521 4.441e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(mod_sc, residuals = TRUE, cex = .5, col = "blue")
## simpler smoothing splines:
mod_ss = gam(medv ~ s(lstat, df = 5), data = Boston)
summary(mod_ss)
##
## Call: gam(formula = medv ~ s(lstat, df = 5), data = Boston)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -13.6332 -3.2159 -0.6577 2.2051 26.8386
##
## (Dispersion Parameter for gaussian family taken to be 27.6492)
##
## Null Deviance: 42716.3 on 505 degrees of freedom
## Residual Deviance: 13824.6 on 499.9999 degrees of freedom
## AIC: 3123.646
##
## Number of Local Scoring Iterations: NA
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## s(lstat, df = 5) 1 23244 23243.9 840.67 < 2.2e-16 ***
## Residuals 500 13825 27.6
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## s(lstat, df = 5) 4 51.065 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(mod_ss, residuals = TRUE, col = "blue", cex = .5)
The parametric effect represents the linear slope, thus using only 1 df. The non-parametric effects represent the non-linear part of the fit. With df = 100, 1 df goes to the linear effect and the other 99 go to the non-linear part. This is a lot of df for the effect of a single variable, which is reflected in the very wiggly curve.
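To see this directly, we can overlay the two fitted curves on the data (a sketch, not part of the original exercise; it reuses mod_ss and mod_sc from above, and the object name ord is arbitrary):
ord = order(Boston$lstat)
plot(Boston$lstat, Boston$medv, col = "grey", cex = .5,
     xlab = "lstat", ylab = "medv")
lines(Boston$lstat[ord], fitted(mod_ss)[ord], col = "blue", lwd = 2)  # df = 5
lines(Boston$lstat[ord], fitted(mod_sc)[ord], col = "red", lwd = 2)   # df = 100
legend("topright", legend = c("df = 5", "df = 100"),
       col = c("blue", "red"), lwd = 2)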
c("mod_ss"=BIC(mod_ss), "mod_sc"=BIC(mod_sc), "mod_ns5"=BIC(mod_ns5))
## mod_ss mod_sc mod_ns5
## 3136.326 3210.593 3139.370
According to the BIC, the smoothing spline with df = 5 fits better than the barely regularized smoothing spline with df = 100. This is in line with the visual inspection of the fitted smoothing splines. The smoothing spline with 5 df also outperforms the natural spline with 5 df from the previous exercise. Thus, the non-parametric smoothing-spline approach appears to improve on the parametric spline approaches.
Model the effects of lstat, rm, ptratio, crim and dis on medv using smoothing splines (i.e., apply function s() to each of the possible predictors).
# Before we fit the model, we first inspect the variable distributions:
vars = c("medv", "lstat", "rm", "ptratio", "crim", "dis")
plot(Boston[ , vars])
Most variable pairs are associated, and most variables have somewhat skewed distributions. Crime rate (crim) is the most heavily skewed.
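A quick way to see this skewness (a hypothetical aside, not part of the original exercise):
par(mfrow = c(1, 2))
hist(Boston$crim, main = "crim", xlab = "per capita crime rate")
hist(log(Boston$crim), main = "log(crim)", xlab = "log per capita crime rate")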
To avoid masking of function names, and to make sure we use the functions from package mgcv rather than package gam, we first unload package gam before loading mgcv:
detach("package:gam", unload = TRUE)
library("mgcv")
## Loading required package: nlme
## This is mgcv 1.8-33. For overview type 'help("mgcv-package")'.
boston_GAM = gam(medv ~ s(lstat) + s(rm) + s(ptratio) + s(crim) + s(dis),
data = Boston, method = "REML")
I used restricted maximum likelihood estimation to fit the model (method = "REML"). Later, we will repeat the analysis using generalized cross-validation to illustrate its slightly different smoothing behavior.
Note that I did not specify the degrees of freedom, that is, the number of basis functions of which each smoothing spline is composed. These initial degrees of freedom should not be confused with the effective degrees of freedom that remain after the penalty has been applied. By default, for each predictor variable for which a smoothing spline is specified (by wrapping it in s(...)), a thin-plate regression spline basis (bs = "tp") is set up; the default k = -1 yields a basis dimension of 10 for univariate thin-plate splines, of which one basis function is absorbed by the identifiability (centering) constraint, leaving nine columns per smooth in the model matrix.
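If desired, the basis dimension can be set explicitly via the k argument of s(), and its adequacy can be checked with gam.check(). A minimal sketch (the object name boston_GAM_k20 and the value k = 20 are arbitrary choices for illustration):
boston_GAM_k20 = gam(medv ~ s(lstat, k = 20) + s(rm) + s(ptratio) + s(crim) + s(dis),
                     data = Boston, method = "REML")
gam.check(boston_GAM_k20)  # reports k' and edf per smooth, plus residual diagnostics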
Although we would normally not do this, for illustration we extract the model matrix, which contains all basis functions of all predictors, using function model.matrix():
mod_mat = model.matrix(boston_GAM)
colnames(mod_mat)
## [1] "(Intercept)" "s(lstat).1" "s(lstat).2" "s(lstat).3" "s(lstat).4"
## [6] "s(lstat).5" "s(lstat).6" "s(lstat).7" "s(lstat).8" "s(lstat).9"
## [11] "s(rm).1" "s(rm).2" "s(rm).3" "s(rm).4" "s(rm).5"
## [16] "s(rm).6" "s(rm).7" "s(rm).8" "s(rm).9" "s(ptratio).1"
## [21] "s(ptratio).2" "s(ptratio).3" "s(ptratio).4" "s(ptratio).5" "s(ptratio).6"
## [26] "s(ptratio).7" "s(ptratio).8" "s(ptratio).9" "s(crim).1" "s(crim).2"
## [31] "s(crim).3" "s(crim).4" "s(crim).5" "s(crim).6" "s(crim).7"
## [36] "s(crim).8" "s(crim).9" "s(dis).1" "s(dis).2" "s(dis).3"
## [41] "s(dis).4" "s(dis).5" "s(dis).6" "s(dis).7" "s(dis).8"
## [46] "s(dis).9"
Use plot() (with residuals = TRUE) and summary() to evaluate the fitted model and curves. Which predictor variables seem to be most important? Which variables' effects deviate from a linear effect?
# We can plot the basis functions created, e.g., for rm (number of rooms):
mod_mat_rm = mod_mat[ , grepl("rm", colnames(mod_mat))]
matplot(Boston$rm, mod_mat_rm, pch = 1, cex = .2, col = rainbow(9),
cex.main = .7, xlab = "rm", ylab = expression(b[k](x)),
cex.axis = .7, cex.lab = .7,
main = "Basis functions for predictor variable: rm")
legend("top", legend = 1:9, title = "basis function", col = rainbow(9),
pch = 1, cex = .7, pt.cex = .5, ncol = 3)
Note that the basis functions, as set up by mgcv, do not add up to 1. Also note that the 9th basis function is simply a linear function of the predictor: it is used to estimate the linear, unpenalized effect of the predictor variable.
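If the 9th basis function is indeed (a centered and scaled version of) the linear term, its correlation with rm should be close to ±1; a quick check (a sketch, under that assumption):
cor(mod_mat[ , "s(rm).9"], Boston$rm)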
After setting up the thin-plate regression spline bases with function s(), function gam() estimates the coefficients of all basis functions. The coefficients of the non-linear basis functions are estimated with a quadratic penalty (i.e., they are treated as random effects), while the coefficient of the linear basis function is estimated in an unpenalized manner (i.e., treated as a fixed effect). We are not directly interested in the estimated coefficient values, because the non-linear basis functions are relatively complex and the coefficients are therefore difficult to interpret, but we can still extract them:
head(coef(boston_GAM))
## (Intercept) s(lstat).1 s(lstat).2 s(lstat).3 s(lstat).4 s(lstat).5
## 22.532806 0.569823 -8.812774 -1.187634 5.395840 1.492320
summary(boston_GAM)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## medv ~ s(lstat) + s(rm) + s(ptratio) + s(crim) + s(dis)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.5328 0.1763 127.8 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(lstat) 5.758 6.945 46.39 < 2e-16 ***
## s(rm) 6.614 7.762 25.21 < 2e-16 ***
## s(ptratio) 1.620 2.019 12.98 3.26e-06 ***
## s(crim) 1.364 1.637 24.16 2.77e-07 ***
## s(dis) 1.002 1.004 27.99 6.46e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.814 Deviance explained = 82%
## -REML = 1431.1 Scale est. = 15.724 n = 506
Package mgcv does not distinguish between the parametric and non-parametric parts of a smoothing spline. The edf values (effective degrees of freedom), however, quantify the amount of non-linearity: with a value of 1, the spline takes up only 1 df, reflecting a purely linear slope; with edf > 1, non-linear effects were captured by the smoothing spline. The p-values test the null hypothesis of no effect of the predictor variable.
The results indicate that all predictors contribute significantly to predicting median values of owner-occupied homes. The strongest effect (highest F-value) is observed for lstat (percentage of lower-status population), followed by dis (weighted mean of distances to five Boston employment centres), rm (average number of rooms per dwelling) and crim (per capita crime rate). ptratio (pupil-teacher ratio by town) has the smallest, but still significant, effect. The edf values indicate that dis, crim and ptratio have close-to-linear effects.
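As an illustration of the edf interpretation (a sketch, not part of the original exercise; the object name boston_GAM_lin is arbitrary): since the edf for s(dis) is essentially 1, a practically equivalent model includes dis as a parametric (linear) term instead of a smooth:
boston_GAM_lin = gam(medv ~ s(lstat) + s(rm) + s(ptratio) + s(crim) + dis,
                     data = Boston, method = "REML")
summary(boston_GAM_lin)  # dis now appears among the parametric coefficients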
par(mfrow = c(2, 3))
plot(boston_GAM, residuals = TRUE)
The plots show the conditional effects of the predictors. The effects of lstat and rm appear strongly non-linear. The plots also show pointwise 95% confidence intervals for the fitted effects.
Smoothness selection in mgcv can also be performed using generalized cross-validation (this is the default, so we can specify method = "GCV.Cp", or simply omit the method argument):
boston_GAM2 = gam(medv ~ s(lstat) + s(rm) + s(ptratio) + s(crim) + s(dis),
data = Boston)
summary(boston_GAM2)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## medv ~ s(lstat) + s(rm) + s(ptratio) + s(crim) + s(dis)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.5328 0.1679 134.2 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(lstat) 6.157 7.341 40.865 < 2e-16 ***
## s(rm) 6.992 8.075 23.742 < 2e-16 ***
## s(ptratio) 3.454 4.253 7.550 4.94e-06 ***
## s(crim) 1.784 2.213 22.106 < 2e-16 ***
## s(dis) 8.662 8.964 9.028 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.831 Deviance explained = 84%
## GCV = 15.099 Scale est. = 14.262 n = 506
par(mfrow = c(2, 3))
plot(boston_GAM2, residuals = TRUE)
We obtained higher edf values using GCV, and the plots show that with GCV the fitted curves are accordingly more strongly non-linear.
We can use 10-fold CV to evaluate which estimation procedure performs best for these data:
set.seed(42)
fold_ids = sample(1:10, nrow(Boston), replace = TRUE)
preds = data.frame(matrix(rep(NA, times = nrow(Boston), each = 2), ncol = 2))
names(preds) = c("REML", "GCV")
for (i in 1:10) {
GCV_fit = gam(medv ~ s(lstat) + s(rm) + s(ptratio) + s(crim) + s(dis),
data = Boston[fold_ids != i,])
REML_fit = gam(medv ~ s(lstat) + s(rm) + s(ptratio) + s(crim) + s(dis),
data = Boston[fold_ids != i,], method = "REML")
preds$GCV[fold_ids == i] = predict(GCV_fit, newdata=Boston[fold_ids == i,])
preds$REML[fold_ids == i] = predict(REML_fit, newdata=Boston[fold_ids == i,])
}
colMeans((preds - Boston$medv)^2) ## MSE
## REML GCV
## 17.32625 16.16959
The CV results indicate a slightly lower test MSE for GCV on these data, so the lower amount of smoothing provided by GCV may be preferable here. Note that asymptotically GCV performs better than REML, but in finite samples REML may perform better. We have a relatively large sample here, and the Boston data are relatively low-noise, which may explain why GCV performs better in this case. Both GAMs explain about 80% of the variance in median housing prices (which were measured in units of 1,000 USD).
Fit a GAM (using function gam from package mgcv) to the NESDA data. We first load and inspect the data:
library("foreign")
# nesda.sav: (Netherlands Study of Depression and Anxiety
NESDA = read.spss("nesda.sav", use.value.labels=F,to.data.frame=T)
## re-encoding from CP1252
dim(NESDA)
## [1] 600 15
plot(NESDA[ , c("mdd", "aconscie", "neurot", "openes")])
Use mdd (Major Depressive Disorder diagnosis) as the response; make sure to also specify family = binomial. Use neurot (Neuroticism, one of the big five personality traits as measured by a questionnaire), openes (Openness, idem) and aconscie (Conscientiousness, idem) as predictors, wrapping each in function s():
NESDA_GAM = gam(mdd ~ s(aconscie) + s(neurot) + s(openes), family = binomial,
data = NESDA, method = "REML")
summary(NESDA_GAM)
##
## Family: binomial
## Link function: logit
##
## Formula:
## mdd ~ s(aconscie) + s(neurot) + s(openes)
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.9452 0.1185 -7.977 1.5e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(aconscie) 1.001 1.001 3.707 0.0542 .
## s(neurot) 2.426 3.094 62.405 <2e-16 ***
## s(openes) 1.000 1.000 0.600 0.4387
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.186 Deviance explained = 17%
## -REML = 326.27 Scale est. = 1 n = 600
As noted above, package mgcv does not distinguish between the parametric and non-parametric parts of a smoothing spline; the edf values (effective degrees of freedom) quantify the amount of non-linearity. With a value of 1, the spline takes up only 1 df, reflecting a purely linear slope; with edf > 1, non-linear effects were captured by the smoothing spline. The p-values test the null hypothesis of no effect. Thus, only neuroticism clearly contributes to predicting the presence of major depressive disorder (conscientiousness is only marginally significant, p = .054). The edf values indicate that only neuroticism has a non-linear effect.
par(mfrow = c(1, 3))
plot(NESDA_GAM, residuals = TRUE)
The plots show the conditional effects of the predictors. We see a non-linear effect of neuroticism. The plots also show pointwise 95% confidence intervals for the fitted effects; for openness and conscientiousness, these intervals include 0 over the whole range of the predictor.
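To aid interpretation on the probability scale, we can plot the predicted probability of MDD as a function of neuroticism, holding the other predictors at their sample means (a sketch, not part of the original exercise; the object names nd and p_hat are arbitrary, and predict() with type = "response" returns fitted probabilities for the binomial GAM):
nd = data.frame(neurot = seq(min(NESDA$neurot, na.rm = TRUE),
                             max(NESDA$neurot, na.rm = TRUE), length.out = 200),
                aconscie = mean(NESDA$aconscie, na.rm = TRUE),
                openes = mean(NESDA$openes, na.rm = TRUE))
p_hat = predict(NESDA_GAM, newdata = nd, type = "response")
plot(nd$neurot, p_hat, type = "l", lwd = 2,
     xlab = "neuroticism", ylab = "predicted probability of MDD")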
set.seed(42)
fold_ids = sample(1:10, nrow(NESDA), replace = TRUE)
preds = data.frame(matrix(rep(NA, times = nrow(NESDA), each = 2), ncol = 2))
names(preds) = c("REML", "GCV")
for (i in 1:10) {
GCV_fit = gam(mdd ~ s(aconscie) + s(neurot) + s(openes),
family = binomial,data = NESDA[fold_ids != i,])
REML_fit = gam(mdd ~ s(aconscie) + s(neurot) + s(openes),
family = binomial, data = NESDA[fold_ids != i,], method = "REML")
preds$GCV[fold_ids == i] = predict(GCV_fit, newdata=NESDA[fold_ids == i,])
preds$REML[fold_ids == i] = predict(REML_fit, newdata=NESDA[fold_ids == i,])
}
## Predicted cross-validated probabilities
preds = exp(preds) / (1 + exp(preds))
y = as.numeric(NESDA$mdd) ## observed outcome
cor(cbind(preds, y))
## REML GCV y
## REML 1.0000000 0.9777990 0.4099189
## GCV 0.9777990 1.0000000 0.3971731
## y 0.4099189 0.3971731 1.0000000
colMeans((preds - y)^2) ## MSE on predicted probabilities
## REML GCV
## 0.1868823 0.1894893
sapply(((preds - y)^2), sd)/sqrt(nrow(NESDA)) ## SE of MSE
## REML GCV
## 0.007499788 0.007723749
1 - colMeans((preds - y)^2)/var(y) ## Pseudo R-squared
## REML GCV
## 0.1685791 0.1569809
We see that the predictions obtained with REML and GCV are very strongly correlated. REML performed slightly better in terms of squared-error loss on the predicted probabilities (a.k.a. the Brier score). According to cross-validation, the fitted GAMs explain about 16-17% of the variance in depressive disorder diagnoses.
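As an additional check (not part of the original exercise, and assuming mdd is coded 0/1, as in the Brier-score computation above), we can also compare the cross-validated misclassification rates at a 0.5 cut-off:
colMeans((preds > 0.5) != y)  # proportion misclassified per estimation method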