## Homework 8: Generalized Additive Models
# load data
pisa <- read.csv("https://raw.githubusercontent.com/m-clark/generalized-additive-models/master/data/pisasci2006.csv")
# fit a cubic spline regression model
model <- lm(Overall ~ bs(Income, df = 4), data = pisa)
# create a sequence of income values for predictions
income_values <- seq(min(pisa$Income, na.rm = TRUE), max(pisa$Income, na.rm = TRUE), length.out = 100)
# predict overall score values
score_pred <- predict(model, data.frame(Income = income_values))
# create prediction dataframe
pred_data <- data.frame(
Income = income_values,
Overall = score_pred
)
# plot the data and smooth the curve
ggplot(pisa, aes(x = Income, y = Overall)) +
geom_point() +
geom_line(data = pred_data, aes(x = Income, y = Overall), color = "red")
## Warning: Removed 11 rows containing missing values or values outside the scale range
## (`geom_point()`).
# fit the loess model
l_model <- loess(Overall ~ Income, data = pisa)
# create a sequence of income values for predictions
income_values
## [1] 0.4070000 0.4123939 0.4177879 0.4231818 0.4285758 0.4339697 0.4393636
## [8] 0.4447576 0.4501515 0.4555455 0.4609394 0.4663333 0.4717273 0.4771212
## [15] 0.4825152 0.4879091 0.4933030 0.4986970 0.5040909 0.5094848 0.5148788
## [22] 0.5202727 0.5256667 0.5310606 0.5364545 0.5418485 0.5472424 0.5526364
## [29] 0.5580303 0.5634242 0.5688182 0.5742121 0.5796061 0.5850000 0.5903939
## [36] 0.5957879 0.6011818 0.6065758 0.6119697 0.6173636 0.6227576 0.6281515
## [43] 0.6335455 0.6389394 0.6443333 0.6497273 0.6551212 0.6605152 0.6659091
## [50] 0.6713030 0.6766970 0.6820909 0.6874848 0.6928788 0.6982727 0.7036667
## [57] 0.7090606 0.7144545 0.7198485 0.7252424 0.7306364 0.7360303 0.7414242
## [64] 0.7468182 0.7522121 0.7576061 0.7630000 0.7683939 0.7737879 0.7791818
## [71] 0.7845758 0.7899697 0.7953636 0.8007576 0.8061515 0.8115455 0.8169394
## [78] 0.8223333 0.8277273 0.8331212 0.8385152 0.8439091 0.8493030 0.8546970
## [85] 0.8600909 0.8654848 0.8708788 0.8762727 0.8816667 0.8870606 0.8924545
## [92] 0.8978485 0.9032424 0.9086364 0.9140303 0.9194242 0.9248182 0.9302121
## [99] 0.9356061 0.9410000
# predict overall scores
overall_pred <- predict(l_model, data.frame(Income = income_values))
# create a new predictions data frame
pred_data_2 <- data.frame(
Income = income_values,
Overall = overall_pred
)
# plot the data and smooth the curve
ggplot(pisa, aes(Income, y = Overall)) +
geom_point() +
geom_line(data = pred_data_2, aes(x = Income, y = Overall), color = "red")
## Warning: Removed 11 rows containing missing values or values outside the scale range
## (`geom_point()`).
# fit the GAM model
GAM_model <- gam(Overall ~ s(Income) + s(Health) + s(Edu), data = pisa)
summary(GAM_model)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## Overall ~ s(Income) + s(Health) + s(Edu)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 471.154 2.772 170 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(Income) 7.593 8.415 8.826 1.29e-06 ***
## s(Health) 1.000 1.000 2.736 0.10679
## s(Edu) 6.204 7.178 3.308 0.00771 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.863 Deviance explained = 90.3%
## GCV = 573.83 Scale est. = 399.5 n = 52
# plot the model
plot(GAM_model)