## Homework 8: Generalized Additive Models

1. Splines (3 points)

# load data
pisa <- read.csv("https://raw.githubusercontent.com/m-clark/generalized-additive-models/master/data/pisasci2006.csv")

# fit a cubic spline regression model
model <- lm(Overall ~ bs(Income, df = 4), data = pisa)

# create a sequence of income values for predictions
income_values <- seq(min(pisa$Income, na.rm = TRUE), max(pisa$Income, na.rm = TRUE), length.out = 100)

# predict overall score values
score_pred <- predict(model, data.frame(Income = income_values))

# create prediction dataframe
pred_data <- data.frame(
  Income = income_values,
  Overall = score_pred
)

# plot the data and smooth the curve
ggplot(pisa, aes(x = Income, y = Overall)) +
  geom_point() +
  geom_line(data = pred_data, aes(x = Income, y = Overall), color = "red")
## Warning: Removed 11 rows containing missing values or values outside the scale range
## (`geom_point()`).

2. LOESS (3 points)

# fit the loess model
l_model <- loess(Overall ~ Income, data = pisa)

# create a sequence of income values for predictions
income_values
##   [1] 0.4070000 0.4123939 0.4177879 0.4231818 0.4285758 0.4339697 0.4393636
##   [8] 0.4447576 0.4501515 0.4555455 0.4609394 0.4663333 0.4717273 0.4771212
##  [15] 0.4825152 0.4879091 0.4933030 0.4986970 0.5040909 0.5094848 0.5148788
##  [22] 0.5202727 0.5256667 0.5310606 0.5364545 0.5418485 0.5472424 0.5526364
##  [29] 0.5580303 0.5634242 0.5688182 0.5742121 0.5796061 0.5850000 0.5903939
##  [36] 0.5957879 0.6011818 0.6065758 0.6119697 0.6173636 0.6227576 0.6281515
##  [43] 0.6335455 0.6389394 0.6443333 0.6497273 0.6551212 0.6605152 0.6659091
##  [50] 0.6713030 0.6766970 0.6820909 0.6874848 0.6928788 0.6982727 0.7036667
##  [57] 0.7090606 0.7144545 0.7198485 0.7252424 0.7306364 0.7360303 0.7414242
##  [64] 0.7468182 0.7522121 0.7576061 0.7630000 0.7683939 0.7737879 0.7791818
##  [71] 0.7845758 0.7899697 0.7953636 0.8007576 0.8061515 0.8115455 0.8169394
##  [78] 0.8223333 0.8277273 0.8331212 0.8385152 0.8439091 0.8493030 0.8546970
##  [85] 0.8600909 0.8654848 0.8708788 0.8762727 0.8816667 0.8870606 0.8924545
##  [92] 0.8978485 0.9032424 0.9086364 0.9140303 0.9194242 0.9248182 0.9302121
##  [99] 0.9356061 0.9410000
# predict overall scores
overall_pred <- predict(l_model, data.frame(Income = income_values))

# create a new predictions data frame
pred_data_2 <- data.frame(
  Income = income_values,
  Overall = overall_pred
)

# plot the data and smooth the curve
ggplot(pisa, aes(Income, y = Overall)) + 
  geom_point() + 
  geom_line(data = pred_data_2, aes(x = Income, y = Overall), color = "red")
## Warning: Removed 11 rows containing missing values or values outside the scale range
## (`geom_point()`).

3. GAM (4 points)

# fit the GAM model
GAM_model <- gam(Overall ~ s(Income) + s(Health) + s(Edu), data = pisa)
summary(GAM_model)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## Overall ~ s(Income) + s(Health) + s(Edu)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  471.154      2.772     170   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##             edf Ref.df     F  p-value    
## s(Income) 7.593  8.415 8.826 1.29e-06 ***
## s(Health) 1.000  1.000 2.736  0.10679    
## s(Edu)    6.204  7.178 3.308  0.00771 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.863   Deviance explained = 90.3%
## GCV = 573.83  Scale est. = 399.5     n = 52
# plot the model
plot(GAM_model)