library(MASS)
data(Boston)
attach(Boston)

Problem 1

  1. Use the poly() function to fit a cubic polynomial regression to predict nox using dis. Report the regression output, and plot the resulting data and polynomial fits.
mod1 <- lm(nox~poly(dis, 3), data = Boston)
coef(summary(mod1))
##                 Estimate Std. Error    t value      Pr(>|t|)
## (Intercept)    0.5546951 0.00275939 201.020894  0.000000e+00
## poly(dis, 3)1 -2.0030959 0.06207094 -32.271071 1.597201e-124
## poly(dis, 3)2  0.8563300 0.06207094  13.795987  6.133104e-37
## poly(dis, 3)3 -0.3180490 0.06207094  -5.123959  4.274950e-07
plot(dis, nox)
# dis is not sorted, so order the x-values before drawing the fitted curve
ix <- order(dis)
lines(dis[ix], fitted(mod1)[ix], col = "blue")
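
For a smoother curve with uncertainty bands, the fit can also be evaluated on an evenly spaced grid of dis values; a minimal sketch (the names dis.grid and pr are illustrative):

dis.grid <- seq(min(dis), max(dis), length.out = 100)
pr <- predict(mod1, newdata = data.frame(dis = dis.grid), se = TRUE)

plot(dis, nox, cex = .5, col = "darkgrey")
lines(dis.grid, pr$fit, col = "blue", lwd = 2)   # cubic polynomial fit
matlines(dis.grid, cbind(pr$fit + 2 * pr$se.fit, pr$fit - 2 * pr$se.fit),
         lty = "dashed", col = "blue")           # approximate 95% bands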

  2. Plot the polynomial fits for a range of different polynomial degrees (say, from 1 to 10), and report the associated residual sum of squares.
set.seed(2)
### 50-50 split (train vs. test)
train <- sample(506, 253)

mod1 <- lm(nox~dis, data = Boston, subset = train)
mean((nox-predict(mod1, Boston))[-train]^2)
## [1] 0.005455645
mod2 <- lm(nox~poly(dis, 2), data = Boston, subset = train)
mean((nox-predict(mod2, Boston))[-train]^2)
## [1] 0.004028465
mod3 <- lm(nox~poly(dis, 3), data = Boston, subset = train)
mean((nox-predict(mod3, Boston))[-train]^2)
## [1] 0.003773303
mod4 <- lm(nox~poly(dis, 4), data = Boston, subset = train)
mean((nox-predict(mod4, Boston))[-train]^2)
## [1] 0.003746374
mod5 <- lm(nox~poly(dis, 5), data = Boston, subset = train)
mean((nox-predict(mod5, Boston))[-train]^2)
## [1] 0.004679533
mod6 <- lm(nox~poly(dis, 6), data = Boston, subset = train)
mean((nox-predict(mod6, Boston))[-train]^2)
## [1] 0.007991185
mod7 <- lm(nox~poly(dis, 7), data = Boston, subset = train)
mean((nox-predict(mod7, Boston))[-train]^2)
## [1] 0.01192018
mod8 <- lm(nox~poly(dis, 8), data = Boston, subset = train)
mean((nox-predict(mod8, Boston))[-train]^2)
## [1] 0.06333546
mod9 <- lm(nox~poly(dis, 9), data = Boston, subset = train)
mean((nox-predict(mod9, Boston))[-train]^2)
## [1] 2.521413
mod10 <- lm(nox~poly(dis, 10), data = Boston, subset = train)
mean((nox-predict(mod10, Boston))[-train]^2)
## [1] 2.532109
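The ten fits above can be generated in a loop, which also makes it easy to collect the training residual sum of squares the question asks for alongside the held-out MSE; a sketch (the vector names rss and val.mse are illustrative):

rss <- val.mse <- rep(0, 10)
for (d in 1:10) {
  fit <- lm(nox ~ poly(dis, d), data = Boston, subset = train)
  rss[d] <- sum(resid(fit)^2)                                  # training RSS
  val.mse[d] <- mean((nox - predict(fit, Boston))[-train]^2)   # held-out MSE
}
plot(1:10, val.mse, type = "b", xlab = "Degree", ylab = "Validation MSE")
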
  3. Perform cross-validation or another approach to select the optimal degree for the polynomial, and explain your results.
# Leave-one-out cross-validation
library(ggplot2)  # only ggplot2 is needed from the tidyverse here
library(boot)
cv.error <- rep(0, 10)
for(i in 1:10){
  # glm() without a family argument fits by least squares but works with cv.glm()
  glm.fit <- glm(nox ~ poly(dis, i), data = Boston)
  cv.error[i] <- cv.glm(Boston, glm.fit)$delta[1]  # raw LOOCV estimate
}

cvDF <- data.frame(degree = 1:10, cv.error)

ggplot(data = cvDF, aes(x = degree, y = cv.error)) +
  geom_point() +
  geom_line()
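
The minimizing degree can also be read off programmatically rather than from the plot:

which.min(cv.error)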

Based on the cross-validation error, a 4th-degree polynomial appears to be the appropriate fit for the data, as the error begins to increase at degree 5 and beyond.
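
An alternative to cross-validation is a sequence of nested F-tests; a sketch comparing degrees 3 through 5 with anova():

fit.3 <- lm(nox ~ poly(dis, 3), data = Boston)
fit.4 <- lm(nox ~ poly(dis, 4), data = Boston)
fit.5 <- lm(nox ~ poly(dis, 5), data = Boston)
anova(fit.3, fit.4, fit.5)  # is each added degree worth the extra flexibility?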

  4. Use the ns() function to fit a regression spline to predict nox using dis. Report the output for the fit using four degrees of freedom. How did you choose the knots? Plot the resulting fit.
library(splines)
dislims <- range(dis)

# With df = 4, ns() places the three interior knots automatically at the
# 25th, 50th, and 75th percentiles of dis, rather than at hand-picked values.
fit2 <- lm(nox ~ ns(dis, df = 4), data = Boston)

plot(dis, nox, xlim = dislims, cex = .5, col = "darkgrey")
ix <- order(dis)
lines(dis[ix], fitted(fit2)[ix], col = "blue")
title("Natural Spline (df = 4)")

# Smoothing splines for comparison: fixed df = 4, and df chosen by LOOCV
fit <- smooth.spline(dis, nox, df = 4)
fit.cv <- smooth.spline(dis, nox, cv = TRUE)
## Warning in smooth.spline(dis, nox, cv = TRUE): cross-validation with non-
## unique 'x' values seems doubtful
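
The automatic knot placement can also be inspected, or overridden with explicit knots; a minimal sketch (the knot values 4, 7, and 11 are illustrative, not chosen from the data):

# Interior knots chosen automatically for df = 4 (quartiles of dis)
attr(ns(dis, df = 4), "knots")

# Hand-picked knots for comparison
fit.manual <- lm(nox ~ ns(dis, knots = c(4, 7, 11)), data = Boston)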

Problem 2

college<-read.csv("College.csv", header=TRUE, na.strings="?")
attach(college)
  1. Split the data into a training set and a test set. Using out-of-state tuition as the response and the other variables as the predictors, perform forward stepwise selection on the training set in order to identify a satisfactory model that uses just a subset of the predictors.
set.seed(1)
train <- sample(1:dim(college)[1], dim(college)[1]/2)
trainS <- college[train,]
testS <- college[-train,]

head(college)
##                              X Private Apps Accept Enroll Top10perc
## 1 Abilene Christian University     Yes 1660   1232    721        23
## 2           Adelphi University     Yes 2186   1924    512        16
## 3               Adrian College     Yes 1428   1097    336        22
## 4          Agnes Scott College     Yes  417    349    137        60
## 5    Alaska Pacific University     Yes  193    146     55        16
## 6            Albertson College     Yes  587    479    158        38
##   Top25perc F.Undergrad P.Undergrad Outstate Room.Board Books Personal PhD
## 1        52        2885         537     7440       3300   450     2200  70
## 2        29        2683        1227    12280       6450   750     1500  29
## 3        50        1036          99    11250       3750   400     1165  53
## 4        89         510          63    12960       5450   450      875  92
## 5        44         249         869     7560       4120   800     1500  76
## 6        62         678          41    13500       3335   500      675  67
##   Terminal S.F.Ratio perc.alumni Expend Grad.Rate
## 1       78      18.1          12   7041        60
## 2       30      12.2          16  10527        56
## 3       66      12.9          30   8735        54
## 4       97       7.7          37  19016        59
## 5       72      11.9           2  10922        15
## 6       73       9.4          11   9727        55
fit1 <- lm(Outstate~poly(Room.Board, 10), data=trainS)
summary(fit1)
## 
## Call:
## lm(formula = Outstate ~ poly(Room.Board, 10), data = trainS)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7471.0 -2108.6  -331.5  1831.7 11220.9 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             10484.0      157.2  66.705  < 2e-16 ***
## poly(Room.Board, 10)1   56089.5     3095.9  18.118  < 2e-16 ***
## poly(Room.Board, 10)2    -645.1     3095.9  -0.208  0.83505    
## poly(Room.Board, 10)3     699.4     3095.9   0.226  0.82140    
## poly(Room.Board, 10)4   -5066.7     3095.9  -1.637  0.10255    
## poly(Room.Board, 10)5   -6670.6     3095.9  -2.155  0.03182 *  
## poly(Room.Board, 10)6    3516.2     3095.9   1.136  0.25677    
## poly(Room.Board, 10)7    5783.0     3095.9   1.868  0.06254 .  
## poly(Room.Board, 10)8   -5446.3     3095.9  -1.759  0.07935 .  
## poly(Room.Board, 10)9   -8849.8     3095.9  -2.859  0.00449 ** 
## poly(Room.Board, 10)10   2402.2     3095.9   0.776  0.43827    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3096 on 377 degrees of freedom
## Multiple R-squared:  0.4831, Adjusted R-squared:  0.4694 
## F-statistic: 35.23 on 10 and 377 DF,  p-value: < 2.2e-16
fit2 <- lm(Outstate~poly(Room.Board, 9) + poly(Books, 10), data=trainS)
summary(fit2)
## 
## Call:
## lm(formula = Outstate ~ poly(Room.Board, 9) + poly(Books, 10), 
##     data = trainS)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7286.4 -2060.6  -202.1  1821.4 11696.8 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           10484.0      156.1  67.170  < 2e-16 ***
## poly(Room.Board, 9)1  57187.8     3147.7  18.168  < 2e-16 ***
## poly(Room.Board, 9)2   -752.2     3107.9  -0.242  0.80890    
## poly(Room.Board, 9)3    629.6     3096.5   0.203  0.83898    
## poly(Room.Board, 9)4  -5283.6     3090.8  -1.709  0.08821 .  
## poly(Room.Board, 9)5  -6890.3     3095.8  -2.226  0.02664 *  
## poly(Room.Board, 9)6   4152.2     3104.5   1.337  0.18190    
## poly(Room.Board, 9)7   5145.7     3104.4   1.658  0.09827 .  
## poly(Room.Board, 9)8  -4778.2     3090.7  -1.546  0.12296    
## poly(Room.Board, 9)9  -8169.8     3096.9  -2.638  0.00869 ** 
## poly(Books, 10)1      -6813.0     3121.9  -2.182  0.02972 *  
## poly(Books, 10)2       3843.3     3107.9   1.237  0.21702    
## poly(Books, 10)3       3141.1     3115.6   1.008  0.31403    
## poly(Books, 10)4      -1279.1     3097.1  -0.413  0.67986    
## poly(Books, 10)5       3007.2     3092.8   0.972  0.33152    
## poly(Books, 10)6       5739.4     3099.7   1.852  0.06488 .  
## poly(Books, 10)7      -2663.4     3101.7  -0.859  0.39107    
## poly(Books, 10)8        516.7     3086.7   0.167  0.86716    
## poly(Books, 10)9      -4614.2     3087.1  -1.495  0.13586    
## poly(Books, 10)10      -558.8     3099.6  -0.180  0.85704    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3074 on 368 degrees of freedom
## Multiple R-squared:  0.5024, Adjusted R-squared:  0.4767 
## F-statistic: 19.55 on 19 and 368 DF,  p-value: < 2.2e-16
fit3 <- lm(Outstate~poly(Room.Board, 9) + Books + poly(Expend, 3), data=trainS)
summary(fit3)
## 
## Call:
## lm(formula = Outstate ~ poly(Room.Board, 9) + Books + poly(Expend, 
##     3), data = trainS)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9991.6 -1234.0    57.4  1388.6  7099.9 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           12505.174    483.079  25.886  < 2e-16 ***
## poly(Room.Board, 9)1  26179.195   3068.077   8.533 3.61e-16 ***
## poly(Room.Board, 9)2  -2569.474   2485.905  -1.034  0.30198    
## poly(Room.Board, 9)3  -2348.589   2386.170  -0.984  0.32563    
## poly(Room.Board, 9)4  -3018.006   2391.435  -1.262  0.20773    
## poly(Room.Board, 9)5  -3023.247   2398.787  -1.260  0.20834    
## poly(Room.Board, 9)6   3581.318   2389.248   1.499  0.13474    
## poly(Room.Board, 9)7   4420.583   2384.464   1.854  0.06454 .  
## poly(Room.Board, 9)8   -663.956   2400.584  -0.277  0.78225    
## poly(Room.Board, 9)9  -4970.182   2405.756  -2.066  0.03952 *  
## Books                    -3.634      0.841  -4.321 1.99e-05 ***
## poly(Expend, 3)1      42598.337   2932.294  14.527  < 2e-16 ***
## poly(Expend, 3)2     -25203.247   2643.696  -9.533  < 2e-16 ***
## poly(Expend, 3)3       7785.176   2491.581   3.125  0.00192 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2377 on 374 degrees of freedom
## Multiple R-squared:  0.6976, Adjusted R-squared:  0.6871 
## F-statistic: 66.36 on 13 and 374 DF,  p-value: < 2.2e-16
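The search above adds predictors manually; forward stepwise selection over all predictors can be automated with regsubsets() from the leaps package (a sketch, not run here; X is the school-name column and is excluded):

library(leaps)
fwd <- regsubsets(Outstate ~ . - X, data = trainS, nvmax = 17,
                  method = "forward")
which.min(summary(fwd)$bic)  # size of the best model by BIC
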
  2. Fit a GAM on the training data, using out-of-state tuition as the response and the features selected in the previous step as the predictors. Plot the results, and explain your findings.
library(gam)
## Loaded gam 1.16.1
gam1 <- lm(Outstate ~ ns(Room.Board, 9) + ns(Books) + ns(Expend, 3), data = trainS)  # ns(Books) defaults to df = 1, i.e. a linear term
summary(gam1)
## 
## Call:
## lm(formula = Outstate ~ ns(Room.Board, 9) + ns(Books) + ns(Expend, 
##     3), data = trainS)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10504.5  -1420.9    -46.3   1505.7   6793.8 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            3544       1494   2.372 0.018199 *  
## ns(Room.Board, 9)1     1956       1379   1.418 0.156960    
## ns(Room.Board, 9)2     4694       1670   2.812 0.005191 ** 
## ns(Room.Board, 9)3     4382       1539   2.847 0.004657 ** 
## ns(Room.Board, 9)4     4603       1622   2.837 0.004796 ** 
## ns(Room.Board, 9)5     3310       1605   2.062 0.039891 *  
## ns(Room.Board, 9)6     6135       1610   3.810 0.000163 ***
## ns(Room.Board, 9)7     6174       1225   5.039 7.30e-07 ***
## ns(Room.Board, 9)8     8737       3228   2.707 0.007103 ** 
## ns(Room.Board, 9)9     3757       1368   2.745 0.006337 ** 
## ns(Books)             -5149       1258  -4.093 5.22e-05 ***
## ns(Expend, 3)1        15309       1036  14.773  < 2e-16 ***
## ns(Expend, 3)2        13952       1679   8.309 1.80e-15 ***
## ns(Expend, 3)3         6590       1714   3.844 0.000142 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2363 on 374 degrees of freedom
## Multiple R-squared:  0.7011, Adjusted R-squared:  0.6907 
## F-statistic: 67.49 on 13 and 374 DF,  p-value: < 2.2e-16
plot.Gam(gam1, se = TRUE, col = "red")
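
Since the gam package is loaded, the same structure can also be fit with gam() and smoothing-spline terms; a sketch (the 4-df choices mirror the spline fits above and are an assumption, not tuned):

gam.fit <- gam(Outstate ~ s(Room.Board, 4) + Books + s(Expend, 4),
               data = trainS)
par(mfrow = c(1, 3))
plot(gam.fit, se = TRUE, col = "red")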

  3. Evaluate the model obtained on the test set, and explain the results obtained.
gam2 <- lm(Outstate~ns(Room.Board, 9) + ns(Books) + ns(Expend, 3), data = testS)
summary(gam2)
## 
## Call:
## lm(formula = Outstate ~ ns(Room.Board, 9) + ns(Books) + ns(Expend, 
##     3), data = testS)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7503.2 -1166.9   -55.1  1413.7  5379.9 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          3957.9     1618.8   2.445 0.014948 *  
## ns(Room.Board, 9)1   1794.6     1456.7   1.232 0.218749    
## ns(Room.Board, 9)2   2386.6     1767.1   1.351 0.177651    
## ns(Room.Board, 9)3   3337.9     1635.2   2.041 0.041929 *  
## ns(Room.Board, 9)4   3839.7     1688.5   2.274 0.023528 *  
## ns(Room.Board, 9)5   3567.6     1652.2   2.159 0.031466 *  
## ns(Room.Board, 9)6   4614.5     1675.1   2.755 0.006160 ** 
## ns(Room.Board, 9)7   4399.6     1222.6   3.598 0.000363 ***
## ns(Room.Board, 9)8   6000.2     3580.9   1.676 0.094645 .  
## ns(Room.Board, 9)9   4609.8     1681.8   2.741 0.006418 ** 
## ns(Books)           -3394.7     1747.9  -1.942 0.052862 .  
## ns(Expend, 3)1      11821.5      821.1  14.397  < 2e-16 ***
## ns(Expend, 3)2      11830.0     1357.2   8.717  < 2e-16 ***
## ns(Expend, 3)3       6693.4     1593.3   4.201 3.32e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2199 on 375 degrees of freedom
## Multiple R-squared:  0.6744, Adjusted R-squared:  0.6631 
## F-statistic: 59.75 on 13 and 375 DF,  p-value: < 2.2e-16
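
Note that gam2 is refit on the test observations, so its summary measures in-sample fit there; to evaluate the training-set model gam1 out of sample, predict on the held-out data instead. A sketch:

preds <- predict(gam1, newdata = testS)
mean((testS$Outstate - preds)^2)  # test-set MSE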

The GAM output shows, for each predictor, whether the higher-order natural-spline basis terms are significant, i.e., whether that variable's effect on out-of-state tuition is non-linear.

  4. For which variables, if any, is there evidence of a non-linear relationship with the response?

Room.Board and Expend appear to have a non-linear relationship with the response, Outstate, since several of their spline terms beyond the first are significant, while Books appears limited to a linear relationship, with no significance for the 2nd-degree term or greater.
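
This can be checked formally with an F-test comparing the spline model against one that enters every predictor linearly; a sketch on the training data (the name lin is illustrative):

lin <- lm(Outstate ~ Room.Board + Books + Expend, data = trainS)
anova(lin, gam1)  # a significant F-test favors the non-linear spline terms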