Problem 1: Boston

This question uses the variables dis (the weighted mean of distances to five Boston employment centers) and nox (nitrogen oxides concentration in parts per 10 million) from the Boston data. We will treat dis as the predictor and nox as the response.

library(tidyverse)
## -- Attaching packages ---------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1     v purrr   0.3.2
## v tibble  2.1.3     v dplyr   0.8.3
## v tidyr   0.8.3     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0
## -- Conflicts ------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
data("Boston")
names(Boston)
##  [1] "crim"    "zn"      "indus"   "chas"    "nox"     "rm"      "age"    
##  [8] "dis"     "rad"     "tax"     "ptratio" "black"   "lstat"   "medv"

a)

Use the poly() function to fit a cubic polynomial regression to predict nox using dis. Report the regression output, and plot the resulting data and polynomial fits.

attach(Boston)
set.seed(1)
model = lm(nox ~ poly(dis, 3), data = Boston)

coef(summary(model))
##                 Estimate Std. Error    t value      Pr(>|t|)
## (Intercept)    0.5546951 0.00275939 201.020894  0.000000e+00
## poly(dis, 3)1 -2.0030959 0.06207094 -32.271071 1.597201e-124
## poly(dis, 3)2  0.8563300 0.06207094  13.795987  6.133104e-37
## poly(dis, 3)3 -0.3180490 0.06207094  -5.123959  4.274950e-07
pred = predict(model, Boston)

ggplot()+
  geom_point(aes(dis,nox))+
  geom_line(aes(dis,pred))
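
Note that poly() uses an orthogonal polynomial basis, so the coefficients above are not the usual cubic coefficients. As a quick sketch (not part of the required output), raw = TRUE refits the identical curve with directly interpretable terms:

# same cubic fit with raw polynomial terms; the fitted values are identical,
# only the coefficient parametrization changes
model_raw = lm(nox ~ poly(dis, 3, raw = TRUE), data = Boston)
coef(model_raw)
all.equal(fitted(model), fitted(model_raw))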

b)

Plot the polynomial fits for a range of different polynomial degrees (say, from 1 to 10), and report the associated residual sum of squares.

errors = matrix(ncol=1, nrow=10)

# Fit polynomials of degree 1 through 10, plot each fit, and record the
# training MSE (= RSS / n) for each degree.
for (n in 1:10) {
  model = lm(nox ~ poly(dis, n), data = Boston)
  pred = predict(model, Boston)
  print(ggplot() +
          geom_point(aes(dis, nox)) +
          geom_line(aes(dis, pred)))
  errors[n, 1] = mean((Boston$nox - pred)^2)
}

errors
##              [,1]
##  [1,] 0.005471468
##  [2,] 0.004022257
##  [3,] 0.003822345
##  [4,] 0.003820121
##  [5,] 0.003785158
##  [6,] 0.003711971
##  [7,] 0.003655106
##  [8,] 0.003627727
##  [9,] 0.003623183
## [10,] 0.003620892
fitDF <- data.frame(MSE = as.vector(errors),
                    degree = 1:10)
ggplot(fitDF, aes(degree, MSE)) +
  geom_point() +
  geom_line()
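
The exercise asks for the residual sum of squares; the loop above records the training MSE, which is just RSS divided by n, so the conversion is one line (Boston has 506 observations):

# RSS = n * MSE
rss = nrow(Boston) * errors
round(rss, 3)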

c)

Perform cross-validation or another approach to select the optimal degree for the polynomial, and explain your results.

train = sample(1:dim(Boston)[1], dim(Boston)[1]/2)
trainW= Boston[train,]
testW = Boston[-train,]


polyDF = matrix(nrow = 10, ncol=2)
colnames(polyDF) = c("Degree", "MSE")
for (i in 1:10) {
  mod1 = lm(nox~poly(dis,i),data = trainW)
  polyDF[i,1] = i
  pred = predict(mod1, newdata = testW)
  polyDF[i,2] = mean((testW$nox - pred)^2)  # compare predictions to nox, not dis
}

polyDF = as.data.frame(polyDF)
ggplot(polyDF, aes(Degree,MSE))+
  geom_point()+
  geom_line()

A degree of 3 or 4 looks best: degree 4 attains the lowest test MSE, degree 3 is nearly as good with a simpler model, and the MSE climbs again from degree 5 onward as the extra flexibility starts to overfit.
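
A single train/test split is noisy, so the chosen degree can move around with the seed. A sketch of 10-fold cross-validation with boot::cv.glm (assuming the boot package is installed) averages over splits:

library(boot)
set.seed(1)
cv.mse = rep(NA, 10)
for (i in 1:10) {
  fit = glm(nox ~ poly(dis, i), data = Boston)
  cv.mse[i] = cv.glm(Boston, fit, K = 10)$delta[1]
}
which.min(cv.mse)  # degree with the lowest CV estimate of test MSE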

d)

Use the bs() function to fit a regression spline to predict nox using dis. Report the output for the fit using four degrees of freedom. How did you choose the knots? Plot the resulting fit.

library(splines)

dislims = range(dis)
dis.grid = seq(from = dislims[1], to = dislims[2])
par(mfrow = c(1, 1))
# place the three interior knots at the quartiles of dis
fit = lm(nox ~ bs(dis, knots = quantile(dis, probs = c(0.25, 0.5, 0.75))), data = Boston)
pred = predict(fit, newdata = list(dis = dis.grid), se = TRUE)
plot(dis, nox, col = "gray")
lines(dis.grid, pred$fit, lwd = 2)
lines(dis.grid, pred$fit + 2 * pred$se, lty = "dashed")
lines(dis.grid, pred$fit - 2 * pred$se, lty = "dashed")

The spline tracks the data well across the range of dis. I placed the knots at the 25th, 50th, and 75th percentiles of dis, which puts flexibility where the observations are concentrated. Note that three interior knots with the default cubic basis give six degrees of freedom; requesting df = 4 directly lets bs() place a single knot at the median instead, as shown below.
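
A quick illustration of the df = 4 case (standard splines::bs behavior): with a cubic basis, df = 4 leaves room for exactly one interior knot, which bs() places at the median of dis.

# bs() reports its automatically chosen knot location
attr(bs(dis, df = 4), "knots")
fit4 = lm(nox ~ bs(dis, df = 4), data = Boston)
coef(summary(fit4))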

e)

Now fit a regression spline for a range of degrees of freedom, and plot the resulting fits and report the resulting RSS. Describe the results obtained.

error = matrix(ncol=1, nrow=10)

# Fit regression splines with df = 1 through 10, plot each fit, and record
# the training MSE. bs() refuses df below 3 and bumps it up, hence the
# identical results for df = 1, 2, 3.
for (n in 1:10) {
  mod = lm(nox ~ bs(dis, df = n), data = Boston)
  pre = predict(mod, Boston)
  print(ggplot() +
          geom_point(aes(dis, nox)) +
          geom_line(aes(dis, pre)))
  error[n, 1] = mean((Boston$nox - pre)^2)
}
## Warning in bs(dis, df = 1): 'df' was too small; have used 3
## Warning in bs(dis, df = 2): 'df' was too small; have used 3

error
##              [,1]
##  [1,] 0.003822345
##  [2,] 0.003822345
##  [3,] 0.003822345
##  [4,] 0.003799951
##  [5,] 0.003636705
##  [6,] 0.003624439
##  [7,] 0.003616372
##  [8,] 0.003590899
##  [9,] 0.003608009
## [10,] 0.003542559
fitDF1 <- data.frame(MSE = as.vector(error),
                     df = 1:10)
ggplot(fitDF1, aes(df, MSE)) +
  geom_point() +
  geom_line()

The first three values are identical because bs() bumps df = 1 and df = 2 up to 3. From there the training MSE drifts downward as the degrees of freedom grow, reaching its minimum at df = 10; this is expected, since training error cannot penalize the added flexibility.
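
Because training MSE keeps rewarding larger df, a held-out comparison is more informative. A sketch reusing the trainW/testW split from part (c); predict() may warn when test values of dis fall outside the training knot boundaries:

bs.mse = rep(NA, 8)
for (d in 3:10) {
  fit = lm(nox ~ bs(dis, df = d), data = trainW)
  bs.mse[d - 2] = mean((testW$nox - predict(fit, newdata = testW))^2)
}
(3:10)[which.min(bs.mse)]  # df with the lowest held-out MSE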

detach(Boston)

Problem 2: College

This question relates to the College data set.

#install.packages("gam")
library(gam)
## Loading required package: foreach
## 
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
## 
##     accumulate, when
## Loaded gam 1.16.1
library(ISLR)
data(College)

a)

Split the data into a training set and a test set. Using out-of-state tuition as the response and the other variables as the predictors, perform forward stepwise selection on the training set in order to identify a satisfactory model that uses just a subset of the predictors.

attach(College)
set.seed(1)
names(College)
##  [1] "Private"     "Apps"        "Accept"      "Enroll"      "Top10perc"  
##  [6] "Top25perc"   "F.Undergrad" "P.Undergrad" "Outstate"    "Room.Board" 
## [11] "Books"       "Personal"    "PhD"         "Terminal"    "S.F.Ratio"  
## [16] "perc.alumni" "Expend"      "Grad.Rate"
train = sample(1:dim(College)[1], dim(College)[1]/2)
training= College[train,]
testing = College[-train,]

gam1=lm(Outstate~Private ,data=training)
summary(gam1) # yes
## 
## Call:
## lm(formula = Outstate ~ Private, data = training)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -7493  -2327   -403   2007   9807 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   6783.8      346.8   19.56   <2e-16 ***
## PrivateYes    5109.1      407.5   12.54   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3587 on 386 degrees of freedom
## Multiple R-squared:  0.2894, Adjusted R-squared:  0.2875 
## F-statistic: 157.2 on 1 and 386 DF,  p-value: < 2.2e-16
gam2=lm(Outstate~Apps ,data=training)
summary(gam2) # no
## 
## Call:
## lm(formula = Outstate ~ Apps, data = training)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7664.9 -3415.9  -330.3  2847.3 11472.4 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.018e+04  2.692e+02  37.811   <2e-16 ***
## Apps        9.490e-02  5.028e-02   1.888   0.0598 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4236 on 386 degrees of freedom
## Multiple R-squared:  0.009147,   Adjusted R-squared:  0.00658 
## F-statistic: 3.563 on 1 and 386 DF,  p-value: 0.05982
gam3=lm(Outstate~Accept ,data=training)
summary(gam3) # no 
## 
## Call:
## lm(formula = Outstate ~ Accept, data = training)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7836.8 -3456.3  -466.7  2770.3 11290.5 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.040e+04  2.780e+02  37.392   <2e-16 ***
## Accept      4.159e-02  8.267e-02   0.503    0.615    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4254 on 386 degrees of freedom
## Multiple R-squared:  0.0006551,  Adjusted R-squared:  -0.001934 
## F-statistic: 0.253 on 1 and 386 DF,  p-value: 0.6152
gam4=lm(Outstate~Enroll,data=training)
summary(gam4) # no
## 
## Call:
## lm(formula = Outstate ~ Enroll, data = training)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8047.7 -3277.6  -612.2  2704.7 10926.3 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 10822.0368   285.4255  37.915   <2e-16 ***
## Enroll         -0.4244     0.2355  -1.802   0.0723 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4238 on 386 degrees of freedom
## Multiple R-squared:  0.008343,   Adjusted R-squared:  0.005774 
## F-statistic: 3.248 on 1 and 386 DF,  p-value: 0.07231
gam5=lm(Outstate~Top10perc ,data=training)
summary(gam5) # yes
## 
## Call:
## lm(formula = Outstate ~ Top10perc, data = training)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8882.6 -2681.2   209.2  2274.0 11608.7 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 6606.034    329.100   20.07   <2e-16 ***
## Top10perc    139.410      9.993   13.95   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3470 on 386 degrees of freedom
## Multiple R-squared:  0.3352, Adjusted R-squared:  0.3335 
## F-statistic: 194.6 on 1 and 386 DF,  p-value: < 2.2e-16
gam6=lm(Outstate~Top25perc ,data=training)
summary(gam6) # yes
## 
## Call:
## lm(formula = Outstate ~ Top25perc, data = training)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9023.1 -2974.0   208.1  2598.8 11524.8 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 4695.557    566.287   8.292 1.87e-15 ***
## Top25perc    103.389      9.534  10.845  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3726 on 386 degrees of freedom
## Multiple R-squared:  0.2335, Adjusted R-squared:  0.2315 
## F-statistic: 117.6 on 1 and 386 DF,  p-value: < 2.2e-16
gam7=lm(Outstate~F.Undergrad ,data=training)
summary(gam7) # yes
## 
## Call:
## lm(formula = Outstate ~ F.Undergrad, data = training)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8346.8 -3200.6  -561.3  2642.1 10695.7 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.107e+04  2.721e+02  40.708  < 2e-16 ***
## F.Undergrad -1.545e-01  4.435e-02  -3.484  0.00055 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4190 on 386 degrees of freedom
## Multiple R-squared:  0.03049,    Adjusted R-squared:  0.02798 
## F-statistic: 12.14 on 1 and 386 DF,  p-value: 0.0005502
gam8=lm(Outstate~P.Undergrad ,data=training)
summary(gam8) # yes
## 
## Call:
## lm(formula = Outstate ~ P.Undergrad, data = training)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8265.1 -3277.2  -431.4  2713.2 10505.9 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 11195.6443   247.8704  45.167  < 2e-16 ***
## P.Undergrad    -0.7670     0.1444  -5.312 1.84e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4108 on 386 degrees of freedom
## Multiple R-squared:  0.06812,    Adjusted R-squared:  0.06571 
## F-statistic: 28.22 on 1 and 386 DF,  p-value: 1.835e-07
gam9=lm(Outstate~Books ,data=training)
summary(gam9) # no
## 
## Call:
## lm(formula = Outstate ~ Books, data = training)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -7902  -3427   -500   2776  11214 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.046e+04  8.485e+02  12.326   <2e-16 ***
## Books       4.528e-02  1.475e+00   0.031    0.976    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4255 on 386 degrees of freedom
## Multiple R-squared:  2.441e-06,  Adjusted R-squared:  -0.002588 
## F-statistic: 0.0009421 on 1 and 386 DF,  p-value: 0.9755
gam10=lm(Outstate~Room.Board ,data=training)
summary(gam10) #  yes
## 
## Call:
## lm(formula = Outstate ~ Room.Board, data = training)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7982.0 -2063.3  -362.2  1911.0 11985.8 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -613.2019   644.5743  -0.951    0.342    
## Room.Board     2.5189     0.1417  17.774   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3156 on 386 degrees of freedom
## Multiple R-squared:  0.4501, Adjusted R-squared:  0.4487 
## F-statistic: 315.9 on 1 and 386 DF,  p-value: < 2.2e-16
gam11=lm(Outstate~Personal ,data=training)
summary(gam11) # yes
gam12=lm(Outstate~PhD ,data=training)
summary(gam12) # yes
## 
## Call:
## lm(formula = Outstate ~ PhD, data = training)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8442.9 -3291.9    55.3  2846.0 15157.5 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2904.61     908.26   3.198   0.0015 ** 
## PhD           103.94      12.16   8.551  2.9e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3902 on 386 degrees of freedom
## Multiple R-squared:  0.1593, Adjusted R-squared:  0.1571 
## F-statistic: 73.12 on 1 and 386 DF,  p-value: 2.897e-16
gam13=lm(Outstate~Terminal ,data=training)
summary(gam13) # yes
## 
## Call:
## lm(formula = Outstate ~ Terminal, data = training)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10259.4  -3160.4    203.9   2577.4  13684.8 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1072.97    1102.49   0.973    0.331    
## Terminal      117.66      13.56   8.677   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3893 on 386 degrees of freedom
## Multiple R-squared:  0.1632, Adjusted R-squared:  0.161 
## F-statistic: 75.28 on 1 and 386 DF,  p-value: < 2.2e-16
gam14=lm(Outstate~S.F.Ratio ,data=training)
summary(gam14) # yes
## 
## Call:
## lm(formula = Outstate ~ S.F.Ratio, data = training)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10749.7  -2734.9   -207.4   2362.7   9334.2 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 19505.66     654.37   29.81   <2e-16 ***
## S.F.Ratio    -644.18      45.03  -14.31   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3440 on 386 degrees of freedom
## Multiple R-squared:  0.3465, Adjusted R-squared:  0.3448 
## F-statistic: 204.7 on 1 and 386 DF,  p-value: < 2.2e-16
gam15=lm(Outstate~perc.alumni ,data=training)
summary(gam15) # yes
## 
## Call:
## lm(formula = Outstate ~ perc.alumni, data = training)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9996.2 -2490.7  -562.5  2101.1  9599.7 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6205.32     364.84   17.01   <2e-16 ***
## perc.alumni   194.17      14.44   13.44   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3512 on 386 degrees of freedom
## Multiple R-squared:  0.3189, Adjusted R-squared:  0.3171 
## F-statistic: 180.7 on 1 and 386 DF,  p-value: < 2.2e-16
gam16=lm(Outstate~Expend ,data=training)
summary(gam16) # yes
## 
## Call:
## lm(formula = Outstate ~ Expend, data = training)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14016.9  -2277.4     -8.2   2231.3   8109.2 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 5.700e+03  3.201e+02   17.81   <2e-16 ***
## Expend      4.822e-01  2.782e-02   17.33   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3191 on 386 degrees of freedom
## Multiple R-squared:  0.4377, Adjusted R-squared:  0.4362 
## F-statistic: 300.4 on 1 and 386 DF,  p-value: < 2.2e-16
gam17=lm(Outstate~Grad.Rate ,data=training)
summary(gam17) # yes
## 
## Call:
## lm(formula = Outstate ~ Grad.Rate, data = training)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11615.2  -2319.0   -290.2   2316.4  12670.4 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   1176.1      681.4   1.726   0.0851 .  
## Grad.Rate      142.8       10.1  14.137   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3454 on 386 degrees of freedom
## Multiple R-squared:  0.3411, Adjusted R-squared:  0.3394 
## F-statistic: 199.8 on 1 and 386 DF,  p-value: < 2.2e-16
gam19=lm(Outstate~Private +Top10perc  ,data=training)
summary(gam19)
## 
## Call:
## lm(formula = Outstate ~ Private + Top10perc, data = training)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7234.8 -1699.5   -61.9  1897.3 10328.4 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 3739.734    339.662   11.01   <2e-16 ***
## PrivateYes  4498.731    324.972   13.84   <2e-16 ***
## Top10perc    125.325      8.239   15.21   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2839 on 385 degrees of freedom
## Multiple R-squared:  0.5561, Adjusted R-squared:  0.5538 
## F-statistic: 241.2 on 2 and 385 DF,  p-value: < 2.2e-16
gam20=lm(Outstate~Private +Top10perc + Top25perc  ,data=training)
summary(gam20)
## 
## Call:
## lm(formula = Outstate ~ Private + Top10perc + Top25perc, data = training)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7227.8 -1675.9   -41.8  1888.1 10324.5 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 3583.910    590.923   6.065 3.16e-09 ***
## PrivateYes  4515.813    329.636  13.699  < 2e-16 ***
## Top10perc    120.150     18.046   6.658 9.62e-11 ***
## Top25perc      5.134     15.921   0.322    0.747    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2842 on 384 degrees of freedom
## Multiple R-squared:  0.5562, Adjusted R-squared:  0.5528 
## F-statistic: 160.4 on 3 and 384 DF,  p-value: < 2.2e-16
gam21=lm(Outstate~Private +Top10perc + F.Undergrad  ,data=training)
summary(gam21)
## 
## Call:
## lm(formula = Outstate ~ Private + Top10perc + F.Undergrad, data = training)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7219.4 -1691.8   -62.8  1859.6 10354.6 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 3.603e+03  4.332e+02   8.316  1.6e-15 ***
## PrivateYes  4.637e+03  4.230e+02  10.962  < 2e-16 ***
## Top10perc   1.239e+02  8.728e+00  14.191  < 2e-16 ***
## F.Undergrad 2.031e-02  3.980e-02   0.510     0.61    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2842 on 384 degrees of freedom
## Multiple R-squared:  0.5564, Adjusted R-squared:  0.553 
## F-statistic: 160.6 on 3 and 384 DF,  p-value: < 2.2e-16
gam22=lm(Outstate~Private +Top10perc + P.Undergrad  ,data=training)
summary(gam22)
## 
## Call:
## lm(formula = Outstate ~ Private + Top10perc + P.Undergrad, data = training)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7249.4 -1688.2   -59.9  1919.8 10361.7 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 3.601e+03  4.176e+02   8.623   <2e-16 ***
## PrivateYes  4.599e+03  3.693e+02  12.451   <2e-16 ***
## Top10perc   1.255e+02  8.255e+00  15.208   <2e-16 ***
## P.Undergrad 6.516e-02  1.139e-01   0.572    0.568    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2841 on 384 degrees of freedom
## Multiple R-squared:  0.5565, Adjusted R-squared:  0.553 
## F-statistic: 160.6 on 3 and 384 DF,  p-value: < 2.2e-16
gam23=lm(Outstate~Private +Top10perc + Room.Board  ,data=training)
summary(gam23)
## 
## Call:
## lm(formula = Outstate ~ Private + Top10perc + Room.Board, data = training)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7881.7 -1672.9  -156.6  1450.1 10946.8 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1193.5710   496.5161  -2.404   0.0167 *  
## PrivateYes   3545.0605   286.9933  12.352   <2e-16 ***
## Top10perc      86.2009     7.7001  11.195   <2e-16 ***
## Room.Board      1.5236     0.1248  12.211   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2412 on 384 degrees of freedom
## Multiple R-squared:  0.6803, Adjusted R-squared:  0.6778 
## F-statistic: 272.3 on 3 and 384 DF,  p-value: < 2.2e-16
gam24=lm(Outstate~Private +Top10perc + Room.Board + Personal  ,data=training)
summary(gam24)
## 
## Call:
## lm(formula = Outstate ~ Private + Top10perc + Room.Board + Personal, 
##     data = training)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7119.7 -1484.9  -131.4  1471.6 10450.6 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -52.2526   582.0216  -0.090 0.928511    
## PrivateYes  3345.3311   287.9427  11.618  < 2e-16 ***
## Top10perc     85.6750     7.5833  11.298  < 2e-16 ***
## Room.Board     1.4955     0.1231  12.149  < 2e-16 ***
## Personal      -0.6345     0.1756  -3.614 0.000342 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2375 on 383 degrees of freedom
## Multiple R-squared:  0.6908, Adjusted R-squared:  0.6876 
## F-statistic: 213.9 on 4 and 383 DF,  p-value: < 2.2e-16
gam25=lm(Outstate~Private +Top10perc + Room.Board + PhD  ,data=training)
summary(gam25)
## 
## Call:
## lm(formula = Outstate ~ Private + Top10perc + Room.Board + PhD, 
##     data = training)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7473.1 -1662.2  -117.4  1479.7 12242.0 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3200.9005   691.8850  -4.626 5.10e-06 ***
## PrivateYes   3964.3598   299.4820  13.237  < 2e-16 ***
## Top10perc      68.3599     8.7219   7.838 4.58e-14 ***
## Room.Board      1.3639     0.1284  10.622  < 2e-16 ***
## PhD            39.8165     9.7545   4.082 5.44e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2365 on 383 degrees of freedom
## Multiple R-squared:  0.6936, Adjusted R-squared:  0.6904 
## F-statistic: 216.8 on 4 and 383 DF,  p-value: < 2.2e-16
gam26=lm(Outstate~Private +Top10perc + Room.Board + PhD + Terminal  ,data=training)
summary(gam26)
## 
## Call:
## lm(formula = Outstate ~ Private + Top10perc + Room.Board + PhD + 
##     Terminal, data = training)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7409.8 -1587.7   -48.9  1498.0 11999.6 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -4179.0715   780.7669  -5.353 1.50e-07 ***
## PrivateYes   4040.3618   298.5926  13.531  < 2e-16 ***
## Top10perc      67.3749     8.6633   7.777 6.98e-14 ***
## Room.Board      1.3088     0.1291  10.136  < 2e-16 ***
## PhD            12.4176    14.2168   0.873  0.38297    
## Terminal       39.8985    15.1625   2.631  0.00885 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2347 on 382 degrees of freedom
## Multiple R-squared:  0.6991, Adjusted R-squared:  0.6951 
## F-statistic: 177.5 on 5 and 382 DF,  p-value: < 2.2e-16
gam27=lm(Outstate~Private +Top10perc + Room.Board + PhD + S.F.Ratio  ,data=training)
summary(gam27)
## 
## Call:
## lm(formula = Outstate ~ Private + Top10perc + Room.Board + PhD + 
##     S.F.Ratio, data = training)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7251.2 -1518.0    -0.7  1427.3 11690.5 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  213.1338  1049.8490   0.203    0.839    
## PrivateYes  3411.8547   320.5010  10.645  < 2e-16 ***
## Top10perc     57.4778     8.9086   6.452 3.35e-10 ***
## Room.Board     1.2714     0.1275   9.972  < 2e-16 ***
## PhD           39.6702     9.5438   4.157 3.99e-05 ***
## S.F.Ratio   -163.7344    38.4852  -4.254 2.64e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2314 on 382 degrees of freedom
## Multiple R-squared:  0.7075, Adjusted R-squared:  0.7036 
## F-statistic: 184.8 on 5 and 382 DF,  p-value: < 2.2e-16
gam28=lm(Outstate~Private +Top10perc + Room.Board +PhD + perc.alumni  ,data=training)
summary(gam28)
## 
## Call:
## lm(formula = Outstate ~ Private + Top10perc + Room.Board + PhD + 
##     perc.alumni, data = training)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6649.2 -1639.1   -88.1  1416.0 11474.1 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3583.6657   661.0026  -5.422 1.05e-07 ***
## PrivateYes   3129.9358   313.2784   9.991  < 2e-16 ***
## Top10perc      49.8640     8.7860   5.675 2.74e-08 ***
## Room.Board      1.4115     0.1224  11.532  < 2e-16 ***
## PhD            35.5360     9.3050   3.819 0.000156 ***
## perc.alumni    72.7893    11.3570   6.409 4.32e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2250 on 382 degrees of freedom
## Multiple R-squared:  0.7234, Adjusted R-squared:  0.7197 
## F-statistic: 199.8 on 5 and 382 DF,  p-value: < 2.2e-16
gam29=lm(Outstate~Private +Top10perc + Room.Board +PhD + perc.alumni + Expend  ,data=training)
summary(gam29)
## 
## Call:
## lm(formula = Outstate ~ Private + Top10perc + Room.Board + PhD + 
##     perc.alumni + Expend, data = training)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6158.0 -1516.1  -113.9  1380.0 10226.5 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3.055e+03  6.385e+02  -4.785 2.45e-06 ***
## PrivateYes   2.902e+03  3.021e+02   9.607  < 2e-16 ***
## Top10perc    2.295e+01  9.518e+00   2.411 0.016378 *  
## Room.Board   1.220e+00  1.213e-01  10.051  < 2e-16 ***
## PhD          3.127e+01  8.931e+00   3.502 0.000517 ***
## perc.alumni  6.866e+01  1.089e+01   6.306 7.93e-10 ***
## Expend       1.645e-01  2.729e-02   6.027 3.94e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2153 on 381 degrees of freedom
## Multiple R-squared:  0.7474, Adjusted R-squared:  0.7435 
## F-statistic: 187.9 on 6 and 381 DF,  p-value: < 2.2e-16
gam30=lm(Outstate~Private + Room.Board +PhD  + perc.alumni + Expend + Grad.Rate  ,data=training)
summary(gam30)
## 
## Call:
## lm(formula = Outstate ~ Private + Room.Board + PhD + perc.alumni + 
##     Expend + Grad.Rate, data = training)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6465.2 -1400.5  -190.6  1332.8 10451.5 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -4.112e+03  6.262e+02  -6.566 1.69e-10 ***
## PrivateYes   2.713e+03  2.997e+02   9.053  < 2e-16 ***
## Room.Board   1.122e+00  1.225e-01   9.163  < 2e-16 ***
## PhD          3.195e+01  8.397e+00   3.805 0.000165 ***
## perc.alumni  6.045e+01  1.099e+01   5.501 6.92e-08 ***
## Expend       1.923e-01  2.377e-02   8.089 8.15e-15 ***
## Grad.Rate    3.246e+01  7.880e+00   4.120 4.66e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2122 on 381 degrees of freedom
## Multiple R-squared:  0.7545, Adjusted R-squared:  0.7507 
## F-statistic: 195.2 on 6 and 381 DF,  p-value: < 2.2e-16
Forward selection settles on the six-predictor model Outstate ~ Private + Room.Board + PhD + perc.alumni + Expend + Grad.Rate, which attains the highest adjusted R-squared (about 0.751) among the candidates tried.
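
The manual search above can be automated. A sketch with leaps::regsubsets (assuming the leaps package is installed); method = "forward" runs the same forward stepwise search over all 17 predictors:

library(leaps)
fwd = regsubsets(Outstate ~ ., data = training, nvmax = 17, method = "forward")
which.min(summary(fwd)$bic)  # model size preferred by BIC
coef(fwd, 6)                 # coefficients of the six-variable model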

b)

Fit a GAM on the training data, using out-of-state tuition as the response and the features selected in the previous step as the predictors. Plot the results, and explain your findings.

gam31 = gam(Outstate ~ Private + s(Room.Board, df = 2) + s(PhD, df = 2) + s(perc.alumni, df = 2) + s(Expend, df = 2) + s(Grad.Rate, df = 2), data = training)
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts
## argument ignored
par(mfrow = c(2, 3))
plot(gam31, se = TRUE, col = "red")

Each panel shows the fitted smooth for one predictor with approximate ±2 SE bands. Most of the smooths are close to linear; the curve for Expend bends the most, which anticipates the formal non-linearity test in part (d).

c)

Evaluate the model obtained on the test set, and explain the results obtained.

gam.pred = predict(gam31, newdata = testing)
gam.err <- mean((testing$Outstate - gam.pred)^2)
gam.err
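
To read the error on an interpretable scale (using only quantities defined above):

sqrt(gam.err)  # typical prediction error, in dollars of tuition
# test-set R^2: share of out-of-state tuition variance the GAM explains
1 - gam.err / mean((testing$Outstate - mean(testing$Outstate))^2)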

d)

For which variables, if any, is there evidence of a non-linear relationship with the response?

summary(gam31)
## 
## Call: gam(formula = Outstate ~ Private + s(Room.Board, df = 2) + s(PhD, 
##     df = 2) + s(perc.alumni, df = 2) + s(Expend, df = 2) + s(Grad.Rate, 
##     df = 2), data = training)
## Deviance Residuals:
##     Min      1Q  Median      3Q     Max 
## -6500.0 -1344.2  -104.2  1350.9  8792.9 
## 
## (Dispersion Parameter for gaussian family taken to be 3946537)
## 
##     Null Deviance: 6989966760 on 387 degrees of freedom
## Residual Deviance: 1483898477 on 376.0001 degrees of freedom
## AIC: 7007.986 
## 
## Number of Local Scoring Iterations: 2 
## 
## Anova for Parametric Effects
##                         Df     Sum Sq    Mean Sq F value    Pr(>F)    
## Private                  1 1843025465 1843025465 466.998 < 2.2e-16 ***
## s(Room.Board, df = 2)    1 1692725515 1692725515 428.914 < 2.2e-16 ***
## s(PhD, df = 2)           1  397159048  397159048 100.635 < 2.2e-16 ***
## s(perc.alumni, df = 2)   1  351049729  351049729  88.951 < 2.2e-16 ***
## s(Expend, df = 2)        1  419096845  419096845 106.194 < 2.2e-16 ***
## s(Grad.Rate, df = 2)     1   81869675   81869675  20.745 7.101e-06 ***
## Residuals              376 1483898477    3946537                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Anova for Nonparametric Effects
##                        Npar Df Npar F     Pr(F)    
## (Intercept)                                        
## Private                                            
## s(Room.Board, df = 2)        1  2.005   0.15763    
## s(PhD, df = 2)               1  3.781   0.05258 .  
## s(perc.alumni, df = 2)       1  0.314   0.57572    
## s(Expend, df = 2)            1 47.156 2.725e-11 ***
## s(Grad.Rate, df = 2)         1  1.057   0.30446    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Based on the ANOVA for nonparametric effects, only Expend shows strong evidence of a non-linear relationship with Outstate (p ≈ 2.7e-11). PhD is borderline (p ≈ 0.053), and the remaining smooth terms are consistent with purely linear effects.
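
A direct confirmation (a sketch using the gam package's anova method): refit the GAM with Expend entered linearly and compare it to gam31; a small p-value favors keeping the smooth term.

gam.lin = gam(Outstate ~ Private + s(Room.Board, df = 2) + s(PhD, df = 2) +
                s(perc.alumni, df = 2) + Expend + s(Grad.Rate, df = 2),
              data = training)
anova(gam.lin, gam31, test = "F")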