Homework 8

Problem 1

#1A) Use the poly function to fit a cubic polynomial regression to predict nox using dis. Report the regression output, and plot the resulting data and polynomial fits.

library(MASS)
library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1     ✔ purrr   0.3.2
## ✔ tibble  2.1.3     ✔ dplyr   0.8.3
## ✔ tidyr   0.8.3     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.4.0
## ── Conflicts ──────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ dplyr::select() masks MASS::select()
data("Boston")
attach(Boston)

#visualize data

ggplot(Boston, aes(x=dis,y=nox))+
  geom_point()

#cubic model

mod1 <- lm(nox~poly(dis,3))
summary(mod1)
## 
## Call:
## lm(formula = nox ~ poly(dis, 3))
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.121130 -0.040619 -0.009738  0.023385  0.194904 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    0.554695   0.002759 201.021  < 2e-16 ***
## poly(dis, 3)1 -2.003096   0.062071 -32.271  < 2e-16 ***
## poly(dis, 3)2  0.856330   0.062071  13.796  < 2e-16 ***
## poly(dis, 3)3 -0.318049   0.062071  -5.124 4.27e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.06207 on 502 degrees of freedom
## Multiple R-squared:  0.7148, Adjusted R-squared:  0.7131 
## F-statistic: 419.3 on 3 and 502 DF,  p-value: < 2.2e-16
#MSE
degreePoly<-3
polyMSE<-matrix(nrow=degreePoly, ncol=2)
colnames(polyMSE)<-c("degree", "MSE")
for(i in 1:degreePoly){ 
  polyMSE[i,1]<-i
  this.fit<-lm(nox~poly(dis,i), data=Boston)
  polyMSE[i,2]<-mean((nox-predict(this.fit, Boston))^2)
}

polyDF<-as.data.frame(polyMSE)
head(polyDF)
##   degree         MSE
## 1      1 0.005471468
## 2      2 0.004022257
## 3      3 0.003822345
#plot MSE
ggplot(data=polyDF, aes(x=degree, y=MSE))+
  geom_point()+
  geom_line()

#1B) Plot the polynomial fits for a range of different polynomial degrees and report the associated sum of squares.

degreePoly<-10
polyMSE<-matrix(nrow=degreePoly, ncol=2)
colnames(polyMSE)<-c("degree", "MSE")
for(i in 1:degreePoly){ 
  polyMSE[i,1]<-i
  this.fit<-lm(nox~poly(dis,i), data=Boston)
  polyMSE[i,2]<-mean((nox-predict(this.fit, Boston))^2)
}

polyDF<-as.data.frame(polyMSE)
polyDF
##    degree         MSE
## 1       1 0.005471468
## 2       2 0.004022257
## 3       3 0.003822345
## 4       4 0.003820121
## 5       5 0.003785158
## 6       6 0.003711971
## 7       7 0.003655106
## 8       8 0.003627727
## 9       9 0.003623183
## 10     10 0.003620892
#MSE
ggplot(data=polyDF, aes(x=degree, y=MSE))+
  geom_point()+
  geom_line()

#1C) Perform a cross validation or another approach to select the optimal degree for the polynomial, and explain your results.

library(boot)
str(Boston)
## 'data.frame':    506 obs. of  14 variables:
##  $ crim   : num  0.00632 0.02731 0.02729 0.03237 0.06905 ...
##  $ zn     : num  18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
##  $ indus  : num  2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
##  $ chas   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nox    : num  0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
##  $ rm     : num  6.58 6.42 7.18 7 7.15 ...
##  $ age    : num  65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
##  $ dis    : num  4.09 4.97 4.97 6.06 6.06 ...
##  $ rad    : int  1 2 2 3 3 3 5 5 5 5 ...
##  $ tax    : num  296 242 242 222 222 222 311 311 311 311 ...
##  $ ptratio: num  15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
##  $ black  : num  397 397 393 395 397 ...
##  $ lstat  : num  4.98 9.14 4.03 2.94 5.33 ...
##  $ medv   : num  24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
set.seed(3)

#50-50 split of the indexes
train<-sample(506,253)

degreePoly<-10
polyMSE<-matrix(nrow=degreePoly, ncol=2)
colnames(polyMSE)<-c("degree", "MSE")
for(i in 1:degreePoly){ 
  polyMSE[i,1]<-i
  this.fit<-lm(nox~poly(dis,i), data=Boston, subset=train)
  polyMSE[i,2]<-mean((nox-predict(this.fit, Boston))[-train]^2)
}

polyDF<-as.data.frame(polyMSE)
polyDF
##    degree         MSE
## 1       1 0.006023164
## 2       2 0.004283208
## 3       3 0.004043945
## 4       4 0.004048655
## 5       5 0.004004791
## 6       6 0.003901858
## 7       7 0.003825441
## 8       8 0.003801398
## 9       9 0.003796123
## 10     10 0.003806693
#k-fold
library(boot)
set.seed(3)

cv.error.k10<-rep(0, 10)
for(i in 1:10){
  glm.fit<-glm(nox~poly(dis, i), data=Boston)
  cv.error.k10[i]<-cv.glm(Boston, glm.fit, K=10)$delta[1]
}

cvDF<-data.frame(degree=1:10, cv.error.k10)

ggplot(data=cvDF, aes(x=degree, y=cv.error.k10))+
  geom_point()+
  geom_line()

#anova

fit.1=lm(nox~dis ,data=Boston)
fit.2=lm(nox~poly(dis ,2),data=Boston)
fit.3=lm(nox~poly(dis ,3),data=Boston)
fit.4=lm(nox~poly(dis,4),data=Boston)
fit.5=lm(nox~poly(dis ,5),data=Boston)
fit.6=lm(nox~poly(dis ,6),data=Boston)
fit.7=lm(nox~poly(dis ,7),data=Boston)
fit.8=lm(nox~poly(dis ,8),data=Boston)
fit.9=lm(nox~poly(dis ,9),data=Boston)
fit.10=lm(nox~poly(dis ,10),data=Boston)
anova(fit.1,fit.2,fit.3,fit.4,fit.5,fit.6,fit.7,fit.8,fit.9,fit.10)
## Analysis of Variance Table
## 
## Model  1: nox ~ dis
## Model  2: nox ~ poly(dis, 2)
## Model  3: nox ~ poly(dis, 3)
## Model  4: nox ~ poly(dis, 4)
## Model  5: nox ~ poly(dis, 5)
## Model  6: nox ~ poly(dis, 6)
## Model  7: nox ~ poly(dis, 7)
## Model  8: nox ~ poly(dis, 8)
## Model  9: nox ~ poly(dis, 9)
## Model 10: nox ~ poly(dis, 10)
##    Res.Df    RSS Df Sum of Sq        F    Pr(>F)    
## 1     504 2.7686                                    
## 2     503 2.0353  1   0.73330 198.1169 < 2.2e-16 ***
## 3     502 1.9341  1   0.10116  27.3292 2.535e-07 ***
## 4     501 1.9330  1   0.00113   0.3040  0.581606    
## 5     500 1.9153  1   0.01769   4.7797  0.029265 *  
## 6     499 1.8783  1   0.03703  10.0052  0.001657 ** 
## 7     498 1.8495  1   0.02877   7.7738  0.005505 ** 
## 8     497 1.8356  1   0.01385   3.7429  0.053601 .  
## 9     496 1.8333  1   0.00230   0.6211  0.431019    
## 10    495 1.8322  1   0.00116   0.3133  0.575908    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#If we look at the anova to compare different models (with different degree polynomials), we can see that a model going to the 7th degree is statistically significant (although the 4th degree is not). This shows that our model could be as complex as going to an 7th degree polynomial to predict nox using dis. However, the cross validations show that a model this complex is not worth using. In the 50-50 split, mean squared error continues to decline after the 3rd degree, but not by a substantial amount. In the k-fold, mean squared error actually increases after the 3rd degree. Therefore, the optimal degree for the polynomial is 3.

#1D) Use the bs() function to fit a regression spline to predict nox using dis. Report the output for the fit using four degrees of freedom. How did you choose the knots? Plot the resulting fit.

library(splines)

#The number of knots to use for the bs() function was chosen by R using four degrees of freedom.
attr(bs(dis,df=4) ,"knots")
##     50% 
## 3.20745
#output tells us to use 1 knot at 3.20745.

#basis function
dislims =range(dis)
dis.grid=seq(from=dislims[1],to=dislims[2])
par(mfrow=c(1,1))
fit=lm(nox~bs(dis, df=4,knots=c(3.20745)),data=Boston)
pred=predict(fit,newdata=data.frame(dis=dis.grid),se=T)

#output
summary(fit)
## 
## Call:
## lm(formula = nox ~ bs(dis, df = 4, knots = c(3.20745)), data = Boston)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.124622 -0.039259 -0.008514  0.020850  0.193891 
## 
## Coefficients:
##                                      Estimate Std. Error t value Pr(>|t|)
## (Intercept)                           0.73447    0.01460  50.306  < 2e-16
## bs(dis, df = 4, knots = c(3.20745))1 -0.05810    0.02186  -2.658  0.00812
## bs(dis, df = 4, knots = c(3.20745))2 -0.46356    0.02366 -19.596  < 2e-16
## bs(dis, df = 4, knots = c(3.20745))3 -0.19979    0.04311  -4.634 4.58e-06
## bs(dis, df = 4, knots = c(3.20745))4 -0.38881    0.04551  -8.544  < 2e-16
##                                         
## (Intercept)                          ***
## bs(dis, df = 4, knots = c(3.20745))1 ** 
## bs(dis, df = 4, knots = c(3.20745))2 ***
## bs(dis, df = 4, knots = c(3.20745))3 ***
## bs(dis, df = 4, knots = c(3.20745))4 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.06195 on 501 degrees of freedom
## Multiple R-squared:  0.7164, Adjusted R-squared:  0.7142 
## F-statistic: 316.5 on 4 and 501 DF,  p-value: < 2.2e-16
#plot the resulting fit
plot(dis,nox,col="gray")
lines(dis.grid,pred$fit ,lwd=2)
lines(dis.grid,pred$fit+2*pred$se ,lty="dashed")
lines(dis.grid,pred$fit-2*pred$se ,lty="dashed")

#1E) Now fit a regression spline for a range of degrees of freedom, and plot the resulting fits and report the resulting RSS. Describe the results obtained.

degreeFreedom <-10
degreeMSE<-matrix(nrow=degreeFreedom, ncol=2)
colnames(degreeMSE)<-c("degree", "MSE")
for(i in 1:degreeFreedom){ 
  degreeMSE[i,1]<-i
  this.fit<-lm(nox~ns(dis,df=i),data=Boston)
  degreeMSE[i,2]<-mean((nox-predict(this.fit, Boston))^2)
}

degreeDF<-as.data.frame(degreeMSE)

#MSE for degrees of freedom
degreeDF
##    degree         MSE
## 1       1 0.005471468
## 2       2 0.003902330
## 3       3 0.003815219
## 4       4 0.003726888
## 5       5 0.003676347
## 6       6 0.003664342
## 7       7 0.003653364
## 8       8 0.003552863
## 9       9 0.003554311
## 10     10 0.003536053
ggplot(data=degreeDF, aes(x=degree, y=MSE))+
  geom_point()+
  geom_line()

#These results show that as the degrees of freedom increases, the mean squared error of the regression spline decreases. However, the ideal degrees of freedom appears to be 5 because MSE only decreases gradually after 5 degrees of freedom.

#1F) Perform cross-validation in order to select the best degrees of freedom for a regression spline on this data. Describe your results.

set.seed(1234)
train <- sample(506,253)

degreeFreedom <-10
degreeMSE<-matrix(nrow=degreeFreedom, ncol=2)
colnames(degreeMSE)<-c("degree", "MSE")
for(i in 1:degreeFreedom){ 
  degreeMSE[i,1]<-i
  this.fit<-lm(nox~ns(dis,df=i),data=Boston, subset = train)
  degreeMSE[i,2]<-mean((nox-predict(this.fit, Boston))[-train]^2)
}

degreeDF<-as.data.frame(degreeMSE)
head(degreeDF)
##   degree         MSE
## 1      1 0.005037671
## 2      2 0.003803709
## 3      3 0.003717473
## 4      4 0.003672904
## 5      5 0.003604826
## 6      6 0.003602487
ggplot(data=degreeDF, aes(x=degree, y=MSE))+
  geom_point()+
  geom_line()

#This cross validation shows that MSE declines as degrees of freedom increases, but MSE beyond 5 degrees of freedom either declines gradually or increases. Therefore, this cross validation confirms that the ideal degrees of freedom to use is 5.

Problem 2

#2A) Split data into a training set and a testing set. Using out of state tuition as the response and the other variables as the predictors, perform forward stepwise selection on the training set in order to identify a satisfactory model that uses just a subset of the predictors.

library(ISLR)
data(College)
attach(College)
library(psych)
## 
## Attaching package: 'psych'
## The following object is masked from 'package:boot':
## 
##     logit
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
#choose variables from data set
College$Private <- as.numeric(College$Private)
cor.ci(College)

## Call:corCi(x = x, keys = keys, n.iter = n.iter, p = p, overlap = overlap, 
##     poly = poly, method = method, plot = plot, minlength = minlength)
## 
##  Coefficients and bootstrapped confidence intervals 
##             Privt Apps  Accpt Enrll Tp10p Tp25p F.Und P.Und Otstt Rm.Br
## Private      1.00                                                      
## Apps        -0.43  1.00                                                
## Accept      -0.48  0.94  1.00                                          
## Enroll      -0.57  0.85  0.91  1.00                                    
## Top10perc    0.16  0.34  0.19  0.18  1.00                              
## Top25perc    0.10  0.35  0.25  0.23  0.89  1.00                        
## F.Undergrad -0.62  0.81  0.87  0.96  0.14  0.20  1.00                  
## P.Undergrad -0.45  0.40  0.44  0.51 -0.11 -0.05  0.57  1.00            
## Outstate     0.55  0.05 -0.03 -0.16  0.56  0.49 -0.22 -0.25  1.00      
## Room.Board   0.34  0.16  0.09 -0.04  0.37  0.33 -0.07 -0.06  0.65  1.00
## Books       -0.02  0.13  0.11  0.11  0.12  0.12  0.12  0.08  0.04  0.13
## Personal    -0.30  0.18  0.20  0.28 -0.09 -0.08  0.32  0.32 -0.30 -0.20
## PhD         -0.16  0.39  0.36  0.33  0.53  0.55  0.32  0.15  0.38  0.33
## Terminal    -0.13  0.37  0.34  0.31  0.49  0.52  0.30  0.14  0.41  0.37
## S.F.Ratio   -0.47  0.10  0.18  0.24 -0.38 -0.29  0.28  0.23 -0.55 -0.36
## perc.alumni  0.41 -0.09 -0.16 -0.18  0.46  0.42 -0.23 -0.28  0.57  0.27
## Expend       0.26  0.26  0.12  0.06  0.66  0.53  0.02 -0.08  0.67  0.50
## Grad.Rate    0.34  0.15  0.07 -0.02  0.49  0.48 -0.08 -0.26  0.57  0.42
##             Books Prsnl PhD   Trmnl S.F.R prc.l Expnd Grd.R
## Books        1.00                                          
## Personal     0.18  1.00                                    
## PhD          0.03 -0.01  1.00                              
## Terminal     0.10 -0.03  0.85  1.00                        
## S.F.Ratio   -0.03  0.14 -0.13 -0.16  1.00                  
## perc.alumni -0.04 -0.29  0.25  0.27 -0.40  1.00            
## Expend       0.11 -0.10  0.43  0.44 -0.58  0.42  1.00      
## Grad.Rate    0.00 -0.27  0.31  0.29 -0.31  0.49  0.39  1.00
## 
##  scale correlations and bootstrapped confidence intervals 
##             lower.emp lower.norm estimate upper.norm upper.emp    p
## Privt-Apps      -0.52      -0.51    -0.43      -0.36     -0.38 0.00
## Privt-Accpt     -0.55      -0.54    -0.48      -0.41     -0.42 0.00
## Privt-Enrll     -0.62      -0.63    -0.57      -0.51     -0.51 0.00
## Privt-Tp10p      0.09       0.09     0.16       0.23      0.23 0.00
## Privt-Tp25p      0.02       0.02     0.10       0.17      0.17 0.02
## Privt-F.Und     -0.66      -0.67    -0.62      -0.56     -0.56 0.00
## Privt-P.Und     -0.53      -0.52    -0.45      -0.38     -0.39 0.00
## Privt-Otstt      0.50       0.51     0.55       0.60      0.60 0.00
## Privt-Rm.Br      0.30       0.29     0.34       0.40      0.39 0.00
## Privt-Books     -0.08      -0.08    -0.02       0.04      0.03 0.49
## Privt-Prsnl     -0.38      -0.38    -0.30      -0.22     -0.24 0.00
## Privt-PhD       -0.22      -0.22    -0.16      -0.09     -0.09 0.00
## Privt-Trmnl     -0.19      -0.20    -0.13      -0.06     -0.07 0.00
## Privt-S.F.R     -0.54      -0.54    -0.47      -0.39     -0.39 0.00
## Privt-prc.l      0.37       0.36     0.41       0.47      0.48 0.00
## Privt-Expnd      0.21       0.21     0.26       0.31      0.30 0.00
## Privt-Grd.R      0.26       0.27     0.34       0.40      0.40 0.00
## Apps-Accpt       0.91       0.92     0.94       0.96      0.96 0.00
## Apps-Enrll       0.80       0.79     0.85       0.90      0.90 0.00
## Apps-Tp10p       0.27       0.25     0.34       0.43      0.44 0.00
## Apps-Tp25p       0.29       0.29     0.35       0.42      0.42 0.00
## Apps-F.Und       0.77       0.75     0.81       0.87      0.87 0.00
## Apps-P.Und       0.35       0.33     0.40       0.46      0.47 0.00
## Apps-Otstt      -0.01      -0.02     0.05       0.13      0.13 0.17
## Apps-Rm.Br       0.10       0.09     0.16       0.24      0.24 0.00
## Apps-Books       0.07       0.07     0.13       0.20      0.21 0.00
## Apps-Prsnl       0.11       0.11     0.18       0.24      0.24 0.00
## Apps-PhD         0.35       0.34     0.39       0.45      0.44 0.00
## Apps-Trmnl       0.33       0.33     0.37       0.42      0.42 0.00
## Apps-S.F.R       0.02       0.02     0.10       0.16      0.16 0.02
## Apps-prc.l      -0.15      -0.15    -0.09      -0.03     -0.04 0.00
## Apps-Expnd       0.18       0.17     0.26       0.35      0.36 0.00
## Apps-Grd.R       0.11       0.10     0.15       0.20      0.20 0.00
## Accpt-Enrll      0.87       0.87     0.91       0.94      0.94 0.00
## Accpt-Tp10p      0.13       0.13     0.19       0.26      0.26 0.00
## Accpt-Tp25p      0.18       0.18     0.25       0.31      0.30 0.00
## Accpt-F.Und      0.84       0.83     0.87       0.91      0.91 0.00
## Accpt-P.Und      0.39       0.37     0.44       0.52      0.52 0.00
## Accpt-Otstt     -0.07      -0.08    -0.03       0.04      0.04 0.49
## Accpt-Rm.Br      0.04       0.02     0.09       0.16      0.16 0.01
## Accpt-Books      0.05       0.05     0.11       0.18      0.18 0.00
## Accpt-Prsnl      0.13       0.13     0.20       0.26      0.25 0.00
## Accpt-PhD        0.32       0.32     0.36       0.40      0.39 0.00
## Accpt-Trmnl      0.31       0.30     0.34       0.38      0.37 0.00
## Accpt-S.F.R      0.11       0.11     0.18       0.23      0.23 0.00
## Accpt-prc.l     -0.20      -0.21    -0.16      -0.11     -0.11 0.00
## Accpt-Expnd      0.07       0.07     0.12       0.18      0.18 0.00
## Accpt-Grd.R      0.03       0.02     0.07       0.12      0.12 0.01
## Enrll-Tp10p      0.12       0.11     0.18       0.26      0.26 0.00
## Enrll-Tp25p      0.15       0.16     0.23       0.29      0.30 0.00
## Enrll-F.Und      0.95       0.95     0.96       0.97      0.97 0.00
## Enrll-P.Und      0.46       0.45     0.51       0.58      0.57 0.00
## Enrll-Otstt     -0.21      -0.21    -0.16      -0.09     -0.09 0.00
## Enrll-Rm.Br     -0.09      -0.10    -0.04       0.02      0.03 0.18
## Enrll-Books      0.05       0.05     0.11       0.18      0.18 0.00
## Enrll-Prsnl      0.20       0.20     0.28       0.35      0.35 0.00
## Enrll-PhD        0.30       0.30     0.33       0.37      0.37 0.00
## Enrll-Trmnl      0.28       0.27     0.31       0.35      0.34 0.00
## Enrll-S.F.R      0.18       0.17     0.24       0.30      0.31 0.00
## Enrll-prc.l     -0.24      -0.24    -0.18      -0.12     -0.13 0.00
## Enrll-Expnd      0.01       0.01     0.06       0.12      0.13 0.02
## Enrll-Grd.R     -0.07      -0.08    -0.02       0.04      0.04 0.48
## Tp10p-Tp25p      0.88       0.88     0.89       0.91      0.90 0.00
## Tp10p-F.Und      0.08       0.07     0.14       0.21      0.22 0.00
## Tp10p-P.Und     -0.16      -0.15    -0.11      -0.06     -0.06 0.00
## Tp10p-Otstt      0.49       0.50     0.56       0.62      0.60 0.00
## Tp10p-Rm.Br      0.30       0.31     0.37       0.43      0.42 0.00
## Tp10p-Books      0.03       0.03     0.12       0.21      0.21 0.01
## Tp10p-Prsnl     -0.16      -0.16    -0.09      -0.02     -0.03 0.01
## Tp10p-PhD        0.50       0.50     0.53       0.57      0.57 0.00
## Tp10p-Trmnl      0.46       0.45     0.49       0.53      0.53 0.00
## Tp10p-S.F.R     -0.44      -0.45    -0.38      -0.32     -0.32 0.00
## Tp10p-prc.l      0.37       0.38     0.46       0.52      0.51 0.00
## Tp10p-Expnd      0.59       0.59     0.66       0.72      0.72 0.00
## Tp10p-Grd.R      0.43       0.43     0.49       0.55      0.53 0.00
## Tp25p-F.Und      0.13       0.13     0.20       0.27      0.27 0.00
## Tp25p-P.Und     -0.11      -0.11    -0.05       0.01      0.00 0.08
## Tp25p-Otstt      0.42       0.42     0.49       0.55      0.54 0.00
## Tp25p-Rm.Br      0.27       0.26     0.33       0.39      0.39 0.00
## Tp25p-Books      0.03       0.02     0.12       0.21      0.21 0.02
## Tp25p-Prsnl     -0.14      -0.15    -0.08      -0.01     -0.01 0.03
## Tp25p-PhD        0.50       0.50     0.55       0.59      0.59 0.00
## Tp25p-Trmnl      0.48       0.47     0.52       0.57      0.57 0.00
## Tp25p-S.F.R     -0.35      -0.36    -0.29      -0.23     -0.23 0.00
## Tp25p-prc.l      0.36       0.35     0.42       0.48      0.48 0.00
## Tp25p-Expnd      0.46       0.46     0.53       0.59      0.58 0.00
## Tp25p-Grd.R      0.42       0.42     0.48       0.53      0.52 0.00
## F.Und-P.Und      0.51       0.50     0.57       0.64      0.65 0.00
## F.Und-Otstt     -0.26      -0.27    -0.22      -0.16     -0.15 0.00
## F.Und-Rm.Br     -0.12      -0.13    -0.07      -0.01      0.00 0.02
## F.Und-Books      0.05       0.06     0.12       0.19      0.19 0.00
## F.Und-Prsnl      0.24       0.24     0.32       0.39      0.39 0.00
## F.Und-PhD        0.28       0.28     0.32       0.36      0.36 0.00
## F.Und-Trmnl      0.27       0.26     0.30       0.34      0.34 0.00
## F.Und-S.F.R      0.22       0.21     0.28       0.34      0.35 0.00
## F.Und-prc.l     -0.29      -0.29    -0.23      -0.17     -0.17 0.00
## F.Und-Expnd     -0.03      -0.03     0.02       0.07      0.07 0.43
## F.Und-Grd.R     -0.13      -0.14    -0.08      -0.02     -0.03 0.01
## P.Und-Otstt     -0.32      -0.32    -0.25      -0.19     -0.20 0.00
## P.Und-Rm.Br     -0.14      -0.14    -0.06       0.02      0.02 0.12
## P.Und-Books      0.04       0.04     0.08       0.14      0.14 0.00
## P.Und-Prsnl      0.27       0.25     0.32       0.39      0.39 0.00
## P.Und-PhD        0.12       0.11     0.15       0.19      0.19 0.00
## P.Und-Trmnl      0.11       0.10     0.14       0.18      0.18 0.00
## P.Und-S.F.R      0.15       0.14     0.23       0.32      0.34 0.00
## P.Und-prc.l     -0.37      -0.38    -0.28      -0.17     -0.19 0.00
## P.Und-Expnd     -0.16      -0.16    -0.08       0.00     -0.01 0.04
## P.Und-Grd.R     -0.32      -0.31    -0.26      -0.20     -0.21 0.00
## Otstt-Rm.Br      0.61       0.61     0.65       0.69      0.69 0.00
## Otstt-Books     -0.05      -0.04     0.04       0.10      0.10 0.37
## Otstt-Prsnl     -0.36      -0.36    -0.30      -0.23     -0.24 0.00
## Otstt-PhD        0.32       0.32     0.38       0.44      0.43 0.00
## Otstt-Trmnl      0.35       0.35     0.41       0.46      0.45 0.00
## Otstt-S.F.R     -0.60      -0.60    -0.55      -0.50     -0.49 0.00
## Otstt-prc.l      0.50       0.50     0.57       0.61      0.61 0.00
## Otstt-Expnd      0.63       0.62     0.67       0.72      0.72 0.00
## Otstt-Grd.R      0.52       0.52     0.57       0.62      0.62 0.00
## Rm.Br-Books      0.06       0.06     0.13       0.19      0.19 0.00
## Rm.Br-Prsnl     -0.27      -0.27    -0.20      -0.13     -0.14 0.00
## Rm.Br-PhD        0.27       0.26     0.33       0.39      0.40 0.00
## Rm.Br-Trmnl      0.32       0.31     0.37       0.43      0.43 0.00
## Rm.Br-S.F.R     -0.41      -0.42    -0.36      -0.31     -0.31 0.00
## Rm.Br-prc.l      0.21       0.20     0.27       0.34      0.35 0.00
## Rm.Br-Expnd      0.44       0.44     0.50       0.56      0.55 0.00
## Rm.Br-Grd.R      0.36       0.36     0.42       0.49      0.48 0.00
## Books-Prsnl      0.08       0.09     0.18       0.28      0.27 0.00
## Books-PhD       -0.10      -0.10     0.03       0.16      0.15 0.64
## Books-Trmnl      0.03       0.02     0.10       0.19      0.18 0.02
## Books-S.F.R     -0.09      -0.10    -0.03       0.05      0.05 0.46
## Books-prc.l     -0.11      -0.12    -0.04       0.04      0.03 0.30
## Books-Expnd      0.04       0.04     0.11       0.18      0.18 0.00
## Books-Grd.R     -0.07      -0.07     0.00       0.07      0.07 0.92
## Prsnl-PhD       -0.08      -0.10    -0.01       0.07      0.07 0.73
## Prsnl-Trmnl     -0.12      -0.12    -0.03       0.06      0.05 0.45
## Prsnl-S.F.R      0.05       0.04     0.14       0.23      0.22 0.01
## Prsnl-prc.l     -0.34      -0.35    -0.29      -0.22     -0.23 0.00
## Prsnl-Expnd     -0.18      -0.18    -0.10      -0.02     -0.03 0.01
## Prsnl-Grd.R     -0.34      -0.34    -0.27      -0.20     -0.21 0.00
## PhD-Trmnl        0.80       0.81     0.85       0.88      0.88 0.00
## PhD-S.F.R       -0.23      -0.22    -0.13      -0.05     -0.06 0.00
## PhD-prc.l        0.18       0.18     0.25       0.31      0.31 0.00
## PhD-Expnd        0.37       0.37     0.43       0.49      0.48 0.00
## PhD-Grd.R        0.24       0.24     0.31       0.37      0.37 0.00
## Trmnl-S.F.R     -0.25      -0.25    -0.16      -0.08     -0.08 0.00
## Trmnl-prc.l      0.20       0.20     0.27       0.33      0.32 0.00
## Trmnl-Expnd      0.39       0.39     0.44       0.49      0.48 0.00
## Trmnl-Grd.R      0.23       0.21     0.29       0.37      0.36 0.00
## S.F.R-prc.l     -0.46      -0.46    -0.40      -0.34     -0.34 0.00
## S.F.R-Expnd     -0.63      -0.64    -0.58      -0.53     -0.54 0.00
## S.F.R-Grd.R     -0.36      -0.37    -0.31      -0.24     -0.24 0.00
## prc.l-Expnd      0.36       0.36     0.42       0.47      0.47 0.00
## prc.l-Grd.R      0.43       0.43     0.49       0.54      0.54 0.00
## Expnd-Grd.R      0.32       0.32     0.39       0.45      0.45 0.00
#variables of particular interest are: Expend, Room.Board, Grad.Rate, perc.alumni, Top10perc, and Private


#train
set.seed(1234)
train <- sample(777,388)


#1 Predictor
mod1.a <- lm(Outstate~Expend,subset = train)
anova(mod1.a)
## Analysis of Variance Table
## 
## Response: Outstate
##            Df     Sum Sq    Mean Sq F value    Pr(>F)    
## Expend      1 2824855680 2824855680  364.25 < 2.2e-16 ***
## Residuals 386 2993513992    7755218                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#SS Res = 2993513992

mod1.b <- lm(Outstate~Room.Board, subset = train)
anova(mod1.b)
## Analysis of Variance Table
## 
## Response: Outstate
##             Df     Sum Sq    Mean Sq F value    Pr(>F)    
## Room.Board   1 2140223022 2140223022   224.6 < 2.2e-16 ***
## Residuals  386 3678146650    9528877                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#SS Res = 3678146650

mod1.c <- lm(Outstate~Grad.Rate, subset = train)
anova(mod1.c)
## Analysis of Variance Table
## 
## Response: Outstate
##            Df     Sum Sq    Mean Sq F value    Pr(>F)    
## Grad.Rate   1 2028807055 2028807055  206.65 < 2.2e-16 ***
## Residuals 386 3789562618    9817520                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#SS Res = 3789562618

mod1.d <- lm(Outstate~perc.alumni, subset = train)
anova(mod1.d)
## Analysis of Variance Table
## 
## Response: Outstate
##              Df     Sum Sq    Mean Sq F value    Pr(>F)    
## perc.alumni   1 2035082678 2035082678  207.63 < 2.2e-16 ***
## Residuals   386 3783286995    9801262                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#SS Res = 3783286995

mod1.e <- lm(Outstate~Top10perc, subset = train)
anova(mod1.e)
## Analysis of Variance Table
## 
## Response: Outstate
##            Df     Sum Sq    Mean Sq F value    Pr(>F)    
## Top10perc   1 1872638109 1872638109  183.19 < 2.2e-16 ***
## Residuals 386 3945731564   10222102                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#SS Res = 3945731564
#Looks like Expend is best predictor

#2 Predictors
mod2.a <- lm(Outstate~Expend+Private, subset = train)
anova(mod2.a)
## Analysis of Variance Table
## 
## Response: Outstate
##            Df     Sum Sq    Mean Sq F value    Pr(>F)    
## Expend      1 2824855680 2824855680  489.52 < 2.2e-16 ***
## Private     1  771813369  771813369  133.75 < 2.2e-16 ***
## Residuals 385 2221700623    5770651                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#SS Res = 2221700623 

mod2.b <- lm(Outstate~Expend+Room.Board, subset = train)
anova(mod2.b)
## Analysis of Variance Table
## 
## Response: Outstate
##             Df     Sum Sq    Mean Sq F value    Pr(>F)    
## Expend       1 2824855680 2824855680 440.704 < 2.2e-16 ***
## Room.Board   1  525713949  525713949  82.016 < 2.2e-16 ***
## Residuals  385 2467800043    6409870                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#SS Res = 2467800043

mod2.c <- lm(Outstate~Expend+Grad.Rate, subset = train)
anova(mod2.c)
## Analysis of Variance Table
## 
## Response: Outstate
##            Df     Sum Sq    Mean Sq F value    Pr(>F)    
## Expend      1 2824855680 2824855680 448.317 < 2.2e-16 ***
## Grad.Rate   1  567617990  567617990  90.083 < 2.2e-16 ***
## Residuals 385 2425896002    6301029                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#SS Res = 2425896002
#Expend+Private is best

#3 Predictors
mod3.a <- lm(Outstate~Expend+Private+Room.Board, subset = train)
anova(mod3.a)
## Analysis of Variance Table
## 
## Response: Outstate
##             Df     Sum Sq    Mean Sq F value    Pr(>F)    
## Expend       1 2824855680 2824855680 557.205 < 2.2e-16 ***
## Private      1  771813369  771813369 152.241 < 2.2e-16 ***
## Room.Board   1  274940487  274940487  54.232 1.096e-12 ***
## Residuals  384 1946760136    5069688                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#SS Res = 1946760136

mod3.b <- lm(Outstate~Expend+Private+Grad.Rate, subset = train)
anova(mod3.b)
## Analysis of Variance Table
## 
## Response: Outstate
##            Df     Sum Sq    Mean Sq F value    Pr(>F)    
## Expend      1 2824855680 2824855680 571.245 < 2.2e-16 ***
## Private     1  771813369  771813369 156.077 < 2.2e-16 ***
## Grad.Rate   1  322788631  322788631  65.275 8.537e-15 ***
## Residuals 384 1898911992    4945083                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#SS Res = 1898911992 

mod3.c <- lm(Outstate~Expend+Private+perc.alumni, subset = train)
anova(mod3.c)
## Analysis of Variance Table
## 
## Response: Outstate
##              Df     Sum Sq    Mean Sq F value    Pr(>F)    
## Expend        1 2824855680 2824855680 545.612 < 2.2e-16 ***
## Private       1  771813369  771813369 149.073 < 2.2e-16 ***
## perc.alumni   1  233575563  233575563  45.114 6.716e-11 ***
## Residuals   384 1988125060    5177409                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#SS Res = 1988125060
#Expend+Private+Grad.Rate is best

#4 Predictors
mod4.a <- lm(Outstate~Expend+Private+Grad.Rate+Room.Board, subset = train)
anova(mod4.a)
## Analysis of Variance Table
## 
## Response: Outstate
##             Df     Sum Sq    Mean Sq F value    Pr(>F)    
## Expend       1 2824855680 2824855680 628.955 < 2.2e-16 ***
## Private      1  771813369  771813369 171.844 < 2.2e-16 ***
## Grad.Rate    1  322788631  322788631  71.869 5.038e-16 ***
## Room.Board   1  178725273  178725273  39.793 7.799e-10 ***
## Residuals  383 1720186719    4491349                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#SS Res = 1720186719

mod4.b <- lm(Outstate~Expend+Private+Grad.Rate+Top10perc, subset = train)
anova(mod4.b)
## Analysis of Variance Table
## 
## Response: Outstate
##            Df     Sum Sq    Mean Sq  F value    Pr(>F)    
## Expend      1 2824855680 2824855680 574.3909 < 2.2e-16 ***
## Private     1  771813369  771813369 156.9364 < 2.2e-16 ***
## Grad.Rate   1  322788631  322788631  65.6341  7.35e-15 ***
## Top10perc   1   15317041   15317041   3.1145    0.0784 .  
## Residuals 383 1883594951    4918002                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#SS Res = 1883594951

mod4.c <- lm(Outstate~Expend+Private+Grad.Rate+perc.alumni, subset = train)
anova(mod4.c)
## Analysis of Variance Table
## 
## Response: Outstate
##              Df     Sum Sq    Mean Sq F value    Pr(>F)    
## Expend        1 2824855680 2824855680 595.878 < 2.2e-16 ***
## Private       1  771813369  771813369 162.807 < 2.2e-16 ***
## Grad.Rate     1  322788631  322788631  68.089 2.546e-15 ***
## perc.alumni   1   83239964   83239964  17.559 3.462e-05 ***
## Residuals   383 1815672028    4740658                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#SS Res = 1815672028
#Expend+Private+Grad.Rate+Room.Board is best

#5 predictors
mod5.a <- lm(Outstate~Expend+Private+Grad.Rate+Room.Board+perc.alumni, subset = train)
anova(mod5.a)
## Analysis of Variance Table
## 
## Response: Outstate
##              Df     Sum Sq    Mean Sq F value    Pr(>F)    
## Expend        1 2824855680 2824855680 677.068 < 2.2e-16 ***
## Private       1  771813369  771813369 184.990 < 2.2e-16 ***
## Grad.Rate     1  322788631  322788631  77.367 < 2.2e-16 ***
## Room.Board    1  178725273  178725273  42.837 1.918e-10 ***
## perc.alumni   1  126411732  126411732  30.299 6.806e-08 ***
## Residuals   382 1593774987    4172186                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#SS Res = 1593774987

mod5.b <- lm(Outstate~Expend+Private+Grad.Rate+Room.Board+Top10perc, subset = train)
anova(mod5.b)
## Analysis of Variance Table
## 
## Response: Outstate
##             Df     Sum Sq    Mean Sq  F value    Pr(>F)    
## Expend       1 2824855680 2824855680 641.2379 < 2.2e-16 ***
## Private      1  771813369  771813369 175.2004 < 2.2e-16 ***
## Grad.Rate    1  322788631  322788631  73.2725 2.792e-16 ***
## Room.Board   1  178725273  178725273  40.5704 5.455e-10 ***
## Top10perc    1   37355830   37355830   8.4797  0.003802 ** 
## Residuals  382 1682830889    4405316                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#SS Res = 1682830889
#Expend+Private+Grad.Rate+Room.Board+perc.alumni is best

#6 predictors
mod6.a <- lm(Outstate~Expend+Private+Grad.Rate+Room.Board+perc.alumni+Top10perc, subset = train)
anova(mod6.a)
## Analysis of Variance Table
## 
## Response: Outstate
##              Df     Sum Sq    Mean Sq  F value    Pr(>F)    
## Expend        1 2824855680 2824855680 681.3007 < 2.2e-16 ***
## Private       1  771813369  771813369 186.1465 < 2.2e-16 ***
## Grad.Rate     1  322788631  322788631  77.8504 < 2.2e-16 ***
## Room.Board    1  178725273  178725273  43.1051 1.701e-10 ***
## perc.alumni   1  126411732  126411732  30.4881 6.227e-08 ***
## Top10perc     1   14046731   14046731   3.3878   0.06646 .  
## Residuals   381 1579728256    4146268                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#SS Res = 1579728256

#Adding Top10perc is not significant, so we will not include this in the model. The final model will use five predictors: Expend+Private+Grad.Rate+Room.Board+perc.alumni

#2B) Fit a GAM on the training data, using out-of-state tuition as the response and the features selected in the previous step as the predictors. Plot the results, and explain your findings.

#install.packages("gam")
library(gam)
## Loading required package: foreach
## 
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
## 
##     accumulate, when
## Loaded gam 1.16.1
#gam with natural splines

gam1=lm(Outstate~ns(Expend,4)+Private+ns(Grad.Rate,3)+ns(Room.Board,4)+ns(Top10perc,2), subset = train)
summary(gam1)
## 
## Call:
## lm(formula = Outstate ~ ns(Expend, 4) + Private + ns(Grad.Rate, 
##     3) + ns(Room.Board, 4) + ns(Top10perc, 2), subset = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6339.5 -1080.6   -73.4  1200.2  4837.1 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          3116.8     1507.5   2.068 0.039375 *  
## ns(Expend, 4)1       1461.9      827.7   1.766 0.078174 .  
## ns(Expend, 4)2      13989.9     1309.3  10.685  < 2e-16 ***
## ns(Expend, 4)3       5399.8     2323.9   2.324 0.020685 *  
## ns(Expend, 4)4      -3450.6     3151.0  -1.095 0.274181    
## PrivateYes           2345.6      245.4   9.559  < 2e-16 ***
## ns(Grad.Rate, 3)1    2358.2      625.2   3.772 0.000188 ***
## ns(Grad.Rate, 3)2     852.3     2293.3   0.372 0.710359    
## ns(Grad.Rate, 3)3    2196.7     1212.1   1.812 0.070731 .  
## ns(Room.Board, 4)1   2934.0     1046.8   2.803 0.005331 ** 
## ns(Room.Board, 4)2   3004.8      894.5   3.359 0.000862 ***
## ns(Room.Board, 4)3   4184.0     2442.1   1.713 0.087483 .  
## ns(Room.Board, 4)4   1734.6     1405.3   1.234 0.217850    
## ns(Top10perc, 2)1    1102.3      887.1   1.243 0.214813    
## ns(Top10perc, 2)2     283.5     1072.0   0.264 0.791568    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1893 on 373 degrees of freedom
## Multiple R-squared:  0.7703, Adjusted R-squared:  0.7617 
## F-statistic: 89.37 on 14 and 373 DF,  p-value: < 2.2e-16
#The results show that the number splines that should be used for each variable (based on significance) are Expend=4 splines, Grad.Rate = 3 splines, Room.Board = 4 splines, and Top10perc = 1 spline.


#plot
plot.Gam(gam1, se=TRUE ,col ="red")

#2C) Evaluate the model obtained on the test set, and explain the results obtained.

test <- College[-train,]
gam2=lm(Outstate~ns(Expend,4)+Private+ns(Grad.Rate,3)+ns(Room.Board,4)+ns(Top10perc,2),data = test)
summary(gam2)
## 
## Call:
## lm(formula = Outstate ~ ns(Expend, 4) + Private + ns(Grad.Rate, 
##     3) + ns(Room.Board, 4) + ns(Top10perc, 2), data = test)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7268.0 -1065.8    27.8  1274.3  7708.2 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           245.1     1202.6   0.204 0.838639    
## ns(Expend, 4)1       1750.7      606.9   2.884 0.004148 ** 
## ns(Expend, 4)2      11297.6     1080.3  10.457  < 2e-16 ***
## ns(Expend, 4)3       8629.3     1546.7   5.579 4.65e-08 ***
## ns(Expend, 4)4       5745.6     1504.2   3.820 0.000156 ***
## Private              2553.7      261.1   9.779  < 2e-16 ***
## ns(Grad.Rate, 3)1    2943.1      589.1   4.996 9.01e-07 ***
## ns(Grad.Rate, 3)2    2847.7     1920.7   1.483 0.139012    
## ns(Grad.Rate, 3)3    1198.3     1125.3   1.065 0.287606    
## ns(Room.Board, 4)1   1413.4      814.6   1.735 0.083542 .  
## ns(Room.Board, 4)2   3011.6      748.0   4.026 6.87e-05 ***
## ns(Room.Board, 4)3   3633.9     1900.7   1.912 0.056653 .  
## ns(Room.Board, 4)4   4231.0     1111.6   3.806 0.000165 ***
## ns(Top10perc, 2)1    1145.8      933.8   1.227 0.220556    
## ns(Top10perc, 2)2    -536.7      841.2  -0.638 0.523880    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1939 on 374 degrees of freedom
## Multiple R-squared:  0.7912, Adjusted R-squared:  0.7834 
## F-statistic: 101.3 on 14 and 374 DF,  p-value: < 2.2e-16

#The results show that having 4 splines for Expend is significant, Grad.Rate is not significant beyond 1 spline, Room.Board is only significant with 2 or 4 splines, and Top10perc is not significant.

#2D) For which variables, if any, is there evidence of a non-linear relationship with the response?

#Expend, Grad.Rate, and Room.Board appear to have a non-linear relationship with Outstate. Top10perc is slightly curved, so it might be slightly non-linear.