Problem 1

(a.)
library(MASS)       # Boston data set
library(tidyverse)  # ggplot2 for the plots, dplyr for the pipe used later

mod1 <- lm(Boston$nox ~ poly(Boston$dis,3))
summary(mod1)
## 
## Call:
## lm(formula = Boston$nox ~ poly(Boston$dis, 3))
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.121130 -0.040619 -0.009738  0.023385  0.194904 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           0.554695   0.002759 201.021  < 2e-16 ***
## poly(Boston$dis, 3)1 -2.003096   0.062071 -32.271  < 2e-16 ***
## poly(Boston$dis, 3)2  0.856330   0.062071  13.796  < 2e-16 ***
## poly(Boston$dis, 3)3 -0.318049   0.062071  -5.124 4.27e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.06207 on 502 degrees of freedom
## Multiple R-squared:  0.7148, Adjusted R-squared:  0.7131 
## F-statistic: 419.3 on 3 and 502 DF,  p-value: < 2.2e-16
ggplot(data=Boston, aes(x=dis,y=nox))+
  geom_point()+
  geom_smooth(method = "lm" , formula = y~poly(x , 3) , se = FALSE)

(b.)

dim(Boston)
## [1] 506  14
set.seed(239)
train = sample(nrow(Boston), floor(nrow(Boston)/2))

degreePoly = 10
polyMSE = matrix(nrow=degreePoly, ncol=2)
colnames(polyMSE) = c("degree", "MSE")
# fit polynomials of degree 1 through 10 on the training half and record
# the MSE on the held-out half
for(i in 1:degreePoly){
  polyMSE[i, 1] <- i
  this.fit <- lm(nox ~ poly(dis, i), data = Boston, subset = train)
  polyMSE[i, 2] <- mean((Boston$nox - predict(this.fit, Boston))[-train]^2)
}

polyDF<-as.data.frame(polyMSE)
ggplot(data=polyDF, aes(x=degree,y=MSE))+
  geom_point()+
  geom_line()

(c.)
library(boot)
attach(Boston)
set.seed(239)
# 10-fold CV estimate of test error for polynomial degrees 1 through 10
errors <- rep(0, 10)
for(i in 1:10){
  glm.fit <- glm(nox ~ poly(dis,i),data=Boston)
  errors[i] <- cv.glm(Boston, glm.fit, K=10)$delta[1]
}

cvDF<-data.frame(degree=1:10, errors)
cvDF5 <- cvDF %>%
  filter(degree <= 5)

ggplot(data=cvDF5, aes(x = degree, y = errors))+
  geom_point()+
  geom_line()

It looks like a 3rd-degree polynomial fits best, since that is where the cross-validated test error is lowest.
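
(Just to check that claim programmatically, a one-liner pulling the minimizing degree out of the cvDF data frame built above:)

# degree whose 10-fold CV error estimate is smallest
cvDF$degree[which.min(cvDF$errors)]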

(d.)
library(splines)
dislims = range(dis)
# evaluation grid across the range of dis (100 points so the plotted curve is smooth)
dis.grid = seq(from = dislims[1], to = dislims[2], length.out = 100)


fit = lm(nox ~ bs(dis, knots = c(3, 6, 8)), data = Boston)
pred = predict(fit, newdata = list(dis = dis.grid), se = TRUE)

plot(dis, nox, col = "gray")
lines(dis.grid, pred$fit, lwd = 2)
lines(dis.grid, pred$fit + 2*pred$se, lty = "dashed")
lines(dis.grid, pred$fit - 2*pred$se, lty = "dashed")

summary(fit)
## 
## Call:
## lm(formula = nox ~ bs(dis, knots = c(3, 6, 8)), data = Boston)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.132406 -0.038970 -0.009732  0.025620  0.187718 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   0.70803    0.01617  43.794  < 2e-16 ***
## bs(dis, knots = c(3, 6, 8))1  0.00953    0.02578   0.370    0.712    
## bs(dis, knots = c(3, 6, 8))2 -0.26101    0.01816 -14.375  < 2e-16 ***
## bs(dis, knots = c(3, 6, 8))3 -0.22973    0.02576  -8.920  < 2e-16 ***
## bs(dis, knots = c(3, 6, 8))4 -0.33005    0.03047 -10.832  < 2e-16 ***
## bs(dis, knots = c(3, 6, 8))5 -0.26763    0.06039  -4.432 1.15e-05 ***
## bs(dis, knots = c(3, 6, 8))6 -0.30598    0.06071  -5.040 6.52e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.06117 on 499 degrees of freedom
## Multiple R-squared:  0.7247, Adjusted R-squared:  0.7214 
## F-statistic: 218.9 on 6 and 499 DF,  p-value: < 2.2e-16
detach(Boston)

I picked these knots by eyeballing where the data seemed to change shape, spacing them roughly evenly across the range of dis. If I had supplied df instead of explicit knots, bs() would have placed the interior knots at evenly spaced quantiles of dis, so each section would contain roughly the same number of observations.
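
(A quick sketch of letting R place the knots instead: with df = 6 and the default cubic degree, bs() uses three interior knots at the quantiles of dis, which can be read back off the basis object.)

# bs() with df = 6 (= 3 interior knots + degree 3) puts the knots at the
# 25th/50th/75th percentiles of dis rather than at hand-picked values
fit.df <- lm(nox ~ bs(dis, df = 6), data = Boston)
attr(bs(Boston$dis, df = 6), "knots")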

Problem 2

(a.)

Wait, are we really supposed to do this on all the variables? There are roughly 20 of them, and two layers of things to check at each step. Here's how I would do forward stepwise selection by hand if I really had to:

College = read.csv("http://faculty.marshall.usc.edu/gareth-james/ISL/College.csv",header=TRUE)


#names(College)
#dim(College)
train = sample(1:dim(College)[1] , floor(dim(College)[1] / 2))

# example of the sequential (Type I) sums of squares that anova() reports for a fitted lm
mod1a <- lm(Outstate ~ perc.alumni + Room.Board, data = College)
anova(mod1a)
## Analysis of Variance Table
## 
## Response: Outstate
##              Df     Sum Sq    Mean Sq F value    Pr(>F)    
## perc.alumni   1 4027178047 4027178047  606.39 < 2.2e-16 ***
## Room.Board    1 3391774193 3391774193  510.71 < 2.2e-16 ***
## Residuals   774 5140345186    6641273                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# regression sum of squares of a one-predictor model for every column except Outstate
# (note this still includes the name column X, which is what triggers the warning below)
mod1sums <- c()
for(pred in College[, names(College) != "Outstate"]){
  mod <- lm(Outstate ~ pred, data = College)
  mod1sums <- c(mod1sums, anova(mod)[1, 2])
}
## Warning in anova.lm(mod): ANOVA F-tests on an essentially perfect fit are
## unreliable
mod1sums
##  [1] 12559297426  3835884599    31598296     8330540   303598468
##  [6]  3971446292  3008031144   584567601   807167148  5376025290
## [11]    18960781  1123466496  1842141521  2090498711  3866086440
## [16]  4027178047  5684728234  4099005307
# predictor with the largest regression sum of squares; index into the same set of
# columns the loop used (everything except Outstate), not all of names(College)
names(College)[names(College) != "Outstate"][which.max(mod1sums)]
## [1] "X"
# the "winner" is X, which is just the row-identifier column (hence the essentially
# perfect fit warning above), so it isn't a real predictor; drop it and illustrate
# the significance check with perc.alumni instead
mod1 <- lm(Outstate ~ perc.alumni, data = College)
anova(mod1)
## Analysis of Variance Table
## 
## Response: Outstate
##              Df     Sum Sq    Mean Sq F value    Pr(>F)    
## perc.alumni   1 4027178047 4027178047   365.8 < 2.2e-16 ***
## Residuals   775 8532119379   11009186                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#then repeat, adding the best remaining predictor each time, until the newly added predictor is no longer significant.

That sounds awful to do by hand, though, so until we learn how to iterate over variables more cleanly in R and I can automate it, I'm just going to pick the significant variables from the full-model fit shown below.
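
(For reference, the whole forward search could also be automated with the leaps package; a sketch, assuming the package is installed:)

library(leaps)
# forward stepwise selection over every predictor except the name column X
fwd = regsubsets(Outstate ~ . - X, data = College, method = "forward", nvmax = 17)
summary(fwd)$adjr2   # adjusted R^2 of the best 1-, 2-, ..., 17-variable models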

mod = lm(Outstate~.-X, data = College , subset = train)
summary(mod)
## 
## Call:
## lm(formula = Outstate ~ . - X, data = College, subset = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6638.2 -1255.2   -36.9  1084.5  9197.7 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.170e+03  1.058e+03  -2.051    0.041 *  
## PrivateYes   2.095e+03  3.736e+02   5.609 3.99e-08 ***
## Apps        -6.053e-01  1.130e-01  -5.355 1.50e-07 ***
## Accept       1.247e+00  2.250e-01   5.539 5.77e-08 ***
## Enroll      -3.115e-01  5.806e-01  -0.536    0.592    
## Top10perc    2.377e+01  1.585e+01   1.500    0.135    
## Top25perc   -8.113e+00  1.225e+01  -0.662    0.508    
## F.Undergrad -1.527e-01  1.147e-01  -1.332    0.184    
## P.Undergrad  1.215e-01  1.015e-01   1.197    0.232    
## Room.Board   7.002e-01  1.215e-01   5.762 1.75e-08 ***
## Books       -7.128e-02  5.342e-01  -0.133    0.894    
## Personal    -3.426e-01  1.823e-01  -1.879    0.061 .  
## PhD          1.436e+01  1.116e+01   1.287    0.199    
## Terminal     1.781e+01  1.319e+01   1.351    0.178    
## S.F.Ratio   -3.108e+01  3.448e+01  -0.902    0.368    
## perc.alumni  4.500e+01  1.113e+01   4.041 6.47e-05 ***
## Expend       3.051e-01  3.334e-02   9.151  < 2e-16 ***
## Grad.Rate    3.728e+01  8.366e+00   4.457 1.10e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1922 on 370 degrees of freedom
## Multiple R-squared:  0.7941, Adjusted R-squared:  0.7846 
## F-statistic: 83.94 on 17 and 370 DF,  p-value: < 2.2e-16
newmod = lm(Outstate ~ Apps + Accept + perc.alumni + Grad.Rate + Room.Board + Expend,data=College, subset = train)
summary(newmod)
## 
## Call:
## lm(formula = Outstate ~ Apps + Accept + perc.alumni + Grad.Rate + 
##     Room.Board + Expend, data = College, subset = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6174.2 -1470.2  -112.6  1248.5  8636.8 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.902e+03  5.443e+02  -5.331 1.68e-07 ***
## Apps        -5.713e-01  1.074e-01  -5.317 1.80e-07 ***
## Accept       7.570e-01  1.686e-01   4.491 9.41e-06 ***
## perc.alumni  7.647e+01  1.097e+01   6.973 1.38e-11 ***
## Grad.Rate    5.708e+01  8.247e+00   6.921 1.91e-11 ***
## Room.Board   1.093e+00  1.201e-01   9.102  < 2e-16 ***
## Expend       3.587e-01  2.941e-02  12.195  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2104 on 381 degrees of freedom
## Multiple R-squared:  0.7457, Adjusted R-squared:  0.7417 
## F-statistic: 186.2 on 6 and 381 DF,  p-value: < 2.2e-16

There. That works fine, right?
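
(One way to back that up, sketched here: newmod is nested inside the full training-set fit mod, so an F-test of the dropped terms shows whether trimming them loses anything important.)

# F-test comparing the trimmed model to the full model on the training half
anova(newmod, mod)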

(b.)
#install.packages("gam")
#install.packages("mgcv")
library(gam)
## Loading required package: foreach
## 
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
## 
##     accumulate, when
## Loaded gam 1.16.1
gam1 = gam(Outstate ~ Private + s(Room.Board,4) + s(Expend,4) + s(perc.alumni,4) + s(Grad.Rate,4) + s(Terminal,4) , data = College , subset = train)
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts
## argument ignored
summary(gam1)
## 
## Call: gam(formula = Outstate ~ Private + s(Room.Board, 4) + s(Expend, 
##     4) + s(perc.alumni, 4) + s(Grad.Rate, 4) + s(Terminal, 4), 
##     data = College, subset = train)
## Deviance Residuals:
##     Min      1Q  Median      3Q     Max 
## -6947.5 -1133.5    73.4  1184.5  6673.1 
## 
## (Dispersion Parameter for gaussian family taken to be 3247109)
## 
##     Null Deviance: 6634665171 on 387 degrees of freedom
## Residual Deviance: 1188440845 on 365.9997 degrees of freedom
## AIC: 6941.839 
## 
## Number of Local Scoring Iterations: 2 
## 
## Anova for Parametric Effects
##                    Df     Sum Sq    Mean Sq  F value    Pr(>F)    
## Private             1 1852988435 1852988435 570.6579 < 2.2e-16 ***
## s(Room.Board, 4)    1 1140118801 1140118801 351.1181 < 2.2e-16 ***
## s(Expend, 4)        1 1414732578 1414732578 435.6899 < 2.2e-16 ***
## s(perc.alumni, 4)   1  156291036  156291036  48.1324 1.817e-11 ***
## s(Grad.Rate, 4)     1   84460230   84460230  26.0109 5.460e-07 ***
## s(Terminal, 4)      1   10298965   10298965   3.1717   0.07575 .  
## Residuals         366 1188440845    3247109                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Anova for Nonparametric Effects
##                   Npar Df  Npar F     Pr(F)    
## (Intercept)                                    
## Private                                        
## s(Room.Board, 4)        3  2.9644   0.03209 *  
## s(Expend, 4)            3 25.1570 7.994e-15 ***
## s(perc.alumni, 4)       3  1.7457   0.15724    
## s(Grad.Rate, 4)         3  3.1035   0.02667 *  
## s(Terminal, 4)          3  0.7703   0.51121    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
par(mfrow = c(2, 3))  # 2 x 3 grid so all six term plots are legible
plot(gam1, se=TRUE ,col ="blue")

This seems like a reasonable start. In the parametric ANOVA every term is significant except Terminal, which is only marginal. In the Anova for Nonparametric Effects, though, only Expend has a strongly significant nonlinear effect; Room.Board and Grad.Rate are significant but only just, and perc.alumni and Terminal are not. Terms whose nonlinear part isn't doing much might be better entered as ordinary linear (parametric) terms, so I'm going to try that:

gam2 = gam(Outstate ~ Private + Room.Board + s(Expend,4) + s(perc.alumni,4) + Grad.Rate + Terminal , data = College , subset = train)
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts
## argument ignored
summary(gam2)
## 
## Call: gam(formula = Outstate ~ Private + Room.Board + s(Expend, 4) + 
##     s(perc.alumni, 4) + Grad.Rate + Terminal, data = College, 
##     subset = train)
## Deviance Residuals:
##      Min       1Q   Median       3Q      Max 
## -7100.23 -1203.03    39.89  1256.35  6718.56 
## 
## (Dispersion Parameter for gaussian family taken to be 3336592)
## 
##     Null Deviance: 6634665171 on 387 degrees of freedom
## Residual Deviance: 1251220917 on 374.9996 degrees of freedom
## AIC: 6943.813 
## 
## Number of Local Scoring Iterations: 2 
## 
## Anova for Parametric Effects
##                    Df     Sum Sq    Mean Sq  F value    Pr(>F)    
## Private             1 1848936042 1848936042 554.1390 < 2.2e-16 ***
## Room.Board          1 1127785912 1127785912 338.0053 < 2.2e-16 ***
## s(Expend, 4)        1 1396079889 1396079889 418.4149 < 2.2e-16 ***
## s(perc.alumni, 4)   1  165529756  165529756  49.6104 9.019e-12 ***
## Grad.Rate           1   88078707   88078707  26.3978 4.479e-07 ***
## Terminal            1   10751933   10751933   3.2224   0.07344 .  
## Residuals         375 1251220917    3336592                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Anova for Nonparametric Effects
##                   Npar Df  Npar F     Pr(F)    
## (Intercept)                                    
## Private                                        
## Room.Board                                     
## s(Expend, 4)            3 27.5660 4.441e-16 ***
## s(perc.alumni, 4)       3  1.6874    0.1692    
## Grad.Rate                                      
## Terminal                                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Since the nonlinear piece of perc.alumni still isn't significant and Terminal is only marginal, I'll also try entering perc.alumni linearly and dropping Terminal altogether:

gam3 = gam(Outstate ~ Private + Room.Board + s(Expend,4) + perc.alumni + Grad.Rate, data = College , subset = train)
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts
## argument ignored
summary(gam3)
## 
## Call: gam(formula = Outstate ~ Private + Room.Board + s(Expend, 4) + 
##     perc.alumni + Grad.Rate, data = College, subset = train)
## Deviance Residuals:
##      Min       1Q   Median       3Q      Max 
## -6875.56 -1072.64    22.33  1204.26  6487.87 
## 
## (Dispersion Parameter for gaussian family taken to be 3370829)
## 
##     Null Deviance: 6634665171 on 387 degrees of freedom
## Residual Deviance: 1277543644 on 378.9998 degrees of freedom
## AIC: 6943.89 
## 
## Number of Local Scoring Iterations: 2 
## 
## Anova for Parametric Effects
##               Df     Sum Sq    Mean Sq F value    Pr(>F)    
## Private        1 1860656625 1860656625 551.988 < 2.2e-16 ***
## Room.Board     1 1095797370 1095797370 325.082 < 2.2e-16 ***
## s(Expend, 4)   1 1367904265 1367904265 405.807 < 2.2e-16 ***
## perc.alumni    1  164068974  164068974  48.673 1.356e-11 ***
## Grad.Rate      1   91184388   91184388  27.051 3.250e-07 ***
## Residuals    379 1277543644    3370829                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Anova for Nonparametric Effects
##              Npar Df Npar F     Pr(F)    
## (Intercept)                              
## Private                                  
## Room.Board                               
## s(Expend, 4)       3 31.304 < 2.2e-16 ***
## perc.alumni                              
## Grad.Rate                                
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

I think this (gam3) looks like the better model; everything left in it is clearly significant.
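
(A more direct comparison, sketched below: the three fits are nested, gam3 inside gam2 inside gam1, so an F-test along the sequence shows whether the extra smooth terms earn their keep.)

# sequential F-tests for the nested GAMs; a non-significant step favours the simpler model
anova(gam3, gam2, gam1, test = "F")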

(c.)
# test MSE on the held-out half for each model
gam1.pred <- predict(gam1, newdata = College[-train, ])
gam1.mse  <- mean((College$Outstate[-train] - gam1.pred)^2)

gam3.pred <- predict(gam3, newdata = College[-train, ])
gam3.mse  <- mean((College$Outstate[-train] - gam3.pred)^2)

newmod.pred <- predict(newmod, newdata = College[-train, ])
newmod.mse  <- mean((College$Outstate[-train] - newmod.pred)^2)

gam1.mse
## [1] 3797807
gam3.mse
## [1] 3894087
newmod.mse
## [1] 5604592

Hmmm, the test MSE for gam3 is a bit higher than for the original gam1, so maybe my trimming didn't actually help. However, both GAMs have a much lower test MSE than the plain linear model from (a), so the smooth terms are clearly doing something right.
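
(To put those numbers on a more interpretable scale, a quick sketch of the corresponding test-set R^2, using the held-out mean as the baseline:)

# fraction of held-out variance in Outstate explained by each model
test.var <- mean((College$Outstate[-train] - mean(College$Outstate[-train]))^2)
1 - c(gam1 = gam1.mse, gam3 = gam3.mse, linear = newmod.mse) / test.var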

(d.)

Oops, I kind of answered this already. Judging by the Anova for Nonparametric Effects in the first GAM, Expend shows the clearest evidence of a nonlinear relationship with Outstate; Room.Board and Grad.Rate show weaker (borderline) evidence, while perc.alumni and Terminal do not.