library(MASS)

attach(Boston)
fit=lm(nox~poly(dis ,3) ,data=Boston)
(summary (fit))
## 
## Call:
## lm(formula = nox ~ poly(dis, 3), data = Boston)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.121130 -0.040619 -0.009738  0.023385  0.194904 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    0.554695   0.002759 201.021  < 2e-16 ***
## poly(dis, 3)1 -2.003096   0.062071 -32.271  < 2e-16 ***
## poly(dis, 3)2  0.856330   0.062071  13.796  < 2e-16 ***
## poly(dis, 3)3 -0.318049   0.062071  -5.124 4.27e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.06207 on 502 degrees of freedom
## Multiple R-squared:  0.7148, Adjusted R-squared:  0.7131 
## F-statistic: 419.3 on 3 and 502 DF,  p-value: < 2.2e-16
plot(dis, nox)

dislims =range(dis)
dis.grid=seq(from=dislims [1],to=dislims [2])
preds=predict (fit ,newdata =list(dis=dis.grid),se=TRUE)
se.bands=cbind(preds$fit +2*preds$se.fit ,preds$fit -2*preds$se.fit)

# plot
par(mfrow=c(1,1),mar=c(4.5,4.5,1,1) ,oma=c(0,0,4,0))
plot(dis ,nox ,xlim=dislims ,cex =.5,col=" darkgrey ")
title(" Degree-3 Polynomial ",outer=T)
lines(dis.grid ,preds$fit ,lwd=2,col="blue")
matlines (dis.grid ,se.bands ,lwd=1, col=" blue",lty=3)

1b)

fit2=lm(nox~poly(dis ,4) ,data=Boston)


plot(dis, nox)

dislims =range(dis)
dis.grid=seq(from=dislims [1],to=dislims [2])
preds=predict (fit ,newdata =list(dis=dis.grid),se=TRUE)
se.bands=cbind(preds$fit +2*preds$se.fit ,preds$fit -2*preds$se.fit)

# plot
par(mfrow=c(1,1),mar=c(4.5,4.5,1,1) ,oma=c(0,0,4,0))
plot(dis ,nox ,xlim=dislims ,cex =.5,col=" darkgrey ")
title(" Degree-4 Polynomial ",outer=T)
lines(dis.grid ,preds$fit ,lwd=2,col="blue")
matlines (dis.grid ,se.bands ,lwd=1, col=" blue",lty=3)

fit3=lm(nox~poly(dis ,5) ,data=Boston)


plot(dis, nox)

dislims =range(dis)
dis.grid=seq(from=dislims [1],to=dislims [2])
preds=predict (fit ,newdata =list(dis=dis.grid),se=TRUE)
se.bands=cbind(preds$fit +2*preds$se.fit ,preds$fit -2*preds$se.fit)

# plot
par(mfrow=c(1,1),mar=c(4.5,4.5,1,1) ,oma=c(0,0,4,0))
plot(dis ,nox ,xlim=dislims ,cex =.5,col=" darkgrey ")
title(" Degree-5 Polynomial ",outer=T)
lines(dis.grid ,preds$fit ,lwd=2,col="blue")
matlines (dis.grid ,se.bands ,lwd=1, col=" blue",lty=3)

fit4=lm(nox~poly(dis ,8) ,data=Boston)


plot(dis, nox)

dislims =range(dis)
dis.grid=seq(from=dislims [1],to=dislims [2])
preds=predict (fit ,newdata =list(dis=dis.grid),se=TRUE)
se.bands=cbind(preds$fit +2*preds$se.fit ,preds$fit -2*preds$se.fit)

# plot
par(mfrow=c(1,1),mar=c(4.5,4.5,1,1) ,oma=c(0,0,4,0))
plot(dis ,nox ,xlim=dislims ,cex =.5,col=" darkgrey ")
title(" Degree-8 Polynomial ",outer=T)
lines(dis.grid ,preds$fit ,lwd=2,col="blue")
matlines (dis.grid ,se.bands ,lwd=1, col=" blue",lty=3)

fit5=lm(nox~poly(dis ,10) ,data=Boston)


plot(dis, nox)

dislims =range(dis)
dis.grid=seq(from=dislims [1],to=dislims [2])
preds=predict (fit ,newdata =list(dis=dis.grid),se=TRUE)
se.bands=cbind(preds$fit +2*preds$se.fit ,preds$fit -2*preds$se.fit)

# plot
par(mfrow=c(1,1),mar=c(4.5,4.5,1,1) ,oma=c(0,0,4,0))
plot(dis ,nox ,xlim=dislims ,cex =.5,col=" darkgrey ")
title(" Degree-10 Polynomial ",outer=T)
lines(dis.grid ,preds$fit ,lwd=2,col="blue")
matlines (dis.grid ,se.bands ,lwd=1, col=" blue",lty=3)

1c)

train<-sample(nrow(Boston), floor(nrow(Boston)/2))



degreePoly<-10
polyMSE<-matrix(nrow=degreePoly, ncol=2)
colnames(polyMSE)<-c("degree", "MSE")
for(i in 1:degreePoly){ 
  polyMSE[i,1]<-i
  this.fit<-lm(nox~poly(dis,i), data=Boston, subset=train)
  polyMSE[i,2]<-mean((nox-predict(this.fit, Boston))[-train]^2)
  }

polyDF<-as.data.frame(polyMSE)
head(polyDF)
##   degree         MSE
## 1      1 0.005989490
## 2      2 0.004434211
## 3      3 0.004226811
## 4      4 0.004227210
## 5      5 0.004293274
## 6      6 0.004257148
#install.packages("tidyverse")
library(tidyverse)
## -- Attaching packages -------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1     v purrr   0.3.2
## v tibble  2.1.3     v dplyr   0.8.3
## v tidyr   0.8.3     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0
## -- Conflicts ----------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## x dplyr::select() masks MASS::select()
ggplot(data=polyDF, aes(x=degree, y=MSE))+
  geom_point()+
  geom_line()

The MsE gets lowest at 7 and stays there for up to 10 so 7 would be the best polynomial degree to use.

1d)

library(splines)
par(mfrow=c(1,1))
fit=lm(nox~bs(dis ,knots=c(5,8) ),data=Boston)
pred=predict (fit ,newdata =list(dis=dis.grid),se=T)
plot(dis ,nox ,col="gray")
lines(dis.grid ,pred$fit ,lwd=2)
lines(dis.grid ,pred$fit+2*pred$se ,lty="dashed")
lines(dis.grid ,pred$fit-2*pred$se ,lty="dashed")

attr(bs(dis ,df=4) ,"knots")
##     50% 
## 3.20745

I chose the knots where the curve of the polts seems to have an area where it changes the most. So at 5 the curve is like a corner, and at 8 the curve has leveled out the most.

2a)

college=read.csv("College.csv", header=TRUE)
attach(college)

names(college)
##  [1] "X"           "Private"     "Apps"        "Accept"      "Enroll"     
##  [6] "Top10perc"   "Top25perc"   "F.Undergrad" "P.Undergrad" "Outstate"   
## [11] "Room.Board"  "Books"       "Personal"    "PhD"         "Terminal"   
## [16] "S.F.Ratio"   "perc.alumni" "Expend"      "Grad.Rate"
train=sample(777, 385)

mod1=lm(Outstate~Private, subset=train)
anova(mod1)
## Analysis of Variance Table
## 
## Response: Outstate
##            Df     Sum Sq    Mean Sq F value    Pr(>F)    
## Private     1 1998297729 1998297729  196.01 < 2.2e-16 ***
## Residuals 383 3904648234   10194904                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod2=lm(Outstate~Private+Top10perc, subset=train)
anova(mod2)
## Analysis of Variance Table
## 
## Response: Outstate
##            Df     Sum Sq    Mean Sq F value    Pr(>F)    
## Private     1 1998297729 1998297729  305.08 < 2.2e-16 ***
## Top10perc   1 1402479270 1402479270  214.11 < 2.2e-16 ***
## Residuals 382 2502168964    6550181                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod3=lm(Outstate~Private+Top10perc+Grad.Rate, subset = train)
anova(mod3)
## Analysis of Variance Table
## 
## Response: Outstate
##            Df     Sum Sq    Mean Sq F value    Pr(>F)    
## Private     1 1998297729 1998297729 326.125 < 2.2e-16 ***
## Top10perc   1 1402479270 1402479270 228.887 < 2.2e-16 ***
## Grad.Rate   1  167633252  167633252  27.358 2.796e-07 ***
## Residuals 381 2334535712    6127390                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod4=lm(Outstate~Private+Top10perc+Grad.Rate+Room.Board, subset = train)
anova(mod4)
## Analysis of Variance Table
## 
## Response: Outstate
##             Df     Sum Sq    Mean Sq F value    Pr(>F)    
## Private      1 1998297729 1998297729 395.022 < 2.2e-16 ***
## Top10perc    1 1402479270 1402479270 277.241 < 2.2e-16 ***
## Grad.Rate    1  167633252  167633252  33.138 1.772e-08 ***
## Room.Board   1  412230475  412230475  81.489 < 2.2e-16 ***
## Residuals  380 1922305237    5058698                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod5=lm(Outstate~Private+Top10perc+Grad.Rate+Room.Board+Accept, subset=train)
anova(mod5)
## Analysis of Variance Table
## 
## Response: Outstate
##             Df     Sum Sq    Mean Sq  F value    Pr(>F)    
## Private      1 1998297729 1998297729 401.9568 < 2.2e-16 ***
## Top10perc    1 1402479270 1402479270 282.1081 < 2.2e-16 ***
## Grad.Rate    1  167633252  167633252  33.7194 1.348e-08 ***
## Room.Board   1  412230475  412230475  82.9200 < 2.2e-16 ***
## Accept       1   38135301   38135301   7.6709  0.005888 ** 
## Residuals  379 1884169936    4971425                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod6=lm(Outstate~Private+Top10perc+Grad.Rate+Room.Board+Top25perc, subset = train)
anova(mod6)
## Analysis of Variance Table
## 
## Response: Outstate
##             Df     Sum Sq    Mean Sq  F value    Pr(>F)    
## Private      1 1998297729 1998297729 394.2423 < 2.2e-16 ***
## Top10perc    1 1402479270 1402479270 276.6938 < 2.2e-16 ***
## Grad.Rate    1  167633252  167633252  33.0722  1.83e-08 ***
## Room.Board   1  412230475  412230475  81.3286 < 2.2e-16 ***
## Top25perc    1    1266324    1266324   0.2498    0.6175    
## Residuals  379 1921038913    5068704                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod7=lm(Outstate~Private+Top10perc+Grad.Rate+Room.Board+Expend+perc.alumni, subset = train)
anova(mod7)
## Analysis of Variance Table
## 
## Response: Outstate
##              Df     Sum Sq    Mean Sq F value    Pr(>F)    
## Private       1 1998297729 1998297729 477.795 < 2.2e-16 ***
## Top10perc     1 1402479270 1402479270 335.334 < 2.2e-16 ***
## Grad.Rate     1  167633252  167633252  40.081 6.911e-10 ***
## Room.Board    1  412230475  412230475  98.565 < 2.2e-16 ***
## Expend        1  292968439  292968439  70.049 1.136e-15 ***
## perc.alumni   1   48415530   48415530  11.576 0.0007394 ***
## Residuals   378 1580921268    4182331                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod8=lm(Outstate~Private+Top10perc+Grad.Rate+Room.Board+Expend+perc.alumni+S.F.Ratio, subset = train)
anova(mod8)
## Analysis of Variance Table
## 
## Response: Outstate
##              Df     Sum Sq    Mean Sq  F value    Pr(>F)    
## Private       1 1998297729 1998297729 476.5329 < 2.2e-16 ***
## Top10perc     1 1402479270 1402479270 334.4484 < 2.2e-16 ***
## Grad.Rate     1  167633252  167633252  39.9754 7.277e-10 ***
## Room.Board    1  412230475  412230475  98.3044 < 2.2e-16 ***
## Expend        1  292968439  292968439  69.8640 1.238e-15 ***
## perc.alumni   1   48415530   48415530  11.5456 0.0007515 ***
## S.F.Ratio     1       5616       5616   0.0013 0.9708258    
## Residuals   377 1580915652    4193410                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod9=lm(Outstate~Private+Top10perc+Grad.Rate+Room.Board+Expend+perc.alumni+Accept, subset = train)
anova(mod9)
## Analysis of Variance Table
## 
## Response: Outstate
##              Df     Sum Sq    Mean Sq  F value    Pr(>F)    
## Private       1 1998297729 1998297729 489.0743 < 2.2e-16 ***
## Top10perc     1 1402479270 1402479270 343.2505 < 2.2e-16 ***
## Grad.Rate     1  167633252  167633252  41.0275 4.475e-10 ***
## Room.Board    1  412230475  412230475 100.8915 < 2.2e-16 ***
## Expend        1  292968439  292968439  71.7027 5.649e-16 ***
## perc.alumni   1   48415530   48415530  11.8495 0.0006415 ***
## Accept        1   40545565   40545565   9.9233 0.0017622 ** 
## Residuals   377 1540375703    4085877                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod10=lm(Outstate~Private+Top10perc+Grad.Rate+Room.Board+Expend+perc.alumni+Terminal, subset = train)
anova(mod10)
## Analysis of Variance Table
## 
## Response: Outstate
##              Df     Sum Sq    Mean Sq F value    Pr(>F)    
## Private       1 1998297729 1998297729 494.853 < 2.2e-16 ***
## Top10perc     1 1402479270 1402479270 347.306 < 2.2e-16 ***
## Grad.Rate     1  167633252  167633252  41.512 3.579e-10 ***
## Room.Board    1  412230475  412230475 102.084 < 2.2e-16 ***
## Expend        1  292968439  292968439  72.550 3.940e-16 ***
## perc.alumni   1   48415530   48415530  11.989 0.0005964 ***
## Terminal      1   58534540   58534540  14.495 0.0001639 ***
## Residuals   377 1522386728    4038161                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

2b)

#install.packages("gam")
library(gam)
## Loading required package: foreach
## 
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
## 
##     accumulate, when
## Loaded gam 1.16.1
gam1=lm(Outstate~Private+ns(Top10perc, 4)+ns(Grad.Rate, 4)+ns(Room.Board, 4)+ns(Expend, 4)+ns(perc.alumni, 4)+ns(Terminal,4) ,data=college)
summary(gam1)
## 
## Call:
## lm(formula = Outstate ~ Private + ns(Top10perc, 4) + ns(Grad.Rate, 
##     4) + ns(Room.Board, 4) + ns(Expend, 4) + ns(perc.alumni, 
##     4) + ns(Terminal, 4), data = college)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7398.6 -1055.4   -14.1  1274.5  8079.1 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          1472.61    1293.50   1.138  0.25529    
## PrivateYes           2522.27     205.05  12.300  < 2e-16 ***
## ns(Top10perc, 4)1      81.88     442.18   0.185  0.85314    
## ns(Top10perc, 4)2     685.17     487.36   1.406  0.16017    
## ns(Top10perc, 4)3     607.17    1045.31   0.581  0.56151    
## ns(Top10perc, 4)4    -655.11     742.18  -0.883  0.37769    
## ns(Grad.Rate, 4)1    1316.93     697.33   1.889  0.05934 .  
## ns(Grad.Rate, 4)2    3097.50     598.13   5.179 2.87e-07 ***
## ns(Grad.Rate, 4)3    3081.24    1592.23   1.935  0.05334 .  
## ns(Grad.Rate, 4)4    -114.20    1038.17  -0.110  0.91244    
## ns(Room.Board, 4)1   1968.03     625.17   3.148  0.00171 ** 
## ns(Room.Board, 4)2   2725.89     558.85   4.878 1.31e-06 ***
## ns(Room.Board, 4)3   3793.89    1461.05   2.597  0.00960 ** 
## ns(Room.Board, 4)4   3448.83     852.79   4.044 5.79e-05 ***
## ns(Expend, 4)1       1252.57     501.27   2.499  0.01267 *  
## ns(Expend, 4)2      10827.38     861.24  12.572  < 2e-16 ***
## ns(Expend, 4)3       7447.65    1291.92   5.765 1.19e-08 ***
## ns(Expend, 4)4       3980.63    1325.50   3.003  0.00276 ** 
## ns(perc.alumni, 4)1  1519.43     463.28   3.280  0.00109 ** 
## ns(perc.alumni, 4)2  1277.11     467.15   2.734  0.00641 ** 
## ns(perc.alumni, 4)3  3214.65    1101.31   2.919  0.00362 ** 
## ns(perc.alumni, 4)4  2298.92     709.02   3.242  0.00124 ** 
## ns(Terminal, 4)1       77.88     693.11   0.112  0.91056    
## ns(Terminal, 4)2     1132.23     592.26   1.912  0.05629 .  
## ns(Terminal, 4)3       76.87    1620.44   0.047  0.96218    
## ns(Terminal, 4)4      397.73     446.62   0.891  0.37347    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1872 on 751 degrees of freedom
## Multiple R-squared:  0.7905, Adjusted R-squared:  0.7836 
## F-statistic: 113.4 on 25 and 751 DF,  p-value: < 2.2e-16
plot.Gam(gam1, se=TRUE ,col ="red")

2c)

preds = predict(gam1, newdata = college[-train,])
mean((Outstate-predict(gam1, college))[-train]^2)
## [1] 3489975

This gives us the MSE and theres not much we can do with only single value and set.

2d)There seems to be a fairly signifcant non-linear realtionship with expend and terminal.