library(MASS)
attach(Boston)
fit=lm(nox~poly(dis ,3) ,data=Boston)
(summary (fit))
##
## Call:
## lm(formula = nox ~ poly(dis, 3), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.121130 -0.040619 -0.009738 0.023385 0.194904
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.554695 0.002759 201.021 < 2e-16 ***
## poly(dis, 3)1 -2.003096 0.062071 -32.271 < 2e-16 ***
## poly(dis, 3)2 0.856330 0.062071 13.796 < 2e-16 ***
## poly(dis, 3)3 -0.318049 0.062071 -5.124 4.27e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.06207 on 502 degrees of freedom
## Multiple R-squared: 0.7148, Adjusted R-squared: 0.7131
## F-statistic: 419.3 on 3 and 502 DF, p-value: < 2.2e-16
plot(dis, nox)
dislims =range(dis)
dis.grid=seq(from=dislims [1],to=dislims [2])
preds=predict (fit ,newdata =list(dis=dis.grid),se=TRUE)
se.bands=cbind(preds$fit +2*preds$se.fit ,preds$fit -2*preds$se.fit)
# plot
par(mfrow=c(1,1),mar=c(4.5,4.5,1,1) ,oma=c(0,0,4,0))
plot(dis ,nox ,xlim=dislims ,cex =.5,col=" darkgrey ")
title(" Degree-3 Polynomial ",outer=T)
lines(dis.grid ,preds$fit ,lwd=2,col="blue")
matlines (dis.grid ,se.bands ,lwd=1, col=" blue",lty=3)
1b)
fit2=lm(nox~poly(dis ,4) ,data=Boston)
plot(dis, nox)
dislims =range(dis)
dis.grid=seq(from=dislims [1],to=dislims [2])
preds=predict (fit ,newdata =list(dis=dis.grid),se=TRUE)
se.bands=cbind(preds$fit +2*preds$se.fit ,preds$fit -2*preds$se.fit)
# plot
par(mfrow=c(1,1),mar=c(4.5,4.5,1,1) ,oma=c(0,0,4,0))
plot(dis ,nox ,xlim=dislims ,cex =.5,col=" darkgrey ")
title(" Degree-4 Polynomial ",outer=T)
lines(dis.grid ,preds$fit ,lwd=2,col="blue")
matlines (dis.grid ,se.bands ,lwd=1, col=" blue",lty=3)
fit3=lm(nox~poly(dis ,5) ,data=Boston)
plot(dis, nox)
dislims =range(dis)
dis.grid=seq(from=dislims [1],to=dislims [2])
preds=predict (fit ,newdata =list(dis=dis.grid),se=TRUE)
se.bands=cbind(preds$fit +2*preds$se.fit ,preds$fit -2*preds$se.fit)
# plot
par(mfrow=c(1,1),mar=c(4.5,4.5,1,1) ,oma=c(0,0,4,0))
plot(dis ,nox ,xlim=dislims ,cex =.5,col=" darkgrey ")
title(" Degree-5 Polynomial ",outer=T)
lines(dis.grid ,preds$fit ,lwd=2,col="blue")
matlines (dis.grid ,se.bands ,lwd=1, col=" blue",lty=3)
fit4=lm(nox~poly(dis ,8) ,data=Boston)
plot(dis, nox)
dislims =range(dis)
dis.grid=seq(from=dislims [1],to=dislims [2])
preds=predict (fit ,newdata =list(dis=dis.grid),se=TRUE)
se.bands=cbind(preds$fit +2*preds$se.fit ,preds$fit -2*preds$se.fit)
# plot
par(mfrow=c(1,1),mar=c(4.5,4.5,1,1) ,oma=c(0,0,4,0))
plot(dis ,nox ,xlim=dislims ,cex =.5,col=" darkgrey ")
title(" Degree-8 Polynomial ",outer=T)
lines(dis.grid ,preds$fit ,lwd=2,col="blue")
matlines (dis.grid ,se.bands ,lwd=1, col=" blue",lty=3)
fit5=lm(nox~poly(dis ,10) ,data=Boston)
plot(dis, nox)
dislims =range(dis)
dis.grid=seq(from=dislims [1],to=dislims [2])
preds=predict (fit ,newdata =list(dis=dis.grid),se=TRUE)
se.bands=cbind(preds$fit +2*preds$se.fit ,preds$fit -2*preds$se.fit)
# plot
par(mfrow=c(1,1),mar=c(4.5,4.5,1,1) ,oma=c(0,0,4,0))
plot(dis ,nox ,xlim=dislims ,cex =.5,col=" darkgrey ")
title(" Degree-10 Polynomial ",outer=T)
lines(dis.grid ,preds$fit ,lwd=2,col="blue")
matlines (dis.grid ,se.bands ,lwd=1, col=" blue",lty=3)
1c)
train<-sample(nrow(Boston), floor(nrow(Boston)/2))
degreePoly<-10
polyMSE<-matrix(nrow=degreePoly, ncol=2)
colnames(polyMSE)<-c("degree", "MSE")
for(i in 1:degreePoly){
polyMSE[i,1]<-i
this.fit<-lm(nox~poly(dis,i), data=Boston, subset=train)
polyMSE[i,2]<-mean((nox-predict(this.fit, Boston))[-train]^2)
}
polyDF<-as.data.frame(polyMSE)
head(polyDF)
## degree MSE
## 1 1 0.005989490
## 2 2 0.004434211
## 3 3 0.004226811
## 4 4 0.004227210
## 5 5 0.004293274
## 6 6 0.004257148
#install.packages("tidyverse")
library(tidyverse)
## -- Attaching packages -------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1 v purrr 0.3.2
## v tibble 2.1.3 v dplyr 0.8.3
## v tidyr 0.8.3 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts ----------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x dplyr::select() masks MASS::select()
ggplot(data=polyDF, aes(x=degree, y=MSE))+
geom_point()+
geom_line()
The MsE gets lowest at 7 and stays there for up to 10 so 7 would be the best polynomial degree to use.
1d)
library(splines)
par(mfrow=c(1,1))
fit=lm(nox~bs(dis ,knots=c(5,8) ),data=Boston)
pred=predict (fit ,newdata =list(dis=dis.grid),se=T)
plot(dis ,nox ,col="gray")
lines(dis.grid ,pred$fit ,lwd=2)
lines(dis.grid ,pred$fit+2*pred$se ,lty="dashed")
lines(dis.grid ,pred$fit-2*pred$se ,lty="dashed")
attr(bs(dis ,df=4) ,"knots")
## 50%
## 3.20745
I chose the knots where the curve of the polts seems to have an area where it changes the most. So at 5 the curve is like a corner, and at 8 the curve has leveled out the most.
2a)
college=read.csv("College.csv", header=TRUE)
attach(college)
names(college)
## [1] "X" "Private" "Apps" "Accept" "Enroll"
## [6] "Top10perc" "Top25perc" "F.Undergrad" "P.Undergrad" "Outstate"
## [11] "Room.Board" "Books" "Personal" "PhD" "Terminal"
## [16] "S.F.Ratio" "perc.alumni" "Expend" "Grad.Rate"
train=sample(777, 385)
mod1=lm(Outstate~Private, subset=train)
anova(mod1)
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## Private 1 1998297729 1998297729 196.01 < 2.2e-16 ***
## Residuals 383 3904648234 10194904
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod2=lm(Outstate~Private+Top10perc, subset=train)
anova(mod2)
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## Private 1 1998297729 1998297729 305.08 < 2.2e-16 ***
## Top10perc 1 1402479270 1402479270 214.11 < 2.2e-16 ***
## Residuals 382 2502168964 6550181
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod3=lm(Outstate~Private+Top10perc+Grad.Rate, subset = train)
anova(mod3)
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## Private 1 1998297729 1998297729 326.125 < 2.2e-16 ***
## Top10perc 1 1402479270 1402479270 228.887 < 2.2e-16 ***
## Grad.Rate 1 167633252 167633252 27.358 2.796e-07 ***
## Residuals 381 2334535712 6127390
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod4=lm(Outstate~Private+Top10perc+Grad.Rate+Room.Board, subset = train)
anova(mod4)
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## Private 1 1998297729 1998297729 395.022 < 2.2e-16 ***
## Top10perc 1 1402479270 1402479270 277.241 < 2.2e-16 ***
## Grad.Rate 1 167633252 167633252 33.138 1.772e-08 ***
## Room.Board 1 412230475 412230475 81.489 < 2.2e-16 ***
## Residuals 380 1922305237 5058698
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod5=lm(Outstate~Private+Top10perc+Grad.Rate+Room.Board+Accept, subset=train)
anova(mod5)
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## Private 1 1998297729 1998297729 401.9568 < 2.2e-16 ***
## Top10perc 1 1402479270 1402479270 282.1081 < 2.2e-16 ***
## Grad.Rate 1 167633252 167633252 33.7194 1.348e-08 ***
## Room.Board 1 412230475 412230475 82.9200 < 2.2e-16 ***
## Accept 1 38135301 38135301 7.6709 0.005888 **
## Residuals 379 1884169936 4971425
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod6=lm(Outstate~Private+Top10perc+Grad.Rate+Room.Board+Top25perc, subset = train)
anova(mod6)
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## Private 1 1998297729 1998297729 394.2423 < 2.2e-16 ***
## Top10perc 1 1402479270 1402479270 276.6938 < 2.2e-16 ***
## Grad.Rate 1 167633252 167633252 33.0722 1.83e-08 ***
## Room.Board 1 412230475 412230475 81.3286 < 2.2e-16 ***
## Top25perc 1 1266324 1266324 0.2498 0.6175
## Residuals 379 1921038913 5068704
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod7=lm(Outstate~Private+Top10perc+Grad.Rate+Room.Board+Expend+perc.alumni, subset = train)
anova(mod7)
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## Private 1 1998297729 1998297729 477.795 < 2.2e-16 ***
## Top10perc 1 1402479270 1402479270 335.334 < 2.2e-16 ***
## Grad.Rate 1 167633252 167633252 40.081 6.911e-10 ***
## Room.Board 1 412230475 412230475 98.565 < 2.2e-16 ***
## Expend 1 292968439 292968439 70.049 1.136e-15 ***
## perc.alumni 1 48415530 48415530 11.576 0.0007394 ***
## Residuals 378 1580921268 4182331
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod8=lm(Outstate~Private+Top10perc+Grad.Rate+Room.Board+Expend+perc.alumni+S.F.Ratio, subset = train)
anova(mod8)
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## Private 1 1998297729 1998297729 476.5329 < 2.2e-16 ***
## Top10perc 1 1402479270 1402479270 334.4484 < 2.2e-16 ***
## Grad.Rate 1 167633252 167633252 39.9754 7.277e-10 ***
## Room.Board 1 412230475 412230475 98.3044 < 2.2e-16 ***
## Expend 1 292968439 292968439 69.8640 1.238e-15 ***
## perc.alumni 1 48415530 48415530 11.5456 0.0007515 ***
## S.F.Ratio 1 5616 5616 0.0013 0.9708258
## Residuals 377 1580915652 4193410
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod9=lm(Outstate~Private+Top10perc+Grad.Rate+Room.Board+Expend+perc.alumni+Accept, subset = train)
anova(mod9)
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## Private 1 1998297729 1998297729 489.0743 < 2.2e-16 ***
## Top10perc 1 1402479270 1402479270 343.2505 < 2.2e-16 ***
## Grad.Rate 1 167633252 167633252 41.0275 4.475e-10 ***
## Room.Board 1 412230475 412230475 100.8915 < 2.2e-16 ***
## Expend 1 292968439 292968439 71.7027 5.649e-16 ***
## perc.alumni 1 48415530 48415530 11.8495 0.0006415 ***
## Accept 1 40545565 40545565 9.9233 0.0017622 **
## Residuals 377 1540375703 4085877
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod10=lm(Outstate~Private+Top10perc+Grad.Rate+Room.Board+Expend+perc.alumni+Terminal, subset = train)
anova(mod10)
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## Private 1 1998297729 1998297729 494.853 < 2.2e-16 ***
## Top10perc 1 1402479270 1402479270 347.306 < 2.2e-16 ***
## Grad.Rate 1 167633252 167633252 41.512 3.579e-10 ***
## Room.Board 1 412230475 412230475 102.084 < 2.2e-16 ***
## Expend 1 292968439 292968439 72.550 3.940e-16 ***
## perc.alumni 1 48415530 48415530 11.989 0.0005964 ***
## Terminal 1 58534540 58534540 14.495 0.0001639 ***
## Residuals 377 1522386728 4038161
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
2b)
#install.packages("gam")
library(gam)
## Loading required package: foreach
##
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
##
## accumulate, when
## Loaded gam 1.16.1
gam1=lm(Outstate~Private+ns(Top10perc, 4)+ns(Grad.Rate, 4)+ns(Room.Board, 4)+ns(Expend, 4)+ns(perc.alumni, 4)+ns(Terminal,4) ,data=college)
summary(gam1)
##
## Call:
## lm(formula = Outstate ~ Private + ns(Top10perc, 4) + ns(Grad.Rate,
## 4) + ns(Room.Board, 4) + ns(Expend, 4) + ns(perc.alumni,
## 4) + ns(Terminal, 4), data = college)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7398.6 -1055.4 -14.1 1274.5 8079.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1472.61 1293.50 1.138 0.25529
## PrivateYes 2522.27 205.05 12.300 < 2e-16 ***
## ns(Top10perc, 4)1 81.88 442.18 0.185 0.85314
## ns(Top10perc, 4)2 685.17 487.36 1.406 0.16017
## ns(Top10perc, 4)3 607.17 1045.31 0.581 0.56151
## ns(Top10perc, 4)4 -655.11 742.18 -0.883 0.37769
## ns(Grad.Rate, 4)1 1316.93 697.33 1.889 0.05934 .
## ns(Grad.Rate, 4)2 3097.50 598.13 5.179 2.87e-07 ***
## ns(Grad.Rate, 4)3 3081.24 1592.23 1.935 0.05334 .
## ns(Grad.Rate, 4)4 -114.20 1038.17 -0.110 0.91244
## ns(Room.Board, 4)1 1968.03 625.17 3.148 0.00171 **
## ns(Room.Board, 4)2 2725.89 558.85 4.878 1.31e-06 ***
## ns(Room.Board, 4)3 3793.89 1461.05 2.597 0.00960 **
## ns(Room.Board, 4)4 3448.83 852.79 4.044 5.79e-05 ***
## ns(Expend, 4)1 1252.57 501.27 2.499 0.01267 *
## ns(Expend, 4)2 10827.38 861.24 12.572 < 2e-16 ***
## ns(Expend, 4)3 7447.65 1291.92 5.765 1.19e-08 ***
## ns(Expend, 4)4 3980.63 1325.50 3.003 0.00276 **
## ns(perc.alumni, 4)1 1519.43 463.28 3.280 0.00109 **
## ns(perc.alumni, 4)2 1277.11 467.15 2.734 0.00641 **
## ns(perc.alumni, 4)3 3214.65 1101.31 2.919 0.00362 **
## ns(perc.alumni, 4)4 2298.92 709.02 3.242 0.00124 **
## ns(Terminal, 4)1 77.88 693.11 0.112 0.91056
## ns(Terminal, 4)2 1132.23 592.26 1.912 0.05629 .
## ns(Terminal, 4)3 76.87 1620.44 0.047 0.96218
## ns(Terminal, 4)4 397.73 446.62 0.891 0.37347
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1872 on 751 degrees of freedom
## Multiple R-squared: 0.7905, Adjusted R-squared: 0.7836
## F-statistic: 113.4 on 25 and 751 DF, p-value: < 2.2e-16
plot.Gam(gam1, se=TRUE ,col ="red")
2c)
preds = predict(gam1, newdata = college[-train,])
mean((Outstate-predict(gam1, college))[-train]^2)
## [1] 3489975
This gives us the MSE and theres not much we can do with only single value and set.
2d)There seems to be a fairly signifcant non-linear realtionship with expend and terminal.