This question uses the variables dis (the weighted mean of distances to five Boston employment centers) and nox (nitrogen oxides concentration in parts per 10 million) from the Boston data. We will treat dis as the predictor and nox as the response.
# install.packages("tidyverse")
library(tidyverse)
## -- Attaching packages -------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1 v purrr 0.3.2
## v tibble 2.1.3 v dplyr 0.8.3
## v tidyr 1.0.0 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts ----------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
attach(Boston)
names(Boston)
## [1] "crim" "zn" "indus" "chas" "nox" "rm" "age"
## [8] "dis" "rad" "tax" "ptratio" "black" "lstat" "medv"
fitmat=lm(nox~poly(dis, 4),data=Boston)
fitmat
##
## Call:
## lm(formula = nox ~ poly(dis, 4), data = Boston)
##
## Coefficients:
## (Intercept) poly(dis, 4)1 poly(dis, 4)2 poly(dis, 4)3 poly(dis, 4)4
## 0.55470 -2.00310 0.85633 -0.31805 0.03355
#Regression output:
# (Intercept) poly(dis, 4)1 poly(dis, 4)2 poly(dis, 4)3 poly(dis, 4)4
# 0.55470 -2.00310 0.85633 -0.31805 0.03355
dislims =range(dis)
dis.grid=seq(from=dislims[1],to=dislims[2])
preds=predict(fitmat,newdata =list(dis=dis.grid),se=TRUE)
se.bands=cbind(preds$fit+2*preds$se.fit,preds$fit-2*preds$se.fit)
par(mfrow=c(1,1))
plot(nox~dis, xlim=dislims,cex=.5,col="darkgrey")
title("Degree-4 Polynomial",outer=T)
lines(dis.grid,preds$fit,lwd=2,col="blue")
matlines(dis.grid,se.bands,lwd=1,col=" blue",lty=3)
fitmat2=lm(nox~poly(dis, 10),data=Boston)
coef(fitmat2)
## (Intercept) poly(dis, 10)1 poly(dis, 10)2 poly(dis, 10)3
## 0.55469506 -2.00309590 0.85632995 -0.31804899
## poly(dis, 10)4 poly(dis, 10)5 poly(dis, 10)6 poly(dis, 10)7
## 0.03354668 0.13300890 -0.19243872 0.16962808
## poly(dis, 10)8 poly(dis, 10)9 poly(dis, 10)10
## -0.11770270 0.04794668 -0.03405408
dislims =range(dis)
dis.grid=seq(from=dislims[1],to=dislims[2])
preds=predict(fitmat2,newdata =list(dis=dis.grid),se=TRUE)
se.bands=cbind(preds$fit+2*preds$se.fit,preds$fit-2*preds$se.fit)
par(mfrow=c(1,1))
plot(nox~dis, xlim=dislims,cex=.5,col="darkgrey")
title("Degree-10 Polynomial",outer=T)
lines(dis.grid,preds$fit,lwd=2,col="blue")
matlines(dis.grid,se.bands,lwd=1,col=" blue",lty=3)
#Residual sum of squares
fit1=lm(nox~dis,data=Boston)
fit2=lm(nox~poly(dis,2),data=Boston)
fit3=lm(nox~poly(dis,3),data=Boston)
fit4=lm(nox~poly(dis,4),data=Boston)
fit5=lm(nox~poly(dis,5),data=Boston)
fit6=lm(nox~poly(dis,6),data=Boston)
fit7=lm(nox~poly(dis,7),data=Boston)
fit8=lm(nox~poly(dis,8),data=Boston)
fit9=lm(nox~poly(dis,9),data=Boston)
fit10=lm(nox~poly(dis,10),data=Boston)
anova(fit1,fit2,fit3,fit4,fit5,fit6,fit7,fit8,fit9,fit10)
## Analysis of Variance Table
##
## Model 1: nox ~ dis
## Model 2: nox ~ poly(dis, 2)
## Model 3: nox ~ poly(dis, 3)
## Model 4: nox ~ poly(dis, 4)
## Model 5: nox ~ poly(dis, 5)
## Model 6: nox ~ poly(dis, 6)
## Model 7: nox ~ poly(dis, 7)
## Model 8: nox ~ poly(dis, 8)
## Model 9: nox ~ poly(dis, 9)
## Model 10: nox ~ poly(dis, 10)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 504 2.7686
## 2 503 2.0353 1 0.73330 198.1169 < 2.2e-16 ***
## 3 502 1.9341 1 0.10116 27.3292 2.535e-07 ***
## 4 501 1.9330 1 0.00113 0.3040 0.581606
## 5 500 1.9153 1 0.01769 4.7797 0.029265 *
## 6 499 1.8783 1 0.03703 10.0052 0.001657 **
## 7 498 1.8495 1 0.02877 7.7738 0.005505 **
## 8 497 1.8356 1 0.01385 3.7429 0.053601 .
## 9 496 1.8333 1 0.00230 0.6211 0.431019
## 10 495 1.8322 1 0.00116 0.3133 0.575908
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# SS(Res)):
#1 2.7686
#2 2.0353
#3 1.9341
#4 1.9330
#5 1.9153
#6 1.8783
#7 1.8495
#8 1.8356
#9 1.8333
#10 1.8322 (lowest SS(Res))
set.seed(1)
train<-sample(506, 253)
lm.fit<-lm(nox~dis, data=Boston, subset=train)
attach(Boston)
## The following objects are masked from Boston (pos = 3):
##
## age, black, chas, crim, dis, indus, lstat, medv, nox, ptratio,
## rad, rm, tax, zn
mean((nox-predict(lm.fit, Boston))[-train]^2)
## [1] 0.005838344
degreePoly<-10
polyMSE<-matrix(nrow=degreePoly, ncol=2)
colnames(polyMSE)<-c("degree", "MSE")
for(i in 1:degreePoly){
polyMSE[i,1]<-i
this.fit<-lm(nox~poly(dis,i), data=Boston, subset=train)
polyMSE[i,2]<-mean((nox-predict(this.fit, Boston))[-train]^2)
}
polyDF<-as.data.frame(polyMSE)
ggplot(data=polyDF, aes(x=-degree, y=MSE))+
geom_point()+
geom_line()
#My ggplot was showing up backwards, so I ended up choosing to input x as -degree to view the graph as would be expected. Based on the graph depiction, I would say the model is best at 3 nodes of complexity.
# install.packages("splines")
library(splines)
attr(bs(dis,df=4) ,"knots")#Used this function to determine knots.
## 50%
## 3.20745
fitspl=lm(nox~bs(dis,knots=3.20745), data=Boston)
coef(fitspl)
## (Intercept) bs(dis, knots = 3.20745)1
## 0.7344739 -0.0580976
## bs(dis, knots = 3.20745)2 bs(dis, knots = 3.20745)3
## -0.4635635 -0.1997882
## bs(dis, knots = 3.20745)4
## -0.3888095
par(mfrow=c(1,1))
fitspl=lm(nox~bs(dis,knots=3.20745), data=Boston)
pred=predict (fitspl,newdata =list(dis=dis.grid),se=T)
plot(nox~dis,col="gray")
lines(dis.grid ,pred$fit ,lwd=2)
lines(dis.grid ,pred$fit+2*pred$se ,lty="dashed")
lines(dis.grid ,pred$fit-2*pred$se ,lty="dashed")
BONUS (Extra Credit)
#fitspl2
attr(bs(dis,df=6) ,"knots")
## 25% 50% 75%
## 2.100175 3.207450 5.188425
fitspl2=lm(nox~bs(dis,knots=c(2.100175, 3.207450, 5.188425)), data=Boston)
coef(fitspl2)
## (Intercept)
## 0.65622323
## bs(dis, knots = c(2.100175, 3.20745, 5.188425))1
## 0.10222089
## bs(dis, knots = c(2.100175, 3.20745, 5.188425))2
## -0.02962935
## bs(dis, knots = c(2.100175, 3.20745, 5.188425))3
## -0.15958951
## bs(dis, knots = c(2.100175, 3.20745, 5.188425))4
## -0.22814651
## bs(dis, knots = c(2.100175, 3.20745, 5.188425))5
## -0.26271629
## bs(dis, knots = c(2.100175, 3.20745, 5.188425))6
## -0.24002467
par(mfrow=c(1,1))
fitspl2=lm(nox~bs(dis,knots=c(2.100175, 3.207450, 5.188425)), data=Boston)
pred=predict (fitspl2,newdata =list(dis=dis.grid),se=T)
plot(nox~dis,col="gray")
lines(dis.grid ,pred$fit ,lwd=2)
lines(dis.grid ,pred$fit+2*pred$se ,lty="dashed")
lines(dis.grid ,pred$fit-2*pred$se ,lty="dashed")
anova(fitspl2)#SS(Res): 1.834
## Analysis of Variance Table
##
## Response: nox
## Df Sum Sq Mean Sq F value
## bs(dis, knots = c(2.100175, 3.20745, 5.188425)) 6 4.947 0.82450 224.34
## Residuals 499 1.834 0.00368
## Pr(>F)
## bs(dis, knots = c(2.100175, 3.20745, 5.188425)) < 2.2e-16 ***
## Residuals
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#fitspl3
attr(bs(dis,df=8) ,"knots")
## 16.66667% 33.33333% 50% 66.66667% 83.33333%
## 1.851317 2.384033 3.207450 4.325700 6.062200
fitspl3=lm(nox~bs(dis,knots=c(1.851317, 2.384033, 3.207450, 4.325700, 6.062200)), data=Boston)
coef(fitspl3)
## (Intercept)
## 0.63233987
## bs(dis, knots = c(1.851317, 2.384033, 3.20745, 4.3257, 6.0622))1
## 0.13966173
## bs(dis, knots = c(1.851317, 2.384033, 3.20745, 4.3257, 6.0622))2
## 0.03656085
## bs(dis, knots = c(1.851317, 2.384033, 3.20745, 4.3257, 6.0622))3
## -0.01656388
## bs(dis, knots = c(1.851317, 2.384033, 3.20745, 4.3257, 6.0622))4
## -0.13408173
## bs(dis, knots = c(1.851317, 2.384033, 3.20745, 4.3257, 6.0622))5
## -0.14378286
## bs(dis, knots = c(1.851317, 2.384033, 3.20745, 4.3257, 6.0622))6
## -0.23668725
## bs(dis, knots = c(1.851317, 2.384033, 3.20745, 4.3257, 6.0622))7
## -0.20770311
## bs(dis, knots = c(1.851317, 2.384033, 3.20745, 4.3257, 6.0622))8
## -0.22869189
par(mfrow=c(1,1))
fitspl3=lm(nox~bs(dis,knots=c(1.851317, 2.384033, 3.207450, 4.325700, 6.062200)), data=Boston)
pred=predict (fitspl3,newdata =list(dis=dis.grid),se=T)
plot(nox~dis,col="gray")
lines(dis.grid ,pred$fit ,lwd=2)
lines(dis.grid ,pred$fit+2*pred$se ,lty="dashed")
lines(dis.grid ,pred$fit-2*pred$se ,lty="dashed")
anova(fitspl3)#SS(Res): 1.817
## Analysis of Variance Table
##
## Response: nox
## Df Sum Sq
## bs(dis, knots = c(1.851317, 2.384033, 3.20745, 4.3257, 6.0622)) 8 4.964
## Residuals 497 1.817
## Mean Sq
## bs(dis, knots = c(1.851317, 2.384033, 3.20745, 4.3257, 6.0622)) 0.62050
## Residuals 0.00366
## F value
## bs(dis, knots = c(1.851317, 2.384033, 3.20745, 4.3257, 6.0622)) 169.72
## Residuals
## Pr(>F)
## bs(dis, knots = c(1.851317, 2.384033, 3.20745, 4.3257, 6.0622)) < 2.2e-16
## Residuals
##
## bs(dis, knots = c(1.851317, 2.384033, 3.20745, 4.3257, 6.0622)) ***
## Residuals
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#fitspl4
attr(bs(dis,df=5) ,"knots")
## 33.33333% 66.66667%
## 2.384033 4.325700
fitspl4=lm(nox~bs(dis,knots=c(2.384033, 4.325700)), data=Boston)
coef(fitspl3)
## (Intercept)
## 0.63233987
## bs(dis, knots = c(1.851317, 2.384033, 3.20745, 4.3257, 6.0622))1
## 0.13966173
## bs(dis, knots = c(1.851317, 2.384033, 3.20745, 4.3257, 6.0622))2
## 0.03656085
## bs(dis, knots = c(1.851317, 2.384033, 3.20745, 4.3257, 6.0622))3
## -0.01656388
## bs(dis, knots = c(1.851317, 2.384033, 3.20745, 4.3257, 6.0622))4
## -0.13408173
## bs(dis, knots = c(1.851317, 2.384033, 3.20745, 4.3257, 6.0622))5
## -0.14378286
## bs(dis, knots = c(1.851317, 2.384033, 3.20745, 4.3257, 6.0622))6
## -0.23668725
## bs(dis, knots = c(1.851317, 2.384033, 3.20745, 4.3257, 6.0622))7
## -0.20770311
## bs(dis, knots = c(1.851317, 2.384033, 3.20745, 4.3257, 6.0622))8
## -0.22869189
par(mfrow=c(1,1))
fitspl4=lm(nox~bs(dis,knots=c(2.384033, 4.325700)), data=Boston)
pred=predict (fitspl4,newdata =list(dis=dis.grid),se=T)
plot(nox~dis,col="gray")
lines(dis.grid ,pred$fit ,lwd=2)
lines(dis.grid ,pred$fit+2*pred$se ,lty="dashed")
lines(dis.grid ,pred$fit-2*pred$se ,lty="dashed")
anova(fitspl4)#SS(Res): 1.817
## Analysis of Variance Table
##
## Response: nox
## Df Sum Sq Mean Sq F value Pr(>F)
## bs(dis, knots = c(2.384033, 4.3257)) 5 4.9408 0.98816 268.5 < 2.2e-16
## Residuals 500 1.8402 0.00368
##
## bs(dis, knots = c(2.384033, 4.3257)) ***
## Residuals
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Because the SS(Res) does not decrease between 5 and 8 d.f., I can assume that the 5 d.f. is the better fit than a model with a higher d.f.
College<-read.csv("College.csv")
set.seed(2)
train2<-sample(777, 389)
# STEP 1:
mod1a<-lm(Outstate~Private, data=College, subset=train2)
anova(mod1a)#SS(Res): 4591090808
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## Private 1 1782357286 1782357286 152.82 < 2.2e-16 ***
## Residuals 387 4513540277 11662895
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod1b<-lm(Outstate~Apps, data=College, subset=train2)
anova(mod1b)#SS(Res): 6266364602
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## Apps 1 44971 44971 0.0028 0.9581
## Residuals 387 6295852593 16268353
mod1c<-lm(Outstate~Accept, data=College, subset=train2)
anova(mod1c)#SS(Res): 6318908842
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## Accept 1 12480906 12480906 0.7687 0.3812
## Residuals 387 6283416657 16236219
mod1d<-lm(Outstate~Enroll, data=College, subset=train2)
anova(mod1d)#SS(Res): 6281125417
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## Enroll 1 173116087 173116087 10.942 0.001028 **
## Residuals 387 6122781477 15821141
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod1e<-lm(Outstate~Top10perc, data=College, subset=train2)
anova(mod1e)#SS(Res): 4251835806
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## Top10perc 1 1849264979 1849264979 160.95 < 2.2e-16 ***
## Residuals 387 4446632584 11490007
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod1f<-lm(Outstate~Top25perc, data=College, subset=train2)
anova(mod1f)#SS(Res): 4735252656
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## Top25perc 1 1439678038 1439678038 114.73 < 2.2e-16 ***
## Residuals 387 4856219526 12548371
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod1g<-lm(Outstate~F.Undergrad, data=College, subset=train2)
anova(mod1g)#SS(Res): 6193610713
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## F.Undergrad 1 321773428 321773428 20.844 6.703e-06 ***
## Residuals 387 5974124136 15437013
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod1h<-lm(Outstate~P.Undergrad, data=College, subset=train2)
anova(mod1h)#SS(Res): 6044228456
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## P.Undergrad 1 422620049 422620049 27.847 2.192e-07 ***
## Residuals 387 5873277514 15176428
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod1i<-lm(Outstate~Room.Board, data=College, subset=train2)
anova(mod1i)#SS(Res): 3596442911
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## Room.Board 1 2635084556 2635084556 278.57 < 2.2e-16 ***
## Residuals 387 3660813007 9459465
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod1j<-lm(Outstate~Books, data=College, subset=train2)
anova(mod1j)#SS(Res): 6296235608
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## Books 1 1735353 1735353 0.1067 0.7441
## Residuals 387 6294162211 16263985
mod1k<-lm(Outstate~Personal, data=College, subset=train2)
anova(mod1k)#SS(Res): 5694489718
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## Personal 1 586943315 586943315 39.788 7.741e-10 ***
## Residuals 387 5708954249 14751820
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod1l<-lm(Outstate~PhD, data=College, subset=train2)
anova(mod1l)#SS(Res): 5278611441
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## PhD 1 844239521 844239521 59.931 8.666e-14 ***
## Residuals 387 5451658043 14086972
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod1m<-lm(Outstate~Terminal, data=College, subset=train2)
anova(mod1m)#SS(Res): 5297553089
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## Terminal 1 1085899781 1085899781 80.661 < 2.2e-16 ***
## Residuals 387 5209997783 13462527
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod1n<-lm(Outstate~S.F.Ratio, data=College, subset=train2)
anova(mod1n)#SS(Res): 4658647520
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## S.F.Ratio 1 2215413197 2215413197 210.11 < 2.2e-16 ***
## Residuals 387 4080484367 10543887
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod1o<-lm(Outstate~perc.alumni, data=College, subset=train2)
anova(mod1o)#SS(Res): 4411808546
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## perc.alumni 1 1801092540 1801092540 155.07 < 2.2e-16 ***
## Residuals 387 4494805023 11614483
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod1p<-lm(Outstate~Expend, data=College, subset=train2)
anova(mod1p)#SS(Res): 3158403944 (lowest SS(Res))
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## Expend 1 3273654565 3273654565 419.19 < 2.2e-16 ***
## Residuals 387 3022242999 7809413
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod1q<-lm(Outstate~Grad.Rate, data=College, subset=train2)
anova(mod1q)#SS(Res): 4246114477
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## Grad.Rate 1 1971518478 1971518478 176.44 < 2.2e-16 ***
## Residuals 387 4324379086 11174106
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#STEP 2
mod2a<-lm(Outstate~Expend+Room.Board, data=College, subset=train2)
anova(mod2a)#SS(Res): 2450161953 (lowest SS(Res))
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## Expend 1 3273654565 3273654565 528.17 < 2.2e-16 ***
## Room.Board 1 629788790 629788790 101.61 < 2.2e-16 ***
## Residuals 386 2392454209 6198068
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod2b<-lm(Outstate~Expend+Grad.Rate, data=College, subset=train2)
anova(mod2b)#SS(Res): 2572849254
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## Expend 1 3273654565 3273654565 509.75 < 2.2e-16 ***
## Grad.Rate 1 543309506 543309506 84.60 < 2.2e-16 ***
## Residuals 386 2478933493 6422107
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod2c<-lm(Outstate~Expend+Top10perc, data=College, subset=train2)
anova(mod2c)#SS(Res): 3033568194
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## Expend 1 3273654565 3273654565 425.8522 < 2.2e-16 ***
## Top10perc 1 54944115 54944115 7.1474 0.007826 **
## Residuals 386 2967298884 7687303
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod2d<-lm(Outstate~Expend+Enroll, data=College, subset=train2)
anova(mod2d)#SS(Res): 3003655223
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## Expend 1 3273654565 3273654565 455.882 < 2.2e-16 ***
## Enroll 1 250406414 250406414 34.871 7.73e-09 ***
## Residuals 386 2771836585 7180924
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod2e<-lm(Outstate~Expend+P.Undergrad, data=College, subset=train2)
anova(mod2e)#SS(Res): 3035743880
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## Expend 1 3273654565 3273654565 455.678 < 2.2e-16 ***
## P.Undergrad 1 249164217 249164217 34.682 8.448e-09 ***
## Residuals 386 2773078782 7184142
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#STEP 3
mod3a<-lm(Outstate~Expend+Room.Board+Grad.Rate, data=College, subset=train2)
anova(mod3a)#SS(Res): 2098174445 (lowest SS(Res))
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## Expend 1 3273654565 3273654565 602.32 < 2.2e-16 ***
## Room.Board 1 629788790 629788790 115.88 < 2.2e-16 ***
## Grad.Rate 1 299958262 299958262 55.19 7.126e-13 ***
## Residuals 385 2092495947 5435054
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod3b<-lm(Outstate~Expend+Room.Board+Top10perc, data=College, subset=train2)
anova(mod3b)#SS(Res): 2298481559
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## Expend 1 3273654565 3273654565 534.4243 < 2e-16 ***
## Room.Board 1 629788790 629788790 102.8131 < 2e-16 ***
## Top10perc 1 34108897 34108897 5.5683 0.01879 *
## Residuals 385 2358345312 6125572
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod3c<-lm(Outstate~Expend+Room.Board+Enroll, data=College, subset=train2)
anova(mod3c)#SS(Res): 2322841728
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## Expend 1 3273654565 3273654565 570.998 < 2.2e-16 ***
## Room.Board 1 629788790 629788790 109.849 < 2.2e-16 ***
## Enroll 1 185165928 185165928 32.297 2.616e-08 ***
## Residuals 385 2207288281 5733216
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod3d<-lm(Outstate~Expend+Room.Board+P.Undergrad, data=College, subset=train2)
anova(mod3d)#SS(Res): 2238738796
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## Expend 1 3273654565 3273654565 582.868 < 2.2e-16 ***
## Room.Board 1 629788790 629788790 112.133 < 2.2e-16 ***
## P.Undergrad 1 230117529 230117529 40.972 4.495e-10 ***
## Residuals 385 2162336679 5616459
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#STEP 4
mod4a<-lm(Outstate~Expend+Room.Board+Grad.Rate+Top10perc, data=College, subset=train2)
anova(mod4a)#SS(Res): 2055247327
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## Expend 1 3273654565 3273654565 601.3769 < 2.2e-16 ***
## Room.Board 1 629788790 629788790 115.6935 < 2.2e-16 ***
## Grad.Rate 1 299958262 299958262 55.1029 7.439e-13 ***
## Top10perc 1 2153902 2153902 0.3957 0.5297
## Residuals 384 2090342045 5443599
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#STEP 5
mod5a<-lm(Outstate~Expend+Room.Board+Grad.Rate+Top10perc+Enroll, data=College, subset=train2)
anova(mod5a)#SS(Res): 1920563991
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## Expend 1 3273654565 3273654565 652.6432 < 2.2e-16 ***
## Room.Board 1 629788790 629788790 125.5561 < 2.2e-16 ***
## Grad.Rate 1 299958262 299958262 59.8004 9.367e-14 ***
## Top10perc 1 2153902 2153902 0.4294 0.5127
## Enroll 1 169216269 169216269 33.7353 1.328e-08 ***
## Residuals 383 1921125777 5015994
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#STEP 6
mod6a<-lm(Outstate~Expend+Room.Board+Grad.Rate+Top10perc+Enroll+P.Undergrad, data=College, subset=train2)
anova(mod6a)#SS(Res): 1907010902
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## Expend 1 3273654565 3273654565 659.3602 < 2.2e-16 ***
## Room.Board 1 629788790 629788790 126.8484 < 2.2e-16 ***
## Grad.Rate 1 299958262 299958262 60.4158 7.187e-14 ***
## Top10perc 1 2153902 2153902 0.4338 0.5105
## Enroll 1 169216269 169216269 34.0825 1.130e-08 ***
## P.Undergrad 1 24535752 24535752 4.9418 0.0268 *
## Residuals 382 1896590024 4964895
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#STEP 7
mod7a<-lm(Outstate~Expend+Room.Board+Grad.Rate+Top10perc+Enroll+P.Undergrad+Terminal, data=College, subset=train2)
anova(mod7a)#SS(Res): 1891272531
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## Expend 1 3273654565 3273654565 671.1427 < 2.2e-16 ***
## Room.Board 1 629788790 629788790 129.1151 < 2.2e-16 ***
## Grad.Rate 1 299958262 299958262 61.4954 4.503e-14 ***
## Top10perc 1 2153902 2153902 0.4416 0.506764
## Enroll 1 169216269 169216269 34.6916 8.493e-09 ***
## P.Undergrad 1 24535752 24535752 5.0302 0.025484 *
## Terminal 1 38174033 38174033 7.8262 0.005411 **
## Residuals 381 1858415991 4877732
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#STEP 8
mod8a<-lm(Outstate~Expend+Room.Board+Grad.Rate+Top10perc+Enroll+P.Undergrad+Terminal+Private, data=College, subset=train2)
anova(mod8a)#SS(Res): 1597288070
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## Expend 1 3273654565 3273654565 766.6603 < 2.2e-16 ***
## Room.Board 1 629788790 629788790 147.4908 < 2.2e-16 ***
## Grad.Rate 1 299958262 299958262 70.2475 1.029e-15 ***
## Top10perc 1 2153902 2153902 0.5044 0.477998
## Enroll 1 169216269 169216269 39.6289 8.479e-10 ***
## P.Undergrad 1 24535752 24535752 5.7461 0.017008 *
## Terminal 1 38174033 38174033 8.9400 0.002972 **
## Private 1 235808589 235808589 55.2242 7.177e-13 ***
## Residuals 380 1622607402 4270019
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#STEP 9
mod9a<-lm(Outstate~Expend+Room.Board+Grad.Rate+Top10perc+Enroll+P.Undergrad+Terminal+Private+Top25perc, data=College, subset=train2)
anova(mod9a)#SS(Res): 1597019550
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## Expend 1 3273654565 3273654565 768.1643 < 2.2e-16 ***
## Room.Board 1 629788790 629788790 147.7802 < 2.2e-16 ***
## Grad.Rate 1 299958262 299958262 70.3853 9.768e-16 ***
## Top10perc 1 2153902 2153902 0.5054 0.477568
## Enroll 1 169216269 169216269 39.7067 8.200e-10 ***
## P.Undergrad 1 24535752 24535752 5.7573 0.016902 *
## Terminal 1 38174033 38174033 8.9576 0.002944 **
## Private 1 235808589 235808589 55.3326 6.872e-13 ***
## Top25perc 1 7438420 7438420 1.7454 0.187250
## Residuals 379 1615168982 4261660
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#STEP 10
mod10a<-lm(Outstate~Expend+Room.Board+Grad.Rate+Top10perc+Enroll+P.Undergrad+Terminal+Private+Top25perc+F.Undergrad, data=College, subset=train2)
anova(mod10a)#SS(Res): 1596315162
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## Expend 1 3273654565 3273654565 774.8735 < 2.2e-16 ***
## Room.Board 1 629788790 629788790 149.0709 < 2.2e-16 ***
## Grad.Rate 1 299958262 299958262 71.0001 7.566e-16 ***
## Top10perc 1 2153902 2153902 0.5098 0.475654
## Enroll 1 169216269 169216269 40.0535 7.001e-10 ***
## P.Undergrad 1 24535752 24535752 5.8076 0.016434 *
## Terminal 1 38174033 38174033 9.0358 0.002824 **
## Private 1 235808589 235808589 55.8159 5.572e-13 ***
## Top25perc 1 7438420 7438420 1.7607 0.185342
## F.Undergrad 1 18209641 18209641 4.3102 0.038559 *
## Residuals 378 1596959341 4224760
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#STEP 11
mod11a<-lm(Outstate~Expend+Room.Board+Grad.Rate+Top10perc+Enroll+P.Undergrad+Terminal+Private+Top25perc+F.Undergrad+Personal, data=College, subset=train2)
anova(mod11a)#SS(Res): 1548018590
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## Expend 1 3273654565 3273654565 788.0290 < 2.2e-16 ***
## Room.Board 1 629788790 629788790 151.6018 < 2.2e-16 ***
## Grad.Rate 1 299958262 299958262 72.2055 4.561e-16 ***
## Top10perc 1 2153902 2153902 0.5185 0.471935
## Enroll 1 169216269 169216269 40.7335 5.126e-10 ***
## P.Undergrad 1 24535752 24535752 5.9062 0.015553 *
## Terminal 1 38174033 38174033 9.1892 0.002603 **
## Private 1 235808589 235808589 56.7635 3.681e-13 ***
## Top25perc 1 7438420 7438420 1.7906 0.181664
## F.Undergrad 1 18209641 18209641 4.3834 0.036958 *
## Personal 1 30814287 30814287 7.4176 0.006759 **
## Residuals 377 1566145055 4154231
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#STEP 12
mod12a<-lm(Outstate~Expend+Room.Board+Grad.Rate+Top10perc+Enroll+P.Undergrad+Terminal+Private+Top25perc+F.Undergrad+Personal+PhD, data=College, subset=train2)
anova(mod12a)#SS(Res): 1530778098
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## Expend 1 3273654565 3273654565 786.1278 < 2.2e-16 ***
## Room.Board 1 629788790 629788790 151.2360 < 2.2e-16 ***
## Grad.Rate 1 299958262 299958262 72.0313 4.949e-16 ***
## Top10perc 1 2153902 2153902 0.5172 0.472471
## Enroll 1 169216269 169216269 40.6352 5.378e-10 ***
## P.Undergrad 1 24535752 24535752 5.8920 0.015679 *
## Terminal 1 38174033 38174033 9.1670 0.002634 **
## Private 1 235808589 235808589 56.6265 3.929e-13 ***
## Top25perc 1 7438420 7438420 1.7862 0.182193
## F.Undergrad 1 18209641 18209641 4.3728 0.037187 *
## Personal 1 30814287 30814287 7.3997 0.006826 **
## PhD 1 376515 376515 0.0904 0.763816
## Residuals 376 1565768540 4164278
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#STEP 13
mod13a<-lm(Outstate~Expend+Room.Board+Grad.Rate+Top10perc+Enroll+P.Undergrad+Terminal+Private+Top25perc+F.Undergrad+Personal+PhD+S.F.Ratio, data=College, subset=train2)
anova(mod13a)#SS(Res): 1522195228
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## Expend 1 3273654565 3273654565 791.7048 < 2.2e-16 ***
## Room.Board 1 629788790 629788790 152.3089 < 2.2e-16 ***
## Grad.Rate 1 299958262 299958262 72.5423 4.013e-16 ***
## Top10perc 1 2153902 2153902 0.5209 0.470907
## Enroll 1 169216269 169216269 40.9235 4.721e-10 ***
## P.Undergrad 1 24535752 24535752 5.9338 0.015318 *
## Terminal 1 38174033 38174033 9.2321 0.002545 **
## Private 1 235808589 235808589 57.0282 3.307e-13 ***
## Top25perc 1 7438420 7438420 1.7989 0.180655
## F.Undergrad 1 18209641 18209641 4.4038 0.036526 *
## Personal 1 30814287 30814287 7.4522 0.006635 **
## PhD 1 376515 376515 0.0911 0.763005
## S.F.Ratio 1 15164674 15164674 3.6674 0.056246 .
## Residuals 375 1550603866 4134944
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#All other variables are not significant, so stop here.
#install.packages("gam")
library(gam)
## Loading required package: foreach
##
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
##
## accumulate, when
## Loaded gam 1.16.1
gam1<-gam(Outstate~Expend+Room.Board+Grad.Rate+Top10perc+Enroll+P.Undergrad+Terminal+Private+Top25perc+F.Undergrad+Personal+PhD+S.F.Ratio, data=College, subset=train2)
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts
## argument ignored
par(mfrow=c(1,1))
plot(gam1, se=TRUE ,col ="blue")
#Fitted line shows a negative slope, which would reflect a decreasing SS(Res).
modtest<-lm(Outstate~Expend+Room.Board+Grad.Rate+Top10perc+Enroll+P.Undergrad+Terminal+Private+Top25perc+F.Undergrad+Personal+PhD+S.F.Ratio, data=College, subset=-train2)
summary(modtest)
##
## Call:
## lm(formula = Outstate ~ Expend + Room.Board + Grad.Rate + Top10perc +
## Enroll + P.Undergrad + Terminal + Private + Top25perc + F.Undergrad +
## Personal + PhD + S.F.Ratio, data = College, subset = -train2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5755.9 -1210.6 -11.1 1286.4 5131.4
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.732e+03 1.058e+03 -2.582 0.010217 *
## Expend 1.455e-01 2.913e-02 4.993 9.13e-07 ***
## Room.Board 8.701e-01 1.239e-01 7.021 1.04e-11 ***
## Grad.Rate 3.041e+01 8.058e+00 3.774 0.000187 ***
## Top10perc 2.790e+01 1.515e+01 1.842 0.066297 .
## Enroll 2.876e-01 4.776e-01 0.602 0.547351
## P.Undergrad 5.498e-03 1.297e-01 0.042 0.966202
## Terminal 2.539e+01 1.282e+01 1.981 0.048371 *
## PrivateYes 3.137e+03 3.536e+02 8.870 < 2e-16 ***
## Top25perc -5.477e+00 1.214e+01 -0.451 0.652157
## F.Undergrad -1.020e-01 9.778e-02 -1.043 0.297657
## Personal -2.501e-01 2.025e-01 -1.235 0.217506
## PhD 2.942e+01 1.163e+01 2.530 0.011824 *
## S.F.Ratio -3.361e+01 3.314e+01 -1.014 0.311093
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2027 on 374 degrees of freedom
## Multiple R-squared: 0.7544, Adjusted R-squared: 0.7459
## F-statistic: 88.38 on 13 and 374 DF, p-value: < 2.2e-16
coef(modtest)
## (Intercept) Expend Room.Board Grad.Rate Top10perc
## -2.732359e+03 1.454575e-01 8.700907e-01 3.041055e+01 2.789818e+01
## Enroll P.Undergrad Terminal PrivateYes Top25perc
## 2.876355e-01 5.497668e-03 2.538643e+01 3.136621e+03 -5.477265e+00
## F.Undergrad Personal PhD S.F.Ratio
## -1.019745e-01 -2.501106e-01 2.942397e+01 -3.361150e+01
#Summary of the model gives an overall small p-value (< 2.2e-16), which would indicate the model is successful, but there are several individual p-values that are not significant, meaning the model would likely benefit from removing several levels of complexity.
#I would say any variable that lowered the SS(Res) but didn't appear to have a significant relationship (Enroll,P.Undergrad,Top25perc,F.Undergrad,Personal,and S.F.Ratio).