mod1 <- lm(Boston$nox ~ poly(Boston$dis,3))
summary(mod1)
##
## Call:
## lm(formula = Boston$nox ~ poly(Boston$dis, 3))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.121130 -0.040619 -0.009738 0.023385 0.194904
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.554695 0.002759 201.021 < 2e-16 ***
## poly(Boston$dis, 3)1 -2.003096 0.062071 -32.271 < 2e-16 ***
## poly(Boston$dis, 3)2 0.856330 0.062071 13.796 < 2e-16 ***
## poly(Boston$dis, 3)3 -0.318049 0.062071 -5.124 4.27e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.06207 on 502 degrees of freedom
## Multiple R-squared: 0.7148, Adjusted R-squared: 0.7131
## F-statistic: 419.3 on 3 and 502 DF, p-value: < 2.2e-16
ggplot(data=Boston, aes(x=dis, y=nox)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ poly(x, 3), se = FALSE)
##### (b.)
dim(Boston)
## [1] 506 14
set.seed(239)
# split the data in half for a validation-set estimate of test error
train = sample(nrow(Boston), floor(nrow(Boston)/2))
degreePoly = 10
polyMSE = matrix(nrow=degreePoly, ncol=2)
colnames(polyMSE) = c("degree", "MSE")
for(i in 1:degreePoly){
  polyMSE[i,1] <- i
  # fit on the training half, then compute MSE on the held-out half
  this.fit <- lm(nox ~ poly(dis,i), data=Boston, subset=train)
  polyMSE[i,2] <- mean((Boston$nox - predict(this.fit, Boston))[-train]^2)
}
polyDF <- as.data.frame(polyMSE)
ggplot(data=polyDF, aes(x=degree, y=MSE)) +
  geom_point() +
  geom_line()
library(boot)
attach(Boston)
set.seed(239)
# 10-fold cross-validation error for polynomial degrees 1 through 10
errors <- rep(0,10)
for(i in 1:10){
  glm.fit <- glm(nox ~ poly(dis,i), data=Boston)
  errors[i] <- cv.glm(Boston, glm.fit, K=10)$delta[1]
}
cvDF <- data.frame(degree=1:10, errors)
cvDF5 <- cvDF %>%
  filter(degree <= 5)
ggplot(data=cvDF5, aes(x = degree, y = errors)) +
  geom_point() +
  geom_line()
It looks like a third-degree polynomial fits best, since that is where the cross-validated test error is lowest.
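Rather than reading the minimum off the plot, a quick sketch of confirming it programmatically from the cvDF data frame built above:
# degree with the smallest 10-fold CV error
cvDF$degree[which.min(cvDF$errors)]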
library(splines)
dislims = range(dis)
dis.grid = seq(from=dislims[1], to=dislims[2])
# cubic spline with knots at dis = 3, 6, and 8
fit = lm(nox ~ bs(dis, knots=c(3,6,8)), data=Boston)
pred = predict(fit, newdata=list(dis=dis.grid), se=TRUE)
plot(dis, nox, col="gray")
lines(dis.grid, pred$fit, lwd=2)
lines(dis.grid, pred$fit + 2*pred$se, lty="dashed")
lines(dis.grid, pred$fit - 2*pred$se, lty="dashed")
summary(fit)
##
## Call:
## lm(formula = nox ~ bs(dis, knots = c(3, 6, 8)), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.132406 -0.038970 -0.009732 0.025620 0.187718
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.70803 0.01617 43.794 < 2e-16 ***
## bs(dis, knots = c(3, 6, 8))1 0.00953 0.02578 0.370 0.712
## bs(dis, knots = c(3, 6, 8))2 -0.26101 0.01816 -14.375 < 2e-16 ***
## bs(dis, knots = c(3, 6, 8))3 -0.22973 0.02576 -8.920 < 2e-16 ***
## bs(dis, knots = c(3, 6, 8))4 -0.33005 0.03047 -10.832 < 2e-16 ***
## bs(dis, knots = c(3, 6, 8))5 -0.26763 0.06039 -4.432 1.15e-05 ***
## bs(dis, knots = c(3, 6, 8))6 -0.30598 0.06071 -5.040 6.52e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.06117 on 499 degrees of freedom
## Multiple R-squared: 0.7247, Adjusted R-squared: 0.7214
## F-statistic: 218.9 on 6 and 499 DF, p-value: < 2.2e-16
detach(Boston)
I picked these knots by eyeballing where the data looked like it might be changing shape, spacing them out roughly evenly. If I had instead supplied df and let R choose, bs() would have placed the knots at quantiles of dis, so that each interval holds roughly the same number of observations.
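A quick sketch of checking where bs() would put the knots on its own (assuming df = 6, which for a cubic spline means three interior knots at the quartiles):
# knots chosen automatically by bs() when df is specified
attr(bs(Boston$dis, df = 6), "knots")
# these should match the quartiles of dis
quantile(Boston$dis, probs = c(0.25, 0.5, 0.75))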
Wait, are we really supposed to do this with all the variables? There are around 20 of them and two layers of things to check. Here's how I would do forward stepwise selection if I really had to:
College = read.csv("http://faculty.marshall.usc.edu/gareth-james/ISL/College.csv",header=TRUE)
#names(College)
#dim(College)
train = sample(nrow(College), floor(nrow(College) / 2))
mod1a <- lm(Outstate ~ perc.alumni + Room.Board,data=College)
anova(mod1a)
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## perc.alumni 1 4027178047 4027178047 606.39 < 2.2e-16 ***
## Room.Board 1 3391774193 3391774193 510.71 < 2.2e-16 ***
## Residuals 774 5140345186 6641273
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# fit a simple regression of Outstate on each other column and record the
# model sum of squares for each (this sweep also hits the X column of school
# names, which is what triggers the "perfect fit" warning below)
mod1sums <- c()
for(pred in College[, names(College) != "Outstate"]){
  mod <- lm(Outstate ~ pred, data=College)
  mod1sums <- c(mod1sums, anova(mod)[1,2])
}
## Warning in anova.lm(mod): ANOVA F-tests on an essentially perfect fit are
## unreliable
mod1sums
## [1] 12559297426 3835884599 31598296 8330540 303598468
## [6] 3971446292 3008031144 584567601 807167148 5376025290
## [11] 18960781 1123466496 1842141521 2090498711 3866086440
## [16] 4027178047 5684728234 4099005307
# name of the column with the highest model sum of squares (careful: mod1sums
# skips Outstate, so this lookup only lines up for columns listed before it)
names(College)[which(mod1sums == max(mod1sums))]
## [1] "X"
# "X" is just the school-name identifier column (hence the perfect-fit warning
# above), so ignore it and check that a genuine predictor is significant
mod1 <- lm(Outstate ~ perc.alumni, data=College)
anova(mod1)
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## perc.alumni 1 4027178047 4027178047 365.8 < 2.2e-16 ***
## Residuals 775 8532119379 11009186
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#then repeat, adding one variable at a time, until the newly added predictor is no longer significant (a sketch of automating this with the leaps package follows below)
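For reference, a minimal sketch of that automation using leaps::regsubsets() rather than looping by hand (this assumes the leaps package is installed; the X identifier column is excluded):
library(leaps)
# forward stepwise selection over every predictor except the identifier column
fwd <- regsubsets(Outstate ~ . - X, data = College, nvmax = 17, method = "forward")
summary(fwd)$which  # which variables enter at each model size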
That sounds awful to do by hand, though, so until we learn to iterate over columns more cleanly in R and I can automate it properly, I'm just going to fit the full model and pick out the significant variables like this:
mod = lm(Outstate~.-X, data = College , subset = train)
summary(mod)
##
## Call:
## lm(formula = Outstate ~ . - X, data = College, subset = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6638.2 -1255.2 -36.9 1084.5 9197.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.170e+03 1.058e+03 -2.051 0.041 *
## PrivateYes 2.095e+03 3.736e+02 5.609 3.99e-08 ***
## Apps -6.053e-01 1.130e-01 -5.355 1.50e-07 ***
## Accept 1.247e+00 2.250e-01 5.539 5.77e-08 ***
## Enroll -3.115e-01 5.806e-01 -0.536 0.592
## Top10perc 2.377e+01 1.585e+01 1.500 0.135
## Top25perc -8.113e+00 1.225e+01 -0.662 0.508
## F.Undergrad -1.527e-01 1.147e-01 -1.332 0.184
## P.Undergrad 1.215e-01 1.015e-01 1.197 0.232
## Room.Board 7.002e-01 1.215e-01 5.762 1.75e-08 ***
## Books -7.128e-02 5.342e-01 -0.133 0.894
## Personal -3.426e-01 1.823e-01 -1.879 0.061 .
## PhD 1.436e+01 1.116e+01 1.287 0.199
## Terminal 1.781e+01 1.319e+01 1.351 0.178
## S.F.Ratio -3.108e+01 3.448e+01 -0.902 0.368
## perc.alumni 4.500e+01 1.113e+01 4.041 6.47e-05 ***
## Expend 3.051e-01 3.334e-02 9.151 < 2e-16 ***
## Grad.Rate 3.728e+01 8.366e+00 4.457 1.10e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1922 on 370 degrees of freedom
## Multiple R-squared: 0.7941, Adjusted R-squared: 0.7846
## F-statistic: 83.94 on 17 and 370 DF, p-value: < 2.2e-16
newmod = lm(Outstate ~ Apps + Accept + perc.alumni + Grad.Rate + Room.Board + Expend,data=College, subset = train)
summary(newmod)
##
## Call:
## lm(formula = Outstate ~ Apps + Accept + perc.alumni + Grad.Rate +
## Room.Board + Expend, data = College, subset = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6174.2 -1470.2 -112.6 1248.5 8636.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.902e+03 5.443e+02 -5.331 1.68e-07 ***
## Apps -5.713e-01 1.074e-01 -5.317 1.80e-07 ***
## Accept 7.570e-01 1.686e-01 4.491 9.41e-06 ***
## perc.alumni 7.647e+01 1.097e+01 6.973 1.38e-11 ***
## Grad.Rate 5.708e+01 8.247e+00 6.921 1.91e-11 ***
## Room.Board 1.093e+00 1.201e-01 9.102 < 2e-16 ***
## Expend 3.587e-01 2.941e-02 12.195 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2104 on 381 degrees of freedom
## Multiple R-squared: 0.7457, Adjusted R-squared: 0.7417
## F-statistic: 186.2 on 6 and 381 DF, p-value: < 2.2e-16
There. That works fine, right?
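As a sanity check on that eyeballed selection, a rough sketch using base R's step() for AIC-based backward elimination from the full training-set model fit above (an aside, not part of the assignment):
# AIC-based backward elimination starting from the full model
step.mod <- step(mod, direction = "backward", trace = 0)
formula(step.mod)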
#install.packages("gam")
#install.packages("mgcv")
library(gam)
## Loading required package: foreach
##
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
##
## accumulate, when
## Loaded gam 1.16.1
gam1 = gam(Outstate ~ Private + s(Room.Board,4) + s(Expend,4) + s(perc.alumni,4) + s(Grad.Rate,4) + s(Terminal,4) , data = College , subset = train)
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts
## argument ignored
summary(gam1)
##
## Call: gam(formula = Outstate ~ Private + s(Room.Board, 4) + s(Expend,
## 4) + s(perc.alumni, 4) + s(Grad.Rate, 4) + s(Terminal, 4),
## data = College, subset = train)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -6947.5 -1133.5 73.4 1184.5 6673.1
##
## (Dispersion Parameter for gaussian family taken to be 3247109)
##
## Null Deviance: 6634665171 on 387 degrees of freedom
## Residual Deviance: 1188440845 on 365.9997 degrees of freedom
## AIC: 6941.839
##
## Number of Local Scoring Iterations: 2
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## Private 1 1852988435 1852988435 570.6579 < 2.2e-16 ***
## s(Room.Board, 4) 1 1140118801 1140118801 351.1181 < 2.2e-16 ***
## s(Expend, 4) 1 1414732578 1414732578 435.6899 < 2.2e-16 ***
## s(perc.alumni, 4) 1 156291036 156291036 48.1324 1.817e-11 ***
## s(Grad.Rate, 4) 1 84460230 84460230 26.0109 5.460e-07 ***
## s(Terminal, 4) 1 10298965 10298965 3.1717 0.07575 .
## Residuals 366 1188440845 3247109
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## Private
## s(Room.Board, 4) 3 2.9644 0.03209 *
## s(Expend, 4) 3 25.1570 7.994e-15 ***
## s(perc.alumni, 4) 3 1.7457 0.15724
## s(Grad.Rate, 4) 3 3.1035 0.02667 *
## s(Terminal, 4) 3 0.7703 0.51121
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
par(mfrow=c(1,6))
plot(gam1, se=TRUE ,col ="blue")
This all seems pretty good, I think? Every term is significant in the parametric ANOVA, with Terminal only marginally so. In the nonparametric ANOVA, though, only Expend shows a clearly significant nonlinear effect (Room.Board and Grad.Rate are borderline, and perc.alumni and Terminal are not), so most of these terms might be better represented as plain linear terms. I'm going to try that, keeping smooths on Expend and perc.alumni for now:
gam2 = gam(Outstate ~ Private + Room.Board + s(Expend,4) + s(perc.alumni,4) + Grad.Rate + Terminal , data = College , subset = train)
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts
## argument ignored
summary(gam2)
##
## Call: gam(formula = Outstate ~ Private + Room.Board + s(Expend, 4) +
## s(perc.alumni, 4) + Grad.Rate + Terminal, data = College,
## subset = train)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -7100.23 -1203.03 39.89 1256.35 6718.56
##
## (Dispersion Parameter for gaussian family taken to be 3336592)
##
## Null Deviance: 6634665171 on 387 degrees of freedom
## Residual Deviance: 1251220917 on 374.9996 degrees of freedom
## AIC: 6943.813
##
## Number of Local Scoring Iterations: 2
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## Private 1 1848936042 1848936042 554.1390 < 2.2e-16 ***
## Room.Board 1 1127785912 1127785912 338.0053 < 2.2e-16 ***
## s(Expend, 4) 1 1396079889 1396079889 418.4149 < 2.2e-16 ***
## s(perc.alumni, 4) 1 165529756 165529756 49.6104 9.019e-12 ***
## Grad.Rate 1 88078707 88078707 26.3978 4.479e-07 ***
## Terminal 1 10751933 10751933 3.2224 0.07344 .
## Residuals 375 1251220917 3336592
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## Private
## Room.Board
## s(Expend, 4) 3 27.5660 4.441e-16 ***
## s(perc.alumni, 4) 3 1.6874 0.1692
## Grad.Rate
## Terminal
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
gam3 = gam(Outstate ~ Private + Room.Board + s(Expend,4) + perc.alumni + Grad.Rate, data = College , subset = train)
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts
## argument ignored
summary(gam3)
##
## Call: gam(formula = Outstate ~ Private + Room.Board + s(Expend, 4) +
## perc.alumni + Grad.Rate, data = College, subset = train)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -6875.56 -1072.64 22.33 1204.26 6487.87
##
## (Dispersion Parameter for gaussian family taken to be 3370829)
##
## Null Deviance: 6634665171 on 387 degrees of freedom
## Residual Deviance: 1277543644 on 378.9998 degrees of freedom
## AIC: 6943.89
##
## Number of Local Scoring Iterations: 2
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## Private 1 1860656625 1860656625 551.988 < 2.2e-16 ***
## Room.Board 1 1095797370 1095797370 325.082 < 2.2e-16 ***
## s(Expend, 4) 1 1367904265 1367904265 405.807 < 2.2e-16 ***
## perc.alumni 1 164068974 164068974 48.673 1.356e-11 ***
## Grad.Rate 1 91184388 91184388 27.051 3.250e-07 ***
## Residuals 379 1277543644 3370829
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## Private
## Room.Board
## s(Expend, 4) 3 31.304 < 2.2e-16 ***
## perc.alumni
## Grad.Rate
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
I think this looks like a better model: dropping the smooth on perc.alumni and the Terminal term barely changes the residual deviance or the AIC, and everything that remains is clearly significant.
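To make that comparison a bit less hand-wavy, a sketch of the nested F test the ISLR lab uses for gam fits (smallest model first):
# F tests comparing the simplified fits against the full GAM
anova(gam3, gam2, gam1, test = "F")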
# test-set predictions and MSEs for the two GAMs and the plain linear model
gam1.pred <- predict(gam1, newdata = College[-train,])
gam1.mse <- mean((College$Outstate[-train] - gam1.pred)^2)
gam3.pred <- predict(gam3, newdata = College[-train,])
gam3.mse <- mean((College$Outstate[-train] - gam3.pred)^2)
newmod.pred <- predict(newmod, newdata = College[-train,])
newmod.mse <- mean((College$Outstate[-train] - newmod.pred)^2)
gam1.mse
## [1] 3797807
gam3.mse
## [1] 3894087
newmod.mse
## [1] 5604592
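Just reshaping those numbers into one table for easier comparison:
data.frame(model = c("gam1 (full GAM)", "gam3 (simpler GAM)", "newmod (linear)"),
           test.MSE = c(gam1.mse, gam3.mse, newmod.mse))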
Hmmm, the test MSE for the simplified GAM (gam3) is slightly higher than for the full GAM (gam1), so maybe I don't know what I'm talking about. Both are far lower than the plain linear model, though, so clearly the smooth terms are doing something right.
Oops, I kind of answered this already. Expend is the main variable with evidence of a nonlinear relationship to Outstate, with weaker evidence for Room.Board and Grad.Rate, as shown by the p-values in the nonparametric-effects ANOVA; the smooth term for perc.alumni never reached significance.