This question uses the variables dis (the weighted mean of distances to five Boston employment centers) and nox (nitrogen oxides concentration in parts per 10 million) from the Boston data. We will treat dis as the predictor and nox as the response.
library(tidyverse)
## -- Attaching packages ---------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1 v purrr 0.3.2
## v tibble 2.1.3 v dplyr 0.8.3
## v tidyr 0.8.3 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts ------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
data("Boston")
names(Boston)
## [1] "crim" "zn" "indus" "chas" "nox" "rm" "age"
## [8] "dis" "rad" "tax" "ptratio" "black" "lstat" "medv"
Use the poly() function to fit a cubic polynomial regression to predict nox using dis. Report the regression output, and plot the resulting data and polynomial fits.
attach(Boston)
set.seed(1)
model = lm(nox ~ poly(dis, 3), data = Boston)
coef(summary(model))
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.5546951 0.00275939 201.020894 0.000000e+00
## poly(dis, 3)1 -2.0030959 0.06207094 -32.271071 1.597201e-124
## poly(dis, 3)2 0.8563300 0.06207094 13.795987 6.133104e-37
## poly(dis, 3)3 -0.3180490 0.06207094 -5.123959 4.274950e-07
All three polynomial terms are highly significant. Note that poly() uses orthogonal polynomials by default, so these coefficients are not the coefficients of the raw powers of dis; the fitted curve, however, is identical to a raw-polynomial fit.
pred = predict(model, Boston)
ggplot()+
geom_point(aes(dis,nox))+
geom_line(aes(dis,pred))
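For a smoother curve with uncertainty bands, the cubic fit can also be evaluated on a fine grid with pointwise standard errors. A minimal sketch (the grid size of 100 is an arbitrary choice):
dis.grid <- seq(min(dis), max(dis), length.out = 100)
fit3 <- predict(model, newdata = list(dis = dis.grid), se.fit = TRUE)
ggplot() +
  geom_point(aes(dis, nox), alpha = 0.5) +
  geom_line(aes(dis.grid, fit3$fit), colour = "blue") +
  geom_line(aes(dis.grid, fit3$fit + 2 * fit3$se.fit), linetype = "dashed") +
  geom_line(aes(dis.grid, fit3$fit - 2 * fit3$se.fit), linetype = "dashed")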
Plot the polynomial fits for a range of different polynomial degrees (say, from 1 to 10), and report the associated residual sum of squares.
errors = matrix(ncol = 1, nrow = 10)
for (n in 1:10) {
  poly.fit = lm(nox ~ poly(dis, n), data = Boston)
  poly.pred = predict(poly.fit, Boston)
  print(ggplot() +
          geom_point(aes(dis, nox)) +
          geom_line(aes(dis, poly.pred)))
  errors[n, 1] = mean((Boston$nox - poly.pred)^2)
}
errors
## [,1]
## [1,] 0.005471468
## [2,] 0.004022257
## [3,] 0.003822345
## [4,] 0.003820121
## [5,] 0.003785158
## [6,] 0.003711971
## [7,] 0.003655106
## [8,] 0.003627727
## [9,] 0.003623183
## [10,] 0.003620892
fitDF <- data.frame(MSE = as.vector(errors),
                    degree = 1:10)
ggplot(fitDF, aes(degree, MSE)) +
  geom_point() +
  geom_line()
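The prompt asks for the residual sum of squares; the values stored above are training MSEs, and RSS is simply n times MSE, so the two convey the same information:
rss <- errors * nrow(Boston)  # RSS = n * MSE; n = 506 for the Boston data
rss
As expected for nested models refit on the same data, the training error decreases monotonically as the degree grows.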
Perform cross-validation or another approach to select the optimal degree for the polynomial, and explain your results.
train = sample(1:dim(Boston)[1], dim(Boston)[1]/2)
trainW = Boston[train,]
testW = Boston[-train,]
polyDF = matrix(nrow = 10, ncol = 2)
colnames(polyDF) = c("Degree", "MSE")
for (i in 1:10) {
  mod1 = lm(nox ~ poly(dis, i), data = trainW)
  polyDF[i, 1] = i
  pred = predict(mod1, newdata = testW)
  polyDF[i, 2] = mean((testW$nox - pred)^2)
}
polyDF = as.data.frame(polyDF)
ggplot(polyDF, aes(Degree,MSE))+
geom_point()+
geom_line()
A degree of 3 or 4 is a sensible choice: on this validation split the test MSE reaches its minimum around degree 4 and rises again from degree 5 onward, indicating that the extra flexibility of higher-degree polynomials only fits noise.
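A single train/test split can be noisy, so as a check, k-fold cross-validation averages over many splits. A minimal sketch using cv.glm() from the boot package (an added dependency, not loaded elsewhere in this document):
library(boot)
set.seed(1)
cv.errors <- rep(NA, 10)
for (i in 1:10) {
  poly.glm <- glm(nox ~ poly(dis, i), data = Boston)
  cv.errors[i] <- cv.glm(Boston, poly.glm, K = 10)$delta[1]  # 10-fold CV estimate of test MSE
}
which.min(cv.errors)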
Use the bs() function to fit a regression spline to predict nox using dis. Report the output for the fit using four degrees of freedom. How did you choose the knots? Plot the resulting fit.
library(splines)
dislims = range(dis)
dis.grid = seq(from = dislims[1], to = dislims[2], length.out = 100)
par(mfrow = c(1, 1))
fit = lm(nox ~ bs(dis, df = 4), data = Boston)
pred = predict(fit, newdata = list(dis = dis.grid), se = TRUE)
plot(dis, nox, col = "gray")
lines(dis.grid, pred$fit, lwd = 2)
lines(dis.grid, pred$fit + 2 * pred$se.fit, lty = "dashed")
lines(dis.grid, pred$fit - 2 * pred$se.fit, lty = "dashed")
The spline tracks the data closely over the whole range of dis. With df = 4 the knots were not chosen by hand: bs() places a single interior knot at the median of dis, since by default it sets knots at uniform quantiles of the data. (Knots must lie inside the range of dis, roughly 1.1 to 12.1, so fixed values such as 25, 50, and 75 would fall outside the data and produce a rank-deficient fit.)
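To verify where bs() put the knot, the attributes of the basis matrix can be inspected; a quick check using the attached Boston data:
b4 <- bs(dis, df = 4)
attr(b4, "knots")           # interior knot(s): the median of dis
attr(b4, "Boundary.knots")  # boundary knots: range(dis)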
Now fit a regression spline for a range of degrees of freedom, and plot the resulting fits and report the resulting RSS. Describe the results obtained.
error = matrix(ncol = 1, nrow = 10)
for (n in 1:10) {
  spline.fit = lm(nox ~ bs(dis, df = n), data = Boston)
  spline.pred = predict(spline.fit, Boston)
  print(ggplot() +
          geom_point(aes(dis, nox)) +
          geom_line(aes(dis, spline.pred)))
  error[n, 1] = mean((Boston$nox - spline.pred)^2)
}
## Warning in bs(dis, df = 1): 'df' was too small; have used 3
## Warning in bs(dis, df = 2): 'df' was too small; have used 3
error
## [,1]
## [1,] 0.003822345
## [2,] 0.003822345
## [3,] 0.003822345
## [4,] 0.003799951
## [5,] 0.003636705
## [6,] 0.003624439
## [7,] 0.003616372
## [8,] 0.003590899
## [9,] 0.003608009
## [10,] 0.003542559
fitDF1 <- data.frame(MSE = as.vector(error),
                     degree = 1:10)
ggplot(fitDF1, aes(degree, MSE)) +
  geom_point() +
  geom_line()
The first three fits are identical because bs() raises any df below 3 up to 3. Beyond that, the training MSE (and hence the RSS, which is n times the MSE) declines slowly as df grows, with a small uptick at df = 9; added flexibility generally reduces in-sample error, but the fits for different df are not strictly nested, so the decrease need not be monotone.
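Cross-validation can also guide the choice of df for the spline. A sketch using cv.glm() from the boot package again; the warnings it triggers about x values beyond the boundary knots are suppressed, since held-out folds can contain dis values outside a training fold's range:
library(boot)
set.seed(1)
cv.spline <- rep(NA, 10)
for (n in 3:10) {  # bs() bumps df below 3 up to 3, so start at 3
  cv.fit <- glm(nox ~ bs(dis, df = n), data = Boston)
  cv.spline[n] <- suppressWarnings(cv.glm(Boston, cv.fit, K = 10)$delta[1])
}
which.min(cv.spline)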
detach(Boston)
This question relates to the College data set.
#install.packages("gam")
library(gam)
## Loading required package: foreach
##
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
##
## accumulate, when
## Loaded gam 1.16.1
library(ISLR)
data(College)
Split the data into a training set and a test set. Using out-of-state tuition as the response and the other variables as the predictors, perform forward stepwise selection on the training set in order to identify a satisfactory model that uses just a subset of the predictors.
attach(College)
set.seed(1)
names(College)
## [1] "Private" "Apps" "Accept" "Enroll" "Top10perc"
## [6] "Top25perc" "F.Undergrad" "P.Undergrad" "Outstate" "Room.Board"
## [11] "Books" "Personal" "PhD" "Terminal" "S.F.Ratio"
## [16] "perc.alumni" "Expend" "Grad.Rate"
train = sample(1:dim(College)[1], dim(College)[1]/2)
training= College[train,]
testing = College[-train,]
gam1=lm(Outstate~Private ,data=training)
summary(gam1) # yes
##
## Call:
## lm(formula = Outstate ~ Private, data = training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7493 -2327 -403 2007 9807
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6783.8 346.8 19.56 <2e-16 ***
## PrivateYes 5109.1 407.5 12.54 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3587 on 386 degrees of freedom
## Multiple R-squared: 0.2894, Adjusted R-squared: 0.2875
## F-statistic: 157.2 on 1 and 386 DF, p-value: < 2.2e-16
gam2=lm(Outstate~Apps ,data=training)
summary(gam2) # no
##
## Call:
## lm(formula = Outstate ~ Apps, data = training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7664.9 -3415.9 -330.3 2847.3 11472.4
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.018e+04 2.692e+02 37.811 <2e-16 ***
## Apps 9.490e-02 5.028e-02 1.888 0.0598 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4236 on 386 degrees of freedom
## Multiple R-squared: 0.009147, Adjusted R-squared: 0.00658
## F-statistic: 3.563 on 1 and 386 DF, p-value: 0.05982
gam3=lm(Outstate~Accept ,data=training)
summary(gam3) # no
##
## Call:
## lm(formula = Outstate ~ Accept, data = training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7836.8 -3456.3 -466.7 2770.3 11290.5
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.040e+04 2.780e+02 37.392 <2e-16 ***
## Accept 4.159e-02 8.267e-02 0.503 0.615
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4254 on 386 degrees of freedom
## Multiple R-squared: 0.0006551, Adjusted R-squared: -0.001934
## F-statistic: 0.253 on 1 and 386 DF, p-value: 0.6152
gam4=lm(Outstate~Enroll,data=training)
summary(gam4) # no
##
## Call:
## lm(formula = Outstate ~ Enroll, data = training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8047.7 -3277.6 -612.2 2704.7 10926.3
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10822.0368 285.4255 37.915 <2e-16 ***
## Enroll -0.4244 0.2355 -1.802 0.0723 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4238 on 386 degrees of freedom
## Multiple R-squared: 0.008343, Adjusted R-squared: 0.005774
## F-statistic: 3.248 on 1 and 386 DF, p-value: 0.07231
gam5=lm(Outstate~Top10perc ,data=training)
summary(gam5) # yes
##
## Call:
## lm(formula = Outstate ~ Top10perc, data = training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8882.6 -2681.2 209.2 2274.0 11608.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6606.034 329.100 20.07 <2e-16 ***
## Top10perc 139.410 9.993 13.95 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3470 on 386 degrees of freedom
## Multiple R-squared: 0.3352, Adjusted R-squared: 0.3335
## F-statistic: 194.6 on 1 and 386 DF, p-value: < 2.2e-16
gam6=lm(Outstate~Top25perc ,data=training)
summary(gam6) # yes
##
## Call:
## lm(formula = Outstate ~ Top25perc, data = training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9023.1 -2974.0 208.1 2598.8 11524.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4695.557 566.287 8.292 1.87e-15 ***
## Top25perc 103.389 9.534 10.845 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3726 on 386 degrees of freedom
## Multiple R-squared: 0.2335, Adjusted R-squared: 0.2315
## F-statistic: 117.6 on 1 and 386 DF, p-value: < 2.2e-16
gam7=lm(Outstate~F.Undergrad ,data=training)
summary(gam7) # yes
##
## Call:
## lm(formula = Outstate ~ F.Undergrad, data = training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8346.8 -3200.6 -561.3 2642.1 10695.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.107e+04 2.721e+02 40.708 < 2e-16 ***
## F.Undergrad -1.545e-01 4.435e-02 -3.484 0.00055 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4190 on 386 degrees of freedom
## Multiple R-squared: 0.03049, Adjusted R-squared: 0.02798
## F-statistic: 12.14 on 1 and 386 DF, p-value: 0.0005502
gam8=lm(Outstate~P.Undergrad ,data=training)
summary(gam8) # yes
##
## Call:
## lm(formula = Outstate ~ P.Undergrad, data = training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8265.1 -3277.2 -431.4 2713.2 10505.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11195.6443 247.8704 45.167 < 2e-16 ***
## P.Undergrad -0.7670 0.1444 -5.312 1.84e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4108 on 386 degrees of freedom
## Multiple R-squared: 0.06812, Adjusted R-squared: 0.06571
## F-statistic: 28.22 on 1 and 386 DF, p-value: 1.835e-07
gam9=lm(Outstate~Books ,data=training)
summary(gam9) # no
##
## Call:
## lm(formula = Outstate ~ Books, data = training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7902 -3427 -500 2776 11214
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.046e+04 8.485e+02 12.326 <2e-16 ***
## Books 4.528e-02 1.475e+00 0.031 0.976
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4255 on 386 degrees of freedom
## Multiple R-squared: 2.441e-06, Adjusted R-squared: -0.002588
## F-statistic: 0.0009421 on 1 and 386 DF, p-value: 0.9755
gam10=lm(Outstate~Room.Board ,data=training)
summary(gam10) # yes
##
## Call:
## lm(formula = Outstate ~ Room.Board, data = training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7982.0 -2063.3 -362.2 1911.0 11985.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -613.2019 644.5743 -0.951 0.342
## Room.Board 2.5189 0.1417 17.774 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3156 on 386 degrees of freedom
## Multiple R-squared: 0.4501, Adjusted R-squared: 0.4487
## F-statistic: 315.9 on 1 and 386 DF, p-value: < 2.2e-16
gam11=lm(Outstate~Personal ,data=training)
summary(gam11) # yes
gam12=lm(Outstate~PhD ,data=training)
summary(gam12) # yes
##
## Call:
## lm(formula = Outstate ~ PhD, data = training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8442.9 -3291.9 55.3 2846.0 15157.5
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2904.61 908.26 3.198 0.0015 **
## PhD 103.94 12.16 8.551 2.9e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3902 on 386 degrees of freedom
## Multiple R-squared: 0.1593, Adjusted R-squared: 0.1571
## F-statistic: 73.12 on 1 and 386 DF, p-value: 2.897e-16
gam13=lm(Outstate~Terminal ,data=training)
summary(gam13) # yes
##
## Call:
## lm(formula = Outstate ~ Terminal, data = training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10259.4 -3160.4 203.9 2577.4 13684.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1072.97 1102.49 0.973 0.331
## Terminal 117.66 13.56 8.677 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3893 on 386 degrees of freedom
## Multiple R-squared: 0.1632, Adjusted R-squared: 0.161
## F-statistic: 75.28 on 1 and 386 DF, p-value: < 2.2e-16
gam14=lm(Outstate~S.F.Ratio ,data=training)
summary(gam14) # yes
##
## Call:
## lm(formula = Outstate ~ S.F.Ratio, data = training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10749.7 -2734.9 -207.4 2362.7 9334.2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 19505.66 654.37 29.81 <2e-16 ***
## S.F.Ratio -644.18 45.03 -14.31 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3440 on 386 degrees of freedom
## Multiple R-squared: 0.3465, Adjusted R-squared: 0.3448
## F-statistic: 204.7 on 1 and 386 DF, p-value: < 2.2e-16
gam15=lm(Outstate~perc.alumni ,data=training)
summary(gam15) # yes
##
## Call:
## lm(formula = Outstate ~ perc.alumni, data = training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9996.2 -2490.7 -562.5 2101.1 9599.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6205.32 364.84 17.01 <2e-16 ***
## perc.alumni 194.17 14.44 13.44 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3512 on 386 degrees of freedom
## Multiple R-squared: 0.3189, Adjusted R-squared: 0.3171
## F-statistic: 180.7 on 1 and 386 DF, p-value: < 2.2e-16
gam16=lm(Outstate~Expend ,data=training)
summary(gam16) # yes
##
## Call:
## lm(formula = Outstate ~ Expend, data = training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14016.9 -2277.4 -8.2 2231.3 8109.2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.700e+03 3.201e+02 17.81 <2e-16 ***
## Expend 4.822e-01 2.782e-02 17.33 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3191 on 386 degrees of freedom
## Multiple R-squared: 0.4377, Adjusted R-squared: 0.4362
## F-statistic: 300.4 on 1 and 386 DF, p-value: < 2.2e-16
gam17=lm(Outstate~Grad.Rate ,data=training)
summary(gam17) # yes
##
## Call:
## lm(formula = Outstate ~ Grad.Rate, data = training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11615.2 -2319.0 -290.2 2316.4 12670.4
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1176.1 681.4 1.726 0.0851 .
## Grad.Rate 142.8 10.1 14.137 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3454 on 386 degrees of freedom
## Multiple R-squared: 0.3411, Adjusted R-squared: 0.3394
## F-statistic: 199.8 on 1 and 386 DF, p-value: < 2.2e-16
gam19=lm(Outstate~Private +Top10perc ,data=training)
summary(gam19)
##
## Call:
## lm(formula = Outstate ~ Private + Top10perc, data = training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7234.8 -1699.5 -61.9 1897.3 10328.4
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3739.734 339.662 11.01 <2e-16 ***
## PrivateYes 4498.731 324.972 13.84 <2e-16 ***
## Top10perc 125.325 8.239 15.21 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2839 on 385 degrees of freedom
## Multiple R-squared: 0.5561, Adjusted R-squared: 0.5538
## F-statistic: 241.2 on 2 and 385 DF, p-value: < 2.2e-16
gam20=lm(Outstate~Private +Top10perc + Top25perc ,data=training)
summary(gam20)
##
## Call:
## lm(formula = Outstate ~ Private + Top10perc + Top25perc, data = training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7227.8 -1675.9 -41.8 1888.1 10324.5
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3583.910 590.923 6.065 3.16e-09 ***
## PrivateYes 4515.813 329.636 13.699 < 2e-16 ***
## Top10perc 120.150 18.046 6.658 9.62e-11 ***
## Top25perc 5.134 15.921 0.322 0.747
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2842 on 384 degrees of freedom
## Multiple R-squared: 0.5562, Adjusted R-squared: 0.5528
## F-statistic: 160.4 on 3 and 384 DF, p-value: < 2.2e-16
gam21=lm(Outstate~Private +Top10perc + F.Undergrad ,data=training)
summary(gam21)
##
## Call:
## lm(formula = Outstate ~ Private + Top10perc + F.Undergrad, data = training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7219.4 -1691.8 -62.8 1859.6 10354.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.603e+03 4.332e+02 8.316 1.6e-15 ***
## PrivateYes 4.637e+03 4.230e+02 10.962 < 2e-16 ***
## Top10perc 1.239e+02 8.728e+00 14.191 < 2e-16 ***
## F.Undergrad 2.031e-02 3.980e-02 0.510 0.61
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2842 on 384 degrees of freedom
## Multiple R-squared: 0.5564, Adjusted R-squared: 0.553
## F-statistic: 160.6 on 3 and 384 DF, p-value: < 2.2e-16
gam22=lm(Outstate~Private +Top10perc + P.Undergrad ,data=training)
summary(gam22)
##
## Call:
## lm(formula = Outstate ~ Private + Top10perc + P.Undergrad, data = training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7249.4 -1688.2 -59.9 1919.8 10361.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.601e+03 4.176e+02 8.623 <2e-16 ***
## PrivateYes 4.599e+03 3.693e+02 12.451 <2e-16 ***
## Top10perc 1.255e+02 8.255e+00 15.208 <2e-16 ***
## P.Undergrad 6.516e-02 1.139e-01 0.572 0.568
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2841 on 384 degrees of freedom
## Multiple R-squared: 0.5565, Adjusted R-squared: 0.553
## F-statistic: 160.6 on 3 and 384 DF, p-value: < 2.2e-16
gam23=lm(Outstate~Private +Top10perc + Room.Board ,data=training)
summary(gam23)
##
## Call:
## lm(formula = Outstate ~ Private + Top10perc + Room.Board, data = training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7881.7 -1672.9 -156.6 1450.1 10946.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1193.5710 496.5161 -2.404 0.0167 *
## PrivateYes 3545.0605 286.9933 12.352 <2e-16 ***
## Top10perc 86.2009 7.7001 11.195 <2e-16 ***
## Room.Board 1.5236 0.1248 12.211 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2412 on 384 degrees of freedom
## Multiple R-squared: 0.6803, Adjusted R-squared: 0.6778
## F-statistic: 272.3 on 3 and 384 DF, p-value: < 2.2e-16
gam24=lm(Outstate~Private +Top10perc + Room.Board + Personal ,data=training)
summary(gam24)
##
## Call:
## lm(formula = Outstate ~ Private + Top10perc + Room.Board + Personal,
## data = training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7119.7 -1484.9 -131.4 1471.6 10450.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -52.2526 582.0216 -0.090 0.928511
## PrivateYes 3345.3311 287.9427 11.618 < 2e-16 ***
## Top10perc 85.6750 7.5833 11.298 < 2e-16 ***
## Room.Board 1.4955 0.1231 12.149 < 2e-16 ***
## Personal -0.6345 0.1756 -3.614 0.000342 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2375 on 383 degrees of freedom
## Multiple R-squared: 0.6908, Adjusted R-squared: 0.6876
## F-statistic: 213.9 on 4 and 383 DF, p-value: < 2.2e-16
gam25=lm(Outstate~Private +Top10perc + Room.Board + PhD ,data=training)
summary(gam25)
##
## Call:
## lm(formula = Outstate ~ Private + Top10perc + Room.Board + PhD,
## data = training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7473.1 -1662.2 -117.4 1479.7 12242.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3200.9005 691.8850 -4.626 5.10e-06 ***
## PrivateYes 3964.3598 299.4820 13.237 < 2e-16 ***
## Top10perc 68.3599 8.7219 7.838 4.58e-14 ***
## Room.Board 1.3639 0.1284 10.622 < 2e-16 ***
## PhD 39.8165 9.7545 4.082 5.44e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2365 on 383 degrees of freedom
## Multiple R-squared: 0.6936, Adjusted R-squared: 0.6904
## F-statistic: 216.8 on 4 and 383 DF, p-value: < 2.2e-16
gam26=lm(Outstate~Private +Top10perc + Room.Board + PhD + Terminal ,data=training)
summary(gam26)
##
## Call:
## lm(formula = Outstate ~ Private + Top10perc + Room.Board + PhD +
## Terminal, data = training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7409.8 -1587.7 -48.9 1498.0 11999.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4179.0715 780.7669 -5.353 1.50e-07 ***
## PrivateYes 4040.3618 298.5926 13.531 < 2e-16 ***
## Top10perc 67.3749 8.6633 7.777 6.98e-14 ***
## Room.Board 1.3088 0.1291 10.136 < 2e-16 ***
## PhD 12.4176 14.2168 0.873 0.38297
## Terminal 39.8985 15.1625 2.631 0.00885 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2347 on 382 degrees of freedom
## Multiple R-squared: 0.6991, Adjusted R-squared: 0.6951
## F-statistic: 177.5 on 5 and 382 DF, p-value: < 2.2e-16
gam27=lm(Outstate~Private +Top10perc + Room.Board + PhD + S.F.Ratio ,data=training)
summary(gam27)
##
## Call:
## lm(formula = Outstate ~ Private + Top10perc + Room.Board + PhD +
## S.F.Ratio, data = training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7251.2 -1518.0 -0.7 1427.3 11690.5
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 213.1338 1049.8490 0.203 0.839
## PrivateYes 3411.8547 320.5010 10.645 < 2e-16 ***
## Top10perc 57.4778 8.9086 6.452 3.35e-10 ***
## Room.Board 1.2714 0.1275 9.972 < 2e-16 ***
## PhD 39.6702 9.5438 4.157 3.99e-05 ***
## S.F.Ratio -163.7344 38.4852 -4.254 2.64e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2314 on 382 degrees of freedom
## Multiple R-squared: 0.7075, Adjusted R-squared: 0.7036
## F-statistic: 184.8 on 5 and 382 DF, p-value: < 2.2e-16
gam28=lm(Outstate~Private +Top10perc + Room.Board +PhD + perc.alumni ,data=training)
summary(gam28)
##
## Call:
## lm(formula = Outstate ~ Private + Top10perc + Room.Board + PhD +
## perc.alumni, data = training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6649.2 -1639.1 -88.1 1416.0 11474.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3583.6657 661.0026 -5.422 1.05e-07 ***
## PrivateYes 3129.9358 313.2784 9.991 < 2e-16 ***
## Top10perc 49.8640 8.7860 5.675 2.74e-08 ***
## Room.Board 1.4115 0.1224 11.532 < 2e-16 ***
## PhD 35.5360 9.3050 3.819 0.000156 ***
## perc.alumni 72.7893 11.3570 6.409 4.32e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2250 on 382 degrees of freedom
## Multiple R-squared: 0.7234, Adjusted R-squared: 0.7197
## F-statistic: 199.8 on 5 and 382 DF, p-value: < 2.2e-16
gam29=lm(Outstate~Private +Top10perc + Room.Board +PhD + perc.alumni + Expend ,data=training)
summary(gam29)
##
## Call:
## lm(formula = Outstate ~ Private + Top10perc + Room.Board + PhD +
## perc.alumni + Expend, data = training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6158.0 -1516.1 -113.9 1380.0 10226.5
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.055e+03 6.385e+02 -4.785 2.45e-06 ***
## PrivateYes 2.902e+03 3.021e+02 9.607 < 2e-16 ***
## Top10perc 2.295e+01 9.518e+00 2.411 0.016378 *
## Room.Board 1.220e+00 1.213e-01 10.051 < 2e-16 ***
## PhD 3.127e+01 8.931e+00 3.502 0.000517 ***
## perc.alumni 6.866e+01 1.089e+01 6.306 7.93e-10 ***
## Expend 1.645e-01 2.729e-02 6.027 3.94e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2153 on 381 degrees of freedom
## Multiple R-squared: 0.7474, Adjusted R-squared: 0.7435
## F-statistic: 187.9 on 6 and 381 DF, p-value: < 2.2e-16
gam30=lm(Outstate~Private + Room.Board +PhD + perc.alumni + Expend + Grad.Rate ,data=training)
summary(gam30)
##
## Call:
## lm(formula = Outstate ~ Private + Room.Board + PhD + perc.alumni +
## Expend + Grad.Rate, data = training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6465.2 -1400.5 -190.6 1332.8 10451.5
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.112e+03 6.262e+02 -6.566 1.69e-10 ***
## PrivateYes 2.713e+03 2.997e+02 9.053 < 2e-16 ***
## Room.Board 1.122e+00 1.225e-01 9.163 < 2e-16 ***
## PhD 3.195e+01 8.397e+00 3.805 0.000165 ***
## perc.alumni 6.045e+01 1.099e+01 5.501 6.92e-08 ***
## Expend 1.923e-01 2.377e-02 8.089 8.15e-15 ***
## Grad.Rate 3.246e+01 7.880e+00 4.120 4.66e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2122 on 381 degrees of freedom
## Multiple R-squared: 0.7545, Adjusted R-squared: 0.7507
## F-statistic: 195.2 on 6 and 381 DF, p-value: < 2.2e-16
The best model found by this forward search uses Private, Room.Board, PhD, perc.alumni, Expend, and Grad.Rate: every term is highly significant and its adjusted R-squared (about 0.75) is the highest among the candidates considered.
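The manual search above can be automated. A sketch using regsubsets() from the leaps package (an added dependency, not loaded elsewhere in this document) with method = "forward":
library(leaps)
fwd <- regsubsets(Outstate ~ ., data = training, nvmax = 17, method = "forward")
fwd.summary <- summary(fwd)
which.min(fwd.summary$bic)              # model size favored by BIC
coef(fwd, which.min(fwd.summary$bic))   # coefficients of that model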
Fit a GAM on the training data, using out-of-state tuition as the response and the features selected in the previous step as the predictors. Plot the results, and explain your findings.
gam31 = gam(Outstate ~ Private + s(Room.Board, df = 2) + s(PhD, df = 2) + s(perc.alumni, df = 2) + s(Expend, df = 2) + s(Grad.Rate, df = 2), data = training)
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts
## argument ignored
par(mfrow = c(2, 3))
plot(gam31, se = TRUE, col = "red")
The fitted smooths for Room.Board, PhD, perc.alumni, and Grad.Rate look roughly linear, while the smooth for Expend shows pronounced curvature, rising steeply at low expenditure and leveling off; the formal tests in the last part agree.
Evaluate the model obtained on the test set, and explain the results obtained.
gam.pred = predict(gam31, newdata = testing)
gam.err <- mean((testing$Outstate - gam.pred)^2)
gam.err
(Note that predict() takes newdata, not data; passing data = testing silently returns the training-set fitted values, and the subtraction then recycles vectors of different lengths, producing a meaningless error estimate.) gam.err is the mean squared prediction error on the held-out colleges; its square root gives a typical prediction error in dollars of tuition.
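To put the test MSE on an interpretable scale, it can be converted to the fraction of held-out variance explained:
tss <- mean((testing$Outstate - mean(testing$Outstate))^2)  # baseline: predict the test mean
1 - gam.err / tss                                           # test-set R-squared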
For which variables, if any, is there evidence of a non-linear relationship with the response?
summary(gam31)
##
## Call: gam(formula = Outstate ~ Private + s(Room.Board, df = 2) + s(PhD,
## df = 2) + s(perc.alumni, df = 2) + s(Expend, df = 2) + s(Grad.Rate,
## df = 2), data = training)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -6500.0 -1344.2 -104.2 1350.9 8792.9
##
## (Dispersion Parameter for gaussian family taken to be 3946537)
##
## Null Deviance: 6989966760 on 387 degrees of freedom
## Residual Deviance: 1483898477 on 376.0001 degrees of freedom
## AIC: 7007.986
##
## Number of Local Scoring Iterations: 2
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## Private 1 1843025465 1843025465 466.998 < 2.2e-16 ***
## s(Room.Board, df = 2) 1 1692725515 1692725515 428.914 < 2.2e-16 ***
## s(PhD, df = 2) 1 397159048 397159048 100.635 < 2.2e-16 ***
## s(perc.alumni, df = 2) 1 351049729 351049729 88.951 < 2.2e-16 ***
## s(Expend, df = 2) 1 419096845 419096845 106.194 < 2.2e-16 ***
## s(Grad.Rate, df = 2) 1 81869675 81869675 20.745 7.101e-06 ***
## Residuals 376 1483898477 3946537
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## Private
## s(Room.Board, df = 2) 1 2.005 0.15763
## s(PhD, df = 2) 1 3.781 0.05258 .
## s(perc.alumni, df = 2) 1 0.314 0.57572
## s(Expend, df = 2) 1 47.156 2.725e-11 ***
## s(Grad.Rate, df = 2) 1 1.057 0.30446
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
There is strong evidence of a non-linear relationship between Expend and Outstate: the ANOVA for nonparametric effects gives a p-value of about 2.7e-11 for s(Expend). s(PhD) shows only weak evidence (p of about 0.053), and the smooths for Room.Board, perc.alumni, and Grad.Rate appear adequately described by linear terms.
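A direct test is to compare the fitted GAM against one in which Expend enters linearly, using anova() to perform an F test; a sketch keeping the same smoothing choices as gam31:
gam.lin <- gam(Outstate ~ Private + s(Room.Board, df = 2) + s(PhD, df = 2) +
                 s(perc.alumni, df = 2) + Expend + s(Grad.Rate, df = 2),
               data = training)
anova(gam.lin, gam31, test = "F")  # a small p-value favors the smooth in Expend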