#1A) Use the poly function to fit a cubic polynomial regression to predict nox using dis. Report the regression output, and plot the resulting data and polynomial fits.
library(MASS)
library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1 ✔ purrr 0.3.2
## ✔ tibble 2.1.3 ✔ dplyr 0.8.3
## ✔ tidyr 0.8.3 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ──────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::select() masks MASS::select()
data("Boston")
attach(Boston)
#visualize data
ggplot(Boston, aes(x=dis,y=nox))+
geom_point()
#cubic model
mod1 <- lm(nox~poly(dis,3))
summary(mod1)
##
## Call:
## lm(formula = nox ~ poly(dis, 3))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.121130 -0.040619 -0.009738 0.023385 0.194904
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.554695 0.002759 201.021 < 2e-16 ***
## poly(dis, 3)1 -2.003096 0.062071 -32.271 < 2e-16 ***
## poly(dis, 3)2 0.856330 0.062071 13.796 < 2e-16 ***
## poly(dis, 3)3 -0.318049 0.062071 -5.124 4.27e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.06207 on 502 degrees of freedom
## Multiple R-squared: 0.7148, Adjusted R-squared: 0.7131
## F-statistic: 419.3 on 3 and 502 DF, p-value: < 2.2e-16
#training MSE by polynomial degree
degreePoly <- 3
polyMSE <- matrix(nrow = degreePoly, ncol = 2)
colnames(polyMSE) <- c("degree", "MSE")
for(i in 1:degreePoly){
  polyMSE[i, 1] <- i
  this.fit <- lm(nox ~ poly(dis, i), data = Boston)
  polyMSE[i, 2] <- mean((nox - predict(this.fit, Boston))^2)
}
polyDF<-as.data.frame(polyMSE)
head(polyDF)
## degree MSE
## 1 1 0.005471468
## 2 2 0.004022257
## 3 3 0.003822345
#plot MSE
ggplot(data=polyDF, aes(x=degree, y=MSE))+
geom_point()+
geom_line()
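#1A also asks for a plot of the fit itself; a sketch overlaying the cubic fit
#(the grid object and its column names are illustrative, not from the original):
fit.grid <- data.frame(dis = seq(min(dis), max(dis), length.out = 100))
fit.grid$nox <- predict(mod1, newdata = fit.grid)
ggplot(Boston, aes(x = dis, y = nox)) +
  geom_point(color = "gray") +
  geom_line(data = fit.grid, color = "red")  #fitted cubic polynomial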
#1B) Plot the polynomial fits for a range of different polynomial degrees and report the associated sum of squares.
#MSE (= RSS/506) is reported here; ranking models by MSE is equivalent to ranking them by the residual sum of squares
degreePoly <- 10
polyMSE <- matrix(nrow = degreePoly, ncol = 2)
colnames(polyMSE) <- c("degree", "MSE")
for(i in 1:degreePoly){
  polyMSE[i, 1] <- i
  this.fit <- lm(nox ~ poly(dis, i), data = Boston)
  polyMSE[i, 2] <- mean((nox - predict(this.fit, Boston))^2)
}
polyDF<-as.data.frame(polyMSE)
polyDF
## degree MSE
## 1 1 0.005471468
## 2 2 0.004022257
## 3 3 0.003822345
## 4 4 0.003820121
## 5 5 0.003785158
## 6 6 0.003711971
## 7 7 0.003655106
## 8 8 0.003627727
## 9 9 0.003623183
## 10 10 0.003620892
#plot training MSE by degree
ggplot(data=polyDF, aes(x=degree, y=MSE))+
geom_point()+
geom_line()
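#To also plot the fits themselves for each degree, one sketch using
#purrr::map_dfr from the attached tidyverse (object names are illustrative):
poly.grid <- data.frame(dis = seq(min(dis), max(dis), length.out = 100))
poly.fits <- map_dfr(1:degreePoly, function(d){
  f <- lm(nox ~ poly(dis, d), data = Boston)
  data.frame(dis = poly.grid$dis, nox = predict(f, poly.grid), degree = d)
})
ggplot(Boston, aes(x = dis, y = nox)) +
  geom_point(color = "gray") +
  geom_line(data = poly.fits, aes(color = factor(degree)))  #one curve per degree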
#1C) Perform a cross validation or another approach to select the optimal degree for the polynomial, and explain your results.
library(boot)
str(Boston)
## 'data.frame': 506 obs. of 14 variables:
## $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ chas : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num 6.58 6.42 7.18 7 7.15 ...
## $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : int 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : num 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ black : num 397 397 393 395 397 ...
## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
## $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
set.seed(3)
#50-50 validation split of the 506 row indices
train<-sample(506,253)
degreePoly <- 10
polyMSE <- matrix(nrow = degreePoly, ncol = 2)
colnames(polyMSE) <- c("degree", "MSE")
for(i in 1:degreePoly){
  polyMSE[i, 1] <- i
  this.fit <- lm(nox ~ poly(dis, i), data = Boston, subset = train)
  polyMSE[i, 2] <- mean((nox - predict(this.fit, Boston))[-train]^2)
}
polyDF<-as.data.frame(polyMSE)
polyDF
## degree MSE
## 1 1 0.006023164
## 2 2 0.004283208
## 3 3 0.004043945
## 4 4 0.004048655
## 5 5 0.004004791
## 6 6 0.003901858
## 7 7 0.003825441
## 8 8 0.003801398
## 9 9 0.003796123
## 10 10 0.003806693
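#the validation-set MSE can be plotted the same way as the k-fold curve below
ggplot(data = polyDF, aes(x = degree, y = MSE)) +
  geom_point() +
  geom_line()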
#10-fold cross-validation
set.seed(3)
cv.error.k10 <- rep(0, 10)
for(i in 1:10){
  glm.fit <- glm(nox ~ poly(dis, i), data = Boston)
  cv.error.k10[i] <- cv.glm(Boston, glm.fit, K = 10)$delta[1]
}
cvDF<-data.frame(degree=1:10, cv.error.k10)
ggplot(data=cvDF, aes(x=degree, y=cv.error.k10))+
geom_point()+
geom_line()
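#degree minimizing the 10-fold CV error (a quick numeric check)
which.min(cv.error.k10)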
#anova
fit.1 <- lm(nox ~ dis, data = Boston)
fit.2 <- lm(nox ~ poly(dis, 2), data = Boston)
fit.3 <- lm(nox ~ poly(dis, 3), data = Boston)
fit.4 <- lm(nox ~ poly(dis, 4), data = Boston)
fit.5 <- lm(nox ~ poly(dis, 5), data = Boston)
fit.6 <- lm(nox ~ poly(dis, 6), data = Boston)
fit.7 <- lm(nox ~ poly(dis, 7), data = Boston)
fit.8 <- lm(nox ~ poly(dis, 8), data = Boston)
fit.9 <- lm(nox ~ poly(dis, 9), data = Boston)
fit.10 <- lm(nox ~ poly(dis, 10), data = Boston)
anova(fit.1, fit.2, fit.3, fit.4, fit.5, fit.6, fit.7, fit.8, fit.9, fit.10)
## Analysis of Variance Table
##
## Model 1: nox ~ dis
## Model 2: nox ~ poly(dis, 2)
## Model 3: nox ~ poly(dis, 3)
## Model 4: nox ~ poly(dis, 4)
## Model 5: nox ~ poly(dis, 5)
## Model 6: nox ~ poly(dis, 6)
## Model 7: nox ~ poly(dis, 7)
## Model 8: nox ~ poly(dis, 8)
## Model 9: nox ~ poly(dis, 9)
## Model 10: nox ~ poly(dis, 10)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 504 2.7686
## 2 503 2.0353 1 0.73330 198.1169 < 2.2e-16 ***
## 3 502 1.9341 1 0.10116 27.3292 2.535e-07 ***
## 4 501 1.9330 1 0.00113 0.3040 0.581606
## 5 500 1.9153 1 0.01769 4.7797 0.029265 *
## 6 499 1.8783 1 0.03703 10.0052 0.001657 **
## 7 498 1.8495 1 0.02877 7.7738 0.005505 **
## 8 497 1.8356 1 0.01385 3.7429 0.053601 .
## 9 496 1.8333 1 0.00230 0.6211 0.431019
## 10 495 1.8322 1 0.00116 0.3133 0.575908
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#The ANOVA comparisons of nested models show that terms up through the 7th degree can be statistically significant (although the 4th-degree term is not), so on that basis alone the model could be as complex as a 7th-degree polynomial. The cross-validation results, however, suggest that this complexity is not worthwhile: in the 50-50 split, test MSE keeps declining past the 3rd degree but only by a trivial amount, and in 10-fold CV the error actually increases after the 3rd degree. The optimal polynomial degree is therefore 3.
#1D) Use the bs() function to fit a regression spline to predict nox using dis. Report the output for the fit using four degrees of freedom. How did you choose the knots? Plot the resulting fit.
library(splines)
#The knot locations for bs() were chosen by R: with four degrees of freedom and a cubic basis, bs() places the knots at quantiles of dis.
attr(bs(dis,df=4) ,"knots")
## 50%
## 3.20745
#The output shows a single interior knot at dis = 3.20745, the 50th percentile of dis.
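#quick numeric check that this knot sits at the median of dis
median(dis)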
#grid of dis values for plotting the fitted spline
dislims <- range(dis)
dis.grid <- seq(from = dislims[1], to = dislims[2])
par(mfrow = c(1, 1))
fit <- lm(nox ~ bs(dis, df = 4, knots = c(3.20745)), data = Boston)
pred <- predict(fit, newdata = data.frame(dis = dis.grid), se = TRUE)
#output
summary(fit)
##
## Call:
## lm(formula = nox ~ bs(dis, df = 4, knots = c(3.20745)), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.124622 -0.039259 -0.008514 0.020850 0.193891
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.73447 0.01460 50.306 < 2e-16
## bs(dis, df = 4, knots = c(3.20745))1 -0.05810 0.02186 -2.658 0.00812
## bs(dis, df = 4, knots = c(3.20745))2 -0.46356 0.02366 -19.596 < 2e-16
## bs(dis, df = 4, knots = c(3.20745))3 -0.19979 0.04311 -4.634 4.58e-06
## bs(dis, df = 4, knots = c(3.20745))4 -0.38881 0.04551 -8.544 < 2e-16
##
## (Intercept) ***
## bs(dis, df = 4, knots = c(3.20745))1 **
## bs(dis, df = 4, knots = c(3.20745))2 ***
## bs(dis, df = 4, knots = c(3.20745))3 ***
## bs(dis, df = 4, knots = c(3.20745))4 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.06195 on 501 degrees of freedom
## Multiple R-squared: 0.7164, Adjusted R-squared: 0.7142
## F-statistic: 316.5 on 4 and 501 DF, p-value: < 2.2e-16
#plot the resulting fit with approximate 95% confidence bands
plot(dis, nox, col = "gray")
lines(dis.grid, pred$fit, lwd = 2)
lines(dis.grid, pred$fit + 2 * pred$se, lty = "dashed")
lines(dis.grid, pred$fit - 2 * pred$se, lty = "dashed")
#1E) Now fit a regression spline for a range of degrees of freedom, and plot the resulting fits and report the resulting RSS. Describe the results obtained.
#natural splines via ns(); MSE (= RSS/n) is reported, which ranks the fits the same way as RSS
degreeFreedom <- 10
degreeMSE <- matrix(nrow = degreeFreedom, ncol = 2)
colnames(degreeMSE) <- c("degree", "MSE")
for(i in 1:degreeFreedom){
  degreeMSE[i, 1] <- i
  this.fit <- lm(nox ~ ns(dis, df = i), data = Boston)
  degreeMSE[i, 2] <- mean((nox - predict(this.fit, Boston))^2)
}
degreeDF<-as.data.frame(degreeMSE)
#MSE for degrees of freedom
degreeDF
## degree MSE
## 1 1 0.005471468
## 2 2 0.003902330
## 3 3 0.003815219
## 4 4 0.003726888
## 5 5 0.003676347
## 6 6 0.003664342
## 7 7 0.003653364
## 8 8 0.003552863
## 9 9 0.003554311
## 10 10 0.003536053
ggplot(data=degreeDF, aes(x=degree, y=MSE))+
geom_point()+
geom_line()
#As the degrees of freedom increase, the training MSE of the regression spline decreases. However, about 5 degrees of freedom appears to be enough, since the MSE declines only gradually beyond that point.
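#1E also asks for plots of the fits themselves; a sketch overlaying the
#natural-spline fits for a few df values (the df choices and object names are
#illustrative):
ns.grid <- data.frame(dis = seq(min(dis), max(dis), length.out = 100))
ns.fits <- map_dfr(c(1, 3, 5, 10), function(d){
  f <- lm(nox ~ ns(dis, df = d), data = Boston)
  data.frame(dis = ns.grid$dis, nox = predict(f, ns.grid), df = d)
})
ggplot(Boston, aes(x = dis, y = nox)) +
  geom_point(color = "gray") +
  geom_line(data = ns.fits, aes(color = factor(df)))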
#1F) Perform cross-validation in order to select the best degrees of freedom for a regression spline on this data. Describe your results.
set.seed(1234)
train <- sample(506,253)
degreeFreedom <- 10
degreeMSE <- matrix(nrow = degreeFreedom, ncol = 2)
colnames(degreeMSE) <- c("degree", "MSE")
for(i in 1:degreeFreedom){
  degreeMSE[i, 1] <- i
  this.fit <- lm(nox ~ ns(dis, df = i), data = Boston, subset = train)
  degreeMSE[i, 2] <- mean((nox - predict(this.fit, Boston))[-train]^2)
}
degreeDF<-as.data.frame(degreeMSE)
head(degreeDF)
## degree MSE
## 1 1 0.005037671
## 2 2 0.003803709
## 3 3 0.003717473
## 4 4 0.003672904
## 5 5 0.003604826
## 6 6 0.003602487
ggplot(data=degreeDF, aes(x=degree, y=MSE))+
geom_point()+
geom_line()
#This cross-validation shows that test MSE declines as the degrees of freedom increase, but beyond 5 degrees of freedom it either declines only marginally or increases. This confirms that roughly 5 degrees of freedom is the best choice.
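#a 10-fold alternative to the single validation split, as a sketch using
#cv.glm() (mirrors the polynomial k-fold code above):
set.seed(1234)
cv.error.ns <- rep(0, 10)
for(i in 1:10){
  glm.fit <- glm(nox ~ ns(dis, df = i), data = Boston)
  cv.error.ns[i] <- cv.glm(Boston, glm.fit, K = 10)$delta[1]
}
which.min(cv.error.ns)  #df with the lowest estimated test error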
#2A) Split data into a training set and a testing set. Using out of state tuition as the response and the other variables as the predictors, perform forward stepwise selection on the training set in order to identify a satisfactory model that uses just a subset of the predictors.
library(ISLR)
data(College)
attach(College)
library(psych)
##
## Attaching package: 'psych'
## The following object is masked from 'package:boot':
##
## logit
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
#examine pairwise correlations to choose candidate predictors
College$Private <- as.numeric(College$Private)  #coerce the Private factor to numeric so cor.ci() can include it
cor.ci(College)
## Call:corCi(x = x, keys = keys, n.iter = n.iter, p = p, overlap = overlap,
## poly = poly, method = method, plot = plot, minlength = minlength)
##
## Coefficients and bootstrapped confidence intervals
## Privt Apps Accpt Enrll Tp10p Tp25p F.Und P.Und Otstt Rm.Br
## Private 1.00
## Apps -0.43 1.00
## Accept -0.48 0.94 1.00
## Enroll -0.57 0.85 0.91 1.00
## Top10perc 0.16 0.34 0.19 0.18 1.00
## Top25perc 0.10 0.35 0.25 0.23 0.89 1.00
## F.Undergrad -0.62 0.81 0.87 0.96 0.14 0.20 1.00
## P.Undergrad -0.45 0.40 0.44 0.51 -0.11 -0.05 0.57 1.00
## Outstate 0.55 0.05 -0.03 -0.16 0.56 0.49 -0.22 -0.25 1.00
## Room.Board 0.34 0.16 0.09 -0.04 0.37 0.33 -0.07 -0.06 0.65 1.00
## Books -0.02 0.13 0.11 0.11 0.12 0.12 0.12 0.08 0.04 0.13
## Personal -0.30 0.18 0.20 0.28 -0.09 -0.08 0.32 0.32 -0.30 -0.20
## PhD -0.16 0.39 0.36 0.33 0.53 0.55 0.32 0.15 0.38 0.33
## Terminal -0.13 0.37 0.34 0.31 0.49 0.52 0.30 0.14 0.41 0.37
## S.F.Ratio -0.47 0.10 0.18 0.24 -0.38 -0.29 0.28 0.23 -0.55 -0.36
## perc.alumni 0.41 -0.09 -0.16 -0.18 0.46 0.42 -0.23 -0.28 0.57 0.27
## Expend 0.26 0.26 0.12 0.06 0.66 0.53 0.02 -0.08 0.67 0.50
## Grad.Rate 0.34 0.15 0.07 -0.02 0.49 0.48 -0.08 -0.26 0.57 0.42
## Books Prsnl PhD Trmnl S.F.R prc.l Expnd Grd.R
## Books 1.00
## Personal 0.18 1.00
## PhD 0.03 -0.01 1.00
## Terminal 0.10 -0.03 0.85 1.00
## S.F.Ratio -0.03 0.14 -0.13 -0.16 1.00
## perc.alumni -0.04 -0.29 0.25 0.27 -0.40 1.00
## Expend 0.11 -0.10 0.43 0.44 -0.58 0.42 1.00
## Grad.Rate 0.00 -0.27 0.31 0.29 -0.31 0.49 0.39 1.00
##
## scale correlations and bootstrapped confidence intervals
## lower.emp lower.norm estimate upper.norm upper.emp p
## Privt-Apps -0.52 -0.51 -0.43 -0.36 -0.38 0.00
## Privt-Accpt -0.55 -0.54 -0.48 -0.41 -0.42 0.00
## Privt-Enrll -0.62 -0.63 -0.57 -0.51 -0.51 0.00
## Privt-Tp10p 0.09 0.09 0.16 0.23 0.23 0.00
## Privt-Tp25p 0.02 0.02 0.10 0.17 0.17 0.02
## Privt-F.Und -0.66 -0.67 -0.62 -0.56 -0.56 0.00
## Privt-P.Und -0.53 -0.52 -0.45 -0.38 -0.39 0.00
## Privt-Otstt 0.50 0.51 0.55 0.60 0.60 0.00
## Privt-Rm.Br 0.30 0.29 0.34 0.40 0.39 0.00
## Privt-Books -0.08 -0.08 -0.02 0.04 0.03 0.49
## Privt-Prsnl -0.38 -0.38 -0.30 -0.22 -0.24 0.00
## Privt-PhD -0.22 -0.22 -0.16 -0.09 -0.09 0.00
## Privt-Trmnl -0.19 -0.20 -0.13 -0.06 -0.07 0.00
## Privt-S.F.R -0.54 -0.54 -0.47 -0.39 -0.39 0.00
## Privt-prc.l 0.37 0.36 0.41 0.47 0.48 0.00
## Privt-Expnd 0.21 0.21 0.26 0.31 0.30 0.00
## Privt-Grd.R 0.26 0.27 0.34 0.40 0.40 0.00
## Apps-Accpt 0.91 0.92 0.94 0.96 0.96 0.00
## Apps-Enrll 0.80 0.79 0.85 0.90 0.90 0.00
## Apps-Tp10p 0.27 0.25 0.34 0.43 0.44 0.00
## Apps-Tp25p 0.29 0.29 0.35 0.42 0.42 0.00
## Apps-F.Und 0.77 0.75 0.81 0.87 0.87 0.00
## Apps-P.Und 0.35 0.33 0.40 0.46 0.47 0.00
## Apps-Otstt -0.01 -0.02 0.05 0.13 0.13 0.17
## Apps-Rm.Br 0.10 0.09 0.16 0.24 0.24 0.00
## Apps-Books 0.07 0.07 0.13 0.20 0.21 0.00
## Apps-Prsnl 0.11 0.11 0.18 0.24 0.24 0.00
## Apps-PhD 0.35 0.34 0.39 0.45 0.44 0.00
## Apps-Trmnl 0.33 0.33 0.37 0.42 0.42 0.00
## Apps-S.F.R 0.02 0.02 0.10 0.16 0.16 0.02
## Apps-prc.l -0.15 -0.15 -0.09 -0.03 -0.04 0.00
## Apps-Expnd 0.18 0.17 0.26 0.35 0.36 0.00
## Apps-Grd.R 0.11 0.10 0.15 0.20 0.20 0.00
## Accpt-Enrll 0.87 0.87 0.91 0.94 0.94 0.00
## Accpt-Tp10p 0.13 0.13 0.19 0.26 0.26 0.00
## Accpt-Tp25p 0.18 0.18 0.25 0.31 0.30 0.00
## Accpt-F.Und 0.84 0.83 0.87 0.91 0.91 0.00
## Accpt-P.Und 0.39 0.37 0.44 0.52 0.52 0.00
## Accpt-Otstt -0.07 -0.08 -0.03 0.04 0.04 0.49
## Accpt-Rm.Br 0.04 0.02 0.09 0.16 0.16 0.01
## Accpt-Books 0.05 0.05 0.11 0.18 0.18 0.00
## Accpt-Prsnl 0.13 0.13 0.20 0.26 0.25 0.00
## Accpt-PhD 0.32 0.32 0.36 0.40 0.39 0.00
## Accpt-Trmnl 0.31 0.30 0.34 0.38 0.37 0.00
## Accpt-S.F.R 0.11 0.11 0.18 0.23 0.23 0.00
## Accpt-prc.l -0.20 -0.21 -0.16 -0.11 -0.11 0.00
## Accpt-Expnd 0.07 0.07 0.12 0.18 0.18 0.00
## Accpt-Grd.R 0.03 0.02 0.07 0.12 0.12 0.01
## Enrll-Tp10p 0.12 0.11 0.18 0.26 0.26 0.00
## Enrll-Tp25p 0.15 0.16 0.23 0.29 0.30 0.00
## Enrll-F.Und 0.95 0.95 0.96 0.97 0.97 0.00
## Enrll-P.Und 0.46 0.45 0.51 0.58 0.57 0.00
## Enrll-Otstt -0.21 -0.21 -0.16 -0.09 -0.09 0.00
## Enrll-Rm.Br -0.09 -0.10 -0.04 0.02 0.03 0.18
## Enrll-Books 0.05 0.05 0.11 0.18 0.18 0.00
## Enrll-Prsnl 0.20 0.20 0.28 0.35 0.35 0.00
## Enrll-PhD 0.30 0.30 0.33 0.37 0.37 0.00
## Enrll-Trmnl 0.28 0.27 0.31 0.35 0.34 0.00
## Enrll-S.F.R 0.18 0.17 0.24 0.30 0.31 0.00
## Enrll-prc.l -0.24 -0.24 -0.18 -0.12 -0.13 0.00
## Enrll-Expnd 0.01 0.01 0.06 0.12 0.13 0.02
## Enrll-Grd.R -0.07 -0.08 -0.02 0.04 0.04 0.48
## Tp10p-Tp25p 0.88 0.88 0.89 0.91 0.90 0.00
## Tp10p-F.Und 0.08 0.07 0.14 0.21 0.22 0.00
## Tp10p-P.Und -0.16 -0.15 -0.11 -0.06 -0.06 0.00
## Tp10p-Otstt 0.49 0.50 0.56 0.62 0.60 0.00
## Tp10p-Rm.Br 0.30 0.31 0.37 0.43 0.42 0.00
## Tp10p-Books 0.03 0.03 0.12 0.21 0.21 0.01
## Tp10p-Prsnl -0.16 -0.16 -0.09 -0.02 -0.03 0.01
## Tp10p-PhD 0.50 0.50 0.53 0.57 0.57 0.00
## Tp10p-Trmnl 0.46 0.45 0.49 0.53 0.53 0.00
## Tp10p-S.F.R -0.44 -0.45 -0.38 -0.32 -0.32 0.00
## Tp10p-prc.l 0.37 0.38 0.46 0.52 0.51 0.00
## Tp10p-Expnd 0.59 0.59 0.66 0.72 0.72 0.00
## Tp10p-Grd.R 0.43 0.43 0.49 0.55 0.53 0.00
## Tp25p-F.Und 0.13 0.13 0.20 0.27 0.27 0.00
## Tp25p-P.Und -0.11 -0.11 -0.05 0.01 0.00 0.08
## Tp25p-Otstt 0.42 0.42 0.49 0.55 0.54 0.00
## Tp25p-Rm.Br 0.27 0.26 0.33 0.39 0.39 0.00
## Tp25p-Books 0.03 0.02 0.12 0.21 0.21 0.02
## Tp25p-Prsnl -0.14 -0.15 -0.08 -0.01 -0.01 0.03
## Tp25p-PhD 0.50 0.50 0.55 0.59 0.59 0.00
## Tp25p-Trmnl 0.48 0.47 0.52 0.57 0.57 0.00
## Tp25p-S.F.R -0.35 -0.36 -0.29 -0.23 -0.23 0.00
## Tp25p-prc.l 0.36 0.35 0.42 0.48 0.48 0.00
## Tp25p-Expnd 0.46 0.46 0.53 0.59 0.58 0.00
## Tp25p-Grd.R 0.42 0.42 0.48 0.53 0.52 0.00
## F.Und-P.Und 0.51 0.50 0.57 0.64 0.65 0.00
## F.Und-Otstt -0.26 -0.27 -0.22 -0.16 -0.15 0.00
## F.Und-Rm.Br -0.12 -0.13 -0.07 -0.01 0.00 0.02
## F.Und-Books 0.05 0.06 0.12 0.19 0.19 0.00
## F.Und-Prsnl 0.24 0.24 0.32 0.39 0.39 0.00
## F.Und-PhD 0.28 0.28 0.32 0.36 0.36 0.00
## F.Und-Trmnl 0.27 0.26 0.30 0.34 0.34 0.00
## F.Und-S.F.R 0.22 0.21 0.28 0.34 0.35 0.00
## F.Und-prc.l -0.29 -0.29 -0.23 -0.17 -0.17 0.00
## F.Und-Expnd -0.03 -0.03 0.02 0.07 0.07 0.43
## F.Und-Grd.R -0.13 -0.14 -0.08 -0.02 -0.03 0.01
## P.Und-Otstt -0.32 -0.32 -0.25 -0.19 -0.20 0.00
## P.Und-Rm.Br -0.14 -0.14 -0.06 0.02 0.02 0.12
## P.Und-Books 0.04 0.04 0.08 0.14 0.14 0.00
## P.Und-Prsnl 0.27 0.25 0.32 0.39 0.39 0.00
## P.Und-PhD 0.12 0.11 0.15 0.19 0.19 0.00
## P.Und-Trmnl 0.11 0.10 0.14 0.18 0.18 0.00
## P.Und-S.F.R 0.15 0.14 0.23 0.32 0.34 0.00
## P.Und-prc.l -0.37 -0.38 -0.28 -0.17 -0.19 0.00
## P.Und-Expnd -0.16 -0.16 -0.08 0.00 -0.01 0.04
## P.Und-Grd.R -0.32 -0.31 -0.26 -0.20 -0.21 0.00
## Otstt-Rm.Br 0.61 0.61 0.65 0.69 0.69 0.00
## Otstt-Books -0.05 -0.04 0.04 0.10 0.10 0.37
## Otstt-Prsnl -0.36 -0.36 -0.30 -0.23 -0.24 0.00
## Otstt-PhD 0.32 0.32 0.38 0.44 0.43 0.00
## Otstt-Trmnl 0.35 0.35 0.41 0.46 0.45 0.00
## Otstt-S.F.R -0.60 -0.60 -0.55 -0.50 -0.49 0.00
## Otstt-prc.l 0.50 0.50 0.57 0.61 0.61 0.00
## Otstt-Expnd 0.63 0.62 0.67 0.72 0.72 0.00
## Otstt-Grd.R 0.52 0.52 0.57 0.62 0.62 0.00
## Rm.Br-Books 0.06 0.06 0.13 0.19 0.19 0.00
## Rm.Br-Prsnl -0.27 -0.27 -0.20 -0.13 -0.14 0.00
## Rm.Br-PhD 0.27 0.26 0.33 0.39 0.40 0.00
## Rm.Br-Trmnl 0.32 0.31 0.37 0.43 0.43 0.00
## Rm.Br-S.F.R -0.41 -0.42 -0.36 -0.31 -0.31 0.00
## Rm.Br-prc.l 0.21 0.20 0.27 0.34 0.35 0.00
## Rm.Br-Expnd 0.44 0.44 0.50 0.56 0.55 0.00
## Rm.Br-Grd.R 0.36 0.36 0.42 0.49 0.48 0.00
## Books-Prsnl 0.08 0.09 0.18 0.28 0.27 0.00
## Books-PhD -0.10 -0.10 0.03 0.16 0.15 0.64
## Books-Trmnl 0.03 0.02 0.10 0.19 0.18 0.02
## Books-S.F.R -0.09 -0.10 -0.03 0.05 0.05 0.46
## Books-prc.l -0.11 -0.12 -0.04 0.04 0.03 0.30
## Books-Expnd 0.04 0.04 0.11 0.18 0.18 0.00
## Books-Grd.R -0.07 -0.07 0.00 0.07 0.07 0.92
## Prsnl-PhD -0.08 -0.10 -0.01 0.07 0.07 0.73
## Prsnl-Trmnl -0.12 -0.12 -0.03 0.06 0.05 0.45
## Prsnl-S.F.R 0.05 0.04 0.14 0.23 0.22 0.01
## Prsnl-prc.l -0.34 -0.35 -0.29 -0.22 -0.23 0.00
## Prsnl-Expnd -0.18 -0.18 -0.10 -0.02 -0.03 0.01
## Prsnl-Grd.R -0.34 -0.34 -0.27 -0.20 -0.21 0.00
## PhD-Trmnl 0.80 0.81 0.85 0.88 0.88 0.00
## PhD-S.F.R -0.23 -0.22 -0.13 -0.05 -0.06 0.00
## PhD-prc.l 0.18 0.18 0.25 0.31 0.31 0.00
## PhD-Expnd 0.37 0.37 0.43 0.49 0.48 0.00
## PhD-Grd.R 0.24 0.24 0.31 0.37 0.37 0.00
## Trmnl-S.F.R -0.25 -0.25 -0.16 -0.08 -0.08 0.00
## Trmnl-prc.l 0.20 0.20 0.27 0.33 0.32 0.00
## Trmnl-Expnd 0.39 0.39 0.44 0.49 0.48 0.00
## Trmnl-Grd.R 0.23 0.21 0.29 0.37 0.36 0.00
## S.F.R-prc.l -0.46 -0.46 -0.40 -0.34 -0.34 0.00
## S.F.R-Expnd -0.63 -0.64 -0.58 -0.53 -0.54 0.00
## S.F.R-Grd.R -0.36 -0.37 -0.31 -0.24 -0.24 0.00
## prc.l-Expnd 0.36 0.36 0.42 0.47 0.47 0.00
## prc.l-Grd.R 0.43 0.43 0.49 0.54 0.54 0.00
## Expnd-Grd.R 0.32 0.32 0.39 0.45 0.45 0.00
#Based on their correlations with Outstate, the most promising predictors are Expend, Room.Board, Grad.Rate, perc.alumni, Top10perc, and Private.
#50-50 split of the 777 colleges into training indices
set.seed(1234)
train <- sample(777,388)
#1 Predictor
mod1.a <- lm(Outstate~Expend,subset = train)
anova(mod1.a)
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## Expend 1 2824855680 2824855680 364.25 < 2.2e-16 ***
## Residuals 386 2993513992 7755218
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#SS Res = 2993513992
mod1.b <- lm(Outstate~Room.Board, subset = train)
anova(mod1.b)
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## Room.Board 1 2140223022 2140223022 224.6 < 2.2e-16 ***
## Residuals 386 3678146650 9528877
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#SS Res = 3678146650
mod1.c <- lm(Outstate~Grad.Rate, subset = train)
anova(mod1.c)
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## Grad.Rate 1 2028807055 2028807055 206.65 < 2.2e-16 ***
## Residuals 386 3789562618 9817520
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#SS Res = 3789562618
mod1.d <- lm(Outstate~perc.alumni, subset = train)
anova(mod1.d)
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## perc.alumni 1 2035082678 2035082678 207.63 < 2.2e-16 ***
## Residuals 386 3783286995 9801262
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#SS Res = 3783286995
mod1.e <- lm(Outstate~Top10perc, subset = train)
anova(mod1.e)
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## Top10perc 1 1872638109 1872638109 183.19 < 2.2e-16 ***
## Residuals 386 3945731564 10222102
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#SS Res = 3945731564
#Expend yields the lowest residual sum of squares, so it is the best single predictor
#2 Predictors
mod2.a <- lm(Outstate~Expend+Private, subset = train)
anova(mod2.a)
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## Expend 1 2824855680 2824855680 489.52 < 2.2e-16 ***
## Private 1 771813369 771813369 133.75 < 2.2e-16 ***
## Residuals 385 2221700623 5770651
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#SS Res = 2221700623
mod2.b <- lm(Outstate~Expend+Room.Board, subset = train)
anova(mod2.b)
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## Expend 1 2824855680 2824855680 440.704 < 2.2e-16 ***
## Room.Board 1 525713949 525713949 82.016 < 2.2e-16 ***
## Residuals 385 2467800043 6409870
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#SS Res = 2467800043
mod2.c <- lm(Outstate~Expend+Grad.Rate, subset = train)
anova(mod2.c)
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## Expend 1 2824855680 2824855680 448.317 < 2.2e-16 ***
## Grad.Rate 1 567617990 567617990 90.083 < 2.2e-16 ***
## Residuals 385 2425896002 6301029
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#SS Res = 2425896002
#Expend+Private is best
#3 Predictors
mod3.a <- lm(Outstate~Expend+Private+Room.Board, subset = train)
anova(mod3.a)
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## Expend 1 2824855680 2824855680 557.205 < 2.2e-16 ***
## Private 1 771813369 771813369 152.241 < 2.2e-16 ***
## Room.Board 1 274940487 274940487 54.232 1.096e-12 ***
## Residuals 384 1946760136 5069688
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#SS Res = 1946760136
mod3.b <- lm(Outstate~Expend+Private+Grad.Rate, subset = train)
anova(mod3.b)
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## Expend 1 2824855680 2824855680 571.245 < 2.2e-16 ***
## Private 1 771813369 771813369 156.077 < 2.2e-16 ***
## Grad.Rate 1 322788631 322788631 65.275 8.537e-15 ***
## Residuals 384 1898911992 4945083
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#SS Res = 1898911992
mod3.c <- lm(Outstate~Expend+Private+perc.alumni, subset = train)
anova(mod3.c)
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## Expend 1 2824855680 2824855680 545.612 < 2.2e-16 ***
## Private 1 771813369 771813369 149.073 < 2.2e-16 ***
## perc.alumni 1 233575563 233575563 45.114 6.716e-11 ***
## Residuals 384 1988125060 5177409
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#SS Res = 1988125060
#Expend+Private+Grad.Rate is best
#4 Predictors
mod4.a <- lm(Outstate~Expend+Private+Grad.Rate+Room.Board, subset = train)
anova(mod4.a)
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## Expend 1 2824855680 2824855680 628.955 < 2.2e-16 ***
## Private 1 771813369 771813369 171.844 < 2.2e-16 ***
## Grad.Rate 1 322788631 322788631 71.869 5.038e-16 ***
## Room.Board 1 178725273 178725273 39.793 7.799e-10 ***
## Residuals 383 1720186719 4491349
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#SS Res = 1720186719
mod4.b <- lm(Outstate~Expend+Private+Grad.Rate+Top10perc, subset = train)
anova(mod4.b)
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## Expend 1 2824855680 2824855680 574.3909 < 2.2e-16 ***
## Private 1 771813369 771813369 156.9364 < 2.2e-16 ***
## Grad.Rate 1 322788631 322788631 65.6341 7.35e-15 ***
## Top10perc 1 15317041 15317041 3.1145 0.0784 .
## Residuals 383 1883594951 4918002
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#SS Res = 1883594951
mod4.c <- lm(Outstate~Expend+Private+Grad.Rate+perc.alumni, subset = train)
anova(mod4.c)
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## Expend 1 2824855680 2824855680 595.878 < 2.2e-16 ***
## Private 1 771813369 771813369 162.807 < 2.2e-16 ***
## Grad.Rate 1 322788631 322788631 68.089 2.546e-15 ***
## perc.alumni 1 83239964 83239964 17.559 3.462e-05 ***
## Residuals 383 1815672028 4740658
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#SS Res = 1815672028
#Expend+Private+Grad.Rate+Room.Board is best
#5 predictors
mod5.a <- lm(Outstate~Expend+Private+Grad.Rate+Room.Board+perc.alumni, subset = train)
anova(mod5.a)
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## Expend 1 2824855680 2824855680 677.068 < 2.2e-16 ***
## Private 1 771813369 771813369 184.990 < 2.2e-16 ***
## Grad.Rate 1 322788631 322788631 77.367 < 2.2e-16 ***
## Room.Board 1 178725273 178725273 42.837 1.918e-10 ***
## perc.alumni 1 126411732 126411732 30.299 6.806e-08 ***
## Residuals 382 1593774987 4172186
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#SS Res = 1593774987
mod5.b <- lm(Outstate~Expend+Private+Grad.Rate+Room.Board+Top10perc, subset = train)
anova(mod5.b)
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## Expend 1 2824855680 2824855680 641.2379 < 2.2e-16 ***
## Private 1 771813369 771813369 175.2004 < 2.2e-16 ***
## Grad.Rate 1 322788631 322788631 73.2725 2.792e-16 ***
## Room.Board 1 178725273 178725273 40.5704 5.455e-10 ***
## Top10perc 1 37355830 37355830 8.4797 0.003802 **
## Residuals 382 1682830889 4405316
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#SS Res = 1682830889
#Expend+Private+Grad.Rate+Room.Board+perc.alumni is best
#6 predictors
mod6.a <- lm(Outstate~Expend+Private+Grad.Rate+Room.Board+perc.alumni+Top10perc, subset = train)
anova(mod6.a)
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## Expend 1 2824855680 2824855680 681.3007 < 2.2e-16 ***
## Private 1 771813369 771813369 186.1465 < 2.2e-16 ***
## Grad.Rate 1 322788631 322788631 77.8504 < 2.2e-16 ***
## Room.Board 1 178725273 178725273 43.1051 1.701e-10 ***
## perc.alumni 1 126411732 126411732 30.4881 6.227e-08 ***
## Top10perc 1 14046731 14046731 3.3878 0.06646 .
## Residuals 381 1579728256 4146268
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#SS Res = 1579728256
#Adding Top10perc is not significant, so it is dropped. The final model uses five predictors: Expend, Private, Grad.Rate, Room.Board, and perc.alumni.
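#As a cross-check, the manual forward search can be automated; a sketch using
#leaps::regsubsets (assumes the leaps package is installed):
library(leaps)
fwd.fit <- regsubsets(Outstate ~ ., data = College[train, ], nvmax = 10, method = "forward")
summary(fwd.fit)$which[5, ]  #variables entering the 5-predictor forward model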
#2B) Fit a GAM on the training data, using out-of-state tuition as the response and the features selected in the previous step as the predictors. Plot the results, and explain your findings.
#install.packages("gam")
library(gam)
## Loading required package: foreach
##
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
##
## accumulate, when
## Loaded gam 1.16.1
#GAM fit with lm() and natural-spline (ns) basis terms for each quantitative predictor
gam1 <- lm(Outstate ~ ns(Expend, 4) + Private + ns(Grad.Rate, 3) + ns(Room.Board, 4) + ns(Top10perc, 2), subset = train)
summary(gam1)
##
## Call:
## lm(formula = Outstate ~ ns(Expend, 4) + Private + ns(Grad.Rate,
## 3) + ns(Room.Board, 4) + ns(Top10perc, 2), subset = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6339.5 -1080.6 -73.4 1200.2 4837.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3116.8 1507.5 2.068 0.039375 *
## ns(Expend, 4)1 1461.9 827.7 1.766 0.078174 .
## ns(Expend, 4)2 13989.9 1309.3 10.685 < 2e-16 ***
## ns(Expend, 4)3 5399.8 2323.9 2.324 0.020685 *
## ns(Expend, 4)4 -3450.6 3151.0 -1.095 0.274181
## PrivateYes 2345.6 245.4 9.559 < 2e-16 ***
## ns(Grad.Rate, 3)1 2358.2 625.2 3.772 0.000188 ***
## ns(Grad.Rate, 3)2 852.3 2293.3 0.372 0.710359
## ns(Grad.Rate, 3)3 2196.7 1212.1 1.812 0.070731 .
## ns(Room.Board, 4)1 2934.0 1046.8 2.803 0.005331 **
## ns(Room.Board, 4)2 3004.8 894.5 3.359 0.000862 ***
## ns(Room.Board, 4)3 4184.0 2442.1 1.713 0.087483 .
## ns(Room.Board, 4)4 1734.6 1405.3 1.234 0.217850
## ns(Top10perc, 2)1 1102.3 887.1 1.243 0.214813
## ns(Top10perc, 2)2 283.5 1072.0 0.264 0.791568
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1893 on 373 degrees of freedom
## Multiple R-squared: 0.7703, Adjusted R-squared: 0.7617
## F-statistic: 89.37 on 14 and 373 DF, p-value: < 2.2e-16
#Interpreting the individual basis-term coefficients loosely, Expend, Grad.Rate, and Room.Board each have several significant spline terms, while neither Top10perc term is individually significant, suggesting at most a simple (roughly linear) term is needed for Top10perc.
#plot
plot.Gam(gam1, se=TRUE ,col ="red")
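#For comparison, a sketch of the same model fit with gam() and smoothing
#splines, mirroring the ns() df choices above (an illustration, not the model
#reported above):
gam.s <- gam(Outstate ~ s(Expend, 4) + Private + s(Grad.Rate, 3) +
               s(Room.Board, 4) + s(Top10perc, 2), data = College, subset = train)
par(mfrow = c(2, 3))
plot(gam.s, se = TRUE, col = "blue")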
#2C) Evaluate the model obtained on the test set, and explain the results obtained.
test <- College[-train, ]  #held-out colleges
gam2 <- lm(Outstate ~ ns(Expend, 4) + Private + ns(Grad.Rate, 3) + ns(Room.Board, 4) + ns(Top10perc, 2), data = test)  #refit on the test set
summary(gam2)
##
## Call:
## lm(formula = Outstate ~ ns(Expend, 4) + Private + ns(Grad.Rate,
## 3) + ns(Room.Board, 4) + ns(Top10perc, 2), data = test)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7268.0 -1065.8 27.8 1274.3 7708.2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 245.1 1202.6 0.204 0.838639
## ns(Expend, 4)1 1750.7 606.9 2.884 0.004148 **
## ns(Expend, 4)2 11297.6 1080.3 10.457 < 2e-16 ***
## ns(Expend, 4)3 8629.3 1546.7 5.579 4.65e-08 ***
## ns(Expend, 4)4 5745.6 1504.2 3.820 0.000156 ***
## Private 2553.7 261.1 9.779 < 2e-16 ***
## ns(Grad.Rate, 3)1 2943.1 589.1 4.996 9.01e-07 ***
## ns(Grad.Rate, 3)2 2847.7 1920.7 1.483 0.139012
## ns(Grad.Rate, 3)3 1198.3 1125.3 1.065 0.287606
## ns(Room.Board, 4)1 1413.4 814.6 1.735 0.083542 .
## ns(Room.Board, 4)2 3011.6 748.0 4.026 6.87e-05 ***
## ns(Room.Board, 4)3 3633.9 1900.7 1.912 0.056653 .
## ns(Room.Board, 4)4 4231.0 1111.6 3.806 0.000165 ***
## ns(Top10perc, 2)1 1145.8 933.8 1.227 0.220556
## ns(Top10perc, 2)2 -536.7 841.2 -0.638 0.523880
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1939 on 374 degrees of freedom
## Multiple R-squared: 0.7912, Adjusted R-squared: 0.7834
## F-statistic: 101.3 on 14 and 374 DF, p-value: < 2.2e-16
#Refit on the test data, all four Expend spline terms are significant, Grad.Rate is significant only for its first term, Room.Board is significant for its 2nd and 4th terms, and neither Top10perc term is significant.
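#Refitting on the test set estimates new coefficients rather than evaluating
#the training-set model. A more direct evaluation keeps gam1's coefficients and
#predicts on the held-out rows; a sketch (Private is rebuilt as a factor here
#because it was coerced to numeric for cor.ci() above):
test.df <- College[-train, ]
test.df$Private <- factor(ifelse(test.df$Private == 2, "Yes", "No"))
gam.pred <- predict(gam1, newdata = test.df)
mean((test.df$Outstate - gam.pred)^2)  #held-out test MSE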
#2D) For which variables, if any, is there evidence of a non-linear relationship with the response?
#Expend, Grad.Rate, and Room.Board show clear evidence of a non-linear relationship with Outstate. The Top10perc fit is only slightly curved, so any non-linearity there is weak at best.