library(MASS)       # Boston data set
library(tidyverse)  # ggplot2 and friends
library(boot)       # cv.glm() for cross-validation
library(splines)    # bs() and ns() spline bases
data("Boston")
attach(Boston)
# visualize the relationship between dis and nox
ggplot(Boston, aes(x = dis, y = nox)) +
  geom_point()
# cubic polynomial model
mod1 <- lm(nox ~ poly(dis, 3), data = Boston)
summary(mod1)
##
## Call:
## lm(formula = nox ~ poly(dis, 3), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.121130 -0.040619 -0.009738 0.023385 0.194904
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.554695 0.002759 201.021 < 2e-16 ***
## poly(dis, 3)1 -2.003096 0.062071 -32.271 < 2e-16 ***
## poly(dis, 3)2 0.856330 0.062071 13.796 < 2e-16 ***
## poly(dis, 3)3 -0.318049 0.062071 -5.124 4.27e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.06207 on 502 degrees of freedom
## Multiple R-squared: 0.7148, Adjusted R-squared: 0.7131
## F-statistic: 419.3 on 3 and 502 DF, p-value: < 2.2e-16
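As an aside, poly() builds orthogonal polynomials by default, so the individual coefficients are not raw powers of dis. A quick sketch (not run above) confirming that a raw-polynomial fit gives identical predictions:
# raw = TRUE uses dis, dis^2, dis^3 directly; same fit, different basis
mod1.raw <- lm(nox ~ poly(dis, 3, raw = TRUE), data = Boston)
all.equal(fitted(mod1), fitted(mod1.raw))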
# training MSE for polynomial degrees 1-3
degreePoly <- 3
polyMSE <- matrix(nrow = degreePoly, ncol = 2)
colnames(polyMSE) <- c("degree", "MSE")
for (i in 1:degreePoly) {
  polyMSE[i, 1] <- i
  this.fit <- lm(nox ~ poly(dis, i), data = Boston)
  polyMSE[i, 2] <- mean((Boston$nox - predict(this.fit, Boston))^2)
}
polyDF <- as.data.frame(polyMSE)
polyDF
## degree MSE
## 1 1 0.005471468
## 2 2 0.004022257
## 3 3 0.003822345
ggplot(data = polyDF, aes(x = degree, y = MSE)) +
  geom_point() +
  geom_line()
# extend to polynomial degrees 1-10 and plot training MSE
degreePoly <- 10
polyMSE <- matrix(nrow = degreePoly, ncol = 2)
colnames(polyMSE) <- c("degree", "MSE")
for (i in 1:degreePoly) {
  polyMSE[i, 1] <- i
  this.fit <- lm(nox ~ poly(dis, i), data = Boston)
  polyMSE[i, 2] <- mean((Boston$nox - predict(this.fit, Boston))^2)
}
polyDF <- as.data.frame(polyMSE)
ggplot(data = polyDF, aes(x = degree, y = MSE)) +
  geom_point() +
  geom_line()
# training MSE by degree
polyDF
## degree MSE
## 1 1 0.005471468
## 2 2 0.004022257
## 3 3 0.003822345
## 4 4 0.003820121
## 5 5 0.003785158
## 6 6 0.003711971
## 7 7 0.003655106
## 8 8 0.003627727
## 9 9 0.003623183
## 10 10 0.003620892
# single validation-set split (253 train / 253 test)
set.seed(70)
train <- sample(506, 253)
degreePoly <- 10
polyMSE <- matrix(nrow = degreePoly, ncol = 2)
colnames(polyMSE) <- c("degree", "MSE")
for (i in 1:degreePoly) {
  polyMSE[i, 1] <- i
  this.fit <- lm(nox ~ poly(dis, i), data = Boston, subset = train)
  polyMSE[i, 2] <- mean((Boston$nox - predict(this.fit, Boston))[-train]^2)
}
polyDF <- as.data.frame(polyMSE)
polyDF
## degree MSE
## 1 1 0.005607812
## 2 2 0.004248364
## 3 3 0.004067711
## 4 4 0.004072247
## 5 5 0.004024874
## 6 6 0.003928586
## 7 7 0.003868523
## 8 8 0.003872872
## 9 9 0.003890109
## 10 10 0.003906588
ggplot(data = polyDF, aes(x = degree, y = MSE)) +
  geom_point() +
  geom_line()
# 10-fold cross-validation over polynomial degree
set.seed(87)
cv.error.k10 <- rep(0, 10)
for (i in 1:10) {
  glm.fit <- glm(nox ~ poly(dis, i), data = Boston)
  cv.error.k10[i] <- cv.glm(Boston, glm.fit, K = 10)$delta[1]
}
cvDF <- data.frame(degree = 1:10, cv.error.k10)
ggplot(data = cvDF, aes(x = degree, y = cv.error.k10)) +
  geom_point() +
  geom_line()
# compare the nested polynomial fits with ANOVA
fit.1 <- lm(nox ~ dis, data = Boston)
fit.2 <- lm(nox ~ poly(dis, 2), data = Boston)
fit.3 <- lm(nox ~ poly(dis, 3), data = Boston)
fit.4 <- lm(nox ~ poly(dis, 4), data = Boston)
fit.5 <- lm(nox ~ poly(dis, 5), data = Boston)
fit.6 <- lm(nox ~ poly(dis, 6), data = Boston)
fit.7 <- lm(nox ~ poly(dis, 7), data = Boston)
fit.8 <- lm(nox ~ poly(dis, 8), data = Boston)
fit.9 <- lm(nox ~ poly(dis, 9), data = Boston)
fit.10 <- lm(nox ~ poly(dis, 10), data = Boston)
anova(fit.1, fit.2, fit.3, fit.4, fit.5, fit.6, fit.7, fit.8, fit.9, fit.10)
## Analysis of Variance Table
##
## Model 1: nox ~ dis
## Model 2: nox ~ poly(dis, 2)
## Model 3: nox ~ poly(dis, 3)
## Model 4: nox ~ poly(dis, 4)
## Model 5: nox ~ poly(dis, 5)
## Model 6: nox ~ poly(dis, 6)
## Model 7: nox ~ poly(dis, 7)
## Model 8: nox ~ poly(dis, 8)
## Model 9: nox ~ poly(dis, 9)
## Model 10: nox ~ poly(dis, 10)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 504 2.7686
## 2 503 2.0353 1 0.73330 198.1169 < 2.2e-16 ***
## 3 502 1.9341 1 0.10116 27.3292 2.535e-07 ***
## 4 501 1.9330 1 0.00113 0.3040 0.581606
## 5 500 1.9153 1 0.01769 4.7797 0.029265 *
## 6 499 1.8783 1 0.03703 10.0052 0.001657 **
## 7 498 1.8495 1 0.02877 7.7738 0.005505 **
## 8 497 1.8356 1 0.01385 3.7429 0.053601 .
## 9 496 1.8333 1 0.00230 0.6211 0.431019
## 10 495 1.8322 1 0.00116 0.3133 0.575908
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
These results indicate that a third-degree polynomial is likely the best model. The single validation split shows that test error stops improving after the third degree; although it does begin to improve again at higher degrees, the gains are modest and look unreliable. The k-fold cross-validation also points to a third-degree polynomial, with the error plateauing or increasing beyond it. The ANOVA supports the same conclusion: the fourth-degree term is not significant, and although the fifth-, sixth-, and seventh-degree terms are, they explain little additional variance and are not worth including when predicting new observations (as the cross-validation shows).
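As a visual check (a minimal sketch, not run above), the chosen cubic fit can be overlaid on the scatterplot directly with geom_smooth():
# overlay the degree-3 fit and its confidence band on the raw data
ggplot(Boston, aes(x = dis, y = nox)) +
  geom_point(alpha = 0.4) +
  geom_smooth(method = "lm", formula = y ~ poly(x, 3))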
# Determine knots
attr(bs(dis, df = 4),"knots")
## 50%
## 3.20745
dislims <- range(dis)
dis.grid <- seq(from = dislims[1], to = dislims[2])
par(mfrow = c(1, 1))
fit <- lm(nox ~ bs(dis, df = 4, knots = c(3.20745)), data = Boston)
pred <- predict(fit, newdata = data.frame(dis = dis.grid), se = TRUE)
summary(fit)
##
## Call:
## lm(formula = nox ~ bs(dis, df = 4, knots = c(3.20745)), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.124622 -0.039259 -0.008514 0.020850 0.193891
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.73447 0.01460 50.306 < 2e-16
## bs(dis, df = 4, knots = c(3.20745))1 -0.05810 0.02186 -2.658 0.00812
## bs(dis, df = 4, knots = c(3.20745))2 -0.46356 0.02366 -19.596 < 2e-16
## bs(dis, df = 4, knots = c(3.20745))3 -0.19979 0.04311 -4.634 4.58e-06
## bs(dis, df = 4, knots = c(3.20745))4 -0.38881 0.04551 -8.544 < 2e-16
##
## (Intercept) ***
## bs(dis, df = 4, knots = c(3.20745))1 **
## bs(dis, df = 4, knots = c(3.20745))2 ***
## bs(dis, df = 4, knots = c(3.20745))3 ***
## bs(dis, df = 4, knots = c(3.20745))4 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.06195 on 501 degrees of freedom
## Multiple R-squared: 0.7164, Adjusted R-squared: 0.7142
## F-statistic: 316.5 on 4 and 501 DF, p-value: < 2.2e-16
# plot the spline fit with 2-SE bands
plot(dis, nox, col = "gray")
lines(dis.grid, pred$fit, lwd = 2)
lines(dis.grid, pred$fit + 2 * pred$se, lty = "dashed")
lines(dis.grid, pred$fit - 2 * pred$se, lty = "dashed")
The knot was chosen with attr(bs(dis, df = 4), "knots"): when df is supplied without explicit knots, bs() places knots at uniform quantiles of the predictor rather than at "optimal" locations. With df = 4, a cubic spline gets one internal knot, which R placed at the median of dis, 3.20745.
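If quantile placement is not wanted, knots can also be supplied by hand. A sketch (not run above) placing knots at the quartiles of dis instead:
# explicit knots at the 25th, 50th, and 75th percentiles of dis
my.knots <- quantile(dis, probs = c(0.25, 0.50, 0.75))
fit.q <- lm(nox ~ bs(dis, knots = my.knots), data = Boston)
summary(fit.q)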
# training MSE for natural splines with 1-10 degrees of freedom
degreeFreed <- 10
degreeMSE <- matrix(nrow = degreeFreed, ncol = 2)
colnames(degreeMSE) <- c("degree", "MSE")
for (i in 1:degreeFreed) {
  degreeMSE[i, 1] <- i
  this.fit <- lm(nox ~ ns(dis, df = i), data = Boston)
  degreeMSE[i, 2] <- mean((Boston$nox - predict(this.fit, Boston))^2)
}
degreeDF <- as.data.frame(degreeMSE)
degreeDF
## degree MSE
## 1 1 0.005471468
## 2 2 0.003902330
## 3 3 0.003815219
## 4 4 0.003726888
## 5 5 0.003676347
## 6 6 0.003664342
## 7 7 0.003653364
## 8 8 0.003552863
## 9 9 0.003554311
## 10 10 0.003536053
ggplot(data = degreeDF, aes(x = degree, y = MSE)) +
  geom_point() +
  geom_line()
The results indicate that more degrees of freedom lead to lower training error, although the error plateaus at around five degrees of freedom.
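For reference, the five-df natural-spline fit can be drawn over the data the same way as before (a sketch, not run above):
# natural-spline fit with df = 5 overlaid on the raw data
ggplot(Boston, aes(x = dis, y = nox)) +
  geom_point(alpha = 0.4) +
  geom_smooth(method = "lm", formula = y ~ splines::ns(x, df = 5))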
# validation-set error for natural splines with 1-10 degrees of freedom
set.seed(98)
train <- sample(506, 253)
degreeFreed <- 10
degreeMSE <- matrix(nrow = degreeFreed, ncol = 2)
colnames(degreeMSE) <- c("degree", "MSE")
for (i in 1:degreeFreed) {
  degreeMSE[i, 1] <- i
  this.fit <- lm(nox ~ ns(dis, df = i), data = Boston, subset = train)
  degreeMSE[i, 2] <- mean((Boston$nox - predict(this.fit, Boston))[-train]^2)
}
degreeDF <- as.data.frame(degreeMSE)
degreeDF
## degree MSE
## 1 1 0.004908388
## 2 2 0.003652215
## 3 3 0.003700686
## 4 4 0.003622678
## 5 5 0.003575279
## 6 6 0.003573679
## 7 7 0.003647784
## 8 8 0.003568142
## 9 9 0.003584419
## 10 10 0.003556405
ggplot(data = degreeDF, aes(x = degree, y = MSE)) +
  geom_point() +
  geom_line()
detach(Boston)
The validation results indicate that two degrees of freedom are better than one, while three are slightly worse than two. Five degrees of freedom also looks like a reasonable choice, as the test error levels off after that.
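Since a single split is noisy, the same choice can be double-checked with 10-fold cross-validation, mirroring the polynomial case above (a sketch, not run here):
# 10-fold CV error for natural splines with 1-10 degrees of freedom
set.seed(87)
cv.error.ns <- rep(0, 10)
for (i in 1:10) {
  glm.fit <- glm(nox ~ ns(dis, df = i), data = Boston)
  cv.error.ns[i] <- cv.glm(Boston, glm.fit, K = 10)$delta[1]
}
which.min(cv.error.ns)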
library(ISLR)
data("College")
attach(College)
library(psych)
##
## Attaching package: 'psych'
## The following object is masked from 'package:boot':
##
## logit
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
# identify the predictors most strongly correlated with Outstate;
# Private is converted to numeric so it can enter the correlation matrix
College$Private <- as.numeric(College$Private)
cor.ci(College)
## Call:corCi(x = x, keys = keys, n.iter = n.iter, p = p, overlap = overlap,
## poly = poly, method = method, plot = plot, minlength = minlength)
##
## Coefficients and bootstrapped confidence intervals
## Privt Apps Accpt Enrll Tp10p Tp25p F.Und P.Und Otstt Rm.Br
## Private 1.00
## Apps -0.43 1.00
## Accept -0.48 0.94 1.00
## Enroll -0.57 0.85 0.91 1.00
## Top10perc 0.16 0.34 0.19 0.18 1.00
## Top25perc 0.10 0.35 0.25 0.23 0.89 1.00
## F.Undergrad -0.62 0.81 0.87 0.96 0.14 0.20 1.00
## P.Undergrad -0.45 0.40 0.44 0.51 -0.11 -0.05 0.57 1.00
## Outstate 0.55 0.05 -0.03 -0.16 0.56 0.49 -0.22 -0.25 1.00
## Room.Board 0.34 0.16 0.09 -0.04 0.37 0.33 -0.07 -0.06 0.65 1.00
## Books -0.02 0.13 0.11 0.11 0.12 0.12 0.12 0.08 0.04 0.13
## Personal -0.30 0.18 0.20 0.28 -0.09 -0.08 0.32 0.32 -0.30 -0.20
## PhD -0.16 0.39 0.36 0.33 0.53 0.55 0.32 0.15 0.38 0.33
## Terminal -0.13 0.37 0.34 0.31 0.49 0.52 0.30 0.14 0.41 0.37
## S.F.Ratio -0.47 0.10 0.18 0.24 -0.38 -0.29 0.28 0.23 -0.55 -0.36
## perc.alumni 0.41 -0.09 -0.16 -0.18 0.46 0.42 -0.23 -0.28 0.57 0.27
## Expend 0.26 0.26 0.12 0.06 0.66 0.53 0.02 -0.08 0.67 0.50
## Grad.Rate 0.34 0.15 0.07 -0.02 0.49 0.48 -0.08 -0.26 0.57 0.42
## Books Prsnl PhD Trmnl S.F.R prc.l Expnd Grd.R
## Books 1.00
## Personal 0.18 1.00
## PhD 0.03 -0.01 1.00
## Terminal 0.10 -0.03 0.85 1.00
## S.F.Ratio -0.03 0.14 -0.13 -0.16 1.00
## perc.alumni -0.04 -0.29 0.25 0.27 -0.40 1.00
## Expend 0.11 -0.10 0.43 0.44 -0.58 0.42 1.00
## Grad.Rate 0.00 -0.27 0.31 0.29 -0.31 0.49 0.39 1.00
##
## scale correlations and bootstrapped confidence intervals
## lower.emp lower.norm estimate upper.norm upper.emp p
## Privt-Apps -0.50 -0.49 -0.43 -0.37 -0.39 0.00
## Privt-Accpt -0.53 -0.53 -0.48 -0.43 -0.44 0.00
## Privt-Enrll -0.61 -0.61 -0.57 -0.52 -0.52 0.00
## Privt-Tp10p 0.10 0.10 0.16 0.23 0.23 0.00
## Privt-Tp25p 0.04 0.03 0.10 0.17 0.16 0.01
## Privt-F.Und -0.65 -0.66 -0.62 -0.57 -0.57 0.00
## Privt-P.Und -0.52 -0.53 -0.45 -0.39 -0.39 0.00
## Privt-Otstt 0.52 0.51 0.55 0.59 0.59 0.00
## Privt-Rm.Br 0.29 0.28 0.34 0.40 0.40 0.00
## Privt-Books -0.09 -0.09 -0.02 0.05 0.05 0.64
## Privt-Prsnl -0.40 -0.38 -0.30 -0.23 -0.23 0.00
## Privt-PhD -0.21 -0.21 -0.16 -0.10 -0.10 0.00
## Privt-Trmnl -0.18 -0.19 -0.13 -0.07 -0.07 0.00
## Privt-S.F.R -0.53 -0.53 -0.47 -0.41 -0.42 0.00
## Privt-prc.l 0.37 0.37 0.41 0.46 0.46 0.00
## Privt-Expnd 0.22 0.22 0.26 0.30 0.30 0.00
## Privt-Grd.R 0.28 0.28 0.34 0.39 0.38 0.00
## Apps-Accpt 0.92 0.92 0.94 0.96 0.96 0.00
## Apps-Enrll 0.81 0.80 0.85 0.89 0.89 0.00
## Apps-Tp10p 0.26 0.26 0.34 0.42 0.42 0.00
## Apps-Tp25p 0.28 0.29 0.35 0.42 0.41 0.00
## Apps-F.Und 0.77 0.76 0.81 0.87 0.86 0.00
## Apps-P.Und 0.35 0.34 0.40 0.46 0.46 0.00
## Apps-Otstt -0.01 -0.02 0.05 0.12 0.13 0.18
## Apps-Rm.Br 0.09 0.09 0.16 0.24 0.23 0.00
## Apps-Books 0.08 0.07 0.13 0.20 0.20 0.00
## Apps-Prsnl 0.12 0.12 0.18 0.24 0.24 0.00
## Apps-PhD 0.34 0.34 0.39 0.45 0.44 0.00
## Apps-Trmnl 0.32 0.32 0.37 0.42 0.42 0.00
## Apps-S.F.R 0.00 0.01 0.10 0.18 0.16 0.03
## Apps-prc.l -0.15 -0.15 -0.09 -0.03 -0.03 0.00
## Apps-Expnd 0.19 0.18 0.26 0.35 0.34 0.00
## Apps-Grd.R 0.09 0.09 0.15 0.21 0.21 0.00
## Accpt-Enrll 0.88 0.87 0.91 0.94 0.94 0.00
## Accpt-Tp10p 0.13 0.13 0.19 0.26 0.25 0.00
## Accpt-Tp25p 0.19 0.18 0.25 0.31 0.31 0.00
## Accpt-F.Und 0.84 0.84 0.87 0.91 0.91 0.00
## Accpt-P.Und 0.38 0.38 0.44 0.51 0.52 0.00
## Accpt-Otstt -0.08 -0.08 -0.03 0.03 0.03 0.35
## Accpt-Rm.Br 0.01 0.02 0.09 0.16 0.16 0.02
## Accpt-Books 0.06 0.06 0.11 0.18 0.18 0.00
## Accpt-Prsnl 0.13 0.13 0.20 0.27 0.26 0.00
## Accpt-PhD 0.31 0.31 0.36 0.40 0.40 0.00
## Accpt-Trmnl 0.30 0.30 0.34 0.38 0.39 0.00
## Accpt-S.F.R 0.11 0.11 0.18 0.25 0.24 0.00
## Accpt-prc.l -0.21 -0.20 -0.16 -0.11 -0.12 0.00
## Accpt-Expnd 0.08 0.07 0.12 0.17 0.18 0.00
## Accpt-Grd.R 0.03 0.03 0.07 0.11 0.11 0.00
## Enrll-Tp10p 0.12 0.11 0.18 0.25 0.24 0.00
## Enrll-Tp25p 0.16 0.16 0.23 0.29 0.29 0.00
## Enrll-F.Und 0.96 0.95 0.96 0.97 0.97 0.00
## Enrll-P.Und 0.47 0.46 0.51 0.57 0.58 0.00
## Enrll-Otstt -0.22 -0.21 -0.16 -0.10 -0.10 0.00
## Enrll-Rm.Br -0.12 -0.12 -0.04 0.03 0.02 0.25
## Enrll-Books 0.06 0.05 0.11 0.18 0.18 0.00
## Enrll-Prsnl 0.20 0.20 0.28 0.35 0.35 0.00
## Enrll-PhD 0.30 0.29 0.33 0.37 0.37 0.00
## Enrll-Trmnl 0.27 0.27 0.31 0.35 0.35 0.00
## Enrll-S.F.R 0.17 0.17 0.24 0.31 0.31 0.00
## Enrll-prc.l -0.24 -0.23 -0.18 -0.13 -0.14 0.00
## Enrll-Expnd 0.01 0.01 0.06 0.12 0.12 0.02
## Enrll-Grd.R -0.06 -0.07 -0.02 0.04 0.04 0.51
## Tp10p-Tp25p 0.88 0.88 0.89 0.91 0.91 0.00
## Tp10p-F.Und 0.08 0.08 0.14 0.21 0.20 0.00
## Tp10p-P.Und -0.15 -0.16 -0.11 -0.06 -0.06 0.00
## Tp10p-Otstt 0.51 0.51 0.56 0.61 0.62 0.00
## Tp10p-Rm.Br 0.31 0.31 0.37 0.44 0.44 0.00
## Tp10p-Books 0.04 0.03 0.12 0.21 0.20 0.01
## Tp10p-Prsnl -0.16 -0.16 -0.09 -0.03 -0.03 0.00
## Tp10p-PhD 0.49 0.49 0.53 0.57 0.57 0.00
## Tp10p-Trmnl 0.44 0.44 0.49 0.54 0.53 0.00
## Tp10p-S.F.R -0.46 -0.46 -0.38 -0.31 -0.33 0.00
## Tp10p-prc.l 0.39 0.39 0.46 0.52 0.52 0.00
## Tp10p-Expnd 0.61 0.60 0.66 0.72 0.72 0.00
## Tp10p-Grd.R 0.45 0.45 0.49 0.55 0.54 0.00
## Tp25p-F.Und 0.14 0.13 0.20 0.26 0.26 0.00
## Tp25p-P.Und -0.12 -0.12 -0.05 0.01 0.01 0.12
## Tp25p-Otstt 0.44 0.44 0.49 0.54 0.54 0.00
## Tp25p-Rm.Br 0.27 0.27 0.33 0.40 0.39 0.00
## Tp25p-Books 0.03 0.02 0.12 0.21 0.20 0.02
## Tp25p-Prsnl -0.15 -0.15 -0.08 -0.02 -0.02 0.01
## Tp25p-PhD 0.49 0.49 0.55 0.60 0.59 0.00
## Tp25p-Trmnl 0.46 0.47 0.52 0.58 0.58 0.00
## Tp25p-S.F.R -0.37 -0.37 -0.29 -0.22 -0.23 0.00
## Tp25p-prc.l 0.35 0.35 0.42 0.48 0.49 0.00
## Tp25p-Expnd 0.47 0.47 0.53 0.58 0.58 0.00
## Tp25p-Grd.R 0.43 0.42 0.48 0.54 0.53 0.00
## F.Und-P.Und 0.52 0.51 0.57 0.64 0.64 0.00
## F.Und-Otstt -0.27 -0.27 -0.22 -0.16 -0.17 0.00
## F.Und-Rm.Br -0.14 -0.14 -0.07 0.00 0.00 0.05
## F.Und-Books 0.06 0.06 0.12 0.18 0.18 0.00
## F.Und-Prsnl 0.24 0.24 0.32 0.39 0.38 0.00
## F.Und-PhD 0.28 0.28 0.32 0.36 0.36 0.00
## F.Und-Trmnl 0.27 0.26 0.30 0.34 0.34 0.00
## F.Und-S.F.R 0.21 0.21 0.28 0.35 0.35 0.00
## F.Und-prc.l -0.28 -0.28 -0.23 -0.18 -0.18 0.00
## F.Und-Expnd -0.03 -0.03 0.02 0.07 0.06 0.44
## F.Und-Grd.R -0.12 -0.12 -0.08 -0.02 -0.03 0.00
## P.Und-Otstt -0.31 -0.32 -0.25 -0.20 -0.21 0.00
## P.Und-Rm.Br -0.13 -0.14 -0.06 0.01 0.01 0.09
## P.Und-Books 0.03 0.02 0.08 0.14 0.13 0.01
## P.Und-Prsnl 0.26 0.24 0.32 0.40 0.42 0.00
## P.Und-PhD 0.11 0.11 0.15 0.19 0.20 0.00
## P.Und-Trmnl 0.11 0.10 0.14 0.19 0.19 0.00
## P.Und-S.F.R 0.17 0.16 0.23 0.33 0.32 0.00
## P.Und-prc.l -0.37 -0.39 -0.28 -0.19 -0.17 0.00
## P.Und-Expnd -0.16 -0.16 -0.08 -0.02 -0.02 0.02
## P.Und-Grd.R -0.31 -0.31 -0.26 -0.20 -0.20 0.00
## Otstt-Rm.Br 0.61 0.61 0.65 0.70 0.69 0.00
## Otstt-Books -0.04 -0.04 0.04 0.11 0.12 0.32
## Otstt-Prsnl -0.36 -0.36 -0.30 -0.24 -0.24 0.00
## Otstt-PhD 0.33 0.32 0.38 0.44 0.44 0.00
## Otstt-Trmnl 0.36 0.36 0.41 0.46 0.45 0.00
## Otstt-S.F.R -0.60 -0.60 -0.55 -0.51 -0.51 0.00
## Otstt-prc.l 0.52 0.52 0.57 0.62 0.62 0.00
## Otstt-Expnd 0.63 0.63 0.67 0.71 0.71 0.00
## Otstt-Grd.R 0.53 0.52 0.57 0.62 0.62 0.00
## Rm.Br-Books 0.06 0.05 0.13 0.20 0.19 0.00
## Rm.Br-Prsnl -0.27 -0.27 -0.20 -0.14 -0.14 0.00
## Rm.Br-PhD 0.26 0.26 0.33 0.40 0.39 0.00
## Rm.Br-Trmnl 0.32 0.32 0.37 0.43 0.43 0.00
## Rm.Br-S.F.R -0.41 -0.42 -0.36 -0.31 -0.31 0.00
## Rm.Br-prc.l 0.22 0.21 0.27 0.33 0.33 0.00
## Rm.Br-Expnd 0.45 0.44 0.50 0.56 0.56 0.00
## Rm.Br-Grd.R 0.37 0.37 0.42 0.48 0.48 0.00
## Books-Prsnl 0.09 0.08 0.18 0.28 0.28 0.00
## Books-PhD -0.11 -0.11 0.03 0.18 0.17 0.63
## Books-Trmnl 0.02 0.01 0.10 0.20 0.20 0.03
## Books-S.F.R -0.11 -0.11 -0.03 0.05 0.05 0.47
## Books-prc.l -0.11 -0.12 -0.04 0.05 0.04 0.39
## Books-Expnd 0.05 0.04 0.11 0.17 0.18 0.00
## Books-Grd.R -0.06 -0.07 0.00 0.07 0.07 0.93
## Prsnl-PhD -0.09 -0.09 -0.01 0.07 0.07 0.75
## Prsnl-Trmnl -0.11 -0.11 -0.03 0.05 0.05 0.46
## Prsnl-S.F.R 0.05 0.05 0.14 0.23 0.21 0.00
## Prsnl-prc.l -0.33 -0.34 -0.29 -0.22 -0.22 0.00
## Prsnl-Expnd -0.17 -0.17 -0.10 -0.03 -0.03 0.01
## Prsnl-Grd.R -0.32 -0.33 -0.27 -0.21 -0.21 0.00
## PhD-Trmnl 0.81 0.81 0.85 0.88 0.88 0.00
## PhD-S.F.R -0.20 -0.20 -0.13 -0.06 -0.06 0.00
## PhD-prc.l 0.20 0.19 0.25 0.30 0.31 0.00
## PhD-Expnd 0.38 0.38 0.43 0.49 0.48 0.00
## PhD-Grd.R 0.25 0.23 0.31 0.38 0.37 0.00
## Trmnl-S.F.R -0.22 -0.22 -0.16 -0.09 -0.10 0.00
## Trmnl-prc.l 0.21 0.21 0.27 0.32 0.31 0.00
## Trmnl-Expnd 0.40 0.40 0.44 0.48 0.48 0.00
## Trmnl-Grd.R 0.23 0.22 0.29 0.37 0.36 0.00
## S.F.R-prc.l -0.45 -0.45 -0.40 -0.35 -0.36 0.00
## S.F.R-Expnd -0.63 -0.63 -0.58 -0.54 -0.55 0.00
## S.F.R-Grd.R -0.37 -0.37 -0.31 -0.24 -0.24 0.00
## prc.l-Expnd 0.37 0.37 0.42 0.47 0.46 0.00
## prc.l-Grd.R 0.43 0.44 0.49 0.54 0.54 0.00
## Expnd-Grd.R 0.34 0.33 0.39 0.45 0.45 0.00
# forward selection on a 50/50 split: add predictors in order of their
# correlation with Outstate (Expend 0.67, Room.Board 0.65, Grad.Rate 0.57, ...)
set.seed(98)
train <- sample(777, 388)
mod1 <- lm(Outstate ~ Expend, subset = train)
summary(mod1)
##
## Call:
## lm(formula = Outstate ~ Expend, subset = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14470 -1768 41 1920 6553
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5296.3804 320.3182 16.54 <2e-16 ***
## Expend 0.5513 0.0304 18.13 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2829 on 386 degrees of freedom
## Multiple R-squared: 0.46, Adjusted R-squared: 0.4586
## F-statistic: 328.8 on 1 and 386 DF, p-value: < 2.2e-16
mod2 <- lm(Outstate ~ Expend + Room.Board, subset = train)
summary(mod2)
##
## Call:
## lm(formula = Outstate ~ Expend + Room.Board, subset = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9602.4 -1482.9 -93.4 1607.1 7875.2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 238.43745 527.25403 0.452 0.651
## Expend 0.40037 0.02958 13.536 <2e-16 ***
## Room.Board 1.48897 0.13189 11.290 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2455 on 385 degrees of freedom
## Multiple R-squared: 0.5943, Adjusted R-squared: 0.5922
## F-statistic: 282 on 2 and 385 DF, p-value: < 2.2e-16
mod3 <- lm(Outstate ~ Expend + Room.Board + Grad.Rate, subset = train)
summary(mod3)
##
## Call:
## lm(formula = Outstate ~ Expend + Room.Board + Grad.Rate, subset = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9382.3 -1334.2 -148.4 1446.6 6581.3
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.921e+03 5.705e+02 -3.367 0.000836 ***
## Expend 3.495e-01 2.847e-02 12.277 < 2e-16 ***
## Room.Board 1.201e+00 1.291e-01 9.300 < 2e-16 ***
## Grad.Rate 5.978e+01 7.953e+00 7.517 4e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2295 on 384 degrees of freedom
## Multiple R-squared: 0.6463, Adjusted R-squared: 0.6436
## F-statistic: 233.9 on 3 and 384 DF, p-value: < 2.2e-16
mod4 <- lm(Outstate ~ Expend + Room.Board + Grad.Rate + perc.alumni, subset = train)
summary(mod4)
##
## Call:
## lm(formula = Outstate ~ Expend + Room.Board + Grad.Rate + perc.alumni,
## subset = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8475.2 -1314.8 -61.6 1409.0 6464.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.910e+03 5.413e+02 -3.528 0.000469 ***
## Expend 3.024e-01 2.793e-02 10.826 < 2e-16 ***
## Room.Board 1.225e+00 1.226e-01 9.999 < 2e-16 ***
## Grad.Rate 3.953e+01 8.144e+00 4.855 1.76e-06 ***
## perc.alumni 7.416e+01 1.122e+01 6.608 1.31e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2178 on 383 degrees of freedom
## Multiple R-squared: 0.6825, Adjusted R-squared: 0.6792
## F-statistic: 205.8 on 4 and 383 DF, p-value: < 2.2e-16
mod5 <- lm(Outstate ~ Expend + Room.Board + Grad.Rate + perc.alumni + Private, subset = train)
summary(mod5)
##
## Call:
## lm(formula = Outstate ~ Expend + Room.Board + Grad.Rate + perc.alumni +
## Private, subset = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8418.9 -1233.1 -41.2 1110.2 6265.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.843e+03 4.964e+02 -3.712 0.000236 ***
## Expend 2.996e-01 2.562e-02 11.694 < 2e-16 ***
## Room.Board 1.053e+00 1.142e-01 9.219 < 2e-16 ***
## Grad.Rate 3.434e+01 7.493e+00 4.583 6.22e-06 ***
## perc.alumni 4.561e+01 1.082e+01 4.216 3.10e-05 ***
## PrivateYes 2.266e+03 2.645e+02 8.566 2.67e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1997 on 382 degrees of freedom
## Multiple R-squared: 0.7337, Adjusted R-squared: 0.7302
## F-statistic: 210.5 on 5 and 382 DF, p-value: < 2.2e-16
mod6 <- lm(Outstate ~ Expend + Room.Board + Grad.Rate + perc.alumni + Private + Top10perc, subset = train)
summary(mod6)
##
## Call:
## lm(formula = Outstate ~ Expend + Room.Board + Grad.Rate + perc.alumni +
## Private + Top10perc, subset = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8121.2 -1262.4 -108.9 1116.8 6245.4
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.722e+03 4.974e+02 -3.463 0.000595 ***
## Expend 2.670e-01 2.975e-02 8.977 < 2e-16 ***
## Room.Board 1.061e+00 1.137e-01 9.331 < 2e-16 ***
## Grad.Rate 2.928e+01 7.829e+00 3.740 0.000212 ***
## perc.alumni 4.221e+01 1.089e+01 3.876 0.000125 ***
## PrivateYes 2.360e+03 2.670e+02 8.839 < 2e-16 ***
## Top10perc 1.847e+01 8.696e+00 2.124 0.034287 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1988 on 381 degrees of freedom
## Multiple R-squared: 0.7368, Adjusted R-squared: 0.7326
## F-statistic: 177.8 on 6 and 381 DF, p-value: < 2.2e-16
mod7 <- lm(Outstate ~ Expend + Room.Board + Grad.Rate + perc.alumni + Private + Top10perc + S.F.Ratio, subset = train)
summary(mod7)
##
## Call:
## lm(formula = Outstate ~ Expend + Room.Board + Grad.Rate + perc.alumni +
## Private + Top10perc + S.F.Ratio, subset = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8002.6 -1290.4 -73.1 1152.3 6276.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.108e+03 8.728e+02 -1.270 0.204879
## Expend 2.551e-01 3.287e-02 7.759 7.98e-14 ***
## Room.Board 1.063e+00 1.138e-01 9.340 < 2e-16 ***
## Grad.Rate 2.970e+01 7.847e+00 3.785 0.000178 ***
## perc.alumni 4.133e+01 1.094e+01 3.778 0.000184 ***
## PrivateYes 2.261e+03 2.913e+02 7.760 7.92e-14 ***
## Top10perc 1.833e+01 8.701e+00 2.106 0.035842 *
## S.F.Ratio -3.115e+01 3.637e+01 -0.856 0.392371
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1989 on 380 degrees of freedom
## Multiple R-squared: 0.7373, Adjusted R-squared: 0.7325
## F-statistic: 152.4 on 7 and 380 DF, p-value: < 2.2e-16
Guided by the correlation table, I added the variables most strongly related to Outstate one at a time until a new variable was no longer significant: S.F.Ratio is not significant (p = 0.39), so the final model keeps Expend, Room.Board, Grad.Rate, perc.alumni, Private, and Top10perc.
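The same search can be automated with forward stepwise selection; a sketch using leaps::regsubsets() (the leaps package is an assumption here, it is not loaded above):
library(leaps)
# forward stepwise selection over all predictors on the training rows
fwd <- regsubsets(Outstate ~ ., data = College[train, ], nvmax = 10,
                  method = "forward")
summary(fwd)$adjr2  # adjusted R-squared at each model size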
library(gam)
## Loading required package: foreach
##
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
##
## accumulate, when
## Loaded gam 1.16.1
# GAM on the training set: natural-spline terms fit via lm()
gam1 <- lm(Outstate ~ ns(Expend, 2) + ns(Room.Board, 3) + ns(Grad.Rate, 2) +
             ns(perc.alumni, 3) + Private + ns(Top10perc, 2), subset = train)
summary(gam1)
##
## Call:
## lm(formula = Outstate ~ ns(Expend, 2) + ns(Room.Board, 3) + ns(Grad.Rate,
## 2) + ns(perc.alumni, 3) + Private + ns(Top10perc, 2), subset = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6666.1 -1136.6 43.5 1121.7 4939.5
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1456.4 1012.4 1.439 0.151090
## ns(Expend, 2)1 16569.2 1403.6 11.805 < 2e-16 ***
## ns(Expend, 2)2 -303.8 2282.8 -0.133 0.894214
## ns(Room.Board, 3)1 3351.6 539.0 6.219 1.34e-09 ***
## ns(Room.Board, 3)2 3914.8 1959.1 1.998 0.046412 *
## ns(Room.Board, 3)3 2586.9 1114.9 2.320 0.020862 *
## ns(Grad.Rate, 2)1 1687.0 1477.4 1.142 0.254216
## ns(Grad.Rate, 2)2 2582.7 741.2 3.484 0.000552 ***
## ns(perc.alumni, 3)1 920.0 498.1 1.847 0.065503 .
## ns(perc.alumni, 3)2 2196.1 1304.9 1.683 0.093209 .
## ns(perc.alumni, 3)3 2110.3 946.7 2.229 0.026391 *
## PrivateYes 2093.8 252.4 8.297 1.96e-15 ***
## ns(Top10perc, 2)1 1421.6 880.8 1.614 0.107382
## ns(Top10perc, 2)2 -739.7 929.3 -0.796 0.426552
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1817 on 374 degrees of freedom
## Multiple R-squared: 0.7842, Adjusted R-squared: 0.7767
## F-statistic: 104.5 on 13 and 374 DF, p-value: < 2.2e-16
# plot each fitted term with standard-error bands
par(mfrow = c(1, 3))
plot.Gam(gam1, se = TRUE, col = "red")
The results indicate that the appropriate flexibility is 1 degree of freedom for Expend, 3 for Room.Board, 2 for Grad.Rate, and 3 for perc.alumni, while neither Top10perc term reaches significance. Both Expend and Top10perc show some evidence of a non-linear effect in the training data.
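The non-linearity of Expend can also be tested formally by comparing gam1 to the same model with a linear Expend term (a sketch, not run above; the models are nested because the natural-spline space contains linear functions):
# linear Expend vs. the 2-df spline, holding the other terms fixed
gam.lin <- lm(Outstate ~ Expend + ns(Room.Board, 3) + ns(Grad.Rate, 2) +
                ns(perc.alumni, 3) + Private + ns(Top10perc, 2), subset = train)
anova(gam.lin, gam1)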
# refit the same specification on the held-out observations
test <- College[-train, ]
gam2 <- lm(Outstate ~ ns(Expend, 2) + ns(Room.Board, 3) + ns(Grad.Rate, 2) +
             ns(perc.alumni, 3) + Private + ns(Top10perc, 2), data = test)
summary(gam2)
##
## Call:
## lm(formula = Outstate ~ ns(Expend, 2) + ns(Room.Board, 3) + ns(Grad.Rate,
## 2) + ns(perc.alumni, 3) + Private + ns(Top10perc, 2), data = test)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6951.0 -1282.4 9.6 1328.6 7887.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2216.2 1001.1 -2.214 0.027451 *
## ns(Expend, 2)1 15646.1 1629.4 9.603 < 2e-16 ***
## ns(Expend, 2)2 3064.6 1629.8 1.880 0.060841 .
## ns(Room.Board, 3)1 2764.4 599.5 4.611 5.50e-06 ***
## ns(Room.Board, 3)2 4512.8 1649.4 2.736 0.006514 **
## ns(Room.Board, 3)3 3766.5 1054.5 3.572 0.000401 ***
## ns(Grad.Rate, 2)1 4635.9 1376.7 3.367 0.000838 ***
## ns(Grad.Rate, 2)2 2219.0 523.6 4.238 2.84e-05 ***
## ns(perc.alumni, 3)1 1689.0 520.3 3.246 0.001276 **
## ns(perc.alumni, 3)2 3484.8 1280.1 2.722 0.006788 **
## ns(perc.alumni, 3)3 2282.9 884.4 2.581 0.010223 *
## Private 2041.8 269.9 7.565 3.02e-13 ***
## ns(Top10perc, 2)1 765.6 948.8 0.807 0.420200
## ns(Top10perc, 2)2 342.3 874.1 0.392 0.695601
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2013 on 375 degrees of freedom
## Multiple R-squared: 0.7778, Adjusted R-squared: 0.7701
## F-statistic: 101 on 13 and 375 DF, p-value: < 2.2e-16
# plot each fitted term with standard-error bands
par(mfrow = c(1, 3))
plot.Gam(gam2, se = TRUE, col = "red")
For the test set, every term remains significant except the two Top10perc terms and the second Expend term. Expend again shows evidence of a non-linear relationship when refit on the test data.
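Instead of refitting on the test rows, the training-set GAM can be scored directly on the held-out data and compared with the linear model mod6. A sketch (not run above; it pulls a fresh copy of College because Private was overwritten with a numeric earlier, while gam1 expects the original factor coding):
fresh <- ISLR::College  # untouched copy; Private is still a factor
test.fresh <- fresh[-train, ]
mse.gam <- mean((test.fresh$Outstate - predict(gam1, newdata = test.fresh))^2)
mse.lm <- mean((test.fresh$Outstate - predict(mod6, newdata = test.fresh))^2)
c(spline = mse.gam, linear = mse.lm)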