library(MASS)
data(Boston)
attach(Boston)
mod1 <- lm(nox~poly(dis, 3), data = Boston)
coef(summary(mod1))
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.5546951 0.00275939 201.020894 0.000000e+00
## poly(dis, 3)1 -2.0030959 0.06207094 -32.271071 1.597201e-124
## poly(dis, 3)2 0.8563300 0.06207094 13.795987 6.133104e-37
## poly(dis, 3)3 -0.3180490 0.06207094 -5.123959 4.274950e-07
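poly() generates orthogonal polynomials by default, so the individual coefficients are not directly interpretable as powers of dis; the fitted curve, however, is identical to a raw-polynomial fit. A quick check (sketch):
mod1.raw <- lm(nox ~ poly(dis, 3, raw = TRUE), data = Boston)
all.equal(fitted(mod1), fitted(mod1.raw))  # TRUE: same fitted values, different basis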
plot(dis, nox)
ix <- order(dis)  # dis is unsorted, so order the points before drawing the fitted curve
lines(dis[ix], fitted(mod1)[ix], col = "blue")
set.seed(2)
### 50-50 split (train vs. test)
train <- sample(506, 253)
mod1 <- lm(nox~dis, data = Boston, subset = train)
mean((nox-predict(mod1, Boston))[-train]^2)
## [1] 0.005455645
mod2 <- lm(nox~poly(dis, 2), data = Boston, subset = train)
mean((nox-predict(mod2, Boston))[-train]^2)
## [1] 0.004028465
mod3 <- lm(nox~poly(dis, 3), data = Boston, subset = train)
mean((nox-predict(mod3, Boston))[-train]^2)
## [1] 0.003773303
mod4 <- lm(nox~poly(dis, 4), data = Boston, subset = train)
mean((nox-predict(mod4, Boston))[-train]^2)
## [1] 0.003746374
mod5 <- lm(nox~poly(dis, 5), data = Boston, subset = train)
mean((nox-predict(mod5, Boston))[-train]^2)
## [1] 0.004679533
mod6 <- lm(nox~poly(dis, 6), data = Boston, subset = train)
mean((nox-predict(mod6, Boston))[-train]^2)
## [1] 0.007991185
mod7 <- lm(nox~poly(dis, 7), data = Boston, subset = train)
mean((nox-predict(mod7, Boston))[-train]^2)
## [1] 0.01192018
mod8 <- lm(nox~poly(dis, 8), data = Boston, subset = train)
mean((nox-predict(mod8, Boston))[-train]^2)
## [1] 0.06333546
mod9 <- lm(nox~poly(dis, 9), data = Boston, subset = train)
mean((nox-predict(mod9, Boston))[-train]^2)
## [1] 2.521413
mod10 <- lm(nox~poly(dis, 10), data = Boston, subset = train)
mean((nox-predict(mod10, Boston))[-train]^2)
## [1] 2.532109
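The ten fits above repeat the same pattern, so the validation-set MSEs can be computed more compactly in a loop; a minimal sketch using the same train indices:
val.mse <- sapply(1:10, function(d) {
  fit <- lm(nox ~ poly(dis, d), data = Boston, subset = train)
  mean((nox - predict(fit, Boston))[-train]^2)
})
val.mse  # should match the ten MSEs reported above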
# Leave-one-out cross-validation (LOOCV)
library(tidyverse)
## ── Attaching packages ─────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1 ✔ purrr 0.3.2
## ✔ tibble 2.1.3 ✔ dplyr 0.8.3
## ✔ tidyr 1.0.0 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::select() masks MASS::select()
library(boot)
cv.error <- rep(0, 10)
for (i in 1:10) {
  glm.fit <- glm(nox ~ poly(dis, i), data = Boston)
  cv.error[i] <- cv.glm(Boston, glm.fit)$delta[1]
}
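LOOCV refits the model n times; cv.glm() also supports the much cheaper K-fold cross-validation via its K argument. A sketch (10-fold; results vary with the random fold assignment):
cv.error.10fold <- sapply(1:10, function(d) {
  glm.fit <- glm(nox ~ poly(dis, d), data = Boston)
  cv.glm(Boston, glm.fit, K = 10)$delta[1]
})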
cvDF <- data.frame(degree = 1:10, cv.error)
ggplot(data = cvDF, aes(x = degree, y = cv.error)) +
  geom_point() +
  geom_line()
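The minimizing degree can also be read off programmatically rather than from the plot:
which.min(cv.error)  # degree with the lowest LOOCV error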
Based on the LOOCV error, a fourth-degree polynomial appears to be the appropriate fit for these data: the error begins to increase at degree 5 and beyond.
library(splines)
dislims <- range(dis)
fit.ns <- lm(nox ~ ns(dis, df = 4), data = Boston)  # natural spline with 4 df
plot(dis, nox, xlim = dislims, cex = 0.5, col = "darkgrey")
title("Smoothing Spline")
fit <- smooth.spline(dis, nox, df = 4)
fit2 <- smooth.spline(dis, nox, cv = TRUE)  # effective df chosen by LOOCV
## Warning in smooth.spline(dis, nox, cv = TRUE): cross-validation with non-
## unique 'x' values seems doubtful
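The smoothing-spline fits above are computed but never drawn; a minimal sketch to overlay them on the open scatterplot:
lines(fit, col = "red", lwd = 2)   # 4 effective df
lines(fit2, col = "blue", lwd = 2) # df chosen by LOOCV
legend("topright", legend = c("4 df", "CV-chosen df"),
       col = c("red", "blue"), lty = 1, lwd = 2)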
college<-read.csv("College.csv", header=TRUE, na.strings="?")
attach(college)
set.seed(1)
train <- sample(1:dim(college)[1], dim(college)[1]/2)
trainS <- college[train,]
testS <- college[-train,]
head(college)
## X Private Apps Accept Enroll Top10perc
## 1 Abilene Christian University Yes 1660 1232 721 23
## 2 Adelphi University Yes 2186 1924 512 16
## 3 Adrian College Yes 1428 1097 336 22
## 4 Agnes Scott College Yes 417 349 137 60
## 5 Alaska Pacific University Yes 193 146 55 16
## 6 Albertson College Yes 587 479 158 38
## Top25perc F.Undergrad P.Undergrad Outstate Room.Board Books Personal PhD
## 1 52 2885 537 7440 3300 450 2200 70
## 2 29 2683 1227 12280 6450 750 1500 29
## 3 50 1036 99 11250 3750 400 1165 53
## 4 89 510 63 12960 5450 450 875 92
## 5 44 249 869 7560 4120 800 1500 76
## 6 62 678 41 13500 3335 500 675 67
## Terminal S.F.Ratio perc.alumni Expend Grad.Rate
## 1 78 18.1 12 7041 60
## 2 30 12.2 16 10527 56
## 3 66 12.9 30 8735 54
## 4 97 7.7 37 19016 59
## 5 72 11.9 2 10922 15
## 6 73 9.4 11 9727 55
fit1 <- lm(Outstate~poly(Room.Board, 10), data=trainS)
summary(fit1)
##
## Call:
## lm(formula = Outstate ~ poly(Room.Board, 10), data = trainS)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7471.0 -2108.6 -331.5 1831.7 11220.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10484.0 157.2 66.705 < 2e-16 ***
## poly(Room.Board, 10)1 56089.5 3095.9 18.118 < 2e-16 ***
## poly(Room.Board, 10)2 -645.1 3095.9 -0.208 0.83505
## poly(Room.Board, 10)3 699.4 3095.9 0.226 0.82140
## poly(Room.Board, 10)4 -5066.7 3095.9 -1.637 0.10255
## poly(Room.Board, 10)5 -6670.6 3095.9 -2.155 0.03182 *
## poly(Room.Board, 10)6 3516.2 3095.9 1.136 0.25677
## poly(Room.Board, 10)7 5783.0 3095.9 1.868 0.06254 .
## poly(Room.Board, 10)8 -5446.3 3095.9 -1.759 0.07935 .
## poly(Room.Board, 10)9 -8849.8 3095.9 -2.859 0.00449 **
## poly(Room.Board, 10)10 2402.2 3095.9 0.776 0.43827
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3096 on 377 degrees of freedom
## Multiple R-squared: 0.4831, Adjusted R-squared: 0.4694
## F-statistic: 35.23 on 10 and 377 DF, p-value: < 2.2e-16
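Most of the higher-degree terms in the degree-10 fit are insignificant. Because polynomial fits of increasing degree are nested, sequential F-tests offer a more systematic way to choose the degree; a sketch, not part of the original analysis:
fits <- lapply(1:10, function(d) lm(Outstate ~ poly(Room.Board, d), data = trainS))
do.call(anova, fits)  # F-test for each added degree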
fit2 <- lm(Outstate~poly(Room.Board, 9) + poly(Books, 10), data=trainS)
summary(fit2)
##
## Call:
## lm(formula = Outstate ~ poly(Room.Board, 9) + poly(Books, 10),
## data = trainS)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7286.4 -2060.6 -202.1 1821.4 11696.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10484.0 156.1 67.170 < 2e-16 ***
## poly(Room.Board, 9)1 57187.8 3147.7 18.168 < 2e-16 ***
## poly(Room.Board, 9)2 -752.2 3107.9 -0.242 0.80890
## poly(Room.Board, 9)3 629.6 3096.5 0.203 0.83898
## poly(Room.Board, 9)4 -5283.6 3090.8 -1.709 0.08821 .
## poly(Room.Board, 9)5 -6890.3 3095.8 -2.226 0.02664 *
## poly(Room.Board, 9)6 4152.2 3104.5 1.337 0.18190
## poly(Room.Board, 9)7 5145.7 3104.4 1.658 0.09827 .
## poly(Room.Board, 9)8 -4778.2 3090.7 -1.546 0.12296
## poly(Room.Board, 9)9 -8169.8 3096.9 -2.638 0.00869 **
## poly(Books, 10)1 -6813.0 3121.9 -2.182 0.02972 *
## poly(Books, 10)2 3843.3 3107.9 1.237 0.21702
## poly(Books, 10)3 3141.1 3115.6 1.008 0.31403
## poly(Books, 10)4 -1279.1 3097.1 -0.413 0.67986
## poly(Books, 10)5 3007.2 3092.8 0.972 0.33152
## poly(Books, 10)6 5739.4 3099.7 1.852 0.06488 .
## poly(Books, 10)7 -2663.4 3101.7 -0.859 0.39107
## poly(Books, 10)8 516.7 3086.7 0.167 0.86716
## poly(Books, 10)9 -4614.2 3087.1 -1.495 0.13586
## poly(Books, 10)10 -558.8 3099.6 -0.180 0.85704
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3074 on 368 degrees of freedom
## Multiple R-squared: 0.5024, Adjusted R-squared: 0.4767
## F-statistic: 19.55 on 19 and 368 DF, p-value: < 2.2e-16
fit3 <- lm(Outstate~poly(Room.Board, 9) + Books + poly(Expend, 3), data=trainS)
summary(fit3)
##
## Call:
## lm(formula = Outstate ~ poly(Room.Board, 9) + Books + poly(Expend,
## 3), data = trainS)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9991.6 -1234.0 57.4 1388.6 7099.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12505.174 483.079 25.886 < 2e-16 ***
## poly(Room.Board, 9)1 26179.195 3068.077 8.533 3.61e-16 ***
## poly(Room.Board, 9)2 -2569.474 2485.905 -1.034 0.30198
## poly(Room.Board, 9)3 -2348.589 2386.170 -0.984 0.32563
## poly(Room.Board, 9)4 -3018.006 2391.435 -1.262 0.20773
## poly(Room.Board, 9)5 -3023.247 2398.787 -1.260 0.20834
## poly(Room.Board, 9)6 3581.318 2389.248 1.499 0.13474
## poly(Room.Board, 9)7 4420.583 2384.464 1.854 0.06454 .
## poly(Room.Board, 9)8 -663.956 2400.584 -0.277 0.78225
## poly(Room.Board, 9)9 -4970.182 2405.756 -2.066 0.03952 *
## Books -3.634 0.841 -4.321 1.99e-05 ***
## poly(Expend, 3)1 42598.337 2932.294 14.527 < 2e-16 ***
## poly(Expend, 3)2 -25203.247 2643.696 -9.533 < 2e-16 ***
## poly(Expend, 3)3 7785.176 2491.581 3.125 0.00192 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2377 on 374 degrees of freedom
## Multiple R-squared: 0.6976, Adjusted R-squared: 0.6871
## F-statistic: 66.36 on 13 and 374 DF, p-value: < 2.2e-16
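The train/test split created above can also be used to compare the three candidate models on held-out data; a sketch (mse() is a hypothetical helper, values not shown here):
mse <- function(fit) mean((testS$Outstate - predict(fit, newdata = testS))^2)
sapply(list(fit1 = fit1, fit2 = fit2, fit3 = fit3), mse)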
library(gam)
## Loading required package: foreach
##
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
##
## accumulate, when
## Loaded gam 1.16.1
gam1 <- lm(Outstate~ns(Room.Board, 9) + ns(Books) + ns(Expend, 3), data = trainS)  # ns(Books) without df yields a single basis function, i.e. a linear term
summary(gam1)
##
## Call:
## lm(formula = Outstate ~ ns(Room.Board, 9) + ns(Books) + ns(Expend,
## 3), data = trainS)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10504.5 -1420.9 -46.3 1505.7 6793.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3544 1494 2.372 0.018199 *
## ns(Room.Board, 9)1 1956 1379 1.418 0.156960
## ns(Room.Board, 9)2 4694 1670 2.812 0.005191 **
## ns(Room.Board, 9)3 4382 1539 2.847 0.004657 **
## ns(Room.Board, 9)4 4603 1622 2.837 0.004796 **
## ns(Room.Board, 9)5 3310 1605 2.062 0.039891 *
## ns(Room.Board, 9)6 6135 1610 3.810 0.000163 ***
## ns(Room.Board, 9)7 6174 1225 5.039 7.30e-07 ***
## ns(Room.Board, 9)8 8737 3228 2.707 0.007103 **
## ns(Room.Board, 9)9 3757 1368 2.745 0.006337 **
## ns(Books) -5149 1258 -4.093 5.22e-05 ***
## ns(Expend, 3)1 15309 1036 14.773 < 2e-16 ***
## ns(Expend, 3)2 13952 1679 8.309 1.80e-15 ***
## ns(Expend, 3)3 6590 1714 3.844 0.000142 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2363 on 374 degrees of freedom
## Multiple R-squared: 0.7011, Adjusted R-squared: 0.6907
## F-statistic: 67.49 on 13 and 374 DF, p-value: < 2.2e-16
plot.Gam(gam1, se = TRUE, col = "red")
gam2 <- lm(Outstate~ns(Room.Board, 9) + ns(Books) + ns(Expend, 3), data = testS)
summary(gam2)
##
## Call:
## lm(formula = Outstate ~ ns(Room.Board, 9) + ns(Books) + ns(Expend,
## 3), data = testS)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7503.2 -1166.9 -55.1 1413.7 5379.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3957.9 1618.8 2.445 0.014948 *
## ns(Room.Board, 9)1 1794.6 1456.7 1.232 0.218749
## ns(Room.Board, 9)2 2386.6 1767.1 1.351 0.177651
## ns(Room.Board, 9)3 3337.9 1635.2 2.041 0.041929 *
## ns(Room.Board, 9)4 3839.7 1688.5 2.274 0.023528 *
## ns(Room.Board, 9)5 3567.6 1652.2 2.159 0.031466 *
## ns(Room.Board, 9)6 4614.5 1675.1 2.755 0.006160 **
## ns(Room.Board, 9)7 4399.6 1222.6 3.598 0.000363 ***
## ns(Room.Board, 9)8 6000.2 3580.9 1.676 0.094645 .
## ns(Room.Board, 9)9 4609.8 1681.8 2.741 0.006418 **
## ns(Books) -3394.7 1747.9 -1.942 0.052862 .
## ns(Expend, 3)1 11821.5 821.1 14.397 < 2e-16 ***
## ns(Expend, 3)2 11830.0 1357.2 8.717 < 2e-16 ***
## ns(Expend, 3)3 6693.4 1593.3 4.201 3.32e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2199 on 375 degrees of freedom
## Multiple R-squared: 0.6744, Adjusted R-squared: 0.6631
## F-statistic: 59.75 on 13 and 375 DF, p-value: < 2.2e-16
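Note that gam2 refits the model on the test observations rather than evaluating the training-set model out of sample. The usual assessment would instead predict with gam1 on testS; a minimal sketch:
gam1.pred <- predict(gam1, newdata = testS)
mean((testS$Outstate - gam1.pred)^2)  # test MSE for the training-set model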
The GAM results indicate whether each variable shows a non-linear relationship with the response (Outstate) under the natural-spline basis.
Room.Board and Expend appear to have non-linear relationships with the response (spline terms beyond the first remain significant), while Books appears limited to a linear relationship: none of its higher-degree terms were significant in fit2, and ns(Books) in the GAM contributes only a single, linear basis function.
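Since library(gam) is loaded, the same structure can also be fit as a proper GAM with smoothing-spline terms; a sketch (the smoothing degrees of freedom here are illustrative assumptions, not tuned values):
gam.fit <- gam(Outstate ~ s(Room.Board, 4) + Books + s(Expend, 4), data = trainS)
par(mfrow = c(1, 3))
plot(gam.fit, se = TRUE, col = "blue")  # partial-effect plots for each term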