Perform polynomial regression to predict wage using age. Use cross-validation to select the optimal degree d for the polynomial. What degree was chosen, and how does this compare to the results of hypothesis testing using ANOVA? Make a plot of the resulting polynomial fit to the data.
library(ISLR)
library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag(): dplyr, stats
library(modelr)
library(broom)
##
## Attaching package: 'broom'
## The following object is masked from 'package:modelr':
##
## bootstrap
map(.x = 1:5, ~ lm(wage ~ poly(age, .x), data = Wage)) %>%
map(glance) %>%
bind_rows()
## r.squared adj.r.squared sigma statistic p.value df logLik
## 1 0.03827391 0.03795313 40.92907 119.31172 2.900778e-27 2 -15391.34
## 2 0.08208515 0.08147259 39.99262 134.00436 1.817159e-56 3 -15321.40
## 3 0.08510227 0.08418615 39.93350 92.89432 1.749513e-57 4 -15316.46
## 4 0.08626467 0.08504433 39.91479 70.68860 2.774098e-57 5 -15314.55
## 5 0.08651028 0.08498474 39.91608 56.70820 1.672298e-56 6 -15314.15
## AIC BIC deviance df.residual
## 1 30788.67 30806.69 5022216 2998
## 2 30650.80 30674.82 4793430 2997
## 3 30642.92 30672.95 4777674 2996
## 4 30641.11 30677.15 4771604 2995
## 5 30642.30 30684.35 4770322 2994
map(.x = 1:5, ~ lm(wage ~ poly(age, .x), data = Wage)) %>%
map(glance) %>%
bind_rows() %>%
gather(measure, value, -df) %>%
filter(!(measure %in% c('df.residuals', 'p.value', 'df.residual'))) %>%
ggplot(aes(x = df, y = value)) +
geom_line() +
geom_point() +
facet_wrap( ~ measure, scales = 'free_y', ncol = 2)
Optimal degree d for the polynomial is 3, based on BIC and AIC charts.
anova(
lm(wage ~ poly(age, 1), data = Wage),
lm(wage ~ poly(age, 2), data = Wage),
lm(wage ~ poly(age, 3), data = Wage),
lm(wage ~ poly(age, 4), data = Wage))
## Analysis of Variance Table
##
## Model 1: wage ~ poly(age, 1)
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2998 5022216
## 2 2997 4793430 1 228786 143.6025 < 2.2e-16 ***
## 3 2996 4777674 1 15756 9.8894 0.001679 **
## 4 2995 4771604 1 6070 3.8101 0.051039 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Anova test confirms that there is no improvement in model fit when using 4 degrees for the polinomial regression.
This question relates to the College data set.
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
test <- sample_frac(College, .3)
training <- setdiff(College, test)
fit1 <- lm(Outstate ~1 , data = training)
fit2 <- lm(Outstate ~ . , data = training)
stepAIC(fit1, scope = list(lower = fit1 , upper = fit2), direction = 'forward')
## Start: AIC=9026.68
## Outstate ~ 1
##
## Df Sum of Sq RSS AIC
## + Expend 1 4044509247 4671640347 8689.4
## + Room.Board 1 3805713666 4910435928 8716.5
## + Grad.Rate 1 3029039719 5687109875 8796.4
## + Private 1 2909634972 5806514622 8807.7
## + Top10perc 1 2695109608 6021039986 8827.4
## + S.F.Ratio 1 2631872165 6084277429 8833.1
## + perc.alumni 1 2475898748 6240250846 8846.9
## + Top25perc 1 2098231897 6617917697 8878.9
## + Terminal 1 1386657083 7329492511 8934.4
## + PhD 1 1254617098 7461532496 8944.1
## + Personal 1 799346933 7916802661 8976.4
## + P.Undergrad 1 514352435 8201797159 8995.6
## + F.Undergrad 1 447526247 8268623346 9000.0
## + Enroll 1 214985473 8501164121 9015.1
## <none> 8716149594 9026.7
## + Apps 1 22506171 8693643422 9027.3
## + Accept 1 1831891 8714317703 9028.6
## + Books 1 751348 8715398245 9028.6
##
## Step: AIC=8689.41
## Outstate ~ Expend
##
## Df Sum of Sq RSS AIC
## + Private 1 1412904328 3258736019 8495.5
## + Room.Board 1 1165930130 3505710217 8535.2
## + Grad.Rate 1 1092182650 3579457697 8546.5
## + perc.alumni 1 700251528 3971388819 8603.1
## + Personal 1 517162874 4154477473 8627.6
## + F.Undergrad 1 489270353 4182369994 8631.2
## + Enroll 1 361651348 4309988999 8647.6
## + P.Undergrad 1 330793903 4340846444 8651.5
## + S.F.Ratio 1 290668620 4380971727 8656.5
## + Top10perc 1 259968630 4411671717 8660.3
## + Top25perc 1 230157369 4441482978 8663.9
## + Apps 1 147611694 4524028653 8673.9
## + Terminal 1 108609839 4563030508 8678.6
## + Accept 1 103651719 4567988628 8679.2
## + PhD 1 84241979 4587398368 8681.5
## + Books 1 49130163 4622510184 8685.7
## <none> 4671640347 8689.4
##
## Step: AIC=8495.48
## Outstate ~ Expend + Private
##
## Df Sum of Sq RSS AIC
## + Room.Board 1 636744380 2621991640 8379.2
## + Grad.Rate 1 539324788 2719411231 8399.1
## + Terminal 1 501115162 2757620857 8406.6
## + PhD 1 498849531 2759886488 8407.1
## + Top25perc 1 260986446 2997749573 8452.1
## + Top10perc 1 234060528 3024675491 8456.9
## + perc.alumni 1 212187542 3046548477 8460.8
## + Personal 1 163839869 3094896150 8469.4
## + Accept 1 138616048 3120119971 8473.8
## + Apps 1 108106623 3150629396 8479.1
## + Books 1 16480100 3242255919 8494.7
## + Enroll 1 16149213 3242586806 8494.8
## <none> 3258736019 8495.5
## + P.Undergrad 1 7545437 3251190582 8496.2
## + F.Undergrad 1 2950957 3255785062 8497.0
## + S.F.Ratio 1 1999298 3256736721 8497.1
##
## Step: AIC=8379.21
## Outstate ~ Expend + Private + Room.Board
##
## Df Sum of Sq RSS AIC
## + Grad.Rate 1 312933486 2309058154 8312.1
## + PhD 1 296367896 2325623743 8316.0
## + Terminal 1 265043744 2356947895 8323.2
## + perc.alumni 1 262867015 2359124625 8323.7
## + Top25perc 1 212712720 2409278919 8335.2
## + Top10perc 1 196158806 2425832833 8338.9
## + Personal 1 99572448 2522419191 8360.1
## + Books 1 54957799 2567033840 8369.7
## + Accept 1 41971607 2580020033 8372.4
## + P.Undergrad 1 24866726 2597124913 8376.0
## + Apps 1 17905449 2604086190 8377.5
## <none> 2621991640 8379.2
## + Enroll 1 3476342 2618515297 8380.5
## + S.F.Ratio 1 3182075 2618809565 8380.5
## + F.Undergrad 1 157003 2621834636 8381.2
##
## Step: AIC=8312.07
## Outstate ~ Expend + Private + Room.Board + Grad.Rate
##
## Df Sum of Sq RSS AIC
## + PhD 1 174004174 2135053979 8271.4
## + Terminal 1 159845557 2149212597 8275.0
## + perc.alumni 1 106318429 2202739725 8288.4
## + Top25perc 1 76851429 2232206725 8295.7
## + Top10perc 1 67611285 2241446869 8297.9
## + Personal 1 49383289 2259674865 8302.3
## + Books 1 35783963 2273274190 8305.6
## + Accept 1 21886761 2287171393 8308.9
## <none> 2309058154 8312.1
## + S.F.Ratio 1 3469871 2305588283 8313.3
## + Apps 1 2680178 2306377975 8313.4
## + P.Undergrad 1 1611256 2307446898 8313.7
## + F.Undergrad 1 1061374 2307996780 8313.8
## + Enroll 1 438560 2308619593 8314.0
##
## Step: AIC=8271.45
## Outstate ~ Expend + Private + Room.Board + Grad.Rate + PhD
##
## Df Sum of Sq RSS AIC
## + perc.alumni 1 70903793 2064150187 8255.1
## + Personal 1 47298800 2087755179 8261.3
## + Books 1 27328865 2107725114 8266.4
## + Top25perc 1 19841932 2115212047 8268.4
## + Terminal 1 18227082 2116826897 8268.8
## + Top10perc 1 16821503 2118232476 8269.1
## + F.Undergrad 1 11380701 2123673278 8270.5
## + S.F.Ratio 1 9048680 2126005299 8271.1
## + P.Undergrad 1 8043363 2127010616 8271.4
## <none> 2135053979 8271.4
## + Accept 1 6511344 2128542636 8271.8
## + Enroll 1 2696023 2132357957 8272.8
## + Apps 1 6200 2135047780 8273.4
##
## Step: AIC=8255.08
## Outstate ~ Expend + Private + Room.Board + Grad.Rate + PhD +
## perc.alumni
##
## Df Sum of Sq RSS AIC
## + Personal 1 31203008 2032947179 8248.8
## + Books 1 22809386 2041340801 8251.0
## + Accept 1 15781507 2048368679 8252.9
## + Terminal 1 13763485 2050386701 8253.4
## + Top25perc 1 10665334 2053484853 8254.3
## + Top10perc 1 8653569 2055496618 8254.8
## <none> 2064150187 8255.1
## + S.F.Ratio 1 6210987 2057939199 8255.4
## + F.Undergrad 1 5950279 2058199908 8255.5
## + P.Undergrad 1 4957810 2059192377 8255.8
## + Apps 1 1858805 2062291381 8256.6
## + Enroll 1 746994 2063403192 8256.9
##
## Step: AIC=8248.79
## Outstate ~ Expend + Private + Room.Board + Grad.Rate + PhD +
## perc.alumni + Personal
##
## Df Sum of Sq RSS AIC
## + Accept 1 20335295 2012611883 8245.3
## + Books 1 13886046 2019061133 8247.1
## + Terminal 1 12962086 2019985093 8247.3
## + Top25perc 1 12572286 2020374893 8247.4
## + Top10perc 1 10054341 2022892838 8248.1
## + S.F.Ratio 1 7815694 2025131485 8248.7
## <none> 2032947179 8248.8
## + Apps 1 3700292 2029246887 8249.8
## + F.Undergrad 1 1846155 2031101024 8250.3
## + P.Undergrad 1 1410219 2031536960 8250.4
## + Enroll 1 4244 2032942935 8250.8
##
## Step: AIC=8245.32
## Outstate ~ Expend + Private + Room.Board + Grad.Rate + PhD +
## perc.alumni + Personal + Accept
##
## Df Sum of Sq RSS AIC
## + F.Undergrad 1 90945840 1921666044 8222.2
## + Enroll 1 84989644 1927622239 8223.8
## + Apps 1 28030940 1984580943 8239.7
## + Books 1 15041428 1997570455 8243.2
## + Terminal 1 11374870 2001237014 8244.2
## + S.F.Ratio 1 9729497 2002882387 8244.7
## + Top25perc 1 8845186 2003766697 8244.9
## + Top10perc 1 8032794 2004579089 8245.1
## <none> 2012611883 8245.3
## + P.Undergrad 1 6098147 2006513736 8245.7
##
## Step: AIC=8222.16
## Outstate ~ Expend + Private + Room.Board + Grad.Rate + PhD +
## perc.alumni + Personal + Accept + F.Undergrad
##
## Df Sum of Sq RSS AIC
## + Top10perc 1 20109981 1901556063 8218.4
## + Top25perc 1 18806098 1902859946 8218.8
## + Apps 1 14428700 1907237344 8220.1
## + Terminal 1 14398189 1907267854 8220.1
## + Books 1 12079044 1909586999 8220.7
## + Enroll 1 8581781 1913084263 8221.7
## <none> 1921666044 8222.2
## + S.F.Ratio 1 4462000 1917204043 8222.9
## + P.Undergrad 1 76488 1921589556 8224.1
##
## Step: AIC=8218.44
## Outstate ~ Expend + Private + Room.Board + Grad.Rate + PhD +
## perc.alumni + Personal + Accept + F.Undergrad + Top10perc
##
## Df Sum of Sq RSS AIC
## + Apps 1 34833300 1866722763 8210.4
## + Books 1 16491192 1885064871 8215.7
## + Terminal 1 13735258 1887820805 8216.5
## + Enroll 1 10975741 1890580322 8217.3
## <none> 1901556063 8218.4
## + S.F.Ratio 1 3711229 1897844834 8219.4
## + Top25perc 1 1271371 1900284692 8220.1
## + P.Undergrad 1 762403 1900793660 8220.2
##
## Step: AIC=8210.38
## Outstate ~ Expend + Private + Room.Board + Grad.Rate + PhD +
## perc.alumni + Personal + Accept + F.Undergrad + Top10perc +
## Apps
##
## Df Sum of Sq RSS AIC
## + Books 1 16591749 1850131014 8207.5
## + Enroll 1 14449470 1852273293 8208.2
## + Terminal 1 11216811 1855505952 8209.1
## <none> 1866722763 8210.4
## + S.F.Ratio 1 3196633 1863526130 8211.5
## + P.Undergrad 1 1165174 1865557589 8212.0
## + Top25perc 1 393308 1866329455 8212.3
##
## Step: AIC=8207.53
## Outstate ~ Expend + Private + Room.Board + Grad.Rate + PhD +
## perc.alumni + Personal + Accept + F.Undergrad + Top10perc +
## Apps + Books
##
## Df Sum of Sq RSS AIC
## + Terminal 1 15074641 1835056372 8205.1
## + Enroll 1 13883476 1836247537 8205.4
## <none> 1850131014 8207.5
## + S.F.Ratio 1 3020972 1847110041 8208.6
## + P.Undergrad 1 1169552 1848961462 8209.2
## + Top25perc 1 781207 1849349807 8209.3
##
## Step: AIC=8205.08
## Outstate ~ Expend + Private + Room.Board + Grad.Rate + PhD +
## perc.alumni + Personal + Accept + F.Undergrad + Top10perc +
## Apps + Books + Terminal
##
## Df Sum of Sq RSS AIC
## + Enroll 1 12471514 1822584859 8203.4
## <none> 1835056372 8205.1
## + S.F.Ratio 1 3869783 1831186589 8205.9
## + P.Undergrad 1 1138262 1833918110 8206.7
## + Top25perc 1 141618 1834914755 8207.0
##
## Step: AIC=8203.37
## Outstate ~ Expend + Private + Room.Board + Grad.Rate + PhD +
## perc.alumni + Personal + Accept + F.Undergrad + Top10perc +
## Apps + Books + Terminal + Enroll
##
## Df Sum of Sq RSS AIC
## <none> 1822584859 8203.4
## + S.F.Ratio 1 3994598 1818590261 8204.2
## + P.Undergrad 1 1166224 1821418635 8205.0
## + Top25perc 1 14656 1822570203 8205.4
##
## Call:
## lm(formula = Outstate ~ Expend + Private + Room.Board + Grad.Rate +
## PhD + perc.alumni + Personal + Accept + F.Undergrad + Top10perc +
## Apps + Books + Terminal + Enroll, data = training)
##
## Coefficients:
## (Intercept) Expend PrivateYes Room.Board Grad.Rate
## -2334.5598 0.2300 2513.9545 0.8761 22.6124
## PhD perc.alumni Personal Accept F.Undergrad
## 15.4341 33.6114 -0.1768 0.9178 -0.1198
## Top10perc Apps Books Terminal Enroll
## 28.2481 -0.2597 -1.2080 21.2215 -0.7912
# library(leaps)
# regfitFwd <- regsubsets( Outstate ~ . , data = training, method = 'forward', nvmax = length(names(training)) - 1)
# summary(regfitFwd)
#
# data_frame(n = c(1:length(summary(regfitFwd)[['bic']])), BIC = summary(regfitFwd)[['bic']]) %>%
# ggplot(aes(x = n, y = BIC )) +
# geom_line()
library(gam)
## Loading required package: splines
## Loading required package: foreach
##
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
##
## accumulate, when
## Loaded gam 1.14-4
fitGam <- gam(Outstate ~ s(Expend,4) + Private + s(Room.Board,4) + s(perc.alumni,4) +
s(PhD, 4) + s(Grad.Rate, 4) + s(S.F.Ratio, 4) + s(Top25perc, 4) + s(Books, 4) + s(Terminal, 4),
data = training)
summary(fitGam)
##
## Call: gam(formula = Outstate ~ s(Expend, 4) + Private + s(Room.Board,
## 4) + s(perc.alumni, 4) + s(PhD, 4) + s(Grad.Rate, 4) + s(S.F.Ratio,
## 4) + s(Top25perc, 4) + s(Books, 4) + s(Terminal, 4), data = training)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -5832.518 -1102.821 -2.992 1121.083 4175.207
##
## (Dispersion Parameter for gaussian family taken to be 3164134)
##
## Null Deviance: 8716149594 on 543 degrees of freedom
## Residual Deviance: 1601049075 on 505.9991 degrees of freedom
## AIC: 9724.674
##
## Number of Local Scoring Iterations: 2
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## s(Expend, 4) 1 4344210024 4344210024 1372.9538 < 2.2e-16 ***
## Private 1 1096682242 1096682242 346.5979 < 2.2e-16 ***
## s(Room.Board, 4) 1 449341873 449341873 142.0110 < 2.2e-16 ***
## s(perc.alumni, 4) 1 174416139 174416139 55.1229 4.835e-13 ***
## s(PhD, 4) 1 114817153 114817153 36.2871 3.279e-09 ***
## s(Grad.Rate, 4) 1 118039758 118039758 37.3055 2.015e-09 ***
## s(S.F.Ratio, 4) 1 4855407 4855407 1.5345 0.2160106
## s(Top25perc, 4) 1 2034054 2034054 0.6428 0.4230581
## s(Books, 4) 1 37828163 37828163 11.9553 0.0005908 ***
## s(Terminal, 4) 1 10013325 10013325 3.1646 0.0758496 .
## Residuals 506 1601049075 3164134
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## s(Expend, 4) 3 18.3286 2.651e-11 ***
## Private
## s(Room.Board, 4) 3 3.1038 0.0263129 *
## s(perc.alumni, 4) 3 1.3010 0.2733679
## s(PhD, 4) 3 0.8965 0.4427605
## s(Grad.Rate, 4) 3 1.4429 0.2294374
## s(S.F.Ratio, 4) 3 2.2506 0.0815938 .
## s(Top25perc, 4) 3 2.5512 0.0549446 .
## s(Books, 4) 3 7.0557 0.0001183 ***
## s(Terminal, 4) 3 1.8787 0.1321738
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Evidence that PhD, S.F Ration and Books have a non linear term is required.
par(mfrow=c(2,2))
plot.gam(fitGam, se = T, col = 'red')
plot(predict(fitGam, test[, -which(names(test) =='Outstate')]), test$Outstate)
For PhD, S.F Ration and Books