library(ISLR)
library(boot)        # cv.glm() for k-fold cross-validation
library(leaps)       # regsubsets() for forward stepwise selection
library(highcharter) # hchart() interactive charts
library(tidyverse)   # %>% pipe and helpers
attach(Wage)
summary(Wage)
##       year           age                     maritl           race
##  Min.   :2003   Min.   :18.00   1. Never Married: 648   1. White:2480
##  1st Qu.:2004   1st Qu.:33.75   2. Married      :2074   2. Black: 293
##  Median :2006   Median :42.00   3. Widowed      :  19   3. Asian: 190
##  Mean   :2006   Mean   :42.41   4. Divorced     : 204   4. Other:  37
##  3rd Qu.:2008   3rd Qu.:51.00   5. Separated    :  55
##  Max.   :2009   Max.   :80.00
##
##                education                      region              jobclass
##  1. < HS Grad      :268   2. Middle Atlantic   :3000   1. Industrial :1544
##  2. HS Grad        :971   1. New England       :   0   2. Information:1456
##  3. Some College   :650   3. East North Central:   0
##  4. College Grad   :685   4. West North Central:   0
##  5. Advanced Degree:426   5. South Atlantic    :   0
##                           6. East South Central:   0
##                           (Other)              :   0
##             health      health_ins      logwage           wage
##  1. <=Good     : 858   1. Yes:2083   Min.   :3.000   Min.   : 20.09
##  2. >=Very Good:2142   2. No : 917   1st Qu.:4.447   1st Qu.: 85.38
##                                      Median :4.653   Median :104.92
##                                      Mean   :4.654   Mean   :111.70
##                                      3rd Qu.:4.857   3rd Qu.:128.68
##                                      Max.   :5.763   Max.   :318.34
##
set.seed(1)
errors <- rep(NA, 5) # empty vector to hold the CV error for each polynomial degree
for (i in 1:5) {
  glm.fit <- glm(wage ~ poly(age, i), data = Wage)
  errors[i] <- cv.glm(Wage, glm.fit, K = 10)$delta[1] # 10-fold CV estimate for degree i
}
cv.errors <- data.frame(errors)
error_chart <- cv.errors %>%
  hchart(dataLabels = list(enabled = TRUE), type = "line",
         hcaes(x = 1:5, y = errors, color = viridis::inferno(5)), pointWidth = 20) %>%
  hc_xAxis(title = list(text = "Degree")) %>%
  hc_yAxis(title = list(text = "Error")) %>%
  hc_add_theme(hc_theme_google())
error_chart
The cross-validation error is lowest at degree 5.
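As a quick programmatic check, the minimizing degree can be read directly off the errors vector computed above rather than from the chart:
which.min(errors) # index = polynomial degree with the smallest 10-fold CV error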
fit <- lm(wage ~ poly(age, 5), data = Wage) # fit the degree-5 polynomial chosen by CV
coef(summary(fit))
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 111.70361 0.7287647 153.2780243 0.000000e+00
## poly(age, 5)1 447.06785 39.9160847 11.2001930 1.491111e-28
## poly(age, 5)2 -478.31581 39.9160847 -11.9830341 2.367734e-32
## poly(age, 5)3 125.52169 39.9160847 3.1446392 1.679213e-03
## poly(age, 5)4 -77.91118 39.9160847 -1.9518743 5.104623e-02
## poly(age, 5)5 -35.81289 39.9160847 -0.8972045 3.696820e-01
fit.1 <- lm(wage ~ age, data = Wage)
fit.2 <- lm(wage ~ poly(age, 2, raw = T), data = Wage)
fit.3 <- lm(wage ~ poly(age, 3, raw = T), data = Wage)
fit.4 <- lm(wage ~ poly(age, 4, raw = T), data = Wage)
fit.5 <- lm(wage ~ poly(age, 5, raw = T), data = Wage)
anova(fit.1, fit.2, fit.3, fit.4, fit.5)
## Analysis of Variance Table
##
## Model 1: wage ~ age
## Model 2: wage ~ poly(age, 2, raw = T)
## Model 3: wage ~ poly(age, 3, raw = T)
## Model 4: wage ~ poly(age, 4, raw = T)
## Model 5: wage ~ poly(age, 5, raw = T)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2998 5022216
## 2 2997 4793430 1 228786 143.5931 < 2.2e-16 ***
## 3 2996 4777674 1 15756 9.8888 0.001679 **
## 4 2995 4771604 1 6070 3.8098 0.051046 .
## 5 2994 4770322 1 1283 0.8050 0.369682
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Reviewing the ANOVA results, the p-value comparing the linear Model 1 to the quadratic Model 2 is essentially zero (< 2.2e-16), indicating that a linear fit is not sufficient. The p-value comparing Model 2 to Model 3 is 0.001679, also very small, so the quadratic fit is insufficient as well. The p-value comparing Model 3 to Model 4 is borderline at 0.051046, and the p-value comparing Model 4 to Model 5 is large (0.369682). Hence either the cubic or the quartic polynomial appears to give a good fit, which is more parsimonious than the degree-5 model favored by cross-validation above.
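To make the link between the RSS column and the F statistics explicit, the Model 3 vs. Model 4 comparison can be reproduced by hand from the numbers printed above. Note that anova() uses the residual mean square of the largest model (Model 5, with 2994 residual df) as the common denominator:
rss3 <- 4777674; rss4 <- 4771604; rss5 <- 4770322 # RSS values from the table
(rss3 - rss4) / (rss5 / 2994) # F for Model 3 vs. 4: about 3.81, matching the table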
set.seed(1)
cuts.error <- rep(NA, 20) # index 1 stays NA: a single interval is just the intercept
for (i in 2:20) {
  Wage$age.cut <- cut(Wage$age, i)
  fit <- glm(wage ~ age.cut, data = Wage)
  cuts.error[i] <- cv.glm(Wage, fit, K = 10)$delta[1] # 10-fold CV error for i intervals
}
cuts.error2 <- data.frame(cuts.error)
cuts_chart <- cuts.error2 %>%
  hchart(dataLabels = list(enabled = TRUE), type = "line",
         hcaes(x = 1:20, y = cuts.error, color = viridis::inferno(20)), pointWidth = 20) %>%
  hc_xAxis(title = list(text = "Cuts")) %>%
  hc_yAxis(title = list(text = "Error")) %>%
  hc_add_theme(hc_theme_google())
cuts_chart
The cross-validation error is minimized at 16 cuts.
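Again, this can be confirmed programmatically from the cuts.error vector computed above (which.min() skips the NA at index 1):
which.min(cuts.error) # number of intervals with the smallest CV error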
table(cut(age, 16)) # the breaks argument of cut() can instead specify custom cut points
##
## (17.9,21.9] (21.9,25.8] (25.8,29.6] (29.6,33.5] (33.5,37.4] (37.4,41.2]
## 60 171 217 302 294 377
## (41.2,45.1] (45.1,49] (49,52.9] (52.9,56.8] (56.8,60.6] (60.6,64.5]
## 374 354 246 257 175 101
## (64.5,68.4] (68.4,72.2] (72.2,76.1] (76.1,80.1]
## 32 22 13 5
fit <- lm(wage ~ cut(age, 16), data = Wage)
coef(summary(fit))
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 66.40936 5.151319 12.891719 4.796004e-37
## cut(age, 16)(21.9,25.8] 13.33639 5.987236 2.227470 2.599015e-02
## cut(age, 16)(25.8,29.6] 31.43918 5.820074 5.401852 7.114132e-08
## cut(age, 16)(29.6,33.5] 38.77136 5.639873 6.874509 7.544184e-12
## cut(age, 16)(33.5,37.4] 45.89510 5.652576 8.119323 6.784636e-16
## cut(age, 16)(37.4,41.2] 53.37573 5.546111 9.623993 1.300163e-21
## cut(age, 16)(41.2,45.1] 54.07627 5.549164 9.744941 4.127196e-22
## cut(age, 16)(45.1,49] 52.63106 5.570793 9.447678 6.764146e-21
## cut(age, 16)(49,52.9] 47.50389 5.745286 8.268325 2.020522e-16
## cut(age, 16)(52.9,56.8] 52.40537 5.721126 9.159974 9.390556e-20
## cut(age, 16)(56.8,60.6] 53.86153 5.969437 9.022882 3.203406e-19
## cut(age, 16)(60.6,64.5] 54.17113 6.503853 8.329082 1.225763e-16
## cut(age, 16)(64.5,68.4] 43.24595 8.734487 4.951172 7.786873e-07
## cut(age, 16)(68.4,72.2] 32.38383 9.945212 3.256223 1.141680e-03
## cut(age, 16)(72.2,76.1] 26.11595 12.206980 2.139428 3.248193e-02
## cut(age, 16)(76.1,80.1] 22.44619 18.573346 1.208516 2.269445e-01
agelims <- range(Wage$age)
age.grid <- seq(from = agelims[1], to = agelims[2])
preds <- predict(fit, data.frame(age = age.grid))
plot(wage ~ age, data = Wage, col = "darkgrey")
lines(age.grid, preds, col = "darkblue", lwd = 2)
detach(Wage)
attach(College)
set.seed(1)
trainingindex <- sample(nrow(College), 0.75 * nrow(College)) # 75/25 train/test split
train <- College[trainingindex, ]
test <- College[-trainingindex, ]
regfit.fwd <- regsubsets(Outstate ~ ., data = train, nvmax = 17, method = "forward")
summary(regfit.fwd)
## Subset selection object
## Call: regsubsets.formula(Outstate ~ ., data = train, nvmax = 17, method = "forward")
## 17 Variables (and intercept)
## Forced in Forced out
## PrivateYes FALSE FALSE
## Apps FALSE FALSE
## Accept FALSE FALSE
## Enroll FALSE FALSE
## Top10perc FALSE FALSE
## Top25perc FALSE FALSE
## F.Undergrad FALSE FALSE
## P.Undergrad FALSE FALSE
## Room.Board FALSE FALSE
## Books FALSE FALSE
## Personal FALSE FALSE
## PhD FALSE FALSE
## Terminal FALSE FALSE
## S.F.Ratio FALSE FALSE
## perc.alumni FALSE FALSE
## Expend FALSE FALSE
## Grad.Rate FALSE FALSE
## 1 subsets of each size up to 17
## Selection Algorithm: forward
## PrivateYes Apps Accept Enroll Top10perc Top25perc F.Undergrad
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) "*" " " " " " " " " " " " "
## 3 ( 1 ) "*" " " " " " " " " " " " "
## 4 ( 1 ) "*" " " " " " " " " " " " "
## 5 ( 1 ) "*" " " " " " " " " " " " "
## 6 ( 1 ) "*" " " " " " " " " " " " "
## 7 ( 1 ) "*" " " " " " " " " " " " "
## 8 ( 1 ) "*" " " " " " " "*" " " " "
## 9 ( 1 ) "*" " " " " " " "*" " " " "
## 10 ( 1 ) "*" " " "*" " " "*" " " " "
## 11 ( 1 ) "*" "*" "*" " " "*" " " " "
## 12 ( 1 ) "*" "*" "*" " " "*" " " "*"
## 13 ( 1 ) "*" "*" "*" " " "*" " " "*"
## 14 ( 1 ) "*" "*" "*" "*" "*" " " "*"
## 15 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## 16 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## 17 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## P.Undergrad Room.Board Books Personal PhD Terminal S.F.Ratio
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " "
## 3 ( 1 ) " " "*" " " " " " " " " " "
## 4 ( 1 ) " " "*" " " " " " " " " " "
## 5 ( 1 ) " " "*" " " " " "*" " " " "
## 6 ( 1 ) " " "*" " " " " "*" " " " "
## 7 ( 1 ) " " "*" " " "*" "*" " " " "
## 8 ( 1 ) " " "*" " " "*" "*" " " " "
## 9 ( 1 ) " " "*" " " "*" "*" "*" " "
## 10 ( 1 ) " " "*" " " "*" "*" "*" " "
## 11 ( 1 ) " " "*" " " "*" "*" "*" " "
## 12 ( 1 ) " " "*" " " "*" "*" "*" " "
## 13 ( 1 ) " " "*" " " "*" "*" "*" "*"
## 14 ( 1 ) " " "*" " " "*" "*" "*" "*"
## 15 ( 1 ) " " "*" " " "*" "*" "*" "*"
## 16 ( 1 ) "*" "*" " " "*" "*" "*" "*"
## 17 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## perc.alumni Expend Grad.Rate
## 1 ( 1 ) " " "*" " "
## 2 ( 1 ) " " "*" " "
## 3 ( 1 ) " " "*" " "
## 4 ( 1 ) "*" "*" " "
## 5 ( 1 ) "*" "*" " "
## 6 ( 1 ) "*" "*" "*"
## 7 ( 1 ) "*" "*" "*"
## 8 ( 1 ) "*" "*" "*"
## 9 ( 1 ) "*" "*" "*"
## 10 ( 1 ) "*" "*" "*"
## 11 ( 1 ) "*" "*" "*"
## 12 ( 1 ) "*" "*" "*"
## 13 ( 1 ) "*" "*" "*"
## 14 ( 1 ) "*" "*" "*"
## 15 ( 1 ) "*" "*" "*"
## 16 ( 1 ) "*" "*" "*"
## 17 ( 1 ) "*" "*" "*"
reg.summary <- summary(regfit.fwd)
plot(reg.summary$cp, xlab = "Number of Variables", ylab = "Cp", type = "l")
which.min(reg.summary$cp)
## [1] 12
points(12, reg.summary$cp[12], col = "red", cex = 2, pch = 20)
which.min(reg.summary$bic)
## [1] 12
plot(reg.summary$bic, xlab = "Number of Variables", ylab = "BIC", type = "l")
points(12, reg.summary$bic[12], col = "red", cex = 2, pch = 20)
Both Cp and BIC are minimized at 12 variables. According to our forward stepwise selection, those 12 variables are:
coeff <- coef(regfit.fwd, id = 12)
names(coeff)
## [1] "(Intercept)" "PrivateYes" "Apps" "Accept" "Top10perc"
## [6] "F.Undergrad" "Room.Board" "Personal" "PhD" "Terminal"
## [11] "perc.alumni" "Expend" "Grad.Rate"
library(gam)
## Loading required package: splines
## Loading required package: foreach
##
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
##
## accumulate, when
## Loaded gam 1.20
fit <- gam(Outstate ~ Private + Apps + Accept + Top10perc + F.Undergrad +
             s(Room.Board, df = 2) + Personal + s(PhD, df = 2) + Terminal +
             s(perc.alumni, df = 2) + s(Expend, df = 5) + s(Grad.Rate, df = 2),
           data = train)
par(mfrow = c(2, 2))
plot(fit, se = T, col = "red")
preds <- predict(fit, test)
error <- mean((test$Outstate - preds)^2) # test-set MSE for the GAM
error
## [1] 2996563
lm.pred <- predict(lm(Outstate ~ Private + Apps + Accept + Top10perc + Room.Board + PhD +
                        perc.alumni + Expend + Grad.Rate + F.Undergrad + Personal + Terminal,
                      data = train), test)
lm.err <- mean((test$Outstate - lm.pred)^2) # test-set MSE for a purely linear fit
lm.err
## [1] 3340443
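The GAM attains a lower test MSE than the linear model on the same twelve predictors. A quick calculation on the two values printed above quantifies the gap:
(lm.err - error) / lm.err # roughly 0.10: the GAM reduces test MSE by about 10%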
summary(fit)
##
## Call: gam(formula = Outstate ~ Private + Apps + Accept + Top10perc +
## F.Undergrad + s(Room.Board, df = 2) + Personal + s(PhD, df = 2) +
## Terminal + s(perc.alumni, df = 2) + s(Expend, df = 5) + s(Grad.Rate,
## df = 2), data = train)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -6876.88 -1165.09 96.64 1246.99 7648.39
##
## (Dispersion Parameter for gaussian family taken to be 3470693)
##
## Null Deviance: 9798277171 on 581 degrees of freedom
## Residual Deviance: 1947059274 on 561.0001 degrees of freedom
## AIC: 10439.1
##
## Number of Local Scoring Iterations: NA
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## Private 1 2524961515 2524961515 727.5092 < 2.2e-16 ***
## Apps 1 863632155 863632155 248.8356 < 2.2e-16 ***
## Accept 1 121688422 121688422 35.0617 5.570e-09 ***
## Top10perc 1 1254912871 1254912871 361.5741 < 2.2e-16 ***
## F.Undergrad 1 175281973 175281973 50.5034 3.629e-12 ***
## s(Room.Board, df = 2) 1 707120604 707120604 203.7404 < 2.2e-16 ***
## Personal 1 61167682 61167682 17.6241 3.130e-05 ***
## s(PhD, df = 2) 1 85443830 85443830 24.6187 9.278e-07 ***
## Terminal 1 27135144 27135144 7.8184 0.005349 **
## s(perc.alumni, df = 2) 1 198174869 198174869 57.0995 1.696e-13 ***
## s(Expend, df = 5) 1 503847640 503847640 145.1720 < 2.2e-16 ***
## s(Grad.Rate, df = 2) 1 66022273 66022273 19.0228 1.537e-05 ***
## Residuals 561 1947059274 3470693
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## Private
## Apps
## Accept
## Top10perc
## F.Undergrad
## s(Room.Board, df = 2) 1 3.2540 0.07178 .
## Personal
## s(PhD, df = 2) 1 0.9360 0.33372
## Terminal
## s(perc.alumni, df = 2) 1 0.8791 0.34884
## s(Expend, df = 5) 4 24.3736 < 2e-16 ***
## s(Grad.Rate, df = 2) 1 0.9060 0.34158
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The ANOVA for nonparametric effects shows strong evidence of a non-linear relationship between Outstate and Expend (p < 2e-16). At the 0.05 level, none of the other smoothed terms (Room.Board, PhD, perc.alumni, Grad.Rate) show significant evidence of non-linearity, so linear terms would likely suffice for them.
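As a cross-check on the Expend result (a minimal sketch in the style of the ISLR lab; fit.lin is a new object introduced here for illustration), one can refit the GAM with a linear Expend term and compare the two nested models with an F test:
fit.lin <- gam(Outstate ~ Private + Apps + Accept + Top10perc + F.Undergrad +
                 s(Room.Board, df = 2) + Personal + s(PhD, df = 2) + Terminal +
                 s(perc.alumni, df = 2) + Expend + s(Grad.Rate, df = 2),
               data = train)
anova(fit.lin, fit, test = "F") # a small p-value again favors the smooth in Expend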