data("Wage")
data("College")
In this exercise, you will further analyze the Wage data set considered throughout this chapter.
# Fit polynomials of degree 1 to 10; the loop overwrites fit each
# time, so only the degree-10 model survives to the summary below
for(i in 1:10){
fit=glm(wage~poly(age, i), data=Wage)
}
summary(fit)
##
## Call:
## glm(formula = wage ~ poly(age, i), data = Wage)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -100.38 -24.45 -4.97 15.49 199.61
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 111.7036 0.7283 153.369 < 2e-16 ***
## poly(age, i)1 447.0679 39.8924 11.207 < 2e-16 ***
## poly(age, i)2 -478.3158 39.8924 -11.990 < 2e-16 ***
## poly(age, i)3 125.5217 39.8924 3.147 0.00167 **
## poly(age, i)4 -77.9112 39.8924 -1.953 0.05091 .
## poly(age, i)5 -35.8129 39.8924 -0.898 0.36940
## poly(age, i)6 62.7077 39.8924 1.572 0.11607
## poly(age, i)7 50.5498 39.8924 1.267 0.20520
## poly(age, i)8 -11.2547 39.8924 -0.282 0.77787
## poly(age, i)9 -83.6918 39.8924 -2.098 0.03599 *
## poly(age, i)10 1.6240 39.8924 0.041 0.96753
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 1591.402)
##
## Null deviance: 5222086 on 2999 degrees of freedom
## Residual deviance: 4756701 on 2989 degrees of freedom
## AIC: 30644
##
## Number of Fisher Scoring iterations: 2
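To choose the degree by cross-validation rather than by reading p-values, one could use cv.glm from the boot package. A minimal sketch (10-fold CV; the seed and K are my choices, output omitted):
library(boot)
set.seed(1)
cverr=rep(NA, 10)
for(i in 1:10){
fit=glm(wage~poly(age, i), data=Wage)
cverr[i]=cv.glm(Wage, fit, K=10)$delta[1]  # 10-fold CV estimate of test MSE
}
which.min(cverr)  # degree with the lowest CV error
plot(1:10, cverr, type="b", xlab="Degree", ylab="CV error")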
fit1=lm(wage~age, data=Wage)
fit2=lm(wage~poly(age, 2), data=Wage)
fit3=lm(wage~poly(age, 3), data=Wage)
fit4=lm(wage~poly(age, 4), data=Wage)
fit5=lm(wage~poly(age, 5), data=Wage)
anova(fit1, fit2, fit3, fit4, fit5)
## Analysis of Variance Table
##
## Model 1: wage ~ age
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2998 5022216
## 2 2997 4793430 1 228786 143.5931 < 2.2e-16 ***
## 3 2996 4777674 1 15756 9.8888 0.001679 **
## 4 2995 4771604 1 6070 3.8098 0.051046 .
## 5 2994 4770322 1 1283 0.8050 0.369682
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Degree 4 was the chosen polynomial. The ANOVA comparison (which, because poly() uses orthogonal polynomials, matches the t-tests in the summary above) shows that the cubic term is clearly significant, the quartic term is borderline (p ≈ 0.051), and the quintic adds nothing, so either a degree-3 or a degree-4 polynomial is a reasonable fit.
plot(wage~age, data=Wage, col="grey")
title("Polynomial Regression with degree 4")
agel=range(Wage$age)                 # range of observed ages
ageg=seq(from=agel[1], to=agel[2])   # grid of ages for prediction
fit=glm(wage~poly(age, 4), data=Wage)
pred=predict(fit, newdata=list(age=ageg))
lines(ageg, pred, col="blue", lwd=2)
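To show the uncertainty around the degree-4 curve, approximate 95% pointwise bands can be added from the standard errors returned by predict (my addition, not part of the original plot):
predse=predict(fit, newdata=list(age=ageg), se.fit=TRUE)
lines(ageg, predse$fit+2*predse$se.fit, col="blue", lty=2)  # upper band
lines(ageg, predse$fit-2*predse$se.fit, col="blue", lty=2)  # lower band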
# Fit step functions with 2 to 10 intervals; as before, only the
# last (10-interval) fit survives the loop
for(i in 2:10){
fit=glm(wage~cut(age, i), data=Wage)
}
summary(fit)
##
## Call:
## glm(formula = wage ~ cut(age, i), data = Wage)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -99.966 -24.791 -4.883 16.025 201.787
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 73.344 3.024 24.253 < 2e-16 ***
## cut(age, i)(24.2,30.4] 24.359 3.709 6.567 6.02e-11 ***
## cut(age, i)(30.4,36.6] 35.441 3.570 9.929 < 2e-16 ***
## cut(age, i)(36.6,42.8] 44.767 3.478 12.871 < 2e-16 ***
## cut(age, i)(42.8,49] 46.707 3.413 13.687 < 2e-16 ***
## cut(age, i)(49,55.2] 43.145 3.574 12.072 < 2e-16 ***
## cut(age, i)(55.2,61.4] 46.389 3.882 11.948 < 2e-16 ***
## cut(age, i)(61.4,67.6] 43.211 5.081 8.504 < 2e-16 ***
## cut(age, i)(67.6,73.8] 23.271 7.796 2.985 0.00286 **
## cut(age, i)(73.8,80.1] 21.217 11.500 1.845 0.06515 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 1600.43)
##
## Null deviance: 5222086 on 2999 degrees of freedom
## Residual deviance: 4785285 on 2990 degrees of freedom
## AIC: 30660
##
## Number of Fisher Scoring iterations: 2
plot(wage~age, data=Wage, col="grey")
title ("Step function with 10 cuts", outer=T)
agel=range(Wage$age)
ageg=seq(from=agel[1], to=agel[2])
fit=glm(wage~cut(age, 10), data=Wage)
pred=predict(fit, newdata=list(age=ageg))
lines(ageg, pred, col="blue", lwd=2)
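The number of intervals was fixed at 10 above; a minimal cross-validation sketch to pick it instead (precomputing the age.cut column keeps the factor levels consistent across folds; K=10 and the seed are my choices):
library(boot)
set.seed(1)
cverr=rep(NA, 10)
for(i in 2:10){
Wage$age.cut=cut(Wage$age, i)   # fixed breakpoints shared by all folds
fit=glm(wage~age.cut, data=Wage)
cverr[i]=cv.glm(Wage, fit, K=10)$delta[1]
}
which.min(cverr)  # number of intervals with the lowest CV error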
This question relates to the College data set.
set.seed(1)
# 80/20 train/test split of the College data
i=sample(1:nrow(College), nrow(College)*0.8)
train=College[i,]
test=College[-i,]
stepwise=regsubsets(Outstate~., data=train, method="forward", nvmax=18)
stepwises=summary(stepwise)
plot(stepwises$cp, xlab="Number of variables", ylab="Cp")
points(which.min(stepwises$cp), stepwises$cp[which.min(stepwises$cp)], col="blue", cex=2, pch=20)
plot(stepwises$bic, xlab="Number of variables", ylab="BIC")
points(which.min(stepwises$bic), stepwises$bic[which.min(stepwises$bic)], col="blue", cex=2, pch=20)
Based on both Cp and BIC, the best model has roughly 11 to 13 predictors; the 11-variable model is examined below.
coef(stepwise, id=11)
## (Intercept) PrivateYes Apps Accept Enroll
## -2257.2968624 2528.5841801 -0.3256427 0.8004141 -0.7645416
## Top10perc Room.Board Personal PhD perc.alumni
## 27.1711538 0.9342392 -0.3472783 26.7456972 51.6472233
## Expend Grad.Rate
## 0.2031258 21.0112073
gam1=gam(Outstate~Private+s(Apps)+s(Accept)+s(Enroll)+s(Top10perc)+s(Room.Board)+s(Personal)+s(PhD)+s(perc.alumni)+s(Expend)+s(Grad.Rate), data=train)
plot(gam1, se=TRUE, col="blue")
Some variables, such as PhD and perc.alumni, appear roughly linear, while others, such as Enroll and Accept, show clear non-linearity.
pred=predict(gam1, test)
error=mean((test$Outstate-pred)^2)
error
## [1] 2768897
t=mean((test$Outstate-mean(test$Outstate))^2)
r2=1-error/t
r2
## [1] 0.8026239
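Since the same test MSE and R² are computed again for the linear model below, a small helper would avoid repeating the arithmetic (my refactor; the function name is hypothetical):
testmetrics=function(model, data, y){
pred=predict(model, data)
mse=mean((y-pred)^2)              # test MSE
r2=1-mse/mean((y-mean(y))^2)      # test R-squared
c(MSE=mse, R2=r2)
}
testmetrics(gam1, test, test$Outstate)  # reproduces the values above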
fitlm=lm(Outstate~Private+Apps+Accept+Enroll+Top10perc+Room.Board+Personal+PhD+perc.alumni+Expend+Grad.Rate, data=train)
predl=predict(fitlm, test)
errorl=mean((test$Outstate-predl)^2)
errorl
## [1] 3226502
tl=mean((test$Outstate-mean(test$Outstate))^2)
r2l=1-errorl/tl
r2l
## [1] 0.7700043
The GAM outperformed the linear model, with a lower test MSE (2,768,897 vs. 3,226,502) and a higher test R² (0.80 vs. 0.77).
summary(gam1)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## Outstate ~ Private + s(Apps) + s(Accept) + s(Enroll) + s(Top10perc) +
## s(Room.Board) + s(Personal) + s(PhD) + s(perc.alumni) + s(Expend) +
## s(Grad.Rate)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8740.8 209.8 41.664 <2e-16 ***
## PrivateYes 2398.1 267.1 8.978 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(Apps) 1.000 1.000 2.110 0.14688
## s(Accept) 5.434 6.673 5.720 4.80e-06 ***
## s(Enroll) 8.423 8.874 4.677 8.43e-06 ***
## s(Top10perc) 1.370 1.659 2.651 0.10622
## s(Room.Board) 5.860 7.064 8.144 < 2e-16 ***
## s(Personal) 3.386 4.245 3.441 0.00746 **
## s(PhD) 1.000 1.000 2.225 0.13636
## s(perc.alumni) 1.000 1.000 27.443 3.93e-07 ***
## s(Expend) 6.902 7.970 23.204 < 2e-16 ***
## s(Grad.Rate) 3.267 4.142 3.985 0.00326 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.813 Deviance explained = 82.4%
## GCV = 3.3422e+06 Scale est. = 3.1289e+06 n = 621
The strongest evidence for a non-linear relationship is for Accept, Enroll, Room.Board, and Expend, whose smooths are highly significant with edf well above 1. Personal and Grad.Rate show milder non-linearity. perc.alumni is highly significant but has edf = 1, so its fitted effect is essentially linear.
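One way to double-check the edf = 1 reading for perc.alumni is to refit the GAM with that term entered linearly and compare the two fits (a sketch; the name gam2 is mine, and nearly identical AIC values would confirm the effect is essentially linear):
gam2=gam(Outstate~Private+s(Apps)+s(Accept)+s(Enroll)+s(Top10perc)+s(Room.Board)+s(Personal)+s(PhD)+perc.alumni+s(Expend)+s(Grad.Rate), data=train)
AIC(gam1, gam2)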