HW6Ch7DevinQuebodeaux

6. In this exercise, you will further analyze the Wage data set considered throughout this chapter.

library("ISLR")

## Warning: package 'ISLR' was built under R version 4.0.5

attach(Wage)
?Wage

## starting httpd help server ... done

wages<-Wage

library(boot)

(a) Perform polynomial regression to predict wage using age. Use cross-validation to select the optimal degree d for the polynomial. What degree was chosen, and how does this compare to the results of hypothesis testing using ANOVA? Make a plot of the resulting polynomial fit to the data.

library(boot)
set.seed(1)

errors <- rep(NA,10)

for (i in 1:10){ 
 fit=glm(wage~poly(age ,i),data=wages) 
 errors[i]<- cv.glm(wages, fit,K=10)$delta[1] 
}

errors

##  [1] 1676.826 1600.763 1598.399 1595.651 1594.977 1596.061 1594.298 1598.134
##  [9] 1593.913 1595.950

plot(1:10, errors, xlab = "Degree", ylab = "Test MSE", type = "l")
d.min <- which.min(errors)
points(which.min(errors), errors[which.min(errors)], col = "blue", cex = 2, pch = 20)

fit1 <- lm(wage ~ age, data = wages)
fit2 <- lm(wage ~ poly(age, 2), data = wages)
fit3 <- lm(wage ~ poly(age, 3), data = wages)
fit4 <- lm(wage ~ poly(age, 4), data = wages)
fit5 <- lm(wage ~ poly(age, 5), data = wages)
anova(fit1, fit2, fit3, fit4, fit5)

## Analysis of Variance Table
## 
## Model 1: wage ~ age
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
##   Res.Df     RSS Df Sum of Sq        F    Pr(>F)    
## 1   2998 5022216                                    
## 2   2997 4793430  1    228786 143.5931 < 2.2e-16 ***
## 3   2996 4777674  1     15756   9.8888  0.001679 ** 
## 4   2995 4771604  1      6070   3.8098  0.051046 .  
## 5   2994 4770322  1      1283   0.8050  0.369682    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#We should use the fit from degree-3 polynomial
plot(wage ~ age, data = Wage, col = "darkgrey")
agelims <- range(Wage$age)
age.grid <- seq(from = agelims[1], to = agelims[2])
fit <- lm(wage ~ poly(age, 3), data = Wage)
preds <- predict(fit, newdata = list(age = age.grid))
lines(age.grid, preds, col = "blue", lwd = 2)

Age to the 3rd degree is the best fit model.

(b) Fit a step function to predict wage using age, and perform cross-validation to choose the optimal number of cuts. Make a plot of the fit obtained.

set.seed(1)

lowest = 0
errors <- rep (NA,10)
for (i in 2:10){
  Wage$tmp <- cut(Wage$age,i)
  step.fit = glm(wage~tmp, data = Wage)
  errors[i] <- cv.glm(Wage ,step.fit, K= 10)$delta [1]
  
}    
errors

##  [1]       NA 1734.489 1684.271 1635.552 1632.080 1623.415 1614.996 1601.318
##  [9] 1613.954 1606.331

best.fit = glm(wage~cut(age, 8), data=Wage)
agelims = range(Wage$age)
age.grid = seq(from=agelims[1], to=agelims[2])
lm.pred = predict(best.fit, data.frame(age=age.grid))
plot(wage~age, data=wages, col="darkgrey")
lines(age.grid, lm.pred, col="red", lwd=2)

plot(2:10, errors[-1], xlab="Number of wage cuts", ylab="Errors", 
     type="l", pch=20, lwd=2)

The minimum error is for 8 cuts.

10. This question relates to the College data set.

(a) Split the data into a training set and a test set. Using out-of-state tuition as the response and the other variables as the predictors, perform forward stepwise selection on the training set in order to identify a satisfactory model that uses just a subset of the predictors.

library(leaps)

## Warning: package 'leaps' was built under R version 4.0.5

attach(College)
set.seed(1)
college<-College

names(college)

##  [1] "Private"     "Apps"        "Accept"      "Enroll"      "Top10perc"  
##  [6] "Top25perc"   "F.Undergrad" "P.Undergrad" "Outstate"    "Room.Board" 
## [11] "Books"       "Personal"    "PhD"         "Terminal"    "S.F.Ratio"  
## [16] "perc.alumni" "Expend"      "Grad.Rate"

library(leaps)

train <- sample(length(Outstate), length(Outstate) / 2)
test <- -train
college.train <- college[train, ]
college.test <- college[test, ]
fit <- regsubsets(Outstate ~ ., data = college.train, nvmax = 17, method = "forward")
fit.summary <- summary(fit)
par(mfrow = c(1, 3))
plot(fit.summary$cp, xlab = "Number of variables", ylab = "Cp", type = "l")
min.cp <- min(fit.summary$cp)
std.cp <- sd(fit.summary$cp)
abline(h = min.cp + 0.2 * std.cp, col = "red", lty = 2)
abline(h = min.cp - 0.2 * std.cp, col = "red", lty = 2)
plot(fit.summary$bic, xlab = "Number of variables", ylab = "BIC", type='l')
min.bic <- min(fit.summary$bic)
std.bic <- sd(fit.summary$bic)
abline(h = min.bic + 0.2 * std.bic, col = "red", lty = 2)
abline(h = min.bic - 0.2 * std.bic, col = "red", lty = 2)
plot(fit.summary$adjr2, xlab = "Number of variables", ylab = "Adjusted R2", type = "l", ylim = c(0.4, 0.84))
max.adjr2 <- max(fit.summary$adjr2)
std.adjr2 <- sd(fit.summary$adjr2)
abline(h = max.adjr2 + 0.2 * std.adjr2, col = "red", lty = 2)
abline(h = max.adjr2 - 0.2 * std.adjr2, col = "red", lty = 2)

fit <- regsubsets(Outstate ~ ., data = college, method = "forward")
coeffs <- coef(fit, id = 6)
names(coeffs)

## [1] "(Intercept)" "PrivateYes"  "Room.Board"  "PhD"         "perc.alumni"
## [6] "Expend"      "Grad.Rate"

The graphs indicate that minimum size for the sumbsit should be 6.

(b) Fit a GAM on the training data, using out-of-state tuition as the response and the features selected in the previous step as the predictors. Plot the results, and explain your findings.

library(mgcv)

## Loading required package: nlme

## This is mgcv 1.8-31. For overview type 'help("mgcv-package")'.

fit <-  gam(Outstate ~  Private + s(Room.Board) + s(PhD) + s(perc.alumni) + s(Expend) + s(Grad.Rate), data=college)
par(mfrow = c(2, 3))
plot(fit, se = T, col = "blue")

(c) Evaluate the model obtained on the test set, and explain the results obtained.

preds <- predict(fit, college.test)
errors <- mean((college.test$Outstate - preds)^2)
errors

## [1] 3083086

test.mean <- mean((college.test$Outstate - mean(college.test$Outstate))^2)
results <- 1 - errors / test.mean
results

## [1] 0.7846

Using Gam, we ended with six predictors and had a Rsquared of .76

(d) For which variables, if any, is there evidence of a non-linear relationship with the response?

summary(fit)

## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## Outstate ~ Private + s(Room.Board) + s(PhD) + s(perc.alumni) + 
##     s(Expend) + s(Grad.Rate)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   8618.4      160.5   53.69   <2e-16 ***
## PrivateYes    2506.0      201.0   12.46   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                  edf Ref.df      F  p-value    
## s(Room.Board)  5.299  6.493 14.664  < 2e-16 ***
## s(PhD)         3.934  4.896  3.151   0.0111 *  
## s(perc.alumni) 1.679  2.116 13.042 2.07e-06 ***
## s(Expend)      6.176  7.336 33.841  < 2e-16 ***
## s(Grad.Rate)   3.922  4.922  9.572 1.19e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.789   Deviance explained = 79.5%
## GCV = 3.522e+06  Scale est. = 3.4177e+06  n = 777

It looks like PhD is the most nonsignifcant non linear variable.

HW6Ch7DevinQuebodeaux

Devin Quebodeaux

7/30/2021