Assignment 6

In this exercise, you will further analyze the Wage data set considered throughout this chapter.

Perform polynomial regression to predict wage using age. Use cross-validation to select the optimal degree d for the polynomial. What degree was chosen, and how does this compare to the results of hypothesis testing using ANOVA? Make a plot of the resulting polynomial ???t to the data.

Wage<-read.table("C:/Users/Joshua Escareno/Desktop/Wage.txt", header = T )
lm.fit1<-lm(wage~age, data = Wage)
lm.fit2<-lm(wage~poly(age,2), data = Wage)
lm.fit3<-lm(wage~poly(age,3), data = Wage)            
lm.fit4<-lm(wage~poly(age,4), data = Wage)
lm.fit5<-lm(wage~poly(age,5), data = Wage)            
lm.fit6<-lm(wage~poly(age,6), data = Wage)   
lm.fit7<-lm(wage~poly(age,7), data = Wage) 
anova(lm.fit1,lm.fit2,lm.fit3,lm.fit4,lm.fit5,lm.fit6,lm.fit7)

## Analysis of Variance Table
## 
## Model 1: wage ~ age
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
## Model 6: wage ~ poly(age, 6)
## Model 7: wage ~ poly(age, 7)
##   Res.Df     RSS Df Sum of Sq        F    Pr(>F)    
## 1   2998 5022216                                    
## 2   2997 4793430  1    228786 143.6926 < 2.2e-16 ***
## 3   2996 4777674  1     15756   9.8956  0.001673 ** 
## 4   2995 4771604  1      6070   3.8125  0.050966 .  
## 5   2994 4770322  1      1283   0.8055  0.369516    
## 6   2993 4766389  1      3932   2.4697  0.116165    
## 7   2992 4763834  1      2555   1.6049  0.205311    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Now we will model the cross validation

library(boot)
deltas = rep(NA,10)
set.seed(1) #I set the seed because everytime I ran a new set of numbers would occur.
for (i in 1:10){
  fit = glm(wage~poly(age,i),data = Wage)
  deltas[i] = cv.glm(Wage,fit,K=10)$delta[1]
}
plot(1:10,deltas, xlab = "Degree", ylab = "Our Testing MSE",type = "l")

delta.min<-which.min(deltas)
delta.min

## [1] 4

Now we will plot the P-values

plot(wage~age,data = Wage, col = "blue")
agelims = range(Wage$age)
grid.age = seq(from = agelims[1], to=agelims[2])
lm.fit<-lm(wage~poly(age,3), data = Wage)
lm.pred<-predict(lm.fit,newdata = list(age = grid.age))
lines(grid.age,lm.pred,col = "green", lwd = "5")

(b) Fit a step function to predict wage using age, and perform crossvalidation to choose the optimal number of cuts. Make a plot of the ???t obtained.

cvs = rep(NA,10)
for(i in 2:10)
{
  Wage$age.cut = cut(Wage$age,i)
  fit = glm(wage~age.cut, data = Wage)
  cvs[i] = cv.glm(Wage,fit,K = 10)$delta[1]
}
plot(2:10,cvs[-1],xlab = "cuts", ylab = "Test Mean Sqaured Error", type = "o")

dim.min = which.min(cvs)
dim.min

## [1] 8

Using chross validations we will chose 8.

plot(wage~age,data = Wage, col = "blue")
agelims = range(Wage$age)
grid.age = seq(from = agelims[1], to=agelims[2])
glm.fit<-glm(wage~poly(age,8), data = Wage)
glm.pred<-predict(glm.fit,newdata = list(age = grid.age))
lines(grid.age,glm.pred,col = "green", lwd = "10")

10. This question relates to the College data set.

(a) Split the data into a training set and a test set. Using out-of-state tuition as the response and the other variables as the predictors, perform forward stepwise selection on the training set in order to identify a satisfactory model that uses just a subset of the predictors.

library(ISLR)

## Warning: package 'ISLR' was built under R version 3.5.3

## 
## Attaching package: 'ISLR'

## The following object is masked _by_ '.GlobalEnv':
## 
##     Wage

library(leaps)

## Warning: package 'leaps' was built under R version 3.5.3

attach(College)
train = sample(length(Outstate), length(Outstate)/2)
test = -train
ctrain<-College[train,]
ctest<-College[test,]
reg.fit<-regsubsets(Outstate~.,data = ctrain, nvmax = 17,method = 'forward')
reg.summary<- summary(reg.fit)
par=(mfrow = c(1,3))
plot(reg.summary$cp,xlab = "OUTSTATE", ylab = "CP", type = "l")
min.cp = min(reg.summary$cp)
std.cp = sd(reg.summary$cp)
abline(h=min.cp+.02*std.cp,col="red", lty = 2)
abline(h=min.cp-.02*std.cp,col="red", lty = 2)

plot(reg.summary$bic,xlab = "OUtSTATE", ylab = "BIC", type = "l")
min.bic = min(reg.summary$bic)
std.bic = sd(reg.summary$bic)
abline(h=min.bic+.02*std.bic,col="red", lty = 2)
abline(h=std.bic+.02*std.bic,col="red", lty = 2)

plot(reg.summary$bic,xlab = "OUtSTATE", ylab = "BIC", type = "l")
min.adj = min(reg.summary$adjr2)
std.adj = sd(reg.summary$adjr2)
abline(h=min.adj+.02*std.adj,col="red", lty = 2)
abline(h=min.adj-.02*std.adj,col="red", lty = 2)

plot(reg.summary$adjr2,xlab = "y", ylab = "c", type = "l")

coefi = coef(reg.fit,id = 6)
names(coefi)

## [1] "(Intercept)" "PrivateYes"  "Room.Board"  "Terminal"    "perc.alumni"
## [6] "Expend"      "Grad.Rate"

(b) Fit a GAM on the training data, using out-of-state tuition as the response and the features selected in the previous step as the predictors. Plot the results, and explain your ???ndings.

library(gam)

## Warning: package 'gam' was built under R version 3.5.3

## Loading required package: splines

## Loading required package: foreach

## Warning: package 'foreach' was built under R version 3.5.3

## Loaded gam 1.16

gam.fit= gam(Outstate~ Private+s(Room.Board,df = 2)+s(PhD,df= 2)+ s(perc.alumni,df = 2)+s(Expend,df = 5)+s(Grad.Rate, df = 2),data = ctrain)
plot(Outstate~ Room.Board)

It would appear that there are positive relations between Out of state applicants and Room and Board. Those who live furtyher pay more then someone say who is in the local area.

(c) Evaluate the model obtained on the test set, and explain the results obtained.

gam.fit2= gam(Outstate~ Private+s(Room.Board,df = 8)+s(PhD,df= 5)+ s(perc.alumni,df = 2)+s(Expend,df = 10)+s(Grad.Rate, df = 7),data = ctrain)
plot(gam.fit2,se = T, col = "green")

gam.pred<- predict(gam.fit, ctest)
gam.pred2<- predict(gam.fit2, ctest)
mean((ctest$Outstate-gam.pred)^2)

## [1] 3491988

mean((ctest$Outstate-gam.pred2)^2)

## [1] 3667829

summary(gam.fit)

## 
## Call: gam(formula = Outstate ~ Private + s(Room.Board, df = 2) + s(PhD, 
##     df = 2) + s(perc.alumni, df = 2) + s(Expend, df = 5) + s(Grad.Rate, 
##     df = 2), data = ctrain)
## Deviance Residuals:
##      Min       1Q   Median       3Q      Max 
## -7707.66 -1070.71   -58.67  1283.51  7348.01 
## 
## (Dispersion Parameter for gaussian family taken to be 3518025)
## 
##     Null Deviance: 6433245471 on 387 degrees of freedom
## Residual Deviance: 1312223032 on 372.9999 degrees of freedom
## AIC: 6966.282 
## 
## Number of Local Scoring Iterations: 2 
## 
## Anova for Parametric Effects
##                         Df     Sum Sq    Mean Sq F value    Pr(>F)    
## Private                  1 1743281330 1743281330 495.528 < 2.2e-16 ***
## s(Room.Board, df = 2)    1 1349150163 1349150163 383.497 < 2.2e-16 ***
## s(PhD, df = 2)           1  327440246  327440246  93.075 < 2.2e-16 ***
## s(perc.alumni, df = 2)   1  245007093  245007093  69.643  1.40e-15 ***
## s(Expend, df = 5)        1  559665043  559665043 159.085 < 2.2e-16 ***
## s(Grad.Rate, df = 2)     1   55662319   55662319  15.822  8.36e-05 ***
## Residuals              373 1312223032    3518025                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Anova for Nonparametric Effects
##                        Npar Df  Npar F     Pr(F)    
## (Intercept)                                         
## Private                                             
## s(Room.Board, df = 2)        1  2.5160   0.11354    
## s(PhD, df = 2)               1  1.7649   0.18482    
## s(perc.alumni, df = 2)       1  3.2300   0.07311 .  
## s(Expend, df = 5)            4 19.3548 1.754e-14 ***
## s(Grad.Rate, df = 2)         1  2.0473   0.15331    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

summary(gam.fit2)

## 
## Call: gam(formula = Outstate ~ Private + s(Room.Board, df = 8) + s(PhD, 
##     df = 5) + s(perc.alumni, df = 2) + s(Expend, df = 10) + s(Grad.Rate, 
##     df = 7), data = ctrain)
## Deviance Residuals:
##      Min       1Q   Median       3Q      Max 
## -6961.22 -1047.50   -38.53  1300.74  6783.71 
## 
## (Dispersion Parameter for gaussian family taken to be 3437620)
## 
##     Null Deviance: 6433245471 on 387 degrees of freedom
## Residual Deviance: 1216917265 on 354 degrees of freedom
## AIC: 6975.026 
## 
## Number of Local Scoring Iterations: 2 
## 
## Anova for Parametric Effects
##                         Df     Sum Sq    Mean Sq F value    Pr(>F)    
## Private                  1 1791042090 1791042090 521.012 < 2.2e-16 ***
## s(Room.Board, df = 8)    1 1323839009 1323839009 385.103 < 2.2e-16 ***
## s(PhD, df = 5)           1  312273254  312273254  90.840 < 2.2e-16 ***
## s(perc.alumni, df = 2)   1  222294451  222294451  64.665 1.347e-14 ***
## s(Expend, df = 10)       1  583734943  583734943 169.808 < 2.2e-16 ***
## s(Grad.Rate, df = 7)     1   66070271   66070271  19.220 1.538e-05 ***
## Residuals              354 1216917265    3437620                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Anova for Nonparametric Effects
##                        Npar Df Npar F     Pr(F)    
## (Intercept)                                        
## Private                                            
## s(Room.Board, df = 8)        7 1.3300   0.23491    
## s(PhD, df = 5)               4 2.5832   0.03698 *  
## s(perc.alumni, df = 2)       1 2.6372   0.10527    
## s(Expend, df = 10)           9 9.2786 1.147e-12 ***
## s(Grad.Rate, df = 7)         6 1.5369   0.16507    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(d) For which variables, if any, is there evidence of a non-linear relationship with the response?

It would appear that Expend have a non- linear relationship, also with Grad. rate and Ph.D.