Q.6

(a) Perform polynomial regression to predict wage using age. Use cross-validation to select the optimal degree d for the polynomial. What degree was chosen, and how does this compare to the results of hypothesis testing using ANOVA? Make a plot of the resulting polynomial fit to the data.

Split Data

library(ISLR)   # Wage and College data sets
library(caret)  # createDataPartition
library(boot)   # cv.glm for cross-validation

inTrain1 = createDataPartition(y=Wage$wage,  p=0.75,list = FALSE)
train1 = Wage[inTrain1,]
test1 = Wage[-inTrain1,]

Optimal Degree d

# choose the polynomial degree by 10-fold cross-validation,
# fitting wage ~ poly(age, i) for i = 1, ..., 10
cv.error = rep(NA, 10)
for (i in 1:10) {
    cv.fit = glm(wage ~ poly(age, i), data = Wage)
    cv.error[i] = cv.glm(Wage, cv.fit, K = 10)$delta[1]
}
d = which.min(cv.error)
d
## [1] 4

Polynomial Function

# polynomial regression to the 4th degree
poly.fit = glm(wage ~ poly(age, 4), data = Wage)

# create  grid of values for age to predict wage
agelims = range(Wage$age)
age.grid = seq(from = agelims[1], to = agelims[2])

# predict wage using age
poly.preds = predict(poly.fit, newdata = list(age = age.grid), se = TRUE)

# confidence bands: fitted values plus/minus two standard errors
poly.se.bands = cbind(poly.preds$fit + 2 * poly.preds$se.fit, poly.preds$fit - 2 * poly.preds$se.fit)

summary(poly.fit)
## 
## Call:
## glm(formula = wage ~ poly(age, 4), data = Wage)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -98.707  -24.626   -4.993   15.217  203.693  
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    111.7036     0.7287 153.283  < 2e-16 ***
## poly(age, 4)1  447.0679    39.9148  11.201  < 2e-16 ***
## poly(age, 4)2 -478.3158    39.9148 -11.983  < 2e-16 ***
## poly(age, 4)3  125.5217    39.9148   3.145  0.00168 ** 
## poly(age, 4)4  -77.9112    39.9148  -1.952  0.05104 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 1593.19)
## 
##     Null deviance: 5222086  on 2999  degrees of freedom
## Residual deviance: 4771604  on 2995  degrees of freedom
## AIC: 30641
## 
## Number of Fisher Scoring iterations: 2
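
Note that because poly() uses orthogonal polynomials, each t-statistic above squares to the corresponding F-statistic in the sequential ANOVA below (e.g., (-1.952)^2 ≈ 3.81 for the quartic term), so the coefficient t-tests and the ANOVA give identical p-values.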

ANOVA Comparison of Polynomial Models

# linear model
fit.1=lm(wage~age,data=Wage)
#polynomial models
fit.2=lm(wage~poly(age,2),data=Wage) 
fit.3=lm(wage~poly(age,3),data=Wage)
fit.4=lm(wage~poly(age,4),data=Wage)
fit.5=lm(wage~poly(age,5),data=Wage)

anova(fit.1,fit.2,fit.3,fit.4,fit.5)
## Analysis of Variance Table
## 
## Model 1: wage ~ age
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
##   Res.Df     RSS Df Sum of Sq        F    Pr(>F)    
## 1   2998 5022216                                    
## 2   2997 4793430  1    228786 143.5931 < 2.2e-16 ***
## 3   2996 4777674  1     15756   9.8888  0.001679 ** 
## 4   2995 4771604  1      6070   3.8098  0.051046 .  
## 5   2994 4770322  1      1283   0.8050  0.369682    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

As the polynomial degree increases, the F-statistic decreases and the p-value increases. The degree-4 term is borderline (p ≈ 0.051), while the degree-5 term has an F-statistic below 1 and p ≈ 0.37, so ANOVA points to a cubic or quartic polynomial for the Wage data. This is consistent with the degree of 4 chosen by cross-validation.
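
For a visual check of the cross-validation result, the CV errors computed above can be plotted against degree (a quick sketch; the red point marks the selected degree d):

# CV error as a function of polynomial degree
cv.err.df = data.frame(degree = 1:10, error = cv.error)
ggplot(cv.err.df, aes(degree, error)) +
  geom_line(color = "navy") +
  geom_point(size = 2) +
  geom_point(cv.err.df[d, ], mapping = aes(degree, error), size = 2, color = "red") +
  xlab("Degree") + ylab("10-fold CV Error")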

Plot of the Polynomial Function

par(mfrow=c(1,1),mar=c(4.5,4.5,1,1),oma=c(0,0,2,0))
plot(age,wage,xlim=agelims,cex =.5,col="darkgrey")
title("Degree-4 Polynomial",outer=T)
lines(age.grid,poly.preds$fit,lwd=2,col="darkblue")
matlines(age.grid,poly.se.bands,lwd=1,col="lightblue",lty=3)

(b) Fit a step function to predict wage using age, and perform cross-validation to choose the optimal number of cuts. Make a plot of the fit obtained.

Cross-Validation


# cross-validation
cv <- rep(NA, 10)
for (i in 2:10) {
    Wage$age.cut <- cut(Wage$age, i)
    fit <- glm(wage ~ age.cut, data = Wage)
    cv[i] <- cv.glm(Wage, fit, K = 10)$delta[1]
}
which.min(cv)
## [1] 8
cv.df = as.data.frame(cbind(cv=cv[2:10], xaxis=(2:10)))
ggplot(cv.df, aes(xaxis, cv)) +
  geom_line(color = "navy") +
  geom_point(size = 2) +
  geom_point(cv.df[7, ], mapping = aes(xaxis, cv), size = 2, color = "red") +
  xlab("Cut Points")
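
The cross-validation error is minimized at 8 cuts (the red point in the plot), so the step function is fit with 8 intervals.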

Step Function

# Fit a step function
step.fit=lm(wage~cut(age, 8),data=Wage)
coef(summary(step.fit))
##                        Estimate Std. Error   t value      Pr(>|t|)
## (Intercept)            76.28175   2.629812 29.006542 3.110596e-163
## cut(age, 8)(25.8,33.5] 25.83329   3.161343  8.171618  4.440913e-16
## cut(age, 8)(33.5,41.2] 40.22568   3.049065 13.192791  1.136044e-38
## cut(age, 8)(41.2,49]   43.50112   3.018341 14.412262  1.406253e-45
## cut(age, 8)(49,56.8]   40.13583   3.176792 12.634076  1.098741e-35
## cut(age, 8)(56.8,64.5] 44.10243   3.564299 12.373380  2.481643e-34
## cut(age, 8)(64.5,72.2] 28.94825   6.041576  4.791505  1.736008e-06
## cut(age, 8)(72.2,80.1] 15.22418   9.781110  1.556488  1.196978e-01
# predict the step function
step.preds=predict(step.fit,newdata=list(age=age.grid),se=TRUE)

step.se.bands = cbind(step.preds$fit + 2 * step.preds$se.fit, step.preds$fit - 2 * step.preds$se.fit)

Plot of the Step Function

par(mfrow=c(1,1),mar=c(4.5,4.5,1,1),oma=c(0,0,2,0))
plot(age,wage,xlim=agelims,cex =.5,col="darkgrey")
title("Step Function",outer=T)
lines(age.grid,step.preds$fit,lwd=2,col="darkblue")
matlines(age.grid,step.se.bands,lwd=1,col="lightblue",lty=3)

detach(Wage)

Q.10

attach(College) 

(a) Split the data into a training set and a test set. Using out-of-state tuition as the response and the other variables as the predictors, perform forward stepwise selection on the training set in order to identify a satisfactory model that uses just a subset of the predictors.

Split Data

# Split the data into a training set and a test set
inTrain2 = createDataPartition(y = College$Outstate, list = FALSE, p=0.75)
train2 = College[inTrain2,]
test2= College[-inTrain2,]

Forward Stepwise Selection

library(leaps)

# forward stepwise selection on the training set
stepwise.fwd=regsubsets(Outstate~., data= train2, nvmax = 17,  method ="forward")
fwd.summary = summary(stepwise.fwd)
fwd.summary
## Subset selection object
## Call: regsubsets.formula(Outstate ~ ., data = train2, nvmax = 17, method = "forward")
## 17 Variables  (and intercept)
##             Forced in Forced out
## PrivateYes      FALSE      FALSE
## Apps            FALSE      FALSE
## Accept          FALSE      FALSE
## Enroll          FALSE      FALSE
## Top10perc       FALSE      FALSE
## Top25perc       FALSE      FALSE
## F.Undergrad     FALSE      FALSE
## P.Undergrad     FALSE      FALSE
## Room.Board      FALSE      FALSE
## Books           FALSE      FALSE
## Personal        FALSE      FALSE
## PhD             FALSE      FALSE
## Terminal        FALSE      FALSE
## S.F.Ratio       FALSE      FALSE
## perc.alumni     FALSE      FALSE
## Expend          FALSE      FALSE
## Grad.Rate       FALSE      FALSE
## 1 subsets of each size up to 17
## Selection Algorithm: forward
##           PrivateYes Apps Accept Enroll Top10perc Top25perc F.Undergrad
## 1  ( 1 )  " "        " "  " "    " "    " "       " "       " "        
## 2  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 3  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 4  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 5  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 6  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 7  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 8  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 9  ( 1 )  "*"        " "  "*"    " "    " "       " "       " "        
## 10  ( 1 ) "*"        "*"  "*"    " "    " "       " "       " "        
## 11  ( 1 ) "*"        "*"  "*"    "*"    " "       " "       " "        
## 12  ( 1 ) "*"        "*"  "*"    "*"    "*"       " "       " "        
## 13  ( 1 ) "*"        "*"  "*"    "*"    "*"       " "       "*"        
## 14  ( 1 ) "*"        "*"  "*"    "*"    "*"       " "       "*"        
## 15  ( 1 ) "*"        "*"  "*"    "*"    "*"       "*"       "*"        
## 16  ( 1 ) "*"        "*"  "*"    "*"    "*"       "*"       "*"        
## 17  ( 1 ) "*"        "*"  "*"    "*"    "*"       "*"       "*"        
##           P.Undergrad Room.Board Books Personal PhD Terminal S.F.Ratio
## 1  ( 1 )  " "         " "        " "   " "      " " " "      " "      
## 2  ( 1 )  " "         " "        " "   " "      " " " "      " "      
## 3  ( 1 )  " "         "*"        " "   " "      " " " "      " "      
## 4  ( 1 )  " "         "*"        " "   " "      " " " "      " "      
## 5  ( 1 )  " "         "*"        " "   " "      " " "*"      " "      
## 6  ( 1 )  " "         "*"        " "   " "      " " "*"      " "      
## 7  ( 1 )  " "         "*"        " "   "*"      " " "*"      " "      
## 8  ( 1 )  " "         "*"        " "   "*"      " " "*"      "*"      
## 9  ( 1 )  " "         "*"        " "   "*"      " " "*"      "*"      
## 10  ( 1 ) " "         "*"        " "   "*"      " " "*"      "*"      
## 11  ( 1 ) " "         "*"        " "   "*"      " " "*"      "*"      
## 12  ( 1 ) " "         "*"        " "   "*"      " " "*"      "*"      
## 13  ( 1 ) " "         "*"        " "   "*"      " " "*"      "*"      
## 14  ( 1 ) " "         "*"        "*"   "*"      " " "*"      "*"      
## 15  ( 1 ) " "         "*"        "*"   "*"      " " "*"      "*"      
## 16  ( 1 ) "*"         "*"        "*"   "*"      " " "*"      "*"      
## 17  ( 1 ) "*"         "*"        "*"   "*"      "*" "*"      "*"      
##           perc.alumni Expend Grad.Rate
## 1  ( 1 )  " "         "*"    " "      
## 2  ( 1 )  " "         "*"    " "      
## 3  ( 1 )  " "         "*"    " "      
## 4  ( 1 )  "*"         "*"    " "      
## 5  ( 1 )  "*"         "*"    " "      
## 6  ( 1 )  "*"         "*"    "*"      
## 7  ( 1 )  "*"         "*"    "*"      
## 8  ( 1 )  "*"         "*"    "*"      
## 9  ( 1 )  "*"         "*"    "*"      
## 10  ( 1 ) "*"         "*"    "*"      
## 11  ( 1 ) "*"         "*"    "*"      
## 12  ( 1 ) "*"         "*"    "*"      
## 13  ( 1 ) "*"         "*"    "*"      
## 14  ( 1 ) "*"         "*"    "*"      
## 15  ( 1 ) "*"         "*"    "*"      
## 16  ( 1 ) "*"         "*"    "*"      
## 17  ( 1 ) "*"         "*"    "*"

Plots

rsq.df = as.data.frame(cbind(xaxis=(1:17), rsq=fwd.summary$rsq))
cp.df = as.data.frame(cbind(xaxis=(1:17), cp=fwd.summary$cp))
bic.df = as.data.frame(cbind(xaxis=(1:17), bic=fwd.summary$bic))
adjrsq.df = as.data.frame(cbind(xaxis=(1:17), adjrsq=fwd.summary$adjr2))

ggplot() +
  geom_line(rsq.df, mapping = aes(xaxis, rsq), color = "navy") +
  geom_point(rsq.df, mapping = aes(xaxis, rsq), color = "navy", size = 2)

ggplot() + 
  geom_line(cp.df, mapping=aes(xaxis, cp),color = "black") + 
  geom_point(cp.df, mapping=aes(xaxis, cp),color = "black", size=2) 

ggplot() + 
  geom_line(bic.df, mapping=aes(xaxis, bic),color = "maroon") + 
  geom_point(bic.df, mapping=aes(xaxis, bic),color = "maroon", size=2)

ggplot() + 
  geom_line(adjrsq.df, mapping=aes(xaxis, adjrsq),color = "darkred") + 
  geom_point(adjrsq.df, mapping=aes(xaxis, adjrsq),color = "darkred", size=2)
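
Adjusted R², Cp, and BIC all improve sharply out to roughly six predictors and flatten beyond that. From the selection output above, the first six variables to enter are Expend, Private, Room.Board, perc.alumni, Terminal, and Grad.Rate. The optima can also be read off numerically (a quick check; exact sizes depend on the random split):

# model sizes favored by each criterion
which.min(fwd.summary$cp)
which.min(fwd.summary$bic)
which.max(fwd.summary$adjr2)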

(b) Fit a GAM on the training data, using out-of-state tuition as the response and the features selected in the previous step as the predictors. Plot the results, and explain your findings.

library(gam)

gam.clg = gam(
  Outstate ~ Private + s(Apps, 4) + s(Accept, 5) + s(F.Undergrad, 5) + s(P.Undergrad, 4) + s(Room.Board, 5), data = train2)

summary(gam.clg)
## 
## Call: gam(formula = Outstate ~ Private + s(Apps, 4) + s(Accept, 
##     5) + s(F.Undergrad, 5) + s(P.Undergrad, 4) + s(Room.Board, 
##     5), data = train2)
## Deviance Residuals:
##      Min       1Q   Median       3Q      Max 
## -8348.57 -1457.01    39.43  1612.54 10911.40 
## 
## (Dispersion Parameter for gaussian family taken to be 5520530)
## 
##     Null Deviance: 9448773184 on 583 degrees of freedom
## Residual Deviance: 3085975073 on 558.9998 degrees of freedom
## AIC: 10749.78 
## 
## Number of Local Scoring Iterations: NA 
## 
## Anova for Parametric Effects
##                    Df     Sum Sq    Mean Sq  F value    Pr(>F)    
## Private             1 2772355946 2772355946 502.1902 < 2.2e-16 ***
## s(Apps, 4)          1  975667155  975667155 176.7343 < 2.2e-16 ***
## s(Accept, 5)        1   58030536   58030536  10.5118  0.001257 ** 
## s(F.Undergrad, 5)   1  343657509  343657509  62.2508 1.601e-14 ***
## s(P.Undergrad, 4)   1   51137176   51137176   9.2631  0.002448 ** 
## s(Room.Board, 5)    1 1258974392 1258974392 228.0532 < 2.2e-16 ***
## Residuals         559 3085975073    5520530                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Anova for Nonparametric Effects
##                   Npar Df  Npar F     Pr(F)    
## (Intercept)                                    
## Private                                        
## s(Apps, 4)              3 11.7634 1.753e-07 ***
## s(Accept, 5)            4  9.0412 4.417e-07 ***
## s(F.Undergrad, 5)       4  3.6128  0.006415 ** 
## s(P.Undergrad, 4)       3 17.5085 7.086e-11 ***
## s(Room.Board, 5)        4  3.6275  0.006256 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Plots

plot(gam.clg, se = TRUE, col = "blue")
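
Each panel shows one fitted term with standard-error bands. Reading the plots (with the caveat that exact shapes depend on the split): out-of-state tuition is markedly higher for private colleges, rises with Room.Board, and several of the smooths (notably Apps and Accept) bend rather than follow a straight line, in keeping with the significant nonparametric effects in the summary above.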

(c) Evaluate the model obtained on the test set, and explain the results obtained.

preds = predict(gam.clg, test2)
RSS = sum((test2$Outstate - preds)^2) 
TSS = sum((test2$Outstate - mean(test2$Outstate))^2)
1 - (RSS / TSS)
## [1] 0.6675091
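
The test-set R² is about 0.67, i.e. the GAM explains roughly two-thirds of the variance in out-of-state tuition on held-out data. For an error measure on the original dollar scale, the test RMSE can be computed from the same objects (a quick sketch; the exact value depends on the random split):

# test-set RMSE in dollars
sqrt(mean((test2$Outstate - preds)^2))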

(d) For which variables, if any, is there evidence of a non-linear relationship with the response?


The "Anova for Nonparametric Effects" table in the GAM summary above shows strong evidence of a non-linear relationship between Outstate and Apps, Accept, and P.Undergrad (p-values below 1e-06), and more moderate evidence of non-linearity for F.Undergrad and Room.Board (p-values around 0.006).