library(ISLR)
library(boot)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'lattice'
## The following object is masked from 'package:boot':
##
## melanoma
## Loading required package: ggplot2
library(gam)
## Loading required package: splines
## Loading required package: foreach
## Loaded gam 1.20
library(leaps)
# Question 6
6a.
data(Wage)
set.seed(7)
# 10-fold CV error for polynomial degrees 1 through 10
cv.error = rep(NA, 10)
for (i in 1:10) {
  glm.fit = glm(wage ~ poly(age, i), data = Wage)
  cv.error[i] = cv.glm(Wage, glm.fit, K = 10)$delta[1]
}
print(cv.error)
## [1] 1677.629 1600.838 1595.967 1593.759 1595.406 1594.984 1595.637 1595.991
## [9] 1593.700 1594.877
plot(1:10, cv.error, xlab = "Degree of Polynomial", ylab = "CV Error", type = "l")
points(which.min(cv.error), cv.error[which.min(cv.error)], col = "red", cex = 2, pch = 20)
From the line graph, d = 9 gives the minimum CV error, but the curve is essentially flat from d = 3 onward, so the lower-degree fits perform almost as well.
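As a quick numeric check (a sketch using the cv.error vector computed above):
d.min <- which.min(cv.error)         # degree with the lowest CV error (9 here)
round(cv.error - min(cv.error), 2)   # gap to the minimum; tiny from d = 3 onward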
# Fit polynomials of degree 1 through 10 for a sequential ANOVA comparison
set.seed(7)
fit.1 = lm(wage~poly(age, 1), data=Wage)
fit.2 = lm(wage~poly(age, 2), data=Wage)
fit.3 = lm(wage~poly(age, 3), data=Wage)
fit.4 = lm(wage~poly(age, 4), data=Wage)
fit.5 = lm(wage~poly(age, 5), data=Wage)
fit.6 = lm(wage~poly(age, 6), data=Wage)
fit.7 = lm(wage~poly(age, 7), data=Wage)
fit.8 = lm(wage~poly(age, 8), data=Wage)
fit.9 = lm(wage~poly(age, 9), data=Wage)
fit.10 = lm(wage~poly(age, 10), data=Wage)
anova(fit.1, fit.2, fit.3, fit.4, fit.5, fit.6, fit.7, fit.8, fit.9, fit.10)
## Analysis of Variance Table
##
## Model 1: wage ~ poly(age, 1)
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
## Model 6: wage ~ poly(age, 6)
## Model 7: wage ~ poly(age, 7)
## Model 8: wage ~ poly(age, 8)
## Model 9: wage ~ poly(age, 9)
## Model 10: wage ~ poly(age, 10)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2998 5022216
## 2 2997 4793430 1 228786 143.7638 < 2.2e-16 ***
## 3 2996 4777674 1 15756 9.9005 0.001669 **
## 4 2995 4771604 1 6070 3.8143 0.050909 .
## 5 2994 4770322 1 1283 0.8059 0.369398
## 6 2993 4766389 1 3932 2.4709 0.116074
## 7 2992 4763834 1 2555 1.6057 0.205199
## 8 2991 4763707 1 127 0.0796 0.777865
## 9 2990 4756703 1 7004 4.4014 0.035994 *
## 10 2989 4756701 1 3 0.0017 0.967529
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
From the sequential ANOVA, the cubic term is highly significant (p ≈ 0.002) and the degree-9 term is marginally significant (p ≈ 0.036), so a degree-3 or degree-9 polynomial gives a reasonable fit to the data.
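Because poly() uses orthogonal polynomials, the same sequential tests can be read off the coefficient table of the largest fit: each squared t value reproduces the corresponding F statistic in the ANOVA above (ignoring the intercept row). A minimal check:
coef(summary(fit.10))[, "t value"]^2   # squared t values match the ANOVA F column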
# Degree-3 polynomial fit plotted over the observed age range
plot(wage ~ age, data = Wage, col = "darkgrey")
agelims = range(Wage$age)
age.grid = seq(from=agelims[1], to=agelims[2])
lm.fit = lm(wage~poly(age, 3), data=Wage)
lm.pred = predict(lm.fit, data.frame(age=age.grid))
lines(age.grid, lm.pred, col="blue", lwd=2)
# Degree-9 polynomial fit plotted over the observed age range
plot(wage ~ age, data = Wage, col = "darkgrey")
agelims = range(Wage$age)
age.grid = seq(from=agelims[1], to=agelims[2])
lm.fit = lm(wage~poly(age, 9), data=Wage)
lm.pred = predict(lm.fit, data.frame(age=age.grid))
lines(age.grid, lm.pred, col="blue", lwd=2)
6b.
set.seed(7)
# NA (not 0) so the unused first slot cannot win which.min() below
cvs <- rep(NA, 10)
for (i in 2:10) {
  Wage$age.cut <- cut(Wage$age, i)
  fit <- glm(wage ~ age.cut, data = Wage)
  cvs[i] <- cv.glm(Wage, fit, K = 10)$delta[1]
}
plot(2:10, cvs[-1], xlab = "Cuts", ylab = "Cross-validation Error", type = "l")
points(which.min(cvs), cvs[which.min(cvs)], col = "red", cex = 2, pch = 20)
From the line plot, 8 cuts (intervals) give the lowest cross-validation error.
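A one-line confirmation using the cvs vector from above:
which.min(cvs)   # returns 8, the number of intervals minimizing the CV error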
# Step-function fit with the selected 8 intervals
plot(wage ~ age, data = Wage, col = "darkgrey")
agelims <- range(Wage$age)
age.grid <- seq(from = agelims[1], to = agelims[2])
fit <- glm(wage ~ cut(age, 8), data = Wage)
preds <- predict(fit, data.frame(age = age.grid))
lines(age.grid, preds, col = "red", lwd = 2)
# Question 10
10a.
data(College)
set.seed(7)
college.Index <- createDataPartition(y = College$Outstate,
                                     p = 0.8, list = FALSE, times = 1)
collegeTrain<-College[college.Index,]
collegeTest<-College[-college.Index,]
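A quick sanity check on the split (College has 777 rows, so roughly 80% of them, about 623, should land in the training set):
dim(collegeTrain)   # training rows and columns
dim(collegeTest)    # held-out test rows and columns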
set.seed(7)
# Forward stepwise selection on the training set (17 predictors available)
regfit.full = regsubsets(Outstate ~ ., data = collegeTrain, method = 'forward', nvmax = 20)
summary(regfit.full)
## Subset selection object
## Call: regsubsets.formula(Outstate ~ ., data = collegeTrain, method = "forward",
## nvmax = 20)
## 17 Variables (and intercept)
## Forced in Forced out
## PrivateYes FALSE FALSE
## Apps FALSE FALSE
## Accept FALSE FALSE
## Enroll FALSE FALSE
## Top10perc FALSE FALSE
## Top25perc FALSE FALSE
## F.Undergrad FALSE FALSE
## P.Undergrad FALSE FALSE
## Room.Board FALSE FALSE
## Books FALSE FALSE
## Personal FALSE FALSE
## PhD FALSE FALSE
## Terminal FALSE FALSE
## S.F.Ratio FALSE FALSE
## perc.alumni FALSE FALSE
## Expend FALSE FALSE
## Grad.Rate FALSE FALSE
## 1 subsets of each size up to 17
## Selection Algorithm: forward
## PrivateYes Apps Accept Enroll Top10perc Top25perc F.Undergrad
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) "*" " " " " " " " " " " " "
## 3 ( 1 ) "*" " " " " " " " " " " " "
## 4 ( 1 ) "*" " " " " " " " " " " " "
## 5 ( 1 ) "*" " " " " " " " " " " " "
## 6 ( 1 ) "*" " " " " " " " " " " " "
## 7 ( 1 ) "*" " " " " " " " " " " " "
## 8 ( 1 ) "*" " " " " " " " " " " " "
## 9 ( 1 ) "*" " " "*" " " " " " " " "
## 10 ( 1 ) "*" " " "*" "*" " " " " " "
## 11 ( 1 ) "*" "*" "*" "*" " " " " " "
## 12 ( 1 ) "*" "*" "*" "*" "*" " " " "
## 13 ( 1 ) "*" "*" "*" "*" "*" " " " "
## 14 ( 1 ) "*" "*" "*" "*" "*" " " "*"
## 15 ( 1 ) "*" "*" "*" "*" "*" " " "*"
## 16 ( 1 ) "*" "*" "*" "*" "*" " " "*"
## 17 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## P.Undergrad Room.Board Books Personal PhD Terminal S.F.Ratio
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " "
## 3 ( 1 ) " " "*" " " " " " " " " " "
## 4 ( 1 ) " " "*" " " " " " " " " " "
## 5 ( 1 ) " " "*" " " " " "*" " " " "
## 6 ( 1 ) " " "*" " " " " "*" " " " "
## 7 ( 1 ) " " "*" " " "*" "*" " " " "
## 8 ( 1 ) " " "*" " " "*" "*" " " "*"
## 9 ( 1 ) " " "*" " " "*" "*" " " "*"
## 10 ( 1 ) " " "*" " " "*" "*" " " "*"
## 11 ( 1 ) " " "*" " " "*" "*" " " "*"
## 12 ( 1 ) " " "*" " " "*" "*" " " "*"
## 13 ( 1 ) " " "*" " " "*" "*" "*" "*"
## 14 ( 1 ) " " "*" " " "*" "*" "*" "*"
## 15 ( 1 ) " " "*" "*" "*" "*" "*" "*"
## 16 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## 17 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## perc.alumni Expend Grad.Rate
## 1 ( 1 ) " " "*" " "
## 2 ( 1 ) " " "*" " "
## 3 ( 1 ) " " "*" " "
## 4 ( 1 ) " " "*" "*"
## 5 ( 1 ) " " "*" "*"
## 6 ( 1 ) "*" "*" "*"
## 7 ( 1 ) "*" "*" "*"
## 8 ( 1 ) "*" "*" "*"
## 9 ( 1 ) "*" "*" "*"
## 10 ( 1 ) "*" "*" "*"
## 11 ( 1 ) "*" "*" "*"
## 12 ( 1 ) "*" "*" "*"
## 13 ( 1 ) "*" "*" "*"
## 14 ( 1 ) "*" "*" "*"
## 15 ( 1 ) "*" "*" "*"
## 16 ( 1 ) "*" "*" "*"
## 17 ( 1 ) "*" "*" "*"
reg.summary=summary(regfit.full)
reg.summary$rss
## [1] 5409486904 3967921849 3198637338 2861982908 2646023259 2537067195
## [7] 2507910353 2488391448 2467278495 2350019084 2319927210 2306603884
## [13] 2295828624 2287107972 2285053714 2284758514 2284543501
plot(reg.summary$rss,xlab="Number of Variables",ylab="RSS",type="l")
Training RSS necessarily decreases as variables are added, so the 17-variable model has the smallest RSS by construction; the curve flattens after about six variables, which motivates the compact six-variable model examined below.
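Because training RSS cannot choose a size on its own, the penalized criteria returned by summary.regsubsets are worth a look (a sketch using the reg.summary object above):
which.max(reg.summary$adjr2)   # size favored by adjusted R^2
which.min(reg.summary$cp)      # size favored by Mallows' Cp
which.min(reg.summary$bic)     # size favored by BIC, typically the most parsimonious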
coef(regfit.full, id = 6)
## (Intercept) PrivateYes Room.Board PhD perc.alumni
## -3700.3613840 2667.0260439 0.9496997 38.1529684 42.4867544
## Expend Grad.Rate
## 0.2278933 33.3056352
forward.mod = lm(Outstate ~ Private + Room.Board + PhD + perc.alumni +
Expend + Grad.Rate, data = collegeTrain)
summary(forward.mod)
##
## Call:
## lm(formula = Outstate ~ Private + Room.Board + PhD + perc.alumni +
## Expend + Grad.Rate, data = collegeTrain)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7528.8 -1294.7 -109.3 1237.5 10541.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.700e+03 4.746e+02 -7.797 2.72e-14 ***
## PrivateYes 2.667e+03 2.321e+02 11.489 < 2e-16 ***
## Room.Board 9.497e-01 9.483e-02 10.015 < 2e-16 ***
## PhD 3.815e+01 6.014e+00 6.344 4.34e-10 ***
## perc.alumni 4.249e+01 8.260e+00 5.143 3.63e-07 ***
## Expend 2.279e-01 2.056e-02 11.085 < 2e-16 ***
## Grad.Rate 3.331e+01 5.946e+00 5.601 3.21e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2029 on 616 degrees of freedom
## Multiple R-squared: 0.7473, Adjusted R-squared: 0.7448
## F-statistic: 303.6 on 6 and 616 DF, p-value: < 2.2e-16
All six predictors retained by forward selection are highly significant in this linear model.
10b.
gam.fit = gam(Outstate ~ Private + s(Room.Board) + s(PhD) + s(perc.alumni) +
                s(Expend) + s(Grad.Rate), data = collegeTrain)
par(mfrow = c(2, 3))
plot(gam.fit,se=T,col='blue')
10c.
Test MSE for the GAM:
mean((predict(gam.fit, newdata = collegeTest) - collegeTest$Outstate)^2)
## [1] 3525787
Test MSE for the forward-selection linear model:
mean((predict(forward.mod, newdata = collegeTest) - collegeTest$Outstate)^2)
## [1] 4204301
The GAM attains a lower test MSE (about 3.53 million) than the forward-selection linear model (about 4.20 million).
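To put the two errors on a common scale, test-set R^2 can be computed from the same predictions (a sketch; err.gam, err.lm, and tss are names introduced here):
err.gam <- mean((predict(gam.fit, newdata = collegeTest) - collegeTest$Outstate)^2)
err.lm  <- mean((predict(forward.mod, newdata = collegeTest) - collegeTest$Outstate)^2)
tss     <- mean((collegeTest$Outstate - mean(collegeTest$Outstate))^2)
1 - err.gam / tss   # test R^2 for the GAM
1 - err.lm / tss    # test R^2 for the linear model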
10d.
summary(gam.fit)
##
## Call: gam(formula = Outstate ~ Private + s(Room.Board) + s(PhD) + s(perc.alumni) +
## s(Expend) + s(Grad.Rate), data = collegeTrain)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -7858.72 -1096.61 -5.51 1214.18 7989.77
##
## (Dispersion Parameter for gaussian family taken to be 3468935)
##
## Null Deviance: 10038612702 on 622 degrees of freedom
## Residual Deviance: 2084832235 on 601.0006 degrees of freedom
## AIC: 11173.58
##
## Number of Local Scoring Iterations: NA
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## Private 1 2577999460 2577999460 743.17 < 2.2e-16 ***
## s(Room.Board) 1 1970010191 1970010191 567.90 < 2.2e-16 ***
## s(PhD) 1 737262125 737262125 212.53 < 2.2e-16 ***
## s(perc.alumni) 1 379073222 379073222 109.28 < 2.2e-16 ***
## s(Expend) 1 796019016 796019016 229.47 < 2.2e-16 ***
## s(Grad.Rate) 1 133346145 133346145 38.44 1.05e-09 ***
## Residuals 601 2084832235 3468935
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## Private
## s(Room.Board) 3 3.7280 0.01125 *
## s(PhD) 3 2.5940 0.05174 .
## s(perc.alumni) 3 1.6803 0.17005
## s(Expend) 3 29.0774 < 2e-16 ***
## s(Grad.Rate) 3 2.3793 0.06873 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
From the ANOVA for nonparametric effects, Expend shows a strongly non-linear effect (p < 2e-16) and Room.Board a mildly non-linear one (p ≈ 0.011); PhD and Grad.Rate are only marginally non-linear, and perc.alumni shows no evidence of non-linearity.
From the ANOVA for parametric effects, every predictor has a highly significant linear component; note that a small p-value here indicates a significant linear term, not that the relationship is purely linear.
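One way to corroborate the non-linearity of Expend is to refit the GAM with Expend entered linearly and compare the nested fits with an F-test (a sketch; gam.lin is a name introduced here):
gam.lin <- gam(Outstate ~ Private + s(Room.Board) + s(PhD) + s(perc.alumni) +
                 Expend + s(Grad.Rate), data = collegeTrain)
anova(gam.lin, gam.fit, test = "F")   # a small p-value favors the smooth s(Expend)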