Assignment #6

Exercises

Applied

6. In this exercise, you will further analyze the Wage data set considered throughout this chapter.

library(ISLR)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.0.5     v dplyr   1.0.3
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
attach(Wage)
str(Wage)
## 'data.frame':    3000 obs. of  11 variables:
##  $ year      : int  2006 2004 2003 2003 2005 2008 2009 2008 2006 2004 ...
##  $ age       : int  18 24 45 43 50 54 44 30 41 52 ...
##  $ maritl    : Factor w/ 5 levels "1. Never Married",..: 1 1 2 2 4 2 2 1 1 2 ...
##  $ race      : Factor w/ 4 levels "1. White","2. Black",..: 1 1 1 3 1 1 4 3 2 1 ...
##  $ education : Factor w/ 5 levels "1. < HS Grad",..: 1 4 3 4 2 4 3 3 3 2 ...
##  $ region    : Factor w/ 9 levels "1. New England",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ jobclass  : Factor w/ 2 levels "1. Industrial",..: 1 2 1 2 2 2 1 2 2 2 ...
##  $ health    : Factor w/ 2 levels "1. <=Good","2. >=Very Good": 1 2 1 2 1 2 2 1 2 2 ...
##  $ health_ins: Factor w/ 2 levels "1. Yes","2. No": 2 2 1 1 1 1 1 1 1 1 ...
##  $ logwage   : num  4.32 4.26 4.88 5.04 4.32 ...
##  $ wage      : num  75 70.5 131 154.7 75 ...

(a) Perform polynomial regression to predict wage using age. Use cross-validation to select the optimal degree d for the polynomial. What degree was chosen, and how does this compare to the results of hypothesis testing using ANOVA? Make a plot of the resulting polynomial fit to the data.

ctrl = trainControl(method = "cv", number = 10)
cv.error = c()
set.seed(123)
for (i in 1:10){
  # 10-fold CV RMSE for a polynomial of degree i in age
  lm.fit = train(y = Wage$wage,
                 x = poly(Wage$age, i, raw = TRUE),
                 method = "lm",
                 metric = "RMSE",
                 trControl = ctrl)
  cv.error[i] = lm.fit$results$RMSE
}
data_frame = data.frame(degree = 1:10, CV_RMSE = cv.error)
data_frame
##    degree  CV_RMSE
## 1       1 40.95455
## 2       2 39.53766
## 3       3 39.78744
## 4       4 40.30738
## 5       5 39.84529
## 6       6 40.21556
## 7       7 40.39497
## 8       8 39.84910
## 9       9 39.78724
## 10     10 40.42590
min(cv.error)
## [1] 39.53766
data_frame %>%
  mutate(min_cv_error = as.numeric(min(CV_RMSE) == CV_RMSE)) %>%
  ggplot(aes(x = degree, y = CV_RMSE)) +
  geom_line(col = "black") +
  geom_point(size = 2, aes(col = factor(min_cv_error))) +
  scale_x_continuous(breaks = seq(1, 10), minor_breaks = NULL) +
  theme(legend.position = "none") +
  labs(title = "Polynomial degree with Cross-validation",
       x = "Degree",
       y = "CV RMSE")

The optimal degree d for the polynomial according to cross-validation is 2.
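This can be double-checked programmatically; a quick sanity check, reusing the cv.error vector computed above:

which.min(cv.error)  # index corresponds to the polynomial degree with the lowest CV RMSE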

lm.fit.1 <- lm(wage ~ age, data = Wage)
lm.fit.2 <- lm(wage ~ poly(age, 2, raw = T), data = Wage)
lm.fit.3 <- lm(wage ~ poly(age, 3, raw = T), data = Wage)
lm.fit.4 <- lm(wage ~ poly(age, 4, raw = T), data = Wage)
lm.fit.5 <- lm(wage ~ poly(age, 5, raw = T), data = Wage)

anova(lm.fit.1, lm.fit.2, lm.fit.3, lm.fit.4, lm.fit.5)
## Analysis of Variance Table
## 
## Model 1: wage ~ age
## Model 2: wage ~ poly(age, 2, raw = T)
## Model 3: wage ~ poly(age, 3, raw = T)
## Model 4: wage ~ poly(age, 4, raw = T)
## Model 5: wage ~ poly(age, 5, raw = T)
##   Res.Df     RSS Df Sum of Sq        F    Pr(>F)    
## 1   2998 5022216                                    
## 2   2997 4793430  1    228786 143.5931 < 2.2e-16 ***
## 3   2996 4777674  1     15756   9.8888  0.001679 ** 
## 4   2995 4771604  1      6070   3.8098  0.051046 .  
## 5   2994 4770322  1      1283   0.8050  0.369682    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

According to the ANOVA, the cubic term is highly significant (p ≈ 0.002), the quartic term is borderline (p ≈ 0.05), and the quintic term is not significant, so either Model 3 or Model 4 (a degree-3 or degree-4 polynomial) is a reasonable choice. This is somewhat more flexible than the degree-2 model selected by cross-validation.

ggplot(Wage, aes(x = age, y = wage)) + 
  geom_point(alpha = 0.3) + 
  geom_smooth(method = "lm", formula = "y ~ poly(x, 2, raw = T)") + 
  labs(title = "Polynomial Regression",
       subtitle = "Predicting 'wage' with a 2nd Degree Polynomial of 'age'")

ggplot(Wage, aes(x = age, y = wage)) + 
  geom_point(alpha = 0.3) + 
  geom_smooth(method = "lm", formula = "y ~ poly(x, 3, raw = T)") + 
  labs(title = "Polynomial Regression",
       subtitle = "Predicting 'wage' with a 3er Degree Polynomial of 'age'")

ggplot(Wage, aes(x = age, y = wage)) + 
  geom_point(alpha = 0.3) + 
  geom_smooth(method = "lm", formula = "y ~ poly(x, 4, raw = T)") + 
  labs(title = "Polynomial Regression",
       subtitle = "Predicting 'wage' with a 4th Degree Polynomial of 'age'")

(b) Fit a step function to predict wage using age, and perform cross-validation to choose the optimal number of cuts. Make a plot of the fit obtained.

cv.error_step = c()
set.seed(123)
for (i in 2:20){
  # 10-fold CV RMSE for a step function of age with i intervals
  lm.fit_step = train(y = Wage$wage,
                      x = data.frame(cut(Wage$age, i)),
                      method = "lm",
                      metric = "RMSE",
                      trControl = ctrl)
  cv.error_step[i-1] = lm.fit_step$results$RMSE
}
data_frame_step = data.frame(cuts = 2:20, CV_RMSE = cv.error_step)
data_frame_step
##    cuts  CV_RMSE
## 1     2 41.71835
## 2     3 40.52840
## 3     4 40.29139
## 4     5 40.77998
## 5     6 40.22279
## 6     7 40.44401
## 7     8 40.47687
## 8     9 40.03964
## 9    10 39.92958
## 10   11 40.50768
## 11   12 39.94227
## 12   13 39.71342
## 13   14 40.56430
## 14   15 39.83774
## 15   16 40.18459
## 16   17 40.08346
## 17   18 40.12602
## 18   19 40.01920
## 19   20 39.95358

Cross-validation selects 13 intervals (minimum CV RMSE ≈ 39.71), so the step function is fit with 13 cuts of age.

table(cut(Wage$age, 13))
## 
## (17.9,22.8] (22.8,27.5] (27.5,32.3] (32.3,37.1] (37.1,41.8] (41.8,46.6] 
##          98         233         332         381         377         454 
## (46.6,51.4] (51.4,56.2] (56.2,60.9] (60.9,65.7] (65.7,70.5] (70.5,75.2] 
##         451         326         175         109          35          21 
## (75.2,80.1] 
##           8
data_frame_step %>%
  mutate(min_cv_error = as.numeric(min(CV_RMSE) == CV_RMSE)) %>%
  ggplot(aes(x = cuts, y = CV_RMSE)) +
  geom_line(col = "black") +
  geom_point(size = 2, aes(col = factor(min_cv_error))) +
  scale_x_continuous(breaks = seq(2, 20), minor_breaks = NULL) +
  theme(legend.position = "none") +
  labs(title = "Cut points with Cross-validation",
       x = "Intervals",
       y = "CV RMSE")

ggplot(Wage, aes(x = age, y = wage)) + 
  geom_point(alpha = 0.3) + 
  geom_smooth(method = "lm", formula = "y ~ cut(x, 13)") + 
  labs(title = "Step Function",
       subtitle = "Predicting 'wage' with a 13-interval step function of 'age'")

10. This question relates to the College data set.

attach(College)
str(College)
## 'data.frame':    777 obs. of  18 variables:
##  $ Private    : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Apps       : num  1660 2186 1428 417 193 ...
##  $ Accept     : num  1232 1924 1097 349 146 ...
##  $ Enroll     : num  721 512 336 137 55 158 103 489 227 172 ...
##  $ Top10perc  : num  23 16 22 60 16 38 17 37 30 21 ...
##  $ Top25perc  : num  52 29 50 89 44 62 45 68 63 44 ...
##  $ F.Undergrad: num  2885 2683 1036 510 249 ...
##  $ P.Undergrad: num  537 1227 99 63 869 ...
##  $ Outstate   : num  7440 12280 11250 12960 7560 ...
##  $ Room.Board : num  3300 6450 3750 5450 4120 ...
##  $ Books      : num  450 750 400 450 800 500 500 450 300 660 ...
##  $ Personal   : num  2200 1500 1165 875 1500 ...
##  $ PhD        : num  70 29 53 92 76 67 90 89 79 40 ...
##  $ Terminal   : num  78 30 66 97 72 73 93 100 84 41 ...
##  $ S.F.Ratio  : num  18.1 12.2 12.9 7.7 11.9 9.4 11.5 13.7 11.3 11.5 ...
##  $ perc.alumni: num  12 16 30 37 2 11 26 37 23 15 ...
##  $ Expend     : num  7041 10527 8735 19016 10922 ...
##  $ Grad.Rate  : num  60 56 54 59 15 55 63 73 80 52 ...

(a) Split the data into a training set and a test set. Using out-of-state tuition as the response and the other variables as the predictors, perform forward step-wise selection on the training set in order to identify a satisfactory model that uses just a subset of the predictors.

set.seed(1)
trainIndex <- createDataPartition(College$Outstate, p = 0.7, list = FALSE)
train <- College[trainIndex, ]
test <- College[-trainIndex, ]
set.seed(42)
model_null <- lm(Outstate ~ 1, data = train)
model_full <- lm(Outstate ~ ., data = train)
model_forward <- step(model_null, direction = "forward", scope = formula(model_full), trace = 0)
summary(model_forward)
## 
## Call:
## lm(formula = Outstate ~ Expend + Private + Room.Board + perc.alumni + 
##     PhD + Grad.Rate + Terminal + S.F.Ratio + Personal + Top10perc, 
##     data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6357.4 -1257.9   -56.9  1285.6  5784.1 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.247e+03  8.707e+02  -2.581 0.010124 *  
## Expend       1.417e-01  2.345e-02   6.043 2.84e-09 ***
## PrivateYes   2.801e+03  2.486e+02  11.265  < 2e-16 ***
## Room.Board   8.855e-01  9.706e-02   9.123  < 2e-16 ***
## perc.alumni  4.136e+01  8.703e+00   4.752 2.59e-06 ***
## PhD          2.583e+01  1.066e+01   2.423 0.015743 *  
## Grad.Rate    2.433e+01  6.584e+00   3.695 0.000242 ***
## Terminal     2.473e+01  1.132e+01   2.185 0.029343 *  
## S.F.Ratio   -6.897e+01  3.032e+01  -2.275 0.023309 *  
## Personal    -3.028e-01  1.278e-01  -2.370 0.018148 *  
## Top10perc    1.365e+01  6.962e+00   1.961 0.050401 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1918 on 535 degrees of freedom
## Multiple R-squared:  0.7725, Adjusted R-squared:  0.7682 
## F-statistic: 181.6 on 10 and 535 DF,  p-value: < 2.2e-16
model_forward
## 
## Call:
## lm(formula = Outstate ~ Expend + Private + Room.Board + perc.alumni + 
##     PhD + Grad.Rate + Terminal + S.F.Ratio + Personal + Top10perc, 
##     data = train)
## 
## Coefficients:
## (Intercept)       Expend   PrivateYes   Room.Board  perc.alumni          PhD  
##  -2246.9583       0.1417    2800.7577       0.8855      41.3565      25.8289  
##   Grad.Rate     Terminal    S.F.Ratio     Personal    Top10perc  
##     24.3273      24.7326     -68.9744      -0.3027      13.6529
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
car::vif(model_forward)
##      Expend     Private  Room.Board perc.alumni         PhD   Grad.Rate 
##    2.560131    1.893810    1.665366    1.736361    4.179037    1.826930 
##    Terminal   S.F.Ratio    Personal   Top10perc 
##    3.930903    1.927617    1.201035    2.279483
ctrl <- trainControl(method = "repeatedcv", 
                     number = 10, 
                     repeats = 1, 
                     selectionFunction = "oneSE")

set.seed(11)

forward <- train(Outstate ~ .,
                 data = train,
                 method = "leapForward",
                 metric = "RMSE",
                 maximize = FALSE,
                 trControl = ctrl,
                 tuneGrid = data.frame(nvmax = 1:17))
forward
## Linear Regression with Forward Selection 
## 
## 546 samples
##  17 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1 times) 
## Summary of sample sizes: 491, 492, 494, 492, 491, 490, ... 
## Resampling results across tuning parameters:
## 
##   nvmax  RMSE      Rsquared   MAE     
##    1     3145.790  0.4213291  2490.437
##    2     2542.574  0.6128139  1919.293
##    3     2318.515  0.6697868  1771.760
##    4     2140.920  0.7126656  1674.759
##    5     1991.787  0.7512806  1572.973
##    6     1949.028  0.7620367  1545.758
##    7     1964.222  0.7581387  1556.702
##    8     1960.076  0.7591031  1563.759
##    9     1943.231  0.7633189  1550.525
##   10     1925.727  0.7671333  1533.877
##   11     1928.084  0.7663182  1539.351
##   12     1932.577  0.7655672  1531.555
##   13     1905.470  0.7716630  1517.793
##   14     1897.805  0.7735142  1513.250
##   15     1902.198  0.7725648  1516.383
##   16     1904.340  0.7721352  1517.022
##   17     1904.639  0.7720839  1516.796
## 
## RMSE was used to select the optimal model using  the one SE rule.
## The final value used for the model was nvmax = 6.
coef(forward$finalModel, id = 6)
##   (Intercept)    PrivateYes    Room.Board           PhD   perc.alumni 
## -3896.4692078  2996.1334413     0.9113605    46.2771415    50.9134890 
##        Expend     Grad.Rate 
##     0.1822738    28.3550059
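
The six predictors chosen by the one-standard-error rule can also be refit as an ordinary least-squares model on the training set, to inspect the coefficients on their original scale; a sketch based on the coefficients listed above (the name model_fwd6 is arbitrary):

model_fwd6 <- lm(Outstate ~ Private + Room.Board + PhD + perc.alumni + Expend + Grad.Rate,
                 data = train)
summary(model_fwd6)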

(b) Fit a GAM on the training data, using out-of-state tuition as the response and the features selected in the previous step as the predictors. Plot the results, and explain your findings.

library(gam)
## Loading required package: splines
## Loading required package: foreach
## 
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
## 
##     accumulate, when
## Loaded gam 1.20

First, a GAM is fit using the ten predictors chosen by step() above, with smoothing splines for the quantitative variables:

model_gam1 <- gam(Outstate ~ Private + s(Room.Board) + s(PhD) + s(perc.alumni) +
                    s(Expend) + s(Grad.Rate) + s(Terminal) + s(S.F.Ratio) +
                    s(Personal) + s(Top10perc),
                  data = train)

par(mfrow = c(5, 2))
plot(model_gam1, se = T, col = "blue")

A second, more parsimonious GAM is fit using only the six predictors selected by forward selection in part (a):

model_gam2 <- gam(Outstate ~ Private + s(Room.Board) + s(PhD) + s(perc.alumni) +
                    s(Expend) + s(Grad.Rate), data = train)

par(mfrow = c(2, 3))
plot(model_gam2, se = T, col = "blue")

anova(model_gam1, model_gam2, test = "F")
## Analysis of Deviance Table
## 
## Model 1: Outstate ~ Private + s(Room.Board) + s(PhD) + s(perc.alumni) + 
##     s(Expend) + s(Grad.Rate) + s(Terminal) + s(S.F.Ratio) + s(Personal) + 
##     s(Top10perc)
## Model 2: Outstate ~ Private + s(Room.Board) + s(PhD) + s(perc.alumni) + 
##     s(Expend) + s(Grad.Rate)
##   Resid. Df Resid. Dev  Df   Deviance      F   Pr(>F)   
## 1       508 1575791119                                  
## 2       524 1693648333 -16 -117857213 2.3747 0.002003 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Although the F-test indicates that the larger GAM fits the training data significantly better (p ≈ 0.002), the rest of the analysis uses the smaller model_gam2, which is restricted to the predictors selected in part (a).

summary(model_gam2)
## 
## Call: gam(formula = Outstate ~ Private + s(Room.Board) + s(PhD) + s(perc.alumni) + 
##     s(Expend) + s(Grad.Rate), data = train)
## Deviance Residuals:
##      Min       1Q   Median       3Q      Max 
## -6980.56 -1188.05    20.76  1261.21  4694.53 
## 
## (Dispersion Parameter for gaussian family taken to be 3232157)
## 
##     Null Deviance: 8648835880 on 545 degrees of freedom
## Residual Deviance: 1693648333 on 523.9995 degrees of freedom
## AIC: 9756.834 
## 
## Number of Local Scoring Iterations: NA 
## 
## Anova for Parametric Effects
##                 Df     Sum Sq    Mean Sq F value    Pr(>F)    
## Private          1 2529911179 2529911179 782.732 < 2.2e-16 ***
## s(Room.Board)    1 1638249614 1638249614 506.860 < 2.2e-16 ***
## s(PhD)           1  643605089  643605089 199.126 < 2.2e-16 ***
## s(perc.alumni)   1  347392831  347392831 107.480 < 2.2e-16 ***
## s(Expend)        1  643202403  643202403 199.001 < 2.2e-16 ***
## s(Grad.Rate)     1   82045608   82045608  25.384 6.476e-07 ***
## Residuals      524 1693648333    3232157                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Anova for Nonparametric Effects
##                Npar Df  Npar F    Pr(F)    
## (Intercept)                                
## Private                                    
## s(Room.Board)        3  2.4737  0.06083 .  
## s(PhD)               3  1.1030  0.34739    
## s(perc.alumni)       3  0.7430  0.52676    
## s(Expend)            3 27.5206 2.22e-16 ***
## s(Grad.Rate)         3  1.7663  0.15257    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(c) Evaluate the model obtained on the test set, and explain the results obtained.

model_pred <- predict(model_gam2, newdata = test)
mean((model_pred - test$Outstate)^2)
## [1] 4224718
test_TSS <- sum((test$Outstate - mean(test$Outstate))^2)
test_RSS <- sum((model_pred - test$Outstate)^2)

1 - test_RSS/test_TSS
## [1] 0.7504265
forward_pred <- predict(forward, newdata = test)
mean((forward_pred - test$Outstate)^2)
## [1] 5073114
forward_test_RSS <- sum((forward_pred - test$Outstate)^2)

1 - forward_test_RSS/test_TSS
## [1] 0.7003078

The GAM (model_gam2) performs better than the forward-selection linear model on the test set: it has a lower test MSE (4,224,718 vs. 5,073,114) and a higher test \(R^2\) (0.75 vs. 0.70).
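
To express the comparison on the original dollar scale of Outstate, the test RMSEs can also be computed directly; a quick check reusing the predictions above:

sqrt(mean((model_pred - test$Outstate)^2))    # test RMSE of the GAM (model_gam2)
sqrt(mean((forward_pred - test$Outstate)^2))  # test RMSE of the forward-selection model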

(d) For which variables, if any, is there evidence of a non-linear relationship with the response?

Based on summary(model_gam2), the nonparametric tests show that the smooth term for Expend is highly significant and the smooth term for Room.Board is marginally significant (p ≈ 0.06), while the remaining smooths are not. A new model is therefore fit that keeps the smooth terms for only these two variables, to check whether there is evidence of a non-linear relationship with the response.

model_gam3 <- gam(Outstate ~ Private + s(Room.Board) + PhD + perc.alumni + s(Expend) + Grad.Rate, data = train)
model_gam3_pred <- predict(model_gam3, newdata = test)
mean((model_gam3_pred - test$Outstate)^2)
## [1] 4234908
test_RSS <- sum((model_gam3_pred - test$Outstate)^2)
1 - test_RSS/test_TSS
## [1] 0.7498245
par(mfrow = c(2, 3))
plot(model_gam3, se = T, col = "blue")

model_gam4 <- gam(Outstate ~ Private + s(Room.Board) + PhD + perc.alumni + Expend + Grad.Rate, data = train)
model_gam4_pred <- predict(model_gam4, newdata = test)
mean((model_gam4_pred - test$Outstate)^2)
## [1] 5056435
test_RSS <- sum((model_gam4_pred - test$Outstate)^2)
1 - test_RSS/test_TSS
## [1] 0.7012932
par(mfrow = c(2, 3))
plot(model_gam4, se = T, col = "blue")

If the smoothing spline for Expend is removed so that Expend enters linearly (model_gam4), test performance drops noticeably (test \(R^2\) falls from about 0.75 to 0.70). This indicates a non-linear relationship between Expend and out-of-state tuition; the evidence of non-linearity for the other predictors is much weaker.
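
This can also be checked formally on the training data with an F-test between the two GAMs, which differ only in whether Expend enters linearly or through a smoothing spline; a sketch, assuming model_gam3 and model_gam4 from above:

anova(model_gam4, model_gam3, test = "F")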