In this exercise we further analyze the Wage data set considered throughout this chapter.
library(ISLR)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.0.5 v dplyr 1.0.3
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
attach(Wage)
str(Wage)
## 'data.frame': 3000 obs. of 11 variables:
## $ year : int 2006 2004 2003 2003 2005 2008 2009 2008 2006 2004 ...
## $ age : int 18 24 45 43 50 54 44 30 41 52 ...
## $ maritl : Factor w/ 5 levels "1. Never Married",..: 1 1 2 2 4 2 2 1 1 2 ...
## $ race : Factor w/ 4 levels "1. White","2. Black",..: 1 1 1 3 1 1 4 3 2 1 ...
## $ education : Factor w/ 5 levels "1. < HS Grad",..: 1 4 3 4 2 4 3 3 3 2 ...
## $ region : Factor w/ 9 levels "1. New England",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ jobclass : Factor w/ 2 levels "1. Industrial",..: 1 2 1 2 2 2 1 2 2 2 ...
## $ health : Factor w/ 2 levels "1. <=Good","2. >=Very Good": 1 2 1 2 1 2 2 1 2 2 ...
## $ health_ins: Factor w/ 2 levels "1. Yes","2. No": 2 2 1 1 1 1 1 1 1 1 ...
## $ logwage : num 4.32 4.26 4.88 5.04 4.32 ...
## $ wage : num 75 70.5 131 154.7 75 ...
(a) Perform polynomial regression to predict wage using age. Use cross-validation to select the optimal degree d for the polynomial. What degree was chosen, and how does this compare to the results of hypothesis testing using ANOVA? Make a plot of the resulting polynomial fit to the data.
ctrl = trainControl(method = "cv", number = 10)
cv.error = c()
set.seed(123)
for (i in 1:10){
  lm.fit = train(y = Wage$wage,
                 x = poly(Wage$age, i, raw = TRUE),
                 method = "lm",
                 metric = "RMSE",
                 trControl = ctrl)
  cv.error[i] = lm.fit$results$RMSE
}
data_frame = data.frame(degree = 1:10, CV_RMSE = cv.error)
data_frame
## degree CV_RMSE
## 1 1 40.95455
## 2 2 39.53766
## 3 3 39.78744
## 4 4 40.30738
## 5 5 39.84529
## 6 6 40.21556
## 7 7 40.39497
## 8 8 39.84910
## 9 9 39.78724
## 10 10 40.42590
min(cv.error)
## [1] 39.53766
data_frame %>%
  mutate(min_cv_error = as.numeric(min(CV_RMSE) == CV_RMSE)) %>%
  ggplot(aes(x = degree, y = CV_RMSE)) +
  geom_line(col = "black") +
  geom_point(size = 2, aes(col = factor(min_cv_error))) +
  scale_x_continuous(breaks = seq(1, 10), minor_breaks = NULL) +
  theme(legend.position = "none") +
  labs(title = "Polynomial degree with Cross-validation",
       x = "Degree",
       y = "CV RMSE")
The optimal degree \(d\) for the polynomial according to cross-validation is 2.
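As a cross-check on the caret loop above, the same degree selection can be run with boot::cv.glm, which reports the 10-fold CV estimate of the MSE rather than the RMSE. This is a minimal sketch (not part of the original output), assuming the boot package is installed.
library(boot)
set.seed(123)
cv.mse <- sapply(1:10, function(d) {
  glm.fit <- glm(wage ~ poly(age, d), data = Wage)  # gaussian glm is equivalent to lm
  cv.glm(Wage, glm.fit, K = 10)$delta[1]            # raw 10-fold CV estimate of the MSE
})
which.min(cv.mse)  # degree with the lowest CV error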
lm.fit.1 <- lm(wage ~ age, data = Wage)
lm.fit.2 <- lm(wage ~ poly(age, 2, raw = T), data = Wage)
lm.fit.3 <- lm(wage ~ poly(age, 3, raw = T), data = Wage)
lm.fit.4 <- lm(wage ~ poly(age, 4, raw = T), data = Wage)
lm.fit.5 <- lm(wage ~ poly(age, 5, raw = T), data = Wage)
anova(lm.fit.1, lm.fit.2, lm.fit.3, lm.fit.4, lm.fit.5)
## Analysis of Variance Table
##
## Model 1: wage ~ age
## Model 2: wage ~ poly(age, 2, raw = T)
## Model 3: wage ~ poly(age, 3, raw = T)
## Model 4: wage ~ poly(age, 4, raw = T)
## Model 5: wage ~ poly(age, 5, raw = T)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2998 5022216
## 2 2997 4793430 1 228786 143.5931 < 2.2e-16 ***
## 3 2996 4777674 1 15756 9.8888 0.001679 **
## 4 2995 4771604 1 6070 3.8098 0.051046 .
## 5 2994 4770322 1 1283 0.8050 0.369682
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
According to the ANOVA, the quadratic and cubic terms are clearly significant, the degree-4 term is borderline (p ≈ 0.051), and the degree-5 term is not needed. Hypothesis testing therefore points to a degree of 3 or 4, whereas cross-validation chose 2; since the CV RMSE values for degrees 2 through 4 are nearly identical, the two approaches are broadly consistent.
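Because poly() uses orthogonal polynomials by default, the same conclusions can be read directly from the coefficient t-tests of a single degree-5 fit (the squared t-statistics equal the ANOVA F-statistics). A quick sketch, output not shown:
fit5.orth <- lm(wage ~ poly(age, 5), data = Wage)  # orthogonal (default) polynomials
round(coef(summary(fit5.orth)), 4)                 # per-degree t-tests; t^2 matches the ANOVA F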
ggplot(Wage, aes(x = age, y = wage)) +
geom_point(alpha = 0.3) +
geom_smooth(method = "lm", formula = "y ~ poly(x, 2, raw = T)") +
labs(title = "Polynomial Regression",
subtitle = "Predicting 'wage' with a 2nd Degree Polynomial of 'age'")
ggplot(Wage, aes(x = age, y = wage)) +
geom_point(alpha = 0.3) +
geom_smooth(method = "lm", formula = "y ~ poly(x, 3, raw = T)") +
labs(title = "Polynomial Regression",
subtitle = "Predicting 'wage' with a 3er Degree Polynomial of 'age'")
ggplot(Wage, aes(x = age, y = wage)) +
geom_point(alpha = 0.3) +
geom_smooth(method = "lm", formula = "y ~ poly(x, 4, raw = T)") +
labs(title = "Polynomial Regression",
subtitle = "Predicting 'wage' with a 4th Degree Polynomial of 'age'")
(b) Fit a step function to predict wage using age, and perform cross-validation to choose the optimal number of cuts. Make a plot of the fit obtained.
cv.error_step = c()
set.seed(123)
for (i in 2:20){
  lm.fit_step = train(y = Wage$wage,
                      x = data.frame(cut(Wage$age, i)),
                      method = "lm",
                      metric = "RMSE",
                      trControl = ctrl)
  cv.error_step[i-1] = lm.fit_step$results$RMSE
}
data_frame_step = data.frame(cuts = 2:20, CV_RMSE = cv.error_step)
data_frame_step
## cuts CV_RMSE
## 1 2 41.71835
## 2 3 40.52840
## 3 4 40.29139
## 4 5 40.77998
## 5 6 40.22279
## 6 7 40.44401
## 7 8 40.47687
## 8 9 40.03964
## 9 10 39.92958
## 10 11 40.50768
## 11 12 39.94227
## 12 13 39.71342
## 13 14 40.56430
## 14 15 39.83774
## 15 16 40.18459
## 16 17 40.08346
## 17 18 40.12602
## 18 19 40.01920
## 19 20 39.95358
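The smallest CV RMSE in this table occurs at 13 intervals, which is the number of cuts used below. A quick check:
data_frame_step$cuts[which.min(data_frame_step$CV_RMSE)]
## [1] 13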
table(cut(age, 13))
##
## (17.9,22.8] (22.8,27.5] (27.5,32.3] (32.3,37.1] (37.1,41.8] (41.8,46.6]
## 98 233 332 381 377 454
## (46.6,51.4] (51.4,56.2] (56.2,60.9] (60.9,65.7] (65.7,70.5] (70.5,75.2]
## 451 326 175 109 35 21
## (75.2,80.1]
## 8
data_frame_step %>%
  mutate(min_cv_error = as.numeric(min(CV_RMSE) == CV_RMSE)) %>%
  ggplot(aes(x = cuts, y = CV_RMSE)) +
  geom_line(col = "black") +
  geom_point(size = 2, aes(col = factor(min_cv_error))) +
  scale_x_continuous(breaks = seq(2, 20), minor_breaks = NULL) +
  theme(legend.position = "none") +
  labs(title = "Cut points with Cross-validation",
       x = "Intervals",
       y = "CV RMSE")
ggplot(Wage, aes(x = age, y = wage)) +
geom_point(alpha = 0.3) +
geom_smooth(method = "lm", formula = "y ~ cut(x, 13)") +
labs(title = "Step Function",
subtitle = "Predicting 'wage' with a 13-interval step function of 'age'")
This question relates to the College data set.
attach(College)
str(College)
## 'data.frame': 777 obs. of 18 variables:
## $ Private : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
## $ Apps : num 1660 2186 1428 417 193 ...
## $ Accept : num 1232 1924 1097 349 146 ...
## $ Enroll : num 721 512 336 137 55 158 103 489 227 172 ...
## $ Top10perc : num 23 16 22 60 16 38 17 37 30 21 ...
## $ Top25perc : num 52 29 50 89 44 62 45 68 63 44 ...
## $ F.Undergrad: num 2885 2683 1036 510 249 ...
## $ P.Undergrad: num 537 1227 99 63 869 ...
## $ Outstate : num 7440 12280 11250 12960 7560 ...
## $ Room.Board : num 3300 6450 3750 5450 4120 ...
## $ Books : num 450 750 400 450 800 500 500 450 300 660 ...
## $ Personal : num 2200 1500 1165 875 1500 ...
## $ PhD : num 70 29 53 92 76 67 90 89 79 40 ...
## $ Terminal : num 78 30 66 97 72 73 93 100 84 41 ...
## $ S.F.Ratio : num 18.1 12.2 12.9 7.7 11.9 9.4 11.5 13.7 11.3 11.5 ...
## $ perc.alumni: num 12 16 30 37 2 11 26 37 23 15 ...
## $ Expend : num 7041 10527 8735 19016 10922 ...
## $ Grad.Rate : num 60 56 54 59 15 55 63 73 80 52 ...
(a) Split the data into a training set and a test set. Using out-of-state tuition as the response and the other variables as the predictors, perform forward step-wise selection on the training set in order to identify a satisfactory model that uses just a subset of the predictors.
set.seed(1)
trainIndex <- createDataPartition(College$Outstate, p = 0.7, list = FALSE)
train <- College[trainIndex, ]
test <- College[-trainIndex, ]
set.seed(42)
model_null <- lm(Outstate ~ 1, data = train)
model_full <- lm(Outstate ~ ., data = train)
model_forward <- step(model_null, direction = "forward", scope = formula(model_full), trace = 0)
summary(model_forward)
##
## Call:
## lm(formula = Outstate ~ Expend + Private + Room.Board + perc.alumni +
## PhD + Grad.Rate + Terminal + S.F.Ratio + Personal + Top10perc,
## data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6357.4 -1257.9 -56.9 1285.6 5784.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.247e+03 8.707e+02 -2.581 0.010124 *
## Expend 1.417e-01 2.345e-02 6.043 2.84e-09 ***
## PrivateYes 2.801e+03 2.486e+02 11.265 < 2e-16 ***
## Room.Board 8.855e-01 9.706e-02 9.123 < 2e-16 ***
## perc.alumni 4.136e+01 8.703e+00 4.752 2.59e-06 ***
## PhD 2.583e+01 1.066e+01 2.423 0.015743 *
## Grad.Rate 2.433e+01 6.584e+00 3.695 0.000242 ***
## Terminal 2.473e+01 1.132e+01 2.185 0.029343 *
## S.F.Ratio -6.897e+01 3.032e+01 -2.275 0.023309 *
## Personal -3.028e-01 1.278e-01 -2.370 0.018148 *
## Top10perc 1.365e+01 6.962e+00 1.961 0.050401 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1918 on 535 degrees of freedom
## Multiple R-squared: 0.7725, Adjusted R-squared: 0.7682
## F-statistic: 181.6 on 10 and 535 DF, p-value: < 2.2e-16
model_forward
##
## Call:
## lm(formula = Outstate ~ Expend + Private + Room.Board + perc.alumni +
## PhD + Grad.Rate + Terminal + S.F.Ratio + Personal + Top10perc,
## data = train)
##
## Coefficients:
## (Intercept) Expend PrivateYes Room.Board perc.alumni PhD
## -2246.9583 0.1417 2800.7577 0.8855 41.3565 25.8289
## Grad.Rate Terminal S.F.Ratio Personal Top10perc
## 24.3273 24.7326 -68.9744 -0.3027 13.6529
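MASS is already loaded, so the same AIC-based forward search can be reproduced with MASS::stepAIC as a sanity check. A minimal sketch, output not shown:
model_forward_aic <- stepAIC(model_null, direction = "forward",
                             scope = formula(model_full), trace = 0)
formula(model_forward_aic)  # expected to match the step() model above, since step() also uses AIC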
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
car::vif(model_forward)
## Expend Private Room.Board perc.alumni PhD Grad.Rate
## 2.560131 1.893810 1.665366 1.736361 4.179037 1.826930
## Terminal S.F.Ratio Personal Top10perc
## 3.930903 1.927617 1.201035 2.279483
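All VIFs are below 5, so multicollinearity is not a serious concern here; the two largest values (PhD and Terminal) simply reflect that the share of faculty with a PhD and with a terminal degree move together, which can be checked directly:
cor(train$PhD, train$Terminal)  # pairwise correlation behind the two largest VIFs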
ctrl <- trainControl(method = "repeatedcv",
number = 10,
repeats = 1,
selectionFunction = "oneSE")
set.seed(11)
forward <- train(Outstate ~ .,
                 data = train,
                 method = "leapForward",
                 metric = "RMSE",
                 trControl = ctrl,
                 tuneGrid = data.frame(nvmax = 1:17))
forward
## Linear Regression with Forward Selection
##
## 546 samples
## 17 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1 times)
## Summary of sample sizes: 491, 492, 494, 492, 491, 490, ...
## Resampling results across tuning parameters:
##
## nvmax RMSE Rsquared MAE
## 1 3145.790 0.4213291 2490.437
## 2 2542.574 0.6128139 1919.293
## 3 2318.515 0.6697868 1771.760
## 4 2140.920 0.7126656 1674.759
## 5 1991.787 0.7512806 1572.973
## 6 1949.028 0.7620367 1545.758
## 7 1964.222 0.7581387 1556.702
## 8 1960.076 0.7591031 1563.759
## 9 1943.231 0.7633189 1550.525
## 10 1925.727 0.7671333 1533.877
## 11 1928.084 0.7663182 1539.351
## 12 1932.577 0.7655672 1531.555
## 13 1905.470 0.7716630 1517.793
## 14 1897.805 0.7735142 1513.250
## 15 1902.198 0.7725648 1516.383
## 16 1904.340 0.7721352 1517.022
## 17 1904.639 0.7720839 1516.796
##
## RMSE was used to select the optimal model using the one SE rule.
## The final value used for the model was nvmax = 6.
coef(forward$finalModel, id = 6)
## (Intercept) PrivateYes Room.Board PhD perc.alumni
## -3896.4692078 2996.1334413 0.9113605 46.2771415 50.9134890
## Expend Grad.Rate
## 0.1822738 28.3550059
(b) Fit a GAM on the training data, using out-of-state tuition as the response and the features selected in the previous step as the predictors. Plot the results, and explain your findings.
library(gam)
## Loading required package: splines
## Loading required package: foreach
##
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
##
## accumulate, when
## Loaded gam 1.20
model_gam1 <- gam(Outstate ~ Private + s(Room.Board) + s(PhD) + s(perc.alumni) +
                    s(Expend) + s(Grad.Rate) + s(Terminal) + s(S.F.Ratio) +
                    s(Personal) + s(Top10perc), data = train)
par(mfrow = c(5, 2))
plot(model_gam1, se = T, col = "blue")
model_gam2 <- gam(Outstate ~ Private + s(Room.Board) + s(PhD) + s(perc.alumni) +
                    s(Expend) + s(Grad.Rate), data = train)
par(mfrow = c(2, 3))
plot(model_gam2, se = T, col = "blue")
anova(model_gam1, model_gam2, test = "F")
## Analysis of Deviance Table
##
## Model 1: Outstate ~ Private + s(Room.Board) + s(PhD) + s(perc.alumni) +
## s(Expend) + s(Grad.Rate) + s(Terminal) + s(S.F.Ratio) + s(Personal) +
## s(Top10perc)
## Model 2: Outstate ~ Private + s(Room.Board) + s(PhD) + s(perc.alumni) +
## s(Expend) + s(Grad.Rate)
## Resid. Df Resid. Dev Df Deviance F Pr(>F)
## 1 508 1575791119
## 2 524 1693648333 -16 -117857213 2.3747 0.002003 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The F-test favours the larger model on the training data, but we keep the more parsimonious model_gam2, which uses the six predictors chosen by forward selection, and evaluate it on the test set in part (c).
summary(model_gam2)
##
## Call: gam(formula = Outstate ~ Private + s(Room.Board) + s(PhD) + s(perc.alumni) +
## s(Expend) + s(Grad.Rate), data = train)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -6980.56 -1188.05 20.76 1261.21 4694.53
##
## (Dispersion Parameter for gaussian family taken to be 3232157)
##
## Null Deviance: 8648835880 on 545 degrees of freedom
## Residual Deviance: 1693648333 on 523.9995 degrees of freedom
## AIC: 9756.834
##
## Number of Local Scoring Iterations: NA
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## Private 1 2529911179 2529911179 782.732 < 2.2e-16 ***
## s(Room.Board) 1 1638249614 1638249614 506.860 < 2.2e-16 ***
## s(PhD) 1 643605089 643605089 199.126 < 2.2e-16 ***
## s(perc.alumni) 1 347392831 347392831 107.480 < 2.2e-16 ***
## s(Expend) 1 643202403 643202403 199.001 < 2.2e-16 ***
## s(Grad.Rate) 1 82045608 82045608 25.384 6.476e-07 ***
## Residuals 524 1693648333 3232157
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## Private
## s(Room.Board) 3 2.4737 0.06083 .
## s(PhD) 3 1.1030 0.34739
## s(perc.alumni) 3 0.7430 0.52676
## s(Expend) 3 27.5206 2.22e-16 ***
## s(Grad.Rate) 3 1.7663 0.15257
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(c) Evaluate the model obtained on the test set, and explain the results obtained.
model_pred <- predict(model_gam2, newdata = test)
mean((model_pred - test$Outstate)^2)
## [1] 4224718
test_TSS <- sum((test$Outstate - mean(test$Outstate))^2)
test_RSS <- sum((model_pred - test$Outstate)^2)
1 - test_RSS/test_TSS
## [1] 0.7504265
forward_pred <- predict(forward, newdata = test)
mean((forward_pred - test$Outstate)^2)
## [1] 5073114
forward_test_RSS <- sum((forward_pred - test$Outstate)^2)
1 - forward_test_RSS/test_TSS
## [1] 0.7003078
The GAM (model_gam2) performs better than the forward-selection linear model: its test MSE is lower (about 4.22 million vs. 5.07 million) and its test \(R^2\) is higher (0.750 vs. 0.700).
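For comparability with the CV RMSE values reported during model selection, the test errors can also be expressed as RMSEs (a quick sketch; roughly 2055 for the GAM and 2252 for the linear model, given the MSEs above):
sqrt(mean((model_pred - test$Outstate)^2))    # GAM test RMSE
sqrt(mean((forward_pred - test$Outstate)^2))  # forward-selection linear model test RMSE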
(d) For which variables, if any, is there evidence of a non-linear relationship with the response?
Based on summary(model_gam2), the "Anova for Nonparametric Effects" table shows a highly significant smooth term for Expend and a marginally significant one for Room.Board, while the smooth terms for PhD, perc.alumni and Grad.Rate are not significant. A new model is fit that keeps the smooth terms for only these two variables, to check whether there is evidence of a non-linear relationship with the response.
model_gam3 <- gam(Outstate ~ Private + s(Room.Board) + PhD + perc.alumni + s(Expend) + Grad.Rate, data = train)
model_gam3_pred <- predict(model_gam3, newdata = test)
mean((model_gam3_pred - test$Outstate)^2)
## [1] 4234908
test_RSS <- sum((model_gam3_pred - test$Outstate)^2)
1 - test_RSS/test_TSS
## [1] 0.7498245
par(mfrow = c(2, 3))
plot(model_gam3, se = T, col = "blue")
model_gam4 <- gam(Outstate ~ Private + s(Room.Board) + PhD + perc.alumni + Expend + Grad.Rate, data = train)
model_gam4_pred <- predict(model_gam4, newdata = test)
mean((model_gam4_pred - test$Outstate)^2)
## [1] 5056435
test_RSS <- sum((model_gam4_pred - test$Outstate)^2)
1 - test_RSS/test_TSS
## [1] 0.7012932
par(mfrow = c(2, 3))
plot(model_gam4, se = T, col = "blue")
Removing the smoothing spline on Expend as well (model_gam4) causes test performance to drop noticeably: the test MSE rises from about 4.23 million to 5.06 million and \(R^2\) falls from 0.750 to 0.701, whereas replacing the other smooth terms with linear ones (model_gam3) barely changed performance. This indicates a clearly non-linear relationship between Expend and the response, and at most weak evidence of non-linearity (p ≈ 0.06 for the nonparametric effect) for Room.Board.
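A formal version of the same check compares model_gam4 (linear Expend) with model_gam3 (smooth Expend) via an F-test on the training data. A minimal sketch, output not shown:
anova(model_gam4, model_gam3, test = "F")  # a significant F here supports a non-linear effect of Expend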