require(ISLR)
## Loading required package: ISLR
require(caret)
## Loading required package: caret
## Loading required package: ggplot2
## Loading required package: lattice
require(tidyverse)
## Loading required package: tidyverse
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ tibble 3.1.5 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.0.2 ✓ forcats 0.5.1
## ✓ purrr 0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x purrr::lift() masks caret::lift()
data('College')
set.seed(1)
inTrain <- createDataPartition(College$Apps, p = 0.75, list = FALSE)
training <- College[inTrain,]
testing <- College[-inTrain,]
# Centre and scale every variable (including Apps) using training-set statistics
preObj <- preProcess(training, method = c('center', 'scale'))
training <- predict(preObj, training)
testing <- predict(preObj, testing)
y_train <- training$Apps
y_test <- testing$Apps
# Dummy-code the factor predictor (Private) so the matrix-based methods can use it
one_hot_encoding <- dummyVars(Apps ~ ., data = training)
x_train <- predict(one_hot_encoding, training)
x_test <- predict(one_hot_encoding, testing)
lin_model <- lm(Apps ~ ., data = training)
pred <- predict(lin_model, testing)
(lin_info <- postResample(pred, testing$Apps))
## RMSE Rsquared MAE
## 0.2799768 0.9201765 0.1568743
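# Quick sanity check (a sketch, not part of the original analysis): the RMSE
# reported by postResample is just the root mean squared prediction error on
# the held-out set
sqrt(mean((pred - testing$Apps)^2))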
ridge_fit <- train(x = x_train, y = y_train,
                   method = 'glmnet',
                   trControl = trainControl(method = 'cv', number = 10),
                   tuneGrid = expand.grid(alpha = 0,
                                          lambda = seq(0, 1e3, length.out = 20)))
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
(ridge_info <- postResample(predict(ridge_fit, x_test), y_test))
## RMSE Rsquared MAE
## 0.2853247 0.9211286 0.1645806
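# The penalty caret selected can be inspected directly, as is done for the
# Boston fits later on
ridge_fit$bestTune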
coef(ridge_fit$finalModel, ridge_fit$bestTune$lambda)
## 19 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 0.034871314
## Private.No 0.075423210
## Private.Yes -0.076037580
## Accept 0.665628733
## Enroll 0.090243372
## Top10perc 0.107160248
## Top25perc 0.011628030
## F.Undergrad 0.063308801
## P.Undergrad 0.017427317
## Outstate -0.028995432
## Room.Board 0.048720533
## Books 0.012799145
## Personal -0.002894430
## PhD -0.017989250
## Terminal -0.010434665
## S.F.Ratio 0.006920126
## perc.alumni -0.031683867
## Expend 0.083525070
## Grad.Rate 0.058131023
plot(ridge_fit)

plot(varImp(ridge_fit))

lasso_fit <- train(x = x_train, y = y_train,
                   method = 'glmnet',
                   trControl = trainControl(method = 'cv', number = 10),
                   tuneGrid = expand.grid(alpha = 1,
                                          lambda = seq(0.0001, 1, length.out = 50)))
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
(lasso_info <- postResample(predict(lasso_fit, x_test), y_test))
## RMSE Rsquared MAE
## 0.2914812 0.9141364 0.1511801
coef(lasso_fit$finalModel, lasso_fit$bestTune$lambda)
## 19 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) -1.470609e-02
## Private.No 5.410732e-02
## Private.Yes .
## Accept 8.883839e-01
## Enroll .
## Top10perc 1.092201e-01
## Top25perc .
## F.Undergrad .
## P.Undergrad .
## Outstate .
## Room.Board 1.483337e-04
## Books .
## Personal .
## PhD .
## Terminal .
## S.F.Ratio .
## perc.alumni .
## Expend 4.067011e-02
## Grad.Rate 4.062721e-06
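# Convenience sketch (not in the original): list the predictors the lasso
# actually retained at the chosen lambda
b <- coef(lasso_fit$finalModel, lasso_fit$bestTune$lambda)
rownames(b)[b[, 1] != 0]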
plot(lasso_fit)

plot(varImp(lasso_fit))

pcr_model <- train(x = x_train, y = y_train,
                   method = 'pcr',
                   trControl = trainControl(method = 'cv', number = 10),
                   tuneGrid = expand.grid(ncomp = 1:10))
(pcr_info <- postResample(predict(pcr_model, x_test), y_test))
## RMSE Rsquared MAE
## 0.3231292 0.8916531 0.1986075
coef(pcr_model$finalModel)
## , , 10 comps
##
## .outcome
## Private.No 0.031985972
## Private.Yes -0.031985972
## Accept 0.343576750
## Enroll 0.305359773
## Top10perc 0.042630417
## Top25perc 0.027790893
## F.Undergrad 0.273818439
## P.Undergrad -0.049487667
## Outstate 0.038573119
## Room.Board 0.070607615
## Books 0.016433593
## Personal -0.023529455
## PhD -0.023992433
## Terminal -0.024182230
## S.F.Ratio 0.003741623
## perc.alumni -0.070567887
## Expend 0.090126298
## Grad.Rate 0.071302714
plot(pcr_model)

plot(varImp(pcr_model))

pls_model <- train(x = x_train, y = y_train,
                   method = 'pls',
                   trControl = trainControl(method = 'cv', number = 10),
                   tuneGrid = expand.grid(ncomp = 1:10))
(pls_info <- postResample(predict(pls_model, x_test), y_test))
## RMSE Rsquared MAE
## 0.2792580 0.9209302 0.1572165
coef(pls_model$finalModel)
## , , 10 comps
##
## .outcome
## Private.No 0.071314109
## Private.Yes -0.071314109
## Accept 1.039263053
## Enroll -0.169855531
## Top10perc 0.235572152
## Top25perc -0.070270182
## F.Undergrad -0.022580399
## P.Undergrad 0.032522451
## Outstate -0.085919942
## Room.Board 0.036344367
## Books 0.004835496
## Personal 0.007546695
## PhD -0.044709439
## Terminal 0.002332025
## S.F.Ratio 0.010868271
## perc.alumni -0.009046689
## Expend 0.072963738
## Grad.Rate 0.037425850
plot(pls_model)

as_tibble(rbind(lin_info,
                ridge_info,
                lasso_info,
                pcr_info,
                pls_info)) %>%
  mutate(model = c('Linear', 'Ridge', 'Lasso', 'PCR', 'PLS')) %>%
  select(model, RMSE, Rsquared)
## # A tibble: 5 × 3
## model RMSE Rsquared
## <chr> <dbl> <dbl>
## 1 Linear 0.280 0.920
## 2 Ridge 0.285 0.921
## 3 Lasso 0.291 0.914
## 4 PCR 0.323 0.892
## 5 PLS 0.279 0.921
testing %>%
  summarize(sd = sd(Apps))
## sd
## 1 0.9818241
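# For context (a quick sketch): Apps was standardized on training-set
# statistics, so the test-set SD is close to 1; an RMSE near 0.28 therefore
# removes roughly 70% of the baseline error
lin_info['RMSE'] / sd(testing$Apps)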
require(ggthemes)
## Loading required package: ggthemes
# Test-set residuals (predicted minus observed); the observed values come from
# testing$Apps because the one-hot matrices passed in do not carry the outcome
residfunc <- function(fit, data) {
  predict(fit, data) - testing$Apps
}
tibble(Observed = testing$Apps,
       LM = residfunc(lin_model, testing),
       Ridge = residfunc(ridge_fit, x_test),
       Lasso = residfunc(lasso_fit, x_test),
       PCR = residfunc(pcr_model, x_test),
       PLS = residfunc(pls_model, x_test)) %>%
  gather(Model, Residuals, -Observed) %>%
  ggplot(aes(Observed, Residuals, col = Model)) +
  geom_hline(yintercept = 0, lty = 2) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = 'loess', alpha = 0.01, col = 'lightsalmon2') +
  facet_wrap(~ Model, ncol = 5) +
  theme_tufte() +
  theme(legend.position = 'top') +
  coord_flip()
## Warning: attributes are not identical across measure variables;
## they will be dropped
## `geom_smooth()` using formula 'y ~ x'

require(MASS); require(tidyverse); require(caret); require(leaps)
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## Loading required package: leaps
set.seed(1)
data('Boston')
inTrain <- createDataPartition(Boston$crim, p = 0.6, list = FALSE)
x_train <- Boston[inTrain, -1]
y_train <- Boston[inTrain, 1]
x_test <- Boston[-inTrain, -1]
y_test <- Boston[-inTrain, 1]
best_subs <- regsubsets(x = x_train, y = y_train, nvmax = 13)
fit_summary <- summary(best_subs)
require(ggplot2); require(ggthemes)
tibble(MSE = fit_summary$rss / nrow(x_train),
       Cp = fit_summary$cp,
       BIC = fit_summary$bic,
       AdjustedR2 = fit_summary$adjr2) %>%
  mutate(id = row_number()) %>%
  gather(Metric, value, -id) %>%
  ggplot(aes(id, value, col = Metric)) +
  geom_line() + geom_point() + ylab('') +
  xlab('Number of Variables Used') +
  facet_wrap(~ Metric, scales = 'free') +
  theme_tufte() +
  scale_x_continuous(breaks = 1:13)

scales <- c('r2', 'adjr2', 'bic', 'Cp')
par(mfrow = c(2, 2))
for (scale in scales) {
  plot(best_subs, scale = scale)
}

par(mfrow = c(1,1))
test_errors <- rep(NA, 13)
test_mat <- model.matrix(crim ~ ., data = Boston[-inTrain, ])
# Test-set RMSE for each subset size chosen by regsubsets
for (i in 1:13) {
  coefs <- coef(best_subs, id = i)
  pred <- test_mat[, names(coefs)] %*% coefs
  test_errors[i] <- sqrt(mean((y_test - pred)^2))
}
tibble(RMSE = test_errors) %>%
  mutate(id = row_number()) %>%
  ggplot(aes(id, RMSE)) +
  geom_line() + geom_point() +
  xlab('Number of Variables Used') +
  ggtitle('RMSE on the testing set') +
  theme_tufte() +
  scale_x_continuous(breaks = 1:13)

(regsubset_info <- min(test_errors))
## [1] 5.811256
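# The subset size attaining this minimum (a small helper; the value depends on
# the seed and split)
which.min(test_errors)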
coef(best_subs, id = 1:5)
## [[1]]
## (Intercept) rad
## -2.4448623 0.6546826
##
## [[2]]
## (Intercept) rad medv
## 2.6574618 0.5769903 -0.1963036
##
## [[3]]
## (Intercept) rad ptratio medv
## 10.8303135 0.6042171 -0.4067664 -0.2351904
##
## [[4]]
## (Intercept) zn dis rad medv
## 5.83377235 0.05509227 -0.72924037 0.52407648 -0.21866690
##
## [[5]]
## (Intercept) zn rm dis rad medv
## -1.70826140 0.05299909 1.50466724 -0.73918276 0.51079178 -0.29652993
lasso_fit <- train(x = x_train, y = y_train,
                   method = 'glmnet',
                   trControl = trainControl(method = 'cv', number = 10),
                   tuneGrid = expand.grid(alpha = 1,
                                          lambda = seq(0.001, 1, length.out = 100)),
                   preProcess = c('center', 'scale'))
(lasso_info <- postResample(predict(lasso_fit, x_test), y_test))
## RMSE Rsquared MAE
## 5.793303 0.433477 2.603667
coef(lasso_fit$finalModel, lasso_fit$bestTune$lambda)
## 14 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 3.7575062
## zn .
## indus .
## chas .
## nox .
## rm .
## age .
## dis -0.1856880
## rad 4.2982107
## tax .
## ptratio .
## black .
## lstat 0.1178773
## medv -1.0546456
lasso_fit$bestTune
## alpha lambda
## 84 1 0.8385455
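# At this fairly large lambda most coefficients are zeroed; counting the
# survivors (intercept excluded) matches the sparse table above
sum(coef(lasso_fit$finalModel, lasso_fit$bestTune$lambda)[-1, 1] != 0)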
plot(lasso_fit)

plot(varImp(lasso_fit))

ridge_fit <- train(x_train, y_train,
                   method = 'glmnet',
                   trControl = trainControl(method = 'cv', number = 10),
                   tuneGrid = expand.grid(alpha = 0,
                                          lambda = seq(0, 1e2, length.out = 50)),
                   preProcess = c('center', 'scale'))
(ridge_info <- postResample(predict(ridge_fit, x_test), y_test))
## RMSE Rsquared MAE
## 5.7137087 0.4498421 2.7943386
coef(ridge_fit$finalModel, ridge_fit$bestTune$lambda)
## 14 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 3.75750624
## zn 0.54011923
## indus -0.34230036
## chas -0.30633776
## nox 0.14724495
## rm 0.44593736
## age 0.10933511
## dis -0.98108462
## rad 2.85345034
## tax 1.16015712
## ptratio -0.09232032
## black -0.31532518
## lstat 0.64017157
## medv -1.44312967
ridge_fit$bestTune
## alpha lambda
## 2 0 2.040816
plot(ridge_fit)

plot(varImp(ridge_fit))

glmnet_fit <- train(x_train, y_train,
                    method = 'glmnet',
                    trControl = trainControl(method = 'cv', number = 10),
                    tuneGrid = expand.grid(alpha = seq(0, 1, length.out = 6),
                                           lambda = seq(0, 1e2, length.out = 20)),
                    preProcess = c('center', 'scale'))
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
(glmnet_info <- postResample(predict(glmnet_fit, x_test), y_test))
## RMSE Rsquared MAE
## 5.7504399 0.4478022 2.8789035
coef(object = glmnet_fit$finalModel, s = glmnet_fit$bestTune$lambda)
## 14 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 3.75750624
## zn 0.79927708
## indus -0.56669351
## chas -0.32661514
## nox -0.32145785
## rm 0.70164835
## age 0.03721032
## dis -1.61848336
## rad 4.01878688
## tax 0.60120176
## ptratio -0.37415233
## black -0.09300755
## lstat 0.54462549
## medv -2.16987001
glmnet_fit$bestTune
## alpha lambda
## 1 0 0
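# With alpha = 0 and lambda = 0 the elastic net reduces to unpenalized least
# squares; a plain lm fit (a sketch; ols_fit is not part of the original)
# should therefore give a very similar test RMSE
ols_fit <- lm(y ~ ., data = data.frame(x_train, y = y_train))
postResample(predict(ols_fit, x_test), y_test)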
plot(glmnet_fit)

plot(varImp(glmnet_fit))

pcr_fit <- train(x_train, y_train,
                 method = 'pcr',
                 trControl = trainControl(method = 'cv', number = 10),
                 tuneGrid = expand.grid(ncomp = 1:13),
                 preProcess = c('center', 'scale'))
(pcr_info <- postResample(predict(pcr_fit, x_test), y_test))
## RMSE Rsquared MAE
## 5.8816610 0.4296175 3.0845598
pcr_fit
## Principal Component Analysis
##
## 306 samples
## 13 predictor
##
## Pre-processing: centered (13), scaled (13)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 275, 277, 276, 275, 275, 275, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 6.560574 0.4800110 3.843889
## 2 6.559955 0.4824507 3.879334
## 3 6.175577 0.5954423 3.134008
## 4 6.166002 0.6001710 3.114538
## 5 6.165989 0.6016351 3.099134
## 6 6.164470 0.5941572 3.241512
## 7 6.171809 0.5937923 3.231744
## 8 6.030915 0.6217769 3.231019
## 9 6.056110 0.6186845 3.265161
## 10 6.064727 0.6201973 3.238258
## 11 6.044505 0.6234399 3.204063
## 12 6.173310 0.5958234 3.406977
## 13 6.116109 0.6072988 3.293449
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 8.
plot(pcr_fit)

plot(varImp(pcr_fit))

pls_fit <- train(x_train, y_train,
                 method = 'pls',
                 trControl = trainControl(method = 'cv', number = 10),
                 tuneGrid = expand.grid(ncomp = 1:13),
                 preProcess = c('center', 'scale'))
(pls_info <- postResample(predict(pls_fit, x_test), y_test))
## RMSE Rsquared MAE
## 5.7361826 0.4505769 2.8789603
pls_fit
## Partial Least Squares
##
## 306 samples
## 13 predictor
##
## Pre-processing: centered (13), scaled (13)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 275, 275, 274, 275, 276, 276, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 6.201189 0.5251744 3.645062
## 2 5.818300 0.6334711 3.019228
## 3 5.818943 0.6391877 3.147983
## 4 5.820687 0.6344493 3.194376
## 5 5.855419 0.6352588 3.196379
## 6 5.864940 0.6351171 3.216697
## 7 5.867013 0.6354134 3.224509
## 8 5.848427 0.6395298 3.186069
## 9 5.844713 0.6395603 3.188456
## 10 5.846431 0.6391123 3.187532
## 11 5.845136 0.6392478 3.186003
## 12 5.845142 0.6392653 3.185977
## 13 5.845154 0.6392627 3.185996
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 2.
plot(pls_fit)

rbind(c(regsubset_info, NA, NA),
      lasso_info,
      ridge_info,
      glmnet_info,
      pcr_info,
      pls_info)
## RMSE Rsquared MAE
## 5.811256 NA NA
## lasso_info 5.793303 0.4334770 2.603667
## ridge_info 5.713709 0.4498421 2.794339
## glmnet_info 5.750440 0.4478022 2.878903
## pcr_info 5.881661 0.4296175 3.084560
## pls_info 5.736183 0.4505769 2.878960
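# Same comparison as a labelled tibble, mirroring the College summary above
# (a readability sketch; the best-subset row is omitted since only its RMSE
# was computed)
as_tibble(rbind(lasso_info, ridge_info, glmnet_info, pcr_info, pls_info)) %>%
  mutate(model = c('Lasso', 'Ridge', 'glmnet', 'PCR', 'PLS')) %>%
  arrange(RMSE)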