library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.0     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.1.8
## ✔ purrr     1.0.1     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (http://conflicted.r-lib.org/) to force all conflicts to become errors
library(ggthemes)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(ISLR)
library(pls)

Exercise 6.8, pp. 262–263

No. 8

In this exercise, we will generate simulated data, and will then use this data to perform best subset selection.

(a) Use the rnorm() function to generate a predictor X of length n = 100, as well as a noise vector of length n = 100.

set.seed(1)
X <- rnorm(100)
noise <- rnorm(100)

(b) Generate a response vector Y of length n = 100 according to the model

Y = β₀ + β₁X + β₂X² + β₃X³ + ϵ,

where β₀, β₁, β₂, and β₃ are constants of your choice.

Y <- 3 + 1*X + 4*X^2 - 1*X^3 + noise  # β₀ = 3, β₁ = 1, β₂ = 4, β₃ = -1
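
A quick scatterplot confirms the simulated cubic relationship (a sketch using the packages already loaded above):

ggplot(data.frame(X, Y), aes(X, Y)) +
    geom_point(alpha = 0.6) +
    theme_tufte()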

(c) Use the regsubsets() function to perform best subset selection in order to choose the best model containing the predictors X, X², …, X¹⁰. What is the best model obtained according to Cp, BIC, and adjusted R²? Show some plots to provide evidence for your answer, and report the coefficients of the best model obtained. Note you will need to use the data.frame() function to create a single data set containing both X and Y.

require(leaps)
## Loading required package: leaps
df <- data.frame(Y, X)
fit <- regsubsets(Y ~ poly(X, 10), data = df, nvmax = 10)

fit_summary <- summary(fit)

require(tidyverse);require(ggplot2);require(ggthemes);

tibble(Cp = fit_summary$cp,
       BIC = fit_summary$bic,
       AdjR2 = fit_summary$adjr2) %>%
    mutate(id = row_number()) %>%
    gather(value_type, value, -id) %>%
    ggplot(aes(id, value, col = value_type)) +
    geom_line() + geom_point() + ylab('') + xlab('Number of Variables Used') +
    facet_wrap(~ value_type, scales = 'free') +
    theme_tufte() + scale_x_continuous(breaks = 1:10)
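
The exercise also asks for the coefficients of the selected model. As a sketch, the model size favored by each criterion can be read from the summary object, and coef() on the regsubsets fit reports the coefficients for that size:

# Model size favored by each criterion
best_cp    <- which.min(fit_summary$cp)
best_bic   <- which.min(fit_summary$bic)
best_adjr2 <- which.max(fit_summary$adjr2)
c(Cp = best_cp, BIC = best_bic, AdjR2 = best_adjr2)

# Coefficients of the BIC-selected model
coef(fit, best_bic)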

(d) Repeat (c), using forward stepwise selection and also using backwards stepwise selection. How does your answer compare to the results in (c)?

require(caret)

model_back <- train(Y ~ poly(X, 10), data = df,
                    method = 'glmStepAIC', direction = 'backward',
                    trace = 0,
                    trControl = trainControl(method = 'none', verboseIter = FALSE))

postResample(predict(model_back, df), df$Y)
##      RMSE  Rsquared       MAE 
## 0.9314956 0.9569843 0.7488821
summary(model_back$finalModel)
## 
## Call:
## NULL
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8914  -0.5860  -0.1516   0.5892   2.1794  
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      6.10265    0.09557  63.856  < 2e-16 ***
## `poly(X, 10)1`  -7.19295    0.95569  -7.526 2.96e-11 ***
## `poly(X, 10)2`  40.74405    0.95569  42.633  < 2e-16 ***
## `poly(X, 10)3` -14.70908    0.95569 -15.391  < 2e-16 ***
## `poly(X, 10)5`   1.48019    0.95569   1.549    0.125    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.9133516)
## 
##     Null deviance: 2017.132  on 99  degrees of freedom
## Residual deviance:   86.768  on 95  degrees of freedom
## AIC: 281.59
## 
## Number of Fisher Scoring iterations: 2

The backward stepwise model agrees with the best subset model.

x_poly <- poly(df$X, 10)

colnames(x_poly) <- paste0('poly', 1:10)
model_forw <- train(y = Y, x = x_poly,
                    method = 'glmStepAIC', direction = 'forward',
                    trace = 0,
                    trControl = trainControl(method = 'none', verboseIter = FALSE))

postResample(predict(model_forw, data.frame(x_poly)), df$Y)
##      RMSE  Rsquared       MAE 
## 0.9314956 0.9569843 0.7488821
summary(model_forw$finalModel)
## 
## Call:
## NULL
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8914  -0.5860  -0.1516   0.5892   2.1794  
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   6.10265    0.09557  63.856  < 2e-16 ***
## poly2        40.74405    0.95569  42.633  < 2e-16 ***
## poly3       -14.70908    0.95569 -15.391  < 2e-16 ***
## poly1        -7.19295    0.95569  -7.526 2.96e-11 ***
## poly5         1.48019    0.95569   1.549    0.125    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.9133516)
## 
##     Null deviance: 2017.132  on 99  degrees of freedom
## Residual deviance:   86.768  on 95  degrees of freedom
## AIC: 281.59
## 
## Number of Fisher Scoring iterations: 2

The forward stepwise model agrees as well.
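
For comparison with part (c), the same selections can be made with regsubsets() itself via its method argument (a sketch using the leaps API):

fit_fwd <- regsubsets(Y ~ poly(X, 10), data = df, nvmax = 10, method = 'forward')
fit_bwd <- regsubsets(Y ~ poly(X, 10), data = df, nvmax = 10, method = 'backward')
c(forward = which.min(summary(fit_fwd)$bic),
  backward = which.min(summary(fit_bwd)$bic))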

(e) Now fit a lasso model to the simulated data, again using X, X², …, X¹⁰ as predictors. Use cross-validation to select the optimal value of λ. Create plots of the cross-validation error as a function of λ. Report the resulting coefficient estimates, and discuss the results obtained.

lasso_model <- train(Y ~ poly(X, 10), data = df,
                     method = 'glmnet',
                     trControl = trainControl(method = 'cv', number = 10),
                     tuneGrid = expand.grid(alpha = 1,
                                            lambda = seq(0.001, 0.2, by = 0.005)))

plot(lasso_model)

plot(varImp(lasso_model))

coef(lasso_model$finalModel, lasso_model$bestTune$lambda)
## 11 x 1 sparse Matrix of class "dgCMatrix"
##                        s1
## (Intercept)     6.1026472
## poly(X, 10)1   -6.6829512
## poly(X, 10)2   40.2340466
## poly(X, 10)3  -14.1990830
## poly(X, 10)4    0.7470950
## poly(X, 10)5    0.9701884
## poly(X, 10)6    .        
## poly(X, 10)7    .        
## poly(X, 10)8    .        
## poly(X, 10)9    .        
## poly(X, 10)10  -0.4412295
postResample(predict(lasso_model, df), df$Y)
##      RMSE  Rsquared       MAE 
## 0.9265197 0.9577192 0.7566424
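
The same fit can also be obtained directly with glmnet's built-in cross-validation (a sketch; assumes the glmnet package, which caret's 'glmnet' method already depends on):

require(glmnet)
x_mat  <- poly(df$X, 10)                      # orthogonal polynomial basis
cv_out <- cv.glmnet(x_mat, df$Y, alpha = 1)   # 10-fold CV by default; alpha = 1 is the lasso
plot(cv_out)                                  # CV error as a function of log(lambda)
coef(cv_out, s = 'lambda.min')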

(f) Now generate a response vector Y according to the model Y = β₀ + β₇X⁷ + ϵ, and perform best subset selection and the lasso. Discuss the results obtained.

Y_7 <- 3 + 8*X^7 + noise  # β₀ = 3, β₇ = 8
df_2 <- tibble(Y_7 = Y_7, X = df$X)

fit <- regsubsets(Y_7 ~ poly(X, 10), data = df_2, nvmax = 10)

fit_summary <- summary(fit)

tibble(Cp = fit_summary$cp,
       BIC = fit_summary$bic,
       AdjR2 = fit_summary$adjr2) %>%
    mutate(id = row_number()) %>%
    gather(value_type, value, -id) %>%
    ggplot(aes(id, value, col = value_type)) +
    geom_line() + geom_point() + ylab('') + xlab('Number of Variables Used') +
    facet_wrap(~ value_type, scales = 'free') +
    theme_tufte() + scale_x_continuous(breaks = 1:10)

lasso_y7_model <- train(Y_7 ~ poly(X, 10), data = df_2,
                        method = 'glmnet', 
                        trControl = trainControl(method = 'cv', number = 5),
                        tuneGrid = expand.grid(alpha = 1, 
                                               lambda = seq(0.001, 0.2, by = 0.005)))

plot(lasso_y7_model)

plot(varImp(lasso_y7_model))

coef(lasso_y7_model$finalModel, lasso_y7_model$bestTune$lambda)
## 11 x 1 sparse Matrix of class "dgCMatrix"
##                       s1
## (Intercept)     36.72505
## poly(X, 10)1  2630.24239
## poly(X, 10)2  1059.42846
## poly(X, 10)3  3491.14380
## poly(X, 10)4   597.14287
## poly(X, 10)5  1363.96802
## poly(X, 10)6    70.93052
## poly(X, 10)7   177.79560
## poly(X, 10)8     .      
## poly(X, 10)9     .      
## poly(X, 10)10    .
postResample(predict(lasso_y7_model, df_2), df_2$Y_7)
##       RMSE   Rsquared        MAE 
## 14.2854375  0.9996164  4.9531386
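
Note that poly() builds an orthogonal basis, so the pure X⁷ signal is spread across several of the orthogonal terms, which is why the lasso above retains seven of them. With raw powers of X the sparse truth is easier to recover (a sketch; assumes glmnet is loaded as above):

x_raw  <- poly(df_2$X, 10, raw = TRUE)           # raw, non-orthogonal powers
cv_raw <- cv.glmnet(x_raw, df_2$Y_7, alpha = 1)
coef(cv_raw, s = 'lambda.min')                   # the X^7 term should dominate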

No. 9

In this exercise, we will predict the number of applications received using the other variables in the College data set.

(a) Split the data set into a training set and a test set.

require(ISLR)
require(caret)
require(tidyverse)
data('College')
set.seed(1)

inTrain <- createDataPartition(College$Apps, p = 0.75, list = FALSE)

training <- College[inTrain,]
testing <- College[-inTrain,]

preObj <- preProcess(training, method = c('center', 'scale'))

training <- predict(preObj, training)
testing <- predict(preObj, testing)

y_train <- training$Apps
y_test <- testing$Apps

one_hot_encoding <- dummyVars(Apps ~ ., data = training)
x_train <- predict(one_hot_encoding, training)
x_test <- predict(one_hot_encoding, testing)

(b) Fit a linear model using least squares on the training set, and report the test error obtained.

lin_model <- lm(Apps ~ ., data = training)

pred <- predict(lin_model, testing)

(lin_info <- postResample(pred, testing$Apps))
##      RMSE  Rsquared       MAE 
## 0.2799768 0.9201765 0.1568743
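
Because Apps was centered and scaled in part (a), the RMSE above is in standard-deviation units. To express it in raw application counts it can be rescaled by the training-set standard deviation (a sketch; assumes preObj$std stores the scaling factors, as caret's preProcess does for method = 'scale'):

# Test RMSE back on the original Apps scale
lin_info[['RMSE']] * preObj$std[['Apps']]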

(c) Fit a ridge regression model on the training set, with λ chosen by cross-validation. Report the test error obtained.

ridge_fit <- train(x = x_train, y = y_train,
                   method = 'glmnet', 
                   trControl = trainControl(method = 'cv', number = 10),
                   tuneGrid = expand.grid(alpha = 0,
                                          lambda = seq(0, 10e2, length.out = 20)))
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
(ridge_info <- postResample(predict(ridge_fit, x_test), y_test))
##      RMSE  Rsquared       MAE 
## 0.2853247 0.9211286 0.1645806
coef(ridge_fit$finalModel, ridge_fit$bestTune$lambda)
## 19 x 1 sparse Matrix of class "dgCMatrix"
##                       s1
## (Intercept)  0.034871314
## Private.No   0.075423210
## Private.Yes -0.076037580
## Accept       0.665628735
## Enroll       0.090243371
## Top10perc    0.107160248
## Top25perc    0.011628030
## F.Undergrad  0.063308800
## P.Undergrad  0.017427317
## Outstate    -0.028995432
## Room.Board   0.048720533
## Books        0.012799145
## Personal    -0.002894430
## PhD         -0.017989250
## Terminal    -0.010434665
## S.F.Ratio    0.006920126
## perc.alumni -0.031683867
## Expend       0.083525070
## Grad.Rate    0.058131023
plot(ridge_fit)

plot(varImp(ridge_fit))
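
The λ actually selected by cross-validation can be read from the tuning results (a sketch):

# Lambda chosen by 10-fold CV
ridge_fit$bestTune$lambda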

(d) Fit a lasso model on the training set, with λ chosen by cross-validation. Report the test error obtained, along with the number of non-zero coefficient estimates.

lasso_fit <- train(x = x_train, y = y_train, 
                   method = 'glmnet',
                   trControl = trainControl(method = 'cv', number = 10),
                   tuneGrid = expand.grid(alpha = 1,
                                          lambda = seq(0.0001, 1, length.out = 50)))
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
(lasso_info <- postResample(predict(lasso_fit, x_test), y_test))
##      RMSE  Rsquared       MAE 
## 0.2914812 0.9141364 0.1511801
coef(lasso_fit$finalModel, lasso_fit$bestTune$lambda)
## 19 x 1 sparse Matrix of class "dgCMatrix"
##                        s1
## (Intercept) -1.470609e-02
## Private.No   5.410732e-02
## Private.Yes -1.041607e-14
## Accept       8.883839e-01
## Enroll       .           
## Top10perc    1.092201e-01
## Top25perc    .           
## F.Undergrad  .           
## P.Undergrad  .           
## Outstate     .           
## Room.Board   1.483337e-04
## Books        .           
## Personal     .           
## PhD          .           
## Terminal     .           
## S.F.Ratio    .           
## perc.alumni  .           
## Expend       4.067011e-02
## Grad.Rate    4.062729e-06
plot(lasso_fit)

plot(varImp(lasso_fit))
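
The number of non-zero coefficient estimates, also requested in part (d), can be counted directly on the sparse coefficient matrix (a sketch):

# Non-zero coefficients at the selected lambda (intercept excluded)
lasso_coefs <- coef(lasso_fit$finalModel, lasso_fit$bestTune$lambda)
sum(lasso_coefs != 0) - 1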

(e) Fit a PCR model on the training set, with M chosen by cross-validation. Report the test error obtained, along with the value of M selected by cross-validation.

pcr_model <- train(x = x_train, y = y_train,
                   method = 'pcr',
                   trControl = trainControl(method = 'cv', number = 10),
                   tuneGrid = expand.grid(ncomp = 1:10))
(pcr_info <- postResample(predict(pcr_model, x_test), y_test))
##      RMSE  Rsquared       MAE 
## 0.3231292 0.8916531 0.1986075
coef(pcr_model$finalModel)
## , , 10 comps
## 
##                 .outcome
## Private.No   0.031985972
## Private.Yes -0.031985972
## Accept       0.343576750
## Enroll       0.305359773
## Top10perc    0.042630417
## Top25perc    0.027790893
## F.Undergrad  0.273818439
## P.Undergrad -0.049487667
## Outstate     0.038573119
## Room.Board   0.070607615
## Books        0.016433593
## Personal    -0.023529455
## PhD         -0.023992433
## Terminal    -0.024182230
## S.F.Ratio    0.003741623
## perc.alumni -0.070567887
## Expend       0.090126298
## Grad.Rate    0.071302714
plot(pcr_model)

plot(varImp(pcr_model))
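
The value of M selected by cross-validation can be reported from the tuning results (a sketch):

# Number of principal components chosen by CV
pcr_model$bestTune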

(f) Fit a PLS model on the training set, with M chosen by cross-validation. Report the test error obtained, along with the value of M selected by cross-validation.

pls_model <- train(x = x_train, y = y_train,
                   method = 'pls',
                   trControl = trainControl(method = 'cv', number = 10),
                   tuneGrid = expand.grid(ncomp = 1:10))
(pls_info <- postResample(predict(pls_model, x_test), y_test))
##      RMSE  Rsquared       MAE 
## 0.2792580 0.9209302 0.1572165
coef(pls_model$finalModel)
## , , 10 comps
## 
##                 .outcome
## Private.No   0.071314109
## Private.Yes -0.071314109
## Accept       1.039263053
## Enroll      -0.169855531
## Top10perc    0.235572152
## Top25perc   -0.070270182
## F.Undergrad -0.022580399
## P.Undergrad  0.032522451
## Outstate    -0.085919942
## Room.Board   0.036344367
## Books        0.004835496
## Personal     0.007546695
## PhD         -0.044709439
## Terminal     0.002332025
## S.F.Ratio    0.010868271
## perc.alumni -0.009046689
## Expend       0.072963738
## Grad.Rate    0.037425850
plot(pls_model)
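
As with PCR, the selected M can be read from the tuning results (a sketch):

# Number of PLS components chosen by CV
pls_model$bestTune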

(g) Comment on the results obtained. How accurately can we predict the number of college applications received? Is there much difference among the test errors resulting from these five approaches?

as_tibble(rbind(lin_info,
                ridge_info,
                lasso_info,
                pcr_info,
                pls_info)) %>%
    mutate(model = c('Linear', 'Ridge', 'Lasso', 'PCR', 'PLS')) %>%
    select(model, RMSE, Rsquared)

testing %>%
    summarize(sd = sd(Apps))

All of the models perform similarly except PCR: the other four reach R² ≥ 0.91 and RMSE ≤ 0.30 on the standardized scale, while PCR trails slightly (R² ≈ 0.89). Comparing these RMSE values against the standard deviation of the response variable, all of the models are remarkably accurate!

require(ggthemes)

# Test-set residuals for a fitted model
residfunc <- function(fit, data) {
    predict(fit, data) - testing$Apps
}

tibble(Observed = testing$Apps,
       LM = residfunc(lin_model, testing),
       Ridge = residfunc(ridge_fit, x_test),
       Lasso = residfunc(lasso_fit, x_test),
       PCR = residfunc(pcr_model, x_test),
       PLS = residfunc(pls_model, x_test)) %>%
    gather(Model, Residuals, -Observed) %>%
    ggplot(aes(Observed, Residuals, col = Model)) +
    geom_hline(yintercept = 0, lty = 2) +
    geom_point(alpha = 0.6) +
    geom_smooth(method = 'loess', alpha = 0.01, col = 'lightsalmon3') +
    facet_wrap(~ Model, ncol = 5) +
    theme_tufte() +
    theme(legend.position = 'top') +
    coord_flip()
## Warning: attributes are not identical across measure variables; they will be
## dropped
## `geom_smooth()` using formula = 'y ~ x'
