Author: Jay Liao (ID: RE6094028)

Load the required packages

library(dplyr)     # data-manipulation
library(leaps)     # subsets regression
library(tidyverse) # for data-manipulation
library(ggplot2)   # data visualization
library(ggthemes)  # data visualization
library(caret)     # classification and regression training

Exercise 6.8

In this exercise, we will generate simulated data, and will then use this data to perform best subset selection.

Exercise 6.8 - (a)

Use the rnorm() function to generate a predictor \(X\) of length \(n = 100\), as well as a noise vector \(\epsilon\) of length n = 100.

set.seed(12345)
X <- rnorm(100)
e <- rnorm(100)

Exercise 6.8 - (b)

Generate a response vector \(Y\) of length \(n = 100\) according to the model

\(Y = \beta_0 + \beta_1X + \beta_2 X^2 + \beta_3 X^3 + \epsilon\),

where \(\beta_0\), \(\beta_1\), \(\beta_2\), and \(\beta_3\) are constants of your choice.

Y <- 1 + 2*X + 3*X^2 + 4*X^3 + e

Exercise 6.8 - (c)

Use the regsubsets() function to perform best subset selection in order to choose the best model containing the predictors \(X\), \(X^2\), …, \(X^{10}\). What is the best model obtained according to \(C_p\), BIC, and adjusted \(R^2\)? Show some plots to provide evidence for your answer, and report the coefficients of the best model obtained. Note you will need to use the data.frame() function to create a single data set containing both \(X\) and \(Y\).

Construct the data set for visualization

df <- data.frame(X, Y)
fit <- df %>% regsubsets(Y ~ poly(X, 10), data = .)
fit_summary <- summary(fit); fit_summary
Subset selection object
Call: function_list[[k]](value)
10 Variables  (and intercept)
              Forced in Forced out
poly(X, 10)1      FALSE      FALSE
poly(X, 10)2      FALSE      FALSE
poly(X, 10)3      FALSE      FALSE
poly(X, 10)4      FALSE      FALSE
poly(X, 10)5      FALSE      FALSE
poly(X, 10)6      FALSE      FALSE
poly(X, 10)7      FALSE      FALSE
poly(X, 10)8      FALSE      FALSE
poly(X, 10)9      FALSE      FALSE
poly(X, 10)10     FALSE      FALSE
1 subsets of each size up to 8
Selection Algorithm: exhaustive
         poly(X, 10)1 poly(X, 10)2 poly(X, 10)3 poly(X, 10)4 poly(X, 10)5
1  ( 1 ) "*"          " "          " "          " "          " "         
2  ( 1 ) "*"          " "          "*"          " "          " "         
3  ( 1 ) "*"          "*"          "*"          " "          " "         
4  ( 1 ) "*"          "*"          "*"          " "          "*"         
5  ( 1 ) "*"          "*"          "*"          " "          "*"         
6  ( 1 ) "*"          "*"          "*"          " "          "*"         
7  ( 1 ) "*"          "*"          "*"          "*"          "*"         
8  ( 1 ) "*"          "*"          "*"          "*"          "*"         
         poly(X, 10)6 poly(X, 10)7 poly(X, 10)8 poly(X, 10)9 poly(X, 10)10
1  ( 1 ) " "          " "          " "          " "          " "          
2  ( 1 ) " "          " "          " "          " "          " "          
3  ( 1 ) " "          " "          " "          " "          " "          
4  ( 1 ) " "          " "          " "          " "          " "          
5  ( 1 ) " "          "*"          " "          " "          " "          
6  ( 1 ) " "          "*"          " "          "*"          " "          
7  ( 1 ) " "          "*"          " "          "*"          " "          
8  ( 1 ) " "          "*"          "*"          "*"          " "          
df_plt <- data.frame(adj_Rsq = fit_summary$adjr2,
                     BIC = fit_summary$bic, Cp = fit_summary$cp)

Visualize the criterion index

df_plt %>% mutate(id = row_number()) %>%
  gather(value_type, value, -id) %>%
  ggplot(aes(id, value, col=value_type)) +
  facet_wrap(~ value_type, scales = 'free') +
  scale_x_continuous(breaks = 1:10) +
  geom_line() +
  geom_point() + 
  ylab('Value of The Index') +
  xlab('Number of Variables Used') +
  ggtitle('Visualization of Index of Goodness of Fit of Different No. of Used Variables') +
  theme_bw() + theme(legend.position = 'None')

看起來regsubsets的方式建議3為最佳參數數目,與我們設定的Y的成分相符。

Exercise 6.8 - (d)

Repeat (c), using forward step-wise selection and also using backwards step-wise selection. How does your answer compare to the results in (c)?

fit_bw <- df %>% regsubsets(Y ~ poly(X, 10), data = ., method = 'backward')
fit_bw_summary <- summary(fit_bw); fit_bw_summary
Subset selection object
Call: function_list[[k]](value)
10 Variables  (and intercept)
              Forced in Forced out
poly(X, 10)1      FALSE      FALSE
poly(X, 10)2      FALSE      FALSE
poly(X, 10)3      FALSE      FALSE
poly(X, 10)4      FALSE      FALSE
poly(X, 10)5      FALSE      FALSE
poly(X, 10)6      FALSE      FALSE
poly(X, 10)7      FALSE      FALSE
poly(X, 10)8      FALSE      FALSE
poly(X, 10)9      FALSE      FALSE
poly(X, 10)10     FALSE      FALSE
1 subsets of each size up to 8
Selection Algorithm: backward
         poly(X, 10)1 poly(X, 10)2 poly(X, 10)3 poly(X, 10)4 poly(X, 10)5
1  ( 1 ) "*"          " "          " "          " "          " "         
2  ( 1 ) "*"          " "          "*"          " "          " "         
3  ( 1 ) "*"          "*"          "*"          " "          " "         
4  ( 1 ) "*"          "*"          "*"          " "          "*"         
5  ( 1 ) "*"          "*"          "*"          " "          "*"         
6  ( 1 ) "*"          "*"          "*"          " "          "*"         
7  ( 1 ) "*"          "*"          "*"          "*"          "*"         
8  ( 1 ) "*"          "*"          "*"          "*"          "*"         
         poly(X, 10)6 poly(X, 10)7 poly(X, 10)8 poly(X, 10)9 poly(X, 10)10
1  ( 1 ) " "          " "          " "          " "          " "          
2  ( 1 ) " "          " "          " "          " "          " "          
3  ( 1 ) " "          " "          " "          " "          " "          
4  ( 1 ) " "          " "          " "          " "          " "          
5  ( 1 ) " "          "*"          " "          " "          " "          
6  ( 1 ) " "          "*"          " "          "*"          " "          
7  ( 1 ) " "          "*"          " "          "*"          " "          
8  ( 1 ) " "          "*"          "*"          "*"          " "          
df_plt <- data.frame(adj_Rsq = fit_bw_summary$adjr2,
                     BIC = fit_bw_summary$bic, Cp = fit_bw_summary$cp)

df_plt %>% mutate(id = row_number()) %>%
  gather(value_type, value, -id) %>%
  ggplot(aes(id, value, col=value_type)) +
  facet_wrap(~ value_type, scales = 'free') +
  scale_x_continuous(breaks = 1:10) +
  geom_line() +
  geom_point() + 
  labs(x = 'Number of Variables Used', y = 'Value of The Index') +
  ggtitle('Visualization of Index of Goodness of Fit of Different No. of Used Variables') +
  theme_bw() + theme(legend.position = 'None')

看起來Backward的方式也是建議參數數目為3,與我們設定的Y的成分相符。

Exercise 6.8 - (e)

Now fit a lasso model to the simulated data, again using \(X\), \(X^2\), …, \(X^{10}\) as predictors. Use cross-validation to select the optimal value of \(\lambda\). Create plots of the cross-validation error as a function of \(\lambda\). Report the resulting coefficient estimates, and discuss the results obtained.

Fitting

fit_lasso <- df %>% train(Y ~ poly(X, 10), data=., method='glmnet',
                          trControl = trainControl(method = 'cv', number = 10),
                          tuneGrid = expand.grid(alpha = 1, lambda = seq(.001, .2, by = .005)))

Visualize the relationship between \(\lambda\) and RMSE

fit_lasso %>% plot()

隨著懲罰項係數上升,模型表現也跟著變差(error上升)。

Find the importances and coefficents of variables with the best-tuned \(\lambda\)

fit_lasso$bestTune
coef(fit_lasso$finalModel, fit_lasso$bestTune$lambda)
11 x 1 sparse Matrix of class "dgCMatrix"
                        1
(Intercept)     8.2964666
poly(X, 10)1  169.5687399
poly(X, 10)2   62.7212841
poly(X, 10)3   78.6681420
poly(X, 10)4    .        
poly(X, 10)5    1.8285544
poly(X, 10)6    .        
poly(X, 10)7   -0.9151879
poly(X, 10)8    .        
poly(X, 10)9    0.1652270
poly(X, 10)10   .        

LASSO模型剔除了4個term。

fit_lasso %>% varImp() %>% plot()

一次方、三次方與平方項比較重要。五次方、七次方與九次方重要度不高,但也被lasso模型選到了。使用Cross-Validation去tune lambda後得到的lasso模型似乎高估了需要的預測因子數目,也就是說抓進了一些不太重要的變項,這可能是因為lasso在選變項時是透過殘差平方和來選,不像regsubsets有考慮BIC與Adjusted R square等指標。

Exercise 6.8 - (f)

Now generate a response vector \(Y\) according to the model \(Y = \beta_0 + \beta_7 X^7 + \epsilon\), and perform best subset selection and the lasso. Discuss the results obtained.

Construct the new dataset

Y_new <- 1 + 8*X^7 + e
df_new <- data.frame(Y_new, X)
fit_new <- regsubsets(Y_new ~ poly(X, 10), data= df_new)
fit_new_summary <- summary(fit_new)
df_plt <- data.frame(adj_Rsq = fit_new_summary$adjr2,
                     BIC = fit_new_summary$bic, Cp = fit_new_summary$cp)

Visualize the criterion index

df_plt %>% mutate(id = row_number()) %>%
  gather(value_type, value, -id) %>%
  ggplot(aes(id, value, col=value_type)) + 
  geom_line() +
  geom_point() +
  facet_wrap(~ value_type, scales = 'free') +
  scale_x_continuous(breaks = 1:10) +
  labs(x = 'Number of Variables Used', y = 'Value of The Index') +
  ggtitle('Visualization of Index of Goodness of Fit of Different No. of Used Variables') +
  theme_bw() + theme(legend.position = 'None')

看起來是建議參數數目為7。

Fit the lasso model with cross-validation tuning

Fitting

fit_lasso_new <- df_new %>%
  train(Y_new ~ poly(X, 10), data= . , method = 'glmnet', 
        trControl = trainControl(method = 'cv', number = 5),
        tuneGrid = expand.grid(alpha = 1, lambda = seq(.001, .2, by = .005)))

Visualize the relationship between \(\lambda\) and RMSE

fit_lasso_new %>% plot()

看起來使用不同的\(\lambda\)的模型RMSE沒有什麼差異。

Find the importances and coefficents of variables with the best-tuned \(\lambda\)

fit_lasso_new$bestTune
coef(fit_lasso_new$finalModel, fit_lasso_new$bestTune$lambda)
11 x 1 sparse Matrix of class "dgCMatrix"
                       1
(Intercept)    103.03241
poly(X, 10)1  5080.71210
poly(X, 10)2   923.96159
poly(X, 10)3  5479.48139
poly(X, 10)4   423.11388
poly(X, 10)5  2337.41054
poly(X, 10)6    80.03282
poly(X, 10)7   226.10727
poly(X, 10)8     .      
poly(X, 10)9     .      
poly(X, 10)10    .      

LASSO模型剔除了3個term。

fit_lasso_new %>% varImp() %>% plot()

Exercise 6.9

In this exercise, we will predict the number of applications received using the other variables in the College data set.

library(ISLR)
data('College')
summary(College)
 Private        Apps           Accept          Enroll       Top10perc    
 No :212   Min.   :   81   Min.   :   72   Min.   :  35   Min.   : 1.00  
 Yes:565   1st Qu.:  776   1st Qu.:  604   1st Qu.: 242   1st Qu.:15.00  
           Median : 1558   Median : 1110   Median : 434   Median :23.00  
           Mean   : 3002   Mean   : 2019   Mean   : 780   Mean   :27.56  
           3rd Qu.: 3624   3rd Qu.: 2424   3rd Qu.: 902   3rd Qu.:35.00  
           Max.   :48094   Max.   :26330   Max.   :6392   Max.   :96.00  
   Top25perc      F.Undergrad     P.Undergrad         Outstate    
 Min.   :  9.0   Min.   :  139   Min.   :    1.0   Min.   : 2340  
 1st Qu.: 41.0   1st Qu.:  992   1st Qu.:   95.0   1st Qu.: 7320  
 Median : 54.0   Median : 1707   Median :  353.0   Median : 9990  
 Mean   : 55.8   Mean   : 3700   Mean   :  855.3   Mean   :10441  
 3rd Qu.: 69.0   3rd Qu.: 4005   3rd Qu.:  967.0   3rd Qu.:12925  
 Max.   :100.0   Max.   :31643   Max.   :21836.0   Max.   :21700  
   Room.Board       Books           Personal         PhD        
 Min.   :1780   Min.   :  96.0   Min.   : 250   Min.   :  8.00  
 1st Qu.:3597   1st Qu.: 470.0   1st Qu.: 850   1st Qu.: 62.00  
 Median :4200   Median : 500.0   Median :1200   Median : 75.00  
 Mean   :4358   Mean   : 549.4   Mean   :1341   Mean   : 72.66  
 3rd Qu.:5050   3rd Qu.: 600.0   3rd Qu.:1700   3rd Qu.: 85.00  
 Max.   :8124   Max.   :2340.0   Max.   :6800   Max.   :103.00  
    Terminal       S.F.Ratio      perc.alumni        Expend     
 Min.   : 24.0   Min.   : 2.50   Min.   : 0.00   Min.   : 3186  
 1st Qu.: 71.0   1st Qu.:11.50   1st Qu.:13.00   1st Qu.: 6751  
 Median : 82.0   Median :13.60   Median :21.00   Median : 8377  
 Mean   : 79.7   Mean   :14.09   Mean   :22.74   Mean   : 9660  
 3rd Qu.: 92.0   3rd Qu.:16.50   3rd Qu.:31.00   3rd Qu.:10830  
 Max.   :100.0   Max.   :39.80   Max.   :64.00   Max.   :56233  
   Grad.Rate     
 Min.   : 10.00  
 1st Qu.: 53.00  
 Median : 65.00  
 Mean   : 65.46  
 3rd Qu.: 78.00  
 Max.   :118.00  

Exercise 6.9 - (a)

Split the data set into a training set and a test set.

set.seed(6094028)
idx_train <- sample(c(T, F), nrow(College), replace = T, prob = c(.75, .25))

Exercise 6.9 - (b)

Fit a linear model using least squares on the training set, and report the test error obtained.

fit_linear <- lm(Apps ~ ., data = College[idx_train,])
pred <- predict(fit_linear, College[!idx_train,])
evaluation_linear <- postResample(pred, College[!idx_train, 'Apps'])

The testing MAE is 611.4085141 and the testing RMSE is 1030.7223369.

Exercise 6.9 - (c)

Fit a ridge regression model on the training set, with \(\lambda\) chosen by cross-validation. Report the test error obtained.

Construct the one-hot-encoding data set and target variable

one_hot_encoding <- dummyVars(Apps ~ ., data = College[idx_train,])
X_train <- predict(one_hot_encoding, College[idx_train,])
X_test <- predict(one_hot_encoding, College[!idx_train,])
y_train <- College[idx_train, 'Apps']
y_test <- College[!idx_train, 'Apps']

Fit the ridge model

fit_ridge <- train(x = X_train, y = y_train, method = 'glmnet', 
                   trControl = trainControl(method = 'cv', number = 5),
                   tuneGrid = expand.grid(alpha = 0, lambda = seq(0, 100, length.out = 20)))

The testing error and coefficients

evaluation_ridge <- postResample(predict(fit_ridge, X_test), y_test); evaluation_ridge 
       RMSE    Rsquared         MAE 
979.7027036   0.9309277 612.7301156 
coef(fit_ridge$finalModel, fit_ridge$bestTune$lambda)
19 x 1 sparse Matrix of class "dgCMatrix"
                        1
(Intercept) -1.581833e+03
Private.No   2.077000e+02
Private.Yes -2.098976e+02
Accept       1.073383e+00
Enroll       3.646862e-01
Top10perc    2.619772e+01
Top25perc    2.160353e+00
F.Undergrad  7.185480e-02
P.Undergrad  1.318617e-02
Outstate    -3.283413e-02
Room.Board   1.597394e-01
Books        1.234186e-01
Personal    -3.658509e-02
PhD         -5.371638e+00
Terminal    -3.230207e+00
S.F.Ratio    1.379503e+01
perc.alumni -1.005194e+01
Expend       8.257808e-02
Grad.Rate    1.002276e+01

The testing MAE is 612.7301156 and the testing RMSE is 979.7027036.

Visualization

The relationship between regularization parameter and testing error (RMSE)

plot(fit_ridge)

看起來使用不同的\(\lambda\)的模型的RMSE沒有什麼差異。

Importances of variables

plot(varImp(fit_ridge))

Exercise 6.9 - (d)

Fit a lasso model on the training set, with \(\lambda\) chosen by cross- validation. Report the test error obtained, along with the number of non-zero coefficient estimates.

Fit the lasso model

fit_lasso <- train(x = X_train, y = y_train, method = 'glmnet',
                   trControl = trainControl(method = 'cv', number = 5),
                   tuneGrid = expand.grid(alpha = 1, lambda = seq(.0001, 1, length.out = 50)))

The testing error and coefficients

evaluation_lasso <- postResample(predict(fit_lasso, X_test), y_test); evaluation_lasso
        RMSE     Rsquared          MAE 
1027.2269983    0.9239156  606.9610529 
coef(fit_lasso$finalModel, fit_lasso$bestTune$lambda)
19 x 1 sparse Matrix of class "dgCMatrix"
                        1
(Intercept) -781.67952389
Private.No   419.83895404
Private.Yes    .         
Accept         1.66142516
Enroll        -0.94162973
Top10perc     49.60322883
Top25perc    -11.14737947
F.Undergrad    0.03449318
P.Undergrad    0.05300442
Outstate      -0.08568760
Room.Board     0.10625680
Books         -0.12740072
Personal       0.05395446
PhD          -11.07352793
Terminal       .         
S.F.Ratio     15.39056097
perc.alumni   -1.61032595
Expend         0.08012070
Grad.Rate      6.66500806

The testing MAE is 606.9610529 and the testing RMSE is 1027.2269983. Lasso模型剔除掉了2個變項。

Visualization

The relationship between regularization parameter and testing error (RMSE)

plot(fit_lasso)

看起來使用不同的\(\lambda\)的模型的RMSE沒有什麼差異。

Importances of variables

plot(varImp(fit_lasso))

看起來模型還是納入了不少個重要度其實不高的變項。

Exercise 6.9 - (e)

Fit a PCR model on the training set, with \(M\) chosen by cross-validation. Report the test error obtained, along with the value of \(M\) selected by cross-validation.

Fit the PCR model

library(pls)
fit_pcr <- train(x = X_train, y = y_train, method = 'pcr',
                 trControl = trainControl(method = 'cv', number = 5),
                 tuneGrid = expand.grid(ncomp = 1:10))

Obtain the testing error and the coefficents

evaluation_pcr <- postResample(predict(fit_pcr, X_test), y_test); evaluation_pcr
        RMSE     Rsquared          MAE 
1094.9442366    0.9136148  649.0417305 
coef(fit_pcr$finalModel)
, , 10 comps

                .outcome
Private.No   0.027843009
Private.Yes -0.027843009
Accept       1.629715903
Enroll      -0.926598450
Top10perc    6.894928988
Top25perc    9.581846777
F.Undergrad  0.060999471
P.Undergrad  0.008371193
Outstate    -0.118171024
Room.Board   0.061073941
Books       -0.038977409
Personal     0.053082624
PhD          5.527407076
Terminal     4.539867018
S.F.Ratio    0.136606605
perc.alumni  2.059220504
Expend       0.114438495
Grad.Rate    4.126218690

The testing MAE is 649.0417305 and the testing RMSE is 1094.9442366.

Visualization

The relationship between regularization parameter and testing error (RMSE)

plot(fit_pcr)

隨著納入的成分數量上升,RMSE下降,下降幅度在納入成份數為2之後趨緩。

Importances of variables

plot(varImp(fit_pcr))

大部分的變項都有一定程度的重要度。

Exercise 6.9 - (f)

Fit a PLS model on the training set, with \(M\) chosen by cross-validation. Report the test error obtained, along with the value of \(M\) selected by cross-validation.

Fit the PLS model

fit_pls <- train(x = X_train, y = y_train, method = 'pls',
                 trControl = trainControl(method = 'cv', number = 5),
                 tuneGrid = expand.grid(ncomp = 1:10))

Obtain the testing error and the coefficents

evaluation_pls <- postResample(predict(fit_pls, X_test), y_test); evaluation_pls
        RMSE     Rsquared          MAE 
1077.4755794    0.9165545  657.5837815 
coef(fit_pls$finalModel)
, , 10 comps

               .outcome
Private.No   0.07062156
Private.Yes -0.07062156
Accept       1.64188666
Enroll      -0.96511195
Top10perc   15.29674948
Top25perc   12.73651695
F.Undergrad  0.06094583
P.Undergrad  0.03928566
Outstate    -0.12618431
Room.Board   0.08307624
Books       -0.18148013
Personal     0.06731608
PhD         -0.31407061
Terminal    -0.91469138
S.F.Ratio    0.55132450
perc.alumni  1.41035850
Expend       0.10287221
Grad.Rate    7.21181649

The testing MAE is 657.5837815 and the testing RMSE is 1077.4755794.

Visualization

The relationship between regularization parameter and testing error (RMSE)

plot(fit_pls)

隨著納入的成分數量上升,RMSE下降,下降幅度在納入成份數為4之後趨緩。

Importances of variables

plot(varImp(fit_pls))

大部分的變項都有一定程度的重要度。

Exercise 6.9 - (g)

Comment on the results obtained. How accurately can we predict the number of college applications received? Is there much difference among the test errors resulting from these five approaches?

evaluation_models <- rbind(evaluation_linear, evaluation_ridge, evaluation_lasso,
                           evaluation_pcr, evaluation_pls) %>% as.data.frame() %>%
  mutate(model = c('Linear', 'Ridge', 'Lasso', 'PCR', 'PLS')) %>%
  dplyr::select(model, RMSE, Rsquared, MAE)
mtx_RANK <- evaluation_models %>% dplyr::select(-model) %>% apply(., 2, rank)
mtx_RANK[, 'Rsquared'] <- 6 - mtx_RANK[, 'Rsquared']  # Reverse
evaluation_models$average_rank <- apply(mtx_RANK, 1, mean)
evaluation_models

以三個模型評估指標將五個模型排序(1為表現最佳,5為最差),並將rank取平均。看起來ridge與lasso模型表現最佳,而PCR表現最差。

evaluation_models %>% dplyr::select(RMSE, Rsquared, MAE) %>% chisq.test()

    Pearson's Chi-squared test

data:  .
X-squared = 0.94746, df = 8, p-value = 0.9986
evaluation_models %>% dplyr::select(RMSE, Rsquared, MAE) %>%
  apply(., 2, chisq.test)
$RMSE

    Chi-squared test for given probabilities

data:  newX[, i]
X-squared = 7.9538, df = 4, p-value = 0.09328


$Rsquared

    Chi-squared test for given probabilities

data:  newX[, i]
X-squared = 0.00020052, df = 4, p-value = 1


$MAE

    Chi-squared test for given probabilities

data:  newX[, i]
X-squared = 3.6141, df = 4, p-value = 0.4607

所有卡方檢定皆不顯著(\(\alpha = 0.05\)),顯示:

  1. Approach與evaluation index間沒有顯著關聯
  2. 不同approach得到的testing RMSE沒有顯著差異
  3. 不同approach得到的testing R square沒有顯著差異
  4. 不同approach得到的testing MAE沒有顯著差異