4-1 Please explain how to use the K-Fold cross-validation to select the optimal number of variables in forward selection regression?

使用 forward selection regression 可以生成 p+1 個模型 \(\mu_1\sim\mu_{p+1}\) 其中 \(\mu_p\subset\mu_{p+1}\)\(\mu_1\subset\mu_2\subset...\subset\mu_{p+1}\) 為槽套模型 (nested models),使用 K-Fold cross-validation 找到最適合的 p 是多少,將 p+1 個模型中樣本數分為 K 份,取第 1 份為 testing set,剩餘為 training set,再來使用第 2 份作為 testing set,剩餘為 training set,直到所有等份都做過一次 testing set 為止,取其中最好的 error 代表 \(\mu_p\) 的 error,依次進行 p+1 次得到所有的 error 後選擇最好的那一個模型。


4-2 Write an R code to implement the K-Fold cross-validation for forward selection using the basesball data according to the model

"salary ~ G + AB + R + H + X2B + X3B + HR + RBI + SB + CS +
BB + SO + IBB + HBP + SH + SF + GIDP + years_MLB"
library(dplyr)
library(Lahman)
library(leaps)
## 先把需要用到的 col 放在一起
tbl_s <- Salaries %>% tbl_df() %>% filter(yearID == 2015) %>%
select(-yearID) %>% mutate(salary = salary/10000)
tbl_b <- Batting %>% tbl_df() %>% filter(yearID == 2014) %>%
select(-yearID, -stint, -teamID, -lgID) %>% group_by(playerID) %>%
summarise_all(funs(sum))
tbl_m <- Master %>% tbl_df() %>%
mutate(years_MLB = as.integer(as.Date("2014-10-29") - as.Date(debut)) / 365)
tbl_baseball <- tbl_s %>% left_join(tbl_b, by = "playerID") %>%
left_join(tbl_m, by = "playerID") %>% select(G:GIDP, years_MLB, salary) %>%
mutate_all(.funs = funs(replace(., is.na(.), 0)))
tbl_baseball
  # A tibble: 817 x 19
         G    AB     R     H   X2B   X3B    HR   RBI    SB    CS    BB    SO
     <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
   1   25.   70.    9.   14.    2.    0.    1.    4.    0.    1.    3.   10.
   2   22.   34.    0.    1.    0.    0.    0.    0.    0.    0.    0.   14.
   3    3.    2.    0.    1.    0.    0.    0.    1.    0.    0.    0.    1.
   4   33.   54.    2.    6.    2.    0.    0.    2.    0.    0.    3.   15.
   5    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
   6   19.    2.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
   7   47.    9.    0.    1.    0.    0.    0.    0.    0.    0.    0.    4.
   8  109.  406.   75.  122.   39.    1.   19.   69.    9.    3.   64.  110.
   9   41.  129.    6.   29.    8.    0.    1.    7.    0.    0.    3.   24.
  10   13.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  # ... with 807 more rows, and 7 more variables: IBB <dbl>, HBP <dbl>,
  #   SH <dbl>, SF <dbl>, GIDP <dbl>, years_MLB <dbl>, salary <dbl>
predict.regsubsets = function(object, newdata, id, ...) {
    form = as.formula(object$call[[2]])
    mat = model.matrix(form, newdata)
    coefi = coef(object, id = id)
    mat[, names(coefi)] %*% coefi
}
## 使用 forward selection 針對 salary 做預測
set.seed(11)
folds = sample(rep(1:10, length = nrow(tbl_baseball)))
cv.errors = matrix(NA, 10, 18)
for (k in 1:10) {
    best.fit = regsubsets(salary ~ ., data = tbl_baseball[folds != k, ], nvmax = 18, 
        method = "forward")
    for (i in 1:18) {
        pred = predict(best.fit, tbl_baseball[folds == k, ], id = i)
        cv.errors[k, i] = mean((tbl_baseball$salary[folds == k] - pred)^2)
    }
}
rmse.cv = sqrt(apply(cv.errors, 2, mean))
plot(rmse.cv, pch = 18, type = "b")

有由此圖可得當 P=6,時有最好的結果