使用 forward selection regression 可以生成 p+1 個模型 \(\mu_1\sim\mu_{p+1}\) 其中 \(\mu_p\subset\mu_{p+1}\) 且 \(\mu_1\subset\mu_2\subset...\subset\mu_{p+1}\) 為槽套模型 (nested models),使用 K-Fold cross-validation 找到最適合的 p 是多少,將 p+1 個模型中樣本數分為 K 份,取第 1 份為 testing set,剩餘為 training set,再來使用第 2 份作為 testing set,剩餘為 training set,直到所有等份都做過一次 testing set 為止,取其中最好的 error 代表 \(\mu_p\) 的 error,依次進行 p+1 次得到所有的 error 後選擇最好的那一個模型。
"salary ~ G + AB + R + H + X2B + X3B + HR + RBI + SB + CS +
BB + SO + IBB + HBP + SH + SF + GIDP + years_MLB"
library(dplyr)
library(Lahman)
library(leaps)
## 先把需要用到的 col 放在一起
tbl_s <- Salaries %>% tbl_df() %>% filter(yearID == 2015) %>%
select(-yearID) %>% mutate(salary = salary/10000)
tbl_b <- Batting %>% tbl_df() %>% filter(yearID == 2014) %>%
select(-yearID, -stint, -teamID, -lgID) %>% group_by(playerID) %>%
summarise_all(funs(sum))
tbl_m <- Master %>% tbl_df() %>%
mutate(years_MLB = as.integer(as.Date("2014-10-29") - as.Date(debut)) / 365)
tbl_baseball <- tbl_s %>% left_join(tbl_b, by = "playerID") %>%
left_join(tbl_m, by = "playerID") %>% select(G:GIDP, years_MLB, salary) %>%
mutate_all(.funs = funs(replace(., is.na(.), 0)))
tbl_baseball
# A tibble: 817 x 19
G AB R H X2B X3B HR RBI SB CS BB SO
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 25. 70. 9. 14. 2. 0. 1. 4. 0. 1. 3. 10.
2 22. 34. 0. 1. 0. 0. 0. 0. 0. 0. 0. 14.
3 3. 2. 0. 1. 0. 0. 0. 1. 0. 0. 0. 1.
4 33. 54. 2. 6. 2. 0. 0. 2. 0. 0. 3. 15.
5 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
6 19. 2. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
7 47. 9. 0. 1. 0. 0. 0. 0. 0. 0. 0. 4.
8 109. 406. 75. 122. 39. 1. 19. 69. 9. 3. 64. 110.
9 41. 129. 6. 29. 8. 0. 1. 7. 0. 0. 3. 24.
10 13. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
# ... with 807 more rows, and 7 more variables: IBB <dbl>, HBP <dbl>,
# SH <dbl>, SF <dbl>, GIDP <dbl>, years_MLB <dbl>, salary <dbl>
predict.regsubsets = function(object, newdata, id, ...) {
form = as.formula(object$call[[2]])
mat = model.matrix(form, newdata)
coefi = coef(object, id = id)
mat[, names(coefi)] %*% coefi
}
## 使用 forward selection 針對 salary 做預測
set.seed(11)
folds = sample(rep(1:10, length = nrow(tbl_baseball)))
cv.errors = matrix(NA, 10, 18)
for (k in 1:10) {
best.fit = regsubsets(salary ~ ., data = tbl_baseball[folds != k, ], nvmax = 18,
method = "forward")
for (i in 1:18) {
pred = predict(best.fit, tbl_baseball[folds == k, ], id = i)
cv.errors[k, i] = mean((tbl_baseball$salary[folds == k] - pred)^2)
}
}
rmse.cv = sqrt(apply(cv.errors, 2, mean))
plot(rmse.cv, pch = 18, type = "b")
有由此圖可得當 P=6,時有最好的結果