The practical examples below demonstrate the following.
Models: linear regression, ridge regression, the lasso, PCR, PLS.
Data: Hitters {ISLR}
For detailed commentary on the lab code, see [1], Chapter 6.
library('ISLR')   # the Hitters data set
library('leaps')  # regsubsets() -- best subset selection
library('glmnet') # glmnet() -- ridge regression and the lasso
library('pls')    # pcr() -- principal components regression,
                  # plsr() -- partial least squares
my.seed <- 1
The Hitters data set: salaries of baseball players.
?Hitters
names(Hitters)
## [1] "AtBat" "Hits" "HmRun" "Runs" "RBI" "Walks"
## [7] "Years" "CAtBat" "CHits" "CHmRun" "CRuns" "CRBI"
## [13] "CWalks" "League" "Division" "PutOuts" "Assists" "Errors"
## [19] "Salary" "NewLeague"
dim(Hitters)
## [1] 322 20
Count the missing values in the dependent variable and drop them.
# count the missing values
sum(is.na(Hitters$Salary))
## [1] 59
# drop the missing values
Hitters <- na.omit(Hitters)
# check the result
dim(Hitters)
## [1] 263 20
sum(is.na(Hitters$Salary))
## [1] 0
# fit models with predictor subsets of up to 8 variables (the default)
regfit.full <- regsubsets(Salary ~ ., Hitters)
summary(regfit.full)
## Subset selection object
## Call: regsubsets.formula(Salary ~ ., Hitters)
## 19 Variables (and intercept)
## Forced in Forced out
## AtBat FALSE FALSE
## Hits FALSE FALSE
## HmRun FALSE FALSE
## Runs FALSE FALSE
## RBI FALSE FALSE
## Walks FALSE FALSE
## Years FALSE FALSE
## CAtBat FALSE FALSE
## CHits FALSE FALSE
## CHmRun FALSE FALSE
## CRuns FALSE FALSE
## CRBI FALSE FALSE
## CWalks FALSE FALSE
## LeagueN FALSE FALSE
## DivisionW FALSE FALSE
## PutOuts FALSE FALSE
## Assists FALSE FALSE
## Errors FALSE FALSE
## NewLeagueN FALSE FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: exhaustive
## AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns CRBI
## 1 ( 1 ) " " " " " " " " " " " " " " " " " " " " " " "*"
## 2 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*"
## 3 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*"
## 4 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*"
## 5 ( 1 ) "*" "*" " " " " " " " " " " " " " " " " " " "*"
## 6 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " " " "*"
## 7 ( 1 ) " " "*" " " " " " " "*" " " "*" "*" "*" " " " "
## 8 ( 1 ) "*" "*" " " " " " " "*" " " " " " " "*" "*" " "
## CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " "
## 3 ( 1 ) " " " " " " "*" " " " " " "
## 4 ( 1 ) " " " " "*" "*" " " " " " "
## 5 ( 1 ) " " " " "*" "*" " " " " " "
## 6 ( 1 ) " " " " "*" "*" " " " " " "
## 7 ( 1 ) " " " " "*" "*" " " " " " "
## 8 ( 1 ) "*" " " "*" "*" " " " " " "
# fit models with predictor subsets of up to 19 variables (all available)
regfit.full <- regsubsets(Salary ~ ., Hitters, nvmax = 19)
reg.summary <- summary(regfit.full)
reg.summary
## Subset selection object
## Call: regsubsets.formula(Salary ~ ., Hitters, nvmax = 19)
## 19 Variables (and intercept)
## Forced in Forced out
## AtBat FALSE FALSE
## Hits FALSE FALSE
## HmRun FALSE FALSE
## Runs FALSE FALSE
## RBI FALSE FALSE
## Walks FALSE FALSE
## Years FALSE FALSE
## CAtBat FALSE FALSE
## CHits FALSE FALSE
## CHmRun FALSE FALSE
## CRuns FALSE FALSE
## CRBI FALSE FALSE
## CWalks FALSE FALSE
## LeagueN FALSE FALSE
## DivisionW FALSE FALSE
## PutOuts FALSE FALSE
## Assists FALSE FALSE
## Errors FALSE FALSE
## NewLeagueN FALSE FALSE
## 1 subsets of each size up to 19
## Selection Algorithm: exhaustive
## AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns CRBI
## 1 ( 1 ) " " " " " " " " " " " " " " " " " " " " " " "*"
## 2 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*"
## 3 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*"
## 4 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*"
## 5 ( 1 ) "*" "*" " " " " " " " " " " " " " " " " " " "*"
## 6 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " " " "*"
## 7 ( 1 ) " " "*" " " " " " " "*" " " "*" "*" "*" " " " "
## 8 ( 1 ) "*" "*" " " " " " " "*" " " " " " " "*" "*" " "
## 9 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*" "*"
## 10 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*" "*"
## 11 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*" "*"
## 12 ( 1 ) "*" "*" " " "*" " " "*" " " "*" " " " " "*" "*"
## 13 ( 1 ) "*" "*" " " "*" " " "*" " " "*" " " " " "*" "*"
## 14 ( 1 ) "*" "*" "*" "*" " " "*" " " "*" " " " " "*" "*"
## 15 ( 1 ) "*" "*" "*" "*" " " "*" " " "*" "*" " " "*" "*"
## 16 ( 1 ) "*" "*" "*" "*" "*" "*" " " "*" "*" " " "*" "*"
## 17 ( 1 ) "*" "*" "*" "*" "*" "*" " " "*" "*" " " "*" "*"
## 18 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" " " "*" "*"
## 19 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
## CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " "
## 3 ( 1 ) " " " " " " "*" " " " " " "
## 4 ( 1 ) " " " " "*" "*" " " " " " "
## 5 ( 1 ) " " " " "*" "*" " " " " " "
## 6 ( 1 ) " " " " "*" "*" " " " " " "
## 7 ( 1 ) " " " " "*" "*" " " " " " "
## 8 ( 1 ) "*" " " "*" "*" " " " " " "
## 9 ( 1 ) "*" " " "*" "*" " " " " " "
## 10 ( 1 ) "*" " " "*" "*" "*" " " " "
## 11 ( 1 ) "*" "*" "*" "*" "*" " " " "
## 12 ( 1 ) "*" "*" "*" "*" "*" " " " "
## 13 ( 1 ) "*" "*" "*" "*" "*" "*" " "
## 14 ( 1 ) "*" "*" "*" "*" "*" "*" " "
## 15 ( 1 ) "*" "*" "*" "*" "*" "*" " "
## 16 ( 1 ) "*" "*" "*" "*" "*" "*" " "
## 17 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## 18 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## 19 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
# structure of the model report (looking for goodness-of-fit measures)
names(reg.summary)
## [1] "which" "rsq" "rss" "adjr2" "cp" "bic" "outmat" "obj"
# R^2 and adjusted R^2
round(reg.summary$rsq, 3)
## [1] 0.321 0.425 0.451 0.475 0.491 0.509 0.514 0.529 0.535 0.540 0.543 0.544
## [13] 0.544 0.545 0.545 0.546 0.546 0.546 0.546
# plot R^2
plot(1:19, reg.summary$rsq, type = 'b',
     xlab = 'Number of predictors', ylab = 'R-squared')
# add adjusted R^2 to the same plot
points(1:19, reg.summary$adjr2, col = 'red')
# model with the largest adjusted R^2
which.max(reg.summary$adjr2)
## [1] 11
### 11
points(which.max(reg.summary$adjr2),
reg.summary$adjr2[which.max(reg.summary$adjr2)],
col = 'red', cex = 2, pch = 20)
legend('bottomright', legend = c('R^2', 'R^2_adj'),
col = c('black', 'red'), lty = c(1, NA),
pch = c(1, 1))
# C_p
reg.summary$cp
## [1] 104.281319 50.723090 38.693127 27.856220 21.613011 14.023870
## [7] 13.128474 7.400719 6.158685 5.009317 5.874113 7.330766
## [13] 8.888112 10.481576 12.346193 14.187546 16.087831 18.011425
## [19] 20.000000
# number of predictors at the optimal criterion value
which.min(reg.summary$cp)
## [1] 10
### 10
# plot
plot(reg.summary$cp, xlab = 'Number of predictors',
     ylab = 'C_p', type = 'b')
points(which.min(reg.summary$cp),
reg.summary$cp[which.min(reg.summary$cp)],
col = 'red', cex = 2, pch = 20)
# BIC
reg.summary$bic
## [1] -90.84637 -128.92622 -135.62693 -141.80892 -144.07143 -147.91690
## [7] -145.25594 -147.61525 -145.44316 -143.21651 -138.86077 -133.87283
## [13] -128.77759 -123.64420 -118.21832 -112.81768 -107.35339 -101.86391
## [19] -96.30412
# number of predictors at the optimal criterion value
which.min(reg.summary$bic)
## [1] 6
### 6
# plot
plot(reg.summary$bic, xlab = 'Number of predictors',
     ylab = 'BIC', type = 'b')
points(which.min(reg.summary$bic),
reg.summary$bic[which.min(reg.summary$bic)],
col = 'red', cex = 2, pch = 20)
# built-in plot method for visualising the results
?plot.regsubsets
plot(regfit.full, scale = 'r2')
plot(regfit.full, scale = 'adjr2')
plot(regfit.full, scale = 'Cp')
plot(regfit.full, scale = 'bic')
# coefficients of the model with the lowest BIC
round(coef(regfit.full, 6), 3)
## (Intercept) AtBat Hits Walks CRBI DivisionW
## 91.512 -1.869 7.604 3.698 0.643 -122.952
## PutOuts
## 0.264
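The model size need not be hardcoded: the same coefficients can be pulled straight from the criterion. A minimal equivalent sketch:
# same model, with the size taken from the BIC minimum
round(coef(regfit.full, which.min(reg.summary$bic)), 3)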
regfit.fwd <- regsubsets(Salary ~ ., data = Hitters,
nvmax = 19, method = 'forward')
summary(regfit.fwd)
## Subset selection object
## Call: regsubsets.formula(Salary ~ ., data = Hitters, nvmax = 19, method = "forward")
## 19 Variables (and intercept)
## Forced in Forced out
## AtBat FALSE FALSE
## Hits FALSE FALSE
## HmRun FALSE FALSE
## Runs FALSE FALSE
## RBI FALSE FALSE
## Walks FALSE FALSE
## Years FALSE FALSE
## CAtBat FALSE FALSE
## CHits FALSE FALSE
## CHmRun FALSE FALSE
## CRuns FALSE FALSE
## CRBI FALSE FALSE
## CWalks FALSE FALSE
## LeagueN FALSE FALSE
## DivisionW FALSE FALSE
## PutOuts FALSE FALSE
## Assists FALSE FALSE
## Errors FALSE FALSE
## NewLeagueN FALSE FALSE
## 1 subsets of each size up to 19
## Selection Algorithm: forward
## AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns CRBI
## 1 ( 1 ) " " " " " " " " " " " " " " " " " " " " " " "*"
## 2 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*"
## 3 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*"
## 4 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*"
## 5 ( 1 ) "*" "*" " " " " " " " " " " " " " " " " " " "*"
## 6 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " " " "*"
## 7 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " " " "*"
## 8 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " "*" "*"
## 9 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*" "*"
## 10 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*" "*"
## 11 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*" "*"
## 12 ( 1 ) "*" "*" " " "*" " " "*" " " "*" " " " " "*" "*"
## 13 ( 1 ) "*" "*" " " "*" " " "*" " " "*" " " " " "*" "*"
## 14 ( 1 ) "*" "*" "*" "*" " " "*" " " "*" " " " " "*" "*"
## 15 ( 1 ) "*" "*" "*" "*" " " "*" " " "*" "*" " " "*" "*"
## 16 ( 1 ) "*" "*" "*" "*" "*" "*" " " "*" "*" " " "*" "*"
## 17 ( 1 ) "*" "*" "*" "*" "*" "*" " " "*" "*" " " "*" "*"
## 18 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" " " "*" "*"
## 19 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
## CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " "
## 3 ( 1 ) " " " " " " "*" " " " " " "
## 4 ( 1 ) " " " " "*" "*" " " " " " "
## 5 ( 1 ) " " " " "*" "*" " " " " " "
## 6 ( 1 ) " " " " "*" "*" " " " " " "
## 7 ( 1 ) "*" " " "*" "*" " " " " " "
## 8 ( 1 ) "*" " " "*" "*" " " " " " "
## 9 ( 1 ) "*" " " "*" "*" " " " " " "
## 10 ( 1 ) "*" " " "*" "*" "*" " " " "
## 11 ( 1 ) "*" "*" "*" "*" "*" " " " "
## 12 ( 1 ) "*" "*" "*" "*" "*" " " " "
## 13 ( 1 ) "*" "*" "*" "*" "*" "*" " "
## 14 ( 1 ) "*" "*" "*" "*" "*" "*" " "
## 15 ( 1 ) "*" "*" "*" "*" "*" "*" " "
## 16 ( 1 ) "*" "*" "*" "*" "*" "*" " "
## 17 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## 18 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## 19 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
regfit.bwd <- regsubsets(Salary ~ ., data = Hitters,
nvmax = 19, method = 'backward')
summary(regfit.bwd)
## Subset selection object
## Call: regsubsets.formula(Salary ~ ., data = Hitters, nvmax = 19, method = "backward")
## 19 Variables (and intercept)
## Forced in Forced out
## AtBat FALSE FALSE
## Hits FALSE FALSE
## HmRun FALSE FALSE
## Runs FALSE FALSE
## RBI FALSE FALSE
## Walks FALSE FALSE
## Years FALSE FALSE
## CAtBat FALSE FALSE
## CHits FALSE FALSE
## CHmRun FALSE FALSE
## CRuns FALSE FALSE
## CRBI FALSE FALSE
## CWalks FALSE FALSE
## LeagueN FALSE FALSE
## DivisionW FALSE FALSE
## PutOuts FALSE FALSE
## Assists FALSE FALSE
## Errors FALSE FALSE
## NewLeagueN FALSE FALSE
## 1 subsets of each size up to 19
## Selection Algorithm: backward
## AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns CRBI
## 1 ( 1 ) " " " " " " " " " " " " " " " " " " " " "*" " "
## 2 ( 1 ) " " "*" " " " " " " " " " " " " " " " " "*" " "
## 3 ( 1 ) " " "*" " " " " " " " " " " " " " " " " "*" " "
## 4 ( 1 ) "*" "*" " " " " " " " " " " " " " " " " "*" " "
## 5 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " "*" " "
## 6 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " "*" " "
## 7 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " "*" " "
## 8 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " "*" "*"
## 9 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*" "*"
## 10 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*" "*"
## 11 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*" "*"
## 12 ( 1 ) "*" "*" " " "*" " " "*" " " "*" " " " " "*" "*"
## 13 ( 1 ) "*" "*" " " "*" " " "*" " " "*" " " " " "*" "*"
## 14 ( 1 ) "*" "*" "*" "*" " " "*" " " "*" " " " " "*" "*"
## 15 ( 1 ) "*" "*" "*" "*" " " "*" " " "*" "*" " " "*" "*"
## 16 ( 1 ) "*" "*" "*" "*" "*" "*" " " "*" "*" " " "*" "*"
## 17 ( 1 ) "*" "*" "*" "*" "*" "*" " " "*" "*" " " "*" "*"
## 18 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" " " "*" "*"
## 19 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
## CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " "
## 3 ( 1 ) " " " " " " "*" " " " " " "
## 4 ( 1 ) " " " " " " "*" " " " " " "
## 5 ( 1 ) " " " " " " "*" " " " " " "
## 6 ( 1 ) " " " " "*" "*" " " " " " "
## 7 ( 1 ) "*" " " "*" "*" " " " " " "
## 8 ( 1 ) "*" " " "*" "*" " " " " " "
## 9 ( 1 ) "*" " " "*" "*" " " " " " "
## 10 ( 1 ) "*" " " "*" "*" "*" " " " "
## 11 ( 1 ) "*" "*" "*" "*" "*" " " " "
## 12 ( 1 ) "*" "*" "*" "*" "*" " " " "
## 13 ( 1 ) "*" "*" "*" "*" "*" "*" " "
## 14 ( 1 ) "*" "*" "*" "*" "*" "*" " "
## 15 ( 1 ) "*" "*" "*" "*" "*" "*" " "
## 16 ( 1 ) "*" "*" "*" "*" "*" "*" " "
## 17 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## 18 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## 19 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
# compare the 7-variable models chosen by exhaustive, forward and backward selection
round(coef(regfit.full, 7), 3)
## (Intercept) Hits Walks CAtBat CHits CHmRun
## 79.451 1.283 3.227 -0.375 1.496 1.442
## DivisionW PutOuts
## -129.987 0.237
round(coef(regfit.fwd, 7), 3)
## (Intercept) AtBat Hits Walks CRBI CWalks
## 109.787 -1.959 7.450 4.913 0.854 -0.305
## DivisionW PutOuts
## -127.122 0.253
round(coef(regfit.bwd, 7), 3)
## (Intercept) AtBat Hits Walks CRuns CWalks
## 105.649 -1.976 6.757 6.056 1.129 -0.716
## DivisionW PutOuts
## -116.169 0.303
# validation-set split
set.seed(my.seed)
train <- sample(c(T, F), nrow(Hitters), rep = T)
test <- !train
# fit the models on the training set
regfit.best <- regsubsets(Salary ~ ., data = Hitters[train, ],
nvmax = 19)
# model matrix of explanatory variables for the test set
test.mat <- model.matrix(Salary ~ ., data = Hitters[test, ])
# vector of errors
val.errors <- rep(NA, 19)
# loop over the number of predictors
for (i in 1:19){
coefi <- coef(regfit.best, id = i)
pred <- test.mat[, names(coefi)] %*% coefi
# store the test-set MSE in the vector
val.errors[i] <- mean((Hitters$Salary[test] - pred)^2)
}
round(val.errors, 0)
## [1] 164377 144405 152176 145198 137902 139176 126849 136191 132890 135435
## [11] 136963 140695 140691 141951 141508 142164 141767 142340 142238
# find the number of predictors in the optimal model
which.min(val.errors)
## [1] 7
### 10
# note: with this seed the minimum is at 7 predictors; the annotated
# answer of 10 follows the source lab in [1] and is used below
# coefficients of the optimal model
round(coef(regfit.best, 10), 3)
## (Intercept) AtBat Hits HmRun Walks CAtBat
## 71.807 -1.504 5.913 -11.524 8.435 -0.165
## CRuns CRBI CWalks DivisionW PutOuts
## 1.706 0.790 -0.911 -109.562 0.243
# predict method for objects returned by regsubsets()
predict.regsubsets <- function(object, newdata, id, ...){
form <- as.formula(object$call[[2]])
mat <- model.matrix(form, newdata)
coefi <- coef(object, id = id)
xvars <- names(coefi)
mat[, xvars] %*% coefi
}
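Since the name follows R's S3 convention (predict.regsubsets), a plain predict() call now dispatches to this function for regsubsets objects; this is what the cross-validation loop below relies on. A minimal usage sketch on the validation split above (the choice of id = 7 is only an illustration):
# predictions of the 7-variable model for the test observations
head(predict(regfit.best, Hitters[test, ], id = 7))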
# best subset selection with the optimal number of variables on the full data set
regfit.best <- regsubsets(Salary ~ ., data = Hitters,
nvmax = 19)
round(coef(regfit.best, 10), 3)
## (Intercept) AtBat Hits Walks CAtBat CRuns
## 162.535 -2.169 6.918 5.773 -0.130 1.408
## CRBI CWalks DivisionW PutOuts Assists
## 0.774 -0.831 -112.380 0.297 0.283
# split the observations into 10 folds
k <- 10
set.seed(my.seed)
folds <- sample(1:k, nrow(Hitters), replace = T)
# placeholder matrix for the errors
cv.errors <- matrix(NA, k, 19, dimnames = list(NULL, paste(1:19)))
# fill the matrix, looping over the folds
for (j in 1:k){
best.fit <- regsubsets(Salary ~ ., data = Hitters[folds != j, ],
nvmax = 19)
# inner loop over the number of explanatory variables
for (i in 1:19){
# fitted values of Salary
pred <- predict(best.fit, Hitters[folds == j, ], id = i)
# write the error into the matrix
cv.errors[j, i] <- mean((Hitters$Salary[folds == j] - pred)^2)
}
}
# average the matrix over each column (i.e. over the folds)
# to get a CV estimate of the MSE for each model with a fixed
# number of explanatory variables
mean.cv.errors <- apply(cv.errors, 2, mean)
round(mean.cv.errors, 0)
## 1 2 3 4 5 6 7 8 9 10 11
## 149821 130922 139127 131029 131050 119539 124286 113580 115556 112217 113251
## 12 13 14 15 16 17 18 19
## 115756 117821 119481 120122 120074 120085 120086 120404
# plot
plot(mean.cv.errors, type = 'b')
points(which.min(mean.cv.errors), mean.cv.errors[which.min(mean.cv.errors)],
col = 'red', pch = 20, cex = 2)
# refit the model with 11 explanatory variables on the full data set
reg.best <- regsubsets(Salary ~ ., data = Hitters, nvmax = 19)
round(coef(reg.best, 11), 3)
## (Intercept) AtBat Hits Walks CAtBat CRuns
## 135.751 -2.128 6.924 5.620 -0.139 1.455
## CRBI CWalks LeagueN DivisionW PutOuts Assists
## 0.785 -0.823 43.112 -111.146 0.289 0.269
# glmnet()'s syntax requires an explicit matrix of explanatory variables...
x <- model.matrix(Salary ~ ., Hitters)[, -1]
# ...and a vector of values of the dependent variable
y <- Hitters$Salary
# grid of values for the hyperparameter lambda
grid <- 10^seq(10, -2, length = 100)
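The grid runs from a very large penalty, where all coefficients are shrunk towards zero, down to a nearly unpenalised fit; a quick check of the endpoints, as a sketch:
# endpoints of the lambda grid: 1e-02 (near-OLS) to 1e+10 (near-null model)
range(grid)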
# fit a sequence of ridge regressions
ridge.mod <- glmnet(x, y, alpha = 0, lambda = grid)
# dimensions of the matrix of model coefficients
dim(coef(ridge.mod))
## [1] 20 100
# lambda value number 50
round(ridge.mod$lambda[50], 0)
## [1] 11498
# coefficients of the corresponding model
round(coef(ridge.mod)[, 50], 3)
## (Intercept) AtBat Hits HmRun Runs RBI
## 407.356 0.037 0.138 0.525 0.231 0.240
## Walks Years CAtBat CHits CHmRun CRuns
## 0.290 1.108 0.003 0.012 0.088 0.023
## CRBI CWalks LeagueN DivisionW PutOuts Assists
## 0.024 0.025 0.085 -6.215 0.016 0.003
## Errors NewLeagueN
## -0.021 0.301
# l2 norm
round(sqrt(sum(coef(ridge.mod)[-1, 50]^2)), 2)
## [1] 6.36
# the same for lambda number 60
round(ridge.mod$lambda[60], 0)
## [1] 705
# coefficients of the corresponding model
round(coef(ridge.mod)[, 60], 3)
## (Intercept) AtBat Hits HmRun Runs RBI
## 54.325 0.112 0.656 1.180 0.938 0.847
## Walks Years CAtBat CHits CHmRun CRuns
## 1.320 2.596 0.011 0.047 0.338 0.094
## CRBI CWalks LeagueN DivisionW PutOuts Assists
## 0.098 0.072 13.684 -54.659 0.119 0.016
## Errors NewLeagueN
## -0.704 8.612
# l2 norm
round(sqrt(sum(coef(ridge.mod)[-1, 60]^2)), 1)
## [1] 57.1
# coefficients can also be computed for a new value of lambda
round(predict(ridge.mod, s = 50, type = 'coefficients')[1:20, ], 3)
## (Intercept) AtBat Hits HmRun Runs RBI
## 48.766 -0.358 1.969 -1.278 1.146 0.804
## Walks Years CAtBat CHits CHmRun CRuns
## 2.716 -6.218 0.005 0.106 0.624 0.221
## CRBI CWalks LeagueN DivisionW PutOuts Assists
## 0.219 -0.150 45.926 -118.201 0.250 0.122
## Errors NewLeagueN
## -3.279 -9.497
# new train/test split: half of the observations go to training
set.seed(my.seed)
train <- sample(1:nrow(x), nrow(x)/2)
test <- -train
y.test <- y[test]
# fit ridge models with higher precision (thresh below the default)
ridge.mod <- glmnet(x[train, ], y[train], alpha = 0, lambda = grid,
thresh = 1e-12)
plot(ridge.mod)
# predictions from the model with lambda = 4
ridge.pred <- predict(ridge.mod, s = 4, newx = x[test, ])
round(mean((ridge.pred - y.test)^2), 0)
## [1] 142199
# compare with the MSE of the null model (prediction = the mean)
round(mean((mean(y[train]) - y.test)^2), 0)
## [1] 224670
# how the lambda = 4 model differs from ordinary least squares (OLS)
ridge.pred <- predict(ridge.mod, s = 0, newx = x[test, ], exact = T,
x = x[train, ], y = y[train])
round(mean((ridge.pred - y.test)^2), 0)
## [1] 168589
# predict() with lambda (s) = 0 reproduces the OLS model
lm(y ~ x, subset = train)
##
## Call:
## lm(formula = y ~ x, subset = train)
##
## Coefficients:
## (Intercept) xAtBat xHits xHmRun xRuns xRBI
## 274.0145 -0.3521 -1.6377 5.8145 1.5424 1.1243
## xWalks xYears xCAtBat xCHits xCHmRun xCRuns
## 3.7287 -16.3773 -0.6412 3.1632 3.4008 -0.9739
## xCRBI xCWalks xLeagueN xDivisionW xPutOuts xAssists
## -0.6005 0.3379 119.1486 -144.0831 0.1976 0.6804
## xErrors xNewLeagueN
## -4.7128 -71.0951
round(predict(ridge.mod, s = 0, exact = T, type = 'coefficients',
x = x[train, ], y = y[train])[1:20, ], 3)
## (Intercept) AtBat Hits HmRun Runs RBI
## 274.020 -0.352 -1.637 5.815 1.542 1.124
## Walks Years CAtBat CHits CHmRun CRuns
## 3.729 -16.380 -0.641 3.163 3.401 -0.974
## CRBI CWalks LeagueN DivisionW PutOuts Assists
## -0.600 0.338 119.143 -144.085 0.198 0.680
## Errors NewLeagueN
## -4.713 -71.090
# k-fold cross-validation
set.seed(my.seed)
# estimate the error
cv.out <- cv.glmnet(x[train, ], y[train], alpha = 0)
plot(cv.out)
# lambda value that minimises the cross-validation error
bestlam <- cv.out$lambda.min
round(bestlam, 0)
## [1] 326
# test-set MSE for this value of lambda
ridge.pred <- predict(ridge.mod, s = bestlam, newx = x[test, ])
round(mean((ridge.pred - y.test)^2), 0)
## [1] 139857
# finally, fit the model on the full data for the optimal lambda
# found by cross-validation
out <- glmnet(x, y, alpha = 0)
round(predict(out, type = 'coefficients', s = bestlam)[1:20, ], 3)
## (Intercept) AtBat Hits HmRun Runs RBI
## 15.444 0.077 0.859 0.601 1.064 0.879
## Walks Years CAtBat CHits CHmRun CRuns
## 1.624 1.353 0.011 0.057 0.407 0.115
## CRBI CWalks LeagueN DivisionW PutOuts Assists
## 0.121 0.053 22.091 -79.040 0.166 0.029
## Errors NewLeagueN
## -1.361 9.125
lasso.mod <- glmnet(x[train, ], y[train], alpha = 1, lambda = grid)
plot(lasso.mod)
## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values
set.seed(my.seed)
cv.out <- cv.glmnet(x[train, ], y[train], alpha = 1)
plot(cv.out)
bestlam <- cv.out$lambda.min
lasso.pred <- predict(lasso.mod, s = bestlam, newx = x[test, ])
round(mean((lasso.pred - y.test)^2), 0)
## [1] 143674
# coefficients of the best model
out <- glmnet(x, y, alpha = 1, lambda = grid)
lasso.coef <- predict(out, type = 'coefficients',
s = bestlam)[1:20, ]
round(lasso.coef, 3)
## (Intercept) AtBat Hits HmRun Runs RBI
## 1.275 -0.055 2.180 0.000 0.000 0.000
## Walks Years CAtBat CHits CHmRun CRuns
## 2.292 -0.338 0.000 0.000 0.028 0.216
## CRBI CWalks LeagueN DivisionW PutOuts Assists
## 0.417 0.000 20.286 -116.168 0.238 0.000
## Errors NewLeagueN
## -0.856 0.000
round(lasso.coef[lasso.coef != 0], 3)
## (Intercept) AtBat Hits Walks Years CHmRun
## 1.275 -0.055 2.180 2.292 -0.338 0.028
## CRuns CRBI LeagueN DivisionW PutOuts Errors
## 0.216 0.417 20.286 -116.168 0.238 -0.856
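Unlike ridge regression, the lasso has driven some of the coefficients exactly to zero; a quick count of the retained predictors, as a sketch:
# number of predictors kept by the lasso (intercept excluded)
sum(lasso.coef[-1] != 0)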
# cross-validation
set.seed(2) # unclear why the ISLR authors switch the seed here; likely a typo
pcr.fit <- pcr(Salary ~ ., data = Hitters, scale = T, validation = 'CV')
summary(pcr.fit)
## Data: X dimension: 263 19
## Y dimension: 263 1
## Fit method: svdpc
## Number of components considered: 19
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 452 351.9 353.2 355.0 352.8 348.4 343.6
## adjCV 452 351.6 352.7 354.4 352.1 347.6 342.7
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 345.5 347.7 349.6 351.4 352.1 353.5 358.2
## adjCV 344.7 346.7 348.5 350.1 350.7 352.0 356.5
## 14 comps 15 comps 16 comps 17 comps 18 comps 19 comps
## CV 349.7 349.4 339.9 341.6 339.2 339.6
## adjCV 348.0 347.7 338.2 339.7 337.2 337.6
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 38.31 60.16 70.84 79.03 84.29 88.63 92.26 94.96
## Salary 40.63 41.58 42.17 43.22 44.90 46.48 46.69 46.75
## 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps 15 comps
## X 96.28 97.26 97.98 98.65 99.15 99.47 99.75
## Salary 46.86 47.76 47.82 47.85 48.10 50.40 50.55
## 16 comps 17 comps 18 comps 19 comps
## X 99.89 99.97 99.99 100.00
## Salary 53.01 53.85 54.61 54.61
# plot of the errors
validationplot(pcr.fit, val.type = 'MSEP')
set.seed(my.seed)
pcr.fit <- pcr(Salary ~ ., data = Hitters, subset = train, scale = T,
validation = 'CV')
validationplot(pcr.fit, val.type = 'MSEP')
# test-set MSE
pcr.pred <- predict(pcr.fit, x[test, ], ncomp = 7)
round(mean((pcr.pred - y.test)^2), 0)
## [1] 140751
# fit the model on the full sample with M = 7
# (optimal according to cross-validation)
pcr.fit <- pcr(y ~ x, scale = T, ncomp = 7)
summary(pcr.fit)
## Data: X dimension: 263 19
## Y dimension: 263 1
## Fit method: svdpc
## Number of components considered: 7
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps
## X 38.31 60.16 70.84 79.03 84.29 88.63 92.26
## y 40.63 41.58 42.17 43.22 44.90 46.48 46.69
set.seed(my.seed)
pls.fit <- plsr(Salary ~ ., data = Hitters, subset = train, scale = T,
validation = 'CV')
summary(pls.fit)
## Data: X dimension: 131 19
## Y dimension: 131 1
## Fit method: kernelpls
## Number of components considered: 19
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 428.3 325.5 329.9 328.8 339.0 338.9 340.1
## adjCV 428.3 325.0 328.2 327.2 336.6 336.1 336.6
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 339.0 347.1 346.4 343.4 341.5 345.4 356.4
## adjCV 336.2 343.4 342.8 340.2 338.3 341.8 351.1
## 14 comps 15 comps 16 comps 17 comps 18 comps 19 comps
## CV 348.4 349.1 350.0 344.2 344.5 345.0
## adjCV 344.2 345.0 345.9 340.4 340.6 341.1
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 39.13 48.80 60.09 75.07 78.58 81.12 88.21 90.71
## Salary 46.36 50.72 52.23 53.03 54.07 54.77 55.05 55.66
## 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps 15 comps
## X 93.17 96.05 97.08 97.61 97.97 98.70 99.12
## Salary 55.95 56.12 56.47 56.68 57.37 57.76 58.08
## 16 comps 17 comps 18 comps 19 comps
## X 99.61 99.70 99.95 100.00
## Salary 58.17 58.49 58.56 58.62
# now fit the model for the optimal M = 2 found above
# and estimate the MSE on the test set
pls.pred <- predict(pls.fit, x[test, ], ncomp = 2)
round(mean((pls.pred - y.test)^2), 0)
## [1] 145368
# fit the model on the full sample
pls.fit <- plsr(Salary ~ ., data = Hitters, scale = T, ncomp = 2)
summary(pls.fit)
## Data: X dimension: 263 19
## Y dimension: 263 1
## Fit method: kernelpls
## Number of components considered: 2
## TRAINING: % variance explained
## 1 comps 2 comps
## X 38.08 51.03
## Salary 43.05 46.40
1 Apply the method specified for your variant to the corresponding data set (see the table below). Remember to convert the categorical variables to factors first. Choose the optimal model by cross-validation. Print its coefficients with coef(). Compute the model's MSE on the test set.
2 Apply the method specified for your variant to the corresponding data set (see the table below). For this model:
- Fit the model on the whole sample and compute its cross-validated error (MSE). Using the smallest MSE, choose the optimal value of the method's tuning parameter (the hyperparameter λ or the number of principal components M).
- Fit the model with the optimal parameter value on the training set and compute the MSE on the test set.
- Fit the model with the optimal parameter value on all the data and report the model characteristics with summary().
3 Compare the optimal models obtained in tasks 1 and 2 by their MSE on the test set. Which method gives the better result? Test-set share: 50%.
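A minimal workflow template for the assignment, shown for variant 1 (Boston {MASS}, ridge regression, 50% test share); the data set, alpha and the seed here are assumptions for illustration only:
library('MASS')   # Boston data set (variant 1 assumed)
library('glmnet')
set.seed(1)                                # assumed seed
x <- model.matrix(crim ~ ., Boston)[, -1]  # matrix of explanatory variables
y <- Boston$crim                           # dependent variable
train <- sample(1:nrow(x), nrow(x)/2)      # 50% of observations for training
test <- -train
cv.out <- cv.glmnet(x[train, ], y[train], alpha = 0)  # CV over lambda (alpha = 0: ridge)
bestlam <- cv.out$lambda.min               # optimal lambda
fit <- glmnet(x[train, ], y[train], alpha = 0)
pred <- predict(fit, s = bestlam, newx = x[test, ])
round(mean((pred - y[test])^2), 0)         # test-set MSE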
How to submit: e-mail the instructor links to:
an html report with visible code blocks (code chunks with echo = T), published on rpubs.com;
the code that generates the report, in a repository on github.com.
Include the problem statement and the answers to the task questions in the report. Begin the report with a description of the data set: the type and meaning of the variables and the number of observations.
Variant | Data | Dependent variable | Explanatory variables | Method for task 1 | Method for task 2
1 | Boston {MASS} | crim | all others | best subset selection | ridge regression
2 | Boston {MASS} | crim | all others | backward stepwise selection | partial least squares
3 | Boston {MASS} | crim | all others | forward stepwise selection | lasso regression
4 | Auto {ISLR} | mpg | all others except name | best subset selection | principal components regression
5 | Auto {ISLR} | mpg | all others except name | backward stepwise selection | ridge regression
6 | Auto {ISLR} | mpg | all others except name | forward stepwise selection | partial least squares
7 | Carseats {ISLR} | Sales | all others | best subset selection | lasso regression
8 | Carseats {ISLR} | Sales | all others | backward stepwise selection | principal components regression
9 | Carseats {ISLR} | Sales | all others | forward stepwise selection | ridge regression
10 | College {ISLR} | Accept | all others | best subset selection | partial least squares
11 | College {ISLR} | Accept | Private, Apps, Enroll, Top10perc, Outstate, Room.Board, Personal, Terminal, perc.alumni, Grad.Rate | backward stepwise selection | lasso regression
12 | College {ISLR} | Accept | Private, Apps, Enroll, Top10perc, Outstate, Room.Board, Personal, Terminal, perc.alumni, Grad.Rate | forward stepwise selection | principal components regression
13 | Prestige {car} | prestige | all others | best subset selection | ridge regression
14 | Prestige {car} | prestige | all others | backward stepwise selection | partial least squares
15 | Prestige {car} | prestige | all others | forward stepwise selection | lasso regression
16 | College {ISLR} | Accept | Private, Apps, F.Undergrad, Top25perc, P.Undergrad, Books, PhD, S.F.Ratio, Expend | best subset selection | principal components regression
17 | College {ISLR} | Accept | Private, Apps, F.Undergrad, Top25perc, P.Undergrad, Books, PhD, S.F.Ratio, Expend | backward stepwise selection | ridge regression
18 | College {ISLR} | Accept | Private, Apps, F.Undergrad, Top25perc, P.Undergrad, Books, PhD, S.F.Ratio, Expend | forward stepwise selection | partial least squares
References