Models: linear regression, lasso.
Data: Boston {MASS}, housing-value statistics for the suburbs of Boston.
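Before modelling, it is worth glancing at the structure of the data; a minimal sketch (not part of the original output):
# 506 observations of 14 variables; the response modelled below is crim
library('MASS')
dim(Boston)
head(Boston[, c('crim', 'rad', 'lstat', 'black')])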
library('leaps')   # regsubsets() -- optimal subset selection
## Warning: package 'leaps' was built under R version 3.5.3
library('glmnet')  # glmnet() -- lasso
## Warning: package 'glmnet' was built under R version 3.5.3
## Loading required package: Matrix
## Loading required package: foreach
## Warning: package 'foreach' was built under R version 3.5.3
## Loaded glmnet 2.0-16
library('pls')     # principal components regression -- pcr()
## Warning: package 'pls' was built under R version 3.5.3
##
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
##
## loadings
library('MASS')
## Warning: package 'MASS' was built under R version 3.5.3
library('ISLR')
## Warning: package 'ISLR' was built under R version 3.5.3
my.seed <- 1
# data set
train.percent <- 0.5
data(Boston) # load the data
?Boston
Boston$chas <- as.factor(Boston$chas)
sum(is.na(Boston$crim))
## [1] 0
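The check above covers only crim; all columns can be checked at once (a sketch):
# number of missing values in every column of Boston (expected to be all zeros)
colSums(is.na(Boston))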
inTrain <- sample(seq_along(Boston$crim),
                  nrow(Boston) * train.percent)
df.test <- Boston[-inTrain, -1]
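Note that the split above is not reproducible, because the generator is not seeded before sample(); a sketch of the presumably intended version (assuming my.seed was meant to be used here):
set.seed(my.seed)
inTrain <- sample(seq_along(Boston$crim),
                  nrow(Boston) * train.percent)
df.test <- Boston[-inTrain, -1]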
# fit models with all predictor subsets of size up to 8 (the default)
regfit.full <- regsubsets(crim ~ ., Boston)
summary(regfit.full)
## Subset selection object
## Call: regsubsets.formula(crim ~ ., Boston)
## 13 Variables (and intercept)
## Forced in Forced out
## zn FALSE FALSE
## indus FALSE FALSE
## chas1 FALSE FALSE
## nox FALSE FALSE
## rm FALSE FALSE
## age FALSE FALSE
## dis FALSE FALSE
## rad FALSE FALSE
## tax FALSE FALSE
## ptratio FALSE FALSE
## black FALSE FALSE
## lstat FALSE FALSE
## medv FALSE FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: exhaustive
## zn indus chas1 nox rm age dis rad tax ptratio black lstat medv
## 1 ( 1 ) " " " " " " " " " " " " " " "*" " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " " "*" " " " " " " "*" " "
## 3 ( 1 ) " " " " " " " " " " " " " " "*" " " " " "*" "*" " "
## 4 ( 1 ) "*" " " " " " " " " " " "*" "*" " " " " " " " " "*"
## 5 ( 1 ) "*" " " " " " " " " " " "*" "*" " " " " "*" " " "*"
## 6 ( 1 ) "*" " " " " "*" " " " " "*" "*" " " " " "*" " " "*"
## 7 ( 1 ) "*" " " " " "*" " " " " "*" "*" " " "*" "*" " " "*"
## 8 ( 1 ) "*" " " " " "*" " " " " "*" "*" " " "*" "*" "*" "*"
# fit models with predictor subsets of size up to 13 (the maximum in the data)
regfit.full <- regsubsets(crim ~ ., Boston, nvmax = 13)
reg.summary <- summary(regfit.full)
reg.summary
## Subset selection object
## Call: regsubsets.formula(crim ~ ., Boston, nvmax = 13)
## 13 Variables (and intercept)
## Forced in Forced out
## zn FALSE FALSE
## indus FALSE FALSE
## chas1 FALSE FALSE
## nox FALSE FALSE
## rm FALSE FALSE
## age FALSE FALSE
## dis FALSE FALSE
## rad FALSE FALSE
## tax FALSE FALSE
## ptratio FALSE FALSE
## black FALSE FALSE
## lstat FALSE FALSE
## medv FALSE FALSE
## 1 subsets of each size up to 13
## Selection Algorithm: exhaustive
## zn indus chas1 nox rm age dis rad tax ptratio black lstat medv
## 1 ( 1 ) " " " " " " " " " " " " " " "*" " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " " "*" " " " " " " "*" " "
## 3 ( 1 ) " " " " " " " " " " " " " " "*" " " " " "*" "*" " "
## 4 ( 1 ) "*" " " " " " " " " " " "*" "*" " " " " " " " " "*"
## 5 ( 1 ) "*" " " " " " " " " " " "*" "*" " " " " "*" " " "*"
## 6 ( 1 ) "*" " " " " "*" " " " " "*" "*" " " " " "*" " " "*"
## 7 ( 1 ) "*" " " " " "*" " " " " "*" "*" " " "*" "*" " " "*"
## 8 ( 1 ) "*" " " " " "*" " " " " "*" "*" " " "*" "*" "*" "*"
## 9 ( 1 ) "*" "*" " " "*" " " " " "*" "*" " " "*" "*" "*" "*"
## 10 ( 1 ) "*" "*" " " "*" "*" " " "*" "*" " " "*" "*" "*" "*"
## 11 ( 1 ) "*" "*" " " "*" "*" " " "*" "*" "*" "*" "*" "*" "*"
## 12 ( 1 ) "*" "*" "*" "*" "*" " " "*" "*" "*" "*" "*" "*" "*"
## 13 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
# structure of the model report (looking for the quality measures)
names(reg.summary)
## [1] "which" "rsq" "rss" "adjr2" "cp" "bic" "outmat" "obj"
# R^2 and adjusted R^2
round(reg.summary$rsq, 3)
## [1] 0.391 0.421 0.429 0.433 0.439 0.444 0.448 0.450 0.452 0.453 0.454
## [12] 0.454 0.454
# plot
plot(1:13, reg.summary$rsq, type = 'b',
     xlab = 'Number of predictors', ylab = 'R-squared')
# add the adjusted R-squared to the same plot
points(1:13, reg.summary$adjr2, col = 'red')
# model with the maximum adjusted R-squared
which.max(reg.summary$adjr2)
## [1] 9
### 9
points(which.max(reg.summary$adjr2),
       reg.summary$adjr2[which.max(reg.summary$adjr2)],
       col = 'red', cex = 2, pch = 20)
legend('bottomright', legend = c('R^2', 'R^2_adj'),
       col = c('black', 'red'), lty = c(1, NA),
       pch = c(1, 1))
# C_p
reg.summary$cp
## [1] 46.548369 21.929562 16.886597 14.492012 11.279399 9.004924 7.722956
## [8] 7.198756 7.414373 8.858957 10.405385 12.006558 14.000000
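As a sanity check, these values can be reproduced by hand, assuming the classic Mallows formula C_p = RSS_d / s^2 - n + 2*(d + 1), where s^2 is the residual variance of the full 13-predictor model (a sketch):
n  <- nrow(Boston)
s2 <- reg.summary$rss[13] / (n - 13 - 1)            # full-model residual variance
cp.manual <- reg.summary$rss / s2 - n + 2 * ((1:13) + 1)
round(cp.manual, 3)                                  # should match reg.summary$cp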
# number of predictors at the optimal value of the criterion
which.min(reg.summary$cp)
## [1] 8
### 8
# plot
plot(reg.summary$cp, xlab = 'Number of predictors',
     ylab = 'C_p', type = 'b')
points(which.min(reg.summary$cp),
       reg.summary$cp[which.min(reg.summary$cp)],
       col = 'red', cex = 2, pch = 20)
# BIC
reg.summary$bic
## [1] -238.7044 -257.6477 -258.2957 -256.4064 -255.3732 -253.4454 -250.5445
## [8] -246.8906 -242.4907 -236.8341 -231.0734 -225.2569 -219.0371
# number of predictors at the optimal value of the criterion
which.min(reg.summary$bic)
## [1] 3
### 3
# plot
plot(reg.summary$bic, xlab = 'Number of predictors',
     ylab = 'BIC', type = 'b')
points(which.min(reg.summary$bic),
       reg.summary$bic[which.min(reg.summary$bic)],
       col = 'red', cex = 2, pch = 20)
# plot method for visualising the selection results
?plot.regsubsets
plot(regfit.full, scale = 'r2')
plot(regfit.full, scale = 'adjr2')
plot(regfit.full, scale = 'Cp')
plot(regfit.full, scale = 'bic')
# coefficients of the model with the smallest BIC
round(coef(regfit.full, 3), 3)
## (Intercept) rad black lstat
## -0.373 0.488 -0.009 0.214
The best models:
Model 1, with the smallest BIC: a regression with three explanatory variables.
Model 2, optimal by the C_p criterion: a regression with eight explanatory variables.
Model 3, with the largest adjusted R-squared: a regression with nine explanatory variables.
Let us fit these models and assess their performance.
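Since the same test-MSE calculation is repeated for each of the three models, a small helper keeps the evaluations consistent (a sketch; the name mse.test is ours, not from the original code):
# test-set MSE of a fitted lm model, using the train/test split created above
mse.test <- function(model) {
    y.fact <- Boston[-inTrain, 1]          # actual crim values in the test set
    y.pred <- predict(model, df.test)      # predictions for the test set
    mean((y.pred - y.fact)^2)
}
# usage: mse.test(model1); mse.test(model2); mse.test(model3)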
model1 <- lm(crim ~ rad + black + lstat,
             data = Boston)
summary(model1)
##
## Call:
## lm(formula = crim ~ rad + black + lstat, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.023 -1.713 -0.281 0.873 77.716
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.372585 1.641557 -0.227 0.82054
## rad 0.488172 0.040422 12.077 < 2e-16 ***
## black -0.009472 0.003615 -2.620 0.00905 **
## lstat 0.213596 0.047447 4.502 8.39e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.521 on 502 degrees of freedom
## Multiple R-squared: 0.4286, Adjusted R-squared: 0.4252
## F-statistic: 125.5 on 3 and 502 DF, p-value: < 2.2e-16
y.fact <- Boston[-inTrain, 1]
y.model1.lm <- predict(model1, df.test)
MSE.lm <- sum((y.model1.lm - y.fact)^2) / length(y.model1.lm)
MSE.lm
## [1] 38.97635
In this model all explanatory variables are significant at the 0.01 level.
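A quick programmatic check of this claim (a sketch):
# p-values of the explanatory variables of model1, compared with the 0.01 level
summary(model1)$coefficients[-1, 'Pr(>|t|)'] < 0.01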
model2 <- lm(crim ~ zn + nox + dis + rad + ptratio + black + lstat + medv,
             data = Boston)
summary(model2)
##
## Call:
## lm(formula = crim ~ zn + nox + dis + rad + ptratio + black +
## lstat + medv, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.860 -2.102 -0.363 0.895 75.702
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 19.683128 6.086010 3.234 0.001301 **
## zn 0.043293 0.017977 2.408 0.016394 *
## nox -12.753708 4.760157 -2.679 0.007623 **
## dis -0.918318 0.261932 -3.506 0.000496 ***
## rad 0.532617 0.049727 10.711 < 2e-16 ***
## ptratio -0.310541 0.182941 -1.697 0.090229 .
## black -0.007922 0.003615 -2.191 0.028897 *
## lstat 0.110173 0.069219 1.592 0.112097
## medv -0.174207 0.053988 -3.227 0.001334 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.428 on 497 degrees of freedom
## Multiple R-squared: 0.4505, Adjusted R-squared: 0.4416
## F-statistic: 50.92 on 8 and 497 DF, p-value: < 2.2e-16
y.fact <- Boston[-inTrain, 1]
y.model2.lm <- predict(model2, df.test)
MSE.lm <- sum((y.model2.lm - y.fact)^2) / length(y.model2.lm)
MSE.lm
## [1] 37.68813
In this model only half of the explanatory variables are significant at the 0.01 level.
model3 <- lm(crim ~ zn + indus + nox + dis + rad + ptratio + black + lstat + medv,
             data = Boston)
summary(model3)
##
## Call:
## lm(formula = crim ~ zn + indus + nox + dis + rad + ptratio +
## black + lstat + medv, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.946 -2.086 -0.358 0.974 75.644
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 19.124636 6.095437 3.138 0.001805 **
## zn 0.042788 0.017967 2.381 0.017620 *
## indus -0.099386 0.074207 -1.339 0.181085
## nox -10.466490 5.053655 -2.071 0.038869 *
## dis -1.002598 0.269182 -3.725 0.000218 ***
## rad 0.539504 0.049953 10.800 < 2e-16 ***
## ptratio -0.270836 0.185183 -1.463 0.144230
## black -0.008004 0.003613 -2.215 0.027199 *
## lstat 0.117806 0.069398 1.698 0.090223 .
## medv -0.180594 0.054155 -3.335 0.000918 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.422 on 496 degrees of freedom
## Multiple R-squared: 0.4524, Adjusted R-squared: 0.4425
## F-statistic: 45.54 on 9 and 496 DF, p-value: < 2.2e-16
y.fact <- Boston[-inTrain, 1]
y.model3.lm <- predict(model3, df.test)
MSE.lm <- sum((y.model3.lm - y.fact)^2) / length(y.model3.lm)
MSE.lm
## [1] 37.69676
In this model only three explanatory variables are significant at the 0.01 level.
The test MSE of the three models is roughly the same.
Conclusion: we take the first model, with three explanatory variables, as the best one.
regfit.fwd <- regsubsets(crim ~ ., data = Boston,
                         nvmax = 13, method = 'forward')
summary(regfit.fwd)
## Subset selection object
## Call: regsubsets.formula(crim ~ ., data = Boston, nvmax = 13, method = "forward")
## 13 Variables (and intercept)
## Forced in Forced out
## zn FALSE FALSE
## indus FALSE FALSE
## chas1 FALSE FALSE
## nox FALSE FALSE
## rm FALSE FALSE
## age FALSE FALSE
## dis FALSE FALSE
## rad FALSE FALSE
## tax FALSE FALSE
## ptratio FALSE FALSE
## black FALSE FALSE
## lstat FALSE FALSE
## medv FALSE FALSE
## 1 subsets of each size up to 13
## Selection Algorithm: forward
## zn indus chas1 nox rm age dis rad tax ptratio black lstat medv
## 1 ( 1 ) " " " " " " " " " " " " " " "*" " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " " "*" " " " " " " "*" " "
## 3 ( 1 ) " " " " " " " " " " " " " " "*" " " " " "*" "*" " "
## 4 ( 1 ) " " " " " " " " " " " " " " "*" " " " " "*" "*" "*"
## 5 ( 1 ) "*" " " " " " " " " " " " " "*" " " " " "*" "*" "*"
## 6 ( 1 ) "*" " " " " " " " " " " "*" "*" " " " " "*" "*" "*"
## 7 ( 1 ) "*" " " " " "*" " " " " "*" "*" " " " " "*" "*" "*"
## 8 ( 1 ) "*" " " " " "*" " " " " "*" "*" " " "*" "*" "*" "*"
## 9 ( 1 ) "*" "*" " " "*" " " " " "*" "*" " " "*" "*" "*" "*"
## 10 ( 1 ) "*" "*" " " "*" "*" " " "*" "*" " " "*" "*" "*" "*"
## 11 ( 1 ) "*" "*" " " "*" "*" " " "*" "*" "*" "*" "*" "*" "*"
## 12 ( 1 ) "*" "*" "*" "*" "*" " " "*" "*" "*" "*" "*" "*" "*"
## 13 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
round(coef(regfit.fwd, 3), 3)
## (Intercept) rad black lstat
## -0.373 0.488 -0.009 0.214
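To see which model size forward selection itself prefers by BIC, its summary can be inspected (a sketch, not executed in the original report):
fwd.summary <- summary(regfit.fwd)
which.min(fwd.summary$bic)      # BIC-optimal model size for forward selection
round(coef(regfit.fwd, which.min(fwd.summary$bic)), 3)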
Result: the optimal number of explanatory variables is 2.
x <- model.matrix(crim ~ ., Boston)[, -1]   # matrix of explanatory variables
# and the vector of the response variable
y <- Boston$crim
set.seed(my.seed)
train <- sample(c(T, F), nrow(Boston), rep = T)
test <- !train
y.test <- y[test]
# fit the models
regfit.best <- regsubsets(crim ~ ., data = Boston[train, ],
                          nvmax = 13)
# model matrix of explanatory variables for the test set
test.mat <- model.matrix(crim ~ ., data = Boston[test, ])
# vector of errors
val.errors <- rep(NA, 13)
# loop over the number of predictors
for (i in 1:13){
    coefi <- coef(regfit.best, id = i)
    pred <- test.mat[, names(coefi)] %*% coefi
    # store the test-set MSE in the vector
    val.errors[i] <- mean((Boston$crim[test] - pred)^2)
}
round(val.errors, 0)
## [1] 59 56 58 58 57 57 59 60 60 60 59 59 59
# number of predictors in the optimal model
which.min(val.errors)
## [1] 2
### 2
# coefficients of the optimal model
round(coef(regfit.best, 2), 3)
## (Intercept) rad lstat
## -3.760 0.475 0.204
# prediction method for objects returned by regsubsets()
predict.regsubsets <- function(object, newdata, id, ...){
    form <- as.formula(object$call[[2]])
    mat <- model.matrix(form, newdata)
    coefi <- coef(object, id = id)
    xvars <- names(coefi)
    mat[, xvars] %*% coefi
}
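With this S3 method defined, the generic predict() dispatches to it for regsubsets objects; for example (a sketch):
# predictions of the 2-variable model for the test observations
pred.2 <- predict(regfit.best, Boston[test, ], id = 2)
mean((Boston$crim[test] - pred.2)^2)   # reproduces val.errors[2]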
# refit the model with the optimal number of variables on the full data set
regfit.best <- regsubsets(crim ~ ., data = Boston,
                          nvmax = 13)
round(coef(regfit.best, 2), 3)
## (Intercept) rad lstat
## -4.381 0.523 0.237
Result: the optimal number of explanatory variables is 12.
# assign the observations to 10 folds
k <- 10
set.seed(my.seed)
folds <- sample(1:k, nrow(Boston), replace = T)
# empty matrix for the errors
cv.errors <- matrix(NA, k, 13, dimnames = list(NULL, paste(1:13)))
# fill the matrix in a loop over the folds
for (j in 1:k){
    best.fit <- regsubsets(crim ~ ., data = Boston[folds != j, ],
                           nvmax = 13)
    # now loop over the number of explanatory variables
    for (i in 1:13){
        # predicted values of crim
        pred <- predict(best.fit, Boston[folds == j, ], id = i)
        # record the error in the matrix
        cv.errors[j, i] <- mean((Boston$crim[folds == j] - pred)^2)
    }
}
# average the matrix over each column (i.e. over the folds) to get an MSE
# estimate for each model with a fixed number of explanatory variables
mean.cv.errors <- apply(cv.errors, 2, mean)
round(mean.cv.errors, 0)
## 1 2 3 4 5 6 7 8 9 10 11 12 13
## 44 43 43 43 42 42 42 42 41 41 41 41 41
# plot
plot(mean.cv.errors, type = 'b')
points(which.min(mean.cv.errors), mean.cv.errors[which.min(mean.cv.errors)],
       col = 'red', pch = 20, cex = 2)
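The CV-optimal number of variables can be read off directly (a sketch; judging by the refit below, it is 12 here):
which.min(mean.cv.errors)   # number of explanatory variables with the smallest CV error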
# refit the model with 12 explanatory variables on the full data set
reg.best <- regsubsets(crim ~ ., data = Boston, nvmax = 13)
round(coef(reg.best, 12), 3)
## (Intercept) zn indus chas1 nox rm
## 16.986 0.045 -0.064 -0.744 -10.202 0.440
## dis rad tax ptratio black lstat
## -0.994 0.588 -0.004 -0.270 -0.008 0.128
## medv
## -0.199
x <- model.matrix(crim ~ ., Boston)[, -1]   # matrix of explanatory variables
# and the vector of the response variable
y <- Boston$crim
grid <- 10^seq(10, -2, length = 100)
lasso.mod <- glmnet(x[train, ], y[train], alpha = 1, lambda = grid)
plot(lasso.mod)
# Choosing the optimal lambda by cross-validation ##########
set.seed(my.seed)
cv.out <- cv.glmnet(x[train, ], y[train], alpha = 1)
plot(cv.out)
bestlam <- cv.out$lambda.min
lasso.pred <- predict(lasso.mod, s = bestlam, newx = x[test, ])
round(mean((lasso.pred - y.test)^2), 0)
## [1] 56
# coefficients of the best model
out <- glmnet(x, y, alpha = 1, lambda = grid)
lasso.coef <- predict(out, type = 'coefficients',
                      s = bestlam)[1:14, ]
round(lasso.coef, 3)
## (Intercept) zn indus chas1 nox rm
## 2.540 0.012 0.000 -0.119 0.000 0.000
## age dis rad tax ptratio black
## 0.000 -0.198 0.464 0.000 0.000 -0.007
## lstat medv
## 0.119 -0.076
round(lasso.coef[lasso.coef != 0], 3)
## (Intercept) zn chas1 dis rad black
## 2.540 0.012 -0.119 -0.198 0.464 -0.007
## lstat medv
## 0.119 -0.076
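As an aside, cross-validation also provides the more conservative one-standard-error lambda, which usually gives an even sparser model; a sketch:
# lasso coefficients at the 1-SE lambda (typically fewer non-zero terms)
lasso.coef.1se <- predict(out, type = 'coefficients',
                          s = cv.out$lambda.1se)[1:14, ]
round(lasso.coef.1se[lasso.coef.1se != 0], 3)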