Terra nadszedł czas na przetestowanie tych metod (regresja grzbietowa i lasso) oraz metod oceny (zestaw walidacyjny, walidacja krzyżowa) na innych zbiorach danych. Możesz pracować z zespołem nad tą częścią laboratorium.
Możesz użyć dowolnego zbioru danych zawartego w ISLR lub wybrać jeden z pakietów danych na Kaggle/Data World itp. (zmienna zależna musi być ciągła).
Pobierz zbiór danych i spróbuj określić optymalny zestaw parametrów, które należy użyć do jego modelowania!
library(MASS)
##
## Dołączanie pakietu: 'MASS'
## Następujący obiekt został zakryty z 'package:dplyr':
##
## select
data(Boston)
data <- Boston
x = model.matrix(medv ~ ., data)[, -1]
y = data$medv
Regresja grzbietowa
grid = 10^seq(10, -2, length = 100)
ridge_mod = glmnet(x,y, alpha = 0, lambda = grid)
dim(coef(ridge_mod))
## [1] 14 100
plot(ridge_mod)
ridge_mod$lambda[50]
## [1] 11497.57
coef(ridge_mod)[,50]
## (Intercept) crim zn indus chas
## 2.255401e+01 -3.305151e-04 1.131176e-04 -5.160096e-04 5.066065e-03
## nox rm age dis rad
## -2.697423e-02 7.256984e-03 -9.791268e-05 8.654283e-04 -3.203678e-04
## tax ptratio black lstat
## -2.033098e-05 -1.718182e-03 2.672815e-05 -7.566302e-04
sqrt(sum(coef(ridge_mod)[-1,50]^2))
## [1] 0.02847302
predict(ridge_mod, s = 50, type = "coefficients")
## 14 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 23.580881779
## crim -0.036358794
## zn 0.011707752
## indus -0.052663644
## chas 0.914385347
## nox -2.584648038
## rm 1.136076181
## age -0.008821137
## dis 0.021708896
## rad -0.026332105
## tax -0.002040002
## ptratio -0.238276456
## black 0.003150038
## lstat -0.106839435
set.seed(1)
train = Boston %>%
sample_frac(0.5)
test = Boston %>%
setdiff(train)
x_train = model.matrix(medv ~ ., data)[, -1]
x_test = model.matrix(medv ~ ., data)[, -1]
y_train = data$medv
y_test = data$medv
ridge_mod <- glmnet(x_train, y_train, alpha = 0, lambda = grid, thresh = 1e-12)
ridge_pred <- predict(ridge_mod, s = 4, newx = x_test)
mean((ridge_pred - y_test)^2)
## [1] 25.84755
MSE wynosi 25.84755
mean((mean(y_train) - y_test)^2)
## [1] 84.41956
ridge_pred = predict(ridge_mod, s = 1e10, newx = x_test)
mean((ridge_pred - y_test)^2)
## [1] 84.41956
ridge_pred = predict(ridge_mod, s = 0, newx = x_test)
mean((ridge_pred - y_test)^2)
## [1] 21.89515
lm(medv~., data = train)
##
## Call:
## lm(formula = medv ~ ., data = train)
##
## Coefficients:
## (Intercept) crim zn indus chas nox
## 34.666523 -0.092060 0.029921 0.032981 2.306239 -13.560636
## rm age dis rad tax ptratio
## 4.103275 -0.003469 -1.261798 0.380027 -0.019159 -0.980838
## black lstat
## 0.006807 -0.486748
predict(ridge_mod, s = 0, type="coefficients")
## 14 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 3.625092e+01
## crim -1.074746e-01
## zn 4.605961e-02
## indus 1.863996e-02
## chas 2.694201e+00
## nox -1.763377e+01
## rm 3.816640e+00
## age 5.672411e-04
## dis -1.468547e+00
## rad 3.015118e-01
## tax -1.211459e-02
## ptratio -9.506368e-01
## black 9.309147e-03
## lstat -5.237534e-01
set.seed(1)
cv.out = cv.glmnet(x_train, y_train, alpha = 0)
bestlam = cv.out$lambda.min
bestlam
## [1] 0.6777654
plot(cv.out)
ridge_pred = predict(ridge_mod, s = bestlam, newx = x_test)
mean((ridge_pred - y_test)^2)
## [1] 22.41216
out = glmnet(x, y, alpha = 0)
predict(out, type = "coefficients", s = bestlam)
## 14 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 28.001475824
## crim -0.087572712
## zn 0.032681030
## indus -0.038003639
## chas 2.899781645
## nox -11.913360479
## rm 4.011308385
## age -0.003731470
## dis -1.118874607
## rad 0.153730052
## tax -0.005751054
## ptratio -0.854984614
## black 0.009073740
## lstat -0.472423800
Regresja LASSO
lasso_mod = glmnet(x_train,
y_train,
alpha = 1,
lambda = grid) # Dopasuj model lasso do danych treningowych
plot(lasso_mod)
## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## zwijanie do unikalnych wartości 'x'
set.seed(1)
cv.out = cv.glmnet(x_train, y_train, alpha = 1)
plot(cv.out)
out = glmnet(x, y, alpha = 1, lambda = grid) #
lasso_coef = predict(out, type = "coefficients", s = bestlam)
lasso_coef
## 14 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 14.168504167
## crim -0.001637787
## zn .
## indus .
## chas 1.067894029
## nox .
## rm 4.119483795
## age .
## dis .
## rad .
## tax .
## ptratio -0.696627118
## black 0.004562890
## lstat -0.502956041
lasso_coef[lasso_coef != 0]
## [1] 14.168504167 -0.001637787 1.067894029 4.119483795 -0.696627118
## [6] 0.004562890 -0.502956041
Aby zaliczyć to laboratorium, zamieść odpowiedzi na następujące pytania: