library("ISLR")
data("Hitters")
library(gbm)
Hitters <- na.omit(Hitters)
Hitters$Salary <- log(Hitters$Salary)
train <- 1:200
Hitters.train <- Hitters[train, ]
Hitters.test <- Hitters[-train, ]
set.seed(1)
pows <- seq(-10, -0.2, by = 0.1)
lambdas <- 10^pows
train.err <- rep(NA, length(lambdas))
for (i in 1:length(lambdas)) {
  boost.hitters <- gbm(Salary ~ ., data = Hitters.train, distribution = "gaussian",
                       n.trees = 1000, shrinkage = lambdas[i])
  pred.train <- predict(boost.hitters, Hitters.train, n.trees = 1000)
  train.err[i] <- mean((pred.train - Hitters.train$Salary)^2)
}
plot(lambdas, train.err, type = "b", xlab = "Shrinkage values", ylab = "Training MSE")
set.seed(1)
test.err <- rep(NA, length(lambdas))
for (i in 1:length(lambdas)) {
  boost.hitters <- gbm(Salary ~ ., data = Hitters.train, distribution = "gaussian",
                       n.trees = 1000, shrinkage = lambdas[i])
  yhat <- predict(boost.hitters, Hitters.test, n.trees = 1000)
  test.err[i] <- mean((yhat - Hitters.test$Salary)^2)
}
plot(lambdas, test.err, type = "b", xlab = "Shrinkage values", ylab = "Test MSE")
points(lambdas[which.min(test.err)], min(test.err), pch = 20, col = "blue")
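Since the shrinkage values span roughly eight orders of magnitude, the curve is easier to read with a logarithmic x-axis; a minimal variant of the same plot (only the log = "x" argument is new):
# Same test-MSE curve, but with shrinkage on a log scale
plot(lambdas, test.err, type = "b", log = "x",
     xlab = "Shrinkage values (log scale)", ylab = "Test MSE")
points(lambdas[which.min(test.err)], min(test.err), pch = 20, col = "blue")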
mse.boost <- min(test.err)
mse.boost
## [1] 0.2540265
lambdas[which.min(test.err)]
## [1] 0.07943282
The smallest test MSE of 0.2540265 is achieved at \(\lambda\) = 0.0794328.
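The number of trees is held fixed at 1000 throughout. As an optional sanity check, gbm's built-in cross-validation can estimate whether that many iterations is appropriate at the selected shrinkage; a sketch assuming 5-fold CV (the cv.folds value and the reuse of the selected \(\lambda\) are not part of the original analysis):
# Hypothetical check: refit at the chosen shrinkage with 5-fold CV and
# let gbm.perf() estimate the optimal number of boosting iterations.
set.seed(1)
boost.cv <- gbm(Salary ~ ., data = Hitters.train, distribution = "gaussian",
                n.trees = 1000, shrinkage = lambdas[which.min(test.err)], cv.folds = 5)
best.iter <- gbm.perf(boost.cv, method = "cv")
best.iter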
lm.fit <- lm(Salary ~ ., data = Hitters.train)
lm.pred <- predict(lm.fit, Hitters.test)
mse.lm <- mean((Hitters.test$Salary - lm.pred)^2)
mse.lm
## [1] 0.4917959
For the linear regression model the test MSE is 0.4917959, which is larger than the value obtained with boosting.
library(glmnet)
x <- model.matrix(Salary ~ ., Hitters)[, -19]
y <- Hitters$Salary
set.seed(1)
fit.ridge <- cv.glmnet(x[train,], y[train], alpha=0)
lambda <- fit.ridge$lambda.min
pred.ridge <- predict(fit.ridge, s=lambda, newx=x[-train,])
mse.ridge <- mean((pred.ridge - y[-train])^2)
mse.ridge
## [1] 0.4571105
For the ridge regression model the test MSE is 0.4571105, which is also larger than the boosting MSE.
Thus we can conclude that boosting works better on these data than either regression approach.
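To make the comparison explicit, the three test errors can be put side by side (a small summary sketch reusing the objects computed above):
# Test MSEs computed so far, smallest first
mse.compare <- data.frame(method = c("boosting", "linear regression", "ridge regression"),
                          test.mse = c(mse.boost, mse.lm, mse.ridge))
mse.compare[order(mse.compare$test.mse), ]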
boost.best <- gbm(Salary ~ ., data = Hitters.train, distribution = "gaussian",
                  n.trees = 1000, shrinkage = lambdas[which.min(test.err)])
summary(boost.best)
## var rel.inf
## CAtBat CAtBat 12.4208190
## CRuns CRuns 12.1329859
## PutOuts PutOuts 9.0907128
## CRBI CRBI 7.9835959
## CHits CHits 7.3985837
## CWalks CWalks 6.8186161
## Walks Walks 6.2857156
## CHmRun CHmRun 6.2596263
## Years Years 5.4374901
## Hits Hits 4.9649872
## Assists Assists 4.8001837
## RBI RBI 4.6721690
## HmRun HmRun 3.1831680
## AtBat AtBat 3.1161170
## Errors Errors 2.0707743
## Runs Runs 1.8776262
## Division Division 0.7024624
## NewLeague NewLeague 0.5264935
## League League 0.2578733
CAtBat and CRuns are the most influential variables in the boosting model.
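Partial dependence plots show how the fitted log-salary changes with the leading predictors once the other variables are averaged out; a short sketch for the two variables ranked highest above (plot.gbm is called through the generic plot()):
# Partial dependence of log(Salary) on the two most influential predictors
par(mfrow = c(1, 2))
plot(boost.best, i.var = "CAtBat", n.trees = 1000)
plot(boost.best, i.var = "CRuns", n.trees = 1000)
par(mfrow = c(1, 1))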
library(randomForest)
set.seed(2)
bag.hitters <- randomForest(Salary ~ ., data = Hitters.train, mtry = 19, ntree = 500)
bag.pred <- predict(bag.hitters, newdata = Hitters.test)
mse.bag <- mean((bag.pred - Hitters.test$Salary)^2)
mse.bag
## [1] 0.2271886
The test MSE for bagging is 0.2271886, which is lower than the best test MSE for boosting (0.2540265).
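A final one-line summary collects all four test MSEs from this section (a sketch reusing the objects computed above):
# All test MSEs from this section, smallest first
sort(c(bagging = mse.bag, boosting = mse.boost, ridge = mse.ridge, lm = mse.lm))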