Task 4. Chapter 8, ex 10

library("ISLR")
data("Hitters")
library(gbm)

10a

Hitters <- na.omit(Hitters)
Hitters$Salary <- log(Hitters$Salary)
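
As a quick optional check (not part of the exercise output), one can compare the salary distribution before and after the transform; since the column has already been logged, exp(Hitters$Salary) recovers the raw values. Raw salaries are strongly right-skewed, which is why the log transform is used.

# Optional sanity check: raw salaries are right-skewed, log makes them more symmetric
par(mfrow = c(1, 2))
hist(exp(Hitters$Salary), main = "Salary", xlab = "Salary (thousands)")
hist(Hitters$Salary, main = "log(Salary)", xlab = "log(Salary)")
par(mfrow = c(1, 1))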

10b

train <- 1:200
Hitters.train <- Hitters[train,]
Hitters.test <- Hitters[-train,]

10c

set.seed(1)
pows <- seq(-10, -0.2, by = 0.1)
lambdas <- 10^pows
train.err <- rep(NA, length(lambdas))
for (i in 1:length(lambdas)) {
    boost.hitters <- gbm(Salary ~ ., data = Hitters.train, distribution = "gaussian", n.trees = 1000, shrinkage = lambdas[i])
    pred.train <- predict(boost.hitters, Hitters.train, n.trees = 1000)
    train.err[i] <- mean((pred.train - Hitters.train$Salary)^2)
}
plot(lambdas, train.err, type = "b", xlab = "Shrinkage values", ylab = "Training MSE")
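
As a small follow-up (these values are not shown in the original output), the minimum training MSE and the shrinkage value at which it is reached can be read off directly from the vectors computed above:

# Smallest training MSE and the shrinkage value that produced it
min(train.err)
lambdas[which.min(train.err)]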

10d

set.seed(1)
test.err <- rep(NA, length(lambdas))
for (i in 1:length(lambdas)) {
    boost.hitters <- gbm(Salary ~ ., data = Hitters.train, distribution = "gaussian", n.trees = 1000, shrinkage = lambdas[i])
    yhat <- predict(boost.hitters, Hitters.test, n.trees = 1000)
    test.err[i] <- mean((yhat - Hitters.test$Salary)^2)
}
plot(lambdas, test.err, type = "b", xlab = "Shrinkage values", ylab = "Test MSE")
points(lambdas[which.min(test.err)], min(test.err), pch = 20, col = "blue")

mse.boost <- min(test.err)
mse.boost
## [1] 0.2540265
lambdas[which.min(test.err)]
## [1] 0.07943282

The smallest test MSE = 0.2540265 is obtained at \(\lambda\) = 0.0794328.
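
Overlaying the two curves (a sketch that only reuses the train.err and test.err vectors computed above) shows the usual pattern: training MSE keeps falling as the shrinkage grows, while test MSE bottoms out near the value above and then starts to rise.

# Overlay training and test MSE against the shrinkage parameter
plot(lambdas, train.err, type = "b", col = "red",
     ylim = range(c(train.err, test.err)),
     xlab = "Shrinkage values", ylab = "MSE")
lines(lambdas, test.err, type = "b", col = "blue")
legend("topright", legend = c("Training MSE", "Test MSE"),
       col = c("red", "blue"), lty = 1)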

10e

Linear regression

lm.fit <- lm(Salary ~ ., data = Hitters.train)
lm.pred <- predict(lm.fit, Hitters.test)
mse.lm <- mean((Hitters.test$Salary - lm.pred)^2)
mse.lm
## [1] 0.4917959

For the linear regression model the test MSE = 0.4917959, which is higher than the value for boosting.

Ridge regression

library(glmnet)
x <- model.matrix(Salary ~ ., Hitters)[, -19]
y <- Hitters$Salary

set.seed(1)
fit.ridge <- cv.glmnet(x[train,], y[train], alpha=0)
lambda <- fit.ridge$lambda.min
pred.ridge <- predict(fit.ridge, s=lambda, newx=x[-train,])
mse.ridge <- mean((pred.ridge - y[-train])^2)
mse.ridge
## [1] 0.4571105

For the ridge regression model the test MSE = 0.4571105, which is also higher than the test MSE for boosting.

Thus we can conclude that boosting works better on these data than the regression methods.
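
For completeness, the same comparison could also be run with the lasso (alpha = 1 in glmnet). This is only a sketch reusing the x, y, and train objects defined above; its MSE is not part of the results reported in this exercise.

# Lasso as another regression baseline (sketch; result not reported above)
set.seed(1)
fit.lasso <- cv.glmnet(x[train, ], y[train], alpha = 1)
pred.lasso <- predict(fit.lasso, s = fit.lasso$lambda.min, newx = x[-train, ])
mse.lasso <- mean((pred.lasso - y[-train])^2)
mse.lasso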

10f

boost.best <- gbm(Salary ~ ., data = Hitters.train, distribution = "gaussian", n.trees = 1000, shrinkage = lambdas[which.min(test.err)])
summary(boost.best)

##                 var    rel.inf
## CAtBat       CAtBat 12.4208190
## CRuns         CRuns 12.1329859
## PutOuts     PutOuts  9.0907128
## CRBI           CRBI  7.9835959
## CHits         CHits  7.3985837
## CWalks       CWalks  6.8186161
## Walks         Walks  6.2857156
## CHmRun       CHmRun  6.2596263
## Years         Years  5.4374901
## Hits           Hits  4.9649872
## Assists     Assists  4.8001837
## RBI             RBI  4.6721690
## HmRun         HmRun  3.1831680
## AtBat         AtBat  3.1161170
## Errors       Errors  2.0707743
## Runs           Runs  1.8776262
## Division   Division  0.7024624
## NewLeague NewLeague  0.5264935
## League       League  0.2578733

CAtBat and CRuns are the most influential variables in the boosting model.
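
To see how these top predictors relate to the response, partial dependence plots from the gbm package can be drawn (a sketch; these plots are not part of the original output):

# Partial dependence of log(Salary) on the two most influential predictors
plot(boost.best, i.var = "CAtBat")
plot(boost.best, i.var = "CRuns")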

10g

library(randomForest)

set.seed(2)
bag.hitters <- randomForest(Salary~., data = Hitters.train, mtry = 19, ntree = 500)
bag.pred <- predict(bag.hitters, newdata = Hitters.test)
mse.bag <- mean((bag.pred - Hitters.test$Salary)^2)
mse.bag
## [1] 0.2271886

The test MSE for bagging is 0.2271886, which is lower than the best test MSE for boosting (0.2540265).
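
Collecting the four test MSEs computed above into a single chart gives a compact summary (a sketch that only reuses values already computed in this exercise):

# Compare test MSE across the four models fitted above
mse.all <- c(Boosting = mse.boost, Linear = mse.lm,
             Ridge = mse.ridge, Bagging = mse.bag)
barplot(mse.all, ylab = "Test MSE")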