library(MASS)
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
set.seed(1)
train <- sample(1:nrow(Boston), nrow(Boston) / 2)
Boston.train <- Boston[train, -14]
Boston.test <- Boston[-train, -14]
Y.train <- Boston[train, 14]
Y.test <- Boston[-train, 14]
rf.boston1 <- randomForest(Boston.train, y = Y.train, xtest = Boston.test, ytest = Y.test, mtry = ncol(Boston) - 1, ntree = 500)
rf.boston2 <- randomForest(Boston.train, y = Y.train, xtest = Boston.test, ytest = Y.test, mtry = (ncol(Boston) - 1) / 2, ntree = 500)
rf.boston3 <- randomForest(Boston.train, y = Y.train, xtest = Boston.test, ytest = Y.test, mtry = sqrt(ncol(Boston) - 1), ntree = 500)
plot(1:500, rf.boston1$test$mse, col = "green", type = "l", xlab = "Number of Trees", ylab = "Test MSE", ylim = c(10, 19))
lines(1:500, rf.boston2$test$mse, col = "red", type = "l")
lines(1:500, rf.boston3$test$mse, col = "blue", type = "l")
legend("topright", c("m = p", "m = p/2", "m = sqrt(p)"), col = c("green", "red", "blue"), cex = 1, lty = 1)
The test MSE is very high for a single tree and decreases as the number of trees increases. The test MSE for m = p (all predictors) is higher than for m = p/2 or m = sqrt(p).
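As a quick check of this claim, the test MSE reached after all 500 trees can be pulled out of each fit (a small sketch using the objects above):
# Test MSE after 500 trees for m = p, m = p/2 and m = sqrt(p).
sapply(list(m.p = rf.boston1, m.half = rf.boston2, m.sqrt = rf.boston3),
       function(fit) tail(fit$test$mse, 1))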
library(ISLR)
set.seed(1)
train = sample(dim(OJ)[1],800)
OJ.train = OJ[train,]
OJ.test = OJ[-train,]
library(tree)
OJ.tree = tree(Purchase~., data=OJ.train)
summary(OJ.tree)
##
## Classification tree:
## tree(formula = Purchase ~ ., data = OJ.train)
## Variables actually used in tree construction:
## [1] "LoyalCH" "PriceDiff" "SpecialCH" "ListPriceDiff"
## Number of terminal nodes: 8
## Residual mean deviance: 0.7305 = 578.6 / 792
## Misclassification error rate: 0.165 = 132 / 800
The fitted tree has 8 terminal nodes and a training error rate of 0.165.
OJ.tree
## node), split, n, deviance, yval, (yprob)
## * denotes terminal node
##
## 1) root 800 1064.00 CH ( 0.61750 0.38250 )
## 2) LoyalCH < 0.508643 350 409.30 MM ( 0.27143 0.72857 )
## 4) LoyalCH < 0.264232 166 122.10 MM ( 0.12048 0.87952 )
## 8) LoyalCH < 0.0356415 57 10.07 MM ( 0.01754 0.98246 ) *
## 9) LoyalCH > 0.0356415 109 100.90 MM ( 0.17431 0.82569 ) *
## 5) LoyalCH > 0.264232 184 248.80 MM ( 0.40761 0.59239 )
## 10) PriceDiff < 0.195 83 91.66 MM ( 0.24096 0.75904 )
## 20) SpecialCH < 0.5 70 60.89 MM ( 0.15714 0.84286 ) *
## 21) SpecialCH > 0.5 13 16.05 CH ( 0.69231 0.30769 ) *
## 11) PriceDiff > 0.195 101 139.20 CH ( 0.54455 0.45545 ) *
## 3) LoyalCH > 0.508643 450 318.10 CH ( 0.88667 0.11333 )
## 6) LoyalCH < 0.764572 172 188.90 CH ( 0.76163 0.23837 )
## 12) ListPriceDiff < 0.235 70 95.61 CH ( 0.57143 0.42857 ) *
## 13) ListPriceDiff > 0.235 102 69.76 CH ( 0.89216 0.10784 ) *
## 7) LoyalCH > 0.764572 278 86.14 CH ( 0.96403 0.03597 ) *
I pick the node labelled 8, a terminal node (marked with an asterisk). Its split criterion is LoyalCH < 0.0356, it contains 57 observations with a deviance of 10.07, and its overall prediction is MM. Fewer than 2% of the observations in this node take the value CH; the remaining 98% take the value MM.
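Node 8 can also be read off programmatically rather than from the printout; a small sketch, assuming the usual layout of a tree object's frame, where row names are node numbers:
# Row "8" of the frame is node 8: splitting variable ("<leaf>" for a terminal
# node), number of observations, deviance, fitted class and class probabilities.
OJ.tree$frame["8", ]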
plot(OJ.tree)
text(OJ.tree,pretty=TRUE)
The most important indicator of “Purchase” appears to be “LoyalCH”, since the first branch differentiates the intensity of customer brand loyalty to CH. In fact, the top three nodes contain “LoyalCH”.
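A rough way to back this up is to count how often each variable is used for a split (a small sketch using the fitted tree's frame):
# Number of splits per variable; "<leaf>" counts the terminal nodes.
table(as.character(OJ.tree$frame$var))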
tree.pred = predict(OJ.tree, newdata = OJ.test, type = "class")
table(tree.pred,OJ.test$Purchase)
##
## tree.pred CH MM
## CH 147 49
## MM 12 62
(147+62)/270
## [1] 0.7740741
About 77% of the test observations are correctly classified, so the test error rate is about 23%.
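The same error rate can be computed directly from the predictions:
# Test error rate of the unpruned tree.
mean(tree.pred != OJ.test$Purchase)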
cv.OJ = cv.tree(OJ.tree, FUN = prune.misclass)
cv.OJ
## $size
## [1] 8 5 2 1
##
## $dev
## [1] 156 156 156 306
##
## $k
## [1] -Inf 0.000000 4.666667 160.000000
##
## $method
## [1] "misclass"
##
## attr(,"class")
## [1] "prune" "tree.sequence"
plot(cv.OJ$size,cv.OJ$dev,type='b', xlab = "Tree size", ylab = "Deviance")
The cross-validation results show that trees with 2, 5, and 8 terminal nodes are tied for the lowest classification error rate (156 misclassifications). We prune the tree to 5 terminal nodes.
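The tied sizes can also be extracted programmatically (a small sketch):
# Tree sizes whose cross-validated error equals the minimum.
cv.OJ$size[cv.OJ$dev == min(cv.OJ$dev)]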
prune.OJ = prune.misclass(OJ.tree, best=5)
plot(prune.OJ)
text(prune.OJ,pretty=0)
tree.pred = predict(prune.OJ, newdata = OJ.test, type = "class")
table(tree.pred,OJ.test$Purchase)
##
## tree.pred CH MM
## CH 147 49
## MM 12 62
(147+62)/270
## [1] 0.7740741
In this case, pruning gives the same test error but produces a much more interpretable tree.
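One way to quantify "more interpretable" is to compare the number of terminal nodes before and after pruning (a small sketch):
# Terminal-node counts for the unpruned and pruned trees.
c(unpruned = sum(OJ.tree$frame$var == "<leaf>"),
  pruned   = sum(prune.OJ$frame$var == "<leaf>"))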
Hitters = na.omit(Hitters)
Hitters$Salary = log(Hitters$Salary)
train = 1:200
hitters.train = Hitters[train,]
hitters.test = Hitters[-train,]
library(gbm)
## Loading required package: survival
## Loading required package: lattice
## Loading required package: splines
## Loading required package: parallel
## Loaded gbm 2.1.3
set.seed(1)
pows = seq(-10, -0.2, by = 0.1)
lambdas = 10^pows
train.err = rep(NA, length(lambdas))
for (i in 1:length(lambdas)) {
boost.hitters = gbm(Salary ~ ., data = hitters.train, distribution = "gaussian", n.trees = 1000, shrinkage = lambdas[i])
pred.train = predict(boost.hitters, hitters.train, n.trees = 1000)
train.err[i] = mean((pred.train - hitters.train$Salary)^2)
}
plot(lambdas, train.err, type = "b", xlab = "Shrinkage values", ylab = "Training MSE")
set.seed(1)
test.err <- rep(NA, length(lambdas))
for (i in 1:length(lambdas)) {
boost.hitters = gbm(Salary ~ ., data = hitters.train, distribution = "gaussian", n.trees = 1000, shrinkage = lambdas[i])
yhat = predict(boost.hitters, hitters.test, n.trees = 1000)
test.err[i] = mean((yhat - hitters.test$Salary)^2)
}
plot(lambdas, test.err, type = "b", xlab = "Shrinkage values", ylab = "Test MSE")
min(test.err)
## [1] 0.2540265
lambdas[which.min(test.err)]
## [1] 0.07943282
library(glmnet)
## Loading required package: Matrix
## Loading required package: foreach
## Warning: package 'foreach' was built under R version 3.4.3
## Loaded glmnet 2.0-13
fit1 = lm(Salary ~ ., data = hitters.train)
pred1 = predict(fit1, hitters.test)
mean((pred1 - hitters.test$Salary)^2)
## [1] 0.4917959
x = model.matrix(Salary ~ ., data = hitters.train)
x.test = model.matrix(Salary ~ ., data = hitters.test)
y = hitters.train$Salary
fit2 = glmnet(x, y, alpha = 0)
pred2 = predict(fit2, s = 0.01, newx = x.test)
mean((pred2 - hitters.test$Salary)^2)
## [1] 0.4570283
The test MSE for boosting (about 0.25) is lower than the test MSE for both least squares regression (0.49) and ridge regression (0.46).
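Putting the three fits side by side (a small sketch using the objects above):
# Test MSEs: boosting (best shrinkage), least squares, and ridge regression.
c(boosting = min(test.err),
  lm       = mean((pred1 - hitters.test$Salary)^2),
  ridge    = mean((pred2 - hitters.test$Salary)^2))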
boost.hitters <- gbm(Salary ~ ., data = hitters.train, distribution = "gaussian", n.trees = 1000, shrinkage = lambdas[which.min(test.err)])
summary(boost.hitters)
## var rel.inf
## CAtBat CAtBat 20.8404970
## CRBI CRBI 12.3158959
## Walks Walks 7.4186037
## PutOuts PutOuts 7.1958539
## Years Years 6.3104535
## CWalks CWalks 6.0221656
## CHmRun CHmRun 5.7759763
## CHits CHits 4.8914360
## AtBat AtBat 4.2187460
## RBI RBI 4.0812410
## Hits Hits 4.0117255
## Assists Assists 3.8786634
## HmRun HmRun 3.6386178
## CRuns CRuns 3.3230296
## Errors Errors 2.6369128
## Runs Runs 2.2048386
## Division Division 0.5347342
## NewLeague NewLeague 0.4943540
## League League 0.2062551
“CAtBat” is by far the most important variable.
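The top of the relative-influence table can also be printed without redrawing the barplot (a small sketch, assuming summary.gbm's plotit argument as in gbm 2.1):
# Five most influential predictors by relative influence.
head(summary(boost.hitters, plotit = FALSE), 5)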
set.seed(1)
bag.hitters <- randomForest(Salary ~ ., data = hitters.train, mtry = 19, ntree = 500)
yhat.bag <- predict(bag.hitters, newdata = hitters.test)
mean((yhat.bag - hitters.test$Salary)^2)
## [1] 0.2313593
The test MSE for bagging is about 0.23, slightly lower than the test MSE of about 0.25 obtained with boosting.
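For a direct comparison:
# Bagging vs. boosting test MSE on the Hitters test set.
c(bagging  = mean((yhat.bag - hitters.test$Salary)^2),
  boosting = min(test.err))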
train = 1:1000
Caravan$Purchase = ifelse(Caravan$Purchase == "Yes", 1, 0)
Caravan.train = Caravan[train,]
Caravan.test = Caravan[-train,]
set.seed(1)
boost.caravan = gbm(Purchase ~ ., data = Caravan.train, distribution = "gaussian", n.trees = 1000, shrinkage = 0.01)
## Warning in gbm.fit(x, y, offset = offset, distribution = distribution, w =
## w, : variable 50: PVRAAUT has no variation.
## Warning in gbm.fit(x, y, offset = offset, distribution = distribution, w =
## w, : variable 71: AVRAAUT has no variation.
summary(boost.caravan)
## var rel.inf
## PPERSAUT PPERSAUT 13.51824557
## MKOOPKLA MKOOPKLA 10.24062778
## MOPLHOOG MOPLHOOG 7.32689780
## MBERMIDD MBERMIDD 6.35820558
## PBRAND PBRAND 4.98826360
## ABRAND ABRAND 4.54504653
## MGODGE MGODGE 4.26496875
## MINK3045 MINK3045 4.13253907
## PWAPART PWAPART 3.15612877
## MAUT1 MAUT1 2.76929763
## MOSTYPE MOSTYPE 2.56937935
## MAUT2 MAUT2 1.99879666
## MSKA MSKA 1.94618539
## MBERARBG MBERARBG 1.89917331
## PBYSTAND PBYSTAND 1.88591514
## MINKGEM MINKGEM 1.87131472
## MGODOV MGODOV 1.81673309
## MGODPR MGODPR 1.80814745
## MFWEKIND MFWEKIND 1.67884570
## MSKC MSKC 1.65075962
## MBERHOOG MBERHOOG 1.53559951
## MSKB1 MSKB1 1.43339514
## MOPLMIDD MOPLMIDD 1.10617074
## MHHUUR MHHUUR 1.09608784
## MRELGE MRELGE 1.09039794
## MINK7512 MINK7512 1.08772012
## MZFONDS MZFONDS 1.08427551
## MGODRK MGODRK 1.03126657
## MINK4575 MINK4575 1.02492795
## MZPART MZPART 0.98536712
## MRELOV MRELOV 0.80356854
## MFGEKIND MFGEKIND 0.80335689
## MBERARBO MBERARBO 0.60909852
## APERSAUT APERSAUT 0.56707821
## MGEMOMV MGEMOMV 0.55589456
## MOSHOOFD MOSHOOFD 0.55498375
## MAUT0 MAUT0 0.54748481
## PMOTSCO PMOTSCO 0.43362597
## MSKB2 MSKB2 0.43075446
## MSKD MSKD 0.42751490
## MINK123M MINK123M 0.40920707
## MINKM30 MINKM30 0.36996576
## MHKOOP MHKOOP 0.34941518
## MBERBOER MBERBOER 0.28967068
## MFALLEEN MFALLEEN 0.28877552
## MGEMLEEF MGEMLEEF 0.20084195
## MOPLLAAG MOPLLAAG 0.15750616
## MBERZELF MBERZELF 0.11203381
## PLEVEN PLEVEN 0.11030994
## MRELSA MRELSA 0.04500507
## MAANTHUI MAANTHUI 0.03322830
## PWABEDR PWABEDR 0.00000000
## PWALAND PWALAND 0.00000000
## PBESAUT PBESAUT 0.00000000
## PVRAAUT PVRAAUT 0.00000000
## PAANHANG PAANHANG 0.00000000
## PTRACTOR PTRACTOR 0.00000000
## PWERKT PWERKT 0.00000000
## PBROM PBROM 0.00000000
## PPERSONG PPERSONG 0.00000000
## PGEZONG PGEZONG 0.00000000
## PWAOREG PWAOREG 0.00000000
## PZEILPL PZEILPL 0.00000000
## PPLEZIER PPLEZIER 0.00000000
## PFIETS PFIETS 0.00000000
## PINBOED PINBOED 0.00000000
## AWAPART AWAPART 0.00000000
## AWABEDR AWABEDR 0.00000000
## AWALAND AWALAND 0.00000000
## ABESAUT ABESAUT 0.00000000
## AMOTSCO AMOTSCO 0.00000000
## AVRAAUT AVRAAUT 0.00000000
## AAANHANG AAANHANG 0.00000000
## ATRACTOR ATRACTOR 0.00000000
## AWERKT AWERKT 0.00000000
## ABROM ABROM 0.00000000
## ALEVEN ALEVEN 0.00000000
## APERSONG APERSONG 0.00000000
## AGEZONG AGEZONG 0.00000000
## AWAOREG AWAOREG 0.00000000
## AZEILPL AZEILPL 0.00000000
## APLEZIER APLEZIER 0.00000000
## AFIETS AFIETS 0.00000000
## AINBOED AINBOED 0.00000000
## ABYSTAND ABYSTAND 0.00000000
The variables “PPERSAUT” and “MKOOPKLA” are the two most important variables.
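Their relative influence can be pulled out of the summary table directly (a small sketch, again assuming summary.gbm's plotit argument):
# Relative influence of the two leading predictors.
infl <- summary(boost.caravan, plotit = FALSE)
infl[infl$var %in% c("PPERSAUT", "MKOOPKLA"), ]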
probs.test <- predict(boost.caravan, Caravan.test, n.trees = 1000, type = "response")
pred.test <- ifelse(probs.test > 0.2, 1, 0)
table(Caravan.test$Purchase, pred.test)
## pred.test
## 0 1
## 0 4493 40
## 1 278 11
For boosting, the fraction of people predicted to make a purchase who actually make one is 11 / 51 ≈ 0.216.
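That fraction is the number of true purchasers among those predicted to purchase; computed directly from the table:
# Share of predicted purchasers (cutoff 0.20) who actually purchased.
conf <- table(Caravan.test$Purchase, pred.test)
conf["1", "1"] / sum(conf[, "1"])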
logit.caravan <- glm(Purchase ~ ., data = Caravan.train, family = "binomial")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
probs.test2 <- predict(logit.caravan, Caravan.test, type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
pred.test2 <- ifelse(probs.test2 > 0.2, 1, 0)
table(Caravan.test$Purchase, pred.test2)
## pred.test2
## 0 1
## 0 4493 40
## 1 278 11
For logistic regression, the fraction of people predicted to make a purchase who actually make one can be read off its confusion matrix in the same way, allowing a direct comparison with boosting at the 20% cutoff.
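The same computation for the logistic regression predictions (a small sketch; it assumes at least one observation is predicted to purchase at the 0.20 cutoff):
# Share of predicted purchasers who actually purchased, logistic regression.
conf2 <- table(Caravan.test$Purchase, pred.test2)
conf2["1", "1"] / sum(conf2[, "1"])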