\[f(X) = \sum_{j = 1}^p f_j(X_j)\]
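This is the additive form that boosting with depth-one trees (stumps) produces. A sketch of why, assuming the usual boosting expansion with shrinkage parameter \(\lambda\): each of the \(B\) stumps splits on a single predictor, say \(X_{j(b)}\), so the boosted model

\[\hat{f}(X) = \sum_{b = 1}^B \lambda \hat{f}^b(X) = \sum_{b = 1}^B \lambda \hat{f}^b(X_{j(b)})\]

can be regrouped by the predictor each stump splits on, giving \(f_j(X_j) = \sum_{b:\ j(b) = j} \lambda \hat{f}^b(X_j)\) and hence the additive form above.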
p = seq(0, 1, 0.01)
gini = p * (1 - p) * 2
entropy = -(p * log(p) + (1 - p) * log(1 - p))
class.err = 1 - pmax(p, 1 - p)
matplot(p, cbind(gini, entropy, class.err), type = "l", lty = 1, col = c("red", "green", "blue"), xlab = "p", ylab = "Impurity measure")
legend("topright", legend = c("gini", "entropy", "class error"), lty = 1, col = c("red", "green", "blue"))
library(MASS)
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
set.seed(1101)
train = sample(1:nrow(Boston), nrow(Boston)/2)
X.train <- Boston[train, -14]
X.test <- Boston[-train, -14]
Y.train <- Boston[train, 14]
Y.test <- Boston[-train, 14]
p = dim(Boston)[2] - 1
p_2 = p/2
p.sqrt = sqrt(p)
forest.p = randomForest(X.train, Y.train, xtest=X.test, ytest=Y.test, mtry=p, ntree=500)
forest.p2 = randomForest(X.train, Y.train, xtest=X.test, ytest=Y.test, mtry=p_2, ntree=500)
forest.sqrt.p = randomForest(X.train, Y.train, xtest=X.test, ytest=Y.test, mtry=p.sqrt, ntree=500)
plot(1:500, forest.p$test$mse, col="green", type="l", xlab="Number of Trees", ylab="Test MSE", ylim=c(8, 19))
lines(1:500, forest.p2$test$mse, col="red", type="l")
lines(1:500, forest.sqrt.p$test$mse, col="blue", type="l")
legend("topright", c("m=p", "m=p/2", "m=sqrt(p)"), col=c("green", "red", "blue"), cex=1, lty=1)
library(ISLR)
library(tree)
train <- sample(1:nrow(OJ), 800)
OJ.train = OJ[train, ]
OJ.test = OJ[-train, ]
tree.oj = tree(Purchase~., data=OJ.train)
summary(tree.oj)
##
## Classification tree:
## tree(formula = Purchase ~ ., data = OJ.train)
## Variables actually used in tree construction:
## [1] "LoyalCH" "STORE" "PriceDiff"
## Number of terminal nodes: 7
## Residual mean deviance: 0.727 = 576.5 / 793
## Misclassification error rate: 0.165 = 132 / 800
tree.oj
## node), split, n, deviance, yval, (yprob)
## * denotes terminal node
##
## 1) root 800 1068.00 CH ( 0.61250 0.38750 )
## 2) LoyalCH < 0.508643 356 420.90 MM ( 0.27809 0.72191 )
## 4) LoyalCH < 0.279374 170 123.20 MM ( 0.11765 0.88235 )
## 8) STORE < 1.5 50 57.31 MM ( 0.26000 0.74000 ) *
## 9) STORE > 1.5 120 53.37 MM ( 0.05833 0.94167 ) *
## 5) LoyalCH > 0.279374 186 253.60 MM ( 0.42473 0.57527 )
## 10) PriceDiff < 0.05 80 80.06 MM ( 0.20000 0.80000 ) *
## 11) PriceDiff > 0.05 106 143.20 CH ( 0.59434 0.40566 ) *
## 3) LoyalCH > 0.508643 444 324.70 CH ( 0.88063 0.11937 )
## 6) LoyalCH < 0.705699 142 166.90 CH ( 0.72535 0.27465 )
## 12) PriceDiff < 0.265 81 111.70 CH ( 0.54321 0.45679 ) *
## 13) PriceDiff > 0.265 61 17.60 CH ( 0.96721 0.03279 ) *
## 7) LoyalCH > 0.705699 302 113.30 CH ( 0.95364 0.04636 ) *
plot(tree.oj)
text(tree.oj)
tree.pred = predict(tree.oj, OJ.test, type = "class")
table(tree.pred, OJ.test$Purchase)
##
## tree.pred CH MM
## CH 149 46
## MM 14 61
1-(149+61)/270
## [1] 0.2222222
cv.oj <- cv.tree(tree.oj, FUN = prune.misclass)
cv.oj
## $size
## [1] 7 4 2 1
##
## $dev
## [1] 151 151 159 310
##
## $k
## [1] -Inf 0 10 158
##
## $method
## [1] "misclass"
##
## attr(,"class")
## [1] "prune" "tree.sequence"
plot(cv.oj$size, cv.oj$dev, type = "b", xlab = "Tree size", ylab = "CV classification errors")
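Sizes 7 and 4 tie for the lowest cross-validation error, so rather than reading the size off the plot we can take the smallest tree attaining the minimum (a small sketch using the cv.oj object above); this matches the best = 4 used next:

best.size = min(cv.oj$size[cv.oj$dev == min(cv.oj$dev)])
best.size
## [1] 4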
prune.oj = prune.misclass(tree.oj, best=4)
plot(prune.oj)
text(prune.oj, pretty = 0)
summary(tree.oj)
##
## Classification tree:
## tree(formula = Purchase ~ ., data = OJ.train)
## Variables actually used in tree construction:
## [1] "LoyalCH" "STORE" "PriceDiff"
## Number of terminal nodes: 7
## Residual mean deviance: 0.727 = 576.5 / 793
## Misclassification error rate: 0.165 = 132 / 800
summary(prune.oj)
##
## Classification tree:
## snip.tree(tree = tree.oj, nodes = 3:4)
## Variables actually used in tree construction:
## [1] "LoyalCH" "PriceDiff"
## Number of terminal nodes: 4
## Residual mean deviance: 0.8431 = 671.1 / 796
## Misclassification error rate: 0.165 = 132 / 800
prune.pred <- predict(prune.oj, OJ.test, type = "class")
table(prune.pred, OJ.test$Purchase)
##
## prune.pred CH MM
## CH 149 46
## MM 14 61
1-(149+61)/270
## [1] 0.2222222
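An equivalent way to get these error rates, without reading counts off the confusion matrices, is to compare predictions to the truth directly (using the prediction vectors computed above); the unpruned and pruned trees give the same test error here:

mean(tree.pred != OJ.test$Purchase)
## [1] 0.2222222
mean(prune.pred != OJ.test$Purchase)
## [1] 0.2222222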
library(ISLR)
Hitters = Hitters[-which(is.na(Hitters$Salary)), ]
Hitters$Salary = log(Hitters$Salary)
train = 1:200
Hitters.train = Hitters[train, ]
Hitters.test = Hitters[-train, ]
library(gbm)
## Loaded gbm 2.1.5
set.seed(42)
pows = seq(-10, -0.2, by=0.1)
lambdas = 10 ^ pows
train.errors = rep(NA, length(lambdas))
test.errors = rep(NA, length(lambdas))
for(i in 1:length(lambdas)){
  boost.hitters = gbm(Salary~., data=Hitters.train, distribution="gaussian", n.trees=1000, shrinkage=lambdas[i])
  train.pred = predict(boost.hitters, Hitters.train, n.trees=1000)
  test.pred = predict(boost.hitters, Hitters.test, n.trees=1000)
  train.errors[i] = mean((Hitters.train$Salary - train.pred)^2)
  test.errors[i] = mean((Hitters.test$Salary - test.pred)^2)
}
plot(lambdas, train.errors, type="b", xlab="Shrinkage", ylab="Train MSE", col="red", pch=20)
plot(lambdas, test.errors, type="b", xlab="Shrinkage", ylab="Test MSE", col="red", pch=20)
min(test.errors)
## [1] 0.2583995
lambdas[which.min(test.errors)]
## [1] 0.06309573
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 3.0-2
set.seed(42)
lm.fit = lm(Salary~., data=Hitters.train)
lm.pred = predict(lm.fit, Hitters.test)
mean((Hitters.test$Salary - lm.pred)^2)
## [1] 0.4917959
x = model.matrix(Salary~., data=Hitters.train)
y = Hitters.train$Salary
x.test = model.matrix(Salary~., data=Hitters.test)
lasso.fit = glmnet(x, y, alpha=1)
lasso.pred = predict(lasso.fit, s=0.01, newx=x.test)
mean((Hitters.test$Salary - lasso.pred)^2)
## [1] 0.4700537
boost.best = gbm(Salary~., data=Hitters.train, distribution="gaussian", n.trees=1000, shrinkage=lambdas[which.min(test.errors)])
summary(boost.best)
## var rel.inf
## CAtBat CAtBat 14.8197925
## CRBI CRBI 12.1186597
## CWalks CWalks 9.7998044
## CHits CHits 8.6560652
## PutOuts PutOuts 7.2138695
## Walks Walks 6.8368300
## Years Years 6.4412453
## CHmRun CHmRun 5.4742375
## Assists Assists 4.3071056
## Hits Hits 3.7625714
## Runs Runs 3.6334822
## RBI RBI 3.5479456
## HmRun HmRun 3.3512231
## CRuns CRuns 3.2227444
## AtBat AtBat 3.0281593
## Errors Errors 2.3590083
## Division Division 0.6475954
## NewLeague NewLeague 0.6438269
## League League 0.1358334
library(randomForest)
set.seed(42)
rf.hitters = randomForest(Salary~., data=Hitters.train, ntree=500, mtry=19)
rf.pred = predict(rf.hitters, Hitters.test)
mean((Hitters.test$Salary - rf.pred)^2)
## [1] 0.2265061
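Collecting the Hitters test MSEs obtained so far (a quick summary built from the objects already in the workspace), the random forest comes out lowest, followed by boosting, then the lasso and the linear model:

c(boosting = min(test.errors),
  linear = mean((Hitters.test$Salary - lm.pred)^2),
  lasso = mean((Hitters.test$Salary - lasso.pred)^2),
  random.forest = mean((Hitters.test$Salary - rf.pred)^2))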
library(ISLR)
train = 1:1000
Caravan$Purchase = ifelse(Caravan$Purchase == "Yes", 1, 0)
Caravan.train = Caravan[train, ]
Caravan.test = Caravan[-train, ]
library(gbm)
set.seed(42)
boost.caravan = gbm(Purchase~., data=Caravan.train, n.trees = 1000, shrinkage = 0.01, distribution = "bernoulli")
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution, :
## variable 50: PVRAAUT has no variation.
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution, :
## variable 71: AVRAAUT has no variation.
summary(boost.caravan)
## var rel.inf
## PPERSAUT PPERSAUT 15.24304059
## MKOOPKLA MKOOPKLA 10.22049754
## MOPLHOOG MOPLHOOG 7.58473391
## MBERMIDD MBERMIDD 5.98365038
## PBRAND PBRAND 4.55749118
## ABRAND ABRAND 4.07601736
## MINK3045 MINK3045 4.03149141
## MGODGE MGODGE 3.50618597
## MOSTYPE MOSTYPE 2.82332650
## MAUT2 MAUT2 2.61711991
## MSKC MSKC 2.21439111
## MAUT1 MAUT1 2.13764619
## MBERARBG MBERARBG 2.01645301
## MSKA MSKA 1.98039700
## MGODPR MGODPR 1.94608284
## PWAPART PWAPART 1.90065796
## MGODOV MGODOV 1.81046263
## MRELGE MRELGE 1.65327955
## MINK7512 MINK7512 1.54952273
## MBERHOOG MBERHOOG 1.54562792
## MINKGEM MINKGEM 1.51129086
## PBYSTAND PBYSTAND 1.46885445
## MSKB1 MSKB1 1.46349386
## MFWEKIND MFWEKIND 1.24890126
## MINKM30 MINKM30 1.01877067
## APERSAUT APERSAUT 1.00947264
## MSKD MSKD 0.94342296
## MOSHOOFD MOSHOOFD 0.92805596
## MFGEKIND MFGEKIND 0.92012209
## MAUT0 MAUT0 0.91661495
## MGODRK MGODRK 0.90097295
## MOPLMIDD MOPLMIDD 0.80067001
## MRELOV MRELOV 0.79866885
## MHHUUR MHHUUR 0.66251044
## MBERBOER MBERBOER 0.61490907
## MBERARBO MBERARBO 0.59493791
## PMOTSCO PMOTSCO 0.59140712
## MSKB2 MSKB2 0.49738895
## PLEVEN PLEVEN 0.47656908
## MINK4575 MINK4575 0.44932585
## MZPART MZPART 0.43764686
## MHKOOP MHKOOP 0.41454005
## MGEMOMV MGEMOMV 0.41336506
## MBERZELF MBERZELF 0.38071586
## MZFONDS MZFONDS 0.31613260
## MINK123M MINK123M 0.30173680
## MGEMLEEF MGEMLEEF 0.26253060
## MOPLLAAG MOPLLAAG 0.16165917
## MFALLEEN MFALLEEN 0.09723737
## MAANTHUI MAANTHUI 0.00000000
## MRELSA MRELSA 0.00000000
## PWABEDR PWABEDR 0.00000000
## PWALAND PWALAND 0.00000000
## PBESAUT PBESAUT 0.00000000
## PVRAAUT PVRAAUT 0.00000000
## PAANHANG PAANHANG 0.00000000
## PTRACTOR PTRACTOR 0.00000000
## PWERKT PWERKT 0.00000000
## PBROM PBROM 0.00000000
## PPERSONG PPERSONG 0.00000000
## PGEZONG PGEZONG 0.00000000
## PWAOREG PWAOREG 0.00000000
## PZEILPL PZEILPL 0.00000000
## PPLEZIER PPLEZIER 0.00000000
## PFIETS PFIETS 0.00000000
## PINBOED PINBOED 0.00000000
## AWAPART AWAPART 0.00000000
## AWABEDR AWABEDR 0.00000000
## AWALAND AWALAND 0.00000000
## ABESAUT ABESAUT 0.00000000
## AMOTSCO AMOTSCO 0.00000000
## AVRAAUT AVRAAUT 0.00000000
## AAANHANG AAANHANG 0.00000000
## ATRACTOR ATRACTOR 0.00000000
## AWERKT AWERKT 0.00000000
## ABROM ABROM 0.00000000
## ALEVEN ALEVEN 0.00000000
## APERSONG APERSONG 0.00000000
## AGEZONG AGEZONG 0.00000000
## AWAOREG AWAOREG 0.00000000
## AZEILPL AZEILPL 0.00000000
## APLEZIER APLEZIER 0.00000000
## AFIETS AFIETS 0.00000000
## AINBOED AINBOED 0.00000000
## ABYSTAND ABYSTAND 0.00000000
boost.prob = predict(boost.caravan, Caravan.test, n.trees = 1000, type = "response")
boost.pred = ifelse(boost.prob > 0.2, 1, 0)
table(Caravan.test$Purchase, boost.pred)
## boost.pred
## 0 1
## 0 4415 118
## 1 257 32
32/(118+32)
## [1] 0.2133333
About 21% of the people that boosting predicts will make a purchase actually do so.
lm.caravan = glm(Purchase~., data=Caravan.train, family=binomial)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
lm.prob = predict(lm.caravan, Caravan.test, type="response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
lm.pred = ifelse(lm.prob > 0.2, 1, 0)
table(Caravan.test$Purchase, lm.pred)
## lm.pred
## 0 1
## 0 4183 350
## 1 231 58
58/(350 + 58)
## [1] 0.1421569
About 14% of the people that logistic regression predicts will make a purchase actually do so, so boosting is the more precise of the two models at this cutoff.
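Both cutoff-0.2 precisions can also be computed directly from the confusion matrices (a sketch reusing the predictions above), avoiding the by-hand arithmetic:

boost.tab = table(Caravan.test$Purchase, boost.pred)
lm.tab = table(Caravan.test$Purchase, lm.pred)
c(boosting = boost.tab["1", "1"] / sum(boost.tab[, "1"]),
  logistic = lm.tab["1", "1"] / sum(lm.tab[, "1"]))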
We will use the candy data set from FiveThirtyEight.
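The candy-data.csv file accompanies FiveThirtyEight's "Ultimate Halloween Candy Power Ranking" article. If a local copy is not at hand, it can presumably be read straight from their data repository on GitHub (the URL below is an assumption about the current layout of the fivethirtyeight/data repository and may change):

# Assumed raw-data URL; the code below uses a local copy of candy-data.csv instead.
# candy_df <- read.csv("https://raw.githubusercontent.com/fivethirtyeight/data/master/candy-power-ranking/candy-data.csv")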
library(gbm)
library(randomForest)
candy_df <- read.csv(file="candy-data.csv", header=TRUE, sep=",")
set.seed(1)
train = sample(nrow(candy_df), 2/3 * nrow(candy_df))
candy.train = candy_df[train, -1]
candy.test = candy_df[-train, -1]
pows = seq(-10, -0.2, by=0.1)
lambdas = 10 ^ pows
train.errors = rep(NA, length(lambdas))
test.errors = rep(NA, length(lambdas))
for(i in 1:length(lambdas)){
  # Note: 5000 trees are fit, but the predictions below use only the first 1000 boosting iterations.
  boost.candy = gbm(winpercent~., data=candy.train, distribution="gaussian", n.trees=5000, shrinkage=lambdas[i])
  train.pred = predict(boost.candy, candy.train, n.trees=1000)
  test.pred = predict(boost.candy, candy.test, n.trees=1000)
  train.errors[i] = mean((candy.train$winpercent - train.pred)^2)
  test.errors[i] = mean((candy.test$winpercent - test.pred)^2)
}
plot(lambdas, test.errors, type="b", xlab="Shrinkage", ylab="Test MSE", col="red", pch=20, ylim = c(0, 190))
min(test.errors)
## [1] 86.46589
bag.candy = randomForest(winpercent~., data=candy.train, mtry=11)
bag.pred = predict(bag.candy, candy.test)
mean((candy.test$winpercent - bag.pred)^2)
## [1] 106.7529
oob.err=double(11)
test.err=double(11)
for(mtry in 1:11){
  rf.candy = randomForest(winpercent ~ ., data = candy.train, mtry = mtry, ntree = 500, importance = TRUE)
  oob.err[mtry] = rf.candy$mse[500]
  rf.pred = predict(rf.candy, candy.test)
  test.err[mtry] = mean((candy.test$winpercent - rf.pred)^2)
  cat(mtry, " ")
}
## 1 2 3 4 5 6 7 8 9 10 11
matplot(1:mtry, cbind(oob.err, test.err), pch=19, col=c("blue", "red"), type="b", xlab="mtry", ylab="Mean Squared Error")
legend("topright", legend=c("OOB", "Test"), pch=19, col=c("blue", "red"))
importance(rf.candy)
## %IncMSE IncNodePurity
## chocolate 22.87750164 4129.02298
## fruity 3.11873475 222.88931
## caramel 5.22605566 364.68501
## peanutyalmondy 5.15124872 524.00309
## nougat -4.45493734 69.11863
## crispedricewafer 0.07805355 214.07156
## hard -1.75614048 59.66623
## bar 6.55586400 299.69649
## pluribus -0.42883209 244.70892
## sugarpercent -5.25807255 2089.29218
## pricepercent 9.71411694 2545.07728
test.err[which.min(test.err)]
## [1] 104.1517
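The mtry value attaining this minimum can be pulled out the same way (using the test.err vector filled in above):

which.min(test.err)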
library(glmnet)
set.seed(1)
lm.fit = lm(winpercent~. -sugarpercent -pricepercent -pluribus, data=candy.train)
lm.pred = predict(lm.fit, candy.test)
summary(lm.fit)
##
## Call:
## lm(formula = winpercent ~ . - sugarpercent - pricepercent - pluribus,
## data = candy.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.257 -4.572 1.335 6.188 22.197
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 33.507 4.880 6.867 1.3e-08 ***
## chocolate 22.472 5.446 4.126 0.000149 ***
## fruity 11.334 5.201 2.179 0.034377 *
## caramel 7.360 4.262 1.727 0.090776 .
## peanutyalmondy 11.397 4.057 2.809 0.007219 **
## nougat 2.044 5.817 0.351 0.726817
## crispedricewafer 6.055 5.715 1.060 0.294781
## hard -1.667 4.553 -0.366 0.716004
## bar -3.851 4.803 -0.802 0.426778
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.73 on 47 degrees of freedom
## Multiple R-squared: 0.5467, Adjusted R-squared: 0.4696
## F-statistic: 7.087 on 8 and 47 DF, p-value: 4.197e-06
mean((candy.test$winpercent - lm.pred)^2)
## [1] 133.2194
x = model.matrix(winpercent~., data=candy.train)
y = candy.train$winpercent
x.test = model.matrix(winpercent~., data=candy.test)
lasso.fit = glmnet(x, y, alpha=1)
lasso.pred = predict(lasso.fit, s=0.01, newx=x.test)
mean((candy.test$winpercent - lasso.pred)^2)
## [1] 123.7198
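Finally, a side-by-side look at the candy test MSEs computed above (a quick summary using objects already in the workspace; the exact random forest and bagging numbers depend on the seed). Boosting gives the lowest test MSE on this split:

c(boosting = min(test.errors),
  bagging = mean((candy.test$winpercent - bag.pred)^2),
  best.rf = min(test.err),
  linear = mean((candy.test$winpercent - lm.pred)^2),
  lasso = mean((candy.test$winpercent - lasso.pred)^2))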