library(tree)
library(ISLR)
Attaching package: 'ISLR'
The following object is masked _by_ '.GlobalEnv':
    Carseats
attach(Carseats)
The following object is masked _by_ .GlobalEnv:
    High
High=ifelse(Sales <=8,"No","Yes")
Fitting Classification Trees
We build a classification tree to predict High using every predictor except Sales (High ~ . - Sales); Sales is dropped because High was derived directly from it, so keeping it would make the classification trivial.
Carseats = data.frame(Carseats, High)
tree.carseats = tree(High ~ . - Sales, Carseats)
summary(tree.carseats)
Classification tree:
tree(formula = High ~ . - Sales, data = Carseats)
Variables actually used in tree construction:
[1] "High.1"
Number of terminal nodes: 2
Residual mean deviance: 0 = 0 / 398
Misclassification error rate: 0 = 0 / 400
plot(tree.carseats)
text(tree.carseats, pretty = 0)
The next instruction prints the tree node by node, showing each split, the number of observations, the deviance, the predicted class, and the class probabilities.
tree.carseats
node), split, n, deviance, yval, (yprob)
* denotes terminal node
1) root 400 541.5 No ( 0.59 0.41 )
2) High.1: No 236 0.0 No ( 1.00 0.00 ) *
3) High.1: Yes 164 0.0 Yes ( 0.00 1.00 ) *
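Note that the tree splits only on High.1 and achieves zero error: because a High object already existed in the workspace (see the masking messages above), data.frame(Carseats, High) added a duplicate copy of the response, so this fit is degenerate. A minimal sketch of how one might rebuild the data cleanly in a fresh session, so the tree has to use the real predictors:
library(ISLR)
data(Carseats)                                   # fresh copy of the data, with no High column yet
Carseats$High = factor(ifelse(Carseats$Sales <= 8, "No", "Yes"))
tree.carseats = tree(High ~ . - Sales, data = Carseats)
summary(tree.carseats)                           # should now split on predictors such as ShelveLoc and Price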
We now create training and test sets so we can check the prediction accuracy on held-out data.
set.seed(2)
train = sample(1:nrow(Carseats), 200)
Carseats.test = Carseats[-train, ]
High.test = High[-train]
tree.carseats = tree(High ~ . - Sales, Carseats, subset = train)
tree.pred = predict(tree.carseats, Carseats.test, type = "class")
table(tree.pred, High.test)
High.test
tree.pred No Yes
No 116 0
Yes 0 84
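The overall test accuracy can also be computed directly from the predictions:
mean(tree.pred == High.test)     # fraction correctly classified; here (116 + 84) / 200 = 1 because of the duplicated response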
Next we consider pruning the tree. Running cv.tree() performs cross-validation to determine the optimal level of tree complexity; the argument FUN = prune.misclass indicates that the classification error rate should guide the cross-validation and pruning.
set.seed(3)
cv.carseats = cv.tree(tree.carseats, FUN = prune.misclass)
names(cv.carseats)
[1] "size" "dev" "k" "method"
cv.carseats
$size
[1] 2 1
$dev
[1] 0 80
$k
[1] -Inf 80
$method
[1] "misclass"
attr(,"class")
[1] "prune" "tree.sequence"
par(mfrow=c(1,2))
plot(cv.carseats$size, cv.carseats$dev, type = "b")
plot(cv.carseats$k, cv.carseats$dev, type = "b")
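Instead of reading the size off the plots, the size with the lowest cross-validated error can be extracted programmatically (a small sketch using the cv.carseats object above; with the degenerate tree here it returns 2):
best.size = cv.carseats$size[which.min(cv.carseats$dev)]   # size with the smallest $dev
best.size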
prune.carseats = prune.misclass(tree.carseats, best = 9)
best is bigger than tree size
plot(prune.carseats)
text(prune.carseats, pretty = 0)
To check the pruned tree, we again predict on the Carseats test set.
tree.pred = predict(prune.carseats, Carseats.test, type = "class")
table(tree.pred, High.test)
High.test
tree.pred No Yes
No 116 0
Yes 0 84
(94+60) /200
[1] 0.77
prune.carseats = prune.misclass(tree.carseats, best = 15)
best is bigger than tree size
plot(prune.carseats)
text(prune.carseats, pretty = 0)
tree.pred = predict(prune.carseats, Carseats.test, type = "class")
table(tree.pred, High.test)
High.test
tree.pred No Yes
No 116 0
Yes 0 84
(86+62) /200
[1] 0.74
As we can see, this second pruning is not as good as the previous one: the accuracy drops to 74%.
We now turn to the Boston data set. In this code segment we build the training set, using half of the observations.
library(MASS)
set.seed(1)
train = sample(1:nrow(Boston), nrow(Boston)/2)
tree.boston = tree(medv ~ ., Boston, subset = train)
summary(tree.boston)
Regression tree:
tree(formula = medv ~ ., data = Boston, subset = train)
Variables actually used in tree construction:
[1] "lstat" "rm" "dis"
Number of terminal nodes: 8
Residual mean deviance: 12.65 = 3099 / 245
Distribution of residuals:
Min. 1st Qu. Median Mean 3rd Qu. Max.
-14.10000 -2.04200 -0.05357 0.00000 1.96000 12.60000
The fitted regression tree has four levels and 8 terminal nodes; only lstat, rm and dis are used in its construction.
plot(tree.boston)
text(tree.boston, pretty = 0)
We run cross-validation on the regression tree and then prune it back to 5 terminal nodes.
cv.boston=cv.tree(tree.boston)
cv.boston
$size
[1] 8 7 6 5 4 3 2 1
$dev
[1] 5226.322 5228.360 6462.626 6692.615 6397.438 7529.846 11958.691 21118.139
$k
[1] -Inf 255.6581 451.9272 768.5087 818.8885 1559.1264 4276.5803 9665.3582
$method
[1] "deviance"
attr(,"class")
[1] "prune" "tree.sequence"
plot(cv.boston$size, cv.boston$dev)
prune.boston = prune.tree(tree.boston, best = 5)
plot(prune.boston)
text(prune.boston, pretty = 0)
We evaluate the tree's predictions on the test set, the half of the Boston data held out from training.
yhat = predict(tree.boston, newdata = Boston[-train, ])
boston.test = Boston[-train, "medv"]
plot(yhat, boston.test)
abline(0, 1)
mean((yhat - boston.test)^2)
[1] 25.04559
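Taking the square root of the test MSE puts the error back on the scale of medv (thousands of dollars):
sqrt(mean((yhat - boston.test)^2))   # about 5.0, i.e. predictions within roughly $5,000 of the true median value on average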
# Bagging and Random Forests
library(randomForest)
randomForest 4.6-14
Type rfNews() to see new features/changes/bug fixes.
set.seed(1)
bag.boston = randomForest(medv ~ ., data = Boston, subset = train,
                          mtry = 13, importance = TRUE)
bag.boston
Call:
randomForest(formula = medv ~ ., data = Boston, mtry = 13, importance = TRUE, subset = train)
Type of random forest: regression
Number of trees: 500
No. of variables tried at each split: 13
Mean of squared residuals: 11.15723
% Var explained: 86.49
As we can see, all 13 predictors are considered at each split of each tree, so this random forest is in fact bagging. The next plot shows the predicted values against the test values.
yhat.bag = predict(bag.boston, newdata = Boston[-train, ])
plot(yhat.bag, boston.test)
abline(0, 1)
mean((yhat.bag - boston.test)^2)
[1] 13.50808
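Since mtry was set to the full number of predictors, every variable is a split candidate at every node, which is exactly bagging; a quick check of that count:
p = ncol(Boston) - 1     # 13 predictors (medv is the response)
p                        # mtry = p reproduces bagging; a smaller mtry gives a true random forest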
The ntree argument lets us control how many trees randomForest grows; with only 25 trees the test MSE rises slightly, to about 13.9.
bag.boston = randomForest(medv ~ ., data = Boston, subset = train,
                          mtry = 13, ntree = 25)
yhat.bag = predict(bag.boston, newdata = Boston[-train, ])
mean((yhat.bag - boston.test)^2)
[1] 13.94835
Next we grow a random forest with mtry = 6, leaving ntree at its default, and ask for variable importance.
set.seed(1)
rf.boston = randomForest(medv ~ ., data = Boston, subset = train,
                         mtry = 6, importance = TRUE)
yhat.rf = predict(rf.boston, newdata = Boston[-train, ])
mean((yhat.rf - boston.test)^2)
[1] 11.66454
importance(rf.boston)
%IncMSE IncNodePurity
crim 12.132320 986.50338
zn 1.955579 57.96945
indus 9.069302 882.78261
chas 2.210835 45.22941
nox 11.104823 1044.33776
rm 31.784033 6359.31971
age 10.962684 516.82969
dis 15.015236 1224.11605
rad 4.118011 95.94586
tax 8.587932 502.96719
ptratio 12.503896 830.77523
black 6.702609 341.30361
lstat 30.695224 7505.73936
As we can see, importance() reports two measures for each variable: %IncMSE, the mean increase in prediction error when that variable is permuted in the out-of-bag samples, and IncNodePurity, the total decrease in node impurity from splits on that variable.
varImpPlot(rf.boston)
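To see the same ranking numerically, the importance matrix can be sorted by the permutation measure (a minimal sketch):
imp = importance(rf.boston)
imp[order(imp[, "%IncMSE"], decreasing = TRUE), ]   # rm and lstat come out on top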
We now apply boosting to the data, using a Gaussian distribution since this is a regression problem.
library(gbm)
Loaded gbm 2.1.5
set.seed(1)
boost.boston = gbm(medv ~ ., data = Boston[train, ], distribution = "gaussian",
                   n.trees = 5000, interaction.depth = 4)
summary(boost.boston)
par(mfrow=c(1,2))
plot(boost.boston, i = "rm")
plot(boost.boston, i = "lstat")
yhat.boost = predict(boost.boston, newdata = Boston[-train, ],
                     n.trees = 5000)
mean((yhat.boost - boston.test)^2)
[1] 10.81479
boost.boston = gbm(medv ~ ., data = Boston[train, ], distribution = "gaussian",
                   n.trees = 5000, interaction.depth = 4, shrinkage = 0.2, verbose = FALSE)
yhat.boost = predict(boost.boston, newdata = Boston[-train, ],
                     n.trees = 5000)
mean((yhat.boost - boston.test)^2)
[1] 11.51109
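To explore the shrinkage parameter more systematically, one could refit the boosted model over a small grid of values and compare test MSEs (a sketch; exact numbers will vary with the random seed):
for (lam in c(0.001, 0.01, 0.1, 0.2)) {
  fit = gbm(medv ~ ., data = Boston[train, ], distribution = "gaussian",
            n.trees = 5000, interaction.depth = 4, shrinkage = lam, verbose = FALSE)
  pred = predict(fit, newdata = Boston[-train, ], n.trees = 5000)
  cat("shrinkage =", lam, " test MSE =", round(mean((pred - boston.test)^2), 2), "\n")
}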