Fitting Classification Trees

library(tree)
library(ISLR)
attach(Carseats)
High=ifelse(Sales<=8,"No","Yes")
Carseats=data.frame(Carseats,High)
tree.carseats=tree(High~.-Sales,Carseats)
summary(tree.carseats)

Classification tree:
tree(formula = High ~ . - Sales, data = Carseats)
Variables actually used in tree construction:
[1] "ShelveLoc"   "Price"       "Income"      "CompPrice"   "Population"  "Advertising" "Age"         "US"         
Number of terminal nodes:  27 
Residual mean deviance:  0.4575 = 170.7 / 373 
Misclassification error rate: 0.09 = 36 / 400 
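
The residual mean deviance reported here is the total deviance, 170.7, divided by n - |T0| = 400 - 27 = 373; a small deviance indicates a tree that fits the training data well.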
plot(tree.carseats)
text(tree.carseats,pretty=0)

tree.carseats
node), split, n, deviance, yval, (yprob)
      * denotes terminal node

  1) root 400 541.500  No ( 0.59000 0.41000 )  
    2) ShelveLoc: Bad,Medium 315 390.600  No ( 0.68889 0.31111 )  
      4) Price < 92.5 46  56.530  Yes  ( 0.30435 0.69565 )  
        8) Income < 57 10  12.220  No ( 0.70000 0.30000 )  
         16) CompPrice < 110.5 5   0.000  No ( 1.00000 0.00000 ) *
         17) CompPrice > 110.5 5   6.730  Yes  ( 0.40000 0.60000 ) *
        9) Income > 57 36  35.470  Yes  ( 0.19444 0.80556 )  
         18) Population < 207.5 16  21.170  Yes  ( 0.37500 0.62500 ) *
         19) Population > 207.5 20   7.941  Yes  ( 0.05000 0.95000 ) *
      5) Price > 92.5 269 299.800  No ( 0.75465 0.24535 )  
       10) Advertising < 13.5 224 213.200  No ( 0.81696 0.18304 )  
         20) CompPrice < 124.5 96  44.890  No ( 0.93750 0.06250 )  
           40) Price < 106.5 38  33.150  No ( 0.84211 0.15789 )  
             80) Population < 177 12  16.300  No ( 0.58333 0.41667 )  
              160) Income < 60.5 6   0.000  No ( 1.00000 0.00000 ) *
              161) Income > 60.5 6   5.407  Yes  ( 0.16667 0.83333 ) *
             81) Population > 177 26   8.477  No ( 0.96154 0.03846 ) *
           41) Price > 106.5 58   0.000  No ( 1.00000 0.00000 ) *
         21) CompPrice > 124.5 128 150.200  No ( 0.72656 0.27344 )  
           42) Price < 122.5 51  70.680  Yes  ( 0.49020 0.50980 )  
             84) ShelveLoc: Bad 11   6.702  No ( 0.90909 0.09091 ) *
             85) ShelveLoc: Medium 40  52.930  Yes  ( 0.37500 0.62500 )  
              170) Price < 109.5 16   7.481  Yes  ( 0.06250 0.93750 ) *
              171) Price > 109.5 24  32.600  No ( 0.58333 0.41667 )  
                342) Age < 49.5 13  16.050  Yes  ( 0.30769 0.69231 ) *
                343) Age > 49.5 11   6.702  No ( 0.90909 0.09091 ) *
           43) Price > 122.5 77  55.540  No ( 0.88312 0.11688 )  
             86) CompPrice < 147.5 58  17.400  No ( 0.96552 0.03448 ) *
             87) CompPrice > 147.5 19  25.010  No ( 0.63158 0.36842 )  
              174) Price < 147 12  16.300  Yes  ( 0.41667 0.58333 )  
                348) CompPrice < 152.5 7   5.742  Yes  ( 0.14286 0.85714 ) *
                349) CompPrice > 152.5 5   5.004  No ( 0.80000 0.20000 ) *
              175) Price > 147 7   0.000  No ( 1.00000 0.00000 ) *
       11) Advertising > 13.5 45  61.830  Yes  ( 0.44444 0.55556 )  
         22) Age < 54.5 25  25.020  Yes  ( 0.20000 0.80000 )  
           44) CompPrice < 130.5 14  18.250  Yes  ( 0.35714 0.64286 )  
             88) Income < 100 9  12.370  No ( 0.55556 0.44444 ) *
             89) Income > 100 5   0.000  Yes  ( 0.00000 1.00000 ) *
           45) CompPrice > 130.5 11   0.000  Yes  ( 0.00000 1.00000 ) *
         23) Age > 54.5 20  22.490  No ( 0.75000 0.25000 )  
           46) CompPrice < 122.5 10   0.000  No ( 1.00000 0.00000 ) *
           47) CompPrice > 122.5 10  13.860  No ( 0.50000 0.50000 )  
             94) Price < 125 5   0.000  Yes  ( 0.00000 1.00000 ) *
             95) Price > 125 5   0.000  No ( 1.00000 0.00000 ) *
    3) ShelveLoc: Good 85  90.330  Yes  ( 0.22353 0.77647 )  
      6) Price < 135 68  49.260  Yes  ( 0.11765 0.88235 )  
       12) US: No 17  22.070  Yes  ( 0.35294 0.64706 )  
         24) Price < 109 8   0.000  Yes  ( 0.00000 1.00000 ) *
         25) Price > 109 9  11.460  No ( 0.66667 0.33333 ) *
       13) US: Yes 51  16.880  Yes  ( 0.03922 0.96078 ) *
      7) Price > 135 17  22.070  No ( 0.64706 0.35294 )  
       14) Income < 46 6   0.000  No ( 1.00000 0.00000 ) *
       15) Income > 46 11  15.160  Yes  ( 0.45455 0.54545 ) *
set.seed(2)
train=sample(1:nrow(Carseats),200)
Carseats.test=Carseats[-train,]
High.test=High[-train]
tree.carseats=tree(High~.-Sales,Carseats,subset=train)
tree.pred=predict(tree.carseats,Carseats.test,type="class")
table(tree.pred,High.test)
         High.test
tree.pred  No  Yes 
     No    86    27
     Yes   30    57
(86+57)/200
[1] 0.715
set.seed(3)
cv.carseats=cv.tree(tree.carseats,FUN=prune.misclass)
names(cv.carseats)
[1] "size"   "dev"    "k"      "method"
cv.carseats
$size
[1] 19 17 14 13  9  7  3  2  1

$dev
[1] 55 55 53 52 50 56 69 65 80

$k
[1]       -Inf  0.0000000  0.6666667  1.0000000  1.7500000  2.0000000  4.2500000  5.0000000 23.0000000

$method
[1] "misclass"

attr(,"class")
[1] "prune"         "tree.sequence"
par(mfrow=c(1,2))
plot(cv.carseats$size,cv.carseats$dev,type="b")
plot(cv.carseats$k,cv.carseats$dev,type="b")
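
Despite its name, dev here is the cross-validation error count, not the deviance: the tree with 9 terminal nodes attains the minimum (50 errors), and k is the cost-complexity parameter corresponding to each size. That is why the pruning step below asks for best=9.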

prune.carseats=prune.misclass(tree.carseats,best=9)
plot(prune.carseats)
text(prune.carseats,pretty=0)

tree.pred=predict(prune.carseats,Carseats.test,type="class")
table(tree.pred,High.test)
         High.test
tree.pred  No  Yes 
     No    94    24
     Yes   22    60
(94+60)/200
[1] 0.77
prune.carseats=prune.misclass(tree.carseats,best=15)
plot(prune.carseats)
text(prune.carseats,pretty=0)

tree.pred=predict(prune.carseats,Carseats.test,type="class")
table(tree.pred,High.test)
         High.test
tree.pred  No  Yes 
     No    86    22
     Yes   30    62
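
Following the pattern used for the earlier trees, the test accuracy implied by this table:
(86+62)/200
[1] 0.74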

Fitting Regression Trees

library(MASS)
set.seed(1)
train=sample(1:nrow(Boston),nrow(Boston)/2)
tree.boston=tree(medv~.,Boston,subset=train)
summary(tree.boston)

Regression tree:
tree(formula = medv ~ ., data = Boston, subset = train)
Variables actually used in tree construction:
[1] "lstat" "rm"    "dis"  
Number of terminal nodes:  8 
Residual mean deviance:  12.65 = 3099 / 245 
Distribution of residuals:
     Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
-14.10000  -2.04200  -0.05357   0.00000   1.96000  12.60000 
plot(tree.boston)
text(tree.boston,pretty=0)

cv.boston=cv.tree(tree.boston)
plot(cv.boston$size,cv.boston$dev,type='b')

prune.boston=prune.tree(tree.boston,best=5)
plot(prune.boston)
text(prune.boston,pretty=0)

yhat=predict(tree.boston,newdata=Boston[-train,])
boston.test=Boston[-train,"medv"]
plot(yhat,boston.test)
abline(0,1)

mean((yhat-boston.test)^2)
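
In the ISLR lab this test MSE comes out to roughly 25; taking its square root expresses the typical prediction error in the units of medv (thousands of dollars), i.e. predictions fall within about $5,000 of the true median home value:

sqrt(mean((yhat-boston.test)^2))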

Bagging and Random Forests

library(randomForest)
set.seed(1)
bag.boston=randomForest(medv~.,data=Boston,subset=train,mtry=13,importance=TRUE)
bag.boston

Call:
 randomForest(formula = medv ~ ., data = Boston, mtry = 13, importance = TRUE,      subset = train) 
               Type of random forest: regression
                     Number of trees: 500
No. of variables tried at each split: 13

          Mean of squared residuals: 11.02509
                    % Var explained: 86.65
yhat.bag=predict(bag.boston,newdata=Boston[-train,])
plot(yhat.bag,boston.test)
abline(0,1)
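
The plot compares the bagged predictions with the held-out responses; the corresponding test MSE, mirroring the single-tree step above, is one line (reusing boston.test from the previous section):

mean((yhat.bag-boston.test)^2)  # test MSE for bagging; mtry=13 uses all 13 predictors at every split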

bag.boston=randomForest(medv~.,data=Boston,subset=train,mtry=13,ntree=25)
yhat.bag=predict(bag.boston,newdata=Boston[-train,])
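
The same check applies to the smaller 25-tree ensemble (a sketch reusing the objects just created):

mean((yhat.bag-boston.test)^2)  # test MSE with only 25 bagged trees
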
set.seed(1)
rf.boston=randomForest(medv~.,data=Boston,subset=train,mtry=6,importance=TRUE)
yhat.rf=predict(rf.boston,newdata=Boston[-train,])
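
With mtry=6, only six of the 13 predictors are considered at each split, so this is a random forest rather than bagging; its test error is computed the same way:

mean((yhat.rf-boston.test)^2)  # test MSE for the random forest
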
importance(rf.boston)
          %IncMSE IncNodePurity
crim    12.547772    1094.65382
zn       1.375489      64.40060
indus    9.304258    1086.09103
chas     2.518766      76.36804
nox     12.835614    1008.73703
rm      31.646147    6705.02638
age      9.970243     575.13702
dis     12.774430    1351.01978
rad      3.911852      93.78200
tax      7.624043     453.19472
ptratio 12.008194     919.06760
black    7.376024     358.96935
lstat   27.666896    6927.98475
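
Two importance measures are reported: %IncMSE is the mean increase in out-of-bag MSE when that variable's values are permuted, and IncNodePurity is the total decrease in node impurity (for regression trees, the residual sum of squares) from splits on the variable, averaged over all trees. By both measures, rm and lstat dominate.
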
varImpPlot(rf.boston)

Boosting

library(gbm)
set.seed(1)
boost.boston=gbm(medv~.,data=Boston[train,],distribution="gaussian",
n.trees=5000,interaction.depth=4)
summary(boost.boston)

par(mfrow=c(1,2))
plot(boost.boston,i="rm")
plot(boost.boston,i="lstat")
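
These are partial dependence plots: they show the marginal effect of rm and lstat on medv after integrating out the other variables. Median house value rises with rm and falls with lstat.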

yhat.boost=predict(boost.boston,newdata=Boston[-train,],n.trees=5000)
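
As before, the boosted model's test MSE takes one line (boston.test comes from the regression-tree section):

mean((yhat.boost-boston.test)^2)  # test MSE for boosting with the default shrinkage
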
boost.boston=gbm(medv~.,data=Boston[train,],distribution="gaussian",
n.trees=5000,interaction.depth=4,shrinkage=0.2,verbose=F)
yhat.boost=predict(boost.boston,newdata=Boston[-train,],n.trees=5000)
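
The refit above increases the shrinkage (learning-rate) parameter to 0.2; comparing its test MSE with the previous fit shows whether the larger step size helps here:

mean((yhat.boost-boston.test)^2)  # test MSE with shrinkage=0.2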