library(tree)
dp <- read.csv("/Users/hannahpeterson/Documents/R stuff/DeathPenalty.csv")
head(dp)
## Agg VRace Death
## 1 1 1 1
## 2 1 1 1
## 3 1 1 0
## 4 1 1 0
## 5 1 1 0
## 6 1 1 0
dptree <- tree(Death ~ ., data=dp, mincut=1) # mincut=1 allows very small splits; mindev is left at its default
dptree
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 362 49.380 0.1630
## 2) Agg < 3.5 307 13.360 0.0456
## 4) Agg < 2.5 283 5.873 0.0212 *
## 5) Agg > 2.5 24 5.333 0.3333 *
## 3) Agg > 3.5 55 8.182 0.8182
## 6) VRace < 0.5 17 4.118 0.5882
## 12) Agg < 5.5 13 3.231 0.4615 *
## 13) Agg > 5.5 4 0.000 1.0000 *
## 7) VRace > 0.5 38 2.763 0.9211
## 14) Agg < 4.5 12 2.250 0.7500 *
## 15) Agg > 4.5 26 0.000 1.0000 *
plot(dptree, col=8)
text(dptree, digits=2)
For this first tree, I ran all of the predictors without any pruning. Because Death is coded 0/1, the value at each leaf is the proportion of defendants who received the death penalty, so the tree predicts the probability of a death sentence from the aggravation level and the victim's race.
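As a quick check (a minimal sketch; these new cases are made up for illustration), predict() returns the leaf value for new combinations of the predictors:
# hypothetical new cases: aggravation level and victim-race indicator
newcases <- data.frame(Agg = c(1, 4, 6), VRace = c(0, 1, 1))
# for a 0/1 response, predict() returns the leaf mean,
# i.e. the estimated probability of a death sentence
predict(dptree, newdata = newcases)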
Prune Tree
dpcut <- prune.tree(dptree, k=1.7) # k is the cost-complexity penalty, called alpha; k = 0 corresponds to the full (unpruned) tree
plot(dpcut)
text(dpcut, digits=2)
dpcut
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 362 49.380 0.1630
## 2) Agg < 3.5 307 13.360 0.0456
## 4) Agg < 2.5 283 5.873 0.0212 *
## 5) Agg > 2.5 24 5.333 0.3333 *
## 3) Agg > 3.5 55 8.182 0.8182 *
I used 1.7 for the penalty because it best simplified the tree: it does not prune off too many branches, but it also removes the redundant repeated splits on the same variable.
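One way to sanity-check that penalty (a minimal sketch): calling prune.tree() with no k or best argument returns the whole cost-complexity sequence, so you can see which subtree a penalty of 1.7 actually selects.
# with no k/best argument, prune.tree() returns the sequence of subtree
# sizes, deviances, and the k value at which each subtree becomes optimal
dpseq <- prune.tree(dptree)
data.frame(size = dpseq$size, dev = dpseq$dev, k = dpseq$k)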
hp <- read.csv("/Users/hannahpeterson/Documents/R stuff/HousePrices.csv")
hp<- hp[,-1]
head(hp)
## Price SqFt Bedrooms Bathrooms Offers Brick Neighborhood
## 1 114300 1790 2 2 2 No East
## 2 114200 2030 4 2 3 No East
## 3 114800 1740 3 2 1 No East
## 4 94700 1980 3 2 3 No East
## 5 119800 2130 3 3 3 No East
## 6 114600 1780 3 2 2 No North
Here I dropped the first column because it is just the home ID variable and carries no information for predicting price.
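An equivalent way to do this (a minimal sketch; the ID column's name is assumed here to be Home) is to drop the column by name instead of by position:
# hypothetical column name: if the ID column is called "Home",
# this is equivalent to hp[,-1] but does not depend on column order
hp <- hp[, names(hp) != "Home"]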
hptree <- tree(Price ~ ., data=hp, mincut=1) # mincut=1 allows very small splits; mindev is left at its default
hptree
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 128 9.169e+10 130400
## 2) Neighborhood: East,North 89 3.007e+10 117800
## 4) SqFt < 2020 55 1.257e+10 110400
## 8) Brick: No 40 6.656e+09 105800
## 16) Offers < 2.5 17 1.151e+09 114500 *
## 17) Offers > 2.5 23 3.295e+09 99420 *
## 9) Brick: Yes 15 2.857e+09 122500 *
## 5) SqFt > 2020 34 9.617e+09 129800
## 10) Brick: No 23 6.316e+09 123800
## 20) Bathrooms < 2.5 10 1.351e+09 111700 *
## 21) Bathrooms > 2.5 13 2.373e+09 133100 *
## 11) Brick: Yes 11 7.527e+08 142300 *
## 3) Neighborhood: West 39 1.487e+10 159300
## 6) Brick: No 23 4.024e+09 148200
## 12) SqFt < 2010 9 3.002e+08 137000 *
## 13) SqFt > 2010 14 1.844e+09 155500 *
## 7) Brick: Yes 16 3.983e+09 175200
## 14) SqFt < 2285 14 1.837e+09 170900 *
## 15) SqFt > 2285 2 6.844e+07 205400 *
plot(hptree, col=8)
text(hptree, digits=2)
Prune Tree
hpbest <- prune.tree(hptree, best=8) # best=8 returns the subtree with 8 terminal nodes
hpbest
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 128 9.169e+10 130400
## 2) Neighborhood: East,North 89 3.007e+10 117800
## 4) SqFt < 2020 55 1.257e+10 110400
## 8) Brick: No 40 6.656e+09 105800
## 16) Offers < 2.5 17 1.151e+09 114500 *
## 17) Offers > 2.5 23 3.295e+09 99420 *
## 9) Brick: Yes 15 2.857e+09 122500 *
## 5) SqFt > 2020 34 9.617e+09 129800
## 10) Brick: No 23 6.316e+09 123800
## 20) Bathrooms < 2.5 10 1.351e+09 111700 *
## 21) Bathrooms > 2.5 13 2.373e+09 133100 *
## 11) Brick: Yes 11 7.527e+08 142300 *
## 3) Neighborhood: West 39 1.487e+10 159300
## 6) Brick: No 23 4.024e+09 148200 *
## 7) Brick: Yes 16 3.983e+09 175200 *
plot(hpbest)
text(hpbest, digits=2)
For pruning this tree, I used best=8, which keeps the subtree with eight terminal nodes, i.e. eight distinct predicted prices. This simplified the tree without losing the important variables that affect a house's predicted price.
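A cross-validated check on that choice (a minimal sketch; cv.tree() comes with the tree package, and the number of folds here is an assumption):
set.seed(1) # the cross-validation folds are random
# 10-fold cross-validation over the cost-complexity sequence; the plot
# shows whether the CV deviance has levelled off by about 8 leaves
hpcv <- cv.tree(hptree, K = 10)
plot(hpcv$size, hpcv$dev, type = "b",
     xlab = "number of terminal nodes", ylab = "CV deviance")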
download.file("http://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data","spambase.data")
spam <- read.csv("./spambase.data") # the file has no header row, so read.csv() uses the first record as column names (hence names like X0, X0.64)
head(spam)
## X0 X0.64 X0.64.1 X0.1 X0.32 X0.2 X0.3 X0.4 X0.5 X0.6 X0.7 X0.64.2 X0.8
## 1 0.21 0.28 0.50 0 0.14 0.28 0.21 0.07 0.00 0.94 0.21 0.79 0.65
## 2 0.06 0.00 0.71 0 1.23 0.19 0.19 0.12 0.64 0.25 0.38 0.45 0.12
## 3 0.00 0.00 0.00 0 0.63 0.00 0.31 0.63 0.31 0.63 0.31 0.31 0.31
## 4 0.00 0.00 0.00 0 0.63 0.00 0.31 0.63 0.31 0.63 0.31 0.31 0.31
## 5 0.00 0.00 0.00 0 1.85 0.00 0.00 1.85 0.00 0.00 0.00 0.00 0.00
## 6 0.00 0.00 0.00 0 1.92 0.00 0.00 0.00 0.00 0.64 0.96 1.28 0.00
## X0.9 X0.10 X0.32.1 X0.11 X1.29 X1.93 X0.12 X0.96 X0.13 X0.14 X0.15 X0.16
## 1 0.21 0.14 0.14 0.07 0.28 3.47 0.00 1.59 0 0.43 0.43 0
## 2 0.00 1.75 0.06 0.06 1.03 1.36 0.32 0.51 0 1.16 0.06 0
## 3 0.00 0.00 0.31 0.00 0.00 3.18 0.00 0.31 0 0.00 0.00 0
## 4 0.00 0.00 0.31 0.00 0.00 3.18 0.00 0.31 0 0.00 0.00 0
## 5 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0 0.00 0.00 0
## 6 0.00 0.00 0.96 0.00 0.32 3.85 0.00 0.64 0 0.00 0.00 0
## X0.17 X0.18 X0.19 X0.20 X0.21 X0.22 X0.23 X0.24 X0.25 X0.26 X0.27 X0.28
## 1 0 0 0 0 0 0 0 0 0 0 0 0.07
## 2 0 0 0 0 0 0 0 0 0 0 0 0.00
## 3 0 0 0 0 0 0 0 0 0 0 0 0.00
## 4 0 0 0 0 0 0 0 0 0 0 0 0.00
## 5 0 0 0 0 0 0 0 0 0 0 0 0.00
## 6 0 0 0 0 0 0 0 0 0 0 0 0.00
## X0.29 X0.30 X0.31 X0.33 X0.34 X0.35 X0.36 X0.37 X0.38 X0.39 X0.40 X0.41
## 1 0 0 0.00 0 0 0.00 0 0.00 0.00 0 0 0.00
## 2 0 0 0.06 0 0 0.12 0 0.06 0.06 0 0 0.01
## 3 0 0 0.00 0 0 0.00 0 0.00 0.00 0 0 0.00
## 4 0 0 0.00 0 0 0.00 0 0.00 0.00 0 0 0.00
## 5 0 0 0.00 0 0 0.00 0 0.00 0.00 0 0 0.00
## 6 0 0 0.00 0 0 0.00 0 0.00 0.00 0 0 0.00
## X0.42 X0.43 X0.778 X0.44 X0.45 X3.756 X61 X278 X1
## 1 0.132 0 0.372 0.180 0.048 5.114 101 1028 1
## 2 0.143 0 0.276 0.184 0.010 9.821 485 2259 1
## 3 0.137 0 0.137 0.000 0.000 3.537 40 191 1
## 4 0.135 0 0.135 0.000 0.000 3.537 40 191 1
## 5 0.223 0 0.000 0.000 0.000 3.000 15 54 1
## 6 0.054 0 0.164 0.054 0.000 1.671 4 112 1
spamtree <- tree(X0 ~ ., data=spam, mincut=1) # mincut=1 allows very small splits; mindev is left at its default
spamtree
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 4600 428.9000 0.10460
## 2) X0.7 < 1.825 4587 389.2000 0.10000
## 4) X0.15 < 0.325 4161 283.2000 0.07565 *
## 5) X0.15 > 0.325 426 79.4500 0.33790
## 10) X1.29 < 6.135 425 61.7500 0.32800
## 20) X0.778 < 0.2775 185 14.9700 0.16900 *
## 21) X0.778 > 0.2775 240 38.5000 0.45050
## 42) X0.15 < 0.745 159 17.4700 0.35440 *
## 43) X0.15 > 0.745 81 16.6700 0.63930 *
## 11) X1.29 > 6.135 1 0.0000 4.54000 *
## 3) X0.7 > 1.825 13 5.7250 1.71800
## 6) X0.96 < 5.03 2 0.0578 0.17000 *
## 7) X0.96 > 5.03 11 0.0000 2.00000 *
plot(spamtree, col=8)
text(spamtree, digits=2)
Prune Tree
spamcut <- prune.tree(spamtree, k=7) # k is the cost-complexity penalty, called alpha; k = 0 corresponds to the full (unpruned) tree
plot(spamcut)
text(spamcut, digits=2)
spamcut
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 4600 428.900 0.10460
## 2) X0.7 < 1.825 4587 389.200 0.10000
## 4) X0.15 < 0.325 4161 283.200 0.07565 *
## 5) X0.15 > 0.325 426 79.450 0.33790
## 10) X1.29 < 6.135 425 61.750 0.32800
## 20) X0.778 < 0.2775 185 14.970 0.16900 *
## 21) X0.778 > 0.2775 240 38.500 0.45050 *
## 11) X1.29 > 6.135 1 0.000 4.54000 *
## 3) X0.7 > 1.825 13 5.725 1.71800 *
For this penalty I had to use k=7. Lower values of k were not enough, because the pruned tree looked nearly identical to the original; it took a larger alpha to actually simplify the tree.
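A quick way to see what k=7 bought (a minimal sketch): summary() on a tree object reports the number of terminal nodes and the residual mean deviance, so the two fits can be compared directly.
# compare size and fit of the full and pruned trees:
# pruning trades a few leaves for a modest increase in deviance
summary(spamtree) # full tree
summary(spamcut)  # pruned at k = 7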