Death Penalty

library(tree)
dp <- read.csv("/Users/hannahpeterson/Documents/R stuff/DeathPenalty.csv")
head(dp)
##   Agg VRace Death
## 1   1     1     1
## 2   1     1     1
## 3   1     1     0
## 4   1     1     0
## 5   1     1     0
## 6   1     1     0
dptree <- tree(Death ~., data=dp, mindev=0.1, mincut=1)
dptree <- tree(Death ~., data=dp, mincut=1) # this second fit (default mindev) overwrites the mindev=0.1 fit above
dptree
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 362 49.380 0.1630  
##    2) Agg < 3.5 307 13.360 0.0456  
##      4) Agg < 2.5 283  5.873 0.0212 *
##      5) Agg > 2.5 24  5.333 0.3333 *
##    3) Agg > 3.5 55  8.182 0.8182  
##      6) VRace < 0.5 17  4.118 0.5882  
##       12) Agg < 5.5 13  3.231 0.4615 *
##       13) Agg > 5.5 4  0.000 1.0000 *
##      7) VRace > 0.5 38  2.763 0.9211  
##       14) Agg < 4.5 12  2.250 0.7500 *
##       15) Agg > 4.5 26  0.000 1.0000 *
plot(dptree, col=8)
text(dptree, digits=2)

For this first tree, I just fit all of the variables without any pruning. The tree predicts whether a person receives the death penalty based on the two predictors in the data, Agg and VRace.
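
To show how the fitted tree would actually be used, here is a minimal sketch that asks it for a prediction on a hypothetical new case (the values Agg = 5 and VRace = 1 and the name newcase are made up for illustration); because Death is coded 0/1, the predicted value can be read as an estimated probability of a death sentence.

newcase <- data.frame(Agg = 5, VRace = 1)   # hypothetical case, not from the data
predict(dptree, newdata = newcase)          # returns the mean of Death in the matching leaf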

Prune Tree

dpcut <- prune.tree(dptree,k=1.7) # k is the cost-complexity penalty, called alpha; the full tree corresponds to alpha = 0
plot(dpcut)
text(dpcut, digits=2)

dpcut
## node), split, n, deviance, yval
##       * denotes terminal node
## 
## 1) root 362 49.380 0.1630  
##   2) Agg < 3.5 307 13.360 0.0456  
##     4) Agg < 2.5 283  5.873 0.0212 *
##     5) Agg > 2.5 24  5.333 0.3333 *
##   3) Agg > 3.5 55  8.182 0.8182 *

I used 1.7 for the penalty because I thought it simplified the tree best: it doesn't prune off too many branches, but it does remove the repeated, fine-grained splits from the full tree.
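
As a sanity check on the choice of k (just a sketch of the approach, not part of the run above), prune.tree() can also be called without k to return the whole cost-complexity sequence, which lists the deviance of each subtree size and the value of k at which it is reached; the name dpseq is mine.

dpseq <- prune.tree(dptree)   # no k: returns subtree sizes, deviances, and k breakpoints
dpseq
plot(dpseq)                   # deviance vs. tree size; look for where it levels off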

House Prices

hp <- read.csv("/Users/hannahpeterson/Documents/R stuff/HousePrices.csv")
hp<- hp[,-1]
head(hp)
##    Price SqFt Bedrooms Bathrooms Offers Brick Neighborhood
## 1 114300 1790        2         2      2    No         East
## 2 114200 2030        4         2      3    No         East
## 3 114800 1740        3         2      1    No         East
## 4  94700 1980        3         2      3    No         East
## 5 119800 2130        3         3      3    No         East
## 6 114600 1780        3         2      2    No        North

Here, I decided to get rid of the first column because it is just the home ID variable, which is not useful for prediction.
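
One assumption worth flagging: since R 4.0, read.csv() no longer turns character columns into factors by default, and tree() expects factors for categorical splits such as Brick and Neighborhood. If the fit below runs without complaint this step is unnecessary, but the conversion sketched here is harmless either way.

hp$Brick <- factor(hp$Brick)               # make sure the categorical predictors are factors
hp$Neighborhood <- factor(hp$Neighborhood)
str(hp)                                    # confirm the column types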

hptree <- tree(Price ~., data=hp, mindev=0.1, mincut=1)
hptree <- tree(Price ~., data=hp, mincut=1) # this second fit (default mindev) overwrites the mindev=0.1 fit above
hptree
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 128 9.169e+10 130400  
##    2) Neighborhood: East,North 89 3.007e+10 117800  
##      4) SqFt < 2020 55 1.257e+10 110400  
##        8) Brick: No 40 6.656e+09 105800  
##         16) Offers < 2.5 17 1.151e+09 114500 *
##         17) Offers > 2.5 23 3.295e+09  99420 *
##        9) Brick: Yes 15 2.857e+09 122500 *
##      5) SqFt > 2020 34 9.617e+09 129800  
##       10) Brick: No 23 6.316e+09 123800  
##         20) Bathrooms < 2.5 10 1.351e+09 111700 *
##         21) Bathrooms > 2.5 13 2.373e+09 133100 *
##       11) Brick: Yes 11 7.527e+08 142300 *
##    3) Neighborhood: West 39 1.487e+10 159300  
##      6) Brick: No 23 4.024e+09 148200  
##       12) SqFt < 2010 9 3.002e+08 137000 *
##       13) SqFt > 2010 14 1.844e+09 155500 *
##      7) Brick: Yes 16 3.983e+09 175200  
##       14) SqFt < 2285 14 1.837e+09 170900 *
##       15) SqFt > 2285 2 6.844e+07 205400 *
plot(hptree, col=8)
text(hptree, digits=2)

Prune Tree

hpbest <- prune.tree(hptree,best=8)
hpbest
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 128 9.169e+10 130400  
##    2) Neighborhood: East,North 89 3.007e+10 117800  
##      4) SqFt < 2020 55 1.257e+10 110400  
##        8) Brick: No 40 6.656e+09 105800  
##         16) Offers < 2.5 17 1.151e+09 114500 *
##         17) Offers > 2.5 23 3.295e+09  99420 *
##        9) Brick: Yes 15 2.857e+09 122500 *
##      5) SqFt > 2020 34 9.617e+09 129800  
##       10) Brick: No 23 6.316e+09 123800  
##         20) Bathrooms < 2.5 10 1.351e+09 111700 *
##         21) Bathrooms > 2.5 13 2.373e+09 133100 *
##       11) Brick: Yes 11 7.527e+08 142300 *
##    3) Neighborhood: West 39 1.487e+10 159300  
##      6) Brick: No 23 4.024e+09 148200 *
##      7) Brick: Yes 16 3.983e+09 175200 *
plot(hpbest)
text(hpbest, digits=2)

For pruning this tree, I used prune.tree() with best=8, which keeps the best subtree with 8 terminal nodes. This simplified the tree without losing the important variables that affect a house's predicted price.
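
To check that 8 leaves is a sensible size rather than a guess, one option (a sketch, not part of the analysis above) is cross-validation with cv.tree() from the same package, which reports a cross-validated deviance for each subtree size; the name hpcv and the seed value are illustrative choices of mine.

set.seed(1)                                   # arbitrary seed for reproducibility
hpcv <- cv.tree(hptree)                       # 10-fold CV over the pruning sequence
plot(hpcv$size, hpcv$dev, type = "b",
     xlab = "number of leaves", ylab = "CV deviance")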

Spam

download.file("http://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data","spambase.data")
spam <- read.csv("./spambase.data") # spambase.data has no header row, so read.csv() uses the first record as column names (hence X0, X0.64, ...)
head(spam)
##     X0 X0.64 X0.64.1 X0.1 X0.32 X0.2 X0.3 X0.4 X0.5 X0.6 X0.7 X0.64.2 X0.8
## 1 0.21  0.28    0.50    0  0.14 0.28 0.21 0.07 0.00 0.94 0.21    0.79 0.65
## 2 0.06  0.00    0.71    0  1.23 0.19 0.19 0.12 0.64 0.25 0.38    0.45 0.12
## 3 0.00  0.00    0.00    0  0.63 0.00 0.31 0.63 0.31 0.63 0.31    0.31 0.31
## 4 0.00  0.00    0.00    0  0.63 0.00 0.31 0.63 0.31 0.63 0.31    0.31 0.31
## 5 0.00  0.00    0.00    0  1.85 0.00 0.00 1.85 0.00 0.00 0.00    0.00 0.00
## 6 0.00  0.00    0.00    0  1.92 0.00 0.00 0.00 0.00 0.64 0.96    1.28 0.00
##   X0.9 X0.10 X0.32.1 X0.11 X1.29 X1.93 X0.12 X0.96 X0.13 X0.14 X0.15 X0.16
## 1 0.21  0.14    0.14  0.07  0.28  3.47  0.00  1.59     0  0.43  0.43     0
## 2 0.00  1.75    0.06  0.06  1.03  1.36  0.32  0.51     0  1.16  0.06     0
## 3 0.00  0.00    0.31  0.00  0.00  3.18  0.00  0.31     0  0.00  0.00     0
## 4 0.00  0.00    0.31  0.00  0.00  3.18  0.00  0.31     0  0.00  0.00     0
## 5 0.00  0.00    0.00  0.00  0.00  0.00  0.00  0.00     0  0.00  0.00     0
## 6 0.00  0.00    0.96  0.00  0.32  3.85  0.00  0.64     0  0.00  0.00     0
##   X0.17 X0.18 X0.19 X0.20 X0.21 X0.22 X0.23 X0.24 X0.25 X0.26 X0.27 X0.28
## 1     0     0     0     0     0     0     0     0     0     0     0  0.07
## 2     0     0     0     0     0     0     0     0     0     0     0  0.00
## 3     0     0     0     0     0     0     0     0     0     0     0  0.00
## 4     0     0     0     0     0     0     0     0     0     0     0  0.00
## 5     0     0     0     0     0     0     0     0     0     0     0  0.00
## 6     0     0     0     0     0     0     0     0     0     0     0  0.00
##   X0.29 X0.30 X0.31 X0.33 X0.34 X0.35 X0.36 X0.37 X0.38 X0.39 X0.40 X0.41
## 1     0     0  0.00     0     0  0.00     0  0.00  0.00     0     0  0.00
## 2     0     0  0.06     0     0  0.12     0  0.06  0.06     0     0  0.01
## 3     0     0  0.00     0     0  0.00     0  0.00  0.00     0     0  0.00
## 4     0     0  0.00     0     0  0.00     0  0.00  0.00     0     0  0.00
## 5     0     0  0.00     0     0  0.00     0  0.00  0.00     0     0  0.00
## 6     0     0  0.00     0     0  0.00     0  0.00  0.00     0     0  0.00
##   X0.42 X0.43 X0.778 X0.44 X0.45 X3.756 X61 X278 X1
## 1 0.132     0  0.372 0.180 0.048  5.114 101 1028  1
## 2 0.143     0  0.276 0.184 0.010  9.821 485 2259  1
## 3 0.137     0  0.137 0.000 0.000  3.537  40  191  1
## 4 0.135     0  0.135 0.000 0.000  3.537  40  191  1
## 5 0.223     0  0.000 0.000 0.000  3.000  15   54  1
## 6 0.054     0  0.164 0.054 0.000  1.671   4  112  1
spamtree <- tree(X0 ~., data=spam, mindev=0.1, mincut=1)
spamtree <- tree(X0 ~., data=spam, mincut=1) # this second fit (default mindev) overwrites the mindev=0.1 fit above
spamtree
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 4600 428.9000 0.10460  
##    2) X0.7 < 1.825 4587 389.2000 0.10000  
##      4) X0.15 < 0.325 4161 283.2000 0.07565 *
##      5) X0.15 > 0.325 426  79.4500 0.33790  
##       10) X1.29 < 6.135 425  61.7500 0.32800  
##         20) X0.778 < 0.2775 185  14.9700 0.16900 *
##         21) X0.778 > 0.2775 240  38.5000 0.45050  
##           42) X0.15 < 0.745 159  17.4700 0.35440 *
##           43) X0.15 > 0.745 81  16.6700 0.63930 *
##       11) X1.29 > 6.135 1   0.0000 4.54000 *
##    3) X0.7 > 1.825 13   5.7250 1.71800  
##      6) X0.96 < 5.03 2   0.0578 0.17000 *
##      7) X0.96 > 5.03 11   0.0000 2.00000 *
plot(spamtree, col=8)
text(spamtree, digits=2)

Prune Tree

spamcut <- prune.tree(spamtree,k=7) # k is the cost-complexity penalty, called alpha; the full tree corresponds to alpha = 0
plot(spamcut)
text(spamcut, digits=2)

spamcut
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 4600 428.900 0.10460  
##    2) X0.7 < 1.825 4587 389.200 0.10000  
##      4) X0.15 < 0.325 4161 283.200 0.07565 *
##      5) X0.15 > 0.325 426  79.450 0.33790  
##       10) X1.29 < 6.135 425  61.750 0.32800  
##         20) X0.778 < 0.2775 185  14.970 0.16900 *
##         21) X0.778 > 0.2775 240  38.500 0.45050 *
##       11) X1.29 > 6.135 1   0.000 4.54000 *
##    3) X0.7 > 1.825 13   5.725 1.71800 *

For this penalty I had to use k=7. Lower values of k were not enough because the pruned tree looked almost identical to the original; the alpha had to be fairly high in order to actually simplify the tree.
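
To make the comparison across penalties concrete, a small sketch like the one below (the candidate k values and the names ks and leaves are mine) counts how many terminal nodes survive each penalty, which shows directly where the pruning starts to bite.

ks <- c(1, 3, 5, 7, 10)                                    # candidate penalties, chosen arbitrarily
leaves <- sapply(ks, function(k)
  sum(prune.tree(spamtree, k = k)$frame$var == "<leaf>"))  # terminal nodes left at each k
data.frame(k = ks, leaves = leaves)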