The trees below estimate the probability that a defendant receives the death penalty.
# Load the death-penalty data set and show its first six rows.
dp <- read.csv(file = "~/Business Analytics/DeathPenalty.csv")
head(dp, n = 6L)
## Agg VRace Death
## 1 1 1 1
## 2 1 1 1
## 3 1 1 0
## 4 1 1 0
## 5 1 1 0
## 6 1 1 0
library(tree)
## Warning: package 'tree' was built under R version 3.4.2
# Number of rows in the data set.
nrow(dp)
## [1] 362
# Fit the tree once. The earlier fit with mindev = 0.1 was overwritten on the
# very next line, so it has been dropped. Death is numeric, so tree() grows a
# regression tree and each node's yval is the node mean of Death (the share
# of 1s when Death is coded 0/1). mincut = 1 allows a child node to hold a
# single observation.
dptree <- tree(Death ~ ., data = dp, mincut = 1)
dptree
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 362 49.380 0.1630
## 2) Agg < 3.5 307 13.360 0.0456
## 4) Agg < 2.5 283 5.873 0.0212 *
## 5) Agg > 2.5 24 5.333 0.3333 *
## 3) Agg > 3.5 55 8.182 0.8182
## 6) VRace < 0.5 17 4.118 0.5882
## 12) Agg < 5.5 13 3.231 0.4615 *
## 13) Agg > 5.5 4 0.000 1.0000 *
## 7) VRace > 0.5 38 2.763 0.9211
## 14) Agg < 4.5 12 2.250 0.7500 *
## 15) Agg > 4.5 26 0.000 1.0000 *
# Draw the fitted tree using colour 8 of the active palette, then label the
# nodes with values shown to two significant digits.
plot(x = dptree, col = 8)
text(x = dptree, digits = 2)
This is our original tree. Next, I am going to prune it to make it simpler.
# Cost-complexity pruning with penalty k = 1.5, followed by a plot and a
# printout of the pruned tree.
dpcut <- prune.tree(tree = dptree, k = 1.5)
plot(x = dpcut)
text(x = dpcut, digits = 2)
print(dpcut)
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 362 49.380 0.1630
## 2) Agg < 3.5 307 13.360 0.0456
## 4) Agg < 2.5 283 5.873 0.0212 *
## 5) Agg > 2.5 24 5.333 0.3333 *
## 3) Agg > 3.5 55 8.182 0.8182 *
# Repeat the pruning with a larger penalty, k = 2. The printout that follows
# shows the same three terminal nodes as the k = 1.5 tree.
dpcut <- prune.tree(tree = dptree, k = 2)
plot(x = dpcut)
text(x = dpcut, digits = 2)
print(dpcut)
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 362 49.380 0.1630
## 2) Agg < 3.5 307 13.360 0.0456
## 4) Agg < 2.5 283 5.873 0.0212 *
## 5) Agg > 2.5 24 5.333 0.3333 *
## 3) Agg > 3.5 55 8.182 0.8182 *
# With neither k nor best supplied, prune.tree() returns the whole pruning
# sequence (tree sizes, deviances and k thresholds) rather than a single tree.
dpcut <- prune.tree(tree = dptree)
print(dpcut)
## $size
## [1] 6 5 4 3 2 1
##
## $dev
## [1] 16.68689 17.20005 18.08693 19.38794 21.54338 49.38398
##
## $k
## [1] -Inf 0.5131579 0.8868778 1.3010132 2.1554387 27.8405962
##
## $method
## [1] "deviance"
##
## attr(,"class")
## [1] "prune" "tree.sequence"
# Plot the pruning sequence to compare deviance across tree sizes.
plot(x = dpcut)
I tried a couple of different values of k for pruning. In conclusion, we are going to keep the best one, the tree with two terminal nodes (best=2, fitted below), as the deviance only decreases gradually for larger trees.
# Keep the subtree from the pruning sequence that has two terminal nodes,
# then print and draw it.
dpbest <- prune.tree(tree = dptree, best = 2)
print(dpbest)
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 362 49.380 0.1630
## 2) Agg < 3.5 307 13.360 0.0456 *
## 3) Agg > 3.5 55 8.182 0.8182 *
plot(x = dpbest)
text(x = dpbest, digits = 2)
With these trees, we will predict which neighborhood a house is in.
# Load the house-price data. stringsAsFactors = TRUE keeps the text columns
# (Brick, Neighborhood) as factors on R >= 4.0 too, where the read.csv default
# became FALSE; the classification tree fitted on Neighborhood later relies
# on a factor response.
hp <- read.csv(
  "~/Business Analytics/HousePrices.csv",
  stringsAsFactors = TRUE
)
# Drop the first column.
# NOTE(review): presumably a row-id column -- confirm against the CSV.
hp <- hp[-1]
library(MASS)
library(tree)
head(hp)
## Price SqFt Bedrooms Bathrooms Offers Brick Neighborhood
## 1 114300 1790 2 2 2 No East
## 2 114200 2030 4 2 3 No East
## 3 114800 1740 3 2 1 No East
## 4 94700 1980 3 2 3 No East
## 5 119800 2130 3 3 3 No East
## 6 114600 1780 3 2 2 No North
# Classification tree for Neighborhood, using every other column of hp as a
# candidate predictor.
hptree <- tree(formula = Neighborhood ~ ., data = hp)
print(hptree)
## node), split, n, deviance, yval, (yprob)
## * denotes terminal node
##
## 1) root 128 280.800 East ( 0.35156 0.34375 0.30469 )
## 2) Price < 128750 68 92.140 North ( 0.41176 0.58824 0.00000 )
## 4) Offers < 3.5 52 72.090 East ( 0.50000 0.50000 0.00000 )
## 8) Price < 93200 5 0.000 North ( 0.00000 1.00000 0.00000 ) *
## 9) Price > 93200 47 64.620 East ( 0.55319 0.44681 0.00000 )
## 18) Price < 118500 35 48.260 North ( 0.45714 0.54286 0.00000 )
## 36) Price < 105850 8 8.997 East ( 0.75000 0.25000 0.00000 ) *
## 37) Price > 105850 27 35.590 North ( 0.37037 0.62963 0.00000 ) *
## 19) Price > 118500 12 10.810 East ( 0.83333 0.16667 0.00000 ) *
## 5) Offers > 3.5 16 12.060 North ( 0.12500 0.87500 0.00000 ) *
## 3) Price > 128750 60 98.140 West ( 0.28333 0.06667 0.65000 )
## 6) Price < 157350 41 77.260 West ( 0.41463 0.09756 0.48780 )
## 12) Brick: No 26 42.680 West ( 0.19231 0.11538 0.69231 )
## 24) Bathrooms < 2.5 12 6.884 West ( 0.00000 0.08333 0.91667 ) *
## 25) Bathrooms > 2.5 14 27.780 West ( 0.35714 0.14286 0.50000 )
## 50) SqFt < 2350 9 17.910 West ( 0.22222 0.22222 0.55556 ) *
## 51) SqFt > 2350 5 6.730 East ( 0.60000 0.00000 0.40000 ) *
## 13) Brick: Yes 15 18.830 East ( 0.80000 0.06667 0.13333 )
## 26) SqFt < 2000 5 10.550 East ( 0.40000 0.20000 0.40000 ) *
## 27) SqFt > 2000 10 0.000 East ( 1.00000 0.00000 0.00000 ) *
## 7) Price > 157350 19 0.000 West ( 0.00000 0.00000 1.00000 ) *
# Draw the unpruned neighbourhood tree and label its nodes.
plot(x = hptree)
text(x = hptree, digits = 2)
This is our original tree. As you can see, it is pretty cluttered, so I will prune it a bit.
# Without k or best, prune.tree() returns the full pruning sequence
# (sizes, deviances and k thresholds) for the neighbourhood tree.
hpcut <- prune.tree(tree = hptree)
print(hpcut)
## $size
## [1] 11 10 9 8 7 6 5 4 3 2 1
##
## $dev
## [1] 109.5356 112.6789 116.3501 121.8967 129.3611 137.3564 145.3700
## [8] 153.6520 169.4032 190.2832 280.7536
##
## $k
## [1] -Inf 3.143293 3.671246 5.546599 7.464395 7.995287 8.013625
## [8] 8.281956 15.751268 20.879921 90.470458
##
## $method
## [1] "deviance"
##
## attr(,"class")
## [1] "prune" "tree.sequence"
# Plot the pruning sequence computed above.
plot(x = hpcut)
# Prune with penalty k = 3, then draw and print the result.
# NOTE(review): k = 3 is below the smallest finite threshold in the sequence's
# $k (3.143293), and the printout that follows still lists all 11 terminal
# nodes, so no branch is actually removed here.
hpcut <- prune.tree(tree = hptree, k = 3)
plot(x = hpcut)
text(x = hpcut, digits = 2)
print(hpcut)
## node), split, n, deviance, yval, (yprob)
## * denotes terminal node
##
## 1) root 128 280.800 East ( 0.35156 0.34375 0.30469 )
## 2) Price < 128750 68 92.140 North ( 0.41176 0.58824 0.00000 )
## 4) Offers < 3.5 52 72.090 East ( 0.50000 0.50000 0.00000 )
## 8) Price < 93200 5 0.000 North ( 0.00000 1.00000 0.00000 ) *
## 9) Price > 93200 47 64.620 East ( 0.55319 0.44681 0.00000 )
## 18) Price < 118500 35 48.260 North ( 0.45714 0.54286 0.00000 )
## 36) Price < 105850 8 8.997 East ( 0.75000 0.25000 0.00000 ) *
## 37) Price > 105850 27 35.590 North ( 0.37037 0.62963 0.00000 ) *
## 19) Price > 118500 12 10.810 East ( 0.83333 0.16667 0.00000 ) *
## 5) Offers > 3.5 16 12.060 North ( 0.12500 0.87500 0.00000 ) *
## 3) Price > 128750 60 98.140 West ( 0.28333 0.06667 0.65000 )
## 6) Price < 157350 41 77.260 West ( 0.41463 0.09756 0.48780 )
## 12) Brick: No 26 42.680 West ( 0.19231 0.11538 0.69231 )
## 24) Bathrooms < 2.5 12 6.884 West ( 0.00000 0.08333 0.91667 ) *
## 25) Bathrooms > 2.5 14 27.780 West ( 0.35714 0.14286 0.50000 )
## 50) SqFt < 2350 9 17.910 West ( 0.22222 0.22222 0.55556 ) *
## 51) SqFt > 2350 5 6.730 East ( 0.60000 0.00000 0.40000 ) *
## 13) Brick: Yes 15 18.830 East ( 0.80000 0.06667 0.13333 )
## 26) SqFt < 2000 5 10.550 East ( 0.40000 0.20000 0.40000 ) *
## 27) SqFt > 2000 10 0.000 East ( 1.00000 0.00000 0.00000 ) *
## 7) Price > 157350 19 0.000 West ( 0.00000 0.00000 1.00000 ) *
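I then chose to stick with k=3 because of the deviance plot. Note, however, that k=3 is below the smallest pruning threshold in the sequence above (3.14), so this tree is identical to the original 11-leaf tree.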
With these trees, we will predict whether an email is spam or not.
# Fetch the UCI spambase data once and read the local copy. The original
# downloaded the file and then fetched the same URL a second time.
# NOTE(review): the column names in the summary (X0, X0.64, ..., X1) look like
# data values, i.e. the first record appears to be consumed as a header row.
# header = FALSE would keep that record but rename the columns to V1..V58,
# which the code further down (X1 ~ .) does not expect -- confirm before
# changing.
download.file(
  "http://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data",
  destfile = "spambase.data"
)
spam <- read.csv("spambase.data")
summary(spam)
## X0 X0.64 X0.64.1 X0.1
## Min. :0.0000 Min. : 0.0000 Min. :0.0000 Min. : 0.00000
## 1st Qu.:0.0000 1st Qu.: 0.0000 1st Qu.:0.0000 1st Qu.: 0.00000
## Median :0.0000 Median : 0.0000 Median :0.0000 Median : 0.00000
## Mean :0.1046 Mean : 0.2129 Mean :0.2806 Mean : 0.06544
## 3rd Qu.:0.0000 3rd Qu.: 0.0000 3rd Qu.:0.4200 3rd Qu.: 0.00000
## Max. :4.5400 Max. :14.2800 Max. :5.1000 Max. :42.81000
## X0.32 X0.2 X0.3 X0.4
## Min. : 0.0000 Min. :0.00000 Min. :0.0000 Min. : 0.0000
## 1st Qu.: 0.0000 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.: 0.0000
## Median : 0.0000 Median :0.00000 Median :0.0000 Median : 0.0000
## Mean : 0.3122 Mean :0.09592 Mean :0.1142 Mean : 0.1053
## 3rd Qu.: 0.3825 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.: 0.0000
## Max. :10.0000 Max. :5.88000 Max. :7.2700 Max. :11.1100
## X0.5 X0.6 X0.7 X0.64.2
## Min. :0.00000 Min. : 0.0000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.: 0.0000 1st Qu.:0.00000 1st Qu.:0.0000
## Median :0.00000 Median : 0.0000 Median :0.00000 Median :0.1000
## Mean :0.09009 Mean : 0.2395 Mean :0.05984 Mean :0.5417
## 3rd Qu.:0.00000 3rd Qu.: 0.1600 3rd Qu.:0.00000 3rd Qu.:0.8000
## Max. :5.26000 Max. :18.1800 Max. :2.61000 Max. :9.6700
## X0.8 X0.9 X0.10 X0.32.1
## Min. :0.00000 Min. : 0.00000 Min. :0.00000 Min. : 0.0000
## 1st Qu.:0.00000 1st Qu.: 0.00000 1st Qu.:0.00000 1st Qu.: 0.0000
## Median :0.00000 Median : 0.00000 Median :0.00000 Median : 0.0000
## Mean :0.09395 Mean : 0.05864 Mean :0.04922 Mean : 0.2488
## 3rd Qu.:0.00000 3rd Qu.: 0.00000 3rd Qu.:0.00000 3rd Qu.: 0.1000
## Max. :5.55000 Max. :10.00000 Max. :4.41000 Max. :20.0000
## X0.11 X1.29 X1.93 X0.12
## Min. :0.0000 Min. :0.0000 Min. : 0.000 Min. : 0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.: 0.000 1st Qu.: 0.0000
## Median :0.0000 Median :0.0000 Median : 1.310 Median : 0.0000
## Mean :0.1426 Mean :0.1845 Mean : 1.662 Mean : 0.0856
## 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.: 2.640 3rd Qu.: 0.0000
## Max. :7.1400 Max. :9.0900 Max. :18.750 Max. :18.1800
## X0.96 X0.13 X0.14 X0.15
## Min. : 0.0000 Min. : 0.0000 Min. :0.0000 Min. : 0.00000
## 1st Qu.: 0.0000 1st Qu.: 0.0000 1st Qu.:0.0000 1st Qu.: 0.00000
## Median : 0.2200 Median : 0.0000 Median :0.0000 Median : 0.00000
## Mean : 0.8097 Mean : 0.1212 Mean :0.1017 Mean : 0.09429
## 3rd Qu.: 1.2700 3rd Qu.: 0.0000 3rd Qu.:0.0000 3rd Qu.: 0.00000
## Max. :11.1100 Max. :17.1000 Max. :5.4500 Max. :12.50000
## X0.16 X0.17 X0.18 X0.19
## Min. : 0.0000 Min. : 0.0000 Min. : 0.0000 Min. :0.0000
## 1st Qu.: 0.0000 1st Qu.: 0.0000 1st Qu.: 0.0000 1st Qu.:0.0000
## Median : 0.0000 Median : 0.0000 Median : 0.0000 Median :0.0000
## Mean : 0.5496 Mean : 0.2654 Mean : 0.7675 Mean :0.1249
## 3rd Qu.: 0.0000 3rd Qu.: 0.0000 3rd Qu.: 0.0000 3rd Qu.:0.0000
## Max. :20.8300 Max. :16.6600 Max. :33.3300 Max. :9.0900
## X0.20 X0.21 X0.22 X0.23
## Min. : 0.00000 Min. :0.0000 Min. : 0.00000 Min. :0.00000
## 1st Qu.: 0.00000 1st Qu.:0.0000 1st Qu.: 0.00000 1st Qu.:0.00000
## Median : 0.00000 Median :0.0000 Median : 0.00000 Median :0.00000
## Mean : 0.09894 Mean :0.1029 Mean : 0.06477 Mean :0.04706
## 3rd Qu.: 0.00000 3rd Qu.:0.0000 3rd Qu.: 0.00000 3rd Qu.:0.00000
## Max. :14.28000 Max. :5.8800 Max. :12.50000 Max. :4.76000
## X0.24 X0.25 X0.26 X0.27
## Min. : 0.00000 Min. :0.00000 Min. : 0.0000 Min. :0.0000
## 1st Qu.: 0.00000 1st Qu.:0.00000 1st Qu.: 0.0000 1st Qu.:0.0000
## Median : 0.00000 Median :0.00000 Median : 0.0000 Median :0.0000
## Mean : 0.09725 Mean :0.04785 Mean : 0.1054 Mean :0.0975
## 3rd Qu.: 0.00000 3rd Qu.:0.00000 3rd Qu.: 0.0000 3rd Qu.:0.0000
## Max. :18.18000 Max. :4.76000 Max. :20.0000 Max. :7.6900
## X0.28 X0.29 X0.30 X0.31
## Min. :0.000 Min. :0.0000 Min. : 0.00000 Min. :0.00000
## 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.: 0.00000 1st Qu.:0.00000
## Median :0.000 Median :0.0000 Median : 0.00000 Median :0.00000
## Mean :0.137 Mean :0.0132 Mean : 0.07865 Mean :0.06485
## 3rd Qu.:0.000 3rd Qu.:0.0000 3rd Qu.: 0.00000 3rd Qu.:0.00000
## Max. :6.890 Max. :8.3300 Max. :11.11000 Max. :4.76000
## X0.33 X0.34 X0.35 X0.36
## Min. :0.00000 Min. : 0.0000 Min. :0.00000 Min. : 0.00000
## 1st Qu.:0.00000 1st Qu.: 0.0000 1st Qu.:0.00000 1st Qu.: 0.00000
## Median :0.00000 Median : 0.0000 Median :0.00000 Median : 0.00000
## Mean :0.04368 Mean : 0.1324 Mean :0.04611 Mean : 0.07921
## 3rd Qu.:0.00000 3rd Qu.: 0.0000 3rd Qu.:0.00000 3rd Qu.: 0.00000
## Max. :7.14000 Max. :14.2800 Max. :3.57000 Max. :20.00000
## X0.37 X0.38 X0.39 X0.40
## Min. : 0.0000 Min. : 0.0000 Min. :0.000000 Min. : 0.00000
## 1st Qu.: 0.0000 1st Qu.: 0.0000 1st Qu.:0.000000 1st Qu.: 0.00000
## Median : 0.0000 Median : 0.0000 Median :0.000000 Median : 0.00000
## Mean : 0.3013 Mean : 0.1799 Mean :0.005446 Mean : 0.03188
## 3rd Qu.: 0.1100 3rd Qu.: 0.0000 3rd Qu.:0.000000 3rd Qu.: 0.00000
## Max. :21.4200 Max. :22.0500 Max. :2.170000 Max. :10.00000
## X0.41 X0.42 X0.43 X0.778
## Min. :0.00000 Min. :0.0000 Min. :0.00000 Min. : 0.0000
## 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.: 0.0000
## Median :0.00000 Median :0.0650 Median :0.00000 Median : 0.0000
## Mean :0.03858 Mean :0.1391 Mean :0.01698 Mean : 0.2690
## 3rd Qu.:0.00000 3rd Qu.:0.1880 3rd Qu.:0.00000 3rd Qu.: 0.3142
## Max. :4.38500 Max. :9.7520 Max. :4.08100 Max. :32.4780
## X0.44 X0.45 X3.756 X61
## Min. :0.00000 Min. : 0.00000 Min. : 1.000 Min. : 1.00
## 1st Qu.:0.00000 1st Qu.: 0.00000 1st Qu.: 1.588 1st Qu.: 6.00
## Median :0.00000 Median : 0.00000 Median : 2.276 Median : 15.00
## Mean :0.07583 Mean : 0.04425 Mean : 5.192 Mean : 52.17
## 3rd Qu.:0.05200 3rd Qu.: 0.00000 3rd Qu.: 3.705 3rd Qu.: 43.00
## Max. :6.00300 Max. :19.82900 Max. :1102.500 Max. :9989.00
## X278 X1
## Min. : 1.0 Min. :0.0000
## 1st Qu.: 35.0 1st Qu.:0.0000
## Median : 95.0 Median :0.0000
## Mean : 283.3 Mean :0.3939
## 3rd Qu.: 265.2 3rd Qu.:1.0000
## Max. :15841.0 Max. :1.0000
library(tree)
# Number of rows in the data set.
nrow(spam)
## [1] 4600
# Fit the tree once. The earlier fit with mindev = 0.1 was overwritten on the
# very next line, so it has been dropped. X1 is numeric, so tree() grows a
# regression tree and each node's yval is the node mean of X1. mincut = 1
# allows a child node to hold a single observation.
spamtree <- tree(X1 ~ ., data = spam, mincut = 1)
spamtree
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 4600 1098.00 0.39390
## 2) X0.44 < 0.0555 3470 623.60 0.23490
## 4) X0.3 < 0.055 3140 430.50 0.16400
## 8) X0.778 < 0.378 2737 247.40 0.10050
## 16) X0.32.1 < 0.2 2507 168.80 0.07260
## 32) X0.15 < 0.01 2439 135.50 0.05904 *
## 33) X0.15 > 0.01 68 16.76 0.55880 *
## 17) X0.32.1 > 0.2 230 55.40 0.40430 *
## 9) X0.778 > 0.378 403 97.07 0.59550
## 18) X278 < 55.5 182 37.14 0.28570 *
## 19) X278 > 55.5 221 28.07 0.85070 *
## 5) X0.3 > 0.055 330 27.27 0.90910
## 10) X0.18 < 0.14 317 16.09 0.94640 *
## 11) X0.18 > 0.14 13 0.00 0.00000 *
## 3) X0.44 > 0.0555 1130 117.30 0.88230
## 6) X0.16 < 0.4 1060 65.38 0.93400
## 12) X0.38 < 0.49 1045 52.11 0.94740 *
## 13) X0.38 > 0.49 15 0.00 0.00000 *
## 7) X0.16 > 0.4 70 6.30 0.10000 *
# Draw the fitted spam tree using colour 8 of the active palette, then label
# the nodes with values shown to two significant digits.
plot(x = spamtree, col = 8)
text(x = spamtree, digits = 2)
This is the original tree. Next, I will plot the deviance and prune the tree.
# Without k or best, prune.tree() returns the full pruning sequence
# (sizes, deviances and k thresholds) for the spam tree.
spamcut <- prune.tree(tree = spamtree)
print(spamcut)
## $size
## [1] 10 9 8 7 6 5 4 3 2 1
##
## $dev
## [1] 347.3674 358.5518 371.8239 388.3484 411.5347 443.3914 489.0601
## [8] 575.1522 740.9267 1098.2296
##
## $k
## [1] -Inf 11.18440 13.27210 16.52453 23.18634 31.85670 45.66866
## [8] 86.09210 165.77452 357.30286
##
## $method
## [1] "deviance"
##
## attr(,"class")
## [1] "prune" "tree.sequence"
# Plot the pruning sequence computed above.
plot(x = spamcut)
# Prune with penalty k = 3, then draw and print the result.
# NOTE(review): k = 3 is below the smallest finite threshold in the sequence's
# $k (11.18440), and the printout that follows still lists all 10 terminal
# nodes, so no branch is actually removed here.
spamcut <- prune.tree(tree = spamtree, k = 3)
plot(x = spamcut)
text(x = spamcut, digits = 2)
print(spamcut)
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 4600 1098.00 0.39390
## 2) X0.44 < 0.0555 3470 623.60 0.23490
## 4) X0.3 < 0.055 3140 430.50 0.16400
## 8) X0.778 < 0.378 2737 247.40 0.10050
## 16) X0.32.1 < 0.2 2507 168.80 0.07260
## 32) X0.15 < 0.01 2439 135.50 0.05904 *
## 33) X0.15 > 0.01 68 16.76 0.55880 *
## 17) X0.32.1 > 0.2 230 55.40 0.40430 *
## 9) X0.778 > 0.378 403 97.07 0.59550
## 18) X278 < 55.5 182 37.14 0.28570 *
## 19) X278 > 55.5 221 28.07 0.85070 *
## 5) X0.3 > 0.055 330 27.27 0.90910
## 10) X0.18 < 0.14 317 16.09 0.94640 *
## 11) X0.18 > 0.14 13 0.00 0.00000 *
## 3) X0.44 > 0.0555 1130 117.30 0.88230
## 6) X0.16 < 0.4 1060 65.38 0.93400
## 12) X0.38 < 0.49 1045 52.11 0.94740 *
## 13) X0.38 > 0.49 15 0.00 0.00000 *
## 7) X0.16 > 0.4 70 6.30 0.10000 *
# Prune with penalty k = 4, then draw and print the result.
# NOTE(review): k = 4 is also below the smallest finite threshold in the
# sequence's $k (11.18440); the printout that follows still lists all 10
# terminal nodes.
spamcut <- prune.tree(tree = spamtree, k = 4)
plot(x = spamcut)
text(x = spamcut, digits = 2)
print(spamcut)
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 4600 1098.00 0.39390
## 2) X0.44 < 0.0555 3470 623.60 0.23490
## 4) X0.3 < 0.055 3140 430.50 0.16400
## 8) X0.778 < 0.378 2737 247.40 0.10050
## 16) X0.32.1 < 0.2 2507 168.80 0.07260
## 32) X0.15 < 0.01 2439 135.50 0.05904 *
## 33) X0.15 > 0.01 68 16.76 0.55880 *
## 17) X0.32.1 > 0.2 230 55.40 0.40430 *
## 9) X0.778 > 0.378 403 97.07 0.59550
## 18) X278 < 55.5 182 37.14 0.28570 *
## 19) X278 > 55.5 221 28.07 0.85070 *
## 5) X0.3 > 0.055 330 27.27 0.90910
## 10) X0.18 < 0.14 317 16.09 0.94640 *
## 11) X0.18 > 0.14 13 0.00 0.00000 *
## 3) X0.44 > 0.0555 1130 117.30 0.88230
## 6) X0.16 < 0.4 1060 65.38 0.93400
## 12) X0.38 < 0.49 1045 52.11 0.94740 *
## 13) X0.38 > 0.49 15 0.00 0.00000 *
## 7) X0.16 > 0.4 70 6.30 0.10000 *
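Based on the deviance plot, I decided to plot k=3 and k=4, as those would be the best two. There wasn’t much difference between them, so k=4 would be considered the better one. Note that both values are below the smallest pruning threshold in the sequence above (11.18), so both trees are identical to the original 10-leaf tree.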