###############################################################################
#SimpleTreeTab4_8.R
#
# rpart Example
# explore the maxdepth parameter
#
# Data File: table4_8pg199.txt
# Author: Patricia Hoffman, PhD
###############################################################################
# NOTE: the script no longer calls rm(list = ls()) or setwd(): clearing the
# caller's workspace and changing the working directory are side effects that
# leak out of the script, and nothing below reads a relative path.

# library() stops immediately if rpart is missing; require() would only
# return FALSE and let the script fail later at the first rpart() call.
library(rpart)
#install.packages("rpart",lib="C:/Program Files/R/R-3.0.1/library")
#vignette("longintro")
#vignette("usercode")
#Reference:
#http://www.stanford.edu/class/stats315b/minitech.pdf

# Directory that holds table4_8pg199.txt. Defaults to the original author's
# location; set the SIMPLETREE_DATA_DIR environment variable to run the
# script on another machine without editing it.
data_dir <- Sys.getenv(
  "SIMPLETREE_DATA_DIR",
  unset = "C:/Users/Andrew/SkyDrive/AGZ_Home/workspace_R/UCSC/MachinLearning/All_data"
)
data_file <- file.path(data_dir, "table4_8pg199.txt")
if (!file.exists(data_file)) {
  stop(
    "Data file not found: ", data_file,
    " (set SIMPLETREE_DATA_DIR to the folder containing table4_8pg199.txt)",
    call. = FALSE
  )
}

# read the training data (comma-separated, with a header row)
train <- read.csv(data_file, header = TRUE)
train
## Instance a1 a2 a3 Target
## 1 1 TRUE TRUE 1 1
## 2 2 TRUE TRUE 6 1
## 3 3 TRUE FALSE 5 0
## 4 4 FALSE FALSE 4 1
## 5 5 FALSE TRUE 7 0
## 6 6 FALSE TRUE 3 0
## 7 7 FALSE FALSE 8 0
## 8 8 TRUE FALSE 7 1
## 9 9 FALSE TRUE 5 0
# Column types of the training table.
str(train)
## 'data.frame': 9 obs. of 5 variables:
## $ Instance: int 1 2 3 4 5 6 7 8 9
## $ a1 : logi TRUE TRUE TRUE FALSE FALSE FALSE ...
## $ a2 : logi TRUE TRUE FALSE FALSE TRUE TRUE ...
## $ a3 : num 1 6 5 4 7 3 8 7 5
## $ Target : int 1 1 0 1 0 0 0 1 0
# Response: the Target column (column 5) converted to a factor.
y <- as.factor(train[["Target"]]) #class labels 0 or 1
#y <- as.numeric(train[["Target"]]) #class labels 0 or 1
y
## [1] 1 1 0 1 0 0 0 1 0
## Levels: 0 1
# Predictors: the three attribute columns (columns 2 to 4).
x <- train[c("a1", "a2", "a3")]
x
## a1 a2 a3
## 1 TRUE TRUE 1
## 2 TRUE TRUE 6
## 3 TRUE FALSE 5
## 4 FALSE FALSE 4
## 5 FALSE TRUE 7
## 6 FALSE TRUE 3
## 7 FALSE FALSE 8
## 8 TRUE FALSE 7
## 9 FALSE TRUE 5
str(train)
## 'data.frame': 9 obs. of 5 variables:
## $ Instance: int 1 2 3 4 5 6 7 8 9
## $ a1 : logi TRUE TRUE TRUE FALSE FALSE FALSE ...
## $ a2 : logi TRUE TRUE FALSE FALSE TRUE TRUE ...
## $ a3 : num 1 6 5 4 7 3 8 7 5
## $ Target : int 1 1 0 1 0 0 0 1 0
# Show predictors and response one after the other.
x
y
## a1 a2 a3
## 1 TRUE TRUE 1
## 2 TRUE TRUE 6
## 3 TRUE FALSE 5
## 4 FALSE FALSE 4
## 5 FALSE TRUE 7
## 6 FALSE TRUE 3
## 7 FALSE FALSE 8
## 8 TRUE FALSE 7
## 9 FALSE TRUE 5
## [1] 1 1 0 1 0 0 0 1 0
## Levels: 0 1
# a3 is the only non-logical predictor; confirm it was read as numeric.
is.numeric(train[["a3"]])
## [1] TRUE
# Tree of maxdepth = 5
# minsplit = 0 and minbucket = 0 remove the node-size limits, so only
# maxdepth (and the default cp) restrict how far the tree is grown.
fit <- rpart(
  formula = y ~ .,
  data = x,
  control = rpart.control(minsplit = 0, minbucket = 0, maxdepth = 5)
)
# Added: Andrew
fit
## n= 9
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 9 4 0 (0.5555556 0.4444444)
## 2) a1< 0.5 5 1 0 (0.8000000 0.2000000)
## 4) a2>=0.5 3 0 0 (1.0000000 0.0000000) *
## 5) a2< 0.5 2 1 0 (0.5000000 0.5000000)
## 10) a3>=6 1 0 0 (1.0000000 0.0000000) *
## 11) a3< 6 1 0 1 (0.0000000 1.0000000) *
## 3) a1>=0.5 4 1 1 (0.2500000 0.7500000)
## 6) a2< 0.5 2 1 0 (0.5000000 0.5000000)
## 12) a3< 6 1 0 0 (1.0000000 0.0000000) *
## 13) a3>=6 1 0 1 (0.0000000 1.0000000) *
## 7) a2>=0.5 2 0 1 (0.0000000 1.0000000) *
class(fit)
## [1] "rpart"
###
# Predicted class labels for the training rows.
predict(fit, newdata = x, type = "class")
## 1 2 3 4 5 6 7 8 9
## 1 1 0 1 0 0 0 1 0
## Levels: 0 1
# Per-class probabilities for the training rows.
predict(fit, type = "prob")
## 0 1
## 1 0 1
## 2 0 1
## 3 1 0
## 4 0 1
## 5 1 0
## 6 1 0
## 7 1 0
## 8 0 1
## 9 1 0
# Training error rate: one minus the fraction of correctly predicted labels.
1 - sum(predict(fit, newdata = x, type = "class") == y) / length(y)
## [1] 0
# returns 0 all correct
# by setting type="class" in the predict function
# it returns the actual predicted class
# however, if you leave it off, the function
# returns a matrix of the probabilities of
# each class prediction.
# Remember the current graphics parameters so they can be put back once the
# depth-5 tree has been drawn.
oldpar <- par(no.readonly = TRUE)
#par(mar=rep(1, 4)) # make the margins smaller for RStudio
# default mar is c(5, 4, 4, 2) + 0.1
#par(mar= c(5, 4, 4, 2) + 0.1)
par(mar = rep(0.6, 4))
par(ask = FALSE)
plot(fit)
text(fit)

# print(fit) gives the same listing as typing `fit`: one row per node with the
# fields named in the legend (node number, split rule, n, loss, yval and the
# class probabilities); a trailing * marks a terminal node.
print(fit)
## n= 9
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 9 4 0 (0.5555556 0.4444444)
## 2) a1< 0.5 5 1 0 (0.8000000 0.2000000)
## 4) a2>=0.5 3 0 0 (1.0000000 0.0000000) *
## 5) a2< 0.5 2 1 0 (0.5000000 0.5000000)
## 10) a3>=6 1 0 0 (1.0000000 0.0000000) *
## 11) a3< 6 1 0 1 (0.0000000 1.0000000) *
## 3) a1>=0.5 4 1 1 (0.2500000 0.7500000)
## 6) a2< 0.5 2 1 0 (0.5000000 0.5000000)
## 12) a3< 6 1 0 0 (1.0000000 0.0000000) *
## 13) a3>=6 1 0 1 (0.0000000 1.0000000) *
## 7) a2>=0.5 2 0 1 (0.0000000 1.0000000) *
# file = "" sends the post() drawing to the current graphics device.
post(fit, file = "")

# Restore the saved graphics parameters. This must be a call, par(oldpar):
# the previous `par <- oldpar` only created a variable named `par` and left
# the 0.6 margins in force for every later plot.
par(oldpar)
#close the tree graph and make another one
#prettyTree(fit)
# Complexity-parameter table of the fitted tree.
printcp(fit)
##
## Classification tree:
## rpart(formula = y ~ ., data = x, control = rpart.control(minsplit = 0,
## minbucket = 0, maxdepth = 5))
##
## Variables actually used in tree construction:
## [1] a1 a2 a3
##
## Root node error: 4/9 = 0.44444
##
## n= 9
##
## CP nsplit rel error xerror xstd
## 1 0.500 0 1.0 1.00 0.37268
## 2 0.125 1 0.5 1.75 0.31180
## 3 0.010 5 0.0 1.75 0.31180
library(rpart)
# Tree of maxdepth = 2
# Same data as before, but growth stops two levels below the root.
# Competitor/surrogate bookkeeping and cross-validation are switched off
# (maxcompete, maxsurrogate, usesurrogate, xval all 0) and cp is negative.
fit <- rpart(
  formula = y ~ .,
  data = x,
  control = rpart.control(
    minsplit = 0, minbucket = 0, cp = -1,
    maxcompete = 0, maxsurrogate = 0, usesurrogate = 0,
    xval = 0, maxdepth = 2
  )
)
predict(fit, newdata = x, type = "class")
## 1 2 3 4 5 6 7 8 9
## 1 1 0 0 0 0 0 0 0
## Levels: 0 1
predict(fit, type = "prob")
## 0 1
## 1 0.0 1.0
## 2 0.0 1.0
## 3 0.5 0.5
## 4 0.5 0.5
## 5 1.0 0.0
## 6 1.0 0.0
## 7 0.5 0.5
## 8 0.5 0.5
## 9 1.0 0.0
# Training error rate of the shallow tree.
1 - sum(predict(fit, newdata = x, type = "class") == y) / length(y)
## [1] 0.2222222
# returns 0.222 22% error
plot(fit)
text(fit)

fit
## n= 9
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 9 4 0 (0.5555556 0.4444444)
## 2) a1< 0.5 5 1 0 (0.8000000 0.2000000)
## 4) a2>=0.5 3 0 0 (1.0000000 0.0000000) *
## 5) a2< 0.5 2 1 0 (0.5000000 0.5000000) *
## 3) a1>=0.5 4 1 1 (0.2500000 0.7500000)
## 6) a2< 0.5 2 1 0 (0.5000000 0.5000000) *
## 7) a2>=0.5 2 0 1 (0.0000000 1.0000000) *
# The frame component holds one row per node of the fitted tree.
fit[["frame"]]
## var n wt dev yval complexity ncompete nsurrogate yval2.V1 yval2.V2
## 1 a1 9 9 4 1 0.5 0 0 1.0000000 5.0000000
## 2 a2 5 5 1 1 0.0 0 0 1.0000000 4.0000000
## 4 <leaf> 3 3 0 1 -1.0 0 0 1.0000000 3.0000000
## 5 <leaf> 2 2 1 1 -1.0 0 0 1.0000000 1.0000000
## 3 a2 4 4 1 2 0.0 0 0 2.0000000 1.0000000
## 6 <leaf> 2 2 1 1 -1.0 0 0 1.0000000 1.0000000
## 7 <leaf> 2 2 0 2 -1.0 0 0 2.0000000 0.0000000
## yval2.V3 yval2.V4 yval2.V5 yval2.nodeprob
## 1 4.0000000 0.5555556 0.4444444 1.0000000
## 2 1.0000000 0.8000000 0.2000000 0.5555556
## 4 0.0000000 1.0000000 0.0000000 0.3333333
## 5 1.0000000 0.5000000 0.5000000 0.2222222
## 3 3.0000000 0.2500000 0.7500000 0.4444444
## 6 1.0000000 0.5000000 0.5000000 0.2222222
## 7 2.0000000 0.0000000 1.0000000 0.2222222
# First row, first column ("var"): the variable the root node splits on.
fit[["frame"]][1, "var"]
## [1] a1
## Levels: <leaf> a1 a2
print(fit)
## n= 9
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 9 4 0 (0.5555556 0.4444444)
## 2) a1< 0.5 5 1 0 (0.8000000 0.2000000)
## 4) a2>=0.5 3 0 0 (1.0000000 0.0000000) *
## 5) a2< 0.5 2 1 0 (0.5000000 0.5000000) *
## 3) a1>=0.5 4 1 1 (0.2500000 0.7500000)
## 6) a2< 0.5 2 1 0 (0.5000000 0.5000000) *
## 7) a2>=0.5 2 0 1 (0.0000000 1.0000000) *
# Draw the depth-2 tree on the current graphics device.
post(fit, file = "")
# Keep the predicted class labels so individual entries can be inspected.
answers <- predict(fit, type = "class")
answers
## 1 2 3 4 5 6 7 8 9
## 1 1 0 0 0 0 0 0 0
## Levels: 0 1
answers[2]
## 2
## 1
## Levels: 0 1
length(answers)
## [1] 9
# The four prediction formats offered by predict() for this tree:
predict(fit, type = "prob") # class probabilities (default)
## 0 1
## 1 0.0 1.0
## 2 0.0 1.0
## 3 0.5 0.5
## 4 0.5 0.5
## 5 1.0 0.0
## 6 1.0 0.0
## 7 0.5 0.5
## 8 0.5 0.5
## 9 1.0 0.0
predict(fit, type = "vector") # level numbers
## 1 2 3 4 5 6 7 8 9
## 2 2 1 1 1 1 1 1 1
predict(fit, type = "class") # factor
## 1 2 3 4 5 6 7 8 9
## 1 1 0 0 0 0 0 0 0
## Levels: 0 1
predict(fit, type = "matrix") # level number, class frequencies, probabilities
## [,1] [,2] [,3] [,4] [,5] [,6]
## 1 2 0 2 0.0 1.0 0.2222222
## 2 2 0 2 0.0 1.0 0.2222222
## 3 1 1 1 0.5 0.5 0.2222222
## 4 1 1 1 0.5 0.5 0.2222222
## 5 1 3 0 1.0 0.0 0.3333333
## 6 1 3 0 1.0 0.0 0.3333333
## 7 1 1 1 0.5 0.5 0.2222222
## 8 1 1 1 0.5 0.5 0.2222222
## 9 1 3 0 1.0 0.0 0.3333333
#look at the prune statement
#install.packages("DMwR")
# DMwR supplies prettyTree(), used below to draw the trees.
library(DMwR)
## Warning: package 'DMwR' was built under R version 3.1.2
## Loading required package: lattice
## Loading required package: grid

# Refit the depth-5 tree so there is a full tree to prune.
fit <- rpart(
  formula = y ~ .,
  data = x,
  control = rpart.control(minsplit = 0, minbucket = 0, maxdepth = 5)
)
prettyTree(fit)

printcp(fit)
##
## Classification tree:
## rpart(formula = y ~ ., data = x, control = rpart.control(minsplit = 0,
## minbucket = 0, maxdepth = 5))
##
## Variables actually used in tree construction:
## [1] a1 a2 a3
##
## Root node error: 4/9 = 0.44444
##
## n= 9
##
## CP nsplit rel error xerror xstd
## 1 0.500 0 1.0 1.00 0.37268
## 2 0.125 1 0.5 1.75 0.31180
## 3 0.010 5 0.0 1.75 0.31180
# Prune at cp = 0.2; per the table below only the a1 split remains.
zp <- prune(fit, cp = 0.2)
plot(zp) #plot smaller rpart object

prettyTree(zp)

printcp(zp)
##
## Classification tree:
## rpart(formula = y ~ ., data = x, control = rpart.control(minsplit = 0,
## minbucket = 0, maxdepth = 5))
##
## Variables actually used in tree construction:
## [1] a1
##
## Root node error: 4/9 = 0.44444
##
## n= 9
##
## CP nsplit rel error xerror xstd
## 1 0.5 0 1.0 1.00 0.37268
## 2 0.2 1 0.5 1.75 0.31180
# Now the probabilities are not just zero and one
predict(zp, type = "prob")
## 0 1
## [1,] 0.25 0.75
## [2,] 0.25 0.75
## [3,] 0.25 0.75
## [4,] 0.80 0.20
## [5,] 0.80 0.20
## [6,] 0.80 0.20
## [7,] 0.80 0.20
## [8,] 0.25 0.75
## [9,] 0.80 0.20
#look at the prune statement
# install.packages("DMwR")
library("DMwR")
# Car name, mileage and weight side by side. This is built as a data frame
# because cbind() on a character vector plus numeric vectors returns a
# character matrix, which silently turns Mileage and Weight into strings.
dataset <- data.frame(
  Model = rownames(car.test.frame),
  Mileage = car.test.frame$Mileage,
  Weight = car.test.frame$Weight,
  stringsAsFactors = FALSE
)
# Print the full car data set.
car.test.frame
## Price Country Reliability Mileage Type
## Eagle Summit 4 8895 USA 4 33 Small
## Ford Escort 4 7402 USA 2 33 Small
## Ford Festiva 4 6319 Korea 4 37 Small
## Honda Civic 4 6635 Japan/USA 5 32 Small
## Mazda Protege 4 6599 Japan 5 32 Small
## Mercury Tracer 4 8672 Mexico 4 26 Small
## Nissan Sentra 4 7399 Japan/USA 5 33 Small
## Pontiac LeMans 4 7254 Korea 1 28 Small
## Subaru Loyale 4 9599 Japan 5 25 Small
## Subaru Justy 3 5866 Japan NA 34 Small
## Toyota Corolla 4 8748 Japan/USA 5 29 Small
## Toyota Tercel 4 6488 Japan 5 35 Small
## Volkswagen Jetta 4 9995 Germany 3 26 Small
## Chevrolet Camaro V8 11545 USA 1 20 Sporty
## Dodge Daytona 9745 USA 1 27 Sporty
## Ford Mustang V8 12164 USA 1 19 Sporty
## Ford Probe 11470 USA 3 30 Sporty
## Honda Civic CRX Si 4 9410 Japan 5 33 Sporty
## Honda Prelude Si 4WS 4 13945 Japan 5 27 Sporty
## Nissan 240SX 4 13249 Japan 3 24 Sporty
## Plymouth Laser 10855 USA NA 26 Sporty
## Subaru XT 4 13071 Japan NA 28 Sporty
## Audi 80 4 18900 Germany NA 27 Compact
## Buick Skylark 4 10565 USA 2 23 Compact
## Chevrolet Beretta 4 10320 USA 1 26 Compact
## Chrysler Le Baron V6 10945 USA 4 25 Compact
## Ford Tempo 4 9483 USA 2 24 Compact
## Honda Accord 4 12145 Japan/USA 5 26 Compact
## Mazda 626 4 12459 Japan/USA 4 24 Compact
## Mitsubishi Galant 4 10989 Japan 5 25 Compact
## Mitsubishi Sigma V6 17879 Japan 4 21 Compact
## Nissan Stanza 4 11650 Japan 5 21 Compact
## Oldsmobile Calais 4 9995 USA 2 23 Compact
## Peugeot 405 4 15930 France NA 24 Compact
## Subaru Legacy 4 11499 Japan/USA 5 23 Compact
## Toyota Camry 4 11588 Japan/USA 5 27 Compact
## Volvo 240 4 18450 Sweden 3 23 Compact
## Acura Legend V6 24760 Japan 5 20 Medium
## Buick Century 4 13150 USA 3 21 Medium
## Chrysler Le Baron Coupe 12495 USA 2 22 Medium
## Chrysler New Yorker V6 16342 USA 3 22 Medium
## Eagle Premier V6 15350 USA 2 22 Medium
## Ford Taurus V6 13195 USA 3 22 Medium
## Ford Thunderbird V6 14980 USA 1 23 Medium
## Hyundai Sonata 4 9999 Korea NA 23 Medium
## Mazda 929 V6 23300 Japan 5 21 Medium
## Nissan Maxima V6 17899 Japan 5 22 Medium
## Oldsmobile Cutlass Ciera 4 13150 USA 2 21 Medium
## Oldsmobile Cutlass Supreme V6 14495 USA NA 21 Medium
## Toyota Cressida 6 21498 Japan 3 23 Medium
## Buick Le Sabre V6 16145 USA 3 23 Large
## Chevrolet Caprice V8 14525 USA 1 18 Large
## Ford LTD Crown Victoria V8 17257 USA 3 20 Large
## Chevrolet Lumina APV V6 13995 USA NA 18 Van
## Dodge Grand Caravan V6 15395 USA 3 18 Van
## Ford Aerostar V6 12267 USA 3 18 Van
## Mazda MPV V6 14944 Japan 5 19 Van
## Mitsubishi Wagon 4 14929 Japan NA 20 Van
## Nissan Axxess 4 13949 Japan NA 20 Van
## Nissan Van 4 14799 Japan NA 19 Van
## Weight Disp. HP
## Eagle Summit 4 2560 97 113
## Ford Escort 4 2345 114 90
## Ford Festiva 4 1845 81 63
## Honda Civic 4 2260 91 92
## Mazda Protege 4 2440 113 103
## Mercury Tracer 4 2285 97 82
## Nissan Sentra 4 2275 97 90
## Pontiac LeMans 4 2350 98 74
## Subaru Loyale 4 2295 109 90
## Subaru Justy 3 1900 73 73
## Toyota Corolla 4 2390 97 102
## Toyota Tercel 4 2075 89 78
## Volkswagen Jetta 4 2330 109 100
## Chevrolet Camaro V8 3320 305 170
## Dodge Daytona 2885 153 100
## Ford Mustang V8 3310 302 225
## Ford Probe 2695 133 110
## Honda Civic CRX Si 4 2170 97 108
## Honda Prelude Si 4WS 4 2710 125 140
## Nissan 240SX 4 2775 146 140
## Plymouth Laser 2840 107 92
## Subaru XT 4 2485 109 97
## Audi 80 4 2670 121 108
## Buick Skylark 4 2640 151 110
## Chevrolet Beretta 4 2655 133 95
## Chrysler Le Baron V6 3065 181 141
## Ford Tempo 4 2750 141 98
## Honda Accord 4 2920 132 125
## Mazda 626 4 2780 133 110
## Mitsubishi Galant 4 2745 122 102
## Mitsubishi Sigma V6 3110 181 142
## Nissan Stanza 4 2920 146 138
## Oldsmobile Calais 4 2645 151 110
## Peugeot 405 4 2575 116 120
## Subaru Legacy 4 2935 135 130
## Toyota Camry 4 2920 122 115
## Volvo 240 4 2985 141 114
## Acura Legend V6 3265 163 160
## Buick Century 4 2880 151 110
## Chrysler Le Baron Coupe 2975 153 150
## Chrysler New Yorker V6 3450 202 147
## Eagle Premier V6 3145 180 150
## Ford Taurus V6 3190 182 140
## Ford Thunderbird V6 3610 232 140
## Hyundai Sonata 4 2885 143 110
## Mazda 929 V6 3480 180 158
## Nissan Maxima V6 3200 180 160
## Oldsmobile Cutlass Ciera 4 2765 151 110
## Oldsmobile Cutlass Supreme V6 3220 189 135
## Toyota Cressida 6 3480 180 190
## Buick Le Sabre V6 3325 231 165
## Chevrolet Caprice V8 3855 305 170
## Ford LTD Crown Victoria V8 3850 302 150
## Chevrolet Lumina APV V6 3195 151 110
## Dodge Grand Caravan V6 3735 202 150
## Ford Aerostar V6 3665 182 145
## Mazda MPV V6 3735 181 150
## Mitsubishi Wagon 4 3415 143 107
## Nissan Axxess 4 3185 146 138
## Nissan Van 4 3690 146 106
# First rows of the car data.
head(car.test.frame)
## Price Country Reliability Mileage Type Weight Disp.
## Eagle Summit 4 8895 USA 4 33 Small 2560 97
## Ford Escort 4 7402 USA 2 33 Small 2345 114
## Ford Festiva 4 6319 Korea 4 37 Small 2260 91
## Honda Civic 4 6635 Japan/USA 5 32 Small 2260 91
## Mazda Protege 4 6599 Japan 5 32 Small 2440 113
## Mercury Tracer 4 8672 Mexico 4 26 Small 2285 97
## HP
## Eagle Summit 4 113
## Ford Escort 4 90
## Ford Festiva 4 63
## Honda Civic 4 92
## Mazda Protege 4 103
## Mercury Tracer 4 82
# Regression tree: Mileage modelled from Weight alone.
z.auto <- rpart(formula = Mileage ~ Weight, data = car.test.frame)
prettyTree(z.auto)

printcp(z.auto)
##
## Regression tree:
## rpart(formula = Mileage ~ Weight, data = car.test.frame)
##
## Variables actually used in tree construction:
## [1] Weight
##
## Root node error: 1354.6/60 = 22.576
##
## n= 60
##
## CP nsplit rel error xerror xstd
## 1 0.595349 0 1.00000 1.06992 0.183244
## 2 0.134528 1 0.40465 0.55470 0.103161
## 3 0.012828 2 0.27012 0.40922 0.078510
## 4 0.010000 3 0.25729 0.41304 0.078199
# Prune the regression tree at cp = 0.1 and display the smaller tree.
zp <- prune(z.auto, cp = 0.1)
plot(zp) #plot smaller rpart object

prettyTree(zp)

printcp(zp)
##
## Regression tree:
## rpart(formula = Mileage ~ Weight, data = car.test.frame)
##
## Variables actually used in tree construction:
## [1] Weight
##
## Root node error: 1354.6/60 = 22.576
##
## n= 60
##
## CP nsplit rel error xerror xstd
## 1 0.59535 0 1.00000 1.06992 0.18324
## 2 0.13453 1 0.40465 0.55470 0.10316
## 3 0.10000 2 0.27012 0.40922 0.07851
###############################
#
# Documentation for rpart
#
################################
# ?rpart
#rpart(formula, data, control, ...)
# formula a formula, with a response but no interaction terms.
# ?formula
#data a data frame in which to interpret the variables named in the formula.
#control a list of options that control details of the rpart algorithm. See rpart.control.
# ?rpart.control
#rpart.control(minsplit = 20, minbucket = round(minsplit/3),maxdepth = 30, ...)
#maxdepth - Set the maximum depth of any node of the final tree, with the root node counted as depth 0.
#minsplit - the minimum number of observations that must exist in a node in order for a split to be attempted.
#minbucket - the minimum number of observations in any terminal <leaf> node.
# If only one of minbucket or minsplit is specified,
# the code either sets minsplit to minbucket*3 or minbucket to minsplit/3, as appropriate.
# ?prune