ML_Day2_Classification-2

###############################################################################
#SimpleTreeTab4_8.R
#
# rpart Example
#      explore the maxdepth parameter
#
# Data File: table4_8pg199.txt 
# Author: Patricia Hoffman, PhD
###############################################################################
# Start from an empty workspace.
# NOTE(review): rm(list = ls()) deletes every object in the caller's global
# environment; kept because the original script does it, but consider
# running the script in a fresh R session instead.
rm(list = ls())

# Attach rpart with library() rather than require(): library() stops with an
# error right here if the package is missing, whereas require() only warns,
# returns FALSE, and lets the script fail later at the first rpart() call.
library(rpart)

# Working directory for this machine. Locations used on other machines,
# kept for reference:
#   setwd("C:/Dev/workspaceR/TestDataSets")
#   setwd("C:/Users/PatriciaHoffman/workspaceR/TestDataSets")
setwd("C:/Users/Andrew/SkyDrive/AGZ_Home/workspace_R/UCSC/MachinLearning")

# One-time setup and further reading on rpart:
#   install.packages("rpart",lib="C:/Program Files/R/R-3.0.1/library")
#   library(rpart)
#   vignette("longintro")
#   vignette("usercode")
# Reference:
#   http://www.stanford.edu/class/stats315b/minitech.pdf

# Read the training data (table4_8pg199.txt, comma separated, with a header
# row) into the data frame `train`, then display it.
train <- read.csv(
  file = "C:/Users/Andrew/SkyDrive/AGZ_Home/workspace_R/UCSC/MachinLearning/All_data/table4_8pg199.txt",
  header = TRUE
)
train

##   Instance    a1    a2 a3 Target
## 1        1  TRUE  TRUE  1      1
## 2        2  TRUE  TRUE  6      1
## 3        3  TRUE FALSE  5      0
## 4        4 FALSE FALSE  4      1
## 5        5 FALSE  TRUE  7      0
## 6        6 FALSE  TRUE  3      0
## 7        7 FALSE FALSE  8      0
## 8        8  TRUE FALSE  7      1
## 9        9 FALSE  TRUE  5      0

str(train)

## 'data.frame':    9 obs. of  5 variables:
##  $ Instance: int  1 2 3 4 5 6 7 8 9
##  $ a1      : logi  TRUE TRUE TRUE FALSE FALSE FALSE ...
##  $ a2      : logi  TRUE TRUE FALSE FALSE TRUE TRUE ...
##  $ a3      : num  1 6 5 4 7 3 8 7 5
##  $ Target  : int  1 1 0 1 0 0 0 1 0

# Response: the fifth column of `train` (Target) as a factor, so that rpart
# treats the problem as classification (class labels 0 or 1).
y <- as.factor(train[[5]])
# y <- as.numeric(train[[5]])  # numeric labels instead; left disabled
y

## [1] 1 1 0 1 0 0 0 1 0
## Levels: 0 1

# Predictors: columns 2 to 4 of `train` (a1, a2, a3).
x <- train[2:4]
x

##      a1    a2 a3
## 1  TRUE  TRUE  1
## 2  TRUE  TRUE  6
## 3  TRUE FALSE  5
## 4 FALSE FALSE  4
## 5 FALSE  TRUE  7
## 6 FALSE  TRUE  3
## 7 FALSE FALSE  8
## 8  TRUE FALSE  7
## 9 FALSE  TRUE  5

str(train)

## 'data.frame':    9 obs. of  5 variables:
##  $ Instance: int  1 2 3 4 5 6 7 8 9
##  $ a1      : logi  TRUE TRUE TRUE FALSE FALSE FALSE ...
##  $ a2      : logi  TRUE TRUE FALSE FALSE TRUE TRUE ...
##  $ a3      : num  1 6 5 4 7 3 8 7 5
##  $ Target  : int  1 1 0 1 0 0 0 1 0

x;y

##      a1    a2 a3
## 1  TRUE  TRUE  1
## 2  TRUE  TRUE  6
## 3  TRUE FALSE  5
## 4 FALSE FALSE  4
## 5 FALSE  TRUE  7
## 6 FALSE  TRUE  3
## 7 FALSE FALSE  8
## 8  TRUE FALSE  7
## 9 FALSE  TRUE  5

## [1] 1 1 0 1 0 0 0 1 0
## Levels: 0 1

is.numeric(train$a3)

## [1] TRUE

# Classification tree with maxdepth = 5. minsplit = 0 and minbucket = 0
# remove the minimum node-size limits.
fit <- rpart(
  formula = y ~ .,
  data = x,
  control = rpart.control(minsplit = 0, minbucket = 0, maxdepth = 5)
)
# Added: Andrew -- show the fitted tree
fit

## n= 9 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 9 4 0 (0.5555556 0.4444444)  
##    2) a1< 0.5 5 1 0 (0.8000000 0.2000000)  
##      4) a2>=0.5 3 0 0 (1.0000000 0.0000000) *
##      5) a2< 0.5 2 1 0 (0.5000000 0.5000000)  
##       10) a3>=6 1 0 0 (1.0000000 0.0000000) *
##       11) a3< 6 1 0 1 (0.0000000 1.0000000) *
##    3) a1>=0.5 4 1 1 (0.2500000 0.7500000)  
##      6) a2< 0.5 2 1 0 (0.5000000 0.5000000)  
##       12) a3< 6 1 0 0 (1.0000000 0.0000000) *
##       13) a3>=6 1 0 1 (0.0000000 1.0000000) *
##      7) a2>=0.5 2 0 1 (0.0000000 1.0000000) *

class(fit)

## [1] "rpart"

###

predict(fit,x,type="class")

## 1 2 3 4 5 6 7 8 9 
## 1 1 0 1 0 0 0 1 0 
## Levels: 0 1

predict(fit, type="prob")

##   0 1
## 1 0 1
## 2 0 1
## 3 1 0
## 4 0 1
## 5 1 0
## 6 1 0
## 7 1 0
## 8 0 1
## 9 1 0

1-sum(y==predict(fit,x,type="class"))/length(y)

## [1] 0

# returns 0: every training instance is classified correctly.
#  Setting type="class" in the predict function makes it
#  return the predicted class label for each row.
#  If type is left off, the function instead
#  returns a matrix with the probability of
#  each class for each row.

# Remember the current (writable) graphics parameters so they can be put
# back after plotting.
oldpar <- par(no.readonly = TRUE)
# Margin settings that were tried before:
#   par(mar=rep(1, 4))            # smaller margins for RStudio
#   par(mar= c(5, 4, 4, 2) + 0.1) # the default mar
# Use very small margins and do not prompt before each new plot.
par(mar = rep(0.6, 4), ask = FALSE)
# Draw the tree skeleton, then label its splits and leaves.
plot(fit)
text(fit)

print(fit)  # one row per node; the legend line names the columns

## n= 9 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 9 4 0 (0.5555556 0.4444444)  
##    2) a1< 0.5 5 1 0 (0.8000000 0.2000000)  
##      4) a2>=0.5 3 0 0 (1.0000000 0.0000000) *
##      5) a2< 0.5 2 1 0 (0.5000000 0.5000000)  
##       10) a3>=6 1 0 0 (1.0000000 0.0000000) *
##       11) a3< 6 1 0 1 (0.0000000 1.0000000) *
##    3) a1>=0.5 4 1 1 (0.2500000 0.7500000)  
##      6) a2< 0.5 2 1 0 (0.5000000 0.5000000)  
##       12) a3< 6 1 0 0 (1.0000000 0.0000000) *
##       13) a3>=6 1 0 1 (0.0000000 1.0000000) *
##      7) a2>=0.5 2 0 1 (0.0000000 1.0000000) *

post(fit,file="")

# Restore the graphics parameters saved in `oldpar`. par() has to be called
# with the saved list: assigning the list to a variable named `par` restores
# nothing and leaves the reduced margins in effect for all later plots.
par(oldpar)
#close the tree graph and make another one

#prettyTree(fit)
printcp(fit)

## 
## Classification tree:
## rpart(formula = y ~ ., data = x, control = rpart.control(minsplit = 0, 
##     minbucket = 0, maxdepth = 5))
## 
## Variables actually used in tree construction:
## [1] a1 a2 a3
## 
## Root node error: 4/9 = 0.44444
## 
## n= 9 
## 
##      CP nsplit rel error xerror    xstd
## 1 0.500      0       1.0   1.00 0.37268
## 2 0.125      1       0.5   1.75 0.31180
## 3 0.010      5       0.0   1.75 0.31180

library(rpart)

# Tree of maxdepth = 2. The remaining control settings are passed exactly as
# before; see ?rpart.control for what each one does.
fit <- rpart(
  formula = y ~ .,
  data = x,
  control = rpart.control(
    minsplit = 0, minbucket = 0, cp = -1,
    maxcompete = 0, maxsurrogate = 0, usesurrogate = 0,
    xval = 0, maxdepth = 2
  )
)
# Predicted class label for every training row.
predict(fit, newdata = x, type = "class")

## 1 2 3 4 5 6 7 8 9 
## 1 1 0 0 0 0 0 0 0 
## Levels: 0 1

predict(fit, type="prob")

##     0   1
## 1 0.0 1.0
## 2 0.0 1.0
## 3 0.5 0.5
## 4 0.5 0.5
## 5 1.0 0.0
## 6 1.0 0.0
## 7 0.5 0.5
## 8 0.5 0.5
## 9 1.0 0.0

1-sum(y==predict(fit,x,type="class"))/length(y)

## [1] 0.2222222

# returns 0.222  22% error
plot(fit)
text(fit)

fit

## n= 9 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 9 4 0 (0.5555556 0.4444444)  
##   2) a1< 0.5 5 1 0 (0.8000000 0.2000000)  
##     4) a2>=0.5 3 0 0 (1.0000000 0.0000000) *
##     5) a2< 0.5 2 1 0 (0.5000000 0.5000000) *
##   3) a1>=0.5 4 1 1 (0.2500000 0.7500000)  
##     6) a2< 0.5 2 1 0 (0.5000000 0.5000000) *
##     7) a2>=0.5 2 0 1 (0.0000000 1.0000000) *

fit$frame

##      var n wt dev yval complexity ncompete nsurrogate  yval2.V1  yval2.V2
## 1     a1 9  9   4    1        0.5        0          0 1.0000000 5.0000000
## 2     a2 5  5   1    1        0.0        0          0 1.0000000 4.0000000
## 4 <leaf> 3  3   0    1       -1.0        0          0 1.0000000 3.0000000
## 5 <leaf> 2  2   1    1       -1.0        0          0 1.0000000 1.0000000
## 3     a2 4  4   1    2        0.0        0          0 2.0000000 1.0000000
## 6 <leaf> 2  2   1    1       -1.0        0          0 1.0000000 1.0000000
## 7 <leaf> 2  2   0    2       -1.0        0          0 2.0000000 0.0000000
##    yval2.V3  yval2.V4  yval2.V5 yval2.nodeprob
## 1 4.0000000 0.5555556 0.4444444      1.0000000
## 2 1.0000000 0.8000000 0.2000000      0.5555556
## 4 0.0000000 1.0000000 0.0000000      0.3333333
## 5 1.0000000 0.5000000 0.5000000      0.2222222
## 3 3.0000000 0.2500000 0.7500000      0.4444444
## 6 1.0000000 0.5000000 0.5000000      0.2222222
## 7 2.0000000 0.0000000 1.0000000      0.2222222

fit$frame[1,1]

## [1] a1
## Levels: <leaf> a1 a2

print(fit)

## n= 9 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 9 4 0 (0.5555556 0.4444444)  
##   2) a1< 0.5 5 1 0 (0.8000000 0.2000000)  
##     4) a2>=0.5 3 0 0 (1.0000000 0.0000000) *
##     5) a2< 0.5 2 1 0 (0.5000000 0.5000000) *
##   3) a1>=0.5 4 1 1 (0.2500000 0.7500000)  
##     6) a2< 0.5 2 1 0 (0.5000000 0.5000000) *
##     7) a2>=0.5 2 0 1 (0.0000000 1.0000000) *

post(fit,file="")


answers <- predict(fit, type="class") 
answers

## 1 2 3 4 5 6 7 8 9 
## 1 1 0 0 0 0 0 0 0 
## Levels: 0 1

answers[2]

## 2 
## 1 
## Levels: 0 1

length(answers)

## [1] 9

predict(fit, type="prob")   # class probabilities (default)

##     0   1
## 1 0.0 1.0
## 2 0.0 1.0
## 3 0.5 0.5
## 4 0.5 0.5
## 5 1.0 0.0
## 6 1.0 0.0
## 7 0.5 0.5
## 8 0.5 0.5
## 9 1.0 0.0

predict(fit, type="vector") # level numbers

## 1 2 3 4 5 6 7 8 9 
## 2 2 1 1 1 1 1 1 1

predict(fit, type="class")  # factor

## 1 2 3 4 5 6 7 8 9 
## 1 1 0 0 0 0 0 0 0 
## Levels: 0 1

predict(fit, type="matrix") # level number, class frequencies, probabilities

##   [,1] [,2] [,3] [,4] [,5]      [,6]
## 1    2    0    2  0.0  1.0 0.2222222
## 2    2    0    2  0.0  1.0 0.2222222
## 3    1    1    1  0.5  0.5 0.2222222
## 4    1    1    1  0.5  0.5 0.2222222
## 5    1    3    0  1.0  0.0 0.3333333
## 6    1    3    0  1.0  0.0 0.3333333
## 7    1    1    1  0.5  0.5 0.2222222
## 8    1    1    1  0.5  0.5 0.2222222
## 9    1    3    0  1.0  0.0 0.3333333

#look at the prune statement
#install.packages("DMwR")
library("DMwR")

## Warning: package 'DMwR' was built under R version 3.1.2

## Loading required package: lattice
## Loading required package: grid

# Refit the maxdepth = 5 tree so there is a full tree to prune below.
fit <- rpart(
  formula = y ~ .,
  data = x,
  control = rpart.control(minsplit = 0, minbucket = 0, maxdepth = 5)
)
# Draw the tree with DMwR's prettyTree().
prettyTree(fit)

# Complexity-parameter table for the fitted tree.
printcp(fit)

## 
## Classification tree:
## rpart(formula = y ~ ., data = x, control = rpart.control(minsplit = 0, 
##     minbucket = 0, maxdepth = 5))
## 
## Variables actually used in tree construction:
## [1] a1 a2 a3
## 
## Root node error: 4/9 = 0.44444
## 
## n= 9 
## 
##      CP nsplit rel error xerror    xstd
## 1 0.500      0       1.0   1.00 0.37268
## 2 0.125      1       0.5   1.75 0.31180
## 3 0.010      5       0.0   1.75 0.31180

# Prune the classification tree at complexity parameter 0.2.
zp <- prune(tree = fit, cp = 0.2)
# Plot the smaller rpart object.
plot(zp)

prettyTree(zp)

# Complexity-parameter table of the pruned tree.
printcp(zp)

## 
## Classification tree:
## rpart(formula = y ~ ., data = x, control = rpart.control(minsplit = 0, 
##     minbucket = 0, maxdepth = 5))
## 
## Variables actually used in tree construction:
## [1] a1
## 
## Root node error: 4/9 = 0.44444
## 
## n= 9 
## 
##    CP nsplit rel error xerror    xstd
## 1 0.5      0       1.0   1.00 0.37268
## 2 0.2      1       0.5   1.75 0.31180

# Now the probabilities are not just zero and one
predict(zp, type="prob")

##          0    1
##  [1,] 0.25 0.75
##  [2,] 0.25 0.75
##  [3,] 0.25 0.75
##  [4,] 0.80 0.20
##  [5,] 0.80 0.20
##  [6,] 0.80 0.20
##  [7,] 0.80 0.20
##  [8,] 0.25 0.75
##  [9,] 0.80 0.20

#look at the prune statement
# install.packages("DMwR")
library("DMwR")
# Collect car model name, mileage and weight in one table.
# A data frame is used instead of cbind(): binding a character vector to
# numeric vectors with cbind() gives a character matrix, which would turn
# the mileage and weight values into strings.
dataset <- data.frame(
  Model = rownames(car.test.frame),
  Mileage = car.test.frame$Mileage,
  Weight = car.test.frame$Weight,
  stringsAsFactors = FALSE
)
# Show the full car.test.frame data set.
car.test.frame

##                               Price   Country Reliability Mileage    Type
## Eagle Summit 4                 8895       USA           4      33   Small
## Ford Escort   4                7402       USA           2      33   Small
## Ford Festiva 4                 6319     Korea           4      37   Small
## Honda Civic 4                  6635 Japan/USA           5      32   Small
## Mazda Protege 4                6599     Japan           5      32   Small
## Mercury Tracer 4               8672    Mexico           4      26   Small
## Nissan Sentra 4                7399 Japan/USA           5      33   Small
## Pontiac LeMans 4               7254     Korea           1      28   Small
## Subaru Loyale 4                9599     Japan           5      25   Small
## Subaru Justy 3                 5866     Japan          NA      34   Small
## Toyota Corolla 4               8748 Japan/USA           5      29   Small
## Toyota Tercel 4                6488     Japan           5      35   Small
## Volkswagen Jetta 4             9995   Germany           3      26   Small
## Chevrolet Camaro V8           11545       USA           1      20  Sporty
## Dodge Daytona                  9745       USA           1      27  Sporty
## Ford Mustang V8               12164       USA           1      19  Sporty
## Ford Probe                    11470       USA           3      30  Sporty
## Honda Civic CRX Si 4           9410     Japan           5      33  Sporty
## Honda Prelude Si 4WS 4        13945     Japan           5      27  Sporty
## Nissan 240SX 4                13249     Japan           3      24  Sporty
## Plymouth Laser                10855       USA          NA      26  Sporty
## Subaru XT 4                   13071     Japan          NA      28  Sporty
## Audi 80 4                     18900   Germany          NA      27 Compact
## Buick Skylark 4               10565       USA           2      23 Compact
## Chevrolet Beretta 4           10320       USA           1      26 Compact
## Chrysler Le Baron V6          10945       USA           4      25 Compact
## Ford Tempo 4                   9483       USA           2      24 Compact
## Honda Accord 4                12145 Japan/USA           5      26 Compact
## Mazda 626 4                   12459 Japan/USA           4      24 Compact
## Mitsubishi Galant 4           10989     Japan           5      25 Compact
## Mitsubishi Sigma V6           17879     Japan           4      21 Compact
## Nissan Stanza 4               11650     Japan           5      21 Compact
## Oldsmobile Calais 4            9995       USA           2      23 Compact
## Peugeot 405 4                 15930    France          NA      24 Compact
## Subaru Legacy 4               11499 Japan/USA           5      23 Compact
## Toyota Camry 4                11588 Japan/USA           5      27 Compact
## Volvo 240 4                   18450    Sweden           3      23 Compact
## Acura Legend V6               24760     Japan           5      20  Medium
## Buick Century 4               13150       USA           3      21  Medium
## Chrysler Le Baron Coupe       12495       USA           2      22  Medium
## Chrysler New Yorker V6        16342       USA           3      22  Medium
## Eagle Premier V6              15350       USA           2      22  Medium
## Ford Taurus V6                13195       USA           3      22  Medium
## Ford Thunderbird V6           14980       USA           1      23  Medium
## Hyundai Sonata 4               9999     Korea          NA      23  Medium
## Mazda 929 V6                  23300     Japan           5      21  Medium
## Nissan Maxima V6              17899     Japan           5      22  Medium
## Oldsmobile Cutlass Ciera 4    13150       USA           2      21  Medium
## Oldsmobile Cutlass Supreme V6 14495       USA          NA      21  Medium
## Toyota Cressida 6             21498     Japan           3      23  Medium
## Buick Le Sabre V6             16145       USA           3      23   Large
## Chevrolet Caprice V8          14525       USA           1      18   Large
## Ford LTD Crown Victoria V8    17257       USA           3      20   Large
## Chevrolet Lumina APV V6       13995       USA          NA      18     Van
## Dodge Grand Caravan V6        15395       USA           3      18     Van
## Ford Aerostar V6              12267       USA           3      18     Van
## Mazda MPV V6                  14944     Japan           5      19     Van
## Mitsubishi Wagon 4            14929     Japan          NA      20     Van
## Nissan Axxess 4               13949     Japan          NA      20     Van
## Nissan Van 4                  14799     Japan          NA      19     Van
##                               Weight Disp.  HP
## Eagle Summit 4                  2560    97 113
## Ford Escort   4                 2345   114  90
## Ford Festiva 4                  1845    81  63
## Honda Civic 4                   2260    91  92
## Mazda Protege 4                 2440   113 103
## Mercury Tracer 4                2285    97  82
## Nissan Sentra 4                 2275    97  90
## Pontiac LeMans 4                2350    98  74
## Subaru Loyale 4                 2295   109  90
## Subaru Justy 3                  1900    73  73
## Toyota Corolla 4                2390    97 102
## Toyota Tercel 4                 2075    89  78
## Volkswagen Jetta 4              2330   109 100
## Chevrolet Camaro V8             3320   305 170
## Dodge Daytona                   2885   153 100
## Ford Mustang V8                 3310   302 225
## Ford Probe                      2695   133 110
## Honda Civic CRX Si 4            2170    97 108
## Honda Prelude Si 4WS 4          2710   125 140
## Nissan 240SX 4                  2775   146 140
## Plymouth Laser                  2840   107  92
## Subaru XT 4                     2485   109  97
## Audi 80 4                       2670   121 108
## Buick Skylark 4                 2640   151 110
## Chevrolet Beretta 4             2655   133  95
## Chrysler Le Baron V6            3065   181 141
## Ford Tempo 4                    2750   141  98
## Honda Accord 4                  2920   132 125
## Mazda 626 4                     2780   133 110
## Mitsubishi Galant 4             2745   122 102
## Mitsubishi Sigma V6             3110   181 142
## Nissan Stanza 4                 2920   146 138
## Oldsmobile Calais 4             2645   151 110
## Peugeot 405 4                   2575   116 120
## Subaru Legacy 4                 2935   135 130
## Toyota Camry 4                  2920   122 115
## Volvo 240 4                     2985   141 114
## Acura Legend V6                 3265   163 160
## Buick Century 4                 2880   151 110
## Chrysler Le Baron Coupe         2975   153 150
## Chrysler New Yorker V6          3450   202 147
## Eagle Premier V6                3145   180 150
## Ford Taurus V6                  3190   182 140
## Ford Thunderbird V6             3610   232 140
## Hyundai Sonata 4                2885   143 110
## Mazda 929 V6                    3480   180 158
## Nissan Maxima V6                3200   180 160
## Oldsmobile Cutlass Ciera 4      2765   151 110
## Oldsmobile Cutlass Supreme V6   3220   189 135
## Toyota Cressida 6               3480   180 190
## Buick Le Sabre V6               3325   231 165
## Chevrolet Caprice V8            3855   305 170
## Ford LTD Crown Victoria V8      3850   302 150
## Chevrolet Lumina APV V6         3195   151 110
## Dodge Grand Caravan V6          3735   202 150
## Ford Aerostar V6                3665   182 145
## Mazda MPV V6                    3735   181 150
## Mitsubishi Wagon 4              3415   143 107
## Nissan Axxess 4                 3185   146 138
## Nissan Van 4                    3690   146 106

head(car.test.frame)

##                  Price   Country Reliability Mileage  Type Weight Disp.
## Eagle Summit 4    8895       USA           4      33 Small   2560    97
## Ford Escort   4   7402       USA           2      33 Small   2345   114
## Ford Festiva 4    6319     Korea           4      37 Small   1845    81
## Honda Civic 4     6635 Japan/USA           5      32 Small   2260    91
## Mazda Protege 4   6599     Japan           5      32 Small   2440   113
## Mercury Tracer 4  8672    Mexico           4      26 Small   2285    97
##                   HP
## Eagle Summit 4   113
## Ford Escort   4   90
## Ford Festiva 4    63
## Honda Civic 4     92
## Mazda Protege 4  103
## Mercury Tracer 4  82

# Regression tree: predict Mileage from Weight on car.test.frame, using
# rpart's default control settings.
z.auto <- rpart(formula = Mileage ~ Weight, data = car.test.frame)
prettyTree(z.auto)

# Complexity-parameter table for the regression tree.
printcp(z.auto)

## 
## Regression tree:
## rpart(formula = Mileage ~ Weight, data = car.test.frame)
## 
## Variables actually used in tree construction:
## [1] Weight
## 
## Root node error: 1354.6/60 = 22.576
## 
## n= 60 
## 
##         CP nsplit rel error  xerror     xstd
## 1 0.595349      0   1.00000 1.06992 0.183244
## 2 0.134528      1   0.40465 0.55470 0.103161
## 3 0.012828      2   0.27012 0.40922 0.078510
## 4 0.010000      3   0.25729 0.41304 0.078199

# Prune the regression tree at complexity parameter 0.1.
zp <- prune(tree = z.auto, cp = 0.1)
# Plot the smaller rpart object.
plot(zp)

prettyTree(zp)

# Complexity-parameter table of the pruned tree.
printcp(zp)

## 
## Regression tree:
## rpart(formula = Mileage ~ Weight, data = car.test.frame)
## 
## Variables actually used in tree construction:
## [1] Weight
## 
## Root node error: 1354.6/60 = 22.576
## 
## n= 60 
## 
##        CP nsplit rel error  xerror    xstd
## 1 0.59535      0   1.00000 1.06992 0.18324
## 2 0.13453      1   0.40465 0.55470 0.10316
## 3 0.10000      2   0.27012 0.40922 0.07851

###############################
#
#     Documentation for rpart
#
################################
# ?rpart
#rpart(formula, data, control, ...)

# formula a formula, with a response but no interaction terms. 
# ?formula

#data a data frame in which to interpret the variables named in the formula. 
#control a list of options that control details of the rpart algorithm. See rpart.control. 

# ?rpart.control
#rpart.control(minsplit = 20, minbucket = round(minsplit/3),maxdepth = 30, ...) 
#maxdepth  - Set the maximum depth of any node of the final tree, with the root node counted as depth 0. 
#minsplit  - the minimum number of observations that must exist in a node in order for a split to be attempted.  
#minbucket - the minimum number of observations in any terminal <leaf> node. 
#             If only one of minbucket or minsplit is specified, 
#             the code either sets minsplit to minbucket*3 or minbucket to minsplit/3, as appropriate. 

# ?prune

ML_Day2_Classification-2_SimpleTreeTab

Monday, February 02, 2015