# Use the tree fitted below to predict the class label (either + or -) for each of the following test observations:
library(rpart)
# Load the training table (first line of the file is the header row)
# and echo it to the console.
train <- read.csv(
  file = "C:/Users/Andrew/SkyDrive/AGZ_Home/workspace_R/UCSC/MachinLearning/All_data/table4_8pg199.txt",
  header = TRUE
)
print(train)
## Instance a1 a2 a3 Target
## 1 1 TRUE TRUE 1 1
## 2 2 TRUE TRUE 6 1
## 3 3 TRUE FALSE 5 0
## 4 4 FALSE FALSE 4 1
## 5 5 FALSE TRUE 7 0
## 6 6 FALSE TRUE 3 0
## 7 7 FALSE FALSE 8 0
## 8 8 TRUE FALSE 7 1
## 9 9 FALSE TRUE 5 0
# Show the storage type of every column: a1 and a2 are read as logical,
# a3 as numeric, Instance and Target as integer.
str(train)
## 'data.frame': 9 obs. of 5 variables:
## $ Instance: int 1 2 3 4 5 6 7 8 9
## $ a1 : logi TRUE TRUE TRUE FALSE FALSE FALSE ...
## $ a2 : logi TRUE TRUE FALSE FALSE TRUE TRUE ...
## $ a3 : num 1 6 5 4 7 3 8 7 5
## $ Target : int 1 1 0 1 0 0 0 1 0
# Response: column 5 (Target) turned into a factor with levels 0 and 1.
y <- as.factor(train[[5]])
print(y)
## [1] 1 1 0 1 0 0 0 1 0
## Levels: 0 1
# Predictors: columns 2 to 4 (a1, a2, a3); the Instance id in column 1
# is left out.
x <- train[2:4]
print(x)
## a1 a2 a3
## 1 TRUE TRUE 1
## 2 TRUE TRUE 6
## 3 TRUE FALSE 5
## 4 FALSE FALSE 4
## 5 FALSE TRUE 7
## 6 FALSE TRUE 3
## 7 FALSE FALSE 8
## 8 TRUE FALSE 7
## 9 FALSE TRUE 5
# NOTE(review): this repeats the earlier str(train) call unchanged; possibly
# str(x) was intended at this point -- confirm.
str(train)
## 'data.frame': 9 obs. of 5 variables:
## $ Instance: int 1 2 3 4 5 6 7 8 9
## $ a1 : logi TRUE TRUE TRUE FALSE FALSE FALSE ...
## $ a2 : logi TRUE TRUE FALSE FALSE TRUE TRUE ...
## $ a3 : num 1 6 5 4 7 3 8 7 5
## $ Target : int 1 1 0 1 0 0 0 1 0
# Echo the predictor data frame first, then the response factor.
print(x)
print(y)
## a1 a2 a3
## 1 TRUE TRUE 1
## 2 TRUE TRUE 6
## 3 TRUE FALSE 5
## 4 FALSE FALSE 4
## 5 FALSE TRUE 7
## 6 FALSE TRUE 3
## 7 FALSE FALSE 8
## 8 TRUE FALSE 7
## 9 FALSE TRUE 5
## [1] 1 1 0 1 0 0 0 1 0
## Levels: 0 1
# Grow a classification tree on the training data. minsplit = 0 and
# minbucket = 0 lift the node-size limits, so splitting can continue down to
# single-observation leaves; the depth is capped at 5.
fit <- rpart(
  formula = y ~ .,
  data = x,
  control = rpart.control(minsplit = 0, minbucket = 0, maxdepth = 5)
)
print(fit)
## n= 9
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 9 4 0 (0.5555556 0.4444444)
## 2) a1< 0.5 5 1 0 (0.8000000 0.2000000)
## 4) a2>=0.5 3 0 0 (1.0000000 0.0000000) *
## 5) a2< 0.5 2 1 0 (0.5000000 0.5000000)
## 10) a3>=6 1 0 0 (1.0000000 0.0000000) *
## 11) a3< 6 1 0 1 (0.0000000 1.0000000) *
## 3) a1>=0.5 4 1 1 (0.2500000 0.7500000)
## 6) a2< 0.5 2 1 0 (0.5000000 0.5000000)
## 12) a3< 6 1 0 0 (1.0000000 0.0000000) *
## 13) a3>=6 1 0 1 (0.0000000 1.0000000) *
## 7) a2>=0.5 2 0 1 (0.0000000 1.0000000) *
# Draw the tree skeleton, then add the node labels onto that same plot.
plot(fit)
text(fit)

# Now start to predict using "real" data: read the test observations from
# both the .csv file and the .txt file.
test.csv <- read.csv(
  file = "C:/Users/Andrew/SkyDrive/AGZ_Home/workspace_R/UCSC/MachinLearning/All_data/HW2_Q2.csv",
  header = TRUE
)
test.txt <- read.csv(
  file = "C:/Users/Andrew/SkyDrive/AGZ_Home/workspace_R/UCSC/MachinLearning/All_data/HW2_Q2.txt",
  header = TRUE
)
## Warning in read.table(file = file, header = header, sep = sep, quote =
## quote, : incomplete final line found by readTableHeader on
## 'C:/Users/Andrew/SkyDrive/AGZ_Home/workspace_R/UCSC/MachinLearning/All_data/HW2_Q2.txt'
# Echo both test sets.
# NOTE(review): the two files do not hold the same rows -- observation 2
# differs in a2 and observation 3 differs in both a1 and a2.
test.csv
## Observation a1 a2 a3
## 1 1 TRUE TRUE 2.5
## 2 2 TRUE FALSE 5.5
## 3 3 FALSE TRUE 2.5
## 4 4 FALSE FALSE 8.5
test.txt
## Observation a1 a2 a3
## 1 1 TRUE TRUE 2.5
## 2 2 TRUE TRUE 5.5
## 3 3 TRUE FALSE 2.5
## 4 4 FALSE FALSE 8.5
# Check that the columns were parsed with the same types as the training
# predictors (logical a1/a2, numeric a3).
str(test.csv)
## 'data.frame': 4 obs. of 4 variables:
## $ Observation: int 1 2 3 4
## $ a1 : logi TRUE TRUE FALSE FALSE
## $ a2 : logi TRUE FALSE TRUE FALSE
## $ a3 : num 2.5 5.5 2.5 8.5
str(test.txt)
## 'data.frame': 4 obs. of 4 variables:
## $ Observation: int 1 2 3 4
## $ a1 : logi TRUE TRUE TRUE FALSE
## $ a2 : logi TRUE TRUE FALSE FALSE
## $ a3 : num 2.5 5.5 2.5 8.5
# Predict a class label for every test observation, once per test file.
predict_test.csv <- predict(fit, test.csv, type = "class")
# Score the .txt test set with its own rows: they are not identical to the
# rows in test.csv, so passing test.csv here would only duplicate
# predict_test.csv and leave the .txt observations unscored.
predict_test.txt <- predict(fit, test.txt, type = "class")
# With no new data supplied, predict() reports the class probabilities of the
# training observations.
predict(fit, type = "prob")
## 0 1
## 1 0 1
## 2 0 1
## 3 1 0
## 4 0 1
## 5 1 0
## 6 1 0
## 7 1 0
## 8 0 1
## 9 1 0
# Print the fitted tree again.
fit
## n= 9
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 9 4 0 (0.5555556 0.4444444)
## 2) a1< 0.5 5 1 0 (0.8000000 0.2000000)
## 4) a2>=0.5 3 0 0 (1.0000000 0.0000000) *
## 5) a2< 0.5 2 1 0 (0.5000000 0.5000000)
## 10) a3>=6 1 0 0 (1.0000000 0.0000000) *
## 11) a3< 6 1 0 1 (0.0000000 1.0000000) *
## 3) a1>=0.5 4 1 1 (0.2500000 0.7500000)
## 6) a2< 0.5 2 1 0 (0.5000000 0.5000000)
## 12) a3< 6 1 0 0 (1.0000000 0.0000000) *
## 13) a3>=6 1 0 1 (0.0000000 1.0000000) *
## 7) a2>=0.5 2 0 1 (0.0000000 1.0000000) *
# Per-node table behind the printed tree: one row per node, with `var` naming
# the split variable (or <leaf> for terminal nodes). yval holds the factor
# level code, so 1 and 2 here correspond to classes 0 and 1 in the printed
# tree.
fit$frame
## var n wt dev yval complexity ncompete nsurrogate yval2.V1 yval2.V2
## 1 a1 9 9 4 1 0.500 2 0 1.0000000 5.0000000
## 2 a2 5 5 1 1 0.125 1 0 1.0000000 4.0000000
## 4 <leaf> 3 3 0 1 0.010 0 0 1.0000000 3.0000000
## 5 a3 2 2 1 1 0.125 0 0 1.0000000 1.0000000
## 10 <leaf> 1 1 0 1 0.010 0 0 1.0000000 1.0000000
## 11 <leaf> 1 1 0 2 0.010 0 0 2.0000000 0.0000000
## 3 a2 4 4 1 2 0.125 1 0 2.0000000 1.0000000
## 6 a3 2 2 1 1 0.125 0 0 1.0000000 1.0000000
## 12 <leaf> 1 1 0 1 0.010 0 0 1.0000000 1.0000000
## 13 <leaf> 1 1 0 2 0.010 0 0 2.0000000 0.0000000
## 7 <leaf> 2 2 0 2 0.010 0 0 2.0000000 0.0000000
## yval2.V3 yval2.V4 yval2.V5 yval2.nodeprob
## 1 4.0000000 0.5555556 0.4444444 1.0000000
## 2 1.0000000 0.8000000 0.2000000 0.5555556
## 4 0.0000000 1.0000000 0.0000000 0.3333333
## 5 1.0000000 0.5000000 0.5000000 0.2222222
## 10 0.0000000 1.0000000 0.0000000 0.1111111
## 11 1.0000000 0.0000000 1.0000000 0.1111111
## 3 3.0000000 0.2500000 0.7500000 0.4444444
## 6 1.0000000 0.5000000 0.5000000 0.2222222
## 12 0.0000000 1.0000000 0.0000000 0.1111111
## 13 1.0000000 0.0000000 1.0000000 0.1111111
## 7 2.0000000 0.0000000 1.0000000 0.2222222
# First row, first column of the frame: the split variable used at the root.
fit$frame[1,1]
## [1] a1
## Levels: <leaf> a1 a2 a3
# Explicit print(); produces the same listing as auto-printing fit.
print(fit)
## n= 9
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 9 4 0 (0.5555556 0.4444444)
## 2) a1< 0.5 5 1 0 (0.8000000 0.2000000)
## 4) a2>=0.5 3 0 0 (1.0000000 0.0000000) *
## 5) a2< 0.5 2 1 0 (0.5000000 0.5000000)
## 10) a3>=6 1 0 0 (1.0000000 0.0000000) *
## 11) a3< 6 1 0 1 (0.0000000 1.0000000) *
## 3) a1>=0.5 4 1 1 (0.2500000 0.7500000)
## 6) a2< 0.5 2 1 0 (0.5000000 0.5000000)
## 12) a3< 6 1 0 0 (1.0000000 0.0000000) *
## 13) a3>=6 1 0 1 (0.0000000 1.0000000) *
## 7) a2>=0.5 2 0 1 (0.0000000 1.0000000) *