# rpart on the Sonar Data
# 10 fold cross validation
# Plot and Explanation of Tree Diagram
# Confusion Matrix Created and Explained
# Author: PatriciaHoffman
###############################################################################
#The sonar data set contains 111 patterns obtained by bouncing sonar
#signals off a metal cylinder at various angles and under various
#conditions, and 97 patterns obtained from rocks under similar conditions.
#The object is to predict whether the returning signal is from a rock or a mine.
#Each returned sonar signal is an observation of 60 numbers in the range 0.0 to 1.0.
# These numbers are stored in the first 60 columns of the data set.
# The last column (column 61) is either +1, indicating a rock,
# or -1, indicating a mine.
rm(list=ls()) # this starts fresh ...
oldpar <- par(no.readonly=TRUE) # save the current graphics parameters
oldpar
## $xlog
## [1] FALSE
##
## $ylog
## [1] FALSE
##
## $adj
## [1] 0.5
##
## $ann
## [1] TRUE
##
## $ask
## [1] FALSE
##
## $bg
## [1] "transparent"
##
## $bty
## [1] "o"
##
## $cex
## [1] 1
##
## $cex.axis
## [1] 1
##
## $cex.lab
## [1] 1
##
## $cex.main
## [1] 1.2
##
## $cex.sub
## [1] 1
##
## $col
## [1] "black"
##
## $col.axis
## [1] "black"
##
## $col.lab
## [1] "black"
##
## $col.main
## [1] "black"
##
## $col.sub
## [1] "black"
##
## $crt
## [1] 0
##
## $err
## [1] 0
##
## $family
## [1] ""
##
## $fg
## [1] "black"
##
## $fig
## [1] 0 1 0 1
##
## $fin
## [1] 7 5
##
## $font
## [1] 1
##
## $font.axis
## [1] 1
##
## $font.lab
## [1] 1
##
## $font.main
## [1] 2
##
## $font.sub
## [1] 1
##
## $lab
## [1] 5 5 7
##
## $las
## [1] 0
##
## $lend
## [1] "round"
##
## $lheight
## [1] 1
##
## $ljoin
## [1] "round"
##
## $lmitre
## [1] 10
##
## $lty
## [1] "solid"
##
## $lwd
## [1] 1
##
## $mai
## [1] 1.02 0.82 0.82 0.42
##
## $mar
## [1] 5.1 4.1 4.1 2.1
##
## $mex
## [1] 1
##
## $mfcol
## [1] 1 1
##
## $mfg
## [1] 1 1 1 1
##
## $mfrow
## [1] 1 1
##
## $mgp
## [1] 3 1 0
##
## $mkh
## [1] 0.001
##
## $new
## [1] FALSE
##
## $oma
## [1] 0 0 0 0
##
## $omd
## [1] 0 1 0 1
##
## $omi
## [1] 0 0 0 0
##
## $pch
## [1] 1
##
## $pin
## [1] 5.76 3.16
##
## $plt
## [1] 0.1171429 0.9400000 0.2040000 0.8360000
##
## $ps
## [1] 12
##
## $pty
## [1] "m"
##
## $smo
## [1] 1
##
## $srt
## [1] 0
##
## $tck
## [1] NA
##
## $tcl
## [1] -0.5
##
## $usr
## [1] 0 1 0 1
##
## $xaxp
## [1] 0 1 5
##
## $xaxs
## [1] "r"
##
## $xaxt
## [1] "s"
##
## $xpd
## [1] FALSE
##
## $yaxp
## [1] 0 1 5
##
## $yaxs
## [1] "r"
##
## $yaxt
## [1] "s"
##
## $ylbias
## [1] 0.2
names(oldpar)
## [1] "xlog" "ylog" "adj" "ann" "ask"
## [6] "bg" "bty" "cex" "cex.axis" "cex.lab"
## [11] "cex.main" "cex.sub" "col" "col.axis" "col.lab"
## [16] "col.main" "col.sub" "crt" "err" "family"
## [21] "fg" "fig" "fin" "font" "font.axis"
## [26] "font.lab" "font.main" "font.sub" "lab" "las"
## [31] "lend" "lheight" "ljoin" "lmitre" "lty"
## [36] "lwd" "mai" "mar" "mex" "mfcol"
## [41] "mfg" "mfrow" "mgp" "mkh" "new"
## [46] "oma" "omd" "omi" "pch" "pin"
## [51] "plt" "ps" "pty" "smo" "srt"
## [56] "tck" "tcl" "usr" "xaxp" "xaxs"
## [61] "xaxt" "xpd" "yaxp" "yaxs" "yaxt"
## [66] "ylbias"
str(oldpar)
## List of 66
## $ xlog : logi FALSE
## $ ylog : logi FALSE
## $ adj : num 0.5
## $ ann : logi TRUE
## $ ask : logi FALSE
## $ bg : chr "transparent"
## $ bty : chr "o"
## $ cex : num 1
## $ cex.axis : num 1
## $ cex.lab : num 1
## $ cex.main : num 1.2
## $ cex.sub : num 1
## $ col : chr "black"
## $ col.axis : chr "black"
## $ col.lab : chr "black"
## $ col.main : chr "black"
## $ col.sub : chr "black"
## $ crt : num 0
## $ err : int 0
## $ family : chr ""
## $ fg : chr "black"
## $ fig : num [1:4] 0 1 0 1
## $ fin : num [1:2] 7 5
## $ font : int 1
## $ font.axis: int 1
## $ font.lab : int 1
## $ font.main: int 2
## $ font.sub : int 1
## $ lab : int [1:3] 5 5 7
## $ las : int 0
## $ lend : chr "round"
## $ lheight : num 1
## $ ljoin : chr "round"
## $ lmitre : num 10
## $ lty : chr "solid"
## $ lwd : num 1
## $ mai : num [1:4] 1.02 0.82 0.82 0.42
## $ mar : num [1:4] 5.1 4.1 4.1 2.1
## $ mex : num 1
## $ mfcol : int [1:2] 1 1
## $ mfg : int [1:4] 1 1 1 1
## $ mfrow : int [1:2] 1 1
## $ mgp : num [1:3] 3 1 0
## $ mkh : num 0.001
## $ new : logi FALSE
## $ oma : num [1:4] 0 0 0 0
## $ omd : num [1:4] 0 1 0 1
## $ omi : num [1:4] 0 0 0 0
## $ pch : int 1
## $ pin : num [1:2] 5.76 3.16
## $ plt : num [1:4] 0.117 0.94 0.204 0.836
## $ ps : int 12
## $ pty : chr "m"
## $ smo : num 1
## $ srt : num 0
## $ tck : num NA
## $ tcl : num -0.5
## $ usr : num [1:4] 0 1 0 1
## $ xaxp : num [1:3] 0 1 5
## $ xaxs : chr "r"
## $ xaxt : chr "s"
## $ xpd : logi FALSE
## $ yaxp : num [1:3] 0 1 5
## $ yaxs : chr "r"
## $ yaxt : chr "s"
## $ ylbias : num 0.2
oldpar$mar
## [1] 5.1 4.1 4.1 2.1
#par(mar=rep(4, 4)) # make the margins smaller for RStudio
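# When finished, the saved settings can restore the defaults; a minimal
# sketch of the standard idiom (left commented out so later plots are unaffected):
# par(oldpar)   # reset every graphics parameter captured above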
# the workspace now holds only oldpar (everything else was removed above)
require(rpart)
## Loading required package: rpart
#setwd("C:/Users/PatriciaHoffman/workspaceR/TestDataSets")
train <- read.csv("C:/Users/Andrew/SkyDrive/AGZ_Home/workspace_R/UCSC/MachinLearning/All_data/sonar_train.csv",header=FALSE)
head(train)
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10
## 1 0.0258 0.0433 0.0547 0.0681 0.0784 0.1250 0.1296 0.1729 0.2794 0.2954
## 2 0.0762 0.0666 0.0481 0.0394 0.0590 0.0649 0.1209 0.2467 0.3564 0.4459
## 3 0.0187 0.0346 0.0168 0.0177 0.0393 0.1630 0.2028 0.1694 0.2328 0.2684
## 4 0.0124 0.0433 0.0604 0.0449 0.0597 0.0355 0.0531 0.0343 0.1052 0.2120
## 5 0.0229 0.0369 0.0040 0.0375 0.0455 0.1452 0.2211 0.1188 0.0750 0.1631
## 6 0.0206 0.0132 0.0533 0.0569 0.0647 0.1432 0.1344 0.2041 0.1571 0.1573
## V11 V12 V13 V14 V15 V16 V17 V18 V19 V20
## 1 0.2506 0.2601 0.2249 0.2115 0.1270 0.1193 0.1794 0.2185 0.1646 0.0740
## 2 0.4152 0.3952 0.4256 0.4135 0.4528 0.5326 0.7306 0.6193 0.2032 0.4636
## 3 0.3108 0.2933 0.2275 0.0994 0.1801 0.2200 0.2732 0.2862 0.2034 0.1740
## 4 0.1640 0.1901 0.3026 0.2019 0.0592 0.2390 0.3657 0.3809 0.5929 0.6299
## 5 0.2709 0.3358 0.4091 0.4400 0.5485 0.7213 0.8137 0.9185 1.0000 0.9418
## 6 0.2327 0.1785 0.1507 0.1916 0.2061 0.2307 0.2360 0.1299 0.3812 0.5858
## V21 V22 V23 V24 V25 V26 V27 V28 V29 V30
## 1 0.0625 0.2381 0.4824 0.6372 0.7531 0.8959 0.9941 0.9957 0.9328 0.9344
## 2 0.4148 0.4292 0.5730 0.5399 0.3161 0.2285 0.6995 1.0000 0.7262 0.4724
## 3 0.4130 0.6879 0.8120 0.8453 0.8919 0.9300 0.9987 1.0000 0.8104 0.6199
## 4 0.5801 0.4574 0.4449 0.3691 0.6446 0.8940 0.8978 0.4980 0.3333 0.2350
## 5 0.9116 0.9349 0.7484 0.5146 0.4106 0.3443 0.6981 0.8713 0.9013 0.8014
## 6 0.4497 0.4876 1.0000 0.8675 0.4718 0.5341 0.6197 0.7143 0.5605 0.3728
## V31 V32 V33 V34 V35 V36 V37 V38 V39 V40
## 1 0.8854 0.7690 0.6865 0.6390 0.6378 0.6629 0.5983 0.4565 0.3129 0.4158
## 2 0.5103 0.5459 0.2881 0.0981 0.1951 0.4181 0.4604 0.3217 0.2828 0.2430
## 3 0.6041 0.5547 0.4160 0.1472 0.0849 0.0608 0.0969 0.1411 0.1676 0.1200
## 4 0.1553 0.3666 0.4340 0.3082 0.3024 0.4109 0.5501 0.4129 0.5499 0.5018
## 5 0.4380 0.1319 0.1709 0.2484 0.3044 0.2312 0.1338 0.2056 0.2474 0.2790
## 6 0.2481 0.1921 0.1386 0.3325 0.2883 0.3228 0.2607 0.2040 0.2396 0.1319
## V41 V42 V43 V44 V45 V46 V47 V48 V49 V50
## 1 0.4325 0.4031 0.4201 0.4557 0.3955 0.2966 0.2095 0.1558 0.0884 0.0265
## 2 0.1979 0.2444 0.1847 0.0841 0.0692 0.0528 0.0357 0.0085 0.0230 0.0046
## 3 0.1201 0.1036 0.1977 0.1339 0.0902 0.1085 0.1521 0.1363 0.0858 0.0290
## 4 0.3132 0.2802 0.2351 0.2298 0.1155 0.0724 0.0621 0.0318 0.0450 0.0167
## 5 0.1610 0.0056 0.0351 0.1148 0.1331 0.0276 0.0763 0.0631 0.0309 0.0240
## 6 0.0683 0.0334 0.0716 0.0976 0.0787 0.0522 0.0500 0.0231 0.0221 0.0144
## V51 V52 V53 V54 V55 V56 V57 V58 V59 V60
## 1 0.0121 0.0091 0.0062 0.0019 0.0045 0.0079 0.0031 0.0063 0.0048 0.0050
## 2 0.0156 0.0031 0.0054 0.0105 0.0110 0.0015 0.0072 0.0048 0.0107 0.0094
## 3 0.0203 0.0116 0.0098 0.0199 0.0033 0.0101 0.0065 0.0115 0.0193 0.0157
## 4 0.0078 0.0083 0.0057 0.0174 0.0188 0.0054 0.0114 0.0196 0.0147 0.0062
## 5 0.0115 0.0064 0.0022 0.0122 0.0151 0.0056 0.0026 0.0029 0.0104 0.0163
## 6 0.0307 0.0386 0.0147 0.0018 0.0100 0.0096 0.0077 0.0180 0.0109 0.0070
## V61
## 1 -1
## 2 1
## 3 -1
## 4 1
## 5 1
## 6 1
dim(train)
## [1] 130 61
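# As a quick sanity check (a sketch using the train data frame just loaded),
# the class balance in the label column V61 can be tabulated directly;
# the counts noted below follow from the breakdown worked out later in this script:
table(train$V61)   # -1 (mines): 66, +1 (rocks): 64; 66 + 64 = 130 rows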
nxval <- 10 # this is 10 fold cross validation
ndepth <- 10 # this creates trees from depth 1 to depth 10
trainOutput <- matrix(0.0,nrow = ndepth, ncol = 2) #Set up the matrix to hold the scores
testOutput <- matrix(0.0,nrow =ndepth, ncol = 2)
trainOutput
## [,1] [,2]
## [1,] 0 0
## [2,] 0 0
## [3,] 0 0
## [4,] 0 0
## [5,] 0 0
## [6,] 0 0
## [7,] 0 0
## [8,] 0 0
## [9,] 0 0
## [10,] 0 0
testOutput
## [,1] [,2]
## [1,] 0 0
## [2,] 0 0
## [3,] 0 0
## [4,] 0 0
## [5,] 0 0
## [6,] 0 0
## [7,] 0 0
## [8,] 0 0
## [9,] 0 0
## [10,] 0 0
I <- seq(from = 1, to = nrow(train))
I
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
## [18] 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34
## [35] 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51
## [52] 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68
## [69] 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85
## [86] 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102
## [103] 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119
## [120] 120 121 122 123 124 125 126 127 128 129 130
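# Folds are assigned by row index modulo nxval inside the loop below,
# so each of the 10 folds holds out 130/10 = 13 rows. A quick check (sketch):
table(I %% nxval)   # ten groups of 13 row indices each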
for(idepth in 1:ndepth){
  trainErr <- 0.0
  testErr <- 0.0
  for(ixval in seq(from = 1, to = nxval)){
    # hold out every row whose index is congruent to ixval mod nxval
    Iout <- which(I%%nxval == ixval%%nxval)
    trainIn <- train[-Iout,]
    trainOut <- train[Iout,]
    yin <- as.factor(trainIn[,61]) # want to predict a class, so make it a factor
    yout <- as.factor(trainOut[,61])
    xin <- trainIn[,1:60]
    xout <- trainOut[,1:60]
    #class(xin)
    fit <- rpart(yin~.,xin,control=rpart.control(maxdepth=idepth, minsplit=2))
    # calculate the training error for this fold
    trainErr <- trainErr + (1-sum(yin==predict(fit,xin,type="class"))/length(yin))
    # calculate the test (held-out fold) error
    testErr <- testErr + (1-sum(yout==predict(fit,xout,type="class"))/length(yout))
  }
  # average the errors over the nxval folds
  trainOutput[idepth,1] <- idepth
  trainOutput[idepth,2] <- trainErr/nxval
  testOutput[idepth,1] <- idepth
  testOutput[idepth,2] <- testErr/nxval
}
maxval <- max(testOutput[,2])
maxval
## [1] 0.3230769
plot(trainOutput, ylim=c(0,maxval),
     main="Model Complexity",
     xlab="Model Complexity = Tree Depth",
     ylab="Prediction Error"
)
legend("right", c("test", "train"), col = c(2,1), pch=1, cex=0.6)
points(testOutput, col = 2)

index <- which.min(testOutput[,2])
testOutput[index,2] #[1] 0.2230769
## [1] 0.2230769
index #[1] 1
## [1] 1
index <- which.min(trainOutput[,2])
trainOutput[index,2] #[1] 0.002564103
## [1] 0.002564103
index #[1] 6
## [1] 6
# Plot the last tree fit in the loop above: the most complicated model
# (depth 10, trained on the final cross-validation fold of 117 rows)
plot(fit)
text(fit)
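# plot.rpart and text.rpart accept cosmetic arguments that can make a deep
# tree easier to read; a sketch of common alternatives (not executed here):
# plot(fit, uniform = TRUE, margin = 0.05)  # uniform vertical spacing, outer margin
# text(fit, use.n = TRUE, cex = 0.7)        # show node counts, shrink labels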

print(fit)
## n= 117
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 117 56 -1 (0.52136752 0.47863248)
## 2) V11>=0.17095 74 20 -1 (0.72972973 0.27027027)
## 4) V17< 0.5562 50 6 -1 (0.88000000 0.12000000)
## 8) V5< 0.13635 46 3 -1 (0.93478261 0.06521739)
## 16) V49>=0.02475 39 0 -1 (1.00000000 0.00000000) *
## 17) V49< 0.02475 7 3 -1 (0.57142857 0.42857143)
## 34) V9>=0.1707 4 0 -1 (1.00000000 0.00000000) *
## 35) V9< 0.1707 3 0 1 (0.00000000 1.00000000) *
## 9) V5>=0.13635 4 1 1 (0.25000000 0.75000000)
## 18) V1>=0.0602 1 0 -1 (1.00000000 0.00000000) *
## 19) V1< 0.0602 3 0 1 (0.00000000 1.00000000) *
## 5) V17>=0.5562 24 10 1 (0.41666667 0.58333333)
## 10) V41< 0.14555 6 0 -1 (1.00000000 0.00000000) *
## 11) V41>=0.14555 18 4 1 (0.22222222 0.77777778)
## 22) V23>=0.8343 3 0 -1 (1.00000000 0.00000000) *
## 23) V23< 0.8343 15 1 1 (0.06666667 0.93333333)
## 46) V20>=0.9784 1 0 -1 (1.00000000 0.00000000) *
## 47) V20< 0.9784 14 0 1 (0.00000000 1.00000000) *
## 3) V11< 0.17095 43 7 1 (0.16279070 0.83720930)
## 6) V52>=0.0209 4 0 -1 (1.00000000 0.00000000) *
## 7) V52< 0.0209 39 3 1 (0.07692308 0.92307692)
## 14) V19>=0.8351 5 2 -1 (0.60000000 0.40000000)
## 28) V26< 0.6153 3 0 -1 (1.00000000 0.00000000) *
## 29) V26>=0.6153 2 0 1 (0.00000000 1.00000000) *
## 15) V19< 0.8351 34 0 1 (0.00000000 1.00000000) *
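# In the printout above, each node line reads: node), split, n, loss, yval, (yprob).
# When yval is -1, the (yprob) pair is just (n - loss)/n and loss/n;
# a worked check against node 2 (n = 74, loss = 20, yval = -1):
(74 - 20)/74   # 0.7297297, the -1 proportion printed for node 2
20/74          # 0.2702703, the +1 proportion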
post(fit,file="")

# How did this complicated model do on the Test Set?
test<-read.csv("C:/Users/Andrew/SkyDrive/AGZ_Home/workspace_R/UCSC/MachinLearning/All_data/sonar_test.csv",header=FALSE)
testansw <- as.factor(test[,61])
testobv <- test[,1:60]
dum <- predict(fit,testobv)
yhat <- rep(0.0,nrow(dum))
for(i in 1:nrow(dum)){
  # which.max returns 1 or 2; map column 1 -> -1 and column 2 -> +1
  yhat[i] <- 2*(which.max(dum[i,]) - 1) -1
}
testError <- (1-sum(testansw==yhat)/length(testansw))
# testError = [1] 0.2948718
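# An equivalent shortcut (a sketch; yhatClass and testErrorAlt are
# illustrative names): predict with type = "class" returns the predicted
# label directly, so the loop and the +1/-1 mapping are not needed.
yhatClass <- predict(fit, testobv, type = "class")
testErrorAlt <- 1 - mean(yhatClass == testansw)   # should match testError above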
#Look at the plots for the best tree depth (depth = 1)
# (the next line re-reads the same test set already loaded above)
test<-read.csv("C:/Users/Andrew/SkyDrive/AGZ_Home/workspace_R/UCSC/MachinLearning/All_data/sonar_test.csv",header=FALSE)
# Create model using training data set Decision Tree Depth = 1
yin <- as.factor(train[,61])
xin <- train[,1:60]
fitone <- rpart(yin~.,xin,control=rpart.control(maxdepth=1, minsplit=2))
plot(fitone)
text(fitone)

print(fitone)
## n= 130
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 130 64 -1 (0.5076923 0.4923077)
## 2) V11>=0.17095 79 21 -1 (0.7341772 0.2658228) *
## 3) V11< 0.17095 51 8 1 (0.1568627 0.8431373) *
post(fitone,file="")

# What do the numbers on the plot signify?
dim(xin) # 130 x 60; note 66 mines + 64 rocks = 130
## [1] 130 60
rocks<-which(train$V61 >0)
length(rocks) # 64 observations are rocks
## [1] 64
V11less<-which(xin$V11 <= 0.1709)
length(V11less) # 51 = 8 + 43 observations have V11 <= 0.17
## [1] 51
rocksSmall<-which(train[V11less,61]> 0)
length(rocksSmall) # 43 of these 51 are rocks (51-43 = 8)
## [1] 43
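# The other branch of the root split checks out the same way (sketch):
V11more <- which(xin$V11 > 0.1709)
length(V11more)             # 79 observations fall in node 2 (V11 >= 0.17095)
sum(train[V11more,61] > 0)  # 21 of them are rocks, matching the node's loss of 21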
# How did the simple model do on the Test Set?
# note that for predict, the default is type = "prob"
# here the prediction is the class with the greatest probability
dum <- predict(fitone,testobv)
yhat <- rep(0.0,nrow(dum))
for(i in 1:nrow(dum)){
yhat[i] <- 2*(which.max(dum[i,]) - 1) -1
}
testError <- (1-sum(testansw==yhat)/length(testansw))
#testError = [1] 0.2820513
# Yes, the simpler depth-1 model scores better on the test set
# (error 0.282 versus 0.295 for the deep tree)
# confusion table
table(test$V61,yhat)
## yhat
## -1 1
## -1 39 6
## 1 16 17
# Rows are the true labels; columns are the predictions.
# There are 78 test observations.
# 39 + 17 = 56 are correctly classified (the diagonal).
# 6 observations are classified +1 when the true answer is -1.
# 16 observations are classified -1 when the true answer is +1.
(16+6)/78 # [1] 0.2820513
## [1] 0.2820513
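# The same error rate can be read off the confusion table programmatically
# (a sketch; cm is an illustrative name):
cm <- table(test$V61, yhat)
(sum(cm) - sum(diag(cm)))/sum(cm)   # off-diagonal count / total = 0.2820513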
###################################
#
# investigate code used in calculating yhat
#
###################################
# for classification the default type is "prob"
# predict(model generated by rpart, observations)
# predict returns a matrix giving, for each observation, the predicted
# probability of each class
# for Sonar the matrix has two columns because there are two classes
# check out the predictions for the 55th and 56th observations
#
dum[55:56,]
## -1 1
## 55 0.7341772 0.2658228
## 56 0.1568627 0.8431373
# 58 + 21 = 79; 58/79 = 0.7341772
# Predict -1 for row 55, with probability 0.734...
# 8 + 43 = 51; 8/51 = 0.1568627; 43/51 = 0.8431373
# Predict +1 for row 56, with probability 0.843...
2*(which.max(dum[55,]) - 1) -1
## -1
## -1
# returns a -1 for observation 55
2*(which.max(dum[56,]) - 1) -1
## 1
## 1
# returns a +1 for observation 56
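# The same mapping can be done for every row at once (a sketch; yhatAll is
# an illustrative name): the column names of dum are the class labels, so
# the label of the highest-probability column can be taken directly.
yhatAll <- as.numeric(colnames(dum)[max.col(dum, ties.method = "first")])
all(yhatAll == yhat)   # TRUE: matches the 2*(which.max(.) - 1) - 1 mapping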
####################################
#
# Documentation
#
###################################
# ?predict.rpart
#Returns a vector of predicted responses from a fitted rpart object.
#
#Usage
### S3 method for class 'rpart'
#predict(object, newdata = list(),
# type = c("vector", "prob", "class", "matrix"),
# na.action = na.pass, ...)
#
# type - character string denoting the type of predicted value returned.
#   If the rpart object is a classification tree, then the default is to
#   return prob predictions, a matrix whose columns are the probability of
#   the first, second, etc. class. (This agrees with the default behavior
#   of tree.) Otherwise, a vector result is returned.
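# A short demonstration of the type argument (a sketch, reusing fitone and
# testobv from above):
head(predict(fitone, testobv, type = "prob"))    # probability matrix (the default here)
head(predict(fitone, testobv, type = "class"))   # factor of predicted class labels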