Machine Learning Day 2, Classification - 1. About sonar train data

# rpart on the Sonar Data
#    10 fold cross validation
#    Plot and Explanation of Tree Diagram 
#    Confusion Matrix Created and Explained
# Author: PatriciaHoffman
###############################################################################
#The whole sonar data set contains 111 patterns obtained by bouncing sonar
#signals off a metal cylinder at various angles and under various
#conditions.  The whole sonar data set contains 97 patterns obtained from
#rocks under similar conditions.  The object is to predict if the returning
#signal is from a rock or a mine.  
#Each returned radar signal is an observation of 60 numbers in the range 0.0 to 1.0
#  These numbers are stored in the first 60 columns of the data set
#  The last column (column 61) is either +1 indicating a rock or -1 indicating a mine
#   was found


rm(list=ls()) # this starts fresh ...
oldpar <- par(no.readonly=TRUE)
oldpar

## $xlog
## [1] FALSE
## 
## $ylog
## [1] FALSE
## 
## $adj
## [1] 0.5
## 
## $ann
## [1] TRUE
## 
## $ask
## [1] FALSE
## 
## $bg
## [1] "transparent"
## 
## $bty
## [1] "o"
## 
## $cex
## [1] 1
## 
## $cex.axis
## [1] 1
## 
## $cex.lab
## [1] 1
## 
## $cex.main
## [1] 1.2
## 
## $cex.sub
## [1] 1
## 
## $col
## [1] "black"
## 
## $col.axis
## [1] "black"
## 
## $col.lab
## [1] "black"
## 
## $col.main
## [1] "black"
## 
## $col.sub
## [1] "black"
## 
## $crt
## [1] 0
## 
## $err
## [1] 0
## 
## $family
## [1] ""
## 
## $fg
## [1] "black"
## 
## $fig
## [1] 0 1 0 1
## 
## $fin
## [1] 7 5
## 
## $font
## [1] 1
## 
## $font.axis
## [1] 1
## 
## $font.lab
## [1] 1
## 
## $font.main
## [1] 2
## 
## $font.sub
## [1] 1
## 
## $lab
## [1] 5 5 7
## 
## $las
## [1] 0
## 
## $lend
## [1] "round"
## 
## $lheight
## [1] 1
## 
## $ljoin
## [1] "round"
## 
## $lmitre
## [1] 10
## 
## $lty
## [1] "solid"
## 
## $lwd
## [1] 1
## 
## $mai
## [1] 1.02 0.82 0.82 0.42
## 
## $mar
## [1] 5.1 4.1 4.1 2.1
## 
## $mex
## [1] 1
## 
## $mfcol
## [1] 1 1
## 
## $mfg
## [1] 1 1 1 1
## 
## $mfrow
## [1] 1 1
## 
## $mgp
## [1] 3 1 0
## 
## $mkh
## [1] 0.001
## 
## $new
## [1] FALSE
## 
## $oma
## [1] 0 0 0 0
## 
## $omd
## [1] 0 1 0 1
## 
## $omi
## [1] 0 0 0 0
## 
## $pch
## [1] 1
## 
## $pin
## [1] 5.76 3.16
## 
## $plt
## [1] 0.1171429 0.9400000 0.2040000 0.8360000
## 
## $ps
## [1] 12
## 
## $pty
## [1] "m"
## 
## $smo
## [1] 1
## 
## $srt
## [1] 0
## 
## $tck
## [1] NA
## 
## $tcl
## [1] -0.5
## 
## $usr
## [1] 0 1 0 1
## 
## $xaxp
## [1] 0 1 5
## 
## $xaxs
## [1] "r"
## 
## $xaxt
## [1] "s"
## 
## $xpd
## [1] FALSE
## 
## $yaxp
## [1] 0 1 5
## 
## $yaxs
## [1] "r"
## 
## $yaxt
## [1] "s"
## 
## $ylbias
## [1] 0.2

names(oldpar)

##  [1] "xlog"      "ylog"      "adj"       "ann"       "ask"      
##  [6] "bg"        "bty"       "cex"       "cex.axis"  "cex.lab"  
## [11] "cex.main"  "cex.sub"   "col"       "col.axis"  "col.lab"  
## [16] "col.main"  "col.sub"   "crt"       "err"       "family"   
## [21] "fg"        "fig"       "fin"       "font"      "font.axis"
## [26] "font.lab"  "font.main" "font.sub"  "lab"       "las"      
## [31] "lend"      "lheight"   "ljoin"     "lmitre"    "lty"      
## [36] "lwd"       "mai"       "mar"       "mex"       "mfcol"    
## [41] "mfg"       "mfrow"     "mgp"       "mkh"       "new"      
## [46] "oma"       "omd"       "omi"       "pch"       "pin"      
## [51] "plt"       "ps"        "pty"       "smo"       "srt"      
## [56] "tck"       "tcl"       "usr"       "xaxp"      "xaxs"     
## [61] "xaxt"      "xpd"       "yaxp"      "yaxs"      "yaxt"     
## [66] "ylbias"

str(oldpar)

## List of 66
##  $ xlog     : logi FALSE
##  $ ylog     : logi FALSE
##  $ adj      : num 0.5
##  $ ann      : logi TRUE
##  $ ask      : logi FALSE
##  $ bg       : chr "transparent"
##  $ bty      : chr "o"
##  $ cex      : num 1
##  $ cex.axis : num 1
##  $ cex.lab  : num 1
##  $ cex.main : num 1.2
##  $ cex.sub  : num 1
##  $ col      : chr "black"
##  $ col.axis : chr "black"
##  $ col.lab  : chr "black"
##  $ col.main : chr "black"
##  $ col.sub  : chr "black"
##  $ crt      : num 0
##  $ err      : int 0
##  $ family   : chr ""
##  $ fg       : chr "black"
##  $ fig      : num [1:4] 0 1 0 1
##  $ fin      : num [1:2] 7 5
##  $ font     : int 1
##  $ font.axis: int 1
##  $ font.lab : int 1
##  $ font.main: int 2
##  $ font.sub : int 1
##  $ lab      : int [1:3] 5 5 7
##  $ las      : int 0
##  $ lend     : chr "round"
##  $ lheight  : num 1
##  $ ljoin    : chr "round"
##  $ lmitre   : num 10
##  $ lty      : chr "solid"
##  $ lwd      : num 1
##  $ mai      : num [1:4] 1.02 0.82 0.82 0.42
##  $ mar      : num [1:4] 5.1 4.1 4.1 2.1
##  $ mex      : num 1
##  $ mfcol    : int [1:2] 1 1
##  $ mfg      : int [1:4] 1 1 1 1
##  $ mfrow    : int [1:2] 1 1
##  $ mgp      : num [1:3] 3 1 0
##  $ mkh      : num 0.001
##  $ new      : logi FALSE
##  $ oma      : num [1:4] 0 0 0 0
##  $ omd      : num [1:4] 0 1 0 1
##  $ omi      : num [1:4] 0 0 0 0
##  $ pch      : int 1
##  $ pin      : num [1:2] 5.76 3.16
##  $ plt      : num [1:4] 0.117 0.94 0.204 0.836
##  $ ps       : int 12
##  $ pty      : chr "m"
##  $ smo      : num 1
##  $ srt      : num 0
##  $ tck      : num NA
##  $ tcl      : num -0.5
##  $ usr      : num [1:4] 0 1 0 1
##  $ xaxp     : num [1:3] 0 1 5
##  $ xaxs     : chr "r"
##  $ xaxt     : chr "s"
##  $ xpd      : logi FALSE
##  $ yaxp     : num [1:3] 0 1 5
##  $ yaxs     : chr "r"
##  $ yaxt     : chr "s"
##  $ ylbias   : num 0.2

oldpar$mar

## [1] 5.1 4.1 4.1 2.1

#par(mar=rep(4, 4)) # make the margins smaller for RStudio
# no variables exist now
require(rpart)

## Loading required package: rpart

#setwd("C:/Users/PatriciaHoffman/workspaceR/TestDataSets")

train <- read.csv("C:/Users/Andrew/SkyDrive/AGZ_Home/workspace_R/UCSC/MachinLearning/All_data/sonar_train.csv",header=FALSE)
head(train)

##       V1     V2     V3     V4     V5     V6     V7     V8     V9    V10
## 1 0.0258 0.0433 0.0547 0.0681 0.0784 0.1250 0.1296 0.1729 0.2794 0.2954
## 2 0.0762 0.0666 0.0481 0.0394 0.0590 0.0649 0.1209 0.2467 0.3564 0.4459
## 3 0.0187 0.0346 0.0168 0.0177 0.0393 0.1630 0.2028 0.1694 0.2328 0.2684
## 4 0.0124 0.0433 0.0604 0.0449 0.0597 0.0355 0.0531 0.0343 0.1052 0.2120
## 5 0.0229 0.0369 0.0040 0.0375 0.0455 0.1452 0.2211 0.1188 0.0750 0.1631
## 6 0.0206 0.0132 0.0533 0.0569 0.0647 0.1432 0.1344 0.2041 0.1571 0.1573
##      V11    V12    V13    V14    V15    V16    V17    V18    V19    V20
## 1 0.2506 0.2601 0.2249 0.2115 0.1270 0.1193 0.1794 0.2185 0.1646 0.0740
## 2 0.4152 0.3952 0.4256 0.4135 0.4528 0.5326 0.7306 0.6193 0.2032 0.4636
## 3 0.3108 0.2933 0.2275 0.0994 0.1801 0.2200 0.2732 0.2862 0.2034 0.1740
## 4 0.1640 0.1901 0.3026 0.2019 0.0592 0.2390 0.3657 0.3809 0.5929 0.6299
## 5 0.2709 0.3358 0.4091 0.4400 0.5485 0.7213 0.8137 0.9185 1.0000 0.9418
## 6 0.2327 0.1785 0.1507 0.1916 0.2061 0.2307 0.2360 0.1299 0.3812 0.5858
##      V21    V22    V23    V24    V25    V26    V27    V28    V29    V30
## 1 0.0625 0.2381 0.4824 0.6372 0.7531 0.8959 0.9941 0.9957 0.9328 0.9344
## 2 0.4148 0.4292 0.5730 0.5399 0.3161 0.2285 0.6995 1.0000 0.7262 0.4724
## 3 0.4130 0.6879 0.8120 0.8453 0.8919 0.9300 0.9987 1.0000 0.8104 0.6199
## 4 0.5801 0.4574 0.4449 0.3691 0.6446 0.8940 0.8978 0.4980 0.3333 0.2350
## 5 0.9116 0.9349 0.7484 0.5146 0.4106 0.3443 0.6981 0.8713 0.9013 0.8014
## 6 0.4497 0.4876 1.0000 0.8675 0.4718 0.5341 0.6197 0.7143 0.5605 0.3728
##      V31    V32    V33    V34    V35    V36    V37    V38    V39    V40
## 1 0.8854 0.7690 0.6865 0.6390 0.6378 0.6629 0.5983 0.4565 0.3129 0.4158
## 2 0.5103 0.5459 0.2881 0.0981 0.1951 0.4181 0.4604 0.3217 0.2828 0.2430
## 3 0.6041 0.5547 0.4160 0.1472 0.0849 0.0608 0.0969 0.1411 0.1676 0.1200
## 4 0.1553 0.3666 0.4340 0.3082 0.3024 0.4109 0.5501 0.4129 0.5499 0.5018
## 5 0.4380 0.1319 0.1709 0.2484 0.3044 0.2312 0.1338 0.2056 0.2474 0.2790
## 6 0.2481 0.1921 0.1386 0.3325 0.2883 0.3228 0.2607 0.2040 0.2396 0.1319
##      V41    V42    V43    V44    V45    V46    V47    V48    V49    V50
## 1 0.4325 0.4031 0.4201 0.4557 0.3955 0.2966 0.2095 0.1558 0.0884 0.0265
## 2 0.1979 0.2444 0.1847 0.0841 0.0692 0.0528 0.0357 0.0085 0.0230 0.0046
## 3 0.1201 0.1036 0.1977 0.1339 0.0902 0.1085 0.1521 0.1363 0.0858 0.0290
## 4 0.3132 0.2802 0.2351 0.2298 0.1155 0.0724 0.0621 0.0318 0.0450 0.0167
## 5 0.1610 0.0056 0.0351 0.1148 0.1331 0.0276 0.0763 0.0631 0.0309 0.0240
## 6 0.0683 0.0334 0.0716 0.0976 0.0787 0.0522 0.0500 0.0231 0.0221 0.0144
##      V51    V52    V53    V54    V55    V56    V57    V58    V59    V60
## 1 0.0121 0.0091 0.0062 0.0019 0.0045 0.0079 0.0031 0.0063 0.0048 0.0050
## 2 0.0156 0.0031 0.0054 0.0105 0.0110 0.0015 0.0072 0.0048 0.0107 0.0094
## 3 0.0203 0.0116 0.0098 0.0199 0.0033 0.0101 0.0065 0.0115 0.0193 0.0157
## 4 0.0078 0.0083 0.0057 0.0174 0.0188 0.0054 0.0114 0.0196 0.0147 0.0062
## 5 0.0115 0.0064 0.0022 0.0122 0.0151 0.0056 0.0026 0.0029 0.0104 0.0163
## 6 0.0307 0.0386 0.0147 0.0018 0.0100 0.0096 0.0077 0.0180 0.0109 0.0070
##   V61
## 1  -1
## 2   1
## 3  -1
## 4   1
## 5   1
## 6   1

dim(train)

## [1] 130  61

nxval <- 10  # this is 10 fold cross validation
ndepth <- 10 # this creates trees from depth 1 to depth 10
trainOutput <- matrix(0.0,nrow = ndepth, ncol = 2) #Set up the matrix to hold the scores
testOutput <- matrix(0.0,nrow =ndepth, ncol = 2)
trainOutput

##       [,1] [,2]
##  [1,]    0    0
##  [2,]    0    0
##  [3,]    0    0
##  [4,]    0    0
##  [5,]    0    0
##  [6,]    0    0
##  [7,]    0    0
##  [8,]    0    0
##  [9,]    0    0
## [10,]    0    0

testOutput

##       [,1] [,2]
##  [1,]    0    0
##  [2,]    0    0
##  [3,]    0    0
##  [4,]    0    0
##  [5,]    0    0
##  [6,]    0    0
##  [7,]    0    0
##  [8,]    0    0
##  [9,]    0    0
## [10,]    0    0

I <- seq(from = 1, to = nrow(train))
I

##   [1]   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
##  [18]  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34
##  [35]  35  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51
##  [52]  52  53  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68
##  [69]  69  70  71  72  73  74  75  76  77  78  79  80  81  82  83  84  85
##  [86]  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 101 102
## [103] 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119
## [120] 120 121 122 123 124 125 126 127 128 129 130

for(idepth in 1:ndepth){
    trainErr <- 0.0 
    testErr <- 0.0   
    for(ixval in seq(from =  1, to = nxval)){
        Iout <- which(I%%nxval == ixval%%nxval)
        trainIn <- train[-Iout,]
        trainOut <- train[Iout,]
        yin <- as.factor(trainIn[,61])  # Want to predict a class..
        yout <- as.factor(trainOut[,61])
        xin <- trainIn[,1:60]
        xout <- trainOut[,1:60]
        #class(xin)
        fit <- rpart(yin~.,xin,control=rpart.control(maxdepth=idepth, minsplit=2))
        #Calculate the training error
        trainErr <- trainErr + (1-sum(yin==predict(fit,xin,type="class"))/length(yin))
        #Calculate the test error
        testErr <- testErr + (1-sum(yout==predict(fit,xout,type="class"))/length(yout))
    }
    trainOutput[idepth,1] <- idepth
    trainOutput[idepth,2] <- trainErr/nxval
    testOutput[idepth,1] <- idepth
    testOutput[idepth,2] <- testErr/nxval
}    
maxval = max(testOutput[,2])
maxval

## [1] 0.3230769

plot(trainOutput, ylim=c(0,maxval),
        main="Model Complexity",
        xlab="Model Complexity = Tree Depth",
        ylab="Prediction Error"
    )
legend("right", c("test", "train"), col = c(2,1), pch=1,cex=0.6)
points(testOutput, col = 2)

index <- which.min(testOutput[,2]) 
testOutput[index,2]  #[1] 0.2230769

## [1] 0.2230769

index  #[1] 1

## [1] 1

index <- which.min(trainOutput[,2]) 
trainOutput[index,2]  #[1] 0.002564103

## [1] 0.002564103

index  #[1] 6

## [1] 6

#Plot for most complicated tree depth
plot(fit)
text(fit)

print(fit)

## n= 117 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 117 56 -1 (0.52136752 0.47863248)  
##    2) V11>=0.17095 74 20 -1 (0.72972973 0.27027027)  
##      4) V17< 0.5562 50  6 -1 (0.88000000 0.12000000)  
##        8) V5< 0.13635 46  3 -1 (0.93478261 0.06521739)  
##         16) V49>=0.02475 39  0 -1 (1.00000000 0.00000000) *
##         17) V49< 0.02475 7  3 -1 (0.57142857 0.42857143)  
##           34) V9>=0.1707 4  0 -1 (1.00000000 0.00000000) *
##           35) V9< 0.1707 3  0 1 (0.00000000 1.00000000) *
##        9) V5>=0.13635 4  1 1 (0.25000000 0.75000000)  
##         18) V1>=0.0602 1  0 -1 (1.00000000 0.00000000) *
##         19) V1< 0.0602 3  0 1 (0.00000000 1.00000000) *
##      5) V17>=0.5562 24 10 1 (0.41666667 0.58333333)  
##       10) V41< 0.14555 6  0 -1 (1.00000000 0.00000000) *
##       11) V41>=0.14555 18  4 1 (0.22222222 0.77777778)  
##         22) V23>=0.8343 3  0 -1 (1.00000000 0.00000000) *
##         23) V23< 0.8343 15  1 1 (0.06666667 0.93333333)  
##           46) V20>=0.9784 1  0 -1 (1.00000000 0.00000000) *
##           47) V20< 0.9784 14  0 1 (0.00000000 1.00000000) *
##    3) V11< 0.17095 43  7 1 (0.16279070 0.83720930)  
##      6) V52>=0.0209 4  0 -1 (1.00000000 0.00000000) *
##      7) V52< 0.0209 39  3 1 (0.07692308 0.92307692)  
##       14) V19>=0.8351 5  2 -1 (0.60000000 0.40000000)  
##         28) V26< 0.6153 3  0 -1 (1.00000000 0.00000000) *
##         29) V26>=0.6153 2  0 1 (0.00000000 1.00000000) *
##       15) V19< 0.8351 34  0 1 (0.00000000 1.00000000) *

post(fit,file="")

#  How did this complicated model do on the Test Set?
test<-read.csv("C:/Users/Andrew/SkyDrive/AGZ_Home/workspace_R/UCSC/MachinLearning/All_data/sonar_test.csv",header=FALSE)
testansw <- as.factor(test[,61])
testobv <- test[,1:60]
dum <- predict(fit,testobv)
yhat <- rep(0.0,nrow(dum))
for(i in 1:nrow(dum)){
    yhat[i] <- 2*(which.max(dum[i,]) - 1) -1            
}
testError <- (1-sum(testansw==yhat)/length(testansw))
# testError = [1] 0.2948718

#Look at the plots for the best tree depth
test<-read.csv("C:/Users/Andrew/SkyDrive/AGZ_Home/workspace_R/UCSC/MachinLearning/All_data/sonar_test.csv",header=FALSE)

# Create model using training data set Decision Tree Depth = 1
yin <- as.factor(train[,61])
xin <- train[,1:60]
fitone <- rpart(yin~.,xin,control=rpart.control(maxdepth=1, minsplit=2))


plot(fitone)
text(fitone)

print(fitone)

## n= 130 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 130 64 -1 (0.5076923 0.4923077)  
##   2) V11>=0.17095 79 21 -1 (0.7341772 0.2658228) *
##   3) V11< 0.17095 51  8 1 (0.1568627 0.8431373) *

post(fitone,file="")

# What do the numbers on the plot signify?
dim(xin)            # 130 x 60 note  66+64 = 130

## [1] 130  60

rocks<-which(train$V61 >0)
length(rocks)       # 64 observations are rocks

## [1] 64

V11less<-which(xin$V11 <= 0.1709) 
length(V11less)     # 51  = 8 + 43 observations have V11 <= 0.17

## [1] 51

rocksSmall<-which(train[V11less,61]> 0)
length(rocksSmall)  # 43 of these 51 are rocks (51-43 = 8)

## [1] 43

#  How did the simple model do on the Test Set?
#        note that for predict, the default is type = "prob"
#        here the prediction is the class with the greatest probability
dum <- predict(fitone,testobv)
yhat <- rep(0.0,nrow(dum))
for(i in 1:nrow(dum)){
    yhat[i] <- 2*(which.max(dum[i,]) - 1) -1            
}
testError <- (1-sum(testansw==yhat)/length(testansw))
    
#testError = [1] 0.2820513

# Yes the simpler model has a better resulting score

# confusion table
table(test$V61,yhat)

##     yhat
##      -1  1
##   -1 39  6
##   1  16 17

#yhat
#    -1  1
#-1  39  6
# 1  16 17

# There are 78 observations
# 39 + 17 = 56 have been correctly classified
# There are  6 observations coded incorrectly 
#            in which the answer should be -1 
# There are 16 observations coded incorrectly 
#            in which the answer should be +1

(16+6)/78   # [1] 0.2820513

## [1] 0.2820513

###################################
#
# investigate code used in calculating yhat
#
###################################

#for classification the default type is probability
#predict(model generated by rpart, observations)
#predict returns a vector representing probability of prediction for each observation
#for Sonar there are two colums as there are two classes
# check out the prediction for the 55th and 56 observations

#
dum[55:56,]

##           -1         1
## 55 0.7341772 0.2658228
## 56 0.1568627 0.8431373

#      -1         1
#55 0.7341772 0.2658228
#56 0.1568627 0.8431373

#58 + 21 = 79; 58/79 = 0.7341772
#Predict -1 for row 55 with probability 0.73...

#8+43 = 51; 8/51 = [1] 0.1568627
#Predict -1 for row 56 with probability 0.156...


2*(which.max(dum[55,]) - 1) -1

## -1 
## -1

# returns a -1 for observation 55
2*(which.max(dum[56,]) - 1) -1

## 1 
## 1

# returns a +1 for observation 56

####################################
#
# Documentation
#
###################################
# ?predict.rpart


#Returns a vector of predicted responses from a fitted rpart object. 
#
#Usage
### S3 method for class 'rpart'
#predict(object, newdata = list(),
#       type = c("vector", "prob", "class", "matrix"),
#       na.action = na.pass, ...)
#
#type  - character string denoting the type of predicted value returned. If the rpart object is a classification tree, then the default is to return prob predictions, a matrix whose columns are the probability of the first, second, etc. class. (This agrees with the default behavior of tree). 
#        Otherwise, a vector result is returned.

Machine Learning Day 2, Classification - 1. About sonar train data

Sunday, February 01, 2015