# k-Nearest Neighbour Cross-Validatory Classification
# This has some errors:
# Iris: leave-one-out cross-validated k-NN ----
# Reads the iris CSV (four numeric columns plus a class column), standardises
# each numeric column, and reports the leave-one-out misclassification rate of
# k-NN, first for k = 3 and then for every k in 1..50.
rm(list = ls())
library(class)
## Warning: package 'class' was built under R version 3.1.2
library(e1071)
## Warning: package 'e1071' was built under R version 3.1.3

# Stored as iris_raw so that the built-in datasets::iris is not masked.
iris_raw <- read.table(
  "c:/Users/Andrew/SkyDrive/AGZ_Home/workspace_R/UCSC/MachinLearning/All_data/irisdata.csv",
  sep = ",",
  header = FALSE
)
str(iris_raw)
## 'data.frame': 150 obs. of 5 variables:
## $ V1: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ V2: num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ V3: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ V4: num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ V5: Factor w/ 3 levels "Iris-setosa",..: 1 1 1 1 1 1 1 1 1 1 ...

features <- iris_raw[, 1:4]
# factor() gives a factor on every R version; since R 4.0 read.table() keeps
# strings as character by default, so the recorded "Factor" column above is
# no longer guaranteed without it.
species <- factor(iris_raw[, 5])

### Scale the data
# Centre a column on its mean and divide by its sample standard deviation.
standardize <- function(x) {
  (x - mean(x)) / sqrt(var(x))
}
features_scaled <- features
features_scaled[] <- lapply(features, standardize)

#### Perform cross validation on the new data
# Leave-one-out misclassification rate of k-NN on the scaled columns.
#   k: number of neighbours passed to knn.cv().
# Returns the share of rows whose predicted class differs from the true one.
loo_error <- function(k) {
  predicted <- knn.cv(features_scaled, species, k = k)
  mean(predicted != species)
}

loo_error(3)
## [1] 0.05333333

# Error rate for k = 1..50; Err[k] is the rate for that k.
Err <- vapply(1:50, loo_error, numeric(1))
Err
## [1] 0.05333333 0.06666667 0.05333333 0.06666667 0.05333333 0.05333333
## [7] 0.04000000 0.04000000 0.04666667 0.04666667 0.04666667 0.04000000
## [13] 0.03333333 0.04000000 0.03333333 0.03333333 0.03333333 0.04000000
## [19] 0.04666667 0.05333333 0.05333333 0.05333333 0.05333333 0.05333333
## [25] 0.04666667 0.04666667 0.05333333 0.06000000 0.05333333 0.06666667
## [31] 0.05333333 0.06666667 0.05333333 0.06000000 0.05333333 0.06000000
## [37] 0.08666667 0.09333333 0.10000000 0.10000000 0.11333333 0.11333333
## [43] 0.10666667 0.10666667 0.10666667 0.11333333 0.11333333 0.12666667
## [49] 0.12666667 0.12666667
plot(Err)
# Wine quality: Naive Bayes on the red-wine table ----
library(klaR)
## Warning: package 'klaR' was built under R version 3.1.3
## Loading required package: MASS
## Warning: package 'MASS' was built under R version 3.1.2

# Load the semicolon-separated table; the first row holds the column names.
wdata <- read.csv(
  file = "c:/Users/Andrew/SkyDrive/AGZ_Home/workspace_R/UCSC/MachinLearning/All_data/winequality-red.csv",
  sep = ";",
  header = TRUE
)
str(wdata)
## 'data.frame': 1599 obs. of 12 variables:
## $ fixed.acidity : num 7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 7.5 ...
## $ volatile.acidity : num 0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.5 ...
## $ citric.acid : num 0 0 0.04 0.56 0 0 0.06 0 0.02 0.36 ...
## $ residual.sugar : num 1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 6.1 ...
## $ chlorides : num 0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.071 ...
## $ free.sulfur.dioxide : num 11 25 15 17 11 13 15 15 9 17 ...
## $ total.sulfur.dioxide: num 34 67 54 60 34 40 59 21 18 102 ...
## $ density : num 0.998 0.997 0.997 0.998 0.998 ...
## $ pH : num 3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.35 ...
## $ sulphates : num 0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.8 ...
## $ alcohol : num 9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 10.5 ...
## $ quality : int 5 5 5 6 5 5 5 7 7 5 ...

# The integer quality score becomes the class label, so it must be a factor.
wdata[["quality"]] <- as.factor(wdata[["quality"]])

# Fit Naive Bayes with quality as the response and all other columns as inputs.
mod <- NaiveBayes(quality ~ ., data = wdata)

# Predict a class for every row from the first 11 columns (the inputs).
qualityHat <- predict(mod, newdata = wdata[, seq_len(11)])
## Warning in FUN(1:1599[[1599L]], ...): Numerical 0 probability for all
## classes with observation 34
## Warning in FUN(1:1599[[1599L]], ...): Numerical 0 probability for all
## classes with observation 82
## Warning in FUN(1:1599[[1599L]], ...): Numerical 0 probability for all
## classes with observation 87
## Warning in FUN(1:1599[[1599L]], ...): Numerical 0 probability for all
## classes with observation 92
## Warning in FUN(1:1599[[1599L]], ...): Numerical 0 probability for all
## classes with observation 93
## Warning in FUN(1:1599[[1599L]], ...): Numerical 0 probability for all
## classes with observation 107
## Warning in FUN(1:1599[[1599L]], ...): Numerical 0 probability for all
## classes with observation 152
## Warning in FUN(1:1599[[1599L]], ...): Numerical 0 probability for all
## classes with observation 170
## Warning in FUN(1:1599[[1599L]], ...): Numerical 0 probability for all
## classes with observation 227
## Warning in FUN(1:1599[[1599L]], ...): Numerical 0 probability for all
## classes with observation 259
## Warning in FUN(1:1599[[1599L]], ...): Numerical 0 probability for all
## classes with observation 282
## Warning in FUN(1:1599[[1599L]], ...): Numerical 0 probability for all
## classes with observation 325
## Warning in FUN(1:1599[[1599L]], ...): Numerical 0 probability for all
## classes with observation 326
## Warning in FUN(1:1599[[1599L]], ...): Numerical 0 probability for all
## classes with observation 355
## Warning in FUN(1:1599[[1599L]], ...): Numerical 0 probability for all
## classes with observation 397
## Warning in FUN(1:1599[[1599L]], ...): Numerical 0 probability for all
## classes with observation 401
## Warning in FUN(1:1599[[1599L]], ...): Numerical 0 probability for all
## classes with observation 443
## Warning in FUN(1:1599[[1599L]], ...): Numerical 0 probability for all
## classes with observation 452
## Warning in FUN(1:1599[[1599L]], ...): Numerical 0 probability for all
## classes with observation 481
## Warning in FUN(1:1599[[1599L]], ...): Numerical 0 probability for all
## classes with observation 545
## Warning in FUN(1:1599[[1599L]], ...): Numerical 0 probability for all
## classes with observation 555
## Warning in FUN(1:1599[[1599L]], ...): Numerical 0 probability for all
## classes with observation 556
## Warning in FUN(1:1599[[1599L]], ...): Numerical 0 probability for all
## classes with observation 558
## Warning in FUN(1:1599[[1599L]], ...): Numerical 0 probability for all
## classes with observation 650
## Warning in FUN(1:1599[[1599L]], ...): Numerical 0 probability for all
## classes with observation 653
## Warning in FUN(1:1599[[1599L]], ...): Numerical 0 probability for all
## classes with observation 693
## Warning in FUN(1:1599[[1599L]], ...): Numerical 0 probability for all
## classes with observation 724
## Warning in FUN(1:1599[[1599L]], ...): Numerical 0 probability for all
## classes with observation 755
## Warning in FUN(1:1599[[1599L]], ...): Numerical 0 probability for all
## classes with observation 1018
## Warning in FUN(1:1599[[1599L]], ...): Numerical 0 probability for all
## classes with observation 1019
## Warning in FUN(1:1599[[1599L]], ...): Numerical 0 probability for all
## classes with observation 1052
## Warning in FUN(1:1599[[1599L]], ...): Numerical 0 probability for all
## classes with observation 1080
## Warning in FUN(1:1599[[1599L]], ...): Numerical 0 probability for all
## classes with observation 1082
## Warning in FUN(1:1599[[1599L]], ...): Numerical 0 probability for all
## classes with observation 1166
## Warning in FUN(1:1599[[1599L]], ...): Numerical 0 probability for all
## classes with observation 1236
## Warning in FUN(1:1599[[1599L]], ...): Numerical 0 probability for all
## classes with observation 1245
## Warning in FUN(1:1599[[1599L]], ...): Numerical 0 probability for all
## classes with observation 1261
## Warning in FUN(1:1599[[1599L]], ...): Numerical 0 probability for all
## classes with observation 1317
## Warning in FUN(1:1599[[1599L]], ...): Numerical 0 probability for all
## classes with observation 1320
## Warning in FUN(1:1599[[1599L]], ...): Numerical 0 probability for all
## classes with observation 1322
## Warning in FUN(1:1599[[1599L]], ...): Numerical 0 probability for all
## classes with observation 1371
## Warning in FUN(1:1599[[1599L]], ...): Numerical 0 probability for all
## classes with observation 1373
## Warning in FUN(1:1599[[1599L]], ...): Numerical 0 probability for all
## classes with observation 1435
## Warning in FUN(1:1599[[1599L]], ...): Numerical 0 probability for all
## classes with observation 1436
## Warning in FUN(1:1599[[1599L]], ...): Numerical 0 probability for all
## classes with observation 1475
## Warning in FUN(1:1599[[1599L]], ...): Numerical 0 probability for all
## classes with observation 1477
## Warning in FUN(1:1599[[1599L]], ...): Numerical 0 probability for all
## classes with observation 1559
## Warning in FUN(1:1599[[1599L]], ...): Numerical 0 probability for all
## classes with observation 1575
# Misclassification rate: share of rows whose predicted class differs from
# the recorded quality. The predictions were made on the rows used to fit
# the model.
is_hit <- qualityHat$class == wdata$quality
Err <- 1 - sum(is_hit) / length(wdata$quality)
Err
## [1] 0.4396498

# Scatterplot matrix of the 11 predictor columns, to see whether the
# predictors are correlated with one another.
pairs(wdata[, seq_len(11)])
# Sonar training set: Naive Bayes ----
rm(list = ls())
library(class)
library(e1071)
library(klaR)
library(MASS)
library(rpart)

# The file has no header row, so read.csv names the columns V1, V2, ...
sonar.train <- read.csv(
  file = "c:/Users/Andrew/SkyDrive/AGZ_Home/workspace_R/UCSC/MachinLearning/All_data/sonar_train.csv",
  header = FALSE
)

# V61 is the response in the model formula below; store it as a factor.
sonar.train[["V61"]] <- as.factor(sonar.train[["V61"]])

# Fit Naive Bayes with V61 as the class and every other column as an input.
m <- NaiveBayes(V61 ~ ., data = sonar.train)

# No new data is supplied to predict(); the result holds the predicted
# classes ($class) and the posterior probabilities ($posterior).
out <- predict(m)
print(out)
## $class
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
## -1 -1 -1 1 1 1 -1 -1 1 -1 -1 1 -1 -1 -1 1 -1 1
## 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
## 1 1 1 -1 -1 -1 1 1 -1 1 1 1 1 1 1 1 1 1
## 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54
## 1 -1 1 1 -1 -1 1 1 -1 1 1 -1 -1 1 1 -1 1 -1
## 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
## 1 1 1 1 -1 1 1 -1 -1 -1 1 -1 -1 1 1 1 1 1
## 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90
## 1 -1 1 1 -1 -1 -1 1 1 1 1 -1 -1 1 1 -1 -1 1
## 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108
## 1 1 -1 1 -1 1 -1 1 1 -1 1 -1 1 -1 -1 1 -1 -1
## 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126
## 1 1 1 -1 1 -1 -1 1 1 1 1 1 1 -1 1 1 1 1
## 127 128 129 130
## 1 1 1 -1
## Levels: -1 1
##
## $posterior
## -1 1
## 1 8.881357e-01 1.118643e-01
## 2 9.226195e-01 7.738046e-02
## 3 8.352420e-01 1.647580e-01
## 4 7.548232e-04 9.992452e-01
## 5 2.215917e-03 9.977841e-01
## 6 2.199107e-01 7.800893e-01
## 7 9.999916e-01 8.394604e-06
## 8 1.000000e+00 3.526443e-08
## 9 2.264687e-03 9.977353e-01
## 10 9.999287e-01 7.126706e-05
## 11 9.998488e-01 1.512303e-04
## 12 5.873904e-07 9.999994e-01
## 13 8.209233e-01 1.790767e-01
## 14 1.000000e+00 8.181413e-25
## 15 1.000000e+00 1.142066e-20
## 16 2.104438e-05 9.999790e-01
## 17 8.273463e-01 1.726537e-01
## 18 5.439995e-05 9.999456e-01
## 19 1.516730e-10 1.000000e+00
## 20 6.203807e-08 9.999999e-01
## 21 4.072985e-09 1.000000e+00
## 22 9.994150e-01 5.849732e-04
## 23 1.000000e+00 2.497489e-14
## 24 1.000000e+00 1.153955e-17
## 25 2.782794e-10 1.000000e+00
## 26 1.930564e-11 1.000000e+00
## 27 9.999990e-01 9.874675e-07
## 28 4.734148e-08 1.000000e+00
## 29 4.820644e-09 1.000000e+00
## 30 6.949773e-09 1.000000e+00
## 31 3.100849e-03 9.968992e-01
## 32 2.614337e-04 9.997386e-01
## 33 1.207641e-01 8.792359e-01
## 34 1.455026e-03 9.985450e-01
## 35 1.927845e-04 9.998072e-01
## 36 2.639213e-06 9.999974e-01
## 37 1.560637e-01 8.439363e-01
## 38 9.866632e-01 1.333681e-02
## 39 9.234668e-04 9.990765e-01
## 40 1.385964e-05 9.999861e-01
## 41 9.986399e-01 1.360062e-03
## 42 1.000000e+00 5.615179e-28
## 43 4.229634e-04 9.995770e-01
## 44 1.811450e-09 1.000000e+00
## 45 1.000000e+00 5.385597e-11
## 46 9.094518e-04 9.990905e-01
## 47 6.114460e-03 9.938855e-01
## 48 1.000000e+00 7.284899e-13
## 49 9.982653e-01 1.734671e-03
## 50 1.569383e-04 9.998431e-01
## 51 8.933634e-05 9.999107e-01
## 52 1.000000e+00 2.717223e-21
## 53 3.139103e-08 1.000000e+00
## 54 1.000000e+00 4.652442e-10
## 55 1.538105e-08 1.000000e+00
## 56 6.003119e-04 9.993997e-01
## 57 4.320710e-03 9.956793e-01
## 58 3.048948e-08 1.000000e+00
## 59 9.997234e-01 2.765930e-04
## 60 3.420753e-09 1.000000e+00
## 61 7.082992e-07 9.999993e-01
## 62 1.000000e+00 1.455488e-08
## 63 7.106943e-01 2.893057e-01
## 64 1.000000e+00 2.655883e-13
## 65 9.608133e-02 9.039187e-01
## 66 1.000000e+00 1.484666e-20
## 67 9.994873e-01 5.126685e-04
## 68 1.973459e-04 9.998027e-01
## 69 8.857094e-09 1.000000e+00
## 70 5.929248e-08 9.999999e-01
## 71 8.414735e-02 9.158527e-01
## 72 5.303363e-04 9.994697e-01
## 73 2.437883e-02 9.756212e-01
## 74 1.000000e+00 8.740729e-31
## 75 6.739580e-09 1.000000e+00
## 76 1.193981e-07 9.999999e-01
## 77 1.000000e+00 1.920985e-31
## 78 8.362850e-01 1.637150e-01
## 79 9.808375e-01 1.916246e-02
## 80 3.119519e-04 9.996880e-01
## 81 3.406868e-07 9.999997e-01
## 82 1.267471e-01 8.732529e-01
## 83 2.748177e-03 9.972518e-01
## 84 1.000000e+00 3.519397e-09
## 85 1.000000e+00 5.593130e-10
## 86 6.959272e-10 1.000000e+00
## 87 2.081319e-03 9.979187e-01
## 88 1.000000e+00 1.568282e-11
## 89 9.597234e-01 4.027663e-02
## 90 2.510368e-03 9.974896e-01
## 91 3.548111e-10 1.000000e+00
## 92 1.375940e-03 9.986241e-01
## 93 9.999999e-01 1.033665e-07
## 94 6.105136e-03 9.938949e-01
## 95 1.000000e+00 5.402941e-20
## 96 2.129960e-08 1.000000e+00
## 97 1.000000e+00 5.699283e-20
## 98 4.554711e-01 5.445289e-01
## 99 4.550436e-05 9.999545e-01
## 100 1.000000e+00 2.486021e-18
## 101 1.944132e-03 9.980559e-01
## 102 9.999005e-01 9.946086e-05
## 103 2.732287e-04 9.997268e-01
## 104 7.667169e-01 2.332831e-01
## 105 9.999706e-01 2.944875e-05
## 106 4.520615e-04 9.995479e-01
## 107 1.000000e+00 2.933938e-12
## 108 1.000000e+00 2.098601e-49
## 109 3.989550e-06 9.999960e-01
## 110 1.488243e-07 9.999999e-01
## 111 3.989449e-05 9.999601e-01
## 112 1.000000e+00 8.010984e-85
## 113 7.775588e-06 9.999922e-01
## 114 1.000000e+00 2.728208e-52
## 115 9.999996e-01 3.777361e-07
## 116 6.572959e-09 1.000000e+00
## 117 7.772927e-08 9.999999e-01
## 118 4.788386e-01 5.211614e-01
## 119 3.473898e-05 9.999653e-01
## 120 2.464291e-01 7.535709e-01
## 121 2.150560e-01 7.849440e-01
## 122 1.000000e+00 5.540592e-13
## 123 3.035623e-04 9.996964e-01
## 124 3.231961e-08 1.000000e+00
## 125 7.504222e-02 9.249578e-01
## 126 1.110127e-11 1.000000e+00
## 127 4.286204e-08 1.000000e+00
## 128 1.607793e-02 9.839221e-01
## 129 5.452969e-05 9.999455e-01
## 130 9.643454e-01 3.565455e-02
# Naive Bayes: share of rows whose predicted class differs from V61.
nb_hit <- out$class == sonar.train$V61
Err <- 1 - sum(nb_hit) / length(sonar.train$V61)
Err # NaiveBayes, error = 0.2384615
## [1] 0.2384615

# k-NN on the same table: labels are V61, inputs are every column but the 61st.
train.labels <- sonar.train[["V61"]]
train <- sonar.train[, -61]

# Leave-one-out error of k-NN for k = 1..20; Err[k] is the rate for that k.
Err <- vapply(
  seq_len(20),
  function(kk) {
    cv_pred <- knn.cv(train, train.labels, k = kk)
    1 - sum(train.labels == cv_pred) / length(cv_pred)
  },
  numeric(1)
)
Err
## [1] 0.2000000 0.2076923 0.2615385 0.3000000 0.3000000 0.2923077 0.3000000
## [8] 0.2923077 0.2923077 0.3076923 0.3000000 0.3000000 0.3153846 0.3230769
## [15] 0.3076923 0.3076923 0.3076923 0.3153846 0.3230769 0.3307692
plot(Err)

# Index of the smallest error, i.e. the k with the lowest error rate.
bestk <- which.min(Err)
bestk
## [1] 1
# k = 1 gives the lowest cross-validated error for the sonar training set, error = 0.200000
# Conclusion - comparing k-nearest neighbours with Naive Bayes on the sonar training set:
# k-nearest neighbours with k = 1 does best, with a leave-one-out error of 0.2,
# versus Naive Bayes, which also does well with an error of 0.2384.
# Note: the Naive Bayes error is measured on the same rows the model was fitted to,
# whereas the k-NN error is leave-one-out, so the two figures are not strictly like-for-like.
# Which method is 'best' for classifying the sonar data set depends on the cost of errors,
# the ease of computation and the data set itself.
#After running the KfirstNearestNeighbor.R file
# Sonar (train and test files pooled): effect of scaling and of k on k-NN ----
rm(list = ls())
oldpar <- par(no.readonly = TRUE) # snapshot of the current graphics settings
# oldpar$mar
par(mar = rep(1, 4)) # make the margins smaller for RStudio
library(class)
library(e1071)

STrain <- read.table(
  file = "c:/Users/Andrew/SkyDrive/AGZ_Home/workspace_R/UCSC/MachinLearning/All_data/sonar_train.csv",
  header = FALSE,
  sep = ","
)
STest <- read.table(
  file = "c:/Users/Andrew/SkyDrive/AGZ_Home/workspace_R/UCSC/MachinLearning/All_data/sonar_test.csv",
  header = FALSE,
  sep = ","
)

# Stack both files; columns 1-60 are the inputs, column 61 the class label.
Sonar <- rbind(STrain, STest)
train <- Sonar[, 1:60]
labels <- Sonar[, 61]

# Leave-one-out misclassification rate of k-NN.
#   x:  table of inputs, one row per observation
#   cl: true class of each row
#   k:  number of neighbours
# knn.cv uses leave-one-out cross validation to classify the rows of x.
cv_error <- function(x, cl, k) {
  pred <- knn.cv(x, cl, k = k)
  1 - sum(cl == pred) / length(pred)
}

## Error with one nearest neighbour, unscaled inputs ----
cv_error(train, labels, 1)
## [1] 0.1730769

## Scale each column: subtract its mean, divide by its standard deviation ----
train2 <- train
train2[] <- lapply(train, function(col) (col - mean(col)) / sqrt(var(col)))
cv_error(train2, labels, 1)
## [1] 0.125

## For one nearest neighbour ----
## error before scaling = 0.173 ----
## error after scaling = 0.125 ----

# Cross-validation over k = 1..20 on the scaled inputs.
Err <- vapply(
  seq_len(20),
  function(kk) cv_error(train2, labels, kk),
  numeric(1)
)
Err
## [1] 0.1250000 0.1250000 0.1346154 0.1682692 0.1778846 0.2067308 0.1923077
## [8] 0.1971154 0.2067308 0.2163462 0.2403846 0.2788462 0.2740385 0.2596154
## [15] 0.2740385 0.2788462 0.2836538 0.2836538 0.2884615 0.2740385
plot(Err)
## k = 1 gives the best result ----

# Out of curiosity: the same sweep without normalising the inputs.
Err <- vapply(
  seq_len(20),
  function(kk) cv_error(train, labels, kk),
  numeric(1)
)
Err
## [1] 0.1730769 0.2019231 0.1875000 0.1826923 0.1730769 0.2067308 0.2307692
## [8] 0.2548077 0.2644231 0.2932692 0.3221154 0.3269231 0.3413462 0.3317308
## [15] 0.3317308 0.3365385 0.3413462 0.3317308 0.3269231 0.3221154
plot(Err)
# Mixture simulation data: load, plot, and separate with least squares ----
mixSim <- read.table(
  file = "c:/Users/Andrew/SkyDrive/AGZ_Home/workspace_R/UCSC/MachinLearning/All_data/mixtureSimData.data"
)

# The first column holds 400 values: rows 1-200 become column 1 of the
# matrix and rows 201-400 become column 2 (R fills matrices column by column).
mixSimMat <- matrix(0.0, nrow = 200, ncol = 2)
mixSimMat[] <- mixSim[1:400, 1]

# Y is the class or target: rows 1-100 are class 1, rows 101-200 are class 2.
Y <- rep(c(1.0, 2.0), each = 100)

plot(mixSimMat)
points(mixSimMat[101:200, 1:2], col = 2) # class 2 in red
points(mixSimMat[1:100, 1:2], col = 3) # class 1 in green

## Separate the classes using ordinary least squares linear regression ----
# The classes are coded 1 and 2, so the dividing line is where the fitted
# value equals 1.5:
#   1.5 = c1 + c2 * x + c3 * y            (ci is coef[i])
# which rearranges to
#   y = (1.5 - c1) / c3 - (c2 / c3) * x
# abline(a, b) takes the intercept first and the slope second, so below
# `a` is the intercept (1.5 - c1) / c3 and `b` is the slope -(c2 / c3).
linMod <- lm(Y ~ mixSimMat)
coef <- linMod[["coefficients"]]
a <- (1.5 - coef[1]) / coef[3]
b <- -coef[2] / coef[3]
abline(a = a, b = b)
# Build a 100 x 100 grid of points covering the range of the data ----
maxX1 <- max(mixSimMat[, 1])
minX1 <- min(mixSimMat[, 1])
maxX2 <- max(mixSimMat[, 2])
minX2 <- min(mixSimMat[, 2])

# XX and YY are the 100 grid coordinates along each axis; together with ZZ
# they are used to create the black contour curve in the plot.
XX <- ((1:100) * (maxX1 - minX1)) / 100 + minX1
YY <- ((1:100) * (maxX2 - minX2)) / 100 + minX2

# testMat lists all 10000 grid points, one per row: column 1 is the first
# coordinate and column 2 the second. The second coordinate varies fastest,
# so testMat[index, ] = (XX[i], YY[j]) where index <- (i - 1) * 100 + j.
testMat <- matrix(0.0, nrow = 10000, ncol = 2)
testMat[, 1] <- rep(XX, each = 100)
testMat[, 2] <- rep(YY, times = 100)

# Spot check of that indexing for one grid point. Lines starting "##" are
# the recorded output of this run; lines starting "#[1]" are values noted in
# the original script, which do not match the recorded run.
i <- 7
j <- 2
index <- (i - 1) * 100 + j
testMat[index, ]
## [1] 28.93 10.72
#[1] -2.05241 -1.90274
XX[i]
## [1] 28.93
#[1] -2.05241
YY[j]
## [1] 10.72
#[1] -1.90274
#knn plots [4] ----
#knn(train, test, cl, k = 1)
# train is the training data
# test is the test data which knn will classify using k nearest neighbors
# cl = factor of true classifications of training set
# use k nearest neighbors
# library() stops with an error if the package is missing; require() would
# only return FALSE and let the script fail later at the first knn() call.
library(class)

# Reshape k-NN predictions for the grid into the matrix contour() expects.
#   pred: factor of predicted classes, one per row of the grid, ordered so
#         that element (i - 1) * n + j belongs to grid point (XX[i], YY[j])
#   n:    number of grid points along each axis
# Returns an n x n numeric matrix whose [i, j] entry is the class code
# (1 or 2) predicted for (XX[i], YY[j]).
knn_grid_codes <- function(pred, n = 100) {
  matrix(as.numeric(pred), nrow = n, ncol = n, byrow = TRUE)
}

# Draw the grid coloured by predicted class, the training points, and the
# boundary between the two predicted classes.
#   grid:      two-column matrix of grid points
#   pred:      factor of predicted classes for the rows of `grid`
#   codes:     matrix from knn_grid_codes(pred)
#   train_pts: two-column matrix of training points; rows 1-100 are class 1
#              and rows 101-200 are class 2
#   xs, ys:    grid coordinates along each axis, as passed to contour()
draw_knn_regions <- function(grid, pred, codes, train_pts, xs, ys) {
  # A logical mask is used instead of negative indices: grid[-which(...), ]
  # selects no rows at all when which() comes back empty.
  is_class1 <- pred == 1
  plot(grid, pch = ".", xlab = "testMat[,1]", ylab = "testMat[,2]")
  # Region colours follow the training-point colours (class 1 green,
  # class 2 red); previously the two background colours were swapped.
  points(grid[is_class1, , drop = FALSE], col = 3, cex = 0.2, pch = 20)
  points(grid[!is_class1, , drop = FALSE], col = 2, cex = 0.2, pch = 20)
  points(train_pts[1:100, ], col = 3)
  points(train_pts[101:200, ], col = 2)
  # Class codes are 1 and 2, so the 1.5 level is the border between them.
  contour(xs, ys, codes, levels = 1.5, drawlabels = FALSE, add = TRUE)
}

# First knn plot (uses 15 nearest neighbours)
KNN <- knn(mixSimMat, testMat, Y, k = 15)
# ZZ[i, j] = predicted class of the point (XX[i], YY[j])
ZZ <- knn_grid_codes(KNN)

# Spot check: the prediction for one grid point agrees in KNN and in ZZ.
i <- 7
j <- 2
index <- (i - 1) * 100 + j
KNN[index] # prediction for testMat[index,] is 1st level = 1
## [1] 1
## Levels: 1 2
ZZ[i, j] # [1] 1 ... they match
## [1] 1

I1 <- which(KNN == 1) # row numbers of the grid points predicted as class 1
draw_knn_regions(testMat, KNN, ZZ, mixSimMat, XX, YY)

# Second knn plot (uses 5 nearest neighbours)
KNN <- knn(mixSimMat, testMat, Y, k = 5)
ZZ <- knn_grid_codes(KNN)
I1 <- which(KNN == 1)
draw_knn_regions(testMat, KNN, ZZ, mixSimMat, XX, YY)

# Third knn plot (uses 1 nearest neighbour)
KNN <- knn(mixSimMat, testMat, Y, k = 1)
ZZ <- knn_grid_codes(KNN)
I1 <- which(KNN == 1)
draw_knn_regions(testMat, KNN, ZZ, mixSimMat, XX, YY)
## Use cross-validation to find the best k ----
# Leave-one-out error of k-NN on the mixture data for k = 1..50;
# Err[k] is the misclassification rate for that k.
Err <- numeric(50)
for (kk in seq_along(Err)) {
  out <- knn.cv(mixSimMat, Y, k = kk)
  n_correct <- sum(Y == out)
  Error <- 1 - n_correct / length(out)
  Err[kk] <- Error
}
Err
## [1] 0.300 0.285 0.215 0.240 0.200 0.195 0.185 0.175 0.185 0.185 0.185
## [12] 0.190 0.185 0.195 0.200 0.195 0.195 0.195 0.190 0.185 0.195 0.195
## [23] 0.190 0.200 0.215 0.205 0.200 0.215 0.205 0.215 0.225 0.220 0.240
## [34] 0.240 0.220 0.215 0.230 0.230 0.245 0.245 0.250 0.255 0.265 0.265
## [45] 0.275 0.275 0.275 0.270 0.270 0.270
# par(mar=rep(3, 4))
plot(Err)

# Smallest error and every k that attains it.
min(Err)
## [1] 0.175
which(Err == min(Err))
## [1] 8
# In the recorded run above the minimum error is 0.175, reached at k = 8.
# The original script notes a different result (minimum 0.165, tied at
# k = 5 and k = 13), apparently from another run or data file; there k = 13
# was chosen as the best k because it is the simpler of the two tied models.