# Data: Iris
# Per-column summary of the built-in iris data frame: quartiles and mean for
# the four numeric measurements, level counts for the Species factor.
summary(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
# Same summary, restricted to the Petal.Width and Sepal.Width columns.
summary(iris[c("Petal.Width", "Sepal.Width")])
## Petal.Width Sepal.Width
## Min. :0.100 Min. :2.000
## 1st Qu.:0.300 1st Qu.:2.800
## Median :1.300 Median :3.000
## Mean :1.199 Mean :3.057
## 3rd Qu.:1.800 3rd Qu.:3.300
## Max. :2.500 Max. :4.400
library(class)
#' Min-max rescale a numeric vector onto the interval [0, 1].
#'
#' Each value is mapped to `(x - min(x)) / (max(x) - min(x))`.
#'
#' @param x Numeric vector to rescale.
#' @param na.rm If `TRUE`, missing values are ignored when computing the
#'   minimum and maximum (they stay `NA` in the result). The default `FALSE`
#'   keeps the historical behaviour: any `NA` in `x` makes the whole result
#'   `NA`.
#' @return A numeric vector the same length as `x`. A constant input (zero
#'   range) yields all zeros rather than `NaN`; an empty input yields
#'   `numeric(0)` without the warnings `min()`/`max()` would raise.
normalize <- function(x, na.rm = FALSE) {
  if (length(x) == 0L) {
    return(numeric(0))
  }
  lo <- min(x, na.rm = na.rm)
  span <- max(x, na.rm = na.rm) - lo
  # A zero range would give 0 / 0 = NaN for every element; dividing by 1
  # instead maps a constant vector to 0. isTRUE() keeps an NA span (NA present
  # with na.rm = FALSE) on the ordinary path so NA still propagates.
  if (isTRUE(span == 0)) {
    span <- 1
  }
  (x - lo) / span
}
# Rescale each of the four measurement columns (columns 1-4) with normalize(),
# keeping the result as a data frame, then summarise the rescaled columns.
iris_norm <- iris[1:4]
iris_norm[] <- lapply(iris_norm, normalize)
summary(iris_norm)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.2222 1st Qu.:0.3333 1st Qu.:0.1017 1st Qu.:0.08333
## Median :0.4167 Median :0.4167 Median :0.5678 Median :0.50000
## Mean :0.4287 Mean :0.4406 Mean :0.4675 Mean :0.45806
## 3rd Qu.:0.5833 3rd Qu.:0.5417 3rd Qu.:0.6949 3rd Qu.:0.70833
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.00000
# Reproducible random split: every row is independently assigned to group 1
# (training, probability 0.67) or group 2 (test, probability 0.33).
set.seed(1234)
ind <- sample(2, nrow(iris), replace = TRUE, prob = c(0.67, 0.33))

# Features are the four measurement columns of the raw iris data (everything
# except column 5); labels are the Species column.
iris.training <- iris[ind == 1, -5]
iris.trainLabels <- iris$Species[ind == 1]
iris.test <- iris[ind == 2, -5]
iris.testLabels <- iris$Species[ind == 2]

# Classify each test row from its 3 nearest training rows.
iris_pred <- knn(iris.training, iris.test, iris.trainLabels, k = 3)
iris_pred
## [1] setosa setosa setosa setosa setosa setosa
## [7] setosa setosa setosa setosa setosa setosa
## [13] versicolor versicolor versicolor versicolor versicolor versicolor
## [19] versicolor versicolor versicolor versicolor versicolor versicolor
## [25] virginica virginica virginica virginica versicolor virginica
## [31] virginica virginica virginica virginica virginica virginica
## [37] virginica virginica virginica virginica
## Levels: setosa versicolor virginica
library(gmodels)
## Warning: package 'gmodels' was built under R version 3.1.3
# Cross-tabulate the true test labels (rows) against the kNN predictions
# (columns); prop.chisq = FALSE leaves the chi-square contribution out of
# each cell.
CrossTable(x = iris.testLabels, y = iris_pred, prop.chisq=FALSE)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 40
##
##
## | iris_pred
## iris.testLabels | setosa | versicolor | virginica | Row Total |
## ----------------|------------|------------|------------|------------|
## setosa | 12 | 0 | 0 | 12 |
## | 1.000 | 0.000 | 0.000 | 0.300 |
## | 1.000 | 0.000 | 0.000 | |
## | 0.300 | 0.000 | 0.000 | |
## ----------------|------------|------------|------------|------------|
## versicolor | 0 | 12 | 0 | 12 |
## | 0.000 | 1.000 | 0.000 | 0.300 |
## | 0.000 | 0.923 | 0.000 | |
## | 0.000 | 0.300 | 0.000 | |
## ----------------|------------|------------|------------|------------|
## virginica | 0 | 1 | 15 | 16 |
## | 0.000 | 0.062 | 0.938 | 0.400 |
## | 0.000 | 0.077 | 1.000 | |
## | 0.000 | 0.025 | 0.375 | |
## ----------------|------------|------------|------------|------------|
## Column Total | 12 | 13 | 15 | 40 |
## | 0.300 | 0.325 | 0.375 | |
## ----------------|------------|------------|------------|------------|
##
##
#kNN Tutorial on Iris Data Set####
library(class) #Has the knn function
set.seed(4948493) #Set the seed for reproducibility
#Sample the Iris data set (70% train, 30% test)
ir_sample <- sample(nrow(iris), size = nrow(iris) * .7)
ir_train <- iris[ir_sample, ] #Select the 70% of rows
ir_test <- iris[-ir_sample, ] #Select the 30% of rows
#First Attempt to Determine Right K####
#Test-set accuracy for k = 1..50; preallocated so the loop writes in place
#instead of re-copying the vector on every iteration.
iris_acc <- numeric(50)
for (i in seq_along(iris_acc)) {
  #Apply knn with k = i (column 5, Species, is the label, not a feature)
  predict <- knn(ir_train[, -5], ir_test[, -5],
                 ir_train$Species, k = i)
  #Accuracy = share of test rows whose predicted species matches the truth
  iris_acc[i] <- mean(predict == ir_test$Species)
}
#Plot the error rate (1 - accuracy) for every k in iris_acc
#(the loop above fills it for k = 1 through 50)
plot(x = 1 - iris_acc, type = "l",
     main = "Error Rate for Iris With Varying K",
     xlab = "K", ylab = "Error Rate")
#Try many Samples of Iris Data Set to Validate K####
#trial_sum[k]: correct predictions for that k, summed over all trials
#trial_n[k]:   test rows scored for that k, summed over all trials
trial_sum <- numeric(20)
trial_n <- numeric(20)
set.seed(6033850)
for (i in seq_len(100)) {
  #Draw a fresh 70/30 train/test split for this trial
  ir_sample <- sample(nrow(iris), size = nrow(iris) * .7)
  ir_train <- iris[ir_sample, ]
  ir_test <- iris[-ir_sample, ]
  test_size <- nrow(ir_test)
  for (j in seq_along(trial_sum)) {
    #knn with k = j on this trial's split
    predict <- knn(train = ir_train[, -5], test = ir_test[, -5],
                   cl = ir_train$Species, k = j)
    trial_sum[j] <- trial_sum[j] + sum(predict == ir_test$Species)
  }
  #Every k was scored on the same test set, so add its size to all entries
  trial_n <- trial_n + test_size
}
#Pooled error rate per k: 1 - (total correct / total test rows scored)
plot(x = 1 - trial_sum / trial_n, type = "l",
     main = "Error Rate for Iris With Varying K (100 Samples)",
     xlab = "K", ylab = "Error Rate")