# Per-column summary: quartiles and mean for the four numeric measurements,
# and the number of rows per level for the Species factor
summary(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
## All variables are on similar scales, so normalization is assumed to be unnecessary for this data
# Inspect the structure: number of rows, and the type of every column
str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
## The factor variable (Species) is in the 5th column
# Number of rows belonging to each species
table(iris[["Species"]])
##
## setosa versicolor virginica
## 50 50 50
# Share of each species as a percentage, rounded to one decimal place
round(100 * prop.table(table(iris[["Species"]])), 1)
##
## setosa versicolor virginica
## 33.3 33.3 33.3
# Step 1: Data preparation - creating training and test datasets
# Randomly assign every row of iris to the training set (1) or the test set (2)
# with probabilities 0.7 and 0.3. Each row is drawn independently, so the
# realised split is only approximately 70% / 30%.
# The seed is fixed so that the partition, and every result derived from it,
# is the same on each run.
# NOTE(review): the outputs recorded in this transcript came from an unseeded
# run, so the exact row counts may differ from what this seed produces.
set.seed(1234)
ind <- sample(2, nrow(iris), replace = TRUE, prob = c(0.7, 0.3))
trainData <- iris[ind == 1, ]
testData <- iris[ind == 2, ]
# Drop the Species label so that only the numeric predictors remain
trainData1 <- trainData[, names(trainData) != "Species"]
testData1 <- testData[, names(testData) != "Species"]
# Confirm the row/column counts of each split, with and without the label column
invisible(lapply(
  list(trainData, trainData1, testData, testData1),
  function(split_df) print(dim(split_df))
))
## [1] 102 5
## [1] 102 4
## [1] 48 5
## [1] 48 4
# Class labels for the training rows. length() is used for the size check
# because a factor carries no dim attribute, so dim() only ever reports NULL.
iris_train_labels <- trainData$Species
length(iris_train_labels)
## [1] 102
class(iris_train_labels)
## [1] "factor"
# Class labels for the test rows, kept aside to compare against the predictions
iris_test_labels <- testData$Species
length(iris_test_labels)
## [1] 48
#**** Make sure the train and test labels are factor-type data
#install.packages("class")
library(class)
## Warning: package 'class' was built under R version 3.2.5
# Run k-nearest-neighbours with k = 3: the training predictors and their
# labels are used to classify each row of the test predictors.
# knn() returns the predicted labels for the test data set.
iris_test_pred1 <- class::knn(
  train = trainData1,
  test = testData1,
  cl = iris_train_labels,
  k = 3,
  prob = TRUE
)
#install.packages("gmodels")
library(gmodels)
## Warning: package 'gmodels' was built under R version 3.2.5
# Cross-tabulate the actual test labels (rows) against the kNN predictions
# (columns); prop.chisq = FALSE leaves the chi-square contribution out of
# each cell
CrossTable(x = iris_test_labels, y = iris_test_pred1,prop.chisq=FALSE)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 48
##
##
## | iris_test_pred1
## iris_test_labels | setosa | versicolor | virginica | Row Total |
## -----------------|------------|------------|------------|------------|
## setosa | 16 | 0 | 0 | 16 |
## | 1.000 | 0.000 | 0.000 | 0.333 |
## | 1.000 | 0.000 | 0.000 | |
## | 0.333 | 0.000 | 0.000 | |
## -----------------|------------|------------|------------|------------|
## versicolor | 0 | 17 | 2 | 19 |
## | 0.000 | 0.895 | 0.105 | 0.396 |
## | 0.000 | 1.000 | 0.133 | |
## | 0.000 | 0.354 | 0.042 | |
## -----------------|------------|------------|------------|------------|
## virginica | 0 | 0 | 13 | 13 |
## | 0.000 | 0.000 | 1.000 | 0.271 |
## | 0.000 | 0.000 | 0.867 | |
## | 0.000 | 0.000 | 0.271 | |
## -----------------|------------|------------|------------|------------|
## Column Total | 16 | 17 | 15 | 48 |
## | 0.333 | 0.354 | 0.312 | |
## -----------------|------------|------------|------------|------------|
##
##
#The diagonal cells (16 setosa, 17 versicolor, 13 virginica) are the correctly classified test rows.
#With three classes, every off-diagonal cell is a misclassification; the binary
#true/false positive/negative labels do not apply to individual cells.
#The two versicolor examples predicted as virginica are the only wrongly predicted rows.
#No setosa or virginica example was misclassified.
#A total of 2 out of 48, or about 4 percent, of the test rows were incorrectly classified by the kNN classifier,
# therefore the model shows about 96 percent accuracy
#> (16+17+13)/48
#[1] 0.9583333