# Assignment 8: k-nearest-neighbour (KNN) classification of the Zoo data set
# Read the Zoo CSV into a data frame and print its structure (column names,
# types and the first few values of each column).
# NOTE(review): the path is absolute and machine-specific; the script only
# runs where this exact file location exists.
dataset <- read.csv(
  file = "C:\\Users\\RISHI RAHUL\\Desktop\\DS\\11 KNN\\Assignment\\Zoo.csv"
)
str(object = dataset)
## 'data.frame': 101 obs. of 18 variables:
## $ animal.name: Factor w/ 100 levels "aardvark","antelope",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ hair : int 1 1 0 1 1 1 1 0 0 1 ...
## $ feathers : int 0 0 0 0 0 0 0 0 0 0 ...
## $ eggs : int 0 0 1 0 0 0 0 1 1 0 ...
## $ milk : int 1 1 0 1 1 1 1 0 0 1 ...
## $ airborne : int 0 0 0 0 0 0 0 0 0 0 ...
## $ aquatic : int 0 0 1 0 0 0 0 1 1 0 ...
## $ predator : int 1 0 1 1 1 0 0 0 1 0 ...
## $ toothed : int 1 1 1 1 1 1 1 1 1 1 ...
## $ backbone : int 1 1 1 1 1 1 1 1 1 1 ...
## $ breathes : int 1 1 0 1 1 1 1 0 0 1 ...
## $ venomous : int 0 0 0 0 0 0 0 0 0 0 ...
## $ fins : int 0 0 1 0 0 0 0 1 1 0 ...
## $ legs : int 4 4 0 4 4 4 4 0 0 4 ...
## $ tail : int 0 1 1 0 1 1 1 1 1 0 ...
## $ domestic : int 0 0 0 0 0 0 1 1 0 1 ...
## $ catsize : int 1 1 0 1 1 1 1 0 0 0 ...
## $ type : int 1 1 4 1 1 1 1 4 4 1 ...
# Size of the raw data (rows, columns) before any column is removed.
dim(dataset)
## [1] 101 18
# Column 1 is animal.name (100 distinct names across 101 rows according to the
# str() output above), so it is removed and the remaining columns are kept as
# a data frame.
dataset <- dataset[, -1, drop = FALSE]
# Confirm that exactly one column was removed.
dim(dataset)
## [1] 101 17
# Absolute number of rows per value of the target column `type`.
table(dataset[["type"]])
##
## 1 2 3 4 5 6 7
## 41 20 5 13 4 8 10
# The same distribution expressed as percentages, rounded to one decimal.
round(100 * prop.table(table(dataset[["type"]])), digits = 1)
##
## 1 2 3 4 5 6 7
## 40.6 19.8 5.0 12.9 4.0 7.9 9.9
# Summaries of three feature columns, printed to compare their value ranges
# before rescaling.
summary(dataset[, c("feathers", "aquatic", "legs")])
## feathers aquatic legs
## Min. :0.000 Min. :0.0000 Min. :0.000
## 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.:2.000
## Median :0.000 Median :0.0000 Median :4.000
## Mean :0.198 Mean :0.3564 Mean :2.842
## 3rd Qu.:0.000 3rd Qu.:1.0000 3rd Qu.:4.000
## Max. :1.000 Max. :1.0000 Max. :8.000
#' Min-max rescale a vector to the range [0, 1].
#'
#' Missing values are ignored when the minimum and maximum are computed and
#' stay missing in the result, so a single NA no longer turns the whole
#' output into NA.
#'
#' @param x A numeric (or logical) vector.
#' @return A vector the same length as `x`: `(x - min) / (max - min)` over the
#'   non-missing values. If every non-missing value is identical (zero range)
#'   the result is 0 for those entries instead of NaN. If `x` is empty or
#'   entirely missing it is returned unchanged.
normalize_data <- function(x) {
  observed <- x[!is.na(x)]
  if (length(observed) == 0L) {
    # Nothing to scale; also avoids the min()/max() warnings on empty input.
    return(x)
  }

  lowest <- min(observed)
  span <- max(observed) - lowest

  # isTRUE() guards against a NaN span (e.g. all values infinite), which would
  # otherwise make the `if` condition NA and raise an error.
  if (isTRUE(span == 0)) {
    # Constant column: dividing by zero would yield NaN, so map values to 0.
    # Subtracting keeps NA entries as NA.
    return(x - lowest)
  }

  (x - lowest) / span
}
# Rescale the first 16 columns (the features; column 17 is left out) with
# normalize_data() and collect the results in a new data frame.
dataset_n <- as.data.frame(
  lapply(X = dataset[seq_len(16)], FUN = normalize_data)
)
# Re-print the summaries of the same three columns to check the new ranges.
summary(dataset_n[, c("feathers", "aquatic", "legs")])
## feathers aquatic legs
## Min. :0.000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.:0.2500
## Median :0.000 Median :0.0000 Median :0.5000
## Mean :0.198 Mean :0.3564 Mean :0.3552
## 3rd Qu.:0.000 3rd Qu.:1.0000 3rd Qu.:0.5000
## Max. :1.000 Max. :1.0000 Max. :1.0000
# Hold-out split by row position: rows 1-80 are used for training and rows
# 81-101 (21 rows) for testing. The split follows file order; it is not
# randomised.
dataset_train <- dataset_n[seq_len(80), ]
dataset_test <- dataset_n[seq(81, 101), ]
# Labels come from column 17 of the un-normalised data frame, taken for the
# same row ranges as the feature sets above.
dataset_train_labels <- dataset[[17]][seq_len(80)]
dataset_test_labels <- dataset[[17]][seq(81, 101)]
# EDA / data preparation ends here
# Classify the test rows with k-nearest neighbours from the `class` package.
library(class)

# knn() breaks voting ties at random, and ties are frequent with an even k and
# several classes. Fixing the seed before each call makes the predictions
# repeatable from run to run.
# NOTE(review): the output recorded further down appears to come from an
# unseeded run, so the counts may differ slightly; regenerate it after this
# change.
set.seed(123)
# The k = 5 result is kept under its own name so it can still be compared with
# the k = 2 result instead of being overwritten.
dataset_pred_k5 <- knn(
  train = dataset_train,
  test = dataset_test,
  cl = dataset_train_labels,
  k = 5
)
# for k = 5, got only 6 types : 1, 2, 4, 5, 6, 7

set.seed(123)
# `dataset_pred` (k = 2) is the prediction evaluated in the rest of the script.
dataset_pred <- knn(
  train = dataset_train,
  test = dataset_test,
  cl = dataset_train_labels,
  k = 2
)
# for k = 2, got all 7 types : 1, 2, 3, 4, 5, 6, 7
# Cross-tabulate the true test labels (rows) against the KNN predictions
# (columns); the diagonal cells hold the correctly classified records.
library(gmodels)
## Warning: package 'gmodels' was built under R version 3.5.1
CrossTable(x = dataset_test_labels, y = dataset_pred)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 21
##
##
## | dataset_pred
## dataset_test_labels | 1 | 2 | 3 | 4 | 5 | 6 | 7 | Row Total |
## --------------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
## 1 | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 5 |
## | 12.190 | 1.190 | 0.238 | 0.714 | 0.476 | 0.952 | 0.238 | |
## | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.238 |
## | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | |
## | 0.238 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | |
## --------------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
## 2 | 0 | 4 | 0 | 0 | 0 | 0 | 0 | 4 |
## | 0.952 | 9.752 | 0.190 | 0.571 | 0.381 | 0.762 | 0.190 | |
## | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.190 |
## | 0.000 | 0.800 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | |
## | 0.000 | 0.190 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | |
## --------------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
## 3 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 3 |
## | 0.714 | 0.114 | 5.143 | 0.429 | 1.786 | 0.571 | 0.143 | |
## | 0.000 | 0.333 | 0.333 | 0.000 | 0.333 | 0.000 | 0.000 | 0.143 |
## | 0.000 | 0.200 | 1.000 | 0.000 | 0.500 | 0.000 | 0.000 | |
## | 0.000 | 0.048 | 0.048 | 0.000 | 0.048 | 0.000 | 0.000 | |
## --------------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
## 4 | 0 | 0 | 0 | 3 | 0 | 0 | 0 | 3 |
## | 0.714 | 0.714 | 0.143 | 15.429 | 0.286 | 0.571 | 0.143 | |
## | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.143 |
## | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | |
## | 0.000 | 0.000 | 0.000 | 0.143 | 0.000 | 0.000 | 0.000 | |
## --------------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
## 5 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 |
## | 0.238 | 0.238 | 0.048 | 0.143 | 8.595 | 0.190 | 0.048 | |
## | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.048 |
## | 0.000 | 0.000 | 0.000 | 0.000 | 0.500 | 0.000 | 0.000 | |
## | 0.000 | 0.000 | 0.000 | 0.000 | 0.048 | 0.000 | 0.000 | |
## --------------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
## 6 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 2 |
## | 0.476 | 0.476 | 0.095 | 0.286 | 0.190 | 6.881 | 0.095 | |
## | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.095 |
## | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.500 | 0.000 | |
## | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.095 | 0.000 | |
## --------------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
## 7 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | 3 |
## | 0.714 | 0.714 | 0.143 | 0.429 | 0.286 | 3.571 | 5.143 | |
## | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.667 | 0.333 | 0.143 |
## | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.500 | 1.000 | |
## | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.095 | 0.048 | |
## --------------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
## Column Total | 5 | 5 | 1 | 3 | 2 | 4 | 1 | 21 |
## | 0.238 | 0.238 | 0.048 | 0.143 | 0.095 | 0.190 | 0.048 | |
## --------------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
##
##
# Out of the 21 test records in the table above, the model predicted 17
# correctly and 4 incorrectly (the diagonal cells sum to 17).
# The model is therefore about 81% accurate (17 / 21).