# Assignment 22: k-nearest-neighbour classification of the glass data set
# Load the glass data set from its CSV file.
dataset <- read.csv(
  file = "C:\\Users\\RISHI RAHUL\\Desktop\\DS\\9 KNN\\Assignment\\glass.csv"
)

# Column types and a preview of the first values in each column.
str(dataset)
## 'data.frame': 214 obs. of 10 variables:
## $ RI : num 1.52 1.52 1.52 1.52 1.52 ...
## $ Na : num 13.6 13.9 13.5 13.2 13.3 ...
## $ Mg : num 4.49 3.6 3.55 3.69 3.62 3.61 3.6 3.61 3.58 3.6 ...
## $ Al : num 1.1 1.36 1.54 1.29 1.24 1.62 1.14 1.05 1.37 1.36 ...
## $ Si : num 71.8 72.7 73 72.6 73.1 ...
## $ K : num 0.06 0.48 0.39 0.57 0.55 0.64 0.58 0.57 0.56 0.57 ...
## $ Ca : num 8.75 7.83 7.78 8.22 8.07 8.07 8.17 8.24 8.3 8.4 ...
## $ Ba : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Fe : num 0 0 0 0 0 0.26 0 0 0 0.11 ...
## $ Type: int 1 1 1 1 1 1 1 1 1 1 ...

# Number of rows and columns.
dim(dataset)
## [1] 214 10

# Class distribution of the target column: raw counts ...
table(dataset$Type)
##
## 1 2 3 5 6 7
## 70 76 17 13 9 29

# ... and the same distribution as percentages, to one decimal place.
round(100 * prop.table(table(dataset$Type)), digits = 1)
##
## 1 2 3 5 6 7
## 32.7 35.5 7.9 6.1 4.2 13.6

# Three predictors on their original scales; the ranges differ widely,
# which is why the columns are rescaled before computing distances.
summary(dataset[, c("Na", "Si", "Ca")])
## Na Si Ca
## Min. :10.73 Min. :69.81 Min. : 5.430
## 1st Qu.:12.91 1st Qu.:72.28 1st Qu.: 8.240
## Median :13.30 Median :72.79 Median : 8.600
## Mean :13.41 Mean :72.65 Mean : 8.957
## 3rd Qu.:13.82 3rd Qu.:73.09 3rd Qu.: 9.172
## Max. :17.38 Max. :75.41 Max. :16.190
# Min-max rescale a numeric vector to the interval [0, 1].
#
# Args:
#   x: numeric vector.
#
# Returns:
#   A numeric vector the same length as `x`, where the smallest value maps
#   to 0 and the largest to 1. A constant vector (max == min) maps to all
#   zeros instead of NaN, so a feature with no variation does not poison
#   later distance calculations. If `x` contains NA the result is all NA,
#   as before.
normalize_data <- function(x) {
  if (length(x) == 0L) {
    return(numeric(0))
  }
  limits <- range(x)
  span <- limits[2] - limits[1]
  # Exact comparison is intended: span is exactly 0 only when every value
  # is equal, which is the case that would otherwise divide 0 by 0.
  # isTRUE() keeps an NA span (NA in x) on the normal path.
  if (isTRUE(span == 0)) {
    return(rep(0, length(x)))
  }
  (x - limits[1]) / span
}
# Rescale the nine predictor columns (columns 1 to 9) to [0, 1]; the class
# label in column 10 is left out. Assigning into dataset_n[] replaces the
# column contents while keeping the data frame structure and column names.
dataset_n <- dataset[1:9]
dataset_n[] <- lapply(dataset_n, normalize_data)

# Spot-check: every rescaled column should now run from 0 to 1.
summary(dataset_n[, c("Na", "Si", "Ca")])
## Na Si Ca
## Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.3274 1st Qu.:0.4411 1st Qu.:0.2612
## Median :0.3865 Median :0.5321 Median :0.2946
## Mean :0.4027 Mean :0.5073 Mean :0.3278
## 3rd Qu.:0.4654 3rd Qu.:0.5853 3rd Qu.:0.3478
## Max. :1.0000 Max. :1.0000 Max. :1.0000
# Split the data into a training set (128 rows) and a test set (the rest).
#
# The rows of the data file are ordered by Type, so using rows 1:128 for
# training leaves the training set with classes 1 and 2 only, and the
# classifier can then never predict classes 3, 5, 6 or 7. The training rows
# are therefore drawn at random; the fixed seed keeps the split reproducible.
set.seed(123)
train_rows <- sample(nrow(dataset_n), size = 128)

dataset_train <- dataset_n[train_rows, ]
dataset_test <- dataset_n[-train_rows, ]

# Class labels come from the original (unscaled) data frame, using the same
# row selection so they stay aligned with the feature rows.
dataset_train_labels <- dataset[train_rows, "Type"]
dataset_test_labels <- dataset[-train_rows, "Type"]
# EDA part ends here
library(class)

# Predict the class of every test row from its single nearest training
# neighbour (k = 1).
dataset_pred <- knn(
  train = dataset_train,
  test = dataset_test,
  cl = dataset_train_labels,
  k = 1
)
library(gmodels)
## Warning: package 'gmodels' was built under R version 3.5.1

# Cross-tabulate actual against predicted classes.
# NOTE: any output transcript recorded after this call was produced with the
# earlier sequential split (rows 1:128 / 129:214) and has to be regenerated.
CrossTable(dataset_test_labels, dataset_pred)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 86
##
##
## | dataset_pred
## dataset_test_labels | 1 | 2 | Row Total |
## --------------------|-----------|-----------|-----------|
## 2 | 6 | 12 | 18 |
## | 0.423 | 0.145 | |
## | 0.333 | 0.667 | 0.209 |
## | 0.273 | 0.188 | |
## | 0.070 | 0.140 | |
## --------------------|-----------|-----------|-----------|
## 3 | 10 | 7 | 17 |
## | 7.343 | 2.524 | |
## | 0.588 | 0.412 | 0.198 |
## | 0.455 | 0.109 | |
## | 0.116 | 0.081 | |
## --------------------|-----------|-----------|-----------|
## 5 | 0 | 13 | 13 |
## | 3.326 | 1.143 | |
## | 0.000 | 1.000 | 0.151 |
## | 0.000 | 0.203 | |
## | 0.000 | 0.151 | |
## --------------------|-----------|-----------|-----------|
## 6 | 4 | 5 | 9 |
## | 1.252 | 0.430 | |
## | 0.444 | 0.556 | 0.105 |
## | 0.182 | 0.078 | |
## | 0.047 | 0.058 | |
## --------------------|-----------|-----------|-----------|
## 7 | 2 | 27 | 29 |
## | 3.958 | 1.360 | |
## | 0.069 | 0.931 | 0.337 |
## | 0.091 | 0.422 | |
## | 0.023 | 0.314 | |
## --------------------|-----------|-----------|-----------|
## Column Total | 22 | 64 | 86 |
## | 0.256 | 0.744 | |
## --------------------|-----------|-----------|-----------|
##
##