Read the dataset

Description of the variables in the Glass dataset:

RI: refractive index; Na: sodium; Mg: magnesium; Al: aluminum; Si: silicon; K: potassium;
Ca: calcium; Ba: barium; Fe: iron. Type is the glass class to be predicted.

# install.packages("caret")
# install.packages("pROC")
# install.packages("mlbench")
# install.packages("lattice")

library(caret)
## Warning: package 'caret' was built under R version 3.5.1
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.5.1
library(pROC)
## Warning: package 'pROC' was built under R version 3.5.1
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(mlbench)
## Warning: package 'mlbench' was built under R version 3.5.1
library(lattice)

# Example 1 - Glass dataset (Classification)

glass <- read.csv(file.choose()) # interactively select the Glass CSV file
glass$Type <- paste0("Type", glass$Type) # prefix the numeric class codes so they become readable labels
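The same data also ship with mlbench (loaded above), so a reproducible alternative to the file dialog is sketched here; in that version Type already arrives as a factor, so only its levels need the prefix.

# Alternative sketch: load the Glass data bundled with mlbench
# data(Glass, package = "mlbench")
# glass <- Glass
# levels(glass$Type) <- paste0("Type", levels(glass$Type))  # e.g. "1" -> "Type1"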
str(glass)
## 'data.frame':    214 obs. of  10 variables:
##  $ RI  : num  1.52 1.52 1.52 1.52 1.52 ...
##  $ Na  : num  13.6 13.9 13.5 13.2 13.3 ...
##  $ Mg  : num  4.49 3.6 3.55 3.69 3.62 3.61 3.6 3.61 3.58 3.6 ...
##  $ Al  : num  1.1 1.36 1.54 1.29 1.24 1.62 1.14 1.05 1.37 1.36 ...
##  $ Si  : num  71.8 72.7 73 72.6 73.1 ...
##  $ K   : num  0.06 0.48 0.39 0.57 0.55 0.64 0.58 0.57 0.56 0.57 ...
##  $ Ca  : num  8.75 7.83 7.78 8.22 8.07 8.07 8.17 8.24 8.3 8.4 ...
##  $ Ba  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Fe  : num  0 0 0 0 0 0.26 0 0 0 0.11 ...
##  $ Type: chr  "Type1" "Type1" "Type1" "Type1" ...
glass$Type <- as.factor(glass$Type) # convert Type into a factor so caret treats this as classification
View(glass)
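Before splitting, a quick frequency check shows how unevenly the six types are represented; a minimal sketch (output not shown here):

# Sketch: class balance of Type across the 214 rows
# table(glass$Type)
# round(prop.table(table(glass$Type)), 2)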

# Data partition
set.seed(123)
ind <- sample(2, nrow(glass), replace = TRUE, prob = c(0.7, 0.3))
train <- glass[ind==1,]
test <- glass[ind==2,]
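Because some types are rare, a stratified split is a reasonable alternative; caret's createDataPartition samples within each level of Type:

# Alternative sketch: stratified 70/30 split
# set.seed(123)
# idx   <- createDataPartition(glass$Type, p = 0.7, list = FALSE)
# train <- glass[idx, ]
# test  <- glass[-idx, ]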

# KNN Model 

trcontrol <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
set.seed(222)
fit <- train(Type ~., data = train, method = 'knn', tuneLength = 20,
              trControl = trcontrol, preProc = c("center","scale"))
    # the default metric is Accuracy; to optimize a ROC-based metric instead, set it explicitly (see the sketch below)
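A minimal sketch of that ROC-style setup, assuming the installed caret provides multiClassSummary (metric column names can differ across caret versions); class probabilities must be turned on for any AUC-type metric:

# Sketch: tune against an AUC-style metric instead of Accuracy (multiclass case)
# trc_roc <- trainControl(method = "repeatedcv", number = 10, repeats = 3,
#                         classProbs = TRUE, summaryFunction = multiClassSummary)
# fit_roc <- train(Type ~ ., data = train, method = "knn", tuneLength = 20,
#                  trControl = trc_roc, preProc = c("center", "scale"),
#                  metric = "AUC")  # one of the columns multiClassSummary reports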
# Model Performance :
fit # the optimal value chosen for k is 7
## k-Nearest Neighbors 
## 
## 157 samples
##   9 predictor
##   6 classes: 'Type1', 'Type2', 'Type3', 'Type5', 'Type6', 'Type7' 
## 
## Pre-processing: centered (9), scaled (9) 
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 141, 141, 143, 142, 141, 140, ... 
## Resampling results across tuning parameters:
## 
##   k   Accuracy   Kappa    
##    5  0.6270094  0.4860153
##    7  0.6415889  0.5015299
##    9  0.6345487  0.4897913
##   11  0.6340920  0.4863176
##   13  0.6335664  0.4837893
##   15  0.6227194  0.4636838
##   17  0.6210893  0.4624520
##   19  0.6151957  0.4523398
##   21  0.6116842  0.4443376
##   23  0.6109606  0.4445746
##   25  0.6087383  0.4394013
##   27  0.5980474  0.4239945
##   29  0.6084641  0.4394526
##   31  0.5952299  0.4194728
##   33  0.5730672  0.3881899
##   35  0.5700058  0.3833292
##   37  0.5465640  0.3502106
##   39  0.5440379  0.3457769
##   41  0.5461411  0.3469884
##   43  0.5173160  0.3057747
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 7.
plot(fit)
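The chosen tuning value and its resampled performance can also be read straight off the fit object; a small sketch:

# Sketch: inspect the selected k and its cross-validated accuracy
# fit$bestTune                                    # the k that was picked (7 above)
# fit$results[fit$results$k == fit$bestTune$k, ]  # its Accuracy and Kappa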

varImp(fit)
## ROC curve variable importance
## 
##   variables are sorted by maximum importance across the classes
##    Type1   Type2  Type3 Type5  Type6   Type7
## Mg 21.97 100.000 100.00 98.10 21.966 100.000
## Al 48.35  98.249  48.35 95.72 48.349  98.249
## K  22.84  24.712  97.81 58.51 36.076  24.712
## Ba  0.00  12.018   0.00 93.53  0.000  12.018
## Na 30.55  26.244  84.68 87.44 44.353  30.548
## RI 25.03  25.030  25.03 60.99 25.030  16.541
## Ca 31.04  55.133  47.47 31.04 38.878  55.133
## Si 21.36   5.014  16.47 54.23 32.256  21.356
## Fe  1.95   2.825  29.96 17.12  5.643   2.825
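The same ROC-based importances are easier to compare across classes when plotted; a one-line sketch:

# Sketch: lattice plot of the per-class variable importances
# plot(varImp(fit), top = 9)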
pred <- predict(fit, newdata = test )
confusionMatrix(pred, test$Type)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Type1 Type2 Type3 Type5 Type6 Type7
##      Type1    18     6     1     0     0     2
##      Type2     4    12     1     1     0     1
##      Type3     0     1     0     0     0     0
##      Type5     0     2     0     2     0     0
##      Type6     0     0     0     0     2     0
##      Type7     0     0     0     0     1     3
## 
## Overall Statistics
##                                           
##                Accuracy : 0.6491          
##                  95% CI : (0.5113, 0.7709)
##     No Information Rate : 0.386           
##     P-Value [Acc > NIR] : 5.206e-05       
##                                           
##                   Kappa : 0.4846          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Type1 Class: Type2 Class: Type3 Class: Type5
## Sensitivity                0.8182       0.5714      0.00000      0.66667
## Specificity                0.7429       0.8056      0.98182      0.96296
## Pos Pred Value             0.6667       0.6316      0.00000      0.50000
## Neg Pred Value             0.8667       0.7632      0.96429      0.98113
## Prevalence                 0.3860       0.3684      0.03509      0.05263
## Detection Rate             0.3158       0.2105      0.00000      0.03509
## Detection Prevalence       0.4737       0.3333      0.01754      0.07018
## Balanced Accuracy          0.7805       0.6885      0.49091      0.81481
##                      Class: Type6 Class: Type7
## Sensitivity               0.66667      0.50000
## Specificity               1.00000      0.98039
## Pos Pred Value            1.00000      0.75000
## Neg Pred Value            0.98182      0.94340
## Prevalence                0.05263      0.10526
## Detection Rate            0.03509      0.05263
## Detection Prevalence      0.03509      0.07018
## Balanced Accuracy         0.83333      0.74020
# test-set accuracy is 64.91%
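pROC was loaded above but not used; as a closing sketch, a multiclass AUC (Hand and Till generalization) can be computed on the test set, assuming a pROC version whose multiclass.roc accepts a matrix of class probabilities:

# Sketch: multiclass AUC on the test set with pROC
# prob <- predict(fit, newdata = test, type = "prob")  # one probability column per Type
# multiclass.roc(response = test$Type, predictor = as.matrix(prob))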