# Element symbols: Ca = Calcium, Ba = Barium, Fe = Iron
# install.packages("caret")
# install.packages("pROC")
# install.packages("mlbench")
# install.packages("lattice")
library(caret)
## Warning: package 'caret' was built under R version 3.5.1
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.5.1
library(pROC)
## Warning: package 'pROC' was built under R version 3.5.1
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(mlbench)
## Warning: package 'mlbench' was built under R version 3.5.1
library(lattice)
# Example 1 - Glass dataset (Classification)
# file.choose() opens an interactive file picker; select the glass CSV there.
glass <- read.csv(file.choose())
# Relabel the class codes 1..7 as "Type1".."Type7", one code at a time.
# The first replacement coerces the column to character; the remaining codes
# still match because `==` compares them as the strings "2", "3", ...
glass$Type <- Reduce(
  function(type, code) {
    type[type == code] <- paste0("Type", code)
    type
  },
  1:7,
  glass$Type
)
str(glass)
## 'data.frame': 214 obs. of 10 variables:
## $ RI : num 1.52 1.52 1.52 1.52 1.52 ...
## $ Na : num 13.6 13.9 13.5 13.2 13.3 ...
## $ Mg : num 4.49 3.6 3.55 3.69 3.62 3.61 3.6 3.61 3.58 3.6 ...
## $ Al : num 1.1 1.36 1.54 1.29 1.24 1.62 1.14 1.05 1.37 1.36 ...
## $ Si : num 71.8 72.7 73 72.6 73.1 ...
## $ K : num 0.06 0.48 0.39 0.57 0.55 0.64 0.58 0.57 0.56 0.57 ...
## $ Ca : num 8.75 7.83 7.78 8.22 8.07 8.07 8.17 8.24 8.3 8.4 ...
## $ Ba : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Fe : num 0 0 0 0 0 0.26 0 0 0 0.11 ...
## $ Type: chr "Type1" "Type1" "Type1" "Type1" ...
# Factorize the Type column so caret's train() treats the problem as
# classification rather than regression.
glass$Type <- as.factor(glass$Type)
# View() opens a spreadsheet-style data viewer and only makes sense in an
# interactive session; skip it when the script is run non-interactively.
if (interactive()) {
  View(glass)
}
# Data partition
# Each row is independently assigned to group 1 (training, p = 0.7) or
# group 2 (test, p = 0.3); the fixed seed makes the split reproducible.
set.seed(123)
ind <- sample(2, nrow(glass), replace = TRUE, prob = c(0.7, 0.3))
# NOTE: `train` is a data frame here; calls to caret's train() still resolve
# because R skips non-function objects when looking up a function name.
train <- glass[ind == 1, ]
test <- glass[ind == 2, ]
# KNN Model
# Resampling scheme used for tuning: 10-fold cross-validation, repeated 3 times.
trcontrol <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
set.seed(222)
# Predictors are centered and scaled as pre-processing, and tuneLength = 20
# evaluates 20 candidate values of k. `preProcess` is spelled out in full
# instead of relying on partial argument matching.
fit <- train(
  Type ~ .,
  data = train,
  method = "knn",
  tuneLength = 20,
  trControl = trcontrol,
  preProcess = c("center", "scale")
)
# The default selection metric for classification is Accuracy. To select on a
# different metric (e.g. ROC), pass `metric =` to train() explicitly; ROC also
# requires class probabilities and a matching summaryFunction in trainControl().
# Model Performance :
# Printing the fit lists the resampled Accuracy and Kappa for each candidate k.
fit # in the run recorded below the selected k is 7 (largest accuracy), not 9
## k-Nearest Neighbors
##
## 157 samples
## 9 predictor
## 6 classes: 'Type1', 'Type2', 'Type3', 'Type5', 'Type6', 'Type7'
##
## Pre-processing: centered (9), scaled (9)
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 141, 141, 143, 142, 141, 140, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.6270094 0.4860153
## 7 0.6415889 0.5015299
## 9 0.6345487 0.4897913
## 11 0.6340920 0.4863176
## 13 0.6335664 0.4837893
## 15 0.6227194 0.4636838
## 17 0.6210893 0.4624520
## 19 0.6151957 0.4523398
## 21 0.6116842 0.4443376
## 23 0.6109606 0.4445746
## 25 0.6087383 0.4394013
## 27 0.5980474 0.4239945
## 29 0.6084641 0.4394526
## 31 0.5952299 0.4194728
## 33 0.5730672 0.3881899
## 35 0.5700058 0.3833292
## 37 0.5465640 0.3502106
## 39 0.5440379 0.3457769
## 41 0.5461411 0.3469884
## 43 0.5173160 0.3057747
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 7.
# Plot the cross-validated performance against the candidate values of k.
plot(fit)

# Per-class predictor importance; the output below reports it as
# "ROC curve variable importance", scaled so the top score is 100.
varImp(fit)
## ROC curve variable importance
##
## variables are sorted by maximum importance across the classes
## Type1 Type2 Type3 Type5 Type6 Type7
## Mg 21.97 100.000 100.00 98.10 21.966 100.000
## Al 48.35 98.249 48.35 95.72 48.349 98.249
## K 22.84 24.712 97.81 58.51 36.076 24.712
## Ba 0.00 12.018 0.00 93.53 0.000 12.018
## Na 30.55 26.244 84.68 87.44 44.353 30.548
## RI 25.03 25.030 25.03 60.99 25.030 16.541
## Ca 31.04 55.133 47.47 31.04 38.878 55.133
## Si 21.36 5.014 16.47 54.23 32.256 21.356
## Fe 1.95 2.825 29.96 17.12 5.643 2.825
# Predict the class of every held-out row, then cross-tabulate the predictions
# against the true labels to get accuracy, kappa and per-class statistics.
pred <- predict(object = fit, newdata = test)
confusionMatrix(data = pred, reference = test[["Type"]])
## Confusion Matrix and Statistics
##
## Reference
## Prediction Type1 Type2 Type3 Type5 Type6 Type7
## Type1 18 6 1 0 0 2
## Type2 4 12 1 1 0 1
## Type3 0 1 0 0 0 0
## Type5 0 2 0 2 0 0
## Type6 0 0 0 0 2 0
## Type7 0 0 0 0 1 3
##
## Overall Statistics
##
## Accuracy : 0.6491
## 95% CI : (0.5113, 0.7709)
## No Information Rate : 0.386
## P-Value [Acc > NIR] : 5.206e-05
##
## Kappa : 0.4846
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Type1 Class: Type2 Class: Type3 Class: Type5
## Sensitivity 0.8182 0.5714 0.00000 0.66667
## Specificity 0.7429 0.8056 0.98182 0.96296
## Pos Pred Value 0.6667 0.6316 0.00000 0.50000
## Neg Pred Value 0.8667 0.7632 0.96429 0.98113
## Prevalence 0.3860 0.3684 0.03509 0.05263
## Detection Rate 0.3158 0.2105 0.00000 0.03509
## Detection Prevalence 0.4737 0.3333 0.01754 0.07018
## Balanced Accuracy 0.7805 0.6885 0.49091 0.81481
## Class: Type6 Class: Type7
## Sensitivity 0.66667 0.50000
## Specificity 1.00000 0.98039
## Pos Pred Value 1.00000 0.75000
## Neg Pred Value 0.98182 0.94340
## Prevalence 0.05263 0.10526
## Detection Rate 0.03509 0.05263
## Detection Prevalence 0.03509 0.07018
## Balanced Accuracy 0.83333 0.74020
# Accuracy on the held-out test set is 64.91 %