R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# Load the UCI Glass Identification data. The file has no header row, so the
# columns come in as V1..V11: a running sample id (V1), nine measurements
# (V2..V10) and the glass class (V11).
glass <- read.csv(
  "https://archive.ics.uci.edu/ml/machine-learning-databases/glass/glass.data",
  header = FALSE
)
head(glass, 5)
##   V1      V2    V3   V4   V5    V6   V7   V8 V9 V10 V11
## 1  1 1.52101 13.64 4.49 1.10 71.78 0.06 8.75  0   0   1
## 2  2 1.51761 13.89 3.60 1.36 72.73 0.48 7.83  0   0   1
## 3  3 1.51618 13.53 3.55 1.54 72.99 0.39 7.78  0   0   1
## 4  4 1.51766 13.21 3.69 1.29 72.61 0.57 8.22  0   0   1
## 5  5 1.51742 13.27 3.62 1.24 73.08 0.55 8.07  0   0   1

# V1 is only a row id, not a measurement. Drop it BEFORE naming the columns:
# assigning the ten names below to all eleven columns shifts every label by
# one (the id gets called "RI", Fe gets called "Type", and the real class
# column ends up with an NA name), which silently makes later code scale the
# id and predict Fe instead of the glass type.
glass <- glass[, -1]
names(glass) <- c("RI", "Na", "Mg", "Al", "Si", "K", "Ca", "Ba", "Fe", "Type")
head(glass, 5)
##        RI    Na   Mg   Al    Si    K   Ca Ba Fe Type
## 1 1.52101 13.64 4.49 1.10 71.78 0.06 8.75  0  0    1
## 2 1.51761 13.89 3.60 1.36 72.73 0.48 7.83  0  0    1
## 3 1.51618 13.53 3.55 1.54 72.99 0.39 7.78  0  0    1
## 4 1.51766 13.21 3.69 1.29 72.61 0.57 8.22  0  0    1
## 5 1.51742 13.27 3.62 1.24 73.08 0.55 8.07  0  0    1
# NOTE: echoed results further down in this document that were produced with
# the shifted column names (column summaries, error rates) must be
# regenerated by re-knitting.

### Data Preprocessing

Standardize the data: it is ideal to standardize the features in the data, especially with the KNN algorithm, because KNN is distance-based and features measured on larger scales would otherwise dominate the distance. Let's go ahead and standardize. Here we use scale() to standardize the feature columns of glass and assign the result to a new variable. The target column Type is excluded from scaling.

# Missing-value audit: total count first, then the per-column breakdown.
sum(is.na(glass))
## [1] 0
colSums(is.na(glass))
##   RI   Na   Mg   Al   Si    K   Ca   Ba   Fe Type   NA 
##    0    0    0    0    0    0    0    0    0    0    0

# Centre and scale the first nine columns; scale() returns a numeric matrix.
standard.features <- scale(glass[, seq_len(9)])

# Re-attach the column named "Type" (position 10) to the scaled matrix.
# cbind() with a data-frame argument yields a data frame.
data <- cbind(standard.features, glass["Type"])

# Confirm that the combined table has nothing left to impute.
anyNA(data)
## [1] FALSE

#install.packages('corrplot')
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.1.2
## corrplot 0.92 loaded
# Visualise the pairwise correlations between all columns of `data`.
corrplot(corr = cor(data))

#install.packages('caTools')
library(caTools)
## Warning: package 'caTools' was built under R version 4.1.2

# Fix the RNG seed so the same rows land in each partition on every run.
set.seed(14552455)

# Logical mask over the rows of `data`, built from the Type column with a
# 0.70 split ratio: TRUE rows go to training, FALSE rows to testing.
sample <- sample.split(data[["Type"]], SplitRatio = 0.70)
train <- data[sample, ]
test <- data[!sample, ]

### KNN Model

We use knn() to predict the target variable Type for the test dataset, starting with k = 1.

#install.packages("class")
#library(class)

# Baseline: classify each test row from its single nearest training row,
# using the first nine columns as predictors and train$Type as the labels.
predicted.type <- class::knn(
  train = train[1:9],
  test = test[1:9],
  cl = train$Type,
  k = 1
)
# Misclassification rate = share of test rows whose prediction differs.
error <- mean(test$Type != predicted.type)

# Sweep k = 1..10 and record the test misclassification rate for each k.
# The result vector is allocated up front rather than grown from NULL.
error.rate <- numeric(10)
for (i in seq_len(10)) {
  predicted.type <- class::knn(
    train = train[1:9],
    test = test[1:9],
    cl = train$Type,
    k = i
  )
  error.rate[i] <- mean(test$Type != predicted.type)
}

# One row per candidate k with its test error; both columns are doubles.
knn.error <- data.frame(k = as.numeric(1:10), error.type = error.rate)
#install.packages("ggplot2")
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.1.2

# Error-vs-k chart: red points for every k, a larger green point on the
# fifth row (k = 5), and a blue line joining the points. Grid lines, panel
# border and panel background are removed; only black axis lines remain.
ggplot(data = knn.error, mapping = aes(x = k, y = error.type)) +
  geom_point(colour = "#CC0000", size = 3) +
  geom_point(data = knn.error[5, ], colour = "green", size = 5) +
  geom_line(colour = "blue") +
  scale_x_continuous(breaks = 1:10) +
  theme_bw() +
  labs(x = "Value of K", y = "Error") +
  theme(
    axis.line = element_line(colour = "black"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank()
  )

# Final model: rerun the classifier with k = 5 on the same predictors.
predicted.type <- class::knn(
  train = train[1:9],
  test = test[1:9],
  cl = train$Type,
  k = 5
)
# Error in prediction: share of test rows whose predicted label differs
# from the value stored in test$Type.
error <- mean(test$Type != predicted.type)
print(error)
## [1] 0.3833333