# Location of the iris CSV file on the local file system
fName <- "E:/TERM2/Predictive Analytics/iris.csv"
#iris data set gives the measurements in centimeters of the
#variables sepal length and width and petal length and width,
#respectively, for 50 flowers from each of 3 species of iris
# Split a labelled CSV data set into class-stratified training and test sets
# and min-max normalise the feature columns. The function can be called
# repeatedly to check accuracy over several random samples.
#
# Arguments:
#   fName       - path of a CSV file with a header row.
#   trProp      - proportion of the rows of every class that is drawn (at
#                 random, without replacement) into the training set; the
#                 per-class row count is truncated to a whole number.
#   classColPos - column position of the class variable.
#
# Returns a list with elements
#   trn - data frame of normalised training features
#   trL - training class labels as numeric factor codes (1, 2, ...)
#   val - data frame of normalised test features
#   tsL - test class labels, coded like trL
splitFile <- function(fName, trProp, classColPos) {
  g <- read.csv(fName, header = TRUE)

  # Encode the class column as numeric factor codes; for iris this gives
  # 1 = setosa, 2 = versicolor, 3 = virginica.
  g[[classColPos]] <- as.numeric(as.factor(g[[classColPos]]))

  # Row indices of every class level, in level order.
  rowsByClass <- split(seq_len(nrow(g)), g[[classColPos]])

  # Draw the training rows class by class so that both sets keep the class
  # proportions. Indexing through sample.int() avoids the sample() pitfall
  # where a length-one vector x is treated as 1:x.
  trRowsByClass <- lapply(rowsByClass, function(rows) {
    rows[sample.int(length(rows), floor(length(rows) * trProp))]
  })
  trRows <- unlist(trRowsByClass, use.names = FALSE)
  tsRows <- unlist(Map(setdiff, rowsByClass, trRowsByClass), use.names = FALSE)

  # Separate the class labels from the features; drop = FALSE keeps a data
  # frame even when only one feature column remains.
  trLabels <- g[trRows, classColPos]
  tsLabels <- g[tsRows, classColPos]
  gtr <- g[trRows, -classColPos, drop = FALSE]
  gts <- g[tsRows, -classColPos, drop = FALSE]

  # Min-max scaling parameters are taken from the training features only and
  # are applied to both sets, so a given raw value maps to the same scaled
  # value in training and test data. Test values may therefore fall slightly
  # outside [0, 1].
  lo <- lapply(gtr, min)
  hi <- lapply(gtr, max)
  rescale <- function(x, lo, hi) {
    span <- hi - lo
    # A constant column carries no information; map it to 0 rather than
    # producing NaN from 0 / 0.
    if (isTRUE(span == 0)) {
      return(rep(0, length(x)))
    }
    (x - lo) / span
  }
  gtrn <- as.data.frame(Map(rescale, gtr, lo, hi))
  gtsn <- as.data.frame(Map(rescale, gts, lo, hi))

  list(trn = gtrn, trL = trLabels, val = gtsn, tsL = tsLabels)
}
# Measure k-nearest-neighbour accuracy on the training and the test set for
# k = 1, ..., kMax.
#
# Arguments:
#   trnData  - training features.
#   trLabels - class labels of the training rows.
#   tsnData  - test features.
#   tsLabels - class labels of the test rows.
#   kMax     - largest number of neighbours to evaluate (default 20).
#
# Returns a data frame with one row per k and the columns
#   k    - number of neighbours
#   trAc - share of training rows classified correctly
#   tsAc - share of test rows classified correctly
#
# The classifier is knn() from the 'class' package.
train_test <- function(trnData, trLabels, tsnData, tsLabels, kMax = 20) {
  kValues <- seq_len(kMax)

  # Share of matching labels. knn() returns a factor, so both sides are
  # compared as character; unlike indexing a confusion table this also works
  # when a class is missing from the predictions or from the labels.
  accuracy <- function(predicted, actual) {
    mean(as.character(predicted) == as.character(actual))
  }

  # One column per k: row 1 holds training accuracy, row 2 test accuracy.
  acc <- vapply(kValues, function(k) {
    knntr <- class::knn(trnData, trnData, trLabels, k = k)
    knnts <- class::knn(trnData, tsnData, trLabels, k = k)
    c(accuracy(knntr, trLabels), accuracy(knnts, tsLabels))
  }, numeric(2))

  data.frame(k = kValues, trAc = acc[1, ], tsAc = acc[2, ])
}
# knn(), used by train_test(), is provided by the 'class' package
library(class)
# Split the iris data into a 70% training and a 30% test set
a <- splitFile(fName, trProp = 0.7, classColPos = 5)
# Accuracy for every k on this single split
b <- train_test(
  trnData = a$trn,
  trLabels = a$trL,
  tsnData = a$val,
  tsLabels = a$tsL
)
# Average the accuracy curves over several random train/test splits to
# obtain a smoother curve.
#
# Arguments:
#   fName       - path of the CSV file, passed on to splitFile().
#   trProp      - training proportion, passed on to splitFile().
#   classColPos - class column position, passed on to splitFile().
#   nReps       - number of random splits to average over (default 20).
#
# Returns a data frame with one row per k, sorted by k, and the columns
#   k       - number of neighbours
#   meanV.x - mean training accuracy over the splits
#   meanV.y - mean test accuracy over the splits
avgTrnTst <- function(fName, trProp, classColPos, nReps = 20) {
  stopifnot(nReps >= 1)

  # One accuracy table per random split; collected in a list and bound once
  # instead of growing a data frame inside the loop.
  runs <- lapply(seq_len(nReps), function(i) {
    a <- splitFile(fName, trProp, classColPos)
    train_test(a$trn, a$trL, a$val, a$tsL)
  })
  acd <- do.call(rbind, runs)

  # Mean of both accuracy columns per k, using base R so that no package is
  # attached as a side effect of calling this function.
  m <- aggregate(acd[c("trAc", "tsAc")], by = list(k = acd$k), FUN = mean)
  names(m) <- c("k", "meanV.x", "meanV.y")
  m
}
# Average the accuracy curves over repeated 70%-30% splits
c <- avgTrnTst(fName, trProp = 0.7, classColPos = 5)
# k is limited to 20: the common rule of thumb k = sqrt(n) gives about 12 for
# the 150 iris rows, and accuracy drops as larger values of k are considered.
c
## k meanV.x meanV.y
## 1 1 0.9795238 0.9611111
## 2 2 0.9659524 0.9565079
## 3 3 0.9607143 0.9564286
## 4 4 0.9619048 0.9595238
## 5 5 0.9609524 0.9530159
## 6 6 0.9623810 0.9548413
## 7 7 0.9607143 0.9548413
## 8 8 0.9588095 0.9530952
## 9 9 0.9595238 0.9521429
## 10 10 0.9588095 0.9498413
## 11 11 0.9578571 0.9502381
## 12 12 0.9559524 0.9473016
## 13 13 0.9573810 0.9480159
## 14 14 0.9533333 0.9467460
## 15 15 0.9497619 0.9446032
## 16 16 0.9473810 0.9404762
## 17 17 0.9454762 0.9403968
## 18 18 0.9428571 0.9394444
## 19 19 0.9400000 0.9381746
## 20 20 0.9335714 0.9340476
# Plot the accuracy curves held in the second and third column of a data set
# against the row position (one row per k).
#
# Arguments:
#   dataSet - data frame with one row per k; column 2 is drawn in red and
#             column 3 in blue.
#   ylimLo  - lower limit of the y axis (the upper limit is fixed at 1).
plotFn <- function(dataSet, ylimLo = 0.9) {
  # Derive the x positions from the data instead of assuming exactly 20
  # rows, so data sets of any length can be plotted.
  kIndex <- seq_len(nrow(dataSet))
  plot(kIndex, dataSet[, 2], xlab = 'k Nearest Neighbours', ylab = 'Accuracy',
       ylim = c(ylimLo, 1), type = 'o', col = 'red')
  lines(kIndex, dataSet[, 3], type = 'o', col = 'blue')
}
# Draw the averaged training (red) and test (blue) accuracy curves
plotFn(dataSet = c)
