# IRIS_KNN_Final.R

# The iris data set gives the measurements, in centimetres, of sepal length
# and width and petal length and width for 50 flowers from each of
# 3 species of iris.
# Location of the iris CSV file in the system directory.
fName <- "E:/TERM2/Predictive Analytics/iris.csv"

# Split a CSV data set into stratified, normalised train and test sets.
# The function can be called repeatedly to check accuracy over multiple
# random samples.
#
# Arguments:
#   fName       - path to a CSV file with a header row.
#   trProp      - proportion (0 < trProp <= 1) of the rows of every class
#                 that goes into the training set.
#   classColPos - column position of the class variable; every other column
#                 is treated as a numeric predictor.
#
# Returns a list with
#   trn - training predictors, min-max scaled to [0, 1],
#   trL - training class codes (numeric),
#   val - test predictors, scaled with the *training* minimum and range,
#   tsL - test class codes (numeric).
splitFile <- function(fName, trProp, classColPos) {
  stopifnot(
    is.numeric(trProp), length(trProp) == 1,
    trProp > 0, trProp <= 1
  )

  # Upload the data set into a variable g
  g <- read.csv(fName, header = TRUE)

  # Recode the class column (located through classColPos, not a hard-coded
  # column name) as numeric codes in factor-level order; for iris this
  # gives 1 = setosa, 2 = versicolor, 3 = virginica.
  v <- as.numeric(as.factor(g[[classColPos]]))

  # Row indices of every class level, so each class contributes the same
  # proportion of rows to the training set (works for any number of classes).
  rowsByClass <- split(seq_len(nrow(g)), v)

  trRows <- unlist(lapply(rowsByClass, function(idx) {
    # Index through sample.int() so that a class with a single row is not
    # expanded to 1:idx, which is what sample(idx, ...) would do.
    # floor() spells out the truncation sample() applies to its size.
    idx[sample.int(length(idx), floor(length(idx) * trProp))]
  }), use.names = FALSE)
  tsRows <- unlist(lapply(rowsByClass, function(idx) {
    setdiff(idx, trRows)
  }), use.names = FALSE)

  # Class variable of the train and test sets, kept apart from the
  # predictors for use in the KNN algorithm.
  trLabels <- v[trRows]
  tsLabels <- v[tsRows]
  gtr <- g[trRows, -classColPos, drop = FALSE]
  gts <- g[tsRows, -classColPos, drop = FALSE]

  # Min-max normalisation. The minimum and range are taken from the training
  # rows only and re-used for the test rows, so a given raw measurement maps
  # to the same scaled value in both sets (test values may therefore fall
  # slightly outside [0, 1]).
  lo <- vapply(gtr, function(x) as.numeric(min(x)), numeric(1))
  hi <- vapply(gtr, function(x) as.numeric(max(x)), numeric(1))
  span <- hi - lo
  span[which(span == 0)] <- 1 # constant column: scale to 0 instead of NaN

  rescale <- function(d) {
    as.data.frame(Map(function(x, a, s) (x - a) / s, d, lo, span))
  }

  list(trn = rescale(gtr), trL = trLabels, val = rescale(gts), tsL = tsLabels)
}
# Check the KNN accuracy on the training and test sets for k = 1..kMax.
#
# Arguments:
#   trnData  - training predictors (data frame or matrix).
#   trLabels - class labels of the training rows.
#   tsnData  - test predictors, same columns as trnData.
#   tsLabels - class labels of the test rows.
#   kMax     - largest number of neighbours to try (default 20).
#
# Returns a data frame with one row per k and the columns
#   k    - number of neighbours,
#   trAc - share of training rows classified correctly,
#   tsAc - share of test rows classified correctly.
train_test <- function(trnData, trLabels, tsnData, tsLabels, kMax = 20) {
  stopifnot(is.numeric(kMax), length(kMax) == 1, kMax >= 1)
  kValues <- seq_len(kMax)

  # Accuracy = share of matching labels. Comparing as character works for
  # numeric and factor labels alike and does not depend on the confusion
  # table being exactly 3 x 3.
  accuracy <- function(predicted, actual) {
    mean(as.character(predicted) == as.character(actual))
  }

  # Preallocate one slot per k so the result always has kMax rows.
  trAc <- numeric(length(kValues))
  tsAc <- numeric(length(kValues))

  for (k in kValues) {
    # class::knn so the function does not rely on the package being attached
    knntr <- class::knn(trnData, trnData, trLabels, k = k)
    knnts <- class::knn(trnData, tsnData, trLabels, k = k)
    trAc[k] <- accuracy(knntr, trLabels)
    tsAc[k] <- accuracy(knnts, tsLabels)
  }

  data.frame(k = kValues, trAc = trAc, tsAc = tsAc)
}

# The knn algorithm is defined in the class package.
library(class)
# Split the iris data set into training (70%) and test (30%) sets, with the
# class label in column 5, then score that single split.
a <- splitFile(fName, trProp = 0.7, classColPos = 5)
b <- train_test(a$trn, a$trL, a$val, a$tsL)
# Average the train/test accuracy over repeated random splits to obtain a
# smoother accuracy curve.
#
# Arguments:
#   fName       - path to the CSV file, passed on to splitFile().
#   trProp      - training proportion, passed on to splitFile().
#   classColPos - position of the class column, passed on to splitFile().
#   nReps       - number of random splits to average over (default 20).
#
# Returns a data frame with one row per k and the columns
#   k       - number of neighbours,
#   meanV.x - mean training accuracy over the splits,
#   meanV.y - mean test accuracy over the splits.
avgTrnTst <- function(fName, trProp, classColPos, nReps = 20) {
  stopifnot(is.numeric(nReps), length(nReps) == 1, nReps >= 1)

  # One accuracy table per random split; collected in a list and bound once
  # instead of growing a data frame inside the loop.
  runs <- lapply(seq_len(nReps), function(i) {
    parts <- splitFile(fName, trProp, classColPos)
    train_test(parts$trn, parts$trL, parts$val, parts$tsL)
  })
  acd <- do.call(rbind, runs)

  # Mean of both accuracy columns per k, using base R so no extra package
  # has to be attached from inside the function.
  m <- stats::aggregate(acd[c("trAc", "tsAc")], by = list(k = acd$k), FUN = mean)
  names(m) <- c("k", "meanV.x", "meanV.y")
  m
}
# Call avgTrnTst and print the averaged accuracy table.
# The table covers k up to 20: a common rule of thumb puts a useful k near
# sqrt(n), about 12 for the 150 iris rows, and accuracy tends to drop when
# more neighbours are considered.
c <- avgTrnTst(fName, trProp = 0.7, classColPos = 5)
c

##     k   meanV.x   meanV.y
## 1   1 0.9795238 0.9611111
## 2   2 0.9659524 0.9565079
## 3   3 0.9607143 0.9564286
## 4   4 0.9619048 0.9595238
## 5   5 0.9609524 0.9530159
## 6   6 0.9623810 0.9548413
## 7   7 0.9607143 0.9548413
## 8   8 0.9588095 0.9530952
## 9   9 0.9595238 0.9521429
## 10 10 0.9588095 0.9498413
## 11 11 0.9578571 0.9502381
## 12 12 0.9559524 0.9473016
## 13 13 0.9573810 0.9480159
## 14 14 0.9533333 0.9467460
## 15 15 0.9497619 0.9446032
## 16 16 0.9473810 0.9404762
## 17 17 0.9454762 0.9403968
## 18 18 0.9428571 0.9394444
## 19 19 0.9400000 0.9381746
## 20 20 0.9335714 0.9340476

# Plot accuracy against the number of nearest neighbours.
#
# Arguments:
#   dataSet - table with k in column 1 and two accuracy series in columns 2
#             and 3 (drawn in red and blue respectively).
#   ylimLo  - lower limit of the y axis (default 0.9); the upper limit is 1.
plotFn <- function(dataSet, ylimLo = 0.9) {
  # Take the x values from the table itself rather than assuming 20 rows,
  # so tables with any number of k values can be plotted.
  kValues <- dataSet[, 1]

  plot(
    kValues, dataSet[, 2],
    xlab = "k Nearest Neighbours", ylab = "Accuracy",
    ylim = c(ylimLo, 1), type = "o", col = "red"
  )
  lines(kValues, dataSet[, 3], type = "o", col = "blue")
}
# Draw the averaged training and test accuracy curves.
plotFn(dataSet = c)

# IRIS_KNN_Final.R

# Sai

# Mon Dec 26 22:00:30 2016