# Kyphosis_Knn

# Path of the kyphosis CSV file that the rest of the script reads.
# Kyphosis is a data set on children who have had corrective spinal surgery.
# NOTE(review): this is an absolute, machine-specific path; adjust it before
# running the script on another machine.
fName <- "E:/TERM2/Predictive Analytics/data set/kyphosis.csv"

# splitFile: read a CSV file and split it into a stratified, min-max scaled
# training set and test set. It can be called repeatedly to check accuracy
# over multiple random samples.
#
# Arguments:
#   fName       path of a CSV file with a header row
#   trProp      proportion (0..1) of the rows of EACH class that go into the
#               training set; the remaining rows form the test set
#   classColPos column position of the class variable
#
# Returns a list with four elements:
#   trn  data frame of scaled training features
#   trL  training class labels as numeric level codes
#   val  data frame of scaled test features
#   tsL  test class labels as numeric level codes
splitFile <- function(fName, trProp, classColPos) {
  stopifnot(
    is.numeric(trProp), length(trProp) == 1L,
    trProp >= 0, trProp <= 1
  )

  g <- read.csv(fName, header = TRUE)

  # Encode the class column as numeric level codes. Levels are sorted, so for
  # the kyphosis data "absent" becomes 1 and "present" becomes 2.
  classCodes <- as.numeric(as.factor(g[[classColPos]]))
  g[[classColPos]] <- classCodes

  # Draw the training rows separately inside every class so that train and
  # test keep the same class proportions. Indexing with sample.int() avoids
  # the sample() surprise where a single row number n is read as 1:n.
  rowsByClass <- split(seq_along(classCodes), classCodes)
  trainByClass <- lapply(rowsByClass, function(idx) {
    idx[sample.int(length(idx), floor(length(idx) * trProp))]
  })
  # setdiff() (rather than negative indexing) stays correct when a class
  # contributes no training rows at all.
  testByClass <- Map(setdiff, rowsByClass, trainByClass)

  gtr <- g[unlist(trainByClass, use.names = FALSE), , drop = FALSE]
  gts <- g[unlist(testByClass, use.names = FALSE), , drop = FALSE]

  # Keep the class labels apart from the features; knn() takes them separately.
  trLabels <- gtr[[classColPos]]
  tsLabels <- gts[[classColPos]]
  # drop = FALSE keeps a data frame even when only one feature column is left.
  gtr <- gtr[, -classColPos, drop = FALSE]
  gts <- gts[, -classColPos, drop = FALSE]

  # Min-max scale `x` using the range of `ref`. A feature that is constant in
  # `ref` carries no distance information and is mapped to 0 instead of NaN.
  rescale <- function(x, ref) {
    lo <- min(ref)
    span <- max(ref) - lo
    if (isTRUE(span == 0)) {
      return(rep(0, length(x)))
    }
    (x - lo) / span
  }

  # Both sets are scaled with the TRAINING range so that distances between
  # training and test points are measured on the same scale. Test values can
  # therefore fall slightly outside [0, 1].
  gtrn <- as.data.frame(Map(rescale, gtr, gtr))
  gtsn <- as.data.frame(Map(rescale, gts, gtr))

  list(trn = gtrn, trL = trLabels, val = gtsn, tsL = tsLabels)
}
# train_test: measure kNN accuracy on the training and test sets for every
# k from 1 to maxK.
#
# Arguments:
#   trnData  training features
#   trLabels class labels of the training rows
#   tsnData  test features
#   tsLabels class labels of the test rows
#   maxK     largest number of neighbours to try (default 20)
#
# Returns a data frame with one row per k and the columns
#   k     number of neighbours
#   trAc  share of training rows classified correctly
#   tsAc  share of test rows classified correctly
#
# Requires knn() from the 'class' package to be attached by the caller.
train_test <- function(trnData, trLabels, tsnData, tsLabels, maxK = 20) {
  kValues <- seq_len(maxK)
  trAc <- numeric(length(kValues))
  tsAc <- numeric(length(kValues))

  # Share of predictions that equal the true label. Comparing the labels as
  # text works for factor and numeric labels alike and, unlike indexing a 2x2
  # confusion table, does not fail when one class is absent from a set.
  accuracy <- function(predicted, actual) {
    mean(as.character(predicted) == as.character(actual))
  }

  # One loop, training fit first and test fit second for each k, so that the
  # order of knn() calls (and of its random tie-breaking) stays unchanged.
  for (k in kValues) {
    trAc[k] <- accuracy(knn(trnData, trnData, trLabels, k = k), trLabels)
    tsAc[k] <- accuracy(knn(trnData, tsnData, trLabels, k = k), tsLabels)
  }

  # k has exactly one entry per fitted model, so nothing is recycled.
  data.frame(k = kValues, trAc = trAc, tsAc = tsAc)
}
# Split the kyphosis data set into a 70% training part and a 30% test part,
# passing column 2 as the class column, then print the result.
# NOTE(review): the printed $trn / $val below contain the CSV's X column, which
# looks like a row index used as a feature - confirm that this is intended.
a <- splitFile(fName, trProp = 0.7, classColPos = 2)
a

## $trn
##         X         Age    Number      Start
## 1  0.3875 0.604878049 0.0000000 0.58823529
## 2  0.8000 0.570731707 0.2857143 0.88235294
## 3  0.6625 0.004878049 0.0000000 0.94117647
## 4  0.1750 0.814634146 0.1428571 1.00000000
## 5  0.9000 0.419512195 0.2857143 0.88235294
## 6  0.0875 0.175609756 0.1428571 0.88235294
## 7  0.3250 0.034146341 0.1428571 0.29411765
## 8  0.5125 0.165853659 0.1428571 0.70588235
## 9  0.8125 0.078048780 0.2857143 0.52941176
## 10 0.4500 0.000000000 0.1428571 0.47058824
## 11 0.7000 0.004878049 0.1428571 0.70588235
## 12 0.9125 1.000000000 0.2857143 0.52941176
## 13 0.5250 0.692682927 1.0000000 0.11764706
## 14 0.7250 0.243902439 0.7142857 0.47058824
## 15 0.8750 0.765853659 0.4285714 0.76470588
## 16 0.0750 0.292682927 0.0000000 0.94117647
## 17 0.4000 0.629268293 0.4285714 0.70588235
## 18 0.4125 0.541463415 0.1428571 0.88235294
## 19 0.2875 0.634146341 0.0000000 0.11764706
## 20 0.8625 0.068292683 0.4285714 0.88235294
## 21 0.6875 0.346341463 0.4285714 0.82352941
## 22 0.3625 0.731707317 0.0000000 0.88235294
## 23 0.5375 0.292682927 0.2857143 0.00000000
## 24 1.0000 0.170731707 0.2857143 0.70588235
## 25 0.5875 0.634146341 0.4285714 0.70588235
## 26 0.5750 0.658536585 0.2857143 0.82352941
## 27 0.2125 0.848780488 0.4285714 0.70588235
## 28 0.6750 0.678048780 0.2857143 0.82352941
## 29 0.1625 0.000000000 0.2857143 0.64705882
## 30 0.3125 0.039024390 0.4285714 0.70588235
## 31 0.1500 0.082926829 0.4285714 0.05882353
## 32 0.2250 0.385365854 0.4285714 0.88235294
## 33 0.2500 0.102439024 0.0000000 0.88235294
## 34 0.3375 0.482926829 0.1428571 0.76470588
## 35 0.0125 0.765853659 0.1428571 0.76470588
## 36 0.2375 0.126829268 0.2857143 0.47058824
## 37 0.0000 0.341463415 0.1428571 0.23529412
## 38 0.2000 0.375609756 0.5714286 0.82352941
## 39 0.1875 0.000000000 0.1428571 0.88235294
## 40 0.4750 0.092682927 0.5714286 0.47058824
## 41 0.9625 0.121951220 0.7142857 0.70588235
## 42 0.8500 0.082926829 0.2857143 0.58823529
## 43 0.9250 0.048780488 0.1428571 0.82352941
## 44 0.4250 0.678048780 0.4285714 0.58823529
## 45 0.9875 0.200000000 0.7142857 0.29411765
## 46 0.9500 0.760975610 0.1428571 0.70588235
## 47 0.4875 0.439024390 0.4285714 0.64705882
## 48 0.1250 0.395121951 0.4285714 0.76470588
## 49 0.2750 0.463414634 0.1428571 0.64705882
## 50 0.0250 0.619512195 0.2857143 0.23529412
## 51 0.6000 0.585365854 0.1428571 0.11764706
## 52 0.7125 0.580487805 0.4285714 0.41176471
## 53 0.5000 0.351219512 0.4285714 0.00000000
## 54 0.5625 0.673170732 0.1428571 0.52941176
## 55 0.4625 0.248780488 0.4285714 0.29411765
## 
## $trL
##  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [36] 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
## 
## $val
##             X         Age Number  Start
## 1  0.00000000 0.005154639  0.375 0.0000
## 2  0.01333333 0.000000000  0.250 0.8750
## 3  0.02666667 0.000000000  0.000 0.9375
## 4  0.06666667 0.577319588  0.000 0.9375
## 5  0.10666667 0.757731959  0.125 0.9375
## 6  0.33333333 0.015463918  0.125 0.9375
## 7  0.36000000 0.154639175  0.125 0.9375
## 8  0.42666667 0.474226804  0.125 0.9375
## 9  0.54666667 0.494845361  0.125 0.9375
## 10 0.61333333 0.907216495  0.000 0.8125
## 11 0.62666667 0.345360825  0.375 0.5625
## 12 0.64000000 0.041237113  0.000 1.0000
## 13 0.74666667 0.520618557  0.125 0.7500
## 14 0.78666667 0.412371134  0.250 0.0000
## 15 0.80000000 0.603092784  0.125 0.9375
## 16 0.84000000 1.000000000  0.000 1.0000
## 17 0.85333333 0.814432990  0.250 0.7500
## 18 0.90666667 0.649484536  0.250 0.6875
## 19 0.96000000 0.912371134  0.250 0.8750
## 20 1.00000000 0.613402062  0.000 0.7500
## 21 0.08000000 0.298969072  0.500 0.6875
## 22 0.24000000 0.536082474  0.500 0.2500
## 23 0.28000000 0.072164948  0.625 0.0625
## 24 0.65333333 0.711340206  1.000 0.3125
## 25 0.76000000 0.664948454  0.250 0.0000
## 26 0.77333333 0.582474227  0.625 0.4375
## 
## $tsL
##  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2

#Knn algorithm is defined in class package
library(class)
# Accuracy per k for the single train/test split stored in `a`.
b <- train_test(
  trnData = a$trn,
  trLabels = a$trL,
  tsnData = a$val,
  tsLabels = a$tsL
)
# avgTrnTst: smooth the accuracy curves by averaging them over several random
# train/test splits of the same file.
#
# Arguments:
#   fName       path of the CSV file, passed on to splitFile()
#   trProp      training proportion, passed on to splitFile()
#   classColPos column position of the class variable, passed on to splitFile()
#   nRuns       number of random splits to average over (default 20)
#
# Returns a data frame sorted by k with the columns
#   k        number of neighbours
#   meanV.x  mean training accuracy over all runs
#   meanV.y  mean test accuracy over all runs
# (the column names are kept from the earlier merge()-based version).
avgTrnTst <- function(fName, trProp, classColPos, nRuns = 20) {
  stopifnot(is.numeric(nRuns), length(nRuns) == 1L, nRuns >= 1)

  # One accuracy table per random split, stacked once at the end instead of
  # growing a data frame inside the loop.
  runs <- lapply(seq_len(nRuns), function(i) {
    parts <- splitFile(fName, trProp, classColPos)
    train_test(parts$trn, parts$trL, parts$val, parts$tsL)
  })
  acd <- do.call(rbind, runs)

  # Mean accuracy per k in base R, so the function neither needs nor attaches
  # an extra package as a side effect.
  kValues <- sort(unique(acd$k))
  kGroups <- factor(acd$k, levels = kValues)
  data.frame(
    k = kValues,
    meanV.x = as.vector(tapply(acd$trAc, kGroups, mean)),
    meanV.y = as.vector(tapply(acd$tsAc, kGroups, mean))
  )
}
# Average the accuracy curves over repeated random 70%/30% splits.
# NOTE(review): the name `c` shadows base::c as a variable; calls to c() still
# work, but a more descriptive name would be clearer.
c <- avgTrnTst(fName, trProp = 0.7, classColPos = 2)
# The summary below covers k = 1..20. A common rule of thumb puts k near
# sqrt(n); the split printed earlier has 55 + 26 = 81 rows, so sqrt(n) = 9.
# NOTE(review): the earlier version of this note cited sqrt(150), which does
# not match this data set. Considering much larger k is expected to lower
# accuracy.
c

##     k   meanV.x   meanV.y
## 1   1 0.8990909 0.7384615
## 2   2 0.8363636 0.7528846
## 3   3 0.8309091 0.7442308
## 4   4 0.8240909 0.7480769
## 5   5 0.8227273 0.7500000
## 6   6 0.8190909 0.7538462
## 7   7 0.8131818 0.7711538
## 8   8 0.8081818 0.7769231
## 9   9 0.8050000 0.7817308
## 10 10 0.8009091 0.7788462
## 11 11 0.8018182 0.7826923
## 12 12 0.8031818 0.7798077
## 13 13 0.8036364 0.7788462
## 14 14 0.8031818 0.7750000
## 15 15 0.8027273 0.7769231
## 16 16 0.8027273 0.7750000
## 17 17 0.8009091 0.7740385
## 18 18 0.8027273 0.7711538
## 19 19 0.8009091 0.7721154
## 20 20 0.7986364 0.7730769

# plotFn: plot two accuracy curves against the number of neighbours.
#
# Arguments:
#   dataSet  data frame with one row per k; column 2 is drawn in red and
#            column 3 in blue (for the output of avgTrnTst() these are the
#            mean training and mean test accuracy)
#   ylimLo   lower limit of the y axis (the upper limit is fixed at 1)
#
# Called for its side effect of drawing on the active graphics device.
plotFn <- function(dataSet, ylimLo = 0.4) {
  # One x position per row instead of a fixed 1:20, so data sets with any
  # number of k values can be plotted; for 20 rows the plot is unchanged.
  kValues <- seq_len(nrow(dataSet))

  plot(
    kValues, dataSet[, 2],
    xlab = 'k Nearest Neighbours', ylab = 'Accuracy',
    ylim = c(ylimLo, 1), type = 'o', col = 'red'
  )
  lines(kValues, dataSet[, 3], type = 'o', col = 'blue')
}
# Draw the averaged accuracy curves stored in `c`.
plotFn(dataSet = c)

# Kyphosis_Knn_Final.R
#
# Sai
#
# Mon Dec 26 22:14:37 2016