# Location of the Kyphosis CSV file on the local file system.
fName <- "E:/TERM2/Predictive Analytics/data set/kyphosis.csv"
#Kyphosis is a dataset on Children who have had Corrective Spinal Surgery
#' Split a CSV data set into stratified, min-max scaled train and test sets
#'
#' Reads the file, recodes the class column as numeric codes (1, 2, ... in
#' the order of its factor levels), draws `trProp` of the rows of every
#' class at random for training and keeps the remaining rows for testing.
#' Can be called repeatedly to check accuracy over multiple random samples.
#'
#' Both feature sets are rescaled with the minimum and range of the
#' *training* columns, so train and test distances are on the same scale;
#' test values may therefore fall slightly outside [0, 1].
#'
#' NOTE(review): every non-class column is kept as a feature, including any
#' row-index column the CSV may contain (e.g. `X`) -- confirm this is
#' intended.
#'
#' @param fName Path of the CSV file (must have a header row).
#' @param trProp Proportion (0-1) of each class placed in the training set.
#' @param classColPos Column position of the class variable.
#' @return A list with `trn` (scaled training features), `trL` (training
#'   class codes), `val` (scaled test features) and `tsL` (test class codes).
splitFile <- function(fName, trProp, classColPos) {
  stopifnot(is.numeric(trProp), length(trProp) == 1, trProp >= 0, trProp <= 1)

  g <- read.csv(fName, header = TRUE)

  # Recode the class column selected by classColPos as numeric level codes.
  g[[classColPos]] <- as.numeric(as.factor(g[[classColPos]]))
  classCodes <- g[[classColPos]]

  # Row indices grouped by class so each class keeps the same train share.
  classIdx <- split(seq_len(nrow(g)), classCodes)

  # Index through sample.int(): sample(x, n) would treat a length-one
  # numeric x as 1:x and pick rows of the wrong class.
  trainPicks <- lapply(classIdx, function(idx) {
    idx[sample.int(length(idx), floor(length(idx) * trProp))]
  })
  testPicks <- Map(setdiff, classIdx, trainPicks)

  gtr <- g[unlist(trainPicks, use.names = FALSE), , drop = FALSE]
  gts <- g[unlist(testPicks, use.names = FALSE), , drop = FALSE]

  # Keep the class labels separately for use with the kNN algorithm.
  trLabels <- gtr[, classColPos]
  tsLabels <- gts[, classColPos]
  gtr <- gtr[, -classColPos, drop = FALSE]
  gts <- gts[, -classColPos, drop = FALSE]

  # Min-max scale `x` using the minimum and range of the reference column.
  rescale <- function(x, ref) {
    lo <- min(ref)
    span <- max(ref) - lo
    # A constant reference column has no range; map it to 0 instead of NaN.
    if (isTRUE(span == 0)) {
      return(rep(0, length(x)))
    }
    (x - lo) / span
  }

  gtrn <- as.data.frame(Map(rescale, gtr, gtr))
  gtsn <- as.data.frame(Map(rescale, gts, gtr))

  list(trn = gtrn, trL = trLabels, val = gtsn, tsL = tsLabels)
}
#' Measure kNN training and test accuracy for k = 1..kMax
#'
#' For every k the classifier is fitted on the training data and used to
#' predict both the training data itself and the test data; accuracy is the
#' share of predictions equal to the true label.
#'
#' @param trnData Training features.
#' @param trLabels Class labels of the training rows.
#' @param tsnData Test features.
#' @param tsLabels Class labels of the test rows.
#' @param kMax Largest number of neighbours to try (default 20, matching the
#'   k column the rest of the script expects).
#' @return A data frame with columns `k`, `trAc` (training accuracy) and
#'   `tsAc` (test accuracy), one row per k.
train_test <- function(trnData, trLabels, tsnData, tsLabels, kMax = 20) {
  kValues <- seq_len(kMax)

  # Compare by label text so factor predictions and numeric or factor truth
  # line up, and so a class missing from one side cannot break the result.
  accuracy <- function(predicted, truth) {
    mean(as.character(predicted) == as.character(truth))
  }

  trAc <- numeric(length(kValues))
  tsAc <- numeric(length(kValues))
  for (i in seq_along(kValues)) {
    k <- kValues[i]
    # Training prediction first, then test, for each k.
    trAc[i] <- accuracy(class::knn(trnData, trnData, trLabels, k = k), trLabels)
    tsAc[i] <- accuracy(class::knn(trnData, tsnData, trLabels, k = k), tsLabels)
  }

  # One row per k actually evaluated; the k column is never recycled.
  data.frame(k = kValues, trAc = trAc, tsAc = tsAc)
}
# Split the Kyphosis data set into stratified training (70%) and test (30%)
# sets; the class variable sits in column 2 of the CSV.
a <- splitFile(fName, trProp = 0.7, classColPos = 2)
a
## $trn
## X Age Number Start
## 1 0.3875 0.604878049 0.0000000 0.58823529
## 2 0.8000 0.570731707 0.2857143 0.88235294
## 3 0.6625 0.004878049 0.0000000 0.94117647
## 4 0.1750 0.814634146 0.1428571 1.00000000
## 5 0.9000 0.419512195 0.2857143 0.88235294
## 6 0.0875 0.175609756 0.1428571 0.88235294
## 7 0.3250 0.034146341 0.1428571 0.29411765
## 8 0.5125 0.165853659 0.1428571 0.70588235
## 9 0.8125 0.078048780 0.2857143 0.52941176
## 10 0.4500 0.000000000 0.1428571 0.47058824
## 11 0.7000 0.004878049 0.1428571 0.70588235
## 12 0.9125 1.000000000 0.2857143 0.52941176
## 13 0.5250 0.692682927 1.0000000 0.11764706
## 14 0.7250 0.243902439 0.7142857 0.47058824
## 15 0.8750 0.765853659 0.4285714 0.76470588
## 16 0.0750 0.292682927 0.0000000 0.94117647
## 17 0.4000 0.629268293 0.4285714 0.70588235
## 18 0.4125 0.541463415 0.1428571 0.88235294
## 19 0.2875 0.634146341 0.0000000 0.11764706
## 20 0.8625 0.068292683 0.4285714 0.88235294
## 21 0.6875 0.346341463 0.4285714 0.82352941
## 22 0.3625 0.731707317 0.0000000 0.88235294
## 23 0.5375 0.292682927 0.2857143 0.00000000
## 24 1.0000 0.170731707 0.2857143 0.70588235
## 25 0.5875 0.634146341 0.4285714 0.70588235
## 26 0.5750 0.658536585 0.2857143 0.82352941
## 27 0.2125 0.848780488 0.4285714 0.70588235
## 28 0.6750 0.678048780 0.2857143 0.82352941
## 29 0.1625 0.000000000 0.2857143 0.64705882
## 30 0.3125 0.039024390 0.4285714 0.70588235
## 31 0.1500 0.082926829 0.4285714 0.05882353
## 32 0.2250 0.385365854 0.4285714 0.88235294
## 33 0.2500 0.102439024 0.0000000 0.88235294
## 34 0.3375 0.482926829 0.1428571 0.76470588
## 35 0.0125 0.765853659 0.1428571 0.76470588
## 36 0.2375 0.126829268 0.2857143 0.47058824
## 37 0.0000 0.341463415 0.1428571 0.23529412
## 38 0.2000 0.375609756 0.5714286 0.82352941
## 39 0.1875 0.000000000 0.1428571 0.88235294
## 40 0.4750 0.092682927 0.5714286 0.47058824
## 41 0.9625 0.121951220 0.7142857 0.70588235
## 42 0.8500 0.082926829 0.2857143 0.58823529
## 43 0.9250 0.048780488 0.1428571 0.82352941
## 44 0.4250 0.678048780 0.4285714 0.58823529
## 45 0.9875 0.200000000 0.7142857 0.29411765
## 46 0.9500 0.760975610 0.1428571 0.70588235
## 47 0.4875 0.439024390 0.4285714 0.64705882
## 48 0.1250 0.395121951 0.4285714 0.76470588
## 49 0.2750 0.463414634 0.1428571 0.64705882
## 50 0.0250 0.619512195 0.2857143 0.23529412
## 51 0.6000 0.585365854 0.1428571 0.11764706
## 52 0.7125 0.580487805 0.4285714 0.41176471
## 53 0.5000 0.351219512 0.4285714 0.00000000
## 54 0.5625 0.673170732 0.1428571 0.52941176
## 55 0.4625 0.248780488 0.4285714 0.29411765
##
## $trL
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [36] 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
##
## $val
## X Age Number Start
## 1 0.00000000 0.005154639 0.375 0.0000
## 2 0.01333333 0.000000000 0.250 0.8750
## 3 0.02666667 0.000000000 0.000 0.9375
## 4 0.06666667 0.577319588 0.000 0.9375
## 5 0.10666667 0.757731959 0.125 0.9375
## 6 0.33333333 0.015463918 0.125 0.9375
## 7 0.36000000 0.154639175 0.125 0.9375
## 8 0.42666667 0.474226804 0.125 0.9375
## 9 0.54666667 0.494845361 0.125 0.9375
## 10 0.61333333 0.907216495 0.000 0.8125
## 11 0.62666667 0.345360825 0.375 0.5625
## 12 0.64000000 0.041237113 0.000 1.0000
## 13 0.74666667 0.520618557 0.125 0.7500
## 14 0.78666667 0.412371134 0.250 0.0000
## 15 0.80000000 0.603092784 0.125 0.9375
## 16 0.84000000 1.000000000 0.000 1.0000
## 17 0.85333333 0.814432990 0.250 0.7500
## 18 0.90666667 0.649484536 0.250 0.6875
## 19 0.96000000 0.912371134 0.250 0.8750
## 20 1.00000000 0.613402062 0.000 0.7500
## 21 0.08000000 0.298969072 0.500 0.6875
## 22 0.24000000 0.536082474 0.500 0.2500
## 23 0.28000000 0.072164948 0.625 0.0625
## 24 0.65333333 0.711340206 1.000 0.3125
## 25 0.76000000 0.664948454 0.250 0.0000
## 26 0.77333333 0.582474227 0.625 0.4375
##
## $tsL
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2
# The kNN classifier (`knn`) is provided by the 'class' package.
library(class)
b <- train_test(
  trnData = a$trn,
  trLabels = a$trL,
  tsnData = a$val,
  tsLabels = a$tsL
)
#' Average kNN accuracy per k over repeated random train/test splits
#'
#' Averaging over several splits smooths the accuracy-versus-k curve that a
#' single random split would give.
#'
#' @param fName Path of the CSV file, passed on to `splitFile()`.
#' @param trProp Training proportion, passed on to `splitFile()`.
#' @param classColPos Class column position, passed on to `splitFile()`.
#' @param nRep Number of random splits to average over (default 20).
#' @return A data frame with columns `k`, `meanV.x` (mean training accuracy)
#'   and `meanV.y` (mean test accuracy), ordered by `k`.
avgTrnTst <- function(fName, trProp, classColPos, nRep = 20) {
  stopifnot(is.numeric(nRep), length(nRep) == 1, nRep >= 1)

  # Collect one accuracy table per split, then bind once instead of growing
  # a data frame inside the loop.
  runs <- lapply(seq_len(nRep), function(i) {
    parts <- splitFile(fName, trProp, classColPos)
    train_test(parts$trn, parts$trL, parts$val, parts$tsL)
  })
  acd <- do.call(rbind, runs)

  # Group means with base R; no package is attached as a side effect.
  m <- aggregate(acd[c("trAc", "tsAc")], by = list(k = acd$k), FUN = mean)
  # Keep the column names callers already see from the merged result.
  names(m) <- c("k", "meanV.x", "meanV.y")
  m
}
# Average the accuracy curves over repeated 70%/30% splits.
c <- avgTrnTst(fName, trProp = 0.7, classColPos = 2)
# The summary reports k = 1..20. A common rule of thumb is k close to
# sqrt(n); the split printed above has 55 + 26 = 81 rows, so sqrt(n) = 9,
# and accuracy tends to drop when much larger k values are used.
c
## k meanV.x meanV.y
## 1 1 0.8990909 0.7384615
## 2 2 0.8363636 0.7528846
## 3 3 0.8309091 0.7442308
## 4 4 0.8240909 0.7480769
## 5 5 0.8227273 0.7500000
## 6 6 0.8190909 0.7538462
## 7 7 0.8131818 0.7711538
## 8 8 0.8081818 0.7769231
## 9 9 0.8050000 0.7817308
## 10 10 0.8009091 0.7788462
## 11 11 0.8018182 0.7826923
## 12 12 0.8031818 0.7798077
## 13 13 0.8036364 0.7788462
## 14 14 0.8031818 0.7750000
## 15 15 0.8027273 0.7769231
## 16 16 0.8027273 0.7750000
## 17 17 0.8009091 0.7740385
## 18 18 0.8027273 0.7711538
## 19 19 0.8009091 0.7721154
## 20 20 0.7986364 0.7730769
#' Plot two accuracy curves against the number of neighbours
#'
#' Column 1 of `dataSet` supplies the k values on the x axis; column 2 is
#' drawn in red and column 3 in blue (in this script: mean training and
#' mean test accuracy respectively).
#'
#' @param dataSet Data frame whose columns are k, first curve, second curve.
#' @param ylimLo Lower limit of the y axis; the upper limit is fixed at 1.
plotFn <- function(dataSet, ylimLo = 0.4) {
  # Take x from the data rather than assuming exactly 20 rows.
  kValues <- dataSet[, 1]
  plot(
    kValues, dataSet[, 2],
    xlab = 'k Nearest Neighbours', ylab = 'Accuracy',
    ylim = c(ylimLo, 1), type = 'o', col = 'red'
  )
  lines(kValues, dataSet[, 3], type = 'o', col = 'blue')
}
# Draw the averaged training and test accuracy curves.
plotFn(dataSet = c)
