#install.packages("RWeka")
library("RWeka")
## Warning: package 'RWeka' was built under R version 4.2.2
# Load the Messidor feature table from the ARFF file.
# NOTE(review): the working directory is hard-coded to a local Windows path,
# so the script only runs unchanged on a machine that has "C:/R pubs".
setwd("C:/R pubs")
dataset <- read.arff("messidor_features.arff")
# View() opens a data viewer window, so only call it in interactive sessions
# (it has no useful effect under Rscript / batch rendering).
if (interactive()) {
  View(dataset)
}

# Keep columns 2 through 20, i.e. drop the first column. According to the
# recorded str() output this leaves 18 numeric features plus the Class factor.
dataset <- dataset[2:20]
if (interactive()) {
  View(dataset)
}
str(dataset)
## 'data.frame':    1151 obs. of  19 variables:
##  $ 1    : num  1 1 1 1 1 1 0 1 1 1 ...
##  $ 2    : num  22 24 62 55 44 44 29 6 22 79 ...
##  $ 3    : num  22 24 60 53 44 43 29 6 21 75 ...
##  $ 4    : num  22 22 59 53 44 41 29 6 18 73 ...
##  $ 5    : num  19 18 54 50 41 41 27 6 15 71 ...
##  $ 6    : num  18 16 47 43 39 37 25 2 13 64 ...
##  $ 7    : num  14 13 33 31 27 29 16 1 10 47 ...
##  $ 8    : num  49.9 57.7 55.8 40.5 18 ...
##  $ 9    : num  17.78 23.8 27.99 18.45 8.57 ...
##  $ 10   : num  5.27 3.33 12.69 9.12 0.41 ...
##  $ 11   : num  0.772 0.234 4.852 3.079 0 ...
##  $ 12   : num  0.0186 0.0039 1.3939 0.8403 0 ...
##  $ 13   : num  0.00686 0.0039 0.37325 0.27243 0 ...
##  $ 14   : num  0.00392 0.0039 0.04182 0.00765 0 ...
##  $ 15   : num  0.00392 0.0039 0.00774 0.00153 0 ...
##  $ 16   : num  0.487 0.521 0.531 0.483 0.476 ...
##  $ 17   : num  0.1 0.144 0.129 0.115 0.124 ...
##  $ 18   : num  1 0 0 0 0 0 0 1 0 0 ...
##  $ Class: Factor w/ 2 levels "0","1": 1 1 2 1 2 2 2 1 2 2 ...
# Recode the Class factor as its integer level codes: as.numeric() on a factor
# returns the codes, so the levels "0"/"1" become 1/2 from here on.
dataset$Class <- as.numeric(dataset$Class)

# Train/test split ----
library(caTools)
## 
## Attaching package: 'caTools'
## The following object is masked from 'package:RWeka':
## 
##     LogitBoost
# Fix the RNG seed so the split below is reproducible.
set.seed(123)
# Per-row mask: TRUE rows (SplitRatio = 2/3) go to training, the rest to test.
split <- sample.split(Y = dataset$Class, SplitRatio = 2 / 3)

# Partition the rows with the mask (767 training / 384 test rows in the
# recorded run).
train_set <- dataset[split, ]
test_set <- dataset[!split, ]

# Feature scaling ----
# Standardise the 18 feature columns (everything except column 19, Class).
# The centering/scaling parameters are estimated on the training set ONLY and
# then re-used for the test set, so both sets live on the same scale and no
# test-set statistics leak into preprocessing. (Scaling each set with its own
# mean/sd would distort the distances k-NN relies on.)
train_scaled <- scale(train_set[-19])
test_set[-19] <- scale(
  test_set[-19],
  center = attr(train_scaled, "scaled:center"),
  scale = attr(train_scaled, "scaled:scale")
)
train_set[-19] <- train_scaled
str(train_set)
## 'data.frame':    767 obs. of  19 variables:
##  $ 1    : num  0.302 0.302 0.302 -3.312 -3.312 ...
##  $ 2    : num  -0.616 0.675 0.245 -0.342 -0.499 ...
##  $ 3    : num  -0.595 0.692 0.319 -0.304 -0.47 ...
##  $ 4    : num  -0.552 0.81 0.414 -0.245 -0.421 ...
##  $ 5    : num  -0.607 0.871 0.441 -0.226 -0.417 ...
##  $ 6    : num  -0.531 0.758 0.552 -0.17 -0.325 ...
##  $ 7    : num  -0.455 0.681 0.414 -0.321 -0.187 ...
##  $ 8    : num  -0.243 -0.401 -0.777 -0.82 -0.702 ...
##  $ 9    : num  -0.241 -0.21 -0.668 -0.643 -0.419 ...
##  $ 10   : num  -0.2818 0.0443 -0.6938 -0.5902 -0.6916 ...
##  $ 11   : num  -0.28 0.369 -0.497 -0.497 -0.464 ...
##  $ 12   : num  -0.253 0.199 -0.263 -0.263 -0.263 ...
##  $ 13   : num  -0.22 0.145 -0.23 -0.23 -0.23 ...
##  $ 14   : num  -0.207 -0.195 -0.219 -0.219 -0.219 ...
##  $ 15   : num  -0.189 -0.207 -0.218 -0.218 -0.218 ...
##  $ 16   : num  -1.333 -1.464 -1.73 0.648 1.007 ...
##  $ 17   : num  -0.436 0.388 0.878 1.771 1.776 ...
##  $ 18   : num  1.364 -0.732 -0.732 -0.732 1.364 ...
##  $ Class: num  1 1 2 2 1 2 1 1 1 2 ...
str(test_set)
# NOTE: the str(test_set) output recorded below was captured when the test set
# was still scaled with its own mean/sd; the feature values (and any recorded
# predictions, confusion matrix and accuracy further down) will differ slightly
# after re-running with training-set scaling. Re-run to refresh them.
## 'data.frame':    384 obs. of  19 variables:
##  $ 1    : num  0.291 0.291 0.291 0.291 0.291 ...
##  $ 2    : num  -0.614 0.864 0.164 -1.314 -0.692 ...
##  $ 3    : num  -0.584 0.908 0.204 -1.33 -0.708 ...
##  $ 4    : num  -0.624 0.993 0.207 -1.323 -0.799 ...
##  $ 5    : num  -0.721 0.963 0.355 -1.282 -0.861 ...
##  $ 6    : num  -0.691 0.88 0.373 -1.4 -0.843 ...
##  $ 7    : num  -0.576 0.727 0.466 -1.358 -0.771 ...
##  $ 8    : num  -0.1036 -0.1371 -0.6263 -0.763 0.0563 ...
##  $ 9    : num  0.02133 0.21402 -0.75347 -0.63576 0.00964 ...
##  $ 10   : num  -0.504 0.339 -0.596 -0.693 -0.249 ...
##  $ 11   : num  -0.381 0.63 -0.361 -0.399 -0.324 ...
##  $ 12   : num  -0.21 0.194 -0.211 -0.211 -0.211 ...
##  $ 13   : num  -0.1974 0.0469 -0.2 -0.2 -0.2 ...
##  $ 14   : num  -0.219 -0.148 -0.227 -0.227 -0.227 ...
##  $ 15   : num  -0.201 -0.185 -0.217 -0.217 -0.217 ...
##  $ 16   : num  -0.0385 0.3087 -0.6665 1.8863 -0.7623 ...
##  $ 17   : num  1.937 1.054 0.953 -2.145 0.4 ...
##  $ 18   : num  -0.669 -0.669 -0.669 1.49 -0.669 ...
##  $ Class: num  1 2 2 1 2 2 1 2 1 2 ...
# k-NN classifier ----
library(class)
# Classify every test row from its 5 nearest training rows; column 19 (Class)
# is excluded from the features and supplied as the training labels.
y_pred <- knn(
  train = train_set[, -19],
  test = test_set[, -19],
  cl = train_set[, 19],
  k = 5
)
y_pred
##   [1] 1 1 1 1 1 2 1 1 1 2 1 2 1 2 1 2 2 2 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 1 2 1 2
##  [38] 1 2 2 2 2 2 1 1 2 2 2 2 1 2 2 2 2 2 1 2 1 1 1 1 1 1 1 1 2 2 1 2 1 2 1 2 2
##  [75] 2 2 2 2 1 2 2 1 1 1 1 2 2 2 1 2 2 2 1 1 1 1 1 1 1 1 2 1 1 1 1 2 2 1 2 1 1
## [112] 1 1 2 2 2 2 1 2 2 2 2 1 2 1 2 2 2 2 2 2 2 2 2 1 2 1 1 2 1 1 2 2 1 2 1 1 1
## [149] 2 1 2 1 2 1 1 2 2 1 2 1 1 2 1 2 2 2 2 2 1 2 1 2 1 2 2 2 2 1 2 1 1 2 2 1 1
## [186] 1 2 1 1 1 2 2 2 2 1 1 1 2 1 2 2 2 1 2 1 2 1 1 1 1 1 2 2 2 2 2 1 1 1 1 2 1
## [223] 2 2 2 2 1 2 2 1 1 1 2 2 1 2 1 2 1 2 2 1 1 2 1 1 2 1 1 1 2 1 1 2 2 2 2 2 2
## [260] 1 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 2 2 1 1 2 2 2 1 1 2 1 1 1 1 2 2 2 2 2 2 2
## [297] 2 1 1 1 2 2 2 2 1 1 2 2 1 2 1 2 2 2 2 1 1 1 1 2 2 2 2 2 1 2 1 1 1 2 1 2 1
## [334] 2 1 1 1 2 2 1 1 2 2 2 2 1 1 1 1 2 1 1 2 1 1 1 2 1 2 1 2 2 1 1 2 2 1 2 2 2
## [371] 1 1 1 1 1 1 1 1 2 2 2 1 1 1
## Levels: 1 2

# Confusion matrix ----
# Rows are the actual test labels, columns the predicted labels.
cmat <- table(test_set[, 19], y_pred)
cmat
##    y_pred
##       1   2
##   1 112  68
##   2  71 133

# Accuracy ----
# Correct predictions sit on the diagonal; divide by the total count.
sum(diag(cmat)) / sum(cmat)
## [1] 0.6380208
# Scatter plot ----
# Plot the unscaled feature column "2" against column "3" from the full
# dataset, then add a one-entry legend at the top of the panel.
plot(
  x = dataset[["2"]],
  y = dataset[["3"]],
  xlab = "2",
  ylab = "3",
  main = "Feature 2 vs Feature 3 ",
  col = "blue"
)
legend("top", legend = " 20MID0184")