#install.packages("RWeka")
library("RWeka")
## Warning: package 'RWeka' was built under R version 4.2.2
# Read the Messidor feature table from the local data folder.
# NOTE(review): the working directory is hard-coded to a machine-specific
# path; consider a project-relative path instead.
setwd("C:/R pubs")
dataset <- read.arff(file = "messidor_features.arff")
View(dataset)

# Keep columns 2 through 20 only (the first column is dropped), then inspect
# the reduced table in the viewer and print its structure.
dataset <- dataset[, 2:20]
View(dataset)
str(dataset)
## 'data.frame': 1151 obs. of 19 variables:
## $ 1 : num 1 1 1 1 1 1 0 1 1 1 ...
## $ 2 : num 22 24 62 55 44 44 29 6 22 79 ...
## $ 3 : num 22 24 60 53 44 43 29 6 21 75 ...
## $ 4 : num 22 22 59 53 44 41 29 6 18 73 ...
## $ 5 : num 19 18 54 50 41 41 27 6 15 71 ...
## $ 6 : num 18 16 47 43 39 37 25 2 13 64 ...
## $ 7 : num 14 13 33 31 27 29 16 1 10 47 ...
## $ 8 : num 49.9 57.7 55.8 40.5 18 ...
## $ 9 : num 17.78 23.8 27.99 18.45 8.57 ...
## $ 10 : num 5.27 3.33 12.69 9.12 0.41 ...
## $ 11 : num 0.772 0.234 4.852 3.079 0 ...
## $ 12 : num 0.0186 0.0039 1.3939 0.8403 0 ...
## $ 13 : num 0.00686 0.0039 0.37325 0.27243 0 ...
## $ 14 : num 0.00392 0.0039 0.04182 0.00765 0 ...
## $ 15 : num 0.00392 0.0039 0.00774 0.00153 0 ...
## $ 16 : num 0.487 0.521 0.531 0.483 0.476 ...
## $ 17 : num 0.1 0.144 0.129 0.115 0.124 ...
## $ 18 : num 1 0 0 0 0 0 0 1 0 0 ...
## $ Class: Factor w/ 2 levels "0","1": 1 1 2 1 2 2 2 1 2 2 ...
# Convert the Class factor to numeric while keeping its real labels (0 / 1).
# Calling as.numeric() directly on a factor returns the internal level codes
# (1 / 2), which silently relabels the classes, so go through as.character().
# NOTE: the `##` console output recorded further down was captured with the
# old 1 / 2 codes; re-run the script to refresh it.
dataset$Class <- as.numeric(as.character(dataset$Class))
#Splitting the data
library(caTools)
##
## Attaching package: 'caTools'
## The following object is masked from 'package:RWeka':
##
## LogitBoost
# Fix the RNG seed so the hold-out split is reproducible, then let
# sample.split() flag roughly two thirds of the rows (using Class as the
# label vector) for training; the unflagged rows form the test set.
set.seed(123)
split <- sample.split(Y = dataset$Class, SplitRatio = 2 / 3)
train_set <- dataset[split, ]
test_set <- dataset[!split, ]
# Feature scaling
# Standardise every predictor (all columns except Class in position 19).
# The centring and scaling constants are estimated on the training rows only
# and then reused for the test rows, so both sets are expressed in the same
# feature space and no test-set statistics leak into the preprocessing.
scaled_train <- scale(train_set[-19])
test_set[-19] <- scale(
  test_set[-19],
  center = attr(scaled_train, "scaled:center"),
  scale = attr(scaled_train, "scaled:scale")
)
train_set[-19] <- scaled_train
str(train_set)
## 'data.frame': 767 obs. of 19 variables:
## $ 1 : num 0.302 0.302 0.302 -3.312 -3.312 ...
## $ 2 : num -0.616 0.675 0.245 -0.342 -0.499 ...
## $ 3 : num -0.595 0.692 0.319 -0.304 -0.47 ...
## $ 4 : num -0.552 0.81 0.414 -0.245 -0.421 ...
## $ 5 : num -0.607 0.871 0.441 -0.226 -0.417 ...
## $ 6 : num -0.531 0.758 0.552 -0.17 -0.325 ...
## $ 7 : num -0.455 0.681 0.414 -0.321 -0.187 ...
## $ 8 : num -0.243 -0.401 -0.777 -0.82 -0.702 ...
## $ 9 : num -0.241 -0.21 -0.668 -0.643 -0.419 ...
## $ 10 : num -0.2818 0.0443 -0.6938 -0.5902 -0.6916 ...
## $ 11 : num -0.28 0.369 -0.497 -0.497 -0.464 ...
## $ 12 : num -0.253 0.199 -0.263 -0.263 -0.263 ...
## $ 13 : num -0.22 0.145 -0.23 -0.23 -0.23 ...
## $ 14 : num -0.207 -0.195 -0.219 -0.219 -0.219 ...
## $ 15 : num -0.189 -0.207 -0.218 -0.218 -0.218 ...
## $ 16 : num -1.333 -1.464 -1.73 0.648 1.007 ...
## $ 17 : num -0.436 0.388 0.878 1.771 1.776 ...
## $ 18 : num 1.364 -0.732 -0.732 -0.732 1.364 ...
## $ Class: num 1 1 2 2 1 2 1 1 1 2 ...
# NOTE: the `##` output recorded below for the test set (and the predictions,
# confusion matrix and accuracy further down) was captured when the test set
# was scaled with its own mean/sd; re-run the script to refresh it.
str(test_set)
## 'data.frame': 384 obs. of 19 variables:
## $ 1 : num 0.291 0.291 0.291 0.291 0.291 ...
## $ 2 : num -0.614 0.864 0.164 -1.314 -0.692 ...
## $ 3 : num -0.584 0.908 0.204 -1.33 -0.708 ...
## $ 4 : num -0.624 0.993 0.207 -1.323 -0.799 ...
## $ 5 : num -0.721 0.963 0.355 -1.282 -0.861 ...
## $ 6 : num -0.691 0.88 0.373 -1.4 -0.843 ...
## $ 7 : num -0.576 0.727 0.466 -1.358 -0.771 ...
## $ 8 : num -0.1036 -0.1371 -0.6263 -0.763 0.0563 ...
## $ 9 : num 0.02133 0.21402 -0.75347 -0.63576 0.00964 ...
## $ 10 : num -0.504 0.339 -0.596 -0.693 -0.249 ...
## $ 11 : num -0.381 0.63 -0.361 -0.399 -0.324 ...
## $ 12 : num -0.21 0.194 -0.211 -0.211 -0.211 ...
## $ 13 : num -0.1974 0.0469 -0.2 -0.2 -0.2 ...
## $ 14 : num -0.219 -0.148 -0.227 -0.227 -0.227 ...
## $ 15 : num -0.201 -0.185 -0.217 -0.217 -0.217 ...
## $ 16 : num -0.0385 0.3087 -0.6665 1.8863 -0.7623 ...
## $ 17 : num 1.937 1.054 0.953 -2.145 0.4 ...
## $ 18 : num -0.669 -0.669 -0.669 1.49 -0.669 ...
## $ Class: num 1 2 2 1 2 2 1 2 1 2 ...
# Creating a model
# Classify each test row by majority vote among its 5 nearest training rows.
# Column 19 (Class) supplies the training labels and is excluded from the
# predictors on both sides.
library(class)
y_pred <- knn(
  train = train_set[-19],
  test = test_set[-19],
  cl = train_set[[19]],
  k = 5
)
y_pred
## [1] 1 1 1 1 1 2 1 1 1 2 1 2 1 2 1 2 2 2 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 1 2 1 2
## [38] 1 2 2 2 2 2 1 1 2 2 2 2 1 2 2 2 2 2 1 2 1 1 1 1 1 1 1 1 2 2 1 2 1 2 1 2 2
## [75] 2 2 2 2 1 2 2 1 1 1 1 2 2 2 1 2 2 2 1 1 1 1 1 1 1 1 2 1 1 1 1 2 2 1 2 1 1
## [112] 1 1 2 2 2 2 1 2 2 2 2 1 2 1 2 2 2 2 2 2 2 2 2 1 2 1 1 2 1 1 2 2 1 2 1 1 1
## [149] 2 1 2 1 2 1 1 2 2 1 2 1 1 2 1 2 2 2 2 2 1 2 1 2 1 2 2 2 2 1 2 1 1 2 2 1 1
## [186] 1 2 1 1 1 2 2 2 2 1 1 1 2 1 2 2 2 1 2 1 2 1 1 1 1 1 2 2 2 2 2 1 1 1 1 2 1
## [223] 2 2 2 2 1 2 2 1 1 1 2 2 1 2 1 2 1 2 2 1 1 2 1 1 2 1 1 1 2 1 1 2 2 2 2 2 2
## [260] 1 1 2 2 1 2 1 2 2 1 2 1 2 2 1 2 2 2 1 1 2 2 2 1 1 2 1 1 1 1 2 2 2 2 2 2 2
## [297] 2 1 1 1 2 2 2 2 1 1 2 2 1 2 1 2 2 2 2 1 1 1 1 2 2 2 2 2 1 2 1 1 1 2 1 2 1
## [334] 2 1 1 1 2 2 1 1 2 2 2 2 1 1 1 1 2 1 1 2 1 1 1 2 1 2 1 2 2 1 1 2 2 1 2 2 2
## [371] 1 1 1 1 1 1 1 1 2 2 2 1 1 1
## Levels: 1 2
# Confusion matrix: rows are the true test labels, columns the k-NN predictions
cmat <- table(test_set[[19]], y_pred)
cmat
## y_pred
## 1 2
## 1 112 68
## 2 71 133
# Accuracy: correctly classified cases (the diagonal) over all test cases
sum(diag(cmat)) / sum(cmat)
## [1] 0.6380208
# Scatter plot of the columns named "2" and "3" of `dataset`, with a
# one-entry legend placed at the top of the plot.
plot(
  x = dataset[["2"]],
  y = dataset[["3"]],
  xlab = "2",
  ylab = "3",
  main = "Feature 2 vs Feature 3 ",
  col = "blue"
)
legend("top", legend = " 20MID0184")
