This data set was generated to model psychological experimental results. Each example is classified as having the balance scale tip to the right, tip to the left, or be balanced. The attributes are the left weight, the left distance, the right weight, and the right distance.
DATASET USED: Balance-Scale dataset
No. of Instances - 625
No. of Attributes - 5
No. of independent variables - 4
No. of dependent variables - 1
ATTRIBUTE INFORMATION
Class Name: 3 (L, B, R) L represents Left-inclined, R represents Right-inclined and B represents Balanced weight (Categorical data)
Left-Weight: 5 (1, 2, 3, 4, 5) - LW Represents the value of the left-weight (Ordinal data) which may take values from 1-5.
Left-Distance: 5 (1, 2, 3, 4, 5) - LD Represents the value of the left-distance from the center of the balance (Ordinal data) which may take values from 1 - 5.
Right-Weight: 5 (1, 2, 3, 4, 5) - RW Represents the value of the right-weight (Ordinal data) which may take values from 1-5.
Right-Distance: 5 (1, 2, 3, 4, 5) - RD Represents the value of the right-distance from the center of the balance (Ordinal data) which may take values from 1 - 5.
# Decision-tree section: load the Balance-Scale data, shuffle the rows and
# make the four predictor columns numeric.
getwd()
## [1] "D:/VIT/ACADEMICS/VIT SEM 5/DATA SCIENCE PROGRAMMING"
setwd("D:/VIT/ACADEMICS/VIT SEM 5/DATA SCIENCE PROGRAMING LAB/DATASET")
dataset <- read.csv("balance-scale.csv")
View(dataset)
# Permute the row order. seq_len() is used instead of 1:nrow() so that an
# empty file yields an empty index rather than c(1, 0).
dataset <- dataset[sample(seq_len(nrow(dataset))), ]
View(dataset)
dim(dataset)
## [1] 625 5
# Coerce the four predictors to double; Class is left as read from the file.
dataset <- transform(
  dataset,
  L.Weight = as.numeric(L.Weight),
  L.Distance = as.numeric(L.Distance),
  R.Weight = as.numeric(R.Weight),
  R.Distance = as.numeric(R.Distance)
)
str(dataset)
## 'data.frame': 625 obs. of 5 variables:
## $ Class : chr "B" "L" "R" "R" ...
## $ L.Weight : num 2 3 3 3 2 2 5 3 1 4 ...
## $ L.Distance: num 2 4 1 1 2 4 2 1 4 5 ...
## $ R.Weight : num 2 2 4 2 4 3 3 1 5 3 ...
## $ R.Distance: num 2 3 3 3 2 2 2 1 2 3 ...
# Class is a character column here, so summary() only reports length and mode.
summary(dataset$Class)
## Length Class Mode
## 625 character character
library(caTools)
## Warning: package 'caTools' was built under R version 4.1.3
# Fix the RNG seed before drawing the train/test partition.
set.seed(18)
# One flag per row; rows flagged TRUE go to the training set (ratio 0.8).
split <- sample.split(dataset$Class, SplitRatio = 0.8)
# TRUE/FALSE are spelled out: T and F are ordinary variables that can be
# reassigned, which would silently change which rows are selected.
train_set <- subset(dataset, split == TRUE)
dim(train_set)
## [1] 499 5
test_set <- subset(dataset, split == FALSE)
dim(test_set)
## [1] 126 5
library(rpart)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.1.3
# Grow a classification tree that predicts Class from every other column.
fit <- rpart(
  formula = Class ~ .,
  data = train_set,
  method = "class"
)
# Draw the fitted tree.
rpart.plot(fit, type = 4)
# Predicted class label for each row of the held-out set.
y_pred <- predict(object = fit, newdata = test_set, type = "class")
print(y_pred)
## 157 488 445 140 521 553 5 472 56 241 454 325 436 453 191 208 535 288 256 619
## L L R R L L R L L R L R L L R L L R R L
## 475 154 243 559 571 425 190 293 168 242 118 110 356 413 28 334 495 126 3 73
## R L R L L R R R R R R R L R R L L R R R
## 155 221 250 207 67 467 541 430 38 455 37 625 142 111 417 170 591 163 158 277
## L R R L R L L L R L R L R R R R L R L L
## 551 364 380 542 590 218 59 174 193 329 580 302 338 162 98 513 517 422 229 372
## L R L R R R R R R L L L L R R R R R L L
## 27 434 80 113 335 423 90 49 557 555 574 180 610 447 144 101 357 450 287 42
## L L R R L R R R L L R L L L R L L R R R
## 411 510 419 509 298 116 7 246 260 166 520 458 77 483 500 135 355 252 410 83
## L L R L R R R R R R R L L L L R L R L R
## 280 9 54 342 568 78
## L R R L L R
## Levels: B L R
# Confusion matrix: rows are the actual classes, columns the predicted ones.
cm <- table(test_set$Class, y_pred)
print(cm)
## y_pred
## B L R
## B 0 7 3
## L 0 44 14
## R 0 7 51
# Accuracy: diagonal cells (actual == predicted) as a share of all test rows.
acc <- sum(diag(cm) / sum(cm))
print(acc)
## [1] 0.7539683
# Naive Bayes section: reload the data and turn every column into a number.
getwd()
## [1] "D:/VIT/ACADEMICS/VIT SEM 5/DATA SCIENCE PROGRAMMING"
setwd("D:/VIT/ACADEMICS/VIT SEM 5/DATA SCIENCE PROGRAMING LAB/DATASET")
dataset <- read.csv("balance-scale.csv")
View(dataset)
# Replace each class letter by its position in c("R", "L", "B"),
# i.e. R -> 1, L -> 2, B -> 3 (the codes shown by str() and summary() below).
dataset$Class <- match(dataset$Class, c("R", "L", "B"))
# Shuffle the rows, coerce all columns to double, then shuffle once more.
dataset <- dataset[sample(1:nrow(dataset)), ]
dataset <- data.frame(lapply(dataset, as.numeric))
dataset <- dataset[sample(1:nrow(dataset)), ]
View(dataset)
dim(dataset)
## [1] 625 5
str(dataset)
## 'data.frame': 625 obs. of 5 variables:
## $ Class : num 2 1 1 2 1 1 2 2 2 2 ...
## $ L.Weight : num 4 1 2 3 3 2 3 3 5 3 ...
## $ L.Distance: num 5 4 1 5 3 1 5 4 4 2 ...
## $ R.Weight : num 1 3 2 4 5 3 1 4 4 1 ...
## $ R.Distance: num 1 5 3 3 5 2 3 2 1 1 ...
summary(dataset$Class)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 2.000 1.618 2.000 3.000
# Naive Bayes: split the numerically coded data, fit, predict and score.
library(caTools)
set.seed(18)
# One flag per row; rows flagged TRUE go to the training set (ratio 0.8).
split <- sample.split(dataset$Class, SplitRatio = 0.8)
training_set <- subset(dataset, split == TRUE)
dim(training_set)
## [1] 499 5
test_set <- subset(dataset, split == FALSE)
library(e1071)
## Warning: package 'e1071' was built under R version 4.1.3
# Fit on the partition created just above. The earlier version stored the
# split in `training_set` but trained on `train_set`, the decision-tree
# section's partition (different shuffle, labels B/L/R), so the model was
# neither fitted on this split nor labelled like `test_set`.
# The class is passed as a factor so predictions come back with levels
# "1", "2", "3" (R, L, B).
classifier <- naiveBayes(
  x = training_set[, -1],
  y = factor(training_set$Class)
)
classifier
summary(classifier)
y_pred <- predict(object = classifier, newdata = test_set)
y_pred
# Give the actual classes the same levels as the predictions so that the
# diagonal of the table always pairs a class with itself, even when a class
# is absent from the test rows.
cm <- table(factor(test_set$Class, levels = levels(y_pred)), y_pred)
cm
# Accuracy: correctly classified rows as a share of all test rows.
acc <- sum(diag(cm)) / sum(cm)
acc
# NOTE: the output previously recorded for this section (classifier tables,
# predictions, confusion matrix, accuracy 0.468254) came from the stale
# `train_set` model. Its table crossed rows 1/2/3 with columns B/L/R, so the
# accuracy was read off the wrong diagonal. Re-run to record fresh output.
# KNN section: reload the data and turn every column into a number.
getwd()
## [1] "D:/VIT/ACADEMICS/VIT SEM 5/DATA SCIENCE PROGRAMMING"
setwd("D:/VIT/ACADEMICS/VIT SEM 5/DATA SCIENCE PROGRAMING LAB/DATASET")
dataset <- read.csv("balance-scale.csv")
View(dataset)
# Replace each class letter by its position in c("R", "L", "B"),
# i.e. R -> 1, L -> 2, B -> 3 (the codes shown by str() and summary() below).
dataset$Class <- match(dataset$Class, c("R", "L", "B"))
# Shuffle the rows, then coerce all columns to double.
dataset <- dataset[sample(1:nrow(dataset)), ]
dataset <- data.frame(lapply(dataset, as.numeric))
View(dataset)
dim(dataset)
## [1] 625 5
str(dataset)
## 'data.frame': 625 obs. of 5 variables:
## $ Class : num 2 1 1 2 1 2 3 2 1 2 ...
## $ L.Weight : num 4 2 3 2 4 5 4 4 1 1 ...
## $ L.Distance: num 3 5 1 4 5 3 5 4 5 5 ...
## $ R.Weight : num 1 3 2 3 5 3 5 4 5 1 ...
## $ R.Distance: num 4 4 4 2 5 2 4 1 5 2 ...
summary(dataset$Class)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 2.000 1.618 2.000 3.000
# KNN: split the numerically coded data, classify the test rows and score.
library(caTools)
set.seed(18)
# One flag per row; rows flagged TRUE go to the training set (ratio 0.8).
split <- sample.split(dataset$Class, SplitRatio = 0.8)
training_set <- subset(dataset, split == TRUE)
# Report the partition created on the line above. The earlier version printed
# dim(train_set), a leftover from the decision-tree section.
dim(training_set)
## [1] 499 5
test_set <- subset(dataset, split == FALSE)
library(class)
# Classify each test row from its 5 nearest training rows. Column 1 (Class)
# is excluded from the features and supplied as the label vector.
pred <- knn(
  train = training_set[, -1],
  test = test_set[, -1],
  cl = training_set[, 1],
  k = 5
)
pred
## [1] 2 2 1 1 3 1 1 1 2 1 1 1 2 2 1 1 2 1 2 2 1 1 1 2 1 2 1 2 2 1 2 2 2 2 1 1 2
## [38] 2 2 1 1 2 1 2 1 2 1 1 2 2 2 1 1 1 1 2 1 3 1 2 1 1 1 2 3 2 1 2 2 1 2 1 2 2
## [75] 1 2 1 2 1 2 2 2 2 1 1 1 2 2 1 1 2 1 1 2 2 2 2 2 2 2 2 1 2 1 2 2 1 2 2 1 1
## [112] 2 1 2 1 2 2 2 1 2 1 2 1 2 1 3
## Levels: 1 2 3
# Confusion matrix: rows are the actual codes, columns the predicted ones.
cm <- table(test_set[, 1], pred)
cm
## pred
## 1 2 3
## 1 53 4 1
## 2 0 55 3
## 3 5 5 0
# Accuracy: diagonal cells as a share of all test rows.
acc <- sum(diag(cm) / sum(cm))
acc
## [1] 0.8571429
Histogram
Density
# Density
# Kernel density estimate of the right-weight column, drawn without a frame.
den <- density(dataset$R.Weight)
plot(
  den,
  frame = FALSE,
  col = "blue",
  main = "Density plot"
)
# Label box placed at plot coordinates (2.5, 0.10).
legend(x = 2.5, y = 0.10, legend = "20MID0023")
Scatter
# Scatter
# Right weight on the x axis against right distance on the y axis.
plot(
  x = dataset$R.Weight,
  y = dataset$R.Distance,
  main = "Scatter Plot"
)
# Label box placed at plot coordinates (2, 3).
legend(x = 2, y = 3, legend = "20MID0023")
Barplot
# Number of rows for each distinct right-distance value.
count <- table(dataset$R.Distance)
# One red bar per right-distance value, with the row count as its height.
barplot(height = count, col = "red", main = "Bar Plot")
# Label box placed at plot coordinates (3, 90).
legend(x = 3, y = 90, legend = "20MID0023")
Violin plot
# Violin Plot
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.1.3
# Distribution of R.Weight for each L.Weight value.
# - Columns are named bare inside aes(); the earlier `dataset$...` form is
#   what triggered ggplot2's "Use of `dataset$L.Weight` is discouraged"
#   warnings.
# - L.Weight is wrapped in factor() so each of its values gets its own
#   violin; as a plain numeric x the rows are not split into groups.
ggplot(dataset, aes(x = factor(L.Weight), y = R.Weight)) +
  geom_violin() +
  xlab("L.Weight")
Heatmap
# Turn the whole data frame into a matrix and draw it as a heat map.
map <- as.matrix(dataset)
heatmap(x = map)
Lollipop
# Lollipop Plot
# A point at every (L.Weight, R.Weight) pair, each joined by a segment to the
# fixed position (1, 1).
ggplot(data = dataset, mapping = aes(x = L.Weight, y = R.Weight)) +
  geom_segment(
    mapping = aes(x = 1, xend = L.Weight, y = 1, yend = R.Weight)
  ) +
  geom_point()