AIM/Objective

This data set was generated to model psychological experimental results. Each example is classified as having the balance scale tip to the right, tip to the left, or be balanced. The attributes are the left weight, the left distance, the right weight, and the right distance.

LITERATURE SURVEY OF YOUR DATASET

DATASET USED: Balance-Scale dataset

No. of Instances - 625
No. of Attributes - 5
No. of independent variables - 4
No. of dependent variables - 1

ATTRIBUTE INFORMATION

  1. Class Name: 3 (L, B, R) L represents Left-inclined, R represents Right-inclined and B represents Balanced weight (Categorical data)

  2. Left-Weight: 5 (1, 2, 3, 4, 5) - LW Represents the value of the left-weight (Ordinal data) which may take values from 1-5.

  3. Left-Distance: 5 (1, 2, 3, 4, 5) - LD Represents the value of the left-distance from the center of the balance (Ordinal data) which may take values from 1 - 5.

  4. Right-Weight: 5 (1, 2, 3, 4, 5) - RW Represents the value of the right-weight (Ordinal data) which may take values from 1-5.

  5. Right-Distance: 5 (1, 2, 3, 4, 5) - RD Represents the value of the right-distance from the center of the balance (Ordinal data) which may take values from 1 - 5.

USING DECISION TREE

# Decision tree on the balance-scale data: load the CSV, make an 80/20 split
# stratified on Class, fit an rpart classification tree and score it on the
# held-out rows.
#
# NOTE: lines starting with `##` are captured console output. Output that
# depends on the random split has been dropped here because the split is now
# seeded differently; re-run the section to regenerate it.
library(caTools)
## Warning: package 'caTools' was built under R version 4.1.3
library(rpart)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.1.3

# Read through an explicit path instead of setwd(), so the section does not
# change the session's working directory as a side effect.
data_dir <- "D:/VIT/ACADEMICS/VIT SEM 5/DATA SCIENCE PROGRAMING LAB/DATASET"
dataset <- read.csv(file.path(data_dir, "balance-scale.csv"))
if (interactive()) View(dataset)
dim(dataset)
## [1] 625   5

# Class becomes a factor so summary() reports per-class counts; the four
# predictors are stored as doubles.
dataset$Class <- factor(dataset$Class)
predictors <- c("L.Weight", "L.Distance", "R.Weight", "R.Distance")
dataset[predictors] <- lapply(dataset[predictors], as.numeric)
str(dataset)
summary(dataset$Class)

# Seed BEFORE any random step. No separate row shuffle is needed:
# sample.split() already draws the train/test membership at random while
# preserving the class ratios.
set.seed(18)
split <- sample.split(dataset$Class, SplitRatio = 0.8)
train_set <- subset(dataset, split == TRUE)
dim(train_set)
## [1] 499   5
test_set <- subset(dataset, split == FALSE)
dim(test_set)
## [1] 126   5

# Fit and draw the classification tree.
fit <- rpart(formula = Class ~ ., data = train_set, method = "class")
rpart.plot(fit, type = 4)

# Predict the class of every held-out row.
y_pred <- predict(object = fit, newdata = test_set, type = "class")
y_pred

# Confusion matrix: rows are the true class, columns the predicted class.
cm <- table(test_set$Class, y_pred)
cm

# Accuracy = correctly classified rows / all test rows.
# (The earlier run, whose shuffle was not seeded, recorded 0.7539683.)
acc <- sum(diag(cm)) / sum(cm)
acc

USING NAIVE BAYES

# Naive Bayes on the balance-scale data: load the CSV, make an 80/20 split
# stratified on Class, fit e1071::naiveBayes and score it on the held-out rows.
#
# NOTE: lines starting with `##` are captured console output. The model print,
# predictions, confusion matrix and accuracy that used to follow were produced
# by a model trained on a leftover `train_set` object from another section and
# have been dropped; re-run the section to regenerate them.
library(caTools)
library(e1071)
## Warning: package 'e1071' was built under R version 4.1.3

# Read through an explicit path instead of setwd(), so the section does not
# change the session's working directory as a side effect.
data_dir <- "D:/VIT/ACADEMICS/VIT SEM 5/DATA SCIENCE PROGRAMING LAB/DATASET"
dataset <- read.csv(file.path(data_dir, "balance-scale.csv"))
if (interactive()) View(dataset)

# Keep Class as a labelled factor so the true and predicted classes share the
# same levels in the confusion matrix. Only the predictors are made numeric.
dataset$Class <- factor(dataset$Class, levels = c("R", "L", "B"))
predictors <- setdiff(names(dataset), "Class")
dataset[predictors] <- lapply(dataset[predictors], as.numeric)
dim(dataset)
## [1] 625   5
str(dataset)
summary(dataset$Class)

# Seed BEFORE any random step. No separate row shuffle is needed:
# sample.split() already draws the train/test membership at random while
# preserving the class ratios.
set.seed(18)
split <- sample.split(dataset$Class, SplitRatio = 0.8)
training_set <- subset(dataset, split == TRUE)
dim(training_set)
## [1] 499   5
test_set <- subset(dataset, split == FALSE)

# Train on the split made just above (training_set), never on an object left
# over from another section, so no test row can leak into training.
classifier <- naiveBayes(x = training_set[predictors], y = training_set$Class)
classifier
summary(classifier)
##           Length Class  Mode     
## apriori   3      table  numeric  
## tables    4      -none- list     
## levels    3      -none- character
## isnumeric 4      -none- logical  
## call      3      -none- call

# Predict from the predictor columns only; the label is not passed in.
y_pred <- predict(object = classifier, newdata = test_set[predictors])
y_pred

# Rows (actual) and columns (predicted) use the same factor levels in the same
# order, so the diagonal really is the set of correct predictions.
cm <- table(actual = test_set$Class, predicted = y_pred)
cm

# Accuracy = correctly classified rows / all test rows.
acc <- sum(diag(cm)) / sum(cm)
acc

USING KNN

# k-nearest-neighbours (k = 5) on the balance-scale data: load the CSV, make an
# 80/20 split stratified on Class and classify the held-out rows.
#
# NOTE: lines starting with `##` are captured console output. Output that
# depends on the random split has been dropped here because the split is now
# seeded differently; re-run the section to regenerate it.
library(caTools)
library(class)

# Read through an explicit path instead of setwd(), so the section does not
# change the session's working directory as a side effect.
data_dir <- "D:/VIT/ACADEMICS/VIT SEM 5/DATA SCIENCE PROGRAMING LAB/DATASET"
dataset <- read.csv(file.path(data_dir, "balance-scale.csv"))
if (interactive()) View(dataset)

# Every column is stored as a number. Class is replaced by the integer code of
# its factor level, in the order given here: 1 = R, 2 = L, 3 = B.
dataset$Class <- factor(dataset$Class, levels = c("R", "L", "B"))
dataset <- data.frame(lapply(dataset, as.numeric))
if (interactive()) View(dataset)
dim(dataset)
## [1] 625   5
str(dataset)
summary(dataset$Class)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   1.000   2.000   1.618   2.000   3.000

# Seed BEFORE any random step. No separate row shuffle is needed:
# sample.split() already draws the train/test membership at random while
# preserving the class ratios.
set.seed(18)
split <- sample.split(dataset$Class, SplitRatio = 0.8)
training_set <- subset(dataset, split == TRUE)
# Report the size of the object that is actually used for training.
dim(training_set)
## [1] 499   5
test_set <- subset(dataset, split == FALSE)

# Column 1 is the class code; the remaining columns are the features.
pred <- knn(
  train = training_set[, -1],
  test = test_set[, -1],
  cl = training_set[, 1],
  k = 5
)
pred

# Confusion matrix: rows are the true class code, columns the predicted code.
cm <- table(test_set[, 1], pred)
cm

# Accuracy = correctly classified rows / all test rows.
# (The earlier run, whose shuffle was not seeded, recorded 0.8571429.)
acc <- sum(diag(cm)) / sum(cm)
acc

INCLUDING PLOTS

Histogram

Density

# Density: kernel density estimate of the R.Weight column, drawn in blue
# without the surrounding box.
den <- density(dataset$R.Weight)
plot(den, main = "Density plot", col = "blue", frame = FALSE)
# Text label placed at plot coordinates (2.5, 0.10).
legend(x = 2.5, y = 0.10, legend = "20MID0023")

Scatter

# Scatter: R.Weight on the x axis against R.Distance on the y axis.
plot(
  x = dataset$R.Weight,
  y = dataset$R.Distance,
  main = "Scatter Plot"
)
# Text label placed at plot coordinates (2, 3).
legend(x = 2, y = 3, legend = "20MID0023")

Barplot

# Bar plot: number of rows for each distinct R.Distance value.
count <- table(dataset$R.Distance)
barplot(height = count, col = "red", main = "Bar Plot")
# Text label placed at plot coordinates (3, 90).
legend(x = 3, y = 90, legend = "20MID0023")

Violin plot

#Violin Plot
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.1.3
# Columns are referenced by bare name inside aes(); ggplot2 warns when
# `dataset$...` is used there. The x variable is wrapped in factor() so that
# one violin is drawn per L.Weight value; with a continuous x and no grouping
# all rows fall into a single group.
ggplot(dataset, aes(x = factor(L.Weight), y = R.Weight)) +
  geom_violin() +
  labs(x = "L.Weight")

Heatmap

# Heatmap over every column of the data frame.
# heatmap() needs a numeric matrix, so this assumes all columns of `dataset`
# are numeric at this point.
map <- as.matrix(dataset)
heatmap(x = map)

Lollipop

# Lollipop Plot: a segment from (1, 1) to each (L.Weight, R.Weight) pair, with
# a point drawn at the (L.Weight, R.Weight) end.
ggplot(data = dataset, mapping = aes(x = L.Weight, y = R.Weight)) +
  geom_segment(
    mapping = aes(x = 1, xend = L.Weight, y = 1, yend = R.Weight)
  ) +
  geom_point()