Aim
To evaluate and compare
2 different classification models on Wine prediction dataset
Literature Analysis
These data are the results of a chemical analysis of wines grown in the same
region in Italy but derived from three different cultivars. The analysis
determined the quantities of 13 constituents found in each of the three
types of wines.
The attributes are
1) Alcohol
2) Malic
acid
3) Ash
4) Alcalinity of ash
5) Magnesium
6) Total
phenols
7) Flavanoids
8) Nonflavanoid phenols
9)
Proanthocyanins
10)Color intensity
11)Hue
12)OD280/OD315 of
diluted wines
13)Proline
Number of Instances
class 1 59
class 2 71
class 3 48
Number of Attributes: 13
Missing Attribute Values: None
Class Distribution: number of instances per
class
class 1 59
class 2 71
class 3 48
Wine Dataset
Importing the
data
# Load the wine measurements from the CSV file into a data frame
df <- read.csv(file = "wine.csv")
# Preview the first six rows
head(df, n = 6L)
## ï..Type Alcohol Malic.acid Ash Alcalinity.of.ash Magnesium Total.phenols
## 1 1 14.23 1.71 2.43 15.6 127 2.80
## 2 1 13.20 1.78 2.14 11.2 100 2.65
## 3 1 13.16 2.36 2.67 18.6 101 2.80
## 4 1 14.37 1.95 2.50 16.8 113 3.85
## 5 1 13.24 2.59 2.87 21.0 118 2.80
## 6 1 14.20 1.76 2.45 15.2 112 3.27
## Flavanoids Nonflavanoid.phenols Proanthocyanins Color.intensity Hue
## 1 3.06 0.28 2.29 5.64 1.04
## 2 2.76 0.26 1.28 4.38 1.05
## 3 3.24 0.30 2.81 5.68 1.03
## 4 3.49 0.24 2.18 7.80 0.86
## 5 2.69 0.39 1.82 4.32 1.04
## 6 3.39 0.34 1.97 6.75 1.05
## OD280.OD315.of.diluted.wines Proline
## 1 3.92 1065
## 2 3.40 1050
## 3 3.17 1185
## 4 3.45 1480
## 5 2.93 735
## 6 2.85 1450
# Number of rows and columns in the data frame
dim(df)
## [1] 178 14
# Default plot of the whole data frame
plot(df)
# Column names, types and leading values
str(df)
## 'data.frame': 178 obs. of 14 variables:
## $ ï..Type : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Alcohol : num 14.2 13.2 13.2 14.4 13.2 ...
## $ Malic.acid : num 1.71 1.78 2.36 1.95 2.59 1.76 1.87 2.15 1.64 1.35 ...
## $ Ash : num 2.43 2.14 2.67 2.5 2.87 2.45 2.45 2.61 2.17 2.27 ...
## $ Alcalinity.of.ash : num 15.6 11.2 18.6 16.8 21 15.2 14.6 17.6 14 16 ...
## $ Magnesium : int 127 100 101 113 118 112 96 121 97 98 ...
## $ Total.phenols : num 2.8 2.65 2.8 3.85 2.8 3.27 2.5 2.6 2.8 2.98 ...
## $ Flavanoids : num 3.06 2.76 3.24 3.49 2.69 3.39 2.52 2.51 2.98 3.15 ...
## $ Nonflavanoid.phenols : num 0.28 0.26 0.3 0.24 0.39 0.34 0.3 0.31 0.29 0.22 ...
## $ Proanthocyanins : num 2.29 1.28 2.81 2.18 1.82 1.97 1.98 1.25 1.98 1.85 ...
## $ Color.intensity : num 5.64 4.38 5.68 7.8 4.32 6.75 5.25 5.05 5.2 7.22 ...
## $ Hue : num 1.04 1.05 1.03 0.86 1.04 1.05 1.02 1.06 1.08 1.01 ...
## $ OD280.OD315.of.diluted.wines: num 3.92 3.4 3.17 3.45 2.93 2.85 3.58 3.58 2.85 3.55 ...
## $ Proline : int 1065 1050 1185 1480 735 1450 1290 1295 1045 1045 ...
# Per-column summary statistics (min, quartiles, mean, max)
summary(df)
## ï..Type Alcohol Malic.acid Ash
## Min. :1.000 Min. :11.03 Min. :0.740 Min. :1.360
## 1st Qu.:1.000 1st Qu.:12.36 1st Qu.:1.603 1st Qu.:2.210
## Median :2.000 Median :13.05 Median :1.865 Median :2.360
## Mean :1.938 Mean :13.00 Mean :2.336 Mean :2.367
## 3rd Qu.:3.000 3rd Qu.:13.68 3rd Qu.:3.083 3rd Qu.:2.558
## Max. :3.000 Max. :14.83 Max. :5.800 Max. :3.230
## Alcalinity.of.ash Magnesium Total.phenols Flavanoids
## Min. :10.60 Min. : 70.00 Min. :0.980 Min. :0.340
## 1st Qu.:17.20 1st Qu.: 88.00 1st Qu.:1.742 1st Qu.:1.205
## Median :19.50 Median : 98.00 Median :2.355 Median :2.135
## Mean :19.49 Mean : 99.74 Mean :2.295 Mean :2.029
## 3rd Qu.:21.50 3rd Qu.:107.00 3rd Qu.:2.800 3rd Qu.:2.875
## Max. :30.00 Max. :162.00 Max. :3.880 Max. :5.080
## Nonflavanoid.phenols Proanthocyanins Color.intensity Hue
## Min. :0.1300 Min. :0.410 Min. : 1.280 Min. :0.4800
## 1st Qu.:0.2700 1st Qu.:1.250 1st Qu.: 3.220 1st Qu.:0.7825
## Median :0.3400 Median :1.555 Median : 4.690 Median :0.9650
## Mean :0.3619 Mean :1.591 Mean : 5.058 Mean :0.9574
## 3rd Qu.:0.4375 3rd Qu.:1.950 3rd Qu.: 6.200 3rd Qu.:1.1200
## Max. :0.6600 Max. :3.580 Max. :13.000 Max. :1.7100
## OD280.OD315.of.diluted.wines Proline
## Min. :1.270 Min. : 278.0
## 1st Qu.:1.938 1st Qu.: 500.5
## Median :2.780 Median : 673.5
## Mean :2.612 Mean : 746.9
## 3rd Qu.:3.170 3rd Qu.: 985.0
## Max. :4.000 Max. :1680.0
# The first column was read in under a mangled name; call it "Type"
names(df)[1L] <- "Type"
# Confirm the new column name
str(df)
## 'data.frame': 178 obs. of 14 variables:
## $ Type : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Alcohol : num 14.2 13.2 13.2 14.4 13.2 ...
## $ Malic.acid : num 1.71 1.78 2.36 1.95 2.59 1.76 1.87 2.15 1.64 1.35 ...
## $ Ash : num 2.43 2.14 2.67 2.5 2.87 2.45 2.45 2.61 2.17 2.27 ...
## $ Alcalinity.of.ash : num 15.6 11.2 18.6 16.8 21 15.2 14.6 17.6 14 16 ...
## $ Magnesium : int 127 100 101 113 118 112 96 121 97 98 ...
## $ Total.phenols : num 2.8 2.65 2.8 3.85 2.8 3.27 2.5 2.6 2.8 2.98 ...
## $ Flavanoids : num 3.06 2.76 3.24 3.49 2.69 3.39 2.52 2.51 2.98 3.15 ...
## $ Nonflavanoid.phenols : num 0.28 0.26 0.3 0.24 0.39 0.34 0.3 0.31 0.29 0.22 ...
## $ Proanthocyanins : num 2.29 1.28 2.81 2.18 1.82 1.97 1.98 1.25 1.98 1.85 ...
## $ Color.intensity : num 5.64 4.38 5.68 7.8 4.32 6.75 5.25 5.05 5.2 7.22 ...
## $ Hue : num 1.04 1.05 1.03 0.86 1.04 1.05 1.02 1.06 1.08 1.01 ...
## $ OD280.OD315.of.diluted.wines: num 3.92 3.4 3.17 3.45 2.93 2.85 3.58 3.58 2.85 3.55 ...
## $ Proline : int 1065 1050 1185 1480 735 1450 1290 1295 1045 1045 ...
# Treat the wine class as a categorical variable instead of an integer
df$Type <- as.factor(df$Type)
Creating a training and test set
library(caTools)
## Warning: package 'caTools' was built under R version 4.1.3
# Fix the RNG seed so the random train/test split, and every accuracy figure
# derived from it, is identical on each run. Outputs recorded from an
# unseeded run will differ until the report is re-knitted.
set.seed(123)
# 70/30 split on the class label; TRUE marks rows assigned to training
split <- sample.split(Y = df$Type, SplitRatio = 0.7)
training_set <- subset(df, split == TRUE)
test_set <- subset(df, split == FALSE)
Algorithm 1: Decision Tree Algorithm
Building the model-fitting decision tree
library(rpart)
## Warning: package 'rpart' was built under R version 4.1.3
# Grow a classification tree predicting Type from all other columns
fit <- rpart(Type ~ ., data = training_set, method = "class")
# Base-graphics drawing of the tree, then its split labels
plot(fit)
text(fit)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.1.3
# Same tree drawn with rpart.plot
rpart.plot(fit)
# Predicted class label for every test-set row
predict_unseen <- predict(fit, newdata = test_set, type = "class")
Creating Confusion matrix and predicting the accuracy of given dataset
library(caret)
## Warning: package 'caret' was built under R version 4.1.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.1.3
## Loading required package: lattice
# Cross-tabulate tree predictions against the true test-set classes
confusionMatrix(data = as.factor(predict_unseen), reference = test_set$Type)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3
## 1 16 1 0
## 2 2 18 2
## 3 0 2 12
##
## Overall Statistics
##
## Accuracy : 0.8679
## 95% CI : (0.7466, 0.9452)
## No Information Rate : 0.3962
## P-Value [Acc > NIR] : 1.597e-12
##
## Kappa : 0.7989
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3
## Sensitivity 0.8889 0.8571 0.8571
## Specificity 0.9714 0.8750 0.9487
## Pos Pred Value 0.9412 0.8182 0.8571
## Neg Pred Value 0.9444 0.9032 0.9487
## Prevalence 0.3396 0.3962 0.2642
## Detection Rate 0.3019 0.3396 0.2264
## Detection Prevalence 0.3208 0.4151 0.2642
## Balanced Accuracy 0.9302 0.8661 0.9029
Algorithm 2: Naive Bayes Algorithm
Building the model-fitting naive bayes
# Fit a naive Bayes classifier: every column except the first (Type) is a
# predictor, and Type is passed separately as the class label
library(e1071)
## Warning: package 'e1071' was built under R version 4.1.3
classifier <- naiveBayes(x = training_set[-1], y = training_set$Type)
# List the components of the fitted model object
summary(classifier)
## Length Class Mode
## apriori 3 table numeric
## tables 13 -none- list
## levels 3 -none- character
## isnumeric 13 -none- logical
## call 3 -none- call
Predicting the data
# Predict the class of each test-set row with the naive Bayes model
y_pred <- predict(classifier, newdata = test_set)
Creating confusion matrix and predicting the accuracy of given dataset
# Cross-tabulate naive Bayes predictions against the true test-set classes
confusionMatrix(data = as.factor(y_pred), reference = test_set$Type)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3
## 1 17 0 0
## 2 1 21 0
## 3 0 0 14
##
## Overall Statistics
##
## Accuracy : 0.9811
## 95% CI : (0.8993, 0.9995)
## No Information Rate : 0.3962
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9713
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3
## Sensitivity 0.9444 1.0000 1.0000
## Specificity 1.0000 0.9688 1.0000
## Pos Pred Value 1.0000 0.9545 1.0000
## Neg Pred Value 0.9722 1.0000 1.0000
## Prevalence 0.3396 0.3962 0.2642
## Detection Rate 0.3208 0.3962 0.2642
## Detection Prevalence 0.3208 0.4151 0.2642
## Balanced Accuracy 0.9722 0.9844 1.0000
Comparative Statements
The
accuracy of Naive Bayes is 98.11%, whereas the accuracy of the Decision
Tree algorithm is 86.79%.
Result
Naive Bayes achieves
better prediction accuracy than the Decision Tree on this test set.
Graph Analysis
Boxplot
# BOXPLOT
# Distribution of alcohol content
boxplot(df$Alcohol, main = "Box plot df$Alcohol 20MID0108")
# Distribution of ash content
boxplot(df$Ash, main = "Box plot df$Ash 20MID0108")
Histogram
# HISTOGRAM
# One histogram per attribute, each in its own fill colour
hist(df$Alcohol, main = " 20MID0108", col = "blue")
hist(df$Malic.acid, main = " 20MID0108", col = "green")
hist(df$Alcalinity.of.ash, main = " 20MID0108", col = "yellow")
Scatter plot
# SCATTER PLOT
# Plot of the whole data frame with a title
plot(df, main = "Scatter_plot_20MID0108")
Lollipop Plot
# LOLLIPOP CHART
# Each lollipop is one row (sample) of the data set. The row identifiers go
# into their own column of a plotting copy, so the wine class stored in
# df$Type is not overwritten and df itself is left unchanged.
lollipop_df <- df
lollipop_df$Sample <- row.names(df)
library(ggplot2)
# ggplot() has no `main` argument, so the title is added with ggtitle().

# Ash for every sample
ggplot(lollipop_df, aes(x = Ash, y = Sample)) +
  geom_segment(aes(x = 0, y = Sample, xend = Ash, yend = Sample)) +
  geom_point() +
  ggtitle("20MID0108")

# Flavanoids for every sample
ggplot(lollipop_df, aes(x = Flavanoids, y = Sample)) +
  geom_segment(aes(x = 0, y = Sample, xend = Flavanoids, yend = Sample)) +
  geom_point() +
  ggtitle("20MID0108")

# Total.phenols for every sample
ggplot(lollipop_df, aes(x = Total.phenols, y = Sample)) +
  geom_segment(aes(x = 0, y = Sample, xend = Total.phenols, yend = Sample)) +
  geom_point() +
  ggtitle("20MID0108")
Correlogram
# CORRELOGRAM
# Pairwise correlations of columns 2 to 5, rounded to one decimal place
df1 <- df[2:5]
corr <- round(cor(df1), digits = 1)
#install.packages("ggcorrplot")
library(ggcorrplot)
## Warning: package 'ggcorrplot' was built under R version 4.1.3
# Full correlation matrix
ggcorrplot(corr) +
  ggtitle("correlogram of attributes 20MID0108")
# Lower triangle only, hc-ordered, with coefficients printed in the cells
ggcorrplot(corr, hc.order = TRUE, type = "lower", lab = TRUE) +
  ggtitle("correlogram of attributes 20MID0108")
Line Plot
# LINE PLOT
# Values in row order, drawn as points joined by lines (type "o")
plot(df$Alcohol, type = "o", main = "20MID0108", col = "blue")
plot(df$Malic.acid, type = "o", main = "20MID0108", col = "green")
plot(df$Ash, type = "o", main = "20MID0108", col = "orange")