Problem
To analyze the given "Car Evaluation Database" dataset, build a machine-learning model from it, and check the model's accuracy.
Contents
Import the Dataset
First we import the dataset, assign a name to each column, and view the result.
#Importing the data (the raw car.data file has no header row)
ds=read.csv("car.data",header=FALSE)
View(ds)
#Naming the columns
colnames(ds)=c("buying","maint","doors","persons","lug_boot","safety","class")
#Fix the random seed so the shuffling and splitting below are reproducible
set.seed(777)
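A portability note: since R 4.0, read.csv no longer converts strings to factors by default, while rpart and glm below expect factor columns. An explicit conversion guards against this (a minimal sketch):
#Make every column a factor (a no-op on R < 4.0, where this already holds)
ds[]=lapply(ds,as.factor)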
Now let's look at the dimensions, a glimpse, and a summary of the dataset.
#dplyr provides glimpse()
library(dplyr)
#dimensions of the dataset
dim(ds)
#simple glimpse of the dataset
glimpse(ds)
#summary of each column
summary(ds)
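Since every column is categorical, the class balance is worth noting now; it matters later when interpreting accuracy. A quick look:
#Proportion of each class label in the full dataset
round(prop.table(table(ds$class)),3)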
Clean Dataset
The summary above would reveal any missing values. In our case there are no missing records, so we can proceed.
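This can also be verified explicitly; a one-line check:
#Count of missing values per column (all zeros means the dataset is complete)
colSums(is.na(ds))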
Plots
All seven attributes are categorical, so bar charts (rather than histograms or density plots) are the appropriate visualisation.
#Visualisation
library(ggplot2)
ggplot(ds,aes(x=class,fill=lug_boot))+geom_bar()+labs(title="Class Vs Luggage boot",subtitle="Bar chart",y="Frequency of Luggage boot",x="Class")
ggplot(ds,aes(x=class,fill=doors))+geom_bar()+labs(title="Class Vs Doors",subtitle="Bar chart",y="Frequency of Doors",x="Class")
ggplot(ds,aes(x=persons,fill=lug_boot))+geom_bar()+facet_wrap(~class)+labs(title="Persons Vs Class",subtitle="Bar chart",y="Frequency of Luggage boot",x="Persons")
ggplot(ds,aes(x=persons,fill=doors))+geom_bar(position="dodge",alpha=0.7)+labs(title="Persons Vs Doors",y="Count",x="Persons")
ggplot(ds,aes(x=maint,fill=maint))+geom_bar(alpha=0.7)+facet_wrap(~class)+labs(title="Maintenance cost within each Class",y="Count",x="Maintenance")
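As an aside, for a pair of categorical variables a base-R mosaic plot is a compact alternative to stacked bars; a minimal sketch:
#Mosaic plot: tile areas are proportional to the joint frequencies of safety and class
mosaicplot(table(ds$safety,ds$class),main="Safety Vs Class",xlab="Safety",ylab="Class")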
Splitting the Dataset
Since the dataset is fairly large, we split it into training and testing sets: we build the model on the training set and judge it by its predictions on the unseen testing set.
#Splitting the data (caTools provides sample.split)
library(caTools)
#Shuffle the rows (the raw file is systematically ordered), then make a stratified 80/20 split on the class column
shuffle_index=sample(1:nrow(ds))
ds=ds[shuffle_index,]
split=sample.split(ds$class,SplitRatio = 0.8)
head(split)
## [1] TRUE TRUE TRUE TRUE TRUE TRUE
#Splitting the dataset into testing and training parts
training_set=subset(ds,split==TRUE)
testing_set=subset(ds,split==FALSE)
View(training_set)
View(testing_set)
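sample.split preserves the relative class frequencies when splitting, which we can confirm by comparing the class proportions of the two sets; a quick check:
#Class proportions should be (almost) identical in the two sets
round(prop.table(table(training_set$class)),3)
round(prop.table(table(testing_set$class)),3)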
Building Model Using Machine Learning Algorithm: Decision Tree
We fit a decision tree on the full set of attributes, plus three reduced trees on subsets of them.
#Decision Tree
library(rpart)
library(rpart.plot)
#Full tree using all attributes
fit=rpart(formula = class ~ .,data=training_set,method = "class")
predict_unseen=predict(object = fit,newdata = testing_set,type = "class")
#Reduced trees using subsets of the attributes
fit1=rpart(formula = class ~ safety+buying+persons,data=training_set,method = "class")
fit2=rpart(formula = class ~ safety+maint+persons,data=training_set,method = "class")
fit3=rpart(formula = class ~ safety+buying+maint+persons,data=training_set,method = "class")
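Before plotting, note that rpart runs 10-fold cross-validation internally while growing the tree; printcp shows the resulting complexity-parameter table for the full tree, a quick overfitting check (a minimal sketch):
#Cross-validated relative error (xerror) for each pruning level of the full tree
printcp(fit)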
Visualizing the Decision Trees
rpart.plot(fit)
rpart.plot(fit1)
rpart.plot(fit2)
rpart.plot(fit3)
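The reduced trees were only plotted above; they can be scored on the testing set in the same way as the full model. A small sketch (the helper name tree_accuracy is ours):
#Hypothetical helper: accuracy of an rpart model on the testing set
tree_accuracy=function(model){
  pred=predict(model,newdata=testing_set,type="class")
  mean(pred==testing_set$class)
}
sapply(list(fit=fit,fit1=fit1,fit2=fit2,fit3=fit3),tree_accuracy)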
Building Model Using Machine Learning Algorithm: Logistic Regression
As a second model we fit a logistic regression for safety. Note that when the response is a multi-level factor, glm's binomial/quasibinomial family treats the first level ("high") as failure and every other level as success, so this models the probability that safety is not "high".
#Logistic Regression
logistic_regressor=glm(formula = safety~.,family=quasibinomial,data=training_set)
summary(logistic_regressor)
##
## Call:
## glm(formula = safety ~ ., family = quasibinomial, data = training_set)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2867 -0.9922 0.5148 0.8490 1.5838
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.1473 0.3051 -7.039 3.06e-12 ***
## buyinglow 0.8622 0.1925 4.480 8.09e-06 ***
## buyingmed 0.6193 0.1811 3.419 0.000646 ***
## buyingvhigh -0.2893 0.1692 -1.710 0.087500 .
## maintlow 0.4872 0.1861 2.618 0.008941 **
## maintmed 0.4419 0.1820 2.428 0.015326 *
## maintvhigh -0.3502 0.1735 -2.018 0.043790 *
## doors3 0.1745 0.1771 0.985 0.324620
## doors4 0.4114 0.1775 2.318 0.020587 *
## doors5more 0.2701 0.1797 1.503 0.133082
## persons4 1.4255 0.1869 7.628 4.45e-14 ***
## personsmore 1.5131 0.1882 8.040 1.93e-15 ***
## lug_bootmed -0.1735 0.1576 -1.100 0.271363
## lug_bootsmall -0.6455 0.1572 -4.106 4.27e-05 ***
## classgood -0.1236 0.3124 -0.396 0.692429
## classunacc 2.5140 0.1880 13.369 < 2e-16 ***
## classvgood -17.2406 320.3333 -0.054 0.957086
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for quasibinomial family taken to be 0.9370174)
##
## Null deviance: 1765.8 on 1380 degrees of freedom
## Residual deviance: 1421.9 on 1364 degrees of freedom
## AIC: NA
##
## Number of Fisher Scoring iterations: 15
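The estimates above are on the log-odds scale; exponentiating gives odds ratios, which are easier to read (a small sketch; the huge standard error on classvgood hints at quasi-complete separation, so that estimate should not be over-interpreted):
#Odds ratios: multiplicative effect of each level on the odds that safety is not "high"
exp(coef(logistic_regressor))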
prob_predict=predict(object=logistic_regressor,newdata=testing_set,type = "response")
#Spineplot of class against safety in the training set (both are factors)
plot(training_set$safety,training_set$class)
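prob_predict contains fitted probabilities rather than labels; a minimal sketch of turning them into predictions and tabulating against the truth (the 0.5 cut-off is an arbitrary assumption):
#Threshold at 0.5 (arbitrary choice): predict whether safety is "high" or not
pred_safety=ifelse(prob_predict>0.5,"not high","high")
table(actual=testing_set$safety,predicted=pred_safety)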
Returning to the decision tree, we evaluate its predictions on the testing set with a confusion matrix:
#Confusion matrix
cm=table(testing_set$class,predict_unseen)
cm
## predict_unseen
## acc good unacc vgood
## acc 73 3 0 1
## good 0 12 0 2
## unacc 7 1 234 0
## vgood 2 0 0 11
Let's find the accuracy and misclassification rate of the model:
#Overall accuracy: fraction of correctly classified cars
accuracy=sum(diag(cm))/nrow(testing_set)
accuracy
## [1] 0.9537572
misclassification=1-accuracy
misclassification
## [1] 0.04624277
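Because unacc dominates the testing set, overall accuracy can hide weak per-class performance; per-class recall reads straight off the confusion matrix. A quick sketch:
#Per-class recall: diagonal over row totals (rows of cm are the true classes)
round(diag(cm)/rowSums(cm),3)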
#Pie chart of correctly classified cars per class
acc=cm[1,1]
good=cm[2,2]
unacc=cm[3,3]
vgood=cm[4,4]
library(plotrix)
pie3D(x=c(acc,good,unacc,vgood),radius=1,explode = 0.05,labels = c("Acceptable","Good","Unacceptable","Very Good"),main="Correctly classified cars per class (testing set)",start=67)
Conclusion
The decision tree classifies the cars in the testing set with an accuracy of about 95.4%, i.e. a misclassification rate of about 4.6%.