Problem
To analyze the given "Car Evaluation Database" dataset, build a machine-learning model from it, and check the model's accuracy.
Contents
Import the Dataset
First we import the dataset, assign a name to each column, and view the result.
#Importing the data (the raw car.data file has no header row)
ds=read.csv("car.data",header=FALSE)
View(ds)
#Naming the columns
colnames(ds)=c("buying","maint","doors","persons","lug_boot","safety","class")
#Fix the random seed so the shuffling and splitting below are reproducible
set.seed(777)
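A portability note: since R 4.0, read.csv no longer converts strings to factors by default, while rpart and glm below expect factor columns. An explicit conversion guards against this (a minimal sketch):
#Make every column a factor (a no-op on R < 4.0, where this already holds)
ds[]=lapply(ds,as.factor)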
Now let's look at the dimensions, a glimpse, and a summary of the dataset.
#dplyr provides glimpse()
library(dplyr)
#dimensions of the dataset
dim(ds)
#simple glimpse of the dataset
glimpse(ds)
#summary of each column
summary(ds)
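Since every column is categorical, the class balance is worth noting now; it matters later when interpreting accuracy. A quick look:
#Proportion of each class label in the full dataset
round(prop.table(table(ds$class)),3)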
Clean Dataset
The summary above would reveal any missing values. In our case there are no missing records, so we can proceed.
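This can also be verified explicitly; a one-line check:
#Count of missing values per column (all zeros means the dataset is complete)
colSums(is.na(ds))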
Plots
All seven attributes are categorical, so bar charts (rather than histograms or density plots) are the appropriate visualisation.
#Visualisation
library(ggplot2)
ggplot(ds,aes(x=class,fill=lug_boot))+geom_bar()+labs(title="Class Vs Luggage boot",subtitle="Bar chart",y="Frequency of Luggage boot",x="Class")
ggplot(ds,aes(x=class,fill=doors))+geom_bar()+labs(title="Class Vs Doors",subtitle="Bar chart",y="Frequency of Doors",x="Class")
ggplot(ds,aes(x=persons,fill=lug_boot))+geom_bar()+facet_wrap(~class)+labs(title="Persons Vs Class",subtitle="Bar chart",y="Frequency of Luggage boot",x="Persons")
ggplot(ds,aes(x=persons,fill=doors))+geom_bar(position="dodge",alpha=0.7)+labs(title="Persons Vs Doors",y="Count",x="Persons")
ggplot(ds,aes(x=maint,fill=maint))+geom_bar(alpha=0.7)+facet_wrap(~class)+labs(title="Maintenance cost within each Class",y="Count",x="Maintenance")
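As an aside, for a pair of categorical variables a base-R mosaic plot is a compact alternative to stacked bars; a minimal sketch:
#Mosaic plot: tile areas are proportional to the joint frequencies of safety and class
mosaicplot(table(ds$safety,ds$class),main="Safety Vs Class",xlab="Safety",ylab="Class")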
Splitting the Dataset
Since the dataset is fairly large, we split it into training and testing sets: we build the model on the training set and judge it by its predictions on the unseen testing set.
#Splitting the data (caTools provides sample.split)
library(caTools)
#Shuffle the rows (the raw file is systematically ordered), then make a stratified 80/20 split on the class column
shuffle_index=sample(1:nrow(ds))
ds=ds[shuffle_index,]
split=sample.split(ds$class,SplitRatio = 0.8)
head(split)
## [1] TRUE TRUE TRUE TRUE TRUE TRUE
#Splitting the dataset into testing and training parts
training_set=subset(ds,split==TRUE)
testing_set=subset(ds,split==FALSE)
View(training_set)
View(testing_set)
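sample.split preserves the relative class frequencies when splitting, which we can confirm by comparing the class proportions of the two sets; a quick check:
#Class proportions should be (almost) identical in the two sets
round(prop.table(table(training_set$class)),3)
round(prop.table(table(testing_set$class)),3)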
Building Model Using Machine Learning Algorithm: Decision Tree
We fit a decision tree on the full set of attributes, plus three reduced trees on subsets of them.
#Decision Tree
library(rpart)
library(rpart.plot)
#Full tree using all attributes
fit=rpart(formula = class ~ .,data=training_set,method = "class")
predict_unseen=predict(object = fit,newdata = testing_set,type = "class")
#Reduced trees using subsets of the attributes
fit1=rpart(formula = class ~ safety+buying+persons,data=training_set,method = "class")
fit2=rpart(formula = class ~ safety+maint+persons,data=training_set,method = "class")
fit3=rpart(formula = class ~ safety+buying+maint+persons,data=training_set,method = "class")
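Before plotting, note that rpart runs 10-fold cross-validation internally while growing the tree; printcp shows the resulting complexity-parameter table for the full tree, a quick overfitting check (a minimal sketch):
#Cross-validated relative error (xerror) for each pruning level of the full tree
printcp(fit)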
Visualizing the Decision Trees
rpart.plot(fit)
rpart.plot(fit1)
rpart.plot(fit2)
rpart.plot(fit3)
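The reduced trees were only plotted above; they can be scored on the testing set in the same way as the full model. A small sketch (the helper name tree_accuracy is ours):
#Hypothetical helper: accuracy of an rpart model on the testing set
tree_accuracy=function(model){
  pred=predict(model,newdata=testing_set,type="class")
  mean(pred==testing_set$class)
}
sapply(list(fit=fit,fit1=fit1,fit2=fit2,fit3=fit3),tree_accuracy)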
Building Model Using Machine Learning Algorithm: Logistic Regression
As a second model we fit a logistic regression for safety. Note that when the response is a multi-level factor, glm's binomial/quasibinomial family treats the first level ("high") as failure and every other level as success, so this models the probability that safety is not "high".
#Logistic Regression
logistic_regressor=glm(formula = safety~.,family=quasibinomial,data=training_set)
summary(logistic_regressor)
##
## Call:
## glm(formula = safety ~ ., family = quasibinomial, data = training_set)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2867 -0.9922 0.5148 0.8490 1.5838
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.1473 0.3051 -7.039 3.06e-12 ***
## buyinglow 0.8622 0.1925 4.480 8.09e-06 ***
## buyingmed 0.6193 0.1811 3.419 0.000646 ***
## buyingvhigh -0.2893 0.1692 -1.710 0.087500 .
## maintlow 0.4872 0.1861 2.618 0.008941 **
## maintmed 0.4419 0.1820 2.428 0.015326 *
## maintvhigh -0.3502 0.1735 -2.018 0.043790 *
## doors3 0.1745 0.1771 0.985 0.324620
## doors4 0.4114 0.1775 2.318 0.020587 *
## doors5more 0.2701 0.1797 1.503 0.133082
## persons4 1.4255 0.1869 7.628 4.45e-14 ***
## personsmore 1.5131 0.1882 8.040 1.93e-15 ***
## lug_bootmed -0.1735 0.1576 -1.100 0.271363
## lug_bootsmall -0.6455 0.1572 -4.106 4.27e-05 ***
## classgood -0.1236 0.3124 -0.396 0.692429
## classunacc 2.5140 0.1880 13.369 < 2e-16 ***
## classvgood -17.2406 320.3333 -0.054 0.957086
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for quasibinomial family taken to be 0.9370174)
##
## Null deviance: 1765.8 on 1380 degrees of freedom
## Residual deviance: 1421.9 on 1364 degrees of freedom
## AIC: NA
##
## Number of Fisher Scoring iterations: 15
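The estimates above are on the log-odds scale; exponentiating gives odds ratios, which are easier to read (a small sketch; the huge standard error on classvgood hints at quasi-complete separation, so that estimate should not be over-interpreted):
#Odds ratios: multiplicative effect of each level on the odds that safety is not "high"
exp(coef(logistic_regressor))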
prob_predict=predict(object=logistic_regressor,newdata=testing_set,type = "response")
#Spineplot of class against safety in the training set (both are factors)
plot(training_set$safety,training_set$class)
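prob_predict contains fitted probabilities rather than labels; a minimal sketch of turning them into predictions and tabulating against the truth (the 0.5 cut-off is an arbitrary assumption):
#Threshold at 0.5 (arbitrary choice): predict whether safety is "high" or not
pred_safety=ifelse(prob_predict>0.5,"not high","high")
table(actual=testing_set$safety,predicted=pred_safety)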
Returning to the decision tree, we evaluate its predictions on the testing set with a confusion matrix:
#Confusion matrix
cm=table(testing_set$class,predict_unseen)
cm
## predict_unseen
## acc good unacc vgood
## acc 73 3 0 1
## good 0 12 0 2
## unacc 7 1 234 0
## vgood 2 0 0 11
Let's find the accuracy and misclassification rate of the model:
#Overall accuracy: fraction of correctly classified cars
accuracy=sum(diag(cm))/nrow(testing_set)
accuracy
## [1] 0.9537572
misclassification=1-accuracy
misclassification
## [1] 0.04624277
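Because unacc dominates the testing set, overall accuracy can hide weak per-class performance; per-class recall reads straight off the confusion matrix. A quick sketch:
#Per-class recall: diagonal over row totals (rows of cm are the true classes)
round(diag(cm)/rowSums(cm),3)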
#Pie chart of correctly classified cars per class
acc=cm[1,1]
good=cm[2,2]
unacc=cm[3,3]
vgood=cm[4,4]
library(plotrix)
pie3D(x=c(acc,good,unacc,vgood),radius=1,explode = 0.05,labels = c("Acceptable","Good","Unacceptable","Very Good"),main="Correctly classified cars per class (testing set)",start=67)
Conclusion
The decision tree classifies the cars in the testing set with an accuracy of about 95.4%, i.e. a misclassification rate of about 4.6%.