Objective

We have a cars data set with few parameters and a decision column. We are going to use a Random Forest classifier and predict the decision.

Loading Libraries

library(dplyr)
library(randomForest)
library(caTools)
library(kableExtra)
library(skimr)
library(ggplot2)
library(corrplot)
library(ggcorrplot)
library(gridExtra)
library(superml)
library(caret)

Reading the data

car<-read.csv("./car_evaluation.csv", stringsAsFactors = T)

Viewing and Summarizing Data

Glimpse

car %>% 
  head(5) %>% 
  kable() %>% 
  kable_styling()
buying.price maintenance.cost number.of.doors number.of.persons lug_boot safety decision
high vhigh 2 2 small low unacc
high vhigh 2 2 small med unacc
high vhigh 2 2 small high unacc
high vhigh 2 2 med low unacc
high vhigh 2 2 med med unacc

Summary

car %>% 
  summary() %>% 
  kable() %>% 
  kable_styling()
buying.price maintenance.cost number.of.doors number.of.persons lug_boot safety decision
high:864 high :432 2 :432 2 :576 big :576 high:576 acc : 384
low :864 low :432 3 :432 4 :576 med :576 low :576 good : 69
NA med :432 4 :432 more:576 small:576 med :576 unacc:1210
NA vhigh:432 5more:432 NA NA NA vgood: 65

Structure

car %>% 
  str()
'data.frame':   1728 obs. of  7 variables:
 $ buying.price     : Factor w/ 2 levels "high","low": 1 1 1 1 1 1 1 1 1 1 ...
 $ maintenance.cost : Factor w/ 4 levels "high","low","med",..: 4 4 4 4 4 4 4 4 4 4 ...
 $ number.of.doors  : Factor w/ 4 levels "2","3","4","5more": 1 1 1 1 1 1 1 1 1 1 ...
 $ number.of.persons: Factor w/ 3 levels "2","4","more": 1 1 1 1 1 1 1 1 1 2 ...
 $ lug_boot         : Factor w/ 3 levels "big","med","small": 3 3 3 2 2 2 1 1 1 3 ...
 $ safety           : Factor w/ 3 levels "high","low","med": 2 3 1 2 3 1 2 3 1 2 ...
 $ decision         : Factor w/ 4 levels "acc","good","unacc",..: 3 3 3 3 3 3 3 3 3 3 ...

Skim

car %>% 
  skim() %>% 
  kable() %>% 
  kable_styling()
skim_type skim_variable n_missing complete_rate factor.ordered factor.n_unique factor.top_counts
factor buying.price 0 1 FALSE 2 hig: 864, low: 864
factor maintenance.cost 0 1 FALSE 4 hig: 432, low: 432, med: 432, vhi: 432
factor number.of.doors 0 1 FALSE 4 2: 432, 3: 432, 4: 432, 5mo: 432
factor number.of.persons 0 1 FALSE 3 2: 576, 4: 576, mor: 576
factor lug_boot 0 1 FALSE 3 big: 576, med: 576, sma: 576
factor safety 0 1 FALSE 3 hig: 576, low: 576, med: 576
factor decision 0 1 FALSE 4 una: 1210, acc: 384, goo: 69, vgo: 65

Checking if there are any NA values in the dataset

  kable (colSums(is.na(car)))
x
buying.price 0
maintenance.cost 0
number.of.doors 0
number.of.persons 0
lug_boot 0
safety 0
decision 0

Since there are no missing values and other issues with the data set. We need not manipulate it further.

Exploring the data through Plots

Bar charts

p1<-ggplot(car,aes(fill=buying.price,x=decision))+geom_bar()+ggtitle("Buying Price vs Decision")+labs(x="Buying Price")+coord_flip()
p2<- ggplot(car,aes(fill=maintenance.cost,x=decision))+geom_bar()+ggtitle("Maintenance Cost vs Decision")+labs(x="Buying Price")+coord_flip()+guides(fill=guide_legend(title="Maint. Cost"))
p3<- ggplot(car,aes(fill=number.of.doors,x=decision))+geom_bar()+ggtitle("Number of Doors vs Decision")+labs(x="Buying Price")+coord_flip()+guides(fill=guide_legend(title="Doors"))
p4<-ggplot(car,aes(fill=number.of.persons,x=decision))+geom_bar()+ggtitle("Seating Capacity vs Decision")+labs(x="Buying Price")+coord_flip()+guides(fill=guide_legend(title="Seating"))
p5<-ggplot(car,aes(fill=lug_boot,x=decision))+geom_bar(position = "dodge")+ggtitle("Bootspace vs Decision")+labs(x="Buying Price")
p6<-ggplot(car,aes(fill=safety,x=decision))+geom_bar(position = "dodge")+ggtitle("Safety vs Decision")+labs(x="Buying Price")


grid.arrange(p1,p2,p3,p4, ncol=2)


grid.arrange(p5,p6, ncol=2)

Density Plot

ggplot(car,aes(fill=maintenance.cost,x=decision))+geom_density()+facet_wrap(~decision)+ggtitle("Maintenance Cost")


ggplot(car,aes(fill=safety,x=decision))+geom_density()+facet_wrap(~decision)+ggtitle("Safety")

Correlation Plot

# First step is to convert the factors to numeric variables so that we can plot the corrplot.
# Saving car to a duplicate variable
car1<-car
label<-LabelEncoder$new()
car1$buying.price<-label$fit_transform(car1$buying.price)
car1$maintenance.cost<-label$fit_transform(car1$maintenance.cost)
car1$number.of.doors<-label$fit_transform(car1$number.of.doors)
car1$number.of.persons<-label$fit_transform(car1$number.of.persons)
car1$lug_boot<-label$fit_transform(car1$lug_boot)
car1$safety<-label$fit_transform(car1$safety)
car1$decision<-label$fit_transform(car1$decision)
cor(car1[sapply(car1, is.numeric)])->corr
corrplot(corr,method = "number")

ggcorrplot(corr)

Model Building

Splitting the data into train and test

sample.split(car,SplitRatio = .8)->tag
train<-subset(car,tag==T)
test<-subset(car,tag==F)

dim(train)
[1] 1234    7
dim(test)
[1] 494   7

Building the model

mod<-randomForest(decision~.,data=train,ntree=500)
mod %>% 
  importance() %>% 
  kable() %>% 
  kable_styling()
MeanDecreaseGini
buying.price 47.26270
maintenance.cost 57.83491
number.of.doors 21.82956
number.of.persons 119.23026
lug_boot 32.66588
safety 146.45231

varImpPlot(mod)

Predction and Evaluation

Predict

result<-predict(mod,test,type="class")
result %>% head()
    2     6     9    13    16    20 
unacc unacc unacc unacc unacc unacc 
Levels: acc good unacc vgood

Evaluating with a confusion matrix

confusionMatrix(result,test$decision)
Confusion Matrix and Statistics

          Reference
Prediction acc good unacc vgood
     acc    91    5     7     4
     good    5   13     0     0
     unacc  14    0   339     0
     vgood   1    0     0    15

Overall Statistics
                                          
               Accuracy : 0.9271          
                 95% CI : (0.9005, 0.9484)
    No Information Rate : 0.7004          
    P-Value [Acc > NIR] : < 2.2e-16       
                                          
                  Kappa : 0.8374          
                                          
 Mcnemar's Test P-Value : NA              

Statistics by Class:

                     Class: acc Class: good Class: unacc Class: vgood
Sensitivity              0.8198     0.72222       0.9798      0.78947
Specificity              0.9582     0.98950       0.9054      0.99789
Pos Pred Value           0.8505     0.72222       0.9603      0.93750
Neg Pred Value           0.9483     0.98950       0.9504      0.99163
Prevalence               0.2247     0.03644       0.7004      0.03846
Detection Rate           0.1842     0.02632       0.6862      0.03036
Detection Prevalence     0.2166     0.03644       0.7146      0.03239
Balanced Accuracy        0.8890     0.85586       0.9426      0.89368

We have a 92% Accuracy in the prediction.

LS0tDQp0aXRsZTogIkNhciBFdmFsdWF0aW9uIGJhc2VkIG9uIFBhcmFtZXRlcnMiDQpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sNCi0tLQ0KIyBPYmplY3RpdmUNCg0KV2UgaGF2ZSBhIGNhcnMgZGF0YSBzZXQgd2l0aCBmZXcgcGFyYW1ldGVycyBhbmQgYSBkZWNpc2lvbiBjb2x1bW4uIFdlIGFyZSBnb2luZyB0byB1c2UgYSBSYW5kb20gRm9yZXN0IGNsYXNzaWZpZXIgYW5kIHByZWRpY3QgdGhlIGRlY2lzaW9uLiAgDQoNCiMjIExvYWRpbmcgTGlicmFyaWVzDQpgYGB7ciAsIG1lc3NhZ2U9RkFMU0V9DQpsaWJyYXJ5KGRwbHlyKQ0KbGlicmFyeShyYW5kb21Gb3Jlc3QpDQpsaWJyYXJ5KGNhVG9vbHMpDQpsaWJyYXJ5KGthYmxlRXh0cmEpDQpsaWJyYXJ5KHNraW1yKQ0KbGlicmFyeShnZ3Bsb3QyKQ0KbGlicmFyeShjb3JycGxvdCkNCmxpYnJhcnkoZ2djb3JycGxvdCkNCmxpYnJhcnkoZ3JpZEV4dHJhKQ0KbGlicmFyeShzdXBlcm1sKQ0KbGlicmFyeShjYXJldCkNCmBgYA0KDQojIyBSZWFkaW5nIHRoZSBkYXRhDQoNCmBgYHtyfQ0KY2FyPC1yZWFkLmNzdigiLi9jYXJfZXZhbHVhdGlvbi5jc3YiLCBzdHJpbmdzQXNGYWN0b3JzID0gVCkNCmBgYA0KDQojIyBWaWV3aW5nIGFuZCBTdW1tYXJpemluZyBEYXRhey50YWJzZXQgLnRhYnNldC1mYWRlIC50YWJzZXQtcGlsbHN9DQoNCiMjIyBHbGltcHNlDQpgYGB7cn0NCmNhciAlPiUgDQogIGhlYWQoNSkgJT4lIA0KICBrYWJsZSgpICU+JSANCiAga2FibGVfc3R5bGluZygpDQpgYGANCg0KIyMjIFN1bW1hcnkNCmBgYHtyfQ0KY2FyICU+JSANCiAgc3VtbWFyeSgpICU+JSANCiAga2FibGUoKSAlPiUgDQogIGthYmxlX3N0eWxpbmcoKQ0KYGBgDQoNCiMjIyBTdHJ1Y3R1cmUNCg0KYGBge3J9DQpjYXIgJT4lIA0KICBzdHIoKQ0KYGBgDQojIyMgU2tpbQ0KDQpgYGB7cn0NCmNhciAlPiUgDQogIHNraW0oKSAlPiUgDQogIGthYmxlKCkgJT4lIA0KICBrYWJsZV9zdHlsaW5nKCkNCmBgYA0KDQojIyBDaGVja2luZyBpZiB0aGVyZSBhcmUgYW55IE5BIHZhbHVlcyBpbiB0aGUgZGF0YXNldA0KYGBge3J9DQogIGthYmxlIChjb2xTdW1zKGlzLm5hKGNhcikpKQ0KYGBgDQpTaW5jZSB0aGVyZSBhcmUgbm8gbWlzc2luZyB2YWx1ZXMgYW5kIG90aGVyIGlzc3VlcyB3aXRoIHRoZSBkYXRhIHNldC4gV2UgbmVlZCBub3QgbWFuaXB1bGF0ZSBpdCBmdXJ0aGVyLg0KDQojIyBFeHBsb3JpbmcgdGhlIGRhdGEgdGhyb3VnaCBQbG90c3sudGFic2V0IC50YWJzZXQtZmFkZSAudGFic2V0LXBpbGxzfQ0KDQojIyMgQmFyIGNoYXJ0cw0KYGBge3J9DQpwMTwtZ2dwbG90KGNhcixhZXMoZmlsbD1idXlpbmcucHJpY2UseD1kZWNpc2lvbikpK2dlb21fYmFyKCkrZ2d0aXRsZSgiQnV5aW5nIFByaWNlIHZzIERlY2lzaW9uIikrbGFicyh4PSJCdXlpbmcgUHJpY2UiKStjb29yZF9mbGlwKCkNCnAyPC0gZ2dwbG90KGNhcixhZXMoZmlsbD1tYWludGVuYW5jZS5jb3N0LHg9ZGVjaXNpb24pKStnZW9tX2JhcigpK2dndGl0bGUoIk1haW50ZW5hbmNlIENvc3QgdnMgRGVjaXNpb24iKStsYWJzKHg9IkJ1eWluZyBQcmljZSIpK2Nvb3JkX2ZsaXAoKStndWlkZXMoZmlsbD1ndWlkZV9sZWdlbmQodGl0bGU9Ik1haW50LiBDb3N0IikpDQpwMzwtIGdncGxvdChjYXIsYWVzKGZpbGw9bnVtYmVyLm9mLmRvb3JzLHg9ZGVjaXNpb24pKStnZW9tX2JhcigpK2dndGl0bGUoIk51bWJlciBvZiBEb29ycyB2cyBEZWNpc2lvbiIpK2xhYnMoeD0iQnV5aW5nIFByaWNlIikrY29vcmRfZmxpcCgpK2d1aWRlcyhmaWxsPWd1aWRlX2xlZ2VuZCh0aXRsZT0iRG9vcnMiKSkNCnA0PC1nZ3Bsb3QoY2FyLGFlcyhmaWxsPW51bWJlci5vZi5wZXJzb25zLHg9ZGVjaXNpb24pKStnZW9tX2JhcigpK2dndGl0bGUoIlNlYXRpbmcgQ2FwYWNpdHkgdnMgRGVjaXNpb24iKStsYWJzKHg9IkJ1eWluZyBQcmljZSIpK2Nvb3JkX2ZsaXAoKStndWlkZXMoZmlsbD1ndWlkZV9sZWdlbmQodGl0bGU9IlNlYXRpbmciKSkNCnA1PC1nZ3Bsb3QoY2FyLGFlcyhmaWxsPWx1Z19ib290LHg9ZGVjaXNpb24pKStnZW9tX2Jhcihwb3NpdGlvbiA9ICJkb2RnZSIpK2dndGl0bGUoIkJvb3RzcGFjZSB2cyBEZWNpc2lvbiIpK2xhYnMoeD0iQnV5aW5nIFByaWNlIikNCnA2PC1nZ3Bsb3QoY2FyLGFlcyhmaWxsPXNhZmV0eSx4PWRlY2lzaW9uKSkrZ2VvbV9iYXIocG9zaXRpb24gPSAiZG9kZ2UiKStnZ3RpdGxlKCJTYWZldHkgdnMgRGVjaXNpb24iKStsYWJzKHg9IkJ1eWluZyBQcmljZSIpDQoNCg0KZ3JpZC5hcnJhbmdlKHAxLHAyLHAzLHA0LCBuY29sPTIpDQoNCmdyaWQuYXJyYW5nZShwNSxwNiwgbmNvbD0yKQ0KYGBgDQoNCiMjIyBEZW5zaXR5IFBsb3QNCmBgYHtyfQ0KZ2dwbG90KGNhcixhZXMoZmlsbD1tYWludGVuYW5jZS5jb3N0LHg9ZGVjaXNpb24pKStnZW9tX2RlbnNpdHkoKStmYWNldF93cmFwKH5kZWNpc2lvbikrZ2d0aXRsZSgiTWFpbnRlbmFuY2UgQ29zdCIpDQoNCmdncGxvdChjYXIsYWVzKGZpbGw9c2FmZXR5LHg9ZGVjaXNpb24pKStnZW9tX2RlbnNpdHkoKStmYWNldF93cmFwKH5kZWNpc2lvbikrZ2d0aXRsZSgiU2FmZXR5IikNCmBgYA0KDQojIyMgQ29ycmVsYXRpb24gUGxvdA0KDQpgYGB7cn0NCiMgRmlyc3Qgc3RlcCBpcyB0byBjb252ZXJ0IHRoZSBmYWN0b3JzIHRvIG51bWVyaWMgdmFyaWFibGVzIHNvIHRoYXQgd2UgY2FuIHBsb3QgdGhlIGNvcnJwbG90Lg0KIyBTYXZpbmcgY2FyIHRvIGEgZHVwbGljYXRlIHZhcmlhYmxlDQpjYXIxPC1jYXINCmxhYmVsPC1MYWJlbEVuY29kZXIkbmV3KCkNCmNhcjEkYnV5aW5nLnByaWNlPC1sYWJlbCRmaXRfdHJhbnNmb3JtKGNhcjEkYnV5aW5nLnByaWNlKQ0KY2FyMSRtYWludGVuYW5jZS5jb3N0PC1sYWJlbCRmaXRfdHJhbnNmb3JtKGNhcjEkbWFpbnRlbmFuY2UuY29zdCkNCmNhcjEkbnVtYmVyLm9mLmRvb3JzPC1sYWJlbCRmaXRfdHJhbnNmb3JtKGNhcjEkbnVtYmVyLm9mLmRvb3JzKQ0KY2FyMSRudW1iZXIub2YucGVyc29uczwtbGFiZWwkZml0X3RyYW5zZm9ybShjYXIxJG51bWJlci5vZi5wZXJzb25zKQ0KY2FyMSRsdWdfYm9vdDwtbGFiZWwkZml0X3RyYW5zZm9ybShjYXIxJGx1Z19ib290KQ0KY2FyMSRzYWZldHk8LWxhYmVsJGZpdF90cmFuc2Zvcm0oY2FyMSRzYWZldHkpDQpjYXIxJGRlY2lzaW9uPC1sYWJlbCRmaXRfdHJhbnNmb3JtKGNhcjEkZGVjaXNpb24pDQpgYGANCg0KYGBge3J9DQpjb3IoY2FyMVtzYXBwbHkoY2FyMSwgaXMubnVtZXJpYyldKS0+Y29ycg0KY29ycnBsb3QoY29ycixtZXRob2QgPSAibnVtYmVyIikNCmdnY29ycnBsb3QoY29ycikNCmBgYA0KDQoNCiMjIE1vZGVsIEJ1aWxkaW5nDQoNCiMjIyBTcGxpdHRpbmcgdGhlIGRhdGEgaW50byB0cmFpbiBhbmQgdGVzdA0KDQpgYGB7cn0NCnNhbXBsZS5zcGxpdChjYXIsU3BsaXRSYXRpbyA9IC44KS0+dGFnDQp0cmFpbjwtc3Vic2V0KGNhcix0YWc9PVQpDQp0ZXN0PC1zdWJzZXQoY2FyLHRhZz09RikNCg0KZGltKHRyYWluKQ0KZGltKHRlc3QpDQpgYGANCg0KIyMjIEJ1aWxkaW5nIHRoZSBtb2RlbA0KYGBge3J9DQptb2Q8LXJhbmRvbUZvcmVzdChkZWNpc2lvbn4uLGRhdGE9dHJhaW4sbnRyZWU9NTAwKQ0KYGBgDQoNCg0KYGBge3J9DQptb2QgJT4lIA0KICBpbXBvcnRhbmNlKCkgJT4lIA0KICBrYWJsZSgpICU+JSANCiAga2FibGVfc3R5bGluZygpDQoNCnZhckltcFBsb3QobW9kKQ0KYGBgDQojIyBQcmVkY3Rpb24gYW5kIEV2YWx1YXRpb24NCg0KIyMjIFByZWRpY3QNCmBgYHtyfQ0KcmVzdWx0PC1wcmVkaWN0KG1vZCx0ZXN0LHR5cGU9ImNsYXNzIikNCnJlc3VsdCAlPiUgaGVhZCgpDQpgYGANCg0KIyMjIEV2YWx1YXRpbmcgd2l0aCBhIGNvbmZ1c2lvbiBtYXRyaXgNCmBgYHtyLEVDSE89VFJVRX0NCmNvbmZ1c2lvbk1hdHJpeChyZXN1bHQsdGVzdCRkZWNpc2lvbikNCmBgYA0KIyMgV2UgaGF2ZSBhIDkyJSBBY2N1cmFjeSBpbiB0aGUgcHJlZGljdGlvbi4NCg0KDQoNCg0KDQoNCg0KDQo=