Objective
We have a cars data set with few parameters and a decision column. We
are going to use a Random Forest classifier and predict the
decision.
Loading Libraries
library(dplyr)
library(randomForest)
library(caTools)
library(kableExtra)
library(skimr)
library(ggplot2)
library(corrplot)
library(ggcorrplot)
library(gridExtra)
library(superml)
library(caret)
Reading the data
car<-read.csv("./car_evaluation.csv", stringsAsFactors = T)
Viewing and Summarizing
Data
Glimpse
car %>%
head(5) %>%
kable() %>%
kable_styling()
| buying.price |
maintenance.cost |
number.of.doors |
number.of.persons |
lug_boot |
safety |
decision |
| high |
vhigh |
2 |
2 |
small |
low |
unacc |
| high |
vhigh |
2 |
2 |
small |
med |
unacc |
| high |
vhigh |
2 |
2 |
small |
high |
unacc |
| high |
vhigh |
2 |
2 |
med |
low |
unacc |
| high |
vhigh |
2 |
2 |
med |
med |
unacc |
Summary
car %>%
summary() %>%
kable() %>%
kable_styling()
| |
buying.price |
maintenance.cost |
number.of.doors |
number.of.persons |
lug_boot |
safety |
decision |
| |
high:864 |
high :432 |
2 :432 |
2 :576 |
big :576 |
high:576 |
acc : 384 |
| |
low :864 |
low :432 |
3 :432 |
4 :576 |
med :576 |
low :576 |
good : 69 |
| |
NA |
med :432 |
4 :432 |
more:576 |
small:576 |
med :576 |
unacc:1210 |
| |
NA |
vhigh:432 |
5more:432 |
NA |
NA |
NA |
vgood: 65 |
Structure
car %>%
str()
'data.frame': 1728 obs. of 7 variables:
$ buying.price : Factor w/ 2 levels "high","low": 1 1 1 1 1 1 1 1 1 1 ...
$ maintenance.cost : Factor w/ 4 levels "high","low","med",..: 4 4 4 4 4 4 4 4 4 4 ...
$ number.of.doors : Factor w/ 4 levels "2","3","4","5more": 1 1 1 1 1 1 1 1 1 1 ...
$ number.of.persons: Factor w/ 3 levels "2","4","more": 1 1 1 1 1 1 1 1 1 2 ...
$ lug_boot : Factor w/ 3 levels "big","med","small": 3 3 3 2 2 2 1 1 1 3 ...
$ safety : Factor w/ 3 levels "high","low","med": 2 3 1 2 3 1 2 3 1 2 ...
$ decision : Factor w/ 4 levels "acc","good","unacc",..: 3 3 3 3 3 3 3 3 3 3 ...
Skim
car %>%
skim() %>%
kable() %>%
kable_styling()
| skim_type |
skim_variable |
n_missing |
complete_rate |
factor.ordered |
factor.n_unique |
factor.top_counts |
| factor |
buying.price |
0 |
1 |
FALSE |
2 |
hig: 864, low: 864 |
| factor |
maintenance.cost |
0 |
1 |
FALSE |
4 |
hig: 432, low: 432, med: 432, vhi: 432 |
| factor |
number.of.doors |
0 |
1 |
FALSE |
4 |
2: 432, 3: 432, 4: 432, 5mo: 432 |
| factor |
number.of.persons |
0 |
1 |
FALSE |
3 |
2: 576, 4: 576, mor: 576 |
| factor |
lug_boot |
0 |
1 |
FALSE |
3 |
big: 576, med: 576, sma: 576 |
| factor |
safety |
0 |
1 |
FALSE |
3 |
hig: 576, low: 576, med: 576 |
| factor |
decision |
0 |
1 |
FALSE |
4 |
una: 1210, acc: 384, goo: 69, vgo: 65 |
Checking if there are any NA values in the dataset
kable (colSums(is.na(car)))
|
|
x
|
|
buying.price
|
0
|
|
maintenance.cost
|
0
|
|
number.of.doors
|
0
|
|
number.of.persons
|
0
|
|
lug_boot
|
0
|
|
safety
|
0
|
|
decision
|
0
|
Since there are no missing values and other issues with the data set.
We need not manipulate it further.
Exploring the data through
Plots
Bar charts
p1<-ggplot(car,aes(fill=buying.price,x=decision))+geom_bar()+ggtitle("Buying Price vs Decision")+labs(x="Buying Price")+coord_flip()
p2<- ggplot(car,aes(fill=maintenance.cost,x=decision))+geom_bar()+ggtitle("Maintenance Cost vs Decision")+labs(x="Buying Price")+coord_flip()+guides(fill=guide_legend(title="Maint. Cost"))
p3<- ggplot(car,aes(fill=number.of.doors,x=decision))+geom_bar()+ggtitle("Number of Doors vs Decision")+labs(x="Buying Price")+coord_flip()+guides(fill=guide_legend(title="Doors"))
p4<-ggplot(car,aes(fill=number.of.persons,x=decision))+geom_bar()+ggtitle("Seating Capacity vs Decision")+labs(x="Buying Price")+coord_flip()+guides(fill=guide_legend(title="Seating"))
p5<-ggplot(car,aes(fill=lug_boot,x=decision))+geom_bar(position = "dodge")+ggtitle("Bootspace vs Decision")+labs(x="Buying Price")
p6<-ggplot(car,aes(fill=safety,x=decision))+geom_bar(position = "dodge")+ggtitle("Safety vs Decision")+labs(x="Buying Price")
grid.arrange(p1,p2,p3,p4, ncol=2)

grid.arrange(p5,p6, ncol=2)

Density Plot
ggplot(car,aes(fill=maintenance.cost,x=decision))+geom_density()+facet_wrap(~decision)+ggtitle("Maintenance Cost")

ggplot(car,aes(fill=safety,x=decision))+geom_density()+facet_wrap(~decision)+ggtitle("Safety")

Correlation Plot
# First step is to convert the factors to numeric variables so that we can plot the corrplot.
# Saving car to a duplicate variable
car1<-car
label<-LabelEncoder$new()
car1$buying.price<-label$fit_transform(car1$buying.price)
car1$maintenance.cost<-label$fit_transform(car1$maintenance.cost)
car1$number.of.doors<-label$fit_transform(car1$number.of.doors)
car1$number.of.persons<-label$fit_transform(car1$number.of.persons)
car1$lug_boot<-label$fit_transform(car1$lug_boot)
car1$safety<-label$fit_transform(car1$safety)
car1$decision<-label$fit_transform(car1$decision)
cor(car1[sapply(car1, is.numeric)])->corr
corrplot(corr,method = "number")

ggcorrplot(corr)

Model Building
Splitting the data into train and test
sample.split(car,SplitRatio = .8)->tag
train<-subset(car,tag==T)
test<-subset(car,tag==F)
dim(train)
[1] 1234 7
dim(test)
[1] 494 7
Building the model
mod<-randomForest(decision~.,data=train,ntree=500)
mod %>%
importance() %>%
kable() %>%
kable_styling()
| |
MeanDecreaseGini |
| buying.price |
47.26270 |
| maintenance.cost |
57.83491 |
| number.of.doors |
21.82956 |
| number.of.persons |
119.23026 |
| lug_boot |
32.66588 |
| safety |
146.45231 |
varImpPlot(mod)

Predction and Evaluation
Predict
result<-predict(mod,test,type="class")
result %>% head()
2 6 9 13 16 20
unacc unacc unacc unacc unacc unacc
Levels: acc good unacc vgood
Evaluating with a confusion matrix
confusionMatrix(result,test$decision)
Confusion Matrix and Statistics
Reference
Prediction acc good unacc vgood
acc 91 5 7 4
good 5 13 0 0
unacc 14 0 339 0
vgood 1 0 0 15
Overall Statistics
Accuracy : 0.9271
95% CI : (0.9005, 0.9484)
No Information Rate : 0.7004
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.8374
Mcnemar's Test P-Value : NA
Statistics by Class:
Class: acc Class: good Class: unacc Class: vgood
Sensitivity 0.8198 0.72222 0.9798 0.78947
Specificity 0.9582 0.98950 0.9054 0.99789
Pos Pred Value 0.8505 0.72222 0.9603 0.93750
Neg Pred Value 0.9483 0.98950 0.9504 0.99163
Prevalence 0.2247 0.03644 0.7004 0.03846
Detection Rate 0.1842 0.02632 0.6862 0.03036
Detection Prevalence 0.2166 0.03644 0.7146 0.03239
Balanced Accuracy 0.8890 0.85586 0.9426 0.89368
We have a 92% Accuracy in the prediction.
LS0tDQp0aXRsZTogIkNhciBFdmFsdWF0aW9uIGJhc2VkIG9uIFBhcmFtZXRlcnMiDQpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sNCi0tLQ0KIyBPYmplY3RpdmUNCg0KV2UgaGF2ZSBhIGNhcnMgZGF0YSBzZXQgd2l0aCBmZXcgcGFyYW1ldGVycyBhbmQgYSBkZWNpc2lvbiBjb2x1bW4uIFdlIGFyZSBnb2luZyB0byB1c2UgYSBSYW5kb20gRm9yZXN0IGNsYXNzaWZpZXIgYW5kIHByZWRpY3QgdGhlIGRlY2lzaW9uLiAgDQoNCiMjIExvYWRpbmcgTGlicmFyaWVzDQpgYGB7ciAsIG1lc3NhZ2U9RkFMU0V9DQpsaWJyYXJ5KGRwbHlyKQ0KbGlicmFyeShyYW5kb21Gb3Jlc3QpDQpsaWJyYXJ5KGNhVG9vbHMpDQpsaWJyYXJ5KGthYmxlRXh0cmEpDQpsaWJyYXJ5KHNraW1yKQ0KbGlicmFyeShnZ3Bsb3QyKQ0KbGlicmFyeShjb3JycGxvdCkNCmxpYnJhcnkoZ2djb3JycGxvdCkNCmxpYnJhcnkoZ3JpZEV4dHJhKQ0KbGlicmFyeShzdXBlcm1sKQ0KbGlicmFyeShjYXJldCkNCmBgYA0KDQojIyBSZWFkaW5nIHRoZSBkYXRhDQoNCmBgYHtyfQ0KY2FyPC1yZWFkLmNzdigiLi9jYXJfZXZhbHVhdGlvbi5jc3YiLCBzdHJpbmdzQXNGYWN0b3JzID0gVCkNCmBgYA0KDQojIyBWaWV3aW5nIGFuZCBTdW1tYXJpemluZyBEYXRhey50YWJzZXQgLnRhYnNldC1mYWRlIC50YWJzZXQtcGlsbHN9DQoNCiMjIyBHbGltcHNlDQpgYGB7cn0NCmNhciAlPiUgDQogIGhlYWQoNSkgJT4lIA0KICBrYWJsZSgpICU+JSANCiAga2FibGVfc3R5bGluZygpDQpgYGANCg0KIyMjIFN1bW1hcnkNCmBgYHtyfQ0KY2FyICU+JSANCiAgc3VtbWFyeSgpICU+JSANCiAga2FibGUoKSAlPiUgDQogIGthYmxlX3N0eWxpbmcoKQ0KYGBgDQoNCiMjIyBTdHJ1Y3R1cmUNCg0KYGBge3J9DQpjYXIgJT4lIA0KICBzdHIoKQ0KYGBgDQojIyMgU2tpbQ0KDQpgYGB7cn0NCmNhciAlPiUgDQogIHNraW0oKSAlPiUgDQogIGthYmxlKCkgJT4lIA0KICBrYWJsZV9zdHlsaW5nKCkNCmBgYA0KDQojIyBDaGVja2luZyBpZiB0aGVyZSBhcmUgYW55IE5BIHZhbHVlcyBpbiB0aGUgZGF0YXNldA0KYGBge3J9DQogIGthYmxlIChjb2xTdW1zKGlzLm5hKGNhcikpKQ0KYGBgDQpTaW5jZSB0aGVyZSBhcmUgbm8gbWlzc2luZyB2YWx1ZXMgYW5kIG90aGVyIGlzc3VlcyB3aXRoIHRoZSBkYXRhIHNldC4gV2UgbmVlZCBub3QgbWFuaXB1bGF0ZSBpdCBmdXJ0aGVyLg0KDQojIyBFeHBsb3JpbmcgdGhlIGRhdGEgdGhyb3VnaCBQbG90c3sudGFic2V0IC50YWJzZXQtZmFkZSAudGFic2V0LXBpbGxzfQ0KDQojIyMgQmFyIGNoYXJ0cw0KYGBge3J9DQpwMTwtZ2dwbG90KGNhcixhZXMoZmlsbD1idXlpbmcucHJpY2UseD1kZWNpc2lvbikpK2dlb21fYmFyKCkrZ2d0aXRsZSgiQnV5aW5nIFByaWNlIHZzIERlY2lzaW9uIikrbGFicyh4PSJCdXlpbmcgUHJpY2UiKStjb29yZF9mbGlwKCkNCnAyPC0gZ2dwbG90KGNhcixhZXMoZmlsbD1tYWludGVuYW5jZS5jb3N0LHg9ZGVjaXNpb24pKStnZW9tX2JhcigpK2dndGl0bGUoIk1haW50ZW5hbmNlIENvc3QgdnMgRGVjaXNpb24iKStsYWJzKHg9IkJ1eWluZyBQcmljZSIpK2Nvb3JkX2ZsaXAoKStndWlkZXMoZmlsbD1ndWlkZV9sZWdlbmQodGl0bGU9Ik1haW50LiBDb3N0IikpDQpwMzwtIGdncGxvdChjYXIsYWVzKGZpbGw9bnVtYmVyLm9mLmRvb3JzLHg9ZGVjaXNpb24pKStnZW9tX2JhcigpK2dndGl0bGUoIk51bWJlciBvZiBEb29ycyB2cyBEZWNpc2lvbiIpK2xhYnMoeD0iQnV5aW5nIFByaWNlIikrY29vcmRfZmxpcCgpK2d1aWRlcyhmaWxsPWd1aWRlX2xlZ2VuZCh0aXRsZT0iRG9vcnMiKSkNCnA0PC1nZ3Bsb3QoY2FyLGFlcyhmaWxsPW51bWJlci5vZi5wZXJzb25zLHg9ZGVjaXNpb24pKStnZW9tX2JhcigpK2dndGl0bGUoIlNlYXRpbmcgQ2FwYWNpdHkgdnMgRGVjaXNpb24iKStsYWJzKHg9IkJ1eWluZyBQcmljZSIpK2Nvb3JkX2ZsaXAoKStndWlkZXMoZmlsbD1ndWlkZV9sZWdlbmQodGl0bGU9IlNlYXRpbmciKSkNCnA1PC1nZ3Bsb3QoY2FyLGFlcyhmaWxsPWx1Z19ib290LHg9ZGVjaXNpb24pKStnZW9tX2Jhcihwb3NpdGlvbiA9ICJkb2RnZSIpK2dndGl0bGUoIkJvb3RzcGFjZSB2cyBEZWNpc2lvbiIpK2xhYnMoeD0iQnV5aW5nIFByaWNlIikNCnA2PC1nZ3Bsb3QoY2FyLGFlcyhmaWxsPXNhZmV0eSx4PWRlY2lzaW9uKSkrZ2VvbV9iYXIocG9zaXRpb24gPSAiZG9kZ2UiKStnZ3RpdGxlKCJTYWZldHkgdnMgRGVjaXNpb24iKStsYWJzKHg9IkJ1eWluZyBQcmljZSIpDQoNCg0KZ3JpZC5hcnJhbmdlKHAxLHAyLHAzLHA0LCBuY29sPTIpDQoNCmdyaWQuYXJyYW5nZShwNSxwNiwgbmNvbD0yKQ0KYGBgDQoNCiMjIyBEZW5zaXR5IFBsb3QNCmBgYHtyfQ0KZ2dwbG90KGNhcixhZXMoZmlsbD1tYWludGVuYW5jZS5jb3N0LHg9ZGVjaXNpb24pKStnZW9tX2RlbnNpdHkoKStmYWNldF93cmFwKH5kZWNpc2lvbikrZ2d0aXRsZSgiTWFpbnRlbmFuY2UgQ29zdCIpDQoNCmdncGxvdChjYXIsYWVzKGZpbGw9c2FmZXR5LHg9ZGVjaXNpb24pKStnZW9tX2RlbnNpdHkoKStmYWNldF93cmFwKH5kZWNpc2lvbikrZ2d0aXRsZSgiU2FmZXR5IikNCmBgYA0KDQojIyMgQ29ycmVsYXRpb24gUGxvdA0KDQpgYGB7cn0NCiMgRmlyc3Qgc3RlcCBpcyB0byBjb252ZXJ0IHRoZSBmYWN0b3JzIHRvIG51bWVyaWMgdmFyaWFibGVzIHNvIHRoYXQgd2UgY2FuIHBsb3QgdGhlIGNvcnJwbG90Lg0KIyBTYXZpbmcgY2FyIHRvIGEgZHVwbGljYXRlIHZhcmlhYmxlDQpjYXIxPC1jYXINCmxhYmVsPC1MYWJlbEVuY29kZXIkbmV3KCkNCmNhcjEkYnV5aW5nLnByaWNlPC1sYWJlbCRmaXRfdHJhbnNmb3JtKGNhcjEkYnV5aW5nLnByaWNlKQ0KY2FyMSRtYWludGVuYW5jZS5jb3N0PC1sYWJlbCRmaXRfdHJhbnNmb3JtKGNhcjEkbWFpbnRlbmFuY2UuY29zdCkNCmNhcjEkbnVtYmVyLm9mLmRvb3JzPC1sYWJlbCRmaXRfdHJhbnNmb3JtKGNhcjEkbnVtYmVyLm9mLmRvb3JzKQ0KY2FyMSRudW1iZXIub2YucGVyc29uczwtbGFiZWwkZml0X3RyYW5zZm9ybShjYXIxJG51bWJlci5vZi5wZXJzb25zKQ0KY2FyMSRsdWdfYm9vdDwtbGFiZWwkZml0X3RyYW5zZm9ybShjYXIxJGx1Z19ib290KQ0KY2FyMSRzYWZldHk8LWxhYmVsJGZpdF90cmFuc2Zvcm0oY2FyMSRzYWZldHkpDQpjYXIxJGRlY2lzaW9uPC1sYWJlbCRmaXRfdHJhbnNmb3JtKGNhcjEkZGVjaXNpb24pDQpgYGANCg0KYGBge3J9DQpjb3IoY2FyMVtzYXBwbHkoY2FyMSwgaXMubnVtZXJpYyldKS0+Y29ycg0KY29ycnBsb3QoY29ycixtZXRob2QgPSAibnVtYmVyIikNCmdnY29ycnBsb3QoY29ycikNCmBgYA0KDQoNCiMjIE1vZGVsIEJ1aWxkaW5nDQoNCiMjIyBTcGxpdHRpbmcgdGhlIGRhdGEgaW50byB0cmFpbiBhbmQgdGVzdA0KDQpgYGB7cn0NCnNhbXBsZS5zcGxpdChjYXIsU3BsaXRSYXRpbyA9IC44KS0+dGFnDQp0cmFpbjwtc3Vic2V0KGNhcix0YWc9PVQpDQp0ZXN0PC1zdWJzZXQoY2FyLHRhZz09RikNCg0KZGltKHRyYWluKQ0KZGltKHRlc3QpDQpgYGANCg0KIyMjIEJ1aWxkaW5nIHRoZSBtb2RlbA0KYGBge3J9DQptb2Q8LXJhbmRvbUZvcmVzdChkZWNpc2lvbn4uLGRhdGE9dHJhaW4sbnRyZWU9NTAwKQ0KYGBgDQoNCg0KYGBge3J9DQptb2QgJT4lIA0KICBpbXBvcnRhbmNlKCkgJT4lIA0KICBrYWJsZSgpICU+JSANCiAga2FibGVfc3R5bGluZygpDQoNCnZhckltcFBsb3QobW9kKQ0KYGBgDQojIyBQcmVkY3Rpb24gYW5kIEV2YWx1YXRpb24NCg0KIyMjIFByZWRpY3QNCmBgYHtyfQ0KcmVzdWx0PC1wcmVkaWN0KG1vZCx0ZXN0LHR5cGU9ImNsYXNzIikNCnJlc3VsdCAlPiUgaGVhZCgpDQpgYGANCg0KIyMjIEV2YWx1YXRpbmcgd2l0aCBhIGNvbmZ1c2lvbiBtYXRyaXgNCmBgYHtyLEVDSE89VFJVRX0NCmNvbmZ1c2lvbk1hdHJpeChyZXN1bHQsdGVzdCRkZWNpc2lvbikNCmBgYA0KIyMgV2UgaGF2ZSBhIDkyJSBBY2N1cmFjeSBpbiB0aGUgcHJlZGljdGlvbi4NCg0KDQoNCg0KDQoNCg0KDQo=