Section 17 Decision Tree
Optionally set the working directory with setwd() before reading the data file below.
df <- read.csv("G:\\RStudio\\udemy\\ml\\Machine Learning AZ\\Part 3 - Classification\\Section 19 - Decision Tree Classification\\Decision_Tree_Classification\\Social_Network_Ads.csv")
head(df)
Select the fields that we will be working with
df <- df[,3:5]
head(df)
# Factorize the Purchased field (0 = did not buy, 1 = bought)
df$Purchased = factor(df$Purchased, levels = c(0, 1))
summary(df)
      Age         EstimatedSalary   Purchased
 Min.   :18.00    Min.   : 15000    0:257
 1st Qu.:29.75    1st Qu.: 43000    1:143
 Median :37.00    Median : 70000
 Mean   :37.66    Mean   : 69743
 3rd Qu.:46.00    3rd Qu.: 88000
 Max.   :60.00    Max.   :150000
Split dataset into training and test set (300 training, 100 test)
library(caTools)
set.seed(123)
split <- sample.split(df$Purchased, SplitRatio = 0.75)
training_set <- subset(df, split == TRUE)
test_set <- subset(df, split == FALSE)
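As a quick sanity check (not in the original notebook output), the split sizes can be verified directly; with SplitRatio = 0.75 on the 400 rows we expect 300 training and 100 test observations.
# Verify the stratified split produced by sample.split()
nrow(training_set)   # expected: 300
nrow(test_set)       # expected: 100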
For classification, it is better to do feature scaling (normalization)
# Feature scaling: column 1 is Age, column 2 is EstimatedSalary (column 3, Purchased, is excluded)
training_set[-3] <- scale(training_set[-3])
test_set[-3] <- scale(test_set[-3])
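A possible refinement, not used in the course code above: scale the test set with the centre and spread learned from the training set, so both sets share exactly the same transformation. A minimal sketch that would replace the two scaling lines above:
# Fit the scaling on the training set only, then reuse its parameters for the test set
sc <- scale(training_set[-3])
training_set[-3] <- sc
test_set[-3] <- scale(test_set[-3],
                      center = attr(sc, "scaled:center"),
                      scale  = attr(sc, "scaled:scale"))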
Fitting Classifier to the Training Set
# Create the classifier here
# install.packages("rpart")
library(rpart)
classifier = rpart(formula = Purchased ~ .,
                   data = training_set)
plot(classifier)
text(classifier)
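The base plot()/text() rendering of an rpart tree is fairly bare. If the rpart.plot package is available, it draws a more readable tree; this step is optional and not part of the original code.
# install.packages("rpart.plot")   # optional helper package for nicer tree plots
library(rpart.plot)
rpart.plot(classifier)             # draws the fitted tree with labelled splits and leaves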

Predicting the test set results
y_pred = predict(classifier, newdata = test_set[-3], type = 'class')
y_pred
2 4 5 9 12 18 19 20 22 29 32 34 35 38 45 46 48 52 66 69
0 0 0 0 0 0 1 1 0 0 1 0 1 0 0 0 0 0 0 0
74 75 82 84 85 86 87 89 103 104 107 108 109 117 124 126 127 131 134 139
1 0 0 1 0 1 0 0 1 1 0 1 1 0 0 0 0 0 0 0
148 154 156 159 162 163 170 175 176 193 199 200 208 213 224 226 228 229 230 234
0 0 0 0 1 0 0 0 0 0 0 0 1 1 1 0 1 0 0 1
236 237 239 241 255 264 265 266 273 274 281 286 292 299 302 305 307 310 316 324
1 0 1 1 0 0 1 1 1 1 1 1 1 0 0 0 1 0 0 1
326 332 339 341 343 347 353 363 364 367 368 369 372 373 380 383 389 392 395 400
0 1 0 1 0 1 1 0 0 1 1 0 1 0 1 1 1 1 0 1
Levels: 0 1
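For an rpart classification tree, predict() can also return class probabilities rather than hard labels, which helps inspect borderline observations; a small sketch, not part of the original output:
y_prob <- predict(classifier, newdata = test_set[-3], type = 'prob')
head(y_prob)   # one row per test observation, one column per class (0 and 1)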
Evaluate the predictions using a confusion matrix.
# Making the confusion matrix
# Column 3 (Purchased) holds the actual outcome
cm <- table(test_set[,3], y_pred)
cm
   y_pred
     0  1
  0 53 11
  1  6 30
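Reading the matrix: rows are the actual outcomes and columns the predictions, so 53 + 30 = 83 of the 100 test observations are classified correctly (83% accuracy), with 11 false positives and 6 false negatives. The accuracy can be computed directly from cm:
accuracy <- sum(diag(cm)) / sum(cm)
accuracy   # 0.83 for the matrix shown above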
Plot the decision regions (training set)
# install.packages("ElemStatLearn")  # note: archived on CRAN; the plotting below uses only base graphics
library(ElemStatLearn)
set <- training_set
X1 <- seq(min(set[,1]) - 1, max(set[,1]) + 1, by = 0.01)
X2 <- seq(min(set[,2]) - 1, max(set[,2]) + 1, by = 0.01)
grid_set <- expand.grid(X1, X2)
colnames(grid_set) = c('Age','EstimatedSalary')
y_grid = predict(classifier, newdata = grid_set, type = 'class')
plot(set[, -3],
     main = 'Classifier Model (Training Set)',
     xlab = 'Age', ylab = 'Estimated Salary',
     xlim = range(X1), ylim = range(X2))
contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
points(grid_set, pch = '.', col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
points(set, pch = 21, bg = ifelse(set[, 3] == 1, 'green4', 'red3'))

Plot description
The red region is predicted by the classifier as "Don't buy".
The green region is predicted by the classifier as "Buy".
The red dots are people who actually did not buy.
The green dots are people who actually bought.
The line between the regions is the prediction boundary.
Now we view the same plot for the test set
set <- test_set
X1 <- seq(min(set[,1]) - 1, max(set[,1]) + 1, by = 0.01)
X2 <- seq(min(set[,2]) - 1, max(set[,2]) + 1, by = 0.01)
grid_set <- expand.grid(X1, X2)
colnames(grid_set) = c('Age','EstimatedSalary')
y_grid = predict(classifier, newdata = grid_set, type = 'class')
plot(set[, -3],
     main = 'Classifier Model (Test Set)',
     xlab = 'Age', ylab = 'Estimated Salary',
     xlim = range(X1), ylim = range(X2))
contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
points(grid_set, pch = '.', col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
points(set, pch = 21, bg = ifelse(set[, 3] == 1, 'green4', 'red3'))
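As an optional follow-up not covered in this section, rpart exposes a complexity-parameter (cp) table that can be used to prune the tree and reduce overfitting; a hedged sketch (the cp value below is only an illustrative choice):
printcp(classifier)                      # cross-validated error for each candidate cp
plotcp(classifier)                       # visual aid for choosing cp
pruned <- prune(classifier, cp = 0.05)   # illustrative cp value; pick one from the cp table instead
plot(pruned)
text(pruned)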
