Predict high sales from the Company data attributes using a Random Forest classifier.
# install.packages("randomForest")
# install.packages("Mass")
# install.packages("caret")
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.5.1
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
library(MASS)
library(caret)
## Warning: package 'caret' was built under R version 3.5.1
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.5.1
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
# Use set.seed() so that we get the same results each time
set.seed(123)
CompanyData <- read.csv(file.choose())
hist(CompanyData$Sales, main = "Sales of CompanyData", xlim = c(0, 20),
     breaks = 10, col = c("blue", "red", "green", "violet"))

highsales = ifelse(CompanyData$Sales<9, "No", "Yes") # sales of 9 or more count as high ("Yes"), otherwise low ("No")
CD = data.frame(CompanyData[2:11], highsales) # drop the raw Sales column; keep the 10 predictors plus the new label
str(CD)
## 'data.frame': 400 obs. of 11 variables:
## $ CompPrice : int 138 111 113 117 141 124 115 136 132 132 ...
## $ Income : int 73 48 35 100 64 113 105 81 110 113 ...
## $ Advertising: int 11 16 10 4 3 13 0 15 0 0 ...
## $ Population : int 276 260 269 466 340 501 45 425 108 131 ...
## $ Price : int 120 83 80 97 128 72 108 120 124 124 ...
## $ ShelveLoc : Factor w/ 3 levels "Bad","Good","Medium": 1 2 3 3 1 1 3 2 3 3 ...
## $ Age : int 42 65 59 55 38 78 71 67 76 76 ...
## $ Education : int 17 10 12 14 13 16 15 10 10 17 ...
## $ Urban : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 1 2 2 1 1 ...
## $ US : Factor w/ 2 levels "No","Yes": 2 2 2 2 1 2 1 2 1 2 ...
## $ highsales : Factor w/ 2 levels "No","Yes": 2 2 2 1 1 2 1 2 1 1 ...
table(CD$highsales)
##
## No Yes
## 286 114
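# Note: on R 4.0 and later, data.frame() no longer converts character columns
# to factors by default, so highsales would stay a character vector and
# randomForest() would not treat this as a classification problem. A minimal,
# version-proof construction (same cut-off of 9):
highsales <- factor(ifelse(CompanyData$Sales < 9, "No", "Yes"), levels = c("No", "Yes"))
CD <- data.frame(CompanyData[2:11], highsales)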
# Data Partition
set.seed(123)
ind <- sample(2, nrow(CD), replace = TRUE, prob = c(0.7,0.3))
train <- CD[ind==1,]
test <- CD[ind==2,]
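# A stratified alternative: caret's createDataPartition() preserves the No/Yes
# ratio in both subsets. A sketch only; train2/test2 are not used below:
idx    <- createDataPartition(CD$highsales, p = 0.7, list = FALSE)
train2 <- CD[idx, ]
test2  <- CD[-idx, ]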
set.seed(213)
rf <- randomForest(highsales~., data=train)
rf # Forest summary: number of trees and mtry, the number of variables tried at each tree node
##
## Call:
## randomForest(formula = highsales ~ ., data = train)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 18.25%
## Confusion matrix:
## No Yes class.error
## No 198 12 0.05714286
## Yes 40 35 0.53333333
# The out-of-bag (OOB) estimate of the error rate is 18.25% for this model.
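# Each tree is fit on a bootstrap sample that leaves out roughly a third of the
# rows; the OOB error is computed from those held-out rows. The running
# estimate per tree is stored in rf$err.rate, and its last row matches the
# figure above:
tail(rf$err.rate, 1)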
attributes(rf)
## $names
## [1] "call" "type" "predicted"
## [4] "err.rate" "confusion" "votes"
## [7] "oob.times" "classes" "importance"
## [10] "importanceSD" "localImportance" "proximity"
## [13] "ntree" "mtry" "forest"
## [16] "y" "test" "inbag"
## [19] "terms"
##
## $class
## [1] "randomForest.formula" "randomForest"
# Prediction and Confusion Matrix - Training data
pred1 <- predict(rf, train)
head(pred1)
## 1 3 6 7 9 10
## Yes Yes Yes No No No
## Levels: No Yes
head(train$highsales)
## [1] Yes Yes Yes No No No
## Levels: No Yes
# The first six predicted values match the observed values.
confusionMatrix(pred1, train$highsales) # 100 % accuracy on training data
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 210 0
## Yes 0 75
##
## Accuracy : 1
## 95% CI : (0.9871, 1)
## No Information Rate : 0.7368
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.7368
## Detection Rate : 0.7368
## Detection Prevalence : 0.7368
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : No
##
# The 95% confidence interval for accuracy is (0.9871, 1).
# Sensitivity and specificity are both 100%: the forest fits the training data perfectly.
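# The 100% is optimistic: predict(rf, train) lets every tree vote on rows it
# was trained on. Calling predict() with no newdata returns the out-of-bag
# predictions instead, which give an honest training-set error estimate:
oob_pred <- predict(rf)
confusionMatrix(oob_pred, train$highsales)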
# Prediction and Confusion Matrix - Test data
pred2 <- predict(rf, test)
confusionMatrix(pred2, test$highsales) # 84.35 % accuracy on test data
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 74 16
## Yes 2 23
##
## Accuracy : 0.8435
## 95% CI : (0.764, 0.9045)
## No Information Rate : 0.6609
## P-Value [Acc > NIR] : 9.101e-06
##
## Kappa : 0.6174
## Mcnemar's Test P-Value : 0.002183
##
## Sensitivity : 0.9737
## Specificity : 0.5897
## Pos Pred Value : 0.8222
## Neg Pred Value : 0.9200
## Prevalence : 0.6609
## Detection Rate : 0.6435
## Detection Prevalence : 0.7826
## Balanced Accuracy : 0.7817
##
## 'Positive' Class : No
##
# Error rate of the Random Forest model across trees:
plot(rf)
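# plot(rf) draws three error curves (OOB, "No", "Yes") but no legend; the
# column names of the error-rate matrix supply one:
legend("topright", legend = colnames(rf$err.rate), col = 1:3, lty = 1:3)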

# Tune the random forest's mtry (number of variables tried at each split)
tune <- tuneRF(train[,-11], train[,11], stepFactor = 0.5, plot = TRUE, ntreeTry = 300,
trace = TRUE, improve = 0.05)
## mtry = 3 OOB error = 18.25%
## Searching left ...
## mtry = 6 OOB error = 20%
## -0.09615385 0.05
## Searching right ...
## mtry = 1 OOB error = 20%
## -0.09615385 0.05
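# An alternative search: caret's train() tunes mtry by cross-validation rather
# than a single OOB pass. A sketch; rf_cv is not used in the steps below
# (caret:: avoids any confusion with the data frame named train):
ctrl  <- trainControl(method = "cv", number = 5)
rf_cv <- caret::train(highsales ~ ., data = train, method = "rf",
                      tuneGrid = data.frame(mtry = c(1, 3, 6)),
                      trControl = ctrl, ntree = 300)
rf_cv$bestTune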

rf1 <- randomForest(highsales~., data=train, ntree = 300, mtry = 3, importance = TRUE,
proximity = TRUE)
rf1
##
## Call:
## randomForest(formula = highsales ~ ., data = train, ntree = 300, mtry = 3, importance = TRUE, proximity = TRUE)
## Type of random forest: classification
## Number of trees: 300
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 17.89%
## Confusion matrix:
## No Yes class.error
## No 197 13 0.06190476
## Yes 38 37 0.50666667
pred1 <- predict(rf1, train)
confusionMatrix(pred1, train$highsales) # 100 % accuracy on training data
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 210 0
## Yes 0 75
##
## Accuracy : 1
## 95% CI : (0.9871, 1)
## No Information Rate : 0.7368
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.7368
## Detection Rate : 0.7368
## Detection Prevalence : 0.7368
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : No
##
# The 95% confidence interval for accuracy is (0.9871, 1).
# Sensitivity and specificity are again 100% on the training data.
# Test-data prediction using the tuned model rf1
pred2 <- predict(rf1, test)
confusionMatrix(pred2, test$highsales) # 83.48 % accuracy on test data
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 74 17
## Yes 2 22
##
## Accuracy : 0.8348
## 95% CI : (0.7541, 0.8975)
## No Information Rate : 0.6609
## P-Value [Acc > NIR] : 2.469e-05
##
## Kappa : 0.5933
## Mcnemar's Test P-Value : 0.001319
##
## Sensitivity : 0.9737
## Specificity : 0.5641
## Pos Pred Value : 0.8132
## Neg Pred Value : 0.9167
## Prevalence : 0.6609
## Detection Rate : 0.6435
## Detection Prevalence : 0.7913
## Balanced Accuracy : 0.7689
##
## 'Positive' Class : No
##
# The 95% confidence interval for accuracy is (0.7541, 0.8975).
# Number of nodes per tree:
hist(treesize(rf1), main = "No of Nodes for the trees", col = "green")

# Most trees have around 45 to 50 nodes.
# Variable Importance :
varImpPlot(rf1)

# The Mean Decrease Accuracy plot shows how much worse the model performs when each variable
# is permuted: ShelveLoc is the most important predictor, while Population contributes almost nothing.
# The Mean Decrease Gini plot shows the average reduction in Gini impurity from splits on each
# variable: Price is very important and Urban is not.
varImpPlot(rf1, sort = TRUE, n.var = 5, main = "Top 5 - Variable Importance")

# Quantitative values
importance(rf1)
## No Yes MeanDecreaseAccuracy MeanDecreaseGini
## CompPrice 3.38574659 -0.1155233 3.0001104 11.553706
## Income 2.29636970 1.3605459 2.5139167 10.329353
## Advertising 4.20338514 13.6986296 11.8658287 15.298464
## Population -0.78692660 -2.7729681 -2.2985757 11.277250
## Price 16.92436779 13.2679497 20.7258540 22.290794
## ShelveLoc 17.14250593 20.3844036 22.6144301 17.436698
## Age 2.90455533 0.1397363 2.2852045 10.772620
## Education -0.26102471 2.3290938 1.3369748 6.865897
## Urban -0.93546647 -0.3064684 -0.9109341 1.253121
## US -0.08215293 5.3279315 3.1953399 2.564866
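# The same table ranked by permutation importance; type = 1 returns only the
# MeanDecreaseAccuracy column:
imp <- importance(rf1, type = 1)
imp[order(imp, decreasing = TRUE), , drop = FALSE]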
varUsed(rf) # how many times each predictor variable is used in splits across the forest
## [1] 2670 2482 2131 2639 3211 1386 2520 1875 399 376
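# The counts come back unnamed; attaching the predictor names (same column
# order as CD) makes them readable:
setNames(varUsed(rf), names(CD)[1:10])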
# Partial Dependence Plot
partialPlot(rf1, train, Price, "Yes")

# The plot shows that once Price reaches about 100 or more, the likelihood of high sales drops sharply.
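# For a factor predictor the partial dependence is drawn per level; ShelveLoc,
# the other top variable, is a natural companion plot (a sketch):
partialPlot(rf1, train, ShelveLoc, "Yes",
            main = "Partial dependence of high sales on ShelveLoc")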
# Extract a single tree from the forest:
getTree(rf, 1, labelVar = TRUE)
## left daughter right daughter split var split point status prediction
## 1 2 3 Price 96.5 1 <NA>
## 2 4 5 Population 268.5 1 <NA>
## 3 6 7 Advertising 8.5 1 <NA>
## 4 8 9 Price 93.5 1 <NA>
## 5 10 11 CompPrice 100.0 1 <NA>
## 6 12 13 Price 134.0 1 <NA>
## 7 14 15 Population 432.0 1 <NA>
## 8 16 17 Advertising 6.5 1 <NA>
## 9 0 0 <NA> 0.0 -1 Yes
## 10 18 19 Price 59.5 1 <NA>
## 11 20 21 Population 350.5 1 <NA>
## 12 22 23 CompPrice 141.0 1 <NA>
## 13 0 0 <NA> 0.0 -1 No
## 14 24 25 Income 76.5 1 <NA>
## 15 26 27 CompPrice 112.0 1 <NA>
## 16 28 29 Population 56.5 1 <NA>
## 17 30 31 Population 244.5 1 <NA>
## 18 0 0 <NA> 0.0 -1 Yes
## 19 0 0 <NA> 0.0 -1 No
## 20 0 0 <NA> 0.0 -1 Yes
## 21 32 33 Income 26.0 1 <NA>
## 22 34 35 Education 12.5 1 <NA>
## 23 36 37 Age 54.5 1 <NA>
## 24 38 39 Education 17.5 1 <NA>
## 25 40 41 CompPrice 112.5 1 <NA>
## 26 0 0 <NA> 0.0 -1 No
## 27 42 43 Age 39.5 1 <NA>
## 28 0 0 <NA> 0.0 -1 Yes
## 29 0 0 <NA> 0.0 -1 No
## 30 44 45 CompPrice 117.0 1 <NA>
## 31 46 47 Income 73.5 1 <NA>
## 32 0 0 <NA> 0.0 -1 No
## 33 48 49 Advertising 14.5 1 <NA>
## 34 50 51 Price 109.0 1 <NA>
## 35 0 0 <NA> 0.0 -1 No
## 36 0 0 <NA> 0.0 -1 Yes
## 37 52 53 ShelveLoc 2.0 1 <NA>
## 38 54 55 CompPrice 165.5 1 <NA>
## 39 56 57 Income 27.5 1 <NA>
## 40 58 59 Population 368.5 1 <NA>
## 41 60 61 Advertising 10.5 1 <NA>
## 42 0 0 <NA> 0.0 -1 Yes
## 43 62 63 Income 88.5 1 <NA>
## 44 0 0 <NA> 0.0 -1 Yes
## 45 64 65 Price 79.5 1 <NA>
## 46 0 0 <NA> 0.0 -1 Yes
## 47 0 0 <NA> 0.0 -1 No
## 48 0 0 <NA> 0.0 -1 Yes
## 49 66 67 Income 63.5 1 <NA>
## 50 68 69 Population 99.5 1 <NA>
## 51 0 0 <NA> 0.0 -1 No
## 52 0 0 <NA> 0.0 -1 Yes
## 53 0 0 <NA> 0.0 -1 No
## 54 70 71 Education 10.5 1 <NA>
## 55 0 0 <NA> 0.0 -1 Yes
## 56 0 0 <NA> 0.0 -1 No
## 57 0 0 <NA> 0.0 -1 Yes
## 58 0 0 <NA> 0.0 -1 Yes
## 59 72 73 Income 102.5 1 <NA>
## 60 0 0 <NA> 0.0 -1 Yes
## 61 74 75 Income 118.5 1 <NA>
## 62 0 0 <NA> 0.0 -1 No
## 63 0 0 <NA> 0.0 -1 Yes
## 64 0 0 <NA> 0.0 -1 Yes
## 65 0 0 <NA> 0.0 -1 No
## 66 0 0 <NA> 0.0 -1 Yes
## 67 0 0 <NA> 0.0 -1 No
## 68 0 0 <NA> 0.0 -1 Yes
## 69 76 77 ShelveLoc 2.0 1 <NA>
## 70 78 79 Advertising 14.5 1 <NA>
## 71 80 81 ShelveLoc 2.0 1 <NA>
## 72 0 0 <NA> 0.0 -1 No
## 73 0 0 <NA> 0.0 -1 Yes
## 74 82 83 Advertising 15.0 1 <NA>
## 75 0 0 <NA> 0.0 -1 Yes
## 76 0 0 <NA> 0.0 -1 Yes
## 77 0 0 <NA> 0.0 -1 No
## 78 0 0 <NA> 0.0 -1 No
## 79 0 0 <NA> 0.0 -1 Yes
## 80 84 85 Population 235.0 1 <NA>
## 81 86 87 Age 44.5 1 <NA>
## 82 88 89 Age 72.0 1 <NA>
## 83 0 0 <NA> 0.0 -1 Yes
## 84 0 0 <NA> 0.0 -1 No
## 85 0 0 <NA> 0.0 -1 Yes
## 86 90 91 Education 16.5 1 <NA>
## 87 0 0 <NA> 0.0 -1 No
## 88 0 0 <NA> 0.0 -1 No
## 89 92 93 Age 74.0 1 <NA>
## 90 0 0 <NA> 0.0 -1 No
## 91 94 95 Population 339.0 1 <NA>
## 92 0 0 <NA> 0.0 -1 Yes
## 93 0 0 <NA> 0.0 -1 No
## 94 0 0 <NA> 0.0 -1 Yes
## 95 0 0 <NA> 0.0 -1 No
# Multi-dimensional scaling (MDS) plot of the proximity matrix
MDSplot(rf1, train$highsales) # the class labels must match the rows the proximity matrix was built on
## Warning in RColorBrewer::brewer.pal(nlevs, "Set1"): minimal value for n is 3, returning requested palette with 3 different levels
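# The same embedding by hand: MDSplot() runs classical MDS (cmdscale) on
# 1 - proximity, so the plot can be rebuilt with full control:
mds <- cmdscale(1 - rf1$proximity, k = 2)
plot(mds, col = as.numeric(train$highsales) + 1, pch = 19,
     xlab = "Dimension 1", ylab = "Dimension 2",
     main = "MDS of random forest proximities")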
