Predict high sales from the Company data attributes using a Random Forest classifier.
# install.packages("randomForest")
# install.packages("Mass")
# install.packages("caret")
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.5.1
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
library(MASS)
library(caret)
## Warning: package 'caret' was built under R version 3.5.1
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.5.1
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
# Use set.seed() so that we get the same results each time
set.seed(123)
CompanyData <- read.csv(file.choose())
hist(CompanyData$Sales, main = "Sales of CompanyData", xlim = c(0, 20),
     breaks = 10, col = c("blue", "red", "green", "violet"))

highsales = ifelse(CompanyData$Sales<9, "No", "Yes") # sales of 9 or more count as high ("Yes"), otherwise low ("No")
CD = data.frame(CompanyData[2:11], highsales) # drop the raw Sales column; keep the 10 predictors plus the new label
str(CD)
## 'data.frame': 400 obs. of 11 variables:
## $ CompPrice : int 138 111 113 117 141 124 115 136 132 132 ...
## $ Income : int 73 48 35 100 64 113 105 81 110 113 ...
## $ Advertising: int 11 16 10 4 3 13 0 15 0 0 ...
## $ Population : int 276 260 269 466 340 501 45 425 108 131 ...
## $ Price : int 120 83 80 97 128 72 108 120 124 124 ...
## $ ShelveLoc : Factor w/ 3 levels "Bad","Good","Medium": 1 2 3 3 1 1 3 2 3 3 ...
## $ Age : int 42 65 59 55 38 78 71 67 76 76 ...
## $ Education : int 17 10 12 14 13 16 15 10 10 17 ...
## $ Urban : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 1 2 2 1 1 ...
## $ US : Factor w/ 2 levels "No","Yes": 2 2 2 2 1 2 1 2 1 2 ...
## $ highsales : Factor w/ 2 levels "No","Yes": 2 2 2 1 1 2 1 2 1 1 ...
table(CD$highsales)
##
## No Yes
## 286 114
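# Note: on R 4.0 and later, data.frame() no longer converts character columns
# to factors by default, so highsales would stay a character vector and
# randomForest() would not treat this as a classification problem. A minimal,
# version-proof construction (same cut-off of 9):
highsales <- factor(ifelse(CompanyData$Sales < 9, "No", "Yes"), levels = c("No", "Yes"))
CD <- data.frame(CompanyData[2:11], highsales)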
# Data Partition
set.seed(123)
ind <- sample(2, nrow(CD), replace = TRUE, prob = c(0.7,0.3))
train <- CD[ind==1,]
test <- CD[ind==2,]
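# A stratified alternative: caret's createDataPartition() preserves the No/Yes
# ratio in both subsets. A sketch only; train2/test2 are not used below:
idx    <- createDataPartition(CD$highsales, p = 0.7, list = FALSE)
train2 <- CD[idx, ]
test2  <- CD[-idx, ]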
set.seed(213)
rf <- randomForest(highsales~., data=train)
rf # Forest summary: number of trees and mtry, the number of variables tried at each tree node
##
## Call:
## randomForest(formula = highsales ~ ., data = train)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 18.25%
## Confusion matrix:
## No Yes class.error
## No 198 12 0.05714286
## Yes 40 35 0.53333333
# The out-of-bag (OOB) estimate of the error rate is 18.25% for this model.
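# Each tree is fit on a bootstrap sample that leaves out roughly a third of the
# rows; the OOB error is computed from those held-out rows. The running
# estimate per tree is stored in rf$err.rate, and its last row matches the
# figure above:
tail(rf$err.rate, 1)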
attributes(rf)
## $names
## [1] "call" "type" "predicted"
## [4] "err.rate" "confusion" "votes"
## [7] "oob.times" "classes" "importance"
## [10] "importanceSD" "localImportance" "proximity"
## [13] "ntree" "mtry" "forest"
## [16] "y" "test" "inbag"
## [19] "terms"
##
## $class
## [1] "randomForest.formula" "randomForest"
# Prediction and Confusion Matrix - Training data
pred1 <- predict(rf, train)
head(pred1)
## 1 3 6 7 9 10
## Yes Yes Yes No No No
## Levels: No Yes
head(train$highsales)
## [1] Yes Yes Yes No No No
## Levels: No Yes
# The first six predicted values match the observed values.
confusionMatrix(pred1, train$highsales) # 100 % accuracy on training data
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 210 0
## Yes 0 75
##
## Accuracy : 1
## 95% CI : (0.9871, 1)
## No Information Rate : 0.7368
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.7368
## Detection Rate : 0.7368
## Detection Prevalence : 0.7368
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : No
##
# The 95% confidence interval for accuracy is (0.9871, 1).
# Sensitivity and specificity are both 100%: the forest fits the training data perfectly.
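# The 100% is optimistic: predict(rf, train) lets every tree vote on rows it
# was trained on. Calling predict() with no newdata returns the out-of-bag
# predictions instead, which give an honest training-set error estimate:
oob_pred <- predict(rf)
confusionMatrix(oob_pred, train$highsales)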
# Prediction and Confusion Matrix - Test data
pred2 <- predict(rf, test)
confusionMatrix(pred2, test$highsales) # 84.35 % accuracy on test data
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 74 16
## Yes 2 23
##
## Accuracy : 0.8435
## 95% CI : (0.764, 0.9045)
## No Information Rate : 0.6609
## P-Value [Acc > NIR] : 9.101e-06
##
## Kappa : 0.6174
## Mcnemar's Test P-Value : 0.002183
##
## Sensitivity : 0.9737
## Specificity : 0.5897
## Pos Pred Value : 0.8222
## Neg Pred Value : 0.9200
## Prevalence : 0.6609
## Detection Rate : 0.6435
## Detection Prevalence : 0.7826
## Balanced Accuracy : 0.7817
##
## 'Positive' Class : No
##
# Error rate of the Random Forest model across trees:
plot(rf)
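# plot(rf) draws three error curves (OOB, "No", "Yes") but no legend; the
# column names of the error-rate matrix supply one:
legend("topright", legend = colnames(rf$err.rate), col = 1:3, lty = 1:3)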

# Tune the random forest's mtry (number of variables tried at each split)
tune <- tuneRF(train[,-11], train[,11], stepFactor = 0.5, plot = TRUE, ntreeTry = 300,
trace = TRUE, improve = 0.05)
## mtry = 3 OOB error = 18.25%
## Searching left ...
## mtry = 6 OOB error = 20%
## -0.09615385 0.05
## Searching right ...
## mtry = 1 OOB error = 20%
## -0.09615385 0.05
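# An alternative search: caret's train() tunes mtry by cross-validation rather
# than a single OOB pass. A sketch; rf_cv is not used in the steps below
# (caret:: avoids any confusion with the data frame named train):
ctrl  <- trainControl(method = "cv", number = 5)
rf_cv <- caret::train(highsales ~ ., data = train, method = "rf",
                      tuneGrid = data.frame(mtry = c(1, 3, 6)),
                      trControl = ctrl, ntree = 300)
rf_cv$bestTune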

rf1 <- randomForest(highsales~., data=train, ntree = 300, mtry = 3, importance = TRUE,
proximity = TRUE)
rf1
##
## Call:
## randomForest(formula = highsales ~ ., data = train, ntree = 300, mtry = 3, importance = TRUE, proximity = TRUE)
## Type of random forest: classification
## Number of trees: 300
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 17.89%
## Confusion matrix:
## No Yes class.error
## No 197 13 0.06190476
## Yes 38 37 0.50666667
pred1 <- predict(rf1, train)
confusionMatrix(pred1, train$highsales) # 100 % accuracy on training data
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 210 0
## Yes 0 75
##
## Accuracy : 1
## 95% CI : (0.9871, 1)
## No Information Rate : 0.7368
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.7368
## Detection Rate : 0.7368
## Detection Prevalence : 0.7368
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : No
##
# The 95% confidence interval for accuracy is (0.9871, 1).
# Sensitivity and specificity are again 100% on the training data.
# Test-data prediction using the tuned model rf1
pred2 <- predict(rf1, test)
confusionMatrix(pred2, test$highsales) # 83.48 % accuracy on test data
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 74 17
## Yes 2 22
##
## Accuracy : 0.8348
## 95% CI : (0.7541, 0.8975)
## No Information Rate : 0.6609
## P-Value [Acc > NIR] : 2.469e-05
##
## Kappa : 0.5933
## Mcnemar's Test P-Value : 0.001319
##
## Sensitivity : 0.9737
## Specificity : 0.5641
## Pos Pred Value : 0.8132
## Neg Pred Value : 0.9167
## Prevalence : 0.6609
## Detection Rate : 0.6435
## Detection Prevalence : 0.7913
## Balanced Accuracy : 0.7689
##
## 'Positive' Class : No
##
# The 95% confidence interval for accuracy is (0.7541, 0.8975).
# Number of nodes per tree:
hist(treesize(rf1), main = "No of Nodes for the trees", col = "green")

# Most trees have around 45 to 50 nodes.
# Variable Importance :
varImpPlot(rf1)

# The Mean Decrease Accuracy plot shows how much worse the model performs when each variable
# is permuted: ShelveLoc is the most important predictor, while Population contributes almost nothing.
# The Mean Decrease Gini plot shows the average reduction in Gini impurity from splits on each
# variable: Price is very important and Urban is not.
varImpPlot(rf1, sort = TRUE, n.var = 5, main = "Top 5 - Variable Importance")

# Quantitative values
importance(rf1)
## No Yes MeanDecreaseAccuracy MeanDecreaseGini
## CompPrice 3.38574659 -0.1155233 3.0001104 11.553706
## Income 2.29636970 1.3605459 2.5139167 10.329353
## Advertising 4.20338514 13.6986296 11.8658287 15.298464
## Population -0.78692660 -2.7729681 -2.2985757 11.277250
## Price 16.92436779 13.2679497 20.7258540 22.290794
## ShelveLoc 17.14250593 20.3844036 22.6144301 17.436698
## Age 2.90455533 0.1397363 2.2852045 10.772620
## Education -0.26102471 2.3290938 1.3369748 6.865897
## Urban -0.93546647 -0.3064684 -0.9109341 1.253121
## US -0.08215293 5.3279315 3.1953399 2.564866
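# The same table ranked by permutation importance; type = 1 returns only the
# MeanDecreaseAccuracy column:
imp <- importance(rf1, type = 1)
imp[order(imp, decreasing = TRUE), , drop = FALSE]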
varUsed(rf) # how many times each predictor variable is used in splits across the forest
## [1] 2670 2482 2131 2639 3211 1386 2520 1875 399 376
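# The counts come back unnamed; attaching the predictor names (same column
# order as CD) makes them readable:
setNames(varUsed(rf), names(CD)[1:10])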
# Partial Dependence Plot
partialPlot(rf1, train, Price, "Yes")

# The plot shows that once Price reaches about 100 or more, the likelihood of high sales drops sharply.
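# For a factor predictor the partial dependence is drawn per level; ShelveLoc,
# the other top variable, is a natural companion plot (a sketch):
partialPlot(rf1, train, ShelveLoc, "Yes",
            main = "Partial dependence of high sales on ShelveLoc")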
# Extract a single tree from the forest:
getTree(rf, 1, labelVar = TRUE)
## left daughter right daughter split var split point status prediction
## 1 2 3 Price 96.5 1 <NA>
## 2 4 5 Population 268.5 1 <NA>
## 3 6 7 Advertising 8.5 1 <NA>
## 4 8 9 Price 93.5 1 <NA>
## 5 10 11 CompPrice 100.0 1 <NA>
## 6 12 13 Price 134.0 1 <NA>
## 7 14 15 Population 432.0 1 <NA>
## 8 16 17 Advertising 6.5 1 <NA>
## 9 0 0 <NA> 0.0 -1 Yes
## 10 18 19 Price 59.5 1 <NA>
## 11 20 21 Population 350.5 1 <NA>
## 12 22 23 CompPrice 141.0 1 <NA>
## 13 0 0 <NA> 0.0 -1 No
## 14 24 25 Income 76.5 1 <NA>
## 15 26 27 CompPrice 112.0 1 <NA>
## 16 28 29 Population 56.5 1 <NA>
## 17 30 31 Population 244.5 1 <NA>
## 18 0 0 <NA> 0.0 -1 Yes
## 19 0 0 <NA> 0.0 -1 No
## 20 0 0 <NA> 0.0 -1 Yes
## 21 32 33 Income 26.0 1 <NA>
## 22 34 35 Education 12.5 1 <NA>
## 23 36 37 Age 54.5 1 <NA>
## 24 38 39 Education 17.5 1 <NA>
## 25 40 41 CompPrice 112.5 1 <NA>
## 26 0 0 <NA> 0.0 -1 No
## 27 42 43 Age 39.5 1 <NA>
## 28 0 0 <NA> 0.0 -1 Yes
## 29 0 0 <NA> 0.0 -1 No
## 30 44 45 CompPrice 117.0 1 <NA>
## 31 46 47 Income 73.5 1 <NA>
## 32 0 0 <NA> 0.0 -1 No
## 33 48 49 Advertising 14.5 1 <NA>
## 34 50 51 Price 109.0 1 <NA>
## 35 0 0 <NA> 0.0 -1 No
## 36 0 0 <NA> 0.0 -1 Yes
## 37 52 53 ShelveLoc 2.0 1 <NA>
## 38 54 55 CompPrice 165.5 1 <NA>
## 39 56 57 Income 27.5 1 <NA>
## 40 58 59 Population 368.5 1 <NA>
## 41 60 61 Advertising 10.5 1 <NA>
## 42 0 0 <NA> 0.0 -1 Yes
## 43 62 63 Income 88.5 1 <NA>
## 44 0 0 <NA> 0.0 -1 Yes
## 45 64 65 Price 79.5 1 <NA>
## 46 0 0 <NA> 0.0 -1 Yes
## 47 0 0 <NA> 0.0 -1 No
## 48 0 0 <NA> 0.0 -1 Yes
## 49 66 67 Income 63.5 1 <NA>
## 50 68 69 Population 99.5 1 <NA>
## 51 0 0 <NA> 0.0 -1 No
## 52 0 0 <NA> 0.0 -1 Yes
## 53 0 0 <NA> 0.0 -1 No
## 54 70 71 Education 10.5 1 <NA>
## 55 0 0 <NA> 0.0 -1 Yes
## 56 0 0 <NA> 0.0 -1 No
## 57 0 0 <NA> 0.0 -1 Yes
## 58 0 0 <NA> 0.0 -1 Yes
## 59 72 73 Income 102.5 1 <NA>
## 60 0 0 <NA> 0.0 -1 Yes
## 61 74 75 Income 118.5 1 <NA>
## 62 0 0 <NA> 0.0 -1 No
## 63 0 0 <NA> 0.0 -1 Yes
## 64 0 0 <NA> 0.0 -1 Yes
## 65 0 0 <NA> 0.0 -1 No
## 66 0 0 <NA> 0.0 -1 Yes
## 67 0 0 <NA> 0.0 -1 No
## 68 0 0 <NA> 0.0 -1 Yes
## 69 76 77 ShelveLoc 2.0 1 <NA>
## 70 78 79 Advertising 14.5 1 <NA>
## 71 80 81 ShelveLoc 2.0 1 <NA>
## 72 0 0 <NA> 0.0 -1 No
## 73 0 0 <NA> 0.0 -1 Yes
## 74 82 83 Advertising 15.0 1 <NA>
## 75 0 0 <NA> 0.0 -1 Yes
## 76 0 0 <NA> 0.0 -1 Yes
## 77 0 0 <NA> 0.0 -1 No
## 78 0 0 <NA> 0.0 -1 No
## 79 0 0 <NA> 0.0 -1 Yes
## 80 84 85 Population 235.0 1 <NA>
## 81 86 87 Age 44.5 1 <NA>
## 82 88 89 Age 72.0 1 <NA>
## 83 0 0 <NA> 0.0 -1 Yes
## 84 0 0 <NA> 0.0 -1 No
## 85 0 0 <NA> 0.0 -1 Yes
## 86 90 91 Education 16.5 1 <NA>
## 87 0 0 <NA> 0.0 -1 No
## 88 0 0 <NA> 0.0 -1 No
## 89 92 93 Age 74.0 1 <NA>
## 90 0 0 <NA> 0.0 -1 No
## 91 94 95 Population 339.0 1 <NA>
## 92 0 0 <NA> 0.0 -1 Yes
## 93 0 0 <NA> 0.0 -1 No
## 94 0 0 <NA> 0.0 -1 Yes
## 95 0 0 <NA> 0.0 -1 No
# Multi-dimensional scaling (MDS) plot of the proximity matrix
MDSplot(rf1, train$highsales) # the class labels must match the rows the proximity matrix was built on
## Warning in RColorBrewer::brewer.pal(nlevs, "Set1"): minimal value for n is 3, returning requested palette with 3 different levels
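# The same embedding by hand: MDSplot() runs classical MDS (cmdscale) on
# 1 - proximity, so the plot can be rebuilt with full control:
mds <- cmdscale(1 - rf1$proximity, k = 2)
plot(mds, col = as.numeric(train$highsales) + 1, pch = 19,
     xlab = "Dimension 1", ylab = "Dimension 2",
     main = "MDS of random forest proximities")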
