Classification Tree and Random Forest Using the Diamonds Dataset

library(ggplot2) # Data visualization
library(dplyr)
library(caret)
library(randomForest)
library(lattice)
library(tree)
library(rpart)
library(rpart.plot)

# Load the diamonds data and print its structure.
# The text columns cut and color are used below as classification targets and
# are passed to confusionMatrix(), which needs factors. Since R 4.0.0,
# read.csv() no longer converts strings to factors by default, so the
# conversion is requested explicitly.
diamond <- read.csv("diamonds.csv", stringsAsFactors = TRUE)
str(diamond)

## 'data.frame':    53940 obs. of  11 variables:
##  $ X      : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ carat  : num  0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
##  $ cut    : Factor w/ 5 levels "Fair","Good",..: 3 4 2 4 2 5 5 5 1 5 ...
##  $ color  : Factor w/ 7 levels "D","E","F","G",..: 2 2 2 6 7 7 6 5 2 5 ...
##  $ clarity: Factor w/ 8 levels "I1","IF","SI1",..: 4 3 5 6 4 8 7 3 6 5 ...
##  $ depth  : num  61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
##  $ table  : num  55 61 65 58 58 57 57 55 61 61 ...
##  $ price  : int  326 326 327 334 335 336 336 337 337 338 ...
##  $ x      : num  3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
##  $ y      : num  3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
##  $ z      : num  2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...

# Keep columns 2-11 only: this drops the first column (X, which looks like a
# row index carried over from the CSV) so it is not used as a predictor.
diamond_2 <- diamond[, 2:11]

# Random 80/20 train/test split. The seed is fixed so the same split can be
# reproduced across runs; seq_len() is safe even for a zero-row data frame,
# and floor() makes the (possibly fractional) sample size explicit.
set.seed(2017)
n_obs <- nrow(diamond_2)
index <- sample(seq_len(n_obs), size = floor(n_obs * 0.8))
train <- diamond_2[index, ]
test <- diamond_2[-index, ]

# Classification tree for diamond colour, with every other column of `train`
# as a predictor. Class predictions for the held-out rows are stored in
# `test$pred` and compared against the true colours.
fit_tree <- rpart(formula = color ~ ., data = train)
test[["pred"]] <- predict(object = fit_tree, newdata = test, type = "class")
confusionMatrix(data = test[["pred"]], reference = test[["color"]])

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    D    E    F    G    H    I    J
##          D    0    0    0    0    0    0    0
##          E  949 1172  976  930  667  375  176
##          F    0    0    0    0    0    0    0
##          G  299  624  702 1037  521  374  157
##          H  109  164  207  315  431  386  217
##          I    0    0    0    0    0    0    0
##          J    0    0    0    0    0    0    0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.2447          
##                  95% CI : (0.2366, 0.2529)
##     No Information Rate : 0.2115          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.0714          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: D Class: E Class: F Class: G Class: H Class: I
## Sensitivity            0.0000   0.5980   0.0000  0.45443  0.26621   0.0000
## Specificity            1.0000   0.5386   1.0000  0.68528  0.84753   1.0000
## Pos Pred Value            NaN   0.2235      NaN  0.27921  0.23565      NaN
## Neg Pred Value         0.8742   0.8578   0.8253  0.82400  0.86740   0.8948
## Prevalence             0.1258   0.1817   0.1747  0.21153  0.15007   0.1052
## Detection Rate         0.0000   0.1086   0.0000  0.09613  0.03995   0.0000
## Detection Prevalence   0.0000   0.4862   0.0000  0.34427  0.16954   0.0000
## Balanced Accuracy      0.5000   0.5683   0.5000  0.56985  0.55687   0.5000
##                      Class: J
## Sensitivity           0.00000
## Specificity           1.00000
## Pos Pred Value            NaN
## Neg Pred Value        0.94902
## Prevalence            0.05098
## Detection Rate        0.00000
## Detection Prevalence  0.00000
## Balanced Accuracy     0.50000

# Counts of each true colour in the test set, for comparison with the
# confusion matrix above.
table(test[["color"]])

## 
##    D    E    F    G    H    I    J 
## 1357 1960 1885 2282 1619 1135  550

# Draw the fitted colour tree.
rpart.plot(x = fit_tree)

# Classification tree for diamond cut, with every other column of `train` as
# a predictor. `fit_tree` and `test$pred` are overwritten with the results for
# this target, then the predictions are compared against the true cuts.
fit_tree <- rpart(formula = cut ~ ., data = train)
test[["pred"]] <- predict(object = fit_tree, newdata = test, type = "class")
confusionMatrix(data = test[["pred"]], reference = test[["cut"]])

## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Fair Good Ideal Premium Very Good
##   Fair       233   25     2       0         3
##   Good        43  500     6       0       170
##   Ideal        7   28  3963     334       727
##   Premium     18  219   226    2500      1095
##   Very Good    2  193    10       0       484
## 
## Overall Statistics
##                                           
##                Accuracy : 0.7119          
##                  95% CI : (0.7033, 0.7204)
##     No Information Rate : 0.39            
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.5867          
##  Mcnemar's Test P-Value : < 2.2e-16       
## 
## Statistics by Class:
## 
##                      Class: Fair Class: Good Class: Ideal Class: Premium
## Sensitivity              0.76898     0.51813       0.9420         0.8821
## Specificity              0.99714     0.97771       0.8335         0.8041
## Pos Pred Value           0.88593     0.69541       0.7834         0.6161
## Neg Pred Value           0.99335     0.95382       0.9574         0.9504
## Prevalence               0.02809     0.08945       0.3900         0.2627
## Detection Rate           0.02160     0.04635       0.3674         0.2317
## Detection Prevalence     0.02438     0.06665       0.4689         0.3762
## Balanced Accuracy        0.88306     0.74792       0.8877         0.8431
##                      Class: Very Good
## Sensitivity                   0.19524
## Specificity                   0.97533
## Pos Pred Value                0.70247
## Neg Pred Value                0.80246
## Prevalence                    0.22979
## Detection Rate                0.04486
## Detection Prevalence          0.06387
## Balanced Accuracy             0.58528

# Draw the fitted cut tree.
rpart.plot(x = fit_tree)

# Random forest (50 trees) for diamond colour, with every other column of
# `train` as a predictor. importance = TRUE asks randomForest to assess
# predictor importance so that varImpPlot() has the measures to display.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned.
fit_RF <- randomForest(color ~ ., data = train, ntree = 50, importance = TRUE)
varImpPlot(fit_RF)

# Predict classes for the held-out rows and compare with the true colours.
# "response" is the documented type for predicted classes; "class" is only
# accepted as a backward-compatibility alias for it.
test$pred <- predict(fit_RF, newdata = test, type = "response")
confusionMatrix(test$pred, test$color)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    D    E    F    G    H    I    J
##          D  734  236  125   50   34   17    0
##          E  330 1039  361  186   66   18    6
##          F  156  395  872  269  101   30    6
##          G   89  183  342 1403  277   80   17
##          H   36   83  123  280  903  235   52
##          I   10   19   45   70  181  671  115
##          J    2    5   17   24   57   84  354
## 
## Overall Statistics
##                                           
##                Accuracy : 0.5539          
##                  95% CI : (0.5445, 0.5634)
##     No Information Rate : 0.2115          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.4681          
##  Mcnemar's Test P-Value : 2.872e-07       
## 
## Statistics by Class:
## 
##                      Class: D Class: E Class: F Class: G Class: H Class: I
## Sensitivity           0.54090  0.53010  0.46260   0.6148   0.5578   0.5912
## Specificity           0.95101  0.89046  0.89251   0.8838   0.9118   0.9544
## Pos Pred Value        0.61371  0.51795  0.47676   0.5868   0.5275   0.6040
## Neg Pred Value        0.93505  0.89513  0.88693   0.8953   0.9211   0.9521
## Prevalence            0.12579  0.18168  0.17473   0.2115   0.1501   0.1052
## Detection Rate        0.06804  0.09631  0.08083   0.1301   0.0837   0.0622
## Detection Prevalence  0.11086  0.18595  0.16954   0.2216   0.1587   0.1030
## Balanced Accuracy     0.74596  0.71028  0.67755   0.7493   0.7348   0.7728
##                      Class: J
## Sensitivity           0.64364
## Specificity           0.98154
## Pos Pred Value        0.65193
## Neg Pred Value        0.98087
## Prevalence            0.05098
## Detection Rate        0.03281
## Detection Prevalence  0.05033
## Balanced Accuracy     0.81259

# Plot the colour forest's error rates against the number of trees.
plot(x = fit_RF)

# Random forest (50 trees) for diamond cut, with every other column of `train`
# as a predictor; `fit_RF` is overwritten with this model. importance = TRUE
# asks randomForest to assess predictor importance so that varImpPlot() has
# the measures to display. TRUE is spelled out because `T` is an ordinary
# variable that can be reassigned.
fit_RF <- randomForest(cut ~ ., data = train, ntree = 50, importance = TRUE)
varImpPlot(fit_RF)

# Predict classes for the held-out rows and compare with the true cuts.
# "response" is the documented type for predicted classes; "class" is only
# accepted as a backward-compatibility alias for it.
test$pred <- predict(fit_RF, newdata = test, type = "response")
confusionMatrix(test$pred, test$cut)

## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Fair Good Ideal Premium Very Good
##   Fair       262   22     2       0         3
##   Good        25  677     9      12       156
##   Ideal        5   13  3838     279       496
##   Premium      6   52   180    2320       455
##   Very Good    5  201   178     223      1369
## 
## Overall Statistics
##                                           
##                Accuracy : 0.7848          
##                  95% CI : (0.7769, 0.7925)
##     No Information Rate : 0.39            
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.6965          
##  Mcnemar's Test P-Value : < 2.2e-16       
## 
## Statistics by Class:
## 
##                      Class: Fair Class: Good Class: Ideal Class: Premium
## Sensitivity              0.86469     0.70155       0.9123         0.8186
## Specificity              0.99742     0.97944       0.8795         0.9129
## Pos Pred Value           0.90657     0.77019       0.8288         0.7700
## Neg Pred Value           0.99609     0.97094       0.9401         0.9339
## Prevalence               0.02809     0.08945       0.3900         0.2627
## Detection Rate           0.02429     0.06275       0.3558         0.2151
## Detection Prevalence     0.02679     0.08148       0.4293         0.2793
## Balanced Accuracy        0.93106     0.84050       0.8959         0.8658
##                      Class: Very Good
## Sensitivity                    0.5522
## Specificity                    0.9269
## Pos Pred Value                 0.6928
## Neg Pred Value                 0.8740
## Prevalence                     0.2298
## Detection Rate                 0.1269
## Detection Prevalence           0.1832
## Balanced Accuracy              0.7396

Classification Tree and Random Forest Using the Diamonds Dataset

Amoul Singhi

29 December 2017