library(ggplot2) # Data visualization
library(dplyr)
library(caret)
library(randomForest)
library(lattice)
library(tree)
library(rpart)
library(rpart.plot)
# Load the diamonds data set.
# Since R 4.0.0 read.csv() defaults to stringsAsFactors = FALSE, which would
# load cut, color and clarity as character columns. The classification models
# and confusionMatrix() calls further down need factor responses, so the
# conversion is requested explicitly (this is also what older R did by default).
diamond <- read.csv("diamonds.csv", stringsAsFactors = TRUE)

# Quick structural check: column names, types and the first few values.
str(diamond)
## 'data.frame': 53940 obs. of 11 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ carat : num 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
## $ cut : Factor w/ 5 levels "Fair","Good",..: 3 4 2 4 2 5 5 5 1 5 ...
## $ color : Factor w/ 7 levels "D","E","F","G",..: 2 2 2 6 7 7 6 5 2 5 ...
## $ clarity: Factor w/ 8 levels "I1","IF","SI1",..: 4 3 5 6 4 8 7 3 6 5 ...
## $ depth : num 61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
## $ table : num 55 61 65 58 58 57 57 55 61 61 ...
## $ price : int 326 326 327 334 335 336 336 337 337 338 ...
## $ x : num 3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
## $ y : num 3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
## $ z : num 2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
# Keep columns 2 to 11 only, i.e. drop the first column (X, shown by str() as
# a running integer 1, 2, 3, ...) so it is not used as a predictor.
diamond_2 <- diamond[, 2:11]

# Fix the RNG seed so the random 80/20 split below, and the models fitted on
# it, can be reproduced from run to run.
# NOTE: recorded console output in this file came from an unseeded run, so the
# exact numbers will differ.
set.seed(123)

# seq_len() is safe for zero-row input (1:0 would yield c(1, 0)); floor()
# makes the sample size an explicit integer.
index <- sample(
  seq_len(nrow(diamond_2)),
  size = floor(nrow(diamond_2) * 0.8)
)
train <- diamond_2[index, ]
test <- diamond_2[-index, ]

# Classification tree predicting color from every other column of train.
fit_tree <- rpart(color ~ ., data = train)

# Store the predicted class labels for the hold-out rows and compare them with
# the true colors.
test$pred <- predict(fit_tree, newdata = test, type = "class")
confusionMatrix(test$pred, test$color)
## Confusion Matrix and Statistics
##
## Reference
## Prediction D E F G H I J
## D 0 0 0 0 0 0 0
## E 949 1172 976 930 667 375 176
## F 0 0 0 0 0 0 0
## G 299 624 702 1037 521 374 157
## H 109 164 207 315 431 386 217
## I 0 0 0 0 0 0 0
## J 0 0 0 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.2447
## 95% CI : (0.2366, 0.2529)
## No Information Rate : 0.2115
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.0714
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: D Class: E Class: F Class: G Class: H Class: I
## Sensitivity 0.0000 0.5980 0.0000 0.45443 0.26621 0.0000
## Specificity 1.0000 0.5386 1.0000 0.68528 0.84753 1.0000
## Pos Pred Value NaN 0.2235 NaN 0.27921 0.23565 NaN
## Neg Pred Value 0.8742 0.8578 0.8253 0.82400 0.86740 0.8948
## Prevalence 0.1258 0.1817 0.1747 0.21153 0.15007 0.1052
## Detection Rate 0.0000 0.1086 0.0000 0.09613 0.03995 0.0000
## Detection Prevalence 0.0000 0.4862 0.0000 0.34427 0.16954 0.0000
## Balanced Accuracy 0.5000 0.5683 0.5000 0.56985 0.55687 0.5000
## Class: J
## Sensitivity 0.00000
## Specificity 1.00000
## Pos Pred Value NaN
## Neg Pred Value 0.94902
## Prevalence 0.05098
## Detection Rate 0.00000
## Detection Prevalence 0.00000
## Balanced Accuracy 0.50000
# Frequency of each true color label among the hold-out rows.
table(test[["color"]])
##
## D E F G H I J
## 1357 1960 1885 2282 1619 1135 550
# Draw the tree currently stored in fit_tree (fitted before this point).
rpart.plot(x = fit_tree)

# Refit the tree with cut as the response and every other column of train as
# a predictor; fit_tree is overwritten with the new model.
fit_tree <- rpart(formula = cut ~ ., data = train)

# Replace test$pred with the predicted cut classes and score them against the
# true cut labels.
test[["pred"]] <- predict(object = fit_tree, newdata = test, type = "class")
confusionMatrix(data = test[["pred"]], reference = test[["cut"]])
## Confusion Matrix and Statistics
##
## Reference
## Prediction Fair Good Ideal Premium Very Good
## Fair 233 25 2 0 3
## Good 43 500 6 0 170
## Ideal 7 28 3963 334 727
## Premium 18 219 226 2500 1095
## Very Good 2 193 10 0 484
##
## Overall Statistics
##
## Accuracy : 0.7119
## 95% CI : (0.7033, 0.7204)
## No Information Rate : 0.39
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5867
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: Fair Class: Good Class: Ideal Class: Premium
## Sensitivity 0.76898 0.51813 0.9420 0.8821
## Specificity 0.99714 0.97771 0.8335 0.8041
## Pos Pred Value 0.88593 0.69541 0.7834 0.6161
## Neg Pred Value 0.99335 0.95382 0.9574 0.9504
## Prevalence 0.02809 0.08945 0.3900 0.2627
## Detection Rate 0.02160 0.04635 0.3674 0.2317
## Detection Prevalence 0.02438 0.06665 0.4689 0.3762
## Balanced Accuracy 0.88306 0.74792 0.8877 0.8431
## Class: Very Good
## Sensitivity 0.19524
## Specificity 0.97533
## Pos Pred Value 0.70247
## Neg Pred Value 0.80246
## Prevalence 0.22979
## Detection Rate 0.04486
## Detection Prevalence 0.06387
## Balanced Accuracy 0.58528
# Draw the tree currently stored in fit_tree (fitted before this point).
rpart.plot(fit_tree)

# Random forest (50 trees) predicting color from every other column of train.
# importance = TRUE asks randomForest to compute the variable-importance
# measures plotted below; TRUE is spelled out because T is an ordinary
# variable that can be reassigned.
fit_RF <- randomForest(color ~ ., data = train, ntree = 50, importance = TRUE)
varImpPlot(fit_RF)

# Predicted class labels for the hold-out rows. "response" is the documented
# type for class predictions in predict.randomForest.
test$pred <- predict(fit_RF, newdata = test, type = "response")
confusionMatrix(test$pred, test$color)
## Confusion Matrix and Statistics
##
## Reference
## Prediction D E F G H I J
## D 734 236 125 50 34 17 0
## E 330 1039 361 186 66 18 6
## F 156 395 872 269 101 30 6
## G 89 183 342 1403 277 80 17
## H 36 83 123 280 903 235 52
## I 10 19 45 70 181 671 115
## J 2 5 17 24 57 84 354
##
## Overall Statistics
##
## Accuracy : 0.5539
## 95% CI : (0.5445, 0.5634)
## No Information Rate : 0.2115
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4681
## Mcnemar's Test P-Value : 2.872e-07
##
## Statistics by Class:
##
## Class: D Class: E Class: F Class: G Class: H Class: I
## Sensitivity 0.54090 0.53010 0.46260 0.6148 0.5578 0.5912
## Specificity 0.95101 0.89046 0.89251 0.8838 0.9118 0.9544
## Pos Pred Value 0.61371 0.51795 0.47676 0.5868 0.5275 0.6040
## Neg Pred Value 0.93505 0.89513 0.88693 0.8953 0.9211 0.9521
## Prevalence 0.12579 0.18168 0.17473 0.2115 0.1501 0.1052
## Detection Rate 0.06804 0.09631 0.08083 0.1301 0.0837 0.0622
## Detection Prevalence 0.11086 0.18595 0.16954 0.2216 0.1587 0.1030
## Balanced Accuracy 0.74596 0.71028 0.67755 0.7493 0.7348 0.7728
## Class: J
## Sensitivity 0.64364
## Specificity 0.98154
## Pos Pred Value 0.65193
## Neg Pred Value 0.98087
## Prevalence 0.05098
## Detection Rate 0.03281
## Detection Prevalence 0.05033
## Balanced Accuracy 0.81259
# Plot the random forest currently stored in fit_RF (fitted before this point).
plot(fit_RF)

# Random forest (50 trees) predicting cut from every other column of train;
# fit_RF is overwritten with the new model.
# importance = TRUE asks randomForest to compute the variable-importance
# measures plotted below; TRUE is spelled out because T is an ordinary
# variable that can be reassigned.
fit_RF <- randomForest(cut ~ ., data = train, ntree = 50, importance = TRUE)
varImpPlot(fit_RF)

# Predicted class labels for the hold-out rows. "response" is the documented
# type for class predictions in predict.randomForest.
test$pred <- predict(fit_RF, newdata = test, type = "response")
confusionMatrix(test$pred, test$cut)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Fair Good Ideal Premium Very Good
## Fair 262 22 2 0 3
## Good 25 677 9 12 156
## Ideal 5 13 3838 279 496
## Premium 6 52 180 2320 455
## Very Good 5 201 178 223 1369
##
## Overall Statistics
##
## Accuracy : 0.7848
## 95% CI : (0.7769, 0.7925)
## No Information Rate : 0.39
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6965
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: Fair Class: Good Class: Ideal Class: Premium
## Sensitivity 0.86469 0.70155 0.9123 0.8186
## Specificity 0.99742 0.97944 0.8795 0.9129
## Pos Pred Value 0.90657 0.77019 0.8288 0.7700
## Neg Pred Value 0.99609 0.97094 0.9401 0.9339
## Prevalence 0.02809 0.08945 0.3900 0.2627
## Detection Rate 0.02429 0.06275 0.3558 0.2151
## Detection Prevalence 0.02679 0.08148 0.4293 0.2793
## Balanced Accuracy 0.93106 0.84050 0.8959 0.8658
## Class: Very Good
## Sensitivity 0.5522
## Specificity 0.9269
## Pos Pred Value 0.6928
## Neg Pred Value 0.8740
## Prevalence 0.2298
## Detection Rate 0.1269
## Detection Prevalence 0.1832
## Balanced Accuracy 0.7396