Introduction

This analysis applies Random Forest classification to the diamonds dataset that ships with ggplot2. We'll classify diamonds as "Expensive" or "Not Expensive" based on their physical and quality features, and use the fitted model to examine which characteristics most influence a diamond's value.

Random Forest Overview

Random Forest is an ensemble learning method that constructs many decision trees on bootstrap samples of the training data and combines their predictions by majority vote (a minimal sketch of this idea appears below). Key advantages of Random Forest include:

  • It handles numerical and categorical predictors without heavy preprocessing
  • It captures non-linear relationships and feature interactions automatically
  • Averaging many de-correlated trees makes it resistant to overfitting
  • It provides built-in measures of feature importance

For diamond classification, Random Forest is particularly suitable because:

  • Diamonds have both numerical (carat, depth) and categorical (cut, clarity) features
  • The relationship between features and price is non-linear
  • Feature interactions (e.g., between carat and clarity) are important
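
The sketch below illustrates bagging (bootstrap aggregation), the mechanism at the heart of Random Forest, using rpart trees on the small built-in iris data purely as an illustration; a real random forest additionally samples a random subset of features at each split.

# Illustration only: bagging with plain rpart trees
library(rpart)
data(iris)
set.seed(1)

# Fit 25 trees, each on a bootstrap sample of the data
votes <- replicate(25, {
  boot <- iris[sample(nrow(iris), replace = TRUE), ]
  tree <- rpart(Species ~ ., data = boot)
  as.character(predict(tree, iris, type = "class"))
})

# Combine the trees by majority vote for each observation
majority <- apply(votes, 1, function(v) names(which.max(table(v))))
mean(majority == iris$Species)  # accuracy of the ensemble vote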

Loading Required Libraries

library(ggplot2)      # for diamonds dataset and visualization
library(randomForest) # for random forest
library(caret)        # for data splitting and evaluation
library(dplyr)        # for data manipulation
library(gridExtra)    # for arranging multiple plots
library(viridis)      # for better color palettes
library(pROC)         # for ROC curve

Data Preparation

data(diamonds)

# Create the binary classification target: diamonds priced above
# the 75th percentile are labeled "Expensive"
price_threshold <- quantile(diamonds$price, 0.75)
diamonds$expensive <- factor(ifelse(diamonds$price > price_threshold,
                                    "Expensive", "Not Expensive"))
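
With a 75th-percentile cutoff, roughly a quarter of the diamonds should be labeled "Expensive"; a quick sanity check confirms the class balance:

# Sanity check: about 25% of diamonds lie above the 75th percentile
prop.table(table(diamonds$expensive))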

# cut, color, and clarity already ship as (ordered) factors in the
# diamonds dataset; as.factor() leaves them unchanged and is kept
# here as a safeguard for modified copies of the data
diamonds$cut <- as.factor(diamonds$cut)
diamonds$color <- as.factor(diamonds$color)
diamonds$clarity <- as.factor(diamonds$clarity)

# Features used for modeling
features <- c("carat", "cut", "color", "clarity", "depth", "table", "x", "y", "z")

# Reproducible, stratified 70/30 train/test split
set.seed(123)
train_index <- createDataPartition(diamonds$expensive, p = 0.7, list = FALSE)
train_data <- diamonds[train_index, ]
test_data <- diamonds[-train_index, ]
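
createDataPartition samples within each class, so the roughly 25/75 class balance is preserved in both partitions. This can be verified directly:

# Verify that the stratified split preserves class proportions
prop.table(table(train_data$expensive))
prop.table(table(test_data$expensive))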

Model Training

# Train the Random Forest model; importance = TRUE stores
# feature-importance measures for later inspection
rf_model <- randomForest(
  x = train_data[, features],
  y = train_data$expensive,
  ntree = 500,
  importance = TRUE
)
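
Because each tree sees only a bootstrap sample, randomForest also reports an out-of-bag (OOB) error estimate, a built-in validation check that helps judge whether ntree = 500 is enough:

# OOB error estimate and its trajectory as trees are added;
# a flat curve suggests 500 trees is sufficient
print(rf_model)
plot(rf_model, main = "OOB error vs. number of trees")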

# Class predictions and class probabilities on the test set
predictions <- predict(rf_model, test_data)
prob_predictions <- predict(rf_model, test_data, type = "prob")

# Confusion matrix comparing predictions with the true labels
conf_matrix <- confusionMatrix(predictions, test_data$expensive)

Model Visualization and Analysis

Feature Importance

# Feature importance (mean decrease in Gini impurity)
importance_df <- data.frame(
  Feature = rownames(importance(rf_model)),
  Importance = importance(rf_model)[, "MeanDecreaseGini"]
)

ggplot(importance_df, aes(x = reorder(Feature, Importance), y = Importance)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  theme_minimal() +
  labs(x = "Features",
       y = "Importance",
       title = "Feature Importance in Random Forest Model")

Confusion Matrix

# Confusion matrix heatmap
conf_data <- as.data.frame(as.table(conf_matrix$table))
names(conf_data) <- c("Predicted", "Actual", "Freq")

ggplot(conf_data, aes(x = Predicted, y = Actual, fill = Freq)) +
  geom_tile() +
  geom_text(aes(label = sprintf("%d", Freq)), vjust = 1) +
  scale_fill_viridis() +
  theme_minimal() +
  labs(title = "Confusion Matrix Heatmap")

ROC Curve

# ROC curve; levels are given explicitly so that "Expensive" is
# treated as the positive (case) class
roc_obj <- roc(response = test_data$expensive,
               predictor = prob_predictions[, "Expensive"],
               levels = c("Not Expensive", "Expensive"),
               direction = "<")
roc_data <- data.frame(
  FPR = 1 - roc_obj$specificities,
  TPR = roc_obj$sensitivities
)

ggplot(roc_data, aes(x = FPR, y = TPR)) +
  geom_line(color = "blue") +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "gray") +
  theme_minimal() +
  labs(
    title = paste0("ROC Curve (AUC = ", round(auc(roc_obj), 3), ")"),
    x = "False Positive Rate",
    y = "True Positive Rate"
  )
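
If a single operating threshold is needed rather than the full curve, pROC can suggest one, for example the point maximizing Youden's J statistic:

# Probability threshold maximizing sensitivity + specificity - 1
coords(roc_obj, "best", best.method = "youden",
       ret = c("threshold", "sensitivity", "specificity"))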

Feature Interaction

# Scatter plot of carat vs. depth, colored by class
ggplot(train_data, aes(x = carat, y = depth, color = expensive)) +
  geom_point(alpha = 0.5) +
  scale_color_viridis_d() +
  theme_minimal() +
  labs(title = "Interaction between Carat and Depth",
       x = "Carat",
       y = "Depth")

Model Performance Metrics

# Detailed model performance
cat("Random Forest Model Performance:\n")
## Random Forest Model Performance:
print(conf_matrix)
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Expensive Not Expensive
##   Expensive          3827           193
##   Not Expensive       218         11943
##                                          
##                Accuracy : 0.9746         
##                  95% CI : (0.9721, 0.977)
##     No Information Rate : 0.75           
##     P-Value [Acc > NIR] : <2e-16         
##                                          
##                   Kappa : 0.9321         
##                                          
##  Mcnemar's Test P-Value : 0.2365         
##                                          
##             Sensitivity : 0.9461         
##             Specificity : 0.9841         
##          Pos Pred Value : 0.9520         
##          Neg Pred Value : 0.9821         
##              Prevalence : 0.2500         
##          Detection Rate : 0.2365         
##    Detection Prevalence : 0.2484         
##       Balanced Accuracy : 0.9651         
##                                          
##        'Positive' Class : Expensive      
## 
metrics <- data.frame(
  Metric = c("Accuracy", "Sensitivity", "Specificity", "Balanced Accuracy"),
  Value = c(
    conf_matrix$overall["Accuracy"],
    conf_matrix$byClass["Sensitivity"],
    conf_matrix$byClass["Specificity"],
    conf_matrix$byClass["Balanced Accuracy"]
  )
)

ggplot(metrics, aes(x = Metric, y = Value)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  theme_minimal() +
  labs(title = "Model Performance Metrics") +
  ylim(0, 1)
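
A single 70/30 split can be optimistic or pessimistic by chance. As a possible extension, the same model can be wrapped in k-fold cross-validation with caret's train(); the sketch below assumes a 5-fold CV on an arbitrary 5,000-row subsample to keep runtime manageable:

# Sketch: 5-fold cross-validation of a random forest via caret;
# the subsample is only to keep runtime reasonable
set.seed(123)
cv_idx <- sample(nrow(train_data), 5000)
cv_model <- train(
  x = as.data.frame(train_data[cv_idx, features]),
  y = train_data$expensive[cv_idx],
  method = "rf",
  trControl = trainControl(method = "cv", number = 5)
)
cv_model$results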

Conclusions

From our analysis, we can conclude:

  1. Feature Importance: The importance plot identifies which characteristics most strongly drive whether a diamond is classified as expensive.

  2. Model Performance: The Random Forest classifier achieves strong predictive performance on the held-out test set, as shown by:

    • An accuracy of roughly 97.5%, well above the 75% no-information rate
    • A good balance between sensitivity (0.946) and specificity (0.984)
    • A strong AUC in the ROC curve

  3. Practical Applications: This model could be valuable for:

    • Diamond pricing strategies
    • Market value assessment
    • Investment decisions