Introduction

This analysis applies Random Forest classification to the diamonds dataset that ships with ggplot2. We'll classify diamonds as "Expensive" or "Not Expensive" based on their physical and quality features, and use the fitted model to examine which characteristics most influence a diamond's value.

Random Forest Overview

Random Forest is an ensemble learning method that constructs many decision trees on bootstrap samples of the training data and combines their predictions by majority vote (a minimal sketch of this idea appears below). Key advantages of Random Forest include:

  • It handles numerical and categorical predictors without heavy preprocessing
  • It captures non-linear relationships and feature interactions automatically
  • Averaging many de-correlated trees makes it resistant to overfitting
  • It provides built-in measures of feature importance

For diamond classification, Random Forest is particularly suitable because:

  • Diamonds have both numerical (carat, depth) and categorical (cut, clarity) features
  • The relationship between features and price is non-linear
  • Feature interactions (e.g., between carat and clarity) are important
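
The sketch below illustrates bagging (bootstrap aggregation), the mechanism at the heart of Random Forest, using rpart trees on the small built-in iris data purely as an illustration; a real random forest additionally samples a random subset of features at each split.

# Illustration only: bagging with plain rpart trees
library(rpart)
data(iris)
set.seed(1)

# Fit 25 trees, each on a bootstrap sample of the data
votes <- replicate(25, {
  boot <- iris[sample(nrow(iris), replace = TRUE), ]
  tree <- rpart(Species ~ ., data = boot)
  as.character(predict(tree, iris, type = "class"))
})

# Combine the trees by majority vote for each observation
majority <- apply(votes, 1, function(v) names(which.max(table(v))))
mean(majority == iris$Species)  # accuracy of the ensemble vote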

Loading Required Libraries

library(ggplot2)      # for diamonds dataset and visualization
library(randomForest) # for random forest
library(caret)        # for data splitting and evaluation
library(dplyr)        # for data manipulation
library(gridExtra)    # for arranging multiple plots
library(viridis)      # for better color palettes
library(pROC)         # for ROC curve

Data Preparation

data(diamonds)

# Create the binary classification target: diamonds priced above
# the 75th percentile are labeled "Expensive"
price_threshold <- quantile(diamonds$price, 0.75)
diamonds$expensive <- factor(ifelse(diamonds$price > price_threshold,
                                    "Expensive", "Not Expensive"))
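
With a 75th-percentile cutoff, roughly a quarter of the diamonds should be labeled "Expensive"; a quick sanity check confirms the class balance:

# Sanity check: about 25% of diamonds lie above the 75th percentile
prop.table(table(diamonds$expensive))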

# cut, color, and clarity already ship as (ordered) factors in the
# diamonds dataset; as.factor() leaves them unchanged and is kept
# here as a safeguard for modified copies of the data
diamonds$cut <- as.factor(diamonds$cut)
diamonds$color <- as.factor(diamonds$color)
diamonds$clarity <- as.factor(diamonds$clarity)

# Features used for modeling
features <- c("carat", "cut", "color", "clarity", "depth", "table", "x", "y", "z")

# Reproducible, stratified 70/30 train/test split
set.seed(123)
train_index <- createDataPartition(diamonds$expensive, p = 0.7, list = FALSE)
train_data <- diamonds[train_index, ]
test_data <- diamonds[-train_index, ]
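
createDataPartition samples within each class, so the roughly 25/75 class balance is preserved in both partitions. This can be verified directly:

# Verify that the stratified split preserves class proportions
prop.table(table(train_data$expensive))
prop.table(table(test_data$expensive))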

Model Training

# Train the Random Forest model; importance = TRUE stores
# feature-importance measures for later inspection
rf_model <- randomForest(
  x = train_data[, features],
  y = train_data$expensive,
  ntree = 500,
  importance = TRUE
)
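
Because each tree sees only a bootstrap sample, randomForest also reports an out-of-bag (OOB) error estimate, a built-in validation check that helps judge whether ntree = 500 is enough:

# OOB error estimate and its trajectory as trees are added;
# a flat curve suggests 500 trees is sufficient
print(rf_model)
plot(rf_model, main = "OOB error vs. number of trees")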

# Class predictions and class probabilities on the test set
predictions <- predict(rf_model, test_data)
prob_predictions <- predict(rf_model, test_data, type = "prob")

# Confusion matrix comparing predictions with the true labels
conf_matrix <- confusionMatrix(predictions, test_data$expensive)

Model Visualization and Analysis

Feature Importance

# Feature importance (mean decrease in Gini impurity)
importance_df <- data.frame(
  Feature = rownames(importance(rf_model)),
  Importance = importance(rf_model)[, "MeanDecreaseGini"]
)

ggplot(importance_df, aes(x = reorder(Feature, Importance), y = Importance)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  theme_minimal() +
  labs(x = "Features",
       y = "Importance",
       title = "Feature Importance in Random Forest Model")

Confusion Matrix

# Confusion matrix heatmap
conf_data <- as.data.frame(as.table(conf_matrix$table))
names(conf_data) <- c("Predicted", "Actual", "Freq")

ggplot(conf_data, aes(x = Predicted, y = Actual, fill = Freq)) +
  geom_tile() +
  geom_text(aes(label = sprintf("%d", Freq)), vjust = 1) +
  scale_fill_viridis() +
  theme_minimal() +
  labs(title = "Confusion Matrix Heatmap")

ROC Curve

# ROC curve; levels are given explicitly so that "Expensive" is
# treated as the positive (case) class
roc_obj <- roc(response = test_data$expensive,
               predictor = prob_predictions[, "Expensive"],
               levels = c("Not Expensive", "Expensive"),
               direction = "<")
roc_data <- data.frame(
  FPR = 1 - roc_obj$specificities,
  TPR = roc_obj$sensitivities
)

ggplot(roc_data, aes(x = FPR, y = TPR)) +
  geom_line(color = "blue") +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "gray") +
  theme_minimal() +
  labs(
    title = paste0("ROC Curve (AUC = ", round(auc(roc_obj), 3), ")"),
    x = "False Positive Rate",
    y = "True Positive Rate"
  )
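
If a single operating threshold is needed rather than the full curve, pROC can suggest one, for example the point maximizing Youden's J statistic:

# Probability threshold maximizing sensitivity + specificity - 1
coords(roc_obj, "best", best.method = "youden",
       ret = c("threshold", "sensitivity", "specificity"))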

Feature Interaction

# Scatter plot of carat vs. depth, colored by class
ggplot(train_data, aes(x = carat, y = depth, color = expensive)) +
  geom_point(alpha = 0.5) +
  scale_color_viridis_d() +
  theme_minimal() +
  labs(title = "Interaction between Carat and Depth",
       x = "Carat",
       y = "Depth")

Model Performance Metrics

# Detailed model performance
cat("Random Forest Model Performance:\n")
## Random Forest Model Performance:
print(conf_matrix)
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Expensive Not Expensive
##   Expensive          3827           193
##   Not Expensive       218         11943
##                                          
##                Accuracy : 0.9746         
##                  95% CI : (0.9721, 0.977)
##     No Information Rate : 0.75           
##     P-Value [Acc > NIR] : <2e-16         
##                                          
##                   Kappa : 0.9321         
##                                          
##  Mcnemar's Test P-Value : 0.2365         
##                                          
##             Sensitivity : 0.9461         
##             Specificity : 0.9841         
##          Pos Pred Value : 0.9520         
##          Neg Pred Value : 0.9821         
##              Prevalence : 0.2500         
##          Detection Rate : 0.2365         
##    Detection Prevalence : 0.2484         
##       Balanced Accuracy : 0.9651         
##                                          
##        'Positive' Class : Expensive      
## 
metrics <- data.frame(
  Metric = c("Accuracy", "Sensitivity", "Specificity", "Balanced Accuracy"),
  Value = c(
    conf_matrix$overall["Accuracy"],
    conf_matrix$byClass["Sensitivity"],
    conf_matrix$byClass["Specificity"],
    conf_matrix$byClass["Balanced Accuracy"]
  )
)

ggplot(metrics, aes(x = Metric, y = Value)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  theme_minimal() +
  labs(title = "Model Performance Metrics") +
  ylim(0, 1)
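
A single 70/30 split can be optimistic or pessimistic by chance. As a possible extension, the same model can be wrapped in k-fold cross-validation with caret's train(); the sketch below assumes a 5-fold CV on an arbitrary 5,000-row subsample to keep runtime manageable:

# Sketch: 5-fold cross-validation of a random forest via caret;
# the subsample is only to keep runtime reasonable
set.seed(123)
cv_idx <- sample(nrow(train_data), 5000)
cv_model <- train(
  x = as.data.frame(train_data[cv_idx, features]),
  y = train_data$expensive[cv_idx],
  method = "rf",
  trControl = trainControl(method = "cv", number = 5)
)
cv_model$results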

Conclusions

From our analysis, we can conclude:

  1. Feature Importance: The importance plot identifies which characteristics most strongly drive whether a diamond is classified as expensive.

  2. Model Performance: The Random Forest classifier achieves strong predictive performance on the held-out test set, as shown by:

    • An accuracy of roughly 97.5%, well above the 75% no-information rate
    • A good balance between sensitivity (0.946) and specificity (0.984)
    • A strong AUC in the ROC curve

  3. Practical Applications: This model could be valuable for:

    • Diamond pricing strategies
    • Market value assessment
    • Investment decisions