This analysis demonstrates the application of Random Forest
classification to predict diamond prices using the built-in
diamonds dataset in R. We’ll classify diamonds as
“expensive” or “not expensive” based on their features, providing
insights into what characteristics most influence a diamond’s value.
Random Forest is an ensemble learning method that builds many decision trees on bootstrap samples of the training data and aggregates their predictions by majority vote. Key advantages of Random Forest include:

- It captures non-linear relationships and feature interactions without manual specification.
- Averaging over many decorrelated trees makes it robust to overfitting.
- It handles numerical and categorical predictors together with minimal preprocessing.
- It provides built-in measures of feature importance.
For diamond classification, Random Forest is particularly suitable because:

- Diamonds have both numerical (carat, depth) and categorical (cut, clarity) features.
- The relationship between features and price is non-linear.
- Feature interactions (e.g., between carat and clarity) are important.
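To make the voting mechanism concrete, here is a minimal sketch (intended to be run after rf_model and test_data are created below) that exposes the individual tree votes behind a forest prediction; predict.all = TRUE asks randomForest to return every tree's vote alongside the aggregate:

# Per-tree votes behind a forest prediction (run after fitting rf_model below)
votes <- predict(rf_model, test_data[1:5, ], predict.all = TRUE)
votes$aggregate               # the forest's majority-vote prediction
table(votes$individual[1, ])  # how the 500 trees voted on the first diamond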
library(ggplot2) # for diamonds dataset and visualization
library(randomForest) # for random forest
library(caret) # for data splitting and evaluation
library(dplyr) # for data manipulation
library(gridExtra) # for arranging multiple plots
library(viridis) # for better color palettes
library(pROC) # for ROC curve
data(diamonds)
# Binary classification target: diamonds above the 75th price percentile
price_threshold <- quantile(diamonds$price, 0.75)
diamonds$expensive <- factor(ifelse(diamonds$price > price_threshold, "Expensive", "Not Expensive"))
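Because the cutoff is the 75th percentile, roughly a quarter of the diamonds should land in the Expensive class; a quick tabulation (an added sanity check, not part of the original pipeline) confirms the imbalance we will need to keep in mind when reading accuracy figures:

# Class balance: expect roughly 25% Expensive / 75% Not Expensive
table(diamonds$expensive)
prop.table(table(diamonds$expensive))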
# Convert categorical variables to factors
diamonds$cut <- as.factor(diamonds$cut)
diamonds$color <- as.factor(diamonds$color)
diamonds$clarity <- as.factor(diamonds$clarity)
# Features used for modeling (price is deliberately excluded: it defines the target)
features <- c("carat", "cut", "color", "clarity", "depth", "table", "x", "y", "z")
set.seed(123)  # for reproducibility
train_index <- createDataPartition(diamonds$expensive, p = 0.7, list = FALSE)
train_data <- diamonds[train_index, ]
test_data <- diamonds[-train_index, ]
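createDataPartition() samples within each outcome class, so both splits should preserve the roughly 25/75 class mix; this is worth verifying before training:

# Stratified split: class proportions should match across train and test
prop.table(table(train_data$expensive))
prop.table(table(test_data$expensive))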
# Train the Random Forest model
rf_model <- randomForest(
  x = train_data[, features],
  y = train_data$expensive,
  ntree = 500,       # number of trees in the forest
  importance = TRUE  # also compute permutation-based importance
)
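Before touching the test set, the forest's out-of-bag (OOB) error already gives an honest internal performance estimate, since each tree is evaluated only on the rows left out of its bootstrap sample:

# OOB error estimate and OOB confusion matrix
print(rf_model)
# OOB error as a function of the number of trees (overall and per class)
plot(rf_model)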
# Class predictions and class probabilities on the test set
predictions <- predict(rf_model, test_data)
prob_predictions <- predict(rf_model, test_data, type = "prob")
# Confusion matrix and summary statistics
conf_matrix <- confusionMatrix(predictions, test_data$expensive)
# Feature importance plot (Gini-based)
importance_df <- data.frame(
  Feature = rownames(importance(rf_model)),
  Importance = importance(rf_model)[, "MeanDecreaseGini"]
)
ggplot(importance_df, aes(x = reorder(Feature, Importance), y = Importance)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  theme_minimal() +
  labs(x = "Features",
       y = "Importance (Mean Decrease in Gini)",
       title = "Feature Importance in Random Forest Model")
# Confusion matrix heatmap
conf_data <- as.data.frame(as.table(conf_matrix$table))
names(conf_data) <- c("Predicted", "Actual", "Freq")
ggplot(conf_data, aes(x = Predicted, y = Actual, fill = Freq)) +
  geom_tile() +
  geom_text(aes(label = sprintf("%d", Freq)), vjust = 1) +
  scale_fill_viridis() +
  theme_minimal() +
  labs(title = "Confusion Matrix Heatmap")
# ROC curve: set levels explicitly so "Not Expensive" is the control class
# and "Expensive" the case class (otherwise pROC guesses the direction)
roc_obj <- roc(response = test_data$expensive,
               predictor = prob_predictions[, "Expensive"],
               levels = c("Not Expensive", "Expensive"))
roc_data <- data.frame(
  FPR = 1 - roc_obj$specificities,
  TPR = roc_obj$sensitivities
)
ggplot(roc_data, aes(x = FPR, y = TPR)) +
  geom_line(color = "blue") +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "gray") +
  theme_minimal() +
  labs(
    title = paste0("ROC Curve (AUC = ", round(auc(roc_obj), 3), ")"),
    x = "False Positive Rate",
    y = "True Positive Rate"
  )
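The default 0.5 probability cutoff is not sacred. If false negatives and false positives carry different costs, pROC's coords() can report the threshold that maximizes Youden's J statistic (sensitivity + specificity - 1):

# Probability threshold maximizing Youden's J
coords(roc_obj, "best", ret = c("threshold", "sensitivity", "specificity"))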
# Scatter of carat vs. depth, colored by class
ggplot(train_data, aes(x = carat, y = depth, color = expensive)) +
  geom_point(alpha = 0.5) +
  scale_color_viridis_d() +
  theme_minimal() +
  labs(title = "Interaction between Carat and Depth",
       x = "Carat",
       y = "Depth (%)")
# Detailed model performance
cat("Random Forest Model Performance:\n")
## Random Forest Model Performance:
print(conf_matrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Expensive Not Expensive
## Expensive 3827 193
## Not Expensive 218 11943
##
## Accuracy : 0.9746
## 95% CI : (0.9721, 0.977)
## No Information Rate : 0.75
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9321
##
## Mcnemar's Test P-Value : 0.2365
##
## Sensitivity : 0.9461
## Specificity : 0.9841
## Pos Pred Value : 0.9520
## Neg Pred Value : 0.9821
## Prevalence : 0.2500
## Detection Rate : 0.2365
## Detection Prevalence : 0.2484
## Balanced Accuracy : 0.9651
##
## 'Positive' Class : Expensive
##
metrics <- data.frame(
  Metric = c("Accuracy", "Sensitivity", "Specificity", "Balanced Accuracy"),
  Value = c(
    conf_matrix$overall["Accuracy"],
    conf_matrix$byClass["Sensitivity"],
    conf_matrix$byClass["Specificity"],
    conf_matrix$byClass["Balanced Accuracy"]
  )
)
ggplot(metrics, aes(x = Metric, y = Value)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  theme_minimal() +
  labs(title = "Model Performance Metrics") +
  ylim(0, 1)
From our analysis, we can conclude:

- Feature importance: the model reveals which characteristics most strongly influence whether a diamond falls in the top price quartile, ranked directly in the Gini-based importance plot above.
- Model performance: the Random Forest classifier achieves strong predictive performance, as shown by its 97.5% test-set accuracy (against a 75% no-information rate), 96.5% balanced accuracy (sensitivity 0.946, specificity 0.984), and a Kappa of 0.932.
- Practical applications: this model could be valuable for flagging potentially mispriced listings, triaging stones for expert appraisal, and quantifying which attributes push a diamond into the premium tier.
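As a possible next step (a sketch, not run as part of this analysis), tuneRF() from the randomForest package searches over mtry, the number of features considered at each split, using OOB error as its criterion; the parameter values below are illustrative:

# Tune mtry via OOB error (sketch; values are illustrative, not tuned results)
set.seed(123)
tuned <- tuneRF(
  x = train_data[, features],
  y = train_data$expensive,
  ntreeTry = 200,    # trees grown per candidate mtry
  stepFactor = 1.5,  # factor by which mtry is scaled at each step
  improve = 0.01     # minimum relative OOB improvement to continue searching
)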